71 files changed, 3801 insertions, 938 deletions
diff --git a/src/gallium/auxiliary/Makefile b/src/gallium/auxiliary/Makefile
index 7c8db19f5c..dcebab7c0f 100644
--- a/src/gallium/auxiliary/Makefile
+++ b/src/gallium/auxiliary/Makefile
@@ -124,6 +124,7 @@ C_SOURCES = \
 	util/u_linear.c \
 	util/u_network.c \
 	util/u_math.c \
+	util/u_mempool.c \
 	util/u_mm.c \
 	util/u_rect.c \
 	util/u_ringbuffer.c \
@@ -154,6 +155,8 @@ GALLIVM_SOURCES = \
         gallivm/lp_bld_flow.c \
         gallivm/lp_bld_format_aos.c \
         gallivm/lp_bld_format_soa.c \
+        gallivm/lp_bld_format_yuv.c \
+        gallivm/lp_bld_gather.c \
         gallivm/lp_bld_init.c \
         gallivm/lp_bld_intr.c \
         gallivm/lp_bld_logic.c \
@@ -167,8 +170,10 @@ GALLIVM_SOURCES = \
         gallivm/lp_bld_tgsi_soa.c \
         gallivm/lp_bld_type.c \
         draw/draw_llvm.c \
+        draw/draw_vs_llvm.c \
         draw/draw_pt_fetch_shade_pipeline_llvm.c \
-        draw/draw_llvm_translate.c
+        draw/draw_llvm_translate.c \
+        draw/draw_llvm_sample.c
 
 GALLIVM_CPP_SOURCES = \
     gallivm/lp_bld_misc.cpp
diff --git a/src/gallium/auxiliary/SConscript b/src/gallium/auxiliary/SConscript
index 6242ab0c59..72a16617db 100644
--- a/src/gallium/auxiliary/SConscript
+++ b/src/gallium/auxiliary/SConscript
@@ -32,8 +32,8 @@ env.CodeGenerate(
 
 env.CodeGenerate(
     target = 'util/u_format_table.c',
-    script = 'util/u_format_table.py',
-    source = ['util/u_format.csv'],
+    script = '#src/gallium/auxiliary/util/u_format_table.py',
+    source = ['#src/gallium/auxiliary/util/u_format.csv'],
     command = 'python $SCRIPT $SOURCE > $TARGET'
 )
 
@@ -45,7 +45,7 @@ env.CodeGenerate(
 )
 
 env.Depends('util/u_format_table.c', [
-    'util/u_format_parse.py', 
+    '#src/gallium/auxiliary/util/u_format_parse.py',
     'util/u_format_pack.py', 
 ])
 
@@ -172,6 +172,7 @@ source = [
     'util/u_keymap.c',
     'util/u_network.c',
     'util/u_math.c',
+    'util/u_mempool.c',
     'util/u_mm.c',
     'util/u_rect.c',
     'util/u_resource.c',
@@ -203,6 +204,8 @@ if env['llvm']:
     'gallivm/lp_bld_flow.c',
     'gallivm/lp_bld_format_aos.c',
     'gallivm/lp_bld_format_soa.c',
+    'gallivm/lp_bld_format_yuv.c',
+    'gallivm/lp_bld_gather.c',
     'gallivm/lp_bld_intr.c',
     'gallivm/lp_bld_logic.c',
     'gallivm/lp_bld_init.c',
@@ -218,7 +221,9 @@ if env['llvm']:
     'gallivm/lp_bld_type.c',
     'draw/draw_llvm.c',
     'draw/draw_pt_fetch_shade_pipeline_llvm.c',
-    'draw/draw_llvm_translate.c'
+    'draw/draw_llvm_translate.c',
+    'draw/draw_vs_llvm.c',
+    'draw/draw_llvm_sample.c'
     ]
 
 gallium = env.ConvenienceLibrary(
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c
index 20a8612dca..58b022d531 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -289,6 +289,9 @@ void cso_release_all( struct cso_context *ctx )
       ctx->pipe->bind_fs_state( ctx->pipe, NULL );
       ctx->pipe->bind_vs_state( ctx->pipe, NULL );
       ctx->pipe->bind_vertex_elements_state( ctx->pipe, NULL );
+      ctx->pipe->set_fragment_sampler_views(ctx->pipe, 0, NULL);
+      if (ctx->pipe->set_vertex_sampler_views)
+         ctx->pipe->set_vertex_sampler_views(ctx->pipe, 0, NULL);
    }
 
    for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
@@ -1029,6 +1032,7 @@ static INLINE void
 clip_state_cpy(struct pipe_clip_state *dst,
                const struct pipe_clip_state *src)
 {
+   dst->depth_clamp = src->depth_clamp;
    dst->nr = src->nr;
    if (src->nr) {
       memcpy(dst->ucp, src->ucp, src->nr * sizeof(src->ucp[0]));
@@ -1039,6 +1043,9 @@ static INLINE int
 clip_state_cmp(const struct pipe_clip_state *a,
                const struct pipe_clip_state *b)
 {
+   if (a->depth_clamp != b->depth_clamp) {
+      return 1;
+   }
    if (a->nr != b->nr) {
       return 1;
    }
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index dab95e5051..c127f74188 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -40,6 +40,7 @@
 
 #if HAVE_LLVM
 #include "gallivm/lp_bld_init.h"
+#include "draw_llvm.h"
 #endif
 
 struct draw_context *draw_create( struct pipe_context *pipe )
@@ -52,6 +53,7 @@ struct draw_context *draw_create( struct pipe_context *pipe )
    lp_build_init();
    assert(lp_build_engine);
    draw->engine = lp_build_engine;
+   draw->llvm = draw_llvm_create(draw);
 #endif
 
    if (!draw_init(draw))
@@ -132,6 +134,9 @@ void draw_destroy( struct draw_context *draw )
    draw_pt_destroy( draw );
    draw_vs_destroy( draw );
    draw_gs_destroy( draw );
+#ifdef HAVE_LLVM
+   draw_llvm_destroy( draw->llvm );
+#endif
 
    FREE( draw );
 }
@@ -211,6 +216,7 @@ void draw_set_clip_state( struct draw_context *draw,
    assert(clip->nr <= PIPE_MAX_CLIP_PLANES);
    memcpy(&draw->plane[6], clip->ucp, clip->nr * sizeof(clip->ucp[0]));
    draw->nr_planes = 6 + clip->nr;
+   draw->depth_clamp = clip->depth_clamp;
 }
 
 
@@ -601,3 +607,54 @@ draw_set_so_state(struct draw_context *draw,
           state,
           sizeof(struct pipe_stream_output_state));
 }
+
+void
+draw_set_sampler_views(struct draw_context *draw,
+                       struct pipe_sampler_view **views,
+                       unsigned num)
+{
+   unsigned i;
+
+   debug_assert(num <= PIPE_MAX_VERTEX_SAMPLERS);
+
+   for (i = 0; i < num; ++i)
+      draw->sampler_views[i] = views[i];
+   for (i = num; i < PIPE_MAX_VERTEX_SAMPLERS; ++i)
+      draw->sampler_views[i] = NULL;
+
+   draw->num_sampler_views = num;
+}
+
+void
+draw_set_samplers(struct draw_context *draw,
+                  struct pipe_sampler_state **samplers,
+                  unsigned num)
+{
+   unsigned i;
+
+   debug_assert(num <= PIPE_MAX_VERTEX_SAMPLERS);
+
+   for (i = 0; i < num; ++i)
+      draw->samplers[i] = samplers[i];
+   for (i = num; i < PIPE_MAX_VERTEX_SAMPLERS; ++i)
+      draw->samplers[i] = NULL;
+
+   draw->num_samplers = num;
+}
+
+void
+draw_set_mapped_texture(struct draw_context *draw,
+                        unsigned sampler_idx,
+                        uint32_t width, uint32_t height, uint32_t depth,
+                        uint32_t last_level,
+                        uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS],
+                        uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS],
+                        const void *data[DRAW_MAX_TEXTURE_LEVELS])
+{
+#ifdef HAVE_LLVM
+   draw_llvm_set_mapped_texture(draw,
+                                sampler_idx,
+                                width, height, depth, last_level,
+                                row_stride, img_stride, data);
+#endif
+}
diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h
index c0122f2aca..116716af6f 100644
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -47,11 +47,14 @@ struct draw_vertex_shader;
 struct draw_geometry_shader;
 struct tgsi_sampler;
 
+#define DRAW_MAX_TEXTURE_LEVELS 13  /* 4K x 4K for now */
 
 struct draw_context *draw_create( struct pipe_context *pipe );
 
 void draw_destroy( struct draw_context *draw );
 
+void draw_flush(struct draw_context *draw);
+
 void draw_set_viewport_state( struct draw_context *draw,
                               const struct pipe_viewport_state *viewport );
 
@@ -101,6 +104,23 @@ draw_texture_samplers(struct draw_context *draw,
                       uint num_samplers,
                       struct tgsi_sampler **samplers);
 
+void
+draw_set_sampler_views(struct draw_context *draw,
+                       struct pipe_sampler_view **views,
+                       unsigned num);
+void
+draw_set_samplers(struct draw_context *draw,
+                  struct pipe_sampler_state **samplers,
+                  unsigned num);
+
+void
+draw_set_mapped_texture(struct draw_context *draw,
+                        unsigned sampler_idx,
+                        uint32_t width, uint32_t height, uint32_t depth,
+                        uint32_t last_level,
+                        uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS],
+                        uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS],
+                        const void *data[DRAW_MAX_TEXTURE_LEVELS]);
 
 
 /*
@@ -173,7 +193,7 @@ draw_set_so_state(struct draw_context *draw,
 
 
 /***********************************************************************
- * draw_prim.c 
+ * draw_pt.c 
  */
 
 void draw_arrays(struct draw_context *draw, unsigned prim,
@@ -187,8 +207,6 @@ draw_arrays_instanced(struct draw_context *draw,
                       unsigned startInstance,
                       unsigned instanceCount);
 
-void draw_flush(struct draw_context *draw);
-
 
 /*******************************************************************************
  * Driver backend interface 
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index 9117c1303d..19f96c37ab 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -1,3 +1,30 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
 #include "draw_llvm.h"
 
 #include "draw_context.h"
@@ -15,14 +42,13 @@
 #include "tgsi/tgsi_dump.h"
 
 #include "util/u_cpu_detect.h"
-#include "util/u_string.h"
 #include "util/u_pointer.h"
+#include "util/u_string.h"
 
 #include <llvm-c/Transforms/Scalar.h>
 
 #define DEBUG_STORE 0
 
-
 /* generates the draw jit function */
 static void
 draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *var);
@@ -36,12 +62,19 @@ init_globals(struct draw_llvm *llvm)
 
    /* struct draw_jit_texture */
    {
-      LLVMTypeRef elem_types[4];
+      LLVMTypeRef elem_types[DRAW_JIT_TEXTURE_NUM_FIELDS];
 
       elem_types[DRAW_JIT_TEXTURE_WIDTH]  = LLVMInt32Type();
       elem_types[DRAW_JIT_TEXTURE_HEIGHT] = LLVMInt32Type();
-      elem_types[DRAW_JIT_TEXTURE_STRIDE] = LLVMInt32Type();
-      elem_types[DRAW_JIT_TEXTURE_DATA]   = LLVMPointerType(LLVMInt8Type(), 0);
+      elem_types[DRAW_JIT_TEXTURE_DEPTH] = LLVMInt32Type();
+      elem_types[DRAW_JIT_TEXTURE_LAST_LEVEL] = LLVMInt32Type();
+      elem_types[DRAW_JIT_TEXTURE_ROW_STRIDE] =
+         LLVMArrayType(LLVMInt32Type(), DRAW_MAX_TEXTURE_LEVELS);
+      elem_types[DRAW_JIT_TEXTURE_IMG_STRIDE] =
+         LLVMArrayType(LLVMInt32Type(), DRAW_MAX_TEXTURE_LEVELS);
+      elem_types[DRAW_JIT_TEXTURE_DATA] =
+         LLVMArrayType(LLVMPointerType(LLVMInt8Type(), 0),
+                       DRAW_MAX_TEXTURE_LEVELS);
 
       texture_type = LLVMStructType(elem_types, Elements(elem_types), 0);
 
@@ -51,9 +84,18 @@ init_globals(struct draw_llvm *llvm)
       LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, height,
                              llvm->target, texture_type,
                              DRAW_JIT_TEXTURE_HEIGHT);
-      LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, stride,
+      LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, depth,
+                             llvm->target, texture_type,
+                             DRAW_JIT_TEXTURE_DEPTH);
+      LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, last_level,
+                             llvm->target, texture_type,
+                             DRAW_JIT_TEXTURE_LAST_LEVEL);
+      LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, row_stride,
                              llvm->target, texture_type,
-                             DRAW_JIT_TEXTURE_STRIDE);
+                             DRAW_JIT_TEXTURE_ROW_STRIDE);
+      LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, img_stride,
+                             llvm->target, texture_type,
+                             DRAW_JIT_TEXTURE_IMG_STRIDE);
       LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, data,
                              llvm->target, texture_type,
                              DRAW_JIT_TEXTURE_DATA);
@@ -71,7 +113,8 @@ init_globals(struct draw_llvm *llvm)
 
       elem_types[0] = LLVMPointerType(LLVMFloatType(), 0); /* vs_constants */
       elem_types[1] = LLVMPointerType(LLVMFloatType(), 0); /* vs_constants */
-      elem_types[2] = LLVMArrayType(texture_type, PIPE_MAX_SAMPLERS); /* textures */
+      elem_types[2] = LLVMArrayType(texture_type,
+                                    PIPE_MAX_VERTEX_SAMPLERS); /* textures */
 
       context_type = LLVMStructType(elem_types, Elements(elem_types), 0);
 
@@ -81,7 +124,7 @@ init_globals(struct draw_llvm *llvm)
                              llvm->target, context_type, 1);
       LP_CHECK_MEMBER_OFFSET(struct draw_jit_context, textures,
                              llvm->target, context_type,
-                             DRAW_JIT_CONTEXT_TEXTURES_INDEX);
+                             DRAW_JIT_CTX_TEXTURES);
       LP_CHECK_STRUCT_SIZE(struct draw_jit_context,
                            llvm->target, context_type);
 
@@ -195,9 +238,22 @@ draw_llvm_create(struct draw_context *draw)
       /* These are the passes currently listed in llvm-c/Transforms/Scalar.h,
        * but there are more on SVN. */
       /* TODO: Add more passes */
+
       LLVMAddCFGSimplificationPass(llvm->pass);
-      LLVMAddPromoteMemoryToRegisterPass(llvm->pass);
-      LLVMAddConstantPropagationPass(llvm->pass);
+
+      if (HAVE_LLVM >= 0x207 && sizeof(void*) == 4) {
+         /* For LLVM >= 2.7 and 32-bit build, use this order of passes to
+          * avoid generating bad code.
+          * Test with piglit glsl-vs-sqrt-zero test.
+          */
+         LLVMAddConstantPropagationPass(llvm->pass);
+         LLVMAddPromoteMemoryToRegisterPass(llvm->pass);
+      }
+      else {
+         LLVMAddPromoteMemoryToRegisterPass(llvm->pass);
+         LLVMAddConstantPropagationPass(llvm->pass);
+      }
+
       if(util_cpu_caps.has_sse4_1) {
          /* FIXME: There is a bug in this pass, whereby the combination of fptosi
           * and sitofp (necessary for trunc/floor/ceil/round implementation)
@@ -219,6 +275,9 @@ draw_llvm_create(struct draw_context *draw)
       LLVMDumpModule(llvm->module);
    }
 
+   llvm->nr_variants = 0;
+   make_empty_list(&llvm->vs_variants_list);
+
    return llvm;
 }
 
@@ -231,9 +290,13 @@ draw_llvm_destroy(struct draw_llvm *llvm)
 }
 
 struct draw_llvm_variant *
-draw_llvm_prepare(struct draw_llvm *llvm, int num_inputs)
+draw_llvm_create_variant(struct draw_llvm *llvm, int num_inputs)
 {
    struct draw_llvm_variant *variant = MALLOC(sizeof(struct draw_llvm_variant));
+   struct llvm_vertex_shader *shader =
+      llvm_vertex_shader(llvm->draw->vs.vertex_shader);
+
+   variant->llvm = llvm;
 
    draw_llvm_make_variant_key(llvm, &variant->key);
 
@@ -242,6 +305,12 @@ draw_llvm_prepare(struct draw_llvm *llvm, int num_inputs)
    draw_llvm_generate(llvm, variant);
    draw_llvm_generate_elts(llvm, variant);
 
+   variant->shader = shader;
+   variant->list_item_global.base = variant;
+   variant->list_item_local.base = variant;
+   /*variant->no = */shader->variants_created++;
+   variant->list_item_global.base = variant;
+
    return variant;
 }
 
@@ -250,11 +319,13 @@ generate_vs(struct draw_llvm *llvm,
             LLVMBuilderRef builder,
             LLVMValueRef (*outputs)[NUM_CHANNELS],
             const LLVMValueRef (*inputs)[NUM_CHANNELS],
-            LLVMValueRef context_ptr)
+            LLVMValueRef context_ptr,
+            struct lp_build_sampler_soa *draw_sampler)
 {
    const struct tgsi_token *tokens = llvm->draw->vs.vertex_shader->state.tokens;
    struct lp_type vs_type;
    LLVMValueRef consts_ptr = draw_jit_context_vs_constants(builder, context_ptr);
+   struct lp_build_sampler_soa *sampler = 0;
 
    memset(&vs_type, 0, sizeof vs_type);
    vs_type.floating = TRUE; /* floating point values */
@@ -270,6 +341,10 @@ generate_vs(struct draw_llvm *llvm,
       tgsi_dump(tokens, 0);
    }
 
+   if (llvm->draw->num_sampler_views &&
+       llvm->draw->num_samplers)
+      sampler = draw_sampler;
+
    lp_build_tgsi_soa(builder,
                      tokens,
                      vs_type,
@@ -278,7 +353,7 @@ generate_vs(struct draw_llvm *llvm,
                      NULL /*pos*/,
                      inputs,
                      outputs,
-                     NULL/*sampler*/,
+                     sampler,
                      &llvm->draw->vs.vertex_shader->info);
 }
 
@@ -306,7 +381,8 @@ generate_fetch(LLVMBuilderRef builder,
                LLVMValueRef *res,
                struct pipe_vertex_element *velem,
                LLVMValueRef vbuf,
-               LLVMValueRef index)
+               LLVMValueRef index,
+               LLVMValueRef instance_id)
 {
    LLVMValueRef indices = LLVMConstInt(LLVMInt64Type(), velem->vertex_buffer_index, 0);
    LLVMValueRef vbuffer_ptr = LLVMBuildGEP(builder, vbuffers_ptr,
@@ -317,8 +393,15 @@ generate_fetch(LLVMBuilderRef builder,
    LLVMValueRef cond;
    LLVMValueRef stride;
 
-   cond = LLVMBuildICmp(builder, LLVMIntULE, index, vb_max_index, "");
+   if (velem->instance_divisor) {
+      /* array index = instance_id / instance_divisor */
+      index = LLVMBuildUDiv(builder, instance_id,
+                            LLVMConstInt(LLVMInt32Type(), velem->instance_divisor, 0),
+                            "instance_divisor");
+   }
 
+   /* limit index to min(inex, vb_max_index) */
+   cond = LLVMBuildICmp(builder, LLVMIntULE, index, vb_max_index, "");
    index = LLVMBuildSelect(builder, cond, index, vb_max_index, "");
 
    stride = LLVMBuildMul(builder, vb_stride, index, "");
@@ -586,13 +669,14 @@ convert_to_aos(LLVMBuilderRef builder,
 static void
 draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
 {
-   LLVMTypeRef arg_types[7];
+   LLVMTypeRef arg_types[8];
    LLVMTypeRef func_type;
    LLVMValueRef context_ptr;
    LLVMBasicBlockRef block;
    LLVMBuilderRef builder;
    LLVMValueRef start, end, count, stride, step, io_itr;
    LLVMValueRef io_ptr, vbuffers_ptr, vb_ptr;
+   LLVMValueRef instance_id;
    struct draw_context *draw = llvm->draw;
    unsigned i, j;
    struct lp_build_context bld;
@@ -601,6 +685,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
    const int max_vertices = 4;
    LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS];
    void *code;
+   struct lp_build_sampler_soa *sampler = 0;
 
    arg_types[0] = llvm->context_ptr_type;           /* context */
    arg_types[1] = llvm->vertex_header_ptr_type;     /* vertex_header */
@@ -609,6 +694,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
    arg_types[4] = LLVMInt32Type();                  /* count */
    arg_types[5] = LLVMInt32Type();                  /* stride */
    arg_types[6] = llvm->vb_ptr_type;                /* pipe_vertex_buffer's */
+   arg_types[7] = LLVMInt32Type();                  /* instance_id */
 
    func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0);
 
@@ -625,6 +711,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
    count        = LLVMGetParam(variant->function, 4);
    stride       = LLVMGetParam(variant->function, 5);
    vb_ptr       = LLVMGetParam(variant->function, 6);
+   instance_id  = LLVMGetParam(variant->function, 7);
 
    lp_build_name(context_ptr, "context");
    lp_build_name(io_ptr, "io");
@@ -633,6 +720,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
    lp_build_name(count, "count");
    lp_build_name(stride, "stride");
    lp_build_name(vb_ptr, "vb");
+   lp_build_name(instance_id, "instance_id");
 
    /*
     * Function body
@@ -648,6 +736,10 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
 
    step = LLVMConstInt(LLVMInt32Type(), max_vertices, 0);
 
+   /* code generated texture sampling */
+   sampler = draw_llvm_sampler_soa_create(variant->key.sampler,
+                                          context_ptr);
+
 #if DEBUG_STORE
    lp_build_printf(builder, "start = %d, end = %d, step = %d\n",
                    start, end, step);
@@ -678,7 +770,8 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
             LLVMValueRef vb = LLVMBuildGEP(builder, vb_ptr,
                                            &vb_index, 1, "");
             generate_fetch(builder, vbuffers_ptr,
-                           &aos_attribs[j][i], velem, vb, true_index);
+                           &aos_attribs[j][i], velem, vb, true_index,
+                           instance_id);
          }
       }
       convert_to_soa(builder, aos_attribs, inputs,
@@ -689,7 +782,8 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
                   builder,
                   outputs,
                   ptr_aos,
-                  context_ptr);
+                  context_ptr,
+                  sampler);
 
       convert_to_aos(builder, io, outputs,
                      draw->vs.vertex_shader->info.num_outputs,
@@ -697,6 +791,8 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
    }
    lp_build_loop_end_cond(builder, end, step, LLVMIntUGE, &lp_loop);
 
+   sampler->destroy(sampler);
+
    LLVMBuildRetVoid(builder);
 
    LLVMDisposeBuilder(builder);
@@ -730,13 +826,14 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
 static void
 draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
 {
-   LLVMTypeRef arg_types[7];
+   LLVMTypeRef arg_types[8];
    LLVMTypeRef func_type;
    LLVMValueRef context_ptr;
    LLVMBasicBlockRef block;
    LLVMBuilderRef builder;
    LLVMValueRef fetch_elts, fetch_count, stride, step, io_itr;
    LLVMValueRef io_ptr, vbuffers_ptr, vb_ptr;
+   LLVMValueRef instance_id;
    struct draw_context *draw = llvm->draw;
    unsigned i, j;
    struct lp_build_context bld;
@@ -747,6 +844,7 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
    LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS];
    LLVMValueRef fetch_max;
    void *code;
+   struct lp_build_sampler_soa *sampler = 0;
 
    arg_types[0] = llvm->context_ptr_type;               /* context */
    arg_types[1] = llvm->vertex_header_ptr_type;         /* vertex_header */
@@ -755,14 +853,17 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
    arg_types[4] = LLVMInt32Type();                      /* fetch_count */
    arg_types[5] = LLVMInt32Type();                      /* stride */
    arg_types[6] = llvm->vb_ptr_type;                    /* pipe_vertex_buffer's */
+   arg_types[7] = LLVMInt32Type();                      /* instance_id */
 
    func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0);
 
-   variant->function_elts = LLVMAddFunction(llvm->module, "draw_llvm_shader_elts", func_type);
+   variant->function_elts = LLVMAddFunction(llvm->module, "draw_llvm_shader_elts",
+                                            func_type);
    LLVMSetFunctionCallConv(variant->function_elts, LLVMCCallConv);
    for(i = 0; i < Elements(arg_types); ++i)
       if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind)
-         LLVMAddAttribute(LLVMGetParam(variant->function_elts, i), LLVMNoAliasAttribute);
+         LLVMAddAttribute(LLVMGetParam(variant->function_elts, i),
+                          LLVMNoAliasAttribute);
 
    context_ptr  = LLVMGetParam(variant->function_elts, 0);
    io_ptr       = LLVMGetParam(variant->function_elts, 1);
@@ -771,6 +872,7 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
    fetch_count  = LLVMGetParam(variant->function_elts, 4);
    stride       = LLVMGetParam(variant->function_elts, 5);
    vb_ptr       = LLVMGetParam(variant->function_elts, 6);
+   instance_id  = LLVMGetParam(variant->function_elts, 7);
 
    lp_build_name(context_ptr, "context");
    lp_build_name(io_ptr, "io");
@@ -779,6 +881,7 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
    lp_build_name(fetch_count, "fetch_count");
    lp_build_name(stride, "stride");
    lp_build_name(vb_ptr, "vb");
+   lp_build_name(instance_id, "instance_id");
 
    /*
     * Function body
@@ -793,6 +896,10 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
 
    step = LLVMConstInt(LLVMInt32Type(), max_vertices, 0);
 
+   /* code generated texture sampling */
+   sampler = draw_llvm_sampler_soa_create(variant->key.sampler,
+                                          context_ptr);
+
    fetch_max = LLVMBuildSub(builder, fetch_count,
                             LLVMConstInt(LLVMInt32Type(), 1, 0),
                             "fetch_max");
@@ -833,7 +940,8 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
             LLVMValueRef vb = LLVMBuildGEP(builder, vb_ptr,
                                            &vb_index, 1, "");
             generate_fetch(builder, vbuffers_ptr,
-                           &aos_attribs[j][i], velem, vb, true_index);
+                           &aos_attribs[j][i], velem, vb, true_index,
+                           instance_id);
          }
       }
       convert_to_soa(builder, aos_attribs, inputs,
@@ -844,7 +952,8 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
                   builder,
                   outputs,
                   ptr_aos,
-                  context_ptr);
+                  context_ptr,
+                  sampler);
 
       convert_to_aos(builder, io, outputs,
                      draw->vs.vertex_shader->info.num_outputs,
@@ -852,6 +961,8 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
    }
    lp_build_loop_end_cond(builder, fetch_count, step, LLVMIntUGE, &lp_loop);
 
+   sampler->destroy(sampler);
+
    LLVMBuildRetVoid(builder);
 
    LLVMDisposeBuilder(builder);
@@ -885,6 +996,8 @@ void
 draw_llvm_make_variant_key(struct draw_llvm *llvm,
                            struct draw_llvm_variant_key *key)
 {
+   unsigned i;
+
    memset(key, 0, sizeof(struct draw_llvm_variant_key));
 
    key->nr_vertex_elements = llvm->draw->pt.nr_vertex_elements;
@@ -896,4 +1009,72 @@ draw_llvm_make_variant_key(struct draw_llvm *llvm,
    memcpy(&key->vs,
           &llvm->draw->vs.vertex_shader->state,
           sizeof(struct pipe_shader_state));
+
+   /* if the driver implemented the sampling hooks then
+    * setup our sampling state */
+   if (llvm->draw->num_sampler_views && llvm->draw->num_samplers) {
+      for(i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; ++i) {
+         struct draw_vertex_shader *shader = llvm->draw->vs.vertex_shader;
+         if(shader->info.file_mask[TGSI_FILE_SAMPLER] & (1 << i))
+            lp_sampler_static_state(&key->sampler[i],
+                                    llvm->draw->sampler_views[i],
+                                    llvm->draw->samplers[i]);
+      }
+   }
+}
+
+void
+draw_llvm_set_mapped_texture(struct draw_context *draw,
+                             unsigned sampler_idx,
+                             uint32_t width, uint32_t height, uint32_t depth,
+                             uint32_t last_level,
+                             uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS],
+                             uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS],
+                             const void *data[DRAW_MAX_TEXTURE_LEVELS])
+{
+   unsigned j;
+   struct draw_jit_texture *jit_tex;
+
+   assert(sampler_idx < PIPE_MAX_VERTEX_SAMPLERS);
+
+
+   jit_tex = &draw->llvm->jit_context.textures[sampler_idx];
+
+   jit_tex->width = width;
+   jit_tex->height = height;
+   jit_tex->depth = depth;
+   jit_tex->last_level = last_level;
+
+   for (j = 0; j <= last_level; j++) {
+      jit_tex->data[j] = data[j];
+      jit_tex->row_stride[j] = row_stride[j];
+      jit_tex->img_stride[j] = img_stride[j];
+   }
+}
+
+void
+draw_llvm_destroy_variant(struct draw_llvm_variant *variant)
+{
+   struct draw_llvm *llvm = variant->llvm;
+   struct draw_context *draw = llvm->draw;
+
+   if (variant->function_elts) {
+      if (variant->function_elts)
+         LLVMFreeMachineCodeForFunction(draw->engine,
+                                        variant->function_elts);
+      LLVMDeleteFunction(variant->function_elts);
+   }
+
+   if (variant->function) {
+      if (variant->function)
+         LLVMFreeMachineCodeForFunction(draw->engine,
+                                        variant->function);
+      LLVMDeleteFunction(variant->function);
+   }
+
+   remove_from_list(&variant->list_item_local);
+   variant->shader->variants_cached--;
+   remove_from_list(&variant->list_item_global);
+   llvm->nr_variants--;
+   FREE(variant);
 }
diff --git a/src/gallium/auxiliary/draw/draw_llvm.h b/src/gallium/auxiliary/draw/draw_llvm.h
index 58fee7f9d6..4addb47d2d 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.h
+++ b/src/gallium/auxiliary/draw/draw_llvm.h
@@ -1,28 +1,71 @@
-#ifndef HAVE_LLVM_H
-#define HAVE_LLVM_H
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef DRAW_LLVM_H
+#define DRAW_LLVM_H
 
 #include "draw/draw_private.h"
 
+#include "draw/draw_vs.h"
+#include "gallivm/lp_bld_sample.h"
+
 #include "pipe/p_context.h"
+#include "util/u_simple_list.h"
 
 #include <llvm-c/Core.h>
 #include <llvm-c/Analysis.h>
 #include <llvm-c/Target.h>
 #include <llvm-c/ExecutionEngine.h>
 
+#define DRAW_MAX_TEXTURE_LEVELS 13  /* 4K x 4K for now */
+
+struct draw_llvm;
+struct llvm_vertex_shader;
+
 struct draw_jit_texture
 {
    uint32_t width;
    uint32_t height;
-   uint32_t stride;
-   const void *data;
+   uint32_t depth;
+   uint32_t last_level;
+   uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS];
+   uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS];
+   const void *data[DRAW_MAX_TEXTURE_LEVELS];
 };
 
 enum {
    DRAW_JIT_TEXTURE_WIDTH = 0,
    DRAW_JIT_TEXTURE_HEIGHT,
-   DRAW_JIT_TEXTURE_STRIDE,
-   DRAW_JIT_TEXTURE_DATA
+   DRAW_JIT_TEXTURE_DEPTH,
+   DRAW_JIT_TEXTURE_LAST_LEVEL,
+   DRAW_JIT_TEXTURE_ROW_STRIDE,
+   DRAW_JIT_TEXTURE_IMG_STRIDE,
+   DRAW_JIT_TEXTURE_DATA,
+   DRAW_JIT_TEXTURE_NUM_FIELDS  /* number of fields above */
 };
 
 enum {
@@ -48,7 +91,7 @@ struct draw_jit_context
    const float *gs_constants;
 
 
-   struct draw_jit_texture textures[PIPE_MAX_SAMPLERS];
+   struct draw_jit_texture textures[PIPE_MAX_VERTEX_SAMPLERS];
 };
 
 
@@ -58,10 +101,10 @@ struct draw_jit_context
 #define draw_jit_context_gs_constants(_builder, _ptr) \
    lp_build_struct_get(_builder, _ptr, 1, "gs_constants")
 
-#define DRAW_JIT_CONTEXT_TEXTURES_INDEX 2
+#define DRAW_JIT_CTX_TEXTURES 2
 
 #define draw_jit_context_textures(_builder, _ptr) \
-   lp_build_struct_get_ptr(_builder, _ptr, DRAW_JIT_CONTEXT_TEXTURES_INDEX, "textures")
+   lp_build_struct_get_ptr(_builder, _ptr, DRAW_JIT_CTX_TEXTURES, "textures")
 
 
 
@@ -92,7 +135,8 @@ typedef void
                       unsigned start,
                       unsigned count,
                       unsigned stride,
-                      struct pipe_vertex_buffer *vertex_buffers);
+                      struct pipe_vertex_buffer *vertex_buffers,
+                      unsigned instance_id);
 
 
 typedef void
@@ -102,13 +146,54 @@ typedef void
                            const unsigned *fetch_elts,
                            unsigned fetch_count,
                            unsigned stride,
-                           struct pipe_vertex_buffer *vertex_buffers);
+                           struct pipe_vertex_buffer *vertex_buffers,
+                           unsigned instance_id);
+
+struct draw_llvm_variant_key
+{
+   struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
+   unsigned                   nr_vertex_elements;
+   struct pipe_shader_state   vs;
+   struct lp_sampler_static_state sampler[PIPE_MAX_VERTEX_SAMPLERS];
+};
+
+struct draw_llvm_variant_list_item
+{
+   struct draw_llvm_variant *base;
+   struct draw_llvm_variant_list_item *next, *prev;
+};
+
+struct draw_llvm_variant
+{
+   struct draw_llvm_variant_key key;
+   LLVMValueRef function;
+   LLVMValueRef function_elts;
+   draw_jit_vert_func jit_func;
+   draw_jit_vert_func_elts jit_func_elts;
+
+   struct llvm_vertex_shader *shader;
+
+   struct draw_llvm *llvm;
+   struct draw_llvm_variant_list_item list_item_global;
+   struct draw_llvm_variant_list_item list_item_local;
+};
+
+struct llvm_vertex_shader {
+   struct draw_vertex_shader base;
+
+   struct draw_llvm_variant_list_item variants;
+   unsigned variants_created;
+   unsigned variants_cached;
+};
 
 struct draw_llvm {
    struct draw_context *draw;
 
    struct draw_jit_context jit_context;
 
+   struct draw_llvm_variant_list_item vs_variants_list;
+   int nr_variants;
+
    LLVMModuleRef module;
    LLVMExecutionEngineRef engine;
    LLVMModuleProviderRef provider;
@@ -121,23 +206,12 @@ struct draw_llvm {
    LLVMTypeRef vb_ptr_type;
 };
 
-struct draw_llvm_variant_key
+static INLINE struct llvm_vertex_shader *
+llvm_vertex_shader(struct draw_vertex_shader *vs)
 {
-   struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
-   unsigned                   nr_vertex_elements;
-   struct pipe_shader_state   vs;
-};
+   return (struct llvm_vertex_shader *)vs;
+}
 
-struct draw_llvm_variant
-{
-   struct draw_llvm_variant_key key;
-   LLVMValueRef function;
-   LLVMValueRef function_elts;
-   draw_jit_vert_func jit_func;
-   draw_jit_vert_func_elts jit_func_elts;
-
-   struct draw_llvm_variant *next;
-};
 
 struct draw_llvm *
 draw_llvm_create(struct draw_context *draw);
@@ -146,7 +220,10 @@ void
 draw_llvm_destroy(struct draw_llvm *llvm);
 
 struct draw_llvm_variant *
-draw_llvm_prepare(struct draw_llvm *llvm, int num_inputs);
+draw_llvm_create_variant(struct draw_llvm *llvm, int num_inputs);
+
+void
+draw_llvm_destroy_variant(struct draw_llvm_variant *variant);
 
 void
 draw_llvm_make_variant_key(struct draw_llvm *llvm,
@@ -156,4 +233,18 @@ LLVMValueRef
 draw_llvm_translate_from(LLVMBuilderRef builder,
                          LLVMValueRef vbuffer,
                          enum pipe_format from_format);
+
+struct lp_build_sampler_soa *
+draw_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state,
+                             LLVMValueRef context_ptr);
+
+void
+draw_llvm_set_mapped_texture(struct draw_context *draw,
+                             unsigned sampler_idx,
+                             uint32_t width, uint32_t height, uint32_t depth,
+                             uint32_t last_level,
+                             uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS],
+                             uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS],
+                             const void *data[DRAW_MAX_TEXTURE_LEVELS]);
+
 #endif
diff --git a/src/gallium/auxiliary/draw/draw_llvm_sample.c b/src/gallium/auxiliary/draw/draw_llvm_sample.c
new file mode 100644
index 0000000000..e9811010db
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_llvm_sample.c
@@ -0,0 +1,215 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * Texture sampling code generation
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_shader_tokens.h"
+#include "gallivm/lp_bld_debug.h"
+#include "gallivm/lp_bld_type.h"
+#include "gallivm/lp_bld_sample.h"
+#include "gallivm/lp_bld_tgsi.h"
+
+
+#include "util/u_debug.h"
+#include "util/u_memory.h"
+#include "util/u_pointer.h"
+#include "util/u_string.h"
+
+#include "draw_llvm.h"
+
+
+/**
+ * This provides the bridge between the sampler state store in
+ * lp_jit_context and lp_jit_texture and the sampler code
+ * generator. It provides the texture layout information required by
+ * the texture sampler code generator in terms of the state stored in
+ * lp_jit_context and lp_jit_texture in runtime.
+ */
+struct draw_llvm_sampler_dynamic_state
+{
+   struct lp_sampler_dynamic_state base;
+
+   const struct lp_sampler_static_state *static_state;
+
+   LLVMValueRef context_ptr;
+};
+
+
+/**
+ * This is the bridge between our sampler and the TGSI translator.
+ */
+struct draw_llvm_sampler_soa
+{
+   struct lp_build_sampler_soa base;
+
+   struct draw_llvm_sampler_dynamic_state dynamic_state;
+};
+
+
+/**
+ * Fetch the specified member of the lp_jit_texture structure.
+ * \param emit_load  if TRUE, emit the LLVM load instruction to actually
+ *                   fetch the field's value.  Otherwise, just emit the
+ *                   GEP code to address the field.
+ *
+ * @sa http://llvm.org/docs/GetElementPtr.html
+ */
+static LLVMValueRef
+draw_llvm_texture_member(const struct lp_sampler_dynamic_state *base,
+                         LLVMBuilderRef builder,
+                         unsigned unit,
+                         unsigned member_index,
+                         const char *member_name,
+                         boolean emit_load)
+{
+   struct draw_llvm_sampler_dynamic_state *state =
+      (struct draw_llvm_sampler_dynamic_state *)base;
+   LLVMValueRef indices[4];
+   LLVMValueRef ptr;
+   LLVMValueRef res;
+
+   debug_assert(unit < PIPE_MAX_VERTEX_SAMPLERS);
+
+   /* context[0] */
+   indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   /* context[0].textures */
+   indices[1] = LLVMConstInt(LLVMInt32Type(), DRAW_JIT_CTX_TEXTURES, 0);
+   /* context[0].textures[unit] */
+   indices[2] = LLVMConstInt(LLVMInt32Type(), unit, 0);
+   /* context[0].textures[unit].member */
+   indices[3] = LLVMConstInt(LLVMInt32Type(), member_index, 0);
+
+   ptr = LLVMBuildGEP(builder, state->context_ptr, indices, Elements(indices), "");
+
+   if (emit_load)
+      res = LLVMBuildLoad(builder, ptr, "");
+   else
+      res = ptr;
+
+   lp_build_name(res, "context.texture%u.%s", unit, member_name);
+
+   return res;
+}
+
+
+/**
+ * Helper macro to instantiate the functions that generate the code to
+ * fetch the members of lp_jit_texture to fulfill the sampler code
+ * generator requests.
+ *
+ * This complexity is the price we have to pay to keep the texture
+ * sampler code generator a reusable module without dependencies to
+ * llvmpipe internals.
+ */
+#define DRAW_LLVM_TEXTURE_MEMBER(_name, _index, _emit_load)  \
+   static LLVMValueRef \
+   draw_llvm_texture_##_name( const struct lp_sampler_dynamic_state *base, \
+                              LLVMBuilderRef builder,                   \
+                              unsigned unit)                            \
+   { \
+      return draw_llvm_texture_member(base, builder, unit, _index, #_name, _emit_load ); \
+   }
+
+
+DRAW_LLVM_TEXTURE_MEMBER(width,      DRAW_JIT_TEXTURE_WIDTH, TRUE)
+DRAW_LLVM_TEXTURE_MEMBER(height,     DRAW_JIT_TEXTURE_HEIGHT, TRUE)
+DRAW_LLVM_TEXTURE_MEMBER(depth,      DRAW_JIT_TEXTURE_DEPTH, TRUE)
+DRAW_LLVM_TEXTURE_MEMBER(last_level, DRAW_JIT_TEXTURE_LAST_LEVEL, TRUE)
+DRAW_LLVM_TEXTURE_MEMBER(row_stride, DRAW_JIT_TEXTURE_ROW_STRIDE, FALSE)
+DRAW_LLVM_TEXTURE_MEMBER(img_stride, DRAW_JIT_TEXTURE_IMG_STRIDE, FALSE)
+DRAW_LLVM_TEXTURE_MEMBER(data_ptr,   DRAW_JIT_TEXTURE_DATA, FALSE)
+
+
+static void
+draw_llvm_sampler_soa_destroy(struct lp_build_sampler_soa *sampler)
+{
+   FREE(sampler);
+}
+
+
+/**
+ * Fetch filtered values from texture.
+ * The 'texel' parameter returns four vectors corresponding to R, G, B, A.
+ */
+static void
+draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
+                                       LLVMBuilderRef builder,
+                                       struct lp_type type,
+                                       unsigned unit,
+                                       unsigned num_coords,
+                                       const LLVMValueRef *coords,
+                                       const LLVMValueRef *ddx,
+                                       const LLVMValueRef *ddy,
+                                       LLVMValueRef lod_bias, /* optional */
+                                       LLVMValueRef explicit_lod, /* optional */
+                                       LLVMValueRef *texel)
+{
+   struct draw_llvm_sampler_soa *sampler = (struct draw_llvm_sampler_soa *)base;
+
+   assert(unit < PIPE_MAX_VERTEX_SAMPLERS);
+
+   lp_build_sample_soa(builder,
+                       &sampler->dynamic_state.static_state[unit],
+                       &sampler->dynamic_state.base,
+                       type,
+                       unit,
+                       num_coords, coords,
+                       ddx, ddy,
+                       lod_bias, explicit_lod,
+                       texel);
+}
+
+
+struct lp_build_sampler_soa *
+draw_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state,
+                             LLVMValueRef context_ptr)
+{
+   struct draw_llvm_sampler_soa *sampler;
+
+   sampler = CALLOC_STRUCT(draw_llvm_sampler_soa);
+   if(!sampler)
+      return NULL;
+
+   sampler->base.destroy = draw_llvm_sampler_soa_destroy;
+   sampler->base.emit_fetch_texel = draw_llvm_sampler_soa_emit_fetch_texel;
+   sampler->dynamic_state.base.width = draw_llvm_texture_width;
+   sampler->dynamic_state.base.height = draw_llvm_texture_height;
+   sampler->dynamic_state.base.depth = draw_llvm_texture_depth;
+   sampler->dynamic_state.base.last_level = draw_llvm_texture_last_level;
+   sampler->dynamic_state.base.row_stride = draw_llvm_texture_row_stride;
+   sampler->dynamic_state.base.img_stride = draw_llvm_texture_img_stride;
+   sampler->dynamic_state.base.data_ptr = draw_llvm_texture_data_ptr;
+   sampler->dynamic_state.static_state = static_state;
+   sampler->dynamic_state.context_ptr = context_ptr;
+
+   return &sampler->base;
+}
+
diff --git a/src/gallium/auxiliary/draw/draw_llvm_translate.c b/src/gallium/auxiliary/draw/draw_llvm_translate.c
index d7da7ed357..6ebe1f7de4 100644
--- a/src/gallium/auxiliary/draw/draw_llvm_translate.c
+++ b/src/gallium/auxiliary/draw/draw_llvm_translate.c
@@ -7,6 +7,7 @@
 #include "gallivm/lp_bld_struct.h"
 #include "gallivm/lp_bld_format.h"
 #include "gallivm/lp_bld_debug.h"
+#include "gallivm/lp_bld_type.h"
 
 #include "util/u_memory.h"
 #include "util/u_format.h"
@@ -466,6 +467,7 @@ draw_llvm_translate_from(LLVMBuilderRef builder,
    const struct util_format_description *format_desc;
    LLVMValueRef zero;
    int i;
+   struct lp_type type = lp_float32_vec4_type();
 
    /*
     * The above can only cope with straight arrays: no bitfields,
@@ -493,5 +495,5 @@ draw_llvm_translate_from(LLVMBuilderRef builder,
 
    format_desc = util_format_description(from_format);
    zero = LLVMConstNull(LLVMInt32Type());
-   return lp_build_fetch_rgba_aos(builder, format_desc, vbuffer, zero, zero);
+   return lp_build_fetch_rgba_aos(builder, format_desc, type, vbuffer, zero, zero, zero);
 }
diff --git a/src/gallium/auxiliary/draw/draw_pipe.c b/src/gallium/auxiliary/draw/draw_pipe.c
index 83556f10a8..8cd75ecf9a 100644
--- a/src/gallium/auxiliary/draw/draw_pipe.c
+++ b/src/gallium/auxiliary/draw/draw_pipe.c
@@ -177,15 +177,15 @@ static void do_triangle( struct draw_context *draw,
                 ( DRAW_PIPE_RESET_STIPPLE |     \
                   DRAW_PIPE_EDGE_FLAG_0 |       \
                   DRAW_PIPE_EDGE_FLAG_1 ),      \
-                verts + stride * elts[i0],      \
-                verts + stride * elts[i1],      \
-                verts + stride * elts[i2]);     \
+                verts + stride * (elts[i0] & ~DRAW_PIPE_FLAG_MASK),     \
+                verts + stride * (elts[i1] & ~DRAW_PIPE_FLAG_MASK),     \
+                verts + stride * (elts[i2] & ~DRAW_PIPE_FLAG_MASK));    \
    do_triangle( draw,                           \
                 ( DRAW_PIPE_EDGE_FLAG_1 |       \
                   DRAW_PIPE_EDGE_FLAG_2 ),      \
-                verts + stride * elts[i0],      \
-                verts + stride * elts[i2],      \
-                verts + stride * elts[i3])
+                verts + stride * (elts[i0] & ~DRAW_PIPE_FLAG_MASK),     \
+                verts + stride * (elts[i2] & ~DRAW_PIPE_FLAG_MASK),     \
+                verts + stride * (elts[i3] & ~DRAW_PIPE_FLAG_MASK))
 
 /* emit last quad vertex as last vertex in triangles */
 #define QUAD_LAST_PV(i0,i1,i2,i3)               \
@@ -193,15 +193,15 @@ static void do_triangle( struct draw_context *draw,
                 ( DRAW_PIPE_RESET_STIPPLE |     \
                   DRAW_PIPE_EDGE_FLAG_0 |       \
                   DRAW_PIPE_EDGE_FLAG_2 ),      \
-                verts + stride * elts[i0],      \
-                verts + stride * elts[i1],      \
-                verts + stride * elts[i3]);     \
+                verts + stride * (elts[i0] & ~DRAW_PIPE_FLAG_MASK),     \
+                verts + stride * (elts[i1] & ~DRAW_PIPE_FLAG_MASK),     \
+                verts + stride * (elts[i3] & ~DRAW_PIPE_FLAG_MASK));    \
    do_triangle( draw,                           \
                 ( DRAW_PIPE_EDGE_FLAG_0 |       \
                   DRAW_PIPE_EDGE_FLAG_1 ),      \
-                verts + stride * elts[i1],      \
-                verts + stride * elts[i2],      \
-                verts + stride * elts[i3])
+                verts + stride * (elts[i1] & ~DRAW_PIPE_FLAG_MASK),     \
+                verts + stride * (elts[i2] & ~DRAW_PIPE_FLAG_MASK),     \
+                verts + stride * (elts[i3] & ~DRAW_PIPE_FLAG_MASK))
 
 #define TRIANGLE(flags,i0,i1,i2)                                        \
    do_triangle( draw,                                                   \
@@ -218,7 +218,7 @@ static void do_triangle( struct draw_context *draw,
 
 #define POINT(i0)                               \
    do_point( draw,                              \
-             verts + stride * elts[i0] )
+             verts + stride * (elts[i0] & ~DRAW_PIPE_FLAG_MASK) )
 
 #define FUNC pipe_run
 #define ARGS                                    \
@@ -260,7 +260,7 @@ void draw_pipeline_run( struct draw_context *draw,
                         const struct draw_prim_info *prim_info)
 {
    unsigned i, start;
-   
+
    draw->pipeline.verts = (char *)vert_info->verts;
    draw->pipeline.vertex_stride = vert_info->stride;
    draw->pipeline.vertex_count = vert_info->count;
@@ -296,14 +296,14 @@ void draw_pipeline_run( struct draw_context *draw,
                   DRAW_PIPE_EDGE_FLAG_0 |                        \
                   DRAW_PIPE_EDGE_FLAG_1 ),                       \
                 verts + stride * ((i0) & ~DRAW_PIPE_FLAG_MASK),  \
-                verts + stride * (i1),                           \
-                verts + stride * (i2));                          \
+                verts + stride * ((i1) & ~DRAW_PIPE_FLAG_MASK),  \
+                verts + stride * ((i2) & ~DRAW_PIPE_FLAG_MASK)); \
    do_triangle( draw,                                            \
                 ( DRAW_PIPE_EDGE_FLAG_1 |                        \
                   DRAW_PIPE_EDGE_FLAG_2 ),                       \
                 verts + stride * ((i0) & ~DRAW_PIPE_FLAG_MASK),  \
-                verts + stride * (i2),                           \
-                verts + stride * (i3))
+                verts + stride * ((i2) & ~DRAW_PIPE_FLAG_MASK),  \
+                verts + stride * ((i3) & ~DRAW_PIPE_FLAG_MASK))
 
 /* emit last quad vertex as last vertex in triangles */
 #define QUAD_LAST_PV(i0,i1,i2,i3)                                \
@@ -312,31 +312,31 @@ void draw_pipeline_run( struct draw_context *draw,
                   DRAW_PIPE_EDGE_FLAG_0 |                        \
                   DRAW_PIPE_EDGE_FLAG_2 ),                       \
                 verts + stride * ((i0) & ~DRAW_PIPE_FLAG_MASK),  \
-                verts + stride * (i1),                           \
-                verts + stride * (i3));                          \
+                verts + stride * ((i1) & ~DRAW_PIPE_FLAG_MASK),  \
+                verts + stride * ((i3) & ~DRAW_PIPE_FLAG_MASK)); \
    do_triangle( draw,                                            \
                 ( DRAW_PIPE_EDGE_FLAG_0 |                        \
                   DRAW_PIPE_EDGE_FLAG_1 ),                       \
                 verts + stride * ((i1) & ~DRAW_PIPE_FLAG_MASK),  \
-                verts + stride * (i2),                           \
-                verts + stride * (i3))
+                verts + stride * ((i2) & ~DRAW_PIPE_FLAG_MASK),  \
+                verts + stride * ((i3) & ~DRAW_PIPE_FLAG_MASK))
 
 #define TRIANGLE(flags,i0,i1,i2)                                 \
    do_triangle( draw,                                            \
                 flags,  /* flags */                              \
                 verts + stride * ((i0) & ~DRAW_PIPE_FLAG_MASK),  \
-                verts + stride * (i1),                           \
-                verts + stride * (i2))
+                verts + stride * ((i1) & ~DRAW_PIPE_FLAG_MASK),  \
+                verts + stride * ((i2) & ~DRAW_PIPE_FLAG_MASK))
 
 #define LINE(flags,i0,i1)                                   \
    do_line( draw,                                           \
             flags,                                          \
             verts + stride * ((i0) & ~DRAW_PIPE_FLAG_MASK), \
-            verts + stride * (i1))
+            verts + stride * ((i1) & ~DRAW_PIPE_FLAG_MASK))
 
 #define POINT(i0)                               \
    do_point( draw,                              \
-             verts + stride * i0 )
+             verts + stride * ((i0) & ~DRAW_PIPE_FLAG_MASK) )
 
 #define FUNC pipe_run_linear
 #define ARGS                                    \
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
index debd17fd74..c0135f5bb7 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
@@ -425,7 +425,8 @@ aaline_create_texture(struct aaline_stage *aaline)
 
    /* Fill in mipmap images.
     * Basically each level is solid opaque, except for the outermost
-    * texels which are zero.  Special case the 1x1 and 2x2 levels.
+    * texels which are zero.  Special case the 1x1 and 2x2 levels
+    * (though, those levels shouldn't be used - see the max_lod setting).
     */
    for (level = 0; level <= MAX_TEXTURE_LEVEL; level++) {
       struct pipe_transfer *transfer;
@@ -497,7 +498,8 @@ aaline_create_sampler(struct aaline_stage *aaline)
    sampler.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
    sampler.normalized_coords = 1;
    sampler.min_lod = 0.0f;
-   sampler.max_lod = MAX_TEXTURE_LEVEL;
+   /* avoid using the 1x1 and 2x2 mipmap levels */
+   sampler.max_lod = MAX_TEXTURE_LEVEL - 2;
 
    aaline->sampler_cso = pipe->create_sampler_state(pipe, &sampler);
    if (aaline->sampler_cso == NULL)
@@ -669,8 +671,8 @@ aaline_first_line(struct draw_stage *stage, struct prim_header *header)
 
    assert(draw->rasterizer->line_smooth);
 
-   if (draw->rasterizer->line_width <= 3.0)
-      aaline->half_line_width = 1.5f;
+   if (draw->rasterizer->line_width <= 2.2)
+      aaline->half_line_width = 1.1f;
    else
       aaline->half_line_width = 0.5f * draw->rasterizer->line_width;
 
diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c
index 122b1c7968..1cf6ee7a7f 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_clip.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c
@@ -262,6 +262,7 @@ do_clip_tri( struct draw_stage *stage,
 
       clipmask &= ~(1<<plane_idx);
 
+      assert(n < MAX_CLIPPED_VERTICES);
       inlist[n] = inlist[0]; /* prevent rotation of vertices */
 
       for (i = 1; i <= n; i++) {
@@ -270,11 +271,17 @@ do_clip_tri( struct draw_stage *stage,
 	 float dp = dot4( vert->clip, plane );
 
 	 if (!IS_NEGATIVE(dp_prev)) {
+            assert(outcount < MAX_CLIPPED_VERTICES);
 	    outlist[outcount++] = vert_prev;
 	 }
 
 	 if (DIFFERENT_SIGNS(dp, dp_prev)) {
-	    struct vertex_header *new_vert = clipper->stage.tmp[tmpnr++];
+	    struct vertex_header *new_vert;
+
+            assert(tmpnr < MAX_CLIPPED_VERTICES+1);
+            new_vert = clipper->stage.tmp[tmpnr++];
+
+            assert(outcount < MAX_CLIPPED_VERTICES);
 	    outlist[outcount++] = new_vert;
 
 	    if (IS_NEGATIVE(dp)) {
@@ -317,12 +324,14 @@ do_clip_tri( struct draw_stage *stage,
    if (clipper->flat) {
       if (stage->draw->rasterizer->flatshade_first) {
          if (inlist[0] != header->v[0]) {
+            assert(tmpnr < MAX_CLIPPED_VERTICES + 1);
             inlist[0] = dup_vert(stage, inlist[0], tmpnr++);
             copy_colors(stage, inlist[0], header->v[0]);
          }
       }
       else {
          if (inlist[0] != header->v[2]) {
+            assert(tmpnr < MAX_CLIPPED_VERTICES + 1);
             inlist[0] = dup_vert(stage, inlist[0], tmpnr++);
             copy_colors(stage, inlist[0], header->v[2]);
          }
diff --git a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
index fff960c7eb..ed9a53e154 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
@@ -363,8 +363,12 @@ generate_pstip_fs(struct pstip_stage *pstip)
    assert(pstip->fs->sampler_unit < PIPE_MAX_SAMPLERS);
 
    pstip->fs->pstip_fs = pstip->driver_create_fs_state(pstip->pipe, &pstip_fs);
-
+   
    FREE((void *)pstip_fs.tokens);
+
+   if (!pstip->fs->pstip_fs)
+      return FALSE;
+
    return TRUE;
 }
 
@@ -603,13 +607,16 @@ pstip_destroy(struct draw_stage *stage)
 }
 
 
+/** Create a new polygon stipple drawing stage object */
 static struct pstip_stage *
-draw_pstip_stage(struct draw_context *draw)
+draw_pstip_stage(struct draw_context *draw, struct pipe_context *pipe)
 {
    struct pstip_stage *pstip = CALLOC_STRUCT(pstip_stage);
    if (pstip == NULL)
       goto fail;
 
+   pstip->pipe = pipe;
+
    pstip->stage.draw = draw;
    pstip->stage.name = "pstip";
    pstip->stage.next = NULL;
@@ -765,14 +772,12 @@ draw_install_pstipple_stage(struct draw_context *draw,
    /*
     * Create / install pgon stipple drawing / prim stage
     */
-   pstip = draw_pstip_stage( draw );
+   pstip = draw_pstip_stage( draw, pipe );
    if (pstip == NULL)
       goto fail;
 
    draw->pipeline.pstipple = &pstip->stage;
 
-   pstip->pipe = pipe;
-
    /* create special texture, sampler state */
    if (!pstip_create_texture(pstip))
       goto fail;
diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
index 3e6e538995..ee2945c7c9 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
@@ -226,6 +226,7 @@ static void widepoint_first_point( struct draw_stage *stage,
 
    if (rast->gl_rasterization_rules) {
       wide->xbias = 0.125;
+      wide->ybias = -0.125;
    }
 
    /* Disable triangle culling, stippling, unfilled mode etc. */
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 4584033bc2..058aeedc17 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -48,6 +48,7 @@
 
 #ifdef HAVE_LLVM
 #include <llvm-c/ExecutionEngine.h>
+struct draw_llvm;
 #endif
 
 
@@ -81,6 +82,9 @@ struct vertex_header {
 #define UNDEFINED_VERTEX_ID 0xffff
 
 
+/* maximum number of shader variants we can cache */
+#define DRAW_MAX_SHADER_VARIANTS 1024
+
 /**
  * Private context for the drawing module.
  */
@@ -245,6 +249,7 @@ struct draw_context
     */
    float plane[12][4];
    unsigned nr_planes;
+   boolean depth_clamp;
 
    /* If a prim stage introduces new vertex attributes, they'll be stored here
     */
@@ -259,9 +264,15 @@ struct draw_context
    unsigned instance_id;
 
 #ifdef HAVE_LLVM
+   struct draw_llvm *llvm;
    LLVMExecutionEngineRef engine;
 #endif
 
+   struct pipe_sampler_view *sampler_views[PIPE_MAX_VERTEX_SAMPLERS];
+   unsigned num_sampler_views;
+   const struct pipe_sampler_state *samplers[PIPE_MAX_VERTEX_SAMPLERS];
+   unsigned num_samplers;
+
    void *driver_private;
 };
 
diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c
index 02c97fec81..92d4113b4c 100644
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -34,9 +34,11 @@
 #include "draw/draw_gs.h"
 #include "draw/draw_private.h"
 #include "draw/draw_pt.h"
+#include "draw/draw_vs.h"
 #include "tgsi/tgsi_dump.h"
 #include "util/u_math.h"
 #include "util/u_prim.h"
+#include "util/u_format.h"
 
 
 DEBUG_GET_ONCE_BOOL_OPTION(draw_fse, "DRAW_FSE", FALSE)
@@ -69,7 +71,6 @@ draw_pt_arrays(struct draw_context *draw,
    struct draw_pt_front_end *frontend = NULL;
    struct draw_pt_middle_end *middle = NULL;
    unsigned opt = 0;
-   unsigned out_prim = prim;
 
    /* Sanitize primitive length:
     */
@@ -80,18 +81,19 @@ draw_pt_arrays(struct draw_context *draw,
       if (count < first)
          return TRUE;
    }
-   if (draw->gs.geometry_shader) {
-      out_prim = draw->gs.geometry_shader->output_primitive;
-   }
 
    if (!draw->force_passthrough) {
+      unsigned gs_out_prim = (draw->gs.geometry_shader ? 
+                              draw->gs.geometry_shader->output_primitive :
+                              prim);
+
       if (!draw->render) {
          opt |= PT_PIPELINE;
       }
 
       if (draw_need_pipeline(draw,
                              draw->rasterizer,
-                             out_prim)) {
+                             gs_out_prim)) {
          opt |= PT_PIPELINE;
       }
 
@@ -102,7 +104,7 @@ draw_pt_arrays(struct draw_context *draw,
       opt |= PT_SHADE;
    }
 
-   if (draw->pt.middle.llvm && !draw->gs.geometry_shader) {
+   if (draw->pt.middle.llvm) {
       middle = draw->pt.middle.llvm;
    } else {
       if (opt == 0)
@@ -122,7 +124,7 @@ draw_pt_arrays(struct draw_context *draw,
       frontend = draw->pt.front.varray;
    }
 
-   frontend->prepare( frontend, prim, out_prim, middle, opt );
+   frontend->prepare( frontend, prim, middle, opt );
 
    frontend->run(frontend,
                  draw_pt_elt_func(draw),
@@ -265,31 +267,38 @@ draw_print_arrays(struct draw_context *draw, uint prim, int start, uint count)
          case PIPE_FORMAT_R32_FLOAT:
             {
                float *v = (float *) ptr;
-               debug_printf("%f  @ %p\n", v[0], (void *) v);
+               debug_printf("R %f  @ %p\n", v[0], (void *) v);
             }
             break;
          case PIPE_FORMAT_R32G32_FLOAT:
             {
                float *v = (float *) ptr;
-               debug_printf("%f %f  @ %p\n", v[0], v[1], (void *) v);
+               debug_printf("RG %f %f  @ %p\n", v[0], v[1], (void *) v);
             }
             break;
          case PIPE_FORMAT_R32G32B32_FLOAT:
             {
                float *v = (float *) ptr;
-               debug_printf("%f %f %f  @ %p\n", v[0], v[1], v[2], (void *) v);
+               debug_printf("RGB %f %f %f  @ %p\n", v[0], v[1], v[2], (void *) v);
             }
             break;
          case PIPE_FORMAT_R32G32B32A32_FLOAT:
             {
                float *v = (float *) ptr;
-               debug_printf("%f %f %f %f  @ %p\n", v[0], v[1], v[2], v[3],
+               debug_printf("RGBA %f %f %f %f  @ %p\n", v[0], v[1], v[2], v[3],
                             (void *) v);
             }
             break;
+         case PIPE_FORMAT_B8G8R8A8_UNORM:
+            {
+               ubyte *u = (ubyte *) ptr;
+               debug_printf("BGRA %d %d %d %d  @ %p\n", u[0], u[1], u[2], u[3],
+                            (void *) u);
+            }
+            break;
          default:
-            debug_printf("other format (fix me)\n");
-            ;
+            debug_printf("other format %s (fix me)\n",
+                     util_format_name(draw->pt.vertex_element[j].src_format));
          }
       }
    }
@@ -297,11 +306,8 @@ draw_print_arrays(struct draw_context *draw, uint prim, int start, uint count)
 
 
 /**
- * Draw vertex arrays
- * This is the main entrypoint into the drawing module.
- * \param prim  one of PIPE_PRIM_x
- * \param start  index of first vertex to draw
- * \param count  number of vertices to draw
+ * Non-instanced drawing.
+ * \sa draw_arrays_instanced
  */
 void
 draw_arrays(struct draw_context *draw, unsigned prim,
@@ -310,6 +316,20 @@ draw_arrays(struct draw_context *draw, unsigned prim,
    draw_arrays_instanced(draw, prim, start, count, 0, 1);
 }
 
+
+/**
+ * Draw vertex arrays.
+ * This is the main entrypoint into the drawing module.
+ * If drawing an indexed primitive, the draw_set_mapped_element_buffer_range()
+ * function should have already been called to specify the element/index buffer
+ * information.
+ *
+ * \param prim  one of PIPE_PRIM_x
+ * \param start  index of first vertex to draw
+ * \param count  number of vertices to draw
+ * \param startInstance  number for the first primitive instance (usually 0).
+ * \param instanceCount  number of instances to draw (1=non-instanced)
+ */
 void
 draw_arrays_instanced(struct draw_context *draw,
                       unsigned mode,
@@ -329,26 +349,30 @@ draw_arrays_instanced(struct draw_context *draw,
    if (0)
       draw_print_arrays(draw, mode, start, MIN2(count, 20));
 
-#if 0
-   {
-      int i;
+   if (0) {
+      unsigned int i;
       debug_printf("draw_arrays(mode=%u start=%u count=%u):\n",
                    mode, start, count);
       tgsi_dump(draw->vs.vertex_shader->state.tokens, 0);
       debug_printf("Elements:\n");
       for (i = 0; i < draw->pt.nr_vertex_elements; i++) {
-         debug_printf("  format=%s\n",
+         debug_printf("  %u: src_offset=%u  inst_div=%u   vbuf=%u  format=%s\n",
+                      i,
+                      draw->pt.vertex_element[i].src_offset,
+                      draw->pt.vertex_element[i].instance_divisor,
+                      draw->pt.vertex_element[i].vertex_buffer_index,
                       util_format_name(draw->pt.vertex_element[i].src_format));
       }
       debug_printf("Buffers:\n");
       for (i = 0; i < draw->pt.nr_vertex_buffers; i++) {
-         debug_printf("  stride=%u offset=%u ptr=%p\n",
+         debug_printf("  %u: stride=%u maxindex=%u offset=%u ptr=%p\n",
+                      i,
                       draw->pt.vertex_buffer[i].stride,
+                      draw->pt.vertex_buffer[i].max_index,
                       draw->pt.vertex_buffer[i].buffer_offset,
                       draw->pt.user.vbuffer[i]);
       }
    }
-#endif
 
    for (instance = 0; instance < instanceCount; instance++) {
       draw->instance_id = instance + startInstance;
diff --git a/src/gallium/auxiliary/draw/draw_pt.h b/src/gallium/auxiliary/draw/draw_pt.h
index b6741ca83c..44356fba4c 100644
--- a/src/gallium/auxiliary/draw/draw_pt.h
+++ b/src/gallium/auxiliary/draw/draw_pt.h
@@ -62,8 +62,7 @@ struct draw_vertex_info;
  */
 struct draw_pt_front_end {
    void (*prepare)( struct draw_pt_front_end *,
-                    unsigned input_prim,
-                    unsigned output_prim,
+                    unsigned prim,
                     struct draw_pt_middle_end *,
 		    unsigned opt );
 
@@ -87,8 +86,7 @@ struct draw_pt_front_end {
  */
 struct draw_pt_middle_end {
    void (*prepare)( struct draw_pt_middle_end *,
-                    unsigned input_prim,
-                    unsigned output_prim,
+                    unsigned prim,
 		    unsigned opt,
                     unsigned *max_vertices );
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch.c b/src/gallium/auxiliary/draw/draw_pt_fetch.c
index bf799db352..ae12ee24bd 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch.c
@@ -68,31 +68,12 @@ void draw_pt_fetch_prepare( struct pt_fetch *fetch,
 
    fetch->vertex_size = vertex_size;
 
-   /* Always emit/leave space for a vertex header.
-    *
-    * It's worth considering whether the vertex headers should contain
-    * a pointer to the 'data', rather than having it inline.
-    * Something to look at after we've fully switched over to the pt
-    * paths.
+   /* Leave the clipmask/edgeflags/pad/vertex_id untouched
     */
-   {
-      /* Need to set header->vertex_id = 0xffff somehow.
-       */
-      key.element[nr].type = TRANSLATE_ELEMENT_NORMAL;
-      key.element[nr].input_format = PIPE_FORMAT_R32_FLOAT;
-      key.element[nr].input_buffer = draw->pt.nr_vertex_buffers;
-      key.element[nr].input_offset = 0;
-      key.element[nr].instance_divisor = 0;
-      key.element[nr].output_format = PIPE_FORMAT_R32_FLOAT;
-      key.element[nr].output_offset = dst_offset;
-      dst_offset += 1 * sizeof(float);
-      nr++;
-
-
-      /* Just leave the clip[] array untouched.
-       */
-      dst_offset += 4 * sizeof(float);
-   }
+   dst_offset += 1 * sizeof(float);
+   /* Just leave the clip[] array untouched.
+    */
+   dst_offset += 4 * sizeof(float);
 
    if (instance_id_index != ~0) {
       num_extra_inputs++;
@@ -131,26 +112,11 @@ void draw_pt_fetch_prepare( struct pt_fetch *fetch,
    key.nr_elements = nr;
    key.output_stride = vertex_size;
 
-
    if (!fetch->translate ||
        translate_key_compare(&fetch->translate->key, &key) != 0)
    {
       translate_key_sanitize(&key);
       fetch->translate = translate_cache_find(fetch->cache, &key);
-
-      {
-         static struct vertex_header vh = { 0,
-                                            1,
-                                            0,
-                                            UNDEFINED_VERTEX_ID,
-                                            { .0f, .0f, .0f, .0f } };
-
-	 fetch->translate->set_buffer(fetch->translate,
-				      draw->pt.nr_vertex_buffers,
-				      &vh,
-				      0,
-				      ~0);
-      }
    }
 
 }
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
index c629d55563..5c8af17c8e 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
@@ -36,6 +36,7 @@
 #include "draw/draw_vbuf.h"
 #include "draw/draw_vertex.h"
 #include "draw/draw_pt.h"
+#include "draw/draw_gs.h"
 #include "translate/translate.h"
 #include "translate/translate_cache.h"
 
@@ -90,7 +91,6 @@ struct fetch_emit_middle_end {
 
 static void fetch_emit_prepare( struct draw_pt_middle_end *middle,
                                 unsigned prim,
-                                unsigned out_prim,
 				unsigned opt,
                                 unsigned *max_vertices )
 {
@@ -101,9 +101,14 @@ static void fetch_emit_prepare( struct draw_pt_middle_end *middle,
    boolean ok;
    struct translate_key key;
 
+   unsigned gs_out_prim = (draw->gs.geometry_shader ? 
+                           draw->gs.geometry_shader->output_primitive :
+                           prim);
+
+
 
    ok = draw->render->set_primitive( draw->render, 
-                                     out_prim );
+                                     gs_out_prim );
    if (!ok) {
       assert(0);
       return;
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
index 5483a25f1d..b8270280b6 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
@@ -68,8 +68,7 @@ struct fetch_shade_emit {
 
 
 static void fse_prepare( struct draw_pt_middle_end *middle,
-                         unsigned in_prim,
-                         unsigned out_prim,
+                         unsigned prim,
                          unsigned opt,
                          unsigned *max_vertices )
 {
@@ -80,9 +79,12 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
    unsigned i;
    unsigned nr_vbs = 0;
 
+   /* Can't support geometry shader on this path.
+    */
+   assert(!draw->gs.geometry_shader);
 
    if (!draw->render->set_primitive( draw->render,
-                                     out_prim )) {
+                                     prim )) {
       assert(0);
       return;
    }
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
index 43b08a030c..121dfc414a 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
@@ -48,13 +48,11 @@ struct fetch_pipeline_middle_end {
    unsigned vertex_data_offset;
    unsigned vertex_size;
    unsigned input_prim;
-   unsigned output_prim;
    unsigned opt;
 };
 
 static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle,
-                                    unsigned in_prim,
-                                    unsigned out_prim,
+                                    unsigned prim,
 				    unsigned opt,
                                     unsigned *max_vertices )
 {
@@ -64,6 +62,10 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle,
    unsigned i;
    unsigned instance_id_index = ~0;
 
+   unsigned gs_out_prim = (draw->gs.geometry_shader ? 
+                           draw->gs.geometry_shader->output_primitive :
+                           prim);
+
    /* Add one to num_outputs because the pipeline occasionally tags on
     * an additional texcoord, eg for AA lines.
     */
@@ -79,8 +81,7 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle,
       }
    }
 
-   fpme->input_prim = in_prim;
-   fpme->output_prim = out_prim;
+   fpme->input_prim = prim;
    fpme->opt = opt;
 
    /* Always leave room for the vertex header whether we need it or
@@ -102,13 +103,13 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle,
 			    (boolean)draw->bypass_clipping,
 			    (boolean)draw->identity_viewport,
 			    (boolean)draw->rasterizer->gl_rasterization_rules,
-			    (draw->vs.edgeflag_output ? true : false) );
+			    (draw->vs.edgeflag_output ? TRUE : FALSE) );
 
    draw_pt_so_emit_prepare( fpme->so_emit );
 
    if (!(opt & PT_PIPELINE)) {
       draw_pt_emit_prepare( fpme->emit,
-			    out_prim,
+			    gs_out_prim,
                             max_vertices );
 
       *max_vertices = MAX2( *max_vertices,
@@ -183,7 +184,7 @@ static void draw_vertex_shader_run(struct draw_vertex_shader *vshader,
    output_verts->count = input_verts->count;
    output_verts->verts =
       (struct vertex_header *)MALLOC(output_verts->vertex_size *
-                                     output_verts->count);
+                                     align(output_verts->count, 4));
 
    vshader->run_linear(vshader,
                        (const float (*)[4])input_verts->verts->data,
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
index 7d2de58e73..bc074df8c2 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
@@ -49,48 +49,50 @@ struct llvm_middle_end {
    unsigned vertex_data_offset;
    unsigned vertex_size;
    unsigned input_prim;
-   unsigned output_prim;
    unsigned opt;
 
    struct draw_llvm *llvm;
-   struct draw_llvm_variant *variants;
    struct draw_llvm_variant *current_variant;
-   int nr_variants;
 };
 
 
 static void
 llvm_middle_end_prepare( struct draw_pt_middle_end *middle,
                          unsigned in_prim,
-                         unsigned out_prim,
                          unsigned opt,
                          unsigned *max_vertices )
 {
    struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle;
    struct draw_context *draw = fpme->draw;
-   struct draw_vertex_shader *vs = draw->vs.vertex_shader;
+   struct llvm_vertex_shader *shader =
+      llvm_vertex_shader(draw->vs.vertex_shader);
    struct draw_llvm_variant_key key;
    struct draw_llvm_variant *variant = NULL;
+   struct draw_llvm_variant_list_item *li;
    unsigned i;
    unsigned instance_id_index = ~0;
 
+
+   unsigned out_prim = (draw->gs.geometry_shader ? 
+                        draw->gs.geometry_shader->output_primitive :
+                        in_prim);
+
    /* Add one to num_outputs because the pipeline occasionally tags on
     * an additional texcoord, eg for AA lines.
     */
-   unsigned nr = MAX2( vs->info.num_inputs,
-		       vs->info.num_outputs + 1 );
+   unsigned nr = MAX2( shader->base.info.num_inputs,
+		       shader->base.info.num_outputs + 1 );
 
    /* Scan for instanceID system value.
     */
-   for (i = 0; i < vs->info.num_inputs; i++) {
-      if (vs->info.input_semantic_name[i] == TGSI_SEMANTIC_INSTANCEID) {
+   for (i = 0; i < shader->base.info.num_inputs; i++) {
+      if (shader->base.info.input_semantic_name[i] == TGSI_SEMANTIC_INSTANCEID) {
          instance_id_index = i;
          break;
       }
    }
 
    fpme->input_prim = in_prim;
-   fpme->output_prim = out_prim;
    fpme->opt = opt;
 
    /* Always leave room for the vertex header whether we need it or
@@ -107,7 +109,7 @@ llvm_middle_end_prepare( struct draw_pt_middle_end *middle,
 			    (boolean)draw->bypass_clipping,
 			    (boolean)(draw->identity_viewport),
 			    (boolean)draw->rasterizer->gl_rasterization_rules,
-			    (draw->vs.edgeflag_output ? true : false) );
+			    (draw->vs.edgeflag_output ? TRUE : FALSE) );
 
    draw_pt_so_emit_prepare( fpme->so_emit );
 
@@ -128,20 +130,41 @@ llvm_middle_end_prepare( struct draw_pt_middle_end *middle,
 
    draw_llvm_make_variant_key(fpme->llvm, &key);
 
-   variant = fpme->variants;
-   while(variant) {
-      if(memcmp(&variant->key, &key, sizeof key) == 0)
+   li = first_elem(&shader->variants);
+   while(!at_end(&shader->variants, li)) {
+      if(memcmp(&li->base->key, &key, sizeof key) == 0) {
+         variant = li->base;
          break;
+      }
+      li = next_elem(li);
+   }
 
-      variant = variant->next;
+   if (variant) {
+      move_to_head(&fpme->llvm->vs_variants_list, &variant->list_item_global);
    }
+   else {
+      unsigned i;
+      if (fpme->llvm->nr_variants >= DRAW_MAX_SHADER_VARIANTS) {
+         /*
+          * XXX: should we flush here ?
+          */
+         for (i = 0; i < DRAW_MAX_SHADER_VARIANTS / 4; i++) {
+            struct draw_llvm_variant_list_item *item =
+               last_elem(&fpme->llvm->vs_variants_list);
+            draw_llvm_destroy_variant(item->base);
+         }
+      }
+
+      variant = draw_llvm_create_variant(fpme->llvm, nr);
 
-   if (!variant) {
-      variant = draw_llvm_prepare(fpme->llvm, nr);
-      variant->next = fpme->variants;
-      fpme->variants = variant;
-      ++fpme->nr_variants;
+      if (variant) {
+         insert_at_head(&shader->variants, &variant->list_item_local);
+         insert_at_head(&fpme->llvm->vs_variants_list, &variant->list_item_global);
+         fpme->llvm->nr_variants++;
+         shader->variants_cached++;
+      }
    }
+
    fpme->current_variant = variant;
 
    /*XXX we only support one constant buffer */
@@ -210,7 +233,8 @@ llvm_pipeline_generic( struct draw_pt_middle_end *middle,
                                        fetch_info->start,
                                        fetch_info->count,
                                        fpme->vertex_size,
-                                       draw->pt.vertex_buffer );
+                                       draw->pt.vertex_buffer,
+                                       draw->instance_id);
    else
       fpme->current_variant->jit_func_elts( &fpme->llvm->jit_context,
                                             llvm_vert_info.verts,
@@ -218,7 +242,8 @@ llvm_pipeline_generic( struct draw_pt_middle_end *middle,
                                             fetch_info->elts,
                                             fetch_info->count,
                                             fpme->vertex_size,
-                                            draw->pt.vertex_buffer);
+                                            draw->pt.vertex_buffer,
+                                            draw->instance_id);
 
    /* Finished with fetch and vs:
     */
@@ -356,31 +381,7 @@ static void llvm_middle_end_finish( struct draw_pt_middle_end *middle )
 static void llvm_middle_end_destroy( struct draw_pt_middle_end *middle )
 {
    struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle;
-   struct draw_context *draw = fpme->draw;
-   struct draw_llvm_variant *variant = NULL;
-
-   variant = fpme->variants;
-   while(variant) {
-      struct draw_llvm_variant *next = variant->next;
 
-      if (variant->function_elts) {
-         if (variant->function_elts)
-            LLVMFreeMachineCodeForFunction(draw->engine,
-                                           variant->function_elts);
-         LLVMDeleteFunction(variant->function_elts);
-      }
-
-      if (variant->function) {
-         if (variant->function)
-            LLVMFreeMachineCodeForFunction(draw->engine,
-                                           variant->function);
-         LLVMDeleteFunction(variant->function);
-      }
-
-      FREE(variant);
-
-      variant = next;
-   }
    if (fpme->fetch)
       draw_pt_fetch_destroy( fpme->fetch );
 
@@ -393,14 +394,12 @@ static void llvm_middle_end_destroy( struct draw_pt_middle_end *middle )
    if (fpme->post_vs)
       draw_pt_post_vs_destroy( fpme->post_vs );
 
-   if (fpme->llvm)
-      draw_llvm_destroy( fpme->llvm );
-
    FREE(middle);
 }
 
 
-struct draw_pt_middle_end *draw_pt_fetch_pipeline_or_emit_llvm( struct draw_context *draw )
+struct draw_pt_middle_end *
+draw_pt_fetch_pipeline_or_emit_llvm(struct draw_context *draw)
 {
    struct llvm_middle_end *fpme = 0;
 
@@ -436,13 +435,11 @@ struct draw_pt_middle_end *draw_pt_fetch_pipeline_or_emit_llvm( struct draw_cont
    if (!fpme->so_emit)
       goto fail;
 
-   fpme->llvm = draw_llvm_create(draw);
+   fpme->llvm = draw->llvm;
    if (!fpme->llvm)
       goto fail;
 
-   fpme->variants = NULL;
    fpme->current_variant = NULL;
-   fpme->nr_variants = 0;
 
    return &fpme->base;
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_post_vs.c b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
index 112be50f9a..308f927b77 100644
--- a/src/gallium/auxiliary/draw/draw_pt_post_vs.c
+++ b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
@@ -38,7 +38,14 @@ struct pt_post_vs {
                    struct draw_vertex_info *info );
 };
 
-
+static INLINE void
+initialize_vertex_header(struct vertex_header *header)
+{
+   header->clipmask = 0;
+   header->edgeflag = 1;
+   header->pad = 0;
+   header->vertex_id = UNDEFINED_VERTEX_ID;
+}
 
 static INLINE float
 dot4(const float *a, const float *b)
@@ -49,10 +56,9 @@ dot4(const float *a, const float *b)
            a[3]*b[3]);
 }
 
-
-
 static INLINE unsigned
-compute_clipmask_gl(const float *clip, /*const*/ float plane[][4], unsigned nr)
+compute_clipmask_gl(const float *clip, /*const*/ float plane[][4], unsigned nr,
+                    boolean clip_depth)
 {
    unsigned mask = 0x0;
    unsigned i;
@@ -69,8 +75,10 @@ compute_clipmask_gl(const float *clip, /*const*/ float plane[][4], unsigned nr)
    if ( clip[0] + clip[3] < 0) mask |= (1<<1);
    if (-clip[1] + clip[3] < 0) mask |= (1<<2);
    if ( clip[1] + clip[3] < 0) mask |= (1<<3);
-   if ( clip[2] + clip[3] < 0) mask |= (1<<4); /* match mesa clipplane numbering - for now */
-   if (-clip[2] + clip[3] < 0) mask |= (1<<5); /* match mesa clipplane numbering - for now */
+   if (clip_depth) {
+      if ( clip[2] + clip[3] < 0) mask |= (1<<4); /* match mesa clipplane numbering - for now */
+      if (-clip[2] + clip[3] < 0) mask |= (1<<5); /* match mesa clipplane numbering - for now */
+   }
 
    /* Followed by any remaining ones:
     */
@@ -103,6 +111,7 @@ static boolean post_vs_cliptest_viewport_gl( struct pt_post_vs *pvs,
    for (j = 0; j < info->count; j++) {
       float *position = out->data[pos];
 
+      initialize_vertex_header(out);
 #if 0
       debug_printf("%d) io = %p, data = %p = [%f, %f, %f, %f]\n",
                    j, out, position, position[0], position[1], position[2], position[3]);
@@ -114,9 +123,11 @@ static boolean post_vs_cliptest_viewport_gl( struct pt_post_vs *pvs,
       out->clip[3] = position[3];
 
       out->vertex_id = 0xffff;
+      /* Disable depth clipping if depth clamping is enabled. */
       out->clipmask = compute_clipmask_gl(out->clip, 
 					  pvs->draw->plane,
-					  pvs->draw->nr_planes);
+                                          pvs->draw->nr_planes,
+                                          !pvs->draw->depth_clamp);
       clipped += out->clipmask;
 
       if (out->clipmask == 0)
@@ -192,6 +203,7 @@ static boolean post_vs_viewport( struct pt_post_vs *pvs,
    for (j = 0; j < info->count; j++) {
       float *position = out->data[pos];
 
+      initialize_vertex_header(out);
       /* Viewport mapping only, no cliptest/rhw divide
        */
       position[0] = position[0] * scale[0] + trans[0];
@@ -211,7 +223,16 @@ static boolean post_vs_viewport( struct pt_post_vs *pvs,
 static boolean post_vs_none( struct pt_post_vs *pvs,
 			     struct draw_vertex_info *info )
 {
+   struct vertex_header *out = info->verts;
+   unsigned j;
+
    if (0) debug_printf("%s\n", __FUNCTION__);
+   /* just initialize the vertex_id in all headers */
+   for (j = 0; j < info->count; j++) {
+      initialize_vertex_header(out);
+
+      out = (struct vertex_header *)((char *)out + info->stride);
+   }
    return FALSE;
 }
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_varray.c b/src/gallium/auxiliary/draw/draw_pt_varray.c
index 5ea833032f..cd7bb7bf25 100644
--- a/src/gallium/auxiliary/draw/draw_pt_varray.c
+++ b/src/gallium/auxiliary/draw/draw_pt_varray.c
@@ -120,24 +120,27 @@ static void varray_fan_segment(struct varray_frontend *varray,
 #define FUNC varray_run
 #include "draw_pt_varray_tmp_linear.h"
 
-static unsigned decompose_prim[PIPE_PRIM_POLYGON + 1] = {
+static unsigned decompose_prim[PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY + 1] = {
    PIPE_PRIM_POINTS,
    PIPE_PRIM_LINES,
    PIPE_PRIM_LINE_STRIP,        /* decomposed LINELOOP */
    PIPE_PRIM_LINE_STRIP,
    PIPE_PRIM_TRIANGLES,
    PIPE_PRIM_TRIANGLE_STRIP,
-   PIPE_PRIM_TRIANGLE_FAN, 
+   PIPE_PRIM_TRIANGLE_FAN,
    PIPE_PRIM_QUADS,
    PIPE_PRIM_QUAD_STRIP,
-   PIPE_PRIM_POLYGON
+   PIPE_PRIM_POLYGON,
+   PIPE_PRIM_LINES_ADJACENCY,
+   PIPE_PRIM_LINE_STRIP_ADJACENCY,
+   PIPE_PRIM_TRIANGLES_ADJACENCY,
+   PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY
 };
 
 
 
 static void varray_prepare(struct draw_pt_front_end *frontend,
                            unsigned in_prim,
-                           unsigned out_prim,
                            struct draw_pt_middle_end *middle,
                            unsigned opt)
 {
@@ -146,11 +149,13 @@ static void varray_prepare(struct draw_pt_front_end *frontend,
    varray->base.run = varray_run;
 
    varray->input_prim = in_prim;
-   varray->output_prim = decompose_prim[out_prim];
+   assert(in_prim < Elements(decompose_prim));
+   varray->output_prim = decompose_prim[in_prim];
 
    varray->middle = middle;
-   middle->prepare(middle, varray->input_prim,
-                   varray->output_prim, opt, &varray->driver_fetch_max );
+   middle->prepare(middle,
+                   varray->output_prim,
+                   opt, &varray->driver_fetch_max );
 
    /* check that the max is even */
    assert((varray->driver_fetch_max & 1) == 0);
diff --git a/src/gallium/auxiliary/draw/draw_pt_vcache.c b/src/gallium/auxiliary/draw/draw_pt_vcache.c
index 914c87a9dc..8ef94c3163 100644
--- a/src/gallium/auxiliary/draw/draw_pt_vcache.c
+++ b/src/gallium/auxiliary/draw/draw_pt_vcache.c
@@ -41,6 +41,7 @@
 #define FETCH_MAX 256
 #define DRAW_MAX (16*1024)
 
+
 struct vcache_frontend {
    struct draw_pt_front_end base;
    struct draw_context *draw;
@@ -64,13 +65,13 @@ struct vcache_frontend {
    unsigned opt;
 };
 
+
 static INLINE void
 vcache_flush( struct vcache_frontend *vcache )
 {
    if (vcache->middle_prim != vcache->output_prim) {
       vcache->middle_prim = vcache->output_prim;
       vcache->middle->prepare( vcache->middle,
-                               vcache->input_prim,
                                vcache->middle_prim,
                                vcache->opt,
                                &vcache->fetch_max );
@@ -89,12 +90,12 @@ vcache_flush( struct vcache_frontend *vcache )
    vcache->draw_count = 0;
 }
 
+
 static INLINE void 
 vcache_check_flush( struct vcache_frontend *vcache )
 {
-   if ( vcache->draw_count + 6 >= DRAW_MAX ||
-        vcache->fetch_count + 4 >= FETCH_MAX )
-   {
+   if (vcache->draw_count + 6 >= DRAW_MAX ||
+       vcache->fetch_count + 4 >= FETCH_MAX) {
       vcache_flush( vcache );
    }
 }
@@ -146,6 +147,7 @@ vcache_triangle_flags( struct vcache_frontend *vcache,
    vcache_check_flush(vcache);
 }
 
+
 static INLINE void 
 vcache_line( struct vcache_frontend *vcache,
              unsigned i0,
@@ -177,6 +179,7 @@ vcache_point( struct vcache_frontend *vcache,
    vcache_check_flush(vcache);
 }
 
+
 static INLINE void 
 vcache_quad( struct vcache_frontend *vcache,
              unsigned i0,
@@ -196,6 +199,7 @@ vcache_quad( struct vcache_frontend *vcache,
    }
 }
 
+
 static INLINE void 
 vcache_ef_quad( struct vcache_frontend *vcache,
                 unsigned i0,
@@ -231,6 +235,7 @@ vcache_ef_quad( struct vcache_frontend *vcache,
    }
 }
 
+
 /* At least for now, we're back to using a template include file for
  * this.  The two paths aren't too different though - it may be
  * possible to reunify them.
@@ -256,23 +261,23 @@ rebase_uint_elts( const unsigned *src,
                   ushort *dest )
 {
    unsigned i;
-
    for (i = 0; i < count; i++) 
       dest[i] = (ushort)(src[i] + delta);
 }
 
+
 static INLINE void 
 rebase_ushort_elts( const ushort *src,
                     unsigned count,
                     int delta,
-                                ushort *dest )
+                    ushort *dest )
 {
    unsigned i;
-
    for (i = 0; i < count; i++) 
       dest[i] = (ushort)(src[i] + delta);
 }
 
+
 static INLINE void 
 rebase_ubyte_elts( const ubyte *src,
                    unsigned count,
@@ -280,42 +285,39 @@ rebase_ubyte_elts( const ubyte *src,
                    ushort *dest )
 {
    unsigned i;
-
    for (i = 0; i < count; i++) 
       dest[i] = (ushort)(src[i] + delta);
 }
 
 
-
 static INLINE void 
 translate_uint_elts( const unsigned *src,
                      unsigned count,
                      ushort *dest )
 {
    unsigned i;
-
    for (i = 0; i < count; i++) 
       dest[i] = (ushort)(src[i]);
 }
 
+
 static INLINE void 
 translate_ushort_elts( const ushort *src,
                        unsigned count,
                        ushort *dest )
 {
    unsigned i;
-
    for (i = 0; i < count; i++) 
       dest[i] = (ushort)(src[i]);
 }
 
+
 static INLINE void 
 translate_ubyte_elts( const ubyte *src,
                       unsigned count,
                       ushort *dest )
 {
    unsigned i;
-
    for (i = 0; i < count; i++) 
       dest[i] = (ushort)(src[i]);
 }
@@ -336,6 +338,7 @@ format_from_get_elt( pt_elt_func get_elt )
 }
 #endif
 
+
 static INLINE void 
 vcache_check_run( struct draw_pt_front_end *frontend, 
                   pt_elt_func get_elt,
@@ -345,18 +348,46 @@ vcache_check_run( struct draw_pt_front_end *frontend,
 {
    struct vcache_frontend *vcache = (struct vcache_frontend *)frontend; 
    struct draw_context *draw = vcache->draw;
-   unsigned min_index = draw->pt.user.min_index;
-   unsigned max_index = draw->pt.user.max_index;
-   unsigned index_size = draw->pt.user.eltSize;
-   unsigned fetch_count = max_index + 1 - min_index;
+   const unsigned min_index = draw->pt.user.min_index;
+   const unsigned max_index = draw->pt.user.max_index;
+   const unsigned index_size = draw->pt.user.eltSize;
+   unsigned fetch_count;
    const ushort *transformed_elts;
    ushort *storage = NULL;
    boolean ok = FALSE;
 
+   /* debug: verify indexes are in range [min_index, max_index] */
+   if (0) {
+      unsigned i;
+      for (i = 0; i < draw_count; i++) {
+         if (index_size == 1) {
+            assert( ((const ubyte *) elts)[i] >= min_index);
+            assert( ((const ubyte *) elts)[i] <= max_index);
+         }
+         else if (index_size == 2) {
+            assert( ((const ushort *) elts)[i] >= min_index);
+            assert( ((const ushort *) elts)[i] <= max_index);
+         }
+         else {
+            assert(index_size == 4);
+            assert( ((const uint *) elts)[i] >= min_index);
+            assert( ((const uint *) elts)[i] <= max_index);
+         }
+      }
+   }
+
+   /* Note: max_index is frequently 0xffffffff so we have to be sure
+    * that any arithmetic involving max_index doesn't overflow!
+    */
+   if (max_index >= (unsigned) DRAW_PIPE_MAX_VERTICES)
+      goto fail;
 
-   if (0) debug_printf("fetch_count %d fetch_max %d draw_count %d\n", fetch_count, 
-                       vcache->fetch_max,
-                       draw_count);
+   fetch_count = max_index + 1 - min_index;
+
+   if (0)
+      debug_printf("fetch_count %d fetch_max %d draw_count %d\n", fetch_count, 
+                   vcache->fetch_max,
+                   draw_count);
 
    if (elt_bias + max_index >= DRAW_PIPE_MAX_VERTICES ||
        fetch_count >= UNDEFINED_VERTEX_ID ||
@@ -368,23 +399,19 @@ vcache_check_run( struct draw_pt_front_end *frontend,
    if (vcache->middle_prim != vcache->input_prim) {
       vcache->middle_prim = vcache->input_prim;
       vcache->middle->prepare( vcache->middle,
-                               vcache->input_prim,
                                vcache->middle_prim,
                                vcache->opt,
                                &vcache->fetch_max );
    }
 
-
    assert((elt_bias >= 0 && min_index + elt_bias >= min_index) ||
           (elt_bias <  0 && min_index + elt_bias <  min_index));
 
    if (min_index == 0 &&
-       index_size == 2)
-   {
+       index_size == 2) {
       transformed_elts = (const ushort *)elts;
    }
-   else 
-   {
+   else {
       storage = MALLOC( draw_count * sizeof(ushort) );
       if (!storage)
          goto fail;
@@ -419,23 +446,23 @@ vcache_check_run( struct draw_pt_front_end *frontend,
          switch(index_size) {
          case 1:
             rebase_ubyte_elts( (const ubyte *)elts,
-                                  draw_count,
-                                  0 - (int)min_index,
-                                  storage );
+                               draw_count,
+                               0 - (int)min_index,
+                               storage );
             break;
 
          case 2:
             rebase_ushort_elts( (const ushort *)elts,
-                                   draw_count,
-                                   0 - (int)min_index,
-                                   storage );
+                                draw_count,
+                                0 - (int)min_index,
+                                storage );
             break;
 
          case 4:
             rebase_uint_elts( (const uint *)elts,
-                                 draw_count,
-                                 0 - (int)min_index,
-                                 storage );
+                              draw_count,
+                              0 - (int)min_index,
+                              storage );
             break;
 
          default:
@@ -462,7 +489,7 @@ vcache_check_run( struct draw_pt_front_end *frontend,
    debug_printf("failed to execute atomic draw elts for %d/%d, splitting up\n",
                 fetch_count, draw_count);
 
- fail:
+fail:
    vcache_run( frontend, get_elt, elts, elt_bias, draw_count );
 }
 
@@ -472,23 +499,26 @@ vcache_check_run( struct draw_pt_front_end *frontend,
 static void
 vcache_prepare( struct draw_pt_front_end *frontend,
                 unsigned in_prim,
-                unsigned out_prim,
                 struct draw_pt_middle_end *middle,
                 unsigned opt )
 {
    struct vcache_frontend *vcache = (struct vcache_frontend *)frontend;
 
-   if (opt & PT_PIPELINE)
-   {
+   if (opt & PT_PIPELINE) {
       vcache->base.run = vcache_run_extras;
    }
-   else
-   {
+   else {
       vcache->base.run = vcache_check_run;
    }
 
+   /* VCache will always emit the reduced version of its input
+    * primitive, ie STRIP/FANS become TRIS, etc.
+    *
+    * This is not to be confused with what the GS might be up to,
+    * which is a separate issue.
+    */
    vcache->input_prim = in_prim;
-   vcache->output_prim = u_reduced_prim(out_prim);
+   vcache->output_prim = u_reduced_prim(in_prim);
 
    vcache->middle = middle;
    vcache->opt = opt;
@@ -496,12 +526,13 @@ vcache_prepare( struct draw_pt_front_end *frontend,
    /* Have to run prepare here, but try and guess a good prim for
     * doing so:
     */
-   vcache->middle_prim = (opt & PT_PIPELINE) ? vcache->output_prim : vcache->input_prim;
-   middle->prepare( middle, vcache->input_prim,
-                    vcache->middle_prim, opt, &vcache->fetch_max );
-}
-
+   vcache->middle_prim = (opt & PT_PIPELINE)
+      ? vcache->output_prim : vcache->input_prim;
 
+   middle->prepare( middle,
+                    vcache->middle_prim,
+                    opt, &vcache->fetch_max );
+}
 
 
 static void 
@@ -512,6 +543,7 @@ vcache_finish( struct draw_pt_front_end *frontend )
    vcache->middle = NULL;
 }
 
+
 static void 
 vcache_destroy( struct draw_pt_front_end *frontend )
 {
diff --git a/src/gallium/auxiliary/draw/draw_vs.c b/src/gallium/auxiliary/draw/draw_vs.c
index b9db886a24..57ea63fc06 100644
--- a/src/gallium/auxiliary/draw/draw_vs.c
+++ b/src/gallium/auxiliary/draw/draw_vs.c
@@ -98,6 +98,11 @@ draw_create_vertex_shader(struct draw_context *draw,
       vs = draw_create_vs_ppc( draw, shader );
 #endif
    }
+#if HAVE_LLVM
+   else {
+      vs = draw_create_vs_llvm(draw, shader);
+   }
+#endif
 
    if (!vs) {
       vs = draw_create_vs_exec( draw, shader );
diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h
index 6c7e94db43..a731994523 100644
--- a/src/gallium/auxiliary/draw/draw_vs.h
+++ b/src/gallium/auxiliary/draw/draw_vs.h
@@ -165,7 +165,6 @@ draw_create_vs_ppc(struct draw_context *draw,
 		   const struct pipe_shader_state *templ);
 
 
-
 struct draw_vs_varient_key;
 struct draw_vertex_shader;
 
@@ -173,6 +172,11 @@ struct draw_vs_varient *
 draw_vs_create_varient_aos_sse( struct draw_vertex_shader *vs,
                                 const struct draw_vs_varient_key *key );
 
+#if HAVE_LLVM
+struct draw_vertex_shader *
+draw_create_vs_llvm(struct draw_context *draw,
+		    const struct pipe_shader_state *state);
+#endif
 
 
 /********************************************************************************
diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c
new file mode 100644
index 0000000000..6c13df7913
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c
@@ -0,0 +1,120 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "pipe/p_shader_tokens.h"
+
+#include "draw_private.h"
+#include "draw_context.h"
+#include "draw_vs.h"
+#include "draw_llvm.h"
+
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_scan.h"
+
+static void
+vs_llvm_prepare(struct draw_vertex_shader *shader,
+                struct draw_context *draw)
+{
+   /*struct llvm_vertex_shader *evs = llvm_vertex_shader(shader);*/
+}
+
+static void
+vs_llvm_run_linear( struct draw_vertex_shader *shader,
+		    const float (*input)[4],
+		    float (*output)[4],
+                    const void *constants[PIPE_MAX_CONSTANT_BUFFERS],
+		    unsigned count,
+		    unsigned input_stride,
+		    unsigned output_stride )
+{
+   /* we should never get here since the entire pipeline is
+    * generated in draw_pt_fetch_shade_pipeline_llvm.c */
+   debug_assert(0);
+}
+
+
+static void
+vs_llvm_delete( struct draw_vertex_shader *dvs )
+{
+   struct llvm_vertex_shader *shader = llvm_vertex_shader(dvs);
+   struct pipe_fence_handle *fence = NULL;
+   struct draw_llvm_variant_list_item *li;
+   struct pipe_context *pipe = dvs->draw->pipe;
+
+   /*
+    * XXX: This might be not neccessary at all.
+    */
+   pipe->flush(pipe, 0, &fence);
+   if (fence) {
+      pipe->screen->fence_finish(pipe->screen, fence, 0);
+      pipe->screen->fence_reference(pipe->screen, &fence, NULL);
+   }
+
+
+   li = first_elem(&shader->variants);
+   while(!at_end(&shader->variants, li)) {
+      struct draw_llvm_variant_list_item *next = next_elem(li);
+      draw_llvm_destroy_variant(li->base);
+      li = next;
+   }
+
+   assert(shader->variants_cached == 0);
+   FREE((void*) dvs->state.tokens);
+   FREE( dvs );
+}
+
+
+struct draw_vertex_shader *
+draw_create_vs_llvm(struct draw_context *draw,
+		    const struct pipe_shader_state *state)
+{
+   struct llvm_vertex_shader *vs = CALLOC_STRUCT( llvm_vertex_shader );
+
+   if (vs == NULL)
+      return NULL;
+
+   /* we make a private copy of the tokens */
+   vs->base.state.tokens = tgsi_dup_tokens(state->tokens);
+   if (!vs->base.state.tokens) {
+      FREE(vs);
+      return NULL;
+   }
+
+   tgsi_scan_shader(state->tokens, &vs->base.info);
+
+   vs->base.draw = draw;
+   vs->base.prepare = vs_llvm_prepare;
+   vs->base.run_linear = vs_llvm_run_linear;
+   vs->base.delete = vs_llvm_delete;
+   vs->base.create_varient = draw_vs_create_varient_generic;
+
+   make_empty_list(&vs->variants);
+
+   return &vs->base;
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index d926b2de18..f5f2623e46 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -56,7 +56,6 @@
 #include "lp_bld_intr.h"
 #include "lp_bld_logic.h"
 #include "lp_bld_pack.h"
-#include "lp_bld_debug.h"
 #include "lp_bld_arit.h"
 
 
@@ -847,6 +846,11 @@ lp_build_round_sse41(struct lp_build_context *bld,
 }
 
 
+/**
+ * Return the integer part of a float (vector) value.  The returned value is
+ * a float (vector).
+ * Ex: trunc(-1.5) = 1.0
+ */
 LLVMValueRef
 lp_build_trunc(struct lp_build_context *bld,
                LLVMValueRef a)
@@ -869,6 +873,12 @@ lp_build_trunc(struct lp_build_context *bld,
 }
 
 
+/**
+ * Return float (vector) rounded to nearest integer (vector).  The returned
+ * value is a float (vector).
+ * Ex: round(0.9) = 1.0
+ * Ex: round(-1.5) = -2.0
+ */
 LLVMValueRef
 lp_build_round(struct lp_build_context *bld,
                LLVMValueRef a)
@@ -890,6 +900,11 @@ lp_build_round(struct lp_build_context *bld,
 }
 
 
+/**
+ * Return floor of float (vector), result is a float (vector)
+ * Ex: floor(1.1) = 1.0
+ * Ex: floor(-1.1) = -2.0
+ */
 LLVMValueRef
 lp_build_floor(struct lp_build_context *bld,
                LLVMValueRef a)
@@ -911,6 +926,11 @@ lp_build_floor(struct lp_build_context *bld,
 }
 
 
+/**
+ * Return ceiling of float (vector), returning float (vector).
+ * Ex: ceil( 1.1) = 2.0
+ * Ex: ceil(-1.1) = -1.0
+ */
 LLVMValueRef
 lp_build_ceil(struct lp_build_context *bld,
               LLVMValueRef a)
@@ -933,7 +953,7 @@ lp_build_ceil(struct lp_build_context *bld,
 
 
 /**
- * Return fractional part of 'a' computed as a - floor(f)
+ * Return fractional part of 'a' computed as a - floor(a)
  * Typically used in texture coord arithmetic.
  */
 LLVMValueRef
@@ -946,8 +966,9 @@ lp_build_fract(struct lp_build_context *bld,
 
 
 /**
- * Convert to integer, through whichever rounding method that's fastest,
- * typically truncating toward zero.
+ * Return the integer part of a float (vector) value.  The returned value is
+ * an integer (vector).
+ * Ex: itrunc(-1.5) = 1
  */
 LLVMValueRef
 lp_build_itrunc(struct lp_build_context *bld,
@@ -964,7 +985,10 @@ lp_build_itrunc(struct lp_build_context *bld,
 
 
 /**
- * Convert float[] to int[] with round().
+ * Return float (vector) rounded to nearest integer (vector).  The returned
+ * value is an integer (vector).
+ * Ex: iround(0.9) = 1
+ * Ex: iround(-1.5) = -2
  */
 LLVMValueRef
 lp_build_iround(struct lp_build_context *bld,
@@ -1007,7 +1031,9 @@ lp_build_iround(struct lp_build_context *bld,
 
 
 /**
- * Convert float[] to int[] with floor().
+ * Return floor of float (vector), result is an int (vector)
+ * Ex: ifloor(1.1) = 1.0
+ * Ex: ifloor(-1.1) = -2.0
  */
 LLVMValueRef
 lp_build_ifloor(struct lp_build_context *bld,
@@ -1034,29 +1060,31 @@ lp_build_ifloor(struct lp_build_context *bld,
       /* sign = a < 0 ? ~0 : 0 */
       sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
       sign = LLVMBuildAnd(bld->builder, sign, mask, "");
-      sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "");
-      lp_build_name(sign, "floor.sign");
+      sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign");
 
       /* offset = -0.99999(9)f */
-      offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa));
+      offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
       offset = LLVMConstBitCast(offset, int_vec_type);
 
-      /* offset = a < 0 ? -0.99999(9)f : 0.0f */
+      /* offset = a < 0 ? offset : 0.0f */
       offset = LLVMBuildAnd(bld->builder, offset, sign, "");
-      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "");
-      lp_build_name(offset, "floor.offset");
+      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset");
 
-      res = LLVMBuildAdd(bld->builder, a, offset, "");
-      lp_build_name(res, "floor.res");
+      res = LLVMBuildAdd(bld->builder, a, offset, "ifloor.res");
    }
 
-   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
-   lp_build_name(res, "floor");
+   /* round to nearest (toward zero) */
+   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "ifloor.res");
 
    return res;
 }
 
 
+/**
+ * Return ceiling of float (vector), returning int (vector).
+ * Ex: iceil( 1.1) = 2
+ * Ex: iceil(-1.1) = -1
+ */
 LLVMValueRef
 lp_build_iceil(struct lp_build_context *bld,
                LLVMValueRef a)
@@ -1072,12 +1100,31 @@ lp_build_iceil(struct lp_build_context *bld,
       res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
    }
    else {
-      /* TODO: mimic lp_build_ifloor() here */
-      assert(0);
-      res = bld->undef;
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      unsigned mantissa = lp_mantissa(type);
+      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
+      LLVMValueRef sign;
+      LLVMValueRef offset;
+
+      /* sign = a < 0 ? 0 : ~0 */
+      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
+      sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign");
+      sign = LLVMBuildNot(bld->builder, sign, "iceil.not");
+
+      /* offset = 0.99999(9)f */
+      offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
+      offset = LLVMConstBitCast(offset, int_vec_type);
+
+      /* offset = a < 0 ? 0.0 : offset */
+      offset = LLVMBuildAnd(bld->builder, offset, sign, "");
+      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset");
+
+      res = LLVMBuildAdd(bld->builder, a, offset, "iceil.res");
    }
 
-   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
+   /* round to nearest (toward zero) */
+   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "iceil.res");
 
    return res;
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.h b/src/gallium/auxiliary/gallivm/lp_bld_const.h
index d46b9f882b..7ee8fff140 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_const.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_const.h
@@ -107,4 +107,12 @@ lp_build_const_mask_aos(struct lp_type type,
                         const boolean cond[4]);
 
 
+static INLINE LLVMValueRef
+lp_build_const_int32(int i)
+{
+   return LLVMConstInt(LLVMInt32Type(), i, 0);
+}
+
+
+
 #endif /* !LP_BLD_CONST_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index 3f7f2ebde9..77012f1fac 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -83,6 +83,9 @@
  *
  * Although the result values can be scaled to an arbitrary bit width specified
  * by dst_width, the actual result type will have the same width.
+ *
+ * Ex: src = { float, float, float, float }
+ * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
  */
 LLVMValueRef
 lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
@@ -152,6 +155,8 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
 
 /**
  * Inverse of lp_build_clamped_float_to_unsigned_norm above.
+ * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1]
+ * return {float, float, float, float} with values in range [0, 1].
  */
 LLVMValueRef
 lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
@@ -219,18 +224,19 @@ lp_build_conv(LLVMBuilderRef builder,
    unsigned num_tmps;
    unsigned i;
 
-   /* Register width must remain constant */
-   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
-
    /* We must not loose or gain channels. Only precision */
    assert(src_type.length * num_srcs == dst_type.length * num_dsts);
 
    assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
    assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
+   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
+   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);
 
    tmp_type = src_type;
-   for(i = 0; i < num_srcs; ++i)
+   for(i = 0; i < num_srcs; ++i) {
+      assert(lp_check_value(src_type, src[i]));
       tmp[i] = src[i];
+   }
    num_tmps = num_srcs;
 
    /*
@@ -326,30 +332,25 @@ lp_build_conv(LLVMBuilderRef builder,
 
    /*
     * Truncate or expand bit width
+    *
+    * No data conversion should happen here, although the sign bits are
+    * crucial to avoid bad clamping.
     */
 
-   assert(!tmp_type.floating || tmp_type.width == dst_type.width);
+   {
+      struct lp_type new_type;
 
-   if(tmp_type.width > dst_type.width) {
-      assert(num_dsts == 1);
-      tmp[0] = lp_build_pack(builder, tmp_type, dst_type, TRUE, tmp, num_tmps);
-      tmp_type.width = dst_type.width;
-      tmp_type.length = dst_type.length;
-      num_tmps = 1;
-   }
+      new_type = tmp_type;
+      new_type.sign   = dst_type.sign;
+      new_type.width  = dst_type.width;
+      new_type.length = dst_type.length;
+
+      lp_build_resize(builder, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);
 
-   if(tmp_type.width < dst_type.width) {
-      assert(num_tmps == 1);
-      lp_build_unpack(builder, tmp_type, dst_type, tmp[0], tmp, num_dsts);
-      tmp_type.width = dst_type.width;
-      tmp_type.length = dst_type.length;
+      tmp_type = new_type;
       num_tmps = num_dsts;
    }
 
-   assert(tmp_type.width == dst_type.width);
-   assert(tmp_type.length == dst_type.length);
-   assert(num_tmps == num_dsts);
-
    /*
     * Scale to the widest range
     */
@@ -406,8 +407,10 @@ lp_build_conv(LLVMBuilderRef builder,
        }
     }
 
-   for(i = 0; i < num_dsts; ++i)
+   for(i = 0; i < num_dsts; ++i) {
       dst[i] = tmp[i];
+      assert(lp_check_value(dst_type, dst[i]));
+   }
 }
 
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h b/src/gallium/auxiliary/gallivm/lp_bld_format.h
index 5f5036e7bd..60e22d727a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h
@@ -48,9 +48,9 @@ struct lp_build_context;
  */
 
 LLVMValueRef
-lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
-                         const struct util_format_description *desc,
-                         LLVMValueRef packed);
+lp_build_format_swizzle_aos(const struct util_format_description *desc,
+                            struct lp_build_context *bld,
+                            LLVMValueRef unswizzled);
 
 LLVMValueRef
 lp_build_pack_rgba_aos(LLVMBuilderRef builder,
@@ -60,7 +60,9 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder,
 LLVMValueRef
 lp_build_fetch_rgba_aos(LLVMBuilderRef builder,
                         const struct util_format_description *format_desc,
-                        LLVMValueRef ptr,
+                        struct lp_type type,
+                        LLVMValueRef base_ptr,
+                        LLVMValueRef offset,
                         LLVMValueRef i,
                         LLVMValueRef j);
 
@@ -72,7 +74,7 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder,
 void
 lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
                             struct lp_build_context *bld,
-                            const LLVMValueRef *unswizzled,
+                            const LLVMValueRef unswizzled[4],
                             LLVMValueRef swizzled_out[4]);
 
 void
@@ -82,6 +84,11 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
                          LLVMValueRef packed,
                          LLVMValueRef rgba_out[4]);
 
+void
+lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder,
+                          struct lp_type dst_type,
+                          LLVMValueRef packed,
+                          LLVMValueRef *rgba);
 
 void
 lp_build_fetch_rgba_soa(LLVMBuilderRef builder,
@@ -93,5 +100,18 @@ lp_build_fetch_rgba_soa(LLVMBuilderRef builder,
                         LLVMValueRef j,
                         LLVMValueRef rgba_out[4]);
 
+/*
+ * YUV
+ */
+
+
+LLVMValueRef
+lp_build_fetch_subsampled_rgba_aos(LLVMBuilderRef builder,
+                                   const struct util_format_description *format_desc,
+                                   unsigned n,
+                                   LLVMValueRef base_ptr,
+                                   LLVMValueRef offset,
+                                   LLVMValueRef i,
+                                   LLVMValueRef j);
 
 #endif /* !LP_BLD_FORMAT_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index 87e3e72a6e..0f01fc1d75 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -38,33 +38,122 @@
 #include "util/u_math.h"
 #include "util/u_string.h"
 
+#include "lp_bld_arit.h"
 #include "lp_bld_init.h"
 #include "lp_bld_type.h"
 #include "lp_bld_flow.h"
+#include "lp_bld_const.h"
+#include "lp_bld_conv.h"
+#include "lp_bld_swizzle.h"
+#include "lp_bld_gather.h"
 #include "lp_bld_format.h"
 
 
 /**
+ * Basic swizzling.  Rearrange the order of the unswizzled array elements
+ * according to the format description.  PIPE_SWIZZLE_ZERO/ONE are supported
+ * too.
+ * Ex: if unswizzled[4] = {B, G, R, x}, then swizzled_out[4] = {R, G, B, 1}.
+ */
+LLVMValueRef
+lp_build_format_swizzle_aos(const struct util_format_description *desc,
+                            struct lp_build_context *bld,
+                            LLVMValueRef unswizzled)
+{
+   unsigned char swizzles[4];
+   unsigned chan;
+
+   assert(bld->type.length % 4 == 0);
+
+   for (chan = 0; chan < 4; ++chan) {
+      enum util_format_swizzle swizzle;
+
+      if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
+         /*
+          * For ZS formats do RGBA = ZZZ1
+          */
+         if (chan == 3) {
+            swizzle = UTIL_FORMAT_SWIZZLE_1;
+         } else if (desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_NONE) {
+            swizzle = UTIL_FORMAT_SWIZZLE_0;
+         } else {
+            swizzle = desc->swizzle[0];
+         }
+      } else {
+         swizzle = desc->swizzle[chan];
+      }
+      swizzles[chan] = swizzle;
+   }
+
+   return lp_build_swizzle_aos(bld, unswizzled, swizzles);
+}
+
+
+/**
+ * Whether the format matches the vector type, apart of swizzles.
+ */
+static INLINE boolean
+format_matches_type(const struct util_format_description *desc,
+                    struct lp_type type)
+{
+   enum util_format_type chan_type;
+   unsigned chan;
+
+   assert(type.length % 4 == 0);
+
+   if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN ||
+       desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB ||
+       desc->block.width != 1 ||
+       desc->block.height != 1) {
+      return FALSE;
+   }
+
+   if (type.floating) {
+      chan_type = UTIL_FORMAT_TYPE_FLOAT;
+   } else if (type.fixed) {
+      chan_type = UTIL_FORMAT_TYPE_FIXED;
+   } else if (type.sign) {
+      chan_type = UTIL_FORMAT_TYPE_SIGNED;
+   } else {
+      chan_type = UTIL_FORMAT_TYPE_UNSIGNED;
+   }
+
+   for (chan = 0; chan < desc->nr_channels; ++chan) {
+      if (desc->channel[chan].size != type.width) {
+         return FALSE;
+      }
+
+      if (desc->channel[chan].type != UTIL_FORMAT_TYPE_VOID) {
+         if (desc->channel[chan].type != chan_type ||
+             desc->channel[chan].normalized != type.norm) {
+            return FALSE;
+         }
+      }
+   }
+
+   return TRUE;
+}
+
+
+/**
  * Unpack a single pixel into its RGBA components.
  *
  * @param desc  the pixel format for the packed pixel value
  * @param packed integer pixel in a format such as PIPE_FORMAT_B8G8R8A8_UNORM
  *
- * @return RGBA in a 4 floats vector.
+ * @return RGBA in a float[4] or ubyte[4] or ushort[4] vector.
  */
-LLVMValueRef
-lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
-                         const struct util_format_description *desc,
-                         LLVMValueRef packed)
+static INLINE LLVMValueRef
+lp_build_unpack_arith_rgba_aos(LLVMBuilderRef builder,
+                               const struct util_format_description *desc,
+                               LLVMValueRef packed)
 {
    LLVMValueRef shifted, casted, scaled, masked;
    LLVMValueRef shifts[4];
    LLVMValueRef masks[4];
    LLVMValueRef scales[4];
-   LLVMValueRef swizzles[4];
-   LLVMValueRef aux[4];
+
    boolean normalized;
-   int empty_channel;
    boolean needs_uitofp;
    unsigned shift;
    unsigned i;
@@ -77,8 +166,7 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
 
    /* Do the intermediate integer computations with 32bit integers since it
     * matches floating point size */
-   if (desc->block.bits < 32)
-      packed = LLVMBuildZExt(builder, packed, LLVMInt32Type(), "");
+   assert (LLVMTypeOf(packed) == LLVMInt32Type());
 
    /* Broadcast the packed value to all four channels
     * before: packed = BGRA
@@ -98,7 +186,6 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
    /* Initialize vector constants */
    normalized = FALSE;
    needs_uitofp = FALSE;
-   empty_channel = -1;
    shift = 0;
 
    /* Loop over 4 color components */
@@ -109,7 +196,6 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
          shifts[i] = LLVMGetUndef(LLVMInt32Type());
          masks[i] = LLVMConstNull(LLVMInt32Type());
          scales[i] =  LLVMConstNull(LLVMFloatType());
-         empty_channel = i;
       }
       else {
          unsigned long long mask = (1ULL << bits) - 1;
@@ -158,52 +244,7 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
    else
       scaled = casted;
 
-   for (i = 0; i < 4; ++i)
-      aux[i] = LLVMGetUndef(LLVMFloatType());
-
-   /* Build swizzles vector to put components into R,G,B,A order */
-   for (i = 0; i < 4; ++i) {
-      enum util_format_swizzle swizzle;
-
-      if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
-         /*
-          * For ZS formats do RGBA = ZZZ1
-          */
-         if (i == 3) {
-            swizzle = UTIL_FORMAT_SWIZZLE_1;
-         } else if (desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_NONE) {
-            swizzle = UTIL_FORMAT_SWIZZLE_0;
-         } else {
-            swizzle = desc->swizzle[0];
-         }
-      } else {
-         swizzle = desc->swizzle[i];
-      }
-
-      switch (swizzle) {
-      case UTIL_FORMAT_SWIZZLE_X:
-      case UTIL_FORMAT_SWIZZLE_Y:
-      case UTIL_FORMAT_SWIZZLE_Z:
-      case UTIL_FORMAT_SWIZZLE_W:
-         swizzles[i] = LLVMConstInt(LLVMInt32Type(), swizzle, 0);
-         break;
-      case UTIL_FORMAT_SWIZZLE_0:
-         assert(empty_channel >= 0);
-         swizzles[i] = LLVMConstInt(LLVMInt32Type(), empty_channel, 0);
-         break;
-      case UTIL_FORMAT_SWIZZLE_1:
-         swizzles[i] = LLVMConstInt(LLVMInt32Type(), 4, 0);
-         aux[0] = LLVMConstReal(LLVMFloatType(), 1.0);
-         break;
-      case UTIL_FORMAT_SWIZZLE_NONE:
-         swizzles[i] = LLVMGetUndef(LLVMFloatType());
-         assert(0);
-         break;
-      }
-   }
-
-   return LLVMBuildShuffleVector(builder, scaled, LLVMConstVector(aux, 4),
-                                 LLVMConstVector(swizzles, 4), "");
+   return scaled;
 }
 
 
@@ -310,22 +351,65 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder,
 }
 
 
+
+
 /**
  * Fetch a pixel into a 4 float AoS.
  *
  * \param format_desc  describes format of the image we're fetching from
  * \param ptr  address of the pixel block (or the texel if uncompressed)
  * \param i, j  the sub-block pixel coordinates.  For non-compressed formats
- *              these will always be (0,).
- * \return  valueRef with the float[4] RGBA pixel
+ *              these will always be (0, 0).
+ * \return  a 4 element vector with the pixel's RGBA values.
  */
 LLVMValueRef
 lp_build_fetch_rgba_aos(LLVMBuilderRef builder,
                         const struct util_format_description *format_desc,
-                        LLVMValueRef ptr,
+                        struct lp_type type,
+                        LLVMValueRef base_ptr,
+                        LLVMValueRef offset,
                         LLVMValueRef i,
                         LLVMValueRef j)
 {
+   unsigned num_pixels = type.length / 4;
+   struct lp_build_context bld;
+
+   assert(type.length <= LP_MAX_VECTOR_LENGTH);
+   assert(type.length % 4 == 0);
+
+   lp_build_context_init(&bld, builder, type);
+
+   /*
+    * Trivial case
+    *
+    * The format matches the type (apart of a swizzle) so no need for
+    * scaling or converting.
+    */
+
+   if (format_matches_type(format_desc, type) &&
+       format_desc->block.bits <= type.width * 4 &&
+       util_is_pot(format_desc->block.bits)) {
+      LLVMValueRef packed;
+
+      /*
+       * The format matches the type (apart of a swizzle) so no need for
+       * scaling or converting.
+       */
+
+      packed = lp_build_gather(builder, type.length/4,
+                               format_desc->block.bits, type.width*4,
+                               base_ptr, offset);
+
+      assert(format_desc->block.bits <= type.width * type.length);
+
+      packed = LLVMBuildBitCast(builder, packed, lp_build_vec_type(type), "");
+
+      return lp_build_format_swizzle_aos(format_desc, &bld, packed);
+   }
+
+   /*
+    * Bit arithmetic
+    */
 
    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
        (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
@@ -337,21 +421,77 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder,
        format_desc->is_bitmask &&
        !format_desc->is_mixed &&
        (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED ||
-        format_desc->channel[1].type == UTIL_FORMAT_TYPE_UNSIGNED))
-   {
-      LLVMValueRef packed;
+        format_desc->channel[1].type == UTIL_FORMAT_TYPE_UNSIGNED)) {
+
+      LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
+      LLVMValueRef res;
+      unsigned k;
+
+      /*
+       * Unpack a pixel at a time into a <4 x float> RGBA vector
+       */
+
+      for (k = 0; k < num_pixels; ++k) {
+         LLVMValueRef packed;
+
+         packed = lp_build_gather_elem(builder, num_pixels,
+                                       format_desc->block.bits, 32,
+                                       base_ptr, offset, k);
 
-      ptr = LLVMBuildBitCast(builder, ptr,
-                             LLVMPointerType(LLVMIntType(format_desc->block.bits), 0) ,
-                             "");
+         tmps[k] = lp_build_unpack_arith_rgba_aos(builder, format_desc,
+                                                  packed);
+      }
+
+      /*
+       * Type conversion.
+       *
+       * TODO: We could avoid floating conversion for integer to
+       * integer conversions.
+       */
 
-      packed = LLVMBuildLoad(builder, ptr, "packed");
+      lp_build_conv(builder,
+                    lp_float32_vec4_type(),
+                    type,
+                    tmps, num_pixels, &res, 1);
 
-      return lp_build_unpack_rgba_aos(builder, format_desc, packed);
+      return lp_build_format_swizzle_aos(format_desc, &bld, res);
    }
-   else if (format_desc->fetch_rgba_float) {
+
+   /*
+    * YUV / subsampled formats
+    */
+
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
+      struct lp_type tmp_type;
+      LLVMValueRef tmp;
+
+      memset(&tmp_type, 0, sizeof tmp_type);
+      tmp_type.width = 8;
+      tmp_type.length = num_pixels * 4;
+      tmp_type.norm = TRUE;
+
+      tmp = lp_build_fetch_subsampled_rgba_aos(builder,
+                                               format_desc,
+                                               num_pixels,
+                                               base_ptr,
+                                               offset,
+                                               i, j);
+
+      lp_build_conv(builder,
+                    tmp_type, type,
+                    &tmp, 1, &tmp, 1);
+
+      return tmp;
+   }
+
+   /*
+    * Fallback to util_format_description::fetch_rgba_8unorm().
+    */
+
+   if (format_desc->fetch_rgba_8unorm &&
+       !type.floating && type.width == 8 && !type.sign && type.norm) {
       /*
-       * Fallback to calling util_format_description::fetch_rgba_float.
+       * Fallback to calling util_format_description::fetch_rgba_8unorm.
        *
        * This is definitely not the most efficient way of fetching pixels, as
        * we miss the opportunity to do vectorization, but this it is a
@@ -361,9 +501,113 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder,
 
       LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder)));
       char name[256];
+      LLVMTypeRef i8t = LLVMInt8Type();
+      LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
+      LLVMTypeRef i32t = LLVMInt32Type();
       LLVMValueRef function;
+      LLVMValueRef tmp_ptr;
       LLVMValueRef tmp;
-      LLVMValueRef args[4];
+      LLVMValueRef res;
+      unsigned k;
+
+      util_snprintf(name, sizeof name, "util_format_%s_fetch_rgba_8unorm",
+                    format_desc->short_name);
+
+      /*
+       * Declare and bind format_desc->fetch_rgba_8unorm().
+       */
+
+      function = LLVMGetNamedFunction(module, name);
+      if (!function) {
+         LLVMTypeRef ret_type;
+         LLVMTypeRef arg_types[4];
+         LLVMTypeRef function_type;
+
+         ret_type = LLVMVoidType();
+         arg_types[0] = pi8t;
+         arg_types[1] = pi8t;
+         arg_types[3] = arg_types[2] = LLVMIntType(sizeof(unsigned) * 8);
+         function_type = LLVMFunctionType(ret_type, arg_types, Elements(arg_types), 0);
+         function = LLVMAddFunction(module, name, function_type);
+
+         LLVMSetFunctionCallConv(function, LLVMCCallConv);
+         LLVMSetLinkage(function, LLVMExternalLinkage);
+
+         assert(LLVMIsDeclaration(function));
+
+         LLVMAddGlobalMapping(lp_build_engine, function,
+                              func_to_pointer((func_pointer)format_desc->fetch_rgba_8unorm));
+      }
+
+      tmp_ptr = lp_build_alloca(builder, i32t, "");
+
+      res = LLVMGetUndef(LLVMVectorType(i32t, num_pixels));
+
+      /*
+       * Invoke format_desc->fetch_rgba_8unorm() for each pixel and insert the result
+       * in the SoA vectors.
+       */
+
+      for (k = 0; k < num_pixels; ++k) {
+         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), k, 0);
+         LLVMValueRef args[4];
+
+         args[0] = LLVMBuildBitCast(builder, tmp_ptr, pi8t, "");
+         args[1] = lp_build_gather_elem_ptr(builder, num_pixels,
+                                            base_ptr, offset, k);
+
+         if (num_pixels == 1) {
+            args[2] = i;
+            args[3] = j;
+         }
+         else {
+            args[2] = LLVMBuildExtractElement(builder, i, index, "");
+            args[3] = LLVMBuildExtractElement(builder, j, index, "");
+         }
+
+         LLVMBuildCall(builder, function, args, Elements(args), "");
+
+         tmp = LLVMBuildLoad(builder, tmp_ptr, "");
+
+         if (num_pixels == 1) {
+            res = tmp;
+         }
+         else {
+            res = LLVMBuildInsertElement(builder, res, tmp, index, "");
+         }
+      }
+
+      /* Bitcast from <n x i32> to <4n x i8> */
+      res = LLVMBuildBitCast(builder, res, bld.vec_type, "");
+
+      return res;
+   }
+
+
+   /*
+    * Fallback to util_format_description::fetch_rgba_float().
+    */
+
+   if (format_desc->fetch_rgba_float) {
+      /*
+       * Fallback to calling util_format_description::fetch_rgba_float.
+       *
+       * This is definitely not the most efficient way of fetching pixels, as
+       * we miss the opportunity to do vectorization, but this it is a
+       * convenient for formats or scenarios for which there was no opportunity
+       * or incentive to optimize.
+       */
+
+      LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder)));
+      char name[256];
+      LLVMTypeRef f32t = LLVMFloatType();
+      LLVMTypeRef f32x4t = LLVMVectorType(f32t, 4);
+      LLVMTypeRef pf32t = LLVMPointerType(f32t, 0);
+      LLVMValueRef function;
+      LLVMValueRef tmp_ptr;
+      LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
+      LLVMValueRef res;
+      unsigned k;
 
       util_snprintf(name, sizeof name, "util_format_%s_fetch_rgba_float",
                     format_desc->short_name);
@@ -379,7 +623,7 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder,
          LLVMTypeRef function_type;
 
          ret_type = LLVMVoidType();
-         arg_types[0] = LLVMPointerType(LLVMFloatType(), 0);
+         arg_types[0] = pf32t;
          arg_types[1] = LLVMPointerType(LLVMInt8Type(), 0);
          arg_types[3] = arg_types[2] = LLVMIntType(sizeof(unsigned) * 8);
          function_type = LLVMFunctionType(ret_type, arg_types, Elements(arg_types), 0);
@@ -394,25 +638,43 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder,
                               func_to_pointer((func_pointer)format_desc->fetch_rgba_float));
       }
 
-      tmp = lp_build_alloca(builder, LLVMVectorType(LLVMFloatType(), 4), "");
+      tmp_ptr = lp_build_alloca(builder, f32x4t, "");
 
       /*
        * Invoke format_desc->fetch_rgba_float() for each pixel and insert the result
        * in the SoA vectors.
        */
 
-      args[0] = LLVMBuildBitCast(builder, tmp,
-                                 LLVMPointerType(LLVMFloatType(), 0), "");
-      args[1] = ptr;
-      args[2] = i;
-      args[3] = j;
+      for (k = 0; k < num_pixels; ++k) {
+         LLVMValueRef args[4];
 
-      LLVMBuildCall(builder, function, args, Elements(args), "");
+         args[0] = LLVMBuildBitCast(builder, tmp_ptr, pf32t, "");
+         args[1] = lp_build_gather_elem_ptr(builder, num_pixels,
+                                            base_ptr, offset, k);
 
-      return LLVMBuildLoad(builder, tmp, "");
-   }
-   else {
-      assert(0);
-      return LLVMGetUndef(LLVMVectorType(LLVMFloatType(), 4));
+         if (num_pixels == 1) {
+            args[2] = i;
+            args[3] = j;
+         }
+         else {
+            LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), k, 0);
+            args[2] = LLVMBuildExtractElement(builder, i, index, "");
+            args[3] = LLVMBuildExtractElement(builder, j, index, "");
+         }
+
+         LLVMBuildCall(builder, function, args, Elements(args), "");
+
+         tmps[k] = LLVMBuildLoad(builder, tmp_ptr, "");
+      }
+
+      lp_build_conv(builder,
+                    lp_float32_vec4_type(),
+                    type,
+                    tmps, num_pixels, &res, 1);
+
+      return res;
    }
+
+   assert(0);
+   return lp_build_undef(type);
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index e1b94adc85..9f405921b0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -36,7 +36,7 @@
 #include "lp_bld_const.h"
 #include "lp_bld_conv.h"
 #include "lp_bld_swizzle.h"
-#include "lp_bld_sample.h" /* for lp_build_gather */
+#include "lp_bld_gather.h"
 #include "lp_bld_format.h"
 
 
@@ -251,6 +251,41 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
 }
 
 
+void
+lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder,
+                          struct lp_type dst_type,
+                          LLVMValueRef packed,
+                          LLVMValueRef *rgba)
+{
+   LLVMValueRef mask = lp_build_const_int_vec(dst_type, 0xff);
+   unsigned chan;
+
+   packed = LLVMBuildBitCast(builder, packed,
+                             lp_build_int_vec_type(dst_type), "");
+
+   /* Decode the input vector components */
+   for (chan = 0; chan < 4; ++chan) {
+      unsigned start = chan*8;
+      unsigned stop = start + 8;
+      LLVMValueRef input;
+
+      input = packed;
+
+      if (start)
+         input = LLVMBuildLShr(builder, input,
+                               lp_build_const_int_vec(dst_type, start), "");
+
+      if (stop < 32)
+         input = LLVMBuildAnd(builder, input, mask, "");
+
+      input = lp_build_unsigned_norm_to_float(builder, 8, dst_type, input);
+
+      rgba[chan] = input;
+   }
+}
+
+
+
 /**
  * Fetch a texels from a texture, returning them in SoA layout.
  *
@@ -311,20 +346,49 @@ lp_build_fetch_rgba_soa(LLVMBuilderRef builder,
                                format_desc,
                                type,
                                packed, rgba_out);
+      return;
    }
-   else {
-      /*
-       * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
-       *
-       * This is not the most efficient way of fetching pixels, as we
-       * miss some opportunities to do vectorization, but this is
-       * convenient for formats or scenarios for which there was no
-       * opportunity or incentive to optimize.
-       */
 
+   /*
+    * Try calling lp_build_fetch_rgba_aos for all pixels.
+    */
+
+   if (util_format_fits_8unorm(format_desc) &&
+       type.floating && type.width == 32 && type.length == 4) {
+      struct lp_type tmp_type;
+      LLVMValueRef tmp;
+
+      memset(&tmp_type, 0, sizeof tmp_type);
+      tmp_type.width = 8;
+      tmp_type.length = type.length * 4;
+      tmp_type.norm = TRUE;
+
+      tmp = lp_build_fetch_rgba_aos(builder, format_desc, tmp_type,
+                                    base_ptr, offset, i, j);
+
+      lp_build_rgba8_to_f32_soa(builder,
+                                type,
+                                tmp,
+                                rgba_out);
+
+      return;
+   }
+
+   /*
+    * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
+    *
+    * This is not the most efficient way of fetching pixels, as we
+    * miss some opportunities to do vectorization, but this is
+    * convenient for formats or scenarios for which there was no
+    * opportunity or incentive to optimize.
+    */
+
+   {
       unsigned k, chan;
+      struct lp_type tmp_type;
 
-      assert(type.floating);
+      tmp_type = type;
+      tmp_type.length = 4;
 
       for (chan = 0; chan < 4; ++chan) {
          rgba_out[chan] = lp_build_undef(type);
@@ -334,18 +398,17 @@ lp_build_fetch_rgba_soa(LLVMBuilderRef builder,
       for(k = 0; k < type.length; ++k) {
          LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), k, 0);
          LLVMValueRef offset_elem;
-         LLVMValueRef ptr;
          LLVMValueRef i_elem, j_elem;
          LLVMValueRef tmp;
 
          offset_elem = LLVMBuildExtractElement(builder, offset, index, "");
-         ptr = LLVMBuildGEP(builder, base_ptr, &offset_elem, 1, "");
 
          i_elem = LLVMBuildExtractElement(builder, i, index, "");
          j_elem = LLVMBuildExtractElement(builder, j, index, "");
 
          /* Get a single float[4]={R,G,B,A} pixel */
-         tmp = lp_build_fetch_rgba_aos(builder, format_desc, ptr,
+         tmp = lp_build_fetch_rgba_aos(builder, format_desc, tmp_type,
+                                       base_ptr, offset_elem,
                                        i_elem, j_elem);
 
          /*
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
new file mode 100644
index 0000000000..0a5038bc98
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
@@ -0,0 +1,399 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * YUV pixel format manipulation.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include "util/u_format.h"
+
+#include "lp_bld_arit.h"
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_conv.h"
+#include "lp_bld_gather.h"
+#include "lp_bld_format.h"
+
+
+/**
+ * Extract Y, U, V channels from packed UYVY.
+ * @param packed  is a <n x i32> vector with the packed UYVY blocks
+ * @param i  is a <n x i32> vector with the x pixel coordinate (0 or 1)
+ */
+static void
+uyvy_to_yuv_soa(LLVMBuilderRef builder,
+                unsigned n,
+                LLVMValueRef packed,
+                LLVMValueRef i,
+                LLVMValueRef *y,
+                LLVMValueRef *u,
+                LLVMValueRef *v)
+{
+   struct lp_type type;
+   LLVMValueRef shift, mask;
+
+   memset(&type, 0, sizeof type);
+   type.width = 32;
+   type.length = n;
+
+   assert(lp_check_value(type, packed));
+   assert(lp_check_value(type, i));
+
+   /*
+    * y = (uyvy >> 16*i) & 0xff
+    * u = (uyvy        ) & 0xff
+    * v = (uyvy >> 16  ) & 0xff
+    */
+
+   shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(type, 16), "");
+   shift = LLVMBuildAdd(builder, shift, lp_build_const_int_vec(type, 8), "");
+   *y = LLVMBuildLShr(builder, packed, shift, "");
+   *u = packed;
+   *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 16), "");
+
+   mask = lp_build_const_int_vec(type, 0xff);
+
+   *y = LLVMBuildAnd(builder, *y, mask, "y");
+   *u = LLVMBuildAnd(builder, *u, mask, "u");
+   *v = LLVMBuildAnd(builder, *v, mask, "v");
+}
+
+
+/**
+ * Extract Y, U, V channels from packed YUYV.
+ * @param packed  is a <n x i32> vector with the packed YUYV blocks
+ * @param i  is a <n x i32> vector with the x pixel coordinate (0 or 1)
+ */
+static void
+yuyv_to_yuv_soa(LLVMBuilderRef builder,
+                unsigned n,
+                LLVMValueRef packed,
+                LLVMValueRef i,
+                LLVMValueRef *y,
+                LLVMValueRef *u,
+                LLVMValueRef *v)
+{
+   struct lp_type type;
+   LLVMValueRef shift, mask;
+
+   memset(&type, 0, sizeof type);
+   type.width = 32;
+   type.length = n;
+
+   assert(lp_check_value(type, packed));
+   assert(lp_check_value(type, i));
+
+   /*
+    * y = (yuyv >> 16*i) & 0xff
+    * u = (yuyv >> 8   ) & 0xff
+    * v = (yuyv >> 24  ) & 0xff
+    */
+
+   shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(type, 16), "");
+   *y = LLVMBuildLShr(builder, packed, shift, "");
+   *u = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 8), "");
+   *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 24), "");
+
+   mask = lp_build_const_int_vec(type, 0xff);
+
+   *y = LLVMBuildAnd(builder, *y, mask, "y");
+   *u = LLVMBuildAnd(builder, *u, mask, "u");
+   *v = LLVMBuildAnd(builder, *v, mask, "v");
+}
+
+
+static INLINE void
+yuv_to_rgb_soa(LLVMBuilderRef builder,
+               unsigned n,
+               LLVMValueRef y, LLVMValueRef u, LLVMValueRef v,
+               LLVMValueRef *r, LLVMValueRef *g, LLVMValueRef *b)
+{
+   struct lp_type type;
+   struct lp_build_context bld;
+
+   LLVMValueRef c0;
+   LLVMValueRef c8;
+   LLVMValueRef c16;
+   LLVMValueRef c128;
+   LLVMValueRef c255;
+
+   LLVMValueRef cy;
+   LLVMValueRef cug;
+   LLVMValueRef cub;
+   LLVMValueRef cvr;
+   LLVMValueRef cvg;
+
+   memset(&type, 0, sizeof type);
+   type.sign = TRUE;
+   type.width = 32;
+   type.length = n;
+
+   lp_build_context_init(&bld, builder, type);
+
+   assert(lp_check_value(type, y));
+   assert(lp_check_value(type, u));
+   assert(lp_check_value(type, v));
+
+   /*
+    * Constants
+    */
+
+   c0   = lp_build_const_int_vec(type,   0);
+   c8   = lp_build_const_int_vec(type,   8);
+   c16  = lp_build_const_int_vec(type,  16);
+   c128 = lp_build_const_int_vec(type, 128);
+   c255 = lp_build_const_int_vec(type, 255);
+
+   cy  = lp_build_const_int_vec(type,  298);
+   cug = lp_build_const_int_vec(type, -100);
+   cub = lp_build_const_int_vec(type,  516);
+   cvr = lp_build_const_int_vec(type,  409);
+   cvg = lp_build_const_int_vec(type, -208);
+
+   /*
+    *  y -= 16;
+    *  u -= 128;
+    *  v -= 128;
+    */
+
+   y = LLVMBuildSub(builder, y, c16, "");
+   u = LLVMBuildSub(builder, u, c128, "");
+   v = LLVMBuildSub(builder, v, c128, "");
+
+   /*
+    * r = 298 * _y            + 409 * _v + 128;
+    * g = 298 * _y - 100 * _u - 208 * _v + 128;
+    * b = 298 * _y + 516 * _u            + 128;
+    */
+
+   y = LLVMBuildMul(builder, y, cy, "");
+   y = LLVMBuildAdd(builder, y, c128, "");
+
+   *r = LLVMBuildMul(builder, v, cvr, "");
+   *g = LLVMBuildAdd(builder,
+                     LLVMBuildMul(builder, u, cug, ""),
+                     LLVMBuildMul(builder, v, cvg, ""),
+                     "");
+   *b = LLVMBuildMul(builder, u, cub, "");
+
+   *r = LLVMBuildAdd(builder, *r, y, "");
+   *g = LLVMBuildAdd(builder, *g, y, "");
+   *b = LLVMBuildAdd(builder, *b, y, "");
+
+   /*
+    * r >>= 8;
+    * g >>= 8;
+    * b >>= 8;
+    */
+
+   *r = LLVMBuildAShr(builder, *r, c8, "r");
+   *g = LLVMBuildAShr(builder, *g, c8, "g");
+   *b = LLVMBuildAShr(builder, *b, c8, "b");
+
+   /*
+    * Clamp
+    */
+
+   *r = lp_build_clamp(&bld, *r, c0, c255);
+   *g = lp_build_clamp(&bld, *g, c0, c255);
+   *b = lp_build_clamp(&bld, *b, c0, c255);
+}
+
+
+static LLVMValueRef
+rgb_to_rgba_aos(LLVMBuilderRef builder,
+                unsigned n,
+                LLVMValueRef r, LLVMValueRef g, LLVMValueRef b)
+{
+   struct lp_type type;
+   LLVMValueRef a;
+   LLVMValueRef rgba;
+
+   memset(&type, 0, sizeof type);
+   type.sign = TRUE;
+   type.width = 32;
+   type.length = n;
+
+   assert(lp_check_value(type, r));
+   assert(lp_check_value(type, g));
+   assert(lp_check_value(type, b));
+
+   /*
+    * Make a 4 x unorm8 vector
+    */
+
+   r = r;
+   g = LLVMBuildShl(builder, g, lp_build_const_int_vec(type, 8), "");
+   b = LLVMBuildShl(builder, b, lp_build_const_int_vec(type, 16), "");
+   a = lp_build_const_int_vec(type, 0xff000000);
+
+   rgba = r;
+   rgba = LLVMBuildOr(builder, rgba, g, "");
+   rgba = LLVMBuildOr(builder, rgba, b, "");
+   rgba = LLVMBuildOr(builder, rgba, a, "");
+
+   rgba = LLVMBuildBitCast(builder, rgba,
+                           LLVMVectorType(LLVMInt8Type(), 4*n), "");
+
+   return rgba;
+}
+
+
+/**
+ * Convert from <n x i32> packed UYVY to <4n x i8> RGBA AoS
+ */
+static LLVMValueRef
+uyvy_to_rgba_aos(LLVMBuilderRef builder,
+                 unsigned n,
+                 LLVMValueRef packed,
+                 LLVMValueRef i)
+{
+   LLVMValueRef y, u, v;
+   LLVMValueRef r, g, b;
+   LLVMValueRef rgba;
+
+   uyvy_to_yuv_soa(builder, n, packed, i, &y, &u, &v);
+   yuv_to_rgb_soa(builder, n, y, u, v, &r, &g, &b);
+   rgba = rgb_to_rgba_aos(builder, n, r, g, b);
+
+   return rgba;
+}
+
+
+/**
+ * Convert from <n x i32> packed YUYV to <4n x i8> RGBA AoS
+ */
+static LLVMValueRef
+yuyv_to_rgba_aos(LLVMBuilderRef builder,
+                 unsigned n,
+                 LLVMValueRef packed,
+                 LLVMValueRef i)
+{
+   LLVMValueRef y, u, v;
+   LLVMValueRef r, g, b;
+   LLVMValueRef rgba;
+
+   yuyv_to_yuv_soa(builder, n, packed, i, &y, &u, &v);
+   yuv_to_rgb_soa(builder, n, y, u, v, &r, &g, &b);
+   rgba = rgb_to_rgba_aos(builder, n, r, g, b);
+
+   return rgba;
+}
+
+
+/**
+ * Convert from <n x i32> packed RG_BG to <4n x i8> RGBA AoS
+ */
+static LLVMValueRef
+rgbg_to_rgba_aos(LLVMBuilderRef builder,
+                 unsigned n,
+                 LLVMValueRef packed,
+                 LLVMValueRef i)
+{
+   LLVMValueRef r, g, b;
+   LLVMValueRef rgba;
+
+   uyvy_to_yuv_soa(builder, n, packed, i, &g, &r, &b);
+   rgba = rgb_to_rgba_aos(builder, n, r, g, b);
+
+   return rgba;
+}
+
+
+/**
+ * Convert from <n x i32> packed GR_GB to <4n x i8> RGBA AoS
+ */
+static LLVMValueRef
+grgb_to_rgba_aos(LLVMBuilderRef builder,
+                 unsigned n,
+                 LLVMValueRef packed,
+                 LLVMValueRef i)
+{
+   LLVMValueRef r, g, b;
+   LLVMValueRef rgba;
+
+   yuyv_to_yuv_soa(builder, n, packed, i, &g, &r, &b);
+   rgba = rgb_to_rgba_aos(builder, n, r, g, b);
+
+   return rgba;
+}
+
+
+/**
+ * @param n  is the number of pixels processed
+ * @param packed  is a <n x i32> vector with the packed YUYV blocks
+ * @param i  is a <n x i32> vector with the x pixel coordinate (0 or 1)
+ * @return  a <4*n x i8> vector with the pixel RGBA values in AoS
+ */
+LLVMValueRef
+lp_build_fetch_subsampled_rgba_aos(LLVMBuilderRef builder,
+                                   const struct util_format_description *format_desc,
+                                   unsigned n,
+                                   LLVMValueRef base_ptr,
+                                   LLVMValueRef offset,
+                                   LLVMValueRef i,
+                                   LLVMValueRef j)
+{
+   LLVMValueRef packed;
+   LLVMValueRef rgba;
+
+   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED);
+   assert(format_desc->block.bits == 32);
+   assert(format_desc->block.width == 2);
+   assert(format_desc->block.height == 1);
+
+   packed = lp_build_gather(builder, n, 32, 32, base_ptr, offset);
+
+   (void)j;
+
+   switch (format_desc->format) {
+   case PIPE_FORMAT_UYVY:
+      rgba = uyvy_to_rgba_aos(builder, n, packed, i);
+      break;
+   case PIPE_FORMAT_YUYV:
+      rgba = yuyv_to_rgba_aos(builder, n, packed, i);
+      break;
+   case PIPE_FORMAT_R8G8_B8G8_UNORM:
+      rgba = rgbg_to_rgba_aos(builder, n, packed, i);
+      break;
+   case PIPE_FORMAT_G8R8_G8B8_UNORM:
+      rgba = grgb_to_rgba_aos(builder, n, packed, i);
+      break;
+   default:
+      assert(0);
+      rgba =  LLVMGetUndef(LLVMVectorType(LLVMInt8Type(), 4*n));
+      break;
+   }
+
+   return rgba;
+}
+
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.c b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
new file mode 100644
index 0000000000..d60472e065
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
@@ -0,0 +1,148 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#include "util/u_debug.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_const.h"
+#include "lp_bld_format.h"
+#include "lp_bld_gather.h"
+
+
+/**
+ * Get the pointer to one element from scatter positions in memory.
+ *
+ * @sa lp_build_gather()
+ */
+LLVMValueRef
+lp_build_gather_elem_ptr(LLVMBuilderRef builder,
+                         unsigned length,
+                         LLVMValueRef base_ptr,
+                         LLVMValueRef offsets,
+                         unsigned i)
+{
+   LLVMValueRef offset;
+   LLVMValueRef ptr;
+
+   assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8Type(), 0));
+
+   if (length == 1) {
+      assert(i == 0);
+      offset = offsets;
+   } else {
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      offset = LLVMBuildExtractElement(builder, offsets, index, "");
+   }
+
+   ptr = LLVMBuildGEP(builder, base_ptr, &offset, 1, "");
+
+   return ptr;
+}
+
+
+/**
+ * Gather one element from scatter positions in memory.
+ *
+ * @sa lp_build_gather()
+ */
+LLVMValueRef
+lp_build_gather_elem(LLVMBuilderRef builder,
+                     unsigned length,
+                     unsigned src_width,
+                     unsigned dst_width,
+                     LLVMValueRef base_ptr,
+                     LLVMValueRef offsets,
+                     unsigned i)
+{
+   LLVMTypeRef src_type = LLVMIntType(src_width);
+   LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
+   LLVMTypeRef dst_elem_type = LLVMIntType(dst_width);
+   LLVMValueRef ptr;
+   LLVMValueRef res;
+
+   assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8Type(), 0));
+
+   ptr = lp_build_gather_elem_ptr(builder, length, base_ptr, offsets, i);
+   ptr = LLVMBuildBitCast(builder, ptr, src_ptr_type, "");
+   res = LLVMBuildLoad(builder, ptr, "");
+
+   assert(src_width <= dst_width);
+   if (src_width > dst_width)
+      res = LLVMBuildTrunc(builder, res, dst_elem_type, "");
+   if (src_width < dst_width)
+      res = LLVMBuildZExt(builder, res, dst_elem_type, "");
+
+   return res;
+}
+
+
+/**
+ * Gather elements from scatter positions in memory into a single vector.
+ * Use for fetching texels from a texture.
+ * For SSE, typical values are length=4, src_width=32, dst_width=32.
+ *
+ * @param length length of the offsets
+ * @param src_width src element width in bits
+ * @param dst_width result element width in bits (src will be expanded to fit)
+ * @param base_ptr base pointer, should be a i8 pointer type.
+ * @param offsets vector with offsets
+ */
+LLVMValueRef
+lp_build_gather(LLVMBuilderRef builder,
+                unsigned length,
+                unsigned src_width,
+                unsigned dst_width,
+                LLVMValueRef base_ptr,
+                LLVMValueRef offsets)
+{
+   LLVMValueRef res;
+
+   if (length == 1) {
+      /* Scalar */
+      return lp_build_gather_elem(builder, length,
+                                  src_width, dst_width,
+                                  base_ptr, offsets, 0);
+   } else {
+      /* Vector */
+
+      LLVMTypeRef dst_elem_type = LLVMIntType(dst_width);
+      LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length);
+      unsigned i;
+
+      res = LLVMGetUndef(dst_vec_type);
+      for (i = 0; i < length; ++i) {
+         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+         LLVMValueRef elem;
+         elem = lp_build_gather_elem(builder, length,
+                                     src_width, dst_width,
+                                     base_ptr, offsets, i);
+         res = LLVMBuildInsertElement(builder, res, elem, index, "");
+      }
+   }
+
+   return res;
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.h b/src/gallium/auxiliary/gallivm/lp_bld_gather.h
new file mode 100644
index 0000000000..131af8ea07
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.h
@@ -0,0 +1,61 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#ifndef LP_BLD_GATHER_H_
+#define LP_BLD_GATHER_H_
+
+
+#include "gallivm/lp_bld.h"
+
+
+LLVMValueRef
+lp_build_gather_elem_ptr(LLVMBuilderRef builder,
+                         unsigned length,
+                         LLVMValueRef base_ptr,
+                         LLVMValueRef offsets,
+                         unsigned i);
+
+LLVMValueRef
+lp_build_gather_elem(LLVMBuilderRef builder,
+                     unsigned length,
+                     unsigned src_width,
+                     unsigned dst_width,
+                     LLVMValueRef base_ptr,
+                     LLVMValueRef offsets,
+                     unsigned i);
+
+LLVMValueRef
+lp_build_gather(LLVMBuilderRef builder,
+                unsigned length,
+                unsigned src_width,
+                unsigned dst_width,
+                LLVMValueRef base_ptr,
+                LLVMValueRef offsets);
+
+
+#endif /* LP_BLD_GATHER_H_ */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index 44cfdc4d3f..69353dea09 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -32,6 +32,8 @@
 #include "lp_bld_debug.h"
 #include "lp_bld_init.h"
 
+#include <llvm-c/Transforms/Scalar.h>
+
 
 #ifdef DEBUG
 unsigned gallivm_debug = 0;
@@ -50,6 +52,7 @@ LLVMModuleRef lp_build_module = NULL;
 LLVMExecutionEngineRef lp_build_engine = NULL;
 LLVMModuleProviderRef lp_build_provider = NULL;
 LLVMTargetDataRef lp_build_target = NULL;
+LLVMPassManagerRef lp_build_pass = NULL;
 
 
 /*
@@ -127,6 +130,33 @@ lp_build_init(void)
    if (!lp_build_target)
       lp_build_target = LLVMGetExecutionEngineTargetData(lp_build_engine);
 
+   if (!lp_build_pass) {
+      lp_build_pass = LLVMCreateFunctionPassManager(lp_build_provider);
+      LLVMAddTargetData(lp_build_target, lp_build_pass);
+
+      if ((gallivm_debug & GALLIVM_DEBUG_NO_OPT) == 0) {
+         /* These are the passes currently listed in llvm-c/Transforms/Scalar.h,
+          * but there are more on SVN. */
+         /* TODO: Add more passes */
+         LLVMAddCFGSimplificationPass(lp_build_pass);
+         LLVMAddPromoteMemoryToRegisterPass(lp_build_pass);
+         LLVMAddConstantPropagationPass(lp_build_pass);
+         if(util_cpu_caps.has_sse4_1) {
+            /* FIXME: There is a bug in this pass, whereby the combination of fptosi
+             * and sitofp (necessary for trunc/floor/ceil/round implementation)
+             * somehow becomes invalid code.
+             */
+            LLVMAddInstructionCombiningPass(lp_build_pass);
+         }
+         LLVMAddGVNPass(lp_build_pass);
+      } else {
+         /* We need at least this pass to prevent the backends to fail in
+          * unexpected ways.
+          */
+         LLVMAddPromoteMemoryToRegisterPass(lp_build_pass);
+      }
+   }
+
    util_cpu_detect();
 
 #if 0
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.h b/src/gallium/auxiliary/gallivm/lp_bld_init.h
index 0ec2afcd1b..a32ced9b4c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.h
@@ -38,6 +38,7 @@ extern LLVMModuleRef lp_build_module;
 extern LLVMExecutionEngineRef lp_build_engine;
 extern LLVMModuleProviderRef lp_build_provider;
 extern LLVMTargetDataRef lp_build_target;
+extern LLVMPassManagerRef lp_build_pass;
 
 
 void
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
index d13fa1a5d0..39854e43b1 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -34,6 +34,7 @@
 
 
 #include "util/u_cpu_detect.h"
+#include "util/u_memory.h"
 #include "util/u_debug.h"
 
 #include "lp_bld_type.h"
@@ -187,12 +188,10 @@ lp_build_compare(LLVMBuilderRef builder,
             return lp_build_undef(type);
          }
 
-         /* There are no signed byte and unsigned word/dword comparison
-          * instructions. So flip the sign bit so that the results match.
+         /* There are no unsigned comparison instructions. So flip the sign bit
+          * so that the results match.
           */
-         if(table[func].gt &&
-            ((type.width == 8 && type.sign) ||
-             (type.width != 8 && !type.sign))) {
+         if (table[func].gt && !type.sign) {
             LLVMValueRef msb = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
             a = LLVMBuildXor(builder, a, msb, "");
             b = LLVMBuildXor(builder, b, msb, "");
@@ -384,6 +383,46 @@ lp_build_select(struct lp_build_context *bld,
       mask = LLVMBuildTrunc(bld->builder, mask, LLVMInt1Type(), "");
       res = LLVMBuildSelect(bld->builder, mask, a, b, "");
    }
+   else if (util_cpu_caps.has_sse4_1 &&
+            type.width * type.length == 128 &&
+            !LLVMIsConstant(a) &&
+            !LLVMIsConstant(b) &&
+            !LLVMIsConstant(mask)) {
+      const char *intrinsic;
+      LLVMTypeRef arg_type;
+      LLVMValueRef args[3];
+
+      if (type.width == 64) {
+         intrinsic = "llvm.x86.sse41.blendvpd";
+         arg_type = LLVMVectorType(LLVMDoubleType(), 2);
+      } else if (type.width == 32) {
+         intrinsic = "llvm.x86.sse41.blendvps";
+         arg_type = LLVMVectorType(LLVMFloatType(), 4);
+      } else {
+         intrinsic = "llvm.x86.sse41.pblendvb";
+         arg_type = LLVMVectorType(LLVMInt8Type(), 16);
+      }
+
+      if (arg_type != bld->int_vec_type) {
+         mask = LLVMBuildBitCast(bld->builder, mask, arg_type, "");
+      }
+
+      if (arg_type != bld->vec_type) {
+         a = LLVMBuildBitCast(bld->builder, a, arg_type, "");
+         b = LLVMBuildBitCast(bld->builder, b, arg_type, "");
+      }
+
+      args[0] = b;
+      args[1] = a;
+      args[2] = mask;
+
+      res = lp_build_intrinsic(bld->builder, intrinsic,
+                               arg_type, args, Elements(args));
+
+      if (arg_type != bld->vec_type) {
+         res = LLVMBuildBitCast(bld->builder, res, bld->vec_type, "");
+      }
+   }
    else {
       if(type.floating) {
          LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
index 186f8849b8..7748f8f099 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@@ -427,3 +427,123 @@ lp_build_pack(LLVMBuilderRef builder,
 
    return tmp[0];
 }
+
+
+/**
+ * Truncate or expand the bitwidth.
+ *
+ * NOTE: Getting the right sign flags is crucial here, as we employ some
+ * intrinsics that do saturation.
+ */
+void
+lp_build_resize(LLVMBuilderRef builder,
+                struct lp_type src_type,
+                struct lp_type dst_type,
+                const LLVMValueRef *src, unsigned num_srcs,
+                LLVMValueRef *dst, unsigned num_dsts)
+{
+   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
+   unsigned i;
+
+   /*
+    * We don't support float <-> int conversion here. That must be done
+    * before/after calling this function.
+    */
+   assert(src_type.floating == dst_type.floating);
+
+   /*
+    * We don't support double <-> float conversion yet, although it could be
+    * added with little effort.
+    */
+   assert((!src_type.floating && !dst_type.floating) ||
+          src_type.width == dst_type.width);
+
+   /* We must not loose or gain channels. Only precision */
+   assert(src_type.length * num_srcs == dst_type.length * num_dsts);
+
+   /* We don't support M:N conversion, only 1:N, M:1, or 1:1 */
+   assert(num_srcs == 1 || num_dsts == 1);
+
+   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
+   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
+   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
+   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);
+
+   if (src_type.width > dst_type.width) {
+      /*
+       * Truncate bit width.
+       */
+
+      assert(num_dsts == 1);
+
+      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
+        /*
+         * Register width remains constant -- use vector packing intrinsics
+         */
+
+         tmp[0] = lp_build_pack(builder, src_type, dst_type, TRUE, src, num_srcs);
+      }
+      else {
+         /*
+          * Do it element-wise.
+          */
+
+         assert(src_type.length == dst_type.length);
+         tmp[0] = lp_build_undef(dst_type);
+         for (i = 0; i < dst_type.length; ++i) {
+            LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+            LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, "");
+            val = LLVMBuildTrunc(builder, val, lp_build_elem_type(dst_type), "");
+            tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, "");
+         }
+      }
+   }
+   else if (src_type.width < dst_type.width) {
+      /*
+       * Expand bit width.
+       */
+
+      assert(num_srcs == 1);
+
+      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
+         /*
+          * Register width remains constant -- use vector unpack intrinsics
+          */
+         lp_build_unpack(builder, src_type, dst_type, src[0], tmp, num_dsts);
+      }
+      else {
+         /*
+          * Do it element-wise.
+          */
+
+         assert(src_type.length == dst_type.length);
+         tmp[0] = lp_build_undef(dst_type);
+         for (i = 0; i < dst_type.length; ++i) {
+            LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+            LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, "");
+
+            if (src_type.sign && dst_type.sign) {
+               val = LLVMBuildSExt(builder, val, lp_build_elem_type(dst_type), "");
+            } else {
+               val = LLVMBuildZExt(builder, val, lp_build_elem_type(dst_type), "");
+            }
+            tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, "");
+         }
+      }
+   }
+   else {
+      /*
+       * No-op
+       */
+
+      assert(num_srcs == 1);
+      assert(num_dsts == 1);
+
+      tmp[0] = src[0];
+   }
+
+   for(i = 0; i < num_dsts; ++i)
+      dst[i] = tmp[i];
+}
+
+
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
index 41adeed220..e470082b97 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
@@ -92,4 +92,12 @@ lp_build_pack(LLVMBuilderRef builder,
               const LLVMValueRef *src, unsigned num_srcs);
 
 
+void
+lp_build_resize(LLVMBuilderRef builder,
+                struct lp_type src_type,
+                struct lp_type dst_type,
+                const LLVMValueRef *src, unsigned num_srcs,
+                LLVMValueRef *dst, unsigned num_dsts);
+
+
 #endif /* !LP_BLD_PACK_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.c b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
index 38fd5a39ef..ca36046d22 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_quad.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
@@ -61,8 +61,8 @@ LLVMValueRef
 lp_build_ddx(struct lp_build_context *bld,
              LLVMValueRef a)
 {
-   LLVMValueRef a_left  = lp_build_swizzle1_aos(bld, a, swizzle_left);
-   LLVMValueRef a_right = lp_build_swizzle1_aos(bld, a, swizzle_right);
+   LLVMValueRef a_left  = lp_build_swizzle_aos(bld, a, swizzle_left);
+   LLVMValueRef a_right = lp_build_swizzle_aos(bld, a, swizzle_right);
    return lp_build_sub(bld, a_right, a_left);
 }
 
@@ -71,8 +71,8 @@ LLVMValueRef
 lp_build_ddy(struct lp_build_context *bld,
              LLVMValueRef a)
 {
-   LLVMValueRef a_top    = lp_build_swizzle1_aos(bld, a, swizzle_top);
-   LLVMValueRef a_bottom = lp_build_swizzle1_aos(bld, a, swizzle_bottom);
+   LLVMValueRef a_top    = lp_build_swizzle_aos(bld, a, swizzle_top);
+   LLVMValueRef a_bottom = lp_build_swizzle_aos(bld, a, swizzle_bottom);
    return lp_build_sub(bld, a_bottom, a_top);
 }
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 946c23e317..0fd014ab9b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -40,7 +40,6 @@
 #include "lp_bld_const.h"
 #include "lp_bld_arit.h"
 #include "lp_bld_type.h"
-#include "lp_bld_format.h"
 #include "lp_bld_sample.h"
 
 
@@ -125,73 +124,53 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
 
 
 /**
- * Gather elements from scatter positions in memory into a single vector.
- * Use for fetching texels from a texture.
- * For SSE, typical values are length=4, src_width=32, dst_width=32.
- *
- * @param length length of the offsets
- * @param src_width src element width in bits
- * @param dst_width result element width in bits (src will be expanded to fit)
- * @param base_ptr base pointer, should be a i8 pointer type.
- * @param offsets vector with offsets
- */
-LLVMValueRef
-lp_build_gather(LLVMBuilderRef builder,
-                unsigned length,
-                unsigned src_width,
-                unsigned dst_width,
-                LLVMValueRef base_ptr,
-                LLVMValueRef offsets)
-{
-   LLVMTypeRef src_type = LLVMIntType(src_width);
-   LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
-   LLVMTypeRef dst_elem_type = LLVMIntType(dst_width);
-   LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length);
-   LLVMValueRef res;
-   unsigned i;
-
-   res = LLVMGetUndef(dst_vec_type);
-   for(i = 0; i < length; ++i) {
-      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-      LLVMValueRef elem_offset;
-      LLVMValueRef elem_ptr;
-      LLVMValueRef elem;
-
-      elem_offset = LLVMBuildExtractElement(builder, offsets, index, "");
-      elem_ptr = LLVMBuildGEP(builder, base_ptr, &elem_offset, 1, "");
-      elem_ptr = LLVMBuildBitCast(builder, elem_ptr, src_ptr_type, "");
-      elem = LLVMBuildLoad(builder, elem_ptr, "");
-
-      assert(src_width <= dst_width);
-      if(src_width > dst_width)
-         elem = LLVMBuildTrunc(builder, elem, dst_elem_type, "");
-      if(src_width < dst_width)
-         elem = LLVMBuildZExt(builder, elem, dst_elem_type, "");
-
-      res = LLVMBuildInsertElement(builder, res, elem, index, "");
-   }
-
-   return res;
-}
-
-
-/**
  * Compute the offset of a pixel block.
  *
- * x, y, z, y_stride, z_stride are vectors, and they refer to pixel blocks, as
- * per format description, and not individual pixels.
+ * x, y, z, y_stride, z_stride are vectors, and they refer to pixels.
+ *
+ * Returns the relative offset and i,j sub-block coordinates
  */
-LLVMValueRef
+void
 lp_build_sample_offset(struct lp_build_context *bld,
                        const struct util_format_description *format_desc,
                        LLVMValueRef x,
                        LLVMValueRef y,
                        LLVMValueRef z,
                        LLVMValueRef y_stride,
-                       LLVMValueRef z_stride)
+                       LLVMValueRef z_stride,
+                       LLVMValueRef *out_offset,
+                       LLVMValueRef *out_i,
+                       LLVMValueRef *out_j)
 {
    LLVMValueRef x_stride;
    LLVMValueRef offset;
+   LLVMValueRef i;
+   LLVMValueRef j;
+
+   /*
+    * Describe the coordinates in terms of pixel blocks.
+    *
+    * TODO: pixel blocks are power of two. LLVM should convert rem/div to
+    * bit arithmetic. Verify this.
+    */
+
+   if (format_desc->block.width == 1) {
+      i = bld->zero;
+   }
+   else {
+      LLVMValueRef block_width = lp_build_const_int_vec(bld->type, format_desc->block.width);
+      i = LLVMBuildURem(bld->builder, x, block_width, "");
+      x = LLVMBuildUDiv(bld->builder, x, block_width, "");
+   }
+
+   if (format_desc->block.height == 1) {
+      j = bld->zero;
+   }
+   else {
+      LLVMValueRef block_height = lp_build_const_int_vec(bld->type, format_desc->block.height);
+      j = LLVMBuildURem(bld->builder, y, block_height, "");
+      y = LLVMBuildUDiv(bld->builder, y, block_height, "");
+   }
 
    x_stride = lp_build_const_vec(bld->type, format_desc->block.bits/8);
    offset = lp_build_mul(bld, x, x_stride);
@@ -206,5 +185,7 @@ lp_build_sample_offset(struct lp_build_context *bld,
       offset = lp_build_add(bld, offset, z_offset);
    }
 
-   return offset;
+   *out_offset = offset;
+   *out_i = i;
+   *out_j = j;
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index 51e98ab2f9..5b8f478094 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -146,23 +146,17 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
                         const struct pipe_sampler_state *sampler);
 
 
-LLVMValueRef
-lp_build_gather(LLVMBuilderRef builder,
-                unsigned length,
-                unsigned src_width,
-                unsigned dst_width,
-                LLVMValueRef base_ptr,
-                LLVMValueRef offsets);
-
-
-LLVMValueRef
+void
 lp_build_sample_offset(struct lp_build_context *bld,
                        const struct util_format_description *format_desc,
                        LLVMValueRef x,
                        LLVMValueRef y,
                        LLVMValueRef z,
                        LLVMValueRef y_stride,
-                       LLVMValueRef z_stride);
+                       LLVMValueRef z_stride,
+                       LLVMValueRef *out_offset,
+                       LLVMValueRef *out_i,
+                       LLVMValueRef *out_j);
 
 
 void
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 84c04fe272..1a20d74cac 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -50,8 +50,10 @@
 #include "lp_bld_swizzle.h"
 #include "lp_bld_pack.h"
 #include "lp_bld_flow.h"
+#include "lp_bld_gather.h"
 #include "lp_bld_format.h"
 #include "lp_bld_sample.h"
+#include "lp_bld_quad.h"
 
 
 /**
@@ -264,35 +266,11 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
       }
    }
 
-   /*
-    * Describe the coordinates in terms of pixel blocks.
-    *
-    * TODO: pixel blocks are power of two. LLVM should convert rem/div to
-    * bit arithmetic. Verify this.
-    */
-
-   if (bld->format_desc->block.width == 1) {
-      i = bld->uint_coord_bld.zero;
-   }
-   else {
-      LLVMValueRef block_width = lp_build_const_int_vec(bld->uint_coord_bld.type, bld->format_desc->block.width);
-      i = LLVMBuildURem(bld->builder, x, block_width, "");
-      x = LLVMBuildUDiv(bld->builder, x, block_width, "");
-   }
-
-   if (bld->format_desc->block.height == 1) {
-      j = bld->uint_coord_bld.zero;
-   }
-   else {
-      LLVMValueRef block_height = lp_build_const_int_vec(bld->uint_coord_bld.type, bld->format_desc->block.height);
-      j = LLVMBuildURem(bld->builder, y, block_height, "");
-      y = LLVMBuildUDiv(bld->builder, y, block_height, "");
-   }
-
    /* convert x,y,z coords to linear offset from start of texture, in bytes */
-   offset = lp_build_sample_offset(&bld->uint_coord_bld,
-                                   bld->format_desc,
-                                   x, y, z, y_stride, z_stride);
+   lp_build_sample_offset(&bld->uint_coord_bld,
+                          bld->format_desc,
+                          x, y, z, y_stride, z_stride,
+                          &offset, &i, &j);
 
    if (use_border) {
       /* If we can sample the border color, it means that texcoords may
@@ -344,6 +322,9 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
 }
 
 
+/**
+ * Fetch the texels as <4n x i8> in AoS form.
+ */
 static LLVMValueRef
 lp_build_sample_packed(struct lp_build_sample_context *bld,
                        LLVMValueRef x,
@@ -351,25 +332,46 @@ lp_build_sample_packed(struct lp_build_sample_context *bld,
                        LLVMValueRef y_stride,
                        LLVMValueRef data_array)
 {
-   LLVMValueRef offset;
+   LLVMValueRef offset, i, j;
    LLVMValueRef data_ptr;
+   LLVMValueRef res;
 
-   offset = lp_build_sample_offset(&bld->uint_coord_bld,
-                                   bld->format_desc,
-                                   x, y, NULL, y_stride, NULL);
-
-   assert(bld->format_desc->block.width == 1);
-   assert(bld->format_desc->block.height == 1);
-   assert(bld->format_desc->block.bits <= bld->texel_type.width);
+   /* convert x,y,z coords to linear offset from start of texture, in bytes */
+   lp_build_sample_offset(&bld->uint_coord_bld,
+                          bld->format_desc,
+                          x, y, NULL, y_stride, NULL,
+                          &offset, &i, &j);
 
    /* get pointer to mipmap level 0 data */
    data_ptr = lp_build_get_const_mipmap_level(bld, data_array, 0);
 
-   return lp_build_gather(bld->builder,
-                          bld->texel_type.length,
-                          bld->format_desc->block.bits,
-                          bld->texel_type.width,
-                          data_ptr, offset);
+   if (util_format_is_rgba8_variant(bld->format_desc)) {
+      /* Just fetch the data directly without swizzling */
+      assert(bld->format_desc->block.width == 1);
+      assert(bld->format_desc->block.height == 1);
+      assert(bld->format_desc->block.bits <= bld->texel_type.width);
+
+      res = lp_build_gather(bld->builder,
+                            bld->texel_type.length,
+                            bld->format_desc->block.bits,
+                            bld->texel_type.width,
+                            data_ptr, offset);
+   }
+   else {
+      struct lp_type type;
+
+      assert(bld->texel_type.width == 32);
+
+      memset(&type, 0, sizeof type);
+      type.width = 8;
+      type.length = bld->texel_type.length*4;
+      type.norm = TRUE;
+
+      res = lp_build_fetch_rgba_aos(bld->builder, bld->format_desc, type,
+                                    data_ptr, offset, i, j);
+   }
+
+   return res;
 }
 
 
@@ -817,9 +819,8 @@ lp_build_minify(struct lp_build_sample_context *bld,
 
 /**
  * Generate code to compute texture level of detail (lambda).
- * \param s  vector of texcoord s values
- * \param t  vector of texcoord t values
- * \param r  vector of texcoord r values
+ * \param ddx  partial derivatives of (s, t, r, q) with respect to X
+ * \param ddy  partial derivatives of (s, t, r, q) with respect to Y
  * \param lod_bias  optional float vector with the shader lod bias
  * \param explicit_lod  optional float vector with the explicit lod
  * \param width  scalar int texture width
@@ -831,11 +832,8 @@ lp_build_minify(struct lp_build_sample_context *bld,
  */
 static LLVMValueRef
 lp_build_lod_selector(struct lp_build_sample_context *bld,
-                      LLVMValueRef s,
-                      LLVMValueRef t,
-                      LLVMValueRef r,
-                      const LLVMValueRef *ddx,
-                      const LLVMValueRef *ddy,
+                      const LLVMValueRef ddx[4],
+                      const LLVMValueRef ddy[4],
                       LLVMValueRef lod_bias, /* optional */
                       LLVMValueRef explicit_lod, /* optional */
                       LLVMValueRef width,
@@ -870,14 +868,6 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
          LLVMValueRef dtdx = NULL, dtdy = NULL, drdx = NULL, drdy = NULL;
          LLVMValueRef rho;
 
-         /*
-          * dsdx = abs(s[1] - s[0]);
-          * dsdy = abs(s[2] - s[0]);
-          * dtdx = abs(t[1] - t[0]);
-          * dtdy = abs(t[2] - t[0]);
-          * drdx = abs(r[1] - r[0]);
-          * drdy = abs(r[2] - r[0]);
-          */
          dsdx = LLVMBuildExtractElement(bld->builder, ddx[0], index0, "dsdx");
          dsdx = lp_build_abs(float_bld, dsdx);
          dsdy = LLVMBuildExtractElement(bld->builder, ddy[0], index0, "dsdy");
@@ -1287,7 +1277,7 @@ lp_build_cube_face(struct lp_build_sample_context *bld,
 
 
 /**
- * Generate code to do cube face selection and per-face texcoords.
+ * Generate code to do cube face selection and compute per-face texcoords.
  */
 static void
 lp_build_cube_lookup(struct lp_build_sample_context *bld,
@@ -1411,7 +1401,6 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
          lp_build_endif(&if_ctx2);
          lp_build_flow_scope_end(flow_ctx2);
          lp_build_flow_destroy(flow_ctx2);
-
          *face_s = face_s2;
          *face_t = face_t2;
          *face = face2;
@@ -1457,13 +1446,14 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
    int chan;
 
    if (img_filter == PIPE_TEX_FILTER_NEAREST) {
+      /* sample the first mipmap level */
       lp_build_sample_image_nearest(bld,
                                     width0_vec, height0_vec, depth0_vec,
                                     row_stride0_vec, img_stride0_vec,
                                     data_ptr0, s, t, r, colors0);
 
       if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
-         /* sample the second mipmap level, and interp */
+         /* sample the second mipmap level */
          lp_build_sample_image_nearest(bld,
                                        width1_vec, height1_vec, depth1_vec,
                                        row_stride1_vec, img_stride1_vec,
@@ -1473,13 +1463,14 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
    else {
       assert(img_filter == PIPE_TEX_FILTER_LINEAR);
 
+      /* sample the first mipmap level */
       lp_build_sample_image_linear(bld,
                                    width0_vec, height0_vec, depth0_vec,
                                    row_stride0_vec, img_stride0_vec,
                                    data_ptr0, s, t, r, colors0);
 
       if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
-         /* sample the second mipmap level, and interp */
+         /* sample the second mipmap level */
          lp_build_sample_image_linear(bld,
                                       width1_vec, height1_vec, depth1_vec,
                                       row_stride1_vec, img_stride1_vec,
@@ -1542,6 +1533,7 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
    LLVMValueRef row_stride0_vec = NULL, row_stride1_vec = NULL;
    LLVMValueRef img_stride0_vec = NULL, img_stride1_vec = NULL;
    LLVMValueRef data_ptr0, data_ptr1 = NULL;
+   LLVMValueRef face_ddx[4], face_ddy[4];
 
    /*
    printf("%s mip %d  min %d  mag %d\n", __FUNCTION__,
@@ -1549,6 +1541,30 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
    */
 
    /*
+    * Choose cube face, recompute texcoords and derivatives for the chosen face.
+    */
+   if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+      LLVMValueRef face, face_s, face_t;
+      lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t);
+      s = face_s; /* vec */
+      t = face_t; /* vec */
+      /* use 'r' to indicate cube face */
+      r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */
+
+      /* recompute ddx, ddy using the new (s,t) face texcoords */
+      face_ddx[0] = lp_build_ddx(&bld->coord_bld, s);
+      face_ddx[1] = lp_build_ddx(&bld->coord_bld, t);
+      face_ddx[2] = NULL;
+      face_ddx[3] = NULL;
+      face_ddy[0] = lp_build_ddy(&bld->coord_bld, s);
+      face_ddy[1] = lp_build_ddy(&bld->coord_bld, t);
+      face_ddy[2] = NULL;
+      face_ddy[3] = NULL;
+      ddx = face_ddx;
+      ddy = face_ddy;
+   }
+
+   /*
     * Compute the level of detail (float).
     */
    if (min_filter != mag_filter ||
@@ -1556,7 +1572,7 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
       /* Need to compute lod either to choose mipmap levels or to
        * distinguish between minification/magnification with one mipmap level.
        */
-      lod = lp_build_lod_selector(bld, s, t, r, ddx, ddy,
+      lod = lp_build_lod_selector(bld, ddx, ddy,
                                   lod_bias, explicit_lod,
                                   width, height, depth);
    }
@@ -1566,9 +1582,20 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
     */
    if (mip_filter == PIPE_TEX_MIPFILTER_NONE) {
       /* always use mip level 0 */
-      ilevel0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+      if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+         /* XXX this is a work-around for an apparent bug in LLVM 2.7.
+          * We should be able to set ilevel0 = const(0) but that causes
+          * bad x86 code to be emitted.
+          */
+         lod = lp_build_const_elem(bld->coord_bld.type, 0.0);
+         lp_build_nearest_mip_level(bld, unit, lod, &ilevel0);
+      }
+      else {
+         ilevel0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+      }
    }
    else {
+      assert(lod);
       if (mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
          lp_build_nearest_mip_level(bld, unit, lod, &ilevel0);
       }
@@ -1623,18 +1650,6 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
    }
 
    /*
-    * Choose cube face, recompute per-face texcoords.
-    */
-   if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
-      LLVMValueRef face, face_s, face_t;
-      lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t);
-      s = face_s; /* vec */
-      t = face_t; /* vec */
-      /* use 'r' to indicate cube face */
-      r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */
-   }
-
-   /*
     * Get pointer(s) to image data for mipmap level(s).
     */
    data_ptr0 = lp_build_get_mipmap_level(bld, data_array, ilevel0);
@@ -1712,36 +1727,6 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
 
 
 static void
-lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder,
-                          struct lp_type dst_type,
-                          LLVMValueRef packed,
-                          LLVMValueRef *rgba)
-{
-   LLVMValueRef mask = lp_build_const_int_vec(dst_type, 0xff);
-   unsigned chan;
-
-   /* Decode the input vector components */
-   for (chan = 0; chan < 4; ++chan) {
-      unsigned start = chan*8;
-      unsigned stop = start + 8;
-      LLVMValueRef input;
-
-      input = packed;
-
-      if(start)
-         input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(dst_type, start), "");
-
-      if(stop < 32)
-         input = LLVMBuildAnd(builder, input, mask, "");
-
-      input = lp_build_unsigned_norm_to_float(builder, 8, dst_type, input);
-
-      rgba[chan] = input;
-   }
-}
-
-
-static void
 lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
                               LLVMValueRef s,
                               LLVMValueRef t,
@@ -1935,15 +1920,20 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
     * Convert to SoA and swizzle.
     */
 
-   packed = LLVMBuildBitCast(builder, packed, i32_vec_type, "");
-
    lp_build_rgba8_to_f32_soa(bld->builder,
                              bld->texel_type,
                              packed, unswizzled);
 
-   lp_build_format_swizzle_soa(bld->format_desc,
-                               &bld->texel_bld,
-                               unswizzled, texel_out);
+   if (util_format_is_rgba8_variant(bld->format_desc)) {
+      lp_build_format_swizzle_soa(bld->format_desc,
+                                  &bld->texel_bld,
+                                  unswizzled, texel_out);
+   } else {
+      texel_out[0] = unswizzled[0];
+      texel_out[1] = unswizzled[1];
+      texel_out[2] = unswizzled[2];
+      texel_out[3] = unswizzled[3];
+   }
 
    apply_sampler_swizzle(bld, texel_out);
 }
@@ -2007,6 +1997,8 @@ lp_build_sample_nop(struct lp_build_sample_context *bld,
  * 'texel' will return a vector of four LLVMValueRefs corresponding to
  * R, G, B, A.
  * \param type  vector float type to use for coords, etc.
+ * \param ddx  partial derivatives of (s,t,r,q) with respect to x
+ * \param ddy  partial derivatives of (s,t,r,q) with respect to y
  */
 void
 lp_build_sample_soa(LLVMBuilderRef builder,
@@ -2016,8 +2008,8 @@ lp_build_sample_soa(LLVMBuilderRef builder,
                     unsigned unit,
                     unsigned num_coords,
                     const LLVMValueRef *coords,
-                    const LLVMValueRef *ddx,
-                    const LLVMValueRef *ddy,
+                    const LLVMValueRef ddx[4],
+                    const LLVMValueRef ddy[4],
                     LLVMValueRef lod_bias, /* optional */
                     LLVMValueRef explicit_lod, /* optional */
                     LLVMValueRef texel_out[4])
@@ -2079,7 +2071,8 @@ lp_build_sample_soa(LLVMBuilderRef builder,
       /* For debug: no-op texture sampling */
       lp_build_sample_nop(&bld, texel_out);
    }
-   else if (util_format_is_rgba8_variant(bld.format_desc) &&
+   else if (util_format_fits_8unorm(bld.format_desc) &&
+            bld.format_desc->nr_channels > 1 &&
             static_state->target == PIPE_TEXTURE_2D &&
             static_state->min_img_filter == PIPE_TEX_FILTER_LINEAR &&
             static_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR &&
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
index 3c8a7bc09e..20cf96ca66 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
@@ -110,7 +110,7 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
    /* XXX: SSE3 has PSHUFB which should be better than bitmasks, but forcing
     * using shuffles here actually causes worst results. More investigation is
     * needed. */
-   if (n <= 4) {
+   if (type.width >= 16) {
       /*
        * Shuffle.
        */
@@ -132,7 +132,7 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
        *   YY00 YY00 .... YY00
        *   YYYY YYYY .... YYYY  <= output
        */
-      struct lp_type type4 = type;
+      struct lp_type type4;
       const char shifts[4][2] = {
          { 1,  2},
          {-1,  2},
@@ -147,6 +147,13 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
 
       a = LLVMBuildAnd(bld->builder, a, lp_build_const_mask_aos(type, cond), "");
 
+      /*
+       * Build a type where each element is an integer that cover the four
+       * channels.
+       */
+
+      type4 = type;
+      type4.floating = FALSE;
       type4.width *= 4;
       type4.length /= 4;
 
@@ -176,80 +183,170 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
 
 
 LLVMValueRef
-lp_build_swizzle1_aos(struct lp_build_context *bld,
-                      LLVMValueRef a,
-                      const unsigned char swizzle[4])
+lp_build_swizzle_aos(struct lp_build_context *bld,
+                     LLVMValueRef a,
+                     const unsigned char swizzles[4])
 {
-   const unsigned n = bld->type.length;
+   const struct lp_type type = bld->type;
+   const unsigned n = type.length;
    unsigned i, j;
 
-   if(a == bld->undef || a == bld->zero || a == bld->one)
+   if (swizzles[0] == PIPE_SWIZZLE_RED &&
+       swizzles[1] == PIPE_SWIZZLE_GREEN &&
+       swizzles[2] == PIPE_SWIZZLE_BLUE &&
+       swizzles[3] == PIPE_SWIZZLE_ALPHA) {
       return a;
+   }
 
-   if(swizzle[0] == swizzle[1] && swizzle[1] == swizzle[2] && swizzle[2] == swizzle[3])
-      return lp_build_broadcast_aos(bld, a, swizzle[0]);
+   if (swizzles[0] == swizzles[1] &&
+       swizzles[1] == swizzles[2] &&
+       swizzles[2] == swizzles[3]) {
+      switch (swizzles[0]) {
+      case PIPE_SWIZZLE_RED:
+      case PIPE_SWIZZLE_GREEN:
+      case PIPE_SWIZZLE_BLUE:
+      case PIPE_SWIZZLE_ALPHA:
+         return lp_build_broadcast_aos(bld, a, swizzles[0]);
+      case PIPE_SWIZZLE_ZERO:
+         return bld->zero;
+      case PIPE_SWIZZLE_ONE:
+         return bld->one;
+      default:
+         assert(0);
+         return bld->undef;
+      }
+   }
 
-   {
+   if (type.width >= 16) {
       /*
        * Shuffle.
        */
-      LLVMTypeRef elem_type = LLVMInt32Type();
+      LLVMValueRef undef = LLVMGetUndef(lp_build_elem_type(type));
+      LLVMTypeRef i32t = LLVMInt32Type();
       LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+      LLVMValueRef aux[LP_MAX_VECTOR_LENGTH];
+
+      memset(aux, 0, sizeof aux);
+
+      for(j = 0; j < n; j += 4) {
+         for(i = 0; i < 4; ++i) {
+            unsigned shuffle;
+            switch (swizzles[i]) {
+            default:
+               assert(0);
+               /* fall through */
+            case PIPE_SWIZZLE_RED:
+            case PIPE_SWIZZLE_GREEN:
+            case PIPE_SWIZZLE_BLUE:
+            case PIPE_SWIZZLE_ALPHA:
+               shuffle = j + swizzles[i];
+               break;
+            case PIPE_SWIZZLE_ZERO:
+               shuffle = type.length + 0;
+               if (!aux[0]) {
+                  aux[0] = lp_build_const_elem(type, 0.0);
+               }
+               break;
+            case PIPE_SWIZZLE_ONE:
+               shuffle = type.length + 1;
+               if (!aux[1]) {
+                  aux[1] = lp_build_const_elem(type, 1.0);
+               }
+               break;
+            }
+            shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
+         }
+      }
 
-      for(j = 0; j < n; j += 4)
-         for(i = 0; i < 4; ++i)
-            shuffles[j + i] = LLVMConstInt(elem_type, j + swizzle[i], 0);
-
-      return LLVMBuildShuffleVector(bld->builder, a, bld->undef, LLVMConstVector(shuffles, n), "");
-   }
-}
-
+      for (i = 0; i < n; ++i) {
+         if (!aux[i]) {
+            aux[i] = undef;
+         }
+      }
 
-LLVMValueRef
-lp_build_swizzle2_aos(struct lp_build_context *bld,
-                      LLVMValueRef a,
-                      LLVMValueRef b,
-                      const unsigned char swizzle[4])
-{
-   const unsigned n = bld->type.length;
-   unsigned i, j;
+      return LLVMBuildShuffleVector(bld->builder, a,
+                                    LLVMConstVector(aux, n),
+                                    LLVMConstVector(shuffles, n), "");
+   } else {
+      /*
+       * Bit mask and shifts.
+       *
+       * For example, this will convert BGRA to RGBA by doing
+       *
+       *   rgba = (bgra & 0x00ff0000) >> 16
+       *        | (bgra & 0xff00ff00)
+       *        | (bgra & 0x000000ff) << 16
+       *
+       * This is necessary not only for faster cause, but because X86 backend
+       * will refuse shuffles of <4 x i8> vectors
+       */
+      LLVMValueRef res;
+      struct lp_type type4;
+      boolean cond[4];
+      unsigned chan;
+      int shift;
 
-   if(swizzle[0] < 4 && swizzle[1] < 4 && swizzle[2] < 4 && swizzle[3] < 4)
-      return lp_build_swizzle1_aos(bld, a, swizzle);
+      /*
+       * Start with a mixture of 1 and 0.
+       */
+      for (chan = 0; chan < 4; ++chan) {
+         cond[chan] = swizzles[chan] == PIPE_SWIZZLE_ONE ? TRUE : FALSE;
+      }
+      res = lp_build_select_aos(bld, bld->one, bld->zero, cond);
 
-   if(a == b) {
-      unsigned char swizzle1[4];
-      swizzle1[0] = swizzle[0] % 4;
-      swizzle1[1] = swizzle[1] % 4;
-      swizzle1[2] = swizzle[2] % 4;
-      swizzle1[3] = swizzle[3] % 4;
-      return lp_build_swizzle1_aos(bld, a, swizzle1);
-   }
+      /*
+       * Build a type where each element is an integer that cover the four
+       * channels.
+       */
+      type4 = type;
+      type4.floating = FALSE;
+      type4.width *= 4;
+      type4.length /= 4;
 
-   if(swizzle[0] % 4 == 0 &&
-      swizzle[1] % 4 == 1 &&
-      swizzle[2] % 4 == 2 &&
-      swizzle[3] % 4 == 3) {
-      boolean cond[4];
-      cond[0] = swizzle[0] / 4;
-      cond[1] = swizzle[1] / 4;
-      cond[2] = swizzle[2] / 4;
-      cond[3] = swizzle[3] / 4;
-      return lp_build_select_aos(bld, a, b, cond);
-   }
+      a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(type4), "");
+      res = LLVMBuildBitCast(bld->builder, res, lp_build_vec_type(type4), "");
 
-   {
       /*
-       * Shuffle.
+       * Mask and shift the channels, trying to group as many channels in the
+       * same shift as possible
        */
-      LLVMTypeRef elem_type = LLVMInt32Type();
-      LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
-
-      for(j = 0; j < n; j += 4)
-         for(i = 0; i < 4; ++i)
-            shuffles[j + i] = LLVMConstInt(elem_type, j + (swizzle[i] % 4) + (swizzle[i] / 4 * n), 0);
+      for (shift = -3; shift <= 3; ++shift) {
+         unsigned long long mask = 0;
+
+         assert(type4.width <= sizeof(mask)*8);
+
+         for (chan = 0; chan < 4; ++chan) {
+            /* FIXME: big endian */
+            if (swizzles[chan] < 4 &&
+                chan - swizzles[chan] == shift) {
+               mask |= ((1ULL << type.width) - 1) << (swizzles[chan] * type.width);
+            }
+         }
+
+         if (mask) {
+            LLVMValueRef masked;
+            LLVMValueRef shifted;
+
+            if (0)
+               debug_printf("shift = %i, mask = 0x%08llx\n", shift, mask);
+
+            masked = LLVMBuildAnd(bld->builder, a,
+                                  lp_build_const_int_vec(type4, mask), "");
+            if (shift > 0) {
+               shifted = LLVMBuildShl(bld->builder, masked,
+                                      lp_build_const_int_vec(type4, shift*type.width), "");
+            } else if (shift < 0) {
+               shifted = LLVMBuildLShr(bld->builder, masked,
+                                       lp_build_const_int_vec(type4, -shift*type.width), "");
+            } else {
+               shifted = masked;
+            }
+
+            res = LLVMBuildOr(bld->builder, res, shifted, "");
+         }
+      }
 
-      return LLVMBuildShuffleVector(bld->builder, a, b, LLVMConstVector(shuffles, n), "");
+      return LLVMBuildBitCast(bld->builder, res, lp_build_vec_type(type), "");
    }
 }
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
index 4f4fa777c9..315e1bcb54 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
@@ -68,24 +68,12 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
 /**
  * Swizzle a vector consisting of an array of XYZW structs.
  *
- * @param swizzle is the in [0,4[ range.
+ * @param swizzles is the in [0,4[ range.
  */
 LLVMValueRef
-lp_build_swizzle1_aos(struct lp_build_context *bld,
-                      LLVMValueRef a,
-                      const unsigned char swizzle[4]);
-
-
-/**
- * Swizzle two vector consisting of an array of XYZW structs.
- *
- * @param swizzle is the in [0,8[ range. Values in [4,8[ range refer to b.
- */
-LLVMValueRef
-lp_build_swizzle2_aos(struct lp_build_context *bld,
-                      LLVMValueRef a,
-                      LLVMValueRef b,
-                      const unsigned char swizzle[4]);
+lp_build_swizzle_aos(struct lp_build_context *bld,
+                     LLVMValueRef a,
+                     const unsigned char swizzles[4]);
 
 
 LLVMValueRef
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index dec7556138..21236839fb 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -49,6 +49,7 @@
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
 #include "lp_bld_arit.h"
+#include "lp_bld_gather.h"
 #include "lp_bld_logic.h"
 #include "lp_bld_swizzle.h"
 #include "lp_bld_flow.h"
@@ -132,10 +133,14 @@ struct lp_build_tgsi_soa_context
    LLVMValueRef addr[LP_MAX_TGSI_ADDRS][NUM_CHANNELS];
    LLVMValueRef preds[LP_MAX_TGSI_PREDS][NUM_CHANNELS];
 
-   /* we allocate an array of temps if we have indirect
-    * addressing and then the temps above is unused */
+   /* We allocate/use this array of temps if (1 << TGSI_FILE_TEMPORARY) is
+    * set in the indirect_files field.
+    * The temps[] array above is unused then.
+    */
    LLVMValueRef temps_array;
-   boolean has_indirect_addressing;
+
+   /** bitmask indicating which register files are accessed indirectly */
+   unsigned indirect_files;
 
    struct lp_build_mask_context *mask;
    struct lp_exec_mask exec_mask;
@@ -404,25 +409,92 @@ static void lp_exec_mask_endsub(struct lp_exec_mask *mask, int *pc)
    lp_exec_mask_update(mask);
 }
 
+
+/**
+ * Return pointer to a temporary register channel (src or dest).
+ * Note that indirect addressing cannot be handled here.
+ * \param index  which temporary register
+ * \param chan  which channel of the temp register.
+ */
 static LLVMValueRef
 get_temp_ptr(struct lp_build_tgsi_soa_context *bld,
              unsigned index,
-             unsigned chan,
-             boolean is_indirect,
-             LLVMValueRef addr)
+             unsigned chan)
 {
    assert(chan < 4);
-   if (!bld->has_indirect_addressing) {
-      return bld->temps[index][chan];
-   } else {
-      LLVMValueRef lindex =
-         LLVMConstInt(LLVMInt32Type(), index * 4 + chan, 0);
-      if (is_indirect)
-         lindex = lp_build_add(&bld->base, lindex, addr);
+   if (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
+      LLVMValueRef lindex = lp_build_const_int32(index * 4 + chan);
       return LLVMBuildGEP(bld->base.builder, bld->temps_array, &lindex, 1, "");
    }
+   else {
+      return bld->temps[index][chan];
+   }
 }
 
+
+/**
+ * Gather vector.
+ * XXX the lp_build_gather() function should be capable of doing this
+ * with a little work.
+ */
+static LLVMValueRef
+build_gather(struct lp_build_tgsi_soa_context *bld,
+             LLVMValueRef base_ptr,
+             LLVMValueRef indexes)
+{
+   LLVMValueRef res = bld->base.undef;
+   unsigned i;
+
+   /*
+    * Loop over elements of index_vec, load scalar value, insert it into 'res'.
+    */
+   for (i = 0; i < bld->base.type.length; i++) {
+      LLVMValueRef ii = LLVMConstInt(LLVMInt32Type(), i, 0);
+      LLVMValueRef index = LLVMBuildExtractElement(bld->base.builder,
+                                                   indexes, ii, "");
+      LLVMValueRef scalar_ptr = LLVMBuildGEP(bld->base.builder, base_ptr,
+                                             &index, 1, "");
+      LLVMValueRef scalar = LLVMBuildLoad(bld->base.builder, scalar_ptr, "");
+
+      res = LLVMBuildInsertElement(bld->base.builder, res, scalar, ii, "");
+   }
+
+   return res;
+}
+
+
+/**
+ * Read the current value of the ADDR register, convert the floats to
+ * ints, multiply by four and return the vector of offsets.
+ * The offsets will be used to index into the constant buffer or
+ * temporary register file.
+ */
+static LLVMValueRef
+get_indirect_offsets(struct lp_build_tgsi_soa_context *bld,
+                     const struct tgsi_src_register *indirect_reg)
+{
+   /* always use X component of address register */
+   const int x = indirect_reg->SwizzleX;
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->base.type);
+   uint swizzle = tgsi_util_get_src_register_swizzle(indirect_reg, x);
+   LLVMValueRef vec4 = lp_build_const_int_vec(bld->int_bld.type, 4); 
+   LLVMValueRef addr_vec;
+
+   addr_vec = LLVMBuildLoad(bld->base.builder,
+                            bld->addr[indirect_reg->Index][swizzle],
+                            "load addr reg");
+
+   /* for indexing we want integers */
+   addr_vec = LLVMBuildFPToSI(bld->base.builder, addr_vec,
+                              int_vec_type, "");
+
+   /* addr_vec = addr_vec * 4 */
+   addr_vec = lp_build_mul(&bld->base, addr_vec, vec4);
+
+   return addr_vec;
+}
+
+
 /**
  * Register fetch.
  */
@@ -430,14 +502,14 @@ static LLVMValueRef
 emit_fetch(
    struct lp_build_tgsi_soa_context *bld,
    const struct tgsi_full_instruction *inst,
-   unsigned index,
+   unsigned src_op,
    const unsigned chan_index )
 {
-   const struct tgsi_full_src_register *reg = &inst->Src[index];
+   const struct tgsi_full_src_register *reg = &inst->Src[src_op];
    const unsigned swizzle =
       tgsi_util_get_full_src_register_swizzle(reg, chan_index);
    LLVMValueRef res;
-   LLVMValueRef addr = NULL;
+   LLVMValueRef addr_vec = NULL;
 
    if (swizzle > 3) {
       assert(0 && "invalid swizzle in emit_fetch()");
@@ -445,32 +517,33 @@ emit_fetch(
    }
 
    if (reg->Register.Indirect) {
-      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->base.type);
-      unsigned swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, chan_index );
-      addr = LLVMBuildLoad(bld->base.builder,
-                           bld->addr[reg->Indirect.Index][swizzle],
-                           "");
-      /* for indexing we want integers */
-      addr = LLVMBuildFPToSI(bld->base.builder, addr,
-                             int_vec_type, "");
-      addr = LLVMBuildExtractElement(bld->base.builder,
-                                     addr, LLVMConstInt(LLVMInt32Type(), 0, 0),
-                                     "");
-      addr = lp_build_mul(&bld->base, addr, LLVMConstInt(LLVMInt32Type(), 4, 0));
+      assert(bld->indirect_files);
+      addr_vec = get_indirect_offsets(bld, &reg->Indirect);
    }
 
    switch (reg->Register.File) {
    case TGSI_FILE_CONSTANT:
-      {
-         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(),
-                                           reg->Register.Index*4 + swizzle, 0);
+      if (reg->Register.Indirect) {
+         LLVMValueRef index_vec;  /* index into the const buffer */
+
+         assert(bld->indirect_files & (1 << TGSI_FILE_CONSTANT));
+
+         /* index_vec = broadcast(reg->Register.Index * 4 + swizzle) */
+         index_vec = lp_build_const_int_vec(bld->int_bld.type,
+                                            reg->Register.Index * 4 + swizzle);
+
+         /* index_vec = index_vec + addr_vec */
+         index_vec = lp_build_add(&bld->base, index_vec, addr_vec);
+
+         /* Gather values from the constant buffer */
+         res = build_gather(bld, bld->consts_ptr, index_vec);
+      }
+      else {
+         LLVMValueRef index;  /* index into the const buffer */
          LLVMValueRef scalar, scalar_ptr;
 
-         if (reg->Register.Indirect) {
-            /*lp_build_printf(bld->base.builder,
-              "\taddr = %d\n", addr);*/
-            index = lp_build_add(&bld->base, index, addr);
-         }
+         index = lp_build_const_int32(reg->Register.Index*4 + swizzle);
+
          scalar_ptr = LLVMBuildGEP(bld->base.builder, bld->consts_ptr,
                                    &index, 1, "");
          scalar = LLVMBuildLoad(bld->base.builder, scalar_ptr, "");
@@ -490,13 +563,38 @@ emit_fetch(
       break;
 
    case TGSI_FILE_TEMPORARY:
-      {
-         LLVMValueRef temp_ptr = get_temp_ptr(bld, reg->Register.Index,
-                                              swizzle,
-                                              reg->Register.Indirect,
-                                              addr);
+      if (reg->Register.Indirect) {
+         LLVMValueRef vec_len =
+            lp_build_const_int_vec(bld->int_bld.type, bld->base.type.length);
+         LLVMValueRef index_vec;  /* index into the const buffer */
+         LLVMValueRef temps_array;
+         LLVMTypeRef float4_ptr_type;
+
+         assert(bld->indirect_files & (1 << TGSI_FILE_TEMPORARY));
+
+         /* index_vec = broadcast(reg->Register.Index * 4 + swizzle) */
+         index_vec = lp_build_const_int_vec(bld->int_bld.type,
+                                            reg->Register.Index * 4 + swizzle);
+
+         /* index_vec += addr_vec */
+         index_vec = lp_build_add(&bld->int_bld, index_vec, addr_vec);
+
+         /* index_vec *= vector_length */
+         index_vec = lp_build_mul(&bld->int_bld, index_vec, vec_len);
+
+         /* cast temps_array pointer to float* */
+         float4_ptr_type = LLVMPointerType(LLVMFloatType(), 0);
+         temps_array = LLVMBuildBitCast(bld->int_bld.builder, bld->temps_array,
+                                        float4_ptr_type, "");
+
+         /* Gather values from the temporary register array */
+         res = build_gather(bld, temps_array, index_vec);
+      }
+      else {
+         LLVMValueRef temp_ptr;
+         temp_ptr = get_temp_ptr(bld, reg->Register.Index, swizzle);
          res = LLVMBuildLoad(bld->base.builder, temp_ptr, "");
-         if(!res)
+         if (!res)
             return bld->base.undef;
       }
       break;
@@ -660,8 +758,12 @@ emit_store(
    }
 
    if (reg->Register.Indirect) {
+      /* XXX use get_indirect_offsets() here eventually */
       LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->base.type);
       unsigned swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, chan_index );
+
+      assert(bld->indirect_files);
+
       addr = LLVMBuildLoad(bld->base.builder,
                            bld->addr[reg->Indirect.Index][swizzle],
                            "");
@@ -680,14 +782,18 @@ emit_store(
                          bld->outputs[reg->Register.Index][chan_index]);
       break;
 
-   case TGSI_FILE_TEMPORARY: {
-      LLVMValueRef temp_ptr = get_temp_ptr(bld, reg->Register.Index,
-                                           chan_index,
-                                           reg->Register.Indirect,
-                                           addr);
-      lp_exec_mask_store(&bld->exec_mask, pred, value, temp_ptr);
+   case TGSI_FILE_TEMPORARY:
+      if (reg->Register.Indirect) {
+         /* XXX not done yet */
+         debug_printf("WARNING: LLVM scatter store of temp regs"
+                      " not implemented\n");
+      }
+      else {
+         LLVMValueRef temp_ptr = get_temp_ptr(bld, reg->Register.Index,
+                                              chan_index);
+         lp_exec_mask_store(&bld->exec_mask, pred, value, temp_ptr);
+      }
       break;
-   }
 
    case TGSI_FILE_ADDRESS:
       lp_exec_mask_store(&bld->exec_mask, pred, value,
@@ -905,7 +1011,7 @@ emit_declaration(
       switch (decl->Declaration.File) {
       case TGSI_FILE_TEMPORARY:
          assert(idx < LP_MAX_TGSI_TEMPS);
-         if (bld->has_indirect_addressing) {
+         if (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
             LLVMValueRef array_size = LLVMConstInt(LLVMInt32Type(),
                                                    last*4 + 4, 0);
             bld->temps_array = lp_build_array_alloca(bld->base.builder,
@@ -1929,8 +2035,7 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
    bld.outputs = outputs;
    bld.consts_ptr = consts_ptr;
    bld.sampler = sampler;
-   bld.has_indirect_addressing = info->opcode_count[TGSI_OPCODE_ARR] > 0 ||
-                                 info->opcode_count[TGSI_OPCODE_ARL] > 0;
+   bld.indirect_files = info->indirect_files;
    bld.instructions = (struct tgsi_full_instruction *)
                       MALLOC( LP_MAX_INSTRUCTIONS * sizeof(struct tgsi_full_instruction) );
    bld.max_instructions = LP_MAX_INSTRUCTIONS;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.h b/src/gallium/auxiliary/gallivm/lp_bld_type.h
index df77ef2155..3ffe916f8e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.h
@@ -316,6 +316,54 @@ LLVMTypeRef
 lp_build_int32_vec4_type(void);
 
 
+static INLINE struct lp_type
+lp_float32_vec4_type(void)
+{
+   struct lp_type type;
+
+   memset(&type, 0, sizeof(type));
+   type.floating = TRUE;
+   type.sign = TRUE;
+   type.norm = FALSE;
+   type.width = 32;
+   type.length = 4;
+
+   return type;
+}
+
+
+static INLINE struct lp_type
+lp_int32_vec4_type(void)
+{
+   struct lp_type type;
+
+   memset(&type, 0, sizeof(type));
+   type.floating = FALSE;
+   type.sign = TRUE;
+   type.norm = FALSE;
+   type.width = 32;
+   type.length = 4;
+
+   return type;
+}
+
+
+static INLINE struct lp_type
+lp_unorm8_vec4_type(void)
+{
+   struct lp_type type;
+
+   memset(&type, 0, sizeof(type));
+   type.floating = FALSE;
+   type.sign = FALSE;
+   type.norm = TRUE;
+   type.width = 8;
+   type.length = 4;
+
+   return type;
+}
+
+
 struct lp_type
 lp_uint_type(struct lp_type type);
 
diff --git a/src/gallium/auxiliary/os/os_thread.h b/src/gallium/auxiliary/os/os_thread.h
index 0238308d20..a084310d4f 100644
--- a/src/gallium/auxiliary/os/os_thread.h
+++ b/src/gallium/auxiliary/os/os_thread.h
@@ -45,7 +45,6 @@
 #include <pthread.h> /* POSIX threads headers */
 #include <stdio.h> /* for perror() */
 
-#define PIPE_THREAD_HAVE_CONDVAR
 
 /* pipe_thread
  */
@@ -168,19 +167,59 @@ typedef CRITICAL_SECTION pipe_mutex;
 #define pipe_mutex_unlock(mutex) \
    LeaveCriticalSection(&mutex)
 
+/* TODO: Need a macro to declare "I don't care about WinXP compatibilty" */
+#if 0 && defined (_WIN32_WINNT) && (_WIN32_WINNT >= 0x0600)
+/* CONDITION_VARIABLE is only available on newer versions of Windows
+ * (Server 2008/Vista or later).
+ * http://msdn.microsoft.com/en-us/library/ms682052(VS.85).aspx
+ *
+ * pipe_condvar
+ */
+typedef CONDITION_VARIABLE pipe_condvar;
+
+#define pipe_static_condvar(cond) \
+   /*static*/ pipe_condvar cond = CONDITION_VARIABLE_INIT
+
+#define pipe_condvar_init(cond) \
+   InitializeConditionVariable(&(cond))
+
+#define pipe_condvar_destroy(cond) \
+   (void) cond /* nothing to do */
+
+#define pipe_condvar_wait(cond, mutex) \
+   SleepConditionVariableCS(&(cond), &(mutex), INFINITE)
+
+#define pipe_condvar_signal(cond) \
+   WakeConditionVariable(&(cond))
+
+#define pipe_condvar_broadcast(cond) \
+   WakeAllConditionVariable(&(cond))
+
+#else /* need compatibility with pre-Vista Win32 */
 
 /* pipe_condvar (XXX FIX THIS)
+ * See http://www.cs.wustl.edu/~schmidt/win32-cv-1.html
+ * for potential pitfalls in implementation.
  */
-typedef unsigned pipe_condvar;
+typedef DWORD pipe_condvar;
+
+#define pipe_static_condvar(cond) \
+   /*static*/ pipe_condvar cond = 1
 
 #define pipe_condvar_init(cond) \
-   (void) cond
+   (void) (cond = 1)
 
 #define pipe_condvar_destroy(cond) \
    (void) cond
 
+/* Poor man's pthread_cond_wait():
+   Just release the mutex and sleep for one millisecond.
+   The caller's while() loop does all the work. */
 #define pipe_condvar_wait(cond, mutex) \
-   (void) cond; (void) mutex
+   do { pipe_mutex_unlock(mutex); \
+        Sleep(cond); \
+        pipe_mutex_lock(mutex); \
+   } while (0)
 
 #define pipe_condvar_signal(cond) \
    (void) cond
@@ -188,9 +227,12 @@ typedef unsigned pipe_condvar;
 #define pipe_condvar_broadcast(cond) \
    (void) cond
 
+#endif /* pre-Vista win32 */
 
 #else
 
+#include "os/os_time.h"
+
 /** Dummy definitions */
 
 typedef unsigned pipe_thread;
@@ -214,7 +256,6 @@ static INLINE int pipe_thread_destroy( pipe_thread thread )
 }
 
 typedef unsigned pipe_mutex;
-typedef unsigned pipe_condvar;
 
 #define pipe_static_mutex(mutex) \
    static pipe_mutex mutex = 0
@@ -231,17 +272,25 @@ typedef unsigned pipe_condvar;
 #define pipe_mutex_unlock(mutex) \
    (void) mutex
 
+typedef int64_t pipe_condvar;
+
 #define pipe_static_condvar(condvar) \
-   static unsigned condvar = 0
+   static pipe_condvar condvar = 1000
 
 #define pipe_condvar_init(condvar) \
-   (void) condvar
+   (void) (condvar = 1000)
 
 #define pipe_condvar_destroy(condvar) \
    (void) condvar
 
+/* Poor man's pthread_cond_wait():
+   Just release the mutex and sleep for one millisecond.
+   The caller's while() loop does all the work. */
 #define pipe_condvar_wait(condvar, mutex) \
-   (void) condvar
+   do { pipe_mutex_unlock(mutex); \
+        os_time_sleep(condvar); \
+        pipe_mutex_lock(mutex); \
+   } while (0)
 
 #define pipe_condvar_signal(condvar) \
    (void) condvar
@@ -277,27 +326,7 @@ static INLINE void pipe_barrier_wait(pipe_barrier *barrier)
 }
 
 
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
-
-/* XXX FIX THIS */
-typedef unsigned pipe_barrier;
-
-static INLINE void pipe_barrier_init(pipe_barrier *barrier, unsigned count)
-{
-   /* XXX we could implement barriers with a mutex and condition var */
-}
-
-static INLINE void pipe_barrier_destroy(pipe_barrier *barrier)
-{
-}
-
-static INLINE void pipe_barrier_wait(pipe_barrier *barrier)
-{
-   assert(0);
-}
-
-
-#else
+#else /* If the OS doesn't have its own, implement barriers using a mutex and a condvar */
 
 typedef struct {
    unsigned count;
diff --git a/src/gallium/auxiliary/target-helpers/inline_debug_helper.h b/src/gallium/auxiliary/target-helpers/inline_debug_helper.h
new file mode 100644
index 0000000000..0433da6141
--- /dev/null
+++ b/src/gallium/auxiliary/target-helpers/inline_debug_helper.h
@@ -0,0 +1,44 @@
+
+#ifndef INLINE_DEBUG_HELPER_H
+#define INLINE_DEBUG_HELPER_H
+
+#include "pipe/p_compiler.h"
+#include "util/u_debug.h"
+
+
+/* Helper function to wrap a screen with
+ * one or more debug driver: rbug, trace.
+ */
+
+#ifdef GALLIUM_TRACE
+#include "trace/tr_public.h"
+#endif
+
+#ifdef GALLIUM_RBUG
+#include "rbug/rbug_public.h"
+#endif
+
+#ifdef GALLIUM_GALAHAD
+#include "galahad/glhd_public.h"
+#endif
+
+static INLINE struct pipe_screen *
+debug_screen_wrap(struct pipe_screen *screen)
+{
+
+#if defined(GALLIUM_RBUG)
+   screen = rbug_screen_create(screen);
+#endif
+
+#if defined(GALLIUM_TRACE)
+   screen = trace_screen_create(screen);
+#endif
+
+#if defined(GALLIUM_GALAHAD)
+   screen = galahad_screen_create(screen);
+#endif
+
+   return screen;
+}
+
+#endif
diff --git a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
new file mode 100644
index 0000000000..036c1ee48a
--- /dev/null
+++ b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
@@ -0,0 +1,63 @@
+
+#ifndef INLINE_SW_HELPER_H
+#define INLINE_SW_HELPER_H
+
+#include "pipe/p_compiler.h"
+#include "util/u_debug.h"
+#include "state_tracker/sw_winsys.h"
+
+
+/* Helper function to choose and instantiate one of the software rasterizers:
+ * cell, llvmpipe, softpipe.
+ */
+
+#ifdef GALLIUM_SOFTPIPE
+#include "softpipe/sp_public.h"
+#endif
+
+#ifdef GALLIUM_LLVMPIPE
+#include "llvmpipe/lp_public.h"
+#endif
+
+#ifdef GALLIUM_CELL
+#include "cell/ppu/cell_public.h"
+#endif
+
+static INLINE struct pipe_screen *
+sw_screen_create(struct sw_winsys *winsys)
+{
+   const char *default_driver;
+   const char *driver;
+   struct pipe_screen *screen = NULL;
+
+#if defined(GALLIUM_CELL)
+   default_driver = "cell";
+#elif defined(GALLIUM_LLVMPIPE)
+   default_driver = "llvmpipe";
+#elif defined(GALLIUM_SOFTPIPE)
+   default_driver = "softpipe";
+#else
+   default_driver = "";
+#endif
+
+   driver = debug_get_option("GALLIUM_DRIVER", default_driver);
+
+#if defined(GALLIUM_CELL)
+   if (screen == NULL && strcmp(driver, "cell") == 0)
+      screen = cell_create_screen(winsys);
+#endif
+
+#if defined(GALLIUM_LLVMPIPE)
+   if (screen == NULL && strcmp(driver, "llvmpipe") == 0)
+      screen = llvmpipe_create_screen(winsys);
+#endif
+
+#if defined(GALLIUM_SOFTPIPE)
+   if (screen == NULL)
+      screen = softpipe_create_screen(winsys);
+#endif
+
+   return screen;
+}
+
+#endif
diff --git a/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h
new file mode 100644
index 0000000000..0b4e740403
--- /dev/null
+++ b/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h
@@ -0,0 +1,34 @@
+
+#ifndef INLINE_WRAPPER_SW_HELPER_H
+#define INLINE_WRAPPER_SW_HELPER_H
+
+#include "target-helpers/inline_sw_helper.h"
+#include "sw/wrapper/wrapper_sw_winsys.h"
+
+/**
+ * Try to wrap a hw screen with a software screen.
+ * On failure will return given screen.
+ */
+static INLINE struct pipe_screen *
+sw_screen_wrap(struct pipe_screen *screen)
+{
+   struct sw_winsys *sws;
+   struct pipe_screen *sw_screen;
+
+   sws = wrapper_sw_winsys_warp_pipe_screen(screen);
+   if (!sws)
+      goto err;
+
+   sw_screen = sw_screen_create(sws);
+   if (sw_screen == screen)
+      goto err_winsys;
+
+   return sw_screen;
+
+err_winsys:
+   sws->destroy(sws);
+err:
+  return screen;
+}
+
+#endif
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index 9fcc28f4c9..f71ffb7030 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -176,7 +176,11 @@ static const char *primitive_names[] =
    "TRIANGLE_FAN",
    "QUADS",
    "QUAD_STRIP",
-   "POLYGON"
+   "POLYGON",
+   "LINES_ADJACENCY",
+   "LINE_STRIP_ADJACENCY",
+   "TRIANGLES_ADJACENCY",
+   "TRIANGLE_STRIP_ADJACENCY"
 };
 
 static const char *fs_coord_origin_names[] =
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index ced9c94f46..90198a4f60 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -109,6 +109,19 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                      info->input_usage_mask[ind] |= usage_mask;
                   }
                }
+
+               /* check for indirect register reads */
+               if (src->Register.Indirect) {
+                  info->indirect_files |= (1 << src->Register.File);
+               }
+            }
+
+            /* check for indirect register writes */
+            for (i = 0; i < fullinst->Instruction.NumDstRegs; i++) {
+               const struct tgsi_full_dst_register *dst = &fullinst->Dst[i];
+               if (dst->Register.Indirect) {
+                  info->indirect_files |= (1 << dst->Register.File);
+               }
             }
 
             info->num_instructions++;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index e75280336f..f8aa90cf06 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -63,6 +63,12 @@ struct tgsi_shader_info
    boolean writes_edgeflag; /**< vertex shader outputs edgeflag */
    boolean uses_kill;  /**< KIL or KILP instruction used? */
 
+   /**
+    * Bitmask indicating which register files are accessed with
+    * indirect addressing.  The bits are (1 << TGSI_FILE_x), etc.
+    */
+   unsigned indirect_files;
+
    struct {
       unsigned name;
       unsigned data[8];
diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.c b/src/gallium/auxiliary/tgsi/tgsi_text.c
index 55fccba4d8..b01d2ff468 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_text.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_text.c
@@ -58,7 +58,7 @@ static boolean is_digit_alpha_underscore( const char *cur )
 static char uprcase( char c )
 {
    if (c >= 'a' && c <= 'z')
-      return c += 'A' - 'a';
+      return c + 'A' - 'a';
    return c;
 }
 
@@ -732,7 +732,7 @@ parse_optional_swizzle(
          else if (uprcase( *cur ) == 'W')
             swizzle[i] = TGSI_SWIZZLE_W;
          else {
-	    report_error( ctx, "Expected register swizzle component `x', `y', `z', `w', `0' or `1'" );
+	    report_error( ctx, "Expected register swizzle component `x', `y', `z' or `w'" );
 	    return FALSE;
          }
          cur++;
diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c
index dfe2101c2e..0d94aaae95 100644
--- a/src/gallium/auxiliary/util/u_blitter.c
+++ b/src/gallium/auxiliary/util/u_blitter.c
@@ -52,9 +52,8 @@
 
 struct blitter_context_priv
 {
-   struct blitter_context blitter;
+   struct blitter_context base;
 
-   struct pipe_context *pipe; /**< pipe context */
    struct pipe_resource *vbuf;  /**< quad */
 
    float vertices[4][2][4];   /**< {pos, color} or {pos, texcoord} */
@@ -97,15 +96,25 @@ struct blitter_context_priv
    /* Rasterizer state. */
    void *rs_state;
 
-   struct pipe_sampler_view *sampler_view;
-
    /* Viewport state. */
    struct pipe_viewport_state viewport;
 
    /* Clip state. */
    struct pipe_clip_state clip;
+
+   /* Destination surface dimensions. */
+   unsigned dst_width;
+   unsigned dst_height;
 };
 
+static void blitter_draw_rectangle(struct blitter_context *blitter,
+                                   unsigned x, unsigned y,
+                                   unsigned width, unsigned height,
+                                   float depth,
+                                   enum blitter_attrib_type type,
+                                   const float attrib[4]);
+
+
 struct blitter_context *util_blitter_create(struct pipe_context *pipe)
 {
    struct blitter_context_priv *ctx;
@@ -120,19 +129,20 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe)
    if (!ctx)
       return NULL;
 
-   ctx->pipe = pipe;
+   ctx->base.pipe = pipe;
+   ctx->base.draw_rectangle = blitter_draw_rectangle;
 
    /* init state objects for them to be considered invalid */
-   ctx->blitter.saved_blend_state = INVALID_PTR;
-   ctx->blitter.saved_dsa_state = INVALID_PTR;
-   ctx->blitter.saved_rs_state = INVALID_PTR;
-   ctx->blitter.saved_fs = INVALID_PTR;
-   ctx->blitter.saved_vs = INVALID_PTR;
-   ctx->blitter.saved_velem_state = INVALID_PTR;
-   ctx->blitter.saved_fb_state.nr_cbufs = ~0;
-   ctx->blitter.saved_num_sampler_views = ~0;
-   ctx->blitter.saved_num_sampler_states = ~0;
-   ctx->blitter.saved_num_vertex_buffers = ~0;
+   ctx->base.saved_blend_state = INVALID_PTR;
+   ctx->base.saved_dsa_state = INVALID_PTR;
+   ctx->base.saved_rs_state = INVALID_PTR;
+   ctx->base.saved_fs = INVALID_PTR;
+   ctx->base.saved_vs = INVALID_PTR;
+   ctx->base.saved_velem_state = INVALID_PTR;
+   ctx->base.saved_fb_state.nr_cbufs = ~0;
+   ctx->base.saved_num_sampler_views = ~0;
+   ctx->base.saved_num_sampler_states = ~0;
+   ctx->base.saved_num_vertex_buffers = ~0;
 
    /* blend state objects */
    memset(&blend, 0, sizeof(blend));
@@ -219,17 +229,17 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe)
       ctx->vertices[i][0][3] = 1; /*v.w*/
 
    /* create the vertex buffer */
-   ctx->vbuf = pipe_buffer_create(ctx->pipe->screen,
+   ctx->vbuf = pipe_buffer_create(ctx->base.pipe->screen,
                                   PIPE_BIND_VERTEX_BUFFER,
                                   sizeof(ctx->vertices));
 
-   return &ctx->blitter;
+   return &ctx->base;
 }
 
 void util_blitter_destroy(struct blitter_context *blitter)
 {
    struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
-   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_context *pipe = blitter->pipe;
    int i;
 
    pipe->delete_blend_state(pipe, ctx->blend_write_color);
@@ -260,10 +270,6 @@ void util_blitter_destroy(struct blitter_context *blitter)
       if (ctx->sampler_state[i])
          pipe->delete_sampler_state(pipe, ctx->sampler_state[i]);
 
-   if (ctx->sampler_view) {
-      pipe_sampler_view_reference(&ctx->sampler_view, NULL);
-   }
-
    pipe_resource_reference(&ctx->vbuf, NULL);
    FREE(ctx);
 }
@@ -271,112 +277,117 @@ void util_blitter_destroy(struct blitter_context *blitter)
 static void blitter_check_saved_CSOs(struct blitter_context_priv *ctx)
 {
    /* make sure these CSOs have been saved */
-   assert(ctx->blitter.saved_blend_state != INVALID_PTR &&
-          ctx->blitter.saved_dsa_state != INVALID_PTR &&
-          ctx->blitter.saved_rs_state != INVALID_PTR &&
-          ctx->blitter.saved_fs != INVALID_PTR &&
-          ctx->blitter.saved_vs != INVALID_PTR &&
-          ctx->blitter.saved_velem_state != INVALID_PTR);
+   assert(ctx->base.saved_blend_state != INVALID_PTR &&
+          ctx->base.saved_dsa_state != INVALID_PTR &&
+          ctx->base.saved_rs_state != INVALID_PTR &&
+          ctx->base.saved_fs != INVALID_PTR &&
+          ctx->base.saved_vs != INVALID_PTR &&
+          ctx->base.saved_velem_state != INVALID_PTR);
 }
 
 static void blitter_restore_CSOs(struct blitter_context_priv *ctx)
 {
-   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_context *pipe = ctx->base.pipe;
    unsigned i;
 
    /* restore the state objects which are always required to be saved */
-   pipe->bind_blend_state(pipe, ctx->blitter.saved_blend_state);
-   pipe->bind_depth_stencil_alpha_state(pipe, ctx->blitter.saved_dsa_state);
-   pipe->bind_rasterizer_state(pipe, ctx->blitter.saved_rs_state);
-   pipe->bind_fs_state(pipe, ctx->blitter.saved_fs);
-   pipe->bind_vs_state(pipe, ctx->blitter.saved_vs);
-   pipe->bind_vertex_elements_state(pipe, ctx->blitter.saved_velem_state);
+   pipe->bind_blend_state(pipe, ctx->base.saved_blend_state);
+   pipe->bind_depth_stencil_alpha_state(pipe, ctx->base.saved_dsa_state);
+   pipe->bind_rasterizer_state(pipe, ctx->base.saved_rs_state);
+   pipe->bind_fs_state(pipe, ctx->base.saved_fs);
+   pipe->bind_vs_state(pipe, ctx->base.saved_vs);
+   pipe->bind_vertex_elements_state(pipe, ctx->base.saved_velem_state);
 
-   ctx->blitter.saved_blend_state = INVALID_PTR;
-   ctx->blitter.saved_dsa_state = INVALID_PTR;
-   ctx->blitter.saved_rs_state = INVALID_PTR;
-   ctx->blitter.saved_fs = INVALID_PTR;
-   ctx->blitter.saved_vs = INVALID_PTR;
-   ctx->blitter.saved_velem_state = INVALID_PTR;
+   ctx->base.saved_blend_state = INVALID_PTR;
+   ctx->base.saved_dsa_state = INVALID_PTR;
+   ctx->base.saved_rs_state = INVALID_PTR;
+   ctx->base.saved_fs = INVALID_PTR;
+   ctx->base.saved_vs = INVALID_PTR;
+   ctx->base.saved_velem_state = INVALID_PTR;
 
-   pipe->set_stencil_ref(pipe, &ctx->blitter.saved_stencil_ref);
+   pipe->set_stencil_ref(pipe, &ctx->base.saved_stencil_ref);
 
-   pipe->set_viewport_state(pipe, &ctx->blitter.saved_viewport);
-   pipe->set_clip_state(pipe, &ctx->blitter.saved_clip);
+   pipe->set_viewport_state(pipe, &ctx->base.saved_viewport);
+   pipe->set_clip_state(pipe, &ctx->base.saved_clip);
 
    /* restore the state objects which are required to be saved before copy/fill
     */
-   if (ctx->blitter.saved_fb_state.nr_cbufs != ~0) {
-      pipe->set_framebuffer_state(pipe, &ctx->blitter.saved_fb_state);
-      ctx->blitter.saved_fb_state.nr_cbufs = ~0;
+   if (ctx->base.saved_fb_state.nr_cbufs != ~0) {
+      pipe->set_framebuffer_state(pipe, &ctx->base.saved_fb_state);
+      util_assign_framebuffer_state(&ctx->base.saved_fb_state, NULL);
+      ctx->base.saved_fb_state.nr_cbufs = ~0;
    }
 
-   if (ctx->blitter.saved_num_sampler_states != ~0) {
+   if (ctx->base.saved_num_sampler_states != ~0) {
       pipe->bind_fragment_sampler_states(pipe,
-                                         ctx->blitter.saved_num_sampler_states,
-                                         ctx->blitter.saved_sampler_states);
-      ctx->blitter.saved_num_sampler_states = ~0;
+                                         ctx->base.saved_num_sampler_states,
+                                         ctx->base.saved_sampler_states);
+      ctx->base.saved_num_sampler_states = ~0;
    }
 
-   if (ctx->blitter.saved_num_sampler_views != ~0) {
+   if (ctx->base.saved_num_sampler_views != ~0) {
       pipe->set_fragment_sampler_views(pipe,
-                                       ctx->blitter.saved_num_sampler_views,
-                                       ctx->blitter.saved_sampler_views);
-      ctx->blitter.saved_num_sampler_views = ~0;
+                                       ctx->base.saved_num_sampler_views,
+                                       ctx->base.saved_sampler_views);
+
+      for (i = 0; i < ctx->base.saved_num_sampler_views; i++)
+         pipe_sampler_view_reference(&ctx->base.saved_sampler_views[i],
+                                     NULL);
+
+      ctx->base.saved_num_sampler_views = ~0;
    }
 
-   if (ctx->blitter.saved_num_vertex_buffers != ~0) {
+   if (ctx->base.saved_num_vertex_buffers != ~0) {
       pipe->set_vertex_buffers(pipe,
-                               ctx->blitter.saved_num_vertex_buffers,
-                               ctx->blitter.saved_vertex_buffers);
+                               ctx->base.saved_num_vertex_buffers,
+                               ctx->base.saved_vertex_buffers);
 
-      for (i = 0; i < ctx->blitter.saved_num_vertex_buffers; i++) {
-         if (ctx->blitter.saved_vertex_buffers[i].buffer) {
-            pipe_resource_reference(&ctx->blitter.saved_vertex_buffers[i].buffer,
+      for (i = 0; i < ctx->base.saved_num_vertex_buffers; i++) {
+         if (ctx->base.saved_vertex_buffers[i].buffer) {
+            pipe_resource_reference(&ctx->base.saved_vertex_buffers[i].buffer,
                                     NULL);
          }
       }
-      ctx->blitter.saved_num_vertex_buffers = ~0;
+      ctx->base.saved_num_vertex_buffers = ~0;
    }
 }
 
 static void blitter_set_rectangle(struct blitter_context_priv *ctx,
                                   unsigned x1, unsigned y1,
                                   unsigned x2, unsigned y2,
-                                  unsigned width, unsigned height,
                                   float depth)
 {
    int i;
 
    /* set vertex positions */
-   ctx->vertices[0][0][0] = (float)x1 / width * 2.0f - 1.0f; /*v0.x*/
-   ctx->vertices[0][0][1] = (float)y1 / height * 2.0f - 1.0f; /*v0.y*/
+   ctx->vertices[0][0][0] = (float)x1 / ctx->dst_width * 2.0f - 1.0f; /*v0.x*/
+   ctx->vertices[0][0][1] = (float)y1 / ctx->dst_height * 2.0f - 1.0f; /*v0.y*/
 
-   ctx->vertices[1][0][0] = (float)x2 / width * 2.0f - 1.0f; /*v1.x*/
-   ctx->vertices[1][0][1] = (float)y1 / height * 2.0f - 1.0f; /*v1.y*/
+   ctx->vertices[1][0][0] = (float)x2 / ctx->dst_width * 2.0f - 1.0f; /*v1.x*/
+   ctx->vertices[1][0][1] = (float)y1 / ctx->dst_height * 2.0f - 1.0f; /*v1.y*/
 
-   ctx->vertices[2][0][0] = (float)x2 / width * 2.0f - 1.0f; /*v2.x*/
-   ctx->vertices[2][0][1] = (float)y2 / height * 2.0f - 1.0f; /*v2.y*/
+   ctx->vertices[2][0][0] = (float)x2 / ctx->dst_width * 2.0f - 1.0f; /*v2.x*/
+   ctx->vertices[2][0][1] = (float)y2 / ctx->dst_height * 2.0f - 1.0f; /*v2.y*/
 
-   ctx->vertices[3][0][0] = (float)x1 / width * 2.0f - 1.0f; /*v3.x*/
-   ctx->vertices[3][0][1] = (float)y2 / height * 2.0f - 1.0f; /*v3.y*/
+   ctx->vertices[3][0][0] = (float)x1 / ctx->dst_width * 2.0f - 1.0f; /*v3.x*/
+   ctx->vertices[3][0][1] = (float)y2 / ctx->dst_height * 2.0f - 1.0f; /*v3.y*/
 
    for (i = 0; i < 4; i++)
       ctx->vertices[i][0][2] = depth; /*z*/
 
    /* viewport */
-   ctx->viewport.scale[0] = 0.5f * width;
-   ctx->viewport.scale[1] = 0.5f * height;
+   ctx->viewport.scale[0] = 0.5f * ctx->dst_width;
+   ctx->viewport.scale[1] = 0.5f * ctx->dst_height;
    ctx->viewport.scale[2] = 1.0f;
    ctx->viewport.scale[3] = 1.0f;
-   ctx->viewport.translate[0] = 0.5f * width;
-   ctx->viewport.translate[1] = 0.5f * height;
+   ctx->viewport.translate[0] = 0.5f * ctx->dst_width;
+   ctx->viewport.translate[1] = 0.5f * ctx->dst_height;
    ctx->viewport.translate[2] = 0.0f;
    ctx->viewport.translate[3] = 0.0f;
-   ctx->pipe->set_viewport_state(ctx->pipe, &ctx->viewport);
+   ctx->base.pipe->set_viewport_state(ctx->base.pipe, &ctx->viewport);
 
    /* clip */
-   ctx->pipe->set_clip_state(ctx->pipe, &ctx->clip);
+   ctx->base.pipe->set_clip_state(ctx->base.pipe, &ctx->clip);
 }
 
 static void blitter_set_clear_color(struct blitter_context_priv *ctx,
@@ -401,29 +412,45 @@ static void blitter_set_clear_color(struct blitter_context_priv *ctx,
    }
 }
 
+static void get_normalized_texcoords(struct pipe_resource *src,
+                                     struct pipe_subresource subsrc,
+                                     unsigned x1, unsigned y1,
+                                     unsigned x2, unsigned y2,
+                                     float out[4])
+{
+   out[0] = x1 / (float)u_minify(src->width0,  subsrc.level);
+   out[1] = y1 / (float)u_minify(src->height0, subsrc.level);
+   out[2] = x2 / (float)u_minify(src->width0,  subsrc.level);
+   out[3] = y2 / (float)u_minify(src->height0, subsrc.level);
+}
+
+static void set_texcoords_in_vertices(const float coord[4],
+                                      float *out, unsigned stride)
+{
+   out[0] = coord[0]; /*t0.s*/
+   out[1] = coord[1]; /*t0.t*/
+   out += stride;
+   out[0] = coord[2]; /*t1.s*/
+   out[1] = coord[1]; /*t1.t*/
+   out += stride;
+   out[0] = coord[2]; /*t2.s*/
+   out[1] = coord[3]; /*t2.t*/
+   out += stride;
+   out[0] = coord[0]; /*t3.s*/
+   out[1] = coord[3]; /*t3.t*/
+}
+
 static void blitter_set_texcoords_2d(struct blitter_context_priv *ctx,
                                      struct pipe_resource *src,
                                      struct pipe_subresource subsrc,
                                      unsigned x1, unsigned y1,
                                      unsigned x2, unsigned y2)
 {
-   int i;
-   float s1 = x1 / (float)u_minify(src->width0,  subsrc.level);
-   float t1 = y1 / (float)u_minify(src->height0, subsrc.level);
-   float s2 = x2 / (float)u_minify(src->width0,  subsrc.level);
-   float t2 = y2 / (float)u_minify(src->height0, subsrc.level);
-
-   ctx->vertices[0][1][0] = s1; /*t0.s*/
-   ctx->vertices[0][1][1] = t1; /*t0.t*/
-
-   ctx->vertices[1][1][0] = s2; /*t1.s*/
-   ctx->vertices[1][1][1] = t1; /*t1.t*/
-
-   ctx->vertices[2][1][0] = s2; /*t2.s*/
-   ctx->vertices[2][1][1] = t2; /*t2.t*/
+   unsigned i;
+   float coord[4];
 
-   ctx->vertices[3][1][0] = s1; /*t3.s*/
-   ctx->vertices[3][1][1] = t2; /*t3.t*/
+   get_normalized_texcoords(src, subsrc, x1, y1, x2, y2, coord);
+   set_texcoords_in_vertices(coord, &ctx->vertices[0][1][0], 8);
 
    for (i = 0; i < 4; i++) {
       ctx->vertices[i][1][2] = 0; /*r*/
@@ -454,20 +481,11 @@ static void blitter_set_texcoords_cube(struct blitter_context_priv *ctx,
                                        unsigned x2, unsigned y2)
 {
    int i;
-   float s1 = x1 / (float)u_minify(src->width0,  subsrc.level);
-   float t1 = y1 / (float)u_minify(src->height0, subsrc.level);
-   float s2 = x2 / (float)u_minify(src->width0,  subsrc.level);
-   float t2 = y2 / (float)u_minify(src->height0, subsrc.level);
+   float coord[4];
    float st[4][2];
 
-   st[0][0] = s1;
-   st[0][1] = t1;
-   st[1][0] = s2;
-   st[1][1] = t1;
-   st[2][0] = s2;
-   st[2][1] = t2;
-   st[3][0] = s1;
-   st[3][1] = t2;
+   get_normalized_texcoords(src, subsrc, x1, y1, x2, y2, coord);
+   set_texcoords_in_vertices(coord, &st[0][0], 2);
 
    util_map_texcoords2d_onto_cubemap(subsrc.face,
                                      /* pointer, stride in floats */
@@ -478,9 +496,16 @@ static void blitter_set_texcoords_cube(struct blitter_context_priv *ctx,
       ctx->vertices[i][1][3] = 1; /*q*/
 }
 
+static void blitter_set_dst_dimensions(struct blitter_context_priv *ctx,
+                                       unsigned width, unsigned height)
+{
+   ctx->dst_width = width;
+   ctx->dst_height = height;
+}
+
 static void blitter_draw_quad(struct blitter_context_priv *ctx)
 {
-   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_context *pipe = ctx->base.pipe;
 
    /* write vertices and draw them */
    pipe_buffer_write(pipe, ctx->vbuf,
@@ -495,7 +520,7 @@ static INLINE
 void **blitter_get_sampler_state(struct blitter_context_priv *ctx,
                                  int miplevel)
 {
-   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_context *pipe = ctx->base.pipe;
    struct pipe_sampler_state *sampler_state = &ctx->template_sampler_state;
 
    assert(miplevel < PIPE_MAX_TEXTURE_LEVELS);
@@ -518,7 +543,7 @@ void **blitter_get_sampler_state(struct blitter_context_priv *ctx,
 static INLINE
 void *blitter_get_fs_col(struct blitter_context_priv *ctx, unsigned num_cbufs)
 {
-   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_context *pipe = ctx->base.pipe;
 
    assert(num_cbufs <= PIPE_MAX_COLOR_BUFS);
 
@@ -531,7 +556,7 @@ void *blitter_get_fs_col(struct blitter_context_priv *ctx, unsigned num_cbufs)
 
 /** Convert PIPE_TEXTURE_x to TGSI_TEXTURE_x */
 static unsigned
-pipe_tex_to_tgsi_tex(unsigned pipe_tex_target)
+pipe_tex_to_tgsi_tex(enum pipe_texture_target pipe_tex_target)
 {
    switch (pipe_tex_target) {
    case PIPE_TEXTURE_1D:
@@ -553,7 +578,7 @@ static INLINE
 void *blitter_get_fs_texfetch_col(struct blitter_context_priv *ctx,
                                   unsigned tex_target)
 {
-   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_context *pipe = ctx->base.pipe;
 
    assert(tex_target < PIPE_MAX_TEXTURE_TYPES);
 
@@ -572,7 +597,7 @@ static INLINE
 void *blitter_get_fs_texfetch_depth(struct blitter_context_priv *ctx,
                                     unsigned tex_target)
 {
-   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_context *pipe = ctx->base.pipe;
 
    assert(tex_target < PIPE_MAX_TEXTURE_TYPES);
 
@@ -588,6 +613,31 @@ void *blitter_get_fs_texfetch_depth(struct blitter_context_priv *ctx,
    return ctx->fs_texfetch_depth[tex_target];
 }
 
+static void blitter_draw_rectangle(struct blitter_context *blitter,
+                                   unsigned x1, unsigned y1,
+                                   unsigned x2, unsigned y2,
+                                   float depth,
+                                   enum blitter_attrib_type type,
+                                   const float attrib[4])
+{
+   struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
+
+   switch (type) {
+      case UTIL_BLITTER_ATTRIB_COLOR:
+         blitter_set_clear_color(ctx, attrib);
+         break;
+
+      case UTIL_BLITTER_ATTRIB_TEXCOORD:
+         set_texcoords_in_vertices(attrib, &ctx->vertices[0][1][0], 8);
+         break;
+
+      default:;
+   }
+
+   blitter_set_rectangle(ctx, x1, y1, x2, y2, depth);
+   blitter_draw_quad(ctx);
+}
+
 void util_blitter_clear(struct blitter_context *blitter,
                         unsigned width, unsigned height,
                         unsigned num_cbufs,
@@ -596,7 +646,7 @@ void util_blitter_clear(struct blitter_context *blitter,
                         double depth, unsigned stencil)
 {
    struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
-   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_context *pipe = ctx->base.pipe;
    struct pipe_stencil_ref sr = { { 0 } };
 
    assert(num_cbufs <= PIPE_MAX_COLOR_BUFS);
@@ -630,9 +680,9 @@ void util_blitter_clear(struct blitter_context *blitter,
    pipe->bind_fs_state(pipe, blitter_get_fs_col(ctx, num_cbufs));
    pipe->bind_vs_state(pipe, ctx->vs_col);
 
-   blitter_set_clear_color(ctx, rgba);
-   blitter_set_rectangle(ctx, 0, 0, width, height, width, height, depth);
-   blitter_draw_quad(ctx);
+   blitter_set_dst_dimensions(ctx, width, height);
+   blitter->draw_rectangle(blitter, 0, 0, width, height, depth,
+                           UTIL_BLITTER_ATTRIB_COLOR, rgba);
    blitter_restore_CSOs(ctx);
 }
 
@@ -654,7 +704,7 @@ void util_blitter_copy_region(struct blitter_context *blitter,
                               boolean ignore_stencil)
 {
    struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
-   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_context *pipe = ctx->base.pipe;
    struct pipe_screen *screen = pipe->screen;
    struct pipe_surface *dstsurf;
    struct pipe_framebuffer_state fb_state;
@@ -736,11 +786,6 @@ void util_blitter_copy_region(struct blitter_context *blitter,
    u_sampler_view_default_template(&viewTempl, src, src->format);
    view = pipe->create_sampler_view(pipe, src, &viewTempl);
 
-   if (ctx->sampler_view) {
-      pipe_sampler_view_reference(&ctx->sampler_view, NULL);
-   }
-   ctx->sampler_view = view;
-
    /* Set rasterizer state, shaders, and textures. */
    pipe->bind_rasterizer_state(pipe, ctx->rs_state);
    pipe->bind_vs_state(pipe, ctx->vs_tex);
@@ -750,32 +795,49 @@ void util_blitter_copy_region(struct blitter_context *blitter,
    pipe->set_fragment_sampler_views(pipe, 1, &view);
    pipe->set_framebuffer_state(pipe, &fb_state);
 
-   /* Set texture coordinates. */
+   blitter_set_dst_dimensions(ctx, dstsurf->width, dstsurf->height);
+
    switch (src->target) {
+      /* Draw the quad with the draw_rectangle callback. */
       case PIPE_TEXTURE_1D:
       case PIPE_TEXTURE_2D:
-         blitter_set_texcoords_2d(ctx, src, subsrc,
-                                  srcx, srcy, srcx+width, srcy+height);
+         {
+            /* Set texture coordinates. */
+            float coord[4];
+            get_normalized_texcoords(src, subsrc, srcx, srcy,
+                                     srcx+width, srcy+height, coord);
+
+            /* Draw. */
+            blitter->draw_rectangle(blitter, dstx, dsty, dstx+width, dsty+height, 0,
+                                    UTIL_BLITTER_ATTRIB_TEXCOORD, coord);
+         }
          break;
+
+      /* Draw the quad with the generic codepath. */
       case PIPE_TEXTURE_3D:
-         blitter_set_texcoords_3d(ctx, src, subsrc, srcz,
-                                  srcx, srcy, srcx+width, srcy+height);
-         break;
       case PIPE_TEXTURE_CUBE:
-         blitter_set_texcoords_cube(ctx, src, subsrc,
-                                    srcx, srcy, srcx+width, srcy+height);
+         /* Set texture coordinates. */
+         if (src->target == PIPE_TEXTURE_3D)
+            blitter_set_texcoords_3d(ctx, src, subsrc, srcz,
+                                     srcx, srcy, srcx+width, srcy+height);
+         else
+            blitter_set_texcoords_cube(ctx, src, subsrc,
+                                       srcx, srcy, srcx+width, srcy+height);
+
+         /* Draw. */
+         blitter_set_rectangle(ctx, dstx, dsty, dstx+width, dsty+height, 0);
+         blitter_draw_quad(ctx);
          break;
+
       default:
          assert(0);
          return;
    }
 
-   blitter_set_rectangle(ctx, dstx, dsty, dstx+width, dsty+height,
-                         dstsurf->width, dstsurf->height, 0);
-   blitter_draw_quad(ctx);
    blitter_restore_CSOs(ctx);
 
    pipe_surface_reference(&dstsurf, NULL);
+   pipe_sampler_view_reference(&view, NULL);
 }
 
 /* Clear a region of a color surface to a constant value. */
@@ -786,7 +848,7 @@ void util_blitter_clear_render_target(struct blitter_context *blitter,
                                       unsigned width, unsigned height)
 {
    struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
-   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_context *pipe = ctx->base.pipe;
    struct pipe_framebuffer_state fb_state;
 
    assert(dstsurf->texture);
@@ -813,9 +875,9 @@ void util_blitter_clear_render_target(struct blitter_context *blitter,
    fb_state.zsbuf = 0;
    pipe->set_framebuffer_state(pipe, &fb_state);
 
-   blitter_set_clear_color(ctx, rgba);
-   blitter_set_rectangle(ctx, 0, 0, width, height, dstsurf->width, dstsurf->height, 0);
-   blitter_draw_quad(ctx);
+   blitter_set_dst_dimensions(ctx, dstsurf->width, dstsurf->height);
+   blitter->draw_rectangle(blitter, dstx, dsty, dstx+width, dsty+height, 0,
+                           UTIL_BLITTER_ATTRIB_COLOR, rgba);
    blitter_restore_CSOs(ctx);
 }
 
@@ -829,7 +891,7 @@ void util_blitter_clear_depth_stencil(struct blitter_context *blitter,
                                       unsigned width, unsigned height)
 {
    struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
-   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_context *pipe = ctx->base.pipe;
    struct pipe_framebuffer_state fb_state;
    struct pipe_stencil_ref sr = { { 0 } };
 
@@ -873,7 +935,8 @@ void util_blitter_clear_depth_stencil(struct blitter_context *blitter,
    fb_state.zsbuf = dstsurf;
    pipe->set_framebuffer_state(pipe, &fb_state);
 
-   blitter_set_rectangle(ctx, 0, 0, width, height, dstsurf->width, dstsurf->height, depth);
-   blitter_draw_quad(ctx);
+   blitter_set_dst_dimensions(ctx, dstsurf->width, dstsurf->height);
+   blitter->draw_rectangle(blitter, dstx, dsty, dstx+width, dsty+height, depth,
+                           UTIL_BLITTER_ATTRIB_NONE, NULL);
    blitter_restore_CSOs(ctx);
 }
diff --git a/src/gallium/auxiliary/util/u_blitter.h b/src/gallium/auxiliary/util/u_blitter.h
index 22849280ab..ba3f92eca8 100644
--- a/src/gallium/auxiliary/util/u_blitter.h
+++ b/src/gallium/auxiliary/util/u_blitter.h
@@ -39,9 +39,48 @@ extern "C" {
 
 struct pipe_context;
 
+enum blitter_attrib_type {
+   UTIL_BLITTER_ATTRIB_NONE,
+   UTIL_BLITTER_ATTRIB_COLOR,
+   UTIL_BLITTER_ATTRIB_TEXCOORD
+};
+
 struct blitter_context
 {
+   /**
+    * Draw a rectangle.
+    *
+    * \param x1      An X coordinate of the top-left corner.
+    * \param y1      A Y coordinate of the top-left corner.
+    * \param x2      An X coordinate of the bottom-right corner.
+    * \param y2      A Y coordinate of the bottom-right corner.
+    * \param depth  A depth which the rectangle is rendered at.
+    *
+    * \param type   Semantics of the attributes "attrib".
+    *               If type is UTIL_BLITTER_ATTRIB_NONE, ignore them.
+    *               If type is UTIL_BLITTER_ATTRIB_COLOR, the attributes
+    *               make up a constant RGBA color, and should go to the COLOR0
+    *               varying slot of a fragment shader.
+    *               If type is UTIL_BLITTER_ATTRIB_TEXCOORD, {a1, a2} and
+    *               {a3, a4} specify top-left and bottom-right texture
+    *               coordinates of the rectangle, respectively, and should go
+    *               to the GENERIC0 varying slot of a fragment shader.
+    *
+    * \param attrib See type.
+    *
+    * \note A driver may optionally override this callback to implement
+    *       a specialized hardware path for drawing a rectangle, e.g. using
+    *       a rectangular point sprite.
+    */
+   void (*draw_rectangle)(struct blitter_context *blitter,
+                          unsigned x1, unsigned y1, unsigned x2, unsigned y2,
+                          float depth,
+                          enum blitter_attrib_type type,
+                          const float attrib[4]);
+
    /* Private members, really. */
+   struct pipe_context *pipe; /**< pipe context */
+
    void *saved_blend_state;   /**< blend state */
    void *saved_dsa_state;     /**< depth stencil alpha state */
    void *saved_velem_state;   /**< vertex elements state */
@@ -73,6 +112,15 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe);
  */
 void util_blitter_destroy(struct blitter_context *blitter);
 
+/**
+ * Return the pipe context associated with a blitter context.
+ */
+static INLINE
+struct pipe_context *util_blitter_get_pipe(struct blitter_context *blitter)
+{
+   return blitter->pipe;
+}
+
 /*
  * These CSOs must be saved before any of the following functions is called:
  * - blend state
@@ -208,11 +256,45 @@ void util_blitter_save_vertex_shader(struct blitter_context *blitter,
    blitter->saved_vs = vs;
 }
 
+/* XXX This should probably be moved elsewhere. */
+static INLINE
+void util_assign_framebuffer_state(struct pipe_framebuffer_state *dst,
+                                   const struct pipe_framebuffer_state *src)
+{
+   unsigned i;
+
+   if (src) {
+      /* Reference all surfaces. */
+      for (i = 0; i < src->nr_cbufs; i++) {
+         pipe_surface_reference(&dst->cbufs[i], src->cbufs[i]);
+      }
+      for (; i < dst->nr_cbufs; i++) {
+         pipe_surface_reference(&dst->cbufs[i], NULL);
+      }
+
+      pipe_surface_reference(&dst->zsbuf, src->zsbuf);
+
+      dst->nr_cbufs = src->nr_cbufs;
+      dst->width = src->width;
+      dst->height = src->height;
+   } else {
+      /* Set all surfaces to NULL. */
+      for (i = 0; i < dst->nr_cbufs; i++) {
+         pipe_surface_reference(&dst->cbufs[i], NULL);
+      }
+
+      pipe_surface_reference(&dst->zsbuf, NULL);
+
+      dst->nr_cbufs = 0;
+   }
+}
+
 static INLINE
 void util_blitter_save_framebuffer(struct blitter_context *blitter,
-                                   struct pipe_framebuffer_state *state)
+                                   const struct pipe_framebuffer_state *state)
 {
-   blitter->saved_fb_state = *state;
+   blitter->saved_fb_state.nr_cbufs = 0; /* It's ~0 now, meaning it's unsaved. */
+   util_assign_framebuffer_state(&blitter->saved_fb_state, state);
 }
 
 static INLINE
@@ -247,12 +329,13 @@ util_blitter_save_fragment_sampler_views(struct blitter_context *blitter,
                                          int num_views,
                                          struct pipe_sampler_view **views)
 {
+   unsigned i;
    assert(num_views <= Elements(blitter->saved_sampler_views));
 
    blitter->saved_num_sampler_views = num_views;
-   memcpy(blitter->saved_sampler_views,
-          views,
-          num_views * sizeof(struct pipe_sampler_view *));
+   for (i = 0; i < num_views; i++)
+      pipe_sampler_view_reference(&blitter->saved_sampler_views[i],
+                                  views[i]);
 }
 
 static INLINE void
diff --git a/src/gallium/auxiliary/util/u_caps.c b/src/gallium/auxiliary/util/u_caps.c
index 294ee37033..94d5bd3027 100644
--- a/src/gallium/auxiliary/util/u_caps.c
+++ b/src/gallium/auxiliary/util/u_caps.c
@@ -186,6 +186,22 @@ static unsigned caps_opengl_2_1[] = {
 /* OpenGL 3.0 */
 /* UTIL_CHECK_INT(MAX_RENDER_TARGETS, 8), */
 
+/* Shader Model 3 */
+static unsigned caps_sm3[] = {
+    UTIL_CHECK_INT(MAX_FS_INSTRUCTIONS, 512),
+    UTIL_CHECK_INT(MAX_FS_INPUTS, 10),
+    UTIL_CHECK_INT(MAX_FS_TEMPS, 32),
+    UTIL_CHECK_INT(MAX_FS_ADDRS, 1),
+    UTIL_CHECK_INT(MAX_FS_CONSTS, 224),
+
+    UTIL_CHECK_INT(MAX_VS_INSTRUCTIONS, 512),
+    UTIL_CHECK_INT(MAX_VS_INPUTS, 16),
+    UTIL_CHECK_INT(MAX_VS_TEMPS, 32),
+    UTIL_CHECK_INT(MAX_VS_ADDRS, 2),
+    UTIL_CHECK_INT(MAX_VS_CONSTS, 256),
+
+    UTIL_CHECK_TERMINATE
+};
 
 /**
  * Demo function which checks against theoretical caps needed for different APIs.
@@ -203,6 +219,7 @@ void util_caps_demo_print(struct pipe_screen *screen)
       {"DX 11", caps_dx_11},
       {"OpenGL 2.1", caps_opengl_2_1},
 /*    {"OpenGL 3.0", caps_opengl_3_0},*/
+      {"SM3", caps_sm3},
       {NULL, NULL}
    };
    int i, out = 0;
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c b/src/gallium/auxiliary/util/u_cpu_detect.c
index a08241971c..23d33af4e4 100644
--- a/src/gallium/auxiliary/util/u_cpu_detect.c
+++ b/src/gallium/auxiliary/util/u_cpu_detect.c
@@ -38,7 +38,7 @@
 #include "u_cpu_detect.h"
 
 #if defined(PIPE_ARCH_PPC)
-#if defined(PIPE_OS_DARWIN)
+#if defined(PIPE_OS_APPLE)
 #include <sys/sysctl.h>
 #else
 #include <signal.h>
@@ -132,7 +132,7 @@ win32_sig_handler_sse(EXCEPTION_POINTERS* ep)
 #endif /* PIPE_ARCH_X86 */
 
 
-#if defined(PIPE_ARCH_PPC) && !defined(PIPE_OS_DARWIN)
+#if defined(PIPE_ARCH_PPC) && !defined(PIPE_OS_APPLE)
 static jmp_buf  __lv_powerpc_jmpbuf;
 static volatile sig_atomic_t __lv_powerpc_canjump = 0;
 
@@ -153,7 +153,7 @@ sigill_handler(int sig)
 static void
 check_os_altivec_support(void)
 {
-#if defined(PIPE_OS_DARWIN)
+#if defined(PIPE_OS_APPLE)
    int sels[2] = {CTL_HW, HW_VECTORUNIT};
    int has_vu = 0;
    int len = sizeof (has_vu);
@@ -166,8 +166,8 @@ check_os_altivec_support(void)
          util_cpu_caps.has_altivec = 1;
       }
    }
-#else /* !PIPE_OS_DARWIN */
-   /* no Darwin, do it the brute-force way */
+#else /* !PIPE_OS_APPLE */
+   /* not on Apple/Darwin, do it the brute-force way */
    /* this is borrowed from the libmpeg2 library */
    signal(SIGILL, sigill_handler);
    if (setjmp(__lv_powerpc_jmpbuf)) {
@@ -184,7 +184,7 @@ check_os_altivec_support(void)
       signal(SIGILL, SIG_DFL);
       util_cpu_caps.has_altivec = 1;
    }
-#endif /* PIPE_OS_DARWIN */
+#endif /* !PIPE_OS_APPLE */
 }
 #endif /* PIPE_ARCH_PPC */
 
diff --git a/src/gallium/auxiliary/util/u_debug.c b/src/gallium/auxiliary/util/u_debug.c
index 5e373ff24c..ad162558bc 100644
--- a/src/gallium/auxiliary/util/u_debug.c
+++ b/src/gallium/auxiliary/util/u_debug.c
@@ -190,11 +190,11 @@ debug_get_flags_option(const char *name,
       result = dfault;
    else if (!util_strcmp(str, "help")) {
       result = dfault;
-      debug_printf("%s: help for %s:\n", __FUNCTION__, name);
+      _debug_printf("%s: help for %s:\n", __FUNCTION__, name);
       for (; flags->name; ++flags)
          namealign = MAX2(namealign, strlen(flags->name));
       for (flags = orig; flags->name; ++flags)
-         debug_printf("| %*s [0x%0*lx]%s%s\n", namealign, flags->name,
+         _debug_printf("| %*s [0x%0*lx]%s%s\n", namealign, flags->name,
                       (int)sizeof(unsigned long)*CHAR_BIT/4, flags->value,
                       flags->desc ? " " : "", flags->desc ? flags->desc : "");
    }
diff --git a/src/gallium/auxiliary/util/u_double_list.h b/src/gallium/auxiliary/util/u_double_list.h
index 53bb1342dd..42adb1f069 100644
--- a/src/gallium/auxiliary/util/u_double_list.h
+++ b/src/gallium/auxiliary/util/u_double_list.h
@@ -98,5 +98,20 @@ struct list_head
 #define LIST_IS_EMPTY(__list)                   \
     ((__list)->next == (__list))
 
-
+#ifndef container_of
+#define container_of(ptr, sample, member)				\
+    (void *)((char *)(ptr)						\
+	     - ((char *)&(sample)->member - (char *)(sample)))
+#endif
+
+#define LIST_FOR_EACH_ENTRY(pos, head, member)				\
+   for (pos = container_of((head)->next, pos, member);			\
+	&pos->member != (head);						\
+	pos = container_of(pos->member.next, pos, member))
+
+#define LIST_FOR_EACH_ENTRY_SAFE(pos, storage, head, member)	\
+   for (pos = container_of((head)->next, pos, member),			\
+	storage = container_of(pos->member.next, pos, member);	\
+	&pos->member != (head);						\
+	pos = storage, storage = container_of(storage->member.next, storage, member))
 #endif /*_U_DOUBLE_LIST_H_*/
diff --git a/src/gallium/auxiliary/util/u_format.c b/src/gallium/auxiliary/util/u_format.c
index 3168a1fab4..43d09f1960 100644
--- a/src/gallium/auxiliary/util/u_format.c
+++ b/src/gallium/auxiliary/util/u_format.c
@@ -120,7 +120,7 @@ util_format_write_4ub(enum pipe_format format, const uint8_t *src, unsigned src_
 }
 
 
-static INLINE boolean
+boolean
 util_format_fits_8unorm(const struct util_format_description *format_desc)
 {
    unsigned chan;
diff --git a/src/gallium/auxiliary/util/u_format.h b/src/gallium/auxiliary/util/u_format.h
index fd95bea1a7..38254b1096 100644
--- a/src/gallium/auxiliary/util/u_format.h
+++ b/src/gallium/auxiliary/util/u_format.h
@@ -213,6 +213,16 @@ struct util_format_description
                        unsigned width, unsigned height);
 
    /**
+    * Fetch a single pixel (i, j) from a block.
+    *
+    * XXX: Only defined for a very few select formats.
+    */
+   void
+   (*fetch_rgba_8unorm)(uint8_t *dst,
+                        const uint8_t *src,
+                        unsigned i, unsigned j);
+
+   /**
     * Unpack pixel blocks to R32G32B32A32_FLOAT.
     * Note: strides are in bytes.
     *
@@ -663,6 +673,9 @@ util_format_write_4ub(enum pipe_format format,
  * Generic format conversion;
  */
 
+boolean
+util_format_fits_8unorm(const struct util_format_description *format_desc);
+
 void
 util_format_translate(enum pipe_format dst_format,
                       void *dst, unsigned dst_stride,
diff --git a/src/gallium/auxiliary/util/u_format_table.py b/src/gallium/auxiliary/util/u_format_table.py
index ae9a598197..f0b407b8b8 100755
--- a/src/gallium/auxiliary/util/u_format_table.py
+++ b/src/gallium/auxiliary/util/u_format_table.py
@@ -132,12 +132,17 @@ def write_format_table(formats):
         if format.colorspace != ZS:
             print "   &util_format_%s_unpack_rgba_8unorm," % format.short_name() 
             print "   &util_format_%s_pack_rgba_8unorm," % format.short_name() 
+            if format.layout == 's3tc':
+                print "   &util_format_%s_fetch_rgba_8unorm," % format.short_name()
+            else:
+                print "   NULL, /* fetch_rgba_8unorm */" 
             print "   &util_format_%s_unpack_rgba_float," % format.short_name() 
             print "   &util_format_%s_pack_rgba_float," % format.short_name() 
             print "   &util_format_%s_fetch_rgba_float," % format.short_name()
         else:
             print "   NULL, /* unpack_rgba_8unorm */" 
             print "   NULL, /* pack_rgba_8unorm */" 
+            print "   NULL, /* fetch_rgba_8unorm */" 
             print "   NULL, /* unpack_rgba_float */" 
             print "   NULL, /* pack_rgba_float */" 
             print "   NULL, /* fetch_rgba_float */" 
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index 6370e77986..fe19466436 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -567,12 +567,26 @@ util_bswap16(uint16_t n)
 #define MAX3( A, B, C ) MAX2( MAX2( A, B ), C )
 
 
+/**
+ * Align a value, only works pot alignemnts.
+ */
 static INLINE int
 align(int value, int alignment)
 {
    return (value + alignment - 1) & ~(alignment - 1);
 }
 
+/**
+ * Works like align but on npot alignments.
+ */
+static INLINE size_t
+util_align_npot(size_t value, size_t alignment)
+{
+   if (value % alignment)
+      return value + (alignment - (value % alignment));
+   return value;
+}
+
 static INLINE unsigned
 u_minify(unsigned value, unsigned levels)
 {
diff --git a/src/gallium/auxiliary/util/u_mempool.c b/src/gallium/auxiliary/util/u_mempool.c
new file mode 100644
index 0000000000..1f336b39a1
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_mempool.c
@@ -0,0 +1,169 @@
+/*
+ * Copyright 2010 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "util/u_mempool.h"
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_simple_list.h"
+
+#include <stdio.h>
+
+#define UTIL_MEMPOOL_MAGIC 0xcafe4321
+
+/* The block is either allocated memory or free space. */
+struct util_mempool_block {
+   /* The header. */
+   /* The first next free block. */
+   struct util_mempool_block *next_free;
+
+   intptr_t magic;
+
+   /* Memory after the last member is dedicated to the block itself.
+    * The allocated size is always larger than this structure. */
+};
+
+static struct util_mempool_block *
+util_mempool_get_block(struct util_mempool *pool,
+                       struct util_mempool_page *page, unsigned index)
+{
+   return (struct util_mempool_block*)
+          ((uint8_t*)page + sizeof(struct util_mempool_page) +
+           (pool->block_size * index));
+}
+
+static void util_mempool_add_new_page(struct util_mempool *pool)
+{
+   struct util_mempool_page *page;
+   struct util_mempool_block *block;
+   int i;
+
+   page = MALLOC(pool->page_size);
+   insert_at_tail(&pool->list, page);
+
+   /* Mark all blocks as free. */
+   for (i = 0; i < pool->num_blocks-1; i++) {
+      block = util_mempool_get_block(pool, page, i);
+      block->next_free = util_mempool_get_block(pool, page, i+1);
+      block->magic = UTIL_MEMPOOL_MAGIC;
+   }
+
+   block = util_mempool_get_block(pool, page, pool->num_blocks-1);
+   block->next_free = pool->first_free;
+   block->magic = UTIL_MEMPOOL_MAGIC;
+   pool->first_free = util_mempool_get_block(pool, page, 0);
+   pool->num_pages++;
+
+#if 0
+   fprintf(stderr, "New page! Num of pages: %i\n", pool->num_pages);
+#endif
+}
+
+static void *util_mempool_malloc_st(struct util_mempool *pool)
+{
+   struct util_mempool_block *block;
+
+   if (!pool->first_free)
+      util_mempool_add_new_page(pool);
+
+   block = pool->first_free;
+   assert(block->magic == UTIL_MEMPOOL_MAGIC);
+   pool->first_free = block->next_free;
+
+   return (uint8_t*)block + sizeof(struct util_mempool_block);
+}
+
+static void util_mempool_free_st(struct util_mempool *pool, void *ptr)
+{
+   struct util_mempool_block *block =
+         (struct util_mempool_block*)
+         ((uint8_t*)ptr - sizeof(struct util_mempool_block));
+
+   assert(block->magic == UTIL_MEMPOOL_MAGIC);
+   block->next_free = pool->first_free;
+   pool->first_free = block;
+}
+
+static void *util_mempool_malloc_mt(struct util_mempool *pool)
+{
+   void *mem;
+
+   pipe_mutex_lock(pool->mutex);
+   mem = util_mempool_malloc_st(pool);
+   pipe_mutex_unlock(pool->mutex);
+   return mem;
+}
+
+static void util_mempool_free_mt(struct util_mempool *pool, void *ptr)
+{
+   pipe_mutex_lock(pool->mutex);
+   util_mempool_free_st(pool, ptr);
+   pipe_mutex_unlock(pool->mutex);
+}
+
+void util_mempool_set_thread_safety(struct util_mempool *pool,
+                                    enum util_mempool_threading threading)
+{
+   pool->threading = threading;
+
+   if (threading) {
+      pool->malloc = util_mempool_malloc_mt;
+      pool->free = util_mempool_free_mt;
+   } else {
+      pool->malloc = util_mempool_malloc_st;
+      pool->free = util_mempool_free_st;
+   }
+}
+
+void util_mempool_create(struct util_mempool *pool,
+                         unsigned item_size,
+                         unsigned num_blocks,
+                         enum util_mempool_threading threading)
+{
+   item_size = align(item_size, sizeof(intptr_t));
+
+   pool->num_pages = 0;
+   pool->num_blocks = num_blocks;
+   pool->block_size = sizeof(struct util_mempool_block) + item_size;
+   pool->block_size = align(pool->block_size, sizeof(intptr_t));
+   pool->page_size = sizeof(struct util_mempool_page) +
+                     num_blocks * pool->block_size;
+   pool->first_free = NULL;
+
+   make_empty_list(&pool->list);
+
+   pipe_mutex_init(pool->mutex);
+
+   util_mempool_set_thread_safety(pool, threading);
+}
+
+void util_mempool_destroy(struct util_mempool *pool)
+{
+   struct util_mempool_page *page, *temp;
+
+   foreach_s(page, temp, &pool->list) {
+      remove_from_list(page);
+      FREE(page);
+   }
+
+   pipe_mutex_destroy(pool->mutex);
+}
diff --git a/src/gallium/auxiliary/util/u_mempool.h b/src/gallium/auxiliary/util/u_mempool.h
new file mode 100644
index 0000000000..a5b5d6a9b7
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_mempool.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2010 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+/**
+ * @file
+ * Simple memory pool for equally sized memory allocations.
+ * util_mempool_malloc and util_mempool_free are in O(1).
+ *
+ * Good for allocations which have very low lifetime and are allocated
+ * and freed very often. Use a profiler first!
+ *
+ * Candidates: get_transfer, user_buffer_create
+ *
+ * @author Marek Olšák
+ */
+
+#ifndef U_MEMPOOL_H
+#define U_MEMPOOL_H
+
+#include "os/os_thread.h"
+
+enum util_mempool_threading {
+   UTIL_MEMPOOL_SINGLETHREADED = FALSE,
+   UTIL_MEMPOOL_MULTITHREADED = TRUE
+};
+
+/* The page is an array of blocks (allocations). */
+struct util_mempool_page {
+   /* The header (linked-list pointers). */
+   struct util_mempool_page *prev, *next;
+
+   /* Memory after the last member is dedicated to the page itself.
+    * The allocated size is always larger than this structure. */
+};
+
+struct util_mempool {
+   /* Public members. */
+   void *(*malloc)(struct util_mempool *pool);
+   void (*free)(struct util_mempool *pool, void *ptr);
+
+   /* Private members. */
+   struct util_mempool_block *first_free;
+
+   struct util_mempool_page list;
+
+   unsigned block_size;
+   unsigned page_size;
+   unsigned num_blocks;
+   unsigned num_pages;
+   enum util_mempool_threading threading;
+
+   pipe_mutex mutex;
+};
+
+void util_mempool_create(struct util_mempool *pool,
+                         unsigned item_size,
+                         unsigned num_blocks,
+                         enum util_mempool_threading threading);
+
+void util_mempool_destroy(struct util_mempool *pool);
+
+void util_mempool_set_thread_safety(struct util_mempool *pool,
+                                    enum util_mempool_threading threading);
+
+#define util_mempool_malloc(pool)    (pool)->malloc(pool)
+#define util_mempool_free(pool, ptr) (pool)->free(pool, ptr)
+
+#endif
diff --git a/src/gallium/auxiliary/util/u_network.c b/src/gallium/auxiliary/util/u_network.c
index 87ee0e4768..77f2c5fc7d 100644
--- a/src/gallium/auxiliary/util/u_network.c
+++ b/src/gallium/auxiliary/util/u_network.c
@@ -6,7 +6,7 @@
 #if defined(PIPE_SUBSYSTEM_WINDOWS_USER)
 #  include <winsock2.h>
 #  include <windows.h>
-#elif defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_APPLE)
+#elif defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_CYGWIN)
 #  include <sys/socket.h>
 #  include <netinet/in.h>
 #  include <unistd.h>