22 files changed, 1876 insertions, 461 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index d926b2de18..f5f2623e46 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -56,7 +56,6 @@
 #include "lp_bld_intr.h"
 #include "lp_bld_logic.h"
 #include "lp_bld_pack.h"
-#include "lp_bld_debug.h"
 #include "lp_bld_arit.h"
 
 
@@ -847,6 +846,11 @@ lp_build_round_sse41(struct lp_build_context *bld,
 }
 
 
+/**
+ * Return the integer part of a float (vector) value.  The returned value is
+ * a float (vector).
+ * Ex: trunc(-1.5) = 1.0
+ */
 LLVMValueRef
 lp_build_trunc(struct lp_build_context *bld,
                LLVMValueRef a)
@@ -869,6 +873,12 @@ lp_build_trunc(struct lp_build_context *bld,
 }
 
 
+/**
+ * Return float (vector) rounded to nearest integer (vector).  The returned
+ * value is a float (vector).
+ * Ex: round(0.9) = 1.0
+ * Ex: round(-1.5) = -2.0
+ */
 LLVMValueRef
 lp_build_round(struct lp_build_context *bld,
                LLVMValueRef a)
@@ -890,6 +900,11 @@ lp_build_round(struct lp_build_context *bld,
 }
 
 
+/**
+ * Return floor of float (vector), result is a float (vector)
+ * Ex: floor(1.1) = 1.0
+ * Ex: floor(-1.1) = -2.0
+ */
 LLVMValueRef
 lp_build_floor(struct lp_build_context *bld,
                LLVMValueRef a)
@@ -911,6 +926,11 @@ lp_build_floor(struct lp_build_context *bld,
 }
 
 
+/**
+ * Return ceiling of float (vector), returning float (vector).
+ * Ex: ceil( 1.1) = 2.0
+ * Ex: ceil(-1.1) = -1.0
+ */
 LLVMValueRef
 lp_build_ceil(struct lp_build_context *bld,
               LLVMValueRef a)
@@ -933,7 +953,7 @@ lp_build_ceil(struct lp_build_context *bld,
 
 
 /**
- * Return fractional part of 'a' computed as a - floor(f)
+ * Return fractional part of 'a' computed as a - floor(a)
  * Typically used in texture coord arithmetic.
  */
 LLVMValueRef
@@ -946,8 +966,9 @@ lp_build_fract(struct lp_build_context *bld,
 
 
 /**
- * Convert to integer, through whichever rounding method that's fastest,
- * typically truncating toward zero.
+ * Return the integer part of a float (vector) value.  The returned value is
+ * an integer (vector).
+ * Ex: itrunc(-1.5) = 1
  */
 LLVMValueRef
 lp_build_itrunc(struct lp_build_context *bld,
@@ -964,7 +985,10 @@ lp_build_itrunc(struct lp_build_context *bld,
 
 
 /**
- * Convert float[] to int[] with round().
+ * Return float (vector) rounded to nearest integer (vector).  The returned
+ * value is an integer (vector).
+ * Ex: iround(0.9) = 1
+ * Ex: iround(-1.5) = -2
  */
 LLVMValueRef
 lp_build_iround(struct lp_build_context *bld,
@@ -1007,7 +1031,9 @@ lp_build_iround(struct lp_build_context *bld,
 
 
 /**
- * Convert float[] to int[] with floor().
+ * Return floor of float (vector), result is an int (vector)
+ * Ex: ifloor(1.1) = 1.0
+ * Ex: ifloor(-1.1) = -2.0
  */
 LLVMValueRef
 lp_build_ifloor(struct lp_build_context *bld,
@@ -1034,29 +1060,31 @@ lp_build_ifloor(struct lp_build_context *bld,
       /* sign = a < 0 ? ~0 : 0 */
       sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
       sign = LLVMBuildAnd(bld->builder, sign, mask, "");
-      sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "");
-      lp_build_name(sign, "floor.sign");
+      sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign");
 
       /* offset = -0.99999(9)f */
-      offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa));
+      offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
       offset = LLVMConstBitCast(offset, int_vec_type);
 
-      /* offset = a < 0 ? -0.99999(9)f : 0.0f */
+      /* offset = a < 0 ? offset : 0.0f */
       offset = LLVMBuildAnd(bld->builder, offset, sign, "");
-      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "");
-      lp_build_name(offset, "floor.offset");
+      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset");
 
-      res = LLVMBuildAdd(bld->builder, a, offset, "");
-      lp_build_name(res, "floor.res");
+      res = LLVMBuildAdd(bld->builder, a, offset, "ifloor.res");
    }
 
-   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
-   lp_build_name(res, "floor");
+   /* round to nearest (toward zero) */
+   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "ifloor.res");
 
    return res;
 }
 
 
+/**
+ * Return ceiling of float (vector), returning int (vector).
+ * Ex: iceil( 1.1) = 2
+ * Ex: iceil(-1.1) = -1
+ */
 LLVMValueRef
 lp_build_iceil(struct lp_build_context *bld,
                LLVMValueRef a)
@@ -1072,12 +1100,31 @@ lp_build_iceil(struct lp_build_context *bld,
       res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
    }
    else {
-      /* TODO: mimic lp_build_ifloor() here */
-      assert(0);
-      res = bld->undef;
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      unsigned mantissa = lp_mantissa(type);
+      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
+      LLVMValueRef sign;
+      LLVMValueRef offset;
+
+      /* sign = a < 0 ? 0 : ~0 */
+      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
+      sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign");
+      sign = LLVMBuildNot(bld->builder, sign, "iceil.not");
+
+      /* offset = 0.99999(9)f */
+      offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
+      offset = LLVMConstBitCast(offset, int_vec_type);
+
+      /* offset = a < 0 ? 0.0 : offset */
+      offset = LLVMBuildAnd(bld->builder, offset, sign, "");
+      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset");
+
+      res = LLVMBuildAdd(bld->builder, a, offset, "iceil.res");
    }
 
-   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
+   /* round to nearest (toward zero) */
+   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "iceil.res");
 
    return res;
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.h b/src/gallium/auxiliary/gallivm/lp_bld_const.h
index d46b9f882b..7ee8fff140 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_const.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_const.h
@@ -107,4 +107,12 @@ lp_build_const_mask_aos(struct lp_type type,
                         const boolean cond[4]);
 
 
+static INLINE LLVMValueRef
+lp_build_const_int32(int i)
+{
+   return LLVMConstInt(LLVMInt32Type(), i, 0);
+}
+
+
+
 #endif /* !LP_BLD_CONST_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index 3f7f2ebde9..77012f1fac 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -83,6 +83,9 @@
  *
  * Although the result values can be scaled to an arbitrary bit width specified
  * by dst_width, the actual result type will have the same width.
+ *
+ * Ex: src = { float, float, float, float }
+ * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
  */
 LLVMValueRef
 lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
@@ -152,6 +155,8 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
 
 /**
  * Inverse of lp_build_clamped_float_to_unsigned_norm above.
+ * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1]
+ * return {float, float, float, float} with values in range [0, 1].
  */
 LLVMValueRef
 lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
@@ -219,18 +224,19 @@ lp_build_conv(LLVMBuilderRef builder,
    unsigned num_tmps;
    unsigned i;
 
-   /* Register width must remain constant */
-   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
-
    /* We must not loose or gain channels. Only precision */
    assert(src_type.length * num_srcs == dst_type.length * num_dsts);
 
    assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
    assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
+   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
+   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);
 
    tmp_type = src_type;
-   for(i = 0; i < num_srcs; ++i)
+   for(i = 0; i < num_srcs; ++i) {
+      assert(lp_check_value(src_type, src[i]));
       tmp[i] = src[i];
+   }
    num_tmps = num_srcs;
 
    /*
@@ -326,30 +332,25 @@ lp_build_conv(LLVMBuilderRef builder,
 
    /*
     * Truncate or expand bit width
+    *
+    * No data conversion should happen here, although the sign bits are
+    * crucial to avoid bad clamping.
     */
 
-   assert(!tmp_type.floating || tmp_type.width == dst_type.width);
+   {
+      struct lp_type new_type;
 
-   if(tmp_type.width > dst_type.width) {
-      assert(num_dsts == 1);
-      tmp[0] = lp_build_pack(builder, tmp_type, dst_type, TRUE, tmp, num_tmps);
-      tmp_type.width = dst_type.width;
-      tmp_type.length = dst_type.length;
-      num_tmps = 1;
-   }
+      new_type = tmp_type;
+      new_type.sign   = dst_type.sign;
+      new_type.width  = dst_type.width;
+      new_type.length = dst_type.length;
+
+      lp_build_resize(builder, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);
 
-   if(tmp_type.width < dst_type.width) {
-      assert(num_tmps == 1);
-      lp_build_unpack(builder, tmp_type, dst_type, tmp[0], tmp, num_dsts);
-      tmp_type.width = dst_type.width;
-      tmp_type.length = dst_type.length;
+      tmp_type = new_type;
       num_tmps = num_dsts;
    }
 
-   assert(tmp_type.width == dst_type.width);
-   assert(tmp_type.length == dst_type.length);
-   assert(num_tmps == num_dsts);
-
    /*
     * Scale to the widest range
     */
@@ -406,8 +407,10 @@ lp_build_conv(LLVMBuilderRef builder,
        }
     }
 
-   for(i = 0; i < num_dsts; ++i)
+   for(i = 0; i < num_dsts; ++i) {
       dst[i] = tmp[i];
+      assert(lp_check_value(dst_type, dst[i]));
+   }
 }
 
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h b/src/gallium/auxiliary/gallivm/lp_bld_format.h
index 5f5036e7bd..60e22d727a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h
@@ -48,9 +48,9 @@ struct lp_build_context;
  */
 
 LLVMValueRef
-lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
-                         const struct util_format_description *desc,
-                         LLVMValueRef packed);
+lp_build_format_swizzle_aos(const struct util_format_description *desc,
+                            struct lp_build_context *bld,
+                            LLVMValueRef unswizzled);
 
 LLVMValueRef
 lp_build_pack_rgba_aos(LLVMBuilderRef builder,
@@ -60,7 +60,9 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder,
 LLVMValueRef
 lp_build_fetch_rgba_aos(LLVMBuilderRef builder,
                         const struct util_format_description *format_desc,
-                        LLVMValueRef ptr,
+                        struct lp_type type,
+                        LLVMValueRef base_ptr,
+                        LLVMValueRef offset,
                         LLVMValueRef i,
                         LLVMValueRef j);
 
@@ -72,7 +74,7 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder,
 void
 lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
                             struct lp_build_context *bld,
-                            const LLVMValueRef *unswizzled,
+                            const LLVMValueRef unswizzled[4],
                             LLVMValueRef swizzled_out[4]);
 
 void
@@ -82,6 +84,11 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
                          LLVMValueRef packed,
                          LLVMValueRef rgba_out[4]);
 
+void
+lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder,
+                          struct lp_type dst_type,
+                          LLVMValueRef packed,
+                          LLVMValueRef *rgba);
 
 void
 lp_build_fetch_rgba_soa(LLVMBuilderRef builder,
@@ -93,5 +100,18 @@ lp_build_fetch_rgba_soa(LLVMBuilderRef builder,
                         LLVMValueRef j,
                         LLVMValueRef rgba_out[4]);
 
+/*
+ * YUV
+ */
+
+
+LLVMValueRef
+lp_build_fetch_subsampled_rgba_aos(LLVMBuilderRef builder,
+                                   const struct util_format_description *format_desc,
+                                   unsigned n,
+                                   LLVMValueRef base_ptr,
+                                   LLVMValueRef offset,
+                                   LLVMValueRef i,
+                                   LLVMValueRef j);
 
 #endif /* !LP_BLD_FORMAT_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index 87e3e72a6e..0f01fc1d75 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -38,33 +38,122 @@
 #include "util/u_math.h"
 #include "util/u_string.h"
 
+#include "lp_bld_arit.h"
 #include "lp_bld_init.h"
 #include "lp_bld_type.h"
 #include "lp_bld_flow.h"
+#include "lp_bld_const.h"
+#include "lp_bld_conv.h"
+#include "lp_bld_swizzle.h"
+#include "lp_bld_gather.h"
 #include "lp_bld_format.h"
 
 
 /**
+ * Basic swizzling.  Rearrange the order of the unswizzled array elements
+ * according to the format description.  PIPE_SWIZZLE_ZERO/ONE are supported
+ * too.
+ * Ex: if unswizzled[4] = {B, G, R, x}, then swizzled_out[4] = {R, G, B, 1}.
+ */
+LLVMValueRef
+lp_build_format_swizzle_aos(const struct util_format_description *desc,
+                            struct lp_build_context *bld,
+                            LLVMValueRef unswizzled)
+{
+   unsigned char swizzles[4];
+   unsigned chan;
+
+   assert(bld->type.length % 4 == 0);
+
+   for (chan = 0; chan < 4; ++chan) {
+      enum util_format_swizzle swizzle;
+
+      if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
+         /*
+          * For ZS formats do RGBA = ZZZ1
+          */
+         if (chan == 3) {
+            swizzle = UTIL_FORMAT_SWIZZLE_1;
+         } else if (desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_NONE) {
+            swizzle = UTIL_FORMAT_SWIZZLE_0;
+         } else {
+            swizzle = desc->swizzle[0];
+         }
+      } else {
+         swizzle = desc->swizzle[chan];
+      }
+      swizzles[chan] = swizzle;
+   }
+
+   return lp_build_swizzle_aos(bld, unswizzled, swizzles);
+}
+
+
+/**
+ * Whether the format matches the vector type, apart of swizzles.
+ */
+static INLINE boolean
+format_matches_type(const struct util_format_description *desc,
+                    struct lp_type type)
+{
+   enum util_format_type chan_type;
+   unsigned chan;
+
+   assert(type.length % 4 == 0);
+
+   if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN ||
+       desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB ||
+       desc->block.width != 1 ||
+       desc->block.height != 1) {
+      return FALSE;
+   }
+
+   if (type.floating) {
+      chan_type = UTIL_FORMAT_TYPE_FLOAT;
+   } else if (type.fixed) {
+      chan_type = UTIL_FORMAT_TYPE_FIXED;
+   } else if (type.sign) {
+      chan_type = UTIL_FORMAT_TYPE_SIGNED;
+   } else {
+      chan_type = UTIL_FORMAT_TYPE_UNSIGNED;
+   }
+
+   for (chan = 0; chan < desc->nr_channels; ++chan) {
+      if (desc->channel[chan].size != type.width) {
+         return FALSE;
+      }
+
+      if (desc->channel[chan].type != UTIL_FORMAT_TYPE_VOID) {
+         if (desc->channel[chan].type != chan_type ||
+             desc->channel[chan].normalized != type.norm) {
+            return FALSE;
+         }
+      }
+   }
+
+   return TRUE;
+}
+
+
+/**
  * Unpack a single pixel into its RGBA components.
  *
  * @param desc  the pixel format for the packed pixel value
  * @param packed integer pixel in a format such as PIPE_FORMAT_B8G8R8A8_UNORM
  *
- * @return RGBA in a 4 floats vector.
+ * @return RGBA in a float[4] or ubyte[4] or ushort[4] vector.
  */
-LLVMValueRef
-lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
-                         const struct util_format_description *desc,
-                         LLVMValueRef packed)
+static INLINE LLVMValueRef
+lp_build_unpack_arith_rgba_aos(LLVMBuilderRef builder,
+                               const struct util_format_description *desc,
+                               LLVMValueRef packed)
 {
    LLVMValueRef shifted, casted, scaled, masked;
    LLVMValueRef shifts[4];
    LLVMValueRef masks[4];
    LLVMValueRef scales[4];
-   LLVMValueRef swizzles[4];
-   LLVMValueRef aux[4];
+
    boolean normalized;
-   int empty_channel;
    boolean needs_uitofp;
    unsigned shift;
    unsigned i;
@@ -77,8 +166,7 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
 
    /* Do the intermediate integer computations with 32bit integers since it
     * matches floating point size */
-   if (desc->block.bits < 32)
-      packed = LLVMBuildZExt(builder, packed, LLVMInt32Type(), "");
+   assert (LLVMTypeOf(packed) == LLVMInt32Type());
 
    /* Broadcast the packed value to all four channels
     * before: packed = BGRA
@@ -98,7 +186,6 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
    /* Initialize vector constants */
    normalized = FALSE;
    needs_uitofp = FALSE;
-   empty_channel = -1;
    shift = 0;
 
    /* Loop over 4 color components */
@@ -109,7 +196,6 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
          shifts[i] = LLVMGetUndef(LLVMInt32Type());
          masks[i] = LLVMConstNull(LLVMInt32Type());
          scales[i] =  LLVMConstNull(LLVMFloatType());
-         empty_channel = i;
       }
       else {
          unsigned long long mask = (1ULL << bits) - 1;
@@ -158,52 +244,7 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
    else
       scaled = casted;
 
-   for (i = 0; i < 4; ++i)
-      aux[i] = LLVMGetUndef(LLVMFloatType());
-
-   /* Build swizzles vector to put components into R,G,B,A order */
-   for (i = 0; i < 4; ++i) {
-      enum util_format_swizzle swizzle;
-
-      if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
-         /*
-          * For ZS formats do RGBA = ZZZ1
-          */
-         if (i == 3) {
-            swizzle = UTIL_FORMAT_SWIZZLE_1;
-         } else if (desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_NONE) {
-            swizzle = UTIL_FORMAT_SWIZZLE_0;
-         } else {
-            swizzle = desc->swizzle[0];
-         }
-      } else {
-         swizzle = desc->swizzle[i];
-      }
-
-      switch (swizzle) {
-      case UTIL_FORMAT_SWIZZLE_X:
-      case UTIL_FORMAT_SWIZZLE_Y:
-      case UTIL_FORMAT_SWIZZLE_Z:
-      case UTIL_FORMAT_SWIZZLE_W:
-         swizzles[i] = LLVMConstInt(LLVMInt32Type(), swizzle, 0);
-         break;
-      case UTIL_FORMAT_SWIZZLE_0:
-         assert(empty_channel >= 0);
-         swizzles[i] = LLVMConstInt(LLVMInt32Type(), empty_channel, 0);
-         break;
-      case UTIL_FORMAT_SWIZZLE_1:
-         swizzles[i] = LLVMConstInt(LLVMInt32Type(), 4, 0);
-         aux[0] = LLVMConstReal(LLVMFloatType(), 1.0);
-         break;
-      case UTIL_FORMAT_SWIZZLE_NONE:
-         swizzles[i] = LLVMGetUndef(LLVMFloatType());
-         assert(0);
-         break;
-      }
-   }
-
-   return LLVMBuildShuffleVector(builder, scaled, LLVMConstVector(aux, 4),
-                                 LLVMConstVector(swizzles, 4), "");
+   return scaled;
 }
 
 
@@ -310,22 +351,65 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder,
 }
 
 
+
+
 /**
  * Fetch a pixel into a 4 float AoS.
  *
  * \param format_desc  describes format of the image we're fetching from
  * \param ptr  address of the pixel block (or the texel if uncompressed)
  * \param i, j  the sub-block pixel coordinates.  For non-compressed formats
- *              these will always be (0,).
- * \return  valueRef with the float[4] RGBA pixel
+ *              these will always be (0, 0).
+ * \return  a 4 element vector with the pixel's RGBA values.
  */
 LLVMValueRef
 lp_build_fetch_rgba_aos(LLVMBuilderRef builder,
                         const struct util_format_description *format_desc,
-                        LLVMValueRef ptr,
+                        struct lp_type type,
+                        LLVMValueRef base_ptr,
+                        LLVMValueRef offset,
                         LLVMValueRef i,
                         LLVMValueRef j)
 {
+   unsigned num_pixels = type.length / 4;
+   struct lp_build_context bld;
+
+   assert(type.length <= LP_MAX_VECTOR_LENGTH);
+   assert(type.length % 4 == 0);
+
+   lp_build_context_init(&bld, builder, type);
+
+   /*
+    * Trivial case
+    *
+    * The format matches the type (apart of a swizzle) so no need for
+    * scaling or converting.
+    */
+
+   if (format_matches_type(format_desc, type) &&
+       format_desc->block.bits <= type.width * 4 &&
+       util_is_pot(format_desc->block.bits)) {
+      LLVMValueRef packed;
+
+      /*
+       * The format matches the type (apart of a swizzle) so no need for
+       * scaling or converting.
+       */
+
+      packed = lp_build_gather(builder, type.length/4,
+                               format_desc->block.bits, type.width*4,
+                               base_ptr, offset);
+
+      assert(format_desc->block.bits <= type.width * type.length);
+
+      packed = LLVMBuildBitCast(builder, packed, lp_build_vec_type(type), "");
+
+      return lp_build_format_swizzle_aos(format_desc, &bld, packed);
+   }
+
+   /*
+    * Bit arithmetic
+    */
 
    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
        (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
@@ -337,21 +421,77 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder,
        format_desc->is_bitmask &&
        !format_desc->is_mixed &&
        (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED ||
-        format_desc->channel[1].type == UTIL_FORMAT_TYPE_UNSIGNED))
-   {
-      LLVMValueRef packed;
+        format_desc->channel[1].type == UTIL_FORMAT_TYPE_UNSIGNED)) {
+
+      LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
+      LLVMValueRef res;
+      unsigned k;
+
+      /*
+       * Unpack a pixel at a time into a <4 x float> RGBA vector
+       */
+
+      for (k = 0; k < num_pixels; ++k) {
+         LLVMValueRef packed;
+
+         packed = lp_build_gather_elem(builder, num_pixels,
+                                       format_desc->block.bits, 32,
+                                       base_ptr, offset, k);
 
-      ptr = LLVMBuildBitCast(builder, ptr,
-                             LLVMPointerType(LLVMIntType(format_desc->block.bits), 0) ,
-                             "");
+         tmps[k] = lp_build_unpack_arith_rgba_aos(builder, format_desc,
+                                                  packed);
+      }
+
+      /*
+       * Type conversion.
+       *
+       * TODO: We could avoid floating conversion for integer to
+       * integer conversions.
+       */
 
-      packed = LLVMBuildLoad(builder, ptr, "packed");
+      lp_build_conv(builder,
+                    lp_float32_vec4_type(),
+                    type,
+                    tmps, num_pixels, &res, 1);
 
-      return lp_build_unpack_rgba_aos(builder, format_desc, packed);
+      return lp_build_format_swizzle_aos(format_desc, &bld, res);
    }
-   else if (format_desc->fetch_rgba_float) {
+
+   /*
+    * YUV / subsampled formats
+    */
+
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
+      struct lp_type tmp_type;
+      LLVMValueRef tmp;
+
+      memset(&tmp_type, 0, sizeof tmp_type);
+      tmp_type.width = 8;
+      tmp_type.length = num_pixels * 4;
+      tmp_type.norm = TRUE;
+
+      tmp = lp_build_fetch_subsampled_rgba_aos(builder,
+                                               format_desc,
+                                               num_pixels,
+                                               base_ptr,
+                                               offset,
+                                               i, j);
+
+      lp_build_conv(builder,
+                    tmp_type, type,
+                    &tmp, 1, &tmp, 1);
+
+      return tmp;
+   }
+
+   /*
+    * Fallback to util_format_description::fetch_rgba_8unorm().
+    */
+
+   if (format_desc->fetch_rgba_8unorm &&
+       !type.floating && type.width == 8 && !type.sign && type.norm) {
       /*
-       * Fallback to calling util_format_description::fetch_rgba_float.
+       * Fallback to calling util_format_description::fetch_rgba_8unorm.
        *
        * This is definitely not the most efficient way of fetching pixels, as
        * we miss the opportunity to do vectorization, but this it is a
@@ -361,9 +501,113 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder,
 
       LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder)));
       char name[256];
+      LLVMTypeRef i8t = LLVMInt8Type();
+      LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
+      LLVMTypeRef i32t = LLVMInt32Type();
       LLVMValueRef function;
+      LLVMValueRef tmp_ptr;
       LLVMValueRef tmp;
-      LLVMValueRef args[4];
+      LLVMValueRef res;
+      unsigned k;
+
+      util_snprintf(name, sizeof name, "util_format_%s_fetch_rgba_8unorm",
+                    format_desc->short_name);
+
+      /*
+       * Declare and bind format_desc->fetch_rgba_8unorm().
+       */
+
+      function = LLVMGetNamedFunction(module, name);
+      if (!function) {
+         LLVMTypeRef ret_type;
+         LLVMTypeRef arg_types[4];
+         LLVMTypeRef function_type;
+
+         ret_type = LLVMVoidType();
+         arg_types[0] = pi8t;
+         arg_types[1] = pi8t;
+         arg_types[3] = arg_types[2] = LLVMIntType(sizeof(unsigned) * 8);
+         function_type = LLVMFunctionType(ret_type, arg_types, Elements(arg_types), 0);
+         function = LLVMAddFunction(module, name, function_type);
+
+         LLVMSetFunctionCallConv(function, LLVMCCallConv);
+         LLVMSetLinkage(function, LLVMExternalLinkage);
+
+         assert(LLVMIsDeclaration(function));
+
+         LLVMAddGlobalMapping(lp_build_engine, function,
+                              func_to_pointer((func_pointer)format_desc->fetch_rgba_8unorm));
+      }
+
+      tmp_ptr = lp_build_alloca(builder, i32t, "");
+
+      res = LLVMGetUndef(LLVMVectorType(i32t, num_pixels));
+
+      /*
+       * Invoke format_desc->fetch_rgba_8unorm() for each pixel and insert the result
+       * in the SoA vectors.
+       */
+
+      for (k = 0; k < num_pixels; ++k) {
+         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), k, 0);
+         LLVMValueRef args[4];
+
+         args[0] = LLVMBuildBitCast(builder, tmp_ptr, pi8t, "");
+         args[1] = lp_build_gather_elem_ptr(builder, num_pixels,
+                                            base_ptr, offset, k);
+
+         if (num_pixels == 1) {
+            args[2] = i;
+            args[3] = j;
+         }
+         else {
+            args[2] = LLVMBuildExtractElement(builder, i, index, "");
+            args[3] = LLVMBuildExtractElement(builder, j, index, "");
+         }
+
+         LLVMBuildCall(builder, function, args, Elements(args), "");
+
+         tmp = LLVMBuildLoad(builder, tmp_ptr, "");
+
+         if (num_pixels == 1) {
+            res = tmp;
+         }
+         else {
+            res = LLVMBuildInsertElement(builder, res, tmp, index, "");
+         }
+      }
+
+      /* Bitcast from <n x i32> to <4n x i8> */
+      res = LLVMBuildBitCast(builder, res, bld.vec_type, "");
+
+      return res;
+   }
+
+
+   /*
+    * Fallback to util_format_description::fetch_rgba_float().
+    */
+
+   if (format_desc->fetch_rgba_float) {
+      /*
+       * Fallback to calling util_format_description::fetch_rgba_float.
+       *
+       * This is definitely not the most efficient way of fetching pixels, as
+       * we miss the opportunity to do vectorization, but this it is a
+       * convenient for formats or scenarios for which there was no opportunity
+       * or incentive to optimize.
+       */
+
+      LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder)));
+      char name[256];
+      LLVMTypeRef f32t = LLVMFloatType();
+      LLVMTypeRef f32x4t = LLVMVectorType(f32t, 4);
+      LLVMTypeRef pf32t = LLVMPointerType(f32t, 0);
+      LLVMValueRef function;
+      LLVMValueRef tmp_ptr;
+      LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
+      LLVMValueRef res;
+      unsigned k;
 
       util_snprintf(name, sizeof name, "util_format_%s_fetch_rgba_float",
                     format_desc->short_name);
@@ -379,7 +623,7 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder,
          LLVMTypeRef function_type;
 
          ret_type = LLVMVoidType();
-         arg_types[0] = LLVMPointerType(LLVMFloatType(), 0);
+         arg_types[0] = pf32t;
          arg_types[1] = LLVMPointerType(LLVMInt8Type(), 0);
          arg_types[3] = arg_types[2] = LLVMIntType(sizeof(unsigned) * 8);
          function_type = LLVMFunctionType(ret_type, arg_types, Elements(arg_types), 0);
@@ -394,25 +638,43 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder,
                               func_to_pointer((func_pointer)format_desc->fetch_rgba_float));
       }
 
-      tmp = lp_build_alloca(builder, LLVMVectorType(LLVMFloatType(), 4), "");
+      tmp_ptr = lp_build_alloca(builder, f32x4t, "");
 
       /*
        * Invoke format_desc->fetch_rgba_float() for each pixel and insert the result
        * in the SoA vectors.
        */
 
-      args[0] = LLVMBuildBitCast(builder, tmp,
-                                 LLVMPointerType(LLVMFloatType(), 0), "");
-      args[1] = ptr;
-      args[2] = i;
-      args[3] = j;
+      for (k = 0; k < num_pixels; ++k) {
+         LLVMValueRef args[4];
 
-      LLVMBuildCall(builder, function, args, Elements(args), "");
+         args[0] = LLVMBuildBitCast(builder, tmp_ptr, pf32t, "");
+         args[1] = lp_build_gather_elem_ptr(builder, num_pixels,
+                                            base_ptr, offset, k);
 
-      return LLVMBuildLoad(builder, tmp, "");
-   }
-   else {
-      assert(0);
-      return LLVMGetUndef(LLVMVectorType(LLVMFloatType(), 4));
+         if (num_pixels == 1) {
+            args[2] = i;
+            args[3] = j;
+         }
+         else {
+            LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), k, 0);
+            args[2] = LLVMBuildExtractElement(builder, i, index, "");
+            args[3] = LLVMBuildExtractElement(builder, j, index, "");
+         }
+
+         LLVMBuildCall(builder, function, args, Elements(args), "");
+
+         tmps[k] = LLVMBuildLoad(builder, tmp_ptr, "");
+      }
+
+      lp_build_conv(builder,
+                    lp_float32_vec4_type(),
+                    type,
+                    tmps, num_pixels, &res, 1);
+
+      return res;
    }
+
+   assert(0);
+   return lp_build_undef(type);
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index e1b94adc85..9f405921b0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -36,7 +36,7 @@
 #include "lp_bld_const.h"
 #include "lp_bld_conv.h"
 #include "lp_bld_swizzle.h"
-#include "lp_bld_sample.h" /* for lp_build_gather */
+#include "lp_bld_gather.h"
 #include "lp_bld_format.h"
 
 
@@ -251,6 +251,41 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
 }
 
 
+void
+lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder,
+                          struct lp_type dst_type,
+                          LLVMValueRef packed,
+                          LLVMValueRef *rgba)
+{
+   LLVMValueRef mask = lp_build_const_int_vec(dst_type, 0xff);
+   unsigned chan;
+
+   packed = LLVMBuildBitCast(builder, packed,
+                             lp_build_int_vec_type(dst_type), "");
+
+   /* Decode the input vector components */
+   for (chan = 0; chan < 4; ++chan) {
+      unsigned start = chan*8;
+      unsigned stop = start + 8;
+      LLVMValueRef input;
+
+      input = packed;
+
+      if (start)
+         input = LLVMBuildLShr(builder, input,
+                               lp_build_const_int_vec(dst_type, start), "");
+
+      if (stop < 32)
+         input = LLVMBuildAnd(builder, input, mask, "");
+
+      input = lp_build_unsigned_norm_to_float(builder, 8, dst_type, input);
+
+      rgba[chan] = input;
+   }
+}
+
+
+
 /**
  * Fetch a texels from a texture, returning them in SoA layout.
  *
@@ -311,20 +346,49 @@ lp_build_fetch_rgba_soa(LLVMBuilderRef builder,
                                format_desc,
                                type,
                                packed, rgba_out);
+      return;
    }
-   else {
-      /*
-       * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
-       *
-       * This is not the most efficient way of fetching pixels, as we
-       * miss some opportunities to do vectorization, but this is
-       * convenient for formats or scenarios for which there was no
-       * opportunity or incentive to optimize.
-       */
 
+   /*
+    * Try calling lp_build_fetch_rgba_aos for all pixels.
+    */
+
+   if (util_format_fits_8unorm(format_desc) &&
+       type.floating && type.width == 32 && type.length == 4) {
+      struct lp_type tmp_type;
+      LLVMValueRef tmp;
+
+      memset(&tmp_type, 0, sizeof tmp_type);
+      tmp_type.width = 8;
+      tmp_type.length = type.length * 4;
+      tmp_type.norm = TRUE;
+
+      tmp = lp_build_fetch_rgba_aos(builder, format_desc, tmp_type,
+                                    base_ptr, offset, i, j);
+
+      lp_build_rgba8_to_f32_soa(builder,
+                                type,
+                                tmp,
+                                rgba_out);
+
+      return;
+   }
+
+   /*
+    * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
+    *
+    * This is not the most efficient way of fetching pixels, as we
+    * miss some opportunities to do vectorization, but this is
+    * convenient for formats or scenarios for which there was no
+    * opportunity or incentive to optimize.
+    */
+
+   {
       unsigned k, chan;
+      struct lp_type tmp_type;
 
-      assert(type.floating);
+      tmp_type = type;
+      tmp_type.length = 4;
 
       for (chan = 0; chan < 4; ++chan) {
          rgba_out[chan] = lp_build_undef(type);
@@ -334,18 +398,17 @@ lp_build_fetch_rgba_soa(LLVMBuilderRef builder,
       for(k = 0; k < type.length; ++k) {
          LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), k, 0);
          LLVMValueRef offset_elem;
-         LLVMValueRef ptr;
          LLVMValueRef i_elem, j_elem;
          LLVMValueRef tmp;
 
          offset_elem = LLVMBuildExtractElement(builder, offset, index, "");
-         ptr = LLVMBuildGEP(builder, base_ptr, &offset_elem, 1, "");
 
          i_elem = LLVMBuildExtractElement(builder, i, index, "");
          j_elem = LLVMBuildExtractElement(builder, j, index, "");
 
          /* Get a single float[4]={R,G,B,A} pixel */
-         tmp = lp_build_fetch_rgba_aos(builder, format_desc, ptr,
+         tmp = lp_build_fetch_rgba_aos(builder, format_desc, tmp_type,
+                                       base_ptr, offset_elem,
                                        i_elem, j_elem);
 
          /*
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
new file mode 100644
index 0000000000..0a5038bc98
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
@@ -0,0 +1,399 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * YUV pixel format manipulation.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include "util/u_format.h"
+
+#include "lp_bld_arit.h"
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_conv.h"
+#include "lp_bld_gather.h"
+#include "lp_bld_format.h"
+
+
+/**
+ * Extract Y, U, V channels from packed UYVY.
+ * @param packed  is a <n x i32> vector with the packed UYVY blocks
+ * @param i  is a <n x i32> vector with the x pixel coordinate (0 or 1)
+ */
+static void
+uyvy_to_yuv_soa(LLVMBuilderRef builder,
+                unsigned n,
+                LLVMValueRef packed,
+                LLVMValueRef i,
+                LLVMValueRef *y,
+                LLVMValueRef *u,
+                LLVMValueRef *v)
+{
+   struct lp_type type;
+   LLVMValueRef shift, mask;
+
+   memset(&type, 0, sizeof type);
+   type.width = 32;
+   type.length = n;
+
+   assert(lp_check_value(type, packed));
+   assert(lp_check_value(type, i));
+
+   /*
+    * y = (uyvy >> 16*i) & 0xff
+    * u = (uyvy        ) & 0xff
+    * v = (uyvy >> 16  ) & 0xff
+    */
+
+   shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(type, 16), "");
+   shift = LLVMBuildAdd(builder, shift, lp_build_const_int_vec(type, 8), "");
+   *y = LLVMBuildLShr(builder, packed, shift, "");
+   *u = packed;
+   *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 16), "");
+
+   mask = lp_build_const_int_vec(type, 0xff);
+
+   *y = LLVMBuildAnd(builder, *y, mask, "y");
+   *u = LLVMBuildAnd(builder, *u, mask, "u");
+   *v = LLVMBuildAnd(builder, *v, mask, "v");
+}
+
+
+/**
+ * Extract Y, U, V channels from packed YUYV.
+ * @param packed  is a <n x i32> vector with the packed YUYV blocks
+ * @param i  is a <n x i32> vector with the x pixel coordinate (0 or 1)
+ */
+static void
+yuyv_to_yuv_soa(LLVMBuilderRef builder,
+                unsigned n,
+                LLVMValueRef packed,
+                LLVMValueRef i,
+                LLVMValueRef *y,
+                LLVMValueRef *u,
+                LLVMValueRef *v)
+{
+   struct lp_type type;
+   LLVMValueRef shift, mask;
+
+   memset(&type, 0, sizeof type);
+   type.width = 32;
+   type.length = n;
+
+   assert(lp_check_value(type, packed));
+   assert(lp_check_value(type, i));
+
+   /*
+    * y = (yuyv >> 16*i) & 0xff
+    * u = (yuyv >> 8   ) & 0xff
+    * v = (yuyv >> 24  ) & 0xff
+    */
+
+   shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(type, 16), "");
+   *y = LLVMBuildLShr(builder, packed, shift, "");
+   *u = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 8), "");
+   *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 24), "");
+
+   mask = lp_build_const_int_vec(type, 0xff);
+
+   *y = LLVMBuildAnd(builder, *y, mask, "y");
+   *u = LLVMBuildAnd(builder, *u, mask, "u");
+   *v = LLVMBuildAnd(builder, *v, mask, "v");
+}
+
+
+static INLINE void
+yuv_to_rgb_soa(LLVMBuilderRef builder,
+               unsigned n,
+               LLVMValueRef y, LLVMValueRef u, LLVMValueRef v,
+               LLVMValueRef *r, LLVMValueRef *g, LLVMValueRef *b)
+{
+   struct lp_type type;
+   struct lp_build_context bld;
+
+   LLVMValueRef c0;
+   LLVMValueRef c8;
+   LLVMValueRef c16;
+   LLVMValueRef c128;
+   LLVMValueRef c255;
+
+   LLVMValueRef cy;
+   LLVMValueRef cug;
+   LLVMValueRef cub;
+   LLVMValueRef cvr;
+   LLVMValueRef cvg;
+
+   memset(&type, 0, sizeof type);
+   type.sign = TRUE;
+   type.width = 32;
+   type.length = n;
+
+   lp_build_context_init(&bld, builder, type);
+
+   assert(lp_check_value(type, y));
+   assert(lp_check_value(type, u));
+   assert(lp_check_value(type, v));
+
+   /*
+    * Constants
+    */
+
+   c0   = lp_build_const_int_vec(type,   0);
+   c8   = lp_build_const_int_vec(type,   8);
+   c16  = lp_build_const_int_vec(type,  16);
+   c128 = lp_build_const_int_vec(type, 128);
+   c255 = lp_build_const_int_vec(type, 255);
+
+   cy  = lp_build_const_int_vec(type,  298);
+   cug = lp_build_const_int_vec(type, -100);
+   cub = lp_build_const_int_vec(type,  516);
+   cvr = lp_build_const_int_vec(type,  409);
+   cvg = lp_build_const_int_vec(type, -208);
+
+   /*
+    *  y -= 16;
+    *  u -= 128;
+    *  v -= 128;
+    */
+
+   y = LLVMBuildSub(builder, y, c16, "");
+   u = LLVMBuildSub(builder, u, c128, "");
+   v = LLVMBuildSub(builder, v, c128, "");
+
+   /*
+    * r = 298 * _y            + 409 * _v + 128;
+    * g = 298 * _y - 100 * _u - 208 * _v + 128;
+    * b = 298 * _y + 516 * _u            + 128;
+    */
+
+   y = LLVMBuildMul(builder, y, cy, "");
+   y = LLVMBuildAdd(builder, y, c128, "");
+
+   *r = LLVMBuildMul(builder, v, cvr, "");
+   *g = LLVMBuildAdd(builder,
+                     LLVMBuildMul(builder, u, cug, ""),
+                     LLVMBuildMul(builder, v, cvg, ""),
+                     "");
+   *b = LLVMBuildMul(builder, u, cub, "");
+
+   *r = LLVMBuildAdd(builder, *r, y, "");
+   *g = LLVMBuildAdd(builder, *g, y, "");
+   *b = LLVMBuildAdd(builder, *b, y, "");
+
+   /*
+    * r >>= 8;
+    * g >>= 8;
+    * b >>= 8;
+    */
+
+   *r = LLVMBuildAShr(builder, *r, c8, "r");
+   *g = LLVMBuildAShr(builder, *g, c8, "g");
+   *b = LLVMBuildAShr(builder, *b, c8, "b");
+
+   /*
+    * Clamp
+    */
+
+   *r = lp_build_clamp(&bld, *r, c0, c255);
+   *g = lp_build_clamp(&bld, *g, c0, c255);
+   *b = lp_build_clamp(&bld, *b, c0, c255);
+}
+
+
+static LLVMValueRef
+rgb_to_rgba_aos(LLVMBuilderRef builder,
+                unsigned n,
+                LLVMValueRef r, LLVMValueRef g, LLVMValueRef b)
+{
+   struct lp_type type;
+   LLVMValueRef a;
+   LLVMValueRef rgba;
+
+   memset(&type, 0, sizeof type);
+   type.sign = TRUE;
+   type.width = 32;
+   type.length = n;
+
+   assert(lp_check_value(type, r));
+   assert(lp_check_value(type, g));
+   assert(lp_check_value(type, b));
+
+   /*
+    * Make a 4 x unorm8 vector
+    */
+
+   r = r;
+   g = LLVMBuildShl(builder, g, lp_build_const_int_vec(type, 8), "");
+   b = LLVMBuildShl(builder, b, lp_build_const_int_vec(type, 16), "");
+   a = lp_build_const_int_vec(type, 0xff000000);
+
+   rgba = r;
+   rgba = LLVMBuildOr(builder, rgba, g, "");
+   rgba = LLVMBuildOr(builder, rgba, b, "");
+   rgba = LLVMBuildOr(builder, rgba, a, "");
+
+   rgba = LLVMBuildBitCast(builder, rgba,
+                           LLVMVectorType(LLVMInt8Type(), 4*n), "");
+
+   return rgba;
+}
+
+
+/**
+ * Convert from <n x i32> packed UYVY to <4n x i8> RGBA AoS
+ */
+static LLVMValueRef
+uyvy_to_rgba_aos(LLVMBuilderRef builder,
+                 unsigned n,
+                 LLVMValueRef packed,
+                 LLVMValueRef i)
+{
+   LLVMValueRef y, u, v;
+   LLVMValueRef r, g, b;
+   LLVMValueRef rgba;
+
+   uyvy_to_yuv_soa(builder, n, packed, i, &y, &u, &v);
+   yuv_to_rgb_soa(builder, n, y, u, v, &r, &g, &b);
+   rgba = rgb_to_rgba_aos(builder, n, r, g, b);
+
+   return rgba;
+}
+
+
+/**
+ * Convert from <n x i32> packed YUYV to <4n x i8> RGBA AoS
+ */
+static LLVMValueRef
+yuyv_to_rgba_aos(LLVMBuilderRef builder,
+                 unsigned n,
+                 LLVMValueRef packed,
+                 LLVMValueRef i)
+{
+   LLVMValueRef y, u, v;
+   LLVMValueRef r, g, b;
+   LLVMValueRef rgba;
+
+   yuyv_to_yuv_soa(builder, n, packed, i, &y, &u, &v);
+   yuv_to_rgb_soa(builder, n, y, u, v, &r, &g, &b);
+   rgba = rgb_to_rgba_aos(builder, n, r, g, b);
+
+   return rgba;
+}
+
+
+/**
+ * Convert from <n x i32> packed RG_BG to <4n x i8> RGBA AoS
+ */
+static LLVMValueRef
+rgbg_to_rgba_aos(LLVMBuilderRef builder,
+                 unsigned n,
+                 LLVMValueRef packed,
+                 LLVMValueRef i)
+{
+   LLVMValueRef r, g, b;
+   LLVMValueRef rgba;
+
+   uyvy_to_yuv_soa(builder, n, packed, i, &g, &r, &b);
+   rgba = rgb_to_rgba_aos(builder, n, r, g, b);
+
+   return rgba;
+}
+
+
+/**
+ * Convert from <n x i32> packed GR_GB to <4n x i8> RGBA AoS
+ */
+static LLVMValueRef
+grgb_to_rgba_aos(LLVMBuilderRef builder,
+                 unsigned n,
+                 LLVMValueRef packed,
+                 LLVMValueRef i)
+{
+   LLVMValueRef r, g, b;
+   LLVMValueRef rgba;
+
+   yuyv_to_yuv_soa(builder, n, packed, i, &g, &r, &b);
+   rgba = rgb_to_rgba_aos(builder, n, r, g, b);
+
+   return rgba;
+}
+
+
+/**
+ * @param n  is the number of pixels processed
+ * @param packed  is a <n x i32> vector with the packed YUYV blocks
+ * @param i  is a <n x i32> vector with the x pixel coordinate (0 or 1)
+ * @return  a <4*n x i8> vector with the pixel RGBA values in AoS
+ */
+LLVMValueRef
+lp_build_fetch_subsampled_rgba_aos(LLVMBuilderRef builder,
+                                   const struct util_format_description *format_desc,
+                                   unsigned n,
+                                   LLVMValueRef base_ptr,
+                                   LLVMValueRef offset,
+                                   LLVMValueRef i,
+                                   LLVMValueRef j)
+{
+   LLVMValueRef packed;
+   LLVMValueRef rgba;
+
+   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED);
+   assert(format_desc->block.bits == 32);
+   assert(format_desc->block.width == 2);
+   assert(format_desc->block.height == 1);
+
+   packed = lp_build_gather(builder, n, 32, 32, base_ptr, offset);
+
+   (void)j;
+
+   switch (format_desc->format) {
+   case PIPE_FORMAT_UYVY:
+      rgba = uyvy_to_rgba_aos(builder, n, packed, i);
+      break;
+   case PIPE_FORMAT_YUYV:
+      rgba = yuyv_to_rgba_aos(builder, n, packed, i);
+      break;
+   case PIPE_FORMAT_R8G8_B8G8_UNORM:
+      rgba = rgbg_to_rgba_aos(builder, n, packed, i);
+      break;
+   case PIPE_FORMAT_G8R8_G8B8_UNORM:
+      rgba = grgb_to_rgba_aos(builder, n, packed, i);
+      break;
+   default:
+      assert(0);
+      rgba =  LLVMGetUndef(LLVMVectorType(LLVMInt8Type(), 4*n));
+      break;
+   }
+
+   return rgba;
+}
+
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.c b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
new file mode 100644
index 0000000000..d60472e065
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
@@ -0,0 +1,148 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#include "util/u_debug.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_const.h"
+#include "lp_bld_format.h"
+#include "lp_bld_gather.h"
+
+
+/**
+ * Get the pointer to one element from scatter positions in memory.
+ *
+ * @sa lp_build_gather()
+ */
+LLVMValueRef
+lp_build_gather_elem_ptr(LLVMBuilderRef builder,
+                         unsigned length,
+                         LLVMValueRef base_ptr,
+                         LLVMValueRef offsets,
+                         unsigned i)
+{
+   LLVMValueRef offset;
+   LLVMValueRef ptr;
+
+   assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8Type(), 0));
+
+   if (length == 1) {
+      assert(i == 0);
+      offset = offsets;
+   } else {
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      offset = LLVMBuildExtractElement(builder, offsets, index, "");
+   }
+
+   ptr = LLVMBuildGEP(builder, base_ptr, &offset, 1, "");
+
+   return ptr;
+}
+
+
+/**
+ * Gather one element from scatter positions in memory.
+ *
+ * @sa lp_build_gather()
+ */
+LLVMValueRef
+lp_build_gather_elem(LLVMBuilderRef builder,
+                     unsigned length,
+                     unsigned src_width,
+                     unsigned dst_width,
+                     LLVMValueRef base_ptr,
+                     LLVMValueRef offsets,
+                     unsigned i)
+{
+   LLVMTypeRef src_type = LLVMIntType(src_width);
+   LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
+   LLVMTypeRef dst_elem_type = LLVMIntType(dst_width);
+   LLVMValueRef ptr;
+   LLVMValueRef res;
+
+   assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8Type(), 0));
+
+   ptr = lp_build_gather_elem_ptr(builder, length, base_ptr, offsets, i);
+   ptr = LLVMBuildBitCast(builder, ptr, src_ptr_type, "");
+   res = LLVMBuildLoad(builder, ptr, "");
+
+   assert(src_width <= dst_width);
+   if (src_width > dst_width)
+      res = LLVMBuildTrunc(builder, res, dst_elem_type, "");
+   if (src_width < dst_width)
+      res = LLVMBuildZExt(builder, res, dst_elem_type, "");
+
+   return res;
+}
+
+
+/**
+ * Gather elements from scatter positions in memory into a single vector.
+ * Use for fetching texels from a texture.
+ * For SSE, typical values are length=4, src_width=32, dst_width=32.
+ *
+ * @param length length of the offsets
+ * @param src_width src element width in bits
+ * @param dst_width result element width in bits (src will be expanded to fit)
+ * @param base_ptr base pointer, should be a i8 pointer type.
+ * @param offsets vector with offsets
+ */
+LLVMValueRef
+lp_build_gather(LLVMBuilderRef builder,
+                unsigned length,
+                unsigned src_width,
+                unsigned dst_width,
+                LLVMValueRef base_ptr,
+                LLVMValueRef offsets)
+{
+   LLVMValueRef res;
+
+   if (length == 1) {
+      /* Scalar */
+      return lp_build_gather_elem(builder, length,
+                                  src_width, dst_width,
+                                  base_ptr, offsets, 0);
+   } else {
+      /* Vector */
+
+      LLVMTypeRef dst_elem_type = LLVMIntType(dst_width);
+      LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length);
+      unsigned i;
+
+      res = LLVMGetUndef(dst_vec_type);
+      for (i = 0; i < length; ++i) {
+         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+         LLVMValueRef elem;
+         elem = lp_build_gather_elem(builder, length,
+                                     src_width, dst_width,
+                                     base_ptr, offsets, i);
+         res = LLVMBuildInsertElement(builder, res, elem, index, "");
+      }
+   }
+
+   return res;
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.h b/src/gallium/auxiliary/gallivm/lp_bld_gather.h
new file mode 100644
index 0000000000..131af8ea07
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.h
@@ -0,0 +1,61 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#ifndef LP_BLD_GATHER_H_
+#define LP_BLD_GATHER_H_
+
+
+#include "gallivm/lp_bld.h"
+
+
+LLVMValueRef
+lp_build_gather_elem_ptr(LLVMBuilderRef builder,
+                         unsigned length,
+                         LLVMValueRef base_ptr,
+                         LLVMValueRef offsets,
+                         unsigned i);
+
+LLVMValueRef
+lp_build_gather_elem(LLVMBuilderRef builder,
+                     unsigned length,
+                     unsigned src_width,
+                     unsigned dst_width,
+                     LLVMValueRef base_ptr,
+                     LLVMValueRef offsets,
+                     unsigned i);
+
+LLVMValueRef
+lp_build_gather(LLVMBuilderRef builder,
+                unsigned length,
+                unsigned src_width,
+                unsigned dst_width,
+                LLVMValueRef base_ptr,
+                LLVMValueRef offsets);
+
+
+#endif /* LP_BLD_GATHER_H_ */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index 44cfdc4d3f..69353dea09 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -32,6 +32,8 @@
 #include "lp_bld_debug.h"
 #include "lp_bld_init.h"
 
+#include <llvm-c/Transforms/Scalar.h>
+
 
 #ifdef DEBUG
 unsigned gallivm_debug = 0;
@@ -50,6 +52,7 @@ LLVMModuleRef lp_build_module = NULL;
 LLVMExecutionEngineRef lp_build_engine = NULL;
 LLVMModuleProviderRef lp_build_provider = NULL;
 LLVMTargetDataRef lp_build_target = NULL;
+LLVMPassManagerRef lp_build_pass = NULL;
 
 
 /*
@@ -127,6 +130,33 @@ lp_build_init(void)
    if (!lp_build_target)
       lp_build_target = LLVMGetExecutionEngineTargetData(lp_build_engine);
 
+   if (!lp_build_pass) {
+      lp_build_pass = LLVMCreateFunctionPassManager(lp_build_provider);
+      LLVMAddTargetData(lp_build_target, lp_build_pass);
+
+      if ((gallivm_debug & GALLIVM_DEBUG_NO_OPT) == 0) {
+         /* These are the passes currently listed in llvm-c/Transforms/Scalar.h,
+          * but there are more on SVN. */
+         /* TODO: Add more passes */
+         LLVMAddCFGSimplificationPass(lp_build_pass);
+         LLVMAddPromoteMemoryToRegisterPass(lp_build_pass);
+         LLVMAddConstantPropagationPass(lp_build_pass);
+         if(util_cpu_caps.has_sse4_1) {
+            /* FIXME: There is a bug in this pass, whereby the combination of fptosi
+             * and sitofp (necessary for trunc/floor/ceil/round implementation)
+             * somehow becomes invalid code.
+             */
+            LLVMAddInstructionCombiningPass(lp_build_pass);
+         }
+         LLVMAddGVNPass(lp_build_pass);
+      } else {
+         /* We need at least this pass to prevent the backends to fail in
+          * unexpected ways.
+          */
+         LLVMAddPromoteMemoryToRegisterPass(lp_build_pass);
+      }
+   }
+
    util_cpu_detect();
 
 #if 0
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.h b/src/gallium/auxiliary/gallivm/lp_bld_init.h
index 0ec2afcd1b..a32ced9b4c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.h
@@ -38,6 +38,7 @@ extern LLVMModuleRef lp_build_module;
 extern LLVMExecutionEngineRef lp_build_engine;
 extern LLVMModuleProviderRef lp_build_provider;
 extern LLVMTargetDataRef lp_build_target;
+extern LLVMPassManagerRef lp_build_pass;
 
 
 void
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
index d13fa1a5d0..39854e43b1 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -34,6 +34,7 @@
 
 
 #include "util/u_cpu_detect.h"
+#include "util/u_memory.h"
 #include "util/u_debug.h"
 
 #include "lp_bld_type.h"
@@ -187,12 +188,10 @@ lp_build_compare(LLVMBuilderRef builder,
             return lp_build_undef(type);
          }
 
-         /* There are no signed byte and unsigned word/dword comparison
-          * instructions. So flip the sign bit so that the results match.
+         /* There are no unsigned comparison instructions. So flip the sign bit
+          * so that the results match.
           */
-         if(table[func].gt &&
-            ((type.width == 8 && type.sign) ||
-             (type.width != 8 && !type.sign))) {
+         if (table[func].gt && !type.sign) {
             LLVMValueRef msb = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
             a = LLVMBuildXor(builder, a, msb, "");
             b = LLVMBuildXor(builder, b, msb, "");
@@ -384,6 +383,46 @@ lp_build_select(struct lp_build_context *bld,
       mask = LLVMBuildTrunc(bld->builder, mask, LLVMInt1Type(), "");
       res = LLVMBuildSelect(bld->builder, mask, a, b, "");
    }
+   else if (util_cpu_caps.has_sse4_1 &&
+            type.width * type.length == 128 &&
+            !LLVMIsConstant(a) &&
+            !LLVMIsConstant(b) &&
+            !LLVMIsConstant(mask)) {
+      const char *intrinsic;
+      LLVMTypeRef arg_type;
+      LLVMValueRef args[3];
+
+      if (type.width == 64) {
+         intrinsic = "llvm.x86.sse41.blendvpd";
+         arg_type = LLVMVectorType(LLVMDoubleType(), 2);
+      } else if (type.width == 32) {
+         intrinsic = "llvm.x86.sse41.blendvps";
+         arg_type = LLVMVectorType(LLVMFloatType(), 4);
+      } else {
+         intrinsic = "llvm.x86.sse41.pblendvb";
+         arg_type = LLVMVectorType(LLVMInt8Type(), 16);
+      }
+
+      if (arg_type != bld->int_vec_type) {
+         mask = LLVMBuildBitCast(bld->builder, mask, arg_type, "");
+      }
+
+      if (arg_type != bld->vec_type) {
+         a = LLVMBuildBitCast(bld->builder, a, arg_type, "");
+         b = LLVMBuildBitCast(bld->builder, b, arg_type, "");
+      }
+
+      args[0] = b;
+      args[1] = a;
+      args[2] = mask;
+
+      res = lp_build_intrinsic(bld->builder, intrinsic,
+                               arg_type, args, Elements(args));
+
+      if (arg_type != bld->vec_type) {
+         res = LLVMBuildBitCast(bld->builder, res, bld->vec_type, "");
+      }
+   }
    else {
       if(type.floating) {
          LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
index 186f8849b8..7748f8f099 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@@ -427,3 +427,123 @@ lp_build_pack(LLVMBuilderRef builder,
 
    return tmp[0];
 }
+
+
+/**
+ * Truncate or expand the bitwidth.
+ *
+ * NOTE: Getting the right sign flags is crucial here, as we employ some
+ * intrinsics that do saturation.
+ */
+void
+lp_build_resize(LLVMBuilderRef builder,
+                struct lp_type src_type,
+                struct lp_type dst_type,
+                const LLVMValueRef *src, unsigned num_srcs,
+                LLVMValueRef *dst, unsigned num_dsts)
+{
+   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
+   unsigned i;
+
+   /*
+    * We don't support float <-> int conversion here. That must be done
+    * before/after calling this function.
+    */
+   assert(src_type.floating == dst_type.floating);
+
+   /*
+    * We don't support double <-> float conversion yet, although it could be
+    * added with little effort.
+    */
+   assert((!src_type.floating && !dst_type.floating) ||
+          src_type.width == dst_type.width);
+
+   /* We must not loose or gain channels. Only precision */
+   assert(src_type.length * num_srcs == dst_type.length * num_dsts);
+
+   /* We don't support M:N conversion, only 1:N, M:1, or 1:1 */
+   assert(num_srcs == 1 || num_dsts == 1);
+
+   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
+   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
+   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
+   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);
+
+   if (src_type.width > dst_type.width) {
+      /*
+       * Truncate bit width.
+       */
+
+      assert(num_dsts == 1);
+
+      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
+        /*
+         * Register width remains constant -- use vector packing intrinsics
+         */
+
+         tmp[0] = lp_build_pack(builder, src_type, dst_type, TRUE, src, num_srcs);
+      }
+      else {
+         /*
+          * Do it element-wise.
+          */
+
+         assert(src_type.length == dst_type.length);
+         tmp[0] = lp_build_undef(dst_type);
+         for (i = 0; i < dst_type.length; ++i) {
+            LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+            LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, "");
+            val = LLVMBuildTrunc(builder, val, lp_build_elem_type(dst_type), "");
+            tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, "");
+         }
+      }
+   }
+   else if (src_type.width < dst_type.width) {
+      /*
+       * Expand bit width.
+       */
+
+      assert(num_srcs == 1);
+
+      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
+         /*
+          * Register width remains constant -- use vector unpack intrinsics
+          */
+         lp_build_unpack(builder, src_type, dst_type, src[0], tmp, num_dsts);
+      }
+      else {
+         /*
+          * Do it element-wise.
+          */
+
+         assert(src_type.length == dst_type.length);
+         tmp[0] = lp_build_undef(dst_type);
+         for (i = 0; i < dst_type.length; ++i) {
+            LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+            LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, "");
+
+            if (src_type.sign && dst_type.sign) {
+               val = LLVMBuildSExt(builder, val, lp_build_elem_type(dst_type), "");
+            } else {
+               val = LLVMBuildZExt(builder, val, lp_build_elem_type(dst_type), "");
+            }
+            tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, "");
+         }
+      }
+   }
+   else {
+      /*
+       * No-op
+       */
+
+      assert(num_srcs == 1);
+      assert(num_dsts == 1);
+
+      tmp[0] = src[0];
+   }
+
+   for(i = 0; i < num_dsts; ++i)
+      dst[i] = tmp[i];
+}
+
+
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
index 41adeed220..e470082b97 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
@@ -92,4 +92,12 @@ lp_build_pack(LLVMBuilderRef builder,
               const LLVMValueRef *src, unsigned num_srcs);
 
 
+void
+lp_build_resize(LLVMBuilderRef builder,
+                struct lp_type src_type,
+                struct lp_type dst_type,
+                const LLVMValueRef *src, unsigned num_srcs,
+                LLVMValueRef *dst, unsigned num_dsts);
+
+
 #endif /* !LP_BLD_PACK_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.c b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
index 38fd5a39ef..ca36046d22 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_quad.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
@@ -61,8 +61,8 @@ LLVMValueRef
 lp_build_ddx(struct lp_build_context *bld,
              LLVMValueRef a)
 {
-   LLVMValueRef a_left  = lp_build_swizzle1_aos(bld, a, swizzle_left);
-   LLVMValueRef a_right = lp_build_swizzle1_aos(bld, a, swizzle_right);
+   LLVMValueRef a_left  = lp_build_swizzle_aos(bld, a, swizzle_left);
+   LLVMValueRef a_right = lp_build_swizzle_aos(bld, a, swizzle_right);
    return lp_build_sub(bld, a_right, a_left);
 }
 
@@ -71,8 +71,8 @@ LLVMValueRef
 lp_build_ddy(struct lp_build_context *bld,
              LLVMValueRef a)
 {
-   LLVMValueRef a_top    = lp_build_swizzle1_aos(bld, a, swizzle_top);
-   LLVMValueRef a_bottom = lp_build_swizzle1_aos(bld, a, swizzle_bottom);
+   LLVMValueRef a_top    = lp_build_swizzle_aos(bld, a, swizzle_top);
+   LLVMValueRef a_bottom = lp_build_swizzle_aos(bld, a, swizzle_bottom);
    return lp_build_sub(bld, a_bottom, a_top);
 }
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 946c23e317..0fd014ab9b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -40,7 +40,6 @@
 #include "lp_bld_const.h"
 #include "lp_bld_arit.h"
 #include "lp_bld_type.h"
-#include "lp_bld_format.h"
 #include "lp_bld_sample.h"
 
 
@@ -125,73 +124,53 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
 
 
 /**
- * Gather elements from scatter positions in memory into a single vector.
- * Use for fetching texels from a texture.
- * For SSE, typical values are length=4, src_width=32, dst_width=32.
- *
- * @param length length of the offsets
- * @param src_width src element width in bits
- * @param dst_width result element width in bits (src will be expanded to fit)
- * @param base_ptr base pointer, should be a i8 pointer type.
- * @param offsets vector with offsets
- */
-LLVMValueRef
-lp_build_gather(LLVMBuilderRef builder,
-                unsigned length,
-                unsigned src_width,
-                unsigned dst_width,
-                LLVMValueRef base_ptr,
-                LLVMValueRef offsets)
-{
-   LLVMTypeRef src_type = LLVMIntType(src_width);
-   LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
-   LLVMTypeRef dst_elem_type = LLVMIntType(dst_width);
-   LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length);
-   LLVMValueRef res;
-   unsigned i;
-
-   res = LLVMGetUndef(dst_vec_type);
-   for(i = 0; i < length; ++i) {
-      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-      LLVMValueRef elem_offset;
-      LLVMValueRef elem_ptr;
-      LLVMValueRef elem;
-
-      elem_offset = LLVMBuildExtractElement(builder, offsets, index, "");
-      elem_ptr = LLVMBuildGEP(builder, base_ptr, &elem_offset, 1, "");
-      elem_ptr = LLVMBuildBitCast(builder, elem_ptr, src_ptr_type, "");
-      elem = LLVMBuildLoad(builder, elem_ptr, "");
-
-      assert(src_width <= dst_width);
-      if(src_width > dst_width)
-         elem = LLVMBuildTrunc(builder, elem, dst_elem_type, "");
-      if(src_width < dst_width)
-         elem = LLVMBuildZExt(builder, elem, dst_elem_type, "");
-
-      res = LLVMBuildInsertElement(builder, res, elem, index, "");
-   }
-
-   return res;
-}
-
-
-/**
  * Compute the offset of a pixel block.
  *
- * x, y, z, y_stride, z_stride are vectors, and they refer to pixel blocks, as
- * per format description, and not individual pixels.
+ * x, y, z, y_stride, z_stride are vectors, and they refer to pixels.
+ *
+ * Returns the relative offset and i,j sub-block coordinates
  */
-LLVMValueRef
+void
 lp_build_sample_offset(struct lp_build_context *bld,
                        const struct util_format_description *format_desc,
                        LLVMValueRef x,
                        LLVMValueRef y,
                        LLVMValueRef z,
                        LLVMValueRef y_stride,
-                       LLVMValueRef z_stride)
+                       LLVMValueRef z_stride,
+                       LLVMValueRef *out_offset,
+                       LLVMValueRef *out_i,
+                       LLVMValueRef *out_j)
 {
    LLVMValueRef x_stride;
    LLVMValueRef offset;
+   LLVMValueRef i;
+   LLVMValueRef j;
+
+   /*
+    * Describe the coordinates in terms of pixel blocks.
+    *
+    * TODO: pixel blocks are power of two. LLVM should convert rem/div to
+    * bit arithmetic. Verify this.
+    */
+
+   if (format_desc->block.width == 1) {
+      i = bld->zero;
+   }
+   else {
+      LLVMValueRef block_width = lp_build_const_int_vec(bld->type, format_desc->block.width);
+      i = LLVMBuildURem(bld->builder, x, block_width, "");
+      x = LLVMBuildUDiv(bld->builder, x, block_width, "");
+   }
+
+   if (format_desc->block.height == 1) {
+      j = bld->zero;
+   }
+   else {
+      LLVMValueRef block_height = lp_build_const_int_vec(bld->type, format_desc->block.height);
+      j = LLVMBuildURem(bld->builder, y, block_height, "");
+      y = LLVMBuildUDiv(bld->builder, y, block_height, "");
+   }
 
    x_stride = lp_build_const_vec(bld->type, format_desc->block.bits/8);
    offset = lp_build_mul(bld, x, x_stride);
@@ -206,5 +185,7 @@ lp_build_sample_offset(struct lp_build_context *bld,
       offset = lp_build_add(bld, offset, z_offset);
    }
 
-   return offset;
+   *out_offset = offset;
+   *out_i = i;
+   *out_j = j;
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index 51e98ab2f9..5b8f478094 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -146,23 +146,17 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
                         const struct pipe_sampler_state *sampler);
 
 
-LLVMValueRef
-lp_build_gather(LLVMBuilderRef builder,
-                unsigned length,
-                unsigned src_width,
-                unsigned dst_width,
-                LLVMValueRef base_ptr,
-                LLVMValueRef offsets);
-
-
-LLVMValueRef
+void
 lp_build_sample_offset(struct lp_build_context *bld,
                        const struct util_format_description *format_desc,
                        LLVMValueRef x,
                        LLVMValueRef y,
                        LLVMValueRef z,
                        LLVMValueRef y_stride,
-                       LLVMValueRef z_stride);
+                       LLVMValueRef z_stride,
+                       LLVMValueRef *out_offset,
+                       LLVMValueRef *out_i,
+                       LLVMValueRef *out_j);
 
 
 void
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 84c04fe272..1a20d74cac 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -50,8 +50,10 @@
 #include "lp_bld_swizzle.h"
 #include "lp_bld_pack.h"
 #include "lp_bld_flow.h"
+#include "lp_bld_gather.h"
 #include "lp_bld_format.h"
 #include "lp_bld_sample.h"
+#include "lp_bld_quad.h"
 
 
 /**
@@ -264,35 +266,11 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
       }
    }
 
-   /*
-    * Describe the coordinates in terms of pixel blocks.
-    *
-    * TODO: pixel blocks are power of two. LLVM should convert rem/div to
-    * bit arithmetic. Verify this.
-    */
-
-   if (bld->format_desc->block.width == 1) {
-      i = bld->uint_coord_bld.zero;
-   }
-   else {
-      LLVMValueRef block_width = lp_build_const_int_vec(bld->uint_coord_bld.type, bld->format_desc->block.width);
-      i = LLVMBuildURem(bld->builder, x, block_width, "");
-      x = LLVMBuildUDiv(bld->builder, x, block_width, "");
-   }
-
-   if (bld->format_desc->block.height == 1) {
-      j = bld->uint_coord_bld.zero;
-   }
-   else {
-      LLVMValueRef block_height = lp_build_const_int_vec(bld->uint_coord_bld.type, bld->format_desc->block.height);
-      j = LLVMBuildURem(bld->builder, y, block_height, "");
-      y = LLVMBuildUDiv(bld->builder, y, block_height, "");
-   }
-
    /* convert x,y,z coords to linear offset from start of texture, in bytes */
-   offset = lp_build_sample_offset(&bld->uint_coord_bld,
-                                   bld->format_desc,
-                                   x, y, z, y_stride, z_stride);
+   lp_build_sample_offset(&bld->uint_coord_bld,
+                          bld->format_desc,
+                          x, y, z, y_stride, z_stride,
+                          &offset, &i, &j);
 
    if (use_border) {
       /* If we can sample the border color, it means that texcoords may
@@ -344,6 +322,9 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
 }
 
 
+/**
+ * Fetch the texels as <4n x i8> in AoS form.
+ */
 static LLVMValueRef
 lp_build_sample_packed(struct lp_build_sample_context *bld,
                        LLVMValueRef x,
@@ -351,25 +332,46 @@ lp_build_sample_packed(struct lp_build_sample_context *bld,
                        LLVMValueRef y_stride,
                        LLVMValueRef data_array)
 {
-   LLVMValueRef offset;
+   LLVMValueRef offset, i, j;
    LLVMValueRef data_ptr;
+   LLVMValueRef res;
 
-   offset = lp_build_sample_offset(&bld->uint_coord_bld,
-                                   bld->format_desc,
-                                   x, y, NULL, y_stride, NULL);
-
-   assert(bld->format_desc->block.width == 1);
-   assert(bld->format_desc->block.height == 1);
-   assert(bld->format_desc->block.bits <= bld->texel_type.width);
+   /* convert x,y,z coords to linear offset from start of texture, in bytes */
+   lp_build_sample_offset(&bld->uint_coord_bld,
+                          bld->format_desc,
+                          x, y, NULL, y_stride, NULL,
+                          &offset, &i, &j);
 
    /* get pointer to mipmap level 0 data */
    data_ptr = lp_build_get_const_mipmap_level(bld, data_array, 0);
 
-   return lp_build_gather(bld->builder,
-                          bld->texel_type.length,
-                          bld->format_desc->block.bits,
-                          bld->texel_type.width,
-                          data_ptr, offset);
+   if (util_format_is_rgba8_variant(bld->format_desc)) {
+      /* Just fetch the data directly without swizzling */
+      assert(bld->format_desc->block.width == 1);
+      assert(bld->format_desc->block.height == 1);
+      assert(bld->format_desc->block.bits <= bld->texel_type.width);
+
+      res = lp_build_gather(bld->builder,
+                            bld->texel_type.length,
+                            bld->format_desc->block.bits,
+                            bld->texel_type.width,
+                            data_ptr, offset);
+   }
+   else {
+      struct lp_type type;
+
+      assert(bld->texel_type.width == 32);
+
+      memset(&type, 0, sizeof type);
+      type.width = 8;
+      type.length = bld->texel_type.length*4;
+      type.norm = TRUE;
+
+      res = lp_build_fetch_rgba_aos(bld->builder, bld->format_desc, type,
+                                    data_ptr, offset, i, j);
+   }
+
+   return res;
 }
 
 
@@ -817,9 +819,8 @@ lp_build_minify(struct lp_build_sample_context *bld,
 
 /**
  * Generate code to compute texture level of detail (lambda).
- * \param s  vector of texcoord s values
- * \param t  vector of texcoord t values
- * \param r  vector of texcoord r values
+ * \param ddx  partial derivatives of (s, t, r, q) with respect to X
+ * \param ddy  partial derivatives of (s, t, r, q) with respect to Y
  * \param lod_bias  optional float vector with the shader lod bias
  * \param explicit_lod  optional float vector with the explicit lod
  * \param width  scalar int texture width
@@ -831,11 +832,8 @@ lp_build_minify(struct lp_build_sample_context *bld,
  */
 static LLVMValueRef
 lp_build_lod_selector(struct lp_build_sample_context *bld,
-                      LLVMValueRef s,
-                      LLVMValueRef t,
-                      LLVMValueRef r,
-                      const LLVMValueRef *ddx,
-                      const LLVMValueRef *ddy,
+                      const LLVMValueRef ddx[4],
+                      const LLVMValueRef ddy[4],
                       LLVMValueRef lod_bias, /* optional */
                       LLVMValueRef explicit_lod, /* optional */
                       LLVMValueRef width,
@@ -870,14 +868,6 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
          LLVMValueRef dtdx = NULL, dtdy = NULL, drdx = NULL, drdy = NULL;
          LLVMValueRef rho;
 
-         /*
-          * dsdx = abs(s[1] - s[0]);
-          * dsdy = abs(s[2] - s[0]);
-          * dtdx = abs(t[1] - t[0]);
-          * dtdy = abs(t[2] - t[0]);
-          * drdx = abs(r[1] - r[0]);
-          * drdy = abs(r[2] - r[0]);
-          */
          dsdx = LLVMBuildExtractElement(bld->builder, ddx[0], index0, "dsdx");
          dsdx = lp_build_abs(float_bld, dsdx);
          dsdy = LLVMBuildExtractElement(bld->builder, ddy[0], index0, "dsdy");
@@ -1287,7 +1277,7 @@ lp_build_cube_face(struct lp_build_sample_context *bld,
 
 
 /**
- * Generate code to do cube face selection and per-face texcoords.
+ * Generate code to do cube face selection and compute per-face texcoords.
  */
 static void
 lp_build_cube_lookup(struct lp_build_sample_context *bld,
@@ -1411,7 +1401,6 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
          lp_build_endif(&if_ctx2);
          lp_build_flow_scope_end(flow_ctx2);
          lp_build_flow_destroy(flow_ctx2);
-
          *face_s = face_s2;
          *face_t = face_t2;
          *face = face2;
@@ -1457,13 +1446,14 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
    int chan;
 
    if (img_filter == PIPE_TEX_FILTER_NEAREST) {
+      /* sample the first mipmap level */
       lp_build_sample_image_nearest(bld,
                                     width0_vec, height0_vec, depth0_vec,
                                     row_stride0_vec, img_stride0_vec,
                                     data_ptr0, s, t, r, colors0);
 
       if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
-         /* sample the second mipmap level, and interp */
+         /* sample the second mipmap level */
          lp_build_sample_image_nearest(bld,
                                        width1_vec, height1_vec, depth1_vec,
                                        row_stride1_vec, img_stride1_vec,
@@ -1473,13 +1463,14 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
    else {
       assert(img_filter == PIPE_TEX_FILTER_LINEAR);
 
+      /* sample the first mipmap level */
       lp_build_sample_image_linear(bld,
                                    width0_vec, height0_vec, depth0_vec,
                                    row_stride0_vec, img_stride0_vec,
                                    data_ptr0, s, t, r, colors0);
 
       if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
-         /* sample the second mipmap level, and interp */
+         /* sample the second mipmap level */
          lp_build_sample_image_linear(bld,
                                       width1_vec, height1_vec, depth1_vec,
                                       row_stride1_vec, img_stride1_vec,
@@ -1542,6 +1533,7 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
    LLVMValueRef row_stride0_vec = NULL, row_stride1_vec = NULL;
    LLVMValueRef img_stride0_vec = NULL, img_stride1_vec = NULL;
    LLVMValueRef data_ptr0, data_ptr1 = NULL;
+   LLVMValueRef face_ddx[4], face_ddy[4];
 
    /*
    printf("%s mip %d  min %d  mag %d\n", __FUNCTION__,
@@ -1549,6 +1541,30 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
    */
 
    /*
+    * Choose cube face, recompute texcoords and derivatives for the chosen face.
+    */
+   if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+      LLVMValueRef face, face_s, face_t;
+      lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t);
+      s = face_s; /* vec */
+      t = face_t; /* vec */
+      /* use 'r' to indicate cube face */
+      r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */
+
+      /* recompute ddx, ddy using the new (s,t) face texcoords */
+      face_ddx[0] = lp_build_ddx(&bld->coord_bld, s);
+      face_ddx[1] = lp_build_ddx(&bld->coord_bld, t);
+      face_ddx[2] = NULL;
+      face_ddx[3] = NULL;
+      face_ddy[0] = lp_build_ddy(&bld->coord_bld, s);
+      face_ddy[1] = lp_build_ddy(&bld->coord_bld, t);
+      face_ddy[2] = NULL;
+      face_ddy[3] = NULL;
+      ddx = face_ddx;
+      ddy = face_ddy;
+   }
+
+   /*
     * Compute the level of detail (float).
     */
    if (min_filter != mag_filter ||
@@ -1556,7 +1572,7 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
       /* Need to compute lod either to choose mipmap levels or to
        * distinguish between minification/magnification with one mipmap level.
        */
-      lod = lp_build_lod_selector(bld, s, t, r, ddx, ddy,
+      lod = lp_build_lod_selector(bld, ddx, ddy,
                                   lod_bias, explicit_lod,
                                   width, height, depth);
    }
@@ -1566,9 +1582,20 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
     */
    if (mip_filter == PIPE_TEX_MIPFILTER_NONE) {
       /* always use mip level 0 */
-      ilevel0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+      if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+         /* XXX this is a work-around for an apparent bug in LLVM 2.7.
+          * We should be able to set ilevel0 = const(0) but that causes
+          * bad x86 code to be emitted.
+          */
+         lod = lp_build_const_elem(bld->coord_bld.type, 0.0);
+         lp_build_nearest_mip_level(bld, unit, lod, &ilevel0);
+      }
+      else {
+         ilevel0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+      }
    }
    else {
+      assert(lod);
       if (mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
          lp_build_nearest_mip_level(bld, unit, lod, &ilevel0);
       }
@@ -1623,18 +1650,6 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
    }
 
    /*
-    * Choose cube face, recompute per-face texcoords.
-    */
-   if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
-      LLVMValueRef face, face_s, face_t;
-      lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t);
-      s = face_s; /* vec */
-      t = face_t; /* vec */
-      /* use 'r' to indicate cube face */
-      r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */
-   }
-
-   /*
     * Get pointer(s) to image data for mipmap level(s).
     */
    data_ptr0 = lp_build_get_mipmap_level(bld, data_array, ilevel0);
@@ -1712,36 +1727,6 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
 
 
 static void
-lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder,
-                          struct lp_type dst_type,
-                          LLVMValueRef packed,
-                          LLVMValueRef *rgba)
-{
-   LLVMValueRef mask = lp_build_const_int_vec(dst_type, 0xff);
-   unsigned chan;
-
-   /* Decode the input vector components */
-   for (chan = 0; chan < 4; ++chan) {
-      unsigned start = chan*8;
-      unsigned stop = start + 8;
-      LLVMValueRef input;
-
-      input = packed;
-
-      if(start)
-         input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(dst_type, start), "");
-
-      if(stop < 32)
-         input = LLVMBuildAnd(builder, input, mask, "");
-
-      input = lp_build_unsigned_norm_to_float(builder, 8, dst_type, input);
-
-      rgba[chan] = input;
-   }
-}
-
-
-static void
 lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
                               LLVMValueRef s,
                               LLVMValueRef t,
@@ -1935,15 +1920,20 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
     * Convert to SoA and swizzle.
     */
 
-   packed = LLVMBuildBitCast(builder, packed, i32_vec_type, "");
-
    lp_build_rgba8_to_f32_soa(bld->builder,
                              bld->texel_type,
                              packed, unswizzled);
 
-   lp_build_format_swizzle_soa(bld->format_desc,
-                               &bld->texel_bld,
-                               unswizzled, texel_out);
+   if (util_format_is_rgba8_variant(bld->format_desc)) {
+      lp_build_format_swizzle_soa(bld->format_desc,
+                                  &bld->texel_bld,
+                                  unswizzled, texel_out);
+   } else {
+      texel_out[0] = unswizzled[0];
+      texel_out[1] = unswizzled[1];
+      texel_out[2] = unswizzled[2];
+      texel_out[3] = unswizzled[3];
+   }
 
    apply_sampler_swizzle(bld, texel_out);
 }
@@ -2007,6 +1997,8 @@ lp_build_sample_nop(struct lp_build_sample_context *bld,
  * 'texel' will return a vector of four LLVMValueRefs corresponding to
  * R, G, B, A.
  * \param type  vector float type to use for coords, etc.
+ * \param ddx  partial derivatives of (s,t,r,q) with respect to x
+ * \param ddy  partial derivatives of (s,t,r,q) with respect to y
  */
 void
 lp_build_sample_soa(LLVMBuilderRef builder,
@@ -2016,8 +2008,8 @@ lp_build_sample_soa(LLVMBuilderRef builder,
                     unsigned unit,
                     unsigned num_coords,
                     const LLVMValueRef *coords,
-                    const LLVMValueRef *ddx,
-                    const LLVMValueRef *ddy,
+                    const LLVMValueRef ddx[4],
+                    const LLVMValueRef ddy[4],
                     LLVMValueRef lod_bias, /* optional */
                     LLVMValueRef explicit_lod, /* optional */
                     LLVMValueRef texel_out[4])
@@ -2079,7 +2071,8 @@ lp_build_sample_soa(LLVMBuilderRef builder,
       /* For debug: no-op texture sampling */
       lp_build_sample_nop(&bld, texel_out);
    }
-   else if (util_format_is_rgba8_variant(bld.format_desc) &&
+   else if (util_format_fits_8unorm(bld.format_desc) &&
+            bld.format_desc->nr_channels > 1 &&
             static_state->target == PIPE_TEXTURE_2D &&
             static_state->min_img_filter == PIPE_TEX_FILTER_LINEAR &&
             static_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR &&
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
index 3c8a7bc09e..20cf96ca66 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
@@ -110,7 +110,7 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
    /* XXX: SSE3 has PSHUFB which should be better than bitmasks, but forcing
     * using shuffles here actually causes worst results. More investigation is
     * needed. */
-   if (n <= 4) {
+   if (type.width >= 16) {
       /*
        * Shuffle.
        */
@@ -132,7 +132,7 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
        *   YY00 YY00 .... YY00
        *   YYYY YYYY .... YYYY  <= output
        */
-      struct lp_type type4 = type;
+      struct lp_type type4;
       const char shifts[4][2] = {
          { 1,  2},
          {-1,  2},
@@ -147,6 +147,13 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
 
       a = LLVMBuildAnd(bld->builder, a, lp_build_const_mask_aos(type, cond), "");
 
+      /*
+       * Build a type where each element is an integer that cover the four
+       * channels.
+       */
+
+      type4 = type;
+      type4.floating = FALSE;
       type4.width *= 4;
       type4.length /= 4;
 
@@ -176,80 +183,170 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
 
 
 LLVMValueRef
-lp_build_swizzle1_aos(struct lp_build_context *bld,
-                      LLVMValueRef a,
-                      const unsigned char swizzle[4])
+lp_build_swizzle_aos(struct lp_build_context *bld,
+                     LLVMValueRef a,
+                     const unsigned char swizzles[4])
 {
-   const unsigned n = bld->type.length;
+   const struct lp_type type = bld->type;
+   const unsigned n = type.length;
    unsigned i, j;
 
-   if(a == bld->undef || a == bld->zero || a == bld->one)
+   if (swizzles[0] == PIPE_SWIZZLE_RED &&
+       swizzles[1] == PIPE_SWIZZLE_GREEN &&
+       swizzles[2] == PIPE_SWIZZLE_BLUE &&
+       swizzles[3] == PIPE_SWIZZLE_ALPHA) {
       return a;
+   }
 
-   if(swizzle[0] == swizzle[1] && swizzle[1] == swizzle[2] && swizzle[2] == swizzle[3])
-      return lp_build_broadcast_aos(bld, a, swizzle[0]);
+   if (swizzles[0] == swizzles[1] &&
+       swizzles[1] == swizzles[2] &&
+       swizzles[2] == swizzles[3]) {
+      switch (swizzles[0]) {
+      case PIPE_SWIZZLE_RED:
+      case PIPE_SWIZZLE_GREEN:
+      case PIPE_SWIZZLE_BLUE:
+      case PIPE_SWIZZLE_ALPHA:
+         return lp_build_broadcast_aos(bld, a, swizzles[0]);
+      case PIPE_SWIZZLE_ZERO:
+         return bld->zero;
+      case PIPE_SWIZZLE_ONE:
+         return bld->one;
+      default:
+         assert(0);
+         return bld->undef;
+      }
+   }
 
-   {
+   if (type.width >= 16) {
       /*
        * Shuffle.
        */
-      LLVMTypeRef elem_type = LLVMInt32Type();
+      LLVMValueRef undef = LLVMGetUndef(lp_build_elem_type(type));
+      LLVMTypeRef i32t = LLVMInt32Type();
       LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+      LLVMValueRef aux[LP_MAX_VECTOR_LENGTH];
+
+      memset(aux, 0, sizeof aux);
+
+      for(j = 0; j < n; j += 4) {
+         for(i = 0; i < 4; ++i) {
+            unsigned shuffle;
+            switch (swizzles[i]) {
+            default:
+               assert(0);
+               /* fall through */
+            case PIPE_SWIZZLE_RED:
+            case PIPE_SWIZZLE_GREEN:
+            case PIPE_SWIZZLE_BLUE:
+            case PIPE_SWIZZLE_ALPHA:
+               shuffle = j + swizzles[i];
+               break;
+            case PIPE_SWIZZLE_ZERO:
+               shuffle = type.length + 0;
+               if (!aux[0]) {
+                  aux[0] = lp_build_const_elem(type, 0.0);
+               }
+               break;
+            case PIPE_SWIZZLE_ONE:
+               shuffle = type.length + 1;
+               if (!aux[1]) {
+                  aux[1] = lp_build_const_elem(type, 1.0);
+               }
+               break;
+            }
+            shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
+         }
+      }
 
-      for(j = 0; j < n; j += 4)
-         for(i = 0; i < 4; ++i)
-            shuffles[j + i] = LLVMConstInt(elem_type, j + swizzle[i], 0);
-
-      return LLVMBuildShuffleVector(bld->builder, a, bld->undef, LLVMConstVector(shuffles, n), "");
-   }
-}
-
+      for (i = 0; i < n; ++i) {
+         if (!aux[i]) {
+            aux[i] = undef;
+         }
+      }
 
-LLVMValueRef
-lp_build_swizzle2_aos(struct lp_build_context *bld,
-                      LLVMValueRef a,
-                      LLVMValueRef b,
-                      const unsigned char swizzle[4])
-{
-   const unsigned n = bld->type.length;
-   unsigned i, j;
+      return LLVMBuildShuffleVector(bld->builder, a,
+                                    LLVMConstVector(aux, n),
+                                    LLVMConstVector(shuffles, n), "");
+   } else {
+      /*
+       * Bit mask and shifts.
+       *
+       * For example, this will convert BGRA to RGBA by doing
+       *
+       *   rgba = (bgra & 0x00ff0000) >> 16
+       *        | (bgra & 0xff00ff00)
+       *        | (bgra & 0x000000ff) << 16
+       *
+       * This is necessary not only for faster cause, but because X86 backend
+       * will refuse shuffles of <4 x i8> vectors
+       */
+      LLVMValueRef res;
+      struct lp_type type4;
+      boolean cond[4];
+      unsigned chan;
+      int shift;
 
-   if(swizzle[0] < 4 && swizzle[1] < 4 && swizzle[2] < 4 && swizzle[3] < 4)
-      return lp_build_swizzle1_aos(bld, a, swizzle);
+      /*
+       * Start with a mixture of 1 and 0.
+       */
+      for (chan = 0; chan < 4; ++chan) {
+         cond[chan] = swizzles[chan] == PIPE_SWIZZLE_ONE ? TRUE : FALSE;
+      }
+      res = lp_build_select_aos(bld, bld->one, bld->zero, cond);
 
-   if(a == b) {
-      unsigned char swizzle1[4];
-      swizzle1[0] = swizzle[0] % 4;
-      swizzle1[1] = swizzle[1] % 4;
-      swizzle1[2] = swizzle[2] % 4;
-      swizzle1[3] = swizzle[3] % 4;
-      return lp_build_swizzle1_aos(bld, a, swizzle1);
-   }
+      /*
+       * Build a type where each element is an integer that cover the four
+       * channels.
+       */
+      type4 = type;
+      type4.floating = FALSE;
+      type4.width *= 4;
+      type4.length /= 4;
 
-   if(swizzle[0] % 4 == 0 &&
-      swizzle[1] % 4 == 1 &&
-      swizzle[2] % 4 == 2 &&
-      swizzle[3] % 4 == 3) {
-      boolean cond[4];
-      cond[0] = swizzle[0] / 4;
-      cond[1] = swizzle[1] / 4;
-      cond[2] = swizzle[2] / 4;
-      cond[3] = swizzle[3] / 4;
-      return lp_build_select_aos(bld, a, b, cond);
-   }
+      a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(type4), "");
+      res = LLVMBuildBitCast(bld->builder, res, lp_build_vec_type(type4), "");
 
-   {
       /*
-       * Shuffle.
+       * Mask and shift the channels, trying to group as many channels in the
+       * same shift as possible
        */
-      LLVMTypeRef elem_type = LLVMInt32Type();
-      LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
-
-      for(j = 0; j < n; j += 4)
-         for(i = 0; i < 4; ++i)
-            shuffles[j + i] = LLVMConstInt(elem_type, j + (swizzle[i] % 4) + (swizzle[i] / 4 * n), 0);
+      for (shift = -3; shift <= 3; ++shift) {
+         unsigned long long mask = 0;
+
+         assert(type4.width <= sizeof(mask)*8);
+
+         for (chan = 0; chan < 4; ++chan) {
+            /* FIXME: big endian */
+            if (swizzles[chan] < 4 &&
+                chan - swizzles[chan] == shift) {
+               mask |= ((1ULL << type.width) - 1) << (swizzles[chan] * type.width);
+            }
+         }
+
+         if (mask) {
+            LLVMValueRef masked;
+            LLVMValueRef shifted;
+
+            if (0)
+               debug_printf("shift = %i, mask = 0x%08llx\n", shift, mask);
+
+            masked = LLVMBuildAnd(bld->builder, a,
+                                  lp_build_const_int_vec(type4, mask), "");
+            if (shift > 0) {
+               shifted = LLVMBuildShl(bld->builder, masked,
+                                      lp_build_const_int_vec(type4, shift*type.width), "");
+            } else if (shift < 0) {
+               shifted = LLVMBuildLShr(bld->builder, masked,
+                                       lp_build_const_int_vec(type4, -shift*type.width), "");
+            } else {
+               shifted = masked;
+            }
+
+            res = LLVMBuildOr(bld->builder, res, shifted, "");
+         }
+      }
 
-      return LLVMBuildShuffleVector(bld->builder, a, b, LLVMConstVector(shuffles, n), "");
+      return LLVMBuildBitCast(bld->builder, res, lp_build_vec_type(type), "");
    }
 }
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
index 4f4fa777c9..315e1bcb54 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
@@ -68,24 +68,12 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
 /**
  * Swizzle a vector consisting of an array of XYZW structs.
  *
- * @param swizzle is the in [0,4[ range.
+ * @param swizzles is the in [0,4[ range.
  */
 LLVMValueRef
-lp_build_swizzle1_aos(struct lp_build_context *bld,
-                      LLVMValueRef a,
-                      const unsigned char swizzle[4]);
-
-
-/**
- * Swizzle two vector consisting of an array of XYZW structs.
- *
- * @param swizzle is the in [0,8[ range. Values in [4,8[ range refer to b.
- */
-LLVMValueRef
-lp_build_swizzle2_aos(struct lp_build_context *bld,
-                      LLVMValueRef a,
-                      LLVMValueRef b,
-                      const unsigned char swizzle[4]);
+lp_build_swizzle_aos(struct lp_build_context *bld,
+                     LLVMValueRef a,
+                     const unsigned char swizzles[4]);
 
 
 LLVMValueRef
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index dec7556138..21236839fb 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -49,6 +49,7 @@
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
 #include "lp_bld_arit.h"
+#include "lp_bld_gather.h"
 #include "lp_bld_logic.h"
 #include "lp_bld_swizzle.h"
 #include "lp_bld_flow.h"
@@ -132,10 +133,14 @@ struct lp_build_tgsi_soa_context
    LLVMValueRef addr[LP_MAX_TGSI_ADDRS][NUM_CHANNELS];
    LLVMValueRef preds[LP_MAX_TGSI_PREDS][NUM_CHANNELS];
 
-   /* we allocate an array of temps if we have indirect
-    * addressing and then the temps above is unused */
+   /* We allocate/use this array of temps if (1 << TGSI_FILE_TEMPORARY) is
+    * set in the indirect_files field.
+    * The temps[] array above is unused then.
+    */
    LLVMValueRef temps_array;
-   boolean has_indirect_addressing;
+
+   /** bitmask indicating which register files are accessed indirectly */
+   unsigned indirect_files;
 
    struct lp_build_mask_context *mask;
    struct lp_exec_mask exec_mask;
@@ -404,25 +409,92 @@ static void lp_exec_mask_endsub(struct lp_exec_mask *mask, int *pc)
    lp_exec_mask_update(mask);
 }
 
+
+/**
+ * Return pointer to a temporary register channel (src or dest).
+ * Note that indirect addressing cannot be handled here.
+ * \param index  which temporary register
+ * \param chan  which channel of the temp register.
+ */
 static LLVMValueRef
 get_temp_ptr(struct lp_build_tgsi_soa_context *bld,
              unsigned index,
-             unsigned chan,
-             boolean is_indirect,
-             LLVMValueRef addr)
+             unsigned chan)
 {
    assert(chan < 4);
-   if (!bld->has_indirect_addressing) {
-      return bld->temps[index][chan];
-   } else {
-      LLVMValueRef lindex =
-         LLVMConstInt(LLVMInt32Type(), index * 4 + chan, 0);
-      if (is_indirect)
-         lindex = lp_build_add(&bld->base, lindex, addr);
+   if (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
+      LLVMValueRef lindex = lp_build_const_int32(index * 4 + chan);
       return LLVMBuildGEP(bld->base.builder, bld->temps_array, &lindex, 1, "");
    }
+   else {
+      return bld->temps[index][chan];
+   }
 }
 
+
+/**
+ * Gather vector.
+ * XXX the lp_build_gather() function should be capable of doing this
+ * with a little work.
+ */
+static LLVMValueRef
+build_gather(struct lp_build_tgsi_soa_context *bld,
+             LLVMValueRef base_ptr,
+             LLVMValueRef indexes)
+{
+   LLVMValueRef res = bld->base.undef;
+   unsigned i;
+
+   /*
+    * Loop over elements of index_vec, load scalar value, insert it into 'res'.
+    */
+   for (i = 0; i < bld->base.type.length; i++) {
+      LLVMValueRef ii = LLVMConstInt(LLVMInt32Type(), i, 0);
+      LLVMValueRef index = LLVMBuildExtractElement(bld->base.builder,
+                                                   indexes, ii, "");
+      LLVMValueRef scalar_ptr = LLVMBuildGEP(bld->base.builder, base_ptr,
+                                             &index, 1, "");
+      LLVMValueRef scalar = LLVMBuildLoad(bld->base.builder, scalar_ptr, "");
+
+      res = LLVMBuildInsertElement(bld->base.builder, res, scalar, ii, "");
+   }
+
+   return res;
+}
+
+
+/**
+ * Read the current value of the ADDR register, convert the floats to
+ * ints, multiply by four and return the vector of offsets.
+ * The offsets will be used to index into the constant buffer or
+ * temporary register file.
+ */
+static LLVMValueRef
+get_indirect_offsets(struct lp_build_tgsi_soa_context *bld,
+                     const struct tgsi_src_register *indirect_reg)
+{
+   /* always use X component of address register */
+   const int x = indirect_reg->SwizzleX;
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->base.type);
+   uint swizzle = tgsi_util_get_src_register_swizzle(indirect_reg, x);
+   LLVMValueRef vec4 = lp_build_const_int_vec(bld->int_bld.type, 4); 
+   LLVMValueRef addr_vec;
+
+   addr_vec = LLVMBuildLoad(bld->base.builder,
+                            bld->addr[indirect_reg->Index][swizzle],
+                            "load addr reg");
+
+   /* for indexing we want integers */
+   addr_vec = LLVMBuildFPToSI(bld->base.builder, addr_vec,
+                              int_vec_type, "");
+
+   /* addr_vec = addr_vec * 4 */
+   addr_vec = lp_build_mul(&bld->base, addr_vec, vec4);
+
+   return addr_vec;
+}
+
+
 /**
  * Register fetch.
  */
@@ -430,14 +502,14 @@ static LLVMValueRef
 emit_fetch(
    struct lp_build_tgsi_soa_context *bld,
    const struct tgsi_full_instruction *inst,
-   unsigned index,
+   unsigned src_op,
    const unsigned chan_index )
 {
-   const struct tgsi_full_src_register *reg = &inst->Src[index];
+   const struct tgsi_full_src_register *reg = &inst->Src[src_op];
    const unsigned swizzle =
       tgsi_util_get_full_src_register_swizzle(reg, chan_index);
    LLVMValueRef res;
-   LLVMValueRef addr = NULL;
+   LLVMValueRef addr_vec = NULL;
 
    if (swizzle > 3) {
       assert(0 && "invalid swizzle in emit_fetch()");
@@ -445,32 +517,33 @@ emit_fetch(
    }
 
    if (reg->Register.Indirect) {
-      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->base.type);
-      unsigned swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, chan_index );
-      addr = LLVMBuildLoad(bld->base.builder,
-                           bld->addr[reg->Indirect.Index][swizzle],
-                           "");
-      /* for indexing we want integers */
-      addr = LLVMBuildFPToSI(bld->base.builder, addr,
-                             int_vec_type, "");
-      addr = LLVMBuildExtractElement(bld->base.builder,
-                                     addr, LLVMConstInt(LLVMInt32Type(), 0, 0),
-                                     "");
-      addr = lp_build_mul(&bld->base, addr, LLVMConstInt(LLVMInt32Type(), 4, 0));
+      assert(bld->indirect_files);
+      addr_vec = get_indirect_offsets(bld, &reg->Indirect);
    }
 
    switch (reg->Register.File) {
    case TGSI_FILE_CONSTANT:
-      {
-         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(),
-                                           reg->Register.Index*4 + swizzle, 0);
+      if (reg->Register.Indirect) {
+         LLVMValueRef index_vec;  /* index into the const buffer */
+
+         assert(bld->indirect_files & (1 << TGSI_FILE_CONSTANT));
+
+         /* index_vec = broadcast(reg->Register.Index * 4 + swizzle) */
+         index_vec = lp_build_const_int_vec(bld->int_bld.type,
+                                            reg->Register.Index * 4 + swizzle);
+
+         /* index_vec = index_vec + addr_vec */
+         index_vec = lp_build_add(&bld->base, index_vec, addr_vec);
+
+         /* Gather values from the constant buffer */
+         res = build_gather(bld, bld->consts_ptr, index_vec);
+      }
+      else {
+         LLVMValueRef index;  /* index into the const buffer */
          LLVMValueRef scalar, scalar_ptr;
 
-         if (reg->Register.Indirect) {
-            /*lp_build_printf(bld->base.builder,
-              "\taddr = %d\n", addr);*/
-            index = lp_build_add(&bld->base, index, addr);
-         }
+         index = lp_build_const_int32(reg->Register.Index*4 + swizzle);
+
          scalar_ptr = LLVMBuildGEP(bld->base.builder, bld->consts_ptr,
                                    &index, 1, "");
          scalar = LLVMBuildLoad(bld->base.builder, scalar_ptr, "");
@@ -490,13 +563,38 @@ emit_fetch(
       break;
 
    case TGSI_FILE_TEMPORARY:
-      {
-         LLVMValueRef temp_ptr = get_temp_ptr(bld, reg->Register.Index,
-                                              swizzle,
-                                              reg->Register.Indirect,
-                                              addr);
+      if (reg->Register.Indirect) {
+         LLVMValueRef vec_len =
+            lp_build_const_int_vec(bld->int_bld.type, bld->base.type.length);
+         LLVMValueRef index_vec;  /* index into the const buffer */
+         LLVMValueRef temps_array;
+         LLVMTypeRef float4_ptr_type;
+
+         assert(bld->indirect_files & (1 << TGSI_FILE_TEMPORARY));
+
+         /* index_vec = broadcast(reg->Register.Index * 4 + swizzle) */
+         index_vec = lp_build_const_int_vec(bld->int_bld.type,
+                                            reg->Register.Index * 4 + swizzle);
+
+         /* index_vec += addr_vec */
+         index_vec = lp_build_add(&bld->int_bld, index_vec, addr_vec);
+
+         /* index_vec *= vector_length */
+         index_vec = lp_build_mul(&bld->int_bld, index_vec, vec_len);
+
+         /* cast temps_array pointer to float* */
+         float4_ptr_type = LLVMPointerType(LLVMFloatType(), 0);
+         temps_array = LLVMBuildBitCast(bld->int_bld.builder, bld->temps_array,
+                                        float4_ptr_type, "");
+
+         /* Gather values from the temporary register array */
+         res = build_gather(bld, temps_array, index_vec);
+      }
+      else {
+         LLVMValueRef temp_ptr;
+         temp_ptr = get_temp_ptr(bld, reg->Register.Index, swizzle);
          res = LLVMBuildLoad(bld->base.builder, temp_ptr, "");
-         if(!res)
+         if (!res)
             return bld->base.undef;
       }
       break;
@@ -660,8 +758,12 @@ emit_store(
    }
 
    if (reg->Register.Indirect) {
+      /* XXX use get_indirect_offsets() here eventually */
       LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->base.type);
       unsigned swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, chan_index );
+
+      assert(bld->indirect_files);
+
       addr = LLVMBuildLoad(bld->base.builder,
                            bld->addr[reg->Indirect.Index][swizzle],
                            "");
@@ -680,14 +782,18 @@ emit_store(
                          bld->outputs[reg->Register.Index][chan_index]);
       break;
 
-   case TGSI_FILE_TEMPORARY: {
-      LLVMValueRef temp_ptr = get_temp_ptr(bld, reg->Register.Index,
-                                           chan_index,
-                                           reg->Register.Indirect,
-                                           addr);
-      lp_exec_mask_store(&bld->exec_mask, pred, value, temp_ptr);
+   case TGSI_FILE_TEMPORARY:
+      if (reg->Register.Indirect) {
+         /* XXX not done yet */
+         debug_printf("WARNING: LLVM scatter store of temp regs"
+                      " not implemented\n");
+      }
+      else {
+         LLVMValueRef temp_ptr = get_temp_ptr(bld, reg->Register.Index,
+                                              chan_index);
+         lp_exec_mask_store(&bld->exec_mask, pred, value, temp_ptr);
+      }
       break;
-   }
 
    case TGSI_FILE_ADDRESS:
       lp_exec_mask_store(&bld->exec_mask, pred, value,
@@ -905,7 +1011,7 @@ emit_declaration(
       switch (decl->Declaration.File) {
       case TGSI_FILE_TEMPORARY:
          assert(idx < LP_MAX_TGSI_TEMPS);
-         if (bld->has_indirect_addressing) {
+         if (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
             LLVMValueRef array_size = LLVMConstInt(LLVMInt32Type(),
                                                    last*4 + 4, 0);
             bld->temps_array = lp_build_array_alloca(bld->base.builder,
@@ -1929,8 +2035,7 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
    bld.outputs = outputs;
    bld.consts_ptr = consts_ptr;
    bld.sampler = sampler;
-   bld.has_indirect_addressing = info->opcode_count[TGSI_OPCODE_ARR] > 0 ||
-                                 info->opcode_count[TGSI_OPCODE_ARL] > 0;
+   bld.indirect_files = info->indirect_files;
    bld.instructions = (struct tgsi_full_instruction *)
                       MALLOC( LP_MAX_INSTRUCTIONS * sizeof(struct tgsi_full_instruction) );
    bld.max_instructions = LP_MAX_INSTRUCTIONS;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.h b/src/gallium/auxiliary/gallivm/lp_bld_type.h
index df77ef2155..3ffe916f8e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.h
@@ -316,6 +316,54 @@ LLVMTypeRef
 lp_build_int32_vec4_type(void);
 
 
+static INLINE struct lp_type
+lp_float32_vec4_type(void)
+{
+   struct lp_type type;
+
+   memset(&type, 0, sizeof(type));
+   type.floating = TRUE;
+   type.sign = TRUE;
+   type.norm = FALSE;
+   type.width = 32;
+   type.length = 4;
+
+   return type;
+}
+
+
+static INLINE struct lp_type
+lp_int32_vec4_type(void)
+{
+   struct lp_type type;
+
+   memset(&type, 0, sizeof(type));
+   type.floating = FALSE;
+   type.sign = TRUE;
+   type.norm = FALSE;
+   type.width = 32;
+   type.length = 4;
+
+   return type;
+}
+
+
+static INLINE struct lp_type
+lp_unorm8_vec4_type(void)
+{
+   struct lp_type type;
+
+   memset(&type, 0, sizeof(type));
+   type.floating = FALSE;
+   type.sign = FALSE;
+   type.norm = TRUE;
+   type.width = 8;
+   type.length = 4;
+
+   return type;
+}
+
+
 struct lp_type
 lp_uint_type(struct lp_type type);