Diffstat (limited to 'src/gallium/auxiliary/gallivm')
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld.h (renamed from src/gallium/auxiliary/gallivm/lp_bld_alpha.h)  29
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_alpha.c  63
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_arit.c  299
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_arit.h  6
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_blend.h  107
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_blend_aos.c  360
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_blend_logicop.c  109
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_blend_soa.c  298
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_const.c  28
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_const.h  10
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_conv.c  34
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_conv.h  2
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_debug.h  2
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_depth.c  213
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_flow.c  132
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_flow.h  23
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_format.h  51
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_format_aos.c  264
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_format_query.c  72
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_format_soa.c  230
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_init.c (renamed from src/gallium/auxiliary/gallivm/lp_bld_init.cpp)  57
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_init.h  16
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_interp.c  408
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_interp.h  96
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_intr.h  2
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_logic.c  167
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_logic.h  6
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_pack.c  21
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_pack.h  2
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_printf.c  121
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_printf.h (renamed from src/gallium/auxiliary/gallivm/lp_bld_depth.h)  40
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_sample.c  93
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_sample.h  35
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c  1459
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_struct.h  2
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_swizzle.c  4
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_swizzle.h  2
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_tgsi.h  6
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c  464
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_type.c  13
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_type.h  54
41 files changed, 2766 insertions, 2634 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_alpha.h b/src/gallium/auxiliary/gallivm/lp_bld.h
index 634575670d..2fa682f400 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_alpha.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld.h
@@ -1,6 +1,6 @@
/**************************************************************************
*
- * Copyright 2009 VMware, Inc.
+ * Copyright 2010 VMware, Inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
@@ -26,29 +26,22 @@
**************************************************************************/
/**
- * Alpha testing to LLVM IR translation.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
+ * @file
+ * Wrapper for LLVM header file #includes.
*/
-#ifndef LP_BLD_ALPHA_H
-#define LP_BLD_ALPHA_H
+#ifndef LP_BLD_H
+#define LP_BLD_H
-#include <llvm-c/Core.h>
-struct pipe_alpha_state;
-struct lp_type;
-struct lp_build_mask_context;
+#include <llvm-c/Core.h>
-void
-lp_build_alpha_test(LLVMBuilderRef builder,
- const struct pipe_alpha_state *state,
- struct lp_type type,
- struct lp_build_mask_context *mask,
- LLVMValueRef alpha,
- LLVMValueRef ref);
+/** Ensure HAVE_LLVM is set to avoid #ifdef HAVE_LLVM everywhere */
+#ifndef HAVE_LLVM
+#error "HAVE_LLVM should be set with LLVM's version number, e.g. (0x0207 for 2.7)"
+#endif
-#endif /* !LP_BLD_ALPHA_H */
+#endif /* LP_BLD_H */
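
The new lp_bld.h requires the build system to define HAVE_LLVM as a hex-encoded LLVM version. A minimal sketch of how code can gate on that value, assuming the 0x0207-for-2.7 encoding described in the #error message (the program below is illustrative, not part of this change):

#include <stdio.h>

#ifndef HAVE_LLVM
#define HAVE_LLVM 0x0207   /* normally injected by the build system */
#endif

int main(void)
{
#if HAVE_LLVM >= 0x0207
   /* paths that need LLVM 2.7 or newer */
   printf("building against LLVM %d.%d\n", HAVE_LLVM >> 8, HAVE_LLVM & 0xff);
#else
   /* fallbacks for older LLVM releases */
   printf("older LLVM\n");
#endif
   return 0;
}
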
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_alpha.c b/src/gallium/auxiliary/gallivm/lp_bld_alpha.c
deleted file mode 100644
index 7245730350..0000000000
--- a/src/gallium/auxiliary/gallivm/lp_bld_alpha.c
+++ /dev/null
@@ -1,63 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * Alpha testing to LLVM IR translation.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-#include "pipe/p_state.h"
-
-#include "lp_bld_type.h"
-#include "lp_bld_const.h"
-#include "lp_bld_logic.h"
-#include "lp_bld_flow.h"
-#include "lp_bld_debug.h"
-#include "lp_bld_alpha.h"
-
-
-void
-lp_build_alpha_test(LLVMBuilderRef builder,
- const struct pipe_alpha_state *state,
- struct lp_type type,
- struct lp_build_mask_context *mask,
- LLVMValueRef alpha,
- LLVMValueRef ref)
-{
- struct lp_build_context bld;
-
- lp_build_context_init(&bld, builder, type);
-
- if(state->enabled) {
- LLVMValueRef test = lp_build_cmp(&bld, state->func, alpha, ref);
-
- lp_build_name(test, "alpha_mask");
-
- lp_build_mask_update(mask, test);
- }
-}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 32f9e5201c..20ae958714 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -232,6 +232,37 @@ lp_build_add(struct lp_build_context *bld,
}
+/** Return the sum of the elements of a */
+LLVMValueRef
+lp_build_sum_vector(struct lp_build_context *bld,
+ LLVMValueRef a)
+{
+ const struct lp_type type = bld->type;
+ LLVMValueRef index, res;
+ int i;
+
+ if (a == bld->zero)
+ return bld->zero;
+ if (a == bld->undef)
+ return bld->undef;
+ assert(type.length > 1);
+
+ assert(!bld->type.norm);
+
+ index = LLVMConstInt(LLVMInt32Type(), 0, 0);
+ res = LLVMBuildExtractElement(bld->builder, a, index, "");
+
+ for (i = 1; i < type.length; i++) {
+ index = LLVMConstInt(LLVMInt32Type(), i, 0);
+ res = LLVMBuildAdd(bld->builder, res,
+ LLVMBuildExtractElement(bld->builder, a, index, ""),
+ "");
+ }
+
+ return res;
+}
+
+
/**
* Generate a - b
*/
@@ -330,12 +361,12 @@ lp_build_mul_u8n(LLVMBuilderRef builder,
LLVMValueRef c8;
LLVMValueRef ab;
- c8 = lp_build_int_const_scalar(i16_type, 8);
+ c8 = lp_build_const_int_vec(i16_type, 8);
#if 0
/* a*b/255 ~= (a*(b + 1)) >> 256 */
- b = LLVMBuildAdd(builder, b, lp_build_int_const_scalar(i16_type, 1), "");
+ b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(i16_type, 1), "");
ab = LLVMBuildMul(builder, a, b, "");
#else
@@ -343,7 +374,7 @@ lp_build_mul_u8n(LLVMBuilderRef builder,
/* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
ab = LLVMBuildMul(builder, a, b, "");
ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
- ab = LLVMBuildAdd(builder, ab, lp_build_int_const_scalar(i16_type, 0x80), "");
+ ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(i16_type, 0x80), "");
#endif
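
The hunk above only renames the constant helpers; the fixed-point multiply trick itself is unchanged. A scalar C sketch of the ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 identity used here (illustrative, not part of the change):

#include <stdint.h>
#include <stdio.h>

/* Approximate (a*b)/255 with rounding, using only shifts and adds.
 * This mirrors the identity used by lp_build_mul_u8n for normalized
 * 8-bit multiplies, applied here to a single scalar value. */
static uint8_t mul_u8_norm(uint8_t a, uint8_t b)
{
   uint32_t ab = (uint32_t)a * b;
   return (uint8_t)((ab + (ab >> 8) + 0x80) >> 8);
}

int main(void)
{
   /* 255 acts as "1.0", so 255*255 stays 255 and 128*255 stays 128 */
   printf("%u %u\n", mul_u8_norm(255, 255), mul_u8_norm(128, 255));
   return 0;
}
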
@@ -398,7 +429,7 @@ lp_build_mul(struct lp_build_context *bld,
}
if(type.fixed)
- shift = lp_build_int_const_scalar(type, type.width/2);
+ shift = lp_build_const_int_vec(type, type.width/2);
else
shift = NULL;
@@ -460,7 +491,7 @@ lp_build_mul_imm(struct lp_build_context *bld,
* for Inf and NaN.
*/
unsigned mantissa = lp_mantissa(bld->type);
- factor = lp_build_int_const_scalar(bld->type, (unsigned long long)shift << mantissa);
+ factor = lp_build_const_int_vec(bld->type, (unsigned long long)shift << mantissa);
a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
a = LLVMBuildAdd(bld->builder, a, factor, "");
a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
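
The constant built above feeds the power-of-two path of lp_build_mul_imm, which scales a float by 2^shift by adding shift << mantissa to its bit pattern; as the surrounding comment notes, this is wrong for Inf and NaN. A scalar C sketch of the same trick, for illustration only:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Multiply a finite float by 2^shift by bumping its biased exponent.
 * Mirrors the factor = shift << mantissa trick in lp_build_mul_imm;
 * it does not handle Inf, NaN, zero or denormals. */
static float mul_pow2(float x, int shift)
{
   uint32_t bits;
   memcpy(&bits, &x, sizeof bits);   /* bitcast float -> int */
   bits += (uint32_t)shift << 23;    /* 23 = mantissa bits of a float */
   memcpy(&x, &bits, sizeof x);      /* bitcast int -> float */
   return x;
}

int main(void)
{
   printf("%g %g\n", mul_pow2(3.0f, 4), mul_pow2(3.0f, -1)); /* 48 1.5 */
   return 0;
}
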
@@ -468,12 +499,12 @@ lp_build_mul_imm(struct lp_build_context *bld,
#endif
}
else {
- factor = lp_build_const_scalar(bld->type, shift);
+ factor = lp_build_const_vec(bld->type, shift);
return LLVMBuildShl(bld->builder, a, factor, "");
}
}
- factor = lp_build_const_scalar(bld->type, (double)b);
+ factor = lp_build_const_vec(bld->type, (double)b);
return lp_build_mul(bld, a, factor);
}
@@ -536,7 +567,7 @@ lp_build_lerp(struct lp_build_context *bld,
* but it will be wrong for other uses. Basically we need a more
* powerful lp_type, capable of further distinguishing the values
* interpretation from the value storage. */
- res = LLVMBuildAnd(bld->builder, res, lp_build_int_const_scalar(bld->type, (1 << bld->type.width/2) - 1), "");
+ res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), "");
return res;
}
@@ -644,13 +675,26 @@ lp_build_abs(struct lp_build_context *bld,
if(type.floating) {
/* Mask out the sign bit */
- LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
- unsigned long long absMask = ~(1ULL << (type.width - 1));
- LLVMValueRef mask = lp_build_int_const_scalar(type, ((unsigned long long) absMask));
- a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
- a = LLVMBuildAnd(bld->builder, a, mask, "");
- a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
- return a;
+ if (type.length == 1) {
+ LLVMTypeRef int_type = LLVMIntType(type.width);
+ LLVMTypeRef float_type = LLVMFloatType();
+ unsigned long long absMask = ~(1ULL << (type.width - 1));
+ LLVMValueRef mask = LLVMConstInt(int_type, absMask, 0);
+ a = LLVMBuildBitCast(bld->builder, a, int_type, "");
+ a = LLVMBuildAnd(bld->builder, a, mask, "");
+ a = LLVMBuildBitCast(bld->builder, a, float_type, "");
+ return a;
+ }
+ else {
+ /* vector of floats */
+ LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+ unsigned long long absMask = ~(1ULL << (type.width - 1));
+ LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask));
+ a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+ a = LLVMBuildAnd(bld->builder, a, mask, "");
+ a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
+ return a;
+ }
}
if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
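
The new scalar branch applies the same sign-bit masking as the existing vector path, just with scalar LLVM types. In plain C the idea looks like this (a sketch, not from the patch):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* fabsf by clearing the sign bit, matching the absMask = ~(1 << 31)
 * constant built in lp_build_abs for 32-bit floats. */
static float abs_via_mask(float x)
{
   uint32_t bits;
   memcpy(&bits, &x, sizeof bits);
   bits &= ~(1u << 31);              /* drop the sign bit */
   memcpy(&x, &bits, sizeof x);
   return x;
}

int main(void)
{
   printf("%g %g\n", abs_via_mask(-2.5f), abs_via_mask(2.5f)); /* 2.5 2.5 */
   return 0;
}
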
@@ -676,12 +720,12 @@ lp_build_negate(struct lp_build_context *bld,
}
+/** Return -1, 0 or +1 depending on the sign of a */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
LLVMValueRef a)
{
const struct lp_type type = bld->type;
- LLVMTypeRef vec_type = lp_build_vec_type(type);
LLVMValueRef cond;
LLVMValueRef res;
@@ -691,27 +735,42 @@ lp_build_sgn(struct lp_build_context *bld,
res = bld->one;
}
else if(type.floating) {
- /* Take the sign bit and add it to 1 constant */
- LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
- LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+ LLVMTypeRef vec_type;
+ LLVMTypeRef int_type;
+ LLVMValueRef mask;
LLVMValueRef sign;
LLVMValueRef one;
- sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+ unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
+
+ if (type.length == 1) {
+ int_type = lp_build_int_elem_type(type);
+ vec_type = lp_build_elem_type(type);
+ mask = LLVMConstInt(int_type, maskBit, 0);
+ }
+ else {
+ /* vector */
+ int_type = lp_build_int_vec_type(type);
+ vec_type = lp_build_vec_type(type);
+ mask = lp_build_const_int_vec(type, maskBit);
+ }
+
+ /* Take the sign bit and add it to 1 constant */
+ sign = LLVMBuildBitCast(bld->builder, a, int_type, "");
sign = LLVMBuildAnd(bld->builder, sign, mask, "");
- one = LLVMConstBitCast(bld->one, int_vec_type);
+ one = LLVMConstBitCast(bld->one, int_type);
res = LLVMBuildOr(bld->builder, sign, one, "");
res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
}
else
{
- LLVMValueRef minus_one = lp_build_const_scalar(type, -1.0);
+ LLVMValueRef minus_one = lp_build_const_vec(type, -1.0);
cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
res = lp_build_select(bld, cond, bld->one, minus_one);
}
/* Handle zero */
cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
- res = lp_build_select(bld, cond, bld->zero, bld->one);
+ res = lp_build_select(bld, cond, bld->zero, res);
return res;
}
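
Besides handling scalar types, the hunk above fixes the zero case: the final select now preserves the previously computed res instead of always yielding one. The floating-point branch builds sign(a) by OR-ing a's sign bit into the constant 1.0; a scalar C sketch of that idea (illustrative only):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* sign(x) for 32-bit floats: copy x's sign bit onto 1.0f, then
 * map exact zero to 0.0f, mirroring the lp_build_sgn logic. */
static float sgn(float x)
{
   uint32_t xi, one = 0x3f800000u;   /* bit pattern of 1.0f */
   float res;
   memcpy(&xi, &x, sizeof xi);
   one |= xi & 0x80000000u;          /* take the sign bit of x */
   memcpy(&res, &one, sizeof res);
   return x == 0.0f ? 0.0f : res;    /* handle zero */
}

int main(void)
{
   printf("%g %g %g\n", sgn(-3.5f), sgn(0.0f), sgn(7.0f)); /* -1 0 1 */
   return 0;
}
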
@@ -730,8 +789,8 @@ lp_build_set_sign(struct lp_build_context *bld,
const struct lp_type type = bld->type;
LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
LLVMTypeRef vec_type = lp_build_vec_type(type);
- LLVMValueRef shift = lp_build_int_const_scalar(type, type.width - 1);
- LLVMValueRef mask = lp_build_int_const_scalar(type,
+ LLVMValueRef shift = lp_build_const_int_vec(type, type.width - 1);
+ LLVMValueRef mask = lp_build_const_int_vec(type,
~((unsigned long long) 1 << (type.width - 1)));
LLVMValueRef val, res;
@@ -753,7 +812,7 @@ lp_build_set_sign(struct lp_build_context *bld,
/**
- * Convert vector of int to vector of float.
+ * Convert vector of (or scalar) int to vector of (or scalar) float.
*/
LLVMValueRef
lp_build_int_to_float(struct lp_build_context *bld,
@@ -764,7 +823,11 @@ lp_build_int_to_float(struct lp_build_context *bld,
assert(type.floating);
/*assert(lp_check_value(type, a));*/
- {
+ if (type.length == 1) {
+ LLVMTypeRef float_type = LLVMFloatType();
+ return LLVMBuildSIToFP(bld->builder, a, float_type, "");
+ }
+ else {
LLVMTypeRef vec_type = lp_build_vec_type(type);
/*LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);*/
LLVMValueRef res;
@@ -866,6 +929,13 @@ lp_build_floor(struct lp_build_context *bld,
assert(type.floating);
+ if (type.length == 1) {
+ LLVMValueRef res;
+ res = lp_build_ifloor(bld, a);
+ res = LLVMBuildSIToFP(bld->builder, res, LLVMFloatType(), "");
+ return res;
+ }
+
if(util_cpu_caps.has_sse4_1)
return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
else {
@@ -921,15 +991,24 @@ lp_build_itrunc(struct lp_build_context *bld,
LLVMValueRef a)
{
const struct lp_type type = bld->type;
- LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
assert(type.floating);
- assert(lp_check_value(type, a));
- return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
+ if (type.length == 1) {
+ LLVMTypeRef int_type = LLVMIntType(type.width);
+ return LLVMBuildFPToSI(bld->builder, a, int_type, "");
+ }
+ else {
+ LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+ assert(lp_check_value(type, a));
+ return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
+ }
}
+/**
+ * Convert float[] to int[] with round().
+ */
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
LLVMValueRef a)
@@ -939,6 +1018,15 @@ lp_build_iround(struct lp_build_context *bld,
LLVMValueRef res;
assert(type.floating);
+
+ if (type.length == 1) {
+ /* scalar float to int */
+ LLVMTypeRef int_type = LLVMIntType(type.width);
+ /* XXX we want rounding here! */
+ res = LLVMBuildFPToSI(bld->builder, a, int_type, "");
+ return res;
+ }
+
assert(lp_check_value(type, a));
if(util_cpu_caps.has_sse4_1) {
@@ -946,7 +1034,7 @@ lp_build_iround(struct lp_build_context *bld,
}
else {
LLVMTypeRef vec_type = lp_build_vec_type(type);
- LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+ LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
LLVMValueRef sign;
LLVMValueRef half;
@@ -955,7 +1043,7 @@ lp_build_iround(struct lp_build_context *bld,
sign = LLVMBuildAnd(bld->builder, sign, mask, "");
/* sign * 0.5 */
- half = lp_build_const_scalar(type, 0.5);
+ half = lp_build_const_vec(type, 0.5);
half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
half = LLVMBuildOr(bld->builder, sign, half, "");
half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
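
The non-SSE4.1 path above rounds by OR-ing the sign of a onto 0.5, adding the result, and truncating. A scalar C equivalent of that idea (illustrative only):

#include <math.h>
#include <stdio.h>

/* Round half away from zero via trunc(a + copysign(0.5, a)),
 * the same idea lp_build_iround uses when SSE4.1 rounding is absent. */
static int iround(float a)
{
   return (int)truncf(a + copysignf(0.5f, a));
}

int main(void)
{
   printf("%d %d %d\n", iround(2.5f), iround(-2.5f), iround(1.2f)); /* 3 -3 1 */
   return 0;
}
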
@@ -981,6 +1069,14 @@ lp_build_ifloor(struct lp_build_context *bld,
LLVMValueRef res;
assert(type.floating);
+
+ if (type.length == 1) {
+ /* scalar float to int */
+ LLVMTypeRef int_type = LLVMIntType(type.width);
+ res = LLVMBuildFPToSI(bld->builder, a, int_type, "");
+ return res;
+ }
+
assert(lp_check_value(type, a));
if(util_cpu_caps.has_sse4_1) {
@@ -990,18 +1086,18 @@ lp_build_ifloor(struct lp_build_context *bld,
/* Take the sign bit and add it to 1 constant */
LLVMTypeRef vec_type = lp_build_vec_type(type);
unsigned mantissa = lp_mantissa(type);
- LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+ LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
LLVMValueRef sign;
LLVMValueRef offset;
/* sign = a < 0 ? ~0 : 0 */
sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
sign = LLVMBuildAnd(bld->builder, sign, mask, "");
- sign = LLVMBuildAShr(bld->builder, sign, lp_build_int_const_scalar(type, type.width - 1), "");
+ sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "");
lp_build_name(sign, "floor.sign");
/* offset = -0.99999(9)f */
- offset = lp_build_const_scalar(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa));
+ offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa));
offset = LLVMConstBitCast(offset, int_vec_type);
/* offset = a < 0 ? -0.99999(9)f : 0.0f */
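
When SSE4.1 rounding is unavailable, lp_build_ifloor biases only negative inputs by -0.99999(9) before truncation, so that a value such as -1.25 truncates to -2 while positive inputs are unaffected. A scalar sketch of the same idea (the real code selects the offset with a sign mask rather than a branch):

#include <math.h>
#include <stdio.h>

/* ifloor via truncation with a negative-only bias, mirroring the
 * offset = -0.99999(9)f trick in lp_build_ifloor. */
static int ifloor(float a)
{
   /* -(2^23 - 1) / 2^23, the largest-magnitude offset below 1.0 */
   float offset = a < 0.0f ? -(float)((1 << 23) - 1) / (1 << 23) : 0.0f;
   return (int)truncf(a + offset);
}

int main(void)
{
   printf("%d %d %d\n", ifloor(-1.25f), ifloor(1.75f), ifloor(-3.0f)); /* -2 1 -3 */
   return 0;
}
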
@@ -1114,6 +1210,14 @@ LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
LLVMValueRef a)
{
+#ifdef PIPE_OS_WINDOWS
+ /*
+ * FIXME: X86 backend translates llvm.cos.v4f32 to 4 calls to CRT's cosf()
+ * which is neither efficient nor does the CRT linkage work on Windows
+ * causing segmentation fault. So simply disable the code for now.
+ */
+ return bld->one;
+#else
const struct lp_type type = bld->type;
LLVMTypeRef vec_type = lp_build_vec_type(type);
char intrinsic[32];
@@ -1124,6 +1228,7 @@ lp_build_cos(struct lp_build_context *bld,
util_snprintf(intrinsic, sizeof intrinsic, "llvm.cos.v%uf%u", type.length, type.width);
return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
+#endif
}
@@ -1134,6 +1239,14 @@ LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
LLVMValueRef a)
{
+#ifdef PIPE_OS_WINDOWS
+ /*
+ * FIXME: X86 backend translates llvm.sin.v4f32 to 4 calls to CRT's sinf()
+ * which is neither efficient nor does the CRT linkage work on Windows
+ * causing segmentation fault. So simply disable the code for now.
+ */
+ return bld->zero;
+#else
const struct lp_type type = bld->type;
LLVMTypeRef vec_type = lp_build_vec_type(type);
char intrinsic[32];
@@ -1144,6 +1257,7 @@ lp_build_sin(struct lp_build_context *bld,
util_snprintf(intrinsic, sizeof intrinsic, "llvm.sin.v%uf%u", type.length, type.width);
return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
+#endif
}
@@ -1172,7 +1286,7 @@ lp_build_exp(struct lp_build_context *bld,
LLVMValueRef x)
{
/* log2(e) = 1/log(2) */
- LLVMValueRef log2e = lp_build_const_scalar(bld->type, 1.4426950408889634);
+ LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634);
return lp_build_mul(bld, log2e, lp_build_exp2(bld, x));
}
@@ -1186,7 +1300,7 @@ lp_build_log(struct lp_build_context *bld,
LLVMValueRef x)
{
/* log(2) */
- LLVMValueRef log2 = lp_build_const_scalar(bld->type, 0.69314718055994529);
+ LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529);
return lp_build_mul(bld, log2, lp_build_exp2(bld, x));
}
@@ -1207,6 +1321,7 @@ lp_build_polynomial(struct lp_build_context *bld,
unsigned num_coeffs)
{
const struct lp_type type = bld->type;
+ LLVMTypeRef float_type = LLVMFloatType();
LLVMValueRef res = NULL;
unsigned i;
@@ -1216,7 +1331,13 @@ lp_build_polynomial(struct lp_build_context *bld,
__FUNCTION__);
for (i = num_coeffs; i--; ) {
- LLVMValueRef coeff = lp_build_const_scalar(type, coeffs[i]);
+ LLVMValueRef coeff;
+
+ if (type.length == 1)
+ coeff = LLVMConstReal(float_type, coeffs[i]);
+ else
+ coeff = lp_build_const_vec(type, coeffs[i]);
+
if(res)
res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
else
@@ -1272,11 +1393,11 @@ lp_build_exp2_approx(struct lp_build_context *bld,
assert(type.floating && type.width == 32);
- x = lp_build_min(bld, x, lp_build_const_scalar(type, 129.0));
- x = lp_build_max(bld, x, lp_build_const_scalar(type, -126.99999));
+ x = lp_build_min(bld, x, lp_build_const_vec(type, 129.0));
+ x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));
/* ipart = int(x - 0.5) */
- ipart = LLVMBuildSub(bld->builder, x, lp_build_const_scalar(type, 0.5f), "");
+ ipart = LLVMBuildSub(bld->builder, x, lp_build_const_vec(type, 0.5f), "");
ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
/* fpart = x - ipart */
@@ -1286,8 +1407,8 @@ lp_build_exp2_approx(struct lp_build_context *bld,
if(p_exp2_int_part || p_exp2) {
/* expipart = (float) (1 << ipart) */
- expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_int_const_scalar(type, 127), "");
- expipart = LLVMBuildShl(bld->builder, expipart, lp_build_int_const_scalar(type, 23), "");
+ expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
+ expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), "");
expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
}
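
The exp2 approximation keeps its split into integer and fractional parts: 2^ipart is manufactured directly as float exponent bits via (ipart + 127) << 23, and the fractional part goes through a polynomial. A scalar C sketch of the exponent construction (illustrative only):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Build 2^i for small integer i by writing the biased exponent field,
 * the same (i + 127) << 23 construction used in lp_build_exp2_approx. */
static float exp2_int(int i)
{
   uint32_t bits = (uint32_t)(i + 127) << 23;
   float r;
   memcpy(&r, &bits, sizeof r);
   return r;
}

int main(void)
{
   printf("%g %g %g\n", exp2_int(0), exp2_int(5), exp2_int(-3)); /* 1 32 0.125 */
   return 0;
}
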
@@ -1353,8 +1474,8 @@ lp_build_log2_approx(struct lp_build_context *bld,
LLVMTypeRef vec_type = lp_build_vec_type(type);
LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
- LLVMValueRef expmask = lp_build_int_const_scalar(type, 0x7f800000);
- LLVMValueRef mantmask = lp_build_int_const_scalar(type, 0x007fffff);
+ LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000);
+ LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff);
LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
LLVMValueRef i = NULL;
@@ -1379,8 +1500,8 @@ lp_build_log2_approx(struct lp_build_context *bld,
}
if(p_floor_log2 || p_log2) {
- logexp = LLVMBuildLShr(bld->builder, exp, lp_build_int_const_scalar(type, 23), "");
- logexp = LLVMBuildSub(bld->builder, logexp, lp_build_int_const_scalar(type, 127), "");
+ logexp = LLVMBuildLShr(bld->builder, exp, lp_build_const_int_vec(type, 23), "");
+ logexp = LLVMBuildSub(bld->builder, logexp, lp_build_const_int_vec(type, 127), "");
logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
}
@@ -1399,8 +1520,83 @@ lp_build_log2_approx(struct lp_build_context *bld,
res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
}
- if(p_exp)
+ if(p_exp) {
+ exp = LLVMBuildBitCast(bld->builder, exp, vec_type, "");
*p_exp = exp;
+ }
+
+ if(p_floor_log2)
+ *p_floor_log2 = logexp;
+
+ if(p_log2)
+ *p_log2 = res;
+}
+
+
+/** scalar version of above function */
+static void
+lp_build_float_log2_approx(struct lp_build_context *bld,
+ LLVMValueRef x,
+ LLVMValueRef *p_exp,
+ LLVMValueRef *p_floor_log2,
+ LLVMValueRef *p_log2)
+{
+ const struct lp_type type = bld->type;
+ LLVMTypeRef float_type = LLVMFloatType();
+ LLVMTypeRef int_type = LLVMIntType(type.width);
+
+ LLVMValueRef expmask = LLVMConstInt(int_type, 0x7f800000, 0);
+ LLVMValueRef mantmask = LLVMConstInt(int_type, 0x007fffff, 0);
+ LLVMValueRef one = LLVMConstBitCast(bld->one, int_type);
+
+ LLVMValueRef i = NULL;
+ LLVMValueRef exp = NULL;
+ LLVMValueRef mant = NULL;
+ LLVMValueRef logexp = NULL;
+ LLVMValueRef logmant = NULL;
+ LLVMValueRef res = NULL;
+
+ if(p_exp || p_floor_log2 || p_log2) {
+ /* TODO: optimize the constant case */
+ if(LLVMIsConstant(x))
+ debug_printf("%s: inefficient/imprecise constant arithmetic\n",
+ __FUNCTION__);
+
+ assert(type.floating && type.width == 32);
+
+ i = LLVMBuildBitCast(bld->builder, x, int_type, "");
+
+ /* exp = (float) exponent(x) */
+ exp = LLVMBuildAnd(bld->builder, i, expmask, "");
+ }
+
+ if(p_floor_log2 || p_log2) {
+ LLVMValueRef c23 = LLVMConstInt(int_type, 23, 0);
+ LLVMValueRef c127 = LLVMConstInt(int_type, 127, 0);
+ logexp = LLVMBuildLShr(bld->builder, exp, c23, "");
+ logexp = LLVMBuildSub(bld->builder, logexp, c127, "");
+ logexp = LLVMBuildSIToFP(bld->builder, logexp, float_type, "");
+ }
+
+ if(p_log2) {
+ /* mant = (float) mantissa(x) */
+ mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
+ mant = LLVMBuildOr(bld->builder, mant, one, "");
+ mant = LLVMBuildBitCast(bld->builder, mant, float_type, "");
+
+ logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
+ Elements(lp_build_log2_polynomial));
+
+ /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
+ logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");
+
+ res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
+ }
+
+ if(p_exp) {
+ exp = LLVMBuildBitCast(bld->builder, exp, float_type, "");
+ *p_exp = exp;
+ }
if(p_floor_log2)
*p_floor_log2 = logexp;
@@ -1415,6 +1611,11 @@ lp_build_log2(struct lp_build_context *bld,
LLVMValueRef x)
{
LLVMValueRef res;
- lp_build_log2_approx(bld, x, NULL, NULL, &res);
+ if (bld->type.length == 1) {
+ lp_build_float_log2_approx(bld, x, NULL, NULL, &res);
+ }
+ else {
+ lp_build_log2_approx(bld, x, NULL, NULL, &res);
+ }
return res;
}
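
The new lp_build_float_log2_approx mirrors the vector version for single floats: mask out the exponent and mantissa with 0x7f800000 and 0x007fffff, convert the unbiased exponent to float, and evaluate a polynomial on the mantissa. A plain-C sketch of the exponent/mantissa split (the polynomial step is elided; its coefficients live in lp_build_log2_polynomial):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* floor(log2(x)) for positive normalized floats, using the same
 * 0x7f800000 exponent mask and 127 bias as the gallivm code. */
static int floor_log2(float x)
{
   uint32_t bits;
   memcpy(&bits, &x, sizeof bits);
   return (int)((bits & 0x7f800000u) >> 23) - 127;
}

int main(void)
{
   /* The full log2 adds a polynomial in the mantissa on top of this. */
   printf("%d %d %d\n", floor_log2(1.0f), floor_log2(8.0f), floor_log2(0.4f)); /* 0 3 -2 */
   return 0;
}
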
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
index 55385e3a66..31efa9921c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
@@ -37,7 +37,7 @@
#define LP_BLD_ARIT_H
-#include <llvm-c/Core.h>
+#include "gallivm/lp_bld.h"
struct lp_type;
@@ -57,6 +57,10 @@ lp_build_add(struct lp_build_context *bld,
LLVMValueRef b);
LLVMValueRef
+lp_build_sum_vector(struct lp_build_context *bld,
+ LLVMValueRef a);
+
+LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
LLVMValueRef a,
LLVMValueRef b);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_blend.h b/src/gallium/auxiliary/gallivm/lp_bld_blend.h
deleted file mode 100644
index da272e549f..0000000000
--- a/src/gallium/auxiliary/gallivm/lp_bld_blend.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#ifndef LP_BLD_BLEND_H
-#define LP_BLD_BLEND_H
-
-
-/**
- * @file
- * LLVM IR building helpers interfaces.
- *
- * We use LLVM-C bindings for now. They are not documented, but follow the C++
- * interfaces very closely, and appear to be complete enough for code
- * genration. See
- * http://npcontemplation.blogspot.com/2008/06/secret-of-llvm-c-bindings.html
- * for a standalone example.
- */
-
-#include <llvm-c/Core.h>
-
-#include "pipe/p_format.h"
-
-
-struct pipe_blend_state;
-struct lp_type;
-struct lp_build_context;
-
-
-/**
- * Whether the blending function is commutative or not.
- */
-boolean
-lp_build_blend_func_commutative(unsigned func);
-
-
-/**
- * Whether the blending functions are the reverse of each other.
- */
-boolean
-lp_build_blend_func_reverse(unsigned rgb_func, unsigned alpha_func);
-
-
-LLVMValueRef
-lp_build_blend_func(struct lp_build_context *bld,
- unsigned func,
- LLVMValueRef term1,
- LLVMValueRef term2);
-
-
-LLVMValueRef
-lp_build_blend_aos(LLVMBuilderRef builder,
- const struct pipe_blend_state *blend,
- struct lp_type type,
- LLVMValueRef src,
- LLVMValueRef dst,
- LLVMValueRef const_,
- unsigned alpha_swizzle);
-
-
-void
-lp_build_blend_soa(LLVMBuilderRef builder,
- const struct pipe_blend_state *blend,
- struct lp_type type,
- LLVMValueRef src[4],
- LLVMValueRef dst[4],
- LLVMValueRef const_[4],
- LLVMValueRef res[4]);
-
-
-/**
- * Apply a logic op.
- *
- * src/dst parameters are packed values. It should work regardless the inputs
- * are scalars, or a vector.
- */
-LLVMValueRef
-lp_build_logicop(LLVMBuilderRef builder,
- unsigned logicop_func,
- LLVMValueRef src,
- LLVMValueRef dst);
-
-
-#endif /* !LP_BLD_BLEND_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_blend_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_blend_aos.c
deleted file mode 100644
index 0215bb72ac..0000000000
--- a/src/gallium/auxiliary/gallivm/lp_bld_blend_aos.c
+++ /dev/null
@@ -1,360 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-/**
- * @file
- * Blend LLVM IR generation -- AoS layout.
- *
- * AoS blending is in general much slower than SoA, but there are some cases
- * where it might be faster. In particular, if a pixel is rendered only once
- * then the overhead of tiling and untiling will dominate over the speedup that
- * SoA gives. So we might want to detect such cases and fallback to AoS in the
- * future, but for now this function is here for historical/benchmarking
- * purposes.
- *
- * Run lp_blend_test after any change to this file.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#include "pipe/p_state.h"
-#include "util/u_debug.h"
-
-#include "lp_bld_type.h"
-#include "lp_bld_const.h"
-#include "lp_bld_arit.h"
-#include "lp_bld_logic.h"
-#include "lp_bld_swizzle.h"
-#include "lp_bld_blend.h"
-#include "lp_bld_debug.h"
-
-
-/**
- * We may the same values several times, so we keep them here to avoid
- * recomputing them. Also reusing the values allows us to do simplifications
- * that LLVM optimization passes wouldn't normally be able to do.
- */
-struct lp_build_blend_aos_context
-{
- struct lp_build_context base;
-
- LLVMValueRef src;
- LLVMValueRef dst;
- LLVMValueRef const_;
-
- LLVMValueRef inv_src;
- LLVMValueRef inv_dst;
- LLVMValueRef inv_const;
- LLVMValueRef saturate;
-
- LLVMValueRef rgb_src_factor;
- LLVMValueRef alpha_src_factor;
- LLVMValueRef rgb_dst_factor;
- LLVMValueRef alpha_dst_factor;
-};
-
-
-static LLVMValueRef
-lp_build_blend_factor_unswizzled(struct lp_build_blend_aos_context *bld,
- unsigned factor,
- boolean alpha)
-{
- switch (factor) {
- case PIPE_BLENDFACTOR_ZERO:
- return bld->base.zero;
- case PIPE_BLENDFACTOR_ONE:
- return bld->base.one;
- case PIPE_BLENDFACTOR_SRC_COLOR:
- case PIPE_BLENDFACTOR_SRC_ALPHA:
- return bld->src;
- case PIPE_BLENDFACTOR_DST_COLOR:
- case PIPE_BLENDFACTOR_DST_ALPHA:
- return bld->dst;
- case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
- if(alpha)
- return bld->base.one;
- else {
- if(!bld->inv_dst)
- bld->inv_dst = lp_build_comp(&bld->base, bld->dst);
- if(!bld->saturate)
- bld->saturate = lp_build_min(&bld->base, bld->src, bld->inv_dst);
- return bld->saturate;
- }
- case PIPE_BLENDFACTOR_CONST_COLOR:
- case PIPE_BLENDFACTOR_CONST_ALPHA:
- return bld->const_;
- case PIPE_BLENDFACTOR_SRC1_COLOR:
- case PIPE_BLENDFACTOR_SRC1_ALPHA:
- /* TODO */
- assert(0);
- return bld->base.zero;
- case PIPE_BLENDFACTOR_INV_SRC_COLOR:
- case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
- if(!bld->inv_src)
- bld->inv_src = lp_build_comp(&bld->base, bld->src);
- return bld->inv_src;
- case PIPE_BLENDFACTOR_INV_DST_COLOR:
- case PIPE_BLENDFACTOR_INV_DST_ALPHA:
- if(!bld->inv_dst)
- bld->inv_dst = lp_build_comp(&bld->base, bld->dst);
- return bld->inv_dst;
- case PIPE_BLENDFACTOR_INV_CONST_COLOR:
- case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
- if(!bld->inv_const)
- bld->inv_const = lp_build_comp(&bld->base, bld->const_);
- return bld->inv_const;
- case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
- case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
- /* TODO */
- assert(0);
- return bld->base.zero;
- default:
- assert(0);
- return bld->base.zero;
- }
-}
-
-
-enum lp_build_blend_swizzle {
- LP_BUILD_BLEND_SWIZZLE_RGBA = 0,
- LP_BUILD_BLEND_SWIZZLE_AAAA = 1
-};
-
-
-/**
- * How should we shuffle the base factor.
- */
-static enum lp_build_blend_swizzle
-lp_build_blend_factor_swizzle(unsigned factor)
-{
- switch (factor) {
- case PIPE_BLENDFACTOR_ONE:
- case PIPE_BLENDFACTOR_ZERO:
- case PIPE_BLENDFACTOR_SRC_COLOR:
- case PIPE_BLENDFACTOR_DST_COLOR:
- case PIPE_BLENDFACTOR_CONST_COLOR:
- case PIPE_BLENDFACTOR_SRC1_COLOR:
- case PIPE_BLENDFACTOR_INV_SRC_COLOR:
- case PIPE_BLENDFACTOR_INV_DST_COLOR:
- case PIPE_BLENDFACTOR_INV_CONST_COLOR:
- case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
- return LP_BUILD_BLEND_SWIZZLE_RGBA;
- case PIPE_BLENDFACTOR_SRC_ALPHA:
- case PIPE_BLENDFACTOR_DST_ALPHA:
- case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
- case PIPE_BLENDFACTOR_SRC1_ALPHA:
- case PIPE_BLENDFACTOR_CONST_ALPHA:
- case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
- case PIPE_BLENDFACTOR_INV_DST_ALPHA:
- case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
- case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
- return LP_BUILD_BLEND_SWIZZLE_AAAA;
- default:
- assert(0);
- return LP_BUILD_BLEND_SWIZZLE_RGBA;
- }
-}
-
-
-static LLVMValueRef
-lp_build_blend_swizzle(struct lp_build_blend_aos_context *bld,
- LLVMValueRef rgb,
- LLVMValueRef alpha,
- enum lp_build_blend_swizzle rgb_swizzle,
- unsigned alpha_swizzle)
-{
- if(rgb == alpha) {
- if(rgb_swizzle == LP_BUILD_BLEND_SWIZZLE_RGBA)
- return rgb;
- if(rgb_swizzle == LP_BUILD_BLEND_SWIZZLE_AAAA)
- return lp_build_broadcast_aos(&bld->base, rgb, alpha_swizzle);
- }
- else {
- if(rgb_swizzle == LP_BUILD_BLEND_SWIZZLE_RGBA) {
- boolean cond[4] = {0, 0, 0, 0};
- cond[alpha_swizzle] = 1;
- return lp_build_select_aos(&bld->base, alpha, rgb, cond);
- }
- if(rgb_swizzle == LP_BUILD_BLEND_SWIZZLE_AAAA) {
- unsigned char swizzle[4];
- swizzle[0] = alpha_swizzle;
- swizzle[1] = alpha_swizzle;
- swizzle[2] = alpha_swizzle;
- swizzle[3] = alpha_swizzle;
- swizzle[alpha_swizzle] += 4;
- return lp_build_swizzle2_aos(&bld->base, rgb, alpha, swizzle);
- }
- }
- assert(0);
- return bld->base.undef;
-}
-
-
-/**
- * @sa http://www.opengl.org/sdk/docs/man/xhtml/glBlendFuncSeparate.xml
- */
-static LLVMValueRef
-lp_build_blend_factor(struct lp_build_blend_aos_context *bld,
- LLVMValueRef factor1,
- unsigned rgb_factor,
- unsigned alpha_factor,
- unsigned alpha_swizzle)
-{
- LLVMValueRef rgb_factor_;
- LLVMValueRef alpha_factor_;
- LLVMValueRef factor2;
- enum lp_build_blend_swizzle rgb_swizzle;
-
- rgb_factor_ = lp_build_blend_factor_unswizzled(bld, rgb_factor, FALSE);
- alpha_factor_ = lp_build_blend_factor_unswizzled(bld, alpha_factor, TRUE);
-
- rgb_swizzle = lp_build_blend_factor_swizzle(rgb_factor);
-
- factor2 = lp_build_blend_swizzle(bld, rgb_factor_, alpha_factor_, rgb_swizzle, alpha_swizzle);
-
- return lp_build_mul(&bld->base, factor1, factor2);
-}
-
-
-boolean
-lp_build_blend_func_commutative(unsigned func)
-{
- switch (func) {
- case PIPE_BLEND_ADD:
- case PIPE_BLEND_MIN:
- case PIPE_BLEND_MAX:
- return TRUE;
- case PIPE_BLEND_SUBTRACT:
- case PIPE_BLEND_REVERSE_SUBTRACT:
- return FALSE;
- default:
- assert(0);
- return TRUE;
- }
-}
-
-
-boolean
-lp_build_blend_func_reverse(unsigned rgb_func, unsigned alpha_func)
-{
- if(rgb_func == alpha_func)
- return FALSE;
- if(rgb_func == PIPE_BLEND_SUBTRACT && alpha_func == PIPE_BLEND_REVERSE_SUBTRACT)
- return TRUE;
- if(rgb_func == PIPE_BLEND_REVERSE_SUBTRACT && alpha_func == PIPE_BLEND_SUBTRACT)
- return TRUE;
- return FALSE;
-}
-
-
-/**
- * @sa http://www.opengl.org/sdk/docs/man/xhtml/glBlendEquationSeparate.xml
- */
-LLVMValueRef
-lp_build_blend_func(struct lp_build_context *bld,
- unsigned func,
- LLVMValueRef term1,
- LLVMValueRef term2)
-{
- switch (func) {
- case PIPE_BLEND_ADD:
- return lp_build_add(bld, term1, term2);
- break;
- case PIPE_BLEND_SUBTRACT:
- return lp_build_sub(bld, term1, term2);
- case PIPE_BLEND_REVERSE_SUBTRACT:
- return lp_build_sub(bld, term2, term1);
- case PIPE_BLEND_MIN:
- return lp_build_min(bld, term1, term2);
- case PIPE_BLEND_MAX:
- return lp_build_max(bld, term1, term2);
- default:
- assert(0);
- return bld->zero;
- }
-}
-
-
-LLVMValueRef
-lp_build_blend_aos(LLVMBuilderRef builder,
- const struct pipe_blend_state *blend,
- struct lp_type type,
- LLVMValueRef src,
- LLVMValueRef dst,
- LLVMValueRef const_,
- unsigned alpha_swizzle)
-{
- struct lp_build_blend_aos_context bld;
- LLVMValueRef src_term;
- LLVMValueRef dst_term;
-
- /* FIXME */
- assert(blend->independent_blend_enable == 0);
- assert(blend->rt[0].colormask == 0xf);
-
- if(!blend->rt[0].blend_enable)
- return src;
-
- /* It makes no sense to blend unless values are normalized */
- assert(type.norm);
-
- /* Setup build context */
- memset(&bld, 0, sizeof bld);
- lp_build_context_init(&bld.base, builder, type);
- bld.src = src;
- bld.dst = dst;
- bld.const_ = const_;
-
- /* TODO: There are still a few optimization opportunities here. For certain
- * combinations it is possible to reorder the operations and therefore saving
- * some instructions. */
-
- src_term = lp_build_blend_factor(&bld, src, blend->rt[0].rgb_src_factor,
- blend->rt[0].alpha_src_factor, alpha_swizzle);
- dst_term = lp_build_blend_factor(&bld, dst, blend->rt[0].rgb_dst_factor,
- blend->rt[0].alpha_dst_factor, alpha_swizzle);
-
- lp_build_name(src_term, "src_term");
- lp_build_name(dst_term, "dst_term");
-
- if(blend->rt[0].rgb_func == blend->rt[0].alpha_func) {
- return lp_build_blend_func(&bld.base, blend->rt[0].rgb_func, src_term, dst_term);
- }
- else {
- /* Seperate RGB / A functions */
-
- LLVMValueRef rgb;
- LLVMValueRef alpha;
-
- rgb = lp_build_blend_func(&bld.base, blend->rt[0].rgb_func, src_term, dst_term);
- alpha = lp_build_blend_func(&bld.base, blend->rt[0].alpha_func, src_term, dst_term);
-
- return lp_build_blend_swizzle(&bld, rgb, alpha, LP_BUILD_BLEND_SWIZZLE_RGBA, alpha_swizzle);
- }
-}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_blend_logicop.c b/src/gallium/auxiliary/gallivm/lp_bld_blend_logicop.c
deleted file mode 100644
index 1eac0a5c89..0000000000
--- a/src/gallium/auxiliary/gallivm/lp_bld_blend_logicop.c
+++ /dev/null
@@ -1,109 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-/**
- * @file
- * Blend LLVM IR generation -- logic ops.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#include "pipe/p_state.h"
-#include "util/u_debug.h"
-
-#include "lp_bld_blend.h"
-
-
-LLVMValueRef
-lp_build_logicop(LLVMBuilderRef builder,
- unsigned logicop_func,
- LLVMValueRef src,
- LLVMValueRef dst)
-{
- LLVMTypeRef type;
- LLVMValueRef res;
-
- type = LLVMTypeOf(src);
-
- switch (logicop_func) {
- case PIPE_LOGICOP_CLEAR:
- res = LLVMConstNull(type);
- break;
- case PIPE_LOGICOP_NOR:
- res = LLVMBuildNot(builder, LLVMBuildOr(builder, src, dst, ""), "");
- break;
- case PIPE_LOGICOP_AND_INVERTED:
- res = LLVMBuildAnd(builder, LLVMBuildNot(builder, src, ""), dst, "");
- break;
- case PIPE_LOGICOP_COPY_INVERTED:
- res = LLVMBuildNot(builder, src, "");
- break;
- case PIPE_LOGICOP_AND_REVERSE:
- res = LLVMBuildAnd(builder, src, LLVMBuildNot(builder, dst, ""), "");
- break;
- case PIPE_LOGICOP_INVERT:
- res = LLVMBuildNot(builder, dst, "");
- break;
- case PIPE_LOGICOP_XOR:
- res = LLVMBuildXor(builder, src, dst, "");
- break;
- case PIPE_LOGICOP_NAND:
- res = LLVMBuildNot(builder, LLVMBuildAnd(builder, src, dst, ""), "");
- break;
- case PIPE_LOGICOP_AND:
- res = LLVMBuildAnd(builder, src, dst, "");
- break;
- case PIPE_LOGICOP_EQUIV:
- res = LLVMBuildNot(builder, LLVMBuildXor(builder, src, dst, ""), "");
- break;
- case PIPE_LOGICOP_NOOP:
- res = dst;
- break;
- case PIPE_LOGICOP_OR_INVERTED:
- res = LLVMBuildOr(builder, LLVMBuildNot(builder, src, ""), dst, "");
- break;
- case PIPE_LOGICOP_COPY:
- res = src;
- break;
- case PIPE_LOGICOP_OR_REVERSE:
- res = LLVMBuildOr(builder, src, LLVMBuildNot(builder, dst, ""), "");
- break;
- case PIPE_LOGICOP_OR:
- res = LLVMBuildOr(builder, src, dst, "");
- break;
- case PIPE_LOGICOP_SET:
- res = LLVMConstAllOnes(type);
- break;
- default:
- assert(0);
- res = src;
- }
-
- return res;
-}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_blend_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_blend_soa.c
deleted file mode 100644
index 6d5a45db7a..0000000000
--- a/src/gallium/auxiliary/gallivm/lp_bld_blend_soa.c
+++ /dev/null
@@ -1,298 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-/**
- * @file
- * Blend LLVM IR generation -- SoA layout.
- *
- * Blending in SoA is much faster than AoS, especially when separate rgb/alpha
- * factors/functions are used, since no channel masking/shuffling is necessary
- * and we can achieve the full throughput of the SIMD operations. Furthermore
- * the fragment shader output is also in SoA, so it fits nicely with the rest of
- * the fragment pipeline.
- *
- * The drawback is that to be displayed the color buffer needs to be in AoS
- * layout, so we need to tile/untile the color buffer before/after rendering.
- * A color buffer like
- *
- * R11 G11 B11 A11 R12 G12 B12 A12 R13 G13 B13 A13 R14 G14 B14 A14 ...
- * R21 G21 B21 A21 R22 G22 B22 A22 R23 G23 B23 A23 R24 G24 B24 A24 ...
- *
- * R31 G31 B31 A31 R32 G32 B32 A32 R33 G33 B33 A33 R34 G34 B34 A34 ...
- * R41 G41 B41 A41 R42 G42 B42 A42 R43 G43 B43 A43 R44 G44 B44 A44 ...
- *
- * ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
- *
- * will actually be stored in memory as
- *
- * R11 R12 R21 R22 R13 R14 R23 R24 ... G11 G12 G21 G22 G13 G14 G23 G24 ... B11 B12 B21 B22 B13 B14 B23 B24 ... A11 A12 A21 A22 A13 A14 A23 A24 ...
- * R31 R32 R41 R42 R33 R34 R43 R44 ... G31 G32 G41 G42 G33 G34 G43 G44 ... B31 B32 B41 B42 B33 B34 B43 B44 ... A31 A32 A41 A42 A33 A34 A43 A44 ...
- * ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
- *
- * NOTE: Run lp_blend_test after any change to this file.
- *
- * You can also run lp_blend_test to obtain AoS vs SoA benchmarks. Invoking it
- * as:
- *
- * lp_blend_test -o blend.tsv
- *
- * will generate a tab-seperated-file with the test results and performance
- * measurements.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#include "pipe/p_state.h"
-#include "util/u_debug.h"
-
-#include "lp_bld_type.h"
-#include "lp_bld_arit.h"
-#include "lp_bld_blend.h"
-
-
-/**
- * We may the same values several times, so we keep them here to avoid
- * recomputing them. Also reusing the values allows us to do simplifications
- * that LLVM optimization passes wouldn't normally be able to do.
- */
-struct lp_build_blend_soa_context
-{
- struct lp_build_context base;
-
- LLVMValueRef src[4];
- LLVMValueRef dst[4];
- LLVMValueRef con[4];
-
- LLVMValueRef inv_src[4];
- LLVMValueRef inv_dst[4];
- LLVMValueRef inv_con[4];
-
- LLVMValueRef src_alpha_saturate;
-
- /**
- * We store all factors in a table in order to eliminate redundant
- * multiplications later.
- */
- LLVMValueRef factor[2][2][4];
-
- /**
- * Table with all terms.
- */
- LLVMValueRef term[2][4];
-};
-
-
-static LLVMValueRef
-lp_build_blend_soa_factor(struct lp_build_blend_soa_context *bld,
- unsigned factor, unsigned i)
-{
- /*
- * Compute src/first term RGB
- */
- switch (factor) {
- case PIPE_BLENDFACTOR_ONE:
- return bld->base.one;
- case PIPE_BLENDFACTOR_SRC_COLOR:
- return bld->src[i];
- case PIPE_BLENDFACTOR_SRC_ALPHA:
- return bld->src[3];
- case PIPE_BLENDFACTOR_DST_COLOR:
- return bld->dst[i];
- case PIPE_BLENDFACTOR_DST_ALPHA:
- return bld->dst[3];
- case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
- if(i == 3)
- return bld->base.one;
- else {
- if(!bld->inv_dst[3])
- bld->inv_dst[3] = lp_build_comp(&bld->base, bld->dst[3]);
- if(!bld->src_alpha_saturate)
- bld->src_alpha_saturate = lp_build_min(&bld->base, bld->src[3], bld->inv_dst[3]);
- return bld->src_alpha_saturate;
- }
- case PIPE_BLENDFACTOR_CONST_COLOR:
- return bld->con[i];
- case PIPE_BLENDFACTOR_CONST_ALPHA:
- return bld->con[3];
- case PIPE_BLENDFACTOR_SRC1_COLOR:
- /* TODO */
- assert(0);
- return bld->base.zero;
- case PIPE_BLENDFACTOR_SRC1_ALPHA:
- /* TODO */
- assert(0);
- return bld->base.zero;
- case PIPE_BLENDFACTOR_ZERO:
- return bld->base.zero;
- case PIPE_BLENDFACTOR_INV_SRC_COLOR:
- if(!bld->inv_src[i])
- bld->inv_src[i] = lp_build_comp(&bld->base, bld->src[i]);
- return bld->inv_src[i];
- case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
- if(!bld->inv_src[3])
- bld->inv_src[3] = lp_build_comp(&bld->base, bld->src[3]);
- return bld->inv_src[3];
- case PIPE_BLENDFACTOR_INV_DST_COLOR:
- if(!bld->inv_dst[i])
- bld->inv_dst[i] = lp_build_comp(&bld->base, bld->dst[i]);
- return bld->inv_dst[i];
- case PIPE_BLENDFACTOR_INV_DST_ALPHA:
- if(!bld->inv_dst[3])
- bld->inv_dst[3] = lp_build_comp(&bld->base, bld->dst[3]);
- return bld->inv_dst[3];
- case PIPE_BLENDFACTOR_INV_CONST_COLOR:
- if(!bld->inv_con[i])
- bld->inv_con[i] = lp_build_comp(&bld->base, bld->con[i]);
- return bld->inv_con[i];
- case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
- if(!bld->inv_con[3])
- bld->inv_con[3] = lp_build_comp(&bld->base, bld->con[3]);
- return bld->inv_con[3];
- case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
- /* TODO */
- assert(0);
- return bld->base.zero;
- case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
- /* TODO */
- assert(0);
- return bld->base.zero;
- default:
- assert(0);
- return bld->base.zero;
- }
-}
-
-
-/**
- * Generate blend code in SOA mode.
- * \param src src/fragment color
- * \param dst dst/framebuffer color
- * \param con constant blend color
- * \param res the result/output
- */
-void
-lp_build_blend_soa(LLVMBuilderRef builder,
- const struct pipe_blend_state *blend,
- struct lp_type type,
- LLVMValueRef src[4],
- LLVMValueRef dst[4],
- LLVMValueRef con[4],
- LLVMValueRef res[4])
-{
- struct lp_build_blend_soa_context bld;
- unsigned i, j, k;
-
- /* Setup build context */
- memset(&bld, 0, sizeof bld);
- lp_build_context_init(&bld.base, builder, type);
- for (i = 0; i < 4; ++i) {
- bld.src[i] = src[i];
- bld.dst[i] = dst[i];
- bld.con[i] = con[i];
- }
-
- for (i = 0; i < 4; ++i) {
- if (blend->rt[0].colormask & (1 << i)) {
- if (blend->logicop_enable) {
- if(!type.floating) {
- res[i] = lp_build_logicop(builder, blend->logicop_func, src[i], dst[i]);
- }
- else
- res[i] = dst[i];
- }
- else if (blend->rt[0].blend_enable) {
- unsigned src_factor = i < 3 ? blend->rt[0].rgb_src_factor : blend->rt[0].alpha_src_factor;
- unsigned dst_factor = i < 3 ? blend->rt[0].rgb_dst_factor : blend->rt[0].alpha_dst_factor;
- unsigned func = i < 3 ? blend->rt[0].rgb_func : blend->rt[0].alpha_func;
- boolean func_commutative = lp_build_blend_func_commutative(func);
-
- /* It makes no sense to blend unless values are normalized */
- assert(type.norm);
-
- /*
- * Compute src/dst factors.
- */
-
- bld.factor[0][0][i] = src[i];
- bld.factor[0][1][i] = lp_build_blend_soa_factor(&bld, src_factor, i);
- bld.factor[1][0][i] = dst[i];
- bld.factor[1][1][i] = lp_build_blend_soa_factor(&bld, dst_factor, i);
-
- /*
- * Compute src/dst terms
- */
-
- for(k = 0; k < 2; ++k) {
- /* See if this multiplication has been previously computed */
- for(j = 0; j < i; ++j) {
- if((bld.factor[k][0][j] == bld.factor[k][0][i] &&
- bld.factor[k][1][j] == bld.factor[k][1][i]) ||
- (bld.factor[k][0][j] == bld.factor[k][1][i] &&
- bld.factor[k][1][j] == bld.factor[k][0][i]))
- break;
- }
-
- if(j < i)
- bld.term[k][i] = bld.term[k][j];
- else
- bld.term[k][i] = lp_build_mul(&bld.base, bld.factor[k][0][i], bld.factor[k][1][i]);
- }
-
- /*
- * Combine terms
- */
-
- /* See if this function has been previously applied */
- for(j = 0; j < i; ++j) {
- unsigned prev_func = j < 3 ? blend->rt[0].rgb_func : blend->rt[0].alpha_func;
- unsigned func_reverse = lp_build_blend_func_reverse(func, prev_func);
-
- if((!func_reverse &&
- bld.term[0][j] == bld.term[0][i] &&
- bld.term[1][j] == bld.term[1][i]) ||
- ((func_commutative || func_reverse) &&
- bld.term[0][j] == bld.term[1][i] &&
- bld.term[1][j] == bld.term[0][i]))
- break;
- }
-
- if(j < i)
- res[i] = res[j];
- else
- res[i] = lp_build_blend_func(&bld.base, func, bld.term[0][i], bld.term[1][i]);
- }
- else {
- res[i] = src[i];
- }
- }
- else {
- res[i] = dst[i];
- }
- }
-}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.c b/src/gallium/auxiliary/gallivm/lp_bld_const.c
index c8eaa8c394..57843e9a60 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_const.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_const.c
@@ -221,8 +221,16 @@ lp_build_undef(struct lp_type type)
LLVMValueRef
lp_build_zero(struct lp_type type)
{
- LLVMTypeRef vec_type = lp_build_vec_type(type);
- return LLVMConstNull(vec_type);
+ if (type.length == 1) {
+ if (type.floating)
+ return LLVMConstReal(LLVMFloatType(), 0.0);
+ else
+ return LLVMConstInt(LLVMIntType(type.width), 0, 0);
+ }
+ else {
+ LLVMTypeRef vec_type = lp_build_vec_type(type);
+ return LLVMConstNull(vec_type);
+ }
}
@@ -255,7 +263,7 @@ lp_build_one(struct lp_type type)
if(type.sign)
/* TODO: Unfortunately this caused "Tried to create a shift operation
* on a non-integer type!" */
- vec = LLVMConstLShr(vec, lp_build_int_const_scalar(type, 1));
+ vec = LLVMConstLShr(vec, lp_build_const_int_vec(type, 1));
#endif
return vec;
@@ -264,13 +272,19 @@ lp_build_one(struct lp_type type)
for(i = 1; i < type.length; ++i)
elems[i] = elems[0];
- return LLVMConstVector(elems, type.length);
+ if (type.length == 1)
+ return elems[0];
+ else
+ return LLVMConstVector(elems, type.length);
}
+/**
+ * Build constant-valued vector from a scalar value.
+ */
LLVMValueRef
-lp_build_const_scalar(struct lp_type type,
- double val)
+lp_build_const_vec(struct lp_type type,
+ double val)
{
LLVMTypeRef elem_type = lp_build_elem_type(type);
LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
@@ -295,7 +309,7 @@ lp_build_const_scalar(struct lp_type type,
LLVMValueRef
-lp_build_int_const_scalar(struct lp_type type,
+lp_build_const_int_vec(struct lp_type type,
long long val)
{
LLVMTypeRef elem_type = lp_build_int_elem_type(type);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.h b/src/gallium/auxiliary/gallivm/lp_bld_const.h
index cb8e1c7b00..9ca2f0664e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_const.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_const.h
@@ -37,9 +37,9 @@
#define LP_BLD_CONST_H
-#include <llvm-c/Core.h>
+#include "pipe/p_compiler.h"
+#include "gallivm/lp_bld.h"
-#include <pipe/p_compiler.h>
struct lp_type;
@@ -85,13 +85,11 @@ lp_build_one(struct lp_type type);
LLVMValueRef
-lp_build_const_scalar(struct lp_type type,
- double val);
+lp_build_const_vec(struct lp_type type, double val);
LLVMValueRef
-lp_build_int_const_scalar(struct lp_type type,
- long long val);
+lp_build_const_int_vec(struct lp_type type, long long val);
LLVMValueRef
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index f77cf78721..3f7f2ebde9 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -114,13 +114,13 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
scale = (double)mask/ubound;
bias = (double)((unsigned long long)1 << (mantissa - n));
- res = LLVMBuildMul(builder, src, lp_build_const_scalar(src_type, scale), "");
- res = LLVMBuildAdd(builder, res, lp_build_const_scalar(src_type, bias), "");
+ res = LLVMBuildMul(builder, src, lp_build_const_vec(src_type, scale), "");
+ res = LLVMBuildAdd(builder, res, lp_build_const_vec(src_type, bias), "");
res = LLVMBuildBitCast(builder, res, int_vec_type, "");
if(dst_width > n) {
int shift = dst_width - n;
- res = LLVMBuildShl(builder, res, lp_build_int_const_scalar(src_type, shift), "");
+ res = LLVMBuildShl(builder, res, lp_build_const_int_vec(src_type, shift), "");
/* TODO: Fill in the empty lower bits for additional precision? */
/* YES: this fixes progs/trivial/tri-z-eq.c.
@@ -130,21 +130,21 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
#if 0
{
LLVMValueRef msb;
- msb = LLVMBuildLShr(builder, res, lp_build_int_const_scalar(src_type, dst_width - 1), "");
- msb = LLVMBuildShl(builder, msb, lp_build_int_const_scalar(src_type, shift), "");
- msb = LLVMBuildSub(builder, msb, lp_build_int_const_scalar(src_type, 1), "");
+ msb = LLVMBuildLShr(builder, res, lp_build_const_int_vec(src_type, dst_width - 1), "");
+ msb = LLVMBuildShl(builder, msb, lp_build_const_int_vec(src_type, shift), "");
+ msb = LLVMBuildSub(builder, msb, lp_build_const_int_vec(src_type, 1), "");
res = LLVMBuildOr(builder, res, msb, "");
}
#elif 0
while(shift > 0) {
- res = LLVMBuildOr(builder, res, LLVMBuildLShr(builder, res, lp_build_int_const_scalar(src_type, n), ""), "");
+ res = LLVMBuildOr(builder, res, LLVMBuildLShr(builder, res, lp_build_const_int_vec(src_type, n), ""), "");
shift -= n;
n *= 2;
}
#endif
}
else
- res = LLVMBuildAnd(builder, res, lp_build_int_const_scalar(src_type, mask), "");
+ res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(src_type, mask), "");
return res;
}
@@ -183,10 +183,10 @@ lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
if(src_width > mantissa) {
int shift = src_width - mantissa;
- res = LLVMBuildLShr(builder, res, lp_build_int_const_scalar(dst_type, shift), "");
+ res = LLVMBuildLShr(builder, res, lp_build_const_int_vec(dst_type, shift), "");
}
- bias_ = lp_build_const_scalar(dst_type, bias);
+ bias_ = lp_build_const_vec(dst_type, bias);
res = LLVMBuildOr(builder,
res,
@@ -195,7 +195,7 @@ lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
res = LLVMBuildBitCast(builder, res, vec_type, "");
res = LLVMBuildSub(builder, res, bias_, "");
- res = LLVMBuildMul(builder, res, lp_build_const_scalar(dst_type, scale), "");
+ res = LLVMBuildMul(builder, res, lp_build_const_vec(dst_type, scale), "");
return res;
}
@@ -251,7 +251,7 @@ lp_build_conv(LLVMBuilderRef builder,
if(dst_min == 0.0)
thres = bld.zero;
else
- thres = lp_build_const_scalar(src_type, dst_min);
+ thres = lp_build_const_vec(src_type, dst_min);
for(i = 0; i < num_tmps; ++i)
tmp[i] = lp_build_max(&bld, tmp[i], thres);
}
@@ -260,7 +260,7 @@ lp_build_conv(LLVMBuilderRef builder,
if(dst_max == 1.0)
thres = bld.one;
else
- thres = lp_build_const_scalar(src_type, dst_max);
+ thres = lp_build_const_vec(src_type, dst_max);
for(i = 0; i < num_tmps; ++i)
tmp[i] = lp_build_min(&bld, tmp[i], thres);
}
@@ -288,7 +288,7 @@ lp_build_conv(LLVMBuilderRef builder,
LLVMTypeRef tmp_vec_type;
if (dst_scale != 1.0) {
- LLVMValueRef scale = lp_build_const_scalar(tmp_type, dst_scale);
+ LLVMValueRef scale = lp_build_const_vec(tmp_type, dst_scale);
for(i = 0; i < num_tmps; ++i)
tmp[i] = LLVMBuildMul(builder, tmp[i], scale, "");
}
@@ -315,7 +315,7 @@ lp_build_conv(LLVMBuilderRef builder,
/* FIXME: compensate different offsets too */
if(src_shift > dst_shift) {
- LLVMValueRef shift = lp_build_int_const_scalar(tmp_type, src_shift - dst_shift);
+ LLVMValueRef shift = lp_build_const_int_vec(tmp_type, src_shift - dst_shift);
for(i = 0; i < num_tmps; ++i)
if(src_type.sign)
tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, "");
@@ -388,7 +388,7 @@ lp_build_conv(LLVMBuilderRef builder,
}
if (src_scale != 1.0) {
- LLVMValueRef scale = lp_build_const_scalar(tmp_type, 1.0/src_scale);
+ LLVMValueRef scale = lp_build_const_vec(tmp_type, 1.0/src_scale);
for(i = 0; i < num_tmps; ++i)
tmp[i] = LLVMBuildMul(builder, tmp[i], scale, "");
}
@@ -400,7 +400,7 @@ lp_build_conv(LLVMBuilderRef builder,
/* FIXME: compensate different offsets too */
if(src_shift < dst_shift) {
- LLVMValueRef shift = lp_build_int_const_scalar(tmp_type, dst_shift - src_shift);
+ LLVMValueRef shift = lp_build_const_int_vec(tmp_type, dst_shift - src_shift);
for(i = 0; i < num_tmps; ++i)
tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
}
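The conversion hunks above keep relying on the scale-and-bias trick: after multiplying by mask/ubound and adding 2^(mantissa - n), the normalized value lands in the low mantissa bits of the float, so it can be read out with a bitcast and a mask. A rough scalar C sketch of the same idea (names are illustrative; the real code is vectorized and additionally shifts left when dst_width exceeds the usable mantissa bits):

    #include <stdint.h>
    #include <string.h>

    /* Map x, already clamped to [0,1], to an n-bit unsigned normalized value
     * (n <= 23 for IEEE single precision). */
    static uint32_t
    float_to_unorm(float x, unsigned n)
    {
       const unsigned mantissa = 23;
       uint32_t mask = (1u << n) - 1;
       float scale = (float)mask / (float)(1u << n);
       float bias  = (float)(1u << (mantissa - n));
       float tmp = x * scale + bias;
       uint32_t bits;

       memcpy(&bits, &tmp, sizeof bits);   /* the bitcast */
       return bits & mask;                 /* keep the low n bits */
    }
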
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.h b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
index 948e68fae4..628831c3ad 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
@@ -37,7 +37,7 @@
#define LP_BLD_CONV_H
-#include <llvm-c/Core.h>
+#include "gallivm/lp_bld.h"
struct lp_type;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.h b/src/gallium/auxiliary/gallivm/lp_bld_debug.h
index 583e6132b4..7b010cbdb0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.h
@@ -30,7 +30,7 @@
#define LP_BLD_DEBUG_H
-#include <llvm-c/Core.h>
+#include "gallivm/lp_bld.h"
#include "pipe/p_compiler.h"
#include "util/u_string.h"
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_depth.c b/src/gallium/auxiliary/gallivm/lp_bld_depth.c
deleted file mode 100644
index f08f8eb6d8..0000000000
--- a/src/gallium/auxiliary/gallivm/lp_bld_depth.c
+++ /dev/null
@@ -1,213 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * Depth/stencil testing to LLVM IR translation.
- *
- * To be done accurately/efficiently the depth/stencil test must be done with
- * the same type/format of the depth/stencil buffer, which implies massaging
- * the incoming depths to fit into place. Using a more straightforward
- * type/format for depth/stencil values internally and only convert when
- * flushing would avoid this, but it would most likely result in depth fighting
- * artifacts.
- *
- * We are free to use a different pixel layout though. Since our basic
- * processing unit is a quad (2x2 pixel block) we store the depth/stencil
- * values tiled, a quad at time. That is, a depth buffer containing
- *
- * Z11 Z12 Z13 Z14 ...
- * Z21 Z22 Z23 Z24 ...
- * Z31 Z32 Z33 Z34 ...
- * Z41 Z42 Z43 Z44 ...
- * ... ... ... ... ...
- *
- * will actually be stored in memory as
- *
- * Z11 Z12 Z21 Z22 Z13 Z14 Z23 Z24 ...
- * Z31 Z32 Z41 Z42 Z33 Z34 Z43 Z44 ...
- * ... ... ... ... ... ... ... ... ...
- *
- * FIXME: Code generate stencil test
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-#include "pipe/p_state.h"
-#include "util/u_format.h"
-
-#include "lp_bld_type.h"
-#include "lp_bld_const.h"
-#include "lp_bld_logic.h"
-#include "lp_bld_flow.h"
-#include "lp_bld_debug.h"
-#include "lp_bld_depth.h"
-
-
-/**
- * Return a type appropriate for depth/stencil testing.
- */
-struct lp_type
-lp_depth_type(const struct util_format_description *format_desc,
- unsigned length)
-{
- struct lp_type type;
- unsigned swizzle;
-
- assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
- assert(format_desc->block.width == 1);
- assert(format_desc->block.height == 1);
-
- swizzle = format_desc->swizzle[0];
- assert(swizzle < 4);
-
- memset(&type, 0, sizeof type);
- type.width = format_desc->block.bits;
-
- if(format_desc->channel[swizzle].type == UTIL_FORMAT_TYPE_FLOAT) {
- type.floating = TRUE;
- assert(swizzle == 0);
- assert(format_desc->channel[swizzle].size == format_desc->block.bits);
- }
- else if(format_desc->channel[swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED) {
- assert(format_desc->block.bits <= 32);
- if(format_desc->channel[swizzle].normalized)
- type.norm = TRUE;
- }
- else
- assert(0);
-
- assert(type.width <= length);
- type.length = length / type.width;
-
- return type;
-}
-
-
-/**
- * Depth test.
- */
-void
-lp_build_depth_test(LLVMBuilderRef builder,
- const struct pipe_depth_state *state,
- struct lp_type type,
- const struct util_format_description *format_desc,
- struct lp_build_mask_context *mask,
- LLVMValueRef src,
- LLVMValueRef dst_ptr)
-{
- struct lp_build_context bld;
- unsigned z_swizzle;
- LLVMValueRef dst;
- LLVMValueRef z_bitmask = NULL;
- LLVMValueRef test;
-
- if(!state->enabled)
- return;
-
- assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
- assert(format_desc->block.width == 1);
- assert(format_desc->block.height == 1);
-
- z_swizzle = format_desc->swizzle[0];
- if(z_swizzle == UTIL_FORMAT_SWIZZLE_NONE)
- return;
-
- /* Sanity checking */
- assert(z_swizzle < 4);
- assert(format_desc->block.bits == type.width);
- if(type.floating) {
- assert(z_swizzle == 0);
- assert(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_FLOAT);
- assert(format_desc->channel[z_swizzle].size == format_desc->block.bits);
- }
- else {
- assert(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED);
- assert(format_desc->channel[z_swizzle].normalized);
- assert(!type.fixed);
- assert(!type.sign);
- assert(type.norm);
- }
-
- /* Setup build context */
- lp_build_context_init(&bld, builder, type);
-
- dst = LLVMBuildLoad(builder, dst_ptr, "");
-
- lp_build_name(dst, "zsbuf");
-
- /* Align the source depth bits with the destination's, and mask out any
- * stencil or padding bits from both */
- if(format_desc->channel[z_swizzle].size == format_desc->block.bits) {
- assert(z_swizzle == 0);
- /* nothing to do */
- }
- else {
- unsigned padding_left;
- unsigned padding_right;
- unsigned chan;
-
- assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
- assert(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED);
- assert(format_desc->channel[z_swizzle].size <= format_desc->block.bits);
- assert(format_desc->channel[z_swizzle].normalized);
-
- padding_right = 0;
- for(chan = 0; chan < z_swizzle; ++chan)
- padding_right += format_desc->channel[chan].size;
- padding_left = format_desc->block.bits -
- (padding_right + format_desc->channel[z_swizzle].size);
-
- if(padding_left || padding_right) {
- const unsigned long long mask_left = ((unsigned long long)1 << (format_desc->block.bits - padding_left)) - 1;
- const unsigned long long mask_right = ((unsigned long long)1 << (padding_right)) - 1;
- z_bitmask = lp_build_int_const_scalar(type, mask_left ^ mask_right);
- }
-
- if(padding_left)
- src = LLVMBuildLShr(builder, src, lp_build_int_const_scalar(type, padding_left), "");
- if(padding_right)
- src = LLVMBuildAnd(builder, src, z_bitmask, "");
- if(padding_left || padding_right)
- dst = LLVMBuildAnd(builder, dst, z_bitmask, "");
- }
-
- lp_build_name(dst, "zsbuf.z");
-
- test = lp_build_cmp(&bld, state->func, src, dst);
- lp_build_mask_update(mask, test);
-
- if(state->writemask) {
- if(z_bitmask)
- z_bitmask = LLVMBuildAnd(builder, mask->value, z_bitmask, "");
- else
- z_bitmask = mask->value;
-
- dst = lp_build_select(&bld, z_bitmask, src, dst);
- LLVMBuildStore(builder, dst, dst_ptr);
- }
-}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.c b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
index bc83138908..8f15b1d287 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
@@ -308,7 +308,7 @@ lp_build_flow_scope_end(struct lp_build_flow_context *flow)
* Note: this function has no dependencies on the flow code and could
* be used elsewhere.
*/
-static LLVMBasicBlockRef
+LLVMBasicBlockRef
lp_build_insert_new_block(LLVMBuilderRef builder, const char *name)
{
LLVMBasicBlockRef current_block;
@@ -570,6 +570,35 @@ lp_build_loop_end(LLVMBuilderRef builder,
LLVMPositionBuilderAtEnd(builder, after_block);
}
+void
+lp_build_loop_end_cond(LLVMBuilderRef builder,
+ LLVMValueRef end,
+ LLVMValueRef step,
+ int llvm_cond,
+ struct lp_build_loop_state *state)
+{
+ LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
+ LLVMValueRef function = LLVMGetBasicBlockParent(block);
+ LLVMValueRef next;
+ LLVMValueRef cond;
+ LLVMBasicBlockRef after_block;
+
+ if (!step)
+ step = LLVMConstInt(LLVMTypeOf(end), 1, 0);
+
+ next = LLVMBuildAdd(builder, state->counter, step, "");
+
+ cond = LLVMBuildICmp(builder, llvm_cond, next, end, "");
+
+ after_block = LLVMAppendBasicBlock(function, "");
+
+ LLVMBuildCondBr(builder, cond, after_block, state->block);
+
+ LLVMAddIncoming(state->counter, &next, &block, 1);
+
+ LLVMPositionBuilderAtEnd(builder, after_block);
+}
+
/*
@@ -648,7 +677,9 @@ lp_build_if(struct lp_build_if_state *ctx,
ifthen->phi[i] = LLVMBuildPhi(builder, LLVMTypeOf(*flow->variables[i]), "");
/* add add the initial value of the var from the entry block */
- LLVMAddIncoming(ifthen->phi[i], flow->variables[i], &ifthen->entry_block, 1);
+ if (!LLVMIsUndef(*flow->variables[i]))
+ LLVMAddIncoming(ifthen->phi[i], flow->variables[i],
+ &ifthen->entry_block, 1);
}
/* create/insert true_block before merge_block */
@@ -695,18 +726,21 @@ lp_build_endif(struct lp_build_if_state *ctx)
{
struct lp_build_flow_context *flow = ctx->flow;
struct lp_build_flow_if *ifthen;
+ LLVMBasicBlockRef curBlock = LLVMGetInsertBlock(ctx->builder);
unsigned i;
ifthen = &lp_build_flow_pop(flow, LP_BUILD_FLOW_IF)->ifthen;
assert(ifthen);
+ /* Insert branch to the merge block from current block */
+ LLVMBuildBr(ctx->builder, ifthen->merge_block);
+
if (ifthen->false_block) {
LLVMPositionBuilderAtEnd(ctx->builder, ifthen->merge_block);
/* for each variable, update the Phi node with a (variable, block) pair */
for (i = 0; i < flow->num_variables; i++) {
assert(*flow->variables[i]);
- LLVMAddIncoming(ifthen->phi[i], flow->variables[i], &ifthen->false_block, 1);
-
+ LLVMAddIncoming(ifthen->phi[i], flow->variables[i], &curBlock, 1);
/* replace the variable ref with the phi function */
*flow->variables[i] = ifthen->phi[i];
}
@@ -742,16 +776,94 @@ lp_build_endif(struct lp_build_if_state *ctx)
ifthen->true_block, ifthen->merge_block);
}
- /* Append an unconditional Br(anch) instruction on the true_block */
- LLVMPositionBuilderAtEnd(ctx->builder, ifthen->true_block);
- LLVMBuildBr(ctx->builder, ifthen->merge_block);
+ /* Insert branch from end of true_block to merge_block */
if (ifthen->false_block) {
- /* Append an unconditional Br(anch) instruction on the false_block */
- LLVMPositionBuilderAtEnd(ctx->builder, ifthen->false_block);
+ /* Append an unconditional Br(anch) instruction on the true_block */
+ LLVMPositionBuilderAtEnd(ctx->builder, ifthen->true_block);
LLVMBuildBr(ctx->builder, ifthen->merge_block);
}
-
+ else {
+ /* No else clause.
+ * Note that we've already inserted the branch at the end of
+ * true_block. See the very first LLVMBuildBr() call in this function.
+ */
+ }
/* Resume building code at end of the ifthen->merge_block */
LLVMPositionBuilderAtEnd(ctx->builder, ifthen->merge_block);
}
+
+
+/**
+ * Allocate a scalar (or vector) variable.
+ *
+ * Although not strictly part of control flow, control flow has a deep impact on
+ * how variables should be allocated.
+ *
+ * The mem2reg optimization pass is the recommended way of dealing with mutable
+ * variables and SSA. It looks for allocas and, if it can handle them, it
+ * promotes them, but only looks for alloca instructions in the entry block of
+ * the function. Being in the entry block guarantees that the alloca is only
+ * executed once, which makes analysis simpler.
+ *
+ * See also:
+ * - http://www.llvm.org/docs/tutorial/OCamlLangImpl7.html#memory
+ */
+LLVMValueRef
+lp_build_alloca(LLVMBuilderRef builder,
+ LLVMTypeRef type,
+ const char *name)
+{
+ LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
+ LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
+ LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
+ LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
+ LLVMBuilderRef first_builder = LLVMCreateBuilder();
+ LLVMValueRef res;
+
+ LLVMPositionBuilderAtEnd(first_builder, first_block);
+ LLVMPositionBuilderBefore(first_builder, first_instr);
+
+ res = LLVMBuildAlloca(first_builder, type, name);
+
+ LLVMDisposeBuilder(first_builder);
+
+ return res;
+}
+
+
+/**
+ * Allocate an array of scalars/vectors.
+ *
+ * The mem2reg pass is not capable of promoting structs or arrays to registers, but
+ * we still put the alloca in the first block anyway, as failure to put allocas in
+ * the first block may prevent the X86 backend from successfully aligning the stack
+ * as required.
+ *
+ * Also, the scalarrepl pass is supposedly more powerful and can promote
+ * arrays in many cases.
+ *
+ * See also:
+ * - http://www.llvm.org/docs/tutorial/OCamlLangImpl7.html#memory
+ */
+LLVMValueRef
+lp_build_array_alloca(LLVMBuilderRef builder,
+ LLVMTypeRef type,
+ LLVMValueRef count,
+ const char *name)
+{
+ LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
+ LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
+ LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
+ LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
+ LLVMBuilderRef first_builder = LLVMCreateBuilder();
+ LLVMValueRef res;
+
+ LLVMPositionBuilderBefore(first_builder, first_instr);
+
+ res = LLVMBuildArrayAlloca(first_builder, type, count, name);
+
+ LLVMDisposeBuilder(first_builder);
+
+ return res;
+}
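A minimal usage sketch for the new entry-block alloca helper, assuming a builder positioned somewhere inside a generated function (the variable name is illustrative); lp_build_loop_end_cond works like lp_build_loop_end but lets the caller pick the integer predicate that decides whether another iteration runs:

    /* Mutable accumulator; being in the entry block lets mem2reg promote it. */
    LLVMValueRef acc_ptr = lp_build_alloca(builder, LLVMFloatType(), "acc");

    LLVMBuildStore(builder, LLVMConstReal(LLVMFloatType(), 0.0), acc_ptr);
    /* ... emit loop body that loads, updates and stores acc_ptr ... */
    LLVMValueRef acc = LLVMBuildLoad(builder, acc_ptr, "acc");
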
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.h b/src/gallium/auxiliary/gallivm/lp_bld_flow.h
index 4c225a0d4f..fffb493a93 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.h
@@ -35,7 +35,7 @@
#define LP_BLD_FLOW_H
-#include <llvm-c/Core.h>
+#include "gallivm/lp_bld.h"
struct lp_type;
@@ -124,6 +124,13 @@ lp_build_loop_end(LLVMBuilderRef builder,
LLVMValueRef step,
struct lp_build_loop_state *state);
+void
+lp_build_loop_end_cond(LLVMBuilderRef builder,
+ LLVMValueRef end,
+ LLVMValueRef step,
+ int cond, /* LLVM condition */
+ struct lp_build_loop_state *state);
+
@@ -145,7 +152,19 @@ lp_build_else(struct lp_build_if_state *ctx);
void
lp_build_endif(struct lp_build_if_state *ctx);
-
+LLVMBasicBlockRef
+lp_build_insert_new_block(LLVMBuilderRef builder, const char *name);
+
+LLVMValueRef
+lp_build_alloca(LLVMBuilderRef builder,
+ LLVMTypeRef type,
+ const char *name);
+
+LLVMValueRef
+lp_build_array_alloca(LLVMBuilderRef builder,
+ LLVMTypeRef type,
+ LLVMValueRef count,
+ const char *name);
#endif /* !LP_BLD_FLOW_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h b/src/gallium/auxiliary/gallivm/lp_bld_format.h
index 970bee379f..085937588f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h
@@ -34,7 +34,7 @@
* Pixel format helpers.
*/
-#include <llvm-c/Core.h>
+#include "gallivm/lp_bld.h"
#include "pipe/p_format.h"
@@ -42,35 +42,37 @@ struct util_format_description;
struct lp_type;
-boolean
-lp_format_is_rgba8(const struct util_format_description *desc);
-
-
-void
-lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
- struct lp_type type,
- const LLVMValueRef *unswizzled,
- LLVMValueRef *swizzled);
-
+/*
+ * AoS
+ */
LLVMValueRef
lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
const struct util_format_description *desc,
LLVMValueRef packed);
-
-LLVMValueRef
-lp_build_unpack_rgba8_aos(LLVMBuilderRef builder,
- const struct util_format_description *desc,
- struct lp_type type,
- LLVMValueRef packed);
-
-
LLVMValueRef
lp_build_pack_rgba_aos(LLVMBuilderRef builder,
const struct util_format_description *desc,
LLVMValueRef rgba);
+LLVMValueRef
+lp_build_fetch_rgba_aos(LLVMBuilderRef builder,
+ const struct util_format_description *format_desc,
+ LLVMValueRef ptr,
+ LLVMValueRef i,
+ LLVMValueRef j);
+
+
+/*
+ * SoA
+ */
+
+void
+lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
+ struct lp_type type,
+ const LLVMValueRef *unswizzled,
+ LLVMValueRef *swizzled);
void
lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
@@ -80,4 +82,15 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
LLVMValueRef *rgba);
+void
+lp_build_fetch_rgba_soa(LLVMBuilderRef builder,
+ const struct util_format_description *format_desc,
+ struct lp_type type,
+ LLVMValueRef base_ptr,
+ LLVMValueRef offsets,
+ LLVMValueRef i,
+ LLVMValueRef j,
+ LLVMValueRef *rgba);
+
+
#endif /* !LP_BLD_FORMAT_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index a07f7418f2..6257e9a404 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -33,12 +33,14 @@
*/
-#include "util/u_cpu_detect.h"
#include "util/u_format.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "util/u_string.h"
+#include "lp_bld_init.h"
#include "lp_bld_type.h"
-#include "lp_bld_const.h"
-#include "lp_bld_swizzle.h"
+#include "lp_bld_flow.h"
#include "lp_bld_format.h"
@@ -48,16 +50,12 @@
* @param packed integer.
*
* @return RGBA in a 4 floats vector.
- *
- * XXX: This is mostly for reference and testing -- operating a single pixel at
- * a time is rarely if ever needed.
*/
LLVMValueRef
lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
const struct util_format_description *desc,
LLVMValueRef packed)
{
- LLVMTypeRef type;
LLVMValueRef shifted, casted, scaled, masked;
LLVMValueRef shifts[4];
LLVMValueRef masks[4];
@@ -66,17 +64,16 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
LLVMValueRef aux[4];
bool normalized;
int empty_channel;
+ bool needs_uitofp;
unsigned shift;
unsigned i;
- /* FIXME: Support more formats */
+ /* TODO: Support more formats */
assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
assert(desc->block.width == 1);
assert(desc->block.height == 1);
assert(desc->block.bits <= 32);
- type = LLVMIntType(desc->block.bits);
-
/* Do the intermediate integer computations with 32bit integers since it
* matches floating point size */
if (desc->block.bits < 32)
@@ -96,6 +93,7 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
/* Initialize vector constants */
normalized = FALSE;
+ needs_uitofp = FALSE;
empty_channel = -1;
shift = 0;
for (i = 0; i < 4; ++i) {
@@ -108,10 +106,13 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
empty_channel = i;
}
else {
- unsigned mask = (1 << bits) - 1;
+ unsigned long long mask = (1ULL << bits) - 1;
assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED);
- assert(bits < 32);
+
+ if (bits == 32) {
+ needs_uitofp = TRUE;
+ }
shifts[i] = LLVMConstInt(LLVMInt32Type(), shift, 0);
masks[i] = LLVMConstInt(LLVMInt32Type(), mask, 0);
@@ -129,8 +130,12 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
- /* UIToFP can't be expressed in SSE2 */
- casted = LLVMBuildSIToFP(builder, masked, LLVMVectorType(LLVMFloatType(), 4), "");
+ if (!needs_uitofp) {
+ /* UIToFP can't be expressed in SSE2 */
+ casted = LLVMBuildSIToFP(builder, masked, LLVMVectorType(LLVMFloatType(), 4), "");
+ } else {
+ casted = LLVMBuildUIToFP(builder, masked, LLVMVectorType(LLVMFloatType(), 4), "");
+ }
if (normalized)
scaled = LLVMBuildMul(builder, casted, LLVMConstVector(scales, 4), "");
@@ -141,7 +146,22 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
aux[i] = LLVMGetUndef(LLVMFloatType());
for (i = 0; i < 4; ++i) {
- enum util_format_swizzle swizzle = desc->swizzle[i];
+ enum util_format_swizzle swizzle;
+
+ if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
+ /*
+ * For ZS formats do RGBA = ZZZ1
+ */
+ if (i == 3) {
+ swizzle = UTIL_FORMAT_SWIZZLE_1;
+ } else if (desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_NONE) {
+ swizzle = UTIL_FORMAT_SWIZZLE_0;
+ } else {
+ swizzle = desc->swizzle[0];
+ }
+ } else {
+ swizzle = desc->swizzle[i];
+ }
switch (swizzle) {
case UTIL_FORMAT_SWIZZLE_X:
@@ -170,117 +190,6 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
/**
- * Take a vector with packed pixels and unpack into a rgba8 vector.
- *
- * Formats with bit depth smaller than 32bits are accepted, but they must be
- * padded to 32bits.
- */
-LLVMValueRef
-lp_build_unpack_rgba8_aos(LLVMBuilderRef builder,
- const struct util_format_description *desc,
- struct lp_type type,
- LLVMValueRef packed)
-{
- struct lp_build_context bld;
- bool rgba8;
- LLVMValueRef res;
- unsigned i;
-
- lp_build_context_init(&bld, builder, type);
-
- /* FIXME: Support more formats */
- assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
- assert(desc->block.width == 1);
- assert(desc->block.height == 1);
- assert(desc->block.bits <= 32);
-
- assert(!type.floating);
- assert(!type.fixed);
- assert(type.norm);
- assert(type.width == 8);
- assert(type.length % 4 == 0);
-
- rgba8 = TRUE;
- for(i = 0; i < 4; ++i) {
- assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED ||
- desc->channel[i].type == UTIL_FORMAT_TYPE_VOID);
- if(desc->channel[0].size != 8)
- rgba8 = FALSE;
- }
-
- if(rgba8) {
- /*
- * The pixel is already in a rgba8 format variant. All it is necessary
- * is to swizzle the channels.
- */
-
- unsigned char swizzles[4];
- boolean zeros[4]; /* bitwise AND mask */
- boolean ones[4]; /* bitwise OR mask */
- boolean swizzles_needed = FALSE;
- boolean zeros_needed = FALSE;
- boolean ones_needed = FALSE;
-
- for(i = 0; i < 4; ++i) {
- enum util_format_swizzle swizzle = desc->swizzle[i];
-
- /* Initialize with the no-op case */
- swizzles[i] = util_cpu_caps.little_endian ? 3 - i : i;
- zeros[i] = TRUE;
- ones[i] = FALSE;
-
- switch (swizzle) {
- case UTIL_FORMAT_SWIZZLE_X:
- case UTIL_FORMAT_SWIZZLE_Y:
- case UTIL_FORMAT_SWIZZLE_Z:
- case UTIL_FORMAT_SWIZZLE_W:
- if(swizzle != swizzles[i]) {
- swizzles[i] = swizzle;
- swizzles_needed = TRUE;
- }
- break;
- case UTIL_FORMAT_SWIZZLE_0:
- zeros[i] = FALSE;
- zeros_needed = TRUE;
- break;
- case UTIL_FORMAT_SWIZZLE_1:
- ones[i] = TRUE;
- ones_needed = TRUE;
- break;
- case UTIL_FORMAT_SWIZZLE_NONE:
- assert(0);
- break;
- }
- }
-
- res = packed;
-
- if(swizzles_needed)
- res = lp_build_swizzle1_aos(&bld, res, swizzles);
-
- if(zeros_needed) {
- /* Mask out zero channels */
- LLVMValueRef mask = lp_build_const_mask_aos(type, zeros);
- res = LLVMBuildAnd(builder, res, mask, "");
- }
-
- if(ones_needed) {
- /* Or one channels */
- LLVMValueRef mask = lp_build_const_mask_aos(type, ones);
- res = LLVMBuildOr(builder, res, mask, "");
- }
- }
- else {
- /* FIXME */
- assert(0);
- res = lp_build_undef(type);
- }
-
- return res;
-}
-
-
-/**
* Pack a single pixel.
*
* @param rgba 4 float vector with the unpacked components.
@@ -381,3 +290,106 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder,
return packed;
}
+
+
+/**
+ * Fetch a pixel into a 4 float AoS.
+ *
+ * i and j are the sub-block pixel coordinates.
+ */
+LLVMValueRef
+lp_build_fetch_rgba_aos(LLVMBuilderRef builder,
+ const struct util_format_description *format_desc,
+ LLVMValueRef ptr,
+ LLVMValueRef i,
+ LLVMValueRef j)
+{
+
+ if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
+ (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
+ format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
+ format_desc->block.width == 1 &&
+ format_desc->block.height == 1 &&
+ util_is_pot(format_desc->block.bits) &&
+ format_desc->block.bits <= 32 &&
+ format_desc->is_bitmask &&
+ !format_desc->is_mixed &&
+ (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED ||
+ format_desc->channel[1].type == UTIL_FORMAT_TYPE_UNSIGNED))
+ {
+ LLVMValueRef packed;
+
+ ptr = LLVMBuildBitCast(builder, ptr,
+ LLVMPointerType(LLVMIntType(format_desc->block.bits), 0) ,
+ "");
+
+ packed = LLVMBuildLoad(builder, ptr, "packed");
+
+ return lp_build_unpack_rgba_aos(builder, format_desc, packed);
+ }
+ else if (format_desc->fetch_rgba_float) {
+ /*
+ * Fall back to calling util_format_description::fetch_rgba_float.
+ *
+ * This is definitely not the most efficient way of fetching pixels, as
+ * we miss the opportunity to do vectorization, but it is convenient
+ * for formats or scenarios for which there was no opportunity
+ * or incentive to optimize.
+ */
+
+ LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder)));
+ char name[256];
+ LLVMValueRef function;
+ LLVMValueRef tmp;
+ LLVMValueRef args[4];
+
+ util_snprintf(name, sizeof name, "util_format_%s_fetch_rgba_float",
+ format_desc->short_name);
+
+ /*
+ * Declare and bind format_desc->fetch_rgba_float().
+ */
+
+ function = LLVMGetNamedFunction(module, name);
+ if (!function) {
+ LLVMTypeRef ret_type;
+ LLVMTypeRef arg_types[4];
+ LLVMTypeRef function_type;
+
+ ret_type = LLVMVoidType();
+ arg_types[0] = LLVMPointerType(LLVMFloatType(), 0);
+ arg_types[1] = LLVMPointerType(LLVMInt8Type(), 0);
+ arg_types[3] = arg_types[2] = LLVMIntType(sizeof(unsigned) * 8);
+ function_type = LLVMFunctionType(ret_type, arg_types, Elements(arg_types), 0);
+ function = LLVMAddFunction(module, name, function_type);
+
+ LLVMSetFunctionCallConv(function, LLVMCCallConv);
+ LLVMSetLinkage(function, LLVMExternalLinkage);
+
+ assert(LLVMIsDeclaration(function));
+
+ LLVMAddGlobalMapping(lp_build_engine, function, format_desc->fetch_rgba_float);
+ }
+
+ tmp = lp_build_alloca(builder, LLVMVectorType(LLVMFloatType(), 4), "");
+
+ /*
+ * Invoke format_desc->fetch_rgba_float() for this pixel and read the result
+ * back from the temporary.
+ */
+
+ args[0] = LLVMBuildBitCast(builder, tmp,
+ LLVMPointerType(LLVMFloatType(), 0), "");
+ args[1] = ptr;
+ args[2] = i;
+ args[3] = j;
+
+ LLVMBuildCall(builder, function, args, 4, "");
+
+ return LLVMBuildLoad(builder, tmp, "");
+ }
+ else {
+ assert(0);
+ return LLVMGetUndef(LLVMVectorType(LLVMFloatType(), 4));
+ }
+}
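A hedged caller-side sketch of the new AoS fetch entry point; texel_ptr, i and j stand for values the caller already has (i and j being the sub-block pixel coordinates mentioned in the comment above):

    const struct util_format_description *desc =
       util_format_description(PIPE_FORMAT_B8G8R8A8_UNORM);

    /* rgba holds one texel as a <4 x float>, swizzled to RGBA order. */
    LLVMValueRef rgba = lp_build_fetch_rgba_aos(builder, desc, texel_ptr, i, j);
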
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_query.c b/src/gallium/auxiliary/gallivm/lp_bld_format_query.c
deleted file mode 100644
index f3832d07ff..0000000000
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_query.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * Utility functions to make assertions about formats.
- *
- * This module centralizes most of logic used when determining what algorithm
- * is most suitable (i.e., most efficient yet correct) for a given format.
- *
- * It might be possible to move some of these functions to u_format module,
- * but since tiny differences in the format my render it more/less
- * appropriate to a given algorithm it is impossible to make any long term
- * guarantee about the semantics of these functions.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#include "util/u_format.h"
-
-#include "lp_bld_format.h"
-
-
-/**
- * Whether this format is a 4 rgba8 variant
- */
-boolean
-lp_format_is_rgba8(const struct util_format_description *desc)
-{
- unsigned chan;
-
- if(desc->block.width != 1 ||
- desc->block.height != 1 ||
- desc->block.bits != 32)
- return FALSE;
-
- for(chan = 0; chan < 4; ++chan) {
- if(desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED &&
- desc->channel[chan].type != UTIL_FORMAT_TYPE_SIGNED &&
- desc->channel[chan].type != UTIL_FORMAT_TYPE_VOID)
- return FALSE;
- if(desc->channel[chan].size != 8)
- return FALSE;
- }
-
- return TRUE;
-}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index abb27e4c32..26b947b3b1 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -27,10 +27,13 @@
#include "util/u_format.h"
+#include "util/u_memory.h"
+#include "util/u_string.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_conv.h"
+#include "lp_bld_sample.h" /* for lp_build_gather */
#include "lp_bld_format.h"
@@ -80,6 +83,24 @@ lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
}
+/**
+ * Unpack several pixels in SoA.
+ *
+ * It takes a vector of packed pixels:
+ *
+ * packed = {P0, P1, P2, P3, ..., Pn}
+ *
+ * And will produce four vectors:
+ *
+ * red = {R0, R1, R2, R3, ..., Rn}
+ * green = {G0, G1, G2, G3, ..., Gn}
+ * blue = {B0, B1, B2, B3, ..., Bn}
+ * alpha = {A0, A1, A2, A3, ..., An}
+ *
+ * It requires that a packed pixel fits into an element of the output
+ * channels. The common case is converting pixels with a depth of 32 bits or
+ * less into floats.
+ */
void
lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
const struct util_format_description *format_desc,
@@ -91,15 +112,17 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
unsigned start;
unsigned chan;
- /* FIXME: Support more formats */
assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
assert(format_desc->block.width == 1);
assert(format_desc->block.height == 1);
- assert(format_desc->block.bits <= 32);
+ assert(format_desc->block.bits <= type.width);
+ /* FIXME: Support more output types */
+ assert(type.floating);
+ assert(type.width == 32);
/* Decode the input vector components */
start = 0;
- for (chan = 0; chan < 4; ++chan) {
+ for (chan = 0; chan < format_desc->nr_channels; ++chan) {
unsigned width = format_desc->channel[chan].size;
unsigned stop = start + width;
LLVMValueRef input;
@@ -108,22 +131,106 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
switch(format_desc->channel[chan].type) {
case UTIL_FORMAT_TYPE_VOID:
- input = NULL;
+ input = lp_build_undef(type);
break;
case UTIL_FORMAT_TYPE_UNSIGNED:
- if(type.floating) {
- if(start)
- input = LLVMBuildLShr(builder, input, lp_build_int_const_scalar(type, start), "");
- if(stop < format_desc->block.bits) {
- unsigned mask = ((unsigned long long)1 << width) - 1;
- input = LLVMBuildAnd(builder, input, lp_build_int_const_scalar(type, mask), "");
- }
+ /*
+ * Align the LSB
+ */
+
+ if (start) {
+ input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(type, start), "");
+ }
+
+ /*
+ * Zero the MSBs
+ */
+ if (stop < format_desc->block.bits) {
+ unsigned mask = ((unsigned long long)1 << width) - 1;
+ input = LLVMBuildAnd(builder, input, lp_build_const_int_vec(type, mask), "");
+ }
+
+ /*
+ * Type conversion
+ */
+
+ if (type.floating) {
if(format_desc->channel[chan].normalized)
input = lp_build_unsigned_norm_to_float(builder, width, type, input);
else
- input = LLVMBuildFPToSI(builder, input, lp_build_vec_type(type), "");
+ input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(type), "");
+ }
+ else {
+ /* FIXME */
+ assert(0);
+ input = lp_build_undef(type);
+ }
+
+ break;
+
+ case UTIL_FORMAT_TYPE_SIGNED:
+ /*
+ * Align the sign bit first.
+ */
+
+ if (stop < type.width) {
+ unsigned bits = type.width - stop;
+ LLVMValueRef bits_val = lp_build_const_int_vec(type, bits);
+ input = LLVMBuildShl(builder, input, bits_val, "");
+ }
+
+ /*
+ * Align the LSB (with an arithmetic shift to preserve the sign)
+ */
+
+ if (format_desc->channel[chan].size < type.width) {
+ unsigned bits = type.width - format_desc->channel[chan].size;
+ LLVMValueRef bits_val = lp_build_const_int_vec(type, bits);
+ input = LLVMBuildAShr(builder, input, bits_val, "");
+ }
+
+ /*
+ * Type conversion
+ */
+
+ if (type.floating) {
+ input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(type), "");
+ if (format_desc->channel[chan].normalized) {
+ double scale = 1.0 / ((1 << (format_desc->channel[chan].size - 1)) - 1);
+ LLVMValueRef scale_val = lp_build_const_vec(type, scale);
+ input = LLVMBuildMul(builder, input, scale_val, "");
+ }
+ }
+ else {
+ /* FIXME */
+ assert(0);
+ input = lp_build_undef(type);
+ }
+
+ break;
+
+ case UTIL_FORMAT_TYPE_FLOAT:
+ if (type.floating) {
+ assert(start == 0);
+ assert(stop == 32);
+ assert(type.width == 32);
+ input = LLVMBuildBitCast(builder, input, lp_build_vec_type(type), "");
+ }
+ else {
+ /* FIXME */
+ assert(0);
+ input = lp_build_undef(type);
+ }
+ break;
+
+ case UTIL_FORMAT_TYPE_FIXED:
+ if (type.floating) {
+ double scale = 1.0 / ((1 << (format_desc->channel[chan].size/2)) - 1);
+ LLVMValueRef scale_val = lp_build_const_vec(type, scale);
+ input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(type), "");
+ input = LLVMBuildMul(builder, input, scale_val, "");
}
else {
/* FIXME */
@@ -133,7 +240,7 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
break;
default:
- /* fall through */
+ assert(0);
input = lp_build_undef(type);
break;
}
@@ -145,3 +252,100 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
lp_build_format_swizzle_soa(format_desc, type, inputs, rgba);
}
+
+
+/**
+ * Fetch a pixel into a SoA.
+ *
+ * i and j are the sub-block pixel coordinates.
+ */
+void
+lp_build_fetch_rgba_soa(LLVMBuilderRef builder,
+ const struct util_format_description *format_desc,
+ struct lp_type type,
+ LLVMValueRef base_ptr,
+ LLVMValueRef offset,
+ LLVMValueRef i,
+ LLVMValueRef j,
+ LLVMValueRef *rgba)
+{
+
+ if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
+ (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
+ format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
+ format_desc->block.width == 1 &&
+ format_desc->block.height == 1 &&
+ format_desc->block.bits <= type.width &&
+ (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
+ format_desc->channel[0].size == 32))
+ {
+ /*
+ * The packed pixel fits into an element of the destination format. Put
+ * the packed pixels into a vector and extract each component for all
+ * vector elements in parallel.
+ */
+
+ LLVMValueRef packed;
+
+ /*
+ * gather the texels from the texture
+ */
+ packed = lp_build_gather(builder,
+ type.length,
+ format_desc->block.bits,
+ type.width,
+ base_ptr, offset);
+
+ /*
+ * convert texels to float rgba
+ */
+ lp_build_unpack_rgba_soa(builder,
+ format_desc,
+ type,
+ packed, rgba);
+ }
+ else {
+ /*
+ * Fall back to calling lp_build_fetch_rgba_aos for each pixel.
+ *
+ * This is not the most efficient way of fetching pixels, as
+ * we miss some opportunities to do vectorization, but it is convenient
+ * for formats or scenarios for which there was no opportunity
+ * or incentive to optimize.
+ */
+
+ unsigned k, chan;
+
+ assert(type.floating);
+
+ for (chan = 0; chan < 4; ++chan) {
+ rgba[chan] = lp_build_undef(type);
+ }
+
+ for(k = 0; k < type.length; ++k) {
+ LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), k, 0);
+ LLVMValueRef offset_elem;
+ LLVMValueRef ptr;
+ LLVMValueRef i_elem, j_elem;
+ LLVMValueRef tmp;
+
+ offset_elem = LLVMBuildExtractElement(builder, offset, index, "");
+ ptr = LLVMBuildGEP(builder, base_ptr, &offset_elem, 1, "");
+
+ i_elem = LLVMBuildExtractElement(builder, i, index, "");
+ j_elem = LLVMBuildExtractElement(builder, j, index, "");
+
+ tmp = lp_build_fetch_rgba_aos(builder, format_desc, ptr, i_elem, j_elem);
+
+ /*
+ * AoS to SoA
+ */
+
+ for (chan = 0; chan < 4; ++chan) {
+ LLVMValueRef chan_val = LLVMConstInt(LLVMInt32Type(), chan, 0),
+ tmp_chan = LLVMBuildExtractElement(builder, tmp, chan_val, "");
+ rgba[chan] = LLVMBuildInsertElement(builder, rgba[chan], tmp_chan, index, "");
+ }
+ }
+ }
+}
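The per-channel steps the SoA unpack performs (shift the channel down to the LSB, mask off the higher bits, then normalize) can be illustrated with a scalar C sketch for an 8-bit unsigned normalized channel; note the real code uses lp_build_unsigned_norm_to_float rather than a plain multiply:

    #include <stdint.h>

    static float
    unpack_unorm8(uint32_t pixel, unsigned start)
    {
       uint32_t bits = (pixel >> start) & 0xff;   /* align LSB, zero the MSBs */
       return (float)bits * (1.0f / 255.0f);      /* normalize to [0, 1] */
    }
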
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.cpp b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index 067397a520..5067d0a164 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -26,33 +26,52 @@
**************************************************************************/
-#include <llvm/Config/config.h>
-#include <llvm/Target/TargetSelect.h>
-#include <llvm/Target/TargetOptions.h>
-
-#include "pipe/p_config.h"
-
+#include "pipe/p_compiler.h"
+#include "util/u_cpu_detect.h"
+#include "util/u_debug.h"
#include "lp_bld_init.h"
-extern "C" void LLVMLinkInJIT();
+LLVMModuleRef lp_build_module = NULL;
+LLVMExecutionEngineRef lp_build_engine = NULL;
+LLVMModuleProviderRef lp_build_provider = NULL;
+LLVMTargetDataRef lp_build_target = NULL;
-extern "C" void
+void
lp_build_init(void)
{
-#if defined(PIPE_OS_WINDOWS) && defined(PIPE_ARCH_X86)
- /*
- * This is mis-detected on some hardware / software combinations.
- */
- llvm::StackAlignment = 4;
- llvm::RealignStack = true;
-#endif
-
- /* Same as LLVMInitializeNativeTarget(); */
- llvm::InitializeNativeTarget();
+ LLVMInitializeNativeTarget();
LLVMLinkInJIT();
+
+ if (!lp_build_module)
+ lp_build_module = LLVMModuleCreateWithName("gallivm");
+
+ if (!lp_build_provider)
+ lp_build_provider = LLVMCreateModuleProviderForExistingModule(lp_build_module);
+
+ if (!lp_build_engine) {
+ char *error = NULL;
+
+ if (LLVMCreateJITCompiler(&lp_build_engine, lp_build_provider, 1, &error)) {
+ _debug_printf("%s\n", error);
+ LLVMDisposeMessage(error);
+ assert(0);
+ }
+ }
+
+ if (!lp_build_target)
+ lp_build_target = LLVMGetExecutionEngineTargetData(lp_build_engine);
+
+ util_cpu_detect();
+
+#if 0
+ /* For simulating less capable machines */
+ util_cpu_caps.has_sse3 = 0;
+ util_cpu_caps.has_ssse3 = 0;
+ util_cpu_caps.has_sse4_1 = 0;
+#endif
}
@@ -64,6 +83,6 @@ lp_build_init(void)
*/
#if defined(_MSC_VER) && defined(_DEBUG)
#include <crtdefs.h>
-extern "C" _CRTIMP void __cdecl
+_CRTIMP void __cdecl
_invalid_parameter_noinfo(void) {}
#endif
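With the module, provider and engine now exposed as globals, a typical use (a sketch only; the shader name is made up) is to emit functions into lp_build_module and JIT them through lp_build_engine:

    lp_build_init();

    LLVMValueRef func = LLVMAddFunction(lp_build_module, "my_shader",
                                        LLVMFunctionType(LLVMFloatType(),
                                                         NULL, 0, 0));
    /* ... build basic blocks and IR for func ... */

    void *jitted = LLVMGetPointerToGlobal(lp_build_engine, func);
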
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.h b/src/gallium/auxiliary/gallivm/lp_bld_init.h
index 07f50d1c43..0ec2afcd1b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.h
@@ -30,18 +30,18 @@
#define LP_BLD_INIT_H
-#ifdef __cplusplus
-extern "C" {
-#endif
+#include "lp_bld.h"
+#include <llvm-c/ExecutionEngine.h>
-void
-lp_build_init(void);
+extern LLVMModuleRef lp_build_module;
+extern LLVMExecutionEngineRef lp_build_engine;
+extern LLVMModuleProviderRef lp_build_provider;
+extern LLVMTargetDataRef lp_build_target;
-#ifdef __cplusplus
-}
-#endif
+void
+lp_build_init(void);
#endif /* !LP_BLD_INIT_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_interp.c b/src/gallium/auxiliary/gallivm/lp_bld_interp.c
deleted file mode 100644
index 2fc894017d..0000000000
--- a/src/gallium/auxiliary/gallivm/lp_bld_interp.c
+++ /dev/null
@@ -1,408 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * Position and shader input interpolation.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-#include "pipe/p_shader_tokens.h"
-#include "util/u_debug.h"
-#include "util/u_memory.h"
-#include "util/u_math.h"
-#include "tgsi/tgsi_parse.h"
-#include "lp_bld_debug.h"
-#include "lp_bld_const.h"
-#include "lp_bld_arit.h"
-#include "lp_bld_swizzle.h"
-#include "lp_bld_interp.h"
-
-
-/*
- * The shader JIT function operates on blocks of quads.
- * Each block has 2x2 quads and each quad has 2x2 pixels.
- *
- * We iterate over the quads in order 0, 1, 2, 3:
- *
- * #################
- * # | # | #
- * #---0---#---1---#
- * # | # | #
- * #################
- * # | # | #
- * #---2---#---3---#
- * # | # | #
- * #################
- *
- * Within each quad, we have four pixels which are represented in SOA
- * order:
- *
- * #########
- * # 0 | 1 #
- * #---+---#
- * # 2 | 3 #
- * #########
- *
- * So the green channel (for example) of the four pixels is stored in
- * a single vector register: {g0, g1, g2, g3}.
- */
-
-
-static void
-attrib_name(LLVMValueRef val, unsigned attrib, unsigned chan, const char *suffix)
-{
- if(attrib == 0)
- lp_build_name(val, "pos.%c%s", "xyzw"[chan], suffix);
- else
- lp_build_name(val, "input%u.%c%s", attrib - 1, "xyzw"[chan], suffix);
-}
-
-
-/**
- * Initialize the bld->a0, dadx, dady fields. This involves fetching
- * those values from the arrays which are passed into the JIT function.
- */
-static void
-coeffs_init(struct lp_build_interp_soa_context *bld,
- LLVMValueRef a0_ptr,
- LLVMValueRef dadx_ptr,
- LLVMValueRef dady_ptr)
-{
- LLVMBuilderRef builder = bld->base.builder;
- unsigned attrib;
- unsigned chan;
-
- for(attrib = 0; attrib < bld->num_attribs; ++attrib) {
- unsigned mask = bld->mask[attrib];
- unsigned mode = bld->mode[attrib];
- for(chan = 0; chan < NUM_CHANNELS; ++chan) {
- if(mask & (1 << chan)) {
- LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), attrib*NUM_CHANNELS + chan, 0);
- LLVMValueRef a0 = NULL;
- LLVMValueRef dadx = NULL;
- LLVMValueRef dady = NULL;
-
- switch( mode ) {
- case TGSI_INTERPOLATE_PERSPECTIVE:
- /* fall-through */
-
- case TGSI_INTERPOLATE_LINEAR:
- dadx = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dadx_ptr, &index, 1, ""), "");
- dady = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dady_ptr, &index, 1, ""), "");
- dadx = lp_build_broadcast_scalar(&bld->base, dadx);
- dady = lp_build_broadcast_scalar(&bld->base, dady);
- attrib_name(dadx, attrib, chan, ".dadx");
- attrib_name(dady, attrib, chan, ".dady");
- /* fall-through */
-
- case TGSI_INTERPOLATE_CONSTANT:
- a0 = LLVMBuildLoad(builder, LLVMBuildGEP(builder, a0_ptr, &index, 1, ""), "");
- a0 = lp_build_broadcast_scalar(&bld->base, a0);
- attrib_name(a0, attrib, chan, ".a0");
- break;
-
- default:
- assert(0);
- break;
- }
-
- bld->a0 [attrib][chan] = a0;
- bld->dadx[attrib][chan] = dadx;
- bld->dady[attrib][chan] = dady;
- }
- }
- }
-}
-
-
-/**
- * Emit LLVM code to compute the fragment shader input attribute values.
- * For example, for a color input, we'll compute red, green, blue and alpha
- * values for the four pixels in a quad.
- * Recall that we're operating on 4-element vectors so each arithmetic
- * operation is operating on the four pixels in a quad.
- */
-static void
-attribs_init(struct lp_build_interp_soa_context *bld)
-{
- LLVMValueRef x = bld->pos[0];
- LLVMValueRef y = bld->pos[1];
- LLVMValueRef oow = NULL;
- unsigned attrib;
- unsigned chan;
-
- for(attrib = 0; attrib < bld->num_attribs; ++attrib) {
- unsigned mask = bld->mask[attrib];
- unsigned mode = bld->mode[attrib];
- for(chan = 0; chan < NUM_CHANNELS; ++chan) {
- if(mask & (1 << chan)) {
- LLVMValueRef a0 = bld->a0 [attrib][chan];
- LLVMValueRef dadx = bld->dadx[attrib][chan];
- LLVMValueRef dady = bld->dady[attrib][chan];
- LLVMValueRef res;
-
- res = a0;
-
- if (mode != TGSI_INTERPOLATE_CONSTANT) {
- /* res = res + x * dadx */
- res = lp_build_add(&bld->base, res, lp_build_mul(&bld->base, x, dadx));
- /* res = res + y * dady */
- res = lp_build_add(&bld->base, res, lp_build_mul(&bld->base, y, dady));
- }
-
- /* Keep the value of the attribue before perspective divide for faster updates */
- bld->attribs_pre[attrib][chan] = res;
-
- if (mode == TGSI_INTERPOLATE_PERSPECTIVE) {
- LLVMValueRef w = bld->pos[3];
- assert(attrib != 0);
- if(!oow)
- oow = lp_build_rcp(&bld->base, w);
- res = lp_build_mul(&bld->base, res, oow);
- }
-
- attrib_name(res, attrib, chan, "");
-
- bld->attribs[attrib][chan] = res;
- }
- }
- }
-}
-
-
-/**
- * Increment the shader input attribute values.
- * This is called when we move from one quad to the next.
- */
-static void
-attribs_update(struct lp_build_interp_soa_context *bld, int quad_index)
-{
- LLVMValueRef oow = NULL;
- unsigned attrib;
- unsigned chan;
-
- assert(quad_index < 4);
-
- for(attrib = 0; attrib < bld->num_attribs; ++attrib) {
- unsigned mask = bld->mask[attrib];
- unsigned mode = bld->mode[attrib];
-
- if (mode != TGSI_INTERPOLATE_CONSTANT) {
- for(chan = 0; chan < NUM_CHANNELS; ++chan) {
- if(mask & (1 << chan)) {
- LLVMValueRef dadx = bld->dadx[attrib][chan];
- LLVMValueRef dady = bld->dady[attrib][chan];
- LLVMValueRef res;
-
- res = bld->attribs_pre[attrib][chan];
-
- if (quad_index == 1 || quad_index == 3) {
- /* top-right or bottom-right quad */
- /* build res = res + dadx + dadx */
- res = lp_build_add(&bld->base, res, dadx);
- res = lp_build_add(&bld->base, res, dadx);
- }
-
- if (quad_index == 2 || quad_index == 3) {
- /* bottom-left or bottom-right quad */
- /* build res = res + dady + dady */
- res = lp_build_add(&bld->base, res, dady);
- res = lp_build_add(&bld->base, res, dady);
- }
-
- //XXX bld->attribs_pre[attrib][chan] = res;
-
- if (mode == TGSI_INTERPOLATE_PERSPECTIVE) {
- LLVMValueRef w = bld->pos[3];
- assert(attrib != 0);
- if(!oow)
- oow = lp_build_rcp(&bld->base, w);
- res = lp_build_mul(&bld->base, res, oow);
- }
-
- attrib_name(res, attrib, chan, "");
-
- bld->attribs[attrib][chan] = res;
- }
- }
- }
- }
-}
-
-
-/**
- * Generate the position vectors.
- *
- * Parameter x0, y0 are the integer values with the quad upper left coordinates.
- */
-static void
-pos_init(struct lp_build_interp_soa_context *bld,
- LLVMValueRef x0,
- LLVMValueRef y0)
-{
- lp_build_name(x0, "pos.x");
- lp_build_name(y0, "pos.y");
-
- bld->attribs[0][0] = x0;
- bld->attribs[0][1] = y0;
-}
-
-
-/**
- * Update quad position values when moving to the next quad.
- */
-static void
-pos_update(struct lp_build_interp_soa_context *bld, int quad_index)
-{
- LLVMValueRef x = bld->attribs[0][0];
- LLVMValueRef y = bld->attribs[0][1];
- const int xstep = 2, ystep = 2;
-
- if (quad_index == 1 || quad_index == 3) {
- /* top-right or bottom-right quad in block */
- /* build x += xstep */
- x = lp_build_add(&bld->base, x,
- lp_build_const_scalar(bld->base.type, xstep));
- }
-
- if (quad_index == 2) {
- /* bottom-left quad in block */
- /* build y += ystep */
- y = lp_build_add(&bld->base, y,
- lp_build_const_scalar(bld->base.type, ystep));
- /* build x -= xstep */
- x = lp_build_sub(&bld->base, x,
- lp_build_const_scalar(bld->base.type, xstep));
- }
-
- lp_build_name(x, "pos.x");
- lp_build_name(y, "pos.y");
-
- bld->attribs[0][0] = x;
- bld->attribs[0][1] = y;
-}
-
-
-/**
- * Initialize fragment shader input attribute info.
- */
-void
-lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
- const struct tgsi_token *tokens,
- boolean flatshade,
- LLVMBuilderRef builder,
- struct lp_type type,
- LLVMValueRef a0_ptr,
- LLVMValueRef dadx_ptr,
- LLVMValueRef dady_ptr,
- LLVMValueRef x0,
- LLVMValueRef y0)
-{
- struct tgsi_parse_context parse;
- struct tgsi_full_declaration *decl;
-
- memset(bld, 0, sizeof *bld);
-
- lp_build_context_init(&bld->base, builder, type);
-
- /* For convenience */
- bld->pos = bld->attribs[0];
- bld->inputs = (const LLVMValueRef (*)[NUM_CHANNELS]) bld->attribs[1];
-
- /* Position */
- bld->num_attribs = 1;
- bld->mask[0] = TGSI_WRITEMASK_ZW;
- bld->mode[0] = TGSI_INTERPOLATE_LINEAR;
-
- /* Inputs */
- tgsi_parse_init( &parse, tokens );
- while( !tgsi_parse_end_of_tokens( &parse ) ) {
- tgsi_parse_token( &parse );
-
- switch( parse.FullToken.Token.Type ) {
- case TGSI_TOKEN_TYPE_DECLARATION:
- decl = &parse.FullToken.FullDeclaration;
- if( decl->Declaration.File == TGSI_FILE_INPUT ) {
- unsigned first, last, mask;
- unsigned attrib;
-
- first = decl->Range.First;
- last = decl->Range.Last;
- mask = decl->Declaration.UsageMask;
-
- for( attrib = first; attrib <= last; ++attrib ) {
- bld->mask[1 + attrib] = mask;
-
- /* XXX: have mesa set INTERP_CONSTANT in the fragment
- * shader.
- */
- if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
- flatshade)
- bld->mode[1 + attrib] = TGSI_INTERPOLATE_CONSTANT;
- else
- bld->mode[1 + attrib] = decl->Declaration.Interpolate;
- }
-
- bld->num_attribs = MAX2(bld->num_attribs, 1 + last + 1);
- }
- break;
-
- case TGSI_TOKEN_TYPE_INSTRUCTION:
- case TGSI_TOKEN_TYPE_IMMEDIATE:
- case TGSI_TOKEN_TYPE_PROPERTY:
- break;
-
- default:
- assert( 0 );
- }
- }
- tgsi_parse_free( &parse );
-
- coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr);
-
- pos_init(bld, x0, y0);
-
- attribs_init(bld);
-}
-
-
-/**
- * Advance the position and inputs to the given quad within the block.
- */
-void
-lp_build_interp_soa_update(struct lp_build_interp_soa_context *bld,
- int quad_index)
-{
- assert(quad_index < 4);
-
- pos_update(bld, quad_index);
-
- attribs_update(bld, quad_index);
-}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_interp.h b/src/gallium/auxiliary/gallivm/lp_bld_interp.h
deleted file mode 100644
index ca958cdf34..0000000000
--- a/src/gallium/auxiliary/gallivm/lp_bld_interp.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * Position and shader input interpolation.
- *
- * Special attention is given to the interpolation of side by side quads.
- * Multiplications are made only for the first quad. Interpolation of
- * inputs for posterior quads are done exclusively with additions, and
- * perspective divide if necessary.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-#ifndef LP_BLD_INTERP_H
-#define LP_BLD_INTERP_H
-
-
-#include <llvm-c/Core.h>
-
-#include "tgsi/tgsi_exec.h"
-
-#include "lp_bld_type.h"
-
-
-struct tgsi_token;
-
-
-struct lp_build_interp_soa_context
-{
- struct lp_build_context base;
-
- unsigned num_attribs;
- unsigned mask[1 + PIPE_MAX_SHADER_INPUTS];
- unsigned mode[1 + PIPE_MAX_SHADER_INPUTS];
-
- LLVMValueRef a0 [1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
- LLVMValueRef dadx[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
- LLVMValueRef dady[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
-
- /* Attribute values before perspective divide */
- LLVMValueRef attribs_pre[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
-
- LLVMValueRef attribs[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
-
- /*
- * Convenience pointers. Callers may access this one.
- */
- const LLVMValueRef *pos;
- const LLVMValueRef (*inputs)[NUM_CHANNELS];
-};
-
-
-void
-lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
- const struct tgsi_token *tokens,
- boolean flatshade,
- LLVMBuilderRef builder,
- struct lp_type type,
- LLVMValueRef a0_ptr,
- LLVMValueRef dadx_ptr,
- LLVMValueRef dady_ptr,
- LLVMValueRef x0,
- LLVMValueRef y0);
-
-void
-lp_build_interp_soa_update(struct lp_build_interp_soa_context *bld,
- int quad_index);
-
-
-#endif /* LP_BLD_INTERP_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_intr.h b/src/gallium/auxiliary/gallivm/lp_bld_intr.h
index f813f27074..977f767322 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_intr.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.h
@@ -37,7 +37,7 @@
#define LP_BLD_INTR_H
-#include <llvm-c/Core.h>
+#include "gallivm/lp_bld.h"
/**
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
index 2726747eae..d13fa1a5d0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -42,6 +42,26 @@
#include "lp_bld_logic.h"
+/*
+ * XXX
+ *
+ * Selection with vector conditional like
+ *
+ * select <4 x i1> %C, %A, %B
+ *
+ * is valid IR (e.g. llvm/test/Assembler/vector-select.ll), but it is not
+ * supported on any backend.
+ *
+ * Expanding the boolean vector to full SIMD register width, as in
+ *
+ * sext <4 x i1> %C to <4 x i32>
+ *
+ * is valid and supported (e.g., llvm/test/CodeGen/X86/vec_compare.ll), but
+ * it causes assertion failures in LLVM 2.6. It appears to work correctly on
+ * LLVM 2.7.
+ */
+
+
/**
* Build code to compare two values 'a' and 'b' of 'type' using the given func.
* \param func one of PIPE_FUNC_x
@@ -54,13 +74,11 @@ lp_build_compare(LLVMBuilderRef builder,
LLVMValueRef a,
LLVMValueRef b)
{
- LLVMTypeRef vec_type = lp_build_vec_type(type);
LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
LLVMValueRef zeros = LLVMConstNull(int_vec_type);
LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);
LLVMValueRef cond;
LLVMValueRef res;
- unsigned i;
assert(func >= PIPE_FUNC_NEVER);
assert(func <= PIPE_FUNC_ALWAYS);
@@ -74,10 +92,12 @@ lp_build_compare(LLVMBuilderRef builder,
/* XXX: It is not clear if we should use the ordered or unordered operators */
+#if HAVE_LLVM < 0x0207
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
if(type.width * type.length == 128) {
if(type.floating && util_cpu_caps.has_sse) {
/* float[4] comparison */
+ LLVMTypeRef vec_type = lp_build_vec_type(type);
LLVMValueRef args[3];
unsigned cc;
boolean swap;
@@ -147,6 +167,7 @@ lp_build_compare(LLVMBuilderRef builder,
const char *pcmpgt;
LLVMValueRef args[2];
LLVMValueRef res;
+ LLVMTypeRef vec_type = lp_build_vec_type(type);
switch (type.width) {
case 8:
@@ -172,7 +193,7 @@ lp_build_compare(LLVMBuilderRef builder,
if(table[func].gt &&
((type.width == 8 && type.sign) ||
(type.width != 8 && !type.sign))) {
- LLVMValueRef msb = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+ LLVMValueRef msb = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
a = LLVMBuildXor(builder, a, msb, "");
b = LLVMBuildXor(builder, b, msb, "");
}
@@ -198,8 +219,9 @@ lp_build_compare(LLVMBuilderRef builder,
return res;
}
- }
+ } /* if (type.width * type.length == 128) */
#endif
+#endif /* HAVE_LLVM < 0x0207 */
if(type.floating) {
LLVMRealPredicate op;
@@ -233,25 +255,33 @@ lp_build_compare(LLVMBuilderRef builder,
return lp_build_undef(type);
}
-#if 0
- /* XXX: Although valid IR, no LLVM target currently support this */
+#if HAVE_LLVM >= 0x0207
cond = LLVMBuildFCmp(builder, op, a, b, "");
- res = LLVMBuildSelect(builder, cond, ones, zeros, "");
+ res = LLVMBuildSExt(builder, cond, int_vec_type, "");
#else
- debug_printf("%s: warning: using slow element-wise vector comparison\n",
- __FUNCTION__);
- res = LLVMGetUndef(int_vec_type);
- for(i = 0; i < type.length; ++i) {
- LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
- cond = LLVMBuildFCmp(builder, op,
- LLVMBuildExtractElement(builder, a, index, ""),
- LLVMBuildExtractElement(builder, b, index, ""),
- "");
- cond = LLVMBuildSelect(builder, cond,
- LLVMConstExtractElement(ones, index),
- LLVMConstExtractElement(zeros, index),
- "");
- res = LLVMBuildInsertElement(builder, res, cond, index, "");
+ if (type.length == 1) {
+ cond = LLVMBuildFCmp(builder, op, a, b, "");
+ res = LLVMBuildSExt(builder, cond, int_vec_type, "");
+ }
+ else {
+ unsigned i;
+
+ res = LLVMGetUndef(int_vec_type);
+
+ debug_printf("%s: warning: using slow element-wise float"
+ " vector comparison\n", __FUNCTION__);
+ for (i = 0; i < type.length; ++i) {
+ LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+ cond = LLVMBuildFCmp(builder, op,
+ LLVMBuildExtractElement(builder, a, index, ""),
+ LLVMBuildExtractElement(builder, b, index, ""),
+ "");
+ cond = LLVMBuildSelect(builder, cond,
+ LLVMConstExtractElement(ones, index),
+ LLVMConstExtractElement(zeros, index),
+ "");
+ res = LLVMBuildInsertElement(builder, res, cond, index, "");
+ }
}
#endif
}
@@ -281,25 +311,34 @@ lp_build_compare(LLVMBuilderRef builder,
return lp_build_undef(type);
}
-#if 0
- /* XXX: Although valid IR, no LLVM target currently support this */
+#if HAVE_LLVM >= 0x0207
cond = LLVMBuildICmp(builder, op, a, b, "");
- res = LLVMBuildSelect(builder, cond, ones, zeros, "");
+ res = LLVMBuildSExt(builder, cond, int_vec_type, "");
#else
- debug_printf("%s: warning: using slow element-wise int vector comparison\n",
- __FUNCTION__);
- res = LLVMGetUndef(int_vec_type);
- for(i = 0; i < type.length; ++i) {
- LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
- cond = LLVMBuildICmp(builder, op,
- LLVMBuildExtractElement(builder, a, index, ""),
- LLVMBuildExtractElement(builder, b, index, ""),
- "");
- cond = LLVMBuildSelect(builder, cond,
- LLVMConstExtractElement(ones, index),
- LLVMConstExtractElement(zeros, index),
- "");
- res = LLVMBuildInsertElement(builder, res, cond, index, "");
+ if (type.length == 1) {
+ cond = LLVMBuildICmp(builder, op, a, b, "");
+ res = LLVMBuildSExt(builder, cond, int_vec_type, "");
+ }
+ else {
+ unsigned i;
+
+ res = LLVMGetUndef(int_vec_type);
+
+ debug_printf("%s: warning: using slow element-wise int"
+ " vector comparison\n", __FUNCTION__);
+
+ for(i = 0; i < type.length; ++i) {
+ LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+ cond = LLVMBuildICmp(builder, op,
+ LLVMBuildExtractElement(builder, a, index, ""),
+ LLVMBuildExtractElement(builder, b, index, ""),
+ "");
+ cond = LLVMBuildSelect(builder, cond,
+ LLVMConstExtractElement(ones, index),
+ LLVMConstExtractElement(zeros, index),
+ "");
+ res = LLVMBuildInsertElement(builder, res, cond, index, "");
+ }
}
#endif
}
@@ -326,6 +365,8 @@ lp_build_cmp(struct lp_build_context *bld,
/**
* Return mask ? a : b;
+ *
+ * mask is a bitwise mask, composed of 0 or ~0 for each element.
*/
LLVMValueRef
lp_build_select(struct lp_build_context *bld,
@@ -339,26 +380,32 @@ lp_build_select(struct lp_build_context *bld,
if(a == b)
return a;
- if(type.floating) {
- LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
- a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
- b = LLVMBuildBitCast(bld->builder, b, int_vec_type, "");
+ if (type.length == 1) {
+ mask = LLVMBuildTrunc(bld->builder, mask, LLVMInt1Type(), "");
+ res = LLVMBuildSelect(bld->builder, mask, a, b, "");
}
+ else {
+ if(type.floating) {
+ LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+ a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+ b = LLVMBuildBitCast(bld->builder, b, int_vec_type, "");
+ }
- a = LLVMBuildAnd(bld->builder, a, mask, "");
+ a = LLVMBuildAnd(bld->builder, a, mask, "");
- /* This often gets translated to PANDN, but sometimes the NOT is
- * pre-computed and stored in another constant. The best strategy depends
- * on available registers, so it is not a big deal -- hopefully LLVM does
- * the right decision attending the rest of the program.
- */
- b = LLVMBuildAnd(bld->builder, b, LLVMBuildNot(bld->builder, mask, ""), "");
+ /* This often gets translated to PANDN, but sometimes the NOT is
+ * pre-computed and stored in another constant. The best strategy depends
+ * on available registers, so it is not a big deal -- hopefully LLVM does
+ * the right decision attending the rest of the program.
+ */
+ b = LLVMBuildAnd(bld->builder, b, LLVMBuildNot(bld->builder, mask, ""), "");
- res = LLVMBuildOr(bld->builder, a, b, "");
+ res = LLVMBuildOr(bld->builder, a, b, "");
- if(type.floating) {
- LLVMTypeRef vec_type = lp_build_vec_type(type);
- res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
+ if(type.floating) {
+ LLVMTypeRef vec_type = lp_build_vec_type(type);
+ res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
+ }
}
return res;
@@ -425,14 +472,12 @@ lp_build_select_aos(struct lp_build_context *bld,
}
}
+
+/** Return (a & ~b) */
LLVMValueRef
-lp_build_alloca(struct lp_build_context *bld)
+lp_build_andc(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b)
{
- const struct lp_type type = bld->type;
-
- if (type.length > 1) { /*vector*/
- return LLVMBuildAlloca(bld->builder, lp_build_vec_type(type), "");
- } else { /*scalar*/
- return LLVMBuildAlloca(bld->builder, lp_build_elem_type(type), "");
- }
+ b = LLVMBuildNot(bld->builder, b, "");
+ b = LLVMBuildAnd(bld->builder, a, b, "");
+ return b;
}
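
For reference, a minimal sketch of the sign-extended vector-compare pattern that the new HAVE_LLVM >= 0x0207 paths above rely on. It is not part of the patch itself; the function name and the fixed 4 x i32 mask type are illustrative assumptions.

#include <llvm-c/Core.h>

/* Compare two <4 x float> vectors and sign-extend the <4 x i1> result to a
 * <4 x i32> mask of 0 / ~0 per element, the form lp_build_compare returns. */
static LLVMValueRef
example_fcmp_mask(LLVMBuilderRef builder, LLVMValueRef a, LLVMValueRef b)
{
   LLVMTypeRef mask_type = LLVMVectorType(LLVMInt32Type(), 4);
   LLVMValueRef cond = LLVMBuildFCmp(builder, LLVMRealOLT, a, b, "");
   return LLVMBuildSExt(builder, cond, mask_type, "");
}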
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.h b/src/gallium/auxiliary/gallivm/lp_bld_logic.h
index a399ebf39e..29f9fc3b20 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.h
@@ -37,7 +37,7 @@
#define LP_BLD_LOGIC_H
-#include <llvm-c/Core.h>
+#include "gallivm/lp_bld.h"
#include "pipe/p_defines.h" /* For PIPE_FUNC_xxx */
@@ -76,7 +76,9 @@ lp_build_select_aos(struct lp_build_context *bld,
LLVMValueRef b,
const boolean cond[4]);
+
LLVMValueRef
-lp_build_alloca(struct lp_build_context *bld);
+lp_build_andc(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b);
+
#endif /* !LP_BLD_LOGIC_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
index bc360ad77a..186f8849b8 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@@ -164,7 +164,7 @@ lp_build_unpack2(LLVMBuilderRef builder,
if(dst_type.sign && src_type.sign) {
/* Replicate the sign bit in the most significant bits */
- msb = LLVMBuildAShr(builder, src, lp_build_int_const_scalar(src_type, src_type.width - 1), "");
+ msb = LLVMBuildAShr(builder, src, lp_build_const_int_vec(src_type, src_type.width - 1), "");
}
else
/* Most significant bits always zero */
@@ -256,13 +256,13 @@ lp_build_pack2(LLVMBuilderRef builder,
LLVMValueRef lo,
LLVMValueRef hi)
{
+#if HAVE_LLVM < 0x0207
LLVMTypeRef src_vec_type = lp_build_vec_type(src_type);
+#endif
LLVMTypeRef dst_vec_type = lp_build_vec_type(dst_type);
LLVMValueRef shuffle;
LLVMValueRef res;
- dst_vec_type = lp_build_vec_type(dst_type);
-
assert(!src_type.floating);
assert(!dst_type.floating);
assert(src_type.width == dst_type.width * 2);
@@ -272,11 +272,14 @@ lp_build_pack2(LLVMBuilderRef builder,
switch(src_type.width) {
case 32:
if(dst_type.sign) {
+#if HAVE_LLVM >= 0x0207
+ res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", dst_vec_type, lo, hi);
+#else
res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", src_vec_type, lo, hi);
+#endif
}
else {
if (util_cpu_caps.has_sse4_1) {
- /* PACKUSDW is the only instrinsic with a consistent signature */
return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi);
}
else {
@@ -288,9 +291,17 @@ lp_build_pack2(LLVMBuilderRef builder,
case 16:
if(dst_type.sign)
+#if HAVE_LLVM >= 0x0207
+ res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", dst_vec_type, lo, hi);
+#else
res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", src_vec_type, lo, hi);
+#endif
else
+#if HAVE_LLVM >= 0x0207
+ res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", dst_vec_type, lo, hi);
+#else
res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", src_vec_type, lo, hi);
+#endif
break;
default:
@@ -348,7 +359,7 @@ lp_build_packs2(LLVMBuilderRef builder,
if(clamp) {
struct lp_build_context bld;
unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
- LLVMValueRef dst_max = lp_build_int_const_scalar(src_type, ((unsigned long long)1 << dst_bits) - 1);
+ LLVMValueRef dst_max = lp_build_const_int_vec(src_type, ((unsigned long long)1 << dst_bits) - 1);
lp_build_context_init(&bld, builder, src_type);
lo = lp_build_min(&bld, lo, dst_max);
hi = lp_build_min(&bld, hi, dst_max);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
index fb2a34984a..41adeed220 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
@@ -37,7 +37,7 @@
#define LP_BLD_PACK_H
-#include <llvm-c/Core.h>
+#include "gallivm/lp_bld.h"
struct lp_type;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_printf.c b/src/gallium/auxiliary/gallivm/lp_bld_printf.c
new file mode 100644
index 0000000000..153ba5b15b
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_printf.c
@@ -0,0 +1,121 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include <stdio.h>
+
+#include "util/u_debug.h"
+#include "util/u_memory.h"
+#include "lp_bld_printf.h"
+
+
+static int
+lp_get_printf_arg_count(const char *fmt)
+{
+   int count = 0;
+ const char *p = fmt;
+ int c;
+
+ while ((c = *p++)) {
+ if (c != '%')
+ continue;
+ switch (*p) {
+ case '\0':
+ continue;
+ case '%':
+ p++;
+ continue;
+ case '.':
+ if (p[1] == '*' && p[2] == 's') {
+ count += 2;
+ p += 3;
+ continue;
+ }
+ /* fallthrough */
+ default:
+         count++;
+ }
+ }
+ return count;
+}
+
+LLVMValueRef
+lp_build_const_string_variable(LLVMModuleRef module, const char *str, int len)
+{
+ LLVMValueRef string = LLVMAddGlobal(module, LLVMArrayType(LLVMInt8Type(), len + 1), "");
+ LLVMSetGlobalConstant(string, TRUE);
+ LLVMSetLinkage(string, LLVMInternalLinkage);
+ LLVMSetInitializer(string, LLVMConstString(str, len + 1, TRUE));
+ return string;
+}
+
+
+/**
+ * lp_build_printf.
+ *
+ * Build a call to printf in the generated LLVM IR. The output goes
+ * to stdout. The additional variable arguments must all have type
+ * LLVMValueRef.
+ */
+LLVMValueRef
+lp_build_printf(LLVMBuilderRef builder, const char *fmt, ...)
+{
+ va_list arglist;
+ int i = 0;
+ int argcount = lp_get_printf_arg_count(fmt);
+ LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder)));
+ LLVMValueRef params[50];
+ LLVMValueRef fmtarg = lp_build_const_string_variable(module, fmt, strlen(fmt) + 1);
+ LLVMValueRef int0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+ LLVMValueRef index[2];
+ LLVMValueRef func_printf = LLVMGetNamedFunction(module, "printf");
+
+ assert(Elements(params) >= argcount + 1);
+
+ index[0] = index[1] = int0;
+
+ if (!func_printf) {
+ LLVMTypeRef printf_type = LLVMFunctionType(LLVMIntType(32), NULL, 0, 1);
+ func_printf = LLVMAddFunction(module, "printf", printf_type);
+ }
+
+ params[0] = LLVMBuildGEP(builder, fmtarg, index, 2, "");
+
+ va_start(arglist, fmt);
+ for (i = 1; i <= argcount; i++) {
+ LLVMValueRef val = va_arg(arglist, LLVMValueRef);
+ LLVMTypeRef type = LLVMTypeOf(val);
+      /* printf expects doubles for float arguments, so convert floats
+       * so that they actually print correctly */
+ if (LLVMGetTypeKind(type) == LLVMFloatTypeKind)
+ val = LLVMBuildFPExt(builder, val, LLVMDoubleType(), "");
+ params[i] = val;
+ }
+ va_end(arglist);
+
+ return LLVMBuildCall(builder, func_printf, params, argcount + 1, "");
+}
+
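
A minimal usage sketch for the new helper follows. It is not part of the patch itself; the caller, value and format string are illustrative, and float arguments are promoted to double by lp_build_printf as shown above.

#include "gallivm/lp_bld_printf.h"

/* Emit a debug print of one float value from generated code; 'builder' must
 * be positioned inside a function when this is called. */
static void
example_debug_print(LLVMBuilderRef builder, LLVMValueRef val)
{
   lp_build_printf(builder, "texel value = %f\n", val);
}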
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_depth.h b/src/gallium/auxiliary/gallivm/lp_bld_printf.h
index 79d6981bb5..83bd8f1d55 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_depth.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_printf.h
@@ -1,6 +1,6 @@
/**************************************************************************
*
- * Copyright 2009 VMware, Inc.
+ * Copyright 2010 VMware, Inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
@@ -25,39 +25,15 @@
*
**************************************************************************/
+#ifndef LP_BLD_PRINTF_H
+#define LP_BLD_PRINTF_H
-/**
- * Depth/stencil testing to LLVM IR translation.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-#ifndef LP_BLD_DEPTH_H
-#define LP_BLD_DEPTH_H
-
-
-#include <llvm-c/Core.h>
-
-
-struct pipe_depth_state;
-struct util_format_description;
-struct lp_type;
-struct lp_build_mask_context;
-
-
-struct lp_type
-lp_depth_type(const struct util_format_description *format_desc,
- unsigned length);
+#include "pipe/p_compiler.h"
+#include "lp_bld.h"
-void
-lp_build_depth_test(LLVMBuilderRef builder,
- const struct pipe_depth_state *state,
- struct lp_type type,
- const struct util_format_description *format_desc,
- struct lp_build_mask_context *mask,
- LLVMValueRef src,
- LLVMValueRef dst_ptr);
+LLVMValueRef lp_build_const_string_variable(LLVMModuleRef module, const char *str, int len);
+LLVMValueRef lp_build_printf(LLVMBuilderRef builder, const char *fmt, ...);
+#endif
-#endif /* !LP_BLD_DEPTH_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 6a026e468e..195a4953ab 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -51,9 +51,11 @@
*/
void
lp_sampler_static_state(struct lp_sampler_static_state *state,
- const struct pipe_texture *texture,
+ const struct pipe_sampler_view *view,
const struct pipe_sampler_state *sampler)
{
+ const struct pipe_resource *texture = view->texture;
+
memset(state, 0, sizeof *state);
if(!texture)
@@ -62,7 +64,24 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
if(!sampler)
return;
- state->format = texture->format;
+ /*
+ * We don't copy sampler state over unless it is actually enabled, to avoid
+ * spurious recompiles, as the sampler static state is part of the shader
+ * key.
+ *
+ * Ideally the state tracker or cso_cache module would make all state
+ * canonical, but until that happens it's better to be safe than sorry here.
+ *
+    * XXX: Actually there's much more that can be done here, especially
+ * regarding 1D/2D/3D/CUBE textures, wrap modes, etc.
+ */
+
+ state->format = view->format;
+ state->swizzle_r = view->swizzle_r;
+ state->swizzle_g = view->swizzle_g;
+ state->swizzle_b = view->swizzle_b;
+ state->swizzle_a = view->swizzle_a;
+
state->target = texture->target;
state->pot_width = util_is_pot(texture->width0);
state->pot_height = util_is_pot(texture->height0);
@@ -72,10 +91,18 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
state->wrap_t = sampler->wrap_t;
state->wrap_r = sampler->wrap_r;
state->min_img_filter = sampler->min_img_filter;
- state->min_mip_filter = sampler->min_mip_filter;
state->mag_img_filter = sampler->mag_img_filter;
+ if (texture->last_level) {
+ state->min_mip_filter = sampler->min_mip_filter;
+ } else {
+ state->min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
+ }
+
state->compare_mode = sampler->compare_mode;
- state->compare_func = sampler->compare_func;
+ if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) {
+ state->compare_func = sampler->compare_func;
+ }
+
state->normalized_coords = sampler->normalized_coords;
state->lod_bias = sampler->lod_bias;
state->min_lod = sampler->min_lod;
@@ -84,6 +111,10 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
state->border_color[1] = sampler->border_color[1];
state->border_color[2] = sampler->border_color[2];
state->border_color[3] = sampler->border_color[3];
+
+ /*
+ * FIXME: Handle the remainder of pipe_sampler_view.
+ */
}
@@ -137,62 +168,34 @@ lp_build_gather(LLVMBuilderRef builder,
/**
- * Compute the offset of a pixel.
+ * Compute the offset of a pixel block.
*
- * x, y, y_stride are vectors
+ * x, y, z, y_stride, z_stride are vectors; they refer to pixel blocks, as
+ * per the format description, not to individual pixels.
*/
LLVMValueRef
lp_build_sample_offset(struct lp_build_context *bld,
const struct util_format_description *format_desc,
LLVMValueRef x,
LLVMValueRef y,
+ LLVMValueRef z,
LLVMValueRef y_stride,
- LLVMValueRef data_ptr)
+ LLVMValueRef z_stride)
{
LLVMValueRef x_stride;
LLVMValueRef offset;
- x_stride = lp_build_const_scalar(bld->type, format_desc->block.bits/8);
-
- if(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
- LLVMValueRef x_lo, x_hi;
- LLVMValueRef y_lo, y_hi;
- LLVMValueRef x_stride_lo, x_stride_hi;
- LLVMValueRef y_stride_lo, y_stride_hi;
- LLVMValueRef x_offset_lo, x_offset_hi;
- LLVMValueRef y_offset_lo, y_offset_hi;
- LLVMValueRef offset_lo, offset_hi;
+ x_stride = lp_build_const_vec(bld->type, format_desc->block.bits/8);
+ offset = lp_build_mul(bld, x, x_stride);
- x_lo = LLVMBuildAnd(bld->builder, x, bld->one, "");
- y_lo = LLVMBuildAnd(bld->builder, y, bld->one, "");
-
- x_hi = LLVMBuildLShr(bld->builder, x, bld->one, "");
- y_hi = LLVMBuildLShr(bld->builder, y, bld->one, "");
-
- x_stride_lo = x_stride;
- y_stride_lo = lp_build_const_scalar(bld->type, 2*format_desc->block.bits/8);
-
- x_stride_hi = lp_build_const_scalar(bld->type, 4*format_desc->block.bits/8);
- y_stride_hi = LLVMBuildShl(bld->builder, y_stride, bld->one, "");
-
- x_offset_lo = lp_build_mul(bld, x_lo, x_stride_lo);
- y_offset_lo = lp_build_mul(bld, y_lo, y_stride_lo);
- offset_lo = lp_build_add(bld, x_offset_lo, y_offset_lo);
-
- x_offset_hi = lp_build_mul(bld, x_hi, x_stride_hi);
- y_offset_hi = lp_build_mul(bld, y_hi, y_stride_hi);
- offset_hi = lp_build_add(bld, x_offset_hi, y_offset_hi);
-
- offset = lp_build_add(bld, offset_hi, offset_lo);
+ if (y && y_stride) {
+ LLVMValueRef y_offset = lp_build_mul(bld, y, y_stride);
+ offset = lp_build_add(bld, offset, y_offset);
}
- else {
- LLVMValueRef x_offset;
- LLVMValueRef y_offset;
-
- x_offset = lp_build_mul(bld, x, x_stride);
- y_offset = lp_build_mul(bld, y, y_stride);
- offset = lp_build_add(bld, x_offset, y_offset);
+ if (z && z_stride) {
+ LLVMValueRef z_offset = lp_build_mul(bld, z, z_stride);
+ offset = lp_build_add(bld, offset, z_offset);
}
return offset;
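
As a worked illustration of the rewritten offset computation (values are illustrative, not from the patch): for a 2D PIPE_FORMAT_B8G8R8A8_UNORM texture, block.bits is 32 so x_stride is 4 bytes; with y_stride = 1024 the block at (x = 3, y = 2) yields offset = 3 * 4 + 2 * 1024 = 2060 bytes, and the z term is skipped because z and z_stride are NULL.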
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index 5ba0925bb6..8ceb20473d 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -36,9 +36,10 @@
#define LP_BLD_SAMPLE_H
-#include <llvm-c/Core.h>
+#include "gallivm/lp_bld.h"
-struct pipe_texture;
+struct pipe_resource;
+struct pipe_sampler_view;
struct pipe_sampler_state;
struct util_format_description;
struct lp_type;
@@ -48,14 +49,20 @@ struct lp_build_context;
/**
* Sampler static state.
*
- * These are the bits of state from pipe_texture and pipe_sampler_state that
+ * These are the bits of state from pipe_resource and pipe_sampler_state that
* are embedded in the generated code.
*/
struct lp_sampler_static_state
{
- /* pipe_texture's state */
+ /* pipe_sampler_view's state */
enum pipe_format format;
- unsigned target:2;
+ unsigned swizzle_r:3;
+ unsigned swizzle_g:3;
+ unsigned swizzle_b:3;
+ unsigned swizzle_a:3;
+
+ /* pipe_texture's state */
+ unsigned target:3;
unsigned pot_width:1;
unsigned pot_height:1;
unsigned pot_depth:1;
@@ -78,7 +85,7 @@ struct lp_sampler_static_state
/**
* Sampler dynamic state.
*
- * These are the bits of state from pipe_texture and pipe_sampler_state that
+ * These are the bits of state from pipe_resource and pipe_sampler_state that
* are computed in runtime.
*
* There are obtained through callbacks, as we don't want to tie the texture
@@ -113,9 +120,14 @@ struct lp_sampler_dynamic_state
unsigned unit);
LLVMValueRef
- (*stride)( struct lp_sampler_dynamic_state *state,
- LLVMBuilderRef builder,
- unsigned unit);
+ (*row_stride)( struct lp_sampler_dynamic_state *state,
+ LLVMBuilderRef builder,
+ unsigned unit);
+
+ LLVMValueRef
+ (*img_stride)( struct lp_sampler_dynamic_state *state,
+ LLVMBuilderRef builder,
+ unsigned unit);
LLVMValueRef
(*data_ptr)( struct lp_sampler_dynamic_state *state,
@@ -130,7 +142,7 @@ struct lp_sampler_dynamic_state
*/
void
lp_sampler_static_state(struct lp_sampler_static_state *state,
- const struct pipe_texture *texture,
+ const struct pipe_sampler_view *view,
const struct pipe_sampler_state *sampler);
@@ -148,8 +160,9 @@ lp_build_sample_offset(struct lp_build_context *bld,
const struct util_format_description *format_desc,
LLVMValueRef x,
LLVMValueRef y,
+ LLVMValueRef z,
LLVMValueRef y_stride,
- LLVMValueRef data_ptr);
+ LLVMValueRef z_stride);
void
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 9058f76c1d..54ef921678 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -48,6 +48,7 @@
#include "lp_bld_logic.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_pack.h"
+#include "lp_bld_flow.h"
#include "lp_bld_format.h"
#include "lp_bld_sample.h"
@@ -65,6 +66,14 @@ struct lp_build_sample_context
const struct util_format_description *format_desc;
+ /** regular scalar float type */
+ struct lp_type float_type;
+ struct lp_build_context float_bld;
+
+   /** regular scalar int type */
+ struct lp_type int_type;
+ struct lp_build_context int_bld;
+
/** Incoming coordinates type and build context */
struct lp_type coord_type;
struct lp_build_context coord_bld;
@@ -108,9 +117,125 @@ wrap_mode_uses_border_color(unsigned mode)
}
+static LLVMValueRef
+lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
+ LLVMValueRef data_array, LLVMValueRef level)
+{
+ LLVMValueRef indexes[2], data_ptr;
+ indexes[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+ indexes[1] = level;
+ data_ptr = LLVMBuildGEP(bld->builder, data_array, indexes, 2, "");
+ data_ptr = LLVMBuildLoad(bld->builder, data_ptr, "");
+ return data_ptr;
+}
+
+
+static LLVMValueRef
+lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld,
+ LLVMValueRef data_array, int level)
+{
+ LLVMValueRef lvl = LLVMConstInt(LLVMInt32Type(), level, 0);
+ return lp_build_get_mipmap_level(bld, data_array, lvl);
+}
+
+
+/**
+ * Dereference stride_array[mipmap_level] array to get a stride.
+ * Return stride as a vector.
+ */
+static LLVMValueRef
+lp_build_get_level_stride_vec(struct lp_build_sample_context *bld,
+ LLVMValueRef stride_array, LLVMValueRef level)
+{
+ LLVMValueRef indexes[2], stride;
+ indexes[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+ indexes[1] = level;
+ stride = LLVMBuildGEP(bld->builder, stride_array, indexes, 2, "");
+ stride = LLVMBuildLoad(bld->builder, stride, "");
+ stride = lp_build_broadcast_scalar(&bld->int_coord_bld, stride);
+ return stride;
+}
+
+
+/** Dereference stride_array[level] to get a stride (as a vector). */
+static LLVMValueRef
+lp_build_get_const_level_stride_vec(struct lp_build_sample_context *bld,
+ LLVMValueRef stride_array, int level)
+{
+ LLVMValueRef lvl = LLVMConstInt(LLVMInt32Type(), level, 0);
+ return lp_build_get_level_stride_vec(bld, stride_array, lvl);
+}
+
+
+static int
+texture_dims(enum pipe_texture_target tex)
+{
+ switch (tex) {
+ case PIPE_TEXTURE_1D:
+ return 1;
+ case PIPE_TEXTURE_2D:
+ case PIPE_TEXTURE_CUBE:
+ return 2;
+ case PIPE_TEXTURE_3D:
+ return 3;
+ default:
+ assert(0 && "bad texture target in texture_dims()");
+ return 2;
+ }
+}
+
+
+static LLVMValueRef
+lp_build_swizzle_chan_soa(struct lp_type type,
+ const LLVMValueRef *unswizzled,
+ enum util_format_swizzle swizzle)
+{
+ switch (swizzle) {
+ case PIPE_SWIZZLE_RED:
+ case PIPE_SWIZZLE_GREEN:
+ case PIPE_SWIZZLE_BLUE:
+ case PIPE_SWIZZLE_ALPHA:
+ return unswizzled[swizzle];
+ case PIPE_SWIZZLE_ZERO:
+ return lp_build_zero(type);
+ case PIPE_SWIZZLE_ONE:
+ return lp_build_one(type);
+ default:
+ assert(0);
+ return lp_build_undef(type);
+ }
+}
+
+
+static void
+lp_build_swizzle_soa(struct lp_build_sample_context *bld,
+ LLVMValueRef *texel)
+{
+ LLVMValueRef unswizzled[4];
+ unsigned char swizzles[4];
+ unsigned chan;
+
+ for (chan = 0; chan < 4; ++chan) {
+ unswizzled[chan] = texel[chan];
+ }
+
+ swizzles[0] = bld->static_state->swizzle_r;
+ swizzles[1] = bld->static_state->swizzle_g;
+ swizzles[2] = bld->static_state->swizzle_b;
+ swizzles[3] = bld->static_state->swizzle_a;
+
+ for (chan = 0; chan < 4; ++chan) {
+ unsigned swizzle = swizzles[chan];
+ texel[chan] = lp_build_swizzle_chan_soa(bld->texel_type,
+ unswizzled, swizzle);
+ }
+}
+
+
/**
- * Gen code to fetch a texel from a texture at int coords (x, y).
+ * Generate code to fetch a texel from a texture at int coords (x, y, z).
+ * The computation depends on whether the texture is 1D, 2D or 3D.
* The result, texel, will be:
* texel[0] = red values
* texel[1] = green values
@@ -121,15 +246,19 @@ static void
lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
LLVMValueRef width,
LLVMValueRef height,
+ LLVMValueRef depth,
LLVMValueRef x,
LLVMValueRef y,
+ LLVMValueRef z,
LLVMValueRef y_stride,
+ LLVMValueRef z_stride,
LLVMValueRef data_ptr,
LLVMValueRef *texel)
{
+ const int dims = texture_dims(bld->static_state->target);
struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
LLVMValueRef offset;
- LLVMValueRef packed;
+ LLVMValueRef i, j;
LLVMValueRef use_border = NULL;
/* use_border = x < 0 || x >= width || y < 0 || y >= height */
@@ -140,7 +269,7 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
use_border = LLVMBuildOr(bld->builder, b1, b2, "b1_or_b2");
}
- if (wrap_mode_uses_border_color(bld->static_state->wrap_t)) {
+ if (dims >= 2 && wrap_mode_uses_border_color(bld->static_state->wrap_t)) {
LLVMValueRef b1, b2;
b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
@@ -153,6 +282,58 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
}
}
+ if (dims == 3 && wrap_mode_uses_border_color(bld->static_state->wrap_r)) {
+ LLVMValueRef b1, b2;
+ b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
+ b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
+ if (use_border) {
+ use_border = LLVMBuildOr(bld->builder, use_border, b1, "ub_or_b1");
+ use_border = LLVMBuildOr(bld->builder, use_border, b2, "ub_or_b2");
+ }
+ else {
+ use_border = LLVMBuildOr(bld->builder, b1, b2, "b1_or_b2");
+ }
+ }
+
+ /*
+ * Describe the coordinates in terms of pixel blocks.
+ *
+    * TODO: pixel block dimensions are powers of two, so LLVM should convert
+    * the rem/div to bit arithmetic. Verify this.
+ */
+
+ if (bld->format_desc->block.width == 1) {
+ i = bld->uint_coord_bld.zero;
+ }
+ else {
+ LLVMValueRef block_width = lp_build_const_int_vec(bld->uint_coord_bld.type, bld->format_desc->block.width);
+ i = LLVMBuildURem(bld->builder, x, block_width, "");
+ x = LLVMBuildUDiv(bld->builder, x, block_width, "");
+ }
+
+ if (bld->format_desc->block.height == 1) {
+ j = bld->uint_coord_bld.zero;
+ }
+ else {
+ LLVMValueRef block_height = lp_build_const_int_vec(bld->uint_coord_bld.type, bld->format_desc->block.height);
+ j = LLVMBuildURem(bld->builder, y, block_height, "");
+ y = LLVMBuildUDiv(bld->builder, y, block_height, "");
+ }
+
+ /* convert x,y,z coords to linear offset from start of texture, in bytes */
+ offset = lp_build_sample_offset(&bld->uint_coord_bld,
+ bld->format_desc,
+ x, y, z, y_stride, z_stride);
+
+ lp_build_fetch_rgba_soa(bld->builder,
+ bld->format_desc,
+ bld->texel_type,
+ data_ptr, offset,
+ i, j,
+ texel);
+
+ lp_build_swizzle_soa(bld, texel);
+
/*
* Note: if we find an app which frequently samples the texture border
* we might want to implement a true conditional here to avoid sampling
@@ -168,35 +349,12 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
* the texel color results with the border color.
*/
- /* convert x,y coords to linear offset from start of texture, in bytes */
- offset = lp_build_sample_offset(&bld->uint_coord_bld,
- bld->format_desc,
- x, y, y_stride,
- data_ptr);
-
- assert(bld->format_desc->block.width == 1);
- assert(bld->format_desc->block.height == 1);
- assert(bld->format_desc->block.bits <= bld->texel_type.width);
-
- /* gather the texels from the texture */
- packed = lp_build_gather(bld->builder,
- bld->texel_type.length,
- bld->format_desc->block.bits,
- bld->texel_type.width,
- data_ptr, offset);
-
- /* convert texels to float rgba */
- lp_build_unpack_rgba_soa(bld->builder,
- bld->format_desc,
- bld->texel_type,
- packed, texel);
-
if (use_border) {
/* select texel color or border color depending on use_border */
int chan;
for (chan = 0; chan < 4; chan++) {
LLVMValueRef border_chan =
- lp_build_const_scalar(bld->texel_type,
+ lp_build_const_vec(bld->texel_type,
bld->static_state->border_color[chan]);
texel[chan] = lp_build_select(&bld->texel_bld, use_border,
border_chan, texel[chan]);
@@ -210,19 +368,22 @@ lp_build_sample_packed(struct lp_build_sample_context *bld,
LLVMValueRef x,
LLVMValueRef y,
LLVMValueRef y_stride,
- LLVMValueRef data_ptr)
+ LLVMValueRef data_array)
{
LLVMValueRef offset;
+ LLVMValueRef data_ptr;
offset = lp_build_sample_offset(&bld->uint_coord_bld,
bld->format_desc,
- x, y, y_stride,
- data_ptr);
+ x, y, NULL, y_stride, NULL);
assert(bld->format_desc->block.width == 1);
assert(bld->format_desc->block.height == 1);
assert(bld->format_desc->block.bits <= bld->texel_type.width);
+ /* get pointer to mipmap level 0 data */
+ data_ptr = lp_build_get_const_mipmap_level(bld, data_array, 0);
+
return lp_build_gather(bld->builder,
bld->texel_type.length,
bld->format_desc->block.bits,
@@ -358,8 +519,8 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
struct lp_build_context *coord_bld = &bld->coord_bld;
struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
- LLVMValueRef two = lp_build_const_scalar(coord_bld->type, 2.0);
- LLVMValueRef half = lp_build_const_scalar(coord_bld->type, 0.5);
+ LLVMValueRef two = lp_build_const_vec(coord_bld->type, 2.0);
+ LLVMValueRef half = lp_build_const_vec(coord_bld->type, 0.5);
LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
LLVMValueRef length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one);
LLVMValueRef length_f_minus_one = lp_build_sub(coord_bld, length_f, coord_bld->one);
@@ -413,7 +574,7 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
else {
LLVMValueRef min, max;
/* clamp to [0.5, length - 0.5] */
- min = lp_build_const_scalar(coord_bld->type, 0.5F);
+ min = lp_build_const_vec(coord_bld->type, 0.5F);
max = lp_build_sub(coord_bld, length_f, min);
coord = lp_build_clamp(coord_bld, coord, min, max);
}
@@ -434,7 +595,7 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
if (bld->static_state->normalized_coords) {
/* min = -1.0 / (2 * length) = -0.5 / length */
min = lp_build_mul(coord_bld,
- lp_build_const_scalar(coord_bld->type, -0.5F),
+ lp_build_const_vec(coord_bld->type, -0.5F),
lp_build_rcp(coord_bld, length_f));
/* max = 1.0 - min */
max = lp_build_sub(coord_bld, coord_bld->one, min);
@@ -446,7 +607,7 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
}
else {
/* clamp to [-0.5, length + 0.5] */
- min = lp_build_const_scalar(coord_bld->type, -0.5F);
+ min = lp_build_const_vec(coord_bld->type, -0.5F);
max = lp_build_sub(coord_bld, length_f, min);
coord = lp_build_clamp(coord_bld, coord, min, max);
coord = lp_build_sub(coord_bld, coord, half);
@@ -521,7 +682,7 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
LLVMValueRef min, max;
/* min = -1.0 / (2 * length) = -0.5 / length */
min = lp_build_mul(coord_bld,
- lp_build_const_scalar(coord_bld->type, -0.5F),
+ lp_build_const_vec(coord_bld->type, -0.5F),
lp_build_rcp(coord_bld, length_f));
/* max = 1.0 - min */
max = lp_build_sub(coord_bld, coord_bld->one, min);
@@ -566,7 +727,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
struct lp_build_context *coord_bld = &bld->coord_bld;
struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
- LLVMValueRef two = lp_build_const_scalar(coord_bld->type, 2.0);
+ LLVMValueRef two = lp_build_const_vec(coord_bld->type, 2.0);
LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
LLVMValueRef length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one);
LLVMValueRef length_f_minus_one = lp_build_sub(coord_bld, length_f, coord_bld->one);
@@ -609,7 +770,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
}
else {
/* clamp to [0.5, length - 0.5] */
- min = lp_build_const_scalar(coord_bld->type, 0.5F);
+ min = lp_build_const_vec(coord_bld->type, 0.5F);
max = lp_build_sub(coord_bld, length_f, min);
}
/* coord = clamp(coord, min, max) */
@@ -625,7 +786,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
if (bld->static_state->normalized_coords) {
/* min = -1.0 / (2 * length) = -0.5 / length */
min = lp_build_mul(coord_bld,
- lp_build_const_scalar(coord_bld->type, -0.5F),
+ lp_build_const_vec(coord_bld->type, -0.5F),
lp_build_rcp(coord_bld, length_f));
/* max = length - min */
max = lp_build_sub(coord_bld, length_f, min);
@@ -634,7 +795,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
}
else {
/* clamp to [-0.5, length + 0.5] */
- min = lp_build_const_scalar(coord_bld->type, -0.5F);
+ min = lp_build_const_vec(coord_bld->type, -0.5F);
max = lp_build_sub(coord_bld, length_f, min);
}
/* coord = clamp(coord, min, max) */
@@ -711,83 +872,916 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
/**
- * Sample 2D texture with nearest filtering.
+ * Codegen equivalent for u_minify().
+ * Return max(1, base_size >> level);
+ */
+static LLVMValueRef
+lp_build_minify(struct lp_build_sample_context *bld,
+ LLVMValueRef base_size,
+ LLVMValueRef level)
+{
+ LLVMValueRef size = LLVMBuildAShr(bld->builder, base_size, level, "minify");
+ size = lp_build_max(&bld->int_coord_bld, size, bld->int_coord_bld.one);
+ return size;
+}
+
+
+/**
+ * Generate code to compute texture level of detail (lambda).
+ * \param s vector of texcoord s values
+ * \param t vector of texcoord t values
+ * \param r vector of texcoord r values
+ * \param shader_lod_bias float vector with the shader lod bias
+ * \param width scalar int texture width
+ * \param height scalar int texture height
+ * \param depth scalar int texture depth
+ */
+static LLVMValueRef
+lp_build_lod_selector(struct lp_build_sample_context *bld,
+ LLVMValueRef s,
+ LLVMValueRef t,
+ LLVMValueRef r,
+ LLVMValueRef shader_lod_bias,
+ LLVMValueRef width,
+ LLVMValueRef height,
+ LLVMValueRef depth)
+
+{
+ if (bld->static_state->min_lod == bld->static_state->max_lod) {
+ /* User is forcing sampling from a particular mipmap level.
+ * This is hit during mipmap generation.
+ */
+ return LLVMConstReal(LLVMFloatType(), bld->static_state->min_lod);
+ }
+ else {
+ const int dims = texture_dims(bld->static_state->target);
+ struct lp_build_context *float_bld = &bld->float_bld;
+ LLVMValueRef sampler_lod_bias = LLVMConstReal(LLVMFloatType(),
+ bld->static_state->lod_bias);
+ LLVMValueRef min_lod = LLVMConstReal(LLVMFloatType(),
+ bld->static_state->min_lod);
+ LLVMValueRef max_lod = LLVMConstReal(LLVMFloatType(),
+ bld->static_state->max_lod);
+
+ LLVMValueRef index0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+ LLVMValueRef index1 = LLVMConstInt(LLVMInt32Type(), 1, 0);
+ LLVMValueRef index2 = LLVMConstInt(LLVMInt32Type(), 2, 0);
+
+ LLVMValueRef s0, s1, s2;
+ LLVMValueRef t0, t1, t2;
+ LLVMValueRef r0, r1, r2;
+ LLVMValueRef dsdx, dsdy, dtdx, dtdy, drdx, drdy;
+ LLVMValueRef rho, lod;
+
+ /*
+ * dsdx = abs(s[1] - s[0]);
+ * dsdy = abs(s[2] - s[0]);
+ * dtdx = abs(t[1] - t[0]);
+ * dtdy = abs(t[2] - t[0]);
+ * drdx = abs(r[1] - r[0]);
+ * drdy = abs(r[2] - r[0]);
+ * XXX we're assuming a four-element quad in 2x2 layout here.
+ */
+ s0 = LLVMBuildExtractElement(bld->builder, s, index0, "s0");
+ s1 = LLVMBuildExtractElement(bld->builder, s, index1, "s1");
+ s2 = LLVMBuildExtractElement(bld->builder, s, index2, "s2");
+ dsdx = LLVMBuildSub(bld->builder, s1, s0, "");
+ dsdx = lp_build_abs(float_bld, dsdx);
+ dsdy = LLVMBuildSub(bld->builder, s2, s0, "");
+ dsdy = lp_build_abs(float_bld, dsdy);
+ if (dims > 1) {
+ t0 = LLVMBuildExtractElement(bld->builder, t, index0, "t0");
+ t1 = LLVMBuildExtractElement(bld->builder, t, index1, "t1");
+ t2 = LLVMBuildExtractElement(bld->builder, t, index2, "t2");
+ dtdx = LLVMBuildSub(bld->builder, t1, t0, "");
+ dtdx = lp_build_abs(float_bld, dtdx);
+ dtdy = LLVMBuildSub(bld->builder, t2, t0, "");
+ dtdy = lp_build_abs(float_bld, dtdy);
+ if (dims > 2) {
+ r0 = LLVMBuildExtractElement(bld->builder, r, index0, "r0");
+ r1 = LLVMBuildExtractElement(bld->builder, r, index1, "r1");
+ r2 = LLVMBuildExtractElement(bld->builder, r, index2, "r2");
+ drdx = LLVMBuildSub(bld->builder, r1, r0, "");
+ drdx = lp_build_abs(float_bld, drdx);
+ drdy = LLVMBuildSub(bld->builder, r2, r0, "");
+ drdy = lp_build_abs(float_bld, drdy);
+ }
+ }
+
+ /* Compute rho = max of all partial derivatives scaled by texture size.
+ * XXX this could be vectorized somewhat
+ */
+ rho = LLVMBuildMul(bld->builder,
+ lp_build_max(float_bld, dsdx, dsdy),
+ lp_build_int_to_float(float_bld, width), "");
+ if (dims > 1) {
+ LLVMValueRef max;
+ max = LLVMBuildMul(bld->builder,
+ lp_build_max(float_bld, dtdx, dtdy),
+ lp_build_int_to_float(float_bld, height), "");
+ rho = lp_build_max(float_bld, rho, max);
+ if (dims > 2) {
+ max = LLVMBuildMul(bld->builder,
+ lp_build_max(float_bld, drdx, drdy),
+ lp_build_int_to_float(float_bld, depth), "");
+ rho = lp_build_max(float_bld, rho, max);
+ }
+ }
+
+ /* compute lod = log2(rho) */
+ lod = lp_build_log2(float_bld, rho);
+
+ /* add sampler lod bias */
+ lod = LLVMBuildAdd(bld->builder, lod, sampler_lod_bias, "sampler LOD bias");
+
+ /* add shader lod bias */
+ /* XXX for now we take only the first element since our lod is scalar */
+ shader_lod_bias = LLVMBuildExtractElement(bld->builder, shader_lod_bias,
+ LLVMConstInt(LLVMInt32Type(), 0, 0), "");
+ lod = LLVMBuildAdd(bld->builder, lod, shader_lod_bias, "shader LOD bias");
+
+ /* clamp lod */
+ lod = lp_build_clamp(float_bld, lod, min_lod, max_lod);
+
+ return lod;
+ }
+}
+
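As a worked illustration of the LOD computation above (numbers are illustrative): with dsdx = dsdy = 1/64 and a 256-texel-wide 1D texture, rho = (1/64) * 256 = 4 and lod = log2(4) = 2, before the sampler and shader biases are added and the min/max clamp is applied.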
+
+/**
+ * For PIPE_TEX_MIPFILTER_NEAREST, convert float LOD to integer
+ * mipmap level index.
+ * Note: this is all scalar code.
+ * \param lod scalar float texture level of detail
+ * \param level_out returns the integer mipmap level index
*/
static void
-lp_build_sample_2d_nearest_soa(struct lp_build_sample_context *bld,
- LLVMValueRef s,
- LLVMValueRef t,
- LLVMValueRef width,
- LLVMValueRef height,
- LLVMValueRef stride,
- LLVMValueRef data_ptr,
- LLVMValueRef *texel)
+lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
+ unsigned unit,
+ LLVMValueRef lod,
+ LLVMValueRef *level_out)
{
- LLVMValueRef x, y;
+ struct lp_build_context *float_bld = &bld->float_bld;
+ struct lp_build_context *int_bld = &bld->int_bld;
+ LLVMValueRef last_level, level;
- x = lp_build_sample_wrap_nearest(bld, s, width,
- bld->static_state->pot_width,
- bld->static_state->wrap_s);
- y = lp_build_sample_wrap_nearest(bld, t, height,
- bld->static_state->pot_height,
- bld->static_state->wrap_t);
+ LLVMValueRef zero = LLVMConstInt(LLVMInt32Type(), 0, 0);
- lp_build_name(x, "tex.x.wrapped");
- lp_build_name(y, "tex.y.wrapped");
+ last_level = bld->dynamic_state->last_level(bld->dynamic_state,
+ bld->builder, unit);
+
+ /* convert float lod to integer */
+ level = lp_build_iround(float_bld, lod);
- lp_build_sample_texel_soa(bld, width, height, x, y, stride, data_ptr, texel);
+ /* clamp level to legal range of levels */
+ *level_out = lp_build_clamp(int_bld, level, zero, last_level);
}
/**
- * Sample 2D texture with bilinear filtering.
+ * For PIPE_TEX_MIPFILTER_LINEAR, convert the float LOD to two (adjacent)
+ * integer mipmap level indexes. Later, we'll sample from those
+ * two mipmap levels and interpolate between them.
*/
static void
-lp_build_sample_2d_linear_soa(struct lp_build_sample_context *bld,
+lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
+ unsigned unit,
+ LLVMValueRef lod,
+ LLVMValueRef *level0_out,
+ LLVMValueRef *level1_out,
+ LLVMValueRef *weight_out)
+{
+ struct lp_build_context *float_bld = &bld->float_bld;
+ struct lp_build_context *int_bld = &bld->int_bld;
+ LLVMValueRef last_level, level;
+
+ last_level = bld->dynamic_state->last_level(bld->dynamic_state,
+ bld->builder, unit);
+
+ /* convert float lod to integer */
+ level = lp_build_ifloor(float_bld, lod);
+
+ /* compute level 0 and clamp to legal range of levels */
+ *level0_out = lp_build_clamp(int_bld, level,
+ int_bld->zero,
+ last_level);
+ /* compute level 1 and clamp to legal range of levels */
+ *level1_out = lp_build_add(int_bld, *level0_out, int_bld->one);
+ *level1_out = lp_build_min(int_bld, *level1_out, last_level);
+
+ *weight_out = lp_build_fract(float_bld, lod);
+}
+
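As a worked illustration of the mapping above (numbers are illustrative): lod = 2.3 gives level0 = ifloor(2.3) = 2, level1 = min(2 + 1, last_level) = 3 and weight = fract(2.3) = 0.3, i.e. a 70/30 blend of mipmap levels 2 and 3.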
+
+/**
+ * Generate code to sample a mipmap level with nearest filtering.
+ * If sampling a cube texture, r = cube face in [0,5].
+ */
+static void
+lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
+ LLVMValueRef width_vec,
+ LLVMValueRef height_vec,
+ LLVMValueRef depth_vec,
+ LLVMValueRef row_stride_vec,
+ LLVMValueRef img_stride_vec,
+ LLVMValueRef data_ptr,
LLVMValueRef s,
LLVMValueRef t,
- LLVMValueRef width,
- LLVMValueRef height,
- LLVMValueRef stride,
- LLVMValueRef data_ptr,
- LLVMValueRef *texel)
+ LLVMValueRef r,
+ LLVMValueRef colors_out[4])
{
- LLVMValueRef s_fpart;
- LLVMValueRef t_fpart;
- LLVMValueRef x0, x1;
- LLVMValueRef y0, y1;
+ const int dims = texture_dims(bld->static_state->target);
+ LLVMValueRef x, y, z;
+
+ /*
+ * Compute integer texcoords.
+ */
+ x = lp_build_sample_wrap_nearest(bld, s, width_vec,
+ bld->static_state->pot_width,
+ bld->static_state->wrap_s);
+ lp_build_name(x, "tex.x.wrapped");
+
+ if (dims >= 2) {
+ y = lp_build_sample_wrap_nearest(bld, t, height_vec,
+ bld->static_state->pot_height,
+ bld->static_state->wrap_t);
+ lp_build_name(y, "tex.y.wrapped");
+
+ if (dims == 3) {
+         z = lp_build_sample_wrap_nearest(bld, r, depth_vec,
+                                          bld->static_state->pot_depth,
+                                          bld->static_state->wrap_r);
+ lp_build_name(z, "tex.z.wrapped");
+ }
+ else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+ z = r;
+ }
+ else {
+ z = NULL;
+ }
+ }
+ else {
+ y = z = NULL;
+ }
+
+ /*
+ * Get texture colors.
+ */
+ lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+ x, y, z,
+ row_stride_vec, img_stride_vec,
+ data_ptr, colors_out);
+}
+
+
+/**
+ * Generate code to sample a mipmap level with linear filtering.
+ * If sampling a cube texture, r = cube face in [0,5].
+ */
+static void
+lp_build_sample_image_linear(struct lp_build_sample_context *bld,
+ LLVMValueRef width_vec,
+ LLVMValueRef height_vec,
+ LLVMValueRef depth_vec,
+ LLVMValueRef row_stride_vec,
+ LLVMValueRef img_stride_vec,
+ LLVMValueRef data_ptr,
+ LLVMValueRef s,
+ LLVMValueRef t,
+ LLVMValueRef r,
+ LLVMValueRef colors_out[4])
+{
+ const int dims = texture_dims(bld->static_state->target);
+ LLVMValueRef x0, y0, z0, x1, y1, z1;
+ LLVMValueRef s_fpart, t_fpart, r_fpart;
LLVMValueRef neighbors[2][2][4];
- unsigned chan;
+ int chan;
- lp_build_sample_wrap_linear(bld, s, width, bld->static_state->pot_width,
- bld->static_state->wrap_s, &x0, &x1, &s_fpart);
- lp_build_sample_wrap_linear(bld, t, height, bld->static_state->pot_height,
- bld->static_state->wrap_t, &y0, &y1, &t_fpart);
+ /*
+ * Compute integer texcoords.
+ */
+ lp_build_sample_wrap_linear(bld, s, width_vec,
+ bld->static_state->pot_width,
+ bld->static_state->wrap_s,
+ &x0, &x1, &s_fpart);
+ lp_build_name(x0, "tex.x0.wrapped");
+ lp_build_name(x1, "tex.x1.wrapped");
+
+ if (dims >= 2) {
+ lp_build_sample_wrap_linear(bld, t, height_vec,
+ bld->static_state->pot_height,
+ bld->static_state->wrap_t,
+ &y0, &y1, &t_fpart);
+ lp_build_name(y0, "tex.y0.wrapped");
+ lp_build_name(y1, "tex.y1.wrapped");
+
+ if (dims == 3) {
+ lp_build_sample_wrap_linear(bld, r, depth_vec,
+ bld->static_state->pot_depth,
+ bld->static_state->wrap_r,
+ &z0, &z1, &r_fpart);
+ lp_build_name(z0, "tex.z0.wrapped");
+ lp_build_name(z1, "tex.z1.wrapped");
+ }
+ else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+ z0 = z1 = r; /* cube face */
+ r_fpart = NULL;
+ }
+ else {
+ z0 = z1 = NULL;
+ r_fpart = NULL;
+ }
+ }
+ else {
+ y0 = y1 = t_fpart = NULL;
+ z0 = z1 = r_fpart = NULL;
+ }
+
+ /*
+ * Get texture colors.
+ */
+ /* get x0/x1 texels */
+ lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+ x0, y0, z0,
+ row_stride_vec, img_stride_vec,
+ data_ptr, neighbors[0][0]);
+ lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+ x1, y0, z0,
+ row_stride_vec, img_stride_vec,
+ data_ptr, neighbors[0][1]);
+
+ if (dims == 1) {
+ /* Interpolate two samples from 1D image to produce one color */
+ for (chan = 0; chan < 4; chan++) {
+ colors_out[chan] = lp_build_lerp(&bld->texel_bld, s_fpart,
+ neighbors[0][0][chan],
+ neighbors[0][1][chan]);
+ }
+ }
+ else {
+ /* 2D/3D texture */
+ LLVMValueRef colors0[4];
+
+ /* get x0/x1 texels at y1 */
+ lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+ x0, y1, z0,
+ row_stride_vec, img_stride_vec,
+ data_ptr, neighbors[1][0]);
+ lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+ x1, y1, z0,
+ row_stride_vec, img_stride_vec,
+ data_ptr, neighbors[1][1]);
+
+ /* Bilinear interpolate the four samples from the 2D image / 3D slice */
+ for (chan = 0; chan < 4; chan++) {
+ colors0[chan] = lp_build_lerp_2d(&bld->texel_bld,
+ s_fpart, t_fpart,
+ neighbors[0][0][chan],
+ neighbors[0][1][chan],
+ neighbors[1][0][chan],
+ neighbors[1][1][chan]);
+ }
- lp_build_sample_texel_soa(bld, width, height, x0, y0, stride, data_ptr, neighbors[0][0]);
- lp_build_sample_texel_soa(bld, width, height, x1, y0, stride, data_ptr, neighbors[0][1]);
- lp_build_sample_texel_soa(bld, width, height, x0, y1, stride, data_ptr, neighbors[1][0]);
- lp_build_sample_texel_soa(bld, width, height, x1, y1, stride, data_ptr, neighbors[1][1]);
+ if (dims == 3) {
+ LLVMValueRef neighbors1[2][2][4];
+ LLVMValueRef colors1[4];
+
+ /* get x0/x1/y0/y1 texels at z1 */
+ lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+ x0, y0, z1,
+ row_stride_vec, img_stride_vec,
+ data_ptr, neighbors1[0][0]);
+ lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+ x1, y0, z1,
+ row_stride_vec, img_stride_vec,
+ data_ptr, neighbors1[0][1]);
+ lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+ x0, y1, z1,
+ row_stride_vec, img_stride_vec,
+ data_ptr, neighbors1[1][0]);
+ lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+ x1, y1, z1,
+ row_stride_vec, img_stride_vec,
+ data_ptr, neighbors1[1][1]);
+
+ /* Bilinear interpolate the four samples from the second Z slice */
+ for (chan = 0; chan < 4; chan++) {
+ colors1[chan] = lp_build_lerp_2d(&bld->texel_bld,
+ s_fpart, t_fpart,
+ neighbors1[0][0][chan],
+ neighbors1[0][1][chan],
+ neighbors1[1][0][chan],
+ neighbors1[1][1][chan]);
+ }
- /* TODO: Don't interpolate missing channels */
- for(chan = 0; chan < 4; ++chan) {
- texel[chan] = lp_build_lerp_2d(&bld->texel_bld,
- s_fpart, t_fpart,
- neighbors[0][0][chan],
- neighbors[0][1][chan],
- neighbors[1][0][chan],
- neighbors[1][1][chan]);
+ /* Linearly interpolate the two samples from the two 3D slices */
+ for (chan = 0; chan < 4; chan++) {
+ colors_out[chan] = lp_build_lerp(&bld->texel_bld,
+ r_fpart,
+ colors0[chan], colors1[chan]);
+ }
+ }
+ else {
+ /* 2D tex */
+ for (chan = 0; chan < 4; chan++) {
+ colors_out[chan] = colors0[chan];
+ }
+ }
+ }
+}
+
+
+/** Helper used by lp_build_cube_lookup() */
+static LLVMValueRef
+lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord)
+{
+ /* ima = -0.5 / abs(coord); */
+ LLVMValueRef negHalf = lp_build_const_vec(coord_bld->type, -0.5);
+ LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
+ LLVMValueRef ima = lp_build_mul(coord_bld, negHalf,
+ lp_build_rcp(coord_bld, absCoord));
+ return ima;
+}
+
+
+/**
+ * Helper used by lp_build_cube_lookup()
+ * \param sign scalar +1 or -1
+ * \param coord float vector
+ * \param ima float vector
+ */
+static LLVMValueRef
+lp_build_cube_coord(struct lp_build_context *coord_bld,
+ LLVMValueRef sign, int negate_coord,
+ LLVMValueRef coord, LLVMValueRef ima)
+{
+ /* return negate(coord) * ima * sign + 0.5; */
+ LLVMValueRef half = lp_build_const_vec(coord_bld->type, 0.5);
+ LLVMValueRef res;
+
+ assert(negate_coord == +1 || negate_coord == -1);
+
+ if (negate_coord == -1) {
+ coord = lp_build_negate(coord_bld, coord);
+ }
+
+ res = lp_build_mul(coord_bld, coord, ima);
+ if (sign) {
+ sign = lp_build_broadcast_scalar(coord_bld, sign);
+ res = lp_build_mul(coord_bld, res, sign);
+ }
+ res = lp_build_add(coord_bld, res, half);
+
+ return res;
+}
+
+
+/** Helper used by lp_build_cube_lookup()
+ * Return (major_coord >= 0) ? pos_face : neg_face;
+ */
+static LLVMValueRef
+lp_build_cube_face(struct lp_build_sample_context *bld,
+ LLVMValueRef major_coord,
+ unsigned pos_face, unsigned neg_face)
+{
+ LLVMValueRef cmp = LLVMBuildFCmp(bld->builder, LLVMRealUGE,
+ major_coord,
+ bld->float_bld.zero, "");
+ LLVMValueRef pos = LLVMConstInt(LLVMInt32Type(), pos_face, 0);
+ LLVMValueRef neg = LLVMConstInt(LLVMInt32Type(), neg_face, 0);
+ LLVMValueRef res = LLVMBuildSelect(bld->builder, cmp, pos, neg, "");
+ return res;
+}
+
+
+
+/**
+ * Generate code to do cube face selection and per-face texcoords.
+ */
+static void
+lp_build_cube_lookup(struct lp_build_sample_context *bld,
+ LLVMValueRef s,
+ LLVMValueRef t,
+ LLVMValueRef r,
+ LLVMValueRef *face,
+ LLVMValueRef *face_s,
+ LLVMValueRef *face_t)
+{
+ struct lp_build_context *float_bld = &bld->float_bld;
+ struct lp_build_context *coord_bld = &bld->coord_bld;
+ LLVMValueRef rx, ry, rz;
+ LLVMValueRef arx, ary, arz;
+ LLVMValueRef c25 = LLVMConstReal(LLVMFloatType(), 0.25);
+ LLVMValueRef arx_ge_ary, arx_ge_arz;
+ LLVMValueRef ary_ge_arx, ary_ge_arz;
+ LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz;
+ LLVMValueRef rx_pos, ry_pos, rz_pos;
+
+ assert(bld->coord_bld.type.length == 4);
+
+ /*
+ * Use the average of the four pixel's texcoords to choose the face.
+ */
+ rx = lp_build_mul(float_bld, c25,
+ lp_build_sum_vector(&bld->coord_bld, s));
+ ry = lp_build_mul(float_bld, c25,
+ lp_build_sum_vector(&bld->coord_bld, t));
+ rz = lp_build_mul(float_bld, c25,
+ lp_build_sum_vector(&bld->coord_bld, r));
+
+ arx = lp_build_abs(float_bld, rx);
+ ary = lp_build_abs(float_bld, ry);
+ arz = lp_build_abs(float_bld, rz);
+
+ /*
+ * Compare sign/magnitude of rx,ry,rz to determine face
+ */
+ arx_ge_ary = LLVMBuildFCmp(bld->builder, LLVMRealUGE, arx, ary, "");
+ arx_ge_arz = LLVMBuildFCmp(bld->builder, LLVMRealUGE, arx, arz, "");
+ ary_ge_arx = LLVMBuildFCmp(bld->builder, LLVMRealUGE, ary, arx, "");
+ ary_ge_arz = LLVMBuildFCmp(bld->builder, LLVMRealUGE, ary, arz, "");
+
+ arx_ge_ary_arz = LLVMBuildAnd(bld->builder, arx_ge_ary, arx_ge_arz, "");
+ ary_ge_arx_arz = LLVMBuildAnd(bld->builder, ary_ge_arx, ary_ge_arz, "");
+
+ rx_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, rx, float_bld->zero, "");
+ ry_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, ry, float_bld->zero, "");
+ rz_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, rz, float_bld->zero, "");
+
+ {
+ struct lp_build_flow_context *flow_ctx;
+ struct lp_build_if_state if_ctx;
+
+ flow_ctx = lp_build_flow_create(bld->builder);
+ lp_build_flow_scope_begin(flow_ctx);
+
+ *face_s = bld->coord_bld.undef;
+ *face_t = bld->coord_bld.undef;
+ *face = bld->int_bld.undef;
+
+ lp_build_name(*face_s, "face_s");
+ lp_build_name(*face_t, "face_t");
+ lp_build_name(*face, "face");
+
+ lp_build_flow_scope_declare(flow_ctx, face_s);
+ lp_build_flow_scope_declare(flow_ctx, face_t);
+ lp_build_flow_scope_declare(flow_ctx, face);
+
+ lp_build_if(&if_ctx, flow_ctx, bld->builder, arx_ge_ary_arz);
+ {
+ /* +/- X face */
+ LLVMValueRef sign = lp_build_sgn(float_bld, rx);
+ LLVMValueRef ima = lp_build_cube_ima(coord_bld, s);
+ *face_s = lp_build_cube_coord(coord_bld, sign, +1, r, ima);
+ *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
+ *face = lp_build_cube_face(bld, rx,
+ PIPE_TEX_FACE_POS_X,
+ PIPE_TEX_FACE_NEG_X);
+ }
+ lp_build_else(&if_ctx);
+ {
+ struct lp_build_flow_context *flow_ctx2;
+ struct lp_build_if_state if_ctx2;
+
+ LLVMValueRef face_s2 = bld->coord_bld.undef;
+ LLVMValueRef face_t2 = bld->coord_bld.undef;
+ LLVMValueRef face2 = bld->int_bld.undef;
+
+ flow_ctx2 = lp_build_flow_create(bld->builder);
+ lp_build_flow_scope_begin(flow_ctx2);
+ lp_build_flow_scope_declare(flow_ctx2, &face_s2);
+ lp_build_flow_scope_declare(flow_ctx2, &face_t2);
+ lp_build_flow_scope_declare(flow_ctx2, &face2);
+
+ ary_ge_arx_arz = LLVMBuildAnd(bld->builder, ary_ge_arx, ary_ge_arz, "");
+
+ lp_build_if(&if_ctx2, flow_ctx2, bld->builder, ary_ge_arx_arz);
+ {
+ /* +/- Y face */
+ LLVMValueRef sign = lp_build_sgn(float_bld, ry);
+ LLVMValueRef ima = lp_build_cube_ima(coord_bld, t);
+ face_s2 = lp_build_cube_coord(coord_bld, NULL, -1, s, ima);
+ face_t2 = lp_build_cube_coord(coord_bld, sign, -1, r, ima);
+ face2 = lp_build_cube_face(bld, ry,
+ PIPE_TEX_FACE_POS_Y,
+ PIPE_TEX_FACE_NEG_Y);
+ }
+ lp_build_else(&if_ctx2);
+ {
+ /* +/- Z face */
+ LLVMValueRef sign = lp_build_sgn(float_bld, rz);
+ LLVMValueRef ima = lp_build_cube_ima(coord_bld, r);
+ face_s2 = lp_build_cube_coord(coord_bld, sign, -1, s, ima);
+ face_t2 = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
+ face2 = lp_build_cube_face(bld, rz,
+ PIPE_TEX_FACE_POS_Z,
+ PIPE_TEX_FACE_NEG_Z);
+ }
+ lp_build_endif(&if_ctx2);
+ lp_build_flow_scope_end(flow_ctx2);
+ lp_build_flow_destroy(flow_ctx2);
+
+ *face_s = face_s2;
+ *face_t = face_t2;
+ *face = face2;
+ }
+
+ lp_build_endif(&if_ctx);
+ lp_build_flow_scope_end(flow_ctx);
+ lp_build_flow_destroy(flow_ctx);
}
}
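For reference, the face selection generated above follows the standard cube-map rule: the axis with the largest magnitude is the major axis, and the per-face (s,t) are built from the two minor axes as coord * (-0.5/|major|) * sign + 0.5 (the IR selects the face from the quad-averaged coords but computes the per-face texcoords per pixel). A scalar C sketch of the same math (hypothetical helper; assumes <math.h> and the PIPE_TEX_FACE_* enums from p_defines.h; not part of this patch):

   static void
   cube_lookup_scalar(float s, float t, float r,
                      unsigned *face, float *fs, float *ft)
   {
      float as = fabsf(s), at = fabsf(t), ar = fabsf(r);

      if (as >= at && as >= ar) {
         /* +/- X face */
         float ima = -0.5f / as;
         *face = s >= 0.0f ? PIPE_TEX_FACE_POS_X : PIPE_TEX_FACE_NEG_X;
         *fs = r * ima * (s >= 0.0f ? 1.0f : -1.0f) + 0.5f;
         *ft = t * ima + 0.5f;
      }
      else if (at >= as && at >= ar) {
         /* +/- Y face */
         float ima = -0.5f / at;
         *face = t >= 0.0f ? PIPE_TEX_FACE_POS_Y : PIPE_TEX_FACE_NEG_Y;
         *fs = -s * ima + 0.5f;
         *ft = -r * ima * (t >= 0.0f ? 1.0f : -1.0f) + 0.5f;
      }
      else {
         /* +/- Z face */
         float ima = -0.5f / ar;
         *face = r >= 0.0f ? PIPE_TEX_FACE_POS_Z : PIPE_TEX_FACE_NEG_Z;
         *fs = -s * ima * (r >= 0.0f ? 1.0f : -1.0f) + 0.5f;
         *ft = t * ima + 0.5f;
      }
   }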
+
+/**
+ * Sample the texture/mipmap using given image filter and mip filter.
+ * data_ptr0 and data_ptr1 point to the two mipmap levels to sample
+ * from. width0/1_vec, height0/1_vec, depth0/1_vec indicate their sizes.
+ * If we're using nearest miplevel sampling the '1' values will be null/unused.
+ */
+static void
+lp_build_sample_mipmap(struct lp_build_sample_context *bld,
+ unsigned img_filter,
+ unsigned mip_filter,
+ LLVMValueRef s,
+ LLVMValueRef t,
+ LLVMValueRef r,
+ LLVMValueRef lod_fpart,
+ LLVMValueRef width0_vec,
+ LLVMValueRef width1_vec,
+ LLVMValueRef height0_vec,
+ LLVMValueRef height1_vec,
+ LLVMValueRef depth0_vec,
+ LLVMValueRef depth1_vec,
+ LLVMValueRef row_stride0_vec,
+ LLVMValueRef row_stride1_vec,
+ LLVMValueRef img_stride0_vec,
+ LLVMValueRef img_stride1_vec,
+ LLVMValueRef data_ptr0,
+ LLVMValueRef data_ptr1,
+ LLVMValueRef *colors_out)
+{
+ LLVMValueRef colors0[4], colors1[4];
+ int chan;
+
+ if (img_filter == PIPE_TEX_FILTER_NEAREST) {
+ lp_build_sample_image_nearest(bld,
+ width0_vec, height0_vec, depth0_vec,
+ row_stride0_vec, img_stride0_vec,
+ data_ptr0, s, t, r, colors0);
+
+ if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+ /* sample the second mipmap level, and interp */
+ lp_build_sample_image_nearest(bld,
+ width1_vec, height1_vec, depth1_vec,
+ row_stride1_vec, img_stride1_vec,
+ data_ptr1, s, t, r, colors1);
+ }
+ }
+ else {
+ assert(img_filter == PIPE_TEX_FILTER_LINEAR);
+
+ lp_build_sample_image_linear(bld,
+ width0_vec, height0_vec, depth0_vec,
+ row_stride0_vec, img_stride0_vec,
+ data_ptr0, s, t, r, colors0);
+
+ if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+ /* sample the second mipmap level, and interp */
+ lp_build_sample_image_linear(bld,
+ width1_vec, height1_vec, depth1_vec,
+ row_stride1_vec, img_stride1_vec,
+ data_ptr1, s, t, r, colors1);
+ }
+ }
+
+ if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+ /* interpolate samples from the two mipmap levels */
+ for (chan = 0; chan < 4; chan++) {
+ colors_out[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
+ colors0[chan], colors1[chan]);
+ }
+ }
+ else {
+ /* use first/only level's colors */
+ for (chan = 0; chan < 4; chan++) {
+ colors_out[chan] = colors0[chan];
+ }
+ }
+}
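In scalar terms the final combine in lp_build_sample_mipmap() is just a lerp by the fractional LOD when the mip filter is LINEAR, and a pass-through of the level-0 colors otherwise (illustrative sketch with scalar stand-ins for the SoA vectors):

   /* scalar equivalent of the colors_out[] combine above */
   for (chan = 0; chan < 4; chan++) {
      if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR)
         colors_out[chan] = colors0[chan] +
                            lod_fpart * (colors1[chan] - colors0[chan]);
      else
         colors_out[chan] = colors0[chan];
   }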
+
+
+
+/**
+ * General texture sampling codegen.
+ * This function handles texture sampling for all texture targets (1D,
+ * 2D, 3D, cube) and all filtering modes.
+ */
+static void
+lp_build_sample_general(struct lp_build_sample_context *bld,
+ unsigned unit,
+ LLVMValueRef s,
+ LLVMValueRef t,
+ LLVMValueRef r,
+ LLVMValueRef lodbias,
+ LLVMValueRef width,
+ LLVMValueRef height,
+ LLVMValueRef depth,
+ LLVMValueRef width_vec,
+ LLVMValueRef height_vec,
+ LLVMValueRef depth_vec,
+ LLVMValueRef row_stride_array,
+ LLVMValueRef img_stride_array,
+ LLVMValueRef data_array,
+ LLVMValueRef *colors_out)
+{
+ struct lp_build_context *float_bld = &bld->float_bld;
+ const unsigned mip_filter = bld->static_state->min_mip_filter;
+ const unsigned min_filter = bld->static_state->min_img_filter;
+ const unsigned mag_filter = bld->static_state->mag_img_filter;
+ const int dims = texture_dims(bld->static_state->target);
+ LLVMValueRef lod = NULL, lod_fpart = NULL;
+ LLVMValueRef ilevel0, ilevel1 = NULL, ilevel0_vec, ilevel1_vec = NULL;
+ LLVMValueRef width0_vec = NULL, height0_vec = NULL, depth0_vec = NULL;
+ LLVMValueRef width1_vec = NULL, height1_vec = NULL, depth1_vec = NULL;
+ LLVMValueRef row_stride0_vec = NULL, row_stride1_vec = NULL;
+ LLVMValueRef img_stride0_vec = NULL, img_stride1_vec = NULL;
+ LLVMValueRef data_ptr0, data_ptr1 = NULL;
+
+ /*
+ printf("%s mip %d min %d mag %d\n", __FUNCTION__,
+ mip_filter, min_filter, mag_filter);
+ */
+
+ /*
+ * Compute the level of detail (float).
+ */
+ if (min_filter != mag_filter ||
+ mip_filter != PIPE_TEX_MIPFILTER_NONE) {
+ /* Need to compute lod either to choose mipmap levels or to
+ * distinguish between minification/magnification with one mipmap level.
+ */
+ lod = lp_build_lod_selector(bld, s, t, r, lodbias, width, height, depth);
+ }
+
+ /*
+ * Compute integer mipmap level(s) to fetch texels from.
+ */
+ if (mip_filter == PIPE_TEX_MIPFILTER_NONE) {
+ /* always use mip level 0 */
+ ilevel0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+ }
+ else {
+ if (mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
+ lp_build_nearest_mip_level(bld, unit, lod, &ilevel0);
+ }
+ else {
+ assert(mip_filter == PIPE_TEX_MIPFILTER_LINEAR);
+ lp_build_linear_mip_levels(bld, unit, lod, &ilevel0, &ilevel1,
+ &lod_fpart);
+ lod_fpart = lp_build_broadcast_scalar(&bld->coord_bld, lod_fpart);
+ }
+ }
+
+ /*
+ * Convert scalar integer mipmap levels into vectors.
+ */
+ ilevel0_vec = lp_build_broadcast_scalar(&bld->int_coord_bld, ilevel0);
+ if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR)
+ ilevel1_vec = lp_build_broadcast_scalar(&bld->int_coord_bld, ilevel1);
+
+ /*
+ * Compute width, height at mipmap level 'ilevel0'
+ */
+ width0_vec = lp_build_minify(bld, width_vec, ilevel0_vec);
+ if (dims >= 2) {
+ height0_vec = lp_build_minify(bld, height_vec, ilevel0_vec);
+ row_stride0_vec = lp_build_get_level_stride_vec(bld, row_stride_array,
+ ilevel0);
+ if (dims == 3 || bld->static_state->target == PIPE_TEXTURE_CUBE) {
+ img_stride0_vec = lp_build_get_level_stride_vec(bld,
+ img_stride_array,
+ ilevel0);
+ if (dims == 3) {
+ depth0_vec = lp_build_minify(bld, depth_vec, ilevel0_vec);
+ }
+ }
+ }
+ if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+ /* compute width, height, depth for second mipmap level at 'ilevel1' */
+ width1_vec = lp_build_minify(bld, width_vec, ilevel1_vec);
+ if (dims >= 2) {
+ height1_vec = lp_build_minify(bld, height_vec, ilevel1_vec);
+ row_stride1_vec = lp_build_get_level_stride_vec(bld, row_stride_array,
+ ilevel1);
+ if (dims == 3 || bld->static_state->target == PIPE_TEXTURE_CUBE) {
+ img_stride1_vec = lp_build_get_level_stride_vec(bld,
+ img_stride_array,
+ ilevel1);
+            if (dims == 3) {
+ depth1_vec = lp_build_minify(bld, depth_vec, ilevel1_vec);
+ }
+ }
+ }
+ }
+
+ /*
+ * Choose cube face, recompute per-face texcoords.
+ */
+ if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+ LLVMValueRef face, face_s, face_t;
+ lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t);
+ s = face_s; /* vec */
+ t = face_t; /* vec */
+ /* use 'r' to indicate cube face */
+ r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */
+ }
+
+ /*
+ * Get pointer(s) to image data for mipmap level(s).
+ */
+ data_ptr0 = lp_build_get_mipmap_level(bld, data_array, ilevel0);
+ if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+ data_ptr1 = lp_build_get_mipmap_level(bld, data_array, ilevel1);
+ }
+
+ /*
+ * Get/interpolate texture colors.
+ */
+ if (min_filter == mag_filter) {
+      /* no need to distinguish between minification and magnification */
+ lp_build_sample_mipmap(bld, min_filter, mip_filter, s, t, r, lod_fpart,
+ width0_vec, width1_vec,
+ height0_vec, height1_vec,
+ depth0_vec, depth1_vec,
+ row_stride0_vec, row_stride1_vec,
+ img_stride0_vec, img_stride1_vec,
+ data_ptr0, data_ptr1,
+ colors_out);
+ }
+ else {
+      /* Emit conditional to choose min image filter or mag image filter
+       * depending on the lod being >= 0 or < 0, respectively.
+ */
+ struct lp_build_flow_context *flow_ctx;
+ struct lp_build_if_state if_ctx;
+ LLVMValueRef minify;
+
+ flow_ctx = lp_build_flow_create(bld->builder);
+ lp_build_flow_scope_begin(flow_ctx);
+
+ lp_build_flow_scope_declare(flow_ctx, &colors_out[0]);
+ lp_build_flow_scope_declare(flow_ctx, &colors_out[1]);
+ lp_build_flow_scope_declare(flow_ctx, &colors_out[2]);
+ lp_build_flow_scope_declare(flow_ctx, &colors_out[3]);
+
+      /* minify = lod >= 0.0 */
+ minify = LLVMBuildFCmp(bld->builder, LLVMRealUGE,
+ lod, float_bld->zero, "");
+
+ lp_build_if(&if_ctx, flow_ctx, bld->builder, minify);
+ {
+ /* Use the minification filter */
+ lp_build_sample_mipmap(bld, min_filter, mip_filter,
+ s, t, r, lod_fpart,
+ width0_vec, width1_vec,
+ height0_vec, height1_vec,
+ depth0_vec, depth1_vec,
+ row_stride0_vec, row_stride1_vec,
+ img_stride0_vec, img_stride1_vec,
+ data_ptr0, data_ptr1,
+ colors_out);
+ }
+ lp_build_else(&if_ctx);
+ {
+ /* Use the magnification filter */
+ lp_build_sample_mipmap(bld, mag_filter, mip_filter,
+ s, t, r, lod_fpart,
+ width0_vec, width1_vec,
+ height0_vec, height1_vec,
+ depth0_vec, depth1_vec,
+ row_stride0_vec, row_stride1_vec,
+ img_stride0_vec, img_stride1_vec,
+ data_ptr0, data_ptr1,
+ colors_out);
+ }
+ lp_build_endif(&if_ctx);
+
+ lp_build_flow_scope_end(flow_ctx);
+ lp_build_flow_destroy(flow_ctx);
+ }
+}
+
+
+
static void
lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder,
struct lp_type dst_type,
LLVMValueRef packed,
LLVMValueRef *rgba)
{
- LLVMValueRef mask = lp_build_int_const_scalar(dst_type, 0xff);
+ LLVMValueRef mask = lp_build_const_int_vec(dst_type, 0xff);
unsigned chan;
/* Decode the input vector components */
@@ -799,7 +1793,7 @@ lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder,
input = packed;
if(start)
- input = LLVMBuildLShr(builder, input, lp_build_int_const_scalar(dst_type, start), "");
+ input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(dst_type, start), "");
if(stop < 32)
input = LLVMBuildAnd(builder, input, mask, "");
@@ -817,8 +1811,8 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
LLVMValueRef t,
LLVMValueRef width,
LLVMValueRef height,
- LLVMValueRef stride,
- LLVMValueRef data_ptr,
+ LLVMValueRef stride_array,
+ LLVMValueRef data_array,
LLVMValueRef *texel)
{
LLVMBuilderRef builder = bld->builder;
@@ -834,8 +1828,9 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
LLVMValueRef neighbors_hi[2][2];
LLVMValueRef packed, packed_lo, packed_hi;
LLVMValueRef unswizzled[4];
+ LLVMValueRef stride;
- lp_build_context_init(&i32, builder, lp_type_int(32));
+ lp_build_context_init(&i32, builder, lp_type_int_vec(32));
lp_build_context_init(&h16, builder, lp_type_ufixed(16));
lp_build_context_init(&u8n, builder, lp_type_unorm(8));
@@ -860,17 +1855,17 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
t = LLVMBuildFPToSI(builder, t, i32_vec_type, "");
/* subtract 0.5 (add -128) */
- i32_c128 = lp_build_int_const_scalar(i32.type, -128);
+ i32_c128 = lp_build_const_int_vec(i32.type, -128);
s = LLVMBuildAdd(builder, s, i32_c128, "");
t = LLVMBuildAdd(builder, t, i32_c128, "");
/* compute floor (shift right 8) */
- i32_c8 = lp_build_int_const_scalar(i32.type, 8);
+ i32_c8 = lp_build_const_int_vec(i32.type, 8);
s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
/* compute fractional part (AND with 0xff) */
- i32_c255 = lp_build_int_const_scalar(i32.type, 255);
+ i32_c255 = lp_build_const_int_vec(i32.type, 255);
s_fpart = LLVMBuildAnd(builder, s, i32_c255, "");
t_fpart = LLVMBuildAnd(builder, t, i32_c255, "");
@@ -941,6 +1936,8 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, shuffle_hi, "");
}
+ stride = lp_build_get_const_level_stride_vec(bld, stride_array, 0);
+
/*
* Fetch the pixels as 4 x 32bit (rgba order might differ):
*
@@ -958,10 +1955,10 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
* The higher 8 bits of the resulting elements will be zero.
*/
- neighbors[0][0] = lp_build_sample_packed(bld, x0, y0, stride, data_ptr);
- neighbors[0][1] = lp_build_sample_packed(bld, x1, y0, stride, data_ptr);
- neighbors[1][0] = lp_build_sample_packed(bld, x0, y1, stride, data_ptr);
- neighbors[1][1] = lp_build_sample_packed(bld, x1, y1, stride, data_ptr);
+ neighbors[0][0] = lp_build_sample_packed(bld, x0, y0, stride, data_array);
+ neighbors[0][1] = lp_build_sample_packed(bld, x1, y0, stride, data_array);
+ neighbors[1][0] = lp_build_sample_packed(bld, x0, y1, stride, data_array);
+ neighbors[1][1] = lp_build_sample_packed(bld, x1, y1, stride, data_array);
neighbors[0][0] = LLVMBuildBitCast(builder, neighbors[0][0], u8n_vec_type, "");
neighbors[0][1] = LLVMBuildBitCast(builder, neighbors[0][1], u8n_vec_type, "");
@@ -1006,6 +2003,8 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
lp_build_format_swizzle_soa(bld->format_desc,
bld->texel_type, unswizzled,
texel);
+
+ lp_build_swizzle_soa(bld, texel);
}
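The coordinate handling in the AOS path above is plain 8.8 fixed point: bias by half a texel (-128), take the integer texel index with an arithmetic shift right by 8, and keep the low 8 bits as the bilinear weight. A scalar sketch of that split, assuming the coordinate has already been scaled to texels * 256 (illustrative only; <stdint.h> assumed):

   static inline void
   split_coord_8_8(int32_t coord_fx, int32_t *ipart, int32_t *fpart)
   {
      coord_fx += -128;           /* subtract 0.5 texel (0.5 * 256)          */
      *ipart = coord_fx >> 8;     /* floor(): index of the lower neighbor    */
      *fpart = coord_fx & 0xff;   /* 0..255 weight toward the upper neighbor */
   }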
@@ -1035,7 +2034,7 @@ lp_build_sample_compare(struct lp_build_sample_context *bld,
}
assert(res);
- res = lp_build_mul(texel_bld, res, lp_build_const_scalar(texel_bld->type, 0.25));
+ res = lp_build_mul(texel_bld, res, lp_build_const_vec(texel_bld->type, 0.25));
/* XXX returning result for default GL_DEPTH_TEXTURE_MODE = GL_LUMINANCE */
for(chan = 0; chan < 3; ++chan)
@@ -1044,194 +2043,11 @@ lp_build_sample_compare(struct lp_build_sample_context *bld,
}
-static int
-texture_dims(enum pipe_texture_target tex)
-{
- switch (tex) {
- case PIPE_TEXTURE_1D:
- return 1;
- case PIPE_TEXTURE_2D:
- case PIPE_TEXTURE_CUBE:
- return 2;
- case PIPE_TEXTURE_3D:
- return 3;
- default:
- assert(0 && "bad texture target in texture_dims()");
- return 2;
- }
-}
-
-
-/**
- * Generate code to compute texture level of detail (lambda).
- * \param s vector of texcoord s values
- * \param t vector of texcoord t values
- * \param r vector of texcoord r values
- * \param width scalar int texture width
- * \param height scalar int texture height
- * \param depth scalar int texture depth
- */
-static LLVMValueRef
-lp_build_lod_selector(struct lp_build_sample_context *bld,
- LLVMValueRef s,
- LLVMValueRef t,
- LLVMValueRef r,
- LLVMValueRef width,
- LLVMValueRef height,
- LLVMValueRef depth)
-
-{
- const int dims = texture_dims(bld->static_state->target);
- struct lp_build_context *coord_bld = &bld->coord_bld;
-
- LLVMValueRef lod_bias = lp_build_const_scalar(bld->coord_bld.type,
- bld->static_state->lod_bias);
- LLVMValueRef min_lod = lp_build_const_scalar(bld->coord_bld.type,
- bld->static_state->min_lod);
- LLVMValueRef max_lod = lp_build_const_scalar(bld->coord_bld.type,
- bld->static_state->max_lod);
-
- LLVMValueRef index0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
- LLVMValueRef index1 = LLVMConstInt(LLVMInt32Type(), 1, 0);
- LLVMValueRef index2 = LLVMConstInt(LLVMInt32Type(), 2, 0);
-
- LLVMValueRef s0, s1, s2;
- LLVMValueRef t0, t1, t2;
- LLVMValueRef r0, r1, r2;
- LLVMValueRef dsdx, dsdy, dtdx, dtdy, drdx, drdy;
- LLVMValueRef rho, lod;
-
- /*
- * dsdx = abs(s[1] - s[0]);
- * dsdy = abs(s[2] - s[0]);
- * dtdx = abs(t[1] - t[0]);
- * dtdy = abs(t[2] - t[0]);
- * drdx = abs(r[1] - r[0]);
- * drdy = abs(r[2] - r[0]);
- * XXX we're assuming a four-element quad in 2x2 layout here.
- */
- s0 = LLVMBuildExtractElement(bld->builder, s, index0, "s0");
- s1 = LLVMBuildExtractElement(bld->builder, s, index1, "s1");
- s2 = LLVMBuildExtractElement(bld->builder, s, index2, "s2");
- dsdx = lp_build_abs(coord_bld, lp_build_sub(coord_bld, s1, s0));
- dsdy = lp_build_abs(coord_bld, lp_build_sub(coord_bld, s2, s0));
- if (dims > 1) {
- t0 = LLVMBuildExtractElement(bld->builder, t, index0, "t0");
- t1 = LLVMBuildExtractElement(bld->builder, t, index1, "t1");
- t2 = LLVMBuildExtractElement(bld->builder, t, index2, "t2");
- dtdx = lp_build_abs(coord_bld, lp_build_sub(coord_bld, t1, t0));
- dtdy = lp_build_abs(coord_bld, lp_build_sub(coord_bld, t2, t0));
- if (dims > 2) {
- r0 = LLVMBuildExtractElement(bld->builder, r, index0, "r0");
- r1 = LLVMBuildExtractElement(bld->builder, r, index1, "r1");
- r2 = LLVMBuildExtractElement(bld->builder, r, index2, "r2");
- drdx = lp_build_abs(coord_bld, lp_build_sub(coord_bld, r1, r0));
- drdy = lp_build_abs(coord_bld, lp_build_sub(coord_bld, r2, r0));
- }
- }
-
- /* Compute rho = max of all partial derivatives scaled by texture size.
- * XXX this can be vectorized somewhat
- */
- rho = lp_build_mul(coord_bld,
- lp_build_max(coord_bld, dsdx, dsdy),
- lp_build_int_to_float(coord_bld, width));
- if (dims > 1) {
- LLVMValueRef max;
- max = lp_build_mul(coord_bld,
- lp_build_max(coord_bld, dtdx, dtdy),
- lp_build_int_to_float(coord_bld, height));
- rho = lp_build_max(coord_bld, rho, max);
- if (dims > 2) {
- max = lp_build_mul(coord_bld,
- lp_build_max(coord_bld, drdx, drdy),
- lp_build_int_to_float(coord_bld, depth));
- rho = lp_build_max(coord_bld, rho, max);
- }
- }
-
- /* compute lod = log2(rho) */
- lod = lp_build_log2(coord_bld, rho);
-
- /* add lod bias */
- lod = lp_build_add(coord_bld, lod, lod_bias);
-
- /* clamp lod */
- lod = lp_build_clamp(coord_bld, lod, min_lod, max_lod);
-
- return lod;
-}
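For reference, this LOD selector (removed here but still called by the new lp_build_sample_general() above, so presumably relocated earlier in the file) computes the usual lambda = log2(rho), with rho the largest texture-space derivative across the quad. A scalar sketch under the same 2x2-quad assumption (MAX2/CLAMP as in util/u_math.h, log2f from <math.h>; illustrative only):

   float dsdx = fabsf(s[1] - s[0]), dsdy = fabsf(s[2] - s[0]);
   float dtdx = fabsf(t[1] - t[0]), dtdy = fabsf(t[2] - t[0]);
   float rho  = MAX2(MAX2(dsdx, dsdy) * width,
                     MAX2(dtdx, dtdy) * height);
   float lod  = CLAMP(log2f(rho) + lod_bias, min_lod, max_lod);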
-
-
-/**
- * For PIPE_TEX_MIPFILTER_NEAREST, convert float LOD to integer
- * mipmap level index.
- * \param lod scalar float texture level of detail
- * \param level_out returns integer
- */
-static void
-lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
- unsigned unit,
- LLVMValueRef lod,
- LLVMValueRef *level_out)
-{
- struct lp_build_context *coord_bld = &bld->coord_bld;
- struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
- LLVMValueRef last_level, level;
-
- last_level = bld->dynamic_state->last_level(bld->dynamic_state,
- bld->builder, unit);
-
- /* convert float lod to integer */
- level = lp_build_iround(coord_bld, lod);
-
- /* clamp level to legal range of levels */
- *level_out = lp_build_clamp(int_coord_bld, level,
- int_coord_bld->zero,
- last_level);
-}
-
-
-/**
- * For PIPE_TEX_MIPFILTER_LINEAR, convert float LOD to integer to
- * two (adjacent) mipmap level indexes. Later, we'll sample from those
- * two mipmap levels and interpolate between them.
- */
-static void
-lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
- unsigned unit,
- LLVMValueRef lod,
- LLVMValueRef *level0_out,
- LLVMValueRef *level1_out,
- LLVMValueRef *weight_out)
-{
- struct lp_build_context *coord_bld = &bld->coord_bld;
- struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
- LLVMValueRef last_level, level;
-
- last_level = bld->dynamic_state->last_level(bld->dynamic_state,
- bld->builder, unit);
-
- /* convert float lod to integer */
- level = lp_build_ifloor(coord_bld, lod);
-
- /* compute level 0 and clamp to legal range of levels */
- *level0_out = lp_build_clamp(int_coord_bld, level,
- int_coord_bld->zero,
- last_level);
- /* compute level 1 and clamp to legal range of levels */
- *level1_out = lp_build_add(int_coord_bld, *level0_out, int_coord_bld->one);
- *level1_out = lp_build_min(int_coord_bld, *level1_out, int_coord_bld->zero);
-
- *weight_out = lp_build_fract(coord_bld, lod);
-}
-
-
-
/**
* Build texture sampling code.
* 'texel' will return a vector of four LLVMValueRefs corresponding to
* R, G, B, A.
+ * \param type vector float type to use for coords, etc.
*/
void
lp_build_sample_soa(LLVMBuilderRef builder,
@@ -1245,28 +2061,31 @@ lp_build_sample_soa(LLVMBuilderRef builder,
LLVMValueRef *texel)
{
struct lp_build_sample_context bld;
- LLVMValueRef width;
- LLVMValueRef height;
- LLVMValueRef stride;
- LLVMValueRef data_ptr;
+ LLVMValueRef width, width_vec;
+ LLVMValueRef height, height_vec;
+ LLVMValueRef depth, depth_vec;
+ LLVMValueRef row_stride_array, img_stride_array;
+ LLVMValueRef data_array;
LLVMValueRef s;
LLVMValueRef t;
LLVMValueRef r;
- (void) lp_build_lod_selector; /* temporary to silence warning */
- (void) lp_build_nearest_mip_level;
- (void) lp_build_linear_mip_levels;
-
/* Setup our build context */
memset(&bld, 0, sizeof bld);
bld.builder = builder;
bld.static_state = static_state;
bld.dynamic_state = dynamic_state;
bld.format_desc = util_format_description(static_state->format);
+
+ bld.float_type = lp_type_float(32);
+ bld.int_type = lp_type_int(32);
bld.coord_type = type;
bld.uint_coord_type = lp_uint_type(type);
bld.int_coord_type = lp_int_type(type);
bld.texel_type = type;
+
+ lp_build_context_init(&bld.float_bld, builder, bld.float_type);
+ lp_build_context_init(&bld.int_bld, builder, bld.int_type);
lp_build_context_init(&bld.coord_bld, builder, bld.coord_type);
lp_build_context_init(&bld.uint_coord_bld, builder, bld.uint_coord_type);
lp_build_context_init(&bld.int_coord_bld, builder, bld.int_coord_type);
@@ -1275,41 +2094,39 @@ lp_build_sample_soa(LLVMBuilderRef builder,
/* Get the dynamic state */
width = dynamic_state->width(dynamic_state, builder, unit);
height = dynamic_state->height(dynamic_state, builder, unit);
- stride = dynamic_state->stride(dynamic_state, builder, unit);
- data_ptr = dynamic_state->data_ptr(dynamic_state, builder, unit);
+ depth = dynamic_state->depth(dynamic_state, builder, unit);
+ row_stride_array = dynamic_state->row_stride(dynamic_state, builder, unit);
+ img_stride_array = dynamic_state->img_stride(dynamic_state, builder, unit);
+ data_array = dynamic_state->data_ptr(dynamic_state, builder, unit);
+ /* Note that data_array is an array[level] of pointers to texture images */
s = coords[0];
t = coords[1];
r = coords[2];
- width = lp_build_broadcast_scalar(&bld.uint_coord_bld, width);
- height = lp_build_broadcast_scalar(&bld.uint_coord_bld, height);
- stride = lp_build_broadcast_scalar(&bld.uint_coord_bld, stride);
-
- if(static_state->target == PIPE_TEXTURE_1D)
- t = bld.coord_bld.zero;
-
- switch (static_state->min_img_filter) {
- case PIPE_TEX_FILTER_NEAREST:
- lp_build_sample_2d_nearest_soa(&bld, s, t, width, height,
- stride, data_ptr, texel);
- break;
- case PIPE_TEX_FILTER_LINEAR:
- if(lp_format_is_rgba8(bld.format_desc) &&
- is_simple_wrap_mode(static_state->wrap_s) &&
- is_simple_wrap_mode(static_state->wrap_t))
- lp_build_sample_2d_linear_aos(&bld, s, t, width, height,
- stride, data_ptr, texel);
- else
- lp_build_sample_2d_linear_soa(&bld, s, t, width, height,
- stride, data_ptr, texel);
- break;
- default:
- assert(0);
+ width_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, width);
+ height_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, height);
+ depth_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, depth);
+
+ if (util_format_is_rgba8_variant(bld.format_desc) &&
+ static_state->target == PIPE_TEXTURE_2D &&
+ static_state->min_img_filter == PIPE_TEX_FILTER_LINEAR &&
+ static_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR &&
+ static_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
+ is_simple_wrap_mode(static_state->wrap_s) &&
+ is_simple_wrap_mode(static_state->wrap_t)) {
+ /* special case */
+ lp_build_sample_2d_linear_aos(&bld, s, t, width_vec, height_vec,
+ row_stride_array, data_array, texel);
+ }
+ else {
+ lp_build_sample_general(&bld, unit, s, t, r, lodbias,
+ width, height, depth,
+ width_vec, height_vec, depth_vec,
+ row_stride_array, img_stride_array,
+ data_array,
+ texel);
}
-
- /* FIXME: respect static_state->min_mip_filter */;
- /* FIXME: respect static_state->mag_img_filter */;
lp_build_sample_compare(&bld, r, texel);
}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_struct.h b/src/gallium/auxiliary/gallivm/lp_bld_struct.h
index 740392f561..147336edb4 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_struct.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_struct.h
@@ -37,7 +37,7 @@
#define LP_BLD_STRUCT_H
-#include <llvm-c/Core.h>
+#include "gallivm/lp_bld.h"
#include <llvm-c/Target.h>
#include "util/u_debug.h"
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
index 64e81f7b1f..278c838eac 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
@@ -144,9 +144,9 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
#endif
if(shift > 0)
- tmp = LLVMBuildLShr(bld->builder, a, lp_build_int_const_scalar(type4, shift*type.width), "");
+ tmp = LLVMBuildLShr(bld->builder, a, lp_build_const_int_vec(type4, shift*type.width), "");
if(shift < 0)
- tmp = LLVMBuildShl(bld->builder, a, lp_build_int_const_scalar(type4, -shift*type.width), "");
+ tmp = LLVMBuildShl(bld->builder, a, lp_build_const_int_vec(type4, -shift*type.width), "");
assert(tmp);
if(tmp)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
index b9472127a6..138ca620e6 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
@@ -37,7 +37,7 @@
#define LP_BLD_SWIZZLE_H
-#include <llvm-c/Core.h>
+#include "gallivm/lp_bld.h"
struct lp_type;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
index eddb7a83fa..2eac5da6c6 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -35,10 +35,11 @@
#ifndef LP_BLD_TGSI_H
#define LP_BLD_TGSI_H
-#include <llvm-c/Core.h>
+#include "gallivm/lp_bld.h"
struct tgsi_token;
+struct tgsi_shader_info;
struct lp_type;
struct lp_build_context;
struct lp_build_mask_context;
@@ -78,7 +79,8 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
const LLVMValueRef *pos,
const LLVMValueRef (*inputs)[4],
LLVMValueRef (*outputs)[4],
- struct lp_build_sampler_soa *sampler);
+ struct lp_build_sampler_soa *sampler,
+ struct tgsi_shader_info *info);
#endif /* LP_BLD_TGSI_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 5f2c2a54ee..d3c769e28b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -41,10 +41,12 @@
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_memory.h"
+#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_info.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_util.h"
#include "tgsi/tgsi_exec.h"
+#include "tgsi/tgsi_scan.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_arit.h"
@@ -95,6 +97,19 @@ struct lp_exec_mask {
int cond_stack_size;
LLVMValueRef cond_mask;
+ LLVMValueRef break_stack[LP_TGSI_MAX_NESTING];
+ int break_stack_size;
+ LLVMValueRef break_mask;
+
+ LLVMValueRef cont_stack[LP_TGSI_MAX_NESTING];
+ int cont_stack_size;
+ LLVMValueRef cont_mask;
+
+ LLVMBasicBlockRef loop_stack[LP_TGSI_MAX_NESTING];
+ int loop_stack_size;
+ LLVMBasicBlockRef loop_block;
+
+
LLVMValueRef exec_mask;
};
@@ -111,6 +126,12 @@ struct lp_build_tgsi_soa_context
LLVMValueRef immediates[LP_MAX_IMMEDIATES][NUM_CHANNELS];
LLVMValueRef temps[LP_MAX_TEMPS][NUM_CHANNELS];
+ LLVMValueRef addr[LP_MAX_TEMPS][NUM_CHANNELS];
+
+   /* If the shader uses indirect addressing we allocate a flat array of
+    * temps instead, and the temps[][] member above goes unused. */
+ LLVMValueRef temps_array;
+ boolean has_indirect_addressing;
struct lp_build_mask_context *mask;
struct lp_exec_mask exec_mask;
@@ -145,15 +166,33 @@ static void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context
mask->bld = bld;
mask->has_mask = FALSE;
mask->cond_stack_size = 0;
+ mask->loop_stack_size = 0;
+ mask->break_stack_size = 0;
+ mask->cont_stack_size = 0;
mask->int_vec_type = lp_build_int_vec_type(mask->bld->type);
}
static void lp_exec_mask_update(struct lp_exec_mask *mask)
{
- mask->exec_mask = mask->cond_mask;
- if (mask->cond_stack_size > 0)
- mask->has_mask = TRUE;
+ if (mask->loop_stack_size) {
+      /* For loops we need to update the entire mask at runtime */
+ LLVMValueRef tmp;
+ assert(mask->break_mask);
+ tmp = LLVMBuildAnd(mask->bld->builder,
+ mask->cont_mask,
+ mask->break_mask,
+ "maskcb");
+ mask->exec_mask = LLVMBuildAnd(mask->bld->builder,
+ mask->cond_mask,
+ tmp,
+ "maskfull");
+ } else
+ mask->exec_mask = mask->cond_mask;
+
+
+ mask->has_mask = (mask->cond_stack_size > 0 ||
+ mask->loop_stack_size > 0);
}
static void lp_exec_mask_cond_push(struct lp_exec_mask *mask,
@@ -190,6 +229,104 @@ static void lp_exec_mask_cond_pop(struct lp_exec_mask *mask)
lp_exec_mask_update(mask);
}
+static void lp_exec_bgnloop(struct lp_exec_mask *mask)
+{
+
+ if (mask->cont_stack_size == 0)
+ mask->cont_mask = LLVMConstAllOnes(mask->int_vec_type);
+ if (mask->break_stack_size == 0)
+ mask->break_mask = LLVMConstAllOnes(mask->int_vec_type);
+ if (mask->cond_stack_size == 0)
+ mask->cond_mask = LLVMConstAllOnes(mask->int_vec_type);
+
+ mask->break_stack[mask->break_stack_size++] = mask->break_mask;
+ mask->cont_stack[mask->cont_stack_size++] = mask->cont_mask;
+ mask->loop_stack[mask->loop_stack_size++] = mask->loop_block;
+ mask->loop_block = lp_build_insert_new_block(mask->bld->builder, "bgnloop");
+ LLVMBuildBr(mask->bld->builder, mask->loop_block);
+ LLVMPositionBuilderAtEnd(mask->bld->builder, mask->loop_block);
+
+ lp_exec_mask_update(mask);
+}
+
+static void lp_exec_break(struct lp_exec_mask *mask)
+{
+ LLVMValueRef exec_mask = LLVMBuildNot(mask->bld->builder,
+ mask->exec_mask,
+ "break");
+
+   /* mask->break_stack_size > 1 implies that we already encountered a
+    * break statement; in that case we want to make sure our mask is a
+    * combination of the previous break mask and the current execution
+    * mask. */
+ if (mask->break_stack_size > 1) {
+ mask->break_mask = LLVMBuildAnd(mask->bld->builder,
+ mask->break_mask,
+ exec_mask, "break_full");
+ } else
+ mask->break_mask = exec_mask;
+
+ lp_exec_mask_update(mask);
+}
+
+static void lp_exec_continue(struct lp_exec_mask *mask)
+{
+ LLVMValueRef exec_mask = LLVMBuildNot(mask->bld->builder,
+ mask->exec_mask,
+ "");
+
+ if (mask->cont_stack_size > 1) {
+ mask->cont_mask = LLVMBuildAnd(mask->bld->builder,
+ mask->cont_mask,
+ exec_mask, "");
+ } else
+ mask->cont_mask = exec_mask;
+
+ lp_exec_mask_update(mask);
+}
+
+
+static void lp_exec_endloop(struct lp_exec_mask *mask)
+{
+ LLVMBasicBlockRef endloop;
+ LLVMTypeRef reg_type = LLVMIntType(mask->bld->type.width*
+ mask->bld->type.length);
+ LLVMValueRef i1cond;
+
+ assert(mask->break_mask);
+
+   /* i1cond = (break_mask != 0), i.e. some lane is still active */
+ i1cond = LLVMBuildICmp(
+ mask->bld->builder,
+ LLVMIntNE,
+ LLVMBuildBitCast(mask->bld->builder, mask->break_mask, reg_type, ""),
+ LLVMConstNull(reg_type), "");
+
+ endloop = lp_build_insert_new_block(mask->bld->builder, "endloop");
+
+ LLVMBuildCondBr(mask->bld->builder,
+ i1cond, mask->loop_block, endloop);
+
+ LLVMPositionBuilderAtEnd(mask->bld->builder, endloop);
+
+ mask->loop_block = mask->loop_stack[--mask->loop_stack_size];
+ /* pop the cont mask */
+ if (mask->cont_stack_size) {
+ mask->cont_mask = mask->cont_stack[--mask->cont_stack_size];
+ }
+ /* pop the break mask */
+ if (mask->break_stack_size) {
+ mask->break_mask = mask->break_stack[--mask->break_stack_size];
+ }
+
+ lp_exec_mask_update(mask);
+}
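Taken together, the loop helpers above maintain three per-lane masks. A per-lane sketch of the semantics within a single, non-nested loop (names are illustrative, not part of the patch):

   struct lane_masks { uint32_t cond, brk, cont, exec; };

   /* lp_exec_mask_update(): inside a loop the live mask is the AND of all three */
   static uint32_t
   update_exec(struct lane_masks *m)
   {
      return m->exec = m->cond & m->cont & m->brk;
   }
   /* BRK:     brk  &= ~exec   -- active lanes leave the loop for good        */
   /* CONT:    cont &= ~exec   -- active lanes skip the rest of the iteration */
   /* ENDLOOP: branch back to the loop header while brk != 0 in any lane      */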
+
+/* Stores val into the address pointed to by dst.
+ * mask->exec_mask is used to figure out which bits of val
+ * should be stored into the address
+ * (0 means don't store this bit, 1 means do store).
+ */
static void lp_exec_mask_store(struct lp_exec_mask *mask,
LLVMValueRef val,
LLVMValueRef dst)
@@ -227,6 +364,23 @@ emit_ddy(struct lp_build_tgsi_soa_context *bld,
return lp_build_sub(&bld->base, src_top, src_bottom);
}
+static LLVMValueRef
+get_temp_ptr(struct lp_build_tgsi_soa_context *bld,
+ unsigned index,
+ unsigned swizzle,
+ boolean is_indirect,
+ LLVMValueRef addr)
+{
+ if (!bld->has_indirect_addressing) {
+ return bld->temps[index][swizzle];
+ } else {
+ LLVMValueRef lindex =
+ LLVMConstInt(LLVMInt32Type(), index*4 + swizzle, 0);
+ if (is_indirect)
+ lindex = lp_build_add(&bld->base, lindex, addr);
+ return LLVMBuildGEP(bld->base.builder, bld->temps_array, &lindex, 1, "");
+ }
+}
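With indirect addressing enabled, all temporaries live in one flat array and get_temp_ptr() indexes it as index*4 + swizzle, plus the ARL/ARR-derived offset that emit_fetch()/emit_store() have already multiplied by 4. A scalar sketch of the index computation (hypothetical helper, illustrative only):

   static unsigned
   temp_flat_index(unsigned index, unsigned swizzle,
                   boolean is_indirect, unsigned addr_times_4)
   {
      unsigned i = index * 4 + swizzle;
      if (is_indirect)
         i += addr_times_4;   /* relative offset, pre-scaled by 4 */
      return i;
   }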
/**
* Register fetch.
@@ -241,6 +395,7 @@ emit_fetch(
const struct tgsi_full_src_register *reg = &inst->Src[index];
unsigned swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
LLVMValueRef res;
+ LLVMValueRef addr;
switch (swizzle) {
case TGSI_SWIZZLE_X:
@@ -248,11 +403,34 @@ emit_fetch(
case TGSI_SWIZZLE_Z:
case TGSI_SWIZZLE_W:
+ if (reg->Register.Indirect) {
+ LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->base.type);
+ unsigned swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, chan_index );
+ addr = LLVMBuildLoad(bld->base.builder,
+ bld->addr[reg->Indirect.Index][swizzle],
+ "");
+ /* for indexing we want integers */
+ addr = LLVMBuildFPToSI(bld->base.builder, addr,
+ int_vec_type, "");
+ addr = LLVMBuildExtractElement(bld->base.builder,
+ addr, LLVMConstInt(LLVMInt32Type(), 0, 0),
+ "");
+ addr = lp_build_mul(&bld->base, addr, LLVMConstInt(LLVMInt32Type(), 4, 0));
+ }
+
switch (reg->Register.File) {
case TGSI_FILE_CONSTANT: {
LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), reg->Register.Index*4 + swizzle, 0);
- LLVMValueRef scalar_ptr = LLVMBuildGEP(bld->base.builder, bld->consts_ptr, &index, 1, "");
- LLVMValueRef scalar = LLVMBuildLoad(bld->base.builder, scalar_ptr, "");
+ LLVMValueRef scalar, scalar_ptr;
+
+ if (reg->Register.Indirect) {
+ /*lp_build_printf(bld->base.builder,
+ "\taddr = %d\n", addr);*/
+ index = lp_build_add(&bld->base, index, addr);
+ }
+ scalar_ptr = LLVMBuildGEP(bld->base.builder, bld->consts_ptr, &index, 1, "");
+ scalar = LLVMBuildLoad(bld->base.builder, scalar_ptr, "");
+
res = lp_build_broadcast_scalar(&bld->base, scalar);
break;
}
@@ -267,11 +445,16 @@ emit_fetch(
assert(res);
break;
- case TGSI_FILE_TEMPORARY:
- res = LLVMBuildLoad(bld->base.builder, bld->temps[reg->Register.Index][swizzle], "");
+ case TGSI_FILE_TEMPORARY: {
+ LLVMValueRef temp_ptr = get_temp_ptr(bld, reg->Register.Index,
+ swizzle,
+ reg->Register.Indirect,
+ addr);
+ res = LLVMBuildLoad(bld->base.builder, temp_ptr, "");
if(!res)
return bld->base.undef;
break;
+ }
default:
assert( 0 );
@@ -349,6 +532,7 @@ emit_store(
LLVMValueRef value)
{
const struct tgsi_full_dst_register *reg = &inst->Dst[index];
+ LLVMValueRef addr;
switch( inst->Instruction.Saturate ) {
case TGSI_SAT_NONE:
@@ -360,7 +544,7 @@ emit_store(
break;
case TGSI_SAT_MINUS_PLUS_ONE:
- value = lp_build_max(&bld->base, value, lp_build_const_scalar(bld->base.type, -1.0));
+ value = lp_build_max(&bld->base, value, lp_build_const_vec(bld->base.type, -1.0));
value = lp_build_min(&bld->base, value, bld->base.one);
break;
@@ -368,18 +552,42 @@ emit_store(
assert(0);
}
+ if (reg->Register.Indirect) {
+ LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->base.type);
+ unsigned swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, chan_index );
+ addr = LLVMBuildLoad(bld->base.builder,
+ bld->addr[reg->Indirect.Index][swizzle],
+ "");
+ /* for indexing we want integers */
+ addr = LLVMBuildFPToSI(bld->base.builder, addr,
+ int_vec_type, "");
+ addr = LLVMBuildExtractElement(bld->base.builder,
+ addr, LLVMConstInt(LLVMInt32Type(), 0, 0),
+ "");
+ addr = lp_build_mul(&bld->base, addr, LLVMConstInt(LLVMInt32Type(), 4, 0));
+ }
+
switch( reg->Register.File ) {
case TGSI_FILE_OUTPUT:
lp_exec_mask_store(&bld->exec_mask, value,
bld->outputs[reg->Register.Index][chan_index]);
break;
- case TGSI_FILE_TEMPORARY:
- lp_exec_mask_store(&bld->exec_mask, value,
- bld->temps[reg->Register.Index][chan_index]);
+ case TGSI_FILE_TEMPORARY: {
+ LLVMValueRef temp_ptr = get_temp_ptr(bld, reg->Register.Index,
+ chan_index,
+ reg->Register.Indirect,
+ addr);
+ lp_exec_mask_store(&bld->exec_mask, value, temp_ptr);
break;
+ }
case TGSI_FILE_ADDRESS:
+ lp_exec_mask_store(&bld->exec_mask, value,
+ bld->addr[reg->Indirect.Index][chan_index]);
+ break;
+
+ case TGSI_FILE_PREDICATE:
/* FIXME */
assert(0);
break;
@@ -456,6 +664,9 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
}
+/**
+ * Kill fragment if any of the src register values are negative.
+ */
static void
emit_kil(
struct lp_build_tgsi_soa_context *bld,
@@ -486,6 +697,9 @@ emit_kil(
if(terms[chan_index]) {
LLVMValueRef chan_mask;
+ /*
+ * If term < 0 then mask = 0 else mask = ~0.
+ */
chan_mask = lp_build_cmp(&bld->base, PIPE_FUNC_GEQUAL, terms[chan_index], bld->base.zero);
if(mask)
@@ -501,26 +715,28 @@ emit_kil(
/**
- * Check if inst src/dest regs use indirect addressing into temporary
- * register file.
+ * Predicated fragment kill.
+ * XXX Actually, we do an unconditional kill (as in tgsi_exec.c).
+ * The only predication is the execution mask which will apply if
+ * we're inside a loop or conditional.
*/
-static boolean
-indirect_temp_reference(const struct tgsi_full_instruction *inst)
+static void
+emit_kilp(struct lp_build_tgsi_soa_context *bld,
+ const struct tgsi_full_instruction *inst)
{
- uint i;
- for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
- const struct tgsi_full_src_register *reg = &inst->Src[i];
- if (reg->Register.File == TGSI_FILE_TEMPORARY &&
- reg->Register.Indirect)
- return TRUE;
+ LLVMValueRef mask;
+
+ /* For those channels which are "alive", disable fragment shader
+ * execution.
+ */
+ if (bld->exec_mask.has_mask) {
+ mask = LLVMBuildNot(bld->base.builder, bld->exec_mask.exec_mask, "kilp");
}
- for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
- const struct tgsi_full_dst_register *reg = &inst->Dst[i];
- if (reg->Register.File == TGSI_FILE_TEMPORARY &&
- reg->Register.Indirect)
- return TRUE;
+ else {
+ mask = bld->base.zero;
}
- return FALSE;
+
+ lp_build_mask_update(bld->mask, mask);
}
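In per-lane terms, KILP kills exactly the lanes that are currently executing (or everything, when there is no execution mask), by ANDing the inverted mask into the fragment live mask (illustrative sketch, variable names are not from the patch):

   /* kill_mask: 1 = keep lane alive, 0 = kill it */
   uint32_t kill_mask = has_mask ? ~exec_mask : 0;
   frag_live_mask &= kill_mask;   /* what lp_build_mask_update() amounts to */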
static int
@@ -528,39 +744,54 @@ emit_declaration(
struct lp_build_tgsi_soa_context *bld,
const struct tgsi_full_declaration *decl)
{
+ LLVMTypeRef vec_type = lp_build_vec_type(bld->base.type);
+
unsigned first = decl->Range.First;
unsigned last = decl->Range.Last;
unsigned idx, i;
for (idx = first; idx <= last; ++idx) {
- boolean ok;
-
switch (decl->Declaration.File) {
case TGSI_FILE_TEMPORARY:
- for (i = 0; i < NUM_CHANNELS; i++)
- bld->temps[idx][i] = lp_build_alloca(&bld->base);
- ok = TRUE;
+ if (bld->has_indirect_addressing) {
+ LLVMValueRef val = LLVMConstInt(LLVMInt32Type(),
+ last*4 + 4, 0);
+ bld->temps_array = lp_build_array_alloca(bld->base.builder,
+ vec_type, val, "");
+ } else {
+ for (i = 0; i < NUM_CHANNELS; i++)
+ bld->temps[idx][i] = lp_build_alloca(bld->base.builder,
+ vec_type, "");
+ }
break;
case TGSI_FILE_OUTPUT:
for (i = 0; i < NUM_CHANNELS; i++)
- bld->outputs[idx][i] = lp_build_alloca(&bld->base);
- ok = TRUE;
+ bld->outputs[idx][i] = lp_build_alloca(bld->base.builder,
+ vec_type, "");
+ break;
+
+ case TGSI_FILE_ADDRESS:
+ for (i = 0; i < NUM_CHANNELS; i++)
+ bld->addr[idx][i] = lp_build_alloca(bld->base.builder,
+ vec_type, "");
break;
default:
/* don't need to declare other vars */
- ok = TRUE;
+ break;
}
-
- if (!ok)
- return FALSE;
}
return TRUE;
}
-static int
+
+/**
+ * Emit LLVM for one TGSI instruction.
+ * \return TRUE for success, FALSE otherwise
+ */
+static boolean
emit_instruction(
struct lp_build_tgsi_soa_context *bld,
const struct tgsi_full_instruction *inst,
@@ -577,9 +808,16 @@ emit_instruction(
LLVMValueRef res;
LLVMValueRef dst0[NUM_CHANNELS];
- /* we can't handle indirect addressing into temp register file yet */
- if (indirect_temp_reference(inst))
- return FALSE;
+ /*
+ * Stores and write masks are handled in a general fashion after the long
+ * instruction opcode switch statement.
+ *
+    * Although not strictly necessary, we avoid generating instructions for
+    * channels which won't be stored, in cases where that's easy. For some
+ * complex instructions, like texture sampling, it is more convenient to
+ * assume a full writemask and then let LLVM optimization passes eliminate
+ * redundant code.
+ */
assert(info->num_dst <= 1);
if(info->num_dst) {
@@ -589,17 +827,13 @@ emit_instruction(
}
switch (inst->Instruction.Opcode) {
-#if 0
case TGSI_OPCODE_ARL:
- /* FIXME */
FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
tmp0 = emit_fetch( bld, inst, 0, chan_index );
- emit_flr(bld, 0, 0);
- emit_f2it( bld, 0 );
+ tmp0 = lp_build_floor(&bld->base, tmp0);
dst0[chan_index] = tmp0;
}
break;
-#endif
case TGSI_OPCODE_MOV:
FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
@@ -865,7 +1099,7 @@ emit_instruction(
src0 = emit_fetch( bld, inst, 0, chan_index );
src1 = emit_fetch( bld, inst, 1, chan_index );
src2 = emit_fetch( bld, inst, 2, chan_index );
- tmp1 = lp_build_const_scalar(bld->base.type, 0.5);
+ tmp1 = lp_build_const_vec(bld->base.type, 0.5);
tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GREATER, src2, tmp1);
dst0[chan_index] = lp_build_select( &bld->base, tmp0, src0, src1 );
}
@@ -997,7 +1231,7 @@ emit_instruction(
case TGSI_OPCODE_RCC:
/* deprecated? */
assert(0);
- return 0;
+ return FALSE;
case TGSI_OPCODE_DPH:
tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
@@ -1040,8 +1274,7 @@ emit_instruction(
case TGSI_OPCODE_KILP:
/* predicated kill */
- /* FIXME */
- return 0;
+ emit_kilp( bld, inst );
break;
case TGSI_OPCODE_KIL:
@@ -1050,23 +1283,23 @@ emit_instruction(
break;
case TGSI_OPCODE_PK2H:
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_PK2US:
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_PK4B:
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_PK4UB:
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_RFL:
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_SEQ:
@@ -1126,77 +1359,72 @@ emit_instruction(
break;
case TGSI_OPCODE_TEX:
- /* XXX what about dst0 writemask? */
emit_tex( bld, inst, FALSE, FALSE, dst0 );
break;
case TGSI_OPCODE_TXD:
/* FIXME */
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_UP2H:
/* deprecated */
assert (0);
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_UP2US:
/* deprecated */
assert(0);
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_UP4B:
/* deprecated */
assert(0);
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_UP4UB:
/* deprecated */
assert(0);
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_X2D:
/* deprecated? */
assert(0);
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_ARA:
/* deprecated */
assert(0);
- return 0;
+ return FALSE;
break;
-#if 0
case TGSI_OPCODE_ARR:
- /* FIXME */
FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
tmp0 = emit_fetch( bld, inst, 0, chan_index );
- emit_rnd( bld, 0, 0 );
- emit_f2it( bld, 0 );
+ tmp0 = lp_build_round(&bld->base, tmp0);
dst0[chan_index] = tmp0;
}
break;
-#endif
case TGSI_OPCODE_BRA:
/* deprecated */
assert(0);
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_CAL:
/* FIXME */
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_RET:
/* FIXME */
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_END:
@@ -1326,7 +1554,7 @@ emit_instruction(
case TGSI_OPCODE_DIV:
/* deprecated */
assert( 0 );
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_DP2:
@@ -1349,27 +1577,20 @@ emit_instruction(
case TGSI_OPCODE_TXP:
emit_tex( bld, inst, FALSE, TRUE, dst0 );
break;
-
+
case TGSI_OPCODE_BRK:
- /* FIXME */
- return 0;
+ lp_exec_break(&bld->exec_mask);
break;
case TGSI_OPCODE_IF:
tmp0 = emit_fetch(bld, inst, 0, CHAN_X);
+ tmp0 = lp_build_cmp(&bld->base, PIPE_FUNC_NOTEQUAL,
+ tmp0, bld->base.zero);
lp_exec_mask_cond_push(&bld->exec_mask, tmp0);
break;
- case TGSI_OPCODE_BGNFOR:
- /* deprecated */
- assert(0);
- return 0;
- break;
-
- case TGSI_OPCODE_REP:
- /* deprecated */
- assert(0);
- return 0;
+ case TGSI_OPCODE_BGNLOOP:
+ lp_exec_bgnloop(&bld->exec_mask);
break;
case TGSI_OPCODE_ELSE:
@@ -1380,28 +1601,20 @@ emit_instruction(
lp_exec_mask_cond_pop(&bld->exec_mask);
break;
- case TGSI_OPCODE_ENDFOR:
- /* deprecated */
- assert(0);
- return 0;
- break;
-
- case TGSI_OPCODE_ENDREP:
- /* deprecated */
- assert(0);
- return 0;
+ case TGSI_OPCODE_ENDLOOP:
+ lp_exec_endloop(&bld->exec_mask);
break;
case TGSI_OPCODE_PUSHA:
/* deprecated? */
assert(0);
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_POPA:
/* deprecated? */
assert(0);
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_CEIL:
@@ -1414,13 +1627,13 @@ emit_instruction(
case TGSI_OPCODE_I2F:
/* deprecated? */
assert(0);
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_NOT:
/* deprecated? */
assert(0);
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_TRUNC:
@@ -1433,75 +1646,74 @@ emit_instruction(
case TGSI_OPCODE_SHL:
/* deprecated? */
assert(0);
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_ISHR:
/* deprecated? */
assert(0);
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_AND:
/* deprecated? */
assert(0);
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_OR:
/* deprecated? */
assert(0);
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_MOD:
/* deprecated? */
assert(0);
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_XOR:
/* deprecated? */
assert(0);
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_SAD:
/* deprecated? */
assert(0);
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_TXF:
/* deprecated? */
assert(0);
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_TXQ:
/* deprecated? */
assert(0);
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_CONT:
- /* FIXME */
- return 0;
+ lp_exec_continue(&bld->exec_mask);
break;
case TGSI_OPCODE_EMIT:
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_ENDPRIM:
- return 0;
+ return FALSE;
break;
case TGSI_OPCODE_NOP:
break;
default:
- return 0;
+ return FALSE;
}
if(info->num_dst) {
@@ -1510,7 +1722,7 @@ emit_instruction(
}
}
- return 1;
+ return TRUE;
}
@@ -1523,7 +1735,8 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
const LLVMValueRef *pos,
const LLVMValueRef (*inputs)[NUM_CHANNELS],
LLVMValueRef (*outputs)[NUM_CHANNELS],
- struct lp_build_sampler_soa *sampler)
+ struct lp_build_sampler_soa *sampler,
+ struct tgsi_shader_info *info)
{
struct lp_build_tgsi_soa_context bld;
struct tgsi_parse_context parse;
@@ -1539,6 +1752,8 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
bld.outputs = outputs;
bld.consts_ptr = consts_ptr;
bld.sampler = sampler;
+ bld.has_indirect_addressing = info->opcode_count[TGSI_OPCODE_ARR] > 0 ||
+ info->opcode_count[TGSI_OPCODE_ARL] > 0;
lp_exec_mask_init(&bld.exec_mask, &bld.base);
@@ -1559,10 +1774,10 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
case TGSI_TOKEN_TYPE_INSTRUCTION:
{
unsigned opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
- const struct tgsi_opcode_info *info = tgsi_get_opcode_info(opcode);
- if (!emit_instruction( &bld, &parse.FullToken.FullInstruction, info ))
+ const struct tgsi_opcode_info *opcode_info = tgsi_get_opcode_info(opcode);
+ if (!emit_instruction( &bld, &parse.FullToken.FullInstruction, opcode_info ))
_debug_printf("warning: failed to translate tgsi opcode %s to LLVM\n",
- info ? info->mnemonic : "<invalid>");
+ opcode_info->mnemonic);
}
break;
@@ -1575,7 +1790,7 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
assert(num_immediates < LP_MAX_IMMEDIATES);
for( i = 0; i < size; ++i )
bld.immediates[num_immediates][i] =
- lp_build_const_scalar(type, parse.FullToken.FullImmediate.u[i].Float);
+ lp_build_const_vec(type, parse.FullToken.FullImmediate.u[i].Float);
for( i = size; i < 4; ++i )
bld.immediates[num_immediates][i] = bld.base.undef;
num_immediates++;
@@ -1589,7 +1804,14 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
assert( 0 );
}
}
-
+ if (0) {
+ LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
+ LLVMValueRef function = LLVMGetBasicBlockParent(block);
+ debug_printf("11111111111111111111111111111 \n");
+ tgsi_dump(tokens, 0);
+ LLVMDumpValue(function);
+ debug_printf("2222222222222222222222222222 \n");
+ }
tgsi_parse_free( &parse );
}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.c b/src/gallium/auxiliary/gallivm/lp_bld_type.c
index c327ba045a..796af88caa 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.c
@@ -58,7 +58,10 @@ LLVMTypeRef
lp_build_vec_type(struct lp_type type)
{
LLVMTypeRef elem_type = lp_build_elem_type(type);
- return LLVMVectorType(elem_type, type.length);
+ if (type.length == 1)
+ return elem_type;
+ else
+ return LLVMVectorType(elem_type, type.length);
}
@@ -115,6 +118,9 @@ lp_check_vec_type(struct lp_type type, LLVMTypeRef vec_type)
if(!vec_type)
return FALSE;
+ if (type.length == 1)
+ return lp_check_elem_type(type, vec_type);
+
if(LLVMGetTypeKind(vec_type) != LLVMVectorTypeKind)
return FALSE;
@@ -153,7 +159,10 @@ LLVMTypeRef
lp_build_int_vec_type(struct lp_type type)
{
LLVMTypeRef elem_type = lp_build_int_elem_type(type);
- return LLVMVectorType(elem_type, type.length);
+ if (type.length == 1)
+ return elem_type;
+ else
+ return LLVMVectorType(elem_type, type.length);
}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.h b/src/gallium/auxiliary/gallivm/lp_bld_type.h
index 16946cc28a..cd59d2faa6 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.h
@@ -37,9 +37,9 @@
#define LP_BLD_TYPE_H
-#include <llvm-c/Core.h>
+#include "pipe/p_compiler.h"
+#include "gallivm/lp_bld.h"
-#include <pipe/p_compiler.h>
/**
@@ -103,7 +103,7 @@ struct lp_type {
unsigned width:14;
/**
- * Vector length.
+ * Vector length. If length==1, this is a scalar (float/int) type.
*
* width*length should be a power of two greater or equal to eight.
*
@@ -139,6 +139,7 @@ struct lp_build_context
};
+/** Create scalar float type */
static INLINE struct lp_type
lp_type_float(unsigned width)
{
@@ -148,12 +149,29 @@ lp_type_float(unsigned width)
res_type.floating = TRUE;
res_type.sign = TRUE;
res_type.width = width;
+ res_type.length = 1;
+
+ return res_type;
+}
+
+
+/** Create vector of float type */
+static INLINE struct lp_type
+lp_type_float_vec(unsigned width)
+{
+ struct lp_type res_type;
+
+ memset(&res_type, 0, sizeof res_type);
+ res_type.floating = TRUE;
+ res_type.sign = TRUE;
+ res_type.width = width;
res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
return res_type;
}
+/** Create scalar int type */
static INLINE struct lp_type
lp_type_int(unsigned width)
{
@@ -162,12 +180,28 @@ lp_type_int(unsigned width)
memset(&res_type, 0, sizeof res_type);
res_type.sign = TRUE;
res_type.width = width;
+ res_type.length = 1;
+
+ return res_type;
+}
+
+
+/** Create vector int type */
+static INLINE struct lp_type
+lp_type_int_vec(unsigned width)
+{
+ struct lp_type res_type;
+
+ memset(&res_type, 0, sizeof res_type);
+ res_type.sign = TRUE;
+ res_type.width = width;
res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
return res_type;
}
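With the scalar/vector split introduced here, lp_type_float(32), lp_type_int(32) and lp_type_uint(32) now describe single scalars while the new *_vec variants keep the previous vector meaning. A minimal usage sketch in the style of the sampling code above (illustrative only):

   struct lp_build_context float_bld;   /* scalar float ops, e.g. the per-quad LOD */
   struct lp_build_context coord_bld;   /* per-pixel vector ops                    */

   lp_build_context_init(&float_bld, builder, lp_type_float(32));
   lp_build_context_init(&coord_bld, builder, lp_type_float_vec(32));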
+/** Create scalar uint type */
static INLINE struct lp_type
lp_type_uint(unsigned width)
{
@@ -175,6 +209,20 @@ lp_type_uint(unsigned width)
memset(&res_type, 0, sizeof res_type);
res_type.width = width;
+ res_type.length = 1;
+
+ return res_type;
+}
+
+
+/** Create vector uint type */
+static INLINE struct lp_type
+lp_type_uint_vec(unsigned width)
+{
+ struct lp_type res_type;
+
+ memset(&res_type, 0, sizeof res_type);
+ res_type.width = width;
res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
return res_type;