33 files changed, 2345 insertions, 580 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld.h b/src/gallium/auxiliary/gallivm/lp_bld.h
new file mode 100644
index 0000000000..70a4960f91
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld.h
@@ -0,0 +1,47 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Wrapper for LLVM header file #includes.
+ */
+
+
+#ifndef LP_BLD_H
+#define LP_BLD_H
+
+
+#include <llvm-c/Core.h>  
+
+
+/** Set version to 0x0207 if missing to avoid #ifdef HAVE_LLVM everywhere */
+#ifndef HAVE_LLVM
+#define HAVE_LLVM 0x0207
+#endif
+
+
+#endif /* LP_BLD_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_alpha.h b/src/gallium/auxiliary/gallivm/lp_bld_alpha.h
index 634575670d..0f99fec65e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_alpha.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_alpha.h
@@ -35,7 +35,7 @@
 #define LP_BLD_ALPHA_H
 
 
-#include <llvm-c/Core.h>  
+#include "gallivm/lp_bld.h"
 
 struct pipe_alpha_state;
 struct lp_type;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 32f9e5201c..8e8fcccf56 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -232,6 +232,37 @@ lp_build_add(struct lp_build_context *bld,
 }
 
 
+/** Return the sum of the elements of vector a, as a scalar element value */
+LLVMValueRef
+lp_build_sum_vector(struct lp_build_context *bld,
+                    LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMValueRef index, res;
+   int i;
+
+   /* The sum is a single element: shortcuts must be element-typed too */
+   if (a == bld->zero)
+      return LLVMConstNull(lp_build_elem_type(type));
+   if (a == bld->undef)
+      return LLVMGetUndef(lp_build_elem_type(type));
+   assert(type.length > 1);
+   assert(!bld->type.norm);
+
+   /* Start from element 0, then add elements 1..length-1 in order */
+   index = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   res = LLVMBuildExtractElement(bld->builder, a, index, "");
+   for (i = 1; i < type.length; i++) {
+      index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      res = LLVMBuildAdd(bld->builder, res,
+                         LLVMBuildExtractElement(bld->builder, a, index, ""),
+                         "");
+   }
+
+   return res;
+}
+
+
 /**
  * Generate a - b
  */
@@ -330,12 +361,12 @@ lp_build_mul_u8n(LLVMBuilderRef builder,
    LLVMValueRef c8;
    LLVMValueRef ab;
 
-   c8 = lp_build_int_const_scalar(i16_type, 8);
+   c8 = lp_build_const_int_vec(i16_type, 8);
    
 #if 0
    
    /* a*b/255 ~= (a*(b + 1)) >> 256 */
-   b = LLVMBuildAdd(builder, b, lp_build_int_const_scalar(i16_type, 1), "");
+   b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(i16_type, 1), "");
    ab = LLVMBuildMul(builder, a, b, "");
 
 #else
@@ -343,7 +374,7 @@ lp_build_mul_u8n(LLVMBuilderRef builder,
    /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
    ab = LLVMBuildMul(builder, a, b, "");
    ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
-   ab = LLVMBuildAdd(builder, ab, lp_build_int_const_scalar(i16_type, 0x80), "");
+   ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(i16_type, 0x80), "");
 
 #endif
    
@@ -398,7 +429,7 @@ lp_build_mul(struct lp_build_context *bld,
    }
 
    if(type.fixed)
-      shift = lp_build_int_const_scalar(type, type.width/2);
+      shift = lp_build_const_int_vec(type, type.width/2);
    else
       shift = NULL;
 
@@ -460,7 +491,7 @@ lp_build_mul_imm(struct lp_build_context *bld,
           * for Inf and NaN.
           */
          unsigned mantissa = lp_mantissa(bld->type);
-         factor = lp_build_int_const_scalar(bld->type, (unsigned long long)shift << mantissa);
+         factor = lp_build_const_int_vec(bld->type, (unsigned long long)shift << mantissa);
          a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
          a = LLVMBuildAdd(bld->builder, a, factor, "");
          a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
@@ -468,12 +499,12 @@ lp_build_mul_imm(struct lp_build_context *bld,
 #endif
       }
       else {
-         factor = lp_build_const_scalar(bld->type, shift);
+         factor = lp_build_const_vec(bld->type, shift);
          return LLVMBuildShl(bld->builder, a, factor, "");
       }
    }
 
-   factor = lp_build_const_scalar(bld->type, (double)b);
+   factor = lp_build_const_vec(bld->type, (double)b);
    return lp_build_mul(bld, a, factor);
 }
 
@@ -536,7 +567,7 @@ lp_build_lerp(struct lp_build_context *bld,
        * but it will be wrong for other uses. Basically we need a more
        * powerful lp_type, capable of further distinguishing the values
        * interpretation from the value storage. */
-      res = LLVMBuildAnd(bld->builder, res, lp_build_int_const_scalar(bld->type, (1 << bld->type.width/2) - 1), "");
+      res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), "");
 
    return res;
 }
@@ -644,13 +675,26 @@ lp_build_abs(struct lp_build_context *bld,
 
    if(type.floating) {
       /* Mask out the sign bit */
-      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
-      unsigned long long absMask = ~(1ULL << (type.width - 1));
-      LLVMValueRef mask = lp_build_int_const_scalar(type, ((unsigned long long) absMask));
-      a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
-      a = LLVMBuildAnd(bld->builder, a, mask, "");
-      a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
-      return a;
+      if (type.length == 1) {
+         LLVMTypeRef int_type = LLVMIntType(type.width);
+         LLVMTypeRef float_type = LLVMFloatType();
+         unsigned long long absMask = ~(1ULL << (type.width - 1));
+         LLVMValueRef mask = LLVMConstInt(int_type, absMask, 0);
+         a = LLVMBuildBitCast(bld->builder, a, int_type, "");
+         a = LLVMBuildAnd(bld->builder, a, mask, "");
+         a = LLVMBuildBitCast(bld->builder, a, float_type, "");
+         return a;
+      }
+      else {
+         /* vector of floats */
+         LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+         unsigned long long absMask = ~(1ULL << (type.width - 1));
+         LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask));
+         a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+         a = LLVMBuildAnd(bld->builder, a, mask, "");
+         a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
+         return a;
+      }
    }
 
    if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
@@ -676,12 +720,12 @@ lp_build_negate(struct lp_build_context *bld,
 }
 
 
+/** Return -1, 0 or +1 depending on the sign of a */
 LLVMValueRef
 lp_build_sgn(struct lp_build_context *bld,
              LLVMValueRef a)
 {
    const struct lp_type type = bld->type;
-   LLVMTypeRef vec_type = lp_build_vec_type(type);
    LLVMValueRef cond;
    LLVMValueRef res;
 
@@ -691,27 +735,42 @@ lp_build_sgn(struct lp_build_context *bld,
       res = bld->one;
    }
    else if(type.floating) {
-      /* Take the sign bit and add it to 1 constant */
-      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
-      LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+      LLVMTypeRef vec_type;
+      LLVMTypeRef int_type;
+      LLVMValueRef mask;
       LLVMValueRef sign;
       LLVMValueRef one;
-      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
+
+      if (type.length == 1) {
+         int_type = lp_build_int_elem_type(type);
+         vec_type = lp_build_elem_type(type);
+         mask = LLVMConstInt(int_type, maskBit, 0);
+      }
+      else {
+         /* vector */
+         int_type = lp_build_int_vec_type(type);
+         vec_type = lp_build_vec_type(type);
+         mask = lp_build_const_int_vec(type, maskBit);
+      }
+
+      /* Take the sign bit and add it to 1 constant */
+      sign = LLVMBuildBitCast(bld->builder, a, int_type, "");
       sign = LLVMBuildAnd(bld->builder, sign, mask, "");
-      one = LLVMConstBitCast(bld->one, int_vec_type);
+      one = LLVMConstBitCast(bld->one, int_type);
       res = LLVMBuildOr(bld->builder, sign, one, "");
       res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
    }
    else
    {
-      LLVMValueRef minus_one = lp_build_const_scalar(type, -1.0);
+      LLVMValueRef minus_one = lp_build_const_vec(type, -1.0);
       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
       res = lp_build_select(bld, cond, bld->one, minus_one);
    }
 
    /* Handle zero */
    cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
-   res = lp_build_select(bld, cond, bld->zero, bld->one);
+   res = lp_build_select(bld, cond, bld->zero, res);
 
    return res;
 }
@@ -730,8 +789,8 @@ lp_build_set_sign(struct lp_build_context *bld,
    const struct lp_type type = bld->type;
    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
    LLVMTypeRef vec_type = lp_build_vec_type(type);
-   LLVMValueRef shift = lp_build_int_const_scalar(type, type.width - 1);
-   LLVMValueRef mask = lp_build_int_const_scalar(type,
+   LLVMValueRef shift = lp_build_const_int_vec(type, type.width - 1);
+   LLVMValueRef mask = lp_build_const_int_vec(type,
                              ~((unsigned long long) 1 << (type.width - 1)));
    LLVMValueRef val, res;
 
@@ -753,7 +812,7 @@ lp_build_set_sign(struct lp_build_context *bld,
 
 
 /**
- * Convert vector of int to vector of float.
+ * Convert vector of (or scalar) int to vector of (or scalar) float.
  */
 LLVMValueRef
 lp_build_int_to_float(struct lp_build_context *bld,
@@ -764,7 +823,11 @@ lp_build_int_to_float(struct lp_build_context *bld,
    assert(type.floating);
    /*assert(lp_check_value(type, a));*/
 
-   {
+   if (type.length == 1) {
+      LLVMTypeRef float_type = LLVMFloatType();
+      return LLVMBuildSIToFP(bld->builder, a, float_type, "");
+   }
+   else {
       LLVMTypeRef vec_type = lp_build_vec_type(type);
       /*LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);*/
       LLVMValueRef res;
@@ -866,6 +929,13 @@ lp_build_floor(struct lp_build_context *bld,
 
    assert(type.floating);
 
+   if (type.length == 1) {
+      LLVMValueRef res;
+      res = lp_build_ifloor(bld, a);
+      res = LLVMBuildSIToFP(bld->builder, res, LLVMFloatType(), "");
+      return res;
+   }
+
    if(util_cpu_caps.has_sse4_1)
       return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
    else {
@@ -921,15 +991,24 @@ lp_build_itrunc(struct lp_build_context *bld,
                 LLVMValueRef a)
 {
    const struct lp_type type = bld->type;
-   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
 
    assert(type.floating);
-   assert(lp_check_value(type, a));
 
-   return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
+   if (type.length == 1) {
+      LLVMTypeRef int_type = LLVMIntType(type.width);
+      return LLVMBuildFPToSI(bld->builder, a, int_type, "");
+   }
+   else {
+      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+      assert(lp_check_value(type, a));
+      return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
+   }
 }
 
 
+/**
+ * Convert float[] to int[] with round().
+ */
 LLVMValueRef
 lp_build_iround(struct lp_build_context *bld,
                 LLVMValueRef a)
@@ -939,6 +1018,15 @@ lp_build_iround(struct lp_build_context *bld,
    LLVMValueRef res;
 
    assert(type.floating);
+
+   if (type.length == 1) {
+      /* scalar float to int */
+      LLVMTypeRef int_type = LLVMIntType(type.width);
+      /* XXX we want rounding here! */
+      res = LLVMBuildFPToSI(bld->builder, a, int_type, "");
+      return res;
+   }
+
    assert(lp_check_value(type, a));
 
    if(util_cpu_caps.has_sse4_1) {
@@ -946,7 +1034,7 @@ lp_build_iround(struct lp_build_context *bld,
    }
    else {
       LLVMTypeRef vec_type = lp_build_vec_type(type);
-      LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
       LLVMValueRef sign;
       LLVMValueRef half;
 
@@ -955,7 +1043,7 @@ lp_build_iround(struct lp_build_context *bld,
       sign = LLVMBuildAnd(bld->builder, sign, mask, "");
 
       /* sign * 0.5 */
-      half = lp_build_const_scalar(type, 0.5);
+      half = lp_build_const_vec(type, 0.5);
       half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
       half = LLVMBuildOr(bld->builder, sign, half, "");
       half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
@@ -981,6 +1069,14 @@ lp_build_ifloor(struct lp_build_context *bld,
    LLVMValueRef res;
 
    assert(type.floating);
+
+   if (type.length == 1) {
+      /* scalar float to int */
+      LLVMTypeRef int_type = LLVMIntType(type.width);
+      res = LLVMBuildFPToSI(bld->builder, a, int_type, "");
+      return res;
+   }
+
    assert(lp_check_value(type, a));
 
    if(util_cpu_caps.has_sse4_1) {
@@ -990,18 +1086,18 @@ lp_build_ifloor(struct lp_build_context *bld,
       /* Take the sign bit and add it to 1 constant */
       LLVMTypeRef vec_type = lp_build_vec_type(type);
       unsigned mantissa = lp_mantissa(type);
-      LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
       LLVMValueRef sign;
       LLVMValueRef offset;
 
       /* sign = a < 0 ? ~0 : 0 */
       sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
       sign = LLVMBuildAnd(bld->builder, sign, mask, "");
-      sign = LLVMBuildAShr(bld->builder, sign, lp_build_int_const_scalar(type, type.width - 1), "");
+      sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "");
       lp_build_name(sign, "floor.sign");
 
       /* offset = -0.99999(9)f */
-      offset = lp_build_const_scalar(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa));
+      offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa));
       offset = LLVMConstBitCast(offset, int_vec_type);
 
       /* offset = a < 0 ? -0.99999(9)f : 0.0f */
@@ -1172,7 +1268,7 @@ lp_build_exp(struct lp_build_context *bld,
              LLVMValueRef x)
 {
    /* log2(e) = 1/log(2) */
-   LLVMValueRef log2e = lp_build_const_scalar(bld->type, 1.4426950408889634);
+   LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634);
 
    return lp_build_mul(bld, log2e, lp_build_exp2(bld, x));
 }
@@ -1186,7 +1282,7 @@ lp_build_log(struct lp_build_context *bld,
              LLVMValueRef x)
 {
    /* log(2) */
-   LLVMValueRef log2 = lp_build_const_scalar(bld->type, 0.69314718055994529);
+   LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529);
 
    return lp_build_mul(bld, log2, lp_build_exp2(bld, x));
 }
@@ -1207,6 +1303,7 @@ lp_build_polynomial(struct lp_build_context *bld,
                     unsigned num_coeffs)
 {
    const struct lp_type type = bld->type;
+   LLVMTypeRef float_type = LLVMFloatType();
    LLVMValueRef res = NULL;
    unsigned i;
 
@@ -1216,7 +1313,13 @@ lp_build_polynomial(struct lp_build_context *bld,
                    __FUNCTION__);
 
    for (i = num_coeffs; i--; ) {
-      LLVMValueRef coeff = lp_build_const_scalar(type, coeffs[i]);
+      LLVMValueRef coeff;
+
+      if (type.length == 1)
+         coeff = LLVMConstReal(float_type, coeffs[i]);
+      else
+         coeff = lp_build_const_vec(type, coeffs[i]);
+
       if(res)
          res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
       else
@@ -1272,11 +1375,11 @@ lp_build_exp2_approx(struct lp_build_context *bld,
 
       assert(type.floating && type.width == 32);
 
-      x = lp_build_min(bld, x, lp_build_const_scalar(type,  129.0));
-      x = lp_build_max(bld, x, lp_build_const_scalar(type, -126.99999));
+      x = lp_build_min(bld, x, lp_build_const_vec(type,  129.0));
+      x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));
 
       /* ipart = int(x - 0.5) */
-      ipart = LLVMBuildSub(bld->builder, x, lp_build_const_scalar(type, 0.5f), "");
+      ipart = LLVMBuildSub(bld->builder, x, lp_build_const_vec(type, 0.5f), "");
       ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
 
       /* fpart = x - ipart */
@@ -1286,8 +1389,8 @@ lp_build_exp2_approx(struct lp_build_context *bld,
 
    if(p_exp2_int_part || p_exp2) {
       /* expipart = (float) (1 << ipart) */
-      expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_int_const_scalar(type, 127), "");
-      expipart = LLVMBuildShl(bld->builder, expipart, lp_build_int_const_scalar(type, 23), "");
+      expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
+      expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), "");
       expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
    }
 
@@ -1353,8 +1456,8 @@ lp_build_log2_approx(struct lp_build_context *bld,
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
 
-   LLVMValueRef expmask = lp_build_int_const_scalar(type, 0x7f800000);
-   LLVMValueRef mantmask = lp_build_int_const_scalar(type, 0x007fffff);
+   LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000);
+   LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff);
    LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
 
    LLVMValueRef i = NULL;
@@ -1379,8 +1482,8 @@ lp_build_log2_approx(struct lp_build_context *bld,
    }
 
    if(p_floor_log2 || p_log2) {
-      logexp = LLVMBuildLShr(bld->builder, exp, lp_build_int_const_scalar(type, 23), "");
-      logexp = LLVMBuildSub(bld->builder, logexp, lp_build_int_const_scalar(type, 127), "");
+      logexp = LLVMBuildLShr(bld->builder, exp, lp_build_const_int_vec(type, 23), "");
+      logexp = LLVMBuildSub(bld->builder, logexp, lp_build_const_int_vec(type, 127), "");
       logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
    }
 
@@ -1410,11 +1513,87 @@ lp_build_log2_approx(struct lp_build_context *bld,
 }
 
 
+/**
+ * Scalar version of lp_build_log2_approx(), used for length-1 contexts.
+ * Each non-NULL p_* output receives its value; 32-bit floats only (asserted).
+ */
+static void
+lp_build_float_log2_approx(struct lp_build_context *bld,
+                           LLVMValueRef x,
+                           LLVMValueRef *p_exp,
+                           LLVMValueRef *p_floor_log2,
+                           LLVMValueRef *p_log2)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef float_type = LLVMFloatType();
+   LLVMTypeRef int_type = LLVMIntType(type.width);
+
+   LLVMValueRef expmask = LLVMConstInt(int_type, 0x7f800000, 0);
+   LLVMValueRef mantmask = LLVMConstInt(int_type, 0x007fffff, 0);
+   LLVMValueRef one = LLVMConstBitCast(bld->one, int_type);
+
+   LLVMValueRef i = NULL;
+   LLVMValueRef exp = NULL;
+   LLVMValueRef mant = NULL;
+   LLVMValueRef logexp = NULL;
+   LLVMValueRef logmant = NULL;
+   LLVMValueRef res = NULL;
+   /* Common to all outputs: view x as integer bits and mask out the exponent */
+   if(p_exp || p_floor_log2 || p_log2) {
+      /* TODO: optimize the constant case */
+      if(LLVMIsConstant(x))
+         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
+                      __FUNCTION__);
+      assert(type.floating && type.width == 32);
+
+      i = LLVMBuildBitCast(bld->builder, x, int_type, "");
+
+      /* exp = exponent bits of x, masked in place (not shifted or converted) */
+      exp = LLVMBuildAnd(bld->builder, i, expmask, "");
+   }
+   /* floor(log2(x)): shift the exponent down 23 bits, subtract the bias 127 */
+   if(p_floor_log2 || p_log2) {
+      LLVMValueRef c23 = LLVMConstInt(int_type, 23, 0);
+      LLVMValueRef c127 = LLVMConstInt(int_type, 127, 0);
+      logexp = LLVMBuildLShr(bld->builder, exp, c23, "");
+      logexp = LLVMBuildSub(bld->builder, logexp, c127, "");
+      logexp = LLVMBuildSIToFP(bld->builder, logexp, float_type, "");
+   }
+   /* log2(x): polynomial of the mantissa term plus the exponent term above */
+   if(p_log2) {
+      /* mant = mantissa bits of x OR'ed with the bits of bld->one, as a float */
+      mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
+      mant = LLVMBuildOr(bld->builder, mant, one, "");
+      mant = LLVMBuildBitCast(bld->builder, mant, float_type, "");
+
+      logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
+                                    Elements(lp_build_log2_polynomial));
+
+      /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
+      logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");
+
+      res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
+   }
+   /* Store results through whichever output pointers were supplied */
+   if(p_exp)
+      *p_exp = exp;
+   if(p_floor_log2)
+      *p_floor_log2 = logexp;
+   if(p_log2)
+      *p_log2 = res;
+}
+
+
 LLVMValueRef
 lp_build_log2(struct lp_build_context *bld,
               LLVMValueRef x)
 {
    LLVMValueRef res;
-   lp_build_log2_approx(bld, x, NULL, NULL, &res);
+   /* A length-1 context is treated as scalar: use the scalar helper there
+    * and the vector implementation otherwise. */
+   if (bld->type.length == 1)
+      lp_build_float_log2_approx(bld, x, NULL, NULL, &res);
+   else
+      lp_build_log2_approx(bld, x, NULL, NULL, &res);
    return res;
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
index 55385e3a66..31efa9921c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
@@ -37,7 +37,7 @@
 #define LP_BLD_ARIT_H
 
 
-#include <llvm-c/Core.h>  
+#include "gallivm/lp_bld.h"
 
 
 struct lp_type;
@@ -57,6 +57,10 @@ lp_build_add(struct lp_build_context *bld,
              LLVMValueRef b);
 
 LLVMValueRef
+lp_build_sum_vector(struct lp_build_context *bld,
+                    LLVMValueRef a);
+
+LLVMValueRef
 lp_build_sub(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_blend.h b/src/gallium/auxiliary/gallivm/lp_bld_blend.h
index da272e549f..ebbdb1a604 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_blend.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_blend.h
@@ -40,7 +40,7 @@
  * for a standalone example.
  */
 
-#include <llvm-c/Core.h>  
+#include "gallivm/lp_bld.h"
  
 #include "pipe/p_format.h"
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.c b/src/gallium/auxiliary/gallivm/lp_bld_const.c
index c8eaa8c394..57843e9a60 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_const.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_const.c
@@ -221,8 +221,16 @@ lp_build_undef(struct lp_type type)
 LLVMValueRef
 lp_build_zero(struct lp_type type)
 {
-   LLVMTypeRef vec_type = lp_build_vec_type(type);
-   return LLVMConstNull(vec_type);
+   if (type.length == 1) {
+      /* Scalar: take the null constant of the element type reported by
+       * lp_build_elem_type(), rather than assuming LLVMFloatType() for
+       * every floating-point type regardless of type.width. */
+      return LLVMConstNull(lp_build_elem_type(type));
+   }
+   else {
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      return LLVMConstNull(vec_type);
+   }
 }
                
 
@@ -255,7 +263,7 @@ lp_build_one(struct lp_type type)
       if(type.sign)
          /* TODO: Unfortunately this caused "Tried to create a shift operation
           * on a non-integer type!" */
-         vec = LLVMConstLShr(vec, lp_build_int_const_scalar(type, 1));
+         vec = LLVMConstLShr(vec, lp_build_const_int_vec(type, 1));
 #endif
 
       return vec;
@@ -264,13 +272,19 @@ lp_build_one(struct lp_type type)
    for(i = 1; i < type.length; ++i)
       elems[i] = elems[0];
 
-   return LLVMConstVector(elems, type.length);
+   if (type.length == 1)
+      return elems[0];
+   else
+      return LLVMConstVector(elems, type.length);
 }
                
 
+/**
+ * Build constant-valued vector from a scalar value.
+ */
 LLVMValueRef
-lp_build_const_scalar(struct lp_type type,
-                      double val)
+lp_build_const_vec(struct lp_type type,
+                   double val)
 {
    LLVMTypeRef elem_type = lp_build_elem_type(type);
    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
@@ -295,7 +309,7 @@ lp_build_const_scalar(struct lp_type type,
 
 
 LLVMValueRef
-lp_build_int_const_scalar(struct lp_type type,
+lp_build_const_int_vec(struct lp_type type,
                           long long val)
 {
    LLVMTypeRef elem_type = lp_build_int_elem_type(type);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.h b/src/gallium/auxiliary/gallivm/lp_bld_const.h
index cb8e1c7b00..9ca2f0664e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_const.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_const.h
@@ -37,9 +37,9 @@
 #define LP_BLD_CONST_H
 
 
-#include <llvm-c/Core.h>  
+#include "pipe/p_compiler.h"
+#include "gallivm/lp_bld.h"
 
-#include <pipe/p_compiler.h>
 
 
 struct lp_type;
@@ -85,13 +85,11 @@ lp_build_one(struct lp_type type);
 
 
 LLVMValueRef
-lp_build_const_scalar(struct lp_type type,
-                      double val);
+lp_build_const_vec(struct lp_type type, double val);
 
 
 LLVMValueRef
-lp_build_int_const_scalar(struct lp_type type,
-                          long long val);
+lp_build_const_int_vec(struct lp_type type, long long val);
 
 
 LLVMValueRef
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index f77cf78721..3f7f2ebde9 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -114,13 +114,13 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
    scale = (double)mask/ubound;
    bias = (double)((unsigned long long)1 << (mantissa - n));
 
-   res = LLVMBuildMul(builder, src, lp_build_const_scalar(src_type, scale), "");
-   res = LLVMBuildAdd(builder, res, lp_build_const_scalar(src_type, bias), "");
+   res = LLVMBuildMul(builder, src, lp_build_const_vec(src_type, scale), "");
+   res = LLVMBuildAdd(builder, res, lp_build_const_vec(src_type, bias), "");
    res = LLVMBuildBitCast(builder, res, int_vec_type, "");
 
    if(dst_width > n) {
       int shift = dst_width - n;
-      res = LLVMBuildShl(builder, res, lp_build_int_const_scalar(src_type, shift), "");
+      res = LLVMBuildShl(builder, res, lp_build_const_int_vec(src_type, shift), "");
 
       /* TODO: Fill in the empty lower bits for additional precision? */
       /* YES: this fixes progs/trivial/tri-z-eq.c.
@@ -130,21 +130,21 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
 #if 0
       {
          LLVMValueRef msb;
-         msb = LLVMBuildLShr(builder, res, lp_build_int_const_scalar(src_type, dst_width - 1), "");
-         msb = LLVMBuildShl(builder, msb, lp_build_int_const_scalar(src_type, shift), "");
-         msb = LLVMBuildSub(builder, msb, lp_build_int_const_scalar(src_type, 1), "");
+         msb = LLVMBuildLShr(builder, res, lp_build_const_int_vec(src_type, dst_width - 1), "");
+         msb = LLVMBuildShl(builder, msb, lp_build_const_int_vec(src_type, shift), "");
+         msb = LLVMBuildSub(builder, msb, lp_build_const_int_vec(src_type, 1), "");
          res = LLVMBuildOr(builder, res, msb, "");
       }
 #elif 0
       while(shift > 0) {
-         res = LLVMBuildOr(builder, res, LLVMBuildLShr(builder, res, lp_build_int_const_scalar(src_type, n), ""), "");
+         res = LLVMBuildOr(builder, res, LLVMBuildLShr(builder, res, lp_build_const_int_vec(src_type, n), ""), "");
          shift -= n;
          n *= 2;
       }
 #endif
    }
    else
-      res = LLVMBuildAnd(builder, res, lp_build_int_const_scalar(src_type, mask), "");
+      res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(src_type, mask), "");
 
    return res;
 }
@@ -183,10 +183,10 @@ lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
 
    if(src_width > mantissa) {
       int shift = src_width - mantissa;
-      res = LLVMBuildLShr(builder, res, lp_build_int_const_scalar(dst_type, shift), "");
+      res = LLVMBuildLShr(builder, res, lp_build_const_int_vec(dst_type, shift), "");
    }
 
-   bias_ = lp_build_const_scalar(dst_type, bias);
+   bias_ = lp_build_const_vec(dst_type, bias);
 
    res = LLVMBuildOr(builder,
                      res,
@@ -195,7 +195,7 @@ lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
    res = LLVMBuildBitCast(builder, res, vec_type, "");
 
    res = LLVMBuildSub(builder, res, bias_, "");
-   res = LLVMBuildMul(builder, res, lp_build_const_scalar(dst_type, scale), "");
+   res = LLVMBuildMul(builder, res, lp_build_const_vec(dst_type, scale), "");
 
    return res;
 }
@@ -251,7 +251,7 @@ lp_build_conv(LLVMBuilderRef builder,
          if(dst_min == 0.0)
             thres = bld.zero;
          else
-            thres = lp_build_const_scalar(src_type, dst_min);
+            thres = lp_build_const_vec(src_type, dst_min);
          for(i = 0; i < num_tmps; ++i)
             tmp[i] = lp_build_max(&bld, tmp[i], thres);
       }
@@ -260,7 +260,7 @@ lp_build_conv(LLVMBuilderRef builder,
          if(dst_max == 1.0)
             thres = bld.one;
          else
-            thres = lp_build_const_scalar(src_type, dst_max);
+            thres = lp_build_const_vec(src_type, dst_max);
          for(i = 0; i < num_tmps; ++i)
             tmp[i] = lp_build_min(&bld, tmp[i], thres);
       }
@@ -288,7 +288,7 @@ lp_build_conv(LLVMBuilderRef builder,
          LLVMTypeRef tmp_vec_type;
 
          if (dst_scale != 1.0) {
-            LLVMValueRef scale = lp_build_const_scalar(tmp_type, dst_scale);
+            LLVMValueRef scale = lp_build_const_vec(tmp_type, dst_scale);
             for(i = 0; i < num_tmps; ++i)
                tmp[i] = LLVMBuildMul(builder, tmp[i], scale, "");
          }
@@ -315,7 +315,7 @@ lp_build_conv(LLVMBuilderRef builder,
 
       /* FIXME: compensate different offsets too */
       if(src_shift > dst_shift) {
-         LLVMValueRef shift = lp_build_int_const_scalar(tmp_type, src_shift - dst_shift);
+         LLVMValueRef shift = lp_build_const_int_vec(tmp_type, src_shift - dst_shift);
          for(i = 0; i < num_tmps; ++i)
             if(src_type.sign)
                tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, "");
@@ -388,7 +388,7 @@ lp_build_conv(LLVMBuilderRef builder,
           }
 
           if (src_scale != 1.0) {
-             LLVMValueRef scale = lp_build_const_scalar(tmp_type, 1.0/src_scale);
+             LLVMValueRef scale = lp_build_const_vec(tmp_type, 1.0/src_scale);
              for(i = 0; i < num_tmps; ++i)
                 tmp[i] = LLVMBuildMul(builder, tmp[i], scale, "");
           }
@@ -400,7 +400,7 @@ lp_build_conv(LLVMBuilderRef builder,
 
        /* FIXME: compensate different offsets too */
        if(src_shift < dst_shift) {
-          LLVMValueRef shift = lp_build_int_const_scalar(tmp_type, dst_shift - src_shift);
+          LLVMValueRef shift = lp_build_const_int_vec(tmp_type, dst_shift - src_shift);
           for(i = 0; i < num_tmps; ++i)
              tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
        }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.h b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
index 948e68fae4..628831c3ad 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
@@ -37,7 +37,7 @@
 #define LP_BLD_CONV_H
 
 
-#include <llvm-c/Core.h>  
+#include "gallivm/lp_bld.h"
 
 
 struct lp_type;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.h b/src/gallium/auxiliary/gallivm/lp_bld_debug.h
index 583e6132b4..7b010cbdb0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.h
@@ -30,7 +30,7 @@
 #define LP_BLD_DEBUG_H
 
 
-#include <llvm-c/Core.h>
+#include "gallivm/lp_bld.h"
 
 #include "pipe/p_compiler.h"
 #include "util/u_string.h"
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_depth.c b/src/gallium/auxiliary/gallivm/lp_bld_depth.c
index f08f8eb6d8..4ce1a27a06 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_depth.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_depth.c
@@ -52,7 +52,14 @@
  *  Z31 Z32 Z41 Z42 Z33 Z34 Z43 Z44 ...
  *  ... ... ... ... ... ... ... ... ...
  *
- * FIXME: Code generate stencil test
+ *
+ * Stencil test:
+ * Two-sided stencil test is supported but probably not as efficient as
+ * it could be.  Currently, we use if/then/else constructs to do the
+ * operations for front vs. back-facing polygons.  We could probably do
+ * both the front and back arithmetic then use a Select() instruction to
+ * choose the result depending on polygon orientation.  We'd have to
+ * measure performance both ways and see which is better.
  *
  * @author Jose Fonseca <jfonseca@vmware.com>
  */
@@ -61,11 +68,264 @@
 #include "util/u_format.h"
 
 #include "lp_bld_type.h"
+#include "lp_bld_arit.h"
 #include "lp_bld_const.h"
 #include "lp_bld_logic.h"
 #include "lp_bld_flow.h"
 #include "lp_bld_debug.h"
 #include "lp_bld_depth.h"
+#include "lp_bld_swizzle.h"
+
+
+/** Used to select fields from pipe_stencil_state */
+enum stencil_op {
+   S_FAIL_OP,   /**< selects stencil->fail_op */
+   Z_FAIL_OP,   /**< selects stencil->zfail_op */
+   Z_PASS_OP    /**< selects stencil->zpass_op */
+};
+
+
+
+/**
+ * Do the stencil test comparison (compare ref value against FB stencil values).
+ * This will be used twice when generating two-sided stencil code.
+ * \param stencil  the front/back stencil state
+ * \param stencilRef  the stencil reference value, replicated as a vector
+ * \param stencilVals  vector of stencil values from framebuffer
+ * \return vector mask of pass/fail values (~0 or 0)
+ */
+static LLVMValueRef
+lp_build_stencil_test_single(struct lp_build_context *bld,
+                             const struct pipe_stencil_state *stencil,
+                             LLVMValueRef stencilRef,
+                             LLVMValueRef stencilVals)
+{
+   const unsigned stencilMax = 255; /* XXX fix */
+   struct lp_type type = bld->type;
+   LLVMValueRef res;
+
+   assert(type.sign);
+
+   assert(stencil->enabled);
+
+   if (stencil->valuemask != stencilMax) {
+      /* compute stencilRef = stencilRef & valuemask */
+      LLVMValueRef valuemask = lp_build_const_int_vec(type, stencil->valuemask);
+      stencilRef = LLVMBuildAnd(bld->builder, stencilRef, valuemask, "");
+      /* compute stencilVals = stencilVals & valuemask */
+      stencilVals = LLVMBuildAnd(bld->builder, stencilVals, valuemask, "");
+   }
+   /* test is (ref & mask) FUNC (stencil & mask): the ref is the left operand */
+   res = lp_build_cmp(bld, stencil->func, stencilRef, stencilVals);
+
+   return res;
+}
+
+
+/**
+ * Do the one or two-sided stencil test comparison.
+ * \sa lp_build_stencil_test_single
+ * \param face  a float value indicating front (> 0.0) or back facing polygon.
+ *              If NULL, assume front-facing.
+ */
+static LLVMValueRef
+lp_build_stencil_test(struct lp_build_context *bld,
+                      const struct pipe_stencil_state stencil[2],
+                      LLVMValueRef stencilRefs[2],
+                      LLVMValueRef stencilVals,
+                      LLVMValueRef face)
+{
+   LLVMValueRef res;
+
+   assert(stencil[0].enabled);
+
+   if (stencil[1].enabled && face) {
+      /* do two-sided test: branch on facing, test with the matching state */
+      struct lp_build_flow_context *flow_ctx;
+      struct lp_build_if_state if_ctx;
+      LLVMValueRef front_facing;
+      LLVMValueRef zero = LLVMConstReal(LLVMFloatType(), 0.0);
+      LLVMValueRef result = bld->undef;
+
+      flow_ctx = lp_build_flow_create(bld->builder);
+      lp_build_flow_scope_begin(flow_ctx);
+
+      lp_build_flow_scope_declare(flow_ctx, &result);
+
+      /* front_facing = face > 0.0 (float compare against a float zero) */
+      front_facing = LLVMBuildFCmp(bld->builder, LLVMRealUGT, face, zero, "");
+
+      lp_build_if(&if_ctx, flow_ctx, bld->builder, front_facing);
+      {
+         result = lp_build_stencil_test_single(bld, &stencil[0],
+                                               stencilRefs[0], stencilVals);
+      }
+      lp_build_else(&if_ctx);
+      {
+         result = lp_build_stencil_test_single(bld, &stencil[1],
+                                               stencilRefs[1], stencilVals);
+      }
+      lp_build_endif(&if_ctx);
+
+      lp_build_flow_scope_end(flow_ctx);
+      lp_build_flow_destroy(flow_ctx);
+
+      res = result;
+   }
+   else {
+      /* do single-side test: always use the front state, stencil[0] */
+      res = lp_build_stencil_test_single(bld, &stencil[0],
+                                         stencilRefs[0], stencilVals);
+   }
+
+   return res;
+}
+
+
+/**
+ * Apply the stencil operator (add/sub/keep/etc) to the given vector
+ * of stencil values, for the elements enabled by 'mask' only.
+ * \return  new stencil values vector (bits outside stencil->writemask are kept)
+ */
+static LLVMValueRef
+lp_build_stencil_op_single(struct lp_build_context *bld,
+                           const struct pipe_stencil_state *stencil,
+                           enum stencil_op op,
+                           LLVMValueRef stencilRef,
+                           LLVMValueRef stencilVals,
+                           LLVMValueRef mask)
+
+{
+   const unsigned stencilMax = 255; /* XXX fix */
+   struct lp_type type = bld->type;
+   LLVMValueRef res;
+   LLVMValueRef max = lp_build_const_int_vec(type, stencilMax);
+   unsigned stencil_op;
+
+   assert(type.sign);
+
+   switch (op) {
+   case S_FAIL_OP:
+      stencil_op = stencil->fail_op;
+      break;
+   case Z_FAIL_OP:
+      stencil_op = stencil->zfail_op;
+      break;
+   case Z_PASS_OP:
+      stencil_op = stencil->zpass_op;
+      break;
+   default:
+      assert(0 && "Invalid stencil_op mode");
+      stencil_op = PIPE_STENCIL_OP_KEEP;
+   }
+
+   switch (stencil_op) {
+   case PIPE_STENCIL_OP_KEEP:
+      res = stencilVals;
+      /* we can return early for this case */
+      return res;
+   case PIPE_STENCIL_OP_ZERO:
+      res = bld->zero;
+      break;
+   case PIPE_STENCIL_OP_REPLACE:
+      res = stencilRef;
+      break;
+   case PIPE_STENCIL_OP_INCR:
+      res = lp_build_add(bld, stencilVals, bld->one);
+      res = lp_build_min(bld, res, max);
+      break;
+   case PIPE_STENCIL_OP_DECR:
+      res = lp_build_sub(bld, stencilVals, bld->one);
+      res = lp_build_max(bld, res, bld->zero);
+      break;
+   case PIPE_STENCIL_OP_INCR_WRAP:
+      res = lp_build_add(bld, stencilVals, bld->one);
+      res = LLVMBuildAnd(bld->builder, res, max, "");
+      break;
+   case PIPE_STENCIL_OP_DECR_WRAP:
+      res = lp_build_sub(bld, stencilVals, bld->one);
+      res = LLVMBuildAnd(bld->builder, res, max, "");
+      break;
+   case PIPE_STENCIL_OP_INVERT:
+      res = LLVMBuildNot(bld->builder, stencilVals, "");
+      res = LLVMBuildAnd(bld->builder, res, max, "");
+      break;
+   default:
+      assert(0 && "bad stencil op mode");
+      res = stencilVals;  /* behave like KEEP; never hand NULL to LLVM */
+   }
+
+   if (stencil->writemask != stencilMax) {
+      /* compute res = (res & writemask) | (stencilVals & ~writemask) */
+      LLVMValueRef writemask = lp_build_const_int_vec(type, stencil->writemask);
+      LLVMValueRef cmask = LLVMBuildNot(bld->builder, writemask, "notWritemask");
+      LLVMValueRef t1 = LLVMBuildAnd(bld->builder, res, writemask, "t1");
+      LLVMValueRef t2 = LLVMBuildAnd(bld->builder, stencilVals, cmask, "t2");
+      res = LLVMBuildOr(bld->builder, t1, t2, "t1_or_t2");
+   }
+
+   /* only update the vector elements enabled by 'mask' */
+   res = lp_build_select(bld, mask, res, stencilVals);
+
+   return res;
+}
+
+
+/**
+ * Apply stencil operator 'op' (one- or two-sided, chosen by 'face') to the elements enabled in 'mask'.
+ */
+static LLVMValueRef
+lp_build_stencil_op(struct lp_build_context *bld,
+                    const struct pipe_stencil_state stencil[2],
+                    enum stencil_op op,
+                    LLVMValueRef stencilRefs[2],
+                    LLVMValueRef stencilVals,
+                    LLVMValueRef mask,
+                    LLVMValueRef face)
+
+{
+   assert(stencil[0].enabled);
+
+   if (stencil[1].enabled && face) {
+      /* do two-sided op: branch on facing and use the matching state/ref */
+      struct lp_build_flow_context *flow_ctx;
+      struct lp_build_if_state if_ctx;
+      LLVMValueRef front_facing;
+      LLVMValueRef zero = LLVMConstReal(LLVMFloatType(), 0.0);
+      LLVMValueRef result = bld->undef;
+
+      flow_ctx = lp_build_flow_create(bld->builder);
+      lp_build_flow_scope_begin(flow_ctx);
+
+      lp_build_flow_scope_declare(flow_ctx, &result);
+
+      /* front_facing = face > 0.0 (float compare against a float zero) */
+      front_facing = LLVMBuildFCmp(bld->builder, LLVMRealUGT, face, zero, "");
+
+      lp_build_if(&if_ctx, flow_ctx, bld->builder, front_facing);
+      {
+         result = lp_build_stencil_op_single(bld, &stencil[0], op,
+                                             stencilRefs[0], stencilVals, mask);
+      }
+      lp_build_else(&if_ctx);
+      {
+         result = lp_build_stencil_op_single(bld, &stencil[1], op,
+                                             stencilRefs[1], stencilVals, mask);
+      }
+      lp_build_endif(&if_ctx);
+
+      lp_build_flow_scope_end(flow_ctx);
+      lp_build_flow_destroy(flow_ctx);
+
+      return result;
+   }
+   else {
+      /* do single-sided op: always use the front state, stencil[0] */
+      return lp_build_stencil_op_single(bld, &stencil[0], op,
+                                        stencilRefs[0], stencilVals, mask);
+   }
+}
+
 
 
 /**
@@ -109,105 +369,303 @@ lp_depth_type(const struct util_format_description *format_desc,
 
 
 /**
- * Depth test.
+ * Compute bitmask and bit shift to apply to the incoming fragment Z values
+ * and the Z buffer values needed before doing the Z comparison.
+ * Returns FALSE if the format has no Z channel; else TRUE with *shift, *mask set.
+ * Note that we leave the Z bits in the position that we find them
+ * in the Z buffer (typically 0xffffff00 or 0x00ffffff).  That lets us
+ * get by with fewer bit twiddling steps.
  */
-void
-lp_build_depth_test(LLVMBuilderRef builder,
-                    const struct pipe_depth_state *state,
-                    struct lp_type type,
-                    const struct util_format_description *format_desc,
-                    struct lp_build_mask_context *mask,
-                    LLVMValueRef src,
-                    LLVMValueRef dst_ptr)
+static boolean
+get_z_shift_and_mask(const struct util_format_description *format_desc,
+                     unsigned *shift, unsigned *mask)
 {
-   struct lp_build_context bld;
+   const unsigned total_bits = format_desc->block.bits;
    unsigned z_swizzle;
-   LLVMValueRef dst;
-   LLVMValueRef z_bitmask = NULL;
-   LLVMValueRef test;
-
-   if(!state->enabled)
-      return;
-
+   unsigned chan;   /* unsigned: compared against the unsigned z_swizzle */
+   unsigned padding_left, padding_right;
+
    assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
    assert(format_desc->block.width == 1);
    assert(format_desc->block.height == 1);
 
    z_swizzle = format_desc->swizzle[0];
-   if(z_swizzle == UTIL_FORMAT_SWIZZLE_NONE)
-      return;
 
-   /* Sanity checking */
-   assert(z_swizzle < 4);
-   assert(format_desc->block.bits == type.width);
-   if(type.floating) {
-      assert(z_swizzle == 0);
-      assert(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_FLOAT);
-      assert(format_desc->channel[z_swizzle].size == format_desc->block.bits);
+   if (z_swizzle == UTIL_FORMAT_SWIZZLE_NONE)
+      return FALSE;
+
+   padding_right = 0;   /* bits of the channels stored below Z */
+   for (chan = 0; chan < z_swizzle; ++chan)
+      padding_right += format_desc->channel[chan].size;
+
+   padding_left =       /* bits remaining above Z */
+      total_bits - (padding_right + format_desc->channel[z_swizzle].size);
+
+   if (padding_left || padding_right) {
+      unsigned long long mask_left = (1ULL << (total_bits - padding_left)) - 1;
+      unsigned long long mask_right = (1ULL << (padding_right)) - 1;
+      *mask = mask_left ^ mask_right;
    }
    else {
-      assert(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED);
-      assert(format_desc->channel[z_swizzle].normalized);
-      assert(!type.fixed);
-      assert(!type.sign);
-      assert(type.norm);
+      *mask = 0xffffffff;
+   }
+
+   *shift = padding_left;
+
+   return TRUE;
+}
+
+
+/**
+ * Compute bitmask and bit shift to apply to the framebuffer pixel values
+ * to put the stencil bits in the least significant position.
+ * (i.e. 0x000000ff).  Returns FALSE if the format has no stencil channel.
+ */
+static boolean
+get_s_shift_and_mask(const struct util_format_description *format_desc,
+                     unsigned *shift, unsigned *mask)
+{
+   unsigned s_swizzle;
+   unsigned chan, sz;
+
+   s_swizzle = format_desc->swizzle[1];
+
+   if (s_swizzle == UTIL_FORMAT_SWIZZLE_NONE)
+      return FALSE;
+
+   *shift = 0;
+   for (chan = 0; chan < s_swizzle; chan++)
+      *shift += format_desc->channel[chan].size;
+
+   sz = format_desc->channel[s_swizzle].size;
+   *mask = (sz < 32) ? (1U << sz) - 1U : 0xffffffffU;  /* 1U << 32 is undefined */
+
+   return TRUE;
+}
+
+
+
+/**
+ * Generate code for performing depth and/or stencil tests.
+ * We operate on a vector of values (typically a 2x2 quad).
+ *
+ * \param depth  the depth test state
+ * \param stencil  the front/back stencil state
+ * \param type  the data type of the fragment depth/stencil values
+ * \param format_desc  description of the depth/stencil surface
+ * \param mask  the alive/dead pixel mask for the quad (vector)
+ * \param stencil_refs  the front/back stencil ref values (scalar)
+ * \param z_src  the incoming depth/stencil values (a 2x2 quad)
+ * \param zs_dst_ptr  pointer to depth/stencil values in framebuffer
+ * \param face  contains float value indicating front/back facing polygon
+ */
+void
+lp_build_depth_stencil_test(LLVMBuilderRef builder,
+                            const struct pipe_depth_state *depth,
+                            const struct pipe_stencil_state stencil[2],
+                            struct lp_type type,
+                            const struct util_format_description *format_desc,
+                            struct lp_build_mask_context *mask,
+                            LLVMValueRef stencil_refs[2],
+                            LLVMValueRef z_src,
+                            LLVMValueRef zs_dst_ptr,
+                            LLVMValueRef face)
+{
+   struct lp_build_context bld;
+   struct lp_build_context sbld;
+   struct lp_type s_type;
+   LLVMValueRef zs_dst, z_dst = NULL;
+   LLVMValueRef stencil_vals = NULL;
+   LLVMValueRef z_bitmask = NULL, stencil_shift = NULL;
+   LLVMValueRef z_pass = NULL, s_pass_mask = NULL;
+   LLVMValueRef orig_mask = mask->value;
+
+   /* Sanity checking */
+   {
+      const unsigned z_swizzle = format_desc->swizzle[0];
+      const unsigned s_swizzle = format_desc->swizzle[1];
+
+      assert(z_swizzle != UTIL_FORMAT_SWIZZLE_NONE ||
+             s_swizzle != UTIL_FORMAT_SWIZZLE_NONE);
+
+      assert(depth->enabled || stencil[0].enabled);
+
+      assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
+      assert(format_desc->block.width == 1);
+      assert(format_desc->block.height == 1);
+
+      if (stencil[0].enabled) {
+         assert(format_desc->format == PIPE_FORMAT_Z24S8_UNORM ||
+                format_desc->format == PIPE_FORMAT_S8Z24_UNORM);
+      }
+
+      assert(z_swizzle < 4);
+      assert(format_desc->block.bits == type.width);
+      if (type.floating) {
+         assert(z_swizzle == 0);
+         assert(format_desc->channel[z_swizzle].type ==
+                UTIL_FORMAT_TYPE_FLOAT);
+         assert(format_desc->channel[z_swizzle].size ==
+                format_desc->block.bits);
+      }
+      else {
+         assert(format_desc->channel[z_swizzle].type ==
+                UTIL_FORMAT_TYPE_UNSIGNED);
+         assert(format_desc->channel[z_swizzle].normalized);
+         assert(!type.fixed);
+         assert(!type.sign);
+         assert(type.norm);
+      }
    }
 
-   /* Setup build context */
+
+   /* Setup build context for Z vals */
    lp_build_context_init(&bld, builder, type);
 
-   dst = LLVMBuildLoad(builder, dst_ptr, "");
+   /* Setup build context for stencil vals */
+   s_type = lp_type_int_vec(type.width);
+   lp_build_context_init(&sbld, builder, s_type);
+
+   /* Load current z/stencil value from z/stencil buffer */
+   zs_dst = LLVMBuildLoad(builder, zs_dst_ptr, "");
+
+   lp_build_name(zs_dst, "zsbufval");
 
-   lp_build_name(dst, "zsbuf");
 
-   /* Align the source depth bits with the destination's, and mask out any
-    * stencil or padding bits from both */
-   if(format_desc->channel[z_swizzle].size == format_desc->block.bits) {
-      assert(z_swizzle == 0);
-      /* nothing to do */
+   /* Compute and apply the Z/stencil bitmasks and shifts.
+    */
+   {
+      unsigned z_shift, z_mask;
+      unsigned s_shift, s_mask;
+
+      if (get_z_shift_and_mask(format_desc, &z_shift, &z_mask)) {
+         if (z_shift) {
+            LLVMValueRef shift = lp_build_const_int_vec(type, z_shift);
+            z_src = LLVMBuildLShr(builder, z_src, shift, "");
+         }
+
+         if (z_mask != 0xffffffff) {
+            LLVMValueRef mask = lp_build_const_int_vec(type, z_mask);
+            z_src = LLVMBuildAnd(builder, z_src, mask, "");
+            z_dst = LLVMBuildAnd(builder, zs_dst, mask, "");
+            z_bitmask = mask;  /* used below */
+         }
+         else {
+            z_dst = zs_dst;
+         }
+
+         lp_build_name(z_dst, "zsbuf.z");
+      }
+
+      if (get_s_shift_and_mask(format_desc, &s_shift, &s_mask)) {
+         if (s_shift) {
+            LLVMValueRef shift = lp_build_const_int_vec(type, s_shift);
+            stencil_vals = LLVMBuildLShr(builder, zs_dst, shift, "");
+            stencil_shift = shift;  /* used below */
+         }
+         else {
+            stencil_vals = zs_dst;
+         }
+
+         if (s_mask != 0xffffffff) {
+            LLVMValueRef mask = lp_build_const_int_vec(type, s_mask);
+            stencil_vals = LLVMBuildAnd(builder, stencil_vals, mask, "");
+         }
+
+         lp_build_name(stencil_vals, "stencil");
+      }
    }
-   else {
-      unsigned padding_left;
-      unsigned padding_right;
-      unsigned chan;
-
-      assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
-      assert(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED);
-      assert(format_desc->channel[z_swizzle].size <= format_desc->block.bits);
-      assert(format_desc->channel[z_swizzle].normalized);
-
-      padding_right = 0;
-      for(chan = 0; chan < z_swizzle; ++chan)
-         padding_right += format_desc->channel[chan].size;
-      padding_left = format_desc->block.bits -
-                     (padding_right + format_desc->channel[z_swizzle].size);
-
-      if(padding_left || padding_right) {
-         const unsigned long long mask_left = ((unsigned long long)1 << (format_desc->block.bits - padding_left)) - 1;
-         const unsigned long long mask_right = ((unsigned long long)1 << (padding_right)) - 1;
-         z_bitmask = lp_build_int_const_scalar(type, mask_left ^ mask_right);
+
+
+   if (stencil[0].enabled) {
+      /* convert scalar stencil refs into vectors */
+      stencil_refs[0] = lp_build_broadcast_scalar(&bld, stencil_refs[0]);
+      stencil_refs[1] = lp_build_broadcast_scalar(&bld, stencil_refs[1]);
+
+      s_pass_mask = lp_build_stencil_test(&sbld, stencil,
+                                          stencil_refs, stencil_vals, face);
+
+      /* apply stencil-fail operator */
+      {
+         LLVMValueRef s_fail_mask = lp_build_andc(&bld, orig_mask, s_pass_mask);
+         stencil_vals = lp_build_stencil_op(&sbld, stencil, S_FAIL_OP,
+                                            stencil_refs, stencil_vals,
+                                            s_fail_mask, face);
+      }
+   }
+
+   if (depth->enabled) {
+      /* compare src Z to dst Z, returning 'pass' mask */
+      z_pass = lp_build_cmp(&bld, depth->func, z_src, z_dst);
+
+      if (!stencil[0].enabled) {
+         /* We can potentially skip all remaining operations here, but only
+          * if stencil is disabled because we still need to update the stencil
+          * buffer values.  Don't need to update Z buffer values.
+          */
+         lp_build_mask_update(mask, z_pass);
+      }
+
+      if (depth->writemask) {
+         LLVMValueRef zsel = LLVMBuildAnd(builder, mask->value, z_pass, "");
+         if (s_pass_mask)   /* don't write Z where the stencil test failed */
+            zsel = LLVMBuildAnd(builder, zsel, s_pass_mask, "");
+         if (z_bitmask)     /* leave stencil/padding bits untouched */
+            zsel = LLVMBuildAnd(builder, zsel, z_bitmask, "");
+         z_dst = lp_build_select(&bld, zsel, z_src, z_dst);
       }
 
-      if(padding_left)
-         src = LLVMBuildLShr(builder, src, lp_build_int_const_scalar(type, padding_left), "");
-      if(padding_right)
-         src = LLVMBuildAnd(builder, src, z_bitmask, "");
-      if(padding_left || padding_right)
-         dst = LLVMBuildAnd(builder, dst, z_bitmask, "");
+      if (stencil[0].enabled) {
+         /* update stencil values per Z result, only where stencil test passed */
+         LLVMValueRef z_fail_mask, z_pass_mask;
+         LLVMValueRef s_live = LLVMBuildAnd(builder, orig_mask, s_pass_mask, "");
+         /* apply Z-fail operator */
+         z_fail_mask = lp_build_andc(&bld, s_live, z_pass);
+         stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_FAIL_OP,
+                                            stencil_refs, stencil_vals,
+                                            z_fail_mask, face);
+
+         /* apply Z-pass operator */
+         z_pass_mask = LLVMBuildAnd(bld.builder, s_live, z_pass, "");
+         stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_PASS_OP,
+                                            stencil_refs, stencil_vals,
+                                            z_pass_mask, face);
+      }
+   }
+   else {
+      /* No depth test: apply Z-pass operator to stencil buffer values which
+       * passed the stencil test.
+       */
+      s_pass_mask = LLVMBuildAnd(bld.builder, orig_mask, s_pass_mask, "");
+      stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_PASS_OP,
+                                         stencil_refs, stencil_vals,
+                                         s_pass_mask, face);
    }
 
-   lp_build_name(dst, "zsbuf.z");
+   /* The Z bits are already in the right place but we may need to shift the
+    * stencil bits before ORing Z with Stencil to make the final pixel value.
+    */
+   if (stencil_vals && stencil_shift)
+      stencil_vals = LLVMBuildShl(bld.builder, stencil_vals,
+                                  stencil_shift, "");
 
-   test = lp_build_cmp(&bld, state->func, src, dst);
-   lp_build_mask_update(mask, test);
+   /* Finally, merge/store the z/stencil values */
+   if ((depth->enabled && depth->writemask) ||
+       (stencil[0].enabled && stencil[0].writemask)) {
 
-   if(state->writemask) {
-      if(z_bitmask)
-         z_bitmask = LLVMBuildAnd(builder, mask->value, z_bitmask, "");
+      if (z_dst && stencil_vals)
+         zs_dst = LLVMBuildOr(bld.builder, z_dst, stencil_vals, "");
+      else if (z_dst)
+         zs_dst = z_dst;
       else
-         z_bitmask = mask->value;
+         zs_dst = stencil_vals;
 
-      dst = lp_build_select(&bld, z_bitmask, src, dst);
-      LLVMBuildStore(builder, dst, dst_ptr);
+      LLVMBuildStore(builder, zs_dst, zs_dst_ptr);
    }
+
+   if (s_pass_mask)
+      lp_build_mask_update(mask, s_pass_mask);
+
+   if (depth->enabled && stencil[0].enabled)
+      lp_build_mask_update(mask, z_pass);
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_depth.h b/src/gallium/auxiliary/gallivm/lp_bld_depth.h
index 79d6981bb5..27dd46b625 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_depth.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_depth.h
@@ -36,7 +36,7 @@
 #define LP_BLD_DEPTH_H
 
 
-#include <llvm-c/Core.h>  
+#include "gallivm/lp_bld.h"
 
  
 struct pipe_depth_state;
@@ -51,13 +51,16 @@ lp_depth_type(const struct util_format_description *format_desc,
 
 
 void
-lp_build_depth_test(LLVMBuilderRef builder,
-                    const struct pipe_depth_state *state,
-                    struct lp_type type,
-                    const struct util_format_description *format_desc,
-                    struct lp_build_mask_context *mask,
-                    LLVMValueRef src,
-                    LLVMValueRef dst_ptr);
+lp_build_depth_stencil_test(LLVMBuilderRef builder,
+                            const struct pipe_depth_state *depth,
+                            const struct pipe_stencil_state stencil[2],
+                            struct lp_type type,
+                            const struct util_format_description *format_desc,
+                            struct lp_build_mask_context *mask,
+                            LLVMValueRef stencil_refs[2],
+                            LLVMValueRef zs_src,
+                            LLVMValueRef zs_dst_ptr,
+                            LLVMValueRef facing);
 
 
 #endif /* !LP_BLD_DEPTH_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.c b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
index bc83138908..106fc03e46 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
@@ -308,7 +308,7 @@ lp_build_flow_scope_end(struct lp_build_flow_context *flow)
  * Note: this function has no dependencies on the flow code and could
  * be used elsewhere.
  */
-static LLVMBasicBlockRef
+LLVMBasicBlockRef
 lp_build_insert_new_block(LLVMBuilderRef builder, const char *name)
 {
    LLVMBasicBlockRef current_block;
@@ -648,7 +648,9 @@ lp_build_if(struct lp_build_if_state *ctx,
       ifthen->phi[i] = LLVMBuildPhi(builder, LLVMTypeOf(*flow->variables[i]), "");
 
       /* add add the initial value of the var from the entry block */
-      LLVMAddIncoming(ifthen->phi[i], flow->variables[i], &ifthen->entry_block, 1);
+      if (!LLVMIsUndef(*flow->variables[i]))
+         LLVMAddIncoming(ifthen->phi[i], flow->variables[i],
+                         &ifthen->entry_block, 1);
    }
 
    /* create/insert true_block before merge_block */
@@ -695,18 +697,21 @@ lp_build_endif(struct lp_build_if_state *ctx)
 {
    struct lp_build_flow_context *flow = ctx->flow;
    struct lp_build_flow_if *ifthen;
+   LLVMBasicBlockRef curBlock = LLVMGetInsertBlock(ctx->builder);
    unsigned i;
 
    ifthen = &lp_build_flow_pop(flow, LP_BUILD_FLOW_IF)->ifthen;
    assert(ifthen);
 
+   /* Insert branch to the merge block from current block */
+   LLVMBuildBr(ctx->builder, ifthen->merge_block);
+
    if (ifthen->false_block) {
       LLVMPositionBuilderAtEnd(ctx->builder, ifthen->merge_block);
       /* for each variable, update the Phi node with a (variable, block) pair */
       for (i = 0; i < flow->num_variables; i++) {
          assert(*flow->variables[i]);
-         LLVMAddIncoming(ifthen->phi[i], flow->variables[i], &ifthen->false_block, 1);
-
+         LLVMAddIncoming(ifthen->phi[i], flow->variables[i], &curBlock, 1);
          /* replace the variable ref with the phi function */
          *flow->variables[i] = ifthen->phi[i];
       }
@@ -742,15 +747,18 @@ lp_build_endif(struct lp_build_if_state *ctx)
                       ifthen->true_block, ifthen->merge_block);
    }
 
-   /* Append an unconditional Br(anch) instruction on the true_block */
-   LLVMPositionBuilderAtEnd(ctx->builder, ifthen->true_block);
-   LLVMBuildBr(ctx->builder, ifthen->merge_block);
+   /* Insert branch from end of true_block to merge_block */
    if (ifthen->false_block) {
-      /* Append an unconditional Br(anch) instruction on the false_block */
-      LLVMPositionBuilderAtEnd(ctx->builder, ifthen->false_block);
+      /* Append an unconditional Br(anch) instruction on the true_block */
+      LLVMPositionBuilderAtEnd(ctx->builder, ifthen->true_block);
       LLVMBuildBr(ctx->builder, ifthen->merge_block);
    }
-
+   else {
+      /* No else clause.
+       * Note that we've already inserted the branch at the end of
+       * true_block.  See the very first LLVMBuildBr() call in this function.
+       */
+   }
 
    /* Resume building code at end of the ifthen->merge_block */
    LLVMPositionBuilderAtEnd(ctx->builder, ifthen->merge_block);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.h b/src/gallium/auxiliary/gallivm/lp_bld_flow.h
index 4c225a0d4f..c2b50e1b60 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.h
@@ -35,7 +35,7 @@
 #define LP_BLD_FLOW_H
 
 
-#include <llvm-c/Core.h>  
+#include "gallivm/lp_bld.h"
 
 
 struct lp_type;
@@ -145,7 +145,9 @@ lp_build_else(struct lp_build_if_state *ctx);
 
 void
 lp_build_endif(struct lp_build_if_state *ctx);
-              
+
+LLVMBasicBlockRef
+lp_build_insert_new_block(LLVMBuilderRef builder, const char *name);
 
 
 #endif /* !LP_BLD_FLOW_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h b/src/gallium/auxiliary/gallivm/lp_bld_format.h
index 970bee379f..73ab6de3f2 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h
@@ -34,7 +34,7 @@
  * Pixel format helpers.
  */
 
-#include <llvm-c/Core.h>  
+#include "gallivm/lp_bld.h"
 
 #include "pipe/p_format.h"
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index abb27e4c32..45ee4b12ce 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -114,10 +114,10 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
       case UTIL_FORMAT_TYPE_UNSIGNED:
          if(type.floating) {
             if(start)
-               input = LLVMBuildLShr(builder, input, lp_build_int_const_scalar(type, start), "");
+               input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(type, start), "");
             if(stop < format_desc->block.bits) {
                unsigned mask = ((unsigned long long)1 << width) - 1;
-               input = LLVMBuildAnd(builder, input, lp_build_int_const_scalar(type, mask), "");
+               input = LLVMBuildAnd(builder, input, lp_build_const_int_vec(type, mask), "");
             }
 
             if(format_desc->channel[chan].normalized)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_interp.c b/src/gallium/auxiliary/gallivm/lp_bld_interp.c
index 2fc894017d..09efb16121 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_interp.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_interp.c
@@ -289,17 +289,17 @@ pos_update(struct lp_build_interp_soa_context *bld, int quad_index)
       /* top-right or bottom-right quad in block */
       /* build x += xstep */
       x = lp_build_add(&bld->base, x,
-                       lp_build_const_scalar(bld->base.type, xstep));
+                       lp_build_const_vec(bld->base.type, xstep));
    }
 
    if (quad_index == 2) {
       /* bottom-left quad in block */
       /* build y += ystep */
       y = lp_build_add(&bld->base, y,
-                       lp_build_const_scalar(bld->base.type, ystep));
+                       lp_build_const_vec(bld->base.type, ystep));
       /* build x -= xstep */
       x = lp_build_sub(&bld->base, x,
-                       lp_build_const_scalar(bld->base.type, xstep));
+                       lp_build_const_vec(bld->base.type, xstep));
    }
 
    lp_build_name(x, "pos.x");
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_interp.h b/src/gallium/auxiliary/gallivm/lp_bld_interp.h
index ca958cdf34..a4937bbb04 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_interp.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_interp.h
@@ -41,7 +41,7 @@
 #define LP_BLD_INTERP_H
 
 
-#include <llvm-c/Core.h>
+#include "gallivm/lp_bld.h"
 
 #include "tgsi/tgsi_exec.h"
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_intr.h b/src/gallium/auxiliary/gallivm/lp_bld_intr.h
index f813f27074..977f767322 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_intr.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.h
@@ -37,7 +37,7 @@
 #define LP_BLD_INTR_H
 
 
-#include <llvm-c/Core.h>  
+#include "gallivm/lp_bld.h"
 
 
 /**
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
index 2726747eae..a3b6970116 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -42,6 +42,26 @@
 #include "lp_bld_logic.h"
 
 
+/*
+ * XXX
+ *
+ * Selection with vector conditional like
+ *
+ *    select <4 x i1> %C, %A, %B
+ *
+ * is valid IR (e.g. llvm/test/Assembler/vector-select.ll), but it is not
+ * supported on any backend.
+ *
+ * Expanding the boolean vector to full SIMD register width, as in
+ *
+ *    sext <4 x i1> %C to <4 x i32>
+ *
+ * is valid and supported (e.g., llvm/test/CodeGen/X86/vec_compare.ll), but
+ * it causes assertion failures in LLVM 2.6. It appears to work correctly on 
+ * LLVM 2.7.
+ */
+
+
 /**
  * Build code to compare two values 'a' and 'b' of 'type' using the given func.
  * \param func  one of PIPE_FUNC_x
@@ -54,13 +74,11 @@ lp_build_compare(LLVMBuilderRef builder,
                  LLVMValueRef a,
                  LLVMValueRef b)
 {
-   LLVMTypeRef vec_type = lp_build_vec_type(type);
    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
    LLVMValueRef zeros = LLVMConstNull(int_vec_type);
    LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);
    LLVMValueRef cond;
    LLVMValueRef res;
-   unsigned i;
 
    assert(func >= PIPE_FUNC_NEVER);
    assert(func <= PIPE_FUNC_ALWAYS);
@@ -74,10 +92,12 @@ lp_build_compare(LLVMBuilderRef builder,
 
    /* XXX: It is not clear if we should use the ordered or unordered operators */
 
+#if HAVE_LLVM < 0x0207
 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    if(type.width * type.length == 128) {
       if(type.floating && util_cpu_caps.has_sse) {
          /* float[4] comparison */
+         LLVMTypeRef vec_type = lp_build_vec_type(type);
          LLVMValueRef args[3];
          unsigned cc;
          boolean swap;
@@ -147,6 +167,7 @@ lp_build_compare(LLVMBuilderRef builder,
          const char *pcmpgt;
          LLVMValueRef args[2];
          LLVMValueRef res;
+         LLVMTypeRef vec_type = lp_build_vec_type(type);
 
          switch (type.width) {
          case 8:
@@ -172,7 +193,7 @@ lp_build_compare(LLVMBuilderRef builder,
          if(table[func].gt &&
             ((type.width == 8 && type.sign) ||
              (type.width != 8 && !type.sign))) {
-            LLVMValueRef msb = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+            LLVMValueRef msb = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
             a = LLVMBuildXor(builder, a, msb, "");
             b = LLVMBuildXor(builder, b, msb, "");
          }
@@ -198,8 +219,9 @@ lp_build_compare(LLVMBuilderRef builder,
 
          return res;
       }
-   }
+   } /* if (type.width * type.length == 128) */
 #endif
+#endif /* HAVE_LLVM < 0x0207 */
 
    if(type.floating) {
       LLVMRealPredicate op;
@@ -233,25 +255,33 @@ lp_build_compare(LLVMBuilderRef builder,
          return lp_build_undef(type);
       }
 
-#if 0
-      /* XXX: Although valid IR, no LLVM target currently support this */
+#if HAVE_LLVM >= 0x0207
       cond = LLVMBuildFCmp(builder, op, a, b, "");
-      res = LLVMBuildSelect(builder, cond, ones, zeros, "");
+      res = LLVMBuildSExt(builder, cond, int_vec_type, "");
 #else
-      debug_printf("%s: warning: using slow element-wise vector comparison\n",
-                   __FUNCTION__);
-      res = LLVMGetUndef(int_vec_type);
-      for(i = 0; i < type.length; ++i) {
-         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-         cond = LLVMBuildFCmp(builder, op,
-                              LLVMBuildExtractElement(builder, a, index, ""),
-                              LLVMBuildExtractElement(builder, b, index, ""),
-                              "");
-         cond = LLVMBuildSelect(builder, cond,
-                                LLVMConstExtractElement(ones, index),
-                                LLVMConstExtractElement(zeros, index),
-                                "");
-         res = LLVMBuildInsertElement(builder, res, cond, index, "");
+      if (type.length == 1) {
+         cond = LLVMBuildFCmp(builder, op, a, b, "");
+         res = LLVMBuildSExt(builder, cond, int_vec_type, "");
+      }
+      else {
+         unsigned i;
+
+         res = LLVMGetUndef(int_vec_type);
+
+         debug_printf("%s: warning: using slow element-wise float"
+                      " vector comparison\n", __FUNCTION__);
+         for (i = 0; i < type.length; ++i) {
+            LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+            cond = LLVMBuildFCmp(builder, op,
+                                 LLVMBuildExtractElement(builder, a, index, ""),
+                                 LLVMBuildExtractElement(builder, b, index, ""),
+                                 "");
+            cond = LLVMBuildSelect(builder, cond,
+                                   LLVMConstExtractElement(ones, index),
+                                   LLVMConstExtractElement(zeros, index),
+                                   "");
+            res = LLVMBuildInsertElement(builder, res, cond, index, "");
+         }
       }
 #endif
    }
@@ -281,25 +311,34 @@ lp_build_compare(LLVMBuilderRef builder,
          return lp_build_undef(type);
       }
 
-#if 0
-      /* XXX: Although valid IR, no LLVM target currently support this */
+#if HAVE_LLVM >= 0x0207
       cond = LLVMBuildICmp(builder, op, a, b, "");
-      res = LLVMBuildSelect(builder, cond, ones, zeros, "");
+      res = LLVMBuildSExt(builder, cond, int_vec_type, "");
 #else
-      debug_printf("%s: warning: using slow element-wise int vector comparison\n",
-                   __FUNCTION__);
-      res = LLVMGetUndef(int_vec_type);
-      for(i = 0; i < type.length; ++i) {
-         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-         cond = LLVMBuildICmp(builder, op,
-                              LLVMBuildExtractElement(builder, a, index, ""),
-                              LLVMBuildExtractElement(builder, b, index, ""),
-                              "");
-         cond = LLVMBuildSelect(builder, cond,
-                                LLVMConstExtractElement(ones, index),
-                                LLVMConstExtractElement(zeros, index),
-                                "");
-         res = LLVMBuildInsertElement(builder, res, cond, index, "");
+      if (type.length == 1) {
+         cond = LLVMBuildICmp(builder, op, a, b, "");
+         res = LLVMBuildSExt(builder, cond, int_vec_type, "");
+      }
+      else {
+         unsigned i;
+
+         res = LLVMGetUndef(int_vec_type);
+
+         debug_printf("%s: warning: using slow element-wise int"
+                      " vector comparison\n", __FUNCTION__);
+
+         for(i = 0; i < type.length; ++i) {
+            LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+            cond = LLVMBuildICmp(builder, op,
+                                 LLVMBuildExtractElement(builder, a, index, ""),
+                                 LLVMBuildExtractElement(builder, b, index, ""),
+                                 "");
+            cond = LLVMBuildSelect(builder, cond,
+                                   LLVMConstExtractElement(ones, index),
+                                   LLVMConstExtractElement(zeros, index),
+                                   "");
+            res = LLVMBuildInsertElement(builder, res, cond, index, "");
+         }
       }
 #endif
    }
@@ -326,6 +365,8 @@ lp_build_cmp(struct lp_build_context *bld,
 
 /**
  * Return mask ? a : b;
+ *
+ * mask is a bitwise mask, composed of 0 or ~0 for each element.
  */
 LLVMValueRef
 lp_build_select(struct lp_build_context *bld,
@@ -339,26 +380,32 @@ lp_build_select(struct lp_build_context *bld,
    if(a == b)
       return a;
 
-   if(type.floating) {
-      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
-      a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
-      b = LLVMBuildBitCast(bld->builder, b, int_vec_type, "");
+   if (type.length == 1) {
+      mask = LLVMBuildTrunc(bld->builder, mask, LLVMInt1Type(), "");
+      res = LLVMBuildSelect(bld->builder, mask, a, b, "");
    }
+   else {
+      if(type.floating) {
+         LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+         a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+         b = LLVMBuildBitCast(bld->builder, b, int_vec_type, "");
+      }
 
-   a = LLVMBuildAnd(bld->builder, a, mask, "");
+      a = LLVMBuildAnd(bld->builder, a, mask, "");
 
-   /* This often gets translated to PANDN, but sometimes the NOT is
-    * pre-computed and stored in another constant. The best strategy depends
-    * on available registers, so it is not a big deal -- hopefully LLVM does
-    * the right decision attending the rest of the program.
-    */
-   b = LLVMBuildAnd(bld->builder, b, LLVMBuildNot(bld->builder, mask, ""), "");
+      /* This often gets translated to PANDN, but sometimes the NOT is
+       * pre-computed and stored in another constant. The best strategy depends
+       * on available registers, so it is not a big deal -- hopefully LLVM makes
+       * the right decision given the rest of the program.
+       */
+      b = LLVMBuildAnd(bld->builder, b, LLVMBuildNot(bld->builder, mask, ""), "");
 
-   res = LLVMBuildOr(bld->builder, a, b, "");
+      res = LLVMBuildOr(bld->builder, a, b, "");
 
-   if(type.floating) {
-      LLVMTypeRef vec_type = lp_build_vec_type(type);
-      res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
+      if(type.floating) {
+         LLVMTypeRef vec_type = lp_build_vec_type(type);
+         res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
+      }
    }
 
    return res;
@@ -436,3 +483,13 @@ lp_build_alloca(struct lp_build_context *bld)
       return LLVMBuildAlloca(bld->builder, lp_build_elem_type(type), "");
    }
 }
+
+
+/** Return (a & ~b) */
+LLVMValueRef
+lp_build_andc(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b)
+{
+   b = LLVMBuildNot(bld->builder, b, "");
+   b = LLVMBuildAnd(bld->builder, a, b, "");
+   return b;
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.h b/src/gallium/auxiliary/gallivm/lp_bld_logic.h
index a399ebf39e..00a8c75019 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.h
@@ -37,7 +37,7 @@
 #define LP_BLD_LOGIC_H
 
 
-#include <llvm-c/Core.h>  
+#include "gallivm/lp_bld.h"
 
 #include "pipe/p_defines.h" /* For PIPE_FUNC_xxx */
 
@@ -79,4 +79,9 @@ lp_build_select_aos(struct lp_build_context *bld,
 LLVMValueRef
 lp_build_alloca(struct lp_build_context *bld);
 
+
+LLVMValueRef
+lp_build_andc(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b);
+
+
 #endif /* !LP_BLD_LOGIC_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
index bc360ad77a..2daa8a3b58 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@@ -164,7 +164,7 @@ lp_build_unpack2(LLVMBuilderRef builder,
 
    if(dst_type.sign && src_type.sign) {
       /* Replicate the sign bit in the most significant bits */
-      msb = LLVMBuildAShr(builder, src, lp_build_int_const_scalar(src_type, src_type.width - 1), "");
+      msb = LLVMBuildAShr(builder, src, lp_build_const_int_vec(src_type, src_type.width - 1), "");
    }
    else
       /* Most significant bits always zero */
@@ -256,7 +256,9 @@ lp_build_pack2(LLVMBuilderRef builder,
                LLVMValueRef lo,
                LLVMValueRef hi)
 {
+#if HAVE_LLVM < 0x0207
    LLVMTypeRef src_vec_type = lp_build_vec_type(src_type);
+#endif
    LLVMTypeRef dst_vec_type = lp_build_vec_type(dst_type);
    LLVMValueRef shuffle;
    LLVMValueRef res;
@@ -272,11 +274,14 @@ lp_build_pack2(LLVMBuilderRef builder,
       switch(src_type.width) {
       case 32:
          if(dst_type.sign) {
+#if HAVE_LLVM >= 0x0207
+            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", dst_vec_type, lo, hi);
+#else
             res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", src_vec_type, lo, hi);
+#endif
          }
          else {
             if (util_cpu_caps.has_sse4_1) {
-               /* PACKUSDW is the only instrinsic with a consistent signature */
                return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi);
             }
             else {
@@ -288,9 +293,17 @@ lp_build_pack2(LLVMBuilderRef builder,
 
       case 16:
          if(dst_type.sign)
+#if HAVE_LLVM >= 0x0207
+            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", dst_vec_type, lo, hi);
+#else
             res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", src_vec_type, lo, hi);
+#endif
          else
+#if HAVE_LLVM >= 0x0207
+            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", dst_vec_type, lo, hi);
+#else
             res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", src_vec_type, lo, hi);
+#endif
          break;
 
       default:
@@ -348,7 +361,7 @@ lp_build_packs2(LLVMBuilderRef builder,
    if(clamp) {
       struct lp_build_context bld;
       unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
-      LLVMValueRef dst_max = lp_build_int_const_scalar(src_type, ((unsigned long long)1 << dst_bits) - 1);
+      LLVMValueRef dst_max = lp_build_const_int_vec(src_type, ((unsigned long long)1 << dst_bits) - 1);
       lp_build_context_init(&bld, builder, src_type);
       lo = lp_build_min(&bld, lo, dst_max);
       hi = lp_build_min(&bld, hi, dst_max);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
index fb2a34984a..41adeed220 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
@@ -37,7 +37,7 @@
 #define LP_BLD_PACK_H
 
 
-#include <llvm-c/Core.h>  
+#include "gallivm/lp_bld.h"
 
 
 struct lp_type;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 6a026e468e..bb76ad4c6b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -62,6 +62,18 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
    if(!sampler)
       return;
 
+   /*
+    * We don't copy sampler state over unless it is actually enabled, to avoid
+    * spurious recompiles, as the sampler static state is part of the shader
+    * key.
+    *
+    * Ideally the state tracker or cso_cache module would make all state
+    * canonical, but until that happens it's better to be safe than sorry here.
+    *
+    * XXX: Actually there's much more that can be done here, especially
+    * regarding 1D/2D/3D/CUBE textures, wrap modes, etc.
+    */
+
    state->format            = texture->format;
    state->target            = texture->target;
    state->pot_width         = util_is_pot(texture->width0);
@@ -72,10 +84,18 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
    state->wrap_t            = sampler->wrap_t;
    state->wrap_r            = sampler->wrap_r;
    state->min_img_filter    = sampler->min_img_filter;
-   state->min_mip_filter    = sampler->min_mip_filter;
    state->mag_img_filter    = sampler->mag_img_filter;
+   if (texture->last_level) {
+      state->min_mip_filter = sampler->min_mip_filter;
+   } else {
+      state->min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
+   }
+
    state->compare_mode      = sampler->compare_mode;
-   state->compare_func      = sampler->compare_func;
+   if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) {
+      state->compare_func   = sampler->compare_func;
+   }
+
    state->normalized_coords = sampler->normalized_coords;
    state->lod_bias          = sampler->lod_bias;
    state->min_lod           = sampler->min_lod;
@@ -139,20 +159,21 @@ lp_build_gather(LLVMBuilderRef builder,
 /**
  * Compute the offset of a pixel.
  *
- * x, y, y_stride are vectors
+ * x, y, z, y_stride, z_stride are vectors
  */
 LLVMValueRef
 lp_build_sample_offset(struct lp_build_context *bld,
                        const struct util_format_description *format_desc,
                        LLVMValueRef x,
                        LLVMValueRef y,
+                       LLVMValueRef z,
                        LLVMValueRef y_stride,
-                       LLVMValueRef data_ptr)
+                       LLVMValueRef z_stride)
 {
    LLVMValueRef x_stride;
    LLVMValueRef offset;
 
-   x_stride = lp_build_const_scalar(bld->type, format_desc->block.bits/8);
+   x_stride = lp_build_const_vec(bld->type, format_desc->block.bits/8);
 
    if(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
       LLVMValueRef x_lo, x_hi;
@@ -163,6 +184,10 @@ lp_build_sample_offset(struct lp_build_context *bld,
       LLVMValueRef y_offset_lo, y_offset_hi;
       LLVMValueRef offset_lo, offset_hi;
 
+      /* XXX 1D & 3D addressing not done yet */
+      assert(!z);
+      assert(!z_stride);
+
       x_lo = LLVMBuildAnd(bld->builder, x, bld->one, "");
       y_lo = LLVMBuildAnd(bld->builder, y, bld->one, "");
 
@@ -170,9 +195,9 @@ lp_build_sample_offset(struct lp_build_context *bld,
       y_hi = LLVMBuildLShr(bld->builder, y, bld->one, "");
 
       x_stride_lo = x_stride;
-      y_stride_lo = lp_build_const_scalar(bld->type, 2*format_desc->block.bits/8);
+      y_stride_lo = lp_build_const_vec(bld->type, 2*format_desc->block.bits/8);
 
-      x_stride_hi = lp_build_const_scalar(bld->type, 4*format_desc->block.bits/8);
+      x_stride_hi = lp_build_const_vec(bld->type, 4*format_desc->block.bits/8);
       y_stride_hi = LLVMBuildShl(bld->builder, y_stride, bld->one, "");
 
       x_offset_lo = lp_build_mul(bld, x_lo, x_stride_lo);
@@ -186,13 +211,17 @@ lp_build_sample_offset(struct lp_build_context *bld,
       offset = lp_build_add(bld, offset_hi, offset_lo);
    }
    else {
-      LLVMValueRef x_offset;
-      LLVMValueRef y_offset;
+      offset = lp_build_mul(bld, x, x_stride);
 
-      x_offset = lp_build_mul(bld, x, x_stride);
-      y_offset = lp_build_mul(bld, y, y_stride);
+      if (y && y_stride) {
+         LLVMValueRef y_offset = lp_build_mul(bld, y, y_stride);
+         offset = lp_build_add(bld, offset, y_offset);
+      }
 
-      offset = lp_build_add(bld, x_offset, y_offset);
+      if (z && z_stride) {
+         LLVMValueRef z_offset = lp_build_mul(bld, z, z_stride);
+         offset = lp_build_add(bld, offset, z_offset);
+      }
    }
 
    return offset;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index 5ba0925bb6..92f3c57435 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -36,7 +36,7 @@
 #define LP_BLD_SAMPLE_H
 
 
-#include <llvm-c/Core.h>
+#include "gallivm/lp_bld.h"
 
 struct pipe_texture;
 struct pipe_sampler_state;
@@ -113,9 +113,9 @@ struct lp_sampler_dynamic_state
                   unsigned unit);
 
    LLVMValueRef
-   (*stride)( struct lp_sampler_dynamic_state *state,
-              LLVMBuilderRef builder,
-              unsigned unit);
+   (*row_stride)( struct lp_sampler_dynamic_state *state,
+                  LLVMBuilderRef builder,
+                  unsigned unit);
 
    LLVMValueRef
    (*data_ptr)( struct lp_sampler_dynamic_state *state,
@@ -148,8 +148,9 @@ lp_build_sample_offset(struct lp_build_context *bld,
                        const struct util_format_description *format_desc,
                        LLVMValueRef x,
                        LLVMValueRef y,
+                       LLVMValueRef z,
                        LLVMValueRef y_stride,
-                       LLVMValueRef data_ptr);
+                       LLVMValueRef z_stride);
 
 
 void
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 9058f76c1d..995c016b9d 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -48,6 +48,7 @@
 #include "lp_bld_logic.h"
 #include "lp_bld_swizzle.h"
 #include "lp_bld_pack.h"
+#include "lp_bld_flow.h"
 #include "lp_bld_format.h"
 #include "lp_bld_sample.h"
 
@@ -65,6 +66,14 @@ struct lp_build_sample_context
 
    const struct util_format_description *format_desc;
 
+   /** regular scalar float type */
+   struct lp_type float_type;
+   struct lp_build_context float_bld;
+
+   /** regular scalar int type */
+   struct lp_type int_type;
+   struct lp_build_context int_bld;
+
    /** Incoming coordinates type and build context */
    struct lp_type coord_type;
    struct lp_build_context coord_bld;
@@ -108,9 +117,78 @@ wrap_mode_uses_border_color(unsigned mode)
 }
 
 
+static LLVMValueRef
+lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
+                          LLVMValueRef data_array, LLVMValueRef level)
+{
+   LLVMValueRef indexes[2], data_ptr;
+   indexes[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   indexes[1] = level;
+   data_ptr = LLVMBuildGEP(bld->builder, data_array, indexes, 2, "");
+   data_ptr = LLVMBuildLoad(bld->builder, data_ptr, "");
+   return data_ptr;
+}
+
+
+static LLVMValueRef
+lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld,
+                                LLVMValueRef data_array, int level)
+{
+   LLVMValueRef lvl = LLVMConstInt(LLVMInt32Type(), level, 0);
+   return lp_build_get_mipmap_level(bld, data_array, lvl);
+}
+
+
+/**
+ * Dereference stride_array[mipmap_level] array to get a stride.
+ * Return stride as a vector.
+ */
+static LLVMValueRef
+lp_build_get_level_stride_vec(struct lp_build_sample_context *bld,
+                              LLVMValueRef stride_array, LLVMValueRef level)
+{
+   LLVMValueRef indexes[2], stride;
+   indexes[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   indexes[1] = level;
+   stride = LLVMBuildGEP(bld->builder, stride_array, indexes, 2, "");
+   stride = LLVMBuildLoad(bld->builder, stride, "");
+   stride = lp_build_broadcast_scalar(&bld->int_coord_bld, stride);
+   return stride;
+}
+
+
+/** Dereference stride_array[level] for a constant level to get a stride (as vector). */
+static LLVMValueRef
+lp_build_get_const_level_stride_vec(struct lp_build_sample_context *bld,
+                                    LLVMValueRef stride_array, int level)
+{
+   LLVMValueRef lvl = LLVMConstInt(LLVMInt32Type(), level, 0);
+   return lp_build_get_level_stride_vec(bld, stride_array, lvl);
+}
+
+
+static int
+texture_dims(enum pipe_texture_target tex)
+{
+   switch (tex) {
+   case PIPE_TEXTURE_1D:
+      return 1;
+   case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_CUBE:
+      return 2;
+   case PIPE_TEXTURE_3D:
+      return 3;
+   default:
+      assert(0 && "bad texture target in texture_dims()");
+      return 2;
+   }
+}
+
+
 
 /**
- * Gen code to fetch a texel from a texture at int coords (x, y).
+ * Generate code to fetch a texel from a texture at int coords (x, y, z).
+ * The computation depends on whether the texture is 1D, 2D or 3D.
  * The result, texel, will be:
  *   texel[0] = red values
  *   texel[1] = green values
@@ -121,12 +199,16 @@ static void
 lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
                           LLVMValueRef width,
                           LLVMValueRef height,
+                          LLVMValueRef depth,
                           LLVMValueRef x,
                           LLVMValueRef y,
+                          LLVMValueRef z,
                           LLVMValueRef y_stride,
+                          LLVMValueRef z_stride,
                           LLVMValueRef data_ptr,
                           LLVMValueRef *texel)
 {
+   const int dims = texture_dims(bld->static_state->target);
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
    LLVMValueRef offset;
    LLVMValueRef packed;
@@ -140,7 +222,7 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
       use_border = LLVMBuildOr(bld->builder, b1, b2, "b1_or_b2");
    }
 
-   if (wrap_mode_uses_border_color(bld->static_state->wrap_t)) {
+   if (dims >= 2 && wrap_mode_uses_border_color(bld->static_state->wrap_t)) {
       LLVMValueRef b1, b2;
       b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
       b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
@@ -153,6 +235,19 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
       }
    }
 
+   if (dims == 3 && wrap_mode_uses_border_color(bld->static_state->wrap_r)) {
+      LLVMValueRef b1, b2;
+      b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
+      b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
+      if (use_border) {
+         use_border = LLVMBuildOr(bld->builder, use_border, b1, "ub_or_b1");
+         use_border = LLVMBuildOr(bld->builder, use_border, b2, "ub_or_b2");
+      }
+      else {
+         use_border = LLVMBuildOr(bld->builder, b1, b2, "b1_or_b2");
+      }
+   }
+
    /*
     * Note: if we find an app which frequently samples the texture border
     * we might want to implement a true conditional here to avoid sampling
@@ -168,11 +263,10 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
     * the texel color results with the border color.
     */
 
-   /* convert x,y coords to linear offset from start of texture, in bytes */
+   /* convert x,y,z coords to linear offset from start of texture, in bytes */
    offset = lp_build_sample_offset(&bld->uint_coord_bld,
                                    bld->format_desc,
-                                   x, y, y_stride,
-                                   data_ptr);
+                                   x, y, z, y_stride, z_stride);
 
    assert(bld->format_desc->block.width == 1);
    assert(bld->format_desc->block.height == 1);
@@ -185,6 +279,8 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
                             bld->texel_type.width,
                             data_ptr, offset);
 
+   texel[0] = texel[1] = texel[2] = texel[3] = NULL;
+
    /* convert texels to float rgba */
    lp_build_unpack_rgba_soa(bld->builder,
                             bld->format_desc,
@@ -196,7 +292,7 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
       int chan;
       for (chan = 0; chan < 4; chan++) {
          LLVMValueRef border_chan =
-            lp_build_const_scalar(bld->texel_type,
+            lp_build_const_vec(bld->texel_type,
                                   bld->static_state->border_color[chan]);
          texel[chan] = lp_build_select(&bld->texel_bld, use_border,
                                        border_chan, texel[chan]);
@@ -210,19 +306,22 @@ lp_build_sample_packed(struct lp_build_sample_context *bld,
                        LLVMValueRef x,
                        LLVMValueRef y,
                        LLVMValueRef y_stride,
-                       LLVMValueRef data_ptr)
+                       LLVMValueRef data_array)
 {
    LLVMValueRef offset;
+   LLVMValueRef data_ptr;
 
    offset = lp_build_sample_offset(&bld->uint_coord_bld,
                                    bld->format_desc,
-                                   x, y, y_stride,
-                                   data_ptr);
+                                   x, y, NULL, y_stride, NULL);
 
    assert(bld->format_desc->block.width == 1);
    assert(bld->format_desc->block.height == 1);
    assert(bld->format_desc->block.bits <= bld->texel_type.width);
 
+   /* get pointer to mipmap level 0 data */
+   data_ptr = lp_build_get_const_mipmap_level(bld, data_array, 0);
+
    return lp_build_gather(bld->builder,
                           bld->texel_type.length,
                           bld->format_desc->block.bits,
@@ -358,8 +457,8 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
    struct lp_build_context *coord_bld = &bld->coord_bld;
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
    struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
-   LLVMValueRef two = lp_build_const_scalar(coord_bld->type, 2.0);
-   LLVMValueRef half = lp_build_const_scalar(coord_bld->type, 0.5);
+   LLVMValueRef two = lp_build_const_vec(coord_bld->type, 2.0);
+   LLVMValueRef half = lp_build_const_vec(coord_bld->type, 0.5);
    LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
    LLVMValueRef length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one);
    LLVMValueRef length_f_minus_one = lp_build_sub(coord_bld, length_f, coord_bld->one);
@@ -413,7 +512,7 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
       else {
          LLVMValueRef min, max;
          /* clamp to [0.5, length - 0.5] */
-         min = lp_build_const_scalar(coord_bld->type, 0.5F);
+         min = lp_build_const_vec(coord_bld->type, 0.5F);
          max = lp_build_sub(coord_bld, length_f, min);
          coord = lp_build_clamp(coord_bld, coord, min, max);
       }
@@ -434,7 +533,7 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
          if (bld->static_state->normalized_coords) {
             /* min = -1.0 / (2 * length) = -0.5 / length */
             min = lp_build_mul(coord_bld,
-                               lp_build_const_scalar(coord_bld->type, -0.5F),
+                               lp_build_const_vec(coord_bld->type, -0.5F),
                                lp_build_rcp(coord_bld, length_f));
             /* max = 1.0 - min */
             max = lp_build_sub(coord_bld, coord_bld->one, min);
@@ -446,7 +545,7 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
          }
          else {
             /* clamp to [-0.5, length + 0.5] */
-            min = lp_build_const_scalar(coord_bld->type, -0.5F);
+            min = lp_build_const_vec(coord_bld->type, -0.5F);
             max = lp_build_sub(coord_bld, length_f, min);
             coord = lp_build_clamp(coord_bld, coord, min, max);
             coord = lp_build_sub(coord_bld, coord, half);
@@ -521,7 +620,7 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
          LLVMValueRef min, max;
          /* min = -1.0 / (2 * length) = -0.5 / length */
          min = lp_build_mul(coord_bld,
-                            lp_build_const_scalar(coord_bld->type, -0.5F),
+                            lp_build_const_vec(coord_bld->type, -0.5F),
                             lp_build_rcp(coord_bld, length_f));
          /* max = 1.0 - min */
          max = lp_build_sub(coord_bld, coord_bld->one, min);
@@ -566,7 +665,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
    struct lp_build_context *coord_bld = &bld->coord_bld;
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
    struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
-   LLVMValueRef two = lp_build_const_scalar(coord_bld->type, 2.0);
+   LLVMValueRef two = lp_build_const_vec(coord_bld->type, 2.0);
    LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
    LLVMValueRef length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one);
    LLVMValueRef length_f_minus_one = lp_build_sub(coord_bld, length_f, coord_bld->one);
@@ -609,7 +708,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
          }
          else {
             /* clamp to [0.5, length - 0.5] */
-            min = lp_build_const_scalar(coord_bld->type, 0.5F);
+            min = lp_build_const_vec(coord_bld->type, 0.5F);
             max = lp_build_sub(coord_bld, length_f, min);
          }
          /* coord = clamp(coord, min, max) */
@@ -625,7 +724,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
          if (bld->static_state->normalized_coords) {
             /* min = -1.0 / (2 * length) = -0.5 / length */
             min = lp_build_mul(coord_bld,
-                               lp_build_const_scalar(coord_bld->type, -0.5F),
+                               lp_build_const_vec(coord_bld->type, -0.5F),
                                lp_build_rcp(coord_bld, length_f));
             /* max = length - min */
             max = lp_build_sub(coord_bld, length_f, min);
@@ -634,7 +733,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
          }
          else {
             /* clamp to [-0.5, length + 0.5] */
-            min = lp_build_const_scalar(coord_bld->type, -0.5F);
+            min = lp_build_const_vec(coord_bld->type, -0.5F);
             max = lp_build_sub(coord_bld, length_f, min);
          }
          /* coord = clamp(coord, min, max) */
@@ -711,83 +810,905 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
 
 
 /**
- * Sample 2D texture with nearest filtering.
+ * Codegen equivalent for u_minify(): shrink a base size by a mipmap level.
+ * Return max(1, base_size >> level);
+ */
+static LLVMValueRef
+lp_build_minify(struct lp_build_sample_context *bld,
+                LLVMValueRef base_size,
+                LLVMValueRef level)
+{
+   LLVMValueRef size = LLVMBuildAShr(bld->builder, base_size, level, "minify"); /* base_size >> level */
+   size = lp_build_max(&bld->int_coord_bld, size, bld->int_coord_bld.one); /* never below one */
+   return size;
+}
+
+
+/**
+ * Generate code to compute texture level of detail (lambda).
+ * If min_lod == max_lod that constant is returned; otherwise the result is
+ * log2(rho) + lod_bias clamped to [min_lod, max_lod], where rho is the
+ * largest |coord[1 or 2] - coord[0]| scaled by the matching texture dimension.
+ * \param s, t, r  vectors of texcoord values (t read if dims > 1, r if dims > 2)
+ * \param width, height, depth  scalar int texture width, height, depth
+ * \return  float LOD value
+ */
+static LLVMValueRef
+lp_build_lod_selector(struct lp_build_sample_context *bld,
+                      LLVMValueRef s,
+                      LLVMValueRef t,
+                      LLVMValueRef r,
+                      LLVMValueRef width,
+                      LLVMValueRef height,
+                      LLVMValueRef depth)
+
+{
+   if (bld->static_state->min_lod == bld->static_state->max_lod) {
+      /* User is forcing sampling from a particular mipmap level.
+       * This is hit during mipmap generation.
+       */
+      return LLVMConstReal(LLVMFloatType(), bld->static_state->min_lod);
+   }
+   else {
+      const int dims = texture_dims(bld->static_state->target);
+      struct lp_build_context *float_bld = &bld->float_bld;
+      LLVMValueRef lod_bias = LLVMConstReal(LLVMFloatType(),
+                                            bld->static_state->lod_bias);
+      LLVMValueRef min_lod = LLVMConstReal(LLVMFloatType(),
+                                           bld->static_state->min_lod);
+      LLVMValueRef max_lod = LLVMConstReal(LLVMFloatType(),
+                                           bld->static_state->max_lod);
+
+      LLVMValueRef index0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+      LLVMValueRef index1 = LLVMConstInt(LLVMInt32Type(), 1, 0);
+      LLVMValueRef index2 = LLVMConstInt(LLVMInt32Type(), 2, 0);
+
+      LLVMValueRef s0, s1, s2;
+      LLVMValueRef t0, t1, t2;
+      LLVMValueRef r0, r1, r2;
+      LLVMValueRef dsdx, dsdy, dtdx, dtdy, drdx, drdy;
+      LLVMValueRef rho, lod;
+
+      /*
+       * dsdx = abs(s[1] - s[0]);
+       * dsdy = abs(s[2] - s[0]);
+       * dtdx = abs(t[1] - t[0]);
+       * dtdy = abs(t[2] - t[0]);
+       * drdx = abs(r[1] - r[0]);
+       * drdy = abs(r[2] - r[0]);
+       * XXX we're assuming a four-element quad in 2x2 layout here.
+       */
+      s0 = LLVMBuildExtractElement(bld->builder, s, index0, "s0");
+      s1 = LLVMBuildExtractElement(bld->builder, s, index1, "s1");
+      s2 = LLVMBuildExtractElement(bld->builder, s, index2, "s2");
+      dsdx = LLVMBuildSub(bld->builder, s1, s0, "");
+      dsdx = lp_build_abs(float_bld, dsdx);
+      dsdy = LLVMBuildSub(bld->builder, s2, s0, "");
+      dsdy = lp_build_abs(float_bld, dsdy);
+      if (dims > 1) {
+         t0 = LLVMBuildExtractElement(bld->builder, t, index0, "t0");
+         t1 = LLVMBuildExtractElement(bld->builder, t, index1, "t1");
+         t2 = LLVMBuildExtractElement(bld->builder, t, index2, "t2");
+         dtdx = LLVMBuildSub(bld->builder, t1, t0, "");
+         dtdx = lp_build_abs(float_bld, dtdx);
+         dtdy = LLVMBuildSub(bld->builder, t2, t0, "");
+         dtdy = lp_build_abs(float_bld, dtdy);
+         if (dims > 2) {
+            r0 = LLVMBuildExtractElement(bld->builder, r, index0, "r0");
+            r1 = LLVMBuildExtractElement(bld->builder, r, index1, "r1");
+            r2 = LLVMBuildExtractElement(bld->builder, r, index2, "r2");
+            drdx = LLVMBuildSub(bld->builder, r1, r0, "");
+            drdx = lp_build_abs(float_bld, drdx);
+            drdy = LLVMBuildSub(bld->builder, r2, r0, "");
+            drdy = lp_build_abs(float_bld, drdy);
+         }
+      }
+
+      /* Compute rho = max of all partial derivatives scaled by texture size.
+       * XXX this could be vectorized somewhat
+       */
+      rho = LLVMBuildMul(bld->builder,
+                         lp_build_max(float_bld, dsdx, dsdy),
+                         lp_build_int_to_float(float_bld, width), "");
+      if (dims > 1) {
+         LLVMValueRef max;
+         max = LLVMBuildMul(bld->builder,
+                            lp_build_max(float_bld, dtdx, dtdy),
+                            lp_build_int_to_float(float_bld, height), "");
+         rho = lp_build_max(float_bld, rho, max);
+         if (dims > 2) {
+            max = LLVMBuildMul(bld->builder,
+                               lp_build_max(float_bld, drdx, drdy),
+                               lp_build_int_to_float(float_bld, depth), "");
+            rho = lp_build_max(float_bld, rho, max);
+         }
+      }
+
+      /* compute lod = log2(rho) */
+      lod = lp_build_log2(float_bld, rho);
+
+      /* add lod bias */
+      lod = LLVMBuildAdd(bld->builder, lod, lod_bias, "LOD bias");
+
+      /* clamp lod to [min_lod, max_lod] */
+      lod = lp_build_clamp(float_bld, lod, min_lod, max_lod);
+
+      return lod;
+   }
+}
+
+
+/**
+ * For PIPE_TEX_MIPFILTER_NEAREST, convert float LOD to integer
+ * mipmap level index.
+ * Note: this is all scalar code.
+ * \param lod  scalar float texture level of detail
+ * \param level_out  returns integer 
  */
 static void
-lp_build_sample_2d_nearest_soa(struct lp_build_sample_context *bld,
-                               LLVMValueRef s,
-                               LLVMValueRef t,
-                               LLVMValueRef width,
-                               LLVMValueRef height,
-                               LLVMValueRef stride,
-                               LLVMValueRef data_ptr,
-                               LLVMValueRef *texel)
+lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
+                           unsigned unit,
+                           LLVMValueRef lod,
+                           LLVMValueRef *level_out)
 {
-   LLVMValueRef x, y;
+   struct lp_build_context *float_bld = &bld->float_bld;
+   struct lp_build_context *int_bld = &bld->int_bld;
+   LLVMValueRef last_level, level;
 
-   x = lp_build_sample_wrap_nearest(bld, s, width,
-                                    bld->static_state->pot_width,
-                                    bld->static_state->wrap_s);
-   y = lp_build_sample_wrap_nearest(bld, t, height,
-                                    bld->static_state->pot_height,
-                                    bld->static_state->wrap_t);
+   LLVMValueRef zero = LLVMConstInt(LLVMInt32Type(), 0, 0);
 
-   lp_build_name(x, "tex.x.wrapped");
-   lp_build_name(y, "tex.y.wrapped");
+   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
+                                               bld->builder, unit);
+
+   /* convert float lod to integer */
+   level = lp_build_iround(float_bld, lod);
 
-   lp_build_sample_texel_soa(bld, width, height, x, y, stride, data_ptr, texel);
+   /* clamp level to legal range of levels */
+   *level_out = lp_build_clamp(int_bld, level, zero, last_level);
 }
 
 
 /**
- * Sample 2D texture with bilinear filtering.
+ * For PIPE_TEX_MIPFILTER_LINEAR, convert float LOD to two (adjacent)
+ * integer mipmap level indexes plus a weight (the fraction of lod).  Later,
+ * we'll sample from those two mipmap levels and interpolate between them.
  */
 static void
-lp_build_sample_2d_linear_soa(struct lp_build_sample_context *bld,
+lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
+                           unsigned unit,
+                           LLVMValueRef lod,
+                           LLVMValueRef *level0_out,
+                           LLVMValueRef *level1_out,
+                           LLVMValueRef *weight_out)
+{
+   struct lp_build_context *float_bld = &bld->float_bld;
+   struct lp_build_context *int_bld = &bld->int_bld;
+   LLVMValueRef last_level, level;
+
+   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
+                                               bld->builder, unit);
+
+   /* convert float lod to integer (lp_build_ifloor) */
+   level = lp_build_ifloor(float_bld, lod);
+
+   /* compute level 0 and clamp to legal range of levels */
+   *level0_out = lp_build_clamp(int_bld, level,
+                                int_bld->zero,
+                                last_level);
+   /* compute level 1 = level 0 + 1, but no higher than last_level */
+   *level1_out = lp_build_add(int_bld, *level0_out, int_bld->one);
+   *level1_out = lp_build_min(int_bld, *level1_out, last_level);
+
+   *weight_out = lp_build_fract(float_bld, lod);
+}
+
+
+/**
+ * Generate code to sample a mipmap level with nearest filtering (one texel fetch into colors_out).
+ * If sampling a cube texture, r = cube face in [0,5].
+ */
+static void
+lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
+                              LLVMValueRef width_vec,
+                              LLVMValueRef height_vec,
+                              LLVMValueRef depth_vec,
+                              LLVMValueRef row_stride_vec,
+                              LLVMValueRef img_stride_vec,
+                              LLVMValueRef data_ptr,
                               LLVMValueRef s,
                               LLVMValueRef t,
-                              LLVMValueRef width,
-                              LLVMValueRef height,
-                              LLVMValueRef stride,
-                              LLVMValueRef data_ptr,
-                              LLVMValueRef *texel)
+                              LLVMValueRef r,
+                              LLVMValueRef colors_out[4])
 {
-   LLVMValueRef s_fpart;
-   LLVMValueRef t_fpart;
-   LLVMValueRef x0, x1;
-   LLVMValueRef y0, y1;
+   const int dims = texture_dims(bld->static_state->target);
+   LLVMValueRef x, y, z;
+
+   /*
+    * Compute integer texcoords (z: wrapped depth for 3D, face for cube).
+    */
+   x = lp_build_sample_wrap_nearest(bld, s, width_vec,
+                                    bld->static_state->pot_width,
+                                    bld->static_state->wrap_s);
+   lp_build_name(x, "tex.x.wrapped");
+
+   if (dims >= 2) {
+      y = lp_build_sample_wrap_nearest(bld, t, height_vec,
+                                       bld->static_state->pot_height,
+                                       bld->static_state->wrap_t);
+      lp_build_name(y, "tex.y.wrapped");
+
+      if (dims == 3) {
+         z = lp_build_sample_wrap_nearest(bld, r, depth_vec,
+                                          bld->static_state->pot_depth, /* depth's own POT flag, not height's */
+                                          bld->static_state->wrap_r);
+         lp_build_name(z, "tex.z.wrapped");
+      }
+      else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+         z = r;
+      }
+      else {
+         z = NULL;
+      }
+   }
+   else {
+      y = z = NULL;
+   }
+
+   /*
+    * Get texture colors.
+    */
+   lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+                             x, y, z,
+                             row_stride_vec, img_stride_vec,
+                             data_ptr, colors_out);
+}
+
+
+/**
+ * Generate code to sample a mipmap level with linear filtering (lerp of 2 texels for 1D, 4 for 2D, 8 for 3D).
+ * If sampling a cube texture, r = cube face in [0,5].
+ */
+static void
+lp_build_sample_image_linear(struct lp_build_sample_context *bld,
+                             LLVMValueRef width_vec,
+                             LLVMValueRef height_vec,
+                             LLVMValueRef depth_vec,
+                             LLVMValueRef row_stride_vec,
+                             LLVMValueRef img_stride_vec,
+                             LLVMValueRef data_ptr,
+                             LLVMValueRef s,
+                             LLVMValueRef t,
+                             LLVMValueRef r,
+                             LLVMValueRef colors_out[4])
+{
+   const int dims = texture_dims(bld->static_state->target);
+   LLVMValueRef x0, y0, z0, x1, y1, z1;
+   LLVMValueRef s_fpart, t_fpart, r_fpart;
    LLVMValueRef neighbors[2][2][4];
-   unsigned chan;
+   int chan;
 
-   lp_build_sample_wrap_linear(bld, s, width, bld->static_state->pot_width,
-                               bld->static_state->wrap_s, &x0, &x1, &s_fpart);
-   lp_build_sample_wrap_linear(bld, t, height, bld->static_state->pot_height,
-                               bld->static_state->wrap_t, &y0, &y1, &t_fpart);
+   /*
+    * Compute integer texcoords (two per axis) and the lerp weights.
+    */
+   lp_build_sample_wrap_linear(bld, s, width_vec,
+                               bld->static_state->pot_width,
+                               bld->static_state->wrap_s,
+                               &x0, &x1, &s_fpart);
+   lp_build_name(x0, "tex.x0.wrapped");
+   lp_build_name(x1, "tex.x1.wrapped");
+
+   if (dims >= 2) {
+      lp_build_sample_wrap_linear(bld, t, height_vec,
+                                  bld->static_state->pot_height,
+                                  bld->static_state->wrap_t,
+                                  &y0, &y1, &t_fpart);
+      lp_build_name(y0, "tex.y0.wrapped");
+      lp_build_name(y1, "tex.y1.wrapped");
+
+      if (dims == 3) {
+         lp_build_sample_wrap_linear(bld, r, depth_vec,
+                                     bld->static_state->pot_depth,
+                                     bld->static_state->wrap_r,
+                                     &z0, &z1, &r_fpart);
+         lp_build_name(z0, "tex.z0.wrapped");
+         lp_build_name(z1, "tex.z1.wrapped");
+      }
+      else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+         z0 = z1 = r;  /* cube face */
+         r_fpart = NULL;
+      }
+      else {
+         z0 = z1 = NULL;
+         r_fpart = NULL;
+      }
+   }
+   else {
+      y0 = y1 = t_fpart = NULL;
+      z0 = z1 = r_fpart = NULL;
+   }
 
-   lp_build_sample_texel_soa(bld, width, height, x0, y0, stride, data_ptr, neighbors[0][0]);
-   lp_build_sample_texel_soa(bld, width, height, x1, y0, stride, data_ptr, neighbors[0][1]);
-   lp_build_sample_texel_soa(bld, width, height, x0, y1, stride, data_ptr, neighbors[1][0]);
-   lp_build_sample_texel_soa(bld, width, height, x1, y1, stride, data_ptr, neighbors[1][1]);
+   /*
+    * Get texture colors.
+    */
+   /* get x0/x1 texels at y0, z0 */
+   lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+                             x0, y0, z0,
+                             row_stride_vec, img_stride_vec,
+                             data_ptr, neighbors[0][0]);
+   lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+                             x1, y0, z0,
+                             row_stride_vec, img_stride_vec,
+                             data_ptr, neighbors[0][1]);
+
+   if (dims == 1) {
+      /* Interpolate two samples from 1D image to produce one color */
+      for (chan = 0; chan < 4; chan++) {
+         colors_out[chan] = lp_build_lerp(&bld->texel_bld, s_fpart,
+                                          neighbors[0][0][chan],
+                                          neighbors[0][1][chan]);
+      }
+   }
+   else {
+      /* 2D/3D texture */
+      LLVMValueRef colors0[4];
+
+      /* get x0/x1 texels at y1 */
+      lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+                                x0, y1, z0,
+                                row_stride_vec, img_stride_vec,
+                                data_ptr, neighbors[1][0]);
+      lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+                                x1, y1, z0,
+                                row_stride_vec, img_stride_vec,
+                                data_ptr, neighbors[1][1]);
+
+      /* Bilinear interpolate the four samples from the 2D image / 3D slice */
+      for (chan = 0; chan < 4; chan++) {
+         colors0[chan] = lp_build_lerp_2d(&bld->texel_bld,
+                                          s_fpart, t_fpart,
+                                          neighbors[0][0][chan],
+                                          neighbors[0][1][chan],
+                                          neighbors[1][0][chan],
+                                          neighbors[1][1][chan]);
+      }
 
-   /* TODO: Don't interpolate missing channels */
-   for(chan = 0; chan < 4; ++chan) {
-      texel[chan] = lp_build_lerp_2d(&bld->texel_bld,
-                                     s_fpart, t_fpart,
-                                     neighbors[0][0][chan],
-                                     neighbors[0][1][chan],
-                                     neighbors[1][0][chan],
-                                     neighbors[1][1][chan]);
+      if (dims == 3) {
+         LLVMValueRef neighbors1[2][2][4];
+         LLVMValueRef colors1[4];
+
+         /* get x0/x1/y0/y1 texels at z1 */
+         lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+                                   x0, y0, z1,
+                                   row_stride_vec, img_stride_vec,
+                                   data_ptr, neighbors1[0][0]);
+         lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+                                   x1, y0, z1,
+                                   row_stride_vec, img_stride_vec,
+                                   data_ptr, neighbors1[0][1]);
+         lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+                                   x0, y1, z1,
+                                   row_stride_vec, img_stride_vec,
+                                   data_ptr, neighbors1[1][0]);
+         lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+                                   x1, y1, z1,
+                                   row_stride_vec, img_stride_vec,
+                                   data_ptr, neighbors1[1][1]);
+
+         /* Bilinear interpolate the four samples from the second Z slice */
+         for (chan = 0; chan < 4; chan++) {
+            colors1[chan] = lp_build_lerp_2d(&bld->texel_bld,
+                                             s_fpart, t_fpart,
+                                             neighbors1[0][0][chan],
+                                             neighbors1[0][1][chan],
+                                             neighbors1[1][0][chan],
+                                             neighbors1[1][1][chan]);
+         }
+
+         /* Linearly interpolate the two samples from the two 3D slices */
+         for (chan = 0; chan < 4; chan++) {
+            colors_out[chan] = lp_build_lerp(&bld->texel_bld,
+                                             r_fpart,
+                                             colors0[chan], colors1[chan]);
+         }
+      }
+      else {
+         /* not 3D: the bilinear result from z0 is the final color */
+         for (chan = 0; chan < 4; chan++) {
+            colors_out[chan] = colors0[chan];
+         }
+      }
+   }
+}
+
+
+/** Helper used by lp_build_cube_lookup(): returns -0.5 / abs(coord), using lp_build_rcp() for the division */
+static LLVMValueRef
+lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord)
+{
+   /* ima = -0.5 / abs(coord); */
+   LLVMValueRef negHalf = lp_build_const_vec(coord_bld->type, -0.5);
+   LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
+   LLVMValueRef ima = lp_build_mul(coord_bld, negHalf,
+                                   lp_build_rcp(coord_bld, absCoord));
+   return ima;
+}
+
+
+/**
+ * Helper used by lp_build_cube_lookup()
+ * \param sign  scalar +1 or -1, broadcast before use; NULL skips the multiply
+ * \param coord  float vector, negated first when negate_coord == -1
+ * \param ima  float vector
+ */
+static LLVMValueRef
+lp_build_cube_coord(struct lp_build_context *coord_bld,
+                    LLVMValueRef sign, int negate_coord,
+                    LLVMValueRef coord, LLVMValueRef ima)
+{
+   /* return negate(coord) * ima * sign + 0.5; */
+   LLVMValueRef half = lp_build_const_vec(coord_bld->type, 0.5);
+   LLVMValueRef res;
+
+   assert(negate_coord == +1 || negate_coord == -1);
+
+   if (negate_coord == -1) {
+      coord = lp_build_negate(coord_bld, coord);
+   }
+
+   res = lp_build_mul(coord_bld, coord, ima);
+   if (sign) {
+      sign = lp_build_broadcast_scalar(coord_bld, sign); /* scalar -> vector of coord_bld's type */
+      res = lp_build_mul(coord_bld, res, sign);
+   }
+   res = lp_build_add(coord_bld, res, half);
+
+   return res;
+}
+
+
+/** Helper used by lp_build_cube_lookup()
+ * Return (major_coord >= 0) ? pos_face : neg_face; as an i32 select.
+ */
+static LLVMValueRef
+lp_build_cube_face(struct lp_build_sample_context *bld,
+                   LLVMValueRef major_coord,
+                   unsigned pos_face, unsigned neg_face)
+{
+   LLVMValueRef cmp = LLVMBuildFCmp(bld->builder, LLVMRealUGE,
+                                    major_coord,
+                                    bld->float_bld.zero, ""); /* UGE: also true when unordered (NaN) */
+   LLVMValueRef pos = LLVMConstInt(LLVMInt32Type(), pos_face, 0);
+   LLVMValueRef neg = LLVMConstInt(LLVMInt32Type(), neg_face, 0);
+   LLVMValueRef res = LLVMBuildSelect(bld->builder, cmp, pos, neg, "");
+   return res;
+}
+
+
+
+/**
+ * Generate code to do cube face selection and per-face texcoords; one face (from the averaged s,t,r) is chosen for all four pixels.
+ */
+static void
+lp_build_cube_lookup(struct lp_build_sample_context *bld,
+                     LLVMValueRef s,
+                     LLVMValueRef t,
+                     LLVMValueRef r,
+                     LLVMValueRef *face,
+                     LLVMValueRef *face_s,
+                     LLVMValueRef *face_t)
+{
+   struct lp_build_context *float_bld = &bld->float_bld;
+   struct lp_build_context *coord_bld = &bld->coord_bld;
+   LLVMValueRef rx, ry, rz;
+   LLVMValueRef arx, ary, arz;
+   LLVMValueRef c25 = LLVMConstReal(LLVMFloatType(), 0.25);
+   LLVMValueRef arx_ge_ary, arx_ge_arz;
+   LLVMValueRef ary_ge_arx, ary_ge_arz;
+   LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz;
+   LLVMValueRef rx_pos, ry_pos, rz_pos;
+
+   assert(bld->coord_bld.type.length == 4);
+
+   /*
+    * Use the average of the four pixel's texcoords to choose the face.
+    */
+   rx = lp_build_mul(float_bld, c25,
+                     lp_build_sum_vector(&bld->coord_bld, s));
+   ry = lp_build_mul(float_bld, c25,
+                     lp_build_sum_vector(&bld->coord_bld, t));
+   rz = lp_build_mul(float_bld, c25,
+                     lp_build_sum_vector(&bld->coord_bld, r));
+
+   arx = lp_build_abs(float_bld, rx);
+   ary = lp_build_abs(float_bld, ry);
+   arz = lp_build_abs(float_bld, rz);
+
+   /*
+    * Compare sign/magnitude of rx,ry,rz to determine face
+    */
+   arx_ge_ary = LLVMBuildFCmp(bld->builder, LLVMRealUGE, arx, ary, "");
+   arx_ge_arz = LLVMBuildFCmp(bld->builder, LLVMRealUGE, arx, arz, "");
+   ary_ge_arx = LLVMBuildFCmp(bld->builder, LLVMRealUGE, ary, arx, "");
+   ary_ge_arz = LLVMBuildFCmp(bld->builder, LLVMRealUGE, ary, arz, "");
+
+   arx_ge_ary_arz = LLVMBuildAnd(bld->builder, arx_ge_ary, arx_ge_arz, "");
+   ary_ge_arx_arz = LLVMBuildAnd(bld->builder, ary_ge_arx, ary_ge_arz, "");
+
+   rx_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, rx, float_bld->zero, ""); /* NOTE: rx_pos/ry_pos/rz_pos are not referenced below */
+   ry_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, ry, float_bld->zero, "");
+   rz_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, rz, float_bld->zero, "");
+
+   {
+      struct lp_build_flow_context *flow_ctx;
+      struct lp_build_if_state if_ctx;
+
+      flow_ctx = lp_build_flow_create(bld->builder);
+      lp_build_flow_scope_begin(flow_ctx);
+
+      *face_s = bld->coord_bld.undef;
+      *face_t = bld->coord_bld.undef;
+      *face = bld->int_bld.undef;
+
+      lp_build_name(*face_s, "face_s");
+      lp_build_name(*face_t, "face_t");
+      lp_build_name(*face, "face");
+
+      lp_build_flow_scope_declare(flow_ctx, face_s);
+      lp_build_flow_scope_declare(flow_ctx, face_t);
+      lp_build_flow_scope_declare(flow_ctx, face);
+
+      lp_build_if(&if_ctx, flow_ctx, bld->builder, arx_ge_ary_arz);
+      {
+         /* +/- X face */
+         LLVMValueRef sign = lp_build_sgn(float_bld, rx);
+         LLVMValueRef ima = lp_build_cube_ima(coord_bld, s);
+         *face_s = lp_build_cube_coord(coord_bld, sign, +1, r, ima);
+         *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
+         *face = lp_build_cube_face(bld, rx,
+                                    PIPE_TEX_FACE_POS_X,
+                                    PIPE_TEX_FACE_NEG_X);
+      }
+      lp_build_else(&if_ctx);
+      {
+         struct lp_build_flow_context *flow_ctx2;
+         struct lp_build_if_state if_ctx2;
+
+         LLVMValueRef face_s2 = bld->coord_bld.undef;
+         LLVMValueRef face_t2 = bld->coord_bld.undef;
+         LLVMValueRef face2 = bld->int_bld.undef;
+
+         flow_ctx2 = lp_build_flow_create(bld->builder);
+         lp_build_flow_scope_begin(flow_ctx2);
+         lp_build_flow_scope_declare(flow_ctx2, &face_s2);
+         lp_build_flow_scope_declare(flow_ctx2, &face_t2);
+         lp_build_flow_scope_declare(flow_ctx2, &face2);
+
+         ary_ge_arx_arz = LLVMBuildAnd(bld->builder, ary_ge_arx, ary_ge_arz, ""); /* same operands as the earlier computation */
+
+         lp_build_if(&if_ctx2, flow_ctx2, bld->builder, ary_ge_arx_arz);
+         {
+            /* +/- Y face */
+            LLVMValueRef sign = lp_build_sgn(float_bld, ry);
+            LLVMValueRef ima = lp_build_cube_ima(coord_bld, t);
+            face_s2 = lp_build_cube_coord(coord_bld, NULL, -1, s, ima);
+            face_t2 = lp_build_cube_coord(coord_bld, sign, -1, r, ima);
+            face2 = lp_build_cube_face(bld, ry,
+                                       PIPE_TEX_FACE_POS_Y,
+                                       PIPE_TEX_FACE_NEG_Y);
+         }
+         lp_build_else(&if_ctx2);
+         {
+            /* +/- Z face */
+            LLVMValueRef sign = lp_build_sgn(float_bld, rz);
+            LLVMValueRef ima = lp_build_cube_ima(coord_bld, r);
+            face_s2 = lp_build_cube_coord(coord_bld, sign, -1, s, ima);
+            face_t2 = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
+            face2 = lp_build_cube_face(bld, rz,
+                                       PIPE_TEX_FACE_POS_Z,
+                                       PIPE_TEX_FACE_NEG_Z);
+         }
+         lp_build_endif(&if_ctx2);
+         lp_build_flow_scope_end(flow_ctx2);
+         lp_build_flow_destroy(flow_ctx2);
+
+         *face_s = face_s2; /* hand the inner if/else results to the outer scope's variables */
+         *face_t = face_t2;
+         *face = face2;
+      }
+
+      lp_build_endif(&if_ctx);
+      lp_build_flow_scope_end(flow_ctx);
+      lp_build_flow_destroy(flow_ctx);
+   }
+}
+
+
+
+/**
+ * Sample the texture/mipmap using given image filter and mip filter.
+ * data_ptr0 and data_ptr1 point to the two mipmap levels to sample from;
+ * width0/1_vec, height0/1_vec, depth0/1_vec indicate their sizes.  The '1'
+ * values and lod_fpart are only used for PIPE_TEX_MIPFILTER_LINEAR.
+ */
+static void
+lp_build_sample_mipmap(struct lp_build_sample_context *bld,
+                       unsigned img_filter,
+                       unsigned mip_filter,
+                       LLVMValueRef s,
+                       LLVMValueRef t,
+                       LLVMValueRef r,
+                       LLVMValueRef lod_fpart,
+                       LLVMValueRef width0_vec,
+                       LLVMValueRef width1_vec,
+                       LLVMValueRef height0_vec,
+                       LLVMValueRef height1_vec,
+                       LLVMValueRef depth0_vec,
+                       LLVMValueRef depth1_vec,
+                       LLVMValueRef row_stride0_vec,
+                       LLVMValueRef row_stride1_vec,
+                       LLVMValueRef img_stride0_vec,
+                       LLVMValueRef img_stride1_vec,
+                       LLVMValueRef data_ptr0,
+                       LLVMValueRef data_ptr1,
+                       LLVMValueRef *colors_out)
+{
+   LLVMValueRef colors0[4], colors1[4];
+   int chan;
+
+   if (img_filter == PIPE_TEX_FILTER_NEAREST) {
+      lp_build_sample_image_nearest(bld,
+                                    width0_vec, height0_vec, depth0_vec,
+                                    row_stride0_vec, img_stride0_vec,
+                                    data_ptr0, s, t, r, colors0);
+
+      if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+         /* sample the second mipmap level, and interp */
+         lp_build_sample_image_nearest(bld,
+                                       width1_vec, height1_vec, depth1_vec,
+                                       row_stride1_vec, img_stride1_vec,
+                                       data_ptr1, s, t, r, colors1);
+      }
+   }
+   else {
+      assert(img_filter == PIPE_TEX_FILTER_LINEAR);
+
+      lp_build_sample_image_linear(bld,
+                                   width0_vec, height0_vec, depth0_vec,
+                                   row_stride0_vec, img_stride0_vec,
+                                   data_ptr0, s, t, r, colors0);
+
+      if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+         /* sample the second mipmap level, and interp */
+         lp_build_sample_image_linear(bld,
+                                      width1_vec, height1_vec, depth1_vec,
+                                      row_stride1_vec, img_stride1_vec,
+                                      data_ptr1, s, t, r, colors1);
+      }
+   }
+
+   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+      /* interpolate samples from the two mipmap levels, weighted by lod_fpart */
+      for (chan = 0; chan < 4; chan++) {
+         colors_out[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
+                                          colors0[chan], colors1[chan]);
+      }
+   }
+   else {
+      /* use first/only level's colors */
+      for (chan = 0; chan < 4; chan++) {
+         colors_out[chan] = colors0[chan];
+      }
    }
 }
 
 
+
+/**
+ * General texture sampling codegen.
+ * This function handles texture sampling for all texture targets (1D,
+ * 2D, 3D, cube) and all filtering modes.
+ * \param s, t, r  texcoord vectors; for cube maps they are replaced below
+ *                 by the per-face coords and the (broadcast) face index
+ * \param width, height, depth  scalar sizes, used for LOD selection
+ * \param width_vec, height_vec, depth_vec  same sizes broadcast to vectors
+ * \param colors_out  returns the four R, G, B, A result vectors
+ */
+static void
+lp_build_sample_general(struct lp_build_sample_context *bld,
+                        unsigned unit,
+                        LLVMValueRef s,
+                        LLVMValueRef t,
+                        LLVMValueRef r,
+                        LLVMValueRef width,
+                        LLVMValueRef height,
+                        LLVMValueRef depth,
+                        LLVMValueRef width_vec,
+                        LLVMValueRef height_vec,
+                        LLVMValueRef depth_vec,
+                        LLVMValueRef row_stride_array,
+                        LLVMValueRef img_stride_vec,
+                        LLVMValueRef data_array,
+                        LLVMValueRef *colors_out)
+{
+   struct lp_build_context *float_bld = &bld->float_bld;
+   const unsigned mip_filter = bld->static_state->min_mip_filter;
+   const unsigned min_filter = bld->static_state->min_img_filter;
+   const unsigned mag_filter = bld->static_state->mag_img_filter;
+   const int dims = texture_dims(bld->static_state->target);
+   LLVMValueRef lod = NULL, lod_fpart = NULL;  /* set only when needed */
+   LLVMValueRef ilevel0, ilevel1, ilevel0_vec, ilevel1_vec;
+   LLVMValueRef width0_vec = NULL, height0_vec = NULL, depth0_vec = NULL;
+   LLVMValueRef width1_vec = NULL, height1_vec = NULL, depth1_vec = NULL;
+   LLVMValueRef row_stride0_vec = NULL, row_stride1_vec = NULL;
+   LLVMValueRef img_stride0_vec = NULL, img_stride1_vec = NULL;
+   LLVMValueRef data_ptr0, data_ptr1 = NULL;  /* ptr1: linear mip filter only */
+
+   /*
+    * Compute the level of detail (float).
+    */
+   if (min_filter != mag_filter ||
+       mip_filter != PIPE_TEX_MIPFILTER_NONE) {
+      /* Need to compute lod either to choose mipmap levels or to
+       * distinguish between minification/magnification with one mipmap level.
+       */
+      lod = lp_build_lod_selector(bld, s, t, r, width, height, depth);
+   }
+
+   /*
+    * Compute integer mipmap level(s) to fetch texels from.
+    */
+   if (mip_filter == PIPE_TEX_MIPFILTER_NONE) {
+      /* always use mip level 0 */
+      ilevel0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   }
+   else {
+      if (mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
+         lp_build_nearest_mip_level(bld, unit, lod, &ilevel0);
+      }
+      else {
+         assert(mip_filter == PIPE_TEX_MIPFILTER_LINEAR);
+         lp_build_linear_mip_levels(bld, unit, lod, &ilevel0, &ilevel1,
+                                    &lod_fpart);
+         lod_fpart = lp_build_broadcast_scalar(&bld->coord_bld, lod_fpart);
+      }
+   }
+
+   /*
+    * Convert scalar integer mipmap levels into vectors.
+    */
+   ilevel0_vec = lp_build_broadcast_scalar(&bld->int_coord_bld, ilevel0);
+   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR)
+      ilevel1_vec = lp_build_broadcast_scalar(&bld->int_coord_bld, ilevel1);
+
+   /*
+    * Compute width, height at mipmap level 'ilevel0'
+    */
+   width0_vec = lp_build_minify(bld, width_vec, ilevel0_vec);
+   if (dims >= 2) {
+      height0_vec = lp_build_minify(bld, height_vec, ilevel0_vec);
+      row_stride0_vec = lp_build_get_level_stride_vec(bld, row_stride_array,
+                                                      ilevel0);
+      if (dims == 3 || bld->static_state->target == PIPE_TEXTURE_CUBE) {
+         img_stride0_vec = lp_build_mul(&bld->int_coord_bld,
+                                        row_stride0_vec, height0_vec);
+         if (dims == 3) {
+            depth0_vec = lp_build_minify(bld, depth_vec, ilevel0_vec);
+         }
+      }
+   }
+   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+      /* compute width, height, depth for second mipmap level at 'ilevel1' */
+      width1_vec = lp_build_minify(bld, width_vec, ilevel1_vec);
+      if (dims >= 2) {
+         height1_vec = lp_build_minify(bld, height_vec, ilevel1_vec);
+         row_stride1_vec = lp_build_get_level_stride_vec(bld, row_stride_array,
+                                                         ilevel1);
+         if (dims == 3 || bld->static_state->target == PIPE_TEXTURE_CUBE) {
+            img_stride1_vec = lp_build_mul(&bld->int_coord_bld,
+                                           row_stride1_vec, height1_vec);
+            if (dims == 3) {
+               depth1_vec = lp_build_minify(bld, depth_vec, ilevel1_vec);
+            }
+         }
+      }
+   }
+
+   /*
+    * Choose cube face, recompute per-face texcoords.
+    */
+   if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+      LLVMValueRef face, face_s, face_t;
+      lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t);
+      s = face_s; /* vec */
+      t = face_t; /* vec */
+      /* use 'r' to indicate cube face */
+      r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */
+   }
+
+   /*
+    * Get pointer(s) to image data for mipmap level(s).
+    */
+   data_ptr0 = lp_build_get_mipmap_level(bld, data_array, ilevel0);
+   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+      data_ptr1 = lp_build_get_mipmap_level(bld, data_array, ilevel1);
+   }
+
+   /*
+    * Get/interpolate texture colors.
+    */
+   if (min_filter == mag_filter) {
+      /* no need to distinguish between minification and magnification */
+      lp_build_sample_mipmap(bld, min_filter, mip_filter, s, t, r, lod_fpart,
+                             width0_vec, width1_vec,
+                             height0_vec, height1_vec,
+                             depth0_vec, depth1_vec,
+                             row_stride0_vec, row_stride1_vec,
+                             img_stride0_vec, img_stride1_vec,
+                             data_ptr0, data_ptr1,
+                             colors_out);
+   }
+   else {
+      /* Emit conditional to choose min image filter or mag image filter
+       * depending on the lod being >= 0 or < 0, respectively.
+       */
+      struct lp_build_flow_context *flow_ctx;
+      struct lp_build_if_state if_ctx;
+      LLVMValueRef minify;
+
+      flow_ctx = lp_build_flow_create(bld->builder);
+      lp_build_flow_scope_begin(flow_ctx);
+
+      lp_build_flow_scope_declare(flow_ctx, &colors_out[0]);
+      lp_build_flow_scope_declare(flow_ctx, &colors_out[1]);
+      lp_build_flow_scope_declare(flow_ctx, &colors_out[2]);
+      lp_build_flow_scope_declare(flow_ctx, &colors_out[3]);
+
+      /* minify = lod >= 0.0 (UGE is unordered: also true for a NaN lod) */
+      minify = LLVMBuildFCmp(bld->builder, LLVMRealUGE,
+                             lod, float_bld->zero, "");
+
+      lp_build_if(&if_ctx, flow_ctx, bld->builder, minify);
+      {
+         /* Use the minification filter */
+         lp_build_sample_mipmap(bld, min_filter, mip_filter,
+                                s, t, r, lod_fpart,
+                                width0_vec, width1_vec,
+                                height0_vec, height1_vec,
+                                depth0_vec, depth1_vec,
+                                row_stride0_vec, row_stride1_vec,
+                                img_stride0_vec, img_stride1_vec,
+                                data_ptr0, data_ptr1,
+                                colors_out);
+      }
+      lp_build_else(&if_ctx);
+      {
+         /* Use the magnification filter */
+         lp_build_sample_mipmap(bld, mag_filter, mip_filter,
+                                s, t, r, lod_fpart,
+                                width0_vec, width1_vec,
+                                height0_vec, height1_vec,
+                                depth0_vec, depth1_vec,
+                                row_stride0_vec, row_stride1_vec,
+                                img_stride0_vec, img_stride1_vec,
+                                data_ptr0, data_ptr1,
+                                colors_out);
+      }
+      lp_build_endif(&if_ctx);
+
+      lp_build_flow_scope_end(flow_ctx);
+      lp_build_flow_destroy(flow_ctx);
+   }
+}
+
+
+
 static void
 lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder,
                           struct lp_type dst_type,
                           LLVMValueRef packed,
                           LLVMValueRef *rgba)
 {
-   LLVMValueRef mask = lp_build_int_const_scalar(dst_type, 0xff);
+   LLVMValueRef mask = lp_build_const_int_vec(dst_type, 0xff);
    unsigned chan;
 
    /* Decode the input vector components */
@@ -799,7 +1720,7 @@ lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder,
       input = packed;
 
       if(start)
-         input = LLVMBuildLShr(builder, input, lp_build_int_const_scalar(dst_type, start), "");
+         input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(dst_type, start), "");
 
       if(stop < 32)
          input = LLVMBuildAnd(builder, input, mask, "");
@@ -817,8 +1738,8 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
                               LLVMValueRef t,
                               LLVMValueRef width,
                               LLVMValueRef height,
-                              LLVMValueRef stride,
-                              LLVMValueRef data_ptr,
+                              LLVMValueRef stride_array,
+                              LLVMValueRef data_array,
                               LLVMValueRef *texel)
 {
    LLVMBuilderRef builder = bld->builder;
@@ -834,8 +1755,9 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
    LLVMValueRef neighbors_hi[2][2];
    LLVMValueRef packed, packed_lo, packed_hi;
    LLVMValueRef unswizzled[4];
+   LLVMValueRef stride;
 
-   lp_build_context_init(&i32, builder, lp_type_int(32));
+   lp_build_context_init(&i32, builder, lp_type_int_vec(32));
    lp_build_context_init(&h16, builder, lp_type_ufixed(16));
    lp_build_context_init(&u8n, builder, lp_type_unorm(8));
 
@@ -860,17 +1782,17 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
    t = LLVMBuildFPToSI(builder, t, i32_vec_type, "");
 
    /* subtract 0.5 (add -128) */
-   i32_c128 = lp_build_int_const_scalar(i32.type, -128);
+   i32_c128 = lp_build_const_int_vec(i32.type, -128);
    s = LLVMBuildAdd(builder, s, i32_c128, "");
    t = LLVMBuildAdd(builder, t, i32_c128, "");
 
    /* compute floor (shift right 8) */
-   i32_c8 = lp_build_int_const_scalar(i32.type, 8);
+   i32_c8 = lp_build_const_int_vec(i32.type, 8);
    s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
    t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
 
    /* compute fractional part (AND with 0xff) */
-   i32_c255 = lp_build_int_const_scalar(i32.type, 255);
+   i32_c255 = lp_build_const_int_vec(i32.type, 255);
    s_fpart = LLVMBuildAnd(builder, s, i32_c255, "");
    t_fpart = LLVMBuildAnd(builder, t, i32_c255, "");
 
@@ -941,6 +1863,8 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
       t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, shuffle_hi, "");
    }
 
+   stride = lp_build_get_const_level_stride_vec(bld, stride_array, 0);
+
    /*
     * Fetch the pixels as 4 x 32bit (rgba order might differ):
     *
@@ -958,10 +1882,10 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
     * The higher 8 bits of the resulting elements will be zero.
     */
 
-   neighbors[0][0] = lp_build_sample_packed(bld, x0, y0, stride, data_ptr);
-   neighbors[0][1] = lp_build_sample_packed(bld, x1, y0, stride, data_ptr);
-   neighbors[1][0] = lp_build_sample_packed(bld, x0, y1, stride, data_ptr);
-   neighbors[1][1] = lp_build_sample_packed(bld, x1, y1, stride, data_ptr);
+   neighbors[0][0] = lp_build_sample_packed(bld, x0, y0, stride, data_array);
+   neighbors[0][1] = lp_build_sample_packed(bld, x1, y0, stride, data_array);
+   neighbors[1][0] = lp_build_sample_packed(bld, x0, y1, stride, data_array);
+   neighbors[1][1] = lp_build_sample_packed(bld, x1, y1, stride, data_array);
 
    neighbors[0][0] = LLVMBuildBitCast(builder, neighbors[0][0], u8n_vec_type, "");
    neighbors[0][1] = LLVMBuildBitCast(builder, neighbors[0][1], u8n_vec_type, "");
@@ -1035,7 +1959,7 @@ lp_build_sample_compare(struct lp_build_sample_context *bld,
    }
 
    assert(res);
-   res = lp_build_mul(texel_bld, res, lp_build_const_scalar(texel_bld->type, 0.25));
+   res = lp_build_mul(texel_bld, res, lp_build_const_vec(texel_bld->type, 0.25));
 
    /* XXX returning result for default GL_DEPTH_TEXTURE_MODE = GL_LUMINANCE */
    for(chan = 0; chan < 3; ++chan)
@@ -1044,194 +1968,11 @@ lp_build_sample_compare(struct lp_build_sample_context *bld,
 }
 
 
-static int
-texture_dims(enum pipe_texture_target tex)
-{
-   switch (tex) {
-   case PIPE_TEXTURE_1D:
-      return 1;
-   case PIPE_TEXTURE_2D:
-   case PIPE_TEXTURE_CUBE:
-      return 2;
-   case PIPE_TEXTURE_3D:
-      return 3;
-   default:
-      assert(0 && "bad texture target in texture_dims()");
-      return 2;
-   }
-}
-
-
-/**
- * Generate code to compute texture level of detail (lambda).
- * \param s  vector of texcoord s values
- * \param t  vector of texcoord t values
- * \param r  vector of texcoord r values
- * \param width  scalar int texture width
- * \param height  scalar int texture height
- * \param depth  scalar int texture depth
- */
-static LLVMValueRef
-lp_build_lod_selector(struct lp_build_sample_context *bld,
-                      LLVMValueRef s,
-                      LLVMValueRef t,
-                      LLVMValueRef r,
-                      LLVMValueRef width,
-                      LLVMValueRef height,
-                      LLVMValueRef depth)
-
-{
-   const int dims = texture_dims(bld->static_state->target);
-   struct lp_build_context *coord_bld = &bld->coord_bld;
-
-   LLVMValueRef lod_bias = lp_build_const_scalar(bld->coord_bld.type,
-                                                 bld->static_state->lod_bias);
-   LLVMValueRef min_lod = lp_build_const_scalar(bld->coord_bld.type,
-                                                bld->static_state->min_lod);
-   LLVMValueRef max_lod = lp_build_const_scalar(bld->coord_bld.type,
-                                                bld->static_state->max_lod);
-
-   LLVMValueRef index0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
-   LLVMValueRef index1 = LLVMConstInt(LLVMInt32Type(), 1, 0);
-   LLVMValueRef index2 = LLVMConstInt(LLVMInt32Type(), 2, 0);
-
-   LLVMValueRef s0, s1, s2;
-   LLVMValueRef t0, t1, t2;
-   LLVMValueRef r0, r1, r2;
-   LLVMValueRef dsdx, dsdy, dtdx, dtdy, drdx, drdy;
-   LLVMValueRef rho, lod;
-
-   /*
-    * dsdx = abs(s[1] - s[0]);
-    * dsdy = abs(s[2] - s[0]);
-    * dtdx = abs(t[1] - t[0]);
-    * dtdy = abs(t[2] - t[0]);
-    * drdx = abs(r[1] - r[0]);
-    * drdy = abs(r[2] - r[0]);
-    * XXX we're assuming a four-element quad in 2x2 layout here.
-    */
-   s0 = LLVMBuildExtractElement(bld->builder, s, index0, "s0");
-   s1 = LLVMBuildExtractElement(bld->builder, s, index1, "s1");
-   s2 = LLVMBuildExtractElement(bld->builder, s, index2, "s2");
-   dsdx = lp_build_abs(coord_bld, lp_build_sub(coord_bld, s1, s0));
-   dsdy = lp_build_abs(coord_bld, lp_build_sub(coord_bld, s2, s0));
-   if (dims > 1) {
-      t0 = LLVMBuildExtractElement(bld->builder, t, index0, "t0");
-      t1 = LLVMBuildExtractElement(bld->builder, t, index1, "t1");
-      t2 = LLVMBuildExtractElement(bld->builder, t, index2, "t2");
-      dtdx = lp_build_abs(coord_bld, lp_build_sub(coord_bld, t1, t0));
-      dtdy = lp_build_abs(coord_bld, lp_build_sub(coord_bld, t2, t0));
-      if (dims > 2) {
-         r0 = LLVMBuildExtractElement(bld->builder, r, index0, "r0");
-         r1 = LLVMBuildExtractElement(bld->builder, r, index1, "r1");
-         r2 = LLVMBuildExtractElement(bld->builder, r, index2, "r2");
-         drdx = lp_build_abs(coord_bld, lp_build_sub(coord_bld, r1, r0));
-         drdy = lp_build_abs(coord_bld, lp_build_sub(coord_bld, r2, r0));
-      }
-   }
-
-   /* Compute rho = max of all partial derivatives scaled by texture size.
-    * XXX this can be vectorized somewhat
-    */
-   rho = lp_build_mul(coord_bld,
-                       lp_build_max(coord_bld, dsdx, dsdy),
-                       lp_build_int_to_float(coord_bld, width));
-   if (dims > 1) {
-      LLVMValueRef max;
-      max = lp_build_mul(coord_bld,
-                         lp_build_max(coord_bld, dtdx, dtdy),
-                         lp_build_int_to_float(coord_bld, height));
-      rho = lp_build_max(coord_bld, rho, max);
-      if (dims > 2) {
-         max = lp_build_mul(coord_bld,
-                            lp_build_max(coord_bld, drdx, drdy),
-                            lp_build_int_to_float(coord_bld, depth));
-         rho = lp_build_max(coord_bld, rho, max);
-      }
-   }
-
-   /* compute lod = log2(rho) */
-   lod = lp_build_log2(coord_bld, rho);
-
-   /* add lod bias */
-   lod = lp_build_add(coord_bld, lod, lod_bias);
-
-   /* clamp lod */
-   lod = lp_build_clamp(coord_bld, lod, min_lod, max_lod);
-
-   return lod;
-}
-
-
-/**
- * For PIPE_TEX_MIPFILTER_NEAREST, convert float LOD to integer
- * mipmap level index.
- * \param lod  scalar float texture level of detail
- * \param level_out  returns integer 
- */
-static void
-lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
-                           unsigned unit,
-                           LLVMValueRef lod,
-                           LLVMValueRef *level_out)
-{
-   struct lp_build_context *coord_bld = &bld->coord_bld;
-   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
-   LLVMValueRef last_level, level;
-
-   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
-                                               bld->builder, unit);
-
-   /* convert float lod to integer */
-   level = lp_build_iround(coord_bld, lod);
-
-   /* clamp level to legal range of levels */
-   *level_out = lp_build_clamp(int_coord_bld, level,
-                               int_coord_bld->zero,
-                               last_level);
-}
-
-
-/**
- * For PIPE_TEX_MIPFILTER_LINEAR, convert float LOD to integer to
- * two (adjacent) mipmap level indexes.  Later, we'll sample from those
- * two mipmap levels and interpolate between them.
- */
-static void
-lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
-                           unsigned unit,
-                           LLVMValueRef lod,
-                           LLVMValueRef *level0_out,
-                           LLVMValueRef *level1_out,
-                           LLVMValueRef *weight_out)
-{
-   struct lp_build_context *coord_bld = &bld->coord_bld;
-   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
-   LLVMValueRef last_level, level;
-
-   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
-                                               bld->builder, unit);
-
-   /* convert float lod to integer */
-   level = lp_build_ifloor(coord_bld, lod);
-
-   /* compute level 0 and clamp to legal range of levels */
-   *level0_out = lp_build_clamp(int_coord_bld, level,
-                                int_coord_bld->zero,
-                                last_level);
-   /* compute level 1 and clamp to legal range of levels */
-   *level1_out = lp_build_add(int_coord_bld, *level0_out, int_coord_bld->one);
-   *level1_out = lp_build_min(int_coord_bld, *level1_out, int_coord_bld->zero);
-
-   *weight_out = lp_build_fract(coord_bld, lod);
-}
-
-
-
 /**
  * Build texture sampling code.
  * 'texel' will return a vector of four LLVMValueRefs corresponding to
  * R, G, B, A.
+ * \param type  vector float type to use for coords, etc.
  */
 void
 lp_build_sample_soa(LLVMBuilderRef builder,
@@ -1245,10 +1986,11 @@ lp_build_sample_soa(LLVMBuilderRef builder,
                     LLVMValueRef *texel)
 {
    struct lp_build_sample_context bld;
-   LLVMValueRef width;
-   LLVMValueRef height;
-   LLVMValueRef stride;
-   LLVMValueRef data_ptr;
+   LLVMValueRef width, width_vec;
+   LLVMValueRef height, height_vec;
+   LLVMValueRef depth, depth_vec;
+   LLVMValueRef stride_array;
+   LLVMValueRef data_array;
    LLVMValueRef s;
    LLVMValueRef t;
    LLVMValueRef r;
@@ -1256,6 +1998,7 @@ lp_build_sample_soa(LLVMBuilderRef builder,
    (void) lp_build_lod_selector;   /* temporary to silence warning */
    (void) lp_build_nearest_mip_level;
    (void) lp_build_linear_mip_levels;
+   (void) lp_build_minify;
 
    /* Setup our build context */
    memset(&bld, 0, sizeof bld);
@@ -1263,10 +2006,16 @@ lp_build_sample_soa(LLVMBuilderRef builder,
    bld.static_state = static_state;
    bld.dynamic_state = dynamic_state;
    bld.format_desc = util_format_description(static_state->format);
+
+   bld.float_type = lp_type_float(32);
+   bld.int_type = lp_type_int(32);
    bld.coord_type = type;
    bld.uint_coord_type = lp_uint_type(type);
    bld.int_coord_type = lp_int_type(type);
    bld.texel_type = type;
+
+   lp_build_context_init(&bld.float_bld, builder, bld.float_type);
+   lp_build_context_init(&bld.int_bld, builder, bld.int_type);
    lp_build_context_init(&bld.coord_bld, builder, bld.coord_type);
    lp_build_context_init(&bld.uint_coord_bld, builder, bld.uint_coord_type);
    lp_build_context_init(&bld.int_coord_bld, builder, bld.int_coord_type);
@@ -1275,41 +2024,37 @@ lp_build_sample_soa(LLVMBuilderRef builder,
    /* Get the dynamic state */
    width = dynamic_state->width(dynamic_state, builder, unit);
    height = dynamic_state->height(dynamic_state, builder, unit);
-   stride = dynamic_state->stride(dynamic_state, builder, unit);
-   data_ptr = dynamic_state->data_ptr(dynamic_state, builder, unit);
+   depth = dynamic_state->depth(dynamic_state, builder, unit);
+   stride_array = dynamic_state->row_stride(dynamic_state, builder, unit);
+   data_array = dynamic_state->data_ptr(dynamic_state, builder, unit);
+   /* Note that data_array is an array[level] of pointers to texture images */
 
    s = coords[0];
    t = coords[1];
    r = coords[2];
 
-   width = lp_build_broadcast_scalar(&bld.uint_coord_bld, width);
-   height = lp_build_broadcast_scalar(&bld.uint_coord_bld, height);
-   stride = lp_build_broadcast_scalar(&bld.uint_coord_bld, stride);
-
-   if(static_state->target == PIPE_TEXTURE_1D)
-      t = bld.coord_bld.zero;
-
-   switch (static_state->min_img_filter) {
-   case PIPE_TEX_FILTER_NEAREST:
-      lp_build_sample_2d_nearest_soa(&bld, s, t, width, height,
-                                     stride, data_ptr, texel);
-      break;
-   case PIPE_TEX_FILTER_LINEAR:
-      if(lp_format_is_rgba8(bld.format_desc) &&
-         is_simple_wrap_mode(static_state->wrap_s) &&
-         is_simple_wrap_mode(static_state->wrap_t))
-         lp_build_sample_2d_linear_aos(&bld, s, t, width, height,
-                                       stride, data_ptr, texel);
-      else
-         lp_build_sample_2d_linear_soa(&bld, s, t, width, height,
-                                       stride, data_ptr, texel);
-      break;
-   default:
-      assert(0);
+   width_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, width);
+   height_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, height);
+   depth_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, depth);
+
+   if (lp_format_is_rgba8(bld.format_desc) &&
+       static_state->target == PIPE_TEXTURE_2D &&
+       static_state->min_img_filter == PIPE_TEX_FILTER_LINEAR &&
+       static_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR &&
+       static_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
+       is_simple_wrap_mode(static_state->wrap_s) &&
+       is_simple_wrap_mode(static_state->wrap_t)) {
+      /* special case */
+      lp_build_sample_2d_linear_aos(&bld, s, t, width_vec, height_vec,
+                                    stride_array, data_array, texel);
+   }
+   else {
+      lp_build_sample_general(&bld, unit, s, t, r,
+                              width, height, depth,
+                              width_vec, height_vec, depth_vec,
+                              stride_array, NULL, data_array,
+                              texel);
    }
-
-   /* FIXME: respect static_state->min_mip_filter */;
-   /* FIXME: respect static_state->mag_img_filter */;
 
    lp_build_sample_compare(&bld, r, texel);
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_struct.h b/src/gallium/auxiliary/gallivm/lp_bld_struct.h
index 740392f561..147336edb4 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_struct.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_struct.h
@@ -37,7 +37,7 @@
 #define LP_BLD_STRUCT_H
 
 
-#include <llvm-c/Core.h>  
+#include "gallivm/lp_bld.h"
 #include <llvm-c/Target.h>
 
 #include "util/u_debug.h"
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
index 64e81f7b1f..278c838eac 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
@@ -144,9 +144,9 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
 #endif
 
          if(shift > 0)
-            tmp = LLVMBuildLShr(bld->builder, a, lp_build_int_const_scalar(type4, shift*type.width), "");
+            tmp = LLVMBuildLShr(bld->builder, a, lp_build_const_int_vec(type4, shift*type.width), "");
          if(shift < 0)
-            tmp = LLVMBuildShl(bld->builder, a, lp_build_int_const_scalar(type4, -shift*type.width), "");
+            tmp = LLVMBuildShl(bld->builder, a, lp_build_const_int_vec(type4, -shift*type.width), "");
 
          assert(tmp);
          if(tmp)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
index b9472127a6..138ca620e6 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
@@ -37,7 +37,7 @@
 #define LP_BLD_SWIZZLE_H
 
 
-#include <llvm-c/Core.h>  
+#include "gallivm/lp_bld.h"
 
 
 struct lp_type;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
index eddb7a83fa..63b938bfa9 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -35,7 +35,7 @@
 #ifndef LP_BLD_TGSI_H
 #define LP_BLD_TGSI_H
 
-#include <llvm-c/Core.h>
+#include "gallivm/lp_bld.h"
 
 
 struct tgsi_token;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 5f2c2a54ee..f160be878f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -41,6 +41,7 @@
 #include "util/u_debug.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "tgsi/tgsi_dump.h"
 #include "tgsi/tgsi_info.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
@@ -95,6 +96,19 @@ struct lp_exec_mask {
    int cond_stack_size;
    LLVMValueRef cond_mask;
 
+   LLVMValueRef break_stack[LP_TGSI_MAX_NESTING];
+   int break_stack_size;
+   LLVMValueRef break_mask;
+
+   LLVMValueRef cont_stack[LP_TGSI_MAX_NESTING];
+   int cont_stack_size;
+   LLVMValueRef cont_mask;
+
+   LLVMBasicBlockRef loop_stack[LP_TGSI_MAX_NESTING];
+   int loop_stack_size;
+   LLVMBasicBlockRef loop_block;
+
+
    LLVMValueRef exec_mask;
 };
 
@@ -145,15 +159,33 @@ static void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context
    mask->bld = bld;
    mask->has_mask = FALSE;
    mask->cond_stack_size = 0;
+   mask->loop_stack_size = 0;
+   mask->break_stack_size = 0;
+   mask->cont_stack_size = 0;
 
    mask->int_vec_type = lp_build_int_vec_type(mask->bld->type);
 }
 
 static void lp_exec_mask_update(struct lp_exec_mask *mask)
 {
-   mask->exec_mask = mask->cond_mask;
-   if (mask->cond_stack_size > 0)
-      mask->has_mask = TRUE;
+   if (mask->loop_stack_size) {
+      /* Inside a loop the mask must be recomputed at runtime: AND the
+       * cond mask with the continue and break masks. */
+      LLVMValueRef tmp;
+      tmp = LLVMBuildAnd(mask->bld->builder,
+                         mask->cont_mask,
+                         mask->break_mask,
+                         "maskcb");
+      mask->exec_mask = LLVMBuildAnd(mask->bld->builder,
+                                     mask->cond_mask,
+                                     tmp,
+                                     "maskfull");
+   } else
+      mask->exec_mask = mask->cond_mask;
+
+
+   mask->has_mask = (mask->cond_stack_size > 0 ||
+                     mask->loop_stack_size > 0);
 }
 
 static void lp_exec_mask_cond_push(struct lp_exec_mask *mask,
@@ -190,6 +222,89 @@ static void lp_exec_mask_cond_pop(struct lp_exec_mask *mask)
    lp_exec_mask_update(mask);
 }
 
+static void lp_exec_bgnloop(struct lp_exec_mask *mask)
+{
+   /* Open a loop: any mask whose stack is empty starts with all bits set. */
+   if (mask->cont_stack_size == 0)
+      mask->cont_mask = LLVMConstAllOnes(mask->int_vec_type);
+   if (mask->break_stack_size == 0)
+      mask->break_mask = LLVMConstAllOnes(mask->int_vec_type);
+   if (mask->cond_stack_size == 0)
+      mask->cond_mask = LLVMConstAllOnes(mask->int_vec_type);
+   mask->loop_stack[mask->loop_stack_size++] = mask->loop_block;
+   mask->loop_block = lp_build_insert_new_block(mask->bld->builder, "bgnloop");
+   LLVMBuildBr(mask->bld->builder, mask->loop_block);
+   LLVMPositionBuilderAtEnd(mask->bld->builder, mask->loop_block);
+
+   lp_exec_mask_update(mask);
+}
+
+static void lp_exec_break(struct lp_exec_mask *mask)
+{
+   LLVMBuilderRef builder = mask->bld->builder;
+   /* complement of the current execution mask: channels not taking the BRK */
+   LLVMValueRef survivors = LLVMBuildNot(builder, mask->exec_mask, "break");
+   const int had_saved_mask = mask->break_stack_size > 0;
+
+   /* save the previous break mask, then narrow it if one was already saved */
+   mask->break_stack[mask->break_stack_size++] = mask->break_mask;
+   if (had_saved_mask)
+      survivors = LLVMBuildAnd(builder, mask->break_mask,
+                               survivors, "break_full");
+   mask->break_mask = survivors;
+
+   lp_exec_mask_update(mask);
+}
+
+static void lp_exec_continue(struct lp_exec_mask *mask)
+{
+   LLVMBuilderRef builder = mask->bld->builder;
+   /* complement of the current execution mask: channels not taking the CONT */
+   LLVMValueRef remaining = LLVMBuildNot(builder, mask->exec_mask, "");
+   const int had_saved_mask = mask->cont_stack_size > 0;
+
+   /* save the previous continue mask, then narrow it if one was saved */
+   mask->cont_stack[mask->cont_stack_size++] = mask->cont_mask;
+   if (had_saved_mask)
+      remaining = LLVMBuildAnd(builder, mask->cont_mask,
+                               remaining, "");
+   mask->cont_mask = remaining;
+
+   lp_exec_mask_update(mask);
+}
+
+
+static void lp_exec_endloop(struct lp_exec_mask *mask)
+{
+   LLVMBasicBlockRef endloop;
+   LLVMTypeRef reg_type = LLVMIntType(mask->bld->type.width*
+                                      mask->bld->type.length);
+   /* i1cond = (break_mask != 0), i.e. some channel has not hit BRK yet */
+   LLVMValueRef i1cond = LLVMBuildICmp(
+      mask->bld->builder,
+      LLVMIntNE,
+      LLVMBuildBitCast(mask->bld->builder, mask->break_mask, reg_type, ""),
+      LLVMConstNull(reg_type), "");
+
+   endloop = lp_build_insert_new_block(mask->bld->builder, "endloop");
+   /* loop back to the top while i1cond holds, otherwise exit to endloop */
+   LLVMBuildCondBr(mask->bld->builder,
+                   i1cond, mask->loop_block, endloop);
+
+   LLVMPositionBuilderAtEnd(mask->bld->builder, endloop);
+
+   mask->loop_block = mask->loop_stack[--mask->loop_stack_size];
+   /* pop the continue and break masks, each from its own stack */
+   if (mask->cont_stack_size) {
+      mask->cont_mask = mask->cont_stack[--mask->cont_stack_size];
+   }
+   if (mask->break_stack_size) {
+      mask->break_mask = mask->break_stack[--mask->break_stack_size];
+   }
+
+   lp_exec_mask_update(mask);
+}
+
 static void lp_exec_mask_store(struct lp_exec_mask *mask,
                                LLVMValueRef val,
                                LLVMValueRef dst)
@@ -360,7 +475,7 @@ emit_store(
       break;
 
    case TGSI_SAT_MINUS_PLUS_ONE:
-      value = lp_build_max(&bld->base, value, lp_build_const_scalar(bld->base.type, -1.0));
+      value = lp_build_max(&bld->base, value, lp_build_const_vec(bld->base.type, -1.0));
       value = lp_build_min(&bld->base, value, bld->base.one);
       break;
 
@@ -384,6 +499,11 @@ emit_store(
       assert(0);
       break;
 
+   case TGSI_FILE_PREDICATE:
+      /* FIXME */
+      assert(0);
+      break;
+
    default:
       assert( 0 );
    }
@@ -581,6 +701,17 @@ emit_instruction(
    if (indirect_temp_reference(inst))
       return FALSE;
 
+   /*
+    * Stores and write masks are handled in a general fashion after the long
+    * instruction opcode switch statement.
+    *
+    * Although not strictly necessary, we avoid generating instructions for
+    * channels which won't be stored, in cases where that's easy. For some
+    * complex instructions, like texture sampling, it is more convenient to
+    * assume a full writemask and then let LLVM optimization passes eliminate
+    * redundant code.
+    */
+
    assert(info->num_dst <= 1);
    if(info->num_dst) {
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
@@ -865,7 +996,7 @@ emit_instruction(
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
          src2 = emit_fetch( bld, inst, 2, chan_index );
-         tmp1 = lp_build_const_scalar(bld->base.type, 0.5);
+         tmp1 = lp_build_const_vec(bld->base.type, 0.5);
          tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GREATER, src2, tmp1);
          dst0[chan_index] = lp_build_select( &bld->base, tmp0, src0, src1 );
       }
@@ -1126,7 +1257,6 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_TEX:
-      /* XXX what about dst0 writemask? */
       emit_tex( bld, inst, FALSE, FALSE, dst0 );
       break;
 
@@ -1349,14 +1479,15 @@ emit_instruction(
    case TGSI_OPCODE_TXP:
       emit_tex( bld, inst, FALSE, TRUE, dst0 );
       break;
-      
+
    case TGSI_OPCODE_BRK:
-      /* FIXME */
-      return 0;
+      lp_exec_break(&bld->exec_mask);
       break;
 
    case TGSI_OPCODE_IF:
       tmp0 = emit_fetch(bld, inst, 0, CHAN_X);
+      tmp0 = lp_build_cmp(&bld->base, PIPE_FUNC_NOTEQUAL,
+                          tmp0, bld->base.zero);
       lp_exec_mask_cond_push(&bld->exec_mask, tmp0);
       break;
 
@@ -1366,6 +1497,10 @@ emit_instruction(
       return 0;
       break;
 
+   case TGSI_OPCODE_BGNLOOP:
+      lp_exec_bgnloop(&bld->exec_mask);
+      break;
+
    case TGSI_OPCODE_REP:
       /* deprecated */
       assert(0);
@@ -1386,6 +1521,10 @@ emit_instruction(
       return 0;
       break;
 
+   case TGSI_OPCODE_ENDLOOP:
+      lp_exec_endloop(&bld->exec_mask);
+      break;
+
    case TGSI_OPCODE_ENDREP:
       /* deprecated */
       assert(0);
@@ -1485,8 +1624,7 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_CONT:
-      /* FIXME */
-      return 0;
+      lp_exec_continue(&bld->exec_mask);
       break;
 
    case TGSI_OPCODE_EMIT:
@@ -1575,7 +1713,7 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
             assert(num_immediates < LP_MAX_IMMEDIATES);
             for( i = 0; i < size; ++i )
                bld.immediates[num_immediates][i] =
-                  lp_build_const_scalar(type, parse.FullToken.FullImmediate.u[i].Float);
+                  lp_build_const_vec(type, parse.FullToken.FullImmediate.u[i].Float);
             for( i = size; i < 4; ++i )
                bld.immediates[num_immediates][i] = bld.base.undef;
             num_immediates++;
@@ -1589,7 +1727,14 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
          assert( 0 );
       }
    }
-
+   if (0) {
+      LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
+      LLVMValueRef function = LLVMGetBasicBlockParent(block);
+      debug_printf("11111111111111111111111111111 \n");
+      tgsi_dump(tokens, 0);
+      LLVMDumpValue(function);
+      debug_printf("2222222222222222222222222222 \n");
+   }
    tgsi_parse_free( &parse );
 }
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.c b/src/gallium/auxiliary/gallivm/lp_bld_type.c
index c327ba045a..796af88caa 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.c
@@ -58,7 +58,10 @@ LLVMTypeRef
 lp_build_vec_type(struct lp_type type)
 {
    LLVMTypeRef elem_type = lp_build_elem_type(type);
-   return LLVMVectorType(elem_type, type.length);
+   if (type.length == 1)
+      return elem_type;
+   else
+      return LLVMVectorType(elem_type, type.length);
 }
 
 
@@ -115,6 +118,9 @@ lp_check_vec_type(struct lp_type type, LLVMTypeRef vec_type)
    if(!vec_type)
       return FALSE;
 
+   if (type.length == 1)
+      return lp_check_elem_type(type, vec_type);
+
    if(LLVMGetTypeKind(vec_type) != LLVMVectorTypeKind)
       return FALSE;
 
@@ -153,7 +159,10 @@ LLVMTypeRef
 lp_build_int_vec_type(struct lp_type type)
 {
    LLVMTypeRef elem_type = lp_build_int_elem_type(type);
-   return LLVMVectorType(elem_type, type.length);
+   if (type.length == 1)
+      return elem_type;
+   else
+      return LLVMVectorType(elem_type, type.length);
 }
 
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.h b/src/gallium/auxiliary/gallivm/lp_bld_type.h
index 16946cc28a..cd59d2faa6 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.h
@@ -37,9 +37,9 @@
 #define LP_BLD_TYPE_H
 
 
-#include <llvm-c/Core.h>  
+#include "pipe/p_compiler.h"
+#include "gallivm/lp_bld.h"
 
-#include <pipe/p_compiler.h>
 
 
 /**
@@ -103,7 +103,7 @@ struct lp_type {
    unsigned width:14;
 
    /**
-    * Vector length.
+    * Vector length.  If length==1, this is a scalar (float/int) type.
     *
     * width*length should be a power of two greater or equal to eight.
     *
@@ -139,6 +139,7 @@ struct lp_build_context
 };
 
 
+/** Create scalar float type */
 static INLINE struct lp_type
 lp_type_float(unsigned width)
 {
@@ -148,12 +149,29 @@ lp_type_float(unsigned width)
    res_type.floating = TRUE;
    res_type.sign = TRUE;
    res_type.width = width;
+   res_type.length = 1;
+
+   return res_type;
+}
+
+
+/** Create float vector type with LP_NATIVE_VECTOR_WIDTH / width elements */
+static INLINE struct lp_type
+lp_type_float_vec(unsigned width)
+{
+   struct lp_type res_type;
+
+   memset(&res_type, 0, sizeof res_type);
+   res_type.floating = TRUE;
+   res_type.sign = TRUE;
+   res_type.width = width;
    res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
 
    return res_type;
 }
 
 
+/** Create scalar int type */
 static INLINE struct lp_type
 lp_type_int(unsigned width)
 {
@@ -162,12 +180,28 @@ lp_type_int(unsigned width)
    memset(&res_type, 0, sizeof res_type);
    res_type.sign = TRUE;
    res_type.width = width;
+   res_type.length = 1;
+
+   return res_type;
+}
+
+
+/** Create int vector type with LP_NATIVE_VECTOR_WIDTH / width elements */
+static INLINE struct lp_type
+lp_type_int_vec(unsigned width)
+{
+   struct lp_type res_type;
+
+   memset(&res_type, 0, sizeof res_type);
+   res_type.sign = TRUE;
+   res_type.width = width;
    res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
 
    return res_type;
 }
 
 
+/** Create scalar uint type */
 static INLINE struct lp_type
 lp_type_uint(unsigned width)
 {
@@ -175,6 +209,20 @@ lp_type_uint(unsigned width)
 
    memset(&res_type, 0, sizeof res_type);
    res_type.width = width;
+   res_type.length = 1;
+
+   return res_type;
+}
+
+
+/** Create uint vector type with LP_NATIVE_VECTOR_WIDTH / width elements */
+static INLINE struct lp_type
+lp_type_uint_vec(unsigned width)
+{
+   struct lp_type res_type;
+
+   memset(&res_type, 0, sizeof res_type);
+   res_type.width = width;
    res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
 
    return res_type;