10 files changed, 342 insertions, 165 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 7b35dd4bb4..e0d30be98d 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -59,14 +59,6 @@
 #include "lp_bld_arit.h"
 
 
-/*
- * XXX: Increasing eliminates some artifacts, but adds others, most
- * noticeably corruption in the Earth halo in Google Earth.
- */
-#define RCP_NEWTON_STEPS 0
-
-#define RSQRT_NEWTON_STEPS 0
-
 #define EXP_POLY_DEGREE 3
 
 #define LOG_POLY_DEGREE 5
@@ -267,7 +259,7 @@ lp_build_add(struct lp_build_context *bld,
 }
 
 
-/** Return the sum of the elements of a */
+/** Return the scalar sum of the elements of a */
 LLVMValueRef
 lp_build_sum_vector(struct lp_build_context *bld,
                     LLVMValueRef a)
@@ -278,11 +270,9 @@ lp_build_sum_vector(struct lp_build_context *bld,
 
    assert(lp_check_value(type, a));
 
-   if (a == bld->zero)
-      return bld->zero;
-   if (a == bld->undef)
-      return bld->undef;
-   assert(type.length > 1);
+   if (type.length == 1) {
+      return a;
+   }
 
    assert(!bld->type.norm);
 
@@ -546,7 +536,7 @@ lp_build_mul_imm(struct lp_build_context *bld,
    if(b == 2 && bld->type.floating)
       return lp_build_add(bld, a, a);
 
-   if(util_is_pot(b)) {
+   if(util_is_power_of_two(b)) {
       unsigned shift = ffs(b) - 1;
 
       if(bld->type.floating) {
@@ -1266,6 +1256,11 @@ lp_build_sqrt(struct lp_build_context *bld,
  *
  *   x_{i+1} = x_i * (2 - a * x_i)
  *
+ * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
+ * +/-Inf, giving NaN instead.  Certain applications rely on the conformant
+ * behavior, such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the
+ * Earth's halo. It would be necessary to clamp the argument to prevent this.
+ *
  * See also:
  * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
  * - http://softwarecommunity.intel.com/articles/eng/1818.htm
@@ -1306,13 +1301,27 @@ lp_build_rcp(struct lp_build_context *bld,
    if(LLVMIsConstant(a))
       return LLVMConstFDiv(bld->one, a);
 
-   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+   /*
+    * We don't use RCPPS because:
+    * - it only has 10bits of precision
+    * - it doesn't even get the reciprocate of 1.0 exactly
+    * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
+    * - for recent processors the benefit over DIVPS is marginal, a case
+    *   depedent
+    *
+    * We could still use it on certain processors if benchmarks show that the
+    * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
+    * particular uses that require less workarounds.
+    */
+
+   if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+      const unsigned num_iterations = 0;
       LLVMValueRef res;
       unsigned i;
 
       res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);
 
-      for (i = 0; i < RCP_NEWTON_STEPS; ++i) {
+      for (i = 0; i < num_iterations; ++i) {
          res = lp_build_rcp_refine(bld, a, res);
       }
 
@@ -1363,13 +1372,14 @@ lp_build_rsqrt(struct lp_build_context *bld,
 
    assert(type.floating);
 
-   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+   if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+      const unsigned num_iterations = 0;
       LLVMValueRef res;
       unsigned i;
 
       res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
 
-      for (i = 0; i < RSQRT_NEWTON_STEPS; ++i) {
+      for (i = 0; i < num_iterations; ++i) {
          res = lp_build_rsqrt_refine(bld, a, res);
       }
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.c b/src/gallium/auxiliary/gallivm/lp_bld_debug.c
index 39dfc51e50..d3a5afff8c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.c
@@ -46,7 +46,7 @@
 boolean
 lp_check_alignment(const void *ptr, unsigned alignment)
 {
-   assert(util_is_pot(alignment));
+   assert(util_is_power_of_two(alignment));
    return ((uintptr_t)ptr & (alignment - 1)) == 0;
 }
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index 247cb83ce6..92123e09d3 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -388,7 +388,7 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder,
 
    if (format_matches_type(format_desc, type) &&
        format_desc->block.bits <= type.width * 4 &&
-       util_is_pot(format_desc->block.bits)) {
+       util_is_power_of_two(format_desc->block.bits)) {
       LLVMValueRef packed;
 
       /*
@@ -416,7 +416,7 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder,
         format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
        format_desc->block.width == 1 &&
        format_desc->block.height == 1 &&
-       util_is_pot(format_desc->block.bits) &&
+       util_is_power_of_two(format_desc->block.bits) &&
        format_desc->block.bits <= 32 &&
        format_desc->is_bitmask &&
        !format_desc->is_mixed &&
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
index 6d5410d970..48baf7c425 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
@@ -40,6 +40,7 @@
 #include <llvm/ExecutionEngine/ExecutionEngine.h>
 #include <llvm/ExecutionEngine/JITEventListener.h>
 #include <llvm/Support/CommandLine.h>
+#include <llvm/Support/PrettyStackTrace.h>
 
 #include "pipe/p_config.h"
 #include "util/u_debug.h"
@@ -143,7 +144,6 @@ lp_set_target_options(void)
    llvm::UnsafeFPMath = true;
 #endif
 
-#if 0
    /*
     * LLVM will generate MMX instructions for vectors <= 64 bits, leading to
     * innefficient code, and in 32bit systems, to the corruption of the FPU
@@ -152,10 +152,8 @@ lp_set_target_options(void)
     * See also:
     * - http://llvm.org/bugs/show_bug.cgi?id=3287
     * - http://l4.me.uk/post/2009/06/07/llvm-wrinkle-3-configuration-what-configuration/
-    *
-    * XXX: Unfortunately this is not working.
     */
-   static boolean first = FALSE;
+   static boolean first = TRUE;
    if (first) {
       static const char* options[] = {
          "prog",
@@ -164,7 +162,13 @@ lp_set_target_options(void)
       llvm::cl::ParseCommandLineOptions(2, const_cast<char**>(options));
       first = FALSE;
    }
-#endif
+
+   /*
+    * By default LLVM adds a signal handler to output a pretty stack trace.
+    * This signal handler is never removed, causing problems when unloading the
+    * shared object where the gallium driver resides.
+    */
+   llvm::DisablePrettyStackTrace = true;
 }
 
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
index e470082b97..e947b90d16 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
@@ -37,6 +37,8 @@
 #define LP_BLD_PACK_H
 
 
+#include "pipe/p_compiler.h"
+
 #include "gallivm/lp_bld.h"
 
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 0fd014ab9b..259b1142e3 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -82,9 +82,9 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
    state->swizzle_a         = view->swizzle_a;
 
    state->target            = texture->target;
-   state->pot_width         = util_is_pot(texture->width0);
-   state->pot_height        = util_is_pot(texture->height0);
-   state->pot_depth         = util_is_pot(texture->depth0);
+   state->pot_width         = util_is_power_of_two(texture->width0);
+   state->pot_height        = util_is_power_of_two(texture->height0);
+   state->pot_depth         = util_is_power_of_two(texture->depth0);
 
    state->wrap_s            = sampler->wrap_s;
    state->wrap_t            = sampler->wrap_t;
@@ -124,6 +124,52 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
 
 
 /**
+ * Compute the partial offset of a pixel block along an arbitrary axis.
+ *
+ * @param coord   coordinate in pixels
+ * @param stride  number of bytes between successive pixel blocks along the axis
+ * @param block_length  number of pixels in a pixel block along the coordinate
+ *                      axis
+ * @param out_offset    resulting relative offset of the pixel block in bytes
+ * @param out_subcoord  resulting sub-block pixel coordinate
+ */
+void
+lp_build_sample_partial_offset(struct lp_build_context *bld,
+                               unsigned block_length,
+                               LLVMValueRef coord,
+                               LLVMValueRef stride,
+                               LLVMValueRef *out_offset,
+                               LLVMValueRef *out_subcoord)
+{
+   LLVMValueRef offset;
+   LLVMValueRef subcoord;
+
+   if (block_length == 1) {
+      subcoord = bld->zero;
+   }
+   else {
+      /*
+       * Pixel blocks have power of two dimensions. LLVM should convert the
+       * rem/div to bit arithmetic.
+       * TODO: Verify this.
+       */
+
+      LLVMValueRef block_width = lp_build_const_int_vec(bld->type, block_length);
+      subcoord = LLVMBuildURem(bld->builder, coord, block_width, "");
+      coord    = LLVMBuildUDiv(bld->builder, coord, block_width, "");
+   }
+
+   offset = lp_build_mul(bld, coord, stride);
+
+   assert(out_offset);
+   assert(out_subcoord);
+
+   *out_offset = offset;
+   *out_subcoord = subcoord;
+}
+
+
+/**
  * Compute the offset of a pixel block.
  *
  * x, y, z, y_stride, z_stride are vectors, and they refer to pixels.
@@ -144,48 +190,35 @@ lp_build_sample_offset(struct lp_build_context *bld,
 {
    LLVMValueRef x_stride;
    LLVMValueRef offset;
-   LLVMValueRef i;
-   LLVMValueRef j;
-
-   /*
-    * Describe the coordinates in terms of pixel blocks.
-    *
-    * TODO: pixel blocks are power of two. LLVM should convert rem/div to
-    * bit arithmetic. Verify this.
-    */
-
-   if (format_desc->block.width == 1) {
-      i = bld->zero;
-   }
-   else {
-      LLVMValueRef block_width = lp_build_const_int_vec(bld->type, format_desc->block.width);
-      i = LLVMBuildURem(bld->builder, x, block_width, "");
-      x = LLVMBuildUDiv(bld->builder, x, block_width, "");
-   }
-
-   if (format_desc->block.height == 1) {
-      j = bld->zero;
-   }
-   else {
-      LLVMValueRef block_height = lp_build_const_int_vec(bld->type, format_desc->block.height);
-      j = LLVMBuildURem(bld->builder, y, block_height, "");
-      y = LLVMBuildUDiv(bld->builder, y, block_height, "");
-   }
 
    x_stride = lp_build_const_vec(bld->type, format_desc->block.bits/8);
-   offset = lp_build_mul(bld, x, x_stride);
+
+   lp_build_sample_partial_offset(bld,
+                                  format_desc->block.width,
+                                  x, x_stride,
+                                  &offset, out_i);
 
    if (y && y_stride) {
-      LLVMValueRef y_offset = lp_build_mul(bld, y, y_stride);
+      LLVMValueRef y_offset;
+      lp_build_sample_partial_offset(bld,
+                                     format_desc->block.height,
+                                     y, y_stride,
+                                     &y_offset, out_j);
       offset = lp_build_add(bld, offset, y_offset);
    }
+   else {
+      *out_j = bld->zero;
+   }
 
    if (z && z_stride) {
-      LLVMValueRef z_offset = lp_build_mul(bld, z, z_stride);
+      LLVMValueRef z_offset;
+      LLVMValueRef k;
+      lp_build_sample_partial_offset(bld,
+                                     1, /* pixel blocks are always 2D */
+                                     z, z_stride,
+                                     &z_offset, &k);
       offset = lp_build_add(bld, offset, z_offset);
    }
 
    *out_offset = offset;
-   *out_i = i;
-   *out_j = j;
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index 5b8f478094..caafc4eca0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -36,6 +36,8 @@
 #define LP_BLD_SAMPLE_H
 
 
+#include "pipe/p_format.h"
+
 #include "gallivm/lp_bld.h"
 
 struct pipe_resource;
@@ -147,6 +149,15 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
 
 
 void
+lp_build_sample_partial_offset(struct lp_build_context *bld,
+                               unsigned block_length,
+                               LLVMValueRef coord,
+                               LLVMValueRef stride,
+                               LLVMValueRef *out_offset,
+                               LLVMValueRef *out_i);
+
+
+void
 lp_build_sample_offset(struct lp_build_context *bld,
                        const struct util_format_description *format_desc,
                        LLVMValueRef x,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 806c7d56a8..1f39d9c98b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -176,6 +176,7 @@ texture_dims(enum pipe_texture_target tex)
    case PIPE_TEXTURE_1D:
       return 1;
    case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_RECT:
    case PIPE_TEXTURE_CUBE:
       return 2;
    case PIPE_TEXTURE_3D:
@@ -322,59 +323,6 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
 
 
 /**
- * Fetch the texels as <4n x i8> in AoS form.
- */
-static LLVMValueRef
-lp_build_sample_packed(struct lp_build_sample_context *bld,
-                       LLVMValueRef x,
-                       LLVMValueRef y,
-                       LLVMValueRef y_stride,
-                       LLVMValueRef data_array)
-{
-   LLVMValueRef offset, i, j;
-   LLVMValueRef data_ptr;
-   LLVMValueRef res;
-
-   /* convert x,y,z coords to linear offset from start of texture, in bytes */
-   lp_build_sample_offset(&bld->uint_coord_bld,
-                          bld->format_desc,
-                          x, y, NULL, y_stride, NULL,
-                          &offset, &i, &j);
-
-   /* get pointer to mipmap level 0 data */
-   data_ptr = lp_build_get_const_mipmap_level(bld, data_array, 0);
-
-   if (util_format_is_rgba8_variant(bld->format_desc)) {
-      /* Just fetch the data directly without swizzling */
-      assert(bld->format_desc->block.width == 1);
-      assert(bld->format_desc->block.height == 1);
-      assert(bld->format_desc->block.bits <= bld->texel_type.width);
-
-      res = lp_build_gather(bld->builder,
-                            bld->texel_type.length,
-                            bld->format_desc->block.bits,
-                            bld->texel_type.width,
-                            data_ptr, offset);
-   }
-   else {
-      struct lp_type type;
-
-      assert(bld->texel_type.width == 32);
-
-      memset(&type, 0, sizeof type);
-      type.width = 8;
-      type.length = bld->texel_type.length*4;
-      type.norm = TRUE;
-
-      res = lp_build_fetch_rgba_aos(bld->builder, bld->format_desc, type,
-                                    data_ptr, offset, i, j);
-   }
-
-   return res;
-}
-
-
-/**
  * Helper to compute the mirror function for the PIPE_WRAP_MIRROR modes.
  */
 static LLVMValueRef
@@ -408,7 +356,7 @@ lp_build_coord_mirror(struct lp_build_sample_context *bld,
 
 
 /**
- * We only support a few wrap modes in lp_build_sample_wrap_int() at this time.
+ * We only support a few wrap modes in lp_build_sample_wrap_linear_int() at this time.
  * Return whether the given mode is supported by that function.
  */
 static boolean
@@ -430,13 +378,18 @@ is_simple_wrap_mode(unsigned mode)
  * \param length  the texture size along one dimension
  * \param is_pot  if TRUE, length is a power of two
  * \param wrap_mode  one of PIPE_TEX_WRAP_x
+ * \param out_i  resulting sub-block pixel coordinate for coord
  */
-static LLVMValueRef
-lp_build_sample_wrap_int(struct lp_build_sample_context *bld,
-                         LLVMValueRef coord,
-                         LLVMValueRef length,
-                         boolean is_pot,
-                         unsigned wrap_mode)
+static void
+lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
+                                 unsigned block_length,
+                                 LLVMValueRef coord,
+                                 LLVMValueRef length,
+                                 LLVMValueRef stride,
+                                 boolean is_pot,
+                                 unsigned wrap_mode,
+                                 LLVMValueRef *out_offset,
+                                 LLVMValueRef *out_i)
 {
    struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
@@ -469,7 +422,134 @@ lp_build_sample_wrap_int(struct lp_build_sample_context *bld,
       assert(0);
    }
 
-   return coord;
+   lp_build_sample_partial_offset(uint_coord_bld, block_length, coord, stride,
+                                  out_offset, out_i);
+}
+
+
+/**
+ * Build LLVM code for texture wrap mode, for scaled integer texcoords.
+ * \param coord0  the incoming texcoord (s,t,r or q) scaled to the texture size
+ * \param length  the texture size along one dimension
+ * \param stride  pixel stride along the coordinate axis
+ * \param block_length  is the length of the pixel block along the
+ *                      coordinate axis
+ * \param is_pot  if TRUE, length is a power of two
+ * \param wrap_mode  one of PIPE_TEX_WRAP_x
+ * \param offset0  resulting relative offset for coord0
+ * \param offset1  resulting relative offset for coord0 + 1
+ * \param i0  resulting sub-block pixel coordinate for coord0
+ * \param i1  resulting sub-block pixel coordinate for coord0 + 1
+ */
+static void
+lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
+                                unsigned block_length,
+                                LLVMValueRef coord0,
+                                LLVMValueRef length,
+                                LLVMValueRef stride,
+                                boolean is_pot,
+                                unsigned wrap_mode,
+                                LLVMValueRef *offset0,
+                                LLVMValueRef *offset1,
+                                LLVMValueRef *i0,
+                                LLVMValueRef *i1)
+{
+   struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
+   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
+   LLVMValueRef length_minus_one;
+   LLVMValueRef lmask, umask, mask;
+
+   if (block_length != 1) {
+      /*
+       * If the pixel block covers more than one pixel then there is no easy
+       * way to calculate offset1 relative to offset0. Instead, compute them
+       * independently.
+       */
+
+      LLVMValueRef coord1;
+
+      lp_build_sample_wrap_nearest_int(bld,
+                                       block_length,
+                                       coord0,
+                                       length,
+                                       stride,
+                                       is_pot,
+                                       wrap_mode,
+                                       offset0, i0);
+
+      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+
+      lp_build_sample_wrap_nearest_int(bld,
+                                       block_length,
+                                       coord1,
+                                       length,
+                                       stride,
+                                       is_pot,
+                                       wrap_mode,
+                                       offset1, i1);
+
+      return;
+   }
+
+   /*
+    * Scalar pixels -- try to compute offset0 and offset1 with a single stride
+    * multiplication.
+    */
+
+   *i0 = uint_coord_bld->zero;
+   *i1 = uint_coord_bld->zero;
+
+   length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
+
+   switch(wrap_mode) {
+   case PIPE_TEX_WRAP_REPEAT:
+      if (is_pot) {
+         coord0 = LLVMBuildAnd(bld->builder, coord0, length_minus_one, "");
+      }
+      else {
+         /* Signed remainder won't give the right results for negative
+          * dividends but unsigned remainder does.*/
+         coord0 = LLVMBuildURem(bld->builder, coord0, length, "");
+      }
+
+      mask = lp_build_compare(bld->builder, int_coord_bld->type,
+                              PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
+
+      *offset0 = lp_build_mul(uint_coord_bld, coord0, stride);
+      *offset1 = LLVMBuildAnd(bld->builder,
+                              lp_build_add(uint_coord_bld, *offset0, stride),
+                              mask, "");
+      break;
+
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      lmask = lp_build_compare(int_coord_bld->builder, int_coord_bld->type,
+                               PIPE_FUNC_GEQUAL, coord0, int_coord_bld->zero);
+      umask = lp_build_compare(int_coord_bld->builder, int_coord_bld->type,
+                               PIPE_FUNC_LESS, coord0, length_minus_one);
+
+      coord0 = lp_build_select(int_coord_bld, lmask, coord0, int_coord_bld->zero);
+      coord0 = lp_build_select(int_coord_bld, umask, coord0, length_minus_one);
+
+      mask = LLVMBuildAnd(bld->builder, lmask, umask, "");
+
+      *offset0 = lp_build_mul(uint_coord_bld, coord0, stride);
+      *offset1 = lp_build_add(uint_coord_bld,
+                              *offset0,
+                              LLVMBuildAnd(bld->builder, stride, mask, ""));
+      break;
+
+   case PIPE_TEX_WRAP_CLAMP:
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+   default:
+      assert(0);
+      *offset0 = uint_coord_bld->zero;
+      *offset1 = uint_coord_bld->zero;
+      break;
+   }
 }
 
 
@@ -1740,16 +1820,21 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
    LLVMValueRef i32_c8, i32_c128, i32_c255;
    LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi;
    LLVMValueRef t_ipart, t_fpart, t_fpart_lo, t_fpart_hi;
-   LLVMValueRef x0, x1;
-   LLVMValueRef y0, y1;
-   LLVMValueRef neighbors[2][2];
+   LLVMValueRef data_ptr;
+   LLVMValueRef x_stride, y_stride;
+   LLVMValueRef x_offset0, x_offset1;
+   LLVMValueRef y_offset0, y_offset1;
+   LLVMValueRef offset[2][2];
+   LLVMValueRef x_subcoord[2], y_subcoord[2];
    LLVMValueRef neighbors_lo[2][2];
    LLVMValueRef neighbors_hi[2][2];
    LLVMValueRef packed, packed_lo, packed_hi;
    LLVMValueRef unswizzled[4];
-   LLVMValueRef stride;
+   const unsigned level = 0;
+   unsigned i, j;
 
-   assert(bld->static_state->target == PIPE_TEXTURE_2D);
+   assert(bld->static_state->target == PIPE_TEXTURE_2D
+         || bld->static_state->target == PIPE_TEXTURE_RECT);
    assert(bld->static_state->min_img_filter == PIPE_TEX_FILTER_LINEAR);
    assert(bld->static_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR);
    assert(bld->static_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE);
@@ -1793,21 +1878,30 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
    s_fpart = LLVMBuildAnd(builder, s, i32_c255, "");
    t_fpart = LLVMBuildAnd(builder, t, i32_c255, "");
 
-   x0 = s_ipart;
-   y0 = t_ipart;
-
-   x1 = lp_build_add(&bld->int_coord_bld, x0, bld->int_coord_bld.one);
-   y1 = lp_build_add(&bld->int_coord_bld, y0, bld->int_coord_bld.one);
-
-   x0 = lp_build_sample_wrap_int(bld, x0, width,  bld->static_state->pot_width,
-                                 bld->static_state->wrap_s);
-   y0 = lp_build_sample_wrap_int(bld, y0, height, bld->static_state->pot_height,
-                                 bld->static_state->wrap_t);
-
-   x1 = lp_build_sample_wrap_int(bld, x1, width,  bld->static_state->pot_width,
-                                 bld->static_state->wrap_s);
-   y1 = lp_build_sample_wrap_int(bld, y1, height, bld->static_state->pot_height,
-                                 bld->static_state->wrap_t);
+   x_stride = lp_build_const_vec(bld->uint_coord_bld.type,
+                                 bld->format_desc->block.bits/8);
+
+   y_stride = lp_build_get_const_level_stride_vec(bld, stride_array, level);
+
+   lp_build_sample_wrap_linear_int(bld,
+                                   bld->format_desc->block.width,
+                                   s_ipart, width, x_stride,
+                                   bld->static_state->pot_width,
+                                   bld->static_state->wrap_s,
+                                   &x_offset0, &x_offset1,
+                                   &x_subcoord[0], &x_subcoord[1]);
+   lp_build_sample_wrap_linear_int(bld,
+                                   bld->format_desc->block.height,
+                                   t_ipart, height, y_stride,
+                                   bld->static_state->pot_height,
+                                   bld->static_state->wrap_t,
+                                   &y_offset0, &y_offset1,
+                                   &y_subcoord[0], &y_subcoord[1]);
+
+   offset[0][0] = lp_build_add(&bld->uint_coord_bld, x_offset0, y_offset0);
+   offset[0][1] = lp_build_add(&bld->uint_coord_bld, x_offset1, y_offset0);
+   offset[1][0] = lp_build_add(&bld->uint_coord_bld, x_offset0, y_offset1);
+   offset[1][1] = lp_build_add(&bld->uint_coord_bld, x_offset1, y_offset1);
 
    /*
     * Transform 4 x i32 in
@@ -1836,7 +1930,6 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
       LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH];
       LLVMValueRef shuffle_lo;
       LLVMValueRef shuffle_hi;
-      unsigned i, j;
 
       for(j = 0; j < h16.type.length; j += 4) {
 #ifdef PIPE_ARCH_LITTLE_ENDIAN
@@ -1864,7 +1957,10 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
       t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, shuffle_hi, "");
    }
 
-   stride = lp_build_get_const_level_stride_vec(bld, stride_array, 0);
+   /*
+    * get pointer to mipmap level 0 data
+    */
+   data_ptr = lp_build_get_const_mipmap_level(bld, data_array, level);
 
    /*
     * Fetch the pixels as 4 x 32bit (rgba order might differ):
@@ -1883,20 +1979,38 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
     * The higher 8 bits of the resulting elements will be zero.
     */
 
-   neighbors[0][0] = lp_build_sample_packed(bld, x0, y0, stride, data_array);
-   neighbors[0][1] = lp_build_sample_packed(bld, x1, y0, stride, data_array);
-   neighbors[1][0] = lp_build_sample_packed(bld, x0, y1, stride, data_array);
-   neighbors[1][1] = lp_build_sample_packed(bld, x1, y1, stride, data_array);
+   for (j = 0; j < 2; ++j) {
+      for (i = 0; i < 2; ++i) {
+         LLVMValueRef rgba8;
 
-   neighbors[0][0] = LLVMBuildBitCast(builder, neighbors[0][0], u8n_vec_type, "");
-   neighbors[0][1] = LLVMBuildBitCast(builder, neighbors[0][1], u8n_vec_type, "");
-   neighbors[1][0] = LLVMBuildBitCast(builder, neighbors[1][0], u8n_vec_type, "");
-   neighbors[1][1] = LLVMBuildBitCast(builder, neighbors[1][1], u8n_vec_type, "");
+         if (util_format_is_rgba8_variant(bld->format_desc)) {
+            /*
+             * Given the format is a rgba8, just read the pixels as is,
+             * without any swizzling. Swizzling will be done later.
+             */
+            rgba8 = lp_build_gather(bld->builder,
+                                    bld->texel_type.length,
+                                    bld->format_desc->block.bits,
+                                    bld->texel_type.width,
+                                    data_ptr, offset[j][i]);
 
-   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[0][0], &neighbors_lo[0][0], &neighbors_hi[0][0]);
-   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[0][1], &neighbors_lo[0][1], &neighbors_hi[0][1]);
-   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[1][0], &neighbors_lo[1][0], &neighbors_hi[1][0]);
-   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[1][1], &neighbors_lo[1][1], &neighbors_hi[1][1]);
+            rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
+
+         }
+         else {
+            rgba8 = lp_build_fetch_rgba_aos(bld->builder,
+                                            bld->format_desc,
+                                            u8n.type,
+                                            data_ptr, offset[j][i],
+                                            x_subcoord[i],
+                                            y_subcoord[j]);
+         }
+
+         lp_build_unpack2(builder, u8n.type, h16.type,
+                          rgba8,
+                          &neighbors_lo[j][i], &neighbors_hi[j][i]);
+      }
+   }
 
    /*
     * Linear interpolate with 8.8 fixed point.
@@ -2077,7 +2191,8 @@ lp_build_sample_soa(LLVMBuilderRef builder,
    }
    else if (util_format_fits_8unorm(bld.format_desc) &&
             bld.format_desc->nr_channels > 1 &&
-            static_state->target == PIPE_TEXTURE_2D &&
+            (static_state->target == PIPE_TEXTURE_2D ||
+                  static_state->target == PIPE_TEXTURE_RECT) &&
             static_state->min_img_filter == PIPE_TEX_FILTER_LINEAR &&
             static_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR &&
             static_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 0aa64affac..0e07f7f3f3 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -200,8 +200,10 @@ static void lp_exec_mask_cond_push(struct lp_exec_mask *mask,
    }
    mask->cond_stack[mask->cond_stack_size++] = mask->cond_mask;
    assert(LLVMTypeOf(val) == mask->int_vec_type);
-   mask->cond_mask = val;
-
+   mask->cond_mask = LLVMBuildAnd(mask->bld->builder,
+                                  mask->cond_mask,
+                                  val,
+                                  "");
    lp_exec_mask_update(mask);
 }
 
@@ -802,7 +804,7 @@ emit_store(
 
    case TGSI_FILE_PREDICATE:
       lp_exec_mask_store(&bld->exec_mask, pred, value,
-                         bld->preds[index][chan_index]);
+                         bld->preds[reg->Register.Index][chan_index]);
       break;
 
    default:
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.h b/src/gallium/auxiliary/gallivm/lp_bld_type.h
index 3ffe916f8e..fec1d3dfbc 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.h
@@ -128,16 +128,16 @@ struct lp_build_context
     */
    struct lp_type type;
 
-   /** Same as lp_build_undef(type) */
+   /** Same as lp_build_elem_type(type) */
    LLVMTypeRef elem_type;
 
-   /** Same as lp_build_undef(type) */
+   /** Same as lp_build_vec_type(type) */
    LLVMTypeRef vec_type;
 
-   /** Same as lp_build_undef(type) */
+   /** Same as lp_build_int_elem_type(type) */
    LLVMTypeRef int_elem_type;
 
-   /** Same as lp_build_undef(type) */
+   /** Same as lp_build_int_vec_type(type) */
    LLVMTypeRef int_vec_type;
 
    /** Same as lp_build_undef(type) */