61 files changed, 2898 insertions, 2490 deletions
diff --git a/src/gallium/drivers/llvmpipe/.gitignore b/src/gallium/drivers/llvmpipe/.gitignore
new file mode 100644
index 0000000000..257b72d7b2
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/.gitignore
@@ -0,0 +1 @@
+lp_tile_soa.c
diff --git a/src/gallium/drivers/llvmpipe/Makefile b/src/gallium/drivers/llvmpipe/Makefile
index cd7b6356d2..e038a5229e 100644
--- a/src/gallium/drivers/llvmpipe/Makefile
+++ b/src/gallium/drivers/llvmpipe/Makefile
@@ -17,10 +17,13 @@ C_SOURCES = \
 	lp_bld_depth.c \
 	lp_bld_flow.c \
 	lp_bld_format_aos.c \
+	lp_bld_format_query.c \
 	lp_bld_format_soa.c \
 	lp_bld_interp.c \
 	lp_bld_intr.c \
 	lp_bld_logic.c \
+	lp_bld_pack.c \
+	lp_bld_sample.c \
 	lp_bld_sample_soa.c \
 	lp_bld_swizzle.c \
 	lp_bld_struct.c \
@@ -32,7 +35,6 @@ C_SOURCES = \
 	lp_draw_arrays.c \
 	lp_flush.c \
 	lp_jit.c \
-	lp_prim_setup.c \
 	lp_prim_vbuf.c \
 	lp_setup.c \
 	lp_query.c \
@@ -54,4 +56,10 @@ C_SOURCES = \
 	lp_tile_cache.c \
 	lp_tile_soa.c
 
+CPP_SOURCES = \
+	lp_bld_misc.cpp
+
 include ../../Makefile.template
+
+lp_tile_soa.c: lp_tile_soa.py ../../auxiliary/util/u_format_parse.py ../../auxiliary/util/u_format_access.py ../../auxiliary/util/u_format.csv
+	python lp_tile_soa.py ../../auxiliary/util/u_format.csv > $@
diff --git a/src/gallium/drivers/llvmpipe/README b/src/gallium/drivers/llvmpipe/README
index 89d08834a3..0c3f00fd58 100644
--- a/src/gallium/drivers/llvmpipe/README
+++ b/src/gallium/drivers/llvmpipe/README
@@ -51,21 +51,22 @@ Requirements
 
  - Linux
  
- - udis86, http://udis86.sourceforge.net/ . Use my repository, which decodes
-   opcodes not yet supported by upstream.
+ - A x86 or amd64 processor.  64bit mode is preferred.
  
-     git clone git://people.freedesktop.org/~jrfonseca/udis86
-     cd udis86
-     ./configure --with-pic
-     make
-     sudo make install
+   Support for sse2 is strongly encouraged.  Support for ssse3, and sse4.1 will
+   yield the most efficient code.  The less features the CPU has the more
+   likely is that you ran into underperforming, buggy, or incomplete code.  
+   
+   See /proc/cpuinfo to know what your CPU supports.
+ 
+ - LLVM 2.5 or greater. LLVM 2.6 is preferred.
  
- - LLVM 2.5. On Debian based distributions do:
+   On Debian based distributions do:
  
      aptitude install llvm-dev
 
-   There is a typo in one of the llvm-dev 2.5 headers, that causes compilation
-   errors in the debug build:
+   There is a typo in one of the llvm 2.5 headers, that may cause compilation
+   errors. To fix it apply the change:
 
      --- /usr/include/llvm-c/Core.h.orig	2009-08-10 15:38:54.000000000 +0100
      +++ /usr/include/llvm-c/Core.h	2009-08-10 15:38:25.000000000 +0100
@@ -79,12 +80,17 @@ Requirements
           #endif
           return reinterpret_cast<T**>(Vals);
  
- - A x86 or amd64 processor with support for sse2, sse3, and sse4.1 SIMD
-   instructions. This is necessary because we emit several SSE intrinsics for
-   convenience. See /proc/cpuinfo to know what your CPU supports.
- 
- - scons
+ - scons (optional)
 
+ - udis86, http://udis86.sourceforge.net/ (optional):
+ 
+     git clone git://udis86.git.sourceforge.net/gitroot/udis86/udis86
+     cd udis86
+     ./autogen.sh
+     ./configure --with-pic
+     make
+     sudo make install
+ 
 
 Building
 ========
diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript
index f4a9a3b22e..3bd2e70013 100644
--- a/src/gallium/drivers/llvmpipe/SConscript
+++ b/src/gallium/drivers/llvmpipe/SConscript
@@ -9,6 +9,13 @@ if not env.has_key('LLVM_VERSION'):
 
 env.Tool('udis86')
 
+env.CodeGenerate(
+	target = 'lp_tile_soa.c',
+	script = 'lp_tile_soa.py',
+	source = ['#src/gallium/auxiliary/util/u_format.csv'],
+	command = 'python $SCRIPT $SOURCE > $TARGET'
+)
+
 llvmpipe = env.ConvenienceLibrary(
 	target = 'llvmpipe',
 	source = [
@@ -23,9 +30,13 @@ llvmpipe = env.ConvenienceLibrary(
 		'lp_bld_depth.c',
 		'lp_bld_flow.c',
 		'lp_bld_format_aos.c',
+        'lp_bld_format_query.c',
 		'lp_bld_format_soa.c',
 		'lp_bld_interp.c',
 		'lp_bld_intr.c',
+		'lp_bld_misc.cpp',
+        'lp_bld_pack.c',
+        'lp_bld_sample.c',
 		'lp_bld_sample_soa.c',
 		'lp_bld_struct.c',
 		'lp_bld_logic.c',
@@ -38,7 +49,6 @@ llvmpipe = env.ConvenienceLibrary(
 		'lp_draw_arrays.c',
 		'lp_flush.c',
 		'lp_jit.c',
-		'lp_prim_setup.c',
 		'lp_prim_vbuf.c',
 		'lp_setup.c',
 		'lp_query.c',
@@ -68,7 +78,7 @@ env.Prepend(LIBS = [llvmpipe] + auxiliaries)
 
 env.Program(
     target = 'lp_test_format',
-    source = ['lp_test_format.c'],
+    source = ['lp_test_format.c', 'lp_test_main.c'],
 )
 
 env.Program(
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_arit.c b/src/gallium/drivers/llvmpipe/lp_bld_arit.c
index 0b115fc9b0..9c59677a74 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_arit.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_arit.c
@@ -47,12 +47,16 @@
 
 #include "util/u_memory.h"
 #include "util/u_debug.h"
+#include "util/u_math.h"
 #include "util/u_string.h"
+#include "util/u_cpu_detect.h"
 
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
 #include "lp_bld_intr.h"
 #include "lp_bld_logic.h"
+#include "lp_bld_pack.h"
+#include "lp_bld_debug.h"
 #include "lp_bld_arit.h"
 
 
@@ -71,30 +75,28 @@ lp_build_min_simple(struct lp_build_context *bld,
 
    /* TODO: optimize the constant case */
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    if(type.width * type.length == 128) {
       if(type.floating) {
-         if(type.width == 32)
+         if(type.width == 32 && util_cpu_caps.has_sse)
             intrinsic = "llvm.x86.sse.min.ps";
-         if(type.width == 64)
+         if(type.width == 64 && util_cpu_caps.has_sse2)
             intrinsic = "llvm.x86.sse2.min.pd";
       }
       else {
-         if(type.width == 8 && !type.sign)
+         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
             intrinsic = "llvm.x86.sse2.pminu.b";
-         if(type.width == 8 && type.sign)
+         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pminsb";
-         if(type.width == 16 && !type.sign)
+         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pminuw";
-         if(type.width == 16 && type.sign)
+         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
             intrinsic = "llvm.x86.sse2.pmins.w";
-         if(type.width == 32 && !type.sign)
+         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pminud";
-         if(type.width == 32 && type.sign)
+         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pminsd";
       }
    }
-#endif
 
    if(intrinsic)
       return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
@@ -119,30 +121,28 @@ lp_build_max_simple(struct lp_build_context *bld,
 
    /* TODO: optimize the constant case */
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    if(type.width * type.length == 128) {
       if(type.floating) {
-         if(type.width == 32)
+         if(type.width == 32 && util_cpu_caps.has_sse)
             intrinsic = "llvm.x86.sse.max.ps";
-         if(type.width == 64)
+         if(type.width == 64 && util_cpu_caps.has_sse2)
             intrinsic = "llvm.x86.sse2.max.pd";
       }
       else {
-         if(type.width == 8 && !type.sign)
+         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
             intrinsic = "llvm.x86.sse2.pmaxu.b";
-         if(type.width == 8 && type.sign)
+         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pmaxsb";
-         if(type.width == 16 && !type.sign)
+         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pmaxuw";
-         if(type.width == 16 && type.sign)
+         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
             intrinsic = "llvm.x86.sse2.pmaxs.w";
-         if(type.width == 32 && !type.sign)
+         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pmaxud";
-         if(type.width == 32 && type.sign)
+         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pmaxsd";
       }
    }
-#endif
 
    if(intrinsic)
       return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
@@ -204,15 +204,14 @@ lp_build_add(struct lp_build_context *bld,
       if(a == bld->one || b == bld->one)
         return bld->one;
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-      if(type.width * type.length == 128 &&
+      if(util_cpu_caps.has_sse2 &&
+         type.width * type.length == 128 &&
          !type.floating && !type.fixed) {
          if(type.width == 8)
             intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
          if(type.width == 16)
             intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
       }
-#endif
    
       if(intrinsic)
          return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
@@ -257,15 +256,14 @@ lp_build_sub(struct lp_build_context *bld,
       if(b == bld->one)
         return bld->zero;
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-      if(type.width * type.length == 128 &&
+      if(util_cpu_caps.has_sse2 &&
+         type.width * type.length == 128 &&
          !type.floating && !type.fixed) {
          if(type.width == 8)
             intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
          if(type.width == 16)
             intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
       }
-#endif
    
       if(intrinsic)
          return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
@@ -284,45 +282,6 @@ lp_build_sub(struct lp_build_context *bld,
 
 
 /**
- * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
- */
-static LLVMValueRef 
-lp_build_unpack_shuffle(unsigned n, unsigned lo_hi)
-{
-   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
-   unsigned i, j;
-
-   assert(n <= LP_MAX_VECTOR_LENGTH);
-   assert(lo_hi < 2);
-
-   for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
-      elems[i + 0] = LLVMConstInt(LLVMInt32Type(), 0 + j, 0);
-      elems[i + 1] = LLVMConstInt(LLVMInt32Type(), n + j, 0);
-   }
-
-   return LLVMConstVector(elems, n);
-}
-
-
-/**
- * Build constant int vector of width 'n' and value 'c'.
- */
-static LLVMValueRef 
-lp_build_const_vec(LLVMTypeRef type, unsigned n, long long c)
-{
-   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
-   unsigned i;
-
-   assert(n <= LP_MAX_VECTOR_LENGTH);
-
-   for(i = 0; i < n; ++i)
-      elems[i] = LLVMConstInt(type, c, 0);
-
-   return LLVMConstVector(elems, n);
-}
-
-
-/**
  * Normalized 8bit multiplication.
  *
  * - alpha plus one
@@ -365,33 +324,30 @@ lp_build_const_vec(LLVMTypeRef type, unsigned n, long long c)
  */
 static LLVMValueRef
 lp_build_mul_u8n(LLVMBuilderRef builder,
+                 struct lp_type i16_type,
                  LLVMValueRef a, LLVMValueRef b)
 {
-   static LLVMValueRef c01 = NULL;
-   static LLVMValueRef c08 = NULL;
-   static LLVMValueRef c80 = NULL;
+   LLVMValueRef c8;
    LLVMValueRef ab;
 
-   if(!c01) c01 = lp_build_const_vec(LLVMInt16Type(), 8, 0x01);
-   if(!c08) c08 = lp_build_const_vec(LLVMInt16Type(), 8, 0x08);
-   if(!c80) c80 = lp_build_const_vec(LLVMInt16Type(), 8, 0x80);
+   c8 = lp_build_int_const_scalar(i16_type, 8);
    
 #if 0
    
    /* a*b/255 ~= (a*(b + 1)) >> 256 */
-   b = LLVMBuildAdd(builder, b, c01, "");
+   b = LLVMBuildAdd(builder, b, lp_build_int_const_scalar(i16_type, 1), "");
    ab = LLVMBuildMul(builder, a, b, "");
 
 #else
    
-   /* t/255 ~= (t + (t >> 8) + 0x80) >> 8 */
+   /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
    ab = LLVMBuildMul(builder, a, b, "");
-   ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c08, ""), "");
-   ab = LLVMBuildAdd(builder, ab, c80, "");
+   ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
+   ab = LLVMBuildAdd(builder, ab, lp_build_int_const_scalar(i16_type, 0x80), "");
 
 #endif
    
-   ab = LLVMBuildLShr(builder, ab, c08, "");
+   ab = LLVMBuildLShr(builder, ab, c8, "");
 
    return ab;
 }
@@ -406,6 +362,8 @@ lp_build_mul(struct lp_build_context *bld,
              LLVMValueRef b)
 {
    const struct lp_type type = bld->type;
+   LLVMValueRef shift;
+   LLVMValueRef res;
 
    if(a == bld->zero)
       return bld->zero;
@@ -419,53 +377,104 @@ lp_build_mul(struct lp_build_context *bld,
       return bld->undef;
 
    if(!type.floating && !type.fixed && type.norm) {
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-      if(type.width == 8 && type.length == 16) {
-         LLVMTypeRef i16x8 = LLVMVectorType(LLVMInt16Type(), 8);
-         LLVMTypeRef i8x16 = LLVMVectorType(LLVMInt8Type(), 16);
-         static LLVMValueRef ml = NULL;
-         static LLVMValueRef mh = NULL;
-         LLVMValueRef al, ah, bl, bh;
-         LLVMValueRef abl, abh;
-         LLVMValueRef ab;
-         
-         if(!ml) ml = lp_build_unpack_shuffle(16, 0);
-         if(!mh) mh = lp_build_unpack_shuffle(16, 1);
-
-         /*  PUNPCKLBW, PUNPCKHBW */
-         al = LLVMBuildShuffleVector(bld->builder, a, bld->zero, ml, "");
-         bl = LLVMBuildShuffleVector(bld->builder, b, bld->zero, ml, "");
-         ah = LLVMBuildShuffleVector(bld->builder, a, bld->zero, mh, "");
-         bh = LLVMBuildShuffleVector(bld->builder, b, bld->zero, mh, "");
+      if(type.width == 8) {
+         struct lp_type i16_type = lp_wider_type(type);
+         LLVMValueRef al, ah, bl, bh, abl, abh, ab;
 
-         /* NOP */
-         al = LLVMBuildBitCast(bld->builder, al, i16x8, "");
-         bl = LLVMBuildBitCast(bld->builder, bl, i16x8, "");
-         ah = LLVMBuildBitCast(bld->builder, ah, i16x8, "");
-         bh = LLVMBuildBitCast(bld->builder, bh, i16x8, "");
+         lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
+         lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);
 
          /* PMULLW, PSRLW, PADDW */
-         abl = lp_build_mul_u8n(bld->builder, al, bl);
-         abh = lp_build_mul_u8n(bld->builder, ah, bh);
+         abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
+         abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);
 
-         /* PACKUSWB */
-         ab = lp_build_intrinsic_binary(bld->builder, "llvm.x86.sse2.packuswb.128" , i16x8, abl, abh);
-
-         /* NOP */
-         ab = LLVMBuildBitCast(bld->builder, ab, i8x16, "");
+         ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);
          
          return ab;
       }
-#endif
 
       /* FIXME */
       assert(0);
    }
 
-   if(LLVMIsConstant(a) && LLVMIsConstant(b))
-      return LLVMConstMul(a, b);
+   if(type.fixed)
+      shift = lp_build_int_const_scalar(type, type.width/2);
+   else
+      shift = NULL;
+
+   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
+      res =  LLVMConstMul(a, b);
+      if(shift) {
+         if(type.sign)
+            res = LLVMConstAShr(res, shift);
+         else
+            res = LLVMConstLShr(res, shift);
+      }
+   }
+   else {
+      res = LLVMBuildMul(bld->builder, a, b, "");
+      if(shift) {
+         if(type.sign)
+            res = LLVMBuildAShr(bld->builder, res, shift, "");
+         else
+            res = LLVMBuildLShr(bld->builder, res, shift, "");
+      }
+   }
+
+   return res;
+}
+
+
+/**
+ * Small vector x scale multiplication optimization.
+ */
+LLVMValueRef
+lp_build_mul_imm(struct lp_build_context *bld,
+                 LLVMValueRef a,
+                 int b)
+{
+   LLVMValueRef factor;
+
+   if(b == 0)
+      return bld->zero;
+
+   if(b == 1)
+      return a;
 
-   return LLVMBuildMul(bld->builder, a, b, "");
+   if(b == -1)
+      return LLVMBuildNeg(bld->builder, a, "");
+
+   if(b == 2 && bld->type.floating)
+      return lp_build_add(bld, a, a);
+
+   if(util_is_pot(b)) {
+      unsigned shift = ffs(b) - 1;
+
+      if(bld->type.floating) {
+#if 0
+         /*
+          * Power of two multiplication by directly manipulating the mantissa.
+          *
+          * XXX: This might not be always faster, it will introduce a small error
+          * for multiplication by zero, and it will produce wrong results
+          * for Inf and NaN.
+          */
+         unsigned mantissa = lp_mantissa(bld->type);
+         factor = lp_build_int_const_scalar(bld->type, (unsigned long long)shift << mantissa);
+         a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
+         a = LLVMBuildAdd(bld->builder, a, factor, "");
+         a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
+         return a;
+#endif
+      }
+      else {
+         factor = lp_build_const_scalar(bld->type, shift);
+         return LLVMBuildShl(bld->builder, a, factor, "");
+      }
+   }
+
+   factor = lp_build_const_scalar(bld->type, (double)b);
+   return lp_build_mul(bld, a, factor);
 }
 
 
@@ -493,22 +502,43 @@ lp_build_div(struct lp_build_context *bld,
    if(LLVMIsConstant(a) && LLVMIsConstant(b))
       return LLVMConstFDiv(a, b);
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(type.width == 32 && type.length == 4)
+   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
       return lp_build_mul(bld, a, lp_build_rcp(bld, b));
-#endif
 
    return LLVMBuildFDiv(bld->builder, a, b, "");
 }
 
 
+/**
+ * Linear interpolation.
+ *
+ * This also works for integer values with a few caveats.
+ *
+ * @sa http://www.stereopsis.com/doubleblend.html
+ */
 LLVMValueRef
 lp_build_lerp(struct lp_build_context *bld,
               LLVMValueRef x,
               LLVMValueRef v0,
               LLVMValueRef v1)
 {
-   return lp_build_add(bld, v0, lp_build_mul(bld, x, lp_build_sub(bld, v1, v0)));
+   LLVMValueRef delta;
+   LLVMValueRef res;
+
+   delta = lp_build_sub(bld, v1, v0);
+
+   res = lp_build_mul(bld, x, delta);
+
+   res = lp_build_add(bld, v0, res);
+
+   if(bld->type.fixed)
+      /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
+       * but it will be wrong for other uses. Basically we need a more
+       * powerful lp_type, capable of further distinguishing the values
+       * interpretation from the value storage. */
+      res = LLVMBuildAnd(bld->builder, res, lp_build_int_const_scalar(bld->type, (1 << bld->type.width/2) - 1), "");
+
+   return res;
 }
 
 
@@ -606,8 +636,7 @@ lp_build_abs(struct lp_build_context *bld,
       return a;
    }
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(type.width*type.length == 128) {
+   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
       switch(type.width) {
       case 8:
          return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
@@ -617,7 +646,6 @@ lp_build_abs(struct lp_build_context *bld,
          return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
       }
    }
-#endif
 
    return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
 }
@@ -684,6 +712,8 @@ lp_build_round_sse41(struct lp_build_context *bld,
 
    assert(type.floating);
    assert(type.width*type.length == 128);
+   assert(lp_check_value(type, a));
+   assert(util_cpu_caps.has_sse4_1);
 
    switch(type.width) {
    case 32:
@@ -703,20 +733,45 @@ lp_build_round_sse41(struct lp_build_context *bld,
 
 
 LLVMValueRef
-lp_build_round(struct lp_build_context *bld,
+lp_build_trunc(struct lp_build_context *bld,
                LLVMValueRef a)
 {
    const struct lp_type type = bld->type;
 
    assert(type.floating);
+   assert(lp_check_value(type, a));
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
-#endif
+   if(util_cpu_caps.has_sse4_1)
+      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
+   else {
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+      LLVMValueRef res;
+      res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
+      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
+      return res;
+   }
+}
 
-   /* FIXME */
-   assert(0);
-   return bld->undef;
+
+LLVMValueRef
+lp_build_round(struct lp_build_context *bld,
+               LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+
+   assert(type.floating);
+   assert(lp_check_value(type, a));
+
+   if(util_cpu_caps.has_sse4_1)
+      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
+   else {
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      LLVMValueRef res;
+      res = lp_build_iround(bld, a);
+      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
+      return res;
+   }
 }
 
 
@@ -728,13 +783,15 @@ lp_build_floor(struct lp_build_context *bld,
 
    assert(type.floating);
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
-#endif
-
-   /* FIXME */
-   assert(0);
-   return bld->undef;
+   if(util_cpu_caps.has_sse4_1)
+      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
+   else {
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      LLVMValueRef res;
+      res = lp_build_ifloor(bld, a);
+      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
+      return res;
+   }
 }
 
 
@@ -745,59 +802,143 @@ lp_build_ceil(struct lp_build_context *bld,
    const struct lp_type type = bld->type;
 
    assert(type.floating);
+   assert(lp_check_value(type, a));
+
+   if(util_cpu_caps.has_sse4_1)
+      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
+   else {
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      LLVMValueRef res;
+      res = lp_build_iceil(bld, a);
+      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
+      return res;
+   }
+}
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
-#endif
 
-   /* FIXME */
-   assert(0);
-   return bld->undef;
+/**
+ * Convert to integer, through whichever rounding method that's fastest,
+ * typically truncating to zero.
+ */
+LLVMValueRef
+lp_build_itrunc(struct lp_build_context *bld,
+                LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+
+   assert(type.floating);
+   assert(lp_check_value(type, a));
+
+   return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
 }
 
 
 LLVMValueRef
-lp_build_trunc(struct lp_build_context *bld,
-               LLVMValueRef a)
+lp_build_iround(struct lp_build_context *bld,
+                LLVMValueRef a)
 {
    const struct lp_type type = bld->type;
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+   LLVMValueRef res;
 
    assert(type.floating);
+   assert(lp_check_value(type, a));
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
-#endif
+   if(util_cpu_caps.has_sse4_1) {
+      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
+   }
+   else {
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+      LLVMValueRef sign;
+      LLVMValueRef half;
+
+      /* get sign bit */
+      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
+
+      /* sign * 0.5 */
+      half = lp_build_const_scalar(type, 0.5);
+      half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
+      half = LLVMBuildOr(bld->builder, sign, half, "");
+      half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
+
+      res = LLVMBuildAdd(bld->builder, a, half, "");
+   }
+
+   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
 
-   /* FIXME */
-   assert(0);
-   return bld->undef;
+   return res;
 }
 
 
-/**
- * Convert to integer, through whichever rounding method that's fastest,
- * typically truncating to zero.
- */
 LLVMValueRef
-lp_build_int(struct lp_build_context *bld,
-             LLVMValueRef a)
+lp_build_ifloor(struct lp_build_context *bld,
+                LLVMValueRef a)
 {
    const struct lp_type type = bld->type;
    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+   LLVMValueRef res;
 
    assert(type.floating);
+   assert(lp_check_value(type, a));
 
-   return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
+   if(util_cpu_caps.has_sse4_1) {
+      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
+   }
+   else {
+      /* Take the sign bit and add it to 1 constant */
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      unsigned mantissa = lp_mantissa(type);
+      LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+      LLVMValueRef sign;
+      LLVMValueRef offset;
+
+      /* sign = a < 0 ? ~0 : 0 */
+      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
+      sign = LLVMBuildAShr(bld->builder, sign, lp_build_int_const_scalar(type, type.width - 1), "");
+
+      /* offset = -0.99999(9)f */
+      offset = lp_build_const_scalar(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa));
+      offset = LLVMConstBitCast(offset, int_vec_type);
+
+      /* offset = a < 0 ? -0.99999(9)f : 0.0f */
+      offset = LLVMBuildAnd(bld->builder, offset, sign, "");
+      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "");
+
+      res = LLVMBuildAdd(bld->builder, a, offset, "");
+   }
+
+   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
+
+   return res;
 }
 
 
 LLVMValueRef
-lp_build_ifloor(struct lp_build_context *bld,
-                LLVMValueRef a)
+lp_build_iceil(struct lp_build_context *bld,
+               LLVMValueRef a)
 {
-   a = lp_build_floor(bld, a);
-   a = lp_build_int(bld, a);
-   return a;
+   const struct lp_type type = bld->type;
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+   LLVMValueRef res;
+
+   assert(type.floating);
+   assert(lp_check_value(type, a));
+
+   if(util_cpu_caps.has_sse4_1) {
+      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
+   }
+   else {
+      assert(0);
+      res = bld->undef;
+   }
+
+   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
+
+   return res;
 }
 
 
@@ -837,11 +978,9 @@ lp_build_rcp(struct lp_build_context *bld,
    if(LLVMIsConstant(a))
       return LLVMConstFDiv(bld->one, a);
 
-   /* XXX: is this really necessary? */
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(type.width == 32 && type.length == 4)
+   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
+      /* FIXME: improve precision */
       return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
-#endif
 
    return LLVMBuildFDiv(bld->builder, bld->one, a, "");
 }
@@ -858,11 +997,8 @@ lp_build_rsqrt(struct lp_build_context *bld,
 
    assert(type.floating);
 
-   /* XXX: is this really necessary? */
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(type.width == 32 && type.length == 4)
+   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
       return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
-#endif
 
    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
 }
@@ -918,7 +1054,8 @@ lp_build_pow(struct lp_build_context *bld,
 {
    /* TODO: optimize the constant case */
    if(LLVMIsConstant(x) && LLVMIsConstant(y))
-      debug_printf("%s: inefficient/imprecise constant arithmetic\n");
+      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
+                   __FUNCTION__);
 
    return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
 }
@@ -972,7 +1109,8 @@ lp_build_polynomial(struct lp_build_context *bld,
 
    /* TODO: optimize the constant case */
    if(LLVMIsConstant(x))
-      debug_printf("%s: inefficient/imprecise constant arithmetic\n");
+      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
+                   __FUNCTION__);
 
    for (i = num_coeffs; i--; ) {
       LLVMValueRef coeff = lp_build_const_scalar(type, coeffs[i]);
@@ -1026,7 +1164,8 @@ lp_build_exp2_approx(struct lp_build_context *bld,
    if(p_exp2_int_part || p_frac_part || p_exp2) {
       /* TODO: optimize the constant case */
       if(LLVMIsConstant(x))
-         debug_printf("%s: inefficient/imprecise constant arithmetic\n");
+         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
+                      __FUNCTION__);
 
       assert(type.floating && type.width == 32);
 
@@ -1125,7 +1264,8 @@ lp_build_log2_approx(struct lp_build_context *bld,
    if(p_exp || p_floor_log2 || p_log2) {
       /* TODO: optimize the constant case */
       if(LLVMIsConstant(x))
-         debug_printf("%s: inefficient/imprecise constant arithmetic\n");
+         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
+                      __FUNCTION__);
 
       assert(type.floating && type.width == 32);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_arit.h b/src/gallium/drivers/llvmpipe/lp_bld_arit.h
index d68a97c4b8..62be4b9aee 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_arit.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_arit.h
@@ -40,7 +40,7 @@
 #include <llvm-c/Core.h>  
 
 
-struct lp_type type;
+struct lp_type;
 struct lp_build_context;
 
 
@@ -67,6 +67,11 @@ lp_build_mul(struct lp_build_context *bld,
              LLVMValueRef b);
 
 LLVMValueRef
+lp_build_mul_imm(struct lp_build_context *bld,
+                 LLVMValueRef a,
+                 int b);
+
+LLVMValueRef
 lp_build_div(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b);
@@ -126,11 +131,18 @@ lp_build_trunc(struct lp_build_context *bld,
                LLVMValueRef a);
 
 LLVMValueRef
-lp_build_int(struct lp_build_context *bld,
-             LLVMValueRef a);
+lp_build_ifloor(struct lp_build_context *bld,
+                LLVMValueRef a);
+LLVMValueRef
+lp_build_iceil(struct lp_build_context *bld,
+               LLVMValueRef a);
 
 LLVMValueRef
-lp_build_ifloor(struct lp_build_context *bld,
+lp_build_iround(struct lp_build_context *bld,
+                LLVMValueRef a);
+
+LLVMValueRef
+lp_build_itrunc(struct lp_build_context *bld,
                 LLVMValueRef a);
 
 LLVMValueRef
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_const.h b/src/gallium/drivers/llvmpipe/lp_bld_const.h
index ffb302f736..cb8e1c7b00 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_const.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_const.h
@@ -42,7 +42,7 @@
 #include <pipe/p_compiler.h>
 
 
-struct lp_type type;
+struct lp_type;
 
 
 unsigned
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_conv.c b/src/gallium/drivers/llvmpipe/lp_bld_conv.c
index 186cac70f6..9935209437 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_conv.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_conv.c
@@ -63,11 +63,13 @@
 
 #include "util/u_debug.h"
 #include "util/u_math.h"
+#include "util/u_cpu_detect.h"
 
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
 #include "lp_bld_intr.h"
 #include "lp_bld_arit.h"
+#include "lp_bld_pack.h"
 #include "lp_bld_conv.h"
 
 
@@ -198,243 +200,6 @@ lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
 
 
 /**
- * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
- */
-static LLVMValueRef
-lp_build_const_unpack_shuffle(unsigned n, unsigned lo_hi)
-{
-   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
-   unsigned i, j;
-
-   assert(n <= LP_MAX_VECTOR_LENGTH);
-   assert(lo_hi < 2);
-
-   /* TODO: cache results in a static table */
-
-   for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
-      elems[i + 0] = LLVMConstInt(LLVMInt32Type(), 0 + j, 0);
-      elems[i + 1] = LLVMConstInt(LLVMInt32Type(), n + j, 0);
-   }
-
-   return LLVMConstVector(elems, n);
-}
-
-
-/**
- * Build shuffle vectors that match PACKxx instructions.
- */
-static LLVMValueRef
-lp_build_const_pack_shuffle(unsigned n)
-{
-   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
-   unsigned i;
-
-   assert(n <= LP_MAX_VECTOR_LENGTH);
-
-   /* TODO: cache results in a static table */
-
-   for(i = 0; i < n; ++i)
-      elems[i] = LLVMConstInt(LLVMInt32Type(), 2*i, 0);
-
-   return LLVMConstVector(elems, n);
-}
-
-
-/**
- * Expand the bit width.
- *
- * This will only change the number of bits the values are represented, not the
- * values themselves.
- */
-static void
-lp_build_expand(LLVMBuilderRef builder,
-               struct lp_type src_type,
-               struct lp_type dst_type,
-               LLVMValueRef src,
-               LLVMValueRef *dst, unsigned num_dsts)
-{
-   unsigned num_tmps;
-   unsigned i;
-
-   /* Register width must remain constant */
-   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
-
-   /* We must not loose or gain channels. Only precision */
-   assert(src_type.length == dst_type.length * num_dsts);
-
-   num_tmps = 1;
-   dst[0] = src;
-
-   while(src_type.width < dst_type.width) {
-      struct lp_type new_type = src_type;
-      LLVMTypeRef new_vec_type;
-
-      new_type.width *= 2;
-      new_type.length /= 2;
-      new_vec_type = lp_build_vec_type(new_type);
-
-      for(i = num_tmps; i--; ) {
-         LLVMValueRef zero;
-         LLVMValueRef shuffle_lo;
-         LLVMValueRef shuffle_hi;
-         LLVMValueRef lo;
-         LLVMValueRef hi;
-
-         zero = lp_build_zero(src_type);
-         shuffle_lo = lp_build_const_unpack_shuffle(src_type.length, 0);
-         shuffle_hi = lp_build_const_unpack_shuffle(src_type.length, 1);
-
-         /*  PUNPCKLBW, PUNPCKHBW */
-         lo = LLVMBuildShuffleVector(builder, dst[i], zero, shuffle_lo, "");
-         hi = LLVMBuildShuffleVector(builder, dst[i], zero, shuffle_hi, "");
-
-         dst[2*i + 0] = LLVMBuildBitCast(builder, lo, new_vec_type, "");
-         dst[2*i + 1] = LLVMBuildBitCast(builder, hi, new_vec_type, "");
-      }
-
-      src_type = new_type;
-
-      num_tmps *= 2;
-   }
-
-   assert(num_tmps == num_dsts);
-}
-
-
-/**
- * Non-interleaved pack.
- *
- * This will move values as
- *
- *   lo =   __ l0 __ l1 __ l2 __..  __ ln
- *   hi =   __ h0 __ h1 __ h2 __..  __ hn
- *   res =  l0 l1 l2 .. ln h0 h1 h2 .. hn
- *
- * TODO: handle saturation consistently.
- */
-static LLVMValueRef
-lp_build_pack2(LLVMBuilderRef builder,
-               struct lp_type src_type,
-               struct lp_type dst_type,
-               boolean clamped,
-               LLVMValueRef lo,
-               LLVMValueRef hi)
-{
-   LLVMTypeRef src_vec_type = lp_build_vec_type(src_type);
-   LLVMTypeRef dst_vec_type = lp_build_vec_type(dst_type);
-   LLVMValueRef shuffle;
-   LLVMValueRef res;
-
-   /* Register width must remain constant */
-   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
-
-   /* We must not loose or gain channels. Only precision */
-   assert(src_type.length * 2 == dst_type.length);
-
-   assert(!src_type.floating);
-   assert(!dst_type.floating);
-
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(src_type.width * src_type.length == 128) {
-      /* All X86 non-interleaved pack instructions all take signed inputs and
-       * saturate them, so saturate beforehand. */
-      if(!src_type.sign && !clamped) {
-         struct lp_build_context bld;
-         unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
-         LLVMValueRef dst_max = lp_build_int_const_scalar(src_type, ((unsigned long long)1 << dst_bits) - 1);
-         lp_build_context_init(&bld, builder, src_type);
-         lo = lp_build_min(&bld, lo, dst_max);
-         hi = lp_build_min(&bld, hi, dst_max);
-      }
-
-      switch(src_type.width) {
-      case 32:
-         if(dst_type.sign)
-            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", src_vec_type, lo, hi);
-         else
-            /* PACKUSDW is the only instrinsic with a consistent signature */
-            return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi);
-         break;
-
-      case 16:
-         if(dst_type.sign)
-            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", src_vec_type, lo, hi);
-         else
-            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", src_vec_type, lo, hi);
-         break;
-
-      default:
-         assert(0);
-         return LLVMGetUndef(dst_vec_type);
-         break;
-      }
-
-      res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
-      return res;
-   }
-#endif
-
-   lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
-   hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");
-
-   shuffle = lp_build_const_pack_shuffle(dst_type.length);
-
-   res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");
-
-   return res;
-}
-
-
-/**
- * Truncate the bit width.
- *
- * TODO: Handle saturation consistently.
- */
-static LLVMValueRef
-lp_build_pack(LLVMBuilderRef builder,
-              struct lp_type src_type,
-              struct lp_type dst_type,
-              boolean clamped,
-              const LLVMValueRef *src, unsigned num_srcs)
-{
-   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
-   unsigned i;
-
-   /* Register width must remain constant */
-   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
-
-   /* We must not loose or gain channels. Only precision */
-   assert(src_type.length * num_srcs == dst_type.length);
-
-   for(i = 0; i < num_srcs; ++i)
-      tmp[i] = src[i];
-
-   while(src_type.width > dst_type.width) {
-      struct lp_type new_type = src_type;
-
-      new_type.width /= 2;
-      new_type.length *= 2;
-
-      /* Take in consideration the sign changes only in the last step */
-      if(new_type.width == dst_type.width)
-         new_type.sign = dst_type.sign;
-
-      num_srcs /= 2;
-
-      for(i = 0; i < num_srcs; ++i)
-         tmp[i] = lp_build_pack2(builder, src_type, new_type, clamped,
-                                 tmp[2*i + 0], tmp[2*i + 1]);
-
-      src_type = new_type;
-   }
-
-   assert(num_srcs == 1);
-
-   return tmp[0];
-}
-
-
-/**
  * Generic type conversion.
  *
  * TODO: Take a precision argument, or even better, add a new precision member
@@ -573,7 +338,7 @@ lp_build_conv(LLVMBuilderRef builder,
 
    if(tmp_type.width < dst_type.width) {
       assert(num_tmps == 1);
-      lp_build_expand(builder, tmp_type, dst_type, tmp[0], tmp, num_dsts);
+      lp_build_unpack(builder, tmp_type, dst_type, tmp[0], tmp, num_dsts);
       tmp_type.width = dst_type.width;
       tmp_type.length = dst_type.length;
       num_tmps = num_dsts;
@@ -693,7 +458,7 @@ lp_build_conv_mask(LLVMBuilderRef builder,
    }
    else if(src_type.width < dst_type.width) {
       assert(num_srcs == 1);
-      lp_build_expand(builder, src_type, dst_type, src[0], dst, num_dsts);
+      lp_build_unpack(builder, src_type, dst_type, src[0], dst, num_dsts);
    }
    else {
       assert(num_srcs == num_dsts);
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_conv.h b/src/gallium/drivers/llvmpipe/lp_bld_conv.h
index ca378804d2..948e68fae4 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_conv.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_conv.h
@@ -40,7 +40,7 @@
 #include <llvm-c/Core.h>  
 
 
-struct lp_type type;
+struct lp_type;
 
 
 LLVMValueRef
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_debug.c b/src/gallium/drivers/llvmpipe/lp_bld_debug.c
index 59d8f492e6..39dfc51e50 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_debug.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_debug.c
@@ -77,10 +77,10 @@ lp_disassemble(const void* func)
    while (ud_disassemble(&ud_obj)) {
 
 #ifdef PIPE_ARCH_X86
-      debug_printf("%08lx: ", (unsigned long)ud_insn_off(&ud_obj));
+      debug_printf("0x%08lx:\t", (unsigned long)ud_insn_off(&ud_obj));
 #endif
 #ifdef PIPE_ARCH_X86_64
-      debug_printf("%016llx: ", (unsigned long long)ud_insn_off(&ud_obj));
+      debug_printf("0x%016llx:\t", (unsigned long long)ud_insn_off(&ud_obj));
 #endif
 
 #if 0
@@ -115,9 +115,16 @@ lp_disassemble(const void* func)
          }
       }
 
-      if (ud_insn_off(&ud_obj) >= max_jmp_pc && ud_obj.mnemonic == UD_Iret)
+      if ((ud_insn_off(&ud_obj) >= max_jmp_pc && ud_obj.mnemonic == UD_Iret) ||
+           ud_obj.mnemonic == UD_Iinvalid)
          break;
    }
+
+#if 0
+   /* Print GDB command, useful to verify udis86 output */
+   debug_printf("disassemble %p %p\n", func, (void*)(uintptr_t)ud_obj.pc);
+#endif
+
    debug_printf("\n");
 #else
    (void)func;
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index 21c665c4d4..d438c0e63d 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -90,7 +90,7 @@ lp_depth_type(const struct util_format_description *format_desc,
 
    if(format_desc->channel[swizzle].type == UTIL_FORMAT_TYPE_FLOAT) {
       type.floating = TRUE;
-      assert(swizzle = 0);
+      assert(swizzle == 0);
       assert(format_desc->channel[swizzle].size == format_desc->block.bits);
    }
    else if(format_desc->channel[swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED) {
@@ -210,7 +210,4 @@ lp_build_depth_test(LLVMBuilderRef builder,
       dst = lp_build_select(&bld, z_bitmask, src, dst);
       LLVMBuildStore(builder, dst, dst_ptr);
    }
-
-   /* FIXME */
-   assert(!state->occlusion_count);
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_format.h b/src/gallium/drivers/llvmpipe/lp_bld_format.h
index 6d3f692619..970bee379f 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_format.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_format.h
@@ -25,8 +25,8 @@
  *
  **************************************************************************/
 
-#ifndef LP_BLD_H
-#define LP_BLD_H
+#ifndef LP_BLD_FORMAT_H
+#define LP_BLD_FORMAT_H
 
 
 /**
@@ -42,62 +42,34 @@ struct util_format_description;
 struct lp_type;
 
 
-/**
- * Unpack a pixel into its RGBA components.
- *
- * @param packed integer.
- *
- * @return RGBA in a 4 floats vector.
- */
-LLVMValueRef
-lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
-                         enum pipe_format format,
-                         LLVMValueRef packed);
+boolean
+lp_format_is_rgba8(const struct util_format_description *desc);
 
 
-/**
- * Pack a pixel.
- *
- * @param rgba 4 float vector with the unpacked components.
- */
-LLVMValueRef
-lp_build_pack_rgba_aos(LLVMBuilderRef builder,
-                       enum pipe_format format,
-                       LLVMValueRef rgba);
+void
+lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
+                            struct lp_type type,
+                            const LLVMValueRef *unswizzled,
+                            LLVMValueRef *swizzled);
 
 
-/**
- * Load a pixel into its RGBA components.
- *
- * @param ptr value with the pointer to the packed pixel. Pointer type is
- * irrelevant.
- *
- * @return RGBA in a 4 floats vector.
- */
 LLVMValueRef
-lp_build_load_rgba_aos(LLVMBuilderRef builder,
-                       enum pipe_format format,
-                       LLVMValueRef ptr);
+lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
+                         const struct util_format_description *desc,
+                         LLVMValueRef packed);
 
 
-/**
- * Store a pixel.
- *
- * @param rgba 4 float vector with the unpacked components.
- */
-void 
-lp_build_store_rgba_aos(LLVMBuilderRef builder,
-                        enum pipe_format format,
-                        LLVMValueRef ptr,
-                        LLVMValueRef rgba);
+LLVMValueRef
+lp_build_unpack_rgba8_aos(LLVMBuilderRef builder,
+                          const struct util_format_description *desc,
+                          struct lp_type type,
+                          LLVMValueRef packed);
+
 
 LLVMValueRef
-lp_build_gather(LLVMBuilderRef builder,
-                unsigned length,
-                unsigned src_width,
-                unsigned dst_width,
-                LLVMValueRef base_ptr,
-                LLVMValueRef offsets);
+lp_build_pack_rgba_aos(LLVMBuilderRef builder,
+                       const struct util_format_description *desc,
+                       LLVMValueRef rgba);
 
 
 void
@@ -108,12 +80,4 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
                          LLVMValueRef *rgba);
 
 
-void
-lp_build_load_rgba_soa(LLVMBuilderRef builder,
-                       const struct util_format_description *format_desc,
-                       struct lp_type type,
-                       LLVMValueRef base_ptr,
-                       LLVMValueRef offsets,
-                       LLVMValueRef *rgba);
-
-#endif /* !LP_BLD_H */
+#endif /* !LP_BLD_FORMAT_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_format_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_format_aos.c
index b9b5d84bed..5836e0173f 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_format_aos.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_format_aos.c
@@ -25,18 +25,39 @@
  *
  **************************************************************************/
 
+/**
+ * @file
+ * AoS pixel format manipulation.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
 
+#include "util/u_cpu_detect.h"
 #include "util/u_format.h"
 
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_swizzle.h"
 #include "lp_bld_format.h"
 
 
+/**
+ * Unpack a single pixel into its RGBA components.
+ *
+ * @param packed integer.
+ *
+ * @return RGBA in a 4 floats vector.
+ *
+ * XXX: This is mostly for reference and testing -- operating a single pixel at
+ * a time is rarely if ever needed.
+ */
 LLVMValueRef
 lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
-                         enum pipe_format format,
+                         const struct util_format_description *desc,
                          LLVMValueRef packed)
 {
-   const struct util_format_description *desc;
    LLVMTypeRef type;
    LLVMValueRef shifted, casted, scaled, masked;
    LLVMValueRef shifts[4];
@@ -49,8 +70,6 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
    unsigned shift;
    unsigned i;
 
-   desc = util_format_description(format);
-
    /* FIXME: Support more formats */
    assert(desc->layout == UTIL_FORMAT_LAYOUT_ARITH);
    assert(desc->block.width == 1);
@@ -151,12 +170,130 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
 }
 
 
+/**
+ * Take a vector with packed pixels and unpack into a rgba8 vector.
+ *
+ * Formats with bit depth smaller than 32bits are accepted, but they must be
+ * padded to 32bits.
+ */
+LLVMValueRef
+lp_build_unpack_rgba8_aos(LLVMBuilderRef builder,
+                          const struct util_format_description *desc,
+                          struct lp_type type,
+                          LLVMValueRef packed)
+{
+   struct lp_build_context bld;
+   bool rgba8;
+   LLVMValueRef res;
+   unsigned i;
+
+   lp_build_context_init(&bld, builder, type);
+
+   /* FIXME: Support more formats */
+   assert(desc->layout == UTIL_FORMAT_LAYOUT_ARITH);
+   assert(desc->block.width == 1);
+   assert(desc->block.height == 1);
+   assert(desc->block.bits <= 32);
+
+   assert(!type.floating);
+   assert(!type.fixed);
+   assert(type.norm);
+   assert(type.width == 8);
+   assert(type.length % 4 == 0);
+
+   rgba8 = TRUE;
+   for(i = 0; i < 4; ++i) {
+      assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED ||
+             desc->channel[i].type == UTIL_FORMAT_TYPE_VOID);
+      if(desc->channel[0].size != 8)
+         rgba8 = FALSE;
+   }
+
+   if(rgba8) {
+      /*
+       * The pixel is already in a rgba8 format variant. All it is necessary
+       * is to swizzle the channels.
+       */
+
+      unsigned char swizzles[4];
+      boolean zeros[4]; /* bitwise AND mask */
+      boolean ones[4]; /* bitwise OR mask */
+      boolean swizzles_needed = FALSE;
+      boolean zeros_needed = FALSE;
+      boolean ones_needed = FALSE;
+
+      for(i = 0; i < 4; ++i) {
+         enum util_format_swizzle swizzle = desc->swizzle[i];
+
+         /* Initialize with the no-op case */
+         swizzles[i] = util_cpu_caps.little_endian ? 3 - i : i;
+         zeros[i] = TRUE;
+         ones[i] = FALSE;
+
+         switch (swizzle) {
+         case UTIL_FORMAT_SWIZZLE_X:
+         case UTIL_FORMAT_SWIZZLE_Y:
+         case UTIL_FORMAT_SWIZZLE_Z:
+         case UTIL_FORMAT_SWIZZLE_W:
+            if(swizzle != swizzles[i]) {
+               swizzles[i] = swizzle;
+               swizzles_needed = TRUE;
+            }
+            break;
+         case UTIL_FORMAT_SWIZZLE_0:
+            zeros[i] = FALSE;
+            zeros_needed = TRUE;
+            break;
+         case UTIL_FORMAT_SWIZZLE_1:
+            ones[i] = TRUE;
+            ones_needed = TRUE;
+            break;
+         case UTIL_FORMAT_SWIZZLE_NONE:
+            assert(0);
+            break;
+         }
+      }
+
+      res = packed;
+
+      if(swizzles_needed)
+         res = lp_build_swizzle1_aos(&bld, res, swizzles);
+
+      if(zeros_needed) {
+         /* Mask out zero channels */
+         LLVMValueRef mask = lp_build_const_mask_aos(type, zeros);
+         res = LLVMBuildAnd(builder, res, mask, "");
+      }
+
+      if(ones_needed) {
+         /* Or one channels */
+         LLVMValueRef mask = lp_build_const_mask_aos(type, ones);
+         res = LLVMBuildOr(builder, res, mask, "");
+      }
+   }
+   else {
+      /* FIXME */
+      assert(0);
+      res = lp_build_undef(type);
+   }
+
+   return res;
+}
+
+
+/**
+ * Pack a single pixel.
+ *
+ * @param rgba 4 float vector with the unpacked components.
+ *
+ * XXX: This is mostly for reference and testing -- operating a single pixel at
+ * a time is rarely if ever needed.
+ */
 LLVMValueRef
 lp_build_pack_rgba_aos(LLVMBuilderRef builder,
-                       enum pipe_format format,
+                       const struct util_format_description *desc,
                        LLVMValueRef rgba)
 {
-   const struct util_format_description *desc;
    LLVMTypeRef type;
    LLVMValueRef packed = NULL;
    LLVMValueRef swizzles[4];
@@ -167,8 +304,6 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder,
    unsigned shift;
    unsigned i, j;
 
-   desc = util_format_description(format);
-
    assert(desc->layout == UTIL_FORMAT_LAYOUT_ARITH);
    assert(desc->block.width == 1);
    assert(desc->block.height == 1);
@@ -247,57 +382,3 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder,
 
    return packed;
 }
-
-
-LLVMValueRef
-lp_build_load_rgba_aos(LLVMBuilderRef builder,
-                       enum pipe_format format,
-                       LLVMValueRef ptr)
-{
-   const struct util_format_description *desc;
-   LLVMTypeRef type;
-   LLVMValueRef packed;
-
-   desc = util_format_description(format);
-
-   /* FIXME: Support more formats */
-   assert(desc->layout == UTIL_FORMAT_LAYOUT_ARITH);
-   assert(desc->block.width == 1);
-   assert(desc->block.height == 1);
-   assert(desc->block.bits <= 32);
-
-   type = LLVMIntType(desc->block.bits);
-
-   ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, 0), "");
-
-   packed = LLVMBuildLoad(builder, ptr, "");
-
-   return lp_build_unpack_rgba_aos(builder, format, packed);
-}
-
-
-void
-lp_build_store_rgba_aos(LLVMBuilderRef builder,
-                        enum pipe_format format,
-                        LLVMValueRef ptr,
-                        LLVMValueRef rgba)
-{
-   const struct util_format_description *desc;
-   LLVMTypeRef type;
-   LLVMValueRef packed;
-
-   desc = util_format_description(format);
-
-   assert(desc->layout == UTIL_FORMAT_LAYOUT_ARITH);
-   assert(desc->block.width == 1);
-   assert(desc->block.height == 1);
-
-   type = LLVMIntType(desc->block.bits);
-
-   packed = lp_build_pack_rgba_aos(builder, format, rgba);
-
-   ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, 0), "");
-
-   LLVMBuildStore(builder, packed, ptr);
-}
-
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_format_query.c b/src/gallium/drivers/llvmpipe/lp_bld_format_query.c
new file mode 100644
index 0000000000..f3832d07ff
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_format_query.c
@@ -0,0 +1,72 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Utility functions to make assertions about formats.
+ *
+ * This module centralizes most of logic used when determining what algorithm
+ * is most suitable (i.e., most efficient yet correct) for a given format.
+ *
+ * It might be possible to move some of these functions to u_format module,
+ * but since tiny differences in the format my render it more/less
+ * appropriate to a given algorithm it is impossible to make any long term
+ * guarantee about the semantics of these functions.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include "util/u_format.h"
+
+#include "lp_bld_format.h"
+
+
+/**
+ * Whether this format is a 4 rgba8 variant
+ */
+boolean
+lp_format_is_rgba8(const struct util_format_description *desc)
+{
+   unsigned chan;
+
+   if(desc->block.width != 1 ||
+      desc->block.height != 1 ||
+      desc->block.bits != 32)
+      return FALSE;
+
+   for(chan = 0; chan < 4; ++chan) {
+      if(desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED &&
+         desc->channel[chan].type != UTIL_FORMAT_TYPE_SIGNED &&
+         desc->channel[chan].type != UTIL_FORMAT_TYPE_VOID)
+         return FALSE;
+      if(desc->channel[chan].size != 8)
+         return FALSE;
+   }
+
+   return TRUE;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_format_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_format_soa.c
index b5ff434e1a..64151d169d 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_format_soa.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_format_soa.c
@@ -34,66 +34,17 @@
 #include "lp_bld_format.h"
 
 
-/**
- * Gather elements from scatter positions in memory into a single vector.
- *
- * @param src_width src element width
- * @param dst_width result element width (source will be expanded to fit)
- * @param length length of the offsets,
- * @param base_ptr base pointer, should be a i8 pointer type.
- * @param offsets vector with offsets
- */
-LLVMValueRef
-lp_build_gather(LLVMBuilderRef builder,
-                unsigned length,
-                unsigned src_width,
-                unsigned dst_width,
-                LLVMValueRef base_ptr,
-                LLVMValueRef offsets)
-{
-   LLVMTypeRef src_type = LLVMIntType(src_width);
-   LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
-   LLVMTypeRef dst_elem_type = LLVMIntType(dst_width);
-   LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length);
-   LLVMValueRef res;
-   unsigned i;
-
-   res = LLVMGetUndef(dst_vec_type);
-   for(i = 0; i < length; ++i) {
-      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-      LLVMValueRef elem_offset;
-      LLVMValueRef elem_ptr;
-      LLVMValueRef elem;
-
-      elem_offset = LLVMBuildExtractElement(builder, offsets, index, "");
-      elem_ptr = LLVMBuildGEP(builder, base_ptr, &elem_offset, 1, "");
-      elem_ptr = LLVMBuildBitCast(builder, elem_ptr, src_ptr_type, "");
-      elem = LLVMBuildLoad(builder, elem_ptr, "");
-
-      assert(src_width <= dst_width);
-      if(src_width > dst_width)
-         elem = LLVMBuildTrunc(builder, elem, dst_elem_type, "");
-      if(src_width < dst_width)
-         elem = LLVMBuildZExt(builder, elem, dst_elem_type, "");
-
-      res = LLVMBuildInsertElement(builder, res, elem, index, "");
-   }
-
-   return res;
-}
-
-
 static LLVMValueRef
-lp_build_format_swizzle(struct lp_type type,
-                        const LLVMValueRef *inputs,
-                        enum util_format_swizzle swizzle)
+lp_build_format_swizzle_chan_soa(struct lp_type type,
+                                 const LLVMValueRef *unswizzled,
+                                 enum util_format_swizzle swizzle)
 {
    switch (swizzle) {
    case UTIL_FORMAT_SWIZZLE_X:
    case UTIL_FORMAT_SWIZZLE_Y:
    case UTIL_FORMAT_SWIZZLE_Z:
    case UTIL_FORMAT_SWIZZLE_W:
-      return inputs[swizzle];
+      return unswizzled[swizzle];
    case UTIL_FORMAT_SWIZZLE_0:
       return lp_build_zero(type);
    case UTIL_FORMAT_SWIZZLE_1:
@@ -108,6 +59,28 @@ lp_build_format_swizzle(struct lp_type type,
 
 
 void
+lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
+                            struct lp_type type,
+                            const LLVMValueRef *unswizzled,
+                            LLVMValueRef *swizzled)
+{
+   if(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
+      enum util_format_swizzle swizzle = format_desc->swizzle[0];
+      LLVMValueRef depth = lp_build_format_swizzle_chan_soa(type, unswizzled, swizzle);
+      swizzled[2] = swizzled[1] = swizzled[0] = depth;
+      swizzled[3] = lp_build_one(type);
+   }
+   else {
+      unsigned chan;
+      for (chan = 0; chan < 4; ++chan) {
+         enum util_format_swizzle swizzle = format_desc->swizzle[chan];
+         swizzled[chan] = lp_build_format_swizzle_chan_soa(type, unswizzled, swizzle);
+      }
+   }
+}
+
+
+void
 lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
                          const struct util_format_description *format_desc,
                          struct lp_type type,
@@ -119,7 +92,9 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
    unsigned chan;
 
    /* FIXME: Support more formats */
-   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_ARITH);
+   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_ARITH ||
+          (format_desc->layout == UTIL_FORMAT_LAYOUT_ARRAY &&
+           format_desc->block.bits == format_desc->channel[0].size));
    assert(format_desc->block.width == 1);
    assert(format_desc->block.height == 1);
    assert(format_desc->block.bits <= 32);
@@ -170,39 +145,5 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
       start = stop;
    }
 
-   if(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
-      enum util_format_swizzle swizzle = format_desc->swizzle[0];
-      LLVMValueRef depth = lp_build_format_swizzle(type, inputs, swizzle);
-      rgba[2] = rgba[1] = rgba[0] = depth;
-      rgba[3] = lp_build_one(type);
-   }
-   else {
-      for (chan = 0; chan < 4; ++chan) {
-         enum util_format_swizzle swizzle = format_desc->swizzle[chan];
-         rgba[chan] = lp_build_format_swizzle(type, inputs, swizzle);
-      }
-   }
-}
-
-
-void
-lp_build_load_rgba_soa(LLVMBuilderRef builder,
-                       const struct util_format_description *format_desc,
-                       struct lp_type type,
-                       LLVMValueRef base_ptr,
-                       LLVMValueRef offsets,
-                       LLVMValueRef *rgba)
-{
-   LLVMValueRef packed;
-
-   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_ARITH);
-   assert(format_desc->block.width == 1);
-   assert(format_desc->block.height == 1);
-   assert(format_desc->block.bits <= 32);
-
-   packed = lp_build_gather(builder,
-                            type.length, format_desc->block.bits, type.width,
-                            base_ptr, offsets);
-
-   lp_build_unpack_rgba_soa(builder, format_desc, type, packed, rgba);
+   lp_build_format_swizzle_soa(format_desc, type, inputs, rgba);
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
index 338dbca6d1..49dab8ab61 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
@@ -109,32 +109,6 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
 
 
 /**
- * Small vector x scale multiplication optimization.
- *
- * TODO: Should be elsewhere.
- */
-static LLVMValueRef
-coeff_multiply(struct lp_build_interp_soa_context *bld,
-               LLVMValueRef coeff,
-               int step)
-{
-   LLVMValueRef factor;
-
-   switch(step) {
-   case 0:
-      return bld->base.zero;
-   case 1:
-      return coeff;
-   case 2:
-      return lp_build_add(&bld->base, coeff, coeff);
-   default:
-      factor = lp_build_const_scalar(bld->base.type, (double)step);
-      return lp_build_mul(&bld->base, coeff, factor);
-   }
-}
-
-
-/**
  * Multiply the dadx and dady with the xstep and ystep respectively.
  */
 static void
@@ -149,8 +123,8 @@ coeffs_update(struct lp_build_interp_soa_context *bld)
       if (mode != TGSI_INTERPOLATE_CONSTANT) {
          for(chan = 0; chan < NUM_CHANNELS; ++chan) {
             if(mask & (1 << chan)) {
-               bld->dadx[attrib][chan] = coeff_multiply(bld, bld->dadx[attrib][chan], bld->xstep);
-               bld->dady[attrib][chan] = coeff_multiply(bld, bld->dady[attrib][chan], bld->ystep);
+               bld->dadx[attrib][chan] = lp_build_mul_imm(&bld->base, bld->dadx[attrib][chan], bld->xstep);
+               bld->dady[attrib][chan] = lp_build_mul_imm(&bld->base, bld->dady[attrib][chan], bld->ystep);
             }
          }
       }
@@ -329,8 +303,8 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
             unsigned first, last, mask;
             unsigned attrib;
 
-            first = decl->DeclarationRange.First;
-            last = decl->DeclarationRange.Last;
+            first = decl->Range.First;
+            last = decl->Range.Last;
             mask = decl->Declaration.UsageMask;
 
             for( attrib = first; attrib <= last; ++attrib ) {
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_logic.c b/src/gallium/drivers/llvmpipe/lp_bld_logic.c
index 6b6f820769..db22a8028a 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_logic.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_logic.c
@@ -33,6 +33,8 @@
  */
 
 
+#include "util/u_cpu_detect.h"
+
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
 #include "lp_bld_intr.h"
@@ -65,7 +67,7 @@ lp_build_cmp(struct lp_build_context *bld,
 
 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    if(type.width * type.length == 128) {
-      if(type.floating) {
+      if(type.floating && util_cpu_caps.has_sse) {
          LLVMValueRef args[3];
          unsigned cc;
          boolean swap;
@@ -114,7 +116,7 @@ lp_build_cmp(struct lp_build_context *bld,
          res = LLVMBuildBitCast(bld->builder, res, int_vec_type, "");
          return res;
       }
-      else {
+      else if(util_cpu_caps.has_sse2) {
          static const struct {
             unsigned swap:1;
             unsigned eq:1;
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_logic.h b/src/gallium/drivers/llvmpipe/lp_bld_logic.h
index a4ee7723b5..d67500ef70 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_logic.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_logic.h
@@ -42,7 +42,7 @@
 #include "pipe/p_defines.h" /* For PIPE_FUNC_xxx */
 
 
-struct lp_type type;
+struct lp_type;
 struct lp_build_context;
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_misc.cpp b/src/gallium/drivers/llvmpipe/lp_bld_misc.cpp
new file mode 100644
index 0000000000..d3f78c06d9
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_misc.cpp
@@ -0,0 +1,61 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "pipe/p_config.h"
+
+#include "lp_bld_misc.h"
+
+
+#ifndef LLVM_NATIVE_ARCH
+
+namespace llvm {
+   extern void LinkInJIT();
+}
+
+
+void
+LLVMLinkInJIT(void)
+{
+   llvm::LinkInJIT();
+}
+
+
+extern "C" int X86TargetMachineModule;
+
+
+int
+LLVMInitializeNativeTarget(void)
+{
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   X86TargetMachineModule = 1;
+#endif
+   return 0;
+}
+
+
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_misc.h b/src/gallium/drivers/llvmpipe/lp_bld_misc.h
new file mode 100644
index 0000000000..0e787e0b9c
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_misc.h
@@ -0,0 +1,56 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#ifndef LP_BLD_MISC_H
+#define LP_BLD_MISC_H
+
+
+#include "llvm/Config/config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#ifndef LLVM_NATIVE_ARCH
+
+void
+LLVMLinkInJIT(void);
+
+int
+LLVMInitializeNativeTarget(void);
+
+#endif /* !LLVM_NATIVE_ARCH */
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* !LP_BLD_MISC_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_pack.c b/src/gallium/drivers/llvmpipe/lp_bld_pack.c
new file mode 100644
index 0000000000..bc360ad77a
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_pack.c
@@ -0,0 +1,418 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Helper functions for packing/unpacking.
+ *
+ * Pack/unpacking is necessary for conversion between types of different
+ * bit width.
+ *
+ * They are also commonly used when an computation needs higher
+ * precision for the intermediate values. For example, if one needs the
+ * function:
+ *
+ *   c = compute(a, b);
+ *
+ * to use more precision for intermediate results then one should implement it
+ * as:
+ *
+ *   LLVMValueRef
+ *   compute(LLVMBuilderRef builder struct lp_type type, LLVMValueRef a, LLVMValueRef b)
+ *   {
+ *      struct lp_type wide_type = lp_wider_type(type);
+ *      LLVMValueRef al, ah, bl, bh, cl, ch, c;
+ *
+ *      lp_build_unpack2(builder, type, wide_type, a, &al, &ah);
+ *      lp_build_unpack2(builder, type, wide_type, b, &bl, &bh);
+ *
+ *      cl = compute_half(al, bl);
+ *      ch = compute_half(ah, bh);
+ *
+ *      c = lp_build_pack2(bld->builder, wide_type, type, cl, ch);
+ *
+ *      return c;
+ *   }
+ *
+ * where compute_half() would do the computation for half the elements with
+ * twice the precision.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include "util/u_debug.h"
+#include "util/u_math.h"
+#include "util/u_cpu_detect.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_intr.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_pack.h"
+
+
+/**
+ * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
+ */
+static LLVMValueRef
+lp_build_const_unpack_shuffle(unsigned n, unsigned lo_hi)
+{
+   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+   unsigned i, j;
+
+   assert(n <= LP_MAX_VECTOR_LENGTH);
+   assert(lo_hi < 2);
+
+   /* TODO: cache results in a static table */
+
+   for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
+      elems[i + 0] = LLVMConstInt(LLVMInt32Type(), 0 + j, 0);
+      elems[i + 1] = LLVMConstInt(LLVMInt32Type(), n + j, 0);
+   }
+
+   return LLVMConstVector(elems, n);
+}
+
+
+/**
+ * Build shuffle vectors that match PACKxx instructions.
+ */
+static LLVMValueRef
+lp_build_const_pack_shuffle(unsigned n)
+{
+   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+   unsigned i;
+
+   assert(n <= LP_MAX_VECTOR_LENGTH);
+
+   /* TODO: cache results in a static table */
+
+   for(i = 0; i < n; ++i)
+      elems[i] = LLVMConstInt(LLVMInt32Type(), 2*i, 0);
+
+   return LLVMConstVector(elems, n);
+}
+
+
+/**
+ * Interleave vector elements.
+ *
+ * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions.
+ */
+LLVMValueRef
+lp_build_interleave2(LLVMBuilderRef builder,
+                     struct lp_type type,
+                     LLVMValueRef a,
+                     LLVMValueRef b,
+                     unsigned lo_hi)
+{
+   LLVMValueRef shuffle;
+
+   shuffle = lp_build_const_unpack_shuffle(type.length, lo_hi);
+
+   return LLVMBuildShuffleVector(builder, a, b, shuffle, "");
+}
+
+
+/**
+ * Double the bit width.
+ *
+ * This will only change the number of bits the values are represented, not the
+ * values themselves.
+ */
+void
+lp_build_unpack2(LLVMBuilderRef builder,
+                 struct lp_type src_type,
+                 struct lp_type dst_type,
+                 LLVMValueRef src,
+                 LLVMValueRef *dst_lo,
+                 LLVMValueRef *dst_hi)
+{
+   LLVMValueRef msb;
+   LLVMTypeRef dst_vec_type;
+
+   assert(!src_type.floating);
+   assert(!dst_type.floating);
+   assert(dst_type.width == src_type.width * 2);
+   assert(dst_type.length * 2 == src_type.length);
+
+   if(dst_type.sign && src_type.sign) {
+      /* Replicate the sign bit in the most significant bits */
+      msb = LLVMBuildAShr(builder, src, lp_build_int_const_scalar(src_type, src_type.width - 1), "");
+   }
+   else
+      /* Most significant bits always zero */
+      msb = lp_build_zero(src_type);
+
+   /* Interleave bits */
+   if(util_cpu_caps.little_endian) {
+      *dst_lo = lp_build_interleave2(builder, src_type, src, msb, 0);
+      *dst_hi = lp_build_interleave2(builder, src_type, src, msb, 1);
+   }
+   else {
+      *dst_lo = lp_build_interleave2(builder, src_type, msb, src, 0);
+      *dst_hi = lp_build_interleave2(builder, src_type, msb, src, 1);
+   }
+
+   /* Cast the result into the new type (twice as wide) */
+
+   dst_vec_type = lp_build_vec_type(dst_type);
+
+   *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, "");
+   *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, "");
+}
+
+
+/**
+ * Expand the bit width.
+ *
+ * This will only change the number of bits the values are represented, not the
+ * values themselves.
+ */
+void
+lp_build_unpack(LLVMBuilderRef builder,
+                struct lp_type src_type,
+                struct lp_type dst_type,
+                LLVMValueRef src,
+                LLVMValueRef *dst, unsigned num_dsts)
+{
+   unsigned num_tmps;
+   unsigned i;
+
+   /* Register width must remain constant */
+   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
+
+   /* We must not loose or gain channels. Only precision */
+   assert(src_type.length == dst_type.length * num_dsts);
+
+   num_tmps = 1;
+   dst[0] = src;
+
+   while(src_type.width < dst_type.width) {
+      struct lp_type tmp_type = src_type;
+
+      tmp_type.width *= 2;
+      tmp_type.length /= 2;
+
+      for(i = num_tmps; i--; ) {
+         lp_build_unpack2(builder, src_type, tmp_type, dst[i], &dst[2*i + 0], &dst[2*i + 1]);
+      }
+
+      src_type = tmp_type;
+
+      num_tmps *= 2;
+   }
+
+   assert(num_tmps == num_dsts);
+}
+
+
+/**
+ * Non-interleaved pack.
+ *
+ * This will move values as
+ *
+ *   lo =   __ l0 __ l1 __ l2 __..  __ ln
+ *   hi =   __ h0 __ h1 __ h2 __..  __ hn
+ *   res =  l0 l1 l2 .. ln h0 h1 h2 .. hn
+ *
+ * This will only change the number of bits the values are represented, not the
+ * values themselves.
+ *
+ * It is assumed the values are already clamped into the destination type range.
+ * Values outside that range will produce undefined results. Use
+ * lp_build_packs2 instead.
+ */
+LLVMValueRef
+lp_build_pack2(LLVMBuilderRef builder,
+               struct lp_type src_type,
+               struct lp_type dst_type,
+               LLVMValueRef lo,
+               LLVMValueRef hi)
+{
+   LLVMTypeRef src_vec_type = lp_build_vec_type(src_type);
+   LLVMTypeRef dst_vec_type = lp_build_vec_type(dst_type);
+   LLVMValueRef shuffle;
+   LLVMValueRef res;
+
+   dst_vec_type = lp_build_vec_type(dst_type);
+
+   assert(!src_type.floating);
+   assert(!dst_type.floating);
+   assert(src_type.width == dst_type.width * 2);
+   assert(src_type.length * 2 == dst_type.length);
+
+   if(util_cpu_caps.has_sse2 && src_type.width * src_type.length == 128) {
+      switch(src_type.width) {
+      case 32:
+         if(dst_type.sign) {
+            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", src_vec_type, lo, hi);
+         }
+         else {
+            if (util_cpu_caps.has_sse4_1) {
+               /* PACKUSDW is the only instrinsic with a consistent signature */
+               return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi);
+            }
+            else {
+               assert(0);
+               return LLVMGetUndef(dst_vec_type);
+            }
+         }
+         break;
+
+      case 16:
+         if(dst_type.sign)
+            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", src_vec_type, lo, hi);
+         else
+            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", src_vec_type, lo, hi);
+         break;
+
+      default:
+         assert(0);
+         return LLVMGetUndef(dst_vec_type);
+         break;
+      }
+
+      res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
+      return res;
+   }
+
+   lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
+   hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");
+
+   shuffle = lp_build_const_pack_shuffle(dst_type.length);
+
+   res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");
+
+   return res;
+}
+
+
+
+/**
+ * Non-interleaved pack and saturate.
+ *
+ * Same as lp_build_pack2 but will saturate values so that they fit into the
+ * destination type.
+ */
+LLVMValueRef
+lp_build_packs2(LLVMBuilderRef builder,
+                struct lp_type src_type,
+                struct lp_type dst_type,
+                LLVMValueRef lo,
+                LLVMValueRef hi)
+{
+   boolean clamp;
+
+   assert(!src_type.floating);
+   assert(!dst_type.floating);
+   assert(src_type.sign == dst_type.sign);
+   assert(src_type.width == dst_type.width * 2);
+   assert(src_type.length * 2 == dst_type.length);
+
+   clamp = TRUE;
+
+   /* All X86 SSE non-interleaved pack instructions take signed inputs and
+    * saturate them, so no need to clamp for those cases. */
+   if(util_cpu_caps.has_sse2 &&
+      src_type.width * src_type.length == 128 &&
+      src_type.sign)
+      clamp = FALSE;
+
+   if(clamp) {
+      struct lp_build_context bld;
+      unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
+      LLVMValueRef dst_max = lp_build_int_const_scalar(src_type, ((unsigned long long)1 << dst_bits) - 1);
+      lp_build_context_init(&bld, builder, src_type);
+      lo = lp_build_min(&bld, lo, dst_max);
+      hi = lp_build_min(&bld, hi, dst_max);
+      /* FIXME: What about lower bound? */
+   }
+
+   return lp_build_pack2(builder, src_type, dst_type, lo, hi);
+}
+
+
+/**
+ * Truncate the bit width.
+ *
+ * TODO: Handle saturation consistently.
+ */
+LLVMValueRef
+lp_build_pack(LLVMBuilderRef builder,
+              struct lp_type src_type,
+              struct lp_type dst_type,
+              boolean clamped,
+              const LLVMValueRef *src, unsigned num_srcs)
+{
+   LLVMValueRef (*pack2)(LLVMBuilderRef builder,
+                         struct lp_type src_type,
+                         struct lp_type dst_type,
+                         LLVMValueRef lo,
+                         LLVMValueRef hi);
+   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
+   unsigned i;
+
+
+   /* Register width must remain constant */
+   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
+
+   /* We must not loose or gain channels. Only precision */
+   assert(src_type.length * num_srcs == dst_type.length);
+
+   if(clamped)
+      pack2 = &lp_build_pack2;
+   else
+      pack2 = &lp_build_packs2;
+
+   for(i = 0; i < num_srcs; ++i)
+      tmp[i] = src[i];
+
+   while(src_type.width > dst_type.width) {
+      struct lp_type tmp_type = src_type;
+
+      tmp_type.width /= 2;
+      tmp_type.length *= 2;
+
+      /* Take in consideration the sign changes only in the last step */
+      if(tmp_type.width == dst_type.width)
+         tmp_type.sign = dst_type.sign;
+
+      num_srcs /= 2;
+
+      for(i = 0; i < num_srcs; ++i)
+         tmp[i] = pack2(builder, src_type, tmp_type, tmp[2*i + 0], tmp[2*i + 1]);
+
+      src_type = tmp_type;
+   }
+
+   assert(num_srcs == 1);
+
+   return tmp[0];
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_pack.h b/src/gallium/drivers/llvmpipe/lp_bld_pack.h
new file mode 100644
index 0000000000..fb2a34984a
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_pack.h
@@ -0,0 +1,95 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Helper functions for packing/unpacking conversions.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#ifndef LP_BLD_PACK_H
+#define LP_BLD_PACK_H
+
+
+#include <llvm-c/Core.h>  
+
+
+struct lp_type;
+
+
+LLVMValueRef
+lp_build_interleave2(LLVMBuilderRef builder,
+                     struct lp_type type,
+                     LLVMValueRef a,
+                     LLVMValueRef b,
+                     unsigned lo_hi);
+
+
+void
+lp_build_unpack2(LLVMBuilderRef builder,
+                 struct lp_type src_type,
+                 struct lp_type dst_type,
+                 LLVMValueRef src,
+                 LLVMValueRef *dst_lo,
+                 LLVMValueRef *dst_hi);
+
+
+void
+lp_build_unpack(LLVMBuilderRef builder,
+                struct lp_type src_type,
+                struct lp_type dst_type,
+                LLVMValueRef src,
+                LLVMValueRef *dst, unsigned num_dsts);
+
+
+LLVMValueRef
+lp_build_packs2(LLVMBuilderRef builder,
+                struct lp_type src_type,
+                struct lp_type dst_type,
+                LLVMValueRef lo,
+                LLVMValueRef hi);
+
+
+LLVMValueRef
+lp_build_pack2(LLVMBuilderRef builder,
+               struct lp_type src_type,
+               struct lp_type dst_type,
+               LLVMValueRef lo,
+               LLVMValueRef hi);
+
+
+LLVMValueRef
+lp_build_pack(LLVMBuilderRef builder,
+              struct lp_type src_type,
+              struct lp_type dst_type,
+              boolean clamped,
+              const LLVMValueRef *src, unsigned num_srcs);
+
+
+#endif /* !LP_BLD_PACK_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_sample.c b/src/gallium/drivers/llvmpipe/lp_bld_sample.c
new file mode 100644
index 0000000000..af70ddc6ab
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_sample.c
@@ -0,0 +1,190 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Texture sampling -- common code.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_const.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_type.h"
+#include "lp_bld_format.h"
+#include "lp_bld_sample.h"
+
+
+void
+lp_sampler_static_state(struct lp_sampler_static_state *state,
+                        const struct pipe_texture *texture,
+                        const struct pipe_sampler_state *sampler)
+{
+   memset(state, 0, sizeof *state);
+
+   if(!texture)
+      return;
+
+   if(!sampler)
+      return;
+
+   state->format            = texture->format;
+   state->target            = texture->target;
+   state->pot_width         = util_is_pot(texture->width0);
+   state->pot_height        = util_is_pot(texture->height0);
+   state->pot_depth         = util_is_pot(texture->depth0);
+
+   state->wrap_s            = sampler->wrap_s;
+   state->wrap_t            = sampler->wrap_t;
+   state->wrap_r            = sampler->wrap_r;
+   state->min_img_filter    = sampler->min_img_filter;
+   state->min_mip_filter    = sampler->min_mip_filter;
+   state->mag_img_filter    = sampler->mag_img_filter;
+   if(sampler->compare_mode) {
+      state->compare_mode      = sampler->compare_mode;
+      state->compare_func      = sampler->compare_func;
+   }
+   state->normalized_coords = sampler->normalized_coords;
+   state->prefilter         = sampler->prefilter;
+}
+
+
+/**
+ * Gather elements from scatter positions in memory into a single vector.
+ *
+ * @param src_width src element width
+ * @param dst_width result element width (source will be expanded to fit)
+ * @param length length of the offsets,
+ * @param base_ptr base pointer, should be a i8 pointer type.
+ * @param offsets vector with offsets
+ */
+LLVMValueRef
+lp_build_gather(LLVMBuilderRef builder,
+                unsigned length,
+                unsigned src_width,
+                unsigned dst_width,
+                LLVMValueRef base_ptr,
+                LLVMValueRef offsets)
+{
+   LLVMTypeRef src_type = LLVMIntType(src_width);
+   LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
+   LLVMTypeRef dst_elem_type = LLVMIntType(dst_width);
+   LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length);
+   LLVMValueRef res;
+   unsigned i;
+
+   res = LLVMGetUndef(dst_vec_type);
+   for(i = 0; i < length; ++i) {
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      LLVMValueRef elem_offset;
+      LLVMValueRef elem_ptr;
+      LLVMValueRef elem;
+
+      elem_offset = LLVMBuildExtractElement(builder, offsets, index, "");
+      elem_ptr = LLVMBuildGEP(builder, base_ptr, &elem_offset, 1, "");
+      elem_ptr = LLVMBuildBitCast(builder, elem_ptr, src_ptr_type, "");
+      elem = LLVMBuildLoad(builder, elem_ptr, "");
+
+      assert(src_width <= dst_width);
+      if(src_width > dst_width)
+         elem = LLVMBuildTrunc(builder, elem, dst_elem_type, "");
+      if(src_width < dst_width)
+         elem = LLVMBuildZExt(builder, elem, dst_elem_type, "");
+
+      res = LLVMBuildInsertElement(builder, res, elem, index, "");
+   }
+
+   return res;
+}
+
+
+/**
+ * Compute the offset of a pixel.
+ *
+ * x, y, y_stride are vectors
+ */
+LLVMValueRef
+lp_build_sample_offset(struct lp_build_context *bld,
+                       const struct util_format_description *format_desc,
+                       LLVMValueRef x,
+                       LLVMValueRef y,
+                       LLVMValueRef y_stride,
+                       LLVMValueRef data_ptr)
+{
+   LLVMValueRef x_stride;
+   LLVMValueRef offset;
+
+   x_stride = lp_build_const_scalar(bld->type, format_desc->block.bits/8);
+
+   if(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
+      LLVMValueRef x_lo, x_hi;
+      LLVMValueRef y_lo, y_hi;
+      LLVMValueRef x_stride_lo, x_stride_hi;
+      LLVMValueRef y_stride_lo, y_stride_hi;
+      LLVMValueRef x_offset_lo, x_offset_hi;
+      LLVMValueRef y_offset_lo, y_offset_hi;
+      LLVMValueRef offset_lo, offset_hi;
+
+      x_lo = LLVMBuildAnd(bld->builder, x, bld->one, "");
+      y_lo = LLVMBuildAnd(bld->builder, y, bld->one, "");
+
+      x_hi = LLVMBuildLShr(bld->builder, x, bld->one, "");
+      y_hi = LLVMBuildLShr(bld->builder, y, bld->one, "");
+
+      x_stride_lo = x_stride;
+      y_stride_lo = lp_build_const_scalar(bld->type, 2*format_desc->block.bits/8);
+
+      x_stride_hi = lp_build_const_scalar(bld->type, 4*format_desc->block.bits/8);
+      y_stride_hi = LLVMBuildShl(bld->builder, y_stride, bld->one, "");
+
+      x_offset_lo = lp_build_mul(bld, x_lo, x_stride_lo);
+      y_offset_lo = lp_build_mul(bld, y_lo, y_stride_lo);
+      offset_lo = lp_build_add(bld, x_offset_lo, y_offset_lo);
+
+      x_offset_hi = lp_build_mul(bld, x_hi, x_stride_hi);
+      y_offset_hi = lp_build_mul(bld, y_hi, y_stride_hi);
+      offset_hi = lp_build_add(bld, x_offset_hi, y_offset_hi);
+
+      offset = lp_build_add(bld, offset_hi, offset_lo);
+   }
+   else {
+      LLVMValueRef x_offset;
+      LLVMValueRef y_offset;
+
+      x_offset = lp_build_mul(bld, x, x_stride);
+      y_offset = lp_build_mul(bld, y, y_stride);
+
+      offset = lp_build_add(bld, x_offset, y_offset);
+   }
+
+   return offset;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_sample.h b/src/gallium/drivers/llvmpipe/lp_bld_sample.h
index 403d0e4836..8cb8210ca7 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_sample.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_sample.h
@@ -40,7 +40,9 @@
 
 struct pipe_texture;
 struct pipe_sampler_state;
+struct util_format_description;
 struct lp_type;
+struct lp_build_context;
 
 
 /**
@@ -119,6 +121,24 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
                         const struct pipe_sampler_state *sampler);
 
 
+LLVMValueRef
+lp_build_gather(LLVMBuilderRef builder,
+                unsigned length,
+                unsigned src_width,
+                unsigned dst_width,
+                LLVMValueRef base_ptr,
+                LLVMValueRef offsets);
+
+
+LLVMValueRef
+lp_build_sample_offset(struct lp_build_context *bld,
+                       const struct util_format_description *format_desc,
+                       LLVMValueRef x,
+                       LLVMValueRef y,
+                       LLVMValueRef y_stride,
+                       LLVMValueRef data_ptr);
+
+
 void
 lp_build_sample_soa(LLVMBuilderRef builder,
                     const struct lp_sampler_static_state *static_state,
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c
index 8ca1be6f1b..47b68b71e2 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c
@@ -27,7 +27,7 @@
 
 /**
  * @file
- * Texture sampling.
+ * Texture sampling -- SoA.
  *
  * @author Jose Fonseca <jfonseca@vmware.com>
  */
@@ -35,54 +35,23 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_state.h"
 #include "util/u_debug.h"
+#include "util/u_debug_dump.h"
 #include "util/u_memory.h"
 #include "util/u_math.h"
 #include "util/u_format.h"
+#include "util/u_cpu_detect.h"
 #include "lp_bld_debug.h"
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
+#include "lp_bld_conv.h"
 #include "lp_bld_arit.h"
 #include "lp_bld_logic.h"
 #include "lp_bld_swizzle.h"
+#include "lp_bld_pack.h"
 #include "lp_bld_format.h"
 #include "lp_bld_sample.h"
 
 
-void
-lp_sampler_static_state(struct lp_sampler_static_state *state,
-                        const struct pipe_texture *texture,
-                        const struct pipe_sampler_state *sampler)
-{
-   memset(state, 0, sizeof *state);
-
-   if(!texture)
-      return;
-
-   if(!sampler)
-      return;
-
-   state->format            = texture->format;
-   state->target            = texture->target;
-   state->pot_width         = util_is_pot(texture->width[0]);
-   state->pot_height        = util_is_pot(texture->height[0]);
-   state->pot_depth         = util_is_pot(texture->depth[0]);
-
-   state->wrap_s            = sampler->wrap_s;
-   state->wrap_t            = sampler->wrap_t;
-   state->wrap_r            = sampler->wrap_r;
-   state->min_img_filter    = sampler->min_img_filter;
-   state->min_mip_filter    = sampler->min_mip_filter;
-   state->mag_img_filter    = sampler->mag_img_filter;
-   if(sampler->compare_mode) {
-      state->compare_mode      = sampler->compare_mode;
-      state->compare_func      = sampler->compare_func;
-   }
-   state->normalized_coords = sampler->normalized_coords;
-   state->prefilter         = sampler->prefilter;
-}
-
-
-
 /**
  * Keep all information for sampling code generation in a single place.
  */
@@ -111,66 +80,61 @@ struct lp_build_sample_context
 
 
 static void
-lp_build_sample_texel(struct lp_build_sample_context *bld,
-                      LLVMValueRef x,
-                      LLVMValueRef y,
-                      LLVMValueRef y_stride,
-                      LLVMValueRef data_ptr,
-                      LLVMValueRef *texel)
+lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
+                          LLVMValueRef x,
+                          LLVMValueRef y,
+                          LLVMValueRef y_stride,
+                          LLVMValueRef data_ptr,
+                          LLVMValueRef *texel)
 {
-   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
-   LLVMValueRef x_stride;
    LLVMValueRef offset;
+   LLVMValueRef packed;
+
+   offset = lp_build_sample_offset(&bld->int_coord_bld,
+                                   bld->format_desc,
+                                   x, y, y_stride,
+                                   data_ptr);
+
+   assert(bld->format_desc->block.width == 1);
+   assert(bld->format_desc->block.height == 1);
+   assert(bld->format_desc->block.bits <= bld->texel_type.width);
+
+   packed = lp_build_gather(bld->builder,
+                            bld->texel_type.length,
+                            bld->format_desc->block.bits,
+                            bld->texel_type.width,
+                            data_ptr, offset);
+
+   lp_build_unpack_rgba_soa(bld->builder,
+                            bld->format_desc,
+                            bld->texel_type,
+                            packed, texel);
+}
 
-   x_stride = lp_build_const_scalar(bld->int_coord_type, bld->format_desc->block.bits/8);
-
-   if(bld->format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
-      LLVMValueRef x_lo, x_hi;
-      LLVMValueRef y_lo, y_hi;
-      LLVMValueRef x_stride_lo, x_stride_hi;
-      LLVMValueRef y_stride_lo, y_stride_hi;
-      LLVMValueRef x_offset_lo, x_offset_hi;
-      LLVMValueRef y_offset_lo, y_offset_hi;
-      LLVMValueRef offset_lo, offset_hi;
-
-      x_lo = LLVMBuildAnd(bld->builder, x, int_coord_bld->one, "");
-      y_lo = LLVMBuildAnd(bld->builder, y, int_coord_bld->one, "");
-
-      x_hi = LLVMBuildLShr(bld->builder, x, int_coord_bld->one, "");
-      y_hi = LLVMBuildLShr(bld->builder, y, int_coord_bld->one, "");
-
-      x_stride_lo = x_stride;
-      y_stride_lo = lp_build_const_scalar(bld->int_coord_type, 2*bld->format_desc->block.bits/8);
-
-      x_stride_hi = lp_build_const_scalar(bld->int_coord_type, 4*bld->format_desc->block.bits/8);
-      y_stride_hi = LLVMBuildShl(bld->builder, y_stride, int_coord_bld->one, "");
-
-      x_offset_lo = lp_build_mul(int_coord_bld, x_lo, x_stride_lo);
-      y_offset_lo = lp_build_mul(int_coord_bld, y_lo, y_stride_lo);
-      offset_lo = lp_build_add(int_coord_bld, x_offset_lo, y_offset_lo);
-
-      x_offset_hi = lp_build_mul(int_coord_bld, x_hi, x_stride_hi);
-      y_offset_hi = lp_build_mul(int_coord_bld, y_hi, y_stride_hi);
-      offset_hi = lp_build_add(int_coord_bld, x_offset_hi, y_offset_hi);
 
-      offset = lp_build_add(int_coord_bld, offset_hi, offset_lo);
-   }
-   else {
-      LLVMValueRef x_offset;
-      LLVMValueRef y_offset;
+static LLVMValueRef
+lp_build_sample_packed(struct lp_build_sample_context *bld,
+                       LLVMValueRef x,
+                       LLVMValueRef y,
+                       LLVMValueRef y_stride,
+                       LLVMValueRef data_ptr)
+{
+   LLVMValueRef offset;
 
-      x_offset = lp_build_mul(int_coord_bld, x, x_stride);
-      y_offset = lp_build_mul(int_coord_bld, y, y_stride);
+   offset = lp_build_sample_offset(&bld->int_coord_bld,
+                                   bld->format_desc,
+                                   x, y, y_stride,
+                                   data_ptr);
 
-      offset = lp_build_add(int_coord_bld, x_offset, y_offset);
-   }
+   assert(bld->format_desc->block.width == 1);
+   assert(bld->format_desc->block.height == 1);
+   assert(bld->format_desc->block.bits <= bld->texel_type.width);
 
-   lp_build_load_rgba_soa(bld->builder,
-                          bld->format_desc,
-                          bld->texel_type,
-                          data_ptr,
-                          offset,
-                          texel);
+   return lp_build_gather(bld->builder,
+                          bld->texel_type.length,
+                          bld->format_desc->block.bits,
+                          bld->texel_type.width,
+                          data_ptr, offset);
 }
 
 
@@ -208,7 +172,8 @@ lp_build_sample_wrap(struct lp_build_sample_context *bld,
    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
       /* FIXME */
-      _debug_printf("warning: failed to translate texture wrap mode %u\n", wrap_mode);
+      _debug_printf("warning: failed to translate texture wrap mode %s\n",
+                    debug_dump_tex_wrap(wrap_mode, TRUE));
       coord = lp_build_max(int_coord_bld, coord, int_coord_bld->zero);
       coord = lp_build_min(int_coord_bld, coord, length_minus_one);
       break;
@@ -240,7 +205,7 @@ lp_build_sample_2d_nearest_soa(struct lp_build_sample_context *bld,
    x = lp_build_sample_wrap(bld, x, width,  bld->static_state->pot_width,  bld->static_state->wrap_s);
    y = lp_build_sample_wrap(bld, y, height, bld->static_state->pot_height, bld->static_state->wrap_t);
 
-   lp_build_sample_texel(bld, x, y, stride, data_ptr, texel);
+   lp_build_sample_texel_soa(bld, x, y, stride, data_ptr, texel);
 }
 
 
@@ -274,8 +239,8 @@ lp_build_sample_2d_linear_soa(struct lp_build_sample_context *bld,
    s_fpart = lp_build_sub(&bld->coord_bld, s, s_ipart);
    t_fpart = lp_build_sub(&bld->coord_bld, t, t_ipart);
 
-   x0 = lp_build_int(&bld->coord_bld, s_ipart);
-   y0 = lp_build_int(&bld->coord_bld, t_ipart);
+   x0 = lp_build_itrunc(&bld->coord_bld, s_ipart);
+   y0 = lp_build_itrunc(&bld->coord_bld, t_ipart);
 
    x0 = lp_build_sample_wrap(bld, x0, width,  bld->static_state->pot_width,  bld->static_state->wrap_s);
    y0 = lp_build_sample_wrap(bld, y0, height, bld->static_state->pot_height, bld->static_state->wrap_t);
@@ -286,10 +251,10 @@ lp_build_sample_2d_linear_soa(struct lp_build_sample_context *bld,
    x1 = lp_build_sample_wrap(bld, x1, width,  bld->static_state->pot_width,  bld->static_state->wrap_s);
    y1 = lp_build_sample_wrap(bld, y1, height, bld->static_state->pot_height, bld->static_state->wrap_t);
 
-   lp_build_sample_texel(bld, x0, y0, stride, data_ptr, neighbors[0][0]);
-   lp_build_sample_texel(bld, x1, y0, stride, data_ptr, neighbors[0][1]);
-   lp_build_sample_texel(bld, x0, y1, stride, data_ptr, neighbors[1][0]);
-   lp_build_sample_texel(bld, x1, y1, stride, data_ptr, neighbors[1][1]);
+   lp_build_sample_texel_soa(bld, x0, y0, stride, data_ptr, neighbors[0][0]);
+   lp_build_sample_texel_soa(bld, x1, y0, stride, data_ptr, neighbors[0][1]);
+   lp_build_sample_texel_soa(bld, x0, y1, stride, data_ptr, neighbors[1][0]);
+   lp_build_sample_texel_soa(bld, x1, y1, stride, data_ptr, neighbors[1][1]);
 
    /* TODO: Don't interpolate missing channels */
    for(chan = 0; chan < 4; ++chan) {
@@ -304,6 +269,217 @@ lp_build_sample_2d_linear_soa(struct lp_build_sample_context *bld,
 
 
 static void
+lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder,
+                          struct lp_type dst_type,
+                          LLVMValueRef packed,
+                          LLVMValueRef *rgba)
+{
+   LLVMValueRef mask = lp_build_int_const_scalar(dst_type, 0xff);
+   unsigned chan;
+
+   /* Decode the input vector components */
+   for (chan = 0; chan < 4; ++chan) {
+      unsigned start = chan*8;
+      unsigned stop = start + 8;
+      LLVMValueRef input;
+
+      input = packed;
+
+      if(start)
+         input = LLVMBuildLShr(builder, input, lp_build_int_const_scalar(dst_type, start), "");
+
+      if(stop < 32)
+         input = LLVMBuildAnd(builder, input, mask, "");
+
+      input = lp_build_unsigned_norm_to_float(builder, 8, dst_type, input);
+
+      rgba[chan] = input;
+   }
+}
+
+
+static void
+lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
+                              LLVMValueRef s,
+                              LLVMValueRef t,
+                              LLVMValueRef width,
+                              LLVMValueRef height,
+                              LLVMValueRef stride,
+                              LLVMValueRef data_ptr,
+                              LLVMValueRef *texel)
+{
+   LLVMBuilderRef builder = bld->builder;
+   struct lp_build_context i32, h16, u8n;
+   LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type;
+   LLVMValueRef i32_c8, i32_c128, i32_c255;
+   LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi;
+   LLVMValueRef t_ipart, t_fpart, t_fpart_lo, t_fpart_hi;
+   LLVMValueRef x0, x1;
+   LLVMValueRef y0, y1;
+   LLVMValueRef neighbors[2][2];
+   LLVMValueRef neighbors_lo[2][2];
+   LLVMValueRef neighbors_hi[2][2];
+   LLVMValueRef packed, packed_lo, packed_hi;
+   LLVMValueRef unswizzled[4];
+
+   lp_build_context_init(&i32, builder, lp_type_int(32));
+   lp_build_context_init(&h16, builder, lp_type_ufixed(16));
+   lp_build_context_init(&u8n, builder, lp_type_unorm(8));
+
+   i32_vec_type = lp_build_vec_type(i32.type);
+   h16_vec_type = lp_build_vec_type(h16.type);
+   u8n_vec_type = lp_build_vec_type(u8n.type);
+
+   s = lp_build_mul_imm(&bld->coord_bld, s, 256);
+   t = lp_build_mul_imm(&bld->coord_bld, t, 256);
+
+   s = LLVMBuildFPToSI(builder, s, i32_vec_type, "");
+   t = LLVMBuildFPToSI(builder, t, i32_vec_type, "");
+
+   i32_c128 = lp_build_int_const_scalar(i32.type, -128);
+   s = LLVMBuildAdd(builder, s, i32_c128, "");
+   t = LLVMBuildAdd(builder, t, i32_c128, "");
+
+   i32_c8 = lp_build_int_const_scalar(i32.type, 8);
+   s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
+   t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
+
+   i32_c255 = lp_build_int_const_scalar(i32.type, 255);
+   s_fpart = LLVMBuildAnd(builder, s, i32_c255, "");
+   t_fpart = LLVMBuildAnd(builder, t, i32_c255, "");
+
+   x0 = s_ipart;
+   y0 = t_ipart;
+
+   x0 = lp_build_sample_wrap(bld, x0, width,  bld->static_state->pot_width,  bld->static_state->wrap_s);
+   y0 = lp_build_sample_wrap(bld, y0, height, bld->static_state->pot_height, bld->static_state->wrap_t);
+
+   x1 = lp_build_add(&bld->int_coord_bld, x0, bld->int_coord_bld.one);
+   y1 = lp_build_add(&bld->int_coord_bld, y0, bld->int_coord_bld.one);
+
+   x1 = lp_build_sample_wrap(bld, x1, width,  bld->static_state->pot_width,  bld->static_state->wrap_s);
+   y1 = lp_build_sample_wrap(bld, y1, height, bld->static_state->pot_height, bld->static_state->wrap_t);
+
+   /*
+    * Transform 4 x i32 in
+    *
+    *   s_fpart = {s0, s1, s2, s3}
+    *
+    * into 8 x i16
+    *
+    *   s_fpart = {00, s0, 00, s1, 00, s2, 00, s3}
+    *
+    * into two 8 x i16
+    *
+    *   s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1}
+    *   s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3}
+    *
+    * and likewise for t_fpart. There is no risk of loosing precision here
+    * since the fractional parts only use the lower 8bits.
+    */
+
+   s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, "");
+   t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, "");
+
+   {
+      LLVMTypeRef elem_type = LLVMInt32Type();
+      LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH];
+      LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH];
+      LLVMValueRef shuffle_lo;
+      LLVMValueRef shuffle_hi;
+      unsigned i, j;
+
+      for(j = 0; j < h16.type.length; j += 4) {
+         unsigned subindex = util_cpu_caps.little_endian ? 0 : 1;
+         LLVMValueRef index;
+
+         index = LLVMConstInt(elem_type, j/2 + subindex, 0);
+         for(i = 0; i < 4; ++i)
+            shuffles_lo[j + i] = index;
+
+         index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0);
+         for(i = 0; i < 4; ++i)
+            shuffles_hi[j + i] = index;
+      }
+
+      shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length);
+      shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length);
+
+      s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, shuffle_lo, "");
+      t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, shuffle_lo, "");
+      s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, shuffle_hi, "");
+      t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, shuffle_hi, "");
+   }
+
+   /*
+    * Fetch the pixels as 4 x 32bit (rgba order might differ):
+    *
+    *   rgba0 rgba1 rgba2 rgba3
+    *
+    * bit cast them into 16 x u8
+    *
+    *   r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
+    *
+    * unpack them into two 8 x i16:
+    *
+    *   r0 g0 b0 a0 r1 g1 b1 a1
+    *   r2 g2 b2 a2 r3 g3 b3 a3
+    *
+    * The higher 8 bits of the resulting elements will be zero.
+    */
+
+   neighbors[0][0] = lp_build_sample_packed(bld, x0, y0, stride, data_ptr);
+   neighbors[0][1] = lp_build_sample_packed(bld, x1, y0, stride, data_ptr);
+   neighbors[1][0] = lp_build_sample_packed(bld, x0, y1, stride, data_ptr);
+   neighbors[1][1] = lp_build_sample_packed(bld, x1, y1, stride, data_ptr);
+
+   neighbors[0][0] = LLVMBuildBitCast(builder, neighbors[0][0], u8n_vec_type, "");
+   neighbors[0][1] = LLVMBuildBitCast(builder, neighbors[0][1], u8n_vec_type, "");
+   neighbors[1][0] = LLVMBuildBitCast(builder, neighbors[1][0], u8n_vec_type, "");
+   neighbors[1][1] = LLVMBuildBitCast(builder, neighbors[1][1], u8n_vec_type, "");
+
+   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[0][0], &neighbors_lo[0][0], &neighbors_hi[0][0]);
+   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[0][1], &neighbors_lo[0][1], &neighbors_hi[0][1]);
+   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[1][0], &neighbors_lo[1][0], &neighbors_hi[1][0]);
+   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[1][1], &neighbors_lo[1][1], &neighbors_hi[1][1]);
+
+   /*
+    * Linear interpolate with 8.8 fixed point.
+    */
+
+   packed_lo = lp_build_lerp_2d(&h16,
+                                s_fpart_lo, t_fpart_lo,
+                                neighbors_lo[0][0],
+                                neighbors_lo[0][1],
+                                neighbors_lo[1][0],
+                                neighbors_lo[1][1]);
+
+   packed_hi = lp_build_lerp_2d(&h16,
+                                s_fpart_hi, t_fpart_hi,
+                                neighbors_hi[0][0],
+                                neighbors_hi[0][1],
+                                neighbors_hi[1][0],
+                                neighbors_hi[1][1]);
+
+   packed = lp_build_pack2(builder, h16.type, u8n.type, packed_lo, packed_hi);
+
+   /*
+    * Convert to SoA and swizzle.
+    */
+
+   packed = LLVMBuildBitCast(builder, packed, i32_vec_type, "");
+
+   lp_build_rgba8_to_f32_soa(bld->builder,
+                             bld->texel_type,
+                             packed, unswizzled);
+
+   lp_build_format_swizzle_soa(bld->format_desc,
+                               bld->texel_type, unswizzled,
+                               texel);
+}
+
+
+static void
 lp_build_sample_compare(struct lp_build_sample_context *bld,
                         LLVMValueRef p,
                         LLVMValueRef *texel)
@@ -402,7 +578,10 @@ lp_build_sample_soa(LLVMBuilderRef builder,
       break;
    case PIPE_TEX_FILTER_LINEAR:
    case PIPE_TEX_FILTER_ANISO:
-      lp_build_sample_2d_linear_soa(&bld, s, t, width, height, stride, data_ptr, texel);
+      if(lp_format_is_rgba8(bld.format_desc))
+         lp_build_sample_2d_linear_aos(&bld, s, t, width, height, stride, data_ptr, texel);
+      else
+         lp_build_sample_2d_linear_soa(&bld, s, t, width, height, stride, data_ptr, texel);
       break;
    default:
       assert(0);
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_swizzle.h b/src/gallium/drivers/llvmpipe/lp_bld_swizzle.h
index 1f6da80448..b9472127a6 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_swizzle.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_swizzle.h
@@ -40,7 +40,7 @@
 #include <llvm-c/Core.h>  
 
 
-struct lp_type type;
+struct lp_type;
 struct lp_build_context;
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
index adc81569ed..3eb0e0c57c 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
@@ -64,7 +64,7 @@
    for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
 
 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
-   ((INST)->FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
+   ((INST)->Dst[0].Register.WriteMask & (1 << (CHAN)))
 
 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
    if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
@@ -157,19 +157,19 @@ emit_fetch(
    unsigned index,
    const unsigned chan_index )
 {
-   const struct tgsi_full_src_register *reg = &inst->FullSrcRegisters[index];
-   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
+   const struct tgsi_full_src_register *reg = &inst->Src[index];
+   unsigned swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
    LLVMValueRef res;
 
    switch (swizzle) {
-   case TGSI_EXTSWIZZLE_X:
-   case TGSI_EXTSWIZZLE_Y:
-   case TGSI_EXTSWIZZLE_Z:
-   case TGSI_EXTSWIZZLE_W:
+   case TGSI_SWIZZLE_X:
+   case TGSI_SWIZZLE_Y:
+   case TGSI_SWIZZLE_Z:
+   case TGSI_SWIZZLE_W:
 
-      switch (reg->SrcRegister.File) {
+      switch (reg->Register.File) {
       case TGSI_FILE_CONSTANT: {
-         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), reg->SrcRegister.Index*4 + swizzle, 0);
+         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), reg->Register.Index*4 + swizzle, 0);
          LLVMValueRef scalar_ptr = LLVMBuildGEP(bld->base.builder, bld->consts_ptr, &index, 1, "");
          LLVMValueRef scalar = LLVMBuildLoad(bld->base.builder, scalar_ptr, "");
          res = lp_build_broadcast_scalar(&bld->base, scalar);
@@ -177,17 +177,17 @@ emit_fetch(
       }
 
       case TGSI_FILE_IMMEDIATE:
-         res = bld->immediates[reg->SrcRegister.Index][swizzle];
+         res = bld->immediates[reg->Register.Index][swizzle];
          assert(res);
          break;
 
       case TGSI_FILE_INPUT:
-         res = bld->inputs[reg->SrcRegister.Index][swizzle];
+         res = bld->inputs[reg->Register.Index][swizzle];
          assert(res);
          break;
 
       case TGSI_FILE_TEMPORARY:
-         res = bld->temps[reg->SrcRegister.Index][swizzle];
+         res = bld->temps[reg->Register.Index][swizzle];
          if(!res)
             return bld->base.undef;
          break;
@@ -198,14 +198,6 @@ emit_fetch(
       }
       break;
 
-   case TGSI_EXTSWIZZLE_ZERO:
-      res = bld->base.zero;
-      break;
-
-   case TGSI_EXTSWIZZLE_ONE:
-      res = bld->base.one;
-      break;
-
    default:
       assert( 0 );
       return bld->base.undef;
@@ -275,7 +267,7 @@ emit_store(
    unsigned chan_index,
    LLVMValueRef value)
 {
-   const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[index];
+   const struct tgsi_full_dst_register *reg = &inst->Dst[index];
 
    switch( inst->Instruction.Saturate ) {
    case TGSI_SAT_NONE:
@@ -295,13 +287,13 @@ emit_store(
       assert(0);
    }
 
-   switch( reg->DstRegister.File ) {
+   switch( reg->Register.File ) {
    case TGSI_FILE_OUTPUT:
-      bld->outputs[reg->DstRegister.Index][chan_index] = value;
+      bld->outputs[reg->Register.Index][chan_index] = value;
       break;
 
    case TGSI_FILE_TEMPORARY:
-      bld->temps[reg->DstRegister.Index][chan_index] = value;
+      bld->temps[reg->Register.Index][chan_index] = value;
       break;
 
    case TGSI_FILE_ADDRESS:
@@ -327,14 +319,14 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
           boolean projected,
           LLVMValueRef *texel)
 {
-   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+   const uint unit = inst->Src[1].Register.Index;
    LLVMValueRef lodbias;
    LLVMValueRef oow;
    LLVMValueRef coords[3];
    unsigned num_coords;
    unsigned i;
 
-   switch (inst->InstructionExtTexture.Texture) {
+   switch (inst->Texture.Texture) {
    case TGSI_TEXTURE_1D:
       num_coords = 1;
       break;
@@ -383,7 +375,7 @@ emit_kil(
    struct lp_build_tgsi_soa_context *bld,
    const struct tgsi_full_instruction *inst )
 {
-   const struct tgsi_full_src_register *reg = &inst->FullSrcRegisters[0];
+   const struct tgsi_full_src_register *reg = &inst->Src[0];
    LLVMValueRef terms[NUM_CHANNELS];
    LLVMValueRef mask;
    unsigned chan_index;
@@ -394,12 +386,7 @@ emit_kil(
       unsigned swizzle;
 
       /* Unswizzle channel */
-      swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
-
-      /* Note that we test if the value is less than zero, so 1.0 and 0.0 need
-       * not to be tested. */
-      if(swizzle == TGSI_EXTSWIZZLE_ZERO || swizzle == TGSI_EXTSWIZZLE_ONE)
-         continue;
+      swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
 
       /* Check if the component has not been already tested. */
       assert(swizzle < NUM_CHANNELS);
@@ -436,15 +423,15 @@ indirect_temp_reference(const struct tgsi_full_instruction *inst)
 {
    uint i;
    for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
-      const struct tgsi_full_src_register *reg = &inst->FullSrcRegisters[i];
-      if (reg->SrcRegister.File == TGSI_FILE_TEMPORARY &&
-          reg->SrcRegister.Indirect)
+      const struct tgsi_full_src_register *reg = &inst->Src[i];
+      if (reg->Register.File == TGSI_FILE_TEMPORARY &&
+          reg->Register.Indirect)
          return TRUE;
    }
    for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
-      const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[i];
-      if (reg->DstRegister.File == TGSI_FILE_TEMPORARY &&
-          reg->DstRegister.Indirect)
+      const struct tgsi_full_dst_register *reg = &inst->Dst[i];
+      if (reg->Register.File == TGSI_FILE_TEMPORARY &&
+          reg->Register.Indirect)
          return TRUE;
    }
    return FALSE;
@@ -488,7 +475,6 @@ emit_instruction(
 #endif
 
    case TGSI_OPCODE_MOV:
-   case TGSI_OPCODE_SWZ:
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          dst0[chan_index] = emit_fetch( bld, inst, 0, chan_index );
       }
@@ -574,9 +560,9 @@ emit_instruction(
       if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ||
           IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ||
           IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z )) {
-         LLVMValueRef *p_floor_log2;
-         LLVMValueRef *p_exp;
-         LLVMValueRef *p_log2;
+         LLVMValueRef *p_floor_log2 = NULL;
+         LLVMValueRef *p_exp = NULL;
+         LLVMValueRef *p_log2 = NULL;
 
          src0 = emit_fetch( bld, inst, 0, CHAN_X );
          src0 = lp_build_abs( &bld->base, src0 );
@@ -1386,15 +1372,6 @@ emit_instruction(
       return 0;
       break;
 
-   case TGSI_OPCODE_NOISE1:
-   case TGSI_OPCODE_NOISE2:
-   case TGSI_OPCODE_NOISE3:
-   case TGSI_OPCODE_NOISE4:
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         dst0[chan_index] = bld->base.zero;
-      }
-      break;
-
    case TGSI_OPCODE_NOP:
       break;
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_type.c b/src/gallium/drivers/llvmpipe/lp_bld_type.c
index 606243d6c5..1320a26721 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_type.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_type.c
@@ -160,12 +160,31 @@ lp_build_int_vec_type(struct lp_type type)
 struct lp_type
 lp_int_type(struct lp_type type)
 {
-   struct lp_type int_type;
+   struct lp_type res_type;
 
-   memset(&int_type, 0, sizeof int_type);
-   int_type.width = type.width;
-   int_type.length = type.length;
-   return int_type;
+   memset(&res_type, 0, sizeof res_type);
+   res_type.width = type.width;
+   res_type.length = type.length;
+
+   return res_type;
+}
+
+
+/**
+ * Return the type with twice the bit width (hence half the number of elements).
+ */
+struct lp_type
+lp_wider_type(struct lp_type type)
+{
+   struct lp_type res_type;
+
+   memcpy(&res_type, &type, sizeof res_type);
+   res_type.width *= 2;
+   res_type.length /= 2;
+
+   assert(res_type.length);
+
+   return res_type;
 }
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_type.h b/src/gallium/drivers/llvmpipe/lp_bld_type.h
index ee5ca3483c..2fb233d335 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_type.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_type.h
@@ -43,13 +43,18 @@
 
 
 /**
+ * Native SIMD register width.
+ *
+ * 128 for all architectures we care about.
+ */
+#define LP_NATIVE_VECTOR_WIDTH 128
+
+/**
  * Several functions can only cope with vectors of length up to this value.
  * You may need to increase that value if you want to represent bigger vectors.
  */
 #define LP_MAX_VECTOR_LENGTH 16
 
-#define LP_MAX_TYPE_WIDTH 64
-
 
 /**
  * The LLVM type system can't conveniently express all the things we care about
@@ -134,6 +139,91 @@ struct lp_build_context
 };
 
 
+static INLINE struct lp_type
+lp_type_float(unsigned width)
+{
+   struct lp_type res_type;
+
+   memset(&res_type, 0, sizeof res_type);
+   res_type.floating = TRUE;
+   res_type.sign = TRUE;
+   res_type.width = width;
+   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+
+   return res_type;
+}
+
+
+static INLINE struct lp_type
+lp_type_int(unsigned width)
+{
+   struct lp_type res_type;
+
+   memset(&res_type, 0, sizeof res_type);
+   res_type.sign = TRUE;
+   res_type.width = width;
+   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+
+   return res_type;
+}
+
+
+static INLINE struct lp_type
+lp_type_uint(unsigned width)
+{
+   struct lp_type res_type;
+
+   memset(&res_type, 0, sizeof res_type);
+   res_type.width = width;
+   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+
+   return res_type;
+}
+
+
+static INLINE struct lp_type
+lp_type_unorm(unsigned width)
+{
+   struct lp_type res_type;
+
+   memset(&res_type, 0, sizeof res_type);
+   res_type.norm = TRUE;
+   res_type.width = width;
+   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+
+   return res_type;
+}
+
+
+static INLINE struct lp_type
+lp_type_fixed(unsigned width)
+{
+   struct lp_type res_type;
+
+   memset(&res_type, 0, sizeof res_type);
+   res_type.sign = TRUE;
+   res_type.fixed = TRUE;
+   res_type.width = width;
+   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+
+   return res_type;
+}
+
+
+static INLINE struct lp_type
+lp_type_ufixed(unsigned width)
+{
+   struct lp_type res_type;
+
+   memset(&res_type, 0, sizeof res_type);
+   res_type.fixed = TRUE;
+   res_type.width = width;
+   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+
+   return res_type;
+}
+
+
 LLVMTypeRef
 lp_build_elem_type(struct lp_type type);
 
@@ -166,6 +256,10 @@ struct lp_type
 lp_int_type(struct lp_type type);
 
 
+struct lp_type
+lp_wider_type(struct lp_type type);
+
+
 void
 lp_build_context_init(struct lp_build_context *bld,
                       LLVMBuilderRef builder,
diff --git a/src/gallium/drivers/llvmpipe/lp_clear.c b/src/gallium/drivers/llvmpipe/lp_clear.c
index bdcff94b9b..08d9f2e273 100644
--- a/src/gallium/drivers/llvmpipe/lp_clear.c
+++ b/src/gallium/drivers/llvmpipe/lp_clear.c
@@ -50,6 +50,7 @@ llvmpipe_clear(struct pipe_context *pipe, unsigned buffers, const float *rgba,
                double depth, unsigned stencil)
 {
    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+   union util_color uc;
    unsigned cv;
    uint i;
 
@@ -64,8 +65,8 @@ llvmpipe_clear(struct pipe_context *pipe, unsigned buffers, const float *rgba,
       for (i = 0; i < llvmpipe->framebuffer.nr_cbufs; i++) {
          struct pipe_surface *ps = llvmpipe->framebuffer.cbufs[i];
 
-         util_pack_color(rgba, ps->format, &cv);
-         lp_tile_cache_clear(llvmpipe->cbuf_cache[i], rgba, cv);
+         util_pack_color(rgba, ps->format, &uc);
+         lp_tile_cache_clear(llvmpipe->cbuf_cache[i], rgba, uc.ui);
       }
       llvmpipe->dirty_render_cache = TRUE;
    }
diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c
index a4b2bd8c2a..679e244274 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.c
+++ b/src/gallium/drivers/llvmpipe/lp_context.c
@@ -31,13 +31,13 @@
  */
 
 #include "draw/draw_context.h"
+#include "draw/draw_vbuf.h"
 #include "pipe/p_defines.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 #include "lp_clear.h"
 #include "lp_context.h"
 #include "lp_flush.h"
-#include "lp_prim_setup.h"
 #include "lp_prim_vbuf.h"
 #include "lp_state.h"
 #include "lp_surface.h"
@@ -107,11 +107,21 @@ static void llvmpipe_destroy( struct pipe_context *pipe )
    if (llvmpipe->draw)
       draw_destroy( llvmpipe->draw );
 
-   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++)
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
       lp_destroy_tile_cache(llvmpipe->cbuf_cache[i]);
+      pipe_surface_reference(&llvmpipe->framebuffer.cbufs[i], NULL);
+   }
+   pipe_surface_reference(&llvmpipe->framebuffer.zsbuf, NULL);
 
-   for (i = 0; i < PIPE_MAX_SAMPLERS; i++)
+   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
       lp_destroy_tex_tile_cache(llvmpipe->tex_cache[i]);
+      pipe_texture_reference(&llvmpipe->texture[i], NULL);
+   }
+
+   for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
+      lp_destroy_tex_tile_cache(llvmpipe->vertex_tex_cache[i]);
+      pipe_texture_reference(&llvmpipe->vertex_textures[i], NULL);
+   }
 
    for (i = 0; i < Elements(llvmpipe->constants); i++) {
       if (llvmpipe->constants[i].buffer) {
@@ -140,6 +150,11 @@ llvmpipe_is_texture_referenced( struct pipe_context *pipe,
          llvmpipe->framebuffer.zsbuf->texture == texture)
          return PIPE_REFERENCED_FOR_WRITE;
    }
+   for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
+      if (llvmpipe->vertex_tex_cache[i] &&
+          llvmpipe->vertex_tex_cache[i]->texture == texture)
+         return PIPE_REFERENCED_FOR_READ;
+   }
    
    return PIPE_UNREFERENCED;
 }
@@ -175,7 +190,8 @@ llvmpipe_create( struct pipe_screen *screen )
    llvmpipe->pipe.delete_blend_state = llvmpipe_delete_blend_state;
 
    llvmpipe->pipe.create_sampler_state = llvmpipe_create_sampler_state;
-   llvmpipe->pipe.bind_sampler_states  = llvmpipe_bind_sampler_states;
+   llvmpipe->pipe.bind_fragment_sampler_states  = llvmpipe_bind_sampler_states;
+   llvmpipe->pipe.bind_vertex_sampler_states  = llvmpipe_bind_vertex_sampler_states;
    llvmpipe->pipe.delete_sampler_state = llvmpipe_delete_sampler_state;
 
    llvmpipe->pipe.create_depth_stencil_alpha_state = llvmpipe_create_depth_stencil_state;
@@ -200,7 +216,8 @@ llvmpipe_create( struct pipe_screen *screen )
    llvmpipe->pipe.set_framebuffer_state = llvmpipe_set_framebuffer_state;
    llvmpipe->pipe.set_polygon_stipple = llvmpipe_set_polygon_stipple;
    llvmpipe->pipe.set_scissor_state = llvmpipe_set_scissor_state;
-   llvmpipe->pipe.set_sampler_textures = llvmpipe_set_sampler_textures;
+   llvmpipe->pipe.set_fragment_sampler_textures = llvmpipe_set_sampler_textures;
+   llvmpipe->pipe.set_vertex_sampler_textures = llvmpipe_set_vertex_sampler_textures;
    llvmpipe->pipe.set_viewport_state = llvmpipe_set_viewport_state;
 
    llvmpipe->pipe.set_vertex_buffers = llvmpipe_set_vertex_buffers;
@@ -229,13 +246,15 @@ llvmpipe_create( struct pipe_screen *screen )
 
    for (i = 0; i < PIPE_MAX_SAMPLERS; i++)
       llvmpipe->tex_cache[i] = lp_create_tex_tile_cache( screen );
+   for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++)
+      llvmpipe->vertex_tex_cache[i] = lp_create_tex_tile_cache(screen);
 
 
    /* vertex shader samplers */
-   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
+   for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
       llvmpipe->tgsi.vert_samplers[i].base.get_samples = lp_get_samples;
       llvmpipe->tgsi.vert_samplers[i].processor = TGSI_PROCESSOR_VERTEX;
-      llvmpipe->tgsi.vert_samplers[i].cache = llvmpipe->tex_cache[i];
+      llvmpipe->tgsi.vert_samplers[i].cache = llvmpipe->vertex_tex_cache[i];
       llvmpipe->tgsi.vert_samplers_list[i] = &llvmpipe->tgsi.vert_samplers[i];
    }
 
@@ -255,25 +274,25 @@ llvmpipe_create( struct pipe_screen *screen )
       goto fail;
 
    draw_texture_samplers(llvmpipe->draw,
-                         PIPE_MAX_SAMPLERS,
+                         PIPE_MAX_VERTEX_SAMPLERS,
                          (struct tgsi_sampler **)
                             llvmpipe->tgsi.vert_samplers_list);
 
-   llvmpipe->setup = lp_draw_render_stage(llvmpipe);
-   if (!llvmpipe->setup)
-      goto fail;
-
    if (debug_get_bool_option( "LP_NO_RAST", FALSE ))
       llvmpipe->no_rast = TRUE;
 
-   if (debug_get_bool_option( "LP_NO_VBUF", FALSE )) {
-      /* Deprecated path -- vbuf is the intended interface to the draw module:
-       */
-      draw_set_rasterize_stage(llvmpipe->draw, llvmpipe->setup);
-   }
-   else {
-      lp_init_vbuf(llvmpipe);
-   }
+   llvmpipe->vbuf_backend = lp_create_vbuf_backend(llvmpipe);
+   if (!llvmpipe->vbuf_backend)
+      goto fail;
+
+   llvmpipe->vbuf = draw_vbuf_stage(llvmpipe->draw, llvmpipe->vbuf_backend);
+   if (!llvmpipe->vbuf)
+      goto fail;
+
+   draw_set_rasterize_stage(llvmpipe->draw, llvmpipe->vbuf);
+   draw_set_render(llvmpipe->draw, llvmpipe->vbuf_backend);
+
+
 
    /* plug in AA line/point stages */
    draw_install_aaline_stage(llvmpipe->draw, &llvmpipe->pipe);
diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h
index 8d5a0d4f1f..cc4d5ad5fd 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -55,6 +55,7 @@ struct llvmpipe_context {
    /** Constant state objects */
    const struct pipe_blend_state *blend;
    const struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS];
+   struct pipe_sampler_state *vertex_samplers[PIPE_MAX_VERTEX_SAMPLERS];
    const struct pipe_depth_stencil_alpha_state *depth_stencil;
    const struct pipe_rasterizer_state *rasterizer;
    struct lp_fragment_shader *fs;
@@ -68,12 +69,15 @@ struct llvmpipe_context {
    struct pipe_poly_stipple poly_stipple;
    struct pipe_scissor_state scissor;
    struct pipe_texture *texture[PIPE_MAX_SAMPLERS];
+   struct pipe_texture *vertex_textures[PIPE_MAX_VERTEX_SAMPLERS];
    struct pipe_viewport_state viewport;
    struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
    struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
 
    unsigned num_samplers;
    unsigned num_textures;
+   unsigned num_vertex_samplers;
+   unsigned num_vertex_textures;
    unsigned num_vertex_elements;
    unsigned num_vertex_buffers;
 
@@ -88,9 +92,6 @@ struct llvmpipe_context {
    /** Mapped vertex buffers */
    ubyte *mapped_vbuffer[PIPE_MAX_ATTRIBS];
    
-   /** Mapped constant buffers */
-   void *mapped_constants[PIPE_SHADER_TYPES];
-
    /** Vertex format */
    struct vertex_info vertex_info;
    struct vertex_info vertex_info_vbuf;
@@ -124,9 +125,10 @@ struct llvmpipe_context {
 
    /** The primitive drawing context */
    struct draw_context *draw;
-   struct draw_stage *setup;
+
+   /** Draw module backend */
+   struct vbuf_render *vbuf_backend;
    struct draw_stage *vbuf;
-   struct llvmpipe_vbuf_render *vbuf_render;
 
    boolean dirty_render_cache;
    
@@ -138,6 +140,7 @@ struct llvmpipe_context {
 
    unsigned tex_timestamp;
    struct llvmpipe_tex_tile_cache *tex_cache[PIPE_MAX_SAMPLERS];
+   struct llvmpipe_tex_tile_cache *vertex_tex_cache[PIPE_MAX_VERTEX_SAMPLERS];
 
    unsigned no_rast : 1;
 
diff --git a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
index 89772e62d3..0aa13a1fc6 100644
--- a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
+++ b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
@@ -45,54 +45,6 @@
 
 
 
-static void
-llvmpipe_map_constant_buffers(struct llvmpipe_context *lp)
-{
-   struct pipe_screen *screen = lp->pipe.screen;
-   uint i, size;
-
-   for (i = 0; i < PIPE_SHADER_TYPES; i++) {
-      if (lp->constants[i].buffer && lp->constants[i].buffer->size)
-         lp->mapped_constants[i] = screen->buffer_map(screen, lp->constants[i].buffer,
-                                                      PIPE_BUFFER_USAGE_CPU_READ);
-   }
-
-   if (lp->constants[PIPE_SHADER_VERTEX].buffer)
-      size = lp->constants[PIPE_SHADER_VERTEX].buffer->size;
-   else
-      size = 0;
-
-   lp->jit_context.constants = lp->mapped_constants[PIPE_SHADER_FRAGMENT];
-
-   draw_set_mapped_constant_buffer(lp->draw,
-                                   lp->mapped_constants[PIPE_SHADER_VERTEX],
-                                   size);
-}
-
-
-static void
-llvmpipe_unmap_constant_buffers(struct llvmpipe_context *lp)
-{
-   struct pipe_screen *screen = lp->pipe.screen;
-   uint i;
-
-   /* really need to flush all prims since the vert/frag shaders const buffers
-    * are going away now.
-    */
-   draw_flush(lp->draw);
-
-   draw_set_mapped_constant_buffer(lp->draw, NULL, 0);
-
-   lp->jit_context.constants = NULL;
-
-   for (i = 0; i < 2; i++) {
-      if (lp->constants[i].buffer && lp->constants[i].buffer->size)
-         screen->buffer_unmap(screen, lp->constants[i].buffer);
-      lp->mapped_constants[i] = NULL;
-   }
-}
-
-
 boolean
 llvmpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
                      unsigned start, unsigned count)
@@ -124,7 +76,6 @@ llvmpipe_draw_range_elements(struct pipe_context *pipe,
       llvmpipe_update_derived( lp );
 
    llvmpipe_map_transfers(lp);
-   llvmpipe_map_constant_buffers(lp);
 
    /*
     * Map vertex buffers
@@ -163,7 +114,6 @@ llvmpipe_draw_range_elements(struct pipe_context *pipe,
 
 
    /* Note: leave drawing surfaces mapped */
-   llvmpipe_unmap_constant_buffers(lp);
 
    lp->dirty_render_cache = TRUE;
    
diff --git a/src/gallium/drivers/llvmpipe/lp_flush.c b/src/gallium/drivers/llvmpipe/lp_flush.c
index b5c1c95bb7..cd8381fe30 100644
--- a/src/gallium/drivers/llvmpipe/lp_flush.c
+++ b/src/gallium/drivers/llvmpipe/lp_flush.c
@@ -58,8 +58,10 @@ llvmpipe_flush( struct pipe_context *pipe,
        * in the hope that a later clear will wipe them out.
        */
       for (i = 0; i < llvmpipe->framebuffer.nr_cbufs; i++)
-         if (llvmpipe->cbuf_cache[i])
+         if (llvmpipe->cbuf_cache[i]) {
+            lp_tile_cache_map_transfers(llvmpipe->cbuf_cache[i]);
             lp_flush_tile_cache(llvmpipe->cbuf_cache[i]);
+         }
 
       /* Need this call for hardware buffers before swapbuffers.
        *
@@ -71,8 +73,10 @@ llvmpipe_flush( struct pipe_context *pipe,
    }
    else if (flags & PIPE_FLUSH_RENDER_CACHE) {
       for (i = 0; i < llvmpipe->framebuffer.nr_cbufs; i++)
-         if (llvmpipe->cbuf_cache[i])
+         if (llvmpipe->cbuf_cache[i]) {
+            lp_tile_cache_map_transfers(llvmpipe->cbuf_cache[i]);
             lp_flush_tile_cache(llvmpipe->cbuf_cache[i]);
+         }
 
       /* FIXME: untile zsbuf! */
      
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c
index b4a22ff4a9..bce3baec16 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.c
+++ b/src/gallium/drivers/llvmpipe/lp_jit.c
@@ -36,8 +36,10 @@
 #include <llvm-c/Transforms/Scalar.h>
 
 #include "util/u_memory.h"
+#include "util/u_cpu_detect.h"
 #include "lp_screen.h"
 #include "lp_bld_intr.h"
+#include "lp_bld_misc.h"
 #include "lp_jit.h"
 
 
@@ -147,6 +149,18 @@ lp_jit_screen_init(struct llvmpipe_screen *screen)
 {
    char *error = NULL;
 
+   util_cpu_detect();
+
+#if 0
+   /* For simulating less capable machines */
+   util_cpu_caps.has_sse3 = 0;
+   util_cpu_caps.has_ssse3 = 0;
+   util_cpu_caps.has_sse4_1 = 0;
+#endif
+
+   LLVMLinkInJIT();
+   LLVMInitializeNativeTarget();
+
    screen->module = LLVMModuleCreateWithName("llvmpipe");
 
    screen->provider = LLVMCreateModuleProviderForExistingModule(screen->module);
@@ -154,7 +168,7 @@ lp_jit_screen_init(struct llvmpipe_screen *screen)
    if (LLVMCreateJITCompiler(&screen->engine, screen->provider, 1, &error)) {
       _debug_printf("%s\n", error);
       LLVMDisposeMessage(error);
-      abort();
+      assert(0);
    }
 
    screen->target = LLVMGetExecutionEngineTargetData(screen->engine);
@@ -163,8 +177,15 @@ lp_jit_screen_init(struct llvmpipe_screen *screen)
    LLVMAddTargetData(screen->target, screen->pass);
    /* These are the passes currently listed in llvm-c/Transforms/Scalar.h,
     * but there are more on SVN. */
+   /* TODO: Add more passes */
    LLVMAddConstantPropagationPass(screen->pass);
-   LLVMAddInstructionCombiningPass(screen->pass);
+   if(util_cpu_caps.has_sse4_1) {
+      /* FIXME: There is a bug in this pass, whereby the combination of fptosi
+       * and sitofp (necessary for trunc/floor/ceil/round implementation)
+       * somehow becomes invalid code.
+       */
+      LLVMAddInstructionCombiningPass(screen->pass);
+   }
    LLVMAddPromoteMemoryToRegisterPass(screen->pass);
    LLVMAddGVNPass(screen->pass);
    LLVMAddCFGSimplificationPass(screen->pass);
diff --git a/src/gallium/drivers/llvmpipe/lp_prim_setup.c b/src/gallium/drivers/llvmpipe/lp_prim_setup.c
deleted file mode 100644
index b14f8fb99d..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_prim_setup.c
+++ /dev/null
@@ -1,190 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * \brief A draw stage that drives our triangle setup routines from
- * within the draw pipeline.  One of two ways to drive setup, the
- * other being in lp_prim_vbuf.c.
- *
- * \author  Keith Whitwell <keith@tungstengraphics.com>
- * \author  Brian Paul
- */
-
-
-#include "lp_context.h"
-#include "lp_setup.h"
-#include "lp_state.h"
-#include "lp_prim_setup.h"
-#include "draw/draw_pipe.h"
-#include "draw/draw_vertex.h"
-#include "util/u_memory.h"
-
-/**
- * Triangle setup info (derived from draw_stage).
- * Also used for line drawing (taking some liberties).
- */
-struct setup_stage {
-   struct draw_stage stage; /**< This must be first (base class) */
-
-   struct setup_context *setup;
-};
-
-
-
-/**
- * Basically a cast wrapper.
- */
-static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
-{
-   return (struct setup_stage *)stage;
-}
-
-
-typedef const float (*cptrf4)[4];
-
-static void
-do_tri(struct draw_stage *stage, struct prim_header *prim)
-{
-   struct setup_stage *setup = setup_stage( stage );
-   
-   llvmpipe_setup_tri( setup->setup,
-              (cptrf4)prim->v[0]->data,
-              (cptrf4)prim->v[1]->data,
-              (cptrf4)prim->v[2]->data );
-}
-
-static void
-do_line(struct draw_stage *stage, struct prim_header *prim)
-{
-   struct setup_stage *setup = setup_stage( stage );
-
-   llvmpipe_setup_line( setup->setup,
-               (cptrf4)prim->v[0]->data,
-               (cptrf4)prim->v[1]->data );
-}
-
-static void
-do_point(struct draw_stage *stage, struct prim_header *prim)
-{
-   struct setup_stage *setup = setup_stage( stage );
-
-   llvmpipe_setup_point( setup->setup,
-                (cptrf4)prim->v[0]->data );
-}
-
-
-
-
-static void setup_begin( struct draw_stage *stage )
-{
-   struct setup_stage *setup = setup_stage(stage);
-
-   llvmpipe_setup_prepare( setup->setup );
-
-   stage->point = do_point;
-   stage->line = do_line;
-   stage->tri = do_tri;
-}
-
-
-static void setup_first_point( struct draw_stage *stage,
-			       struct prim_header *header )
-{
-   setup_begin(stage);
-   stage->point( stage, header );
-}
-
-static void setup_first_line( struct draw_stage *stage,
-			       struct prim_header *header )
-{
-   setup_begin(stage);
-   stage->line( stage, header );
-}
-
-
-static void setup_first_tri( struct draw_stage *stage,
-			       struct prim_header *header )
-{
-   setup_begin(stage);
-   stage->tri( stage, header );
-}
-
-
-
-static void setup_flush( struct draw_stage *stage,
-			 unsigned flags )
-{
-   stage->point = setup_first_point;
-   stage->line = setup_first_line;
-   stage->tri = setup_first_tri;
-}
-
-
-static void reset_stipple_counter( struct draw_stage *stage )
-{
-}
-
-
-static void render_destroy( struct draw_stage *stage )
-{
-   struct setup_stage *ssetup = setup_stage(stage);
-   llvmpipe_setup_destroy_context(ssetup->setup);
-   FREE( stage );
-}
-
-
-/**
- * Create a new primitive setup/render stage.
- */
-struct draw_stage *lp_draw_render_stage( struct llvmpipe_context *llvmpipe )
-{
-   struct setup_stage *sstage = CALLOC_STRUCT(setup_stage);
-
-   sstage->setup = llvmpipe_setup_create_context(llvmpipe);
-   sstage->stage.draw = llvmpipe->draw;
-   sstage->stage.point = setup_first_point;
-   sstage->stage.line = setup_first_line;
-   sstage->stage.tri = setup_first_tri;
-   sstage->stage.flush = setup_flush;
-   sstage->stage.reset_stipple_counter = reset_stipple_counter;
-   sstage->stage.destroy = render_destroy;
-
-   return (struct draw_stage *)sstage;
-}
-
-struct setup_context *
-lp_draw_setup_context( struct draw_stage *stage )
-{
-   struct setup_stage *ssetup = setup_stage(stage);
-   return ssetup->setup;
-}
-
-void
-lp_draw_flush( struct draw_stage *stage )
-{
-   stage->flush( stage, 0 );
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_prim_setup.h b/src/gallium/drivers/llvmpipe/lp_prim_setup.h
deleted file mode 100644
index da6cae6375..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_prim_setup.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-#ifndef LP_PRIM_SETUP_H
-#define LP_PRIM_SETUP_H
-
-
-/**
- * vbuf is a special stage to gather the stream of triangles, lines, points
- * together and reconstruct vertex buffers for hardware upload.
- *
- * First attempt, work in progress.
- * 
- * TODO:
- *    - separate out vertex buffer building and primitive emit, ie >1 draw per vb.
- *    - tell vbuf stage how to build hw vertices directly
- *    - pass vbuf stage a buffer pointer for direct emit to agp/vram.
- *
- *
- *
- * Vertices are just an array of floats, with all the attributes
- * packed.  We currently assume a layout like:
- *
- * attr[0][0..3] - window position
- * attr[1..n][0..3] - remaining attributes.
- *
- * Attributes are assumed to be 4 floats wide but are packed so that
- * all the enabled attributes run contiguously.
- */
-
-
-struct draw_stage;
-struct llvmpipe_context;
-
-
-typedef void (*vbuf_draw_func)( struct pipe_context *pipe,
-                                unsigned prim,
-                                const ushort *elements,
-                                unsigned nr_elements,
-                                const void *vertex_buffer,
-                                unsigned nr_vertices );
-
-
-extern struct draw_stage *
-lp_draw_render_stage( struct llvmpipe_context *llvmpipe );
-
-extern struct setup_context *
-lp_draw_setup_context( struct draw_stage * );
-
-extern void
-lp_draw_flush( struct draw_stage * );
-
-
-extern struct draw_stage *
-lp_draw_vbuf_stage( struct draw_context *draw_context,
-                    struct pipe_context *pipe,
-                    vbuf_draw_func draw );
-
-
-#endif /* LP_PRIM_SETUP_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_prim_vbuf.c b/src/gallium/drivers/llvmpipe/lp_prim_vbuf.c
index c394dcb61d..4abff4eccc 100644
--- a/src/gallium/drivers/llvmpipe/lp_prim_vbuf.c
+++ b/src/gallium/drivers/llvmpipe/lp_prim_vbuf.c
@@ -37,10 +37,9 @@
 
 
 #include "lp_context.h"
+#include "lp_setup.h"
 #include "lp_state.h"
 #include "lp_prim_vbuf.h"
-#include "lp_prim_setup.h"
-#include "lp_setup.h"
 #include "draw/draw_context.h"
 #include "draw/draw_vbuf.h"
 #include "util/u_memory.h"
@@ -59,6 +58,8 @@ struct llvmpipe_vbuf_render
 {
    struct vbuf_render base;
    struct llvmpipe_context *llvmpipe;
+   struct setup_context *setup;
+
    uint prim;
    uint vertex_size;
    uint nr_vertices;
@@ -75,6 +76,11 @@ llvmpipe_vbuf_render(struct vbuf_render *vbr)
 }
 
 
+
+
+
+
+
 static const struct vertex_info *
 lp_vbuf_get_vertex_info(struct vbuf_render *vbr)
 {
@@ -105,36 +111,6 @@ lp_vbuf_allocate_vertices(struct vbuf_render *vbr,
 static void
 lp_vbuf_release_vertices(struct vbuf_render *vbr)
 {
-#if 0
-   {
-      struct llvmpipe_vbuf_render *cvbr = llvmpipe_vbuf_render(vbr);
-      const struct vertex_info *info = 
-         llvmpipe_get_vbuf_vertex_info(cvbr->llvmpipe);
-      const float *vtx = (const float *) cvbr->vertex_buffer;
-      uint i, j;
-      debug_printf("%s (vtx_size = %u,  vtx_used = %u)\n",
-             __FUNCTION__, cvbr->vertex_size, cvbr->nr_vertices);
-      for (i = 0; i < cvbr->nr_vertices; i++) {
-         for (j = 0; j < info->num_attribs; j++) {
-            uint k;
-            switch (info->attrib[j].emit) {
-            case EMIT_4F:  k = 4;   break;
-            case EMIT_3F:  k = 3;   break;
-            case EMIT_2F:  k = 2;   break;
-            case EMIT_1F:  k = 1;   break;
-            default: assert(0);
-            }
-            debug_printf("Vert %u attr %u: ", i, j);
-            while (k-- > 0) {
-               debug_printf("%g ", vtx[0]);
-               vtx++;
-            }
-            debug_printf("\n");
-         }
-      }
-   }
-#endif
-
    /* keep the old allocation for next time */
 }
 
@@ -160,11 +136,7 @@ static boolean
 lp_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim)
 {
    struct llvmpipe_vbuf_render *cvbr = llvmpipe_vbuf_render(vbr);
-
-   /* XXX: break this dependency - make setup_context live under
-    * llvmpipe, rename the old "setup" draw stage to something else.
-    */
-   struct setup_context *setup_ctx = lp_draw_setup_context(cvbr->llvmpipe->setup);
+   struct setup_context *setup_ctx = cvbr->setup;
    
    llvmpipe_setup_prepare( setup_ctx );
 
@@ -193,14 +165,9 @@ lp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
    struct llvmpipe_context *llvmpipe = cvbr->llvmpipe;
    const unsigned stride = llvmpipe->vertex_info_vbuf.size * sizeof(float);
    const void *vertex_buffer = cvbr->vertex_buffer;
+   struct setup_context *setup_ctx = cvbr->setup;
    unsigned i;
 
-   /* XXX: break this dependency - make setup_context live under
-    * llvmpipe, rename the old "setup" draw stage to something else.
-    */
-   struct draw_stage *setup = llvmpipe->setup;
-   struct setup_context *setup_ctx = lp_draw_setup_context(setup);
-
    switch (cvbr->prim) {
    case PIPE_PRIM_POINTS:
       for (i = 0; i < nr; i++) {
@@ -367,11 +334,6 @@ lp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
    default:
       assert(0);
    }
-
-   /* XXX: why are we calling this???  If we had to call something, it
-    * would be a function in lp_setup.c:
-    */
-   lp_draw_flush( setup );
 }
 
 
@@ -384,17 +346,12 @@ lp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
 {
    struct llvmpipe_vbuf_render *cvbr = llvmpipe_vbuf_render(vbr);
    struct llvmpipe_context *llvmpipe = cvbr->llvmpipe;
+   struct setup_context *setup_ctx = cvbr->setup;
    const unsigned stride = llvmpipe->vertex_info_vbuf.size * sizeof(float);
    const void *vertex_buffer =
       (void *) get_vert(cvbr->vertex_buffer, start, stride);
    unsigned i;
 
-   /* XXX: break this dependency - make setup_context live under
-    * llvmpipe, rename the old "setup" draw stage to something else.
-    */
-   struct draw_stage *setup = llvmpipe->setup;
-   struct setup_context *setup_ctx = lp_draw_setup_context(setup);
-
    switch (cvbr->prim) {
    case PIPE_PRIM_POINTS:
       for (i = 0; i < nr; i++) {
@@ -568,40 +525,38 @@ static void
 lp_vbuf_destroy(struct vbuf_render *vbr)
 {
    struct llvmpipe_vbuf_render *cvbr = llvmpipe_vbuf_render(vbr);
-   cvbr->llvmpipe->vbuf_render = NULL;
+   llvmpipe_setup_destroy_context(cvbr->setup);
    FREE(cvbr);
 }
 
 
 /**
- * Initialize the post-transform vertex buffer information for the given
- * context.
+ * Create the post-transform vertex handler for the given context.
  */
-void
-lp_init_vbuf(struct llvmpipe_context *lp)
+struct vbuf_render *
+lp_create_vbuf_backend(struct llvmpipe_context *lp)
 {
-   assert(lp->draw);
+   struct llvmpipe_vbuf_render *cvbr = CALLOC_STRUCT(llvmpipe_vbuf_render);
 
-   lp->vbuf_render = CALLOC_STRUCT(llvmpipe_vbuf_render);
+   assert(lp->draw);
 
-   lp->vbuf_render->base.max_indices = LP_MAX_VBUF_INDEXES;
-   lp->vbuf_render->base.max_vertex_buffer_bytes = LP_MAX_VBUF_SIZE;
 
-   lp->vbuf_render->base.get_vertex_info = lp_vbuf_get_vertex_info;
-   lp->vbuf_render->base.allocate_vertices = lp_vbuf_allocate_vertices;
-   lp->vbuf_render->base.map_vertices = lp_vbuf_map_vertices;
-   lp->vbuf_render->base.unmap_vertices = lp_vbuf_unmap_vertices;
-   lp->vbuf_render->base.set_primitive = lp_vbuf_set_primitive;
-   lp->vbuf_render->base.draw = lp_vbuf_draw;
-   lp->vbuf_render->base.draw_arrays = lp_vbuf_draw_arrays;
-   lp->vbuf_render->base.release_vertices = lp_vbuf_release_vertices;
-   lp->vbuf_render->base.destroy = lp_vbuf_destroy;
+   cvbr->base.max_indices = LP_MAX_VBUF_INDEXES;
+   cvbr->base.max_vertex_buffer_bytes = LP_MAX_VBUF_SIZE;
 
-   lp->vbuf_render->llvmpipe = lp;
+   cvbr->base.get_vertex_info = lp_vbuf_get_vertex_info;
+   cvbr->base.allocate_vertices = lp_vbuf_allocate_vertices;
+   cvbr->base.map_vertices = lp_vbuf_map_vertices;
+   cvbr->base.unmap_vertices = lp_vbuf_unmap_vertices;
+   cvbr->base.set_primitive = lp_vbuf_set_primitive;
+   cvbr->base.draw = lp_vbuf_draw;
+   cvbr->base.draw_arrays = lp_vbuf_draw_arrays;
+   cvbr->base.release_vertices = lp_vbuf_release_vertices;
+   cvbr->base.destroy = lp_vbuf_destroy;
 
-   lp->vbuf = draw_vbuf_stage(lp->draw, &lp->vbuf_render->base);
+   cvbr->llvmpipe = lp;
 
-   draw_set_rasterize_stage(lp->draw, lp->vbuf);
+   cvbr->setup = llvmpipe_setup_create_context(cvbr->llvmpipe);
 
-   draw_set_render(lp->draw, &lp->vbuf_render->base);
+   return &cvbr->base;
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_prim_vbuf.h b/src/gallium/drivers/llvmpipe/lp_prim_vbuf.h
index 6c4e6063e6..0676e2f42a 100644
--- a/src/gallium/drivers/llvmpipe/lp_prim_vbuf.h
+++ b/src/gallium/drivers/llvmpipe/lp_prim_vbuf.h
@@ -31,8 +31,8 @@
 
 struct llvmpipe_context;
 
-extern void
-lp_init_vbuf(struct llvmpipe_context *llvmpipe);
+extern struct vbuf_render *
+lp_create_vbuf_backend(struct llvmpipe_context *llvmpipe);
 
 
 #endif /* LP_VBUF_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index ff7ef8658a..19fe2850fd 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -27,6 +27,7 @@
 
 
 #include "util/u_memory.h"
+#include "util/u_format.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_screen.h"
 
@@ -58,7 +59,9 @@ llvmpipe_get_param(struct pipe_screen *screen, int param)
    case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
       return PIPE_MAX_SAMPLERS;
    case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS:
-      return PIPE_MAX_SAMPLERS;
+      return PIPE_MAX_VERTEX_SAMPLERS;
+   case PIPE_CAP_MAX_COMBINED_SAMPLERS:
+      return PIPE_MAX_SAMPLERS + PIPE_MAX_VERTEX_SAMPLERS;
    case PIPE_CAP_NPOT_TEXTURES:
       return 1;
    case PIPE_CAP_TWO_SIDED_STENCIL:
@@ -82,7 +85,7 @@ llvmpipe_get_param(struct pipe_screen *screen, int param)
    case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
       return 13; /* max 4Kx4K */
    case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
-      return 8;  /* max 128x128x128 */
+      return 9;  /* max 256x256x256 */
    case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
       return 13; /* max 4Kx4K */
    case PIPE_CAP_TGSI_CONT_SUPPORTED:
@@ -131,17 +134,17 @@ llvmpipe_is_format_supported( struct pipe_screen *_screen,
 {
    struct llvmpipe_screen *screen = llvmpipe_screen(_screen);
    struct llvmpipe_winsys *winsys = screen->winsys;
+   const struct util_format_description *format_desc;
+
+   format_desc = util_format_description(format);
+   if(!format_desc)
+      return FALSE;
 
    assert(target == PIPE_TEXTURE_1D ||
           target == PIPE_TEXTURE_2D ||
           target == PIPE_TEXTURE_3D ||
           target == PIPE_TEXTURE_CUBE);
 
-   if(format == PIPE_FORMAT_Z16_UNORM)
-      return FALSE;
-   if(format == PIPE_FORMAT_S8_UNORM)
-      return FALSE;
-
    switch(format) {
    case PIPE_FORMAT_DXT1_RGB:
    case PIPE_FORMAT_DXT1_RGBA:
@@ -152,8 +155,51 @@ llvmpipe_is_format_supported( struct pipe_screen *_screen,
       break;
    }
 
-   if(tex_usage & PIPE_TEXTURE_USAGE_DISPLAY_TARGET)
-      return winsys->is_displaytarget_format_supported(winsys, format);
+   if(tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) {
+      if(format_desc->block.width != 1 ||
+         format_desc->block.height != 1)
+         return FALSE;
+
+      if(format_desc->layout != UTIL_FORMAT_LAYOUT_SCALAR &&
+         format_desc->layout != UTIL_FORMAT_LAYOUT_ARITH &&
+         format_desc->layout != UTIL_FORMAT_LAYOUT_ARRAY)
+         return FALSE;
+
+      if(format_desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB &&
+         format_desc->colorspace != UTIL_FORMAT_COLORSPACE_SRGB)
+         return FALSE;
+   }
+
+   if(tex_usage & PIPE_TEXTURE_USAGE_DISPLAY_TARGET) {
+      if(!winsys->is_displaytarget_format_supported(winsys, format))
+         return FALSE;
+   }
+
+   if(tex_usage & PIPE_TEXTURE_USAGE_DEPTH_STENCIL) {
+      if(format_desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS)
+         return FALSE;
+
+      /* FIXME: Temporary restriction. See lp_state_fs.c. */
+      if(format_desc->block.bits != 32)
+         return FALSE;
+   }
+
+   /* FIXME: Temporary restrictions. See lp_bld_sample_soa.c */
+   if(tex_usage & PIPE_TEXTURE_USAGE_SAMPLER) {
+      if(format_desc->block.width != 1 ||
+         format_desc->block.height != 1)
+         return FALSE;
+
+      if(format_desc->layout != UTIL_FORMAT_LAYOUT_SCALAR &&
+         format_desc->layout != UTIL_FORMAT_LAYOUT_ARITH &&
+         format_desc->layout != UTIL_FORMAT_LAYOUT_ARRAY)
+         return FALSE;
+
+      if(format_desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB &&
+         format_desc->colorspace != UTIL_FORMAT_COLORSPACE_SRGB &&
+         format_desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS)
+         return FALSE;
+   }
 
    return TRUE;
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index 2d2fc19a65..b4aabd4d7c 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -33,7 +33,6 @@
  */
 
 #include "lp_context.h"
-#include "lp_prim_setup.h"
 #include "lp_quad.h"
 #include "lp_setup.h"
 #include "lp_state.h"
@@ -90,6 +89,8 @@ struct setup_context {
    float oneoverarea;
    int facing;
 
+   float pixel_offset;
+
    struct quad_header quad[MAX_QUADS];
    struct quad_header *quad_ptrs[MAX_QUADS];
    unsigned count;
@@ -115,6 +116,7 @@ struct setup_context {
 /**
  * Execute fragment shader for the four fragments in the quad.
  */
+ALIGN_STACK
 static void
 shade_quads(struct llvmpipe_context *llvmpipe,
             struct quad_header *quads[],
@@ -124,7 +126,7 @@ shade_quads(struct llvmpipe_context *llvmpipe,
    struct quad_header *quad = quads[0];
    const unsigned x = quad->input.x0;
    const unsigned y = quad->input.y0;
-   uint8_t *tile = lp_get_cached_tile(llvmpipe->cbuf_cache[0], x, y);
+   uint8_t *tile;
    uint8_t *color;
    void *depth;
    uint32_t ALIGN16_ATTRIB mask[4][NUM_CHANNELS];
@@ -150,7 +152,13 @@ shade_quads(struct llvmpipe_context *llvmpipe,
          mask[q][chan_index] = quads[q]->inout.mask & (1 << chan_index) ? ~0 : 0;
 
    /* color buffer */
-   color = &TILE_PIXEL(tile, x & (TILE_SIZE-1), y & (TILE_SIZE-1), 0);
+   if(llvmpipe->framebuffer.nr_cbufs >= 1 &&
+      llvmpipe->framebuffer.cbufs[0]) {
+      tile = lp_get_cached_tile(llvmpipe->cbuf_cache[0], x, y);
+      color = &TILE_PIXEL(tile, x & (TILE_SIZE-1), y & (TILE_SIZE-1), 0);
+   }
+   else
+      color = NULL;
 
    /* depth buffer */
    if(llvmpipe->zsbuf_map) {
@@ -158,7 +166,7 @@ shade_quads(struct llvmpipe_context *llvmpipe,
       assert((y % 2) == 0);
       depth = llvmpipe->zsbuf_map +
               y*llvmpipe->zsbuf_transfer->stride +
-              2*x*llvmpipe->zsbuf_transfer->block.size;
+              2*x*pf_get_blocksize(llvmpipe->zsbuf_transfer->texture->format);
    }
    else
       depth = NULL;
@@ -271,11 +279,13 @@ clip_emit_quad( struct setup_context *setup, struct quad_header *quad )
        * until we codegenerate single-quad variants of the fragment pipeline
        * we need this hack. */
       const unsigned nr_quads = TILE_VECTOR_HEIGHT*TILE_VECTOR_WIDTH/QUAD_SIZE;
-      struct quad_header quads[nr_quads];
-      struct quad_header *quad_ptrs[nr_quads];
+      struct quad_header quads[4];
+      struct quad_header *quad_ptrs[4];
       int x0 = block_x(quad->input.x0);
       unsigned i;
 
+      assert(nr_quads == 4);
+
       for(i = 0; i < nr_quads; ++i) {
          int x = x0 + 2*i;
          if(x == quad->input.x0)
@@ -474,6 +484,16 @@ static boolean setup_sort_vertices( struct setup_context *setup,
       ((det > 0.0) ^ 
        (setup->llvmpipe->rasterizer->front_winding == PIPE_WINDING_CW));
 
+   /* Prepare pixel offset for rasterisation:
+    *  - pixel center (0.5, 0.5) for GL, or
+    *  - assume (0.0, 0.0) for other APIs.
+    */
+   if (setup->llvmpipe->rasterizer->gl_rasterization_rules) {
+      setup->pixel_offset = 0.5f;
+   } else {
+      setup->pixel_offset = 0.0f;
+   }
+
    return TRUE;
 }
 
@@ -499,7 +519,7 @@ static void tri_pos_coeff( struct setup_context *setup,
 
    /* calculate a0 as the value which would be sampled for the
     * fragment at (0,0), taking into account that we want to sample at
-    * pixel centers, in other words (0.5, 0.5).
+    * pixel centers, in other words (pixel_offset, pixel_offset).
     *
     * this is neat but unfortunately not a good way to do things for
     * triangles with very large values of dadx or dady as it will
@@ -510,8 +530,8 @@ static void tri_pos_coeff( struct setup_context *setup,
     * instead - i'll switch to this later.
     */
    setup->coef.a0[0][i] = (setup->vmin[vertSlot][i] -
-                           (dadx * (setup->vmin[0][0] - 0.5f) +
-                            dady * (setup->vmin[0][1] - 0.5f)));
+                           (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                            dady * (setup->vmin[0][1] - setup->pixel_offset)));
 
    /*
    debug_printf("attr[%d].%c: %f dx:%f dy:%f\n",
@@ -600,8 +620,8 @@ static void tri_linear_coeff( struct setup_context *setup,
        * instead - i'll switch to this later.
        */
       setup->coef.a0[1 + attrib][i] = (setup->vmin[vertSlot][i] -
-                     (dadx * (setup->vmin[0][0] - 0.5f) +
-                      dady * (setup->vmin[0][1] - 0.5f)));
+                     (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                      dady * (setup->vmin[0][1] - setup->pixel_offset)));
 
       /*
       debug_printf("attr[%d].%c: %f dx:%f dy:%f\n",
@@ -652,8 +672,8 @@ static void tri_persp_coeff( struct setup_context *setup,
       setup->coef.dadx[1 + attrib][i] = dadx;
       setup->coef.dady[1 + attrib][i] = dady;
       setup->coef.a0[1 + attrib][i] = (mina -
-                     (dadx * (setup->vmin[0][0] - 0.5f) +
-                      dady * (setup->vmin[0][1] - 0.5f)));
+                     (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                      dady * (setup->vmin[0][1] - setup->pixel_offset)));
    }
 }
 
@@ -737,12 +757,12 @@ static void setup_tri_coefficients( struct setup_context *setup )
 
 static void setup_tri_edges( struct setup_context *setup )
 {
-   float vmin_x = setup->vmin[0][0] + 0.5f;
-   float vmid_x = setup->vmid[0][0] + 0.5f;
+   float vmin_x = setup->vmin[0][0] + setup->pixel_offset;
+   float vmid_x = setup->vmid[0][0] + setup->pixel_offset;
 
-   float vmin_y = setup->vmin[0][1] - 0.5f;
-   float vmid_y = setup->vmid[0][1] - 0.5f;
-   float vmax_y = setup->vmax[0][1] - 0.5f;
+   float vmin_y = setup->vmin[0][1] - setup->pixel_offset;
+   float vmid_y = setup->vmid[0][1] - setup->pixel_offset;
+   float vmax_y = setup->vmax[0][1] - setup->pixel_offset;
 
    setup->emaj.sy = ceilf(vmin_y);
    setup->emaj.lines = (int) ceilf(vmax_y - setup->emaj.sy);
@@ -941,8 +961,8 @@ linear_pos_coeff(struct setup_context *setup,
    setup->coef.dadx[0][i] = dadx;
    setup->coef.dady[0][i] = dady;
    setup->coef.a0[0][i] = (setup->vmin[vertSlot][i] -
-                           (dadx * (setup->vmin[0][0] - 0.5f) +
-                            dady * (setup->vmin[0][1] - 0.5f)));
+                           (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                            dady * (setup->vmin[0][1] - setup->pixel_offset)));
 }
 
 
@@ -963,8 +983,8 @@ line_linear_coeff(struct setup_context *setup,
       setup->coef.dadx[1 + attrib][i] = dadx;
       setup->coef.dady[1 + attrib][i] = dady;
       setup->coef.a0[1 + attrib][i] = (setup->vmin[vertSlot][i] -
-                     (dadx * (setup->vmin[0][0] - 0.5f) +
-                      dady * (setup->vmin[0][1] - 0.5f)));
+                     (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                      dady * (setup->vmin[0][1] - setup->pixel_offset)));
    }
 }
 
@@ -989,8 +1009,8 @@ line_persp_coeff(struct setup_context *setup,
       setup->coef.dadx[1 + attrib][i] = dadx;
       setup->coef.dady[1 + attrib][i] = dady;
       setup->coef.a0[1 + attrib][i] = (setup->vmin[vertSlot][i] -
-                     (dadx * (setup->vmin[0][0] - 0.5f) +
-                      dady * (setup->vmin[0][1] - 0.5f)));
+                     (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                      dady * (setup->vmin[0][1] - setup->pixel_offset)));
    }
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state.h b/src/gallium/drivers/llvmpipe/lp_state.h
index 7b26ce61a3..d1c74ab07b 100644
--- a/src/gallium/drivers/llvmpipe/lp_state.h
+++ b/src/gallium/drivers/llvmpipe/lp_state.h
@@ -126,6 +126,10 @@ void *
 llvmpipe_create_sampler_state(struct pipe_context *,
                               const struct pipe_sampler_state *);
 void llvmpipe_bind_sampler_states(struct pipe_context *, unsigned, void **);
+void
+llvmpipe_bind_vertex_sampler_states(struct pipe_context *,
+                                    unsigned num_samplers,
+                                    void **samplers);
 void llvmpipe_delete_sampler_state(struct pipe_context *, void *);
 
 void *
@@ -172,6 +176,11 @@ void llvmpipe_set_sampler_textures( struct pipe_context *,
                                     unsigned num,
                                     struct pipe_texture ** );
 
+void
+llvmpipe_set_vertex_sampler_textures(struct pipe_context *,
+                                     unsigned num_textures,
+                                     struct pipe_texture **);
+
 void llvmpipe_set_viewport_state( struct pipe_context *,
                                   const struct pipe_viewport_state * );
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_blend.c b/src/gallium/drivers/llvmpipe/lp_state_blend.c
index 3f03bd0057..b2e75d3b14 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_blend.c
@@ -76,7 +76,7 @@ void llvmpipe_set_blend_color( struct pipe_context *pipe,
    for (i = 0; i < 4; ++i) {
       uint8_t c = float_to_ubyte(blend_color->color[i]);
       for (j = 0; j < 16; ++j)
-         llvmpipe->jit_context.blend_color[i*4 + j] = c;
+         llvmpipe->jit_context.blend_color[i*16 + j] = c;
    }
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index e87976b9f3..e703964aaa 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -65,26 +65,19 @@ llvmpipe_get_vertex_info(struct llvmpipe_context *llvmpipe)
    if (vinfo->num_attribs == 0) {
       /* compute vertex layout now */
       const struct lp_fragment_shader *lpfs = llvmpipe->fs;
-      const enum interp_mode colorInterp
-         = llvmpipe->rasterizer->flatshade ? INTERP_CONSTANT : INTERP_LINEAR;
+      struct vertex_info *vinfo_vbuf = &llvmpipe->vertex_info_vbuf;
+      const uint num = draw_num_vs_outputs(llvmpipe->draw);
       uint i;
 
-      if (llvmpipe->vbuf) {
-         /* if using the post-transform vertex buffer, tell draw_vbuf to
-          * simply emit the whole post-xform vertex as-is:
-          */
-         struct vertex_info *vinfo_vbuf = &llvmpipe->vertex_info_vbuf;
-         const uint num = draw_num_vs_outputs(llvmpipe->draw);
-         uint i;
-
-         /* No longer any need to try and emit draw vertex_header info.
-          */
-         vinfo_vbuf->num_attribs = 0;
-         for (i = 0; i < num; i++) {
-            draw_emit_vertex_attr(vinfo_vbuf, EMIT_4F, INTERP_PERSPECTIVE, i);
-         }
-         draw_compute_vertex_size(vinfo_vbuf);
+      /* Tell draw_vbuf to simply emit the whole post-xform vertex
+       * as-is.  No longer any need to try and emit draw vertex_header
+       * info.
+       */
+      vinfo_vbuf->num_attribs = 0;
+      for (i = 0; i < num; i++) {
+	 draw_emit_vertex_attr(vinfo_vbuf, EMIT_4F, INTERP_PERSPECTIVE, i);
       }
+      draw_compute_vertex_size(vinfo_vbuf);
 
       /*
        * Loop over fragment shader inputs, searching for the matching output
@@ -93,35 +86,40 @@ llvmpipe_get_vertex_info(struct llvmpipe_context *llvmpipe)
       vinfo->num_attribs = 0;
       for (i = 0; i < lpfs->info.num_inputs; i++) {
          int src;
-         switch (lpfs->info.input_semantic_name[i]) {
-         case TGSI_SEMANTIC_POSITION:
-            src = draw_find_vs_output(llvmpipe->draw,
-                                      TGSI_SEMANTIC_POSITION, 0);
-            draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_POS, src);
-            break;
+         enum interp_mode interp;
 
-         case TGSI_SEMANTIC_COLOR:
-            src = draw_find_vs_output(llvmpipe->draw, TGSI_SEMANTIC_COLOR, 
-                                 lpfs->info.input_semantic_index[i]);
-            draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src);
+         switch (lpfs->info.input_interpolate[i]) {
+         case TGSI_INTERPOLATE_CONSTANT:
+            interp = INTERP_CONSTANT;
             break;
-
-         case TGSI_SEMANTIC_FOG:
-            src = draw_find_vs_output(llvmpipe->draw, TGSI_SEMANTIC_FOG, 0);
-            draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
+         case TGSI_INTERPOLATE_LINEAR:
+            interp = INTERP_LINEAR;
             break;
-
-         case TGSI_SEMANTIC_GENERIC:
-         case TGSI_SEMANTIC_FACE:
-            /* this includes texcoords and varying vars */
-            src = draw_find_vs_output(llvmpipe->draw, TGSI_SEMANTIC_GENERIC,
-                                      lpfs->info.input_semantic_index[i]);
-            draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
+         case TGSI_INTERPOLATE_PERSPECTIVE:
+            interp = INTERP_PERSPECTIVE;
             break;
-
          default:
             assert(0);
+            interp = INTERP_LINEAR;
+         }
+
+         switch (lpfs->info.input_semantic_name[i]) {
+         case TGSI_SEMANTIC_POSITION:
+            interp = INTERP_POS;
+            break;
+
+         case TGSI_SEMANTIC_COLOR:
+            if (llvmpipe->rasterizer->flatshade) {
+               interp = INTERP_CONSTANT;
+            }
+            break;
          }
+
+         /* this includes texcoords and varying vars */
+         src = draw_find_vs_output(llvmpipe->draw,
+                                   lpfs->info.input_semantic_name[i],
+                                   lpfs->info.input_semantic_index[i]);
+         draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src);
       }
 
       llvmpipe->psize_slot = draw_find_vs_output(llvmpipe->draw,
@@ -200,10 +198,14 @@ update_tgsi_samplers( struct llvmpipe_context *llvmpipe )
    unsigned i;
 
    /* vertex shader samplers */
-   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
-      llvmpipe->tgsi.vert_samplers[i].sampler = llvmpipe->sampler[i];
-      llvmpipe->tgsi.vert_samplers[i].texture = llvmpipe->texture[i];
-      llvmpipe->tgsi.frag_samplers[i].base.get_samples = lp_get_samples;
+   for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
+      llvmpipe->tgsi.vert_samplers[i].sampler = llvmpipe->vertex_samplers[i];
+      llvmpipe->tgsi.vert_samplers[i].texture = llvmpipe->vertex_textures[i];
+      llvmpipe->tgsi.vert_samplers[i].base.get_samples = lp_get_samples;
+   }
+
+   for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
+      lp_tex_tile_cache_validate_texture( llvmpipe->vertex_tex_cache[i] );
    }
 
    /* fragment shader samplers */
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 9faed5a0b1..ee0f69b2af 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -83,6 +83,7 @@
 #include "lp_bld_debug.h"
 #include "lp_screen.h"
 #include "lp_context.h"
+#include "lp_buffer.h"
 #include "lp_state.h"
 #include "lp_quad.h"
 #include "lp_tex_sample.h"
@@ -147,6 +148,20 @@ generate_depth(LLVMBuilderRef builder,
    format_desc = util_format_description(key->zsbuf_format);
    assert(format_desc);
 
+   /*
+    * Depths are expected to be between 0 and 1, even if they are stored in
+    * floats. Setting these bits here will ensure that the lp_build_conv() call
+    * below won't try to unnecessarily clamp the incoming values.
+    */
+   if(src_type.floating) {
+      src_type.sign = FALSE;
+      src_type.norm = TRUE;
+   }
+   else {
+      assert(!src_type.sign);
+      assert(src_type.norm);
+   }
+
    /* Pick the depth type. */
    dst_type = lp_depth_type(format_desc, src_type.width*src_type.length);
 
@@ -154,14 +169,11 @@ generate_depth(LLVMBuilderRef builder,
    assert(dst_type.width == src_type.width);
    assert(dst_type.length == src_type.length);
 
-#if 1
-   src = lp_build_clamped_float_to_unsigned_norm(builder,
-                                                 src_type,
-                                                 dst_type.width,
-                                                 src);
-#else
    lp_build_conv(builder, src_type, dst_type, &src, 1, &src, 1);
-#endif
+
+   dst_ptr = LLVMBuildBitCast(builder,
+                              dst_ptr,
+                              LLVMPointerType(lp_build_vec_type(dst_type), 0), "");
 
    lp_build_depth_test(builder,
                        &key->depth,
@@ -399,9 +411,9 @@ generate_fragment(struct llvmpipe_context *lp,
 #ifdef DEBUG
    tgsi_dump(shader->base.tokens, 0);
    if(key->depth.enabled) {
+      debug_printf("depth.format = %s\n", pf_name(key->zsbuf_format));
       debug_printf("depth.func = %s\n", debug_dump_func(key->depth.func, TRUE));
       debug_printf("depth.writemask = %u\n", key->depth.writemask);
-      debug_printf("depth.occlusion_count = %u\n", key->depth.occlusion_count);
    }
    if(key->alpha.enabled) {
       debug_printf("alpha.func = %s\n", debug_dump_func(key->alpha.func, TRUE));
@@ -419,6 +431,34 @@ generate_fragment(struct llvmpipe_context *lp,
       debug_printf("alpha_dst_factor = %s\n", debug_dump_blend_factor(key->blend.alpha_dst_factor, TRUE));
    }
    debug_printf("blend.colormask = 0x%x\n", key->blend.colormask);
+   for(i = 0; i < PIPE_MAX_SAMPLERS; ++i) {
+      if(key->sampler[i].format) {
+         debug_printf("sampler[%u] = \n", i);
+         debug_printf("  .format = %s\n",
+                      pf_name(key->sampler[i].format));
+         debug_printf("  .target = %s\n",
+                      debug_dump_tex_target(key->sampler[i].target, TRUE));
+         debug_printf("  .pot = %u %u %u\n",
+                      key->sampler[i].pot_width,
+                      key->sampler[i].pot_height,
+                      key->sampler[i].pot_depth);
+         debug_printf("  .wrap = %s %s %s\n",
+                      debug_dump_tex_wrap(key->sampler[i].wrap_s, TRUE),
+                      debug_dump_tex_wrap(key->sampler[i].wrap_t, TRUE),
+                      debug_dump_tex_wrap(key->sampler[i].wrap_r, TRUE));
+         debug_printf("  .min_img_filter = %s\n",
+                      debug_dump_tex_filter(key->sampler[i].min_img_filter, TRUE));
+         debug_printf("  .min_mip_filter = %s\n",
+                      debug_dump_tex_mipfilter(key->sampler[i].min_mip_filter, TRUE));
+         debug_printf("  .mag_img_filter = %s\n",
+                      debug_dump_tex_filter(key->sampler[i].mag_img_filter, TRUE));
+         if(key->sampler[i].compare_mode)
+            debug_printf("  .compare_mode = %s\n", debug_dump_func(key->sampler[i].compare_func, TRUE));
+         debug_printf("  .normalized_coords = %u\n", key->sampler[i].normalized_coords);
+         debug_printf("  .prefilter = %u\n", key->sampler[i].prefilter);
+      }
+   }
+
 #endif
 
    variant = CALLOC_STRUCT(lp_fragment_shader_variant);
@@ -582,6 +622,13 @@ generate_fragment(struct llvmpipe_context *lp,
     * Translate the LLVM IR into machine code.
     */
 
+#ifdef DEBUG
+   if(LLVMVerifyFunction(variant->function, LLVMPrintMessageAction)) {
+      LLVMDumpValue(variant->function);
+      assert(0);
+   }
+#endif
+
    LLVMRunFunctionPassManager(screen->pass, variant->function);
 
 #ifdef DEBUG
@@ -589,11 +636,6 @@ generate_fragment(struct llvmpipe_context *lp,
    debug_printf("\n");
 #endif
 
-   if(LLVMVerifyFunction(variant->function, LLVMPrintMessageAction)) {
-      LLVMDumpValue(variant->function);
-      abort();
-   }
-
    variant->jit_function = (lp_jit_frag_func)LLVMGetPointerToGlobal(screen->engine, variant->function);
 
 #ifdef DEBUG
@@ -672,16 +714,29 @@ llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
 void
 llvmpipe_set_constant_buffer(struct pipe_context *pipe,
                              uint shader, uint index,
-                             const struct pipe_constant_buffer *buf)
+                             const struct pipe_constant_buffer *constants)
 {
    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+   struct pipe_buffer *buffer = constants ? constants->buffer : NULL;
+   unsigned size = buffer ? buffer->size : 0;
+   const void *data = buffer ? llvmpipe_buffer(buffer)->data : NULL;
 
    assert(shader < PIPE_SHADER_TYPES);
    assert(index == 0);
 
+   if(shader == PIPE_SHADER_VERTEX)
+      draw_flush(llvmpipe->draw);
+
    /* note: reference counting */
-   pipe_buffer_reference(&llvmpipe->constants[shader].buffer,
-			 buf ? buf->buffer : NULL);
+   pipe_buffer_reference(&llvmpipe->constants[shader].buffer, buffer);
+
+   if(shader == PIPE_SHADER_FRAGMENT) {
+      llvmpipe->jit_context.constants = data;
+   }
+
+   if(shader == PIPE_SHADER_VERTEX) {
+      draw_set_mapped_constant_buffer(llvmpipe->draw, data, size);
+   }
 
    llvmpipe->dirty |= LP_NEW_CONSTANTS;
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
index c69d90c723..d382f9ca87 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
@@ -78,6 +78,34 @@ llvmpipe_bind_sampler_states(struct pipe_context *pipe,
 
 
 void
+llvmpipe_bind_vertex_sampler_states(struct pipe_context *pipe,
+                                    unsigned num_samplers,
+                                    void **samplers)
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+   unsigned i;
+
+   assert(num_samplers <= PIPE_MAX_VERTEX_SAMPLERS);
+
+   /* Check for no-op */
+   if (num_samplers == llvmpipe->num_vertex_samplers &&
+       !memcmp(llvmpipe->vertex_samplers, samplers, num_samplers * sizeof(void *)))
+      return;
+
+   draw_flush(llvmpipe->draw);
+
+   for (i = 0; i < num_samplers; ++i)
+      llvmpipe->vertex_samplers[i] = samplers[i];
+   for (i = num_samplers; i < PIPE_MAX_VERTEX_SAMPLERS; ++i)
+      llvmpipe->vertex_samplers[i] = NULL;
+
+   llvmpipe->num_vertex_samplers = num_samplers;
+
+   llvmpipe->dirty |= LP_NEW_SAMPLER;
+}
+
+
+void
 llvmpipe_set_sampler_textures(struct pipe_context *pipe,
                               unsigned num, struct pipe_texture **texture)
 {
@@ -102,8 +130,8 @@ llvmpipe_set_sampler_textures(struct pipe_context *pipe,
       if(tex) {
          struct llvmpipe_texture *lp_tex = llvmpipe_texture(tex);
          struct lp_jit_texture *jit_tex = &llvmpipe->jit_context.textures[i];
-         jit_tex->width = tex->width[0];
-         jit_tex->height = tex->height[0];
+         jit_tex->width = tex->width0;
+         jit_tex->height = tex->height0;
          jit_tex->stride = lp_tex->stride[0];
          if(!lp_tex->dt)
             jit_tex->data = lp_tex->data;
@@ -117,6 +145,37 @@ llvmpipe_set_sampler_textures(struct pipe_context *pipe,
 
 
 void
+llvmpipe_set_vertex_sampler_textures(struct pipe_context *pipe,
+                                     unsigned num_textures,
+                                     struct pipe_texture **textures)
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+   uint i;
+
+   assert(num_textures <= PIPE_MAX_VERTEX_SAMPLERS);
+
+   /* Check for no-op */
+   if (num_textures == llvmpipe->num_vertex_textures &&
+       !memcmp(llvmpipe->vertex_textures, textures, num_textures * sizeof(struct pipe_texture *))) {
+      return;
+   }
+
+   draw_flush(llvmpipe->draw);
+
+   for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
+      struct pipe_texture *tex = i < num_textures ? textures[i] : NULL;
+
+      pipe_texture_reference(&llvmpipe->vertex_textures[i], tex);
+      lp_tex_tile_cache_set_texture(llvmpipe->vertex_tex_cache[i], tex);
+   }
+
+   llvmpipe->num_vertex_textures = num_textures;
+
+   llvmpipe->dirty |= LP_NEW_TEXTURE;
+}
+
+
+void
 llvmpipe_delete_sampler_state(struct pipe_context *pipe,
                               void *sampler)
 {
diff --git a/src/gallium/drivers/llvmpipe/lp_state_surface.c b/src/gallium/drivers/llvmpipe/lp_state_surface.c
index 177a26b7b1..c06ce8b75c 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_surface.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_surface.c
@@ -53,10 +53,11 @@ llvmpipe_set_framebuffer_state(struct pipe_context *pipe,
       /* check if changing cbuf */
       if (lp->framebuffer.cbufs[i] != fb->cbufs[i]) {
          /* flush old */
+         lp_tile_cache_map_transfers(lp->cbuf_cache[i]);
          lp_flush_tile_cache(lp->cbuf_cache[i]);
 
          /* assign new */
-         lp->framebuffer.cbufs[i] = fb->cbufs[i];
+         pipe_surface_reference(&lp->framebuffer.cbufs[i], fb->cbufs[i]);
 
          /* update cache */
          lp_tile_cache_set_surface(lp->cbuf_cache[i], fb->cbufs[i]);
@@ -81,7 +82,7 @@ llvmpipe_set_framebuffer_state(struct pipe_context *pipe,
       }
 
       /* assign new */
-      lp->framebuffer.zsbuf = fb->zsbuf;
+      pipe_surface_reference(&lp->framebuffer.zsbuf, fb->zsbuf);
 
       /* Tell draw module how deep the Z/depth buffer is */
       if (lp->framebuffer.zsbuf) {
diff --git a/src/gallium/drivers/llvmpipe/lp_state_vs.c b/src/gallium/drivers/llvmpipe/lp_state_vs.c
index 15c3029614..8a761648e7 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_vs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_vs.c
@@ -92,5 +92,6 @@ llvmpipe_delete_vs_state(struct pipe_context *pipe, void *vs)
       (struct lp_vertex_shader *)vs;
 
    draw_delete_vertex_shader(llvmpipe->draw, state->draw_data);
+   FREE( (void *)state->shader.tokens );
    FREE( state );
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_test.h b/src/gallium/drivers/llvmpipe/lp_test.h
index a88e110c66..39d80726e6 100644
--- a/src/gallium/drivers/llvmpipe/lp_test.h
+++ b/src/gallium/drivers/llvmpipe/lp_test.h
@@ -56,6 +56,9 @@
 #include "lp_bld_type.h"
 
 
+#define LP_TEST_NUM_SAMPLES 32
+
+
 void
 write_tsv_header(FILE *fp);
 
@@ -68,17 +71,28 @@ boolean
 test_all(unsigned verbose, FILE *fp);
 
 
+#if defined(PIPE_CC_MSVC)
+
+unsigned __int64 __rdtsc();
+#pragma intrinsic(__rdtsc)
+#define rdtsc() __rdtsc()
+
+#elif defined(PIPE_CC_GCC) && (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64))
+
 static INLINE uint64_t
 rdtsc(void)
 {
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    uint32_t hi, lo;
    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
    return ((uint64_t)lo) | (((uint64_t)hi) << 32);
+}
+
 #else
-   return 0;
+
+#define rdtsc() 0
+
 #endif
-}
+
 
 
 float
diff --git a/src/gallium/drivers/llvmpipe/lp_test_blend.c b/src/gallium/drivers/llvmpipe/lp_test_blend.c
index 94b661dcba..29fff91981 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_blend.c
@@ -462,6 +462,7 @@ compute_blend_ref(const struct pipe_blend_state *blend,
 }
 
 
+ALIGN_STACK
 static boolean
 test_one(unsigned verbose,
          FILE *fp,
@@ -477,8 +478,8 @@ test_one(unsigned verbose,
    char *error = NULL;
    blend_test_ptr_t blend_test_ptr;
    boolean success;
-   const unsigned n = 32;
-   int64_t cycles[n];
+   const unsigned n = LP_TEST_NUM_SAMPLES;
+   int64_t cycles[LP_TEST_NUM_SAMPLES];
    double cycles_avg = 0.0;
    unsigned i, j;
 
@@ -530,11 +531,11 @@ test_one(unsigned verbose,
    success = TRUE;
    for(i = 0; i < n && success; ++i) {
       if(mode == AoS) {
-         uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
-         uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
-         uint8_t con[LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
-         uint8_t res[LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
-         uint8_t ref[LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
+         ALIGN16_ATTRIB uint8_t src[LP_NATIVE_VECTOR_WIDTH/8];
+         ALIGN16_ATTRIB uint8_t dst[LP_NATIVE_VECTOR_WIDTH/8];
+         ALIGN16_ATTRIB uint8_t con[LP_NATIVE_VECTOR_WIDTH/8];
+         ALIGN16_ATTRIB uint8_t res[LP_NATIVE_VECTOR_WIDTH/8];
+         ALIGN16_ATTRIB uint8_t ref[LP_NATIVE_VECTOR_WIDTH/8];
          int64_t start_counter = 0;
          int64_t end_counter = 0;
 
@@ -595,11 +596,11 @@ test_one(unsigned verbose,
 
       if(mode == SoA) {
          const unsigned stride = type.length*type.width/8;
-         uint8_t src[4*LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
-         uint8_t dst[4*LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
-         uint8_t con[4*LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
-         uint8_t res[4*LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
-         uint8_t ref[4*LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
+         ALIGN16_ATTRIB uint8_t src[4*LP_NATIVE_VECTOR_WIDTH/8];
+         ALIGN16_ATTRIB uint8_t dst[4*LP_NATIVE_VECTOR_WIDTH/8];
+         ALIGN16_ATTRIB uint8_t con[4*LP_NATIVE_VECTOR_WIDTH/8];
+         ALIGN16_ATTRIB uint8_t res[4*LP_NATIVE_VECTOR_WIDTH/8];
+         ALIGN16_ATTRIB uint8_t ref[4*LP_NATIVE_VECTOR_WIDTH/8];
          int64_t start_counter = 0;
          int64_t end_counter = 0;
          boolean mismatch;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_conv.c b/src/gallium/drivers/llvmpipe/lp_test_conv.c
index 9dcf58e5dc..968c7a2d4a 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_conv.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_conv.c
@@ -142,6 +142,7 @@ add_conv_test(LLVMModuleRef module,
 }
 
 
+ALIGN_STACK
 static boolean
 test_one(unsigned verbose,
          FILE *fp,
@@ -156,8 +157,8 @@ test_one(unsigned verbose,
    char *error = NULL;
    conv_test_ptr_t conv_test_ptr;
    boolean success;
-   const unsigned n = 32;
-   int64_t cycles[n];
+   const unsigned n = LP_TEST_NUM_SAMPLES;
+   int64_t cycles[LP_TEST_NUM_SAMPLES];
    double cycles_avg = 0.0;
    unsigned num_srcs;
    unsigned num_dsts;
@@ -229,8 +230,8 @@ test_one(unsigned verbose,
    for(i = 0; i < n && success; ++i) {
       unsigned src_stride = src_type.length*src_type.width/8;
       unsigned dst_stride = dst_type.length*dst_type.width/8;
-      uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
-      uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
+      ALIGN16_ATTRIB uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
+      ALIGN16_ATTRIB uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
       double fref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
       uint8_t ref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
       int64_t start_counter = 0;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_format.c b/src/gallium/drivers/llvmpipe/lp_test_format.c
index d8455e5649..23ea9ebbe7 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_format.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_format.c
@@ -35,10 +35,11 @@
 #include <llvm-c/Target.h>
 #include <llvm-c/Transforms/Scalar.h>
 
+#include "util/u_cpu_detect.h"
 #include "util/u_format.h"
 
-#include "lp_bld_flow.h"
 #include "lp_bld_format.h"
+#include "lp_test.h"
 
 
 struct pixel_test_case
@@ -89,40 +90,63 @@ struct pixel_test_case test_cases[] =
 };
 
 
-typedef void (*load_ptr_t)(const void *, float *);
+void
+write_tsv_header(FILE *fp)
+{
+   fprintf(fp,
+           "result\t"
+           "format\n");
+
+   fflush(fp);
+}
+
+
+static void
+write_tsv_row(FILE *fp,
+              const struct util_format_description *desc,
+              boolean success)
+{
+   fprintf(fp, "%s\t", success ? "pass" : "fail");
+
+   fprintf(fp, "%s\n", desc->name);
+
+   fflush(fp);
+}
+
+
+typedef void (*load_ptr_t)(const uint32_t packed, float *);
 
 
 static LLVMValueRef
 add_load_rgba_test(LLVMModuleRef module,
-                   enum pipe_format format)
+                   const struct util_format_description *desc)
 {
    LLVMTypeRef args[2];
    LLVMValueRef func;
-   LLVMValueRef ptr;
+   LLVMValueRef packed;
    LLVMValueRef rgba_ptr;
    LLVMBasicBlockRef block;
    LLVMBuilderRef builder;
    LLVMValueRef rgba;
-   struct lp_build_loop_state loop;
 
-   args[0] = LLVMPointerType(LLVMInt8Type(), 0);
+   args[0] = LLVMInt32Type();
    args[1] = LLVMPointerType(LLVMVectorType(LLVMFloatType(), 4), 0);
 
    func = LLVMAddFunction(module, "load", LLVMFunctionType(LLVMVoidType(), args, 2, 0));
    LLVMSetFunctionCallConv(func, LLVMCCallConv);
-   ptr = LLVMGetParam(func, 0);
+   packed = LLVMGetParam(func, 0);
    rgba_ptr = LLVMGetParam(func, 1);
 
    block = LLVMAppendBasicBlock(func, "entry");
    builder = LLVMCreateBuilder();
    LLVMPositionBuilderAtEnd(builder, block);
 
-   lp_build_loop_begin(builder, LLVMConstInt(LLVMInt32Type(), 1, 0), &loop);
+   if(desc->block.bits < 32)
+      packed = LLVMBuildTrunc(builder, packed, LLVMIntType(desc->block.bits), "");
 
-   rgba = lp_build_load_rgba_aos(builder, format, ptr);
-   LLVMBuildStore(builder, rgba, rgba_ptr);
+   rgba = lp_build_unpack_rgba_aos(builder, desc, packed);
 
-   lp_build_loop_end(builder, LLVMConstInt(LLVMInt32Type(), 4, 0), NULL, &loop);
+   LLVMBuildStore(builder, rgba, rgba_ptr);
 
    LLVMBuildRetVoid(builder);
 
@@ -131,27 +155,28 @@ add_load_rgba_test(LLVMModuleRef module,
 }
 
 
-typedef void (*store_ptr_t)(void *, const float *);
+typedef void (*store_ptr_t)(uint32_t *, const float *);
 
 
 static LLVMValueRef
 add_store_rgba_test(LLVMModuleRef module,
-                    enum pipe_format format)
+                    const struct util_format_description *desc)
 {
    LLVMTypeRef args[2];
    LLVMValueRef func;
-   LLVMValueRef ptr;
+   LLVMValueRef packed_ptr;
    LLVMValueRef rgba_ptr;
    LLVMBasicBlockRef block;
    LLVMBuilderRef builder;
    LLVMValueRef rgba;
+   LLVMValueRef packed;
 
-   args[0] = LLVMPointerType(LLVMInt8Type(), 0);
+   args[0] = LLVMPointerType(LLVMInt32Type(), 0);
    args[1] = LLVMPointerType(LLVMVectorType(LLVMFloatType(), 4), 0);
 
    func = LLVMAddFunction(module, "store", LLVMFunctionType(LLVMVoidType(), args, 2, 0));
    LLVMSetFunctionCallConv(func, LLVMCCallConv);
-   ptr = LLVMGetParam(func, 0);
+   packed_ptr = LLVMGetParam(func, 0);
    rgba_ptr = LLVMGetParam(func, 1);
 
    block = LLVMAppendBasicBlock(func, "entry");
@@ -160,7 +185,12 @@ add_store_rgba_test(LLVMModuleRef module,
 
    rgba = LLVMBuildLoad(builder, rgba_ptr, "");
 
-   lp_build_store_rgba_aos(builder, format, ptr, rgba);
+   packed = lp_build_pack_rgba_aos(builder, desc, rgba);
+
+   if(desc->block.bits < 32)
+      packed = LLVMBuildZExt(builder, packed, LLVMInt32Type(), "");
+
+   LLVMBuildStore(builder, packed, packed_ptr);
 
    LLVMBuildRetVoid(builder);
 
@@ -169,8 +199,9 @@ add_store_rgba_test(LLVMModuleRef module,
 }
 
 
+ALIGN_STACK
 static boolean
-test_format(const struct pixel_test_case *test)
+test_format(unsigned verbose, FILE *fp, const struct pixel_test_case *test)
 {
    LLVMModuleRef module = NULL;
    LLVMValueRef load = NULL;
@@ -192,8 +223,8 @@ test_format(const struct pixel_test_case *test)
 
    module = LLVMModuleCreateWithName("test");
 
-   load = add_load_rgba_test(module, test->format);
-   store = add_store_rgba_test(module, test->format);
+   load = add_load_rgba_test(module, desc);
+   store = add_store_rgba_test(module, desc);
 
    if(LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) {
       LLVMDumpModule(module);
@@ -229,7 +260,7 @@ test_format(const struct pixel_test_case *test)
    memset(unpacked, 0, sizeof unpacked);
    packed = 0;
 
-   load_ptr(&test->packed, unpacked);
+   load_ptr(test->packed, unpacked);
    store_ptr(&packed, unpacked);
 
    success = TRUE;
@@ -255,18 +286,29 @@ test_format(const struct pixel_test_case *test)
    if(pass)
       LLVMDisposePassManager(pass);
 
+   if(fp)
+      write_tsv_row(fp, desc, success);
+
    return success;
 }
 
 
-int main(int argc, char **argv)
+boolean
+test_all(unsigned verbose, FILE *fp)
 {
    unsigned i;
-   int ret;
+   bool success = TRUE;
 
    for (i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); ++i)
-      if(!test_format(&test_cases[i]))
-        ret = 1;
+      if(!test_format(verbose, fp, &test_cases[i]))
+        success = FALSE;
 
-   return ret;
+   return success;
+}
+
+
+boolean
+test_some(unsigned verbose, FILE *fp, unsigned long n)
+{
+   return test_all(verbose, fp);
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_test_main.c b/src/gallium/drivers/llvmpipe/lp_test_main.c
index 4592dc0b2d..314544aa9a 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_main.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_main.c
@@ -34,10 +34,25 @@
  */
 
 
+#include "util/u_cpu_detect.h"
+
 #include "lp_bld_const.h"
+#include "lp_bld_misc.h"
 #include "lp_test.h"
 
 
+#ifdef PIPE_CC_MSVC
+static INLINE double
+round(double x)
+{
+   if (x >= 0.0)
+      return floor(x + 0.5);
+   else
+      return ceil(x - 0.5);
+}
+#endif
+
+
 void
 dump_type(FILE *fp,
           struct lp_type type)
@@ -365,6 +380,11 @@ int main(int argc, char **argv)
          n = atoi(argv[i]);
    }
 
+   LLVMLinkInJIT();
+   LLVMInitializeNativeTarget();
+
+   util_cpu_detect();
+
    if(fp) {
       /* Warm up the caches */
       test_some(0, NULL, 100);
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_cache.c b/src/gallium/drivers/llvmpipe/lp_tex_cache.c
index 773e848242..a6d9a2c1ac 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_cache.c
+++ b/src/gallium/drivers/llvmpipe/lp_tex_cache.c
@@ -36,6 +36,7 @@
 #include "util/u_memory.h"
 #include "util/u_tile.h"
 #include "util/u_format.h"
+#include "util/u_math.h"
 #include "lp_context.h"
 #include "lp_surface.h"
 #include "lp_texture.h"
@@ -154,7 +155,6 @@ lp_tex_tile_cache_validate_texture(struct llvmpipe_tex_tile_cache *tc)
       if (lpt->timestamp != tc->timestamp) {
          /* texture was modified, invalidate all cached tiles */
          uint i;
-         debug_printf("INV %d %d\n", tc->timestamp, lpt->timestamp);
          for (i = 0; i < NUM_ENTRIES; i++) {
             tc->entries[i].addr.bits.invalid = 1;
          }
@@ -270,8 +270,8 @@ lp_find_cached_tex_tile(struct llvmpipe_tex_tile_cache *tc,
                                      addr.bits.level, 
                                      addr.bits.z, 
                                      PIPE_TRANSFER_READ, 0, 0,
-                                     tc->texture->width[addr.bits.level],
-                                     tc->texture->height[addr.bits.level]);
+                                     u_minify(tc->texture->width0, addr.bits.level),
+                                     u_minify(tc->texture->height0, addr.bits.level));
 
          tc->tex_trans_map = screen->transfer_map(screen, tc->tex_trans);
 
@@ -290,7 +290,7 @@ lp_find_cached_tex_tile(struct llvmpipe_tex_tile_cache *tc,
             assert(0);
          }
 
-         util_format_read_4ub(tc->tex_trans->format,
+         util_format_read_4ub(tc->tex_trans->texture->format,
                               (uint8_t *)tile->color, sizeof tile->color[0],
                               tc->tex_trans_map, tc->tex_trans->stride,
                               x, y, w, h);
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample_c.c b/src/gallium/drivers/llvmpipe/lp_tex_sample_c.c
index a1365a045f..0d01c07fb5 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample_c.c
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample_c.c
@@ -544,7 +544,7 @@ compute_lambda(struct tgsi_sampler *tgsi_sampler,
       float dsdy = s[QUAD_TOP_LEFT]     - s[QUAD_BOTTOM_LEFT];
       dsdx = fabsf(dsdx);
       dsdy = fabsf(dsdy);
-      rho = MAX2(dsdx, dsdy) * texture->width[0];
+      rho = MAX2(dsdx, dsdy) * texture->width0;
    }
    if (t) {
       float dtdx = t[QUAD_BOTTOM_RIGHT] - t[QUAD_BOTTOM_LEFT];
@@ -552,7 +552,7 @@ compute_lambda(struct tgsi_sampler *tgsi_sampler,
       float max;
       dtdx = fabsf(dtdx);
       dtdy = fabsf(dtdy);
-      max = MAX2(dtdx, dtdy) * texture->height[0];
+      max = MAX2(dtdx, dtdy) * texture->height0;
       rho = MAX2(rho, max);
    }
    if (p) {
@@ -561,7 +561,7 @@ compute_lambda(struct tgsi_sampler *tgsi_sampler,
       float max;
       dpdx = fabsf(dpdx);
       dpdy = fabsf(dpdy);
-      max = MAX2(dpdx, dpdy) * texture->depth[0];
+      max = MAX2(dpdx, dpdy) * texture->depth0;
       rho = MAX2(rho, max);
    }
 
@@ -726,9 +726,9 @@ get_texel(const struct tgsi_sampler *tgsi_sampler,
    const struct pipe_texture *texture = samp->texture;
    const struct pipe_sampler_state *sampler = samp->sampler;
 
-   if (x < 0 || x >= (int) texture->width[level] ||
-       y < 0 || y >= (int) texture->height[level] ||
-       z < 0 || z >= (int) texture->depth[level]) {
+   if (x < 0 || x >= (int) u_minify(texture->width0, level) ||
+       y < 0 || y >= (int) u_minify(texture->height0, level) ||
+       z < 0 || z >= (int) u_minify(texture->depth0, level)) {
       rgba[0][j] = sampler->border_color[0];
       rgba[1][j] = sampler->border_color[1];
       rgba[2][j] = sampler->border_color[2];
@@ -1093,8 +1093,8 @@ lp_get_samples_2d_common(struct tgsi_sampler *tgsi_sampler,
 
    assert(sampler->normalized_coords);
 
-   width = texture->width[level0];
-   height = texture->height[level0];
+   width = u_minify(texture->width0, level0);
+   height = u_minify(texture->height0, level0);
 
    assert(width > 0);
 
@@ -1250,9 +1250,9 @@ lp_get_samples_3d(struct tgsi_sampler *tgsi_sampler,
 
    assert(sampler->normalized_coords);
 
-   width = texture->width[level0];
-   height = texture->height[level0];
-   depth = texture->depth[level0];
+   width = u_minify(texture->width0, level0);
+   height = u_minify(texture->height0, level0);
+   depth = u_minify(texture->depth0, level0);
 
    assert(width > 0);
    assert(height > 0);
@@ -1394,8 +1394,8 @@ lp_get_samples_rect(struct tgsi_sampler *tgsi_sampler,
    /* texture RECTS cannot be mipmapped */
    assert(level0 == level1);
 
-   width = texture->width[level0];
-   height = texture->height[level0];
+   width = u_minify(texture->width0, level0);
+   height = u_minify(texture->height0, level0);
 
    assert(width > 0);
 
@@ -1513,8 +1513,8 @@ lp_get_samples(struct tgsi_sampler *tgsi_sampler,
 
    /* Do this elsewhere: 
     */
-   samp->xpot = util_unsigned_logbase2( samp->texture->width[0] );
-   samp->ypot = util_unsigned_logbase2( samp->texture->height[0] );
+   samp->xpot = util_unsigned_logbase2( samp->texture->width0 );
+   samp->ypot = util_unsigned_logbase2( samp->texture->height0 );
 
    /* Try to hook in a faster sampler.  Ultimately we'll have to
     * code-generate these.  Luckily most of this looks like it is
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c
index 724d437833..f099f903bd 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -48,7 +48,6 @@
 /* Simple, maximally packed layout.
  */
 
-
 /* Conventional allocation path for non-display textures:
  */
 static boolean
@@ -57,31 +56,31 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen,
 {
    struct pipe_texture *pt = &lpt->base;
    unsigned level;
-   unsigned width = pt->width[0];
-   unsigned height = pt->height[0];
-   unsigned depth = pt->depth[0];
+   unsigned width = pt->width0;
+   unsigned height = pt->height0;
+   unsigned depth = pt->depth0;
 
    unsigned buffer_size = 0;
 
-   pf_get_block(lpt->base.format, &lpt->base.block);
-
    for (level = 0; level <= pt->last_level; level++) {
-      pt->width[level] = width;
-      pt->height[level] = height;
-      pt->depth[level] = depth;
-      pt->nblocksx[level] = pf_get_nblocksx(&pt->block, width);  
-      pt->nblocksy[level] = pf_get_nblocksy(&pt->block, height);  
-      lpt->stride[level] = align(pt->nblocksx[level]*pt->block.size, 16);
+      unsigned nblocksx, nblocksy;
+
+      /* Allocate storage for whole quads. This is particularly important
+       * for depth surfaces, which are currently stored in a swizzled format. */
+      nblocksx = pf_get_nblocksx(pt->format, align(width, 2));
+      nblocksy = pf_get_nblocksy(pt->format, align(height, 2));
+
+      lpt->stride[level] = align(nblocksx * pf_get_blocksize(pt->format), 16);
 
       lpt->level_offset[level] = buffer_size;
 
-      buffer_size += (pt->nblocksy[level] *
+      buffer_size += (nblocksy *
                       ((pt->target == PIPE_TEXTURE_CUBE) ? 6 : depth) *
                       lpt->stride[level]);
 
-      width  = minify(width);
-      height = minify(height);
-      depth = minify(depth);
+      width  = u_minify(width, 1);
+      height = u_minify(height, 1);
+      depth = u_minify(depth, 1);
    }
 
    lpt->data = align_malloc(buffer_size, 16);
@@ -95,14 +94,10 @@ llvmpipe_displaytarget_layout(struct llvmpipe_screen *screen,
 {
    struct llvmpipe_winsys *winsys = screen->winsys;
 
-   pf_get_block(lpt->base.format, &lpt->base.block);
-   lpt->base.nblocksx[0] = pf_get_nblocksx(&lpt->base.block, lpt->base.width[0]);  
-   lpt->base.nblocksy[0] = pf_get_nblocksy(&lpt->base.block, lpt->base.height[0]);  
-
    lpt->dt = winsys->displaytarget_create(winsys,
                                           lpt->base.format,
-                                          lpt->base.width[0],
-                                          lpt->base.height[0],
+                                          lpt->base.width0,
+                                          lpt->base.height0,
                                           16,
                                           &lpt->stride[0] );
 
@@ -164,7 +159,7 @@ llvmpipe_texture_blanket(struct pipe_screen * screen,
    /* Only supports one type */
    if (base->target != PIPE_TEXTURE_2D ||
        base->last_level != 0 ||
-       base->depth[0] != 1) {
+       base->depth0 != 1) {
       return NULL;
    }
 
@@ -175,8 +170,6 @@ llvmpipe_texture_blanket(struct pipe_screen * screen,
    lpt->base = *base;
    pipe_reference_init(&lpt->base.reference, 1);
    lpt->base.screen = screen;
-   lpt->base.nblocksx[0] = pf_get_nblocksx(&lpt->base.block, lpt->base.width[0]);  
-   lpt->base.nblocksy[0] = pf_get_nblocksy(&lpt->base.block, lpt->base.height[0]);  
    lpt->stride[0] = stride[0];
 
    pipe_buffer_reference(&lpt->buffer, buffer);
@@ -221,8 +214,8 @@ llvmpipe_get_tex_surface(struct pipe_screen *screen,
       pipe_reference_init(&ps->reference, 1);
       pipe_texture_reference(&ps->texture, pt);
       ps->format = pt->format;
-      ps->width = pt->width[level];
-      ps->height = pt->height[level];
+      ps->width = u_minify(pt->width0, level);
+      ps->height = u_minify(pt->height0, level);
       ps->offset = lpt->level_offset[level];
       ps->usage = usage;
 
@@ -250,11 +243,17 @@ llvmpipe_get_tex_surface(struct pipe_screen *screen,
       ps->level = level;
       ps->zslice = zslice;
 
+      /* XXX shouldn't that rather be
+         tex_height = align(ps->height, 2);
+         to account for alignment done in llvmpipe_texture_layout ?
+      */
       if (pt->target == PIPE_TEXTURE_CUBE) {
-         ps->offset += face * pt->nblocksy[level] * lpt->stride[level];
+         unsigned tex_height = ps->height;
+         ps->offset += face * pf_get_nblocksy(pt->format, tex_height) * lpt->stride[level];
       }
       else if (pt->target == PIPE_TEXTURE_3D) {
-         ps->offset += zslice * pt->nblocksy[level] * lpt->stride[level];
+         unsigned tex_height = ps->height;
+         ps->offset += zslice * pf_get_nblocksy(pt->format, tex_height) * lpt->stride[level];
       }
       else {
          assert(face == 0);
@@ -295,14 +294,10 @@ llvmpipe_get_tex_transfer(struct pipe_screen *screen,
    if (lpt) {
       struct pipe_transfer *pt = &lpt->base;
       pipe_texture_reference(&pt->texture, texture);
-      pt->format = texture->format;
-      pt->block = texture->block;
       pt->x = x;
       pt->y = y;
       pt->width = w;
       pt->height = h;
-      pt->nblocksx = texture->nblocksx[level];
-      pt->nblocksy = texture->nblocksy[level];
       pt->stride = lptex->stride[level];
       pt->usage = usage;
       pt->face = face;
@@ -311,11 +306,17 @@ llvmpipe_get_tex_transfer(struct pipe_screen *screen,
 
       lpt->offset = lptex->level_offset[level];
 
+      /* XXX shouldn't that rather be
+         tex_height = align(u_minify(texture->height0, level), 2)
+         to account for alignment done in llvmpipe_texture_layout ?
+      */
       if (texture->target == PIPE_TEXTURE_CUBE) {
-         lpt->offset += face * pt->nblocksy * pt->stride;
+         unsigned tex_height = u_minify(texture->height0, level);
+         lpt->offset += face *  pf_get_nblocksy(texture->format, tex_height) * pt->stride;
       }
       else if (texture->target == PIPE_TEXTURE_3D) {
-         lpt->offset += zslice * pt->nblocksy * pt->stride;
+         unsigned tex_height = u_minify(texture->height0, level);
+         lpt->offset += zslice * pf_get_nblocksy(texture->format, tex_height) * pt->stride;
       }
       else {
          assert(face == 0);
@@ -347,23 +348,17 @@ llvmpipe_transfer_map( struct pipe_screen *_screen,
    struct llvmpipe_screen *screen = llvmpipe_screen(_screen);
    ubyte *map, *xfer_map;
    struct llvmpipe_texture *lpt;
+   enum pipe_format format;
 
    assert(transfer->texture);
    lpt = llvmpipe_texture(transfer->texture);
+   format = lpt->base.format;
 
    if(lpt->dt) {
       struct llvmpipe_winsys *winsys = screen->winsys;
-      unsigned flags = 0;
-
-      if (transfer->usage != PIPE_TRANSFER_READ) {
-         flags |= PIPE_BUFFER_USAGE_CPU_WRITE;
-      }
-
-      if (transfer->usage != PIPE_TRANSFER_WRITE) {
-         flags |= PIPE_BUFFER_USAGE_CPU_READ;
-      }
 
-      map = winsys->displaytarget_map(winsys, lpt->dt, flags);
+      map = winsys->displaytarget_map(winsys, lpt->dt,
+                                      pipe_transfer_buffer_flags(transfer));
       if (map == NULL)
          return NULL;
    }
@@ -373,7 +368,7 @@ llvmpipe_transfer_map( struct pipe_screen *_screen,
    /* May want to different things here depending on read/write nature
     * of the map:
     */
-   if (transfer->texture && transfer->usage != PIPE_TRANSFER_READ) 
+   if (transfer->texture && (transfer->usage & PIPE_TRANSFER_WRITE))
    {
       /* Do something to notify sharing contexts of a texture change.
        * In llvmpipe, that would mean flushing the texture cache.
@@ -382,8 +377,8 @@ llvmpipe_transfer_map( struct pipe_screen *_screen,
    }
    
    xfer_map = map + llvmpipe_transfer(transfer)->offset +
-      transfer->y / transfer->block.height * transfer->stride +
-      transfer->x / transfer->block.width * transfer->block.size;
+      transfer->y / pf_get_blockheight(format) * transfer->stride +
+      transfer->x / pf_get_blockwidth(format) * pf_get_blocksize(format);
    /*printf("map = %p  xfer map = %p\n", map, xfer_map);*/
    return xfer_map;
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_cache.c b/src/gallium/drivers/llvmpipe/lp_tile_cache.c
index 143afec3d3..7a1ecf5107 100644
--- a/src/gallium/drivers/llvmpipe/lp_tile_cache.c
+++ b/src/gallium/drivers/llvmpipe/lp_tile_cache.c
@@ -44,10 +44,53 @@
 #include "lp_tile_cache.h"
 
 
+#define MAX_WIDTH 4096
+#define MAX_HEIGHT 4096
+
+
+enum llvmpipe_tile_status
+{
+   LP_TILE_STATUS_UNDEFINED = 0,
+   LP_TILE_STATUS_CLEAR = 1,
+   LP_TILE_STATUS_DEFINED = 2
+};
+
+
+struct llvmpipe_cached_tile
+{
+   enum llvmpipe_tile_status status;
+
+   /** color in SOA format */
+   uint8_t *color;
+};
+
+
+struct llvmpipe_tile_cache
+{
+   struct pipe_screen *screen;
+   struct pipe_surface *surface;  /**< the surface we're caching */
+   struct pipe_transfer *transfer;
+   void *transfer_map;
+
+   struct llvmpipe_cached_tile entries[MAX_WIDTH/TILE_SIZE][MAX_HEIGHT/TILE_SIZE];
+
+   uint8_t clear_color[4];  /**< for color bufs */
+   uint clear_val;        /**< for z+stencil, or packed color clear value */
+
+   struct llvmpipe_cached_tile *last_tile;  /**< most recently retrieved tile */
+};
+
+
 struct llvmpipe_tile_cache *
 lp_create_tile_cache( struct pipe_screen *screen )
 {
    struct llvmpipe_tile_cache *tc;
+   int maxLevels, maxTexSize;
+
+   /* sanity checking: max sure MAX_WIDTH/HEIGHT >= largest texture image */
+   maxLevels = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_LEVELS);
+   maxTexSize = 1 << (maxLevels - 1);
+   assert(MAX_WIDTH >= maxTexSize);
 
    tc = CALLOC_STRUCT( llvmpipe_tile_cache );
    if(!tc)
@@ -193,44 +236,41 @@ lp_flush_tile_cache(struct llvmpipe_tile_cache *tc)
    if(!pt)
       return;
 
+   assert(tc->transfer_map);
+
    /* push the tile to all positions marked as clear */
    for (y = 0; y < pt->height; y += TILE_SIZE) {
       for (x = 0; x < pt->width; x += TILE_SIZE) {
          struct llvmpipe_cached_tile *tile = &tc->entries[y/TILE_SIZE][x/TILE_SIZE];
 
-         switch(tile->status) {
-         case LP_TILE_STATUS_UNDEFINED:
-            break;
-
-         case LP_TILE_STATUS_CLEAR: {
-            /**
-             * Actually clear the tiles which were flagged as being in a clear state.
-             */
-
-            struct pipe_screen *screen = pt->texture->screen;
-            unsigned tw = TILE_SIZE;
-            unsigned th = TILE_SIZE;
-            void *dst;
-
-            if (pipe_clip_tile(x, y, &tw, &th, pt))
-               continue;
-
-            dst = screen->transfer_map(screen, pt);
-            assert(dst);
-            if(!dst)
-               continue;
-
-            util_fill_rect(dst, &pt->block, pt->stride,
-                           x, y, tw,  th,
-                           tc->clear_val);
-
-            screen->transfer_unmap(screen, pt);
-            break;
-         }
+         if(tile->status != LP_TILE_STATUS_UNDEFINED) {
+            unsigned w = TILE_SIZE;
+            unsigned h = TILE_SIZE;
+
+            if (!pipe_clip_tile(x, y, &w, &h, pt)) {
+               switch(tile->status) {
+               case LP_TILE_STATUS_CLEAR:
+                  /* Actually clear the tiles which were flagged as being in a
+                   * clear state. */
+                  util_fill_rect(tc->transfer_map, pt->texture->format, pt->stride,
+                                 x, y, w, h,
+                                 tc->clear_val);
+                  break;
+
+               case LP_TILE_STATUS_DEFINED:
+                  lp_tile_write_4ub(pt->texture->format,
+                                    tile->color,
+                                    tc->transfer_map, pt->stride,
+                                    x, y, w, h);
+                  break;
+
+               default:
+                  assert(0);
+                  break;
+               }
+            }
 
-         case LP_TILE_STATUS_DEFINED:
-            lp_put_tile_rgba_soa(pt, x, y, tile->color);
-            break;
+            tile->status = LP_TILE_STATUS_UNDEFINED;
          }
       }
    }
@@ -248,6 +288,14 @@ lp_get_cached_tile(struct llvmpipe_tile_cache *tc,
    struct llvmpipe_cached_tile *tile = &tc->entries[y/TILE_SIZE][x/TILE_SIZE];
    struct pipe_transfer *pt = tc->transfer;
    
+   assert(tc->surface);
+   assert(tc->transfer);
+
+   if(!tc->transfer_map)
+      lp_tile_cache_map_transfers(tc);
+
+   assert(tc->transfer_map);
+
    switch(tile->status) {
    case LP_TILE_STATUS_CLEAR:
       /* don't get tile from framebuffer, just clear it */
@@ -255,11 +303,22 @@ lp_get_cached_tile(struct llvmpipe_tile_cache *tc,
       tile->status = LP_TILE_STATUS_DEFINED;
       break;
 
-   case LP_TILE_STATUS_UNDEFINED:
-      /* get new tile data from transfer */
-      lp_get_tile_rgba_soa(pt, x, y, tile->color);
+   case LP_TILE_STATUS_UNDEFINED: {
+      unsigned w = TILE_SIZE;
+      unsigned h = TILE_SIZE;
+
+      x &= ~(TILE_SIZE - 1);
+      y &= ~(TILE_SIZE - 1);
+
+      if (!pipe_clip_tile(x, y, &w, &h, tc->transfer))
+         lp_tile_read_4ub(pt->texture->format,
+                          tile->color,
+                          tc->transfer_map, tc->transfer->stride,
+                          x, y, w, h);
+
       tile->status = LP_TILE_STATUS_DEFINED;
       break;
+   }
 
    case LP_TILE_STATUS_DEFINED:
       /* nothing to do */
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_cache.h b/src/gallium/drivers/llvmpipe/lp_tile_cache.h
index 6d8ba5ece7..161bab3799 100644
--- a/src/gallium/drivers/llvmpipe/lp_tile_cache.h
+++ b/src/gallium/drivers/llvmpipe/lp_tile_cache.h
@@ -33,42 +33,7 @@
 #include "lp_tile_soa.h"
 
 
-enum llvmpipe_tile_status
-{
-   LP_TILE_STATUS_UNDEFINED = 0,
-   LP_TILE_STATUS_CLEAR = 1,
-   LP_TILE_STATUS_DEFINED = 2
-};
-
-
-struct llvmpipe_cached_tile
-{
-   enum llvmpipe_tile_status status;
-
-   /** color in SOA format */
-   uint8_t *color;
-};
-
-
-/** XXX move these */
-#define MAX_WIDTH 2048
-#define MAX_HEIGHT 2048
-
-
-struct llvmpipe_tile_cache
-{
-   struct pipe_screen *screen;
-   struct pipe_surface *surface;  /**< the surface we're caching */
-   struct pipe_transfer *transfer;
-   void *transfer_map;
-
-   struct llvmpipe_cached_tile entries[MAX_WIDTH/TILE_SIZE][MAX_HEIGHT/TILE_SIZE];
-
-   uint8_t clear_color[4];  /**< for color bufs */
-   uint clear_val;        /**< for z+stencil, or packed color clear value */
-
-   struct llvmpipe_cached_tile *last_tile;  /**< most recently retrieved tile */
-};
+struct llvmpipe_tile_cache;  /* opaque */
 
 
 extern struct llvmpipe_tile_cache *
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_soa.c b/src/gallium/drivers/llvmpipe/lp_tile_soa.c
deleted file mode 100644
index 4e4ccb31cc..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_tile_soa.c
+++ /dev/null
@@ -1,931 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * RGBA/float tile get/put functions.
- * Usable both by drivers and state trackers.
- */
-
-
-#include "pipe/p_defines.h"
-#include "pipe/p_inlines.h"
-
-#include "util/u_math.h"
-#include "util/u_memory.h"
-#include "util/u_rect.h"
-#include "util/u_tile.h"
-#include "lp_tile_cache.h"
-#include "lp_tile_soa.h"
-
-
-const unsigned char
-tile_offset[TILE_VECTOR_HEIGHT][TILE_VECTOR_WIDTH] = {
-   {  0,  1,  4,  5,  8,  9, 12, 13},
-   {  2,  3,  6,  7, 10, 11, 14, 15}
-};
-
-
-
-/*** PIPE_FORMAT_A8R8G8B8_UNORM ***/
-
-static void
-a8r8g8b8_get_tile_rgba(const unsigned *src,
-                       unsigned w, unsigned h,
-                       uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         const unsigned pixel = *src++;
-         TILE_PIXEL(p, j, i, 0) = (pixel >> 16) & 0xff;
-         TILE_PIXEL(p, j, i, 1) = (pixel >>  8) & 0xff;
-         TILE_PIXEL(p, j, i, 2) = (pixel >>  0) & 0xff;
-         TILE_PIXEL(p, j, i, 3) = (pixel >> 24) & 0xff;
-      }
-   }
-}
-
-
-static void
-a8r8g8b8_put_tile_rgba(unsigned *dst,
-                       unsigned w, unsigned h,
-                       const uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         unsigned r, g, b, a;
-         r = TILE_PIXEL(p, j, i, 0);
-         g = TILE_PIXEL(p, j, i, 1);
-         b = TILE_PIXEL(p, j, i, 2);
-         a = TILE_PIXEL(p, j, i, 3);
-         *dst++ = (a << 24) | (r << 16) | (g << 8) | b;
-      }
-   }
-}
-
-
-/*** PIPE_FORMAT_A8R8G8B8_UNORM ***/
-
-static void
-x8r8g8b8_get_tile_rgba(const unsigned *src,
-                       unsigned w, unsigned h,
-                       uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         const unsigned pixel = *src++;
-         TILE_PIXEL(p, j, i, 0) = (pixel >> 16) & 0xff;
-         TILE_PIXEL(p, j, i, 1) = (pixel >>  8) & 0xff;
-         TILE_PIXEL(p, j, i, 2) = (pixel >>  0) & 0xff;
-         TILE_PIXEL(p, j, i, 3) = 0xff;
-      }
-   }
-}
-
-
-static void
-x8r8g8b8_put_tile_rgba(unsigned *dst,
-                       unsigned w, unsigned h,
-                       const uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         unsigned r, g, b;
-         r = TILE_PIXEL(p, j, i, 0);
-         g = TILE_PIXEL(p, j, i, 1);
-         b = TILE_PIXEL(p, j, i, 2);
-         *dst++ = (0xff << 24) | (r << 16) | (g << 8) | b;
-      }
-   }
-}
-
-
-/*** PIPE_FORMAT_B8G8R8A8_UNORM ***/
-
-static void
-b8g8r8a8_get_tile_rgba(const unsigned *src,
-                       unsigned w, unsigned h,
-                       uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         const unsigned pixel = *src++;
-         TILE_PIXEL(p, j, i, 0) = (pixel >>  8) & 0xff;
-         TILE_PIXEL(p, j, i, 1) = (pixel >> 16) & 0xff;
-         TILE_PIXEL(p, j, i, 2) = (pixel >> 24) & 0xff;
-         TILE_PIXEL(p, j, i, 3) = (pixel >>  0) & 0xff;
-      }
-   }
-}
-
-
-static void
-b8g8r8a8_put_tile_rgba(unsigned *dst,
-                       unsigned w, unsigned h,
-                       const uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         unsigned r, g, b, a;
-         r = TILE_PIXEL(p, j, i, 0);
-         g = TILE_PIXEL(p, j, i, 1);
-         b = TILE_PIXEL(p, j, i, 2);
-         a = TILE_PIXEL(p, j, i, 3);
-         *dst++ = (b << 24) | (g << 16) | (r << 8) | a;
-      }
-   }
-}
-
-
-/*** PIPE_FORMAT_A1R5G5B5_UNORM ***/
-
-static void
-a1r5g5b5_get_tile_rgba(const ushort *src,
-                       unsigned w, unsigned h,
-                       uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         const ushort pixel = *src++;
-         TILE_PIXEL(p, j, i, 0) = ((pixel >> 10) & 0x1f) * 255 / 31;
-         TILE_PIXEL(p, j, i, 1) = ((pixel >>  5) & 0x1f) * 255 / 31;
-         TILE_PIXEL(p, j, i, 2) = ((pixel      ) & 0x1f) * 255 / 31;
-         TILE_PIXEL(p, j, i, 3) = ((pixel >> 15)       ) * 255;
-      }
-   }
-}
-
-
-static void
-a1r5g5b5_put_tile_rgba(ushort *dst,
-                       unsigned w, unsigned h,
-                       const uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         unsigned r, g, b, a;
-         r = TILE_PIXEL(p, j, i, 0);
-         g = TILE_PIXEL(p, j, i, 1);
-         b = TILE_PIXEL(p, j, i, 2);
-         a = TILE_PIXEL(p, j, i, 3);
-         r = r >> 3;  /* 5 bits */
-         g = g >> 3;  /* 5 bits */
-         b = b >> 3;  /* 5 bits */
-         a = a >> 7;  /* 1 bit */
-         *dst++ = (a << 15) | (r << 10) | (g << 5) | b;
-      }
-   }
-}
-
-
-/*** PIPE_FORMAT_A4R4G4B4_UNORM ***/
-
-static void
-a4r4g4b4_get_tile_rgba(const ushort *src,
-                       unsigned w, unsigned h,
-                       uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         const ushort pixel = *src++;
-         TILE_PIXEL(p, j, i, 0) = ((pixel >>  8) & 0xf) * 255 / 15;
-         TILE_PIXEL(p, j, i, 1) = ((pixel >>  4) & 0xf) * 255 / 15;
-         TILE_PIXEL(p, j, i, 2) = ((pixel      ) & 0xf) * 255 / 15;
-         TILE_PIXEL(p, j, i, 3) = ((pixel >> 12)      ) * 255 / 15;
-      }
-   }
-}
-
-
-static void
-a4r4g4b4_put_tile_rgba(ushort *dst,
-                       unsigned w, unsigned h,
-                       const uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         unsigned r, g, b, a;
-         r = TILE_PIXEL(p, j, i, 0);
-         g = TILE_PIXEL(p, j, i, 1);
-         b = TILE_PIXEL(p, j, i, 2);
-         a = TILE_PIXEL(p, j, i, 3);
-         r >>= 4;
-         g >>= 4;
-         b >>= 4;
-         a >>= 4;
-         *dst++ = (a << 12) | (r << 16) | (g << 4) | b;
-      }
-   }
-}
-
-
-/*** PIPE_FORMAT_R5G6B5_UNORM ***/
-
-static void
-r5g6b5_get_tile_rgba(const ushort *src,
-                     unsigned w, unsigned h,
-                     uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         const ushort pixel = *src++;
-         TILE_PIXEL(p, j, i, 0) = ((pixel >> 11) & 0x1f) * 255 / 31;
-         TILE_PIXEL(p, j, i, 1) = ((pixel >>  5) & 0x3f) * 255 / 63;
-         TILE_PIXEL(p, j, i, 2) = ((pixel      ) & 0x1f) * 255 / 31;
-         TILE_PIXEL(p, j, i, 3) = 255;
-      }
-   }
-}
-
-
-static void
-r5g6b5_put_tile_rgba(ushort *dst,
-                     unsigned w, unsigned h,
-                     const uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         uint r = (uint) TILE_PIXEL(p, j, i, 0) * 31 / 255;
-         uint g = (uint) TILE_PIXEL(p, j, i, 1) * 63 / 255;
-         uint b = (uint) TILE_PIXEL(p, j, i, 2) * 31 / 255;
-         *dst++ = (r << 11) | (g << 5) | (b);
-      }
-   }
-}
-
-
-
-/*** PIPE_FORMAT_Z16_UNORM ***/
-
-/**
- * Return each Z value as four floats in [0,1].
- */
-static void
-z16_get_tile_rgba(const ushort *src,
-                  unsigned w, unsigned h,
-                  uint8_t *p)
-{
-   const float scale = 1.0f / 65535.0f;
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         TILE_PIXEL(p, j, i, 0) =
-         TILE_PIXEL(p, j, i, 1) =
-         TILE_PIXEL(p, j, i, 2) =
-         TILE_PIXEL(p, j, i, 3) = *src++ * scale;
-      }
-   }
-}
-
-
-
-
-/*** PIPE_FORMAT_L8_UNORM ***/
-
-static void
-l8_get_tile_rgba(const ubyte *src,
-                 unsigned w, unsigned h,
-                 uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++, src++) {
-         TILE_PIXEL(p, j, i, 0) =
-         TILE_PIXEL(p, j, i, 1) =
-         TILE_PIXEL(p, j, i, 2) = *src;
-         TILE_PIXEL(p, j, i, 3) = 255;
-      }
-   }
-}
-
-
-static void
-l8_put_tile_rgba(ubyte *dst,
-                 unsigned w, unsigned h,
-                 const uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         unsigned r;
-         r = TILE_PIXEL(p, j, i, 0);
-         *dst++ = (ubyte) r;
-      }
-   }
-}
-
-
-
-/*** PIPE_FORMAT_A8_UNORM ***/
-
-static void
-a8_get_tile_rgba(const ubyte *src,
-                 unsigned w, unsigned h,
-                 uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++, src++) {
-         TILE_PIXEL(p, j, i, 0) =
-         TILE_PIXEL(p, j, i, 1) =
-         TILE_PIXEL(p, j, i, 2) = 0;
-         TILE_PIXEL(p, j, i, 3) = *src;
-      }
-   }
-}
-
-
-static void
-a8_put_tile_rgba(ubyte *dst,
-                 unsigned w, unsigned h,
-                 const uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         unsigned a;
-         a = TILE_PIXEL(p, j, i, 3);
-         *dst++ = (ubyte) a;
-      }
-   }
-}
-
-
-
-/*** PIPE_FORMAT_R16_SNORM ***/
-
-static void
-r16_get_tile_rgba(const short *src,
-                  unsigned w, unsigned h,
-                  uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++, src++) {
-         TILE_PIXEL(p, j, i, 0) = MAX2(src[0] >> 7, 0);
-         TILE_PIXEL(p, j, i, 1) =
-         TILE_PIXEL(p, j, i, 2) = 0;
-         TILE_PIXEL(p, j, i, 3) = 255;
-      }
-   }
-}
-
-
-static void
-r16_put_tile_rgba(short *dst,
-                  unsigned w, unsigned h,
-                  const uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++, dst++) {
-         dst[0] = TILE_PIXEL(p, j, i, 0) << 7;
-      }
-   }
-}
-
-
-/*** PIPE_FORMAT_R16G16B16A16_SNORM ***/
-
-static void
-r16g16b16a16_get_tile_rgba(const short *src,
-                           unsigned w, unsigned h,
-                           uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++, src += 4) {
-         TILE_PIXEL(p, j, i, 0) = src[0] >> 8;
-         TILE_PIXEL(p, j, i, 1) = src[1] >> 8;
-         TILE_PIXEL(p, j, i, 2) = src[2] >> 8;
-         TILE_PIXEL(p, j, i, 3) = src[3] >> 8;
-      }
-   }
-}
-
-
-static void
-r16g16b16a16_put_tile_rgba(short *dst,
-                           unsigned w, unsigned h,
-                           const uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++, dst += 4) {
-         dst[0] = TILE_PIXEL(p, j, i, 0) << 8;
-         dst[1] = TILE_PIXEL(p, j, i, 1) << 8;
-         dst[2] = TILE_PIXEL(p, j, i, 2) << 8;
-         dst[3] = TILE_PIXEL(p, j, i, 3) << 8;
-      }
-   }
-}
-
-
-
-/*** PIPE_FORMAT_I8_UNORM ***/
-
-static void
-i8_get_tile_rgba(const ubyte *src,
-                 unsigned w, unsigned h,
-                 uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++, src++) {
-         TILE_PIXEL(p, j, i, 0) =
-         TILE_PIXEL(p, j, i, 1) =
-         TILE_PIXEL(p, j, i, 2) =
-         TILE_PIXEL(p, j, i, 3) = *src;
-      }
-   }
-}
-
-
-static void
-i8_put_tile_rgba(ubyte *dst,
-                 unsigned w, unsigned h,
-                 const uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         unsigned r;
-         r = TILE_PIXEL(p, j, i, 0);
-         *dst++ = (ubyte) r;
-      }
-   }
-}
-
-
-/*** PIPE_FORMAT_A8L8_UNORM ***/
-
-static void
-a8l8_get_tile_rgba(const ushort *src,
-                   unsigned w, unsigned h,
-                   uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         ushort ra = *src++;
-         TILE_PIXEL(p, j, i, 0) =
-         TILE_PIXEL(p, j, i, 1) =
-         TILE_PIXEL(p, j, i, 2) = ra & 0xff;
-         TILE_PIXEL(p, j, i, 3) = ra >> 8;
-      }
-   }
-}
-
-
-static void
-a8l8_put_tile_rgba(ushort *dst,
-                   unsigned w, unsigned h,
-                   const uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         unsigned r, a;
-         r = TILE_PIXEL(p, j, i, 0);
-         a = TILE_PIXEL(p, j, i, 3);
-         *dst++ = (a << 8) | r;
-      }
-   }
-}
-
-
-
-
-/*** PIPE_FORMAT_Z32_UNORM ***/
-
-/**
- * Return each Z value as four floats in [0,1].
- */
-static void
-z32_get_tile_rgba(const unsigned *src,
-                  unsigned w, unsigned h,
-                  uint8_t *p)
-{
-   const double scale = 1.0 / (double) 0xffffffff;
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         TILE_PIXEL(p, j, i, 0) =
-         TILE_PIXEL(p, j, i, 1) =
-         TILE_PIXEL(p, j, i, 2) =
-         TILE_PIXEL(p, j, i, 3) = (float) (*src++ * scale);
-      }
-   }
-}
-
-
-/*** PIPE_FORMAT_S8Z24_UNORM ***/
-
-/**
- * Return Z component as four float in [0,1].  Stencil part ignored.
- */
-static void
-s8z24_get_tile_rgba(const unsigned *src,
-                    unsigned w, unsigned h,
-                    uint8_t *p)
-{
-   const double scale = 1.0 / ((1 << 24) - 1);
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         TILE_PIXEL(p, j, i, 0) =
-         TILE_PIXEL(p, j, i, 1) =
-         TILE_PIXEL(p, j, i, 2) =
-         TILE_PIXEL(p, j, i, 3) = (float) (scale * (*src++ & 0xffffff));
-      }
-   }
-}
-
-
-/*** PIPE_FORMAT_Z24S8_UNORM ***/
-
-/**
- * Return Z component as four float in [0,1].  Stencil part ignored.
- */
-static void
-z24s8_get_tile_rgba(const unsigned *src,
-                    unsigned w, unsigned h,
-                    uint8_t *p)
-{
-   const double scale = 1.0 / ((1 << 24) - 1);
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         TILE_PIXEL(p, j, i, 0) =
-         TILE_PIXEL(p, j, i, 1) =
-         TILE_PIXEL(p, j, i, 2) =
-         TILE_PIXEL(p, j, i, 3) = (float) (scale * (*src++ >> 8));
-      }
-   }
-}
-
-
-/*** PIPE_FORMAT_Z32_FLOAT ***/
-
-/**
- * Return each Z value as four floats in [0,1].
- */
-static void
-z32f_get_tile_rgba(const float *src,
-                   unsigned w, unsigned h,
-                   uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         TILE_PIXEL(p, j, i, 0) =
-         TILE_PIXEL(p, j, i, 1) =
-         TILE_PIXEL(p, j, i, 2) =
-         TILE_PIXEL(p, j, i, 3) = *src++;
-      }
-   }
-}
-
-
-/*** PIPE_FORMAT_YCBCR / PIPE_FORMAT_YCBCR_REV ***/
-
-/**
- * Convert YCbCr (or YCrCb) to RGBA.
- */
-static void
-ycbcr_get_tile_rgba(const ushort *src,
-                    unsigned w, unsigned h,
-                    uint8_t *p,
-                    boolean rev)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      /* do two texels at a time */
-      for (j = 0; j < (w & ~1); j += 2, src += 2) {
-         const ushort t0 = src[0];
-         const ushort t1 = src[1];
-         const ubyte y0 = (t0 >> 8) & 0xff;  /* luminance */
-         const ubyte y1 = (t1 >> 8) & 0xff;  /* luminance */
-         ubyte cb, cr;
-         float r, g, b;
-
-         if (rev) {
-            cb = t1 & 0xff;         /* chroma U */
-            cr = t0 & 0xff;         /* chroma V */
-         }
-         else {
-            cb = t0 & 0xff;         /* chroma U */
-            cr = t1 & 0xff;         /* chroma V */
-         }
-
-         /* even pixel: y0,cr,cb */
-         r = 1.164f * (y0-16) + 1.596f * (cr-128);
-         g = 1.164f * (y0-16) - 0.813f * (cr-128) - 0.391f * (cb-128);
-         b = 1.164f * (y0-16) + 2.018f * (cb-128);
-         TILE_PIXEL(p, j, i, 0) = r;
-         TILE_PIXEL(p, j, i, 1) = g;
-         TILE_PIXEL(p, j, i, 2) = b;
-         TILE_PIXEL(p, j, i, 3) = 255;
-
-         /* odd pixel: use y1,cr,cb */
-         r = 1.164f * (y1-16) + 1.596f * (cr-128);
-         g = 1.164f * (y1-16) - 0.813f * (cr-128) - 0.391f * (cb-128);
-         b = 1.164f * (y1-16) + 2.018f * (cb-128);
-         TILE_PIXEL(p, j + 1, i, 0) = r;
-         TILE_PIXEL(p, j + 1, i, 1) = g;
-         TILE_PIXEL(p, j + 1, i, 2) = b;
-         TILE_PIXEL(p, j + 1, i, 3) = 255;
-      }
-      /* do the last texel */
-      if (w & 1) {
-         const ushort t0 = src[0];
-         const ushort t1 = src[1];
-         const ubyte y0 = (t0 >> 8) & 0xff;  /* luminance */
-         ubyte cb, cr;
-         float r, g, b;
-
-         if (rev) {
-            cb = t1 & 0xff;         /* chroma U */
-            cr = t0 & 0xff;         /* chroma V */
-         }
-         else {
-            cb = t0 & 0xff;         /* chroma U */
-            cr = t1 & 0xff;         /* chroma V */
-         }
-
-         /* even pixel: y0,cr,cb */
-         r = 1.164f * (y0-16) + 1.596f * (cr-128);
-         g = 1.164f * (y0-16) - 0.813f * (cr-128) - 0.391f * (cb-128);
-         b = 1.164f * (y0-16) + 2.018f * (cb-128);
-         TILE_PIXEL(p, j, i, 0) = r;
-         TILE_PIXEL(p, j, i, 1) = g;
-         TILE_PIXEL(p, j, i, 2) = b;
-         TILE_PIXEL(p, j, i, 3) = 255;
-      }
-   }
-}
-
-
-static void
-fake_get_tile_rgba(const ushort *src,
-                   unsigned w, unsigned h,
-                   uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         TILE_PIXEL(p, j, i, 0) =
-         TILE_PIXEL(p, j, i, 1) =
-         TILE_PIXEL(p, j, i, 2) =
-         TILE_PIXEL(p, j, i, 3) = (i ^ j) & 1 ? 255 : 0;
-      }
-   }
-}
-
-
-static void
-lp_tile_raw_to_rgba_soa(enum pipe_format format,
-                        void *src,
-                        uint w, uint h,
-                        uint8_t *p)
-{
-   switch (format) {
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
-      a8r8g8b8_get_tile_rgba((unsigned *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_X8R8G8B8_UNORM:
-      x8r8g8b8_get_tile_rgba((unsigned *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
-      b8g8r8a8_get_tile_rgba((unsigned *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_A1R5G5B5_UNORM:
-      a1r5g5b5_get_tile_rgba((ushort *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_A4R4G4B4_UNORM:
-      a4r4g4b4_get_tile_rgba((ushort *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_R5G6B5_UNORM:
-      r5g6b5_get_tile_rgba((ushort *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_L8_UNORM:
-      l8_get_tile_rgba((ubyte *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_A8_UNORM:
-      a8_get_tile_rgba((ubyte *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_I8_UNORM:
-      i8_get_tile_rgba((ubyte *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_A8L8_UNORM:
-      a8l8_get_tile_rgba((ushort *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_R16_SNORM:
-      r16_get_tile_rgba((short *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_R16G16B16A16_SNORM:
-      r16g16b16a16_get_tile_rgba((short *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_Z16_UNORM:
-      z16_get_tile_rgba((ushort *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_Z32_UNORM:
-      z32_get_tile_rgba((unsigned *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_S8Z24_UNORM:
-   case PIPE_FORMAT_X8Z24_UNORM:
-      s8z24_get_tile_rgba((unsigned *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_Z24S8_UNORM:
-   case PIPE_FORMAT_Z24X8_UNORM:
-      z24s8_get_tile_rgba((unsigned *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_Z32_FLOAT:
-      z32f_get_tile_rgba((float *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_YCBCR:
-      ycbcr_get_tile_rgba((ushort *) src, w, h, p, FALSE);
-      break;
-   case PIPE_FORMAT_YCBCR_REV:
-      ycbcr_get_tile_rgba((ushort *) src, w, h, p, TRUE);
-      break;
-   default:
-      debug_printf("%s: unsupported format %s\n", __FUNCTION__, pf_name(format));
-      fake_get_tile_rgba(src, w, h, p);
-   }
-}
-
-
-void
-lp_get_tile_rgba_soa(struct pipe_transfer *pt,
-                     uint x, uint y,
-                     uint8_t *p)
-{
-   uint w = TILE_SIZE, h = TILE_SIZE;
-   void *packed;
-
-   if (pipe_clip_tile(x, y, &w, &h, pt))
-      return;
-
-   packed = MALLOC(pf_get_nblocks(&pt->block, w, h) * pt->block.size);
-
-   if (!packed)
-      return;
-
-   if(pt->format == PIPE_FORMAT_YCBCR || pt->format == PIPE_FORMAT_YCBCR_REV)
-      assert((x & 1) == 0);
-
-   pipe_get_tile_raw(pt, x, y, w, h, packed, 0);
-
-   lp_tile_raw_to_rgba_soa(pt->format, packed, w, h, p);
-
-   FREE(packed);
-}
-
-
-void
-lp_put_tile_rgba_soa(struct pipe_transfer *pt,
-                     uint x, uint y,
-                     const uint8_t *p)
-{
-   uint w = TILE_SIZE, h = TILE_SIZE;
-   void *packed;
-
-   if (pipe_clip_tile(x, y, &w, &h, pt))
-      return;
-
-   packed = MALLOC(pf_get_nblocks(&pt->block, w, h) * pt->block.size);
-
-   if (!packed)
-      return;
-
-   switch (pt->format) {
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
-      a8r8g8b8_put_tile_rgba((unsigned *) packed, w, h, p);
-      break;
-   case PIPE_FORMAT_X8R8G8B8_UNORM:
-      x8r8g8b8_put_tile_rgba((unsigned *) packed, w, h, p);
-      break;
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
-      b8g8r8a8_put_tile_rgba((unsigned *) packed, w, h, p);
-      break;
-   case PIPE_FORMAT_A1R5G5B5_UNORM:
-      a1r5g5b5_put_tile_rgba((ushort *) packed, w, h, p);
-      break;
-   case PIPE_FORMAT_R5G6B5_UNORM:
-      r5g6b5_put_tile_rgba((ushort *) packed, w, h, p);
-      break;
-   case PIPE_FORMAT_R8G8B8A8_UNORM:
-      assert(0);
-      break;
-   case PIPE_FORMAT_A4R4G4B4_UNORM:
-      a4r4g4b4_put_tile_rgba((ushort *) packed, w, h, p);
-      break;
-   case PIPE_FORMAT_L8_UNORM:
-      l8_put_tile_rgba((ubyte *) packed, w, h, p);
-      break;
-   case PIPE_FORMAT_A8_UNORM:
-      a8_put_tile_rgba((ubyte *) packed, w, h, p);
-      break;
-   case PIPE_FORMAT_I8_UNORM:
-      i8_put_tile_rgba((ubyte *) packed, w, h, p);
-      break;
-   case PIPE_FORMAT_A8L8_UNORM:
-      a8l8_put_tile_rgba((ushort *) packed, w, h, p);
-      break;
-   case PIPE_FORMAT_R16_SNORM:
-      r16_put_tile_rgba((short *) packed, w, h, p);
-      break;
-   case PIPE_FORMAT_R16G16B16A16_SNORM:
-      r16g16b16a16_put_tile_rgba((short *) packed, w, h, p);
-      break;
-   case PIPE_FORMAT_Z16_UNORM:
-      /*z16_put_tile_rgba((ushort *) packed, w, h, p);*/
-      break;
-   case PIPE_FORMAT_Z32_UNORM:
-      /*z32_put_tile_rgba((unsigned *) packed, w, h, p);*/
-      break;
-   case PIPE_FORMAT_S8Z24_UNORM:
-   case PIPE_FORMAT_X8Z24_UNORM:
-      /*s8z24_put_tile_rgba((unsigned *) packed, w, h, p);*/
-      break;
-   case PIPE_FORMAT_Z24S8_UNORM:
-   case PIPE_FORMAT_Z24X8_UNORM:
-      /*z24s8_put_tile_rgba((unsigned *) packed, w, h, p);*/
-      break;
-   default:
-      debug_printf("%s: unsupported format %s\n", __FUNCTION__, pf_name(pt->format));
-   }
-
-   pipe_put_tile_raw(pt, x, y, w, h, packed, 0);
-
-   FREE(packed);
-}
-
-
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_soa.h b/src/gallium/drivers/llvmpipe/lp_tile_soa.h
index 3d8c703b73..040b01865d 100644
--- a/src/gallium/drivers/llvmpipe/lp_tile_soa.h
+++ b/src/gallium/drivers/llvmpipe/lp_tile_soa.h
@@ -64,14 +64,18 @@ tile_offset[TILE_VECTOR_HEIGHT][TILE_VECTOR_WIDTH];
 
 
 void
-lp_get_tile_rgba_soa(struct pipe_transfer *pt,
-                     uint x, uint y,
-                     uint8_t *p);
+lp_tile_read_4ub(enum pipe_format format,
+                 uint8_t *dst,
+                 const void *src, unsigned src_stride,
+                 unsigned x, unsigned y, unsigned w, unsigned h);
+
 
 void
-lp_put_tile_rgba_soa(struct pipe_transfer *pt,
-                     uint x, uint y,
-                     const uint8_t *p);
+lp_tile_write_4ub(enum pipe_format format,
+                  const uint8_t *src,
+                  void *dst, unsigned dst_stride,
+                  unsigned x, unsigned y, unsigned w, unsigned h);
+
 
 
 #ifdef __cplusplus
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_soa.py b/src/gallium/drivers/llvmpipe/lp_tile_soa.py
new file mode 100644
index 0000000000..004c5c979e
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_tile_soa.py
@@ -0,0 +1,278 @@
+#!/usr/bin/env python
+
+'''
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Pixel format accessor functions.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+'''
+
+
+import sys
+import os.path
+
+sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), '../../auxiliary/util'))
+
+from u_format_access import *
+
+
+def generate_format_read(format, dst_type, dst_native_type, dst_suffix):
+    '''Generate the function to read pixels from a particular format'''
+
+    name = short_name(format)
+
+    src_native_type = native_type(format)
+
+    print 'static void'
+    print 'lp_tile_%s_read_%s(%s *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)' % (name, dst_suffix, dst_native_type)
+    print '{'
+    print '   unsigned x, y;'
+    print '   const uint8_t *src_row = src + y0*src_stride;'
+    print '   for (y = 0; y < h; ++y) {'
+    print '      const %s *src_pixel = (const %s *)(src_row + x0*%u);' % (src_native_type, src_native_type, format.stride())
+    print '      for (x = 0; x < w; ++x) {'
+
+    names = ['']*4
+    if format.colorspace == 'rgb':
+        for i in range(4):
+            swizzle = format.out_swizzle[i]
+            if swizzle < 4:
+                names[swizzle] += 'rgba'[i]
+    elif format.colorspace == 'zs':
+        swizzle = format.out_swizzle[0]
+        if swizzle < 4:
+            names[swizzle] = 'z'
+        else:
+            assert False
+    else:
+        assert False
+
+    if format.layout == ARITH:
+        print '         %s pixel = *src_pixel++;' % src_native_type
+        shift = 0;
+        for i in range(4):
+            src_type = format.in_types[i]
+            width = src_type.size
+            if names[i]:
+                value = 'pixel'
+                mask = (1 << width) - 1
+                if shift:
+                    value = '(%s >> %u)' % (value, shift)
+                if shift + width < format.block_size():
+                    value = '(%s & 0x%x)' % (value, mask)
+                value = conversion_expr(src_type, dst_type, dst_native_type, value)
+                print '         %s %s = %s;' % (dst_native_type, names[i], value)
+            shift += width
+    elif format.layout == ARRAY:
+        for i in range(4):
+            src_type = format.in_types[i]
+            if names[i]:
+                value = '(*src_pixel++)'
+                value = conversion_expr(src_type, dst_type, dst_native_type, value)
+                print '         %s %s = %s;' % (dst_native_type, names[i], value)
+    else:
+        assert False
+
+    for i in range(4):
+        if format.colorspace == 'rgb':
+            swizzle = format.out_swizzle[i]
+            if swizzle < 4:
+                value = names[swizzle]
+            elif swizzle == SWIZZLE_0:
+                value = '0'
+            elif swizzle == SWIZZLE_1:
+                value = '1'
+            else:
+                assert False
+        elif format.colorspace == 'zs':
+            if i < 3:
+                value = 'z'
+            else:
+                value = '1'
+        else:
+            assert False
+        print '         TILE_PIXEL(dst, x, y, %u) = %s; /* %s */' % (i, value, 'rgba'[i])
+
+    print '      }'
+    print '      src_row += src_stride;'
+    print '   }'
+    print '}'
+    print
+    
+
+def generate_format_write(format, src_type, src_native_type, src_suffix):
+    '''Generate the function to write pixels to a particular format'''
+
+    name = short_name(format)
+
+    dst_native_type = native_type(format)
+
+    print 'static void'
+    print 'lp_tile_%s_write_%s(const %s *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)' % (name, src_suffix, src_native_type)
+    print '{'
+    print '   unsigned x, y;'
+    print '   uint8_t *dst_row = dst + y0*dst_stride;'
+    print '   for (y = 0; y < h; ++y) {'
+    print '      %s *dst_pixel = (%s *)(dst_row + x0*%u);' % (dst_native_type, dst_native_type, format.stride())
+    print '      for (x = 0; x < w; ++x) {'
+
+    inv_swizzle = [None]*4
+    if format.colorspace == 'rgb':
+        for i in range(4):
+            swizzle = format.out_swizzle[i]
+            if swizzle < 4:
+                inv_swizzle[swizzle] = i
+    elif format.colorspace == 'zs':
+        swizzle = format.out_swizzle[0]
+        if swizzle < 4:
+            inv_swizzle[swizzle] = 0
+    else:
+        assert False
+
+    if format.layout == ARITH:
+        print '         %s pixel = 0;' % dst_native_type
+        shift = 0;
+        for i in range(4):
+            dst_type = format.in_types[i]
+            width = dst_type.size
+            if inv_swizzle[i] is not None:
+                value = 'TILE_PIXEL(src, x, y, %u)' % inv_swizzle[i]
+                value = conversion_expr(src_type, dst_type, dst_native_type, value)
+                if shift:
+                    value = '(%s << %u)' % (value, shift)
+                print '         pixel |= %s;' % value
+            shift += width
+        print '         *dst_pixel++ = pixel;'
+    elif format.layout == ARRAY:
+        for i in range(4):
+            dst_type = format.in_types[i]
+            if inv_swizzle[i] is not None:
+                value = 'TILE_PIXEL(src, x, y, %u)' % inv_swizzle[i]
+                value = conversion_expr(src_type, dst_type, dst_native_type, value)
+                print '         *dst_pixel++ = %s;' % value
+    else:
+        assert False
+
+    print '      }'
+    print '      dst_row += dst_stride;'
+    print '   }'
+    print '}'
+    print
+    
+
+def generate_read(formats, dst_type, dst_native_type, dst_suffix):
+    '''Generate the dispatch function to read pixels from any format'''
+
+    for format in formats:
+        if is_format_supported(format):
+            generate_format_read(format, dst_type, dst_native_type, dst_suffix)
+
+    print 'void'
+    print 'lp_tile_read_%s(enum pipe_format format, %s *dst, const void *src, unsigned src_stride, unsigned x, unsigned y, unsigned w, unsigned h)' % (dst_suffix, dst_native_type)
+    print '{'
+    print '   void (*func)(%s *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h);' % dst_native_type
+    print '   switch(format) {'
+    for format in formats:
+        if is_format_supported(format):
+            print '   case %s:' % format.name
+            print '      func = &lp_tile_%s_read_%s;' % (short_name(format), dst_suffix)
+            print '      break;'
+    print '   default:'
+    print '      debug_printf("unsupported format\\n");'
+    print '      return;'
+    print '   }'
+    print '   func(dst, (const uint8_t *)src, src_stride, x, y, w, h);'
+    print '}'
+    print
+
+
+def generate_write(formats, src_type, src_native_type, src_suffix):
+    '''Generate the dispatch function to write pixels to any format'''
+
+    for format in formats:
+        if is_format_supported(format):
+            generate_format_write(format, src_type, src_native_type, src_suffix)
+
+    print 'void'
+    print 'lp_tile_write_%s(enum pipe_format format, const %s *src, void *dst, unsigned dst_stride, unsigned x, unsigned y, unsigned w, unsigned h)' % (src_suffix, src_native_type)
+    
+    print '{'
+    print '   void (*func)(const %s *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h);' % src_native_type
+    print '   switch(format) {'
+    for format in formats:
+        if is_format_supported(format):
+            print '   case %s:' % format.name
+            print '      func = &lp_tile_%s_write_%s;' % (short_name(format), src_suffix)
+            print '      break;'
+    print '   default:'
+    print '      debug_printf("unsupported format\\n");'
+    print '      return;'
+    print '   }'
+    print '   func(src, (uint8_t *)dst, dst_stride, x, y, w, h);'
+    print '}'
+    print
+
+
+def main():
+    formats = []
+    for arg in sys.argv[1:]:
+        formats.extend(parse(arg))
+
+    print '/* This file is autogenerated by lp_tile_soa.py from u_format.csv. Do not edit directly. */'
+    print
+    # This will print the copyright message on the top of this file
+    print __doc__.strip()
+    print
+    print '#include "pipe/p_compiler.h"'
+    print '#include "util/u_format.h"'
+    print '#include "util/u_math.h"'
+    print '#include "lp_tile_soa.h"'
+    print
+    print 'const unsigned char'
+    print 'tile_offset[TILE_VECTOR_HEIGHT][TILE_VECTOR_WIDTH] = {'
+    print '   {  0,  1,  4,  5,  8,  9, 12, 13},'
+    print '   {  2,  3,  6,  7, 10, 11, 14, 15}'
+    print '};'
+    print
+
+    generate_clamp()
+
+    type = Type(UNSIGNED, True, 8)
+    native_type = 'uint8_t'
+    suffix = '4ub'
+
+    generate_read(formats, type, native_type, suffix)
+    generate_write(formats, type, native_type, suffix)
+
+
+if __name__ == '__main__':
+    main()