96 files changed, 5861 insertions, 14233 deletions
diff --git a/src/gallium/drivers/llvmpipe/Makefile b/src/gallium/drivers/llvmpipe/Makefile
index 7c6e46006b..e880042b71 100644
--- a/src/gallium/drivers/llvmpipe/Makefile
+++ b/src/gallium/drivers/llvmpipe/Makefile
@@ -3,42 +3,28 @@ include $(TOP)/configs/current
 
 LIBNAME = llvmpipe
 
-CFLAGS += -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS
+DEFINES += -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS
 
 C_SOURCES = \
-	lp_bld_alpha.c \
-	lp_bld_arit.c \
-	lp_bld_blend_aos.c \
-	lp_bld_blend_logicop.c \
-	lp_bld_blend_soa.c \
-	lp_bld_const.c \
-	lp_bld_conv.c \
-	lp_bld_debug.c \
-	lp_bld_depth.c \
-	lp_bld_flow.c \
-	lp_bld_format_aos.c \
-	lp_bld_format_query.c \
-	lp_bld_format_soa.c \
-	lp_bld_interp.c \
-	lp_bld_intr.c \
-	lp_bld_logic.c \
-	lp_bld_pack.c \
-	lp_bld_sample.c \
-	lp_bld_sample_soa.c \
-	lp_bld_swizzle.c \
-	lp_bld_struct.c \
-	lp_bld_tgsi_soa.c \
-	lp_bld_type.c \
 	lp_buffer.c \
 	lp_clear.c \
 	lp_context.c \
 	lp_draw_arrays.c \
+	lp_fence.c \
 	lp_flush.c \
 	lp_jit.c \
-	lp_prim_vbuf.c \
-	lp_setup.c \
+	lp_perf.c \
 	lp_query.c \
+	lp_rast.c \
+	lp_rast_tri.c \
+	lp_scene.c \
+	lp_scene_queue.c \
 	lp_screen.c \
+	lp_setup.c \
+	lp_setup_line.c \
+	lp_setup_point.c \
+	lp_setup_tri.c \
+	lp_setup_vbuf.c \
 	lp_state_blend.c \
 	lp_state_clip.c \
 	lp_state_derived.c \
@@ -49,16 +35,32 @@ C_SOURCES = \
 	lp_state_vertex.c \
 	lp_state_vs.c \
 	lp_surface.c \
-	lp_tex_cache.c \
 	lp_tex_sample_llvm.c \
 	lp_texture.c \
-	lp_tile_cache.c \
+	lp_tile_surface.c \
 	lp_tile_soa.c
 
 CPP_SOURCES = \
-	lp_bld_misc.cpp
+
 
 include ../../Makefile.template
 
 lp_tile_soa.c: lp_tile_soa.py ../../auxiliary/util/u_format_parse.py ../../auxiliary/util/u_format_access.py ../../auxiliary/util/u_format.csv
 	python lp_tile_soa.py ../../auxiliary/util/u_format.csv > $@
+
+
+# Suffix rule: compile a .c file to a .s file so the generated assembly can be inspected
+.c.s:
+	$(CC) -S $(INCLUDES) $(DEFINES) $(CFLAGS) $(LIBRARY_DEFINES) $<
+
+
+testprogs := lp_test_format	\
+	     lp_test_blend	\
+	     lp_test_conv
+
+LIBS += $(GL_LIB_DEPS) -L. -lllvmpipe -L../../auxiliary/ -lgallium
+
+$(testprogs): lp_test_% : lp_test_%.o lp_test_main.o libllvmpipe.a
+	$(LD) $(filter %.o,$^) -o $@ -Wl,--start-group  $(LIBS) -Wl,--end-group
+
+default: $(testprogs)
diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript
index 6bb545a501..a39283e5e8 100644
--- a/src/gallium/drivers/llvmpipe/SConscript
+++ b/src/gallium/drivers/llvmpipe/SConscript
@@ -21,40 +21,25 @@ env.CodeGenerate(
 llvmpipe = env.ConvenienceLibrary(
 	target = 'llvmpipe',
 	source = [
-		'lp_bld_alpha.c',
-		'lp_bld_arit.c',
-		'lp_bld_blend_aos.c',
-		'lp_bld_blend_logicop.c',
-		'lp_bld_blend_soa.c',
-		'lp_bld_const.c',
-		'lp_bld_conv.c',
-		'lp_bld_debug.c',
-		'lp_bld_depth.c',
-		'lp_bld_flow.c',
-		'lp_bld_format_aos.c',
-        'lp_bld_format_query.c',
-		'lp_bld_format_soa.c',
-		'lp_bld_interp.c',
-		'lp_bld_intr.c',
-		'lp_bld_misc.cpp',
-        'lp_bld_pack.c',
-        'lp_bld_sample.c',
-		'lp_bld_sample_soa.c',
-		'lp_bld_struct.c',
-		'lp_bld_logic.c',
-		'lp_bld_swizzle.c',
-		'lp_bld_tgsi_soa.c',		
-		'lp_bld_type.c',
 		'lp_buffer.c',
 		'lp_clear.c',
 		'lp_context.c',
 		'lp_draw_arrays.c',
+		'lp_fence.c',
 		'lp_flush.c',
 		'lp_jit.c',
-		'lp_prim_vbuf.c',
-		'lp_setup.c',
+		'lp_perf.c',
 		'lp_query.c',
+		'lp_rast.c',
+		'lp_rast_tri.c',
+		'lp_scene.c',
+		'lp_scene_queue.c',
 		'lp_screen.c',
+		'lp_setup.c',
+		'lp_setup_line.c',
+		'lp_setup_point.c',
+		'lp_setup_tri.c',
+		'lp_setup_vbuf.c',
 		'lp_state_blend.c',
 		'lp_state_clip.c',
 		'lp_state_derived.c',
@@ -65,29 +50,28 @@ llvmpipe = env.ConvenienceLibrary(
 		'lp_state_vertex.c',
 		'lp_state_vs.c',
 		'lp_surface.c',
-		'lp_tex_cache.c',
 		'lp_tex_sample_llvm.c',
 		'lp_texture.c',
-		'lp_tile_cache.c',
 		'lp_tile_soa.c',
 	])
 
 
-env = env.Clone()
+if env['platform'] != 'embedded':
+    env = env.Clone()
 
-env.Prepend(LIBS = [llvmpipe] + gallium)
+    env.Prepend(LIBS = [llvmpipe] + gallium)
 
-tests = [
-    'format',
-    'blend',
-    'conv',
-]
+    tests = [
+        'format',
+        'blend',
+        'conv',
+    ]
 
-for test in tests:
-    target = env.Program(
-        target = 'lp_test_' + test,
-        source = ['lp_test_' + test + '.c', 'lp_test_main.c'],
-    )
-    env.InstallProgram(target)
+    for test in tests:
+        target = env.Program(
+            target = 'lp_test_' + test,
+            source = ['lp_test_' + test + '.c', 'lp_test_main.c'],
+        )
+        env.InstallProgram(target)
 
-Export('llvmpipe')
+Export('llvmpipe')  # top level on purpose: only the test programs are skipped on embedded
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_alpha.c b/src/gallium/drivers/llvmpipe/lp_bld_alpha.c
deleted file mode 100644
index 2b4bc5c819..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_alpha.c
+++ /dev/null
@@ -1,64 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * Alpha testing to LLVM IR translation.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-#include "pipe/p_state.h"
-
-#include "lp_bld_type.h"
-#include "lp_bld_const.h"
-#include "lp_bld_arit.h"
-#include "lp_bld_logic.h"
-#include "lp_bld_flow.h"
-#include "lp_bld_debug.h"
-#include "lp_bld_alpha.h"
-
-
-void
-lp_build_alpha_test(LLVMBuilderRef builder,
-                    const struct pipe_alpha_state *state,
-                    struct lp_type type,
-                    struct lp_build_mask_context *mask,
-                    LLVMValueRef alpha,
-                    LLVMValueRef ref)
-{
-   struct lp_build_context bld;
-
-   lp_build_context_init(&bld, builder, type);
-
-   if(state->enabled) {
-      LLVMValueRef test = lp_build_cmp(&bld, state->func, alpha, ref);
-
-      lp_build_name(test, "alpha_mask");
-
-      lp_build_mask_update(mask, test);
-   }
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_arit.c b/src/gallium/drivers/llvmpipe/lp_bld_arit.c
deleted file mode 100644
index eea6b5d6a5..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_arit.c
+++ /dev/null
@@ -1,1318 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-/**
- * @file
- * Helper
- *
- * LLVM IR doesn't support all basic arithmetic operations we care about (most
- * notably min/max and saturated operations), and it is often necessary to
- * resort machine-specific intrinsics directly. The functions here hide all
- * these implementation details from the other modules.
- *
- * We also do simple expressions simplification here. Reasons are:
- * - it is very easy given we have all necessary information readily available
- * - LLVM optimization passes fail to simplify several vector expressions
- * - We often know value constraints which the optimization passes have no way
- *   of knowing, such as when source arguments are known to be in [0, 1] range.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#include "util/u_memory.h"
-#include "util/u_debug.h"
-#include "util/u_math.h"
-#include "util/u_string.h"
-#include "util/u_cpu_detect.h"
-
-#include "lp_bld_type.h"
-#include "lp_bld_const.h"
-#include "lp_bld_intr.h"
-#include "lp_bld_logic.h"
-#include "lp_bld_pack.h"
-#include "lp_bld_debug.h"
-#include "lp_bld_arit.h"
-
-
-/**
- * Generate min(a, b)
- * No checks for special case values of a or b = 1 or 0 are done.
- */
-static LLVMValueRef
-lp_build_min_simple(struct lp_build_context *bld,
-                    LLVMValueRef a,
-                    LLVMValueRef b)
-{
-   const struct lp_type type = bld->type;
-   const char *intrinsic = NULL;
-   LLVMValueRef cond;
-
-   /* TODO: optimize the constant case */
-
-   if(type.width * type.length == 128) {
-      if(type.floating) {
-         if(type.width == 32 && util_cpu_caps.has_sse)
-            intrinsic = "llvm.x86.sse.min.ps";
-         if(type.width == 64 && util_cpu_caps.has_sse2)
-            intrinsic = "llvm.x86.sse2.min.pd";
-      }
-      else {
-         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
-            intrinsic = "llvm.x86.sse2.pminu.b";
-         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
-            intrinsic = "llvm.x86.sse41.pminsb";
-         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
-            intrinsic = "llvm.x86.sse41.pminuw";
-         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
-            intrinsic = "llvm.x86.sse2.pmins.w";
-         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
-            intrinsic = "llvm.x86.sse41.pminud";
-         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
-            intrinsic = "llvm.x86.sse41.pminsd";
-      }
-   }
-
-   if(intrinsic)
-      return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
-
-   cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
-   return lp_build_select(bld, cond, a, b);
-}
-
-
-/**
- * Generate max(a, b)
- * No checks for special case values of a or b = 1 or 0 are done.
- */
-static LLVMValueRef
-lp_build_max_simple(struct lp_build_context *bld,
-                    LLVMValueRef a,
-                    LLVMValueRef b)
-{
-   const struct lp_type type = bld->type;
-   const char *intrinsic = NULL;
-   LLVMValueRef cond;
-
-   /* TODO: optimize the constant case */
-
-   if(type.width * type.length == 128) {
-      if(type.floating) {
-         if(type.width == 32 && util_cpu_caps.has_sse)
-            intrinsic = "llvm.x86.sse.max.ps";
-         if(type.width == 64 && util_cpu_caps.has_sse2)
-            intrinsic = "llvm.x86.sse2.max.pd";
-      }
-      else {
-         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
-            intrinsic = "llvm.x86.sse2.pmaxu.b";
-         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
-            intrinsic = "llvm.x86.sse41.pmaxsb";
-         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
-            intrinsic = "llvm.x86.sse41.pmaxuw";
-         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
-            intrinsic = "llvm.x86.sse2.pmaxs.w";
-         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
-            intrinsic = "llvm.x86.sse41.pmaxud";
-         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
-            intrinsic = "llvm.x86.sse41.pmaxsd";
-      }
-   }
-
-   if(intrinsic)
-      return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
-
-   cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
-   return lp_build_select(bld, cond, a, b);
-}
-
-
-/**
- * Generate 1 - a, or ~a depending on bld->type.
- */
-LLVMValueRef
-lp_build_comp(struct lp_build_context *bld,
-              LLVMValueRef a)
-{
-   const struct lp_type type = bld->type;
-
-   if(a == bld->one)
-      return bld->zero;
-   if(a == bld->zero)
-      return bld->one;
-
-   if(type.norm && !type.floating && !type.fixed && !type.sign) {
-      if(LLVMIsConstant(a))
-         return LLVMConstNot(a);
-      else
-         return LLVMBuildNot(bld->builder, a, "");
-   }
-
-   if(LLVMIsConstant(a))
-      return LLVMConstSub(bld->one, a);
-   else
-      return LLVMBuildSub(bld->builder, bld->one, a, "");
-}
-
-
-/**
- * Generate a + b
- */
-LLVMValueRef
-lp_build_add(struct lp_build_context *bld,
-             LLVMValueRef a,
-             LLVMValueRef b)
-{
-   const struct lp_type type = bld->type;
-   LLVMValueRef res;
-
-   if(a == bld->zero)
-      return b;
-   if(b == bld->zero)
-      return a;
-   if(a == bld->undef || b == bld->undef)
-      return bld->undef;
-
-   if(bld->type.norm) {
-      const char *intrinsic = NULL;
-
-      if(a == bld->one || b == bld->one)
-        return bld->one;
-
-      if(util_cpu_caps.has_sse2 &&
-         type.width * type.length == 128 &&
-         !type.floating && !type.fixed) {
-         if(type.width == 8)
-            intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
-         if(type.width == 16)
-            intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
-      }
-   
-      if(intrinsic)
-         return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
-   }
-
-   if(LLVMIsConstant(a) && LLVMIsConstant(b))
-      res = LLVMConstAdd(a, b);
-   else
-      res = LLVMBuildAdd(bld->builder, a, b, "");
-
-   /* clamp to ceiling of 1.0 */
-   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
-      res = lp_build_min_simple(bld, res, bld->one);
-
-   /* XXX clamp to floor of -1 or 0??? */
-
-   return res;
-}
-
-
-/**
- * Generate a - b
- */
-LLVMValueRef
-lp_build_sub(struct lp_build_context *bld,
-             LLVMValueRef a,
-             LLVMValueRef b)
-{
-   const struct lp_type type = bld->type;
-   LLVMValueRef res;
-
-   if(b == bld->zero)
-      return a;
-   if(a == bld->undef || b == bld->undef)
-      return bld->undef;
-   if(a == b)
-      return bld->zero;
-
-   if(bld->type.norm) {
-      const char *intrinsic = NULL;
-
-      if(b == bld->one)
-        return bld->zero;
-
-      if(util_cpu_caps.has_sse2 &&
-         type.width * type.length == 128 &&
-         !type.floating && !type.fixed) {
-         if(type.width == 8)
-            intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
-         if(type.width == 16)
-            intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
-      }
-   
-      if(intrinsic)
-         return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
-   }
-
-   if(LLVMIsConstant(a) && LLVMIsConstant(b))
-      res = LLVMConstSub(a, b);
-   else
-      res = LLVMBuildSub(bld->builder, a, b, "");
-
-   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
-      res = lp_build_max_simple(bld, res, bld->zero);
-
-   return res;
-}
-
-
-/**
- * Normalized 8bit multiplication.
- *
- * - alpha plus one
- *
- *     makes the following approximation to the division (Sree)
- *    
- *       a*b/255 ~= (a*(b + 1)) >> 256
- *    
- *     which is the fastest method that satisfies the following OpenGL criteria
- *    
- *       0*0 = 0 and 255*255 = 255
- *
- * - geometric series
- *
- *     takes the geometric series approximation to the division
- *
- *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
- *
- *     in this case just the first two terms to fit in 16bit arithmetic
- *
- *       t/255 ~= (t + (t >> 8)) >> 8
- *
- *     note that just by itself it doesn't satisfies the OpenGL criteria, as
- *     255*255 = 254, so the special case b = 255 must be accounted or roundoff
- *     must be used
- *
- * - geometric series plus rounding
- *
- *     when using a geometric series division instead of truncating the result
- *     use roundoff in the approximation (Jim Blinn)
- *
- *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
- *
- *     achieving the exact results
- *
- * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995, 
- *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
- * @sa Michael Herf, The "double blend trick", May 2000, 
- *     http://www.stereopsis.com/doubleblend.html
- */
-static LLVMValueRef
-lp_build_mul_u8n(LLVMBuilderRef builder,
-                 struct lp_type i16_type,
-                 LLVMValueRef a, LLVMValueRef b)
-{
-   LLVMValueRef c8;
-   LLVMValueRef ab;
-
-   c8 = lp_build_int_const_scalar(i16_type, 8);
-   
-#if 0
-   
-   /* a*b/255 ~= (a*(b + 1)) >> 256 */
-   b = LLVMBuildAdd(builder, b, lp_build_int_const_scalar(i16_type, 1), "");
-   ab = LLVMBuildMul(builder, a, b, "");
-
-#else
-   
-   /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
-   ab = LLVMBuildMul(builder, a, b, "");
-   ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
-   ab = LLVMBuildAdd(builder, ab, lp_build_int_const_scalar(i16_type, 0x80), "");
-
-#endif
-   
-   ab = LLVMBuildLShr(builder, ab, c8, "");
-
-   return ab;
-}
-
-
-/**
- * Generate a * b
- */
-LLVMValueRef
-lp_build_mul(struct lp_build_context *bld,
-             LLVMValueRef a,
-             LLVMValueRef b)
-{
-   const struct lp_type type = bld->type;
-   LLVMValueRef shift;
-   LLVMValueRef res;
-
-   if(a == bld->zero)
-      return bld->zero;
-   if(a == bld->one)
-      return b;
-   if(b == bld->zero)
-      return bld->zero;
-   if(b == bld->one)
-      return a;
-   if(a == bld->undef || b == bld->undef)
-      return bld->undef;
-
-   if(!type.floating && !type.fixed && type.norm) {
-      if(type.width == 8) {
-         struct lp_type i16_type = lp_wider_type(type);
-         LLVMValueRef al, ah, bl, bh, abl, abh, ab;
-
-         lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
-         lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);
-
-         /* PMULLW, PSRLW, PADDW */
-         abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
-         abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);
-
-         ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);
-         
-         return ab;
-      }
-
-      /* FIXME */
-      assert(0);
-   }
-
-   if(type.fixed)
-      shift = lp_build_int_const_scalar(type, type.width/2);
-   else
-      shift = NULL;
-
-   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
-      res =  LLVMConstMul(a, b);
-      if(shift) {
-         if(type.sign)
-            res = LLVMConstAShr(res, shift);
-         else
-            res = LLVMConstLShr(res, shift);
-      }
-   }
-   else {
-      res = LLVMBuildMul(bld->builder, a, b, "");
-      if(shift) {
-         if(type.sign)
-            res = LLVMBuildAShr(bld->builder, res, shift, "");
-         else
-            res = LLVMBuildLShr(bld->builder, res, shift, "");
-      }
-   }
-
-   return res;
-}
-
-
-/**
- * Small vector x scale multiplication optimization.
- */
-LLVMValueRef
-lp_build_mul_imm(struct lp_build_context *bld,
-                 LLVMValueRef a,
-                 int b)
-{
-   LLVMValueRef factor;
-
-   if(b == 0)
-      return bld->zero;
-
-   if(b == 1)
-      return a;
-
-   if(b == -1)
-      return LLVMBuildNeg(bld->builder, a, "");
-
-   if(b == 2 && bld->type.floating)
-      return lp_build_add(bld, a, a);
-
-   if(util_is_pot(b)) {
-      unsigned shift = ffs(b) - 1;
-
-      if(bld->type.floating) {
-#if 0
-         /*
-          * Power of two multiplication by directly manipulating the mantissa.
-          *
-          * XXX: This might not be always faster, it will introduce a small error
-          * for multiplication by zero, and it will produce wrong results
-          * for Inf and NaN.
-          */
-         unsigned mantissa = lp_mantissa(bld->type);
-         factor = lp_build_int_const_scalar(bld->type, (unsigned long long)shift << mantissa);
-         a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
-         a = LLVMBuildAdd(bld->builder, a, factor, "");
-         a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
-         return a;
-#endif
-      }
-      else {
-         factor = lp_build_const_scalar(bld->type, shift);
-         return LLVMBuildShl(bld->builder, a, factor, "");
-      }
-   }
-
-   factor = lp_build_const_scalar(bld->type, (double)b);
-   return lp_build_mul(bld, a, factor);
-}
-
-
-/**
- * Generate a / b
- */
-LLVMValueRef
-lp_build_div(struct lp_build_context *bld,
-             LLVMValueRef a,
-             LLVMValueRef b)
-{
-   const struct lp_type type = bld->type;
-
-   if(a == bld->zero)
-      return bld->zero;
-   if(a == bld->one)
-      return lp_build_rcp(bld, b);
-   if(b == bld->zero)
-      return bld->undef;
-   if(b == bld->one)
-      return a;
-   if(a == bld->undef || b == bld->undef)
-      return bld->undef;
-
-   if(LLVMIsConstant(a) && LLVMIsConstant(b))
-      return LLVMConstFDiv(a, b);
-
-   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
-      return lp_build_mul(bld, a, lp_build_rcp(bld, b));
-
-   return LLVMBuildFDiv(bld->builder, a, b, "");
-}
-
-
-/**
- * Linear interpolation.
- *
- * This also works for integer values with a few caveats.
- *
- * @sa http://www.stereopsis.com/doubleblend.html
- */
-LLVMValueRef
-lp_build_lerp(struct lp_build_context *bld,
-              LLVMValueRef x,
-              LLVMValueRef v0,
-              LLVMValueRef v1)
-{
-   LLVMValueRef delta;
-   LLVMValueRef res;
-
-   delta = lp_build_sub(bld, v1, v0);
-
-   res = lp_build_mul(bld, x, delta);
-
-   res = lp_build_add(bld, v0, res);
-
-   if(bld->type.fixed)
-      /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
-       * but it will be wrong for other uses. Basically we need a more
-       * powerful lp_type, capable of further distinguishing the values
-       * interpretation from the value storage. */
-      res = LLVMBuildAnd(bld->builder, res, lp_build_int_const_scalar(bld->type, (1 << bld->type.width/2) - 1), "");
-
-   return res;
-}
-
-
-LLVMValueRef
-lp_build_lerp_2d(struct lp_build_context *bld,
-                 LLVMValueRef x,
-                 LLVMValueRef y,
-                 LLVMValueRef v00,
-                 LLVMValueRef v01,
-                 LLVMValueRef v10,
-                 LLVMValueRef v11)
-{
-   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
-   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
-   return lp_build_lerp(bld, y, v0, v1);
-}
-
-
-/**
- * Generate min(a, b)
- * Do checks for special cases.
- */
-LLVMValueRef
-lp_build_min(struct lp_build_context *bld,
-             LLVMValueRef a,
-             LLVMValueRef b)
-{
-   if(a == bld->undef || b == bld->undef)
-      return bld->undef;
-
-   if(a == b)
-      return a;
-
-   if(bld->type.norm) {
-      if(a == bld->zero || b == bld->zero)
-         return bld->zero;
-      if(a == bld->one)
-         return b;
-      if(b == bld->one)
-         return a;
-   }
-
-   return lp_build_min_simple(bld, a, b);
-}
-
-
-/**
- * Generate max(a, b)
- * Do checks for special cases.
- */
-LLVMValueRef
-lp_build_max(struct lp_build_context *bld,
-             LLVMValueRef a,
-             LLVMValueRef b)
-{
-   if(a == bld->undef || b == bld->undef)
-      return bld->undef;
-
-   if(a == b)
-      return a;
-
-   if(bld->type.norm) {
-      if(a == bld->one || b == bld->one)
-         return bld->one;
-      if(a == bld->zero)
-         return b;
-      if(b == bld->zero)
-         return a;
-   }
-
-   return lp_build_max_simple(bld, a, b);
-}
-
-
-/**
- * Generate abs(a)
- */
-LLVMValueRef
-lp_build_abs(struct lp_build_context *bld,
-             LLVMValueRef a)
-{
-   const struct lp_type type = bld->type;
-   LLVMTypeRef vec_type = lp_build_vec_type(type);
-
-   if(!type.sign)
-      return a;
-
-   if(type.floating) {
-      /* Mask out the sign bit */
-      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
-      unsigned long absMask = ~(1 << (type.width - 1));
-      LLVMValueRef mask = lp_build_int_const_scalar(type, ((unsigned long long) absMask));
-      a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
-      a = LLVMBuildAnd(bld->builder, a, mask, "");
-      a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
-      return a;
-   }
-
-   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
-      switch(type.width) {
-      case 8:
-         return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
-      case 16:
-         return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
-      case 32:
-         return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
-      }
-   }
-
-   return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
-}
-
-
-LLVMValueRef
-lp_build_sgn(struct lp_build_context *bld,
-             LLVMValueRef a)
-{
-   const struct lp_type type = bld->type;
-   LLVMTypeRef vec_type = lp_build_vec_type(type);
-   LLVMValueRef cond;
-   LLVMValueRef res;
-
-   /* Handle non-zero case */
-   if(!type.sign) {
-      /* if not zero then sign must be positive */
-      res = bld->one;
-   }
-   else if(type.floating) {
-      /* Take the sign bit and add it to 1 constant */
-      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
-      LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
-      LLVMValueRef sign;
-      LLVMValueRef one;
-      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
-      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
-      one = LLVMConstBitCast(bld->one, int_vec_type);
-      res = LLVMBuildOr(bld->builder, sign, one, "");
-      res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
-   }
-   else
-   {
-      LLVMValueRef minus_one = lp_build_const_scalar(type, -1.0);
-      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
-      res = lp_build_select(bld, cond, bld->one, minus_one);
-   }
-
-   /* Handle zero */
-   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
-   res = lp_build_select(bld, cond, bld->zero, bld->one);
-
-   return res;
-}
-
-
-enum lp_build_round_sse41_mode
-{
-   LP_BUILD_ROUND_SSE41_NEAREST = 0,
-   LP_BUILD_ROUND_SSE41_FLOOR = 1,
-   LP_BUILD_ROUND_SSE41_CEIL = 2,
-   LP_BUILD_ROUND_SSE41_TRUNCATE = 3
-};
-
-
-static INLINE LLVMValueRef
-lp_build_round_sse41(struct lp_build_context *bld,
-                     LLVMValueRef a,
-                     enum lp_build_round_sse41_mode mode)
-{
-   const struct lp_type type = bld->type;
-   LLVMTypeRef vec_type = lp_build_vec_type(type);
-   const char *intrinsic;
-
-   assert(type.floating);
-   assert(type.width*type.length == 128);
-   assert(lp_check_value(type, a));
-   assert(util_cpu_caps.has_sse4_1);
-
-   switch(type.width) {
-   case 32:
-      intrinsic = "llvm.x86.sse41.round.ps";
-      break;
-   case 64:
-      intrinsic = "llvm.x86.sse41.round.pd";
-      break;
-   default:
-      assert(0);
-      return bld->undef;
-   }
-
-   return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
-                                    LLVMConstInt(LLVMInt32Type(), mode, 0));
-}
-
-
-LLVMValueRef
-lp_build_trunc(struct lp_build_context *bld,
-               LLVMValueRef a)
-{
-   const struct lp_type type = bld->type;
-
-   assert(type.floating);
-   assert(lp_check_value(type, a));
-
-   if(util_cpu_caps.has_sse4_1)
-      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
-   else {
-      LLVMTypeRef vec_type = lp_build_vec_type(type);
-      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
-      LLVMValueRef res;
-      res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
-      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
-      return res;
-   }
-}
-
-
-LLVMValueRef
-lp_build_round(struct lp_build_context *bld,
-               LLVMValueRef a)
-{
-   const struct lp_type type = bld->type;
-
-   assert(type.floating);
-   assert(lp_check_value(type, a));
-
-   if(util_cpu_caps.has_sse4_1)
-      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
-   else {
-      LLVMTypeRef vec_type = lp_build_vec_type(type);
-      LLVMValueRef res;
-      res = lp_build_iround(bld, a);
-      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
-      return res;
-   }
-}
-
-
-LLVMValueRef
-lp_build_floor(struct lp_build_context *bld,
-               LLVMValueRef a)
-{
-   const struct lp_type type = bld->type;
-
-   assert(type.floating);
-
-   if(util_cpu_caps.has_sse4_1)
-      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
-   else {
-      LLVMTypeRef vec_type = lp_build_vec_type(type);
-      LLVMValueRef res;
-      res = lp_build_ifloor(bld, a);
-      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
-      return res;
-   }
-}
-
-
-LLVMValueRef
-lp_build_ceil(struct lp_build_context *bld,
-              LLVMValueRef a)
-{
-   const struct lp_type type = bld->type;
-
-   assert(type.floating);
-   assert(lp_check_value(type, a));
-
-   if(util_cpu_caps.has_sse4_1)
-      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
-   else {
-      LLVMTypeRef vec_type = lp_build_vec_type(type);
-      LLVMValueRef res;
-      res = lp_build_iceil(bld, a);
-      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
-      return res;
-   }
-}
-
-
-/**
- * Convert to integer, through whichever rounding method that's fastest,
- * typically truncating to zero.
- */
-LLVMValueRef
-lp_build_itrunc(struct lp_build_context *bld,
-                LLVMValueRef a)
-{
-   const struct lp_type type = bld->type;
-   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
-
-   assert(type.floating);
-   assert(lp_check_value(type, a));
-
-   return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
-}
-
-
-LLVMValueRef
-lp_build_iround(struct lp_build_context *bld,
-                LLVMValueRef a)
-{
-   const struct lp_type type = bld->type;
-   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
-   LLVMValueRef res;
-
-   assert(type.floating);
-   assert(lp_check_value(type, a));
-
-   if(util_cpu_caps.has_sse4_1) {
-      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
-   }
-   else {
-      LLVMTypeRef vec_type = lp_build_vec_type(type);
-      LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
-      LLVMValueRef sign;
-      LLVMValueRef half;
-
-      /* get sign bit */
-      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
-      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
-
-      /* sign * 0.5 */
-      half = lp_build_const_scalar(type, 0.5);
-      half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
-      half = LLVMBuildOr(bld->builder, sign, half, "");
-      half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
-
-      res = LLVMBuildAdd(bld->builder, a, half, "");
-   }
-
-   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
-
-   return res;
-}
-
-
-LLVMValueRef
-lp_build_ifloor(struct lp_build_context *bld,
-                LLVMValueRef a)
-{
-   const struct lp_type type = bld->type;
-   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
-   LLVMValueRef res;
-
-   assert(type.floating);
-   assert(lp_check_value(type, a));
-
-   if(util_cpu_caps.has_sse4_1) {
-      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
-   }
-   else {
-      /* Take the sign bit and add it to 1 constant */
-      LLVMTypeRef vec_type = lp_build_vec_type(type);
-      unsigned mantissa = lp_mantissa(type);
-      LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
-      LLVMValueRef sign;
-      LLVMValueRef offset;
-
-      /* sign = a < 0 ? ~0 : 0 */
-      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
-      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
-      sign = LLVMBuildAShr(bld->builder, sign, lp_build_int_const_scalar(type, type.width - 1), "");
-
-      /* offset = -0.99999(9)f */
-      offset = lp_build_const_scalar(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa));
-      offset = LLVMConstBitCast(offset, int_vec_type);
-
-      /* offset = a < 0 ? -0.99999(9)f : 0.0f */
-      offset = LLVMBuildAnd(bld->builder, offset, sign, "");
-      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "");
-
-      res = LLVMBuildAdd(bld->builder, a, offset, "");
-   }
-
-   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
-
-   return res;
-}
-
-
-LLVMValueRef
-lp_build_iceil(struct lp_build_context *bld,
-               LLVMValueRef a)
-{
-   const struct lp_type type = bld->type;
-   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
-   LLVMValueRef res;
-
-   assert(type.floating);
-   assert(lp_check_value(type, a));
-
-   if(util_cpu_caps.has_sse4_1) {
-      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
-   }
-   else {
-      assert(0);
-      res = bld->undef;
-   }
-
-   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
-
-   return res;
-}
-
-
-LLVMValueRef
-lp_build_sqrt(struct lp_build_context *bld,
-              LLVMValueRef a)
-{
-   const struct lp_type type = bld->type;
-   LLVMTypeRef vec_type = lp_build_vec_type(type);
-   char intrinsic[32];
-
-   /* TODO: optimize the constant case */
-   /* TODO: optimize the constant case */
-
-   assert(type.floating);
-   util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
-
-   return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
-}
-
-
-LLVMValueRef
-lp_build_rcp(struct lp_build_context *bld,
-             LLVMValueRef a)
-{
-   const struct lp_type type = bld->type;
-
-   if(a == bld->zero)
-      return bld->undef;
-   if(a == bld->one)
-      return bld->one;
-   if(a == bld->undef)
-      return bld->undef;
-
-   assert(type.floating);
-
-   if(LLVMIsConstant(a))
-      return LLVMConstFDiv(bld->one, a);
-
-   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
-      /* FIXME: improve precision */
-      return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
-
-   return LLVMBuildFDiv(bld->builder, bld->one, a, "");
-}
-
-
-/**
- * Generate 1/sqrt(a)
- */
-LLVMValueRef
-lp_build_rsqrt(struct lp_build_context *bld,
-               LLVMValueRef a)
-{
-   const struct lp_type type = bld->type;
-
-   assert(type.floating);
-
-   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
-      return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
-
-   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
-}
-
-
-/**
- * Generate cos(a)
- */
-LLVMValueRef
-lp_build_cos(struct lp_build_context *bld,
-              LLVMValueRef a)
-{
-   const struct lp_type type = bld->type;
-   LLVMTypeRef vec_type = lp_build_vec_type(type);
-   char intrinsic[32];
-
-   /* TODO: optimize the constant case */
-
-   assert(type.floating);
-   util_snprintf(intrinsic, sizeof intrinsic, "llvm.cos.v%uf%u", type.length, type.width);
-
-   return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
-}
-
-
-/**
- * Generate sin(a)
- */
-LLVMValueRef
-lp_build_sin(struct lp_build_context *bld,
-              LLVMValueRef a)
-{
-   const struct lp_type type = bld->type;
-   LLVMTypeRef vec_type = lp_build_vec_type(type);
-   char intrinsic[32];
-
-   /* TODO: optimize the constant case */
-
-   assert(type.floating);
-   util_snprintf(intrinsic, sizeof intrinsic, "llvm.sin.v%uf%u", type.length, type.width);
-
-   return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
-}
-
-
-/**
- * Generate pow(x, y)
- */
-LLVMValueRef
-lp_build_pow(struct lp_build_context *bld,
-             LLVMValueRef x,
-             LLVMValueRef y)
-{
-   /* TODO: optimize the constant case */
-   if(LLVMIsConstant(x) && LLVMIsConstant(y))
-      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
-                   __FUNCTION__);
-
-   return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
-}
-
-
-/**
- * Generate exp(x)
- */
-LLVMValueRef
-lp_build_exp(struct lp_build_context *bld,
-             LLVMValueRef x)
-{
-   /* log2(e) = 1/log(2) */
-   LLVMValueRef log2e = lp_build_const_scalar(bld->type, 1.4426950408889634);
-
-   return lp_build_mul(bld, log2e, lp_build_exp2(bld, x));
-}
-
-
-/**
- * Generate log(x)
- */
-LLVMValueRef
-lp_build_log(struct lp_build_context *bld,
-             LLVMValueRef x)
-{
-   /* log(2) */
-   LLVMValueRef log2 = lp_build_const_scalar(bld->type, 0.69314718055994529);
-
-   return lp_build_mul(bld, log2, lp_build_exp2(bld, x));
-}
-
-
-#define EXP_POLY_DEGREE 3
-#define LOG_POLY_DEGREE 5
-
-
-/**
- * Generate polynomial.
- * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
- */
-static LLVMValueRef
-lp_build_polynomial(struct lp_build_context *bld,
-                    LLVMValueRef x,
-                    const double *coeffs,
-                    unsigned num_coeffs)
-{
-   const struct lp_type type = bld->type;
-   LLVMValueRef res = NULL;
-   unsigned i;
-
-   /* TODO: optimize the constant case */
-   if(LLVMIsConstant(x))
-      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
-                   __FUNCTION__);
-
-   for (i = num_coeffs; i--; ) {
-      LLVMValueRef coeff = lp_build_const_scalar(type, coeffs[i]);
-      if(res)
-         res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
-      else
-         res = coeff;
-   }
-
-   if(res)
-      return res;
-   else
-      return bld->undef;
-}
-
-
-/**
- * Minimax polynomial fit of 2**x, in range [-0.5, 0.5[
- */
-const double lp_build_exp2_polynomial[] = {
-#if EXP_POLY_DEGREE == 5
-   9.9999994e-1, 6.9315308e-1, 2.4015361e-1, 5.5826318e-2, 8.9893397e-3, 1.8775767e-3
-#elif EXP_POLY_DEGREE == 4
-   1.0000026, 6.9300383e-1, 2.4144275e-1, 5.2011464e-2, 1.3534167e-2
-#elif EXP_POLY_DEGREE == 3
-   9.9992520e-1, 6.9583356e-1, 2.2606716e-1, 7.8024521e-2
-#elif EXP_POLY_DEGREE == 2
-   1.0017247, 6.5763628e-1, 3.3718944e-1
-#else
-#error
-#endif
-};
-
-
-void
-lp_build_exp2_approx(struct lp_build_context *bld,
-                     LLVMValueRef x,
-                     LLVMValueRef *p_exp2_int_part,
-                     LLVMValueRef *p_frac_part,
-                     LLVMValueRef *p_exp2)
-{
-   const struct lp_type type = bld->type;
-   LLVMTypeRef vec_type = lp_build_vec_type(type);
-   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
-   LLVMValueRef ipart = NULL;
-   LLVMValueRef fpart = NULL;
-   LLVMValueRef expipart = NULL;
-   LLVMValueRef expfpart = NULL;
-   LLVMValueRef res = NULL;
-
-   if(p_exp2_int_part || p_frac_part || p_exp2) {
-      /* TODO: optimize the constant case */
-      if(LLVMIsConstant(x))
-         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
-                      __FUNCTION__);
-
-      assert(type.floating && type.width == 32);
-
-      x = lp_build_min(bld, x, lp_build_const_scalar(type,  129.0));
-      x = lp_build_max(bld, x, lp_build_const_scalar(type, -126.99999));
-
-      /* ipart = int(x - 0.5) */
-      ipart = LLVMBuildSub(bld->builder, x, lp_build_const_scalar(type, 0.5f), "");
-      ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
-
-      /* fpart = x - ipart */
-      fpart = LLVMBuildSIToFP(bld->builder, ipart, vec_type, "");
-      fpart = LLVMBuildSub(bld->builder, x, fpart, "");
-   }
-
-   if(p_exp2_int_part || p_exp2) {
-      /* expipart = (float) (1 << ipart) */
-      expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_int_const_scalar(type, 127), "");
-      expipart = LLVMBuildShl(bld->builder, expipart, lp_build_int_const_scalar(type, 23), "");
-      expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
-   }
-
-   if(p_exp2) {
-      expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
-                                     Elements(lp_build_exp2_polynomial));
-
-      res = LLVMBuildMul(bld->builder, expipart, expfpart, "");
-   }
-
-   if(p_exp2_int_part)
-      *p_exp2_int_part = expipart;
-
-   if(p_frac_part)
-      *p_frac_part = fpart;
-
-   if(p_exp2)
-      *p_exp2 = res;
-}
-
-
-LLVMValueRef
-lp_build_exp2(struct lp_build_context *bld,
-              LLVMValueRef x)
-{
-   LLVMValueRef res;
-   lp_build_exp2_approx(bld, x, NULL, NULL, &res);
-   return res;
-}
-
-
-/**
- * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
- * These coefficients can be generate with
- * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
- */
-const double lp_build_log2_polynomial[] = {
-#if LOG_POLY_DEGREE == 6
-   3.11578814719469302614, -3.32419399085241980044, 2.59883907202499966007, -1.23152682416275988241, 0.318212422185251071475, -0.0344359067839062357313
-#elif LOG_POLY_DEGREE == 5
-   2.8882704548164776201, -2.52074962577807006663, 1.48116647521213171641, -0.465725644288844778798, 0.0596515482674574969533
-#elif LOG_POLY_DEGREE == 4
-   2.61761038894603480148, -1.75647175389045657003, 0.688243882994381274313, -0.107254423828329604454
-#elif LOG_POLY_DEGREE == 3
-   2.28330284476918490682, -1.04913055217340124191, 0.204446009836232697516
-#else
-#error
-#endif
-};
-
-
-/**
- * See http://www.devmaster.net/forums/showthread.php?p=43580
- */
-void
-lp_build_log2_approx(struct lp_build_context *bld,
-                     LLVMValueRef x,
-                     LLVMValueRef *p_exp,
-                     LLVMValueRef *p_floor_log2,
-                     LLVMValueRef *p_log2)
-{
-   const struct lp_type type = bld->type;
-   LLVMTypeRef vec_type = lp_build_vec_type(type);
-   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
-
-   LLVMValueRef expmask = lp_build_int_const_scalar(type, 0x7f800000);
-   LLVMValueRef mantmask = lp_build_int_const_scalar(type, 0x007fffff);
-   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
-
-   LLVMValueRef i = NULL;
-   LLVMValueRef exp = NULL;
-   LLVMValueRef mant = NULL;
-   LLVMValueRef logexp = NULL;
-   LLVMValueRef logmant = NULL;
-   LLVMValueRef res = NULL;
-
-   if(p_exp || p_floor_log2 || p_log2) {
-      /* TODO: optimize the constant case */
-      if(LLVMIsConstant(x))
-         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
-                      __FUNCTION__);
-
-      assert(type.floating && type.width == 32);
-
-      i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
-
-      /* exp = (float) exponent(x) */
-      exp = LLVMBuildAnd(bld->builder, i, expmask, "");
-   }
-
-   if(p_floor_log2 || p_log2) {
-      logexp = LLVMBuildLShr(bld->builder, exp, lp_build_int_const_scalar(type, 23), "");
-      logexp = LLVMBuildSub(bld->builder, logexp, lp_build_int_const_scalar(type, 127), "");
-      logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
-   }
-
-   if(p_log2) {
-      /* mant = (float) mantissa(x) */
-      mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
-      mant = LLVMBuildOr(bld->builder, mant, one, "");
-      mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
-
-      logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
-                                    Elements(lp_build_log2_polynomial));
-
-      /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
-      logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");
-
-      res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
-   }
-
-   if(p_exp)
-      *p_exp = exp;
-
-   if(p_floor_log2)
-      *p_floor_log2 = logexp;
-
-   if(p_log2)
-      *p_log2 = res;
-}
-
-
-LLVMValueRef
-lp_build_log2(struct lp_build_context *bld,
-              LLVMValueRef x)
-{
-   LLVMValueRef res;
-   lp_build_log2_approx(bld, x, NULL, NULL, &res);
-   return res;
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_arit.h b/src/gallium/drivers/llvmpipe/lp_bld_arit.h
deleted file mode 100644
index 62be4b9aee..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_arit.h
+++ /dev/null
@@ -1,203 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * Helper arithmetic functions.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#ifndef LP_BLD_ARIT_H
-#define LP_BLD_ARIT_H
-
-
-#include <llvm-c/Core.h>  
-
-
-struct lp_type;
-struct lp_build_context;
-
-
-/**
- * Complement, i.e., 1 - a.
- */
-LLVMValueRef
-lp_build_comp(struct lp_build_context *bld,
-              LLVMValueRef a);
-
-LLVMValueRef
-lp_build_add(struct lp_build_context *bld,
-             LLVMValueRef a,
-             LLVMValueRef b);
-
-LLVMValueRef
-lp_build_sub(struct lp_build_context *bld,
-             LLVMValueRef a,
-             LLVMValueRef b);
-
-LLVMValueRef
-lp_build_mul(struct lp_build_context *bld,
-             LLVMValueRef a,
-             LLVMValueRef b);
-
-LLVMValueRef
-lp_build_mul_imm(struct lp_build_context *bld,
-                 LLVMValueRef a,
-                 int b);
-
-LLVMValueRef
-lp_build_div(struct lp_build_context *bld,
-             LLVMValueRef a,
-             LLVMValueRef b);
-
-LLVMValueRef
-lp_build_lerp(struct lp_build_context *bld,
-              LLVMValueRef x,
-              LLVMValueRef v0,
-              LLVMValueRef v1);
-
-/**
- * Bilinear interpolation.
- *
- * Values indices are in v_{yx}.
- */
-LLVMValueRef
-lp_build_lerp_2d(struct lp_build_context *bld,
-                 LLVMValueRef x,
-                 LLVMValueRef y,
-                 LLVMValueRef v00,
-                 LLVMValueRef v01,
-                 LLVMValueRef v10,
-                 LLVMValueRef v11);
-
-LLVMValueRef
-lp_build_min(struct lp_build_context *bld,
-             LLVMValueRef a,
-             LLVMValueRef b);
-
-LLVMValueRef
-lp_build_max(struct lp_build_context *bld,
-             LLVMValueRef a,
-             LLVMValueRef b);
-
-LLVMValueRef
-lp_build_abs(struct lp_build_context *bld,
-             LLVMValueRef a);
-
-LLVMValueRef
-lp_build_sgn(struct lp_build_context *bld,
-             LLVMValueRef a);
-
-LLVMValueRef
-lp_build_round(struct lp_build_context *bld,
-               LLVMValueRef a);
-
-LLVMValueRef
-lp_build_floor(struct lp_build_context *bld,
-               LLVMValueRef a);
-
-LLVMValueRef
-lp_build_ceil(struct lp_build_context *bld,
-              LLVMValueRef a);
-
-LLVMValueRef
-lp_build_trunc(struct lp_build_context *bld,
-               LLVMValueRef a);
-
-LLVMValueRef
-lp_build_ifloor(struct lp_build_context *bld,
-                LLVMValueRef a);
-LLVMValueRef
-lp_build_iceil(struct lp_build_context *bld,
-               LLVMValueRef a);
-
-LLVMValueRef
-lp_build_iround(struct lp_build_context *bld,
-                LLVMValueRef a);
-
-LLVMValueRef
-lp_build_itrunc(struct lp_build_context *bld,
-                LLVMValueRef a);
-
-LLVMValueRef
-lp_build_sqrt(struct lp_build_context *bld,
-              LLVMValueRef a);
-
-LLVMValueRef
-lp_build_rcp(struct lp_build_context *bld,
-             LLVMValueRef a);
-
-LLVMValueRef
-lp_build_rsqrt(struct lp_build_context *bld,
-               LLVMValueRef a);
-
-LLVMValueRef
-lp_build_cos(struct lp_build_context *bld,
-             LLVMValueRef a);
-
-LLVMValueRef
-lp_build_sin(struct lp_build_context *bld,
-             LLVMValueRef a);
-
-LLVMValueRef
-lp_build_pow(struct lp_build_context *bld,
-             LLVMValueRef a,
-             LLVMValueRef b);
-
-LLVMValueRef
-lp_build_exp(struct lp_build_context *bld,
-             LLVMValueRef a);
-
-LLVMValueRef
-lp_build_log(struct lp_build_context *bld,
-             LLVMValueRef a);
-
-LLVMValueRef
-lp_build_exp2(struct lp_build_context *bld,
-              LLVMValueRef a);
-
-LLVMValueRef
-lp_build_log2(struct lp_build_context *bld,
-              LLVMValueRef a);
-
-void
-lp_build_exp2_approx(struct lp_build_context *bld,
-                     LLVMValueRef x,
-                     LLVMValueRef *p_exp2_int_part,
-                     LLVMValueRef *p_frac_part,
-                     LLVMValueRef *p_exp2);
-
-void
-lp_build_log2_approx(struct lp_build_context *bld,
-                     LLVMValueRef x,
-                     LLVMValueRef *p_exp,
-                     LLVMValueRef *p_floor_log2,
-                     LLVMValueRef *p_log2);
-
-#endif /* !LP_BLD_ARIT_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend.h b/src/gallium/drivers/llvmpipe/lp_bld_blend.h
deleted file mode 100644
index da272e549f..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#ifndef LP_BLD_BLEND_H
-#define LP_BLD_BLEND_H
-
-
-/**
- * @file
- * LLVM IR building helpers interfaces.
- *
- * We use LLVM-C bindings for now. They are not documented, but follow the C++
- * interfaces very closely, and appear to be complete enough for code
- * genration. See
- * http://npcontemplation.blogspot.com/2008/06/secret-of-llvm-c-bindings.html
- * for a standalone example.
- */
-
-#include <llvm-c/Core.h>  
- 
-#include "pipe/p_format.h"
-
-
-struct pipe_blend_state;
-struct lp_type;
-struct lp_build_context;
-
-
-/**
- * Whether the blending function is commutative or not.
- */
-boolean
-lp_build_blend_func_commutative(unsigned func);
-
-
-/**
- * Whether the blending functions are the reverse of each other.
- */
-boolean
-lp_build_blend_func_reverse(unsigned rgb_func, unsigned alpha_func);
-
-
-LLVMValueRef
-lp_build_blend_func(struct lp_build_context *bld,
-                    unsigned func,
-                    LLVMValueRef term1,
-                    LLVMValueRef term2);
-
-
-LLVMValueRef
-lp_build_blend_aos(LLVMBuilderRef builder,
-                   const struct pipe_blend_state *blend,
-                   struct lp_type type,
-                   LLVMValueRef src,
-                   LLVMValueRef dst,
-                   LLVMValueRef const_,
-                   unsigned alpha_swizzle);
-
-
-void
-lp_build_blend_soa(LLVMBuilderRef builder,
-                   const struct pipe_blend_state *blend,
-                   struct lp_type type,
-                   LLVMValueRef src[4],
-                   LLVMValueRef dst[4],
-                   LLVMValueRef const_[4],
-                   LLVMValueRef res[4]);
-
-
-/**
- * Apply a logic op.
- *
- * src/dst parameters are packed values. It should work regardless the inputs
- * are scalars, or a vector.
- */
-LLVMValueRef
-lp_build_logicop(LLVMBuilderRef builder,
-                 unsigned logicop_func,
-                 LLVMValueRef src,
-                 LLVMValueRef dst);
-
-
-#endif /* !LP_BLD_BLEND_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
deleted file mode 100644
index ced7b9c11d..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
+++ /dev/null
@@ -1,356 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-/**
- * @file
- * Blend LLVM IR generation -- AoS layout.
- *
- * AoS blending is in general much slower than SoA, but there are some cases
- * where it might be faster. In particular, if a pixel is rendered only once
- * then the overhead of tiling and untiling will dominate over the speedup that
- * SoA gives. So we might want to detect such cases and fallback to AoS in the
- * future, but for now this function is here for historical/benchmarking
- * purposes.
- *
- * Run lp_blend_test after any change to this file.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#include "pipe/p_state.h"
-
-#include "lp_bld_type.h"
-#include "lp_bld_const.h"
-#include "lp_bld_arit.h"
-#include "lp_bld_logic.h"
-#include "lp_bld_swizzle.h"
-#include "lp_bld_blend.h"
-#include "lp_bld_debug.h"
-
-
-/**
- * We may the same values several times, so we keep them here to avoid
- * recomputing them. Also reusing the values allows us to do simplifications
- * that LLVM optimization passes wouldn't normally be able to do.
- */
-struct lp_build_blend_aos_context
-{
-   struct lp_build_context base;
-   
-   LLVMValueRef src;
-   LLVMValueRef dst;
-   LLVMValueRef const_;
-
-   LLVMValueRef inv_src;
-   LLVMValueRef inv_dst;
-   LLVMValueRef inv_const;
-   LLVMValueRef saturate;
-
-   LLVMValueRef rgb_src_factor;
-   LLVMValueRef alpha_src_factor;
-   LLVMValueRef rgb_dst_factor;
-   LLVMValueRef alpha_dst_factor;
-};
-
-
-static LLVMValueRef
-lp_build_blend_factor_unswizzled(struct lp_build_blend_aos_context *bld,
-                                 unsigned factor,
-                                 boolean alpha)
-{
-   switch (factor) {
-   case PIPE_BLENDFACTOR_ZERO:
-      return bld->base.zero;
-   case PIPE_BLENDFACTOR_ONE:
-      return bld->base.one;
-   case PIPE_BLENDFACTOR_SRC_COLOR:
-   case PIPE_BLENDFACTOR_SRC_ALPHA:
-      return bld->src;
-   case PIPE_BLENDFACTOR_DST_COLOR:
-   case PIPE_BLENDFACTOR_DST_ALPHA:
-      return bld->dst;
-   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-      if(alpha)
-         return bld->base.one;
-      else {
-         if(!bld->inv_dst)
-            bld->inv_dst = lp_build_comp(&bld->base, bld->dst);
-         if(!bld->saturate)
-            bld->saturate = lp_build_min(&bld->base, bld->src, bld->inv_dst);
-         return bld->saturate;
-      }
-   case PIPE_BLENDFACTOR_CONST_COLOR:
-   case PIPE_BLENDFACTOR_CONST_ALPHA:
-      return bld->const_;
-   case PIPE_BLENDFACTOR_SRC1_COLOR:
-   case PIPE_BLENDFACTOR_SRC1_ALPHA:
-      /* TODO */
-      assert(0);
-      return bld->base.zero;
-   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-      if(!bld->inv_src)
-         bld->inv_src = lp_build_comp(&bld->base, bld->src);
-      return bld->inv_src;
-   case PIPE_BLENDFACTOR_INV_DST_COLOR:
-   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-      if(!bld->inv_dst)
-         bld->inv_dst = lp_build_comp(&bld->base, bld->dst);
-      return bld->inv_dst;
-   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-      if(!bld->inv_const)
-         bld->inv_const = lp_build_comp(&bld->base, bld->const_);
-      return bld->inv_const;
-   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
-   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
-      /* TODO */
-      assert(0);
-      return bld->base.zero;
-   default:
-      assert(0);
-      return bld->base.zero;
-   }
-}
-
-
-enum lp_build_blend_swizzle {
-   LP_BUILD_BLEND_SWIZZLE_RGBA = 0,
-   LP_BUILD_BLEND_SWIZZLE_AAAA = 1
-};
-
-
-/**
- * How should we shuffle the base factor.
- */
-static enum lp_build_blend_swizzle
-lp_build_blend_factor_swizzle(unsigned factor)
-{
-   switch (factor) {
-   case PIPE_BLENDFACTOR_ONE:
-   case PIPE_BLENDFACTOR_ZERO:
-   case PIPE_BLENDFACTOR_SRC_COLOR:
-   case PIPE_BLENDFACTOR_DST_COLOR:
-   case PIPE_BLENDFACTOR_CONST_COLOR:
-   case PIPE_BLENDFACTOR_SRC1_COLOR:
-   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-   case PIPE_BLENDFACTOR_INV_DST_COLOR:
-   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
-      return LP_BUILD_BLEND_SWIZZLE_RGBA;
-   case PIPE_BLENDFACTOR_SRC_ALPHA:
-   case PIPE_BLENDFACTOR_DST_ALPHA:
-   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-   case PIPE_BLENDFACTOR_SRC1_ALPHA:
-   case PIPE_BLENDFACTOR_CONST_ALPHA:
-   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
-      return LP_BUILD_BLEND_SWIZZLE_AAAA;
-   default:
-      assert(0);
-      return LP_BUILD_BLEND_SWIZZLE_RGBA;
-   }
-}
-
-
-static LLVMValueRef
-lp_build_blend_swizzle(struct lp_build_blend_aos_context *bld,
-                       LLVMValueRef rgb, 
-                       LLVMValueRef alpha, 
-                       enum lp_build_blend_swizzle rgb_swizzle,
-                       unsigned alpha_swizzle)
-{
-   if(rgb == alpha) {
-      if(rgb_swizzle == LP_BUILD_BLEND_SWIZZLE_RGBA)
-         return rgb;
-      if(rgb_swizzle == LP_BUILD_BLEND_SWIZZLE_AAAA)
-         return lp_build_broadcast_aos(&bld->base, rgb, alpha_swizzle);
-   }
-   else {
-      if(rgb_swizzle == LP_BUILD_BLEND_SWIZZLE_RGBA) {
-         boolean cond[4] = {0, 0, 0, 0};
-         cond[alpha_swizzle] = 1;
-         return lp_build_select_aos(&bld->base, alpha, rgb, cond);
-      }
-      if(rgb_swizzle == LP_BUILD_BLEND_SWIZZLE_AAAA) {
-         unsigned char swizzle[4];
-         swizzle[0] = alpha_swizzle;
-         swizzle[1] = alpha_swizzle;
-         swizzle[2] = alpha_swizzle;
-         swizzle[3] = alpha_swizzle;
-         swizzle[alpha_swizzle] += 4;
-         return lp_build_swizzle2_aos(&bld->base, rgb, alpha, swizzle);
-      }
-   }
-   assert(0);
-   return bld->base.undef;
-}
-
-
-/**
- * @sa http://www.opengl.org/sdk/docs/man/xhtml/glBlendFuncSeparate.xml
- */
-static LLVMValueRef
-lp_build_blend_factor(struct lp_build_blend_aos_context *bld,
-                      LLVMValueRef factor1,
-                      unsigned rgb_factor,
-                      unsigned alpha_factor,
-                      unsigned alpha_swizzle)
-{
-   LLVMValueRef rgb_factor_;
-   LLVMValueRef alpha_factor_;
-   LLVMValueRef factor2;
-   enum lp_build_blend_swizzle rgb_swizzle;
-
-   rgb_factor_   = lp_build_blend_factor_unswizzled(bld, rgb_factor,   FALSE);
-   alpha_factor_ = lp_build_blend_factor_unswizzled(bld, alpha_factor, TRUE);
-
-   rgb_swizzle = lp_build_blend_factor_swizzle(rgb_factor);
-
-   factor2 = lp_build_blend_swizzle(bld, rgb_factor_, alpha_factor_, rgb_swizzle, alpha_swizzle);
-
-   return lp_build_mul(&bld->base, factor1, factor2);
-}
-
-
-boolean
-lp_build_blend_func_commutative(unsigned func)
-{
-   switch (func) {
-   case PIPE_BLEND_ADD:
-   case PIPE_BLEND_MIN:
-   case PIPE_BLEND_MAX:
-      return TRUE;
-   case PIPE_BLEND_SUBTRACT:
-   case PIPE_BLEND_REVERSE_SUBTRACT:
-      return FALSE;
-   default:
-      assert(0);
-      return TRUE;
-   }
-}
-
-
-boolean
-lp_build_blend_func_reverse(unsigned rgb_func, unsigned alpha_func)
-{
-   if(rgb_func == alpha_func)
-      return FALSE;
-   if(rgb_func == PIPE_BLEND_SUBTRACT && alpha_func == PIPE_BLEND_REVERSE_SUBTRACT)
-      return TRUE;
-   if(rgb_func == PIPE_BLEND_REVERSE_SUBTRACT && alpha_func == PIPE_BLEND_SUBTRACT)
-      return TRUE;
-   return FALSE;
-}
-
-
-/**
- * @sa http://www.opengl.org/sdk/docs/man/xhtml/glBlendEquationSeparate.xml
- */
-LLVMValueRef
-lp_build_blend_func(struct lp_build_context *bld,
-                    unsigned func,
-                    LLVMValueRef term1, 
-                    LLVMValueRef term2)
-{
-   switch (func) {
-   case PIPE_BLEND_ADD:
-      return lp_build_add(bld, term1, term2);
-      break;
-   case PIPE_BLEND_SUBTRACT:
-      return lp_build_sub(bld, term1, term2);
-   case PIPE_BLEND_REVERSE_SUBTRACT:
-      return lp_build_sub(bld, term2, term1);
-   case PIPE_BLEND_MIN:
-      return lp_build_min(bld, term1, term2);
-   case PIPE_BLEND_MAX:
-      return lp_build_max(bld, term1, term2);
-   default:
-      assert(0);
-      return bld->zero;
-   }
-}
-
-
-LLVMValueRef
-lp_build_blend_aos(LLVMBuilderRef builder,
-                   const struct pipe_blend_state *blend,
-                   struct lp_type type,
-                   LLVMValueRef src,
-                   LLVMValueRef dst,
-                   LLVMValueRef const_,
-                   unsigned alpha_swizzle)
-{
-   struct lp_build_blend_aos_context bld;
-   LLVMValueRef src_term;
-   LLVMValueRef dst_term;
-
-   /* FIXME */
-   assert(blend->colormask == 0xf);
-
-   if(!blend->blend_enable)
-      return src;
-
-   /* It makes no sense to blend unless values are normalized */
-   assert(type.norm);
-
-   /* Setup build context */
-   memset(&bld, 0, sizeof bld);
-   lp_build_context_init(&bld.base, builder, type);
-   bld.src = src;
-   bld.dst = dst;
-   bld.const_ = const_;
-
-   /* TODO: There are still a few optimization opportunities here. For certain
-    * combinations it is possible to reorder the operations and therefore saving
-    * some instructions. */
-
-   src_term = lp_build_blend_factor(&bld, src, blend->rgb_src_factor, blend->alpha_src_factor, alpha_swizzle);
-   dst_term = lp_build_blend_factor(&bld, dst, blend->rgb_dst_factor, blend->alpha_dst_factor, alpha_swizzle);
-
-   lp_build_name(src_term, "src_term");
-   lp_build_name(dst_term, "dst_term");
-
-   if(blend->rgb_func == blend->alpha_func) {
-      return lp_build_blend_func(&bld.base, blend->rgb_func, src_term, dst_term);
-   }
-   else {
-      /* Seperate RGB / A functions */
-
-      LLVMValueRef rgb;
-      LLVMValueRef alpha;
-
-      rgb   = lp_build_blend_func(&bld.base, blend->rgb_func,   src_term, dst_term);
-      alpha = lp_build_blend_func(&bld.base, blend->alpha_func, src_term, dst_term);
-
-      return lp_build_blend_swizzle(&bld, rgb, alpha, LP_BUILD_BLEND_SWIZZLE_RGBA, alpha_swizzle);
-   }
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_logicop.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_logicop.c
deleted file mode 100644
index 88321f62a2..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend_logicop.c
+++ /dev/null
@@ -1,108 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-/**
- * @file
- * Blend LLVM IR generation -- logic ops.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#include "pipe/p_state.h"
-
-#include "lp_bld_blend.h"
-
-/* Emit the IR for one PIPE_LOGICOP_* operation applied to src and dst. */
-LLVMValueRef
-lp_build_logicop(LLVMBuilderRef builder,
-                 unsigned logicop_func,
-                 LLVMValueRef src,
-                 LLVMValueRef dst)
-{
-   /* The operand type, taken from src, is only needed by the two constant
-    * cases (CLEAR and SET). */
-   const LLVMTypeRef operand_type = LLVMTypeOf(src);
-   LLVMValueRef inner;
-
-   switch (logicop_func) {
-   case PIPE_LOGICOP_CLEAR:
-      /* all bits zero */
-      return LLVMConstNull(operand_type);
-   case PIPE_LOGICOP_NOR:
-      inner = LLVMBuildOr(builder, src, dst, "");
-      return LLVMBuildNot(builder, inner, "");
-   case PIPE_LOGICOP_AND_INVERTED:
-      inner = LLVMBuildNot(builder, src, "");
-      return LLVMBuildAnd(builder, inner, dst, "");
-   case PIPE_LOGICOP_COPY_INVERTED:
-      /* ~src */
-      return LLVMBuildNot(builder, src, "");
-   case PIPE_LOGICOP_AND_REVERSE:
-      inner = LLVMBuildNot(builder, dst, "");
-      return LLVMBuildAnd(builder, src, inner, "");
-   case PIPE_LOGICOP_INVERT:
-      /* ~dst */
-      return LLVMBuildNot(builder, dst, "");
-   case PIPE_LOGICOP_XOR:
-      /* src ^ dst */
-      return LLVMBuildXor(builder, src, dst, "");
-   case PIPE_LOGICOP_NAND:
-      inner = LLVMBuildAnd(builder, src, dst, "");
-      return LLVMBuildNot(builder, inner, "");
-   case PIPE_LOGICOP_AND:
-      /* src & dst */
-      return LLVMBuildAnd(builder, src, dst, "");
-   case PIPE_LOGICOP_EQUIV:
-      inner = LLVMBuildXor(builder, src, dst, "");
-      return LLVMBuildNot(builder, inner, "");
-   case PIPE_LOGICOP_NOOP:
-      /* destination value is returned as is */
-      return dst;
-   case PIPE_LOGICOP_OR_INVERTED:
-      inner = LLVMBuildNot(builder, src, "");
-      return LLVMBuildOr(builder, inner, dst, "");
-   case PIPE_LOGICOP_COPY:
-      /* source value is returned as is */
-      return src;
-   case PIPE_LOGICOP_OR_REVERSE:
-      inner = LLVMBuildNot(builder, dst, "");
-      return LLVMBuildOr(builder, src, inner, "");
-   case PIPE_LOGICOP_OR:
-      /* src | dst */
-      return LLVMBuildOr(builder, src, dst, "");
-   case PIPE_LOGICOP_SET:
-      /* all bits one */
-      return LLVMConstAllOnes(operand_type);
-   default:
-      break;
-   }
-   /* Unknown selector: assert, then fall back to passing src through. */
-   assert(0);
-   return src;
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c
deleted file mode 100644
index 9511299d55..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c
+++ /dev/null
@@ -1,298 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-/**
- * @file
- * Blend LLVM IR generation -- SoA layout.
- *
- * Blending in SoA is much faster than AoS, especially when separate rgb/alpha
- * factors/functions are used, since no channel masking/shuffling is necessary
- * and we can achieve the full throughput of the SIMD operations. Furthermore
- * the fragment shader output is also in SoA, so it fits nicely with the rest of
- * the fragment pipeline.
- *
- * The drawback is that to be displayed the color buffer needs to be in AoS
- * layout, so we need to tile/untile the color buffer before/after rendering.
- * A color buffer like
- *
- *  R11 G11 B11 A11 R12 G12 B12 A12  R13 G13 B13 A13 R14 G14 B14 A14  ...
- *  R21 G21 B21 A21 R22 G22 B22 A22  R23 G23 B23 A23 R24 G24 B24 A24  ...
- *
- *  R31 G31 B31 A31 R32 G32 B32 A32  R33 G33 B33 A33 R34 G34 B34 A34  ...
- *  R41 G41 B41 A41 R42 G42 B42 A42  R43 G43 B43 A43 R44 G44 B44 A44  ...
- *
- *  ... ... ... ... ... ... ... ...  ... ... ... ... ... ... ... ...  ...
- *
- * will actually be stored in memory as
- *
- *  R11 R12 R21 R22 R13 R14 R23 R24 ... G11 G12 G21 G22 G13 G14 G23 G24 ... B11 B12 B21 B22 B13 B14 B23 B24 ... A11 A12 A21 A22 A13 A14 A23 A24 ...
- *  R31 R32 R41 R42 R33 R34 R43 R44 ... G31 G32 G41 G42 G33 G34 G43 G44 ... B31 B32 B41 B42 B33 B34 B43 B44 ... A31 A32 A41 A42 A33 A34 A43 A44 ...
- *  ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
- *
- * NOTE: Run lp_blend_test after any change to this file.
- *
- * You can also run lp_blend_test to obtain AoS vs SoA benchmarks. Invoking it
- * as:
- *
- *  lp_blend_test -o blend.tsv
- *
- * will generate a tab-separated file with the test results and performance
- * measurements.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#include "pipe/p_state.h"
-
-#include "lp_bld_type.h"
-#include "lp_bld_const.h"
-#include "lp_bld_arit.h"
-#include "lp_bld_blend.h"
-
-
-/**
- * We may use the same values several times, so we keep them here to avoid
- * recomputing them. Also reusing the values allows us to do simplifications
- * that LLVM optimization passes wouldn't normally be able to do.
- */
-struct lp_build_blend_soa_context
-{
-   struct lp_build_context base;
-   /* Per-channel inputs, copied from the lp_build_blend_soa() arguments. */
-   LLVMValueRef src[4];   /* src/fragment color */
-   LLVMValueRef dst[4];   /* dst/framebuffer color */
-   LLVMValueRef con[4];   /* constant blend color */
-   /* lp_build_comp() of the inputs above; built on first use, NULL before. */
-   LLVMValueRef inv_src[4];
-   LLVMValueRef inv_dst[4];
-   LLVMValueRef inv_con[4];
-   /* lp_build_min() of src[3] and inv_dst[3]; built on first use, NULL before. */
-   LLVMValueRef src_alpha_saturate;
-
-   /**
-    * Factor table, kept in order to eliminate redundant multiplications later.
-    * [k][operand][channel]: k 0 = src term, 1 = dst term; operand 0 = color, 1 = factor.
-    */
-   LLVMValueRef factor[2][2][4];
-
-   /**
-    * Table with all terms: [k][channel] is factor[k][0] times factor[k][1] for that channel.
-    */
-   LLVMValueRef term[2][4];
-};
-
-/* Return the value of one blend factor for channel i, caching derived values in bld. */
-static LLVMValueRef
-lp_build_blend_soa_factor(struct lp_build_blend_soa_context *bld,
-                          unsigned factor, unsigned i)
-{
-   /*
-    * Map a PIPE_BLENDFACTOR_* selector to a value; index 3 is the alpha channel.
-    */
-   switch (factor) {
-   case PIPE_BLENDFACTOR_ONE:
-      return bld->base.one;
-   case PIPE_BLENDFACTOR_SRC_COLOR:
-      return bld->src[i];
-   case PIPE_BLENDFACTOR_SRC_ALPHA:
-      return bld->src[3];
-   case PIPE_BLENDFACTOR_DST_COLOR:
-      return bld->dst[i];
-   case PIPE_BLENDFACTOR_DST_ALPHA:
-      return bld->dst[3];
-   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-      if(i == 3) /* the alpha channel always gets one */
-         return bld->base.one;
-      else {
-         /* other channels: lp_build_min(src alpha, lp_build_comp(dst alpha)), built once */
-         if(!bld->inv_dst[3])
-            bld->inv_dst[3] = lp_build_comp(&bld->base, bld->dst[3]);
-         if(!bld->src_alpha_saturate)
-            bld->src_alpha_saturate = lp_build_min(&bld->base, bld->src[3], bld->inv_dst[3]);
-         return bld->src_alpha_saturate;
-      }
-   case PIPE_BLENDFACTOR_CONST_COLOR:
-      return bld->con[i];
-   case PIPE_BLENDFACTOR_CONST_ALPHA:
-      return bld->con[3];
-   case PIPE_BLENDFACTOR_SRC1_COLOR:
-      /* TODO: not implemented; asserts, then yields zero */
-      assert(0);
-      return bld->base.zero;
-   case PIPE_BLENDFACTOR_SRC1_ALPHA:
-      /* TODO: not implemented; asserts, then yields zero */
-      assert(0);
-      return bld->base.zero;
-   case PIPE_BLENDFACTOR_ZERO:
-      return bld->base.zero;
-   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-      if(!bld->inv_src[i]) /* build on first use, reuse afterwards */
-         bld->inv_src[i] = lp_build_comp(&bld->base, bld->src[i]);
-      return bld->inv_src[i];
-   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-      if(!bld->inv_src[3])
-         bld->inv_src[3] = lp_build_comp(&bld->base, bld->src[3]);
-      return bld->inv_src[3];
-   case PIPE_BLENDFACTOR_INV_DST_COLOR:
-      if(!bld->inv_dst[i])
-         bld->inv_dst[i] = lp_build_comp(&bld->base, bld->dst[i]);
-      return bld->inv_dst[i];
-   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-      if(!bld->inv_dst[3]) /* shared with the SRC_ALPHA_SATURATE case above */
-         bld->inv_dst[3] = lp_build_comp(&bld->base, bld->dst[3]);
-      return bld->inv_dst[3];
-   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-      if(!bld->inv_con[i])
-         bld->inv_con[i] = lp_build_comp(&bld->base, bld->con[i]);
-      return bld->inv_con[i];
-   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-      if(!bld->inv_con[3])
-         bld->inv_con[3] = lp_build_comp(&bld->base, bld->con[3]);
-      return bld->inv_con[3];
-   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
-      /* TODO: not implemented; asserts, then yields zero */
-      assert(0);
-      return bld->base.zero;
-   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
-      /* TODO: not implemented; asserts, then yields zero */
-      assert(0);
-      return bld->base.zero;
-   default:
-      assert(0); /* unknown factor */
-      return bld->base.zero;
-   }
-}
-
-
-/**
- * Generate blend code in SOA mode, one channel at a time.
- * \param src  src/fragment color, one value per channel
- * \param dst  dst/framebuffer color, one value per channel
- * \param con  constant blend color, one value per channel
- * \param res  the result/output, written for all four channels
- */
-void
-lp_build_blend_soa(LLVMBuilderRef builder,
-                   const struct pipe_blend_state *blend,
-                   struct lp_type type,
-                   LLVMValueRef src[4],
-                   LLVMValueRef dst[4],
-                   LLVMValueRef con[4],
-                   LLVMValueRef res[4])
-{
-   struct lp_build_blend_soa_context bld;
-   unsigned i, j, k;
-
-   /* Setup build context; zeroing it leaves the lazily built entries NULL */
-   memset(&bld, 0, sizeof bld);
-   lp_build_context_init(&bld.base, builder, type);
-   for (i = 0; i < 4; ++i) {
-      bld.src[i] = src[i];
-      bld.dst[i] = dst[i];
-      bld.con[i] = con[i];
-   }
-   /* Channels are handled in order so that later ones can reuse earlier results. */
-   for (i = 0; i < 4; ++i) {
-      if (blend->colormask & (1 << i)) {
-         if (blend->logicop_enable) {
-            if(!type.floating) {
-               res[i] = lp_build_logicop(builder, blend->logicop_func, src[i], dst[i]);
-            }
-            else /* NOTE(review): float types get dst back unchanged here -- confirm intended */
-               res[i] = dst[i];
-         }
-         else if (blend->blend_enable) {
-            unsigned src_factor = i < 3 ? blend->rgb_src_factor : blend->alpha_src_factor;
-            unsigned dst_factor = i < 3 ? blend->rgb_dst_factor : blend->alpha_dst_factor;
-            unsigned func = i < 3 ? blend->rgb_func : blend->alpha_func;
-            boolean func_commutative = lp_build_blend_func_commutative(func);
-
-            /* It makes no sense to blend unless values are normalized */
-            assert(type.norm);
-
-            /*
-             * Compute src/dst factors: [0] is the src pair, [1] the dst pair.
-             */
-
-            bld.factor[0][0][i] = src[i];
-            bld.factor[0][1][i] = lp_build_blend_soa_factor(&bld, src_factor, i);
-            bld.factor[1][0][i] = dst[i];
-            bld.factor[1][1][i] = lp_build_blend_soa_factor(&bld, dst_factor, i);
-
-            /*
-             * Compute src/dst terms (color times factor)
-             */
-
-            for(k = 0; k < 2; ++k) {
-               /* See if this multiplication has been previously computed */
-               for(j = 0; j < i; ++j) {
-                  /* same operand pair, in either order */
-                  if((bld.factor[k][0][j] == bld.factor[k][0][i] &&
-                      bld.factor[k][1][j] == bld.factor[k][1][i]) ||
-                     (bld.factor[k][0][j] == bld.factor[k][1][i] &&
-                      bld.factor[k][1][j] == bld.factor[k][0][i]))
-                     break;
-               }
-
-               if(j < i) /* found: reuse the term of channel j */
-                  bld.term[k][i] = bld.term[k][j];
-               else
-                  bld.term[k][i] = lp_build_mul(&bld.base, bld.factor[k][0][i], bld.factor[k][1][i]);
-            }
-
-            /*
-             * Combine terms
-             */
-
-            /* See if this function has been previously applied */
-            for(j = 0; j < i; ++j) {
-               unsigned prev_func = j < 3 ? blend->rgb_func : blend->alpha_func;
-               unsigned func_reverse = lp_build_blend_func_reverse(func, prev_func);
-               /* match on identical terms, or on swapped terms when the swap is allowed */
-               if((!func_reverse &&
-                   bld.term[0][j] == bld.term[0][i] &&
-                   bld.term[1][j] == bld.term[1][i]) ||
-                  ((func_commutative || func_reverse) &&
-                   bld.term[0][j] == bld.term[1][i] &&
-                   bld.term[1][j] == bld.term[0][i]))
-                  break;
-            }
-
-            if(j < i) /* found: reuse the result of channel j */
-               res[i] = res[j];
-            else
-               res[i] = lp_build_blend_func(&bld.base, func, bld.term[0][i], bld.term[1][i]);
-         }
-         else {
-            res[i] = src[i]; /* neither logicop nor blending: pass src through */
-         }
-      }
-      else {
-         res[i] = dst[i]; /* channel masked out by colormask: keep dst */
-      }
-   }
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_const.c b/src/gallium/drivers/llvmpipe/lp_bld_const.c
deleted file mode 100644
index c8eaa8c394..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_const.c
+++ /dev/null
@@ -1,369 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-/**
- * @file
- * Helper functions for constant building.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-#include <float.h>
-
-#include "util/u_debug.h"
-
-#include "lp_bld_type.h"
-#include "lp_bld_const.h"
-
-/* Number of mantissa bits of the type, not counting any implicit leading bit. */
-unsigned
-lp_mantissa(struct lp_type type)
-{
-   assert(type.floating);
-   /* The non-float branch below is only reached when the assert above is compiled out. */
-   if(type.floating) {
-      switch(type.width) {
-      case 32:
-         return 23;   /* stored fraction bits of a 32-bit float */
-      case 64:
-         return 52;   /* stored fraction bits of a 64-bit float (53 counted the implicit bit) */
-      default:
-         assert(0);   /* unsupported float width */
-         return 0;
-      }
-   }
-   else {
-      if(type.sign)
-         return type.width - 1;   /* every bit except the sign */
-      else
-         return type.width;
-   }
-}
-
-
-/**
- * Shift count behind the type's representation of unity.
- *
- * lp_const_scale() is (1 << this shift) minus lp_const_offset().
- */
-unsigned
-lp_const_shift(struct lp_type type)
-{
-   /* The floating flag is tested first, then fixed, then norm. */
-   if(type.floating)
-      return 0;
-   if(type.fixed)
-      return type.width/2;
-   if(!type.norm)
-      return 0;
-   return type.width - (type.sign ? 1 : 0);
-}
-
-
-unsigned
-lp_const_offset(struct lp_type type)
-{
-   /* Only normalized integers are off by one: lp_const_scale() subtracts this
-    * value from 1 << lp_const_shift(). */
-   if(!type.floating && !type.fixed && type.norm)
-      return 1;
-
-   return 0;
-}
-
-
-/**
- * Factor that converts a value into the type's native LLVM encoding.
- *
- * Computed as (1 << lp_const_shift()) - lp_const_offset(), so it is 1.0
- * whenever both of those helpers return zero.
- */
-double
-lp_const_scale(struct lp_type type)
-{
-   const unsigned long long int_scale =
-      ((unsigned long long)1 << lp_const_shift(type)) - lp_const_offset(type);
-   const double scale = (double)int_scale;
-
-   /* The integer scale has to be exactly representable as a double, i.e. it
-    * must survive the round trip unchanged. */
-   assert((unsigned long long)scale == int_scale);
-
-   return scale;
-}
-
-
-/**
- * Minimum value representable by the type.
- */
-double
-lp_const_min(struct lp_type type)
-{
-   unsigned bits;
-   /* Unsigned types bottom out at zero, whatever their other flags say. */
-   if(!type.sign)
-      return 0.0;
-   /* Signed normalized types bottom out at -1. */
-   if(type.norm)
-      return -1.0;
-
-   if (type.floating) {
-      switch(type.width) {
-      case 32:
-         return -FLT_MAX;
-      case 64:
-         return -DBL_MAX;
-      default:
-         assert(0);   /* unsupported float width */
-         return 0.0;
-      }
-   }
-   /* Integers: -(2^bits), where bits excludes the sign bit. */
-   if(type.fixed)
-      /* FIXME: consider the fractional bits? */
-      bits = type.width / 2 - 1;
-   else
-      bits = type.width - 1;
-   /* Shift as unsigned: (long long)1 << 63 would overflow the signed type. */
-   return -(double)((unsigned long long)1 << bits);
-}
-
-
-/**
- * Maximum value representable by the type.
- */
-double
-lp_const_max(struct lp_type type)
-{
-   unsigned bits;
-   const unsigned long long all_ones = ~(unsigned long long)0;
-   if(type.norm)
-      return 1.0;
-
-   if (type.floating) {
-      switch(type.width) {
-      case 32:
-         return FLT_MAX;
-      case 64:
-         return DBL_MAX;
-      default:
-         assert(0);   /* unsupported float width */
-         return 0.0;
-      }
-   }
-   /* Integers: 2^bits - 1 over the integer bits (half the width if fixed). */
-   if(type.fixed)
-      bits = type.width / 2;
-   else
-      bits = type.width;
-
-   if(type.sign)
-      bits -= 1;   /* the sign bit is not a value bit */
-   /* A shift by the operand's full width is undefined; all bits set is the intended value. */
-   return (double)(bits >= 8 * sizeof all_ones ? all_ones : ((unsigned long long)1 << bits) - 1);
-}
-
-/* Smallest step of the type: machine epsilon for floats, else 1/lp_const_scale(). */
-double
-lp_const_eps(struct lp_type type)
-{
-   /* Non-float types: one unit of the native encoding, i.e. the reciprocal
-    * of the scale. */
-   if (!type.floating)
-      return 1.0/lp_const_scale(type);
-
-   /* Float types: the epsilon constant of the C type that has the same
-    * width. */
-   if (type.width == 32)
-      return FLT_EPSILON;
-   if (type.width == 64)
-      return DBL_EPSILON;
-
-   /* Unsupported float width. */
-   assert(0);
-   return 0.0;
-}
-
-
-LLVMValueRef
-lp_build_undef(struct lp_type type)
-{
-   /* Undefined value of the vector type that lp_build_vec_type() gives for `type`. */
-   return LLVMGetUndef(lp_build_vec_type(type));
-}
-               
-
-LLVMValueRef
-lp_build_zero(struct lp_type type)
-{
-   /* Null constant of the vector type that lp_build_vec_type() gives for `type`. */
-   return LLVMConstNull(lp_build_vec_type(type));
-}
-               
-/* Build a constant vector in which every element holds the type's representation of one. */
-LLVMValueRef
-lp_build_one(struct lp_type type)
-{
-   LLVMTypeRef elem_type;
-   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
-   unsigned i;
-
-   assert(type.length <= LP_MAX_VECTOR_LENGTH);
-
-   elem_type = lp_build_elem_type(type);
-   /* Pick the first element by type class; the unsigned normalized case returns early. */
-   if(type.floating)
-      elems[0] = LLVMConstReal(elem_type, 1.0);
-   else if(type.fixed)
-      elems[0] = LLVMConstInt(elem_type, 1LL << (type.width/2), 0);
-   else if(!type.norm)
-      elems[0] = LLVMConstInt(elem_type, 1, 0);
-   else if(type.sign)
-      elems[0] = LLVMConstInt(elem_type, (1LL << (type.width - 1)) - 1, 0);
-   else {
-      /* special case -- 1.0 for normalized types is more easily attained if
-       * we start with a vector consisting of all bits set */
-      LLVMTypeRef vec_type = LLVMVectorType(elem_type, type.length);
-      LLVMValueRef vec = LLVMConstAllOnes(vec_type);
-
-#if 0
-      if(type.sign)
-         /* TODO: Unfortunately this caused "Tried to create a shift operation
-          * on a non-integer type!" */
-         vec = LLVMConstLShr(vec, lp_build_int_const_scalar(type, 1));
-#endif
-
-      return vec;
-   }
-   /* Replicate the first element into the remaining lanes. */
-   for(i = 1; i < type.length; ++i)
-      elems[i] = elems[0];
-
-   return LLVMConstVector(elems, type.length);
-}
-               
-
-LLVMValueRef
-lp_build_const_scalar(struct lp_type type,
-                      double val)
-{
-   LLVMTypeRef elem_type = lp_build_elem_type(type);
-   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
-   unsigned i;
-
-   assert(type.length <= LP_MAX_VECTOR_LENGTH);
-   if(type.floating) {
-      elems[0] = LLVMConstReal(elem_type, val);
-   }
-   else {
-      /* Scale to the native encoding and round to nearest.  Negative values go
-       * through long long: converting them straight to unsigned is undefined. */
-      double scaled = val*lp_const_scale(type);
-      unsigned long long bits = scaled < 0.0 ? (unsigned long long)(long long)(scaled - 0.5)
-                                             : (unsigned long long)(scaled + 0.5);
-      elems[0] = LLVMConstInt(elem_type, bits, 0);
-   }
-   for(i = 1; i < type.length; ++i)   /* replicate into the remaining lanes */
-      elems[i] = elems[0];
-   return LLVMConstVector(elems, type.length);
-}
-
-
-LLVMValueRef
-lp_build_int_const_scalar(struct lp_type type,
-                          long long val)
-{
-   /* Fill an integer vector of type.length lanes with `val`. */
-   const LLVMTypeRef int_elem_type = lp_build_int_elem_type(type);
-   const int sign_extend = type.sign ? 1 : 0;
-   LLVMValueRef lanes[LP_MAX_VECTOR_LENGTH];
-   unsigned lane = 0;
-
-   assert(type.length <= LP_MAX_VECTOR_LENGTH);
-   while(lane < type.length)
-      lanes[lane++] = LLVMConstInt(int_elem_type, val, sign_extend);
-   return LLVMConstVector(lanes, type.length);
-}
-
-
-LLVMValueRef
-lp_build_const_aos(struct lp_type type,
-                   double r, double g, double b, double a,
-                   const unsigned char *swizzle)
-{
-   const unsigned char default_swizzle[4] = {0, 1, 2, 3};
-   const double rgba[4] = {r, g, b, a};
-   LLVMTypeRef elem_type;
-   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
-   unsigned i;
-
-   assert(type.length % 4 == 0);
-   assert(type.length <= LP_MAX_VECTOR_LENGTH);
-   elem_type = lp_build_elem_type(type);
-
-   /* A NULL swizzle stores the channels in r, g, b, a order. */
-   if(swizzle == NULL)
-      swizzle = default_swizzle;
-
-   if(type.floating) {
-      for(i = 0; i < 4; ++i)
-         elems[swizzle[i]] = LLVMConstReal(elem_type, rgba[i]);
-   }
-   else {
-      /* Scale to the native encoding and round to nearest.  Negative values go
-       * through long long: converting them straight to unsigned is undefined. */
-      double dscale = lp_const_scale(type);
-      for(i = 0; i < 4; ++i) {
-         double scaled = rgba[i]*dscale;
-         elems[swizzle[i]] = LLVMConstInt(elem_type, scaled < 0.0 ? (unsigned long long)(long long)(scaled - 0.5)
-                                                                  : (unsigned long long)(scaled + 0.5), 0);
-      }
-   }
-   /* Repeat the first four elements across the rest of the vector. */
-   for(i = 4; i < type.length; ++i)
-      elems[i] = elems[i % 4];
-   return LLVMConstVector(elems, type.length);
-}
-
-
-LLVMValueRef
-lp_build_const_mask_aos(struct lp_type type,
-                        const boolean cond[4])
-{
-   const LLVMTypeRef int_type = LLVMIntType(type.width);
-   LLVMValueRef lanes[LP_MAX_VECTOR_LENGTH];
-   unsigned group, chan;
-
-   assert(type.length <= LP_MAX_VECTOR_LENGTH);
-   /* Lanes come in groups of four: all ones where cond[] is set, else zero. */
-   for(group = 0; group < type.length; group += 4) {
-      for(chan = 0; chan < 4; ++chan)
-         lanes[group + chan] = LLVMConstInt(int_type, cond[chan] ? ~0 : 0, 0);
-   }
-   return LLVMConstVector(lanes, type.length);
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_const.h b/src/gallium/drivers/llvmpipe/lp_bld_const.h
deleted file mode 100644
index cb8e1c7b00..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_const.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * Helper functions for constant building.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#ifndef LP_BLD_CONST_H
-#define LP_BLD_CONST_H
-
-
-#include <llvm-c/Core.h>  
-
-#include <pipe/p_compiler.h>
-
-
-struct lp_type;
-
-/* Number of mantissa bits; the definition asserts that the type is floating-point. */
-unsigned
-lp_mantissa(struct lp_type type);
-
-/* Shift count from which lp_const_scale() is derived. */
-unsigned
-lp_const_shift(struct lp_type type);
-
-/* 1 for normalized integer types, 0 for all others. */
-unsigned
-lp_const_offset(struct lp_type type);
-
-/* (1 << lp_const_shift()) - lp_const_offset(), returned as a double. */
-double
-lp_const_scale(struct lp_type type);
-/* Minimum value representable by the type. */
-double
-lp_const_min(struct lp_type type);
-
-/* Maximum value representable by the type. */
-double
-lp_const_max(struct lp_type type);
-
-/* FLT_EPSILON / DBL_EPSILON for floats, 1/lp_const_scale() for other types. */
-double
-lp_const_eps(struct lp_type type);
-
-/* Undefined value of the type's vector type. */
-LLVMValueRef
-lp_build_undef(struct lp_type type);
-
-/* Null constant of the type's vector type. */
-LLVMValueRef
-lp_build_zero(struct lp_type type);
-
-/* Constant vector holding the type's representation of one in every element. */
-LLVMValueRef
-lp_build_one(struct lp_type type);
-
-/* Constant vector with every element set to val (scaled by lp_const_scale() for non-floats). */
-LLVMValueRef
-lp_build_const_scalar(struct lp_type type,
-                      double val);
-
-/* Constant integer vector with every element set to val. */
-LLVMValueRef
-lp_build_int_const_scalar(struct lp_type type,
-                          long long val);
-
-/* Constant vector repeating r, g, b, a, placed according to swizzle (NULL means 0, 1, 2, 3). */
-LLVMValueRef
-lp_build_const_aos(struct lp_type type, 
-                   double r, double g, double b, double a, 
-                   const unsigned char *swizzle);
-
-/* Constant vector of all-ones or zero elements chosen by cond[], repeated every four elements. */
-LLVMValueRef
-lp_build_const_mask_aos(struct lp_type type,
-                        const boolean cond[4]);
-
-
-#endif /* !LP_BLD_CONST_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_conv.c b/src/gallium/drivers/llvmpipe/lp_bld_conv.c
deleted file mode 100644
index 9935209437..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_conv.c
+++ /dev/null
@@ -1,467 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-/**
- * @file
- * Helper functions for type conversions.
- *
- * We want to use the fastest type for a given computation whenever feasible.
- * The other side of this is that we need to be able convert between several
- * types accurately and efficiently.
- *
- * Conversion between types of different bit width is quite complex since a 
- *
- * To remember there are a few invariants in type conversions:
- *
- * - register width must remain constant:
- *
- *     src_type.width * src_type.length == dst_type.width * dst_type.length
- *
- * - total number of elements must remain constant:
- *
- *     src_type.length * num_srcs == dst_type.length * num_dsts
- *
- * It is not always possible to do the conversion both accurately and
- * efficiently, usually due to lack of adequate machine instructions. In these
- * cases it is important not to cut shortcuts here and sacrifice accuracy, as
- * there this functions can be used anywhere. In the future we might have a
- * precision parameter which can gauge the accuracy vs efficiency compromise,
- * but for now if the data conversion between two stages happens to be the
- * bottleneck, then most likely should just avoid converting at all and run
- * both stages with the same type.
- *
- * Make sure to run lp_test_conv unit test after any change to this file.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#include "util/u_debug.h"
-#include "util/u_math.h"
-#include "util/u_cpu_detect.h"
-
-#include "lp_bld_type.h"
-#include "lp_bld_const.h"
-#include "lp_bld_intr.h"
-#include "lp_bld_arit.h"
-#include "lp_bld_pack.h"
-#include "lp_bld_conv.h"
-
-
-/**
- * Special case for converting clamped IEEE-754 floats to unsigned norms.
- *
- * The mathematical voodoo below may seem excessive but it is actually
- * paramount we do it this way for several reasons. First, there is no single
- * precision FP to unsigned integer conversion Intel SSE instruction. Second,
- * secondly, even if there was, since the FP's mantissa takes only a fraction
- * of register bits the typically scale and cast approach would require double
- * precision for accurate results, and therefore half the throughput
- *
- * Although the result values can be scaled to an arbitrary bit width specified
- * by dst_width, the actual result type will have the same width.
- */
-LLVMValueRef
-lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
-                                        struct lp_type src_type,
-                                        unsigned dst_width,
-                                        LLVMValueRef src)
-{
-   LLVMTypeRef int_vec_type = lp_build_int_vec_type(src_type);
-   LLVMValueRef res;
-   unsigned mantissa;
-   unsigned n;
-   unsigned long long ubound;
-   unsigned long long mask;
-   double scale;
-   double bias;
-
-   assert(src_type.floating);
-
-   mantissa = lp_mantissa(src_type);
-
-   /* We cannot carry more bits than the mantissa */
-   n = MIN2(mantissa, dst_width);
-
-   /* This magic coefficients will make the desired result to appear in the
-    * lowest significant bits of the mantissa.
-    */
-   ubound = ((unsigned long long)1 << n);
-   mask = ubound - 1;
-   scale = (double)mask/ubound;
-   bias = (double)((unsigned long long)1 << (mantissa - n));
-
-   res = LLVMBuildMul(builder, src, lp_build_const_scalar(src_type, scale), "");
-   res = LLVMBuildAdd(builder, res, lp_build_const_scalar(src_type, bias), "");
-   res = LLVMBuildBitCast(builder, res, int_vec_type, "");
-
-   if(dst_width > n) {
-      int shift = dst_width - n;
-      res = LLVMBuildShl(builder, res, lp_build_int_const_scalar(src_type, shift), "");
-
-      /* TODO: Fill in the empty lower bits for additional precision? */
-#if 0
-      {
-         LLVMValueRef msb;
-         msb = LLVMBuildLShr(builder, res, lp_build_int_const_scalar(src_type, dst_width - 1), "");
-         msb = LLVMBuildShl(builder, msb, lp_build_int_const_scalar(src_type, shift), "");
-         msb = LLVMBuildSub(builder, msb, lp_build_int_const_scalar(src_type, 1), "");
-         res = LLVMBuildOr(builder, res, msb, "");
-      }
-#elif 0
-      while(shift > 0) {
-         res = LLVMBuildOr(builder, res, LLVMBuildLShr(builder, res, lp_build_int_const_scalar(src_type, n), ""), "");
-         shift -= n;
-         n *= 2;
-      }
-#endif
-   }
-   else
-      res = LLVMBuildAnd(builder, res, lp_build_int_const_scalar(src_type, mask), "");
-
-   return res;
-}
-
-
-/**
- * Inverse of lp_build_clamped_float_to_unsigned_norm above.
- */
-LLVMValueRef
-lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
-                                unsigned src_width,
-                                struct lp_type dst_type,
-                                LLVMValueRef src)
-{
-   LLVMTypeRef vec_type = lp_build_vec_type(dst_type);
-   LLVMTypeRef int_vec_type = lp_build_int_vec_type(dst_type);
-   LLVMValueRef bias_;
-   LLVMValueRef res;
-   unsigned mantissa;
-   unsigned n;
-   unsigned long long ubound;
-   unsigned long long mask;
-   double scale;
-   double bias;
-
-   mantissa = lp_mantissa(dst_type);
-
-   n = MIN2(mantissa, src_width);
-
-   ubound = ((unsigned long long)1 << n);
-   mask = ubound - 1;
-   scale = (double)ubound/mask;
-   bias = (double)((unsigned long long)1 << (mantissa - n));
-
-   res = src;
-
-   if(src_width > mantissa) {
-      int shift = src_width - mantissa;
-      res = LLVMBuildLShr(builder, res, lp_build_int_const_scalar(dst_type, shift), "");
-   }
-
-   bias_ = lp_build_const_scalar(dst_type, bias);
-
-   res = LLVMBuildOr(builder,
-                     res,
-                     LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");
-
-   res = LLVMBuildBitCast(builder, res, vec_type, "");
-
-   res = LLVMBuildSub(builder, res, bias_, "");
-   res = LLVMBuildMul(builder, res, lp_build_const_scalar(dst_type, scale), "");
-
-   return res;
-}
-
-
-/**
- * Generic type conversion.
- *
- * TODO: Take a precision argument, or even better, add a new precision member
- * to the lp_type union.
- */
-void
-lp_build_conv(LLVMBuilderRef builder,
-              struct lp_type src_type,
-              struct lp_type dst_type,
-              const LLVMValueRef *src, unsigned num_srcs,
-              LLVMValueRef *dst, unsigned num_dsts)
-{
-   struct lp_type tmp_type;
-   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
-   unsigned num_tmps;
-   unsigned i;
-
-   /* Register width must remain constant */
-   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
-
-   /* We must not loose or gain channels. Only precision */
-   assert(src_type.length * num_srcs == dst_type.length * num_dsts);
-
-   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
-   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
-
-   tmp_type = src_type;
-   for(i = 0; i < num_srcs; ++i)
-      tmp[i] = src[i];
-   num_tmps = num_srcs;
-
-   /*
-    * Clamp if necessary
-    */
-
-   if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
-      struct lp_build_context bld;
-      double src_min = lp_const_min(src_type);
-      double dst_min = lp_const_min(dst_type);
-      double src_max = lp_const_max(src_type);
-      double dst_max = lp_const_max(dst_type);
-      LLVMValueRef thres;
-
-      lp_build_context_init(&bld, builder, tmp_type);
-
-      if(src_min < dst_min) {
-         if(dst_min == 0.0)
-            thres = bld.zero;
-         else
-            thres = lp_build_const_scalar(src_type, dst_min);
-         for(i = 0; i < num_tmps; ++i)
-            tmp[i] = lp_build_max(&bld, tmp[i], thres);
-      }
-
-      if(src_max > dst_max) {
-         if(dst_max == 1.0)
-            thres = bld.one;
-         else
-            thres = lp_build_const_scalar(src_type, dst_max);
-         for(i = 0; i < num_tmps; ++i)
-            tmp[i] = lp_build_min(&bld, tmp[i], thres);
-      }
-   }
-
-   /*
-    * Scale to the narrowest range
-    */
-
-   if(dst_type.floating) {
-      /* Nothing to do */
-   }
-   else if(tmp_type.floating) {
-      if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
-         for(i = 0; i < num_tmps; ++i) {
-            tmp[i] = lp_build_clamped_float_to_unsigned_norm(builder,
-                                                             tmp_type,
-                                                             dst_type.width,
-                                                             tmp[i]);
-         }
-         tmp_type.floating = FALSE;
-      }
-      else {
-         double dst_scale = lp_const_scale(dst_type);
-         LLVMTypeRef tmp_vec_type;
-
-         if (dst_scale != 1.0) {
-            LLVMValueRef scale = lp_build_const_scalar(tmp_type, dst_scale);
-            for(i = 0; i < num_tmps; ++i)
-               tmp[i] = LLVMBuildMul(builder, tmp[i], scale, "");
-         }
-
-         /* Use an equally sized integer for intermediate computations */
-         tmp_type.floating = FALSE;
-         tmp_vec_type = lp_build_vec_type(tmp_type);
-         for(i = 0; i < num_tmps; ++i) {
-#if 0
-            if(dst_type.sign)
-               tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
-            else
-               tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
-#else
-           /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
-            tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
-#endif
-         }
-      }
-   }
-   else {
-      unsigned src_shift = lp_const_shift(src_type);
-      unsigned dst_shift = lp_const_shift(dst_type);
-
-      /* FIXME: compensate different offsets too */
-      if(src_shift > dst_shift) {
-         LLVMValueRef shift = lp_build_int_const_scalar(tmp_type, src_shift - dst_shift);
-         for(i = 0; i < num_tmps; ++i)
-            if(src_type.sign)
-               tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, "");
-            else
-               tmp[i] = LLVMBuildLShr(builder, tmp[i], shift, "");
-      }
-   }
-
-   /*
-    * Truncate or expand bit width
-    */
-
-   assert(!tmp_type.floating || tmp_type.width == dst_type.width);
-
-   if(tmp_type.width > dst_type.width) {
-      assert(num_dsts == 1);
-      tmp[0] = lp_build_pack(builder, tmp_type, dst_type, TRUE, tmp, num_tmps);
-      tmp_type.width = dst_type.width;
-      tmp_type.length = dst_type.length;
-      num_tmps = 1;
-   }
-
-   if(tmp_type.width < dst_type.width) {
-      assert(num_tmps == 1);
-      lp_build_unpack(builder, tmp_type, dst_type, tmp[0], tmp, num_dsts);
-      tmp_type.width = dst_type.width;
-      tmp_type.length = dst_type.length;
-      num_tmps = num_dsts;
-   }
-
-   assert(tmp_type.width == dst_type.width);
-   assert(tmp_type.length == dst_type.length);
-   assert(num_tmps == num_dsts);
-
-   /*
-    * Scale to the widest range
-    */
-
-   if(src_type.floating) {
-      /* Nothing to do */
-   }
-   else if(!src_type.floating && dst_type.floating) {
-      if(!src_type.fixed && !src_type.sign && src_type.norm) {
-         for(i = 0; i < num_tmps; ++i) {
-            tmp[i] = lp_build_unsigned_norm_to_float(builder,
-                                                     src_type.width,
-                                                     dst_type,
-                                                     tmp[i]);
-         }
-         tmp_type.floating = TRUE;
-      }
-      else {
-         double src_scale = lp_const_scale(src_type);
-         LLVMTypeRef tmp_vec_type;
-
-         /* Use an equally sized integer for intermediate computations */
-         tmp_type.floating = TRUE;
-         tmp_type.sign = TRUE;
-         tmp_vec_type = lp_build_vec_type(tmp_type);
-         for(i = 0; i < num_tmps; ++i) {
-#if 0
-            if(dst_type.sign)
-               tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
-            else
-               tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
-#else
-            /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
-            tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
-#endif
-          }
-
-          if (src_scale != 1.0) {
-             LLVMValueRef scale = lp_build_const_scalar(tmp_type, 1.0/src_scale);
-             for(i = 0; i < num_tmps; ++i)
-                tmp[i] = LLVMBuildMul(builder, tmp[i], scale, "");
-          }
-      }
-    }
-    else {
-       unsigned src_shift = lp_const_shift(src_type);
-       unsigned dst_shift = lp_const_shift(dst_type);
-
-       /* FIXME: compensate different offsets too */
-       if(src_shift < dst_shift) {
-          LLVMValueRef shift = lp_build_int_const_scalar(tmp_type, dst_shift - src_shift);
-          for(i = 0; i < num_tmps; ++i)
-             tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
-       }
-    }
-
-   for(i = 0; i < num_dsts; ++i)
-      dst[i] = tmp[i];
-}
-
-
-/**
- * Bit mask conversion.
- *
- * This will convert the integer masks that match the given types.
- *
- * The mask values should 0 or -1, i.e., all bits either set to zero or one.
- * Any other value will likely cause in unpredictable results.
- *
- * This is basically a very trimmed down version of lp_build_conv.
- */
-void
-lp_build_conv_mask(LLVMBuilderRef builder,
-                   struct lp_type src_type,
-                   struct lp_type dst_type,
-                   const LLVMValueRef *src, unsigned num_srcs,
-                   LLVMValueRef *dst, unsigned num_dsts)
-{
-   /* Register width must remain constant */
-   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
-
-   /* We must not loose or gain channels. Only precision */
-   assert(src_type.length * num_srcs == dst_type.length * num_dsts);
-
-   /*
-    * Drop
-    *
-    * We assume all values are 0 or -1
-    */
-
-   src_type.floating = FALSE;
-   src_type.fixed = FALSE;
-   src_type.sign = TRUE;
-   src_type.norm = FALSE;
-
-   dst_type.floating = FALSE;
-   dst_type.fixed = FALSE;
-   dst_type.sign = TRUE;
-   dst_type.norm = FALSE;
-
-   /*
-    * Truncate or expand bit width
-    */
-
-   if(src_type.width > dst_type.width) {
-      assert(num_dsts == 1);
-      dst[0] = lp_build_pack(builder, src_type, dst_type, TRUE, src, num_srcs);
-   }
-   else if(src_type.width < dst_type.width) {
-      assert(num_srcs == 1);
-      lp_build_unpack(builder, src_type, dst_type, src[0], dst, num_dsts);
-   }
-   else {
-      assert(num_srcs == num_dsts);
-      memcpy(dst, src, num_dsts * sizeof *dst);
-   }
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_conv.h b/src/gallium/drivers/llvmpipe/lp_bld_conv.h
deleted file mode 100644
index 948e68fae4..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_conv.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * Helper functions for type conversions.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#ifndef LP_BLD_CONV_H
-#define LP_BLD_CONV_H
-
-
-#include <llvm-c/Core.h>  
-
-
-struct lp_type;
-
-
-LLVMValueRef
-lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
-                                        struct lp_type src_type,
-                                        unsigned dst_width,
-                                        LLVMValueRef src);
-
-LLVMValueRef
-lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
-                                unsigned src_width,
-                                struct lp_type dst_type,
-                                LLVMValueRef src);
-
-
-void
-lp_build_conv(LLVMBuilderRef builder,
-              struct lp_type src_type,
-              struct lp_type dst_type,
-              const LLVMValueRef *srcs, unsigned num_srcs,
-              LLVMValueRef *dsts, unsigned num_dsts);
-
-void
-lp_build_conv_mask(LLVMBuilderRef builder,
-                   struct lp_type src_type,
-                   struct lp_type dst_type,
-                   const LLVMValueRef *src, unsigned num_srcs,
-                   LLVMValueRef *dst, unsigned num_dsts);
-
-#endif /* !LP_BLD_CONV_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_debug.c b/src/gallium/drivers/llvmpipe/lp_bld_debug.c
deleted file mode 100644
index 39dfc51e50..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_debug.c
+++ /dev/null
@@ -1,132 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-#ifdef HAVE_UDIS86
-#include <udis86.h>
-#endif
-
-#include "util/u_math.h"
-#include "util/u_debug.h"
-#include "lp_bld_debug.h"
-
-
-/**
- * Check alignment.
- *
- * It is important that this check is not implemented as a macro or inlined
- * function, as the compiler assumptions in respect to alignment of global
- * and stack variables would often make the check a no op, defeating the
- * whole purpose of the exercise.
- */
-boolean
-lp_check_alignment(const void *ptr, unsigned alignment)
-{
-   assert(util_is_pot(alignment));
-   return ((uintptr_t)ptr & (alignment - 1)) == 0;
-}
-
-
-void
-lp_disassemble(const void* func)
-{
-#ifdef HAVE_UDIS86
-   ud_t ud_obj;
-   uint64_t max_jmp_pc;
-
-   ud_init(&ud_obj);
-
-   ud_set_input_buffer(&ud_obj, (void*)func, 0xffff);
-
-   max_jmp_pc = (uint64_t) (uintptr_t) func;
-   ud_set_pc(&ud_obj, max_jmp_pc);
-
-#ifdef PIPE_ARCH_X86
-   ud_set_mode(&ud_obj, 32);
-#endif
-#ifdef PIPE_ARCH_X86_64
-   ud_set_mode(&ud_obj, 64);
-#endif
-
-   ud_set_syntax(&ud_obj, UD_SYN_ATT);
-
-   while (ud_disassemble(&ud_obj)) {
-
-#ifdef PIPE_ARCH_X86
-      debug_printf("0x%08lx:\t", (unsigned long)ud_insn_off(&ud_obj));
-#endif
-#ifdef PIPE_ARCH_X86_64
-      debug_printf("0x%016llx:\t", (unsigned long long)ud_insn_off(&ud_obj));
-#endif
-
-#if 0
-      debug_printf("%-16s ", ud_insn_hex(&ud_obj));
-#endif
-
-      debug_printf("%s\n", ud_insn_asm(&ud_obj));
-
-      if(ud_obj.mnemonic != UD_Icall) {
-         unsigned i;
-         for(i = 0; i < 3; ++i) {
-            const struct ud_operand *op = &ud_obj.operand[i];
-            if (op->type == UD_OP_JIMM){
-               uint64_t pc = ud_obj.pc;
-
-               switch (op->size) {
-               case 8:
-                  pc += op->lval.sbyte;
-                  break;
-               case 16:
-                  pc += op->lval.sword;
-                  break;
-               case 32:
-                  pc += op->lval.sdword;
-                  break;
-               default:
-                  break;
-               }
-               if(pc > max_jmp_pc)
-                  max_jmp_pc = pc;
-            }
-         }
-      }
-
-      if ((ud_insn_off(&ud_obj) >= max_jmp_pc && ud_obj.mnemonic == UD_Iret) ||
-           ud_obj.mnemonic == UD_Iinvalid)
-         break;
-   }
-
-#if 0
-   /* Print GDB command, useful to verify udis86 output */
-   debug_printf("disassemble %p %p\n", func, (void*)(uintptr_t)ud_obj.pc);
-#endif
-
-   debug_printf("\n");
-#else
-   (void)func;
-#endif
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
deleted file mode 100644
index d438c0e63d..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ /dev/null
@@ -1,213 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * Depth/stencil testing to LLVM IR translation.
- *
- * To be done accurately/efficiently the depth/stencil test must be done with
- * the same type/format of the depth/stencil buffer, which implies massaging
- * the incoming depths to fit into place. Using a more straightforward
- * type/format for depth/stencil values internally and only convert when
- * flushing would avoid this, but it would most likely result in depth fighting
- * artifacts.
- *
- * We are free to use a different pixel layout though. Since our basic
- * processing unit is a quad (2x2 pixel block) we store the depth/stencil
- * values tiled, a quad at time. That is, a depth buffer containing 
- *
- *  Z11 Z12 Z13 Z14 ...
- *  Z21 Z22 Z23 Z24 ...
- *  Z31 Z32 Z33 Z34 ...
- *  Z41 Z42 Z43 Z44 ...
- *  ... ... ... ... ...
- *
- * will actually be stored in memory as
- *
- *  Z11 Z12 Z21 Z22 Z13 Z14 Z23 Z24 ...
- *  Z31 Z32 Z41 Z42 Z33 Z34 Z43 Z44 ...
- *  ... ... ... ... ... ... ... ... ...
- *
- * FIXME: Code generate stencil test
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-#include "pipe/p_state.h"
-#include "util/u_format.h"
-
-#include "lp_bld_type.h"
-#include "lp_bld_const.h"
-#include "lp_bld_logic.h"
-#include "lp_bld_flow.h"
-#include "lp_bld_debug.h"
-#include "lp_bld_depth.h"
-
-
-/**
- * Return a type appropriate for depth/stencil testing.
- */
-struct lp_type
-lp_depth_type(const struct util_format_description *format_desc,
-              unsigned length)
-{
-   struct lp_type type;
-   unsigned swizzle;
-
-   assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
-   assert(format_desc->block.width == 1);
-   assert(format_desc->block.height == 1);
-
-   swizzle = format_desc->swizzle[0];
-   assert(swizzle < 4);
-
-   memset(&type, 0, sizeof type);
-   type.width = format_desc->block.bits;
-
-   if(format_desc->channel[swizzle].type == UTIL_FORMAT_TYPE_FLOAT) {
-      type.floating = TRUE;
-      assert(swizzle == 0);
-      assert(format_desc->channel[swizzle].size == format_desc->block.bits);
-   }
-   else if(format_desc->channel[swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED) {
-      assert(format_desc->block.bits <= 32);
-      if(format_desc->channel[swizzle].normalized)
-         type.norm = TRUE;
-   }
-   else
-      assert(0);
-
-   assert(type.width <= length);
-   type.length = length / type.width;
-
-   return type;
-}
-
-
-/**
- * Depth test.
- */
-void
-lp_build_depth_test(LLVMBuilderRef builder,
-                    const struct pipe_depth_state *state,
-                    struct lp_type type,
-                    const struct util_format_description *format_desc,
-                    struct lp_build_mask_context *mask,
-                    LLVMValueRef src,
-                    LLVMValueRef dst_ptr)
-{
-   struct lp_build_context bld;
-   unsigned z_swizzle;
-   LLVMValueRef dst;
-   LLVMValueRef z_bitmask = NULL;
-   LLVMValueRef test;
-
-   if(!state->enabled)
-      return;
-
-   assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
-   assert(format_desc->block.width == 1);
-   assert(format_desc->block.height == 1);
-
-   z_swizzle = format_desc->swizzle[0];
-   if(z_swizzle == UTIL_FORMAT_SWIZZLE_NONE)
-      return;
-
-   /* Sanity checking */
-   assert(z_swizzle < 4);
-   assert(format_desc->block.bits == type.width);
-   if(type.floating) {
-      assert(z_swizzle == 0);
-      assert(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_FLOAT);
-      assert(format_desc->channel[z_swizzle].size == format_desc->block.bits);
-   }
-   else {
-      assert(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED);
-      assert(format_desc->channel[z_swizzle].normalized);
-      assert(!type.fixed);
-      assert(!type.sign);
-      assert(type.norm);
-   }
-
-   /* Setup build context */
-   lp_build_context_init(&bld, builder, type);
-
-   dst = LLVMBuildLoad(builder, dst_ptr, "");
-
-   lp_build_name(dst, "zsbuf");
-
-   /* Align the source depth bits with the destination's, and mask out any
-    * stencil or padding bits from both */
-   if(format_desc->channel[z_swizzle].size == format_desc->block.bits) {
-      assert(z_swizzle == 0);
-      /* nothing to do */
-   }
-   else {
-      unsigned padding_left;
-      unsigned padding_right;
-      unsigned chan;
-
-      assert(format_desc->layout == UTIL_FORMAT_LAYOUT_ARITH);
-      assert(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED);
-      assert(format_desc->channel[z_swizzle].size <= format_desc->block.bits);
-      assert(format_desc->channel[z_swizzle].normalized);
-
-      padding_right = 0;
-      for(chan = 0; chan < z_swizzle; ++chan)
-         padding_right += format_desc->channel[chan].size;
-      padding_left = format_desc->block.bits -
-                     (padding_right + format_desc->channel[z_swizzle].size);
-
-      if(padding_left || padding_right) {
-         const unsigned long long mask_left = ((unsigned long long)1 << (format_desc->block.bits - padding_left)) - 1;
-         const unsigned long long mask_right = ((unsigned long long)1 << (padding_right)) - 1;
-         z_bitmask = lp_build_int_const_scalar(type, mask_left ^ mask_right);
-      }
-
-      if(padding_left)
-         src = LLVMBuildLShr(builder, src, lp_build_int_const_scalar(type, padding_left), "");
-      if(padding_right)
-         src = LLVMBuildAnd(builder, src, z_bitmask, "");
-      if(padding_left || padding_right)
-         dst = LLVMBuildAnd(builder, dst, z_bitmask, "");
-   }
-
-   lp_build_name(dst, "zsbuf.z");
-
-   test = lp_build_cmp(&bld, state->func, src, dst);
-   lp_build_mask_update(mask, test);
-
-   if(state->writemask) {
-      if(z_bitmask)
-         z_bitmask = LLVMBuildAnd(builder, mask->value, z_bitmask, "");
-      else
-         z_bitmask = mask->value;
-
-      dst = lp_build_select(&bld, z_bitmask, src, dst);
-      LLVMBuildStore(builder, dst, dst_ptr);
-   }
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.h b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
deleted file mode 100644
index 79d6981bb5..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-/**
- * Depth/stencil testing to LLVM IR translation.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-#ifndef LP_BLD_DEPTH_H
-#define LP_BLD_DEPTH_H
-
-
-#include <llvm-c/Core.h>  
-
- 
-struct pipe_depth_state;
-struct util_format_description;
-struct lp_type;
-struct lp_build_mask_context;
-
-
-struct lp_type
-lp_depth_type(const struct util_format_description *format_desc,
-              unsigned length);
-
-
-void
-lp_build_depth_test(LLVMBuilderRef builder,
-                    const struct pipe_depth_state *state,
-                    struct lp_type type,
-                    const struct util_format_description *format_desc,
-                    struct lp_build_mask_context *mask,
-                    LLVMValueRef src,
-                    LLVMValueRef dst_ptr);
-
-
-#endif /* !LP_BLD_DEPTH_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_flow.c b/src/gallium/drivers/llvmpipe/lp_bld_flow.c
deleted file mode 100644
index 25c10af29f..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_flow.c
+++ /dev/null
@@ -1,493 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * LLVM control flow build helpers.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-#include "util/u_debug.h"
-#include "util/u_memory.h"
-
-#include "lp_bld_type.h"
-#include "lp_bld_flow.h"
-
-
-#define LP_BUILD_FLOW_MAX_VARIABLES 32
-#define LP_BUILD_FLOW_MAX_DEPTH 32
-
-
-/**
- * Enumeration of all possible flow constructs.
- */
-enum lp_build_flow_construct_kind {
-   lP_BUILD_FLOW_SCOPE,
-   LP_BUILD_FLOW_SKIP
-};
-
-
-/**
- * Variable declaration scope.
- */
-struct lp_build_flow_scope
-{
-   /** Number of variables declared in this scope */
-   unsigned num_variables;
-};
-
-
-/**
- * Early exit. Useful to skip to the end of a function or block when
- * the execution mask becomes zero or when there is an error condition.
- */
-struct lp_build_flow_skip
-{
-   /** Block to skip to */
-   LLVMBasicBlockRef block;
-
-   /** Number of variables declared at the beginning */
-   unsigned num_variables;
-
-   LLVMValueRef *phi;
-};
-
-
-/**
- * Union of all possible flow constructs' data
- */
-union lp_build_flow_construct_data
-{
-   struct lp_build_flow_scope scope;
-   struct lp_build_flow_skip skip;
-};
-
-
-/**
- * Element of the flow construct stack.
- */
-struct lp_build_flow_construct
-{
-   enum lp_build_flow_construct_kind kind;
-   union lp_build_flow_construct_data data;
-};
-
-
-/**
- * All necessary data to generate LLVM control flow constructs.
- *
- * Besides keeping track of the control flow construct themselves we also
- * need to keep track of variables in order to generate SSA Phi values.
- */
-struct lp_build_flow_context
-{
-   LLVMBuilderRef builder;
-
-   /**
-    * Control flow stack.
-    */
-   struct lp_build_flow_construct constructs[LP_BUILD_FLOW_MAX_DEPTH];
-   unsigned num_constructs;
-
-   /**
-    * Variable stack
-    */
-   LLVMValueRef *variables[LP_BUILD_FLOW_MAX_VARIABLES];
-   unsigned num_variables;
-};
-
-
-struct lp_build_flow_context *
-lp_build_flow_create(LLVMBuilderRef builder)
-{
-   struct lp_build_flow_context *flow;
-
-   flow = CALLOC_STRUCT(lp_build_flow_context);
-   if(!flow)
-      return NULL;
-
-   flow->builder = builder;
-
-   return flow;
-}
-
-
-void
-lp_build_flow_destroy(struct lp_build_flow_context *flow)
-{
-   assert(flow->num_constructs == 0);
-   assert(flow->num_variables == 0);
-   FREE(flow);
-}
-
-
-static union lp_build_flow_construct_data *
-lp_build_flow_push(struct lp_build_flow_context *flow,
-                   enum lp_build_flow_construct_kind kind)
-{
-   assert(flow->num_constructs < LP_BUILD_FLOW_MAX_DEPTH);
-   if(flow->num_constructs >= LP_BUILD_FLOW_MAX_DEPTH)
-      return NULL;
-
-   flow->constructs[flow->num_constructs].kind = kind;
-   return &flow->constructs[flow->num_constructs++].data;
-}
-
-
-static union lp_build_flow_construct_data *
-lp_build_flow_peek(struct lp_build_flow_context *flow,
-                   enum lp_build_flow_construct_kind kind)
-{
-   assert(flow->num_constructs);
-   if(!flow->num_constructs)
-      return NULL;
-
-   assert(flow->constructs[flow->num_constructs - 1].kind == kind);
-   if(flow->constructs[flow->num_constructs - 1].kind != kind)
-      return NULL;
-
-   return &flow->constructs[flow->num_constructs - 1].data;
-}
-
-
-static union lp_build_flow_construct_data *
-lp_build_flow_pop(struct lp_build_flow_context *flow,
-                  enum lp_build_flow_construct_kind kind)
-{
-   assert(flow->num_constructs);
-   if(!flow->num_constructs)
-      return NULL;
-
-   assert(flow->constructs[flow->num_constructs - 1].kind == kind);
-   if(flow->constructs[flow->num_constructs - 1].kind != kind)
-      return NULL;
-
-   return &flow->constructs[--flow->num_constructs].data;
-}
-
-
-/**
- * Begin a variable scope.
- *
- *
- */
-void
-lp_build_flow_scope_begin(struct lp_build_flow_context *flow)
-{
-   struct lp_build_flow_scope *scope;
-
-   scope = &lp_build_flow_push(flow, lP_BUILD_FLOW_SCOPE)->scope;
-   if(!scope)
-      return;
-
-   scope->num_variables = 0;
-}
-
-
-/**
- * Declare a variable.
- *
- * A variable is a named entity which can have different LLVMValueRef's at
- * different points of the program. This is relevant for control flow because
- * when there are mutiple branches to a same location we need to replace
- * the variable's value with a Phi function as explained in
- * http://en.wikipedia.org/wiki/Static_single_assignment_form .
- *
- * We keep track of variables by keeping around a pointer to where their
- * current.
- *
- * There are a few cautions to observe:
- *
- * - Variable's value must not be NULL. If there is no initial value then
- *   LLVMGetUndef() should be used.
- *
- * - Variable's value must be kept up-to-date. If the variable is going to be
- *   modified by a function then a pointer should be passed so that its value
- *   is accurate. Failure to do this will cause some of the variables'
- *   transient values to be lost, leading to wrong results.
- *
- * - A program should be written from top to bottom, by always appending
- *   instructions to the bottom with a single LLVMBuilderRef. Inserting and/or
- *   modifying existing statements will most likely lead to wrong results.
- *
- */
-void
-lp_build_flow_scope_declare(struct lp_build_flow_context *flow,
-                            LLVMValueRef *variable)
-{
-   struct lp_build_flow_scope *scope;
-
-   scope = &lp_build_flow_peek(flow, lP_BUILD_FLOW_SCOPE)->scope;
-   if(!scope)
-      return;
-
-   assert(*variable);
-   if(!*variable)
-      return;
-
-   assert(flow->num_variables < LP_BUILD_FLOW_MAX_VARIABLES);
-   if(flow->num_variables >= LP_BUILD_FLOW_MAX_VARIABLES)
-      return;
-
-   flow->variables[flow->num_variables++] = variable;
-   ++scope->num_variables;
-}
-
-
-void
-lp_build_flow_scope_end(struct lp_build_flow_context *flow)
-{
-   struct lp_build_flow_scope *scope;
-
-   scope = &lp_build_flow_pop(flow, lP_BUILD_FLOW_SCOPE)->scope;
-   if(!scope)
-      return;
-
-   assert(flow->num_variables >= scope->num_variables);
-   if(flow->num_variables < scope->num_variables) {
-      flow->num_variables = 0;
-      return;
-   }
-
-   flow->num_variables -= scope->num_variables;
-}
-
-
-static LLVMBasicBlockRef
-lp_build_flow_insert_block(struct lp_build_flow_context *flow)
-{
-   LLVMBasicBlockRef current_block;
-   LLVMBasicBlockRef next_block;
-   LLVMBasicBlockRef new_block;
-
-   current_block = LLVMGetInsertBlock(flow->builder);
-
-   next_block = LLVMGetNextBasicBlock(current_block);
-   if(next_block) {
-      new_block = LLVMInsertBasicBlock(next_block, "");
-   }
-   else {
-      LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
-      new_block = LLVMAppendBasicBlock(function, "");
-   }
-
-   return new_block;
-}
-
-void
-lp_build_flow_skip_begin(struct lp_build_flow_context *flow)
-{
-   struct lp_build_flow_skip *skip;
-   LLVMBuilderRef builder;
-   unsigned i;
-
-   skip = &lp_build_flow_push(flow, LP_BUILD_FLOW_SKIP)->skip;
-   if(!skip)
-      return;
-
-   skip->block = lp_build_flow_insert_block(flow);
-   skip->num_variables = flow->num_variables;
-   if(!skip->num_variables) {
-      skip->phi = NULL;
-      return;
-   }
-
-   skip->phi = MALLOC(skip->num_variables * sizeof *skip->phi);
-   if(!skip->phi) {
-      skip->num_variables = 0;
-      return;
-   }
-
-   builder = LLVMCreateBuilder();
-   LLVMPositionBuilderAtEnd(builder, skip->block);
-
-   for(i = 0; i < skip->num_variables; ++i)
-      skip->phi[i] = LLVMBuildPhi(builder, LLVMTypeOf(*flow->variables[i]), "");
-
-   LLVMDisposeBuilder(builder);
-}
-
-
-void
-lp_build_flow_skip_cond_break(struct lp_build_flow_context *flow,
-                              LLVMValueRef cond)
-{
-   struct lp_build_flow_skip *skip;
-   LLVMBasicBlockRef current_block;
-   LLVMBasicBlockRef new_block;
-   unsigned i;
-
-   skip = &lp_build_flow_peek(flow, LP_BUILD_FLOW_SKIP)->skip;
-   if(!skip)
-      return;
-
-   current_block = LLVMGetInsertBlock(flow->builder);
-
-   new_block = lp_build_flow_insert_block(flow);
-
-   for(i = 0; i < skip->num_variables; ++i) {
-      assert(*flow->variables[i]);
-      LLVMAddIncoming(skip->phi[i], flow->variables[i], &current_block, 1);
-   }
-
-   LLVMBuildCondBr(flow->builder, cond, skip->block, new_block);
-
-   LLVMPositionBuilderAtEnd(flow->builder, new_block);
- }
-
-
-void
-lp_build_flow_skip_end(struct lp_build_flow_context *flow)
-{
-   struct lp_build_flow_skip *skip;
-   LLVMBasicBlockRef current_block;
-   unsigned i;
-
-   skip = &lp_build_flow_pop(flow, LP_BUILD_FLOW_SKIP)->skip;
-   if(!skip)
-      return;
-
-   current_block = LLVMGetInsertBlock(flow->builder);
-
-   for(i = 0; i < skip->num_variables; ++i) {
-      assert(*flow->variables[i]);
-      LLVMAddIncoming(skip->phi[i], flow->variables[i], &current_block, 1);
-      *flow->variables[i] = skip->phi[i];
-   }
-
-   LLVMBuildBr(flow->builder, skip->block);
-   LLVMPositionBuilderAtEnd(flow->builder, skip->block);
-
-   FREE(skip->phi);
-}
-
-
-static void
-lp_build_mask_check(struct lp_build_mask_context *mask)
-{
-   LLVMBuilderRef builder = mask->flow->builder;
-   LLVMValueRef cond;
-
-   cond = LLVMBuildICmp(builder,
-                        LLVMIntEQ,
-                        LLVMBuildBitCast(builder, mask->value, mask->reg_type, ""),
-                        LLVMConstNull(mask->reg_type),
-                        "");
-
-   lp_build_flow_skip_cond_break(mask->flow, cond);
-}
-
-
-void
-lp_build_mask_begin(struct lp_build_mask_context *mask,
-                    struct lp_build_flow_context *flow,
-                    struct lp_type type,
-                    LLVMValueRef value)
-{
-   memset(mask, 0, sizeof *mask);
-
-   mask->flow = flow;
-   mask->reg_type = LLVMIntType(type.width * type.length);
-   mask->value = value;
-
-   lp_build_flow_scope_begin(flow);
-   lp_build_flow_scope_declare(flow, &mask->value);
-   lp_build_flow_skip_begin(flow);
-
-   lp_build_mask_check(mask);
-}
-
-
-void
-lp_build_mask_update(struct lp_build_mask_context *mask,
-                     LLVMValueRef value)
-{
-   mask->value = LLVMBuildAnd( mask->flow->builder, mask->value, value, "");
-
-   lp_build_mask_check(mask);
-}
-
-
-LLVMValueRef
-lp_build_mask_end(struct lp_build_mask_context *mask)
-{
-   lp_build_flow_skip_end(mask->flow);
-   lp_build_flow_scope_end(mask->flow);
-   return mask->value;
-}
-
-
-
-void
-lp_build_loop_begin(LLVMBuilderRef builder,
-                    LLVMValueRef start,
-                    struct lp_build_loop_state *state)
-{
-   LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
-   LLVMValueRef function = LLVMGetBasicBlockParent(block);
-
-   state->block = LLVMAppendBasicBlock(function, "loop");
-
-   LLVMBuildBr(builder, state->block);
-
-   LLVMPositionBuilderAtEnd(builder, state->block);
-
-   state->counter = LLVMBuildPhi(builder, LLVMTypeOf(start), "");
-
-   LLVMAddIncoming(state->counter, &start, &block, 1);
-
-}
-
-
-void
-lp_build_loop_end(LLVMBuilderRef builder,
-                  LLVMValueRef end,
-                  LLVMValueRef step,
-                  struct lp_build_loop_state *state)
-{
-   LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
-   LLVMValueRef function = LLVMGetBasicBlockParent(block);
-   LLVMValueRef next;
-   LLVMValueRef cond;
-   LLVMBasicBlockRef after_block;
-
-   if (!step)
-      step = LLVMConstInt(LLVMTypeOf(end), 1, 0);
-
-   next = LLVMBuildAdd(builder, state->counter, step, "");
-
-   cond = LLVMBuildICmp(builder, LLVMIntNE, next, end, "");
-
-   after_block = LLVMAppendBasicBlock(function, "");
-
-   LLVMBuildCondBr(builder, cond, after_block, state->block);
-
-   LLVMAddIncoming(state->counter, &next, &block, 1);
-
-   LLVMPositionBuilderAtEnd(builder, after_block);
-}
-
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_flow.h b/src/gallium/drivers/llvmpipe/lp_bld_flow.h
deleted file mode 100644
index e61999ff06..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_flow.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * LLVM control flow build helpers.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-#ifndef LP_BLD_FLOW_H
-#define LP_BLD_FLOW_H
-
-
-#include <llvm-c/Core.h>  
-
-
-struct lp_type;
-
-
-struct lp_build_flow_context;
-
-
-struct lp_build_flow_context *
-lp_build_flow_create(LLVMBuilderRef builder);
-
-void
-lp_build_flow_destroy(struct lp_build_flow_context *flow);
-
-void
-lp_build_flow_scope_begin(struct lp_build_flow_context *flow);
-
-void
-lp_build_flow_scope_declare(struct lp_build_flow_context *flow,
-                            LLVMValueRef *variable);
-
-void
-lp_build_flow_scope_end(struct lp_build_flow_context *flow);
-
-void
-lp_build_flow_skip_begin(struct lp_build_flow_context *flow);
-
-void
-lp_build_flow_skip_cond_break(struct lp_build_flow_context *flow,
-                              LLVMValueRef cond);
-
-void
-lp_build_flow_skip_end(struct lp_build_flow_context *flow);
-
-
-struct lp_build_mask_context
-{
-   struct lp_build_flow_context *flow;
-
-   LLVMTypeRef reg_type;
-
-   LLVMValueRef value;
-};
-
-
-void
-lp_build_mask_begin(struct lp_build_mask_context *mask,
-                    struct lp_build_flow_context *flow,
-                    struct lp_type type,
-                    LLVMValueRef value);
-
-/**
- * Bitwise AND the mask with the given value, if a previous mask was set.
- */
-void
-lp_build_mask_update(struct lp_build_mask_context *mask,
-                     LLVMValueRef value);
-
-LLVMValueRef
-lp_build_mask_end(struct lp_build_mask_context *mask);
-
-
-/**
- * LLVM's IR doesn't represent for-loops directly. Furthermore it
- * it requires creating code blocks, branches, phi variables, so it
- * requires a fair amount of code.
- *
- * @sa http://www.llvm.org/docs/tutorial/LangImpl5.html#for
- */
-struct lp_build_loop_state
-{
-  LLVMBasicBlockRef block;
-  LLVMValueRef counter;
-};
-
-
-void
-lp_build_loop_begin(LLVMBuilderRef builder,
-                    LLVMValueRef start,
-                    struct lp_build_loop_state *state);
-
-
-void
-lp_build_loop_end(LLVMBuilderRef builder,
-                  LLVMValueRef end,
-                  LLVMValueRef step,
-                  struct lp_build_loop_state *state);
-
-
-
-#endif /* !LP_BLD_FLOW_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_format.h b/src/gallium/drivers/llvmpipe/lp_bld_format.h
deleted file mode 100644
index 970bee379f..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_format.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#ifndef LP_BLD_FORMAT_H
-#define LP_BLD_FORMAT_H
-
-
-/**
- * @file
- * Pixel format helpers.
- */
-
-#include <llvm-c/Core.h>  
-
-#include "pipe/p_format.h"
-
-struct util_format_description;
-struct lp_type;
-
-
-boolean
-lp_format_is_rgba8(const struct util_format_description *desc);
-
-
-void
-lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
-                            struct lp_type type,
-                            const LLVMValueRef *unswizzled,
-                            LLVMValueRef *swizzled);
-
-
-LLVMValueRef
-lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
-                         const struct util_format_description *desc,
-                         LLVMValueRef packed);
-
-
-LLVMValueRef
-lp_build_unpack_rgba8_aos(LLVMBuilderRef builder,
-                          const struct util_format_description *desc,
-                          struct lp_type type,
-                          LLVMValueRef packed);
-
-
-LLVMValueRef
-lp_build_pack_rgba_aos(LLVMBuilderRef builder,
-                       const struct util_format_description *desc,
-                       LLVMValueRef rgba);
-
-
-void
-lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
-                         const struct util_format_description *format_desc,
-                         struct lp_type type,
-                         LLVMValueRef packed,
-                         LLVMValueRef *rgba);
-
-
-#endif /* !LP_BLD_FORMAT_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_format_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_format_aos.c
deleted file mode 100644
index 10e82f120b..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_format_aos.c
+++ /dev/null
@@ -1,384 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * AoS pixel format manipulation.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#include "util/u_cpu_detect.h"
-#include "util/u_format.h"
-
-#include "lp_bld_type.h"
-#include "lp_bld_const.h"
-#include "lp_bld_logic.h"
-#include "lp_bld_swizzle.h"
-#include "lp_bld_format.h"
-
-
-/**
- * Unpack a single pixel into its RGBA components.
- *
- * @param packed integer.
- *
- * @return RGBA in a 4 floats vector.
- *
- * XXX: This is mostly for reference and testing -- operating a single pixel at
- * a time is rarely if ever needed.
- */
-LLVMValueRef
-lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
-                         const struct util_format_description *desc,
-                         LLVMValueRef packed)
-{
-   LLVMTypeRef type;
-   LLVMValueRef shifted, casted, scaled, masked;
-   LLVMValueRef shifts[4];
-   LLVMValueRef masks[4];
-   LLVMValueRef scales[4];
-   LLVMValueRef swizzles[4];
-   LLVMValueRef aux[4];
-   bool normalized;
-   int empty_channel;
-   unsigned shift;
-   unsigned i;
-
-   /* FIXME: Support more formats */
-   assert(desc->layout == UTIL_FORMAT_LAYOUT_ARITH);
-   assert(desc->block.width == 1);
-   assert(desc->block.height == 1);
-   assert(desc->block.bits <= 32);
-
-   type = LLVMIntType(desc->block.bits);
-
-   /* Do the intermediate integer computations with 32bit integers since it
-    * matches floating point size */
-   if (desc->block.bits < 32)
-      packed = LLVMBuildZExt(builder, packed, LLVMInt32Type(), "");
-
-   /* Broadcast the packed value to all four channels */
-   packed = LLVMBuildInsertElement(builder,
-                                   LLVMGetUndef(LLVMVectorType(LLVMInt32Type(), 4)),
-                                   packed,
-                                   LLVMConstNull(LLVMInt32Type()),
-                                   "");
-   packed = LLVMBuildShuffleVector(builder,
-                                   packed,
-                                   LLVMGetUndef(LLVMVectorType(LLVMInt32Type(), 4)),
-                                   LLVMConstNull(LLVMVectorType(LLVMInt32Type(), 4)),
-                                   "");
-
-   /* Initialize vector constants */
-   normalized = FALSE;
-   empty_channel = -1;
-   shift = 0;
-   for (i = 0; i < 4; ++i) {
-      unsigned bits = desc->channel[i].size;
-
-      if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) {
-         shifts[i] = LLVMGetUndef(LLVMInt32Type());
-         masks[i] = LLVMConstNull(LLVMInt32Type());
-         scales[i] =  LLVMConstNull(LLVMFloatType());
-         empty_channel = i;
-      }
-      else {
-         unsigned mask = (1 << bits) - 1;
-
-         assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED);
-         assert(bits < 32);
-
-         shifts[i] = LLVMConstInt(LLVMInt32Type(), shift, 0);
-         masks[i] = LLVMConstInt(LLVMInt32Type(), mask, 0);
-
-         if (desc->channel[i].normalized) {
-            scales[i] = LLVMConstReal(LLVMFloatType(), 1.0/mask);
-            normalized = TRUE;
-         }
-         else
-            scales[i] =  LLVMConstReal(LLVMFloatType(), 1.0);
-      }
-
-      shift += bits;
-   }
-
-   shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
-   masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
-   /* UIToFP can't be expressed in SSE2 */
-   casted = LLVMBuildSIToFP(builder, masked, LLVMVectorType(LLVMFloatType(), 4), "");
-
-   if (normalized)
-      scaled = LLVMBuildMul(builder, casted, LLVMConstVector(scales, 4), "");
-   else
-      scaled = casted;
-
-   for (i = 0; i < 4; ++i)
-      aux[i] = LLVMGetUndef(LLVMFloatType());
-
-   for (i = 0; i < 4; ++i) {
-      enum util_format_swizzle swizzle = desc->swizzle[i];
-
-      switch (swizzle) {
-      case UTIL_FORMAT_SWIZZLE_X:
-      case UTIL_FORMAT_SWIZZLE_Y:
-      case UTIL_FORMAT_SWIZZLE_Z:
-      case UTIL_FORMAT_SWIZZLE_W:
-         swizzles[i] = LLVMConstInt(LLVMInt32Type(), swizzle, 0);
-         break;
-      case UTIL_FORMAT_SWIZZLE_0:
-         assert(empty_channel >= 0);
-         swizzles[i] = LLVMConstInt(LLVMInt32Type(), empty_channel, 0);
-         break;
-      case UTIL_FORMAT_SWIZZLE_1:
-         swizzles[i] = LLVMConstInt(LLVMInt32Type(), 4, 0);
-         aux[0] = LLVMConstReal(LLVMFloatType(), 1.0);
-         break;
-      case UTIL_FORMAT_SWIZZLE_NONE:
-         swizzles[i] = LLVMGetUndef(LLVMFloatType());
-         assert(0);
-         break;
-      }
-   }
-
-   return LLVMBuildShuffleVector(builder, scaled, LLVMConstVector(aux, 4), LLVMConstVector(swizzles, 4), "");
-}
-
-
-/**
- * Take a vector with packed pixels and unpack into a rgba8 vector.
- *
- * Formats with bit depth smaller than 32bits are accepted, but they must be
- * padded to 32bits.
- */
-LLVMValueRef
-lp_build_unpack_rgba8_aos(LLVMBuilderRef builder,
-                          const struct util_format_description *desc,
-                          struct lp_type type,
-                          LLVMValueRef packed)
-{
-   struct lp_build_context bld;
-   bool rgba8;
-   LLVMValueRef res;
-   unsigned i;
-
-   lp_build_context_init(&bld, builder, type);
-
-   /* FIXME: Support more formats */
-   assert(desc->layout == UTIL_FORMAT_LAYOUT_ARITH);
-   assert(desc->block.width == 1);
-   assert(desc->block.height == 1);
-   assert(desc->block.bits <= 32);
-
-   assert(!type.floating);
-   assert(!type.fixed);
-   assert(type.norm);
-   assert(type.width == 8);
-   assert(type.length % 4 == 0);
-
-   rgba8 = TRUE;
-   for(i = 0; i < 4; ++i) {
-      assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED ||
-             desc->channel[i].type == UTIL_FORMAT_TYPE_VOID);
-      if(desc->channel[0].size != 8)
-         rgba8 = FALSE;
-   }
-
-   if(rgba8) {
-      /*
-       * The pixel is already in a rgba8 format variant. All it is necessary
-       * is to swizzle the channels.
-       */
-
-      unsigned char swizzles[4];
-      boolean zeros[4]; /* bitwise AND mask */
-      boolean ones[4]; /* bitwise OR mask */
-      boolean swizzles_needed = FALSE;
-      boolean zeros_needed = FALSE;
-      boolean ones_needed = FALSE;
-
-      for(i = 0; i < 4; ++i) {
-         enum util_format_swizzle swizzle = desc->swizzle[i];
-
-         /* Initialize with the no-op case */
-         swizzles[i] = util_cpu_caps.little_endian ? 3 - i : i;
-         zeros[i] = TRUE;
-         ones[i] = FALSE;
-
-         switch (swizzle) {
-         case UTIL_FORMAT_SWIZZLE_X:
-         case UTIL_FORMAT_SWIZZLE_Y:
-         case UTIL_FORMAT_SWIZZLE_Z:
-         case UTIL_FORMAT_SWIZZLE_W:
-            if(swizzle != swizzles[i]) {
-               swizzles[i] = swizzle;
-               swizzles_needed = TRUE;
-            }
-            break;
-         case UTIL_FORMAT_SWIZZLE_0:
-            zeros[i] = FALSE;
-            zeros_needed = TRUE;
-            break;
-         case UTIL_FORMAT_SWIZZLE_1:
-            ones[i] = TRUE;
-            ones_needed = TRUE;
-            break;
-         case UTIL_FORMAT_SWIZZLE_NONE:
-            assert(0);
-            break;
-         }
-      }
-
-      res = packed;
-
-      if(swizzles_needed)
-         res = lp_build_swizzle1_aos(&bld, res, swizzles);
-
-      if(zeros_needed) {
-         /* Mask out zero channels */
-         LLVMValueRef mask = lp_build_const_mask_aos(type, zeros);
-         res = LLVMBuildAnd(builder, res, mask, "");
-      }
-
-      if(ones_needed) {
-         /* Or one channels */
-         LLVMValueRef mask = lp_build_const_mask_aos(type, ones);
-         res = LLVMBuildOr(builder, res, mask, "");
-      }
-   }
-   else {
-      /* FIXME */
-      assert(0);
-      res = lp_build_undef(type);
-   }
-
-   return res;
-}
-
-
-/**
- * Pack a single pixel.
- *
- * @param rgba 4 float vector with the unpacked components.
- *
- * XXX: This is mostly for reference and testing -- operating a single pixel at
- * a time is rarely if ever needed.
- */
-LLVMValueRef
-lp_build_pack_rgba_aos(LLVMBuilderRef builder,
-                       const struct util_format_description *desc,
-                       LLVMValueRef rgba)
-{
-   LLVMTypeRef type;
-   LLVMValueRef packed = NULL;
-   LLVMValueRef swizzles[4];
-   LLVMValueRef shifted, casted, scaled, unswizzled;
-   LLVMValueRef shifts[4];
-   LLVMValueRef scales[4];
-   bool normalized;
-   unsigned shift;
-   unsigned i, j;
-
-   assert(desc->layout == UTIL_FORMAT_LAYOUT_ARITH);
-   assert(desc->block.width == 1);
-   assert(desc->block.height == 1);
-
-   type = LLVMIntType(desc->block.bits);
-
-   /* Unswizzle the color components into the source vector. */
-   for (i = 0; i < 4; ++i) {
-      for (j = 0; j < 4; ++j) {
-         if (desc->swizzle[j] == i)
-            break;
-      }
-      if (j < 4)
-         swizzles[i] = LLVMConstInt(LLVMInt32Type(), j, 0);
-      else
-         swizzles[i] = LLVMGetUndef(LLVMInt32Type());
-   }
-
-   unswizzled = LLVMBuildShuffleVector(builder, rgba,
-                                       LLVMGetUndef(LLVMVectorType(LLVMFloatType(), 4)),
-                                       LLVMConstVector(swizzles, 4), "");
-
-   normalized = FALSE;
-   shift = 0;
-   for (i = 0; i < 4; ++i) {
-      unsigned bits = desc->channel[i].size;
-
-      if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) {
-         shifts[i] = LLVMGetUndef(LLVMInt32Type());
-         scales[i] =  LLVMGetUndef(LLVMFloatType());
-      }
-      else {
-         unsigned mask = (1 << bits) - 1;
-
-         assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED);
-         assert(bits < 32);
-
-         shifts[i] = LLVMConstInt(LLVMInt32Type(), shift, 0);
-
-         if (desc->channel[i].normalized) {
-            scales[i] = LLVMConstReal(LLVMFloatType(), mask);
-            normalized = TRUE;
-         }
-         else
-            scales[i] =  LLVMConstReal(LLVMFloatType(), 1.0);
-      }
-
-      shift += bits;
-   }
-
-   if (normalized)
-      scaled = LLVMBuildMul(builder, unswizzled, LLVMConstVector(scales, 4), "");
-   else
-      scaled = unswizzled;
-
-   casted = LLVMBuildFPToSI(builder, scaled, LLVMVectorType(LLVMInt32Type(), 4), "");
-
-   shifted = LLVMBuildShl(builder, casted, LLVMConstVector(shifts, 4), "");
-   
-   /* Bitwise or all components */
-   for (i = 0; i < 4; ++i) {
-      if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) {
-         LLVMValueRef component = LLVMBuildExtractElement(builder, shifted, LLVMConstInt(LLVMInt32Type(), i, 0), "");
-         if (packed)
-            packed = LLVMBuildOr(builder, packed, component, "");
-         else
-            packed = component;
-      }
-   }
-
-   if (!packed)
-      packed = LLVMGetUndef(LLVMInt32Type());
-
-   if (desc->block.bits < 32)
-      packed = LLVMBuildTrunc(builder, packed, type, "");
-
-   return packed;
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_format_query.c b/src/gallium/drivers/llvmpipe/lp_bld_format_query.c
deleted file mode 100644
index f3832d07ff..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_format_query.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * Utility functions to make assertions about formats.
- *
- * This module centralizes most of logic used when determining what algorithm
- * is most suitable (i.e., most efficient yet correct) for a given format.
- *
- * It might be possible to move some of these functions to u_format module,
- * but since tiny differences in the format may render it more/less
- * appropriate to a given algorithm it is impossible to make any long term
- * guarantee about the semantics of these functions.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#include "util/u_format.h"
-
-#include "lp_bld_format.h"
-
-
-/**
- * Tell whether a format uses a 1x1, 32-bit block whose four channels are all 8 bits wide.
- */
-boolean
-lp_format_is_rgba8(const struct util_format_description *desc)
-{
-   unsigned i;
-
-   /* Anything but a single-pixel block of exactly 32 bits is rejected */
-   if(!(desc->block.width == 1 && desc->block.height == 1 &&
-        desc->block.bits == 32))
-      return FALSE;
-
-   for(i = 0; i < 4; ++i) {
-      const unsigned chan_type = desc->channel[i].type;
-      /* Every channel must be unsigned, signed or void, and 8 bits wide */
-      if((chan_type != UTIL_FORMAT_TYPE_UNSIGNED &&
-          chan_type != UTIL_FORMAT_TYPE_SIGNED &&
-          chan_type != UTIL_FORMAT_TYPE_VOID) || desc->channel[i].size != 8)
-         return FALSE;
-   }
-
-   return TRUE;
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_format_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_format_soa.c
deleted file mode 100644
index 64151d169d..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_format_soa.c
+++ /dev/null
@@ -1,149 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-#include "util/u_format.h"
-
-#include "lp_bld_type.h"
-#include "lp_bld_const.h"
-#include "lp_bld_conv.h"
-#include "lp_bld_format.h"
-
-/* Choose the value of one destination channel according to its swizzle. */
-static LLVMValueRef
-lp_build_format_swizzle_chan_soa(struct lp_type type,
-                                 const LLVMValueRef *unswizzled,
-                                 enum util_format_swizzle swizzle)
-{
-   switch (swizzle) {
-   case UTIL_FORMAT_SWIZZLE_0:
-      return lp_build_zero(type);
-   case UTIL_FORMAT_SWIZZLE_1:
-      return lp_build_one(type);
-   case UTIL_FORMAT_SWIZZLE_X:
-   case UTIL_FORMAT_SWIZZLE_Y:
-   case UTIL_FORMAT_SWIZZLE_Z:
-   case UTIL_FORMAT_SWIZZLE_W:
-      return unswizzled[swizzle];   /* the swizzle is used as the source index */
-   case UTIL_FORMAT_SWIZZLE_NONE:
-      break;
-   default:
-      assert(0);   /* not a swizzle this function knows about */
-   }
-   return lp_build_undef(type);   /* NONE or unknown swizzle: undefined value */
-}
-
-/* Build swizzled[0..3] out of unswizzled[] as described by the format's swizzles. */
-void
-lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
-                            struct lp_type type,
-                            const LLVMValueRef *unswizzled,
-                            LLVMValueRef *swizzled)
-{
-   unsigned i;
-
-   if(format_desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS) {
-      /* each of the four channels follows its own swizzle */
-      for (i = 0; i < 4; ++i)
-         swizzled[i] = lp_build_format_swizzle_chan_soa(type, unswizzled,
-                                                        format_desc->swizzle[i]);
-      return;
-   }
-   /* ZS colorspace: swizzle[0] is replicated to x, y and z, and w is set to one */
-   swizzled[0] = lp_build_format_swizzle_chan_soa(type, unswizzled, format_desc->swizzle[0]);
-   swizzled[1] = swizzled[2] = swizzled[0];
-   swizzled[3] = lp_build_one(type);
-}
-
-/**
- * Split a packed pixel value into its channels, convert them as requested by
- * type and store the swizzled result in rgba[0..3].
- */
-void
-lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
-                         const struct util_format_description *format_desc,
-                         struct lp_type type,
-                         LLVMValueRef packed,
-                         LLVMValueRef *rgba)
-{
-   LLVMValueRef inputs[4];
-   unsigned start;
-   unsigned chan;
-
-   /* FIXME: Support more formats */
-   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_ARITH ||
-          (format_desc->layout == UTIL_FORMAT_LAYOUT_ARRAY &&
-           format_desc->block.bits == format_desc->channel[0].size));
-   assert(format_desc->block.width == 1);
-   assert(format_desc->block.height == 1);
-   assert(format_desc->block.bits <= 32);
-
-   /* Decode the input vector components; start/stop delimit the channel's bits */
-   start = 0;
-   for (chan = 0; chan < 4; ++chan) {
-      unsigned width = format_desc->channel[chan].size;
-      unsigned stop = start + width;
-      LLVMValueRef input = packed;
-
-      switch(format_desc->channel[chan].type) {
-      case UTIL_FORMAT_TYPE_VOID:
-         input = NULL;   /* nothing to decode for a void channel */
-         break;
-      case UTIL_FORMAT_TYPE_UNSIGNED:
-         if(type.floating) {
-            /* Shift the channel down to bit 0 and mask off the bits above it */
-            if(start)
-               input = LLVMBuildLShr(builder, input, lp_build_int_const_scalar(type, start), "");
-            if(stop < format_desc->block.bits) {
-               unsigned mask = ((unsigned long long)1 << width) - 1;
-               input = LLVMBuildAnd(builder, input, lp_build_int_const_scalar(type, mask), "");
-            }
-
-            /* input is still an integer here, so the conversion must go int -> float */
-            if(format_desc->channel[chan].normalized)
-               input = lp_build_unsigned_norm_to_float(builder, width, type, input);
-            else
-               input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(type), "");
-         }
-         else {
-            /* FIXME */
-            assert(0);
-            input = lp_build_undef(type);
-         }
-         break;
-      default:
-         /* channel types other than void/unsigned are not decoded yet */
-         input = lp_build_undef(type);
-         break;
-      }
-
-      inputs[chan] = input;
-      start = stop;
-   }
-
-   lp_build_format_swizzle_soa(format_desc, type, inputs, rgba);
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
deleted file mode 100644
index 49dab8ab61..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c
+++ /dev/null
@@ -1,351 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2009 VMware, Inc.
- * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * @file
- * Position and shader input interpolation.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-#include "pipe/p_shader_tokens.h"
-#include "util/u_debug.h"
-#include "util/u_memory.h"
-#include "util/u_math.h"
-#include "tgsi/tgsi_parse.h"
-#include "lp_bld_debug.h"
-#include "lp_bld_const.h"
-#include "lp_bld_arit.h"
-#include "lp_bld_swizzle.h"
-#include "lp_bld_interp.h"
-
-/* Name val "pos.<c><suffix>" for attrib 0, else "input<attrib-1>.<c><suffix>". */
-static void
-attrib_name(LLVMValueRef val, unsigned attrib, unsigned chan, const char *suffix)
-{
-   if(attrib != 0)
-      lp_build_name(val, "input%u.%c%s", attrib - 1, "xyzw"[chan], suffix);
-   else
-      lp_build_name(val, "pos.%c%s", "xyzw"[chan], suffix);
-}
-
-/* Load the a0/dadx/dady coefficients of each enabled channel and broadcast them to vectors. */
-static void
-coeffs_init(struct lp_build_interp_soa_context *bld,
-            LLVMValueRef a0_ptr,
-            LLVMValueRef dadx_ptr,
-            LLVMValueRef dady_ptr)
-{
-   LLVMBuilderRef builder = bld->base.builder;
-   unsigned attrib;
-   unsigned chan;
-
-   for(attrib = 0; attrib < bld->num_attribs; ++attrib) {
-      unsigned mask = bld->mask[attrib];
-      unsigned mode = bld->mode[attrib];
-      for(chan = 0; chan < NUM_CHANNELS; ++chan) {
-         if(mask & (1 << chan)) {
-            LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), attrib*NUM_CHANNELS + chan, 0);
-            LLVMValueRef a0 = NULL;
-            LLVMValueRef dadx = NULL;
-            LLVMValueRef dady = NULL;
-            /* constant needs only a0 (dadx/dady stay NULL); the other modes load all three */
-            switch( mode ) {
-            case TGSI_INTERPOLATE_PERSPECTIVE:
-               /* fall-through */
-
-            case TGSI_INTERPOLATE_LINEAR:
-               dadx = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dadx_ptr, &index, 1, ""), "");
-               dady = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dady_ptr, &index, 1, ""), "");
-               dadx = lp_build_broadcast_scalar(&bld->base, dadx);
-               dady = lp_build_broadcast_scalar(&bld->base, dady);
-               attrib_name(dadx, attrib, chan, ".dadx");
-               attrib_name(dady, attrib, chan, ".dady");
-               /* fall-through */
-
-            case TGSI_INTERPOLATE_CONSTANT:
-               a0 = LLVMBuildLoad(builder, LLVMBuildGEP(builder, a0_ptr, &index, 1, ""), "");
-               a0 = lp_build_broadcast_scalar(&bld->base, a0);
-               attrib_name(a0, attrib, chan, ".a0");
-               break;
-
-            default:
-               assert(0);
-               break;
-            }
-
-            bld->a0  [attrib][chan] = a0;
-            bld->dadx[attrib][chan] = dadx;
-            bld->dady[attrib][chan] = dady;
-         }
-      }
-   }
-}
-
-
-/**
- * Scale dadx by xstep and dady by ystep for every enabled, non-constant channel.
- */
-static void
-coeffs_update(struct lp_build_interp_soa_context *bld)
-{
-   unsigned a, c;
-
-   for(a = 0; a < bld->num_attribs; ++a) {
-      /* constant-mode attributes are left untouched */
-      if (bld->mode[a] == TGSI_INTERPOLATE_CONSTANT)
-         continue;
-
-      for(c = 0; c < NUM_CHANNELS; ++c) {
-         if(!(bld->mask[a] & (1 << c)))
-            continue;
-
-         bld->dadx[a][c] = lp_build_mul_imm(&bld->base, bld->dadx[a][c], bld->xstep);
-         bld->dady[a][c] = lp_build_mul_imm(&bld->base, bld->dady[a][c], bld->ystep);
-      }
-   }
-}
-
-/* Compute the initial value of every enabled attribute channel at pos.x / pos.y. */
-static void
-attribs_init(struct lp_build_interp_soa_context *bld)
-{
-   struct lp_build_context *base = &bld->base;
-   const LLVMValueRef pos_x = bld->pos[0];
-   const LLVMValueRef pos_y = bld->pos[1];
-   LLVMValueRef one_over_w = NULL;
-   unsigned attrib;
-   unsigned c;
-
-   for(attrib = 0; attrib < bld->num_attribs; ++attrib) {
-      const unsigned enabled = bld->mask[attrib];
-      const unsigned mode = bld->mode[attrib];
-
-      for(c = 0; c < NUM_CHANNELS; ++c) {
-         LLVMValueRef value;
-
-         if(!(enabled & (1 << c)))
-            continue;
-
-         /* value = a0 + x*dadx + y*dady; constant attributes are just a0 */
-         value = bld->a0[attrib][c];
-         if (mode != TGSI_INTERPOLATE_CONSTANT) {
-            value = lp_build_add(base, value, lp_build_mul(base, pos_x, bld->dadx[attrib][c]));
-            value = lp_build_add(base, value, lp_build_mul(base, pos_y, bld->dady[attrib][c]));
-         }
-
-         /* Save the value before the perspective divide for faster updates */
-         bld->attribs_pre[attrib][c] = value;
-
-         if (mode == TGSI_INTERPOLATE_PERSPECTIVE) {
-            assert(attrib != 0);
-            /* 1/w is emitted once, on first use, then shared */
-            if(!one_over_w)
-               one_over_w = lp_build_rcp(base, bld->pos[3]);
-            value = lp_build_mul(base, value, one_over_w);
-         }
-
-         attrib_name(value, attrib, c, "");
-
-         bld->attribs[attrib][c] = value;
-      }
-   }
-}
-
-/* Advance every enabled, non-constant attribute channel by dadx and/or dady. */
-static void
-attribs_update(struct lp_build_interp_soa_context *bld)
-{
-   struct lp_build_context *base = &bld->base;
-   LLVMValueRef one_over_w = NULL;
-   unsigned attrib;
-   unsigned c;
-
-   for(attrib = 0; attrib < bld->num_attribs; ++attrib) {
-      const unsigned enabled = bld->mask[attrib];
-      const unsigned mode = bld->mode[attrib];
-
-      /* constant-mode attributes are skipped entirely */
-      if (mode == TGSI_INTERPOLATE_CONSTANT)
-         continue;
-
-      for(c = 0; c < NUM_CHANNELS; ++c) {
-         LLVMValueRef value;
-
-         if(!(enabled & (1 << c)))
-            continue;
-
-         /* resume from the value saved before the perspective divide */
-         value = bld->attribs_pre[attrib][c];
-         if(bld->xstep)
-            value = lp_build_add(base, value, bld->dadx[attrib][c]);
-         if(bld->ystep)
-            value = lp_build_add(base, value, bld->dady[attrib][c]);
-         bld->attribs_pre[attrib][c] = value;
-
-         if (mode == TGSI_INTERPOLATE_PERSPECTIVE) {
-            assert(attrib != 0);
-            /* 1/w is emitted once, on first use, then shared */
-            if(!one_over_w)
-               one_over_w = lp_build_rcp(base, bld->pos[3]);
-            value = lp_build_mul(base, value, one_over_w);
-         }
-
-         attrib_name(value, attrib, c, "");
-
-         bld->attribs[attrib][c] = value;
-      }
-   }
-}
-
-
-/**
- * Record the initial position.
- *
- * x0 and y0 (the quad's upper left coordinates) become channels x and y of slot 0.
- */
-static void
-pos_init(struct lp_build_interp_soa_context *bld,
-         LLVMValueRef x0,
-         LLVMValueRef y0)
-{
-   lp_build_name(x0, "pos.x");
-   bld->attribs[0][0] = x0;
-
-   lp_build_name(y0, "pos.y");
-   bld->attribs[0][1] = y0;
-}
-
-/* Advance the x and y position channels by xstep and ystep (when non-zero). */
-static void
-pos_update(struct lp_build_interp_soa_context *bld)
-{
-   struct lp_build_context *base = &bld->base;
-   LLVMValueRef *pos = bld->attribs[0];
-   LLVMValueRef new_x = pos[0];
-   LLVMValueRef new_y = pos[1];
-
-   if(bld->xstep)
-      new_x = lp_build_add(base, new_x, lp_build_const_scalar(base->type, bld->xstep));
-   if(bld->ystep)
-      new_y = lp_build_add(base, new_y, lp_build_const_scalar(base->type, bld->ystep));
-
-   lp_build_name(new_x, "pos.x");
-   lp_build_name(new_y, "pos.y");
-   pos[0] = new_x;
-   pos[1] = new_y;
-}
-
-/* Prepare bld to interpolate the position and every input declared in tokens. */
-void
-lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
-                         const struct tgsi_token *tokens,
-                         LLVMBuilderRef builder,
-                         struct lp_type type,
-                         LLVMValueRef a0_ptr,
-                         LLVMValueRef dadx_ptr,
-                         LLVMValueRef dady_ptr,
-                         LLVMValueRef x0,
-                         LLVMValueRef y0,
-                         int xstep,
-                         int ystep)
-{
-   struct tgsi_parse_context parse;
-   struct tgsi_full_declaration *decl;
-
-   memset(bld, 0, sizeof *bld);
-
-   lp_build_context_init(&bld->base, builder, type);
-
-   /* For convenience: slot 0 holds the position, shader inputs start at slot 1 */
-   bld->pos = bld->attribs[0];
-   bld->inputs = (const LLVMValueRef (*)[NUM_CHANNELS]) bld->attribs[1];
-
-   /* Position: only z and w are interpolated, x and y are set by pos_init() */
-   bld->num_attribs = 1;
-   bld->mask[0] = TGSI_WRITEMASK_ZW;
-   bld->mode[0] = TGSI_INTERPOLATE_LINEAR;
-
-   /* Inputs: record the usage mask and interpolation mode of each declared input */
-   tgsi_parse_init( &parse, tokens );
-   while( !tgsi_parse_end_of_tokens( &parse ) ) {
-      tgsi_parse_token( &parse );
-
-      switch( parse.FullToken.Token.Type ) {
-      case TGSI_TOKEN_TYPE_DECLARATION:
-         decl = &parse.FullToken.FullDeclaration;
-         if( decl->Declaration.File == TGSI_FILE_INPUT ) {
-            unsigned first, last, mask;
-            unsigned attrib;
-
-            first = decl->Range.First;
-            last = decl->Range.Last;
-            mask = decl->Declaration.UsageMask;
-
-            for( attrib = first; attrib <= last; ++attrib ) {
-               bld->mask[1 + attrib] = mask;
-               bld->mode[1 + attrib] = decl->Declaration.Interpolate;
-            }
-
-            bld->num_attribs = MAX2(bld->num_attribs, 1 + last + 1);   /* position slot + (last index -> count) */
-         }
-         break;
-
-      case TGSI_TOKEN_TYPE_INSTRUCTION:
-      case TGSI_TOKEN_TYPE_IMMEDIATE:
-         break;
-
-      default:
-         assert( 0 );
-      }
-   }
-   tgsi_parse_free( &parse );
-
-   coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr);
-
-   pos_init(bld, x0, y0);
-
-   attribs_init(bld);
-
-   bld->xstep = xstep;
-   bld->ystep = ystep;
-   /* scale the gradients only now: attribs_init() above needed them unscaled */
-   coeffs_update(bld);
-}
-
-
-/**
- * Step the x/y position first, then every non-constant attribute, by xstep and ystep.
- */
-void
-lp_build_interp_soa_update(struct lp_build_interp_soa_context *bld)
-{
-   pos_update(bld);
-
-   attribs_update(bld);
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.h b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
deleted file mode 100644
index 9c57a10879..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * Position and shader input interpolation.
- *
- * Special attention is given to the interpolation of side by side quads.
- * Multiplications are made only for the first quad. Interpolation of
- * inputs for posterior quads are done exclusively with additions, and
- * perspective divide if necessary.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-#ifndef LP_BLD_INTERP_H
-#define LP_BLD_INTERP_H
-
-
-#include <llvm-c/Core.h>
-
-#include "tgsi/tgsi_exec.h"
-
-#include "lp_bld_type.h"
-
-
-struct tgsi_token;
-
-
-struct lp_build_interp_soa_context
-{
-   struct lp_build_context base;
-   /* Slot 0 is the position; shader input N uses slot 1 + N */
-   unsigned num_attribs;
-   unsigned mask[1 + PIPE_MAX_SHADER_INPUTS];   /* enabled channels, tested as (1 << chan) */
-   unsigned mode[1 + PIPE_MAX_SHADER_INPUTS];   /* a TGSI_INTERPOLATE_x value per slot */
-   /* Per-channel coefficients, loaded and broadcast to vectors by coeffs_init() */
-   LLVMValueRef a0  [1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
-   LLVMValueRef dadx[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
-   LLVMValueRef dady[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
-   /* Steps applied by each lp_build_interp_soa_update() call */
-   int xstep;
-   int ystep;
-
-   /* Attribute values before perspective divide */
-   LLVMValueRef attribs_pre[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
-   /* Current attribute values, after the perspective divide where one applies */
-   LLVMValueRef attribs[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
-
-   /*
-    * Convenience pointers into attribs[] (slot 0 and slot 1). Callers may access these.
-    */
-   const LLVMValueRef *pos;
-   const LLVMValueRef (*inputs)[NUM_CHANNELS];
-};
-
-/* Set up bld for the position and the inputs declared in tokens; x0/y0 are the initial x/y. */
-void
-lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
-                         const struct tgsi_token *tokens,
-                         LLVMBuilderRef builder,
-                         struct lp_type type,
-                         LLVMValueRef a0_ptr,
-                         LLVMValueRef dadx_ptr,
-                         LLVMValueRef dady_ptr,
-                         LLVMValueRef x0,
-                         LLVMValueRef y0,
-                         int xstep,
-                         int ystep);
-/* Advance the position and the interpolated inputs by xstep and ystep. */
-void
-lp_build_interp_soa_update(struct lp_build_interp_soa_context *bld);
-
-
-#endif /* LP_BLD_INTERP_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_intr.c b/src/gallium/drivers/llvmpipe/lp_bld_intr.c
deleted file mode 100644
index 9895749d56..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_intr.c
+++ /dev/null
@@ -1,192 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-/**
- * @file
- * Helpers for emitting intrinsic calls.
- *
- * LLVM vanilla IR doesn't represent all basic arithmetic operations we care
- * about, and it is often necessary to resort to target-specific intrinsics for
- * performance or convenience.
- *
- * Ideally we would like to stay away from target specific intrinsics and
- * move all the instruction selection logic into upstream LLVM where it belongs.
- *
- * These functions are also used for calling C functions provided by us from
- * generated LLVM code.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#include "util/u_debug.h"
-
-#include "lp_bld_intr.h"
-
-/* Add an external, C-calling-convention declaration called name to module. */
-LLVMValueRef
-lp_declare_intrinsic(LLVMModuleRef module,
-                     const char *name,
-                     LLVMTypeRef ret_type,
-                     LLVMTypeRef *arg_types,
-                     unsigned num_args)
-{
-   static const char llvm_prefix[] = "llvm.";
-   LLVMValueRef function;
-   unsigned i;
-
-   /* the name must not already exist in the module */
-   assert(!LLVMGetNamedFunction(module, name));
-
-   function = LLVMAddFunction(module, name,
-                              LLVMFunctionType(ret_type, arg_types, num_args, 0));
-   LLVMSetFunctionCallConv(function, LLVMCCallConv);
-   LLVMSetLinkage(function, LLVMExternalLinkage);
-
-   assert(LLVMIsDeclaration(function));
-
-   /* names starting with "llvm." must map to a non-zero intrinsic ID */
-   for(i = 0; llvm_prefix[i] && name[i] == llvm_prefix[i]; ++i)
-      ;
-   if(!llvm_prefix[i])
-      assert(LLVMGetIntrinsicID(function));
-
-   return function;
-}
-
-/* Emit a call to the function called name, declaring it in the module on first use. */
-LLVMValueRef
-lp_build_intrinsic(LLVMBuilderRef builder,
-                   const char *name,
-                   LLVMTypeRef ret_type,
-                   LLVMValueRef *args,
-                   unsigned num_args)
-{
-   LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
-   LLVMModuleRef mod = LLVMGetGlobalParent(LLVMGetBasicBlockParent(block));
-   LLVMValueRef callee = LLVMGetNamedFunction(mod, name);
-
-   if(callee == NULL) {
-      /* Not declared in this module yet: derive the signature from args */
-      LLVMTypeRef arg_types[LP_MAX_FUNC_ARGS];
-      unsigned i;
-
-      assert(num_args <= LP_MAX_FUNC_ARGS);
-      for(i = 0; i < num_args; ++i) {
-         assert(args[i]);
-         arg_types[i] = LLVMTypeOf(args[i]);
-      }
-
-      callee = lp_declare_intrinsic(mod, name, ret_type, arg_types, num_args);
-   }
-
-   return LLVMBuildCall(builder, callee, args, num_args, "");
-}
-
-/* Convenience wrapper: call the function called name with the single argument a. */
-LLVMValueRef
-lp_build_intrinsic_unary(LLVMBuilderRef builder,
-                         const char *name,
-                         LLVMTypeRef ret_type,
-                         LLVMValueRef a)
-{
-   return lp_build_intrinsic(builder, name, ret_type, &a, 1);
-}
-
-/* Convenience wrapper: call the function called name with the two arguments a and b. */
-LLVMValueRef
-lp_build_intrinsic_binary(LLVMBuilderRef builder,
-                          const char *name,
-                          LLVMTypeRef ret_type,
-                          LLVMValueRef a,
-                          LLVMValueRef b)
-{
-   LLVMValueRef args[2];
-
-   args[0] = a;
-   args[1] = b;
-
-   return lp_build_intrinsic(builder, name, ret_type, args, 2);
-}
-
-/* Call name once per element of the vector arguments and gather the results in a vector. */
-LLVMValueRef
-lp_build_intrinsic_map(LLVMBuilderRef builder,
-                       const char *name,
-                       LLVMTypeRef ret_type,
-                       LLVMValueRef *args,
-                       unsigned num_args)
-{
-   const LLVMTypeRef elem_type = LLVMGetElementType(ret_type);
-   const unsigned length = LLVMGetVectorSize(ret_type);
-   LLVMValueRef result = LLVMGetUndef(ret_type);
-   unsigned elem, arg;
-
-   assert(num_args <= LP_MAX_FUNC_ARGS);
-   for(elem = 0; elem < length; ++elem) {
-      LLVMValueRef lane = LLVMConstInt(LLVMInt32Type(), elem, 0);
-      LLVMValueRef scalars[LP_MAX_FUNC_ARGS];
-      LLVMValueRef scalar_res;
-
-      /* pull this element out of every argument, call, then insert the result */
-      for(arg = 0; arg < num_args; ++arg)
-         scalars[arg] = LLVMBuildExtractElement(builder, args[arg], lane, "");
-      scalar_res = lp_build_intrinsic(builder, name, elem_type, scalars, num_args);
-      result = LLVMBuildInsertElement(builder, result, scalar_res, lane, "");
-   }
-
-   return result;
-}
-
-/* Convenience wrapper: lp_build_intrinsic_map() with the single vector argument a. */
-LLVMValueRef
-lp_build_intrinsic_map_unary(LLVMBuilderRef builder,
-                             const char *name,
-                             LLVMTypeRef ret_type,
-                             LLVMValueRef a)
-{
-   return lp_build_intrinsic_map(builder, name, ret_type, &a, 1);
-}
-
-/* Convenience wrapper: lp_build_intrinsic_map() with the two vector arguments a and b. */
-LLVMValueRef
-lp_build_intrinsic_map_binary(LLVMBuilderRef builder,
-                              const char *name,
-                              LLVMTypeRef ret_type,
-                              LLVMValueRef a,
-                              LLVMValueRef b)
-{
-   LLVMValueRef args[2];
-
-   args[0] = a;
-   args[1] = b;
-
-   return lp_build_intrinsic_map(builder, name, ret_type, args, 2);
-}
-
-
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_intr.h b/src/gallium/drivers/llvmpipe/lp_bld_intr.h
deleted file mode 100644
index f813f27074..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_intr.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * Helper functions for calling intrinsics.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#ifndef LP_BLD_INTR_H
-#define LP_BLD_INTR_H
-
-
-#include <llvm-c/Core.h>  
-
-
-/**
- * Max number of arguments in an intrinsic.
- */
-#define LP_MAX_FUNC_ARGS 32
-
-/* Declare an external function called name in module; the name must not exist yet. */
-LLVMValueRef
-lp_declare_intrinsic(LLVMModuleRef module,
-                     const char *name,
-                     LLVMTypeRef ret_type,
-                     LLVMTypeRef *arg_types,
-                     unsigned num_args);
-/* Call the function called name, declaring it first if the module lacks it. */
-LLVMValueRef
-lp_build_intrinsic(LLVMBuilderRef builder,
-                   const char *name,
-                   LLVMTypeRef ret_type,
-                   LLVMValueRef *args,
-                   unsigned num_args);
-
-/* lp_build_intrinsic() with exactly one argument. */
-LLVMValueRef
-lp_build_intrinsic_unary(LLVMBuilderRef builder,
-                         const char *name,
-                         LLVMTypeRef ret_type,
-                         LLVMValueRef a);
-
-/* lp_build_intrinsic() with exactly two arguments. */
-LLVMValueRef
-lp_build_intrinsic_binary(LLVMBuilderRef builder,
-                          const char *name,
-                          LLVMTypeRef ret_type,
-                          LLVMValueRef a,
-                          LLVMValueRef b);
-
-/* Call name once per vector element of args and gather the results in a vector of ret_type. */
-LLVMValueRef
-lp_build_intrinsic_map(LLVMBuilderRef builder,
-                       const char *name,
-                       LLVMTypeRef ret_type,
-                       LLVMValueRef *args,
-                       unsigned num_args);
-
-/* lp_build_intrinsic_map() with exactly one vector argument. */
-LLVMValueRef
-lp_build_intrinsic_map_unary(LLVMBuilderRef builder,
-                             const char *name,
-                             LLVMTypeRef ret_type,
-                             LLVMValueRef a);
-
-/* lp_build_intrinsic_map() with exactly two vector arguments. */
-LLVMValueRef
-lp_build_intrinsic_map_binary(LLVMBuilderRef builder,
-                              const char *name,
-                              LLVMTypeRef ret_type,
-                              LLVMValueRef a,
-                              LLVMValueRef b);
-
-
-#endif /* !LP_BLD_INTR_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_logic.c b/src/gallium/drivers/llvmpipe/lp_bld_logic.c
deleted file mode 100644
index db22a8028a..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_logic.c
+++ /dev/null
@@ -1,396 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * Helper functions for logical operations.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#include "util/u_cpu_detect.h"
-
-#include "lp_bld_type.h"
-#include "lp_bld_const.h"
-#include "lp_bld_intr.h"
-#include "lp_bld_logic.h"
-
-
-/**
- * Build a comparison between two values of the build context's type.
- *
- * @param bld   build context; bld->type describes the operands
- * @param func  comparison function, one of the PIPE_FUNC_x values
- * @param a     left operand
- * @param b     right operand
- *
- * @return a mask value: the all-zeros / all-ones constant of
- * lp_build_int_vec_type(bld->type) for PIPE_FUNC_NEVER / PIPE_FUNC_ALWAYS,
- * otherwise the result of the comparison built below. bld->undef is
- * returned (after asserting) for an unhandled function or element width.
- *
- * On x86, 128-bit vectors use the SSE/SSE2 compare intrinsics; every other
- * case falls back to a slow element-by-element comparison.
- */
-LLVMValueRef
-lp_build_cmp(struct lp_build_context *bld,
-             unsigned func,
-             LLVMValueRef a,
-             LLVMValueRef b)
-{
-   const struct lp_type type = bld->type;
-   LLVMTypeRef vec_type = lp_build_vec_type(type);
-   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
-   LLVMValueRef zeros = LLVMConstNull(int_vec_type);
-   LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);
-   LLVMValueRef cond;
-   LLVMValueRef res;
-   unsigned i;
-
-   if(func == PIPE_FUNC_NEVER)
-      return zeros;
-   if(func == PIPE_FUNC_ALWAYS)
-      return ones;
-
-   /* TODO: optimize the constant case */
-
-   /* XXX: It is not clear if we should use the ordered or unordered operators */
-
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(type.width * type.length == 128) {
-      if(type.floating && util_cpu_caps.has_sse) {
-         LLVMValueRef args[3];
-         unsigned cc;
-         boolean swap;
-
-         /* 'cc' is the immediate predicate passed to the cmp.ps intrinsic.
-          * GREATER/GEQUAL are expressed as LESS/LEQUAL with the operands
-          * swapped.
-          */
-         swap = FALSE;
-         switch(func) {
-         case PIPE_FUNC_EQUAL:
-            cc = 0;
-            break;
-         case PIPE_FUNC_NOTEQUAL:
-            cc = 4;
-            break;
-         case PIPE_FUNC_LESS:
-            cc = 1;
-            break;
-         case PIPE_FUNC_LEQUAL:
-            cc = 2;
-            break;
-         case PIPE_FUNC_GREATER:
-            cc = 1;
-            swap = TRUE;
-            break;
-         case PIPE_FUNC_GEQUAL:
-            cc = 2;
-            swap = TRUE;
-            break;
-         default:
-            assert(0);
-            return bld->undef;
-         }
-
-         if(swap) {
-            args[0] = b;
-            args[1] = a;
-         }
-         else {
-            args[0] = a;
-            args[1] = b;
-         }
-
-         args[2] = LLVMConstInt(LLVMInt8Type(), cc, 0);
-         res = lp_build_intrinsic(bld->builder,
-                                  "llvm.x86.sse.cmp.ps",
-                                  vec_type,
-                                  args, 3);
-         res = LLVMBuildBitCast(bld->builder, res, int_vec_type, "");
-         return res;
-      }
-      else if(util_cpu_caps.has_sse2) {
-         /* How to build each PIPE_FUNC_x out of PCMPEQ / PCMPGT:
-          * optionally swap the operands, apply one of the two compares,
-          * and optionally negate the result.
-          */
-         static const struct {
-            unsigned swap:1;
-            unsigned eq:1;
-            unsigned gt:1;
-            unsigned not:1;
-         } table[] = {
-            {0, 0, 0, 1}, /* PIPE_FUNC_NEVER */
-            {1, 0, 1, 0}, /* PIPE_FUNC_LESS */
-            {0, 1, 0, 0}, /* PIPE_FUNC_EQUAL */
-            {0, 0, 1, 1}, /* PIPE_FUNC_LEQUAL */
-            {0, 0, 1, 0}, /* PIPE_FUNC_GREATER */
-            {0, 1, 0, 1}, /* PIPE_FUNC_NOTEQUAL */
-            {1, 0, 1, 1}, /* PIPE_FUNC_GEQUAL */
-            {0, 0, 0, 0}  /* PIPE_FUNC_ALWAYS */
-         };
-         const char *pcmpeq;
-         const char *pcmpgt;
-         LLVMValueRef args[2];
-
-         /* 'func' indexes the table directly; reject out-of-range values */
-         assert(func < sizeof(table)/sizeof(table[0]));
-
-         switch (type.width) {
-         case 8:
-            pcmpeq = "llvm.x86.sse2.pcmpeq.b";
-            pcmpgt = "llvm.x86.sse2.pcmpgt.b";
-            break;
-         case 16:
-            pcmpeq = "llvm.x86.sse2.pcmpeq.w";
-            pcmpgt = "llvm.x86.sse2.pcmpgt.w";
-            break;
-         case 32:
-            pcmpeq = "llvm.x86.sse2.pcmpeq.d";
-            pcmpgt = "llvm.x86.sse2.pcmpgt.d";
-            break;
-         default:
-            assert(0);
-            return bld->undef;
-         }
-
-         /* PCMPGTB/PCMPGTW/PCMPGTD all compare their operands as signed
-          * integers; there are no unsigned variants. For unsigned types,
-          * flip the sign bit of both operands so that the signed comparison
-          * yields the unsigned ordering.
-          */
-         if(table[func].gt && !type.sign) {
-            LLVMValueRef msb = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
-            a = LLVMBuildXor(bld->builder, a, msb, "");
-            b = LLVMBuildXor(bld->builder, b, msb, "");
-         }
-
-         if(table[func].swap) {
-            args[0] = b;
-            args[1] = a;
-         }
-         else {
-            args[0] = a;
-            args[1] = b;
-         }
-
-         if(table[func].eq)
-            res = lp_build_intrinsic(bld->builder, pcmpeq, vec_type, args, 2);
-         else if (table[func].gt)
-            res = lp_build_intrinsic(bld->builder, pcmpgt, vec_type, args, 2);
-         else
-            res = LLVMConstNull(vec_type);
-
-         if(table[func].not)
-            res = LLVMBuildNot(bld->builder, res, "");
-
-         return res;
-      }
-   }
-#endif
-
-   if(type.floating) {
-      LLVMRealPredicate op;
-      switch(func) {
-      case PIPE_FUNC_NEVER:
-         op = LLVMRealPredicateFalse;
-         break;
-      case PIPE_FUNC_ALWAYS:
-         op = LLVMRealPredicateTrue;
-         break;
-      case PIPE_FUNC_EQUAL:
-         op = LLVMRealUEQ;
-         break;
-      case PIPE_FUNC_NOTEQUAL:
-         op = LLVMRealUNE;
-         break;
-      case PIPE_FUNC_LESS:
-         op = LLVMRealULT;
-         break;
-      case PIPE_FUNC_LEQUAL:
-         op = LLVMRealULE;
-         break;
-      case PIPE_FUNC_GREATER:
-         op = LLVMRealUGT;
-         break;
-      case PIPE_FUNC_GEQUAL:
-         op = LLVMRealUGE;
-         break;
-      default:
-         assert(0);
-         return bld->undef;
-      }
-
-#if 0
-      /* XXX: Although valid IR, no LLVM target currently support this */
-      cond = LLVMBuildFCmp(bld->builder, op, a, b, "");
-      res = LLVMBuildSelect(bld->builder, cond, ones, zeros, "");
-#else
-      debug_printf("%s: warning: using slow element-wise vector comparison\n",
-                   __FUNCTION__);
-      /* Compare one element at a time and assemble the mask vector */
-      res = LLVMGetUndef(int_vec_type);
-      for(i = 0; i < type.length; ++i) {
-         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-         cond = LLVMBuildFCmp(bld->builder, op,
-                              LLVMBuildExtractElement(bld->builder, a, index, ""),
-                              LLVMBuildExtractElement(bld->builder, b, index, ""),
-                              "");
-         cond = LLVMBuildSelect(bld->builder, cond,
-                                LLVMConstExtractElement(ones, index),
-                                LLVMConstExtractElement(zeros, index),
-                                "");
-         res = LLVMBuildInsertElement(bld->builder, res, cond, index, "");
-      }
-#endif
-   }
-   else {
-      LLVMIntPredicate op;
-      switch(func) {
-      case PIPE_FUNC_EQUAL:
-         op = LLVMIntEQ;
-         break;
-      case PIPE_FUNC_NOTEQUAL:
-         op = LLVMIntNE;
-         break;
-      case PIPE_FUNC_LESS:
-         op = type.sign ? LLVMIntSLT : LLVMIntULT;
-         break;
-      case PIPE_FUNC_LEQUAL:
-         op = type.sign ? LLVMIntSLE : LLVMIntULE;
-         break;
-      case PIPE_FUNC_GREATER:
-         op = type.sign ? LLVMIntSGT : LLVMIntUGT;
-         break;
-      case PIPE_FUNC_GEQUAL:
-         op = type.sign ? LLVMIntSGE : LLVMIntUGE;
-         break;
-      default:
-         assert(0);
-         return bld->undef;
-      }
-
-#if 0
-      /* XXX: Although valid IR, no LLVM target currently support this */
-      cond = LLVMBuildICmp(bld->builder, op, a, b, "");
-      res = LLVMBuildSelect(bld->builder, cond, ones, zeros, "");
-#else
-      debug_printf("%s: warning: using slow element-wise vector comparison\n",
-                   __FUNCTION__);
-      /* Compare one element at a time and assemble the mask vector */
-      res = LLVMGetUndef(int_vec_type);
-      for(i = 0; i < type.length; ++i) {
-         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-         cond = LLVMBuildICmp(bld->builder, op,
-                              LLVMBuildExtractElement(bld->builder, a, index, ""),
-                              LLVMBuildExtractElement(bld->builder, b, index, ""),
-                              "");
-         cond = LLVMBuildSelect(bld->builder, cond,
-                                LLVMConstExtractElement(ones, index),
-                                LLVMConstExtractElement(zeros, index),
-                                "");
-         res = LLVMBuildInsertElement(bld->builder, res, cond, index, "");
-      }
-#endif
-   }
-
-   return res;
-}
-
-
-/**
- * Bitwise select: the result takes its bits from 'a' where 'mask' is set
- * and from 'b' where it is clear, i.e. (a & mask) | (b & ~mask).
- *
- * Floating point operands are bitcast to the integer vector type for the
- * bitwise arithmetic and the result is bitcast back to the vector type of
- * bld->type.
- */
-LLVMValueRef
-lp_build_select(struct lp_build_context *bld,
-                LLVMValueRef mask,
-                LLVMValueRef a,
-                LLVMValueRef b)
-{
-   const struct lp_type type = bld->type;
-   LLVMBuilderRef builder = bld->builder;
-   LLVMValueRef a_bits = a;
-   LLVMValueRef b_bits = b;
-   LLVMValueRef not_mask;
-   LLVMValueRef merged;
-
-   /* Both choices are the same value: nothing to build */
-   if(a == b)
-      return a;
-
-   if(type.floating) {
-      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
-      a_bits = LLVMBuildBitCast(builder, a, int_vec_type, "");
-      b_bits = LLVMBuildBitCast(builder, b, int_vec_type, "");
-   }
-
-   a_bits = LLVMBuildAnd(builder, a_bits, mask, "");
-
-   /* The AND with the negated mask often becomes a PANDN, though the
-    * negated mask is sometimes pre-computed and kept in another constant.
-    * Which is better depends on register availability, so leave the choice
-    * to LLVM.
-    */
-   not_mask = LLVMBuildNot(builder, mask, "");
-   b_bits = LLVMBuildAnd(builder, b_bits, not_mask, "");
-
-   merged = LLVMBuildOr(builder, a_bits, b_bits, "");
-
-   if(!type.floating)
-      return merged;
-
-   return LLVMBuildBitCast(builder, merged, lp_build_vec_type(type), "");
-}
-
-
-/**
- * Per-channel select between two AoS vectors.
- *
- * cond[i] (i = 0..3) chooses 'a' when true and 'b' when false for channel i.
- * Trivial cases (a == b, all channels from one operand, an undefined
- * operand) are resolved without emitting any code.
- *
- * Vectors of up to four elements are combined with a shuffle; longer ones
- * go through the constant mask from lp_build_const_mask_aos() and
- * lp_build_select(). A vector select would be a third option, but select
- * is not supported for vector types yet. The length threshold between the
- * two strategies is empirical and might need to be adjusted.
- */
-LLVMValueRef
-lp_build_select_aos(struct lp_build_context *bld,
-                    LLVMValueRef a,
-                    LLVMValueRef b,
-                    const boolean cond[4])
-{
-   const struct lp_type type = bld->type;
-   const unsigned n = type.length;
-
-   if(a == b)
-      return a;
-   if(cond[0] && cond[1] && cond[2] && cond[3])
-      return a;
-   if(!(cond[0] || cond[1] || cond[2] || cond[3]))
-      return b;
-   if(a == bld->undef || b == bld->undef)
-      return bld->undef;
-
-   if (n > 4) {
-      /*
-       * Bit mask.
-       */
-      LLVMValueRef mask = lp_build_const_mask_aos(type, cond);
-      return lp_build_select(bld, mask, a, b);
-   }
-   else {
-      /*
-       * Shuffle: indices below n pick from 'a', indices from n up pick
-       * from 'b'.
-       */
-      LLVMTypeRef elem_type = LLVMInt32Type();
-      LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
-      unsigned base, chan;
-
-      for(base = 0; base < n; base += 4) {
-         for(chan = 0; chan < 4; ++chan) {
-            const unsigned origin = cond[chan] ? 0 : n;
-            shuffles[base + chan] = LLVMConstInt(elem_type, origin + base + chan, 0);
-         }
-      }
-
-      return LLVMBuildShuffleVector(bld->builder, a, b, LLVMConstVector(shuffles, n), "");
-   }
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_misc.cpp b/src/gallium/drivers/llvmpipe/lp_bld_misc.cpp
deleted file mode 100644
index 6e79438ead..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_misc.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-#include "pipe/p_config.h"
-
-#include "lp_bld_misc.h"
-
-
-/*
- * Fallback definitions, compiled only when LLVM_NATIVE_ARCH is not defined.
- * NOTE(review): presumably LLVM versions that define LLVM_NATIVE_ARCH
- * already provide these two entry points -- confirm against lp_bld_misc.h.
- */
-#ifndef LLVM_NATIVE_ARCH
-
-namespace llvm {
-   extern void LinkInJIT();
-}
-
-
-/**
- * C-callable wrapper that forwards to the C++ function llvm::LinkInJIT().
- */
-void
-LLVMLinkInJIT(void)
-{
-   llvm::LinkInJIT();
-}
-
-
-/* Symbol defined outside this file; it is only written to below. */
-extern "C" int X86TargetMachineModule;
-
-
-/**
- * On x86 / x86-64 builds, store 1 into X86TargetMachineModule.
- * NOTE(review): the store presumably exists only to force the X86 target
- * to be linked in -- confirm.
- *
- * @return always 0.
- */
-int
-LLVMInitializeNativeTarget(void)
-{
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   X86TargetMachineModule = 1;
-#endif
-   return 0;
-}
-
-
-#endif
-
-
-/* 
- * Hack to allow the linking of release LLVM static libraries on a debug build.
- *
- * For MSVC debug builds (_MSC_VER and _DEBUG both defined) this supplies an
- * empty definition of _invalid_parameter_noinfo().
- *
- * See also:
- * - http://social.msdn.microsoft.com/Forums/en-US/vclanguage/thread/7234ea2b-0042-42ed-b4e2-5d8644dfb57d
- */
-#if defined(_MSC_VER) && defined(_DEBUG)
-#include <crtdefs.h>
-extern "C" {
-   _CRTIMP void __cdecl _invalid_parameter_noinfo(void) {}
-}
-#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_pack.c b/src/gallium/drivers/llvmpipe/lp_bld_pack.c
deleted file mode 100644
index bc360ad77a..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_pack.c
+++ /dev/null
@@ -1,418 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-/**
- * @file
- * Helper functions for packing/unpacking.
- *
- * Pack/unpacking is necessary for conversion between types of different
- * bit width.
- *
- * They are also commonly used when a computation needs higher
- * precision for the intermediate values. For example, if one needs the
- * function:
- *
- *   c = compute(a, b);
- *
- * to use more precision for intermediate results then one should implement it
- * as:
- *
- *   LLVMValueRef
- *   compute(LLVMBuilderRef builder, struct lp_type type, LLVMValueRef a, LLVMValueRef b)
- *   {
- *      struct lp_type wide_type = lp_wider_type(type);
- *      LLVMValueRef al, ah, bl, bh, cl, ch, c;
- *
- *      lp_build_unpack2(builder, type, wide_type, a, &al, &ah);
- *      lp_build_unpack2(builder, type, wide_type, b, &bl, &bh);
- *
- *      cl = compute_half(al, bl);
- *      ch = compute_half(ah, bh);
- *
- *      c = lp_build_pack2(builder, wide_type, type, cl, ch);
- *
- *      return c;
- *   }
- *
- * where compute_half() would do the computation for half the elements with
- * twice the precision.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#include "util/u_debug.h"
-#include "util/u_math.h"
-#include "util/u_cpu_detect.h"
-
-#include "lp_bld_type.h"
-#include "lp_bld_const.h"
-#include "lp_bld_intr.h"
-#include "lp_bld_arit.h"
-#include "lp_bld_pack.h"
-
-
-/**
- * Build the constant shuffle mask that interleaves two n-element vectors
- * like the PUNPCKLxx and PUNPCKHxx instructions.
- *
- * Output elements alternate between index j (first vector) and index n + j
- * (second vector), with j starting at lo_hi*n/2. lo_hi must be 0 or 1.
- */
-static LLVMValueRef
-lp_build_const_unpack_shuffle(unsigned n, unsigned lo_hi)
-{
-   LLVMValueRef indices[LP_MAX_VECTOR_LENGTH];
-   unsigned out;
-   unsigned in;
-
-   assert(n <= LP_MAX_VECTOR_LENGTH);
-   assert(lo_hi < 2);
-
-   /* TODO: cache results in a static table */
-
-   in = lo_hi*n/2;
-   for(out = 0; out < n; out += 2) {
-      indices[out] = LLVMConstInt(LLVMInt32Type(), in, 0);
-      indices[out + 1] = LLVMConstInt(LLVMInt32Type(), n + in, 0);
-      ++in;
-   }
-
-   return LLVMConstVector(indices, n);
-}
-
-
-/**
- * Build the constant shuffle mask that matches the PACKxx instructions:
- * n indices selecting every even element (0, 2, 4, ...).
- */
-static LLVMValueRef
-lp_build_const_pack_shuffle(unsigned n)
-{
-   LLVMValueRef indices[LP_MAX_VECTOR_LENGTH];
-   unsigned out;
-   unsigned even;
-
-   assert(n <= LP_MAX_VECTOR_LENGTH);
-
-   /* TODO: cache results in a static table */
-
-   even = 0;
-   for(out = 0; out < n; ++out) {
-      indices[out] = LLVMConstInt(LLVMInt32Type(), even, 0);
-      even += 2;
-   }
-
-   return LLVMConstVector(indices, n);
-}
-
-
-/**
- * Interleave the elements of two vectors.
- *
- * Emits a shuffle of 'a' and 'b' using the mask from
- * lp_build_const_unpack_shuffle(type.length, lo_hi), which matches the
- * PUNPCKLxx and PUNPCKHxx SSE instructions.
- */
-LLVMValueRef
-lp_build_interleave2(LLVMBuilderRef builder,
-                     struct lp_type type,
-                     LLVMValueRef a,
-                     LLVMValueRef b,
-                     unsigned lo_hi)
-{
-   return LLVMBuildShuffleVector(builder, a, b,
-                                 lp_build_const_unpack_shuffle(type.length, lo_hi),
-                                 "");
-}
-
-
-/**
- * Double the bit width of integer elements.
- *
- * Only the number of bits used to represent the values changes, not the
- * values themselves. The source vector is split in two halves, returned
- * through dst_lo and dst_hi, each with half as many elements of twice the
- * width.
- */
-void
-lp_build_unpack2(LLVMBuilderRef builder,
-                 struct lp_type src_type,
-                 struct lp_type dst_type,
-                 LLVMValueRef src,
-                 LLVMValueRef *dst_lo,
-                 LLVMValueRef *dst_hi)
-{
-   LLVMValueRef ext;
-   LLVMValueRef first, second;
-   LLVMValueRef lo, hi;
-   LLVMTypeRef wide_vec_type;
-
-   assert(!src_type.floating);
-   assert(!dst_type.floating);
-   assert(dst_type.width == src_type.width * 2);
-   assert(dst_type.length * 2 == src_type.length);
-
-   /* Bits that will fill the upper half of every widened element */
-   if(dst_type.sign && src_type.sign)
-      /* Sign extension: replicate the sign bit */
-      ext = LLVMBuildAShr(builder, src, lp_build_int_const_scalar(src_type, src_type.width - 1), "");
-   else
-      /* Zero extension */
-      ext = lp_build_zero(src_type);
-
-   /* The value bits go first on little endian, second otherwise */
-   if(util_cpu_caps.little_endian) {
-      first = src;
-      second = ext;
-   }
-   else {
-      first = ext;
-      second = src;
-   }
-
-   lo = lp_build_interleave2(builder, src_type, first, second, 0);
-   hi = lp_build_interleave2(builder, src_type, first, second, 1);
-
-   /* Reinterpret the interleaved pairs as elements twice as wide */
-   wide_vec_type = lp_build_vec_type(dst_type);
-
-   *dst_lo = LLVMBuildBitCast(builder, lo, wide_vec_type, "");
-   *dst_hi = LLVMBuildBitCast(builder, hi, wide_vec_type, "");
-}
-
-
-/**
- * Expand the bit width.
- *
- * Only the number of bits used to represent the values changes, not the
- * values themselves. The width is doubled with lp_build_unpack2() until it
- * reaches dst_type.width; each step doubles the number of vectors, which
- * end up in dst[0 .. num_dsts-1].
- */
-void
-lp_build_unpack(LLVMBuilderRef builder,
-                struct lp_type src_type,
-                struct lp_type dst_type,
-                LLVMValueRef src,
-                LLVMValueRef *dst, unsigned num_dsts)
-{
-   unsigned count = 1;
-
-   /* Register width must remain constant */
-   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
-
-   /* We must not lose or gain channels. Only precision */
-   assert(src_type.length == dst_type.length * num_dsts);
-
-   dst[0] = src;
-
-   for(; src_type.width < dst_type.width; count *= 2) {
-      struct lp_type wider = src_type;
-      unsigned k = count;
-
-      wider.width *= 2;
-      wider.length /= 2;
-
-      /* Expand in place, last vector first, so that entries still to be
-       * read are not overwritten by the two outputs of an earlier one.
-       */
-      while(k > 0) {
-         --k;
-         lp_build_unpack2(builder, src_type, wider, dst[k], &dst[2*k + 0], &dst[2*k + 1]);
-      }
-
-      src_type = wider;
-   }
-
-   assert(count == num_dsts);
-}
-
-
-/**
- * Non-interleaved pack.
- *
- * This will move values as
- *
- *   lo =   __ l0 __ l1 __ l2 __..  __ ln
- *   hi =   __ h0 __ h1 __ h2 __..  __ hn
- *   res =  l0 l1 l2 .. ln h0 h1 h2 .. hn
- *
- * This will only change the number of bits the values are represented, not the
- * values themselves.
- *
- * It is assumed the values are already clamped into the destination type range.
- * Values outside that range will produce undefined results. Use
- * lp_build_packs2 instead.
- *
- * With SSE2 and 128-bit vectors the PACKxx intrinsics are used. An unsigned
- * 32 -> 16 bit pack needs PACKUSDW, which requires SSE4.1; without it that
- * case uses the generic shuffle, as every non-SSE2 case does.
- */
-LLVMValueRef
-lp_build_pack2(LLVMBuilderRef builder,
-               struct lp_type src_type,
-               struct lp_type dst_type,
-               LLVMValueRef lo,
-               LLVMValueRef hi)
-{
-   LLVMTypeRef src_vec_type = lp_build_vec_type(src_type);
-   LLVMTypeRef dst_vec_type = lp_build_vec_type(dst_type);
-   LLVMValueRef shuffle;
-   LLVMValueRef res;
-   boolean use_sse;
-
-   assert(!src_type.floating);
-   assert(!dst_type.floating);
-   assert(src_type.width == dst_type.width * 2);
-   assert(src_type.length * 2 == dst_type.length);
-
-   use_sse = util_cpu_caps.has_sse2 && src_type.width * src_type.length == 128;
-
-   /* SSE2 has no unsigned dword pack; fall back to the generic shuffle
-    * instead of returning an undefined value.
-    */
-   if(use_sse && src_type.width == 32 && !dst_type.sign && !util_cpu_caps.has_sse4_1)
-      use_sse = FALSE;
-
-   if(use_sse) {
-      switch(src_type.width) {
-      case 32:
-         if(dst_type.sign) {
-            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", src_vec_type, lo, hi);
-         }
-         else {
-            /* PACKUSDW is the only instrinsic with a consistent signature */
-            return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi);
-         }
-         break;
-
-      case 16:
-         if(dst_type.sign)
-            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", src_vec_type, lo, hi);
-         else
-            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", src_vec_type, lo, hi);
-         break;
-
-      default:
-         assert(0);
-         return LLVMGetUndef(dst_vec_type);
-      }
-
-      /* These intrinsics are declared with the source vector type */
-      res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
-      return res;
-   }
-
-   /* Generic path: view both inputs as vectors of the narrow type and keep
-    * every other element.
-    */
-   lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
-   hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");
-
-   shuffle = lp_build_const_pack_shuffle(dst_type.length);
-
-   res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");
-
-   return res;
-}
-
-
-
-/**
- * Non-interleaved pack with saturation.
- *
- * Behaves like lp_build_pack2, except that the inputs are first limited to
- * the largest value representable in the destination type, unless the SSE2
- * pack instruction that will be used already saturates.
- */
-LLVMValueRef
-lp_build_packs2(LLVMBuilderRef builder,
-                struct lp_type src_type,
-                struct lp_type dst_type,
-                LLVMValueRef lo,
-                LLVMValueRef hi)
-{
-   /* All X86 SSE non-interleaved pack instructions take signed inputs and
-    * saturate them, so no need to clamp for those cases. */
-   const boolean sse_saturates = util_cpu_caps.has_sse2 &&
-                                 src_type.width * src_type.length == 128 &&
-                                 src_type.sign;
-
-   assert(!src_type.floating);
-   assert(!dst_type.floating);
-   assert(src_type.sign == dst_type.sign);
-   assert(src_type.width == dst_type.width * 2);
-   assert(src_type.length * 2 == dst_type.length);
-
-   if(!sse_saturates) {
-      struct lp_build_context bld;
-      /* A signed destination loses one bit to the sign */
-      unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
-      LLVMValueRef dst_max = lp_build_int_const_scalar(src_type, ((unsigned long long)1 << dst_bits) - 1);
-      lp_build_context_init(&bld, builder, src_type);
-      lo = lp_build_min(&bld, lo, dst_max);
-      hi = lp_build_min(&bld, hi, dst_max);
-      /* FIXME: What about lower bound? */
-   }
-
-   return lp_build_pack2(builder, src_type, dst_type, lo, hi);
-}
-
-
-/**
- * Truncate the bit width.
- *
- * Packs num_srcs vectors of src_type into a single vector of dst_type by
- * repeatedly halving the element width, combining the intermediate vectors
- * two at a time.
- *
- * @param clamped  when true the values are taken to be already within the
- *                 destination range and lp_build_pack2 is used; otherwise
- *                 lp_build_packs2 is used at every step.
- * @param src      the num_srcs source vectors; num_srcs must not exceed
- *                 LP_MAX_VECTOR_LENGTH.
- *
- * TODO: Handle saturation consistently.
- */
-LLVMValueRef
-lp_build_pack(LLVMBuilderRef builder,
-              struct lp_type src_type,
-              struct lp_type dst_type,
-              boolean clamped,
-              const LLVMValueRef *src, unsigned num_srcs)
-{
-   LLVMValueRef (*pack2)(LLVMBuilderRef builder,
-                         struct lp_type src_type,
-                         struct lp_type dst_type,
-                         LLVMValueRef lo,
-                         LLVMValueRef hi);
-   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
-   unsigned i;
-
-
-   /* Register width must remain constant */
-   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
-
-   /* We must not lose or gain channels. Only precision */
-   assert(src_type.length * num_srcs == dst_type.length);
-
-   /* The sources are copied into the fixed-size tmp[] array below */
-   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
-
-   if(clamped)
-      pack2 = &lp_build_pack2;
-   else
-      pack2 = &lp_build_packs2;
-
-   for(i = 0; i < num_srcs; ++i)
-      tmp[i] = src[i];
-
-   while(src_type.width > dst_type.width) {
-      struct lp_type tmp_type = src_type;
-
-      tmp_type.width /= 2;
-      tmp_type.length *= 2;
-
-      /* Take in consideration the sign changes only in the last step */
-      if(tmp_type.width == dst_type.width)
-         tmp_type.sign = dst_type.sign;
-
-      num_srcs /= 2;
-
-      /* Each pass packs adjacent pairs, halving the number of vectors */
-      for(i = 0; i < num_srcs; ++i)
-         tmp[i] = pack2(builder, src_type, tmp_type, tmp[2*i + 0], tmp[2*i + 1]);
-
-      src_type = tmp_type;
-   }
-
-   assert(num_srcs == 1);
-
-   return tmp[0];
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_pack.h b/src/gallium/drivers/llvmpipe/lp_bld_pack.h
deleted file mode 100644
index fb2a34984a..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_pack.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * Helper functions for packing/unpacking conversions.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#ifndef LP_BLD_PACK_H
-#define LP_BLD_PACK_H
-
-
-#include <llvm-c/Core.h>  
-
-
-struct lp_type;
-
-
-/**
- * Interleave the elements of 'a' and 'b'; lo_hi (0 or 1) selects which
- * half of the inputs is interleaved.
- */
-LLVMValueRef
-lp_build_interleave2(LLVMBuilderRef builder,
-                     struct lp_type type,
-                     LLVMValueRef a,
-                     LLVMValueRef b,
-                     unsigned lo_hi);
-
-
-/**
- * Double the bit width of an integer vector, returning the two resulting
- * vectors through dst_lo and dst_hi.
- */
-void
-lp_build_unpack2(LLVMBuilderRef builder,
-                 struct lp_type src_type,
-                 struct lp_type dst_type,
-                 LLVMValueRef src,
-                 LLVMValueRef *dst_lo,
-                 LLVMValueRef *dst_hi);
-
-
-/**
- * Expand the bit width from src_type to dst_type, writing the num_dsts
- * resulting vectors into dst[].
- */
-void
-lp_build_unpack(LLVMBuilderRef builder,
-                struct lp_type src_type,
-                struct lp_type dst_type,
-                LLVMValueRef src,
-                LLVMValueRef *dst, unsigned num_dsts);
-
-
-/**
- * Non-interleaved pack of 'lo' and 'hi' into elements of half the width,
- * saturating values to the destination maximum where needed.
- */
-LLVMValueRef
-lp_build_packs2(LLVMBuilderRef builder,
-                struct lp_type src_type,
-                struct lp_type dst_type,
-                LLVMValueRef lo,
-                LLVMValueRef hi);
-
-
-/**
- * Non-interleaved pack of 'lo' and 'hi' into elements of half the width;
- * the values are assumed to be already clamped to the destination range.
- */
-LLVMValueRef
-lp_build_pack2(LLVMBuilderRef builder,
-               struct lp_type src_type,
-               struct lp_type dst_type,
-               LLVMValueRef lo,
-               LLVMValueRef hi);
-
-
-/**
- * Truncate the bit width, packing num_srcs vectors of src_type into one
- * vector of dst_type. 'clamped' tells whether the values are already
- * within the destination range.
- */
-LLVMValueRef
-lp_build_pack(LLVMBuilderRef builder,
-              struct lp_type src_type,
-              struct lp_type dst_type,
-              boolean clamped,
-              const LLVMValueRef *src, unsigned num_srcs);
-
-
-#endif /* !LP_BLD_PACK_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_sample.c b/src/gallium/drivers/llvmpipe/lp_bld_sample.c
deleted file mode 100644
index 9003e108c1..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_sample.c
+++ /dev/null
@@ -1,190 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * Texture sampling -- common code.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-#include "pipe/p_defines.h"
-#include "pipe/p_state.h"
-#include "util/u_format.h"
-#include "util/u_math.h"
-#include "lp_bld_debug.h"
-#include "lp_bld_const.h"
-#include "lp_bld_arit.h"
-#include "lp_bld_type.h"
-#include "lp_bld_format.h"
-#include "lp_bld_sample.h"
-
-
-void
-lp_sampler_static_state(struct lp_sampler_static_state *state,
-                        const struct pipe_texture *texture,
-                        const struct pipe_sampler_state *sampler)
-{
-   memset(state, 0, sizeof *state);
-
-   if(!texture)
-      return;
-
-   if(!sampler)
-      return;
-
-   state->format            = texture->format;
-   state->target            = texture->target;
-   state->pot_width         = util_is_pot(texture->width0);
-   state->pot_height        = util_is_pot(texture->height0);
-   state->pot_depth         = util_is_pot(texture->depth0);
-
-   state->wrap_s            = sampler->wrap_s;
-   state->wrap_t            = sampler->wrap_t;
-   state->wrap_r            = sampler->wrap_r;
-   state->min_img_filter    = sampler->min_img_filter;
-   state->min_mip_filter    = sampler->min_mip_filter;
-   state->mag_img_filter    = sampler->mag_img_filter;
-   state->compare_mode      = sampler->compare_mode;
-   if(sampler->compare_mode != PIPE_TEX_COMPARE_NONE) {
-      state->compare_func      = sampler->compare_func;
-   }
-   state->normalized_coords = sampler->normalized_coords;
-   state->prefilter         = sampler->prefilter;
-}
-
-
-/**
- * Gather elements from scatter positions in memory into a single vector.
- *
- * @param src_width src element width
- * @param dst_width result element width (source will be expanded to fit)
- * @param length length of the offsets,
- * @param base_ptr base pointer, should be a i8 pointer type.
- * @param offsets vector with offsets
- */
-LLVMValueRef
-lp_build_gather(LLVMBuilderRef builder,
-                unsigned length,
-                unsigned src_width,
-                unsigned dst_width,
-                LLVMValueRef base_ptr,
-                LLVMValueRef offsets)
-{
-   LLVMTypeRef src_type = LLVMIntType(src_width);
-   LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
-   LLVMTypeRef dst_elem_type = LLVMIntType(dst_width);
-   LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length);
-   LLVMValueRef res;
-   unsigned i;
-
-   res = LLVMGetUndef(dst_vec_type);
-   for(i = 0; i < length; ++i) {
-      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-      LLVMValueRef elem_offset;
-      LLVMValueRef elem_ptr;
-      LLVMValueRef elem;
-
-      elem_offset = LLVMBuildExtractElement(builder, offsets, index, "");
-      elem_ptr = LLVMBuildGEP(builder, base_ptr, &elem_offset, 1, "");
-      elem_ptr = LLVMBuildBitCast(builder, elem_ptr, src_ptr_type, "");
-      elem = LLVMBuildLoad(builder, elem_ptr, "");
-
-      assert(src_width <= dst_width);
-      if(src_width > dst_width)
-         elem = LLVMBuildTrunc(builder, elem, dst_elem_type, "");
-      if(src_width < dst_width)
-         elem = LLVMBuildZExt(builder, elem, dst_elem_type, "");
-
-      res = LLVMBuildInsertElement(builder, res, elem, index, "");
-   }
-
-   return res;
-}
-
-
-/**
- * Compute the offset of a pixel.
- *
- * x, y, y_stride are vectors
- */
-LLVMValueRef
-lp_build_sample_offset(struct lp_build_context *bld,
-                       const struct util_format_description *format_desc,
-                       LLVMValueRef x,
-                       LLVMValueRef y,
-                       LLVMValueRef y_stride,
-                       LLVMValueRef data_ptr)
-{
-   LLVMValueRef x_stride;
-   LLVMValueRef offset;
-
-   x_stride = lp_build_const_scalar(bld->type, format_desc->block.bits/8);
-
-   if(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
-      LLVMValueRef x_lo, x_hi;
-      LLVMValueRef y_lo, y_hi;
-      LLVMValueRef x_stride_lo, x_stride_hi;
-      LLVMValueRef y_stride_lo, y_stride_hi;
-      LLVMValueRef x_offset_lo, x_offset_hi;
-      LLVMValueRef y_offset_lo, y_offset_hi;
-      LLVMValueRef offset_lo, offset_hi;
-
-      x_lo = LLVMBuildAnd(bld->builder, x, bld->one, "");
-      y_lo = LLVMBuildAnd(bld->builder, y, bld->one, "");
-
-      x_hi = LLVMBuildLShr(bld->builder, x, bld->one, "");
-      y_hi = LLVMBuildLShr(bld->builder, y, bld->one, "");
-
-      x_stride_lo = x_stride;
-      y_stride_lo = lp_build_const_scalar(bld->type, 2*format_desc->block.bits/8);
-
-      x_stride_hi = lp_build_const_scalar(bld->type, 4*format_desc->block.bits/8);
-      y_stride_hi = LLVMBuildShl(bld->builder, y_stride, bld->one, "");
-
-      x_offset_lo = lp_build_mul(bld, x_lo, x_stride_lo);
-      y_offset_lo = lp_build_mul(bld, y_lo, y_stride_lo);
-      offset_lo = lp_build_add(bld, x_offset_lo, y_offset_lo);
-
-      x_offset_hi = lp_build_mul(bld, x_hi, x_stride_hi);
-      y_offset_hi = lp_build_mul(bld, y_hi, y_stride_hi);
-      offset_hi = lp_build_add(bld, x_offset_hi, y_offset_hi);
-
-      offset = lp_build_add(bld, offset_hi, offset_lo);
-   }
-   else {
-      LLVMValueRef x_offset;
-      LLVMValueRef y_offset;
-
-      x_offset = lp_build_mul(bld, x, x_stride);
-      y_offset = lp_build_mul(bld, y, y_stride);
-
-      offset = lp_build_add(bld, x_offset, y_offset);
-   }
-
-   return offset;
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_sample.h b/src/gallium/drivers/llvmpipe/lp_bld_sample.h
deleted file mode 100644
index 8cb8210ca7..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_sample.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * Texture sampling.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-#ifndef LP_BLD_SAMPLE_H
-#define LP_BLD_SAMPLE_H
-
-
-#include <llvm-c/Core.h>
-
-struct pipe_texture;
-struct pipe_sampler_state;
-struct util_format_description;
-struct lp_type;
-struct lp_build_context;
-
-
-/**
- * Sampler static state.
- *
- * These are the bits of state from pipe_texture and pipe_sampler_state that
- * are embedded in the generated code.
- */
-struct lp_sampler_static_state
-{
-   /* pipe_texture's state */
-   enum pipe_format format;
-   unsigned target:2;
-   unsigned pot_width:1;
-   unsigned pot_height:1;
-   unsigned pot_depth:1;
-
-   /* pipe_sampler_state's state */
-   unsigned wrap_s:3;
-   unsigned wrap_t:3;
-   unsigned wrap_r:3;
-   unsigned min_img_filter:2;
-   unsigned min_mip_filter:2;
-   unsigned mag_img_filter:2;
-   unsigned compare_mode:1;
-   unsigned compare_func:3;
-   unsigned normalized_coords:1;
-   unsigned prefilter:4;
-};
-
-
-/**
- * Sampler dynamic state.
- *
- * These are the bits of state from pipe_texture and pipe_sampler_state that
- * are computed in runtime.
- *
- * There are obtained through callbacks, as we don't want to tie the texture
- * sampling code generation logic to any particular texture layout or pipe
- * driver.
- */
-struct lp_sampler_dynamic_state
-{
-
-   /** Obtain the base texture width. */
-   LLVMValueRef
-   (*width)( struct lp_sampler_dynamic_state *state,
-             LLVMBuilderRef builder,
-             unsigned unit);
-
-   /** Obtain the base texture height. */
-   LLVMValueRef
-   (*height)( struct lp_sampler_dynamic_state *state,
-              LLVMBuilderRef builder,
-              unsigned unit);
-
-   LLVMValueRef
-   (*stride)( struct lp_sampler_dynamic_state *state,
-              LLVMBuilderRef builder,
-              unsigned unit);
-
-   LLVMValueRef
-   (*data_ptr)( struct lp_sampler_dynamic_state *state,
-                LLVMBuilderRef builder,
-                unsigned unit);
-
-};
-
-
-/**
- * Derive the sampler static state.
- */
-void
-lp_sampler_static_state(struct lp_sampler_static_state *state,
-                        const struct pipe_texture *texture,
-                        const struct pipe_sampler_state *sampler);
-
-
-LLVMValueRef
-lp_build_gather(LLVMBuilderRef builder,
-                unsigned length,
-                unsigned src_width,
-                unsigned dst_width,
-                LLVMValueRef base_ptr,
-                LLVMValueRef offsets);
-
-
-LLVMValueRef
-lp_build_sample_offset(struct lp_build_context *bld,
-                       const struct util_format_description *format_desc,
-                       LLVMValueRef x,
-                       LLVMValueRef y,
-                       LLVMValueRef y_stride,
-                       LLVMValueRef data_ptr);
-
-
-void
-lp_build_sample_soa(LLVMBuilderRef builder,
-                    const struct lp_sampler_static_state *static_state,
-                    struct lp_sampler_dynamic_state *dynamic_state,
-                    struct lp_type fp_type,
-                    unsigned unit,
-                    unsigned num_coords,
-                    const LLVMValueRef *coords,
-                    LLVMValueRef lodbias,
-                    LLVMValueRef *texel);
-
-
-
-#endif /* LP_BLD_SAMPLE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c
deleted file mode 100644
index 5ee8d556a6..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c
+++ /dev/null
@@ -1,594 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * Texture sampling -- SoA.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-#include "pipe/p_defines.h"
-#include "pipe/p_state.h"
-#include "util/u_debug.h"
-#include "util/u_debug_dump.h"
-#include "util/u_memory.h"
-#include "util/u_math.h"
-#include "util/u_format.h"
-#include "util/u_cpu_detect.h"
-#include "lp_bld_debug.h"
-#include "lp_bld_type.h"
-#include "lp_bld_const.h"
-#include "lp_bld_conv.h"
-#include "lp_bld_arit.h"
-#include "lp_bld_logic.h"
-#include "lp_bld_swizzle.h"
-#include "lp_bld_pack.h"
-#include "lp_bld_format.h"
-#include "lp_bld_sample.h"
-
-
-/**
- * Keep all information for sampling code generation in a single place.
- */
-struct lp_build_sample_context
-{
-   LLVMBuilderRef builder;
-
-   const struct lp_sampler_static_state *static_state;
-
-   struct lp_sampler_dynamic_state *dynamic_state;
-
-   const struct util_format_description *format_desc;
-
-   /** Incoming coordinates type and build context */
-   struct lp_type coord_type;
-   struct lp_build_context coord_bld;
-
-   /** Integer coordinates */
-   struct lp_type int_coord_type;
-   struct lp_build_context int_coord_bld;
-
-   /** Output texels type and build context */
-   struct lp_type texel_type;
-   struct lp_build_context texel_bld;
-};
-
-
-static void
-lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
-                          LLVMValueRef x,
-                          LLVMValueRef y,
-                          LLVMValueRef y_stride,
-                          LLVMValueRef data_ptr,
-                          LLVMValueRef *texel)
-{
-   LLVMValueRef offset;
-   LLVMValueRef packed;
-
-   offset = lp_build_sample_offset(&bld->int_coord_bld,
-                                   bld->format_desc,
-                                   x, y, y_stride,
-                                   data_ptr);
-
-   assert(bld->format_desc->block.width == 1);
-   assert(bld->format_desc->block.height == 1);
-   assert(bld->format_desc->block.bits <= bld->texel_type.width);
-
-   packed = lp_build_gather(bld->builder,
-                            bld->texel_type.length,
-                            bld->format_desc->block.bits,
-                            bld->texel_type.width,
-                            data_ptr, offset);
-
-   lp_build_unpack_rgba_soa(bld->builder,
-                            bld->format_desc,
-                            bld->texel_type,
-                            packed, texel);
-}
-
-
-static LLVMValueRef
-lp_build_sample_packed(struct lp_build_sample_context *bld,
-                       LLVMValueRef x,
-                       LLVMValueRef y,
-                       LLVMValueRef y_stride,
-                       LLVMValueRef data_ptr)
-{
-   LLVMValueRef offset;
-
-   offset = lp_build_sample_offset(&bld->int_coord_bld,
-                                   bld->format_desc,
-                                   x, y, y_stride,
-                                   data_ptr);
-
-   assert(bld->format_desc->block.width == 1);
-   assert(bld->format_desc->block.height == 1);
-   assert(bld->format_desc->block.bits <= bld->texel_type.width);
-
-   return lp_build_gather(bld->builder,
-                          bld->texel_type.length,
-                          bld->format_desc->block.bits,
-                          bld->texel_type.width,
-                          data_ptr, offset);
-}
-
-
-static LLVMValueRef
-lp_build_sample_wrap(struct lp_build_sample_context *bld,
-                     LLVMValueRef coord,
-                     LLVMValueRef length,
-                     boolean is_pot,
-                     unsigned wrap_mode)
-{
-   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
-   LLVMValueRef length_minus_one;
-
-   length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
-
-   switch(wrap_mode) {
-   case PIPE_TEX_WRAP_REPEAT:
-      if(is_pot)
-         coord = LLVMBuildAnd(bld->builder, coord, length_minus_one, "");
-      else
-         /* Signed remainder won't give the right results for negative
-          * dividends but unsigned remainder does.*/
-         coord = LLVMBuildURem(bld->builder, coord, length, "");
-      break;
-
-   case PIPE_TEX_WRAP_CLAMP:
-      coord = lp_build_max(int_coord_bld, coord, int_coord_bld->zero);
-      coord = lp_build_min(int_coord_bld, coord, length_minus_one);
-      break;
-
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-   case PIPE_TEX_WRAP_MIRROR_REPEAT:
-   case PIPE_TEX_WRAP_MIRROR_CLAMP:
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
-      /* FIXME */
-      _debug_printf("warning: failed to translate texture wrap mode %s\n",
-                    debug_dump_tex_wrap(wrap_mode, TRUE));
-      coord = lp_build_max(int_coord_bld, coord, int_coord_bld->zero);
-      coord = lp_build_min(int_coord_bld, coord, length_minus_one);
-      break;
-
-   default:
-      assert(0);
-   }
-
-   return coord;
-}
-
-
-static void
-lp_build_sample_2d_nearest_soa(struct lp_build_sample_context *bld,
-                               LLVMValueRef s,
-                               LLVMValueRef t,
-                               LLVMValueRef width,
-                               LLVMValueRef height,
-                               LLVMValueRef stride,
-                               LLVMValueRef data_ptr,
-                               LLVMValueRef *texel)
-{
-   LLVMValueRef x;
-   LLVMValueRef y;
-
-   x = lp_build_ifloor(&bld->coord_bld, s);
-   y = lp_build_ifloor(&bld->coord_bld, t);
-
-   x = lp_build_sample_wrap(bld, x, width,  bld->static_state->pot_width,  bld->static_state->wrap_s);
-   y = lp_build_sample_wrap(bld, y, height, bld->static_state->pot_height, bld->static_state->wrap_t);
-
-   lp_build_sample_texel_soa(bld, x, y, stride, data_ptr, texel);
-}
-
-
-static void
-lp_build_sample_2d_linear_soa(struct lp_build_sample_context *bld,
-                              LLVMValueRef s,
-                              LLVMValueRef t,
-                              LLVMValueRef width,
-                              LLVMValueRef height,
-                              LLVMValueRef stride,
-                              LLVMValueRef data_ptr,
-                              LLVMValueRef *texel)
-{
-   LLVMValueRef half;
-   LLVMValueRef s_ipart;
-   LLVMValueRef t_ipart;
-   LLVMValueRef s_fpart;
-   LLVMValueRef t_fpart;
-   LLVMValueRef x0, x1;
-   LLVMValueRef y0, y1;
-   LLVMValueRef neighbors[2][2][4];
-   unsigned chan;
-
-   half = lp_build_const_scalar(bld->coord_type, 0.5);
-   s = lp_build_sub(&bld->coord_bld, s, half);
-   t = lp_build_sub(&bld->coord_bld, t, half);
-
-   s_ipart = lp_build_floor(&bld->coord_bld, s);
-   t_ipart = lp_build_floor(&bld->coord_bld, t);
-
-   s_fpart = lp_build_sub(&bld->coord_bld, s, s_ipart);
-   t_fpart = lp_build_sub(&bld->coord_bld, t, t_ipart);
-
-   x0 = lp_build_itrunc(&bld->coord_bld, s_ipart);
-   y0 = lp_build_itrunc(&bld->coord_bld, t_ipart);
-
-   x0 = lp_build_sample_wrap(bld, x0, width,  bld->static_state->pot_width,  bld->static_state->wrap_s);
-   y0 = lp_build_sample_wrap(bld, y0, height, bld->static_state->pot_height, bld->static_state->wrap_t);
-
-   x1 = lp_build_add(&bld->int_coord_bld, x0, bld->int_coord_bld.one);
-   y1 = lp_build_add(&bld->int_coord_bld, y0, bld->int_coord_bld.one);
-
-   x1 = lp_build_sample_wrap(bld, x1, width,  bld->static_state->pot_width,  bld->static_state->wrap_s);
-   y1 = lp_build_sample_wrap(bld, y1, height, bld->static_state->pot_height, bld->static_state->wrap_t);
-
-   lp_build_sample_texel_soa(bld, x0, y0, stride, data_ptr, neighbors[0][0]);
-   lp_build_sample_texel_soa(bld, x1, y0, stride, data_ptr, neighbors[0][1]);
-   lp_build_sample_texel_soa(bld, x0, y1, stride, data_ptr, neighbors[1][0]);
-   lp_build_sample_texel_soa(bld, x1, y1, stride, data_ptr, neighbors[1][1]);
-
-   /* TODO: Don't interpolate missing channels */
-   for(chan = 0; chan < 4; ++chan) {
-      texel[chan] = lp_build_lerp_2d(&bld->texel_bld,
-                                     s_fpart, t_fpart,
-                                     neighbors[0][0][chan],
-                                     neighbors[0][1][chan],
-                                     neighbors[1][0][chan],
-                                     neighbors[1][1][chan]);
-   }
-}
-
-
-static void
-lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder,
-                          struct lp_type dst_type,
-                          LLVMValueRef packed,
-                          LLVMValueRef *rgba)
-{
-   LLVMValueRef mask = lp_build_int_const_scalar(dst_type, 0xff);
-   unsigned chan;
-
-   /* Decode the input vector components */
-   for (chan = 0; chan < 4; ++chan) {
-      unsigned start = chan*8;
-      unsigned stop = start + 8;
-      LLVMValueRef input;
-
-      input = packed;
-
-      if(start)
-         input = LLVMBuildLShr(builder, input, lp_build_int_const_scalar(dst_type, start), "");
-
-      if(stop < 32)
-         input = LLVMBuildAnd(builder, input, mask, "");
-
-      input = lp_build_unsigned_norm_to_float(builder, 8, dst_type, input);
-
-      rgba[chan] = input;
-   }
-}
-
-
-static void
-lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
-                              LLVMValueRef s,
-                              LLVMValueRef t,
-                              LLVMValueRef width,
-                              LLVMValueRef height,
-                              LLVMValueRef stride,
-                              LLVMValueRef data_ptr,
-                              LLVMValueRef *texel)
-{
-   LLVMBuilderRef builder = bld->builder;
-   struct lp_build_context i32, h16, u8n;
-   LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type;
-   LLVMValueRef i32_c8, i32_c128, i32_c255;
-   LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi;
-   LLVMValueRef t_ipart, t_fpart, t_fpart_lo, t_fpart_hi;
-   LLVMValueRef x0, x1;
-   LLVMValueRef y0, y1;
-   LLVMValueRef neighbors[2][2];
-   LLVMValueRef neighbors_lo[2][2];
-   LLVMValueRef neighbors_hi[2][2];
-   LLVMValueRef packed, packed_lo, packed_hi;
-   LLVMValueRef unswizzled[4];
-
-   lp_build_context_init(&i32, builder, lp_type_int(32));
-   lp_build_context_init(&h16, builder, lp_type_ufixed(16));
-   lp_build_context_init(&u8n, builder, lp_type_unorm(8));
-
-   i32_vec_type = lp_build_vec_type(i32.type);
-   h16_vec_type = lp_build_vec_type(h16.type);
-   u8n_vec_type = lp_build_vec_type(u8n.type);
-
-   s = lp_build_mul_imm(&bld->coord_bld, s, 256);
-   t = lp_build_mul_imm(&bld->coord_bld, t, 256);
-
-   s = LLVMBuildFPToSI(builder, s, i32_vec_type, "");
-   t = LLVMBuildFPToSI(builder, t, i32_vec_type, "");
-
-   i32_c128 = lp_build_int_const_scalar(i32.type, -128);
-   s = LLVMBuildAdd(builder, s, i32_c128, "");
-   t = LLVMBuildAdd(builder, t, i32_c128, "");
-
-   i32_c8 = lp_build_int_const_scalar(i32.type, 8);
-   s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
-   t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
-
-   i32_c255 = lp_build_int_const_scalar(i32.type, 255);
-   s_fpart = LLVMBuildAnd(builder, s, i32_c255, "");
-   t_fpart = LLVMBuildAnd(builder, t, i32_c255, "");
-
-   x0 = s_ipart;
-   y0 = t_ipart;
-
-   x0 = lp_build_sample_wrap(bld, x0, width,  bld->static_state->pot_width,  bld->static_state->wrap_s);
-   y0 = lp_build_sample_wrap(bld, y0, height, bld->static_state->pot_height, bld->static_state->wrap_t);
-
-   x1 = lp_build_add(&bld->int_coord_bld, x0, bld->int_coord_bld.one);
-   y1 = lp_build_add(&bld->int_coord_bld, y0, bld->int_coord_bld.one);
-
-   x1 = lp_build_sample_wrap(bld, x1, width,  bld->static_state->pot_width,  bld->static_state->wrap_s);
-   y1 = lp_build_sample_wrap(bld, y1, height, bld->static_state->pot_height, bld->static_state->wrap_t);
-
-   /*
-    * Transform 4 x i32 in
-    *
-    *   s_fpart = {s0, s1, s2, s3}
-    *
-    * into 8 x i16
-    *
-    *   s_fpart = {00, s0, 00, s1, 00, s2, 00, s3}
-    *
-    * into two 8 x i16
-    *
-    *   s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1}
-    *   s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3}
-    *
-    * and likewise for t_fpart. There is no risk of loosing precision here
-    * since the fractional parts only use the lower 8bits.
-    */
-
-   s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, "");
-   t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, "");
-
-   {
-      LLVMTypeRef elem_type = LLVMInt32Type();
-      LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH];
-      LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH];
-      LLVMValueRef shuffle_lo;
-      LLVMValueRef shuffle_hi;
-      unsigned i, j;
-
-      for(j = 0; j < h16.type.length; j += 4) {
-         unsigned subindex = util_cpu_caps.little_endian ? 0 : 1;
-         LLVMValueRef index;
-
-         index = LLVMConstInt(elem_type, j/2 + subindex, 0);
-         for(i = 0; i < 4; ++i)
-            shuffles_lo[j + i] = index;
-
-         index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0);
-         for(i = 0; i < 4; ++i)
-            shuffles_hi[j + i] = index;
-      }
-
-      shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length);
-      shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length);
-
-      s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, shuffle_lo, "");
-      t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, shuffle_lo, "");
-      s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, shuffle_hi, "");
-      t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, shuffle_hi, "");
-   }
-
-   /*
-    * Fetch the pixels as 4 x 32bit (rgba order might differ):
-    *
-    *   rgba0 rgba1 rgba2 rgba3
-    *
-    * bit cast them into 16 x u8
-    *
-    *   r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
-    *
-    * unpack them into two 8 x i16:
-    *
-    *   r0 g0 b0 a0 r1 g1 b1 a1
-    *   r2 g2 b2 a2 r3 g3 b3 a3
-    *
-    * The higher 8 bits of the resulting elements will be zero.
-    */
-
-   neighbors[0][0] = lp_build_sample_packed(bld, x0, y0, stride, data_ptr);
-   neighbors[0][1] = lp_build_sample_packed(bld, x1, y0, stride, data_ptr);
-   neighbors[1][0] = lp_build_sample_packed(bld, x0, y1, stride, data_ptr);
-   neighbors[1][1] = lp_build_sample_packed(bld, x1, y1, stride, data_ptr);
-
-   neighbors[0][0] = LLVMBuildBitCast(builder, neighbors[0][0], u8n_vec_type, "");
-   neighbors[0][1] = LLVMBuildBitCast(builder, neighbors[0][1], u8n_vec_type, "");
-   neighbors[1][0] = LLVMBuildBitCast(builder, neighbors[1][0], u8n_vec_type, "");
-   neighbors[1][1] = LLVMBuildBitCast(builder, neighbors[1][1], u8n_vec_type, "");
-
-   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[0][0], &neighbors_lo[0][0], &neighbors_hi[0][0]);
-   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[0][1], &neighbors_lo[0][1], &neighbors_hi[0][1]);
-   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[1][0], &neighbors_lo[1][0], &neighbors_hi[1][0]);
-   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[1][1], &neighbors_lo[1][1], &neighbors_hi[1][1]);
-
-   /*
-    * Linear interpolate with 8.8 fixed point.
-    */
-
-   packed_lo = lp_build_lerp_2d(&h16,
-                                s_fpart_lo, t_fpart_lo,
-                                neighbors_lo[0][0],
-                                neighbors_lo[0][1],
-                                neighbors_lo[1][0],
-                                neighbors_lo[1][1]);
-
-   packed_hi = lp_build_lerp_2d(&h16,
-                                s_fpart_hi, t_fpart_hi,
-                                neighbors_hi[0][0],
-                                neighbors_hi[0][1],
-                                neighbors_hi[1][0],
-                                neighbors_hi[1][1]);
-
-   packed = lp_build_pack2(builder, h16.type, u8n.type, packed_lo, packed_hi);
-
-   /*
-    * Convert to SoA and swizzle.
-    */
-
-   packed = LLVMBuildBitCast(builder, packed, i32_vec_type, "");
-
-   lp_build_rgba8_to_f32_soa(bld->builder,
-                             bld->texel_type,
-                             packed, unswizzled);
-
-   lp_build_format_swizzle_soa(bld->format_desc,
-                               bld->texel_type, unswizzled,
-                               texel);
-}
-
-
-static void
-lp_build_sample_compare(struct lp_build_sample_context *bld,
-                        LLVMValueRef p,
-                        LLVMValueRef *texel)
-{
-   struct lp_build_context *texel_bld = &bld->texel_bld;
-   LLVMValueRef res;
-   unsigned chan;
-
-   if(bld->static_state->compare_mode == PIPE_TEX_COMPARE_NONE)
-      return;
-
-   /* TODO: Compare before swizzling, to avoid redundant computations */
-   res = NULL;
-   for(chan = 0; chan < 4; ++chan) {
-      LLVMValueRef cmp;
-      cmp = lp_build_cmp(texel_bld, bld->static_state->compare_func, p, texel[chan]);
-      cmp = lp_build_select(texel_bld, cmp, texel_bld->one, texel_bld->zero);
-
-      if(res)
-         res = lp_build_add(texel_bld, res, cmp);
-      else
-         res = cmp;
-   }
-
-   assert(res);
-   res = lp_build_mul(texel_bld, res, lp_build_const_scalar(texel_bld->type, 0.25));
-
-   /* XXX returning result for default GL_DEPTH_TEXTURE_MODE = GL_LUMINANCE */
-   for(chan = 0; chan < 3; ++chan)
-      texel[chan] = res;
-   texel[3] = texel_bld->one;
-}
-
-
-void
-lp_build_sample_soa(LLVMBuilderRef builder,
-                    const struct lp_sampler_static_state *static_state,
-                    struct lp_sampler_dynamic_state *dynamic_state,
-                    struct lp_type type,
-                    unsigned unit,
-                    unsigned num_coords,
-                    const LLVMValueRef *coords,
-                    LLVMValueRef lodbias,
-                    LLVMValueRef *texel)
-{
-   struct lp_build_sample_context bld;
-   LLVMValueRef width;
-   LLVMValueRef height;
-   LLVMValueRef stride;
-   LLVMValueRef data_ptr;
-   LLVMValueRef s;
-   LLVMValueRef t;
-   LLVMValueRef p;
-
-   /* Setup our build context */
-   memset(&bld, 0, sizeof bld);
-   bld.builder = builder;
-   bld.static_state = static_state;
-   bld.dynamic_state = dynamic_state;
-   bld.format_desc = util_format_description(static_state->format);
-   bld.coord_type = type;
-   bld.int_coord_type = lp_int_type(type);
-   bld.texel_type = type;
-   lp_build_context_init(&bld.coord_bld, builder, bld.coord_type);
-   lp_build_context_init(&bld.int_coord_bld, builder, bld.int_coord_type);
-   lp_build_context_init(&bld.texel_bld, builder, bld.texel_type);
-
-   /* Get the dynamic state */
-   width = dynamic_state->width(dynamic_state, builder, unit);
-   height = dynamic_state->height(dynamic_state, builder, unit);
-   stride = dynamic_state->stride(dynamic_state, builder, unit);
-   data_ptr = dynamic_state->data_ptr(dynamic_state, builder, unit);
-
-   s = coords[0];
-   t = coords[1];
-   p = coords[2];
-
-   width = lp_build_broadcast_scalar(&bld.int_coord_bld, width);
-   height = lp_build_broadcast_scalar(&bld.int_coord_bld, height);
-   stride = lp_build_broadcast_scalar(&bld.int_coord_bld, stride);
-
-   if(static_state->target == PIPE_TEXTURE_1D)
-      t = bld.coord_bld.zero;
-
-   if(static_state->normalized_coords) {
-      LLVMTypeRef coord_vec_type = lp_build_vec_type(bld.coord_type);
-      LLVMValueRef fp_width = LLVMBuildSIToFP(builder, width, coord_vec_type, "");
-      LLVMValueRef fp_height = LLVMBuildSIToFP(builder, height, coord_vec_type, "");
-      s = lp_build_mul(&bld.coord_bld, s, fp_width);
-      t = lp_build_mul(&bld.coord_bld, t, fp_height);
-   }
-
-   switch (static_state->min_img_filter) {
-   case PIPE_TEX_FILTER_NEAREST:
-      lp_build_sample_2d_nearest_soa(&bld, s, t, width, height, stride, data_ptr, texel);
-      break;
-   case PIPE_TEX_FILTER_LINEAR:
-      if(lp_format_is_rgba8(bld.format_desc))
-         lp_build_sample_2d_linear_aos(&bld, s, t, width, height, stride, data_ptr, texel);
-      else
-         lp_build_sample_2d_linear_soa(&bld, s, t, width, height, stride, data_ptr, texel);
-      break;
-   default:
-      assert(0);
-   }
-
-   /* FIXME: respect static_state->min_mip_filter */;
-   /* FIXME: respect static_state->mag_img_filter */;
-   /* FIXME: respect static_state->prefilter */;
-
-   lp_build_sample_compare(&bld, p, texel);
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_struct.c b/src/gallium/drivers/llvmpipe/lp_bld_struct.c
deleted file mode 100644
index 3998ac374f..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_struct.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-/**
- * @file
- * Helper functions for manipulation structures.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#include "util/u_debug.h"
-#include "util/u_memory.h"
-
-#include "lp_bld_debug.h"
-#include "lp_bld_struct.h"
-
-
-LLVMValueRef
-lp_build_struct_get_ptr(LLVMBuilderRef builder,
-                        LLVMValueRef ptr,
-                        unsigned member,
-                        const char *name)
-{
-   LLVMValueRef indices[2];
-   LLVMValueRef member_ptr;
-   indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
-   indices[1] = LLVMConstInt(LLVMInt32Type(), member, 0);
-   member_ptr = LLVMBuildGEP(builder, ptr, indices, Elements(indices), "");
-   lp_build_name(member_ptr, "%s.%s_ptr", LLVMGetValueName(ptr), name);
-   return member_ptr;
-}
-
-
-LLVMValueRef
-lp_build_struct_get(LLVMBuilderRef builder,
-                    LLVMValueRef ptr,
-                    unsigned member,
-                    const char *name)
-{
-   LLVMValueRef member_ptr;
-   LLVMValueRef res;
-   member_ptr = lp_build_struct_get_ptr(builder, ptr, member, name);
-   res = LLVMBuildLoad(builder, member_ptr, "");
-   lp_build_name(res, "%s.%s", LLVMGetValueName(ptr), name);
-   return res;
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_struct.h b/src/gallium/drivers/llvmpipe/lp_bld_struct.h
deleted file mode 100644
index 740392f561..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_struct.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * Helper functions for type conversions.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#ifndef LP_BLD_STRUCT_H
-#define LP_BLD_STRUCT_H
-
-
-#include <llvm-c/Core.h>  
-#include <llvm-c/Target.h>
-
-#include "util/u_debug.h"
-#include "util/u_memory.h"
-
-
-#define LP_CHECK_STRUCT_SIZE(_ctype, _ltarget, _ltype) \
-      assert(LLVMABISizeOfType(_ltarget, _ltype) == \
-             sizeof(_ctype))
-
-#define LP_CHECK_MEMBER_OFFSET(_ctype, _cmember, _ltarget, _ltype, _lindex) \
-      assert(LLVMOffsetOfElement(_ltarget, _ltype, _lindex) == \
-             offsetof(_ctype, _cmember))
-
-
-/**
- * Get value pointer to a structure member.
- */
-LLVMValueRef
-lp_build_struct_get_ptr(LLVMBuilderRef builder,
-                        LLVMValueRef ptr,
-                        unsigned member,
-                        const char *name);
-
-/**
- * Get the value of a structure member.
- */
-LLVMValueRef
-lp_build_struct_get(LLVMBuilderRef builder,
-                    LLVMValueRef ptr,
-                    unsigned member,
-                    const char *name);
-
-
-#endif /* !LP_BLD_STRUCT_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_swizzle.c b/src/gallium/drivers/llvmpipe/lp_bld_swizzle.c
deleted file mode 100644
index 64e81f7b1f..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_swizzle.c
+++ /dev/null
@@ -1,239 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * Helper functions for swizzling/shuffling.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#include "util/u_debug.h"
-
-#include "lp_bld_type.h"
-#include "lp_bld_const.h"
-#include "lp_bld_logic.h"
-#include "lp_bld_swizzle.h"
-
-
-LLVMValueRef
-lp_build_broadcast(LLVMBuilderRef builder,
-                   LLVMTypeRef vec_type,
-                   LLVMValueRef scalar)
-{
-   const unsigned n = LLVMGetVectorSize(vec_type);
-   LLVMValueRef res;
-   unsigned i;
-
-   res = LLVMGetUndef(vec_type);
-   for(i = 0; i < n; ++i) {
-      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-      res = LLVMBuildInsertElement(builder, res, scalar, index, "");
-   }
-
-   return res;
-}
-
-
-LLVMValueRef
-lp_build_broadcast_scalar(struct lp_build_context *bld,
-                          LLVMValueRef scalar)
-{
-   const struct lp_type type = bld->type;
-   LLVMValueRef res;
-   unsigned i;
-
-   res = bld->undef;
-   for(i = 0; i < type.length; ++i) {
-      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-      res = LLVMBuildInsertElement(bld->builder, res, scalar, index, "");
-   }
-
-   return res;
-}
-
-
-LLVMValueRef
-lp_build_broadcast_aos(struct lp_build_context *bld,
-                       LLVMValueRef a,
-                       unsigned channel)
-{
-   const struct lp_type type = bld->type;
-   const unsigned n = type.length;
-   unsigned i, j;
-
-   if(a == bld->undef || a == bld->zero || a == bld->one)
-      return a;
-
-   /* XXX: SSE3 has PSHUFB which should be better than bitmasks, but forcing
-    * using shuffles here actually causes worst results. More investigation is
-    * needed. */
-   if (n <= 4) {
-      /*
-       * Shuffle.
-       */
-      LLVMTypeRef elem_type = LLVMInt32Type();
-      LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
-
-      for(j = 0; j < n; j += 4)
-         for(i = 0; i < 4; ++i)
-            shuffles[j + i] = LLVMConstInt(elem_type, j + channel, 0);
-
-      return LLVMBuildShuffleVector(bld->builder, a, bld->undef, LLVMConstVector(shuffles, n), "");
-   }
-   else {
-      /*
-       * Bit mask and recursive shifts
-       *
-       *   XYZW XYZW .... XYZW  <= input
-       *   0Y00 0Y00 .... 0Y00
-       *   YY00 YY00 .... YY00
-       *   YYYY YYYY .... YYYY  <= output
-       */
-      struct lp_type type4 = type;
-      const char shifts[4][2] = {
-         { 1,  2},
-         {-1,  2},
-         { 1, -2},
-         {-1, -2}
-      };
-      boolean cond[4];
-      unsigned i;
-
-      memset(cond, 0, sizeof cond);
-      cond[channel] = 1;
-
-      a = LLVMBuildAnd(bld->builder, a, lp_build_const_mask_aos(type, cond), "");
-
-      type4.width *= 4;
-      type4.length /= 4;
-
-      a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(type4), "");
-
-      for(i = 0; i < 2; ++i) {
-         LLVMValueRef tmp = NULL;
-         int shift = shifts[channel][i];
-
-#ifdef PIPE_ARCH_LITTLE_ENDIAN
-         shift = -shift;
-#endif
-
-         if(shift > 0)
-            tmp = LLVMBuildLShr(bld->builder, a, lp_build_int_const_scalar(type4, shift*type.width), "");
-         if(shift < 0)
-            tmp = LLVMBuildShl(bld->builder, a, lp_build_int_const_scalar(type4, -shift*type.width), "");
-
-         assert(tmp);
-         if(tmp)
-            a = LLVMBuildOr(bld->builder, a, tmp, "");
-      }
-
-      return LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(type), "");
-   }
-}
-
-
-LLVMValueRef
-lp_build_swizzle1_aos(struct lp_build_context *bld,
-                      LLVMValueRef a,
-                      const unsigned char swizzle[4])
-{
-   const unsigned n = bld->type.length;
-   unsigned i, j;
-
-   if(a == bld->undef || a == bld->zero || a == bld->one)
-      return a;
-
-   if(swizzle[0] == swizzle[1] && swizzle[1] == swizzle[2] && swizzle[2] == swizzle[3])
-      return lp_build_broadcast_aos(bld, a, swizzle[0]);
-
-   {
-      /*
-       * Shuffle.
-       */
-      LLVMTypeRef elem_type = LLVMInt32Type();
-      LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
-
-      for(j = 0; j < n; j += 4)
-         for(i = 0; i < 4; ++i)
-            shuffles[j + i] = LLVMConstInt(elem_type, j + swizzle[i], 0);
-
-      return LLVMBuildShuffleVector(bld->builder, a, bld->undef, LLVMConstVector(shuffles, n), "");
-   }
-}
-
-
-LLVMValueRef
-lp_build_swizzle2_aos(struct lp_build_context *bld,
-                      LLVMValueRef a,
-                      LLVMValueRef b,
-                      const unsigned char swizzle[4])
-{
-   const unsigned n = bld->type.length;
-   unsigned i, j;
-
-   if(swizzle[0] < 4 && swizzle[1] < 4 && swizzle[2] < 4 && swizzle[3] < 4)
-      return lp_build_swizzle1_aos(bld, a, swizzle);
-
-   if(a == b) {
-      unsigned char swizzle1[4];
-      swizzle1[0] = swizzle[0] % 4;
-      swizzle1[1] = swizzle[1] % 4;
-      swizzle1[2] = swizzle[2] % 4;
-      swizzle1[3] = swizzle[3] % 4;
-      return lp_build_swizzle1_aos(bld, a, swizzle1);
-   }
-
-   if(swizzle[0] % 4 == 0 &&
-      swizzle[1] % 4 == 1 &&
-      swizzle[2] % 4 == 2 &&
-      swizzle[3] % 4 == 3) {
-      boolean cond[4];
-      cond[0] = swizzle[0] / 4;
-      cond[1] = swizzle[1] / 4;
-      cond[2] = swizzle[2] / 4;
-      cond[3] = swizzle[3] / 4;
-      return lp_build_select_aos(bld, a, b, cond);
-   }
-
-   {
-      /*
-       * Shuffle.
-       */
-      LLVMTypeRef elem_type = LLVMInt32Type();
-      LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
-
-      for(j = 0; j < n; j += 4)
-         for(i = 0; i < 4; ++i)
-            shuffles[j + i] = LLVMConstInt(elem_type, j + (swizzle[i] % 4) + (swizzle[i] / 4 * n), 0);
-
-      return LLVMBuildShuffleVector(bld->builder, a, b, LLVMConstVector(shuffles, n), "");
-   }
-}
-
-
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_swizzle.h b/src/gallium/drivers/llvmpipe/lp_bld_swizzle.h
deleted file mode 100644
index b9472127a6..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_swizzle.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * Helper functions for swizzling/shuffling.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#ifndef LP_BLD_SWIZZLE_H
-#define LP_BLD_SWIZZLE_H
-
-
-#include <llvm-c/Core.h>  
-
-
-struct lp_type;
-struct lp_build_context;
-
-
-LLVMValueRef
-lp_build_broadcast(LLVMBuilderRef builder,
-                   LLVMTypeRef vec_type,
-                   LLVMValueRef scalar);
-
-
-LLVMValueRef
-lp_build_broadcast_scalar(struct lp_build_context *bld,
-                          LLVMValueRef scalar);
-
-
-/**
- * Broadcast one channel of a vector composed of arrays of XYZW structures into
- * all four channel.
- */
-LLVMValueRef
-lp_build_broadcast_aos(struct lp_build_context *bld,
-                       LLVMValueRef a,
-                       unsigned channel);
-
-
-/**
- * Swizzle a vector consisting of an array of XYZW structs.
- *
- * @param swizzle is the in [0,4[ range.
- */
-LLVMValueRef
-lp_build_swizzle1_aos(struct lp_build_context *bld,
-                      LLVMValueRef a,
-                      const unsigned char swizzle[4]);
-
-
-/**
- * Swizzle two vector consisting of an array of XYZW structs.
- *
- * @param swizzle is the in [0,8[ range. Values in [4,8[ range refer to b.
- */
-LLVMValueRef
-lp_build_swizzle2_aos(struct lp_build_context *bld,
-                      LLVMValueRef a,
-                      LLVMValueRef b,
-                      const unsigned char swizzle[4]);
-
-
-#endif /* !LP_BLD_SWIZZLE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_tgsi.h b/src/gallium/drivers/llvmpipe/lp_bld_tgsi.h
deleted file mode 100644
index eddb7a83fa..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_tgsi.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * TGSI to LLVM IR translation.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-#ifndef LP_BLD_TGSI_H
-#define LP_BLD_TGSI_H
-
-#include <llvm-c/Core.h>
-
-
-struct tgsi_token;
-struct lp_type;
-struct lp_build_context;
-struct lp_build_mask_context;
-
-
-/**
- * Sampler code generation interface.
- *
- * Although texture sampling is a requirement for TGSI translation, it is
- * a very different problem with several different approaches to it. This
- * structure establishes an interface for texture sampling code generation, so
- * that we can easily use different texture sampling strategies.
- */
-struct lp_build_sampler_soa
-{
-   void
-   (*destroy)( struct lp_build_sampler_soa *sampler );
-
-   void
-   (*emit_fetch_texel)( struct lp_build_sampler_soa *sampler,
-                        LLVMBuilderRef builder,
-                        struct lp_type type,
-                        unsigned unit,
-                        unsigned num_coords,
-                        const LLVMValueRef *coords,
-                        LLVMValueRef lodbias,
-                        LLVMValueRef *texel);
-};
-
-
-void
-lp_build_tgsi_soa(LLVMBuilderRef builder,
-                  const struct tgsi_token *tokens,
-                  struct lp_type type,
-                  struct lp_build_mask_context *mask,
-                  LLVMValueRef consts_ptr,
-                  const LLVMValueRef *pos,
-                  const LLVMValueRef (*inputs)[4],
-                  LLVMValueRef (*outputs)[4],
-                  struct lp_build_sampler_soa *sampler);
-
-
-#endif /* LP_BLD_TGSI_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
deleted file mode 100644
index fb1eda4423..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
+++ /dev/null
@@ -1,1469 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2009 VMware, Inc.
- * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * @file
- * TGSI to LLVM IR translation -- SoA.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- *
- * Based on tgsi_sse2.c code written by Michal Krol, Keith Whitwell,
- * Brian Paul, and others.
- */
-
-#include "pipe/p_config.h"
-#include "pipe/p_shader_tokens.h"
-#include "util/u_debug.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
-#include "tgsi/tgsi_info.h"
-#include "tgsi/tgsi_parse.h"
-#include "tgsi/tgsi_util.h"
-#include "tgsi/tgsi_exec.h"
-#include "lp_bld_type.h"
-#include "lp_bld_const.h"
-#include "lp_bld_intr.h"
-#include "lp_bld_arit.h"
-#include "lp_bld_logic.h"
-#include "lp_bld_swizzle.h"
-#include "lp_bld_flow.h"
-#include "lp_bld_tgsi.h"
-#include "lp_bld_debug.h"
-
-
-#define LP_MAX_TEMPS 256
-#define LP_MAX_IMMEDIATES 256
-
-
-#define FOR_EACH_CHANNEL( CHAN )\
-   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
-
-#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
-   ((INST)->Dst[0].Register.WriteMask & (1 << (CHAN)))
-
-#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
-   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
-
-#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
-   FOR_EACH_CHANNEL( CHAN )\
-      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
-
-#define CHAN_X 0
-#define CHAN_Y 1
-#define CHAN_Z 2
-#define CHAN_W 3
-
-#define QUAD_TOP_LEFT     0
-#define QUAD_TOP_RIGHT    1
-#define QUAD_BOTTOM_LEFT  2
-#define QUAD_BOTTOM_RIGHT 3
-
-
-struct lp_build_tgsi_soa_context
-{
-   struct lp_build_context base;
-
-   LLVMValueRef consts_ptr;
-   const LLVMValueRef *pos;
-   const LLVMValueRef (*inputs)[NUM_CHANNELS];
-   LLVMValueRef (*outputs)[NUM_CHANNELS];
-
-   struct lp_build_sampler_soa *sampler;
-
-   LLVMValueRef immediates[LP_MAX_IMMEDIATES][NUM_CHANNELS];
-   LLVMValueRef temps[LP_MAX_TEMPS][NUM_CHANNELS];
-
-   struct lp_build_mask_context *mask;
-};
-
-
-static const unsigned char
-swizzle_left[4] = {
-   QUAD_TOP_LEFT,     QUAD_TOP_LEFT,
-   QUAD_BOTTOM_LEFT,  QUAD_BOTTOM_LEFT
-};
-
-static const unsigned char
-swizzle_right[4] = {
-   QUAD_TOP_RIGHT,    QUAD_TOP_RIGHT,
-   QUAD_BOTTOM_RIGHT, QUAD_BOTTOM_RIGHT
-};
-
-static const unsigned char
-swizzle_top[4] = {
-   QUAD_TOP_LEFT,     QUAD_TOP_RIGHT,
-   QUAD_TOP_LEFT,     QUAD_TOP_RIGHT
-};
-
-static const unsigned char
-swizzle_bottom[4] = {
-   QUAD_BOTTOM_LEFT,  QUAD_BOTTOM_RIGHT,
-   QUAD_BOTTOM_LEFT,  QUAD_BOTTOM_RIGHT
-};
-
-
-static LLVMValueRef
-emit_ddx(struct lp_build_tgsi_soa_context *bld,
-         LLVMValueRef src)
-{
-   LLVMValueRef src_left  = lp_build_swizzle1_aos(&bld->base, src, swizzle_left);
-   LLVMValueRef src_right = lp_build_swizzle1_aos(&bld->base, src, swizzle_right);
-   return lp_build_sub(&bld->base, src_right, src_left);
-}
-
-
-static LLVMValueRef
-emit_ddy(struct lp_build_tgsi_soa_context *bld,
-         LLVMValueRef src)
-{
-   LLVMValueRef src_top    = lp_build_swizzle1_aos(&bld->base, src, swizzle_top);
-   LLVMValueRef src_bottom = lp_build_swizzle1_aos(&bld->base, src, swizzle_bottom);
-   return lp_build_sub(&bld->base, src_top, src_bottom);
-}
-
-
-/**
- * Register fetch.
- */
-static LLVMValueRef
-emit_fetch(
-   struct lp_build_tgsi_soa_context *bld,
-   const struct tgsi_full_instruction *inst,
-   unsigned index,
-   const unsigned chan_index )
-{
-   const struct tgsi_full_src_register *reg = &inst->Src[index];
-   unsigned swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
-   LLVMValueRef res;
-
-   switch (swizzle) {
-   case TGSI_SWIZZLE_X:
-   case TGSI_SWIZZLE_Y:
-   case TGSI_SWIZZLE_Z:
-   case TGSI_SWIZZLE_W:
-
-      switch (reg->Register.File) {
-      case TGSI_FILE_CONSTANT: {
-         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), reg->Register.Index*4 + swizzle, 0);
-         LLVMValueRef scalar_ptr = LLVMBuildGEP(bld->base.builder, bld->consts_ptr, &index, 1, "");
-         LLVMValueRef scalar = LLVMBuildLoad(bld->base.builder, scalar_ptr, "");
-         res = lp_build_broadcast_scalar(&bld->base, scalar);
-         break;
-      }
-
-      case TGSI_FILE_IMMEDIATE:
-         res = bld->immediates[reg->Register.Index][swizzle];
-         assert(res);
-         break;
-
-      case TGSI_FILE_INPUT:
-         res = bld->inputs[reg->Register.Index][swizzle];
-         assert(res);
-         break;
-
-      case TGSI_FILE_TEMPORARY:
-         res = bld->temps[reg->Register.Index][swizzle];
-         if(!res)
-            return bld->base.undef;
-         break;
-
-      default:
-         assert( 0 );
-         return bld->base.undef;
-      }
-      break;
-
-   default:
-      assert( 0 );
-      return bld->base.undef;
-   }
-
-   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
-   case TGSI_UTIL_SIGN_CLEAR:
-      res = lp_build_abs( &bld->base, res );
-      break;
-
-   case TGSI_UTIL_SIGN_SET:
-      /* TODO: Use bitwese OR for floating point */
-      res = lp_build_abs( &bld->base, res );
-      res = LLVMBuildNeg( bld->base.builder, res, "" );
-      break;
-
-   case TGSI_UTIL_SIGN_TOGGLE:
-      res = LLVMBuildNeg( bld->base.builder, res, "" );
-      break;
-
-   case TGSI_UTIL_SIGN_KEEP:
-      break;
-   }
-
-   return res;
-}
-
-
-/**
- * Register fetch with derivatives.
- */
-static void
-emit_fetch_deriv(
-   struct lp_build_tgsi_soa_context *bld,
-   const struct tgsi_full_instruction *inst,
-   unsigned index,
-   const unsigned chan_index,
-   LLVMValueRef *res,
-   LLVMValueRef *ddx,
-   LLVMValueRef *ddy)
-{
-   LLVMValueRef src;
-
-   src = emit_fetch(bld, inst, index, chan_index);
-
-   if(res)
-      *res = src;
-
-   /* TODO: use interpolation coeffs for inputs */
-
-   if(ddx)
-      *ddx = emit_ddx(bld, src);
-
-   if(ddy)
-      *ddy = emit_ddy(bld, src);
-}
-
-
-/**
- * Register store.
- */
-static void
-emit_store(
-   struct lp_build_tgsi_soa_context *bld,
-   const struct tgsi_full_instruction *inst,
-   unsigned index,
-   unsigned chan_index,
-   LLVMValueRef value)
-{
-   const struct tgsi_full_dst_register *reg = &inst->Dst[index];
-
-   switch( inst->Instruction.Saturate ) {
-   case TGSI_SAT_NONE:
-      break;
-
-   case TGSI_SAT_ZERO_ONE:
-      value = lp_build_max(&bld->base, value, bld->base.zero);
-      value = lp_build_min(&bld->base, value, bld->base.one);
-      break;
-
-   case TGSI_SAT_MINUS_PLUS_ONE:
-      value = lp_build_max(&bld->base, value, lp_build_const_scalar(bld->base.type, -1.0));
-      value = lp_build_min(&bld->base, value, bld->base.one);
-      break;
-
-   default:
-      assert(0);
-   }
-
-   switch( reg->Register.File ) {
-   case TGSI_FILE_OUTPUT:
-      bld->outputs[reg->Register.Index][chan_index] = value;
-      break;
-
-   case TGSI_FILE_TEMPORARY:
-      bld->temps[reg->Register.Index][chan_index] = value;
-      break;
-
-   case TGSI_FILE_ADDRESS:
-      /* FIXME */
-      assert(0);
-      break;
-
-   default:
-      assert( 0 );
-   }
-}
-
-
-/**
- * High-level instruction translators.
- */
-
-
-static void
-emit_tex( struct lp_build_tgsi_soa_context *bld,
-          const struct tgsi_full_instruction *inst,
-          boolean apply_lodbias,
-          boolean projected,
-          LLVMValueRef *texel)
-{
-   const uint unit = inst->Src[1].Register.Index;
-   LLVMValueRef lodbias;
-   LLVMValueRef oow = NULL;
-   LLVMValueRef coords[3];
-   unsigned num_coords;
-   unsigned i;
-
-   switch (inst->Texture.Texture) {
-   case TGSI_TEXTURE_1D:
-      num_coords = 1;
-      break;
-   case TGSI_TEXTURE_2D:
-   case TGSI_TEXTURE_RECT:
-      num_coords = 2;
-      break;
-   case TGSI_TEXTURE_SHADOW1D:
-   case TGSI_TEXTURE_SHADOW2D:
-   case TGSI_TEXTURE_SHADOWRECT:
-   case TGSI_TEXTURE_3D:
-   case TGSI_TEXTURE_CUBE:
-      num_coords = 3;
-      break;
-   default:
-      assert(0);
-      return;
-   }
-
-   if(apply_lodbias)
-      lodbias = emit_fetch( bld, inst, 0, 3 );
-   else
-      lodbias = bld->base.zero;
-
-   if (projected) {
-      oow = emit_fetch( bld, inst, 0, 3 );
-      oow = lp_build_rcp(&bld->base, oow);
-   }
-
-   for (i = 0; i < num_coords; i++) {
-      coords[i] = emit_fetch( bld, inst, 0, i );
-      if (projected)
-         coords[i] = lp_build_mul(&bld->base, coords[i], oow);
-   }
-   for (i = num_coords; i < 3; i++) {
-      coords[i] = bld->base.undef;
-   }
-
-   bld->sampler->emit_fetch_texel(bld->sampler,
-                                  bld->base.builder,
-                                  bld->base.type,
-                                  unit, num_coords, coords, lodbias,
-                                  texel);
-}
-
-
-static void
-emit_kil(
-   struct lp_build_tgsi_soa_context *bld,
-   const struct tgsi_full_instruction *inst )
-{
-   const struct tgsi_full_src_register *reg = &inst->Src[0];
-   LLVMValueRef terms[NUM_CHANNELS];
-   LLVMValueRef mask;
-   unsigned chan_index;
-
-   memset(&terms, 0, sizeof terms);
-
-   FOR_EACH_CHANNEL( chan_index ) {
-      unsigned swizzle;
-
-      /* Unswizzle channel */
-      swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
-
-      /* Check if the component has not been already tested. */
-      assert(swizzle < NUM_CHANNELS);
-      if( !terms[swizzle] )
-         /* TODO: change the comparison operator instead of setting the sign */
-         terms[swizzle] =  emit_fetch(bld, inst, 0, chan_index );
-   }
-
-   mask = NULL;
-   FOR_EACH_CHANNEL( chan_index ) {
-      if(terms[chan_index]) {
-         LLVMValueRef chan_mask;
-
-         chan_mask = lp_build_cmp(&bld->base, PIPE_FUNC_GEQUAL, terms[chan_index], bld->base.zero);
-
-         if(mask)
-            mask = LLVMBuildAnd(bld->base.builder, mask, chan_mask, "");
-         else
-            mask = chan_mask;
-      }
-   }
-
-   if(mask)
-      lp_build_mask_update(bld->mask, mask);
-}
-
-
-/**
- * Check if inst src/dest regs use indirect addressing into temporary
- * register file.
- */
-static boolean
-indirect_temp_reference(const struct tgsi_full_instruction *inst)
-{
-   uint i;
-   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
-      const struct tgsi_full_src_register *reg = &inst->Src[i];
-      if (reg->Register.File == TGSI_FILE_TEMPORARY &&
-          reg->Register.Indirect)
-         return TRUE;
-   }
-   for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
-      const struct tgsi_full_dst_register *reg = &inst->Dst[i];
-      if (reg->Register.File == TGSI_FILE_TEMPORARY &&
-          reg->Register.Indirect)
-         return TRUE;
-   }
-   return FALSE;
-}
-
-
-static int
-emit_instruction(
-   struct lp_build_tgsi_soa_context *bld,
-   const struct tgsi_full_instruction *inst,
-   const struct tgsi_opcode_info *info)
-{
-   unsigned chan_index;
-   LLVMValueRef src0, src1, src2;
-   LLVMValueRef tmp0, tmp1, tmp2;
-   LLVMValueRef tmp3 = NULL;
-   LLVMValueRef tmp4 = NULL;
-   LLVMValueRef tmp5 = NULL;
-   LLVMValueRef tmp6 = NULL;
-   LLVMValueRef tmp7 = NULL;
-   LLVMValueRef res;
-   LLVMValueRef dst0[NUM_CHANNELS];
-
-   /* we can't handle indirect addressing into temp register file yet */
-   if (indirect_temp_reference(inst))
-      return FALSE;
-
-   assert(info->num_dst <= 1);
-   if(info->num_dst) {
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         dst0[chan_index] = bld->base.undef;
-      }
-   }
-
-   switch (inst->Instruction.Opcode) {
-#if 0
-   case TGSI_OPCODE_ARL:
-      /* FIXME */
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         emit_flr(bld, 0, 0);
-         emit_f2it( bld, 0 );
-         dst0[chan_index] = tmp0;
-      }
-      break;
-#endif
-
-   case TGSI_OPCODE_MOV:
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         dst0[chan_index] = emit_fetch( bld, inst, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_LIT:
-      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ) {
-         dst0[CHAN_X] = bld->base.one;
-      }
-      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ) {
-         src0 = emit_fetch( bld, inst, 0, CHAN_X );
-         dst0[CHAN_Y] = lp_build_max( &bld->base, src0, bld->base.zero);
-      }
-      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) ) {
-         /* XMM[1] = SrcReg[0].yyyy */
-         tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );
-         /* XMM[1] = max(XMM[1], 0) */
-         tmp1 = lp_build_max( &bld->base, tmp1, bld->base.zero);
-         /* XMM[2] = SrcReg[0].wwww */
-         tmp2 = emit_fetch( bld, inst, 0, CHAN_W );
-         tmp1 = lp_build_pow( &bld->base, tmp1, tmp2);
-         tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
-         tmp2 = lp_build_cmp(&bld->base, PIPE_FUNC_GREATER, tmp0, bld->base.zero);
-         dst0[CHAN_Z] = lp_build_select(&bld->base, tmp2, tmp1, bld->base.zero);
-      }
-      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) ) {
-         dst0[CHAN_W] = bld->base.one;
-      }
-      break;
-
-   case TGSI_OPCODE_RCP:
-   /* TGSI_OPCODE_RECIP */
-      src0 = emit_fetch( bld, inst, 0, CHAN_X );
-      res = lp_build_rcp(&bld->base, src0);
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         dst0[chan_index] = res;
-      }
-      break;
-
-   case TGSI_OPCODE_RSQ:
-   /* TGSI_OPCODE_RECIPSQRT */
-      src0 = emit_fetch( bld, inst, 0, CHAN_X );
-      src0 = lp_build_abs(&bld->base, src0);
-      res = lp_build_rsqrt(&bld->base, src0);
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         dst0[chan_index] = res;
-      }
-      break;
-
-   case TGSI_OPCODE_EXP:
-      if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ||
-          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z )) {
-         LLVMValueRef *p_exp2_int_part = NULL;
-         LLVMValueRef *p_frac_part = NULL;
-         LLVMValueRef *p_exp2 = NULL;
-
-         src0 = emit_fetch( bld, inst, 0, CHAN_X );
-
-         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
-            p_exp2_int_part = &tmp0;
-         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ))
-            p_frac_part = &tmp1;
-         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
-            p_exp2 = &tmp2;
-
-         lp_build_exp2_approx(&bld->base, src0, p_exp2_int_part, p_frac_part, p_exp2);
-
-         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
-            dst0[CHAN_X] = tmp0;
-         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ))
-            dst0[CHAN_Y] = tmp1;
-         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
-            dst0[CHAN_Z] = tmp2;
-      }
-      /* dst.w = 1.0 */
-      if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_W )) {
-         dst0[CHAN_W] = bld->base.one;
-      }
-      break;
-
-   case TGSI_OPCODE_LOG:
-      if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ||
-          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z )) {
-         LLVMValueRef *p_floor_log2 = NULL;
-         LLVMValueRef *p_exp = NULL;
-         LLVMValueRef *p_log2 = NULL;
-
-         src0 = emit_fetch( bld, inst, 0, CHAN_X );
-         src0 = lp_build_abs( &bld->base, src0 );
-
-         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
-            p_floor_log2 = &tmp0;
-         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ))
-            p_exp = &tmp1;
-         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
-            p_log2 = &tmp2;
-
-         lp_build_log2_approx(&bld->base, src0, p_exp, p_floor_log2, p_log2);
-
-         /* dst.x = floor(lg2(abs(src.x))) */
-         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
-            dst0[CHAN_X] = tmp0;
-         /* dst.y = abs(src)/ex2(floor(lg2(abs(src.x)))) */
-         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y )) {
-            dst0[CHAN_Y] = lp_build_div( &bld->base, src0, tmp1);
-         }
-         /* dst.z = lg2(abs(src.x)) */
-         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
-            dst0[CHAN_Z] = tmp2;
-      }
-      /* dst.w = 1.0 */
-      if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_W )) {
-         dst0[CHAN_W] = bld->base.one;
-      }
-      break;
-
-   case TGSI_OPCODE_MUL:
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         src0 = emit_fetch( bld, inst, 0, chan_index );
-         src1 = emit_fetch( bld, inst, 1, chan_index );
-         dst0[chan_index] = lp_build_mul(&bld->base, src0, src1);
-      }
-      break;
-
-   case TGSI_OPCODE_ADD:
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         src0 = emit_fetch( bld, inst, 0, chan_index );
-         src1 = emit_fetch( bld, inst, 1, chan_index );
-         dst0[chan_index] = lp_build_add(&bld->base, src0, src1);
-      }
-      break;
-
-   case TGSI_OPCODE_DP3:
-   /* TGSI_OPCODE_DOT3 */
-      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
-      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );
-      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
-      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );
-      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );
-      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
-      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
-      tmp1 = emit_fetch( bld, inst, 0, CHAN_Z );
-      tmp2 = emit_fetch( bld, inst, 1, CHAN_Z );
-      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
-      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         dst0[chan_index] = tmp0;
-      }
-      break;
-
-   case TGSI_OPCODE_DP4:
-   /* TGSI_OPCODE_DOT4 */
-      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
-      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );
-      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
-      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );
-      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );
-      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
-      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
-      tmp1 = emit_fetch( bld, inst, 0, CHAN_Z );
-      tmp2 = emit_fetch( bld, inst, 1, CHAN_Z );
-      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
-      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
-      tmp1 = emit_fetch( bld, inst, 0, CHAN_W );
-      tmp2 = emit_fetch( bld, inst, 1, CHAN_W );
-      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
-      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         dst0[chan_index] = tmp0;
-      }
-      break;
-
-   case TGSI_OPCODE_DST:
-      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) {
-         dst0[CHAN_X] = bld->base.one;
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) {
-         tmp0 = emit_fetch( bld, inst, 0, CHAN_Y );
-         tmp1 = emit_fetch( bld, inst, 1, CHAN_Y );
-         dst0[CHAN_Y] = lp_build_mul( &bld->base, tmp0, tmp1);
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) {
-         dst0[CHAN_Z] = emit_fetch( bld, inst, 0, CHAN_Z );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) {
-         dst0[CHAN_W] = emit_fetch( bld, inst, 1, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_MIN:
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         src0 = emit_fetch( bld, inst, 0, chan_index );
-         src1 = emit_fetch( bld, inst, 1, chan_index );
-         dst0[chan_index] = lp_build_min( &bld->base, src0, src1 );
-      }
-      break;
-
-   case TGSI_OPCODE_MAX:
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         src0 = emit_fetch( bld, inst, 0, chan_index );
-         src1 = emit_fetch( bld, inst, 1, chan_index );
-         dst0[chan_index] = lp_build_max( &bld->base, src0, src1 );
-      }
-      break;
-
-   case TGSI_OPCODE_SLT:
-   /* TGSI_OPCODE_SETLT */
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         src0 = emit_fetch( bld, inst, 0, chan_index );
-         src1 = emit_fetch( bld, inst, 1, chan_index );
-         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_LESS, src0, src1 );
-         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
-      }
-      break;
-
-   case TGSI_OPCODE_SGE:
-   /* TGSI_OPCODE_SETGE */
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         src0 = emit_fetch( bld, inst, 0, chan_index );
-         src1 = emit_fetch( bld, inst, 1, chan_index );
-         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GEQUAL, src0, src1 );
-         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
-      }
-      break;
-
-   case TGSI_OPCODE_MAD:
-   /* TGSI_OPCODE_MADD */
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         tmp1 = emit_fetch( bld, inst, 1, chan_index );
-         tmp2 = emit_fetch( bld, inst, 2, chan_index );
-         tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
-         tmp0 = lp_build_add( &bld->base, tmp0, tmp2);
-         dst0[chan_index] = tmp0;
-      }
-      break;
-
-   case TGSI_OPCODE_SUB:
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         tmp1 = emit_fetch( bld, inst, 1, chan_index );
-         dst0[chan_index] = lp_build_sub( &bld->base, tmp0, tmp1);
-      }
-      break;
-
-   case TGSI_OPCODE_LRP:
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         src0 = emit_fetch( bld, inst, 0, chan_index );
-         src1 = emit_fetch( bld, inst, 1, chan_index );
-         src2 = emit_fetch( bld, inst, 2, chan_index );
-         tmp0 = lp_build_sub( &bld->base, src1, src2 );
-         tmp0 = lp_build_mul( &bld->base, src0, tmp0 );
-         dst0[chan_index] = lp_build_add( &bld->base, tmp0, src2 );
-      }
-      break;
-
-   case TGSI_OPCODE_CND:
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         src0 = emit_fetch( bld, inst, 0, chan_index );
-         src1 = emit_fetch( bld, inst, 1, chan_index );
-         src2 = emit_fetch( bld, inst, 2, chan_index );
-         tmp1 = lp_build_const_scalar(bld->base.type, 0.5);
-         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GREATER, src2, tmp1);
-         dst0[chan_index] = lp_build_select( &bld->base, tmp0, src0, src1 );
-      }
-      break;
-
-   case TGSI_OPCODE_DP2A:
-      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );  /* xmm0 = src[0].x */
-      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );  /* xmm1 = src[1].x */
-      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 * xmm1 */
-      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );  /* xmm1 = src[0].y */
-      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );  /* xmm2 = src[1].y */
-      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);              /* xmm1 = xmm1 * xmm2 */
-      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 + xmm1 */
-      tmp1 = emit_fetch( bld, inst, 2, CHAN_X );  /* xmm1 = src[2].x */
-      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 + xmm1 */
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         dst0[chan_index] = tmp0;  /* dest[ch] = xmm0 */
-      }
-      break;
-
-   case TGSI_OPCODE_FRC:
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         src0 = emit_fetch( bld, inst, 0, chan_index );
-         tmp0 = lp_build_floor(&bld->base, src0);
-         tmp0 = lp_build_sub(&bld->base, src0, tmp0);
-         dst0[chan_index] = tmp0;
-      }
-      break;
-
-   case TGSI_OPCODE_CLAMP:
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         src1 = emit_fetch( bld, inst, 1, chan_index );
-         src2 = emit_fetch( bld, inst, 2, chan_index );
-         tmp0 = lp_build_max(&bld->base, tmp0, src1);
-         tmp0 = lp_build_min(&bld->base, tmp0, src2);
-         dst0[chan_index] = tmp0;
-      }
-      break;
-
-   case TGSI_OPCODE_FLR:
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         dst0[chan_index] = lp_build_floor(&bld->base, tmp0);
-      }
-      break;
-
-   case TGSI_OPCODE_ROUND:
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         dst0[chan_index] = lp_build_round(&bld->base, tmp0);
-      }
-      break;
-
-   case TGSI_OPCODE_EX2: {
-      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
-      tmp0 = lp_build_exp2( &bld->base, tmp0);
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         dst0[chan_index] = tmp0;
-      }
-      break;
-   }
-
-   case TGSI_OPCODE_LG2:
-      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
-      tmp0 = lp_build_log2( &bld->base, tmp0);
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         dst0[chan_index] = tmp0;
-      }
-      break;
-
-   case TGSI_OPCODE_POW:
-      src0 = emit_fetch( bld, inst, 0, CHAN_X );
-      src1 = emit_fetch( bld, inst, 1, CHAN_X );
-      res = lp_build_pow( &bld->base, src0, src1 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         dst0[chan_index] = res;
-      }
-      break;
-
-   case TGSI_OPCODE_XPD:
-      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ) {
-         tmp1 = emit_fetch( bld, inst, 1, CHAN_Z );
-         tmp3 = emit_fetch( bld, inst, 0, CHAN_Z );
-      }
-      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) ) {
-         tmp0 = emit_fetch( bld, inst, 0, CHAN_Y );
-         tmp4 = emit_fetch( bld, inst, 1, CHAN_Y );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) {
-         tmp2 = tmp0;
-         tmp2 = lp_build_mul( &bld->base, tmp2, tmp1);
-         tmp5 = tmp3;
-         tmp5 = lp_build_mul( &bld->base, tmp5, tmp4);
-         tmp2 = lp_build_sub( &bld->base, tmp2, tmp5);
-         dst0[CHAN_X] = tmp2;
-      }
-      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ||
-          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) ) {
-         tmp2 = emit_fetch( bld, inst, 1, CHAN_X );
-         tmp5 = emit_fetch( bld, inst, 0, CHAN_X );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) {
-         tmp3 = lp_build_mul( &bld->base, tmp3, tmp2);
-         tmp1 = lp_build_mul( &bld->base, tmp1, tmp5);
-         tmp3 = lp_build_sub( &bld->base, tmp3, tmp1);
-         dst0[CHAN_Y] = tmp3;
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) {
-         tmp5 = lp_build_mul( &bld->base, tmp5, tmp4);
-         tmp0 = lp_build_mul( &bld->base, tmp0, tmp2);
-         tmp5 = lp_build_sub( &bld->base, tmp5, tmp0);
-         dst0[CHAN_Z] = tmp5;
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) {
-         dst0[CHAN_W] = bld->base.one;
-      }
-      break;
-
-   case TGSI_OPCODE_ABS:
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         dst0[chan_index] = lp_build_abs( &bld->base, tmp0 );
-      }
-      break;
-
-   case TGSI_OPCODE_RCC:
-      /* deprecated? */
-      assert(0);
-      return 0;
-
-   case TGSI_OPCODE_DPH:
-      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
-      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );
-      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
-      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );
-      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );
-      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
-      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
-      tmp1 = emit_fetch( bld, inst, 0, CHAN_Z );
-      tmp2 = emit_fetch( bld, inst, 1, CHAN_Z );
-      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
-      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
-      tmp1 = emit_fetch( bld, inst, 1, CHAN_W );
-      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         dst0[chan_index] = tmp0;
-      }
-      break;
-
-   case TGSI_OPCODE_COS:
-      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
-      tmp0 = lp_build_cos( &bld->base, tmp0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         dst0[chan_index] = tmp0;
-      }
-      break;
-
-   case TGSI_OPCODE_DDX:
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_fetch_deriv( bld, inst, 0, chan_index, NULL, &dst0[chan_index], NULL);
-      }
-      break;
-
-   case TGSI_OPCODE_DDY:
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_fetch_deriv( bld, inst, 0, chan_index, NULL, NULL, &dst0[chan_index]);
-      }
-      break;
-
-   case TGSI_OPCODE_KILP:
-      /* predicated kill */
-      /* FIXME */
-      return 0;
-      break;
-
-   case TGSI_OPCODE_KIL:
-      /* conditional kill */
-      emit_kil( bld, inst );
-      break;
-
-   case TGSI_OPCODE_PK2H:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_PK2US:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_PK4B:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_PK4UB:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_RFL:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SEQ:
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         src0 = emit_fetch( bld, inst, 0, chan_index );
-         src1 = emit_fetch( bld, inst, 1, chan_index );
-         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_EQUAL, src0, src1 );
-         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
-      }
-      break;
-
-   case TGSI_OPCODE_SFL:
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         dst0[chan_index] = bld->base.zero;
-      }
-      break;
-
-   case TGSI_OPCODE_SGT:
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         src0 = emit_fetch( bld, inst, 0, chan_index );
-         src1 = emit_fetch( bld, inst, 1, chan_index );
-         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GREATER, src0, src1 );
-         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
-      }
-      break;
-
-   case TGSI_OPCODE_SIN:
-      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
-      tmp0 = lp_build_sin( &bld->base, tmp0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         dst0[chan_index] = tmp0;
-      }
-      break;
-
-   case TGSI_OPCODE_SLE:
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         src0 = emit_fetch( bld, inst, 0, chan_index );
-         src1 = emit_fetch( bld, inst, 1, chan_index );
-         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_LEQUAL, src0, src1 );
-         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
-      }
-      break;
-
-   case TGSI_OPCODE_SNE:
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         src0 = emit_fetch( bld, inst, 0, chan_index );
-         src1 = emit_fetch( bld, inst, 1, chan_index );
-         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_NOTEQUAL, src0, src1 );
-         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
-      }
-      break;
-
-   case TGSI_OPCODE_STR:
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         dst0[chan_index] = bld->base.one;
-      }
-      break;
-
-   case TGSI_OPCODE_TEX:
-      emit_tex( bld, inst, FALSE, FALSE, dst0 );
-      break;
-
-   case TGSI_OPCODE_TXD:
-      /* FIXME */
-      return 0;
-      break;
-
-   case TGSI_OPCODE_UP2H:
-      /* deprecated */
-      assert (0);
-      return 0;
-      break;
-
-   case TGSI_OPCODE_UP2US:
-      /* deprecated */
-      assert(0);
-      return 0;
-      break;
-
-   case TGSI_OPCODE_UP4B:
-      /* deprecated */
-      assert(0);
-      return 0;
-      break;
-
-   case TGSI_OPCODE_UP4UB:
-      /* deprecated */
-      assert(0);
-      return 0;
-      break;
-
-   case TGSI_OPCODE_X2D:
-      /* deprecated? */
-      assert(0);
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ARA:
-      /* deprecated */
-      assert(0);
-      return 0;
-      break;
-
-#if 0
-   case TGSI_OPCODE_ARR:
-      /* FIXME */
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         emit_rnd( bld, 0, 0 );
-         emit_f2it( bld, 0 );
-         dst0[chan_index] = tmp0;
-      }
-      break;
-#endif
-
-   case TGSI_OPCODE_BRA:
-      /* deprecated */
-      assert(0);
-      return 0;
-      break;
-
-   case TGSI_OPCODE_CAL:
-      /* FIXME */
-      return 0;
-      break;
-
-   case TGSI_OPCODE_RET:
-      /* FIXME */
-      return 0;
-      break;
-
-   case TGSI_OPCODE_END:
-      break;
-
-   case TGSI_OPCODE_SSG:
-   /* TGSI_OPCODE_SGN */
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         dst0[chan_index] = lp_build_sgn( &bld->base, tmp0 );
-      }
-      break;
-
-   case TGSI_OPCODE_CMP:
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         src0 = emit_fetch( bld, inst, 0, chan_index );
-         src1 = emit_fetch( bld, inst, 1, chan_index );
-         src2 = emit_fetch( bld, inst, 2, chan_index );
-         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_LESS, src0, bld->base.zero );
-         dst0[chan_index] = lp_build_select( &bld->base, tmp0, src1, src2);
-      }
-      break;
-
-   case TGSI_OPCODE_SCS:
-      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) {
-         tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
-         dst0[CHAN_X] = lp_build_cos( &bld->base, tmp0 );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) {
-         tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
-         dst0[CHAN_Y] = lp_build_sin( &bld->base, tmp0 );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) {
-         dst0[CHAN_Z] = bld->base.zero;
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) {
-         dst0[CHAN_W] = bld->base.one;
-      }
-      break;
-
-   case TGSI_OPCODE_TXB:
-      emit_tex( bld, inst, TRUE, FALSE, dst0 );
-      break;
-
-   case TGSI_OPCODE_NRM:
-      /* fall-through */
-   case TGSI_OPCODE_NRM4:
-      /* 3 or 4-component normalization */
-      {
-         uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
-
-         if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X) ||
-             IS_DST0_CHANNEL_ENABLED(inst, CHAN_Y) ||
-             IS_DST0_CHANNEL_ENABLED(inst, CHAN_Z) ||
-             (IS_DST0_CHANNEL_ENABLED(inst, CHAN_W) && dims == 4)) {
-
-            /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
-
-            /* xmm4 = src.x */
-            /* xmm0 = src.x * src.x */
-            tmp0 = emit_fetch(bld, inst, 0, CHAN_X);
-            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X)) {
-               tmp4 = tmp0;
-            }
-            tmp0 = lp_build_mul( &bld->base, tmp0, tmp0);
-
-            /* xmm5 = src.y */
-            /* xmm0 = xmm0 + src.y * src.y */
-            tmp1 = emit_fetch(bld, inst, 0, CHAN_Y);
-            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Y)) {
-               tmp5 = tmp1;
-            }
-            tmp1 = lp_build_mul( &bld->base, tmp1, tmp1);
-            tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
-
-            /* xmm6 = src.z */
-            /* xmm0 = xmm0 + src.z * src.z */
-            tmp1 = emit_fetch(bld, inst, 0, CHAN_Z);
-            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Z)) {
-               tmp6 = tmp1;
-            }
-            tmp1 = lp_build_mul( &bld->base, tmp1, tmp1);
-            tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
-
-            if (dims == 4) {
-               /* xmm7 = src.w */
-               /* xmm0 = xmm0 + src.w * src.w */
-               tmp1 = emit_fetch(bld, inst, 0, CHAN_W);
-               if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_W)) {
-                  tmp7 = tmp1;
-               }
-               tmp1 = lp_build_mul( &bld->base, tmp1, tmp1);
-               tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
-            }
-
-            /* xmm1 = 1 / sqrt(xmm0) */
-            tmp1 = lp_build_rsqrt( &bld->base, tmp0);
-
-            /* dst.x = xmm1 * src.x */
-            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X)) {
-               dst0[CHAN_X] = lp_build_mul( &bld->base, tmp4, tmp1);
-            }
-
-            /* dst.y = xmm1 * src.y */
-            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Y)) {
-               dst0[CHAN_Y] = lp_build_mul( &bld->base, tmp5, tmp1);
-            }
-
-            /* dst.z = xmm1 * src.z */
-            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Z)) {
-               dst0[CHAN_Z] = lp_build_mul( &bld->base, tmp6, tmp1);
-            }
-
-            /* dst.w = xmm1 * src.w */
-            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X) && dims == 4) {
-               dst0[CHAN_W] = lp_build_mul( &bld->base, tmp7, tmp1);
-            }
-         }
-
-         /* dst.w = 1.0 */
-         if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_W) && dims == 3) {
-            dst0[CHAN_W] = bld->base.one;
-         }
-      }
-      break;
-
-   case TGSI_OPCODE_DIV:
-      /* deprecated */
-      assert( 0 );
-      return 0;
-      break;
-
-   case TGSI_OPCODE_DP2:
-      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );  /* xmm0 = src[0].x */
-      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );  /* xmm1 = src[1].x */
-      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 * xmm1 */
-      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );  /* xmm1 = src[0].y */
-      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );  /* xmm2 = src[1].y */
-      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);              /* xmm1 = xmm1 * xmm2 */
-      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 + xmm1 */
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         dst0[chan_index] = tmp0;  /* dest[ch] = xmm0 */
-      }
-      break;
-
-   case TGSI_OPCODE_TXL:
-      emit_tex( bld, inst, TRUE, FALSE, dst0 );
-      break;
-
-   case TGSI_OPCODE_TXP:
-      emit_tex( bld, inst, FALSE, TRUE, dst0 );
-      break;
-      
-   case TGSI_OPCODE_BRK:
-      /* FIXME */
-      return 0;
-      break;
-
-   case TGSI_OPCODE_IF:
-      /* FIXME */
-      return 0;
-      break;
-
-   case TGSI_OPCODE_BGNFOR:
-      /* deprecated */
-      assert(0);
-      return 0;
-      break;
-
-   case TGSI_OPCODE_REP:
-      /* deprecated */
-      assert(0);
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ELSE:
-      /* FIXME */
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ENDIF:
-      /* FIXME */
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ENDFOR:
-      /* deprecated */
-      assert(0);
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ENDREP:
-      /* deprecated */
-      assert(0);
-      return 0;
-      break;
-
-   case TGSI_OPCODE_PUSHA:
-      /* deprecated? */
-      assert(0);
-      return 0;
-      break;
-
-   case TGSI_OPCODE_POPA:
-      /* deprecated? */
-      assert(0);
-      return 0;
-      break;
-
-   case TGSI_OPCODE_CEIL:
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         dst0[chan_index] = lp_build_ceil(&bld->base, tmp0);
-      }
-      break;
-
-   case TGSI_OPCODE_I2F:
-      /* deprecated? */
-      assert(0);
-      return 0;
-      break;
-
-   case TGSI_OPCODE_NOT:
-      /* deprecated? */
-      assert(0);
-      return 0;
-      break;
-
-   case TGSI_OPCODE_TRUNC:
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         dst0[chan_index] = lp_build_trunc(&bld->base, tmp0);
-      }
-      break;
-
-   case TGSI_OPCODE_SHL:
-      /* deprecated? */
-      assert(0);
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ISHR:
-      /* deprecated? */
-      assert(0);
-      return 0;
-      break;
-
-   case TGSI_OPCODE_AND:
-      /* deprecated? */
-      assert(0);
-      return 0;
-      break;
-
-   case TGSI_OPCODE_OR:
-      /* deprecated? */
-      assert(0);
-      return 0;
-      break;
-
-   case TGSI_OPCODE_MOD:
-      /* deprecated? */
-      assert(0);
-      return 0;
-      break;
-
-   case TGSI_OPCODE_XOR:
-      /* deprecated? */
-      assert(0);
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SAD:
-      /* deprecated? */
-      assert(0);
-      return 0;
-      break;
-
-   case TGSI_OPCODE_TXF:
-      /* deprecated? */
-      assert(0);
-      return 0;
-      break;
-
-   case TGSI_OPCODE_TXQ:
-      /* deprecated? */
-      assert(0);
-      return 0;
-      break;
-
-   case TGSI_OPCODE_CONT:
-      /* deprecated? */
-      assert(0);
-      return 0;
-      break;
-
-   case TGSI_OPCODE_EMIT:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ENDPRIM:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_NOP:
-      break;
-
-   default:
-      return 0;
-   }
-   
-   if(info->num_dst) {
-      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, dst0[chan_index]);
-      }
-   }
-
-   return 1;
-}
-
-
-void
-lp_build_tgsi_soa(LLVMBuilderRef builder,
-                  const struct tgsi_token *tokens,
-                  struct lp_type type,
-                  struct lp_build_mask_context *mask,
-                  LLVMValueRef consts_ptr,
-                  const LLVMValueRef *pos,
-                  const LLVMValueRef (*inputs)[NUM_CHANNELS],
-                  LLVMValueRef (*outputs)[NUM_CHANNELS],
-                  struct lp_build_sampler_soa *sampler)
-{
-   struct lp_build_tgsi_soa_context bld;
-   struct tgsi_parse_context parse;
-   uint num_immediates = 0;
-   unsigned i;
-
-   /* Setup build context */
-   memset(&bld, 0, sizeof bld);
-   lp_build_context_init(&bld.base, builder, type);
-   bld.mask = mask;
-   bld.pos = pos;
-   bld.inputs = inputs;
-   bld.outputs = outputs;
-   bld.consts_ptr = consts_ptr;
-   bld.sampler = sampler;
-
-   tgsi_parse_init( &parse, tokens );
-
-   while( !tgsi_parse_end_of_tokens( &parse ) ) {
-      tgsi_parse_token( &parse );
-
-      switch( parse.FullToken.Token.Type ) {
-      case TGSI_TOKEN_TYPE_DECLARATION:
-         /* Inputs already interpolated */
-         break;
-
-      case TGSI_TOKEN_TYPE_INSTRUCTION:
-         {
-            unsigned opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
-            const struct tgsi_opcode_info *info = tgsi_get_opcode_info(opcode);
-            if (!emit_instruction( &bld, &parse.FullToken.FullInstruction, info ))
-               _debug_printf("warning: failed to translate tgsi opcode %s to LLVM\n",
-                             info ? info->mnemonic : "<invalid>");
-         }
-
-         break;
-
-      case TGSI_TOKEN_TYPE_IMMEDIATE:
-         /* simply copy the immediate values into the next immediates[] slot */
-         {
-            const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
-            assert(size <= 4);
-            assert(num_immediates < LP_MAX_IMMEDIATES);
-            for( i = 0; i < size; ++i )
-               bld.immediates[num_immediates][i] =
-                  lp_build_const_scalar(type, parse.FullToken.FullImmediate.u[i].Float);
-            for( i = size; i < 4; ++i )
-               bld.immediates[num_immediates][i] = bld.base.undef;
-            num_immediates++;
-         }
-         break;
-
-      default:
-         assert( 0 );
-      }
-   }
-
-   tgsi_parse_free( &parse );
-}
-
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_type.c b/src/gallium/drivers/llvmpipe/lp_bld_type.c
deleted file mode 100644
index 1320a26721..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_type.c
+++ /dev/null
@@ -1,201 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-#include "util/u_debug.h"
-
-#include "lp_bld_type.h"
-#include "lp_bld_const.h"
-
-
-LLVMTypeRef
-lp_build_elem_type(struct lp_type type)
-{
-   if (type.floating) {
-      switch(type.width) {
-      case 32:
-         return LLVMFloatType();
-         break;
-      case 64:
-         return LLVMDoubleType();
-         break;
-      default:
-         assert(0);
-         return LLVMFloatType();
-      }
-   }
-   else {
-      return LLVMIntType(type.width);
-   }
-}
-
-
-LLVMTypeRef
-lp_build_vec_type(struct lp_type type)
-{
-   LLVMTypeRef elem_type = lp_build_elem_type(type);
-   return LLVMVectorType(elem_type, type.length);
-}
-
-
-/**
- * This function is a mirror of lp_build_elem_type() above.
- *
- * XXX: I'm not sure if it wouldn't be easier/efficient to just recreate the
- * type and check for identity.
- */
-boolean
-lp_check_elem_type(struct lp_type type, LLVMTypeRef elem_type) 
-{
-   LLVMTypeKind elem_kind;
-
-   assert(elem_type);
-   if(!elem_type)
-      return FALSE;
-
-   elem_kind = LLVMGetTypeKind(elem_type);
-
-   if (type.floating) {
-      switch(type.width) {
-      case 32:
-         if(elem_kind != LLVMFloatTypeKind)
-            return FALSE;
-         break;
-      case 64:
-         if(elem_kind != LLVMDoubleTypeKind)
-            return FALSE;
-         break;
-      default:
-         assert(0);
-         return FALSE;
-      }
-   }
-   else {
-      if(elem_kind != LLVMIntegerTypeKind)
-         return FALSE;
-
-      if(LLVMGetIntTypeWidth(elem_type) != type.width)
-         return FALSE;
-   }
-
-   return TRUE; 
-}
-
-
-boolean
-lp_check_vec_type(struct lp_type type, LLVMTypeRef vec_type) 
-{
-   LLVMTypeRef elem_type;
-
-   assert(vec_type);
-   if(!vec_type)
-      return FALSE;
-
-   if(LLVMGetTypeKind(vec_type) != LLVMVectorTypeKind)
-      return FALSE;
-
-   if(LLVMGetVectorSize(vec_type) != type.length)
-      return FALSE;
-
-   elem_type = LLVMGetElementType(vec_type);
-
-   return lp_check_elem_type(type, elem_type);
-}
-
-
-boolean
-lp_check_value(struct lp_type type, LLVMValueRef val) 
-{
-   LLVMTypeRef vec_type;
-
-   assert(val);
-   if(!val)
-      return FALSE;
-
-   vec_type = LLVMTypeOf(val);
-
-   return lp_check_vec_type(type, vec_type);
-}
-
-
-LLVMTypeRef
-lp_build_int_elem_type(struct lp_type type)
-{
-   return LLVMIntType(type.width);
-}
-
-
-LLVMTypeRef
-lp_build_int_vec_type(struct lp_type type)
-{
-   LLVMTypeRef elem_type = lp_build_int_elem_type(type);
-   return LLVMVectorType(elem_type, type.length);
-}
-
-
-struct lp_type
-lp_int_type(struct lp_type type)
-{
-   struct lp_type res_type;
-
-   memset(&res_type, 0, sizeof res_type);
-   res_type.width = type.width;
-   res_type.length = type.length;
-
-   return res_type;
-}
-
-
-/**
- * Return the type with twice the bit width (hence half the number of elements).
- */
-struct lp_type
-lp_wider_type(struct lp_type type)
-{
-   struct lp_type res_type;
-
-   memcpy(&res_type, &type, sizeof res_type);
-   res_type.width *= 2;
-   res_type.length /= 2;
-
-   assert(res_type.length);
-
-   return res_type;
-}
-
-
-void
-lp_build_context_init(struct lp_build_context *bld,
-                      LLVMBuilderRef builder,
-                      struct lp_type type)
-{
-   bld->builder = builder;
-   bld->type = type;
-   bld->undef = lp_build_undef(type);
-   bld->zero = lp_build_zero(type);
-   bld->one = lp_build_one(type);
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_type.h b/src/gallium/drivers/llvmpipe/lp_bld_type.h
deleted file mode 100644
index 2fb233d335..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_bld_type.h
+++ /dev/null
@@ -1,269 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * Convenient representation of SIMD types.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#ifndef LP_BLD_TYPE_H
-#define LP_BLD_TYPE_H
-
-
-#include <llvm-c/Core.h>  
-
-#include <pipe/p_compiler.h>
-
-
-/**
- * Native SIMD register width.
- *
- * 128 for all architectures we care about.
- */
-#define LP_NATIVE_VECTOR_WIDTH 128
-
-/**
- * Several functions can only cope with vectors of length up to this value.
- * You may need to increase that value if you want to represent bigger vectors.
- */
-#define LP_MAX_VECTOR_LENGTH 16
-
-
-/**
- * The LLVM type system can't conveniently express all the things we care about
- * on the types used for intermediate computations, such as signed vs unsigned,
- * normalized values, or fixed point.
- */
-struct lp_type {
-   /**
-    * Floating-point. Cannot be used with fixed. Integer numbers are
-    * represented by this zero.
-    */
-   unsigned floating:1;
-
-   /**
-    * Fixed-point. Cannot be used with floating. Integer numbers are
-    * represented by this zero.
-    */
-   unsigned fixed:1;
-
-   /**
-    * Whether it can represent negative values or not.
-    *
-    * If this is not set for floating point, it means that all values are
-    * assumed to be positive.
-    */
-   unsigned sign:1;
-
-   /**
-    * Whether values are normalized to fit [0, 1] interval, or [-1, 1]
-    * interval for signed types.
-    *
-    * For integer types it means the representable integer range should be
-    * interpreted as the interval above.
-    *
-    * For floating and fixed point formats it means the values should be
-    * clamped to the interval above.
-    */
-   unsigned norm:1;
-
-   /**
-    * Element width.
-    *
-    * For fixed point values, the fixed point is assumed to be at half the
-    * width.
-    */
-   unsigned width:14;
-
-   /**
-    * Vector length.
-    *
-    * width*length should be a power of two greater or equal to eight.
-    *
-    * @sa LP_MAX_VECTOR_LENGTH
-    */
-   unsigned length:14;
-};
-
-
-/**
- * We need most of the information here in order to correctly and efficiently
- * translate an arithmetic operation into LLVM IR. Putting it here avoids the
- * trouble of passing it as parameters.
- */
-struct lp_build_context
-{
-   LLVMBuilderRef builder;
-
-   /**
-    * This not only describes the input/output LLVM types, but also whether
-    * to normalize/clamp the results.
-    */
-   struct lp_type type;
-
-   /** Same as lp_build_undef(type) */
-   LLVMValueRef undef;
-
-   /** Same as lp_build_zero(type) */
-   LLVMValueRef zero;
-
-   /** Same as lp_build_one(type) */
-   LLVMValueRef one;
-};
-
-
-static INLINE struct lp_type
-lp_type_float(unsigned width)
-{
-   struct lp_type res_type;
-
-   memset(&res_type, 0, sizeof res_type);
-   res_type.floating = TRUE;
-   res_type.sign = TRUE;
-   res_type.width = width;
-   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
-
-   return res_type;
-}
-
-
-static INLINE struct lp_type
-lp_type_int(unsigned width)
-{
-   struct lp_type res_type;
-
-   memset(&res_type, 0, sizeof res_type);
-   res_type.sign = TRUE;
-   res_type.width = width;
-   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
-
-   return res_type;
-}
-
-
-static INLINE struct lp_type
-lp_type_uint(unsigned width)
-{
-   struct lp_type res_type;
-
-   memset(&res_type, 0, sizeof res_type);
-   res_type.width = width;
-   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
-
-   return res_type;
-}
-
-
-static INLINE struct lp_type
-lp_type_unorm(unsigned width)
-{
-   struct lp_type res_type;
-
-   memset(&res_type, 0, sizeof res_type);
-   res_type.norm = TRUE;
-   res_type.width = width;
-   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
-
-   return res_type;
-}
-
-
-static INLINE struct lp_type
-lp_type_fixed(unsigned width)
-{
-   struct lp_type res_type;
-
-   memset(&res_type, 0, sizeof res_type);
-   res_type.sign = TRUE;
-   res_type.fixed = TRUE;
-   res_type.width = width;
-   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
-
-   return res_type;
-}
-
-
-static INLINE struct lp_type
-lp_type_ufixed(unsigned width)
-{
-   struct lp_type res_type;
-
-   memset(&res_type, 0, sizeof res_type);
-   res_type.fixed = TRUE;
-   res_type.width = width;
-   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
-
-   return res_type;
-}
-
-
-LLVMTypeRef
-lp_build_elem_type(struct lp_type type);
-
-
-LLVMTypeRef
-lp_build_vec_type(struct lp_type type);
-
-
-boolean
-lp_check_elem_type(struct lp_type type, LLVMTypeRef elem_type);
-
-
-boolean
-lp_check_vec_type(struct lp_type type, LLVMTypeRef vec_type);
-
-
-boolean
-lp_check_value(struct lp_type type, LLVMValueRef val);
-
-
-LLVMTypeRef
-lp_build_int_elem_type(struct lp_type type);
-
-
-LLVMTypeRef
-lp_build_int_vec_type(struct lp_type type);
-
-
-struct lp_type
-lp_int_type(struct lp_type type);
-
-
-struct lp_type
-lp_wider_type(struct lp_type type);
-
-
-void
-lp_build_context_init(struct lp_build_context *bld,
-                      LLVMBuilderRef builder,
-                      struct lp_type type);
-
-
-#endif /* !LP_BLD_TYPE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_buffer.c b/src/gallium/drivers/llvmpipe/lp_buffer.c
index 66f1f8e138..9eda972081 100644
--- a/src/gallium/drivers/llvmpipe/lp_buffer.c
+++ b/src/gallium/drivers/llvmpipe/lp_buffer.c
@@ -26,12 +26,12 @@
  **************************************************************************/
 
 
+#include "util/u_inlines.h"
 #include "util/u_memory.h"
 #include "util/u_math.h"
 
 #include "lp_winsys.h"
 #include "lp_screen.h"
-#include "lp_texture.h"
 #include "lp_buffer.h"
 
 
@@ -108,32 +108,6 @@ llvmpipe_user_buffer_create(struct pipe_screen *screen,
 }
 
 
-static void
-llvmpipe_fence_reference(struct pipe_screen *screen,
-                         struct pipe_fence_handle **ptr,
-                         struct pipe_fence_handle *fence)
-{
-}
-
-
-static int
-llvmpipe_fence_signalled(struct pipe_screen *screen,
-                         struct pipe_fence_handle *fence,
-                         unsigned flag)
-{
-   return 0;
-}
-
-
-static int
-llvmpipe_fence_finish(struct pipe_screen *screen,
-                      struct pipe_fence_handle *fence,
-                      unsigned flag)
-{
-   return 0;
-}
-
-
 void
 llvmpipe_init_screen_buffer_funcs(struct pipe_screen *screen)
 {
@@ -142,9 +116,4 @@ llvmpipe_init_screen_buffer_funcs(struct pipe_screen *screen)
    screen->buffer_map = llvmpipe_buffer_map;
    screen->buffer_unmap = llvmpipe_buffer_unmap;
    screen->buffer_destroy = llvmpipe_buffer_destroy;
-
-   screen->fence_reference = llvmpipe_fence_reference;
-   screen->fence_signalled = llvmpipe_fence_signalled;
-   screen->fence_finish = llvmpipe_fence_finish;
-
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_clear.c b/src/gallium/drivers/llvmpipe/lp_clear.c
index 08d9f2e273..3e8c410925 100644
--- a/src/gallium/drivers/llvmpipe/lp_clear.c
+++ b/src/gallium/drivers/llvmpipe/lp_clear.c
@@ -33,12 +33,9 @@
 
 
 #include "pipe/p_defines.h"
-#include "util/u_pack_color.h"
 #include "lp_clear.h"
 #include "lp_context.h"
-#include "lp_surface.h"
-#include "lp_state.h"
-#include "lp_tile_cache.h"
+#include "lp_setup.h"
 
 
 /**
@@ -46,37 +43,16 @@
  * No masking, no scissor (clear entire buffer).
  */
 void
-llvmpipe_clear(struct pipe_context *pipe, unsigned buffers, const float *rgba,
-               double depth, unsigned stencil)
+llvmpipe_clear(struct pipe_context *pipe, 
+               unsigned buffers,
+               const float *rgba,
+               double depth,
+               unsigned stencil)
 {
    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
-   union util_color uc;
-   unsigned cv;
-   uint i;
 
    if (llvmpipe->no_rast)
       return;
 
-#if 0
-   llvmpipe_update_derived(llvmpipe); /* not needed?? */
-#endif
-
-   if (buffers & PIPE_CLEAR_COLOR) {
-      for (i = 0; i < llvmpipe->framebuffer.nr_cbufs; i++) {
-         struct pipe_surface *ps = llvmpipe->framebuffer.cbufs[i];
-
-         util_pack_color(rgba, ps->format, &uc);
-         lp_tile_cache_clear(llvmpipe->cbuf_cache[i], rgba, uc.ui);
-      }
-      llvmpipe->dirty_render_cache = TRUE;
-   }
-
-   if (buffers & PIPE_CLEAR_DEPTHSTENCIL) {
-      struct pipe_surface *ps = llvmpipe->framebuffer.zsbuf;
-
-      cv = util_pack_z_stencil(ps->format, depth, stencil);
-
-      /* non-cached surface */
-      pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, cv);
-   }
+   lp_setup_clear( llvmpipe->setup, rgba, depth, stencil, buffers );
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c
index 1cc3c9227c..43d610631d 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.c
+++ b/src/gallium/drivers/llvmpipe/lp_context.c
@@ -33,70 +33,22 @@
 #include "draw/draw_context.h"
 #include "draw/draw_vbuf.h"
 #include "pipe/p_defines.h"
+#include "util/u_inlines.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 #include "lp_clear.h"
 #include "lp_context.h"
 #include "lp_flush.h"
-#include "lp_prim_vbuf.h"
+#include "lp_perf.h"
 #include "lp_state.h"
 #include "lp_surface.h"
-#include "lp_tile_cache.h"
-#include "lp_tex_cache.h"
 #include "lp_texture.h"
 #include "lp_winsys.h"
 #include "lp_query.h"
+#include "lp_setup.h"
 
 
 
-/**
- * Map any drawing surfaces which aren't already mapped
- */
-void
-llvmpipe_map_transfers(struct llvmpipe_context *lp)
-{
-   struct pipe_screen *screen = lp->pipe.screen;
-   struct pipe_surface *zsbuf = lp->framebuffer.zsbuf;
-   unsigned i;
-
-   for (i = 0; i < lp->framebuffer.nr_cbufs; i++) {
-      lp_tile_cache_map_transfers(lp->cbuf_cache[i]);
-   }
-
-   if(zsbuf) {
-      if(!lp->zsbuf_transfer)
-         lp->zsbuf_transfer = screen->get_tex_transfer(screen, zsbuf->texture,
-                                                       zsbuf->face, zsbuf->level, zsbuf->zslice,
-                                                       PIPE_TRANSFER_READ_WRITE,
-                                                       0, 0, zsbuf->width, zsbuf->height);
-      if(lp->zsbuf_transfer && !lp->zsbuf_map)
-         lp->zsbuf_map = screen->transfer_map(screen, lp->zsbuf_transfer);
-
-   }
-}
-
-
-/**
- * Unmap any mapped drawing surfaces
- */
-void
-llvmpipe_unmap_transfers(struct llvmpipe_context *lp)
-{
-   uint i;
-
-   for (i = 0; i < lp->framebuffer.nr_cbufs; i++) {
-      lp_tile_cache_unmap_transfers(lp->cbuf_cache[i]);
-   }
-
-   if(lp->zsbuf_transfer) {
-      struct pipe_screen *screen = lp->pipe.screen;
-
-      if(lp->zsbuf_map) {
-         screen->transfer_unmap(screen, lp->zsbuf_transfer);
-         lp->zsbuf_map = NULL;
-      }
-   }
-}
 
 
 static void llvmpipe_destroy( struct pipe_context *pipe )
@@ -104,28 +56,30 @@ static void llvmpipe_destroy( struct pipe_context *pipe )
    struct llvmpipe_context *llvmpipe = llvmpipe_context( pipe );
    uint i;
 
+   lp_print_counters();
+
+   /* This will also destroy llvmpipe->setup:
+    */
    if (llvmpipe->draw)
       draw_destroy( llvmpipe->draw );
 
    for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
-      lp_destroy_tile_cache(llvmpipe->cbuf_cache[i]);
       pipe_surface_reference(&llvmpipe->framebuffer.cbufs[i], NULL);
    }
+
    pipe_surface_reference(&llvmpipe->framebuffer.zsbuf, NULL);
 
    for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
-      lp_destroy_tex_tile_cache(llvmpipe->tex_cache[i]);
       pipe_texture_reference(&llvmpipe->texture[i], NULL);
    }
 
    for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
-      lp_destroy_tex_tile_cache(llvmpipe->vertex_tex_cache[i]);
       pipe_texture_reference(&llvmpipe->vertex_textures[i], NULL);
    }
 
    for (i = 0; i < Elements(llvmpipe->constants); i++) {
-      if (llvmpipe->constants[i].buffer) {
-         pipe_buffer_reference(&llvmpipe->constants[i].buffer, NULL);
+      if (llvmpipe->constants[i]) {
+         pipe_buffer_reference(&llvmpipe->constants[i], NULL);
       }
    }
 
@@ -138,33 +92,8 @@ llvmpipe_is_texture_referenced( struct pipe_context *pipe,
 				unsigned face, unsigned level)
 {
    struct llvmpipe_context *llvmpipe = llvmpipe_context( pipe );
-   unsigned i;
-
-   /* check if any of the bound drawing surfaces are this texture */
-   if(llvmpipe->dirty_render_cache) {
-      for (i = 0; i < llvmpipe->framebuffer.nr_cbufs; i++) {
-         if(llvmpipe->framebuffer.cbufs[i] && 
-            llvmpipe->framebuffer.cbufs[i]->texture == texture)
-            return PIPE_REFERENCED_FOR_WRITE;
-      }
-      if(llvmpipe->framebuffer.zsbuf && 
-         llvmpipe->framebuffer.zsbuf->texture == texture)
-         return PIPE_REFERENCED_FOR_WRITE;
-   }
 
-   /* check if any of the tex_cache textures are this texture */
-   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
-      if (llvmpipe->tex_cache[i] &&
-            llvmpipe->tex_cache[i]->texture == texture)
-         return PIPE_REFERENCED_FOR_READ;
-   }
-   for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
-      if (llvmpipe->vertex_tex_cache[i] &&
-          llvmpipe->vertex_tex_cache[i]->texture == texture)
-         return PIPE_REFERENCED_FOR_READ;
-   }
-   
-   return PIPE_UNREFERENCED;
+   return lp_setup_is_texture_referenced(llvmpipe->setup, texture);
 }
 
 static unsigned int
@@ -175,10 +104,9 @@ llvmpipe_is_buffer_referenced( struct pipe_context *pipe,
 }
 
 struct pipe_context *
-llvmpipe_create( struct pipe_screen *screen )
+llvmpipe_create_context( struct pipe_screen *screen, void *priv )
 {
    struct llvmpipe_context *llvmpipe;
-   uint i;
 
    llvmpipe = align_malloc(sizeof(struct llvmpipe_context), 16);
    if (!llvmpipe)
@@ -190,6 +118,7 @@ llvmpipe_create( struct pipe_screen *screen )
 
    llvmpipe->pipe.winsys = screen->winsys;
    llvmpipe->pipe.screen = screen;
+   llvmpipe->pipe.priv = priv;
    llvmpipe->pipe.destroy = llvmpipe_destroy;
 
    /* state setters */
@@ -242,19 +171,6 @@ llvmpipe_create( struct pipe_screen *screen )
    llvmpipe->pipe.is_buffer_referenced = llvmpipe_is_buffer_referenced;
 
    llvmpipe_init_query_funcs( llvmpipe );
-   llvmpipe_init_texture_funcs( llvmpipe );
-
-   /*
-    * Alloc caches for accessing drawing surfaces and textures.
-    */
-   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++)
-      llvmpipe->cbuf_cache[i] = lp_create_tile_cache( screen );
-
-   for (i = 0; i < PIPE_MAX_SAMPLERS; i++)
-      llvmpipe->tex_cache[i] = lp_create_tex_tile_cache( screen );
-   for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++)
-      llvmpipe->vertex_tex_cache[i] = lp_create_tex_tile_cache(screen);
-
 
    /*
     * Create drawing context and plug our rendering stage into it.
@@ -268,19 +184,11 @@ llvmpipe_create( struct pipe_screen *screen )
    if (debug_get_bool_option( "LP_NO_RAST", FALSE ))
       llvmpipe->no_rast = TRUE;
 
-   llvmpipe->vbuf_backend = lp_create_vbuf_backend(llvmpipe);
-   if (!llvmpipe->vbuf_backend)
-      goto fail;
-
-   llvmpipe->vbuf = draw_vbuf_stage(llvmpipe->draw, llvmpipe->vbuf_backend);
-   if (!llvmpipe->vbuf)
+   llvmpipe->setup = lp_setup_create( screen,
+                                      llvmpipe->draw );
+   if (!llvmpipe->setup)
       goto fail;
 
-   draw_set_rasterize_stage(llvmpipe->draw, llvmpipe->vbuf);
-   draw_set_render(llvmpipe->draw, llvmpipe->vbuf_backend);
-
-
-
    /* plug in AA line/point stages */
    draw_install_aaline_stage(llvmpipe->draw, &llvmpipe->pipe);
    draw_install_aapoint_stage(llvmpipe->draw, &llvmpipe->pipe);
@@ -292,6 +200,8 @@ llvmpipe_create( struct pipe_screen *screen )
 
    lp_init_surface_functions(llvmpipe);
 
+   lp_reset_counters();
+
    return &llvmpipe->pipe;
 
  fail:
diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h
index 6411797cf5..3bde485ac0 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -42,12 +42,10 @@
 struct llvmpipe_vbuf_render;
 struct draw_context;
 struct draw_stage;
-struct llvmpipe_tile_cache;
-struct llvmpipe_tex_tile_cache;
 struct lp_fragment_shader;
 struct lp_vertex_shader;
 struct lp_blend_state;
-
+struct setup_context;
 
 struct llvmpipe_context {
    struct pipe_context pipe;  /**< base class */
@@ -62,9 +60,9 @@ struct llvmpipe_context {
    const struct lp_vertex_shader *vs;
 
    /** Other rendering state */
-   struct pipe_blend_color blend_color[4][16];
+   struct pipe_blend_color blend_color;
    struct pipe_clip_state clip;
-   struct pipe_constant_buffer constants[PIPE_SHADER_TYPES];
+   struct pipe_buffer *constants[PIPE_SHADER_TYPES];
    struct pipe_framebuffer_state framebuffer;
    struct pipe_poly_stipple poly_stipple;
    struct pipe_scissor_state scissor;
@@ -94,52 +92,26 @@ struct llvmpipe_context {
    
    /** Vertex format */
    struct vertex_info vertex_info;
-   struct vertex_info vertex_info_vbuf;
 
    /** Which vertex shader output slot contains point size */
    int psize_slot;
 
-   /* The reduced version of the primitive supplied by the state
-    * tracker.
-    */
-   unsigned reduced_api_prim;
-
-   /* The reduced primitive after unfilled triangles, wide-line
-    * decomposition, etc, are taken into account.  This is the
-    * primitive actually rasterized.
-    */
-   unsigned reduced_prim;
-
-   /** Derived from scissor and surface bounds: */
-   struct pipe_scissor_state cliprect;
-
-   unsigned line_stipple_counter;
+   /** The tiling engine */
+   struct setup_context *setup;
 
    /** The primitive drawing context */
    struct draw_context *draw;
 
-   /** Draw module backend */
-   struct vbuf_render *vbuf_backend;
-   struct draw_stage *vbuf;
-
-   boolean dirty_render_cache;
-   
-   struct llvmpipe_tile_cache *cbuf_cache[PIPE_MAX_COLOR_BUFS];
-   
-   /* TODO: we shouldn't be using external interfaces internally like this */
-   struct pipe_transfer *zsbuf_transfer;
-   uint8_t *zsbuf_map;
-
    unsigned tex_timestamp;
-   struct llvmpipe_tex_tile_cache *tex_cache[PIPE_MAX_SAMPLERS];
-   struct llvmpipe_tex_tile_cache *vertex_tex_cache[PIPE_MAX_VERTEX_SAMPLERS];
+   boolean no_rast;
 
-   unsigned no_rast : 1;
-
-   struct lp_jit_context jit_context;
 };
 
 
+struct pipe_context *
+llvmpipe_create_context( struct pipe_screen *screen, void *priv );
+
+
 static INLINE struct llvmpipe_context *
 llvmpipe_context( struct pipe_context *pipe )
 {
diff --git a/src/gallium/drivers/llvmpipe/lp_debug.h b/src/gallium/drivers/llvmpipe/lp_debug.h
index 74b2757494..ee81814361 100644
--- a/src/gallium/drivers/llvmpipe/lp_debug.h
+++ b/src/gallium/drivers/llvmpipe/lp_debug.h
@@ -45,6 +45,11 @@ st_print_current(void);
 #define DEBUG_QUERY     0x40
 #define DEBUG_SCREEN    0x80
 #define DEBUG_JIT       0x100
+#define DEBUG_SHOW_TILES    0x200
+#define DEBUG_SHOW_SUBTILES 0x400
+#define DEBUG_COUNTERS      0x800
+#define DEBUG_NO_LLVM_OPT  0x1000
+
 
 #ifdef DEBUG
 extern int LP_DEBUG;
diff --git a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
index c152b4413f..3dd68d5794 100644
--- a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
+++ b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
@@ -33,8 +33,6 @@
 
 #include "pipe/p_defines.h"
 #include "pipe/p_context.h"
-#include "pipe/internal/p_winsys_screen.h"
-#include "pipe/p_inlines.h"
 #include "util/u_prim.h"
 
 #include "lp_buffer.h"
@@ -70,13 +68,9 @@ llvmpipe_draw_range_elements(struct pipe_context *pipe,
    struct draw_context *draw = lp->draw;
    unsigned i;
 
-   lp->reduced_api_prim = u_reduced_prim(mode);
-
    if (lp->dirty)
       llvmpipe_update_derived( lp );
 
-   llvmpipe_map_transfers(lp);
-
    /*
     * Map vertex buffers
     */
@@ -118,10 +112,6 @@ llvmpipe_draw_range_elements(struct pipe_context *pipe,
     * internally when this condition is seen?)
     */
    draw_flush(draw);
-
-   /* Note: leave drawing surfaces mapped */
-
-   lp->dirty_render_cache = TRUE;
 }
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_fence.c b/src/gallium/drivers/llvmpipe/lp_fence.c
new file mode 100644
index 0000000000..525c117f31
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_fence.c
@@ -0,0 +1,110 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "pipe/p_screen.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "lp_fence.h"
+
+
+/** Allocate a fence holding one reference; returns NULL if allocation fails. */
+struct lp_fence *
+lp_fence_create(unsigned rank)
+{
+   struct lp_fence *fence = CALLOC_STRUCT(lp_fence);
+   if (!fence)
+      return NULL;   /* don't touch the members of a failed allocation */
+
+   pipe_reference_init(&fence->reference, 1);
+   pipe_mutex_init(fence->mutex);
+   pipe_condvar_init(fence->signalled);
+   fence->rank = rank;   /* fence_finish waits until count reaches this */
+   return fence;
+}
+
+/** Release the fence's mutex and condition variable, then free its storage. */
+static void
+lp_fence_destroy(struct lp_fence *fence)
+{
+   pipe_mutex_destroy(fence->mutex);
+   pipe_condvar_destroy(fence->signalled);
+   FREE(fence);   /* fence must not be used after this point */
+}
+
+/** fence_reference hook: retarget *ptr to fence, releasing the old one. */
+static void
+llvmpipe_fence_reference(struct pipe_screen *screen,
+                         struct pipe_fence_handle **ptr,
+                         struct pipe_fence_handle *fence)
+{
+   struct lp_fence *old = (struct lp_fence *) *ptr;
+   struct lp_fence *f = (struct lp_fence *) fence;
+
+   if (pipe_reference(&old->reference, &f->reference))
+      lp_fence_destroy(old);   /* last reference to the old fence dropped */
+   *ptr = fence;   /* pipe_reference() only adjusts counts; store the handle */
+}
+
+/** fence_signalled hook: returns 1 when count equals rank, otherwise 0. */
+static int
+llvmpipe_fence_signalled(struct pipe_screen *screen,
+                         struct pipe_fence_handle *fence,
+                         unsigned flag)
+{
+   const struct lp_fence *lpf = (const struct lp_fence *) fence;
+   const int reached = (lpf->count == lpf->rank);
+   return reached;
+}
+
+/** fence_finish hook: block until count reaches rank; always returns 0. */
+static int
+llvmpipe_fence_finish(struct pipe_screen *screen,
+                      struct pipe_fence_handle *fence_handle,
+                      unsigned flag)
+{
+   struct lp_fence *fence = (struct lp_fence *) fence_handle;
+
+   pipe_mutex_lock(fence->mutex);
+   while (fence->count < fence->rank) {   /* re-check after every wakeup */
+      pipe_condvar_wait(fence->signalled, fence->mutex);
+   }
+   pipe_mutex_unlock(fence->mutex);
+
+   return 0;
+}
+
+
+
+/** Install the fence_reference/fence_signalled/fence_finish hooks on screen. */
+void
+llvmpipe_init_screen_fence_funcs(struct pipe_screen *screen)
+{
+   screen->fence_finish = llvmpipe_fence_finish;
+   screen->fence_signalled = llvmpipe_fence_signalled;
+   screen->fence_reference = llvmpipe_fence_reference;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_debug.h b/src/gallium/drivers/llvmpipe/lp_fence.h
index 583e6132b4..c90e6de423 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_debug.h
+++ b/src/gallium/drivers/llvmpipe/lp_fence.h
@@ -26,39 +26,35 @@
  **************************************************************************/
 
 
-#ifndef LP_BLD_DEBUG_H
-#define LP_BLD_DEBUG_H
+#ifndef LP_FENCE_H
+#define LP_FENCE_H
 
 
-#include <llvm-c/Core.h>
+#include "os/os_thread.h"
+#include "pipe/p_state.h"
 
-#include "pipe/p_compiler.h"
-#include "util/u_string.h"
 
+struct pipe_screen;
 
-static INLINE void
-lp_build_name(LLVMValueRef val, const char *format, ...)
+/** Reference-counted fence; lp_fence.c treats it as done once count reaches rank. */
+struct lp_fence
 {
-#ifdef DEBUG
-   char name[32];
-   va_list ap;
-   va_start(ap, format);
-   util_vsnprintf(name, sizeof name, format, ap);
-   va_end(ap);
-   LLVMSetValueName(val, name);
-#else
-   (void)val;
-   (void)format;
-#endif
-}
+   struct pipe_reference reference;   /* initialised to 1 by lp_fence_create() */
+
+   pipe_mutex mutex;         /* held around the wait in llvmpipe_fence_finish() */
+   pipe_condvar signalled;   /* waited on while count < rank */
+
+   unsigned rank;    /* target value passed to lp_fence_create() */
+   unsigned count;   /* progress value compared against rank */
+};
 
 
-boolean
-lp_check_alignment(const void *ptr, unsigned alignment);
+struct lp_fence *
+lp_fence_create(unsigned rank);
 
 
 void
-lp_disassemble(const void* func);
+llvmpipe_init_screen_fence_funcs(struct pipe_screen *screen);
 
 
-#endif /* !LP_BLD_DEBUG_H */
+#endif /* LP_FENCE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_flush.c b/src/gallium/drivers/llvmpipe/lp_flush.c
index cd8381fe30..bf832433be 100644
--- a/src/gallium/drivers/llvmpipe/lp_flush.c
+++ b/src/gallium/drivers/llvmpipe/lp_flush.c
@@ -34,11 +34,7 @@
 #include "draw/draw_context.h"
 #include "lp_flush.h"
 #include "lp_context.h"
-#include "lp_surface.h"
-#include "lp_state.h"
-#include "lp_tile_cache.h"
-#include "lp_tex_cache.h"
-#include "lp_winsys.h"
+#include "lp_setup.h"
 
 
 void
@@ -47,56 +43,52 @@ llvmpipe_flush( struct pipe_context *pipe,
                 struct pipe_fence_handle **fence )
 {
    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
-   uint i;
 
    draw_flush(llvmpipe->draw);
 
-   if (flags & PIPE_FLUSH_SWAPBUFFERS) {
-      /* If this is a swapbuffers, just flush color buffers.
-       *
-       * The zbuffer changes are not discarded, but held in the cache
-       * in the hope that a later clear will wipe them out.
-       */
-      for (i = 0; i < llvmpipe->framebuffer.nr_cbufs; i++)
-         if (llvmpipe->cbuf_cache[i]) {
-            lp_tile_cache_map_transfers(llvmpipe->cbuf_cache[i]);
-            lp_flush_tile_cache(llvmpipe->cbuf_cache[i]);
-         }
+   if (fence) {
+      if ((flags & (PIPE_FLUSH_SWAPBUFFERS |
+                    PIPE_FLUSH_RENDER_CACHE))) {
+         /* if we're going to flush the setup/rasterization modules, emit
+          * a fence.
+          * XXX this (and the code below) may need fine tuning...
+          */
+         *fence = lp_setup_fence( llvmpipe->setup );
+      }
+      else {
+         *fence = NULL;
+      }
+   }
 
-      /* Need this call for hardware buffers before swapbuffers.
-       *
-       * there should probably be another/different flush-type function
-       * that's called before swapbuffers because we don't always want
-       * to unmap surfaces when flushing.
-       */
-      llvmpipe_unmap_transfers(llvmpipe);
+   /* XXX the lp_setup_flush(flags) param is not a bool, and it's ignored
+    * at this time!
+    */
+   if (flags & PIPE_FLUSH_SWAPBUFFERS) {
+      lp_setup_flush( llvmpipe->setup, FALSE );
    }
    else if (flags & PIPE_FLUSH_RENDER_CACHE) {
-      for (i = 0; i < llvmpipe->framebuffer.nr_cbufs; i++)
-         if (llvmpipe->cbuf_cache[i]) {
-            lp_tile_cache_map_transfers(llvmpipe->cbuf_cache[i]);
-            lp_flush_tile_cache(llvmpipe->cbuf_cache[i]);
-         }
-
-      /* FIXME: untile zsbuf! */
-     
-      llvmpipe->dirty_render_cache = FALSE;
+      lp_setup_flush( llvmpipe->setup, TRUE );
    }
 
    /* Enable to dump BMPs of the color/depth buffers each frame */
 #if 0
-   if(flags & PIPE_FLUSH_FRAME) {
+   if (flags & PIPE_FLUSH_FRAME) {
       static unsigned frame_no = 1;
-      static char filename[256];
-      util_snprintf(filename, sizeof(filename), "cbuf_%u.bmp", frame_no);
-      debug_dump_surface_bmp(filename, llvmpipe->framebuffer.cbufs[0]);
-      util_snprintf(filename, sizeof(filename), "zsbuf_%u.bmp", frame_no);
-      debug_dump_surface_bmp(filename, llvmpipe->framebuffer.zsbuf);
+      char filename[256];
+      unsigned i;
+
+      for (i = 0; i < llvmpipe->framebuffer.nr_cbufs; i++) {
+	 util_snprintf(filename, sizeof(filename), "cbuf%u_%u", i, frame_no);
+         debug_dump_surface(filename, llvmpipe->framebuffer.cbufs[i]);
+      }
+
+      if (0) {
+         util_snprintf(filename, sizeof(filename), "zsbuf_%u", frame_no);
+         debug_dump_surface(filename, llvmpipe->framebuffer.zsbuf);
+      }
+
       ++frame_no;
    }
 #endif
-   
-   if (fence)
-      *fence = NULL;
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c
index 4ef0783f3e..27b54c5959 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.c
+++ b/src/gallium/drivers/llvmpipe/lp_jit.c
@@ -37,9 +37,10 @@
 
 #include "util/u_memory.h"
 #include "util/u_cpu_detect.h"
+#include "lp_debug.h"
 #include "lp_screen.h"
-#include "lp_bld_intr.h"
-#include "lp_bld_misc.h"
+#include "gallivm/lp_bld_intr.h"
+#include "gallivm/lp_bld_misc.h"
 #include "lp_jit.h"
 
 
@@ -79,13 +80,17 @@ lp_jit_init_globals(struct llvmpipe_screen *screen)
 
    /* struct lp_jit_context */
    {
-      LLVMTypeRef elem_types[4];
+      LLVMTypeRef elem_types[8];
       LLVMTypeRef context_type;
 
       elem_types[0] = LLVMPointerType(LLVMFloatType(), 0); /* constants */
       elem_types[1] = LLVMFloatType();                     /* alpha_ref_value */
-      elem_types[2] = LLVMPointerType(LLVMInt8Type(), 0);  /* blend_color */
-      elem_types[3] = LLVMArrayType(texture_type, PIPE_MAX_SAMPLERS); /* textures */
+      elem_types[2] = LLVMFloatType();                     /* scissor_xmin */
+      elem_types[3] = LLVMFloatType();                     /* scissor_ymin */
+      elem_types[4] = LLVMFloatType();                     /* scissor_xmax */
+      elem_types[5] = LLVMFloatType();                     /* scissor_ymax */
+      elem_types[6] = LLVMPointerType(LLVMInt8Type(), 0);  /* blend_color */
+      elem_types[7] = LLVMArrayType(texture_type, PIPE_MAX_SAMPLERS); /* textures */
 
       context_type = LLVMStructType(elem_types, Elements(elem_types), 0);
 
@@ -93,8 +98,16 @@ lp_jit_init_globals(struct llvmpipe_screen *screen)
                              screen->target, context_type, 0);
       LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, alpha_ref_value,
                              screen->target, context_type, 1);
-      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, blend_color,
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, scissor_xmin,
                              screen->target, context_type, 2);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, scissor_ymin,
+                             screen->target, context_type, 3);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, scissor_xmax,
+                             screen->target, context_type, 4);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, scissor_ymax,
+                             screen->target, context_type, 5);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, blend_color,
+                             screen->target, context_type, 6);
       LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, textures,
                              screen->target, context_type,
                              LP_JIT_CONTEXT_TEXTURES_INDEX);
@@ -154,20 +167,23 @@ lp_jit_screen_init(struct llvmpipe_screen *screen)
 
    screen->pass = LLVMCreateFunctionPassManager(screen->provider);
    LLVMAddTargetData(screen->target, screen->pass);
-   /* These are the passes currently listed in llvm-c/Transforms/Scalar.h,
-    * but there are more on SVN. */
-   /* TODO: Add more passes */
-   LLVMAddConstantPropagationPass(screen->pass);
-   if(util_cpu_caps.has_sse4_1) {
-      /* FIXME: There is a bug in this pass, whereby the combination of fptosi
-       * and sitofp (necessary for trunc/floor/ceil/round implementation)
-       * somehow becomes invalid code.
-       */
-      LLVMAddInstructionCombiningPass(screen->pass);
+
+   if ((LP_DEBUG & DEBUG_NO_LLVM_OPT) == 0) {
+      /* These are the passes currently listed in llvm-c/Transforms/Scalar.h,
+       * but there are more on SVN. */
+      /* TODO: Add more passes */
+      LLVMAddConstantPropagationPass(screen->pass);
+      if(util_cpu_caps.has_sse4_1) {
+         /* FIXME: There is a bug in this pass, whereby the combination of fptosi
+          * and sitofp (necessary for trunc/floor/ceil/round implementation)
+          * somehow becomes invalid code.
+          */
+         LLVMAddInstructionCombiningPass(screen->pass);
+      }
+      LLVMAddPromoteMemoryToRegisterPass(screen->pass);
+      LLVMAddGVNPass(screen->pass);
+      LLVMAddCFGSimplificationPass(screen->pass);
    }
-   LLVMAddPromoteMemoryToRegisterPass(screen->pass);
-   LLVMAddGVNPass(screen->pass);
-   LLVMAddCFGSimplificationPass(screen->pass);
 
    lp_jit_init_globals(screen);
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h
index 277b690c02..8df3015d4b 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.h
+++ b/src/gallium/drivers/llvmpipe/lp_jit.h
@@ -36,7 +36,7 @@
 #define LP_JIT_H
 
 
-#include "lp_bld_struct.h"
+#include "gallivm/lp_bld_struct.h"
 
 #include "pipe/p_state.h"
 
@@ -79,6 +79,9 @@ struct lp_jit_context
 
    float alpha_ref_value;
 
+   /** scissor rectangle bounds, stored as floats (not ints) */
+   float scissor_xmin, scissor_ymin, scissor_xmax, scissor_ymax;
+
    /* FIXME: store (also?) in floats */
    uint8_t *blend_color;
 
@@ -92,25 +95,43 @@ struct lp_jit_context
 #define lp_jit_context_alpha_ref_value(_builder, _ptr) \
    lp_build_struct_get(_builder, _ptr, 1, "alpha_ref_value")
 
+#define lp_jit_context_scissor_xmin_value(_builder, _ptr) \
+   lp_build_struct_get(_builder, _ptr, 2, "scissor_xmin")
+
+#define lp_jit_context_scissor_ymin_value(_builder, _ptr) \
+   lp_build_struct_get(_builder, _ptr, 3, "scissor_ymin")
+
+#define lp_jit_context_scissor_xmax_value(_builder, _ptr) \
+   lp_build_struct_get(_builder, _ptr, 4, "scissor_xmax")
+
+#define lp_jit_context_scissor_ymax_value(_builder, _ptr) \
+   lp_build_struct_get(_builder, _ptr, 5, "scissor_ymax")
+
 #define lp_jit_context_blend_color(_builder, _ptr) \
-   lp_build_struct_get(_builder, _ptr, 2, "blend_color")
+   lp_build_struct_get(_builder, _ptr, 6, "blend_color")
 
-#define LP_JIT_CONTEXT_TEXTURES_INDEX 3
+#define LP_JIT_CONTEXT_TEXTURES_INDEX 7
 
 #define lp_jit_context_textures(_builder, _ptr) \
    lp_build_struct_get_ptr(_builder, _ptr, LP_JIT_CONTEXT_TEXTURES_INDEX, "textures")
 
 
 typedef void
-(*lp_jit_frag_func)(struct lp_jit_context *context,
+(*lp_jit_frag_func)(const struct lp_jit_context *context,
                     uint32_t x,
                     uint32_t y,
                     const void *a0,
                     const void *dadx,
                     const void *dady,
-                    uint32_t *mask,
-                    void *color,
-                    void *depth);
+                    uint8_t **color,
+                    void *depth,
+                    const int32_t c1,
+                    const int32_t c2,
+                    const int32_t c3,
+                    const int32_t *step1,
+                    const int32_t *step2,
+                    const int32_t *step3);
+
 
 void
 lp_jit_screen_cleanup(struct llvmpipe_screen *screen);
diff --git a/src/gallium/drivers/llvmpipe/lp_perf.c b/src/gallium/drivers/llvmpipe/lp_perf.c
new file mode 100644
index 0000000000..a316597675
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_perf.c
@@ -0,0 +1,95 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "util/u_debug.h"
+#include "lp_debug.h"
+#include "lp_perf.h"
+
+
+
+struct lp_counters lp_count;
+
+/** Zero every field of the global lp_count counters. */
+void
+lp_reset_counters(void)
+{
+   memset(&lp_count, 0, sizeof(lp_count));
+}
+
+/** Print all counters with debug_printf(); does nothing unless DEBUG_COUNTERS is set in LP_DEBUG. */
+void
+lp_print_counters(void)
+{
+   if (LP_DEBUG & DEBUG_COUNTERS) {
+      unsigned total_64, total_16, total_4;
+      float p1, p2, p3, denom;
+      double avg_compile_time = 0.0;
+
+      debug_printf("llvmpipe: nr_triangles:               %9u\n", lp_count.nr_tris);
+      debug_printf("llvmpipe: nr_culled_triangles:        %9u\n", lp_count.nr_culled_tris);
+
+      /* A zero total means every numerator is zero too, so a denominator of 1 prints 0% instead of NaN. */
+      total_64 = (lp_count.nr_empty_64 +
+                  lp_count.nr_fully_covered_64 +
+                  lp_count.nr_partially_covered_64);
+      denom = total_64 ? (float) total_64 : 1.0f;
+      p1 = 100.0 * (float) lp_count.nr_empty_64 / denom;
+      p2 = 100.0 * (float) lp_count.nr_fully_covered_64 / denom;
+      p3 = 100.0 * (float) lp_count.nr_partially_covered_64 / denom;
+      debug_printf("llvmpipe: nr_empty_64x64:             %9u (%2.0f%% of %u)\n", lp_count.nr_empty_64, p1, total_64);
+      debug_printf("llvmpipe: nr_fully_covered_64x64:     %9u (%2.0f%% of %u)\n", lp_count.nr_fully_covered_64, p2, total_64);
+      debug_printf("llvmpipe: nr_partially_covered_64x64: %9u (%2.0f%% of %u)\n", lp_count.nr_partially_covered_64, p3, total_64);
+
+      total_16 = (lp_count.nr_empty_16 +
+                  lp_count.nr_fully_covered_16 +
+                  lp_count.nr_partially_covered_16);
+      denom = total_16 ? (float) total_16 : 1.0f;
+      p1 = 100.0 * (float) lp_count.nr_empty_16 / denom;
+      p2 = 100.0 * (float) lp_count.nr_fully_covered_16 / denom;
+      p3 = 100.0 * (float) lp_count.nr_partially_covered_16 / denom;
+      debug_printf("llvmpipe: nr_empty_16x16:             %9u (%2.0f%% of %u)\n", lp_count.nr_empty_16, p1, total_16);
+      debug_printf("llvmpipe: nr_fully_covered_16x16:     %9u (%2.0f%% of %u)\n", lp_count.nr_fully_covered_16, p2, total_16);
+      debug_printf("llvmpipe: nr_partially_covered_16x16: %9u (%2.0f%% of %u)\n", lp_count.nr_partially_covered_16, p3, total_16);
+
+      total_4 = (lp_count.nr_empty_4 + lp_count.nr_non_empty_4);
+      denom = total_4 ? (float) total_4 : 1.0f;
+      p1 = 100.0 * (float) lp_count.nr_empty_4 / denom;
+      p2 = 100.0 * (float) lp_count.nr_non_empty_4 / denom;
+      debug_printf("llvmpipe: nr_empty_4x4:               %9u (%2.0f%% of %u)\n", lp_count.nr_empty_4, p1, total_4);
+      debug_printf("llvmpipe: nr_non_empty_4x4:           %9u (%2.0f%% of %u)\n", lp_count.nr_non_empty_4, p2, total_4);
+
+      debug_printf("llvmpipe: nr_color_tile_clear:        %9u\n", lp_count.nr_color_tile_clear);
+      debug_printf("llvmpipe: nr_color_tile_load:         %9u\n", lp_count.nr_color_tile_load);
+      debug_printf("llvmpipe: nr_color_tile_store:        %9u\n", lp_count.nr_color_tile_store);
+      /* Average only when something was compiled; otherwise report 0 rather than dividing by zero. */
+      if (lp_count.nr_llvm_compiles)
+         avg_compile_time = lp_count.llvm_compile_time / 1000000.0 / lp_count.nr_llvm_compiles;
+      debug_printf("llvmpipe: nr_llvm_compiles:           %u\n", lp_count.nr_llvm_compiles);
+      debug_printf("llvmpipe: total LLVM compile time:    %.2f sec\n", lp_count.llvm_compile_time / 1000000.0);
+      debug_printf("llvmpipe: average LLVM compile time:  %.2f sec\n", avg_compile_time);
+   }
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_logic.h b/src/gallium/drivers/llvmpipe/lp_perf.h
index d67500ef70..a9629dae3c 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_logic.h
+++ b/src/gallium/drivers/llvmpipe/lp_perf.h
@@ -26,47 +26,57 @@
  **************************************************************************/
 
 /**
- * @file
- * Helper functions for logical operations.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
+ * Performance / statistic counters, etc.
  */
 
 
-#ifndef LP_BLD_LOGIC_H
-#define LP_BLD_LOGIC_H
+#ifndef LP_PERF_H
+#define LP_PERF_H
+
 
+/**
+ * Counters bumped via LP_COUNT()/LP_COUNT_ADD(); dumped by lp_print_counters().
+ */
+struct lp_counters
+{
+   unsigned nr_tris;
+   unsigned nr_culled_tris;
+   unsigned nr_empty_64;
+   unsigned nr_fully_covered_64;
+   unsigned nr_partially_covered_64;
+   unsigned nr_empty_16;
+   unsigned nr_fully_covered_16;
+   unsigned nr_partially_covered_16;
+   unsigned nr_empty_4;
+   unsigned nr_non_empty_4;
+   unsigned nr_llvm_compiles;
+   int64_t llvm_compile_time;  /**< total, in microseconds */
 
-#include <llvm-c/Core.h>  
+   unsigned nr_color_tile_clear;
+   unsigned nr_color_tile_load;
+   unsigned nr_color_tile_store;
+};
 
-#include "pipe/p_defines.h" /* For PIPE_FUNC_xxx */
 
+extern struct lp_counters lp_count;
 
-struct lp_type;
-struct lp_build_context;
 
+/** Bump lp_count.<counter> in DEBUG builds; otherwise no-ops (incr is still evaluated). */
+#ifdef DEBUG
+#define LP_COUNT(counter) (lp_count.counter++)
+#define LP_COUNT_ADD(counter, incr)  (lp_count.counter += (incr))
+#else
+#define LP_COUNT(counter)
+#define LP_COUNT_ADD(counter, incr) ((void) (incr))
+#endif
 
-/**
- * @param func is one of PIPE_FUNC_xxx
- */
-LLVMValueRef
-lp_build_cmp(struct lp_build_context *bld,
-             unsigned func,
-             LLVMValueRef a,
-             LLVMValueRef b);
 
+extern void
+lp_reset_counters(void);
 
-LLVMValueRef
-lp_build_select(struct lp_build_context *bld,
-                LLVMValueRef mask,
-                LLVMValueRef a,
-                LLVMValueRef b);
 
-LLVMValueRef
-lp_build_select_aos(struct lp_build_context *bld,
-                    LLVMValueRef a,
-                    LLVMValueRef b,
-                    const boolean cond[4]);
+extern void
+lp_print_counters(void);
 
 
-#endif /* !LP_BLD_LOGIC_H */
+#endif /* LP_PERF_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_prim_vbuf.c b/src/gallium/drivers/llvmpipe/lp_prim_vbuf.c
deleted file mode 100644
index e8e2e2524a..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_prim_vbuf.c
+++ /dev/null
@@ -1,563 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * Interface between 'draw' module's output and the llvmpipe rasterizer/setup
- * code.  When the 'draw' module has finished filling a vertex buffer, the
- * draw_arrays() functions below will be called.  Loop over the vertices and
- * call the point/line/tri setup functions.
- *
- * Authors
- *  Brian Paul
- */
-
-
-#include "lp_context.h"
-#include "lp_setup.h"
-#include "lp_state.h"
-#include "lp_prim_vbuf.h"
-#include "draw/draw_context.h"
-#include "draw/draw_vbuf.h"
-#include "util/u_memory.h"
-#include "util/u_prim.h"
-
-
-#define LP_MAX_VBUF_INDEXES 1024
-#define LP_MAX_VBUF_SIZE    4096
-
-typedef const float (*cptrf4)[4];
-
-/**
- * Subclass of vbuf_render.
- */
-struct llvmpipe_vbuf_render
-{
-   struct vbuf_render base;
-   struct llvmpipe_context *llvmpipe;
-   struct setup_context *setup;
-
-   uint prim;
-   uint vertex_size;
-   uint nr_vertices;
-   uint vertex_buffer_size;
-   void *vertex_buffer;
-};
-
-
-/** cast wrapper */
-static struct llvmpipe_vbuf_render *
-llvmpipe_vbuf_render(struct vbuf_render *vbr)
-{
-   return (struct llvmpipe_vbuf_render *) vbr;
-}
-
-
-
-
-
-
-
-static const struct vertex_info *
-lp_vbuf_get_vertex_info(struct vbuf_render *vbr)
-{
-   struct llvmpipe_vbuf_render *cvbr = llvmpipe_vbuf_render(vbr);
-   return llvmpipe_get_vbuf_vertex_info(cvbr->llvmpipe);
-}
-
-
-static boolean
-lp_vbuf_allocate_vertices(struct vbuf_render *vbr,
-                          ushort vertex_size, ushort nr_vertices)
-{
-   struct llvmpipe_vbuf_render *cvbr = llvmpipe_vbuf_render(vbr);
-   unsigned size = vertex_size * nr_vertices;
-
-   if (cvbr->vertex_buffer_size < size) {
-      align_free(cvbr->vertex_buffer);
-      cvbr->vertex_buffer = align_malloc(size, 16);
-      cvbr->vertex_buffer_size = size;
-   }
-
-   cvbr->vertex_size = vertex_size;
-   cvbr->nr_vertices = nr_vertices;
-   
-   return cvbr->vertex_buffer != NULL;
-}
-
-static void
-lp_vbuf_release_vertices(struct vbuf_render *vbr)
-{
-   /* keep the old allocation for next time */
-}
-
-static void *
-lp_vbuf_map_vertices(struct vbuf_render *vbr)
-{
-   struct llvmpipe_vbuf_render *cvbr = llvmpipe_vbuf_render(vbr);
-   return cvbr->vertex_buffer;
-}
-
-static void 
-lp_vbuf_unmap_vertices(struct vbuf_render *vbr, 
-                       ushort min_index,
-                       ushort max_index )
-{
-   struct llvmpipe_vbuf_render *cvbr = llvmpipe_vbuf_render(vbr);
-   assert( cvbr->vertex_buffer_size >= (max_index+1) * cvbr->vertex_size );
-   (void) cvbr;
-   /* do nothing */
-}
-
-
-static boolean
-lp_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim)
-{
-   struct llvmpipe_vbuf_render *cvbr = llvmpipe_vbuf_render(vbr);
-   struct setup_context *setup_ctx = cvbr->setup;
-   
-   llvmpipe_setup_prepare( setup_ctx );
-
-   cvbr->llvmpipe->reduced_prim = u_reduced_prim(prim);
-   cvbr->prim = prim;
-   return TRUE;
-
-}
-
-
-static INLINE cptrf4 get_vert( const void *vertex_buffer,
-                               int index,
-                               int stride )
-{
-   return (cptrf4)((char *)vertex_buffer + index * stride);
-}
-
-
-/**
- * draw elements / indexed primitives
- */
-static void
-lp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
-{
-   struct llvmpipe_vbuf_render *cvbr = llvmpipe_vbuf_render(vbr);
-   struct llvmpipe_context *llvmpipe = cvbr->llvmpipe;
-   const unsigned stride = llvmpipe->vertex_info_vbuf.size * sizeof(float);
-   const void *vertex_buffer = cvbr->vertex_buffer;
-   struct setup_context *setup_ctx = cvbr->setup;
-   unsigned i;
-
-   switch (cvbr->prim) {
-   case PIPE_PRIM_POINTS:
-      for (i = 0; i < nr; i++) {
-         llvmpipe_setup_point( setup_ctx,
-                      get_vert(vertex_buffer, indices[i-0], stride) );
-      }
-      break;
-
-   case PIPE_PRIM_LINES:
-      for (i = 1; i < nr; i += 2) {
-         llvmpipe_setup_line( setup_ctx,
-                     get_vert(vertex_buffer, indices[i-1], stride),
-                     get_vert(vertex_buffer, indices[i-0], stride) );
-      }
-      break;
-
-   case PIPE_PRIM_LINE_STRIP:
-      for (i = 1; i < nr; i ++) {
-         llvmpipe_setup_line( setup_ctx,
-                     get_vert(vertex_buffer, indices[i-1], stride),
-                     get_vert(vertex_buffer, indices[i-0], stride) );
-      }
-      break;
-
-   case PIPE_PRIM_LINE_LOOP:
-      for (i = 1; i < nr; i ++) {
-         llvmpipe_setup_line( setup_ctx,
-                     get_vert(vertex_buffer, indices[i-1], stride),
-                     get_vert(vertex_buffer, indices[i-0], stride) );
-      }
-      if (nr) {
-         llvmpipe_setup_line( setup_ctx,
-                     get_vert(vertex_buffer, indices[nr-1], stride),
-                     get_vert(vertex_buffer, indices[0], stride) );
-      }
-      break;
-
-   case PIPE_PRIM_TRIANGLES:
-      if (llvmpipe->rasterizer->flatshade_first) {
-         for (i = 2; i < nr; i += 3) {
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, indices[i-1], stride),
-                       get_vert(vertex_buffer, indices[i-0], stride),
-                       get_vert(vertex_buffer, indices[i-2], stride) );
-         }
-      }
-      else {
-         for (i = 2; i < nr; i += 3) {
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, indices[i-2], stride),
-                       get_vert(vertex_buffer, indices[i-1], stride),
-                       get_vert(vertex_buffer, indices[i-0], stride) );
-         }
-      }
-      break;
-
-   case PIPE_PRIM_TRIANGLE_STRIP:
-      if (llvmpipe->rasterizer->flatshade_first) {
-         for (i = 2; i < nr; i += 1) {
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, indices[i+(i&1)-1], stride),
-                       get_vert(vertex_buffer, indices[i-(i&1)], stride),
-                       get_vert(vertex_buffer, indices[i-2], stride) );
-         }
-      }
-      else {
-         for (i = 2; i < nr; i += 1) {
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, indices[i+(i&1)-2], stride),
-                       get_vert(vertex_buffer, indices[i-(i&1)-1], stride),
-                       get_vert(vertex_buffer, indices[i-0], stride) );
-         }
-      }
-      break;
-
-   case PIPE_PRIM_TRIANGLE_FAN:
-      if (llvmpipe->rasterizer->flatshade_first) {
-         for (i = 2; i < nr; i += 1) {
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, indices[i-0], stride),
-                       get_vert(vertex_buffer, indices[0], stride),
-                       get_vert(vertex_buffer, indices[i-1], stride) );
-         }
-      }
-      else {
-         for (i = 2; i < nr; i += 1) {
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, indices[0], stride),
-                       get_vert(vertex_buffer, indices[i-1], stride),
-                       get_vert(vertex_buffer, indices[i-0], stride) );
-         }
-      }
-      break;
-
-   case PIPE_PRIM_QUADS:
-      if (llvmpipe->rasterizer->flatshade_first) {
-         for (i = 3; i < nr; i += 4) {
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, indices[i-2], stride),
-                       get_vert(vertex_buffer, indices[i-1], stride),
-                       get_vert(vertex_buffer, indices[i-3], stride) );
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, indices[i-1], stride),
-                       get_vert(vertex_buffer, indices[i-0], stride),
-                       get_vert(vertex_buffer, indices[i-3], stride) );
-         }
-      }
-      else {
-         for (i = 3; i < nr; i += 4) {
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, indices[i-3], stride),
-                       get_vert(vertex_buffer, indices[i-2], stride),
-                       get_vert(vertex_buffer, indices[i-0], stride) );
-
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, indices[i-2], stride),
-                       get_vert(vertex_buffer, indices[i-1], stride),
-                       get_vert(vertex_buffer, indices[i-0], stride) );
-         }
-      }
-      break;
-
-   case PIPE_PRIM_QUAD_STRIP:
-      if (llvmpipe->rasterizer->flatshade_first) {
-         for (i = 3; i < nr; i += 2) {
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, indices[i-0], stride),
-                       get_vert(vertex_buffer, indices[i-1], stride),
-                       get_vert(vertex_buffer, indices[i-3], stride));
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, indices[i-2], stride),
-                       get_vert(vertex_buffer, indices[i-0], stride),
-                       get_vert(vertex_buffer, indices[i-3], stride) );
-         }
-      }
-      else {
-         for (i = 3; i < nr; i += 2) {
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, indices[i-3], stride),
-                       get_vert(vertex_buffer, indices[i-2], stride),
-                       get_vert(vertex_buffer, indices[i-0], stride) );
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, indices[i-1], stride),
-                       get_vert(vertex_buffer, indices[i-3], stride),
-                       get_vert(vertex_buffer, indices[i-0], stride) );
-         }
-      }
-      break;
-
-   case PIPE_PRIM_POLYGON:
-      /* Almost same as tri fan but the _first_ vertex specifies the flat
-       * shading color.  Note that the first polygon vertex is passed as
-       * the last triangle vertex here.
-       * flatshade_first state makes no difference.
-       */
-      for (i = 2; i < nr; i += 1) {
-         llvmpipe_setup_tri( setup_ctx,
-                    get_vert(vertex_buffer, indices[i-0], stride),
-                    get_vert(vertex_buffer, indices[i-1], stride),
-                    get_vert(vertex_buffer, indices[0], stride) );
-      }
-      break;
-
-   default:
-      assert(0);
-   }
-}
-
-
-/**
- * This function is hit when the draw module is working in pass-through mode.
- * It's up to us to convert the vertex array into point/line/tri prims.
- */
-static void
-lp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
-{
-   struct llvmpipe_vbuf_render *cvbr = llvmpipe_vbuf_render(vbr);
-   struct llvmpipe_context *llvmpipe = cvbr->llvmpipe;
-   struct setup_context *setup_ctx = cvbr->setup;
-   const unsigned stride = llvmpipe->vertex_info_vbuf.size * sizeof(float);
-   const void *vertex_buffer =
-      (void *) get_vert(cvbr->vertex_buffer, start, stride);
-   unsigned i;
-
-   switch (cvbr->prim) {
-   case PIPE_PRIM_POINTS:
-      for (i = 0; i < nr; i++) {
-         llvmpipe_setup_point( setup_ctx,
-                      get_vert(vertex_buffer, i-0, stride) );
-      }
-      break;
-
-   case PIPE_PRIM_LINES:
-      for (i = 1; i < nr; i += 2) {
-         llvmpipe_setup_line( setup_ctx,
-                     get_vert(vertex_buffer, i-1, stride),
-                     get_vert(vertex_buffer, i-0, stride) );
-      }
-      break;
-
-   case PIPE_PRIM_LINE_STRIP:
-      for (i = 1; i < nr; i ++) {
-         llvmpipe_setup_line( setup_ctx,
-                     get_vert(vertex_buffer, i-1, stride),
-                     get_vert(vertex_buffer, i-0, stride) );
-      }
-      break;
-
-   case PIPE_PRIM_LINE_LOOP:
-      for (i = 1; i < nr; i ++) {
-         llvmpipe_setup_line( setup_ctx,
-                     get_vert(vertex_buffer, i-1, stride),
-                     get_vert(vertex_buffer, i-0, stride) );
-      }
-      if (nr) {
-         llvmpipe_setup_line( setup_ctx,
-                     get_vert(vertex_buffer, nr-1, stride),
-                     get_vert(vertex_buffer, 0, stride) );
-      }
-      break;
-
-   case PIPE_PRIM_TRIANGLES:
-      if (llvmpipe->rasterizer->flatshade_first) {
-         for (i = 2; i < nr; i += 3) {
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, i-1, stride),
-                       get_vert(vertex_buffer, i-0, stride),
-                       get_vert(vertex_buffer, i-2, stride) );
-         }
-      }
-      else {
-         for (i = 2; i < nr; i += 3) {
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, i-2, stride),
-                       get_vert(vertex_buffer, i-1, stride),
-                       get_vert(vertex_buffer, i-0, stride) );
-         }
-      }
-      break;
-
-   case PIPE_PRIM_TRIANGLE_STRIP:
-      if (llvmpipe->rasterizer->flatshade_first) {
-         for (i = 2; i < nr; i++) {
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, i+(i&1)-1, stride),
-                       get_vert(vertex_buffer, i-(i&1), stride),
-                       get_vert(vertex_buffer, i-2, stride) );
-         }
-      }
-      else {
-         for (i = 2; i < nr; i++) {
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, i+(i&1)-2, stride),
-                       get_vert(vertex_buffer, i-(i&1)-1, stride),
-                       get_vert(vertex_buffer, i-0, stride) );
-         }
-      }
-      break;
-
-   case PIPE_PRIM_TRIANGLE_FAN:
-      if (llvmpipe->rasterizer->flatshade_first) {
-         for (i = 2; i < nr; i += 1) {
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, i-0, stride),
-                       get_vert(vertex_buffer, 0, stride),
-                       get_vert(vertex_buffer, i-1, stride) );
-         }
-      }
-      else {
-         for (i = 2; i < nr; i += 1) {
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, 0, stride),
-                       get_vert(vertex_buffer, i-1, stride),
-                       get_vert(vertex_buffer, i-0, stride) );
-         }
-      }
-      break;
-
-   case PIPE_PRIM_QUADS:
-      if (llvmpipe->rasterizer->flatshade_first) {
-         for (i = 3; i < nr; i += 4) {
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, i-2, stride),
-                       get_vert(vertex_buffer, i-1, stride),
-                       get_vert(vertex_buffer, i-3, stride) );
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, i-1, stride),
-                       get_vert(vertex_buffer, i-0, stride),
-                       get_vert(vertex_buffer, i-3, stride) );
-         }
-      }
-      else {
-         for (i = 3; i < nr; i += 4) {
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, i-3, stride),
-                       get_vert(vertex_buffer, i-2, stride),
-                       get_vert(vertex_buffer, i-0, stride) );
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, i-2, stride),
-                       get_vert(vertex_buffer, i-1, stride),
-                       get_vert(vertex_buffer, i-0, stride) );
-         }
-      }
-      break;
-
-   case PIPE_PRIM_QUAD_STRIP:
-      if (llvmpipe->rasterizer->flatshade_first) {
-         for (i = 3; i < nr; i += 2) {
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, i-0, stride),
-                       get_vert(vertex_buffer, i-1, stride),
-                       get_vert(vertex_buffer, i-3, stride) );
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, i-2, stride),
-                       get_vert(vertex_buffer, i-0, stride),
-                       get_vert(vertex_buffer, i-3, stride) );
-         }
-      }
-      else {
-         for (i = 3; i < nr; i += 2) {
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, i-3, stride),
-                       get_vert(vertex_buffer, i-2, stride),
-                       get_vert(vertex_buffer, i-0, stride) );
-            llvmpipe_setup_tri( setup_ctx,
-                       get_vert(vertex_buffer, i-1, stride),
-                       get_vert(vertex_buffer, i-3, stride),
-                       get_vert(vertex_buffer, i-0, stride) );
-         }
-      }
-      break;
-
-   case PIPE_PRIM_POLYGON:
-      /* Almost same as tri fan but the _first_ vertex specifies the flat
-       * shading color.  Note that the first polygon vertex is passed as
-       * the last triangle vertex here.
-       * flatshade_first state makes no difference.
-       */
-      for (i = 2; i < nr; i += 1) {
-         llvmpipe_setup_tri( setup_ctx,
-                    get_vert(vertex_buffer, i-1, stride),
-                    get_vert(vertex_buffer, i-0, stride),
-                    get_vert(vertex_buffer, 0, stride) );
-      }
-      break;
-
-   default:
-      assert(0);
-   }
-}
-
-
-
-static void
-lp_vbuf_destroy(struct vbuf_render *vbr)
-{
-   struct llvmpipe_vbuf_render *cvbr = llvmpipe_vbuf_render(vbr);
-   llvmpipe_setup_destroy_context(cvbr->setup);
-   FREE(cvbr);
-}
-
-
-/**
- * Create the post-transform vertex handler for the given context.
- */
-struct vbuf_render *
-lp_create_vbuf_backend(struct llvmpipe_context *lp)
-{
-   struct llvmpipe_vbuf_render *cvbr = CALLOC_STRUCT(llvmpipe_vbuf_render);
-
-   assert(lp->draw);
-
-
-   cvbr->base.max_indices = LP_MAX_VBUF_INDEXES;
-   cvbr->base.max_vertex_buffer_bytes = LP_MAX_VBUF_SIZE;
-
-   cvbr->base.get_vertex_info = lp_vbuf_get_vertex_info;
-   cvbr->base.allocate_vertices = lp_vbuf_allocate_vertices;
-   cvbr->base.map_vertices = lp_vbuf_map_vertices;
-   cvbr->base.unmap_vertices = lp_vbuf_unmap_vertices;
-   cvbr->base.set_primitive = lp_vbuf_set_primitive;
-   cvbr->base.draw = lp_vbuf_draw;
-   cvbr->base.draw_arrays = lp_vbuf_draw_arrays;
-   cvbr->base.release_vertices = lp_vbuf_release_vertices;
-   cvbr->base.destroy = lp_vbuf_destroy;
-
-   cvbr->llvmpipe = lp;
-
-   cvbr->setup = llvmpipe_setup_create_context(cvbr->llvmpipe);
-
-   return &cvbr->base;
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_quad.h b/src/gallium/drivers/llvmpipe/lp_quad.h
deleted file mode 100644
index 7eb05de77a..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_quad.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
- */
-
-#ifndef LP_QUAD_H
-#define LP_QUAD_H
-
-#include "pipe/p_state.h"
-#include "tgsi/tgsi_exec.h"
-
-
-#define QUAD_PRIM_POINT 1
-#define QUAD_PRIM_LINE  2
-#define QUAD_PRIM_TRI   3
-
-
-/* The rasterizer generates 2x2 quads of fragment and feeds them to
- * the current fp_machine (see below).
- * Remember that Y=0=top with Y increasing down the window.
- */
-#define QUAD_TOP_LEFT     0
-#define QUAD_TOP_RIGHT    1
-#define QUAD_BOTTOM_LEFT  2
-#define QUAD_BOTTOM_RIGHT 3
-
-#define MASK_TOP_LEFT     (1 << QUAD_TOP_LEFT)
-#define MASK_TOP_RIGHT    (1 << QUAD_TOP_RIGHT)
-#define MASK_BOTTOM_LEFT  (1 << QUAD_BOTTOM_LEFT)
-#define MASK_BOTTOM_RIGHT (1 << QUAD_BOTTOM_RIGHT)
-#define MASK_ALL          0xf
-
-
-/**
- * Quad stage inputs (pos, coverage, front/back face, etc)
- */
-struct quad_header_input
-{
-   int x0, y0;                /**< quad window pos, always even */
-   float coverage[QUAD_SIZE]; /**< fragment coverage for antialiasing */
-   unsigned facing:1;         /**< Front (0) or back (1) facing? */
-   unsigned prim:2;           /**< QUAD_PRIM_POINT, LINE, TRI */
-};
-
-
-/**
- * Quad stage inputs/outputs.
- */
-struct quad_header_inout
-{
-   unsigned mask:4;
-};
-
-
-/**
- * Quad stage outputs (color & depth).
- */
-struct quad_header_output
-{
-   /** colors in SOA format (rrrr, gggg, bbbb, aaaa) */
-   float ALIGN16_ATTRIB color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS][QUAD_SIZE];
-};
-
-
-/**
- * Input interpolation coefficients
- */
-struct quad_interp_coef
-{
-   float ALIGN16_ATTRIB a0[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
-   float ALIGN16_ATTRIB dadx[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
-   float ALIGN16_ATTRIB dady[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
-};
-
-
-/**
- * Encodes everything we need to know about a 2x2 pixel block.  Uses
- * "Channel-Serial" or "SoA" layout.  
- */
-struct quad_header {
-   struct quad_header_input input;
-   struct quad_header_inout inout;
-
-   /* Redundant/duplicated:
-    */
-   const struct quad_interp_coef *coef;
-};
-
-#endif /* LP_QUAD_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
new file mode 100644
index 0000000000..5ae323fd96
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -0,0 +1,1036 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include <limits.h>
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "util/u_cpu_detect.h"
+#include "util/u_surface.h"
+
+#include "lp_scene_queue.h"
+#include "lp_debug.h"
+#include "lp_fence.h"
+#include "lp_perf.h"
+#include "lp_rast.h"
+#include "lp_rast_priv.h"
+#include "lp_tile_soa.h"
+#include "gallivm/lp_bld_debug.h"
+#include "lp_scene.h"
+
+
+/**
+ * Begin the rasterization phase: copy 'fb' into rast->state.fb, record the
+ * write flags, then get and map a transfer for each color buffer and zsbuf.
+ */
+static boolean
+lp_rast_begin( struct lp_rasterizer *rast,
+               const struct pipe_framebuffer_state *fb,
+               boolean write_color,
+               boolean write_zstencil )
+{
+   struct pipe_screen *screen = rast->screen;
+   struct pipe_surface *cbuf, *zsbuf;
+   int i;
+
+   LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__);
+
+   util_copy_framebuffer_state(&rast->state.fb, fb);
+
+   rast->state.write_zstencil = write_zstencil;
+   rast->state.write_color = write_color;
+   /* set when the fb size is not a whole multiple of TILE_SIZE */
+   rast->check_for_clipped_tiles = (fb->width % TILE_SIZE != 0 ||
+                                    fb->height % TILE_SIZE != 0);
+
+   /* color buffers: NULL entries in fb.cbufs[] are skipped */
+   for (i = 0; i < rast->state.fb.nr_cbufs; i++) {
+      cbuf = rast->state.fb.cbufs[i];
+      if (cbuf) {
+	 rast->cbuf_transfer[i] = screen->get_tex_transfer(rast->screen,
+							   cbuf->texture,
+							   cbuf->face,
+							   cbuf->level,
+							   cbuf->zslice,
+							   PIPE_TRANSFER_READ_WRITE,
+							   0, 0,
+							   cbuf->width, 
+							   cbuf->height);
+	 if (!rast->cbuf_transfer[i])
+	    goto fail;
+
+	 rast->cbuf_map[i] = screen->transfer_map(rast->screen, 
+						  rast->cbuf_transfer[i]);
+	 if (!rast->cbuf_map[i])
+	    goto fail;
+      }
+   }
+
+   zsbuf = rast->state.fb.zsbuf;   /* optional z/stencil buffer */
+   if (zsbuf) {
+      rast->zsbuf_transfer = screen->get_tex_transfer(rast->screen,
+                                                      zsbuf->texture,
+                                                      zsbuf->face,
+                                                      zsbuf->level,
+                                                      zsbuf->zslice,
+                                                      PIPE_TRANSFER_READ_WRITE,
+                                                      0, 0,
+                                                      zsbuf->width,
+						      zsbuf->height);
+      if (!rast->zsbuf_transfer)
+         goto fail;
+
+      rast->zsbuf_map = screen->transfer_map(rast->screen, 
+                                            rast->zsbuf_transfer);
+      if (!rast->zsbuf_map)
+	 goto fail;
+   }
+
+   return TRUE;   /* every bound surface was mapped */
+
+fail:
+   /* Nothing acquired so far is released here; lp_rast_end() unmaps and
+    * destroys whatever transfers/maps are non-NULL. */
+   return FALSE;
+}
+
+
+/**
+ * Finish the rasterization phase: unmap and destroy every color and
+ * z/stencil transfer that is non-NULL, then reset the pointers to NULL.
+ */
+static void
+lp_rast_end( struct lp_rasterizer *rast )
+{
+   struct pipe_screen *screen = rast->screen;
+   unsigned i;
+
+   for (i = 0; i < rast->state.fb.nr_cbufs; i++) {
+      if (rast->cbuf_map[i]) 
+	 screen->transfer_unmap(screen, rast->cbuf_transfer[i]);
+
+      if (rast->cbuf_transfer[i])
+	 screen->tex_transfer_destroy(rast->cbuf_transfer[i]);
+      /* forget the released transfer and its mapping */
+      rast->cbuf_transfer[i] = NULL;
+      rast->cbuf_map[i] = NULL;
+   }
+
+   if (rast->zsbuf_map) 
+      screen->transfer_unmap(screen, rast->zsbuf_transfer);
+
+   if (rast->zsbuf_transfer)
+      screen->tex_transfer_destroy(rast->zsbuf_transfer);
+   /* same reset for the z/stencil buffer */
+   rast->zsbuf_transfer = NULL;
+   rast->zsbuf_map = NULL;
+}
+
+
+/**
+ * Beginning rasterization of a tile: record its origin in the thread's task.
+ * \param x  window X position of the tile, in pixels
+ * \param y  window Y position of the tile, in pixels
+ */
+static void
+lp_rast_start_tile( struct lp_rasterizer *rast,
+                    unsigned thread_index,
+                    unsigned x, unsigned y )
+{
+   LP_DBG(DEBUG_RAST, "%s %d,%d\n", __FUNCTION__, x, y);
+
+   rast->tasks[thread_index].x = x;   /* read back by the load/store commands */
+   rast->tasks[thread_index].y = y;
+}
+
+
+/**
+ * Clear this thread's color tile for each of the fb's nr_cbufs color buffers
+ * to arg.clear_color.  This is a bin command called during bin processing.
+ */
+void lp_rast_clear_color( struct lp_rasterizer *rast,
+                          unsigned thread_index,
+                          const union lp_rast_cmd_arg arg )
+{
+   const uint8_t *clear_color = arg.clear_color;
+   uint8_t **color_tile = rast->tasks[thread_index].tile.color;
+   unsigned i;
+
+   LP_DBG(DEBUG_RAST, "%s 0x%x,0x%x,0x%x,0x%x\n", __FUNCTION__, 
+              clear_color[0],
+              clear_color[1],
+              clear_color[2],
+              clear_color[3]);
+
+   if (clear_color[0] == clear_color[1] &&
+       clear_color[1] == clear_color[2] &&
+       clear_color[2] == clear_color[3]) {
+      /* all four bytes equal {x, x, x, x}: one memset fills the whole tile */
+      for (i = 0; i < rast->state.fb.nr_cbufs; i++) {
+	 memset(color_tile[i], clear_color[0], TILE_SIZE * TILE_SIZE * 4);
+      }
+   }
+   else {
+      /* Non-gray color: write runs of 'chunk' bytes, one run per component.
+       * Note: if the swizzled tile layout changes (see TILE_PIXEL) this code
+       * will need to change.  It'll be pretty obvious when clearing no longer
+       * works.
+       */
+      const unsigned chunk = TILE_SIZE / 4;   /* bytes per memset below */
+      for (i = 0; i < rast->state.fb.nr_cbufs; i++) {
+         uint8_t *c = color_tile[i];
+         unsigned j;
+         for (j = 0; j < 4 * TILE_SIZE; j++) {
+            memset(c, clear_color[0], chunk);
+            c += chunk;
+            memset(c, clear_color[1], chunk);
+            c += chunk;
+            memset(c, clear_color[2], chunk);
+            c += chunk;
+            memset(c, clear_color[3], chunk);
+            c += chunk;
+         }
+         assert(c - color_tile[i] == TILE_SIZE * TILE_SIZE * 4);
+      }
+   }
+
+   LP_COUNT(nr_color_tile_clear);
+}
+
+
+/**
+ * Clear this thread's z/stencil tile to arg.clear_zstencil.
+ * This is a bin command called during bin processing.
+ */
+void lp_rast_clear_zstencil( struct lp_rasterizer *rast,
+                             unsigned thread_index,
+                             const union lp_rast_cmd_arg arg)
+{
+   unsigned i;
+   uint32_t *depth_tile = rast->tasks[thread_index].tile.depth;
+   
+   LP_DBG(DEBUG_RAST, "%s 0x%x\n", __FUNCTION__, arg.clear_zstencil);
+
+   for (i = 0; i < TILE_SIZE * TILE_SIZE; i++)   /* one 32-bit entry per pixel */
+      depth_tile[i] = arg.clear_zstencil;
+}
+
+
+/**
+ * Load tile color from the mapped color buffer(s) into this thread's tile.
+ * This is a bin command called during bin processing; 'arg' is not used.
+ */
+void lp_rast_load_color( struct lp_rasterizer *rast,
+                         unsigned thread_index,
+                         const union lp_rast_cmd_arg arg)
+{
+   struct lp_rasterizer_task *task = &rast->tasks[thread_index];
+   const unsigned x = task->x;
+   const unsigned y = task->y;
+   unsigned i;
+
+   LP_DBG(DEBUG_RAST, "%s at %u, %u\n", __FUNCTION__, x, y);
+
+   for (i = 0; i < rast->state.fb.nr_cbufs; i++) {
+      struct pipe_transfer *transfer = rast->cbuf_transfer[i];
+      int w = TILE_SIZE;   /* never clipped below: a full tile is read */
+      int h = TILE_SIZE;
+      /* NOTE(review): assumes transfer != NULL; lp_rast_begin skips NULL cbufs */
+      if (x >= transfer->width)
+	 continue;
+
+      if (y >= transfer->height)
+	 continue;
+
+      assert(w >= 0);
+      assert(h >= 0);
+      assert(w <= TILE_SIZE);
+      assert(h <= TILE_SIZE);
+
+      lp_tile_read_4ub(transfer->texture->format,
+		       task->tile.color[i],
+		       rast->cbuf_map[i], 
+		       transfer->stride,
+		       x, y,
+		       w, h);
+
+      LP_COUNT(nr_color_tile_load);
+   }
+}
+
+/** Copy a w x h block of 32-bit values from 'map' at (x0, y0) into 'tile'. */
+static void
+lp_tile_read_z32(uint32_t *tile,
+                 const uint8_t *map,
+                 unsigned map_stride,
+                 unsigned x0, unsigned y0, unsigned w, unsigned h)
+{
+   unsigned x, y;
+   const uint8_t *map_row = map + y0*map_stride;   /* map_stride is in bytes */
+   for (y = 0; y < h; ++y) {
+      const uint32_t *map_pixel = (uint32_t *)(map_row + x0*4);   /* 4 bytes/pixel */
+      for (x = 0; x < w; ++x) {
+         *tile++ = *map_pixel++;   /* tile is filled contiguously, w values per row */
+      }
+      map_row += map_stride;
+   }
+}
+
+/**
+ * Load tile z/stencil from the mapped zsbuf, clipped to the fb size.
+ * This is a bin command called during bin processing; 'arg' is not used.
+ */
+void lp_rast_load_zstencil( struct lp_rasterizer *rast,
+                            unsigned thread_index,
+                            const union lp_rast_cmd_arg arg )
+{
+   struct lp_rasterizer_task *task = &rast->tasks[thread_index];
+   const unsigned x = task->x;
+   const unsigned y = task->y;
+   unsigned w = TILE_SIZE;
+   unsigned h = TILE_SIZE;
+
+   if (x + w > rast->state.fb.width)   /* tile overhangs the right edge */
+      w -= x + w - rast->state.fb.width;
+
+   if (y + h > rast->state.fb.height)   /* tile overhangs the bottom edge */
+      h -= y + h - rast->state.fb.height;
+
+   LP_DBG(DEBUG_RAST, "%s %d,%d %dx%d\n", __FUNCTION__, x, y, w, h);
+   /* only the 32-bit Z32_UNORM format is handled by lp_tile_read_z32 */
+   assert(rast->zsbuf_transfer->texture->format == PIPE_FORMAT_Z32_UNORM);
+   lp_tile_read_z32(task->tile.depth,
+                    rast->zsbuf_map, 
+                    rast->zsbuf_transfer->stride,
+                    x, y, w, h);
+}
+
+/** Bin command: make arg.set_state the current state of this thread's task. */
+void lp_rast_set_state( struct lp_rasterizer *rast,
+                        unsigned thread_index,
+                        const union lp_rast_cmd_arg arg )
+{
+   const struct lp_rast_state *state = arg.set_state;
+
+   LP_DBG(DEBUG_RAST, "%s %p\n", __FUNCTION__, (void *) state);
+
+   /* only the pointer is stored; the state itself is not copied */
+   rast->tasks[thread_index].current_state = state;
+}
+
+
+
+/**
+ * Run the shader on all blocks in a tile.  This is used when a tile is
+ * completely contained inside a triangle.
+ * This is a bin command called during bin processing.
+ */
+void lp_rast_shade_tile( struct lp_rasterizer *rast,
+                         unsigned thread_index,
+                         const union lp_rast_cmd_arg arg )
+{
+   struct lp_rasterizer_task *task = &rast->tasks[thread_index];
+   const struct lp_rast_state *state = task->current_state;
+   struct lp_rast_tile *tile = &task->tile;
+   const struct lp_rast_shader_inputs *inputs = arg.shade_tile;
+   const unsigned tile_x = task->x;
+   const unsigned tile_y = task->y;
+   unsigned x, y;
+
+   LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__);
+
+   /* render the whole 64x64 tile in 4x4 chunks */
+   for (y = 0; y < TILE_SIZE; y += 4){
+      for (x = 0; x < TILE_SIZE; x += 4) {
+         uint8_t *color[PIPE_MAX_COLOR_BUFS];
+         uint32_t *depth;
+         unsigned block_offset, i;
+
+         /* offset of this 4x4 pixel block (16 pixels each, 16 blocks per row) */
+         block_offset = ((y / 4) * (16 * 16) + (x / 4) * 16);
+
+         /* color buffer: 4 bytes per pixel */
+         for (i = 0; i < rast->state.fb.nr_cbufs; i++)
+            color[i] = tile->color[i] + 4 * block_offset;
+
+         /* depth buffer: one 32-bit entry per pixel */
+         depth = tile->depth + block_offset;
+
+         /* run shader: jit_function[0], INT_MIN for the three ints, NULL steps */
+         state->jit_function[0]( &state->jit_context,
+                                 tile_x + x, tile_y + y,
+                                 inputs->a0,
+                                 inputs->dadx,
+                                 inputs->dady,
+                                 color,
+                                 depth,
+                                 INT_MIN, INT_MIN, INT_MIN,
+                                 NULL, NULL, NULL );
+      }
+   }
+}
+
+
+/**
+ * Compute shading for the 4x4 block of pixels at window position (x, y).
+ * NOTE(review): signature differs from the bin commands -- confirm the caller.
+ */
+void lp_rast_shade_quads( struct lp_rasterizer *rast,
+                          unsigned thread_index,
+                          const struct lp_rast_shader_inputs *inputs,
+                          unsigned x, unsigned y,
+                          int32_t c1, int32_t c2, int32_t c3)
+{
+   struct lp_rasterizer_task *task = &rast->tasks[thread_index];
+   const struct lp_rast_state *state = task->current_state;
+   struct lp_rast_tile *tile = &task->tile;
+   uint8_t *color[PIPE_MAX_COLOR_BUFS];
+   void *depth;
+   unsigned i;
+   unsigned ix, iy;
+   int block_offset;
+
+#ifdef DEBUG
+   assert(state);
+
+   /* Sanity checks: (x, y) must sit on a vector / 4x4 block boundary */
+   assert(x % TILE_VECTOR_WIDTH == 0);
+   assert(y % TILE_VECTOR_HEIGHT == 0);
+
+   assert((x % 4) == 0);
+   assert((y % 4) == 0);
+#endif
+   /* position of the block relative to the tile origin */
+   ix = x % TILE_SIZE;
+   iy = y % TILE_SIZE;
+
+   /* offset of this 4x4 pixel block (16 pixels each, 16 blocks per row) */
+   block_offset = ((iy / 4) * (16 * 16) + (ix / 4) * 16);
+
+   /* color buffer: 4 bytes per pixel */
+   for (i = 0; i < rast->state.fb.nr_cbufs; i++)
+      color[i] = tile->color[i] + 4 * block_offset;
+
+   /* depth buffer: one entry per pixel */
+   depth = tile->depth + block_offset;
+
+
+
+#ifdef DEBUG
+   assert(lp_check_alignment(tile->depth, 16));
+   assert(lp_check_alignment(tile->color[0], 16));
+   assert(lp_check_alignment(state->jit_context.blend_color, 16));
+
+   assert(lp_check_alignment(inputs->step[0], 16));
+   assert(lp_check_alignment(inputs->step[1], 16));
+   assert(lp_check_alignment(inputs->step[2], 16));
+#endif
+
+   /* run shader: jit_function[1] gets c1..c3 and the three step arrays */
+   state->jit_function[1]( &state->jit_context,
+                        x, y,
+                        inputs->a0,
+                        inputs->dadx,
+                        inputs->dady,
+                        color,
+                        depth,
+                        c1, c2, c3,
+                        inputs->step[0], inputs->step[1], inputs->step[2]);
+}
+
+
+/**
+ * Debug aid: set the tile's top row and left column to 0xff in all 4 channels.
+ */
+static void
+outline_tile(uint8_t *tile)
+{
+   const uint8_t val = 0xff;
+   unsigned i;
+
+   for (i = 0; i < TILE_SIZE; i++) {
+      TILE_PIXEL(tile, i, 0, 0) = val;   /* pixel (i, 0), channels 0..3 */
+      TILE_PIXEL(tile, i, 0, 1) = val;
+      TILE_PIXEL(tile, i, 0, 2) = val;
+      TILE_PIXEL(tile, i, 0, 3) = val;
+
+      TILE_PIXEL(tile, 0, i, 0) = val;   /* pixel (0, i), channels 0..3 */
+      TILE_PIXEL(tile, 0, i, 1) = val;
+      TILE_PIXEL(tile, 0, i, 2) = val;
+      TILE_PIXEL(tile, 0, i, 3) = val;
+   }
+}
+
+
+/**
+ * Debug aid: write 0x80 to all 4 channels along a grid with 16-pixel spacing
+ * (the sub-tile boundaries), then call outline_tile() for the tile border.
+ */
+static void
+outline_subtiles(uint8_t *tile)
+{
+   const uint8_t val = 0x80;
+   const unsigned step = 16;   /* grid spacing in pixels */
+   unsigned i, j;
+
+   for (i = 0; i < TILE_SIZE; i += step) {
+      for (j = 0; j < TILE_SIZE; j++) {
+         TILE_PIXEL(tile, i, j, 0) = val;   /* line with first coordinate i */
+         TILE_PIXEL(tile, i, j, 1) = val;
+         TILE_PIXEL(tile, i, j, 2) = val;
+         TILE_PIXEL(tile, i, j, 3) = val;
+
+         TILE_PIXEL(tile, j, i, 0) = val;   /* line with second coordinate i */
+         TILE_PIXEL(tile, j, i, 1) = val;
+         TILE_PIXEL(tile, j, i, 2) = val;
+         TILE_PIXEL(tile, j, i, 3) = val;
+      }
+   }
+
+   outline_tile(tile);   /* drawn last, so the 0xff border overwrites the grid */
+}
+
+
+
+/**
+ * Write this thread's color tile(s) to the mapped color buffers (full tile).
+ */
+static void lp_rast_store_color( struct lp_rasterizer *rast,
+                                 unsigned thread_index)
+{
+   struct lp_rasterizer_task *task = &rast->tasks[thread_index];
+   const unsigned x = task->x;
+   const unsigned y = task->y;
+   unsigned i;
+
+   for (i = 0; i < rast->state.fb.nr_cbufs; i++) {
+      struct pipe_transfer *transfer = rast->cbuf_transfer[i];
+      int w = TILE_SIZE;   /* w and h are not clipped in this function */
+      int h = TILE_SIZE;
+      /* skip buffers whose extent does not reach this tile's origin */
+      if (x >= transfer->width)
+	 continue;
+
+      if (y >= transfer->height)
+	 continue;
+
+      LP_DBG(DEBUG_RAST, "%s [%u] %d,%d %dx%d\n", __FUNCTION__,
+	     thread_index, x, y, w, h);
+      /* optional debug overlays, selected by LP_DEBUG flags */
+      if (LP_DEBUG & DEBUG_SHOW_SUBTILES)
+         outline_subtiles(task->tile.color[i]);
+      else if (LP_DEBUG & DEBUG_SHOW_TILES)
+         outline_tile(task->tile.color[i]);
+
+      lp_tile_write_4ub(transfer->texture->format,
+			task->tile.color[i],
+			rast->cbuf_map[i], 
+			transfer->stride,
+			x, y,
+			w, h);
+
+      LP_COUNT(nr_color_tile_store);
+   }
+}
+
+/** Copy a w x h block of 32-bit values from 'src' into 'dst' at (x0, y0). */
+static void
+lp_tile_write_z32(const uint32_t *src, uint8_t *dst, unsigned dst_stride,
+                  unsigned x0, unsigned y0, unsigned w, unsigned h)
+{
+   unsigned x, y;
+   uint8_t *dst_row = dst + y0*dst_stride;   /* dst_stride is in bytes */
+   for (y = 0; y < h; ++y) {
+      uint32_t *dst_pixel = (uint32_t *)(dst_row + x0*4);   /* 4 bytes/pixel */
+      for (x = 0; x < w; ++x) {
+         *dst_pixel++ = *src++;   /* src is read contiguously, w values per row */
+      }
+      dst_row += dst_stride;
+   }
+}
+
+/**
+ * Write this thread's depth tile to the mapped zsbuf, clipped to the fb size.
+ */
+static void lp_rast_store_zstencil( struct lp_rasterizer *rast,
+                                    unsigned thread_index )
+{
+   struct lp_rasterizer_task *task = &rast->tasks[thread_index];
+   const unsigned x = task->x;
+   const unsigned y = task->y;
+   unsigned w = TILE_SIZE;
+   unsigned h = TILE_SIZE;
+
+   if (x + w > rast->state.fb.width)   /* tile overhangs the right edge */
+      w -= x + w - rast->state.fb.width;
+
+   if (y + h > rast->state.fb.height)   /* tile overhangs the bottom edge */
+      h -= y + h - rast->state.fb.height;
+
+   LP_DBG(DEBUG_RAST, "%s %d,%d %dx%d\n", __FUNCTION__, x, y, w, h);
+   /* only the 32-bit Z32_UNORM format is handled by lp_tile_write_z32 */
+   assert(rast->zsbuf_transfer->texture->format == PIPE_FORMAT_Z32_UNORM);
+   lp_tile_write_z32(task->tile.depth,
+                     rast->zsbuf_map, 
+                     rast->zsbuf_transfer->stride,
+                     x, y, w, h);
+}
+
+
+/**
+ * Store the thread's color and/or z/stencil tile, per the rast->state flags.
+ */
+static void
+lp_rast_end_tile( struct lp_rasterizer *rast,
+                  unsigned thread_index )
+{
+   LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__);
+
+   if (rast->state.write_color)   /* flag recorded by lp_rast_begin() */
+      lp_rast_store_color(rast, thread_index);
+
+   if (rast->state.write_zstencil)   /* flag recorded by lp_rast_begin() */
+      lp_rast_store_zstencil(rast, thread_index);
+}
+
+
+/**
+ * Bin command: under fence->mutex, bump fence->count and signal the fence's
+ * condition variable.  Called per thread during bin execution/rasterization.
+ */
+void lp_rast_fence( struct lp_rasterizer *rast,
+                    unsigned thread_index,
+                    const union lp_rast_cmd_arg arg )
+{
+   struct lp_fence *fence = arg.fence;
+
+   pipe_mutex_lock( fence->mutex );
+
+   fence->count++;
+   assert(fence->count <= fence->rank);   /* count must never exceed rank */
+
+   LP_DBG(DEBUG_RAST, "%s count=%u rank=%u\n", __FUNCTION__,
+          fence->count, fence->rank);
+
+   pipe_condvar_signal( fence->signalled );   /* signalled with the mutex held */
+
+   pipe_mutex_unlock( fence->mutex );
+}
+
+
+/**
+ * When all the threads are done rasterizing a scene, one thread calls this
+ * to drop the scene's fb references, reset it and put it on the empty queue.
+ */
+static void
+release_scene( struct lp_rasterizer *rast,
+	       struct lp_scene *scene )
+{
+   util_unreference_framebuffer_state( &scene->fb );
+
+   lp_scene_reset( scene );
+   lp_scene_enqueue( rast->empty_scenes, scene );
+   rast->curr_scene = NULL;   /* the rasterizer no longer has a current scene */
+}
+
+
+/**
+ * Rasterize commands for a single bin: start the tile, run every command in
+ * every block of the bin's list in order, then store the tile.
+ * \param x, y  position of the bin's tile in the framebuffer, in pixels
+ * Must be called between lp_rast_begin() and lp_rast_end().  Per thread.
+ */
+static void
+rasterize_bin( struct lp_rasterizer *rast,
+               unsigned thread_index,
+               const struct cmd_bin *bin,
+               int x, int y)
+{
+   const struct cmd_block_list *commands = &bin->commands;
+   struct cmd_block *block;
+   unsigned k;
+
+   lp_rast_start_tile( rast, thread_index, x, y );
+
+   /* simply execute each of the commands in the block list */
+   for (block = commands->head; block; block = block->next) {
+      for (k = 0; k < block->count; k++) {
+         block->cmd[k]( rast, thread_index, block->arg[k] );   /* cmd[k] takes arg[k] */
+      }
+   }
+
+   lp_rast_end_tile( rast, thread_index );   /* writes the tile(s) back */
+}
+
+/* RAST(x) builds a { lp_rast_<x>, "<x>" } pair for the table below. */
+#define RAST(x) { lp_rast_##x, #x }
+/* Table mapping bin command functions to printable names (see debug_bin). */
+static struct {
+   lp_rast_cmd cmd;     /* command function pointer */
+   const char *name;    /* its name without the lp_rast_ prefix */
+} cmd_names[] = 
+{
+   RAST(load_color),
+   RAST(load_zstencil),
+   RAST(clear_color),
+   RAST(clear_zstencil),
+   RAST(triangle),
+   RAST(shade_tile),
+   RAST(set_state),
+   RAST(fence),
+};
+/* Print the name of each command in the bin's first command block only. */
+static void
+debug_bin( const struct cmd_bin *bin )
+{
+   const struct cmd_block *head = bin->commands.head;
+   int i, j;
+
+   for (i = 0; i < head->count; i++) {
+      debug_printf("%d: ", i);
+      for (j = 0; j < Elements(cmd_names); j++) {
+         if (head->cmd[i] == cmd_names[j].cmd) {
+            debug_printf("%s\n", cmd_names[j].name);
+            break;
+         }
+      }
+      if (j == Elements(cmd_names))   /* no table entry matched */
+         debug_printf("...other\n");
+   }
+
+}
+
+/* An empty bin is one that just loads the contents of the tile and
+ * stores them again unchanged.  This typically happens when bins have
+ * been flushed for some reason in the middle of a frame, or when
+ * incremental updates are being made to a render target.
+ * 
+ * Try to avoid doing pointless work in this case.
+ */
+static boolean
+is_empty_bin( const struct cmd_bin *bin )
+{
+   const struct cmd_block *head = bin->commands.head;
+   int i;
+   
+   if (0)   /* change to 1 to dump the bin's commands */
+      debug_bin(bin);
+   
+   /* We emit at most two load-tile commands at the start of the first
+    * command block.  In addition we seem to emit a couple of
+    * set-state commands even in empty bins.
+    *
+    * As a heuristic, if a bin has more than one command block or more
+    * than 4 commands, consider it non-empty.
+    */
+   if (head->next != NULL ||
+       head->count > 4) {
+      return FALSE;
+   }
+   /* any command other than load/set-state makes the bin non-empty */
+   for (i = 0; i < head->count; i++)
+      if (head->cmd[i] != lp_rast_load_color &&
+          head->cmd[i] != lp_rast_load_zstencil &&
+          head->cmd[i] != lp_rast_set_state) {
+         return FALSE;
+      }
+
+   return TRUE;
+}
+
+
+
+/**
+ * Rasterize/execute every non-empty bin returned by the scene's bin iterator.
+ * Called per thread.  NOTE: 'write_depth' is not referenced in this function.
+ */
+static void
+rasterize_scene( struct lp_rasterizer *rast,
+                unsigned thread_index,
+                struct lp_scene *scene,
+                bool write_depth )
+{
+   /* loop over scene bins, rasterize each (the #if 0 variant is disabled) */
+#if 0
+   {
+      unsigned i, j;
+      for (i = 0; i < scene->tiles_x; i++) {
+         for (j = 0; j < scene->tiles_y; j++) {
+            struct cmd_bin *bin = lp_get_bin(scene, i, j);
+            rasterize_bin( rast, thread_index,
+                           bin, i * TILE_SIZE, j * TILE_SIZE );
+         }
+      }
+   }
+#else
+   {
+      struct cmd_bin *bin;
+      int x, y;   /* bin coordinates in tiles, filled in by the iterator */
+
+      assert(scene);
+      while ((bin = lp_scene_bin_iter_next(scene, &x, &y))) {
+         if (!is_empty_bin( bin ))
+            rasterize_bin( rast, thread_index, bin, x * TILE_SIZE, y * TILE_SIZE);
+      }
+   }
+#endif
+}
+
+
+/**
+ * Called by setup module when it has something for us to render.
+ */
+void
+lp_rasterize_scene( struct lp_rasterizer *rast,
+                   struct lp_scene *scene,
+                   const struct pipe_framebuffer_state *fb,
+                   bool write_depth )
+{
+   const boolean dump_bins = false;   /* flip by hand to print bin sizes */
+
+   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
+
+   if (dump_bins) {
+      unsigned tx, ty;
+      debug_printf("rasterize scene:\n");
+      debug_printf("  data size: %u\n", lp_scene_data_size(scene));
+      for (ty = 0; ty < scene->tiles_y; ty++) {
+         for (tx = 0; tx < scene->tiles_x; tx++) {
+            const unsigned bytes = lp_scene_bin_size(scene, tx, ty);
+            debug_printf("  bin %u, %u size: %u\n", tx, ty, bytes);
+         }
+      }
+   }
+
+   /* stash the framebuffer state and the depth-write flag in the scene */
+   util_copy_framebuffer_state(&scene->fb, fb);
+   scene->write_depth = write_depth;
+
+   if (rast->num_threads > 0) {
+      /* threaded path: the scene goes onto the full_scenes queue, from
+       * which worker thread 0 picks it up (see thread_func).
+       */
+      unsigned t;
+
+      lp_scene_enqueue( rast->full_scenes, scene );
+
+      /* wake every worker ... */
+      for (t = 0; t < rast->num_threads; t++)
+         pipe_semaphore_signal(&rast->tasks[t].work_ready);
+
+      /* ... then block until each one reports completion */
+      for (t = 0; t < rast->num_threads; t++)
+         pipe_semaphore_wait(&rast->tasks[t].work_done);
+   }
+   else {
+      /* unthreaded path: rasterize every bin on the calling thread */
+      const boolean write_color = fb->nr_cbufs != 0;   /* always, if cbufs present */
+      const boolean write_zs = fb->zsbuf != NULL && write_depth;
+
+      lp_rast_begin( rast, fb, write_color, write_zs );
+
+      lp_scene_bin_iter_begin( scene );
+      rasterize_scene( rast, 0, scene, write_depth );
+
+      release_scene( rast, scene );
+
+      lp_rast_end( rast );
+   }
+
+   LP_DBG(DEBUG_SETUP, "%s done \n", __FUNCTION__);
+}
+
+
+/**
+ * This is the thread's main entrypoint.
+ * It's a simple loop:
+ *   1. wait for work
+ *   2. do work
+ *   3. signal that we're done
+ */
+static PIPE_THREAD_ROUTINE( thread_func, init_data )
+{
+   struct lp_rasterizer_task *task = (struct lp_rasterizer_task *) init_data;
+   struct lp_rasterizer *rast = task->rast;
+   boolean debug = false;   /* set by hand to trace the worker's state changes */
+
+   while (1) {
+      /* wait for work: lp_rasterize_scene() signals work_ready once per scene */
+      if (debug)
+         debug_printf("thread %d waiting for work\n", task->thread_index);
+      pipe_semaphore_wait(&task->work_ready);
+
+      if (task->thread_index == 0) {
+         /* thread[0]:
+          *  - get next scene to rasterize and publish it in rast->curr_scene
+          *  - map the framebuffer surfaces
+          */
+         const struct pipe_framebuffer_state *fb;
+         boolean write_depth;
+
+         rast->curr_scene = lp_scene_dequeue( rast->full_scenes, TRUE );   /* TRUE: presumably "block until a scene is queued" - TODO confirm */
+
+         lp_scene_bin_iter_begin( rast->curr_scene );
+
+         fb = &rast->curr_scene->fb;   /* state saved by lp_rasterize_scene() */
+         write_depth = rast->curr_scene->write_depth;
+
+         lp_rast_begin( rast, fb,
+                        fb->nr_cbufs != 0,
+                        fb->zsbuf != NULL && write_depth );
+      }
+
+      /* Wait for all threads to get here so that threads[1+] don't
+       * get a null rast->curr_scene pointer.
+       */
+      pipe_barrier_wait( &rast->barrier );
+
+      /* do work: every thread pulls bins from the same scene iterator */
+      if (debug)
+         debug_printf("thread %d doing work\n", task->thread_index);
+      rasterize_scene(rast, 
+		     task->thread_index,
+                     rast->curr_scene, 
+		     rast->curr_scene->write_depth);
+      
+      /* wait for all threads to finish with this scene */
+      pipe_barrier_wait( &rast->barrier );
+
+      if (task->thread_index == 0) {
+         /* thread[0]:
+          * - release the scene object
+          * - unmap the framebuffer surfaces
+          */
+         release_scene( rast, rast->curr_scene );
+         lp_rast_end( rast );
+      }
+
+      /* signal done with work: lp_rasterize_scene() waits on work_done */
+      if (debug)
+         debug_printf("thread %d done working\n", task->thread_index);
+      pipe_semaphore_signal(&task->work_done);
+   }
+
+   return NULL;   /* not reached: the loop above has no exit */
+}
+
+
+/**
+ * Initialize semaphores and spawn the threads.
+ */
+static void
+create_rast_threads(struct lp_rasterizer *rast)
+{
+   unsigned t;
+#ifdef PIPE_OS_WINDOWS
+   /* no worker threads on windows until conditions and barriers are properly implemented */
+   rast->num_threads = 0;
+#else
+   {
+      /* one thread per CPU unless LP_NUM_THREADS overrides it; capped at MAX_THREADS */
+      unsigned wanted = util_cpu_caps.nr_cpus;
+      wanted = debug_get_num_option("LP_NUM_THREADS", wanted);
+      rast->num_threads = MIN2(wanted, MAX_THREADS);
+   }
+#endif
+   /* NOTE: with num_threads == 0 the loop body never runs, so nothing is spawned */
+   for (t = 0; t < rast->num_threads; t++) {
+      struct lp_rasterizer_task *task = &rast->tasks[t];
+      pipe_semaphore_init(&task->work_ready, 0);
+      pipe_semaphore_init(&task->work_done, 0);
+      rast->threads[t] = pipe_thread_create(thread_func, (void *) task);
+   }
+}
+
+
+
+/**
+ * Create new lp_rasterizer.
+ * \param empty  the queue to put empty scenes on after we've finished
+ *               processing them.
+ */
+struct lp_rasterizer *
+lp_rast_create( struct pipe_screen *screen, struct lp_scene_queue *empty )
+{
+   const unsigned tile_bytes = TILE_SIZE * TILE_SIZE * 4;
+   struct lp_rasterizer *rast = CALLOC_STRUCT(lp_rasterizer);
+   unsigned t, c;
+
+   if (rast == NULL)
+      return NULL;
+
+   rast->screen = screen;
+   rast->empty_scenes = empty;
+   rast->full_scenes = lp_scene_queue_create();
+
+   /* set up every task slot, whether or not a thread ends up using it */
+   for (t = 0; t < Elements(rast->tasks); t++) {
+      struct lp_rast_tile *tile = &rast->tasks[t].tile;
+
+      for (c = 0; c < PIPE_MAX_COLOR_BUFS; c++)
+         tile->color[c] = align_malloc(tile_bytes, 16);
+      tile->depth = align_malloc(tile_bytes, 16);
+
+      rast->tasks[t].rast = rast;
+      rast->tasks[t].thread_index = t;
+   }
+
+   create_rast_threads(rast);
+
+   /* barrier sized to the thread count chosen by create_rast_threads() */
+   pipe_barrier_init( &rast->barrier, rast->num_threads );
+
+   return rast;
+}
+
+
+/* Shutdown:
+ */
+void lp_rast_destroy( struct lp_rasterizer *rast )
+{
+   unsigned t, c;
+
+   /* NOTE(review): worker threads, their semaphores and rast->full_scenes
+    * are created in lp_rast_create() but are not torn down here. */
+   util_unreference_framebuffer_state(&rast->state.fb);
+   for (t = 0; t < Elements(rast->tasks); t++) {
+      struct lp_rast_tile *tile = &rast->tasks[t].tile;
+      align_free(tile->depth);
+      for (c = 0; c < PIPE_MAX_COLOR_BUFS; c++)
+         align_free(tile->color[c]);
+   }
+
+   pipe_barrier_destroy( &rast->barrier );
+   FREE(rast);
+}
+
+
+/** Return number of rasterization threads (0 when rendering is unthreaded) */
+unsigned
+lp_rast_get_num_threads( struct lp_rasterizer *rast )
+{
+   return rast->num_threads;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
new file mode 100644
index 0000000000..34da73eb50
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -0,0 +1,236 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * The rast code is concerned with rasterization of command bins.
+ * Each screen tile has a bin associated with it.  To render the
+ * scene we iterate over the tile bins and execute the commands
+ * in each bin.
+ * We'll do that with multiple threads...
+ */
+
+
+#ifndef LP_RAST_H
+#define LP_RAST_H
+
+#include "pipe/p_compiler.h"
+#include "lp_jit.h"
+
+
+struct lp_rasterizer;
+struct lp_scene;
+struct lp_scene_queue;
+struct lp_fence;
+struct cmd_bin;
+struct pipe_screen;
+
+/** For sub-pixel positioning */
+#define FIXED_ORDER 4
+#define FIXED_ONE (1<<FIXED_ORDER)
+
+
+/**
+ * Rasterization state.
+ * Objects of this type are put into the shared data bin and pointed
+ * to by commands in the per-tile bins.
+ */
+struct lp_rast_state {
+   /* State for the shader.  This also contains state which feeds into
+    * the fragment shader, such as blend color and alpha ref value.
+    */
+   struct lp_jit_context jit_context;
+   
+   /* The shader itself, in two variants.  The tile color/depth pointers
+    * are passed as arguments on each call (see lp_rast_shade_quads_all):
+    * jit_function[0] skips the triangle in/out test code
+    * jit_function[1] does triangle in/out testing
+    */
+   lp_jit_frag_func jit_function[2];
+
+   boolean opaque;
+};
+
+
+/**
+ * Coefficients necessary to run the shader at a given location.
+ * First coefficient is position.
+ * These pointers point into the bin data buffer.
+ */
+struct lp_rast_shader_inputs {
+   float (*a0)[4];      /* handed to the JIT shader as its a0 argument */
+   float (*dadx)[4];    /* handed over as dadx - presumably d(attrib)/dx, TODO confirm */
+   float (*dady)[4];    /* handed over as dady - presumably d(attrib)/dy, TODO confirm */
+
+   /* per-edge steps for a 4x4 grid; lp_rast_tri.c scales them by 16 and by 4 */
+   PIPE_ALIGN_VAR(16) int step[3][16];
+};
+
+
+/**
+ * Rasterization information for a triangle known to be in this bin,
+ * plus inputs to run the shader:
+ * These fields are tile- and bin-independent.
+ * Objects of this type are put into the setup_context::data buffer.
+ */
+struct lp_rast_triangle {
+   /* one-pixel sized trivial accept offsets for each plane (lp_rast_tri.c scales them by 16) */
+   int ei1;                   
+   int ei2;
+   int ei3;
+
+   /* one-pixel sized trivial reject offsets for each plane (scaled by 16 and by 4 in lp_rast_tri.c) */
+   int eo1;                   
+   int eo2;
+   int eo3;
+
+   /* y deltas for vertex pairs (in fixed pt) */
+   int dy12;
+   int dy23;
+   int dy31;
+
+   /* x deltas for vertex pairs (in fixed pt) */
+   int dx12;
+   int dx23;
+   int dx31;
+
+   /* edge function constants; lp_rast_triangle() adds dx*y - dy*x for the tile position */
+   int c1, c2, c3;
+
+   /* inputs for the shader */
+   PIPE_ALIGN_VAR(16) struct lp_rast_shader_inputs inputs;
+};
+
+
+
+struct lp_rasterizer *lp_rast_create( struct pipe_screen *screen,
+                                      struct lp_scene_queue *empty );
+
+void lp_rast_destroy( struct lp_rasterizer * );
+
+unsigned lp_rast_get_num_threads( struct lp_rasterizer * );
+
+void lp_rasterize_scene( struct lp_rasterizer *rast,
+			 struct lp_scene *scene,
+			 const struct pipe_framebuffer_state *fb,
+			 bool write_depth );
+
+
+
+union lp_rast_cmd_arg {
+   const struct lp_rast_shader_inputs *shade_tile;   /* filled by lp_rast_arg_inputs() */
+   const struct lp_rast_triangle *triangle;          /* filled by lp_rast_arg_triangle(), read by lp_rast_triangle() */
+   const struct lp_rast_state *set_state;            /* filled by lp_rast_arg_state(); NULL from lp_rast_arg_null() */
+   uint8_t clear_color[4];
+   unsigned clear_zstencil;
+   struct lp_fence *fence;                            /* filled by lp_rast_arg_fence() */
+};
+
+
+/* Cast wrappers.  Hopefully these compile to noops!
+ */
+static INLINE union lp_rast_cmd_arg
+lp_rast_arg_inputs( const struct lp_rast_shader_inputs *shade_tile )
+{
+   union lp_rast_cmd_arg wrapped;   /* only the shade_tile member is set */
+   wrapped.shade_tile = shade_tile;
+   return wrapped;
+}
+
+static INLINE union lp_rast_cmd_arg
+lp_rast_arg_triangle( const struct lp_rast_triangle *triangle )
+{
+   union lp_rast_cmd_arg wrapped;   /* only the triangle member is set */
+   wrapped.triangle = triangle;
+   return wrapped;
+}
+
+static INLINE union lp_rast_cmd_arg
+lp_rast_arg_state( const struct lp_rast_state *state )
+{
+   union lp_rast_cmd_arg wrapped;   /* only the set_state member is set */
+   wrapped.set_state = state;
+   return wrapped;
+}
+
+static INLINE union lp_rast_cmd_arg
+lp_rast_arg_fence( struct lp_fence *fence )
+{
+   union lp_rast_cmd_arg wrapped;   /* only the fence member is set */
+   wrapped.fence = fence;
+   return wrapped;
+}
+
+
+static INLINE union lp_rast_cmd_arg
+lp_rast_arg_null( void )
+{
+   union lp_rast_cmd_arg empty;   /* "no argument": the set_state pointer is nulled */
+   empty.set_state = NULL;
+   return empty;
+}
+
+
+
+/**
+ * Binnable Commands.
+ * These get put into bins by the setup code and are called when
+ * the bins are executed.
+ */
+
+void lp_rast_clear_color( struct lp_rasterizer *, 
+                          unsigned thread_index,
+                          const union lp_rast_cmd_arg );
+
+void lp_rast_clear_zstencil( struct lp_rasterizer *, 
+                             unsigned thread_index,
+                             const union lp_rast_cmd_arg );
+
+void lp_rast_load_color( struct lp_rasterizer *, 
+                         unsigned thread_index,
+                         const union lp_rast_cmd_arg );
+
+void lp_rast_load_zstencil( struct lp_rasterizer *, 
+                            unsigned thread_index,
+                            const union lp_rast_cmd_arg );
+
+void lp_rast_set_state( struct lp_rasterizer *, 
+                        unsigned thread_index,
+                        const union lp_rast_cmd_arg );
+
+void lp_rast_triangle( struct lp_rasterizer *, 
+                       unsigned thread_index,
+                       const union lp_rast_cmd_arg );
+
+void lp_rast_shade_tile( struct lp_rasterizer *,
+                         unsigned thread_index,
+                         const union lp_rast_cmd_arg );
+
+void lp_rast_fence( struct lp_rasterizer *,
+                    unsigned thread_index,
+                    const union lp_rast_cmd_arg );
+
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
new file mode 100644
index 0000000000..71e3a301e6
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
@@ -0,0 +1,172 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef LP_RAST_PRIV_H
+#define LP_RAST_PRIV_H
+
+#include "os/os_thread.h"
+#include "lp_rast.h"
+#include "lp_tile_soa.h"
+
+
+#define MAX_THREADS 8  /* XXX probably temporary here */
+
+
+struct pipe_transfer;
+struct pipe_screen;
+struct lp_rasterizer;
+
+
+/**
+ * A tile's color and depth memory.
+ * We can choose whatever layout for the internal tile storage we prefer.
+ */
+struct lp_rast_tile
+{
+   uint8_t *color[PIPE_MAX_COLOR_BUFS];   /* one buffer per color buffer slot, allocated in lp_rast_create() */
+
+   uint32_t *depth;                        /* allocated in lp_rast_create() */
+};
+
+
+/**
+ * Per-thread rasterization state
+ */
+struct lp_rasterizer_task
+{
+   struct lp_rast_tile tile;   /**< Tile color/z/stencil memory */
+
+   unsigned x, y;          /**< Pos of this tile in framebuffer, in pixels */
+
+   const struct lp_rast_state *current_state;   /**< state read when shading quads */
+
+   /** "back" pointer */
+   struct lp_rasterizer *rast;
+
+   /** "my" index */
+   unsigned thread_index;
+
+   pipe_semaphore work_ready;   /**< signaled by lp_rasterize_scene(), waited on by the thread */
+   pipe_semaphore work_done;    /**< signaled by the thread, waited on by lp_rasterize_scene() */
+};
+
+
+/**
+ * This is the state required while rasterizing tiles.
+ * Note that this contains per-thread information too.
+ * The tile size is TILE_SIZE x TILE_SIZE pixels.
+ */
+struct lp_rasterizer
+{
+   boolean clipped_tile;
+   boolean check_for_clipped_tiles;
+
+   /* Framebuffer stuff
+    */
+   struct pipe_screen *screen;
+   struct pipe_transfer *cbuf_transfer[PIPE_MAX_COLOR_BUFS];
+   struct pipe_transfer *zsbuf_transfer;
+   void *cbuf_map[PIPE_MAX_COLOR_BUFS];
+   void *zsbuf_map;
+
+   struct {
+      struct pipe_framebuffer_state fb;
+      boolean write_color;
+      boolean write_zstencil;
+      unsigned clear_color;
+      unsigned clear_depth;
+      char clear_stencil;
+   } state;
+
+   /** The incoming queue of scenes ready to rasterize */
+   struct lp_scene_queue *full_scenes;
+   /** The outgoing queue of processed scenes to return to the setup module */
+   struct lp_scene_queue *empty_scenes;
+
+   /** The scene currently being rasterized by the threads */
+   struct lp_scene *curr_scene;
+
+   /** A task object for each rasterization thread; tasks[0] is also used when unthreaded */
+   struct lp_rasterizer_task tasks[MAX_THREADS];
+
+   unsigned num_threads;   /**< 0 means no worker threads: the caller rasterizes */
+   pipe_thread threads[MAX_THREADS];
+
+   /** For synchronizing the rasterization threads */
+   pipe_barrier barrier;
+};
+
+
+void lp_rast_shade_quads( struct lp_rasterizer *rast,
+                          unsigned thread_index,
+                          const struct lp_rast_shader_inputs *inputs,
+                          unsigned x, unsigned y,
+                          int32_t c1, int32_t c2, int32_t c3);
+
+
+/**
+ * Shade all pixels in a 4x4 block.  The fragment code omits the
+ * triangle in/out tests.
+ * \param x, y location of 4x4 block in window coords
+ */
+static INLINE void
+lp_rast_shade_quads_all( struct lp_rasterizer *rast,
+                         unsigned thread_index,
+                         const struct lp_rast_shader_inputs *inputs,
+                         unsigned x, unsigned y )
+{
+   struct lp_rasterizer_task *task = &rast->tasks[thread_index];
+   const struct lp_rast_state *state = task->current_state;
+   const unsigned bx = (x % TILE_SIZE) / 4;   /* 4x4 block column within the tile */
+   const unsigned by = (y % TILE_SIZE) / 4;   /* 4x4 block row within the tile */
+   /* element offset of this 4x4 block inside the tile buffers:
+    * 16 elements per block, 16 * 16 elements per row of blocks.
+    */
+   const unsigned block_offset = by * (16 * 16) + bx * 16;
+   uint8_t *color[PIPE_MAX_COLOR_BUFS];
+   void *depth;
+   unsigned cbuf;
+
+   /* color buffers: byte pointers, 4 bytes per element */
+   for (cbuf = 0; cbuf < rast->state.fb.nr_cbufs; cbuf++)
+      color[cbuf] = task->tile.color[cbuf] + 4 * block_offset;
+
+   /* depth buffer: uint32_t pointer, so the offset is already in elements */
+   depth = task->tile.depth + block_offset;
+   /* jit_function[0] is the variant without triangle in/out tests; the
+    * INT_MIN / NULL arguments stand in for the unused edge parameters
+    * (presumably - confirm against lp_jit.h).
+    */
+   state->jit_function[0]( &state->jit_context,
+                           x, y,
+                           inputs->a0, inputs->dadx, inputs->dady,
+                           color, depth,
+                           INT_MIN, INT_MIN, INT_MIN,
+                           NULL, NULL, NULL );
+}
+
+
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
new file mode 100644
index 0000000000..3f76f159df
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -0,0 +1,251 @@
+/**************************************************************************
+ *
+ * Copyright 2007-2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/*
+ * Rasterization for binned triangles within a tile
+ */
+
+#include <limits.h>
+#include "util/u_math.h"
+#include "lp_debug.h"
+#include "lp_perf.h"
+#include "lp_rast_priv.h"
+#include "lp_tile_soa.h"
+
+
+/**
+ * Map an index in [0,15] to an x,y position, multiplied by 4.
+ * This is used to get the position of each subtile in a 4x4
+ * grid of edge step values.
+ * Note: we can use some bit twiddling to compute these values instead
+ * of using a look-up table, but there's no measurable performance
+ * difference.
+ */
+static const int pos_table4[16][2] = {   /* { x, y } offsets in pixels, 4-pixel steps */
+   { 0, 0 },     /* entries 0-3: x < 8, y < 8 */
+   { 4, 0 },
+   { 0, 4 },
+   { 4, 4 },
+   { 8, 0 },     /* entries 4-7: x >= 8, y < 8 */
+   { 12, 0 },
+   { 8, 4 },
+   { 12, 4 },
+   { 0, 8 },     /* entries 8-11: x < 8, y >= 8 */
+   { 4, 8 },
+   { 0, 12 },
+   { 4, 12 },
+   { 8, 8 },     /* entries 12-15: x >= 8, y >= 8 */
+   { 12, 8 },
+   { 8, 12 },
+   { 12, 12 }
+};
+
+
+static const int pos_table16[16][2] = {   /* same ordering as pos_table4, in 16-pixel steps */
+   { 0, 0 },     /* entries 0-3: x < 32, y < 32 */
+   { 16, 0 },
+   { 0, 16 },
+   { 16, 16 },
+   { 32, 0 },    /* entries 4-7: x >= 32, y < 32 */
+   { 48, 0 },
+   { 32, 16 },
+   { 48, 16 },
+   { 0, 32 },    /* entries 8-11: x < 32, y >= 32 */
+   { 16, 32 },
+   { 0, 48 },
+   { 16, 48 },
+   { 32, 32 },   /* entries 12-15: x >= 32, y >= 32 */
+   { 48, 32 },
+   { 32, 48 },
+   { 48, 48 }
+};
+
+
+/**
+ * Shade all pixels in a 4x4 block.
+ */
+static void
+block_full_4( struct lp_rasterizer_task *rast_task,
+              const struct lp_rast_triangle *tri,
+              int x, int y )
+{
+   struct lp_rasterizer *rast = rast_task->rast;
+   const unsigned thread = rast_task->thread_index;
+   /* fully covered block: shade it without per-pixel in/out tests */
+   lp_rast_shade_quads_all(rast, thread, &tri->inputs, x, y);
+}
+
+
+/**
+ * Shade all pixels in a 16x16 block.
+ */
+static void
+block_full_16( struct lp_rasterizer_task *rast_task,
+               const struct lp_rast_triangle *tri,
+               int x, int y )
+{
+   unsigned n;
+
+   assert(x % 16 == 0);
+   assert(y % 16 == 0);
+   for (n = 0; n < 16; n++)   /* row-major walk over the 4x4 grid of 4x4 blocks */
+      block_full_4(rast_task, tri, x + (n % 4) * 4, y + (n / 4) * 4);
+}
+
+
+/**
+ * Pass the 4x4 pixel block to the shader function.
+ * Determination of which of the 16 pixels lies inside the triangle
+ * will be done as part of the fragment shader.
+ */
+static void
+do_block_4( struct lp_rasterizer_task *rast_task,
+            const struct lp_rast_triangle *tri,
+            int x, int y,
+            int c1,
+            int c2,
+            int c3 )
+{
+   /* the three edge values are handed to the shader negated */
+   const int n1 = -c1, n2 = -c2, n3 = -c3;
+
+   lp_rast_shade_quads(rast_task->rast, rast_task->thread_index,
+                       &tri->inputs, x, y, n1, n2, n3);
+}
+
+
+/**
+ * Evaluate a 16x16 block of pixels to determine which 4x4 subblocks are in/out
+ * of the triangle's bounds.
+ */
+static void
+do_block_16( struct lp_rasterizer_task *rast_task,
+             const struct lp_rast_triangle *tri,
+             int x, int y,
+             int c1,
+             int c2,
+             int c3 )
+{
+   /* trivial-reject offsets, scaled from one pixel to a 4-pixel block */
+   const int reject1 = tri->eo1 * 4;
+   const int reject2 = tri->eo2 * 4;
+   const int reject3 = tri->eo3 * 4;
+
+   const int (*step)[16] = tri->inputs.step;
+   int sub;
+
+   assert(x % 16 == 0);
+   assert(y % 16 == 0);
+
+   for (sub = 0; sub < 16; sub++) {
+      const int e1 = c1 + step[0][sub] * 4;
+      const int e2 = c2 + step[1][sub] * 4;
+      const int e3 = c3 + step[2][sub] * 4;
+
+      if (e1 + reject1 < 0 ||
+          e2 + reject2 < 0 ||
+          e3 + reject3 < 0) {
+         /* this 4x4 block lies completely outside the triangle */
+         LP_COUNT(nr_empty_4);
+         continue;
+      }
+
+      /* No attempt is made to tell fully covered blocks from partial
+       * ones here: doing that test in the jit code is a little faster.
+       */
+      LP_COUNT(nr_non_empty_4);
+      do_block_4(rast_task, tri,
+                 x + pos_table4[sub][0], y + pos_table4[sub][1],
+                 e1, e2, e3);
+   }
+}
+
+
+/**
+ * Scan the tile in chunks and figure out which pixels to rasterize
+ * for this triangle.
+ */
+void
+lp_rast_triangle( struct lp_rasterizer *rast,
+                  unsigned thread_index,
+                  const union lp_rast_cmd_arg arg )
+{
+   struct lp_rasterizer_task *task = &rast->tasks[thread_index];
+   const struct lp_rast_triangle *tri = arg.triangle;
+   const int (*step)[16] = tri->inputs.step;
+
+   /* position of the current tile, in pixels */
+   const int tile_x = task->x;
+   const int tile_y = task->y;
+
+   /* edge function values offset to the tile position */
+   const int base1 = tri->c1 + tri->dx12 * tile_y - tri->dy12 * tile_x;
+   const int base2 = tri->c2 + tri->dx23 * tile_y - tri->dy23 * tile_x;
+   const int base3 = tri->c3 + tri->dx31 * tile_y - tri->dy31 * tile_x;
+
+   /* trivial accept / reject offsets, scaled to 16-pixel blocks */
+   const int accept1 = tri->ei1 * 16, reject1 = tri->eo1 * 16;
+   const int accept2 = tri->ei2 * 16, reject2 = tri->eo2 * 16;
+   const int accept3 = tri->ei3 * 16, reject3 = tri->eo3 * 16;
+
+   unsigned blk;
+
+   LP_DBG(DEBUG_RAST, "lp_rast_triangle\n");
+
+   /* Visit the sixteen 16x16 blocks listed in pos_table16.  Blocks
+    * entirely outside the triangle are skipped, blocks entirely inside
+    * are shaded directly, and the rest are refined at 4x4 granularity.
+    */
+   for (blk = 0; blk < 16; blk++) {
+      const int e1 = base1 + (step[0][blk] * 16);
+      const int e2 = base2 + (step[1][blk] * 16);
+      const int e3 = base3 + (step[2][blk] * 16);
+      const int px = tile_x + pos_table16[blk][0];
+      const int py = tile_y + pos_table16[blk][1];
+
+      if (e1 + reject1 < 0 ||
+          e2 + reject2 < 0 ||
+          e3 + reject3 < 0) {
+         /* the block is completely outside the triangle - nop */
+         LP_COUNT(nr_empty_16);
+         continue;
+      }
+
+      if (e1 + accept1 > 0 &&
+          e2 + accept2 > 0 &&
+          e3 + accept3 > 0) {
+         /* the block is completely inside the triangle */
+         LP_COUNT(nr_fully_covered_16);
+         block_full_16(task, tri, px, py);
+      }
+      else {
+         /* the block straddles at least one edge */
+         LP_COUNT(nr_partially_covered_16);
+         do_block_16(task, tri, px, py, e1, e2, e3);
+      }
+   }
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_scene.c b/src/gallium/drivers/llvmpipe/lp_scene.c
new file mode 100644
index 0000000000..b7116297ec
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_scene.c
@@ -0,0 +1,392 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "util/u_simple_list.h"
+#include "lp_scene.h"
+
+
+struct lp_scene *
+lp_scene_create(void)
+{
+   struct lp_scene *fresh = CALLOC_STRUCT(lp_scene);
+   if (!fresh) return NULL;   /* allocation failed: nothing to initialise */
+   lp_scene_init(fresh);
+   return fresh;
+}
+
+
+void
+lp_scene_destroy(struct lp_scene *scene)
+{
+   lp_scene_reset(scene);           /* trim the lists to one block each, drop texture refs */
+   lp_scene_free_bin_data(scene);   /* free those last blocks and the mutex */
+   FREE(scene);
+}
+
+
+void
+lp_scene_init(struct lp_scene *scene)
+{
+   unsigned tx, ty;
+   /* every bin starts with one zeroed command block as both head and tail */
+   for (tx = 0; tx < TILES_X; tx++) {
+      for (ty = 0; ty < TILES_Y; ty++) {
+         struct cmd_block_list *cmds = &lp_scene_get_bin(scene, tx, ty)->commands;
+         cmds->tail = CALLOC_STRUCT(cmd_block);
+         cmds->head = cmds->tail;
+      }
+   }
+   scene->data.tail = CALLOC_STRUCT(data_block);   /* likewise for the data list */
+   scene->data.head = scene->data.tail;
+   make_empty_list(&scene->textures);
+   pipe_mutex_init(scene->mutex);
+}
+
+
+/**
+ * Check if the scene's bins are all empty.
+ * For debugging purposes.
+ */
+boolean
+lp_scene_is_empty(struct lp_scene *scene )
+{
+   unsigned col, row;
+   for (row = 0; row < TILES_Y; row++) {
+      for (col = 0; col < TILES_X; col++) {
+         const struct cmd_block_list *cmds = &lp_scene_get_bin(scene, col, row)->commands;
+         /* empty means: a single block that holds no commands */
+         if (cmds->head != cmds->tail)
+            return FALSE;
+         if (cmds->head->count > 0)
+            return FALSE;
+      }
+   }
+   return TRUE;
+}
+
+
+void
+lp_scene_bin_reset(struct lp_scene *scene, unsigned x, unsigned y)
+{
+   struct cmd_block_list *cmds = &lp_scene_get_bin(scene, x, y)->commands;
+   struct cmd_block *cur = cmds->head;
+
+   /* free every block ahead of the tail; the tail block itself is kept */
+   while (cur != cmds->tail) {
+      struct cmd_block *following = cur->next;
+      FREE(cur);
+      cur = following;
+   }
+
+   assert(cmds->tail->next == NULL);
+   cmds->head = cmds->tail;
+   cmds->head->count = 0;   /* the surviving block becomes an empty head */
+}
+
+
+/**
+ * Set scene to empty state.
+ */
+void
+lp_scene_reset(struct lp_scene *scene )
+{
+   struct data_block_list *data = &scene->data;
+   struct texture_ref *refs = &scene->textures;
+   struct data_block *blk;
+   struct texture_ref *ref;
+   unsigned tx, ty;
+
+   /* Trim each bin in the active tiles_x * tiles_y area down to a
+    * single, empty command block.
+    */
+   for (tx = 0; tx < scene->tiles_x; tx++)
+      for (ty = 0; ty < scene->tiles_y; ty++)
+         lp_scene_bin_reset(scene, tx, ty);
+
+   assert(lp_scene_is_empty(scene));
+
+   /* Free all binned data blocks except the tail, which is kept and
+    * marked unused.
+    */
+   blk = data->head;
+   while (blk != data->tail) {
+      struct data_block *following = blk->next;
+      FREE(blk);
+      blk = following;
+   }
+   assert(data->tail->next == NULL);
+   data->head = data->tail;
+   data->head->used = 0;
+
+   /* Drop the texture references taken by lp_scene_texture_reference()
+    * and free the list nodes.
+    */
+   ref = refs->next;
+   while (ref != refs) {
+      struct texture_ref *following = next_elem(ref);
+      pipe_texture_reference(&ref->texture, NULL);
+      FREE(ref);
+      ref = following;
+   }
+   make_empty_list(refs);
+}
+
+
+/**
+ * Free the last command block of every bin, the last data block and the mutex, but don't free(scene).
+ */
+void
+lp_scene_free_bin_data(struct lp_scene *scene)
+{
+   unsigned i, j;
+
+   for (i = 0; i < TILES_X; i++)
+      for (j = 0; j < TILES_Y; j++) {
+         struct cmd_bin *bin = lp_scene_get_bin(scene, i, j);
+         /* lp_scene_reset() must already have trimmed the bin to one block */
+         assert(bin->commands.head == bin->commands.tail);
+         FREE(bin->commands.head);
+         bin->commands.head = NULL;
+         bin->commands.tail = NULL;
+      }
+
+   /* likewise for the data list; clear tail too so it does not dangle */
+   FREE(scene->data.head);
+   scene->data.head = scene->data.tail = NULL;
+   pipe_mutex_destroy(scene->mutex);
+}
+
+
+void
+lp_scene_set_framebuffer_size( struct lp_scene *scene,
+                               unsigned width, unsigned height )
+{
+   assert(lp_scene_is_empty(scene));
+   /* tile counts come from the size aligned to TILE_SIZE, divided by TILE_SIZE */
+   scene->tiles_y = align(height, TILE_SIZE) / TILE_SIZE;
+   scene->tiles_x = align(width, TILE_SIZE) / TILE_SIZE;
+}
+
+
+void
+lp_bin_new_cmd_block( struct cmd_block_list *list )
+{
+   struct cmd_block *fresh = MALLOC_STRUCT(cmd_block);   /* NOTE(review): result is not checked */
+   list->tail->next = fresh;   /* link behind the current tail ... */
+   list->tail = fresh;         /* ... and make it the new tail */
+   fresh->count = 0;
+   fresh->next = NULL;
+}
+
+
+void
+lp_bin_new_data_block( struct data_block_list *list )
+{
+   struct data_block *block = MALLOC_STRUCT(data_block);
+   list->tail->next = block;
+   list->tail = block;
+   block->next = NULL;
+   block->used = 0;
+}
+
+
+/** Return number of bytes used for all bin data within a scene */
+unsigned
+lp_scene_data_size( const struct lp_scene *scene )
+{
+   unsigned size = 0;
+   const struct data_block *block;
+   for (block = scene->data.head; block; block = block->next) {
+      size += block->used;
+   }
+   return size;
+}
+
+
+/** Return number of bytes used for a single bin */
+unsigned
+lp_scene_bin_size( const struct lp_scene *scene, unsigned x, unsigned y )
+{
+   struct cmd_bin *bin = lp_scene_get_bin((struct lp_scene *) scene, x, y);
+   const struct cmd_block *cmd;
+   unsigned size = 0;
+   for (cmd = bin->commands.head; cmd; cmd = cmd->next) {
+      size += (cmd->count *
+               (sizeof(lp_rast_cmd) + sizeof(union lp_rast_cmd_arg)));
+   }
+   return size;
+}
+
+
+/**
+ * Add a reference to a texture by the scene.
+ */
+void
+lp_scene_texture_reference( struct lp_scene *scene,
+                            struct pipe_texture *texture )
+{
+   struct texture_ref *ref = CALLOC_STRUCT(texture_ref);
+   if (ref) {
+      struct texture_ref *ref_list = &scene->textures;
+      pipe_texture_reference(&ref->texture, texture);
+      insert_at_tail(ref_list, ref);
+   }
+}
+
+
+/**
+ * Does this scene have a reference to the given texture?
+ */
+boolean
+lp_scene_is_texture_referenced( const struct lp_scene *scene,
+                                const struct pipe_texture *texture )
+{
+   const struct texture_ref *ref_list = &scene->textures;
+   const struct texture_ref *ref;
+   foreach (ref, ref_list) {
+      if (ref->texture == texture)
+         return TRUE;
+   }
+   return FALSE;
+}
+
+
+/**
+ * Return last command in the bin, or NULL if the tail block is empty.
+ */
+static lp_rast_cmd
+lp_get_last_command( const struct cmd_bin *bin )
+{
+   const struct cmd_block *tail = bin->commands.tail;
+   const unsigned i = tail->count;
+   if (i > 0)
+      return tail->cmd[i - 1];
+   else
+      return NULL;
+}
+
+
+/**
+ * Replace the arg of the last command in the bin.
+ */
+static void
+lp_replace_last_command_arg( struct cmd_bin *bin,
+                             const union lp_rast_cmd_arg arg )
+{
+   struct cmd_block *tail = bin->commands.tail;
+   const unsigned i = tail->count;
+   assert(i > 0);
+   tail->arg[i - 1] = arg;
+}
+
+
+
+/**
+ * Put a state-change command into all bins.
+ * If we find that the last command in a bin was also a state-change
+ * command, we can simply replace that one with the new one.
+ */
+void
+lp_scene_bin_state_command( struct lp_scene *scene,
+                            lp_rast_cmd cmd,
+                            const union lp_rast_cmd_arg arg )
+{
+   unsigned i, j;
+   for (i = 0; i < scene->tiles_x; i++) {
+      for (j = 0; j < scene->tiles_y; j++) {
+         struct cmd_bin *bin = lp_scene_get_bin(scene, i, j);
+         lp_rast_cmd last_cmd = lp_get_last_command(bin);
+         if (last_cmd == cmd) {
+            lp_replace_last_command_arg(bin, arg);
+         }
+         else {
+            lp_scene_bin_command( scene, i, j, cmd, arg );
+         }
+      }
+   }
+}
+
+
+/** Advance curr_x,y to the next bin; returns FALSE when no bins remain */
+static boolean
+next_bin(struct lp_scene *scene)
+{
+   scene->curr_x++;
+   if (scene->curr_x >= scene->tiles_x) {
+      scene->curr_x = 0;
+      scene->curr_y++;
+   }
+   if (scene->curr_y >= scene->tiles_y) {
+      /* no more bins */
+      return FALSE;
+   }
+   return TRUE;
+}
+
+
+void
+lp_scene_bin_iter_begin( struct lp_scene *scene )
+{
+   scene->curr_x = scene->curr_y = -1;
+}
+
+
+/**
+ * Return pointer to next bin to be rendered.
+ * The lp_scene::curr_x and ::curr_y fields will be advanced.
+ * Multiple rendering threads will call this function to get a chunk
+ * of work (a bin) to work on.
+ */
+struct cmd_bin *
+lp_scene_bin_iter_next( struct lp_scene *scene, int *bin_x, int *bin_y )
+{
+   struct cmd_bin *bin = NULL;
+
+   pipe_mutex_lock(scene->mutex);
+
+   if (scene->curr_x < 0) {
+      /* first bin */
+      scene->curr_x = 0;
+      scene->curr_y = 0;
+   }
+   else if (!next_bin(scene)) {
+      /* no more bins left */
+      goto end;
+   }
+
+   bin = lp_scene_get_bin(scene, scene->curr_x, scene->curr_y);
+   *bin_x = scene->curr_x;
+   *bin_y = scene->curr_y;
+
+end:
+   /*printf("return bin %p at %d, %d\n", (void *) bin, *bin_x, *bin_y);*/
+   pipe_mutex_unlock(scene->mutex);
+   return bin;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_scene.h b/src/gallium/drivers/llvmpipe/lp_scene.h
new file mode 100644
index 0000000000..fb478cc2eb
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_scene.h
@@ -0,0 +1,301 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * Binner data structures and bin-related functions.
+ * Note: the "setup" code is concerned with building scenes, while
+ * the "rast" code is concerned with consuming/executing scenes.
+ */
+
+#ifndef LP_SCENE_H
+#define LP_SCENE_H
+
+#include "os/os_thread.h"
+#include "lp_tile_soa.h"
+#include "lp_rast.h"
+
+
+/* We're limited to 2K by 2K for 32bit fixed point rasterization.
+ * Will need a 64-bit version for larger framebuffers.
+ */
+#define MAXHEIGHT 2048
+#define MAXWIDTH 2048
+#define TILES_X (MAXWIDTH / TILE_SIZE)
+#define TILES_Y (MAXHEIGHT / TILE_SIZE)
+
+
+#define CMD_BLOCK_MAX 128
+#define DATA_BLOCK_SIZE (16 * 1024 - sizeof(unsigned) - sizeof(void *))
+   
+
+
+/* switch to a non-pointer value for this:
+ */
+typedef void (*lp_rast_cmd)( struct lp_rasterizer *,
+                             unsigned thread_index,
+                             const union lp_rast_cmd_arg );
+
+struct cmd_block {
+   lp_rast_cmd cmd[CMD_BLOCK_MAX];
+   union lp_rast_cmd_arg arg[CMD_BLOCK_MAX];
+   unsigned count;
+   struct cmd_block *next;
+};
+
+struct data_block {
+   ubyte data[DATA_BLOCK_SIZE];
+   unsigned used;
+   struct data_block *next;
+};
+
+struct cmd_block_list {
+   struct cmd_block *head;
+   struct cmd_block *tail;
+};
+
+/**
+ * For each screen tile we have one of these bins.
+ */
+struct cmd_bin {
+   struct cmd_block_list commands;
+};
+   
+
+/**
+ * This stores bulk data which is shared by all bins within a scene.
+ * Examples include triangle data and state data.  The commands in
+ * the per-tile bins will point to chunks of data in this structure.
+ */
+struct data_block_list {
+   struct data_block *head;
+   struct data_block *tail;
+};
+
+
+/** List of texture references */
+struct texture_ref {
+   struct pipe_texture *texture;
+   struct texture_ref *prev, *next;  /**< linked list w/ u_simple_list.h */
+};
+
+
+/**
+ * All bins and bin data are contained here.
+ * Per-bin data goes into the 'tile' bins.
+ * Shared data goes into the 'data' buffer.
+ *
+ * When there are multiple threads, will want to double-buffer between
+ * scenes:
+ */
+struct lp_scene {
+   struct cmd_bin tile[TILES_X][TILES_Y];
+   struct data_block_list data;
+
+   /** the framebuffer to render the scene into */
+   struct pipe_framebuffer_state fb;
+
+   /** list of textures referenced by the scene commands */
+   struct texture_ref textures;
+
+   boolean write_depth;
+
+   /**
+    * Number of active tiles in each dimension.
+    * This is basically the framebuffer size divided by the tile size, rounded up.
+    */
+   unsigned tiles_x, tiles_y;
+
+   int curr_x, curr_y;  /**< for iterating over bins */
+   pipe_mutex mutex;
+};
+
+
+
+struct lp_scene *lp_scene_create(void);
+
+void lp_scene_destroy(struct lp_scene *scene);
+
+
+void lp_scene_init(struct lp_scene *scene);
+
+boolean lp_scene_is_empty(struct lp_scene *scene );
+
+void lp_scene_reset(struct lp_scene *scene );
+
+void lp_scene_free_bin_data(struct lp_scene *scene);
+
+void lp_scene_set_framebuffer_size( struct lp_scene *scene,
+                                  unsigned width, unsigned height );
+
+void lp_bin_new_data_block( struct data_block_list *list );
+
+void lp_bin_new_cmd_block( struct cmd_block_list *list );
+
+unsigned lp_scene_data_size( const struct lp_scene *scene );
+
+unsigned lp_scene_bin_size( const struct lp_scene *scene, unsigned x, unsigned y );
+
+void lp_scene_texture_reference( struct lp_scene *scene,
+                                 struct pipe_texture *texture );
+
+boolean lp_scene_is_texture_referenced( const struct lp_scene *scene,
+                                        const struct pipe_texture *texture );
+
+
+/**
+ * Allocate space for a command/data in the scene's data buffer.
+ * Grow the block list if needed.
+ */
+static INLINE void *
+lp_scene_alloc( struct lp_scene *scene, unsigned size)
+{
+   struct data_block_list *list = &scene->data;
+
+   if (list->tail->used + size > DATA_BLOCK_SIZE) {
+      lp_bin_new_data_block( list );
+   }
+
+   {
+      struct data_block *tail = list->tail;
+      ubyte *data = tail->data + tail->used;
+      tail->used += size;
+      return data;
+   }
+}
+
+
+/**
+ * As above, but with specific alignment.
+ */
+static INLINE void *
+lp_scene_alloc_aligned( struct lp_scene *scene, unsigned size,
+			unsigned alignment )
+{
+   struct data_block_list *list = &scene->data;
+
+   if (list->tail->used + size + alignment - 1 > DATA_BLOCK_SIZE) {
+      lp_bin_new_data_block( list );
+   }
+
+   {
+      struct data_block *tail = list->tail;
+      ubyte *data = tail->data + tail->used;
+      unsigned offset = (((uintptr_t)data + alignment - 1) & ~(alignment - 1)) - (uintptr_t)data;
+      tail->used += offset + size;
+      return data + offset;
+   }
+}
+
+
+/* Put back data if we decide not to use it, eg. culled triangles.
+ */
+static INLINE void
+lp_scene_putback_data( struct lp_scene *scene, unsigned size)
+{
+   struct data_block_list *list = &scene->data;
+   assert(list->tail->used >= size);
+   list->tail->used -= size;
+}
+
+
+/** Return pointer to a particular tile's bin. */
+static INLINE struct cmd_bin *
+lp_scene_get_bin(struct lp_scene *scene, unsigned x, unsigned y)
+{
+   return &scene->tile[x][y];
+}
+
+
+/** Remove all commands from a bin */
+void
+lp_scene_bin_reset(struct lp_scene *scene, unsigned x, unsigned y);
+
+
+/* Add a command to bin[x][y].
+ */
+static INLINE void
+lp_scene_bin_command( struct lp_scene *scene,
+                unsigned x, unsigned y,
+                lp_rast_cmd cmd,
+                union lp_rast_cmd_arg arg )
+{
+   struct cmd_bin *bin = lp_scene_get_bin(scene, x, y);
+   struct cmd_block_list *list = &bin->commands;
+
+   assert(x < scene->tiles_x);
+   assert(y < scene->tiles_y);
+
+   if (list->tail->count == CMD_BLOCK_MAX) {
+      lp_bin_new_cmd_block( list );
+   }
+
+   {
+      struct cmd_block *tail = list->tail;
+      unsigned i = tail->count;
+      tail->cmd[i] = cmd;
+      tail->arg[i] = arg;
+      tail->count++;
+   }
+}
+
+
+/* Add a command to all active bins.
+ */
+static INLINE void
+lp_scene_bin_everywhere( struct lp_scene *scene,
+			 lp_rast_cmd cmd,
+			 const union lp_rast_cmd_arg arg )
+{
+   unsigned i, j;
+   for (i = 0; i < scene->tiles_x; i++)
+      for (j = 0; j < scene->tiles_y; j++)
+         lp_scene_bin_command( scene, i, j, cmd, arg );
+}
+
+
+void
+lp_scene_bin_state_command( struct lp_scene *scene,
+			    lp_rast_cmd cmd,
+			    const union lp_rast_cmd_arg arg );
+
+
+static INLINE unsigned
+lp_scene_get_num_bins( const struct lp_scene *scene )
+{
+   return scene->tiles_x * scene->tiles_y;
+}
+
+
+void
+lp_scene_bin_iter_begin( struct lp_scene *scene );
+
+struct cmd_bin *
+lp_scene_bin_iter_next( struct lp_scene *scene, int *bin_x, int *bin_y );
+
+
+#endif /* LP_SCENE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_scene_queue.c b/src/gallium/drivers/llvmpipe/lp_scene_queue.c
new file mode 100644
index 0000000000..43d74e4d89
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_scene_queue.c
@@ -0,0 +1,122 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * Scene queue.  We'll use two queues.  One contains "full" scenes which
+ * are produced by the "setup" code.  The other contains "empty" scenes
+ * which are produced by the "rast" code when it finishes rendering a scene.
+ */
+
+#include "util/u_ringbuffer.h"
+#include "util/u_memory.h"
+#include "lp_scene_queue.h"
+
+
+
+#define MAX_SCENE_QUEUE 4
+
+struct scene_packet {
+   struct util_packet header;
+   struct lp_scene *scene;
+};
+
+/**
+ * A queue of scenes
+ */
+struct lp_scene_queue
+{
+   struct util_ringbuffer *ring;
+};
+
+
+
+/** Allocate a new scene queue */
+struct lp_scene_queue *
+lp_scene_queue_create(void)
+{
+   struct lp_scene_queue *queue = CALLOC_STRUCT(lp_scene_queue);
+   if (queue == NULL)
+      return NULL;
+
+   queue->ring = util_ringbuffer_create( MAX_SCENE_QUEUE * 
+                                         sizeof( struct scene_packet ) / 4);
+   if (queue->ring == NULL)
+      goto fail;
+
+   return queue;
+
+fail:
+   FREE(queue);
+   return NULL;
+}
+
+
+/** Delete a scene queue */
+void
+lp_scene_queue_destroy(struct lp_scene_queue *queue)
+{
+   util_ringbuffer_destroy(queue->ring);
+   FREE(queue);
+}
+
+
+/** Remove first lp_scene from head of queue; returns NULL on failure */
+struct lp_scene *
+lp_scene_dequeue(struct lp_scene_queue *queue, boolean wait)
+{
+   struct scene_packet packet;
+   enum pipe_error ret;
+
+   ret = util_ringbuffer_dequeue(queue->ring,
+                                 &packet.header,
+                                 sizeof packet / 4,
+                                 wait );
+   if (ret != PIPE_OK)
+      return NULL;
+
+   return packet.scene;
+}
+
+
+/** Add an lp_scene to tail of queue */
+void
+lp_scene_enqueue(struct lp_scene_queue *queue, struct lp_scene *scene)
+{
+   struct scene_packet packet;
+
+   packet.header.dwords = sizeof packet / 4;
+   packet.header.data24 = 0;
+   packet.scene = scene;
+
+   util_ringbuffer_enqueue(queue->ring, &packet.header);
+}
+
+
+
+
+
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_misc.h b/src/gallium/drivers/llvmpipe/lp_scene_queue.h
index 0e787e0b9c..fd7c65a2c8 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_misc.h
+++ b/src/gallium/drivers/llvmpipe/lp_scene_queue.h
@@ -26,31 +26,26 @@
  **************************************************************************/
 
 
-#ifndef LP_BLD_MISC_H
-#define LP_BLD_MISC_H
+#ifndef LP_SCENE_QUEUE
+#define LP_SCENE_QUEUE
 
+struct lp_scene_queue;
+struct lp_scene;
 
-#include "llvm/Config/config.h"
 
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-#ifndef LLVM_NATIVE_ARCH
+struct lp_scene_queue *
+lp_scene_queue_create(void);
 
 void
-LLVMLinkInJIT(void);
+lp_scene_queue_destroy(struct lp_scene_queue *queue);
 
-int
-LLVMInitializeNativeTarget(void);
+struct lp_scene *
+lp_scene_dequeue(struct lp_scene_queue *queue, boolean wait);
 
-#endif /* !LLVM_NATIVE_ARCH */
+void
+lp_scene_enqueue(struct lp_scene_queue *queue, struct lp_scene *scene);
 
 
-#ifdef __cplusplus
-}
-#endif
 
 
-#endif /* !LP_BLD_MISC_H */
+#endif /* LP_SCENE_QUEUE */
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 9b47415f00..1cd3ea9a84 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -33,9 +33,11 @@
 
 #include "lp_texture.h"
 #include "lp_buffer.h"
+#include "lp_fence.h"
 #include "lp_winsys.h"
 #include "lp_jit.h"
 #include "lp_screen.h"
+#include "lp_context.h"
 #include "lp_debug.h"
 
 #ifdef DEBUG
@@ -51,6 +53,10 @@ static const struct debug_named_value lp_debug_flags[] = {
    { "query",  DEBUG_QUERY },
    { "screen", DEBUG_SCREEN },
    { "jit",    DEBUG_JIT },
+   { "show_tiles",    DEBUG_SHOW_TILES },
+   { "show_subtiles", DEBUG_SHOW_SUBTILES },
+   { "counters", DEBUG_COUNTERS },
+   { "nopt", DEBUG_NO_LLVM_OPT },
    {NULL, 0}
 };
 #endif
@@ -110,6 +116,16 @@ llvmpipe_get_param(struct pipe_screen *screen, int param)
       return 1;
    case PIPE_CAP_BLEND_EQUATION_SEPARATE:
       return 1;
+   case PIPE_CAP_INDEP_BLEND_ENABLE:
+      return 0;
+   case PIPE_CAP_INDEP_BLEND_FUNC:
+      return 0;
+   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
+   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+      return 1;
+   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
+   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
+      return 0;
    default:
       return 0;
    }
@@ -295,10 +311,12 @@ llvmpipe_create_screen(struct llvmpipe_winsys *winsys)
    screen->base.is_format_supported = llvmpipe_is_format_supported;
 
    screen->base.surface_buffer_create = llvmpipe_surface_buffer_create;
+   screen->base.context_create = llvmpipe_create_context;
    screen->base.flush_frontbuffer = llvmpipe_flush_frontbuffer;
 
    llvmpipe_init_screen_texture_funcs(&screen->base);
    llvmpipe_init_screen_buffer_funcs(&screen->base);
+   llvmpipe_init_screen_fence_funcs(&screen->base);
 
    lp_jit_screen_init(screen);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index b18f17c0cd..3186069899 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -26,1479 +26,704 @@
  **************************************************************************/
 
 /**
- * \brief  Primitive rasterization/rendering (points, lines, triangles)
+ * Tiling engine.
  *
- * \author  Keith Whitwell <keith@tungstengraphics.com>
- * \author  Brian Paul
+ * Builds per-tile display lists and executes them on calls to
+ * lp_setup_flush().
  */
 
-#include "lp_context.h"
-#include "lp_quad.h"
-#include "lp_setup.h"
-#include "lp_state.h"
-#include "draw/draw_context.h"
-#include "draw/draw_private.h"
-#include "draw/draw_vertex.h"
-#include "pipe/p_shader_tokens.h"
-#include "pipe/p_thread.h"
-#include "util/u_format.h"
-#include "util/u_math.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
 #include "util/u_memory.h"
-#include "lp_bld_debug.h"
-#include "lp_tile_cache.h"
-#include "lp_tile_soa.h"
-
-
-#define DEBUG_VERTS 0
-#define DEBUG_FRAGS 0
-
-/**
- * Triangle edge info
- */
-struct edge {
-   float dx;		/**< X(v1) - X(v0), used only during setup */
-   float dy;		/**< Y(v1) - Y(v0), used only during setup */
-   float dxdy;		/**< dx/dy */
-   float sx, sy;	/**< first sample point coord */
-   int lines;		/**< number of lines on this edge */
-};
-
-
-#define MAX_QUADS 16
-
+#include "util/u_pack_color.h"
+#include "util/u_surface.h"
+#include "lp_scene.h"
+#include "lp_scene_queue.h"
+#include "lp_buffer.h"
+#include "lp_texture.h"
+#include "lp_debug.h"
+#include "lp_fence.h"
+#include "lp_rast.h"
+#include "lp_setup_context.h"
 
-/**
- * Triangle setup info (derived from draw_stage).
- * Also used for line drawing (taking some liberties).
- */
-struct setup_context {
-   struct llvmpipe_context *llvmpipe;
-
-   /* Vertices are just an array of floats making up each attribute in
-    * turn.  Currently fixed at 4 floats, but should change in time.
-    * Codegen will help cope with this.
-    */
-   const float (*vmax)[4];
-   const float (*vmid)[4];
-   const float (*vmin)[4];
-   const float (*vprovoke)[4];
-
-   struct edge ebot;
-   struct edge etop;
-   struct edge emaj;
-
-   float oneoverarea;
-   int facing;
+#include "draw/draw_context.h"
+#include "draw/draw_vbuf.h"
 
-   float pixel_offset;
 
-   struct quad_header quad[MAX_QUADS];
-   struct quad_header *quad_ptrs[MAX_QUADS];
-   unsigned count;
+static void set_scene_state( struct setup_context *, unsigned );
 
-   struct quad_interp_coef coef;
 
-   struct {
-      int left[2];   /**< [0] = row0, [1] = row1 */
-      int right[2];
-      int y;
-   } span;
+struct lp_scene *
+lp_setup_get_current_scene(struct setup_context *setup)
+{
+   if (!setup->scene) {
 
-#if DEBUG_FRAGS
-   uint numFragsEmitted;  /**< per primitive */
-   uint numFragsWritten;  /**< per primitive */
-#endif
+      /* wait for a free/empty scene
+       */
+      setup->scene = lp_scene_dequeue(setup->empty_scenes, TRUE);
 
-   unsigned winding;		/* which winding to cull */
-};
+      if(0)lp_scene_reset( setup->scene ); /* XXX temporary? */
 
+      lp_scene_set_framebuffer_size(setup->scene,
+                                    setup->fb.width, 
+                                    setup->fb.height);
+   }
+   return setup->scene;
+}
 
 
-/**
- * Execute fragment shader for the four fragments in the quad.
- */
-ALIGN_STACK
 static void
-shade_quads(struct llvmpipe_context *llvmpipe,
-            struct quad_header *quads[],
-            unsigned nr)
+first_triangle( struct setup_context *setup,
+                const float (*v0)[4],
+                const float (*v1)[4],
+                const float (*v2)[4])
 {
-   struct lp_fragment_shader *fs = llvmpipe->fs;
-   struct quad_header *quad = quads[0];
-   const unsigned x = quad->input.x0;
-   const unsigned y = quad->input.y0;
-   uint8_t *tile;
-   uint8_t *color;
-   void *depth;
-   uint32_t ALIGN16_ATTRIB mask[4][NUM_CHANNELS];
-   unsigned chan_index;
-   unsigned q;
-
-   assert(fs->current);
-   if(!fs->current)
-      return;
-
-   /* Sanity checks */
-   assert(nr * QUAD_SIZE == TILE_VECTOR_HEIGHT * TILE_VECTOR_WIDTH);
-   assert(x % TILE_VECTOR_WIDTH == 0);
-   assert(y % TILE_VECTOR_HEIGHT == 0);
-   for (q = 0; q < nr; ++q) {
-      assert(quads[q]->input.x0 == x + q*2);
-      assert(quads[q]->input.y0 == y);
-   }
-
-   /* mask */
-   for (q = 0; q < 4; ++q)
-      for (chan_index = 0; chan_index < NUM_CHANNELS; ++chan_index)
-         mask[q][chan_index] = quads[q]->inout.mask & (1 << chan_index) ? ~0 : 0;
+   set_scene_state( setup, SETUP_ACTIVE );
+   lp_setup_choose_triangle( setup );
+   setup->triangle( setup, v0, v1, v2 );
+}
 
-   /* color buffer */
-   if(llvmpipe->framebuffer.nr_cbufs >= 1 &&
-      llvmpipe->framebuffer.cbufs[0]) {
-      tile = lp_get_cached_tile(llvmpipe->cbuf_cache[0], x, y);
-      color = &TILE_PIXEL(tile, x & (TILE_SIZE-1), y & (TILE_SIZE-1), 0);
-   }
-   else
-      color = NULL;
-
-   /* depth buffer */
-   if(llvmpipe->zsbuf_map) {
-      assert((x % 2) == 0);
-      assert((y % 2) == 0);
-      depth = llvmpipe->zsbuf_map +
-              y*llvmpipe->zsbuf_transfer->stride +
-              2*x*util_format_get_blocksize(llvmpipe->zsbuf_transfer->texture->format);
-   }
-   else
-      depth = NULL;
-
-   /* XXX: This will most likely fail on 32bit x86 without -mstackrealign */
-   assert(lp_check_alignment(mask, 16));
-
-   assert(lp_check_alignment(depth, 16));
-   assert(lp_check_alignment(color, 16));
-   assert(lp_check_alignment(llvmpipe->jit_context.blend_color, 16));
-
-   /* run shader */
-   fs->current->jit_function( &llvmpipe->jit_context,
-                              x, y,
-                              quad->coef->a0,
-                              quad->coef->dadx,
-                              quad->coef->dady,
-                              &mask[0][0],
-                              color,
-                              depth);
+static void
+first_line( struct setup_context *setup,
+	    const float (*v0)[4],
+	    const float (*v1)[4])
+{
+   set_scene_state( setup, SETUP_ACTIVE );
+   lp_setup_choose_line( setup );
+   setup->line( setup, v0, v1 );
 }
 
+static void
+first_point( struct setup_context *setup,
+	     const float (*v0)[4])
+{
+   set_scene_state( setup, SETUP_ACTIVE );
+   lp_setup_choose_point( setup );
+   setup->point( setup, v0 );
+}
 
+static void reset_context( struct setup_context *setup )
+{
+   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
 
+   /* Reset derived state */
+   setup->constants.stored_size = 0;
+   setup->constants.stored_data = NULL;
+   setup->fs.stored = NULL;
+   setup->dirty = ~0;
 
-/**
- * Do triangle cull test using tri determinant (sign indicates orientation)
- * \return true if triangle is to be culled.
- */
-static INLINE boolean
-cull_tri(const struct setup_context *setup, float det)
-{
-   if (det != 0) {   
-      /* if (det < 0 then Z points toward camera and triangle is 
-       * counter-clockwise winding.
-       */
-      unsigned winding = (det < 0) ? PIPE_WINDING_CCW : PIPE_WINDING_CW;
+   /* no current bin */
+   setup->scene = NULL;
 
-      if ((winding & setup->winding) == 0)
-	 return FALSE;
-   }
+   /* Reset some state:
+    */
+   setup->clear.flags = 0;
 
-   /* Culled:
+   /* Have an explicit "start-binning" call and get rid of this
+    * pointer twiddling?
     */
-   return TRUE;
+   setup->line = first_line;
+   setup->point = first_point;
+   setup->triangle = first_triangle;
 }
 
 
-
-/**
- * Clip setup->quad against the scissor/surface bounds.
- */
-static INLINE void
-quad_clip( struct setup_context *setup, struct quad_header *quad )
+/** Rasterize all scene's bins */
+static void
+lp_setup_rasterize_scene( struct setup_context *setup,
+			 boolean write_depth )
 {
-   const struct pipe_scissor_state *cliprect = &setup->llvmpipe->cliprect;
-   const int minx = (int) cliprect->minx;
-   const int maxx = (int) cliprect->maxx;
-   const int miny = (int) cliprect->miny;
-   const int maxy = (int) cliprect->maxy;
-
-   if (quad->input.x0 >= maxx ||
-       quad->input.y0 >= maxy ||
-       quad->input.x0 + 1 < minx ||
-       quad->input.y0 + 1 < miny) {
-      /* totally clipped */
-      quad->inout.mask = 0x0;
-      return;
-   }
-   if (quad->input.x0 < minx)
-      quad->inout.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
-   if (quad->input.y0 < miny)
-      quad->inout.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
-   if (quad->input.x0 == maxx - 1)
-      quad->inout.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
-   if (quad->input.y0 == maxy - 1)
-      quad->inout.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
-}
+   struct lp_scene *scene = lp_setup_get_current_scene(setup);
 
+   lp_rasterize_scene(setup->rast,
+                      scene,
+                      &setup->fb,
+                      write_depth);
 
+   reset_context( setup );
 
-/**
- * Given an X or Y coordinate, return the block/quad coordinate that it
- * belongs to.
- */
-static INLINE int block( int x )
-{
-   return x & ~(2-1);
+   LP_DBG(DEBUG_SETUP, "%s done \n", __FUNCTION__);
 }
 
-static INLINE int block_x( int x )
+
+
+static void
+begin_binning( struct setup_context *setup )
 {
-   return x & ~(TILE_VECTOR_WIDTH - 1);
+   struct lp_scene *scene = lp_setup_get_current_scene(setup);
+
+   LP_DBG(DEBUG_SETUP, "%s color: %s depth: %s\n", __FUNCTION__,
+          (setup->clear.flags & PIPE_CLEAR_COLOR) ? "clear": "load",
+          (setup->clear.flags & PIPE_CLEAR_DEPTHSTENCIL) ? "clear": "load");
+
+   if (setup->fb.nr_cbufs) {
+      if (setup->clear.flags & PIPE_CLEAR_COLOR)
+         lp_scene_bin_everywhere( scene, 
+				  lp_rast_clear_color, 
+				  setup->clear.color );
+      else
+         lp_scene_bin_everywhere( scene,
+				  lp_rast_load_color,
+				  lp_rast_arg_null() );
+   }
+
+   if (setup->fb.zsbuf) {
+      if (setup->clear.flags & PIPE_CLEAR_DEPTHSTENCIL)
+         lp_scene_bin_everywhere( scene, 
+				  lp_rast_clear_zstencil, 
+				  setup->clear.zstencil );
+      else
+         lp_scene_bin_everywhere( scene,
+				  lp_rast_load_zstencil,
+				  lp_rast_arg_null() );
+   }
+
+   LP_DBG(DEBUG_SETUP, "%s done\n", __FUNCTION__);
 }
 
 
-/**
- * Emit a quad (pass to next stage) with clipping.
+/* This basically bins and then flushes any outstanding full-screen
+ * clears.  
+ *
+ * TODO: fast path for fullscreen clears and no triangles.
  */
-static INLINE void
-clip_emit_quad( struct setup_context *setup, struct quad_header *quad )
+static void
+execute_clears( struct setup_context *setup )
 {
-   quad_clip( setup, quad );
-
-   if (quad->inout.mask) {
-      struct llvmpipe_context *lp = setup->llvmpipe;
-
-#if 1
-      /* XXX: The blender expects 4 quads. This is far from efficient, but
-       * until we codegenerate single-quad variants of the fragment pipeline
-       * we need this hack. */
-      const unsigned nr_quads = TILE_VECTOR_HEIGHT*TILE_VECTOR_WIDTH/QUAD_SIZE;
-      struct quad_header quads[4];
-      struct quad_header *quad_ptrs[4];
-      int x0 = block_x(quad->input.x0);
-      unsigned i;
-
-      assert(nr_quads == 4);
-
-      for(i = 0; i < nr_quads; ++i) {
-         int x = x0 + 2*i;
-         if(x == quad->input.x0)
-            memcpy(&quads[i], quad, sizeof quads[i]);
-         else {
-            memset(&quads[i], 0, sizeof quads[i]);
-            quads[i].input.x0 = x;
-            quads[i].input.y0 = quad->input.y0;
-            quads[i].coef = quad->coef;
-         }
-         quad_ptrs[i] = &quads[i];
-      }
+   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
 
-      shade_quads( lp, quad_ptrs, nr_quads );
-#else
-      shade_quads( lp, &quad, 1 );
-#endif
-   }
+   begin_binning( setup );
+   lp_setup_rasterize_scene( setup, TRUE );
 }
 
 
-/**
- * Render a horizontal span of quads
- */
-static void flush_spans( struct setup_context *setup )
+static void
+set_scene_state( struct setup_context *setup,
+           unsigned new_state )
 {
-   const int step = TILE_VECTOR_WIDTH;
-   const int xleft0 = setup->span.left[0];
-   const int xleft1 = setup->span.left[1];
-   const int xright0 = setup->span.right[0];
-   const int xright1 = setup->span.right[1];
-
-
-   int minleft = block_x(MIN2(xleft0, xleft1));
-   int maxright = MAX2(xright0, xright1);
-   int x;
-
-   for (x = minleft; x < maxright; x += step) {
-      unsigned skip_left0 = CLAMP(xleft0 - x, 0, step);
-      unsigned skip_left1 = CLAMP(xleft1 - x, 0, step);
-      unsigned skip_right0 = CLAMP(x + step - xright0, 0, step);
-      unsigned skip_right1 = CLAMP(x + step - xright1, 0, step);
-      unsigned lx = x;
-      const unsigned nr_quads = TILE_VECTOR_HEIGHT*TILE_VECTOR_WIDTH/QUAD_SIZE;
-      unsigned q = 0;
-
-      unsigned skipmask_left0 = (1U << skip_left0) - 1U;
-      unsigned skipmask_left1 = (1U << skip_left1) - 1U;
-
-      /* These calculations fail when step == 32 and skip_right == 0.
-       */
-      unsigned skipmask_right0 = ~0U << (unsigned)(step - skip_right0);
-      unsigned skipmask_right1 = ~0U << (unsigned)(step - skip_right1);
-
-      unsigned mask0 = ~skipmask_left0 & ~skipmask_right0;
-      unsigned mask1 = ~skipmask_left1 & ~skipmask_right1;
-
-      if (mask0 | mask1) {
-         for(q = 0; q < nr_quads; ++q) {
-            unsigned quadmask = (mask0 & 3) | ((mask1 & 3) << 2);
-            setup->quad[q].input.x0 = lx;
-            setup->quad[q].input.y0 = setup->span.y;
-            setup->quad[q].inout.mask = quadmask;
-            setup->quad_ptrs[q] = &setup->quad[q];
-            mask0 >>= 2;
-            mask1 >>= 2;
-            lx += 2;
-         }
-         assert(!(mask0 | mask1));
+   unsigned old_state = setup->state;
 
-         shade_quads(setup->llvmpipe, setup->quad_ptrs, nr_quads );
+   if (old_state == new_state)
+      return;
+       
+   LP_DBG(DEBUG_SETUP, "%s old %d new %d\n", __FUNCTION__, old_state, new_state);
+
+   switch (new_state) {
+   case SETUP_ACTIVE:
+      begin_binning( setup );
+      break;
+
+   case SETUP_CLEARED:
+      if (old_state == SETUP_ACTIVE) {
+         assert(0);
+         return;
       }
+      break;
+      
+   case SETUP_FLUSHED:
+      if (old_state == SETUP_CLEARED)
+         execute_clears( setup );
+      else
+         lp_setup_rasterize_scene( setup, TRUE );
+      break;
    }
 
-
-   setup->span.y = 0;
-   setup->span.right[0] = 0;
-   setup->span.right[1] = 0;
-   setup->span.left[0] = 1000000;     /* greater than right[0] */
-   setup->span.left[1] = 1000000;     /* greater than right[1] */
+   setup->state = new_state;
 }
 
 
-#if DEBUG_VERTS
-static void print_vertex(const struct setup_context *setup,
-                         const float (*v)[4])
+void
+lp_setup_flush( struct setup_context *setup,
+                unsigned flags )
 {
-   int i;
-   debug_printf("   Vertex: (%p)\n", v);
-   for (i = 0; i < setup->quad[0].nr_attrs; i++) {
-      debug_printf("     %d: %f %f %f %f\n",  i,
-              v[i][0], v[i][1], v[i][2], v[i][3]);
-      if (util_is_inf_or_nan(v[i][0])) {
-         debug_printf("   NaN!\n");
-      }
-   }
+   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
+
+   set_scene_state( setup, SETUP_FLUSHED );
 }
-#endif
 
-/**
- * Sort the vertices from top to bottom order, setting up the triangle
- * edge fields (ebot, emaj, etop).
- * \return FALSE if coords are inf/nan (cull the tri), TRUE otherwise
- */
-static boolean setup_sort_vertices( struct setup_context *setup,
-                                    float det,
-                                    const float (*v0)[4],
-                                    const float (*v1)[4],
-                                    const float (*v2)[4] )
-{
-   setup->vprovoke = v2;
-
-   /* determine bottom to top order of vertices */
-   {
-      float y0 = v0[0][1];
-      float y1 = v1[0][1];
-      float y2 = v2[0][1];
-      if (y0 <= y1) {
-	 if (y1 <= y2) {
-	    /* y0<=y1<=y2 */
-	    setup->vmin = v0;
-	    setup->vmid = v1;
-	    setup->vmax = v2;
-	 }
-	 else if (y2 <= y0) {
-	    /* y2<=y0<=y1 */
-	    setup->vmin = v2;
-	    setup->vmid = v0;
-	    setup->vmax = v1;
-	 }
-	 else {
-	    /* y0<=y2<=y1 */
-	    setup->vmin = v0;
-	    setup->vmid = v2;
-	    setup->vmax = v1;
-	 }
-      }
-      else {
-	 if (y0 <= y2) {
-	    /* y1<=y0<=y2 */
-	    setup->vmin = v1;
-	    setup->vmid = v0;
-	    setup->vmax = v2;
-	 }
-	 else if (y2 <= y1) {
-	    /* y2<=y1<=y0 */
-	    setup->vmin = v2;
-	    setup->vmid = v1;
-	    setup->vmax = v0;
-	 }
-	 else {
-	    /* y1<=y2<=y0 */
-	    setup->vmin = v1;
-	    setup->vmid = v2;
-	    setup->vmax = v0;
-	 }
-      }
-   }
 
-   setup->ebot.dx = setup->vmid[0][0] - setup->vmin[0][0];
-   setup->ebot.dy = setup->vmid[0][1] - setup->vmin[0][1];
-   setup->emaj.dx = setup->vmax[0][0] - setup->vmin[0][0];
-   setup->emaj.dy = setup->vmax[0][1] - setup->vmin[0][1];
-   setup->etop.dx = setup->vmax[0][0] - setup->vmid[0][0];
-   setup->etop.dy = setup->vmax[0][1] - setup->vmid[0][1];
-
-   /*
-    * Compute triangle's area.  Use 1/area to compute partial
-    * derivatives of attributes later.
-    *
-    * The area will be the same as prim->det, but the sign may be
-    * different depending on how the vertices get sorted above.
-    *
-    * To determine whether the primitive is front or back facing we
-    * use the prim->det value because its sign is correct.
-    */
-   {
-      const float area = (setup->emaj.dx * setup->ebot.dy -
-			    setup->ebot.dx * setup->emaj.dy);
-
-      setup->oneoverarea = 1.0f / area;
-
-      /*
-      debug_printf("%s one-over-area %f  area %f  det %f\n",
-                   __FUNCTION__, setup->oneoverarea, area, det );
-      */
-      if (util_is_inf_or_nan(setup->oneoverarea))
-         return FALSE;
-   }
+void
+lp_setup_bind_framebuffer( struct setup_context *setup,
+                           const struct pipe_framebuffer_state *fb )
+{
+   struct lp_scene *scene = lp_setup_get_current_scene(setup);
 
-   /* We need to know if this is a front or back-facing triangle for:
-    *  - the GLSL gl_FrontFacing fragment attribute (bool)
-    *  - two-sided stencil test
-    */
-   setup->facing = 
-      ((det > 0.0) ^ 
-       (setup->llvmpipe->rasterizer->front_winding == PIPE_WINDING_CW));
+   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
 
-   /* Prepare pixel offset for rasterisation:
-    *  - pixel center (0.5, 0.5) for GL, or
-    *  - assume (0.0, 0.0) for other APIs.
-    */
-   if (setup->llvmpipe->rasterizer->gl_rasterization_rules) {
-      setup->pixel_offset = 0.5f;
-   } else {
-      setup->pixel_offset = 0.0f;
-   }
+   set_scene_state( setup, SETUP_FLUSHED );
 
-   return TRUE;
-}
+   /* re-get scene pointer, may have a new scene after flushing */
+   scene = lp_setup_get_current_scene(setup);
 
+   util_copy_framebuffer_state(&setup->fb, fb);
 
-/**
- * Compute a0, dadx and dady for a linearly interpolated coefficient,
- * for a triangle.
- */
-static void tri_pos_coeff( struct setup_context *setup,
-                           uint vertSlot, unsigned i)
-{
-   float botda = setup->vmid[vertSlot][i] - setup->vmin[vertSlot][i];
-   float majda = setup->vmax[vertSlot][i] - setup->vmin[vertSlot][i];
-   float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
-   float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
-   float dadx = a * setup->oneoverarea;
-   float dady = b * setup->oneoverarea;
-
-   assert(i <= 3);
-
-   setup->coef.dadx[0][i] = dadx;
-   setup->coef.dady[0][i] = dady;
-
-   /* calculate a0 as the value which would be sampled for the
-    * fragment at (0,0), taking into account that we want to sample at
-    * pixel centers, in other words (pixel_offset, pixel_offset).
-    *
-    * this is neat but unfortunately not a good way to do things for
-    * triangles with very large values of dadx or dady as it will
-    * result in the subtraction and re-addition from a0 of a very
-    * large number, which means we'll end up loosing a lot of the
-    * fractional bits and precision from a0.  the way to fix this is
-    * to define a0 as the sample at a pixel center somewhere near vmin
-    * instead - i'll switch to this later.
-    */
-   setup->coef.a0[0][i] = (setup->vmin[vertSlot][i] -
-                           (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
-                            dady * (setup->vmin[0][1] - setup->pixel_offset)));
-
-   /*
-   debug_printf("attr[%d].%c: %f dx:%f dy:%f\n",
-                slot, "xyzw"[i],
-                setup->coef[slot].a0[i],
-                setup->coef[slot].dadx[i],
-                setup->coef[slot].dady[i]);
-   */
+   lp_scene_set_framebuffer_size(scene, setup->fb.width, setup->fb.height);
 }
 
 
-/**
- * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
- * The value value comes from vertex[slot][i].
- * The result will be put into setup->coef[slot].a0[i].
- * \param slot  which attribute slot
- * \param i  which component of the slot (0..3)
- */
-static void const_pos_coeff( struct setup_context *setup,
-                             uint vertSlot, unsigned i)
+void
+lp_setup_clear( struct setup_context *setup,
+                const float *color,
+                double depth,
+                unsigned stencil,
+                unsigned flags )
 {
-   setup->coef.dadx[0][i] = 0;
-   setup->coef.dady[0][i] = 0;
-
-   /* need provoking vertex info!
-    */
-   setup->coef.a0[0][i] = setup->vprovoke[vertSlot][i];
-}
+   struct lp_scene *scene = lp_setup_get_current_scene(setup);
+   unsigned i;
 
+   LP_DBG(DEBUG_SETUP, "%s state %d\n", __FUNCTION__, setup->state);
 
-/**
- * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
- * The value value comes from vertex[slot][i].
- * The result will be put into setup->coef[slot].a0[i].
- * \param slot  which attribute slot
- * \param i  which component of the slot (0..3)
- */
-static void const_coeff( struct setup_context *setup,
-                         unsigned attrib,
-                         uint vertSlot)
-{
-   unsigned i;
-   for (i = 0; i < NUM_CHANNELS; ++i) {
-      setup->coef.dadx[1 + attrib][i] = 0;
-      setup->coef.dady[1 + attrib][i] = 0;
 
-      /* need provoking vertex info!
-       */
-      setup->coef.a0[1 + attrib][i] = setup->vprovoke[vertSlot][i];
+   if (flags & PIPE_CLEAR_COLOR) {
+      for (i = 0; i < 4; ++i)
+         setup->clear.color.clear_color[i] = float_to_ubyte(color[i]);
    }
-}
 
-
-/**
- * Compute a0, dadx and dady for a linearly interpolated coefficient,
- * for a triangle.
- */
-static void tri_linear_coeff( struct setup_context *setup,
-                              unsigned attrib,
-                              uint vertSlot)
-{
-   unsigned i;
-   for (i = 0; i < NUM_CHANNELS; ++i) {
-      float botda = setup->vmid[vertSlot][i] - setup->vmin[vertSlot][i];
-      float majda = setup->vmax[vertSlot][i] - setup->vmin[vertSlot][i];
-      float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
-      float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
-      float dadx = a * setup->oneoverarea;
-      float dady = b * setup->oneoverarea;
-
-      assert(i <= 3);
-
-      setup->coef.dadx[1 + attrib][i] = dadx;
-      setup->coef.dady[1 + attrib][i] = dady;
-
-      /* calculate a0 as the value which would be sampled for the
-       * fragment at (0,0), taking into account that we want to sample at
-       * pixel centers, in other words (0.5, 0.5).
-       *
-       * this is neat but unfortunately not a good way to do things for
-       * triangles with very large values of dadx or dady as it will
-       * result in the subtraction and re-addition from a0 of a very
-       * large number, which means we'll end up loosing a lot of the
-       * fractional bits and precision from a0.  the way to fix this is
-       * to define a0 as the sample at a pixel center somewhere near vmin
-       * instead - i'll switch to this later.
-       */
-      setup->coef.a0[1 + attrib][i] = (setup->vmin[vertSlot][i] -
-                     (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
-                      dady * (setup->vmin[0][1] - setup->pixel_offset)));
-
-      /*
-      debug_printf("attr[%d].%c: %f dx:%f dy:%f\n",
-                   slot, "xyzw"[i],
-                   setup->coef[slot].a0[i],
-                   setup->coef[slot].dadx[i],
-                   setup->coef[slot].dady[i]);
-      */
+   if (flags & PIPE_CLEAR_DEPTHSTENCIL) {
+      setup->clear.zstencil.clear_zstencil = 
+         util_pack_z_stencil(setup->fb.zsbuf->format, 
+                             depth,
+                             stencil);
    }
-}
 
+   if (setup->state == SETUP_ACTIVE) {
+      /* Add the clear to the existing scene, binning only the
+       * buffers named in this call's 'flags'.  When both color and
+       * depth-stencil are cleared after some rendering we could
+       * instead discard the currently binned scene and start again,
+       * but I don't see that as being a common usage.
+       */
+      if (flags & PIPE_CLEAR_COLOR)
+         lp_scene_bin_everywhere( scene, 
+                                  lp_rast_clear_color,
+                                  setup->clear.color );
 
-/**
- * Compute a0, dadx and dady for a perspective-corrected interpolant,
- * for a triangle.
- * We basically multiply the vertex value by 1/w before computing
- * the plane coefficients (a0, dadx, dady).
- * Later, when we compute the value at a particular fragment position we'll
- * divide the interpolated value by the interpolated W at that fragment.
- */
-static void tri_persp_coeff( struct setup_context *setup,
-                             unsigned attrib,
-                             uint vertSlot)
-{
-   unsigned i;
-   for (i = 0; i < NUM_CHANNELS; ++i) {
-      /* premultiply by 1/w  (v[0][3] is always W):
+      if (flags & PIPE_CLEAR_DEPTHSTENCIL)
+         lp_scene_bin_everywhere( scene, 
+                                  lp_rast_clear_zstencil,
+                                  setup->clear.zstencil );
+   }
+   else {
+      /* Put ourselves into the 'pre-clear' state, specifically to try
+       * and accumulate multiple clears to color and depth_stencil
+       * buffers which the app or state-tracker might issue
+       * separately.
        */
-      float mina = setup->vmin[vertSlot][i] * setup->vmin[0][3];
-      float mida = setup->vmid[vertSlot][i] * setup->vmid[0][3];
-      float maxa = setup->vmax[vertSlot][i] * setup->vmax[0][3];
-      float botda = mida - mina;
-      float majda = maxa - mina;
-      float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
-      float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
-      float dadx = a * setup->oneoverarea;
-      float dady = b * setup->oneoverarea;
-
-      /*
-      debug_printf("tri persp %d,%d: %f %f %f\n", vertSlot, i,
-                   setup->vmin[vertSlot][i],
-                   setup->vmid[vertSlot][i],
-                   setup->vmax[vertSlot][i]
-             );
-      */
-      assert(i <= 3);
-
-      setup->coef.dadx[1 + attrib][i] = dadx;
-      setup->coef.dady[1 + attrib][i] = dady;
-      setup->coef.a0[1 + attrib][i] = (mina -
-                     (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
-                      dady * (setup->vmin[0][1] - setup->pixel_offset)));
+      set_scene_state( setup, SETUP_CLEARED );
+
+      setup->clear.flags |= flags;
    }
 }
 
 
 /**
- * Special coefficient setup for gl_FragCoord.
- * X and Y are trivial, though Y has to be inverted for OpenGL.
- * Z and W are copied from posCoef which should have already been computed.
- * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask.
+ * Emit a fence.
  */
-static void
-setup_fragcoord_coeff(struct setup_context *setup, uint slot)
+struct pipe_fence_handle *
+lp_setup_fence( struct setup_context *setup )
 {
-   /*X*/
-   setup->coef.a0[1 + slot][0] = 0;
-   setup->coef.dadx[1 + slot][0] = 1.0;
-   setup->coef.dady[1 + slot][0] = 0.0;
-   /*Y*/
-   setup->coef.a0[1 + slot][1] = 0.0;
-   setup->coef.dadx[1 + slot][1] = 0.0;
-   setup->coef.dady[1 + slot][1] = 1.0;
-   /*Z*/
-   setup->coef.a0[1 + slot][2] = setup->coef.a0[0][2];
-   setup->coef.dadx[1 + slot][2] = setup->coef.dadx[0][2];
-   setup->coef.dady[1 + slot][2] = setup->coef.dady[0][2];
-   /*W*/
-   setup->coef.a0[1 + slot][3] = setup->coef.a0[0][3];
-   setup->coef.dadx[1 + slot][3] = setup->coef.dadx[0][3];
-   setup->coef.dady[1 + slot][3] = setup->coef.dady[0][3];
-}
+   struct lp_scene *scene = lp_setup_get_current_scene(setup);
+   const unsigned rank = lp_scene_get_num_bins( scene ); /* xxx */
+   struct lp_fence *fence = lp_fence_create(rank);
 
+   LP_DBG(DEBUG_SETUP, "%s rank %u\n", __FUNCTION__, rank);
 
+   set_scene_state( setup, SETUP_ACTIVE );
 
-/**
- * Compute the setup->coef[] array dadx, dady, a0 values.
- * Must be called after setup->vmin,vmid,vmax,vprovoke are initialized.
- */
-static void setup_tri_coefficients( struct setup_context *setup )
-{
-   struct llvmpipe_context *llvmpipe = setup->llvmpipe;
-   const struct lp_fragment_shader *lpfs = llvmpipe->fs;
-   const struct vertex_info *vinfo = llvmpipe_get_vertex_info(llvmpipe);
-   uint fragSlot;
+   /* insert the fence into all command bins */
+   lp_scene_bin_everywhere( scene,
+			    lp_rast_fence,
+			    lp_rast_arg_fence(fence) );
 
-   /* z and w are done by linear interpolation:
-    */
-   tri_pos_coeff(setup, 0, 2);
-   tri_pos_coeff(setup, 0, 3);
+   return (struct pipe_fence_handle *) fence;
+}
 
-   /* setup interpolation for all the remaining attributes:
-    */
-   for (fragSlot = 0; fragSlot < lpfs->info.num_inputs; fragSlot++) {
-      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
 
-      switch (vinfo->attrib[fragSlot].interp_mode) {
-      case INTERP_CONSTANT:
-         const_coeff(setup, fragSlot, vertSlot);
-         break;
-      case INTERP_LINEAR:
-         tri_linear_coeff(setup, fragSlot, vertSlot);
-         break;
-      case INTERP_PERSPECTIVE:
-         tri_persp_coeff(setup, fragSlot, vertSlot);
-         break;
-      case INTERP_POS:
-         setup_fragcoord_coeff(setup, fragSlot);
-         break;
-      default:
-         assert(0);
-      }
+void 
+lp_setup_set_triangle_state( struct setup_context *setup,
+                             unsigned cull_mode,
+                             boolean ccw_is_frontface,
+                             boolean scissor )
+{
+   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
 
-      if (lpfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
-         setup->coef.a0[1 + fragSlot][0] = 1.0f - setup->facing;
-         setup->coef.dadx[1 + fragSlot][0] = 0.0;
-         setup->coef.dady[1 + fragSlot][0] = 0.0;
-      }
-   }
+   setup->ccw_is_frontface = ccw_is_frontface;
+   setup->cullmode = cull_mode;
+   setup->triangle = first_triangle;
+   setup->scissor_test = scissor;
 }
 
 
 
-static void setup_tri_edges( struct setup_context *setup )
+void
+lp_setup_set_fs_inputs( struct setup_context *setup,
+                        const struct lp_shader_input *input,
+                        unsigned nr )
 {
-   float vmin_x = setup->vmin[0][0] + setup->pixel_offset;
-   float vmid_x = setup->vmid[0][0] + setup->pixel_offset;
-
-   float vmin_y = setup->vmin[0][1] - setup->pixel_offset;
-   float vmid_y = setup->vmid[0][1] - setup->pixel_offset;
-   float vmax_y = setup->vmax[0][1] - setup->pixel_offset;
-
-   setup->emaj.sy = ceilf(vmin_y);
-   setup->emaj.lines = (int) ceilf(vmax_y - setup->emaj.sy);
-   setup->emaj.dxdy = setup->emaj.dx / setup->emaj.dy;
-   setup->emaj.sx = vmin_x + (setup->emaj.sy - vmin_y) * setup->emaj.dxdy;
-
-   setup->etop.sy = ceilf(vmid_y);
-   setup->etop.lines = (int) ceilf(vmax_y - setup->etop.sy);
-   setup->etop.dxdy = setup->etop.dx / setup->etop.dy;
-   setup->etop.sx = vmid_x + (setup->etop.sy - vmid_y) * setup->etop.dxdy;
-
-   setup->ebot.sy = ceilf(vmin_y);
-   setup->ebot.lines = (int) ceilf(vmid_y - setup->ebot.sy);
-   setup->ebot.dxdy = setup->ebot.dx / setup->ebot.dy;
-   setup->ebot.sx = vmin_x + (setup->ebot.sy - vmin_y) * setup->ebot.dxdy;
-}
+   LP_DBG(DEBUG_SETUP, "%s %p %u\n", __FUNCTION__, (void *) input, nr);
 
+   memcpy( setup->fs.input, input, nr * sizeof input[0] );
+   setup->fs.nr_inputs = nr;
+}
 
-/**
- * Render the upper or lower half of a triangle.
- * Scissoring/cliprect is applied here too.
- */
-static void subtriangle( struct setup_context *setup,
-			 struct edge *eleft,
-			 struct edge *eright,
-			 unsigned lines )
+void
+lp_setup_set_fs_functions( struct setup_context *setup,
+                           lp_jit_frag_func jit_function0,
+                           lp_jit_frag_func jit_function1,
+                           boolean opaque )
 {
-   const struct pipe_scissor_state *cliprect = &setup->llvmpipe->cliprect;
-   const int minx = (int) cliprect->minx;
-   const int maxx = (int) cliprect->maxx;
-   const int miny = (int) cliprect->miny;
-   const int maxy = (int) cliprect->maxy;
-   int y, start_y, finish_y;
-   int sy = (int)eleft->sy;
-
-   assert((int)eleft->sy == (int) eright->sy);
-
-   /* clip top/bottom */
-   start_y = sy;
-   if (start_y < miny)
-      start_y = miny;
-
-   finish_y = sy + lines;
-   if (finish_y > maxy)
-      finish_y = maxy;
-
-   start_y -= sy;
-   finish_y -= sy;
-
-   /*
-   debug_printf("%s %d %d\n", __FUNCTION__, start_y, finish_y);
-   */
-
-   for (y = start_y; y < finish_y; y++) {
-
-      /* avoid accumulating adds as floats don't have the precision to
-       * accurately iterate large triangle edges that way.  luckily we
-       * can just multiply these days.
-       *
-       * this is all drowned out by the attribute interpolation anyway.
-       */
-      int left = (int)(eleft->sx + y * eleft->dxdy);
-      int right = (int)(eright->sx + y * eright->dxdy);
-
-      /* clip left/right */
-      if (left < minx)
-         left = minx;
-      if (right > maxx)
-         right = maxx;
-
-      if (left < right) {
-         int _y = sy + y;
-         if (block(_y) != setup->span.y) {
-            flush_spans(setup);
-            setup->span.y = block(_y);
-         }
+   LP_DBG(DEBUG_SETUP, "%s %p\n", __FUNCTION__, (void *) jit_function0);
+   /* FIXME: reference count */
 
-         setup->span.left[_y&1] = left;
-         setup->span.right[_y&1] = right;
-      }
-   }
-
-
-   /* save the values so that emaj can be restarted:
-    */
-   eleft->sx += lines * eleft->dxdy;
-   eright->sx += lines * eright->dxdy;
-   eleft->sy += lines;
-   eright->sy += lines;
+   setup->fs.current.jit_function[0] = jit_function0;
+   setup->fs.current.jit_function[1] = jit_function1;
+   setup->fs.current.opaque = opaque;
+   setup->dirty |= LP_SETUP_NEW_FS;
 }
 
-
-/**
- * Recalculate prim's determinant.  This is needed as we don't have
- * get this information through the vbuf_render interface & we must
- * calculate it here.
- */
-static float
-calc_det( const float (*v0)[4],
-          const float (*v1)[4],
-          const float (*v2)[4] )
+void
+lp_setup_set_fs_constants(struct setup_context *setup,
+                          struct pipe_buffer *buffer)
 {
-   /* edge vectors e = v0 - v2, f = v1 - v2 */
-   const float ex = v0[0][0] - v2[0][0];
-   const float ey = v0[0][1] - v2[0][1];
-   const float fx = v1[0][0] - v2[0][0];
-   const float fy = v1[0][1] - v2[0][1];
-
-   /* det = cross(e,f).z */
-   return ex * fy - ey * fx;
+   LP_DBG(DEBUG_SETUP, "%s %p\n", __FUNCTION__, (void *) buffer);
+
+   pipe_buffer_reference(&setup->constants.current, buffer);
+
+   setup->dirty |= LP_SETUP_NEW_CONSTANTS;
 }
 
 
-/**
- * Do setup for triangle rasterization, then render the triangle.
- */
-void llvmpipe_setup_tri( struct setup_context *setup,
-                const float (*v0)[4],
-                const float (*v1)[4],
-                const float (*v2)[4] )
+void
+lp_setup_set_alpha_ref_value( struct setup_context *setup,
+                              float alpha_ref_value )
 {
-   float det;
-
-#if DEBUG_VERTS
-   debug_printf("Setup triangle:\n");
-   print_vertex(setup, v0);
-   print_vertex(setup, v1);
-   print_vertex(setup, v2);
-#endif
+   LP_DBG(DEBUG_SETUP, "%s %f\n", __FUNCTION__, alpha_ref_value);
 
-   if (setup->llvmpipe->no_rast)
-      return;
-   
-   det = calc_det(v0, v1, v2);
-   /*
-   debug_printf("%s\n", __FUNCTION__ );
-   */
+   if(setup->fs.current.jit_context.alpha_ref_value != alpha_ref_value) {
+      setup->fs.current.jit_context.alpha_ref_value = alpha_ref_value;
+      setup->dirty |= LP_SETUP_NEW_FS;
+   }
+}
 
-#if DEBUG_FRAGS
-   setup->numFragsEmitted = 0;
-   setup->numFragsWritten = 0;
-#endif
+void
+lp_setup_set_blend_color( struct setup_context *setup,
+                          const struct pipe_blend_color *blend_color )
+{
+   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
 
-   if (cull_tri( setup, det ))
-      return;
+   assert(blend_color);
 
-   if (!setup_sort_vertices( setup, det, v0, v1, v2 ))
-      return;
-   setup_tri_coefficients( setup );
-   setup_tri_edges( setup );
+   if(memcmp(&setup->blend_color.current, blend_color, sizeof *blend_color) != 0) {
+      memcpy(&setup->blend_color.current, blend_color, sizeof *blend_color);
+      setup->dirty |= LP_SETUP_NEW_BLEND_COLOR;
+   }
+}
 
-   assert(setup->llvmpipe->reduced_prim == PIPE_PRIM_TRIANGLES);
 
-   setup->span.y = 0;
-   setup->span.right[0] = 0;
-   setup->span.right[1] = 0;
-   /*   setup->span.z_mode = tri_z_mode( setup->ctx ); */
+void
+lp_setup_set_scissor( struct setup_context *setup,
+                      const struct pipe_scissor_state *scissor )
+{
+   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
 
-   /*   init_constant_attribs( setup ); */
+   assert(scissor);
 
-   if (setup->oneoverarea < 0.0) {
-      /* emaj on left:
-       */
-      subtriangle( setup, &setup->emaj, &setup->ebot, setup->ebot.lines );
-      subtriangle( setup, &setup->emaj, &setup->etop, setup->etop.lines );
-   }
-   else {
-      /* emaj on right:
-       */
-      subtriangle( setup, &setup->ebot, &setup->emaj, setup->ebot.lines );
-      subtriangle( setup, &setup->etop, &setup->emaj, setup->etop.lines );
+   if (memcmp(&setup->scissor.current, scissor, sizeof(*scissor)) != 0) {
+      setup->scissor.current = *scissor; /* struct copy */
+      setup->dirty |= LP_SETUP_NEW_SCISSOR;
    }
-
-   flush_spans( setup );
-
-#if DEBUG_FRAGS
-   printf("Tri: %u frags emitted, %u written\n",
-          setup->numFragsEmitted,
-          setup->numFragsWritten);
-#endif
 }
 
 
-
-/**
- * Compute a0, dadx and dady for a linearly interpolated coefficient,
- * for a line.
- */
-static void
-linear_pos_coeff(struct setup_context *setup,
-                 uint vertSlot, uint i)
+void 
+lp_setup_set_flatshade_first( struct setup_context *setup,
+                              boolean flatshade_first )
 {
-   const float da = setup->vmax[vertSlot][i] - setup->vmin[vertSlot][i];
-   const float dadx = da * setup->emaj.dx * setup->oneoverarea;
-   const float dady = da * setup->emaj.dy * setup->oneoverarea;
-   setup->coef.dadx[0][i] = dadx;
-   setup->coef.dady[0][i] = dady;
-   setup->coef.a0[0][i] = (setup->vmin[vertSlot][i] -
-                           (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
-                            dady * (setup->vmin[0][1] - setup->pixel_offset)));
+   setup->flatshade_first = flatshade_first;
 }
 
 
-/**
- * Compute a0, dadx and dady for a linearly interpolated coefficient,
- * for a line.
- */
-static void
-line_linear_coeff(struct setup_context *setup,
-                  unsigned attrib,
-                  uint vertSlot)
+void 
+lp_setup_set_vertex_info( struct setup_context *setup,
+                          struct vertex_info *vertex_info )
 {
-   unsigned i;
-   for (i = 0; i < NUM_CHANNELS; ++i) {
-      const float da = setup->vmax[vertSlot][i] - setup->vmin[vertSlot][i];
-      const float dadx = da * setup->emaj.dx * setup->oneoverarea;
-      const float dady = da * setup->emaj.dy * setup->oneoverarea;
-      setup->coef.dadx[1 + attrib][i] = dadx;
-      setup->coef.dady[1 + attrib][i] = dady;
-      setup->coef.a0[1 + attrib][i] = (setup->vmin[vertSlot][i] -
-                     (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
-                      dady * (setup->vmin[0][1] - setup->pixel_offset)));
-   }
+   /* XXX: only the pointer is stored, not a copy -- the caller's
+    * vertex_info must stay valid for as long as setup uses it. */
+   setup->vertex_info = vertex_info;
 }
 
 
 /**
- * Compute a0, dadx and dady for a perspective-corrected interpolant,
- * for a line.
+ * Called during state validation when LP_NEW_TEXTURE is set.
  */
-static void
-line_persp_coeff(struct setup_context *setup,
-                 unsigned attrib,
-                 uint vertSlot)
+void
+lp_setup_set_sampler_textures( struct setup_context *setup,
+                               unsigned num, struct pipe_texture **texture)
 {
    unsigned i;
-   for (i = 0; i < NUM_CHANNELS; ++i) {
-      /* XXX double-check/verify this arithmetic */
-      const float a0 = setup->vmin[vertSlot][i] * setup->vmin[0][3];
-      const float a1 = setup->vmax[vertSlot][i] * setup->vmax[0][3];
-      const float da = a1 - a0;
-      const float dadx = da * setup->emaj.dx * setup->oneoverarea;
-      const float dady = da * setup->emaj.dy * setup->oneoverarea;
-      setup->coef.dadx[1 + attrib][i] = dadx;
-      setup->coef.dady[1 + attrib][i] = dady;
-      setup->coef.a0[1 + attrib][i] = (setup->vmin[vertSlot][i] -
-                     (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
-                      dady * (setup->vmin[0][1] - setup->pixel_offset)));
+
+   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
+
+   assert(num <= PIPE_MAX_SAMPLERS);
+
+   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
+      struct pipe_texture *tex = i < num ? texture[i] : NULL;
+
+      if(tex) {
+         struct llvmpipe_texture *lp_tex = llvmpipe_texture(tex);
+         struct lp_jit_texture *jit_tex;
+         jit_tex = &setup->fs.current.jit_context.textures[i];
+         jit_tex->width = tex->width0;
+         jit_tex->height = tex->height0;
+         jit_tex->stride = lp_tex->stride[0];
+         if(!lp_tex->dt)
+            jit_tex->data = lp_tex->data;
+         else
+            /* FIXME: map the rendertarget */
+            assert(0);
+
+         /* the scene references this texture */
+         {
+            struct lp_scene *scene = lp_setup_get_current_scene(setup);
+            lp_scene_texture_reference(scene, tex);
+         }
+      }
    }
+
+   setup->dirty |= LP_SETUP_NEW_FS;
 }
 
 
 /**
- * Compute the setup->coef[] array dadx, dady, a0 values.
- * Must be called after setup->vmin,vmax are initialized.
+ * Is the given texture referenced by any scene?
+ * Note: we have to check all scenes including any scenes currently
+ * being rendered and the current scene being built.
  */
-static INLINE boolean
-setup_line_coefficients(struct setup_context *setup,
-                        const float (*v0)[4],
-                        const float (*v1)[4])
+unsigned
+lp_setup_is_texture_referenced( const struct setup_context *setup,
+                                const struct pipe_texture *texture )
 {
-   struct llvmpipe_context *llvmpipe = setup->llvmpipe;
-   const struct lp_fragment_shader *lpfs = llvmpipe->fs;
-   const struct vertex_info *vinfo = llvmpipe_get_vertex_info(llvmpipe);
-   uint fragSlot;
-   float area;
-
-   /* use setup->vmin, vmax to point to vertices */
-   if (llvmpipe->rasterizer->flatshade_first)
-      setup->vprovoke = v0;
-   else
-      setup->vprovoke = v1;
-   setup->vmin = v0;
-   setup->vmax = v1;
-
-   setup->emaj.dx = setup->vmax[0][0] - setup->vmin[0][0];
-   setup->emaj.dy = setup->vmax[0][1] - setup->vmin[0][1];
-
-   /* NOTE: this is not really area but something proportional to it */
-   area = setup->emaj.dx * setup->emaj.dx + setup->emaj.dy * setup->emaj.dy;
-   if (area == 0.0f || util_is_inf_or_nan(area))
-      return FALSE;
-   setup->oneoverarea = 1.0f / area;
-
-   /* z and w are done by linear interpolation:
-    */
-   linear_pos_coeff(setup, 0, 2);
-   linear_pos_coeff(setup, 0, 3);
-
-   /* setup interpolation for all the remaining attributes:
-    */
-   for (fragSlot = 0; fragSlot < lpfs->info.num_inputs; fragSlot++) {
-      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
-
-      switch (vinfo->attrib[fragSlot].interp_mode) {
-      case INTERP_CONSTANT:
-         const_coeff(setup, fragSlot, vertSlot);
-         break;
-      case INTERP_LINEAR:
-         line_linear_coeff(setup, fragSlot, vertSlot);
-         break;
-      case INTERP_PERSPECTIVE:
-         line_persp_coeff(setup, fragSlot, vertSlot);
-         break;
-      case INTERP_POS:
-         setup_fragcoord_coeff(setup, fragSlot);
-         break;
-      default:
-         assert(0);
-      }
+   unsigned i;
 
-      if (lpfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
-         setup->coef.a0[1 + fragSlot][0] = 1.0f - setup->facing;
-         setup->coef.dadx[1 + fragSlot][0] = 0.0;
-         setup->coef.dady[1 + fragSlot][0] = 0.0;
-      }
+   /* check the render targets */
+   for (i = 0; i < setup->fb.nr_cbufs; i++) {
+      if (setup->fb.cbufs[i]->texture == texture)
+         return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
+   }
+   if (setup->fb.zsbuf && setup->fb.zsbuf->texture == texture) {
+      return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
    }
-   return TRUE;
-}
-
 
-/**
- * Plot a pixel in a line segment.
- */
-static INLINE void
-plot(struct setup_context *setup, int x, int y)
-{
-   const int iy = y & 1;
-   const int ix = x & 1;
-   const int quadX = x - ix;
-   const int quadY = y - iy;
-   const int mask = (1 << ix) << (2 * iy);
-
-   if (quadX != setup->quad[0].input.x0 ||
-       quadY != setup->quad[0].input.y0)
-   {
-      /* flush prev quad, start new quad */
-
-      if (setup->quad[0].input.x0 != -1)
-         clip_emit_quad( setup, &setup->quad[0] );
-
-      setup->quad[0].input.x0 = quadX;
-      setup->quad[0].input.y0 = quadY;
-      setup->quad[0].inout.mask = 0x0;
+   /* check textures referenced by the scene */
+   for (i = 0; i < Elements(setup->scenes); i++) {
+      if (lp_scene_is_texture_referenced(setup->scenes[i], texture)) {
+         return PIPE_REFERENCED_FOR_READ;
+      }
    }
 
-   setup->quad[0].inout.mask |= mask;
+   return PIPE_UNREFERENCED;
 }
 
 
 /**
- * Do setup for line rasterization, then render the line.
- * Single-pixel width, no stipple, etc.  We rely on the 'draw' module
- * to handle stippling and wide lines.
+ * Called by vbuf code when we're about to draw something.
  */
 void
-llvmpipe_setup_line(struct setup_context *setup,
-           const float (*v0)[4],
-           const float (*v1)[4])
+lp_setup_update_state( struct setup_context *setup )
 {
-   int x0 = (int) v0[0][0];
-   int x1 = (int) v1[0][0];
-   int y0 = (int) v0[0][1];
-   int y1 = (int) v1[0][1];
-   int dx = x1 - x0;
-   int dy = y1 - y0;
-   int xstep, ystep;
-
-#if DEBUG_VERTS
-   debug_printf("Setup line:\n");
-   print_vertex(setup, v0);
-   print_vertex(setup, v1);
-#endif
-
-   if (setup->llvmpipe->no_rast)
-      return;
-
-   if (dx == 0 && dy == 0)
-      return;
+   struct lp_scene *scene = lp_setup_get_current_scene(setup);
 
-   if (!setup_line_coefficients(setup, v0, v1))
-      return;
-
-   assert(v0[0][0] < 1.0e9);
-   assert(v0[0][1] < 1.0e9);
-   assert(v1[0][0] < 1.0e9);
-   assert(v1[0][1] < 1.0e9);
-
-   if (dx < 0) {
-      dx = -dx;   /* make positive */
-      xstep = -1;
-   }
-   else {
-      xstep = 1;
-   }
+   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
 
-   if (dy < 0) {
-      dy = -dy;   /* make positive */
-      ystep = -1;
-   }
-   else {
-      ystep = 1;
-   }
+   assert(setup->fs.current.jit_function);
 
-   assert(dx >= 0);
-   assert(dy >= 0);
-   assert(setup->llvmpipe->reduced_prim == PIPE_PRIM_LINES);
+   if(setup->dirty & LP_SETUP_NEW_BLEND_COLOR) {
+      uint8_t *stored;
+      unsigned i, j;
 
-   setup->quad[0].input.x0 = setup->quad[0].input.y0 = -1;
-   setup->quad[0].inout.mask = 0x0;
+      stored = lp_scene_alloc_aligned(scene, 4 * 16, 16);
 
-   /* XXX temporary: set coverage to 1.0 so the line appears
-    * if AA mode happens to be enabled.
-    */
-   setup->quad[0].input.coverage[0] =
-   setup->quad[0].input.coverage[1] =
-   setup->quad[0].input.coverage[2] =
-   setup->quad[0].input.coverage[3] = 1.0;
-
-   if (dx > dy) {
-      /*** X-major line ***/
-      int i;
-      const int errorInc = dy + dy;
-      int error = errorInc - dx;
-      const int errorDec = error - dx;
-
-      for (i = 0; i < dx; i++) {
-         plot(setup, x0, y0);
-
-         x0 += xstep;
-         if (error < 0) {
-            error += errorInc;
-         }
-         else {
-            error += errorDec;
-            y0 += ystep;
-         }
-      }
-   }
-   else {
-      /*** Y-major line ***/
-      int i;
-      const int errorInc = dx + dx;
-      int error = errorInc - dy;
-      const int errorDec = error - dy;
-
-      for (i = 0; i < dy; i++) {
-         plot(setup, x0, y0);
-
-         y0 += ystep;
-         if (error < 0) {
-            error += errorInc;
-         }
-         else {
-            error += errorDec;
-            x0 += xstep;
-         }
+      /* smear each blend color component across 16 ubyte elements */
+      for (i = 0; i < 4; ++i) {
+         uint8_t c = float_to_ubyte(setup->blend_color.current.color[i]);
+         for (j = 0; j < 16; ++j)
+            stored[i*16 + j] = c;
       }
-   }
 
-   /* draw final quad */
-   if (setup->quad[0].inout.mask) {
-      clip_emit_quad( setup, &setup->quad[0] );
+      setup->blend_color.stored = stored;
+
+      setup->fs.current.jit_context.blend_color = setup->blend_color.stored;
+      setup->dirty |= LP_SETUP_NEW_FS;
    }
-}
 
+   if (setup->dirty & LP_SETUP_NEW_SCISSOR) {
+      float *stored;
 
-static void
-point_persp_coeff(struct setup_context *setup,
-                  const float (*vert)[4],
-                  unsigned attrib,
-                  uint vertSlot)
-{
-   unsigned i;
-   for(i = 0; i < NUM_CHANNELS; ++i) {
-      setup->coef.dadx[1 + attrib][i] = 0.0F;
-      setup->coef.dady[1 + attrib][i] = 0.0F;
-      setup->coef.a0[1 + attrib][i] = vert[vertSlot][i] * vert[0][3];
-   }
-}
+      stored = lp_scene_alloc_aligned(scene, 4 * sizeof(float), 16);
 
+      stored[0] = (float) setup->scissor.current.minx;
+      stored[1] = (float) setup->scissor.current.miny;
+      stored[2] = (float) setup->scissor.current.maxx;
+      stored[3] = (float) setup->scissor.current.maxy;
 
-/**
- * Do setup for point rasterization, then render the point.
- * Round or square points...
- * XXX could optimize a lot for 1-pixel points.
- */
-void
-llvmpipe_setup_point( struct setup_context *setup,
-             const float (*v0)[4] )
-{
-   struct llvmpipe_context *llvmpipe = setup->llvmpipe;
-   const struct lp_fragment_shader *lpfs = llvmpipe->fs;
-   const int sizeAttr = setup->llvmpipe->psize_slot;
-   const float size
-      = sizeAttr > 0 ? v0[sizeAttr][0]
-      : setup->llvmpipe->rasterizer->point_size;
-   const float halfSize = 0.5F * size;
-   const boolean round = (boolean) setup->llvmpipe->rasterizer->point_smooth;
-   const float x = v0[0][0];  /* Note: data[0] is always position */
-   const float y = v0[0][1];
-   const struct vertex_info *vinfo = llvmpipe_get_vertex_info(llvmpipe);
-   uint fragSlot;
-
-#if DEBUG_VERTS
-   debug_printf("Setup point:\n");
-   print_vertex(setup, v0);
-#endif
-
-   if (llvmpipe->no_rast)
-      return;
+      setup->scissor.stored = stored;
 
-   assert(setup->llvmpipe->reduced_prim == PIPE_PRIM_POINTS);
-
-   /* For points, all interpolants are constant-valued.
-    * However, for point sprites, we'll need to setup texcoords appropriately.
-    * XXX: which coefficients are the texcoords???
-    * We may do point sprites as textured quads...
-    *
-    * KW: We don't know which coefficients are texcoords - ultimately
-    * the choice of what interpolation mode to use for each attribute
-    * should be determined by the fragment program, using
-    * per-attribute declaration statements that include interpolation
-    * mode as a parameter.  So either the fragment program will have
-    * to be adjusted for pointsprite vs normal point behaviour, or
-    * otherwise a special interpolation mode will have to be defined
-    * which matches the required behaviour for point sprites.  But -
-    * the latter is not a feature of normal hardware, and as such
-    * probably should be ruled out on that basis.
-    */
-   setup->vprovoke = v0;
+      setup->fs.current.jit_context.scissor_xmin = stored[0];
+      setup->fs.current.jit_context.scissor_ymin = stored[1];
+      setup->fs.current.jit_context.scissor_xmax = stored[2];
+      setup->fs.current.jit_context.scissor_ymax = stored[3];
 
-   /* setup Z, W */
-   const_pos_coeff(setup, 0, 2);
-   const_pos_coeff(setup, 0, 3);
+      setup->dirty |= LP_SETUP_NEW_FS;
+   }
 
-   for (fragSlot = 0; fragSlot < lpfs->info.num_inputs; fragSlot++) {
-      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
+   if(setup->dirty & LP_SETUP_NEW_CONSTANTS) {
+      struct pipe_buffer *buffer = setup->constants.current;
 
-      switch (vinfo->attrib[fragSlot].interp_mode) {
-      case INTERP_CONSTANT:
-         /* fall-through */
-      case INTERP_LINEAR:
-         const_coeff(setup, fragSlot, vertSlot);
-         break;
-      case INTERP_PERSPECTIVE:
-         point_persp_coeff(setup, setup->vprovoke, fragSlot, vertSlot);
-         break;
-      case INTERP_POS:
-         setup_fragcoord_coeff(setup, fragSlot);
-         break;
-      default:
-         assert(0);
-      }
+      if(buffer) {
+         unsigned current_size = buffer->size;
+         const void *current_data = llvmpipe_buffer(buffer)->data;
 
-      if (lpfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
-         setup->coef.a0[1 + fragSlot][0] = 1.0f - setup->facing;
-         setup->coef.dadx[1 + fragSlot][0] = 0.0;
-         setup->coef.dady[1 + fragSlot][0] = 0.0;
-      }
-   }
+         /* TODO: copy only the actually used constants? */
 
+         if(setup->constants.stored_size != current_size ||
+            !setup->constants.stored_data ||
+            memcmp(setup->constants.stored_data,
+                   current_data,
+                   current_size) != 0) {
+            void *stored;
 
-   if (halfSize <= 0.5 && !round) {
-      /* special case for 1-pixel points */
-      const int ix = ((int) x) & 1;
-      const int iy = ((int) y) & 1;
-      setup->quad[0].input.x0 = (int) x - ix;
-      setup->quad[0].input.y0 = (int) y - iy;
-      setup->quad[0].inout.mask = (1 << ix) << (2 * iy);
-      clip_emit_quad( setup, &setup->quad[0] );
-   }
-   else {
-      if (round) {
-         /* rounded points */
-         const int ixmin = block((int) (x - halfSize));
-         const int ixmax = block((int) (x + halfSize));
-         const int iymin = block((int) (y - halfSize));
-         const int iymax = block((int) (y + halfSize));
-         const float rmin = halfSize - 0.7071F;  /* 0.7071 = sqrt(2)/2 */
-         const float rmax = halfSize + 0.7071F;
-         const float rmin2 = MAX2(0.0F, rmin * rmin);
-         const float rmax2 = rmax * rmax;
-         const float cscale = 1.0F / (rmax2 - rmin2);
-         int ix, iy;
-
-         for (iy = iymin; iy <= iymax; iy += 2) {
-            for (ix = ixmin; ix <= ixmax; ix += 2) {
-               float dx, dy, dist2, cover;
-
-               setup->quad[0].inout.mask = 0x0;
-
-               dx = (ix + 0.5f) - x;
-               dy = (iy + 0.5f) - y;
-               dist2 = dx * dx + dy * dy;
-               if (dist2 <= rmax2) {
-                  cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad[0].input.coverage[QUAD_TOP_LEFT] = MIN2(cover, 1.0f);
-                  setup->quad[0].inout.mask |= MASK_TOP_LEFT;
-               }
-
-               dx = (ix + 1.5f) - x;
-               dy = (iy + 0.5f) - y;
-               dist2 = dx * dx + dy * dy;
-               if (dist2 <= rmax2) {
-                  cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad[0].input.coverage[QUAD_TOP_RIGHT] = MIN2(cover, 1.0f);
-                  setup->quad[0].inout.mask |= MASK_TOP_RIGHT;
-               }
-
-               dx = (ix + 0.5f) - x;
-               dy = (iy + 1.5f) - y;
-               dist2 = dx * dx + dy * dy;
-               if (dist2 <= rmax2) {
-                  cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad[0].input.coverage[QUAD_BOTTOM_LEFT] = MIN2(cover, 1.0f);
-                  setup->quad[0].inout.mask |= MASK_BOTTOM_LEFT;
-               }
-
-               dx = (ix + 1.5f) - x;
-               dy = (iy + 1.5f) - y;
-               dist2 = dx * dx + dy * dy;
-               if (dist2 <= rmax2) {
-                  cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad[0].input.coverage[QUAD_BOTTOM_RIGHT] = MIN2(cover, 1.0f);
-                  setup->quad[0].inout.mask |= MASK_BOTTOM_RIGHT;
-               }
-
-               if (setup->quad[0].inout.mask) {
-                  setup->quad[0].input.x0 = ix;
-                  setup->quad[0].input.y0 = iy;
-                  clip_emit_quad( setup, &setup->quad[0] );
-               }
+            stored = lp_scene_alloc(scene, current_size);
+            if(stored) {
+               memcpy(stored,
+                      current_data,
+                      current_size);
+               setup->constants.stored_size = current_size;
+               setup->constants.stored_data = stored;
             }
          }
       }
       else {
-         /* square points */
-         const int xmin = (int) (x + 0.75 - halfSize);
-         const int ymin = (int) (y + 0.25 - halfSize);
-         const int xmax = xmin + (int) size;
-         const int ymax = ymin + (int) size;
-         /* XXX could apply scissor to xmin,ymin,xmax,ymax now */
-         const int ixmin = block(xmin);
-         const int ixmax = block(xmax - 1);
-         const int iymin = block(ymin);
-         const int iymax = block(ymax - 1);
-         int ix, iy;
-
-         /*
-         debug_printf("(%f, %f) -> X:%d..%d Y:%d..%d\n", x, y, xmin, xmax,ymin,ymax);
-         */
-         for (iy = iymin; iy <= iymax; iy += 2) {
-            uint rowMask = 0xf;
-            if (iy < ymin) {
-               /* above the top edge */
-               rowMask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
-            }
-            if (iy + 1 >= ymax) {
-               /* below the bottom edge */
-               rowMask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
-            }
+         setup->constants.stored_size = 0;
+         setup->constants.stored_data = NULL;
+      }
 
-            for (ix = ixmin; ix <= ixmax; ix += 2) {
-               uint mask = rowMask;
-
-               if (ix < xmin) {
-                  /* fragment is past left edge of point, turn off left bits */
-                  mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
-               }
-               if (ix + 1 >= xmax) {
-                  /* past the right edge */
-                  mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
-               }
-
-               setup->quad[0].inout.mask = mask;
-               setup->quad[0].input.x0 = ix;
-               setup->quad[0].input.y0 = iy;
-               clip_emit_quad( setup, &setup->quad[0] );
-            }
+      setup->fs.current.jit_context.constants = setup->constants.stored_data;
+      setup->dirty |= LP_SETUP_NEW_FS;
+   }
+
+
+   if(setup->dirty & LP_SETUP_NEW_FS) {
+      if(!setup->fs.stored ||
+         memcmp(setup->fs.stored,
+                &setup->fs.current,
+                sizeof setup->fs.current) != 0) {
+         /* The fs state that's been stored in the scene is different from
+          * the new, current state.  So allocate a new lp_rast_state object
+          * from the scene and copy the current state into it.
+          */
+         struct lp_rast_state *stored =
+            (struct lp_rast_state *) lp_scene_alloc(scene, sizeof *stored);
+         if(stored) {
+            memcpy(stored,
+                   &setup->fs.current,
+                   sizeof setup->fs.current);
+            setup->fs.stored = stored;
+
+            /* put the state-set command into all bins */
+            lp_scene_bin_state_command( scene,
+					lp_rast_set_state, 
+					lp_rast_arg_state(setup->fs.stored) );
          }
       }
    }
+
+   setup->dirty = 0;
+
+   assert(setup->fs.stored);
 }
 
-void llvmpipe_setup_prepare( struct setup_context *setup )
+
+
+/* Only caller is lp_setup_vbuf_destroy()
+ */
+void 
+lp_setup_destroy( struct setup_context *setup )
 {
-   struct llvmpipe_context *lp = setup->llvmpipe;
+   reset_context( setup );
 
-   if (lp->dirty) {
-      llvmpipe_update_derived(lp);
-   }
+   pipe_buffer_reference(&setup->constants.current, NULL);
 
-   if (lp->reduced_api_prim == PIPE_PRIM_TRIANGLES &&
-       lp->rasterizer->fill_cw == PIPE_POLYGON_MODE_FILL &&
-       lp->rasterizer->fill_ccw == PIPE_POLYGON_MODE_FILL) {
-      /* we'll do culling */
-      setup->winding = lp->rasterizer->cull_mode;
-   }
-   else {
-      /* 'draw' will do culling */
-      setup->winding = PIPE_WINDING_NONE;
+   /* free the scenes in the 'empty' queue */
+   while (1) {
+      struct lp_scene *scene = lp_scene_dequeue(setup->empty_scenes, FALSE);
+      if (!scene)
+         break;
+      lp_scene_destroy(scene);
    }
-}
-
 
+   lp_rast_destroy( setup->rast );
 
-void llvmpipe_setup_destroy_context( struct setup_context *setup )
-{
-   align_free( setup );
+   FREE( setup );
 }
 
 
 /**
- * Create a new primitive setup/render stage.
+ * Create a new primitive tiling engine.  Plug it into the backend of
+ * the draw module.  Currently also creates a rasterizer to use with
+ * it.
  */
-struct setup_context *llvmpipe_setup_create_context( struct llvmpipe_context *llvmpipe )
+struct setup_context *
+lp_setup_create( struct pipe_screen *screen,
+                 struct draw_context *draw )
 {
-   struct setup_context *setup;
    unsigned i;
+   struct setup_context *setup = CALLOC_STRUCT(setup_context);
 
-   setup = align_malloc(sizeof(struct setup_context), 16);
    if (!setup)
       return NULL;
 
-   memset(setup, 0, sizeof *setup);
-   setup->llvmpipe = llvmpipe;
+   lp_setup_init_vbuf(setup);
+
+   setup->empty_scenes = lp_scene_queue_create();
+   if (!setup->empty_scenes)
+      goto fail;
 
-   for (i = 0; i < MAX_QUADS; i++) {
-      setup->quad[i].coef = &setup->coef;
+   setup->rast = lp_rast_create( screen, setup->empty_scenes );
+   if (!setup->rast) 
+      goto fail;
+
+   setup->vbuf = draw_vbuf_stage(draw, &setup->base);
+   if (!setup->vbuf)
+      goto fail;
+
+   draw_set_rasterize_stage(draw, setup->vbuf);
+   draw_set_render(draw, &setup->base);
+
+   /* create some empty scenes */
+   for (i = 0; i < MAX_SCENES; i++) {
+      setup->scenes[i] = lp_scene_create();
+      lp_scene_enqueue(setup->empty_scenes, setup->scenes[i]);
    }
 
-   setup->span.left[0] = 1000000;     /* greater than right[0] */
-   setup->span.left[1] = 1000000;     /* greater than right[1] */
+   setup->triangle = first_triangle;
+   setup->line     = first_line;
+   setup->point    = first_point;
+   
+   setup->dirty = ~0;
 
    return setup;
+
+fail:
+   if (setup->rast)
+      lp_rast_destroy( setup->rast );
+   
+   if (setup->vbuf)
+      ;
+
+   if (setup->empty_scenes)
+      lp_scene_queue_destroy(setup->empty_scenes);
+
+   FREE(setup);
+   return NULL;
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.h b/src/gallium/drivers/llvmpipe/lp_setup.h
index 89c43da046..0e155a7dc3 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup.h
@@ -27,27 +27,113 @@
 #ifndef LP_SETUP_H
 #define LP_SETUP_H
 
-struct setup_context;
-struct llvmpipe_context;
+#include "pipe/p_compiler.h"
+#include "lp_jit.h"
+
+struct draw_context;
+struct vertex_info;
+
+enum lp_interp {
+   LP_INTERP_CONSTANT,
+   LP_INTERP_LINEAR,
+   LP_INTERP_PERSPECTIVE,
+   LP_INTERP_POSITION,
+   LP_INTERP_FACING
+};
+
+/* Describes how to generate all the fragment shader inputs from
+ * the vertices passed into our triangle/line/point functions.
+ *
+ * Vertices are treated as an array of float[4] values, indexed by
+ * src_index.
+ */
+struct lp_shader_input {
+   enum lp_interp interp;       /* how to interpolate values */
+   unsigned src_index;          /* where to find values in incoming vertices */
+};
+
+struct pipe_texture;
+struct pipe_surface;
+struct pipe_buffer;
+struct pipe_blend_color;
+struct pipe_screen;
+struct pipe_framebuffer_state;
+struct lp_fragment_shader;
+struct lp_jit_context;
+
+struct setup_context *
+lp_setup_create( struct pipe_screen *screen,
+                 struct draw_context *draw );
+
+void
+lp_setup_clear(struct setup_context *setup,
+               const float *clear_color,
+               double clear_depth,
+               unsigned clear_stencil,
+               unsigned flags);
+
+struct pipe_fence_handle *
+lp_setup_fence( struct setup_context *setup );
+
+
+void
+lp_setup_flush( struct setup_context *setup,
+                unsigned flags );
+
+
+void
+lp_setup_bind_framebuffer( struct setup_context *setup,
+                           const struct pipe_framebuffer_state *fb );
 
 void 
-llvmpipe_setup_tri( struct setup_context *setup,
-	   const float (*v0)[4],
-	   const float (*v1)[4],
-	   const float (*v2)[4] );
+lp_setup_set_triangle_state( struct setup_context *setup,
+                             unsigned cullmode,
+                             boolean front_is_ccw,
+                             boolean scissor );
 
 void
-llvmpipe_setup_line(struct setup_context *setup,
-           const float (*v0)[4],
-           const float (*v1)[4]);
+lp_setup_set_fs_inputs( struct setup_context *setup,
+                        const struct lp_shader_input *interp,
+                        unsigned nr );
 
 void
-llvmpipe_setup_point( struct setup_context *setup,
-             const float (*v0)[4] );
+lp_setup_set_fs_functions( struct setup_context *setup,
+                           lp_jit_frag_func jit_function0,
+                           lp_jit_frag_func jit_function1,
+                           boolean opaque );
 
+void
+lp_setup_set_fs_constants(struct setup_context *setup,
+                          struct pipe_buffer *buffer);
+
+
+void
+lp_setup_set_alpha_ref_value( struct setup_context *setup,
+                              float alpha_ref_value );
+
+void
+lp_setup_set_blend_color( struct setup_context *setup,
+                          const struct pipe_blend_color *blend_color );
+
+void
+lp_setup_set_scissor( struct setup_context *setup,
+                      const struct pipe_scissor_state *scissor );
+
+void
+lp_setup_set_sampler_textures( struct setup_context *setup,
+                               unsigned num, struct pipe_texture **texture);
+
+unsigned
+lp_setup_is_texture_referenced( const struct setup_context *setup,
+                                const struct pipe_texture *texture );
+
+void
+lp_setup_set_flatshade_first( struct setup_context *setup, 
+                              boolean flatshade_first );
+
+void
+lp_setup_set_vertex_info( struct setup_context *setup, 
+                          struct vertex_info *info );
 
-struct setup_context *llvmpipe_setup_create_context( struct llvmpipe_context *llvmpipe );
-void llvmpipe_setup_prepare( struct setup_context *setup );
-void llvmpipe_setup_destroy_context( struct setup_context *setup );
 
 #endif
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h
new file mode 100644
index 0000000000..a5fc34e54a
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h
@@ -0,0 +1,159 @@
+/**************************************************************************
+ *
+ * Copyright 2007-2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * The setup code is concerned with point/line/triangle setup and
+ * putting commands/data into the bins.
+ */
+
+
+#ifndef LP_SETUP_CONTEXT_H
+#define LP_SETUP_CONTEXT_H
+
+#include "lp_setup.h"
+#include "lp_rast.h"
+#include "lp_tile_soa.h"        /* for TILE_SIZE */
+#include "lp_scene.h"
+
+#include "draw/draw_vbuf.h"
+
+#define LP_SETUP_NEW_FS          0x01
+#define LP_SETUP_NEW_CONSTANTS   0x02
+#define LP_SETUP_NEW_BLEND_COLOR 0x04
+#define LP_SETUP_NEW_SCISSOR     0x08
+
+
+struct lp_scene_queue;
+
+
+/** Max number of scenes */
+#define MAX_SCENES 2
+
+
+
+/**
+ * Point/line/triangle setup context.
+ * Note: "stored" below indicates data which is stored in the scene
+ * (allocated with lp_scene_alloc()/lp_scene_alloc_aligned()), not
+ * arbitrary malloc'd memory.
+ *
+ * Subclass of vbuf_render, plugged directly into the draw module as
+ * the rendering backend.
+ */
+struct setup_context
+{
+   struct vbuf_render base;
+
+   struct vertex_info *vertex_info;
+   uint prim;
+   uint vertex_size;
+   uint nr_vertices;
+   uint vertex_buffer_size;
+   void *vertex_buffer;
+
+   /* Final pipeline stage for draw module.  Draw module should
+    * create/install this itself now.
+    */
+   struct draw_stage *vbuf;
+   struct lp_rasterizer *rast;
+   struct lp_scene *scenes[MAX_SCENES];  /**< all the scenes */
+   struct lp_scene *scene;               /**< current scene being built */
+   struct lp_scene_queue *empty_scenes;  /**< queue of empty scenes */
+
+   boolean flatshade_first;
+   boolean ccw_is_frontface;
+   boolean scissor_test;
+   unsigned cullmode;
+
+   struct pipe_framebuffer_state fb;
+
+   struct {
+      unsigned flags;
+      union lp_rast_cmd_arg color;    /**< lp_rast_clear_color() cmd */
+      union lp_rast_cmd_arg zstencil; /**< lp_rast_clear_zstencil() cmd */
+   } clear;
+
+   enum {
+      SETUP_FLUSHED,
+      SETUP_CLEARED,
+      SETUP_ACTIVE
+   } state;
+   
+   struct {
+      struct lp_shader_input input[PIPE_MAX_ATTRIBS];
+      unsigned nr_inputs;
+
+      const struct lp_rast_state *stored; /**< what's in the scene */
+      struct lp_rast_state current;  /**< currently set state */
+   } fs;
+
+   /** fragment shader constants */
+   struct {
+      struct pipe_buffer *current;
+      unsigned stored_size;
+      const void *stored_data;
+   } constants;
+
+   struct {
+      struct pipe_blend_color current;
+      uint8_t *stored;
+   } blend_color;
+
+   struct {
+      struct pipe_scissor_state current;
+      const void *stored;
+   } scissor;
+
+   unsigned dirty;   /**< bitmask of LP_SETUP_NEW_x bits */
+
+   void (*point)( struct setup_context *,
+                  const float (*v0)[4]);
+
+   void (*line)( struct setup_context *,
+                 const float (*v0)[4],
+                 const float (*v1)[4]);
+
+   void (*triangle)( struct setup_context *,
+                     const float (*v0)[4],
+                     const float (*v1)[4],
+                     const float (*v2)[4]);
+};
+
+void lp_setup_choose_triangle( struct setup_context *setup );
+void lp_setup_choose_line( struct setup_context *setup );
+void lp_setup_choose_point( struct setup_context *setup );
+
+struct lp_scene *lp_setup_get_current_scene(struct setup_context *setup);
+
+void lp_setup_init_vbuf(struct setup_context *setup);
+
+void lp_setup_update_state( struct setup_context *setup );
+
+void lp_setup_destroy( struct setup_context *setup );
+
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_alpha.h b/src/gallium/drivers/llvmpipe/lp_setup_line.c
index 634575670d..feea79d394 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
@@ -1,6 +1,6 @@
 /**************************************************************************
  *
- * Copyright 2009 VMware, Inc.
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -18,37 +18,30 @@
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  *
  **************************************************************************/
 
-/**
- * Alpha testing to LLVM IR translation.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
+/*
+ * Binning code for lines.  Currently a stub: every line is discarded.
  */
 
-#ifndef LP_BLD_ALPHA_H
-#define LP_BLD_ALPHA_H
-
-
-#include <llvm-c/Core.h>  
+#include "lp_setup_context.h"
 
-struct pipe_alpha_state;
-struct lp_type;
-struct lp_build_mask_context;
+static void line_nop( struct setup_context *setup,
+                      const float (*v0)[4],
+                      const float (*v1)[4] )
+{
+}
 
 
-void
-lp_build_alpha_test(LLVMBuilderRef builder,
-                    const struct pipe_alpha_state *state,
-                    struct lp_type type,
-                    struct lp_build_mask_context *mask,
-                    LLVMValueRef alpha,
-                    LLVMValueRef ref);
+void 
+lp_setup_choose_line( struct setup_context *setup )
+{
+   setup->line = line_nop;
+}
 
 
-#endif /* !LP_BLD_ALPHA_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_prim_vbuf.h b/src/gallium/drivers/llvmpipe/lp_setup_point.c
index 0676e2f42a..f03ca729b2 100644
--- a/src/gallium/drivers/llvmpipe/lp_prim_vbuf.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -1,8 +1,8 @@
 /**************************************************************************
- * 
+ *
  * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -10,11 +10,11 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
@@ -22,17 +22,25 @@
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
  **************************************************************************/
 
-#ifndef LP_VBUF_H
-#define LP_VBUF_H
+/*
+ * Binning code for points
+ */
 
+#include "lp_setup_context.h"
 
-struct llvmpipe_context;
+static void point_nop( struct setup_context *setup,
+                       const float (*v0)[4] )
+{  /* intentionally empty: the point described by v0 is ignored */
+}
 
-extern struct vbuf_render *
-lp_create_vbuf_backend(struct llvmpipe_context *llvmpipe);
+
+void 
+lp_setup_choose_point( struct setup_context *setup )
+{
+   setup->point = point_nop;  /* unconditionally installs the do-nothing point handler */
+}
 
 
-#endif /* LP_VBUF_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
new file mode 100644
index 0000000000..9e59a6602c
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -0,0 +1,618 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/*
+ * Binning code for triangles
+ */
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "lp_perf.h"
+#include "lp_setup_context.h"
+#include "lp_rast.h"
+
+#define NUM_CHANNELS 4
+
+
+/**
+ * Store a constant (GL_FLAT) coefficient: a0 holds it, both gradients are 0.
+ */
+static void constant_coef( struct lp_rast_triangle *tri,
+                           unsigned slot,
+                           const float value,
+                           unsigned i )
+{
+   tri->inputs.dadx[slot][i] = 0.0f;
+   tri->inputs.dady[slot][i] = 0.0f;
+   tri->inputs.a0[slot][i] = value;
+}
+
+
+/**
+ * Derive the plane equation (a0, dadx, dady) of one channel of a
+ * linearly (screen-space) interpolated attribute across the triangle.
+ */
+static void linear_coef( struct lp_rast_triangle *tri,
+                         float oneoverarea,
+                         unsigned slot,
+                         const float (*v1)[4],
+                         const float (*v2)[4],
+                         const float (*v3)[4],
+                         unsigned vert_attr,
+                         unsigned i)
+{
+   const float val1 = v1[vert_attr][i];
+   const float val2 = v2[vert_attr][i];
+   const float val3 = v3[vert_attr][i];
+
+   /* attribute differences along the 1->2 and 3->1 edges */
+   const float d12 = val1 - val2;
+   const float d31 = val3 - val1;
+
+   /* gradients, from those differences and the matching edge vectors */
+   const float ddx = (d12 * tri->dy31 - tri->dy12 * d31) * oneoverarea;
+   const float ddy = (d31 * tri->dx12 - tri->dx31 * d12) * oneoverarea;
+
+   /* vertex 1 relative to the (0.5, 0.5) pixel-center sample position */
+   const float x_off = v1[0][0] - 0.5f;
+   const float y_off = v1[0][1] - 0.5f;
+
+   /* a0 is the value sampled by the fragment at (0,0), i.e. at its
+    * pixel center.  Extrapolating back from vertex 1 like this is
+    * neat, but for very large dadx/dady a huge term is subtracted here
+    * and added back during interpolation, losing fractional bits of
+    * a0.  Anchoring a0 at a pixel center near the triangle's minimum
+    * corner would avoid that.
+    */
+   tri->inputs.a0[slot][i] = val1 - (ddx * x_off + ddy * y_off);
+
+   tri->inputs.dadx[slot][i] = ddx;
+   tri->inputs.dady[slot][i] = ddy;
+}
+
+
+/**
+ * Derive the plane equation (a0, dadx, dady) of one channel of a
+ * perspective-corrected attribute across the triangle.
+ * The vertex values are multiplied by 1/w before the plane
+ * coefficients are computed; when the value at a fragment is needed
+ * later, the interpolated result is divided by the interpolated W at
+ * that fragment.
+ */
+static void perspective_coef( struct lp_rast_triangle *tri,
+                              float oneoverarea,
+                              unsigned slot,
+                              const float (*v1)[4],
+                              const float (*v2)[4],
+                              const float (*v3)[4],
+                              unsigned vert_attr,
+                              unsigned i)
+{
+   /* v[0][3] holds 1/w for each vertex; scale the attribute by it first */
+   const float val1 = v1[vert_attr][i] * v1[0][3];
+   const float val2 = v2[vert_attr][i] * v2[0][3];
+   const float val3 = v3[vert_attr][i] * v3[0][3];
+   const float d12 = val1 - val2;
+   const float d31 = val3 - val1;
+   const float ddx = (d12 * tri->dy31 - tri->dy12 * d31) * oneoverarea;
+   const float ddy = (d31 * tri->dx12 - tri->dx31 * d12) * oneoverarea;
+   const float x_off = v1[0][0] - 0.5f;
+   const float y_off = v1[0][1] - 0.5f;
+
+   /* a0 is the plane's value at the center of pixel (0,0) */
+   tri->inputs.a0[slot][i] = val1 - (ddx * x_off + ddy * y_off);
+   tri->inputs.dadx[slot][i] = ddx;
+   tri->inputs.dady[slot][i] = ddy;
+}
+
+
+/**
+ * Special coefficient setup for gl_FragCoord.
+ * X and Y are trivial: each starts at zero and grows by one per pixel
+ * along its own axis.
+ * Z and W are planes fitted to components 2 and 3 of vertex attribute 0.
+ */
+static void
+setup_fragcoord_coef(struct lp_rast_triangle *tri,
+                     float oneoverarea,
+                     unsigned slot,
+                     const float (*v1)[4],
+                     const float (*v2)[4],
+                     const float (*v3)[4])
+{
+   unsigned chan;
+   /* X and Y both start at zero ... */
+   tri->inputs.a0[slot][0] = 0.0f;
+   tri->inputs.a0[slot][1] = 0.0f;
+   /* ... X changes only with x, Y only with y, one unit per pixel */
+   tri->inputs.dadx[slot][0] = 1.0f;
+   tri->inputs.dadx[slot][1] = 0.0f;
+   tri->inputs.dady[slot][0] = 0.0f;
+   tri->inputs.dady[slot][1] = 1.0f;
+   /* Z (channel 2) and W (channel 3) */
+   for (chan = 2; chan < NUM_CHANNELS; chan++)
+      linear_coef(tri, oneoverarea, slot, v1, v2, v3, 0, chan);
+}
+
+
+static void setup_facing_coef( struct lp_rast_triangle *tri,
+                               unsigned slot, boolean frontface )
+{
+   unsigned chan;
+   /* channel 0 carries 1.0f - frontface; channels 1..3 are wasted zeros */
+   constant_coef( tri, slot, 1.0f - frontface, 0 );
+   for (chan = 1; chan < NUM_CHANNELS; chan++)
+      constant_coef( tri, slot, 0.0f, chan );
+}
+
+
+/**
+ * Fill tri->inputs (a0, dadx, dady): slot 0 gets the position, slots 1..n the FS inputs.
+ */
+static void setup_tri_coefficients( struct setup_context *setup,
+                                    struct lp_rast_triangle *tri,
+                                    float oneoverarea,
+                                    const float (*v1)[4],
+                                    const float (*v2)[4],
+                                    const float (*v3)[4],
+                                    boolean frontface)
+{
+   unsigned input;
+
+   /* Slot zero always carries the internal position input,
+    * whatever the fragment shader declares. */
+   setup_fragcoord_coef(tri, oneoverarea, 0, v1, v2, v3);
+
+   /* Fragment shader input N lands in coefficient slot N+1. */
+   for (input = 0; input < setup->fs.nr_inputs; input++) {
+      const unsigned src = setup->fs.input[input].src_index;
+      const unsigned dst = input + 1;
+      unsigned chan;
+
+      switch (setup->fs.input[input].interp) {
+      case LP_INTERP_CONSTANT:
+         /* flat shading: every channel takes the value of the last vertex */
+         for (chan = 0; chan < NUM_CHANNELS; chan++)
+            constant_coef(tri, dst, v3[src][chan], chan);
+         break;
+
+      case LP_INTERP_LINEAR:
+         for (chan = 0; chan < NUM_CHANNELS; chan++)
+            linear_coef(tri, oneoverarea, dst, v1, v2, v3, src, chan);
+         break;
+
+      case LP_INTERP_PERSPECTIVE:
+         for (chan = 0; chan < NUM_CHANNELS; chan++)
+            perspective_coef(tri, oneoverarea, dst, v1, v2, v3, src, chan);
+         break;
+
+      case LP_INTERP_POSITION:
+         /* XXX: fix me - duplicates the values in slot zero. */
+         setup_fragcoord_coef(tri, oneoverarea, dst, v1, v2, v3);
+         break;
+
+      case LP_INTERP_FACING:
+         setup_facing_coef(tri, dst, frontface);
+         break;
+
+      default:
+         assert(0);
+      }
+   }
+}
+
+
+
+static INLINE int subpixel_snap( float a )
+{  /* scale by FIXED_ONE, subtract half of FIXED_ONE, then round with util_iround */
+   return util_iround(FIXED_ONE * a - (FIXED_ONE / 2));
+}
+
+
+
+/**
+ * Obtain space for one triangle from the scene (lp_scene_alloc_aligned),
+ * followed directly by its inputs.a0, inputs.dadx and inputs.dady
+ * arrays, each holding (nr_inputs + 1) float[4] entries.
+ * \param nr_inputs  number of fragment shader inputs
+ * \param tri_size  receives the total number of bytes allocated
+ * \return pointer to triangle space
+ */
+static INLINE struct lp_rast_triangle *
+alloc_triangle(struct lp_scene *scene, unsigned nr_inputs, unsigned *tri_size)
+{
+   /* one coefficient array: NUM_CHANNELS floats per input, plus one extra slot */
+   const unsigned array_bytes = NUM_CHANNELS * (nr_inputs + 1) * sizeof(float);
+   const unsigned total_bytes = sizeof(struct lp_rast_triangle) + 3 * array_bytes;
+   struct lp_rast_triangle *tri;
+   char *coef;
+
+   assert(sizeof(*tri) % 16 == 0);
+   /* NOTE(review): the result is used unchecked -- confirm that
+    * lp_scene_alloc_aligned cannot return NULL. */
+   tri = lp_scene_alloc_aligned( scene, total_bytes, 16 );
+
+   /* the three arrays sit back to back right behind the struct */
+   coef = (char *) (tri + 1);
+   tri->inputs.a0   = (float (*)[4]) coef;
+   tri->inputs.dadx = (float (*)[4]) (coef + array_bytes);
+   tri->inputs.dady = (float (*)[4]) (coef + 2 * array_bytes);
+
+   *tri_size = total_bytes;
+   return tri;
+}
+
+
+
+/**
+ * Set up triangle (v1, v2, v3), which must wind CCW to be drawn, and put
+ * it in the scene's bins for every tile its bounding box overlaps.
+ * Non-CCW, zero-area and empty-bounding-box triangles are dropped.
+ */
+static void 
+do_triangle_ccw(struct setup_context *setup,
+                const float (*v1)[4],
+                const float (*v2)[4],
+                const float (*v3)[4],
+                boolean frontfacing )
+{
+   /* x/y positions in fixed point */
+   const int x1 = subpixel_snap(v1[0][0]);
+   const int x2 = subpixel_snap(v2[0][0]);
+   const int x3 = subpixel_snap(v3[0][0]);
+   const int y1 = subpixel_snap(v1[0][1]);
+   const int y2 = subpixel_snap(v2[0][1]);
+   const int y3 = subpixel_snap(v3[0][1]);
+
+   struct lp_scene *scene = lp_setup_get_current_scene(setup);
+   struct lp_rast_triangle *tri;
+   int area;
+   float oneoverarea;
+   int minx, maxx, miny, maxy;
+   unsigned tri_bytes;
+
+   tri = alloc_triangle(scene, setup->fs.nr_inputs, &tri_bytes);
+
+   tri->dx12 = x1 - x2;
+   tri->dx23 = x2 - x3;
+   tri->dx31 = x3 - x1;
+
+   tri->dy12 = y1 - y2;
+   tri->dy23 = y2 - y3;
+   tri->dy31 = y3 - y1;
+
+   area = (tri->dx12 * tri->dy31 - tri->dx31 * tri->dy12);
+
+   LP_COUNT(nr_tris);
+
+   /* Cull non-ccw and zero-sized triangles. 
+    *
+    * XXX: subject to overflow??
+    */
+   if (area <= 0) {
+      lp_scene_putback_data( scene, tri_bytes );
+      LP_COUNT(nr_culled_tris);
+      return;
+   }
+
+   /* Bounding rectangle (in pixels) */
+   minx = (MIN3(x1, x2, x3) + (FIXED_ONE-1)) >> FIXED_ORDER;
+   maxx = (MAX3(x1, x2, x3) + (FIXED_ONE-1)) >> FIXED_ORDER;
+   miny = (MIN3(y1, y2, y3) + (FIXED_ONE-1)) >> FIXED_ORDER;
+   maxy = (MAX3(y1, y2, y3) + (FIXED_ONE-1)) >> FIXED_ORDER;
+   if (setup->scissor_test) {
+      minx = MAX2(minx, setup->scissor.current.minx);
+      maxx = MIN2(maxx, setup->scissor.current.maxx);
+      miny = MAX2(miny, setup->scissor.current.miny);
+      maxy = MIN2(maxy, setup->scissor.current.maxy);
+   }
+
+   /* Clamp to the origin so the tile indices below can never be negative. */
+   minx = MAX2(minx, 0);
+   miny = MAX2(miny, 0);
+   if (miny >= maxy ||   /* empty box; '>=' also catches min > max after clamping */
+       minx >= maxx) {
+      lp_scene_putback_data( scene, tri_bytes );
+      LP_COUNT(nr_culled_tris);
+      return;
+   }
+
+   oneoverarea = ((float)FIXED_ONE) / (float)area;
+
+   /* Setup parameter interpolants:
+    */
+   setup_tri_coefficients( setup, tri, oneoverarea, v1, v2, v3, frontfacing );
+
+   /* half-edge constants, will be iterated over the whole render target.
+    */
+   tri->c1 = tri->dy12 * x1 - tri->dx12 * y1;
+   tri->c2 = tri->dy23 * x2 - tri->dx23 * y2;
+   tri->c3 = tri->dy31 * x3 - tri->dx31 * y3;
+
+   /* correct for top-left fill convention:
+    */
+   if (tri->dy12 < 0 || (tri->dy12 == 0 && tri->dx12 > 0)) tri->c1++;
+   if (tri->dy23 < 0 || (tri->dy23 == 0 && tri->dx23 > 0)) tri->c2++;
+   if (tri->dy31 < 0 || (tri->dy31 == 0 && tri->dx31 > 0)) tri->c3++;
+
+   tri->dy12 *= FIXED_ONE;
+   tri->dy23 *= FIXED_ONE;
+   tri->dy31 *= FIXED_ONE;
+
+   tri->dx12 *= FIXED_ONE;
+   tri->dx23 *= FIXED_ONE;
+   tri->dx31 *= FIXED_ONE;
+
+   /* find trivial reject offsets for each edge for a single-pixel
+    * sized block.  These will be scaled up at each recursive level to
+    * match the active blocksize.  Scaling in this way works best if
+    * the blocks are square.
+    */
+   tri->eo1 = 0;
+   if (tri->dy12 < 0) tri->eo1 -= tri->dy12;
+   if (tri->dx12 > 0) tri->eo1 += tri->dx12;
+
+   tri->eo2 = 0;
+   if (tri->dy23 < 0) tri->eo2 -= tri->dy23;
+   if (tri->dx23 > 0) tri->eo2 += tri->dx23;
+
+   tri->eo3 = 0;
+   if (tri->dy31 < 0) tri->eo3 -= tri->dy31;
+   if (tri->dx31 > 0) tri->eo3 += tri->dx31;
+
+   /* Calculate trivial accept offsets from the above.
+    */
+   tri->ei1 = tri->dx12 - tri->dy12 - tri->eo1;
+   tri->ei2 = tri->dx23 - tri->dy23 - tri->eo2;
+   tri->ei3 = tri->dx31 - tri->dy31 - tri->eo3;
+
+   /* Fill in the inputs.step[][] arrays.
+    * We've manually unrolled some loops here.
+    */
+   {
+      const int xstep1 = -tri->dy12;
+      const int xstep2 = -tri->dy23;
+      const int xstep3 = -tri->dy31;
+      const int ystep1 = tri->dx12;
+      const int ystep2 = tri->dx23;
+      const int ystep3 = tri->dx31;
+
+#define SETUP_STEP(i, x, y)                                \
+      do {                                                 \
+         tri->inputs.step[0][i] = x * xstep1 + y * ystep1; \
+         tri->inputs.step[1][i] = x * xstep2 + y * ystep2; \
+         tri->inputs.step[2][i] = x * xstep3 + y * ystep3; \
+      } while (0)
+
+      SETUP_STEP(0, 0, 0);
+      SETUP_STEP(1, 1, 0);
+      SETUP_STEP(2, 0, 1);
+      SETUP_STEP(3, 1, 1);
+
+      SETUP_STEP(4, 2, 0);
+      SETUP_STEP(5, 3, 0);
+      SETUP_STEP(6, 2, 1);
+      SETUP_STEP(7, 3, 1);
+
+      SETUP_STEP(8, 0, 2);
+      SETUP_STEP(9, 1, 2);
+      SETUP_STEP(10, 0, 3);
+      SETUP_STEP(11, 1, 3);
+
+      SETUP_STEP(12, 2, 2);
+      SETUP_STEP(13, 3, 2);
+      SETUP_STEP(14, 2, 3);
+      SETUP_STEP(15, 3, 3);
+#undef SETUP_STEP
+   }
+
+   /*
+    * All fields of 'tri' are now set.  The remaining code here is
+    * concerned with binning.
+    */
+
+   /* Convert to tile coordinates:
+    */
+   minx = minx / TILE_SIZE;
+   miny = miny / TILE_SIZE;
+   maxx = maxx / TILE_SIZE;
+   maxy = maxy / TILE_SIZE;
+
+   /* Clamp maxx, maxy to framebuffer size
+    */
+   maxx = MIN2(maxx, scene->tiles_x - 1);
+   maxy = MIN2(maxy, scene->tiles_y - 1);
+
+   /* Determine which tile(s) intersect the triangle's bounding box
+    */
+   if (miny == maxy && minx == maxx)
+   {
+      /* Triangle is contained in a single tile:
+       */
+      lp_scene_bin_command( scene, minx, miny, lp_rast_triangle, 
+                            lp_rast_arg_triangle(tri) );
+   }
+   else 
+   {
+      int c1 = (tri->c1 + 
+                tri->dx12 * miny * TILE_SIZE - 
+                tri->dy12 * minx * TILE_SIZE);
+      int c2 = (tri->c2 + 
+                tri->dx23 * miny * TILE_SIZE -
+                tri->dy23 * minx * TILE_SIZE);
+      int c3 = (tri->c3 +
+                tri->dx31 * miny * TILE_SIZE -
+                tri->dy31 * minx * TILE_SIZE);
+
+      int ei1 = tri->ei1 << TILE_ORDER;
+      int ei2 = tri->ei2 << TILE_ORDER;
+      int ei3 = tri->ei3 << TILE_ORDER;
+
+      int eo1 = tri->eo1 << TILE_ORDER;
+      int eo2 = tri->eo2 << TILE_ORDER;
+      int eo3 = tri->eo3 << TILE_ORDER;
+
+      int xstep1 = -(tri->dy12 << TILE_ORDER);
+      int xstep2 = -(tri->dy23 << TILE_ORDER);
+      int xstep3 = -(tri->dy31 << TILE_ORDER);
+
+      int ystep1 = tri->dx12 << TILE_ORDER;
+      int ystep2 = tri->dx23 << TILE_ORDER;
+      int ystep3 = tri->dx31 << TILE_ORDER;
+      int x, y;
+
+
+      /* Test tile-sized blocks against the triangle.
+       * Discard blocks fully outside the tri.  If the block is fully
+       * contained inside the tri, bin an lp_rast_shade_tile command.
+       * Else, bin a lp_rast_triangle command.
+       */
+      for (y = miny; y <= maxy; y++)
+      {
+         int cx1 = c1;
+         int cx2 = c2;
+         int cx3 = c3;
+         boolean in = FALSE;  /* are we inside the triangle? */
+
+         for (x = minx; x <= maxx; x++)
+         {
+            if (cx1 + eo1 < 0 || 
+                cx2 + eo2 < 0 ||
+                cx3 + eo3 < 0) 
+            {
+               /* do nothing */
+               LP_COUNT(nr_empty_64);
+               if (in)
+                  break;  /* exiting triangle, all done with this row */
+            }
+            else if (cx1 + ei1 > 0 &&
+                     cx2 + ei2 > 0 &&
+                     cx3 + ei3 > 0) 
+            {
+               /* triangle covers the whole tile- shade whole tile */
+               LP_COUNT(nr_fully_covered_64);
+               in = TRUE;
+               if(setup->fs.current.opaque) {
+                  lp_scene_bin_reset( scene, x, y );
+                  lp_scene_bin_command( scene, x, y,
+                                        lp_rast_set_state,
+                                        lp_rast_arg_state(setup->fs.stored) );
+               }
+               lp_scene_bin_command( scene, x, y,
+                                     lp_rast_shade_tile,
+                                     lp_rast_arg_inputs(&tri->inputs) );
+            }
+            else 
+            { 
+               /* rasterizer/shade partial tile */
+               LP_COUNT(nr_partially_covered_64);
+               in = TRUE;
+               lp_scene_bin_command( scene, x, y,
+                                     lp_rast_triangle, 
+                                     lp_rast_arg_triangle(tri) );
+            }
+
+            /* Iterate cx values across the region:
+             */
+            cx1 += xstep1;
+            cx2 += xstep2;
+            cx3 += xstep3;
+         }
+      
+         /* Iterate c values down the region:
+          */
+         c1 += ystep1;
+         c2 += ystep2;
+         c3 += ystep3;    
+      }
+   }
+}
+
+
+/* Swap v0 and v1 (reversing the winding) before handing on to do_triangle_ccw. */
+static void triangle_cw( struct setup_context *setup, const float (*v0)[4],
+                         const float (*v1)[4], const float (*v2)[4] )
+{
+   const boolean is_front = !setup->ccw_is_frontface;
+   do_triangle_ccw( setup, v1, v0, v2, is_front );
+}
+
+
+/* Hand the vertices on in the given order; facing is setup->ccw_is_frontface. */
+static void triangle_ccw( struct setup_context *setup, const float (*v0)[4],
+                          const float (*v1)[4], const float (*v2)[4] )
+{
+   const boolean is_front = setup->ccw_is_frontface;
+   do_triangle_ccw( setup, v0, v1, v2, is_front );
+}
+
+
+static void triangle_both( struct setup_context *setup,
+                           const float (*v0)[4],
+                           const float (*v1)[4],
+                           const float (*v2)[4] )
+{
+   /* edge vectors relative to v2: e = v0 - v2, f = v1 - v2 */
+   const float e_x = v0[0][0] - v2[0][0];
+   const float e_y = v0[0][1] - v2[0][1];
+   const float f_x = v1[0][0] - v2[0][0];
+   const float f_y = v1[0][1] - v2[0][1];
+
+   /* z of cross(e,f): a negative value selects the ccw path, all else cw */
+   if (!(e_x * f_y - e_y * f_x < 0.0f))
+      triangle_cw( setup, v0, v1, v2 );
+   else
+      triangle_ccw( setup, v0, v1, v2 );
+}
+
+
+static void triangle_nop( struct setup_context *setup,
+			  const float (*v0)[4],
+			  const float (*v1)[4],
+			  const float (*v2)[4] )
+{  /* intentionally empty: installed by lp_setup_choose_triangle for its default case */
+}
+
+
+void 
+lp_setup_choose_triangle( struct setup_context *setup )
+{
+   /* Pick the triangle handler from the cull mode.  The handler is
+    * named after the winding that survives: with CCW culled only the
+    * cw handler is needed (it reverses the vertices and lets
+    * do_triangle_ccw reject whatever still has non-positive area), and
+    * vice versa.  Any cull mode other than NONE/CCW/CW draws nothing.
+    */
+   if (setup->cullmode == PIPE_WINDING_NONE)
+      setup->triangle = triangle_both;
+   else if (setup->cullmode == PIPE_WINDING_CCW)
+      setup->triangle = triangle_cw;
+   else if (setup->cullmode == PIPE_WINDING_CW)
+      setup->triangle = triangle_ccw;
+   else
+      setup->triangle = triangle_nop;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c b/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c
new file mode 100644
index 0000000000..24291da91e
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c
@@ -0,0 +1,518 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * Interface between 'draw' module's output and the llvmpipe rasterizer/setup
+ * code.  When the 'draw' module has finished filling a vertex buffer, the
+ * draw_arrays() functions below will be called.  Loop over the vertices and
+ * call the point/line/tri setup functions.
+ *
+ * Authors
+ *  Brian Paul
+ */
+
+
+#include "lp_setup_context.h"
+#include "draw/draw_vbuf.h"
+#include "draw/draw_vertex.h"
+#include "util/u_memory.h"
+
+
+#define LP_MAX_VBUF_INDEXES 1024
+#define LP_MAX_VBUF_SIZE    4096
+
+  
+
+/** cast wrapper: reinterprets the vbuf_render pointer as a setup_context (NOTE(review): presumably vbuf_render is setup_context's first member -- confirm) */
+static struct setup_context *
+setup_context(struct vbuf_render *vbr)
+{
+   return (struct setup_context *) vbr;
+}
+
+
+
+static const struct vertex_info *
+lp_setup_get_vertex_info(struct vbuf_render *vbr)
+{
+   /* hand back the vertex layout currently stored in the setup context */
+   return setup_context(vbr)->vertex_info;
+}
+
+
+/* Ensure the vertex buffer can hold nr_vertices vertices of vertex_size bytes; FALSE if allocation failed. */
+static boolean
+lp_setup_allocate_vertices(struct vbuf_render *vbr,
+                          ushort vertex_size, ushort nr_vertices)
+{
+   struct setup_context *setup = setup_context(vbr);
+   const unsigned size = (unsigned) vertex_size * nr_vertices;  /* unsigned multiply: no signed overflow */
+
+   if (setup->vertex_buffer_size < size) {
+      align_free(setup->vertex_buffer);
+      setup->vertex_buffer = align_malloc(size, 16);
+      /* record 0 on failure so that a later, smaller request retries */
+      setup->vertex_buffer_size = setup->vertex_buffer ? size : 0;
+   }
+   setup->vertex_size = vertex_size;
+   setup->nr_vertices = nr_vertices;
+   return setup->vertex_buffer != NULL;
+}
+
+static void
+lp_setup_release_vertices(struct vbuf_render *vbr)
+{
+   /* keep the old allocation for next time: lp_setup_allocate_vertices reuses the buffer when it is large enough */
+}
+
+static void *
+lp_setup_map_vertices(struct vbuf_render *vbr)
+{
+   /* nothing to map: simply expose the buffer held in the setup context */
+   return setup_context(vbr)->vertex_buffer;
+}
+
+static void 
+lp_setup_unmap_vertices(struct vbuf_render *vbr, 
+                       ushort min_index,
+                       ushort max_index )
+{
+   /* nothing to unmap; debug builds only check that max_index fits the buffer */
+   const struct setup_context *ctx = setup_context(vbr);
+   assert( ctx->vertex_buffer_size >= (max_index+1) * ctx->vertex_size );
+}
+
+
+static boolean
+lp_setup_set_primitive(struct vbuf_render *vbr, unsigned prim)
+{
+   setup_context(vbr)->prim = prim;  /* stored as-is; lp_setup_draw switches on it */
+   return TRUE;  /* every primitive type is accepted here */
+}
+
+typedef const float (*const_float4_ptr)[4];
+/* Return the vertex located index * stride bytes into vertex_buffer. */
+static INLINE const_float4_ptr get_vert( const void *vertex_buffer,
+                                         int index, int stride )
+{
+   const char *bytes = (const char *) vertex_buffer;
+   return (const_float4_ptr)(bytes + index * stride);
+}
+
+/**
+ * draw elements / indexed primitives: decompose the nr entries of 'indices' into point/line/triangle calls according to setup->prim
+ */
+static void
+lp_setup_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
+{
+   struct setup_context *setup = setup_context(vbr);
+   const unsigned stride = setup->vertex_info->size * sizeof(float);  /* bytes between consecutive vertices */
+   const void *vertex_buffer = setup->vertex_buffer;
+   unsigned i;
+
+   lp_setup_update_state(setup);
+
+   switch (setup->prim) {
+   case PIPE_PRIM_POINTS:
+      for (i = 0; i < nr; i++) {
+         setup->point( setup,
+                       get_vert(vertex_buffer, indices[i-0], stride) );
+      }
+      break;
+
+   case PIPE_PRIM_LINES:
+      for (i = 1; i < nr; i += 2) {
+         setup->line( setup,
+                      get_vert(vertex_buffer, indices[i-1], stride),
+                      get_vert(vertex_buffer, indices[i-0], stride) );
+      }
+      break;
+
+   case PIPE_PRIM_LINE_STRIP:
+      for (i = 1; i < nr; i ++) {
+         setup->line( setup,
+                      get_vert(vertex_buffer, indices[i-1], stride),
+                      get_vert(vertex_buffer, indices[i-0], stride) );
+      }
+      break;
+
+   case PIPE_PRIM_LINE_LOOP:
+      for (i = 1; i < nr; i ++) {
+         setup->line( setup,
+                      get_vert(vertex_buffer, indices[i-1], stride),
+                      get_vert(vertex_buffer, indices[i-0], stride) );
+      }
+      if (nr) {  /* close the loop: last vertex back to the first */
+         setup->line( setup,
+                      get_vert(vertex_buffer, indices[nr-1], stride),
+                      get_vert(vertex_buffer, indices[0], stride) );
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLES:
+      if (setup->flatshade_first) {  /* rotated so the triangle's first vertex (i-2) is passed last */
+         for (i = 2; i < nr; i += 3) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, indices[i-1], stride),
+                             get_vert(vertex_buffer, indices[i-0], stride),
+                             get_vert(vertex_buffer, indices[i-2], stride) );
+         }
+      }
+      else {
+         for (i = 2; i < nr; i += 3) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, indices[i-2], stride),
+                             get_vert(vertex_buffer, indices[i-1], stride),
+                             get_vert(vertex_buffer, indices[i-0], stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLE_STRIP:
+      if (setup->flatshade_first) {  /* (i&1) swaps the first two arguments on odd i */
+         for (i = 2; i < nr; i += 1) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, indices[i+(i&1)-1], stride),
+                             get_vert(vertex_buffer, indices[i-(i&1)], stride),
+                             get_vert(vertex_buffer, indices[i-2], stride) );
+         }
+      }
+      else {  /* (i&1) swaps the first two arguments on odd i */
+         for (i = 2; i < nr; i += 1) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, indices[i+(i&1)-2], stride),
+                             get_vert(vertex_buffer, indices[i-(i&1)-1], stride),
+                             get_vert(vertex_buffer, indices[i-0], stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLE_FAN:
+      if (setup->flatshade_first) {
+         for (i = 2; i < nr; i += 1) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, indices[i-0], stride),
+                             get_vert(vertex_buffer, indices[0], stride),
+                             get_vert(vertex_buffer, indices[i-1], stride) );
+         }
+      }
+      else {
+         for (i = 2; i < nr; i += 1) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, indices[0], stride),
+                             get_vert(vertex_buffer, indices[i-1], stride),
+                             get_vert(vertex_buffer, indices[i-0], stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_QUADS:  /* each quad is emitted as two triangles */
+      if (setup->flatshade_first) {
+         for (i = 3; i < nr; i += 4) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, indices[i-2], stride),
+                             get_vert(vertex_buffer, indices[i-1], stride),
+                             get_vert(vertex_buffer, indices[i-3], stride) );
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, indices[i-1], stride),
+                             get_vert(vertex_buffer, indices[i-0], stride),
+                             get_vert(vertex_buffer, indices[i-3], stride) );
+         }
+      }
+      else {
+         for (i = 3; i < nr; i += 4) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, indices[i-3], stride),
+                             get_vert(vertex_buffer, indices[i-2], stride),
+                             get_vert(vertex_buffer, indices[i-0], stride) );
+
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, indices[i-2], stride),
+                             get_vert(vertex_buffer, indices[i-1], stride),
+                             get_vert(vertex_buffer, indices[i-0], stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_QUAD_STRIP:  /* every two new vertices add a quad, emitted as two triangles */
+      if (setup->flatshade_first) {
+         for (i = 3; i < nr; i += 2) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, indices[i-0], stride),
+                             get_vert(vertex_buffer, indices[i-1], stride),
+                             get_vert(vertex_buffer, indices[i-3], stride));
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, indices[i-2], stride),
+                             get_vert(vertex_buffer, indices[i-0], stride),
+                             get_vert(vertex_buffer, indices[i-3], stride) );
+         }
+      }
+      else {
+         for (i = 3; i < nr; i += 2) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, indices[i-3], stride),
+                             get_vert(vertex_buffer, indices[i-2], stride),
+                             get_vert(vertex_buffer, indices[i-0], stride) );
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, indices[i-1], stride),
+                             get_vert(vertex_buffer, indices[i-3], stride),
+                             get_vert(vertex_buffer, indices[i-0], stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_POLYGON:
+      /* Almost same as tri fan but the _first_ vertex specifies the flat
+       * shading color.  Note that the first polygon vertex is passed as
+       * the last triangle vertex here.
+       * flatshade_first state makes no difference.
+       */
+      for (i = 2; i < nr; i += 1) {
+         setup->triangle( setup,
+                          get_vert(vertex_buffer, indices[i-0], stride),
+                          get_vert(vertex_buffer, indices[i-1], stride),
+                          get_vert(vertex_buffer, indices[0], stride) );
+      }
+      break;
+
+   default:
+      assert(0);  /* unhandled primitive type */
+   }
+}
+
+
+/**
+ * Non-indexed draw hook, hit when the draw module works in pass-through mode:
+ * emits vertices start..start+nr-1 as point/line/tri prims per setup->prim.
+ */
+static void
+lp_setup_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
+{
+   struct setup_context *setup = setup_context(vbr);
+   const unsigned stride = setup->vertex_info->size * sizeof(float);
+   const void *vertex_buffer =
+      (void *) get_vert(setup->vertex_buffer, start, stride); /* vertex 'start' */
+   unsigned i;
+   /* indices passed to get_vert() below are relative to that rebased pointer */
+   lp_setup_update_state(setup);   /* done before any primitive is emitted */
+   /* one case per primitive type; each emits via setup->point/line/triangle */
+   switch (setup->prim) {
+   case PIPE_PRIM_POINTS:          /* one point per vertex */
+      for (i = 0; i < nr; i++) {
+         setup->point( setup,
+                       get_vert(vertex_buffer, i-0, stride) );
+      }
+      break;
+
+   case PIPE_PRIM_LINES:           /* independent pairs: (0,1), (2,3), ... */
+      for (i = 1; i < nr; i += 2) {
+         setup->line( setup,
+                      get_vert(vertex_buffer, i-1, stride),
+                      get_vert(vertex_buffer, i-0, stride) );
+      }
+      break;
+
+   case PIPE_PRIM_LINE_STRIP:      /* each vertex joined to its predecessor */
+      for (i = 1; i < nr; i ++) {
+         setup->line( setup,
+                      get_vert(vertex_buffer, i-1, stride),
+                      get_vert(vertex_buffer, i-0, stride) );
+      }
+      break;
+
+   case PIPE_PRIM_LINE_LOOP:       /* strip, then a closing line nr-1 -> 0 */
+      for (i = 1; i < nr; i ++) {
+         setup->line( setup,
+                      get_vert(vertex_buffer, i-1, stride),
+                      get_vert(vertex_buffer, i-0, stride) );
+      }
+      if (nr) {                    /* no closing line for an empty draw */
+         setup->line( setup,
+                      get_vert(vertex_buffer, nr-1, stride),
+                      get_vert(vertex_buffer, 0, stride) );
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLES:       /* independent triples */
+      if (setup->flatshade_first) {   /* same triangle, vertex i-2 passed last */
+         for (i = 2; i < nr; i += 3) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, i-1, stride),
+                             get_vert(vertex_buffer, i-0, stride),
+                             get_vert(vertex_buffer, i-2, stride) );
+         }
+      }
+      else {
+         for (i = 2; i < nr; i += 3) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, i-2, stride),
+                             get_vert(vertex_buffer, i-1, stride),
+                             get_vert(vertex_buffer, i-0, stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLE_STRIP:  /* (i&1) swaps two of the vertices on odd i */
+      if (setup->flatshade_first) {
+         for (i = 2; i < nr; i++) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, i+(i&1)-1, stride),
+                             get_vert(vertex_buffer, i-(i&1), stride),
+                             get_vert(vertex_buffer, i-2, stride) );
+         }
+      }
+      else {
+         for (i = 2; i < nr; i++) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, i+(i&1)-2, stride),
+                             get_vert(vertex_buffer, i-(i&1)-1, stride),
+                             get_vert(vertex_buffer, i-0, stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLE_FAN:    /* every triangle includes vertex 0 */
+      if (setup->flatshade_first) {
+         for (i = 2; i < nr; i += 1) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, i-0, stride),
+                             get_vert(vertex_buffer, 0, stride),
+                             get_vert(vertex_buffer, i-1, stride) );
+         }
+      }
+      else {
+         for (i = 2; i < nr; i += 1) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, 0, stride),
+                             get_vert(vertex_buffer, i-1, stride),
+                             get_vert(vertex_buffer, i-0, stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_QUADS:           /* two triangles per group of four vertices */
+      if (setup->flatshade_first) {
+         for (i = 3; i < nr; i += 4) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, i-2, stride),
+                             get_vert(vertex_buffer, i-1, stride),
+                             get_vert(vertex_buffer, i-3, stride) );
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, i-1, stride),
+                             get_vert(vertex_buffer, i-0, stride),
+                             get_vert(vertex_buffer, i-3, stride) );
+         }
+      }
+      else {
+         for (i = 3; i < nr; i += 4) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, i-3, stride),
+                             get_vert(vertex_buffer, i-2, stride),
+                             get_vert(vertex_buffer, i-0, stride) );
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, i-2, stride),
+                             get_vert(vertex_buffer, i-1, stride),
+                             get_vert(vertex_buffer, i-0, stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_QUAD_STRIP:      /* two triangles per additional vertex pair */
+      if (setup->flatshade_first) {
+         for (i = 3; i < nr; i += 2) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, i-0, stride),
+                             get_vert(vertex_buffer, i-1, stride),
+                             get_vert(vertex_buffer, i-3, stride) );
+            setup->triangle( setup,
+
+                             get_vert(vertex_buffer, i-2, stride),
+                             get_vert(vertex_buffer, i-0, stride),
+                             get_vert(vertex_buffer, i-3, stride) );
+         }
+      }
+      else {
+         for (i = 3; i < nr; i += 2) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, i-3, stride),
+                             get_vert(vertex_buffer, i-2, stride),
+                             get_vert(vertex_buffer, i-0, stride) );
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, i-1, stride),
+                             get_vert(vertex_buffer, i-3, stride),
+                             get_vert(vertex_buffer, i-0, stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_POLYGON:
+      /* Almost same as tri fan but the _first_ vertex specifies the flat
+       * shading color.  Note that the first polygon vertex is passed as
+       * the last triangle vertex here.
+       * flatshade_first state makes no difference.
+       */
+      for (i = 2; i < nr; i += 1) {
+         setup->triangle( setup,
+                          get_vert(vertex_buffer, i-1, stride),
+                          get_vert(vertex_buffer, i-0, stride),
+                          get_vert(vertex_buffer, 0, stride) );
+      }
+      break;
+
+   default:                        /* any other setup->prim value is unexpected */
+      assert(0);
+   }
+}
+
+/** vbuf_render destroy hook: pass the owning setup context to lp_setup_destroy(). */
+static void
+lp_setup_vbuf_destroy(struct vbuf_render *vbr)
+{
+   struct setup_context *setup = setup_context(vbr);
+   lp_setup_destroy(setup);
+}
+
+
+/**
+ * Set up setup->base as the post-transform vertex handler: callbacks + limits.
+ */
+void
+lp_setup_init_vbuf(struct setup_context *setup)
+{
+   setup->base.destroy           = lp_setup_vbuf_destroy;
+   setup->base.get_vertex_info   = lp_setup_get_vertex_info;
+   setup->base.set_primitive     = lp_setup_set_primitive;
+   setup->base.allocate_vertices = lp_setup_allocate_vertices;
+   setup->base.release_vertices  = lp_setup_release_vertices;
+   setup->base.map_vertices      = lp_setup_map_vertices;
+   setup->base.unmap_vertices    = lp_setup_unmap_vertices;
+   setup->base.draw              = lp_setup_draw;
+   setup->base.draw_arrays       = lp_setup_draw_arrays;
+   /* size limits published through the same struct */
+   setup->base.max_indices             = LP_MAX_VBUF_INDEXES;
+   setup->base.max_vertex_buffer_bytes = LP_MAX_VBUF_SIZE;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_state.h b/src/gallium/drivers/llvmpipe/lp_state.h
index 7020da145f..8f68f12bed 100644
--- a/src/gallium/drivers/llvmpipe/lp_state.h
+++ b/src/gallium/drivers/llvmpipe/lp_state.h
@@ -36,7 +36,7 @@
 #include "pipe/p_state.h"
 #include "tgsi/tgsi_scan.h"
 #include "lp_jit.h"
-#include "lp_bld_sample.h" /* for struct lp_sampler_static_state */
+#include "gallivm/lp_bld_sample.h" /* for struct lp_sampler_static_state */
 
 
 #define LP_NEW_VIEWPORT      0x1
@@ -54,6 +54,7 @@
 #define LP_NEW_VERTEX        0x1000
 #define LP_NEW_VS            0x2000
 #define LP_NEW_QUERY         0x4000
+#define LP_NEW_BLEND_COLOR   0x8000
 
 
 struct vertex_info;
@@ -65,11 +66,18 @@ struct lp_fragment_shader;
 
 struct lp_fragment_shader_variant_key
 {
-   enum pipe_format zsbuf_format;
    struct pipe_depth_state depth;
    struct pipe_alpha_state alpha;
    struct pipe_blend_state blend;
-
+   enum pipe_format zsbuf_format;
+   unsigned nr_cbufs:8;
+   unsigned flatshade:1;
+   unsigned scissor:1;
+
+   struct {
+      ubyte colormask;
+   } cbuf_blend[PIPE_MAX_COLOR_BUFS];
+   
    struct lp_sampler_static_state sampler[PIPE_MAX_SAMPLERS];
 };
 
@@ -80,9 +88,9 @@ struct lp_fragment_shader_variant
 
    struct lp_fragment_shader_variant_key key;
 
-   LLVMValueRef function;
+   LLVMValueRef function[2];
 
-   lp_jit_frag_func jit_function;
+   lp_jit_frag_func jit_function[2];
 
    struct lp_fragment_shader_variant *next;
 };
@@ -154,7 +162,7 @@ void llvmpipe_set_clip_state( struct pipe_context *,
 
 void llvmpipe_set_constant_buffer(struct pipe_context *,
                                   uint shader, uint index,
-                                  const struct pipe_constant_buffer *buf);
+                                  struct pipe_buffer *buf);
 
 void *llvmpipe_create_fs_state(struct pipe_context *,
                                const struct pipe_shader_state *);
@@ -212,23 +220,10 @@ llvmpipe_draw_range_elements(struct pipe_context *pipe,
                              unsigned mode, unsigned start, unsigned count);
 
 void
-llvmpipe_map_transfers(struct llvmpipe_context *lp);
-
-void
-llvmpipe_unmap_transfers(struct llvmpipe_context *lp);
-
-void
 llvmpipe_map_texture_surfaces(struct llvmpipe_context *lp);
 
 void
 llvmpipe_unmap_texture_surfaces(struct llvmpipe_context *lp);
 
 
-struct vertex_info *
-llvmpipe_get_vertex_info(struct llvmpipe_context *llvmpipe);
-
-struct vertex_info *
-llvmpipe_get_vbuf_vertex_info(struct llvmpipe_context *llvmpipe);
-
-
 #endif
diff --git a/src/gallium/drivers/llvmpipe/lp_state_blend.c b/src/gallium/drivers/llvmpipe/lp_state_blend.c
index a94cd05ef2..9b950e82d8 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_blend.c
@@ -73,7 +73,9 @@ void llvmpipe_set_blend_color( struct pipe_context *pipe,
 			     const struct pipe_blend_color *blend_color )
 {
    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
-   unsigned i, j;
+
+   if(!blend_color)
+      return;
 
    if(memcmp(&llvmpipe->blend_color, blend_color, sizeof *blend_color) == 0)
       return;
@@ -82,13 +84,7 @@ void llvmpipe_set_blend_color( struct pipe_context *pipe,
 
    memcpy(&llvmpipe->blend_color, blend_color, sizeof *blend_color);
 
-   if(!llvmpipe->jit_context.blend_color)
-      llvmpipe->jit_context.blend_color = align_malloc(4 * 16, 16);
-   for (i = 0; i < 4; ++i) {
-      uint8_t c = float_to_ubyte(blend_color->color[i]);
-      for (j = 0; j < 16; ++j)
-         llvmpipe->jit_context.blend_color[i*16 + j] = c;
-   }
+   llvmpipe->dirty |= LP_NEW_BLEND_COLOR;
 }
 
 
@@ -117,9 +113,6 @@ llvmpipe_bind_depth_stencil_state(struct pipe_context *pipe,
 
    llvmpipe->depth_stencil = depth_stencil;
 
-   if(llvmpipe->depth_stencil)
-      llvmpipe->jit_context.alpha_ref_value = llvmpipe->depth_stencil->alpha.ref_value;
-
    llvmpipe->dirty |= LP_NEW_DEPTH_STENCIL_ALPHA;
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index 6c1ef6bc42..bdd906e1a7 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -33,166 +33,113 @@
 #include "draw/draw_private.h"
 #include "lp_context.h"
 #include "lp_screen.h"
-#include "lp_tex_cache.h"
+#include "lp_setup.h"
 #include "lp_state.h"
 
 
-/**
- * Mark the current vertex layout as "invalid".
- * We'll validate the vertex layout later, when we start to actually
- * render a point or line or tri.
- */
-static void
-invalidate_vertex_layout(struct llvmpipe_context *llvmpipe)
-{
-   llvmpipe->vertex_info.num_attribs =  0;
-}
-
 
 /**
  * The vertex info describes how to convert the post-transformed vertices
  * (simple float[][4]) used by the 'draw' module into vertices for
  * rasterization.
  *
- * This function validates the vertex layout and returns a pointer to a
- * vertex_info object.
+ * This function validates the vertex layout.
  */
-struct vertex_info *
-llvmpipe_get_vertex_info(struct llvmpipe_context *llvmpipe)
+static void
+compute_vertex_info(struct llvmpipe_context *llvmpipe)
 {
+   const struct lp_fragment_shader *lpfs = llvmpipe->fs;
    struct vertex_info *vinfo = &llvmpipe->vertex_info;
+   const uint num = draw_num_shader_outputs(llvmpipe->draw);
+   uint i;
 
-   if (vinfo->num_attribs == 0) {
-      /* compute vertex layout now */
-      const struct lp_fragment_shader *lpfs = llvmpipe->fs;
-      struct vertex_info *vinfo_vbuf = &llvmpipe->vertex_info_vbuf;
-      const uint num = draw_current_shader_outputs(llvmpipe->draw);
-      uint i;
-
-      /* Tell draw_vbuf to simply emit the whole post-xform vertex
-       * as-is.  No longer any need to try and emit draw vertex_header
-       * info.
-       */
-      vinfo_vbuf->num_attribs = 0;
-      for (i = 0; i < num; i++) {
-	 draw_emit_vertex_attr(vinfo_vbuf, EMIT_4F, INTERP_PERSPECTIVE, i);
-      }
-      draw_compute_vertex_size(vinfo_vbuf);
+   /* Tell setup to tell the draw module to simply emit the whole
+    * post-xform vertex as-is.
+    *
+    * Not really sure if this is the best approach.
+    */
+   vinfo->num_attribs = 0;
+   for (i = 0; i < num; i++) {
+      draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, i);
+   }
+   draw_compute_vertex_size(vinfo);
 
-      /*
-       * Loop over fragment shader inputs, searching for the matching output
-       * from the vertex shader.
-       */
-      vinfo->num_attribs = 0;
-      for (i = 0; i < lpfs->info.num_inputs; i++) {
-         int src;
-         enum interp_mode interp;
 
-         switch (lpfs->info.input_interpolate[i]) {
-         case TGSI_INTERPOLATE_CONSTANT:
-            interp = INTERP_CONSTANT;
-            break;
-         case TGSI_INTERPOLATE_LINEAR:
-            interp = INTERP_LINEAR;
-            break;
-         case TGSI_INTERPOLATE_PERSPECTIVE:
-            interp = INTERP_PERSPECTIVE;
-            break;
-         default:
-            assert(0);
-            interp = INTERP_LINEAR;
-         }
+   lp_setup_set_vertex_info(llvmpipe->setup, vinfo);
+
+/*
+   llvmpipe->psize_slot = draw_find_vs_output(llvmpipe->draw,
+                                              TGSI_SEMANTIC_PSIZE, 0);
+*/
+
+   /* Now match FS inputs against emitted vertex data.  It's also
+    * entirely possible to just have a fixed layout for FS input,
+    * determined by the fragment shader itself, and adjust the draw
+    * outputs to match that.
+    */
+   {
+      struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS];
 
+      for (i = 0; i < lpfs->info.num_inputs; i++) {
+
+         /* This can be precomputed, except for flatshade:
+          */
          switch (lpfs->info.input_semantic_name[i]) {
+         case TGSI_SEMANTIC_FACE:
+            inputs[i].interp = LP_INTERP_FACING;
+            break;
          case TGSI_SEMANTIC_POSITION:
-            interp = INTERP_POS;
+            inputs[i].interp = LP_INTERP_POSITION;
             break;
-
          case TGSI_SEMANTIC_COLOR:
-            if (llvmpipe->rasterizer->flatshade) {
-               interp = INTERP_CONSTANT;
-            }
+            /* Colors are linearly interpolated in the fragment shader
+             * even when flatshading is active.  This just tells the
+             * setup module to use coefficients with ddx==0 and
+             * ddy==0.
+             */
+            if (llvmpipe->rasterizer->flatshade)
+               inputs[i].interp = LP_INTERP_CONSTANT;
+            else
+               inputs[i].interp = LP_INTERP_LINEAR;
             break;
-         }
 
-         /* this includes texcoords and varying vars */
-         src = draw_find_shader_output(llvmpipe->draw,
-                                   lpfs->info.input_semantic_name[i],
-                                   lpfs->info.input_semantic_index[i]);
-         draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src);
-      }
+         default:
+            switch (lpfs->info.input_interpolate[i]) {
+            case TGSI_INTERPOLATE_CONSTANT:
+               inputs[i].interp = LP_INTERP_CONSTANT;
+               break;
+            case TGSI_INTERPOLATE_LINEAR:
+               inputs[i].interp = LP_INTERP_LINEAR;
+               break;
+            case TGSI_INTERPOLATE_PERSPECTIVE:
+               inputs[i].interp = LP_INTERP_PERSPECTIVE;
+               break;
+            default:
+               assert(0);
+               break;
+            }
+         }
 
-      llvmpipe->psize_slot = draw_find_shader_output(llvmpipe->draw,
-                                                 TGSI_SEMANTIC_PSIZE, 0);
-      if (llvmpipe->psize_slot > 0) {
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT,
-                               llvmpipe->psize_slot);
+         /* Search for each input in current vs output:
+          */
+         inputs[i].src_index = 
+            draw_find_shader_output(llvmpipe->draw,
+                                    lpfs->info.input_semantic_name[i],
+                                    lpfs->info.input_semantic_index[i]);
       }
 
-      draw_compute_vertex_size(vinfo);
+      lp_setup_set_fs_inputs(llvmpipe->setup, 
+                             inputs,
+                             lpfs->info.num_inputs);
    }
-
-   return vinfo;
 }
 
 
 /**
- * Called from vbuf module.
+ * Handle state changes.
+ * Called just prior to drawing anything (pipe::draw_arrays(), etc).
  *
- * Note that there's actually two different vertex layouts in llvmpipe.
- *
- * The normal one is computed in llvmpipe_get_vertex_info() above and is
- * used by the point/line/tri "setup" code.
- *
- * The other one (this one) is only used by the vbuf module (which is
- * not normally used by default but used in testing).  For the vbuf module,
- * we basically want to pass-through the draw module's vertex layout as-is.
- * When the llvmpipe vbuf code begins drawing, the normal vertex layout
- * will come into play again.
- */
-struct vertex_info *
-llvmpipe_get_vbuf_vertex_info(struct llvmpipe_context *llvmpipe)
-{
-   (void) llvmpipe_get_vertex_info(llvmpipe);
-   return &llvmpipe->vertex_info_vbuf;
-}
-
-
-/**
- * Recompute cliprect from scissor bounds, scissor enable and surface size.
- */
-static void
-compute_cliprect(struct llvmpipe_context *lp)
-{
-   /* LP_NEW_FRAMEBUFFER
-    */
-   uint surfWidth = lp->framebuffer.width;
-   uint surfHeight = lp->framebuffer.height;
-
-   /* LP_NEW_RASTERIZER
-    */
-   if (lp->rasterizer->scissor) {
-
-      /* LP_NEW_SCISSOR
-       *
-       * clip to scissor rect:
-       */
-      lp->cliprect.minx = MAX2(lp->scissor.minx, 0);
-      lp->cliprect.miny = MAX2(lp->scissor.miny, 0);
-      lp->cliprect.maxx = MIN2(lp->scissor.maxx, surfWidth);
-      lp->cliprect.maxy = MIN2(lp->scissor.maxy, surfHeight);
-   }
-   else {
-      /* clip to surface bounds */
-      lp->cliprect.minx = 0;
-      lp->cliprect.miny = 0;
-      lp->cliprect.maxx = surfWidth;
-      lp->cliprect.maxy = surfHeight;
-   }
-}
-
-
-/* Hopefully this will remain quite simple, otherwise need to pull in
+ * Hopefully this will remain quite simple, otherwise need to pull in
  * something like the state tracker mechanism.
  */
 void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe )
@@ -206,28 +153,40 @@ void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe )
       llvmpipe->dirty |= LP_NEW_TEXTURE;
    }
       
-   if (llvmpipe->dirty & (LP_NEW_SAMPLER |
-                          LP_NEW_TEXTURE)) {
-      /* TODO */
-   }
-
    if (llvmpipe->dirty & (LP_NEW_RASTERIZER |
                           LP_NEW_FS |
                           LP_NEW_VS))
-      invalidate_vertex_layout( llvmpipe );
-
-   if (llvmpipe->dirty & (LP_NEW_SCISSOR |
-                          LP_NEW_RASTERIZER |
-                          LP_NEW_FRAMEBUFFER))
-      compute_cliprect(llvmpipe);
+      compute_vertex_info( llvmpipe );
 
    if (llvmpipe->dirty & (LP_NEW_FS |
                           LP_NEW_BLEND |
+                          LP_NEW_SCISSOR |
                           LP_NEW_DEPTH_STENCIL_ALPHA |
+                          LP_NEW_RASTERIZER |
                           LP_NEW_SAMPLER |
                           LP_NEW_TEXTURE))
       llvmpipe_update_fs( llvmpipe );
 
+   if (llvmpipe->dirty & LP_NEW_BLEND_COLOR)
+      lp_setup_set_blend_color(llvmpipe->setup,
+                               &llvmpipe->blend_color);
+
+   if (llvmpipe->dirty & LP_NEW_SCISSOR)
+      lp_setup_set_scissor(llvmpipe->setup, &llvmpipe->scissor);
+
+   if (llvmpipe->dirty & LP_NEW_DEPTH_STENCIL_ALPHA)
+      lp_setup_set_alpha_ref_value(llvmpipe->setup, 
+                                   llvmpipe->depth_stencil->alpha.ref_value);
+
+   if (llvmpipe->dirty & LP_NEW_CONSTANTS)
+      lp_setup_set_fs_constants(llvmpipe->setup, 
+                                llvmpipe->constants[PIPE_SHADER_FRAGMENT]);
+
+   if (llvmpipe->dirty & LP_NEW_TEXTURE)
+      lp_setup_set_sampler_textures(llvmpipe->setup, 
+                                    llvmpipe->num_textures,
+                                    llvmpipe->texture);
 
    llvmpipe->dirty = 0;
 }
+
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index b73ca2d41e..15c10d8e2e 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -31,6 +31,8 @@
  * Code generate the whole fragment pipeline.
  *
  * The fragment pipeline consists of the following stages:
+ * - triangle edge in/out testing
+ * - scissor test
  * - stipple (TBI)
  * - early depth test
  * - fragment shader
@@ -58,36 +60,39 @@
  * @author Jose Fonseca <jfonseca@vmware.com>
  */
 
+#include <limits.h>
 #include "pipe/p_defines.h"
+#include "util/u_inlines.h"
 #include "util/u_memory.h"
 #include "util/u_format.h"
 #include "util/u_debug_dump.h"
-#include "pipe/internal/p_winsys_screen.h"
+#include "os/os_time.h"
 #include "pipe/p_shader_tokens.h"
 #include "draw/draw_context.h"
 #include "tgsi/tgsi_dump.h"
 #include "tgsi/tgsi_scan.h"
 #include "tgsi/tgsi_parse.h"
-#include "lp_bld_type.h"
-#include "lp_bld_const.h"
-#include "lp_bld_conv.h"
-#include "lp_bld_intr.h"
-#include "lp_bld_logic.h"
-#include "lp_bld_depth.h"
-#include "lp_bld_interp.h"
-#include "lp_bld_tgsi.h"
-#include "lp_bld_alpha.h"
-#include "lp_bld_blend.h"
-#include "lp_bld_swizzle.h"
-#include "lp_bld_flow.h"
-#include "lp_bld_debug.h"
-#include "lp_screen.h"
-#include "lp_context.h"
+#include "gallivm/lp_bld_type.h"
+#include "gallivm/lp_bld_const.h"
+#include "gallivm/lp_bld_conv.h"
+#include "gallivm/lp_bld_intr.h"
+#include "gallivm/lp_bld_logic.h"
+#include "gallivm/lp_bld_depth.h"
+#include "gallivm/lp_bld_interp.h"
+#include "gallivm/lp_bld_tgsi.h"
+#include "gallivm/lp_bld_alpha.h"
+#include "gallivm/lp_bld_blend.h"
+#include "gallivm/lp_bld_swizzle.h"
+#include "gallivm/lp_bld_flow.h"
+#include "gallivm/lp_bld_debug.h"
 #include "lp_buffer.h"
+#include "lp_context.h"
+#include "lp_debug.h"
+#include "lp_perf.h"
+#include "lp_screen.h"
+#include "lp_setup.h"
 #include "lp_state.h"
-#include "lp_quad.h"
 #include "lp_tex_sample.h"
-#include "lp_debug.h"
 
 
 static const unsigned char quad_offset_x[4] = {0, 1, 0, 1};
@@ -187,7 +192,187 @@ generate_depth(LLVMBuilderRef builder,
 
 
 /**
+ * Generate the code to do inside/outside triangle testing for the
+ * four pixels in a 2x2 quad.  This will set the four elements of the
+ * quad mask vector to 0 or ~0.
+ * \param i  which quad of the quad group to test, in [0,3]
+ */
+static void
+generate_tri_edge_mask(LLVMBuilderRef builder,
+                       unsigned i,              /* quad index, asserted < 4 */
+                       LLVMValueRef *mask,      /* ivec4, out */
+                       LLVMValueRef c0,         /* int32 */
+                       LLVMValueRef c1,         /* int32 */
+                       LLVMValueRef c2,         /* int32 */
+                       LLVMValueRef step0_ptr,  /* ivec4, element i is loaded */
+                       LLVMValueRef step1_ptr,  /* ivec4, element i is loaded */
+                       LLVMValueRef step2_ptr)  /* ivec4, element i is loaded */
+{
+#define OPTIMIZE_IN_OUT_TEST 0
+#if OPTIMIZE_IN_OUT_TEST
+   struct lp_build_if_state ifctx;
+   LLVMValueRef not_draw_all;
+#endif
+   struct lp_build_flow_context *flow;
+   struct lp_type i32_type;
+   LLVMTypeRef i32vec4_type;
+   LLVMValueRef c0_vec, c1_vec, c2_vec;
+   LLVMValueRef in_out_mask;
+
+   assert(i < 4);
+
+   /* int32 vector type */
+   memset(&i32_type, 0, sizeof i32_type);
+   i32_type.floating = FALSE; /* values are integers */
+   i32_type.sign = TRUE;      /* values are signed */
+   i32_type.norm = FALSE;     /* values are not normalized */
+   i32_type.width = 32;       /* 32-bit int values */
+   i32_type.length = 4;       /* 4 elements per vector */
+
+   /* vector type the scalar c0/c1/c2 values are broadcast to below */
+   i32vec4_type = lp_build_int32_vec4_type();
+
+   /*
+    * The per-pixel in/out test below is always emitted.  Only when
+    * OPTIMIZE_IN_OUT_TEST is non-zero is it wrapped in a conditional so
+    * that it is skipped if c0 == INT_MIN.
+    */
+   flow = lp_build_flow_create(builder);
+   lp_build_flow_scope_begin(flow);
+
+   {
+#if OPTIMIZE_IN_OUT_TEST
+      /* not_draw_all = (c0 != INT_MIN) */
+      not_draw_all = LLVMBuildICmp(builder,
+                                   LLVMIntNE,
+                                   c0,
+                                   LLVMConstInt(LLVMInt32Type(), INT_MIN, 0),
+                                   "");
+
+      in_out_mask = lp_build_int_const_scalar(i32_type, ~0);
+
+
+      lp_build_flow_scope_declare(flow, &in_out_mask);
+
+      /* if (not_draw_all) {... */
+      lp_build_if(&ifctx, flow, builder, not_draw_all);
+#endif
+      {
+         LLVMValueRef step0_vec, step1_vec, step2_vec;
+         LLVMValueRef m0_vec, m1_vec, m2_vec;
+         LLVMValueRef index, m;
+
+         /* c0_vec = {c0, c0, c0, c0}
+          * Note that we emit this code four times but LLVM optimizes away
+          * three instances of it.
+          */
+         c0_vec = lp_build_broadcast(builder, i32vec4_type, c0);
+         c1_vec = lp_build_broadcast(builder, i32vec4_type, c1);
+         c2_vec = lp_build_broadcast(builder, i32vec4_type, c2);
+         lp_build_name(c0_vec, "edgeconst0vec");
+         lp_build_name(c1_vec, "edgeconst1vec");
+         lp_build_name(c2_vec, "edgeconst2vec");
+
+         /* load step0vec, step1, step2 vec from memory */
+         index = LLVMConstInt(LLVMInt32Type(), i, 0);
+         step0_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step0_ptr, &index, 1, ""), "");
+         step1_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step1_ptr, &index, 1, ""), "");
+         step2_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step2_ptr, &index, 1, ""), "");
+         lp_build_name(step0_vec, "step0vec");
+         lp_build_name(step1_vec, "step1vec");
+         lp_build_name(step2_vec, "step2vec");
+
+         /* m0_vec = step0_ptr[i] > c0_vec */
+         m0_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step0_vec, c0_vec);
+         m1_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step1_vec, c1_vec);
+         m2_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step2_vec, c2_vec);
+
+         /* in_out_mask = m0_vec & m1_vec & m2_vec */
+         m = LLVMBuildAnd(builder, m0_vec, m1_vec, "");
+         in_out_mask = LLVMBuildAnd(builder, m, m2_vec, "");
+         lp_build_name(in_out_mask, "inoutmaskvec");
+      }
+#if OPTIMIZE_IN_OUT_TEST
+      lp_build_endif(&ifctx);
+#endif
+
+   }
+   lp_build_flow_scope_end(flow);
+   lp_build_flow_destroy(flow);
+
+   /* This is the initial alive/dead pixel mask for a quad of four pixels.
+    * It's an int[4] vector with each word set to 0 or ~0.
+    * Words will get cleared when pixels fail the Z test, etc.
+    */
+   *mask = in_out_mask;
+}
+
+
+static LLVMValueRef
+generate_scissor_test(LLVMBuilderRef builder,
+                      LLVMValueRef context_ptr,
+                      const struct lp_build_interp_soa_context *interp,
+                      struct lp_type type)
+{
+   /* Emits the comparisons of interp->pos[0]/pos[1] against the scissor bounds
+    * read through context_ptr, and returns the AND of the four results. */
+   LLVMTypeRef vec_type;
+   LLVMValueRef px, py;                  /* window coords of the quad's pixels */
+   LLVMValueRef lo_x, lo_y, hi_x, hi_y;  /* scissor bounds, broadcast to vectors */
+   LLVMValueRef ge_lo_x, ge_lo_y, lt_hi_x, lt_hi_y, inside;
+
+   vec_type = lp_build_vec_type(type);
+   px = interp->pos[0];
+   py = interp->pos[1];
+   assert(px);   /* both position values must be non-NULL */
+   assert(py);
+
+   /* read each bound from the context, then broadcast it to vec_type */
+   lo_x = lp_jit_context_scissor_xmin_value(builder, context_ptr);
+   lo_x = lp_build_broadcast(builder, vec_type, lo_x);
+   lo_y = lp_jit_context_scissor_ymin_value(builder, context_ptr);
+   lo_y = lp_build_broadcast(builder, vec_type, lo_y);
+   hi_x = lp_jit_context_scissor_xmax_value(builder, context_ptr);
+   hi_x = lp_build_broadcast(builder, vec_type, hi_x);
+   hi_y = lp_jit_context_scissor_ymax_value(builder, context_ptr);
+   hi_y = lp_build_broadcast(builder, vec_type, hi_y);
+
+   /* a pixel passes when min <= pos < max holds on both axes */
+   ge_lo_x = lp_build_compare(builder, type, PIPE_FUNC_GEQUAL, px, lo_x);
+   ge_lo_y = lp_build_compare(builder, type, PIPE_FUNC_GEQUAL, py, lo_y);
+   lt_hi_x = lp_build_compare(builder, type, PIPE_FUNC_LESS, px, hi_x);
+   lt_hi_y = lp_build_compare(builder, type, PIPE_FUNC_LESS, py, hi_y);
+
+   /* fold the four comparison results into a single mask */
+   inside = LLVMBuildAnd(builder, ge_lo_x, ge_lo_y, "");
+   inside = LLVMBuildAnd(builder, inside, lt_hi_x, "");
+   inside = LLVMBuildAnd(builder, inside, lt_hi_y, "");
+   lp_build_name(inside, "scissormask");
+
+   return inside;
+}
+
+
+static LLVMValueRef
+build_int32_vec_const(int value)
+{
+   struct lp_type vec4_i32;
+   /* zero everything, then describe 4 x signed, non-normalized 32-bit ints */
+   memset(&vec4_i32, 0, sizeof vec4_i32);
+   vec4_i32.width = 32;
+   vec4_i32.length = 4;
+   vec4_i32.sign = TRUE;
+   vec4_i32.norm = FALSE;
+   vec4_i32.floating = FALSE;
+   return lp_build_int_const_scalar(vec4_i32, value);
+}
+
+
+
+/**
  * Generate the fragment shader, depth/stencil test, and alpha tests.
+ * \param i  which quad in the tile, in range [0,3]
+ * \param do_tri_test  if 1, do triangle edge in/out testing
  */
 static void
 generate_fs(struct llvmpipe_context *lp,
@@ -200,8 +385,15 @@ generate_fs(struct llvmpipe_context *lp,
             const struct lp_build_interp_soa_context *interp,
             struct lp_build_sampler_soa *sampler,
             LLVMValueRef *pmask,
-            LLVMValueRef *color,
-            LLVMValueRef depth_ptr)
+            LLVMValueRef (*color)[4],
+            LLVMValueRef depth_ptr,
+            unsigned do_tri_test,
+            LLVMValueRef c0,
+            LLVMValueRef c1,
+            LLVMValueRef c2,
+            LLVMValueRef step0_ptr,
+            LLVMValueRef step1_ptr,
+            LLVMValueRef step2_ptr)
 {
    const struct tgsi_token *tokens = shader->base.tokens;
    LLVMTypeRef elem_type;
@@ -215,6 +407,9 @@ generate_fs(struct llvmpipe_context *lp,
    boolean early_depth_test;
    unsigned attrib;
    unsigned chan;
+   unsigned cbuf;
+
+   assert(i < 4);
 
    elem_type = lp_build_elem_type(type);
    vec_type = lp_build_vec_type(type);
@@ -229,14 +424,32 @@ generate_fs(struct llvmpipe_context *lp,
    lp_build_flow_scope_begin(flow);
 
    /* Declare the color and z variables */
-   for(chan = 0; chan < NUM_CHANNELS; ++chan) {
-      color[chan] = LLVMGetUndef(vec_type);
-      lp_build_flow_scope_declare(flow, &color[chan]);
+   for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
+      for(chan = 0; chan < NUM_CHANNELS; ++chan) {
+	 color[cbuf][chan] = LLVMGetUndef(vec_type);
+	 lp_build_flow_scope_declare(flow, &color[cbuf][chan]);
+      }
    }
    lp_build_flow_scope_declare(flow, &z);
 
+   /* do triangle edge testing */
+   if (do_tri_test) {
+      generate_tri_edge_mask(builder, i, pmask,
+                             c0, c1, c2, step0_ptr, step1_ptr, step2_ptr);
+   }
+   else {
+      *pmask = build_int32_vec_const(~0);
+   }
+
+   /* 'mask' will control execution based on quad's pixel alive/killed state */
    lp_build_mask_begin(&mask, flow, type, *pmask);
 
+   if (key->scissor) {
+      LLVMValueRef smask =
+         generate_scissor_test(builder, context_ptr, interp, type);
+      lp_build_mask_update(&mask, smask);
+   }
+
    early_depth_test =
       key->depth.enabled &&
       !key->alpha.enabled &&
@@ -255,19 +468,21 @@ generate_fs(struct llvmpipe_context *lp,
    for (attrib = 0; attrib < shader->info.num_outputs; ++attrib) {
       for(chan = 0; chan < NUM_CHANNELS; ++chan) {
          if(outputs[attrib][chan]) {
-            lp_build_name(outputs[attrib][chan], "output%u.%u.%c", i, attrib, "xyzw"[chan]);
+            LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], "");
+            lp_build_name(out, "output%u.%u.%c", i, attrib, "xyzw"[chan]);
 
             switch (shader->info.output_semantic_name[attrib]) {
             case TGSI_SEMANTIC_COLOR:
                {
                   unsigned cbuf = shader->info.output_semantic_index[attrib];
 
-                  lp_build_name(outputs[attrib][chan], "color%u.%u.%c", i, attrib, "rgba"[chan]);
+                  lp_build_name(out, "color%u.%u.%c", i, attrib, "rgba"[chan]);
 
                   /* Alpha test */
                   /* XXX: should the alpha reference value be passed separately? */
+		  /* XXX: should only test the final assignment to alpha */
                   if(cbuf == 0 && chan == 3) {
-                     LLVMValueRef alpha = outputs[attrib][chan];
+                     LLVMValueRef alpha = out;
                      LLVMValueRef alpha_ref_value;
                      alpha_ref_value = lp_jit_context_alpha_ref_value(builder, context_ptr);
                      alpha_ref_value = lp_build_broadcast(builder, vec_type, alpha_ref_value);
@@ -275,15 +490,13 @@ generate_fs(struct llvmpipe_context *lp,
                                          &mask, alpha, alpha_ref_value);
                   }
 
-                  if(cbuf == 0)
-                     color[chan] = outputs[attrib][chan];
-
+		  color[cbuf][chan] = out;
                   break;
                }
 
             case TGSI_SEMANTIC_POSITION:
                if(chan == 2)
-                  z = outputs[attrib][chan];
+                  z = out;
                break;
             }
          }
@@ -332,6 +545,8 @@ generate_blend(const struct pipe_blend_state *blend,
    lp_build_context_init(&bld, builder, type);
 
    flow = lp_build_flow_create(builder);
+
+   /* we'll use this mask context to skip blending if all pixels are dead */
    lp_build_mask_begin(&mask_ctx, flow, type, mask);
 
    vec_type = lp_build_vec_type(type);
@@ -354,7 +569,7 @@ generate_blend(const struct pipe_blend_state *blend,
    lp_build_blend_soa(builder, blend, type, src, dst, con, res);
 
    for(chan = 0; chan < 4; ++chan) {
-      if(blend->colormask & (1 << chan)) {
+      if(blend->rt[0].colormask & (1 << chan)) {
          LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), chan, 0);
          lp_build_name(res[chan], "res.%c", "rgba"[chan]);
          res[chan] = lp_build_select(&bld, mask, res[chan], dst[chan]);
@@ -369,14 +584,18 @@ generate_blend(const struct pipe_blend_state *blend,
 
 /**
  * Generate the runtime callable function for the whole fragment pipeline.
+ * Note that the function which we generate operates on a block of 16
+ * pixels at a time.  The block contains 2x2 quads.  Each quad contains
+ * 2x2 pixels.
  */
-static struct lp_fragment_shader_variant *
+static void
 generate_fragment(struct llvmpipe_context *lp,
                   struct lp_fragment_shader *shader,
-                  const struct lp_fragment_shader_variant_key *key)
+                  struct lp_fragment_shader_variant *variant,
+                  unsigned do_tri_test)
 {
    struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
-   struct lp_fragment_shader_variant *variant;
+   const struct lp_fragment_shader_variant_key *key = &variant->key;
    struct lp_type fs_type;
    struct lp_type blend_type;
    LLVMTypeRef fs_elem_type;
@@ -384,17 +603,18 @@ generate_fragment(struct llvmpipe_context *lp,
    LLVMTypeRef fs_int_vec_type;
    LLVMTypeRef blend_vec_type;
    LLVMTypeRef blend_int_vec_type;
-   LLVMTypeRef arg_types[9];
+   LLVMTypeRef arg_types[14];
    LLVMTypeRef func_type;
+   LLVMTypeRef int32_vec4_type = lp_build_int32_vec4_type();
    LLVMValueRef context_ptr;
    LLVMValueRef x;
    LLVMValueRef y;
    LLVMValueRef a0_ptr;
    LLVMValueRef dadx_ptr;
    LLVMValueRef dady_ptr;
-   LLVMValueRef mask_ptr;
-   LLVMValueRef color_ptr;
+   LLVMValueRef color_ptr_ptr;
    LLVMValueRef depth_ptr;
+   LLVMValueRef c0, c1, c2, step0_ptr, step1_ptr, step2_ptr;
    LLVMBasicBlockRef block;
    LLVMBuilderRef builder;
    LLVMValueRef x0;
@@ -402,71 +622,15 @@ generate_fragment(struct llvmpipe_context *lp,
    struct lp_build_sampler_soa *sampler;
    struct lp_build_interp_soa_context interp;
    LLVMValueRef fs_mask[LP_MAX_VECTOR_LENGTH];
-   LLVMValueRef fs_out_color[NUM_CHANNELS][LP_MAX_VECTOR_LENGTH];
+   LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS][LP_MAX_VECTOR_LENGTH];
    LLVMValueRef blend_mask;
    LLVMValueRef blend_in_color[NUM_CHANNELS];
+   LLVMValueRef function;
    unsigned num_fs;
    unsigned i;
    unsigned chan;
+   unsigned cbuf;
 
-   if (LP_DEBUG & DEBUG_JIT) {
-      tgsi_dump(shader->base.tokens, 0);
-      if(key->depth.enabled) {
-         debug_printf("depth.format = %s\n", pf_name(key->zsbuf_format));
-         debug_printf("depth.func = %s\n", debug_dump_func(key->depth.func, TRUE));
-         debug_printf("depth.writemask = %u\n", key->depth.writemask);
-      }
-      if(key->alpha.enabled) {
-         debug_printf("alpha.func = %s\n", debug_dump_func(key->alpha.func, TRUE));
-         debug_printf("alpha.ref_value = %f\n", key->alpha.ref_value);
-      }
-      if(key->blend.logicop_enable) {
-         debug_printf("blend.logicop_func = %u\n", key->blend.logicop_func);
-      }
-      else if(key->blend.blend_enable) {
-         debug_printf("blend.rgb_func = %s\n",   debug_dump_blend_func  (key->blend.rgb_func, TRUE));
-         debug_printf("rgb_src_factor = %s\n",   debug_dump_blend_factor(key->blend.rgb_src_factor, TRUE));
-         debug_printf("rgb_dst_factor = %s\n",   debug_dump_blend_factor(key->blend.rgb_dst_factor, TRUE));
-         debug_printf("alpha_func = %s\n",       debug_dump_blend_func  (key->blend.alpha_func, TRUE));
-         debug_printf("alpha_src_factor = %s\n", debug_dump_blend_factor(key->blend.alpha_src_factor, TRUE));
-         debug_printf("alpha_dst_factor = %s\n", debug_dump_blend_factor(key->blend.alpha_dst_factor, TRUE));
-      }
-      debug_printf("blend.colormask = 0x%x\n", key->blend.colormask);
-      for(i = 0; i < PIPE_MAX_SAMPLERS; ++i) {
-         if(key->sampler[i].format) {
-            debug_printf("sampler[%u] = \n", i);
-            debug_printf("  .format = %s\n",
-                         pf_name(key->sampler[i].format));
-            debug_printf("  .target = %s\n",
-                         debug_dump_tex_target(key->sampler[i].target, TRUE));
-            debug_printf("  .pot = %u %u %u\n",
-                         key->sampler[i].pot_width,
-                         key->sampler[i].pot_height,
-                         key->sampler[i].pot_depth);
-            debug_printf("  .wrap = %s %s %s\n",
-                         debug_dump_tex_wrap(key->sampler[i].wrap_s, TRUE),
-                         debug_dump_tex_wrap(key->sampler[i].wrap_t, TRUE),
-                         debug_dump_tex_wrap(key->sampler[i].wrap_r, TRUE));
-            debug_printf("  .min_img_filter = %s\n",
-                         debug_dump_tex_filter(key->sampler[i].min_img_filter, TRUE));
-            debug_printf("  .min_mip_filter = %s\n",
-                         debug_dump_tex_mipfilter(key->sampler[i].min_mip_filter, TRUE));
-            debug_printf("  .mag_img_filter = %s\n",
-                         debug_dump_tex_filter(key->sampler[i].mag_img_filter, TRUE));
-            if(key->sampler[i].compare_mode != PIPE_TEX_COMPARE_NONE)
-               debug_printf("  .compare_func = %s\n", debug_dump_func(key->sampler[i].compare_func, TRUE));
-            debug_printf("  .normalized_coords = %u\n", key->sampler[i].normalized_coords);
-            debug_printf("  .prefilter = %u\n", key->sampler[i].prefilter);
-         }
-      }
-   }
-
-   variant = CALLOC_STRUCT(lp_fragment_shader_variant);
-   if(!variant)
-      return NULL;
-
-   variant->shader = shader;
-   memcpy(&variant->key, key, sizeof *key);
 
    /* TODO: actually pick these based on the fs and color buffer
     * characteristics. */
@@ -476,8 +640,8 @@ generate_fragment(struct llvmpipe_context *lp,
    fs_type.sign = TRUE;     /* values are signed */
    fs_type.norm = FALSE;    /* values are not limited to [0,1] or [-1,1] */
    fs_type.width = 32;      /* 32-bit float */
-   fs_type.length = 4;      /* 4 element per vector */
-   num_fs = 4;
+   fs_type.length = 4;      /* 4 elements per vector */
+   num_fs = 4;              /* number of quads per block */
 
    memset(&blend_type, 0, sizeof blend_type);
    blend_type.floating = FALSE; /* values are integers */
@@ -504,27 +668,47 @@ generate_fragment(struct llvmpipe_context *lp,
    arg_types[3] = LLVMPointerType(fs_elem_type, 0);    /* a0 */
    arg_types[4] = LLVMPointerType(fs_elem_type, 0);    /* dadx */
    arg_types[5] = LLVMPointerType(fs_elem_type, 0);    /* dady */
-   arg_types[6] = LLVMPointerType(fs_int_vec_type, 0); /* mask */
-   arg_types[7] = LLVMPointerType(blend_vec_type, 0);  /* color */
-   arg_types[8] = LLVMPointerType(fs_int_vec_type, 0); /* depth */
+   arg_types[6] = LLVMPointerType(LLVMPointerType(blend_vec_type, 0), 0);  /* color */
+   arg_types[7] = LLVMPointerType(fs_int_vec_type, 0); /* depth */
+   arg_types[8] = LLVMInt32Type();                     /* c0 */
+   arg_types[9] = LLVMInt32Type();                     /* c1 */
+   arg_types[10] = LLVMInt32Type();                    /* c2 */
+   /* Note: the step arrays are built as int32[16] but we interpret
+    * them here as int32_vec4[4].
+    */
+   arg_types[11] = LLVMPointerType(int32_vec4_type, 0);/* step0 */
+   arg_types[12] = LLVMPointerType(int32_vec4_type, 0);/* step1 */
+   arg_types[13] = LLVMPointerType(int32_vec4_type, 0);/* step2 */
 
    func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0);
 
-   variant->function = LLVMAddFunction(screen->module, "shader", func_type);
-   LLVMSetFunctionCallConv(variant->function, LLVMCCallConv);
+   function = LLVMAddFunction(screen->module, "shader", func_type);
+   LLVMSetFunctionCallConv(function, LLVMCCallConv);
+
+   variant->function[do_tri_test] = function;
+
+
+   /* XXX: need to propagate noalias down into color param now we are
+    * passing a pointer-to-pointer?
+    */
    for(i = 0; i < Elements(arg_types); ++i)
       if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind)
-         LLVMAddAttribute(LLVMGetParam(variant->function, i), LLVMNoAliasAttribute);
-
-   context_ptr  = LLVMGetParam(variant->function, 0);
-   x            = LLVMGetParam(variant->function, 1);
-   y            = LLVMGetParam(variant->function, 2);
-   a0_ptr       = LLVMGetParam(variant->function, 3);
-   dadx_ptr     = LLVMGetParam(variant->function, 4);
-   dady_ptr     = LLVMGetParam(variant->function, 5);
-   mask_ptr     = LLVMGetParam(variant->function, 6);
-   color_ptr    = LLVMGetParam(variant->function, 7);
-   depth_ptr    = LLVMGetParam(variant->function, 8);
+         LLVMAddAttribute(LLVMGetParam(function, i), LLVMNoAliasAttribute);
+
+   context_ptr  = LLVMGetParam(function, 0);
+   x            = LLVMGetParam(function, 1);
+   y            = LLVMGetParam(function, 2);
+   a0_ptr       = LLVMGetParam(function, 3);
+   dadx_ptr     = LLVMGetParam(function, 4);
+   dady_ptr     = LLVMGetParam(function, 5);
+   color_ptr_ptr = LLVMGetParam(function, 6);
+   depth_ptr    = LLVMGetParam(function, 7);
+   c0           = LLVMGetParam(function, 8);
+   c1           = LLVMGetParam(function, 9);
+   c2           = LLVMGetParam(function, 10);
+   step0_ptr    = LLVMGetParam(function, 11);
+   step1_ptr    = LLVMGetParam(function, 12);
+   step2_ptr    = LLVMGetParam(function, 13);
 
    lp_build_name(context_ptr, "context");
    lp_build_name(x, "x");
@@ -532,36 +716,45 @@ generate_fragment(struct llvmpipe_context *lp,
    lp_build_name(a0_ptr, "a0");
    lp_build_name(dadx_ptr, "dadx");
    lp_build_name(dady_ptr, "dady");
-   lp_build_name(mask_ptr, "mask");
-   lp_build_name(color_ptr, "color");
+   lp_build_name(color_ptr_ptr, "color_ptr");
    lp_build_name(depth_ptr, "depth");
+   lp_build_name(c0, "c0");
+   lp_build_name(c1, "c1");
+   lp_build_name(c2, "c2");
+   lp_build_name(step0_ptr, "step0");
+   lp_build_name(step1_ptr, "step1");
+   lp_build_name(step2_ptr, "step2");
 
    /*
     * Function body
     */
 
-   block = LLVMAppendBasicBlock(variant->function, "entry");
+   block = LLVMAppendBasicBlock(function, "entry");
    builder = LLVMCreateBuilder();
    LLVMPositionBuilderAtEnd(builder, block);
 
    generate_pos0(builder, x, y, &x0, &y0);
 
-   lp_build_interp_soa_init(&interp, shader->base.tokens, builder, fs_type,
+   lp_build_interp_soa_init(&interp, 
+                            shader->base.tokens,
+                            key->flatshade,
+                            builder, fs_type,
                             a0_ptr, dadx_ptr, dady_ptr,
-                            x0, y0, 2, 0);
+                            x0, y0);
 
    /* code generated texture sampling */
    sampler = lp_llvm_sampler_soa_create(key->sampler, context_ptr);
 
+   /* loop over quads in the block */
    for(i = 0; i < num_fs; ++i) {
       LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-      LLVMValueRef out_color[NUM_CHANNELS];
+      LLVMValueRef out_color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS];
       LLVMValueRef depth_ptr_i;
+      int cbuf;
 
       if(i != 0)
-         lp_build_interp_soa_update(&interp);
+         lp_build_interp_soa_update(&interp, i);
 
-      fs_mask[i] = LLVMBuildLoad(builder, LLVMBuildGEP(builder, mask_ptr, &index, 1, ""), "");
       depth_ptr_i = LLVMBuildGEP(builder, depth_ptr, &index, 1, "");
 
       generate_fs(lp, shader, key,
@@ -571,71 +764,162 @@ generate_fragment(struct llvmpipe_context *lp,
                   i,
                   &interp,
                   sampler,
-                  &fs_mask[i],
+                  &fs_mask[i], /* output */
                   out_color,
-                  depth_ptr_i);
-
-      for(chan = 0; chan < NUM_CHANNELS; ++chan)
-         fs_out_color[chan][i] = out_color[chan];
+                  depth_ptr_i,
+                  do_tri_test,
+                  c0, c1, c2,
+                  step0_ptr, step1_ptr, step2_ptr);
+
+      for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++)
+	 for(chan = 0; chan < NUM_CHANNELS; ++chan)
+	    fs_out_color[cbuf][chan][i] = out_color[cbuf][chan];
    }
 
    sampler->destroy(sampler);
 
-   /* 
-    * Convert the fs's output color and mask to fit to the blending type. 
+   /* Loop over color outputs / color buffers to do blending.
     */
+   for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
+      LLVMValueRef color_ptr;
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), cbuf, 0);
 
-   for(chan = 0; chan < NUM_CHANNELS; ++chan) {
-      lp_build_conv(builder, fs_type, blend_type,
-                    fs_out_color[chan], num_fs,
-                    &blend_in_color[chan], 1);
-      lp_build_name(blend_in_color[chan], "color.%c", "rgba"[chan]);
+      /* 
+       * Convert the fs's output color and mask to fit the blending type.
+       */
+      for(chan = 0; chan < NUM_CHANNELS; ++chan) {
+	 lp_build_conv(builder, fs_type, blend_type,
+		       fs_out_color[cbuf][chan], num_fs,
+		       &blend_in_color[chan], 1);
+	 lp_build_name(blend_in_color[chan], "color%d.%c", cbuf, "rgba"[chan]);
+      }
 
+      lp_build_conv_mask(builder, fs_type, blend_type,
+			 fs_mask, num_fs,
+			 &blend_mask, 1);
+
+      color_ptr = LLVMBuildLoad(builder, 
+				LLVMBuildGEP(builder, color_ptr_ptr, &index, 1, ""),
+				"");
+      lp_build_name(color_ptr, "color_ptr%d", cbuf);
+
+      /*
+       * Blending.
+       */
+      generate_blend(&key->blend,
+		     builder,
+		     blend_type,
+		     context_ptr,
+		     blend_mask,
+		     blend_in_color,
+		     color_ptr);
    }
 
-   lp_build_conv_mask(builder, fs_type, blend_type,
-                      fs_mask, num_fs,
-                      &blend_mask, 1);
-
-   /*
-    * Blending.
-    */
-
-   generate_blend(&key->blend,
-                  builder,
-                  blend_type,
-                  context_ptr,
-                  blend_mask,
-                  blend_in_color,
-                  color_ptr);
-
    LLVMBuildRetVoid(builder);
 
    LLVMDisposeBuilder(builder);
 
-   /*
-    * Translate the LLVM IR into machine code.
-    */
 
+   /* Verify the LLVM IR.  If invalid, dump and abort */
 #ifdef DEBUG
-   if(LLVMVerifyFunction(variant->function, LLVMPrintMessageAction)) {
-      LLVMDumpValue(variant->function);
-      assert(0);
+   if(LLVMVerifyFunction(function, LLVMPrintMessageAction)) {
+      if (1)
+         LLVMDumpValue(function);
+      abort();
    }
 #endif
 
-   LLVMRunFunctionPassManager(screen->pass, variant->function);
+   /* Apply optimizations to LLVM IR */
+   if (1)
+      LLVMRunFunctionPassManager(screen->pass, function);
 
    if (LP_DEBUG & DEBUG_JIT) {
-      LLVMDumpValue(variant->function);
+      /* Print the LLVM IR to stderr */
+      LLVMDumpValue(function);
       debug_printf("\n");
    }
 
-   variant->jit_function = (lp_jit_frag_func)LLVMGetPointerToGlobal(screen->engine, variant->function);
+   /*
+    * Translate the LLVM IR into machine code.
+    */
+   variant->jit_function[do_tri_test] = (lp_jit_frag_func)LLVMGetPointerToGlobal(screen->engine, function);
 
    if (LP_DEBUG & DEBUG_ASM)
-      lp_disassemble(variant->jit_function);
+      lp_disassemble(variant->jit_function[do_tri_test]);
+}
+
+
+static struct lp_fragment_shader_variant *
+generate_variant(struct llvmpipe_context *lp,
+                 struct lp_fragment_shader *shader,
+                 const struct lp_fragment_shader_variant_key *key)
+{
+   struct lp_fragment_shader_variant *variant;
+
+   if (LP_DEBUG & DEBUG_JIT) {
+      unsigned i;
 
+      tgsi_dump(shader->base.tokens, 0);
+      if(key->depth.enabled) {
+         debug_printf("depth.format = %s\n", pf_name(key->zsbuf_format));
+         debug_printf("depth.func = %s\n", debug_dump_func(key->depth.func, TRUE));
+         debug_printf("depth.writemask = %u\n", key->depth.writemask);
+      }
+      if(key->alpha.enabled) {
+         debug_printf("alpha.func = %s\n", debug_dump_func(key->alpha.func, TRUE));
+         debug_printf("alpha.ref_value = %f\n", key->alpha.ref_value);
+      }
+      if(key->blend.logicop_enable) {
+         debug_printf("blend.logicop_func = %u\n", key->blend.logicop_func);
+      }
+      else if(key->blend.rt[0].blend_enable) {
+         debug_printf("blend.rgb_func = %s\n",   debug_dump_blend_func  (key->blend.rt[0].rgb_func, TRUE));
+         debug_printf("rgb_src_factor = %s\n",   debug_dump_blend_factor(key->blend.rt[0].rgb_src_factor, TRUE));
+         debug_printf("rgb_dst_factor = %s\n",   debug_dump_blend_factor(key->blend.rt[0].rgb_dst_factor, TRUE));
+         debug_printf("alpha_func = %s\n",       debug_dump_blend_func  (key->blend.rt[0].alpha_func, TRUE));
+         debug_printf("alpha_src_factor = %s\n", debug_dump_blend_factor(key->blend.rt[0].alpha_src_factor, TRUE));
+         debug_printf("alpha_dst_factor = %s\n", debug_dump_blend_factor(key->blend.rt[0].alpha_dst_factor, TRUE));
+      }
+      debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask);
+      for(i = 0; i < PIPE_MAX_SAMPLERS; ++i) {
+         if(key->sampler[i].format) {
+            debug_printf("sampler[%u] = \n", i);
+            debug_printf("  .format = %s\n",
+                         pf_name(key->sampler[i].format));
+            debug_printf("  .target = %s\n",
+                         debug_dump_tex_target(key->sampler[i].target, TRUE));
+            debug_printf("  .pot = %u %u %u\n",
+                         key->sampler[i].pot_width,
+                         key->sampler[i].pot_height,
+                         key->sampler[i].pot_depth);
+            debug_printf("  .wrap = %s %s %s\n",
+                         debug_dump_tex_wrap(key->sampler[i].wrap_s, TRUE),
+                         debug_dump_tex_wrap(key->sampler[i].wrap_t, TRUE),
+                         debug_dump_tex_wrap(key->sampler[i].wrap_r, TRUE));
+            debug_printf("  .min_img_filter = %s\n",
+                         debug_dump_tex_filter(key->sampler[i].min_img_filter, TRUE));
+            debug_printf("  .min_mip_filter = %s\n",
+                         debug_dump_tex_mipfilter(key->sampler[i].min_mip_filter, TRUE));
+            debug_printf("  .mag_img_filter = %s\n",
+                         debug_dump_tex_filter(key->sampler[i].mag_img_filter, TRUE));
+            if(key->sampler[i].compare_mode != PIPE_TEX_COMPARE_NONE)
+               debug_printf("  .compare_func = %s\n", debug_dump_func(key->sampler[i].compare_func, TRUE));
+            debug_printf("  .normalized_coords = %u\n", key->sampler[i].normalized_coords);
+         }
+      }
+   }
+
+   variant = CALLOC_STRUCT(lp_fragment_shader_variant);
+   if(!variant)
+      return NULL;
+
+   variant->shader = shader;
+   memcpy(&variant->key, key, sizeof *key);
+
+   generate_fragment(lp, shader, variant, 0);
+   generate_fragment(lp, shader, variant, 1);
+
+   /* insert new variant into linked list */
    variant->next = shader->variants;
    shader->variants = variant;
 
@@ -693,11 +977,15 @@ llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
    variant = shader->variants;
    while(variant) {
       struct lp_fragment_shader_variant *next = variant->next;
-
-      if(variant->function) {
-         if(variant->jit_function)
-            LLVMFreeMachineCodeForFunction(screen->engine, variant->function);
-         LLVMDeleteFunction(variant->function);
+      unsigned i;
+
+      for (i = 0; i < Elements(variant->function); i++) {
+         if (variant->function[i]) {
+            if (variant->jit_function[i])
+               LLVMFreeMachineCodeForFunction(screen->engine,
+                                              variant->function[i]);
+            LLVMDeleteFunction(variant->function[i]);
+         }
       }
 
       FREE(variant);
@@ -714,27 +1002,25 @@ llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
 void
 llvmpipe_set_constant_buffer(struct pipe_context *pipe,
                              uint shader, uint index,
-                             const struct pipe_constant_buffer *constants)
+                             struct pipe_buffer *constants)
 {
    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
-   struct pipe_buffer *buffer = constants ? constants->buffer : NULL;
-   unsigned size = buffer ? buffer->size : 0;
-   const void *data = buffer ? llvmpipe_buffer(buffer)->data : NULL;
+   unsigned size = constants ? constants->size : 0;
+   const void *data = constants ? llvmpipe_buffer(constants)->data : NULL;
 
    assert(shader < PIPE_SHADER_TYPES);
    assert(index == 0);
 
+   if(llvmpipe->constants[shader] == constants)
+      return;
+
    draw_flush(llvmpipe->draw);
 
    /* note: reference counting */
-   pipe_buffer_reference(&llvmpipe->constants[shader].buffer, buffer);
-
-   if(shader == PIPE_SHADER_FRAGMENT) {
-      llvmpipe->jit_context.constants = data;
-   }
+   pipe_buffer_reference(&llvmpipe->constants[shader], constants);
 
    if(shader == PIPE_SHADER_VERTEX) {
-      draw_set_mapped_constant_buffer(llvmpipe->draw, PIPE_SHADER_VERTEX,
+      draw_set_mapped_constant_buffer(llvmpipe->draw, PIPE_SHADER_VERTEX, 0,
                                       data, size);
    }
 
@@ -769,21 +1055,30 @@ make_variant_key(struct llvmpipe_context *lp,
       key->alpha.func = lp->depth_stencil->alpha.func;
    /* alpha.ref_value is passed in jit_context */
 
-   if(lp->framebuffer.cbufs[0]) {
-      const struct util_format_description *format_desc;
-      unsigned chan;
+   key->flatshade = lp->rasterizer->flatshade;
+   key->scissor = lp->rasterizer->scissor;
 
+   if (lp->framebuffer.nr_cbufs) {
       memcpy(&key->blend, lp->blend, sizeof key->blend);
+   }
 
-      format_desc = util_format_description(lp->framebuffer.cbufs[0]->format);
+   key->nr_cbufs = lp->framebuffer.nr_cbufs;
+   for (i = 0; i < lp->framebuffer.nr_cbufs; i++) {
+      const struct util_format_description *format_desc;
+      unsigned chan;
+
+      format_desc = util_format_description(lp->framebuffer.cbufs[i]->format);
       assert(format_desc->layout == UTIL_FORMAT_COLORSPACE_RGB ||
              format_desc->layout == UTIL_FORMAT_COLORSPACE_SRGB);
 
-      /* mask out color channels not present in the color buffer */
+      /* mask out color channels not present in the color buffer.
+       * NOTE(review): the code below ORs present channels into rt[0] rather
+       * than clearing absent ones -- confirm.  Per-cbuf writemasks: TODO. */
       for(chan = 0; chan < 4; ++chan) {
          enum util_format_swizzle swizzle = format_desc->swizzle[chan];
-         if(swizzle > 4)
-            key->blend.colormask &= ~(1 << chan);
+
+         if(swizzle <= UTIL_FORMAT_SWIZZLE_W)
+            key->blend.rt[0].colormask |= (1 << chan);
       }
    }
 
@@ -793,12 +1088,17 @@ make_variant_key(struct llvmpipe_context *lp,
 }
 
 
+/**
+ * Update fragment state.  This is called just prior to drawing
+ * something when some fragment-related state has changed.
+ */
 void 
 llvmpipe_update_fs(struct llvmpipe_context *lp)
 {
    struct lp_fragment_shader *shader = lp->fs;
    struct lp_fragment_shader_variant_key key;
    struct lp_fragment_shader_variant *variant;
+   boolean opaque;
 
    make_variant_key(lp, shader, &key);
 
@@ -810,8 +1110,34 @@ llvmpipe_update_fs(struct llvmpipe_context *lp)
       variant = variant->next;
    }
 
-   if(!variant)
-      variant = generate_fragment(lp, shader, &key);
+   if (!variant) {
+      int64_t t0, t1;
+      int64_t dt;
+      t0 = os_time_get();
+
+      variant = generate_variant(lp, shader, &key);
+
+      t1 = os_time_get();
+      dt = t1 - t0;
+      LP_COUNT_ADD(llvm_compile_time, dt);
+      LP_COUNT_ADD(nr_llvm_compiles, 2);  /* emit vs. omit in/out test */
+   }
 
    shader->current = variant;
+
+   /* TODO: put this in the variant */
+   /* TODO: most of these can be relaxed, in particular the colormask */
+   opaque = !key.blend.logicop_enable &&
+            !key.blend.rt[0].blend_enable &&
+            key.blend.rt[0].colormask == 0xf &&
+            !key.alpha.enabled &&
+            !key.depth.enabled &&
+            !key.scissor &&
+            !shader->info.uses_kill
+            ? TRUE : FALSE;
+
+   lp_setup_set_fs_functions(lp->setup, 
+                             shader->current->jit_function[0],
+                             shader->current->jit_function[1],
+                             opaque);
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c b/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
index aa3b5a3f91..feb012816c 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
@@ -29,6 +29,7 @@
 #include "util/u_memory.h"
 #include "lp_context.h"
 #include "lp_state.h"
+#include "lp_setup.h"
 #include "draw/draw_context.h"
 
 
@@ -53,6 +54,17 @@ void llvmpipe_bind_rasterizer_state(struct pipe_context *pipe,
 
    llvmpipe->rasterizer = rasterizer;
 
+   /* Note: we can immediately set the triangle state here and
+    * not worry about binning because we handle culling during
+    * triangle setup, not when rasterizing the bins.
+    */
+   if (llvmpipe->rasterizer) {
+      lp_setup_set_triangle_state( llvmpipe->setup,
+                   llvmpipe->rasterizer->cull_mode,
+                   llvmpipe->rasterizer->front_winding == PIPE_WINDING_CCW,
+                   llvmpipe->rasterizer->scissor);
+   }
+
    llvmpipe->dirty |= LP_NEW_RASTERIZER;
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
index d382f9ca87..b30a075776 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
@@ -29,6 +29,7 @@
  *  Brian Paul
  */
 
+#include "util/u_inlines.h"
 #include "util/u_memory.h"
 
 #include "draw/draw_context.h"
@@ -36,8 +37,6 @@
 #include "lp_context.h"
 #include "lp_context.h"
 #include "lp_state.h"
-#include "lp_texture.h"
-#include "lp_tex_cache.h"
 #include "draw/draw_context.h"
 
 
@@ -125,17 +124,6 @@ llvmpipe_set_sampler_textures(struct pipe_context *pipe,
       struct pipe_texture *tex = i < num ? texture[i] : NULL;
 
       pipe_texture_reference(&llvmpipe->texture[i], tex);
-      lp_tex_tile_cache_set_texture(llvmpipe->tex_cache[i], tex);
-
-      if(tex) {
-         struct llvmpipe_texture *lp_tex = llvmpipe_texture(tex);
-         struct lp_jit_texture *jit_tex = &llvmpipe->jit_context.textures[i];
-         jit_tex->width = tex->width0;
-         jit_tex->height = tex->height0;
-         jit_tex->stride = lp_tex->stride[0];
-         if(!lp_tex->dt)
-            jit_tex->data = lp_tex->data;
-      }
    }
 
    llvmpipe->num_textures = num;
@@ -166,7 +154,6 @@ llvmpipe_set_vertex_sampler_textures(struct pipe_context *pipe,
       struct pipe_texture *tex = i < num_textures ? textures[i] : NULL;
 
       pipe_texture_reference(&llvmpipe->vertex_textures[i], tex);
-      lp_tex_tile_cache_set_texture(llvmpipe->vertex_tex_cache[i], tex);
    }
 
    llvmpipe->num_vertex_textures = num_textures;
diff --git a/src/gallium/drivers/llvmpipe/lp_state_surface.c b/src/gallium/drivers/llvmpipe/lp_state_surface.c
index e37ff04f3d..048ac5b968 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_surface.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_surface.c
@@ -28,10 +28,12 @@
 /* Authors:  Keith Whitwell <keith@tungstengraphics.com>
  */
 
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+#include "util/u_surface.h"
 #include "lp_context.h"
 #include "lp_state.h"
-#include "lp_surface.h"
-#include "lp_tile_cache.h"
+#include "lp_setup.h"
 
 #include "draw/draw_context.h"
 
@@ -39,54 +41,19 @@
 
 
 /**
- * XXX this might get moved someday
  * Set the framebuffer surface info: color buffers, zbuffer, stencil buffer.
- * Here, we flush the old surfaces and update the tile cache to point to the new
- * surfaces.
  */
 void
 llvmpipe_set_framebuffer_state(struct pipe_context *pipe,
                                const struct pipe_framebuffer_state *fb)
 {
    struct llvmpipe_context *lp = llvmpipe_context(pipe);
-   uint i;
 
-   draw_flush(lp->draw);
+   boolean changed = !util_framebuffer_state_equal(&lp->framebuffer, fb);
 
-   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
-      /* check if changing cbuf */
-      if (lp->framebuffer.cbufs[i] != fb->cbufs[i]) {
-         /* flush old */
-         lp_tile_cache_map_transfers(lp->cbuf_cache[i]);
-         lp_flush_tile_cache(lp->cbuf_cache[i]);
+   if (changed) {
 
-         /* assign new */
-         pipe_surface_reference(&lp->framebuffer.cbufs[i], fb->cbufs[i]);
-
-         /* update cache */
-         lp_tile_cache_set_surface(lp->cbuf_cache[i], fb->cbufs[i]);
-      }
-   }
-
-   lp->framebuffer.nr_cbufs = fb->nr_cbufs;
-
-   /* zbuf changing? */
-   if (lp->framebuffer.zsbuf != fb->zsbuf) {
-
-      if(lp->zsbuf_transfer) {
-         struct pipe_screen *screen = pipe->screen;
-
-         if(lp->zsbuf_map) {
-            screen->transfer_unmap(screen, lp->zsbuf_transfer);
-            lp->zsbuf_map = NULL;
-         }
-
-         screen->tex_transfer_destroy(lp->zsbuf_transfer);
-         lp->zsbuf_transfer = NULL;
-      }
-
-      /* assign new */
-      pipe_surface_reference(&lp->framebuffer.zsbuf, fb->zsbuf);
+      util_copy_framebuffer_state(&lp->framebuffer, fb);
 
       /* Tell draw module how deep the Z/depth buffer is */
       if (lp->framebuffer.zsbuf) {
@@ -103,10 +70,9 @@ llvmpipe_set_framebuffer_state(struct pipe_context *pipe,
          }
          draw_set_mrd(lp->draw, mrd);
       }
-   }
 
-   lp->framebuffer.width = fb->width;
-   lp->framebuffer.height = fb->height;
+      lp_setup_bind_framebuffer( lp->setup, &lp->framebuffer );
 
-   lp->dirty |= LP_NEW_FRAMEBUFFER;
+      lp->dirty |= LP_NEW_FRAMEBUFFER;
+   }
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_state_vertex.c b/src/gallium/drivers/llvmpipe/lp_state_vertex.c
index 1a17631a4c..57ac25ea0c 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_vertex.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_vertex.c
@@ -31,7 +31,6 @@
 
 #include "lp_context.h"
 #include "lp_state.h"
-#include "lp_surface.h"
 
 #include "draw/draw_context.h"
 
diff --git a/src/gallium/drivers/llvmpipe/lp_test.h b/src/gallium/drivers/llvmpipe/lp_test.h
index 39d80726e6..ca0f737b29 100644
--- a/src/gallium/drivers/llvmpipe/lp_test.h
+++ b/src/gallium/drivers/llvmpipe/lp_test.h
@@ -53,7 +53,7 @@
 #include "util/u_math.h"
 #include "util/u_debug_dump.h"
 
-#include "lp_bld_type.h"
+#include "gallivm/lp_bld_type.h"
 
 
 #define LP_TEST_NUM_SAMPLES 32
diff --git a/src/gallium/drivers/llvmpipe/lp_test_blend.c b/src/gallium/drivers/llvmpipe/lp_test_blend.c
index 29fff91981..e49b705598 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_blend.c
@@ -37,10 +37,9 @@
  */
 
 
-#include "lp_bld_type.h"
-#include "lp_bld_arit.h"
-#include "lp_bld_blend.h"
-#include "lp_bld_debug.h"
+#include "gallivm/lp_bld_type.h"
+#include "gallivm/lp_bld_blend.h"
+#include "gallivm/lp_bld_debug.h"
 #include "lp_test.h"
 
 
@@ -104,18 +103,18 @@ write_tsv_row(FILE *fp,
 
    fprintf(fp,
            "%s\t%s\t%s\t",
-           blend->rgb_func != blend->alpha_func ? "true" : "false",
-           blend->rgb_src_factor != blend->alpha_src_factor ? "true" : "false",
-           blend->rgb_dst_factor != blend->alpha_dst_factor ? "true" : "false");
+           blend->rt[0].rgb_func != blend->rt[0].alpha_func ? "true" : "false",
+           blend->rt[0].rgb_src_factor != blend->rt[0].alpha_src_factor ? "true" : "false",
+           blend->rt[0].rgb_dst_factor != blend->rt[0].alpha_dst_factor ? "true" : "false");
 
    fprintf(fp,
            "%s\t%s\t%s\t%s\t%s\t%s\n",
-           debug_dump_blend_func(blend->rgb_func, TRUE),
-           debug_dump_blend_factor(blend->rgb_src_factor, TRUE),
-           debug_dump_blend_factor(blend->rgb_dst_factor, TRUE),
-           debug_dump_blend_func(blend->alpha_func, TRUE),
-           debug_dump_blend_factor(blend->alpha_src_factor, TRUE),
-           debug_dump_blend_factor(blend->alpha_dst_factor, TRUE));
+           debug_dump_blend_func(blend->rt[0].rgb_func, TRUE),
+           debug_dump_blend_factor(blend->rt[0].rgb_src_factor, TRUE),
+           debug_dump_blend_factor(blend->rt[0].rgb_dst_factor, TRUE),
+           debug_dump_blend_func(blend->rt[0].alpha_func, TRUE),
+           debug_dump_blend_factor(blend->rt[0].alpha_src_factor, TRUE),
+           debug_dump_blend_factor(blend->rt[0].alpha_dst_factor, TRUE));
 
    fflush(fp);
 }
@@ -137,12 +136,12 @@ dump_blend_type(FILE *fp,
 
    fprintf(fp,
            " %s=%s %s=%s %s=%s %s=%s %s=%s %s=%s",
-           "rgb_func",         debug_dump_blend_func(blend->rgb_func, TRUE),
-           "rgb_src_factor",   debug_dump_blend_factor(blend->rgb_src_factor, TRUE),
-           "rgb_dst_factor",   debug_dump_blend_factor(blend->rgb_dst_factor, TRUE),
-           "alpha_func",       debug_dump_blend_func(blend->alpha_func, TRUE),
-           "alpha_src_factor", debug_dump_blend_factor(blend->alpha_src_factor, TRUE),
-           "alpha_dst_factor", debug_dump_blend_factor(blend->alpha_dst_factor, TRUE));
+           "rgb_func",         debug_dump_blend_func(blend->rt[0].rgb_func, TRUE),
+           "rgb_src_factor",   debug_dump_blend_factor(blend->rt[0].rgb_src_factor, TRUE),
+           "rgb_dst_factor",   debug_dump_blend_factor(blend->rt[0].rgb_dst_factor, TRUE),
+           "alpha_func",       debug_dump_blend_func(blend->rt[0].alpha_func, TRUE),
+           "alpha_src_factor", debug_dump_blend_factor(blend->rt[0].alpha_src_factor, TRUE),
+           "alpha_dst_factor", debug_dump_blend_factor(blend->rt[0].alpha_dst_factor, TRUE));
 
    fprintf(fp, " ...\n");
    fflush(fp);
@@ -401,13 +400,15 @@ compute_blend_ref(const struct pipe_blend_state *blend,
    double src_term[4];
    double dst_term[4];
 
-   compute_blend_ref_term(blend->rgb_src_factor, blend->alpha_src_factor, src, src, dst, con, src_term);
-   compute_blend_ref_term(blend->rgb_dst_factor, blend->alpha_dst_factor, dst, src, dst, con, dst_term);
+   compute_blend_ref_term(blend->rt[0].rgb_src_factor, blend->rt[0].alpha_src_factor,
+                          src, src, dst, con, src_term);
+   compute_blend_ref_term(blend->rt[0].rgb_dst_factor, blend->rt[0].alpha_dst_factor,
+                          dst, src, dst, con, dst_term);
 
    /*
     * Combine RGB terms
     */
-   switch (blend->rgb_func) {
+   switch (blend->rt[0].rgb_func) {
    case PIPE_BLEND_ADD:
       ADD_SAT(res[0], src_term[0], dst_term[0]); /* R */
       ADD_SAT(res[1], src_term[1], dst_term[1]); /* G */
@@ -440,7 +441,7 @@ compute_blend_ref(const struct pipe_blend_state *blend,
    /*
     * Combine A terms
     */
-   switch (blend->alpha_func) {
+   switch (blend->rt[0].alpha_func) {
    case PIPE_BLEND_ADD:
       ADD_SAT(res[3], src_term[3], dst_term[3]); /* A */
       break;
@@ -462,7 +463,7 @@ compute_blend_ref(const struct pipe_blend_state *blend,
 }
 
 
-ALIGN_STACK
+PIPE_ALIGN_STACK
 static boolean
 test_one(unsigned verbose,
          FILE *fp,
@@ -531,11 +532,11 @@ test_one(unsigned verbose,
    success = TRUE;
    for(i = 0; i < n && success; ++i) {
       if(mode == AoS) {
-         ALIGN16_ATTRIB uint8_t src[LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t dst[LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t con[LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t res[LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t ref[LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t src[LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t dst[LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t con[LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t res[LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t ref[LP_NATIVE_VECTOR_WIDTH/8];
          int64_t start_counter = 0;
          int64_t end_counter = 0;
 
@@ -596,11 +597,11 @@ test_one(unsigned verbose,
 
       if(mode == SoA) {
          const unsigned stride = type.length*type.width/8;
-         ALIGN16_ATTRIB uint8_t src[4*LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t dst[4*LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t con[4*LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t res[4*LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t ref[4*LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t src[4*LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t dst[4*LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t con[4*LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t res[4*LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t ref[4*LP_NATIVE_VECTOR_WIDTH/8];
          int64_t start_counter = 0;
          int64_t end_counter = 0;
          boolean mismatch;
@@ -806,14 +807,14 @@ test_all(unsigned verbose, FILE *fp)
                               continue;
 
                            memset(&blend, 0, sizeof blend);
-                           blend.blend_enable      = 1;
-                           blend.rgb_func          = *rgb_func;
-                           blend.rgb_src_factor    = *rgb_src_factor;
-                           blend.rgb_dst_factor    = *rgb_dst_factor;
-                           blend.alpha_func        = *alpha_func;
-                           blend.alpha_src_factor  = *alpha_src_factor;
-                           blend.alpha_dst_factor  = *alpha_dst_factor;
-                           blend.colormask         = PIPE_MASK_RGBA;
+                           blend.rt[0].blend_enable      = 1;
+                           blend.rt[0].rgb_func          = *rgb_func;
+                           blend.rt[0].rgb_src_factor    = *rgb_src_factor;
+                           blend.rt[0].rgb_dst_factor    = *rgb_dst_factor;
+                           blend.rt[0].alpha_func        = *alpha_func;
+                           blend.rt[0].alpha_src_factor  = *alpha_src_factor;
+                           blend.rt[0].alpha_dst_factor  = *alpha_dst_factor;
+                           blend.rt[0].colormask         = PIPE_MASK_RGBA;
 
                            if(!test_one(verbose, fp, &blend, mode, *type))
                              success = FALSE;
@@ -865,14 +866,14 @@ test_some(unsigned verbose, FILE *fp, unsigned long n)
       type = &blend_types[rand() % num_types];
 
       memset(&blend, 0, sizeof blend);
-      blend.blend_enable      = 1;
-      blend.rgb_func          = *rgb_func;
-      blend.rgb_src_factor    = *rgb_src_factor;
-      blend.rgb_dst_factor    = *rgb_dst_factor;
-      blend.alpha_func        = *alpha_func;
-      blend.alpha_src_factor  = *alpha_src_factor;
-      blend.alpha_dst_factor  = *alpha_dst_factor;
-      blend.colormask         = PIPE_MASK_RGBA;
+      blend.rt[0].blend_enable      = 1;
+      blend.rt[0].rgb_func          = *rgb_func;
+      blend.rt[0].rgb_src_factor    = *rgb_src_factor;
+      blend.rt[0].rgb_dst_factor    = *rgb_dst_factor;
+      blend.rt[0].alpha_func        = *alpha_func;
+      blend.rt[0].alpha_src_factor  = *alpha_src_factor;
+      blend.rt[0].alpha_dst_factor  = *alpha_dst_factor;
+      blend.rt[0].colormask         = PIPE_MASK_RGBA;
 
       if(!test_one(verbose, fp, &blend, mode, *type))
         success = FALSE;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_conv.c b/src/gallium/drivers/llvmpipe/lp_test_conv.c
index faddfb9677..958cc40538 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_conv.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_conv.c
@@ -34,10 +34,10 @@
  */
 
 
-#include "lp_bld_type.h"
-#include "lp_bld_const.h"
-#include "lp_bld_conv.h"
-#include "lp_bld_debug.h"
+#include "gallivm/lp_bld_type.h"
+#include "gallivm/lp_bld_const.h"
+#include "gallivm/lp_bld_conv.h"
+#include "gallivm/lp_bld_debug.h"
 #include "lp_test.h"
 
 
@@ -142,7 +142,7 @@ add_conv_test(LLVMModuleRef module,
 }
 
 
-ALIGN_STACK
+PIPE_ALIGN_STACK
 static boolean
 test_one(unsigned verbose,
          FILE *fp,
@@ -230,8 +230,8 @@ test_one(unsigned verbose,
    for(i = 0; i < n && success; ++i) {
       unsigned src_stride = src_type.length*src_type.width/8;
       unsigned dst_stride = dst_type.length*dst_type.width/8;
-      ALIGN16_ATTRIB uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
-      ALIGN16_ATTRIB uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
+      PIPE_ALIGN_VAR(16) uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
+      PIPE_ALIGN_VAR(16) uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
       double fref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
       uint8_t ref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
       int64_t start_counter = 0;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_format.c b/src/gallium/drivers/llvmpipe/lp_test_format.c
index 23ea9ebbe7..48828bd0a0 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_format.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_format.c
@@ -38,7 +38,7 @@
 #include "util/u_cpu_detect.h"
 #include "util/u_format.h"
 
-#include "lp_bld_format.h"
+#include "gallivm/lp_bld_format.h"
 #include "lp_test.h"
 
 
@@ -199,7 +199,7 @@ add_store_rgba_test(LLVMModuleRef module,
 }
 
 
-ALIGN_STACK
+PIPE_ALIGN_STACK
 static boolean
 test_format(unsigned verbose, FILE *fp, const struct pixel_test_case *test)
 {
diff --git a/src/gallium/drivers/llvmpipe/lp_test_main.c b/src/gallium/drivers/llvmpipe/lp_test_main.c
index 314544aa9a..14ff00469b 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_main.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_main.c
@@ -36,8 +36,8 @@
 
 #include "util/u_cpu_detect.h"
 
-#include "lp_bld_const.h"
-#include "lp_bld_misc.h"
+#include "gallivm/lp_bld_const.h"
+#include "gallivm/lp_bld_misc.h"
 #include "lp_test.h"
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_cache.c b/src/gallium/drivers/llvmpipe/lp_tex_cache.c
deleted file mode 100644
index a6d9a2c1ac..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_tex_cache.c
+++ /dev/null
@@ -1,304 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * Texture tile caching.
- *
- * Author:
- *    Brian Paul
- */
-
-#include "pipe/p_inlines.h"
-#include "util/u_memory.h"
-#include "util/u_tile.h"
-#include "util/u_format.h"
-#include "util/u_math.h"
-#include "lp_context.h"
-#include "lp_surface.h"
-#include "lp_texture.h"
-#include "lp_tex_cache.h"
-
-
-
-/**
- * Return the position in the cache for the tile that contains win pos (x,y).
- * We currently use a direct mapped cache so this is like a hack key.
- * At some point we should investige something more sophisticated, like
- * a LRU replacement policy.
- */
-#define CACHE_POS(x, y) \
-   (((x) + (y) * 5) % NUM_ENTRIES)
-
-
-
-/**
- * Is the tile at (x,y) in cleared state?
- */
-static INLINE uint
-is_clear_flag_set(const uint *bitvec, union tex_tile_address addr)
-{
-   int pos, bit;
-   pos = addr.bits.y * (MAX_TEX_WIDTH / TEX_TILE_SIZE) + addr.bits.x;
-   assert(pos / 32 < (MAX_TEX_WIDTH / TEX_TILE_SIZE) * (MAX_TEX_HEIGHT / TEX_TILE_SIZE) / 32);
-   bit = bitvec[pos / 32] & (1 << (pos & 31));
-   return bit;
-}
-   
-
-/**
- * Mark the tile at (x,y) as not cleared.
- */
-static INLINE void
-clear_clear_flag(uint *bitvec, union tex_tile_address addr)
-{
-   int pos;
-   pos = addr.bits.y * (MAX_TEX_WIDTH / TEX_TILE_SIZE) + addr.bits.x;
-   assert(pos / 32 < (MAX_TEX_WIDTH / TEX_TILE_SIZE) * (MAX_TEX_HEIGHT / TEX_TILE_SIZE) / 32);
-   bitvec[pos / 32] &= ~(1 << (pos & 31));
-}
-   
-
-struct llvmpipe_tex_tile_cache *
-lp_create_tex_tile_cache( struct pipe_screen *screen )
-{
-   struct llvmpipe_tex_tile_cache *tc;
-   uint pos;
-
-   tc = CALLOC_STRUCT( llvmpipe_tex_tile_cache );
-   if (tc) {
-      tc->screen = screen;
-      for (pos = 0; pos < NUM_ENTRIES; pos++) {
-         tc->entries[pos].addr.bits.invalid = 1;
-      }
-      tc->last_tile = &tc->entries[0]; /* any tile */
-   }
-   return tc;
-}
-
-
-void
-lp_destroy_tex_tile_cache(struct llvmpipe_tex_tile_cache *tc)
-{
-   struct pipe_screen *screen;
-   uint pos;
-
-   for (pos = 0; pos < NUM_ENTRIES; pos++) {
-      /*assert(tc->entries[pos].x < 0);*/
-   }
-   if (tc->transfer) {
-      screen = tc->transfer->texture->screen;
-      screen->tex_transfer_destroy(tc->transfer);
-   }
-   if (tc->tex_trans) {
-      screen = tc->tex_trans->texture->screen;
-      screen->tex_transfer_destroy(tc->tex_trans);
-   }
-
-   FREE( tc );
-}
-
-
-void
-lp_tex_tile_cache_map_transfers(struct llvmpipe_tex_tile_cache *tc)
-{
-   if (tc->transfer && !tc->transfer_map)
-      tc->transfer_map = tc->screen->transfer_map(tc->screen, tc->transfer);
-
-   if (tc->tex_trans && !tc->tex_trans_map)
-      tc->tex_trans_map = tc->screen->transfer_map(tc->screen, tc->tex_trans);
-}
-
-
-void
-lp_tex_tile_cache_unmap_transfers(struct llvmpipe_tex_tile_cache *tc)
-{
-   if (tc->transfer_map) {
-      tc->screen->transfer_unmap(tc->screen, tc->transfer);
-      tc->transfer_map = NULL;
-   }
-
-   if (tc->tex_trans_map) {
-      tc->screen->transfer_unmap(tc->screen, tc->tex_trans);
-      tc->tex_trans_map = NULL;
-   }
-}
-
-void
-lp_tex_tile_cache_validate_texture(struct llvmpipe_tex_tile_cache *tc)
-{
-   if (tc->texture) {
-      struct llvmpipe_texture *lpt = llvmpipe_texture(tc->texture);
-      if (lpt->timestamp != tc->timestamp) {
-         /* texture was modified, invalidate all cached tiles */
-         uint i;
-         for (i = 0; i < NUM_ENTRIES; i++) {
-            tc->entries[i].addr.bits.invalid = 1;
-         }
-
-         tc->timestamp = lpt->timestamp;
-      }
-   }
-}
-
-/**
- * Specify the texture to cache.
- */
-void
-lp_tex_tile_cache_set_texture(struct llvmpipe_tex_tile_cache *tc,
-                          struct pipe_texture *texture)
-{
-   uint i;
-
-   assert(!tc->transfer);
-
-   if (tc->texture != texture) {
-      pipe_texture_reference(&tc->texture, texture);
-
-      if (tc->tex_trans) {
-         struct pipe_screen *screen = tc->tex_trans->texture->screen;
-         
-         if (tc->tex_trans_map) {
-            screen->transfer_unmap(screen, tc->tex_trans);
-            tc->tex_trans_map = NULL;
-         }
-
-         screen->tex_transfer_destroy(tc->tex_trans);
-         tc->tex_trans = NULL;
-      }
-
-      /* mark as entries as invalid/empty */
-      /* XXX we should try to avoid this when the teximage hasn't changed */
-      for (i = 0; i < NUM_ENTRIES; i++) {
-         tc->entries[i].addr.bits.invalid = 1;
-      }
-
-      tc->tex_face = -1; /* any invalid value here */
-   }
-}
-
-
-/**
- * Given the texture face, level, zslice, x and y values, compute
- * the cache entry position/index where we'd hope to find the
- * cached texture tile.
- * This is basically a direct-map cache.
- * XXX There's probably lots of ways in which we can improve this.
- */
-static INLINE uint
-tex_cache_pos( union tex_tile_address addr )
-{
-   uint entry = (addr.bits.x + 
-                 addr.bits.y * 9 + 
-                 addr.bits.z * 3 + 
-                 addr.bits.face + 
-                 addr.bits.level * 7);
-
-   return entry % NUM_ENTRIES;
-}
-
-/**
- * Similar to lp_get_cached_tile() but for textures.
- * Tiles are read-only and indexed with more params.
- */
-const struct llvmpipe_cached_tex_tile *
-lp_find_cached_tex_tile(struct llvmpipe_tex_tile_cache *tc,
-                        union tex_tile_address addr )
-{
-   struct pipe_screen *screen = tc->screen;
-   struct llvmpipe_cached_tex_tile *tile;
-   
-   tile = tc->entries + tex_cache_pos( addr );
-
-   if (addr.value != tile->addr.value) {
-
-      /* cache miss.  Most misses are because we've invaldiated the
-       * texture cache previously -- most commonly on binding a new
-       * texture.  Currently we effectively flush the cache on texture
-       * bind.
-       */
-#if 0
-      _debug_printf("miss at %u:  x=%d y=%d z=%d face=%d level=%d\n"
-                    "   tile %u:  x=%d y=%d z=%d face=%d level=%d\n",
-                    pos, x/TEX_TILE_SIZE, y/TEX_TILE_SIZE, z, face, level,
-                    pos, tile->addr.bits.x, tile->addr.bits.y, tile->z, tile->face, tile->level);
-#endif
-
-      /* check if we need to get a new transfer */
-      if (!tc->tex_trans ||
-          tc->tex_face != addr.bits.face ||
-          tc->tex_level != addr.bits.level ||
-          tc->tex_z != addr.bits.z) {
-         /* get new transfer (view into texture) */
-
-         if (tc->tex_trans) {
-            if (tc->tex_trans_map) {
-               tc->screen->transfer_unmap(tc->screen, tc->tex_trans);
-               tc->tex_trans_map = NULL;
-            }
-
-            screen->tex_transfer_destroy(tc->tex_trans);
-            tc->tex_trans = NULL;
-         }
-
-         tc->tex_trans = 
-            screen->get_tex_transfer(screen, tc->texture, 
-                                     addr.bits.face, 
-                                     addr.bits.level, 
-                                     addr.bits.z, 
-                                     PIPE_TRANSFER_READ, 0, 0,
-                                     u_minify(tc->texture->width0, addr.bits.level),
-                                     u_minify(tc->texture->height0, addr.bits.level));
-
-         tc->tex_trans_map = screen->transfer_map(screen, tc->tex_trans);
-
-         tc->tex_face = addr.bits.face;
-         tc->tex_level = addr.bits.level;
-         tc->tex_z = addr.bits.z;
-      }
-
-      {
-         unsigned x = addr.bits.x * TEX_TILE_SIZE;
-         unsigned y = addr.bits.y * TEX_TILE_SIZE;
-         unsigned w = TEX_TILE_SIZE;
-         unsigned h = TEX_TILE_SIZE;
-
-         if (pipe_clip_tile(x, y, &w, &h, tc->tex_trans)) {
-            assert(0);
-         }
-
-         util_format_read_4ub(tc->tex_trans->texture->format,
-                              (uint8_t *)tile->color, sizeof tile->color[0],
-                              tc->tex_trans_map, tc->tex_trans->stride,
-                              x, y, w, h);
-      }
-
-      tile->addr = addr;
-   }
-
-   tc->last_tile = tile;
-   return tile;
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_cache.h b/src/gallium/drivers/llvmpipe/lp_tex_cache.h
deleted file mode 100644
index 05fded78e1..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_tex_cache.h
+++ /dev/null
@@ -1,151 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#ifndef LP_TEX_CACHE_H
-#define LP_TEX_CACHE_H
-
-
-#include "pipe/p_compiler.h"
-
-
-struct llvmpipe_context;
-struct llvmpipe_tex_tile_cache;
-
-
-/**
- * Cache tile size (width and height). This needs to be a power of two.
- */
-#define TEX_TILE_SIZE 64
-
-
-/* If we need to support > 4096, just expand this to be a 64 bit
- * union, or consider tiling in Z as well.
- */
-union tex_tile_address {
-   struct {
-      unsigned x:6;             /* 4096 / TEX_TILE_SIZE */
-      unsigned y:6;             /* 4096 / TEX_TILE_SIZE */
-      unsigned z:12;            /* 4096 -- z not tiled */
-      unsigned face:3;
-      unsigned level:4;
-      unsigned invalid:1;
-   } bits;
-   unsigned value;
-};
-
-
-struct llvmpipe_cached_tex_tile
-{
-   union tex_tile_address addr;
-   uint8_t color[TEX_TILE_SIZE][TEX_TILE_SIZE][4];
-};
-
-#define NUM_ENTRIES 50
-
-
-/** XXX move these */
-#define MAX_TEX_WIDTH 2048
-#define MAX_TEX_HEIGHT 2048
-
-
-struct llvmpipe_tex_tile_cache
-{
-   struct pipe_screen *screen;
-   struct pipe_surface *surface;  /**< the surface we're caching */
-   struct pipe_transfer *transfer;
-   void *transfer_map;
-
-   struct pipe_texture *texture;  /**< if caching a texture */
-   unsigned timestamp;
-
-   struct llvmpipe_cached_tex_tile entries[NUM_ENTRIES];
-
-   struct pipe_transfer *tex_trans;
-   void *tex_trans_map;
-   int tex_face, tex_level, tex_z;
-
-   struct llvmpipe_cached_tex_tile *last_tile;  /**< most recently retrieved tile */
-};
-
-
-extern struct llvmpipe_tex_tile_cache *
-lp_create_tex_tile_cache( struct pipe_screen *screen );
-
-extern void
-lp_destroy_tex_tile_cache(struct llvmpipe_tex_tile_cache *tc);
-
-extern void
-lp_tex_tile_cache_map_transfers(struct llvmpipe_tex_tile_cache *tc);
-
-extern void
-lp_tex_tile_cache_unmap_transfers(struct llvmpipe_tex_tile_cache *tc);
-
-extern void
-lp_tex_tile_cache_set_texture(struct llvmpipe_tex_tile_cache *tc,
-                          struct pipe_texture *texture);
-
-void
-lp_tex_tile_cache_validate_texture(struct llvmpipe_tex_tile_cache *tc);
-
-extern const struct llvmpipe_cached_tex_tile *
-lp_find_cached_tex_tile(struct llvmpipe_tex_tile_cache *tc,
-                        union tex_tile_address addr );
-
-static INLINE union tex_tile_address
-tex_tile_address( unsigned x,
-                  unsigned y,
-                  unsigned z,
-                  unsigned face,
-                  unsigned level )
-{
-   union tex_tile_address addr;
-
-   addr.value = 0;
-   addr.bits.x = x / TEX_TILE_SIZE;
-   addr.bits.y = y / TEX_TILE_SIZE;
-   addr.bits.z = z;
-   addr.bits.face = face;
-   addr.bits.level = level;
-      
-   return addr;
-}
-
-/* Quickly retrieve tile if it matches last lookup.
- */
-static INLINE const struct llvmpipe_cached_tex_tile *
-lp_get_cached_tex_tile(struct llvmpipe_tex_tile_cache *tc,
-                       union tex_tile_address addr )
-{
-   if (tc->last_tile->addr.value == addr.value)
-      return tc->last_tile;
-
-   return lp_find_cached_tex_tile( tc, addr );
-}
-
-
-#endif /* LP_TEX_CACHE_H */
-
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample_llvm.c b/src/gallium/drivers/llvmpipe/lp_tex_sample_llvm.c
index d2a6ae21f5..2533275dc1 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample_llvm.c
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample_llvm.c
@@ -42,12 +42,11 @@
 
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
-#include "lp_bld_debug.h"
-#include "lp_bld_type.h"
-#include "lp_bld_intr.h"
-#include "lp_bld_sample.h"
-#include "lp_bld_tgsi.h"
-#include "lp_state.h"
+#include "gallivm/lp_bld_debug.h"
+#include "gallivm/lp_bld_type.h"
+#include "gallivm/lp_bld_sample.h"
+#include "gallivm/lp_bld_tgsi.h"
+#include "lp_jit.h"
 #include "lp_tex_sample.h"
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c
index 2c135029ea..022bf92cb4 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -32,45 +32,42 @@
 
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_inlines.h"
-#include "pipe/internal/p_winsys_screen.h"
+#include "util/u_inlines.h"
 
 #include "util/u_format.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 
 #include "lp_context.h"
-#include "lp_state.h"
-#include "lp_texture.h"
-#include "lp_tex_cache.h"
 #include "lp_screen.h"
+#include "lp_texture.h"
+#include "lp_tile_size.h"
 #include "lp_winsys.h"
 
 
-/* Simple, maximally packed layout.
- */
-
-/* Conventional allocation path for non-display textures:
+/**
+ * Conventional allocation path for non-display textures:
+ * Simple, maximally packed layout.
  */
 static boolean
 llvmpipe_texture_layout(struct llvmpipe_screen *screen,
-                        struct llvmpipe_texture * lpt)
+                        struct llvmpipe_texture *lpt)
 {
    struct pipe_texture *pt = &lpt->base;
    unsigned level;
    unsigned width = pt->width0;
    unsigned height = pt->height0;
    unsigned depth = pt->depth0;
-
    unsigned buffer_size = 0;
 
    for (level = 0; level <= pt->last_level; level++) {
       unsigned nblocksx, nblocksy;
 
       /* Allocate storage for whole quads. This is particularly important
-       * for depth surfaces, which are currently stored in a swizzled format. */
-      nblocksx = util_format_get_nblocksx(pt->format, align(width, 2));
-      nblocksy = util_format_get_nblocksy(pt->format, align(height, 2));
+       * for depth surfaces, which are currently stored in a swizzled format.
+       * Width and height are rounded up to a multiple of TILE_SIZE. */
+      nblocksx = util_format_get_nblocksx(pt->format, align(width, TILE_SIZE));
+      nblocksy = util_format_get_nblocksy(pt->format, align(height, TILE_SIZE));
 
       lpt->stride[level] = align(nblocksx * util_format_get_blocksize(pt->format), 16);
 
@@ -80,7 +77,7 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen,
                       ((pt->target == PIPE_TEXTURE_CUBE) ? 6 : depth) *
                       lpt->stride[level]);
 
-      width  = u_minify(width, 1);
+      width = u_minify(width, 1);
       height = u_minify(height, 1);
       depth = u_minify(depth, 1);
    }
@@ -90,16 +87,23 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen,
    return lpt->data != NULL;
 }
 
+
+
 static boolean
 llvmpipe_displaytarget_layout(struct llvmpipe_screen *screen,
-                              struct llvmpipe_texture * lpt)
+                              struct llvmpipe_texture *lpt)
 {
    struct llvmpipe_winsys *winsys = screen->winsys;
 
+   /* Round up the surface size to a multiple of the tile size to
+    * avoid tile clipping.
+    */
+   unsigned width = align(lpt->base.width0, TILE_SIZE);
+   unsigned height = align(lpt->base.height0, TILE_SIZE);
+
    lpt->dt = winsys->displaytarget_create(winsys,
                                           lpt->base.format,
-                                          lpt->base.width0,
-                                          lpt->base.height0,
+                                          width, height,
                                           16,
                                           &lpt->stride[0] );
 
@@ -107,9 +111,6 @@ llvmpipe_displaytarget_layout(struct llvmpipe_screen *screen,
 }
 
 
-
-
-
 static struct pipe_texture *
 llvmpipe_texture_create(struct pipe_screen *_screen,
                         const struct pipe_texture *templat)
@@ -126,7 +127,7 @@ llvmpipe_texture_create(struct pipe_screen *_screen,
    /* XXX: The xlib state tracker is brain-dead and will request
     * PIPE_FORMAT_Z16_UNORM no matter how much we tell it we don't support it.
     */
-   if(lpt->base.format == PIPE_FORMAT_Z16_UNORM)
+   if (lpt->base.format == PIPE_FORMAT_Z16_UNORM)
       lpt->base.format = PIPE_FORMAT_Z32_UNORM;
 
    if (lpt->base.tex_usage & (PIPE_TEXTURE_USAGE_DISPLAY_TARGET |
@@ -178,6 +179,7 @@ llvmpipe_texture_blanket(struct pipe_screen * screen,
 
    return &lpt->base;
 #else
+   debug_printf("llvmpipe_texture_blanket() not implemented!\n");
    return NULL;
 #endif
 }
@@ -189,12 +191,15 @@ llvmpipe_texture_destroy(struct pipe_texture *pt)
    struct llvmpipe_screen *screen = llvmpipe_screen(pt->screen);
    struct llvmpipe_texture *lpt = llvmpipe_texture(pt);
 
-   if(lpt->dt) {
+   if (lpt->dt) {
+      /* display target */
       struct llvmpipe_winsys *winsys = screen->winsys;
       winsys->displaytarget_destroy(winsys, lpt->dt);
    }
-   else
+   else {
+      /* regular texture */
       align_free(lpt->data);
+   }
 
    FREE(lpt);
 }
@@ -236,7 +241,7 @@ llvmpipe_get_tex_surface(struct pipe_screen *screen,
 
       if (ps->usage & (PIPE_BUFFER_USAGE_CPU_WRITE |
                        PIPE_BUFFER_USAGE_GPU_WRITE)) {
-         /* Mark the surface as dirty.  The tile cache will look for this. */
+         /* Mark the surface as dirty. */
          lpt->timestamp++;
          llvmpipe_screen(screen)->timestamp++;
       }
@@ -298,8 +303,8 @@ llvmpipe_get_tex_transfer(struct pipe_screen *screen,
       pipe_texture_reference(&pt->texture, texture);
       pt->x = x;
       pt->y = y;
-      pt->width = w;
-      pt->height = h;
+      pt->width = align(w, TILE_SIZE);
+      pt->height = align(h, TILE_SIZE);
       pt->stride = lptex->stride[level];
       pt->usage = usage;
       pt->face = face;
@@ -356,7 +361,8 @@ llvmpipe_transfer_map( struct pipe_screen *_screen,
    lpt = llvmpipe_texture(transfer->texture);
    format = lpt->base.format;
 
-   if(lpt->dt) {
+   if (lpt->dt) {
+      /* display target */
       struct llvmpipe_winsys *winsys = screen->winsys;
 
       map = winsys->displaytarget_map(winsys, lpt->dt,
@@ -364,16 +370,16 @@ llvmpipe_transfer_map( struct pipe_screen *_screen,
       if (map == NULL)
          return NULL;
    }
-   else
+   else {
+      /* regular texture */
       map = lpt->data;
+   }
 
    /* May want to different things here depending on read/write nature
     * of the map:
     */
-   if (transfer->texture && (transfer->usage & PIPE_TRANSFER_WRITE))
-   {
+   if (transfer->texture && (transfer->usage & PIPE_TRANSFER_WRITE)) {
       /* Do something to notify sharing contexts of a texture change.
-       * In llvmpipe, that would mean flushing the texture cache.
        */
       screen->timestamp++;
    }
@@ -387,29 +393,24 @@ llvmpipe_transfer_map( struct pipe_screen *_screen,
 
 
 static void
-llvmpipe_transfer_unmap(struct pipe_screen *_screen,
+llvmpipe_transfer_unmap(struct pipe_screen *screen,
                        struct pipe_transfer *transfer)
 {
-   struct llvmpipe_screen *screen = llvmpipe_screen(_screen);
+   struct llvmpipe_screen *lp_screen = llvmpipe_screen(screen);
    struct llvmpipe_texture *lpt;
 
    assert(transfer->texture);
    lpt = llvmpipe_texture(transfer->texture);
 
-   if(lpt->dt) {
-      struct llvmpipe_winsys *winsys = screen->winsys;
+   if (lpt->dt) {
+      /* display target */
+      struct llvmpipe_winsys *winsys = lp_screen->winsys;
       winsys->displaytarget_unmap(winsys, lpt->dt);
    }
 }
 
 
 void
-llvmpipe_init_texture_funcs(struct llvmpipe_context *lp)
-{
-}
-
-
-void
 llvmpipe_init_screen_texture_funcs(struct pipe_screen *screen)
 {
    screen->texture_create = llvmpipe_texture_create;
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.h b/src/gallium/drivers/llvmpipe/lp_texture.h
index 00a20763e4..87c905bc02 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.h
+++ b/src/gallium/drivers/llvmpipe/lp_texture.h
@@ -37,6 +37,7 @@ struct pipe_screen;
 struct llvmpipe_context;
 struct llvmpipe_displaytarget;
 
+
 struct llvmpipe_texture
 {
    struct pipe_texture base;
@@ -58,6 +59,7 @@ struct llvmpipe_texture
    unsigned timestamp;
 };
 
+
 struct llvmpipe_transfer
 {
    struct pipe_transfer base;
@@ -73,6 +75,14 @@ llvmpipe_texture(struct pipe_texture *pt)
    return (struct llvmpipe_texture *) pt;
 }
 
+
+static INLINE const struct llvmpipe_texture *
+llvmpipe_texture_const(const struct pipe_texture *pt)
+{
+   return (const struct llvmpipe_texture *) pt;
+}
+
+
 static INLINE struct llvmpipe_transfer *
 llvmpipe_transfer(struct pipe_transfer *pt)
 {
@@ -81,10 +91,7 @@ llvmpipe_transfer(struct pipe_transfer *pt)
 
 
 extern void
-llvmpipe_init_texture_funcs( struct llvmpipe_context *llvmpipe );
-
-extern void
 llvmpipe_init_screen_texture_funcs(struct pipe_screen *screen);
 
 
-#endif /* LP_TEXTURE */
+#endif /* LP_TEXTURE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_cache.c b/src/gallium/drivers/llvmpipe/lp_tile_cache.c
deleted file mode 100644
index 7a1ecf5107..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_tile_cache.c
+++ /dev/null
@@ -1,358 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * Texture tile caching.
- *
- * Author:
- *    Brian Paul
- */
-
-#include "pipe/p_inlines.h"
-#include "util/u_memory.h"
-#include "util/u_math.h"
-#include "util/u_tile.h"
-#include "util/u_rect.h"
-#include "lp_context.h"
-#include "lp_surface.h"
-#include "lp_texture.h"
-#include "lp_tile_soa.h"
-#include "lp_tile_cache.h"
-
-
-#define MAX_WIDTH 4096
-#define MAX_HEIGHT 4096
-
-
-enum llvmpipe_tile_status
-{
-   LP_TILE_STATUS_UNDEFINED = 0,
-   LP_TILE_STATUS_CLEAR = 1,
-   LP_TILE_STATUS_DEFINED = 2
-};
-
-
-struct llvmpipe_cached_tile
-{
-   enum llvmpipe_tile_status status;
-
-   /** color in SOA format */
-   uint8_t *color;
-};
-
-
-struct llvmpipe_tile_cache
-{
-   struct pipe_screen *screen;
-   struct pipe_surface *surface;  /**< the surface we're caching */
-   struct pipe_transfer *transfer;
-   void *transfer_map;
-
-   struct llvmpipe_cached_tile entries[MAX_WIDTH/TILE_SIZE][MAX_HEIGHT/TILE_SIZE];
-
-   uint8_t clear_color[4];  /**< for color bufs */
-   uint clear_val;        /**< for z+stencil, or packed color clear value */
-
-   struct llvmpipe_cached_tile *last_tile;  /**< most recently retrieved tile */
-};
-
-
-struct llvmpipe_tile_cache *
-lp_create_tile_cache( struct pipe_screen *screen )
-{
-   struct llvmpipe_tile_cache *tc;
-   int maxLevels, maxTexSize;
-
-   /* sanity checking: max sure MAX_WIDTH/HEIGHT >= largest texture image */
-   maxLevels = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_LEVELS);
-   maxTexSize = 1 << (maxLevels - 1);
-   assert(MAX_WIDTH >= maxTexSize);
-
-   tc = CALLOC_STRUCT( llvmpipe_tile_cache );
-   if(!tc)
-      return NULL;
-
-   tc->screen = screen;
-
-   return tc;
-}
-
-
-void
-lp_destroy_tile_cache(struct llvmpipe_tile_cache *tc)
-{
-   struct pipe_screen *screen;
-   unsigned x, y;
-
-   for (y = 0; y < MAX_HEIGHT; y += TILE_SIZE) {
-      for (x = 0; x < MAX_WIDTH; x += TILE_SIZE) {
-         struct llvmpipe_cached_tile *tile = &tc->entries[y/TILE_SIZE][x/TILE_SIZE];
-
-         if(tile->color)
-            align_free(tile->color);
-      }
-   }
-
-   if (tc->transfer) {
-      screen = tc->transfer->texture->screen;
-      screen->tex_transfer_destroy(tc->transfer);
-   }
-
-   FREE( tc );
-}
-
-
-/**
- * Specify the surface to cache.
- */
-void
-lp_tile_cache_set_surface(struct llvmpipe_tile_cache *tc,
-                          struct pipe_surface *ps)
-{
-   if (tc->transfer) {
-      struct pipe_screen *screen = tc->transfer->texture->screen;
-
-      if (ps == tc->surface)
-         return;
-
-      if (tc->transfer_map) {
-         screen->transfer_unmap(screen, tc->transfer);
-         tc->transfer_map = NULL;
-      }
-
-      screen->tex_transfer_destroy(tc->transfer);
-      tc->transfer = NULL;
-   }
-
-   tc->surface = ps;
-
-   if (ps) {
-      struct pipe_screen *screen = ps->texture->screen;
-      unsigned x, y;
-
-      tc->transfer = screen->get_tex_transfer(screen, ps->texture, ps->face,
-                                              ps->level, ps->zslice,
-                                              PIPE_TRANSFER_READ_WRITE,
-                                              0, 0, ps->width, ps->height);
-
-      for (y = 0; y < ps->height; y += TILE_SIZE) {
-         for (x = 0; x < ps->width; x += TILE_SIZE) {
-            struct llvmpipe_cached_tile *tile = &tc->entries[y/TILE_SIZE][x/TILE_SIZE];
-
-            tile->status = LP_TILE_STATUS_UNDEFINED;
-
-            if(!tile->color)
-               tile->color = align_malloc( TILE_SIZE*TILE_SIZE*NUM_CHANNELS, 16 );
-         }
-      }
-   }
-}
-
-
-/**
- * Return the transfer being cached.
- */
-struct pipe_surface *
-lp_tile_cache_get_surface(struct llvmpipe_tile_cache *tc)
-{
-   return tc->surface;
-}
-
-
-void
-lp_tile_cache_map_transfers(struct llvmpipe_tile_cache *tc)
-{
-   if (tc->transfer && !tc->transfer_map)
-      tc->transfer_map = tc->screen->transfer_map(tc->screen, tc->transfer);
-}
-
-
-void
-lp_tile_cache_unmap_transfers(struct llvmpipe_tile_cache *tc)
-{
-   if (tc->transfer_map) {
-      tc->screen->transfer_unmap(tc->screen, tc->transfer);
-      tc->transfer_map = NULL;
-   }
-}
-
-
-/**
- * Set a tile to a solid color.
- */
-static void
-clear_tile(struct llvmpipe_cached_tile *tile,
-           uint8_t clear_color[4])
-{
-   if (clear_color[0] == clear_color[1] &&
-       clear_color[1] == clear_color[2] &&
-       clear_color[2] == clear_color[3]) {
-      memset(tile->color, clear_color[0], TILE_SIZE * TILE_SIZE * 4);
-   }
-   else {
-      uint x, y, chan;
-      for (y = 0; y < TILE_SIZE; y++)
-         for (x = 0; x < TILE_SIZE; x++)
-            for (chan = 0; chan < 4; ++chan)
-               TILE_PIXEL(tile->color, x, y, chan) = clear_color[chan];
-   }
-}
-
-
-/**
- * Flush the tile cache: write all dirty tiles back to the transfer.
- * any tiles "flagged" as cleared will be "really" cleared.
- */
-void
-lp_flush_tile_cache(struct llvmpipe_tile_cache *tc)
-{
-   struct pipe_transfer *pt = tc->transfer;
-   unsigned x, y;
-
-   if(!pt)
-      return;
-
-   assert(tc->transfer_map);
-
-   /* push the tile to all positions marked as clear */
-   for (y = 0; y < pt->height; y += TILE_SIZE) {
-      for (x = 0; x < pt->width; x += TILE_SIZE) {
-         struct llvmpipe_cached_tile *tile = &tc->entries[y/TILE_SIZE][x/TILE_SIZE];
-
-         if(tile->status != LP_TILE_STATUS_UNDEFINED) {
-            unsigned w = TILE_SIZE;
-            unsigned h = TILE_SIZE;
-
-            if (!pipe_clip_tile(x, y, &w, &h, pt)) {
-               switch(tile->status) {
-               case LP_TILE_STATUS_CLEAR:
-                  /* Actually clear the tiles which were flagged as being in a
-                   * clear state. */
-                  util_fill_rect(tc->transfer_map, pt->texture->format, pt->stride,
-                                 x, y, w, h,
-                                 tc->clear_val);
-                  break;
-
-               case LP_TILE_STATUS_DEFINED:
-                  lp_tile_write_4ub(pt->texture->format,
-                                    tile->color,
-                                    tc->transfer_map, pt->stride,
-                                    x, y, w, h);
-                  break;
-
-               default:
-                  assert(0);
-                  break;
-               }
-            }
-
-            tile->status = LP_TILE_STATUS_UNDEFINED;
-         }
-      }
-   }
-}
-
-
-/**
- * Get a tile from the cache.
- * \param x, y  position of tile, in pixels
- */
-void *
-lp_get_cached_tile(struct llvmpipe_tile_cache *tc,
-                   unsigned x, unsigned y )
-{
-   struct llvmpipe_cached_tile *tile = &tc->entries[y/TILE_SIZE][x/TILE_SIZE];
-   struct pipe_transfer *pt = tc->transfer;
-   
-   assert(tc->surface);
-   assert(tc->transfer);
-
-   if(!tc->transfer_map)
-      lp_tile_cache_map_transfers(tc);
-
-   assert(tc->transfer_map);
-
-   switch(tile->status) {
-   case LP_TILE_STATUS_CLEAR:
-      /* don't get tile from framebuffer, just clear it */
-      clear_tile(tile, tc->clear_color);
-      tile->status = LP_TILE_STATUS_DEFINED;
-      break;
-
-   case LP_TILE_STATUS_UNDEFINED: {
-      unsigned w = TILE_SIZE;
-      unsigned h = TILE_SIZE;
-
-      x &= ~(TILE_SIZE - 1);
-      y &= ~(TILE_SIZE - 1);
-
-      if (!pipe_clip_tile(x, y, &w, &h, tc->transfer))
-         lp_tile_read_4ub(pt->texture->format,
-                          tile->color,
-                          tc->transfer_map, tc->transfer->stride,
-                          x, y, w, h);
-
-      tile->status = LP_TILE_STATUS_DEFINED;
-      break;
-   }
-
-   case LP_TILE_STATUS_DEFINED:
-      /* nothing to do */
-      break;
-   }
-
-   return tile->color;
-}
-
-
-/**
- * When a whole surface is being cleared to a value we can avoid
- * fetching tiles above.
- * Save the color and set a 'clearflag' for each tile of the screen.
- */
-void
-lp_tile_cache_clear(struct llvmpipe_tile_cache *tc, const float *rgba,
-                    uint clearValue)
-{
-   struct pipe_transfer *pt = tc->transfer;
-   const unsigned w = pt->width;
-   const unsigned h = pt->height;
-   unsigned x, y, chan;
-
-   for(chan = 0; chan < 4; ++chan)
-      tc->clear_color[chan] = float_to_ubyte(rgba[chan]);
-
-   tc->clear_val = clearValue;
-
-   /* push the tile to all positions marked as clear */
-   for (y = 0; y < h; y += TILE_SIZE) {
-      for (x = 0; x < w; x += TILE_SIZE) {
-         struct llvmpipe_cached_tile *tile = &tc->entries[y/TILE_SIZE][x/TILE_SIZE];
-         tile->status = LP_TILE_STATUS_CLEAR;
-      }
-   }
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_cache.h b/src/gallium/drivers/llvmpipe/lp_tile_cache.h
deleted file mode 100644
index 161bab3799..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_tile_cache.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#ifndef LP_TILE_CACHE_H
-#define LP_TILE_CACHE_H
-
-
-#include "pipe/p_compiler.h"
-#include "lp_tile_soa.h"
-
-
-struct llvmpipe_tile_cache;  /* opaque */
-
-
-extern struct llvmpipe_tile_cache *
-lp_create_tile_cache( struct pipe_screen *screen );
-
-extern void
-lp_destroy_tile_cache(struct llvmpipe_tile_cache *tc);
-
-extern void
-lp_tile_cache_set_surface(struct llvmpipe_tile_cache *tc,
-                          struct pipe_surface *lps);
-
-extern struct pipe_surface *
-lp_tile_cache_get_surface(struct llvmpipe_tile_cache *tc);
-
-extern void
-lp_tile_cache_map_transfers(struct llvmpipe_tile_cache *tc);
-
-extern void
-lp_tile_cache_unmap_transfers(struct llvmpipe_tile_cache *tc);
-
-extern void
-lp_flush_tile_cache(struct llvmpipe_tile_cache *tc);
-
-extern void
-lp_tile_cache_clear(struct llvmpipe_tile_cache *tc, const float *rgba,
-                    uint clearValue);
-
-extern void *
-lp_get_cached_tile(struct llvmpipe_tile_cache *tc,
-                   unsigned x, unsigned y );
-
-
-#endif /* LP_TILE_CACHE_H */
-
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_size.h b/src/gallium/drivers/llvmpipe/lp_tile_size.h
new file mode 100644
index 0000000000..f0b983c063
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_tile_size.h
@@ -0,0 +1,39 @@
+/**************************************************************************
+ * 
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef LP_TILE_SIZE_H
+#define LP_TILE_SIZE_H
+
+
+/**
+ * Tile size (width and height). This needs to be a power of two.
+ */
+#define TILE_ORDER 6
+#define TILE_SIZE (1 << TILE_ORDER)
+
+
+#endif /* LP_TILE_SIZE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_soa.h b/src/gallium/drivers/llvmpipe/lp_tile_soa.h
index 19d00b58d3..eea3ab8499 100644
--- a/src/gallium/drivers/llvmpipe/lp_tile_soa.h
+++ b/src/gallium/drivers/llvmpipe/lp_tile_soa.h
@@ -30,7 +30,7 @@
 
 #include "pipe/p_compiler.h"
 #include "tgsi/tgsi_exec.h" /* for NUM_CHANNELS */
-
+#include "lp_tile_size.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -40,26 +40,20 @@ extern "C" {
 struct pipe_transfer;
 
 
-/**
- * Cache tile size (width and height). This needs to be a power of two.
- */
-#define TILE_SIZE 64
-
-
-#define TILE_VECTOR_HEIGHT 2
-#define TILE_VECTOR_WIDTH 8
+#define TILE_VECTOR_HEIGHT 4
+#define TILE_VECTOR_WIDTH 4
 
 extern const unsigned char
 tile_offset[TILE_VECTOR_HEIGHT][TILE_VECTOR_WIDTH];
 
-#define TILE_C_STRIDE (TILE_VECTOR_HEIGHT*TILE_VECTOR_WIDTH)
-#define TILE_X_STRIDE (NUM_CHANNELS*TILE_C_STRIDE)
-#define TILE_Y_STRIDE (TILE_VECTOR_HEIGHT*TILE_SIZE*NUM_CHANNELS)
+#define TILE_C_STRIDE (TILE_VECTOR_HEIGHT * TILE_VECTOR_WIDTH) /* = 16 */
+#define TILE_X_STRIDE (NUM_CHANNELS * TILE_C_STRIDE) /* = 64 when NUM_CHANNELS is 4 */
+#define TILE_Y_STRIDE (TILE_VECTOR_HEIGHT * TILE_SIZE * NUM_CHANNELS) /* = 1024 when NUM_CHANNELS is 4 */
 
 #define TILE_PIXEL(_p, _x, _y, _c) \
-   ((_p)[((_y)/TILE_VECTOR_HEIGHT)*TILE_Y_STRIDE + \
-         ((_x)/TILE_VECTOR_WIDTH)*TILE_X_STRIDE + \
-         (_c)*TILE_C_STRIDE + \
+   ((_p)[((_y) / TILE_VECTOR_HEIGHT) * TILE_Y_STRIDE + \
+         ((_x) / TILE_VECTOR_WIDTH) * TILE_X_STRIDE + \
+         (_c) * TILE_C_STRIDE + \
          tile_offset[(_y) % TILE_VECTOR_HEIGHT][(_x) % TILE_VECTOR_WIDTH]])
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_soa.py b/src/gallium/drivers/llvmpipe/lp_tile_soa.py
index 004c5c979e..5d53689a3d 100644
--- a/src/gallium/drivers/llvmpipe/lp_tile_soa.py
+++ b/src/gallium/drivers/llvmpipe/lp_tile_soa.py
@@ -129,22 +129,8 @@ def generate_format_read(format, dst_type, dst_native_type, dst_suffix):
     print
     
 
-def generate_format_write(format, src_type, src_native_type, src_suffix):
-    '''Generate the function to write pixels to a particular format'''
-
-    name = short_name(format)
-
-    dst_native_type = native_type(format)
-
-    print 'static void'
-    print 'lp_tile_%s_write_%s(const %s *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)' % (name, src_suffix, src_native_type)
-    print '{'
-    print '   unsigned x, y;'
-    print '   uint8_t *dst_row = dst + y0*dst_stride;'
-    print '   for (y = 0; y < h; ++y) {'
-    print '      %s *dst_pixel = (%s *)(dst_row + x0*%u);' % (dst_native_type, dst_native_type, format.stride())
-    print '      for (x = 0; x < w; ++x) {'
-
+def compute_inverse_swizzle(format):
+    '''Return an array[4] of inverse swizzle terms'''
     inv_swizzle = [None]*4
     if format.colorspace == 'rgb':
         for i in range(4):
@@ -155,8 +141,86 @@ def generate_format_write(format, src_type, src_native_type, src_suffix):
         swizzle = format.out_swizzle[0]
         if swizzle < 4:
             inv_swizzle[swizzle] = 0
-    else:
-        assert False
+    return inv_swizzle
+
+
+def pack_rgba(format, src_type, r, g, b, a):
+    """Return an expression for packing r, g, b, a into a pixel of the
+    given format.  Ex: '(b << 24) | (g << 16) | (r << 8) | (a << 0)'
+    """
+    assert format.colorspace == 'rgb'
+    inv_swizzle = compute_inverse_swizzle(format)
+    shift = 0
+    expr = None
+    for i in range(4):
+        # choose r, g, b, or a depending on the inverse swizzle term
+        if inv_swizzle[i] == 0:
+            value = r
+        elif inv_swizzle[i] == 1:
+            value = g
+        elif inv_swizzle[i] == 2:
+            value = b
+        elif inv_swizzle[i] == 3:
+            value = a
+        else:
+            value = None
+
+        if value:
+            dst_type = format.in_types[i]
+            dst_native_type = native_type(format)
+            value = conversion_expr(src_type, dst_type, dst_native_type, value)
+            term = "((%s) << %d)" % (value, shift)
+            if expr:
+                expr = expr + " | " + term
+            else:
+                expr = term
+
+        width = format.in_types[i].size
+        shift = shift + width
+    return expr
+
+
+def emit_unrolled_write_code(format, src_type):
+    '''Emit code for writing a block based on unrolled loops (considerably
+    faster than the TILE_PIXEL-based code below).  NOTE(review): src is stepped
+    linearly per 4x4 block, so this looks to assume w == TILE_SIZE -- confirm.'''
+    dst_native_type = native_type(format)
+    print '   const unsigned dstpix_stride = dst_stride / %d;' % format.stride()
+    print '   %s *dstpix = (%s *) dst;' % (dst_native_type, dst_native_type)
+    print '   unsigned int qx, qy, i;'
+    print
+    print '   for (qy = 0; qy < h; qy += TILE_VECTOR_HEIGHT) {'
+    print '      const unsigned py = y0 + qy;'
+    print '      for (qx = 0; qx < w; qx += TILE_VECTOR_WIDTH) {'
+    print '         const unsigned px = x0 + qx;'
+    print '         const uint8_t *r = src + 0 * TILE_C_STRIDE;'
+    print '         const uint8_t *g = src + 1 * TILE_C_STRIDE;'
+    print '         const uint8_t *b = src + 2 * TILE_C_STRIDE;'
+    print '         const uint8_t *a = src + 3 * TILE_C_STRIDE;'
+    print '         (void) r; (void) g; (void) b; (void) a; /* silence warnings */'
+    print '         for (i = 0; i < TILE_C_STRIDE; i += 2) {'
+    print '            const uint32_t pixel0 = %s;' % pack_rgba(format, src_type, "r[i+0]", "g[i+0]", "b[i+0]", "a[i+0]")
+    print '            const uint32_t pixel1 = %s;' % pack_rgba(format, src_type, "r[i+1]", "g[i+1]", "b[i+1]", "a[i+1]")
+    print '            const unsigned offset = (py + tile_y_offset[i]) * dstpix_stride + (px + tile_x_offset[i]);'
+    print '            dstpix[offset + 0] = pixel0;'
+    print '            dstpix[offset + 1] = pixel1;'
+    print '         }'
+    print '         src += TILE_X_STRIDE;'
+    print '      }'
+    print '   }'
+
+
+def emit_tile_pixel_write_code(format, src_type):
+    '''Emit code for writing a block based on the TILE_PIXEL macro.'''
+    dst_native_type = native_type(format)
+
+    inv_swizzle = compute_inverse_swizzle(format)
+
+    print '   unsigned x, y;'
+    print '   uint8_t *dst_row = dst + y0*dst_stride;'
+    print '   for (y = 0; y < h; ++y) {'
+    print '      %s *dst_pixel = (%s *)(dst_row + x0*%u);' % (dst_native_type, dst_native_type, format.stride())
+    print '      for (x = 0; x < w; ++x) {'
 
     if format.layout == ARITH:
         print '         %s pixel = 0;' % dst_native_type
@@ -185,6 +249,20 @@ def generate_format_write(format, src_type, src_native_type, src_suffix):
     print '      }'
     print '      dst_row += dst_stride;'
     print '   }'
+
+
+def generate_format_write(format, src_type, src_native_type, src_suffix):
+    '''Generate the function to write pixels to a particular format'''
+
+    name = short_name(format)
+
+    print 'static void'
+    print 'lp_tile_%s_write_%s(const %s *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)' % (name, src_suffix, src_native_type)
+    print '{'
+    if format.layout == ARITH and format.colorspace == 'rgb':
+        emit_unrolled_write_code(format, src_type)
+    else:
+        emit_tile_pixel_write_code(format, src_type)
     print '}'
     print
     
@@ -259,8 +337,23 @@ def main():
     print
     print 'const unsigned char'
     print 'tile_offset[TILE_VECTOR_HEIGHT][TILE_VECTOR_WIDTH] = {'
-    print '   {  0,  1,  4,  5,  8,  9, 12, 13},'
-    print '   {  2,  3,  6,  7, 10, 11, 14, 15}'
+    print '   {  0,  1,  4,  5},'
+    print '   {  2,  3,  6,  7},'
+    print '   {  8,  9, 12, 13},'
+    print '   { 10, 11, 14, 15}'
+    print '};'
+    print
+    print '/* Note: these lookup tables could be replaced with some'
+    print ' * bit-twiddling code, but this is a little faster.'
+    print ' */'
+    print 'static const unsigned tile_x_offset[TILE_VECTOR_WIDTH * TILE_VECTOR_HEIGHT] = {'  # x of element i (inverse of tile_offset)
+    print '   0, 1, 0, 1, 2, 3, 2, 3,'
+    print '   0, 1, 0, 1, 2, 3, 2, 3'
+    print '};'
+    print
+    print 'static const unsigned tile_y_offset[TILE_VECTOR_WIDTH * TILE_VECTOR_HEIGHT] = {'  # y of element i (inverse of tile_offset)
+    print '   0, 0, 1, 1, 0, 0, 1, 1,'
+    print '   2, 2, 3, 3, 2, 2, 3, 3'
     print '};'
     print
 
diff --git a/src/gallium/drivers/llvmpipe/lp_winsys.h b/src/gallium/drivers/llvmpipe/lp_winsys.h
index 74b472b653..ce11fa9304 100644
--- a/src/gallium/drivers/llvmpipe/lp_winsys.h
+++ b/src/gallium/drivers/llvmpipe/lp_winsys.h
@@ -113,9 +113,6 @@ struct llvmpipe_winsys
 };
 
 
-struct pipe_context *
-llvmpipe_create( struct pipe_screen * );
-
 
 struct pipe_screen *
 llvmpipe_create_screen( struct llvmpipe_winsys * );