163 files changed, 13069 insertions, 9208 deletions
diff --git a/src/gallium/auxiliary/Makefile b/src/gallium/auxiliary/Makefile
index 1d0930e024..38ce14df6b 100644
--- a/src/gallium/auxiliary/Makefile
+++ b/src/gallium/auxiliary/Makefile
@@ -105,9 +105,16 @@ C_SOURCES = \
 	util/u_cpu_detect.c \
 	util/u_dl.c \
 	util/u_draw_quad.c \
-	util/u_format_access.c \
+	util/u_format.c \
+	util/u_format_other.c \
+	util/u_format_s3tc.c \
+	util/u_format_srgb.c \
 	util/u_format_table.c \
+	util/u_format_tests.c \
+	util/u_format_yuv.c \
+	util/u_format_zs.c \
 	util/u_gen_mipmap.c \
+	util/u_half.c \
 	util/u_handle_table.c \
 	util/u_hash_table.c \
 	util/u_hash.c \
@@ -118,54 +125,55 @@ C_SOURCES = \
 	util/u_mm.c \
 	util/u_rect.c \
 	util/u_ringbuffer.c \
+	util/u_sampler.c \
 	util/u_simple_shaders.c \
 	util/u_snprintf.c \
 	util/u_surface.c \
+	util/u_surfaces.c \
 	util/u_texture.c \
 	util/u_tile.c \
-	util/u_timed_winsys.c \
+	util/u_transfer.c \
+	util/u_resource.c \
 	util/u_upload_mgr.c \
-	util/u_simple_screen.c \
-	vl/vl_bitstream_parser.c \
-	vl/vl_mpeg12_mc_renderer.c \
-	vl/vl_compositor.c \
-	vl/vl_csc.c \
-	vl/vl_shader_build.c
+	target-helpers/wrap_screen.c
+
+	# Disabling until pipe-video branch gets merged in
+	#vl/vl_bitstream_parser.c \
+	#vl/vl_mpeg12_mc_renderer.c \
+	#vl/vl_compositor.c \
+	#vl/vl_csc.c \
+	#vl/vl_shader_build.c \
 
 GALLIVM_SOURCES = \
-        gallivm/lp_bld_alpha.c \
         gallivm/lp_bld_arit.c \
-        gallivm/lp_bld_blend_aos.c \
-        gallivm/lp_bld_blend_logicop.c \
-        gallivm/lp_bld_blend_soa.c \
         gallivm/lp_bld_const.c \
         gallivm/lp_bld_conv.c \
         gallivm/lp_bld_debug.c \
-        gallivm/lp_bld_depth.c \
         gallivm/lp_bld_flow.c \
-        gallivm/lp_bld_format_aos.c \
-        gallivm/lp_bld_format_query.c \
         gallivm/lp_bld_format_soa.c \
-        gallivm/lp_bld_interp.c \
+        gallivm/lp_bld_init.c \
         gallivm/lp_bld_intr.c \
         gallivm/lp_bld_logic.c \
         gallivm/lp_bld_pack.c \
+        gallivm/lp_bld_printf.c \
         gallivm/lp_bld_sample.c \
         gallivm/lp_bld_sample_soa.c \
         gallivm/lp_bld_struct.c \
         gallivm/lp_bld_swizzle.c \
         gallivm/lp_bld_tgsi_soa.c \
-        gallivm/lp_bld_type.c
+        gallivm/lp_bld_type.c \
+        draw/draw_llvm.c \
+        draw/draw_pt_fetch_shade_pipeline_llvm.c \
+        draw/draw_llvm_translate.c
 
-GALLIVM_CPP_SOURCES = \
-        gallivm/lp_bld_init.cpp
+GALLIVM_CPP_SOURCES =
 
 GENERATED_SOURCES = \
 	indices/u_indices_gen.c \
 	indices/u_unfilled_gen.c \
-	util/u_format_access.c \
-	util/u_format_pack.h \
-	util/u_format_table.c
+	util/u_format_srgb.c \
+	util/u_format_table.c \
+	util/u_half.c
 
 
 ifeq ($(MESA_LLVM),1)
@@ -188,12 +196,11 @@ indices/u_indices_gen.c: indices/u_indices_gen.py
 indices/u_unfilled_gen.c: indices/u_unfilled_gen.py
 	python $< > $@
 
-util/u_format_table.c: util/u_format_table.py util/u_format_parse.py util/u_format.csv
-	python util/u_format_table.py util/u_format.csv > $@
-
-util/u_format_pack.h: util/u_format_pack.py util/u_format_parse.py util/u_format.csv
-	python util/u_format_pack.py util/u_format.csv > $@
+util/u_format_srgb.c: util/u_format_srgb.py
+	python $< > $@
 
-util/u_format_access.c: util/u_format_access.py util/u_format_parse.py util/u_format.csv
-	python util/u_format_access.py util/u_format.csv > $@
+util/u_format_table.c: util/u_format_table.py util/u_format_pack.py util/u_format_parse.py util/u_format.csv
+	python util/u_format_table.py util/u_format.csv > $@
 
+util/u_half.c: util/u_half.py
+	python util/u_half.py > $@
diff --git a/src/gallium/auxiliary/SConscript b/src/gallium/auxiliary/SConscript
index f365c4bbdd..a9ec5d4e2c 100644
--- a/src/gallium/auxiliary/SConscript
+++ b/src/gallium/auxiliary/SConscript
@@ -7,6 +7,8 @@ env.Append(CPPPATH = [
     'util',
 ])
 
+env.Tool('udis86')
+
 env.CodeGenerate(
     target = 'indices/u_indices_gen.c', 
     script = 'indices/u_indices_gen.py', 
@@ -22,26 +24,31 @@ env.CodeGenerate(
 )
 
 env.CodeGenerate(
-    target = 'util/u_format_table.c',
-    script = 'util/u_format_table.py',
-    source = ['util/u_format.csv'],
-    command = 'python $SCRIPT $SOURCE > $TARGET'
+    target = 'util/u_format_srgb.c', 
+    script = 'util/u_format_srgb.py', 
+    source = [],
+    command = python_cmd + ' $SCRIPT > $TARGET'
 )
 
 env.CodeGenerate(
-    target = File('util/u_format_pack.h').srcnode(),
-    script = 'util/u_format_pack.py',
+    target = 'util/u_format_table.c',
+    script = 'util/u_format_table.py',
     source = ['util/u_format.csv'],
     command = 'python $SCRIPT $SOURCE > $TARGET'
 )
 
 env.CodeGenerate(
-    target = 'util/u_format_access.c',
-    script = 'util/u_format_access.py',
-    source = ['util/u_format.csv'],
-    command = 'python $SCRIPT $SOURCE > $TARGET'
+    target = 'util/u_half.c',
+    script = 'util/u_half.py',
+    source = [],
+    command = 'python $SCRIPT > $TARGET'
 )
 
+env.Depends('util/u_format_table.c', [
+    'util/u_format_parse.py', 
+    'util/u_format_pack.py', 
+])
+
 source = [
     'cso_cache/cso_context.c',
     'cso_cache/cso_cache.c',
@@ -147,9 +154,16 @@ source = [
     'util/u_dump_state.c',
     'util/u_dl.c',
     'util/u_draw_quad.c',
-    'util/u_format_access.c',
+    'util/u_format.c',
+    'util/u_format_other.c',
+    'util/u_format_s3tc.c',
+    'util/u_format_srgb.c',
     'util/u_format_table.c',
+    'util/u_format_tests.c',
+    'util/u_format_yuv.c',
+    'util/u_format_zs.c',
     'util/u_gen_mipmap.c',
+    'util/u_half.c',
     'util/u_handle_table.c',
     'util/u_hash.c',
     'util/u_hash_table.c',
@@ -158,48 +172,48 @@ source = [
     'util/u_math.c',
     'util/u_mm.c',
     'util/u_rect.c',
+    'util/u_resource.c',
     'util/u_ringbuffer.c',
+    'util/u_sampler.c',
     'util/u_simple_shaders.c',
     'util/u_snprintf.c',
     'util/u_surface.c',
+    'util/u_surfaces.c',
     'util/u_texture.c',
     'util/u_tile.c',
-    'util/u_timed_winsys.c',
+    'util/u_transfer.c',
     'util/u_upload_mgr.c',
-    'util/u_simple_screen.c',
-    'vl/vl_bitstream_parser.c',
-    'vl/vl_mpeg12_mc_renderer.c',
-    'vl/vl_compositor.c',
-    'vl/vl_csc.c',
-    'vl/vl_shader_build.c',
+    # Disabling until pipe-video branch gets merged in
+    #'vl/vl_bitstream_parser.c',
+    #'vl/vl_mpeg12_mc_renderer.c',
+    #'vl/vl_compositor.c',
+    #'vl/vl_csc.c',
+    #'vl/vl_shader_build.c',
+    'target-helpers/wrap_screen.c',
 ]
 
-if drawllvm:
+if env['llvm']:
     source += [
-    'gallivm/lp_bld_alpha.c',
     'gallivm/lp_bld_arit.c',
-    'gallivm/lp_bld_blend_aos.c',
-    'gallivm/lp_bld_blend_logicop.c',
-    'gallivm/lp_bld_blend_soa.c',
     'gallivm/lp_bld_const.c',
     'gallivm/lp_bld_conv.c',
     'gallivm/lp_bld_debug.c',
-    'gallivm/lp_bld_depth.c',
     'gallivm/lp_bld_flow.c',
-    'gallivm/lp_bld_format_aos.c',
-    'gallivm/lp_bld_format_query.c',
     'gallivm/lp_bld_format_soa.c',
-    'gallivm/lp_bld_interp.c',
     'gallivm/lp_bld_intr.c',
     'gallivm/lp_bld_logic.c',
-    'gallivm/lp_bld_init.cpp',
+    'gallivm/lp_bld_init.c',
     'gallivm/lp_bld_pack.c',
+    'gallivm/lp_bld_printf.c',
     'gallivm/lp_bld_sample.c',
     'gallivm/lp_bld_sample_soa.c',
     'gallivm/lp_bld_struct.c',
     'gallivm/lp_bld_swizzle.c',
     'gallivm/lp_bld_tgsi_soa.c',
     'gallivm/lp_bld_type.c',
+    'draw/draw_llvm.c',
+    'draw/draw_pt_fetch_shade_pipeline_llvm.c',
+    'draw/draw_llvm_translate.c'
     ]
 
 gallium = env.ConvenienceLibrary(
diff --git a/src/gallium/auxiliary/cso_cache/cso_cache.c b/src/gallium/auxiliary/cso_cache/cso_cache.c
index a6a07e72c2..900c64df4b 100644
--- a/src/gallium/auxiliary/cso_cache/cso_cache.c
+++ b/src/gallium/auxiliary/cso_cache/cso_cache.c
@@ -43,6 +43,7 @@ struct cso_cache {
    struct cso_hash *vs_hash;
    struct cso_hash *rasterizer_hash;
    struct cso_hash *sampler_hash;
+   struct cso_hash *velements_hash;
    int    max_size;
 
    cso_sanitize_callback sanitize_cb;
@@ -108,6 +109,9 @@ static struct cso_hash *_cso_hash_for_type(struct cso_cache *sc, enum cso_cache_
    case CSO_VERTEX_SHADER:
       hash = sc->vs_hash;
       break;
+   case CSO_VELEMENTS:
+      hash = sc->velements_hash;
+      break;
    }
 
    return hash;
@@ -161,6 +165,13 @@ static void delete_vs_state(void *state, void *data)
    FREE(state);
 }
 
+static void delete_velements(void *state, void *data)
+{
+   struct cso_velements *cso = (struct cso_velements *)state;
+   if (cso->delete_state)
+      cso->delete_state(cso->context, cso->data);
+   FREE(state);
+}
 
 static INLINE void delete_cso(void *state, enum cso_cache_type type)
 {
@@ -183,6 +194,9 @@ static INLINE void delete_cso(void *state, enum cso_cache_type type)
    case CSO_VERTEX_SHADER:
       delete_vs_state(state, 0);
       break;
+   case CSO_VELEMENTS:
+      delete_velements(state, 0);
+      break;
    default:
       assert(0);
       FREE(state);
@@ -294,6 +308,7 @@ struct cso_cache *cso_cache_create(void)
    sc->rasterizer_hash    = cso_hash_create();
    sc->fs_hash            = cso_hash_create();
    sc->vs_hash            = cso_hash_create();
+   sc->velements_hash     = cso_hash_create();
    sc->sanitize_cb        = sanitize_cb;
    sc->sanitize_data      = 0;
 
@@ -325,6 +340,9 @@ void cso_for_each_state(struct cso_cache *sc, enum cso_cache_type type,
    case CSO_VERTEX_SHADER:
       hash = sc->vs_hash;
       break;
+   case CSO_VELEMENTS:
+      hash = sc->velements_hash;
+      break;
    }
 
    iter = cso_hash_first_node(hash);
@@ -351,6 +369,7 @@ void cso_cache_delete(struct cso_cache *sc)
    cso_for_each_state(sc, CSO_VERTEX_SHADER, delete_vs_state, 0);
    cso_for_each_state(sc, CSO_RASTERIZER, delete_rasterizer_state, 0);
    cso_for_each_state(sc, CSO_SAMPLER, delete_sampler_state, 0);
+   cso_for_each_state(sc, CSO_VELEMENTS, delete_velements, 0);
 
    cso_hash_delete(sc->blend_hash);
    cso_hash_delete(sc->sampler_hash);
@@ -358,6 +377,7 @@ void cso_cache_delete(struct cso_cache *sc)
    cso_hash_delete(sc->rasterizer_hash);
    cso_hash_delete(sc->fs_hash);
    cso_hash_delete(sc->vs_hash);
+   cso_hash_delete(sc->velements_hash);
    FREE(sc);
 }
 
@@ -372,6 +392,7 @@ void cso_set_maximum_cache_size(struct cso_cache *sc, int number)
    sanitize_hash(sc, sc->vs_hash, CSO_VERTEX_SHADER, sc->max_size);
    sanitize_hash(sc, sc->rasterizer_hash, CSO_RASTERIZER, sc->max_size);
    sanitize_hash(sc, sc->sampler_hash, CSO_SAMPLER, sc->max_size);
+   sanitize_hash(sc, sc->velements_hash, CSO_VELEMENTS, sc->max_size);
 }
 
 int cso_maximum_cache_size(const struct cso_cache *sc)
diff --git a/src/gallium/auxiliary/cso_cache/cso_cache.h b/src/gallium/auxiliary/cso_cache/cso_cache.h
index eea60b940b..fb09b83c62 100644
--- a/src/gallium/auxiliary/cso_cache/cso_cache.h
+++ b/src/gallium/auxiliary/cso_cache/cso_cache.h
@@ -53,6 +53,7 @@
   * - rasterizer (old setup)
   * - sampler
   * - vertex shader
+  * - vertex elements
   *
   * Things that are not constant state objects include:
   * - blend_color
@@ -90,7 +91,8 @@ enum cso_cache_type {
    CSO_DEPTH_STENCIL_ALPHA,
    CSO_RASTERIZER,
    CSO_FRAGMENT_SHADER,
-   CSO_VERTEX_SHADER
+   CSO_VERTEX_SHADER,
+   CSO_VELEMENTS
 };
 
 typedef void (*cso_state_callback)(void *ctx, void *obj);
@@ -144,6 +146,18 @@ struct cso_sampler {
    struct pipe_context *context;
 };
 
+struct cso_velems_state {
+   unsigned count;
+   struct pipe_vertex_element velems[PIPE_MAX_ATTRIBS];
+};
+
+struct cso_velements {
+   struct cso_velems_state state;
+   void *data;
+   cso_state_callback delete_state;
+   struct pipe_context *context;
+};
+
 unsigned cso_construct_key(void *item, int item_size);
 
 struct cso_cache *cso_cache_create(void);
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c
index a7335c340c..6fd4bd3642 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -37,6 +37,7 @@
 
 #include "pipe/p_state.h"
 #include "util/u_inlines.h"
+#include "util/u_math.h"
 #include "util/u_memory.h"
 #include "tgsi/tgsi_parse.h"
 
@@ -69,17 +70,17 @@ struct cso_context {
    unsigned nr_vertex_samplers_saved;
    void *vertex_samplers_saved[PIPE_MAX_VERTEX_SAMPLERS];
 
-   struct pipe_texture *textures[PIPE_MAX_SAMPLERS];
-   uint nr_textures;
+   uint nr_fragment_sampler_views;
+   struct pipe_sampler_view *fragment_sampler_views[PIPE_MAX_SAMPLERS];
 
-   struct pipe_texture *vertex_textures[PIPE_MAX_VERTEX_SAMPLERS];
-   uint nr_vertex_textures;
+   uint nr_vertex_sampler_views;
+   struct pipe_sampler_view *vertex_sampler_views[PIPE_MAX_VERTEX_SAMPLERS];
 
-   uint nr_textures_saved;
-   struct pipe_texture *textures_saved[PIPE_MAX_SAMPLERS];
+   uint nr_fragment_sampler_views_saved;
+   struct pipe_sampler_view *fragment_sampler_views_saved[PIPE_MAX_SAMPLERS];
 
-   uint nr_vertex_textures_saved;
-   struct pipe_texture *vertex_textures_saved[PIPE_MAX_SAMPLERS];
+   uint nr_vertex_sampler_views_saved;
+   struct pipe_sampler_view *vertex_sampler_views_saved[PIPE_MAX_VERTEX_SAMPLERS];
 
    /** Current and saved state.
     * The saved state is used as a 1-deep stack.
@@ -89,6 +90,7 @@ struct cso_context {
    void *rasterizer, *rasterizer_saved;
    void *fragment_shader, *fragment_shader_saved, *geometry_shader;
    void *vertex_shader, *vertex_shader_saved, *geometry_shader_saved;
+   void *velements, *velements_saved;
 
    struct pipe_clip_state clip;
    struct pipe_clip_state clip_saved;
@@ -174,6 +176,20 @@ static boolean delete_vs_state(struct cso_context *ctx, void *state)
    return FALSE;
 }
 
+static boolean delete_vertex_elements(struct cso_context *ctx,
+                                      void *state)
+{
+   struct cso_velements *cso = (struct cso_velements *)state;
+
+   if (ctx->velements == cso->data)
+      return FALSE;
+
+   if (cso->delete_state)
+      cso->delete_state(cso->context, cso->data);
+   FREE(state);
+   return TRUE;
+}
+
 
 static INLINE boolean delete_cso(struct cso_context *ctx,
                                  void *state, enum cso_cache_type type)
@@ -197,6 +213,9 @@ static INLINE boolean delete_cso(struct cso_context *ctx,
    case CSO_VERTEX_SHADER:
       return delete_vs_state(ctx, state);
       break;
+   case CSO_VELEMENTS:
+      return delete_vertex_elements(ctx, state);
+      break;
    default:
       assert(0);
       FREE(state);
@@ -271,16 +290,17 @@ void cso_release_all( struct cso_context *ctx )
       ctx->pipe->bind_depth_stencil_alpha_state( ctx->pipe, NULL );
       ctx->pipe->bind_fs_state( ctx->pipe, NULL );
       ctx->pipe->bind_vs_state( ctx->pipe, NULL );
+      ctx->pipe->bind_vertex_elements_state( ctx->pipe, NULL );
    }
 
    for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
-      pipe_texture_reference(&ctx->textures[i], NULL);
-      pipe_texture_reference(&ctx->textures_saved[i], NULL);
+      pipe_sampler_view_reference(&ctx->fragment_sampler_views[i], NULL);
+      pipe_sampler_view_reference(&ctx->fragment_sampler_views_saved[i], NULL);
    }
 
    for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
-      pipe_texture_reference(&ctx->vertex_textures[i], NULL);
-      pipe_texture_reference(&ctx->vertex_textures_saved[i], NULL);
+      pipe_sampler_view_reference(&ctx->vertex_sampler_views[i], NULL);
+      pipe_sampler_view_reference(&ctx->vertex_sampler_views_saved[i], NULL);
    }
 
    free_framebuffer_state(&ctx->fb);
@@ -597,114 +617,6 @@ cso_restore_vertex_samplers(struct cso_context *ctx)
 }
 
 
-enum pipe_error cso_set_sampler_textures( struct cso_context *ctx,
-                                          uint count,
-                                          struct pipe_texture **textures )
-{
-   uint i;
-
-   ctx->nr_textures = count;
-
-   for (i = 0; i < count; i++)
-      pipe_texture_reference(&ctx->textures[i], textures[i]);
-   for ( ; i < PIPE_MAX_SAMPLERS; i++)
-      pipe_texture_reference(&ctx->textures[i], NULL);
-
-   ctx->pipe->set_fragment_sampler_textures(ctx->pipe, count, textures);
-
-   return PIPE_OK;
-}
-
-void cso_save_sampler_textures( struct cso_context *ctx )
-{
-   uint i;
-
-   ctx->nr_textures_saved = ctx->nr_textures;
-   for (i = 0; i < ctx->nr_textures; i++) {
-      assert(!ctx->textures_saved[i]);
-      pipe_texture_reference(&ctx->textures_saved[i], ctx->textures[i]);
-   }
-}
-
-void cso_restore_sampler_textures( struct cso_context *ctx )
-{
-   uint i;
-
-   ctx->nr_textures = ctx->nr_textures_saved;
-
-   for (i = 0; i < ctx->nr_textures; i++) {
-      pipe_texture_reference(&ctx->textures[i], NULL);
-      ctx->textures[i] = ctx->textures_saved[i];
-      ctx->textures_saved[i] = NULL;
-   }
-   for ( ; i < PIPE_MAX_SAMPLERS; i++)
-      pipe_texture_reference(&ctx->textures[i], NULL);
-
-   ctx->pipe->set_fragment_sampler_textures(ctx->pipe, ctx->nr_textures, ctx->textures);
-
-   ctx->nr_textures_saved = 0;
-}
-
-
-
-enum pipe_error
-cso_set_vertex_sampler_textures(struct cso_context *ctx,
-                                uint count,
-                                struct pipe_texture **textures)
-{
-   uint i;
-
-   ctx->nr_vertex_textures = count;
-
-   for (i = 0; i < count; i++) {
-      pipe_texture_reference(&ctx->vertex_textures[i], textures[i]);
-   }
-   for ( ; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
-      pipe_texture_reference(&ctx->vertex_textures[i], NULL);
-   }
-
-   ctx->pipe->set_vertex_sampler_textures(ctx->pipe, count, textures);
-
-   return PIPE_OK;
-}
-
-void
-cso_save_vertex_sampler_textures(struct cso_context *ctx)
-{
-   uint i;
-
-   ctx->nr_vertex_textures_saved = ctx->nr_vertex_textures;
-   for (i = 0; i < ctx->nr_vertex_textures; i++) {
-      assert(!ctx->vertex_textures_saved[i]);
-      pipe_texture_reference(&ctx->vertex_textures_saved[i], ctx->vertex_textures[i]);
-   }
-}
-
-void
-cso_restore_vertex_sampler_textures(struct cso_context *ctx)
-{
-   uint i;
-
-   ctx->nr_vertex_textures = ctx->nr_vertex_textures_saved;
-
-   for (i = 0; i < ctx->nr_vertex_textures; i++) {
-      pipe_texture_reference(&ctx->vertex_textures[i], NULL);
-      ctx->vertex_textures[i] = ctx->vertex_textures_saved[i];
-      ctx->vertex_textures_saved[i] = NULL;
-   }
-   for ( ; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
-      pipe_texture_reference(&ctx->vertex_textures[i], NULL);
-   }
-
-   ctx->pipe->set_vertex_sampler_textures(ctx->pipe,
-                                          ctx->nr_vertex_textures,
-                                          ctx->vertex_textures);
-
-   ctx->nr_vertex_textures_saved = 0;
-}
-
-
-
 enum pipe_error cso_set_depth_stencil_alpha(struct cso_context *ctx,
                                             const struct pipe_depth_stencil_alpha_state *templ)
 {
@@ -1130,7 +1042,6 @@ void cso_restore_geometry_shader(struct cso_context *ctx)
    ctx->geometry_shader_saved = NULL;
 }
 
-
 /* clip state */
 
 static INLINE void
@@ -1180,3 +1091,185 @@ cso_restore_clip(struct cso_context *ctx)
       ctx->pipe->set_clip_state(ctx->pipe, &ctx->clip_saved);
    }
 }
+
+enum pipe_error cso_set_vertex_elements(struct cso_context *ctx,
+                                        unsigned count,
+                                        const struct pipe_vertex_element *states)
+{
+   unsigned key_size, hash_key;
+   struct cso_hash_iter iter;
+   void *handle;
+   struct cso_velems_state velems_state;
+
+   /* need to include the count into the stored state data too.
+      Otherwise first few count pipe_vertex_elements could be identical even if count
+      is different, and there's no guarantee the hash would be different in that
+      case neither */
+   key_size = sizeof(struct pipe_vertex_element) * count + sizeof(unsigned);
+   velems_state.count = count;
+   memcpy(velems_state.velems, states, sizeof(struct pipe_vertex_element) * count);
+   hash_key = cso_construct_key((void*)&velems_state, key_size);
+   iter = cso_find_state_template(ctx->cache, hash_key, CSO_VELEMENTS, (void*)&velems_state, key_size);
+
+   if (cso_hash_iter_is_null(iter)) {
+      struct cso_velements *cso = MALLOC(sizeof(struct cso_velements));
+      if (!cso)
+         return PIPE_ERROR_OUT_OF_MEMORY;
+
+      memcpy(&cso->state, &velems_state, key_size);
+      cso->data = ctx->pipe->create_vertex_elements_state(ctx->pipe, count, &cso->state.velems[0]);
+      cso->delete_state = (cso_state_callback)ctx->pipe->delete_vertex_elements_state;
+      cso->context = ctx->pipe;
+
+      iter = cso_insert_state(ctx->cache, hash_key, CSO_VELEMENTS, cso);
+      if (cso_hash_iter_is_null(iter)) {
+         FREE(cso);
+         return PIPE_ERROR_OUT_OF_MEMORY;
+      }
+
+      handle = cso->data;
+   }
+   else {
+      handle = ((struct cso_velements *)cso_hash_iter_data(iter))->data;
+   }
+
+   if (ctx->velements != handle) {
+      ctx->velements = handle;
+      ctx->pipe->bind_vertex_elements_state(ctx->pipe, handle);
+   }
+   return PIPE_OK;
+}
+
+void cso_save_vertex_elements(struct cso_context *ctx)
+{
+   assert(!ctx->velements_saved);
+   ctx->velements_saved = ctx->velements;
+}
+
+void cso_restore_vertex_elements(struct cso_context *ctx)
+{
+   if (ctx->velements != ctx->velements_saved) {
+      ctx->velements = ctx->velements_saved;
+      ctx->pipe->bind_vertex_elements_state(ctx->pipe, ctx->velements_saved);
+   }
+   ctx->velements_saved = NULL;
+}
+
+/* fragment sampler view state */
+
+void
+cso_set_fragment_sampler_views(struct cso_context *cso,
+                               uint count,
+                               struct pipe_sampler_view **views)
+{
+   uint i;
+
+   for (i = 0; i < count; i++) {
+      pipe_sampler_view_reference(&cso->fragment_sampler_views[i], views[i]);
+   }
+   for (; i < cso->nr_fragment_sampler_views; i++) {
+      pipe_sampler_view_reference(&cso->fragment_sampler_views[i], NULL);
+   }
+
+   cso->pipe->set_fragment_sampler_views(cso->pipe,
+                                         MAX2(count, cso->nr_fragment_sampler_views),
+                                         cso->fragment_sampler_views);
+
+   cso->nr_fragment_sampler_views = count;
+}
+
+void
+cso_save_fragment_sampler_views(struct cso_context *cso)
+{
+   uint i;
+
+   cso->nr_fragment_sampler_views_saved = cso->nr_fragment_sampler_views;
+
+   for (i = 0; i < cso->nr_fragment_sampler_views; i++) {
+      assert(!cso->fragment_sampler_views_saved[i]);
+
+      pipe_sampler_view_reference(&cso->fragment_sampler_views_saved[i],
+                                  cso->fragment_sampler_views[i]);
+   }
+}
+
+void
+cso_restore_fragment_sampler_views(struct cso_context *cso)
+{
+   uint i;
+
+   for (i = 0; i < cso->nr_fragment_sampler_views_saved; i++) {
+      pipe_sampler_view_reference(&cso->fragment_sampler_views[i], cso->fragment_sampler_views_saved[i]);
+      pipe_sampler_view_reference(&cso->fragment_sampler_views_saved[i], NULL);
+   }
+   for (; i < cso->nr_fragment_sampler_views; i++) {
+      pipe_sampler_view_reference(&cso->fragment_sampler_views[i], NULL);
+   }
+
+   cso->pipe->set_fragment_sampler_views(cso->pipe,
+                                         MAX2(cso->nr_fragment_sampler_views, cso->nr_fragment_sampler_views_saved),
+                                         cso->fragment_sampler_views);
+
+   cso->nr_fragment_sampler_views = cso->nr_fragment_sampler_views_saved;
+   cso->nr_fragment_sampler_views_saved = 0;
+}
+
+
+/* vertex sampler view state */
+
+void
+cso_set_vertex_sampler_views(struct cso_context *cso,
+                             uint count,
+                             struct pipe_sampler_view **views)
+{
+   uint i;
+
+   for (i = 0; i < count; i++) {
+      pipe_sampler_view_reference(&cso->vertex_sampler_views[i], views[i]);
+   }
+   for (; i < cso->nr_vertex_sampler_views; i++) {
+      pipe_sampler_view_reference(&cso->vertex_sampler_views[i], NULL);
+   }
+
+   cso->pipe->set_vertex_sampler_views(cso->pipe,
+                                       MAX2(count, cso->nr_vertex_sampler_views),
+                                       cso->vertex_sampler_views);
+
+   cso->nr_vertex_sampler_views = count;
+}
+
+void
+cso_save_vertex_sampler_views(struct cso_context *cso)
+{
+   uint i;
+
+   cso->nr_vertex_sampler_views_saved = cso->nr_vertex_sampler_views;
+
+   for (i = 0; i < cso->nr_vertex_sampler_views; i++) {
+      assert(!cso->vertex_sampler_views_saved[i]);
+
+      pipe_sampler_view_reference(&cso->vertex_sampler_views_saved[i],
+                                  cso->vertex_sampler_views[i]);
+   }
+}
+
+void
+cso_restore_vertex_sampler_views(struct cso_context *cso)
+{
+   uint i;
+
+   for (i = 0; i < cso->nr_vertex_sampler_views_saved; i++) {
+      pipe_sampler_view_reference(&cso->vertex_sampler_views[i], cso->vertex_sampler_views_saved[i]);
+      pipe_sampler_view_reference(&cso->vertex_sampler_views_saved[i], NULL);
+   }
+   for (; i < cso->nr_vertex_sampler_views; i++) {
+      pipe_sampler_view_reference(&cso->vertex_sampler_views[i], NULL);
+   }
+
+   cso->pipe->set_vertex_sampler_views(cso->pipe,
+                                       MAX2(cso->nr_vertex_sampler_views, cso->nr_vertex_sampler_views_saved),
+                                       cso->vertex_sampler_views);
+
+   cso->nr_vertex_sampler_views = cso->nr_vertex_sampler_views_saved;
+   cso->nr_vertex_sampler_views_saved = 0;
+}
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.h b/src/gallium/auxiliary/cso_cache/cso_context.h
index 251a9a644f..d6bcb1fe8f 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.h
+++ b/src/gallium/auxiliary/cso_cache/cso_context.h
@@ -103,24 +103,11 @@ void
 cso_single_vertex_sampler_done(struct cso_context *cso);
 
 
-
-enum pipe_error cso_set_sampler_textures( struct cso_context *cso,
-                                          uint count,
-                                          struct pipe_texture **textures );
-void cso_save_sampler_textures( struct cso_context *cso );
-void cso_restore_sampler_textures( struct cso_context *cso );
-
-
-
-enum pipe_error
-cso_set_vertex_sampler_textures(struct cso_context *cso,
-                                uint count,
-                                struct pipe_texture **textures);
-void
-cso_save_vertex_sampler_textures(struct cso_context *cso);
-void
-cso_restore_vertex_sampler_textures(struct cso_context *cso);
-
+enum pipe_error cso_set_vertex_elements(struct cso_context *ctx,
+                                        unsigned count,
+                                        const struct pipe_vertex_element *states);
+void cso_save_vertex_elements(struct cso_context *ctx);
+void cso_restore_vertex_elements(struct cso_context *ctx);
 
 
 /* These aren't really sensible -- most of the time the api provides
@@ -157,7 +144,6 @@ void cso_save_geometry_shader(struct cso_context *cso);
 void cso_restore_geometry_shader(struct cso_context *cso);
 
 
-
 enum pipe_error cso_set_framebuffer(struct cso_context *cso,
                                     const struct pipe_framebuffer_state *fb);
 void cso_save_framebuffer(struct cso_context *cso);
@@ -193,6 +179,34 @@ void
 cso_restore_clip(struct cso_context *cso);
 
 
+/* fragment sampler view state */
+
+void
+cso_set_fragment_sampler_views(struct cso_context *cso,
+                               uint count,
+                               struct pipe_sampler_view **views);
+
+void
+cso_save_fragment_sampler_views(struct cso_context *cso);
+
+void
+cso_restore_fragment_sampler_views(struct cso_context *cso);
+
+
+/* vertex sampler view state */
+
+void
+cso_set_vertex_sampler_views(struct cso_context *cso,
+                             uint count,
+                             struct pipe_sampler_view **views);
+
+void
+cso_save_vertex_sampler_views(struct cso_context *cso);
+
+void
+cso_restore_vertex_sampler_views(struct cso_context *cso);
+
+
 #ifdef	__cplusplus
 }
 #endif
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 18435af848..5726444c9b 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -45,6 +45,26 @@ struct draw_context *draw_create( struct pipe_context *pipe )
    if (draw == NULL)
       goto fail;
 
+   if (!draw_init(draw))
+      goto fail;
+
+   draw->pipe = pipe;
+
+   return draw;
+
+fail:
+   draw_destroy( draw );
+   return NULL;
+}
+
+boolean draw_init(struct draw_context *draw)
+{
+   /*
+    * Note that several functions compute the clipmask of the predefined
+    * formats with hardcoded formulas instead of using these. So modifications
+    * here must be reflected there too.
+    */
+
    ASSIGN_4V( draw->plane[0], -1,  0,  0, 1 );
    ASSIGN_4V( draw->plane[1],  1,  0,  0, 1 );
    ASSIGN_4V( draw->plane[2],  0, -1,  0, 1 );
@@ -58,24 +78,18 @@ struct draw_context *draw_create( struct pipe_context *pipe )
 
 
    if (!draw_pipeline_init( draw ))
-      goto fail;
+      return FALSE;
 
    if (!draw_pt_init( draw ))
-      goto fail;
+      return FALSE;
 
    if (!draw_vs_init( draw ))
-      goto fail;
+      return FALSE;
 
    if (!draw_gs_init( draw ))
-      goto fail;
+      return FALSE;
 
-   draw->pipe = pipe;
-
-   return draw;
-
-fail:
-   draw_destroy( draw );   
-   return NULL;
+   return TRUE;
 }
 
 
diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h
index 3b21a88170..0d328304eb 100644
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -40,7 +40,6 @@
 
 #include "pipe/p_state.h"
 
-
 struct pipe_context;
 struct draw_context;
 struct draw_stage;
@@ -198,6 +197,11 @@ boolean draw_need_pipeline(const struct draw_context *draw,
                            const struct pipe_rasterizer_state *rasterizer,
                            unsigned prim );
 
-
+#ifdef HAVE_LLVM
+/*******************************************************************************
+ * LLVM integration
+ */
+struct draw_context *draw_create_with_llvm(void);
+#endif
 
 #endif /* DRAW_CONTEXT_H */
diff --git a/src/gallium/auxiliary/draw/draw_gs.c b/src/gallium/auxiliary/draw/draw_gs.c
index 7069aa6b18..131deed43e 100644
--- a/src/gallium/auxiliary/draw/draw_gs.c
+++ b/src/gallium/auxiliary/draw/draw_gs.c
@@ -342,10 +342,10 @@ void draw_geometry_shader_delete(struct draw_geometry_shader *shader)
 void draw_geometry_shader_prepare(struct draw_geometry_shader *shader,
                                   struct draw_context *draw)
 {
-    if (shader->machine->Tokens != shader->state.tokens) {
-       tgsi_exec_machine_bind_shader(shader->machine,
-                                     shader->state.tokens,
-                                     draw->gs.num_samplers,
-                                     draw->gs.samplers);
-    }
+   if (shader && shader->machine->Tokens != shader->state.tokens) {
+      tgsi_exec_machine_bind_shader(shader->machine,
+                                    shader->state.tokens,
+                                    draw->gs.num_samplers,
+                                    draw->gs.samplers);
+   }
 }
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
new file mode 100644
index 0000000000..a5403a49c3
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -0,0 +1,728 @@
+#include "draw_llvm.h"
+
+#include "draw_context.h"
+#include "draw_vs.h"
+
+#include "gallivm/lp_bld_arit.h"
+#include "gallivm/lp_bld_struct.h"
+#include "gallivm/lp_bld_type.h"
+#include "gallivm/lp_bld_flow.h"
+#include "gallivm/lp_bld_debug.h"
+#include "gallivm/lp_bld_tgsi.h"
+#include "gallivm/lp_bld_printf.h"
+#include "gallivm/lp_bld_init.h"
+
+#include "tgsi/tgsi_exec.h"
+
+#include "util/u_cpu_detect.h"
+#include "util/u_string.h"
+
+#include <llvm-c/Transforms/Scalar.h>
+
+#define DEBUG_STORE 0
+
+
+/* generates the draw jit function */
+static void
+draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *var);
+
+static void
+init_globals(struct draw_llvm *llvm)
+{
+   LLVMTypeRef texture_type;
+
+   /* struct draw_jit_texture */
+   {
+      LLVMTypeRef elem_types[4];
+
+      elem_types[DRAW_JIT_TEXTURE_WIDTH]  = LLVMInt32Type();
+      elem_types[DRAW_JIT_TEXTURE_HEIGHT] = LLVMInt32Type();
+      elem_types[DRAW_JIT_TEXTURE_STRIDE] = LLVMInt32Type();
+      elem_types[DRAW_JIT_TEXTURE_DATA]   = LLVMPointerType(LLVMInt8Type(), 0);
+
+      texture_type = LLVMStructType(elem_types, Elements(elem_types), 0);
+
+      LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, width,
+                             llvm->target, texture_type,
+                             DRAW_JIT_TEXTURE_WIDTH);
+      LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, height,
+                             llvm->target, texture_type,
+                             DRAW_JIT_TEXTURE_HEIGHT);
+      LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, stride,
+                             llvm->target, texture_type,
+                             DRAW_JIT_TEXTURE_STRIDE);
+      LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, data,
+                             llvm->target, texture_type,
+                             DRAW_JIT_TEXTURE_DATA);
+      LP_CHECK_STRUCT_SIZE(struct draw_jit_texture,
+                           llvm->target, texture_type);
+
+      LLVMAddTypeName(llvm->module, "texture", texture_type);
+   }
+
+
+   /* struct draw_jit_context */
+   {
+      LLVMTypeRef elem_types[3];
+      LLVMTypeRef context_type;
+
+      elem_types[0] = LLVMPointerType(LLVMFloatType(), 0); /* vs_constants */
+      elem_types[1] = LLVMPointerType(LLVMFloatType(), 0); /* vs_constants */
+      elem_types[2] = LLVMArrayType(texture_type, PIPE_MAX_SAMPLERS); /* textures */
+
+      context_type = LLVMStructType(elem_types, Elements(elem_types), 0);
+
+      LP_CHECK_MEMBER_OFFSET(struct draw_jit_context, vs_constants,
+                             llvm->target, context_type, 0);
+      LP_CHECK_MEMBER_OFFSET(struct draw_jit_context, gs_constants,
+                             llvm->target, context_type, 1);
+      LP_CHECK_MEMBER_OFFSET(struct draw_jit_context, textures,
+                             llvm->target, context_type,
+                             DRAW_JIT_CONTEXT_TEXTURES_INDEX);
+      LP_CHECK_STRUCT_SIZE(struct draw_jit_context,
+                           llvm->target, context_type);
+
+      LLVMAddTypeName(llvm->module, "draw_jit_context", context_type);
+
+      llvm->context_ptr_type = LLVMPointerType(context_type, 0);
+   }
+   {
+      LLVMTypeRef buffer_ptr = LLVMPointerType(LLVMIntType(8), 0);
+      llvm->buffer_ptr_type = LLVMPointerType(buffer_ptr, 0);
+   }
+   /* struct pipe_vertex_buffer */
+   {
+      LLVMTypeRef elem_types[4];
+      LLVMTypeRef vb_type;
+
+      elem_types[0] = LLVMInt32Type();
+      elem_types[1] = LLVMInt32Type();
+      elem_types[2] = LLVMInt32Type();
+      elem_types[3] = LLVMPointerType(LLVMOpaqueType(), 0); /* vs_constants */
+
+      vb_type = LLVMStructType(elem_types, Elements(elem_types), 0);
+
+      LP_CHECK_MEMBER_OFFSET(struct pipe_vertex_buffer, stride,
+                             llvm->target, vb_type, 0);
+      LP_CHECK_MEMBER_OFFSET(struct pipe_vertex_buffer, buffer_offset,
+                             llvm->target, vb_type, 2);
+      LP_CHECK_STRUCT_SIZE(struct pipe_vertex_buffer,
+                           llvm->target, vb_type);
+
+      LLVMAddTypeName(llvm->module, "pipe_vertex_buffer", vb_type);
+
+      llvm->vb_ptr_type = LLVMPointerType(vb_type, 0);
+   }
+}
+
+static LLVMTypeRef
+create_vertex_header(struct draw_llvm *llvm, int data_elems)
+{
+   /* struct vertex_header */
+   LLVMTypeRef elem_types[3];
+   LLVMTypeRef vertex_header;
+   char struct_name[24];
+
+   util_snprintf(struct_name, 23, "vertex_header%d", data_elems);
+
+   elem_types[0]  = LLVMIntType(32);
+   elem_types[1]  = LLVMArrayType(LLVMFloatType(), 4);
+   elem_types[2]  = LLVMArrayType(elem_types[1], data_elems);
+
+   vertex_header = LLVMStructType(elem_types, Elements(elem_types), 0);
+
+   /* these are bit-fields and we can't take address of them
+      LP_CHECK_MEMBER_OFFSET(struct vertex_header, clipmask,
+      llvm->target, vertex_header,
+      DRAW_JIT_VERTEX_CLIPMASK);
+      LP_CHECK_MEMBER_OFFSET(struct vertex_header, edgeflag,
+      llvm->target, vertex_header,
+      DRAW_JIT_VERTEX_EDGEFLAG);
+      LP_CHECK_MEMBER_OFFSET(struct vertex_header, pad,
+      llvm->target, vertex_header,
+      DRAW_JIT_VERTEX_PAD);
+      LP_CHECK_MEMBER_OFFSET(struct vertex_header, vertex_id,
+      llvm->target, vertex_header,
+      DRAW_JIT_VERTEX_VERTEX_ID);
+   */
+   LP_CHECK_MEMBER_OFFSET(struct vertex_header, clip,
+                          llvm->target, vertex_header,
+                          DRAW_JIT_VERTEX_CLIP);
+   LP_CHECK_MEMBER_OFFSET(struct vertex_header, data,
+                          llvm->target, vertex_header,
+                          DRAW_JIT_VERTEX_DATA);
+
+   LLVMAddTypeName(llvm->module, struct_name, vertex_header);
+
+   return LLVMPointerType(vertex_header, 0);
+}
+
+struct draw_llvm *
+draw_llvm_create(struct draw_context *draw)
+{
+   struct draw_llvm *llvm = CALLOC_STRUCT( draw_llvm );
+
+   util_cpu_detect();
+
+   llvm->draw = draw;
+   llvm->engine = draw->engine;
+
+   debug_assert(llvm->engine);
+
+   llvm->module = LLVMModuleCreateWithName("draw_llvm");
+   llvm->provider = LLVMCreateModuleProviderForExistingModule(llvm->module);
+
+   LLVMAddModuleProvider(llvm->engine, llvm->provider);
+
+   llvm->target = LLVMGetExecutionEngineTargetData(llvm->engine);
+
+   llvm->pass = LLVMCreateFunctionPassManager(llvm->provider);
+   LLVMAddTargetData(llvm->target, llvm->pass);
+   /* These are the passes currently listed in llvm-c/Transforms/Scalar.h,
+    * but there are more on SVN. */
+   /* TODO: Add more passes */
+   LLVMAddConstantPropagationPass(llvm->pass);
+   if(util_cpu_caps.has_sse4_1) {
+      /* FIXME: There is a bug in this pass, whereby the combination of fptosi
+       * and sitofp (necessary for trunc/floor/ceil/round implementation)
+       * somehow becomes invalid code.
+       */
+      LLVMAddInstructionCombiningPass(llvm->pass);
+   }
+   LLVMAddPromoteMemoryToRegisterPass(llvm->pass);
+   LLVMAddGVNPass(llvm->pass);
+   LLVMAddCFGSimplificationPass(llvm->pass);
+
+   init_globals(llvm);
+
+
+#if 1
+   LLVMDumpModule(llvm->module);
+#endif
+
+   return llvm;
+}
+
+void
+draw_llvm_destroy(struct draw_llvm *llvm)
+{
+   free(llvm);
+}
+
+struct draw_llvm_variant *
+draw_llvm_prepare(struct draw_llvm *llvm, int num_inputs)
+{
+   struct draw_llvm_variant *variant = MALLOC(sizeof(struct draw_llvm_variant));
+
+   draw_llvm_make_variant_key(llvm, &variant->key);
+
+   llvm->vertex_header_ptr_type = create_vertex_header(llvm, num_inputs);
+
+   draw_llvm_generate(llvm, variant);
+
+   return variant;
+}
+
+
+struct draw_context *draw_create_with_llvm(void)
+{
+   struct draw_context *draw = CALLOC_STRUCT( draw_context );
+   if (draw == NULL)
+      goto fail;
+
+   assert(lp_build_engine);
+   draw->engine = lp_build_engine;
+
+   if (!draw_init(draw))
+      goto fail;
+
+   return draw;
+
+fail:
+   draw_destroy( draw );
+   return NULL;
+}
+
+static void
+generate_vs(struct draw_llvm *llvm,
+            LLVMBuilderRef builder,
+            LLVMValueRef (*outputs)[NUM_CHANNELS],
+            const LLVMValueRef (*inputs)[NUM_CHANNELS],
+            LLVMValueRef context_ptr)
+{
+   const struct tgsi_token *tokens = llvm->draw->vs.vertex_shader->state.tokens;
+   struct lp_type vs_type;
+   LLVMValueRef consts_ptr = draw_jit_context_vs_constants(builder, context_ptr);
+
+   memset(&vs_type, 0, sizeof vs_type);
+   vs_type.floating = TRUE; /* floating point values */
+   vs_type.sign = TRUE;     /* values are signed */
+   vs_type.norm = FALSE;    /* values are not limited to [0,1] or [-1,1] */
+   vs_type.width = 32;      /* 32-bit float */
+   vs_type.length = 4;      /* 4 elements per vector */
+#if 0
+   num_vs = 4;              /* number of vertices per block */
+#endif
+
+   /*tgsi_dump(tokens, 0);*/
+   lp_build_tgsi_soa(builder,
+                     tokens,
+                     vs_type,
+                     NULL /*struct lp_build_mask_context *mask*/,
+                     consts_ptr,
+                     NULL /*pos*/,
+                     inputs,
+                     outputs,
+                     NULL/*sampler*/);
+}
+
+#if DEBUG_STORE
+static void print_vectorf(LLVMBuilderRef builder,
+                         LLVMValueRef vec)
+{
+   LLVMValueRef val[4];
+   val[0] = LLVMBuildExtractElement(builder, vec,
+                                    LLVMConstInt(LLVMInt32Type(), 0, 0), "");
+   val[1] = LLVMBuildExtractElement(builder, vec,
+                                    LLVMConstInt(LLVMInt32Type(), 1, 0), "");
+   val[2] = LLVMBuildExtractElement(builder, vec,
+                                    LLVMConstInt(LLVMInt32Type(), 2, 0), "");
+   val[3] = LLVMBuildExtractElement(builder, vec,
+                                    LLVMConstInt(LLVMInt32Type(), 3, 0), "");
+   lp_build_printf(builder, "vector = [%f, %f, %f, %f]\n",
+                   val[0], val[1], val[2], val[3]);
+}
+#endif
+
+static void
+generate_fetch(LLVMBuilderRef builder,
+               LLVMValueRef vbuffers_ptr,
+               LLVMValueRef *res,
+               struct pipe_vertex_element *velem,
+               LLVMValueRef vbuf,
+               LLVMValueRef index)
+{
+   LLVMValueRef indices = LLVMConstInt(LLVMInt64Type(), velem->vertex_buffer_index, 0);
+   LLVMValueRef vbuffer_ptr = LLVMBuildGEP(builder, vbuffers_ptr,
+                                           &indices, 1, "");
+   LLVMValueRef vb_stride = draw_jit_vbuffer_stride(builder, vbuf);
+   LLVMValueRef vb_buffer_offset = draw_jit_vbuffer_offset(builder, vbuf);
+   LLVMValueRef stride = LLVMBuildMul(builder,
+                                      vb_stride,
+                                      index, "");
+
+   vbuffer_ptr = LLVMBuildLoad(builder, vbuffer_ptr, "vbuffer");
+
+   stride = LLVMBuildAdd(builder, stride,
+                         vb_buffer_offset,
+                         "");
+   stride = LLVMBuildAdd(builder, stride,
+                         LLVMConstInt(LLVMInt32Type(), velem->src_offset, 0),
+                         "");
+
+   /*lp_build_printf(builder, "vbuf index = %d, stride is %d\n", indices, stride);*/
+   vbuffer_ptr = LLVMBuildGEP(builder, vbuffer_ptr, &stride, 1, "");
+
+   *res = draw_llvm_translate_from(builder, vbuffer_ptr, velem->src_format);
+}
+
+static LLVMValueRef
+aos_to_soa(LLVMBuilderRef builder,
+           LLVMValueRef val0,
+           LLVMValueRef val1,
+           LLVMValueRef val2,
+           LLVMValueRef val3,
+           LLVMValueRef channel)
+{
+   LLVMValueRef ex, res;
+
+   ex = LLVMBuildExtractElement(builder, val0,
+                                channel, "");
+   res = LLVMBuildInsertElement(builder,
+                                LLVMConstNull(LLVMTypeOf(val0)),
+                                ex,
+                                LLVMConstInt(LLVMInt32Type(), 0, 0),
+                                "");
+
+   ex = LLVMBuildExtractElement(builder, val1,
+                                channel, "");
+   res = LLVMBuildInsertElement(builder,
+                                res, ex,
+                                LLVMConstInt(LLVMInt32Type(), 1, 0),
+                                "");
+
+   ex = LLVMBuildExtractElement(builder, val2,
+                                channel, "");
+   res = LLVMBuildInsertElement(builder,
+                                res, ex,
+                                LLVMConstInt(LLVMInt32Type(), 2, 0),
+                                "");
+
+   ex = LLVMBuildExtractElement(builder, val3,
+                                channel, "");
+   res = LLVMBuildInsertElement(builder,
+                                res, ex,
+                                LLVMConstInt(LLVMInt32Type(), 3, 0),
+                                "");
+
+   return res;
+}
+
+static void
+soa_to_aos(LLVMBuilderRef builder,
+           LLVMValueRef soa[NUM_CHANNELS],
+           LLVMValueRef aos[NUM_CHANNELS])
+{
+   LLVMValueRef comp;
+   int i = 0;
+
+   debug_assert(NUM_CHANNELS == 4);
+
+   aos[0] = LLVMConstNull(LLVMTypeOf(soa[0]));
+   aos[1] = aos[2] = aos[3] = aos[0];
+
+   for (i = 0; i < NUM_CHANNELS; ++i) {
+      LLVMValueRef channel = LLVMConstInt(LLVMInt32Type(), i, 0);
+
+      comp = LLVMBuildExtractElement(builder, soa[i],
+                                     LLVMConstInt(LLVMInt32Type(), 0, 0), "");
+      aos[0] = LLVMBuildInsertElement(builder, aos[0], comp, channel, "");
+
+      comp = LLVMBuildExtractElement(builder, soa[i],
+                                     LLVMConstInt(LLVMInt32Type(), 1, 0), "");
+      aos[1] = LLVMBuildInsertElement(builder, aos[1], comp, channel, "");
+
+      comp = LLVMBuildExtractElement(builder, soa[i],
+                                     LLVMConstInt(LLVMInt32Type(), 2, 0), "");
+      aos[2] = LLVMBuildInsertElement(builder, aos[2], comp, channel, "");
+
+      comp = LLVMBuildExtractElement(builder, soa[i],
+                                     LLVMConstInt(LLVMInt32Type(), 3, 0), "");
+      aos[3] = LLVMBuildInsertElement(builder, aos[3], comp, channel, "");
+
+   }
+}
+
+static void
+convert_to_soa(LLVMBuilderRef builder,
+               LLVMValueRef (*aos)[NUM_CHANNELS],
+               LLVMValueRef (*soa)[NUM_CHANNELS],
+               int num_attribs)
+{
+   int i;
+
+   debug_assert(NUM_CHANNELS == 4);
+
+   for (i = 0; i < num_attribs; ++i) {
+      LLVMValueRef val0 = aos[i][0];
+      LLVMValueRef val1 = aos[i][1];
+      LLVMValueRef val2 = aos[i][2];
+      LLVMValueRef val3 = aos[i][3];
+
+      soa[i][0] = aos_to_soa(builder, val0, val1, val2, val3,
+                             LLVMConstInt(LLVMInt32Type(), 0, 0));
+      soa[i][1] = aos_to_soa(builder, val0, val1, val2, val3,
+                             LLVMConstInt(LLVMInt32Type(), 1, 0));
+      soa[i][2] = aos_to_soa(builder, val0, val1, val2, val3,
+                             LLVMConstInt(LLVMInt32Type(), 2, 0));
+      soa[i][3] = aos_to_soa(builder, val0, val1, val2, val3,
+                             LLVMConstInt(LLVMInt32Type(), 3, 0));
+   }
+}
+
+static void
+store_aos(LLVMBuilderRef builder,
+          LLVMValueRef io_ptr,
+          LLVMValueRef index,
+          LLVMValueRef value)
+{
+   LLVMValueRef id_ptr = draw_jit_header_id(builder, io_ptr);
+   LLVMValueRef data_ptr = draw_jit_header_data(builder, io_ptr);
+   LLVMValueRef indices[3];
+
+   indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   indices[1] = index;
+   indices[2] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+
+   /* undefined vertex */
+   LLVMBuildStore(builder, LLVMConstInt(LLVMInt32Type(),
+                                        0xffff, 0), id_ptr);
+
+#if DEBUG_STORE
+   lp_build_printf(builder, "    ---- %p storing attribute %d (io = %p)\n", data_ptr, index, io_ptr);
+#endif
+#if 0
+   /*lp_build_printf(builder, " ---- %p storing at %d (%p)  ", io_ptr, index, data_ptr);
+     print_vectorf(builder, value);*/
+   data_ptr = LLVMBuildBitCast(builder, data_ptr,
+                               LLVMPointerType(LLVMArrayType(LLVMVectorType(LLVMFloatType(), 4), 0), 0),
+                               "datavec");
+   data_ptr = LLVMBuildGEP(builder, data_ptr, indices, 2, "");
+
+   LLVMBuildStore(builder, value, data_ptr);
+#else
+   {
+      LLVMValueRef x, y, z, w;
+      LLVMValueRef idx0, idx1, idx2, idx3;
+      LLVMValueRef gep0, gep1, gep2, gep3;
+      data_ptr = LLVMBuildGEP(builder, data_ptr, indices, 3, "");
+
+      idx0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+      idx1 = LLVMConstInt(LLVMInt32Type(), 1, 0);
+      idx2 = LLVMConstInt(LLVMInt32Type(), 2, 0);
+      idx3 = LLVMConstInt(LLVMInt32Type(), 3, 0);
+
+      x = LLVMBuildExtractElement(builder, value,
+                                  idx0, "");
+      y = LLVMBuildExtractElement(builder, value,
+                                  idx1, "");
+      z = LLVMBuildExtractElement(builder, value,
+                                  idx2, "");
+      w = LLVMBuildExtractElement(builder, value,
+                                  idx3, "");
+
+      gep0 = LLVMBuildGEP(builder, data_ptr, &idx0, 1, "");
+      gep1 = LLVMBuildGEP(builder, data_ptr, &idx1, 1, "");
+      gep2 = LLVMBuildGEP(builder, data_ptr, &idx2, 1, "");
+      gep3 = LLVMBuildGEP(builder, data_ptr, &idx3, 1, "");
+
+      /*lp_build_printf(builder, "##### x = %f (%p), y = %f (%p), z = %f (%p), w = %f (%p)\n",
+        x, gep0, y, gep1, z, gep2, w, gep3);*/
+      LLVMBuildStore(builder, x, gep0);
+      LLVMBuildStore(builder, y, gep1);
+      LLVMBuildStore(builder, z, gep2);
+      LLVMBuildStore(builder, w, gep3);
+   }
+#endif
+}
+
+static void
+store_aos_array(LLVMBuilderRef builder,
+                LLVMValueRef io_ptr,
+                LLVMValueRef aos[NUM_CHANNELS],
+                int attrib,
+                int num_outputs)
+{
+   LLVMValueRef attr_index = LLVMConstInt(LLVMInt32Type(), attrib, 0);
+   LLVMValueRef ind0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   LLVMValueRef ind1 = LLVMConstInt(LLVMInt32Type(), 1, 0);
+   LLVMValueRef ind2 = LLVMConstInt(LLVMInt32Type(), 2, 0);
+   LLVMValueRef ind3 = LLVMConstInt(LLVMInt32Type(), 3, 0);
+   LLVMValueRef io0_ptr, io1_ptr, io2_ptr, io3_ptr;
+
+   debug_assert(NUM_CHANNELS == 4);
+
+   io0_ptr = LLVMBuildGEP(builder, io_ptr,
+                          &ind0, 1, "");
+   io1_ptr = LLVMBuildGEP(builder, io_ptr,
+                          &ind1, 1, "");
+   io2_ptr = LLVMBuildGEP(builder, io_ptr,
+                          &ind2, 1, "");
+   io3_ptr = LLVMBuildGEP(builder, io_ptr,
+                          &ind3, 1, "");
+
+#if DEBUG_STORE
+   lp_build_printf(builder, "   io = %p, indexes[%d, %d, %d, %d]\n",
+                   io_ptr, ind0, ind1, ind2, ind3);
+#endif
+
+   store_aos(builder, io0_ptr, attr_index, aos[0]);
+   store_aos(builder, io1_ptr, attr_index, aos[1]);
+   store_aos(builder, io2_ptr, attr_index, aos[2]);
+   store_aos(builder, io3_ptr, attr_index, aos[3]);
+}
+
+static void
+convert_to_aos(LLVMBuilderRef builder,
+               LLVMValueRef io,
+               LLVMValueRef (*outputs)[NUM_CHANNELS],
+               int num_outputs,
+               int max_vertices)
+{
+   unsigned chan, attrib;
+
+#if DEBUG_STORE
+   lp_build_printf(builder, "   # storing begin\n");
+#endif
+   for (attrib = 0; attrib < num_outputs; ++attrib) {
+      LLVMValueRef soa[4];
+      LLVMValueRef aos[4];
+      for(chan = 0; chan < NUM_CHANNELS; ++chan) {
+         if(outputs[attrib][chan]) {
+            LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], "");
+            lp_build_name(out, "output%u.%c", attrib, "xyzw"[chan]);
+            /*lp_build_printf(builder, "output %d : %d ",
+                            LLVMConstInt(LLVMInt32Type(), attrib, 0),
+                            LLVMConstInt(LLVMInt32Type(), chan, 0));
+              print_vectorf(builder, out);*/
+            soa[chan] = out;
+         } else
+            soa[chan] = 0;
+      }
+      soa_to_aos(builder, soa, aos);
+      store_aos_array(builder,
+                      io,
+                      aos,
+                      attrib,
+                      num_outputs);
+   }
+#if DEBUG_STORE
+   lp_build_printf(builder, "   # storing end\n");
+#endif
+}
+
+static void
+draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
+{
+   LLVMTypeRef arg_types[7];
+   LLVMTypeRef func_type;
+   LLVMValueRef context_ptr;
+   LLVMBasicBlockRef block;
+   LLVMBuilderRef builder;
+   LLVMValueRef start, end, count, stride, step, io_itr;
+   LLVMValueRef io_ptr, vbuffers_ptr, vb_ptr;
+   struct draw_context *draw = llvm->draw;
+   unsigned i, j;
+   struct lp_build_context bld;
+   struct lp_build_loop_state lp_loop;
+   struct lp_type vs_type = lp_type_float_vec(32);
+   const int max_vertices = 4;
+   LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS];
+
+   arg_types[0] = llvm->context_ptr_type;           /* context */
+   arg_types[1] = llvm->vertex_header_ptr_type;     /* vertex_header */
+   arg_types[2] = llvm->buffer_ptr_type;            /* vbuffers */
+   arg_types[3] = LLVMInt32Type();                  /* start */
+   arg_types[4] = LLVMInt32Type();                  /* count */
+   arg_types[5] = LLVMInt32Type();                  /* stride */
+   arg_types[6] = llvm->vb_ptr_type;                /* pipe_vertex_buffer's */
+
+   func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0);
+
+   variant->function = LLVMAddFunction(llvm->module, "draw_llvm_shader", func_type);
+   LLVMSetFunctionCallConv(variant->function, LLVMCCallConv);
+   for(i = 0; i < Elements(arg_types); ++i)
+      if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind)
+         LLVMAddAttribute(LLVMGetParam(variant->function, i), LLVMNoAliasAttribute);
+
+   context_ptr  = LLVMGetParam(variant->function, 0);
+   io_ptr       = LLVMGetParam(variant->function, 1);
+   vbuffers_ptr = LLVMGetParam(variant->function, 2);
+   start        = LLVMGetParam(variant->function, 3);
+   count        = LLVMGetParam(variant->function, 4);
+   stride       = LLVMGetParam(variant->function, 5);
+   vb_ptr       = LLVMGetParam(variant->function, 6);
+
+   lp_build_name(context_ptr, "context");
+   lp_build_name(io_ptr, "io");
+   lp_build_name(vbuffers_ptr, "vbuffers");
+   lp_build_name(start, "start");
+   lp_build_name(count, "count");
+   lp_build_name(stride, "stride");
+   lp_build_name(vb_ptr, "vb");
+
+   /*
+    * Function body
+    */
+
+   block = LLVMAppendBasicBlock(variant->function, "entry");
+   builder = LLVMCreateBuilder();
+   LLVMPositionBuilderAtEnd(builder, block);
+
+   lp_build_context_init(&bld, builder, vs_type);
+
+   end = lp_build_add(&bld, start, count);
+
+   step = LLVMConstInt(LLVMInt32Type(), max_vertices, 0);
+
+#if DEBUG_STORE
+   lp_build_printf(builder, "start = %d, end = %d, step = %d\n",
+                   start, end, step);
+#endif
+   lp_build_loop_begin(builder, start, &lp_loop);
+   {
+      LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
+      LLVMValueRef aos_attribs[PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS] = { { 0 } };
+      LLVMValueRef io;
+      const LLVMValueRef (*ptr_aos)[NUM_CHANNELS];
+
+      io_itr = LLVMBuildSub(builder, lp_loop.counter, start, "");
+      io = LLVMBuildGEP(builder, io_ptr, &io_itr, 1, "");
+#if DEBUG_STORE
+      lp_build_printf(builder, " --- io %d = %p, loop counter %d\n",
+                      io_itr, io, lp_loop.counter);
+#endif
+      for (i = 0; i < NUM_CHANNELS; ++i) {
+         LLVMValueRef true_index = LLVMBuildAdd(
+            builder,
+            lp_loop.counter,
+            LLVMConstInt(LLVMInt32Type(), i, 0), "");
+         for (j = 0; j < draw->pt.nr_vertex_elements; ++j) {
+            struct pipe_vertex_element *velem = &draw->pt.vertex_element[j];
+            LLVMValueRef vb_index = LLVMConstInt(LLVMInt32Type(),
+                                                 velem->vertex_buffer_index,
+                                                 0);
+            LLVMValueRef vb = LLVMBuildGEP(builder, vb_ptr,
+                                           &vb_index, 1, "");
+            generate_fetch(builder, vbuffers_ptr,
+                           &aos_attribs[j][i], velem, vb, true_index);
+         }
+      }
+      convert_to_soa(builder, aos_attribs, inputs,
+                     draw->pt.nr_vertex_elements);
+
+      ptr_aos = (const LLVMValueRef (*)[NUM_CHANNELS]) inputs;
+      generate_vs(llvm,
+                  builder,
+                  outputs,
+                  ptr_aos,
+                  context_ptr);
+
+      convert_to_aos(builder, io, outputs,
+                     draw->vs.vertex_shader->info.num_outputs,
+                     max_vertices);
+   }
+   lp_build_loop_end_cond(builder, end, step, LLVMIntUGE, &lp_loop);
+
+   LLVMBuildRetVoid(builder);
+
+   LLVMDisposeBuilder(builder);
+
+   /*
+    * Translate the LLVM IR into machine code.
+    */
+#ifdef DEBUG
+   if(LLVMVerifyFunction(variant->function, LLVMPrintMessageAction)) {
+      LLVMDumpValue(variant->function);
+      assert(0);
+   }
+#endif
+
+   LLVMRunFunctionPassManager(llvm->pass, variant->function);
+
+   if (0) {
+      LLVMDumpValue(variant->function);
+      debug_printf("\n");
+   }
+   variant->jit_func = (draw_jit_vert_func)LLVMGetPointerToGlobal(llvm->draw->engine, variant->function);
+
+   if (0)
+      lp_disassemble(variant->jit_func);
+}
+
+void
+draw_llvm_make_variant_key(struct draw_llvm *llvm,
+                           struct draw_llvm_variant_key *key)
+{
+   memset(key, 0, sizeof(struct draw_llvm_variant_key));
+
+   key->nr_vertex_elements = llvm->draw->pt.nr_vertex_elements;
+
+   memcpy(key->vertex_element,
+          llvm->draw->pt.vertex_element,
+          sizeof(struct pipe_vertex_element) * key->nr_vertex_elements);
+
+   memcpy(&key->vs,
+          &llvm->draw->vs.vertex_shader->state,
+          sizeof(struct pipe_shader_state));
+}
diff --git a/src/gallium/auxiliary/draw/draw_llvm.h b/src/gallium/auxiliary/draw/draw_llvm.h
new file mode 100644
index 0000000000..28b9044a81
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_llvm.h
@@ -0,0 +1,144 @@
+#ifndef HAVE_LLVM_H
+#define HAVE_LLVM_H
+
+#include "draw/draw_private.h"
+
+#include "pipe/p_context.h"
+
+#include <llvm-c/Core.h>
+#include <llvm-c/Analysis.h>
+#include <llvm-c/Target.h>
+#include <llvm-c/ExecutionEngine.h>
+
+struct draw_jit_texture
+{
+   uint32_t width;
+   uint32_t height;
+   uint32_t stride;
+   const void *data;
+};
+
+enum {
+   DRAW_JIT_TEXTURE_WIDTH = 0,
+   DRAW_JIT_TEXTURE_HEIGHT,
+   DRAW_JIT_TEXTURE_STRIDE,
+   DRAW_JIT_TEXTURE_DATA
+};
+
+enum {
+   DRAW_JIT_VERTEX_VERTEX_ID = 0,
+   DRAW_JIT_VERTEX_CLIP,
+   DRAW_JIT_VERTEX_DATA
+};
+
+/**
+ * This structure is passed directly to the generated vertex shader.
+ *
+ * It contains the derived state.
+ *
+ * Changes here must be reflected in the draw_jit_context_* macros.
+ * Changes to the ordering should be avoided.
+ *
+ * Only use types with a clear size and padding here, in particular prefer the
+ * stdint.h types to the basic integer types.
+ */
+struct draw_jit_context
+{
+   const float *vs_constants;
+   const float *gs_constants;
+
+
+   struct draw_jit_texture textures[PIPE_MAX_SAMPLERS];
+};
+
+
+#define draw_jit_context_vs_constants(_builder, _ptr) \
+   lp_build_struct_get(_builder, _ptr, 0, "vs_constants")
+
+#define draw_jit_context_gs_constants(_builder, _ptr) \
+   lp_build_struct_get(_builder, _ptr, 1, "gs_constants")
+
+#define DRAW_JIT_CONTEXT_TEXTURES_INDEX 2
+
+#define draw_jit_context_textures(_builder, _ptr) \
+   lp_build_struct_get_ptr(_builder, _ptr, DRAW_JIT_CONTEXT_TEXTURES_INDEX, "textures")
+
+
+
+#define draw_jit_header_id(_builder, _ptr)              \
+   lp_build_struct_get_ptr(_builder, _ptr, 0, "id")
+
+#define draw_jit_header_clip(_builder, _ptr) \
+   lp_build_struct_get(_builder, _ptr, 1, "clip")
+
+#define draw_jit_header_data(_builder, _ptr)            \
+   lp_build_struct_get_ptr(_builder, _ptr, 2, "data")
+
+
+#define draw_jit_vbuffer_stride(_builder, _ptr)         \
+   lp_build_struct_get(_builder, _ptr, 0, "stride")
+
+#define draw_jit_vbuffer_offset(_builder, _ptr)                 \
+   lp_build_struct_get(_builder, _ptr, 2, "buffer_offset")
+
+
+typedef void
+(*draw_jit_vert_func)(struct draw_jit_context *context,
+                      struct vertex_header *io,
+                      const char *vbuffers[PIPE_MAX_ATTRIBS],
+                      unsigned start,
+                      unsigned count,
+                      unsigned stride,
+                      struct pipe_vertex_buffer *vertex_buffers);
+
+struct draw_llvm {
+   struct draw_context *draw;
+
+   struct draw_jit_context jit_context;
+
+   LLVMModuleRef module;
+   LLVMExecutionEngineRef engine;
+   LLVMModuleProviderRef provider;
+   LLVMTargetDataRef target;
+   LLVMPassManagerRef pass;
+
+   LLVMTypeRef context_ptr_type;
+   LLVMTypeRef vertex_header_ptr_type;
+   LLVMTypeRef buffer_ptr_type;
+   LLVMTypeRef vb_ptr_type;
+};
+
+struct draw_llvm_variant_key
+{
+   struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
+   unsigned                   nr_vertex_elements;
+   struct pipe_shader_state   vs;
+};
+
+struct draw_llvm_variant
+{
+   struct draw_llvm_variant_key key;
+   LLVMValueRef function;
+   draw_jit_vert_func jit_func;
+
+   struct draw_llvm_variant *next;
+};
+
+struct draw_llvm *
+draw_llvm_create(struct draw_context *draw);
+
+void
+draw_llvm_destroy(struct draw_llvm *llvm);
+
+struct draw_llvm_variant *
+draw_llvm_prepare(struct draw_llvm *llvm, int num_inputs);
+
+void
+draw_llvm_make_variant_key(struct draw_llvm *llvm,
+                           struct draw_llvm_variant_key *key);
+
+LLVMValueRef
+draw_llvm_translate_from(LLVMBuilderRef builder,
+                         LLVMValueRef vbuffer,
+                         enum pipe_format from_format);
+#endif
diff --git a/src/gallium/auxiliary/draw/draw_llvm_translate.c b/src/gallium/auxiliary/draw/draw_llvm_translate.c
new file mode 100644
index 0000000000..d8438b85a5
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_llvm_translate.c
@@ -0,0 +1,479 @@
+#include "draw_private.h"
+#include "draw_context.h"
+
+#include "draw_llvm.h"
+
+#include "gallivm/lp_bld_arit.h"
+#include "gallivm/lp_bld_struct.h"
+#include "gallivm/lp_bld_debug.h"
+
+#include "util/u_memory.h"
+#include "pipe/p_state.h"
+
+
+#define DRAW_DBG 0
+
+static  LLVMValueRef
+from_64_float(LLVMBuilderRef builder, LLVMValueRef val)
+{
+   LLVMValueRef bc = LLVMBuildBitCast(builder, val,
+                                      LLVMPointerType(LLVMDoubleType(), 0) , "");
+   LLVMValueRef l = LLVMBuildLoad(builder, bc, "");
+   return LLVMBuildFPTrunc(builder, l, LLVMFloatType(), "");
+}
+
+static LLVMValueRef
+from_32_float(LLVMBuilderRef builder, LLVMValueRef val)
+{
+   LLVMValueRef bc = LLVMBuildBitCast(builder, val,
+                                      LLVMPointerType(LLVMFloatType(), 0) , "");
+   return LLVMBuildLoad(builder, bc, "");
+}
+
+static INLINE LLVMValueRef
+from_8_uscaled(LLVMBuilderRef builder, LLVMValueRef val)
+{
+   LLVMValueRef l = LLVMBuildLoad(builder, val, "");
+   return LLVMBuildUIToFP(builder, l, LLVMFloatType(), "");
+}
+
+static INLINE LLVMValueRef
+from_16_uscaled(LLVMBuilderRef builder, LLVMValueRef val)
+{
+   LLVMValueRef bc = LLVMBuildBitCast(builder, val,
+                                      LLVMPointerType(LLVMIntType(16), 0) , "");
+   LLVMValueRef l = LLVMBuildLoad(builder, bc, "");
+   return LLVMBuildUIToFP(builder, l, LLVMFloatType(), "");
+}
+
+static INLINE LLVMValueRef
+from_32_uscaled(LLVMBuilderRef builder, LLVMValueRef val)
+{
+   LLVMValueRef bc = LLVMBuildBitCast(builder, val,
+                                      LLVMPointerType(LLVMIntType(32), 0) , "");
+   LLVMValueRef l = LLVMBuildLoad(builder, bc, "");
+   return LLVMBuildUIToFP(builder, l, LLVMFloatType(), "");
+}
+
+static INLINE LLVMValueRef
+from_8_sscaled(LLVMBuilderRef builder, LLVMValueRef val)
+{
+   LLVMValueRef l = LLVMBuildLoad(builder, val, "");
+   return LLVMBuildSIToFP(builder, l, LLVMFloatType(), "");
+}
+
+static INLINE LLVMValueRef
+from_16_sscaled(LLVMBuilderRef builder, LLVMValueRef val)
+{
+   LLVMValueRef bc = LLVMBuildBitCast(builder, val,
+                                      LLVMPointerType(LLVMIntType(16), 0) , "");
+   LLVMValueRef l = LLVMBuildLoad(builder, bc, "");
+   return LLVMBuildSIToFP(builder, l, LLVMFloatType(), "");
+}
+
+static INLINE LLVMValueRef
+from_32_sscaled(LLVMBuilderRef builder, LLVMValueRef val)
+{
+   LLVMValueRef bc = LLVMBuildBitCast(builder, val,
+                                      LLVMPointerType(LLVMIntType(32), 0) , "");
+   LLVMValueRef l = LLVMBuildLoad(builder, bc, "");
+   return LLVMBuildSIToFP(builder, l, LLVMFloatType(), "");
+}
+
+
+static INLINE LLVMValueRef
+from_8_unorm(LLVMBuilderRef builder, LLVMValueRef val)
+{
+   LLVMValueRef l = LLVMBuildLoad(builder, val, "");
+   LLVMValueRef uscaled = LLVMBuildUIToFP(builder, l, LLVMFloatType(), "");
+   return LLVMBuildFDiv(builder, uscaled,
+                        LLVMConstReal(LLVMFloatType(), 255.), "");
+}
+
+static INLINE LLVMValueRef
+from_16_unorm(LLVMBuilderRef builder, LLVMValueRef val)
+{
+   LLVMValueRef bc = LLVMBuildBitCast(builder, val,
+                                      LLVMPointerType(LLVMIntType(16), 0) , "");
+   LLVMValueRef l = LLVMBuildLoad(builder, bc, "");
+   LLVMValueRef uscaled = LLVMBuildUIToFP(builder, l, LLVMFloatType(), "");
+   return LLVMBuildFDiv(builder, uscaled,
+                        LLVMConstReal(LLVMFloatType(), 65535.), "");
+}
+
+static INLINE LLVMValueRef
+from_32_unorm(LLVMBuilderRef builder, LLVMValueRef val)
+{
+   LLVMValueRef bc = LLVMBuildBitCast(builder, val,
+                                      LLVMPointerType(LLVMIntType(32), 0) , "");
+   LLVMValueRef l = LLVMBuildLoad(builder, bc, "");
+   LLVMValueRef uscaled = LLVMBuildUIToFP(builder, l, LLVMFloatType(), "");
+
+   return LLVMBuildFDiv(builder, uscaled,
+                        LLVMConstReal(LLVMFloatType(), 4294967295.), "");
+}
+
+static INLINE LLVMValueRef
+from_8_snorm(LLVMBuilderRef builder, LLVMValueRef val)
+{
+   LLVMValueRef l = LLVMBuildLoad(builder, val, "");
+   LLVMValueRef uscaled = LLVMBuildSIToFP(builder, l, LLVMFloatType(), "");
+   return LLVMBuildFDiv(builder, uscaled,
+                        LLVMConstReal(LLVMFloatType(), 127.0), "");
+}
+
+static INLINE LLVMValueRef
+from_16_snorm(LLVMBuilderRef builder, LLVMValueRef val)
+{
+   LLVMValueRef bc = LLVMBuildBitCast(builder, val,
+                                      LLVMPointerType(LLVMIntType(16), 0) , "");
+   LLVMValueRef l = LLVMBuildLoad(builder, bc, "");
+   LLVMValueRef uscaled = LLVMBuildSIToFP(builder, l, LLVMFloatType(), "");
+   return LLVMBuildFDiv(builder, uscaled,
+                        LLVMConstReal(LLVMFloatType(), 32767.0f), "");
+}
+
+static INLINE LLVMValueRef
+from_32_snorm(LLVMBuilderRef builder, LLVMValueRef val)
+{
+   LLVMValueRef bc = LLVMBuildBitCast(builder, val,
+                                      LLVMPointerType(LLVMIntType(32), 0) , "");
+   LLVMValueRef l = LLVMBuildLoad(builder, bc, "");
+   LLVMValueRef uscaled = LLVMBuildSIToFP(builder, l, LLVMFloatType(), "");
+
+   return LLVMBuildFDiv(builder, uscaled,
+                        LLVMConstReal(LLVMFloatType(), 2147483647.0), "");
+}
+
+static INLINE LLVMValueRef
+from_32_fixed(LLVMBuilderRef builder, LLVMValueRef val)
+{
+   LLVMValueRef bc = LLVMBuildBitCast(builder, val,
+                                      LLVMPointerType(LLVMIntType(32), 0) , "");
+   LLVMValueRef l = LLVMBuildLoad(builder, bc, "");
+   LLVMValueRef uscaled = LLVMBuildSIToFP(builder, l, LLVMFloatType(), "");
+
+   return LLVMBuildFDiv(builder, uscaled,
+                        LLVMConstReal(LLVMFloatType(), 65536.0), "");
+}
+
+static LLVMValueRef
+to_64_float(LLVMBuilderRef builder, LLVMValueRef fp)
+{
+   LLVMValueRef l = LLVMBuildLoad(builder, fp, "");
+   return LLVMBuildFPExt(builder, l, LLVMDoubleType(), "");
+}
+
+static LLVMValueRef
+to_32_float(LLVMBuilderRef builder, LLVMValueRef fp)
+{
+   return LLVMBuildLoad(builder, fp, "");
+}
+
+static INLINE LLVMValueRef
+to_8_uscaled(LLVMBuilderRef builder, LLVMValueRef fp)
+{
+   LLVMValueRef l = LLVMBuildLoad(builder, fp, "");
+   return LLVMBuildFPToUI(builder, l, LLVMIntType(8), "");
+}
+
+static INLINE LLVMValueRef
+to_16_uscaled(LLVMBuilderRef builder, LLVMValueRef fp)
+{
+   LLVMValueRef l = LLVMBuildLoad(builder, fp, "");
+   return LLVMBuildFPToUI(builder, l, LLVMIntType(16), "");
+}
+
+static INLINE LLVMValueRef
+to_32_uscaled(LLVMBuilderRef builder, LLVMValueRef fp)
+{
+   LLVMValueRef l = LLVMBuildLoad(builder, fp, "");
+   return LLVMBuildFPToUI(builder, l, LLVMIntType(32), "");
+}
+
+static INLINE LLVMValueRef
+to_8_sscaled(LLVMBuilderRef builder, LLVMValueRef fp)
+{
+   LLVMValueRef l = LLVMBuildLoad(builder, fp, "");
+   return LLVMBuildFPToSI(builder, l, LLVMIntType(8), "");
+}
+
+static INLINE LLVMValueRef
+to_16_sscaled(LLVMBuilderRef builder, LLVMValueRef fp)
+{
+   LLVMValueRef l = LLVMBuildLoad(builder, fp, "");
+   return LLVMBuildFPToSI(builder, l, LLVMIntType(16), "");
+}
+
+static INLINE LLVMValueRef
+to_32_sscaled(LLVMBuilderRef builder, LLVMValueRef fp)
+{
+   LLVMValueRef l = LLVMBuildLoad(builder, fp, "");
+   return LLVMBuildFPToSI(builder, l, LLVMIntType(32), "");
+}
+
+static INLINE LLVMValueRef
+to_8_unorm(LLVMBuilderRef builder, LLVMValueRef fp)
+{
+   LLVMValueRef l = LLVMBuildLoad(builder, fp, "");
+   LLVMValueRef uscaled = LLVMBuildFPToUI(builder, l, LLVMIntType(8), "");
+   return LLVMBuildFMul(builder, uscaled,
+                        LLVMConstReal(LLVMFloatType(), 255.), "");
+}
+
+static INLINE LLVMValueRef
+to_16_unorm(LLVMBuilderRef builder, LLVMValueRef fp)
+{
+   LLVMValueRef l = LLVMBuildLoad(builder, fp, "");
+   LLVMValueRef uscaled = LLVMBuildFPToUI(builder, l, LLVMIntType(32), "");
+   return LLVMBuildFMul(builder, uscaled,
+                        LLVMConstReal(LLVMFloatType(), 65535.), "");
+}
+
+static INLINE LLVMValueRef
+to_32_unorm(LLVMBuilderRef builder, LLVMValueRef fp)
+{
+   LLVMValueRef l = LLVMBuildLoad(builder, fp, "");
+   LLVMValueRef uscaled = LLVMBuildFPToUI(builder, l, LLVMIntType(32), "");
+
+   return LLVMBuildFMul(builder, uscaled,
+                        LLVMConstReal(LLVMFloatType(), 4294967295.), "");
+}
+
+static INLINE LLVMValueRef
+to_8_snorm(LLVMBuilderRef builder, LLVMValueRef val)
+{
+   LLVMValueRef l = LLVMBuildLoad(builder, val, "");
+   LLVMValueRef uscaled = LLVMBuildFPToSI(builder, l, LLVMIntType(8), "");
+   return LLVMBuildFMul(builder, uscaled,
+                        LLVMConstReal(LLVMFloatType(), 127.0), "");
+}
+
+static INLINE LLVMValueRef
+to_16_snorm(LLVMBuilderRef builder, LLVMValueRef fp)
+{
+   LLVMValueRef l = LLVMBuildLoad(builder, fp, "");
+   LLVMValueRef uscaled = LLVMBuildFPToSI(builder, l, LLVMIntType(16), "");
+   return LLVMBuildFMul(builder, uscaled,
+                        LLVMConstReal(LLVMFloatType(), 32767.0f), "");
+}
+
+static INLINE LLVMValueRef
+to_32_snorm(LLVMBuilderRef builder, LLVMValueRef fp)
+{
+   LLVMValueRef l = LLVMBuildLoad(builder, fp, "");
+   LLVMValueRef uscaled = LLVMBuildFPToSI(builder, l, LLVMIntType(32), "");
+
+   return LLVMBuildFMul(builder, uscaled,
+                        LLVMConstReal(LLVMFloatType(), 2147483647.0), "");
+}
+
+static INLINE LLVMValueRef
+to_32_fixed(LLVMBuilderRef builder, LLVMValueRef fp)
+{
+   LLVMValueRef l = LLVMBuildLoad(builder, fp, "");
+   LLVMValueRef uscaled = LLVMBuildFPToSI(builder, l, LLVMIntType(32), "");
+
+   return LLVMBuildFMul(builder, uscaled,
+                        LLVMConstReal(LLVMFloatType(), 65536.0), "");
+}
+
+typedef LLVMValueRef (*from_func)(LLVMBuilderRef, LLVMValueRef);
+typedef  LLVMValueRef (*to_func)(LLVMBuilderRef, LLVMValueRef);
+
+/* so that underneath can avoid function calls which are prohibited
+ * for static initialization we need this conversion */
+enum ll_type {
+   LL_Double,
+   LL_Float,
+   LL_Int32,
+   LL_Int16,
+   LL_Int8
+};
+
+static INLINE LLVMTypeRef
+ll_type_to_llvm(enum ll_type type)
+{
+   switch (type) {
+   case LL_Double:
+      return LLVMDoubleType();
+   case LL_Float:
+      return LLVMFloatType();
+   case LL_Int32:
+      return LLVMInt32Type();
+   case LL_Int16:
+      return LLVMIntType(16);
+   case LL_Int8:
+      return LLVMIntType(8);
+   }
+   return LLVMIntType(8);
+}
+
+static INLINE int
+ll_type_size(enum ll_type type)
+{
+   switch (type) {
+   case LL_Double:
+      return 8;
+   case LL_Float:
+      return 4;
+   case LL_Int32:
+      return 4;
+   case LL_Int16:
+      return 2;
+   case LL_Int8:
+      return 1;
+   }
+   return 1;
+}
+
+struct draw_llvm_translate {
+   int format;
+   from_func from;
+   to_func to;
+   enum ll_type type;
+   int num_components;
+} translates[] =
+{
+   {PIPE_FORMAT_R64_FLOAT,          from_64_float, to_64_float, LL_Double, 1},
+   {PIPE_FORMAT_R64G64_FLOAT,       from_64_float, to_64_float, LL_Double, 2},
+   {PIPE_FORMAT_R64G64B64_FLOAT,    from_64_float, to_64_float, LL_Double, 3},
+   {PIPE_FORMAT_R64G64B64A64_FLOAT, from_64_float, to_64_float, LL_Double, 4},
+   {PIPE_FORMAT_R32_FLOAT,          from_32_float, to_32_float, LL_Float, 1},
+   {PIPE_FORMAT_R32G32_FLOAT,       from_32_float, to_32_float, LL_Float, 2},
+   {PIPE_FORMAT_R32G32B32_FLOAT,    from_32_float, to_32_float, LL_Float, 3},
+   {PIPE_FORMAT_R32G32B32A32_FLOAT, from_32_float, to_32_float, LL_Float, 4},
+
+   {PIPE_FORMAT_R32_UNORM,          from_32_unorm, to_32_unorm, LL_Int32, 1},
+   {PIPE_FORMAT_R32G32_UNORM,       from_32_unorm, to_32_unorm, LL_Int32, 2},
+   {PIPE_FORMAT_R32G32B32_UNORM,    from_32_unorm, to_32_unorm, LL_Int32, 3},
+   {PIPE_FORMAT_R32G32B32A32_UNORM, from_32_unorm, to_32_unorm, LL_Int32, 4},
+
+   {PIPE_FORMAT_R32_USCALED,          from_32_uscaled, to_32_uscaled, LL_Int32, 1},
+   {PIPE_FORMAT_R32G32_USCALED,       from_32_uscaled, to_32_uscaled, LL_Int32, 2},
+   {PIPE_FORMAT_R32G32B32_USCALED,    from_32_uscaled, to_32_uscaled, LL_Int32, 3},
+   {PIPE_FORMAT_R32G32B32A32_USCALED, from_32_uscaled, to_32_uscaled, LL_Int32, 4},
+
+   {PIPE_FORMAT_R32_SNORM,          from_32_snorm, to_32_snorm, LL_Int32, 1},
+   {PIPE_FORMAT_R32G32_SNORM,       from_32_snorm, to_32_snorm, LL_Int32, 2},
+   {PIPE_FORMAT_R32G32B32_SNORM,    from_32_snorm, to_32_snorm, LL_Int32, 3},
+   {PIPE_FORMAT_R32G32B32A32_SNORM, from_32_snorm, to_32_snorm, LL_Int32, 4},
+
+   {PIPE_FORMAT_R32_SSCALED,          from_32_sscaled, to_32_sscaled, LL_Int32, 1},
+   {PIPE_FORMAT_R32G32_SSCALED,       from_32_sscaled, to_32_sscaled, LL_Int32, 2},
+   {PIPE_FORMAT_R32G32B32_SSCALED,    from_32_sscaled, to_32_sscaled, LL_Int32, 3},
+   {PIPE_FORMAT_R32G32B32A32_SSCALED, from_32_sscaled, to_32_sscaled, LL_Int32, 4},
+
+   {PIPE_FORMAT_R16_UNORM,          from_16_unorm, to_16_unorm, LL_Int16, 1},
+   {PIPE_FORMAT_R16G16_UNORM,       from_16_unorm, to_16_unorm, LL_Int16, 2},
+   {PIPE_FORMAT_R16G16B16_UNORM,    from_16_unorm, to_16_unorm, LL_Int16, 3},
+   {PIPE_FORMAT_R16G16B16A16_UNORM, from_16_unorm, to_16_unorm, LL_Int16, 4},
+
+   {PIPE_FORMAT_R16_USCALED,          from_16_uscaled, to_16_uscaled, LL_Int16, 1},
+   {PIPE_FORMAT_R16G16_USCALED,       from_16_uscaled, to_16_uscaled, LL_Int16, 2},
+   {PIPE_FORMAT_R16G16B16_USCALED,    from_16_uscaled, to_16_uscaled, LL_Int16, 3},
+   {PIPE_FORMAT_R16G16B16A16_USCALED, from_16_uscaled, to_16_uscaled, LL_Int16, 4},
+
+   {PIPE_FORMAT_R16_SNORM,          from_16_snorm, to_16_snorm, LL_Int16, 1},
+   {PIPE_FORMAT_R16G16_SNORM,       from_16_snorm, to_16_snorm, LL_Int16, 2},
+   {PIPE_FORMAT_R16G16B16_SNORM,    from_16_snorm, to_16_snorm, LL_Int16, 3},
+   {PIPE_FORMAT_R16G16B16A16_SNORM, from_16_snorm, to_16_snorm, LL_Int16, 4},
+
+   {PIPE_FORMAT_R16_SSCALED,          from_16_sscaled, to_16_sscaled, LL_Int16, 1},
+   {PIPE_FORMAT_R16G16_SSCALED,       from_16_sscaled, to_16_sscaled, LL_Int16, 2},
+   {PIPE_FORMAT_R16G16B16_SSCALED,    from_16_sscaled, to_16_sscaled, LL_Int16, 3},
+   {PIPE_FORMAT_R16G16B16A16_SSCALED, from_16_sscaled, to_16_sscaled, LL_Int16, 4},
+
+   {PIPE_FORMAT_R8_UNORM,       from_8_unorm, to_8_unorm, LL_Int8, 1},
+   {PIPE_FORMAT_R8G8_UNORM,     from_8_unorm, to_8_unorm, LL_Int8, 2},
+   {PIPE_FORMAT_R8G8B8_UNORM,   from_8_unorm, to_8_unorm, LL_Int8, 3},
+   {PIPE_FORMAT_R8G8B8A8_UNORM, from_8_unorm, to_8_unorm, LL_Int8, 4},
+
+   {PIPE_FORMAT_R8_USCALED,       from_8_uscaled, to_8_uscaled, LL_Int8, 1},
+   {PIPE_FORMAT_R8G8_USCALED,     from_8_uscaled, to_8_uscaled, LL_Int8, 2},
+   {PIPE_FORMAT_R8G8B8_USCALED,   from_8_uscaled, to_8_uscaled, LL_Int8, 3},
+   {PIPE_FORMAT_R8G8B8A8_USCALED, from_8_uscaled, to_8_uscaled, LL_Int8, 4},
+
+   {PIPE_FORMAT_R8_SNORM,       from_8_snorm, to_8_snorm, LL_Int8, 1},
+   {PIPE_FORMAT_R8G8_SNORM,     from_8_snorm, to_8_snorm, LL_Int8, 2},
+   {PIPE_FORMAT_R8G8B8_SNORM,   from_8_snorm, to_8_snorm, LL_Int8, 3},
+   {PIPE_FORMAT_R8G8B8A8_SNORM, from_8_snorm, to_8_snorm, LL_Int8, 4},
+
+   {PIPE_FORMAT_R8_SSCALED,       from_8_sscaled, to_8_sscaled, LL_Int8, 1},
+   {PIPE_FORMAT_R8G8_SSCALED,     from_8_sscaled, to_8_sscaled, LL_Int8, 2},
+   {PIPE_FORMAT_R8G8B8_SSCALED,   from_8_sscaled, to_8_sscaled, LL_Int8, 3},
+   {PIPE_FORMAT_R8G8B8A8_SSCALED, from_8_sscaled, to_8_sscaled, LL_Int8, 4},
+
+   {PIPE_FORMAT_R32_FIXED,          from_32_fixed, to_32_fixed, LL_Int32, 1},
+   {PIPE_FORMAT_R32G32_FIXED,       from_32_fixed, to_32_fixed, LL_Int32, 2},
+   {PIPE_FORMAT_R32G32B32_FIXED,    from_32_fixed, to_32_fixed, LL_Int32, 3},
+   {PIPE_FORMAT_R32G32B32A32_FIXED, from_32_fixed, to_32_fixed, LL_Int32, 4},
+
+   {PIPE_FORMAT_A8R8G8B8_UNORM, from_8_unorm, to_8_unorm, LL_Int8, 4},
+   {PIPE_FORMAT_B8G8R8A8_UNORM, from_8_unorm, to_8_unorm, LL_Int8, 4},
+};
+
+
+static LLVMValueRef
+fetch(LLVMBuilderRef builder,
+      LLVMValueRef ptr, int val_size, int nr_components,
+      from_func func)
+{
+   int i;
+   int offset = 0;
+   LLVMValueRef res = LLVMConstNull(
+      LLVMVectorType(LLVMFloatType(), 4));
+   LLVMValueRef defaults[4];
+
+   defaults[0] = LLVMConstReal(LLVMFloatType(), 0);
+   defaults[1] = LLVMConstReal(LLVMFloatType(), 0);
+   defaults[2] = LLVMConstReal(LLVMFloatType(), 0);
+   defaults[3] = LLVMConstReal(LLVMFloatType(), 1);
+
+   for (i = 0; i < nr_components; ++i) {
+      LLVMValueRef src_index = LLVMConstInt(LLVMInt32Type(), offset, 0);
+      LLVMValueRef dst_index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      LLVMValueRef src_tmp;
+      LLVMValueRef component;
+
+      src_tmp = LLVMBuildGEP(builder, ptr, &src_index, 1, "src_tmp");
+
+      /* convert src_tmp to float */
+      component = func(builder, src_tmp);
+
+      /* vec.comp = component */
+      res = LLVMBuildInsertElement(builder,
+                                   res,
+                                   component,
+                                   dst_index, "");
+      offset += val_size;
+   }
+   for (; i < 4; ++i) {
+      LLVMValueRef dst_index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      res = LLVMBuildInsertElement(builder,
+                                   res,
+                                   defaults[i],
+                                   dst_index, "");
+   }
+   return res;
+}
+
+
+LLVMValueRef
+draw_llvm_translate_from(LLVMBuilderRef builder,
+                         LLVMValueRef vbuffer,
+                         enum pipe_format from_format)
+{
+   int i;
+   for (i = 0; i < Elements(translates); ++i) {
+      if (translates[i].format == from_format) {
+         /*LLVMTypeRef type = ll_type_to_llvm(translates[i].type);*/
+         return fetch(builder,
+                      vbuffer,
+                      ll_type_size(translates[i].type),
+                      translates[i].num_components,
+                      translates[i].from);
+      }
+   }
+   return LLVMGetUndef(LLVMVectorType(LLVMFloatType(), 4));
+}
diff --git a/src/gallium/auxiliary/draw/draw_pipe.c b/src/gallium/auxiliary/draw/draw_pipe.c
index 83dc1a35f4..b8f57dde6d 100644
--- a/src/gallium/auxiliary/draw/draw_pipe.c
+++ b/src/gallium/auxiliary/draw/draw_pipe.c
@@ -225,7 +225,7 @@ static void do_triangle( struct draw_context *draw,
 
 
 /**
- * Code to run the pipeline on a fairly arbitary collection of vertices.
+ * Code to run the pipeline on a fairly arbitrary collection of vertices.
  * For drawing indexed primitives.
  *
  * Vertex headers must be pre-initialized with the
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
index 72d7480efb..4faf0a779c 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
@@ -40,6 +40,7 @@
 #include "util/u_format.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "util/u_sampler.h"
 
 #include "tgsi/tgsi_transform.h"
 #include "tgsi/tgsi_dump.h"
@@ -87,9 +88,10 @@ struct aaline_stage
    uint pos_slot;
 
    void *sampler_cso;
-   struct pipe_texture *texture;
+   struct pipe_resource *texture;
+   struct pipe_sampler_view *sampler_view;
    uint num_samplers;
-   uint num_textures;
+   uint num_sampler_views;
 
 
    /*
@@ -98,7 +100,7 @@ struct aaline_stage
    struct aaline_fragment_shader *fs;
    struct {
       void *sampler[PIPE_MAX_SAMPLERS];
-      struct pipe_texture *texture[PIPE_MAX_SAMPLERS];
+      struct pipe_sampler_view *sampler_views[PIPE_MAX_SAMPLERS];
    } state;
 
    /*
@@ -111,8 +113,10 @@ struct aaline_stage
 
    void (*driver_bind_sampler_states)(struct pipe_context *, unsigned,
                                       void **);
-   void (*driver_set_sampler_textures)(struct pipe_context *, unsigned,
-                                       struct pipe_texture **);
+
+   void (*driver_set_sampler_views)(struct pipe_context *,
+                                    unsigned,
+                                    struct pipe_sampler_view **);
 };
 
 
@@ -392,7 +396,8 @@ aaline_create_texture(struct aaline_stage *aaline)
 {
    struct pipe_context *pipe = aaline->stage.draw->pipe;
    struct pipe_screen *screen = pipe->screen;
-   struct pipe_texture texTemp;
+   struct pipe_resource texTemp;
+   struct pipe_sampler_view viewTempl;
    uint level;
 
    memset(&texTemp, 0, sizeof(texTemp));
@@ -402,28 +407,46 @@ aaline_create_texture(struct aaline_stage *aaline)
    texTemp.width0 = 1 << MAX_TEXTURE_LEVEL;
    texTemp.height0 = 1 << MAX_TEXTURE_LEVEL;
    texTemp.depth0 = 1;
+   texTemp.bind = PIPE_BIND_SAMPLER_VIEW;
 
-   aaline->texture = screen->texture_create(screen, &texTemp);
+   aaline->texture = screen->resource_create(screen, &texTemp);
    if (!aaline->texture)
       return FALSE;
 
+   u_sampler_view_default_template(&viewTempl,
+                                   aaline->texture,
+                                   aaline->texture->format);
+   aaline->sampler_view = pipe->create_sampler_view(pipe,
+                                                    aaline->texture,
+                                                    &viewTempl);
+   if (!aaline->sampler_view) {
+      return FALSE;
+   }
+
    /* Fill in mipmap images.
     * Basically each level is solid opaque, except for the outermost
     * texels which are zero.  Special case the 1x1 and 2x2 levels.
     */
    for (level = 0; level <= MAX_TEXTURE_LEVEL; level++) {
       struct pipe_transfer *transfer;
+      struct pipe_box box;
       const uint size = u_minify(aaline->texture->width0, level);
       ubyte *data;
       uint i, j;
 
       assert(aaline->texture->width0 == aaline->texture->height0);
 
+      u_box_origin_2d( size, size, &box );
+
       /* This texture is new, no need to flush. 
        */
-      transfer = screen->get_tex_transfer(screen, aaline->texture, 0, level, 0,
-                                         PIPE_TRANSFER_WRITE, 0, 0, size, size);
-      data = screen->transfer_map(screen, transfer);
+      transfer = pipe->get_transfer(pipe,
+				    aaline->texture,
+				    u_subresource(0, level), 
+				    PIPE_TRANSFER_WRITE,
+				    &box);
+
+      data = pipe->transfer_map(pipe, transfer);
       if (data == NULL)
          return FALSE;
 
@@ -447,8 +470,8 @@ aaline_create_texture(struct aaline_stage *aaline)
       }
 
       /* unmap */
-      screen->transfer_unmap(screen, transfer);
-      screen->tex_transfer_destroy(transfer);
+      pipe->transfer_unmap(pipe, transfer);
+      pipe->transfer_destroy(pipe, transfer);
    }
    return TRUE;
 }
@@ -671,16 +694,16 @@ aaline_first_line(struct draw_stage *stage, struct prim_header *header)
 
    /* how many samplers? */
    /* we'll use sampler/texture[pstip->sampler_unit] for the stipple */
-   num_samplers = MAX2(aaline->num_textures, aaline->num_samplers);
+   num_samplers = MAX2(aaline->num_sampler_views, aaline->num_samplers);
    num_samplers = MAX2(num_samplers, aaline->fs->sampler_unit + 1);
 
    aaline->state.sampler[aaline->fs->sampler_unit] = aaline->sampler_cso;
-   pipe_texture_reference(&aaline->state.texture[aaline->fs->sampler_unit],
-                          aaline->texture);
+   pipe_sampler_view_reference(&aaline->state.sampler_views[aaline->fs->sampler_unit],
+                               aaline->sampler_view);
 
    draw->suspend_flushing = TRUE;
    aaline->driver_bind_sampler_states(pipe, num_samplers, aaline->state.sampler);
-   aaline->driver_set_sampler_textures(pipe, num_samplers, aaline->state.texture);
+   aaline->driver_set_sampler_views(pipe, num_samplers, aaline->state.sampler_views);
 
    /* Disable triangle culling, stippling, unfilled mode etc. */
    r = draw_get_rasterizer_no_cull(draw, rast->scissor, rast->flatshade);
@@ -709,8 +732,9 @@ aaline_flush(struct draw_stage *stage, unsigned flags)
    aaline->driver_bind_fs_state(pipe, aaline->fs->driver_fs);
    aaline->driver_bind_sampler_states(pipe, aaline->num_samplers,
                                       aaline->state.sampler);
-   aaline->driver_set_sampler_textures(pipe, aaline->num_textures,
-                                       aaline->state.texture);
+   aaline->driver_set_sampler_views(pipe,
+                                    aaline->num_sampler_views,
+                                    aaline->state.sampler_views);
 
    /* restore original rasterizer state */
    if (draw->rast_handle) {
@@ -738,14 +762,18 @@ aaline_destroy(struct draw_stage *stage)
    uint i;
 
    for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
-      pipe_texture_reference(&aaline->state.texture[i], NULL);
+      pipe_sampler_view_reference(&aaline->state.sampler_views[i], NULL);
    }
 
    if (aaline->sampler_cso)
       pipe->delete_sampler_state(pipe, aaline->sampler_cso);
 
    if (aaline->texture)
-      pipe_texture_reference(&aaline->texture, NULL);
+      pipe_resource_reference(&aaline->texture, NULL);
+
+   if (aaline->sampler_view) {
+      pipe_sampler_view_reference(&aaline->sampler_view, NULL);
+   }
 
    draw_free_temp_verts( stage );
 
@@ -859,23 +887,24 @@ aaline_bind_sampler_states(struct pipe_context *pipe,
 
 
 static void
-aaline_set_sampler_textures(struct pipe_context *pipe,
-                            unsigned num, struct pipe_texture **texture)
+aaline_set_sampler_views(struct pipe_context *pipe,
+                         unsigned num,
+                         struct pipe_sampler_view **views)
 {
    struct aaline_stage *aaline = aaline_stage_from_pipe(pipe);
    uint i;
 
    /* save current */
    for (i = 0; i < num; i++) {
-      pipe_texture_reference(&aaline->state.texture[i], texture[i]);
+      pipe_sampler_view_reference(&aaline->state.sampler_views[i], views[i]);
    }
    for ( ; i < PIPE_MAX_SAMPLERS; i++) {
-      pipe_texture_reference(&aaline->state.texture[i], NULL);
+      pipe_sampler_view_reference(&aaline->state.sampler_views[i], NULL);
    }
-   aaline->num_textures = num;
+   aaline->num_sampler_views = num;
 
    /* pass-through */
-   aaline->driver_set_sampler_textures(pipe, num, texture);
+   aaline->driver_set_sampler_views(pipe, num, views);
 }
 
 
@@ -911,7 +940,7 @@ draw_install_aaline_stage(struct draw_context *draw, struct pipe_context *pipe)
    aaline->driver_delete_fs_state = pipe->delete_fs_state;
 
    aaline->driver_bind_sampler_states = pipe->bind_fragment_sampler_states;
-   aaline->driver_set_sampler_textures = pipe->set_fragment_sampler_textures;
+   aaline->driver_set_sampler_views = pipe->set_fragment_sampler_views;
 
    /* override the driver's functions */
    pipe->create_fs_state = aaline_create_fs_state;
@@ -919,7 +948,7 @@ draw_install_aaline_stage(struct draw_context *draw, struct pipe_context *pipe)
    pipe->delete_fs_state = aaline_delete_fs_state;
 
    pipe->bind_fragment_sampler_states = aaline_bind_sampler_states;
-   pipe->set_fragment_sampler_textures = aaline_set_sampler_textures;
+   pipe->set_fragment_sampler_views = aaline_set_sampler_views;
    
    /* Install once everything is known to be OK:
     */
diff --git a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
index d0d99aa331..ef30db094f 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
@@ -42,6 +42,7 @@
 #include "util/u_format.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "util/u_sampler.h"
 
 #include "tgsi/tgsi_transform.h"
 #include "tgsi/tgsi_dump.h"
@@ -74,9 +75,10 @@ struct pstip_stage
    struct draw_stage stage;
 
    void *sampler_cso;
-   struct pipe_texture *texture;
+   struct pipe_resource *texture;
+   struct pipe_sampler_view *sampler_view;
    uint num_samplers;
-   uint num_textures;
+   uint num_sampler_views;
 
    /*
     * Currently bound state
@@ -84,7 +86,7 @@ struct pstip_stage
    struct pstip_fragment_shader *fs;
    struct {
       void *samplers[PIPE_MAX_SAMPLERS];
-      struct pipe_texture *textures[PIPE_MAX_SAMPLERS];
+      struct pipe_sampler_view *sampler_views[PIPE_MAX_SAMPLERS];
       const struct pipe_poly_stipple *stipple;
    } state;
 
@@ -98,8 +100,9 @@ struct pstip_stage
 
    void (*driver_bind_sampler_states)(struct pipe_context *, unsigned, void **);
 
-   void (*driver_set_sampler_textures)(struct pipe_context *, unsigned,
-                                       struct pipe_texture **);
+   void (*driver_set_sampler_views)(struct pipe_context *,
+                                    unsigned,
+                                    struct pipe_sampler_view **);
 
    void (*driver_set_polygon_stipple)(struct pipe_context *,
                                       const struct pipe_poly_stipple *);
@@ -374,19 +377,21 @@ pstip_update_texture(struct pstip_stage *pstip)
 {
    static const uint bit31 = 1 << 31;
    struct pipe_context *pipe = pstip->pipe;
-   struct pipe_screen *screen = pipe->screen;
    struct pipe_transfer *transfer;
    const uint *stipple = pstip->state.stipple->stipple;
    uint i, j;
    ubyte *data;
 
    /* XXX: want to avoid flushing just because we use stipple: 
+    *
+    * Flush should no longer be necessary if driver is properly
+    * interleaving drawing and transfers on a given context:
     */
    pipe->flush( pipe, PIPE_FLUSH_TEXTURE_CACHE, NULL );
 
-   transfer = screen->get_tex_transfer(screen, pstip->texture, 0, 0, 0,
-                                       PIPE_TRANSFER_WRITE, 0, 0, 32, 32);
-   data = screen->transfer_map(screen, transfer);
+   transfer = pipe_get_transfer(pipe, pstip->texture, 0, 0, 0,
+				    PIPE_TRANSFER_WRITE, 0, 0, 32, 32);
+   data = pipe->transfer_map(pipe, transfer);
 
    /*
     * Load alpha texture.
@@ -408,8 +413,8 @@ pstip_update_texture(struct pstip_stage *pstip)
    }
 
    /* unmap */
-   screen->transfer_unmap(screen, transfer);
-   screen->tex_transfer_destroy(transfer);
+   pipe->transfer_unmap(pipe, transfer);
+   pipe->transfer_destroy(pipe, transfer);
 }
 
 
@@ -421,7 +426,8 @@ pstip_create_texture(struct pstip_stage *pstip)
 {
    struct pipe_context *pipe = pstip->pipe;
    struct pipe_screen *screen = pipe->screen;
-   struct pipe_texture texTemp;
+   struct pipe_resource texTemp;
+   struct pipe_sampler_view viewTempl;
 
    memset(&texTemp, 0, sizeof(texTemp));
    texTemp.target = PIPE_TEXTURE_2D;
@@ -430,11 +436,22 @@ pstip_create_texture(struct pstip_stage *pstip)
    texTemp.width0 = 32;
    texTemp.height0 = 32;
    texTemp.depth0 = 1;
+   texTemp.bind = PIPE_BIND_SAMPLER_VIEW;
 
-   pstip->texture = screen->texture_create(screen, &texTemp);
+   pstip->texture = screen->resource_create(screen, &texTemp);
    if (pstip->texture == NULL)
       return FALSE;
 
+   u_sampler_view_default_template(&viewTempl,
+                                   pstip->texture,
+                                   pstip->texture->format);
+   pstip->sampler_view = pipe->create_sampler_view(pipe,
+                                                   pstip->texture,
+                                                   &viewTempl);
+   if (!pstip->sampler_view) {
+      return FALSE;
+   }
+
    return TRUE;
 }
 
@@ -513,19 +530,19 @@ pstip_first_tri(struct draw_stage *stage, struct prim_header *header)
 
    /* how many samplers? */
    /* we'll use sampler/texture[pstip->sampler_unit] for the stipple */
-   num_samplers = MAX2(pstip->num_textures, pstip->num_samplers);
+   num_samplers = MAX2(pstip->num_sampler_views, pstip->num_samplers);
    num_samplers = MAX2(num_samplers, pstip->fs->sampler_unit + 1);
 
    /* plug in our sampler, texture */
    pstip->state.samplers[pstip->fs->sampler_unit] = pstip->sampler_cso;
-   pipe_texture_reference(&pstip->state.textures[pstip->fs->sampler_unit],
-                          pstip->texture);
+   pipe_sampler_view_reference(&pstip->state.sampler_views[pstip->fs->sampler_unit],
+                               pstip->sampler_view);
 
    assert(num_samplers <= PIPE_MAX_SAMPLERS);
 
    draw->suspend_flushing = TRUE;
    pstip->driver_bind_sampler_states(pipe, num_samplers, pstip->state.samplers);
-   pstip->driver_set_sampler_textures(pipe, num_samplers, pstip->state.textures);
+   pstip->driver_set_sampler_views(pipe, num_samplers, pstip->state.sampler_views);
    draw->suspend_flushing = FALSE;
 
    /* now really draw first triangle */
@@ -549,8 +566,9 @@ pstip_flush(struct draw_stage *stage, unsigned flags)
    pstip->driver_bind_fs_state(pipe, pstip->fs->driver_fs);
    pstip->driver_bind_sampler_states(pipe, pstip->num_samplers,
                                      pstip->state.samplers);
-   pstip->driver_set_sampler_textures(pipe, pstip->num_textures,
-                                      pstip->state.textures);
+   pstip->driver_set_sampler_views(pipe,
+                                   pstip->num_sampler_views,
+                                   pstip->state.sampler_views);
    draw->suspend_flushing = FALSE;
 }
 
@@ -569,12 +587,16 @@ pstip_destroy(struct draw_stage *stage)
    uint i;
 
    for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
-      pipe_texture_reference(&pstip->state.textures[i], NULL);
+      pipe_sampler_view_reference(&pstip->state.sampler_views[i], NULL);
    }
 
    pstip->pipe->delete_sampler_state(pstip->pipe, pstip->sampler_cso);
 
-   pipe_texture_reference(&pstip->texture, NULL);
+   pipe_resource_reference(&pstip->texture, NULL);
+
+   if (pstip->sampler_view) {
+      pipe_sampler_view_reference(&pstip->sampler_view, NULL);
+   }
 
    draw_free_temp_verts( stage );
    FREE( stage );
@@ -680,24 +702,25 @@ pstip_bind_sampler_states(struct pipe_context *pipe,
 
 
 static void
-pstip_set_sampler_textures(struct pipe_context *pipe,
-                           unsigned num, struct pipe_texture **texture)
+pstip_set_sampler_views(struct pipe_context *pipe,
+                        unsigned num,
+                        struct pipe_sampler_view **views)
 {
    struct pstip_stage *pstip = pstip_stage_from_pipe(pipe);
    uint i;
 
    /* save current */
    for (i = 0; i < num; i++) {
-      pipe_texture_reference(&pstip->state.textures[i], texture[i]);
+      pipe_sampler_view_reference(&pstip->state.sampler_views[i], views[i]);
    }
    for (; i < PIPE_MAX_SAMPLERS; i++) {
-      pipe_texture_reference(&pstip->state.textures[i], NULL);
+      pipe_sampler_view_reference(&pstip->state.sampler_views[i], NULL);
    }
 
-   pstip->num_textures = num;
+   pstip->num_sampler_views = num;
 
    /* pass-through */
-   pstip->driver_set_sampler_textures(pstip->pipe, num, texture);
+   pstip->driver_set_sampler_views(pstip->pipe, num, views);
 }
 
 
@@ -754,7 +777,7 @@ draw_install_pstipple_stage(struct draw_context *draw,
    pstip->driver_delete_fs_state = pipe->delete_fs_state;
 
    pstip->driver_bind_sampler_states = pipe->bind_fragment_sampler_states;
-   pstip->driver_set_sampler_textures = pipe->set_fragment_sampler_textures;
+   pstip->driver_set_sampler_views = pipe->set_fragment_sampler_views;
    pstip->driver_set_polygon_stipple = pipe->set_polygon_stipple;
 
    /* override the driver's functions */
@@ -763,7 +786,7 @@ draw_install_pstipple_stage(struct draw_context *draw,
    pipe->delete_fs_state = pstip_delete_fs_state;
 
    pipe->bind_fragment_sampler_states = pstip_bind_sampler_states;
-   pipe->set_fragment_sampler_textures = pstip_set_sampler_textures;
+   pipe->set_fragment_sampler_views = pstip_set_sampler_views;
    pipe->set_polygon_stipple = pstip_set_polygon_stipple;
 
    return TRUE;
diff --git a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
index 2709957961..ee2b811603 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
@@ -235,41 +235,18 @@ vbuf_start_prim( struct vbuf_stage *vbuf, uint prim )
    for (i = 0; i < vbuf->vinfo->num_attribs; i++) {
       unsigned emit_sz = 0;
       unsigned src_buffer = 0;
-      unsigned output_format;
+      enum pipe_format output_format;
       unsigned src_offset = (vbuf->vinfo->attrib[i].src_index * 4 * sizeof(float) );
 
-      switch (vbuf->vinfo->attrib[i].emit) {
-      case EMIT_4F:
-	 output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
-	 emit_sz = 4 * sizeof(float);
-	 break;
-      case EMIT_3F:
-	 output_format = PIPE_FORMAT_R32G32B32_FLOAT;
-	 emit_sz = 3 * sizeof(float);
-	 break;
-      case EMIT_2F:
-	 output_format = PIPE_FORMAT_R32G32_FLOAT;
-	 emit_sz = 2 * sizeof(float);
-	 break;
-      case EMIT_1F:
-	 output_format = PIPE_FORMAT_R32_FLOAT;
-	 emit_sz = 1 * sizeof(float);
-	 break;
-      case EMIT_1F_PSIZE:
-	 output_format = PIPE_FORMAT_R32_FLOAT;
-	 emit_sz = 1 * sizeof(float);
+      output_format = draw_translate_vinfo_format(vbuf->vinfo->attrib[i].emit);
+      emit_sz = draw_translate_vinfo_size(vbuf->vinfo->attrib[i].emit);
+
+      /* doesn't handle EMIT_OMIT */
+      assert(emit_sz != 0);
+
+      if (vbuf->vinfo->attrib[i].emit == EMIT_1F_PSIZE) {
 	 src_buffer = 1;
 	 src_offset = 0;
-	 break;
-      case EMIT_4UB:
-	 output_format = PIPE_FORMAT_A8R8G8B8_UNORM;
-	 emit_sz = 4 * sizeof(ubyte);
-         break;
-      default:
-	 assert(0);
-	 output_format = PIPE_FORMAT_NONE;
-	 emit_sz = 0;
-	 break;
       }
 
       hw_key.element[i].type = TRANSLATE_ELEMENT_NORMAL;
diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
index d6d7513e2a..f6edb599a9 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
@@ -114,7 +114,10 @@ static void set_texcoords(const struct widepoint_stage *wide,
       /* put gl_PointCoord into the extra vertex slot */
       uint slot = wide->stage.draw->extra_shader_outputs.slot;
       v->data[slot][0] = tc[0];
-      v->data[slot][1] = tc[1];
+      if (wide->texcoord_mode == PIPE_SPRITE_COORD_LOWER_LEFT)
+         v->data[slot][1] = 1.0f - tc[1];
+      else
+         v->data[slot][1] = tc[1];
       v->data[slot][2] = 0.0F;
       v->data[slot][3] = 1.0F;
    }
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 3fe3dc519d..e250b3a1d5 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -46,6 +46,10 @@
 
 #include "tgsi/tgsi_scan.h"
 
+#ifdef HAVE_LLVM
+#include <llvm-c/ExecutionEngine.h>
+#endif
+
 
 struct pipe_context;
 struct draw_vertex_shader;
@@ -245,9 +249,16 @@ struct draw_context
 
    unsigned instance_id;
 
+#ifdef HAVE_LLVM
+   LLVMExecutionEngineRef engine;
+#endif
    void *driver_private;
 };
 
+/*******************************************************************************
+ * Draw common initialization code
+ */
+boolean draw_init(struct draw_context *draw);
 
 /*******************************************************************************
  * Vertex shader code:
diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c
index 6d90a6c42f..43f6c5650a 100644
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -140,7 +140,14 @@ boolean draw_pt_init( struct draw_context *draw )
    if (!draw->pt.middle.fetch_shade_emit)
       return FALSE;
 
-   draw->pt.middle.general = draw_pt_fetch_pipeline_or_emit( draw );
+#if HAVE_LLVM
+   draw->pt.middle.general = draw_pt_fetch_pipeline_or_emit_llvm( draw );
+#else
+   draw->pt.middle.general = NULL;
+#endif
+
+   if (!draw->pt.middle.general)
+      draw->pt.middle.general = draw_pt_fetch_pipeline_or_emit( draw );
    if (!draw->pt.middle.general)
       return FALSE;
 
@@ -307,9 +314,8 @@ draw_arrays_instanced(struct draw_context *draw,
       tgsi_dump(draw->vs.vertex_shader->state.tokens, 0);
       debug_printf("Elements:\n");
       for (i = 0; i < draw->pt.nr_vertex_elements; i++) {
-         debug_printf("  format=%s comps=%u\n",
-                      util_format_name(draw->pt.vertex_element[i].src_format),
-                      draw->pt.vertex_element[i].nr_components);
+         debug_printf("  format=%s\n",
+                      util_format_name(draw->pt.vertex_element[i].src_format));
       }
       debug_printf("Buffers:\n");
       for (i = 0; i < draw->pt.nr_vertex_buffers; i++) {
diff --git a/src/gallium/auxiliary/draw/draw_pt.h b/src/gallium/auxiliary/draw/draw_pt.h
index d5e0d92a60..c2797a759e 100644
--- a/src/gallium/auxiliary/draw/draw_pt.h
+++ b/src/gallium/auxiliary/draw/draw_pt.h
@@ -147,6 +147,7 @@ struct draw_pt_front_end *draw_pt_varray(struct draw_context *draw);
 struct draw_pt_middle_end *draw_pt_fetch_emit( struct draw_context *draw );
 struct draw_pt_middle_end *draw_pt_middle_fse( struct draw_context *draw );
 struct draw_pt_middle_end *draw_pt_fetch_pipeline_or_emit(struct draw_context *draw);
+struct draw_pt_middle_end *draw_pt_fetch_pipeline_or_emit_llvm(struct draw_context *draw);
 
 
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_decompose.h b/src/gallium/auxiliary/draw/draw_pt_decompose.h
index 4ca5b52020..3c44f7c11e 100644
--- a/src/gallium/auxiliary/draw/draw_pt_decompose.h
+++ b/src/gallium/auxiliary/draw/draw_pt_decompose.h
@@ -105,40 +105,20 @@ static void FUNC( ARGS,
 
 
    case PIPE_PRIM_QUADS:
-      if (flatfirst) {
-         for (i = 0; i+3 < count; i += 4) {
-            QUAD( (i + 1),
-                  (i + 2),
-                  (i + 3),
-                  (i + 0) );
-         }
-      }
-      else {
-         for (i = 0; i+3 < count; i += 4) {
-            QUAD( (i + 0),
-                  (i + 1),
-                  (i + 2),
-                  (i + 3));
-         }
+      for (i = 0; i+3 < count; i += 4) {
+         QUAD( (i + 0),
+               (i + 1),
+               (i + 2),
+               (i + 3));
       }
       break;
 
    case PIPE_PRIM_QUAD_STRIP:
-      if (flatfirst) {
-         for (i = 0; i+3 < count; i += 2) {
-            QUAD( (i + 1),
-                  (i + 3),
-                  (i + 2),
-                  (i + 0) );
-         }
-      }
-      else {
-         for (i = 0; i+3 < count; i += 2) {
-            QUAD( (i + 2),
-                  (i + 0),
-                  (i + 1),
-                  (i + 3));
-         }
+      for (i = 0; i+3 < count; i += 2) {
+         QUAD( (i + 2),
+               (i + 0),
+               (i + 1),
+               (i + 3));
       }
       break;
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c
index ae357b5122..a7917f54b0 100644
--- a/src/gallium/auxiliary/draw/draw_pt_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_emit.c
@@ -86,40 +86,15 @@ void draw_pt_emit_prepare( struct pt_emit *emit,
       unsigned output_format;
       unsigned src_offset = (vinfo->attrib[i].src_index * 4 * sizeof(float) );
 
+      output_format = draw_translate_vinfo_format(vinfo->attrib[i].emit);
+      emit_sz = draw_translate_vinfo_size(vinfo->attrib[i].emit);
 
-         
-      switch (vinfo->attrib[i].emit) {
-      case EMIT_4F:
-	 output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
-	 emit_sz = 4 * sizeof(float);
-	 break;
-      case EMIT_3F:
-	 output_format = PIPE_FORMAT_R32G32B32_FLOAT;
-	 emit_sz = 3 * sizeof(float);
-	 break;
-      case EMIT_2F:
-	 output_format = PIPE_FORMAT_R32G32_FLOAT;
-	 emit_sz = 2 * sizeof(float);
-	 break;
-      case EMIT_1F:
-	 output_format = PIPE_FORMAT_R32_FLOAT;
-	 emit_sz = 1 * sizeof(float);
-	 break;
-      case EMIT_1F_PSIZE:
-	 output_format = PIPE_FORMAT_R32_FLOAT;
-	 emit_sz = 1 * sizeof(float);
+      /* doesn't handle EMIT_OMIT */
+      assert(emit_sz != 0);
+
+      if (vinfo->attrib[i].emit == EMIT_1F_PSIZE) {
 	 src_buffer = 1;
 	 src_offset = 0;
-	 break;
-      case EMIT_4UB:
-	 output_format = PIPE_FORMAT_A8R8G8B8_UNORM;
-	 emit_sz = 4 * sizeof(ubyte);
-         break;
-      default:
-	 assert(0);
-	 output_format = PIPE_FORMAT_NONE;
-	 emit_sz = 0;
-	 break;
       }
 
       hw_key.element[i].type = TRANSLATE_ELEMENT_NORMAL;
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
index 2a604470e9..1994ddf2bc 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
@@ -129,41 +129,16 @@ static void fetch_emit_prepare( struct draw_pt_middle_end *middle,
       unsigned input_offset = src->src_offset;
       unsigned output_format;
 
-      switch (vinfo->attrib[i].emit) {
-      case EMIT_4UB:
-	 output_format = PIPE_FORMAT_R8G8B8A8_UNORM;
-	 emit_sz = 4 * sizeof(unsigned char);
-         break;
-      case EMIT_4F:
-	 output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
-	 emit_sz = 4 * sizeof(float);
-         break;
-      case EMIT_3F:
-	 output_format = PIPE_FORMAT_R32G32B32_FLOAT;
-	 emit_sz = 3 * sizeof(float);
-         break;
-      case EMIT_2F:
-	 output_format = PIPE_FORMAT_R32G32_FLOAT;
-	 emit_sz = 2 * sizeof(float);
-         break;
-      case EMIT_1F:
-	 output_format = PIPE_FORMAT_R32_FLOAT;
-	 emit_sz = 1 * sizeof(float);
-         break;
-      case EMIT_1F_PSIZE:
+      output_format = draw_translate_vinfo_format(vinfo->attrib[i].emit);
+      emit_sz = draw_translate_vinfo_size(vinfo->attrib[i].emit);
+
+      if (vinfo->attrib[i].emit == EMIT_OMIT)
+	 continue;
+
+      if (vinfo->attrib[i].emit == EMIT_1F_PSIZE) {
 	 input_format = PIPE_FORMAT_R32_FLOAT;
 	 input_buffer = draw->pt.nr_vertex_buffers;
 	 input_offset = 0;
-	 output_format = PIPE_FORMAT_R32_FLOAT;
-	 emit_sz = 1 * sizeof(float);
-         break;
-      case EMIT_OMIT:
-         continue;
-      default:
-         assert(0);
-	 output_format = PIPE_FORMAT_NONE;
-	 emit_sz = 0;
-	 continue;
       }
 
       key.element[i].type = TRANSLATE_ELEMENT_NORMAL;
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
index 1aecb51077..389e2b105e 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
@@ -130,31 +130,10 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
       unsigned dst_offset = 0;
 
       for (i = 0; i < vinfo->num_attribs; i++) {
-         unsigned emit_sz = 0;
-
-         switch (vinfo->attrib[i].emit) {
-         case EMIT_4F:
-            emit_sz = 4 * sizeof(float);
-            break;
-         case EMIT_3F:
-            emit_sz = 3 * sizeof(float);
-            break;
-         case EMIT_2F:
-            emit_sz = 2 * sizeof(float);
-            break;
-         case EMIT_1F:
-            emit_sz = 1 * sizeof(float);
-            break;
-         case EMIT_1F_PSIZE:
-            emit_sz = 1 * sizeof(float);
-            break;
-         case EMIT_4UB:
-            emit_sz = 4 * sizeof(ubyte);
-            break;
-         default:
-            assert(0);
-            break;
-         }
+         unsigned emit_sz = draw_translate_vinfo_size(vinfo->attrib[i].emit);
+
+         /* doesn't handle EMIT_OMIT */
+         assert(emit_sz != 0);
 
          /* The elements in the key correspond to vertex shader output
           * numbers, not to positions in the hw vertex description --
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
new file mode 100644
index 0000000000..f71271bd91
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
@@ -0,0 +1,440 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMWare, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "draw/draw_context.h"
+#include "draw/draw_vbuf.h"
+#include "draw/draw_vertex.h"
+#include "draw/draw_pt.h"
+#include "draw/draw_vs.h"
+#include "draw/draw_gs.h"
+#include "draw/draw_llvm.h"
+
+
+struct llvm_middle_end {
+   struct draw_pt_middle_end base;
+   struct draw_context *draw;
+
+   struct pt_emit *emit;
+   struct pt_fetch *fetch;
+   struct pt_post_vs *post_vs;
+
+
+   unsigned vertex_data_offset;
+   unsigned vertex_size;
+   unsigned prim;
+   unsigned opt;
+
+   struct draw_llvm *llvm;
+   struct draw_llvm_variant *variants;
+   struct draw_llvm_variant *current_variant;
+   int nr_variants;
+};
+
+
+static void
+llvm_middle_end_prepare( struct draw_pt_middle_end *middle,
+                         unsigned prim,
+                         unsigned opt,
+                         unsigned *max_vertices )
+{
+   struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle;
+   struct draw_context *draw = fpme->draw;
+   struct draw_vertex_shader *vs = draw->vs.vertex_shader;
+   struct draw_geometry_shader *gs = draw->gs.geometry_shader;
+   struct draw_llvm_variant_key key;
+   struct draw_llvm_variant *variant = NULL;
+   unsigned i;
+   unsigned instance_id_index = ~0;
+
+   /* Add one to num_outputs because the pipeline occasionally tags on
+    * an additional texcoord, eg for AA lines.
+    */
+   unsigned nr = MAX2( vs->info.num_inputs,
+		       vs->info.num_outputs + 1 );
+
+   /* Scan for instanceID system value.
+    */
+   for (i = 0; i < vs->info.num_inputs; i++) {
+      if (vs->info.input_semantic_name[i] == TGSI_SEMANTIC_INSTANCEID) {
+         instance_id_index = i;
+         break;
+      }
+   }
+
+   fpme->prim = prim;
+   fpme->opt = opt;
+
+   /* Always leave room for the vertex header whether we need it or
+    * not.  It's hard to get rid of it in particular because of the
+    * viewport code in draw_pt_post_vs.c.
+    */
+   fpme->vertex_size = sizeof(struct vertex_header) + nr * 4 * sizeof(float);
+
+
+
+   draw_pt_fetch_prepare( fpme->fetch,
+                          vs->info.num_inputs,
+                          fpme->vertex_size,
+                          instance_id_index );
+   if (opt & PT_SHADE) {
+      vs->prepare(vs, draw);
+      draw_geometry_shader_prepare(gs, draw);
+   }
+
+
+   /* XXX: it's not really gl rasterization rules we care about here,
+    * but gl vs dx9 clip spaces.
+    */
+   draw_pt_post_vs_prepare( fpme->post_vs,
+			    (boolean)draw->bypass_clipping,
+			    (boolean)(draw->identity_viewport),
+			    (boolean)draw->rasterizer->gl_rasterization_rules,
+			    (draw->vs.edgeflag_output ? true : false) );
+
+   if (!(opt & PT_PIPELINE)) {
+      draw_pt_emit_prepare( fpme->emit,
+			    prim,
+                            max_vertices );
+
+      *max_vertices = MAX2( *max_vertices,
+                            DRAW_PIPE_MAX_VERTICES );
+   }
+   else {
+      *max_vertices = DRAW_PIPE_MAX_VERTICES;
+   }
+
+   /* return even number */
+   *max_vertices = *max_vertices & ~1;
+
+   draw_llvm_make_variant_key(fpme->llvm, &key);
+
+   variant = fpme->variants;
+   while(variant) {
+      if(memcmp(&variant->key, &key, sizeof key) == 0)
+         break;
+
+      variant = variant->next;
+   }
+
+   if (!variant) {
+      variant = draw_llvm_prepare(fpme->llvm, nr);
+      variant->next = fpme->variants;
+      fpme->variants = variant;
+      ++fpme->nr_variants;
+   }
+   fpme->current_variant = variant;
+
+   /*XXX we only support one constant buffer */
+   fpme->llvm->jit_context.vs_constants =
+      draw->pt.user.vs_constants[0];
+   fpme->llvm->jit_context.gs_constants =
+      draw->pt.user.gs_constants[0];
+}
+
+
+
+static void llvm_middle_end_run( struct draw_pt_middle_end *middle,
+                                 const unsigned *fetch_elts,
+                                 unsigned fetch_count,
+                                 const ushort *draw_elts,
+                                 unsigned draw_count )
+{
+   struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle;
+   struct draw_context *draw = fpme->draw;
+   struct draw_vertex_shader *vshader = draw->vs.vertex_shader;
+   struct draw_geometry_shader *gshader = draw->gs.geometry_shader;
+   unsigned opt = fpme->opt;
+   unsigned alloc_count = align( fetch_count, 4 );
+
+   struct vertex_header *pipeline_verts =
+      (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count);
+
+   if (!pipeline_verts) {
+      /* Not much we can do here - just skip the rendering.
+       */
+      assert(0);
+      return;
+   }
+
+   /* Fetch into our vertex buffer
+    */
+   draw_pt_fetch_run( fpme->fetch,
+		      fetch_elts,
+		      fetch_count,
+		      (char *)pipeline_verts );
+
+   /* Run the shader, note that this overwrites the data[] parts of
+    * the pipeline verts.  If there is no shader, eg if
+    * bypass_vs_clip_and_viewport, then the inputs == outputs, and are
+    * already in the correct place.*/
+   if (opt & PT_SHADE)
+   {
+      vshader->run_linear(vshader,
+                          (const float (*)[4])pipeline_verts->data,
+                          (      float (*)[4])pipeline_verts->data,
+                          draw->pt.user.vs_constants,
+                          fetch_count,
+                          fpme->vertex_size,
+                          fpme->vertex_size);
+      if (gshader)
+         draw_geometry_shader_run(gshader,
+                                  (const float (*)[4])pipeline_verts->data,
+                                  (      float (*)[4])pipeline_verts->data,
+                                  draw->pt.user.gs_constants,
+                                  fetch_count,
+                                  fpme->vertex_size,
+                                  fpme->vertex_size);
+   }
+
+   if (draw_pt_post_vs_run( fpme->post_vs,
+			    pipeline_verts,
+			    fetch_count,
+			    fpme->vertex_size ))
+   {
+      opt |= PT_PIPELINE;
+   }
+
+   /* Do we need to run the pipeline?
+    */
+   if (opt & PT_PIPELINE) {
+      draw_pipeline_run( fpme->draw,
+                         fpme->prim,
+                         pipeline_verts,
+                         fetch_count,
+                         fpme->vertex_size,
+                         draw_elts,
+                         draw_count );
+   }
+   else {
+      draw_pt_emit( fpme->emit,
+		    (const float (*)[4])pipeline_verts->data,
+		    fetch_count,
+		    fpme->vertex_size,
+		    draw_elts,
+		    draw_count );
+   }
+
+
+   FREE(pipeline_verts);
+}
+
+
+static void llvm_middle_end_linear_run( struct draw_pt_middle_end *middle,
+                                       unsigned start,
+                                       unsigned count)
+{
+   struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle;
+   struct draw_context *draw = fpme->draw;
+   unsigned opt = fpme->opt;
+   unsigned alloc_count = align( count, 4 );
+
+   struct vertex_header *pipeline_verts =
+      (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count);
+
+   if (!pipeline_verts) {
+      /* Not much we can do here - just skip the rendering.
+       */
+      assert(0);
+      return;
+   }
+
+#if 0
+   debug_printf("#### Pipeline = %p (data = %p)\n",
+                pipeline_verts, pipeline_verts->data);
+#endif
+   fpme->current_variant->jit_func( &fpme->llvm->jit_context,
+                                    pipeline_verts,
+                                    (const char **)draw->pt.user.vbuffer,
+                                    start,
+                                    count,
+                                    fpme->vertex_size,
+                                    draw->pt.vertex_buffer );
+
+   if (draw_pt_post_vs_run( fpme->post_vs,
+			    pipeline_verts,
+			    count,
+			    fpme->vertex_size ))
+   {
+      opt |= PT_PIPELINE;
+   }
+
+   /* Do we need to run the pipeline?
+    */
+   if (opt & PT_PIPELINE) {
+      draw_pipeline_run_linear( fpme->draw,
+                                fpme->prim,
+                                pipeline_verts,
+                                count,
+                                fpme->vertex_size);
+   }
+   else {
+      draw_pt_emit_linear( fpme->emit,
+                           (const float (*)[4])pipeline_verts->data,
+                           fpme->vertex_size,
+                           count );
+   }
+
+   FREE(pipeline_verts);
+}
+
+
+
+static boolean
+llvm_middle_end_linear_run_elts( struct draw_pt_middle_end *middle,
+                                 unsigned start,
+                                 unsigned count,
+                                 const ushort *draw_elts,
+                                 unsigned draw_count )
+{
+   struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle;
+   struct draw_context *draw = fpme->draw;
+   unsigned opt = fpme->opt;
+   unsigned alloc_count = align( count, 4 );
+
+   struct vertex_header *pipeline_verts =
+      (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count);
+
+   if (!pipeline_verts)
+      return FALSE;
+
+   fpme->current_variant->jit_func( &fpme->llvm->jit_context,
+                                    pipeline_verts,
+                                    (const char **)draw->pt.user.vbuffer,
+                                    start,
+                                    count,
+                                    fpme->vertex_size,
+                                    draw->pt.vertex_buffer );
+
+   if (draw_pt_post_vs_run( fpme->post_vs,
+			    pipeline_verts,
+			    count,
+			    fpme->vertex_size ))
+   {
+      opt |= PT_PIPELINE;
+   }
+
+   /* Do we need to run the pipeline?
+    */
+   if (opt & PT_PIPELINE) {
+      draw_pipeline_run( fpme->draw,
+                         fpme->prim,
+                         pipeline_verts,
+                         count,
+                         fpme->vertex_size,
+                         draw_elts,
+                         draw_count );
+   }
+   else {
+      draw_pt_emit( fpme->emit,
+		    (const float (*)[4])pipeline_verts->data,
+		    count,
+		    fpme->vertex_size,
+		    draw_elts,
+		    draw_count );
+   }
+
+   FREE(pipeline_verts);
+   return TRUE;
+}
+
+
+
+static void llvm_middle_end_finish( struct draw_pt_middle_end *middle )
+{
+   /* nothing to do */
+}
+
+static void llvm_middle_end_destroy( struct draw_pt_middle_end *middle )
+{
+   struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle;
+
+   if (fpme->fetch)
+      draw_pt_fetch_destroy( fpme->fetch );
+
+   if (fpme->emit)
+      draw_pt_emit_destroy( fpme->emit );
+
+   if (fpme->post_vs)
+      draw_pt_post_vs_destroy( fpme->post_vs );
+
+   if (fpme->llvm)
+      draw_llvm_destroy( fpme->llvm );
+
+   FREE(middle);
+}
+
+
+struct draw_pt_middle_end *draw_pt_fetch_pipeline_or_emit_llvm( struct draw_context *draw )
+{
+   struct llvm_middle_end *fpme = 0;
+
+   if (!draw->engine)
+      return NULL;
+
+   fpme = CALLOC_STRUCT( llvm_middle_end );
+   if (!fpme)
+      goto fail;
+
+   fpme->base.prepare         = llvm_middle_end_prepare;
+   fpme->base.run             = llvm_middle_end_run;
+   fpme->base.run_linear      = llvm_middle_end_linear_run;
+   fpme->base.run_linear_elts = llvm_middle_end_linear_run_elts;
+   fpme->base.finish          = llvm_middle_end_finish;
+   fpme->base.destroy         = llvm_middle_end_destroy;
+
+   fpme->draw = draw;
+
+   fpme->fetch = draw_pt_fetch_create( draw );
+   if (!fpme->fetch)
+      goto fail;
+
+   fpme->post_vs = draw_pt_post_vs_create( draw );
+   if (!fpme->post_vs)
+      goto fail;
+
+   fpme->emit = draw_pt_emit_create( draw );
+   if (!fpme->emit)
+      goto fail;
+
+   fpme->llvm = draw_llvm_create(draw);
+   if (!fpme->llvm)
+      goto fail;
+
+   fpme->variants = NULL;
+   fpme->current_variant = NULL;
+   fpme->nr_variants = 0;
+
+   return &fpme->base;
+
+ fail:
+   if (fpme)
+      llvm_middle_end_destroy( &fpme->base );
+
+   return NULL;
+}
diff --git a/src/gallium/auxiliary/draw/draw_pt_post_vs.c b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
index 9728d5c2bd..5525dfc748 100644
--- a/src/gallium/auxiliary/draw/draw_pt_post_vs.c
+++ b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
@@ -108,6 +108,11 @@ static boolean post_vs_cliptest_viewport_gl( struct pt_post_vs *pvs,
    for (j = 0; j < count; j++) {
       float *position = out->data[pos];
 
+#if 0
+      debug_printf("%d) io = %p, data = %p = [%f, %f, %f, %f]\n",
+                   j, out, position, position[0], position[1], position[2], position[3]);
+#endif
+
       out->clip[0] = position[0];
       out->clip[1] = position[1];
       out->clip[2] = position[2];
diff --git a/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h b/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h
index 62822a3d56..7cba8547f1 100644
--- a/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h
+++ b/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h
@@ -118,39 +118,21 @@ static void FUNC( struct draw_pt_front_end *frontend,
 
    case PIPE_PRIM_QUADS:
       for (i = 0; i+3 < count; i += 4) {
-         if (flatfirst) {
-            QUAD( vcache,
-                  get_elt(elts, i + 0),
-                  get_elt(elts, i + 1),
-                  get_elt(elts, i + 2),
-                  get_elt(elts, i + 3) );
-         }
-         else {
-            QUAD( vcache,
-                  get_elt(elts, i + 0),
-                  get_elt(elts, i + 1),
-                  get_elt(elts, i + 2),
-                  get_elt(elts, i + 3) );
-         }
+         QUAD( vcache,
+               get_elt(elts, i + 0),
+               get_elt(elts, i + 1),
+               get_elt(elts, i + 2),
+               get_elt(elts, i + 3) );
       }
       break;
 
    case PIPE_PRIM_QUAD_STRIP:
       for (i = 0; i+3 < count; i += 2) {
-         if (flatfirst) {
-            QUAD( vcache,
-                  get_elt(elts, i + 0),
-                  get_elt(elts, i + 1),
-                  get_elt(elts, i + 3),
-                  get_elt(elts, i + 2) );
-         }
-         else {
-            QUAD( vcache,
-                  get_elt(elts, i + 2),
-                  get_elt(elts, i + 0),
-                  get_elt(elts, i + 1),
-                  get_elt(elts, i + 3) );
-         }
+         QUAD( vcache,
+               get_elt(elts, i + 2),
+               get_elt(elts, i + 0),
+               get_elt(elts, i + 1),
+               get_elt(elts, i + 3) );
       }
       break;
 
diff --git a/src/gallium/auxiliary/draw/draw_vertex.c b/src/gallium/auxiliary/draw/draw_vertex.c
index 3214213e44..a4f5e882c0 100644
--- a/src/gallium/auxiliary/draw/draw_vertex.c
+++ b/src/gallium/auxiliary/draw/draw_vertex.c
@@ -48,30 +48,12 @@ draw_compute_vertex_size(struct vertex_info *vinfo)
    uint i;
 
    vinfo->size = 0;
-   for (i = 0; i < vinfo->num_attribs; i++) {
-      switch (vinfo->attrib[i].emit) {
-      case EMIT_OMIT:
-         break;
-      case EMIT_4UB:
-         /* fall-through */
-      case EMIT_1F_PSIZE:
-         /* fall-through */
-      case EMIT_1F:
-         vinfo->size += 1;
-         break;
-      case EMIT_2F:
-         vinfo->size += 2;
-         break;
-      case EMIT_3F:
-         vinfo->size += 3;
-         break;
-      case EMIT_4F:
-         vinfo->size += 4;
-         break;
-      default:
-         assert(0);
-      }
-   }
+   for (i = 0; i < vinfo->num_attribs; i++)
+      vinfo->size += draw_translate_vinfo_size(vinfo->attrib[i].emit);
+
+   assert(vinfo->size % 4 == 0);
+   /* in dwords */
+   vinfo->size /= 4;
 }
 
 
@@ -120,6 +102,13 @@ draw_dump_emitted_vertex(const struct vertex_info *vinfo, const uint8_t *data)
          debug_printf("%u ", *data++);
          debug_printf("%u ", *data++);
          break;
+      case EMIT_4UB_BGRA:
+         debug_printf("EMIT_4UB_BGRA:\t");
+         debug_printf("%u ", *data++);
+         debug_printf("%u ", *data++);
+         debug_printf("%u ", *data++);
+         debug_printf("%u ", *data++);
+         break;
       default:
          assert(0);
       }
diff --git a/src/gallium/auxiliary/draw/draw_vertex.h b/src/gallium/auxiliary/draw/draw_vertex.h
index 8c3c7befbc..3af31ffe12 100644
--- a/src/gallium/auxiliary/draw/draw_vertex.h
+++ b/src/gallium/auxiliary/draw/draw_vertex.h
@@ -54,7 +54,8 @@ enum attrib_emit {
    EMIT_2F,
    EMIT_3F,
    EMIT_4F,
-   EMIT_4UB  /**< XXX may need variations for RGBA vs BGRA, etc */
+   EMIT_4UB, /**< is RGBA like the rest */
+   EMIT_4UB_BGRA
 };
 
 
@@ -141,9 +142,11 @@ void draw_dump_emitted_vertex(const struct vertex_info *vinfo,
                               const uint8_t *data);
 
 
-static INLINE unsigned draw_translate_vinfo_format(unsigned format )
+static INLINE enum pipe_format draw_translate_vinfo_format(enum attrib_emit emit)
 {
-   switch (format) {
+   switch (emit) {
+   case EMIT_OMIT:
+      return PIPE_FORMAT_NONE;
    case EMIT_1F:
    case EMIT_1F_PSIZE:
       return PIPE_FORMAT_R32_FLOAT;
@@ -155,10 +158,36 @@ static INLINE unsigned draw_translate_vinfo_format(unsigned format )
       return PIPE_FORMAT_R32G32B32A32_FLOAT;
    case EMIT_4UB:
       return PIPE_FORMAT_R8G8B8A8_UNORM;
+   case EMIT_4UB_BGRA:
+      return PIPE_FORMAT_B8G8R8A8_UNORM;
    default:
+      assert(!"unexpected format");
       return PIPE_FORMAT_NONE;
    }
 }
 
+static INLINE enum attrib_emit draw_translate_vinfo_size(enum attrib_emit emit)
+{
+   switch (emit) {
+   case EMIT_OMIT:
+      return 0;
+   case EMIT_1F:
+   case EMIT_1F_PSIZE:
+      return 1 * sizeof(float);
+   case EMIT_2F:
+      return 2 * sizeof(float);
+   case EMIT_3F:
+      return 3 * sizeof(float);
+   case EMIT_4F:
+      return 4 * sizeof(float);
+   case EMIT_4UB:
+      return 4 * sizeof(unsigned char);
+   case EMIT_4UB_BGRA:
+      return 4 * sizeof(unsigned char);
+   default:
+      assert(!"unexpected format");
+      return 0;
+   }
+}
 
 #endif /* DRAW_VERTEX_H */
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
index ece1ddde0c..8f8bbe7cb8 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
@@ -401,13 +401,11 @@ static boolean emit_output( struct aos_compilation *cp,
       emit_store_R32G32B32A32(cp, ptr, dataXMM);
       break;
    case EMIT_4UB:
-      if (1) {
-         emit_swizzle(cp, dataXMM, dataXMM, SHUF(Z,Y,X,W));
-         emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM);
-      }
-      else {
-         emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM);
-      }
+      emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM);
+      break;
+   case EMIT_4UB_BGRA:
+      emit_swizzle(cp, dataXMM, dataXMM, SHUF(Z,Y,X,W));
+      emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM);
       break;
    default:
       AOS_ERROR(cp, "unhandled output format");
diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c
index 5f7a645f5d..2a3b6b39b9 100644
--- a/src/gallium/auxiliary/draw/draw_vs_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c
@@ -40,7 +40,7 @@
 
 #include "tgsi/tgsi_parse.h"
 
-#ifdef MESA_LLVM
+#ifdef HAVE_LLVM
 
 struct draw_llvm_vertex_shader {
    struct draw_vertex_shader base;
@@ -64,12 +64,8 @@ vs_llvm_run_linear( struct draw_vertex_shader *base,
 		   unsigned input_stride,
 		   unsigned output_stride )
 {
-   struct draw_llvm_vertex_shader *shader =
-      (struct draw_llvm_vertex_shader *)base;
 }
 
-
-
 static void
 vs_llvm_delete( struct draw_vertex_shader *base )
 {
@@ -90,6 +86,7 @@ struct draw_vertex_shader *
 draw_create_vs_llvm(struct draw_context *draw,
 		    const struct pipe_shader_state *templ)
 {
+#if 0
    struct draw_llvm_vertex_shader *vs;
 
    vs = CALLOC_STRUCT( draw_llvm_vertex_shader );
@@ -113,6 +110,8 @@ draw_create_vs_llvm(struct draw_context *draw,
    vs->machine = draw->vs.machine;
 
    return &vs->base;
+#endif
+   return NULL;
 }
 
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_alpha.h b/src/gallium/auxiliary/gallivm/lp_bld.h
index 634575670d..2fa682f400 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_alpha.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld.h
@@ -1,6 +1,6 @@
 /**************************************************************************
  *
- * Copyright 2009 VMware, Inc.
+ * Copyright 2010 VMware, Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -26,29 +26,22 @@
  **************************************************************************/
 
 /**
- * Alpha testing to LLVM IR translation.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
+ * @file
+ * Wrapper for LLVM header file #includes.
  */
 
-#ifndef LP_BLD_ALPHA_H
-#define LP_BLD_ALPHA_H
 
+#ifndef LP_BLD_H
+#define LP_BLD_H
 
-#include <llvm-c/Core.h>  
 
-struct pipe_alpha_state;
-struct lp_type;
-struct lp_build_mask_context;
+#include <llvm-c/Core.h>  
 
 
-void
-lp_build_alpha_test(LLVMBuilderRef builder,
-                    const struct pipe_alpha_state *state,
-                    struct lp_type type,
-                    struct lp_build_mask_context *mask,
-                    LLVMValueRef alpha,
-                    LLVMValueRef ref);
+/** Ensure HAVE_LLVM is set to avoid #ifdef HAVE_LLVM everywhere */
+#ifndef HAVE_LLVM
+#error "HAVE_LLVM should be set with LLVM's version number, e.g. (0x0207 for 2.7)"
+#endif
 
 
-#endif /* !LP_BLD_ALPHA_H */
+#endif /* LP_BLD_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_alpha.c b/src/gallium/auxiliary/gallivm/lp_bld_alpha.c
deleted file mode 100644
index 7245730350..0000000000
--- a/src/gallium/auxiliary/gallivm/lp_bld_alpha.c
+++ /dev/null
@@ -1,63 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * Alpha testing to LLVM IR translation.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-#include "pipe/p_state.h"
-
-#include "lp_bld_type.h"
-#include "lp_bld_const.h"
-#include "lp_bld_logic.h"
-#include "lp_bld_flow.h"
-#include "lp_bld_debug.h"
-#include "lp_bld_alpha.h"
-
-
-void
-lp_build_alpha_test(LLVMBuilderRef builder,
-                    const struct pipe_alpha_state *state,
-                    struct lp_type type,
-                    struct lp_build_mask_context *mask,
-                    LLVMValueRef alpha,
-                    LLVMValueRef ref)
-{
-   struct lp_build_context bld;
-
-   lp_build_context_init(&bld, builder, type);
-
-   if(state->enabled) {
-      LLVMValueRef test = lp_build_cmp(&bld, state->func, alpha, ref);
-
-      lp_build_name(test, "alpha_mask");
-
-      lp_build_mask_update(mask, test);
-   }
-}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 32f9e5201c..8e8fcccf56 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -232,6 +232,37 @@ lp_build_add(struct lp_build_context *bld,
 }
 
 
+/** Return the sum of the elements of a */
+LLVMValueRef
+lp_build_sum_vector(struct lp_build_context *bld,
+                    LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMValueRef index, res;
+   int i;
+
+   if (a == bld->zero)
+      return bld->zero;
+   if (a == bld->undef)
+      return bld->undef;
+   assert(type.length > 1);
+
+   assert(!bld->type.norm);
+
+   index = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   res = LLVMBuildExtractElement(bld->builder, a, index, "");
+
+   for (i = 1; i < type.length; i++) {
+      index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      res = LLVMBuildAdd(bld->builder, res,
+                         LLVMBuildExtractElement(bld->builder, a, index, ""),
+                         "");
+   }
+
+   return res;
+}
+
+
 /**
  * Generate a - b
  */
@@ -330,12 +361,12 @@ lp_build_mul_u8n(LLVMBuilderRef builder,
    LLVMValueRef c8;
    LLVMValueRef ab;
 
-   c8 = lp_build_int_const_scalar(i16_type, 8);
+   c8 = lp_build_const_int_vec(i16_type, 8);
    
 #if 0
    
    /* a*b/255 ~= (a*(b + 1)) >> 256 */
-   b = LLVMBuildAdd(builder, b, lp_build_int_const_scalar(i16_type, 1), "");
+   b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(i16_type, 1), "");
    ab = LLVMBuildMul(builder, a, b, "");
 
 #else
@@ -343,7 +374,7 @@ lp_build_mul_u8n(LLVMBuilderRef builder,
    /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
    ab = LLVMBuildMul(builder, a, b, "");
    ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
-   ab = LLVMBuildAdd(builder, ab, lp_build_int_const_scalar(i16_type, 0x80), "");
+   ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(i16_type, 0x80), "");
 
 #endif
    
@@ -398,7 +429,7 @@ lp_build_mul(struct lp_build_context *bld,
    }
 
    if(type.fixed)
-      shift = lp_build_int_const_scalar(type, type.width/2);
+      shift = lp_build_const_int_vec(type, type.width/2);
    else
       shift = NULL;
 
@@ -460,7 +491,7 @@ lp_build_mul_imm(struct lp_build_context *bld,
           * for Inf and NaN.
           */
          unsigned mantissa = lp_mantissa(bld->type);
-         factor = lp_build_int_const_scalar(bld->type, (unsigned long long)shift << mantissa);
+         factor = lp_build_const_int_vec(bld->type, (unsigned long long)shift << mantissa);
          a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
          a = LLVMBuildAdd(bld->builder, a, factor, "");
          a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
@@ -468,12 +499,12 @@ lp_build_mul_imm(struct lp_build_context *bld,
 #endif
       }
       else {
-         factor = lp_build_const_scalar(bld->type, shift);
+         factor = lp_build_const_vec(bld->type, shift);
          return LLVMBuildShl(bld->builder, a, factor, "");
       }
    }
 
-   factor = lp_build_const_scalar(bld->type, (double)b);
+   factor = lp_build_const_vec(bld->type, (double)b);
    return lp_build_mul(bld, a, factor);
 }
 
@@ -536,7 +567,7 @@ lp_build_lerp(struct lp_build_context *bld,
        * but it will be wrong for other uses. Basically we need a more
        * powerful lp_type, capable of further distinguishing the values
        * interpretation from the value storage. */
-      res = LLVMBuildAnd(bld->builder, res, lp_build_int_const_scalar(bld->type, (1 << bld->type.width/2) - 1), "");
+      res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), "");
 
    return res;
 }
@@ -644,13 +675,26 @@ lp_build_abs(struct lp_build_context *bld,
 
    if(type.floating) {
       /* Mask out the sign bit */
-      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
-      unsigned long long absMask = ~(1ULL << (type.width - 1));
-      LLVMValueRef mask = lp_build_int_const_scalar(type, ((unsigned long long) absMask));
-      a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
-      a = LLVMBuildAnd(bld->builder, a, mask, "");
-      a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
-      return a;
+      if (type.length == 1) {
+         LLVMTypeRef int_type = LLVMIntType(type.width);
+         LLVMTypeRef float_type = LLVMFloatType();
+         unsigned long long absMask = ~(1ULL << (type.width - 1));
+         LLVMValueRef mask = LLVMConstInt(int_type, absMask, 0);
+         a = LLVMBuildBitCast(bld->builder, a, int_type, "");
+         a = LLVMBuildAnd(bld->builder, a, mask, "");
+         a = LLVMBuildBitCast(bld->builder, a, float_type, "");
+         return a;
+      }
+      else {
+         /* vector of floats */
+         LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+         unsigned long long absMask = ~(1ULL << (type.width - 1));
+         LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask));
+         a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+         a = LLVMBuildAnd(bld->builder, a, mask, "");
+         a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
+         return a;
+      }
    }
 
    if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
@@ -676,12 +720,12 @@ lp_build_negate(struct lp_build_context *bld,
 }
 
 
+/** Return -1, 0 or +1 depending on the sign of a */
 LLVMValueRef
 lp_build_sgn(struct lp_build_context *bld,
              LLVMValueRef a)
 {
    const struct lp_type type = bld->type;
-   LLVMTypeRef vec_type = lp_build_vec_type(type);
    LLVMValueRef cond;
    LLVMValueRef res;
 
@@ -691,27 +735,42 @@ lp_build_sgn(struct lp_build_context *bld,
       res = bld->one;
    }
    else if(type.floating) {
-      /* Take the sign bit and add it to 1 constant */
-      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
-      LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+      LLVMTypeRef vec_type;
+      LLVMTypeRef int_type;
+      LLVMValueRef mask;
       LLVMValueRef sign;
       LLVMValueRef one;
-      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
+
+      if (type.length == 1) {
+         int_type = lp_build_int_elem_type(type);
+         vec_type = lp_build_elem_type(type);
+         mask = LLVMConstInt(int_type, maskBit, 0);
+      }
+      else {
+         /* vector */
+         int_type = lp_build_int_vec_type(type);
+         vec_type = lp_build_vec_type(type);
+         mask = lp_build_const_int_vec(type, maskBit);
+      }
+
+      /* Take the sign bit and add it to 1 constant */
+      sign = LLVMBuildBitCast(bld->builder, a, int_type, "");
       sign = LLVMBuildAnd(bld->builder, sign, mask, "");
-      one = LLVMConstBitCast(bld->one, int_vec_type);
+      one = LLVMConstBitCast(bld->one, int_type);
       res = LLVMBuildOr(bld->builder, sign, one, "");
       res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
    }
    else
    {
-      LLVMValueRef minus_one = lp_build_const_scalar(type, -1.0);
+      LLVMValueRef minus_one = lp_build_const_vec(type, -1.0);
       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
       res = lp_build_select(bld, cond, bld->one, minus_one);
    }
 
    /* Handle zero */
    cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
-   res = lp_build_select(bld, cond, bld->zero, bld->one);
+   res = lp_build_select(bld, cond, bld->zero, res);
 
    return res;
 }
@@ -730,8 +789,8 @@ lp_build_set_sign(struct lp_build_context *bld,
    const struct lp_type type = bld->type;
    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
    LLVMTypeRef vec_type = lp_build_vec_type(type);
-   LLVMValueRef shift = lp_build_int_const_scalar(type, type.width - 1);
-   LLVMValueRef mask = lp_build_int_const_scalar(type,
+   LLVMValueRef shift = lp_build_const_int_vec(type, type.width - 1);
+   LLVMValueRef mask = lp_build_const_int_vec(type,
                              ~((unsigned long long) 1 << (type.width - 1)));
    LLVMValueRef val, res;
 
@@ -753,7 +812,7 @@ lp_build_set_sign(struct lp_build_context *bld,
 
 
 /**
- * Convert vector of int to vector of float.
+ * Convert vector of (or scalar) int to vector of (or scalar) float.
  */
 LLVMValueRef
 lp_build_int_to_float(struct lp_build_context *bld,
@@ -764,7 +823,11 @@ lp_build_int_to_float(struct lp_build_context *bld,
    assert(type.floating);
    /*assert(lp_check_value(type, a));*/
 
-   {
+   if (type.length == 1) {
+      LLVMTypeRef float_type = LLVMFloatType();
+      return LLVMBuildSIToFP(bld->builder, a, float_type, "");
+   }
+   else {
       LLVMTypeRef vec_type = lp_build_vec_type(type);
       /*LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);*/
       LLVMValueRef res;
@@ -866,6 +929,13 @@ lp_build_floor(struct lp_build_context *bld,
 
    assert(type.floating);
 
+   if (type.length == 1) {
+      LLVMValueRef res;
+      res = lp_build_ifloor(bld, a);
+      res = LLVMBuildSIToFP(bld->builder, res, LLVMFloatType(), "");
+      return res;
+   }
+
    if(util_cpu_caps.has_sse4_1)
       return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
    else {
@@ -921,15 +991,24 @@ lp_build_itrunc(struct lp_build_context *bld,
                 LLVMValueRef a)
 {
    const struct lp_type type = bld->type;
-   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
 
    assert(type.floating);
-   assert(lp_check_value(type, a));
 
-   return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
+   if (type.length == 1) {
+      LLVMTypeRef int_type = LLVMIntType(type.width);
+      return LLVMBuildFPToSI(bld->builder, a, int_type, "");
+   }
+   else {
+      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+      assert(lp_check_value(type, a));
+      return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
+   }
 }
 
 
+/**
+ * Convert float[] to int[] with round().
+ */
 LLVMValueRef
 lp_build_iround(struct lp_build_context *bld,
                 LLVMValueRef a)
@@ -939,6 +1018,15 @@ lp_build_iround(struct lp_build_context *bld,
    LLVMValueRef res;
 
    assert(type.floating);
+
+   if (type.length == 1) {
+      /* scalar float to int */
+      LLVMTypeRef int_type = LLVMIntType(type.width);
+      /* XXX we want rounding here! */
+      res = LLVMBuildFPToSI(bld->builder, a, int_type, "");
+      return res;
+   }
+
    assert(lp_check_value(type, a));
 
    if(util_cpu_caps.has_sse4_1) {
@@ -946,7 +1034,7 @@ lp_build_iround(struct lp_build_context *bld,
    }
    else {
       LLVMTypeRef vec_type = lp_build_vec_type(type);
-      LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
       LLVMValueRef sign;
       LLVMValueRef half;
 
@@ -955,7 +1043,7 @@ lp_build_iround(struct lp_build_context *bld,
       sign = LLVMBuildAnd(bld->builder, sign, mask, "");
 
       /* sign * 0.5 */
-      half = lp_build_const_scalar(type, 0.5);
+      half = lp_build_const_vec(type, 0.5);
       half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
       half = LLVMBuildOr(bld->builder, sign, half, "");
       half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
@@ -981,6 +1069,14 @@ lp_build_ifloor(struct lp_build_context *bld,
    LLVMValueRef res;
 
    assert(type.floating);
+
+   if (type.length == 1) {
+      /* scalar float to int */
+      LLVMTypeRef int_type = LLVMIntType(type.width);
+      res = LLVMBuildFPToSI(bld->builder, a, int_type, "");
+      return res;
+   }
+
    assert(lp_check_value(type, a));
 
    if(util_cpu_caps.has_sse4_1) {
@@ -990,18 +1086,18 @@ lp_build_ifloor(struct lp_build_context *bld,
       /* Take the sign bit and add it to 1 constant */
       LLVMTypeRef vec_type = lp_build_vec_type(type);
       unsigned mantissa = lp_mantissa(type);
-      LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
       LLVMValueRef sign;
       LLVMValueRef offset;
 
       /* sign = a < 0 ? ~0 : 0 */
       sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
       sign = LLVMBuildAnd(bld->builder, sign, mask, "");
-      sign = LLVMBuildAShr(bld->builder, sign, lp_build_int_const_scalar(type, type.width - 1), "");
+      sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "");
       lp_build_name(sign, "floor.sign");
 
       /* offset = -0.99999(9)f */
-      offset = lp_build_const_scalar(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa));
+      offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa));
       offset = LLVMConstBitCast(offset, int_vec_type);
 
       /* offset = a < 0 ? -0.99999(9)f : 0.0f */
@@ -1172,7 +1268,7 @@ lp_build_exp(struct lp_build_context *bld,
              LLVMValueRef x)
 {
    /* log2(e) = 1/log(2) */
-   LLVMValueRef log2e = lp_build_const_scalar(bld->type, 1.4426950408889634);
+   LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634);
 
    return lp_build_mul(bld, log2e, lp_build_exp2(bld, x));
 }
@@ -1186,7 +1282,7 @@ lp_build_log(struct lp_build_context *bld,
              LLVMValueRef x)
 {
    /* log(2) */
-   LLVMValueRef log2 = lp_build_const_scalar(bld->type, 0.69314718055994529);
+   LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529);
 
    return lp_build_mul(bld, log2, lp_build_exp2(bld, x));
 }
@@ -1207,6 +1303,7 @@ lp_build_polynomial(struct lp_build_context *bld,
                     unsigned num_coeffs)
 {
    const struct lp_type type = bld->type;
+   LLVMTypeRef float_type = LLVMFloatType();
    LLVMValueRef res = NULL;
    unsigned i;
 
@@ -1216,7 +1313,13 @@ lp_build_polynomial(struct lp_build_context *bld,
                    __FUNCTION__);
 
    for (i = num_coeffs; i--; ) {
-      LLVMValueRef coeff = lp_build_const_scalar(type, coeffs[i]);
+      LLVMValueRef coeff;
+
+      if (type.length == 1)
+         coeff = LLVMConstReal(float_type, coeffs[i]);
+      else
+         coeff = lp_build_const_vec(type, coeffs[i]);
+
       if(res)
          res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
       else
@@ -1272,11 +1375,11 @@ lp_build_exp2_approx(struct lp_build_context *bld,
 
       assert(type.floating && type.width == 32);
 
-      x = lp_build_min(bld, x, lp_build_const_scalar(type,  129.0));
-      x = lp_build_max(bld, x, lp_build_const_scalar(type, -126.99999));
+      x = lp_build_min(bld, x, lp_build_const_vec(type,  129.0));
+      x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));
 
       /* ipart = int(x - 0.5) */
-      ipart = LLVMBuildSub(bld->builder, x, lp_build_const_scalar(type, 0.5f), "");
+      ipart = LLVMBuildSub(bld->builder, x, lp_build_const_vec(type, 0.5f), "");
       ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
 
       /* fpart = x - ipart */
@@ -1286,8 +1389,8 @@ lp_build_exp2_approx(struct lp_build_context *bld,
 
    if(p_exp2_int_part || p_exp2) {
       /* expipart = (float) (1 << ipart) */
-      expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_int_const_scalar(type, 127), "");
-      expipart = LLVMBuildShl(bld->builder, expipart, lp_build_int_const_scalar(type, 23), "");
+      expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
+      expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), "");
       expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
    }
 
@@ -1353,8 +1456,8 @@ lp_build_log2_approx(struct lp_build_context *bld,
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
 
-   LLVMValueRef expmask = lp_build_int_const_scalar(type, 0x7f800000);
-   LLVMValueRef mantmask = lp_build_int_const_scalar(type, 0x007fffff);
+   LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000);
+   LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff);
    LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
 
    LLVMValueRef i = NULL;
@@ -1379,8 +1482,8 @@ lp_build_log2_approx(struct lp_build_context *bld,
    }
 
    if(p_floor_log2 || p_log2) {
-      logexp = LLVMBuildLShr(bld->builder, exp, lp_build_int_const_scalar(type, 23), "");
-      logexp = LLVMBuildSub(bld->builder, logexp, lp_build_int_const_scalar(type, 127), "");
+      logexp = LLVMBuildLShr(bld->builder, exp, lp_build_const_int_vec(type, 23), "");
+      logexp = LLVMBuildSub(bld->builder, logexp, lp_build_const_int_vec(type, 127), "");
       logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
    }
 
@@ -1410,11 +1513,87 @@ lp_build_log2_approx(struct lp_build_context *bld,
 }
 
 
+/** scalar version of above function */
+static void
+lp_build_float_log2_approx(struct lp_build_context *bld,
+                           LLVMValueRef x,
+                           LLVMValueRef *p_exp,
+                           LLVMValueRef *p_floor_log2,
+                           LLVMValueRef *p_log2)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef float_type = LLVMFloatType();
+   LLVMTypeRef int_type = LLVMIntType(type.width);
+
+   LLVMValueRef expmask = LLVMConstInt(int_type, 0x7f800000, 0);
+   LLVMValueRef mantmask = LLVMConstInt(int_type, 0x007fffff, 0);
+   LLVMValueRef one = LLVMConstBitCast(bld->one, int_type);
+
+   LLVMValueRef i = NULL;
+   LLVMValueRef exp = NULL;
+   LLVMValueRef mant = NULL;
+   LLVMValueRef logexp = NULL;
+   LLVMValueRef logmant = NULL;
+   LLVMValueRef res = NULL;
+
+   if(p_exp || p_floor_log2 || p_log2) {
+      /* TODO: optimize the constant case */
+      if(LLVMIsConstant(x))
+         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
+                      __FUNCTION__);
+
+      assert(type.floating && type.width == 32);
+
+      i = LLVMBuildBitCast(bld->builder, x, int_type, "");
+
+      /* exp = (float) exponent(x) */
+      exp = LLVMBuildAnd(bld->builder, i, expmask, "");
+   }
+
+   if(p_floor_log2 || p_log2) {
+      LLVMValueRef c23 = LLVMConstInt(int_type, 23, 0);
+      LLVMValueRef c127 = LLVMConstInt(int_type, 127, 0);
+      logexp = LLVMBuildLShr(bld->builder, exp, c23, "");
+      logexp = LLVMBuildSub(bld->builder, logexp, c127, "");
+      logexp = LLVMBuildSIToFP(bld->builder, logexp, float_type, "");
+   }
+
+   if(p_log2) {
+      /* mant = (float) mantissa(x) */
+      mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
+      mant = LLVMBuildOr(bld->builder, mant, one, "");
+      mant = LLVMBuildBitCast(bld->builder, mant, float_type, "");
+
+      logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
+                                    Elements(lp_build_log2_polynomial));
+
+      /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
+      logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");
+
+      res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
+   }
+
+   if(p_exp)
+      *p_exp = exp;
+
+   if(p_floor_log2)
+      *p_floor_log2 = logexp;
+
+   if(p_log2)
+      *p_log2 = res;
+}
+
+
 LLVMValueRef
 lp_build_log2(struct lp_build_context *bld,
               LLVMValueRef x)
 {
    LLVMValueRef res;
-   lp_build_log2_approx(bld, x, NULL, NULL, &res);
+   if (bld->type.length == 1) {
+      lp_build_float_log2_approx(bld, x, NULL, NULL, &res);
+   }
+   else {
+      lp_build_log2_approx(bld, x, NULL, NULL, &res);
+   }
    return res;
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
index 55385e3a66..31efa9921c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
@@ -37,7 +37,7 @@
 #define LP_BLD_ARIT_H
 
 
-#include <llvm-c/Core.h>  
+#include "gallivm/lp_bld.h"
 
 
 struct lp_type;
@@ -57,6 +57,10 @@ lp_build_add(struct lp_build_context *bld,
              LLVMValueRef b);
 
 LLVMValueRef
+lp_build_sum_vector(struct lp_build_context *bld,
+                    LLVMValueRef a);
+
+LLVMValueRef
 lp_build_sub(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_blend.h b/src/gallium/auxiliary/gallivm/lp_bld_blend.h
deleted file mode 100644
index da272e549f..0000000000
--- a/src/gallium/auxiliary/gallivm/lp_bld_blend.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#ifndef LP_BLD_BLEND_H
-#define LP_BLD_BLEND_H
-
-
-/**
- * @file
- * LLVM IR building helpers interfaces.
- *
- * We use LLVM-C bindings for now. They are not documented, but follow the C++
- * interfaces very closely, and appear to be complete enough for code
- * genration. See
- * http://npcontemplation.blogspot.com/2008/06/secret-of-llvm-c-bindings.html
- * for a standalone example.
- */
-
-#include <llvm-c/Core.h>  
- 
-#include "pipe/p_format.h"
-
-
-struct pipe_blend_state;
-struct lp_type;
-struct lp_build_context;
-
-
-/**
- * Whether the blending function is commutative or not.
- */
-boolean
-lp_build_blend_func_commutative(unsigned func);
-
-
-/**
- * Whether the blending functions are the reverse of each other.
- */
-boolean
-lp_build_blend_func_reverse(unsigned rgb_func, unsigned alpha_func);
-
-
-LLVMValueRef
-lp_build_blend_func(struct lp_build_context *bld,
-                    unsigned func,
-                    LLVMValueRef term1,
-                    LLVMValueRef term2);
-
-
-LLVMValueRef
-lp_build_blend_aos(LLVMBuilderRef builder,
-                   const struct pipe_blend_state *blend,
-                   struct lp_type type,
-                   LLVMValueRef src,
-                   LLVMValueRef dst,
-                   LLVMValueRef const_,
-                   unsigned alpha_swizzle);
-
-
-void
-lp_build_blend_soa(LLVMBuilderRef builder,
-                   const struct pipe_blend_state *blend,
-                   struct lp_type type,
-                   LLVMValueRef src[4],
-                   LLVMValueRef dst[4],
-                   LLVMValueRef const_[4],
-                   LLVMValueRef res[4]);
-
-
-/**
- * Apply a logic op.
- *
- * src/dst parameters are packed values. It should work regardless the inputs
- * are scalars, or a vector.
- */
-LLVMValueRef
-lp_build_logicop(LLVMBuilderRef builder,
-                 unsigned logicop_func,
-                 LLVMValueRef src,
-                 LLVMValueRef dst);
-
-
-#endif /* !LP_BLD_BLEND_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_blend_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_blend_aos.c
deleted file mode 100644
index 0215bb72ac..0000000000
--- a/src/gallium/auxiliary/gallivm/lp_bld_blend_aos.c
+++ /dev/null
@@ -1,360 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-/**
- * @file
- * Blend LLVM IR generation -- AoS layout.
- *
- * AoS blending is in general much slower than SoA, but there are some cases
- * where it might be faster. In particular, if a pixel is rendered only once
- * then the overhead of tiling and untiling will dominate over the speedup that
- * SoA gives. So we might want to detect such cases and fallback to AoS in the
- * future, but for now this function is here for historical/benchmarking
- * purposes.
- *
- * Run lp_blend_test after any change to this file.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#include "pipe/p_state.h"
-#include "util/u_debug.h"
-
-#include "lp_bld_type.h"
-#include "lp_bld_const.h"
-#include "lp_bld_arit.h"
-#include "lp_bld_logic.h"
-#include "lp_bld_swizzle.h"
-#include "lp_bld_blend.h"
-#include "lp_bld_debug.h"
-
-
-/**
- * We may the same values several times, so we keep them here to avoid
- * recomputing them. Also reusing the values allows us to do simplifications
- * that LLVM optimization passes wouldn't normally be able to do.
- */
-struct lp_build_blend_aos_context
-{
-   struct lp_build_context base;
-   
-   LLVMValueRef src;
-   LLVMValueRef dst;
-   LLVMValueRef const_;
-
-   LLVMValueRef inv_src;
-   LLVMValueRef inv_dst;
-   LLVMValueRef inv_const;
-   LLVMValueRef saturate;
-
-   LLVMValueRef rgb_src_factor;
-   LLVMValueRef alpha_src_factor;
-   LLVMValueRef rgb_dst_factor;
-   LLVMValueRef alpha_dst_factor;
-};
-
-
-static LLVMValueRef
-lp_build_blend_factor_unswizzled(struct lp_build_blend_aos_context *bld,
-                                 unsigned factor,
-                                 boolean alpha)
-{
-   switch (factor) {
-   case PIPE_BLENDFACTOR_ZERO:
-      return bld->base.zero;
-   case PIPE_BLENDFACTOR_ONE:
-      return bld->base.one;
-   case PIPE_BLENDFACTOR_SRC_COLOR:
-   case PIPE_BLENDFACTOR_SRC_ALPHA:
-      return bld->src;
-   case PIPE_BLENDFACTOR_DST_COLOR:
-   case PIPE_BLENDFACTOR_DST_ALPHA:
-      return bld->dst;
-   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-      if(alpha)
-         return bld->base.one;
-      else {
-         if(!bld->inv_dst)
-            bld->inv_dst = lp_build_comp(&bld->base, bld->dst);
-         if(!bld->saturate)
-            bld->saturate = lp_build_min(&bld->base, bld->src, bld->inv_dst);
-         return bld->saturate;
-      }
-   case PIPE_BLENDFACTOR_CONST_COLOR:
-   case PIPE_BLENDFACTOR_CONST_ALPHA:
-      return bld->const_;
-   case PIPE_BLENDFACTOR_SRC1_COLOR:
-   case PIPE_BLENDFACTOR_SRC1_ALPHA:
-      /* TODO */
-      assert(0);
-      return bld->base.zero;
-   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-      if(!bld->inv_src)
-         bld->inv_src = lp_build_comp(&bld->base, bld->src);
-      return bld->inv_src;
-   case PIPE_BLENDFACTOR_INV_DST_COLOR:
-   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-      if(!bld->inv_dst)
-         bld->inv_dst = lp_build_comp(&bld->base, bld->dst);
-      return bld->inv_dst;
-   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-      if(!bld->inv_const)
-         bld->inv_const = lp_build_comp(&bld->base, bld->const_);
-      return bld->inv_const;
-   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
-   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
-      /* TODO */
-      assert(0);
-      return bld->base.zero;
-   default:
-      assert(0);
-      return bld->base.zero;
-   }
-}
-
-
-enum lp_build_blend_swizzle {
-   LP_BUILD_BLEND_SWIZZLE_RGBA = 0,
-   LP_BUILD_BLEND_SWIZZLE_AAAA = 1
-};
-
-
-/**
- * How should we shuffle the base factor.
- */
-static enum lp_build_blend_swizzle
-lp_build_blend_factor_swizzle(unsigned factor)
-{
-   switch (factor) {
-   case PIPE_BLENDFACTOR_ONE:
-   case PIPE_BLENDFACTOR_ZERO:
-   case PIPE_BLENDFACTOR_SRC_COLOR:
-   case PIPE_BLENDFACTOR_DST_COLOR:
-   case PIPE_BLENDFACTOR_CONST_COLOR:
-   case PIPE_BLENDFACTOR_SRC1_COLOR:
-   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-   case PIPE_BLENDFACTOR_INV_DST_COLOR:
-   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
-      return LP_BUILD_BLEND_SWIZZLE_RGBA;
-   case PIPE_BLENDFACTOR_SRC_ALPHA:
-   case PIPE_BLENDFACTOR_DST_ALPHA:
-   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-   case PIPE_BLENDFACTOR_SRC1_ALPHA:
-   case PIPE_BLENDFACTOR_CONST_ALPHA:
-   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
-      return LP_BUILD_BLEND_SWIZZLE_AAAA;
-   default:
-      assert(0);
-      return LP_BUILD_BLEND_SWIZZLE_RGBA;
-   }
-}
-
-
-static LLVMValueRef
-lp_build_blend_swizzle(struct lp_build_blend_aos_context *bld,
-                       LLVMValueRef rgb, 
-                       LLVMValueRef alpha, 
-                       enum lp_build_blend_swizzle rgb_swizzle,
-                       unsigned alpha_swizzle)
-{
-   if(rgb == alpha) {
-      if(rgb_swizzle == LP_BUILD_BLEND_SWIZZLE_RGBA)
-         return rgb;
-      if(rgb_swizzle == LP_BUILD_BLEND_SWIZZLE_AAAA)
-         return lp_build_broadcast_aos(&bld->base, rgb, alpha_swizzle);
-   }
-   else {
-      if(rgb_swizzle == LP_BUILD_BLEND_SWIZZLE_RGBA) {
-         boolean cond[4] = {0, 0, 0, 0};
-         cond[alpha_swizzle] = 1;
-         return lp_build_select_aos(&bld->base, alpha, rgb, cond);
-      }
-      if(rgb_swizzle == LP_BUILD_BLEND_SWIZZLE_AAAA) {
-         unsigned char swizzle[4];
-         swizzle[0] = alpha_swizzle;
-         swizzle[1] = alpha_swizzle;
-         swizzle[2] = alpha_swizzle;
-         swizzle[3] = alpha_swizzle;
-         swizzle[alpha_swizzle] += 4;
-         return lp_build_swizzle2_aos(&bld->base, rgb, alpha, swizzle);
-      }
-   }
-   assert(0);
-   return bld->base.undef;
-}
-
-
-/**
- * @sa http://www.opengl.org/sdk/docs/man/xhtml/glBlendFuncSeparate.xml
- */
-static LLVMValueRef
-lp_build_blend_factor(struct lp_build_blend_aos_context *bld,
-                      LLVMValueRef factor1,
-                      unsigned rgb_factor,
-                      unsigned alpha_factor,
-                      unsigned alpha_swizzle)
-{
-   LLVMValueRef rgb_factor_;
-   LLVMValueRef alpha_factor_;
-   LLVMValueRef factor2;
-   enum lp_build_blend_swizzle rgb_swizzle;
-
-   rgb_factor_   = lp_build_blend_factor_unswizzled(bld, rgb_factor,   FALSE);
-   alpha_factor_ = lp_build_blend_factor_unswizzled(bld, alpha_factor, TRUE);
-
-   rgb_swizzle = lp_build_blend_factor_swizzle(rgb_factor);
-
-   factor2 = lp_build_blend_swizzle(bld, rgb_factor_, alpha_factor_, rgb_swizzle, alpha_swizzle);
-
-   return lp_build_mul(&bld->base, factor1, factor2);
-}
-
-
-boolean
-lp_build_blend_func_commutative(unsigned func)
-{
-   switch (func) {
-   case PIPE_BLEND_ADD:
-   case PIPE_BLEND_MIN:
-   case PIPE_BLEND_MAX:
-      return TRUE;
-   case PIPE_BLEND_SUBTRACT:
-   case PIPE_BLEND_REVERSE_SUBTRACT:
-      return FALSE;
-   default:
-      assert(0);
-      return TRUE;
-   }
-}
-
-
-boolean
-lp_build_blend_func_reverse(unsigned rgb_func, unsigned alpha_func)
-{
-   if(rgb_func == alpha_func)
-      return FALSE;
-   if(rgb_func == PIPE_BLEND_SUBTRACT && alpha_func == PIPE_BLEND_REVERSE_SUBTRACT)
-      return TRUE;
-   if(rgb_func == PIPE_BLEND_REVERSE_SUBTRACT && alpha_func == PIPE_BLEND_SUBTRACT)
-      return TRUE;
-   return FALSE;
-}
-
-
-/**
- * @sa http://www.opengl.org/sdk/docs/man/xhtml/glBlendEquationSeparate.xml
- */
-LLVMValueRef
-lp_build_blend_func(struct lp_build_context *bld,
-                    unsigned func,
-                    LLVMValueRef term1, 
-                    LLVMValueRef term2)
-{
-   switch (func) {
-   case PIPE_BLEND_ADD:
-      return lp_build_add(bld, term1, term2);
-      break;
-   case PIPE_BLEND_SUBTRACT:
-      return lp_build_sub(bld, term1, term2);
-   case PIPE_BLEND_REVERSE_SUBTRACT:
-      return lp_build_sub(bld, term2, term1);
-   case PIPE_BLEND_MIN:
-      return lp_build_min(bld, term1, term2);
-   case PIPE_BLEND_MAX:
-      return lp_build_max(bld, term1, term2);
-   default:
-      assert(0);
-      return bld->zero;
-   }
-}
-
-
-LLVMValueRef
-lp_build_blend_aos(LLVMBuilderRef builder,
-                   const struct pipe_blend_state *blend,
-                   struct lp_type type,
-                   LLVMValueRef src,
-                   LLVMValueRef dst,
-                   LLVMValueRef const_,
-                   unsigned alpha_swizzle)
-{
-   struct lp_build_blend_aos_context bld;
-   LLVMValueRef src_term;
-   LLVMValueRef dst_term;
-
-   /* FIXME */
-   assert(blend->independent_blend_enable == 0);
-   assert(blend->rt[0].colormask == 0xf);
-
-   if(!blend->rt[0].blend_enable)
-      return src;
-
-   /* It makes no sense to blend unless values are normalized */
-   assert(type.norm);
-
-   /* Setup build context */
-   memset(&bld, 0, sizeof bld);
-   lp_build_context_init(&bld.base, builder, type);
-   bld.src = src;
-   bld.dst = dst;
-   bld.const_ = const_;
-
-   /* TODO: There are still a few optimization opportunities here. For certain
-    * combinations it is possible to reorder the operations and therefore saving
-    * some instructions. */
-
-   src_term = lp_build_blend_factor(&bld, src, blend->rt[0].rgb_src_factor,
-                                    blend->rt[0].alpha_src_factor, alpha_swizzle);
-   dst_term = lp_build_blend_factor(&bld, dst, blend->rt[0].rgb_dst_factor,
-                                    blend->rt[0].alpha_dst_factor, alpha_swizzle);
-
-   lp_build_name(src_term, "src_term");
-   lp_build_name(dst_term, "dst_term");
-
-   if(blend->rt[0].rgb_func == blend->rt[0].alpha_func) {
-      return lp_build_blend_func(&bld.base, blend->rt[0].rgb_func, src_term, dst_term);
-   }
-   else {
-      /* Seperate RGB / A functions */
-
-      LLVMValueRef rgb;
-      LLVMValueRef alpha;
-
-      rgb   = lp_build_blend_func(&bld.base, blend->rt[0].rgb_func,   src_term, dst_term);
-      alpha = lp_build_blend_func(&bld.base, blend->rt[0].alpha_func, src_term, dst_term);
-
-      return lp_build_blend_swizzle(&bld, rgb, alpha, LP_BUILD_BLEND_SWIZZLE_RGBA, alpha_swizzle);
-   }
-}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_blend_logicop.c b/src/gallium/auxiliary/gallivm/lp_bld_blend_logicop.c
deleted file mode 100644
index 1eac0a5c89..0000000000
--- a/src/gallium/auxiliary/gallivm/lp_bld_blend_logicop.c
+++ /dev/null
@@ -1,109 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-/**
- * @file
- * Blend LLVM IR generation -- logic ops.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#include "pipe/p_state.h"
-#include "util/u_debug.h"
-
-#include "lp_bld_blend.h"
-
-
-LLVMValueRef
-lp_build_logicop(LLVMBuilderRef builder,
-                 unsigned logicop_func,
-                 LLVMValueRef src,
-                 LLVMValueRef dst)
-{
-   LLVMTypeRef type;
-   LLVMValueRef res;
-
-   type = LLVMTypeOf(src);
-
-   switch (logicop_func) {
-   case PIPE_LOGICOP_CLEAR:
-      res = LLVMConstNull(type);
-      break;
-   case PIPE_LOGICOP_NOR:
-      res = LLVMBuildNot(builder, LLVMBuildOr(builder, src, dst, ""), "");
-      break;
-   case PIPE_LOGICOP_AND_INVERTED:
-      res = LLVMBuildAnd(builder, LLVMBuildNot(builder, src, ""), dst, "");
-      break;
-   case PIPE_LOGICOP_COPY_INVERTED:
-      res = LLVMBuildNot(builder, src, "");
-      break;
-   case PIPE_LOGICOP_AND_REVERSE:
-      res = LLVMBuildAnd(builder, src, LLVMBuildNot(builder, dst, ""), "");
-      break;
-   case PIPE_LOGICOP_INVERT:
-      res = LLVMBuildNot(builder, dst, "");
-      break;
-   case PIPE_LOGICOP_XOR:
-      res = LLVMBuildXor(builder, src, dst, "");
-      break;
-   case PIPE_LOGICOP_NAND:
-      res = LLVMBuildNot(builder, LLVMBuildAnd(builder, src, dst, ""), "");
-      break;
-   case PIPE_LOGICOP_AND:
-      res = LLVMBuildAnd(builder, src, dst, "");
-      break;
-   case PIPE_LOGICOP_EQUIV:
-      res = LLVMBuildNot(builder, LLVMBuildXor(builder, src, dst, ""), "");
-      break;
-   case PIPE_LOGICOP_NOOP:
-      res = dst;
-      break;
-   case PIPE_LOGICOP_OR_INVERTED:
-      res = LLVMBuildOr(builder, LLVMBuildNot(builder, src, ""), dst, "");
-      break;
-   case PIPE_LOGICOP_COPY:
-      res = src;
-      break;
-   case PIPE_LOGICOP_OR_REVERSE:
-      res = LLVMBuildOr(builder, src, LLVMBuildNot(builder, dst, ""), "");
-      break;
-   case PIPE_LOGICOP_OR:
-      res = LLVMBuildOr(builder, src, dst, "");
-      break;
-   case PIPE_LOGICOP_SET:
-      res = LLVMConstAllOnes(type);
-      break;
-   default:
-      assert(0);
-      res = src;
-   }
-
-   return res;
-}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_blend_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_blend_soa.c
deleted file mode 100644
index 6d5a45db7a..0000000000
--- a/src/gallium/auxiliary/gallivm/lp_bld_blend_soa.c
+++ /dev/null
@@ -1,298 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-/**
- * @file
- * Blend LLVM IR generation -- SoA layout.
- *
- * Blending in SoA is much faster than AoS, especially when separate rgb/alpha
- * factors/functions are used, since no channel masking/shuffling is necessary
- * and we can achieve the full throughput of the SIMD operations. Furthermore
- * the fragment shader output is also in SoA, so it fits nicely with the rest of
- * the fragment pipeline.
- *
- * The drawback is that to be displayed the color buffer needs to be in AoS
- * layout, so we need to tile/untile the color buffer before/after rendering.
- * A color buffer like
- *
- *  R11 G11 B11 A11 R12 G12 B12 A12  R13 G13 B13 A13 R14 G14 B14 A14  ...
- *  R21 G21 B21 A21 R22 G22 B22 A22  R23 G23 B23 A23 R24 G24 B24 A24  ...
- *
- *  R31 G31 B31 A31 R32 G32 B32 A32  R33 G33 B33 A33 R34 G34 B34 A34  ...
- *  R41 G41 B41 A41 R42 G42 B42 A42  R43 G43 B43 A43 R44 G44 B44 A44  ...
- *
- *  ... ... ... ... ... ... ... ...  ... ... ... ... ... ... ... ...  ...
- *
- * will actually be stored in memory as
- *
- *  R11 R12 R21 R22 R13 R14 R23 R24 ... G11 G12 G21 G22 G13 G14 G23 G24 ... B11 B12 B21 B22 B13 B14 B23 B24 ... A11 A12 A21 A22 A13 A14 A23 A24 ...
- *  R31 R32 R41 R42 R33 R34 R43 R44 ... G31 G32 G41 G42 G33 G34 G43 G44 ... B31 B32 B41 B42 B33 B34 B43 B44 ... A31 A32 A41 A42 A33 A34 A43 A44 ...
- *  ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
- *
- * NOTE: Run lp_blend_test after any change to this file.
- *
- * You can also run lp_blend_test to obtain AoS vs SoA benchmarks. Invoking it
- * as:
- *
- *  lp_blend_test -o blend.tsv
- *
- * will generate a tab-seperated-file with the test results and performance
- * measurements.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#include "pipe/p_state.h"
-#include "util/u_debug.h"
-
-#include "lp_bld_type.h"
-#include "lp_bld_arit.h"
-#include "lp_bld_blend.h"
-
-
-/**
- * We may the same values several times, so we keep them here to avoid
- * recomputing them. Also reusing the values allows us to do simplifications
- * that LLVM optimization passes wouldn't normally be able to do.
- */
-struct lp_build_blend_soa_context
-{
-   struct lp_build_context base;
-
-   LLVMValueRef src[4];
-   LLVMValueRef dst[4];
-   LLVMValueRef con[4];
-
-   LLVMValueRef inv_src[4];
-   LLVMValueRef inv_dst[4];
-   LLVMValueRef inv_con[4];
-
-   LLVMValueRef src_alpha_saturate;
-
-   /**
-    * We store all factors in a table in order to eliminate redundant
-    * multiplications later.
-    */
-   LLVMValueRef factor[2][2][4];
-
-   /**
-    * Table with all terms.
-    */
-   LLVMValueRef term[2][4];
-};
-
-
-static LLVMValueRef
-lp_build_blend_soa_factor(struct lp_build_blend_soa_context *bld,
-                          unsigned factor, unsigned i)
-{
-   /*
-    * Compute src/first term RGB
-    */
-   switch (factor) {
-   case PIPE_BLENDFACTOR_ONE:
-      return bld->base.one;
-   case PIPE_BLENDFACTOR_SRC_COLOR:
-      return bld->src[i];
-   case PIPE_BLENDFACTOR_SRC_ALPHA:
-      return bld->src[3];
-   case PIPE_BLENDFACTOR_DST_COLOR:
-      return bld->dst[i];
-   case PIPE_BLENDFACTOR_DST_ALPHA:
-      return bld->dst[3];
-   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-      if(i == 3)
-         return bld->base.one;
-      else {
-         if(!bld->inv_dst[3])
-            bld->inv_dst[3] = lp_build_comp(&bld->base, bld->dst[3]);
-         if(!bld->src_alpha_saturate)
-            bld->src_alpha_saturate = lp_build_min(&bld->base, bld->src[3], bld->inv_dst[3]);
-         return bld->src_alpha_saturate;
-      }
-   case PIPE_BLENDFACTOR_CONST_COLOR:
-      return bld->con[i];
-   case PIPE_BLENDFACTOR_CONST_ALPHA:
-      return bld->con[3];
-   case PIPE_BLENDFACTOR_SRC1_COLOR:
-      /* TODO */
-      assert(0);
-      return bld->base.zero;
-   case PIPE_BLENDFACTOR_SRC1_ALPHA:
-      /* TODO */
-      assert(0);
-      return bld->base.zero;
-   case PIPE_BLENDFACTOR_ZERO:
-      return bld->base.zero;
-   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-      if(!bld->inv_src[i])
-         bld->inv_src[i] = lp_build_comp(&bld->base, bld->src[i]);
-      return bld->inv_src[i];
-   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-      if(!bld->inv_src[3])
-         bld->inv_src[3] = lp_build_comp(&bld->base, bld->src[3]);
-      return bld->inv_src[3];
-   case PIPE_BLENDFACTOR_INV_DST_COLOR:
-      if(!bld->inv_dst[i])
-         bld->inv_dst[i] = lp_build_comp(&bld->base, bld->dst[i]);
-      return bld->inv_dst[i];
-   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-      if(!bld->inv_dst[3])
-         bld->inv_dst[3] = lp_build_comp(&bld->base, bld->dst[3]);
-      return bld->inv_dst[3];
-   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-      if(!bld->inv_con[i])
-         bld->inv_con[i] = lp_build_comp(&bld->base, bld->con[i]);
-      return bld->inv_con[i];
-   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-      if(!bld->inv_con[3])
-         bld->inv_con[3] = lp_build_comp(&bld->base, bld->con[3]);
-      return bld->inv_con[3];
-   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
-      /* TODO */
-      assert(0);
-      return bld->base.zero;
-   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
-      /* TODO */
-      assert(0);
-      return bld->base.zero;
-   default:
-      assert(0);
-      return bld->base.zero;
-   }
-}
-
-
-/**
- * Generate blend code in SOA mode.
- * \param src  src/fragment color
- * \param dst  dst/framebuffer color
- * \param con  constant blend color
- * \param res  the result/output
- */
-void
-lp_build_blend_soa(LLVMBuilderRef builder,
-                   const struct pipe_blend_state *blend,
-                   struct lp_type type,
-                   LLVMValueRef src[4],
-                   LLVMValueRef dst[4],
-                   LLVMValueRef con[4],
-                   LLVMValueRef res[4])
-{
-   struct lp_build_blend_soa_context bld;
-   unsigned i, j, k;
-
-   /* Setup build context */
-   memset(&bld, 0, sizeof bld);
-   lp_build_context_init(&bld.base, builder, type);
-   for (i = 0; i < 4; ++i) {
-      bld.src[i] = src[i];
-      bld.dst[i] = dst[i];
-      bld.con[i] = con[i];
-   }
-
-   for (i = 0; i < 4; ++i) {
-      if (blend->rt[0].colormask & (1 << i)) {
-         if (blend->logicop_enable) {
-            if(!type.floating) {
-               res[i] = lp_build_logicop(builder, blend->logicop_func, src[i], dst[i]);
-            }
-            else
-               res[i] = dst[i];
-         }
-         else if (blend->rt[0].blend_enable) {
-            unsigned src_factor = i < 3 ? blend->rt[0].rgb_src_factor : blend->rt[0].alpha_src_factor;
-            unsigned dst_factor = i < 3 ? blend->rt[0].rgb_dst_factor : blend->rt[0].alpha_dst_factor;
-            unsigned func = i < 3 ? blend->rt[0].rgb_func : blend->rt[0].alpha_func;
-            boolean func_commutative = lp_build_blend_func_commutative(func);
-
-            /* It makes no sense to blend unless values are normalized */
-            assert(type.norm);
-
-            /*
-             * Compute src/dst factors.
-             */
-
-            bld.factor[0][0][i] = src[i];
-            bld.factor[0][1][i] = lp_build_blend_soa_factor(&bld, src_factor, i);
-            bld.factor[1][0][i] = dst[i];
-            bld.factor[1][1][i] = lp_build_blend_soa_factor(&bld, dst_factor, i);
-
-            /*
-             * Compute src/dst terms
-             */
-
-            for(k = 0; k < 2; ++k) {
-               /* See if this multiplication has been previously computed */
-               for(j = 0; j < i; ++j) {
-                  if((bld.factor[k][0][j] == bld.factor[k][0][i] &&
-                      bld.factor[k][1][j] == bld.factor[k][1][i]) ||
-                     (bld.factor[k][0][j] == bld.factor[k][1][i] &&
-                      bld.factor[k][1][j] == bld.factor[k][0][i]))
-                     break;
-               }
-
-               if(j < i)
-                  bld.term[k][i] = bld.term[k][j];
-               else
-                  bld.term[k][i] = lp_build_mul(&bld.base, bld.factor[k][0][i], bld.factor[k][1][i]);
-            }
-
-            /*
-             * Combine terms
-             */
-
-            /* See if this function has been previously applied */
-            for(j = 0; j < i; ++j) {
-               unsigned prev_func = j < 3 ? blend->rt[0].rgb_func : blend->rt[0].alpha_func;
-               unsigned func_reverse = lp_build_blend_func_reverse(func, prev_func);
-
-               if((!func_reverse &&
-                   bld.term[0][j] == bld.term[0][i] &&
-                   bld.term[1][j] == bld.term[1][i]) ||
-                  ((func_commutative || func_reverse) &&
-                   bld.term[0][j] == bld.term[1][i] &&
-                   bld.term[1][j] == bld.term[0][i]))
-                  break;
-            }
-
-            if(j < i)
-               res[i] = res[j];
-            else
-               res[i] = lp_build_blend_func(&bld.base, func, bld.term[0][i], bld.term[1][i]);
-         }
-         else {
-            res[i] = src[i];
-         }
-      }
-      else {
-         res[i] = dst[i];
-      }
-   }
-}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.c b/src/gallium/auxiliary/gallivm/lp_bld_const.c
index c8eaa8c394..57843e9a60 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_const.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_const.c
@@ -221,8 +221,16 @@ lp_build_undef(struct lp_type type)
 LLVMValueRef
 lp_build_zero(struct lp_type type)
 {
-   LLVMTypeRef vec_type = lp_build_vec_type(type);
-   return LLVMConstNull(vec_type);
+   if (type.length == 1) {
+      if (type.floating)
+         return LLVMConstReal(LLVMFloatType(), 0.0);
+      else
+         return LLVMConstInt(LLVMIntType(type.width), 0, 0);
+   }
+   else {
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      return LLVMConstNull(vec_type);
+   }
 }
                
 
@@ -255,7 +263,7 @@ lp_build_one(struct lp_type type)
       if(type.sign)
          /* TODO: Unfortunately this caused "Tried to create a shift operation
           * on a non-integer type!" */
-         vec = LLVMConstLShr(vec, lp_build_int_const_scalar(type, 1));
+         vec = LLVMConstLShr(vec, lp_build_const_int_vec(type, 1));
 #endif
 
       return vec;
@@ -264,13 +272,19 @@ lp_build_one(struct lp_type type)
    for(i = 1; i < type.length; ++i)
       elems[i] = elems[0];
 
-   return LLVMConstVector(elems, type.length);
+   if (type.length == 1)
+      return elems[0];
+   else
+      return LLVMConstVector(elems, type.length);
 }
                
 
+/**
+ * Build constant-valued vector from a scalar value.
+ */
 LLVMValueRef
-lp_build_const_scalar(struct lp_type type,
-                      double val)
+lp_build_const_vec(struct lp_type type,
+                   double val)
 {
    LLVMTypeRef elem_type = lp_build_elem_type(type);
    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
@@ -295,7 +309,7 @@ lp_build_const_scalar(struct lp_type type,
 
 
 LLVMValueRef
-lp_build_int_const_scalar(struct lp_type type,
+lp_build_const_int_vec(struct lp_type type,
                           long long val)
 {
    LLVMTypeRef elem_type = lp_build_int_elem_type(type);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.h b/src/gallium/auxiliary/gallivm/lp_bld_const.h
index cb8e1c7b00..9ca2f0664e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_const.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_const.h
@@ -37,9 +37,9 @@
 #define LP_BLD_CONST_H
 
 
-#include <llvm-c/Core.h>  
+#include "pipe/p_compiler.h"
+#include "gallivm/lp_bld.h"
 
-#include <pipe/p_compiler.h>
 
 
 struct lp_type;
@@ -85,13 +85,11 @@ lp_build_one(struct lp_type type);
 
 
 LLVMValueRef
-lp_build_const_scalar(struct lp_type type,
-                      double val);
+lp_build_const_vec(struct lp_type type, double val);
 
 
 LLVMValueRef
-lp_build_int_const_scalar(struct lp_type type,
-                          long long val);
+lp_build_const_int_vec(struct lp_type type, long long val);
 
 
 LLVMValueRef
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index f77cf78721..3f7f2ebde9 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -114,13 +114,13 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
    scale = (double)mask/ubound;
    bias = (double)((unsigned long long)1 << (mantissa - n));
 
-   res = LLVMBuildMul(builder, src, lp_build_const_scalar(src_type, scale), "");
-   res = LLVMBuildAdd(builder, res, lp_build_const_scalar(src_type, bias), "");
+   res = LLVMBuildMul(builder, src, lp_build_const_vec(src_type, scale), "");
+   res = LLVMBuildAdd(builder, res, lp_build_const_vec(src_type, bias), "");
    res = LLVMBuildBitCast(builder, res, int_vec_type, "");
 
    if(dst_width > n) {
       int shift = dst_width - n;
-      res = LLVMBuildShl(builder, res, lp_build_int_const_scalar(src_type, shift), "");
+      res = LLVMBuildShl(builder, res, lp_build_const_int_vec(src_type, shift), "");
 
       /* TODO: Fill in the empty lower bits for additional precision? */
       /* YES: this fixes progs/trivial/tri-z-eq.c.
@@ -130,21 +130,21 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
 #if 0
       {
          LLVMValueRef msb;
-         msb = LLVMBuildLShr(builder, res, lp_build_int_const_scalar(src_type, dst_width - 1), "");
-         msb = LLVMBuildShl(builder, msb, lp_build_int_const_scalar(src_type, shift), "");
-         msb = LLVMBuildSub(builder, msb, lp_build_int_const_scalar(src_type, 1), "");
+         msb = LLVMBuildLShr(builder, res, lp_build_const_int_vec(src_type, dst_width - 1), "");
+         msb = LLVMBuildShl(builder, msb, lp_build_const_int_vec(src_type, shift), "");
+         msb = LLVMBuildSub(builder, msb, lp_build_const_int_vec(src_type, 1), "");
          res = LLVMBuildOr(builder, res, msb, "");
       }
 #elif 0
       while(shift > 0) {
-         res = LLVMBuildOr(builder, res, LLVMBuildLShr(builder, res, lp_build_int_const_scalar(src_type, n), ""), "");
+         res = LLVMBuildOr(builder, res, LLVMBuildLShr(builder, res, lp_build_const_int_vec(src_type, n), ""), "");
          shift -= n;
          n *= 2;
       }
 #endif
    }
    else
-      res = LLVMBuildAnd(builder, res, lp_build_int_const_scalar(src_type, mask), "");
+      res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(src_type, mask), "");
 
    return res;
 }
@@ -183,10 +183,10 @@ lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
 
    if(src_width > mantissa) {
       int shift = src_width - mantissa;
-      res = LLVMBuildLShr(builder, res, lp_build_int_const_scalar(dst_type, shift), "");
+      res = LLVMBuildLShr(builder, res, lp_build_const_int_vec(dst_type, shift), "");
    }
 
-   bias_ = lp_build_const_scalar(dst_type, bias);
+   bias_ = lp_build_const_vec(dst_type, bias);
 
    res = LLVMBuildOr(builder,
                      res,
@@ -195,7 +195,7 @@ lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
    res = LLVMBuildBitCast(builder, res, vec_type, "");
 
    res = LLVMBuildSub(builder, res, bias_, "");
-   res = LLVMBuildMul(builder, res, lp_build_const_scalar(dst_type, scale), "");
+   res = LLVMBuildMul(builder, res, lp_build_const_vec(dst_type, scale), "");
 
    return res;
 }
@@ -251,7 +251,7 @@ lp_build_conv(LLVMBuilderRef builder,
          if(dst_min == 0.0)
             thres = bld.zero;
          else
-            thres = lp_build_const_scalar(src_type, dst_min);
+            thres = lp_build_const_vec(src_type, dst_min);
          for(i = 0; i < num_tmps; ++i)
             tmp[i] = lp_build_max(&bld, tmp[i], thres);
       }
@@ -260,7 +260,7 @@ lp_build_conv(LLVMBuilderRef builder,
          if(dst_max == 1.0)
             thres = bld.one;
          else
-            thres = lp_build_const_scalar(src_type, dst_max);
+            thres = lp_build_const_vec(src_type, dst_max);
          for(i = 0; i < num_tmps; ++i)
             tmp[i] = lp_build_min(&bld, tmp[i], thres);
       }
@@ -288,7 +288,7 @@ lp_build_conv(LLVMBuilderRef builder,
          LLVMTypeRef tmp_vec_type;
 
          if (dst_scale != 1.0) {
-            LLVMValueRef scale = lp_build_const_scalar(tmp_type, dst_scale);
+            LLVMValueRef scale = lp_build_const_vec(tmp_type, dst_scale);
             for(i = 0; i < num_tmps; ++i)
                tmp[i] = LLVMBuildMul(builder, tmp[i], scale, "");
          }
@@ -315,7 +315,7 @@ lp_build_conv(LLVMBuilderRef builder,
 
       /* FIXME: compensate different offsets too */
       if(src_shift > dst_shift) {
-         LLVMValueRef shift = lp_build_int_const_scalar(tmp_type, src_shift - dst_shift);
+         LLVMValueRef shift = lp_build_const_int_vec(tmp_type, src_shift - dst_shift);
          for(i = 0; i < num_tmps; ++i)
             if(src_type.sign)
                tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, "");
@@ -388,7 +388,7 @@ lp_build_conv(LLVMBuilderRef builder,
           }
 
           if (src_scale != 1.0) {
-             LLVMValueRef scale = lp_build_const_scalar(tmp_type, 1.0/src_scale);
+             LLVMValueRef scale = lp_build_const_vec(tmp_type, 1.0/src_scale);
              for(i = 0; i < num_tmps; ++i)
                 tmp[i] = LLVMBuildMul(builder, tmp[i], scale, "");
           }
@@ -400,7 +400,7 @@ lp_build_conv(LLVMBuilderRef builder,
 
        /* FIXME: compensate different offsets too */
        if(src_shift < dst_shift) {
-          LLVMValueRef shift = lp_build_int_const_scalar(tmp_type, dst_shift - src_shift);
+          LLVMValueRef shift = lp_build_const_int_vec(tmp_type, dst_shift - src_shift);
           for(i = 0; i < num_tmps; ++i)
              tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
        }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.h b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
index 948e68fae4..628831c3ad 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
@@ -37,7 +37,7 @@
 #define LP_BLD_CONV_H
 
 
-#include <llvm-c/Core.h>  
+#include "gallivm/lp_bld.h"
 
 
 struct lp_type;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.h b/src/gallium/auxiliary/gallivm/lp_bld_debug.h
index 583e6132b4..7b010cbdb0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.h
@@ -30,7 +30,7 @@
 #define LP_BLD_DEBUG_H
 
 
-#include <llvm-c/Core.h>
+#include "gallivm/lp_bld.h"
 
 #include "pipe/p_compiler.h"
 #include "util/u_string.h"
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_depth.c b/src/gallium/auxiliary/gallivm/lp_bld_depth.c
deleted file mode 100644
index f08f8eb6d8..0000000000
--- a/src/gallium/auxiliary/gallivm/lp_bld_depth.c
+++ /dev/null
@@ -1,213 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * Depth/stencil testing to LLVM IR translation.
- *
- * To be done accurately/efficiently the depth/stencil test must be done with
- * the same type/format of the depth/stencil buffer, which implies massaging
- * the incoming depths to fit into place. Using a more straightforward
- * type/format for depth/stencil values internally and only convert when
- * flushing would avoid this, but it would most likely result in depth fighting
- * artifacts.
- *
- * We are free to use a different pixel layout though. Since our basic
- * processing unit is a quad (2x2 pixel block) we store the depth/stencil
- * values tiled, a quad at time. That is, a depth buffer containing 
- *
- *  Z11 Z12 Z13 Z14 ...
- *  Z21 Z22 Z23 Z24 ...
- *  Z31 Z32 Z33 Z34 ...
- *  Z41 Z42 Z43 Z44 ...
- *  ... ... ... ... ...
- *
- * will actually be stored in memory as
- *
- *  Z11 Z12 Z21 Z22 Z13 Z14 Z23 Z24 ...
- *  Z31 Z32 Z41 Z42 Z33 Z34 Z43 Z44 ...
- *  ... ... ... ... ... ... ... ... ...
- *
- * FIXME: Code generate stencil test
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-#include "pipe/p_state.h"
-#include "util/u_format.h"
-
-#include "lp_bld_type.h"
-#include "lp_bld_const.h"
-#include "lp_bld_logic.h"
-#include "lp_bld_flow.h"
-#include "lp_bld_debug.h"
-#include "lp_bld_depth.h"
-
-
-/**
- * Return a type appropriate for depth/stencil testing.
- */
-struct lp_type
-lp_depth_type(const struct util_format_description *format_desc,
-              unsigned length)
-{
-   struct lp_type type;
-   unsigned swizzle;
-
-   assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
-   assert(format_desc->block.width == 1);
-   assert(format_desc->block.height == 1);
-
-   swizzle = format_desc->swizzle[0];
-   assert(swizzle < 4);
-
-   memset(&type, 0, sizeof type);
-   type.width = format_desc->block.bits;
-
-   if(format_desc->channel[swizzle].type == UTIL_FORMAT_TYPE_FLOAT) {
-      type.floating = TRUE;
-      assert(swizzle == 0);
-      assert(format_desc->channel[swizzle].size == format_desc->block.bits);
-   }
-   else if(format_desc->channel[swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED) {
-      assert(format_desc->block.bits <= 32);
-      if(format_desc->channel[swizzle].normalized)
-         type.norm = TRUE;
-   }
-   else
-      assert(0);
-
-   assert(type.width <= length);
-   type.length = length / type.width;
-
-   return type;
-}
-
-
-/**
- * Depth test.
- */
-void
-lp_build_depth_test(LLVMBuilderRef builder,
-                    const struct pipe_depth_state *state,
-                    struct lp_type type,
-                    const struct util_format_description *format_desc,
-                    struct lp_build_mask_context *mask,
-                    LLVMValueRef src,
-                    LLVMValueRef dst_ptr)
-{
-   struct lp_build_context bld;
-   unsigned z_swizzle;
-   LLVMValueRef dst;
-   LLVMValueRef z_bitmask = NULL;
-   LLVMValueRef test;
-
-   if(!state->enabled)
-      return;
-
-   assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
-   assert(format_desc->block.width == 1);
-   assert(format_desc->block.height == 1);
-
-   z_swizzle = format_desc->swizzle[0];
-   if(z_swizzle == UTIL_FORMAT_SWIZZLE_NONE)
-      return;
-
-   /* Sanity checking */
-   assert(z_swizzle < 4);
-   assert(format_desc->block.bits == type.width);
-   if(type.floating) {
-      assert(z_swizzle == 0);
-      assert(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_FLOAT);
-      assert(format_desc->channel[z_swizzle].size == format_desc->block.bits);
-   }
-   else {
-      assert(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED);
-      assert(format_desc->channel[z_swizzle].normalized);
-      assert(!type.fixed);
-      assert(!type.sign);
-      assert(type.norm);
-   }
-
-   /* Setup build context */
-   lp_build_context_init(&bld, builder, type);
-
-   dst = LLVMBuildLoad(builder, dst_ptr, "");
-
-   lp_build_name(dst, "zsbuf");
-
-   /* Align the source depth bits with the destination's, and mask out any
-    * stencil or padding bits from both */
-   if(format_desc->channel[z_swizzle].size == format_desc->block.bits) {
-      assert(z_swizzle == 0);
-      /* nothing to do */
-   }
-   else {
-      unsigned padding_left;
-      unsigned padding_right;
-      unsigned chan;
-
-      assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
-      assert(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED);
-      assert(format_desc->channel[z_swizzle].size <= format_desc->block.bits);
-      assert(format_desc->channel[z_swizzle].normalized);
-
-      padding_right = 0;
-      for(chan = 0; chan < z_swizzle; ++chan)
-         padding_right += format_desc->channel[chan].size;
-      padding_left = format_desc->block.bits -
-                     (padding_right + format_desc->channel[z_swizzle].size);
-
-      if(padding_left || padding_right) {
-         const unsigned long long mask_left = ((unsigned long long)1 << (format_desc->block.bits - padding_left)) - 1;
-         const unsigned long long mask_right = ((unsigned long long)1 << (padding_right)) - 1;
-         z_bitmask = lp_build_int_const_scalar(type, mask_left ^ mask_right);
-      }
-
-      if(padding_left)
-         src = LLVMBuildLShr(builder, src, lp_build_int_const_scalar(type, padding_left), "");
-      if(padding_right)
-         src = LLVMBuildAnd(builder, src, z_bitmask, "");
-      if(padding_left || padding_right)
-         dst = LLVMBuildAnd(builder, dst, z_bitmask, "");
-   }
-
-   lp_build_name(dst, "zsbuf.z");
-
-   test = lp_build_cmp(&bld, state->func, src, dst);
-   lp_build_mask_update(mask, test);
-
-   if(state->writemask) {
-      if(z_bitmask)
-         z_bitmask = LLVMBuildAnd(builder, mask->value, z_bitmask, "");
-      else
-         z_bitmask = mask->value;
-
-      dst = lp_build_select(&bld, z_bitmask, src, dst);
-      LLVMBuildStore(builder, dst, dst_ptr);
-   }
-}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.c b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
index bc83138908..e60ab4f6ba 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
@@ -308,7 +308,7 @@ lp_build_flow_scope_end(struct lp_build_flow_context *flow)
  * Note: this function has no dependencies on the flow code and could
  * be used elsewhere.
  */
-static LLVMBasicBlockRef
+LLVMBasicBlockRef
 lp_build_insert_new_block(LLVMBuilderRef builder, const char *name)
 {
    LLVMBasicBlockRef current_block;
@@ -570,6 +570,35 @@ lp_build_loop_end(LLVMBuilderRef builder,
    LLVMPositionBuilderAtEnd(builder, after_block);
 }
 
+void
+lp_build_loop_end_cond(LLVMBuilderRef builder,
+                       LLVMValueRef end,
+                       LLVMValueRef step,
+                       int llvm_cond,
+                       struct lp_build_loop_state *state)
+{
+   LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
+   LLVMValueRef function = LLVMGetBasicBlockParent(block);
+   LLVMValueRef next;
+   LLVMValueRef cond;
+   LLVMBasicBlockRef after_block;
+
+   if (!step)
+      step = LLVMConstInt(LLVMTypeOf(end), 1, 0);
+
+   next = LLVMBuildAdd(builder, state->counter, step, "");
+
+   cond = LLVMBuildICmp(builder, llvm_cond, next, end, "");
+
+   after_block = LLVMAppendBasicBlock(function, "");
+
+   LLVMBuildCondBr(builder, cond, after_block, state->block);
+
+   LLVMAddIncoming(state->counter, &next, &block, 1);
+
+   LLVMPositionBuilderAtEnd(builder, after_block);
+}
+
 
 
 /*
@@ -648,7 +677,9 @@ lp_build_if(struct lp_build_if_state *ctx,
       ifthen->phi[i] = LLVMBuildPhi(builder, LLVMTypeOf(*flow->variables[i]), "");
 
       /* add add the initial value of the var from the entry block */
-      LLVMAddIncoming(ifthen->phi[i], flow->variables[i], &ifthen->entry_block, 1);
+      if (!LLVMIsUndef(*flow->variables[i]))
+         LLVMAddIncoming(ifthen->phi[i], flow->variables[i],
+                         &ifthen->entry_block, 1);
    }
 
    /* create/insert true_block before merge_block */
@@ -695,18 +726,21 @@ lp_build_endif(struct lp_build_if_state *ctx)
 {
    struct lp_build_flow_context *flow = ctx->flow;
    struct lp_build_flow_if *ifthen;
+   LLVMBasicBlockRef curBlock = LLVMGetInsertBlock(ctx->builder);
    unsigned i;
 
    ifthen = &lp_build_flow_pop(flow, LP_BUILD_FLOW_IF)->ifthen;
    assert(ifthen);
 
+   /* Insert branch to the merge block from current block */
+   LLVMBuildBr(ctx->builder, ifthen->merge_block);
+
    if (ifthen->false_block) {
       LLVMPositionBuilderAtEnd(ctx->builder, ifthen->merge_block);
       /* for each variable, update the Phi node with a (variable, block) pair */
       for (i = 0; i < flow->num_variables; i++) {
          assert(*flow->variables[i]);
-         LLVMAddIncoming(ifthen->phi[i], flow->variables[i], &ifthen->false_block, 1);
-
+         LLVMAddIncoming(ifthen->phi[i], flow->variables[i], &curBlock, 1);
          /* replace the variable ref with the phi function */
          *flow->variables[i] = ifthen->phi[i];
       }
@@ -742,15 +776,18 @@ lp_build_endif(struct lp_build_if_state *ctx)
                       ifthen->true_block, ifthen->merge_block);
    }
 
-   /* Append an unconditional Br(anch) instruction on the true_block */
-   LLVMPositionBuilderAtEnd(ctx->builder, ifthen->true_block);
-   LLVMBuildBr(ctx->builder, ifthen->merge_block);
+   /* Insert branch from end of true_block to merge_block */
    if (ifthen->false_block) {
-      /* Append an unconditional Br(anch) instruction on the false_block */
-      LLVMPositionBuilderAtEnd(ctx->builder, ifthen->false_block);
+      /* Append an unconditional Br(anch) instruction on the true_block */
+      LLVMPositionBuilderAtEnd(ctx->builder, ifthen->true_block);
       LLVMBuildBr(ctx->builder, ifthen->merge_block);
    }
-
+   else {
+      /* No else clause.
+       * Note that we've already inserted the branch at the end of
+       * true_block.  See the very first LLVMBuildBr() call in this function.
+       */
+   }
 
    /* Resume building code at end of the ifthen->merge_block */
    LLVMPositionBuilderAtEnd(ctx->builder, ifthen->merge_block);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.h b/src/gallium/auxiliary/gallivm/lp_bld_flow.h
index 4c225a0d4f..745838570c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.h
@@ -35,7 +35,7 @@
 #define LP_BLD_FLOW_H
 
 
-#include <llvm-c/Core.h>  
+#include "gallivm/lp_bld.h"
 
 
 struct lp_type;
@@ -124,6 +124,13 @@ lp_build_loop_end(LLVMBuilderRef builder,
                   LLVMValueRef step,
                   struct lp_build_loop_state *state);
 
+void
+lp_build_loop_end_cond(LLVMBuilderRef builder,
+                       LLVMValueRef end,
+                       LLVMValueRef step,
+                       int cond, /* LLVM condition */
+                       struct lp_build_loop_state *state);
+
 
 
 
@@ -145,7 +152,9 @@ lp_build_else(struct lp_build_if_state *ctx);
 
 void
 lp_build_endif(struct lp_build_if_state *ctx);
-              
+
+LLVMBasicBlockRef
+lp_build_insert_new_block(LLVMBuilderRef builder, const char *name);
 
 
 #endif /* !LP_BLD_FLOW_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h b/src/gallium/auxiliary/gallivm/lp_bld_format.h
index 970bee379f..bb1298ed3f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h
@@ -34,7 +34,7 @@
  * Pixel format helpers.
  */
 
-#include <llvm-c/Core.h>  
+#include "gallivm/lp_bld.h"
 
 #include "pipe/p_format.h"
 
@@ -80,4 +80,15 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
                          LLVMValueRef *rgba);
 
 
+void
+lp_build_fetch_rgba_soa(LLVMBuilderRef builder,
+                        const struct util_format_description *format_desc,
+                        struct lp_type type,
+                        LLVMValueRef base_ptr,
+                        LLVMValueRef offsets,
+                        LLVMValueRef i,
+                        LLVMValueRef j,
+                        LLVMValueRef *rgba);
+
+
 #endif /* !LP_BLD_FORMAT_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
deleted file mode 100644
index a07f7418f2..0000000000
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ /dev/null
@@ -1,383 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * AoS pixel format manipulation.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#include "util/u_cpu_detect.h"
-#include "util/u_format.h"
-
-#include "lp_bld_type.h"
-#include "lp_bld_const.h"
-#include "lp_bld_swizzle.h"
-#include "lp_bld_format.h"
-
-
-/**
- * Unpack a single pixel into its RGBA components.
- *
- * @param packed integer.
- *
- * @return RGBA in a 4 floats vector.
- *
- * XXX: This is mostly for reference and testing -- operating a single pixel at
- * a time is rarely if ever needed.
- */
-LLVMValueRef
-lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
-                         const struct util_format_description *desc,
-                         LLVMValueRef packed)
-{
-   LLVMTypeRef type;
-   LLVMValueRef shifted, casted, scaled, masked;
-   LLVMValueRef shifts[4];
-   LLVMValueRef masks[4];
-   LLVMValueRef scales[4];
-   LLVMValueRef swizzles[4];
-   LLVMValueRef aux[4];
-   bool normalized;
-   int empty_channel;
-   unsigned shift;
-   unsigned i;
-
-   /* FIXME: Support more formats */
-   assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
-   assert(desc->block.width == 1);
-   assert(desc->block.height == 1);
-   assert(desc->block.bits <= 32);
-
-   type = LLVMIntType(desc->block.bits);
-
-   /* Do the intermediate integer computations with 32bit integers since it
-    * matches floating point size */
-   if (desc->block.bits < 32)
-      packed = LLVMBuildZExt(builder, packed, LLVMInt32Type(), "");
-
-   /* Broadcast the packed value to all four channels */
-   packed = LLVMBuildInsertElement(builder,
-                                   LLVMGetUndef(LLVMVectorType(LLVMInt32Type(), 4)),
-                                   packed,
-                                   LLVMConstNull(LLVMInt32Type()),
-                                   "");
-   packed = LLVMBuildShuffleVector(builder,
-                                   packed,
-                                   LLVMGetUndef(LLVMVectorType(LLVMInt32Type(), 4)),
-                                   LLVMConstNull(LLVMVectorType(LLVMInt32Type(), 4)),
-                                   "");
-
-   /* Initialize vector constants */
-   normalized = FALSE;
-   empty_channel = -1;
-   shift = 0;
-   for (i = 0; i < 4; ++i) {
-      unsigned bits = desc->channel[i].size;
-
-      if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) {
-         shifts[i] = LLVMGetUndef(LLVMInt32Type());
-         masks[i] = LLVMConstNull(LLVMInt32Type());
-         scales[i] =  LLVMConstNull(LLVMFloatType());
-         empty_channel = i;
-      }
-      else {
-         unsigned mask = (1 << bits) - 1;
-
-         assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED);
-         assert(bits < 32);
-
-         shifts[i] = LLVMConstInt(LLVMInt32Type(), shift, 0);
-         masks[i] = LLVMConstInt(LLVMInt32Type(), mask, 0);
-
-         if (desc->channel[i].normalized) {
-            scales[i] = LLVMConstReal(LLVMFloatType(), 1.0/mask);
-            normalized = TRUE;
-         }
-         else
-            scales[i] =  LLVMConstReal(LLVMFloatType(), 1.0);
-      }
-
-      shift += bits;
-   }
-
-   shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
-   masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
-   /* UIToFP can't be expressed in SSE2 */
-   casted = LLVMBuildSIToFP(builder, masked, LLVMVectorType(LLVMFloatType(), 4), "");
-
-   if (normalized)
-      scaled = LLVMBuildMul(builder, casted, LLVMConstVector(scales, 4), "");
-   else
-      scaled = casted;
-
-   for (i = 0; i < 4; ++i)
-      aux[i] = LLVMGetUndef(LLVMFloatType());
-
-   for (i = 0; i < 4; ++i) {
-      enum util_format_swizzle swizzle = desc->swizzle[i];
-
-      switch (swizzle) {
-      case UTIL_FORMAT_SWIZZLE_X:
-      case UTIL_FORMAT_SWIZZLE_Y:
-      case UTIL_FORMAT_SWIZZLE_Z:
-      case UTIL_FORMAT_SWIZZLE_W:
-         swizzles[i] = LLVMConstInt(LLVMInt32Type(), swizzle, 0);
-         break;
-      case UTIL_FORMAT_SWIZZLE_0:
-         assert(empty_channel >= 0);
-         swizzles[i] = LLVMConstInt(LLVMInt32Type(), empty_channel, 0);
-         break;
-      case UTIL_FORMAT_SWIZZLE_1:
-         swizzles[i] = LLVMConstInt(LLVMInt32Type(), 4, 0);
-         aux[0] = LLVMConstReal(LLVMFloatType(), 1.0);
-         break;
-      case UTIL_FORMAT_SWIZZLE_NONE:
-         swizzles[i] = LLVMGetUndef(LLVMFloatType());
-         assert(0);
-         break;
-      }
-   }
-
-   return LLVMBuildShuffleVector(builder, scaled, LLVMConstVector(aux, 4), LLVMConstVector(swizzles, 4), "");
-}
-
-
-/**
- * Take a vector with packed pixels and unpack into a rgba8 vector.
- *
- * Formats with bit depth smaller than 32bits are accepted, but they must be
- * padded to 32bits.
- */
-LLVMValueRef
-lp_build_unpack_rgba8_aos(LLVMBuilderRef builder,
-                          const struct util_format_description *desc,
-                          struct lp_type type,
-                          LLVMValueRef packed)
-{
-   struct lp_build_context bld;
-   bool rgba8;
-   LLVMValueRef res;
-   unsigned i;
-
-   lp_build_context_init(&bld, builder, type);
-
-   /* FIXME: Support more formats */
-   assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
-   assert(desc->block.width == 1);
-   assert(desc->block.height == 1);
-   assert(desc->block.bits <= 32);
-
-   assert(!type.floating);
-   assert(!type.fixed);
-   assert(type.norm);
-   assert(type.width == 8);
-   assert(type.length % 4 == 0);
-
-   rgba8 = TRUE;
-   for(i = 0; i < 4; ++i) {
-      assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED ||
-             desc->channel[i].type == UTIL_FORMAT_TYPE_VOID);
-      if(desc->channel[0].size != 8)
-         rgba8 = FALSE;
-   }
-
-   if(rgba8) {
-      /*
-       * The pixel is already in a rgba8 format variant. All it is necessary
-       * is to swizzle the channels.
-       */
-
-      unsigned char swizzles[4];
-      boolean zeros[4]; /* bitwise AND mask */
-      boolean ones[4]; /* bitwise OR mask */
-      boolean swizzles_needed = FALSE;
-      boolean zeros_needed = FALSE;
-      boolean ones_needed = FALSE;
-
-      for(i = 0; i < 4; ++i) {
-         enum util_format_swizzle swizzle = desc->swizzle[i];
-
-         /* Initialize with the no-op case */
-         swizzles[i] = util_cpu_caps.little_endian ? 3 - i : i;
-         zeros[i] = TRUE;
-         ones[i] = FALSE;
-
-         switch (swizzle) {
-         case UTIL_FORMAT_SWIZZLE_X:
-         case UTIL_FORMAT_SWIZZLE_Y:
-         case UTIL_FORMAT_SWIZZLE_Z:
-         case UTIL_FORMAT_SWIZZLE_W:
-            if(swizzle != swizzles[i]) {
-               swizzles[i] = swizzle;
-               swizzles_needed = TRUE;
-            }
-            break;
-         case UTIL_FORMAT_SWIZZLE_0:
-            zeros[i] = FALSE;
-            zeros_needed = TRUE;
-            break;
-         case UTIL_FORMAT_SWIZZLE_1:
-            ones[i] = TRUE;
-            ones_needed = TRUE;
-            break;
-         case UTIL_FORMAT_SWIZZLE_NONE:
-            assert(0);
-            break;
-         }
-      }
-
-      res = packed;
-
-      if(swizzles_needed)
-         res = lp_build_swizzle1_aos(&bld, res, swizzles);
-
-      if(zeros_needed) {
-         /* Mask out zero channels */
-         LLVMValueRef mask = lp_build_const_mask_aos(type, zeros);
-         res = LLVMBuildAnd(builder, res, mask, "");
-      }
-
-      if(ones_needed) {
-         /* Or one channels */
-         LLVMValueRef mask = lp_build_const_mask_aos(type, ones);
-         res = LLVMBuildOr(builder, res, mask, "");
-      }
-   }
-   else {
-      /* FIXME */
-      assert(0);
-      res = lp_build_undef(type);
-   }
-
-   return res;
-}
-
-
-/**
- * Pack a single pixel.
- *
- * @param rgba 4 float vector with the unpacked components.
- *
- * XXX: This is mostly for reference and testing -- operating a single pixel at
- * a time is rarely if ever needed.
- */
-LLVMValueRef
-lp_build_pack_rgba_aos(LLVMBuilderRef builder,
-                       const struct util_format_description *desc,
-                       LLVMValueRef rgba)
-{
-   LLVMTypeRef type;
-   LLVMValueRef packed = NULL;
-   LLVMValueRef swizzles[4];
-   LLVMValueRef shifted, casted, scaled, unswizzled;
-   LLVMValueRef shifts[4];
-   LLVMValueRef scales[4];
-   bool normalized;
-   unsigned shift;
-   unsigned i, j;
-
-   assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
-   assert(desc->block.width == 1);
-   assert(desc->block.height == 1);
-
-   type = LLVMIntType(desc->block.bits);
-
-   /* Unswizzle the color components into the source vector. */
-   for (i = 0; i < 4; ++i) {
-      for (j = 0; j < 4; ++j) {
-         if (desc->swizzle[j] == i)
-            break;
-      }
-      if (j < 4)
-         swizzles[i] = LLVMConstInt(LLVMInt32Type(), j, 0);
-      else
-         swizzles[i] = LLVMGetUndef(LLVMInt32Type());
-   }
-
-   unswizzled = LLVMBuildShuffleVector(builder, rgba,
-                                       LLVMGetUndef(LLVMVectorType(LLVMFloatType(), 4)),
-                                       LLVMConstVector(swizzles, 4), "");
-
-   normalized = FALSE;
-   shift = 0;
-   for (i = 0; i < 4; ++i) {
-      unsigned bits = desc->channel[i].size;
-
-      if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) {
-         shifts[i] = LLVMGetUndef(LLVMInt32Type());
-         scales[i] =  LLVMGetUndef(LLVMFloatType());
-      }
-      else {
-         unsigned mask = (1 << bits) - 1;
-
-         assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED);
-         assert(bits < 32);
-
-         shifts[i] = LLVMConstInt(LLVMInt32Type(), shift, 0);
-
-         if (desc->channel[i].normalized) {
-            scales[i] = LLVMConstReal(LLVMFloatType(), mask);
-            normalized = TRUE;
-         }
-         else
-            scales[i] =  LLVMConstReal(LLVMFloatType(), 1.0);
-      }
-
-      shift += bits;
-   }
-
-   if (normalized)
-      scaled = LLVMBuildMul(builder, unswizzled, LLVMConstVector(scales, 4), "");
-   else
-      scaled = unswizzled;
-
-   casted = LLVMBuildFPToSI(builder, scaled, LLVMVectorType(LLVMInt32Type(), 4), "");
-
-   shifted = LLVMBuildShl(builder, casted, LLVMConstVector(shifts, 4), "");
-   
-   /* Bitwise or all components */
-   for (i = 0; i < 4; ++i) {
-      if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) {
-         LLVMValueRef component = LLVMBuildExtractElement(builder, shifted, LLVMConstInt(LLVMInt32Type(), i, 0), "");
-         if (packed)
-            packed = LLVMBuildOr(builder, packed, component, "");
-         else
-            packed = component;
-      }
-   }
-
-   if (!packed)
-      packed = LLVMGetUndef(LLVMInt32Type());
-
-   if (desc->block.bits < 32)
-      packed = LLVMBuildTrunc(builder, packed, type, "");
-
-   return packed;
-}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_query.c b/src/gallium/auxiliary/gallivm/lp_bld_format_query.c
deleted file mode 100644
index f3832d07ff..0000000000
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_query.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * Utility functions to make assertions about formats.
- *
- * This module centralizes most of logic used when determining what algorithm
- * is most suitable (i.e., most efficient yet correct) for a given format.
- *
- * It might be possible to move some of these functions to u_format module,
- * but since tiny differences in the format my render it more/less
- * appropriate to a given algorithm it is impossible to make any long term
- * guarantee about the semantics of these functions.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-
-#include "util/u_format.h"
-
-#include "lp_bld_format.h"
-
-
-/**
- * Whether this format is a 4 rgba8 variant
- */
-boolean
-lp_format_is_rgba8(const struct util_format_description *desc)
-{
-   unsigned chan;
-
-   if(desc->block.width != 1 ||
-      desc->block.height != 1 ||
-      desc->block.bits != 32)
-      return FALSE;
-
-   for(chan = 0; chan < 4; ++chan) {
-      if(desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED &&
-         desc->channel[chan].type != UTIL_FORMAT_TYPE_SIGNED &&
-         desc->channel[chan].type != UTIL_FORMAT_TYPE_VOID)
-         return FALSE;
-      if(desc->channel[chan].size != 8)
-         return FALSE;
-   }
-
-   return TRUE;
-}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index abb27e4c32..2b66162eb4 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -27,10 +27,14 @@
 
 
 #include "util/u_format.h"
+#include "util/u_memory.h"
+#include "util/u_string.h"
 
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
 #include "lp_bld_conv.h"
+#include "lp_bld_sample.h" /* for lp_build_gather */
+#include "lp_bld_init.h"
 #include "lp_bld_format.h"
 
 
@@ -80,6 +84,24 @@ lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
 }
 
 
+/**
+ * Unpack several pixels in SoA.
+ *
+ * It takes a vector of packed pixels:
+ *
+ *   packed = {P0, P1, P2, P3, ..., Pn}
+ *
+ * And will produce four vectors:
+ *
+ *   red    = {R0, R1, R2, R3, ..., Rn}
+ *   green  = {G0, G1, G2, G3, ..., Gn}
+ *   blue   = {B0, B1, B2, B3, ..., Bn}
+ *   alpha  = {A0, A1, A2, A3, ..., An}
+ *
+ * It requires that a packed pixel fits into an element of the output
+ * channels. The common case is when converting pixel with a depth of 32 bit or
+ * less into floats.
+ */
 void
 lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
                          const struct util_format_description *format_desc,
@@ -91,15 +113,17 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
    unsigned start;
    unsigned chan;
 
-   /* FIXME: Support more formats */
    assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
    assert(format_desc->block.width == 1);
    assert(format_desc->block.height == 1);
-   assert(format_desc->block.bits <= 32);
+   assert(format_desc->block.bits <= type.width);
+   /* FIXME: Support more output types */
+   assert(type.floating);
+   assert(type.width == 32);
 
    /* Decode the input vector components */
    start = 0;
-   for (chan = 0; chan < 4; ++chan) {
+   for (chan = 0; chan < format_desc->nr_channels; ++chan) {
       unsigned width = format_desc->channel[chan].size;
       unsigned stop = start + width;
       LLVMValueRef input;
@@ -108,22 +132,106 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
 
       switch(format_desc->channel[chan].type) {
       case UTIL_FORMAT_TYPE_VOID:
-         input = NULL;
+         input = lp_build_undef(type);
          break;
 
       case UTIL_FORMAT_TYPE_UNSIGNED:
-         if(type.floating) {
-            if(start)
-               input = LLVMBuildLShr(builder, input, lp_build_int_const_scalar(type, start), "");
-            if(stop < format_desc->block.bits) {
-               unsigned mask = ((unsigned long long)1 << width) - 1;
-               input = LLVMBuildAnd(builder, input, lp_build_int_const_scalar(type, mask), "");
-            }
+         /*
+          * Align the LSB
+          */
+
+         if (start) {
+            input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(type, start), "");
+         }
 
+         /*
+          * Zero the MSBs
+          */
+
+         if (stop < format_desc->block.bits) {
+            unsigned mask = ((unsigned long long)1 << width) - 1;
+            input = LLVMBuildAnd(builder, input, lp_build_const_int_vec(type, mask), "");
+         }
+
+         /*
+          * Type conversion
+          */
+
+         if (type.floating) {
             if(format_desc->channel[chan].normalized)
                input = lp_build_unsigned_norm_to_float(builder, width, type, input);
             else
-               input = LLVMBuildFPToSI(builder, input, lp_build_vec_type(type), "");
+               input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(type), "");
+         }
+         else {
+            /* FIXME */
+            assert(0);
+            input = lp_build_undef(type);
+         }
+
+         break;
+
+      case UTIL_FORMAT_TYPE_SIGNED:
+         /*
+          * Align the sign bit first.
+          */
+
+         if (stop < type.width) {
+            unsigned bits = type.width - stop;
+            LLVMValueRef bits_val = lp_build_const_int_vec(type, bits);
+            input = LLVMBuildShl(builder, input, bits_val, "");
+         }
+
+         /*
+          * Align the LSB (with an arithmetic shift to preserve the sign)
+          */
+
+         if (format_desc->channel[chan].size < type.width) {
+            unsigned bits = type.width - format_desc->channel[chan].size;
+            LLVMValueRef bits_val = lp_build_const_int_vec(type, bits);
+            input = LLVMBuildAShr(builder, input, bits_val, "");
+         }
+
+         /*
+          * Type conversion
+          */
+
+         if (type.floating) {
+            input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(type), "");
+            if (format_desc->channel[chan].normalized) {
+               double scale = 1.0 / ((1 << (format_desc->channel[chan].size - 1)) - 1);
+               LLVMValueRef scale_val = lp_build_const_vec(type, scale);
+               input = LLVMBuildMul(builder, input, scale_val, "");
+            }
+         }
+         else {
+            /* FIXME */
+            assert(0);
+            input = lp_build_undef(type);
+         }
+
+         break;
+
+      case UTIL_FORMAT_TYPE_FLOAT:
+         if (type.floating) {
+            assert(start == 0);
+            assert(stop == 32);
+            assert(type.width == 32);
+            input = LLVMBuildBitCast(builder, input, lp_build_vec_type(type), "");
+         }
+         else {
+            /* FIXME */
+            assert(0);
+            input = lp_build_undef(type);
+         }
+         break;
+
+      case UTIL_FORMAT_TYPE_FIXED:
+         if (type.floating) {
+            double scale = 1.0 / ((1 << (format_desc->channel[chan].size/2)) - 1);
+            LLVMValueRef scale_val = lp_build_const_vec(type, scale);
+            input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(type), "");
+            input = LLVMBuildMul(builder, input, scale_val, "");
          }
          else {
             /* FIXME */
@@ -133,7 +241,7 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
          break;
 
       default:
-         /* fall through */
+         assert(0);
          input = lp_build_undef(type);
          break;
       }
@@ -145,3 +253,144 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
 
    lp_build_format_swizzle_soa(format_desc, type, inputs, rgba);
 }
+
+
+/**
+ * Fetch a pixel into a SoA.
+ *
+ * i and j are the sub-block pixel coordinates.
+ */
+void
+lp_build_fetch_rgba_soa(LLVMBuilderRef builder,
+                        const struct util_format_description *format_desc,
+                        struct lp_type type,
+                        LLVMValueRef base_ptr,
+                        LLVMValueRef offset,
+                        LLVMValueRef i,
+                        LLVMValueRef j,
+                        LLVMValueRef *rgba)
+{
+
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
+       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
+        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
+       format_desc->block.width == 1 &&
+       format_desc->block.height == 1 &&
+       format_desc->block.bits <= type.width &&
+       (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
+        format_desc->channel[0].size == 32))
+   {
+      /*
+       * The packed pixel fits into an element of the destination format. Put
+       * the packed pixels into a vector and estract each component for all
+       * vector elements in parallel.
+       */
+
+      LLVMValueRef packed;
+
+      /*
+       * gather the texels from the texture
+       */
+      packed = lp_build_gather(builder,
+                               type.length,
+                               format_desc->block.bits,
+                               type.width,
+                               base_ptr, offset);
+
+      /*
+       * convert texels to float rgba
+       */
+      lp_build_unpack_rgba_soa(builder,
+                               format_desc,
+                               type,
+                               packed, rgba);
+   }
+   else {
+      /*
+       * Fallback to calling util_format_description::fetch_rgba_float for each
+       * pixel.
+       *
+       * This is definitely not the most efficient way of fetching pixels, as
+       * we miss the opportunity to do vectorization, but this it is a
+       * convenient for formats or scenarios for which there was no opportunity
+       * or incentive to optimize.
+       */
+
+      LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder)));
+      char name[256];
+      LLVMValueRef function;
+      LLVMValueRef tmp;
+      unsigned k, chan;
+
+      assert(type.floating);
+
+      util_snprintf(name, sizeof name, "util_format_%s_fetch_rgba_float", format_desc->short_name);
+
+      /*
+       * Declare and bind format_desc->fetch_rgba_float().
+       */
+
+      function = LLVMGetNamedFunction(module, name);
+      if (!function) {
+         LLVMTypeRef ret_type;
+         LLVMTypeRef arg_types[4];
+         LLVMTypeRef function_type;
+
+         ret_type = LLVMVoidType();
+         arg_types[0] = LLVMPointerType(LLVMFloatType(), 0);
+         arg_types[1] = LLVMPointerType(LLVMInt8Type(), 0);
+         arg_types[3] = arg_types[2] = LLVMIntType(sizeof(unsigned) * 8);
+         function_type = LLVMFunctionType(ret_type, arg_types, Elements(arg_types), 0);
+         function = LLVMAddFunction(module, name, function_type);
+
+         LLVMSetFunctionCallConv(function, LLVMCCallConv);
+         LLVMSetLinkage(function, LLVMExternalLinkage);
+
+         assert(LLVMIsDeclaration(function));
+
+         LLVMAddGlobalMapping(lp_build_engine, function, format_desc->fetch_rgba_float);
+      }
+
+      for (chan = 0; chan < 4; ++chan) {
+         rgba[chan] = lp_build_undef(type);
+      }
+
+      tmp = LLVMBuildArrayAlloca(builder,
+                                 LLVMFloatType(),
+                                 LLVMConstInt(LLVMInt32Type(), 4, 0),
+                                 "");
+
+      /*
+       * Invoke format_desc->fetch_rgba_float() for each pixel and insert the result
+       * in the SoA vectors.
+       */
+
+      for(k = 0; k < type.length; ++k) {
+         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), k, 0);
+         LLVMValueRef offset_elem;
+         LLVMValueRef ptr;
+         LLVMValueRef i_elem, j_elem;
+         LLVMValueRef args[4];
+
+         offset_elem = LLVMBuildExtractElement(builder, offset, index, "");
+         ptr = LLVMBuildGEP(builder, base_ptr, &offset_elem, 1, "");
+
+         i_elem = LLVMBuildExtractElement(builder, i, index, "");
+         j_elem = LLVMBuildExtractElement(builder, j, index, "");
+
+         args[0] = tmp;
+         args[1] = ptr;
+         args[2] = i_elem;
+         args[3] = j_elem;
+
+         LLVMBuildCall(builder, function, args, 4, "");
+
+         for (chan = 0; chan < 4; ++chan) {
+            LLVMValueRef chan_val = LLVMConstInt(LLVMInt32Type(), chan, 0),
+            tmp_chan = LLVMBuildGEP(builder, tmp, &chan_val, 1, "");
+            tmp_chan = LLVMBuildLoad(builder, tmp_chan, "");
+            rgba[chan] = LLVMBuildInsertElement(builder, rgba[chan], tmp_chan, index, "");
+         }
+      }
+   }
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.cpp b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index 067397a520..de07c222a3 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -26,33 +26,42 @@
  **************************************************************************/
 
 
-#include <llvm/Config/config.h>
-#include <llvm/Target/TargetSelect.h>
-#include <llvm/Target/TargetOptions.h>
-
-#include "pipe/p_config.h"
-
+#include "pipe/p_compiler.h"
+#include "util/u_debug.h"
 #include "lp_bld_init.h"
 
 
-extern "C" void LLVMLinkInJIT();
+LLVMModuleRef lp_build_module = NULL;
+LLVMExecutionEngineRef lp_build_engine = NULL;
+LLVMModuleProviderRef lp_build_provider = NULL;
+LLVMTargetDataRef lp_build_target = NULL;
 
 
-extern "C" void
+void
 lp_build_init(void)
 {
-#if defined(PIPE_OS_WINDOWS) && defined(PIPE_ARCH_X86)
-   /*
-    * This is mis-detected on some hardware / software combinations.
-    */
-   llvm::StackAlignment = 4;
-   llvm::RealignStack = true;
-#endif
-
-   /* Same as LLVMInitializeNativeTarget(); */
-   llvm::InitializeNativeTarget();
+   LLVMInitializeNativeTarget();
 
    LLVMLinkInJIT();
+
+   if (!lp_build_module)
+      lp_build_module = LLVMModuleCreateWithName("gallivm");
+
+   if (!lp_build_provider)
+      lp_build_provider = LLVMCreateModuleProviderForExistingModule(lp_build_module);
+
+   if (!lp_build_engine) {
+      char *error = NULL;
+
+      if (LLVMCreateJITCompiler(&lp_build_engine, lp_build_provider, 1, &error)) {
+         _debug_printf("%s\n", error);
+         LLVMDisposeMessage(error);
+         assert(0);
+      }
+   }
+
+   if (!lp_build_target)
+      lp_build_target = LLVMGetExecutionEngineTargetData(lp_build_engine);
 }
 
 
@@ -64,6 +73,6 @@ lp_build_init(void)
  */
 #if defined(_MSC_VER) && defined(_DEBUG)
 #include <crtdefs.h>
-extern "C" _CRTIMP void __cdecl
+_CRTIMP void __cdecl
 _invalid_parameter_noinfo(void) {}
 #endif
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.h b/src/gallium/auxiliary/gallivm/lp_bld_init.h
index 07f50d1c43..0ec2afcd1b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.h
@@ -30,18 +30,18 @@
 #define LP_BLD_INIT_H
 
 
-#ifdef __cplusplus
-extern "C" {
-#endif
+#include "lp_bld.h"
+#include <llvm-c/ExecutionEngine.h>
 
 
-void
-lp_build_init(void);
+extern LLVMModuleRef lp_build_module;
+extern LLVMExecutionEngineRef lp_build_engine;
+extern LLVMModuleProviderRef lp_build_provider;
+extern LLVMTargetDataRef lp_build_target;
 
 
-#ifdef __cplusplus
-}
-#endif
+void
+lp_build_init(void);
 
 
 #endif /* !LP_BLD_INIT_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_interp.c b/src/gallium/auxiliary/gallivm/lp_bld_interp.c
deleted file mode 100644
index 2fc894017d..0000000000
--- a/src/gallium/auxiliary/gallivm/lp_bld_interp.c
+++ /dev/null
@@ -1,408 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2009 VMware, Inc.
- * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * @file
- * Position and shader input interpolation.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-#include "pipe/p_shader_tokens.h"
-#include "util/u_debug.h"
-#include "util/u_memory.h"
-#include "util/u_math.h"
-#include "tgsi/tgsi_parse.h"
-#include "lp_bld_debug.h"
-#include "lp_bld_const.h"
-#include "lp_bld_arit.h"
-#include "lp_bld_swizzle.h"
-#include "lp_bld_interp.h"
-
-
-/*
- * The shader JIT function operates on blocks of quads.
- * Each block has 2x2 quads and each quad has 2x2 pixels.
- *
- * We iterate over the quads in order 0, 1, 2, 3:
- *
- * #################
- * #   |   #   |   #
- * #---0---#---1---#
- * #   |   #   |   #
- * #################
- * #   |   #   |   #
- * #---2---#---3---#
- * #   |   #   |   #
- * #################
- *
- * Within each quad, we have four pixels which are represented in SOA
- * order:
- *
- * #########
- * # 0 | 1 #
- * #---+---#
- * # 2 | 3 #
- * #########
- *
- * So the green channel (for example) of the four pixels is stored in
- * a single vector register: {g0, g1, g2, g3}.
- */
-
-
-static void
-attrib_name(LLVMValueRef val, unsigned attrib, unsigned chan, const char *suffix)
-{
-   if(attrib == 0)
-      lp_build_name(val, "pos.%c%s", "xyzw"[chan], suffix);
-   else
-      lp_build_name(val, "input%u.%c%s", attrib - 1, "xyzw"[chan], suffix);
-}
-
-
-/**
- * Initialize the bld->a0, dadx, dady fields.  This involves fetching
- * those values from the arrays which are passed into the JIT function.
- */
-static void
-coeffs_init(struct lp_build_interp_soa_context *bld,
-            LLVMValueRef a0_ptr,
-            LLVMValueRef dadx_ptr,
-            LLVMValueRef dady_ptr)
-{
-   LLVMBuilderRef builder = bld->base.builder;
-   unsigned attrib;
-   unsigned chan;
-
-   for(attrib = 0; attrib < bld->num_attribs; ++attrib) {
-      unsigned mask = bld->mask[attrib];
-      unsigned mode = bld->mode[attrib];
-      for(chan = 0; chan < NUM_CHANNELS; ++chan) {
-         if(mask & (1 << chan)) {
-            LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), attrib*NUM_CHANNELS + chan, 0);
-            LLVMValueRef a0 = NULL;
-            LLVMValueRef dadx = NULL;
-            LLVMValueRef dady = NULL;
-
-            switch( mode ) {
-            case TGSI_INTERPOLATE_PERSPECTIVE:
-               /* fall-through */
-
-            case TGSI_INTERPOLATE_LINEAR:
-               dadx = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dadx_ptr, &index, 1, ""), "");
-               dady = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dady_ptr, &index, 1, ""), "");
-               dadx = lp_build_broadcast_scalar(&bld->base, dadx);
-               dady = lp_build_broadcast_scalar(&bld->base, dady);
-               attrib_name(dadx, attrib, chan, ".dadx");
-               attrib_name(dady, attrib, chan, ".dady");
-               /* fall-through */
-
-            case TGSI_INTERPOLATE_CONSTANT:
-               a0 = LLVMBuildLoad(builder, LLVMBuildGEP(builder, a0_ptr, &index, 1, ""), "");
-               a0 = lp_build_broadcast_scalar(&bld->base, a0);
-               attrib_name(a0, attrib, chan, ".a0");
-               break;
-
-            default:
-               assert(0);
-               break;
-            }
-
-            bld->a0  [attrib][chan] = a0;
-            bld->dadx[attrib][chan] = dadx;
-            bld->dady[attrib][chan] = dady;
-         }
-      }
-   }
-}
-
-
-/**
- * Emit LLVM code to compute the fragment shader input attribute values.
- * For example, for a color input, we'll compute red, green, blue and alpha
- * values for the four pixels in a quad.
- * Recall that we're operating on 4-element vectors so each arithmetic
- * operation is operating on the four pixels in a quad.
- */
-static void
-attribs_init(struct lp_build_interp_soa_context *bld)
-{
-   LLVMValueRef x = bld->pos[0];
-   LLVMValueRef y = bld->pos[1];
-   LLVMValueRef oow = NULL;
-   unsigned attrib;
-   unsigned chan;
-
-   for(attrib = 0; attrib < bld->num_attribs; ++attrib) {
-      unsigned mask = bld->mask[attrib];
-      unsigned mode = bld->mode[attrib];
-      for(chan = 0; chan < NUM_CHANNELS; ++chan) {
-         if(mask & (1 << chan)) {
-            LLVMValueRef a0   = bld->a0  [attrib][chan];
-            LLVMValueRef dadx = bld->dadx[attrib][chan];
-            LLVMValueRef dady = bld->dady[attrib][chan];
-            LLVMValueRef res;
-
-            res = a0;
-
-            if (mode != TGSI_INTERPOLATE_CONSTANT) {
-               /* res = res + x * dadx */
-               res = lp_build_add(&bld->base, res, lp_build_mul(&bld->base, x, dadx));
-               /* res = res + y * dady */
-               res = lp_build_add(&bld->base, res, lp_build_mul(&bld->base, y, dady));
-            }
-
-            /* Keep the value of the attribue before perspective divide for faster updates */
-            bld->attribs_pre[attrib][chan] = res;
-
-            if (mode == TGSI_INTERPOLATE_PERSPECTIVE) {
-               LLVMValueRef w = bld->pos[3];
-               assert(attrib != 0);
-               if(!oow)
-                  oow = lp_build_rcp(&bld->base, w);
-               res = lp_build_mul(&bld->base, res, oow);
-            }
-
-            attrib_name(res, attrib, chan, "");
-
-            bld->attribs[attrib][chan] = res;
-         }
-      }
-   }
-}
-
-
-/**
- * Increment the shader input attribute values.
- * This is called when we move from one quad to the next.
- */
-static void
-attribs_update(struct lp_build_interp_soa_context *bld, int quad_index)
-{
-   LLVMValueRef oow = NULL;
-   unsigned attrib;
-   unsigned chan;
-
-   assert(quad_index < 4);
-
-   for(attrib = 0; attrib < bld->num_attribs; ++attrib) {
-      unsigned mask = bld->mask[attrib];
-      unsigned mode = bld->mode[attrib];
-
-      if (mode != TGSI_INTERPOLATE_CONSTANT) {
-         for(chan = 0; chan < NUM_CHANNELS; ++chan) {
-            if(mask & (1 << chan)) {
-               LLVMValueRef dadx = bld->dadx[attrib][chan];
-               LLVMValueRef dady = bld->dady[attrib][chan];
-               LLVMValueRef res;
-
-               res = bld->attribs_pre[attrib][chan];
-
-               if (quad_index == 1 || quad_index == 3) {
-                  /* top-right or bottom-right quad */
-                  /* build res = res + dadx + dadx */
-                  res = lp_build_add(&bld->base, res, dadx);
-                  res = lp_build_add(&bld->base, res, dadx);
-               }
-
-               if (quad_index == 2 || quad_index == 3) {
-                  /* bottom-left or bottom-right quad */
-                  /* build res = res + dady + dady */
-                  res = lp_build_add(&bld->base, res, dady);
-                  res = lp_build_add(&bld->base, res, dady);
-               }
-
-               //XXX bld->attribs_pre[attrib][chan] = res;
-
-               if (mode == TGSI_INTERPOLATE_PERSPECTIVE) {
-                  LLVMValueRef w = bld->pos[3];
-                  assert(attrib != 0);
-                  if(!oow)
-                     oow = lp_build_rcp(&bld->base, w);
-                  res = lp_build_mul(&bld->base, res, oow);
-               }
-
-               attrib_name(res, attrib, chan, "");
-
-               bld->attribs[attrib][chan] = res;
-            }
-         }
-      }
-   }
-}
-
-
-/**
- * Generate the position vectors.
- *
- * Parameter x0, y0 are the integer values with the quad upper left coordinates.
- */
-static void
-pos_init(struct lp_build_interp_soa_context *bld,
-         LLVMValueRef x0,
-         LLVMValueRef y0)
-{
-   lp_build_name(x0, "pos.x");
-   lp_build_name(y0, "pos.y");
-
-   bld->attribs[0][0] = x0;
-   bld->attribs[0][1] = y0;
-}
-
-
-/**
- * Update quad position values when moving to the next quad.
- */
-static void
-pos_update(struct lp_build_interp_soa_context *bld, int quad_index)
-{
-   LLVMValueRef x = bld->attribs[0][0];
-   LLVMValueRef y = bld->attribs[0][1];
-   const int xstep = 2, ystep = 2;
-
-   if (quad_index == 1 || quad_index == 3) {
-      /* top-right or bottom-right quad in block */
-      /* build x += xstep */
-      x = lp_build_add(&bld->base, x,
-                       lp_build_const_scalar(bld->base.type, xstep));
-   }
-
-   if (quad_index == 2) {
-      /* bottom-left quad in block */
-      /* build y += ystep */
-      y = lp_build_add(&bld->base, y,
-                       lp_build_const_scalar(bld->base.type, ystep));
-      /* build x -= xstep */
-      x = lp_build_sub(&bld->base, x,
-                       lp_build_const_scalar(bld->base.type, xstep));
-   }
-
-   lp_build_name(x, "pos.x");
-   lp_build_name(y, "pos.y");
-
-   bld->attribs[0][0] = x;
-   bld->attribs[0][1] = y;
-}
-
-
-/**
- * Initialize fragment shader input attribute info.
- */
-void
-lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
-                         const struct tgsi_token *tokens,
-                         boolean flatshade,
-                         LLVMBuilderRef builder,
-                         struct lp_type type,
-                         LLVMValueRef a0_ptr,
-                         LLVMValueRef dadx_ptr,
-                         LLVMValueRef dady_ptr,
-                         LLVMValueRef x0,
-                         LLVMValueRef y0)
-{
-   struct tgsi_parse_context parse;
-   struct tgsi_full_declaration *decl;
-
-   memset(bld, 0, sizeof *bld);
-
-   lp_build_context_init(&bld->base, builder, type);
-
-   /* For convenience */
-   bld->pos = bld->attribs[0];
-   bld->inputs = (const LLVMValueRef (*)[NUM_CHANNELS]) bld->attribs[1];
-
-   /* Position */
-   bld->num_attribs = 1;
-   bld->mask[0] = TGSI_WRITEMASK_ZW;
-   bld->mode[0] = TGSI_INTERPOLATE_LINEAR;
-
-   /* Inputs */
-   tgsi_parse_init( &parse, tokens );
-   while( !tgsi_parse_end_of_tokens( &parse ) ) {
-      tgsi_parse_token( &parse );
-
-      switch( parse.FullToken.Token.Type ) {
-      case TGSI_TOKEN_TYPE_DECLARATION:
-         decl = &parse.FullToken.FullDeclaration;
-         if( decl->Declaration.File == TGSI_FILE_INPUT ) {
-            unsigned first, last, mask;
-            unsigned attrib;
-
-            first = decl->Range.First;
-            last = decl->Range.Last;
-            mask = decl->Declaration.UsageMask;
-
-            for( attrib = first; attrib <= last; ++attrib ) {
-               bld->mask[1 + attrib] = mask;
-
-               /* XXX: have mesa set INTERP_CONSTANT in the fragment
-                * shader.
-                */
-               if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
-                   flatshade)
-                  bld->mode[1 + attrib] = TGSI_INTERPOLATE_CONSTANT;
-               else
-                  bld->mode[1 + attrib] = decl->Declaration.Interpolate;
-            }
-
-            bld->num_attribs = MAX2(bld->num_attribs, 1 + last + 1);
-         }
-         break;
-
-      case TGSI_TOKEN_TYPE_INSTRUCTION:
-      case TGSI_TOKEN_TYPE_IMMEDIATE:
-      case TGSI_TOKEN_TYPE_PROPERTY:
-         break;
-
-      default:
-         assert( 0 );
-      }
-   }
-   tgsi_parse_free( &parse );
-
-   coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr);
-
-   pos_init(bld, x0, y0);
-
-   attribs_init(bld);
-}
-
-
-/**
- * Advance the position and inputs to the given quad within the block.
- */
-void
-lp_build_interp_soa_update(struct lp_build_interp_soa_context *bld,
-                           int quad_index)
-{
-   assert(quad_index < 4);
-
-   pos_update(bld, quad_index);
-
-   attribs_update(bld, quad_index);
-}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_interp.h b/src/gallium/auxiliary/gallivm/lp_bld_interp.h
deleted file mode 100644
index ca958cdf34..0000000000
--- a/src/gallium/auxiliary/gallivm/lp_bld_interp.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * Position and shader input interpolation.
- *
- * Special attention is given to the interpolation of side by side quads.
- * Multiplications are made only for the first quad. Interpolation of
- * inputs for posterior quads are done exclusively with additions, and
- * perspective divide if necessary.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-#ifndef LP_BLD_INTERP_H
-#define LP_BLD_INTERP_H
-
-
-#include <llvm-c/Core.h>
-
-#include "tgsi/tgsi_exec.h"
-
-#include "lp_bld_type.h"
-
-
-struct tgsi_token;
-
-
-struct lp_build_interp_soa_context
-{
-   struct lp_build_context base;
-
-   unsigned num_attribs;
-   unsigned mask[1 + PIPE_MAX_SHADER_INPUTS];
-   unsigned mode[1 + PIPE_MAX_SHADER_INPUTS];
-
-   LLVMValueRef a0  [1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
-   LLVMValueRef dadx[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
-   LLVMValueRef dady[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
-
-   /* Attribute values before perspective divide */
-   LLVMValueRef attribs_pre[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
-
-   LLVMValueRef attribs[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
-
-   /*
-    * Convenience pointers. Callers may access this one.
-    */
-   const LLVMValueRef *pos;
-   const LLVMValueRef (*inputs)[NUM_CHANNELS];
-};
-
-
-void
-lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
-                         const struct tgsi_token *tokens,
-                         boolean flatshade,
-                         LLVMBuilderRef builder,
-                         struct lp_type type,
-                         LLVMValueRef a0_ptr,
-                         LLVMValueRef dadx_ptr,
-                         LLVMValueRef dady_ptr,
-                         LLVMValueRef x0,
-                         LLVMValueRef y0);
-
-void
-lp_build_interp_soa_update(struct lp_build_interp_soa_context *bld,
-                           int quad_index);
-
-
-#endif /* LP_BLD_INTERP_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_intr.h b/src/gallium/auxiliary/gallivm/lp_bld_intr.h
index f813f27074..977f767322 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_intr.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.h
@@ -37,7 +37,7 @@
 #define LP_BLD_INTR_H
 
 
-#include <llvm-c/Core.h>  
+#include "gallivm/lp_bld.h"
 
 
 /**
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
index 2726747eae..a3b6970116 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -42,6 +42,26 @@
 #include "lp_bld_logic.h"
 
 
+/*
+ * XXX
+ *
+ * Selection with vector conditional like
+ *
+ *    select <4 x i1> %C, %A, %B
+ *
+ * is valid IR (e.g. llvm/test/Assembler/vector-select.ll), but it is not
+ * supported on any backend.
+ *
+ * Expanding the boolean vector to full SIMD register width, as in
+ *
+ *    sext <4 x i1> %C to <4 x i32>
+ *
+ * is valid and supported (e.g., llvm/test/CodeGen/X86/vec_compare.ll), but
+ * it causes assertion failures in LLVM 2.6. It appears to work correctly on 
+ * LLVM 2.7.
+ */
+
+
 /**
  * Build code to compare two values 'a' and 'b' of 'type' using the given func.
  * \param func  one of PIPE_FUNC_x
@@ -54,13 +74,11 @@ lp_build_compare(LLVMBuilderRef builder,
                  LLVMValueRef a,
                  LLVMValueRef b)
 {
-   LLVMTypeRef vec_type = lp_build_vec_type(type);
    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
    LLVMValueRef zeros = LLVMConstNull(int_vec_type);
    LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);
    LLVMValueRef cond;
    LLVMValueRef res;
-   unsigned i;
 
    assert(func >= PIPE_FUNC_NEVER);
    assert(func <= PIPE_FUNC_ALWAYS);
@@ -74,10 +92,12 @@ lp_build_compare(LLVMBuilderRef builder,
 
    /* XXX: It is not clear if we should use the ordered or unordered operators */
 
+#if HAVE_LLVM < 0x0207
 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    if(type.width * type.length == 128) {
       if(type.floating && util_cpu_caps.has_sse) {
          /* float[4] comparison */
+         LLVMTypeRef vec_type = lp_build_vec_type(type);
          LLVMValueRef args[3];
          unsigned cc;
          boolean swap;
@@ -147,6 +167,7 @@ lp_build_compare(LLVMBuilderRef builder,
          const char *pcmpgt;
          LLVMValueRef args[2];
          LLVMValueRef res;
+         LLVMTypeRef vec_type = lp_build_vec_type(type);
 
          switch (type.width) {
          case 8:
@@ -172,7 +193,7 @@ lp_build_compare(LLVMBuilderRef builder,
          if(table[func].gt &&
             ((type.width == 8 && type.sign) ||
              (type.width != 8 && !type.sign))) {
-            LLVMValueRef msb = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+            LLVMValueRef msb = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
             a = LLVMBuildXor(builder, a, msb, "");
             b = LLVMBuildXor(builder, b, msb, "");
          }
@@ -198,8 +219,9 @@ lp_build_compare(LLVMBuilderRef builder,
 
          return res;
       }
-   }
+   } /* if (type.width * type.length == 128) */
 #endif
+#endif /* HAVE_LLVM < 0x0207 */
 
    if(type.floating) {
       LLVMRealPredicate op;
@@ -233,25 +255,33 @@ lp_build_compare(LLVMBuilderRef builder,
          return lp_build_undef(type);
       }
 
-#if 0
-      /* XXX: Although valid IR, no LLVM target currently support this */
+#if HAVE_LLVM >= 0x0207
       cond = LLVMBuildFCmp(builder, op, a, b, "");
-      res = LLVMBuildSelect(builder, cond, ones, zeros, "");
+      res = LLVMBuildSExt(builder, cond, int_vec_type, "");
 #else
-      debug_printf("%s: warning: using slow element-wise vector comparison\n",
-                   __FUNCTION__);
-      res = LLVMGetUndef(int_vec_type);
-      for(i = 0; i < type.length; ++i) {
-         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-         cond = LLVMBuildFCmp(builder, op,
-                              LLVMBuildExtractElement(builder, a, index, ""),
-                              LLVMBuildExtractElement(builder, b, index, ""),
-                              "");
-         cond = LLVMBuildSelect(builder, cond,
-                                LLVMConstExtractElement(ones, index),
-                                LLVMConstExtractElement(zeros, index),
-                                "");
-         res = LLVMBuildInsertElement(builder, res, cond, index, "");
+      if (type.length == 1) {
+         cond = LLVMBuildFCmp(builder, op, a, b, "");
+         res = LLVMBuildSExt(builder, cond, int_vec_type, "");
+      }
+      else {
+         unsigned i;
+
+         res = LLVMGetUndef(int_vec_type);
+
+         debug_printf("%s: warning: using slow element-wise float"
+                      " vector comparison\n", __FUNCTION__);
+         for (i = 0; i < type.length; ++i) {
+            LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+            cond = LLVMBuildFCmp(builder, op,
+                                 LLVMBuildExtractElement(builder, a, index, ""),
+                                 LLVMBuildExtractElement(builder, b, index, ""),
+                                 "");
+            cond = LLVMBuildSelect(builder, cond,
+                                   LLVMConstExtractElement(ones, index),
+                                   LLVMConstExtractElement(zeros, index),
+                                   "");
+            res = LLVMBuildInsertElement(builder, res, cond, index, "");
+         }
       }
 #endif
    }
@@ -281,25 +311,34 @@ lp_build_compare(LLVMBuilderRef builder,
          return lp_build_undef(type);
       }
 
-#if 0
-      /* XXX: Although valid IR, no LLVM target currently support this */
+#if HAVE_LLVM >= 0x0207
       cond = LLVMBuildICmp(builder, op, a, b, "");
-      res = LLVMBuildSelect(builder, cond, ones, zeros, "");
+      res = LLVMBuildSExt(builder, cond, int_vec_type, "");
 #else
-      debug_printf("%s: warning: using slow element-wise int vector comparison\n",
-                   __FUNCTION__);
-      res = LLVMGetUndef(int_vec_type);
-      for(i = 0; i < type.length; ++i) {
-         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-         cond = LLVMBuildICmp(builder, op,
-                              LLVMBuildExtractElement(builder, a, index, ""),
-                              LLVMBuildExtractElement(builder, b, index, ""),
-                              "");
-         cond = LLVMBuildSelect(builder, cond,
-                                LLVMConstExtractElement(ones, index),
-                                LLVMConstExtractElement(zeros, index),
-                                "");
-         res = LLVMBuildInsertElement(builder, res, cond, index, "");
+      if (type.length == 1) {
+         cond = LLVMBuildICmp(builder, op, a, b, "");
+         res = LLVMBuildSExt(builder, cond, int_vec_type, "");
+      }
+      else {
+         unsigned i;
+
+         res = LLVMGetUndef(int_vec_type);
+
+         debug_printf("%s: warning: using slow element-wise int"
+                      " vector comparison\n", __FUNCTION__);
+
+         for(i = 0; i < type.length; ++i) {
+            LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+            cond = LLVMBuildICmp(builder, op,
+                                 LLVMBuildExtractElement(builder, a, index, ""),
+                                 LLVMBuildExtractElement(builder, b, index, ""),
+                                 "");
+            cond = LLVMBuildSelect(builder, cond,
+                                   LLVMConstExtractElement(ones, index),
+                                   LLVMConstExtractElement(zeros, index),
+                                   "");
+            res = LLVMBuildInsertElement(builder, res, cond, index, "");
+         }
       }
 #endif
    }
@@ -326,6 +365,8 @@ lp_build_cmp(struct lp_build_context *bld,
 
 /**
  * Return mask ? a : b;
+ *
+ * mask is a bitwise mask, composed of 0 or ~0 for each element.
  */
 LLVMValueRef
 lp_build_select(struct lp_build_context *bld,
@@ -339,26 +380,32 @@ lp_build_select(struct lp_build_context *bld,
    if(a == b)
       return a;
 
-   if(type.floating) {
-      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
-      a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
-      b = LLVMBuildBitCast(bld->builder, b, int_vec_type, "");
+   if (type.length == 1) {
+      mask = LLVMBuildTrunc(bld->builder, mask, LLVMInt1Type(), "");
+      res = LLVMBuildSelect(bld->builder, mask, a, b, "");
    }
+   else {
+      if(type.floating) {
+         LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+         a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+         b = LLVMBuildBitCast(bld->builder, b, int_vec_type, "");
+      }
 
-   a = LLVMBuildAnd(bld->builder, a, mask, "");
+      a = LLVMBuildAnd(bld->builder, a, mask, "");
 
-   /* This often gets translated to PANDN, but sometimes the NOT is
-    * pre-computed and stored in another constant. The best strategy depends
-    * on available registers, so it is not a big deal -- hopefully LLVM does
-    * the right decision attending the rest of the program.
-    */
-   b = LLVMBuildAnd(bld->builder, b, LLVMBuildNot(bld->builder, mask, ""), "");
+      /* This often gets translated to PANDN, but sometimes the NOT is
+       * pre-computed and stored in another constant. The best strategy depends
+       * on available registers, so it is not a big deal -- hopefully LLVM does
+       * the right decision attending the rest of the program.
+       */
+      b = LLVMBuildAnd(bld->builder, b, LLVMBuildNot(bld->builder, mask, ""), "");
 
-   res = LLVMBuildOr(bld->builder, a, b, "");
+      res = LLVMBuildOr(bld->builder, a, b, "");
 
-   if(type.floating) {
-      LLVMTypeRef vec_type = lp_build_vec_type(type);
-      res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
+      if(type.floating) {
+         LLVMTypeRef vec_type = lp_build_vec_type(type);
+         res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
+      }
    }
 
    return res;
@@ -436,3 +483,13 @@ lp_build_alloca(struct lp_build_context *bld)
       return LLVMBuildAlloca(bld->builder, lp_build_elem_type(type), "");
    }
 }
+
+
+/** Return (a & ~b) */
+LLVMValueRef
+lp_build_andc(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b)
+{
+   b = LLVMBuildNot(bld->builder, b, "");
+   b = LLVMBuildAnd(bld->builder, a, b, "");
+   return b;
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.h b/src/gallium/auxiliary/gallivm/lp_bld_logic.h
index a399ebf39e..00a8c75019 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.h
@@ -37,7 +37,7 @@
 #define LP_BLD_LOGIC_H
 
 
-#include <llvm-c/Core.h>  
+#include "gallivm/lp_bld.h"
 
 #include "pipe/p_defines.h" /* For PIPE_FUNC_xxx */
 
@@ -79,4 +79,9 @@ lp_build_select_aos(struct lp_build_context *bld,
 LLVMValueRef
 lp_build_alloca(struct lp_build_context *bld);
 
+
+LLVMValueRef
+lp_build_andc(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b);
+
+
 #endif /* !LP_BLD_LOGIC_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
index bc360ad77a..186f8849b8 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@@ -164,7 +164,7 @@ lp_build_unpack2(LLVMBuilderRef builder,
 
    if(dst_type.sign && src_type.sign) {
       /* Replicate the sign bit in the most significant bits */
-      msb = LLVMBuildAShr(builder, src, lp_build_int_const_scalar(src_type, src_type.width - 1), "");
+      msb = LLVMBuildAShr(builder, src, lp_build_const_int_vec(src_type, src_type.width - 1), "");
    }
    else
       /* Most significant bits always zero */
@@ -256,13 +256,13 @@ lp_build_pack2(LLVMBuilderRef builder,
                LLVMValueRef lo,
                LLVMValueRef hi)
 {
+#if HAVE_LLVM < 0x0207
    LLVMTypeRef src_vec_type = lp_build_vec_type(src_type);
+#endif
    LLVMTypeRef dst_vec_type = lp_build_vec_type(dst_type);
    LLVMValueRef shuffle;
    LLVMValueRef res;
 
-   dst_vec_type = lp_build_vec_type(dst_type);
-
    assert(!src_type.floating);
    assert(!dst_type.floating);
    assert(src_type.width == dst_type.width * 2);
@@ -272,11 +272,14 @@ lp_build_pack2(LLVMBuilderRef builder,
       switch(src_type.width) {
       case 32:
          if(dst_type.sign) {
+#if HAVE_LLVM >= 0x0207
+            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", dst_vec_type, lo, hi);
+#else
             res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", src_vec_type, lo, hi);
+#endif
          }
          else {
             if (util_cpu_caps.has_sse4_1) {
-               /* PACKUSDW is the only instrinsic with a consistent signature */
                return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi);
             }
             else {
@@ -288,9 +291,17 @@ lp_build_pack2(LLVMBuilderRef builder,
 
       case 16:
          if(dst_type.sign)
+#if HAVE_LLVM >= 0x0207
+            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", dst_vec_type, lo, hi);
+#else
             res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", src_vec_type, lo, hi);
+#endif
          else
+#if HAVE_LLVM >= 0x0207
+            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", dst_vec_type, lo, hi);
+#else
             res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", src_vec_type, lo, hi);
+#endif
          break;
 
       default:
@@ -348,7 +359,7 @@ lp_build_packs2(LLVMBuilderRef builder,
    if(clamp) {
       struct lp_build_context bld;
       unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
-      LLVMValueRef dst_max = lp_build_int_const_scalar(src_type, ((unsigned long long)1 << dst_bits) - 1);
+      LLVMValueRef dst_max = lp_build_const_int_vec(src_type, ((unsigned long long)1 << dst_bits) - 1);
       lp_build_context_init(&bld, builder, src_type);
       lo = lp_build_min(&bld, lo, dst_max);
       hi = lp_build_min(&bld, hi, dst_max);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
index fb2a34984a..41adeed220 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
@@ -37,7 +37,7 @@
 #define LP_BLD_PACK_H
 
 
-#include <llvm-c/Core.h>  
+#include "gallivm/lp_bld.h"
 
 
 struct lp_type;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_printf.c b/src/gallium/auxiliary/gallivm/lp_bld_printf.c
new file mode 100644
index 0000000000..153ba5b15b
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_printf.c
@@ -0,0 +1,121 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include <stdio.h>
+
+#include "util/u_debug.h"
+#include "util/u_memory.h"
+#include "lp_bld_printf.h"
+
+
+static int
+lp_get_printf_arg_count(const char *fmt)
+{
+   int count =0;
+   const char *p = fmt;
+   int c;
+
+   while ((c = *p++)) {
+      if (c != '%')
+         continue;
+      switch (*p) {
+         case '\0':
+	    continue;
+         case '%':
+	    p++;
+	    continue;
+	 case '.':
+	    if (p[1] == '*' && p[2] == 's') {
+	       count += 2;
+	       p += 3;
+               continue;
+	    }
+	    /* fallthrough */
+	 default:
+	    count ++;
+      }
+   }
+   return count;
+}
+
+LLVMValueRef 
+lp_build_const_string_variable(LLVMModuleRef module, const char *str, int len)
+{
+   LLVMValueRef string = LLVMAddGlobal(module, LLVMArrayType(LLVMInt8Type(), len + 1), "");
+   LLVMSetGlobalConstant(string, TRUE);
+   LLVMSetLinkage(string, LLVMInternalLinkage);
+   LLVMSetInitializer(string, LLVMConstString(str, len + 1, TRUE));
+   return string;
+}
+ 
+
+/**
+ * lp_build_printf.
+ *
+ * Build printf call in LLVM IR. The output goes to stdout.
+ * The additional variable arguments need to have type
+ * LLVMValueRef.
+ */
+LLVMValueRef
+lp_build_printf(LLVMBuilderRef builder, const char *fmt, ...)
+{
+   va_list arglist;
+   int i = 0;
+   int argcount = lp_get_printf_arg_count(fmt);
+   LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder)));
+   LLVMValueRef params[50];
+   LLVMValueRef fmtarg = lp_build_const_string_variable(module, fmt, strlen(fmt) + 1);
+   LLVMValueRef int0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   LLVMValueRef index[2];
+   LLVMValueRef func_printf = LLVMGetNamedFunction(module, "printf");
+
+   assert(Elements(params) >= argcount + 1);
+
+   index[0] = index[1] = int0;
+
+   if (!func_printf) {
+      LLVMTypeRef printf_type = LLVMFunctionType(LLVMIntType(32), NULL, 0, 1);
+      func_printf = LLVMAddFunction(module, "printf", printf_type);
+   }
+
+   params[0] = LLVMBuildGEP(builder, fmtarg, index, 2, "");
+
+   va_start(arglist, fmt);
+   for (i = 1; i <= argcount; i++) {
+      LLVMValueRef val = va_arg(arglist, LLVMValueRef);
+      LLVMTypeRef type = LLVMTypeOf(val);
+      /* printf wants doubles, so lets convert so that
+       * we can actually print them */
+      if (LLVMGetTypeKind(type) == LLVMFloatTypeKind)
+         val = LLVMBuildFPExt(builder, val, LLVMDoubleType(), "");
+      params[i] = val;
+   }
+   va_end(arglist);
+
+   return LLVMBuildCall(builder, func_printf, params, argcount + 1, "");
+}
+
diff --git a/src/gallium/auxiliary/vl/vl_csc.h b/src/gallium/auxiliary/gallivm/lp_bld_printf.h
index 722ca35f33..83bd8f1d55 100644
--- a/src/gallium/auxiliary/vl/vl_csc.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_printf.h
@@ -1,8 +1,8 @@
 /**************************************************************************
- * 
- * Copyright 2009 Younes Manton.
+ *
+ * Copyright 2010 VMware, Inc.
  * All Rights Reserved.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -10,44 +10,30 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
  **************************************************************************/
 
-#ifndef vl_csc_h
-#define vl_csc_h
+#ifndef LP_BLD_PRINTF_H
+#define LP_BLD_PRINTF_H
 
-#include <pipe/p_compiler.h>
 
-struct vl_procamp
-{
-   float brightness;
-   float contrast;
-   float saturation;
-   float hue;
-};
+#include "pipe/p_compiler.h"
+#include "lp_bld.h"
 
-enum VL_CSC_COLOR_STANDARD
-{
-   VL_CSC_COLOR_STANDARD_IDENTITY,
-   VL_CSC_COLOR_STANDARD_BT_601,
-   VL_CSC_COLOR_STANDARD_BT_709
-};
+LLVMValueRef lp_build_const_string_variable(LLVMModuleRef module, const char *str, int len);
+LLVMValueRef lp_build_printf(LLVMBuilderRef builder, const char *fmt, ...);
 
-void vl_csc_get_matrix(enum VL_CSC_COLOR_STANDARD cs,
-                       struct vl_procamp *procamp,
-                       bool full_range,
-                       float *matrix);
+#endif
 
-#endif /* vl_csc_h */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 6a026e468e..eb75b9b393 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -51,9 +51,11 @@
  */
 void
 lp_sampler_static_state(struct lp_sampler_static_state *state,
-                        const struct pipe_texture *texture,
+                        const struct pipe_sampler_view *view,
                         const struct pipe_sampler_state *sampler)
 {
+   const struct pipe_resource *texture = view->texture;
+
    memset(state, 0, sizeof *state);
 
    if(!texture)
@@ -62,7 +64,19 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
    if(!sampler)
       return;
 
-   state->format            = texture->format;
+   /*
+    * We don't copy sampler state over unless it is actually enabled, to avoid
+    * spurious recompiles, as the sampler static state is part of the shader
+    * key.
+    *
+    * Ideally the state tracker or cso_cache module would make all state
+    * canonical, but until that happens it's better to be safe than sorry here.
+    *
+    * XXX: Actually there's much more than can be done here, especially
+    * regarding 1D/2D/3D/CUBE textures, wrap modes, etc.
+    */
+
+   state->format            = view->format;
    state->target            = texture->target;
    state->pot_width         = util_is_pot(texture->width0);
    state->pot_height        = util_is_pot(texture->height0);
@@ -72,10 +86,18 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
    state->wrap_t            = sampler->wrap_t;
    state->wrap_r            = sampler->wrap_r;
    state->min_img_filter    = sampler->min_img_filter;
-   state->min_mip_filter    = sampler->min_mip_filter;
    state->mag_img_filter    = sampler->mag_img_filter;
+   if (texture->last_level) {
+      state->min_mip_filter = sampler->min_mip_filter;
+   } else {
+      state->min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
+   }
+
    state->compare_mode      = sampler->compare_mode;
-   state->compare_func      = sampler->compare_func;
+   if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) {
+      state->compare_func   = sampler->compare_func;
+   }
+
    state->normalized_coords = sampler->normalized_coords;
    state->lod_bias          = sampler->lod_bias;
    state->min_lod           = sampler->min_lod;
@@ -84,6 +106,10 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
    state->border_color[1]   = sampler->border_color[1];
    state->border_color[2]   = sampler->border_color[2];
    state->border_color[3]   = sampler->border_color[3];
+
+   /*
+    * FIXME: Handle the remainder of pipe_sampler_view.
+    */
 }
 
 
@@ -137,22 +163,24 @@ lp_build_gather(LLVMBuilderRef builder,
 
 
 /**
- * Compute the offset of a pixel.
+ * Compute the offset of a pixel block.
  *
- * x, y, y_stride are vectors
+ * x, y, z, y_stride, z_stride are vectors, and they refer to pixel blocks, as
+ * per format description, and not individual pixels.
  */
 LLVMValueRef
 lp_build_sample_offset(struct lp_build_context *bld,
                        const struct util_format_description *format_desc,
                        LLVMValueRef x,
                        LLVMValueRef y,
+                       LLVMValueRef z,
                        LLVMValueRef y_stride,
-                       LLVMValueRef data_ptr)
+                       LLVMValueRef z_stride)
 {
    LLVMValueRef x_stride;
    LLVMValueRef offset;
 
-   x_stride = lp_build_const_scalar(bld->type, format_desc->block.bits/8);
+   x_stride = lp_build_const_vec(bld->type, format_desc->block.bits/8);
 
    if(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
       LLVMValueRef x_lo, x_hi;
@@ -163,6 +191,10 @@ lp_build_sample_offset(struct lp_build_context *bld,
       LLVMValueRef y_offset_lo, y_offset_hi;
       LLVMValueRef offset_lo, offset_hi;
 
+      /* XXX 1D & 3D addressing not done yet */
+      assert(!z);
+      assert(!z_stride);
+
       x_lo = LLVMBuildAnd(bld->builder, x, bld->one, "");
       y_lo = LLVMBuildAnd(bld->builder, y, bld->one, "");
 
@@ -170,9 +202,9 @@ lp_build_sample_offset(struct lp_build_context *bld,
       y_hi = LLVMBuildLShr(bld->builder, y, bld->one, "");
 
       x_stride_lo = x_stride;
-      y_stride_lo = lp_build_const_scalar(bld->type, 2*format_desc->block.bits/8);
+      y_stride_lo = lp_build_const_vec(bld->type, 2*format_desc->block.bits/8);
 
-      x_stride_hi = lp_build_const_scalar(bld->type, 4*format_desc->block.bits/8);
+      x_stride_hi = lp_build_const_vec(bld->type, 4*format_desc->block.bits/8);
       y_stride_hi = LLVMBuildShl(bld->builder, y_stride, bld->one, "");
 
       x_offset_lo = lp_build_mul(bld, x_lo, x_stride_lo);
@@ -186,13 +218,17 @@ lp_build_sample_offset(struct lp_build_context *bld,
       offset = lp_build_add(bld, offset_hi, offset_lo);
    }
    else {
-      LLVMValueRef x_offset;
-      LLVMValueRef y_offset;
+      offset = lp_build_mul(bld, x, x_stride);
 
-      x_offset = lp_build_mul(bld, x, x_stride);
-      y_offset = lp_build_mul(bld, y, y_stride);
+      if (y && y_stride) {
+         LLVMValueRef y_offset = lp_build_mul(bld, y, y_stride);
+         offset = lp_build_add(bld, offset, y_offset);
+      }
 
-      offset = lp_build_add(bld, x_offset, y_offset);
+      if (z && z_stride) {
+         LLVMValueRef z_offset = lp_build_mul(bld, z, z_stride);
+         offset = lp_build_add(bld, offset, z_offset);
+      }
    }
 
    return offset;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index 5ba0925bb6..8c1af95c50 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -36,9 +36,10 @@
 #define LP_BLD_SAMPLE_H
 
 
-#include <llvm-c/Core.h>
+#include "gallivm/lp_bld.h"
 
-struct pipe_texture;
+struct pipe_resource;
+struct pipe_sampler_view;
 struct pipe_sampler_state;
 struct util_format_description;
 struct lp_type;
@@ -48,14 +49,14 @@ struct lp_build_context;
 /**
  * Sampler static state.
  *
- * These are the bits of state from pipe_texture and pipe_sampler_state that
+ * These are the bits of state from pipe_resource and pipe_sampler_state that
  * are embedded in the generated code.
  */
 struct lp_sampler_static_state
 {
    /* pipe_texture's state */
    enum pipe_format format;
-   unsigned target:2;
+   unsigned target:3;
    unsigned pot_width:1;
    unsigned pot_height:1;
    unsigned pot_depth:1;
@@ -78,7 +79,7 @@ struct lp_sampler_static_state
 /**
  * Sampler dynamic state.
  *
- * These are the bits of state from pipe_texture and pipe_sampler_state that
+ * These are the bits of state from pipe_resource and pipe_sampler_state that
  * are computed in runtime.
  *
  * There are obtained through callbacks, as we don't want to tie the texture
@@ -113,9 +114,9 @@ struct lp_sampler_dynamic_state
                   unsigned unit);
 
    LLVMValueRef
-   (*stride)( struct lp_sampler_dynamic_state *state,
-              LLVMBuilderRef builder,
-              unsigned unit);
+   (*row_stride)( struct lp_sampler_dynamic_state *state,
+                  LLVMBuilderRef builder,
+                  unsigned unit);
 
    LLVMValueRef
    (*data_ptr)( struct lp_sampler_dynamic_state *state,
@@ -130,7 +131,7 @@ struct lp_sampler_dynamic_state
  */
 void
 lp_sampler_static_state(struct lp_sampler_static_state *state,
-                        const struct pipe_texture *texture,
+                        const struct pipe_sampler_view *view,
                         const struct pipe_sampler_state *sampler);
 
 
@@ -148,8 +149,9 @@ lp_build_sample_offset(struct lp_build_context *bld,
                        const struct util_format_description *format_desc,
                        LLVMValueRef x,
                        LLVMValueRef y,
+                       LLVMValueRef z,
                        LLVMValueRef y_stride,
-                       LLVMValueRef data_ptr);
+                       LLVMValueRef z_stride);
 
 
 void
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 9058f76c1d..767429ba06 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -48,6 +48,7 @@
 #include "lp_bld_logic.h"
 #include "lp_bld_swizzle.h"
 #include "lp_bld_pack.h"
+#include "lp_bld_flow.h"
 #include "lp_bld_format.h"
 #include "lp_bld_sample.h"
 
@@ -65,6 +66,14 @@ struct lp_build_sample_context
 
    const struct util_format_description *format_desc;
 
+   /** regular scalar float type */
+   struct lp_type float_type;
+   struct lp_build_context float_bld;
+
+   /** regular scalar float type */
+   struct lp_type int_type;
+   struct lp_build_context int_bld;
+
    /** Incoming coordinates type and build context */
    struct lp_type coord_type;
    struct lp_build_context coord_bld;
@@ -108,9 +117,78 @@ wrap_mode_uses_border_color(unsigned mode)
 }
 
 
+static LLVMValueRef
+lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
+                          LLVMValueRef data_array, LLVMValueRef level)
+{
+   LLVMValueRef indexes[2], data_ptr;
+   indexes[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   indexes[1] = level;
+   data_ptr = LLVMBuildGEP(bld->builder, data_array, indexes, 2, "");
+   data_ptr = LLVMBuildLoad(bld->builder, data_ptr, "");
+   return data_ptr;
+}
+
+
+static LLVMValueRef
+lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld,
+                                LLVMValueRef data_array, int level)
+{
+   LLVMValueRef lvl = LLVMConstInt(LLVMInt32Type(), level, 0);
+   return lp_build_get_mipmap_level(bld, data_array, lvl);
+}
+
 
 /**
- * Gen code to fetch a texel from a texture at int coords (x, y).
+ * Dereference stride_array[mipmap_level] array to get a stride.
+ * Return stride as a vector.
+ */
+static LLVMValueRef
+lp_build_get_level_stride_vec(struct lp_build_sample_context *bld,
+                              LLVMValueRef stride_array, LLVMValueRef level)
+{
+   LLVMValueRef indexes[2], stride;
+   indexes[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   indexes[1] = level;
+   stride = LLVMBuildGEP(bld->builder, stride_array, indexes, 2, "");
+   stride = LLVMBuildLoad(bld->builder, stride, "");
+   stride = lp_build_broadcast_scalar(&bld->int_coord_bld, stride);
+   return stride;
+}
+
+
+/** Dereference stride_array[0] array to get a stride (as vector). */
+static LLVMValueRef
+lp_build_get_const_level_stride_vec(struct lp_build_sample_context *bld,
+                                    LLVMValueRef stride_array, int level)
+{
+   LLVMValueRef lvl = LLVMConstInt(LLVMInt32Type(), level, 0);
+   return lp_build_get_level_stride_vec(bld, stride_array, lvl);
+}
+
+
+static int
+texture_dims(enum pipe_texture_target tex)
+{
+   switch (tex) {
+   case PIPE_TEXTURE_1D:
+      return 1;
+   case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_CUBE:
+      return 2;
+   case PIPE_TEXTURE_3D:
+      return 3;
+   default:
+      assert(0 && "bad texture target in texture_dims()");
+      return 2;
+   }
+}
+
+
+
+/**
+ * Generate code to fetch a texel from a texture at int coords (x, y, z).
+ * The computation depends on whether the texture is 1D, 2D or 3D.
  * The result, texel, will be:
  *   texel[0] = red values
  *   texel[1] = green values
@@ -121,15 +199,19 @@ static void
 lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
                           LLVMValueRef width,
                           LLVMValueRef height,
+                          LLVMValueRef depth,
                           LLVMValueRef x,
                           LLVMValueRef y,
+                          LLVMValueRef z,
                           LLVMValueRef y_stride,
+                          LLVMValueRef z_stride,
                           LLVMValueRef data_ptr,
                           LLVMValueRef *texel)
 {
+   const int dims = texture_dims(bld->static_state->target);
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
    LLVMValueRef offset;
-   LLVMValueRef packed;
+   LLVMValueRef i, j;
    LLVMValueRef use_border = NULL;
 
    /* use_border = x < 0 || x >= width || y < 0 || y >= height */
@@ -140,7 +222,7 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
       use_border = LLVMBuildOr(bld->builder, b1, b2, "b1_or_b2");
    }
 
-   if (wrap_mode_uses_border_color(bld->static_state->wrap_t)) {
+   if (dims >= 2 && wrap_mode_uses_border_color(bld->static_state->wrap_t)) {
       LLVMValueRef b1, b2;
       b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
       b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
@@ -153,6 +235,56 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
       }
    }
 
+   if (dims == 3 && wrap_mode_uses_border_color(bld->static_state->wrap_r)) {
+      LLVMValueRef b1, b2;
+      b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
+      b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
+      if (use_border) {
+         use_border = LLVMBuildOr(bld->builder, use_border, b1, "ub_or_b1");
+         use_border = LLVMBuildOr(bld->builder, use_border, b2, "ub_or_b2");
+      }
+      else {
+         use_border = LLVMBuildOr(bld->builder, b1, b2, "b1_or_b2");
+      }
+   }
+
+   /*
+    * Describe the coordinates in terms of pixel blocks.
+    *
+    * TODO: pixel blocks are power of two. LLVM should convert rem/div to
+    * bit arithmetic. Verify this.
+    */
+
+   if (bld->format_desc->block.width == 1) {
+      i = bld->uint_coord_bld.zero;
+   }
+   else {
+      LLVMValueRef block_width = lp_build_const_int_vec(bld->uint_coord_bld.type, bld->format_desc->block.width);
+      i = LLVMBuildURem(bld->builder, x, block_width, "");
+      x = LLVMBuildUDiv(bld->builder, x, block_width, "");
+   }
+
+   if (bld->format_desc->block.height == 1) {
+      j = bld->uint_coord_bld.zero;
+   }
+   else {
+      LLVMValueRef block_height = lp_build_const_int_vec(bld->uint_coord_bld.type, bld->format_desc->block.height);
+      j = LLVMBuildURem(bld->builder, y, block_height, "");
+      y = LLVMBuildUDiv(bld->builder, y, block_height, "");
+   }
+
+   /* convert x,y,z coords to linear offset from start of texture, in bytes */
+   offset = lp_build_sample_offset(&bld->uint_coord_bld,
+                                   bld->format_desc,
+                                   x, y, z, y_stride, z_stride);
+
+   lp_build_fetch_rgba_soa(bld->builder,
+                           bld->format_desc,
+                           bld->texel_type,
+                           data_ptr, offset,
+                           i, j,
+                           texel);
+
    /*
     * Note: if we find an app which frequently samples the texture border
     * we might want to implement a true conditional here to avoid sampling
@@ -168,35 +300,12 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
     * the texel color results with the border color.
     */
 
-   /* convert x,y coords to linear offset from start of texture, in bytes */
-   offset = lp_build_sample_offset(&bld->uint_coord_bld,
-                                   bld->format_desc,
-                                   x, y, y_stride,
-                                   data_ptr);
-
-   assert(bld->format_desc->block.width == 1);
-   assert(bld->format_desc->block.height == 1);
-   assert(bld->format_desc->block.bits <= bld->texel_type.width);
-
-   /* gather the texels from the texture */
-   packed = lp_build_gather(bld->builder,
-                            bld->texel_type.length,
-                            bld->format_desc->block.bits,
-                            bld->texel_type.width,
-                            data_ptr, offset);
-
-   /* convert texels to float rgba */
-   lp_build_unpack_rgba_soa(bld->builder,
-                            bld->format_desc,
-                            bld->texel_type,
-                            packed, texel);
-
    if (use_border) {
       /* select texel color or border color depending on use_border */
       int chan;
       for (chan = 0; chan < 4; chan++) {
          LLVMValueRef border_chan =
-            lp_build_const_scalar(bld->texel_type,
+            lp_build_const_vec(bld->texel_type,
                                   bld->static_state->border_color[chan]);
          texel[chan] = lp_build_select(&bld->texel_bld, use_border,
                                        border_chan, texel[chan]);
@@ -210,19 +319,22 @@ lp_build_sample_packed(struct lp_build_sample_context *bld,
                        LLVMValueRef x,
                        LLVMValueRef y,
                        LLVMValueRef y_stride,
-                       LLVMValueRef data_ptr)
+                       LLVMValueRef data_array)
 {
    LLVMValueRef offset;
+   LLVMValueRef data_ptr;
 
    offset = lp_build_sample_offset(&bld->uint_coord_bld,
                                    bld->format_desc,
-                                   x, y, y_stride,
-                                   data_ptr);
+                                   x, y, NULL, y_stride, NULL);
 
    assert(bld->format_desc->block.width == 1);
    assert(bld->format_desc->block.height == 1);
    assert(bld->format_desc->block.bits <= bld->texel_type.width);
 
+   /* get pointer to mipmap level 0 data */
+   data_ptr = lp_build_get_const_mipmap_level(bld, data_array, 0);
+
    return lp_build_gather(bld->builder,
                           bld->texel_type.length,
                           bld->format_desc->block.bits,
@@ -358,8 +470,8 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
    struct lp_build_context *coord_bld = &bld->coord_bld;
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
    struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
-   LLVMValueRef two = lp_build_const_scalar(coord_bld->type, 2.0);
-   LLVMValueRef half = lp_build_const_scalar(coord_bld->type, 0.5);
+   LLVMValueRef two = lp_build_const_vec(coord_bld->type, 2.0);
+   LLVMValueRef half = lp_build_const_vec(coord_bld->type, 0.5);
    LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
    LLVMValueRef length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one);
    LLVMValueRef length_f_minus_one = lp_build_sub(coord_bld, length_f, coord_bld->one);
@@ -413,7 +525,7 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
       else {
          LLVMValueRef min, max;
          /* clamp to [0.5, length - 0.5] */
-         min = lp_build_const_scalar(coord_bld->type, 0.5F);
+         min = lp_build_const_vec(coord_bld->type, 0.5F);
          max = lp_build_sub(coord_bld, length_f, min);
          coord = lp_build_clamp(coord_bld, coord, min, max);
       }
@@ -434,7 +546,7 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
          if (bld->static_state->normalized_coords) {
             /* min = -1.0 / (2 * length) = -0.5 / length */
             min = lp_build_mul(coord_bld,
-                               lp_build_const_scalar(coord_bld->type, -0.5F),
+                               lp_build_const_vec(coord_bld->type, -0.5F),
                                lp_build_rcp(coord_bld, length_f));
             /* max = 1.0 - min */
             max = lp_build_sub(coord_bld, coord_bld->one, min);
@@ -446,7 +558,7 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
          }
          else {
             /* clamp to [-0.5, length + 0.5] */
-            min = lp_build_const_scalar(coord_bld->type, -0.5F);
+            min = lp_build_const_vec(coord_bld->type, -0.5F);
             max = lp_build_sub(coord_bld, length_f, min);
             coord = lp_build_clamp(coord_bld, coord, min, max);
             coord = lp_build_sub(coord_bld, coord, half);
@@ -521,7 +633,7 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
          LLVMValueRef min, max;
          /* min = -1.0 / (2 * length) = -0.5 / length */
          min = lp_build_mul(coord_bld,
-                            lp_build_const_scalar(coord_bld->type, -0.5F),
+                            lp_build_const_vec(coord_bld->type, -0.5F),
                             lp_build_rcp(coord_bld, length_f));
          /* max = 1.0 - min */
          max = lp_build_sub(coord_bld, coord_bld->one, min);
@@ -566,7 +678,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
    struct lp_build_context *coord_bld = &bld->coord_bld;
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
    struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
-   LLVMValueRef two = lp_build_const_scalar(coord_bld->type, 2.0);
+   LLVMValueRef two = lp_build_const_vec(coord_bld->type, 2.0);
    LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
    LLVMValueRef length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one);
    LLVMValueRef length_f_minus_one = lp_build_sub(coord_bld, length_f, coord_bld->one);
@@ -609,7 +721,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
          }
          else {
             /* clamp to [0.5, length - 0.5] */
-            min = lp_build_const_scalar(coord_bld->type, 0.5F);
+            min = lp_build_const_vec(coord_bld->type, 0.5F);
             max = lp_build_sub(coord_bld, length_f, min);
          }
          /* coord = clamp(coord, min, max) */
@@ -625,7 +737,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
          if (bld->static_state->normalized_coords) {
             /* min = -1.0 / (2 * length) = -0.5 / length */
             min = lp_build_mul(coord_bld,
-                               lp_build_const_scalar(coord_bld->type, -0.5F),
+                               lp_build_const_vec(coord_bld->type, -0.5F),
                                lp_build_rcp(coord_bld, length_f));
             /* max = length - min */
             max = lp_build_sub(coord_bld, length_f, min);
@@ -634,7 +746,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
          }
          else {
             /* clamp to [-0.5, length + 0.5] */
-            min = lp_build_const_scalar(coord_bld->type, -0.5F);
+            min = lp_build_const_vec(coord_bld->type, -0.5F);
             max = lp_build_sub(coord_bld, length_f, min);
          }
          /* coord = clamp(coord, min, max) */
@@ -711,83 +823,905 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
 
 
 /**
- * Sample 2D texture with nearest filtering.
+ * Codegen equivalent for u_minify().
+ * Return max(1, base_size >> level);
+ */
+static LLVMValueRef
+lp_build_minify(struct lp_build_sample_context *bld,
+                LLVMValueRef base_size,
+                LLVMValueRef level)
+{
+   LLVMValueRef size = LLVMBuildAShr(bld->builder, base_size, level, "minify");
+   size = lp_build_max(&bld->int_coord_bld, size, bld->int_coord_bld.one);
+   return size;
+}
+
+
+/**
+ * Generate code to compute texture level of detail (lambda).
+ * \param s  vector of texcoord s values
+ * \param t  vector of texcoord t values
+ * \param r  vector of texcoord r values
+ * \param width  scalar int texture width
+ * \param height  scalar int texture height
+ * \param depth  scalar int texture depth
+ */
+static LLVMValueRef
+lp_build_lod_selector(struct lp_build_sample_context *bld,
+                      LLVMValueRef s,
+                      LLVMValueRef t,
+                      LLVMValueRef r,
+                      LLVMValueRef width,
+                      LLVMValueRef height,
+                      LLVMValueRef depth)
+
+{
+   if (bld->static_state->min_lod == bld->static_state->max_lod) {
+      /* User is forcing sampling from a particular mipmap level.
+       * This is hit during mipmap generation.
+       */
+      return LLVMConstReal(LLVMFloatType(), bld->static_state->min_lod);
+   }
+   else {
+      const int dims = texture_dims(bld->static_state->target);
+      struct lp_build_context *float_bld = &bld->float_bld;
+      LLVMValueRef lod_bias = LLVMConstReal(LLVMFloatType(),
+                                            bld->static_state->lod_bias);
+      LLVMValueRef min_lod = LLVMConstReal(LLVMFloatType(),
+                                           bld->static_state->min_lod);
+      LLVMValueRef max_lod = LLVMConstReal(LLVMFloatType(),
+                                           bld->static_state->max_lod);
+
+      LLVMValueRef index0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+      LLVMValueRef index1 = LLVMConstInt(LLVMInt32Type(), 1, 0);
+      LLVMValueRef index2 = LLVMConstInt(LLVMInt32Type(), 2, 0);
+
+      LLVMValueRef s0, s1, s2;
+      LLVMValueRef t0, t1, t2;
+      LLVMValueRef r0, r1, r2;
+      LLVMValueRef dsdx, dsdy, dtdx, dtdy, drdx, drdy;
+      LLVMValueRef rho, lod;
+
+      /*
+       * dsdx = abs(s[1] - s[0]);
+       * dsdy = abs(s[2] - s[0]);
+       * dtdx = abs(t[1] - t[0]);
+       * dtdy = abs(t[2] - t[0]);
+       * drdx = abs(r[1] - r[0]);
+       * drdy = abs(r[2] - r[0]);
+       * XXX we're assuming a four-element quad in 2x2 layout here.
+       */
+      s0 = LLVMBuildExtractElement(bld->builder, s, index0, "s0");
+      s1 = LLVMBuildExtractElement(bld->builder, s, index1, "s1");
+      s2 = LLVMBuildExtractElement(bld->builder, s, index2, "s2");
+      dsdx = LLVMBuildSub(bld->builder, s1, s0, "");
+      dsdx = lp_build_abs(float_bld, dsdx);
+      dsdy = LLVMBuildSub(bld->builder, s2, s0, "");
+      dsdy = lp_build_abs(float_bld, dsdy);
+      if (dims > 1) {
+         t0 = LLVMBuildExtractElement(bld->builder, t, index0, "t0");
+         t1 = LLVMBuildExtractElement(bld->builder, t, index1, "t1");
+         t2 = LLVMBuildExtractElement(bld->builder, t, index2, "t2");
+         dtdx = LLVMBuildSub(bld->builder, t1, t0, "");
+         dtdx = lp_build_abs(float_bld, dtdx);
+         dtdy = LLVMBuildSub(bld->builder, t2, t0, "");
+         dtdy = lp_build_abs(float_bld, dtdy);
+         if (dims > 2) {
+            r0 = LLVMBuildExtractElement(bld->builder, r, index0, "r0");
+            r1 = LLVMBuildExtractElement(bld->builder, r, index1, "r1");
+            r2 = LLVMBuildExtractElement(bld->builder, r, index2, "r2");
+            drdx = LLVMBuildSub(bld->builder, r1, r0, "");
+            drdx = lp_build_abs(float_bld, drdx);
+            drdy = LLVMBuildSub(bld->builder, r2, r0, "");
+            drdy = lp_build_abs(float_bld, drdy);
+         }
+      }
+
+      /* Compute rho = max of all partial derivatives scaled by texture size.
+       * XXX this could be vectorized somewhat
+       */
+      rho = LLVMBuildMul(bld->builder,
+                         lp_build_max(float_bld, dsdx, dsdy),
+                         lp_build_int_to_float(float_bld, width), "");
+      if (dims > 1) {
+         LLVMValueRef max;
+         max = LLVMBuildMul(bld->builder,
+                            lp_build_max(float_bld, dtdx, dtdy),
+                            lp_build_int_to_float(float_bld, height), "");
+         rho = lp_build_max(float_bld, rho, max);
+         if (dims > 2) {
+            max = LLVMBuildMul(bld->builder,
+                               lp_build_max(float_bld, drdx, drdy),
+                               lp_build_int_to_float(float_bld, depth), "");
+            rho = lp_build_max(float_bld, rho, max);
+         }
+      }
+
+      /* compute lod = log2(rho) */
+      lod = lp_build_log2(float_bld, rho);
+
+      /* add lod bias */
+      lod = LLVMBuildAdd(bld->builder, lod, lod_bias, "LOD bias");
+
+      /* clamp lod */
+      lod = lp_build_clamp(float_bld, lod, min_lod, max_lod);
+
+      return lod;
+   }
+}
+
+
+/**
+ * For PIPE_TEX_MIPFILTER_NEAREST, convert float LOD to integer
+ * mipmap level index.
+ * Note: this is all scalar code.
+ * \param lod  scalar float texture level of detail
+ * \param level_out  returns integer 
  */
 static void
-lp_build_sample_2d_nearest_soa(struct lp_build_sample_context *bld,
-                               LLVMValueRef s,
-                               LLVMValueRef t,
-                               LLVMValueRef width,
-                               LLVMValueRef height,
-                               LLVMValueRef stride,
-                               LLVMValueRef data_ptr,
-                               LLVMValueRef *texel)
+lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
+                           unsigned unit,
+                           LLVMValueRef lod,
+                           LLVMValueRef *level_out)
 {
-   LLVMValueRef x, y;
+   struct lp_build_context *float_bld = &bld->float_bld;
+   struct lp_build_context *int_bld = &bld->int_bld;
+   LLVMValueRef last_level, level;
 
-   x = lp_build_sample_wrap_nearest(bld, s, width,
-                                    bld->static_state->pot_width,
-                                    bld->static_state->wrap_s);
-   y = lp_build_sample_wrap_nearest(bld, t, height,
-                                    bld->static_state->pot_height,
-                                    bld->static_state->wrap_t);
+   LLVMValueRef zero = LLVMConstInt(LLVMInt32Type(), 0, 0);
 
-   lp_build_name(x, "tex.x.wrapped");
-   lp_build_name(y, "tex.y.wrapped");
+   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
+                                               bld->builder, unit);
+
+   /* convert float lod to integer */
+   level = lp_build_iround(float_bld, lod);
+
+   /* clamp level to legal range of levels */
+   *level_out = lp_build_clamp(int_bld, level, zero, last_level);
+}
+
+
+/**
+ * For PIPE_TEX_MIPFILTER_LINEAR, convert float LOD to integer to
+ * two (adjacent) mipmap level indexes.  Later, we'll sample from those
+ * two mipmap levels and interpolate between them.
+ */
+static void
+lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
+                           unsigned unit,
+                           LLVMValueRef lod,
+                           LLVMValueRef *level0_out,
+                           LLVMValueRef *level1_out,
+                           LLVMValueRef *weight_out)
+{
+   struct lp_build_context *float_bld = &bld->float_bld;
+   struct lp_build_context *int_bld = &bld->int_bld;
+   LLVMValueRef last_level, level;
+
+   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
+                                               bld->builder, unit);
+
+   /* convert float lod to integer */
+   level = lp_build_ifloor(float_bld, lod);
 
-   lp_build_sample_texel_soa(bld, width, height, x, y, stride, data_ptr, texel);
+   /* compute level 0 and clamp to legal range of levels */
+   *level0_out = lp_build_clamp(int_bld, level,
+                                int_bld->zero,
+                                last_level);
+   /* compute level 1 and clamp to legal range of levels */
+   *level1_out = lp_build_add(int_bld, *level0_out, int_bld->one);
+   *level1_out = lp_build_min(int_bld, *level1_out, last_level);
+
+   *weight_out = lp_build_fract(float_bld, lod);
 }
 
 
 /**
- * Sample 2D texture with bilinear filtering.
+ * Generate code to sample a mipmap level with nearest filtering.
+ * If sampling a cube texture, r = cube face in [0,5].
  */
 static void
-lp_build_sample_2d_linear_soa(struct lp_build_sample_context *bld,
+lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
+                              LLVMValueRef width_vec,
+                              LLVMValueRef height_vec,
+                              LLVMValueRef depth_vec,
+                              LLVMValueRef row_stride_vec,
+                              LLVMValueRef img_stride_vec,
+                              LLVMValueRef data_ptr,
                               LLVMValueRef s,
                               LLVMValueRef t,
-                              LLVMValueRef width,
-                              LLVMValueRef height,
-                              LLVMValueRef stride,
-                              LLVMValueRef data_ptr,
-                              LLVMValueRef *texel)
+                              LLVMValueRef r,
+                              LLVMValueRef colors_out[4])
 {
-   LLVMValueRef s_fpart;
-   LLVMValueRef t_fpart;
-   LLVMValueRef x0, x1;
-   LLVMValueRef y0, y1;
+   const int dims = texture_dims(bld->static_state->target);
+   LLVMValueRef x, y, z;
+
+   /*
+    * Compute integer texcoords.
+    */
+   x = lp_build_sample_wrap_nearest(bld, s, width_vec,
+                                    bld->static_state->pot_width,
+                                    bld->static_state->wrap_s);
+   lp_build_name(x, "tex.x.wrapped");
+
+   if (dims >= 2) {
+      y = lp_build_sample_wrap_nearest(bld, t, height_vec,
+                                       bld->static_state->pot_height,
+                                       bld->static_state->wrap_t);
+      lp_build_name(y, "tex.y.wrapped");
+
+      if (dims == 3) {
+         z = lp_build_sample_wrap_nearest(bld, r, depth_vec,
+                                          bld->static_state->pot_height,
+                                          bld->static_state->wrap_r);
+         lp_build_name(z, "tex.z.wrapped");
+      }
+      else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+         z = r;
+      }
+      else {
+         z = NULL;
+      }
+   }
+   else {
+      y = z = NULL;
+   }
+
+   /*
+    * Get texture colors.
+    */
+   lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+                             x, y, z,
+                             row_stride_vec, img_stride_vec,
+                             data_ptr, colors_out);
+}
+
+
+/**
+ * Generate code to sample a mipmap level with linear filtering.
+ * If sampling a cube texture, r = cube face in [0,5].
+ */
+static void
+lp_build_sample_image_linear(struct lp_build_sample_context *bld,
+                             LLVMValueRef width_vec,
+                             LLVMValueRef height_vec,
+                             LLVMValueRef depth_vec,
+                             LLVMValueRef row_stride_vec,
+                             LLVMValueRef img_stride_vec,
+                             LLVMValueRef data_ptr,
+                             LLVMValueRef s,
+                             LLVMValueRef t,
+                             LLVMValueRef r,
+                             LLVMValueRef colors_out[4])
+{
+   const int dims = texture_dims(bld->static_state->target);
+   LLVMValueRef x0, y0, z0, x1, y1, z1;
+   LLVMValueRef s_fpart, t_fpart, r_fpart;
    LLVMValueRef neighbors[2][2][4];
-   unsigned chan;
+   int chan;
 
-   lp_build_sample_wrap_linear(bld, s, width, bld->static_state->pot_width,
-                               bld->static_state->wrap_s, &x0, &x1, &s_fpart);
-   lp_build_sample_wrap_linear(bld, t, height, bld->static_state->pot_height,
-                               bld->static_state->wrap_t, &y0, &y1, &t_fpart);
+   /*
+    * Compute integer texcoords.
+    */
+   lp_build_sample_wrap_linear(bld, s, width_vec,
+                               bld->static_state->pot_width,
+                               bld->static_state->wrap_s,
+                               &x0, &x1, &s_fpart);
+   lp_build_name(x0, "tex.x0.wrapped");
+   lp_build_name(x1, "tex.x1.wrapped");
+
+   if (dims >= 2) {
+      lp_build_sample_wrap_linear(bld, t, height_vec,
+                                  bld->static_state->pot_height,
+                                  bld->static_state->wrap_t,
+                                  &y0, &y1, &t_fpart);
+      lp_build_name(y0, "tex.y0.wrapped");
+      lp_build_name(y1, "tex.y1.wrapped");
+
+      if (dims == 3) {
+         lp_build_sample_wrap_linear(bld, r, depth_vec,
+                                     bld->static_state->pot_depth,
+                                     bld->static_state->wrap_r,
+                                     &z0, &z1, &r_fpart);
+         lp_build_name(z0, "tex.z0.wrapped");
+         lp_build_name(z1, "tex.z1.wrapped");
+      }
+      else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+         z0 = z1 = r;  /* cube face */
+         r_fpart = NULL;
+      }
+      else {
+         z0 = z1 = NULL;
+         r_fpart = NULL;
+      }
+   }
+   else {
+      y0 = y1 = t_fpart = NULL;
+      z0 = z1 = r_fpart = NULL;
+   }
 
-   lp_build_sample_texel_soa(bld, width, height, x0, y0, stride, data_ptr, neighbors[0][0]);
-   lp_build_sample_texel_soa(bld, width, height, x1, y0, stride, data_ptr, neighbors[0][1]);
-   lp_build_sample_texel_soa(bld, width, height, x0, y1, stride, data_ptr, neighbors[1][0]);
-   lp_build_sample_texel_soa(bld, width, height, x1, y1, stride, data_ptr, neighbors[1][1]);
+   /*
+    * Get texture colors.
+    */
+   /* get x0/x1 texels */
+   lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+                             x0, y0, z0,
+                             row_stride_vec, img_stride_vec,
+                             data_ptr, neighbors[0][0]);
+   lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+                             x1, y0, z0,
+                             row_stride_vec, img_stride_vec,
+                             data_ptr, neighbors[0][1]);
+
+   if (dims == 1) {
+      /* Interpolate two samples from 1D image to produce one color */
+      for (chan = 0; chan < 4; chan++) {
+         colors_out[chan] = lp_build_lerp(&bld->texel_bld, s_fpart,
+                                          neighbors[0][0][chan],
+                                          neighbors[0][1][chan]);
+      }
+   }
+   else {
+      /* 2D/3D texture */
+      LLVMValueRef colors0[4];
+
+      /* get x0/x1 texels at y1 */
+      lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+                                x0, y1, z0,
+                                row_stride_vec, img_stride_vec,
+                                data_ptr, neighbors[1][0]);
+      lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+                                x1, y1, z0,
+                                row_stride_vec, img_stride_vec,
+                                data_ptr, neighbors[1][1]);
+
+      /* Bilinear interpolate the four samples from the 2D image / 3D slice */
+      for (chan = 0; chan < 4; chan++) {
+         colors0[chan] = lp_build_lerp_2d(&bld->texel_bld,
+                                          s_fpart, t_fpart,
+                                          neighbors[0][0][chan],
+                                          neighbors[0][1][chan],
+                                          neighbors[1][0][chan],
+                                          neighbors[1][1][chan]);
+      }
 
-   /* TODO: Don't interpolate missing channels */
-   for(chan = 0; chan < 4; ++chan) {
-      texel[chan] = lp_build_lerp_2d(&bld->texel_bld,
-                                     s_fpart, t_fpart,
-                                     neighbors[0][0][chan],
-                                     neighbors[0][1][chan],
-                                     neighbors[1][0][chan],
-                                     neighbors[1][1][chan]);
+      if (dims == 3) {
+         LLVMValueRef neighbors1[2][2][4];
+         LLVMValueRef colors1[4];
+
+         /* get x0/x1/y0/y1 texels at z1 */
+         lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+                                   x0, y0, z1,
+                                   row_stride_vec, img_stride_vec,
+                                   data_ptr, neighbors1[0][0]);
+         lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+                                   x1, y0, z1,
+                                   row_stride_vec, img_stride_vec,
+                                   data_ptr, neighbors1[0][1]);
+         lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+                                   x0, y1, z1,
+                                   row_stride_vec, img_stride_vec,
+                                   data_ptr, neighbors1[1][0]);
+         lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+                                   x1, y1, z1,
+                                   row_stride_vec, img_stride_vec,
+                                   data_ptr, neighbors1[1][1]);
+
+         /* Bilinear interpolate the four samples from the second Z slice */
+         for (chan = 0; chan < 4; chan++) {
+            colors1[chan] = lp_build_lerp_2d(&bld->texel_bld,
+                                             s_fpart, t_fpart,
+                                             neighbors1[0][0][chan],
+                                             neighbors1[0][1][chan],
+                                             neighbors1[1][0][chan],
+                                             neighbors1[1][1][chan]);
+         }
+
+         /* Linearly interpolate the two samples from the two 3D slices */
+         for (chan = 0; chan < 4; chan++) {
+            colors_out[chan] = lp_build_lerp(&bld->texel_bld,
+                                             r_fpart,
+                                             colors0[chan], colors1[chan]);
+         }
+      }
+      else {
+         /* 2D tex */
+         for (chan = 0; chan < 4; chan++) {
+            colors_out[chan] = colors0[chan];
+         }
+      }
+   }
+}
+
+
+/** Helper used by lp_build_cube_lookup() */
+static LLVMValueRef
+lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord)
+{
+   /* ima = -0.5 / abs(coord); */
+   LLVMValueRef negHalf = lp_build_const_vec(coord_bld->type, -0.5);
+   LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
+   LLVMValueRef ima = lp_build_mul(coord_bld, negHalf,
+                                   lp_build_rcp(coord_bld, absCoord));
+   return ima;
+}
+
+
+/**
+ * Helper used by lp_build_cube_lookup()
+ * \param sign  scalar +1 or -1
+ * \param coord  float vector
+ * \param ima  float vector
+ */
+static LLVMValueRef
+lp_build_cube_coord(struct lp_build_context *coord_bld,
+                    LLVMValueRef sign, int negate_coord,
+                    LLVMValueRef coord, LLVMValueRef ima)
+{
+   /* return negate(coord) * ima * sign + 0.5; */
+   LLVMValueRef half = lp_build_const_vec(coord_bld->type, 0.5);
+   LLVMValueRef res;
+
+   assert(negate_coord == +1 || negate_coord == -1);
+
+   if (negate_coord == -1) {
+      coord = lp_build_negate(coord_bld, coord);
    }
+
+   res = lp_build_mul(coord_bld, coord, ima);
+   if (sign) {
+      sign = lp_build_broadcast_scalar(coord_bld, sign);
+      res = lp_build_mul(coord_bld, res, sign);
+   }
+   res = lp_build_add(coord_bld, res, half);
+
+   return res;
 }
 
 
+/** Helper used by lp_build_cube_lookup()
+ * Return (major_coord >= 0) ? pos_face : neg_face;
+ */
+static LLVMValueRef
+lp_build_cube_face(struct lp_build_sample_context *bld,
+                   LLVMValueRef major_coord,
+                   unsigned pos_face, unsigned neg_face)
+{
+   LLVMValueRef cmp = LLVMBuildFCmp(bld->builder, LLVMRealUGE,
+                                    major_coord,
+                                    bld->float_bld.zero, "");
+   LLVMValueRef pos = LLVMConstInt(LLVMInt32Type(), pos_face, 0);
+   LLVMValueRef neg = LLVMConstInt(LLVMInt32Type(), neg_face, 0);
+   LLVMValueRef res = LLVMBuildSelect(bld->builder, cmp, pos, neg, "");
+   return res;
+}
+
+
+
+/**
+ * Generate code to do cube face selection and per-face texcoords.
+ */
+static void
+lp_build_cube_lookup(struct lp_build_sample_context *bld,
+                     LLVMValueRef s,
+                     LLVMValueRef t,
+                     LLVMValueRef r,
+                     LLVMValueRef *face,
+                     LLVMValueRef *face_s,
+                     LLVMValueRef *face_t)
+{
+   struct lp_build_context *float_bld = &bld->float_bld;
+   struct lp_build_context *coord_bld = &bld->coord_bld;
+   LLVMValueRef rx, ry, rz;
+   LLVMValueRef arx, ary, arz;
+   LLVMValueRef c25 = LLVMConstReal(LLVMFloatType(), 0.25);
+   LLVMValueRef arx_ge_ary, arx_ge_arz;
+   LLVMValueRef ary_ge_arx, ary_ge_arz;
+   LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz;
+   LLVMValueRef rx_pos, ry_pos, rz_pos;
+
+   assert(bld->coord_bld.type.length == 4);
+
+   /*
+    * Use the average of the four pixel's texcoords to choose the face.
+    */
+   rx = lp_build_mul(float_bld, c25,
+                     lp_build_sum_vector(&bld->coord_bld, s));
+   ry = lp_build_mul(float_bld, c25,
+                     lp_build_sum_vector(&bld->coord_bld, t));
+   rz = lp_build_mul(float_bld, c25,
+                     lp_build_sum_vector(&bld->coord_bld, r));
+
+   arx = lp_build_abs(float_bld, rx);
+   ary = lp_build_abs(float_bld, ry);
+   arz = lp_build_abs(float_bld, rz);
+
+   /*
+    * Compare sign/magnitude of rx,ry,rz to determine face
+    */
+   arx_ge_ary = LLVMBuildFCmp(bld->builder, LLVMRealUGE, arx, ary, "");
+   arx_ge_arz = LLVMBuildFCmp(bld->builder, LLVMRealUGE, arx, arz, "");
+   ary_ge_arx = LLVMBuildFCmp(bld->builder, LLVMRealUGE, ary, arx, "");
+   ary_ge_arz = LLVMBuildFCmp(bld->builder, LLVMRealUGE, ary, arz, "");
+
+   arx_ge_ary_arz = LLVMBuildAnd(bld->builder, arx_ge_ary, arx_ge_arz, "");
+   ary_ge_arx_arz = LLVMBuildAnd(bld->builder, ary_ge_arx, ary_ge_arz, "");
+
+   rx_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, rx, float_bld->zero, "");
+   ry_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, ry, float_bld->zero, "");
+   rz_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, rz, float_bld->zero, "");
+
+   {
+      struct lp_build_flow_context *flow_ctx;
+      struct lp_build_if_state if_ctx;
+
+      flow_ctx = lp_build_flow_create(bld->builder);
+      lp_build_flow_scope_begin(flow_ctx);
+
+      *face_s = bld->coord_bld.undef;
+      *face_t = bld->coord_bld.undef;
+      *face = bld->int_bld.undef;
+
+      lp_build_name(*face_s, "face_s");
+      lp_build_name(*face_t, "face_t");
+      lp_build_name(*face, "face");
+
+      lp_build_flow_scope_declare(flow_ctx, face_s);
+      lp_build_flow_scope_declare(flow_ctx, face_t);
+      lp_build_flow_scope_declare(flow_ctx, face);
+
+      lp_build_if(&if_ctx, flow_ctx, bld->builder, arx_ge_ary_arz);
+      {
+         /* +/- X face */
+         LLVMValueRef sign = lp_build_sgn(float_bld, rx);
+         LLVMValueRef ima = lp_build_cube_ima(coord_bld, s);
+         *face_s = lp_build_cube_coord(coord_bld, sign, +1, r, ima);
+         *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
+         *face = lp_build_cube_face(bld, rx,
+                                    PIPE_TEX_FACE_POS_X,
+                                    PIPE_TEX_FACE_NEG_X);
+      }
+      lp_build_else(&if_ctx);
+      {
+         struct lp_build_flow_context *flow_ctx2;
+         struct lp_build_if_state if_ctx2;
+
+         LLVMValueRef face_s2 = bld->coord_bld.undef;
+         LLVMValueRef face_t2 = bld->coord_bld.undef;
+         LLVMValueRef face2 = bld->int_bld.undef;
+
+         flow_ctx2 = lp_build_flow_create(bld->builder);
+         lp_build_flow_scope_begin(flow_ctx2);
+         lp_build_flow_scope_declare(flow_ctx2, &face_s2);
+         lp_build_flow_scope_declare(flow_ctx2, &face_t2);
+         lp_build_flow_scope_declare(flow_ctx2, &face2);
+
+         ary_ge_arx_arz = LLVMBuildAnd(bld->builder, ary_ge_arx, ary_ge_arz, "");
+
+         lp_build_if(&if_ctx2, flow_ctx2, bld->builder, ary_ge_arx_arz);
+         {
+            /* +/- Y face */
+            LLVMValueRef sign = lp_build_sgn(float_bld, ry);
+            LLVMValueRef ima = lp_build_cube_ima(coord_bld, t);
+            face_s2 = lp_build_cube_coord(coord_bld, NULL, -1, s, ima);
+            face_t2 = lp_build_cube_coord(coord_bld, sign, -1, r, ima);
+            face2 = lp_build_cube_face(bld, ry,
+                                       PIPE_TEX_FACE_POS_Y,
+                                       PIPE_TEX_FACE_NEG_Y);
+         }
+         lp_build_else(&if_ctx2);
+         {
+            /* +/- Z face */
+            LLVMValueRef sign = lp_build_sgn(float_bld, rz);
+            LLVMValueRef ima = lp_build_cube_ima(coord_bld, r);
+            face_s2 = lp_build_cube_coord(coord_bld, sign, -1, s, ima);
+            face_t2 = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
+            face2 = lp_build_cube_face(bld, rz,
+                                       PIPE_TEX_FACE_POS_Z,
+                                       PIPE_TEX_FACE_NEG_Z);
+         }
+         lp_build_endif(&if_ctx2);
+         lp_build_flow_scope_end(flow_ctx2);
+         lp_build_flow_destroy(flow_ctx2);
+
+         *face_s = face_s2;
+         *face_t = face_t2;
+         *face = face2;
+      }
+
+      lp_build_endif(&if_ctx);
+      lp_build_flow_scope_end(flow_ctx);
+      lp_build_flow_destroy(flow_ctx);
+   }
+}
+
+
+
+/**
+ * Sample the texture/mipmap using given image filter and mip filter.
+ * data0_ptr and data1_ptr point to the two mipmap levels to sample
+ * from.  width0/1_vec, height0/1_vec, depth0/1_vec indicate their sizes.
+ * If we're using nearest miplevel sampling the '1' values will be null/unused.
+ */
+static void
+lp_build_sample_mipmap(struct lp_build_sample_context *bld,
+                       unsigned img_filter,
+                       unsigned mip_filter,
+                       LLVMValueRef s,
+                       LLVMValueRef t,
+                       LLVMValueRef r,
+                       LLVMValueRef lod_fpart,
+                       LLVMValueRef width0_vec,
+                       LLVMValueRef width1_vec,
+                       LLVMValueRef height0_vec,
+                       LLVMValueRef height1_vec,
+                       LLVMValueRef depth0_vec,
+                       LLVMValueRef depth1_vec,
+                       LLVMValueRef row_stride0_vec,
+                       LLVMValueRef row_stride1_vec,
+                       LLVMValueRef img_stride0_vec,
+                       LLVMValueRef img_stride1_vec,
+                       LLVMValueRef data_ptr0,
+                       LLVMValueRef data_ptr1,
+                       LLVMValueRef *colors_out)
+{
+   LLVMValueRef colors0[4], colors1[4];
+   int chan;
+
+   if (img_filter == PIPE_TEX_FILTER_NEAREST) {
+      lp_build_sample_image_nearest(bld,
+                                    width0_vec, height0_vec, depth0_vec,
+                                    row_stride0_vec, img_stride0_vec,
+                                    data_ptr0, s, t, r, colors0);
+
+      if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+         /* sample the second mipmap level, and interp */
+         lp_build_sample_image_nearest(bld,
+                                       width1_vec, height1_vec, depth1_vec,
+                                       row_stride1_vec, img_stride1_vec,
+                                       data_ptr1, s, t, r, colors1);
+      }
+   }
+   else {
+      assert(img_filter == PIPE_TEX_FILTER_LINEAR);
+
+      lp_build_sample_image_linear(bld,
+                                   width0_vec, height0_vec, depth0_vec,
+                                   row_stride0_vec, img_stride0_vec,
+                                   data_ptr0, s, t, r, colors0);
+
+      if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+         /* sample the second mipmap level, and interp */
+         lp_build_sample_image_linear(bld,
+                                      width1_vec, height1_vec, depth1_vec,
+                                      row_stride1_vec, img_stride1_vec,
+                                      data_ptr1, s, t, r, colors1);
+      }
+   }
+
+   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+      /* interpolate samples from the two mipmap levels */
+      for (chan = 0; chan < 4; chan++) {
+         colors_out[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
+                                          colors0[chan], colors1[chan]);
+      }
+   }
+   else {
+      /* use first/only level's colors */
+      for (chan = 0; chan < 4; chan++) {
+         colors_out[chan] = colors0[chan];
+      }
+   }
+}
+
+
+
+/**
+ * General texture sampling codegen.
+ * This function handles texture sampling for all texture targets (1D,
+ * 2D, 3D, cube) and all filtering modes.
+ */
+static void
+lp_build_sample_general(struct lp_build_sample_context *bld,
+                        unsigned unit,
+                        LLVMValueRef s,
+                        LLVMValueRef t,
+                        LLVMValueRef r,
+                        LLVMValueRef width,
+                        LLVMValueRef height,
+                        LLVMValueRef depth,
+                        LLVMValueRef width_vec,
+                        LLVMValueRef height_vec,
+                        LLVMValueRef depth_vec,
+                        LLVMValueRef row_stride_array,
+                        LLVMValueRef img_stride_vec,
+                        LLVMValueRef data_array,
+                        LLVMValueRef *colors_out)
+{
+   struct lp_build_context *float_bld = &bld->float_bld;
+   const unsigned mip_filter = bld->static_state->min_mip_filter;
+   const unsigned min_filter = bld->static_state->min_img_filter;
+   const unsigned mag_filter = bld->static_state->mag_img_filter;
+   const int dims = texture_dims(bld->static_state->target);
+   LLVMValueRef lod = NULL, lod_fpart = NULL;
+   LLVMValueRef ilevel0, ilevel1 = NULL, ilevel0_vec, ilevel1_vec = NULL;
+   LLVMValueRef width0_vec = NULL, height0_vec = NULL, depth0_vec = NULL;
+   LLVMValueRef width1_vec = NULL, height1_vec = NULL, depth1_vec = NULL;
+   LLVMValueRef row_stride0_vec = NULL, row_stride1_vec = NULL;
+   LLVMValueRef img_stride0_vec = NULL, img_stride1_vec = NULL;
+   LLVMValueRef data_ptr0, data_ptr1 = NULL;
+
+   /*
+   printf("%s mip %d  min %d  mag %d\n", __FUNCTION__,
+          mip_filter, min_filter, mag_filter);
+   */
+
+   /*
+    * Compute the level of detail (float).
+    */
+   if (min_filter != mag_filter ||
+       mip_filter != PIPE_TEX_MIPFILTER_NONE) {
+      /* Need to compute lod either to choose mipmap levels or to
+       * distinguish between minification/magnification with one mipmap level.
+       */
+      lod = lp_build_lod_selector(bld, s, t, r, width, height, depth);
+   }
+
+   /*
+    * Compute integer mipmap level(s) to fetch texels from.
+    */
+   if (mip_filter == PIPE_TEX_MIPFILTER_NONE) {
+      /* always use mip level 0 */
+      ilevel0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   }
+   else {
+      if (mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
+         lp_build_nearest_mip_level(bld, unit, lod, &ilevel0);
+      }
+      else {
+         assert(mip_filter == PIPE_TEX_MIPFILTER_LINEAR);
+         lp_build_linear_mip_levels(bld, unit, lod, &ilevel0, &ilevel1,
+                                    &lod_fpart);
+         lod_fpart = lp_build_broadcast_scalar(&bld->coord_bld, lod_fpart);
+      }
+   }
+
+   /*
+    * Convert scalar integer mipmap levels into vectors.
+    */
+   ilevel0_vec = lp_build_broadcast_scalar(&bld->int_coord_bld, ilevel0);
+   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR)
+      ilevel1_vec = lp_build_broadcast_scalar(&bld->int_coord_bld, ilevel1);
+
+   /*
+    * Compute width, height at mipmap level 'ilevel0'
+    */
+   width0_vec = lp_build_minify(bld, width_vec, ilevel0_vec);
+   if (dims >= 2) {
+      height0_vec = lp_build_minify(bld, height_vec, ilevel0_vec);
+      row_stride0_vec = lp_build_get_level_stride_vec(bld, row_stride_array,
+                                                      ilevel0);
+      if (dims == 3 || bld->static_state->target == PIPE_TEXTURE_CUBE) {
+         img_stride0_vec = lp_build_mul(&bld->int_coord_bld,
+                                        row_stride0_vec, height0_vec);
+         if (dims == 3) {
+            depth0_vec = lp_build_minify(bld, depth_vec, ilevel0_vec);
+         }
+      }
+   }
+   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+      /* compute width, height, depth for second mipmap level at 'ilevel1' */
+      width1_vec = lp_build_minify(bld, width_vec, ilevel1_vec);
+      if (dims >= 2) {
+         height1_vec = lp_build_minify(bld, height_vec, ilevel1_vec);
+         row_stride1_vec = lp_build_get_level_stride_vec(bld, row_stride_array,
+                                                         ilevel1);
+         if (dims == 3 || bld->static_state->target == PIPE_TEXTURE_CUBE) {
+            img_stride1_vec = lp_build_mul(&bld->int_coord_bld,
+                                           row_stride1_vec, height1_vec);
+            if (dims ==3) {
+               depth1_vec = lp_build_minify(bld, depth_vec, ilevel1_vec);
+            }
+         }
+      }
+   }
+
+   /*
+    * Choose cube face, recompute per-face texcoords.
+    */
+   if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+      LLVMValueRef face, face_s, face_t;
+      lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t);
+      s = face_s; /* vec */
+      t = face_t; /* vec */
+      /* use 'r' to indicate cube face */
+      r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */
+   }
+
+   /*
+    * Get pointer(s) to image data for mipmap level(s).
+    */
+   data_ptr0 = lp_build_get_mipmap_level(bld, data_array, ilevel0);
+   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+      data_ptr1 = lp_build_get_mipmap_level(bld, data_array, ilevel1);
+   }
+
+   /*
+    * Get/interpolate texture colors.
+    */
+   if (min_filter == mag_filter) {
+      /* no need to distinquish between minification and magnification */
+      lp_build_sample_mipmap(bld, min_filter, mip_filter, s, t, r, lod_fpart,
+                             width0_vec, width1_vec,
+                             height0_vec, height1_vec,
+                             depth0_vec, depth1_vec,
+                             row_stride0_vec, row_stride1_vec,
+                             img_stride0_vec, img_stride1_vec,
+                             data_ptr0, data_ptr1,
+                             colors_out);
+   }
+   else {
+      /* Emit conditional to choose min image filter or mag image filter
+       * depending on the lod being >0 or <= 0, respectively.
+       */
+      struct lp_build_flow_context *flow_ctx;
+      struct lp_build_if_state if_ctx;
+      LLVMValueRef minify;
+
+      flow_ctx = lp_build_flow_create(bld->builder);
+      lp_build_flow_scope_begin(flow_ctx);
+
+      lp_build_flow_scope_declare(flow_ctx, &colors_out[0]);
+      lp_build_flow_scope_declare(flow_ctx, &colors_out[1]);
+      lp_build_flow_scope_declare(flow_ctx, &colors_out[2]);
+      lp_build_flow_scope_declare(flow_ctx, &colors_out[3]);
+
+      /* minify = lod > 0.0 */
+      minify = LLVMBuildFCmp(bld->builder, LLVMRealUGE,
+                             lod, float_bld->zero, "");
+
+      lp_build_if(&if_ctx, flow_ctx, bld->builder, minify);
+      {
+         /* Use the minification filter */
+         lp_build_sample_mipmap(bld, min_filter, mip_filter,
+                                s, t, r, lod_fpart,
+                                width0_vec, width1_vec,
+                                height0_vec, height1_vec,
+                                depth0_vec, depth1_vec,
+                                row_stride0_vec, row_stride1_vec,
+                                img_stride0_vec, img_stride1_vec,
+                                data_ptr0, data_ptr1,
+                                colors_out);
+      }
+      lp_build_else(&if_ctx);
+      {
+         /* Use the magnification filter */
+         lp_build_sample_mipmap(bld, mag_filter, mip_filter,
+                                s, t, r, lod_fpart,
+                                width0_vec, width1_vec,
+                                height0_vec, height1_vec,
+                                depth0_vec, depth1_vec,
+                                row_stride0_vec, row_stride1_vec,
+                                img_stride0_vec, img_stride1_vec,
+                                data_ptr0, data_ptr1,
+                                colors_out);
+      }
+      lp_build_endif(&if_ctx);
+
+      lp_build_flow_scope_end(flow_ctx);
+      lp_build_flow_destroy(flow_ctx);
+   }
+}
+
+
+
 static void
 lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder,
                           struct lp_type dst_type,
                           LLVMValueRef packed,
                           LLVMValueRef *rgba)
 {
-   LLVMValueRef mask = lp_build_int_const_scalar(dst_type, 0xff);
+   LLVMValueRef mask = lp_build_const_int_vec(dst_type, 0xff);
    unsigned chan;
 
    /* Decode the input vector components */
@@ -799,7 +1733,7 @@ lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder,
       input = packed;
 
       if(start)
-         input = LLVMBuildLShr(builder, input, lp_build_int_const_scalar(dst_type, start), "");
+         input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(dst_type, start), "");
 
       if(stop < 32)
          input = LLVMBuildAnd(builder, input, mask, "");
@@ -817,8 +1751,8 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
                               LLVMValueRef t,
                               LLVMValueRef width,
                               LLVMValueRef height,
-                              LLVMValueRef stride,
-                              LLVMValueRef data_ptr,
+                              LLVMValueRef stride_array,
+                              LLVMValueRef data_array,
                               LLVMValueRef *texel)
 {
    LLVMBuilderRef builder = bld->builder;
@@ -834,8 +1768,9 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
    LLVMValueRef neighbors_hi[2][2];
    LLVMValueRef packed, packed_lo, packed_hi;
    LLVMValueRef unswizzled[4];
+   LLVMValueRef stride;
 
-   lp_build_context_init(&i32, builder, lp_type_int(32));
+   lp_build_context_init(&i32, builder, lp_type_int_vec(32));
    lp_build_context_init(&h16, builder, lp_type_ufixed(16));
    lp_build_context_init(&u8n, builder, lp_type_unorm(8));
 
@@ -860,17 +1795,17 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
    t = LLVMBuildFPToSI(builder, t, i32_vec_type, "");
 
    /* subtract 0.5 (add -128) */
-   i32_c128 = lp_build_int_const_scalar(i32.type, -128);
+   i32_c128 = lp_build_const_int_vec(i32.type, -128);
    s = LLVMBuildAdd(builder, s, i32_c128, "");
    t = LLVMBuildAdd(builder, t, i32_c128, "");
 
    /* compute floor (shift right 8) */
-   i32_c8 = lp_build_int_const_scalar(i32.type, 8);
+   i32_c8 = lp_build_const_int_vec(i32.type, 8);
    s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
    t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
 
    /* compute fractional part (AND with 0xff) */
-   i32_c255 = lp_build_int_const_scalar(i32.type, 255);
+   i32_c255 = lp_build_const_int_vec(i32.type, 255);
    s_fpart = LLVMBuildAnd(builder, s, i32_c255, "");
    t_fpart = LLVMBuildAnd(builder, t, i32_c255, "");
 
@@ -941,6 +1876,8 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
       t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, shuffle_hi, "");
    }
 
+   stride = lp_build_get_const_level_stride_vec(bld, stride_array, 0);
+
    /*
     * Fetch the pixels as 4 x 32bit (rgba order might differ):
     *
@@ -958,10 +1895,10 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
     * The higher 8 bits of the resulting elements will be zero.
     */
 
-   neighbors[0][0] = lp_build_sample_packed(bld, x0, y0, stride, data_ptr);
-   neighbors[0][1] = lp_build_sample_packed(bld, x1, y0, stride, data_ptr);
-   neighbors[1][0] = lp_build_sample_packed(bld, x0, y1, stride, data_ptr);
-   neighbors[1][1] = lp_build_sample_packed(bld, x1, y1, stride, data_ptr);
+   neighbors[0][0] = lp_build_sample_packed(bld, x0, y0, stride, data_array);
+   neighbors[0][1] = lp_build_sample_packed(bld, x1, y0, stride, data_array);
+   neighbors[1][0] = lp_build_sample_packed(bld, x0, y1, stride, data_array);
+   neighbors[1][1] = lp_build_sample_packed(bld, x1, y1, stride, data_array);
 
    neighbors[0][0] = LLVMBuildBitCast(builder, neighbors[0][0], u8n_vec_type, "");
    neighbors[0][1] = LLVMBuildBitCast(builder, neighbors[0][1], u8n_vec_type, "");
@@ -1035,7 +1972,7 @@ lp_build_sample_compare(struct lp_build_sample_context *bld,
    }
 
    assert(res);
-   res = lp_build_mul(texel_bld, res, lp_build_const_scalar(texel_bld->type, 0.25));
+   res = lp_build_mul(texel_bld, res, lp_build_const_vec(texel_bld->type, 0.25));
 
    /* XXX returning result for default GL_DEPTH_TEXTURE_MODE = GL_LUMINANCE */
    for(chan = 0; chan < 3; ++chan)
@@ -1044,194 +1981,11 @@ lp_build_sample_compare(struct lp_build_sample_context *bld,
 }
 
 
-static int
-texture_dims(enum pipe_texture_target tex)
-{
-   switch (tex) {
-   case PIPE_TEXTURE_1D:
-      return 1;
-   case PIPE_TEXTURE_2D:
-   case PIPE_TEXTURE_CUBE:
-      return 2;
-   case PIPE_TEXTURE_3D:
-      return 3;
-   default:
-      assert(0 && "bad texture target in texture_dims()");
-      return 2;
-   }
-}
-
-
-/**
- * Generate code to compute texture level of detail (lambda).
- * \param s  vector of texcoord s values
- * \param t  vector of texcoord t values
- * \param r  vector of texcoord r values
- * \param width  scalar int texture width
- * \param height  scalar int texture height
- * \param depth  scalar int texture depth
- */
-static LLVMValueRef
-lp_build_lod_selector(struct lp_build_sample_context *bld,
-                      LLVMValueRef s,
-                      LLVMValueRef t,
-                      LLVMValueRef r,
-                      LLVMValueRef width,
-                      LLVMValueRef height,
-                      LLVMValueRef depth)
-
-{
-   const int dims = texture_dims(bld->static_state->target);
-   struct lp_build_context *coord_bld = &bld->coord_bld;
-
-   LLVMValueRef lod_bias = lp_build_const_scalar(bld->coord_bld.type,
-                                                 bld->static_state->lod_bias);
-   LLVMValueRef min_lod = lp_build_const_scalar(bld->coord_bld.type,
-                                                bld->static_state->min_lod);
-   LLVMValueRef max_lod = lp_build_const_scalar(bld->coord_bld.type,
-                                                bld->static_state->max_lod);
-
-   LLVMValueRef index0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
-   LLVMValueRef index1 = LLVMConstInt(LLVMInt32Type(), 1, 0);
-   LLVMValueRef index2 = LLVMConstInt(LLVMInt32Type(), 2, 0);
-
-   LLVMValueRef s0, s1, s2;
-   LLVMValueRef t0, t1, t2;
-   LLVMValueRef r0, r1, r2;
-   LLVMValueRef dsdx, dsdy, dtdx, dtdy, drdx, drdy;
-   LLVMValueRef rho, lod;
-
-   /*
-    * dsdx = abs(s[1] - s[0]);
-    * dsdy = abs(s[2] - s[0]);
-    * dtdx = abs(t[1] - t[0]);
-    * dtdy = abs(t[2] - t[0]);
-    * drdx = abs(r[1] - r[0]);
-    * drdy = abs(r[2] - r[0]);
-    * XXX we're assuming a four-element quad in 2x2 layout here.
-    */
-   s0 = LLVMBuildExtractElement(bld->builder, s, index0, "s0");
-   s1 = LLVMBuildExtractElement(bld->builder, s, index1, "s1");
-   s2 = LLVMBuildExtractElement(bld->builder, s, index2, "s2");
-   dsdx = lp_build_abs(coord_bld, lp_build_sub(coord_bld, s1, s0));
-   dsdy = lp_build_abs(coord_bld, lp_build_sub(coord_bld, s2, s0));
-   if (dims > 1) {
-      t0 = LLVMBuildExtractElement(bld->builder, t, index0, "t0");
-      t1 = LLVMBuildExtractElement(bld->builder, t, index1, "t1");
-      t2 = LLVMBuildExtractElement(bld->builder, t, index2, "t2");
-      dtdx = lp_build_abs(coord_bld, lp_build_sub(coord_bld, t1, t0));
-      dtdy = lp_build_abs(coord_bld, lp_build_sub(coord_bld, t2, t0));
-      if (dims > 2) {
-         r0 = LLVMBuildExtractElement(bld->builder, r, index0, "r0");
-         r1 = LLVMBuildExtractElement(bld->builder, r, index1, "r1");
-         r2 = LLVMBuildExtractElement(bld->builder, r, index2, "r2");
-         drdx = lp_build_abs(coord_bld, lp_build_sub(coord_bld, r1, r0));
-         drdy = lp_build_abs(coord_bld, lp_build_sub(coord_bld, r2, r0));
-      }
-   }
-
-   /* Compute rho = max of all partial derivatives scaled by texture size.
-    * XXX this can be vectorized somewhat
-    */
-   rho = lp_build_mul(coord_bld,
-                       lp_build_max(coord_bld, dsdx, dsdy),
-                       lp_build_int_to_float(coord_bld, width));
-   if (dims > 1) {
-      LLVMValueRef max;
-      max = lp_build_mul(coord_bld,
-                         lp_build_max(coord_bld, dtdx, dtdy),
-                         lp_build_int_to_float(coord_bld, height));
-      rho = lp_build_max(coord_bld, rho, max);
-      if (dims > 2) {
-         max = lp_build_mul(coord_bld,
-                            lp_build_max(coord_bld, drdx, drdy),
-                            lp_build_int_to_float(coord_bld, depth));
-         rho = lp_build_max(coord_bld, rho, max);
-      }
-   }
-
-   /* compute lod = log2(rho) */
-   lod = lp_build_log2(coord_bld, rho);
-
-   /* add lod bias */
-   lod = lp_build_add(coord_bld, lod, lod_bias);
-
-   /* clamp lod */
-   lod = lp_build_clamp(coord_bld, lod, min_lod, max_lod);
-
-   return lod;
-}
-
-
-/**
- * For PIPE_TEX_MIPFILTER_NEAREST, convert float LOD to integer
- * mipmap level index.
- * \param lod  scalar float texture level of detail
- * \param level_out  returns integer 
- */
-static void
-lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
-                           unsigned unit,
-                           LLVMValueRef lod,
-                           LLVMValueRef *level_out)
-{
-   struct lp_build_context *coord_bld = &bld->coord_bld;
-   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
-   LLVMValueRef last_level, level;
-
-   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
-                                               bld->builder, unit);
-
-   /* convert float lod to integer */
-   level = lp_build_iround(coord_bld, lod);
-
-   /* clamp level to legal range of levels */
-   *level_out = lp_build_clamp(int_coord_bld, level,
-                               int_coord_bld->zero,
-                               last_level);
-}
-
-
-/**
- * For PIPE_TEX_MIPFILTER_LINEAR, convert float LOD to integer to
- * two (adjacent) mipmap level indexes.  Later, we'll sample from those
- * two mipmap levels and interpolate between them.
- */
-static void
-lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
-                           unsigned unit,
-                           LLVMValueRef lod,
-                           LLVMValueRef *level0_out,
-                           LLVMValueRef *level1_out,
-                           LLVMValueRef *weight_out)
-{
-   struct lp_build_context *coord_bld = &bld->coord_bld;
-   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
-   LLVMValueRef last_level, level;
-
-   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
-                                               bld->builder, unit);
-
-   /* convert float lod to integer */
-   level = lp_build_ifloor(coord_bld, lod);
-
-   /* compute level 0 and clamp to legal range of levels */
-   *level0_out = lp_build_clamp(int_coord_bld, level,
-                                int_coord_bld->zero,
-                                last_level);
-   /* compute level 1 and clamp to legal range of levels */
-   *level1_out = lp_build_add(int_coord_bld, *level0_out, int_coord_bld->one);
-   *level1_out = lp_build_min(int_coord_bld, *level1_out, int_coord_bld->zero);
-
-   *weight_out = lp_build_fract(coord_bld, lod);
-}
-
-
-
 /**
  * Build texture sampling code.
  * 'texel' will return a vector of four LLVMValueRefs corresponding to
  * R, G, B, A.
+ * \param type  vector float type to use for coords, etc.
  */
 void
 lp_build_sample_soa(LLVMBuilderRef builder,
@@ -1245,28 +1999,31 @@ lp_build_sample_soa(LLVMBuilderRef builder,
                     LLVMValueRef *texel)
 {
    struct lp_build_sample_context bld;
-   LLVMValueRef width;
-   LLVMValueRef height;
-   LLVMValueRef stride;
-   LLVMValueRef data_ptr;
+   LLVMValueRef width, width_vec;
+   LLVMValueRef height, height_vec;
+   LLVMValueRef depth, depth_vec;
+   LLVMValueRef stride_array;
+   LLVMValueRef data_array;
    LLVMValueRef s;
    LLVMValueRef t;
    LLVMValueRef r;
 
-   (void) lp_build_lod_selector;   /* temporary to silence warning */
-   (void) lp_build_nearest_mip_level;
-   (void) lp_build_linear_mip_levels;
-
    /* Setup our build context */
    memset(&bld, 0, sizeof bld);
    bld.builder = builder;
    bld.static_state = static_state;
    bld.dynamic_state = dynamic_state;
    bld.format_desc = util_format_description(static_state->format);
+
+   bld.float_type = lp_type_float(32);
+   bld.int_type = lp_type_int(32);
    bld.coord_type = type;
    bld.uint_coord_type = lp_uint_type(type);
    bld.int_coord_type = lp_int_type(type);
    bld.texel_type = type;
+
+   lp_build_context_init(&bld.float_bld, builder, bld.float_type);
+   lp_build_context_init(&bld.int_bld, builder, bld.int_type);
    lp_build_context_init(&bld.coord_bld, builder, bld.coord_type);
    lp_build_context_init(&bld.uint_coord_bld, builder, bld.uint_coord_type);
    lp_build_context_init(&bld.int_coord_bld, builder, bld.int_coord_type);
@@ -1275,41 +2032,37 @@ lp_build_sample_soa(LLVMBuilderRef builder,
    /* Get the dynamic state */
    width = dynamic_state->width(dynamic_state, builder, unit);
    height = dynamic_state->height(dynamic_state, builder, unit);
-   stride = dynamic_state->stride(dynamic_state, builder, unit);
-   data_ptr = dynamic_state->data_ptr(dynamic_state, builder, unit);
+   depth = dynamic_state->depth(dynamic_state, builder, unit);
+   stride_array = dynamic_state->row_stride(dynamic_state, builder, unit);
+   data_array = dynamic_state->data_ptr(dynamic_state, builder, unit);
+   /* Note that data_array is an array[level] of pointers to texture images */
 
    s = coords[0];
    t = coords[1];
    r = coords[2];
 
-   width = lp_build_broadcast_scalar(&bld.uint_coord_bld, width);
-   height = lp_build_broadcast_scalar(&bld.uint_coord_bld, height);
-   stride = lp_build_broadcast_scalar(&bld.uint_coord_bld, stride);
-
-   if(static_state->target == PIPE_TEXTURE_1D)
-      t = bld.coord_bld.zero;
-
-   switch (static_state->min_img_filter) {
-   case PIPE_TEX_FILTER_NEAREST:
-      lp_build_sample_2d_nearest_soa(&bld, s, t, width, height,
-                                     stride, data_ptr, texel);
-      break;
-   case PIPE_TEX_FILTER_LINEAR:
-      if(lp_format_is_rgba8(bld.format_desc) &&
-         is_simple_wrap_mode(static_state->wrap_s) &&
-         is_simple_wrap_mode(static_state->wrap_t))
-         lp_build_sample_2d_linear_aos(&bld, s, t, width, height,
-                                       stride, data_ptr, texel);
-      else
-         lp_build_sample_2d_linear_soa(&bld, s, t, width, height,
-                                       stride, data_ptr, texel);
-      break;
-   default:
-      assert(0);
+   width_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, width);
+   height_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, height);
+   depth_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, depth);
+
+   if (util_format_is_rgba8_variant(bld.format_desc) &&
+       static_state->target == PIPE_TEXTURE_2D &&
+       static_state->min_img_filter == PIPE_TEX_FILTER_LINEAR &&
+       static_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR &&
+       static_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
+       is_simple_wrap_mode(static_state->wrap_s) &&
+       is_simple_wrap_mode(static_state->wrap_t)) {
+      /* special case */
+      lp_build_sample_2d_linear_aos(&bld, s, t, width_vec, height_vec,
+                                    stride_array, data_array, texel);
+   }
+   else {
+      lp_build_sample_general(&bld, unit, s, t, r,
+                              width, height, depth,
+                              width_vec, height_vec, depth_vec,
+                              stride_array, NULL, data_array,
+                              texel);
    }
-
-   /* FIXME: respect static_state->min_mip_filter */;
-   /* FIXME: respect static_state->mag_img_filter */;
 
    lp_build_sample_compare(&bld, r, texel);
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_struct.h b/src/gallium/auxiliary/gallivm/lp_bld_struct.h
index 740392f561..147336edb4 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_struct.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_struct.h
@@ -37,7 +37,7 @@
 #define LP_BLD_STRUCT_H
 
 
-#include <llvm-c/Core.h>  
+#include "gallivm/lp_bld.h"
 #include <llvm-c/Target.h>
 
 #include "util/u_debug.h"
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
index 64e81f7b1f..278c838eac 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
@@ -144,9 +144,9 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
 #endif
 
          if(shift > 0)
-            tmp = LLVMBuildLShr(bld->builder, a, lp_build_int_const_scalar(type4, shift*type.width), "");
+            tmp = LLVMBuildLShr(bld->builder, a, lp_build_const_int_vec(type4, shift*type.width), "");
          if(shift < 0)
-            tmp = LLVMBuildShl(bld->builder, a, lp_build_int_const_scalar(type4, -shift*type.width), "");
+            tmp = LLVMBuildShl(bld->builder, a, lp_build_const_int_vec(type4, -shift*type.width), "");
 
          assert(tmp);
          if(tmp)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
index b9472127a6..138ca620e6 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
@@ -37,7 +37,7 @@
 #define LP_BLD_SWIZZLE_H
 
 
-#include <llvm-c/Core.h>  
+#include "gallivm/lp_bld.h"
 
 
 struct lp_type;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
index eddb7a83fa..63b938bfa9 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -35,7 +35,7 @@
 #ifndef LP_BLD_TGSI_H
 #define LP_BLD_TGSI_H
 
-#include <llvm-c/Core.h>
+#include "gallivm/lp_bld.h"
 
 
 struct tgsi_token;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 5f2c2a54ee..8901e656ae 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -41,6 +41,7 @@
 #include "util/u_debug.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "tgsi/tgsi_dump.h"
 #include "tgsi/tgsi_info.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
@@ -95,6 +96,19 @@ struct lp_exec_mask {
    int cond_stack_size;
    LLVMValueRef cond_mask;
 
+   LLVMValueRef break_stack[LP_TGSI_MAX_NESTING];
+   int break_stack_size;
+   LLVMValueRef break_mask;
+
+   LLVMValueRef cont_stack[LP_TGSI_MAX_NESTING];
+   int cont_stack_size;
+   LLVMValueRef cont_mask;
+
+   LLVMBasicBlockRef loop_stack[LP_TGSI_MAX_NESTING];
+   int loop_stack_size;
+   LLVMBasicBlockRef loop_block;
+
+
    LLVMValueRef exec_mask;
 };
 
@@ -145,15 +159,33 @@ static void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context
    mask->bld = bld;
    mask->has_mask = FALSE;
    mask->cond_stack_size = 0;
+   mask->loop_stack_size = 0;
+   mask->break_stack_size = 0;
+   mask->cont_stack_size = 0;
 
    mask->int_vec_type = lp_build_int_vec_type(mask->bld->type);
 }
 
 static void lp_exec_mask_update(struct lp_exec_mask *mask)
 {
-   mask->exec_mask = mask->cond_mask;
-   if (mask->cond_stack_size > 0)
-      mask->has_mask = TRUE;
+   if (mask->loop_stack_size) {
+      /*for loops we need to update the entire mask at
+       * runtime */
+      LLVMValueRef tmp;
+      tmp = LLVMBuildAnd(mask->bld->builder,
+                         mask->cont_mask,
+                         mask->break_mask,
+                         "maskcb");
+      mask->exec_mask = LLVMBuildAnd(mask->bld->builder,
+                                     mask->cond_mask,
+                                     tmp,
+                                     "maskfull");
+   } else
+      mask->exec_mask = mask->cond_mask;
+
+
+   mask->has_mask = (mask->cond_stack_size > 0 ||
+                     mask->loop_stack_size > 0);
 }
 
 static void lp_exec_mask_cond_push(struct lp_exec_mask *mask,
@@ -190,6 +222,89 @@ static void lp_exec_mask_cond_pop(struct lp_exec_mask *mask)
    lp_exec_mask_update(mask);
 }
 
+static void lp_exec_bgnloop(struct lp_exec_mask *mask)
+{
+
+   if (mask->cont_stack_size == 0)
+      mask->cont_mask = LLVMConstAllOnes(mask->int_vec_type);
+   if (mask->cont_stack_size == 0)
+      mask->break_mask = LLVMConstAllOnes(mask->int_vec_type);
+   if (mask->cond_stack_size == 0)
+      mask->cond_mask = LLVMConstAllOnes(mask->int_vec_type);
+   mask->loop_stack[mask->loop_stack_size++] = mask->loop_block;
+   mask->loop_block = lp_build_insert_new_block(mask->bld->builder, "bgnloop");
+   LLVMBuildBr(mask->bld->builder, mask->loop_block);
+   LLVMPositionBuilderAtEnd(mask->bld->builder, mask->loop_block);
+
+   lp_exec_mask_update(mask);
+}
+
+static void lp_exec_break(struct lp_exec_mask *mask)
+{
+   LLVMValueRef exec_mask = LLVMBuildNot(mask->bld->builder,
+                                         mask->exec_mask,
+                                         "break");
+
+   mask->break_stack[mask->break_stack_size++] = mask->break_mask;
+   if (mask->break_stack_size > 1) {
+      mask->break_mask = LLVMBuildAnd(mask->bld->builder,
+                                      mask->break_mask,
+                                      exec_mask, "break_full");
+   } else
+      mask->break_mask = exec_mask;
+
+   lp_exec_mask_update(mask);
+}
+
+static void lp_exec_continue(struct lp_exec_mask *mask)
+{
+   LLVMValueRef exec_mask = LLVMBuildNot(mask->bld->builder,
+                                         mask->exec_mask,
+                                         "");
+
+   mask->cont_stack[mask->cont_stack_size++] = mask->cont_mask;
+   if (mask->cont_stack_size > 1) {
+      mask->cont_mask = LLVMBuildAnd(mask->bld->builder,
+                                     mask->cont_mask,
+                                     exec_mask, "");
+   } else
+      mask->cont_mask = exec_mask;
+
+   lp_exec_mask_update(mask);
+}
+
+
+static void lp_exec_endloop(struct lp_exec_mask *mask)
+{
+   LLVMBasicBlockRef endloop;
+   LLVMTypeRef reg_type = LLVMIntType(mask->bld->type.width*
+                                      mask->bld->type.length);
+   /* i1cond = (mask == 0) */
+   LLVMValueRef i1cond = LLVMBuildICmp(
+      mask->bld->builder,
+      LLVMIntNE,
+      LLVMBuildBitCast(mask->bld->builder, mask->break_mask, reg_type, ""),
+      LLVMConstNull(reg_type), "");
+
+   endloop = lp_build_insert_new_block(mask->bld->builder, "endloop");
+
+   LLVMBuildCondBr(mask->bld->builder,
+                   i1cond, mask->loop_block, endloop);
+
+   LLVMPositionBuilderAtEnd(mask->bld->builder, endloop);
+
+   mask->loop_block = mask->loop_stack[--mask->loop_stack_size];
+   /* pop the break mask */
+   if (mask->cont_stack_size) {
+      mask->cont_mask = mask->cont_stack[--mask->cont_stack_size];
+   }
+   if (mask->break_stack_size) {
+      mask->break_mask = mask->cont_stack[--mask->break_stack_size];
+   }
+
+   lp_exec_mask_update(mask);
+}
+
 static void lp_exec_mask_store(struct lp_exec_mask *mask,
                                LLVMValueRef val,
                                LLVMValueRef dst)
@@ -360,7 +475,7 @@ emit_store(
       break;
 
    case TGSI_SAT_MINUS_PLUS_ONE:
-      value = lp_build_max(&bld->base, value, lp_build_const_scalar(bld->base.type, -1.0));
+      value = lp_build_max(&bld->base, value, lp_build_const_vec(bld->base.type, -1.0));
       value = lp_build_min(&bld->base, value, bld->base.one);
       break;
 
@@ -384,6 +499,11 @@ emit_store(
       assert(0);
       break;
 
+   case TGSI_FILE_PREDICATE:
+      /* FIXME */
+      assert(0);
+      break;
+
    default:
       assert( 0 );
    }
@@ -531,32 +651,40 @@ emit_declaration(
    unsigned first = decl->Range.First;
    unsigned last = decl->Range.Last;
    unsigned idx, i;
+   LLVMBasicBlockRef current_block =
+      LLVMGetInsertBlock(bld->base.builder);
+   LLVMBasicBlockRef first_block =
+      LLVMGetEntryBasicBlock(
+         LLVMGetBasicBlockParent(current_block));
+   LLVMValueRef first_inst =
+      LLVMGetFirstInstruction(first_block);
+
+   /* we want alloca's to be the first instruction
+    * in the function so we need to rewind the builder
+    * to the very beginning */
+   LLVMPositionBuilderBefore(bld->base.builder,
+                             first_inst);
 
    for (idx = first; idx <= last; ++idx) {
-      boolean ok;
-
       switch (decl->Declaration.File) {
       case TGSI_FILE_TEMPORARY:
          for (i = 0; i < NUM_CHANNELS; i++)
             bld->temps[idx][i] = lp_build_alloca(&bld->base);
-         ok = TRUE;
          break;
 
       case TGSI_FILE_OUTPUT:
          for (i = 0; i < NUM_CHANNELS; i++)
             bld->outputs[idx][i] = lp_build_alloca(&bld->base);
-         ok = TRUE;
          break;
 
       default:
          /* don't need to declare other vars */
-         ok = TRUE;
+         break;
       }
-
-      if (!ok)
-         return FALSE;
    }
 
+   LLVMPositionBuilderAtEnd(bld->base.builder,
+                            current_block);
    return TRUE;
 }
 
@@ -581,6 +709,17 @@ emit_instruction(
    if (indirect_temp_reference(inst))
       return FALSE;
 
+   /*
+    * Stores and write masks are handled in a general fashion after the long
+    * instruction opcode switch statement.
+    *
+    * Although not stricitly necessary, we avoid generating instructions for
+    * channels which won't be stored, in cases where's that easy. For some
+    * complex instructions, like texture sampling, it is more convenient to
+    * assume a full writemask and then let LLVM optimization passes eliminate
+    * redundant code.
+    */
+
    assert(info->num_dst <= 1);
    if(info->num_dst) {
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
@@ -865,7 +1004,7 @@ emit_instruction(
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
          src2 = emit_fetch( bld, inst, 2, chan_index );
-         tmp1 = lp_build_const_scalar(bld->base.type, 0.5);
+         tmp1 = lp_build_const_vec(bld->base.type, 0.5);
          tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GREATER, src2, tmp1);
          dst0[chan_index] = lp_build_select( &bld->base, tmp0, src0, src1 );
       }
@@ -1126,7 +1265,6 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_TEX:
-      /* XXX what about dst0 writemask? */
       emit_tex( bld, inst, FALSE, FALSE, dst0 );
       break;
 
@@ -1349,14 +1487,15 @@ emit_instruction(
    case TGSI_OPCODE_TXP:
       emit_tex( bld, inst, FALSE, TRUE, dst0 );
       break;
-      
+
    case TGSI_OPCODE_BRK:
-      /* FIXME */
-      return 0;
+      lp_exec_break(&bld->exec_mask);
       break;
 
    case TGSI_OPCODE_IF:
       tmp0 = emit_fetch(bld, inst, 0, CHAN_X);
+      tmp0 = lp_build_cmp(&bld->base, PIPE_FUNC_NOTEQUAL,
+                          tmp0, bld->base.zero);
       lp_exec_mask_cond_push(&bld->exec_mask, tmp0);
       break;
 
@@ -1366,6 +1505,10 @@ emit_instruction(
       return 0;
       break;
 
+   case TGSI_OPCODE_BGNLOOP:
+      lp_exec_bgnloop(&bld->exec_mask);
+      break;
+
    case TGSI_OPCODE_REP:
       /* deprecated */
       assert(0);
@@ -1386,6 +1529,10 @@ emit_instruction(
       return 0;
       break;
 
+   case TGSI_OPCODE_ENDLOOP:
+      lp_exec_endloop(&bld->exec_mask);
+      break;
+
    case TGSI_OPCODE_ENDREP:
       /* deprecated */
       assert(0);
@@ -1485,8 +1632,7 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_CONT:
-      /* FIXME */
-      return 0;
+      lp_exec_continue(&bld->exec_mask);
       break;
 
    case TGSI_OPCODE_EMIT:
@@ -1575,7 +1721,7 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
             assert(num_immediates < LP_MAX_IMMEDIATES);
             for( i = 0; i < size; ++i )
                bld.immediates[num_immediates][i] =
-                  lp_build_const_scalar(type, parse.FullToken.FullImmediate.u[i].Float);
+                  lp_build_const_vec(type, parse.FullToken.FullImmediate.u[i].Float);
             for( i = size; i < 4; ++i )
                bld.immediates[num_immediates][i] = bld.base.undef;
             num_immediates++;
@@ -1589,7 +1735,14 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
          assert( 0 );
       }
    }
-
+   if (0) {
+      LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
+      LLVMValueRef function = LLVMGetBasicBlockParent(block);
+      debug_printf("11111111111111111111111111111 \n");
+      tgsi_dump(tokens, 0);
+      LLVMDumpValue(function);
+      debug_printf("2222222222222222222222222222 \n");
+   }
    tgsi_parse_free( &parse );
 }
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.c b/src/gallium/auxiliary/gallivm/lp_bld_type.c
index c327ba045a..796af88caa 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.c
@@ -58,7 +58,10 @@ LLVMTypeRef
 lp_build_vec_type(struct lp_type type)
 {
    LLVMTypeRef elem_type = lp_build_elem_type(type);
-   return LLVMVectorType(elem_type, type.length);
+   if (type.length == 1)
+      return elem_type;
+   else
+      return LLVMVectorType(elem_type, type.length);
 }
 
 
@@ -115,6 +118,9 @@ lp_check_vec_type(struct lp_type type, LLVMTypeRef vec_type)
    if(!vec_type)
       return FALSE;
 
+   if (type.length == 1)
+      return lp_check_elem_type(type, vec_type);
+
    if(LLVMGetTypeKind(vec_type) != LLVMVectorTypeKind)
       return FALSE;
 
@@ -153,7 +159,10 @@ LLVMTypeRef
 lp_build_int_vec_type(struct lp_type type)
 {
    LLVMTypeRef elem_type = lp_build_int_elem_type(type);
-   return LLVMVectorType(elem_type, type.length);
+   if (type.length == 1)
+      return elem_type;
+   else
+      return LLVMVectorType(elem_type, type.length);
 }
 
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.h b/src/gallium/auxiliary/gallivm/lp_bld_type.h
index 16946cc28a..cd59d2faa6 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.h
@@ -37,9 +37,9 @@
 #define LP_BLD_TYPE_H
 
 
-#include <llvm-c/Core.h>  
+#include "pipe/p_compiler.h"
+#include "gallivm/lp_bld.h"
 
-#include <pipe/p_compiler.h>
 
 
 /**
@@ -103,7 +103,7 @@ struct lp_type {
    unsigned width:14;
 
    /**
-    * Vector length.
+    * Vector length.  If length==1, this is a scalar (float/int) type.
     *
     * width*length should be a power of two greater or equal to eight.
     *
@@ -139,6 +139,7 @@ struct lp_build_context
 };
 
 
+/** Create scalar float type */
 static INLINE struct lp_type
 lp_type_float(unsigned width)
 {
@@ -148,12 +149,29 @@ lp_type_float(unsigned width)
    res_type.floating = TRUE;
    res_type.sign = TRUE;
    res_type.width = width;
+   res_type.length = 1;
+
+   return res_type;
+}
+
+
+/** Create vector of float type */
+static INLINE struct lp_type
+lp_type_float_vec(unsigned width)
+{
+   struct lp_type res_type;
+
+   memset(&res_type, 0, sizeof res_type);
+   res_type.floating = TRUE;
+   res_type.sign = TRUE;
+   res_type.width = width;
    res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
 
    return res_type;
 }
 
 
+/** Create scalar int type */
 static INLINE struct lp_type
 lp_type_int(unsigned width)
 {
@@ -162,12 +180,28 @@ lp_type_int(unsigned width)
    memset(&res_type, 0, sizeof res_type);
    res_type.sign = TRUE;
    res_type.width = width;
+   res_type.length = 1;
+
+   return res_type;
+}
+
+
+/** Create vector int type */
+static INLINE struct lp_type
+lp_type_int_vec(unsigned width)
+{
+   struct lp_type res_type;
+
+   memset(&res_type, 0, sizeof res_type);
+   res_type.sign = TRUE;
+   res_type.width = width;
    res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
 
    return res_type;
 }
 
 
+/** Create scalar uint type */
 static INLINE struct lp_type
 lp_type_uint(unsigned width)
 {
@@ -175,6 +209,20 @@ lp_type_uint(unsigned width)
 
    memset(&res_type, 0, sizeof res_type);
    res_type.width = width;
+   res_type.length = 1;
+
+   return res_type;
+}
+
+
+/** Create vector uint type */
+static INLINE struct lp_type
+lp_type_uint_vec(unsigned width)
+{
+   struct lp_type res_type;
+
+   memset(&res_type, 0, sizeof res_type);
+   res_type.width = width;
    res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
 
    return res_type;
diff --git a/src/gallium/auxiliary/os/os_thread.h b/src/gallium/auxiliary/os/os_thread.h
index a04df4106f..07a4268fc0 100644
--- a/src/gallium/auxiliary/os/os_thread.h
+++ b/src/gallium/auxiliary/os/os_thread.h
@@ -40,7 +40,7 @@
 #include "util/u_debug.h" /* for assert */
 
 
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU)
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_EMBEDDED)
 
 #include <pthread.h> /* POSIX threads headers */
 #include <stdio.h> /* for perror() */
@@ -257,7 +257,7 @@ typedef unsigned pipe_condvar;
  * pipe_barrier
  */
 
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_HAIKU)
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_EMBEDDED)
 
 typedef pthread_barrier_t pipe_barrier;
 
@@ -299,22 +299,43 @@ static INLINE void pipe_barrier_wait(pipe_barrier *barrier)
 
 #else
 
-typedef unsigned pipe_barrier;
+typedef struct {
+   unsigned count;
+   unsigned waiters;
+   pipe_mutex mutex;
+   pipe_condvar condvar;
+} pipe_barrier;
 
 static INLINE void pipe_barrier_init(pipe_barrier *barrier, unsigned count)
 {
-   /* XXX we could implement barriers with a mutex and condition var */
-   assert(0);
+   barrier->count = count;
+   barrier->waiters = 0;
+   pipe_mutex_init(barrier->mutex);
+   pipe_condvar_init(barrier->condvar);
 }
 
 static INLINE void pipe_barrier_destroy(pipe_barrier *barrier)
 {
-   assert(0);
+   assert(barrier->waiters == 0);
+   pipe_mutex_destroy(barrier->mutex);
+   pipe_condvar_destroy(barrier->condvar);
 }
 
 static INLINE void pipe_barrier_wait(pipe_barrier *barrier)
 {
-   assert(0);
+   pipe_mutex_lock(barrier->mutex);
+
+   assert(barrier->waiters < barrier->count);
+   barrier->waiters++;
+
+   if (barrier->waiters < barrier->count) {
+      pipe_condvar_wait(barrier->condvar, barrier->mutex);
+   } else {
+      barrier->waiters = 0;
+      pipe_condvar_broadcast(barrier->condvar);
+   }
+
+   pipe_mutex_unlock(barrier->mutex);
 }
 
 
@@ -377,7 +398,7 @@ pipe_semaphore_wait(pipe_semaphore *sema)
  */
 
 typedef struct {
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU)
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_EMBEDDED)
    pthread_key_t key;
 #elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
    DWORD key;
@@ -392,7 +413,7 @@ typedef struct {
 static INLINE void
 pipe_tsd_init(pipe_tsd *tsd)
 {
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU)
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_EMBEDDED)
    if (pthread_key_create(&tsd->key, NULL/*free*/) != 0) {
       perror("pthread_key_create(): failed to allocate key for thread specific data");
       exit(-1);
@@ -409,7 +430,7 @@ pipe_tsd_get(pipe_tsd *tsd)
    if (tsd->initMagic != (int) PIPE_TSD_INIT_MAGIC) {
       pipe_tsd_init(tsd);
    }
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU)
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_EMBEDDED)
    return pthread_getspecific(tsd->key);
 #elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
    assert(0);
@@ -426,7 +447,7 @@ pipe_tsd_set(pipe_tsd *tsd, void *value)
    if (tsd->initMagic != (int) PIPE_TSD_INIT_MAGIC) {
       pipe_tsd_init(tsd);
    }
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU)
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_EMBEDDED)
    if (pthread_setspecific(tsd->key, value) != 0) {
       perror("pthread_set_specific() failed");
       exit(-1);
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer.h b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
index 34b1b77df4..a6c50dcf0c 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
@@ -48,7 +48,6 @@
 #include "util/u_debug.h"
 #include "util/u_inlines.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_state.h"
 
 
 #ifdef __cplusplus
@@ -58,8 +57,23 @@ extern "C" {
 
 struct pb_vtbl;
 struct pb_validate;
+struct pipe_fence_handle;
 
 
+#define PB_USAGE_CPU_READ  (1 << 0)
+#define PB_USAGE_CPU_WRITE (1 << 1)
+#define PB_USAGE_GPU_READ  (1 << 2)
+#define PB_USAGE_GPU_WRITE (1 << 3)
+#define PB_USAGE_UNSYNCHRONIZED (1 << 10)
+#define PB_USAGE_DONTBLOCK (1 << 9)
+
+#define PB_USAGE_CPU_READ_WRITE \
+   ( PB_USAGE_CPU_READ | PB_USAGE_CPU_WRITE )
+#define PB_USAGE_GPU_READ_WRITE \
+   ( PB_USAGE_GPU_READ | PB_USAGE_GPU_WRITE )
+#define PB_USAGE_WRITE \
+   ( PB_USAGE_CPU_WRITE | PB_USAGE_GPU_WRITE )
+
 /**
  * Buffer description.
  * 
@@ -83,7 +97,14 @@ typedef unsigned pb_size;
  */
 struct pb_buffer 
 {
-   struct pipe_buffer base;
+   /* This used to be a pipe_buffer struct:
+    */
+   struct {
+      struct pipe_reference  reference;
+      unsigned               size;
+      unsigned               alignment;
+      unsigned               usage;
+   } base;
 
    /**
     * Pointer to the virtual function table.
@@ -106,7 +127,7 @@ struct pb_vtbl
 
    /** 
     * Map the entire data store of a buffer object into the client's address.
-    * flags is bitmask of PIPE_BUFFER_FLAG_READ/WRITE. 
+    * flags is bitmask of PB_USAGE_CPU_READ/WRITE. 
     */
    void *(*map)( struct pb_buffer *buf, 
                  unsigned flags );
@@ -138,23 +159,6 @@ struct pb_vtbl
 };
 
 
-static INLINE struct pipe_buffer *
-pb_pipe_buffer( struct pb_buffer *pbuf )
-{
-   assert(pbuf);
-   return &pbuf->base;
-}
-
-
-static INLINE struct pb_buffer *
-pb_buffer( struct pipe_buffer *buf )
-{
-   assert(buf);
-   /* Could add a magic cookie check on debug builds.
-    */
-   return (struct pb_buffer *)buf;
-}
-
 
 /* Accessor functions for pb->vtbl:
  */
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
index d97f749b6e..d6cf640582 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
@@ -135,7 +135,7 @@ struct fenced_buffer
    void *data;
 
    /**
-    * A bitmask of PIPE_BUFFER_USAGE_CPU/GPU_READ/WRITE describing the current
+    * A bitmask of PB_USAGE_CPU/GPU_READ/WRITE describing the current
     * buffer usage.
     */
    unsigned flags;
@@ -270,7 +270,7 @@ fenced_buffer_add_locked(struct fenced_manager *fenced_mgr,
                          struct fenced_buffer *fenced_buf)
 {
    assert(pipe_is_referenced(&fenced_buf->base.base.reference));
-   assert(fenced_buf->flags & PIPE_BUFFER_USAGE_GPU_READ_WRITE);
+   assert(fenced_buf->flags & PB_USAGE_GPU_READ_WRITE);
    assert(fenced_buf->fence);
 
    p_atomic_inc(&fenced_buf->base.base.reference.count);
@@ -299,7 +299,7 @@ fenced_buffer_remove_locked(struct fenced_manager *fenced_mgr,
    assert(fenced_buf->mgr == fenced_mgr);
 
    ops->fence_reference(ops, &fenced_buf->fence, NULL);
-   fenced_buf->flags &= ~PIPE_BUFFER_USAGE_GPU_READ_WRITE;
+   fenced_buf->flags &= ~PB_USAGE_GPU_READ_WRITE;
 
    assert(fenced_buf->head.prev);
    assert(fenced_buf->head.next);
@@ -377,7 +377,7 @@ fenced_buffer_finish_locked(struct fenced_manager *fenced_mgr,
 
          assert(!destroyed);
 
-         fenced_buf->flags &= ~PIPE_BUFFER_USAGE_GPU_READ_WRITE;
+         fenced_buf->flags &= ~PB_USAGE_GPU_READ_WRITE;
 
          ret = PIPE_OK;
       }
@@ -624,7 +624,7 @@ fenced_buffer_copy_storage_to_gpu_locked(struct fenced_buffer *fenced_buf)
    assert(fenced_buf->data);
    assert(fenced_buf->buffer);
 
-   map = pb_map(fenced_buf->buffer, PIPE_BUFFER_USAGE_CPU_WRITE);
+   map = pb_map(fenced_buf->buffer, PB_USAGE_CPU_WRITE);
    if(!map)
       return PIPE_ERROR;
 
@@ -644,7 +644,7 @@ fenced_buffer_copy_storage_to_cpu_locked(struct fenced_buffer *fenced_buf)
    assert(fenced_buf->data);
    assert(fenced_buf->buffer);
 
-   map = pb_map(fenced_buf->buffer, PIPE_BUFFER_USAGE_CPU_READ);
+   map = pb_map(fenced_buf->buffer, PB_USAGE_CPU_READ);
    if(!map)
       return PIPE_ERROR;
 
@@ -683,24 +683,24 @@ fenced_buffer_map(struct pb_buffer *buf,
 
    pipe_mutex_lock(fenced_mgr->mutex);
 
-   assert(!(flags & PIPE_BUFFER_USAGE_GPU_READ_WRITE));
+   assert(!(flags & PB_USAGE_GPU_READ_WRITE));
 
    /*
     * Serialize writes.
     */
-   while((fenced_buf->flags & PIPE_BUFFER_USAGE_GPU_WRITE) ||
-         ((fenced_buf->flags & PIPE_BUFFER_USAGE_GPU_READ) &&
-          (flags & PIPE_BUFFER_USAGE_CPU_WRITE))) {
+   while((fenced_buf->flags & PB_USAGE_GPU_WRITE) ||
+         ((fenced_buf->flags & PB_USAGE_GPU_READ) &&
+          (flags & PB_USAGE_CPU_WRITE))) {
 
       /* 
        * Don't wait for the GPU to finish accessing it, if blocking is forbidden.
        */
-      if((flags & PIPE_BUFFER_USAGE_DONTBLOCK) &&
+      if((flags & PB_USAGE_DONTBLOCK) &&
           ops->fence_signalled(ops, fenced_buf->fence, 0) != 0) {
          goto done;
       }
 
-      if (flags & PIPE_BUFFER_USAGE_UNSYNCHRONIZED) {
+      if (flags & PB_USAGE_UNSYNCHRONIZED) {
          break;
       }
 
@@ -721,7 +721,7 @@ fenced_buffer_map(struct pb_buffer *buf,
 
    if(map) {
       ++fenced_buf->mapcount;
-      fenced_buf->flags |= flags & PIPE_BUFFER_USAGE_CPU_READ_WRITE;
+      fenced_buf->flags |= flags & PB_USAGE_CPU_READ_WRITE;
    }
 
 done:
@@ -745,7 +745,7 @@ fenced_buffer_unmap(struct pb_buffer *buf)
          pb_unmap(fenced_buf->buffer);
       --fenced_buf->mapcount;
       if(!fenced_buf->mapcount)
-	 fenced_buf->flags &= ~PIPE_BUFFER_USAGE_CPU_READ_WRITE;
+	 fenced_buf->flags &= ~PB_USAGE_CPU_READ_WRITE;
    }
 
    pipe_mutex_unlock(fenced_mgr->mutex);
@@ -771,9 +771,9 @@ fenced_buffer_validate(struct pb_buffer *buf,
       goto done;
    }
 
-   assert(flags & PIPE_BUFFER_USAGE_GPU_READ_WRITE);
-   assert(!(flags & ~PIPE_BUFFER_USAGE_GPU_READ_WRITE));
-   flags &= PIPE_BUFFER_USAGE_GPU_READ_WRITE;
+   assert(flags & PB_USAGE_GPU_READ_WRITE);
+   assert(!(flags & ~PB_USAGE_GPU_READ_WRITE));
+   flags &= PB_USAGE_GPU_READ_WRITE;
 
    /* Buffer cannot be validated in two different lists */
    if(fenced_buf->vl && fenced_buf->vl != vl) {
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.h b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.h
index 0372f81d0a..004c2b939a 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.h
@@ -59,7 +59,6 @@ extern "C" {
 #endif
 
 
-struct pipe_buffer;
 struct pipe_fence_handle;
 
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c
index 6bdce5fcb0..b706f429be 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c
@@ -135,9 +135,9 @@ pb_malloc_buffer_create(pb_size size,
       return NULL;
 
    pipe_reference_init(&buf->base.base.reference, 1);
-   buf->base.base.alignment = desc->alignment;
    buf->base.base.usage = desc->usage;
    buf->base.base.size = size;
+   buf->base.base.alignment = desc->alignment;
    buf->base.vtbl = &malloc_buffer_vtbl;
 
    buf->data = align_malloc(size, desc->alignment < sizeof(void*) ? sizeof(void*) : desc->alignment);
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
index 06669917ff..cec2524da2 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
@@ -60,7 +60,6 @@ extern "C" {
 
 
 struct pb_desc;
-struct pipe_buffer;
 
 
 /** 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
index 86f9266c95..88501e8d72 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
@@ -227,6 +227,8 @@ pb_cache_is_buffer_compat(struct pb_cache_buffer *buf,
                           pb_size size,
                           const struct pb_desc *desc)
 {
+   void *map;
+
    if(buf->base.base.size < size)
       return FALSE;
 
@@ -239,6 +241,13 @@ pb_cache_is_buffer_compat(struct pb_cache_buffer *buf,
    
    if(!pb_check_usage(desc->usage, buf->base.base.usage))
       return FALSE;
+
+   map = pb_map(buf->buffer, PB_USAGE_DONTBLOCK);
+   if (!map) {
+      return FALSE;
+   }
+
+   pb_unmap(buf->buffer);
    
    return TRUE;
 }
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
index a5dbded2bc..0dc5b31a75 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
@@ -158,7 +158,7 @@ pb_debug_buffer_fill(struct pb_debug_buffer *buf)
 {
    uint8_t *map;
    
-   map = pb_map(buf->buffer, PIPE_BUFFER_USAGE_CPU_WRITE);
+   map = pb_map(buf->buffer, PB_USAGE_CPU_WRITE);
    assert(map);
    if(map) {
       fill_random_pattern(map, buf->underflow_size);
@@ -180,8 +180,8 @@ pb_debug_buffer_check(struct pb_debug_buffer *buf)
    uint8_t *map;
    
    map = pb_map(buf->buffer,
-                PIPE_BUFFER_USAGE_CPU_READ |
-                PIPE_BUFFER_USAGE_UNSYNCHRONIZED);
+                PB_USAGE_CPU_READ |
+                PB_USAGE_UNSYNCHRONIZED);
    assert(map);
    if(map) {
       boolean underflow, overflow;
@@ -382,8 +382,8 @@ pb_debug_manager_create_buffer(struct pb_manager *_mgr,
    
    real_size = mgr->underflow_size + size + mgr->overflow_size;
    real_desc = *desc;
-   real_desc.usage |= PIPE_BUFFER_USAGE_CPU_WRITE;
-   real_desc.usage |= PIPE_BUFFER_USAGE_CPU_READ;
+   real_desc.usage |= PB_USAGE_CPU_WRITE;
+   real_desc.usage |= PB_USAGE_CPU_READ;
 
    buf->buffer = mgr->provider->create_buffer(mgr->provider, 
                                               real_size, 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
index 63195715d6..faf7c35267 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
@@ -268,8 +268,8 @@ mm_bufmgr_create_from_buffer(struct pb_buffer *buffer,
    mm->buffer = buffer; 
 
    mm->map = pb_map(mm->buffer, 
-		    PIPE_BUFFER_USAGE_CPU_READ |
-		    PIPE_BUFFER_USAGE_CPU_WRITE);
+		    PB_USAGE_CPU_READ |
+		    PB_USAGE_CPU_WRITE);
    if(!mm->map)
       goto failure;
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_ondemand.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_ondemand.c
index cb32d25136..31f1ebbeb7 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_ondemand.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_ondemand.c
@@ -150,7 +150,7 @@ pb_ondemand_buffer_instantiate(struct pb_ondemand_buffer *buf)
       if(!buf->buffer)
          return PIPE_ERROR_OUT_OF_MEMORY;
       
-      map = pb_map(buf->buffer, PIPE_BUFFER_USAGE_CPU_READ);
+      map = pb_map(buf->buffer, PB_USAGE_CPU_READ);
       if(!map) {
          pb_reference(&buf->buffer, NULL);
          return PIPE_ERROR;
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
index fea234ae8c..fdcce42878 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
@@ -284,8 +284,8 @@ pool_bufmgr_create(struct pb_manager *provider,
       goto failure;
 
    pool->map = pb_map(pool->buffer,
-                          PIPE_BUFFER_USAGE_CPU_READ |
-                          PIPE_BUFFER_USAGE_CPU_WRITE);
+                          PB_USAGE_CPU_READ |
+                          PB_USAGE_CPU_WRITE);
    if(!pool->map)
       goto failure;
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
index 24e2820f88..7a3305aaf3 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
@@ -315,8 +315,8 @@ pb_slab_create(struct pb_slab_manager *mgr)
    /* Note down the slab virtual address. All mappings are accessed directly 
     * through this address so it is required that the buffer is pinned. */
    slab->virtual = pb_map(slab->bo, 
-                          PIPE_BUFFER_USAGE_CPU_READ |
-                          PIPE_BUFFER_USAGE_CPU_WRITE);
+                          PB_USAGE_CPU_READ |
+                          PB_USAGE_CPU_WRITE);
    if(!slab->virtual) {
       ret = PIPE_ERROR_OUT_OF_MEMORY;
       goto out_err1;
diff --git a/src/gallium/auxiliary/pipebuffer/pb_validate.c b/src/gallium/auxiliary/pipebuffer/pb_validate.c
index 903afc749d..b585422460 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_validate.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_validate.c
@@ -69,9 +69,9 @@ pb_validate_add_buffer(struct pb_validate *vl,
    if(!buf)
       return PIPE_ERROR;
 
-   assert(flags & PIPE_BUFFER_USAGE_GPU_READ_WRITE);
-   assert(!(flags & ~PIPE_BUFFER_USAGE_GPU_READ_WRITE));
-   flags &= PIPE_BUFFER_USAGE_GPU_READ_WRITE;
+   assert(flags & PB_USAGE_GPU_READ_WRITE);
+   assert(!(flags & ~PB_USAGE_GPU_READ_WRITE));
+   flags &= PB_USAGE_GPU_READ_WRITE;
 
    /* We only need to store one reference for each buffer, so avoid storing
     * consecutive references for the same buffer. It might not be the most 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index f675427d98..7595214bdf 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -87,7 +87,7 @@ void x86_print_reg( struct x86_reg reg )
       foo++;                                    \
    if  (*foo)                                   \
       foo++;                                    \
-   debug_printf( "\n% 4x% 15s ", p->csr - p->store, foo );             \
+   debug_printf( "\n%4x %14s ", p->csr - p->store, foo );             \
 } while (0)
 
 #define DUMP_I( I ) do {                        \
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index f7612d416a..319b836ffb 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -102,7 +102,7 @@ enum sse_cc {
 #define cc_Z  cc_E
 #define cc_NZ cc_NE
 
-/* Begin/end/retreive function creation:
+/* Begin/end/retrieve function creation:
  */
 
 
@@ -311,8 +311,8 @@ void x87_fucom( struct x86_function *p, struct x86_reg arg );
 
 
 
-/* Retreive a reference to one of the function arguments, taking into
- * account any push/pop activity.  Note - doesn't track explict
+/* Retrieve a reference to one of the function arguments, taking into
+ * account any push/pop activity.  Note - doesn't track explicit
  * manipulation of ESP by other instructions.
  */
 struct x86_reg x86_fn_arg( struct x86_function *p, unsigned arg );
diff --git a/src/gallium/auxiliary/util/u_timed_winsys.h b/src/gallium/auxiliary/target-helpers/wrap_screen.c
index 542365112d..5fe3013938 100644
--- a/src/gallium/auxiliary/util/u_timed_winsys.h
+++ b/src/gallium/auxiliary/target-helpers/wrap_screen.c
@@ -1,6 +1,6 @@
 /**************************************************************************
  * 
- * Copyright 2006 Tungsten Graphics, Inc., Bismarck, ND., USA
+ * Copyright 2007 Tungsten Graphics, Inc., Bismarck, ND., USA
  * All Rights Reserved.
  * 
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -25,17 +25,41 @@
  * 
  * 
  **************************************************************************/
+
 /*
- * Authors: Keith Whitwell <keithw-at-tungstengraphics-dot-com>
+ * Authors:
+ *   Keith Whitwell
+ */
+
+#include "target-helpers/wrap_screen.h"
+#include "trace/tr_public.h"
+#include "identity/id_public.h"
+#include "util/u_debug.h"
+
+
+/* Centralized code to inject common wrapping layers:
  */
+struct pipe_screen *
+gallium_wrap_screen( struct pipe_screen *screen )
+{
+   /* Screen wrapping functions are required not to fail.  If it is
+    * impossible to wrap a screen, the unwrapped screen should be
+    * returned instead.  Any failure condition should be returned in
+    * an OUT argument.
+    *
+    * Otherwise it is really messy trying to clean up in this code.
+    */
+   if (debug_get_bool_option("GALLIUM_WRAP", FALSE)) {
+      screen = identity_screen_create(screen);
+   }
 
+   if (debug_get_bool_option("GALLIUM_TRACE", FALSE)) {
+      screen = trace_screen_create( screen );
+   }
 
-#ifndef U_TIMED_WINSYS_H
-#define U_TIMED_WINSYS_H
+   return screen;
+}
 
 
-struct pipe_winsys;
-struct pipe_winsys *u_timed_winsys_create( struct pipe_winsys *backend );
 
 
-#endif
diff --git a/src/gallium/auxiliary/target-helpers/wrap_screen.h b/src/gallium/auxiliary/target-helpers/wrap_screen.h
new file mode 100644
index 0000000000..7e76beb7c5
--- /dev/null
+++ b/src/gallium/auxiliary/target-helpers/wrap_screen.h
@@ -0,0 +1,16 @@
+#ifndef WRAP_SCREEN_HELPER_H
+#define WRAP_SCREEN_HELPER_H
+
+#include "pipe/p_compiler.h"
+
+struct pipe_screen;
+
+/* Centralized code to inject common wrapping layers.  Other layers
+ * can be introduced by specific targets, but these are the generally
+ * helpful ones we probably want everywhere.
+ */
+struct pipe_screen *
+gallium_wrap_screen( struct pipe_screen *screen );
+
+
+#endif
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index f853ea2820..11045e4ba2 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -1796,6 +1796,12 @@ exec_declaration(struct tgsi_exec_machine *mach,
          last = decl->Range.Last;
          mask = decl->Declaration.UsageMask;
 
+         /* XXX we could remove this special-case code since
+          * mach->InterpCoefs[first].a0 should already have the
+          * front/back-face value.  But we should first update the
+          * ureg code to emit the right UsageMask value (WRITEMASK_X).
+          * Then, we could remove the tgsi_exec_machine::Face field.
+          */
          if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
             uint i;
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.c b/src/gallium/auxiliary/tgsi/tgsi_text.c
index f918151daa..0b468a9184 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_text.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_text.c
@@ -55,7 +55,7 @@ static boolean is_digit_alpha_underscore( const char *cur )
    return is_digit( cur ) || is_alpha_underscore( cur );
 }
 
-static boolean uprcase( char c )
+static char uprcase( char c )
 {
    if (c >= 'a' && c <= 'z')
       return c += 'A' - 'a';
@@ -76,7 +76,7 @@ streq_nocase_uprcase(const char *str1,
       str1++;
       str2++;
    }
-   return TRUE;
+   return *str1 == 0 && *str2 == 0;
 }
 
 static boolean str_match_no_case( const char **pcur, const char *str )
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index 3d0455de7c..f725405ade 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -1158,7 +1158,7 @@ static void emit_decl_range( struct ureg_program *ureg,
    out[0].decl.Type = TGSI_TOKEN_TYPE_DECLARATION;
    out[0].decl.NrTokens = 2;
    out[0].decl.File = file;
-   out[0].decl.UsageMask = 0xf;
+   out[0].decl.UsageMask = TGSI_WRITEMASK_XYZW;
    out[0].decl.Interpolate = TGSI_INTERPOLATE_CONSTANT;
    out[0].decl.Semantic = 0;
 
@@ -1180,7 +1180,7 @@ emit_decl_range2D(struct ureg_program *ureg,
    out[0].decl.Type = TGSI_TOKEN_TYPE_DECLARATION;
    out[0].decl.NrTokens = 3;
    out[0].decl.File = file;
-   out[0].decl.UsageMask = 0xf;
+   out[0].decl.UsageMask = TGSI_WRITEMASK_XYZW;
    out[0].decl.Interpolate = TGSI_INTERPOLATE_CONSTANT;
    out[0].decl.Dimension = 1;
 
diff --git a/src/gallium/auxiliary/translate/translate_generic.c b/src/gallium/auxiliary/translate/translate_generic.c
index c9ec2b32bf..c3ec9ae3f4 100644
--- a/src/gallium/auxiliary/translate/translate_generic.c
+++ b/src/gallium/auxiliary/translate/translate_generic.c
@@ -393,10 +393,10 @@ static fetch_func get_fetch_func( enum pipe_format format )
       return &fetch_R8G8B8A8_SSCALED;
 
    case PIPE_FORMAT_B8G8R8A8_UNORM:
-      return &fetch_A8R8G8B8_UNORM;
+      return &fetch_B8G8R8A8_UNORM;
 
    case PIPE_FORMAT_A8R8G8B8_UNORM:
-      return &fetch_B8G8R8A8_UNORM;
+      return &fetch_A8R8G8B8_UNORM;
 
    case PIPE_FORMAT_R32_FIXED:
       return &fetch_R32_FIXED;
@@ -552,10 +552,10 @@ static emit_func get_emit_func( enum pipe_format format )
       return &emit_R8G8B8A8_SSCALED;
 
    case PIPE_FORMAT_B8G8R8A8_UNORM:
-      return &emit_A8R8G8B8_UNORM;
+      return &emit_B8G8R8A8_UNORM;
 
    case PIPE_FORMAT_A8R8G8B8_UNORM:
-      return &emit_B8G8R8A8_UNORM;
+      return &emit_A8R8G8B8_UNORM;
 
    default:
       assert(0); 
diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c
index 03e093c11e..c13e742738 100644
--- a/src/gallium/auxiliary/translate/translate_sse.c
+++ b/src/gallium/auxiliary/translate/translate_sse.c
@@ -336,7 +336,7 @@ static boolean translate_attr( struct translate_sse *p,
    case PIPE_FORMAT_R32G32B32A32_FLOAT:
       emit_load_R32G32B32A32(p, dataXMM, srcECX);
       break;
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
       emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
       emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
       break;
@@ -360,7 +360,7 @@ static boolean translate_attr( struct translate_sse *p,
    case PIPE_FORMAT_R32G32B32A32_FLOAT:
       emit_store_R32G32B32A32(p, dstEAX, dataXMM);
       break;
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
       emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
       emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
       break;
diff --git a/src/gallium/auxiliary/util/.gitignore b/src/gallium/auxiliary/util/.gitignore
index 448d2f304f..5dd0408eff 100644
--- a/src/gallium/auxiliary/util/.gitignore
+++ b/src/gallium/auxiliary/util/.gitignore
@@ -1,3 +1,3 @@
-u_format_access.c
+u_format_srgb.c
 u_format_table.c
-u_format_pack.h
+u_half.c
diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c
index 0b263a9db5..7850f81e58 100644
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -45,6 +45,7 @@
 #include "util/u_format.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "util/u_sampler.h"
 #include "util/u_simple_shaders.h"
 #include "util/u_surface.h"
 #include "util/u_rect.h"
@@ -63,11 +64,12 @@ struct blit_state
    struct pipe_sampler_state sampler;
    struct pipe_viewport_state viewport;
    struct pipe_clip_state clip;
+   struct pipe_vertex_element velem[2];
 
    void *vs;
    void *fs[TGSI_WRITEMASK_XYZW + 1];
 
-   struct pipe_buffer *vbuf;  /**< quad vertices */
+   struct pipe_resource *vbuf;  /**< quad vertices */
    unsigned vbuf_slot;
 
    float vertices[4][2][4];   /**< vertex/texcoords for quad */
@@ -114,6 +116,15 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso)
    ctx->sampler.mag_img_filter = 0; /* set later */
    ctx->sampler.normalized_coords = 1;
 
+   /* vertex elements state */
+   memset(&ctx->velem[0], 0, sizeof(ctx->velem[0]) * 2);
+   for (i = 0; i < 2; i++) {
+      ctx->velem[i].src_offset = i * 4 * sizeof(float);
+      ctx->velem[i].instance_divisor = 0;
+      ctx->velem[i].vertex_buffer_index = 0;
+      ctx->velem[i].src_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+   }
+
    /* vertex shader - still required to provide the linkage between
     * fragment shader input semantics and vertex_element/buffers.
     */
@@ -156,7 +167,7 @@ util_destroy_blit(struct blit_state *ctx)
       if (ctx->fs[i])
          pipe->delete_fs_state(pipe, ctx->fs[i]);
 
-   pipe_buffer_reference(&ctx->vbuf, NULL);
+   pipe_resource_reference(&ctx->vbuf, NULL);
 
    FREE(ctx);
 }
@@ -175,8 +186,7 @@ get_next_slot( struct blit_state *ctx )
 
    if (!ctx->vbuf) {
       ctx->vbuf = pipe_buffer_create(ctx->pipe->screen,
-                                     32,
-                                     PIPE_BUFFER_USAGE_VERTEX,
+                                     PIPE_BIND_VERTEX_BUFFER,
                                      max_slots * sizeof ctx->vertices);
    }
    
@@ -225,7 +235,7 @@ setup_vertex_data_tex(struct blit_state *ctx,
 
    offset = get_next_slot( ctx );
 
-   pipe_buffer_write_nooverlap(ctx->pipe->screen, ctx->vbuf,
+   pipe_buffer_write_nooverlap(ctx->pipe, ctx->vbuf,
                                offset, sizeof(ctx->vertices), ctx->vertices);
 
    return offset;
@@ -270,6 +280,7 @@ regions_overlap(int srcX0, int srcY0,
 void
 util_blit_pixels_writemask(struct blit_state *ctx,
                            struct pipe_surface *src,
+                           struct pipe_sampler_view *src_sampler_view,
                            int srcX0, int srcY0,
                            int srcX1, int srcY1,
                            struct pipe_surface *dst,
@@ -280,7 +291,7 @@ util_blit_pixels_writemask(struct blit_state *ctx,
 {
    struct pipe_context *pipe = ctx->pipe;
    struct pipe_screen *screen = pipe->screen;
-   struct pipe_texture *tex = NULL;
+   struct pipe_sampler_view *sampler_view = NULL;
    struct pipe_framebuffer_state fb;
    const int srcW = abs(srcX1 - srcX0);
    const int srcH = abs(srcY1 - srcY0);
@@ -292,9 +303,9 @@ util_blit_pixels_writemask(struct blit_state *ctx,
           filter == PIPE_TEX_MIPFILTER_LINEAR);
 
    assert(screen->is_format_supported(screen, src->format, PIPE_TEXTURE_2D,
-                                      PIPE_TEXTURE_USAGE_SAMPLER, 0));
+                                      PIPE_BIND_SAMPLER_VIEW, 0));
    assert(screen->is_format_supported(screen, dst->format, PIPE_TEXTURE_2D,
-                                      PIPE_TEXTURE_USAGE_RENDER_TARGET, 0));
+                                      PIPE_BIND_RENDER_TARGET, 0));
 
    /* do the regions overlap? */
    overlap = util_same_surface(src, dst) &&
@@ -323,7 +334,7 @@ util_blit_pixels_writemask(struct blit_state *ctx,
    }
    
    assert(screen->is_format_supported(screen, dst->format, PIPE_TEXTURE_2D,
-                                      PIPE_TEXTURE_USAGE_RENDER_TARGET, 0));
+                                      PIPE_BIND_RENDER_TARGET, 0));
 
    /* Create a temporary texture when src and dest alias or when src
     * is anything other than a single-level 2d texture.
@@ -334,7 +345,9 @@ util_blit_pixels_writemask(struct blit_state *ctx,
        src->texture->target != PIPE_TEXTURE_2D ||
        src->texture->last_level != 0)
    {
-      struct pipe_texture texTemp;
+      struct pipe_resource texTemp;
+      struct pipe_resource *tex;
+      struct pipe_sampler_view sv_templ;
       struct pipe_surface *texSurf;
       const int srcLeft = MIN2(srcX0, srcX1);
       const int srcTop = MIN2(srcY0, srcY1);
@@ -362,12 +375,20 @@ util_blit_pixels_writemask(struct blit_state *ctx,
       texTemp.height0 = srcH;
       texTemp.depth0 = 1;
 
-      tex = screen->texture_create(screen, &texTemp);
+      tex = screen->resource_create(screen, &texTemp);
       if (!tex)
          return;
 
+      u_sampler_view_default_template(&sv_templ, tex, tex->format);
+
+      sampler_view = ctx->pipe->create_sampler_view(ctx->pipe, tex, &sv_templ);
+      if (!sampler_view) {
+         pipe_resource_reference(&tex, NULL);
+         return;
+      }
+
       texSurf = screen->get_tex_surface(screen, tex, 0, 0, 0, 
-                                        PIPE_BUFFER_USAGE_GPU_WRITE);
+                                        PIPE_BIND_BLIT_DESTINATION);
 
       /* load temp texture */
       if (pipe->surface_copy) {
@@ -389,33 +410,38 @@ util_blit_pixels_writemask(struct blit_state *ctx,
       s1 = 1.0f;
       t0 = 0.0f;
       t1 = 1.0f;
+
+      pipe_resource_reference(&tex, NULL);
    }
    else {
-      pipe_texture_reference(&tex, src->texture);
-      s0 = srcX0 / (float)tex->width0;
-      s1 = srcX1 / (float)tex->width0;
-      t0 = srcY0 / (float)tex->height0;
-      t1 = srcY1 / (float)tex->height0;
+      pipe_sampler_view_reference(&sampler_view, src_sampler_view);
+      s0 = srcX0 / (float)src->texture->width0;
+      s1 = srcX1 / (float)src->texture->width0;
+      t0 = srcY0 / (float)src->texture->height0;
+      t1 = srcY1 / (float)src->texture->height0;
    }
 
+   
 
    /* save state (restored below) */
    cso_save_blend(ctx->cso);
    cso_save_depth_stencil_alpha(ctx->cso);
    cso_save_rasterizer(ctx->cso);
    cso_save_samplers(ctx->cso);
-   cso_save_sampler_textures(ctx->cso);
+   cso_save_fragment_sampler_views(ctx->cso);
    cso_save_viewport(ctx->cso);
    cso_save_framebuffer(ctx->cso);
    cso_save_fragment_shader(ctx->cso);
    cso_save_vertex_shader(ctx->cso);
    cso_save_clip(ctx->cso);
+   cso_save_vertex_elements(ctx->cso);
 
    /* set misc state we care about */
    cso_set_blend(ctx->cso, &ctx->blend);
    cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil);
    cso_set_rasterizer(ctx->cso, &ctx->rasterizer);
    cso_set_clip(ctx->cso, &ctx->clip);
+   cso_set_vertex_elements(ctx->cso, 2, ctx->velem);
 
    /* sampler */
    ctx->sampler.min_img_filter = filter;
@@ -435,7 +461,7 @@ util_blit_pixels_writemask(struct blit_state *ctx,
    cso_set_viewport(ctx->cso, &ctx->viewport);
 
    /* texture */
-   cso_set_sampler_textures(ctx->cso, 1, &tex);
+   cso_set_fragment_sampler_views(ctx->cso, 1, &sampler_view);
 
    if (ctx->fs[writemask] == NULL)
       ctx->fs[writemask] =
@@ -474,20 +500,22 @@ util_blit_pixels_writemask(struct blit_state *ctx,
    cso_restore_depth_stencil_alpha(ctx->cso);
    cso_restore_rasterizer(ctx->cso);
    cso_restore_samplers(ctx->cso);
-   cso_restore_sampler_textures(ctx->cso);
+   cso_restore_fragment_sampler_views(ctx->cso);
    cso_restore_viewport(ctx->cso);
    cso_restore_framebuffer(ctx->cso);
    cso_restore_fragment_shader(ctx->cso);
    cso_restore_vertex_shader(ctx->cso);
    cso_restore_clip(ctx->cso);
+   cso_restore_vertex_elements(ctx->cso);
 
-   pipe_texture_reference(&tex, NULL);
+   pipe_sampler_view_reference(&sampler_view, NULL);
 }
 
 
 void
 util_blit_pixels(struct blit_state *ctx,
                  struct pipe_surface *src,
+                 struct pipe_sampler_view *src_sampler_view,
                  int srcX0, int srcY0,
                  int srcX1, int srcY1,
                  struct pipe_surface *dst,
@@ -495,7 +523,7 @@ util_blit_pixels(struct blit_state *ctx,
                  int dstX1, int dstY1,
                  float z, uint filter )
 {
-   util_blit_pixels_writemask( ctx, src, 
+   util_blit_pixels_writemask( ctx, src, src_sampler_view,
                                srcX0, srcY0,
                                srcX1, srcY1,
                                dst,
@@ -511,7 +539,7 @@ util_blit_pixels(struct blit_state *ctx,
  */
 void util_blit_flush( struct blit_state *ctx )
 {
-   pipe_buffer_reference(&ctx->vbuf, NULL);
+   pipe_resource_reference(&ctx->vbuf, NULL);
    ctx->vbuf_slot = 0;
 } 
 
@@ -526,21 +554,23 @@ void util_blit_flush( struct blit_state *ctx )
  */
 void
 util_blit_pixels_tex(struct blit_state *ctx,
-                 struct pipe_texture *tex,
-                 int srcX0, int srcY0,
-                 int srcX1, int srcY1,
-                 struct pipe_surface *dst,
-                 int dstX0, int dstY0,
-                 int dstX1, int dstY1,
-                 float z, uint filter)
+                     struct pipe_sampler_view *src_sampler_view,
+                     int srcX0, int srcY0,
+                     int srcX1, int srcY1,
+                     struct pipe_surface *dst,
+                     int dstX0, int dstY0,
+                     int dstX1, int dstY1,
+                     float z, uint filter)
 {
    struct pipe_framebuffer_state fb;
    float s0, t0, s1, t1;
    unsigned offset;
+   struct pipe_resource *tex = src_sampler_view->texture;
 
    assert(filter == PIPE_TEX_MIPFILTER_NEAREST ||
           filter == PIPE_TEX_MIPFILTER_LINEAR);
 
+   assert(tex);
    assert(tex->width0 != 0);
    assert(tex->height0 != 0);
 
@@ -551,7 +581,7 @@ util_blit_pixels_tex(struct blit_state *ctx,
 
    assert(ctx->pipe->screen->is_format_supported(ctx->pipe->screen, dst->format,
                                                  PIPE_TEXTURE_2D,
-                                                 PIPE_TEXTURE_USAGE_RENDER_TARGET,
+                                                 PIPE_BIND_RENDER_TARGET,
                                                  0));
 
    /* save state (restored below) */
@@ -559,17 +589,19 @@ util_blit_pixels_tex(struct blit_state *ctx,
    cso_save_depth_stencil_alpha(ctx->cso);
    cso_save_rasterizer(ctx->cso);
    cso_save_samplers(ctx->cso);
-   cso_save_sampler_textures(ctx->cso);
+   cso_save_fragment_sampler_views(ctx->cso);
    cso_save_framebuffer(ctx->cso);
    cso_save_fragment_shader(ctx->cso);
    cso_save_vertex_shader(ctx->cso);
    cso_save_clip(ctx->cso);
+   cso_save_vertex_elements(ctx->cso);
 
    /* set misc state we care about */
    cso_set_blend(ctx->cso, &ctx->blend);
    cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil);
    cso_set_rasterizer(ctx->cso, &ctx->rasterizer);
    cso_set_clip(ctx->cso, &ctx->clip);
+   cso_set_vertex_elements(ctx->cso, 2, ctx->velem);
 
    /* sampler */
    ctx->sampler.min_img_filter = filter;
@@ -589,7 +621,7 @@ util_blit_pixels_tex(struct blit_state *ctx,
    cso_set_viewport(ctx->cso, &ctx->viewport);
 
    /* texture */
-   cso_set_sampler_textures(ctx->cso, 1, &tex);
+   cso_set_fragment_sampler_views(ctx->cso, 1, &src_sampler_view);
 
    /* shaders */
    cso_set_fragment_shader_handle(ctx->cso, ctx->fs[TGSI_WRITEMASK_XYZW]);
@@ -623,9 +655,10 @@ util_blit_pixels_tex(struct blit_state *ctx,
    cso_restore_depth_stencil_alpha(ctx->cso);
    cso_restore_rasterizer(ctx->cso);
    cso_restore_samplers(ctx->cso);
-   cso_restore_sampler_textures(ctx->cso);
+   cso_restore_fragment_sampler_views(ctx->cso);
    cso_restore_framebuffer(ctx->cso);
    cso_restore_fragment_shader(ctx->cso);
    cso_restore_vertex_shader(ctx->cso);
    cso_restore_clip(ctx->cso);
+   cso_restore_vertex_elements(ctx->cso);
 }
diff --git a/src/gallium/auxiliary/util/u_blit.h b/src/gallium/auxiliary/util/u_blit.h
index a102021529..464ff9aace 100644
--- a/src/gallium/auxiliary/util/u_blit.h
+++ b/src/gallium/auxiliary/util/u_blit.h
@@ -37,7 +37,7 @@ extern "C" {
    
 struct pipe_context;
 struct pipe_surface;
-struct pipe_texture;
+struct pipe_resource;
 struct cso_context;
 
 
@@ -53,6 +53,7 @@ util_destroy_blit(struct blit_state *ctx);
 extern void
 util_blit_pixels(struct blit_state *ctx,
                  struct pipe_surface *src,
+                 struct pipe_sampler_view *src_sampler_view,
                  int srcX0, int srcY0,
                  int srcX1, int srcY1,
                  struct pipe_surface *dst,
@@ -63,6 +64,7 @@ util_blit_pixels(struct blit_state *ctx,
 void
 util_blit_pixels_writemask(struct blit_state *ctx,
                            struct pipe_surface *src,
+                           struct pipe_sampler_view *src_sampler_view,
                            int srcX0, int srcY0,
                            int srcX1, int srcY1,
                            struct pipe_surface *dst,
@@ -73,7 +75,7 @@ util_blit_pixels_writemask(struct blit_state *ctx,
 
 extern void
 util_blit_pixels_tex(struct blit_state *ctx,
-                     struct pipe_texture *tex,
+                     struct pipe_sampler_view *src_sampler_view,
                      int srcX0, int srcY0,
                      int srcX1, int srcY1,
                      struct pipe_surface *dst,
diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c
index 0ba09d33bf..956aedc8a1 100644
--- a/src/gallium/auxiliary/util/u_blitter.c
+++ b/src/gallium/auxiliary/util/u_blitter.c
@@ -45,6 +45,7 @@
 #include "util/u_draw_quad.h"
 #include "util/u_pack_color.h"
 #include "util/u_rect.h"
+#include "util/u_sampler.h"
 #include "util/u_simple_shaders.h"
 #include "util/u_texture.h"
 
@@ -55,7 +56,7 @@ struct blitter_context_priv
    struct blitter_context blitter;
 
    struct pipe_context *pipe; /**< pipe context */
-   struct pipe_buffer *vbuf;  /**< quad */
+   struct pipe_resource *vbuf;  /**< quad */
 
    float vertices[4][2][4];   /**< {pos, color} or {pos, texcoord} */
 
@@ -88,12 +89,16 @@ struct blitter_context_priv
    void *dsa_write_depth_keep_stencil;
    void *dsa_keep_depth_stencil;
 
+   void *velem_state;
+
    /* Sampler state for clamping to a miplevel. */
    void *sampler_state[PIPE_MAX_TEXTURE_LEVELS];
 
    /* Rasterizer state. */
    void *rs_state;
 
+   struct pipe_sampler_view *sampler_view;
+
    /* Viewport state. */
    struct pipe_viewport_state viewport;
 
@@ -104,10 +109,11 @@ struct blitter_context_priv
 struct blitter_context *util_blitter_create(struct pipe_context *pipe)
 {
    struct blitter_context_priv *ctx;
-   struct pipe_blend_state blend = { 0 };
-   struct pipe_depth_stencil_alpha_state dsa = { { 0 } };
-   struct pipe_rasterizer_state rs_state = { 0 };
+   struct pipe_blend_state blend;
+   struct pipe_depth_stencil_alpha_state dsa;
+   struct pipe_rasterizer_state rs_state;
    struct pipe_sampler_state *sampler_state;
+   struct pipe_vertex_element velem[2];
    unsigned i;
 
    ctx = CALLOC_STRUCT(blitter_context_priv);
@@ -122,17 +128,21 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe)
    ctx->blitter.saved_rs_state = INVALID_PTR;
    ctx->blitter.saved_fs = INVALID_PTR;
    ctx->blitter.saved_vs = INVALID_PTR;
+   ctx->blitter.saved_velem_state = INVALID_PTR;
    ctx->blitter.saved_fb_state.nr_cbufs = ~0;
-   ctx->blitter.saved_num_textures = ~0;
+   ctx->blitter.saved_num_sampler_views = ~0;
    ctx->blitter.saved_num_sampler_states = ~0;
+   ctx->blitter.saved_num_vertex_buffers = ~0;
 
    /* blend state objects */
+   memset(&blend, 0, sizeof(blend));
    ctx->blend_keep_color = pipe->create_blend_state(pipe, &blend);
 
    blend.rt[0].colormask = PIPE_MASK_RGBA;
    ctx->blend_write_color = pipe->create_blend_state(pipe, &blend);
 
    /* depth stencil alpha state objects */
+   memset(&dsa, 0, sizeof(dsa));
    ctx->dsa_keep_depth_stencil =
       pipe->create_depth_stencil_alpha_state(pipe, &dsa);
 
@@ -159,6 +169,7 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe)
    sampler_state->wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
    sampler_state->wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
    sampler_state->wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   sampler_state->normalized_coords = TRUE;
    /* The sampler state objects which sample from a specified mipmap level
     * are created on-demand. */
 
@@ -170,6 +181,16 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe)
    rs_state.flatshade = 1;
    ctx->rs_state = pipe->create_rasterizer_state(pipe, &rs_state);
 
+   /* vertex elements state */
+   memset(&velem[0], 0, sizeof(velem[0]) * 2);
+   for (i = 0; i < 2; i++) {
+      velem[i].src_offset = i * 4 * sizeof(float);
+      velem[i].instance_divisor = 0;
+      velem[i].vertex_buffer_index = 0;
+      velem[i].src_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+   }
+   ctx->velem_state = pipe->create_vertex_elements_state(pipe, 2, &velem[0]);
+
    /* fragment shaders are created on-demand */
 
    /* vertex shaders */
@@ -196,8 +217,7 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe)
 
    /* create the vertex buffer */
    ctx->vbuf = pipe_buffer_create(ctx->pipe->screen,
-                                  32,
-                                  PIPE_BUFFER_USAGE_VERTEX,
+                                  PIPE_BIND_VERTEX_BUFFER,
                                   sizeof(ctx->vertices));
 
    return &ctx->blitter;
@@ -219,6 +239,7 @@ void util_blitter_destroy(struct blitter_context *blitter)
    pipe->delete_rasterizer_state(pipe, ctx->rs_state);
    pipe->delete_vs_state(pipe, ctx->vs_col);
    pipe->delete_vs_state(pipe, ctx->vs_tex);
+   pipe->delete_vertex_elements_state(pipe, ctx->velem_state);
 
    for (i = 0; i < PIPE_MAX_TEXTURE_TYPES; i++) {
       if (ctx->fs_texfetch_col[i])
@@ -235,7 +256,11 @@ void util_blitter_destroy(struct blitter_context *blitter)
       if (ctx->sampler_state[i])
          pipe->delete_sampler_state(pipe, ctx->sampler_state[i]);
 
-   pipe_buffer_reference(&ctx->vbuf, NULL);
+   if (ctx->sampler_view) {
+      pipe_sampler_view_reference(&ctx->sampler_view, NULL);
+   }
+
+   pipe_resource_reference(&ctx->vbuf, NULL);
    FREE(ctx);
 }
 
@@ -246,7 +271,8 @@ static void blitter_check_saved_CSOs(struct blitter_context_priv *ctx)
           ctx->blitter.saved_dsa_state != INVALID_PTR &&
           ctx->blitter.saved_rs_state != INVALID_PTR &&
           ctx->blitter.saved_fs != INVALID_PTR &&
-          ctx->blitter.saved_vs != INVALID_PTR);
+          ctx->blitter.saved_vs != INVALID_PTR &&
+          ctx->blitter.saved_velem_state != INVALID_PTR);
 }
 
 static void blitter_restore_CSOs(struct blitter_context_priv *ctx)
@@ -259,12 +285,14 @@ static void blitter_restore_CSOs(struct blitter_context_priv *ctx)
    pipe->bind_rasterizer_state(pipe, ctx->blitter.saved_rs_state);
    pipe->bind_fs_state(pipe, ctx->blitter.saved_fs);
    pipe->bind_vs_state(pipe, ctx->blitter.saved_vs);
+   pipe->bind_vertex_elements_state(pipe, ctx->blitter.saved_velem_state);
 
    ctx->blitter.saved_blend_state = INVALID_PTR;
    ctx->blitter.saved_dsa_state = INVALID_PTR;
    ctx->blitter.saved_rs_state = INVALID_PTR;
    ctx->blitter.saved_fs = INVALID_PTR;
    ctx->blitter.saved_vs = INVALID_PTR;
+   ctx->blitter.saved_velem_state = INVALID_PTR;
 
    pipe->set_stencil_ref(pipe, &ctx->blitter.saved_stencil_ref);
 
@@ -285,11 +313,18 @@ static void blitter_restore_CSOs(struct blitter_context_priv *ctx)
       ctx->blitter.saved_num_sampler_states = ~0;
    }
 
-   if (ctx->blitter.saved_num_textures != ~0) {
-      pipe->set_fragment_sampler_textures(pipe,
-                                          ctx->blitter.saved_num_textures,
-                                          ctx->blitter.saved_textures);
-      ctx->blitter.saved_num_textures = ~0;
+   if (ctx->blitter.saved_num_sampler_views != ~0) {
+      pipe->set_fragment_sampler_views(pipe,
+                                       ctx->blitter.saved_num_sampler_views,
+                                       ctx->blitter.saved_sampler_views);
+      ctx->blitter.saved_num_sampler_views = ~0;
+   }
+
+   if (ctx->blitter.saved_num_vertex_buffers != ~0) {
+      pipe->set_vertex_buffers(pipe,
+                                       ctx->blitter.saved_num_vertex_buffers,
+                                       ctx->blitter.saved_vertex_buffers);
+      ctx->blitter.saved_num_vertex_buffers = ~0;
    }
 }
 
@@ -424,7 +459,7 @@ static void blitter_draw_quad(struct blitter_context_priv *ctx)
    struct pipe_context *pipe = ctx->pipe;
 
    /* write vertices and draw them */
-   pipe_buffer_write(pipe->screen, ctx->vbuf,
+   pipe_buffer_write(pipe, ctx->vbuf,
                      0, sizeof(ctx->vertices), ctx->vertices);
 
    util_draw_vertex_buffer(pipe, ctx->vbuf, 0, PIPE_PRIM_TRIANGLE_FAN,
@@ -569,6 +604,7 @@ void util_blitter_clear(struct blitter_context *blitter,
       pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_stencil);
 
    pipe->bind_rasterizer_state(pipe, ctx->rs_state);
+   pipe->bind_vertex_elements_state(pipe, ctx->velem_state);
    pipe->bind_fs_state(pipe, blitter_get_fs_col(ctx, num_cbufs));
    pipe->bind_vs_state(pipe, ctx->vs_col);
 
@@ -600,9 +636,10 @@ static void util_blitter_do_copy(struct blitter_context *blitter,
    struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
    struct pipe_context *pipe = ctx->pipe;
    struct pipe_framebuffer_state fb_state;
+   struct pipe_sampler_view viewTempl, *view;
 
    assert(blitter->saved_fb_state.nr_cbufs != ~0);
-   assert(blitter->saved_num_textures != ~0);
+   assert(blitter->saved_num_sampler_views != ~0);
    assert(blitter->saved_num_sampler_states != ~0);
    assert(src->texture->target < PIPE_MAX_TEXTURE_TYPES);
 
@@ -630,11 +667,24 @@ static void util_blitter_do_copy(struct blitter_context *blitter,
       fb_state.zsbuf = 0;
    }
 
+   u_sampler_view_default_template(&viewTempl,
+                                   src->texture,
+                                   src->texture->format);
+   view = pipe->create_sampler_view(pipe,
+                                    src->texture,
+                                    &viewTempl);
+
+   if (ctx->sampler_view) {
+      pipe_sampler_view_reference(&ctx->sampler_view, NULL);
+   }
+   ctx->sampler_view = view;
+
    pipe->bind_rasterizer_state(pipe, ctx->rs_state);
    pipe->bind_vs_state(pipe, ctx->vs_tex);
    pipe->bind_fragment_sampler_states(pipe, 1,
       blitter_get_sampler_state(ctx, src->level));
-   pipe->set_fragment_sampler_textures(pipe, 1, &src->texture);
+   pipe->bind_vertex_elements_state(pipe, ctx->velem_state);
+   pipe->set_fragment_sampler_views(pipe, 1, &view);
    pipe->set_framebuffer_state(pipe, &fb_state);
 
    /* set texture coordinates */
@@ -672,8 +722,8 @@ static void util_blitter_overlap_copy(struct blitter_context *blitter,
    struct pipe_context *pipe = ctx->pipe;
    struct pipe_screen *screen = pipe->screen;
 
-   struct pipe_texture texTemp;
-   struct pipe_texture *texture;
+   struct pipe_resource texTemp;
+   struct pipe_resource *texture;
    struct pipe_surface *tex_surf;
 
    /* check whether the states are properly saved */
@@ -687,13 +737,13 @@ static void util_blitter_overlap_copy(struct blitter_context *blitter,
    texTemp.height0 = height;
    texTemp.depth0 = 1;
 
-   texture = screen->texture_create(screen, &texTemp);
+   texture = screen->resource_create(screen, &texTemp);
    if (!texture)
       return;
 
    tex_surf = screen->get_tex_surface(screen, texture, 0, 0, 0,
-				      PIPE_BUFFER_USAGE_GPU_READ | 
-				      PIPE_BUFFER_USAGE_GPU_WRITE);
+				      PIPE_BIND_BLIT_SOURCE | 
+				      PIPE_BIND_BLIT_DESTINATION);
 
    /* blit from the src to the temp */
    util_blitter_do_copy(blitter, tex_surf, 0, 0,
@@ -705,7 +755,7 @@ static void util_blitter_overlap_copy(struct blitter_context *blitter,
 			width, height,
 			FALSE);
    pipe_surface_reference(&tex_surf, NULL);
-   pipe_texture_reference(&texture, NULL);
+   pipe_resource_reference(&texture, NULL);
    blitter_restore_CSOs(ctx);
 }
 
@@ -739,8 +789,8 @@ void util_blitter_copy(struct blitter_context *blitter,
 		   
    is_depth = util_format_get_component_bits(src->format, UTIL_FORMAT_COLORSPACE_ZS, 0) != 0;
    is_stencil = util_format_get_component_bits(src->format, UTIL_FORMAT_COLORSPACE_ZS, 1) != 0;
-   dst_tex_usage = is_depth || is_stencil ? PIPE_TEXTURE_USAGE_DEPTH_STENCIL :
-                                            PIPE_TEXTURE_USAGE_RENDER_TARGET;
+   dst_tex_usage = is_depth || is_stencil ? PIPE_BIND_DEPTH_STENCIL :
+                                            PIPE_BIND_RENDER_TARGET;
 
    /* check if we can sample from and render to the surfaces */
    /* (assuming copying a stencil buffer is not possible) */
@@ -748,7 +798,7 @@ void util_blitter_copy(struct blitter_context *blitter,
        !screen->is_format_supported(screen, dst->format, dst->texture->target,
                                     dst_tex_usage, 0) ||
        !screen->is_format_supported(screen, src->format, src->texture->target,
-                                    PIPE_TEXTURE_USAGE_SAMPLER, 0)) {
+                                    PIPE_BIND_SAMPLER_VIEW, 0)) {
       util_surface_copy(pipe, FALSE, dst, dstx, dsty, src, srcx, srcy,
                         width, height);
       return;
@@ -785,7 +835,7 @@ void util_blitter_fill(struct blitter_context *blitter,
    /* check if we can render to the surface */
    if (util_format_is_depth_or_stencil(dst->format) || /* unlikely, but you never know */
        !screen->is_format_supported(screen, dst->format, dst->texture->target,
-                                    PIPE_TEXTURE_USAGE_RENDER_TARGET, 0)) {
+                                    PIPE_BIND_RENDER_TARGET, 0)) {
       util_surface_fill(pipe, dst, dstx, dsty, width, height, value);
       return;
    }
@@ -807,6 +857,7 @@ void util_blitter_fill(struct blitter_context *blitter,
    pipe->bind_rasterizer_state(pipe, ctx->rs_state);
    pipe->bind_fs_state(pipe, blitter_get_fs_col(ctx, 1));
    pipe->bind_vs_state(pipe, ctx->vs_col);
+   pipe->bind_vertex_elements_state(pipe, ctx->velem_state);
 
    /* set a framebuffer state */
    fb_state.width = dst->width;
diff --git a/src/gallium/auxiliary/util/u_blitter.h b/src/gallium/auxiliary/util/u_blitter.h
index 92008fce99..f6e3ce4874 100644
--- a/src/gallium/auxiliary/util/u_blitter.h
+++ b/src/gallium/auxiliary/util/u_blitter.h
@@ -43,6 +43,7 @@ struct blitter_context
    /* Private members, really. */
    void *saved_blend_state;   /**< blend state */
    void *saved_dsa_state;     /**< depth stencil alpha state */
+   void *saved_velem_state;   /**< vertex elements state */
    void *saved_rs_state;      /**< rasterizer state */
    void *saved_fs, *saved_vs; /**< fragment shader, vertex shader */
 
@@ -52,10 +53,13 @@ struct blitter_context
    struct pipe_clip_state saved_clip;
 
    int saved_num_sampler_states;
-   void *saved_sampler_states[32];
+   void *saved_sampler_states[PIPE_MAX_SAMPLERS];
 
-   int saved_num_textures;
-   struct pipe_texture *saved_textures[32]; /* is 32 enough? */
+   int saved_num_sampler_views;
+   struct pipe_sampler_view *saved_sampler_views[PIPE_MAX_SAMPLERS];
+
+   int saved_num_vertex_buffers;
+   struct pipe_vertex_buffer saved_vertex_buffers[PIPE_MAX_ATTRIBS];
 };
 
 /**
@@ -173,6 +177,13 @@ void util_blitter_save_depth_stencil_alpha(struct blitter_context *blitter,
 }
 
 static INLINE
+void util_blitter_save_vertex_elements(struct blitter_context *blitter,
+                                       void *state)
+{
+   blitter->saved_velem_state = state;
+}
+
+static INLINE
 void util_blitter_save_stencil_ref(struct blitter_context *blitter,
                                    const struct pipe_stencil_ref *state)
 {
@@ -234,17 +245,30 @@ void util_blitter_save_fragment_sampler_states(
           num_sampler_states * sizeof(void *));
 }
 
-static INLINE
-void util_blitter_save_fragment_sampler_textures(
-                  struct blitter_context *blitter,
-                  int num_textures,
-                  struct pipe_texture **textures)
+static INLINE void
+util_blitter_save_fragment_sampler_views(struct blitter_context *blitter,
+                                         int num_views,
+                                         struct pipe_sampler_view **views)
+{
+   assert(num_views <= Elements(blitter->saved_sampler_views));
+
+   blitter->saved_num_sampler_views = num_views;
+   memcpy(blitter->saved_sampler_views,
+          views,
+          num_views * sizeof(struct pipe_sampler_view *));
+}
+
+static INLINE void
+util_blitter_save_vertex_buffers(struct blitter_context *blitter,
+                                         int num_vertex_buffers,
+                                         struct pipe_vertex_buffer *vertex_buffers)
 {
-   assert(num_textures <= Elements(blitter->saved_textures));
+   assert(num_vertex_buffers <= Elements(blitter->saved_vertex_buffers));
 
-   blitter->saved_num_textures = num_textures;
-   memcpy(blitter->saved_textures, textures,
-          num_textures * sizeof(struct pipe_texture *));
+   blitter->saved_num_vertex_buffers = num_vertex_buffers;
+   memcpy(blitter->saved_vertex_buffers,
+          vertex_buffers,
+          num_vertex_buffers * sizeof(struct pipe_vertex_buffer));
 }
 
 #ifdef __cplusplus
diff --git a/src/gallium/auxiliary/util/u_box.h b/src/gallium/auxiliary/util/u_box.h
new file mode 100644
index 0000000000..919967b55a
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_box.h
@@ -0,0 +1,73 @@
+#ifndef UTIL_BOX_INLINES_H
+#define UTIL_BOX_INLINES_H
+
+#include "pipe/p_state.h"
+
+static INLINE
+void u_box_1d( unsigned x,
+	       unsigned w,
+	       struct pipe_box *box )
+{
+   box->x = x;
+   box->y = 0;
+   box->z = 0;
+   box->width = w;
+   box->height = 1;
+   box->depth = 1;
+}
+
+static INLINE
+void u_box_2d( unsigned x,
+	       unsigned y,
+	       unsigned w,
+	       unsigned h,
+	       struct pipe_box *box )
+{
+   box->x = x;
+   box->y = y;
+   box->z = 0;
+   box->width = w;
+   box->height = h;
+   box->depth = 1;
+}
+
+static INLINE
+void u_box_origin_2d( unsigned w,
+		      unsigned h,
+		      struct pipe_box *box )
+{
+   box->x = 0;
+   box->y = 0;
+   box->z = 0;
+   box->width = w;
+   box->height = h;
+   box->depth = 1;
+}
+
+static INLINE
+void u_box_2d_zslice( unsigned x,
+		      unsigned y,
+		      unsigned z,
+		      unsigned w,
+		      unsigned h,
+		      struct pipe_box *box )
+{
+   box->x = x;
+   box->y = y;
+   box->z = z;
+   box->width = w;
+   box->height = h;
+   box->depth = 1;
+}
+
+static INLINE
+struct pipe_subresource u_subresource( unsigned face,
+				       unsigned level )
+{
+   struct pipe_subresource subresource;
+   subresource.face = face;
+   subresource.level = level;
+   return subresource;
+}
+
+#endif
diff --git a/src/gallium/auxiliary/util/u_debug.c b/src/gallium/auxiliary/util/u_debug.c
index 94be682c4b..dd044973f9 100644
--- a/src/gallium/auxiliary/util/u_debug.c
+++ b/src/gallium/auxiliary/util/u_debug.c
@@ -421,45 +421,51 @@ void debug_dump_image(const char *prefix,
 #endif
 }
 
-void debug_dump_surface(const char *prefix,
+void debug_dump_surface(struct pipe_context *pipe,
+			const char *prefix,
                         struct pipe_surface *surface)     
 {
-   struct pipe_texture *texture;
-   struct pipe_screen *screen;
+   struct pipe_resource *texture;
    struct pipe_transfer *transfer;
    void *data;
 
    if (!surface)
       return;
 
+   /* XXX: this doesn't necessarily work, as the driver may be using
+    * temporary storage for the surface which hasn't been propagated
+    * back into the texture.  Need to nail down the semantics of views
+    * and transfers a bit better before we can say if extra work needs
+    * to be done here:
+    */
    texture = surface->texture;
-   screen = texture->screen;
 
-   transfer = screen->get_tex_transfer(screen, texture, surface->face,
-                                       surface->level, surface->zslice,
-                                       PIPE_TRANSFER_READ, 0, 0, surface->width,
-                                       surface->height);
+   transfer = pipe_get_transfer(pipe, texture, surface->face,
+				     surface->level, surface->zslice,
+				     PIPE_TRANSFER_READ, 0, 0, surface->width,
+				     surface->height);
    
-   data = screen->transfer_map(screen, transfer);
+   data = pipe->transfer_map(pipe, transfer);
    if(!data)
       goto error;
    
    debug_dump_image(prefix, 
                     texture->format,
                     util_format_get_blocksize(texture->format), 
-                    util_format_get_nblocksx(texture->format, transfer->width),
-                    util_format_get_nblocksy(texture->format, transfer->height),
+                    util_format_get_nblocksx(texture->format, surface->width),
+                    util_format_get_nblocksy(texture->format, surface->height),
                     transfer->stride,
                     data);
    
-   screen->transfer_unmap(screen, transfer);
+   pipe->transfer_unmap(pipe, transfer);
 error:
-   screen->tex_transfer_destroy(transfer);
+   pipe->transfer_destroy(pipe, transfer);
 }
 
 
-void debug_dump_texture(const char *prefix,
-                        struct pipe_texture *texture)
+void debug_dump_texture(struct pipe_context *pipe,
+                        const char *prefix,
+                        struct pipe_resource *texture)
 {
    struct pipe_surface *surface;
    struct pipe_screen *screen;
@@ -471,9 +477,9 @@ void debug_dump_texture(const char *prefix,
 
    /* XXX for now, just dump image for face=0, level=0 */
    surface = screen->get_tex_surface(screen, texture, 0, 0, 0,
-                                     PIPE_TEXTURE_USAGE_SAMPLER);
+                                     PIPE_BIND_SAMPLER_VIEW);
    if (surface) {
-      debug_dump_surface(prefix, surface);
+      debug_dump_surface(pipe, prefix, surface);
       screen->tex_surface_destroy(surface);
    }
 }
@@ -511,27 +517,28 @@ struct bmp_rgb_quad {
 };
 
 void
-debug_dump_surface_bmp(const char *filename,
+debug_dump_surface_bmp(struct pipe_context *pipe,
+		       const char *filename,
                        struct pipe_surface *surface)
 {
 #ifndef PIPE_SUBSYSTEM_WINDOWS_MINIPORT
    struct pipe_transfer *transfer;
-   struct pipe_texture *texture = surface->texture;
-   struct pipe_screen *screen = texture->screen;
+   struct pipe_resource *texture = surface->texture;
 
-   transfer = screen->get_tex_transfer(screen, texture, surface->face,
-                                       surface->level, surface->zslice,
-                                       PIPE_TRANSFER_READ, 0, 0, surface->width,
-                                       surface->height);
+   transfer = pipe_get_transfer(pipe, texture, surface->face,
+				surface->level, surface->zslice,
+				PIPE_TRANSFER_READ, 0, 0, surface->width,
+				surface->height);
 
-   debug_dump_transfer_bmp(filename, transfer);
+   debug_dump_transfer_bmp(pipe, filename, transfer);
 
-   screen->tex_transfer_destroy(transfer);
+   pipe->transfer_destroy(pipe, transfer);
 #endif
 }
 
 void
-debug_dump_transfer_bmp(const char *filename,
+debug_dump_transfer_bmp(struct pipe_context *pipe,
+                        const char *filename,
                         struct pipe_transfer *transfer)
 {
 #ifndef PIPE_SUBSYSTEM_WINDOWS_MINIPORT
@@ -540,17 +547,20 @@ debug_dump_transfer_bmp(const char *filename,
    if (!transfer)
       goto error1;
 
-   rgba = MALLOC(transfer->width*transfer->height*4*sizeof(float));
+   rgba = MALLOC(transfer->box.width *
+		 transfer->box.height *
+		 transfer->box.depth *
+		 4*sizeof(float));
    if(!rgba)
       goto error1;
 
-   pipe_get_tile_rgba(transfer, 0, 0,
-                      transfer->width, transfer->height,
+   pipe_get_tile_rgba(pipe, transfer, 0, 0,
+                      transfer->box.width, transfer->box.height,
                       rgba);
 
    debug_dump_float_rgba_bmp(filename,
-                             transfer->width, transfer->height,
-                             rgba, transfer->width);
+                             transfer->box.width, transfer->box.height,
+                             rgba, transfer->box.width);
 
    FREE(rgba);
 error1:
diff --git a/src/gallium/auxiliary/util/u_debug.h b/src/gallium/auxiliary/util/u_debug.h
index 0f4768f344..b6d0b508e3 100644
--- a/src/gallium/auxiliary/util/u_debug.h
+++ b/src/gallium/auxiliary/util/u_debug.h
@@ -312,30 +312,35 @@ debug_memory_end(unsigned long beginning);
 
 
 #ifdef DEBUG
+struct pipe_context;
 struct pipe_surface;
 struct pipe_transfer;
-struct pipe_texture;
+struct pipe_resource;
 
 void debug_dump_image(const char *prefix,
                       unsigned format, unsigned cpp,
                       unsigned width, unsigned height,
                       unsigned stride,
                       const void *data);
-void debug_dump_surface(const char *prefix,
+void debug_dump_surface(struct pipe_context *pipe,
+			const char *prefix,
                         struct pipe_surface *surface);   
-void debug_dump_texture(const char *prefix,
-                        struct pipe_texture *texture);
-void debug_dump_surface_bmp(const char *filename,
+void debug_dump_texture(struct pipe_context *pipe,
+			const char *prefix,
+                        struct pipe_resource *texture);
+void debug_dump_surface_bmp(struct pipe_context *pipe,
+                            const char *filename,
                             struct pipe_surface *surface);
-void debug_dump_transfer_bmp(const char *filename,
+void debug_dump_transfer_bmp(struct pipe_context *pipe,
+                             const char *filename,
                              struct pipe_transfer *transfer);
 void debug_dump_float_rgba_bmp(const char *filename,
                                unsigned width, unsigned height,
                                float *rgba, unsigned stride);
 #else
 #define debug_dump_image(prefix, format, cpp, width, height, stride, data) ((void)0)
-#define debug_dump_surface(prefix, surface) ((void)0)
-#define debug_dump_surface_bmp(filename, surface) ((void)0)
+#define debug_dump_surface(pipe, prefix, surface) ((void)0)
+#define debug_dump_surface_bmp(pipe, filename, surface) ((void)0)
 #define debug_dump_transfer_bmp(filename, transfer) ((void)0)
 #define debug_dump_float_rgba_bmp(filename, width, height, rgba, stride) ((void)0)
 #endif
diff --git a/src/gallium/auxiliary/util/u_dirty_surfaces.h b/src/gallium/auxiliary/util/u_dirty_surfaces.h
new file mode 100644
index 0000000000..99f260bf96
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_dirty_surfaces.h
@@ -0,0 +1,88 @@
+#ifndef U_DIRTY_SURFACES_H_
+#define U_DIRTY_SURFACES_H_
+
+#include "util/u_double_list.h"
+#include "util/u_math.h"
+
+typedef void (*util_dirty_surface_flush_t) (struct pipe_context *, struct pipe_surface *);
+
+struct util_dirty_surfaces
+{
+   struct list_head dirty_list;
+};
+
+struct util_dirty_surface
+{
+   struct pipe_surface base;
+   struct list_head dirty_list;
+};
+
+static INLINE void
+util_dirty_surfaces_init(struct util_dirty_surfaces *ds)
+{
+   LIST_INITHEAD(&ds->dirty_list);
+}
+
+static INLINE void
+util_dirty_surfaces_use_for_sampling(struct pipe_context *pipe, struct util_dirty_surfaces *dss, util_dirty_surface_flush_t flush)
+{
+   struct list_head *p, *next;
+   for(p = dss->dirty_list.next; p != &dss->dirty_list; p = next)
+   {
+      struct util_dirty_surface *ds = LIST_ENTRY(struct util_dirty_surface, p, dirty_list);
+      next = p->next;
+
+      flush(pipe, &ds->base);
+   }
+}
+
+static INLINE void
+util_dirty_surfaces_use_levels_for_sampling(struct pipe_context *pipe, struct util_dirty_surfaces *dss, unsigned first, unsigned last, util_dirty_surface_flush_t flush)
+{
+   struct list_head *p, *next;
+   if(first > last)
+      return;
+   for(p = dss->dirty_list.next; p != &dss->dirty_list; p = next)
+   {
+      struct util_dirty_surface *ds = LIST_ENTRY(struct util_dirty_surface, p, dirty_list);
+      next = p->next;
+
+      if(ds->base.level >= first && ds->base.level <= last)
+	 flush(pipe, &ds->base);
+   }
+}
+
+static INLINE void
+util_dirty_surfaces_use_for_sampling_with(struct pipe_context *pipe, struct util_dirty_surfaces *dss, struct pipe_sampler_view *psv, struct pipe_sampler_state *pss, util_dirty_surface_flush_t flush)
+{
+   if(!LIST_IS_EMPTY(&dss->dirty_list))
+      util_dirty_surfaces_use_levels_for_sampling(pipe, dss, (unsigned)pss->min_lod + psv->first_level, MIN2((unsigned)ceilf(pss->max_lod) + psv->first_level, psv->last_level), flush);
+}
+
+static INLINE void
+util_dirty_surface_init(struct util_dirty_surface *ds)
+{
+   LIST_INITHEAD(&ds->dirty_list);
+}
+
+static INLINE boolean
+util_dirty_surface_is_dirty(struct util_dirty_surface *ds)
+{
+   return !LIST_IS_EMPTY(&ds->dirty_list);
+}
+
+static INLINE void
+util_dirty_surface_set_dirty(struct util_dirty_surfaces *dss, struct util_dirty_surface *ds)
+{
+   if(LIST_IS_EMPTY(&ds->dirty_list))
+      LIST_ADDTAIL(&ds->dirty_list, &dss->dirty_list);
+}
+
+static INLINE void
+util_dirty_surface_set_clean(struct util_dirty_surfaces *dss, struct util_dirty_surface *ds)
+{
+   if(!LIST_IS_EMPTY(&ds->dirty_list))
+      LIST_DELINIT(&ds->dirty_list);
+}
+
+#endif
diff --git a/src/gallium/auxiliary/util/u_dl.c b/src/gallium/auxiliary/util/u_dl.c
index 37ed789f95..220860ebf4 100644
--- a/src/gallium/auxiliary/util/u_dl.c
+++ b/src/gallium/auxiliary/util/u_dl.c
@@ -78,3 +78,16 @@ util_dl_close(struct util_dl_library *library)
    (void)library;
 #endif
 }
+
+
+const char *
+util_dl_error(void)
+{
+#if defined(PIPE_OS_UNIX)
+   return dlerror();
+#elif defined(PIPE_OS_WINDOWS)
+   return "unknown error";
+#else
+   return "unknown error";
+#endif
+}
diff --git a/src/gallium/auxiliary/util/u_dl.h b/src/gallium/auxiliary/util/u_dl.h
index 85296c58af..2853b447c6 100644
--- a/src/gallium/auxiliary/util/u_dl.h
+++ b/src/gallium/auxiliary/util/u_dl.h
@@ -70,4 +70,11 @@ void
 util_dl_close(struct util_dl_library *library);
 
 
+/**
+ * Return most recent error message.
+ */
+const char *
+util_dl_error(void);
+
+
 #endif /* U_DL_H_ */
diff --git a/src/gallium/auxiliary/util/u_draw_quad.c b/src/gallium/auxiliary/util/u_draw_quad.c
index 14506e8451..b37b48b5ae 100644
--- a/src/gallium/auxiliary/util/u_draw_quad.c
+++ b/src/gallium/auxiliary/util/u_draw_quad.c
@@ -30,6 +30,7 @@
 #include "pipe/p_defines.h"
 #include "util/u_inlines.h"
 #include "util/u_draw_quad.h"
+#include "util/u_memory.h"
 
 
 /**
@@ -38,15 +39,13 @@
  */
 void 
 util_draw_vertex_buffer(struct pipe_context *pipe,
-                        struct pipe_buffer *vbuf,
+                        struct pipe_resource *vbuf,
                         uint offset,
                         uint prim_type,
                         uint num_verts,
                         uint num_attribs)
 {
    struct pipe_vertex_buffer vbuffer;
-   struct pipe_vertex_element velements[PIPE_MAX_ATTRIBS];
-   uint i;
 
    assert(num_attribs <= PIPE_MAX_ATTRIBS);
 
@@ -58,15 +57,7 @@ util_draw_vertex_buffer(struct pipe_context *pipe,
    vbuffer.max_index = num_verts - 1;
    pipe->set_vertex_buffers(pipe, 1, &vbuffer);
 
-   /* tell pipe about the vertex attributes */
-   for (i = 0; i < num_attribs; i++) {
-      velements[i].src_offset = i * 4 * sizeof(float);
-      velements[i].instance_divisor = 0;
-      velements[i].vertex_buffer_index = 0;
-      velements[i].src_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
-      velements[i].nr_components = 4;
-   }
-   pipe->set_vertex_elements(pipe, num_attribs, velements);
+   /* note: vertex elements already set by caller */
 
    /* draw */
    pipe->draw_arrays(pipe, prim_type, 0, num_verts);
@@ -76,60 +67,63 @@ util_draw_vertex_buffer(struct pipe_context *pipe,
 
 /**
  * Draw screen-aligned textured quad.
- * Note: this function allocs/destroys a vertex buffer and isn't especially
- * efficient.
+ * Note: this isn't especially efficient.
  */
 void 
 util_draw_texquad(struct pipe_context *pipe,
                   float x0, float y0, float x1, float y1, float z)
 {
-   struct pipe_buffer *vbuf;
-   uint numAttribs = 2, vertexBytes, i, j;
-
-   vertexBytes = 4 * (4 * numAttribs * sizeof(float));
-
-   /* XXX create one-time */
-   vbuf = pipe_buffer_create(pipe->screen, 32,
-                             PIPE_BUFFER_USAGE_VERTEX, vertexBytes);
-   if (vbuf) {
-      float *v = (float *) pipe_buffer_map(pipe->screen, vbuf,
-                                           PIPE_BUFFER_USAGE_CPU_WRITE);
-      if (v) {
-         /*
-          * Load vertex buffer
-          */
-         for (i = j = 0; i < 4; i++) {
-            v[j + 2] = z;   /* z */
-            v[j + 3] = 1.0; /* w */
-            v[j + 6] = 0.0; /* r */
-            v[j + 7] = 1.0; /* q */
-            j += 8;
-         }
-
-         v[0] = x0;
-         v[1] = y0;
-         v[4] = 0.0; /*s*/
-         v[5] = 0.0; /*t*/
-
-         v[8] = x1;
-         v[9] = y0;
-         v[12] = 1.0;
-         v[13] = 0.0;
-
-         v[16] = x1;
-         v[17] = y1;
-         v[20] = 1.0;
-         v[21] = 1.0;
-
-         v[24] = x0;
-         v[25] = y1;
-         v[28] = 0.0;
-         v[29] = 1.0;
-
-         pipe_buffer_unmap(pipe->screen, vbuf);
-         util_draw_vertex_buffer(pipe, vbuf, 0, PIPE_PRIM_TRIANGLE_FAN, 4, 2);
-      }
-
-      pipe_buffer_reference(&vbuf, NULL);
+   uint numAttribs = 2, i, j;
+   uint vertexBytes = 4 * (4 * numAttribs * sizeof(float));
+   struct pipe_resource *vbuf = NULL;  
+   uint *v = NULL;
+
+   v = MALLOC(vertexBytes);
+   if (v == NULL)
+      goto out;
+
+   /*
+    * Load vertex buffer
+    */
+   for (i = j = 0; i < 4; i++) {
+      v[j + 2] = z;   /* z */
+      v[j + 3] = 1.0; /* w */
+      v[j + 6] = 0.0; /* r */
+      v[j + 7] = 1.0; /* q */
+      j += 8;
    }
+
+   v[0] = x0;
+   v[1] = y0;
+   v[4] = 0.0; /*s*/
+   v[5] = 0.0; /*t*/
+
+   v[8] = x1;
+   v[9] = y0;
+   v[12] = 1.0;
+   v[13] = 0.0;
+
+   v[16] = x1;
+   v[17] = y1;
+   v[20] = 1.0;
+   v[21] = 1.0;
+
+   v[24] = x0;
+   v[25] = y1;
+   v[28] = 0.0;
+   v[29] = 1.0;
+	 
+   vbuf = pipe_user_buffer_create(pipe->screen, v, vertexBytes,
+				  PIPE_BIND_VERTEX_BUFFER);
+   if (!vbuf) 
+      goto out;
+
+   util_draw_vertex_buffer(pipe, vbuf, 0, PIPE_PRIM_TRIANGLE_FAN, 4, 2);
+
+out:
+   if (vbuf)
+      pipe_resource_reference(&vbuf, NULL);
+   
+   if (v)
+      FREE(v);
 }
diff --git a/src/gallium/auxiliary/util/u_draw_quad.h b/src/gallium/auxiliary/util/u_draw_quad.h
index 00d3f5b715..42eb184428 100644
--- a/src/gallium/auxiliary/util/u_draw_quad.h
+++ b/src/gallium/auxiliary/util/u_draw_quad.h
@@ -33,11 +33,11 @@
 extern "C" {
 #endif
 
-struct pipe_buffer;
+struct pipe_resource;
 
 extern void 
 util_draw_vertex_buffer(struct pipe_context *pipe,
-                        struct pipe_buffer *vbuf, uint offset,
+                        struct pipe_resource *vbuf, uint offset,
                         uint num_attribs, uint num_verts, uint prim_type);
 
 
diff --git a/src/gallium/auxiliary/util/u_dump.h b/src/gallium/auxiliary/util/u_dump.h
index 379f18ef38..bdc73ac47d 100644
--- a/src/gallium/auxiliary/util/u_dump.h
+++ b/src/gallium/auxiliary/util/u_dump.h
@@ -92,7 +92,7 @@ util_dump_tex_filter(unsigned value, boolean shortened);
 
 void
 util_dump_template(struct os_stream *stream,
-                   const struct pipe_texture *templat);
+                   const struct pipe_resource *templat);
 
 void
 util_dump_rasterizer_state(struct os_stream *stream,
diff --git a/src/gallium/auxiliary/util/u_dump_state.c b/src/gallium/auxiliary/util/u_dump_state.c
index ae7afd7311..79fd38ef5c 100644
--- a/src/gallium/auxiliary/util/u_dump_state.c
+++ b/src/gallium/auxiliary/util/u_dump_state.c
@@ -255,14 +255,14 @@ util_dump_enum_func(struct os_stream *stream, unsigned value)
 
 
 void
-util_dump_template(struct os_stream *stream, const struct pipe_texture *templat)
+util_dump_template(struct os_stream *stream, const struct pipe_resource *templat)
 {
    if(!templat) {
       util_dump_null(stream);
       return;
    }
 
-   util_dump_struct_begin(stream, "pipe_texture");
+   util_dump_struct_begin(stream, "pipe_resource");
 
    util_dump_member(stream, int, templat, target);
    util_dump_member(stream, format, templat, format);
@@ -280,7 +280,9 @@ util_dump_template(struct os_stream *stream, const struct pipe_texture *templat)
    util_dump_member_end(stream);
 
    util_dump_member(stream, uint, templat, last_level);
-   util_dump_member(stream, uint, templat, tex_usage);
+   util_dump_member(stream, uint, templat, _usage);
+   util_dump_member(stream, uint, templat, bind);
+   util_dump_member(stream, uint, templat, flags);
 
    util_dump_struct_end(stream);
 }
@@ -653,16 +655,13 @@ util_dump_transfer(struct os_stream *stream, const struct pipe_transfer *state)
 
    util_dump_struct_begin(stream, "pipe_transfer");
 
-   util_dump_member(stream, uint, state, width);
-   util_dump_member(stream, uint, state, height);
+   util_dump_member(stream, ptr, state, resource);
+//   util_dump_member(stream, uint, state, box);
 
    util_dump_member(stream, uint, state, stride);
-   util_dump_member(stream, uint, state, usage);
+   util_dump_member(stream, uint, state, slice_stride);
 
-   util_dump_member(stream, ptr, state, texture);
-   util_dump_member(stream, uint, state, face);
-   util_dump_member(stream, uint, state, level);
-   util_dump_member(stream, uint, state, zslice);
+//   util_dump_member(stream, ptr, state, data);
 
    util_dump_struct_end(stream);
 }
@@ -700,7 +699,6 @@ util_dump_vertex_element(struct os_stream *stream, const struct pipe_vertex_elem
    util_dump_member(stream, uint, state, src_offset);
 
    util_dump_member(stream, uint, state, vertex_buffer_index);
-   util_dump_member(stream, uint, state, nr_components);
 
    util_dump_member(stream, format, state, src_format);
 
diff --git a/src/gallium/auxiliary/util/u_dynarray.h b/src/gallium/auxiliary/util/u_dynarray.h
new file mode 100644
index 0000000000..9d1c1713a7
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_dynarray.h
@@ -0,0 +1,111 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef U_DYNARRAY_H
+#define U_DYNARRAY_H
+
+#include "pipe/p_compiler.h"
+#include "util/u_memory.h"
+
+/* A zero-initialized version of this is guaranteed to represent an
+ * empty array.
+ *
+ * Also, size <= capacity and data != 0 if and only if capacity != 0
+ * capacity will always be the allocation size of data
+ */
+struct util_dynarray
+{
+   void *data;
+   unsigned size;
+   unsigned capacity;
+};
+
+static INLINE void
+util_dynarray_init(struct util_dynarray *buf)
+{
+   memset(buf, 0, sizeof(*buf));
+}
+
+static INLINE void
+util_dynarray_fini(struct util_dynarray *buf)
+{
+   if(buf->data)
+   {
+      FREE(buf->data);
+      util_dynarray_init(buf);
+   }
+}
+
+/* use util_dynarray_trim to reduce the allocated storage */
+static INLINE void *
+util_dynarray_resize(struct util_dynarray *buf, unsigned newsize)
+{
+   char *p;
+   if(newsize > buf->capacity)
+   {
+      unsigned newcap = buf->capacity << 1;
+      if(newsize > newcap)
+	      newcap = newsize;
+      buf->data = REALLOC(buf->data, buf->capacity, newcap);
+      buf->capacity = newcap;
+   }
+
+   p = (char *)buf->data + buf->size;
+   buf->size = newsize;
+   return p;
+}
+
+static INLINE void *
+util_dynarray_grow(struct util_dynarray *buf, int diff)
+{
+   return util_dynarray_resize(buf, buf->size + diff);
+}
+
+static INLINE void
+util_dynarray_trim(struct util_dynarray *buf)
+{
+   if (buf->size != buf->capacity) {
+      if (buf->size) {
+         buf->data = REALLOC(buf->data, buf->capacity, buf->size);
+         buf->capacity = buf->size;
+      }
+      else {
+         FREE(buf->data);
+         buf->data = 0;
+         buf->capacity = 0;
+      }
+   }
+}
+
+#define util_dynarray_append(buf, type, v) do {type __v = (v); memcpy(util_dynarray_grow((buf), sizeof(type)), &__v, sizeof(type));} while(0)
+#define util_dynarray_top_ptr(buf, type) (type*)((char*)(buf)->data + (buf)->size - sizeof(type))
+#define util_dynarray_top(buf, type) *util_dynarray_top_ptr(buf, type)
+#define util_dynarray_pop_ptr(buf, type) (type*)((char*)(buf)->data + ((buf)->size -= sizeof(type)))
+#define util_dynarray_pop(buf, type) *util_dynarray_pop_ptr(buf, type)
+#define util_dynarray_contains(buf, type) ((buf)->size >= sizeof(type))
+
+#endif /* U_DYNARRAY_H */
+
diff --git a/src/gallium/auxiliary/util/u_format.c b/src/gallium/auxiliary/util/u_format.c
new file mode 100644
index 0000000000..c50c807eb8
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_format.c
@@ -0,0 +1,286 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Pixel format accessor functions.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#include "u_math.h"
+#include "u_memory.h"
+#include "u_rect.h"
+#include "u_format.h"
+
+
+void
+util_format_read_4f(enum pipe_format format,
+                    float *dst, unsigned dst_stride,
+                    const void *src, unsigned src_stride,
+                    unsigned x, unsigned y, unsigned w, unsigned h)
+{
+   const struct util_format_description *format_desc;
+   const uint8_t *src_row;
+   float *dst_row;
+
+   format_desc = util_format_description(format);
+
+   assert(x % format_desc->block.width == 0);
+   assert(y % format_desc->block.height == 0);
+
+   src_row = (const uint8_t *)src + y*src_stride + x*(format_desc->block.bits/8);
+   dst_row = dst;
+
+   format_desc->unpack_rgba_float(dst_row, dst_stride, src_row, src_stride, w, h);
+}
+
+
+void
+util_format_write_4f(enum pipe_format format,
+                     const float *src, unsigned src_stride,
+                     void *dst, unsigned dst_stride,
+                     unsigned x, unsigned y, unsigned w, unsigned h)
+{
+   const struct util_format_description *format_desc;
+   uint8_t *dst_row;
+   const float *src_row;
+
+   format_desc = util_format_description(format);
+
+   assert(x % format_desc->block.width == 0);
+   assert(y % format_desc->block.height == 0);
+
+   dst_row = (uint8_t *)dst + y*dst_stride + x*(format_desc->block.bits/8);
+   src_row = src;
+
+   format_desc->pack_rgba_float(dst_row, dst_stride, src_row, src_stride, w, h);
+}
+
+
+void
+util_format_read_4ub(enum pipe_format format, uint8_t *dst, unsigned dst_stride, const void *src, unsigned src_stride, unsigned x, unsigned y, unsigned w, unsigned h)
+{
+   const struct util_format_description *format_desc;
+   const uint8_t *src_row;
+   uint8_t *dst_row;
+
+   format_desc = util_format_description(format);
+
+   assert(x % format_desc->block.width == 0);
+   assert(y % format_desc->block.height == 0);
+
+   src_row = (const uint8_t *)src + y*src_stride + x*(format_desc->block.bits/8);
+   dst_row = dst;
+
+   format_desc->unpack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, w, h);
+}
+
+
+void
+util_format_write_4ub(enum pipe_format format, const uint8_t *src, unsigned src_stride, void *dst, unsigned dst_stride, unsigned x, unsigned y, unsigned w, unsigned h)
+{
+   const struct util_format_description *format_desc;
+   uint8_t *dst_row;
+   const uint8_t *src_row;
+
+   format_desc = util_format_description(format);
+
+   assert(x % format_desc->block.width == 0);
+   assert(y % format_desc->block.height == 0);
+
+   dst_row = (uint8_t *)dst + y*dst_stride + x*(format_desc->block.bits/8);
+   src_row = src;
+
+   format_desc->pack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, w, h);
+}
+
+
+static INLINE boolean
+util_format_fits_8unorm(const struct util_format_description *format_desc)
+{
+   unsigned chan;
+
+   switch (format_desc->layout) {
+
+   case UTIL_FORMAT_LAYOUT_S3TC:
+   case UTIL_FORMAT_LAYOUT_RGTC:
+      /*
+       * These are straight forward.
+       */
+
+      return TRUE;
+
+   case UTIL_FORMAT_LAYOUT_PLAIN:
+      /*
+       * For these we can find a generic rule.
+       */
+
+      for (chan = 0; chan < format_desc->nr_channels; ++chan) {
+         switch (format_desc->channel[chan].type) {
+         case UTIL_FORMAT_TYPE_VOID:
+            break;
+         case UTIL_FORMAT_TYPE_UNSIGNED:
+            if (!format_desc->channel[chan].normalized ||
+                format_desc->channel[chan].size > 8) {
+               return FALSE;
+            }
+            break;
+         default:
+            return FALSE;
+         }
+      }
+      return TRUE;
+
+   default:
+      /*
+       * Handle all others on a case by case basis.
+       */
+
+      switch (format_desc->format) {
+      case PIPE_FORMAT_R1_UNORM:
+      case PIPE_FORMAT_UYVY:
+      case PIPE_FORMAT_YUYV:
+      case PIPE_FORMAT_R8G8_B8G8_UNORM:
+      case PIPE_FORMAT_G8R8_G8B8_UNORM:
+         return TRUE;
+
+      default:
+         return FALSE;
+      }
+   }
+}
+
+
+void
+util_format_translate(enum pipe_format dst_format,
+                      void *dst, unsigned dst_stride,
+                      unsigned dst_x, unsigned dst_y,
+                      enum pipe_format src_format,
+                      const void *src, unsigned src_stride,
+                      unsigned src_x, unsigned src_y,
+                      unsigned width, unsigned height)
+{
+   const struct util_format_description *dst_format_desc;
+   const struct util_format_description *src_format_desc;
+   uint8_t *dst_row;
+   const uint8_t *src_row;
+   unsigned y_step;
+   unsigned dst_step;
+   unsigned src_step;
+
+   if (dst_format == src_format) {
+      /*
+       * Trivial case.
+       */
+
+      util_copy_rect(dst, dst_format, dst_stride,  dst_x, dst_y,
+                     width, height, src, (int)src_stride,
+                     src_x, src_y);
+      return;
+   }
+
+   dst_format_desc = util_format_description(dst_format);
+   src_format_desc = util_format_description(src_format);
+
+   assert(dst_x % dst_format_desc->block.width == 0);
+   assert(dst_y % dst_format_desc->block.height == 0);
+   assert(src_x % src_format_desc->block.width == 0);
+   assert(src_y % src_format_desc->block.height == 0);
+
+   dst_row = (uint8_t *)dst + dst_y*dst_stride + dst_x*(dst_format_desc->block.bits/8);
+   src_row = (const uint8_t *)src + src_y*src_stride + src_x*(src_format_desc->block.bits/8);
+
+   /*
+    * This works because all pixel formats have pixel blocks with power of two
+    * sizes.
+    */
+
+   y_step = MAX2(dst_format_desc->block.height, src_format_desc->block.height);
+   assert(y_step % dst_format_desc->block.height == 0);
+   assert(y_step % src_format_desc->block.height == 0);
+
+   dst_step = y_step / dst_format_desc->block.height * dst_stride;
+   src_step = y_step / src_format_desc->block.height * src_stride;
+
+   /*
+    * TODO: double formats will loose precision
+    * TODO: Add a special case for formats that are mere swizzles of each other
+    */
+
+   if (util_format_fits_8unorm(src_format_desc) ||
+       util_format_fits_8unorm(dst_format_desc)) {
+      unsigned tmp_stride;
+      uint8_t *tmp_row;
+
+      tmp_stride = width * 4 * sizeof *tmp_row;
+      tmp_row = MALLOC(y_step * tmp_stride);
+      if (!tmp_row)
+         return;
+
+      while (height >= y_step) {
+         src_format_desc->unpack_rgba_8unorm(tmp_row, tmp_stride, src_row, src_stride, width, y_step);
+         dst_format_desc->pack_rgba_8unorm(dst_row, dst_stride, tmp_row, tmp_stride, width, y_step);
+
+         dst_row += dst_step;
+         src_row += src_step;
+         height -= y_step;
+      }
+
+      if (height) {
+         src_format_desc->unpack_rgba_8unorm(tmp_row, tmp_stride, src_row, src_stride, width, height);
+         dst_format_desc->pack_rgba_8unorm(dst_row, dst_stride, tmp_row, tmp_stride, width, height);
+      }
+
+      FREE(tmp_row);
+   }
+   else {
+      unsigned tmp_stride;
+      float *tmp_row;
+
+      tmp_stride = width * 4 * sizeof *tmp_row;
+      tmp_row = MALLOC(y_step * tmp_stride);
+      if (!tmp_row)
+         return;
+
+      while (height >= y_step) {
+         src_format_desc->unpack_rgba_float(tmp_row, tmp_stride, src_row, src_stride, width, y_step);
+         dst_format_desc->pack_rgba_float(dst_row, dst_stride, tmp_row, tmp_stride, width, y_step);
+
+         dst_row += dst_step;
+         src_row += src_step;
+         height -= y_step;
+      }
+
+      if (height) {
+         src_format_desc->unpack_rgba_float(tmp_row, tmp_stride, src_row, src_stride, width, height);
+         dst_format_desc->pack_rgba_float(dst_row, dst_stride, tmp_row, tmp_stride, width, height);
+      }
+
+      FREE(tmp_row);
+   }
+}
diff --git a/src/gallium/auxiliary/util/u_format.csv b/src/gallium/auxiliary/util/u_format.csv
index 96a0fa6550..016e73c4a1 100644
--- a/src/gallium/auxiliary/util/u_format.csv
+++ b/src/gallium/auxiliary/util/u_format.csv
@@ -47,31 +47,41 @@
 # - color space: rgb, yub, sz
 #
 # See also:
-# - http://msdn.microsoft.com/en-us/library/ee416489.aspx (D3D9)
-# - http://msdn.microsoft.com/en-us/library/ee415668.aspx (D3D9 -> D3D10)
-# - http://msdn.microsoft.com/en-us/library/ee418116.aspx (D3D10)
+# - http://msdn.microsoft.com/en-us/library/bb172558.aspx (D3D9)
+# - http://msdn.microsoft.com/en-us/library/bb205073.aspx#mapping_texture_formats (D3D9 -> D3D10)
+# - http://msdn.microsoft.com/en-us/library/bb173059.aspx (D3D10)
 #
 # Note that GL doesn't really specify the layout of internal formats. See
 # OpenGL 2.1 specification, Table 3.16, on the "Correspondence of sized
 # internal formats to base in- ternal formats, and desired component
 # resolutions for each sized internal format."
 
+# None
+# Described as regular uint_8 bytes, i.e. PIPE_FORMAT_R8_USCALED
+PIPE_FORMAT_NONE                  , plain, 1, 1, u8  ,     ,     ,     , x001, rgb
+
 # Typical rendertarget formats
 PIPE_FORMAT_B8G8R8A8_UNORM        , plain, 1, 1, un8 , un8 , un8 , un8 , zyxw, rgb
-PIPE_FORMAT_B8G8R8X8_UNORM        , plain, 1, 1, un8 , un8 , un8 , un8 , zyx1, rgb
+PIPE_FORMAT_B8G8R8X8_UNORM        , plain, 1, 1, un8 , un8 , un8 , x8  , zyx1, rgb
 PIPE_FORMAT_A8R8G8B8_UNORM        , plain, 1, 1, un8 , un8 , un8 , un8 , yzwx, rgb
-PIPE_FORMAT_X8R8G8B8_UNORM        , plain, 1, 1, un8 , un8 , un8 , un8 , yzw1, rgb
+PIPE_FORMAT_X8R8G8B8_UNORM        , plain, 1, 1, x8  , un8 , un8 , un8 , yzw1, rgb
 PIPE_FORMAT_A8B8G8R8_UNORM        , plain, 1, 1, un8 , un8 , un8 , un8 , wzyx, rgb
-PIPE_FORMAT_X8B8G8R8_UNORM        , plain, 1, 1, un8 , un8 , un8 , un8 , wzy1, rgb
+PIPE_FORMAT_X8B8G8R8_UNORM        , plain, 1, 1, x8  , un8 , un8 , un8 , wzy1, rgb
+# PIPE_FORMAT_R8G8B8A8_UNORM is below
+PIPE_FORMAT_R8G8B8X8_UNORM        , plain, 1, 1, un8 , un8 , un8 , x8  , xyz1, rgb
+PIPE_FORMAT_B5G5R5X1_UNORM        , plain, 1, 1, un5 , un5 , un5 , x1  , zyx1, rgb
 PIPE_FORMAT_B5G5R5A1_UNORM        , plain, 1, 1, un5 , un5 , un5 , un1 , zyxw, rgb
 PIPE_FORMAT_B4G4R4A4_UNORM        , plain, 1, 1, un4 , un4 , un4 , un4 , zyxw, rgb
+PIPE_FORMAT_B4G4R4X4_UNORM        , plain, 1, 1, un4 , un4 , un4 , x4  , zyx1, rgb
 PIPE_FORMAT_B5G6R5_UNORM          , plain, 1, 1, un5 , un6 , un5 ,     , zyx1, rgb
 PIPE_FORMAT_R10G10B10A2_UNORM     , plain, 1, 1, un10, un10, un10, un2 , xyzw, rgb
+PIPE_FORMAT_B10G10R10A2_UNORM     , plain, 1, 1, un10, un10, un10, un2 , zyxw, rgb
 
 # Luminance/Intensity/Alpha formats
 PIPE_FORMAT_L8_UNORM              , plain, 1, 1, un8 ,     ,     ,     , xxx1, rgb
 PIPE_FORMAT_A8_UNORM              , plain, 1, 1, un8 ,     ,     ,     , 000x, rgb
 PIPE_FORMAT_I8_UNORM              , plain, 1, 1, un8 ,     ,     ,     , xxxx, rgb
+PIPE_FORMAT_L4A4_UNORM            , plain, 1, 1, un4 , un4 ,     ,     , xxxy, rgb
 PIPE_FORMAT_L8A8_UNORM            , plain, 1, 1, un8 , un8 ,     ,     , xxxy, rgb
 PIPE_FORMAT_L16_UNORM             , plain, 1, 1, un16,     ,     ,     , xxx1, rgb
 
@@ -79,44 +89,65 @@ PIPE_FORMAT_L16_UNORM             , plain, 1, 1, un16,     ,     ,     , xxx1, r
 PIPE_FORMAT_L8_SRGB               , plain, 1, 1, un8 ,     ,     ,     , xxx1, srgb 
 PIPE_FORMAT_L8A8_SRGB             , plain, 1, 1, un8 , un8 ,     ,     , xxxy, srgb 
 PIPE_FORMAT_R8G8B8_SRGB           , plain, 1, 1, un8 , un8 , un8 ,     , xyz1, srgb 
+PIPE_FORMAT_R8G8B8A8_SRGB         , plain, 1, 1, un8 , un8 , un8 , un8 , xyzw, srgb 
 PIPE_FORMAT_A8B8G8R8_SRGB         , plain, 1, 1, un8 , un8 , un8 , un8 , wzyx, srgb
-PIPE_FORMAT_X8B8G8R8_SRGB         , plain, 1, 1, un8 , un8 , un8 , un8 , wzy1, srgb
+PIPE_FORMAT_X8B8G8R8_SRGB         , plain, 1, 1, x8  , un8 , un8 , un8 , wzy1, srgb
 PIPE_FORMAT_B8G8R8A8_SRGB         , plain, 1, 1, un8 , un8 , un8 , un8 , zyxw, srgb
-PIPE_FORMAT_B8G8R8X8_SRGB         , plain, 1, 1, un8 , un8 , un8 , un8 , zyx1, srgb
+PIPE_FORMAT_B8G8R8X8_SRGB         , plain, 1, 1, un8 , un8 , un8 , x8  , zyx1, srgb
 PIPE_FORMAT_A8R8G8B8_SRGB         , plain, 1, 1, un8 , un8 , un8 , un8 , yzwx, srgb
-PIPE_FORMAT_X8R8G8B8_SRGB         , plain, 1, 1, un8 , un8 , un8 , un8 , yzw1, srgb
+PIPE_FORMAT_X8R8G8B8_SRGB         , plain, 1, 1, x8  , un8 , un8 , un8 , yzw1, srgb
 
 # Mixed-sign formats (typically used for bump map textures)
 PIPE_FORMAT_R8SG8SB8UX8U_NORM     , plain, 1, 1, sn8 , sn8 , un8 , x8  , xyz1, rgb
+PIPE_FORMAT_R10SG10SB10SA2U_NORM  , plain, 1, 1, sn10, sn10, sn10, un2 , xyzw, rgb
 PIPE_FORMAT_R5SG5SB6U_NORM        , plain, 1, 1, sn5 , sn5 , un6 ,     , xyz1, rgb
 
 # Depth-stencil formats
-PIPE_FORMAT_S8_UNORM              , plain, 1, 1, un8 ,     ,     ,     , _x__, zs 
-PIPE_FORMAT_Z16_UNORM             , plain, 1, 1, un16,     ,     ,     , x___, zs 
-PIPE_FORMAT_Z32_UNORM             , plain, 1, 1, un32,     ,     ,     , x___, zs 
-PIPE_FORMAT_Z32_FLOAT             , plain, 1, 1, f32 ,     ,     ,     , x___, zs 
-PIPE_FORMAT_Z24S8_UNORM           , plain, 1, 1, un24, un8 ,     ,     , xy__, zs 
-PIPE_FORMAT_S8Z24_UNORM           , plain, 1, 1, un8 , un24,     ,     , yx__, zs 
-PIPE_FORMAT_Z24X8_UNORM           , plain, 1, 1, un24, un8 ,     ,     , x___, zs 
-PIPE_FORMAT_X8Z24_UNORM           , plain, 1, 1, un8 , un24,     ,     , y___, zs 
+PIPE_FORMAT_S8_USCALED              , plain, 1, 1, u8  ,     ,     ,     , _x__, zs
+PIPE_FORMAT_Z16_UNORM               , plain, 1, 1, un16,     ,     ,     , x___, zs
+PIPE_FORMAT_Z32_UNORM               , plain, 1, 1, un32,     ,     ,     , x___, zs
+PIPE_FORMAT_Z32_FLOAT               , plain, 1, 1, f32 ,     ,     ,     , x___, zs
+PIPE_FORMAT_Z24_UNORM_S8_USCALED    , plain, 1, 1, un24, u8  ,     ,     , xy__, zs
+PIPE_FORMAT_S8_USCALED_Z24_UNORM    , plain, 1, 1, u8 ,  un24,     ,     , yx__, zs
+PIPE_FORMAT_Z24X8_UNORM             , plain, 1, 1, un24, x8  ,     ,     , x___, zs
+PIPE_FORMAT_X8Z24_UNORM             , plain, 1, 1, x8  , un24,     ,     , y___, zs
+PIPE_FORMAT_Z32_FLOAT_S8X24_USCALED , plain, 1, 1, f32,  u8  , x24 ,     , xy__, zs
 
 # YUV formats
 # http://www.fourcc.org/yuv.php#UYVY
 PIPE_FORMAT_UYVY                 , subsampled, 2, 1, x32 ,     ,     ,     , xyz1, yuv
 # http://www.fourcc.org/yuv.php#YUYV (a.k.a http://www.fourcc.org/yuv.php#YUY2)
-# XXX: u_tile.c's ycbcr_get_tile_rgba actually interprets it as VYUY but the 
-# intent should be to match D3DFMT_YUY2
 PIPE_FORMAT_YUYV                 , subsampled, 2, 1, x32 ,     ,     ,     , xyz1, yuv
+# same subsampling but with rgb channels
+PIPE_FORMAT_R8G8_B8G8_UNORM      , subsampled, 2, 1, x32 ,     ,     ,     , xyz1, rgb
+PIPE_FORMAT_G8R8_G8B8_UNORM      , subsampled, 2, 1, x32 ,     ,     ,     , xyz1, rgb
+
+# some special formats not fitting anywhere else
+PIPE_FORMAT_R10G10B10A2_USCALED   , plain,      1,  1, u10 , u10 , u10 , u2  , xyzw, rgb
+PIPE_FORMAT_R11G11B10_FLOAT       , plain,      1,  1, f11 , f11 , f10 ,     , xyz1, rgb
+PIPE_FORMAT_R9G9B9E5_FLOAT        , other,      1,  1, x32 ,     ,     ,     , xyz1, rgb
+PIPE_FORMAT_R1_UNORM              , other,      8,  1, x8  ,     ,     ,     , x001, rgb
+# A.k.a. D3DFMT_CxV8U8
+PIPE_FORMAT_R8G8Bx_SNORM          , other,      1,  1, sn8 , sn8 ,     ,     , xyz1, rgb
 
 # Compressed formats
-PIPE_FORMAT_DXT1_RGB              , compressed, 4, 4, x64 ,     ,     ,     , xyz1, rgb
-PIPE_FORMAT_DXT1_RGBA             , compressed, 4, 4, x64 ,     ,     ,     , xyzw, rgb
-PIPE_FORMAT_DXT3_RGBA             , compressed, 4, 4, x128,     ,     ,     , xyzw, rgb
-PIPE_FORMAT_DXT5_RGBA             , compressed, 4, 4, x128,     ,     ,     , xyzw, rgb
-PIPE_FORMAT_DXT1_SRGB             , compressed, 4, 4, x64 ,     ,     ,     , xyz1, srgb
-PIPE_FORMAT_DXT1_SRGBA            , compressed, 4, 4, x64 ,     ,     ,     , xyzw, srgb
-PIPE_FORMAT_DXT3_SRGBA            , compressed, 4, 4, x128,     ,     ,     , xyzw, srgb
-PIPE_FORMAT_DXT5_SRGBA            , compressed, 4, 4, x128,     ,     ,     , xyzw, srgb
+# - http://en.wikipedia.org/wiki/S3_Texture_Compression
+# - http://www.opengl.org/registry/specs/EXT/texture_compression_s3tc.txt
+# - http://www.opengl.org/registry/specs/ARB/texture_compression_rgtc.txt
+# - http://msdn.microsoft.com/en-us/library/bb694531.aspx
+PIPE_FORMAT_DXT1_RGB              , s3tc, 4, 4, x64 ,     ,     ,     , xyz1, rgb
+PIPE_FORMAT_DXT1_RGBA             , s3tc, 4, 4, x64 ,     ,     ,     , xyzw, rgb
+PIPE_FORMAT_DXT3_RGBA             , s3tc, 4, 4, x128,     ,     ,     , xyzw, rgb
+PIPE_FORMAT_DXT5_RGBA             , s3tc, 4, 4, x128,     ,     ,     , xyzw, rgb
+PIPE_FORMAT_DXT1_SRGB             , s3tc, 4, 4, x64 ,     ,     ,     , xyz1, srgb
+PIPE_FORMAT_DXT1_SRGBA            , s3tc, 4, 4, x64 ,     ,     ,     , xyzw, srgb
+PIPE_FORMAT_DXT3_SRGBA            , s3tc, 4, 4, x128,     ,     ,     , xyzw, srgb
+PIPE_FORMAT_DXT5_SRGBA            , s3tc, 4, 4, x128,     ,     ,     , xyzw, srgb
+
+PIPE_FORMAT_RGTC1_UNORM           , rgtc, 4, 4, x64,      ,     ,     , x001, rgb
+PIPE_FORMAT_RGTC1_SNORM           , rgtc, 4, 4, x64,      ,     ,     , x001, rgb
+PIPE_FORMAT_RGTC2_UNORM           , rgtc, 4, 4, x128,     ,     ,     , xy01, rgb
+PIPE_FORMAT_RGTC2_SNORM           , rgtc, 4, 4, x128,     ,     ,     , xy01, rgb
 
 # Straightforward D3D10-like formats (also used for 
 # vertex buffer element description)
@@ -148,10 +179,10 @@ PIPE_FORMAT_R32_SSCALED           , plain, 1, 1, s32 ,     ,     ,     , x001, r
 PIPE_FORMAT_R32G32_SSCALED        , plain, 1, 1, s32 , s32 ,     ,     , xy01, rgb
 PIPE_FORMAT_R32G32B32_SSCALED     , plain, 1, 1, s32 , s32 , s32 ,     , xyz1, rgb
 PIPE_FORMAT_R32G32B32A32_SSCALED  , plain, 1, 1, s32 , s32 , s32 , s32 , xyzw, rgb
-PIPE_FORMAT_R32_FIXED             , plain, 1, 1, h32 ,     ,     ,     , x001, rgb
-PIPE_FORMAT_R32G32_FIXED          , plain, 1, 1, h32 , h32 ,     ,     , xy01, rgb
-PIPE_FORMAT_R32G32B32_FIXED       , plain, 1, 1, h32 , h32 , h32 ,     , xyz1, rgb
-PIPE_FORMAT_R32G32B32A32_FIXED    , plain, 1, 1, h32 , h32 , h32 , h32 , xyzw, rgb
+PIPE_FORMAT_R16_FLOAT             , plain, 1, 1, f16 ,     ,     ,     , x001, rgb
+PIPE_FORMAT_R16G16_FLOAT          , plain, 1, 1, f16 , f16 ,     ,     , xy01, rgb
+PIPE_FORMAT_R16G16B16_FLOAT       , plain, 1, 1, f16 , f16 , f16 ,     , xyz1, rgb
+PIPE_FORMAT_R16G16B16A16_FLOAT    , plain, 1, 1, f16 , f16 , f16 , f16 , xyzw, rgb
 PIPE_FORMAT_R16_UNORM             , plain, 1, 1, un16,     ,     ,     , x001, rgb
 PIPE_FORMAT_R16G16_UNORM          , plain, 1, 1, un16, un16,     ,     , xy01, rgb
 PIPE_FORMAT_R16G16B16_UNORM       , plain, 1, 1, un16, un16, un16,     , xyz1, rgb
@@ -184,3 +215,18 @@ PIPE_FORMAT_R8_SSCALED            , plain, 1, 1, s8  ,     ,     ,     , x001, r
 PIPE_FORMAT_R8G8_SSCALED          , plain, 1, 1, s8  , s8  ,     ,     , xy01, rgb
 PIPE_FORMAT_R8G8B8_SSCALED        , plain, 1, 1, s8  , s8  , s8  ,     , xyz1, rgb
 PIPE_FORMAT_R8G8B8A8_SSCALED      , plain, 1, 1, s8  , s8  , s8  , s8  , xyzw, rgb
+
+# GL-specific vertex buffer element formats
+# A.k.a. GL_FIXED
+PIPE_FORMAT_R32_FIXED             , plain, 1, 1, h32 ,     ,     ,     , x001, rgb
+PIPE_FORMAT_R32G32_FIXED          , plain, 1, 1, h32 , h32 ,     ,     , xy01, rgb
+PIPE_FORMAT_R32G32B32_FIXED       , plain, 1, 1, h32 , h32 , h32 ,     , xyz1, rgb
+PIPE_FORMAT_R32G32B32A32_FIXED    , plain, 1, 1, h32 , h32 , h32 , h32 , xyzw, rgb
+
+# D3D9-specific vertex buffer element formats
+# See also:
+# - http://msdn.microsoft.com/en-us/library/bb172533.aspx
+# A.k.a. D3DDECLTYPE_UDEC3
+PIPE_FORMAT_R10G10B10X2_USCALED   , plain, 1, 1, u10 , u10 , u10  , x2 , xyz1, rgb
+# A.k.a. D3DDECLTYPE_DEC3N
+PIPE_FORMAT_R10G10B10X2_SNORM     , plain, 1, 1, sn10, sn10, sn10 , x2 , xyz1, rgb
diff --git a/src/gallium/auxiliary/util/u_format.h b/src/gallium/auxiliary/util/u_format.h
index e8fa0022b5..5e3dc694be 100644
--- a/src/gallium/auxiliary/util/u_format.h
+++ b/src/gallium/auxiliary/util/u_format.h
@@ -56,15 +56,23 @@ enum util_format_layout {
     *
     * This is for formats like YV12 where there is less than one sample per
     * pixel.
-    *
-    * XXX: This could actually b
     */
    UTIL_FORMAT_LAYOUT_SUBSAMPLED = 3,
 
    /**
-    * An unspecified compression algorithm.
+    * S3 Texture Compression formats.
+    */
+   UTIL_FORMAT_LAYOUT_S3TC = 4,
+
+   /**
+    * Red-Green Texture Compression formats.
+    */
+   UTIL_FORMAT_LAYOUT_RGTC = 5,
+
+   /**
+    * Everything else that doesn't fit in any of the above layouts.
     */
-   UTIL_FORMAT_LAYOUT_COMPRESSED = 4
+   UTIL_FORMAT_LAYOUT_OTHER = 6
 };
 
 
@@ -120,9 +128,15 @@ struct util_format_channel_description
 struct util_format_description
 {
    enum pipe_format format;
+
    const char *name;
 
    /**
+    * Short name, striped of the prefix, lower case.
+    */
+   const char *short_name;
+
+   /**
     * Pixel block dimensions.
     */
    struct util_format_block block;
@@ -140,6 +154,15 @@ struct util_format_description
    unsigned is_array:1;
 
    /**
+    * Whether the pixel format can be described as a bitfield structure.
+    *
+    * In particular:
+    * - pixel depth must be 8, 16, or 32 bits;
+    * - all channels must be unsigned, signed, or void
+    */
+   unsigned is_bitmask:1;
+
+   /**
     * Whether channels have mixed types (ignoring UTIL_FORMAT_TYPE_VOID).
     */
    unsigned is_mixed:1;
@@ -166,6 +189,117 @@ struct util_format_description
     * Colorspace transformation.
     */
    enum util_format_colorspace colorspace;
+
+   /**
+    * Unpack pixel blocks to R8G8B8A8_UNORM.
+    *
+    * Only defined for non-depth-stencil formats.
+    */
+   void
+   (*unpack_rgba_8unorm)(uint8_t *dst, unsigned dst_stride,
+                         const uint8_t *src, unsigned src_stride,
+                         unsigned width, unsigned height);
+
+   /**
+    * Pack pixel blocks from R8G8B8A8_UNORM.
+    *
+    * Only defined for non-depth-stencil formats.
+    */
+   void
+   (*pack_rgba_8unorm)(uint8_t *dst, unsigned dst_stride,
+                       const uint8_t *src, unsigned src_stride,
+                       unsigned width, unsigned height);
+
+   /**
+    * Unpack pixel blocks to R32G32B32A32_FLOAT.
+    *
+    * Only defined for non-depth-stencil formats.
+    */
+   void
+   (*unpack_rgba_float)(float *dst, unsigned dst_stride,
+                        const uint8_t *src, unsigned src_stride,
+                        unsigned width, unsigned height);
+
+   /**
+    * Pack pixel blocks from R32G32B32A32_FLOAT.
+    *
+    * Only defined for non-depth-stencil formats.
+    */
+   void
+   (*pack_rgba_float)(uint8_t *dst, unsigned dst_stride,
+                      const float *src, unsigned src_stride,
+                      unsigned width, unsigned height);
+
+   /**
+    * Fetch a single pixel (i, j) from a block.
+    *
+    * Only defined for non-depth-stencil formats.
+    */
+   void
+   (*fetch_rgba_float)(float *dst,
+                       const uint8_t *src,
+                       unsigned i, unsigned j);
+
+   /**
+    * Unpack pixels to Z32_UNORM.
+    *
+    * Only defined for depth formats.
+    */
+   void
+   (*unpack_z_32unorm)(uint32_t *dst, unsigned dst_stride,
+                       const uint8_t *src, unsigned src_stride,
+                       unsigned width, unsigned height);
+
+   /**
+    * Pack pixels from Z32_FLOAT.
+    *
+    * Only defined for depth formats.
+    */
+   void
+   (*pack_z_32unorm)(uint8_t *dst, unsigned dst_stride,
+                     const uint32_t *src, unsigned src_stride,
+                     unsigned width, unsigned height);
+
+   /**
+    * Unpack pixels to Z32_FLOAT.
+    *
+    * Only defined for depth formats.
+    */
+   void
+   (*unpack_z_float)(float *dst, unsigned dst_stride,
+                     const uint8_t *src, unsigned src_stride,
+                     unsigned width, unsigned height);
+
+   /**
+    * Pack pixels from Z32_FLOAT.
+    *
+    * Only defined for depth formats.
+    */
+   void
+   (*pack_z_float)(uint8_t *dst, unsigned dst_stride,
+                   const float *src, unsigned src_stride,
+                   unsigned width, unsigned height);
+
+   /**
+    * Unpack pixels to S8_USCALED.
+    *
+    * Only defined for stencil formats.
+    */
+   void
+   (*unpack_s_8uscaled)(uint8_t *dst, unsigned dst_stride,
+                        const uint8_t *src, unsigned src_stride,
+                        unsigned width, unsigned height);
+
+   /**
+    * Pack pixels from S8_USCALED.
+    *
+    * Only defined for stencil formats.
+    */
+   void
+   (*pack_s_8uscaled)(uint8_t *dst, unsigned dst_stride,
+                      const uint8_t *src, unsigned src_stride,
+                      unsigned width, unsigned height);
+
 };
 
 
@@ -186,8 +320,8 @@ util_format_name(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
 
-   assert(format);
-   if (!format) {
+   assert(desc);
+   if (!desc) {
       return "???";
    }
 
@@ -195,16 +329,16 @@ util_format_name(enum pipe_format format)
 }
 
 static INLINE boolean 
-util_format_is_compressed(enum pipe_format format)
+util_format_is_s3tc(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
 
-   assert(format);
-   if (!format) {
+   assert(desc);
+   if (!desc) {
       return FALSE;
    }
 
-   return desc->layout == UTIL_FORMAT_LAYOUT_COMPRESSED ? TRUE : FALSE;
+   return desc->layout == UTIL_FORMAT_LAYOUT_S3TC ? TRUE : FALSE;
 }
 
 static INLINE boolean 
@@ -212,8 +346,8 @@ util_format_is_depth_or_stencil(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
 
-   assert(format);
-   if (!format) {
+   assert(desc);
+   if (!desc) {
       return FALSE;
    }
 
@@ -225,8 +359,8 @@ util_format_is_depth_and_stencil(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
 
-   assert(format);
-   if (!format) {
+   assert(desc);
+   if (!desc) {
       return FALSE;
    }
 
@@ -238,6 +372,34 @@ util_format_is_depth_and_stencil(enum pipe_format format)
            desc->swizzle[1] != UTIL_FORMAT_SWIZZLE_NONE) ? TRUE : FALSE;
 }
 
+/**
+ * Whether this format is a rgab8 variant.
+ *
+ * That is, any format that matches the
+ *
+ *   PIPE_FORMAT_?8?8?8?8_UNORM
+ */
+static INLINE boolean
+util_format_is_rgba8_variant(const struct util_format_description *desc)
+{
+   unsigned chan;
+
+   if(desc->block.width != 1 ||
+      desc->block.height != 1 ||
+      desc->block.bits != 32)
+      return FALSE;
+
+   for(chan = 0; chan < 4; ++chan) {
+      if(desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED &&
+         desc->channel[chan].type != UTIL_FORMAT_TYPE_VOID)
+         return FALSE;
+      if(desc->channel[chan].size != 8)
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
 
 /**
  * Return total bits needed for the pixel format per block.
@@ -247,8 +409,8 @@ util_format_get_blocksizebits(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
 
-   assert(format);
-   if (!format) {
+   assert(desc);
+   if (!desc) {
       return 0;
    }
 
@@ -273,8 +435,8 @@ util_format_get_blockwidth(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
 
-   assert(format);
-   if (!format) {
+   assert(desc);
+   if (!desc) {
       return 1;
    }
 
@@ -286,8 +448,8 @@ util_format_get_blockheight(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
 
-   assert(format);
-   if (!format) {
+   assert(desc);
+   if (!desc) {
       return 1;
    }
 
@@ -400,6 +562,16 @@ util_format_has_alpha(enum pipe_format format)
    }
 }
 
+/**
+ * Return the number of components stored.
+ * Formats with block size != 1x1 will always have 1 component (the block).
+ */
+static INLINE unsigned
+util_format_get_nr_components(enum pipe_format format)
+{
+   const struct util_format_description *desc = util_format_description(format);
+   return desc->nr_channels;
+}
 
 /*
  * Format access functions.
@@ -429,6 +601,19 @@ util_format_write_4ub(enum pipe_format format,
                       void *dst, unsigned dst_stride, 
                       unsigned x, unsigned y, unsigned w, unsigned h);
 
+/*
+ * Generic format conversion;
+ */
+
+void
+util_format_translate(enum pipe_format dst_format,
+                      void *dst, unsigned dst_stride,
+                      unsigned dst_x, unsigned dst_y,
+                      enum pipe_format src_format,
+                      const void *src, unsigned src_stride,
+                      unsigned src_x, unsigned src_y,
+                      unsigned width, unsigned height);
+
 #ifdef __cplusplus
 } // extern "C" {
 #endif
diff --git a/src/gallium/auxiliary/util/u_format_access.py b/src/gallium/auxiliary/util/u_format_access.py
deleted file mode 100644
index 00424779d2..0000000000
--- a/src/gallium/auxiliary/util/u_format_access.py
+++ /dev/null
@@ -1,341 +0,0 @@
-#!/usr/bin/env python
-
-'''
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * Pixel format accessor functions.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-'''
-
-
-import math
-import sys
-
-from u_format_pack import *
-
-
-def is_format_supported(format):
-    '''Determines whether we actually have the plumbing necessary to generate the 
-    to read/write to/from this format.'''
-
-    # FIXME: Ideally we would support any format combination here.
-
-    # XXX: It should be straightforward to support srgb
-    if format.colorspace not in ('rgb', 'zs'):
-        return False
-
-    if format.layout != PLAIN:
-        return False
-
-    for i in range(4):
-        channel = format.channels[i]
-        if channel.type not in (VOID, UNSIGNED, FLOAT):
-            return False
-
-    # We can only read a color from a depth/stencil format if the depth channel is present
-    if format.colorspace == 'zs' and format.swizzles[0] == SWIZZLE_NONE:
-        return False
-
-    return True
-
-
-def native_type(format):
-    '''Get the native appropriate for a format.'''
-
-    if format.layout == PLAIN:
-        if not format.is_array():
-            # For arithmetic pixel formats return the integer type that matches the whole pixel
-            return 'uint%u_t' % format.block_size()
-        else:
-            # For array pixel formats return the integer type that matches the color channel
-            channel = format.channels[0]
-            if channel.type == UNSIGNED:
-                return 'uint%u_t' % channel.size
-            elif channel.type == SIGNED:
-                return 'int%u_t' % channel.size
-            elif channel.type == FLOAT:
-                if channel.size == 32:
-                    return 'float'
-                elif channel.size == 64:
-                    return 'double'
-                else:
-                    assert False
-            else:
-                assert False
-    else:
-        assert False
-
-
-def generate_srgb_tables():
-    print 'static ubyte srgb_to_linear[256] = {'
-    for i in range(256):
-        print '   %s,' % (int(math.pow((i / 255.0 + 0.055) / 1.055, 2.4) * 255))
-    print '};'
-    print
-    print 'static ubyte linear_to_srgb[256] = {'
-    print '   0,'
-    for i in range(1, 256):
-        print '   %s,' % (int((1.055 * math.pow(i / 255.0, 0.41666) - 0.055) * 255))
-    print '};'
-    print
-
-
-def generate_format_read(format, dst_channel, dst_native_type, dst_suffix):
-    '''Generate the function to read pixels from a particular format'''
-
-    name = format.short_name()
-
-    src_native_type = native_type(format)
-
-    print 'static void'
-    print 'util_format_%s_read_%s(%s *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)' % (name, dst_suffix, dst_native_type)
-    print '{'
-    print '   unsigned x, y;'
-    print '   const uint8_t *src_row = src + y0*src_stride;'
-    print '   %s *dst_row = dst;' % dst_native_type
-    print '   for (y = 0; y < h; ++y) {'
-    print '      const %s *src_pixel = (const %s *)(src_row + x0*%u);' % (src_native_type, src_native_type, format.stride())
-    print '      %s *dst_pixel = dst_row;' %dst_native_type
-    print '      for (x = 0; x < w; ++x) {'
-
-    names = ['']*4
-    if format.colorspace == 'rgb':
-        for i in range(4):
-            swizzle = format.swizzles[i]
-            if swizzle < 4:
-                names[swizzle] += 'rgba'[i]
-    elif format.colorspace == 'zs':
-        swizzle = format.swizzles[0]
-        if swizzle < 4:
-            names[swizzle] = 'z'
-        else:
-            assert False
-    else:
-        assert False
-
-    if format.layout == PLAIN:
-        if not format.is_array():
-            print '         %s pixel = *src_pixel++;' % src_native_type
-            shift = 0;
-            for i in range(4):
-                src_channel = format.channels[i]
-                width = src_channel.size
-                if names[i]:
-                    value = 'pixel'
-                    mask = (1 << width) - 1
-                    if shift:
-                        value = '(%s >> %u)' % (value, shift)
-                    if shift + width < format.block_size():
-                        value = '(%s & 0x%x)' % (value, mask)
-                    value = conversion_expr(src_channel, dst_channel, dst_native_type, value)
-                    print '         %s %s = %s;' % (dst_native_type, names[i], value)
-                shift += width
-        else:
-            for i in range(4):
-                src_channel = format.channels[i]
-                if names[i]:
-                    value = 'src_pixel[%u]' % i
-                    value = conversion_expr(src_channel, dst_channel, dst_native_type, value)
-                    print '         %s %s = %s;' % (dst_native_type, names[i], value)
-            print '         src_pixel += %u;' % (format.nr_channels())
-    else:
-        assert False
-
-    for i in range(4):
-        if format.colorspace == 'rgb':
-            swizzle = format.swizzles[i]
-            if swizzle < 4:
-                value = names[swizzle]
-            elif swizzle == SWIZZLE_0:
-                value = '0'
-            elif swizzle == SWIZZLE_1:
-                value = get_one(dst_channel)
-            else:
-                assert False
-        elif format.colorspace == 'zs':
-            if i < 3:
-                value = 'z'
-            else:
-                value = get_one(dst_channel)
-        else:
-            assert False
-        print '         *dst_pixel++ = %s; /* %s */' % (value, 'rgba'[i])
-
-    print '      }'
-    print '      src_row += src_stride;'
-    print '      dst_row += dst_stride/sizeof(*dst_row);'
-    print '   }'
-    print '}'
-    print
-    
-
-def generate_format_write(format, src_channel, src_native_type, src_suffix):
-    '''Generate the function to write pixels to a particular format'''
-
-    name = format.short_name()
-
-    dst_native_type = native_type(format)
-
-    print 'static void'
-    print 'util_format_%s_write_%s(const %s *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)' % (name, src_suffix, src_native_type)
-    print '{'
-    print '   unsigned x, y;'
-    print '   uint8_t *dst_row = dst + y0*dst_stride;'
-    print '   const %s *src_row = src;' % src_native_type
-    print '   for (y = 0; y < h; ++y) {'
-    print '      %s *dst_pixel = (%s *)(dst_row + x0*%u);' % (dst_native_type, dst_native_type, format.stride())
-    print '      const %s *src_pixel = src_row;' %src_native_type
-    print '      for (x = 0; x < w; ++x) {'
-
-    inv_swizzle = format.inv_swizzles()
-
-    if format.layout == PLAIN:
-        if not format.is_array():
-            print '         %s pixel = 0;' % dst_native_type
-            shift = 0;
-            for i in range(4):
-                dst_channel = format.channels[i]
-                width = dst_channel.size
-                if inv_swizzle[i] is not None:
-                    value = 'src_pixel[%u]' % inv_swizzle[i]
-                    value = conversion_expr(src_channel, dst_channel, dst_native_type, value)
-                    if shift:
-                        value = '(%s << %u)' % (value, shift)
-                    print '         pixel |= %s;' % value
-                shift += width
-            print '         *dst_pixel++ = pixel;'
-        else:
-            for i in range(4):
-                dst_channel = format.channels[i]
-                if inv_swizzle[i] is not None:
-                    value = 'src_pixel[%u]' % inv_swizzle[i]
-                    value = conversion_expr(src_channel, dst_channel, dst_native_type, value)
-                    print '         *dst_pixel++ = %s;' % value
-    else:
-        assert False
-    print '         src_pixel += 4;'
-
-    print '      }'
-    print '      dst_row += dst_stride;'
-    print '      src_row += src_stride/sizeof(*src_row);'
-    print '   }'
-    print '}'
-    print
-    
-
-def generate_read(formats, dst_channel, dst_native_type, dst_suffix):
-    '''Generate the dispatch function to read pixels from any format'''
-
-    for format in formats:
-        if is_format_supported(format):
-            generate_format_read(format, dst_channel, dst_native_type, dst_suffix)
-
-    print 'void'
-    print 'util_format_read_%s(enum pipe_format format, %s *dst, unsigned dst_stride, const void *src, unsigned src_stride, unsigned x, unsigned y, unsigned w, unsigned h)' % (dst_suffix, dst_native_type)
-    print '{'
-    print '   void (*func)(%s *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h);' % dst_native_type
-    print '   switch(format) {'
-    for format in formats:
-        if is_format_supported(format):
-            print '   case %s:' % format.name
-            print '      func = &util_format_%s_read_%s;' % (format.short_name(), dst_suffix)
-            print '      break;'
-    print '   default:'
-    print '      debug_printf("unsupported format\\n");'
-    print '      return;'
-    print '   }'
-    print '   func(dst, dst_stride, (const uint8_t *)src, src_stride, x, y, w, h);'
-    print '}'
-    print
-
-
-def generate_write(formats, src_channel, src_native_type, src_suffix):
-    '''Generate the dispatch function to write pixels to any format'''
-
-    for format in formats:
-        if is_format_supported(format):
-            generate_format_write(format, src_channel, src_native_type, src_suffix)
-
-    print 'void'
-    print 'util_format_write_%s(enum pipe_format format, const %s *src, unsigned src_stride, void *dst, unsigned dst_stride, unsigned x, unsigned y, unsigned w, unsigned h)' % (src_suffix, src_native_type)
-    
-    print '{'
-    print '   void (*func)(const %s *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h);' % src_native_type
-    print '   switch(format) {'
-    for format in formats:
-        if is_format_supported(format):
-            print '   case %s:' % format.name
-            print '      func = &util_format_%s_write_%s;' % (format.short_name(), src_suffix)
-            print '      break;'
-    print '   default:'
-    print '      debug_printf("unsupported format\\n");'
-    print '      return;'
-    print '   }'
-    print '   func(src, src_stride, (uint8_t *)dst, dst_stride, x, y, w, h);'
-    print '}'
-    print
-
-
-def main():
-    formats = []
-    for arg in sys.argv[1:]:
-        formats.extend(parse(arg))
-
-    print '/* This file is autogenerated by u_format_access.py from u_format.csv. Do not edit directly. */'
-    print
-    # This will print the copyright message on the top of this file
-    print __doc__.strip()
-    print
-    print '#include "pipe/p_compiler.h"'
-    print '#include "u_math.h"'
-    print '#include "u_format_pack.h"'
-    print
-
-    generate_srgb_tables()
-
-    type = Channel(FLOAT, False, 32)
-    native_type = 'float'
-    suffix = '4f'
-
-    generate_read(formats, type, native_type, suffix)
-    generate_write(formats, type, native_type, suffix)
-
-    type = Channel(UNSIGNED, True, 8)
-    native_type = 'uint8_t'
-    suffix = '4ub'
-
-    generate_read(formats, type, native_type, suffix)
-    generate_write(formats, type, native_type, suffix)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/src/gallium/auxiliary/util/u_format_other.c b/src/gallium/auxiliary/util/u_format_other.c
new file mode 100644
index 0000000000..723fa8c3bf
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_format_other.c
@@ -0,0 +1,267 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#include "u_math.h"
+#include "u_format_other.h"
+
+
+void
+util_format_r9g9b9e5_float_unpack_rgba_float(float *dst_row, unsigned dst_stride,
+                                        const uint8_t *src_row, unsigned src_stride,
+                                        unsigned width, unsigned height)
+{
+
+}
+
+void
+util_format_r9g9b9e5_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride,
+                                      const float *src_row, unsigned src_stride,
+                                      unsigned width, unsigned height)
+{
+
+}
+
+void
+util_format_r9g9b9e5_float_fetch_rgba_float(float *dst, const uint8_t *src,
+                                       unsigned i, unsigned j)
+{
+
+}
+
+
+void
+util_format_r9g9b9e5_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                         const uint8_t *src_row, unsigned src_stride,
+                                         unsigned width, unsigned height)
+{
+
+}
+
+
+void
+util_format_r9g9b9e5_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                       const uint8_t *src_row, unsigned src_stride,
+                                       unsigned width, unsigned height)
+{
+
+}
+
+
+void
+util_format_r1_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride,
+                                  const uint8_t *src_row, unsigned src_stride,
+                                  unsigned width, unsigned height)
+{
+
+}
+
+
+void
+util_format_r1_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride,
+                                const float *src_row, unsigned src_stride,
+                                unsigned width, unsigned height)
+{
+
+}
+
+
+void
+util_format_r1_unorm_fetch_rgba_float(float *dst, const uint8_t *src,
+                                 unsigned i, unsigned j)
+{
+
+}
+
+
+void
+util_format_r1_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                   const uint8_t *src_row, unsigned src_stride,
+                                   unsigned width, unsigned height)
+{
+
+}
+
+
+void
+util_format_r1_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                 const uint8_t *src_row, unsigned src_stride,
+                                 unsigned width, unsigned height)
+{
+}
+
+
+/*
+ * PIPE_FORMAT_R8G8Bx_SNORM
+ *
+ * A.k.a. D3DFMT_CxV8U8
+ */
+
+
+void
+util_format_r8g8bx_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride,
+                                      const uint8_t *src_row, unsigned src_stride,
+                                      unsigned width, unsigned height)
+{
+   unsigned x, y;
+
+   for(y = 0; y < height; y += 1) {
+      float *dst = dst_row;
+      const uint16_t *src = (const uint16_t *)src_row;
+      for(x = 0; x < width; x += 1) {
+         uint16_t value = *src++;
+         int16_t r, g;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         r = ((int16_t)(value << 8)) >> 8;
+         g = ((int16_t)(value << 0)) >> 8;
+
+         dst[0] = (float)(r * (1.0f/0x7f)); /* r */
+         dst[1] = (float)(g * (1.0f/0x7f)); /* g */
+         dst[2] = sqrtf(1.0f - dst[0] * dst[0] - dst[1] * dst[1]); /* b */
+         dst[3] = 1.0f; /* a */
+         dst += 4;
+      }
+      src_row += src_stride;
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+
+void
+util_format_r8g8bx_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                       const uint8_t *src_row, unsigned src_stride,
+                                       unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; y += 1) {
+      uint8_t *dst = dst_row;
+      const uint16_t *src = (const uint16_t *)src_row;
+      for(x = 0; x < width; x += 1) {
+         uint16_t value = *src++;
+         int16_t r, g;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         r = ((int16_t)(value << 8)) >> 8;
+         g = ((int16_t)(value << 0)) >> 8;
+
+         dst[0] = (uint8_t)(((uint16_t)MAX2(r, 0)) * 0xff / 0x7f); /* r */
+         dst[1] = (uint8_t)(((uint16_t)MAX2(g, 0)) * 0xff / 0x7f); /* g */
+         dst[2] = (uint8_t)sqrtf(0x7f*0x7f - r * r - g * g) * 0xff / 0x7f; /* b */
+         dst[3] = 255; /* a */
+         dst += 4;
+      }
+      src_row += src_stride;
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+
+void
+util_format_r8g8bx_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride,
+                                    const float *src_row, unsigned src_stride,
+                                    unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; y += 1) {
+      const float *src = src_row;
+      uint16_t *dst = (uint16_t *)dst_row;
+      for(x = 0; x < width; x += 1) {
+         uint16_t value = 0;
+
+         value |= (uint16_t)(((int8_t)(CLAMP(src[0], -1, 1) * 0x7f)) & 0xff) ;
+         value |= (uint16_t)((((int8_t)(CLAMP(src[1], -1, 1) * 0x7f)) & 0xff) << 8) ;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         *dst++ = value;
+
+         src += 4;
+      }
+      dst_row += dst_stride;
+      src_row += src_stride/sizeof(*src_row);
+   }
+}
+
+
+void
+util_format_r8g8bx_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                     const uint8_t *src_row, unsigned src_stride,
+                                     unsigned width, unsigned height)
+{
+   unsigned x, y;
+
+   for(y = 0; y < height; y += 1) {
+      const uint8_t *src = src_row;
+      uint16_t *dst = (uint16_t *)dst_row;
+      for(x = 0; x < width; x += 1) {
+         uint16_t value = 0;
+
+         value |= src[0] >> 1;
+         value |= (src[1] >> 1) << 8;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         *dst++ = value;
+
+         src += 4;
+      }
+      dst_row += dst_stride;
+      src_row += src_stride/sizeof(*src_row);
+   }
+}
+
+
+void
+util_format_r8g8bx_snorm_fetch_rgba_float(float *dst, const uint8_t *src,
+                                     unsigned i, unsigned j)
+{
+   uint16_t value = *(const uint16_t *)src;
+   int16_t r, g;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+   value = util_bswap32(value);
+#endif
+
+   r = ((int16_t)(value << 8)) >> 8;
+   g = ((int16_t)(value << 0)) >> 8;
+
+   dst[0] = r * (1.0f/0x7f); /* r */
+   dst[1] = g * (1.0f/0x7f); /* g */
+   dst[2] = sqrtf(1.0f - dst[0] * dst[0] - dst[1] * dst[1]); /* b */
+   dst[3] = 1.0f; /* a */
+}
diff --git a/src/gallium/auxiliary/util/u_format_other.h b/src/gallium/auxiliary/util/u_format_other.h
new file mode 100644
index 0000000000..98c97be33f
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_format_other.h
@@ -0,0 +1,108 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#ifndef U_FORMAT_OTHER_H_
+#define U_FORMAT_OTHER_H_
+
+
+#include "pipe/p_compiler.h"
+
+
+void
+util_format_r9g9b9e5_float_unpack_rgba_float(float *dst_row, unsigned dst_stride,
+                                        const uint8_t *src_row, unsigned src_stride,
+                                        unsigned width, unsigned height);
+
+void
+util_format_r9g9b9e5_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride,
+                                      const float *src_row, unsigned src_stride,
+                                      unsigned width, unsigned height);
+
+void
+util_format_r9g9b9e5_float_fetch_rgba_float(float *dst, const uint8_t *src,
+                                       unsigned i, unsigned j);
+
+void
+util_format_r9g9b9e5_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                         const uint8_t *src_row, unsigned src_stride,
+                                         unsigned width, unsigned height);
+
+void
+util_format_r9g9b9e5_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                       const uint8_t *src_row, unsigned src_stride,
+                                       unsigned width, unsigned height);
+
+void
+util_format_r1_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride,
+                                  const uint8_t *src_row, unsigned src_stride,
+                                  unsigned width, unsigned height);
+
+void
+util_format_r1_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride,
+                                const float *src_row, unsigned src_stride,
+                                unsigned width, unsigned height);
+
+void
+util_format_r1_unorm_fetch_rgba_float(float *dst, const uint8_t *src,
+                                 unsigned i, unsigned j);
+
+void
+util_format_r1_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                   const uint8_t *src_row, unsigned src_stride,
+                                   unsigned width, unsigned height);
+
+void
+util_format_r1_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                 const uint8_t *src_row, unsigned src_stride,
+                                 unsigned width, unsigned height);
+
+void
+util_format_r8g8bx_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride,
+                                      const uint8_t *src_row, unsigned src_stride,
+                                      unsigned width, unsigned height);
+
+void
+util_format_r8g8bx_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride,
+                                    const float *src_row, unsigned src_stride,
+                                    unsigned width, unsigned height);
+
+void
+util_format_r8g8bx_snorm_fetch_rgba_float(float *dst, const uint8_t *src,
+                                     unsigned i, unsigned j);
+
+void
+util_format_r8g8bx_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                       const uint8_t *src_row, unsigned src_stride,
+                                       unsigned width, unsigned height);
+
+void
+util_format_r8g8bx_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                     const uint8_t *src_row, unsigned src_stride,
+                                     unsigned width, unsigned height);
+
+#endif /* U_FORMAT_OTHER_H_ */
diff --git a/src/gallium/auxiliary/util/u_format_pack.py b/src/gallium/auxiliary/util/u_format_pack.py
index 3f33f7cc02..0c1bbc84c1 100644
--- a/src/gallium/auxiliary/util/u_format_pack.py
+++ b/src/gallium/auxiliary/util/u_format_pack.py
@@ -3,7 +3,7 @@
 '''
 /**************************************************************************
  *
- * Copyright 2009 VMware, Inc.
+ * Copyright 2009-2010 VMware, Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -38,6 +38,7 @@
 
 
 import sys
+import math
 
 from u_format_parse import *
 
@@ -45,19 +46,35 @@ from u_format_parse import *
 def generate_format_type(format):
     '''Generate a structure that describes the format.'''
 
+    assert format.layout == PLAIN
+    
     print 'union util_format_%s {' % format.short_name()
-    if format.is_bitmask():
+    
+    if format.block_size() in (8, 16, 32, 64):
         print '   uint%u_t value;' % (format.block_size(),)
+
+    use_bitfields = False
+    for channel in format.channels:
+        if channel.size % 8 or not is_pot(channel.size):
+            use_bitfields = True
+
     print '   struct {'
     for channel in format.channels:
-        if format.is_bitmask() and not format.is_array():
+        if use_bitfields:
             if channel.type == VOID:
                 if channel.size:
                     print '      unsigned %s:%u;' % (channel.name, channel.size)
             elif channel.type == UNSIGNED:
                 print '      unsigned %s:%u;' % (channel.name, channel.size)
-            elif channel.type == SIGNED:
+            elif channel.type in (SIGNED, FIXED):
                 print '      int %s:%u;' % (channel.name, channel.size)
+            elif channel.type == FLOAT:
+                if channel.size == 64:
+                    print '      double %s;' % (channel.name)
+                elif channel.size == 32:
+                    print '      float %s;' % (channel.name)
+                else:
+                    print '      unsigned %s:%u;' % (channel.name, channel.size)
             else:
                 assert 0
         else:
@@ -88,7 +105,7 @@ def generate_format_type(format):
 def bswap_format(format):
     '''Generate a structure that describes the format.'''
 
-    if format.is_bitmask() and not format.is_array():
+    if format.is_bitmask() and not format.is_array() and format.block_size() > 8:
         print '#ifdef PIPE_ARCH_BIG_ENDIAN'
         print '   pixel.value = util_bswap%u(pixel.value);' % format.block_size()
         print '#endif'
@@ -105,12 +122,10 @@ def is_format_supported(format):
 
     for i in range(4):
         channel = format.channels[i]
-        if channel.type not in (VOID, UNSIGNED, SIGNED, FLOAT):
+        if channel.type not in (VOID, UNSIGNED, SIGNED, FLOAT, FIXED):
+            return False
+        if channel.type == FLOAT and channel.size not in (16, 32, 64):
             return False
-
-    # We can only read a color from a depth/stencil format if the depth channel is present
-    if format.colorspace == 'zs' and format.swizzles[0] == SWIZZLE_NONE:
-        return False
 
     return True
 
@@ -124,15 +139,17 @@ def native_type(format):
             return 'uint%u_t' % format.block_size()
         else:
             # For array pixel formats return the integer type that matches the color channel
-            type = format.channels[0]
-            if type.type == UNSIGNED:
-                return 'uint%u_t' % type.size
-            elif type.type == SIGNED:
-                return 'int%u_t' % type.size
-            elif type.type == FLOAT:
-                if type.size == 32:
+            channel = format.channels[0]
+            if channel.type in (UNSIGNED, VOID):
+                return 'uint%u_t' % channel.size
+            elif channel.type in (SIGNED, FIXED):
+                return 'int%u_t' % channel.size
+            elif channel.type == FLOAT:
+                if channel.size == 16:
+                    return 'uint16_t'
+                elif channel.size == 32:
                     return 'float'
-                elif type.size == 64:
+                elif channel.size == 64:
                     return 'double'
                 else:
                     assert False
@@ -171,37 +188,35 @@ def get_one_shift(type):
     assert False
 
 
-def get_one(type):
+def value_to_native(type, value):
     '''Get the value of unity for this type.'''
-    if type.type == 'FLOAT' or not type.norm:
-        return 1
+    if type.type == FLOAT:
+        return value
+    if type.type == FIXED:
+        return int(value * (1 << (type.size/2)))
+    if not type.norm:
+        return int(value)
+    if type.type == UNSIGNED:
+        return int(value * ((1 << type.size) - 1))
+    if type.type == SIGNED:
+        return int(value * ((1 << (type.size - 1)) - 1))
+    assert False
+
+
+def native_to_constant(type, value):
+    '''Get the value of unity for this type.'''
+    if type.type == FLOAT:
+        if type.size <= 32:
+            return "%ff" % value 
+        else:
+            return "%ff" % value 
     else:
-        return (1 << get_one_shift(type)) - 1
-
-
-def generate_clamp():
-    '''Code generate the clamping functions for each type.
-
-    We don't use a macro so that arguments with side effects, 
-    like *src_pixel++ are correctly handled.
-    '''
-
-    for suffix, native_type in [
-        ('', 'double'),
-        ('f', 'float'),
-        ('ui', 'unsigned int'),
-        ('si', 'int'),
-    ]:
-        print 'static INLINE %s' % native_type
-        print 'clamp%s(%s value, %s lbound, %s ubound)' % (suffix, native_type, native_type, native_type)
-        print '{'
-        print '   if(value < lbound)'
-        print '      return lbound;'
-        print '   if(value > ubound)'
-        print '      return ubound;'
-        print '   return value;'
-        print '}'
-        print
+        return str(int(value))
+
+
+def get_one(type):
+    '''Get the value of unity for this type.'''
+    return value_to_native(type, 1)
 
 
 def clamp_expr(src_channel, dst_channel, dst_native_type, value):
@@ -211,274 +226,448 @@ def clamp_expr(src_channel, dst_channel, dst_native_type, value):
     if src_channel == dst_channel:
         return value
 
-    # Pick the approriate clamp function
-    if src_channel.type == FLOAT:
-        if src_channel.size == 32:
-            func = 'clampf'
-        elif src_channel.size == 64:
-            func = 'clamp'
-        else:
-            assert False
-    elif src_channel.type == UNSIGNED:
-        func = 'clampui'
-    elif src_channel.type == SIGNED:
-        func = 'clampsi'
-    else:
-        assert False
-
     src_min = src_channel.min()
     src_max = src_channel.max()
     dst_min = dst_channel.min()
     dst_max = dst_channel.max()
+    
+    # Translate the destination range to the src native value
+    dst_min_native = value_to_native(src_channel, dst_min)
+    dst_max_native = value_to_native(src_channel, dst_max)
 
     if src_min < dst_min and src_max > dst_max:
-        return 'CLAMP(%s, %s, %s)' % (value, dst_min, dst_max)
+        return 'CLAMP(%s, %s, %s)' % (value, dst_min_native, dst_max_native)
 
     if src_max > dst_max:
-        return 'MIN2(%s, %s)' % (value, dst_max)
+        return 'MIN2(%s, %s)' % (value, dst_max_native)
         
     if src_min < dst_min:
-        return 'MAX2(%s, %s)' % (value, dst_min)
+        return 'MAX2(%s, %s)' % (value, dst_min_native)
 
     return value
 
 
-def conversion_expr(src_channel, dst_channel, dst_native_type, value, clamp=True):
+def conversion_expr(src_channel, 
+                    dst_channel, dst_native_type, 
+                    value, 
+                    clamp=True, 
+                    src_colorspace = RGB, 
+                    dst_colorspace = RGB):
     '''Generate the expression to convert a value between two types.'''
 
+    if src_colorspace != dst_colorspace:
+        if src_colorspace == SRGB:
+            assert src_channel.type == UNSIGNED
+            assert src_channel.norm
+            assert src_channel.size == 8
+            assert dst_colorspace == RGB
+            if dst_channel.type == FLOAT:
+                return 'util_format_srgb_8unorm_to_linear_float(%s)' % value
+            else:
+                assert dst_channel.type == UNSIGNED
+                assert dst_channel.norm
+                assert dst_channel.size == 8
+                return 'util_format_srgb_to_linear_8unorm(%s)' % value
+        elif dst_colorspace == SRGB:
+            assert dst_channel.type == UNSIGNED
+            assert dst_channel.norm
+            assert dst_channel.size == 8
+            assert src_colorspace == RGB
+            if src_channel.type == FLOAT:
+                return 'util_format_linear_float_to_srgb_8unorm(%s)' % value
+            else:
+                assert src_channel.type == UNSIGNED
+                assert src_channel.norm
+                assert src_channel.size == 8
+                return 'util_format_linear_to_srgb_8unorm(%s)' % value
+        elif src_colorspace == ZS:
+            pass
+        elif dst_colorspace == ZS:
+            pass
+        else:
+            assert 0
+
     if src_channel == dst_channel:
         return value
 
-    if src_channel.type == FLOAT and dst_channel.type == FLOAT:
-        return '(%s)%s' % (dst_native_type, value)
-    
-    if not src_channel.norm and not dst_channel.norm:
-        return '(%s)%s' % (dst_native_type, value)
+    src_type = src_channel.type
+    src_size = src_channel.size
+    src_norm = src_channel.norm
 
-    if clamp:
-        value = clamp_expr(src_channel, dst_channel, dst_native_type, value)
+    # Promote half to float
+    if src_type == FLOAT and src_size == 16:
+        value = 'util_half_to_float(%s)' % value
+        src_size = 32
 
-    if dst_channel.type == FLOAT:
-        if src_channel.norm:
-            one = get_one(src_channel)
-            if src_channel.size <= 23:
-                scale = '(1.0f/0x%x)' % one
-            else:
-                # bigger than single precision mantissa, use double
-                scale = '(1.0/0x%x)' % one
-            value = '(%s * %s)' % (value, scale)
-        return '(%s)%s' % (dst_native_type, value)
+    # Special case for float <-> ubytes for more accurate results
+    # Done before clamping since these functions already take care of that
+    if src_type == UNSIGNED and src_norm and src_size == 8 and dst_channel.type == FLOAT and dst_channel.size == 32:
+        return 'ubyte_to_float(%s)' % value
+    if src_type == FLOAT and src_size == 32 and dst_channel.type == UNSIGNED and dst_channel.norm and dst_channel.size == 8:
+        return 'float_to_ubyte(%s)' % value
 
-    if src_channel.type == FLOAT:
-        if dst_channel.norm:
-            dst_one = get_one(dst_channel)
-            if dst_channel.size <= 23:
-                scale = '0x%x' % dst_one
-            else:
-                # bigger than single precision mantissa, use double
-                scale = '(double)0x%x' % dst_one
-            value = '(%s * %s)' % (value, scale)
-        return '(%s)%s' % (dst_native_type, value)
+    if clamp:
+        if dst_channel.type != FLOAT or src_type != FLOAT:
+            value = clamp_expr(src_channel, dst_channel, dst_native_type, value)
 
-    if not src_channel.norm and not dst_channel.norm:
-        # neither is normalized -- just cast
-        return '(%s)%s' % (dst_native_type, value)
+    if src_type in (SIGNED, UNSIGNED) and dst_channel.type in (SIGNED, UNSIGNED):
+        if not src_norm and not dst_channel.norm:
+            # neither is normalized -- just cast
+            return '(%s)%s' % (dst_native_type, value)
 
-    if src_channel.type in (SIGNED, UNSIGNED) and dst_channel.type in (SIGNED, UNSIGNED):
         src_one = get_one(src_channel)
         dst_one = get_one(dst_channel)
 
-        if src_one > dst_one and src_channel.norm:
+        if src_one > dst_one and src_norm and dst_channel.norm:
             # We can just bitshift
             src_shift = get_one_shift(src_channel)
             dst_shift = get_one_shift(dst_channel)
             value = '(%s >> %s)' % (value, src_shift - dst_shift)
         else:
             # We need to rescale using an intermediate type big enough to hold the multiplication of both
-            tmp_native_type = intermediate_native_type(src_channel.size + dst_channel.size, src_channel.sign and dst_channel.sign)
-            value = '(%s)%s' % (tmp_native_type, value)
+            tmp_native_type = intermediate_native_type(src_size + dst_channel.size, src_channel.sign and dst_channel.sign)
+            value = '((%s)%s)' % (tmp_native_type, value)
             value = '(%s * 0x%x / 0x%x)' % (value, dst_one, src_one)
         value = '(%s)%s' % (dst_native_type, value)
         return value
 
-    assert False
+    # Promote to either float or double
+    if src_type != FLOAT:
+        if src_norm or src_type == FIXED:
+            one = get_one(src_channel)
+            if src_size <= 23:
+                value = '(%s * (1.0f/0x%x))' % (value, one)
+                if dst_channel.size <= 32:
+                    value = '(float)%s' % value
+                src_size = 32
+            else:
+                # bigger than single precision mantissa, use double
+                value = '(%s * (1.0/0x%x))' % (value, one)
+                src_size = 64
+            src_norm = False
+        else:
+            if src_size <= 23 or dst_channel.size <= 32:
+                value = '(float)%s' % value
+                src_size = 32
+            else:
+                # bigger than single precision mantissa, use double
+                value = '(double)%s' % value
+                src_size = 64
+        src_type = FLOAT
 
+    # Convert double or float to non-float
+    if dst_channel.type != FLOAT:
+        if dst_channel.norm or dst_channel.type == FIXED:
+            dst_one = get_one(dst_channel)
+            if dst_channel.size <= 23:
+                value = '(%s * 0x%x)' % (value, dst_one)
+            else:
+                # bigger than single precision mantissa, use double
+                value = '(%s * (double)0x%x)' % (value, dst_one)
+        value = '(%s)%s' % (dst_native_type, value)
+    else:
+        # Cast double to float when converting to either half or float
+        if dst_channel.size <= 32 and src_size > 32:
+            value = '(float)%s' % value
+            src_size = 32
 
-def generate_format_unpack(format, dst_channel, dst_native_type, dst_suffix):
-    '''Generate the function to unpack pixels from a particular format'''
+        if dst_channel.size == 16:
+            value = 'util_float_to_half(%s)' % value
+        elif dst_channel.size == 64 and src_size < 64:
+            value = '(double)%s' % value
 
-    name = format.short_name()
+    return value
 
-    src_native_type = native_type(format)
 
-    print 'static INLINE void'
-    print 'util_format_%s_unpack_%s(%s *dst, const void *src)' % (name, dst_suffix, dst_native_type)
-    print '{'
-    print '   union util_format_%s pixel;' % format.short_name()
-    print '   memcpy(&pixel, src, sizeof pixel);'
-    bswap_format(format)
+def generate_unpack_kernel(format, dst_channel, dst_native_type):
 
+    if not is_format_supported(format):
+        return
+    
     assert format.layout == PLAIN
 
-    for i in range(4):
-        swizzle = format.swizzles[i]
-        if swizzle < 4:
-            src_channel = format.channels[swizzle]
-            value = 'pixel.chan.%s' % src_channel.name 
-            value = conversion_expr(src_channel, dst_channel, dst_native_type, value)
-        elif swizzle == SWIZZLE_0:
-            value = '0'
-        elif swizzle == SWIZZLE_1:
-            value = get_one(dst_channel)
-        elif swizzle == SWIZZLE_NONE:
-            value = '0'
-        else:
-            assert False
-        if format.colorspace == ZS:
-            if i == 3:
-                value = get_one(dst_channel)
-            elif i >= 1:
-                value = 'dst[0]'
-        print '   dst[%u] = %s; /* %s */' % (i, value, 'rgba'[i])
+    src_native_type = native_type(format)
 
-    print '}'
-    print
+    if format.is_bitmask():
+        depth = format.block_size()
+        print '         uint%u_t value = *(const uint%u_t *)src;' % (depth, depth) 
+
+        # Declare the intermediate variables
+        for i in range(format.nr_channels()):
+            src_channel = format.channels[i]
+            if src_channel.type == UNSIGNED:
+                print '         uint%u_t %s;' % (depth, src_channel.name)
+            elif src_channel.type == SIGNED:
+                print '         int%u_t %s;' % (depth, src_channel.name)
+
+        if depth > 8:
+            print '#ifdef PIPE_ARCH_BIG_ENDIAN'
+            print '         value = util_bswap%u(value);' % depth
+            print '#endif'
+
+        # Compute the intermediate unshifted values 
+        shift = 0
+        for i in range(format.nr_channels()):
+            src_channel = format.channels[i]
+            value = 'value'
+            if src_channel.type == UNSIGNED:
+                if shift:
+                    value = '%s >> %u' % (value, shift)
+                if shift + src_channel.size < depth:
+                    value = '(%s) & 0x%x' % (value, (1 << src_channel.size) - 1)
+            elif src_channel.type == SIGNED:
+                if shift + src_channel.size < depth:
+                    # Align the sign bit
+                    lshift = depth - (shift + src_channel.size)
+                    value = '%s << %u' % (value, lshift)
+                # Cast to signed
+                value = '(int%u_t)(%s) ' % (depth, value)
+                if src_channel.size < depth:
+                    # Align the LSB bit
+                    rshift = depth - src_channel.size
+                    value = '(%s) >> %u' % (value, rshift)
+            else:
+                value = None
+                
+            if value is not None:
+                print '         %s = %s;' % (src_channel.name, value)
+                
+            shift += src_channel.size
+
+        # Convert, swizzle, and store final values
+        for i in range(4):
+            swizzle = format.swizzles[i]
+            if swizzle < 4:
+                src_channel = format.channels[swizzle]
+                src_colorspace = format.colorspace
+                if src_colorspace == SRGB and i == 3:
+                    # Alpha channel is linear
+                    src_colorspace = RGB
+                value = src_channel.name 
+                value = conversion_expr(src_channel, 
+                                        dst_channel, dst_native_type, 
+                                        value,
+                                        src_colorspace = src_colorspace)
+            elif swizzle == SWIZZLE_0:
+                value = '0'
+            elif swizzle == SWIZZLE_1:
+                value = get_one(dst_channel)
+            elif swizzle == SWIZZLE_NONE:
+                value = '0'
+            else:
+                assert False
+            print '         dst[%u] = %s; /* %s */' % (i, value, 'rgba'[i])
+        
+    else:
+        print '         union util_format_%s pixel;' % format.short_name()
+        print '         memcpy(&pixel, src, sizeof pixel);'
+        bswap_format(format)
+    
+        for i in range(4):
+            swizzle = format.swizzles[i]
+            if swizzle < 4:
+                src_channel = format.channels[swizzle]
+                src_colorspace = format.colorspace
+                if src_colorspace == SRGB and i == 3:
+                    # Alpha channel is linear
+                    src_colorspace = RGB
+                value = 'pixel.chan.%s' % src_channel.name 
+                value = conversion_expr(src_channel, 
+                                        dst_channel, dst_native_type, 
+                                        value,
+                                        src_colorspace = src_colorspace)
+            elif swizzle == SWIZZLE_0:
+                value = '0'
+            elif swizzle == SWIZZLE_1:
+                value = get_one(dst_channel)
+            elif swizzle == SWIZZLE_NONE:
+                value = '0'
+            else:
+                assert False
+            print '         dst[%u] = %s; /* %s */' % (i, value, 'rgba'[i])
     
 
-def generate_format_pack(format, src_channel, src_native_type, src_suffix):
-    '''Generate the function to pack pixels to a particular format'''
-
-    name = format.short_name()
+def generate_pack_kernel(format, src_channel, src_native_type):
 
+    if not is_format_supported(format):
+        return
+    
     dst_native_type = native_type(format)
 
-    print 'static INLINE void'
-    print 'util_format_%s_pack_%s(void *dst, %s r, %s g, %s b, %s a)' % (name, src_suffix, src_native_type, src_native_type, src_native_type, src_native_type)
-    print '{'
-    print '   union util_format_%s pixel;' % format.short_name()
-
     assert format.layout == PLAIN
 
     inv_swizzle = format.inv_swizzles()
 
-    for i in range(4):
-        dst_channel = format.channels[i]
-        width = dst_channel.size
-        if inv_swizzle[i] is None:
-            continue
-        value = 'rgba'[inv_swizzle[i]]
-        value = conversion_expr(src_channel, dst_channel, dst_native_type, value)
-        if format.colorspace == ZS:
-            if i == 3:
-                value = get_one(dst_channel)
-            elif i >= 1:
-                value = '0'
-        print '   pixel.chan.%s = %s;' % (dst_channel.name, value)
+    if format.is_bitmask():
+        depth = format.block_size()
+        print '         uint%u_t value = 0;' % depth 
+
+        shift = 0
+        for i in range(4):
+            dst_channel = format.channels[i]
+            if inv_swizzle[i] is not None:
+                value ='src[%u]' % inv_swizzle[i]
+                dst_colorspace = format.colorspace
+                if dst_colorspace == SRGB and inv_swizzle[i] == 3:
+                    # Alpha channel is linear
+                    dst_colorspace = RGB
+                value = conversion_expr(src_channel, 
+                                        dst_channel, dst_native_type, 
+                                        value,
+                                        dst_colorspace = dst_colorspace)
+                if dst_channel.type in (UNSIGNED, SIGNED):
+                    if shift + dst_channel.size < depth:
+                        value = '(%s) & 0x%x' % (value, (1 << dst_channel.size) - 1)
+                    if shift:
+                        value = '(%s) << %u' % (value, shift)
+                    if dst_channel.type == SIGNED:
+                        # Cast to unsigned
+                        value = '(uint%u_t)(%s) ' % (depth, value)
+                else:
+                    value = None
+                if value is not None:
+                    print '         value |= %s;' % (value)
+                
+            shift += dst_channel.size
+
+        if depth > 8:
+            print '#ifdef PIPE_ARCH_BIG_ENDIAN'
+            print '         value = util_bswap%u(value);' % depth
+            print '#endif'
+        
+        print '         *(uint%u_t *)dst = value;' % depth 
+
+    else:
+        print '         union util_format_%s pixel;' % format.short_name()
+    
+        for i in range(4):
+            dst_channel = format.channels[i]
+            width = dst_channel.size
+            if inv_swizzle[i] is None:
+                continue
+            dst_colorspace = format.colorspace
+            if dst_colorspace == SRGB and inv_swizzle[i] == 3:
+                # Alpha channel is linear
+                dst_colorspace = RGB
+            value ='src[%u]' % inv_swizzle[i]
+            value = conversion_expr(src_channel, 
+                                    dst_channel, dst_native_type, 
+                                    value, 
+                                    dst_colorspace = dst_colorspace)
+            print '         pixel.chan.%s = %s;' % (dst_channel.name, value)
+    
+        bswap_format(format)
+        print '         memcpy(dst, &pixel, sizeof pixel);'
+    
+
+def generate_format_unpack(format, dst_channel, dst_native_type, dst_suffix):
+    '''Generate the function to unpack pixels from a particular format'''
+
+    name = format.short_name()
+
+    print 'static INLINE void'
+    print 'util_format_%s_unpack_%s(%s *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)' % (name, dst_suffix, dst_native_type)
+    print '{'
+
+    if is_format_supported(format):
+        print '   unsigned x, y;'
+        print '   for(y = 0; y < height; y += %u) {' % (format.block_height,)
+        print '      %s *dst = dst_row;' % (dst_native_type)
+        print '      const uint8_t *src = src_row;'
+        print '      for(x = 0; x < width; x += %u) {' % (format.block_width,)
+        
+        generate_unpack_kernel(format, dst_channel, dst_native_type)
+    
+        print '         src += %u;' % (format.block_size() / 8,)
+        print '         dst += 4;'
+        print '      }'
+        print '      src_row += src_stride;'
+        print '      dst_row += dst_stride/sizeof(*dst_row);'
+        print '   }'
 
-    bswap_format(format)
-    print '   memcpy(dst, &pixel, sizeof pixel);'
     print '}'
     print
     
 
-def generate_unpack(formats, dst_channel, dst_native_type, dst_suffix):
-    '''Generate the dispatch function to unpack pixels from any format'''
+def generate_format_pack(format, src_channel, src_native_type, src_suffix):
+    '''Generate the function to pack pixels to a particular format'''
 
-    for format in formats:
-        if is_format_supported(format):
-            generate_format_unpack(format, dst_channel, dst_native_type, dst_suffix)
+    name = format.short_name()
 
     print 'static INLINE void'
-    print 'util_format_unpack_%s(enum pipe_format format, %s *dst, const void *src)' % (dst_suffix, dst_native_type)
+    print 'util_format_%s_pack_%s(uint8_t *dst_row, unsigned dst_stride, const %s *src_row, unsigned src_stride, unsigned width, unsigned height)' % (name, src_suffix, src_native_type)
     print '{'
-    print '   void (*func)(%s *dst, const void *src);' % dst_native_type
-    print '   switch(format) {'
-    for format in formats:
-        if is_format_supported(format):
-            print '   case %s:' % format.name
-            print '      func = &util_format_%s_unpack_%s;' % (format.short_name(), dst_suffix)
-            print '      break;'
-    print '   default:'
-    print '      debug_printf("unsupported format\\n");'
-    print '      return;'
-    print '   }'
-    print '   func(dst, src);'
+    
+    if is_format_supported(format):
+        print '   unsigned x, y;'
+        print '   for(y = 0; y < height; y += %u) {' % (format.block_height,)
+        print '      const %s *src = src_row;' % (src_native_type)
+        print '      uint8_t *dst = dst_row;'
+        print '      for(x = 0; x < width; x += %u) {' % (format.block_width,)
+    
+        generate_pack_kernel(format, src_channel, src_native_type)
+            
+        print '         src += 4;'
+        print '         dst += %u;' % (format.block_size() / 8,)
+        print '      }'
+        print '      dst_row += dst_stride;'
+        print '      src_row += src_stride/sizeof(*src_row);'
+        print '   }'
+        
     print '}'
     print
+    
 
+def generate_format_fetch(format, dst_channel, dst_native_type, dst_suffix):
+    '''Generate the function to unpack pixels from a particular format'''
 
-def generate_pack(formats, src_channel, src_native_type, src_suffix):
-    '''Generate the dispatch function to pack pixels to any format'''
-
-    for format in formats:
-        if is_format_supported(format):
-            generate_format_pack(format, src_channel, src_native_type, src_suffix)
+    name = format.short_name()
 
     print 'static INLINE void'
-    print 'util_format_pack_%s(enum pipe_format format, void *dst, %s r, %s g, %s b, %s a)' % (src_suffix, src_native_type, src_native_type, src_native_type, src_native_type)
+    print 'util_format_%s_fetch_%s(%s *dst, const uint8_t *src, unsigned i, unsigned j)' % (name, dst_suffix, dst_native_type)
     print '{'
-    print '   void (*func)(void *dst, %s r, %s g, %s b, %s a);' % (src_native_type, src_native_type, src_native_type, src_native_type)
-    print '   switch(format) {'
-    for format in formats:
-        if is_format_supported(format):
-            print '   case %s:' % format.name
-            print '      func = &util_format_%s_pack_%s;' % (format.short_name(), src_suffix)
-            print '      break;'
-    print '   default:'
-    print '      debug_printf("%s: unsupported format\\n", __FUNCTION__);'
-    print '      return;'
-    print '   }'
-    print '   func(dst, r, g, b, a);'
+
+    if is_format_supported(format):
+        generate_unpack_kernel(format, dst_channel, dst_native_type)
+
     print '}'
     print
 
 
-def main():
-    formats = []
-    for arg in sys.argv[1:]:
-        formats.extend(parse(arg))
+def is_format_hand_written(format):
+    return format.layout in ('s3tc', 'subsampled', 'other') or format.colorspace == ZS
 
-    print '/* This file is autogenerated by u_format_pack.py from u_format.csv. Do not edit directly. */'
-    print
-    # This will print the copyright message on the top of this file
-    print __doc__.strip()
 
-    print
-    print '#ifndef U_FORMAT_PACK_H'
-    print '#define U_FORMAT_PACK_H'
+def generate(formats):
     print
     print '#include "pipe/p_compiler.h"'
     print '#include "u_math.h"'
+    print '#include "u_half.h"'
     print '#include "u_format.h"'
+    print '#include "u_format_other.h"'
+    print '#include "u_format_srgb.h"'
+    print '#include "u_format_yuv.h"'
+    print '#include "u_format_zs.h"'
     print
 
-    generate_clamp()
-
     for format in formats:
-        if format.layout == PLAIN:
-            generate_format_type(format)
-
-    channel = Channel(FLOAT, False, 32)
-    native_type = 'float'
-    suffix = '4f'
+        if not is_format_hand_written(format):
+            
+            if is_format_supported(format):
+                generate_format_type(format)
 
-    generate_unpack(formats, channel, native_type, suffix)
-    generate_pack(formats, channel, native_type, suffix)
+            channel = Channel(FLOAT, False, 32)
+            native_type = 'float'
+            suffix = 'rgba_float'
 
-    channel = Channel(UNSIGNED, True, 8)
-    native_type = 'uint8_t'
-    suffix = '4ub'
+            generate_format_unpack(format, channel, native_type, suffix)
+            generate_format_pack(format, channel, native_type, suffix)
+            generate_format_fetch(format, channel, native_type, suffix)
 
-    generate_unpack(formats, channel, native_type, suffix)
-    generate_pack(formats, channel, native_type, suffix)
-
-    print
-    print '#ifdef __cplusplus'
-    print '}'
-    print '#endif'
-    print
-    print '#endif /* ! U_FORMAT_PACK_H */'
+            channel = Channel(UNSIGNED, True, 8)
+            native_type = 'uint8_t'
+            suffix = 'rgba_8unorm'
 
+            generate_format_unpack(format, channel, native_type, suffix)
+            generate_format_pack(format, channel, native_type, suffix)
 
-if __name__ == '__main__':
-    main()
diff --git a/src/gallium/auxiliary/util/u_format_parse.py b/src/gallium/auxiliary/util/u_format_parse.py
index 250926418e..7076c676aa 100755
--- a/src/gallium/auxiliary/util/u_format_parse.py
+++ b/src/gallium/auxiliary/util/u_format_parse.py
@@ -73,18 +73,22 @@ class Channel:
         '''Maximum representable number.'''
         if self.type == FLOAT:
             return VERY_LARGE
+        if self.type == FIXED:
+            return (1 << (self.size/2)) - 1
         if self.norm:
             return 1
         if self.type == UNSIGNED:
             return (1 << self.size) - 1
         if self.type == SIGNED:
-            return self.size - 1
+            return (1 << (self.size - 1)) - 1
         assert False
     
     def min(self):
         '''Minimum representable number.'''
         if self.type == FLOAT:
             return -VERY_LARGE
+        if self.type == FIXED:
+            return -(1 << (self.size/2))
         if self.type == UNSIGNED:
             return 0
         if self.norm:
@@ -134,6 +138,8 @@ class Format:
         return nr_channels
 
     def is_array(self):
+        if self.layout != PLAIN:
+            return False
         ref_channel = self.channels[0]
         for channel in self.channels[1:]:
             if channel.size and (channel.size != ref_channel.size or channel.size % 8):
@@ -141,7 +147,11 @@ class Format:
         return True
 
     def is_mixed(self):
+        if self.layout != PLAIN:
+            return False
         ref_channel = self.channels[0]
+        if ref_channel.type == VOID:
+           ref_channel = self.channels[1]
         for channel in self.channels[1:]:
             if channel.type != VOID:
                 if channel.type != ref_channel.type:
@@ -154,29 +164,29 @@ class Format:
         return is_pot(self.block_size())
 
     def is_int(self):
+        if self.layout != PLAIN:
+            return False
         for channel in self.channels:
             if channel.type not in (VOID, UNSIGNED, SIGNED):
                 return False
         return True
 
     def is_float(self):
+        if self.layout != PLAIN:
+            return False
         for channel in self.channels:
             if channel.type not in (VOID, FLOAT):
                 return False
         return True
 
     def is_bitmask(self):
-        if self.block_size() > 32:
+        if self.layout != PLAIN:
             return False
-        if not self.is_pot():
+        if self.block_size() not in (8, 16, 32):
             return False
         for channel in self.channels:
-            if not is_pot(channel.size):
-                return True
             if channel.type not in (VOID, UNSIGNED, SIGNED):
                 return False
-            if channel.size >= 32:
-                return False
         return True
 
     def inv_swizzles(self):
diff --git a/src/gallium/auxiliary/util/u_format_s3tc.c b/src/gallium/auxiliary/util/u_format_s3tc.c
new file mode 100644
index 0000000000..79dee2b423
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_format_s3tc.c
@@ -0,0 +1,763 @@
+/**************************************************************************
+ *
+ * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
+ * Copyright (c) 2008 VMware, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "u_dl.h"
+#include "u_math.h"
+#include "u_format.h"
+#include "u_format_s3tc.h"
+
+
+#if defined(_WIN32) || defined(WIN32)
+#define DXTN_LIBNAME "dxtn.dll"
+#elif defined(__APPLE__)
+#define DXTN_LIBNAME "libtxc_dxtn.dylib"
+#else
+#define DXTN_LIBNAME "libtxc_dxtn.so"
+#endif
+
+
+static void
+util_format_dxt1_rgb_fetch_stub(int src_stride,
+                                const uint8_t *src,
+                                int col, int row,
+                                uint8_t *dst)
+{
+   assert(0);
+}
+
+
+static void
+util_format_dxt1_rgba_fetch_stub(int src_stride,
+                                 const uint8_t *src,
+                                 int col, int row,
+                                 uint8_t *dst )
+{
+   assert(0);
+}
+
+
+static void
+util_format_dxt3_rgba_fetch_stub(int src_stride,
+                                 const uint8_t *src,
+                                 int col, int row,
+                                 uint8_t *dst )
+{
+   assert(0);
+}
+
+
+static void
+util_format_dxt5_rgba_fetch_stub(int src_stride,
+                                 const uint8_t *src,
+                                 int col, int row,
+                                 uint8_t *dst )
+{
+   assert(0);
+}
+
+
+static void
+util_format_dxtn_pack_stub(int src_comps,
+                           int width, int height,
+                           const uint8_t *src,
+                           enum util_format_dxtn dst_format,
+                           uint8_t *dst,
+                           int dst_stride)
+{
+   assert(0);
+}
+
+
+boolean util_format_s3tc_enabled = FALSE;
+
+util_format_dxtn_fetch_t util_format_dxt1_rgb_fetch = util_format_dxt1_rgb_fetch_stub;
+util_format_dxtn_fetch_t util_format_dxt1_rgba_fetch = util_format_dxt1_rgba_fetch_stub;
+util_format_dxtn_fetch_t util_format_dxt3_rgba_fetch = util_format_dxt3_rgba_fetch_stub;
+util_format_dxtn_fetch_t util_format_dxt5_rgba_fetch = util_format_dxt5_rgba_fetch_stub;
+
+util_format_dxtn_pack_t util_format_dxtn_pack = util_format_dxtn_pack_stub;
+
+
+void
+util_format_s3tc_init(void)
+{
+   static boolean first_time = TRUE;
+   struct util_dl_library *library = NULL;
+   util_dl_proc fetch_2d_texel_rgb_dxt1;
+   util_dl_proc fetch_2d_texel_rgba_dxt1;
+   util_dl_proc fetch_2d_texel_rgba_dxt3;
+   util_dl_proc fetch_2d_texel_rgba_dxt5;
+   util_dl_proc tx_compress_dxtn;
+
+   if (!first_time)
+      return;
+   first_time = FALSE;
+
+   if (util_format_s3tc_enabled)
+      return;
+
+   library = util_dl_open(DXTN_LIBNAME);
+   if (!library) {
+      debug_printf("couldn't open " DXTN_LIBNAME ", software DXTn "
+         "compression/decompression unavailable");
+      return;
+   }
+
+   fetch_2d_texel_rgb_dxt1 =
+         util_dl_get_proc_address(library, "fetch_2d_texel_rgb_dxt1");
+   fetch_2d_texel_rgba_dxt1 =
+         util_dl_get_proc_address(library, "fetch_2d_texel_rgba_dxt1");
+   fetch_2d_texel_rgba_dxt3 =
+         util_dl_get_proc_address(library, "fetch_2d_texel_rgba_dxt3");
+   fetch_2d_texel_rgba_dxt5 =
+         util_dl_get_proc_address(library, "fetch_2d_texel_rgba_dxt5");
+   tx_compress_dxtn =
+         util_dl_get_proc_address(library, "tx_compress_dxtn");
+
+   if (!util_format_dxt1_rgb_fetch ||
+       !util_format_dxt1_rgba_fetch ||
+       !util_format_dxt3_rgba_fetch ||
+       !util_format_dxt5_rgba_fetch ||
+       !util_format_dxtn_pack) {
+      debug_printf("couldn't reference all symbols in " DXTN_LIBNAME
+                   ", software DXTn compression/decompression "
+                   "unavailable");
+      util_dl_close(library);
+      return;
+   }
+
+   util_format_dxt1_rgb_fetch = (util_format_dxtn_fetch_t)fetch_2d_texel_rgb_dxt1;
+   util_format_dxt1_rgba_fetch = (util_format_dxtn_fetch_t)fetch_2d_texel_rgba_dxt1;
+   util_format_dxt3_rgba_fetch = (util_format_dxtn_fetch_t)fetch_2d_texel_rgba_dxt3;
+   util_format_dxt5_rgba_fetch = (util_format_dxtn_fetch_t)fetch_2d_texel_rgba_dxt5;
+   util_format_dxtn_pack = (util_format_dxtn_pack_t)tx_compress_dxtn;
+   util_format_s3tc_enabled = TRUE;
+}
+
+
+/*
+ * Pixel fetch.
+ */
+
+void
+util_format_dxt1_rgb_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
+{
+   util_format_dxt1_rgb_fetch(0, src, i, j, dst);
+}
+
+void
+util_format_dxt1_rgba_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
+{
+   util_format_dxt1_rgba_fetch(0, src, i, j, dst);
+}
+
+void
+util_format_dxt3_rgba_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
+{
+   util_format_dxt3_rgba_fetch(0, src, i, j, dst);
+}
+
+void
+util_format_dxt5_rgba_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
+{
+   util_format_dxt5_rgba_fetch(0, src, i, j, dst);
+}
+
+void
+util_format_dxt1_rgb_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j)
+{
+   uint8_t tmp[4];
+   util_format_dxt1_rgb_fetch(0, src, i, j, tmp);
+   dst[0] = ubyte_to_float(tmp[0]);
+   dst[1] = ubyte_to_float(tmp[1]);
+   dst[2] = ubyte_to_float(tmp[2]);
+   dst[3] = 1.0;
+}
+
+void
+util_format_dxt1_rgba_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j)
+{
+   uint8_t tmp[4];
+   util_format_dxt1_rgba_fetch(0, src, i, j, tmp);
+   dst[0] = ubyte_to_float(tmp[0]);
+   dst[1] = ubyte_to_float(tmp[1]);
+   dst[2] = ubyte_to_float(tmp[2]);
+   dst[3] = ubyte_to_float(tmp[3]);
+}
+
+void
+util_format_dxt3_rgba_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j)
+{
+   uint8_t tmp[4];
+   util_format_dxt3_rgba_fetch(0, src, i, j, tmp);
+   dst[0] = ubyte_to_float(tmp[0]);
+   dst[1] = ubyte_to_float(tmp[1]);
+   dst[2] = ubyte_to_float(tmp[2]);
+   dst[3] = ubyte_to_float(tmp[3]);
+}
+
+void
+util_format_dxt5_rgba_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j)
+{
+   uint8_t tmp[4];
+   util_format_dxt5_rgba_fetch(0, src, i, j, tmp);
+   dst[0] = ubyte_to_float(tmp[0]);
+   dst[1] = ubyte_to_float(tmp[1]);
+   dst[2] = ubyte_to_float(tmp[2]);
+   dst[3] = ubyte_to_float(tmp[3]);
+}
+
+
+/*
+ * Block decompression.
+ */
+
+void
+util_format_dxt1_rgb_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   unsigned x, y, i, j;
+   for(y = 0; y < height; y += 4) {
+      const uint8_t *src = src_row;
+      for(x = 0; x < width; x += 4) {
+         for(j = 0; j < 4; ++j) {
+            for(i = 0; i < 4; ++i) {
+               uint8_t *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4;
+               util_format_dxt1_rgb_fetch(0, src, i, j, dst);
+            }
+         }
+         src += 8;
+      }
+      src_row += src_stride;
+   }
+}
+
+void
+util_format_dxt1_rgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   unsigned x, y, i, j;
+   for(y = 0; y < height; y += 4) {
+      const uint8_t *src = src_row;
+      for(x = 0; x < width; x += 4) {
+         for(j = 0; j < 4; ++j) {
+            for(i = 0; i < 4; ++i) {
+               uint8_t *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4;
+               util_format_dxt1_rgba_fetch(0, src, i, j, dst);
+            }
+         }
+         src += 8;
+      }
+      src_row += src_stride;
+   }
+}
+
+void
+util_format_dxt3_rgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   unsigned x, y, i, j;
+   for(y = 0; y < height; y += 4) {
+      const uint8_t *src = src_row;
+      for(x = 0; x < width; x += 4) {
+         for(j = 0; j < 4; ++j) {
+            for(i = 0; i < 4; ++i) {
+               uint8_t *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4;
+               util_format_dxt3_rgba_fetch(0, src, i, j, dst);
+            }
+         }
+         src += 16;
+      }
+      src_row += src_stride;
+   }
+}
+
+void
+util_format_dxt5_rgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   unsigned x, y, i, j;
+   for(y = 0; y < height; y += 4) {
+      const uint8_t *src = src_row;
+      for(x = 0; x < width; x += 4) {
+         for(j = 0; j < 4; ++j) {
+            for(i = 0; i < 4; ++i) {
+               uint8_t *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4;
+               util_format_dxt5_rgba_fetch(0, src, i, j, dst);
+            }
+         }
+         src += 16;
+      }
+      src_row += src_stride;
+   }
+}
+
+void
+util_format_dxt1_rgb_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   unsigned x, y, i, j;
+   for(y = 0; y < height; y += 4) {
+      const uint8_t *src = src_row;
+      for(x = 0; x < width; x += 4) {
+         for(j = 0; j < 4; ++j) {
+            for(i = 0; i < 4; ++i) {
+               float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4;
+               uint8_t tmp[4];
+               util_format_dxt1_rgb_fetch(0, src, i, j, tmp);
+               dst[0] = ubyte_to_float(tmp[0]);
+               dst[1] = ubyte_to_float(tmp[1]);
+               dst[2] = ubyte_to_float(tmp[2]);
+               dst[3] = 1.0;
+            }
+         }
+         src += 8;
+      }
+      src_row += src_stride;
+   }
+}
+
+void
+util_format_dxt1_rgba_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   unsigned x, y, i, j;
+   for(y = 0; y < height; y += 4) {
+      const uint8_t *src = src_row;
+      for(x = 0; x < width; x += 4) {
+         for(j = 0; j < 4; ++j) {
+            for(i = 0; i < 4; ++i) {
+               float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4;
+               uint8_t tmp[4];
+               util_format_dxt1_rgba_fetch(0, src, i, j, tmp);
+               dst[0] = ubyte_to_float(tmp[0]);
+               dst[1] = ubyte_to_float(tmp[1]);
+               dst[2] = ubyte_to_float(tmp[2]);
+               dst[3] = ubyte_to_float(tmp[3]);
+            }
+         }
+         src += 8;
+      }
+      src_row += src_stride;
+   }
+}
+
+void
+util_format_dxt3_rgba_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   unsigned x, y, i, j;
+   for(y = 0; y < height; y += 4) {
+      const uint8_t *src = src_row;
+      for(x = 0; x < width; x += 4) {
+         for(j = 0; j < 4; ++j) {
+            for(i = 0; i < 4; ++i) {
+               float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4;
+               uint8_t tmp[4];
+               util_format_dxt3_rgba_fetch(0, src, i, j, tmp);
+               dst[0] = ubyte_to_float(tmp[0]);
+               dst[1] = ubyte_to_float(tmp[1]);
+               dst[2] = ubyte_to_float(tmp[2]);
+               dst[3] = ubyte_to_float(tmp[3]);
+            }
+         }
+         src += 16;
+      }
+      src_row += src_stride;
+   }
+}
+
+void
+util_format_dxt5_rgba_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   unsigned x, y, i, j;
+   for(y = 0; y < height; y += 4) {
+      const uint8_t *src = src_row;
+      for(x = 0; x < width; x += 4) {
+         for(j = 0; j < 4; ++j) {
+            for(i = 0; i < 4; ++i) {
+               float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4;
+               uint8_t tmp[4];
+               util_format_dxt5_rgba_fetch(0, src, i, j, tmp);
+               dst[0] = ubyte_to_float(tmp[0]);
+               dst[1] = ubyte_to_float(tmp[1]);
+               dst[2] = ubyte_to_float(tmp[2]);
+               dst[3] = ubyte_to_float(tmp[3]);
+            }
+         }
+         src += 16;
+      }
+      src_row += src_stride;
+   }
+}
+
+
+/*
+ * Block compression.
+ */
+
+void
+util_format_dxt1_rgb_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   unsigned x, y, i, j, k;
+   for(y = 0; y < height; y += 4) {
+      const uint8_t *src = src_row;
+      uint8_t *dst = dst_row;
+      for(x = 0; x < width; x += 4) {
+         uint8_t tmp[4][4][3];
+         for(j = 0; j < 4; ++j) {
+            for(i = 0; i < 4; ++i) {
+               for(k = 0; k < 3; ++k) {
+                  tmp[j][i][k] = src[(y + j)*src_stride/sizeof(*src) + i*4 + k];
+               }
+            }
+         }
+         util_format_dxtn_pack(3, 4, 4, &tmp[0][0][0], UTIL_FORMAT_DXT1_RGB, dst, dst_stride);
+         src += 4*4;
+         dst += 8;
+      }
+      src_row += src_stride;
+      dst_row += 4*dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_dxt1_rgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   unsigned x, y, i, j, k;
+   for(y = 0; y < height; y += 4) {
+      const uint8_t *src = src_row;
+      uint8_t *dst = dst_row;
+      for(x = 0; x < width; x += 4) {
+         uint8_t tmp[4][4][4];
+         for(j = 0; j < 4; ++j) {
+            for(i = 0; i < 4; ++i) {
+               for(k = 0; k < 4; ++k) {
+                  tmp[j][i][k] = src[(y + j)*src_stride/sizeof(*src) + i*4 + k];
+               }
+            }
+         }
+         util_format_dxtn_pack(4, 4, 4, &tmp[0][0][0], UTIL_FORMAT_DXT1_RGBA, dst, dst_stride);
+         src += 4*4;
+         dst += 8;
+      }
+      src_row += src_stride;
+      dst_row += 4*dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_dxt3_rgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   unsigned x, y, i, j, k;
+   for(y = 0; y < height; y += 4) {
+      const uint8_t *src = src_row;
+      uint8_t *dst = dst_row;
+      for(x = 0; x < width; x += 4) {
+         uint8_t tmp[4][4][4];
+         for(j = 0; j < 4; ++j) {
+            for(i = 0; i < 4; ++i) {
+               for(k = 0; k < 4; ++k) {
+                  tmp[j][i][k] = src[(y + j)*src_stride/sizeof(*src) + i*4 + k];
+               }
+            }
+         }
+         util_format_dxtn_pack(4, 4, 4, &tmp[0][0][0], UTIL_FORMAT_DXT3_RGBA, dst, dst_stride);
+         src += 4*4;
+         dst += 16;
+      }
+      src_row += src_stride;
+      dst_row += 4*dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_dxt5_rgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   unsigned x, y, i, j, k;
+   for(y = 0; y < height; y += 4) {
+      const uint8_t *src = src_row;
+      uint8_t *dst = dst_row;
+      for(x = 0; x < width; x += 4) {
+         uint8_t tmp[4][4][4];
+         for(j = 0; j < 4; ++j) {
+            for(i = 0; i < 4; ++i) {
+               for(k = 0; k < 4; ++k) {
+                  tmp[j][i][k] = src[(y + j)*src_stride/sizeof(*src) + i*4 + k];
+               }
+            }
+         }
+         util_format_dxtn_pack(4, 4, 4, &tmp[0][0][0], UTIL_FORMAT_DXT5_RGBA, dst, dst_stride);
+         src += 4*4;
+         dst += 16;
+      }
+      src_row += src_stride;
+      dst_row += 4*dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_dxt1_rgb_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   unsigned x, y, i, j, k;
+   for(y = 0; y < height; y += 4) {
+      const float *src = src_row;
+      uint8_t *dst = dst_row;
+      for(x = 0; x < width; x += 4) {
+         uint8_t tmp[4][4][3];
+         for(j = 0; j < 4; ++j) {
+            for(i = 0; i < 4; ++i) {
+               for(k = 0; k < 3; ++k) {
+                  tmp[j][i][k] = float_to_ubyte(src[(y + j)*src_stride/sizeof(*src) + i*4 + k]);
+               }
+            }
+         }
+         util_format_dxtn_pack(3, 4, 4, &tmp[0][0][0], UTIL_FORMAT_DXT1_RGB, dst, dst_stride);
+         src += 4*4;
+         dst += 8;
+      }
+      src_row += src_stride;
+      dst_row += 4*dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_dxt1_rgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   unsigned x, y, i, j, k;
+   for(y = 0; y < height; y += 4) {
+      const float *src = src_row;
+      uint8_t *dst = dst_row;
+      for(x = 0; x < width; x += 4) {
+         uint8_t tmp[4][4][4];
+         for(j = 0; j < 4; ++j) {
+            for(i = 0; i < 4; ++i) {
+               for(k = 0; k < 4; ++k) {
+                  tmp[j][i][k] = float_to_ubyte(src[(y + j)*src_stride/sizeof(*src) + i*4 + k]);
+               }
+            }
+         }
+         util_format_dxtn_pack(4, 4, 4, &tmp[0][0][0], UTIL_FORMAT_DXT1_RGBA, dst, dst_stride);
+         src += 4*4;
+         dst += 8;
+      }
+      src_row += src_stride;
+      dst_row += 4*dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_dxt3_rgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   unsigned x, y, i, j, k;
+   for(y = 0; y < height; y += 4) {
+      const float *src = src_row;
+      uint8_t *dst = dst_row;
+      for(x = 0; x < width; x += 4) {
+         uint8_t tmp[4][4][4];
+         for(j = 0; j < 4; ++j) {
+            for(i = 0; i < 4; ++i) {
+               for(k = 0; k < 4; ++k) {
+                  tmp[j][i][k] = float_to_ubyte(src[(y + j)*src_stride/sizeof(*src) + i*4 + k]);
+               }
+            }
+         }
+         util_format_dxtn_pack(4, 4, 4, &tmp[0][0][0], UTIL_FORMAT_DXT3_RGBA, dst, dst_stride);
+         src += 4*4;
+         dst += 16;
+      }
+      src_row += src_stride;
+      dst_row += 4*dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_dxt5_rgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   unsigned x, y, i, j, k;
+   for(y = 0; y < height; y += 4) {
+      const float *src = src_row;
+      uint8_t *dst = dst_row;
+      for(x = 0; x < width; x += 4) {
+         uint8_t tmp[4][4][4];
+         for(j = 0; j < 4; ++j) {
+            for(i = 0; i < 4; ++i) {
+               for(k = 0; k < 4; ++k) {
+                  tmp[j][i][k] = float_to_ubyte(src[(y + j)*src_stride/sizeof(*src) + i*4 + k]);
+               }
+            }
+         }
+         util_format_dxtn_pack(4, 4, 4, &tmp[0][0][0], UTIL_FORMAT_DXT5_RGBA, dst, dst_stride);
+         src += 4*4;
+         dst += 16;
+      }
+      src_row += src_stride;
+      dst_row += 4*dst_stride/sizeof(*dst_row);
+   }
+}
+
+
+/*
+ * SRGB variants.
+ *
+ * FIXME: shunts to RGB for now
+ */
+
+void
+util_format_dxt1_srgb_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   util_format_dxt1_rgb_unpack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, width, height);
+}
+
+void
+util_format_dxt1_srgb_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   util_format_dxt1_rgb_pack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, width, height);
+}
+
+void
+util_format_dxt1_srgb_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
+{
+   util_format_dxt1_rgb_fetch_rgba_8unorm(dst, src, i, j);
+}
+
+void
+util_format_dxt1_srgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   util_format_dxt1_rgba_unpack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, width, height);
+}
+
+void
+util_format_dxt1_srgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   util_format_dxt1_rgba_pack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, width, height);
+}
+
+void
+util_format_dxt1_srgba_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
+{
+   util_format_dxt1_rgba_fetch_rgba_8unorm(dst, src, i, j);
+}
+
+void
+util_format_dxt3_srgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   util_format_dxt3_rgba_unpack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, width, height);
+}
+
+void
+util_format_dxt3_srgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   util_format_dxt3_rgba_pack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, width, height);
+}
+
+void
+util_format_dxt3_srgba_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
+{
+   util_format_dxt3_rgba_fetch_rgba_8unorm(dst, src, i, j);
+}
+
+void
+util_format_dxt5_srgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   util_format_dxt5_rgba_unpack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, width, height);
+}
+
+void
+util_format_dxt5_srgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   util_format_dxt5_rgba_pack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, width, height);
+}
+
+void
+util_format_dxt5_srgba_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
+{
+   util_format_dxt5_rgba_fetch_rgba_8unorm(dst, src, i, j);
+}
+
+void
+util_format_dxt1_srgb_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   util_format_dxt1_rgb_unpack_rgba_float(dst_row, dst_stride, src_row, src_stride, width, height);
+}
+
+void
+util_format_dxt1_srgb_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   util_format_dxt1_rgb_pack_rgba_float(dst_row, dst_stride, src_row, src_stride, width, height);
+}
+
+void
+util_format_dxt1_srgb_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j)
+{
+   util_format_dxt1_rgb_fetch_rgba_float(dst, src, i, j);
+}
+
+void
+util_format_dxt1_srgba_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   util_format_dxt1_rgba_unpack_rgba_float(dst_row, dst_stride, src_row, src_stride, width, height);
+}
+
+void
+util_format_dxt1_srgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   util_format_dxt1_rgba_pack_rgba_float(dst_row, dst_stride, src_row, src_stride, width, height);
+}
+
+void
+util_format_dxt1_srgba_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j)
+{
+   util_format_dxt1_rgba_fetch_rgba_float(dst, src, i, j);
+}
+
+void
+util_format_dxt3_srgba_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   util_format_dxt3_rgba_unpack_rgba_float(dst_row, dst_stride, src_row, src_stride, width, height);
+}
+
+void
+util_format_dxt3_srgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   util_format_dxt3_rgba_pack_rgba_float(dst_row, dst_stride, src_row, src_stride, width, height);
+}
+
+void
+util_format_dxt3_srgba_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j)
+{
+   util_format_dxt3_rgba_fetch_rgba_float(dst, src, i, j);
+}
+
+void
+util_format_dxt5_srgba_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   util_format_dxt5_rgba_unpack_rgba_float(dst_row, dst_stride, src_row, src_stride, width, height);
+}
+
+void
+util_format_dxt5_srgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   util_format_dxt5_rgba_pack_rgba_float(dst_row, dst_stride, src_row, src_stride, width, height);
+}
+
+void
+util_format_dxt5_srgba_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j)
+{
+   util_format_dxt5_rgba_fetch_rgba_float(dst, src, i, j);
+}
+
diff --git a/src/gallium/auxiliary/util/u_format_s3tc.h b/src/gallium/auxiliary/util/u_format_s3tc.h
new file mode 100644
index 0000000000..97770abd42
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_format_s3tc.h
@@ -0,0 +1,218 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#ifndef U_FORMAT_S3TC_H_
+#define U_FORMAT_S3TC_H_
+
+
+#include "pipe/p_compiler.h"
+
+
+enum util_format_dxtn {
+  UTIL_FORMAT_DXT1_RGB = 0x83F0,
+  UTIL_FORMAT_DXT1_RGBA = 0x83F1,
+  UTIL_FORMAT_DXT3_RGBA = 0x83F2,
+  UTIL_FORMAT_DXT5_RGBA = 0x83F3
+};
+
+
+typedef void
+(*util_format_dxtn_fetch_t)( int src_stride,
+                             const uint8_t *src,
+                             int col, int row,
+                             uint8_t *dst );
+
+typedef void
+(*util_format_dxtn_pack_t)( int src_comps,
+                            int width, int height,
+                            const uint8_t *src,
+                            enum util_format_dxtn dst_format,
+                            uint8_t *dst,
+                            int dst_stride);
+
+extern boolean util_format_s3tc_enabled;
+
+extern util_format_dxtn_fetch_t util_format_dxt1_rgb_fetch;
+extern util_format_dxtn_fetch_t util_format_dxt1_rgba_fetch;
+extern util_format_dxtn_fetch_t util_format_dxt3_rgba_fetch;
+extern util_format_dxtn_fetch_t util_format_dxt5_rgba_fetch;
+
+extern util_format_dxtn_pack_t util_format_dxtn_pack;
+
+
+void
+util_format_s3tc_init(void);
+
+
+void
+util_format_dxt1_rgb_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt1_rgb_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt1_rgb_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j);
+
+void
+util_format_dxt1_rgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt1_rgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt1_rgba_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j);
+
+void
+util_format_dxt3_rgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt3_rgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt3_rgba_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j);
+
+void
+util_format_dxt5_rgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt5_rgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt5_rgba_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j);
+
+void
+util_format_dxt1_srgb_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt1_srgb_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt1_srgb_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j);
+
+void
+util_format_dxt1_srgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt1_srgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt1_srgba_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j);
+
+void
+util_format_dxt3_srgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt3_srgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt3_srgba_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j);
+
+void
+util_format_dxt5_srgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt5_srgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt5_srgba_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j);
+
+
+void
+util_format_dxt1_rgb_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt1_rgb_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt1_rgb_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j);
+
+void
+util_format_dxt1_rgba_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt1_rgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt1_rgba_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j);
+
+void
+util_format_dxt3_rgba_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt3_rgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt3_rgba_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j);
+
+void
+util_format_dxt5_rgba_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt5_rgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt5_rgba_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j);
+
+void
+util_format_dxt1_srgb_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt1_srgb_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt1_srgb_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j);
+
+void
+util_format_dxt1_srgba_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt1_srgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt1_srgba_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j);
+
+void
+util_format_dxt3_srgba_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt3_srgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt3_srgba_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j);
+
+void
+util_format_dxt5_srgba_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt5_srgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_dxt5_srgba_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j);
+
+
+#endif /* U_FORMAT_S3TC_H_ */
diff --git a/src/gallium/auxiliary/util/u_format_srgb.h b/src/gallium/auxiliary/util/u_format_srgb.h
new file mode 100644
index 0000000000..43213fbeb3
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_format_srgb.h
@@ -0,0 +1,106 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * SRGB translation.
+ *
+ * @author Brian Paul <brianp@vmware.com>
+ * @author Michal Krol <michal@vmware.com>
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#ifndef U_FORMAT_SRGB_H_
+#define U_FORMAT_SRGB_H_
+
+
+#include "pipe/p_compiler.h"
+#include "u_math.h"
+
+
+extern const float
+util_format_srgb_8unorm_to_linear_float_table[256];
+
+extern const uint8_t
+util_format_srgb_to_linear_8unorm_table[256];
+
+extern const uint8_t
+util_format_linear_to_srgb_8unorm_table[256];
+
+
+/**
+ * Convert a unclamped linear float to srgb value in the [0,255].
+ * XXX this hasn't been tested (render to srgb surface).
+ * XXX this needs optimization.
+ */
+static INLINE uint8_t
+util_format_linear_float_to_srgb_8unorm(float x)
+{
+   if (x >= 1.0f)
+      return 255;
+   else if (x >= 0.0031308f)
+      return float_to_ubyte(1.055f * powf(x, 0.41666f) - 0.055f);
+   else if (x > 0.0f)
+      return float_to_ubyte(12.92f * x);
+   else
+      return 0;
+}
+
+
+/**
+ * Convert an 8-bit sRGB value from non-linear space to a
+ * linear RGB value in [0, 1].
+ * Implemented with a 256-entry lookup table.
+ */
+static INLINE float
+util_format_srgb_8unorm_to_linear_float(uint8_t x)
+{
+   return util_format_srgb_8unorm_to_linear_float_table[x];
+}
+
+
+/**
+ * Convert a 8bit normalized value from linear to srgb.
+ */
+static INLINE uint8_t
+util_format_linear_to_srgb_8unorm(uint8_t x)
+{
+   return util_format_linear_to_srgb_8unorm_table[x];
+}
+
+
+/**
+ * Convert a 8bit normalized value from srgb to linear.
+ */
+static INLINE uint8_t
+util_format_srgb_to_linear_8unorm(uint8_t x)
+{
+   return util_format_srgb_to_linear_8unorm_table[x];
+}
+
+
+#endif /* U_FORMAT_SRGB_H_ */
diff --git a/src/gallium/auxiliary/util/u_format_srgb.py b/src/gallium/auxiliary/util/u_format_srgb.py
new file mode 100644
index 0000000000..a4c76dc00b
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_format_srgb.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+
+'''
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * SRGB translation.
+ *
+ * @author Brian Paul <brianp@vmware.com>
+ * @author Michal Krol <michal@vmware.com>
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+'''
+
+
+import sys
+import math
+
+
+def srgb_to_linear(x):
+    if x <= 0.04045:
+        return x / 12.92
+    else:
+        return math.pow((x + 0.055) / 1.055, 2.4)
+
+
+def linear_to_srgb(x):
+    if x >= 0.0031308:
+        return 1.055 * math.pow(x, 0.41666) - 0.055
+    else:
+        return 12.92 * x
+
+def generate_srgb_tables():
+    print 'const float'
+    print 'util_format_srgb_8unorm_to_linear_float_table[256] = {'
+    for j in range(0, 256, 4):
+        print '   ',
+        for i in range(j, j + 4):
+            print '%.7e,' % (srgb_to_linear(i / 255.0),),
+        print
+    print '};'
+    print
+    print 'const uint8_t'
+    print 'util_format_srgb_to_linear_8unorm_table[256] = {'
+    for j in range(0, 256, 16):
+        print '   ',
+        for i in range(j, j + 16):
+            print '%3u,' % (int(srgb_to_linear(i / 255.0) * 255.0 + 0.5),),
+        print
+    print '};'
+    print
+    print 'const uint8_t'
+    print 'util_format_linear_to_srgb_8unorm_table[256] = {'
+    for j in range(0, 256, 16):
+        print '   ',
+        for i in range(j, j + 16):
+            print '%3u,' % (int(linear_to_srgb(i / 255.0) * 255.0 + 0.5),),
+        print
+    print '};'
+    print
+
+
+def main():
+    print '/* This file is autogenerated by u_format_srgb.py. Do not edit directly. */'
+    print
+    # This will print the copyright message on the top of this file
+    print __doc__.strip()
+    print
+    print '#include "u_format_srgb.h"'
+    print
+    generate_srgb_tables()    
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/gallium/auxiliary/util/u_format_table.py b/src/gallium/auxiliary/util/u_format_table.py
index 4e29d15f3b..ae9a598197 100755
--- a/src/gallium/auxiliary/util/u_format_table.py
+++ b/src/gallium/auxiliary/util/u_format_table.py
@@ -3,7 +3,7 @@
 '''
 /**************************************************************************
  *
- * Copyright 2009 VMware, Inc.
+ * Copyright 2010 VMware, Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -33,6 +33,7 @@
 import sys
 
 from u_format_parse import *
+import u_format_pack
 
 
 def layout_map(layout):
@@ -85,30 +86,22 @@ def write_format_table(formats):
     print __doc__.strip()
     print
     print '#include "u_format.h"'
+    print '#include "u_format_s3tc.h"'
     print
-    print 'const struct util_format_description'
-    print 'util_format_none_description = {'
-    print "   PIPE_FORMAT_NONE,"
-    print "   \"PIPE_FORMAT_NONE\","
-    print "   {0, 0, 0},"
-    print "   0,"
-    print "   0,"
-    print "   0,"
-    print "   0,"
-    print "   {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}},"
-    print "   {0, 0, 0, 0},"
-    print "   0"
-    print "};"
-    print
+    
+    u_format_pack.generate(formats)
+    
     for format in formats:
         print 'const struct util_format_description'
         print 'util_format_%s_description = {' % (format.short_name(),)
         print "   %s," % (format.name,)
         print "   \"%s\"," % (format.name,)
+        print "   \"%s\"," % (format.short_name(),)
         print "   {%u, %u, %u},\t/* block */" % (format.block_width, format.block_height, format.block_size())
         print "   %s," % (layout_map(format.layout),)
         print "   %u,\t/* nr_channels */" % (format.nr_channels(),)
         print "   %s,\t/* is_array */" % (bool_map(format.is_array()),)
+        print "   %s,\t/* is_bitmask */" % (bool_map(format.is_bitmask()),)
         print "   %s,\t/* is_mixed */" % (bool_map(format.is_mixed()),)
         print "   {"
         for i in range(4):
@@ -136,8 +129,37 @@ def write_format_table(formats):
             print "      %s%s\t/* %s */" % (swizzle_map[swizzle], sep, comment)
         print "   },"
         print "   %s," % (colorspace_map(format.colorspace),)
+        if format.colorspace != ZS:
+            print "   &util_format_%s_unpack_rgba_8unorm," % format.short_name() 
+            print "   &util_format_%s_pack_rgba_8unorm," % format.short_name() 
+            print "   &util_format_%s_unpack_rgba_float," % format.short_name() 
+            print "   &util_format_%s_pack_rgba_float," % format.short_name() 
+            print "   &util_format_%s_fetch_rgba_float," % format.short_name()
+        else:
+            print "   NULL, /* unpack_rgba_8unorm */" 
+            print "   NULL, /* pack_rgba_8unorm */" 
+            print "   NULL, /* unpack_rgba_float */" 
+            print "   NULL, /* pack_rgba_float */" 
+            print "   NULL, /* fetch_rgba_float */" 
+        if format.colorspace == ZS and format.swizzles[0] != SWIZZLE_NONE:
+            print "   &util_format_%s_unpack_z_32unorm," % format.short_name() 
+            print "   &util_format_%s_pack_z_32unorm," % format.short_name() 
+            print "   &util_format_%s_unpack_z_float," % format.short_name() 
+            print "   &util_format_%s_pack_z_float," % format.short_name() 
+        else:
+            print "   NULL, /* unpack_z_32unorm */" 
+            print "   NULL, /* pack_z_32unorm */" 
+            print "   NULL, /* unpack_z_float */" 
+            print "   NULL, /* pack_z_float */" 
+        if format.colorspace == ZS and format.swizzles[1] != SWIZZLE_NONE:
+            print "   &util_format_%s_unpack_s_8uscaled," % format.short_name() 
+            print "   &util_format_%s_pack_s_8uscaled" % format.short_name() 
+        else:
+            print "   NULL, /* unpack_s_8uscaled */" 
+            print "   NULL /* pack_s_8uscaled */" 
         print "};"
         print
+        
     print "const struct util_format_description *"
     print "util_format_description(enum pipe_format format)"
     print "{"
@@ -146,13 +168,10 @@ def write_format_table(formats):
     print "   }"
     print
     print "   switch (format) {"
-    print "   case PIPE_FORMAT_NONE:"
-    print "      return &util_format_none_description;"
     for format in formats:
         print "   case %s:" % format.name
         print "      return &util_format_%s_description;" % (format.short_name(),)
     print "   default:"
-    print "      assert(0);"
     print "      return NULL;"
     print "   }"
     print "}"
diff --git a/src/gallium/auxiliary/util/u_format_tests.c b/src/gallium/auxiliary/util/u_format_tests.c
new file mode 100644
index 0000000000..b1df6c49c4
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_format_tests.c
@@ -0,0 +1,970 @@
+/**************************************************************************
+ *
+ * Copyright 2009-2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "u_memory.h"
+#include "u_format_tests.h"
+
+
+/*
+ * Helper macros to create the packed bytes for longer words.
+ */
+
+#define PACKED_1x8(x)          {x, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
+#define PACKED_2x8(x, y)       {x, y, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
+#define PACKED_3x8(x, y, z)    {x, y, z, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
+#define PACKED_4x8(x, y, z, w) {x, y, z, w, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
+#define PACKED_8x8(a, b, c, d, e, f, g, h) {a, b, c, d, e, f, g, h, 0, 0, 0, 0, 0, 0, 0, 0}
+
+#define PACKED_1x16(x)          {(x) & 0xff, (x) >> 8,          0,        0,          0,        0,          0,        0, 0, 0, 0, 0, 0, 0, 0, 0}
+#define PACKED_2x16(x, y)       {(x) & 0xff, (x) >> 8, (y) & 0xff, (y) >> 8,          0,        0,          0,        0, 0, 0, 0, 0, 0, 0, 0, 0}
+#define PACKED_3x16(x, y, z)    {(x) & 0xff, (x) >> 8, (y) & 0xff, (y) >> 8, (z) & 0xff, (z) >> 8,          0,        0, 0, 0, 0, 0, 0, 0, 0, 0}
+#define PACKED_4x16(x, y, z, w) {(x) & 0xff, (x) >> 8, (y) & 0xff, (y) >> 8, (z) & 0xff, (z) >> 8, (w) & 0xff, (w) >> 8, 0, 0, 0, 0, 0, 0, 0, 0}
+
+#define PACKED_1x32(x)          {(x) & 0xff, ((x) >> 8) & 0xff, ((x) >> 16) & 0xff, (x) >> 24,          0,                 0,                  0,         0,          0,                 0,                  0,         0,          0,                 0,                  0,         0}
+#define PACKED_2x32(x, y)       {(x) & 0xff, ((x) >> 8) & 0xff, ((x) >> 16) & 0xff, (x) >> 24, (y) & 0xff, ((y) >> 8) & 0xff, ((y) >> 16) & 0xff, (y) >> 24,          0,                 0,                  0,         0,          0,                 0,                  0,         0}
+#define PACKED_3x32(x, y, z)    {(x) & 0xff, ((x) >> 8) & 0xff, ((x) >> 16) & 0xff, (x) >> 24, (y) & 0xff, ((y) >> 8) & 0xff, ((y) >> 16) & 0xff, (y) >> 24, (z) & 0xff, ((z) >> 8) & 0xff, ((z) >> 16) & 0xff, (z) >> 24,          0,                 0,                  0,         0}
+#define PACKED_4x32(x, y, z, w) {(x) & 0xff, ((x) >> 8) & 0xff, ((x) >> 16) & 0xff, (x) >> 24, (y) & 0xff, ((y) >> 8) & 0xff, ((y) >> 16) & 0xff, (y) >> 24, (z) & 0xff, ((z) >> 8) & 0xff, ((z) >> 16) & 0xff, (z) >> 24, (w) & 0xff, ((w) >> 8) & 0xff, ((w) >> 16) & 0xff, (w) >> 24}
+
+#define UNPACKED_1x1(r, g, b, a) \
+      {{{r, g, b, a}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}}, \
+       {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}}, \
+       {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}}, \
+       {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}}}
+
+#define UNPACKED_2x1(r0, g0, b0, a0, r1, g1, b1, a1) \
+      {{{r0, g0, b0, a0}, {r1, g1, b1, a1}, {0, 0, 0, 0}, {0, 0, 0, 0}}, \
+       {{ 0,  0,  0,  0}, { 0,  0,  0,  0}, {0, 0, 0, 0}, {0, 0, 0, 0}}, \
+       {{ 0,  0,  0,  0}, { 0,  0,  0,  0}, {0, 0, 0, 0}, {0, 0, 0, 0}}, \
+       {{ 0,  0,  0,  0}, { 0,  0,  0,  0}, {0, 0, 0, 0}, {0, 0, 0, 0}}}
+
+
+/**
+ * Test cases.
+ *
+ * These were manually entered. We could generate these
+ *
+ * To keep this to a we cover only the corner cases, which should produce
+ * good enough coverage since that pixel format transformations are afine for
+ * non SRGB formats.
+ */
+const struct util_format_test_case
+util_format_test_cases[] =
+{
+
+   /*
+    * 32-bit rendertarget formats
+    */
+
+   {PIPE_FORMAT_B8G8R8A8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_B8G8R8A8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x000000ff), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)},
+   {PIPE_FORMAT_B8G8R8A8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x0000ff00), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)},
+   {PIPE_FORMAT_B8G8R8A8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00ff0000), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_B8G8R8A8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xff000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_B8G8R8A8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_B8G8R8X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_B8G8R8X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0x000000ff), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)},
+   {PIPE_FORMAT_B8G8R8X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0x0000ff00), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_B8G8R8X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0x00ff0000), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_B8G8R8X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0xff000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_B8G8R8X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_A8R8G8B8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_A8R8G8B8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x000000ff), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_A8R8G8B8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x0000ff00), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_A8R8G8B8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00ff0000), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)},
+   {PIPE_FORMAT_A8R8G8B8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xff000000), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)},
+   {PIPE_FORMAT_A8R8G8B8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_X8R8G8B8_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_X8R8G8B8_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0x000000ff), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_X8R8G8B8_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0x0000ff00), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_X8R8G8B8_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0x00ff0000), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_X8R8G8B8_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0xff000000), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)},
+   {PIPE_FORMAT_X8R8G8B8_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_A8B8G8R8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_A8B8G8R8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x000000ff), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_A8B8G8R8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x0000ff00), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)},
+   {PIPE_FORMAT_A8B8G8R8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00ff0000), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)},
+   {PIPE_FORMAT_A8B8G8R8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xff000000), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_A8B8G8R8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_X8B8G8R8_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_X8B8G8R8_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0x000000ff), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_X8B8G8R8_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0x0000ff00), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)},
+   {PIPE_FORMAT_X8B8G8R8_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0x00ff0000), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_X8B8G8R8_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0xff000000), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_X8B8G8R8_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_R8G8B8X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8G8B8X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0x000000ff), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8G8B8X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0x0000ff00), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8G8B8X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0x00ff0000), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)},
+   {PIPE_FORMAT_R8G8B8X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0xff000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8G8B8X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_R10G10B10A2_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_R10G10B10A2_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x000003ff), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_R10G10B10A2_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x000ffc00), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)},
+   {PIPE_FORMAT_R10G10B10A2_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x3ff00000), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)},
+   {PIPE_FORMAT_R10G10B10A2_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xc0000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R10G10B10A2_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_B10G10R10A2_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_B10G10R10A2_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x000003ff), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)},
+   {PIPE_FORMAT_B10G10R10A2_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x000ffc00), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)},
+   {PIPE_FORMAT_B10G10R10A2_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x3ff00000), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_B10G10R10A2_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xc0000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_B10G10R10A2_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   /*
+    * 16-bit rendertarget formats
+    */
+
+   {PIPE_FORMAT_B5G5R5X1_UNORM, PACKED_1x16(0x7fff), PACKED_1x16(0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_B5G5R5X1_UNORM, PACKED_1x16(0x7fff), PACKED_1x16(0x001f), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)},
+   {PIPE_FORMAT_B5G5R5X1_UNORM, PACKED_1x16(0x7fff), PACKED_1x16(0x03e0), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_B5G5R5X1_UNORM, PACKED_1x16(0x7fff), PACKED_1x16(0x7c00), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_B5G5R5X1_UNORM, PACKED_1x16(0x7fff), PACKED_1x16(0x7fff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_B5G5R5A1_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_B5G5R5A1_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x001f), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)},
+   {PIPE_FORMAT_B5G5R5A1_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x03e0), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)},
+   {PIPE_FORMAT_B5G5R5A1_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x7c00), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_B5G5R5A1_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x8000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_B5G5R5A1_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0xffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_B4G4R4X4_UNORM, PACKED_1x16(0x0fff), PACKED_1x16(0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_B4G4R4X4_UNORM, PACKED_1x16(0x0fff), PACKED_1x16(0x000f), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)},
+   {PIPE_FORMAT_B4G4R4X4_UNORM, PACKED_1x16(0x0fff), PACKED_1x16(0x00f0), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_B4G4R4X4_UNORM, PACKED_1x16(0x0fff), PACKED_1x16(0x0f00), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_B4G4R4X4_UNORM, PACKED_1x16(0x0fff), PACKED_1x16(0x0fff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_B4G4R4A4_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_B4G4R4A4_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x000f), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)},
+   {PIPE_FORMAT_B4G4R4A4_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x00f0), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)},
+   {PIPE_FORMAT_B4G4R4A4_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x0f00), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_B4G4R4A4_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0xf000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_B4G4R4A4_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0xffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_B5G6R5_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_B5G6R5_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x001f), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)},
+   {PIPE_FORMAT_B5G6R5_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x07e0), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_B5G6R5_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0xf800), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_B5G6R5_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0xffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   /*
+    * Luminance/intensity/alpha formats
+    */
+
+   {PIPE_FORMAT_L8_UNORM, PACKED_1x8(0xff), PACKED_1x8(0x00), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_L8_UNORM, PACKED_1x8(0xff), PACKED_1x8(0xff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_A8_UNORM, PACKED_1x8(0xff), PACKED_1x8(0x00), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_A8_UNORM, PACKED_1x8(0xff), PACKED_1x8(0xff), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+
+   {PIPE_FORMAT_I8_UNORM, PACKED_1x8(0xff), PACKED_1x8(0x00), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_I8_UNORM, PACKED_1x8(0xff), PACKED_1x8(0xff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_L4A4_UNORM, PACKED_1x8(0xff), PACKED_1x8(0x00), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_L4A4_UNORM, PACKED_1x8(0xff), PACKED_1x8(0x0f), UNPACKED_1x1(1.0, 1.0, 1.0, 0.0)},
+   {PIPE_FORMAT_L4A4_UNORM, PACKED_1x8(0xff), PACKED_1x8(0xf0), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_L4A4_UNORM, PACKED_1x8(0xff), PACKED_1x8(0xff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_L8A8_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_L8A8_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x00ff), UNPACKED_1x1(1.0, 1.0, 1.0, 0.0)},
+   {PIPE_FORMAT_L8A8_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0xff00), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_L8A8_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0xffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_L16_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_L16_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0xffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   /*
+    * SRGB formats
+    */
+
+   {PIPE_FORMAT_L8_SRGB, PACKED_1x8(0xff), PACKED_1x8(0x00), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_L8_SRGB, PACKED_1x8(0xff), PACKED_1x8(0xbc), UNPACKED_1x1(0.502886458033, 0.502886458033, 0.502886458033, 1.0)},
+   {PIPE_FORMAT_L8_SRGB, PACKED_1x8(0xff), PACKED_1x8(0xff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_L8A8_SRGB, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_L8A8_SRGB, PACKED_1x16(0xffff), PACKED_1x16(0x00bc), UNPACKED_1x1(0.502886458033, 0.502886458033, 0.502886458033, 0.0)},
+   {PIPE_FORMAT_L8A8_SRGB, PACKED_1x16(0xffff), PACKED_1x16(0x00ff), UNPACKED_1x1(1.0, 1.0, 1.0, 0.0)},
+   {PIPE_FORMAT_L8A8_SRGB, PACKED_1x16(0xffff), PACKED_1x16(0xcc00), UNPACKED_1x1(0.0, 0.0, 0.0, 0.8)},
+   {PIPE_FORMAT_L8A8_SRGB, PACKED_1x16(0xffff), PACKED_1x16(0xff00), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_L8A8_SRGB, PACKED_1x16(0xffff), PACKED_1x16(0xffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_R8G8B8_SRGB, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x00, 0x00), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8G8B8_SRGB, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0xbc, 0x00, 0x00), UNPACKED_1x1(0.502886458033, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8G8B8_SRGB, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0xff, 0x00, 0x00), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8G8B8_SRGB, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0xbc, 0x00), UNPACKED_1x1(0.0, 0.502886458033, 0.0, 1.0)},
+   {PIPE_FORMAT_R8G8B8_SRGB, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0xff, 0x00), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8G8B8_SRGB, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x00, 0xbc), UNPACKED_1x1(0.0, 0.0, 0.502886458033, 1.0)},
+   {PIPE_FORMAT_R8G8B8_SRGB, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x00, 0xff), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)},
+   {PIPE_FORMAT_R8G8B8_SRGB, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0xff, 0xff, 0xff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_R8G8B8A8_SRGB, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0x00), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_R8G8B8A8_SRGB, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0xbc, 0x00, 0x00, 0x00), UNPACKED_1x1(0.502886458033, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_R8G8B8A8_SRGB, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0xff, 0x00, 0x00, 0x00), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_R8G8B8A8_SRGB, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0xbc, 0x00, 0x00), UNPACKED_1x1(0.0, 0.502886458033, 0.0, 0.0)},
+   {PIPE_FORMAT_R8G8B8A8_SRGB, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0xff, 0x00, 0x00), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)},
+   {PIPE_FORMAT_R8G8B8A8_SRGB, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0xbc, 0x00), UNPACKED_1x1(0.0, 0.0, 0.502886458033, 0.0)},
+   {PIPE_FORMAT_R8G8B8A8_SRGB, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0xff, 0x00), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)},
+   {PIPE_FORMAT_R8G8B8A8_SRGB, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0xcc), UNPACKED_1x1(0.0, 0.0, 0.0, 0.8)},
+   {PIPE_FORMAT_R8G8B8A8_SRGB, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0xff), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8G8B8A8_SRGB, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0xff, 0xff, 0xff, 0xff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_B8G8R8A8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_B8G8R8A8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x000000bc), UNPACKED_1x1(0.0, 0.0, 0.502886458033, 0.0)},
+   {PIPE_FORMAT_B8G8R8A8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x000000ff), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)},
+   {PIPE_FORMAT_B8G8R8A8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x0000bc00), UNPACKED_1x1(0.0, 0.502886458033, 0.0, 0.0)},
+   {PIPE_FORMAT_B8G8R8A8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x0000ff00), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)},
+   {PIPE_FORMAT_B8G8R8A8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x00bc0000), UNPACKED_1x1(0.502886458033, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_B8G8R8A8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x00ff0000), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_B8G8R8A8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0xcc000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.8)},
+   {PIPE_FORMAT_B8G8R8A8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0xff000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_B8G8R8A8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_B8G8R8X8_SRGB, PACKED_1x32(0x00ffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_B8G8R8X8_SRGB, PACKED_1x32(0x00ffffff), PACKED_1x32(0x000000bc), UNPACKED_1x1(0.0, 0.0, 0.502886458033, 1.0)},
+   {PIPE_FORMAT_B8G8R8X8_SRGB, PACKED_1x32(0x00ffffff), PACKED_1x32(0x000000ff), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)},
+   {PIPE_FORMAT_B8G8R8X8_SRGB, PACKED_1x32(0x00ffffff), PACKED_1x32(0x0000bc00), UNPACKED_1x1(0.0, 0.502886458033, 0.0, 1.0)},
+   {PIPE_FORMAT_B8G8R8X8_SRGB, PACKED_1x32(0x00ffffff), PACKED_1x32(0x0000ff00), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_B8G8R8X8_SRGB, PACKED_1x32(0x00ffffff), PACKED_1x32(0x00bc0000), UNPACKED_1x1(0.502886458033, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_B8G8R8X8_SRGB, PACKED_1x32(0x00ffffff), PACKED_1x32(0x00ff0000), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_B8G8R8X8_SRGB, PACKED_1x32(0x00ffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_A8R8G8B8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_A8R8G8B8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x000000cc), UNPACKED_1x1(0.0, 0.0, 0.0, 0.8)},
+   {PIPE_FORMAT_A8R8G8B8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x000000ff), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_A8R8G8B8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x0000bc00), UNPACKED_1x1(0.502886458033, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_A8R8G8B8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x0000ff00), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_A8R8G8B8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x00bc0000), UNPACKED_1x1(0.0, 0.502886458033, 0.0, 0.0)},
+   {PIPE_FORMAT_A8R8G8B8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x00ff0000), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)},
+   {PIPE_FORMAT_A8R8G8B8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0xbc000000), UNPACKED_1x1(0.0, 0.0, 0.502886458033, 0.0)},
+   {PIPE_FORMAT_A8R8G8B8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0xff000000), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)},
+   {PIPE_FORMAT_A8R8G8B8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_X8R8G8B8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_X8R8G8B8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0x0000bc00), UNPACKED_1x1(0.502886458033, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_X8R8G8B8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0x0000ff00), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_X8R8G8B8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0x00bc0000), UNPACKED_1x1(0.0, 0.502886458033, 0.0, 1.0)},
+   {PIPE_FORMAT_X8R8G8B8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0x00ff0000), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_X8R8G8B8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0xbc000000), UNPACKED_1x1(0.0, 0.0, 0.502886458033, 1.0)},
+   {PIPE_FORMAT_X8R8G8B8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0xff000000), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)},
+   {PIPE_FORMAT_X8R8G8B8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_A8B8G8R8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_A8B8G8R8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x000000cc), UNPACKED_1x1(0.0, 0.0, 0.0, 0.8)},
+   {PIPE_FORMAT_A8B8G8R8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x000000ff), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_A8B8G8R8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x0000bc00), UNPACKED_1x1(0.0, 0.0, 0.502886458033, 0.0)},
+   {PIPE_FORMAT_A8B8G8R8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x0000ff00), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)},
+   {PIPE_FORMAT_A8B8G8R8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x00bc0000), UNPACKED_1x1(0.0, 0.502886458033, 0.0, 0.0)},
+   {PIPE_FORMAT_A8B8G8R8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x00ff0000), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)},
+   {PIPE_FORMAT_A8B8G8R8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0xbc000000), UNPACKED_1x1(0.502886458033, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_A8B8G8R8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0xff000000), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_A8B8G8R8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_X8B8G8R8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_X8B8G8R8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0x0000bc00), UNPACKED_1x1(0.0, 0.0, 0.502886458033, 1.0)},
+   {PIPE_FORMAT_X8B8G8R8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0x0000ff00), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)},
+   {PIPE_FORMAT_X8B8G8R8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0x00bc0000), UNPACKED_1x1(0.0, 0.502886458033, 0.0, 1.0)},
+   {PIPE_FORMAT_X8B8G8R8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0x00ff0000), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_X8B8G8R8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0xbc000000), UNPACKED_1x1(0.502886458033, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_X8B8G8R8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0xff000000), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_X8B8G8R8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   /*
+    * Mixed-signed formats
+    */
+
+   {PIPE_FORMAT_R8SG8SB8UX8U_NORM, PACKED_4x8(0xff, 0xff, 0xff, 0x00), PACKED_4x8(0x00, 0x00, 0x00, 0x00), UNPACKED_1x1( 0.0,  0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8SG8SB8UX8U_NORM, PACKED_4x8(0xff, 0xff, 0xff, 0x00), PACKED_4x8(0x7f, 0x00, 0x00, 0x00), UNPACKED_1x1( 1.0,  0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8SG8SB8UX8U_NORM, PACKED_4x8(0xff, 0xff, 0xff, 0x00), PACKED_4x8(0x81, 0x00, 0x00, 0x00), UNPACKED_1x1(-1.0,  0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8SG8SB8UX8U_NORM, PACKED_4x8(0xff, 0xff, 0xff, 0x00), PACKED_4x8(0x00, 0x7f, 0x00, 0x00), UNPACKED_1x1( 0.0,  1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8SG8SB8UX8U_NORM, PACKED_4x8(0xff, 0xff, 0xff, 0x00), PACKED_4x8(0x00, 0x81, 0x00, 0x00), UNPACKED_1x1( 0.0, -1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8SG8SB8UX8U_NORM, PACKED_4x8(0xff, 0xff, 0xff, 0x00), PACKED_4x8(0x00, 0x00, 0xff, 0x00), UNPACKED_1x1( 0.0,  0.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_R10SG10SB10SA2U_NORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1( 0.0,  0.0,  0.0, 0.0)},
+   {PIPE_FORMAT_R10SG10SB10SA2U_NORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x000001ff), UNPACKED_1x1( 1.0,  0.0,  0.0, 0.0)},
+   {PIPE_FORMAT_R10SG10SB10SA2U_NORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000201), UNPACKED_1x1(-1.0,  0.0,  0.0, 0.0)},
+   {PIPE_FORMAT_R10SG10SB10SA2U_NORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x0007fc00), UNPACKED_1x1( 0.0,  1.0,  0.0, 0.0)},
+   {PIPE_FORMAT_R10SG10SB10SA2U_NORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00080400), UNPACKED_1x1( 0.0, -1.0,  0.0, 0.0)},
+   {PIPE_FORMAT_R10SG10SB10SA2U_NORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x1ff00000), UNPACKED_1x1( 0.0,  0.0,  1.0, 0.0)},
+   {PIPE_FORMAT_R10SG10SB10SA2U_NORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x20100000), UNPACKED_1x1( 0.0,  0.0, -1.0, 0.0)},
+   {PIPE_FORMAT_R10SG10SB10SA2U_NORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xc0000000), UNPACKED_1x1( 0.0,  0.0,  0.0, 1.0)},
+
+   {PIPE_FORMAT_R5SG5SB6U_NORM, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1( 0.0,  0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R5SG5SB6U_NORM, PACKED_1x16(0xffff), PACKED_1x16(0x000f), UNPACKED_1x1( 1.0,  0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R5SG5SB6U_NORM, PACKED_1x16(0xffff), PACKED_1x16(0x0011), UNPACKED_1x1(-1.0,  0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R5SG5SB6U_NORM, PACKED_1x16(0xffff), PACKED_1x16(0x01e0), UNPACKED_1x1( 0.0,  1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R5SG5SB6U_NORM, PACKED_1x16(0xffff), PACKED_1x16(0x0220), UNPACKED_1x1( 0.0, -1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R5SG5SB6U_NORM, PACKED_1x16(0xffff), PACKED_1x16(0xfc00), UNPACKED_1x1( 0.0,  0.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_R8G8Bx_SNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x00, 0x00), UNPACKED_1x1( 0.0,  0.0, 1.0, 1.0)},
+   {PIPE_FORMAT_R8G8Bx_SNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x7f, 0x00), UNPACKED_1x1( 1.0,  0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8G8Bx_SNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x81, 0x00), UNPACKED_1x1(-1.0,  0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8G8Bx_SNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x00, 0x7f), UNPACKED_1x1( 0.0,  1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8G8Bx_SNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x00, 0x81), UNPACKED_1x1( 0.0, -1.0, 0.0, 1.0)},
+
+   /*
+    * Depth-stencil formats
+    */
+
+   {PIPE_FORMAT_S8_USCALED, PACKED_1x8(0xff), PACKED_1x8(0x00), UNPACKED_1x1(0.0,   0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_S8_USCALED, PACKED_1x8(0xff), PACKED_1x8(0xff), UNPACKED_1x1(0.0, 255.0, 0.0, 0.0)},
+
+   {PIPE_FORMAT_Z16_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_Z16_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0xffff), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)},
+
+   {PIPE_FORMAT_Z32_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_Z32_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)},
+
+   {PIPE_FORMAT_Z32_FLOAT, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_Z32_FLOAT, PACKED_1x32(0xffffffff), PACKED_1x32(0x3f800000), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)},
+
+   {PIPE_FORMAT_Z24_UNORM_S8_USCALED, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0,   0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_Z24_UNORM_S8_USCALED, PACKED_1x32(0xffffffff), PACKED_1x32(0x00ffffff), UNPACKED_1x1(1.0,   0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_Z24_UNORM_S8_USCALED, PACKED_1x32(0xffffffff), PACKED_1x32(0xff000000), UNPACKED_1x1(0.0, 255.0, 0.0, 0.0)},
+   {PIPE_FORMAT_Z24_UNORM_S8_USCALED, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 255.0, 0.0, 0.0)},
+
+   {PIPE_FORMAT_S8_USCALED_Z24_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0,   0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_S8_USCALED_Z24_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffff00), UNPACKED_1x1(1.0,   0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_S8_USCALED_Z24_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x000000ff), UNPACKED_1x1(0.0, 255.0, 0.0, 0.0)},
+   {PIPE_FORMAT_S8_USCALED_Z24_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 255.0, 0.0, 0.0)},
+
+   {PIPE_FORMAT_Z24X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_Z24X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0x00ffffff), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)},
+
+   {PIPE_FORMAT_X8Z24_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_X8Z24_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0xffffff00), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)},
+
+   {PIPE_FORMAT_Z32_FLOAT_S8X24_USCALED, PACKED_2x32(0xffffffff, 0x000000ff), PACKED_2x32(0x00000000, 0x00000000), UNPACKED_1x1( 0.0,   0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_Z32_FLOAT_S8X24_USCALED, PACKED_2x32(0xffffffff, 0x000000ff), PACKED_2x32(0x3f800000, 0x00000000), UNPACKED_1x1( 1.0,   0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_Z32_FLOAT_S8X24_USCALED, PACKED_2x32(0xffffffff, 0x000000ff), PACKED_2x32(0x00000000, 0x000000ff), UNPACKED_1x1( 0.0, 255.0, 0.0, 0.0)},
+
+   /*
+    * YUV formats
+    */
+
+   {PIPE_FORMAT_R8G8_B8G8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0x00), UNPACKED_2x1(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8G8_B8G8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0xff, 0x00, 0x00, 0x00), UNPACKED_2x1(1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8G8_B8G8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0xff, 0x00, 0x00), UNPACKED_2x1(0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8G8_B8G8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0xff, 0x00), UNPACKED_2x1(0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0)},
+   {PIPE_FORMAT_R8G8_B8G8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0xff), UNPACKED_2x1(0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8G8_B8G8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0xff, 0xff, 0xff, 0xff), UNPACKED_2x1(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_G8R8_G8B8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0x00), UNPACKED_2x1(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_G8R8_G8B8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0xff, 0x00, 0x00, 0x00), UNPACKED_2x1(0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_G8R8_G8B8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0xff, 0x00, 0x00), UNPACKED_2x1(1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_G8R8_G8B8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0xff, 0x00), UNPACKED_2x1(0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_G8R8_G8B8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0xff), UNPACKED_2x1(0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0)},
+   {PIPE_FORMAT_G8R8_G8B8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0xff, 0xff, 0xff, 0xff), UNPACKED_2x1(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0)},
+
+   /*
+    * TODO: Exercise the UV channels as well.
+    */
+   {PIPE_FORMAT_UYVY, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x80, 0x10, 0x80, 0x10), UNPACKED_2x1(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_UYVY, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x80, 0xeb, 0x80, 0x10), UNPACKED_2x1(1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_UYVY, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x80, 0x10, 0x80, 0xeb), UNPACKED_2x1(0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_YUYV, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x10, 0x80, 0x10, 0x80), UNPACKED_2x1(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_YUYV, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0xeb, 0x80, 0x10, 0x80), UNPACKED_2x1(1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_YUYV, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x10, 0x80, 0xeb, 0x80), UNPACKED_2x1(0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0)},
+
+   /*
+    * Compressed formats
+    */
+
+   {
+      PIPE_FORMAT_DXT1_RGB,
+      PACKED_8x8(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff),
+      PACKED_8x8(0xf2, 0xd7, 0xb0, 0x20, 0xae, 0x2c, 0x6f, 0x97),
+      {
+         {
+            {0x99/255.0, 0xb0/255.0, 0x8e/255.0, 0xff/255.0},
+            {0x5d/255.0, 0x62/255.0, 0x89/255.0, 0xff/255.0},
+            {0x99/255.0, 0xb0/255.0, 0x8e/255.0, 0xff/255.0},
+            {0x99/255.0, 0xb0/255.0, 0x8e/255.0, 0xff/255.0}
+         },
+         {
+            {0xd6/255.0, 0xff/255.0, 0x94/255.0, 0xff/255.0},
+            {0x5d/255.0, 0x62/255.0, 0x89/255.0, 0xff/255.0},
+            {0x99/255.0, 0xb0/255.0, 0x8e/255.0, 0xff/255.0},
+            {0xd6/255.0, 0xff/255.0, 0x94/255.0, 0xff/255.0}
+         },
+         {
+            {0x5d/255.0, 0x62/255.0, 0x89/255.0, 0xff/255.0},
+            {0x5d/255.0, 0x62/255.0, 0x89/255.0, 0xff/255.0},
+            {0x99/255.0, 0xb0/255.0, 0x8e/255.0, 0xff/255.0},
+            {0x21/255.0, 0x14/255.0, 0x84/255.0, 0xff/255.0}
+         },
+         {
+            {0x5d/255.0, 0x62/255.0, 0x89/255.0, 0xff/255.0},
+            {0x21/255.0, 0x14/255.0, 0x84/255.0, 0xff/255.0},
+            {0x21/255.0, 0x14/255.0, 0x84/255.0, 0xff/255.0},
+            {0x99/255.0, 0xb0/255.0, 0x8e/255.0, 0xff/255.0}
+         }
+      }
+   },
+   {
+      PIPE_FORMAT_DXT1_RGBA,
+      PACKED_8x8(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff),
+      PACKED_8x8(0xff, 0x2f, 0xa4, 0x72, 0xeb, 0xb2, 0xbd, 0xbe),
+      {
+         {
+            {0x00/255.0, 0x00/255.0, 0x00/255.0, 0x00/255.0},
+            {0x4e/255.0, 0xaa/255.0, 0x90/255.0, 0xff/255.0},
+            {0x4e/255.0, 0xaa/255.0, 0x90/255.0, 0xff/255.0},
+            {0x00/255.0, 0x00/255.0, 0x00/255.0, 0x00/255.0}
+         },
+         {
+            {0x4e/255.0, 0xaa/255.0, 0x90/255.0, 0xff/255.0},
+            {0x29/255.0, 0xff/255.0, 0xff/255.0, 0xff/255.0},
+            {0x00/255.0, 0x00/255.0, 0x00/255.0, 0x00/255.0},
+            {0x4e/255.0, 0xaa/255.0, 0x90/255.0, 0xff/255.0}
+         },
+         {
+            {0x73/255.0, 0x55/255.0, 0x21/255.0, 0xff/255.0},
+            {0x00/255.0, 0x00/255.0, 0x00/255.0, 0x00/255.0},
+            {0x00/255.0, 0x00/255.0, 0x00/255.0, 0x00/255.0},
+            {0x4e/255.0, 0xaa/255.0, 0x90/255.0, 0xff/255.0}
+         },
+         {
+            {0x4e/255.0, 0xaa/255.0, 0x90/255.0, 0xff/255.0},
+            {0x00/255.0, 0x00/255.0, 0x00/255.0, 0x00/255.0},
+            {0x00/255.0, 0x00/255.0, 0x00/255.0, 0x00/255.0},
+            {0x4e/255.0, 0xaa/255.0, 0x90/255.0, 0xff/255.0}
+         }
+      }
+   },
+   {
+      PIPE_FORMAT_DXT3_RGBA,
+      {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+      {0xe7, 0x4a, 0x8f, 0x96, 0x5b, 0xc1, 0x1c, 0x84, 0xf6, 0x8f, 0xab, 0x32, 0x2a, 0x9a, 0x95, 0x5a},
+      {
+         {
+            {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0x77/255.0},
+            {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0xee/255.0},
+            {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0xaa/255.0},
+            {0x8c/255.0, 0xff/255.0, 0xb5/255.0, 0x44/255.0}
+         },
+         {
+            {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0xff/255.0},
+            {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0x88/255.0},
+            {0x31/255.0, 0x55/255.0, 0x5a/255.0, 0x66/255.0},
+            {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0x99/255.0}
+         },
+         {
+            {0x31/255.0, 0x55/255.0, 0x5a/255.0, 0xbb/255.0},
+            {0x31/255.0, 0x55/255.0, 0x5a/255.0, 0x55/255.0},
+            {0x31/255.0, 0x55/255.0, 0x5a/255.0, 0x11/255.0},
+            {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0xcc/255.0}
+         },
+         {
+            {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0xcc/255.0},
+            {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0x11/255.0},
+            {0x31/255.0, 0x55/255.0, 0x5a/255.0, 0x44/255.0},
+            {0x31/255.0, 0x55/255.0, 0x5a/255.0, 0x88/255.0}
+         }
+      }
+   },
+   {
+      PIPE_FORMAT_DXT5_RGBA,
+      {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+      {0xf8, 0x11, 0xc5, 0x0c, 0x9a, 0x73, 0xb4, 0x9c, 0xf6, 0x8f, 0xab, 0x32, 0x2a, 0x9a, 0x95, 0x5a},
+      {
+         {
+            {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0x74/255.0},
+            {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0xf8/255.0},
+            {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0xb6/255.0},
+            {0x8c/255.0, 0xff/255.0, 0xb5/255.0, 0x53/255.0}
+         },
+         {
+            {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0xf8/255.0},
+            {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0x95/255.0},
+            {0x31/255.0, 0x55/255.0, 0x5a/255.0, 0x53/255.0},
+            {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0x95/255.0}
+         },
+         {
+            {0x31/255.0, 0x55/255.0, 0x5a/255.0, 0xb6/255.0},
+            {0x31/255.0, 0x55/255.0, 0x5a/255.0, 0x53/255.0},
+            {0x31/255.0, 0x55/255.0, 0x5a/255.0, 0x11/255.0},
+            {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0xd7/255.0}
+         },
+         {
+            {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0xb6/255.0},
+            {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0x11/255.0},
+            {0x31/255.0, 0x55/255.0, 0x5a/255.0, 0x32/255.0},
+            {0x31/255.0, 0x55/255.0, 0x5a/255.0, 0x95/255.0}
+         }
+      }
+   },
+
+
+   /*
+    * Standard 8-bit integer formats
+    */
+
+   {PIPE_FORMAT_R8_UNORM, PACKED_1x8(0xff), PACKED_1x8(0x00), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8_UNORM, PACKED_1x8(0xff), PACKED_1x8(0xff), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)},
+
+   {PIPE_FORMAT_R8G8_UNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x00, 0x00), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8G8_UNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0xff, 0x00), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8G8_UNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x00, 0xff), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8G8_UNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0xff, 0xff), UNPACKED_1x1(1.0, 1.0, 0.0, 1.0)},
+
+   {PIPE_FORMAT_R8G8B8_UNORM, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x00, 0x00), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8G8B8_UNORM, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0xff, 0x00, 0x00), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8G8B8_UNORM, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0xff, 0x00), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8G8B8_UNORM, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x00, 0xff), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)},
+   {PIPE_FORMAT_R8G8B8_UNORM, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0xff, 0xff, 0xff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_R8G8B8A8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0x00), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_R8G8B8A8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0xff, 0x00, 0x00, 0x00), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_R8G8B8A8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0xff, 0x00, 0x00), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)},
+   {PIPE_FORMAT_R8G8B8A8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0xff, 0x00), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)},
+   {PIPE_FORMAT_R8G8B8A8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0xff), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R8G8B8A8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0xff, 0xff, 0xff, 0xff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_R8_USCALED, PACKED_1x8(0xff), PACKED_1x8(0x00), UNPACKED_1x1(  0.0,   0.0,   0.0, 1.0)},
+   {PIPE_FORMAT_R8_USCALED, PACKED_1x8(0xff), PACKED_1x8(0xff), UNPACKED_1x1(255.0,   0.0,   0.0, 1.0)},
+
+   {PIPE_FORMAT_R8G8_USCALED, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x00, 0x00), UNPACKED_1x1(  0.0,   0.0,   0.0, 1.0)},
+   {PIPE_FORMAT_R8G8_USCALED, PACKED_2x8(0xff, 0xff), PACKED_2x8(0xff, 0x00), UNPACKED_1x1(255.0,   0.0,   0.0, 1.0)},
+   {PIPE_FORMAT_R8G8_USCALED, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x00, 0xff), UNPACKED_1x1(  0.0, 255.0,   0.0, 1.0)},
+   {PIPE_FORMAT_R8G8_USCALED, PACKED_2x8(0xff, 0xff), PACKED_2x8(0xff, 0xff), UNPACKED_1x1(255.0, 255.0,   0.0, 1.0)},
+
+   {PIPE_FORMAT_R8G8B8_USCALED, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x00, 0x00), UNPACKED_1x1(  0.0,   0.0,   0.0, 1.0)},
+   {PIPE_FORMAT_R8G8B8_USCALED, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0xff, 0x00, 0x00), UNPACKED_1x1(255.0,   0.0,   0.0, 1.0)},
+   {PIPE_FORMAT_R8G8B8_USCALED, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0xff, 0x00), UNPACKED_1x1(  0.0, 255.0,   0.0, 1.0)},
+   {PIPE_FORMAT_R8G8B8_USCALED, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x00, 0xff), UNPACKED_1x1(  0.0,   0.0, 255.0, 1.0)},
+   {PIPE_FORMAT_R8G8B8_USCALED, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0xff, 0xff, 0xff), UNPACKED_1x1(255.0, 255.0, 255.0, 1.0)},
+
+   {PIPE_FORMAT_R8G8B8A8_USCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0x00), UNPACKED_1x1(  0.0,   0.0,   0.0,   0.0)},
+   {PIPE_FORMAT_R8G8B8A8_USCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0xff, 0x00, 0x00, 0x00), UNPACKED_1x1(255.0,   0.0,   0.0,   0.0)},
+   {PIPE_FORMAT_R8G8B8A8_USCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0xff, 0x00, 0x00), UNPACKED_1x1(  0.0, 255.0,   0.0,   0.0)},
+   {PIPE_FORMAT_R8G8B8A8_USCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0xff, 0x00), UNPACKED_1x1(  0.0,   0.0, 255.0,   0.0)},
+   {PIPE_FORMAT_R8G8B8A8_USCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0xff), UNPACKED_1x1(  0.0,   0.0,   0.0, 255.0)},
+   {PIPE_FORMAT_R8G8B8A8_USCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0xff, 0xff, 0xff, 0xff), UNPACKED_1x1(255.0, 255.0, 255.0, 255.0)},
+
+   {PIPE_FORMAT_R8_SNORM, PACKED_1x8(0xff), PACKED_1x8(0x00), UNPACKED_1x1( 0.0,  0.0,  0.0,  1.0)},
+   {PIPE_FORMAT_R8_SNORM, PACKED_1x8(0xff), PACKED_1x8(0x7f), UNPACKED_1x1( 1.0,  0.0,  0.0,  1.0)},
+   {PIPE_FORMAT_R8_SNORM, PACKED_1x8(0xff), PACKED_1x8(0x81), UNPACKED_1x1(-1.0,  0.0,  0.0,  1.0)},
+
+   {PIPE_FORMAT_R8G8_SNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x00, 0x00), UNPACKED_1x1( 0.0,  0.0,  0.0,  1.0)},
+   {PIPE_FORMAT_R8G8_SNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x7f, 0x00), UNPACKED_1x1( 1.0,  0.0,  0.0,  1.0)},
+   {PIPE_FORMAT_R8G8_SNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x81, 0x00), UNPACKED_1x1(-1.0,  0.0,  0.0,  1.0)},
+   {PIPE_FORMAT_R8G8_SNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x00, 0x7f), UNPACKED_1x1( 0.0,  1.0,  0.0,  1.0)},
+   {PIPE_FORMAT_R8G8_SNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x00, 0x81), UNPACKED_1x1( 0.0, -1.0,  0.0,  1.0)},
+
+   {PIPE_FORMAT_R8G8B8_SNORM, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x00, 0x00), UNPACKED_1x1( 0.0,  0.0,  0.0,  1.0)},
+   {PIPE_FORMAT_R8G8B8_SNORM, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x7f, 0x00, 0x00), UNPACKED_1x1( 1.0,  0.0,  0.0,  1.0)},
+   {PIPE_FORMAT_R8G8B8_SNORM, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x81, 0x00, 0x00), UNPACKED_1x1(-1.0,  0.0,  0.0,  1.0)},
+   {PIPE_FORMAT_R8G8B8_SNORM, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x7f, 0x00), UNPACKED_1x1( 0.0,  1.0,  0.0,  1.0)},
+   {PIPE_FORMAT_R8G8B8_SNORM, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x81, 0x00), UNPACKED_1x1( 0.0, -1.0,  0.0,  1.0)},
+   {PIPE_FORMAT_R8G8B8_SNORM, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x00, 0x7f), UNPACKED_1x1( 0.0,  0.0,  1.0,  1.0)},
+   {PIPE_FORMAT_R8G8B8_SNORM, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x00, 0x81), UNPACKED_1x1( 0.0,  0.0, -1.0,  1.0)},
+
+   {PIPE_FORMAT_R8G8B8A8_SNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0x00), UNPACKED_1x1( 0.0,  0.0,  0.0,  0.0)},
+   {PIPE_FORMAT_R8G8B8A8_SNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x7f, 0x00, 0x00, 0x00), UNPACKED_1x1( 1.0,  0.0,  0.0,  0.0)},
+   {PIPE_FORMAT_R8G8B8A8_SNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x81, 0x00, 0x00, 0x00), UNPACKED_1x1(-1.0,  0.0,  0.0,  0.0)},
+   {PIPE_FORMAT_R8G8B8A8_SNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x7f, 0x00, 0x00), UNPACKED_1x1( 0.0,  1.0,  0.0,  0.0)},
+   {PIPE_FORMAT_R8G8B8A8_SNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x81, 0x00, 0x00), UNPACKED_1x1( 0.0, -1.0,  0.0,  0.0)},
+   {PIPE_FORMAT_R8G8B8A8_SNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x7f, 0x00), UNPACKED_1x1( 0.0,  0.0,  1.0,  0.0)},
+   {PIPE_FORMAT_R8G8B8A8_SNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x81, 0x00), UNPACKED_1x1( 0.0,  0.0, -1.0,  0.0)},
+   {PIPE_FORMAT_R8G8B8A8_SNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0x7f), UNPACKED_1x1( 0.0,  0.0,  0.0,  1.0)},
+   {PIPE_FORMAT_R8G8B8A8_SNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0x81), UNPACKED_1x1( 0.0,  0.0,  0.0, -1.0)},
+
+   {PIPE_FORMAT_R8_SSCALED, PACKED_1x8(0xff), PACKED_1x8(0x00), UNPACKED_1x1(   0.0,    0.0,    0.0, 1.0)},
+   {PIPE_FORMAT_R8_SSCALED, PACKED_1x8(0xff), PACKED_1x8(0x7f), UNPACKED_1x1( 127.0,    0.0,    0.0, 1.0)},
+   {PIPE_FORMAT_R8_SSCALED, PACKED_1x8(0xff), PACKED_1x8(0x80), UNPACKED_1x1(-128.0,    0.0,    0.0, 1.0)},
+
+   {PIPE_FORMAT_R8G8_SSCALED, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x00, 0x00), UNPACKED_1x1(   0.0,    0.0,    0.0, 1.0)},
+   {PIPE_FORMAT_R8G8_SSCALED, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x7f, 0x00), UNPACKED_1x1( 127.0,    0.0,    0.0, 1.0)},
+   {PIPE_FORMAT_R8G8_SSCALED, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x80, 0x00), UNPACKED_1x1(-128.0,    0.0,    0.0, 1.0)},
+   {PIPE_FORMAT_R8G8_SSCALED, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x00, 0x7f), UNPACKED_1x1(   0.0,  127.0,    0.0, 1.0)},
+   {PIPE_FORMAT_R8G8_SSCALED, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x00, 0x80), UNPACKED_1x1(   0.0, -128.0,    0.0, 1.0)},
+
+   {PIPE_FORMAT_R8G8B8_SSCALED, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x00, 0x00), UNPACKED_1x1(   0.0,    0.0,    0.0, 1.0)},
+   {PIPE_FORMAT_R8G8B8_SSCALED, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x7f, 0x00, 0x00), UNPACKED_1x1( 127.0,    0.0,    0.0, 1.0)},
+   {PIPE_FORMAT_R8G8B8_SSCALED, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x80, 0x00, 0x00), UNPACKED_1x1(-128.0,    0.0,    0.0, 1.0)},
+   {PIPE_FORMAT_R8G8B8_SSCALED, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x7f, 0x00), UNPACKED_1x1(   0.0,  127.0,    0.0, 1.0)},
+   {PIPE_FORMAT_R8G8B8_SSCALED, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x80, 0x00), UNPACKED_1x1(   0.0, -128.0,    0.0, 1.0)},
+   {PIPE_FORMAT_R8G8B8_SSCALED, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x00, 0x7f), UNPACKED_1x1(   0.0,    0.0,  127.0, 1.0)},
+   {PIPE_FORMAT_R8G8B8_SSCALED, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x00, 0x80), UNPACKED_1x1(   0.0,    0.0, -128.0, 1.0)},
+
+   {PIPE_FORMAT_R8G8B8A8_SSCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0x00), UNPACKED_1x1(   0.0,    0.0,    0.0,    0.0)},
+   {PIPE_FORMAT_R8G8B8A8_SSCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x7f, 0x00, 0x00, 0x00), UNPACKED_1x1( 127.0,    0.0,    0.0,    0.0)},
+   {PIPE_FORMAT_R8G8B8A8_SSCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x80, 0x00, 0x00, 0x00), UNPACKED_1x1(-128.0,    0.0,    0.0,    0.0)},
+   {PIPE_FORMAT_R8G8B8A8_SSCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x7f, 0x00, 0x00), UNPACKED_1x1(   0.0,  127.0,    0.0,    0.0)},
+   {PIPE_FORMAT_R8G8B8A8_SSCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x80, 0x00, 0x00), UNPACKED_1x1(   0.0, -128.0,    0.0,    0.0)},
+   {PIPE_FORMAT_R8G8B8A8_SSCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x7f, 0x00), UNPACKED_1x1(   0.0,    0.0,  127.0,    0.0)},
+   {PIPE_FORMAT_R8G8B8A8_SSCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x80, 0x00), UNPACKED_1x1(   0.0,    0.0, -128.0,    0.0)},
+   {PIPE_FORMAT_R8G8B8A8_SSCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0x7f), UNPACKED_1x1(   0.0,    0.0,    0.0,  127.0)},
+   {PIPE_FORMAT_R8G8B8A8_SSCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0x80), UNPACKED_1x1(   0.0,    0.0,    0.0, -128.0)},
+
+   /*
+    * Standard 16-bit integer formats
+    */
+
+   {PIPE_FORMAT_R16_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R16_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0xffff), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)},
+
+   {PIPE_FORMAT_R16G16_UNORM, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x0000, 0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R16G16_UNORM, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0xffff, 0x0000), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R16G16_UNORM, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x0000, 0xffff), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R16G16_UNORM, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0xffff, 0xffff), UNPACKED_1x1(1.0, 1.0, 0.0, 1.0)},
+
+   {PIPE_FORMAT_R16G16B16_UNORM, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x0000, 0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R16G16B16_UNORM, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0xffff, 0x0000, 0x0000), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R16G16B16_UNORM, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0xffff, 0x0000), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R16G16B16_UNORM, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x0000, 0xffff), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)},
+   {PIPE_FORMAT_R16G16B16_UNORM, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0xffff, 0xffff, 0xffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_R16G16B16A16_UNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x0000, 0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_R16G16B16A16_UNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0xffff, 0x0000, 0x0000, 0x0000), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_R16G16B16A16_UNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0xffff, 0x0000, 0x0000), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)},
+   {PIPE_FORMAT_R16G16B16A16_UNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0xffff, 0x0000), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)},
+   {PIPE_FORMAT_R16G16B16A16_UNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x0000, 0xffff), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R16G16B16A16_UNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_R16_USCALED, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1(    0.0,     0.0,     0.0,   1.0)},
+   {PIPE_FORMAT_R16_USCALED, PACKED_1x16(0xffff), PACKED_1x16(0xffff), UNPACKED_1x1(65535.0,     0.0,     0.0,   1.0)},
+
+   {PIPE_FORMAT_R16G16_USCALED, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x0000, 0x0000), UNPACKED_1x1(    0.0,     0.0,     0.0,   1.0)},
+   {PIPE_FORMAT_R16G16_USCALED, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0xffff, 0x0000), UNPACKED_1x1(65535.0,     0.0,     0.0,   1.0)},
+   {PIPE_FORMAT_R16G16_USCALED, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x0000, 0xffff), UNPACKED_1x1(    0.0, 65535.0,     0.0,   1.0)},
+   {PIPE_FORMAT_R16G16_USCALED, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0xffff, 0xffff), UNPACKED_1x1(65535.0, 65535.0,     0.0,   1.0)},
+
+   {PIPE_FORMAT_R16G16B16_USCALED, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x0000, 0x0000), UNPACKED_1x1(    0.0,     0.0,     0.0,   1.0)},
+   {PIPE_FORMAT_R16G16B16_USCALED, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0xffff, 0x0000, 0x0000), UNPACKED_1x1(65535.0,     0.0,     0.0,   1.0)},
+   {PIPE_FORMAT_R16G16B16_USCALED, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0xffff, 0x0000), UNPACKED_1x1(    0.0, 65535.0,     0.0,   1.0)},
+   {PIPE_FORMAT_R16G16B16_USCALED, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x0000, 0xffff), UNPACKED_1x1(    0.0,     0.0, 65535.0,   1.0)},
+   {PIPE_FORMAT_R16G16B16_USCALED, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0xffff, 0xffff, 0xffff), UNPACKED_1x1(65535.0, 65535.0, 65535.0,   1.0)},
+
+   {PIPE_FORMAT_R16G16B16A16_USCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x0000, 0x0000), UNPACKED_1x1(    0.0,     0.0,     0.0,     0.0)},
+   {PIPE_FORMAT_R16G16B16A16_USCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0xffff, 0x0000, 0x0000, 0x0000), UNPACKED_1x1(65535.0,     0.0,     0.0,     0.0)},
+   {PIPE_FORMAT_R16G16B16A16_USCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0xffff, 0x0000, 0x0000), UNPACKED_1x1(    0.0, 65535.0,     0.0,     0.0)},
+   {PIPE_FORMAT_R16G16B16A16_USCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0xffff, 0x0000), UNPACKED_1x1(    0.0,     0.0, 65535.0,     0.0)},
+   {PIPE_FORMAT_R16G16B16A16_USCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x0000, 0xffff), UNPACKED_1x1(    0.0,     0.0,     0.0, 65535.0)},
+   {PIPE_FORMAT_R16G16B16A16_USCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), UNPACKED_1x1(65535.0, 65535.0, 65535.0, 65535.0)},
+
+   {PIPE_FORMAT_R16_SNORM, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1(   0.0,    0.0,    0.0,    1.0)},
+   {PIPE_FORMAT_R16_SNORM, PACKED_1x16(0xffff), PACKED_1x16(0x7fff), UNPACKED_1x1(   1.0,    0.0,    0.0,    1.0)},
+   {PIPE_FORMAT_R16_SNORM, PACKED_1x16(0xffff), PACKED_1x16(0x8001), UNPACKED_1x1(  -1.0,    0.0,    0.0,    1.0)},
+
+   {PIPE_FORMAT_R16G16_SNORM, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x0000, 0x0000), UNPACKED_1x1(   0.0,    0.0,    0.0,    1.0)},
+   {PIPE_FORMAT_R16G16_SNORM, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x7fff, 0x0000), UNPACKED_1x1(   1.0,    0.0,    0.0,    1.0)},
+   {PIPE_FORMAT_R16G16_SNORM, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x8001, 0x0000), UNPACKED_1x1(  -1.0,    0.0,    0.0,    1.0)},
+   {PIPE_FORMAT_R16G16_SNORM, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x0000, 0x7fff), UNPACKED_1x1(   0.0,    1.0,    0.0,    1.0)},
+   {PIPE_FORMAT_R16G16_SNORM, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x0000, 0x8001), UNPACKED_1x1(   0.0,   -1.0,    0.0,    1.0)},
+
+   {PIPE_FORMAT_R16G16B16_SNORM, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x0000, 0x0000), UNPACKED_1x1(   0.0,    0.0,    0.0,    1.0)},
+   {PIPE_FORMAT_R16G16B16_SNORM, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x7fff, 0x0000, 0x0000), UNPACKED_1x1(   1.0,    0.0,    0.0,    1.0)},
+   {PIPE_FORMAT_R16G16B16_SNORM, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x8001, 0x0000, 0x0000), UNPACKED_1x1(  -1.0,    0.0,    0.0,    1.0)},
+   {PIPE_FORMAT_R16G16B16_SNORM, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x7fff, 0x0000), UNPACKED_1x1(   0.0,    1.0,    0.0,    1.0)},
+   {PIPE_FORMAT_R16G16B16_SNORM, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x8001, 0x0000), UNPACKED_1x1(   0.0,   -1.0,    0.0,    1.0)},
+   {PIPE_FORMAT_R16G16B16_SNORM, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x0000, 0x7fff), UNPACKED_1x1(   0.0,    0.0,    1.0,    1.0)},
+   {PIPE_FORMAT_R16G16B16_SNORM, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x0000, 0x8001), UNPACKED_1x1(   0.0,    0.0,   -1.0,    1.0)},
+
+   {PIPE_FORMAT_R16G16B16A16_SNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x0000, 0x0000), UNPACKED_1x1(   0.0,    0.0,    0.0,    0.0)},
+   {PIPE_FORMAT_R16G16B16A16_SNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x7fff, 0x0000, 0x0000, 0x0000), UNPACKED_1x1(   1.0,    0.0,    0.0,    0.0)},
+   {PIPE_FORMAT_R16G16B16A16_SNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x8001, 0x0000, 0x0000, 0x0000), UNPACKED_1x1(  -1.0,    0.0,    0.0,    0.0)},
+   {PIPE_FORMAT_R16G16B16A16_SNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x7fff, 0x0000, 0x0000), UNPACKED_1x1(   0.0,    1.0,    0.0,    0.0)},
+   {PIPE_FORMAT_R16G16B16A16_SNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x8001, 0x0000, 0x0000), UNPACKED_1x1(   0.0,   -1.0,    0.0,    0.0)},
+   {PIPE_FORMAT_R16G16B16A16_SNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x7fff, 0x0000), UNPACKED_1x1(   0.0,    0.0,    1.0,    0.0)},
+   {PIPE_FORMAT_R16G16B16A16_SNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x8001, 0x0000), UNPACKED_1x1(   0.0,    0.0,   -1.0,    0.0)},
+   {PIPE_FORMAT_R16G16B16A16_SNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x0000, 0x7fff), UNPACKED_1x1(   0.0,    0.0,    0.0,    1.0)},
+   {PIPE_FORMAT_R16G16B16A16_SNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x0000, 0x8001), UNPACKED_1x1(   0.0,    0.0,    0.0,   -1.0)},
+
+   {PIPE_FORMAT_R16_SSCALED, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1(     0.0,      0.0,      0.0,   1.0)},
+   {PIPE_FORMAT_R16_SSCALED, PACKED_1x16(0xffff), PACKED_1x16(0x7fff), UNPACKED_1x1( 32767.0,      0.0,      0.0,   1.0)},
+   {PIPE_FORMAT_R16_SSCALED, PACKED_1x16(0xffff), PACKED_1x16(0x8000), UNPACKED_1x1(-32768.0,      0.0,      0.0,   1.0)},
+
+   {PIPE_FORMAT_R16G16_SSCALED, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x0000, 0x0000), UNPACKED_1x1(     0.0,      0.0,      0.0,   1.0)},
+   {PIPE_FORMAT_R16G16_SSCALED, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x7fff, 0x0000), UNPACKED_1x1( 32767.0,      0.0,      0.0,   1.0)},
+   {PIPE_FORMAT_R16G16_SSCALED, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x8000, 0x0000), UNPACKED_1x1(-32768.0,      0.0,      0.0,   1.0)},
+   {PIPE_FORMAT_R16G16_SSCALED, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x0000, 0x7fff), UNPACKED_1x1(     0.0,  32767.0,      0.0,   1.0)},
+   {PIPE_FORMAT_R16G16_SSCALED, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x0000, 0x8000), UNPACKED_1x1(     0.0, -32768.0,      0.0,   1.0)},
+
+   {PIPE_FORMAT_R16G16B16_SSCALED, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x0000, 0x0000), UNPACKED_1x1(     0.0,      0.0,      0.0,   1.0)},
+   {PIPE_FORMAT_R16G16B16_SSCALED, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x7fff, 0x0000, 0x0000), UNPACKED_1x1( 32767.0,      0.0,      0.0,   1.0)},
+   {PIPE_FORMAT_R16G16B16_SSCALED, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x8000, 0x0000, 0x0000), UNPACKED_1x1(-32768.0,      0.0,      0.0,   1.0)},
+   {PIPE_FORMAT_R16G16B16_SSCALED, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x7fff, 0x0000), UNPACKED_1x1(     0.0,  32767.0,      0.0,   1.0)},
+   {PIPE_FORMAT_R16G16B16_SSCALED, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x8000, 0x0000), UNPACKED_1x1(     0.0, -32768.0,      0.0,   1.0)},
+   {PIPE_FORMAT_R16G16B16_SSCALED, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x0000, 0x7fff), UNPACKED_1x1(     0.0,      0.0,  32767.0,   1.0)},
+   {PIPE_FORMAT_R16G16B16_SSCALED, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x0000, 0x8000), UNPACKED_1x1(     0.0,      0.0, -32768.0,   1.0)},
+
+   {PIPE_FORMAT_R16G16B16A16_SSCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x0000, 0x0000), UNPACKED_1x1(     0.0,      0.0,      0.0,      0.0)},
+   {PIPE_FORMAT_R16G16B16A16_SSCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x7fff, 0x0000, 0x0000, 0x0000), UNPACKED_1x1( 32767.0,      0.0,      0.0,      0.0)},
+   {PIPE_FORMAT_R16G16B16A16_SSCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x8000, 0x0000, 0x0000, 0x0000), UNPACKED_1x1(-32768.0,      0.0,      0.0,      0.0)},
+   {PIPE_FORMAT_R16G16B16A16_SSCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x7fff, 0x0000, 0x0000), UNPACKED_1x1(     0.0,  32767.0,      0.0,      0.0)},
+   {PIPE_FORMAT_R16G16B16A16_SSCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x8000, 0x0000, 0x0000), UNPACKED_1x1(     0.0, -32768.0,      0.0,      0.0)},
+   {PIPE_FORMAT_R16G16B16A16_SSCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x7fff, 0x0000), UNPACKED_1x1(     0.0,      0.0,  32767.0,      0.0)},
+   {PIPE_FORMAT_R16G16B16A16_SSCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x8000, 0x0000), UNPACKED_1x1(     0.0,      0.0, -32768.0,      0.0)},
+   {PIPE_FORMAT_R16G16B16A16_SSCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x0000, 0x7fff), UNPACKED_1x1(     0.0,      0.0,      0.0,  32767.0)},
+   {PIPE_FORMAT_R16G16B16A16_SSCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x0000, 0x8000), UNPACKED_1x1(     0.0,      0.0,      0.0, -32768.0)},
+
+   /*
+    * Standard 32-bit integer formats
+    *
+    * NOTE: We can't accurately represent integers larger than +/-0x1000000
+    * with single precision floats, so that's as far as we test.
+    */
+
+   {PIPE_FORMAT_R32_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R32_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)},
+
+   {PIPE_FORMAT_R32G32_UNORM, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R32G32_UNORM, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0xffffffff, 0x00000000), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R32G32_UNORM, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0xffffffff), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R32G32_UNORM, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0xffffffff, 0xffffffff), UNPACKED_1x1(1.0, 1.0, 0.0, 1.0)},
+
+   {PIPE_FORMAT_R32G32B32_UNORM, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R32G32B32_UNORM, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0xffffffff, 0x00000000, 0x00000000), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R32G32B32_UNORM, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0xffffffff, 0x00000000), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R32G32B32_UNORM, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0xffffffff), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)},
+   {PIPE_FORMAT_R32G32B32_UNORM, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_R32G32B32A32_UNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_R32G32B32A32_UNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0xffffffff, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_R32G32B32A32_UNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0xffffffff, 0x00000000, 0x00000000), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)},
+   {PIPE_FORMAT_R32G32B32A32_UNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0xffffffff, 0x00000000), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)},
+   {PIPE_FORMAT_R32G32B32A32_UNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0xffffffff), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R32G32B32A32_UNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)},
+
+   {PIPE_FORMAT_R32_USCALED, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(       0.0,        0.0,        0.0,   1.0)},
+   {PIPE_FORMAT_R32_USCALED, PACKED_1x32(0xffffffff), PACKED_1x32(0x01000000), UNPACKED_1x1(16777216.0,        0.0,        0.0,   1.0)},
+
+   {PIPE_FORMAT_R32G32_USCALED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0x00000000), UNPACKED_1x1(       0.0,        0.0,        0.0,   1.0)},
+   {PIPE_FORMAT_R32G32_USCALED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x01000000, 0x00000000), UNPACKED_1x1(16777216.0,        0.0,        0.0,   1.0)},
+   {PIPE_FORMAT_R32G32_USCALED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0x01000000), UNPACKED_1x1(       0.0, 16777216.0,        0.0,   1.0)},
+   {PIPE_FORMAT_R32G32_USCALED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x01000000, 0x01000000), UNPACKED_1x1(16777216.0, 16777216.0,        0.0,   1.0)},
+
+   {PIPE_FORMAT_R32G32B32_USCALED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1(       0.0,        0.0,        0.0,   1.0)},
+   {PIPE_FORMAT_R32G32B32_USCALED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x01000000, 0x00000000, 0x00000000), UNPACKED_1x1(16777216.0,        0.0,        0.0,   1.0)},
+   {PIPE_FORMAT_R32G32B32_USCALED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x01000000, 0x00000000), UNPACKED_1x1(       0.0, 16777216.0,        0.0,   1.0)},
+   {PIPE_FORMAT_R32G32B32_USCALED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0x01000000), UNPACKED_1x1(       0.0,        0.0, 16777216.0,   1.0)},
+   {PIPE_FORMAT_R32G32B32_USCALED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x01000000, 0x01000000, 0x01000000), UNPACKED_1x1(16777216.0, 16777216.0, 16777216.0,   1.0)},
+
+   {PIPE_FORMAT_R32G32B32A32_USCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1(       0.0,        0.0,        0.0,        0.0)},
+   {PIPE_FORMAT_R32G32B32A32_USCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x01000000, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1(16777216.0,        0.0,        0.0,        0.0)},
+   {PIPE_FORMAT_R32G32B32A32_USCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x01000000, 0x00000000, 0x00000000), UNPACKED_1x1(       0.0, 16777216.0,        0.0,        0.0)},
+   {PIPE_FORMAT_R32G32B32A32_USCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x01000000, 0x00000000), UNPACKED_1x1(       0.0,        0.0, 16777216.0,        0.0)},
+   {PIPE_FORMAT_R32G32B32A32_USCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0x01000000), UNPACKED_1x1(       0.0,        0.0,        0.0, 16777216.0)},
+   {PIPE_FORMAT_R32G32B32A32_USCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x01000000, 0x01000000, 0x01000000, 0x01000000), UNPACKED_1x1(16777216.0, 16777216.0, 16777216.0, 16777216.0)},
+
+   {PIPE_FORMAT_R32_SNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(   0.0,    0.0,    0.0,    1.0)},
+   {PIPE_FORMAT_R32_SNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x7fffffff), UNPACKED_1x1(   1.0,    0.0,    0.0,    1.0)},
+   {PIPE_FORMAT_R32_SNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x80000001), UNPACKED_1x1(  -1.0,    0.0,    0.0,    1.0)},
+
+   {PIPE_FORMAT_R32G32_SNORM, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0x00000000), UNPACKED_1x1(   0.0,    0.0,    0.0,    1.0)},
+   {PIPE_FORMAT_R32G32_SNORM, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x7fffffff, 0x00000000), UNPACKED_1x1(   1.0,    0.0,    0.0,    1.0)},
+   {PIPE_FORMAT_R32G32_SNORM, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x80000001, 0x00000000), UNPACKED_1x1(  -1.0,    0.0,    0.0,    1.0)},
+   {PIPE_FORMAT_R32G32_SNORM, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0x7fffffff), UNPACKED_1x1(   0.0,    1.0,    0.0,    1.0)},
+   {PIPE_FORMAT_R32G32_SNORM, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0x80000001), UNPACKED_1x1(   0.0,   -1.0,    0.0,    1.0)},
+
+   {PIPE_FORMAT_R32G32B32_SNORM, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1(   0.0,    0.0,    0.0,    1.0)},
+   {PIPE_FORMAT_R32G32B32_SNORM, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x7fffffff, 0x00000000, 0x00000000), UNPACKED_1x1(   1.0,    0.0,    0.0,    1.0)},
+   {PIPE_FORMAT_R32G32B32_SNORM, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x80000001, 0x00000000, 0x00000000), UNPACKED_1x1(  -1.0,    0.0,    0.0,    1.0)},
+   {PIPE_FORMAT_R32G32B32_SNORM, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x7fffffff, 0x00000000), UNPACKED_1x1(   0.0,    1.0,    0.0,    1.0)},
+   {PIPE_FORMAT_R32G32B32_SNORM, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x80000001, 0x00000000), UNPACKED_1x1(   0.0,   -1.0,    0.0,    1.0)},
+   {PIPE_FORMAT_R32G32B32_SNORM, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0x7fffffff), UNPACKED_1x1(   0.0,    0.0,    1.0,    1.0)},
+   {PIPE_FORMAT_R32G32B32_SNORM, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0x80000001), UNPACKED_1x1(   0.0,    0.0,   -1.0,    1.0)},
+
+   {PIPE_FORMAT_R32G32B32A32_SNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1(   0.0,    0.0,    0.0,    0.0)},
+   {PIPE_FORMAT_R32G32B32A32_SNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x7fffffff, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1(   1.0,    0.0,    0.0,    0.0)},
+   {PIPE_FORMAT_R32G32B32A32_SNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x80000001, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1(  -1.0,    0.0,    0.0,    0.0)},
+   {PIPE_FORMAT_R32G32B32A32_SNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x7fffffff, 0x00000000, 0x00000000), UNPACKED_1x1(   0.0,    1.0,    0.0,    0.0)},
+   {PIPE_FORMAT_R32G32B32A32_SNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x80000001, 0x00000000, 0x00000000), UNPACKED_1x1(   0.0,   -1.0,    0.0,    0.0)},
+   {PIPE_FORMAT_R32G32B32A32_SNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x7fffffff, 0x00000000), UNPACKED_1x1(   0.0,    0.0,    1.0,    0.0)},
+   {PIPE_FORMAT_R32G32B32A32_SNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x80000001, 0x00000000), UNPACKED_1x1(   0.0,    0.0,   -1.0,    0.0)},
+   {PIPE_FORMAT_R32G32B32A32_SNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0x7fffffff), UNPACKED_1x1(   0.0,    0.0,    0.0,    1.0)},
+   {PIPE_FORMAT_R32G32B32A32_SNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0x80000001), UNPACKED_1x1(   0.0,    0.0,    0.0,   -1.0)},
+
+   {PIPE_FORMAT_R32_SSCALED, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(        0.0,         0.0,         0.0,   1.0)},
+   {PIPE_FORMAT_R32_SSCALED, PACKED_1x32(0xffffffff), PACKED_1x32(0x01000000), UNPACKED_1x1( 16777216.0,         0.0,         0.0,   1.0)},
+   {PIPE_FORMAT_R32_SSCALED, PACKED_1x32(0xffffffff), PACKED_1x32(0xff000000), UNPACKED_1x1(-16777216.0,         0.0,         0.0,   1.0)},
+
+   {PIPE_FORMAT_R32G32_SSCALED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0x00000000), UNPACKED_1x1(        0.0,         0.0,         0.0,   1.0)},
+   {PIPE_FORMAT_R32G32_SSCALED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x01000000, 0x00000000), UNPACKED_1x1( 16777216.0,         0.0,         0.0,   1.0)},
+   {PIPE_FORMAT_R32G32_SSCALED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0xff000000, 0x00000000), UNPACKED_1x1(-16777216.0,         0.0,         0.0,   1.0)},
+   {PIPE_FORMAT_R32G32_SSCALED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0x01000000), UNPACKED_1x1(        0.0,  16777216.0,         0.0,   1.0)},
+   {PIPE_FORMAT_R32G32_SSCALED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0xff000000), UNPACKED_1x1(        0.0, -16777216.0,         0.0,   1.0)},
+
+   {PIPE_FORMAT_R32G32B32_SSCALED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1(        0.0,         0.0,         0.0,   1.0)},
+   {PIPE_FORMAT_R32G32B32_SSCALED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x01000000, 0x00000000, 0x00000000), UNPACKED_1x1( 16777216.0,         0.0,         0.0,   1.0)},
+   {PIPE_FORMAT_R32G32B32_SSCALED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0xff000000, 0x00000000, 0x00000000), UNPACKED_1x1(-16777216.0,         0.0,         0.0,   1.0)},
+   {PIPE_FORMAT_R32G32B32_SSCALED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x01000000, 0x00000000), UNPACKED_1x1(        0.0,  16777216.0,         0.0,   1.0)},
+   {PIPE_FORMAT_R32G32B32_SSCALED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0xff000000, 0x00000000), UNPACKED_1x1(        0.0, -16777216.0,         0.0,   1.0)},
+   {PIPE_FORMAT_R32G32B32_SSCALED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0x01000000), UNPACKED_1x1(        0.0,         0.0,  16777216.0,   1.0)},
+   {PIPE_FORMAT_R32G32B32_SSCALED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0xff000000), UNPACKED_1x1(        0.0,         0.0, -16777216.0,   1.0)},
+
+   {PIPE_FORMAT_R32G32B32A32_SSCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1(        0.0,         0.0,         0.0,         0.0)},
+   {PIPE_FORMAT_R32G32B32A32_SSCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x01000000, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1( 16777216.0,         0.0,         0.0,         0.0)},
+   {PIPE_FORMAT_R32G32B32A32_SSCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0xff000000, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1(-16777216.0,         0.0,         0.0,         0.0)},
+   {PIPE_FORMAT_R32G32B32A32_SSCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x01000000, 0x00000000, 0x00000000), UNPACKED_1x1(        0.0,  16777216.0,         0.0,         0.0)},
+   {PIPE_FORMAT_R32G32B32A32_SSCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0xff000000, 0x00000000, 0x00000000), UNPACKED_1x1(        0.0, -16777216.0,         0.0,         0.0)},
+   {PIPE_FORMAT_R32G32B32A32_SSCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x01000000, 0x00000000), UNPACKED_1x1(        0.0,         0.0,  16777216.0,         0.0)},
+   {PIPE_FORMAT_R32G32B32A32_SSCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0xff000000, 0x00000000), UNPACKED_1x1(        0.0,         0.0, -16777216.0,         0.0)},
+   {PIPE_FORMAT_R32G32B32A32_SSCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0x01000000), UNPACKED_1x1(        0.0,         0.0,         0.0,  16777216.0)},
+   {PIPE_FORMAT_R32G32B32A32_SSCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0xff000000), UNPACKED_1x1(        0.0,         0.0,         0.0, -16777216.0)},
+
+   /*
+    * Standard 32-bit float formats
+    */
+
+   {PIPE_FORMAT_R32_FLOAT, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(  0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R32_FLOAT, PACKED_1x32(0xffffffff), PACKED_1x32(0x3f800000), UNPACKED_1x1(  1.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R32_FLOAT, PACKED_1x32(0xffffffff), PACKED_1x32(0xbf800000), UNPACKED_1x1( -1.0, 0.0, 0.0, 1.0)},
+
+   {PIPE_FORMAT_R32G32_FLOAT, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0x00000000), UNPACKED_1x1( 0.0,  0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R32G32_FLOAT, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x3f800000, 0x00000000), UNPACKED_1x1( 1.0,  0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R32G32_FLOAT, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0xbf800000, 0x00000000), UNPACKED_1x1(-1.0,  0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R32G32_FLOAT, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0x3f800000), UNPACKED_1x1( 0.0,  1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R32G32_FLOAT, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0xbf800000), UNPACKED_1x1( 0.0, -1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R32G32_FLOAT, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x3f800000, 0x3f800000), UNPACKED_1x1( 1.0,  1.0, 0.0, 1.0)},
+
+   {PIPE_FORMAT_R32G32B32_FLOAT, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1( 0.0,  0.0,  0.0, 1.0)},
+   {PIPE_FORMAT_R32G32B32_FLOAT, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x3f800000, 0x00000000, 0x00000000), UNPACKED_1x1( 1.0,  0.0,  0.0, 1.0)},
+   {PIPE_FORMAT_R32G32B32_FLOAT, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0xbf800000, 0x00000000, 0x00000000), UNPACKED_1x1(-1.0,  0.0,  0.0, 1.0)},
+   {PIPE_FORMAT_R32G32B32_FLOAT, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x3f800000, 0x00000000), UNPACKED_1x1( 0.0,  1.0,  0.0, 1.0)},
+   {PIPE_FORMAT_R32G32B32_FLOAT, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0xbf800000, 0x00000000), UNPACKED_1x1( 0.0, -1.0,  0.0, 1.0)},
+   {PIPE_FORMAT_R32G32B32_FLOAT, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0x3f800000), UNPACKED_1x1( 0.0,  0.0,  1.0, 1.0)},
+   {PIPE_FORMAT_R32G32B32_FLOAT, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0xbf800000), UNPACKED_1x1( 0.0,  0.0, -1.0, 1.0)},
+   {PIPE_FORMAT_R32G32B32_FLOAT, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x3f800000, 0x3f800000, 0x3f800000), UNPACKED_1x1( 1.0,  1.0,  1.0, 1.0)},
+
+   {PIPE_FORMAT_R32G32B32A32_FLOAT, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1( 0.0,  0.0,  0.0,  0.0)},
+   {PIPE_FORMAT_R32G32B32A32_FLOAT, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x3f800000, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1( 1.0,  0.0,  0.0,  0.0)},
+   {PIPE_FORMAT_R32G32B32A32_FLOAT, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0xbf800000, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1(-1.0,  0.0,  0.0,  0.0)},
+   {PIPE_FORMAT_R32G32B32A32_FLOAT, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x3f800000, 0x00000000, 0x00000000), UNPACKED_1x1( 0.0,  1.0,  0.0,  0.0)},
+   {PIPE_FORMAT_R32G32B32A32_FLOAT, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0xbf800000, 0x00000000, 0x00000000), UNPACKED_1x1( 0.0, -1.0,  0.0,  0.0)},
+   {PIPE_FORMAT_R32G32B32A32_FLOAT, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x3f800000, 0x00000000), UNPACKED_1x1( 0.0,  0.0,  1.0,  0.0)},
+   {PIPE_FORMAT_R32G32B32A32_FLOAT, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0xbf800000, 0x00000000), UNPACKED_1x1( 0.0,  0.0, -1.0,  0.0)},
+   {PIPE_FORMAT_R32G32B32A32_FLOAT, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0x3f800000), UNPACKED_1x1( 0.0,  0.0,  0.0,  1.0)},
+   {PIPE_FORMAT_R32G32B32A32_FLOAT, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0xbf800000), UNPACKED_1x1( 0.0,  0.0,  0.0, -1.0)},
+   {PIPE_FORMAT_R32G32B32A32_FLOAT, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000), UNPACKED_1x1( 1.0,  1.0,  1.0,  1.0)},
+
+   /*
+    * Half float formats
+    */
+
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1(  0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x3c00), UNPACKED_1x1(  1.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0xbc00), UNPACKED_1x1( -1.0, 0.0, 0.0, 1.0)},
+
+   {PIPE_FORMAT_R16G16_FLOAT, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x0000, 0x0000), UNPACKED_1x1( 0.0,  0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R16G16_FLOAT, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x3c00, 0x0000), UNPACKED_1x1( 1.0,  0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R16G16_FLOAT, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0xbc00, 0x0000), UNPACKED_1x1(-1.0,  0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R16G16_FLOAT, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x0000, 0x3c00), UNPACKED_1x1( 0.0,  1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R16G16_FLOAT, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x0000, 0xbc00), UNPACKED_1x1( 0.0, -1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R16G16_FLOAT, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x3c00, 0x3c00), UNPACKED_1x1( 1.0,  1.0, 0.0, 1.0)},
+
+   {PIPE_FORMAT_R16G16B16_FLOAT, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x0000, 0x0000), UNPACKED_1x1( 0.0,  0.0,  0.0, 1.0)},
+   {PIPE_FORMAT_R16G16B16_FLOAT, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x3c00, 0x0000, 0x0000), UNPACKED_1x1( 1.0,  0.0,  0.0, 1.0)},
+   {PIPE_FORMAT_R16G16B16_FLOAT, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0xbc00, 0x0000, 0x0000), UNPACKED_1x1(-1.0,  0.0,  0.0, 1.0)},
+   {PIPE_FORMAT_R16G16B16_FLOAT, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x3c00, 0x0000), UNPACKED_1x1( 0.0,  1.0,  0.0, 1.0)},
+   {PIPE_FORMAT_R16G16B16_FLOAT, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0xbc00, 0x0000), UNPACKED_1x1( 0.0, -1.0,  0.0, 1.0)},
+   {PIPE_FORMAT_R16G16B16_FLOAT, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x0000, 0x3c00), UNPACKED_1x1( 0.0,  0.0,  1.0, 1.0)},
+   {PIPE_FORMAT_R16G16B16_FLOAT, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x0000, 0xbc00), UNPACKED_1x1( 0.0,  0.0, -1.0, 1.0)},
+   {PIPE_FORMAT_R16G16B16_FLOAT, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x3c00, 0x3c00, 0x3c00), UNPACKED_1x1( 1.0,  1.0,  1.0, 1.0)},
+
+   {PIPE_FORMAT_R16G16B16A16_FLOAT, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x0000, 0x0000), UNPACKED_1x1( 0.0,  0.0,  0.0,  0.0)},
+   {PIPE_FORMAT_R16G16B16A16_FLOAT, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x3c00, 0x0000, 0x0000, 0x0000), UNPACKED_1x1( 1.0,  0.0,  0.0,  0.0)},
+   {PIPE_FORMAT_R16G16B16A16_FLOAT, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0xbc00, 0x0000, 0x0000, 0x0000), UNPACKED_1x1(-1.0,  0.0,  0.0,  0.0)},
+   {PIPE_FORMAT_R16G16B16A16_FLOAT, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x3c00, 0x0000, 0x0000), UNPACKED_1x1( 0.0,  1.0,  0.0,  0.0)},
+   {PIPE_FORMAT_R16G16B16A16_FLOAT, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0xbc00, 0x0000, 0x0000), UNPACKED_1x1( 0.0, -1.0,  0.0,  0.0)},
+   {PIPE_FORMAT_R16G16B16A16_FLOAT, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x3c00, 0x0000), UNPACKED_1x1( 0.0,  0.0,  1.0,  0.0)},
+   {PIPE_FORMAT_R16G16B16A16_FLOAT, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0xbc00, 0x0000), UNPACKED_1x1( 0.0,  0.0, -1.0,  0.0)},
+   {PIPE_FORMAT_R16G16B16A16_FLOAT, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x0000, 0x3c00), UNPACKED_1x1( 0.0,  0.0,  0.0,  1.0)},
+   {PIPE_FORMAT_R16G16B16A16_FLOAT, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x0000, 0xbc00), UNPACKED_1x1( 0.0,  0.0,  0.0, -1.0)},
+   {PIPE_FORMAT_R16G16B16A16_FLOAT, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x3c00, 0x3c00, 0x3c00, 0x3c00), UNPACKED_1x1( 1.0,  1.0,  1.0,  1.0)},
+
+   /*
+    * 32-bit fixed point formats
+    */
+
+   {PIPE_FORMAT_R32_FIXED, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(  0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R32_FIXED, PACKED_1x32(0xffffffff), PACKED_1x32(0x00010000), UNPACKED_1x1(  1.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R32_FIXED, PACKED_1x32(0xffffffff), PACKED_1x32(0xffff0000), UNPACKED_1x1( -1.0, 0.0, 0.0, 1.0)},
+
+   {PIPE_FORMAT_R32G32_FIXED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0x00000000), UNPACKED_1x1( 0.0,  0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R32G32_FIXED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00010000, 0x00000000), UNPACKED_1x1( 1.0,  0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R32G32_FIXED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0xffff0000, 0x00000000), UNPACKED_1x1(-1.0,  0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R32G32_FIXED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0x00010000), UNPACKED_1x1( 0.0,  1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R32G32_FIXED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0xffff0000), UNPACKED_1x1( 0.0, -1.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R32G32_FIXED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00010000, 0x00010000), UNPACKED_1x1( 1.0,  1.0, 0.0, 1.0)},
+
+   {PIPE_FORMAT_R32G32B32_FIXED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1( 0.0,  0.0,  0.0, 1.0)},
+   {PIPE_FORMAT_R32G32B32_FIXED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00010000, 0x00000000, 0x00000000), UNPACKED_1x1( 1.0,  0.0,  0.0, 1.0)},
+   {PIPE_FORMAT_R32G32B32_FIXED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0xffff0000, 0x00000000, 0x00000000), UNPACKED_1x1(-1.0,  0.0,  0.0, 1.0)},
+   {PIPE_FORMAT_R32G32B32_FIXED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00010000, 0x00000000), UNPACKED_1x1( 0.0,  1.0,  0.0, 1.0)},
+   {PIPE_FORMAT_R32G32B32_FIXED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0xffff0000, 0x00000000), UNPACKED_1x1( 0.0, -1.0,  0.0, 1.0)},
+   {PIPE_FORMAT_R32G32B32_FIXED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0x00010000), UNPACKED_1x1( 0.0,  0.0,  1.0, 1.0)},
+   {PIPE_FORMAT_R32G32B32_FIXED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0xffff0000), UNPACKED_1x1( 0.0,  0.0, -1.0, 1.0)},
+   {PIPE_FORMAT_R32G32B32_FIXED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00010000, 0x00010000, 0x00010000), UNPACKED_1x1( 1.0,  1.0,  1.0, 1.0)},
+
+   {PIPE_FORMAT_R32G32B32A32_FIXED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1( 0.0,  0.0,  0.0,  0.0)},
+   {PIPE_FORMAT_R32G32B32A32_FIXED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00010000, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1( 1.0,  0.0,  0.0,  0.0)},
+   {PIPE_FORMAT_R32G32B32A32_FIXED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0xffff0000, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1(-1.0,  0.0,  0.0,  0.0)},
+   {PIPE_FORMAT_R32G32B32A32_FIXED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00010000, 0x00000000, 0x00000000), UNPACKED_1x1( 0.0,  1.0,  0.0,  0.0)},
+   {PIPE_FORMAT_R32G32B32A32_FIXED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0xffff0000, 0x00000000, 0x00000000), UNPACKED_1x1( 0.0, -1.0,  0.0,  0.0)},
+   {PIPE_FORMAT_R32G32B32A32_FIXED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00010000, 0x00000000), UNPACKED_1x1( 0.0,  0.0,  1.0,  0.0)},
+   {PIPE_FORMAT_R32G32B32A32_FIXED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0xffff0000, 0x00000000), UNPACKED_1x1( 0.0,  0.0, -1.0,  0.0)},
+   {PIPE_FORMAT_R32G32B32A32_FIXED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0x00010000), UNPACKED_1x1( 0.0,  0.0,  0.0,  1.0)},
+   {PIPE_FORMAT_R32G32B32A32_FIXED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0xffff0000), UNPACKED_1x1( 0.0,  0.0,  0.0, -1.0)},
+   {PIPE_FORMAT_R32G32B32A32_FIXED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00010000, 0x00010000, 0x00010000, 0x00010000), UNPACKED_1x1( 1.0,  1.0,  1.0,  1.0)},
+
+   /*
+    * D3D9 specific vertex formats
+    */
+
+   {PIPE_FORMAT_R10G10B10X2_USCALED, PACKED_1x32(0x3fffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(   0.0,    0.0,    0.0, 1.0)},
+   {PIPE_FORMAT_R10G10B10X2_USCALED, PACKED_1x32(0x3fffffff), PACKED_1x32(0x000003ff), UNPACKED_1x1(1023.0,    0.0,    0.0, 1.0)},
+   {PIPE_FORMAT_R10G10B10X2_USCALED, PACKED_1x32(0x3fffffff), PACKED_1x32(0x000ffc00), UNPACKED_1x1(   0.0, 1023.0,    0.0, 1.0)},
+   {PIPE_FORMAT_R10G10B10X2_USCALED, PACKED_1x32(0x3fffffff), PACKED_1x32(0x3ff00000), UNPACKED_1x1(   0.0,    0.0, 1023.0, 1.0)},
+   {PIPE_FORMAT_R10G10B10X2_USCALED, PACKED_1x32(0x3fffffff), PACKED_1x32(0x3fffffff), UNPACKED_1x1(1023.0, 1023.0, 1023.0, 1.0)},
+
+   {PIPE_FORMAT_R10G10B10X2_SNORM, PACKED_1x32(0x3fffffff), PACKED_1x32(0x00000000), UNPACKED_1x1( 0.0,  0.0,  0.0, 1.0)},
+   {PIPE_FORMAT_R10G10B10X2_SNORM, PACKED_1x32(0x3fffffff), PACKED_1x32(0x000001ff), UNPACKED_1x1( 1.0,  0.0,  0.0, 1.0)},
+   {PIPE_FORMAT_R10G10B10X2_SNORM, PACKED_1x32(0x3fffffff), PACKED_1x32(0x00000201), UNPACKED_1x1(-1.0,  0.0,  0.0, 1.0)},
+   {PIPE_FORMAT_R10G10B10X2_SNORM, PACKED_1x32(0x3fffffff), PACKED_1x32(0x0007fc00), UNPACKED_1x1( 0.0,  1.0,  0.0, 1.0)},
+   {PIPE_FORMAT_R10G10B10X2_SNORM, PACKED_1x32(0x3fffffff), PACKED_1x32(0x00080400), UNPACKED_1x1( 0.0, -1.0,  0.0, 1.0)},
+   {PIPE_FORMAT_R10G10B10X2_SNORM, PACKED_1x32(0x3fffffff), PACKED_1x32(0x1ff00000), UNPACKED_1x1( 0.0,  0.0,  1.0, 1.0)},
+   {PIPE_FORMAT_R10G10B10X2_SNORM, PACKED_1x32(0x3fffffff), PACKED_1x32(0x20100000), UNPACKED_1x1( 0.0,  0.0, -1.0, 1.0)},
+
+   /*
+    * Special formats that not fit anywhere else
+    */
+
+};
+
+
+const unsigned util_format_nr_test_cases = Elements(util_format_test_cases);
diff --git a/src/gallium/auxiliary/util/u_format_tests.h b/src/gallium/auxiliary/util/u_format_tests.h
new file mode 100644
index 0000000000..f59563f4f4
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_format_tests.h
@@ -0,0 +1,71 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#ifndef U_FORMAT_TESTS_H_
+#define U_FORMAT_TESTS_H_
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_format.h"
+
+
+#define UTIL_FORMAT_MAX_PACKED_BYTES 16
+#define UTIL_FORMAT_MAX_UNPACKED_WIDTH 4
+#define UTIL_FORMAT_MAX_UNPACKED_HEIGHT 4
+
+
+/**
+ * A (packed, unpacked) color pair.
+ */
+struct util_format_test_case
+{
+   enum pipe_format format;
+
+   /**
+    * Mask of the bits that actually meaningful data. Used to mask out the
+    * "X" channels.
+    */
+   uint8_t mask[UTIL_FORMAT_MAX_PACKED_BYTES];
+
+   uint8_t packed[UTIL_FORMAT_MAX_PACKED_BYTES];
+
+   /**
+    * RGBA.
+    */
+   double unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH][4];
+};
+
+
+extern const struct util_format_test_case
+util_format_test_cases[];
+
+
+extern const unsigned util_format_nr_test_cases;
+
+
+#endif /* U_FORMAT_TESTS_H_ */
diff --git a/src/gallium/auxiliary/util/u_format_yuv.c b/src/gallium/auxiliary/util/u_format_yuv.c
new file mode 100644
index 0000000000..ab8bf29c97
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_format_yuv.c
@@ -0,0 +1,1047 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * YUV and RGB subsampled formats conversion.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include "util/u_format_yuv.h"
+
+
+void
+util_format_r8g8_b8g8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride,
+                                         const uint8_t *src_row, unsigned src_stride,
+                                         unsigned width, unsigned height)
+{
+   unsigned x, y;
+
+   for (y = 0; y < height; y += 1) {
+      float *dst = dst_row;
+      const uint32_t *src = (const uint32_t *)src_row;
+      uint32_t value;
+      float r, g0, g1, b;
+
+      for (x = 0; x + 1 < width; x += 2) {
+         value = *src++;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         r  = ubyte_to_float((value >>  0) & 0xff);
+         g0 = ubyte_to_float((value >>  8) & 0xff);
+         b  = ubyte_to_float((value >> 16) & 0xff);
+         g1 = ubyte_to_float((value >> 24) & 0xff);
+
+         dst[0] = r;    /* r */
+         dst[1] = g0;   /* g */
+         dst[2] = b;    /* b */
+         dst[3] = 1.0f; /* a */
+         dst += 4;
+
+         dst[0] = r;    /* r */
+         dst[1] = g1;   /* g */
+         dst[2] = b;    /* b */
+         dst[3] = 1.0f; /* a */
+         dst += 4;
+      }
+
+      if (x < width) {
+         value = *src;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         r  = ubyte_to_float((value >>  0) & 0xff);
+         g0 = ubyte_to_float((value >>  8) & 0xff);
+         b  = ubyte_to_float((value >> 16) & 0xff);
+         g1 = ubyte_to_float((value >> 24) & 0xff);
+
+         dst[0] = r;    /* r */
+         dst[1] = g0;   /* g */
+         dst[2] = b;    /* b */
+         dst[3] = 1.0f; /* a */
+      }
+
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+
+void
+util_format_r8g8_b8g8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                          const uint8_t *src_row, unsigned src_stride,
+                                          unsigned width, unsigned height)
+{
+   unsigned x, y;
+
+   for (y = 0; y < height; y += 1) {
+      uint8_t *dst = dst_row;
+      const uint32_t *src = (const uint32_t *)src_row;
+      uint32_t value;
+      uint8_t r, g0, g1, b;
+
+      for (x = 0; x + 1 < width; x += 2) {
+         value = *src++;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         r  = (value >>  0) & 0xff;
+         g0 = (value >>  8) & 0xff;
+         b  = (value >> 16) & 0xff;
+         g1 = (value >> 24) & 0xff;
+
+         dst[0] = r;    /* r */
+         dst[1] = g0;   /* g */
+         dst[2] = b;    /* b */
+         dst[3] = 0xff; /* a */
+         dst += 4;
+
+         dst[0] = r;    /* r */
+         dst[1] = g1;   /* g */
+         dst[2] = b;    /* b */
+         dst[3] = 0xff; /* a */
+         dst += 4;
+      }
+
+      if (x < width) {
+         value = *src;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         r  = (value >>  0) & 0xff;
+         g0 = (value >>  8) & 0xff;
+         b  = (value >> 16) & 0xff;
+         g1 = (value >> 24) & 0xff;
+
+         dst[0] = r;    /* r */
+         dst[1] = g0;   /* g */
+         dst[2] = b;    /* b */
+         dst[3] = 0xff; /* a */
+      }
+
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+
+void
+util_format_r8g8_b8g8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride,
+                                       const float *src_row, unsigned src_stride,
+                                       unsigned width, unsigned height)
+{
+   unsigned x, y;
+
+   for (y = 0; y < height; y += 1) {
+      const float *src = src_row;
+      uint32_t *dst = (uint32_t *)dst_row;
+      float r, g0, g1, b;
+      uint32_t value;
+
+      for (x = 0; x + 1 < width; x += 2) {
+         r  = 0.5f*(src[0] + src[4]);
+         g0 = src[1];
+         g1 = src[5];
+         b  = 0.5f*(src[2] + src[6]);
+
+         value  = float_to_ubyte(r);
+         value |= float_to_ubyte(g0) <<  8;
+         value |= float_to_ubyte(b)  << 16;
+         value |= float_to_ubyte(g1) << 24;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         *dst++ = value;
+
+         src += 8;
+      }
+
+      if (x < width) {
+         r  = src[0];
+         g0 = src[1];
+         g1 = 0;
+         b  = src[2];
+
+         value  = float_to_ubyte(r);
+         value |= float_to_ubyte(g0) <<  8;
+         value |= float_to_ubyte(b)  << 16;
+         value |= float_to_ubyte(g1) << 24;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         *dst = value;
+      }
+
+      dst_row += dst_stride/sizeof(*dst_row);
+      src_row += src_stride/sizeof(*src_row);
+   }
+}
+
+
+void
+util_format_r8g8_b8g8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                        const uint8_t *src_row, unsigned src_stride,
+                                        unsigned width, unsigned height)
+{
+   unsigned x, y;
+
+   for (y = 0; y < height; y += 1) {
+      const uint8_t *src = src_row;
+      uint32_t *dst = (uint32_t *)dst_row;
+      uint32_t r, g0, g1, b;
+      uint32_t value;
+
+      for (x = 0; x + 1 < width; x += 2) {
+         r  = (src[0] + src[4] + 1) >> 1;
+         g0 = src[1];
+         g1 = src[5];
+         b  = (src[2] + src[6] + 1) >> 1;
+
+         value  = r;
+         value |= g0 <<  8;
+         value |= b  << 16;
+         value |= g1 << 24;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         *dst++ = value;
+
+         src += 8;
+      }
+
+      if (x < width) {
+         r  = src[0];
+         g0 = src[1];
+         g1 = 0;
+         b  = src[2];
+
+         value  = r;
+         value |= g0 <<  8;
+         value |= b  << 16;
+         value |= g1 << 24;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         *dst = value;
+      }
+
+      dst_row += dst_stride/sizeof(*dst_row);
+      src_row += src_stride/sizeof(*src_row);
+   }
+}
+
+
+void
+util_format_r8g8_b8g8_unorm_fetch_rgba_float(float *dst, const uint8_t *src,
+                                        unsigned i, unsigned j)
+{
+   assert(i < 2);
+   assert(j < 1);
+
+   dst[0] = ubyte_to_float(src[0]);             /* r */
+   dst[1] = ubyte_to_float(src[1 + 2*i]);       /* g */
+   dst[2] = ubyte_to_float(src[2]);             /* b */
+   dst[3] = 1.0f;                               /* a */
+}
+
+
+void
+util_format_g8r8_g8b8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride,
+                                         const uint8_t *src_row, unsigned src_stride,
+                                         unsigned width, unsigned height)
+{
+   unsigned x, y;
+
+   for (y = 0; y < height; y += 1) {
+      float *dst = dst_row;
+      const uint32_t *src = (const uint32_t *)src_row;
+      uint32_t value;
+      float r, g0, g1, b;
+
+      for (x = 0; x + 1 < width; x += 2) {
+         value = *src++;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         g0 = ubyte_to_float((value >>  0) & 0xff);
+         r  = ubyte_to_float((value >>  8) & 0xff);
+         g1 = ubyte_to_float((value >> 16) & 0xff);
+         b  = ubyte_to_float((value >> 24) & 0xff);
+
+         dst[0] = r;    /* r */
+         dst[1] = g0;   /* g */
+         dst[2] = b;    /* b */
+         dst[3] = 1.0f; /* a */
+         dst += 4;
+
+         dst[0] = r;    /* r */
+         dst[1] = g1;   /* g */
+         dst[2] = b;    /* b */
+         dst[3] = 1.0f; /* a */
+         dst += 4;
+      }
+
+      if (x < width) {
+         value = *src;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         g0 = ubyte_to_float((value >>  0) & 0xff);
+         r  = ubyte_to_float((value >>  8) & 0xff);
+         g1 = ubyte_to_float((value >> 16) & 0xff);
+         b  = ubyte_to_float((value >> 24) & 0xff);
+
+         dst[0] = r;    /* r */
+         dst[1] = g0;   /* g */
+         dst[2] = b;    /* b */
+         dst[3] = 1.0f; /* a */
+      }
+
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+
+void
+util_format_g8r8_g8b8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                          const uint8_t *src_row, unsigned src_stride,
+                                          unsigned width, unsigned height)
+{
+   unsigned x, y;
+
+   for (y = 0; y < height; y += 1) {
+      uint8_t *dst = dst_row;
+      const uint32_t *src = (const uint32_t *)src_row;
+      uint32_t value;
+      uint8_t r, g0, g1, b;
+
+      for (x = 0; x + 1 < width; x += 2) {
+         value = *src++;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         g0 = (value >>  0) & 0xff;
+         r  = (value >>  8) & 0xff;
+         g1 = (value >> 16) & 0xff;
+         b  = (value >> 24) & 0xff;
+
+         dst[0] = r;    /* r */
+         dst[1] = g0;   /* g */
+         dst[2] = b;    /* b */
+         dst[3] = 0xff; /* a */
+         dst += 4;
+
+         dst[0] = r;    /* r */
+         dst[1] = g1;   /* g */
+         dst[2] = b;    /* b */
+         dst[3] = 0xff; /* a */
+         dst += 4;
+      }
+
+      if (x < width) {
+         value = *src;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         g0 = (value >>  0) & 0xff;
+         r  = (value >>  8) & 0xff;
+         g1 = (value >> 16) & 0xff;
+         b  = (value >> 24) & 0xff;
+
+         dst[0] = r;    /* r */
+         dst[1] = g0;   /* g */
+         dst[2] = b;    /* b */
+         dst[3] = 0xff; /* a */
+      }
+
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+
+void
+util_format_g8r8_g8b8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride,
+                                       const float *src_row, unsigned src_stride,
+                                       unsigned width, unsigned height)
+{
+   unsigned x, y;
+
+   for (y = 0; y < height; y += 1) {
+      const float *src = src_row;
+      uint32_t *dst = (uint32_t *)dst_row;
+      float r, g0, g1, b;
+      uint32_t value;
+
+      for (x = 0; x + 1 < width; x += 2) {
+         r  = 0.5f*(src[0] + src[4]);
+         g0 = src[1];
+         g1 = src[5];
+         b  = 0.5f*(src[2] + src[6]);
+
+         value  = float_to_ubyte(g0);
+         value |= float_to_ubyte(r)  <<  8;
+         value |= float_to_ubyte(g1) << 16;
+         value |= float_to_ubyte(b)  << 24;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         *dst++ = value;
+
+         src += 8;
+      }
+
+      if (x < width) {
+         r  = src[0];
+         g0 = src[1];
+         g1 = 0;
+         b  = src[2];
+
+         value  = float_to_ubyte(g0);
+         value |= float_to_ubyte(r)  <<  8;
+         value |= float_to_ubyte(g1) << 16;
+         value |= float_to_ubyte(b)  << 24;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         *dst = value;
+      }
+
+      dst_row += dst_stride/sizeof(*dst_row);
+      src_row += src_stride/sizeof(*src_row);
+   }
+}
+
+
+void
+util_format_g8r8_g8b8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                        const uint8_t *src_row, unsigned src_stride,
+                                        unsigned width, unsigned height)
+{
+   unsigned x, y;
+
+   for (y = 0; y < height; y += 1) {
+      const uint8_t *src = src_row;
+      uint32_t *dst = (uint32_t *)dst_row;
+      uint32_t r, g0, g1, b;
+      uint32_t value;
+
+      for (x = 0; x + 1 < width; x += 2) {
+         r  = (src[0] + src[4] + 1) >> 1;
+         g0 = src[1];
+         g1 = src[5];
+         b  = (src[2] + src[6] + 1) >> 1;
+
+         value  = g0;
+         value |= r  <<  8;
+         value |= g1 << 16;
+         value |= b  << 24;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         *dst++ = value;
+
+         src += 8;
+      }
+
+      if (x < width) {
+         r  = src[0];
+         g0 = src[1];
+         g1 = 0;
+         b  = src[2];
+
+         value  = g0;
+         value |= r  <<  8;
+         value |= g1 << 16;
+         value |= b  << 24;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         *dst = value;
+      }
+
+      dst_row += dst_stride/sizeof(*dst_row);
+      src_row += src_stride/sizeof(*src_row);
+   }
+}
+
+
+void
+util_format_g8r8_g8b8_unorm_fetch_rgba_float(float *dst, const uint8_t *src,
+                                        unsigned i, unsigned j)
+{
+   assert(i < 2);
+   assert(j < 1);
+
+   dst[0] = ubyte_to_float(src[1]);             /* r */
+   dst[1] = ubyte_to_float(src[0 + 2*i]);       /* g */
+   dst[2] = ubyte_to_float(src[3]);             /* b */
+   dst[3] = 1.0f;                               /* a */
+}
+
+
+void
+util_format_uyvy_unpack_rgba_float(float *dst_row, unsigned dst_stride,
+                              const uint8_t *src_row, unsigned src_stride,
+                              unsigned width, unsigned height)
+{
+   unsigned x, y;
+
+   for (y = 0; y < height; y += 1) {
+      float *dst = dst_row;
+      const uint32_t *src = (const uint32_t *)src_row;
+      uint32_t value;
+      uint8_t y0, y1, u, v;
+
+      for (x = 0; x + 1 < width; x += 2) {
+         value = *src++;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         u  = (value >>  0) & 0xff;
+         y0 = (value >>  8) & 0xff;
+         v  = (value >> 16) & 0xff;
+         y1 = (value >> 24) & 0xff;
+
+         util_format_yuv_to_rgb_float(y0, u, v, &dst[0], &dst[1], &dst[2]);
+         dst[3] = 1.0f; /* a */
+         dst += 4;
+
+         util_format_yuv_to_rgb_float(y1, u, v, &dst[0], &dst[1], &dst[2]);
+         dst[3] = 1.0f; /* a */
+         dst += 4;
+      }
+
+      if (x < width) {
+         value = *src;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         u  = (value >>  0) & 0xff;
+         y0 = (value >>  8) & 0xff;
+         v  = (value >> 16) & 0xff;
+         y1 = (value >> 24) & 0xff;
+
+         util_format_yuv_to_rgb_float(y0, u, v, &dst[0], &dst[1], &dst[2]);
+         dst[3] = 1.0f; /* a */
+      }
+
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+
+void
+util_format_uyvy_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                               const uint8_t *src_row, unsigned src_stride,
+                               unsigned width, unsigned height)
+{
+   unsigned x, y;
+
+   for (y = 0; y < height; y += 1) {
+      uint8_t *dst = dst_row;
+      const uint32_t *src = (const uint32_t *)src_row;
+      uint32_t value;
+      uint8_t y0, y1, u, v;
+
+      for (x = 0; x + 1 < width; x += 2) {
+         value = *src++;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         u  = (value >>  0) & 0xff;
+         y0 = (value >>  8) & 0xff;
+         v  = (value >> 16) & 0xff;
+         y1 = (value >> 24) & 0xff;
+
+         util_format_yuv_to_rgb_8unorm(y0, u, v, &dst[0], &dst[1], &dst[2]);
+         dst[3] = 0xff; /* a */
+         dst += 4;
+
+         util_format_yuv_to_rgb_8unorm(y1, u, v, &dst[0], &dst[1], &dst[2]);
+         dst[3] = 0xff; /* a */
+         dst += 4;
+      }
+
+      if (x < width) {
+         value = *src;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         u  = (value >>  0) & 0xff;
+         y0 = (value >>  8) & 0xff;
+         v  = (value >> 16) & 0xff;
+         y1 = (value >> 24) & 0xff;
+
+         util_format_yuv_to_rgb_8unorm(y0, u, v, &dst[0], &dst[1], &dst[2]);
+         dst[3] = 0xff; /* a */
+      }
+
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+
+void
+util_format_uyvy_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride,
+                            const float *src_row, unsigned src_stride,
+                            unsigned width, unsigned height)
+{
+   unsigned x, y;
+
+   for (y = 0; y < height; y += 1) {
+      const float *src = src_row;
+      uint32_t *dst = (uint32_t *)dst_row;
+      uint8_t y0, y1, u, v;
+      uint32_t value;
+
+      for (x = 0; x + 1 < width; x += 2) {
+         uint8_t y0, y1, u0, u1, v0, v1, u, v;
+
+         util_format_rgb_float_to_yuv(src[0], src[1], src[2],
+                                      &y0, &u0, &v0);
+         util_format_rgb_float_to_yuv(src[4], src[5], src[6],
+                                      &y1, &u1, &v1);
+
+         u = (u0 + u1 + 1) >> 1;
+         v = (v0 + v1 + 1) >> 1;
+
+         value  = u;
+         value |= y0 <<  8;
+         value |= v  << 16;
+         value |= y1 << 24;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         *dst++ = value;
+
+         src += 8;
+      }
+
+      if (x < width) {
+         util_format_rgb_float_to_yuv(src[0], src[1], src[2],
+                                      &y0, &u, &v);
+         y1 = 0;
+
+         value  = u;
+         value |= y0 <<  8;
+         value |= v  << 16;
+         value |= y1 << 24;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         *dst = value;
+      }
+
+      dst_row += dst_stride/sizeof(*dst_row);
+      src_row += src_stride/sizeof(*src_row);
+   }
+}
+
+
+void
+util_format_uyvy_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                             const uint8_t *src_row, unsigned src_stride,
+                             unsigned width, unsigned height)
+{
+   unsigned x, y;
+
+   for (y = 0; y < height; y += 1) {
+      const uint8_t *src = src_row;
+      uint32_t *dst = (uint32_t *)dst_row;
+      uint8_t y0, y1, u, v;
+      uint32_t value;
+
+      for (x = 0; x + 1 < width; x += 2) {
+         uint8_t y0, y1, u0, u1, v0, v1, u, v;
+
+         util_format_rgb_8unorm_to_yuv(src[0], src[1], src[2],
+                                       &y0, &u0, &v0);
+         util_format_rgb_8unorm_to_yuv(src[4], src[5], src[6],
+                                       &y1, &u1, &v1);
+
+         u = (u0 + u1 + 1) >> 1;
+         v = (v0 + v1 + 1) >> 1;
+
+         value  = u;
+         value |= y0 <<  8;
+         value |= v  << 16;
+         value |= y1 << 24;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         *dst++ = value;
+
+         src += 8;
+      }
+
+      if (x < width) {
+         util_format_rgb_8unorm_to_yuv(src[0], src[1], src[2],
+                                       &y0, &u, &v);
+         y1 = 0;
+
+         value  = u;
+         value |= y0 <<  8;
+         value |= v  << 16;
+         value |= y1 << 24;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         *dst = value;
+      }
+
+      dst_row += dst_stride/sizeof(*dst_row);
+      src_row += src_stride/sizeof(*src_row);
+   }
+}
+
+
+void
+util_format_uyvy_fetch_rgba_float(float *dst, const uint8_t *src,
+                             unsigned i, unsigned j)
+{
+   uint8_t y, u, v;
+
+   assert(i < 2);
+   assert(j < 1);
+
+   y = src[1 + i*2];
+   u = src[0];
+   v = src[2];
+
+   util_format_yuv_to_rgb_float(y, u, v, &dst[0], &dst[1], &dst[2]);
+
+   dst[3] = 1.0f;
+}
+
+
+void
+util_format_yuyv_unpack_rgba_float(float *dst_row, unsigned dst_stride,
+                              const uint8_t *src_row, unsigned src_stride,
+                              unsigned width, unsigned height)
+{
+   unsigned x, y;
+
+   for (y = 0; y < height; y += 1) {
+      float *dst = dst_row;
+      const uint32_t *src = (const uint32_t *)src_row;
+      uint32_t value;
+      uint8_t y0, y1, u, v;
+
+      for (x = 0; x + 1 < width; x += 2) {
+         value = *src++;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         y0 = (value >>  0) & 0xff;
+         u  = (value >>  8) & 0xff;
+         y1 = (value >> 16) & 0xff;
+         v  = (value >> 24) & 0xff;
+
+         util_format_yuv_to_rgb_float(y0, u, v, &dst[0], &dst[1], &dst[2]);
+         dst[3] = 1.0f; /* a */
+         dst += 4;
+
+         util_format_yuv_to_rgb_float(y1, u, v, &dst[0], &dst[1], &dst[2]);
+         dst[3] = 1.0f; /* a */
+         dst += 4;
+      }
+
+      if (x < width) {
+         value = *src;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         y0 = (value >>  0) & 0xff;
+         u  = (value >>  8) & 0xff;
+         y1 = (value >> 16) & 0xff;
+         v  = (value >> 24) & 0xff;
+
+         util_format_yuv_to_rgb_float(y0, u, v, &dst[0], &dst[1], &dst[2]);
+         dst[3] = 1.0f; /* a */
+      }
+
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+
+void
+util_format_yuyv_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                               const uint8_t *src_row, unsigned src_stride,
+                               unsigned width, unsigned height)
+{
+   unsigned x, y;
+
+   for (y = 0; y < height; y += 1) {
+      uint8_t *dst = dst_row;
+      const uint32_t *src = (const uint32_t *)src_row;
+      uint32_t value;
+      uint8_t y0, y1, u, v;
+
+      for (x = 0; x + 1 < width; x += 2) {
+         value = *src++;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         y0 = (value >>  0) & 0xff;
+         u  = (value >>  8) & 0xff;
+         y1 = (value >> 16) & 0xff;
+         v  = (value >> 24) & 0xff;
+
+         util_format_yuv_to_rgb_8unorm(y0, u, v, &dst[0], &dst[1], &dst[2]);
+         dst[3] = 0xff; /* a */
+         dst += 4;
+
+         util_format_yuv_to_rgb_8unorm(y1, u, v, &dst[0], &dst[1], &dst[2]);
+         dst[3] = 0xff; /* a */
+         dst += 4;
+      }
+
+      if (x < width) {
+         value = *src;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         y0 = (value >>  0) & 0xff;
+         u  = (value >>  8) & 0xff;
+         y1 = (value >> 16) & 0xff;
+         v  = (value >> 24) & 0xff;
+
+         util_format_yuv_to_rgb_8unorm(y0, u, v, &dst[0], &dst[1], &dst[2]);
+         dst[3] = 0xff; /* a */
+      }
+
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+
+void
+util_format_yuyv_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride,
+                            const float *src_row, unsigned src_stride,
+                            unsigned width, unsigned height)
+{
+   unsigned x, y;
+
+   for (y = 0; y < height; y += 1) {
+      const float *src = src_row;
+      uint32_t *dst = (uint32_t *)dst_row;
+      uint8_t y0, y1, u, v;
+      uint32_t value;
+
+      for (x = 0; x + 1 < width; x += 2) {
+         uint8_t y0, y1, u0, u1, v0, v1, u, v;
+
+         util_format_rgb_float_to_yuv(src[0], src[1], src[2],
+                                      &y0, &u0, &v0);
+         util_format_rgb_float_to_yuv(src[4], src[5], src[6],
+                                      &y1, &u1, &v1);
+
+         u = (u0 + u1 + 1) >> 1;
+         v = (v0 + v1 + 1) >> 1;
+
+         value  = y0;
+         value |= u  <<  8;
+         value |= y1 << 16;
+         value |= v  << 24;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         *dst++ = value;
+
+         src += 8;
+      }
+
+      if (x < width) {
+         util_format_rgb_float_to_yuv(src[0], src[1], src[2],
+                                      &y0, &u, &v);
+         y1 = 0;
+
+         value  = y0;
+         value |= u  <<  8;
+         value |= y1 << 16;
+         value |= v  << 24;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         *dst = value;
+      }
+
+      dst_row += dst_stride/sizeof(*dst_row);
+      src_row += src_stride/sizeof(*src_row);
+   }
+}
+
+
+void
+util_format_yuyv_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                             const uint8_t *src_row, unsigned src_stride,
+                             unsigned width, unsigned height)
+{
+   unsigned x, y;
+
+   for (y = 0; y < height; y += 1) {
+      const uint8_t *src = src_row;
+      uint32_t *dst = (uint32_t *)dst_row;
+      uint8_t y0, y1, u, v;
+      uint32_t value;
+
+      for (x = 0; x + 1 < width; x += 2) {
+         uint8_t y0, y1, u0, u1, v0, v1, u, v;
+
+         util_format_rgb_8unorm_to_yuv(src[0], src[1], src[2],
+                                       &y0, &u0, &v0);
+         util_format_rgb_8unorm_to_yuv(src[4], src[5], src[6],
+                                       &y1, &u1, &v1);
+
+         u = (u0 + u1 + 1) >> 1;
+         v = (v0 + v1 + 1) >> 1;
+
+         value  = y0;
+         value |= u  <<  8;
+         value |= y1 << 16;
+         value |= v  << 24;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         *dst++ = value;
+
+         src += 8;
+      }
+
+      if (x < width) {
+         util_format_rgb_8unorm_to_yuv(src[0], src[1], src[2],
+                                       &y0, &u, &v);
+         y1 = 0;
+
+         value  = y0;
+         value |= u  <<  8;
+         value |= y1 << 16;
+         value |= v  << 24;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+
+         *dst = value;
+      }
+
+      dst_row += dst_stride/sizeof(*dst_row);
+      src_row += src_stride/sizeof(*src_row);
+   }
+}
+
+
+void
+util_format_yuyv_fetch_rgba_float(float *dst, const uint8_t *src,
+                             unsigned i, unsigned j)
+{
+   uint8_t y, u, v;
+
+   assert(i < 2);
+   assert(j < 1);
+
+   y = src[0 + i*2];
+   u = src[1];
+   v = src[3];
+
+   util_format_yuv_to_rgb_float(y, u, v, &dst[0], &dst[1], &dst[2]);
+
+   dst[3] = 1.0f;
+}
diff --git a/src/gallium/auxiliary/util/u_format_yuv.h b/src/gallium/auxiliary/util/u_format_yuv.h
new file mode 100644
index 0000000000..dc9632346d
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_format_yuv.h
@@ -0,0 +1,223 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * YUV colorspace conversion.
+ *
+ * @author Brian Paul <brianp@vmware.com>
+ * @author Michal Krol <michal@vmware.com>
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ *
+ * See also:
+ * - http://www.fourcc.org/fccyvrgb.php
+ * - http://msdn.microsoft.com/en-us/library/ms893078
+ * - http://en.wikipedia.org/wiki/YUV
+ */
+
+
+#ifndef U_FORMAT_YUV_H_
+#define U_FORMAT_YUV_H_
+
+
+#include "pipe/p_compiler.h"
+#include "u_math.h"
+
+
+/*
+ * TODO: Ensure we use consistent and right floating formulas, with enough
+ * precision in the coefficients.
+ */
+
+static INLINE void
+util_format_rgb_float_to_yuv(float r, float g, float b,
+                             uint8_t *y, uint8_t *u, uint8_t *v)
+{
+   const float _r = CLAMP(r, 0.0f, 1.0f);
+   const float _g = CLAMP(g, 0.0f, 1.0f);
+   const float _b = CLAMP(b, 0.0f, 1.0f);
+
+   const float scale = 255.0f;
+
+   const int _y = scale * ( (0.257f * _r) + (0.504f * _g) + (0.098f * _b));
+   const int _u = scale * (-(0.148f * _r) - (0.291f * _g) + (0.439f * _b));
+   const int _v = scale * ( (0.439f * _r) - (0.368f * _g) - (0.071f * _b));
+
+   *y = _y + 16; 
+   *u = _u + 128; 
+   *v = _v + 128; 
+}
+
+
+static INLINE void
+util_format_yuv_to_rgb_float(uint8_t y, uint8_t u, uint8_t v,
+                             float *r, float *g, float *b)
+{
+   const int _y = y - 16;
+   const int _u = u - 128;
+   const int _v = v - 128;
+
+   const float y_factor = 255.0f / 219.0f;
+
+   const float scale = 1.0f / 255.0f;
+
+   *r = scale * (y_factor * _y               + 1.596f * _v);
+   *g = scale * (y_factor * _y - 0.391f * _u - 0.813f * _v);
+   *b = scale * (y_factor * _y + 2.018f * _u              );
+}
+
+
+static INLINE void
+util_format_rgb_8unorm_to_yuv(uint8_t r, uint8_t g, uint8_t b,
+                	      uint8_t *y, uint8_t *u, uint8_t *v)
+{
+   *y = ((  66 * r + 129 * g +  25 * b + 128) >> 8) +  16;
+   *u = (( -38 * r -  74 * g + 112 * b + 128) >> 8) + 128;
+   *v = (( 112 * r -  94 * g -  18 * b + 128) >> 8) + 128;
+}
+
+
+static INLINE void
+util_format_yuv_to_rgb_8unorm(uint8_t y, uint8_t u, uint8_t v,
+                              uint8_t *r, uint8_t *g, uint8_t *b)
+{
+   const int _y = y - 16;
+   const int _u = u - 128;
+   const int _v = v - 128;
+
+   const int _r = (298 * _y            + 409 * _v + 128) >> 8;
+   const int _g = (298 * _y - 100 * _u - 208 * _v + 128) >> 8;
+   const int _b = (298 * _y + 516 * _u            + 128) >> 8;
+
+   *r = CLAMP(_r, 0, 255);
+   *g = CLAMP(_g, 0, 255);
+   *b = CLAMP(_b, 0, 255);
+}
+
+
+
+void
+util_format_uyvy_unpack_rgba_float(float *dst_row, unsigned dst_stride,
+                              const uint8_t *src_row, unsigned src_stride,
+                              unsigned width, unsigned height);
+
+void
+util_format_uyvy_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                               const uint8_t *src_row, unsigned src_stride,
+                               unsigned width, unsigned height);
+
+void
+util_format_uyvy_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride,
+                            const float *src_row, unsigned src_stride,
+                            unsigned width, unsigned height);
+
+void
+util_format_uyvy_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                             const uint8_t *src_row, unsigned src_stride,
+                             unsigned width, unsigned height);
+
+void
+util_format_uyvy_fetch_rgba_float(float *dst, const uint8_t *src,
+                             unsigned i, unsigned j);
+
+void
+util_format_yuyv_unpack_rgba_float(float *dst_row, unsigned dst_stride,
+                              const uint8_t *src_row, unsigned src_stride,
+                              unsigned width, unsigned height);
+
+void
+util_format_yuyv_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                               const uint8_t *src_row, unsigned src_stride,
+                               unsigned width, unsigned height);
+
+void
+util_format_yuyv_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride,
+                            const float *src_row, unsigned src_stride,
+                            unsigned width, unsigned height);
+
+void
+util_format_yuyv_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                             const uint8_t *src_row, unsigned src_stride,
+                             unsigned width, unsigned height);
+
+void
+util_format_yuyv_fetch_rgba_float(float *dst, const uint8_t *src,
+                             unsigned i, unsigned j);
+
+
+void
+util_format_r8g8_b8g8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride,
+                                         const uint8_t *src_row, unsigned src_stride,
+                                         unsigned width, unsigned height);
+
+void
+util_format_r8g8_b8g8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                          const uint8_t *src_row, unsigned src_stride,
+                                          unsigned width, unsigned height);
+
+void
+util_format_r8g8_b8g8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride,
+                                       const float *src_row, unsigned src_stride,
+                                       unsigned width, unsigned height);
+
+void
+util_format_r8g8_b8g8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                        const uint8_t *src_row, unsigned src_stride,
+                                        unsigned width, unsigned height);
+
+void
+util_format_r8g8_b8g8_unorm_fetch_rgba_float(float *dst, const uint8_t *src,
+                                        unsigned i, unsigned j);
+
+void
+util_format_g8r8_g8b8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride,
+                                         const uint8_t *src_row, unsigned src_stride,
+                                         unsigned width, unsigned height);
+
+void
+util_format_g8r8_g8b8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                          const uint8_t *src_row, unsigned src_stride,
+                                          unsigned width, unsigned height);
+
+void
+util_format_g8r8_g8b8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride,
+                                       const float *src_row, unsigned src_stride,
+                                       unsigned width, unsigned height);
+
+void
+util_format_g8r8_g8b8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                        const uint8_t *src_row, unsigned src_stride,
+                                        unsigned width, unsigned height);
+
+void
+util_format_g8r8_g8b8_unorm_fetch_rgba_float(float *dst, const uint8_t *src,
+                                        unsigned i, unsigned j);
+
+
+
+#endif /* U_FORMAT_YUV_H_ */
diff --git a/src/gallium/auxiliary/util/u_format_zs.c b/src/gallium/auxiliary/util/u_format_zs.c
new file mode 100644
index 0000000000..792d69c214
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_format_zs.c
@@ -0,0 +1,920 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#include "u_debug.h"
+#include "u_math.h"
+#include "u_format_zs.h"
+
+
+/*
+ * z32_unorm conversion functions
+ */
+
+static INLINE uint16_t
+z32_unorm_to_z16_unorm(uint32_t z)
+{
+   /* z * 0xffff / 0xffffffff */
+   return z >> 16;
+}
+
+static INLINE uint32_t
+z16_unorm_to_z32_unorm(uint16_t z)
+{
+   /* z * 0xffffffff / 0xffff */
+   return (z << 16) | z;
+}
+
+static INLINE uint32_t
+z32_unorm_to_z24_unorm(uint32_t z)
+{
+   /* z * 0xffffff / 0xffffffff */
+   return z >> 8;
+}
+
+static INLINE uint32_t
+z24_unorm_to_z32_unorm(uint32_t z)
+{
+   /* z * 0xffffffff / 0xffffff */
+   return (z << 8) | (z >> 16);
+}
+
+
+/*
+ * z32_float conversion functions
+ */
+
+static INLINE uint16_t
+z32_float_to_z16_unorm(float z)
+{
+   const float scale = 0xffff;
+   return (uint16_t)(z * scale);
+}
+
+static INLINE float
+z16_unorm_to_z32_float(uint16_t z)
+{
+   const float scale = 1.0 / 0xffff;
+   return (float)(z * scale);
+}
+
+static INLINE uint32_t
+z32_float_to_z24_unorm(float z)
+{
+   const double scale = 0xffffff;
+   return (uint32_t)(z * scale) & 0xffffff;
+}
+
+static INLINE float
+z24_unorm_to_z32_float(uint32_t z)
+{
+   const double scale = 1.0 / 0xffffff;
+   return (float)(z * scale);
+}
+
+static INLINE uint32_t
+z32_float_to_z32_unorm(float z)
+{
+   const double scale = 0xffffffff;
+   return (uint32_t)(z * scale);
+}
+
+static INLINE float
+z32_unorm_to_z32_float(uint32_t z)
+{
+   const double scale = 1.0 / 0xffffffff;
+   return (float)(z * scale);
+}
+
+
+void
+util_format_s8_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride,
+                                         const uint8_t *src_row, unsigned src_stride,
+                                         unsigned width, unsigned height)
+{
+   unsigned y;
+   for(y = 0; y < height; ++y) {
+      memcpy(dst_row, src_row, width);
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_s8_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride,
+                                       const uint8_t *src_row, unsigned src_stride,
+                                       unsigned width, unsigned height)
+{
+   unsigned y;
+   for(y = 0; y < height; ++y) {
+      memcpy(dst_row, src_row, width);
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_z16_unorm_unpack_z_float(float *dst_row, unsigned dst_stride,
+                                     const uint8_t *src_row, unsigned src_stride,
+                                     unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      float *dst = dst_row;
+      const uint16_t *src = (const uint16_t *)src_row;
+      for(x = 0; x < width; ++x) {
+         uint16_t value = *src++;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap16(value);
+#endif
+         *dst++ = z16_unorm_to_z32_float(value);
+      }
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_z16_unorm_pack_z_float(uint8_t *dst_row, unsigned dst_stride,
+                                   const float *src_row, unsigned src_stride,
+                                   unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      const float *src = src_row;
+      uint16_t *dst = (uint16_t *)dst_row;
+      for(x = 0; x < width; ++x) {
+         uint16_t value;
+         value = z32_float_to_z16_unorm(*src++);
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap16(value);
+#endif
+         *dst++ = value;
+      }
+      dst_row += dst_stride/sizeof(*dst_row);
+      src_row += src_stride/sizeof(*src_row);
+   }
+}
+
+void
+util_format_z16_unorm_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride,
+                                       const uint8_t *src_row, unsigned src_stride,
+                                       unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      uint32_t *dst = dst_row;
+      const uint16_t *src = (const uint16_t *)src_row;
+      for(x = 0; x < width; ++x) {
+         uint16_t value = *src++;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap16(value);
+#endif
+         *dst++ = z16_unorm_to_z32_unorm(value);
+      }
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_z16_unorm_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride,
+                                     const uint32_t *src_row, unsigned src_stride,
+                                     unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      const uint32_t *src = src_row;
+      uint16_t *dst = (uint16_t *)dst_row;
+      for(x = 0; x < width; ++x) {
+         uint16_t value;
+         value = z32_unorm_to_z16_unorm(*src++);
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap16(value);
+#endif
+         *dst++ = value;
+      }
+      dst_row += dst_stride/sizeof(*dst_row);
+      src_row += src_stride/sizeof(*src_row);
+   }
+}
+
+void
+util_format_z32_unorm_unpack_z_float(float *dst_row, unsigned dst_stride,
+                                     const uint8_t *src_row, unsigned src_stride,
+                                     unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      float *dst = dst_row;
+      const uint32_t *src = (const uint32_t *)src_row;
+      for(x = 0; x < width; ++x) {
+         uint32_t value = *src++;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         *dst++ = z32_unorm_to_z32_float(value);
+      }
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_z32_unorm_pack_z_float(uint8_t *dst_row, unsigned dst_stride,
+                                   const float *src_row, unsigned src_stride,
+                                   unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      const float *src = src_row;
+      uint32_t *dst = (uint32_t *)dst_row;
+      for(x = 0; x < width; ++x) {
+         uint32_t value;
+         value = z32_float_to_z32_unorm(*src++);
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         *dst++ = value;
+      }
+      dst_row += dst_stride/sizeof(*dst_row);
+      src_row += src_stride/sizeof(*src_row);
+   }
+}
+
+void
+util_format_z32_unorm_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride,
+                                       const uint8_t *src_row, unsigned src_stride,
+                                       unsigned width, unsigned height)
+{
+   unsigned y;
+   for(y = 0; y < height; ++y) {
+      memcpy(dst_row, src_row, width * 4);
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_z32_unorm_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride,
+                                     const uint32_t *src_row, unsigned src_stride,
+                                     unsigned width, unsigned height)
+{
+   unsigned y;
+   for(y = 0; y < height; ++y) {
+      memcpy(dst_row, src_row, width * 4);
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_z32_float_unpack_z_float(float *dst_row, unsigned dst_stride,
+                                     const uint8_t *src_row, unsigned src_stride,
+                                     unsigned width, unsigned height)
+{
+   unsigned y;
+   for(y = 0; y < height; ++y) {
+      memcpy(dst_row, src_row, width * 4);
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_z32_float_pack_z_float(uint8_t *dst_row, unsigned dst_stride,
+                                   const float *src_row, unsigned src_stride,
+                                   unsigned width, unsigned height)
+{
+   unsigned y;
+   for(y = 0; y < height; ++y) {
+      memcpy(dst_row, src_row, width * 4);
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_z32_float_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride,
+                                       const uint8_t *src_row, unsigned src_stride,
+                                       unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      uint32_t *dst = dst_row;
+      const float *src = (const float *)src_row;
+      for(x = 0; x < width; ++x) {
+         *dst++ = z32_float_to_z32_unorm(*src++);
+      }
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_z32_float_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride,
+                                     const uint32_t *src_row, unsigned src_stride,
+                                     unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      const uint32_t *src = src_row;
+      float *dst = (float *)dst_row;
+      for(x = 0; x < width; ++x) {
+         *dst++ = z32_unorm_to_z32_float(*src++);
+      }
+      dst_row += dst_stride/sizeof(*dst_row);
+      src_row += src_stride/sizeof(*src_row);
+   }
+}
+
+void
+util_format_z24_unorm_s8_uscaled_unpack_z_float(float *dst_row, unsigned dst_stride,
+                                                const uint8_t *src_row, unsigned src_stride,
+                                                unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      float *dst = dst_row;
+      const uint32_t *src = (const uint32_t *)src_row;
+      for(x = 0; x < width; ++x) {
+         uint32_t value = *src++;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         *dst++ = z24_unorm_to_z32_float(value & 0xffffff);
+      }
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_z24_unorm_s8_uscaled_pack_z_float(uint8_t *dst_row, unsigned dst_stride,
+                                              const float *src_row, unsigned src_stride,
+                                              unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      const float *src = src_row;
+      uint32_t *dst = (uint32_t *)dst_row;
+      for(x = 0; x < width; ++x) {
+         uint32_t value = *dst;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         value &= 0xff000000;
+         value |= z32_float_to_z24_unorm(*src++);
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         *dst++ = value;
+      }
+      dst_row += dst_stride/sizeof(*dst_row);
+      src_row += src_stride/sizeof(*src_row);
+   }
+}
+
+void
+util_format_z24_unorm_s8_uscaled_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride,
+                                                  const uint8_t *src_row, unsigned src_stride,
+                                                  unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      uint32_t *dst = dst_row;
+      const uint32_t *src = (const uint32_t *)src_row;
+      for(x = 0; x < width; ++x) {
+         uint32_t value = *src++;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         *dst++ = z24_unorm_to_z32_unorm(value & 0xffffff);
+      }
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_z24_unorm_s8_uscaled_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride,
+                                                const uint32_t *src_row, unsigned src_stride,
+                                                unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      const uint32_t *src = src_row;
+      uint32_t *dst = (uint32_t *)dst_row;
+      for(x = 0; x < width; ++x) {
+         uint32_t value= *dst;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         value &= 0xff000000;
+         value |= z32_unorm_to_z24_unorm(*src++);
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         *dst++ = value;
+      }
+      dst_row += dst_stride/sizeof(*dst_row);
+      src_row += src_stride/sizeof(*src_row);
+   }
+}
+
+void
+util_format_z24_unorm_s8_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride,
+                                                   const uint8_t *src_row, unsigned src_stride,
+                                                   unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      uint8_t *dst = dst_row;
+      const uint32_t *src = (const uint32_t *)src_row;
+      for(x = 0; x < width; ++x) {
+         uint32_t value = *src++;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         *dst++ = value >> 24;
+      }
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_z24_unorm_s8_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride,
+                                                 const uint8_t *src_row, unsigned src_stride,
+                                                 unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      const uint8_t *src = src_row;
+      uint32_t *dst = (uint32_t *)dst_row;
+      for(x = 0; x < width; ++x) {
+         uint32_t value = *dst;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         value &= 0x00ffffff;
+         value |= *src++ << 24;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         *dst++ = value;
+      }
+      dst_row += dst_stride/sizeof(*dst_row);
+      src_row += src_stride/sizeof(*src_row);
+   }
+}
+
+void
+util_format_s8_uscaled_z24_unorm_unpack_z_float(float *dst_row, unsigned dst_stride,
+                                                const uint8_t *src_row, unsigned src_stride,
+                                                unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      float *dst = dst_row;
+      const uint32_t *src = (const uint32_t *)src_row;
+      for(x = 0; x < width; ++x) {
+         uint32_t value = *src++;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         *dst++ = z24_unorm_to_z32_float(value >> 8);
+      }
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_s8_uscaled_z24_unorm_pack_z_float(uint8_t *dst_row, unsigned dst_stride,
+                                              const float *src_row, unsigned src_stride,
+                                              unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      const float *src = src_row;
+      uint32_t *dst = (uint32_t *)dst_row;
+      for(x = 0; x < width; ++x) {
+         uint32_t value = *dst;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         value &= 0x000000ff;
+         value |= z32_float_to_z24_unorm(*src++) << 8;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         *dst++ = value;
+      }
+      dst_row += dst_stride/sizeof(*dst_row);
+      src_row += src_stride/sizeof(*src_row);
+   }
+}
+
+void
+util_format_s8_uscaled_z24_unorm_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride,
+                                                  const uint8_t *src_row, unsigned src_stride,
+                                                  unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      uint32_t *dst = dst_row;
+      const uint32_t *src = (const uint32_t *)src_row;
+      for(x = 0; x < width; ++x) {
+         uint32_t value = *src++;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         *dst++ = z24_unorm_to_z32_unorm(value >> 8);
+      }
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_s8_uscaled_z24_unorm_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride,
+                                                const uint32_t *src_row, unsigned src_stride,
+                                                unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      const uint32_t *src = src_row;
+      uint32_t *dst = (uint32_t *)dst_row;
+      for(x = 0; x < width; ++x) {
+         uint32_t value = *dst;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         value &= 0x000000ff;
+         value |= *src++ & 0xffffff00;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         *dst++ = value;
+      }
+      dst_row += dst_stride/sizeof(*dst_row);
+      src_row += src_stride/sizeof(*src_row);
+   }
+}
+
+void
+util_format_s8_uscaled_z24_unorm_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride,
+                                                   const uint8_t *src_row, unsigned src_stride,
+                                                   unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      uint8_t *dst = dst_row;
+      const uint32_t *src = (const uint32_t *)src_row;
+      for(x = 0; x < width; ++x) {
+         uint32_t value = *src++;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         *dst++ = value & 0xff;
+      }
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_s8_uscaled_z24_unorm_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride,
+                                                 const uint8_t *src_row, unsigned src_stride,
+                                                 unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      const uint8_t *src = src_row;
+      uint32_t *dst = (uint32_t *)dst_row;
+      for(x = 0; x < width; ++x) {
+         uint32_t value = *dst;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         value &= 0xffffff00;
+         value |= *src++;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         *dst++ = value;
+      }
+      dst_row += dst_stride/sizeof(*dst_row);
+      src_row += src_stride/sizeof(*src_row);
+   }
+}
+
+void
+util_format_z24x8_unorm_unpack_z_float(float *dst_row, unsigned dst_stride,
+                                       const uint8_t *src_row, unsigned src_stride,
+                                       unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      float *dst = dst_row;
+      const uint32_t *src = (const uint32_t *)src_row;
+      for(x = 0; x < width; ++x) {
+         uint32_t value = *src++;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         *dst++ = z24_unorm_to_z32_float(value & 0xffffff);
+      }
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_z24x8_unorm_pack_z_float(uint8_t *dst_row, unsigned dst_stride,
+                                     const float *src_row, unsigned src_stride,
+                                     unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      const float *src = src_row;
+      uint32_t *dst = (uint32_t *)dst_row;
+      for(x = 0; x < width; ++x) {
+         uint32_t value;
+         value = z32_float_to_z24_unorm(*src++);
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         *dst++ = value;
+      }
+      dst_row += dst_stride/sizeof(*dst_row);
+      src_row += src_stride/sizeof(*src_row);
+   }
+}
+
+void
+util_format_z24x8_unorm_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride,
+                                         const uint8_t *src_row, unsigned src_stride,
+                                         unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      uint32_t *dst = dst_row;
+      const uint32_t *src = (const uint32_t *)src_row;
+      for(x = 0; x < width; ++x) {
+         uint32_t value = *src++;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         *dst++ = z24_unorm_to_z32_unorm(value & 0xffffff);
+      }
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_z24x8_unorm_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride,
+                                       const uint32_t *src_row, unsigned src_stride,
+                                       unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      const uint32_t *src = src_row;
+      uint32_t *dst = (uint32_t *)dst_row;
+      for(x = 0; x < width; ++x) {
+         uint32_t value;
+         value = z32_unorm_to_z24_unorm(*src++);
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         *dst++ = value;
+      }
+      dst_row += dst_stride/sizeof(*dst_row);
+      src_row += src_stride/sizeof(*src_row);
+   }
+}
+
+void
+util_format_x8z24_unorm_unpack_z_float(float *dst_row, unsigned dst_stride,
+                                       const uint8_t *src_row, unsigned src_stride,
+                                       unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      float *dst = dst_row;
+      const uint32_t *src = (uint32_t *)src_row;
+      for(x = 0; x < width; ++x) {
+         uint32_t value = *src++;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         *dst++ = z24_unorm_to_z32_float(value >> 8);
+      }
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_x8z24_unorm_pack_z_float(uint8_t *dst_row, unsigned dst_stride,
+                                     const float *src_row, unsigned src_stride,
+                                     unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      const float *src = src_row;
+      uint32_t *dst = (uint32_t *)dst_row;
+      for(x = 0; x < width; ++x) {
+         uint32_t value;
+         value = z32_float_to_z24_unorm(*src++) << 8;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         *dst++ = value;
+      }
+      dst_row += dst_stride/sizeof(*dst_row);
+      src_row += src_stride/sizeof(*src_row);
+   }
+}
+
+void
+util_format_x8z24_unorm_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride,
+                                         const uint8_t *src_row, unsigned src_stride,
+                                         unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      uint32_t *dst = dst_row;
+      const uint32_t *src = (const uint32_t *)src_row;
+      for(x = 0; x < width; ++x) {
+         uint32_t value = *src++;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         *dst++ = z24_unorm_to_z32_unorm(value >> 8);
+      }
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_x8z24_unorm_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride,
+                                       const uint32_t *src_row, unsigned src_stride,
+                                       unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      const uint32_t *src = src_row;
+      uint32_t *dst = (uint32_t *)dst_row;
+      for(x = 0; x < width; ++x) {
+         uint32_t value;
+         value = z32_unorm_to_z24_unorm(*src++) << 8;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         *dst++ = value;
+      }
+      dst_row += dst_stride/sizeof(*dst_row);
+      src_row += src_stride/sizeof(*src_row);
+   }
+}
+
+void
+util_format_z32_float_s8x24_uscaled_unpack_z_float(float *dst_row, unsigned dst_stride,
+                                                   const uint8_t *src_row, unsigned src_stride,
+                                                   unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      float *dst = dst_row;
+      const float *src = (const float *)src_row;
+      for(x = 0; x < width; ++x) {
+         *dst = *src;
+         src += 2;
+         dst += 1;
+      }
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_z32_float_s8x24_uscaled_pack_z_float(uint8_t *dst_row, unsigned dst_stride,
+                                                 const float *src_row, unsigned src_stride,
+                                                 unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      const float *src = src_row;
+      float *dst = (float *)dst_row;
+      for(x = 0; x < width; ++x) {
+         *dst = *src;
+         src += 1;
+         dst += 2;
+      }
+      dst_row += dst_stride/sizeof(*dst_row);
+      src_row += src_stride/sizeof(*src_row);
+   }
+}
+
+void
+util_format_z32_float_s8x24_uscaled_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride,
+                                                     const uint8_t *src_row, unsigned src_stride,
+                                                     unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      uint32_t *dst = dst_row;
+      const float *src = (const float *)src_row;
+      for(x = 0; x < width; ++x) {
+         *dst = z32_float_to_z32_unorm(*src);
+         src += 2;
+         dst += 1;
+      }
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_z32_float_s8x24_uscaled_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride,
+                                                   const uint32_t *src_row, unsigned src_stride,
+                                                   unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      const uint32_t *src = src_row;
+      float *dst = (float *)dst_row;
+      for(x = 0; x < width; ++x) {
+         *dst++ = z32_unorm_to_z32_float(*src++);
+      }
+      dst_row += dst_stride/sizeof(*dst_row);
+      src_row += src_stride/sizeof(*src_row);
+   }
+}
+
+void
+util_format_z32_float_s8x24_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride,
+                                                      const uint8_t *src_row, unsigned src_stride,
+                                                      unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      uint8_t *dst = dst_row;
+      const uint8_t *src = src_row + 4;
+      for(x = 0; x < width; ++x) {
+         *dst = *src;
+         src += 8;
+         dst += 1;
+      }
+      src_row += src_stride/sizeof(*src_row);
+      dst_row += dst_stride/sizeof(*dst_row);
+   }
+}
+
+void
+util_format_z32_float_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride,
+                                                    const uint8_t *src_row, unsigned src_stride,
+                                                    unsigned width, unsigned height)
+{
+   unsigned x, y;
+   for(y = 0; y < height; ++y) {
+      const uint8_t *src = src_row;
+      uint8_t *dst = dst_row + 4;
+      for(x = 0; x < width; ++x) {
+         *dst = *src;
+         src += 1;
+         dst += 8;
+      }
+      dst_row += dst_stride/sizeof(*dst_row);
+      src_row += src_stride/sizeof(*src_row);
+   }
+}
+
diff --git a/src/gallium/auxiliary/util/u_format_zs.h b/src/gallium/auxiliary/util/u_format_zs.h
new file mode 100644
index 0000000000..650db4b95f
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_format_zs.h
@@ -0,0 +1,196 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#ifndef U_FORMAT_ZS_H_
+#define U_FORMAT_ZS_H_
+
+
+#include "pipe/p_compiler.h"
+
+
+void
+util_format_s8_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_s8_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z16_unorm_unpack_z_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z16_unorm_pack_z_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z16_unorm_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z16_unorm_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride, const uint32_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z32_unorm_unpack_z_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z32_unorm_pack_z_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z32_unorm_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z32_unorm_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride, const uint32_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z32_float_unpack_z_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z32_float_pack_z_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z32_float_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z32_float_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride, const uint32_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z24_unorm_s8_uscaled_unpack_z_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z24_unorm_s8_uscaled_pack_z_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z24_unorm_s8_uscaled_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z24_unorm_s8_uscaled_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride, const uint32_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z24_unorm_s8_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z24_unorm_s8_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_s8_uscaled_z24_unorm_unpack_z_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_s8_uscaled_z24_unorm_pack_z_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_s8_uscaled_z24_unorm_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_s8_uscaled_z24_unorm_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride, const uint32_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_s8_uscaled_z24_unorm_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_s8_uscaled_z24_unorm_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z24x8_unorm_unpack_z_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z24x8_unorm_pack_z_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z24x8_unorm_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z24x8_unorm_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride, const uint32_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_x8z24_unorm_unpack_z_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_x8z24_unorm_pack_z_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_x8z24_unorm_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_x8z24_unorm_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride, const uint32_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z32_float_s8x24_uscaled_unpack_z_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z32_float_s8x24_uscaled_pack_z_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z32_float_s8x24_uscaled_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z32_float_s8x24_uscaled_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride, const uint32_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z32_float_s8x24_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+void
+util_format_z32_float_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+
+#endif /* U_FORMAT_ZS_H_ */
diff --git a/src/gallium/auxiliary/util/u_gen_mipmap.c b/src/gallium/auxiliary/util/u_gen_mipmap.c
index fc027e48e4..eee6030ddc 100644
--- a/src/gallium/auxiliary/util/u_gen_mipmap.c
+++ b/src/gallium/auxiliary/util/u_gen_mipmap.c
@@ -62,11 +62,12 @@ struct gen_mipmap_state
    struct pipe_rasterizer_state rasterizer;
    struct pipe_sampler_state sampler;
    struct pipe_clip_state clip;
+   struct pipe_vertex_element velem[2];
 
    void *vs;
    void *fs2d, *fsCube;
 
-   struct pipe_buffer *vbuf;  /**< quad vertices */
+   struct pipe_resource *vbuf;  /**< quad vertices */
    unsigned vbuf_slot;
 
    float vertices[4][2][4];   /**< vertex/texcoords for quad */
@@ -937,6 +938,7 @@ format_to_type_comps(enum pipe_format pformat,
       *datatype = DTYPE_UBYTE;
       *comps = 4;
       return;
+   case PIPE_FORMAT_B5G5R5X1_UNORM:
    case PIPE_FORMAT_B5G5R5A1_UNORM:
       *datatype = DTYPE_USHORT_1_5_5_5_REV;
       *comps = 4;
@@ -1114,11 +1116,10 @@ reduce_3d(enum pipe_format pformat,
 
 static void
 make_1d_mipmap(struct gen_mipmap_state *ctx,
-               struct pipe_texture *pt,
+               struct pipe_resource *pt,
                uint face, uint baseLevel, uint lastLevel)
 {
    struct pipe_context *pipe = ctx->pipe;
-   struct pipe_screen *screen = pipe->screen;
    const uint zslice = 0;
    uint dstLevel;
 
@@ -1127,38 +1128,37 @@ make_1d_mipmap(struct gen_mipmap_state *ctx,
       struct pipe_transfer *srcTrans, *dstTrans;
       void *srcMap, *dstMap;
       
-      srcTrans = screen->get_tex_transfer(screen, pt, face, srcLevel, zslice,
+      srcTrans = pipe_get_transfer(pipe, pt, face, srcLevel, zslice,
                                           PIPE_TRANSFER_READ, 0, 0,
                                           u_minify(pt->width0, srcLevel),
                                           u_minify(pt->height0, srcLevel));
-      dstTrans = screen->get_tex_transfer(screen, pt, face, dstLevel, zslice,
+      dstTrans = pipe_get_transfer(pipe, pt, face, dstLevel, zslice,
                                           PIPE_TRANSFER_WRITE, 0, 0,
                                           u_minify(pt->width0, dstLevel),
                                           u_minify(pt->height0, dstLevel));
 
-      srcMap = (ubyte *) screen->transfer_map(screen, srcTrans);
-      dstMap = (ubyte *) screen->transfer_map(screen, dstTrans);
+      srcMap = (ubyte *) pipe->transfer_map(pipe, srcTrans);
+      dstMap = (ubyte *) pipe->transfer_map(pipe, dstTrans);
 
       reduce_1d(pt->format,
-                srcTrans->width, srcMap,
-                dstTrans->width, dstMap);
+                srcTrans->box.width, srcMap,
+                dstTrans->box.width, dstMap);
 
-      screen->transfer_unmap(screen, srcTrans);
-      screen->transfer_unmap(screen, dstTrans);
+      pipe->transfer_unmap(pipe, srcTrans);
+      pipe->transfer_unmap(pipe, dstTrans);
 
-      screen->tex_transfer_destroy(srcTrans);
-      screen->tex_transfer_destroy(dstTrans);
+      pipe->transfer_destroy(pipe, srcTrans);
+      pipe->transfer_destroy(pipe, dstTrans);
    }
 }
 
 
 static void
 make_2d_mipmap(struct gen_mipmap_state *ctx,
-               struct pipe_texture *pt,
+               struct pipe_resource *pt,
                uint face, uint baseLevel, uint lastLevel)
 {
    struct pipe_context *pipe = ctx->pipe;
-   struct pipe_screen *screen = pipe->screen;
    const uint zslice = 0;
    uint dstLevel;
    
@@ -1170,36 +1170,36 @@ make_2d_mipmap(struct gen_mipmap_state *ctx,
       struct pipe_transfer *srcTrans, *dstTrans;
       ubyte *srcMap, *dstMap;
       
-      srcTrans = screen->get_tex_transfer(screen, pt, face, srcLevel, zslice,
-                                          PIPE_TRANSFER_READ, 0, 0,
-                                          u_minify(pt->width0, srcLevel),
-                                          u_minify(pt->height0, srcLevel));
-      dstTrans = screen->get_tex_transfer(screen, pt, face, dstLevel, zslice,
-                                          PIPE_TRANSFER_WRITE, 0, 0,
-                                          u_minify(pt->width0, dstLevel),
-                                          u_minify(pt->height0, dstLevel));
-
-      srcMap = (ubyte *) screen->transfer_map(screen, srcTrans);
-      dstMap = (ubyte *) screen->transfer_map(screen, dstTrans);
+      srcTrans = pipe_get_transfer(pipe, pt, face, srcLevel, zslice,
+				   PIPE_TRANSFER_READ, 0, 0,
+				   u_minify(pt->width0, srcLevel),
+				   u_minify(pt->height0, srcLevel));
+      dstTrans = pipe_get_transfer(pipe, pt, face, dstLevel, zslice,
+				   PIPE_TRANSFER_WRITE, 0, 0,
+				   u_minify(pt->width0, dstLevel),
+				   u_minify(pt->height0, dstLevel));
+
+      srcMap = (ubyte *) pipe->transfer_map(pipe, srcTrans);
+      dstMap = (ubyte *) pipe->transfer_map(pipe, dstTrans);
 
       reduce_2d(pt->format,
-                srcTrans->width, srcTrans->height,
+                srcTrans->box.width, srcTrans->box.height,
                 srcTrans->stride, srcMap,
-                dstTrans->width, dstTrans->height,
+                dstTrans->box.width, dstTrans->box.height,
                 dstTrans->stride, dstMap);
 
-      screen->transfer_unmap(screen, srcTrans);
-      screen->transfer_unmap(screen, dstTrans);
+      pipe->transfer_unmap(pipe, srcTrans);
+      pipe->transfer_unmap(pipe, dstTrans);
 
-      screen->tex_transfer_destroy(srcTrans);
-      screen->tex_transfer_destroy(dstTrans);
+      pipe->transfer_destroy(pipe, srcTrans);
+      pipe->transfer_destroy(pipe, dstTrans);
    }
 }
 
 
 static void
 make_3d_mipmap(struct gen_mipmap_state *ctx,
-               struct pipe_texture *pt,
+               struct pipe_resource *pt,
                uint face, uint baseLevel, uint lastLevel)
 {
 #if 0
@@ -1215,17 +1215,17 @@ make_3d_mipmap(struct gen_mipmap_state *ctx,
       struct pipe_transfer *srcTrans, *dstTrans;
       ubyte *srcMap, *dstMap;
       
-      srcTrans = screen->get_tex_transfer(screen, pt, face, srcLevel, zslice,
+      srcTrans = pipe->get_transfer(pipe, pt, face, srcLevel, zslice,
                                           PIPE_TRANSFER_READ, 0, 0,
                                           u_minify(pt->width0, srcLevel),
                                           u_minify(pt->height0, srcLevel));
-      dstTrans = screen->get_tex_transfer(screen, pt, face, dstLevel, zslice,
+      dstTrans = pipe->get_transfer(pipe, pt, face, dstLevel, zslice,
                                           PIPE_TRANSFER_WRITE, 0, 0,
                                           u_minify(pt->width0, dstLevel),
                                           u_minify(pt->height0, dstLevel));
 
-      srcMap = (ubyte *) screen->transfer_map(screen, srcTrans);
-      dstMap = (ubyte *) screen->transfer_map(screen, dstTrans);
+      srcMap = (ubyte *) pipe->transfer_map(pipe, srcTrans);
+      dstMap = (ubyte *) pipe->transfer_map(pipe, dstTrans);
 
       reduce_3d(pt->format,
                 srcTrans->width, srcTrans->height,
@@ -1233,11 +1233,11 @@ make_3d_mipmap(struct gen_mipmap_state *ctx,
                 dstTrans->width, dstTrans->height,
                 dstTrans->stride, dstMap);
 
-      screen->transfer_unmap(screen, srcTrans);
-      screen->transfer_unmap(screen, dstTrans);
+      pipe->transfer_unmap(pipe, srcTrans);
+      pipe->transfer_unmap(pipe, dstTrans);
 
-      screen->tex_transfer_destroy(srcTrans);
-      screen->tex_transfer_destroy(dstTrans);
+      pipe->transfer_destroy(pipe, srcTrans);
+      pipe->transfer_destroy(pipe, dstTrans);
    }
 #else
    (void) reduce_3d;
@@ -1247,7 +1247,7 @@ make_3d_mipmap(struct gen_mipmap_state *ctx,
 
 static void
 fallback_gen_mipmap(struct gen_mipmap_state *ctx,
-                    struct pipe_texture *pt,
+                    struct pipe_resource *pt,
                     uint face, uint baseLevel, uint lastLevel)
 {
    switch (pt->target) {
@@ -1307,6 +1307,15 @@ util_create_gen_mipmap(struct pipe_context *pipe,
    ctx->sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NEAREST;
    ctx->sampler.normalized_coords = 1;
 
+   /* vertex elements state */
+   memset(&ctx->velem[0], 0, sizeof(ctx->velem[0]) * 2);
+   for (i = 0; i < 2; i++) {
+      ctx->velem[i].src_offset = i * 4 * sizeof(float);
+      ctx->velem[i].instance_divisor = 0;
+      ctx->velem[i].vertex_buffer_index = 0;
+      ctx->velem[i].src_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+   }
+
    /* vertex shader - still needed to specify mapping from fragment
     * shader input semantics to vertex elements 
     */
@@ -1349,8 +1358,7 @@ get_next_slot(struct gen_mipmap_state *ctx)
 
    if (!ctx->vbuf) {
       ctx->vbuf = pipe_buffer_create(ctx->pipe->screen,
-                                     32,
-                                     PIPE_BUFFER_USAGE_VERTEX,
+                                     PIPE_BIND_VERTEX_BUFFER,
                                      max_slots * sizeof ctx->vertices);
    }
    
@@ -1411,7 +1419,7 @@ set_vertex_data(struct gen_mipmap_state *ctx,
 
    offset = get_next_slot( ctx );
 
-   pipe_buffer_write_nooverlap(ctx->pipe->screen, ctx->vbuf,
+   pipe_buffer_write_nooverlap(ctx->pipe, ctx->vbuf,
                                offset, sizeof(ctx->vertices), ctx->vertices);
 
    return offset;
@@ -1431,7 +1439,7 @@ util_destroy_gen_mipmap(struct gen_mipmap_state *ctx)
    pipe->delete_fs_state(pipe, ctx->fs2d);
    pipe->delete_fs_state(pipe, ctx->fsCube);
 
-   pipe_buffer_reference(&ctx->vbuf, NULL);
+   pipe_resource_reference(&ctx->vbuf, NULL);
 
    FREE(ctx);
 }
@@ -1443,7 +1451,7 @@ util_destroy_gen_mipmap(struct gen_mipmap_state *ctx)
  */
 void util_gen_mipmap_flush( struct gen_mipmap_state *ctx )
 {
-   pipe_buffer_reference(&ctx->vbuf, NULL);
+   pipe_resource_reference(&ctx->vbuf, NULL);
    ctx->vbuf_slot = 0;
 } 
 
@@ -1452,7 +1460,7 @@ void util_gen_mipmap_flush( struct gen_mipmap_state *ctx )
  * Generate mipmap images.  It's assumed all needed texture memory is
  * already allocated.
  *
- * \param pt  the texture to generate mipmap levels for
+ * \param psv  the sampler view to the texture to generate mipmap levels for
  * \param face  which cube face to generate mipmaps for (0 for non-cube maps)
  * \param baseLevel  the first mipmap level to use as a src
  * \param lastLevel  the last mipmap level to generate
@@ -1461,12 +1469,13 @@ void util_gen_mipmap_flush( struct gen_mipmap_state *ctx )
  */
 void
 util_gen_mipmap(struct gen_mipmap_state *ctx,
-                struct pipe_texture *pt,
+                struct pipe_sampler_view *psv,
                 uint face, uint baseLevel, uint lastLevel, uint filter)
 {
    struct pipe_context *pipe = ctx->pipe;
    struct pipe_screen *screen = pipe->screen;
    struct pipe_framebuffer_state fb;
+   struct pipe_resource *pt = psv->texture;
    void *fs = (pt->target == PIPE_TEXTURE_CUBE) ? ctx->fsCube : ctx->fs2d;
    uint dstLevel;
    uint zslice = 0;
@@ -1484,8 +1493,8 @@ util_gen_mipmap(struct gen_mipmap_state *ctx,
           filter == PIPE_TEX_FILTER_NEAREST);
 
    /* check if we can render in the texture's format */
-   if (!screen->is_format_supported(screen, pt->format, PIPE_TEXTURE_2D,
-                                    PIPE_TEXTURE_USAGE_RENDER_TARGET, 0)) {
+   if (!screen->is_format_supported(screen, psv->format, PIPE_TEXTURE_2D,
+                                    PIPE_BIND_RENDER_TARGET, 0)) {
       fallback_gen_mipmap(ctx, pt, face, baseLevel, lastLevel);
       return;
    }
@@ -1495,18 +1504,20 @@ util_gen_mipmap(struct gen_mipmap_state *ctx,
    cso_save_depth_stencil_alpha(ctx->cso);
    cso_save_rasterizer(ctx->cso);
    cso_save_samplers(ctx->cso);
-   cso_save_sampler_textures(ctx->cso);
+   cso_save_fragment_sampler_views(ctx->cso);
    cso_save_framebuffer(ctx->cso);
    cso_save_fragment_shader(ctx->cso);
    cso_save_vertex_shader(ctx->cso);
    cso_save_viewport(ctx->cso);
    cso_save_clip(ctx->cso);
+   cso_save_vertex_elements(ctx->cso);
 
    /* bind our state */
    cso_set_blend(ctx->cso, &ctx->blend);
    cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil);
    cso_set_rasterizer(ctx->cso, &ctx->rasterizer);
    cso_set_clip(ctx->cso, &ctx->clip);
+   cso_set_vertex_elements(ctx->cso, 2, ctx->velem);
 
    cso_set_fragment_shader_handle(ctx->cso, fs);
    cso_set_vertex_shader_handle(ctx->cso, ctx->vs);
@@ -1529,7 +1540,7 @@ util_gen_mipmap(struct gen_mipmap_state *ctx,
 
       struct pipe_surface *surf = 
          screen->get_tex_surface(screen, pt, face, dstLevel, zslice,
-                                 PIPE_BUFFER_USAGE_GPU_WRITE);
+                                 PIPE_BIND_RENDER_TARGET);
 
       /*
        * Setup framebuffer / dest surface
@@ -1562,7 +1573,7 @@ util_gen_mipmap(struct gen_mipmap_state *ctx,
       cso_single_sampler(ctx->cso, 0, &ctx->sampler);
       cso_single_sampler_done(ctx->cso);
 
-      cso_set_sampler_textures(ctx->cso, 1, &pt);
+      cso_set_fragment_sampler_views(ctx->cso, 1, &psv);
 
       /* quad coords in clip coords */
       offset = set_vertex_data(ctx,
@@ -1587,10 +1598,11 @@ util_gen_mipmap(struct gen_mipmap_state *ctx,
    cso_restore_depth_stencil_alpha(ctx->cso);
    cso_restore_rasterizer(ctx->cso);
    cso_restore_samplers(ctx->cso);
-   cso_restore_sampler_textures(ctx->cso);
+   cso_restore_fragment_sampler_views(ctx->cso);
    cso_restore_framebuffer(ctx->cso);
    cso_restore_fragment_shader(ctx->cso);
    cso_restore_vertex_shader(ctx->cso);
    cso_restore_viewport(ctx->cso);
    cso_restore_clip(ctx->cso);
+   cso_restore_vertex_elements(ctx->cso);
 }
diff --git a/src/gallium/auxiliary/util/u_gen_mipmap.h b/src/gallium/auxiliary/util/u_gen_mipmap.h
index 54608f9466..a7502b9982 100644
--- a/src/gallium/auxiliary/util/u_gen_mipmap.h
+++ b/src/gallium/auxiliary/util/u_gen_mipmap.h
@@ -37,7 +37,7 @@ extern "C" {
 
    
 struct pipe_context;
-struct pipe_texture;
+struct pipe_resource;
 struct cso_context;
 
 struct gen_mipmap_state;
@@ -59,7 +59,7 @@ util_gen_mipmap_flush( struct gen_mipmap_state *ctx );
 
 extern void
 util_gen_mipmap(struct gen_mipmap_state *ctx,
-                struct pipe_texture *pt,
+                struct pipe_sampler_view *psv,
                 uint face, uint baseLevel, uint lastLevel, uint filter);
 
 
diff --git a/src/gallium/auxiliary/util/u_half.h b/src/gallium/auxiliary/util/u_half.h
new file mode 100644
index 0000000000..ad030e90c6
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_half.h
@@ -0,0 +1,90 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#ifndef U_HALF_H
+#define U_HALF_H
+
+#include "pipe/p_compiler.h"
+#include "util/u_math.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern const uint32_t util_half_to_float_mantissa_table[2048];
+extern const uint32_t util_half_to_float_exponent_table[64];
+extern const uint32_t util_half_to_float_offset_table[64];
+extern const uint16_t util_float_to_half_base_table[512];
+extern const uint8_t util_float_to_half_shift_table[512];
+
+/*
+ * Note that if the half float is a signaling NaN, the x87 FPU will turn
+ * it into a quiet NaN immediately upon loading into a float.
+ *
+ * Additionally, denormals may be flushed to zero.
+ *
+ * To avoid this, use the floatui functions instead of the float ones
+ * when just doing conversion rather than computation on the resulting
+ * floats.
+ */
+
+static INLINE uint32_t
+util_half_to_floatui(uint16_t h)
+{
+   unsigned exp = h >> 10;
+   return util_half_to_float_mantissa_table[util_half_to_float_offset_table[exp] + (h & 0x3ff)] + util_half_to_float_exponent_table[exp];
+}
+
+static INLINE float
+util_half_to_float(uint16_t h)
+{
+   union fi r;
+   r.ui = util_half_to_floatui(h);
+   return r.f;
+}
+
+static INLINE uint16_t
+util_floatui_to_half(uint32_t v)
+{
+   unsigned signexp = v >> 23;
+   return util_float_to_half_base_table[signexp] + ((v & 0x007fffff) >> util_float_to_half_shift_table[signexp]);
+}
+
+static INLINE uint16_t
+util_float_to_half(float f)
+{
+   union fi i;
+   i.f = f;
+   return util_floatui_to_half(i.ui);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* U_HALF_H */
+
diff --git a/src/gallium/auxiliary/util/u_half.py b/src/gallium/auxiliary/util/u_half.py
new file mode 100644
index 0000000000..8007482e97
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_half.py
@@ -0,0 +1,179 @@
+# Copyright 2010 Luca Barbieri
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice (including the
+# next paragraph) shall be included in all copies or substantial
+# portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+# *************************************************************************
+
+# The code is a reimplementation of the algorithm in
+#  www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
+# "Fast Half Float Conversions" by Jeroen van der Zijp, Nov 2008
+#
+# The table contents have been slightly changed so that the exponent
+# bias is now in the exponent table instead of the mantissa table (mostly
+# for cosmetic reasons, and because it theoretically allows a variant
+# that flushes denormal to zero but uses a mantissa table with 24-bit
+# entries).
+#
+# The tables are also constructed slightly differently.
+#
+
+# Note that using a 64K * 4 table is a terrible idea since it will not fit
+# in the L1 cache and will massively pollute the L2 cache as well
+#
+# These should instead fit in the L1 cache.
+#
+# TODO: we could use a denormal bias table instead of the mantissa/offset
+# tables: this would reduce the L1 cache usage from 8704 to 2304 bytes
+# but would involve more computation
+#
+# Note however that if denormals are never encountered, the L1 cache usage
+# is only about 4608 bytes anyway.
+
+table_index = None
+table_length = None
+
+def begin(t, n, l):
+	global table_length
+	global table_index
+	table_index = 0
+	table_length = l
+	print
+	print "const " + t + " " + n + "[" + str(l) + "] = {"
+
+def value(v):
+	global table_index
+	table_index += 1
+	print "\t" + hex(v) + ","
+
+def end():
+	global table_length
+	global table_index
+	print "};"
+	assert table_index == table_length
+
+print "/* This file is autogenerated by u_half.py. Do not edit directly. */"
+print "#include \"util/u_half.h\""
+
+begin("uint32_t", "util_half_to_float_mantissa_table", 2048)
+# zero
+value(0)
+
+# denormals
+for i in xrange(1, 1024):
+	m = i << 13
+	e = 0
+
+	# normalize number
+	while (m & 0x00800000) == 0:
+		e -= 0x00800000;
+		m <<= 1;
+
+	m &= ~0x00800000;
+	e += 0x38800000;
+	value(m | e)
+
+# normals
+for i in xrange(1024, 2048):
+	value((i - 1024) << 13)
+end()
+
+begin("uint32_t", "util_half_to_float_exponent_table", 64)
+# positive zero or denormals
+value(0)
+
+# positive numbers
+for i in xrange(1, 31):
+	value(0x38000000 + (i << 23))
+
+# positive infinity/NaN
+value(0x7f800000)
+
+# negative zero or denormals
+value(0x80000000)
+
+# negative numbers
+for i in range(33, 63):
+	value(0xb8000000 + ((i - 32) << 23))
+
+# negative infinity/NaN
+value(0xff800000)
+end()
+
+begin("uint32_t", "util_half_to_float_offset_table", 64)
+# positive zero or denormals
+value(0)
+
+# positive normals
+for i in range(1, 32):
+	value(1024)
+
+# negative zero or denormals
+value(0)
+
+# negative normals
+for i in xrange(33, 64):
+	value(1024)
+end()
+
+begin("uint16_t", "util_float_to_half_base_table", 512)
+for sign in (0, 0x8000):
+	# very small numbers mapping to zero
+	for i in xrange(-127, -24):
+		value(sign | 0)
+
+	# small numbers mapping to denormals
+	for i in xrange(-24, -14):
+		value(sign | (0x400 >> (-14 -i)))
+
+	# normal numbers
+	for i in xrange(-14, 16):
+		value(sign | ((i + 15) << 10))
+
+	# large numbers mapping to infinity
+	for i in xrange(16, 128):
+		value(sign | 0x7c00)
+
+	# infinity and NaNs
+	value(sign | 0x7c00)
+end()
+
+begin("uint8_t", "util_float_to_half_shift_table", 512)
+for sign in (0, 0x8000):
+	# very small numbers mapping to zero
+	for i in xrange(-127, -24):
+		value(24)
+
+	# small numbers mapping to denormals
+	for i in xrange(-24, -14):
+		value(-1 - i)
+
+	# normal numbers
+	for i in xrange(-14, 16):
+		value(13)
+
+	# large numbers mapping to infinity
+	for i in xrange(16, 128):
+		value(24)
+
+	# infinity and NaNs
+	value(13)
+end()
+
diff --git a/src/gallium/auxiliary/util/u_init.h b/src/gallium/auxiliary/util/u_init.h
new file mode 100644
index 0000000000..7bc356a791
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_init.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef U_INIT_H
+#define U_INIT_H
+
+/* Use UTIL_INIT(f) to have f called at program initialization.
+   Note that it is only guaranteed to be called if any symbol in the
+   .c file it is in sis referenced by the program.
+
+   UTIL_INIT functions are called in arbitrary order.
+*/
+
+#ifdef __cplusplus
+/* use a C++ global constructor */
+#define UTIL_INIT(f) struct f##__gctor_t {f##__gctor_t() {x();}} f##__gctor;
+#elif defined(_MSC_VER)
+/* add a pointer to the section where MSVC stores global constructor pointers */
+/* see http://blogs.msdn.com/vcblog/archive/2006/10/20/crt-initialization.aspx and
+   http://stackoverflow.com/questions/1113409/attribute-constructor-equivalent-in-vc */
+#pragma section(".CRT$XCU",read)
+#define UTIL_INIT(f) static void __cdecl f##__init(void) {f();}; __declspec(allocate(".CRT$XCU")) void (__cdecl* f##__xcu)(void) = f##__init;
+#elif defined(__GNUC__)
+#define UTIL_INIT(f) static void f##__init(void) __attribute__((constructor)); static void f##__init(void) {f();}
+#else
+#error Unsupported compiler: please find out how to implement global initializers in C on it
+#endif
+
+#endif
+
diff --git a/src/gallium/auxiliary/util/u_inlines.h b/src/gallium/auxiliary/util/u_inlines.h
index 0cb3432c6e..089adcf382 100644
--- a/src/gallium/auxiliary/util/u_inlines.h
+++ b/src/gallium/auxiliary/util/u_inlines.h
@@ -34,6 +34,8 @@
 #include "pipe/p_screen.h"
 #include "util/u_debug.h"
 #include "util/u_atomic.h"
+#include "util/u_box.h"
+#include "util/u_math.h"
 
 
 #ifdef __cplusplus
@@ -87,18 +89,6 @@ pipe_reference(struct pipe_reference *ptr, struct pipe_reference *reference)
    return destroy;
 }
 
-static INLINE void
-pipe_buffer_reference(struct pipe_buffer **ptr, struct pipe_buffer *buf)
-{
-   struct pipe_buffer *old_buf;
-
-   assert(ptr);
-   old_buf = *ptr;
-
-   if (pipe_reference(&(*ptr)->reference, &buf->reference))
-      old_buf->screen->buffer_destroy(old_buf);
-   *ptr = buf;
-}
 
 static INLINE void
 pipe_surface_reference(struct pipe_surface **ptr, struct pipe_surface *surf)
@@ -110,106 +100,184 @@ pipe_surface_reference(struct pipe_surface **ptr, struct pipe_surface *surf)
    *ptr = surf;
 }
 
+
 static INLINE void
-pipe_texture_reference(struct pipe_texture **ptr, struct pipe_texture *tex)
+pipe_resource_reference(struct pipe_resource **ptr, struct pipe_resource *tex)
 {
-   struct pipe_texture *old_tex = *ptr;
+   struct pipe_resource *old_tex = *ptr;
 
    if (pipe_reference(&(*ptr)->reference, &tex->reference))
-      old_tex->screen->texture_destroy(old_tex);
+      old_tex->screen->resource_destroy(old_tex->screen, old_tex);
    *ptr = tex;
 }
 
 
+static INLINE void
+pipe_sampler_view_reference(struct pipe_sampler_view **ptr, struct pipe_sampler_view *view)
+{
+   struct pipe_sampler_view *old_view = *ptr;
+
+   if (pipe_reference(&(*ptr)->reference, &view->reference))
+      old_view->context->sampler_view_destroy(old_view->context, old_view);
+   *ptr = view;
+}
+
+static INLINE void
+pipe_surface_reset(struct pipe_surface* ps, struct pipe_resource *pt,
+		unsigned face, unsigned level, unsigned zslice, unsigned flags)
+{
+   pipe_resource_reference(&ps->texture, pt);
+   ps->format = pt->format;
+   ps->width = u_minify(pt->width0, level);
+   ps->height = u_minify(pt->height0, level);
+   ps->usage = flags;
+   ps->face = face;
+   ps->level = level;
+   ps->zslice = zslice;
+}
+
+static INLINE void
+pipe_surface_init(struct pipe_surface* ps, struct pipe_resource *pt,
+                unsigned face, unsigned level, unsigned zslice, unsigned flags)
+{
+   ps->texture = 0;
+   pipe_reference_init(&ps->reference, 1);
+   pipe_surface_reset(ps, pt, face, level, zslice, flags);
+}
+
 /*
  * Convenience wrappers for screen buffer functions.
  */
 
-static INLINE struct pipe_buffer *
+static INLINE struct pipe_resource *
 pipe_buffer_create( struct pipe_screen *screen,
-                    unsigned alignment, unsigned usage, unsigned size )
+		    unsigned bind,
+		    unsigned size )
 {
-   return screen->buffer_create(screen, alignment, usage, size);
+   struct pipe_resource buffer;
+   memset(&buffer, 0, sizeof buffer);
+   buffer.target = PIPE_BUFFER;
+   buffer.format = PIPE_FORMAT_R8_UNORM; /* want TYPELESS or similar */
+   buffer.bind = bind;
+   buffer._usage = PIPE_USAGE_DEFAULT;
+   buffer.flags = 0;
+   buffer.width0 = size;
+   buffer.height0 = 1;
+   buffer.depth0 = 1;
+   return screen->resource_create(screen, &buffer);
 }
 
-static INLINE struct pipe_buffer *
-pipe_user_buffer_create( struct pipe_screen *screen, void *ptr, unsigned size )
+
+static INLINE struct pipe_resource *
+pipe_user_buffer_create( struct pipe_screen *screen, void *ptr, unsigned size,
+			 unsigned usage )
 {
-   return screen->user_buffer_create(screen, ptr, size);
+   return screen->user_buffer_create(screen, ptr, size, usage);
 }
 
 static INLINE void *
-pipe_buffer_map(struct pipe_screen *screen,
-                struct pipe_buffer *buf,
-                unsigned usage)
+pipe_buffer_map_range(struct pipe_context *pipe,
+		      struct pipe_resource *buffer,
+		      unsigned offset,
+		      unsigned length,
+		      unsigned usage,
+		      struct pipe_transfer **transfer)
 {
-   if(screen->buffer_map_range) {
-      unsigned offset = 0;
-      unsigned length = buf->size;
-      return screen->buffer_map_range(screen, buf, offset, length, usage);
+   struct pipe_box box;
+   void *map;
+
+   assert(offset < buffer->width0);
+   assert(offset + length <= buffer->width0);
+   assert(length);
+   
+   u_box_1d(offset, length, &box);
+
+   *transfer = pipe->get_transfer( pipe,
+				   buffer,
+				   u_subresource(0, 0),
+				   usage,
+				   &box);
+   
+   if (*transfer == NULL)
+      return NULL;
+
+   map = pipe->transfer_map( pipe, *transfer );
+   if (map == NULL) {
+      pipe->transfer_destroy( pipe, *transfer );
+      return NULL;
    }
-   else
-      return screen->buffer_map(screen, buf, usage);
+
+   /* Match old screen->buffer_map_range() behaviour, return pointer
+    * to where the beginning of the buffer would be:
+    */
+   return (void *)((char *)map - offset);
 }
 
-static INLINE void
-pipe_buffer_unmap(struct pipe_screen *screen,
-                  struct pipe_buffer *buf)
+
+static INLINE void *
+pipe_buffer_map(struct pipe_context *pipe,
+                struct pipe_resource *buffer,
+                unsigned usage,
+		struct pipe_transfer **transfer)
 {
-   screen->buffer_unmap(screen, buf);
+   return pipe_buffer_map_range(pipe, buffer, 0, buffer->width0, usage, transfer);
 }
 
-static INLINE void *
-pipe_buffer_map_range(struct pipe_screen *screen,
-                struct pipe_buffer *buf,
-                unsigned offset,
-                unsigned length,
-                unsigned usage)
+
+static INLINE void
+pipe_buffer_unmap(struct pipe_context *pipe,
+                  struct pipe_resource *buf,
+		  struct pipe_transfer *transfer)
 {
-   assert(offset < buf->size);
-   assert(offset + length <= buf->size);
-   assert(length);
-   if(screen->buffer_map_range)
-      return screen->buffer_map_range(screen, buf, offset, length, usage);
-   else
-      return screen->buffer_map(screen, buf, usage);
+   if (transfer) {
+      pipe->transfer_unmap(pipe, transfer);
+      pipe->transfer_destroy(pipe, transfer);
+   }
 }
 
 static INLINE void
-pipe_buffer_flush_mapped_range(struct pipe_screen *screen,
-                               struct pipe_buffer *buf,
+pipe_buffer_flush_mapped_range(struct pipe_context *pipe,
+			       struct pipe_transfer *transfer,
                                unsigned offset,
                                unsigned length)
 {
-   assert(offset < buf->size);
-   assert(offset + length <= buf->size);
+   struct pipe_box box;
+   int transfer_offset;
+
    assert(length);
-   if(screen->buffer_flush_mapped_range)
-      screen->buffer_flush_mapped_range(screen, buf, offset, length);
+   assert(transfer->box.x <= offset);
+   assert(offset + length <= transfer->box.x + transfer->box.width);
+
+   /* Match old screen->buffer_flush_mapped_range() behaviour, where
+    * offset parameter is relative to the start of the buffer, not the
+    * mapped range.
+    */
+   transfer_offset = offset - transfer->box.x;
+   
+   u_box_1d(transfer_offset, length, &box);
+
+   pipe->transfer_flush_region(pipe, transfer, &box);
 }
 
 static INLINE void
-pipe_buffer_write(struct pipe_screen *screen,
-                  struct pipe_buffer *buf,
-                  unsigned offset, unsigned size,
+pipe_buffer_write(struct pipe_context *pipe,
+                  struct pipe_resource *buf,
+                  unsigned offset,
+		  unsigned size,
                   const void *data)
 {
-   void *map;
-   
-   assert(offset < buf->size);
-   assert(offset + size <= buf->size);
-   assert(size);
-
-   map = pipe_buffer_map_range(screen, buf, offset, size, 
-                               PIPE_BUFFER_USAGE_CPU_WRITE | 
-                               PIPE_BUFFER_USAGE_FLUSH_EXPLICIT |
-                               PIPE_BUFFER_USAGE_DISCARD);
-   assert(map);
-   if(map) {
-      memcpy((uint8_t *)map + offset, data, size);
-      pipe_buffer_flush_mapped_range(screen, buf, offset, size);
-      pipe_buffer_unmap(screen, buf);
-   }
+   struct pipe_box box;
+
+   u_box_1d(offset, size, &box);
+
+   pipe->transfer_inline_write( pipe,
+				buf,
+				u_subresource(0,0),
+				PIPE_TRANSFER_WRITE,
+				&box,
+				data,
+				size,
+				0);
 }
 
 /**
@@ -219,87 +287,88 @@ pipe_buffer_write(struct pipe_screen *screen,
  * been written before.
  */
 static INLINE void
-pipe_buffer_write_nooverlap(struct pipe_screen *screen,
-                            struct pipe_buffer *buf,
+pipe_buffer_write_nooverlap(struct pipe_context *pipe,
+                            struct pipe_resource *buf,
                             unsigned offset, unsigned size,
                             const void *data)
 {
-   void *map;
-
-   assert(offset < buf->size);
-   assert(offset + size <= buf->size);
-   assert(size);
-
-   map = pipe_buffer_map_range(screen, buf, offset, size,
-                               PIPE_BUFFER_USAGE_CPU_WRITE |
-                               PIPE_BUFFER_USAGE_FLUSH_EXPLICIT |
-                               PIPE_BUFFER_USAGE_DISCARD |
-                               PIPE_BUFFER_USAGE_UNSYNCHRONIZED);
-   assert(map);
-   if(map) {
-      memcpy((uint8_t *)map + offset, data, size);
-      pipe_buffer_flush_mapped_range(screen, buf, offset, size);
-      pipe_buffer_unmap(screen, buf);
-   }
+   struct pipe_box box;
+
+   u_box_1d(offset, size, &box);
+
+   pipe->transfer_inline_write(pipe, 
+			       buf,
+			       u_subresource(0,0),
+			       (PIPE_TRANSFER_WRITE |
+				PIPE_TRANSFER_NOOVERWRITE),
+			       &box,
+			       data,
+			       0, 0);
 }
 
 static INLINE void
-pipe_buffer_read(struct pipe_screen *screen,
-                 struct pipe_buffer *buf,
-                 unsigned offset, unsigned size,
+pipe_buffer_read(struct pipe_context *pipe,
+                 struct pipe_resource *buf,
+                 unsigned offset,
+		 unsigned size,
                  void *data)
 {
-   void *map;
-   
-   assert(offset < buf->size);
-   assert(offset + size <= buf->size);
-   assert(size);
-
-   map = pipe_buffer_map_range(screen, buf, offset, size, PIPE_BUFFER_USAGE_CPU_READ);
-   assert(map);
-   if(map) {
-      memcpy(data, (const uint8_t *)map + offset, size);
-      pipe_buffer_unmap(screen, buf);
-   }
+   struct pipe_transfer *src_transfer;
+   ubyte *map;
+
+   map = (ubyte *) pipe_buffer_map_range(pipe,
+					 buf,
+					 offset, size,
+					 PIPE_TRANSFER_READ,
+					 &src_transfer);
+
+   if (map)
+      memcpy(data, map + offset, size);
+
+   pipe_buffer_unmap(pipe, buf, src_transfer);
 }
 
-static INLINE void *
-pipe_transfer_map( struct pipe_transfer *transf )
+static INLINE struct pipe_transfer *
+pipe_get_transfer( struct pipe_context *context,
+		       struct pipe_resource *resource,
+		       unsigned face, unsigned level,
+		       unsigned zslice,
+		       enum pipe_transfer_usage usage,
+		       unsigned x, unsigned y,
+		       unsigned w, unsigned h)
 {
-   struct pipe_screen *screen = transf->texture->screen;
-   return screen->transfer_map(screen, transf);
+   struct pipe_box box;
+   u_box_2d_zslice( x, y, zslice, w, h, &box );
+   return context->get_transfer( context,
+				 resource,
+				 u_subresource(face, level),
+				 usage,
+				 &box );
 }
 
-static INLINE void
-pipe_transfer_unmap( struct pipe_transfer *transf )
+static INLINE void *
+pipe_transfer_map( struct pipe_context *context,
+                   struct pipe_transfer *transfer )
 {
-   struct pipe_screen *screen = transf->texture->screen;
-   screen->transfer_unmap(screen, transf);
+   return context->transfer_map( context, transfer );
 }
 
 static INLINE void
-pipe_transfer_destroy( struct pipe_transfer *transf )
+pipe_transfer_unmap( struct pipe_context *context,
+                     struct pipe_transfer *transfer )
 {
-   struct pipe_screen *screen = transf->texture->screen;
-   screen->tex_transfer_destroy(transf);
+   context->transfer_unmap( context, transfer );
 }
 
-static INLINE unsigned
-pipe_transfer_buffer_flags( struct pipe_transfer *transf )
+
+static INLINE void
+pipe_transfer_destroy( struct pipe_context *context, 
+		       struct pipe_transfer *transfer )
 {
-   switch (transf->usage & PIPE_TRANSFER_READ_WRITE) {
-   case PIPE_TRANSFER_READ_WRITE:
-      return PIPE_BUFFER_USAGE_CPU_READ | PIPE_BUFFER_USAGE_CPU_WRITE;
-   case PIPE_TRANSFER_READ:
-      return PIPE_BUFFER_USAGE_CPU_READ;
-   case PIPE_TRANSFER_WRITE:
-      return PIPE_BUFFER_USAGE_CPU_WRITE;
-   default:
-      debug_assert(0);
-      return 0;
-   }
+   context->transfer_destroy(context, transfer);
 }
 
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gallium/auxiliary/util/u_memory.h b/src/gallium/auxiliary/util/u_memory.h
index a2fc597356..53d56599fe 100644
--- a/src/gallium/auxiliary/util/u_memory.h
+++ b/src/gallium/auxiliary/util/u_memory.h
@@ -88,7 +88,7 @@ mem_dup(const void *src, uint size)
 /**
  * Offset of a field in a struct, in bytes.
  */
-#define Offset(TYPE, MEMBER) ((unsigned)&(((TYPE *)NULL)->MEMBER))
+#define Offset(TYPE, MEMBER) ((uintptr_t)&(((TYPE *)NULL)->MEMBER))
 
 
 
diff --git a/src/gallium/auxiliary/util/u_pack_color.h b/src/gallium/auxiliary/util/u_pack_color.h
index 50f1b1670b..3ebef9fb74 100644
--- a/src/gallium/auxiliary/util/u_pack_color.h
+++ b/src/gallium/auxiliary/util/u_pack_color.h
@@ -37,6 +37,7 @@
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
+#include "util/u_debug.h"
 #include "util/u_format.h"
 #include "util/u_math.h"
 
@@ -92,6 +93,11 @@ util_pack_color_ub(ubyte r, ubyte g, ubyte b, ubyte a,
          uc->us = ((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3);
       }
       return;
+   case PIPE_FORMAT_B5G5R5X1_UNORM:
+      {
+         uc->us = ((0x80) << 8) | ((r & 0xf8) << 7) | ((g & 0xf8) << 2) | (b >> 3);
+      }
+      return;
    case PIPE_FORMAT_B5G5R5A1_UNORM:
       {
          uc->us = ((a & 0x80) << 8) | ((r & 0xf8) << 7) | ((g & 0xf8) << 2) | (b >> 3);
@@ -216,6 +222,15 @@ util_unpack_color_ub(enum pipe_format format, union util_color *uc,
          *a = (ubyte) 0xff;
       }
       return;
+   case PIPE_FORMAT_B5G5R5X1_UNORM:
+      {
+         ushort p = uc->us;
+         *r = (ubyte) (((p >>  7) & 0xf8) | ((p >> 12) & 0x7));
+         *g = (ubyte) (((p >>  2) & 0xf8) | ((p >>  7) & 0x7));
+         *b = (ubyte) (((p <<  3) & 0xf8) | ((p >>  2) & 0x7));
+         *a = (ubyte) 0xff;
+      }
+      return;
    case PIPE_FORMAT_B5G5R5A1_UNORM:
       {
          ushort p = uc->us;
@@ -361,6 +376,11 @@ util_pack_color(const float rgba[4], enum pipe_format format, union util_color *
          uc->us = ((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3);
       }
       return;
+   case PIPE_FORMAT_B5G5R5X1_UNORM:
+      {
+         uc->us = ((0x80) << 8) | ((r & 0xf8) << 7) | ((g & 0xf8) << 2) | (b >> 3);
+      }
+      return;
    case PIPE_FORMAT_B5G5R5A1_UNORM:
       {
          uc->us = ((a & 0x80) << 8) | ((r & 0xf8) << 7) | ((g & 0xf8) << 2) | (b >> 3);
@@ -427,17 +447,17 @@ util_pack_z(enum pipe_format format, double z)
       return (uint) (z * 0xffffffff);
    case PIPE_FORMAT_Z32_FLOAT:
       return (uint)z;
-   case PIPE_FORMAT_Z24S8_UNORM:
+   case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
    case PIPE_FORMAT_Z24X8_UNORM:
       if (z == 1.0)
          return 0xffffff;
       return (uint) (z * 0xffffff);
-   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
    case PIPE_FORMAT_X8Z24_UNORM:
       if (z == 1.0)
          return 0xffffff00;
       return ((uint) (z * 0xffffff)) << 8;
-   case PIPE_FORMAT_S8_UNORM:
+   case PIPE_FORMAT_S8_USCALED:
       /* this case can get it via util_pack_z_stencil() */
       return 0;
    default:
@@ -458,13 +478,13 @@ util_pack_z_stencil(enum pipe_format format, double z, uint s)
    unsigned packed = util_pack_z(format, z);
 
    switch (format) {
-   case PIPE_FORMAT_Z24S8_UNORM:
+   case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
       packed |= s << 24;
       break;
-   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
       packed |= s;
       break;
-   case PIPE_FORMAT_S8_UNORM:
+   case PIPE_FORMAT_S8_USCALED:
       packed |= s;
       break;
    default:
diff --git a/src/gallium/auxiliary/util/u_rect.c b/src/gallium/auxiliary/util/u_rect.c
index 8479161c74..098cdfd58b 100644
--- a/src/gallium/auxiliary/util/u_rect.c
+++ b/src/gallium/auxiliary/util/u_rect.c
@@ -35,6 +35,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_screen.h"
 #include "util/u_format.h"
+#include "util/u_inlines.h"
 #include "util/u_rect.h"
 
 
@@ -169,7 +170,6 @@ util_surface_copy(struct pipe_context *pipe,
                   unsigned src_x, unsigned src_y, 
                   unsigned w, unsigned h)
 {
-   struct pipe_screen *screen = pipe->screen;
    struct pipe_transfer *src_trans, *dst_trans;
    void *dst_map;
    const void *src_map;
@@ -182,28 +182,28 @@ util_surface_copy(struct pipe_context *pipe,
    src_format = src->texture->format;
    dst_format = dst->texture->format;
 
-   src_trans = screen->get_tex_transfer(screen,
-                                        src->texture,
-                                        src->face,
-                                        src->level,
-                                        src->zslice,
-                                        PIPE_TRANSFER_READ,
-                                        src_x, src_y, w, h);
-
-   dst_trans = screen->get_tex_transfer(screen,
-                                        dst->texture,
-                                        dst->face,
-                                        dst->level,
-                                        dst->zslice,
-                                        PIPE_TRANSFER_WRITE,
-                                        dst_x, dst_y, w, h);
+   src_trans = pipe_get_transfer(pipe,
+				 src->texture,
+				 src->face,
+				 src->level,
+				 src->zslice,
+				 PIPE_TRANSFER_READ,
+				 src_x, src_y, w, h);
+
+   dst_trans = pipe_get_transfer(pipe,
+				 dst->texture,
+				 dst->face,
+				 dst->level,
+				 dst->zslice,
+				 PIPE_TRANSFER_WRITE,
+				 dst_x, dst_y, w, h);
 
    assert(util_format_get_blocksize(dst_format) == util_format_get_blocksize(src_format));
    assert(util_format_get_blockwidth(dst_format) == util_format_get_blockwidth(src_format));
    assert(util_format_get_blockheight(dst_format) == util_format_get_blockheight(src_format));
 
-   src_map = pipe->screen->transfer_map(screen, src_trans);
-   dst_map = pipe->screen->transfer_map(screen, dst_trans);
+   src_map = pipe->transfer_map(pipe, src_trans);
+   dst_map = pipe->transfer_map(pipe, dst_trans);
 
    assert(src_map);
    assert(dst_map);
@@ -221,11 +221,11 @@ util_surface_copy(struct pipe_context *pipe,
                      do_flip ? h - 1 : 0);
    }
 
-   pipe->screen->transfer_unmap(pipe->screen, src_trans);
-   pipe->screen->transfer_unmap(pipe->screen, dst_trans);
+   pipe->transfer_unmap(pipe, src_trans);
+   pipe->transfer_unmap(pipe, dst_trans);
 
-   screen->tex_transfer_destroy(src_trans);
-   screen->tex_transfer_destroy(dst_trans);
+   pipe->transfer_destroy(pipe, src_trans);
+   pipe->transfer_destroy(pipe, dst_trans);
 }
 
 
@@ -243,65 +243,65 @@ util_surface_fill(struct pipe_context *pipe,
                   unsigned dstx, unsigned dsty,
                   unsigned width, unsigned height, unsigned value)
 {
-   struct pipe_screen *screen = pipe->screen;
    struct pipe_transfer *dst_trans;
    void *dst_map;
 
    assert(dst->texture);
    if (!dst->texture)
       return;
-   dst_trans = screen->get_tex_transfer(screen,
-                                        dst->texture,
-                                        dst->face,
-                                        dst->level,
-                                        dst->zslice,
-                                        PIPE_TRANSFER_WRITE,
-                                        dstx, dsty, width, height);
+   dst_trans = pipe_get_transfer(pipe,
+				 dst->texture,
+				 dst->face,
+				 dst->level,
+				 dst->zslice,
+				 PIPE_TRANSFER_WRITE,
+				 dstx, dsty, width, height);
 
-   dst_map = pipe->screen->transfer_map(screen, dst_trans);
+   dst_map = pipe->transfer_map(pipe, dst_trans);
 
    assert(dst_map);
 
    if (dst_map) {
       assert(dst_trans->stride > 0);
 
-      switch (util_format_get_blocksize(dst_trans->texture->format)) {
+      switch (util_format_get_blocksize(dst->texture->format)) {
       case 1:
       case 2:
       case 4:
-         util_fill_rect(dst_map, dst_trans->texture->format, dst_trans->stride,
+         util_fill_rect(dst_map, dst->texture->format,
+			dst_trans->stride,
                         0, 0, width, height, value);
          break;
       case 8:
-         {
-            /* expand the 4-byte clear value to an 8-byte value */
-            ushort *row = (ushort *) dst_map;
-            ushort val0 = UBYTE_TO_USHORT((value >>  0) & 0xff);
-            ushort val1 = UBYTE_TO_USHORT((value >>  8) & 0xff);
-            ushort val2 = UBYTE_TO_USHORT((value >> 16) & 0xff);
-            ushort val3 = UBYTE_TO_USHORT((value >> 24) & 0xff);
-            unsigned i, j;
-            val0 = (val0 << 8) | val0;
-            val1 = (val1 << 8) | val1;
-            val2 = (val2 << 8) | val2;
-            val3 = (val3 << 8) | val3;
-            for (i = 0; i < height; i++) {
-               for (j = 0; j < width; j++) {
-                  row[j*4+0] = val0;
-                  row[j*4+1] = val1;
-                  row[j*4+2] = val2;
-                  row[j*4+3] = val3;
-               }
-               row += dst_trans->stride/2;
-            }
-         }
-         break;
+      {
+	 /* expand the 4-byte clear value to an 8-byte value */
+	 ushort *row = (ushort *) dst_map;
+	 ushort val0 = UBYTE_TO_USHORT((value >>  0) & 0xff);
+	 ushort val1 = UBYTE_TO_USHORT((value >>  8) & 0xff);
+	 ushort val2 = UBYTE_TO_USHORT((value >> 16) & 0xff);
+	 ushort val3 = UBYTE_TO_USHORT((value >> 24) & 0xff);
+	 unsigned i, j;
+	 val0 = (val0 << 8) | val0;
+	 val1 = (val1 << 8) | val1;
+	 val2 = (val2 << 8) | val2;
+	 val3 = (val3 << 8) | val3;
+	 for (i = 0; i < height; i++) {
+	    for (j = 0; j < width; j++) {
+	       row[j*4+0] = val0;
+	       row[j*4+1] = val1;
+	       row[j*4+2] = val2;
+	       row[j*4+3] = val3;
+	    }
+	    row += dst_trans->stride/2;
+	 }
+      }
+      break;
       default:
          assert(0);
          break;
       }
    }
 
-   pipe->screen->transfer_unmap(pipe->screen, dst_trans);
-   screen->tex_transfer_destroy(dst_trans);
+   pipe->transfer_unmap(pipe, dst_trans);
+   pipe->transfer_destroy(pipe, dst_trans);
 }
diff --git a/src/gallium/auxiliary/util/u_resource.c b/src/gallium/auxiliary/util/u_resource.c
new file mode 100644
index 0000000000..9e6474b83d
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_resource.c
@@ -0,0 +1,96 @@
+
+
+#include "util/u_inlines.h"
+#include "util/u_transfer.h"
+
+static INLINE struct u_resource *
+u_resource( struct pipe_resource *res )
+{
+   return (struct u_resource *)res;
+}
+
+boolean u_resource_get_handle_vtbl(struct pipe_screen *screen,
+			  struct pipe_resource *resource,
+			  struct winsys_handle *handle)
+{
+   struct u_resource *ur = u_resource(resource);
+   return ur->vtbl->resource_get_handle(screen, resource, handle);
+}
+
+void u_resource_destroy_vtbl(struct pipe_screen *screen,
+		    struct pipe_resource *resource)
+{
+   struct u_resource *ur = u_resource(resource);
+   ur->vtbl->resource_destroy(screen, resource);
+}
+
+unsigned u_is_resource_referenced_vtbl( struct pipe_context *pipe,
+					struct pipe_resource *resource,
+					unsigned face, unsigned level)
+{
+   struct u_resource *ur = u_resource(resource);
+   return ur->vtbl->is_resource_referenced(pipe, resource, face, level);
+}
+
+struct pipe_transfer *u_get_transfer_vtbl(struct pipe_context *context,
+					  struct pipe_resource *resource,
+					  struct pipe_subresource sr,
+					  enum pipe_transfer_usage usage,
+					  const struct pipe_box *box)
+{
+   struct u_resource *ur = u_resource(resource);
+   return ur->vtbl->get_transfer(context, resource, sr, usage, box);
+}
+
+void u_transfer_destroy_vtbl(struct pipe_context *pipe,
+			     struct pipe_transfer *transfer)
+{
+   struct u_resource *ur = u_resource(transfer->resource);
+   ur->vtbl->transfer_destroy(pipe, transfer);
+}
+
+void *u_transfer_map_vtbl( struct pipe_context *pipe,
+			   struct pipe_transfer *transfer )
+{
+   struct u_resource *ur = u_resource(transfer->resource);
+   return ur->vtbl->transfer_map(pipe, transfer);
+}
+
+void u_transfer_flush_region_vtbl( struct pipe_context *pipe,
+				   struct pipe_transfer *transfer,
+				   const struct pipe_box *box)
+{
+   struct u_resource *ur = u_resource(transfer->resource);
+   ur->vtbl->transfer_flush_region(pipe, transfer, box);
+}
+
+void u_transfer_unmap_vtbl( struct pipe_context *pipe,
+			    struct pipe_transfer *transfer )
+{
+   struct u_resource *ur = u_resource(transfer->resource);
+   ur->vtbl->transfer_unmap(pipe, transfer);
+}
+
+void u_transfer_inline_write_vtbl( struct pipe_context *pipe,
+				   struct pipe_resource *resource,
+				   struct pipe_subresource sr,
+				   unsigned usage,
+				   const struct pipe_box *box,
+				   const void *data,
+				   unsigned stride,
+				   unsigned slice_stride)
+{
+   struct u_resource *ur = u_resource(resource);
+   ur->vtbl->transfer_inline_write(pipe, 
+				   resource,
+				   sr,
+				   usage,
+				   box,
+				   data,
+				   stride,
+				   slice_stride);
+}
+
+
+
+
diff --git a/src/gallium/auxiliary/util/u_sampler.c b/src/gallium/auxiliary/util/u_sampler.c
new file mode 100644
index 0000000000..e77f562ea2
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_sampler.c
@@ -0,0 +1,100 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "u_format.h"
+#include "u_sampler.h"
+
+
+static void
+default_template(struct pipe_sampler_view *view,
+                 const struct pipe_resource *texture,
+                 enum pipe_format format,
+                 unsigned expand_green_blue)
+{
+   /* XXX: Check if format is compatible with texture->format.
+    */
+
+   view->format = format;
+   view->first_level = 0;
+   view->last_level = texture->last_level;
+   view->swizzle_r = PIPE_SWIZZLE_RED;
+   view->swizzle_g = PIPE_SWIZZLE_GREEN;
+   view->swizzle_b = PIPE_SWIZZLE_BLUE;
+   view->swizzle_a = PIPE_SWIZZLE_ALPHA;
+
+   /* Override default green and blue component expansion to the requested
+    * one.
+    *
+    * Gallium expands nonexistent components to (0,0,0,1), DX9 expands
+    * to (1,1,1,1).  Since alpha is always expanded to 1, and red is
+    * always present, we only really care about green and blue
+    * components.
+    *
+    * To make it look less hackish, one would have to add
+    * UTIL_FORMAT_SWIZZLE_EXPAND to indicate components for expansion
+    * and then override without exceptions or favoring one component
+    * over another.
+    */
+   if (format != PIPE_FORMAT_A8_UNORM) {
+      const struct util_format_description *desc = util_format_description(format);
+
+      assert(desc);
+      if (desc) {
+         if (desc->swizzle[1] == UTIL_FORMAT_SWIZZLE_0) {
+            view->swizzle_g = expand_green_blue;
+         }
+         if (desc->swizzle[2] == UTIL_FORMAT_SWIZZLE_0) {
+            view->swizzle_b = expand_green_blue;
+         }
+      }
+   }
+}
+
+void
+u_sampler_view_default_template(struct pipe_sampler_view *view,
+                                const struct pipe_resource *texture,
+                                enum pipe_format format)
+{
+   /* Expand to (0, 0, 0, 1) */
+   default_template(view,
+                    texture,
+                    format,
+                    PIPE_SWIZZLE_ZERO);
+}
+
+void
+u_sampler_view_default_dx9_template(struct pipe_sampler_view *view,
+                                    const struct pipe_resource *texture,
+                                    enum pipe_format format)
+{
+   /* Expand to (1, 1, 1, 1) */
+   default_template(view,
+                    texture,
+                    format,
+                    PIPE_SWIZZLE_ONE);
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_depth.h b/src/gallium/auxiliary/util/u_sampler.h
index 79d6981bb5..f3dad7417e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_depth.h
+++ b/src/gallium/auxiliary/util/u_sampler.h
@@ -1,6 +1,6 @@
 /**************************************************************************
  *
- * Copyright 2009 VMware, Inc.
+ * Copyright 2010 VMware, Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -26,38 +26,32 @@
  **************************************************************************/
 
 
-/**
- * Depth/stencil testing to LLVM IR translation.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-#ifndef LP_BLD_DEPTH_H
-#define LP_BLD_DEPTH_H
-
+#ifndef U_SAMPLER_H
+#define U_SAMPLER_H
 
-#include <llvm-c/Core.h>  
 
- 
-struct pipe_depth_state;
-struct util_format_description;
-struct lp_type;
-struct lp_build_mask_context;
+#include "pipe/p_defines.h"
+#include "pipe/p_format.h"
+#include "pipe/p_state.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-struct lp_type
-lp_depth_type(const struct util_format_description *format_desc,
-              unsigned length);
 
+void
+u_sampler_view_default_template(struct pipe_sampler_view *view,
+                                const struct pipe_resource *texture,
+                                enum pipe_format format);
 
 void
-lp_build_depth_test(LLVMBuilderRef builder,
-                    const struct pipe_depth_state *state,
-                    struct lp_type type,
-                    const struct util_format_description *format_desc,
-                    struct lp_build_mask_context *mask,
-                    LLVMValueRef src,
-                    LLVMValueRef dst_ptr);
+u_sampler_view_default_dx9_template(struct pipe_sampler_view *view,
+                                    const struct pipe_resource *texture,
+                                    enum pipe_format format);
+
 
+#ifdef __cplusplus
+} /* extern "C" { */
+#endif
 
-#endif /* !LP_BLD_DEPTH_H */
+#endif /* U_SAMPLER_H */
diff --git a/src/gallium/auxiliary/util/u_simple_screen.c b/src/gallium/auxiliary/util/u_simple_screen.c
deleted file mode 100644
index 53f3c16dbc..0000000000
--- a/src/gallium/auxiliary/util/u_simple_screen.c
+++ /dev/null
@@ -1,153 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#include "u_simple_screen.h"
-
-#include "pipe/p_screen.h"
-#include "pipe/p_state.h"
-#include "util/u_simple_screen.h"
-
-
-static struct pipe_buffer *
-pass_buffer_create(struct pipe_screen *screen,
-                   unsigned alignment,
-                   unsigned usage,
-                   unsigned size)
-{
-   struct pipe_buffer *buffer =
-      screen->winsys->buffer_create(screen->winsys, alignment, usage, size);
-
-   buffer->screen = screen;
-
-   return buffer;
-}
-
-static struct pipe_buffer *
-pass_user_buffer_create(struct pipe_screen *screen,
-                        void *ptr,
-                        unsigned bytes)
-{
-   struct pipe_buffer *buffer =
-      screen->winsys->user_buffer_create(screen->winsys, ptr, bytes);
-
-   buffer->screen = screen;
-
-   return buffer;
-}
-
-static struct pipe_buffer *
-pass_surface_buffer_create(struct pipe_screen *screen,
-                           unsigned width, unsigned height,
-                           enum pipe_format format,
-                           unsigned usage,
-                           unsigned tex_usage,
-                           unsigned *stride)
-{
-   struct pipe_buffer *buffer =
-      screen->winsys->surface_buffer_create(screen->winsys, width, height,
-                                            format, usage, tex_usage, stride);
-
-   buffer->screen = screen;
-
-   return buffer;
-}
-
-static void *
-pass_buffer_map(struct pipe_screen *screen,
-                struct pipe_buffer *buf,
-                unsigned usage)
-{
-   return screen->winsys->buffer_map(screen->winsys, buf, usage);
-}
-
-static void
-pass_buffer_unmap(struct pipe_screen *screen,
-                  struct pipe_buffer *buf)
-{
-   screen->winsys->buffer_unmap(screen->winsys, buf);
-}
-
-static void
-pass_buffer_destroy(struct pipe_buffer *buf)
-{
-   buf->screen->winsys->buffer_destroy(buf);
-}
-
-
-static void
-pass_flush_frontbuffer(struct pipe_screen *screen,
-                       struct pipe_surface *surf,
-                       void *context_private)
-{
-   screen->winsys->flush_frontbuffer(screen->winsys, surf, context_private);
-}
-
-static void
-pass_fence_reference(struct pipe_screen *screen,
-                     struct pipe_fence_handle **ptr,
-                     struct pipe_fence_handle *fence)
-{
-   screen->winsys->fence_reference(screen->winsys, ptr, fence);
-}
-
-static int
-pass_fence_signalled(struct pipe_screen *screen,
-                     struct pipe_fence_handle *fence,
-                     unsigned flag)
-{
-   return screen->winsys->fence_signalled(screen->winsys, fence, flag);
-}
-
-static int
-pass_fence_finish(struct pipe_screen *screen,
-                  struct pipe_fence_handle *fence,
-                  unsigned flag)
-{
-   return screen->winsys->fence_finish(screen->winsys, fence, flag);
-}
-
-void
-u_simple_screen_init(struct pipe_screen *screen)
-{
-   screen->buffer_create = pass_buffer_create;
-   screen->user_buffer_create = pass_user_buffer_create;
-   screen->surface_buffer_create = pass_surface_buffer_create;
-
-   screen->buffer_map = pass_buffer_map;
-   screen->buffer_unmap = pass_buffer_unmap;
-   screen->buffer_destroy = pass_buffer_destroy;
-   screen->flush_frontbuffer = pass_flush_frontbuffer;
-   screen->fence_reference = pass_fence_reference;
-   screen->fence_signalled = pass_fence_signalled;
-   screen->fence_finish = pass_fence_finish;
-}
-
-const char *
-u_simple_screen_winsys_name(struct pipe_screen *screen)
-{
-   return screen->winsys->get_name(screen->winsys);
-}
diff --git a/src/gallium/auxiliary/util/u_simple_screen.h b/src/gallium/auxiliary/util/u_simple_screen.h
index bb3f5ba102..b52232f025 100644
--- a/src/gallium/auxiliary/util/u_simple_screen.h
+++ b/src/gallium/auxiliary/util/u_simple_screen.h
@@ -33,7 +33,7 @@
 struct pipe_screen;
 struct pipe_fence_handle;
 struct pipe_surface;
-struct pipe_buffer;
+struct pipe_resource;
 
 /**
  * Gallium3D drivers are (meant to be!) independent of both GL and the
@@ -53,11 +53,6 @@ struct pipe_winsys
    const char *(*get_name)( struct pipe_winsys *ws );
 
    /**
-    * Do any special operations to ensure buffer size is correct
-    */
-   void (*update_buffer)( struct pipe_winsys *ws,
-                          void *context_private );
-   /**
     * Do any special operations to ensure frontbuffer contents are
     * displayed, eg copy fake frontbuffer.
     */
@@ -73,14 +68,13 @@ struct pipe_winsys
     * window systems must then implement that interface (rather than the
     * other way around...).
     *
-    * usage is a bitmask of PIPE_BUFFER_USAGE_PIXEL/VERTEX/INDEX/CONSTANT. This
-    * usage argument is only an optimization hint, not a guarantee, therefore
-    * proper behavior must be observed in all circumstances.
+    * usage is a bitmask of PIPE_BIND_*.
+    * All possible usages must be present.
     *
     * alignment indicates the client's alignment requirements, eg for
     * SSE instructions.
     */
-   struct pipe_buffer *(*buffer_create)( struct pipe_winsys *ws,
+   struct pipe_resource *(*buffer_create)( struct pipe_winsys *ws,
                                          unsigned alignment,
                                          unsigned usage,
                                          unsigned size );
@@ -106,7 +100,7 @@ struct pipe_winsys
     * Note that ptr may be accessed at any time upto the time when the
     * buffer is destroyed, so the data must not be freed before then.
     */
-   struct pipe_buffer *(*user_buffer_create)(struct pipe_winsys *ws,
+   struct pipe_resource *(*user_buffer_create)(struct pipe_winsys *ws,
                                                     void *ptr,
                                                     unsigned bytes);
 
@@ -117,11 +111,11 @@ struct pipe_winsys
     * display targets) must be allocated with special characteristics, memory
     * pools, or obtained directly from the windowing system.
     *
-    * This callback is invoked by the pipe_screenwhen creating a texture marked
-    * with the PIPE_TEXTURE_USAGE_DISPLAY_TARGET flag  to get the underlying
+    * This callback is invoked by the pipe_screen when creating a texture marked
+    * with the PIPE_BIND_DISPLAY_TARGET flag  to get the underlying
     * buffer storage.
     */
-   struct pipe_buffer *(*surface_buffer_create)(struct pipe_winsys *ws,
+   struct pipe_resource *(*surface_buffer_create)(struct pipe_winsys *ws,
                                                 unsigned width, unsigned height,
                                                 enum pipe_format format,
                                                 unsigned usage,
@@ -134,13 +128,13 @@ struct pipe_winsys
     * flags is bitmask of PIPE_BUFFER_USAGE_CPU_READ/WRITE flags.
     */
    void *(*buffer_map)( struct pipe_winsys *ws,
-                        struct pipe_buffer *buf,
+                        struct pipe_resource *buf,
                         unsigned usage );
 
    void (*buffer_unmap)( struct pipe_winsys *ws,
-                         struct pipe_buffer *buf );
+                         struct pipe_resource *buf );
 
-   void (*buffer_destroy)( struct pipe_buffer *buf );
+   void (*buffer_destroy)( struct pipe_resource *buf );
 
 
    /** Set ptr = fence, with reference counting */
diff --git a/src/gallium/auxiliary/util/u_surface.c b/src/gallium/auxiliary/util/u_surface.c
index 33306bbc2a..42440d0d67 100644
--- a/src/gallium/auxiliary/util/u_surface.c
+++ b/src/gallium/auxiliary/util/u_surface.c
@@ -50,7 +50,8 @@
 boolean
 util_create_rgba_surface(struct pipe_screen *screen,
                          uint width, uint height,
-                         struct pipe_texture **textureOut,
+			 uint bind,
+                         struct pipe_resource **textureOut,
                          struct pipe_surface **surfaceOut)
 {
    static const enum pipe_format rgbaFormats[] = {
@@ -60,15 +61,14 @@ util_create_rgba_surface(struct pipe_screen *screen,
       PIPE_FORMAT_NONE
    };
    const uint target = PIPE_TEXTURE_2D;
-   const uint usage = PIPE_TEXTURE_USAGE_RENDER_TARGET;
    enum pipe_format format = PIPE_FORMAT_NONE;
-   struct pipe_texture templ;
+   struct pipe_resource templ;
    uint i;
 
    /* Choose surface format */
    for (i = 0; rgbaFormats[i]; i++) {
       if (screen->is_format_supported(screen, rgbaFormats[i],
-                                      target, usage, 0)) {
+                                      target, bind, 0)) {
          format = rgbaFormats[i];
          break;
       }
@@ -84,16 +84,19 @@ util_create_rgba_surface(struct pipe_screen *screen,
    templ.width0 = width;
    templ.height0 = height;
    templ.depth0 = 1;
-   templ.tex_usage = usage;
+   templ.bind = bind;
 
-   *textureOut = screen->texture_create(screen, &templ);
+   *textureOut = screen->resource_create(screen, &templ);
    if (!*textureOut)
       return FALSE;
 
    /* create surface / view into texture */
-   *surfaceOut = screen->get_tex_surface(screen, *textureOut, 0, 0, 0, PIPE_BUFFER_USAGE_GPU_WRITE);
+   *surfaceOut = screen->get_tex_surface(screen, 
+					 *textureOut,
+					 0, 0, 0,
+					 bind);
    if (!*surfaceOut) {
-      pipe_texture_reference(textureOut, NULL);
+      pipe_resource_reference(textureOut, NULL);
       return FALSE;
    }
 
@@ -105,11 +108,11 @@ util_create_rgba_surface(struct pipe_screen *screen,
  * Release the surface and texture from util_create_rgba_surface().
  */
 void
-util_destroy_rgba_surface(struct pipe_texture *texture,
+util_destroy_rgba_surface(struct pipe_resource *texture,
                           struct pipe_surface *surface)
 {
    pipe_surface_reference(&surface, NULL);
-   pipe_texture_reference(&texture, NULL);
+   pipe_resource_reference(&texture, NULL);
 }
 
 
diff --git a/src/gallium/auxiliary/util/u_surface.h b/src/gallium/auxiliary/util/u_surface.h
index 3c60df2c3e..119fcd4ce8 100644
--- a/src/gallium/auxiliary/util/u_surface.h
+++ b/src/gallium/auxiliary/util/u_surface.h
@@ -52,13 +52,13 @@ util_same_surface(const struct pipe_surface *s1, const struct pipe_surface *s2)
 
 extern boolean
 util_create_rgba_surface(struct pipe_screen *screen,
-                         uint width, uint height,
-                         struct pipe_texture **textureOut,
+                         uint width, uint height, uint bind,
+                         struct pipe_resource **textureOut,
                          struct pipe_surface **surfaceOut);
 
 
 extern void
-util_destroy_rgba_surface(struct pipe_texture *texture,
+util_destroy_rgba_surface(struct pipe_resource *texture,
                           struct pipe_surface *surface);
 
 
diff --git a/src/gallium/auxiliary/util/u_surfaces.c b/src/gallium/auxiliary/util/u_surfaces.c
new file mode 100644
index 0000000000..0be4609a20
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_surfaces.c
@@ -0,0 +1,112 @@
+#include "u_surfaces.h"
+#include "util/u_hash_table.h"
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+
+/* TODO: ouch, util_hash_table should do these by default when passed a null function pointer
+ * this indirect function call is quite bad
+ */
+static unsigned
+hash(void *key)
+{
+   return (unsigned)key;
+}
+
+static int
+compare(void *key1, void *key2)
+{
+   return (unsigned)key1 - (unsigned)key2;
+}
+
+struct pipe_surface *
+util_surfaces_do_get(struct util_surfaces *us, unsigned surface_struct_size, struct pipe_screen *pscreen, struct pipe_resource *pt, unsigned face, unsigned level, unsigned zslice, unsigned flags)
+{
+   struct pipe_surface *ps;
+   void *key = NULL;
+
+   if(pt->target == PIPE_TEXTURE_3D || pt->target == PIPE_TEXTURE_CUBE)
+   {	/* or 2D array */
+      if(!us->u.table)
+	 us->u.table = util_hash_table_create(hash, compare);
+      key = (void *)(((zslice + face) << 8) | level);
+      /* TODO: ouch, should have a get-reference function...
+       * also, shouldn't allocate a two-pointer structure for each item... */
+      ps = util_hash_table_get(us->u.table, key);
+   }
+   else
+   {
+      if(!us->u.array)
+	 us->u.array = CALLOC(pt->last_level + 1, sizeof(struct pipe_surface *));
+      ps = us->u.array[level];
+   }
+
+   if(ps)
+   {
+      p_atomic_inc(&ps->reference.count);
+      return ps;
+   }
+
+   ps = (struct pipe_surface *)CALLOC(1, surface_struct_size);
+   if(!ps)
+      return NULL;
+
+   pipe_surface_init(ps, pt, face, level, zslice, flags);
+   ps->offset = ~0;
+
+   if(pt->target == PIPE_TEXTURE_3D || pt->target == PIPE_TEXTURE_CUBE)
+      util_hash_table_set(us->u.table, key, ps);
+   else
+      us->u.array[level] = ps;
+
+   return ps;
+}
+
+void
+util_surfaces_do_detach(struct util_surfaces *us, struct pipe_surface *ps)
+{
+   struct pipe_resource *pt = ps->texture;
+   if(pt->target == PIPE_TEXTURE_3D || pt->target == PIPE_TEXTURE_CUBE)
+   {	/* or 2D array */
+      void* key = (void*)(((ps->zslice + ps->face) << 8) | ps->level);
+      util_hash_table_remove(us->u.table, key);
+   }
+   else
+      us->u.array[ps->level] = 0;
+}
+
+static enum pipe_error
+util_surfaces_destroy_callback(void *key, void *value, void *data)
+{
+   void (*destroy_surface) (struct pipe_surface * ps) = data;
+   destroy_surface((struct pipe_surface *)value);
+   return PIPE_OK;
+}
+
+void
+util_surfaces_destroy(struct util_surfaces *us, struct pipe_resource *pt, void (*destroy_surface) (struct pipe_surface *))
+{
+   if(pt->target == PIPE_TEXTURE_3D || pt->target == PIPE_TEXTURE_CUBE)
+   {	/* or 2D array */
+      if(us->u.table)
+      {
+	 util_hash_table_foreach(us->u.table, util_surfaces_destroy_callback, destroy_surface);
+	 util_hash_table_destroy(us->u.table);
+	 us->u.table = NULL;
+      }
+   }
+   else
+   {
+      if(us->u.array)
+      {
+	 unsigned i;
+	 for(i = 0; i < pt->last_level; ++i)
+	 {
+	    struct pipe_surface *ps = us->u.array[i];
+	    if(ps)
+	       destroy_surface(ps);
+	 }
+	 free(us->u.array);
+	 us->u.array = NULL;
+      }
+   }
+}
diff --git a/src/gallium/auxiliary/util/u_surfaces.h b/src/gallium/auxiliary/util/u_surfaces.h
new file mode 100644
index 0000000000..6de5e7cc17
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_surfaces.h
@@ -0,0 +1,54 @@
+#ifndef U_SURFACES_H_
+#define U_SURFACES_H_
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+#include "util/u_atomic.h"
+
+struct util_hash_table;
+
+struct util_surfaces
+{
+   union
+   {
+      struct util_hash_table *table;
+      struct pipe_surface **array;
+   } u;
+};
+
+struct pipe_surface *util_surfaces_do_get(struct util_surfaces *us, unsigned surface_struct_size, struct pipe_screen *pscreen, struct pipe_resource *pt, unsigned face, unsigned level, unsigned zslice, unsigned flags);
+
+/* fast inline path for the very common case */
+static INLINE struct pipe_surface *
+util_surfaces_get(struct util_surfaces *us, unsigned surface_struct_size, struct pipe_screen *pscreen, struct pipe_resource *pt, unsigned face, unsigned level, unsigned zslice, unsigned flags)
+{
+   if(likely(pt->target == PIPE_TEXTURE_2D && us->u.array))
+   {
+      struct pipe_surface *ps = us->u.array[level];
+      if(ps)
+      {
+	 p_atomic_inc(&ps->reference.count);
+	 return ps;
+      }
+   }
+
+   return util_surfaces_do_get(us, surface_struct_size, pscreen, pt, face, level, zslice, flags);
+}
+
+void util_surfaces_do_detach(struct util_surfaces *us, struct pipe_surface *ps);
+
+static INLINE void
+util_surfaces_detach(struct util_surfaces *us, struct pipe_surface *ps)
+{
+   if(likely(ps->texture->target == PIPE_TEXTURE_2D))
+   {
+      us->u.array[ps->level] = 0;
+      return;
+   }
+
+   return util_surfaces_do_detach(us, ps);
+}
+
+void util_surfaces_destroy(struct util_surfaces *us, struct pipe_resource *pt, void (*destroy_surface) (struct pipe_surface *));
+
+#endif
diff --git a/src/gallium/auxiliary/util/u_tile.c b/src/gallium/auxiliary/util/u_tile.c
index 79481b710b..fe327c302b 100644
--- a/src/gallium/auxiliary/util/u_tile.c
+++ b/src/gallium/auxiliary/util/u_tile.c
@@ -45,27 +45,27 @@
  * Move raw block of pixels from transfer object to user memory.
  */
 void
-pipe_get_tile_raw(struct pipe_transfer *pt,
+pipe_get_tile_raw(struct pipe_context *pipe,
+                  struct pipe_transfer *pt,
                   uint x, uint y, uint w, uint h,
                   void *dst, int dst_stride)
 {
-   struct pipe_screen *screen = pt->texture->screen;
    const void *src;
 
    if (dst_stride == 0)
-      dst_stride = util_format_get_stride(pt->texture->format, w);
+      dst_stride = util_format_get_stride(pt->resource->format, w);
 
-   if (pipe_clip_tile(x, y, &w, &h, pt))
+   if (u_clip_tile(x, y, &w, &h, &pt->box))
       return;
 
-   src = screen->transfer_map(screen, pt);
+   src = pipe->transfer_map(pipe, pt);
    assert(src);
    if(!src)
       return;
 
-   util_copy_rect(dst, pt->texture->format, dst_stride, 0, 0, w, h, src, pt->stride, x, y);
+   util_copy_rect(dst, pt->resource->format, dst_stride, 0, 0, w, h, src, pt->stride, x, y);
 
-   screen->transfer_unmap(screen, pt);
+   pipe->transfer_unmap(pipe, pt);
 }
 
 
@@ -73,28 +73,28 @@ pipe_get_tile_raw(struct pipe_transfer *pt,
  * Move raw block of pixels from user memory to transfer object.
  */
 void
-pipe_put_tile_raw(struct pipe_transfer *pt,
+pipe_put_tile_raw(struct pipe_context *pipe,
+                  struct pipe_transfer *pt,
                   uint x, uint y, uint w, uint h,
                   const void *src, int src_stride)
 {
-   struct pipe_screen *screen = pt->texture->screen;
    void *dst;
-   enum pipe_format format = pt->texture->format;
+   enum pipe_format format = pt->resource->format;
 
    if (src_stride == 0)
       src_stride = util_format_get_stride(format, w);
 
-   if (pipe_clip_tile(x, y, &w, &h, pt))
+   if (u_clip_tile(x, y, &w, &h, &pt->box))
       return;
 
-   dst = screen->transfer_map(screen, pt);
+   dst = pipe->transfer_map(pipe, pt);
    assert(dst);
    if(!dst)
       return;
 
    util_copy_rect(dst, format, pt->stride, x, y, w, h, src, src_stride, 0, 0);
 
-   screen->transfer_unmap(screen, pt);
+   pipe->transfer_unmap(pipe, pt);
 }
 
 
@@ -108,387 +108,6 @@ pipe_put_tile_raw(struct pipe_transfer *pt,
 
 
 
-/*** PIPE_FORMAT_B8G8R8A8_UNORM ***/
-
-static void
-a8r8g8b8_get_tile_rgba(const unsigned *src,
-                       unsigned w, unsigned h,
-                       float *p,
-                       unsigned dst_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         const unsigned pixel = *src++;
-         pRow[0] = ubyte_to_float((pixel >> 16) & 0xff);
-         pRow[1] = ubyte_to_float((pixel >>  8) & 0xff);
-         pRow[2] = ubyte_to_float((pixel >>  0) & 0xff);
-         pRow[3] = ubyte_to_float((pixel >> 24) & 0xff);
-      }
-      p += dst_stride;
-   }
-}
-
-
-static void
-a8r8g8b8_put_tile_rgba(unsigned *dst,
-                       unsigned w, unsigned h,
-                       const float *p,
-                       unsigned src_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      const float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         unsigned r, g, b, a;
-         r = float_to_ubyte(pRow[0]);
-         g = float_to_ubyte(pRow[1]);
-         b = float_to_ubyte(pRow[2]);
-         a = float_to_ubyte(pRow[3]);
-         *dst++ = (a << 24) | (r << 16) | (g << 8) | b;
-      }
-      p += src_stride;
-   }
-}
-
-
-/*** PIPE_FORMAT_B8G8R8X8_UNORM ***/
-
-static void
-x8r8g8b8_get_tile_rgba(const unsigned *src,
-                       unsigned w, unsigned h,
-                       float *p,
-                       unsigned dst_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         const unsigned pixel = *src++;
-         pRow[0] = ubyte_to_float((pixel >> 16) & 0xff);
-         pRow[1] = ubyte_to_float((pixel >>  8) & 0xff);
-         pRow[2] = ubyte_to_float((pixel >>  0) & 0xff);
-         pRow[3] = 1.0F;
-      }
-      p += dst_stride;
-   }
-}
-
-
-static void
-x8r8g8b8_put_tile_rgba(unsigned *dst,
-                       unsigned w, unsigned h,
-                       const float *p,
-                       unsigned src_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      const float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         unsigned r, g, b;
-         r = float_to_ubyte(pRow[0]);
-         g = float_to_ubyte(pRow[1]);
-         b = float_to_ubyte(pRow[2]);
-         *dst++ = (0xff << 24) | (r << 16) | (g << 8) | b;
-      }
-      p += src_stride;
-   }
-}
-
-
-/*** PIPE_FORMAT_A8R8G8B8_UNORM ***/
-
-static void
-b8g8r8a8_get_tile_rgba(const unsigned *src,
-                       unsigned w, unsigned h,
-                       float *p,
-                       unsigned dst_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         const unsigned pixel = *src++;
-         pRow[0] = ubyte_to_float((pixel >>  8) & 0xff);
-         pRow[1] = ubyte_to_float((pixel >> 16) & 0xff);
-         pRow[2] = ubyte_to_float((pixel >> 24) & 0xff);
-         pRow[3] = ubyte_to_float((pixel >>  0) & 0xff);
-      }
-      p += dst_stride;
-   }
-}
-
-
-static void
-b8g8r8a8_put_tile_rgba(unsigned *dst,
-                       unsigned w, unsigned h,
-                       const float *p,
-                       unsigned src_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      const float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         unsigned r, g, b, a;
-         r = float_to_ubyte(pRow[0]);
-         g = float_to_ubyte(pRow[1]);
-         b = float_to_ubyte(pRow[2]);
-         a = float_to_ubyte(pRow[3]);
-         *dst++ = (b << 24) | (g << 16) | (r << 8) | a;
-      }
-      p += src_stride;
-   }
-}
-
-
-/*** PIPE_FORMAT_A8B8G8R8_UNORM ***/
-
-static void
-r8g8b8a8_get_tile_rgba(const unsigned *src,
-                       unsigned w, unsigned h,
-                       float *p,
-                       unsigned dst_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         const unsigned pixel = *src++;
-         pRow[0] = ubyte_to_float((pixel >> 24) & 0xff);
-         pRow[1] = ubyte_to_float((pixel >> 16) & 0xff);
-         pRow[2] = ubyte_to_float((pixel >>  8) & 0xff);
-         pRow[3] = ubyte_to_float((pixel >>  0) & 0xff);
-      }
-      p += dst_stride;
-   }
-}
-
-
-static void
-r8g8b8a8_put_tile_rgba(unsigned *dst,
-                       unsigned w, unsigned h,
-                       const float *p,
-                       unsigned src_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      const float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         unsigned r, g, b, a;
-         r = float_to_ubyte(pRow[0]);
-         g = float_to_ubyte(pRow[1]);
-         b = float_to_ubyte(pRow[2]);
-         a = float_to_ubyte(pRow[3]);
-         *dst++ = (r << 24) | (g << 16) | (b << 8) | a;
-      }
-      p += src_stride;
-   }
-}
-
-
-/*** PIPE_FORMAT_B5G5R5A1_UNORM ***/
-
-static void
-a1r5g5b5_get_tile_rgba(const ushort *src,
-                       unsigned w, unsigned h,
-                       float *p,
-                       unsigned dst_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         const ushort pixel = *src++;
-         pRow[0] = ((pixel >> 10) & 0x1f) * (1.0f / 31.0f);
-         pRow[1] = ((pixel >>  5) & 0x1f) * (1.0f / 31.0f);
-         pRow[2] = ((pixel      ) & 0x1f) * (1.0f / 31.0f);
-         pRow[3] = ((pixel >> 15)       ) * 1.0f;
-      }
-      p += dst_stride;
-   }
-}
-
-
-static void
-a1r5g5b5_put_tile_rgba(ushort *dst,
-                       unsigned w, unsigned h,
-                       const float *p,
-                       unsigned src_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      const float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         unsigned r, g, b, a;
-         r = float_to_ubyte(pRow[0]);
-         g = float_to_ubyte(pRow[1]);
-         b = float_to_ubyte(pRow[2]);
-         a = float_to_ubyte(pRow[3]);
-         r = r >> 3;  /* 5 bits */
-         g = g >> 3;  /* 5 bits */
-         b = b >> 3;  /* 5 bits */
-         a = a >> 7;  /* 1 bit */
-         *dst++ = (a << 15) | (r << 10) | (g << 5) | b;
-      }
-      p += src_stride;
-   }
-}
-
-
-/*** PIPE_FORMAT_B4G4R4A4_UNORM ***/
-
-static void
-a4r4g4b4_get_tile_rgba(const ushort *src,
-                       unsigned w, unsigned h,
-                       float *p,
-                       unsigned dst_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         const ushort pixel = *src++;
-         pRow[0] = ((pixel >>  8) & 0xf) * (1.0f / 15.0f);
-         pRow[1] = ((pixel >>  4) & 0xf) * (1.0f / 15.0f);
-         pRow[2] = ((pixel      ) & 0xf) * (1.0f / 15.0f);
-         pRow[3] = ((pixel >> 12)      ) * (1.0f / 15.0f);
-      }
-      p += dst_stride;
-   }
-}
-
-
-static void
-a4r4g4b4_put_tile_rgba(ushort *dst,
-                       unsigned w, unsigned h,
-                       const float *p,
-                       unsigned src_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      const float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         unsigned r, g, b, a;
-         r = float_to_ubyte(pRow[0]);
-         g = float_to_ubyte(pRow[1]);
-         b = float_to_ubyte(pRow[2]);
-         a = float_to_ubyte(pRow[3]);
-         r >>= 4;
-         g >>= 4;
-         b >>= 4;
-         a >>= 4;
-         *dst++ = (a << 12) | (r << 8) | (g << 4) | b;
-      }
-      p += src_stride;
-   }
-}
-
-
-/*** PIPE_FORMAT_B5G6R5_UNORM ***/
-
-static void
-r5g6b5_get_tile_rgba(const ushort *src,
-                     unsigned w, unsigned h,
-                     float *p,
-                     unsigned dst_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         const ushort pixel = *src++;
-         pRow[0] = ((pixel >> 11) & 0x1f) * (1.0f / 31.0f);
-         pRow[1] = ((pixel >>  5) & 0x3f) * (1.0f / 63.0f);
-         pRow[2] = ((pixel      ) & 0x1f) * (1.0f / 31.0f);
-         pRow[3] = 1.0f;
-      }
-      p += dst_stride;
-   }
-}
-
-
-static void
-r5g6b5_put_tile_rgba(ushort *dst,
-                     unsigned w, unsigned h,
-                     const float *p,
-                     unsigned src_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      const float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         uint r = (uint) (CLAMP(pRow[0], 0.0, 1.0) * 31.0);
-         uint g = (uint) (CLAMP(pRow[1], 0.0, 1.0) * 63.0);
-         uint b = (uint) (CLAMP(pRow[2], 0.0, 1.0) * 31.0);
-         *dst++ = (r << 11) | (g << 5) | (b);
-      }
-      p += src_stride;
-   }
-}
-
-
-
-/*** PIPE_FORMAT_R8G8B8_UNORM ***/
-
-static void
-r8g8b8_get_tile_rgba(const ubyte *src,
-                     unsigned w, unsigned h,
-                     float *p,
-                     unsigned dst_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         pRow[0] = ubyte_to_float(src[0]);
-         pRow[1] = ubyte_to_float(src[1]);
-         pRow[2] = ubyte_to_float(src[2]);
-         pRow[3] = 1.0f;
-         src += 3;
-      }
-      p += dst_stride;
-   }
-}
-
-
-static void
-r8g8b8_put_tile_rgba(ubyte *dst,
-                     unsigned w, unsigned h,
-                     const float *p,
-                     unsigned src_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      const float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         dst[0] = float_to_ubyte(pRow[0]);
-         dst[1] = float_to_ubyte(pRow[1]);
-         dst[2] = float_to_ubyte(pRow[2]);
-         dst += 3;
-      }
-      p += src_stride;
-   }
-}
-
-
-
 /*** PIPE_FORMAT_Z16_UNORM ***/
 
 /**
@@ -518,448 +137,6 @@ z16_get_tile_rgba(const ushort *src,
 
 
 
-/*** PIPE_FORMAT_L8_UNORM ***/
-
-static void
-l8_get_tile_rgba(const ubyte *src,
-                 unsigned w, unsigned h,
-                 float *p,
-                 unsigned dst_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      float *pRow = p;
-      for (j = 0; j < w; j++, src++, pRow += 4) {
-         pRow[0] =
-         pRow[1] =
-         pRow[2] = ubyte_to_float(*src);
-         pRow[3] = 1.0;
-      }
-      p += dst_stride;
-   }
-}
-
-
-static void
-l8_put_tile_rgba(ubyte *dst,
-                 unsigned w, unsigned h,
-                 const float *p,
-                 unsigned src_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      const float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         unsigned r;
-         r = float_to_ubyte(pRow[0]);
-         *dst++ = (ubyte) r;
-      }
-      p += src_stride;
-   }
-}
-
-
-
-/*** PIPE_FORMAT_A8_UNORM ***/
-
-static void
-a8_get_tile_rgba(const ubyte *src,
-                 unsigned w, unsigned h,
-                 float *p,
-                 unsigned dst_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      float *pRow = p;
-      for (j = 0; j < w; j++, src++, pRow += 4) {
-         pRow[0] =
-         pRow[1] =
-         pRow[2] = 0.0;
-         pRow[3] = ubyte_to_float(*src);
-      }
-      p += dst_stride;
-   }
-}
-
-
-static void
-a8_put_tile_rgba(ubyte *dst,
-                 unsigned w, unsigned h,
-                 const float *p,
-                 unsigned src_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      const float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         unsigned a;
-         a = float_to_ubyte(pRow[3]);
-         *dst++ = (ubyte) a;
-      }
-      p += src_stride;
-   }
-}
-
-
-
-/*** PIPE_FORMAT_R16_SNORM ***/
-
-static void
-r16_get_tile_rgba(const short *src,
-                  unsigned w, unsigned h,
-                  float *p,
-                  unsigned dst_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      float *pRow = p;
-      for (j = 0; j < w; j++, src++, pRow += 4) {
-         pRow[0] = SHORT_TO_FLOAT(src[0]);
-         pRow[1] =
-         pRow[2] = 0.0;
-         pRow[3] = 1.0;
-      }
-      p += dst_stride;
-   }
-}
-
-
-static void
-r16_put_tile_rgba(short *dst,
-                  unsigned w, unsigned h,
-                  const float *p,
-                  unsigned src_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      const float *pRow = p;
-      for (j = 0; j < w; j++, dst++, pRow += 4) {
-         UNCLAMPED_FLOAT_TO_SHORT(dst[0], pRow[0]);
-      }
-      p += src_stride;
-   }
-}
-
-
-/*** PIPE_FORMAT_R16G16B16A16_SNORM ***/
-
-static void
-r16g16b16a16_get_tile_rgba(const short *src,
-                           unsigned w, unsigned h,
-                           float *p,
-                           unsigned dst_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      float *pRow = p;
-      for (j = 0; j < w; j++, src += 4, pRow += 4) {
-         pRow[0] = SHORT_TO_FLOAT(src[0]);
-         pRow[1] = SHORT_TO_FLOAT(src[1]);
-         pRow[2] = SHORT_TO_FLOAT(src[2]);
-         pRow[3] = SHORT_TO_FLOAT(src[3]);
-      }
-      p += dst_stride;
-   }
-}
-
-
-static void
-r16g16b16a16_put_tile_rgba(short *dst,
-                           unsigned w, unsigned h,
-                           const float *p,
-                           unsigned src_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      const float *pRow = p;
-      for (j = 0; j < w; j++, dst += 4, pRow += 4) {
-         UNCLAMPED_FLOAT_TO_SHORT(dst[0], pRow[0]);
-         UNCLAMPED_FLOAT_TO_SHORT(dst[1], pRow[1]);
-         UNCLAMPED_FLOAT_TO_SHORT(dst[2], pRow[2]);
-         UNCLAMPED_FLOAT_TO_SHORT(dst[3], pRow[3]);
-      }
-      p += src_stride;
-   }
-}
-
-
-/*** PIPE_FORMAT_A8B8G8R8_SRGB ***/
-
-/**
- * Convert an 8-bit sRGB value from non-linear space to a
- * linear RGB value in [0, 1].
- * Implemented with a 256-entry lookup table.
- */
-static INLINE float
-srgb_to_linear(ubyte cs8)
-{
-   static float table[256];
-   static boolean tableReady = FALSE;
-   if (!tableReady) {
-      /* compute lookup table now */
-      uint i;
-      for (i = 0; i < 256; i++) {
-         const float cs = ubyte_to_float(i);
-         if (cs <= 0.04045) {
-            table[i] = cs / 12.92f;
-         }
-         else {
-            table[i] = (float) powf((cs + 0.055) / 1.055, 2.4);
-         }
-      }
-      tableReady = TRUE;
-   }
-   return table[cs8];
-}
-
-
-/**
- * Convert linear float in [0,1] to an srgb ubyte value in [0,255].
- * XXX this hasn't been tested (render to srgb surface).
- * XXX this needs optimization.
- */
-static INLINE ubyte
-linear_to_srgb(float cl)
-{
-   if (cl >= 1.0F)
-      return 255;
-   else if (cl >= 0.0031308F)
-      return float_to_ubyte(1.055F * powf(cl, 0.41666F) - 0.055F);
-   else if (cl > 0.0F)
-      return float_to_ubyte(12.92F * cl);
-   else
-      return 0.0;
-}
-
-
-static void
-a8r8g8b8_srgb_get_tile_rgba(const unsigned *src,
-                            unsigned w, unsigned h,
-                            float *p,
-                            unsigned dst_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         const unsigned pixel = *src++;
-         pRow[0] = srgb_to_linear((pixel >> 16) & 0xff);
-         pRow[1] = srgb_to_linear((pixel >>  8) & 0xff);
-         pRow[2] = srgb_to_linear((pixel >>  0) & 0xff);
-         pRow[3] = ubyte_to_float((pixel >> 24) & 0xff);
-      }
-      p += dst_stride;
-   }
-}
-
-static void
-a8r8g8b8_srgb_put_tile_rgba(unsigned *dst,
-                            unsigned w, unsigned h,
-                            const float *p,
-                            unsigned src_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      const float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         unsigned r, g, b, a;
-         r = linear_to_srgb(pRow[0]);
-         g = linear_to_srgb(pRow[1]);
-         b = linear_to_srgb(pRow[2]);
-         a = float_to_ubyte(pRow[3]);
-         *dst++ = (a << 24) | (r << 16) | (g << 8) | b;
-      }
-      p += src_stride;
-   }
-}
-
-
-/*** PIPE_FORMAT_L8A8_SRGB ***/
-
-static void
-a8l8_srgb_get_tile_rgba(const ushort *src,
-                        unsigned w, unsigned h,
-                        float *p,
-                        unsigned dst_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         ushort p = *src++;
-         pRow[0] =
-         pRow[1] =
-         pRow[2] = srgb_to_linear(p & 0xff);
-         pRow[3] = ubyte_to_float(p >> 8);
-      }
-      p += dst_stride;
-   }
-}
-
-static void
-a8l8_srgb_put_tile_rgba(ushort *dst,
-                        unsigned w, unsigned h,
-                        const float *p,
-                        unsigned src_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      const float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         unsigned r, a;
-         r = linear_to_srgb(pRow[0]);
-         a = float_to_ubyte(pRow[3]);
-         *dst++ = (a << 8) | r;
-      }
-      p += src_stride;
-   }
-}
-
-
-/*** PIPE_FORMAT_L8_SRGB ***/
-
-static void
-l8_srgb_get_tile_rgba(const ubyte *src,
-                      unsigned w, unsigned h,
-                      float *p,
-                      unsigned dst_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      float *pRow = p;
-      for (j = 0; j < w; j++, src++, pRow += 4) {
-         pRow[0] =
-         pRow[1] =
-         pRow[2] = srgb_to_linear(*src);
-         pRow[3] = 1.0;
-      }
-      p += dst_stride;
-   }
-}
-
-static void
-l8_srgb_put_tile_rgba(ubyte *dst,
-                      unsigned w, unsigned h,
-                      const float *p,
-                      unsigned src_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      const float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         unsigned r;
-         r = linear_to_srgb(pRow[0]);
-         *dst++ = (ubyte) r;
-      }
-      p += src_stride;
-   }
-}
-
-
-/*** PIPE_FORMAT_I8_UNORM ***/
-
-static void
-i8_get_tile_rgba(const ubyte *src,
-                 unsigned w, unsigned h,
-                 float *p,
-                 unsigned dst_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      float *pRow = p;
-      for (j = 0; j < w; j++, src++, pRow += 4) {
-         pRow[0] =
-         pRow[1] =
-         pRow[2] =
-         pRow[3] = ubyte_to_float(*src);
-      }
-      p += dst_stride;
-   }
-}
-
-
-static void
-i8_put_tile_rgba(ubyte *dst,
-                 unsigned w, unsigned h,
-                 const float *p,
-                 unsigned src_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      const float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         unsigned r;
-         r = float_to_ubyte(pRow[0]);
-         *dst++ = (ubyte) r;
-      }
-      p += src_stride;
-   }
-}
-
-
-/*** PIPE_FORMAT_L8A8_UNORM ***/
-
-static void
-a8l8_get_tile_rgba(const ushort *src,
-                   unsigned w, unsigned h,
-                   float *p,
-                   unsigned dst_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         ushort p = *src++;
-         pRow[0] =
-         pRow[1] =
-         pRow[2] = ubyte_to_float(p & 0xff);
-         pRow[3] = ubyte_to_float(p >> 8);
-      }
-      p += dst_stride;
-   }
-}
-
-
-static void
-a8l8_put_tile_rgba(ushort *dst,
-                   unsigned w, unsigned h,
-                   const float *p,
-                   unsigned src_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      const float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         unsigned r, a;
-         r = float_to_ubyte(pRow[0]);
-         a = float_to_ubyte(pRow[3]);
-         *dst++ = (a << 8) | r;
-      }
-      p += src_stride;
-   }
-}
-
-
-
-
 /*** PIPE_FORMAT_Z32_UNORM ***/
 
 /**
@@ -987,7 +164,7 @@ z32_get_tile_rgba(const unsigned *src,
 }
 
 
-/*** PIPE_FORMAT_Z24S8_UNORM ***/
+/*** PIPE_FORMAT_Z24_UNORM_S8_USCALED ***/
 
 /**
  * Return Z component as four float in [0,1].  Stencil part ignored.
@@ -1014,7 +191,7 @@ s8z24_get_tile_rgba(const unsigned *src,
 }
 
 
-/*** PIPE_FORMAT_S8Z24_UNORM ***/
+/*** PIPE_FORMAT_S8_USCALED_Z24_UNORM ***/
 
 /**
  * Return Z component as four float in [0,1].  Stencil part ignored.
@@ -1067,94 +244,6 @@ z32f_get_tile_rgba(const float *src,
 }
 
 
-/*** PIPE_FORMAT_UYVY / PIPE_FORMAT_YUYV ***/
-
-/**
- * Convert YCbCr (or YCrCb) to RGBA.
- */
-static void
-ycbcr_get_tile_rgba(const ushort *src,
-                    unsigned w, unsigned h,
-                    float *p,
-                    unsigned dst_stride,
-                    boolean rev)
-{
-   const float scale = 1.0f / 255.0f;
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      float *pRow = p;
-      /* do two texels at a time */
-      for (j = 0; j < (w & ~1); j += 2, src += 2) {
-         const ushort t0 = src[0];
-         const ushort t1 = src[1];
-         const ubyte y0 = (t0 >> 8) & 0xff;  /* luminance */
-         const ubyte y1 = (t1 >> 8) & 0xff;  /* luminance */
-         ubyte cb, cr;
-         float r, g, b;
-
-         if (rev) {
-            cb = t1 & 0xff;         /* chroma U */
-            cr = t0 & 0xff;         /* chroma V */
-         }
-         else {
-            cb = t0 & 0xff;         /* chroma U */
-            cr = t1 & 0xff;         /* chroma V */
-         }
-
-         /* even pixel: y0,cr,cb */
-         r = 1.164f * (y0-16) + 1.596f * (cr-128);
-         g = 1.164f * (y0-16) - 0.813f * (cr-128) - 0.391f * (cb-128);
-         b = 1.164f * (y0-16) + 2.018f * (cb-128);
-         pRow[0] = r * scale;
-         pRow[1] = g * scale;
-         pRow[2] = b * scale;
-         pRow[3] = 1.0f;
-         pRow += 4;
-
-         /* odd pixel: use y1,cr,cb */
-         r = 1.164f * (y1-16) + 1.596f * (cr-128);
-         g = 1.164f * (y1-16) - 0.813f * (cr-128) - 0.391f * (cb-128);
-         b = 1.164f * (y1-16) + 2.018f * (cb-128);
-         pRow[0] = r * scale;
-         pRow[1] = g * scale;
-         pRow[2] = b * scale;
-         pRow[3] = 1.0f;
-         pRow += 4;
-
-      }
-      /* do the last texel */
-      if (w & 1) {
-         const ushort t0 = src[0];
-         const ushort t1 = src[1];
-         const ubyte y0 = (t0 >> 8) & 0xff;  /* luminance */
-         ubyte cb, cr;
-         float r, g, b;
-
-         if (rev) {
-            cb = t1 & 0xff;         /* chroma U */
-            cr = t0 & 0xff;         /* chroma V */
-         }
-         else {
-            cb = t0 & 0xff;         /* chroma U */
-            cr = t1 & 0xff;         /* chroma V */
-         }
-
-         /* even pixel: y0,cr,cb */
-         r = 1.164f * (y0-16) + 1.596f * (cr-128);
-         g = 1.164f * (y0-16) - 0.813f * (cr-128) - 0.391f * (cb-128);
-         b = 1.164f * (y0-16) + 2.018f * (cb-128);
-         pRow[0] = r * scale;
-         pRow[1] = g * scale;
-         pRow[2] = b * scale;
-         pRow[3] = 1.0f;
-         pRow += 4;
-      }
-      p += dst_stride;
-   }
-}
-
-
 void
 pipe_tile_raw_to_rgba(enum pipe_format format,
                       void *src,
@@ -1162,80 +251,23 @@ pipe_tile_raw_to_rgba(enum pipe_format format,
                       float *dst, unsigned dst_stride)
 {
    switch (format) {
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
-      a8r8g8b8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
-      break;
-   case PIPE_FORMAT_B8G8R8X8_UNORM:
-      x8r8g8b8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
-      break;
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
-      b8g8r8a8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
-      break;
-   case PIPE_FORMAT_A8B8G8R8_UNORM:
-      r8g8b8a8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
-      break;
-   case PIPE_FORMAT_B5G5R5A1_UNORM:
-      a1r5g5b5_get_tile_rgba((ushort *) src, w, h, dst, dst_stride);
-      break;
-   case PIPE_FORMAT_B4G4R4A4_UNORM:
-      a4r4g4b4_get_tile_rgba((ushort *) src, w, h, dst, dst_stride);
-      break;
-   case PIPE_FORMAT_B5G6R5_UNORM:
-      r5g6b5_get_tile_rgba((ushort *) src, w, h, dst, dst_stride);
-      break;
-   case PIPE_FORMAT_R8G8B8_UNORM:
-      r8g8b8_get_tile_rgba((ubyte *) src, w, h, dst, dst_stride);
-      break;
-   case PIPE_FORMAT_L8_UNORM:
-      l8_get_tile_rgba((ubyte *) src, w, h, dst, dst_stride);
-      break;
-   case PIPE_FORMAT_A8_UNORM:
-      a8_get_tile_rgba((ubyte *) src, w, h, dst, dst_stride);
-      break;
-   case PIPE_FORMAT_I8_UNORM:
-      i8_get_tile_rgba((ubyte *) src, w, h, dst, dst_stride);
-      break;
-   case PIPE_FORMAT_L8A8_UNORM:
-      a8l8_get_tile_rgba((ushort *) src, w, h, dst, dst_stride);
-      break;
-   case PIPE_FORMAT_R16_SNORM:
-      r16_get_tile_rgba((short *) src, w, h, dst, dst_stride);
-      break;
-   case PIPE_FORMAT_R16G16B16A16_SNORM:
-      r16g16b16a16_get_tile_rgba((short *) src, w, h, dst, dst_stride);
-      break;
-   case PIPE_FORMAT_B8G8R8A8_SRGB:
-      a8r8g8b8_srgb_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
-      break;
-   case PIPE_FORMAT_L8A8_SRGB:
-      a8l8_srgb_get_tile_rgba((ushort *) src, w, h, dst, dst_stride);
-      break;
-   case PIPE_FORMAT_L8_SRGB:
-      l8_srgb_get_tile_rgba((ubyte *) src, w, h, dst, dst_stride);
-      break;
    case PIPE_FORMAT_Z16_UNORM:
       z16_get_tile_rgba((ushort *) src, w, h, dst, dst_stride);
       break;
    case PIPE_FORMAT_Z32_UNORM:
       z32_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
       break;
-   case PIPE_FORMAT_Z24S8_UNORM:
+   case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
    case PIPE_FORMAT_Z24X8_UNORM:
       s8z24_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
       break;
-   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
    case PIPE_FORMAT_X8Z24_UNORM:
       z24s8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
       break;
    case PIPE_FORMAT_Z32_FLOAT:
       z32f_get_tile_rgba((float *) src, w, h, dst, dst_stride);
       break;
-   case PIPE_FORMAT_UYVY:
-      ycbcr_get_tile_rgba((ushort *) src, w, h, dst, dst_stride, FALSE);
-      break;
-   case PIPE_FORMAT_YUYV:
-      ycbcr_get_tile_rgba((ushort *) src, w, h, dst, dst_stride, TRUE);
-      break;
    default:
       util_format_read_4f(format,
                           dst, dst_stride * sizeof(float),
@@ -1246,15 +278,16 @@ pipe_tile_raw_to_rgba(enum pipe_format format,
 
 
 void
-pipe_get_tile_rgba(struct pipe_transfer *pt,
+pipe_get_tile_rgba(struct pipe_context *pipe,
+                   struct pipe_transfer *pt,
                    uint x, uint y, uint w, uint h,
                    float *p)
 {
    unsigned dst_stride = w * 4;
    void *packed;
-   enum pipe_format format = pt->texture->format;
+   enum pipe_format format = pt->resource->format;
 
-   if (pipe_clip_tile(x, y, &w, &h, pt))
+   if (u_clip_tile(x, y, &w, &h, &pt->box))
       return;
 
    packed = MALLOC(util_format_get_nblocks(format, w, h) * util_format_get_blocksize(format));
@@ -1265,24 +298,94 @@ pipe_get_tile_rgba(struct pipe_transfer *pt,
    if(format == PIPE_FORMAT_UYVY || format == PIPE_FORMAT_YUYV)
       assert((x & 1) == 0);
 
-   pipe_get_tile_raw(pt, x, y, w, h, packed, 0);
+   pipe_get_tile_raw(pipe, pt, x, y, w, h, packed, 0);
+
+   pipe_tile_raw_to_rgba(format, packed, w, h, p, dst_stride);
+
+   FREE(packed);
+}
+
+
+void
+pipe_get_tile_swizzle(struct pipe_context *pipe,
+		      struct pipe_transfer *pt,
+                      uint x,
+                      uint y,
+                      uint w,
+                      uint h,
+                      uint swizzle_r,
+                      uint swizzle_g,
+                      uint swizzle_b,
+                      uint swizzle_a,
+                      enum pipe_format format,
+                      float *p)
+{
+   unsigned dst_stride = w * 4;
+   void *packed;
+   uint iy;
+   float rgba01[6];
+
+   if (u_clip_tile(x, y, &w, &h, &pt->box)) {
+      return;
+   }
+
+   packed = MALLOC(util_format_get_nblocks(format, w, h) * util_format_get_blocksize(format));
+   if (!packed) {
+      return;
+   }
+
+   if (format == PIPE_FORMAT_UYVY || format == PIPE_FORMAT_YUYV) {
+      assert((x & 1) == 0);
+   }
+
+   pipe_get_tile_raw(pipe, pt, x, y, w, h, packed, 0);
 
    pipe_tile_raw_to_rgba(format, packed, w, h, p, dst_stride);
 
    FREE(packed);
+
+   if (swizzle_r == PIPE_SWIZZLE_RED &&
+       swizzle_g == PIPE_SWIZZLE_GREEN &&
+       swizzle_b == PIPE_SWIZZLE_BLUE &&
+       swizzle_a == PIPE_SWIZZLE_ALPHA) {
+      /* no-op, skip */
+      return;
+   }
+
+   rgba01[PIPE_SWIZZLE_ZERO] = 0.0f;
+   rgba01[PIPE_SWIZZLE_ONE] = 1.0f;
+
+   for (iy = 0; iy < h; iy++) {
+      float *row = p;
+      uint ix;
+
+      for (ix = 0; ix < w; ix++) {
+         rgba01[PIPE_SWIZZLE_RED] = row[0];
+         rgba01[PIPE_SWIZZLE_GREEN] = row[1];
+         rgba01[PIPE_SWIZZLE_BLUE] = row[2];
+         rgba01[PIPE_SWIZZLE_ALPHA] = row[3];
+
+         *row++ = rgba01[swizzle_r];
+         *row++ = rgba01[swizzle_g];
+         *row++ = rgba01[swizzle_b];
+         *row++ = rgba01[swizzle_a];
+      }
+      p += dst_stride;
+   }
 }
 
 
 void
-pipe_put_tile_rgba(struct pipe_transfer *pt,
+pipe_put_tile_rgba(struct pipe_context *pipe,
+                   struct pipe_transfer *pt,
                    uint x, uint y, uint w, uint h,
                    const float *p)
 {
    unsigned src_stride = w * 4;
    void *packed;
-   enum pipe_format format = pt->texture->format;
+   enum pipe_format format = pt->resource->format;
 
-   if (pipe_clip_tile(x, y, &w, &h, pt))
+   if (u_clip_tile(x, y, &w, &h, &pt->box))
       return;
 
    packed = MALLOC(util_format_get_nblocks(format, w, h) * util_format_get_blocksize(format));
@@ -1291,68 +394,17 @@ pipe_put_tile_rgba(struct pipe_transfer *pt,
       return;
 
    switch (format) {
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
-      a8r8g8b8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);
-      break;
-   case PIPE_FORMAT_B8G8R8X8_UNORM:
-      x8r8g8b8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);
-      break;
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
-      b8g8r8a8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);
-      break;
-   case PIPE_FORMAT_A8B8G8R8_UNORM:
-      r8g8b8a8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);
-      break;
-   case PIPE_FORMAT_B5G5R5A1_UNORM:
-      a1r5g5b5_put_tile_rgba((ushort *) packed, w, h, p, src_stride);
-      break;
-   case PIPE_FORMAT_B5G6R5_UNORM:
-      r5g6b5_put_tile_rgba((ushort *) packed, w, h, p, src_stride);
-      break;
-   case PIPE_FORMAT_R8G8B8_UNORM:
-      r8g8b8_put_tile_rgba((ubyte *) packed, w, h, p, src_stride);
-      break;
-   case PIPE_FORMAT_B4G4R4A4_UNORM:
-      a4r4g4b4_put_tile_rgba((ushort *) packed, w, h, p, src_stride);
-      break;
-   case PIPE_FORMAT_L8_UNORM:
-      l8_put_tile_rgba((ubyte *) packed, w, h, p, src_stride);
-      break;
-   case PIPE_FORMAT_A8_UNORM:
-      a8_put_tile_rgba((ubyte *) packed, w, h, p, src_stride);
-      break;
-   case PIPE_FORMAT_I8_UNORM:
-      i8_put_tile_rgba((ubyte *) packed, w, h, p, src_stride);
-      break;
-   case PIPE_FORMAT_L8A8_UNORM:
-      a8l8_put_tile_rgba((ushort *) packed, w, h, p, src_stride);
-      break;
-   case PIPE_FORMAT_R16_SNORM:
-      r16_put_tile_rgba((short *) packed, w, h, p, src_stride);
-      break;
-   case PIPE_FORMAT_R16G16B16A16_SNORM:
-      r16g16b16a16_put_tile_rgba((short *) packed, w, h, p, src_stride);
-      break;
-   case PIPE_FORMAT_B8G8R8A8_SRGB:
-      a8r8g8b8_srgb_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);
-      break;
-   case PIPE_FORMAT_L8A8_SRGB:
-      a8l8_srgb_put_tile_rgba((ushort *) packed, w, h, p, src_stride);
-      break;
-   case PIPE_FORMAT_L8_SRGB:
-      l8_srgb_put_tile_rgba((ubyte *) packed, w, h, p, src_stride);
-      break;
    case PIPE_FORMAT_Z16_UNORM:
       /*z16_put_tile_rgba((ushort *) packed, w, h, p, src_stride);*/
       break;
    case PIPE_FORMAT_Z32_UNORM:
       /*z32_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);*/
       break;
-   case PIPE_FORMAT_Z24S8_UNORM:
+   case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
    case PIPE_FORMAT_Z24X8_UNORM:
       /*s8z24_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);*/
       break;
-   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
    case PIPE_FORMAT_X8Z24_UNORM:
       /*z24s8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);*/
       break;
@@ -1363,7 +415,7 @@ pipe_put_tile_rgba(struct pipe_transfer *pt,
                            0, 0, w, h);
    }
 
-   pipe_put_tile_raw(pt, x, y, w, h, packed, 0);
+   pipe_put_tile_raw(pipe, pt, x, y, w, h, packed, 0);
 
    FREE(packed);
 }
@@ -1373,21 +425,21 @@ pipe_put_tile_rgba(struct pipe_transfer *pt,
  * Get a block of Z values, converted to 32-bit range.
  */
 void
-pipe_get_tile_z(struct pipe_transfer *pt,
+pipe_get_tile_z(struct pipe_context *pipe,
+                struct pipe_transfer *pt,
                 uint x, uint y, uint w, uint h,
                 uint *z)
 {
-   struct pipe_screen *screen = pt->texture->screen;
    const uint dstStride = w;
    ubyte *map;
    uint *pDest = z;
    uint i, j;
-   enum pipe_format format = pt->texture->format;
+   enum pipe_format format = pt->resource->format;
 
-   if (pipe_clip_tile(x, y, &w, &h, pt))
+   if (u_clip_tile(x, y, &w, &h, &pt->box))
       return;
 
-   map = (ubyte *)screen->transfer_map(screen, pt);
+   map = (ubyte *)pipe->transfer_map(pipe, pt);
    if (!map) {
       assert(0);
       return;
@@ -1405,7 +457,7 @@ pipe_get_tile_z(struct pipe_transfer *pt,
          }
       }
       break;
-   case PIPE_FORMAT_Z24S8_UNORM:
+   case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
    case PIPE_FORMAT_Z24X8_UNORM:
       {
          const uint *ptrc
@@ -1420,7 +472,7 @@ pipe_get_tile_z(struct pipe_transfer *pt,
          }
       }
       break;
-   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
    case PIPE_FORMAT_X8Z24_UNORM:
       {
          const uint *ptrc
@@ -1453,26 +505,26 @@ pipe_get_tile_z(struct pipe_transfer *pt,
       assert(0);
    }
 
-   screen->transfer_unmap(screen, pt);
+   pipe->transfer_unmap(pipe, pt);
 }
 
 
 void
-pipe_put_tile_z(struct pipe_transfer *pt,
+pipe_put_tile_z(struct pipe_context *pipe,
+                struct pipe_transfer *pt,
                 uint x, uint y, uint w, uint h,
                 const uint *zSrc)
 {
-   struct pipe_screen *screen = pt->texture->screen;
    const uint srcStride = w;
    const uint *ptrc = zSrc;
    ubyte *map;
    uint i, j;
-   enum pipe_format format = pt->texture->format;
+   enum pipe_format format = pt->resource->format;
 
-   if (pipe_clip_tile(x, y, &w, &h, pt))
+   if (u_clip_tile(x, y, &w, &h, &pt->box))
       return;
 
-   map = (ubyte *)screen->transfer_map(screen, pt);
+   map = (ubyte *)pipe->transfer_map(pipe, pt);
    if (!map) {
       assert(0);
       return;
@@ -1489,10 +541,10 @@ pipe_put_tile_z(struct pipe_transfer *pt,
          }
       }
       break;
-   case PIPE_FORMAT_Z24S8_UNORM:
+   case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
       {
          uint *pDest = (uint *) (map + y * pt->stride + x*4);
-         assert((pt->usage & PIPE_TRANSFER_READ_WRITE) == PIPE_TRANSFER_READ_WRITE);
+         //assert((pt->usage & PIPE_TRANSFER_READ_WRITE) == PIPE_TRANSFER_READ_WRITE);
          for (i = 0; i < h; i++) {
             for (j = 0; j < w; j++) {
                /* convert 32-bit Z to 24-bit Z, preserve stencil */
@@ -1516,10 +568,10 @@ pipe_put_tile_z(struct pipe_transfer *pt,
          }
       }
       break;
-   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
       {
          uint *pDest = (uint *) (map + y * pt->stride + x*4);
-         assert((pt->usage & PIPE_TRANSFER_READ_WRITE) == PIPE_TRANSFER_READ_WRITE);
+         //assert((pt->usage & PIPE_TRANSFER_READ_WRITE) == PIPE_TRANSFER_READ_WRITE);
          for (i = 0; i < h; i++) {
             for (j = 0; j < w; j++) {
                /* convert 32-bit Z to 24-bit Z, preserve stencil */
@@ -1560,7 +612,7 @@ pipe_put_tile_z(struct pipe_transfer *pt,
       assert(0);
    }
 
-   screen->transfer_unmap(screen, pt);
+   pipe->transfer_unmap(pipe, pt);
 }
 
 
diff --git a/src/gallium/auxiliary/util/u_tile.h b/src/gallium/auxiliary/util/u_tile.h
index 1453af38b8..986eee0743 100644
--- a/src/gallium/auxiliary/util/u_tile.h
+++ b/src/gallium/auxiliary/util/u_tile.h
@@ -32,22 +32,24 @@
 
 struct pipe_transfer;
 
-
 /**
  * Clip tile against transfer dims.
+ *
+ * XXX: this only clips width and height!
+ *
  * \return TRUE if tile is totally clipped, FALSE otherwise
  */
 static INLINE boolean
-pipe_clip_tile(uint x, uint y, uint *w, uint *h, const struct pipe_transfer *pt)
+u_clip_tile(uint x, uint y, uint *w, uint *h, const struct pipe_box *box)
 {
-   if (x >= pt->width)
+   if (x >= box->width)
       return TRUE;
-   if (y >= pt->height)
+   if (y >= box->height)
       return TRUE;
-   if (x + *w > pt->width)
-      *w = pt->width - x;
-   if (y + *h > pt->height)
-      *h = pt->height - y;
+   if (x + *w > box->width)
+      *w = box->width - x;
+   if (y + *h > box->height)
+      *h = box->height - y;
    return FALSE;
 }
 
@@ -56,34 +58,54 @@ extern "C" {
 #endif
 
 void
-pipe_get_tile_raw(struct pipe_transfer *pt,
+pipe_get_tile_raw(struct pipe_context *pipe,
+                  struct pipe_transfer *pt,
                   uint x, uint y, uint w, uint h,
                   void *p, int dst_stride);
 
 void
-pipe_put_tile_raw(struct pipe_transfer *pt,
+pipe_put_tile_raw(struct pipe_context *pipe,
+                  struct pipe_transfer *pt,
                   uint x, uint y, uint w, uint h,
                   const void *p, int src_stride);
 
 
 void
-pipe_get_tile_rgba(struct pipe_transfer *pt,
+pipe_get_tile_rgba(struct pipe_context *pipe,
+                   struct pipe_transfer *pt,
                    uint x, uint y, uint w, uint h,
                    float *p);
 
 void
-pipe_put_tile_rgba(struct pipe_transfer *pt,
+pipe_get_tile_swizzle(struct pipe_context *pipe,
+		      struct pipe_transfer *pt,
+                      uint x,
+                      uint y,
+                      uint w,
+                      uint h,
+                      uint swizzle_r,
+                      uint swizzle_g,
+                      uint swizzle_b,
+                      uint swizzle_a,
+                      enum pipe_format format,
+                      float *p);
+
+void
+pipe_put_tile_rgba(struct pipe_context *pipe,
+                   struct pipe_transfer *pt,
                    uint x, uint y, uint w, uint h,
                    const float *p);
 
 
 void
-pipe_get_tile_z(struct pipe_transfer *pt,
+pipe_get_tile_z(struct pipe_context *pipe,
+                struct pipe_transfer *pt,
                 uint x, uint y, uint w, uint h,
                 uint *z);
 
 void
-pipe_put_tile_z(struct pipe_transfer *pt,
+pipe_put_tile_z(struct pipe_context *pipe,
+                struct pipe_transfer *pt,
                 uint x, uint y, uint w, uint h,
                 const uint *z);
 
diff --git a/src/gallium/auxiliary/util/u_timed_winsys.c b/src/gallium/auxiliary/util/u_timed_winsys.c
deleted file mode 100644
index d88298bc14..0000000000
--- a/src/gallium/auxiliary/util/u_timed_winsys.c
+++ /dev/null
@@ -1,312 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2006 Tungsten Graphics, Inc., Bismarck, ND., USA
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * 
- **************************************************************************/
-/*
- * Authors: Keith Whitwell <keithw-at-tungstengraphics-dot-com>
- */
-
-#include "pipe/p_state.h"
-#include "util/u_simple_screen.h"
-#include "u_timed_winsys.h"
-#include "util/u_memory.h"
-#include "os/os_time.h"
-
-
-struct timed_winsys {
-   struct pipe_winsys base;
-   struct pipe_winsys *backend;
-   uint64_t last_dump;
-   struct {
-      const char *name_key;
-      double total;
-      unsigned calls;
-   } funcs[13];
-};
-
-
-static struct timed_winsys *timed_winsys( struct pipe_winsys *winsys )
-{
-   return (struct timed_winsys *)winsys;
-}
-
-
-static void time_display( struct pipe_winsys *winsys )
-{
-   struct timed_winsys *tws = timed_winsys(winsys);
-   unsigned i;
-   double overall = 0;
-
-   for (i = 0; i < Elements(tws->funcs); i++) {
-      if (tws->funcs[i].name_key) {
-         debug_printf("*** %-25s %5.3fms (%d calls, avg %.3fms)\n", 
-                      tws->funcs[i].name_key,
-                      tws->funcs[i].total,
-                      tws->funcs[i].calls,
-                      tws->funcs[i].total / tws->funcs[i].calls);
-         overall += tws->funcs[i].total;
-         tws->funcs[i].calls = 0;
-         tws->funcs[i].total = 0;
-      }
-   }
-
-   debug_printf("*** %-25s %5.3fms\n", 
-                "OVERALL WINSYS",
-                overall);
-}
-
-static void time_finish( struct pipe_winsys *winsys,
-                         long long startval, 
-                         unsigned idx,
-                         const char *name ) 
-{
-   struct timed_winsys *tws = timed_winsys(winsys);
-   int64_t endval = os_time_get();
-   double elapsed = (endval - startval)/1000.0;
-
-   if (endval - startval > 1000LL) 
-      debug_printf("*** %s %.3f\n", name, elapsed );
-
-   assert( tws->funcs[idx].name_key == name ||
-           tws->funcs[idx].name_key == NULL);
-
-   tws->funcs[idx].name_key = name;
-   tws->funcs[idx].total += elapsed;
-   tws->funcs[idx].calls++;
-
-   if (endval - tws->last_dump > 10LL * 1000LL * 1000LL) {
-      time_display( winsys );
-      tws->last_dump = endval;
-   }
-}
-
-
-/* Pipe has no concept of pools, but the psb driver passes a flag that
- * can be mapped onto pools in the backend.
- */
-static struct pipe_buffer *
-timed_buffer_create(struct pipe_winsys *winsys, 
-                    unsigned alignment, 
-                    unsigned usage, 
-                    unsigned size )
-{
-   struct pipe_winsys *backend = timed_winsys(winsys)->backend;
-   int64_t start = os_time_get();
-
-   struct pipe_buffer *buf =
-      backend->buffer_create( backend, alignment, usage, size );
-
-   time_finish(winsys, start, 0, __FUNCTION__);
-   
-   return buf;
-}
-
-
-
-
-static struct pipe_buffer *
-timed_user_buffer_create(struct pipe_winsys *winsys,
-                             void *data, 
-                             unsigned bytes) 
-{
-   struct pipe_winsys *backend = timed_winsys(winsys)->backend;
-   int64_t start = os_time_get();
-
-   struct pipe_buffer *buf = backend->user_buffer_create( backend, data, bytes );
-
-   time_finish(winsys, start, 1, __FUNCTION__);
-   
-   return buf;
-}
-
-
-static void *
-timed_buffer_map(struct pipe_winsys *winsys,
-                     struct pipe_buffer *buf,
-                     unsigned flags)
-{
-   struct pipe_winsys *backend = timed_winsys(winsys)->backend;
-   int64_t start = os_time_get();
-
-   void *map = backend->buffer_map( backend, buf, flags );
-
-   time_finish(winsys, start, 2, __FUNCTION__);
-   
-   return map;
-}
-
-
-static void
-timed_buffer_unmap(struct pipe_winsys *winsys,
-                       struct pipe_buffer *buf)
-{
-   struct pipe_winsys *backend = timed_winsys(winsys)->backend;
-   int64_t start = os_time_get();
-
-   backend->buffer_unmap( backend, buf );
-
-   time_finish(winsys, start, 3, __FUNCTION__);
-}
-
-
-static void
-timed_buffer_destroy(struct pipe_buffer *buf)
-{
-   struct pipe_winsys *winsys = buf->screen->winsys;
-   struct pipe_winsys *backend = timed_winsys(winsys)->backend;
-   int64_t start = os_time_get();
-
-   backend->buffer_destroy( buf );
-
-   time_finish(winsys, start, 4, __FUNCTION__);
-}
-
-
-static void
-timed_flush_frontbuffer( struct pipe_winsys *winsys,
-                         struct pipe_surface *surf,
-                         void *context_private)
-{
-   struct pipe_winsys *backend = timed_winsys(winsys)->backend;
-   int64_t start = os_time_get();
-
-   backend->flush_frontbuffer( backend, surf, context_private );
-
-   time_finish(winsys, start, 5, __FUNCTION__);
-}
-
-
-
-
-static struct pipe_buffer *
-timed_surface_buffer_create(struct pipe_winsys *winsys,
-                              unsigned width, unsigned height,
-                              enum pipe_format format, 
-                              unsigned usage,
-                              unsigned tex_usage,
-                              unsigned *stride)
-{
-   struct pipe_winsys *backend = timed_winsys(winsys)->backend;
-   int64_t start = os_time_get();
-
-   struct pipe_buffer *ret = backend->surface_buffer_create( backend, width, height, 
-                                                             format, usage, tex_usage, stride );
-
-   time_finish(winsys, start, 7, __FUNCTION__);
-   
-   return ret;
-}
-
-
-static const char *
-timed_get_name( struct pipe_winsys *winsys )
-{
-   struct pipe_winsys *backend = timed_winsys(winsys)->backend;
-   int64_t start = os_time_get();
-
-   const char *ret = backend->get_name( backend );
-
-   time_finish(winsys, start, 9, __FUNCTION__);
-   
-   return ret;
-}
-
-static void
-timed_fence_reference(struct pipe_winsys *winsys,
-                    struct pipe_fence_handle **ptr,
-                    struct pipe_fence_handle *fence)
-{
-   struct pipe_winsys *backend = timed_winsys(winsys)->backend;
-   int64_t start = os_time_get();
-
-   backend->fence_reference( backend, ptr, fence );
-
-   time_finish(winsys, start, 10, __FUNCTION__);
-}
-
-
-static int
-timed_fence_signalled( struct pipe_winsys *winsys,
-                       struct pipe_fence_handle *fence,
-                       unsigned flag )
-{
-   struct pipe_winsys *backend = timed_winsys(winsys)->backend;
-   int64_t start = os_time_get();
-
-   int ret = backend->fence_signalled( backend, fence, flag );
-
-   time_finish(winsys, start, 11, __FUNCTION__);
-   
-   return ret;
-}
-
-static int
-timed_fence_finish( struct pipe_winsys *winsys,
-                     struct pipe_fence_handle *fence,
-                     unsigned flag )
-{
-   struct pipe_winsys *backend = timed_winsys(winsys)->backend;
-   int64_t start = os_time_get();
-
-   int ret = backend->fence_finish( backend, fence, flag );
-
-   time_finish(winsys, start, 12, __FUNCTION__);
-   
-   return ret;
-}
-
-static void
-timed_winsys_destroy( struct pipe_winsys *winsys )
-{
-   struct pipe_winsys *backend = timed_winsys(winsys)->backend;
-   backend->destroy( backend );
-   FREE(winsys);
-}
-
-
-
-struct pipe_winsys *u_timed_winsys_create( struct pipe_winsys *backend )
-{
-   struct timed_winsys *ws = CALLOC_STRUCT(timed_winsys);
-   
-   ws->base.user_buffer_create = timed_user_buffer_create;
-   ws->base.buffer_map = timed_buffer_map;
-   ws->base.buffer_unmap = timed_buffer_unmap;
-   ws->base.buffer_destroy = timed_buffer_destroy;
-   ws->base.buffer_create = timed_buffer_create;
-   ws->base.surface_buffer_create = timed_surface_buffer_create;
-   ws->base.flush_frontbuffer = timed_flush_frontbuffer;
-   ws->base.get_name = timed_get_name;
-   ws->base.fence_reference = timed_fence_reference;
-   ws->base.fence_signalled = timed_fence_signalled;
-   ws->base.fence_finish = timed_fence_finish;
-   ws->base.destroy = timed_winsys_destroy;
-   
-   ws->backend = backend;
-
-   return &ws->base;
-}
-
diff --git a/src/gallium/auxiliary/util/u_transfer.c b/src/gallium/auxiliary/util/u_transfer.c
new file mode 100644
index 0000000000..bedace3b1d
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_transfer.c
@@ -0,0 +1,110 @@
+#include "pipe/p_context.h"
+#include "util/u_rect.h"
+#include "util/u_inlines.h"
+#include "util/u_transfer.h"
+#include "util/u_memory.h"
+
+/* One-shot transfer operation with data supplied in a user
+ * pointer.  XXX: strides??
+ */
+void u_default_transfer_inline_write( struct pipe_context *pipe,
+				      struct pipe_resource *resource,
+				      struct pipe_subresource sr,
+				      unsigned usage,
+				      const struct pipe_box *box,
+				      const void *data,
+				      unsigned stride,
+				      unsigned slice_stride)
+{
+   struct pipe_transfer *transfer = NULL;
+   uint8_t *map = NULL;
+
+   transfer = pipe->get_transfer(pipe, 
+				 resource,
+				 sr,
+				 usage,
+				 box );
+   if (transfer == NULL)
+      goto out;
+
+   map = pipe_transfer_map(pipe, transfer);
+   if (map == NULL)
+      goto out;
+
+   assert(box->depth == 1);	/* XXX: fix me */
+   
+   util_copy_rect(map,
+		  resource->format,
+		  transfer->stride, /* bytes? */
+		  0, 0,
+		  box->width,
+		  box->height,
+		  data,
+		  box->width,	/* bytes? texels? */
+		  0, 0);
+
+out:
+   if (map)
+      pipe_transfer_unmap(pipe, transfer);
+
+   if (transfer)
+      pipe_transfer_destroy(pipe, transfer);
+}
+
+
+boolean u_default_resource_get_handle(struct pipe_screen *screen,
+				      struct pipe_resource *resource,
+				      struct winsys_handle *handle)
+{
+   return FALSE;
+}
+
+
+
+void u_default_transfer_flush_region( struct pipe_context *pipe,
+				      struct pipe_transfer *transfer,
+				      const struct pipe_box *box)
+{
+   /* This is a no-op implementation, nothing to do.
+    */
+}
+
+unsigned u_default_is_resource_referenced( struct pipe_context *pipe,
+					   struct pipe_resource *resource,
+					unsigned face, unsigned level)
+{
+   return 0;
+}
+
+struct pipe_transfer * u_default_get_transfer(struct pipe_context *context,
+					      struct pipe_resource *resource,
+					      struct pipe_subresource sr,
+					      unsigned usage,
+					      const struct pipe_box *box)
+{
+   struct pipe_transfer *transfer = CALLOC_STRUCT(pipe_transfer);
+   if (transfer == NULL)
+      return NULL;
+
+   transfer->resource = resource;
+   transfer->sr = sr;
+   transfer->usage = usage;
+   transfer->box = *box;
+
+   /* Note strides are zero, this is ok for buffers, but not for
+    * textures 2d & higher at least. 
+    */
+   return transfer;
+}
+
+void u_default_transfer_unmap( struct pipe_context *pipe,
+			      struct pipe_transfer *transfer )
+{
+}
+
+void u_default_transfer_destroy(struct pipe_context *pipe,
+				struct pipe_transfer *transfer)
+{
+   FREE(transfer);
+}
+
diff --git a/src/gallium/auxiliary/util/u_transfer.h b/src/gallium/auxiliary/util/u_transfer.h
new file mode 100644
index 0000000000..eb07945d15
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_transfer.h
@@ -0,0 +1,145 @@
+
+#ifndef U_TRANSFER_H
+#define U_TRANSFER_H
+
+/* Fallback implementations for inline read/writes which just go back
+ * to the regular transfer behaviour.
+ */
+#include "pipe/p_state.h"
+
+struct pipe_context;
+
+boolean u_default_resource_get_handle(struct pipe_screen *screen,
+				      struct pipe_resource *resource,
+				      struct winsys_handle *handle);
+
+void u_default_transfer_inline_write( struct pipe_context *pipe,
+			      struct pipe_resource *resource,
+			      struct pipe_subresource sr,
+			      unsigned usage,
+			      const struct pipe_box *box,
+			      const void *data,
+			      unsigned stride,
+			      unsigned slice_stride);
+
+void u_default_transfer_flush_region( struct pipe_context *pipe,
+				      struct pipe_transfer *transfer,
+				      const struct pipe_box *box);
+
+unsigned u_default_is_resource_referenced( struct pipe_context *pipe,
+					   struct pipe_resource *resource,
+					   unsigned face, unsigned level);
+
+struct pipe_transfer * u_default_get_transfer(struct pipe_context *context,
+					      struct pipe_resource *resource,
+					      struct pipe_subresource sr,
+					      unsigned usage,
+					      const struct pipe_box *box);
+
+void u_default_transfer_unmap( struct pipe_context *pipe,
+			       struct pipe_transfer *transfer );
+
+void u_default_transfer_destroy(struct pipe_context *pipe,
+				struct pipe_transfer *transfer);
+
+
+
+/* Useful helper to allow >1 implementation of resource functionality
+ * to exist in a single driver.  This is intended to be transitionary!
+ */
+struct u_resource_vtbl {
+
+   boolean (*resource_get_handle)(struct pipe_screen *,
+				  struct pipe_resource *tex,
+				  struct winsys_handle *handle);
+
+   void (*resource_destroy)(struct pipe_screen *,
+			    struct pipe_resource *pt);
+
+   unsigned (*is_resource_referenced)(struct pipe_context *pipe,
+				      struct pipe_resource *texture,
+				      unsigned face, unsigned level);
+
+   struct pipe_transfer *(*get_transfer)(struct pipe_context *,
+					 struct pipe_resource *resource,
+					 struct pipe_subresource,
+					 unsigned usage,
+					 const struct pipe_box *);
+
+   void (*transfer_destroy)(struct pipe_context *,
+			    struct pipe_transfer *);
+
+   void *(*transfer_map)( struct pipe_context *,
+			  struct pipe_transfer *transfer );
+
+   void (*transfer_flush_region)( struct pipe_context *,
+				  struct pipe_transfer *transfer,
+				  const struct pipe_box *);
+
+   void (*transfer_unmap)( struct pipe_context *,
+			   struct pipe_transfer *transfer );
+
+   void (*transfer_inline_write)( struct pipe_context *pipe,
+				  struct pipe_resource *resource,
+				  struct pipe_subresource sr,
+				  unsigned usage,
+				  const struct pipe_box *box,
+				  const void *data,
+				  unsigned stride,
+				  unsigned slice_stride);
+};
+
+
+struct u_resource {
+   struct pipe_resource b;
+   struct u_resource_vtbl *vtbl;
+};
+
+
+boolean u_resource_get_handle_vtbl(struct pipe_screen *screen,
+			      struct pipe_resource *resource,
+			      struct winsys_handle *handle);
+
+void u_resource_destroy_vtbl(struct pipe_screen *screen,
+			struct pipe_resource *resource);
+
+unsigned u_is_resource_referenced_vtbl( struct pipe_context *pipe,
+					struct pipe_resource *resource,
+					unsigned face, unsigned level);
+
+struct pipe_transfer *u_get_transfer_vtbl(struct pipe_context *context,
+				     struct pipe_resource *resource,
+				     struct pipe_subresource sr,
+				     unsigned usage,
+				     const struct pipe_box *box);
+
+void u_transfer_destroy_vtbl(struct pipe_context *pipe,
+			struct pipe_transfer *transfer);
+
+void *u_transfer_map_vtbl( struct pipe_context *pipe,
+		      struct pipe_transfer *transfer );
+
+void u_transfer_flush_region_vtbl( struct pipe_context *pipe,
+			      struct pipe_transfer *transfer,
+			      const struct pipe_box *box);
+
+void u_transfer_unmap_vtbl( struct pipe_context *rm_ctx,
+		       struct pipe_transfer *transfer );
+
+void u_transfer_inline_write_vtbl( struct pipe_context *rm_ctx,
+				   struct pipe_resource *resource,
+				   struct pipe_subresource sr,
+				   unsigned usage,
+				   const struct pipe_box *box,
+				   const void *data,
+				   unsigned stride,
+				   unsigned slice_stride);
+
+
+
+
+
+
+
+
+#endif
diff --git a/src/gallium/auxiliary/util/u_upload_mgr.c b/src/gallium/auxiliary/util/u_upload_mgr.c
index 012b2ae233..75d44432d9 100644
--- a/src/gallium/auxiliary/util/u_upload_mgr.c
+++ b/src/gallium/auxiliary/util/u_upload_mgr.c
@@ -31,7 +31,7 @@
 
 #include "pipe/p_defines.h"
 #include "util/u_inlines.h"
-#include "pipe/p_screen.h"
+#include "pipe/p_context.h"
 #include "util/u_memory.h"
 #include "util/u_math.h"
 
@@ -39,7 +39,7 @@
 
 
 struct u_upload_mgr {
-   struct pipe_screen *screen;
+   struct pipe_context *pipe;
 
    unsigned default_size;
    unsigned alignment;
@@ -47,21 +47,21 @@ struct u_upload_mgr {
 
    /* The active buffer:
     */
-   struct pipe_buffer *buffer;
+   struct pipe_resource *buffer;
    unsigned size;
    unsigned offset;
 };
 
 
-struct u_upload_mgr *u_upload_create( struct pipe_screen *screen,
+struct u_upload_mgr *u_upload_create( struct pipe_context *pipe,
                                       unsigned default_size,
                                       unsigned alignment,
                                       unsigned usage )
 {
    struct u_upload_mgr *upload = CALLOC_STRUCT( u_upload_mgr );
 
+   upload->pipe = pipe;
    upload->default_size = default_size;
-   upload->screen = screen;
    upload->alignment = alignment;
    upload->usage = usage;
    upload->buffer = NULL;
@@ -69,31 +69,44 @@ struct u_upload_mgr *u_upload_create( struct pipe_screen *screen,
    return upload;
 }
 
-
+/* Slightly specialized version of buffer_write designed to maximize
+ * chances of the driver consolidating successive writes into a single
+ * upload.
+ *
+ * dirty_size may be slightly greater than size to cope with
+ * alignment.  We don't want to leave holes between succesively mapped
+ * regions as that may prevent the driver from consolidating uploads.
+ * 
+ * Note that the 'data' pointer has probably come from the application
+ * and we cannot read even a byte past its end without risking
+ * segfaults, or at least complaints from valgrind..
+ */
 static INLINE enum pipe_error
-my_buffer_write(struct pipe_screen *screen,
-                struct pipe_buffer *buf,
+my_buffer_write(struct pipe_context *pipe,
+                struct pipe_resource *buf,
                 unsigned offset, unsigned size, unsigned dirty_size,
                 const void *data)
 {
+   struct pipe_transfer *transfer = NULL;
    uint8_t *map;
    
-   assert(offset < buf->size);
-   assert(offset + size <= buf->size);
+   assert(offset < buf->width0);
+   assert(offset + size <= buf->width0);
    assert(dirty_size >= size);
    assert(size);
 
-   map = pipe_buffer_map_range(screen, buf, offset, size, 
-                               PIPE_BUFFER_USAGE_CPU_WRITE |
-                               PIPE_BUFFER_USAGE_FLUSH_EXPLICIT |
-                               PIPE_BUFFER_USAGE_DISCARD |
-                               PIPE_BUFFER_USAGE_UNSYNCHRONIZED);
+   map = pipe_buffer_map_range(pipe, buf, offset, dirty_size,
+                               PIPE_TRANSFER_WRITE |
+                               PIPE_TRANSFER_FLUSH_EXPLICIT |
+                               PIPE_TRANSFER_DISCARD |
+                               PIPE_TRANSFER_UNSYNCHRONIZED,
+			       &transfer);
    if (map == NULL) 
       return PIPE_ERROR_OUT_OF_MEMORY;
 
    memcpy(map + offset, data, size);
-   pipe_buffer_flush_mapped_range(screen, buf, offset, dirty_size);
-   pipe_buffer_unmap(screen, buf);
+   pipe_buffer_flush_mapped_range(pipe, transfer, offset, dirty_size);
+   pipe_buffer_unmap(pipe, buf, transfer);
 
    return PIPE_OK;
 }
@@ -109,7 +122,7 @@ my_buffer_write(struct pipe_screen *screen,
  */
 void u_upload_flush( struct u_upload_mgr *upload )
 {
-   pipe_buffer_reference( &upload->buffer, NULL );
+   pipe_resource_reference( &upload->buffer, NULL );
    upload->size = 0;
 }
 
@@ -135,9 +148,8 @@ u_upload_alloc_buffer( struct u_upload_mgr *upload,
     */
    size = align(MAX2(upload->default_size, min_size), 4096);
 
-   upload->buffer = pipe_buffer_create( upload->screen,
-                                        upload->alignment,
-                                        upload->usage | PIPE_BUFFER_USAGE_CPU_WRITE,
+   upload->buffer = pipe_buffer_create( upload->pipe->screen,
+                                        upload->usage,
                                         size );
    if (upload->buffer == NULL) 
       goto fail;
@@ -149,7 +161,7 @@ u_upload_alloc_buffer( struct u_upload_mgr *upload,
 
 fail:
    if (upload->buffer)
-      pipe_buffer_reference( &upload->buffer, NULL );
+      pipe_resource_reference( &upload->buffer, NULL );
 
    return PIPE_ERROR_OUT_OF_MEMORY;
 }
@@ -159,7 +171,7 @@ enum pipe_error u_upload_data( struct u_upload_mgr *upload,
                                unsigned size,
                                const void *data,
                                unsigned *out_offset,
-                               struct pipe_buffer **outbuf )
+                               struct pipe_resource **outbuf )
 {
    unsigned alloc_size = align( size, upload->alignment );
    enum pipe_error ret = PIPE_OK;
@@ -172,7 +184,7 @@ enum pipe_error u_upload_data( struct u_upload_mgr *upload,
 
    /* Copy the data, using map_range if available:
     */
-   ret = my_buffer_write( upload->screen, 
+   ret = my_buffer_write( upload->pipe, 
                           upload->buffer,
                           upload->offset,
                           size, 
@@ -183,7 +195,7 @@ enum pipe_error u_upload_data( struct u_upload_mgr *upload,
 
    /* Emit the return values:
     */
-   pipe_buffer_reference( outbuf, upload->buffer );
+   pipe_resource_reference( outbuf, upload->buffer );
    *out_offset = upload->offset;
    upload->offset += alloc_size;
    return PIPE_OK;
@@ -198,15 +210,18 @@ enum pipe_error u_upload_data( struct u_upload_mgr *upload,
 enum pipe_error u_upload_buffer( struct u_upload_mgr *upload,
                                  unsigned offset,
                                  unsigned size,
-                                 struct pipe_buffer *inbuf,
+                                 struct pipe_resource *inbuf,
                                  unsigned *out_offset,
-                                 struct pipe_buffer **outbuf )
+                                 struct pipe_resource **outbuf )
 {
    enum pipe_error ret = PIPE_OK;
+   struct pipe_transfer *transfer = NULL;
    const char *map = NULL;
 
-   map = (const char *)pipe_buffer_map( 
-      upload->screen, inbuf, PIPE_BUFFER_USAGE_CPU_READ );
+   map = (const char *)pipe_buffer_map(upload->pipe,
+				       inbuf,
+				       PIPE_TRANSFER_READ,
+				       &transfer);
 
    if (map == NULL) {
       ret = PIPE_ERROR_OUT_OF_MEMORY;
@@ -226,7 +241,7 @@ enum pipe_error u_upload_buffer( struct u_upload_mgr *upload,
 
 done:
    if (map)
-      pipe_buffer_unmap( upload->screen, inbuf );
+      pipe_buffer_unmap( upload->pipe, inbuf, transfer );
 
    return ret;
 }
diff --git a/src/gallium/auxiliary/util/u_upload_mgr.h b/src/gallium/auxiliary/util/u_upload_mgr.h
index e158bed9d0..a124924fc8 100644
--- a/src/gallium/auxiliary/util/u_upload_mgr.h
+++ b/src/gallium/auxiliary/util/u_upload_mgr.h
@@ -35,11 +35,11 @@
 #include "pipe/p_defines.h"
 
 struct pipe_screen;
-struct pipe_buffer;
+struct pipe_resource;
 struct u_upload_mgr;
 
 
-struct u_upload_mgr *u_upload_create( struct pipe_screen *screen,
+struct u_upload_mgr *u_upload_create( struct pipe_context *pipe,
                                       unsigned default_size,
                                       unsigned alignment,
                                       unsigned usage );
@@ -61,15 +61,15 @@ enum pipe_error u_upload_data( struct u_upload_mgr *upload,
                                unsigned size,
                                const void *data,
                                unsigned *out_offset,
-                               struct pipe_buffer **outbuf );
+                               struct pipe_resource **outbuf );
 
 
 enum pipe_error u_upload_buffer( struct u_upload_mgr *upload,
                                  unsigned offset,
                                  unsigned size,
-                                 struct pipe_buffer *inbuf,
+                                 struct pipe_resource *inbuf,
                                  unsigned *out_offset,
-                                 struct pipe_buffer **outbuf );
+                                 struct pipe_resource **outbuf );
 
 
 
diff --git a/src/gallium/auxiliary/vl/vl_bitstream_parser.c b/src/gallium/auxiliary/vl/vl_bitstream_parser.c
deleted file mode 100644
index 3193ea5f41..0000000000
--- a/src/gallium/auxiliary/vl/vl_bitstream_parser.c
+++ /dev/null
@@ -1,167 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2009 Younes Manton.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "vl_bitstream_parser.h"
-#include <assert.h>
-#include <limits.h>
-#include <util/u_memory.h>
-
-static unsigned
-grab_bits(unsigned cursor, unsigned how_many_bits, unsigned bitstream_elt)
-{
-   unsigned excess_bits = sizeof(unsigned) * CHAR_BIT - how_many_bits - cursor;
-	
-   assert(cursor < sizeof(unsigned) * CHAR_BIT);
-   assert(how_many_bits > 0 && how_many_bits <= sizeof(unsigned) * CHAR_BIT);
-   assert(cursor + how_many_bits <= sizeof(unsigned) * CHAR_BIT);
-
-   return (bitstream_elt << excess_bits) >> (excess_bits + cursor);
-}
-
-static unsigned
-show_bits(unsigned cursor, unsigned how_many_bits, const unsigned *bitstream)
-{	
-   unsigned cur_int = cursor / (sizeof(unsigned) * CHAR_BIT);
-   unsigned cur_bit = cursor % (sizeof(unsigned) * CHAR_BIT);
-	
-   assert(bitstream);
-	
-   if (cur_bit + how_many_bits > sizeof(unsigned) * CHAR_BIT) {
-      unsigned lower = grab_bits(cur_bit, sizeof(unsigned) * CHAR_BIT - cur_bit,
-                                 bitstream[cur_int]);
-      unsigned upper = grab_bits(0, cur_bit + how_many_bits - sizeof(unsigned) * CHAR_BIT,
-                                 bitstream[cur_int + 1]);
-      return lower | upper << (sizeof(unsigned) * CHAR_BIT - cur_bit);
-   }
-   else
-      return grab_bits(cur_bit, how_many_bits, bitstream[cur_int]);
-}
-
-bool vl_bitstream_parser_init(struct vl_bitstream_parser *parser,
-                              unsigned num_bitstreams,
-                              const void **bitstreams,
-                              const unsigned *sizes)
-{
-   assert(parser);
-   assert(num_bitstreams);
-   assert(bitstreams);
-   assert(sizes);
-
-   parser->num_bitstreams = num_bitstreams;
-   parser->bitstreams = (const unsigned**)bitstreams;
-   parser->sizes = sizes;
-   parser->cur_bitstream = 0;
-   parser->cursor = 0;
-
-   return true;
-}
-
-void vl_bitstream_parser_cleanup(struct vl_bitstream_parser *parser)
-{
-   assert(parser);
-}
-
-unsigned
-vl_bitstream_parser_get_bits(struct vl_bitstream_parser *parser,
-                             unsigned how_many_bits)
-{
-   unsigned bits;
-
-   assert(parser);
-
-   bits = vl_bitstream_parser_show_bits(parser, how_many_bits);
-
-   vl_bitstream_parser_forward(parser, how_many_bits);
-
-   return bits;
-}
-
-unsigned
-vl_bitstream_parser_show_bits(struct vl_bitstream_parser *parser,
-                              unsigned how_many_bits)
-{	
-   unsigned bits = 0;
-   unsigned shift = 0;
-   unsigned cursor;
-   unsigned cur_bitstream;
-
-   assert(parser);
-
-   cursor = parser->cursor;
-   cur_bitstream = parser->cur_bitstream;
-
-   while (1) {
-      unsigned bits_left = parser->sizes[cur_bitstream] * CHAR_BIT - cursor;
-      unsigned bits_to_show = how_many_bits > bits_left ? bits_left : how_many_bits;
-
-      bits |= show_bits(cursor, bits_to_show,
-                        parser->bitstreams[cur_bitstream]) << shift;
-		
-      if (how_many_bits > bits_to_show) {
-         how_many_bits -= bits_to_show;
-         cursor = 0;
-         ++cur_bitstream;
-         shift += bits_to_show;
-      }
-      else
-         break;
-   }
-
-   return bits;
-}
-
-void vl_bitstream_parser_forward(struct vl_bitstream_parser *parser,
-                                 unsigned how_many_bits)
-{
-   assert(parser);
-   assert(how_many_bits);
-
-   parser->cursor += how_many_bits;
-
-   while (parser->cursor > parser->sizes[parser->cur_bitstream] * CHAR_BIT) {
-      parser->cursor -= parser->sizes[parser->cur_bitstream++] * CHAR_BIT;
-      assert(parser->cur_bitstream < parser->num_bitstreams);
-   }
-}
-
-void vl_bitstream_parser_rewind(struct vl_bitstream_parser *parser,
-                                unsigned how_many_bits)
-{
-   signed c;
-	
-   assert(parser);
-   assert(how_many_bits);
-	
-   c = parser->cursor - how_many_bits;
-
-   while (c < 0) {
-      c += parser->sizes[parser->cur_bitstream--] * CHAR_BIT;
-      assert(parser->cur_bitstream < parser->num_bitstreams);
-   }
-
-   parser->cursor = (unsigned)c;
-}
diff --git a/src/gallium/auxiliary/vl/vl_bitstream_parser.h b/src/gallium/auxiliary/vl/vl_bitstream_parser.h
deleted file mode 100644
index 30ec743fa7..0000000000
--- a/src/gallium/auxiliary/vl/vl_bitstream_parser.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2009 Younes Manton.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#ifndef vl_bitstream_parser_h
-#define vl_bitstream_parser_h
-
-#include "pipe/p_compiler.h"
-
-struct vl_bitstream_parser
-{
-   unsigned num_bitstreams;
-   const unsigned **bitstreams;
-   const unsigned *sizes;
-   unsigned cur_bitstream;
-   unsigned cursor;
-};
-
-bool vl_bitstream_parser_init(struct vl_bitstream_parser *parser,
-                              unsigned num_bitstreams,
-                              const void **bitstreams,
-                              const unsigned *sizes);
-
-void vl_bitstream_parser_cleanup(struct vl_bitstream_parser *parser);
-
-unsigned
-vl_bitstream_parser_get_bits(struct vl_bitstream_parser *parser,
-                             unsigned how_many_bits);
-
-unsigned
-vl_bitstream_parser_show_bits(struct vl_bitstream_parser *parser,
-                              unsigned how_many_bits);
-
-void vl_bitstream_parser_forward(struct vl_bitstream_parser *parser,
-                                 unsigned how_many_bits);
-
-void vl_bitstream_parser_rewind(struct vl_bitstream_parser *parser,
-                                unsigned how_many_bits);
-
-#endif /* vl_bitstream_parser_h */
diff --git a/src/gallium/auxiliary/vl/vl_compositor.c b/src/gallium/auxiliary/vl/vl_compositor.c
deleted file mode 100644
index ba23435f69..0000000000
--- a/src/gallium/auxiliary/vl/vl_compositor.c
+++ /dev/null
@@ -1,535 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2009 Younes Manton.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "vl_compositor.h"
-#include <assert.h>
-#include <pipe/p_context.h>
-#include <util/u_inlines.h>
-#include <tgsi/tgsi_parse.h>
-#include <tgsi/tgsi_build.h>
-#include <util/u_memory.h>
-#include "vl_csc.h"
-#include "vl_shader_build.h"
-
-struct vertex2f
-{
-   float x, y;
-};
-
-struct vertex4f
-{
-   float x, y, z, w;
-};
-
-struct vertex_shader_consts
-{
-   struct vertex4f dst_scale;
-   struct vertex4f dst_trans;
-   struct vertex4f src_scale;
-   struct vertex4f src_trans;
-};
-
-struct fragment_shader_consts
-{
-   float matrix[16];
-};
-
-/*
- * Represents 2 triangles in a strip in normalized coords.
- * Used to render the surface onto the frame buffer.
- */
-static const struct vertex2f surface_verts[4] =
-{
-   {0.0f, 0.0f},
-   {0.0f, 1.0f},
-   {1.0f, 0.0f},
-   {1.0f, 1.0f}
-};
-
-/*
- * Represents texcoords for the above. We can use the position values directly.
- * TODO: Duplicate these in the shader, no need to create a buffer.
- */
-static const struct vertex2f *surface_texcoords = surface_verts;
-
-static void
-create_vert_shader(struct vl_compositor *c)
-{
-   const unsigned max_tokens = 50;
-
-   struct pipe_shader_state vs;
-   struct tgsi_token *tokens;
-   struct tgsi_header *header;
-
-   struct tgsi_full_declaration decl;
-   struct tgsi_full_instruction inst;
-
-   unsigned ti;
-
-   unsigned i;
-
-   assert(c);
-
-   tokens = (struct tgsi_token*)MALLOC(max_tokens * sizeof(struct tgsi_token));
-   header = (struct tgsi_header*)&tokens[0];
-   *header = tgsi_build_header();
-   *(struct tgsi_processor*)&tokens[1] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
-
-   ti = 2;
-
-   /*
-    * decl i0             ; Vertex pos
-    * decl i1             ; Vertex texcoords
-    */
-   for (i = 0; i < 2; i++) {
-      decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-   }
-
-   /*
-    * decl c0             ; Scaling vector to scale vertex pos rect to destination size
-    * decl c1             ; Translation vector to move vertex pos rect into position
-    * decl c2             ; Scaling vector to scale texcoord rect to source size
-    * decl c3             ; Translation vector to move texcoord rect into position
-    */
-   decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 3);
-   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-   /*
-    * decl o0             ; Vertex pos
-    * decl o1             ; Vertex texcoords
-    */
-   for (i = 0; i < 2; i++) {
-      decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-   }
-
-   /* decl t0, t1 */
-   decl = vl_decl_temps(0, 1);
-   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-   /*
-    * mad o0, i0, c0, c1  ; Scale and translate unit output rect to destination size and pos
-    * mad o1, i1, c2, c3  ; Scale and translate unit texcoord rect to source size and pos
-    */
-   for (i = 0; i < 2; ++i) {
-      inst = vl_inst4(TGSI_OPCODE_MAD, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i, TGSI_FILE_CONSTANT, i * 2, TGSI_FILE_CONSTANT, i * 2 + 1);
-      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-   }
-
-   /* end */
-   inst = vl_end();
-   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-   assert(ti <= max_tokens);
-
-   vs.tokens = tokens;
-   c->vertex_shader = c->pipe->create_vs_state(c->pipe, &vs);
-   FREE(tokens);
-}
-
-static void
-create_frag_shader(struct vl_compositor *c)
-{
-   const unsigned max_tokens = 50;
-
-   struct pipe_shader_state fs;
-   struct tgsi_token *tokens;
-   struct tgsi_header *header;
-
-   struct tgsi_full_declaration decl;
-   struct tgsi_full_instruction inst;
-
-   unsigned ti;
-
-   unsigned i;
-
-   assert(c);
-
-   tokens = (struct tgsi_token*)MALLOC(max_tokens * sizeof(struct tgsi_token));
-   header = (struct tgsi_header*)&tokens[0];
-   *header = tgsi_build_header();
-   *(struct tgsi_processor*)&tokens[1] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
-
-   ti = 2;
-
-   /* decl i0             ; Texcoords for s0 */
-   decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, 1, 0, 0, TGSI_INTERPOLATE_LINEAR);
-   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-   /*
-    * decl c0-c3          ; CSC matrix c0-c3
-    */
-   decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 3);
-   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-   /* decl o0             ; Fragment color */
-   decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
-   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-   /* decl t0 */
-   decl = vl_decl_temps(0, 0);
-   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-   /* decl s0             ; Sampler for tex containing picture to display */
-   decl = vl_decl_samplers(0, 0);
-   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-   /* tex2d t0, i0, s0    ; Read src pixel */
-   inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_SAMPLER, 0);
-   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-   /*
-    * dp4 o0.x, t0, c0    ; Multiply pixel by the color conversion matrix
-    * dp4 o0.y, t0, c1
-    * dp4 o0.z, t0, c2
-    * dp4 o0.w, t0, c3
-    */
-   for (i = 0; i < 4; ++i) {
-      inst = vl_inst3(TGSI_OPCODE_DP4, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, i);
-      inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_X << i;
-      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-   }
-
-   /* end */
-   inst = vl_end();
-   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-   assert(ti <= max_tokens);
-
-   fs.tokens = tokens;
-   c->fragment_shader = c->pipe->create_fs_state(c->pipe, &fs);
-   FREE(tokens);
-}
-
-static bool
-init_pipe_state(struct vl_compositor *c)
-{
-   struct pipe_sampler_state sampler;
-
-   assert(c);
-
-   c->fb_state.nr_cbufs = 1;
-   c->fb_state.zsbuf = NULL;
-
-   sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-   sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-   sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-   sampler.min_img_filter = PIPE_TEX_FILTER_LINEAR;
-   sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
-   sampler.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
-   sampler.compare_mode = PIPE_TEX_COMPARE_NONE;
-   sampler.compare_func = PIPE_FUNC_ALWAYS;
-   sampler.normalized_coords = 1;
-   /*sampler.lod_bias = ;*/
-   /*sampler.min_lod = ;*/
-   /*sampler.max_lod = ;*/
-   /*sampler.border_color[i] = ;*/
-   /*sampler.max_anisotropy = ;*/
-   c->sampler = c->pipe->create_sampler_state(c->pipe, &sampler);
-	
-   return true;
-}
-
-static void cleanup_pipe_state(struct vl_compositor *c)
-{
-   assert(c);
-	
-   c->pipe->delete_sampler_state(c->pipe, c->sampler);
-}
-
-static bool
-init_shaders(struct vl_compositor *c)
-{
-   assert(c);
-
-   create_vert_shader(c);
-   create_frag_shader(c);
-
-   return true;
-}
-
-static void cleanup_shaders(struct vl_compositor *c)
-{
-   assert(c);
-	
-   c->pipe->delete_vs_state(c->pipe, c->vertex_shader);
-   c->pipe->delete_fs_state(c->pipe, c->fragment_shader);
-}
-
-static bool
-init_buffers(struct vl_compositor *c)
-{
-   struct fragment_shader_consts fsc;
-
-   assert(c);
-	
-   /*
-    * Create our vertex buffer and vertex buffer element
-    * VB contains 4 vertices that render a quad covering the entire window
-    * to display a rendered surface
-    * Quad is rendered as a tri strip
-    */
-   c->vertex_bufs[0].stride = sizeof(struct vertex2f);
-   c->vertex_bufs[0].max_index = 3;
-   c->vertex_bufs[0].buffer_offset = 0;
-   c->vertex_bufs[0].buffer = pipe_buffer_create
-   (
-      c->pipe->screen,
-      1,
-      PIPE_BUFFER_USAGE_VERTEX,
-      sizeof(struct vertex2f) * 4
-   );
-
-   memcpy
-   (
-      pipe_buffer_map(c->pipe->screen, c->vertex_bufs[0].buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
-      surface_verts,
-      sizeof(struct vertex2f) * 4
-   );
-
-   pipe_buffer_unmap(c->pipe->screen, c->vertex_bufs[0].buffer);
-
-   c->vertex_elems[0].src_offset = 0;
-   c->vertex_elems[0].instance_divisor = 0;
-   c->vertex_elems[0].vertex_buffer_index = 0;
-   c->vertex_elems[0].nr_components = 2;
-   c->vertex_elems[0].src_format = PIPE_FORMAT_R32G32_FLOAT;
-
-   /*
-    * Create our texcoord buffer and texcoord buffer element
-    * Texcoord buffer contains the TCs for mapping the rendered surface to the 4 vertices
-    */
-   c->vertex_bufs[1].stride = sizeof(struct vertex2f);
-   c->vertex_bufs[1].max_index = 3;
-   c->vertex_bufs[1].buffer_offset = 0;
-   c->vertex_bufs[1].buffer = pipe_buffer_create
-   (
-      c->pipe->screen,
-      1,
-      PIPE_BUFFER_USAGE_VERTEX,
-      sizeof(struct vertex2f) * 4
-   );
-
-   memcpy
-   (
-      pipe_buffer_map(c->pipe->screen, c->vertex_bufs[1].buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
-      surface_texcoords,
-      sizeof(struct vertex2f) * 4
-   );
-
-   pipe_buffer_unmap(c->pipe->screen, c->vertex_bufs[1].buffer);
-
-   c->vertex_elems[1].src_offset = 0;
-   c->vertex_elems[1].instance_divisor = 0;
-   c->vertex_elems[1].vertex_buffer_index = 1;
-   c->vertex_elems[1].nr_components = 2;
-   c->vertex_elems[1].src_format = PIPE_FORMAT_R32G32_FLOAT;
-
-   /*
-    * Create our vertex shader's constant buffer
-    * Const buffer contains scaling and translation vectors
-    */
-   c->vs_const_buf = pipe_buffer_create
-   (
-      c->pipe->screen,
-      1,
-      PIPE_BUFFER_USAGE_CONSTANT | PIPE_BUFFER_USAGE_DISCARD,
-      sizeof(struct vertex_shader_consts)
-   );
-
-   /*
-    * Create our fragment shader's constant buffer
-    * Const buffer contains the color conversion matrix and bias vectors
-    */
-   c->fs_const_buf = pipe_buffer_create
-   (
-      c->pipe->screen,
-      1,
-      PIPE_BUFFER_USAGE_CONSTANT,
-      sizeof(struct fragment_shader_consts)
-   );
-
-   vl_csc_get_matrix(VL_CSC_COLOR_STANDARD_IDENTITY, NULL, true, fsc.matrix);
-
-   vl_compositor_set_csc_matrix(c, fsc.matrix);
-
-   return true;
-}
-
-static void
-cleanup_buffers(struct vl_compositor *c)
-{
-   unsigned i;
-
-   assert(c);
-	
-   for (i = 0; i < 2; ++i)
-      pipe_buffer_reference(&c->vertex_bufs[i].buffer, NULL);
-
-   pipe_buffer_reference(&c->vs_const_buf, NULL);
-   pipe_buffer_reference(&c->fs_const_buf, NULL);
-}
-
-bool vl_compositor_init(struct vl_compositor *compositor, struct pipe_context *pipe)
-{
-   assert(compositor);
-
-   memset(compositor, 0, sizeof(struct vl_compositor));
-
-   compositor->pipe = pipe;
-
-   if (!init_pipe_state(compositor))
-      return false;
-   if (!init_shaders(compositor)) {
-      cleanup_pipe_state(compositor);
-      return false;
-   }
-   if (!init_buffers(compositor)) {
-      cleanup_shaders(compositor);
-      cleanup_pipe_state(compositor);
-      return false;
-   }
-
-   return true;
-}
-
-void vl_compositor_cleanup(struct vl_compositor *compositor)
-{
-   assert(compositor);
-	
-   cleanup_buffers(compositor);
-   cleanup_shaders(compositor);
-   cleanup_pipe_state(compositor);
-}
-
-void vl_compositor_render(struct vl_compositor          *compositor,
-                          /*struct pipe_texture         *backround,
-                          struct pipe_video_rect        *backround_area,*/
-                          struct pipe_texture           *src_surface,
-                          enum pipe_mpeg12_picture_type picture_type,
-                          /*unsigned                    num_past_surfaces,
-                          struct pipe_texture           *past_surfaces,
-                          unsigned                      num_future_surfaces,
-                          struct pipe_texture           *future_surfaces,*/
-                          struct pipe_video_rect        *src_area,
-                          struct pipe_texture           *dst_surface,
-                          struct pipe_video_rect        *dst_area,
-                          /*unsigned                      num_layers,
-                          struct pipe_texture           *layers,
-                          struct pipe_video_rect        *layer_src_areas,
-                          struct pipe_video_rect        *layer_dst_areas*/
-                          struct pipe_fence_handle      **fence)
-{
-   struct vertex_shader_consts *vs_consts;
-
-   assert(compositor);
-   assert(src_surface);
-   assert(src_area);
-   assert(dst_surface);
-   assert(dst_area);
-   assert(picture_type == PIPE_MPEG12_PICTURE_TYPE_FRAME);
-
-   compositor->fb_state.width = dst_surface->width0;
-   compositor->fb_state.height = dst_surface->height0;
-   compositor->fb_state.cbufs[0] = compositor->pipe->screen->get_tex_surface
-   (
-      compositor->pipe->screen,
-      dst_surface,
-      0, 0, 0, PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE
-   );
-
-   compositor->viewport.scale[0] = compositor->fb_state.width;
-   compositor->viewport.scale[1] = compositor->fb_state.height;
-   compositor->viewport.scale[2] = 1;
-   compositor->viewport.scale[3] = 1;
-   compositor->viewport.translate[0] = 0;
-   compositor->viewport.translate[1] = 0;
-   compositor->viewport.translate[2] = 0;
-   compositor->viewport.translate[3] = 0;
-
-   compositor->scissor.maxx = compositor->fb_state.width;
-   compositor->scissor.maxy = compositor->fb_state.height;
-
-   compositor->pipe->set_framebuffer_state(compositor->pipe, &compositor->fb_state);
-   compositor->pipe->set_viewport_state(compositor->pipe, &compositor->viewport);
-   compositor->pipe->set_scissor_state(compositor->pipe, &compositor->scissor);
-   compositor->pipe->bind_fragment_sampler_states(compositor->pipe, 1, &compositor->sampler);
-   compositor->pipe->set_fragment_sampler_textures(compositor->pipe, 1, &src_surface);
-   compositor->pipe->bind_vs_state(compositor->pipe, compositor->vertex_shader);
-   compositor->pipe->bind_fs_state(compositor->pipe, compositor->fragment_shader);
-   compositor->pipe->set_vertex_buffers(compositor->pipe, 2, compositor->vertex_bufs);
-   compositor->pipe->set_vertex_elements(compositor->pipe, 2, compositor->vertex_elems);
-   compositor->pipe->set_constant_buffer(compositor->pipe, PIPE_SHADER_VERTEX, 0, compositor->vs_const_buf);
-   compositor->pipe->set_constant_buffer(compositor->pipe, PIPE_SHADER_FRAGMENT, 0, compositor->fs_const_buf);
-
-   vs_consts = pipe_buffer_map
-   (
-      compositor->pipe->screen,
-      compositor->vs_const_buf,
-      PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_DISCARD
-   );
-
-   vs_consts->dst_scale.x = dst_area->w / (float)compositor->fb_state.cbufs[0]->width;
-   vs_consts->dst_scale.y = dst_area->h / (float)compositor->fb_state.cbufs[0]->height;
-   vs_consts->dst_scale.z = 1;
-   vs_consts->dst_scale.w = 1;
-   vs_consts->dst_trans.x = dst_area->x / (float)compositor->fb_state.cbufs[0]->width;
-   vs_consts->dst_trans.y = dst_area->y / (float)compositor->fb_state.cbufs[0]->height;
-   vs_consts->dst_trans.z = 0;
-   vs_consts->dst_trans.w = 0;
-
-   vs_consts->src_scale.x = src_area->w / (float)src_surface->width0;
-   vs_consts->src_scale.y = src_area->h / (float)src_surface->height0;
-   vs_consts->src_scale.z = 1;
-   vs_consts->src_scale.w = 1;
-   vs_consts->src_trans.x = src_area->x / (float)src_surface->width0;
-   vs_consts->src_trans.y = src_area->y / (float)src_surface->height0;
-   vs_consts->src_trans.z = 0;
-   vs_consts->src_trans.w = 0;
-
-   pipe_buffer_unmap(compositor->pipe->screen, compositor->vs_const_buf);
-
-   compositor->pipe->draw_arrays(compositor->pipe, PIPE_PRIM_TRIANGLE_STRIP, 0, 4);
-   compositor->pipe->flush(compositor->pipe, PIPE_FLUSH_RENDER_CACHE, fence);
-
-   pipe_surface_reference(&compositor->fb_state.cbufs[0], NULL);
-}
-
-void vl_compositor_set_csc_matrix(struct vl_compositor *compositor, const float *mat)
-{
-   assert(compositor);
-
-   memcpy
-   (
-      pipe_buffer_map(compositor->pipe->screen, compositor->fs_const_buf, PIPE_BUFFER_USAGE_CPU_WRITE),
-      mat,
-      sizeof(struct fragment_shader_consts)
-   );
-
-   pipe_buffer_unmap(compositor->pipe->screen, compositor->fs_const_buf);
-}
diff --git a/src/gallium/auxiliary/vl/vl_compositor.h b/src/gallium/auxiliary/vl/vl_compositor.h
deleted file mode 100644
index 6a9a3fd7af..0000000000
--- a/src/gallium/auxiliary/vl/vl_compositor.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2009 Younes Manton.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#ifndef vl_compositor_h
-#define vl_compositor_h
-
-#include <pipe/p_compiler.h>
-#include <pipe/p_state.h>
-#include <pipe/p_video_state.h>
-
-struct pipe_context;
-struct pipe_texture;
-
-struct vl_compositor
-{
-   struct pipe_context *pipe;
-
-   struct pipe_framebuffer_state fb_state;
-   void *sampler;
-   void *vertex_shader;
-   void *fragment_shader;
-   struct pipe_viewport_state viewport;
-   struct pipe_scissor_state scissor;
-   struct pipe_vertex_buffer vertex_bufs[2];
-   struct pipe_vertex_element vertex_elems[2];
-   struct pipe_buffer *vs_const_buf, *fs_const_buf;
-};
-
-bool vl_compositor_init(struct vl_compositor *compositor, struct pipe_context *pipe);
-
-void vl_compositor_cleanup(struct vl_compositor *compositor);
-
-void vl_compositor_render(struct vl_compositor          *compositor,
-                          /*struct pipe_texture         *backround,
-                          struct pipe_video_rect        *backround_area,*/
-                          struct pipe_texture           *src_surface,
-                          enum pipe_mpeg12_picture_type picture_type,
-                          /*unsigned                    num_past_surfaces,
-                          struct pipe_texture           *past_surfaces,
-                          unsigned                      num_future_surfaces,
-                          struct pipe_texture           *future_surfaces,*/
-                          struct pipe_video_rect        *src_area,
-                          struct pipe_texture           *dst_surface,
-                          struct pipe_video_rect        *dst_area,
-                          /*unsigned                      num_layers,
-                          struct pipe_texture           *layers,
-                          struct pipe_video_rect        *layer_src_areas,
-                          struct pipe_video_rect        *layer_dst_areas,*/
-                          struct pipe_fence_handle      **fence);
-
-void vl_compositor_set_csc_matrix(struct vl_compositor *compositor, const float *mat);
-
-#endif /* vl_compositor_h */
diff --git a/src/gallium/auxiliary/vl/vl_csc.c b/src/gallium/auxiliary/vl/vl_csc.c
deleted file mode 100644
index 5ecc43a5fa..0000000000
--- a/src/gallium/auxiliary/vl/vl_csc.c
+++ /dev/null
@@ -1,206 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2009 Younes Manton.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "vl_csc.h"
-#include <util/u_math.h>
-#include <util/u_debug.h>
-
-/*
- * Color space conversion formulas
- *
- * To convert YCbCr to RGB,
- *    vec4  ycbcr, rgb
- *    mat44 csc
- *    rgb = csc * ycbcr
- *
- * To calculate the color space conversion matrix csc with ProcAmp adjustments,
- *    mat44 csc, cstd, procamp, bias
- *    csc = cstd * (procamp * bias)
- *
- * Where cstd is a matrix corresponding to one of the color standards (BT.601, BT.709, etc)
- * adjusted for the kind of YCbCr -> RGB mapping wanted (1:1, full),
- * bias is a matrix corresponding to the kind of YCbCr -> RGB mapping wanted (1:1, full)
- *
- * To calculate procamp,
- *    mat44 procamp, hue, saturation, brightness, contrast
- *    procamp = brightness * (saturation * (contrast * hue))
- * Alternatively,
- *    procamp = saturation * (brightness * (contrast * hue))
- *
- * contrast
- * [ c, 0, 0, 0]
- * [ 0, c, 0, 0]
- * [ 0, 0, c, 0]
- * [ 0, 0, 0, 1]
- *
- * brightness
- * [ 1, 0, 0, b]
- * [ 0, 1, 0, 0]
- * [ 0, 0, 1, 0]
- * [ 0, 0, 0, 1]
- *
- * saturation
- * [ 1, 0, 0, 0]
- * [ 0, s, 0, 0]
- * [ 0, 0, s, 0]
- * [ 0, 0, 0, 1]
- *
- * hue
- * [ 1,       0,      0, 0]
- * [ 0,  cos(h), sin(h), 0]
- * [ 0, -sin(h), cos(h), 0]
- * [ 0,       0,      0, 1]
- *
- * procamp
- * [ c,           0,          0, b]
- * [ 0,  c*s*cos(h), c*s*sin(h), 0]
- * [ 0, -c*s*sin(h), c*s*cos(h), 0]
- * [ 0,           0,          0, 1]
- *
- * bias
- * [ 1, 0, 0,  ybias]
- * [ 0, 1, 0, cbbias]
- * [ 0, 0, 1, crbias]
- * [ 0, 0, 0,      1]
- *
- * csc
- * [ c*cstd[ 0], c*cstd[ 1]*s*cos(h) - c*cstd[ 2]*s*sin(h), c*cstd[ 2]*s*cos(h) + c*cstd[ 1]*s*sin(h), cstd[ 3] + cstd[ 0]*(b + c*ybias) + cstd[ 1]*(c*cbbias*s*cos(h) + c*crbias*s*sin(h)) + cstd[ 2]*(c*crbias*s*cos(h) - c*cbbias*s*sin(h))]
- * [ c*cstd[ 4], c*cstd[ 5]*s*cos(h) - c*cstd[ 6]*s*sin(h), c*cstd[ 6]*s*cos(h) + c*cstd[ 5]*s*sin(h), cstd[ 7] + cstd[ 4]*(b + c*ybias) + cstd[ 5]*(c*cbbias*s*cos(h) + c*crbias*s*sin(h)) + cstd[ 6]*(c*crbias*s*cos(h) - c*cbbias*s*sin(h))]
- * [ c*cstd[ 8], c*cstd[ 9]*s*cos(h) - c*cstd[10]*s*sin(h), c*cstd[10]*s*cos(h) + c*cstd[ 9]*s*sin(h), cstd[11] + cstd[ 8]*(b + c*ybias) + cstd[ 9]*(c*cbbias*s*cos(h) + c*crbias*s*sin(h)) + cstd[10]*(c*crbias*s*cos(h) - c*cbbias*s*sin(h))]
- * [ c*cstd[12], c*cstd[13]*s*cos(h) - c*cstd[14]*s*sin(h), c*cstd[14]*s*cos(h) + c*cstd[13]*s*sin(h), cstd[15] + cstd[12]*(b + c*ybias) + cstd[13]*(c*cbbias*s*cos(h) + c*crbias*s*sin(h)) + cstd[14]*(c*crbias*s*cos(h) - c*cbbias*s*sin(h))]
- */
-
-/*
- * Converts ITU-R BT.601 YCbCr pixels to RGB pixels where:
- * Y is in [16,235], Cb and Cr are in [16,240]
- * R, G, and B are in [16,235]
- */
-static const float bt_601[16] =
-{
-   1.0f,  0.0f,    1.371f, 0.0f,
-   1.0f, -0.336f, -0.698f, 0.0f,
-   1.0f,  1.732f,  0.0f,   0.0f,
-   0.0f,  0.0f,    0.0f,   1.0f
-};
-
-/*
- * Converts ITU-R BT.601 YCbCr pixels to RGB pixels where:
- * Y is in [16,235], Cb and Cr are in [16,240]
- * R, G, and B are in [0,255]
- */
-static const float bt_601_full[16] =
-{
-   1.164f,  0.0f,    1.596f, 0.0f,
-   1.164f, -0.391f, -0.813f, 0.0f,
-   1.164f,  2.018f,  0.0f,   0.0f,
-   0.0f,    0.0f,    0.0f,   1.0f
-};
-
-/*
- * Converts ITU-R BT.709 YCbCr pixels to RGB pixels where:
- * Y is in [16,235], Cb and Cr are in [16,240]
- * R, G, and B are in [16,235]
- */
-static const float bt_709[16] =
-{
-   1.0f,  0.0f,    1.540f, 0.0f,
-   1.0f, -0.183f, -0.459f, 0.0f,
-   1.0f,  1.816f,  0.0f,   0.0f,
-   0.0f,  0.0f,    0.0f,   1.0f
-};
-
-/*
- * Converts ITU-R BT.709 YCbCr pixels to RGB pixels where:
- * Y is in [16,235], Cb and Cr are in [16,240]
- * R, G, and B are in [0,255]
- */
-static const float bt_709_full[16] =
-{
-   1.164f,  0.0f,    1.793f, 0.0f,
-   1.164f, -0.213f, -0.534f, 0.0f,
-   1.164f,  2.115f,  0.0f,   0.0f,
-   0.0f,    0.0f,    0.0f,   1.0f
-};
-
-static const float identity[16] =
-{
-   1.0f, 0.0f, 0.0f, 0.0f,
-   0.0f, 1.0f, 0.0f, 0.0f,
-   0.0f, 0.0f, 1.0f, 0.0f,
-   0.0f, 0.0f, 0.0f, 1.0f
-};
-
-void vl_csc_get_matrix(enum VL_CSC_COLOR_STANDARD cs,
-                       struct vl_procamp *procamp,
-                       bool full_range,
-                       float *matrix)
-{
-   float ybias = full_range ? -16.0f/255.0f : 0.0f;
-   float cbbias = -128.0f/255.0f;
-   float crbias = -128.0f/255.0f;
-   float c = procamp ? procamp->contrast : 1.0f;
-   float s = procamp ? procamp->saturation : 1.0f;
-   float b = procamp ? procamp->brightness : 0.0f;
-   float h = procamp ? procamp->hue : 0.0f;
-   const float *cstd;
-
-   assert(matrix);
-
-   switch (cs) {
-      case VL_CSC_COLOR_STANDARD_BT_601:
-         cstd = full_range ? &bt_601_full[0] : &bt_601[0];
-         break;
-      case VL_CSC_COLOR_STANDARD_BT_709:
-         cstd = full_range ? &bt_709_full[0] : &bt_709[0];
-         break;
-      case VL_CSC_COLOR_STANDARD_IDENTITY:
-      default:
-         assert(cs == VL_CSC_COLOR_STANDARD_IDENTITY);
-         memcpy(matrix, &identity[0], sizeof(float) * 16);
-         return;
-   }
-
-   matrix[ 0] = c*cstd[ 0];
-   matrix[ 1] = c*cstd[ 1]*s*cosf(h) - c*cstd[ 2]*s*sinf(h);
-   matrix[ 2] = c*cstd[ 2]*s*cosf(h) + c*cstd[ 1]*s*sinf(h);
-   matrix[ 3] = cstd[ 3] + cstd[ 0]*(b + c*ybias) + cstd[ 1]*(c*cbbias*s*cosf(h) + c*crbias*s*sinf(h)) + cstd[ 2]*(c*crbias*s*cosf(h) - c*cbbias*s*sinf(h));
-
-   matrix[ 4] = c*cstd[ 4];
-   matrix[ 5] = c*cstd[ 5]*s*cosf(h) - c*cstd[ 6]*s*sinf(h);
-   matrix[ 6] = c*cstd[ 6]*s*cosf(h) + c*cstd[ 5]*s*sinf(h);
-   matrix[ 7] = cstd[ 7] + cstd[ 4]*(b + c*ybias) + cstd[ 5]*(c*cbbias*s*cosf(h) + c*crbias*s*sinf(h)) + cstd[ 6]*(c*crbias*s*cosf(h) - c*cbbias*s*sinf(h));
-
-   matrix[ 8] = c*cstd[ 8];
-   matrix[ 9] = c*cstd[ 9]*s*cosf(h) - c*cstd[10]*s*sinf(h);
-   matrix[10] = c*cstd[10]*s*cosf(h) + c*cstd[ 9]*s*sinf(h);
-   matrix[11] = cstd[11] + cstd[ 8]*(b + c*ybias) + cstd[ 9]*(c*cbbias*s*cosf(h) + c*crbias*s*sinf(h)) + cstd[10]*(c*crbias*s*cosf(h) - c*cbbias*s*sinf(h));
-
-   matrix[12] = c*cstd[12];
-   matrix[13] = c*cstd[13]*s*cos(h) - c*cstd[14]*s*sin(h);
-   matrix[14] = c*cstd[14]*s*cos(h) + c*cstd[13]*s*sin(h);
-   matrix[15] = cstd[15] + cstd[12]*(b + c*ybias) + cstd[13]*(c*cbbias*s*cos(h) + c*crbias*s*sin(h)) + cstd[14]*(c*crbias*s*cos(h) - c*cbbias*s*sin(h));
-}
diff --git a/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.c b/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.c
deleted file mode 100644
index f323de0ea5..0000000000
--- a/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.c
+++ /dev/null
@@ -1,1672 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2009 Younes Manton.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "vl_mpeg12_mc_renderer.h"
-#include <assert.h>
-#include <pipe/p_context.h>
-#include <util/u_inlines.h>
-#include <util/u_format.h>
-#include <util/u_math.h>
-#include <util/u_memory.h>
-#include <tgsi/tgsi_parse.h>
-#include <tgsi/tgsi_build.h>
-#include "vl_shader_build.h"
-
-#define DEFAULT_BUF_ALIGNMENT 1
-#define MACROBLOCK_WIDTH 16
-#define MACROBLOCK_HEIGHT 16
-#define BLOCK_WIDTH 8
-#define BLOCK_HEIGHT 8
-#define ZERO_BLOCK_NIL -1.0f
-#define ZERO_BLOCK_IS_NIL(zb) ((zb).x < 0.0f)
-
-struct vertex2f
-{
-   float x, y;
-};
-
-struct vertex4f
-{
-   float x, y, z, w;
-};
-
-struct vertex_shader_consts
-{
-   struct vertex4f denorm;
-};
-
-struct fragment_shader_consts
-{
-   struct vertex4f multiplier;
-   struct vertex4f div;
-};
-
-/*
- * Muliplier renormalizes block samples from 16 bits to 12 bits.
- * Divider is used when calculating Y % 2 for choosing top or bottom
- * field for P or B macroblocks.
- * TODO: Use immediates.
- */
-static const struct fragment_shader_consts fs_consts = {
-   {32767.0f / 255.0f, 32767.0f / 255.0f, 32767.0f / 255.0f, 0.0f},
-   {0.5f, 2.0f, 0.0f, 0.0f}
-};
-
-struct vert_stream_0
-{
-   struct vertex2f pos;
-   struct vertex2f luma_tc;
-   struct vertex2f cb_tc;
-   struct vertex2f cr_tc;
-};
-
-enum MACROBLOCK_TYPE
-{
-   MACROBLOCK_TYPE_INTRA,
-   MACROBLOCK_TYPE_FWD_FRAME_PRED,
-   MACROBLOCK_TYPE_FWD_FIELD_PRED,
-   MACROBLOCK_TYPE_BKWD_FRAME_PRED,
-   MACROBLOCK_TYPE_BKWD_FIELD_PRED,
-   MACROBLOCK_TYPE_BI_FRAME_PRED,
-   MACROBLOCK_TYPE_BI_FIELD_PRED,
-
-   NUM_MACROBLOCK_TYPES
-};
-
-static void
-create_intra_vert_shader(struct vl_mpeg12_mc_renderer *r)
-{
-   const unsigned max_tokens = 50;
-
-   struct pipe_shader_state vs;
-   struct tgsi_token *tokens;
-   struct tgsi_header *header;
-
-   struct tgsi_full_declaration decl;
-   struct tgsi_full_instruction inst;
-
-   unsigned ti;
-
-   unsigned i;
-
-   assert(r);
-
-   tokens = (struct tgsi_token *) malloc(max_tokens * sizeof(struct tgsi_token));
-   header = (struct tgsi_header *) &tokens[0];
-   *header = tgsi_build_header();
-   *(struct tgsi_processor *) &tokens[1] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
-
-   ti = 2;
-
-   /*
-    * decl i0              ; Vertex pos
-    * decl i1              ; Luma texcoords
-    * decl i2              ; Chroma Cb texcoords
-    * decl i3              ; Chroma Cr texcoords
-    */
-   for (i = 0; i < 4; i++) {
-      decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-   }
-
-   /*
-    * decl o0              ; Vertex pos
-    * decl o1              ; Luma texcoords
-    * decl o2              ; Chroma Cb texcoords
-    * decl o3              ; Chroma Cr texcoords
-    */
-   for (i = 0; i < 4; i++) {
-      decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-   }
-
-   /*
-    * mov o0, i0           ; Move input vertex pos to output
-    * mov o1, i1           ; Move input luma texcoords to output
-    * mov o2, i2           ; Move input chroma Cb texcoords to output
-    * mov o3, i3           ; Move input chroma Cr texcoords to output
-    */
-   for (i = 0; i < 4; ++i) {
-      inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
-      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-   }
-
-   /* end */
-   inst = vl_end();
-   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-   assert(ti <= max_tokens);
-
-   vs.tokens = tokens;
-   r->i_vs = r->pipe->create_vs_state(r->pipe, &vs);
-   free(tokens);
-}
-
-static void
-create_intra_frag_shader(struct vl_mpeg12_mc_renderer *r)
-{
-   const unsigned max_tokens = 100;
-
-   struct pipe_shader_state fs;
-   struct tgsi_token *tokens;
-   struct tgsi_header *header;
-
-   struct tgsi_full_declaration decl;
-   struct tgsi_full_instruction inst;
-
-   unsigned ti;
-
-   unsigned i;
-
-   assert(r);
-
-   tokens = (struct tgsi_token *) malloc(max_tokens * sizeof(struct tgsi_token));
-   header = (struct tgsi_header *) &tokens[0];
-   *header = tgsi_build_header();
-   *(struct tgsi_processor *) &tokens[1] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
-
-   ti = 2;
-
-   /*
-    * decl i0                      ; Luma texcoords
-    * decl i1                      ; Chroma Cb texcoords
-    * decl i2                      ; Chroma Cr texcoords
-    */
-   for (i = 0; i < 3; ++i) {
-      decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
-      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-   }
-
-   /* decl c0                      ; Scaling factor, rescales 16-bit snorm to 9-bit snorm */
-   decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0);
-   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-   /* decl o0                      ; Fragment color */
-   decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
-   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-   /* decl t0, t1 */
-   decl = vl_decl_temps(0, 1);
-   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-   /*
-    * decl s0                      ; Sampler for luma texture
-    * decl s1                      ; Sampler for chroma Cb texture
-    * decl s2                      ; Sampler for chroma Cr texture
-    */
-   for (i = 0; i < 3; ++i) {
-      decl = vl_decl_samplers(i, i);
-      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-   }
-
-   /*
-    * tex2d t1, i0, s0             ; Read texel from luma texture
-    * mov t0.x, t1.x               ; Move luma sample into .x component
-    * tex2d t1, i1, s1             ; Read texel from chroma Cb texture
-    * mov t0.y, t1.x               ; Move Cb sample into .y component
-    * tex2d t1, i2, s2             ; Read texel from chroma Cr texture
-    * mov t0.z, t1.x               ; Move Cr sample into .z component
-    */
-   for (i = 0; i < 3; ++i) {
-      inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i, TGSI_FILE_SAMPLER, i);
-      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-      inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
-      inst.Src[0].Register.SwizzleX = TGSI_SWIZZLE_X;
-      inst.Src[0].Register.SwizzleY = TGSI_SWIZZLE_X;
-      inst.Src[0].Register.SwizzleZ = TGSI_SWIZZLE_X;
-      inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_X << i;
-      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-   }
-
-   /* mul o0, t0, c0               ; Rescale texel to correct range */
-   inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
-   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-   /* end */
-   inst = vl_end();
-   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-   assert(ti <= max_tokens);
-
-   fs.tokens = tokens;
-   r->i_fs = r->pipe->create_fs_state(r->pipe, &fs);
-   free(tokens);
-}
-
-static void
-create_frame_pred_vert_shader(struct vl_mpeg12_mc_renderer *r)
-{
-   const unsigned max_tokens = 100;
-
-   struct pipe_shader_state vs;
-   struct tgsi_token *tokens;
-   struct tgsi_header *header;
-
-   struct tgsi_full_declaration decl;
-   struct tgsi_full_instruction inst;
-
-   unsigned ti;
-
-   unsigned i;
-
-   assert(r);
-
-   tokens = (struct tgsi_token *) malloc(max_tokens * sizeof(struct tgsi_token));
-   header = (struct tgsi_header *) &tokens[0];
-   *header = tgsi_build_header();
-   *(struct tgsi_processor *) &tokens[1] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
-
-   ti = 2;
-
-   /*
-    * decl i0              ; Vertex pos
-    * decl i1              ; Luma texcoords
-    * decl i2              ; Chroma Cb texcoords
-    * decl i3              ; Chroma Cr texcoords
-    * decl i4              ; Ref surface top field texcoords
-    * decl i5              ; Ref surface bottom field texcoords (unused, packed in the same stream)
-    */
-   for (i = 0; i < 6; i++) {
-      decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-   }
-
-   /*
-    * decl o0              ; Vertex pos
-    * decl o1              ; Luma texcoords
-    * decl o2              ; Chroma Cb texcoords
-    * decl o3              ; Chroma Cr texcoords
-    * decl o4              ; Ref macroblock texcoords
-    */
-   for (i = 0; i < 5; i++) {
-      decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-   }
-
-   /*
-    * mov o0, i0           ; Move input vertex pos to output
-    * mov o1, i1           ; Move input luma texcoords to output
-    * mov o2, i2           ; Move input chroma Cb texcoords to output
-    * mov o3, i3           ; Move input chroma Cr texcoords to output
-    */
-   for (i = 0; i < 4; ++i) {
-        inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
-        ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-   }
-
-   /* add o4, i0, i4       ; Translate vertex pos by motion vec to form ref macroblock texcoords */
-   inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 4, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, 4);
-   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-   /* end */
-   inst = vl_end();
-   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-   assert(ti <= max_tokens);
-
-   vs.tokens = tokens;
-   r->p_vs[0] = r->pipe->create_vs_state(r->pipe, &vs);
-   free(tokens);
-}
-
-#if 0
-static void
-create_field_pred_vert_shader(struct vl_mpeg12_mc_renderer *r)
-{
-   assert(false);
-}
-#endif
-
-static void
-create_frame_pred_frag_shader(struct vl_mpeg12_mc_renderer *r)
-{
-   const unsigned max_tokens = 100;
-
-   struct pipe_shader_state fs;
-   struct tgsi_token *tokens;
-   struct tgsi_header *header;
-
-   struct tgsi_full_declaration decl;
-   struct tgsi_full_instruction inst;
-
-   unsigned ti;
-
-   unsigned i;
-
-   assert(r);
-
-   tokens = (struct tgsi_token *) malloc(max_tokens * sizeof(struct tgsi_token));
-   header = (struct tgsi_header *) &tokens[0];
-   *header = tgsi_build_header();
-   *(struct tgsi_processor *) &tokens[1] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
-
-   ti = 2;
-
-   /*
-    * decl i0                      ; Luma texcoords
-    * decl i1                      ; Chroma Cb texcoords
-    * decl i2                      ; Chroma Cr texcoords
-    * decl i3                      ; Ref macroblock texcoords
-    */
-   for (i = 0; i < 4; ++i) {
-      decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
-      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-   }
-
-   /* decl c0                      ; Scaling factor, rescales 16-bit snorm to 9-bit snorm */
-   decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0);
-   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-   /* decl o0                      ; Fragment color */
-   decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
-   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-   /* decl t0, t1 */
-   decl = vl_decl_temps(0, 1);
-   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-   /*
-    * decl s0                      ; Sampler for luma texture
-    * decl s1                      ; Sampler for chroma Cb texture
-    * decl s2                      ; Sampler for chroma Cr texture
-    * decl s3                      ; Sampler for ref surface texture
-    */
-   for (i = 0; i < 4; ++i) {
-      decl = vl_decl_samplers(i, i);
-      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-   }
-
-   /*
-    * tex2d t1, i0, s0             ; Read texel from luma texture
-    * mov t0.x, t1.x               ; Move luma sample into .x component
-    * tex2d t1, i1, s1             ; Read texel from chroma Cb texture
-    * mov t0.y, t1.x               ; Move Cb sample into .y component
-    * tex2d t1, i2, s2             ; Read texel from chroma Cr texture
-    * mov t0.z, t1.x               ; Move Cr sample into .z component
-    */
-   for (i = 0; i < 3; ++i) {
-      inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i, TGSI_FILE_SAMPLER, i);
-      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-      inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
-      inst.Src[0].Register.SwizzleX = TGSI_SWIZZLE_X;
-      inst.Src[0].Register.SwizzleY = TGSI_SWIZZLE_X;
-      inst.Src[0].Register.SwizzleZ = TGSI_SWIZZLE_X;
-      inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_X << i;
-      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-   }
-
-   /* mul t0, t0, c0               ; Rescale texel to correct range */
-   inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
-   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-   /* tex2d t1, i3, s3             ; Read texel from ref macroblock */
-   inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, 3, TGSI_FILE_SAMPLER, 3);
-   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-   /* add o0, t0, t1               ; Add ref and differential to form final output */
-   inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
-   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-   /* end */
-   inst = vl_end();
-   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-   assert(ti <= max_tokens);
-
-   fs.tokens = tokens;
-   r->p_fs[0] = r->pipe->create_fs_state(r->pipe, &fs);
-   free(tokens);
-}
-
-#if 0
-static void
-create_field_pred_frag_shader(struct vl_mpeg12_mc_renderer *r)
-{
-   assert(false);
-}
-#endif
-
-static void
-create_frame_bi_pred_vert_shader(struct vl_mpeg12_mc_renderer *r)
-{
-   const unsigned max_tokens = 100;
-
-   struct pipe_shader_state vs;
-   struct tgsi_token *tokens;
-   struct tgsi_header *header;
-
-   struct tgsi_full_declaration decl;
-   struct tgsi_full_instruction inst;
-
-   unsigned ti;
-
-   unsigned i;
-
-   assert(r);
-
-   tokens = (struct tgsi_token *) malloc(max_tokens * sizeof(struct tgsi_token));
-   header = (struct tgsi_header *) &tokens[0];
-   *header = tgsi_build_header();
-   *(struct tgsi_processor *) &tokens[1] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
-
-   ti = 2;
-
-   /*
-    * decl i0              ; Vertex pos
-    * decl i1              ; Luma texcoords
-    * decl i2              ; Chroma Cb texcoords
-    * decl i3              ; Chroma Cr texcoords
-    * decl i4              ; First ref macroblock top field texcoords
-    * decl i5              ; First ref macroblock bottom field texcoords (unused, packed in the same stream)
-    * decl i6              ; Second ref macroblock top field texcoords
-    * decl i7              ; Second ref macroblock bottom field texcoords (unused, packed in the same stream)
-    */
-   for (i = 0; i < 8; i++) {
-      decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-   }
-
-   /*
-    * decl o0              ; Vertex pos
-    * decl o1              ; Luma texcoords
-    * decl o2              ; Chroma Cb texcoords
-    * decl o3              ; Chroma Cr texcoords
-    * decl o4              ; First ref macroblock texcoords
-    * decl o5              ; Second ref macroblock texcoords
-    */
-   for (i = 0; i < 6; i++) {
-      decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-   }
-
-   /*
-    * mov o0, i0           ; Move input vertex pos to output
-    * mov o1, i1           ; Move input luma texcoords to output
-    * mov o2, i2           ; Move input chroma Cb texcoords to output
-    * mov o3, i3           ; Move input chroma Cr texcoords to output
-    */
-   for (i = 0; i < 4; ++i) {
-      inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
-      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-   }
-
-   /*
-    * add o4, i0, i4       ; Translate vertex pos by motion vec to form first ref macroblock texcoords
-    * add o5, i0, i6       ; Translate vertex pos by motion vec to form second ref macroblock texcoords
-    */
-   for (i = 0; i < 2; ++i) {
-      inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 4, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, (i + 2) * 2);
-      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-   }
-
-   /* end */
-   inst = vl_end();
-   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-   assert(ti <= max_tokens);
-
-   vs.tokens = tokens;
-   r->b_vs[0] = r->pipe->create_vs_state(r->pipe, &vs);
-   free(tokens);
-}
-
-#if 0
-static void
-create_field_bi_pred_vert_shader(struct vl_mpeg12_mc_renderer *r)
-{
-   assert(false);
-}
-#endif
-
-static void
-create_frame_bi_pred_frag_shader(struct vl_mpeg12_mc_renderer *r)
-{
-   const unsigned max_tokens = 100;
-
-   struct pipe_shader_state fs;
-   struct tgsi_token *tokens;
-   struct tgsi_header *header;
-
-   struct tgsi_full_declaration decl;
-   struct tgsi_full_instruction inst;
-
-   unsigned ti;
-
-   unsigned i;
-
-   assert(r);
-
-   tokens = (struct tgsi_token *) malloc(max_tokens * sizeof(struct tgsi_token));
-   header = (struct tgsi_header *) &tokens[0];
-   *header = tgsi_build_header();
-   *(struct tgsi_processor *) &tokens[1] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
-
-   ti = 2;
-
-   /*
-    * decl i0                      ; Luma texcoords
-    * decl i1                      ; Chroma Cb texcoords
-    * decl i2                      ; Chroma Cr texcoords
-    * decl i3                      ; First ref macroblock texcoords
-    * decl i4                      ; Second ref macroblock texcoords
-    */
-   for (i = 0; i < 5; ++i) {
-      decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
-      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-   }
-
-   /*
-    * decl c0                      ; Scaling factor, rescales 16-bit snorm to 9-bit snorm
-    * decl c1                      ; Constant 1/2 in .x channel to use as weight to blend past and future texels
-    */
-   decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
-   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-   /* decl o0                      ; Fragment color */
-   decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
-   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-   /* decl t0-t2 */
-   decl = vl_decl_temps(0, 2);
-   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-   /*
-    * decl s0                      ; Sampler for luma texture
-    * decl s1                      ; Sampler for chroma Cb texture
-    * decl s2                      ; Sampler for chroma Cr texture
-    * decl s3                      ; Sampler for first ref surface texture
-    * decl s4                      ; Sampler for second ref surface texture
-    */
-   for (i = 0; i < 5; ++i) {
-      decl = vl_decl_samplers(i, i);
-      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-   }
-
-   /*
-    * tex2d t1, i0, s0             ; Read texel from luma texture
-    * mov t0.x, t1.x               ; Move luma sample into .x component
-    * tex2d t1, i1, s1             ; Read texel from chroma Cb texture
-    * mov t0.y, t1.x               ; Move Cb sample into .y component
-    * tex2d t1, i2, s2             ; Read texel from chroma Cr texture
-    * mov t0.z, t1.x               ; Move Cr sample into .z component
-    */
-   for (i = 0; i < 3; ++i) {
-      inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i, TGSI_FILE_SAMPLER, i);
-      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-      inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
-      inst.Src[0].Register.SwizzleX = TGSI_SWIZZLE_X;
-      inst.Src[0].Register.SwizzleY = TGSI_SWIZZLE_X;
-      inst.Src[0].Register.SwizzleZ = TGSI_SWIZZLE_X;
-      inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_X << i;
-      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-   }
-
-   /* mul t0, t0, c0               ; Rescale texel to correct range */
-   inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
-   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-   /*
-    * tex2d t1, i3, s3             ; Read texel from first ref macroblock
-    * tex2d t2, i4, s4             ; Read texel from second ref macroblock
-    */
-   for (i = 0; i < 2; ++i) {
-      inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 3, TGSI_FILE_SAMPLER, i + 3);
-      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-   }
-
-   /* lerp t1, c1.x, t1, t2        ; Blend past and future texels */
-   inst = vl_inst4(TGSI_OPCODE_LRP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 1, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
-   inst.Src[0].Register.SwizzleX = TGSI_SWIZZLE_X;
-   inst.Src[0].Register.SwizzleY = TGSI_SWIZZLE_X;
-   inst.Src[0].Register.SwizzleZ = TGSI_SWIZZLE_X;
-   inst.Src[0].Register.SwizzleW = TGSI_SWIZZLE_X;
-   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-   /* add o0, t0, t1               ; Add past/future ref and differential to form final output */
-   inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
-   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-   /* end */
-   inst = vl_end();
-   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-   assert(ti <= max_tokens);
-
-   fs.tokens = tokens;
-   r->b_fs[0] = r->pipe->create_fs_state(r->pipe, &fs);
-   free(tokens);
-}
-
-#if 0
-static void
-create_field_bi_pred_frag_shader(struct vl_mpeg12_mc_renderer *r)
-{
-   assert(false);
-}
-#endif
-
-static void
-xfer_buffers_map(struct vl_mpeg12_mc_renderer *r)
-{
-   unsigned i;
-
-   assert(r);
-
-   for (i = 0; i < 3; ++i) {
-      r->tex_transfer[i] = r->pipe->screen->get_tex_transfer
-      (
-         r->pipe->screen, r->textures.all[i],
-         0, 0, 0, PIPE_TRANSFER_WRITE, 0, 0,
-         r->textures.all[i]->width0, r->textures.all[i]->height0
-      );
-
-      r->texels[i] = r->pipe->screen->transfer_map(r->pipe->screen, r->tex_transfer[i]);
-   }
-}
-
-static void
-xfer_buffers_unmap(struct vl_mpeg12_mc_renderer *r)
-{
-   unsigned i;
-
-   assert(r);
-
-   for (i = 0; i < 3; ++i) {
-      r->pipe->screen->transfer_unmap(r->pipe->screen, r->tex_transfer[i]);
-      r->pipe->screen->tex_transfer_destroy(r->tex_transfer[i]);
-   }
-}
-
-static bool
-init_pipe_state(struct vl_mpeg12_mc_renderer *r)
-{
-   struct pipe_sampler_state sampler;
-   unsigned filters[5];
-   unsigned i;
-
-   assert(r);
-
-   r->viewport.scale[0] = r->pot_buffers ?
-      util_next_power_of_two(r->picture_width) : r->picture_width;
-   r->viewport.scale[1] = r->pot_buffers ?
-      util_next_power_of_two(r->picture_height) : r->picture_height;
-   r->viewport.scale[2] = 1;
-   r->viewport.scale[3] = 1;
-   r->viewport.translate[0] = 0;
-   r->viewport.translate[1] = 0;
-   r->viewport.translate[2] = 0;
-   r->viewport.translate[3] = 0;
-
-   r->scissor.maxx = r->pot_buffers ?
-      util_next_power_of_two(r->picture_width) : r->picture_width;
-   r->scissor.maxy = r->pot_buffers ?
-      util_next_power_of_two(r->picture_height) : r->picture_height;
-
-   r->fb_state.width = r->pot_buffers ?
-      util_next_power_of_two(r->picture_width) : r->picture_width;
-   r->fb_state.height = r->pot_buffers ?
-      util_next_power_of_two(r->picture_height) : r->picture_height;
-   r->fb_state.nr_cbufs = 1;
-   r->fb_state.zsbuf = NULL;
-
-   /* Luma filter */
-   filters[0] = PIPE_TEX_FILTER_NEAREST;
-   /* Chroma filters */
-   if (r->chroma_format == PIPE_VIDEO_CHROMA_FORMAT_444 ||
-       r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ONE) {
-      filters[1] = PIPE_TEX_FILTER_NEAREST;
-      filters[2] = PIPE_TEX_FILTER_NEAREST;
-   }
-   else {
-      filters[1] = PIPE_TEX_FILTER_LINEAR;
-      filters[2] = PIPE_TEX_FILTER_LINEAR;
-   }
-   /* Fwd, bkwd ref filters */
-   filters[3] = PIPE_TEX_FILTER_LINEAR;
-   filters[4] = PIPE_TEX_FILTER_LINEAR;
-
-   for (i = 0; i < 5; ++i) {
-      sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-      sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-      sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-      sampler.min_img_filter = filters[i];
-      sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
-      sampler.mag_img_filter = filters[i];
-      sampler.compare_mode = PIPE_TEX_COMPARE_NONE;
-      sampler.compare_func = PIPE_FUNC_ALWAYS;
-      sampler.normalized_coords = 1;
-      /*sampler.shadow_ambient = ; */
-      /*sampler.lod_bias = ; */
-      sampler.min_lod = 0;
-      /*sampler.max_lod = ; */
-      /*sampler.border_color[i] = ; */
-      /*sampler.max_anisotropy = ; */
-      r->samplers.all[i] = r->pipe->create_sampler_state(r->pipe, &sampler);
-   }
-
-   return true;
-}
-
-static void
-cleanup_pipe_state(struct vl_mpeg12_mc_renderer *r)
-{
-   unsigned i;
-
-   assert(r);
-
-   for (i = 0; i < 5; ++i)
-      r->pipe->delete_sampler_state(r->pipe, r->samplers.all[i]);
-}
-
-static bool
-init_shaders(struct vl_mpeg12_mc_renderer *r)
-{
-   assert(r);
-
-   create_intra_vert_shader(r);
-   create_intra_frag_shader(r);
-   create_frame_pred_vert_shader(r);
-   create_frame_pred_frag_shader(r);
-   create_frame_bi_pred_vert_shader(r);
-   create_frame_bi_pred_frag_shader(r);
-
-   return true;
-}
-
-static void
-cleanup_shaders(struct vl_mpeg12_mc_renderer *r)
-{
-   assert(r);
-
-   r->pipe->delete_vs_state(r->pipe, r->i_vs);
-   r->pipe->delete_fs_state(r->pipe, r->i_fs);
-   r->pipe->delete_vs_state(r->pipe, r->p_vs[0]);
-   r->pipe->delete_fs_state(r->pipe, r->p_fs[0]);
-   r->pipe->delete_vs_state(r->pipe, r->b_vs[0]);
-   r->pipe->delete_fs_state(r->pipe, r->b_fs[0]);
-}
-
-static bool
-init_buffers(struct vl_mpeg12_mc_renderer *r)
-{
-   struct pipe_texture template;
-
-   const unsigned mbw =
-      align(r->picture_width, MACROBLOCK_WIDTH) / MACROBLOCK_WIDTH;
-   const unsigned mbh =
-      align(r->picture_height, MACROBLOCK_HEIGHT) / MACROBLOCK_HEIGHT;
-
-   unsigned i;
-
-   assert(r);
-
-   r->macroblocks_per_batch =
-      mbw * (r->bufmode == VL_MPEG12_MC_RENDERER_BUFFER_PICTURE ? mbh : 1);
-   r->num_macroblocks = 0;
-   r->macroblock_buf = MALLOC(r->macroblocks_per_batch * sizeof(struct pipe_mpeg12_macroblock));
-
-   memset(&template, 0, sizeof(struct pipe_texture));
-   template.target = PIPE_TEXTURE_2D;
-   /* TODO: Accomodate HW that can't do this and also for cases when this isn't precise enough */
-   template.format = PIPE_FORMAT_R16_SNORM;
-   template.last_level = 0;
-   template.width0 = r->pot_buffers ?
-      util_next_power_of_two(r->picture_width) : r->picture_width;
-   template.height0 = r->pot_buffers ?
-      util_next_power_of_two(r->picture_height) : r->picture_height;
-   template.depth0 = 1;
-   template.tex_usage = PIPE_TEXTURE_USAGE_SAMPLER | PIPE_TEXTURE_USAGE_DYNAMIC;
-
-   r->textures.individual.y = r->pipe->screen->texture_create(r->pipe->screen, &template);
-
-   if (r->chroma_format == PIPE_VIDEO_CHROMA_FORMAT_420) {
-      template.width0 = r->pot_buffers ?
-         util_next_power_of_two(r->picture_width / 2) :
-         r->picture_width / 2;
-      template.height0 = r->pot_buffers ?
-         util_next_power_of_two(r->picture_height / 2) :
-         r->picture_height / 2;
-   }
-   else if (r->chroma_format == PIPE_VIDEO_CHROMA_FORMAT_422)
-      template.height0 = r->pot_buffers ?
-         util_next_power_of_two(r->picture_height / 2) :
-         r->picture_height / 2;
-
-   r->textures.individual.cb =
-      r->pipe->screen->texture_create(r->pipe->screen, &template);
-   r->textures.individual.cr =
-      r->pipe->screen->texture_create(r->pipe->screen, &template);
-
-   r->vertex_bufs.individual.ycbcr.stride = sizeof(struct vertex2f) * 4;
-   r->vertex_bufs.individual.ycbcr.max_index = 24 * r->macroblocks_per_batch - 1;
-   r->vertex_bufs.individual.ycbcr.buffer_offset = 0;
-   r->vertex_bufs.individual.ycbcr.buffer = pipe_buffer_create
-   (
-      r->pipe->screen,
-      DEFAULT_BUF_ALIGNMENT,
-      PIPE_BUFFER_USAGE_VERTEX | PIPE_BUFFER_USAGE_DISCARD,
-      sizeof(struct vertex2f) * 4 * 24 * r->macroblocks_per_batch
-   );
-
-   for (i = 1; i < 3; ++i) {
-      r->vertex_bufs.all[i].stride = sizeof(struct vertex2f) * 2;
-      r->vertex_bufs.all[i].max_index = 24 * r->macroblocks_per_batch - 1;
-      r->vertex_bufs.all[i].buffer_offset = 0;
-      r->vertex_bufs.all[i].buffer = pipe_buffer_create
-      (
-         r->pipe->screen,
-         DEFAULT_BUF_ALIGNMENT,
-         PIPE_BUFFER_USAGE_VERTEX | PIPE_BUFFER_USAGE_DISCARD,
-         sizeof(struct vertex2f) * 2 * 24 * r->macroblocks_per_batch
-      );
-   }
-
-   /* Position element */
-   r->vertex_elems[0].src_offset = 0;
-   r->vertex_elems[0].instance_divisor = 0;
-   r->vertex_elems[0].vertex_buffer_index = 0;
-   r->vertex_elems[0].nr_components = 2;
-   r->vertex_elems[0].src_format = PIPE_FORMAT_R32G32_FLOAT;
-
-   /* Luma, texcoord element */
-   r->vertex_elems[1].src_offset = sizeof(struct vertex2f);
-   r->vertex_elems[1].instance_divisor = 0;
-   r->vertex_elems[1].vertex_buffer_index = 0;
-   r->vertex_elems[1].nr_components = 2;
-   r->vertex_elems[1].src_format = PIPE_FORMAT_R32G32_FLOAT;
-
-   /* Chroma Cr texcoord element */
-   r->vertex_elems[2].src_offset = sizeof(struct vertex2f) * 2;
-   r->vertex_elems[2].instance_divisor = 0;
-   r->vertex_elems[2].vertex_buffer_index = 0;
-   r->vertex_elems[2].nr_components = 2;
-   r->vertex_elems[2].src_format = PIPE_FORMAT_R32G32_FLOAT;
-
-   /* Chroma Cb texcoord element */
-   r->vertex_elems[3].src_offset = sizeof(struct vertex2f) * 3;
-   r->vertex_elems[3].instance_divisor = 0;
-   r->vertex_elems[3].vertex_buffer_index = 0;
-   r->vertex_elems[3].nr_components = 2;
-   r->vertex_elems[3].src_format = PIPE_FORMAT_R32G32_FLOAT;
-
-   /* First ref surface top field texcoord element */
-   r->vertex_elems[4].src_offset = 0;
-   r->vertex_elems[4].instance_divisor = 0;
-   r->vertex_elems[4].vertex_buffer_index = 1;
-   r->vertex_elems[4].nr_components = 2;
-   r->vertex_elems[4].src_format = PIPE_FORMAT_R32G32_FLOAT;
-
-   /* First ref surface bottom field texcoord element */
-   r->vertex_elems[5].src_offset = sizeof(struct vertex2f);
-   r->vertex_elems[5].instance_divisor = 0;
-   r->vertex_elems[5].vertex_buffer_index = 1;
-   r->vertex_elems[5].nr_components = 2;
-   r->vertex_elems[5].src_format = PIPE_FORMAT_R32G32_FLOAT;
-
-   /* Second ref surface top field texcoord element */
-   r->vertex_elems[6].src_offset = 0;
-   r->vertex_elems[6].instance_divisor = 0;
-   r->vertex_elems[6].vertex_buffer_index = 2;
-   r->vertex_elems[6].nr_components = 2;
-   r->vertex_elems[6].src_format = PIPE_FORMAT_R32G32_FLOAT;
-
-   /* Second ref surface bottom field texcoord element */
-   r->vertex_elems[7].src_offset = sizeof(struct vertex2f);
-   r->vertex_elems[7].instance_divisor = 0;
-   r->vertex_elems[7].vertex_buffer_index = 2;
-   r->vertex_elems[7].nr_components = 2;
-   r->vertex_elems[7].src_format = PIPE_FORMAT_R32G32_FLOAT;
-
-   r->vs_const_buf = pipe_buffer_create
-   (
-      r->pipe->screen,
-      DEFAULT_BUF_ALIGNMENT,
-      PIPE_BUFFER_USAGE_CONSTANT | PIPE_BUFFER_USAGE_DISCARD,
-      sizeof(struct vertex_shader_consts)
-   );
-
-   r->fs_const_buf = pipe_buffer_create
-   (
-      r->pipe->screen,
-      DEFAULT_BUF_ALIGNMENT,
-      PIPE_BUFFER_USAGE_CONSTANT, sizeof(struct fragment_shader_consts)
-   );
-
-   memcpy
-   (
-      pipe_buffer_map(r->pipe->screen, r->fs_const_buf, PIPE_BUFFER_USAGE_CPU_WRITE),
-      &fs_consts, sizeof(struct fragment_shader_consts)
-   );
-
-   pipe_buffer_unmap(r->pipe->screen, r->fs_const_buf);
-
-   return true;
-}
-
-static void
-cleanup_buffers(struct vl_mpeg12_mc_renderer *r)
-{
-   unsigned i;
-
-   assert(r);
-
-   pipe_buffer_reference(&r->vs_const_buf, NULL);
-   pipe_buffer_reference(&r->fs_const_buf, NULL);
-
-   for (i = 0; i < 3; ++i)
-      pipe_buffer_reference(&r->vertex_bufs.all[i].buffer, NULL);
-
-   for (i = 0; i < 3; ++i)
-      pipe_texture_reference(&r->textures.all[i], NULL);
-
-   FREE(r->macroblock_buf);
-}
-
-static enum MACROBLOCK_TYPE
-get_macroblock_type(struct pipe_mpeg12_macroblock *mb)
-{
-   assert(mb);
-
-   switch (mb->mb_type) {
-      case PIPE_MPEG12_MACROBLOCK_TYPE_INTRA:
-         return MACROBLOCK_TYPE_INTRA;
-      case PIPE_MPEG12_MACROBLOCK_TYPE_FWD:
-         return mb->mo_type == PIPE_MPEG12_MOTION_TYPE_FRAME ?
-            MACROBLOCK_TYPE_FWD_FRAME_PRED : MACROBLOCK_TYPE_FWD_FIELD_PRED;
-      case PIPE_MPEG12_MACROBLOCK_TYPE_BKWD:
-         return mb->mo_type == PIPE_MPEG12_MOTION_TYPE_FRAME ?
-            MACROBLOCK_TYPE_BKWD_FRAME_PRED : MACROBLOCK_TYPE_BKWD_FIELD_PRED;
-      case PIPE_MPEG12_MACROBLOCK_TYPE_BI:
-         return mb->mo_type == PIPE_MPEG12_MOTION_TYPE_FRAME ?
-            MACROBLOCK_TYPE_BI_FRAME_PRED : MACROBLOCK_TYPE_BI_FIELD_PRED;
-      default:
-         assert(0);
-   }
-
-   /* Unreachable */
-   return -1;
-}
-
-/* XXX: One of these days this will have to be killed with fire */
-#define SET_BLOCK(vb, cbp, mbx, mby, unitx, unity, ofsx, ofsy, hx, hy, lm, cbm, crm, use_zb, zb)				\
-	do {															\
-	(vb)[0].pos.x = (mbx) * (unitx) + (ofsx);		(vb)[0].pos.y = (mby) * (unity) + (ofsy);			\
-	(vb)[1].pos.x = (mbx) * (unitx) + (ofsx);		(vb)[1].pos.y = (mby) * (unity) + (ofsy) + (hy);		\
-	(vb)[2].pos.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[2].pos.y = (mby) * (unity) + (ofsy);			\
-	(vb)[3].pos.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[3].pos.y = (mby) * (unity) + (ofsy);			\
-	(vb)[4].pos.x = (mbx) * (unitx) + (ofsx);		(vb)[4].pos.y = (mby) * (unity) + (ofsy) + (hy);		\
-	(vb)[5].pos.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[5].pos.y = (mby) * (unity) + (ofsy) + (hy);		\
-																\
-	if (!use_zb || (cbp) & (lm))												\
-	{															\
-		(vb)[0].luma_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[0].luma_tc.y = (mby) * (unity) + (ofsy);		\
-		(vb)[1].luma_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[1].luma_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
-		(vb)[2].luma_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[2].luma_tc.y = (mby) * (unity) + (ofsy);		\
-		(vb)[3].luma_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[3].luma_tc.y = (mby) * (unity) + (ofsy);		\
-		(vb)[4].luma_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[4].luma_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
-		(vb)[5].luma_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[5].luma_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
-	}															\
-	else															\
-	{															\
-		(vb)[0].luma_tc.x = (zb)[0].x;		(vb)[0].luma_tc.y = (zb)[0].y;						\
-		(vb)[1].luma_tc.x = (zb)[0].x;		(vb)[1].luma_tc.y = (zb)[0].y + (hy);					\
-		(vb)[2].luma_tc.x = (zb)[0].x + (hx);	(vb)[2].luma_tc.y = (zb)[0].y;						\
-		(vb)[3].luma_tc.x = (zb)[0].x + (hx);	(vb)[3].luma_tc.y = (zb)[0].y;						\
-		(vb)[4].luma_tc.x = (zb)[0].x;		(vb)[4].luma_tc.y = (zb)[0].y + (hy);					\
-		(vb)[5].luma_tc.x = (zb)[0].x + (hx);	(vb)[5].luma_tc.y = (zb)[0].y + (hy);					\
-	}															\
-																\
-	if (!use_zb || (cbp) & (cbm))												\
-	{															\
-		(vb)[0].cb_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[0].cb_tc.y = (mby) * (unity) + (ofsy);		\
-		(vb)[1].cb_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[1].cb_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
-		(vb)[2].cb_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[2].cb_tc.y = (mby) * (unity) + (ofsy);		\
-		(vb)[3].cb_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[3].cb_tc.y = (mby) * (unity) + (ofsy);		\
-		(vb)[4].cb_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[4].cb_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
-		(vb)[5].cb_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[5].cb_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
-	}															\
-	else															\
-	{															\
-		(vb)[0].cb_tc.x = (zb)[1].x;		(vb)[0].cb_tc.y = (zb)[1].y;						\
-		(vb)[1].cb_tc.x = (zb)[1].x;		(vb)[1].cb_tc.y = (zb)[1].y + (hy);					\
-		(vb)[2].cb_tc.x = (zb)[1].x + (hx);	(vb)[2].cb_tc.y = (zb)[1].y;						\
-		(vb)[3].cb_tc.x = (zb)[1].x + (hx);	(vb)[3].cb_tc.y = (zb)[1].y;						\
-		(vb)[4].cb_tc.x = (zb)[1].x;		(vb)[4].cb_tc.y = (zb)[1].y + (hy);					\
-		(vb)[5].cb_tc.x = (zb)[1].x + (hx);	(vb)[5].cb_tc.y = (zb)[1].y + (hy);					\
-	}															\
-																\
-	if (!use_zb || (cbp) & (crm))												\
-	{															\
-		(vb)[0].cr_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[0].cr_tc.y = (mby) * (unity) + (ofsy);		\
-		(vb)[1].cr_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[1].cr_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
-		(vb)[2].cr_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[2].cr_tc.y = (mby) * (unity) + (ofsy);		\
-		(vb)[3].cr_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[3].cr_tc.y = (mby) * (unity) + (ofsy);		\
-		(vb)[4].cr_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[4].cr_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
-		(vb)[5].cr_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[5].cr_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
-	}															\
-	else															\
-	{															\
-		(vb)[0].cr_tc.x = (zb)[2].x;		(vb)[0].cr_tc.y = (zb)[2].y;						\
-		(vb)[1].cr_tc.x = (zb)[2].x;		(vb)[1].cr_tc.y = (zb)[2].y + (hy);					\
-		(vb)[2].cr_tc.x = (zb)[2].x + (hx);	(vb)[2].cr_tc.y = (zb)[2].y;						\
-		(vb)[3].cr_tc.x = (zb)[2].x + (hx);	(vb)[3].cr_tc.y = (zb)[2].y;						\
-		(vb)[4].cr_tc.x = (zb)[2].x;		(vb)[4].cr_tc.y = (zb)[2].y + (hy);					\
-		(vb)[5].cr_tc.x = (zb)[2].x + (hx);	(vb)[5].cr_tc.y = (zb)[2].y + (hy);					\
-	}															\
-	} while (0)
-
-static void
-gen_macroblock_verts(struct vl_mpeg12_mc_renderer *r,
-                     struct pipe_mpeg12_macroblock *mb, unsigned pos,
-                     struct vert_stream_0 *ycbcr_vb, struct vertex2f **ref_vb)
-{
-   struct vertex2f mo_vec[2];
-
-   unsigned i;
-
-   assert(r);
-   assert(mb);
-   assert(ycbcr_vb);
-   assert(pos < r->macroblocks_per_batch);
-
-   mo_vec[1].x = 0;
-   mo_vec[1].y = 0;
-
-   switch (mb->mb_type) {
-      case PIPE_MPEG12_MACROBLOCK_TYPE_BI:
-      {
-         struct vertex2f *vb;
-
-         assert(ref_vb && ref_vb[1]);
-
-         vb = ref_vb[1] + pos * 2 * 24;
-
-         mo_vec[0].x = mb->pmv[0][1][0] * 0.5f * r->surface_tex_inv_size.x;
-         mo_vec[0].y = mb->pmv[0][1][1] * 0.5f * r->surface_tex_inv_size.y;
-
-         if (mb->mo_type == PIPE_MPEG12_MOTION_TYPE_FRAME) {
-            for (i = 0; i < 24 * 2; i += 2) {
-               vb[i].x = mo_vec[0].x;
-               vb[i].y = mo_vec[0].y;
-            }
-         }
-         else {
-            mo_vec[1].x = mb->pmv[1][1][0] * 0.5f * r->surface_tex_inv_size.x;
-            mo_vec[1].y = mb->pmv[1][1][1] * 0.5f * r->surface_tex_inv_size.y;
-
-            for (i = 0; i < 24 * 2; i += 2) {
-               vb[i].x = mo_vec[0].x;
-               vb[i].y = mo_vec[0].y;
-               vb[i + 1].x = mo_vec[1].x;
-               vb[i + 1].y = mo_vec[1].y;
-            }
-         }
-
-         /* fall-through */
-      }
-      case PIPE_MPEG12_MACROBLOCK_TYPE_FWD:
-      case PIPE_MPEG12_MACROBLOCK_TYPE_BKWD:
-      {
-         struct vertex2f *vb;
-
-         assert(ref_vb && ref_vb[0]);
-
-         vb = ref_vb[0] + pos * 2 * 24;
-
-         if (mb->mb_type == PIPE_MPEG12_MACROBLOCK_TYPE_BKWD) {
-             mo_vec[0].x = mb->pmv[0][1][0] * 0.5f * r->surface_tex_inv_size.x;
-             mo_vec[0].y = mb->pmv[0][1][1] * 0.5f * r->surface_tex_inv_size.y;
-
-             if (mb->mo_type == PIPE_MPEG12_MOTION_TYPE_FIELD) {
-                mo_vec[1].x = mb->pmv[1][1][0] * 0.5f * r->surface_tex_inv_size.x;
-                mo_vec[1].y = mb->pmv[1][1][1] * 0.5f * r->surface_tex_inv_size.y;
-             }
-         }
-         else {
-            mo_vec[0].x = mb->pmv[0][0][0] * 0.5f * r->surface_tex_inv_size.x;
-            mo_vec[0].y = mb->pmv[0][0][1] * 0.5f * r->surface_tex_inv_size.y;
-
-            if (mb->mo_type == PIPE_MPEG12_MOTION_TYPE_FIELD) {
-               mo_vec[1].x = mb->pmv[1][0][0] * 0.5f * r->surface_tex_inv_size.x;
-               mo_vec[1].y = mb->pmv[1][0][1] * 0.5f * r->surface_tex_inv_size.y;
-            }
-         }
-
-         if (mb->mb_type == PIPE_MPEG12_MOTION_TYPE_FRAME) {
-            for (i = 0; i < 24 * 2; i += 2) {
-               vb[i].x = mo_vec[0].x;
-               vb[i].y = mo_vec[0].y;
-            }
-         }
-         else {
-            for (i = 0; i < 24 * 2; i += 2) {
-               vb[i].x = mo_vec[0].x;
-               vb[i].y = mo_vec[0].y;
-               vb[i + 1].x = mo_vec[1].x;
-               vb[i + 1].y = mo_vec[1].y;
-            }
-         }
-
-         /* fall-through */
-      }
-      case PIPE_MPEG12_MACROBLOCK_TYPE_INTRA:
-      {
-         const struct vertex2f unit =
-         {
-            r->surface_tex_inv_size.x * MACROBLOCK_WIDTH,
-            r->surface_tex_inv_size.y * MACROBLOCK_HEIGHT
-         };
-         const struct vertex2f half =
-         {
-            r->surface_tex_inv_size.x * (MACROBLOCK_WIDTH / 2),
-            r->surface_tex_inv_size.y * (MACROBLOCK_HEIGHT / 2)
-         };
-         const bool use_zb = r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ONE;
-
-         struct vert_stream_0 *vb = ycbcr_vb + pos * 24;
-
-         SET_BLOCK(vb, mb->cbp, mb->mbx, mb->mby,
-                   unit.x, unit.y, 0, 0, half.x, half.y,
-                   32, 2, 1, use_zb, r->zero_block);
-
-         SET_BLOCK(vb + 6, mb->cbp, mb->mbx, mb->mby,
-                   unit.x, unit.y, half.x, 0, half.x, half.y,
-                   16, 2, 1, use_zb, r->zero_block);
-
-         SET_BLOCK(vb + 12, mb->cbp, mb->mbx, mb->mby,
-                   unit.x, unit.y, 0, half.y, half.x, half.y,
-                   8, 2, 1, use_zb, r->zero_block);
-
-         SET_BLOCK(vb + 18, mb->cbp, mb->mbx, mb->mby,
-                   unit.x, unit.y, half.x, half.y, half.x, half.y,
-                   4, 2, 1, use_zb, r->zero_block);
-
-         break;
-      }
-      default:
-         assert(0);
-   }
-}
-
-static void
-gen_macroblock_stream(struct vl_mpeg12_mc_renderer *r,
-                      unsigned *num_macroblocks)
-{
-   unsigned offset[NUM_MACROBLOCK_TYPES];
-   struct vert_stream_0 *ycbcr_vb;
-   struct vertex2f *ref_vb[2];
-   unsigned i;
-
-   assert(r);
-   assert(num_macroblocks);
-
-   for (i = 0; i < r->num_macroblocks; ++i) {
-      enum MACROBLOCK_TYPE mb_type = get_macroblock_type(&r->macroblock_buf[i]);
-      ++num_macroblocks[mb_type];
-   }
-
-   offset[0] = 0;
-
-   for (i = 1; i < NUM_MACROBLOCK_TYPES; ++i)
-      offset[i] = offset[i - 1] + num_macroblocks[i - 1];
-
-   ycbcr_vb = (struct vert_stream_0 *)pipe_buffer_map
-   (
-      r->pipe->screen,
-      r->vertex_bufs.individual.ycbcr.buffer,
-      PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_DISCARD
-   );
-
-   for (i = 0; i < 2; ++i)
-      ref_vb[i] = (struct vertex2f *)pipe_buffer_map
-      (
-         r->pipe->screen,
-         r->vertex_bufs.individual.ref[i].buffer,
-         PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_DISCARD
-      );
-
-   for (i = 0; i < r->num_macroblocks; ++i) {
-      enum MACROBLOCK_TYPE mb_type = get_macroblock_type(&r->macroblock_buf[i]);
-
-      gen_macroblock_verts(r, &r->macroblock_buf[i], offset[mb_type],
-                           ycbcr_vb, ref_vb);
-
-      ++offset[mb_type];
-   }
-
-   pipe_buffer_unmap(r->pipe->screen, r->vertex_bufs.individual.ycbcr.buffer);
-   for (i = 0; i < 2; ++i)
-      pipe_buffer_unmap(r->pipe->screen, r->vertex_bufs.individual.ref[i].buffer);
-}
-
-static void
-flush(struct vl_mpeg12_mc_renderer *r)
-{
-   unsigned num_macroblocks[NUM_MACROBLOCK_TYPES] = { 0 };
-   unsigned vb_start = 0;
-   struct vertex_shader_consts *vs_consts;
-   unsigned i;
-
-   assert(r);
-   assert(r->num_macroblocks == r->macroblocks_per_batch);
-
-   gen_macroblock_stream(r, num_macroblocks);
-
-   r->fb_state.cbufs[0] = r->pipe->screen->get_tex_surface
-   (
-      r->pipe->screen, r->surface,
-      0, 0, 0, PIPE_BUFFER_USAGE_GPU_WRITE
-   );
-
-   r->pipe->set_framebuffer_state(r->pipe, &r->fb_state);
-   r->pipe->set_viewport_state(r->pipe, &r->viewport);
-   r->pipe->set_scissor_state(r->pipe, &r->scissor);
-
-   vs_consts = pipe_buffer_map
-   (
-      r->pipe->screen, r->vs_const_buf,
-      PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_DISCARD
-   );
-
-   vs_consts->denorm.x = r->surface->width0;
-   vs_consts->denorm.y = r->surface->height0;
-
-   pipe_buffer_unmap(r->pipe->screen, r->vs_const_buf);
-
-   r->pipe->set_constant_buffer(r->pipe, PIPE_SHADER_VERTEX, 0,
-                                r->vs_const_buf);
-   r->pipe->set_constant_buffer(r->pipe, PIPE_SHADER_FRAGMENT, 0,
-                                r->fs_const_buf);
-
-   if (num_macroblocks[MACROBLOCK_TYPE_INTRA] > 0) {
-      r->pipe->set_vertex_buffers(r->pipe, 1, r->vertex_bufs.all);
-      r->pipe->set_vertex_elements(r->pipe, 4, r->vertex_elems);
-      r->pipe->set_fragment_sampler_textures(r->pipe, 3, r->textures.all);
-      r->pipe->bind_fragment_sampler_states(r->pipe, 3, r->samplers.all);
-      r->pipe->bind_vs_state(r->pipe, r->i_vs);
-      r->pipe->bind_fs_state(r->pipe, r->i_fs);
-
-      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
-                           num_macroblocks[MACROBLOCK_TYPE_INTRA] * 24);
-      vb_start += num_macroblocks[MACROBLOCK_TYPE_INTRA] * 24;
-   }
-
-   if (num_macroblocks[MACROBLOCK_TYPE_FWD_FRAME_PRED] > 0) {
-      r->pipe->set_vertex_buffers(r->pipe, 2, r->vertex_bufs.all);
-      r->pipe->set_vertex_elements(r->pipe, 6, r->vertex_elems);
-      r->textures.individual.ref[0] = r->past;
-      r->pipe->set_fragment_sampler_textures(r->pipe, 4, r->textures.all);
-      r->pipe->bind_fragment_sampler_states(r->pipe, 4, r->samplers.all);
-      r->pipe->bind_vs_state(r->pipe, r->p_vs[0]);
-      r->pipe->bind_fs_state(r->pipe, r->p_fs[0]);
-
-      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
-                           num_macroblocks[MACROBLOCK_TYPE_FWD_FRAME_PRED] * 24);
-      vb_start += num_macroblocks[MACROBLOCK_TYPE_FWD_FRAME_PRED] * 24;
-   }
-
-   if (false /*num_macroblocks[MACROBLOCK_TYPE_FWD_FIELD_PRED] > 0 */ ) {
-      r->pipe->set_vertex_buffers(r->pipe, 2, r->vertex_bufs.all);
-      r->pipe->set_vertex_elements(r->pipe, 6, r->vertex_elems);
-      r->textures.individual.ref[0] = r->past;
-      r->pipe->set_fragment_sampler_textures(r->pipe, 4, r->textures.all);
-      r->pipe->bind_fragment_sampler_states(r->pipe, 4, r->samplers.all);
-      r->pipe->bind_vs_state(r->pipe, r->p_vs[1]);
-      r->pipe->bind_fs_state(r->pipe, r->p_fs[1]);
-
-      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
-                           num_macroblocks[MACROBLOCK_TYPE_FWD_FIELD_PRED] * 24);
-      vb_start += num_macroblocks[MACROBLOCK_TYPE_FWD_FIELD_PRED] * 24;
-   }
-
-   if (num_macroblocks[MACROBLOCK_TYPE_BKWD_FRAME_PRED] > 0) {
-      r->pipe->set_vertex_buffers(r->pipe, 2, r->vertex_bufs.all);
-      r->pipe->set_vertex_elements(r->pipe, 6, r->vertex_elems);
-      r->textures.individual.ref[0] = r->future;
-      r->pipe->set_fragment_sampler_textures(r->pipe, 4, r->textures.all);
-      r->pipe->bind_fragment_sampler_states(r->pipe, 4, r->samplers.all);
-      r->pipe->bind_vs_state(r->pipe, r->p_vs[0]);
-      r->pipe->bind_fs_state(r->pipe, r->p_fs[0]);
-
-      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
-                           num_macroblocks[MACROBLOCK_TYPE_BKWD_FRAME_PRED] * 24);
-      vb_start += num_macroblocks[MACROBLOCK_TYPE_BKWD_FRAME_PRED] * 24;
-   }
-
-   if (false /*num_macroblocks[MACROBLOCK_TYPE_BKWD_FIELD_PRED] > 0 */ ) {
-      r->pipe->set_vertex_buffers(r->pipe, 2, r->vertex_bufs.all);
-      r->pipe->set_vertex_elements(r->pipe, 6, r->vertex_elems);
-      r->textures.individual.ref[0] = r->future;
-      r->pipe->set_fragment_sampler_textures(r->pipe, 4, r->textures.all);
-      r->pipe->bind_fragment_sampler_states(r->pipe, 4, r->samplers.all);
-      r->pipe->bind_vs_state(r->pipe, r->p_vs[1]);
-      r->pipe->bind_fs_state(r->pipe, r->p_fs[1]);
-
-      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
-                           num_macroblocks[MACROBLOCK_TYPE_BKWD_FIELD_PRED] * 24);
-      vb_start += num_macroblocks[MACROBLOCK_TYPE_BKWD_FIELD_PRED] * 24;
-   }
-
-   if (num_macroblocks[MACROBLOCK_TYPE_BI_FRAME_PRED] > 0) {
-      r->pipe->set_vertex_buffers(r->pipe, 3, r->vertex_bufs.all);
-      r->pipe->set_vertex_elements(r->pipe, 8, r->vertex_elems);
-      r->textures.individual.ref[0] = r->past;
-      r->textures.individual.ref[1] = r->future;
-      r->pipe->set_fragment_sampler_textures(r->pipe, 5, r->textures.all);
-      r->pipe->bind_fragment_sampler_states(r->pipe, 5, r->samplers.all);
-      r->pipe->bind_vs_state(r->pipe, r->b_vs[0]);
-      r->pipe->bind_fs_state(r->pipe, r->b_fs[0]);
-
-      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
-                           num_macroblocks[MACROBLOCK_TYPE_BI_FRAME_PRED] * 24);
-      vb_start += num_macroblocks[MACROBLOCK_TYPE_BI_FRAME_PRED] * 24;
-   }
-
-   if (false /*num_macroblocks[MACROBLOCK_TYPE_BI_FIELD_PRED] > 0 */ ) {
-      r->pipe->set_vertex_buffers(r->pipe, 3, r->vertex_bufs.all);
-      r->pipe->set_vertex_elements(r->pipe, 8, r->vertex_elems);
-      r->textures.individual.ref[0] = r->past;
-      r->textures.individual.ref[1] = r->future;
-      r->pipe->set_fragment_sampler_textures(r->pipe, 5, r->textures.all);
-      r->pipe->bind_fragment_sampler_states(r->pipe, 5, r->samplers.all);
-      r->pipe->bind_vs_state(r->pipe, r->b_vs[1]);
-      r->pipe->bind_fs_state(r->pipe, r->b_fs[1]);
-
-      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
-                           num_macroblocks[MACROBLOCK_TYPE_BI_FIELD_PRED] * 24);
-      vb_start += num_macroblocks[MACROBLOCK_TYPE_BI_FIELD_PRED] * 24;
-   }
-
-   r->pipe->flush(r->pipe, PIPE_FLUSH_RENDER_CACHE, r->fence);
-   pipe_surface_reference(&r->fb_state.cbufs[0], NULL);
-
-   if (r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ONE)
-      for (i = 0; i < 3; ++i)
-         r->zero_block[i].x = ZERO_BLOCK_NIL;
-
-   r->num_macroblocks = 0;
-}
-
-static void
-grab_frame_coded_block(short *src, short *dst, unsigned dst_pitch)
-{
-   unsigned y;
-
-   assert(src);
-   assert(dst);
-
-   for (y = 0; y < BLOCK_HEIGHT; ++y)
-      memcpy(dst + y * dst_pitch, src + y * BLOCK_WIDTH, BLOCK_WIDTH * 2);
-}
-
-static void
-grab_field_coded_block(short *src, short *dst, unsigned dst_pitch)
-{
-   unsigned y;
-
-   assert(src);
-   assert(dst);
-
-   for (y = 0; y < BLOCK_HEIGHT; ++y)
-      memcpy(dst + y * dst_pitch * 2, src + y * BLOCK_WIDTH, BLOCK_WIDTH * 2);
-}
-
-static void
-fill_zero_block(short *dst, unsigned dst_pitch)
-{
-   unsigned y;
-
-   assert(dst);
-
-   for (y = 0; y < BLOCK_HEIGHT; ++y)
-      memset(dst + y * dst_pitch, 0, BLOCK_WIDTH * 2);
-}
-
-static void
-grab_blocks(struct vl_mpeg12_mc_renderer *r, unsigned mbx, unsigned mby,
-            enum pipe_mpeg12_dct_type dct_type, unsigned cbp, short *blocks)
-{
-   unsigned tex_pitch;
-   short *texels;
-   unsigned tb = 0, sb = 0;
-   unsigned mbpx = mbx * MACROBLOCK_WIDTH, mbpy = mby * MACROBLOCK_HEIGHT;
-   unsigned x, y;
-
-   assert(r);
-   assert(blocks);
-
-   tex_pitch = r->tex_transfer[0]->stride / util_format_get_blocksize(r->tex_transfer[0]->texture->format);
-   texels = r->texels[0] + mbpy * tex_pitch + mbpx;
-
-   for (y = 0; y < 2; ++y) {
-      for (x = 0; x < 2; ++x, ++tb) {
-         if ((cbp >> (5 - tb)) & 1) {
-            if (dct_type == PIPE_MPEG12_DCT_TYPE_FRAME) {
-               grab_frame_coded_block(blocks + sb * BLOCK_WIDTH * BLOCK_HEIGHT,
-                                      texels + y * tex_pitch * BLOCK_WIDTH +
-                                      x * BLOCK_WIDTH, tex_pitch);
-            }
-            else {
-               grab_field_coded_block(blocks + sb * BLOCK_WIDTH * BLOCK_HEIGHT,
-                                      texels + y * tex_pitch + x * BLOCK_WIDTH,
-                                      tex_pitch);
-            }
-
-            ++sb;
-         }
-         else if (r->eb_handling != VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_NONE) {
-            if (r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ALL ||
-                ZERO_BLOCK_IS_NIL(r->zero_block[0])) {
-               fill_zero_block(texels + y * tex_pitch * BLOCK_WIDTH + x * BLOCK_WIDTH, tex_pitch);
-               if (r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ONE) {
-                  r->zero_block[0].x = (mbpx + x * 8) * r->surface_tex_inv_size.x;
-                  r->zero_block[0].y = (mbpy + y * 8) * r->surface_tex_inv_size.y;
-               }
-            }
-         }
-      }
-   }
-
-   /* TODO: Implement 422, 444 */
-   assert(r->chroma_format == PIPE_VIDEO_CHROMA_FORMAT_420);
-
-   mbpx /= 2;
-   mbpy /= 2;
-
-   for (tb = 0; tb < 2; ++tb) {
-      tex_pitch = r->tex_transfer[tb + 1]->stride / util_format_get_blocksize(r->tex_transfer[tb + 1]->texture->format);
-      texels = r->texels[tb + 1] + mbpy * tex_pitch + mbpx;
-
-      if ((cbp >> (1 - tb)) & 1) {
-         grab_frame_coded_block(blocks + sb * BLOCK_WIDTH * BLOCK_HEIGHT, texels, tex_pitch);
-         ++sb;
-      }
-      else if (r->eb_handling != VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_NONE) {
-         if (r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ALL ||
-             ZERO_BLOCK_IS_NIL(r->zero_block[tb + 1])) {
-            fill_zero_block(texels, tex_pitch);
-            if (r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ONE) {
-               r->zero_block[tb + 1].x = (mbpx << 1) * r->surface_tex_inv_size.x;
-               r->zero_block[tb + 1].y = (mbpy << 1) * r->surface_tex_inv_size.y;
-            }
-         }
-      }
-   }
-}
-
-static void
-grab_macroblock(struct vl_mpeg12_mc_renderer *r,
-                struct pipe_mpeg12_macroblock *mb)
-{
-   assert(r);
-   assert(mb);
-   assert(r->num_macroblocks < r->macroblocks_per_batch);
-
-   memcpy(&r->macroblock_buf[r->num_macroblocks], mb,
-          sizeof(struct pipe_mpeg12_macroblock));
-
-   grab_blocks(r, mb->mbx, mb->mby, mb->dct_type, mb->cbp, mb->blocks);
-
-   ++r->num_macroblocks;
-}
-
-bool
-vl_mpeg12_mc_renderer_init(struct vl_mpeg12_mc_renderer *renderer,
-                           struct pipe_context *pipe,
-                           unsigned picture_width,
-                           unsigned picture_height,
-                           enum pipe_video_chroma_format chroma_format,
-                           enum VL_MPEG12_MC_RENDERER_BUFFER_MODE bufmode,
-                           enum VL_MPEG12_MC_RENDERER_EMPTY_BLOCK eb_handling,
-                           bool pot_buffers)
-{
-   unsigned i;
-
-   assert(renderer);
-   assert(pipe);
-   /* TODO: Implement other policies */
-   assert(bufmode == VL_MPEG12_MC_RENDERER_BUFFER_PICTURE);
-   /* TODO: Implement this */
-   /* XXX: XFER_ALL sampling issue at block edges when using bilinear filtering */
-   assert(eb_handling != VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_NONE);
-   /* TODO: Non-pot buffers untested, probably doesn't work without changes to texcoord generation, vert shader, etc */
-   assert(pot_buffers);
-
-   memset(renderer, 0, sizeof(struct vl_mpeg12_mc_renderer));
-
-   renderer->pipe = pipe;
-   renderer->picture_width = picture_width;
-   renderer->picture_height = picture_height;
-   renderer->chroma_format = chroma_format;
-   renderer->bufmode = bufmode;
-   renderer->eb_handling = eb_handling;
-   renderer->pot_buffers = pot_buffers;
-
-   if (!init_pipe_state(renderer))
-      return false;
-   if (!init_shaders(renderer)) {
-      cleanup_pipe_state(renderer);
-      return false;
-   }
-   if (!init_buffers(renderer)) {
-      cleanup_shaders(renderer);
-      cleanup_pipe_state(renderer);
-      return false;
-   }
-
-   renderer->surface = NULL;
-   renderer->past = NULL;
-   renderer->future = NULL;
-   for (i = 0; i < 3; ++i)
-      renderer->zero_block[i].x = ZERO_BLOCK_NIL;
-   renderer->num_macroblocks = 0;
-
-   xfer_buffers_map(renderer);
-
-   return true;
-}
-
-void
-vl_mpeg12_mc_renderer_cleanup(struct vl_mpeg12_mc_renderer *renderer)
-{
-   assert(renderer);
-
-   xfer_buffers_unmap(renderer);
-
-   cleanup_pipe_state(renderer);
-   cleanup_shaders(renderer);
-   cleanup_buffers(renderer);
-}
-
-void
-vl_mpeg12_mc_renderer_render_macroblocks(struct vl_mpeg12_mc_renderer
-                                         *renderer,
-                                         struct pipe_texture *surface,
-                                         struct pipe_texture *past,
-                                         struct pipe_texture *future,
-                                         unsigned num_macroblocks,
-                                         struct pipe_mpeg12_macroblock
-                                         *mpeg12_macroblocks,
-                                         struct pipe_fence_handle **fence)
-{
-   bool new_surface = false;
-
-   assert(renderer);
-   assert(surface);
-   assert(num_macroblocks);
-   assert(mpeg12_macroblocks);
-
-   if (renderer->surface) {
-      if (surface != renderer->surface) {
-         if (renderer->num_macroblocks > 0) {
-            xfer_buffers_unmap(renderer);
-            flush(renderer);
-         }
-         
-         new_surface = true;
-      }
-
-      /* If the surface we're rendering hasn't changed the ref frames shouldn't change. */
-      assert(surface != renderer->surface || renderer->past == past);
-      assert(surface != renderer->surface || renderer->future == future);
-   }
-   else
-      new_surface = true;
-
-   if (new_surface) {
-      renderer->surface = surface;
-      renderer->past = past;
-      renderer->future = future;
-      renderer->fence = fence;
-      renderer->surface_tex_inv_size.x = 1.0f / surface->width0;
-      renderer->surface_tex_inv_size.y = 1.0f / surface->height0;
-   }
-
-   while (num_macroblocks) {
-      unsigned left_in_batch = renderer->macroblocks_per_batch - renderer->num_macroblocks;
-      unsigned num_to_submit = MIN2(num_macroblocks, left_in_batch);
-      unsigned i;
-
-      for (i = 0; i < num_to_submit; ++i) {
-         assert(mpeg12_macroblocks[i].base.codec == PIPE_VIDEO_CODEC_MPEG12);
-         grab_macroblock(renderer, &mpeg12_macroblocks[i]);
-      }
-
-      num_macroblocks -= num_to_submit;
-
-      if (renderer->num_macroblocks == renderer->macroblocks_per_batch) {
-         xfer_buffers_unmap(renderer);
-         flush(renderer);
-         xfer_buffers_map(renderer);
-         /* Next time we get this surface it may have new ref frames */
-         renderer->surface = NULL;
-      }
-   }
-}
diff --git a/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.h b/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.h
deleted file mode 100644
index f00b8c7b8b..0000000000
--- a/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2009 Younes Manton.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#ifndef vl_mpeg12_mc_renderer_h
-#define vl_mpeg12_mc_renderer_h
-
-#include <pipe/p_compiler.h>
-#include <pipe/p_state.h>
-#include <pipe/p_video_state.h>
-
-struct pipe_context;
-struct pipe_video_surface;
-struct pipe_macroblock;
-
-/* A slice is video-width (rounded up to a multiple of macroblock width) x macroblock height */
-enum VL_MPEG12_MC_RENDERER_BUFFER_MODE
-{
-   VL_MPEG12_MC_RENDERER_BUFFER_SLICE,  /* Saves memory at the cost of smaller batches */
-   VL_MPEG12_MC_RENDERER_BUFFER_PICTURE /* Larger batches, more memory */
-};
-
-enum VL_MPEG12_MC_RENDERER_EMPTY_BLOCK
-{
-   VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ALL, /* Waste of memory bandwidth */
-   VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ONE, /* Can only do point-filtering when interpolating subsampled chroma channels */
-   VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_NONE /* Needs conditional texel fetch! */
-};
-
-struct vl_mpeg12_mc_renderer
-{
-   struct pipe_context *pipe;
-   unsigned picture_width;
-   unsigned picture_height;
-   enum pipe_video_chroma_format chroma_format;
-   enum VL_MPEG12_MC_RENDERER_BUFFER_MODE bufmode;
-   enum VL_MPEG12_MC_RENDERER_EMPTY_BLOCK eb_handling;
-   bool pot_buffers;
-   unsigned macroblocks_per_batch;
-
-   struct pipe_viewport_state viewport;
-   struct pipe_scissor_state scissor;
-   struct pipe_buffer *vs_const_buf;
-   struct pipe_buffer *fs_const_buf;
-   struct pipe_framebuffer_state fb_state;
-   struct pipe_vertex_element vertex_elems[8];
-	
-   union
-   {
-      void *all[5];
-      struct { void *y, *cb, *cr, *ref[2]; } individual;
-   } samplers;
-	
-   void *i_vs, *p_vs[2], *b_vs[2];
-   void *i_fs, *p_fs[2], *b_fs[2];
-	
-   union
-   {
-      struct pipe_texture *all[5];
-      struct { struct pipe_texture *y, *cb, *cr, *ref[2]; } individual;
-   } textures;
-
-   union
-   {
-      struct pipe_vertex_buffer all[3];
-      struct { struct pipe_vertex_buffer ycbcr, ref[2]; } individual;
-   } vertex_bufs;
-	
-   struct pipe_texture *surface, *past, *future;
-   struct pipe_fence_handle **fence;
-   unsigned num_macroblocks;
-   struct pipe_mpeg12_macroblock *macroblock_buf;
-   struct pipe_transfer *tex_transfer[3];
-   short *texels[3];
-   struct { float x, y; } surface_tex_inv_size;
-   struct { float x, y; } zero_block[3];
-};
-
-bool vl_mpeg12_mc_renderer_init(struct vl_mpeg12_mc_renderer *renderer,
-                                struct pipe_context *pipe,
-                                unsigned picture_width,
-                                unsigned picture_height,
-                                enum pipe_video_chroma_format chroma_format,
-                                enum VL_MPEG12_MC_RENDERER_BUFFER_MODE bufmode,
-                                enum VL_MPEG12_MC_RENDERER_EMPTY_BLOCK eb_handling,
-                                bool pot_buffers);
-
-void vl_mpeg12_mc_renderer_cleanup(struct vl_mpeg12_mc_renderer *renderer);
-
-void vl_mpeg12_mc_renderer_render_macroblocks(struct vl_mpeg12_mc_renderer *renderer,
-                                              struct pipe_texture *surface,
-                                              struct pipe_texture *past,
-                                              struct pipe_texture *future,
-                                              unsigned num_macroblocks,
-                                              struct pipe_mpeg12_macroblock *mpeg12_macroblocks,
-                                              struct pipe_fence_handle **fence);
-
-#endif /* vl_mpeg12_mc_renderer_h */
diff --git a/src/gallium/auxiliary/vl/vl_shader_build.c b/src/gallium/auxiliary/vl/vl_shader_build.c
deleted file mode 100644
index d011ef97bd..0000000000
--- a/src/gallium/auxiliary/vl/vl_shader_build.c
+++ /dev/null
@@ -1,243 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2009 Younes Manton.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "vl_shader_build.h"
-#include <assert.h>
-#include <tgsi/tgsi_parse.h>
-#include <tgsi/tgsi_build.h>
-
-struct tgsi_full_declaration vl_decl_input(unsigned int name, unsigned int index, unsigned int first, unsigned int last)
-{
-   struct tgsi_full_declaration decl = tgsi_default_full_declaration();
-
-   decl.Declaration.File = TGSI_FILE_INPUT;
-   decl.Declaration.Semantic = 1;
-   decl.Semantic.Name = name;
-   decl.Semantic.Index = index;
-   decl.Range.First = first;
-   decl.Range.Last = last;
-
-   return decl;
-}
-
-struct tgsi_full_declaration vl_decl_interpolated_input
-(
-   unsigned int name,
-   unsigned int index,
-   unsigned int first,
-   unsigned int last,
-   int interpolation
-)
-{
-   struct tgsi_full_declaration decl = tgsi_default_full_declaration();
-
-   assert
-   (
-      interpolation == TGSI_INTERPOLATE_CONSTANT ||
-      interpolation == TGSI_INTERPOLATE_LINEAR ||
-      interpolation == TGSI_INTERPOLATE_PERSPECTIVE
-   );
-
-   decl.Declaration.File = TGSI_FILE_INPUT;
-   decl.Declaration.Semantic = 1;
-   decl.Semantic.Name = name;
-   decl.Semantic.Index = index;
-   decl.Declaration.Interpolate = interpolation;;
-   decl.Range.First = first;
-   decl.Range.Last = last;
-
-   return decl;
-}
-
-struct tgsi_full_declaration vl_decl_constants(unsigned int name, unsigned int index, unsigned int first, unsigned int last)
-{
-   struct tgsi_full_declaration decl = tgsi_default_full_declaration();
-
-   decl.Declaration.File = TGSI_FILE_CONSTANT;
-   decl.Declaration.Semantic = 1;
-   decl.Semantic.Name = name;
-   decl.Semantic.Index = index;
-   decl.Range.First = first;
-   decl.Range.Last = last;
-
-   return decl;
-}
-
-struct tgsi_full_declaration vl_decl_output(unsigned int name, unsigned int index, unsigned int first, unsigned int last)
-{
-   struct tgsi_full_declaration decl = tgsi_default_full_declaration();
-
-   decl.Declaration.File = TGSI_FILE_OUTPUT;
-   decl.Declaration.Semantic = 1;
-   decl.Semantic.Name = name;
-   decl.Semantic.Index = index;
-   decl.Range.First = first;
-   decl.Range.Last = last;
-
-   return decl;
-}
-
-struct tgsi_full_declaration vl_decl_temps(unsigned int first, unsigned int last)
-{
-   struct tgsi_full_declaration decl = tgsi_default_full_declaration();
-
-   decl = tgsi_default_full_declaration();
-   decl.Declaration.File = TGSI_FILE_TEMPORARY;
-   decl.Range.First = first;
-   decl.Range.Last = last;
-
-   return decl;
-}
-
-struct tgsi_full_declaration vl_decl_samplers(unsigned int first, unsigned int last)
-{
-   struct tgsi_full_declaration decl = tgsi_default_full_declaration();
-
-   decl = tgsi_default_full_declaration();
-   decl.Declaration.File = TGSI_FILE_SAMPLER;
-   decl.Range.First = first;
-   decl.Range.Last = last;
-
-   return decl;
-}
-
-struct tgsi_full_instruction vl_inst2
-(
-   int opcode,
-   enum tgsi_file_type dst_file,
-   unsigned int dst_index,
-   enum tgsi_file_type src_file,
-   unsigned int src_index
-)
-{
-   struct tgsi_full_instruction inst = tgsi_default_full_instruction();
-
-   inst.Instruction.Opcode = opcode;
-   inst.Instruction.NumDstRegs = 1;
-   inst.Dst[0].Register.File = dst_file;
-   inst.Dst[0].Register.Index = dst_index;
-   inst.Instruction.NumSrcRegs = 1;
-   inst.Src[0].Register.File = src_file;
-   inst.Src[0].Register.Index = src_index;
-
-   return inst;
-}
-
-struct tgsi_full_instruction vl_inst3
-(
-   int opcode,
-   enum tgsi_file_type dst_file,
-   unsigned int dst_index,
-   enum tgsi_file_type src1_file,
-   unsigned int src1_index,
-   enum tgsi_file_type src2_file,
-   unsigned int src2_index
-)
-{
-   struct tgsi_full_instruction inst = tgsi_default_full_instruction();
-
-   inst.Instruction.Opcode = opcode;
-   inst.Instruction.NumDstRegs = 1;
-   inst.Dst[0].Register.File = dst_file;
-   inst.Dst[0].Register.Index = dst_index;
-   inst.Instruction.NumSrcRegs = 2;
-   inst.Src[0].Register.File = src1_file;
-   inst.Src[0].Register.Index = src1_index;
-   inst.Src[1].Register.File = src2_file;
-   inst.Src[1].Register.Index = src2_index;
-
-   return inst;
-}
-
-struct tgsi_full_instruction vl_tex
-(
-   int tex,
-   enum tgsi_file_type dst_file,
-   unsigned int dst_index,
-   enum tgsi_file_type src1_file,
-   unsigned int src1_index,
-   enum tgsi_file_type src2_file,
-   unsigned int src2_index
-)
-{
-   struct tgsi_full_instruction inst = tgsi_default_full_instruction();
-
-   inst.Instruction.Opcode = TGSI_OPCODE_TEX;
-   inst.Instruction.NumDstRegs = 1;
-   inst.Dst[0].Register.File = dst_file;
-   inst.Dst[0].Register.Index = dst_index;
-   inst.Instruction.NumSrcRegs = 2;
-   inst.Instruction.Texture = 1;
-   inst.Texture.Texture = tex;
-   inst.Src[0].Register.File = src1_file;
-   inst.Src[0].Register.Index = src1_index;
-   inst.Src[1].Register.File = src2_file;
-   inst.Src[1].Register.Index = src2_index;
-
-   return inst;
-}
-
-struct tgsi_full_instruction vl_inst4
-(
-   int opcode,
-   enum tgsi_file_type dst_file,
-   unsigned int dst_index,
-   enum tgsi_file_type src1_file,
-   unsigned int src1_index,
-   enum tgsi_file_type src2_file,
-   unsigned int src2_index,
-   enum tgsi_file_type src3_file,
-   unsigned int src3_index
-)
-{
-   struct tgsi_full_instruction inst = tgsi_default_full_instruction();
-
-   inst.Instruction.Opcode = opcode;
-   inst.Instruction.NumDstRegs = 1;
-   inst.Dst[0].Register.File = dst_file;
-   inst.Dst[0].Register.Index = dst_index;
-   inst.Instruction.NumSrcRegs = 3;
-   inst.Src[0].Register.File = src1_file;
-   inst.Src[0].Register.Index = src1_index;
-   inst.Src[1].Register.File = src2_file;
-   inst.Src[1].Register.Index = src2_index;
-   inst.Src[2].Register.File = src3_file;
-   inst.Src[2].Register.Index = src3_index;
-
-   return inst;
-}
-
-struct tgsi_full_instruction vl_end(void)
-{
-   struct tgsi_full_instruction inst = tgsi_default_full_instruction();
-
-   inst.Instruction.Opcode = TGSI_OPCODE_END;
-   inst.Instruction.NumDstRegs = 0;
-   inst.Instruction.NumSrcRegs = 0;
-
-   return inst;
-}
diff --git a/src/gallium/auxiliary/vl/vl_shader_build.h b/src/gallium/auxiliary/vl/vl_shader_build.h
deleted file mode 100644
index 5da71f8e13..0000000000
--- a/src/gallium/auxiliary/vl/vl_shader_build.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2009 Younes Manton.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#ifndef vl_shader_build_h
-#define vl_shader_build_h
-
-#include <pipe/p_shader_tokens.h>
-
-struct tgsi_full_declaration vl_decl_input(unsigned int name, unsigned int index, unsigned int first, unsigned int last);
-struct tgsi_full_declaration vl_decl_interpolated_input
-(
-   unsigned int name,
-   unsigned int index,
-   unsigned int first,
-   unsigned int last,
-   int interpolation
-);
-struct tgsi_full_declaration vl_decl_constants(unsigned int name, unsigned int index, unsigned int first, unsigned int last);
-struct tgsi_full_declaration vl_decl_output(unsigned int name, unsigned int index, unsigned int first, unsigned int last);
-struct tgsi_full_declaration vl_decl_temps(unsigned int first, unsigned int last);
-struct tgsi_full_declaration vl_decl_samplers(unsigned int first, unsigned int last);
-struct tgsi_full_instruction vl_inst2
-(
-   int opcode,
-   enum tgsi_file_type dst_file,
-   unsigned int dst_index,
-   enum tgsi_file_type src_file,
-   unsigned int src_index
-);
-struct tgsi_full_instruction vl_inst3
-(
-   int opcode,
-   enum tgsi_file_type dst_file,
-   unsigned int dst_index,
-   enum tgsi_file_type src1_file,
-   unsigned int src1_index,
-   enum tgsi_file_type src2_file,
-   unsigned int src2_index
-);
-struct tgsi_full_instruction vl_tex
-(
-   int tex,
-   enum tgsi_file_type dst_file,
-   unsigned int dst_index,
-   enum tgsi_file_type src1_file,
-   unsigned int src1_index,
-   enum tgsi_file_type src2_file,
-   unsigned int src2_index
-);
-struct tgsi_full_instruction vl_inst4
-(
-   int opcode,
-   enum tgsi_file_type dst_file,
-   unsigned int dst_index,
-   enum tgsi_file_type src1_file,
-   unsigned int src1_index,
-   enum tgsi_file_type src2_file,
-   unsigned int src2_index,
-   enum tgsi_file_type src3_file,
-   unsigned int src3_index
-);
-struct tgsi_full_instruction vl_end(void);
-
-#endif