268 files changed, 18573 insertions, 7550 deletions
diff --git a/src/gallium/SConscript b/src/gallium/SConscript
index 89c69d7205..8be84cddbe 100644
--- a/src/gallium/SConscript
+++ b/src/gallium/SConscript
@@ -23,6 +23,7 @@ SConscript([
 	'auxiliary/pipebuffer/SConscript',
 	'auxiliary/indices/SConscript',
 	'auxiliary/rbug/SConscript',
+	'auxiliary/vl/SConscript',
 ])
 
 for driver in env['drivers']:
diff --git a/src/gallium/auxiliary/draw/draw_pt_post_vs.c b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
index 00d7197b13..e25f16c354 100644
--- a/src/gallium/auxiliary/draw/draw_pt_post_vs.c
+++ b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
@@ -104,7 +104,7 @@ static boolean post_vs_cliptest_viewport_gl( struct pt_post_vs *pvs,
    unsigned clipped = 0;
    unsigned j;
 
-   if (0) debug_printf("%s\n");
+   if (0) debug_printf("%s\n", __FUNCTION__);
 
    for (j = 0; j < count; j++) {
       float *position = out->data[pos];
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
index 109ac7c9d6..d01f866622 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
@@ -542,7 +542,7 @@ fenced_buffer_list_dump(struct fenced_buffer_list *fenced_list)
       debug_printf("%10p %7u %7u\n",
                    fenced_buf,
                    fenced_buf->base.base.size,
-                   fenced_buf->base.base.reference.count);
+                   p_atomic_read(&fenced_buf->base.base.reference.count));
       curr = next; 
       next = curr->next;
    }
@@ -556,7 +556,7 @@ fenced_buffer_list_dump(struct fenced_buffer_list *fenced_list)
       debug_printf("%10p %7u %7u %10p %s\n",
                    fenced_buf,
                    fenced_buf->base.base.size,
-                   fenced_buf->base.base.reference.count,
+                   p_atomic_read(&fenced_buf->base.base.reference.count),
                    fenced_buf->fence,
                    signaled == 0 ? "y" : "n");
       curr = next; 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
index 1b4df28c70..6e3214ca9c 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
@@ -350,7 +350,7 @@ pb_debug_manager_dump(struct pb_debug_manager *mgr)
       buf = LIST_ENTRY(struct pb_debug_buffer, curr, head);
 
       debug_printf("buffer = %p\n", buf);
-      debug_printf("    .size = %p\n", buf->base.base.size);
+      debug_printf("    .size = 0x%x\n", buf->base.base.size);
       debug_backtrace_dump(buf->create_backtrace, PB_DEBUG_CREATE_BACKTRACE);
       
       curr = next; 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.c b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
index 8a13885da9..53e13b30e6 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sanity.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
@@ -358,7 +358,7 @@ epilog(
 
 boolean
 tgsi_sanity_check(
-   struct tgsi_token *tokens )
+   const struct tgsi_token *tokens )
 {
    struct sanity_check_ctx ctx;
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.h b/src/gallium/auxiliary/tgsi/tgsi_sanity.h
index ca45e94c7a..52263ff883 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sanity.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.h
@@ -40,7 +40,7 @@ extern "C" {
  */
 boolean
 tgsi_sanity_check(
-   struct tgsi_token *tokens );
+   const struct tgsi_token *tokens );
 
 #if defined __cplusplus
 }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index c535788819..0db4481a3d 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -132,6 +132,7 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                if (file == TGSI_FILE_INPUT) {
                   info->input_semantic_name[reg] = (ubyte)fulldecl->Semantic.SemanticName;
                   info->input_semantic_index[reg] = (ubyte)fulldecl->Semantic.SemanticIndex;
+                  info->input_interpolate[reg] = (ubyte)fulldecl->Declaration.Interpolate;
                   info->num_inputs++;
                }
                else if (file == TGSI_FILE_OUTPUT) {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index 2c1a75bc81..8a7ee0c7e4 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -45,6 +45,7 @@ struct tgsi_shader_info
    ubyte num_outputs;
    ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; /**< TGSI_SEMANTIC_x */
    ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS];
+   ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS];
    ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */
    ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS];
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index 1e719940ec..5f6b83b236 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -39,8 +39,9 @@
 #include "tgsi/tgsi_info.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
-#include "tgsi_exec.h"
-#include "tgsi_sse2.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_exec.h"
+#include "tgsi/tgsi_sse2.h"
 
 #include "rtasm/rtasm_x86sse.h"
 
@@ -1360,6 +1361,32 @@ emit_store(
    const struct tgsi_full_instruction *inst,
    unsigned chan_index )
 {
+   switch( inst->Instruction.Saturate ) {
+   case TGSI_SAT_NONE:
+      break;
+
+   case TGSI_SAT_ZERO_ONE:
+      sse_maxps(
+         func,
+         make_xmm( xmm ),
+         get_temp(
+            TGSI_EXEC_TEMP_00000000_I,
+            TGSI_EXEC_TEMP_00000000_C ) );
+
+      sse_minps(
+         func,
+         make_xmm( xmm ),
+         get_temp(
+            TGSI_EXEC_TEMP_ONE_I,
+            TGSI_EXEC_TEMP_ONE_C ) );
+      break;
+
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      assert( 0 );
+      break;
+   }
+
+
    switch( reg->DstRegister.File ) {
    case TGSI_FILE_OUTPUT:
       emit_output(
@@ -1388,19 +1415,6 @@ emit_store(
    default:
       assert( 0 );
    }
-
-   switch( inst->Instruction.Saturate ) {
-   case TGSI_SAT_NONE:
-      break;
-
-   case TGSI_SAT_ZERO_ONE:
-      /* assert( 0 ); */
-      break;
-
-   case TGSI_SAT_MINUS_PLUS_ONE:
-      assert( 0 );
-      break;
-   }
 }
 
 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
@@ -1747,14 +1761,6 @@ emit_instruction(
    if (indirect_temp_reference(inst))
       return FALSE;
 
-   /* we don't handle saturation/clamping yet */
-   if (inst->Instruction.Saturate != TGSI_SAT_NONE)
-      return FALSE;
-
-   /* need to use extra temps to fix SOA dependencies : */
-   if (tgsi_check_soa_dependencies(inst))
-      return FALSE;
-
    switch (inst->Instruction.Opcode) {
    case TGSI_OPCODE_ARL:
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
@@ -1768,8 +1774,10 @@ emit_instruction(
    case TGSI_OPCODE_MOV:
    case TGSI_OPCODE_SWZ:
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         STORE( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 4 + chan_index, 0, chan_index );
+      }
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 4 + chan_index, 0, chan_index );
       }
       break;
 
@@ -1847,7 +1855,6 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_RCP:
-   /* TGSI_OPCODE_RECIP */
       FETCH( func, *inst, 0, 0, CHAN_X );
       emit_rcp( func, 0, 0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
@@ -1856,7 +1863,6 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_RSQ:
-   /* TGSI_OPCODE_RECIPSQRT */
       FETCH( func, *inst, 0, 0, CHAN_X );
       emit_abs( func, 0 );
       emit_rsqrt( func, 1, 0 );
@@ -1954,7 +1960,6 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_DP3:
-   /* TGSI_OPCODE_DOT3 */
       FETCH( func, *inst, 0, 0, CHAN_X );
       FETCH( func, *inst, 1, 1, CHAN_X );
       emit_mul( func, 0, 1 );
@@ -1972,7 +1977,6 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_DP4:
-   /* TGSI_OPCODE_DOT4 */
       FETCH( func, *inst, 0, 0, CHAN_X );
       FETCH( func, *inst, 1, 1, CHAN_X );
       emit_mul( func, 0, 1 );
@@ -2043,17 +2047,14 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_SLT:
-   /* TGSI_OPCODE_SETLT */
       emit_setcc( func, inst, cc_LessThan );
       break;
 
    case TGSI_OPCODE_SGE:
-   /* TGSI_OPCODE_SETGE */
       emit_setcc( func, inst, cc_NotLessThan );
       break;
 
    case TGSI_OPCODE_MAD:
-   /* TGSI_OPCODE_MADD */
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( func, *inst, 0, 0, chan_index );
          FETCH( func, *inst, 1, 1, chan_index );
@@ -2283,7 +2284,7 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_SEQ:
-      return 0;
+      emit_setcc( func, inst, cc_Equal );
       break;
 
    case TGSI_OPCODE_SFL:
@@ -2291,7 +2292,7 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_SGT:
-      return 0;
+      emit_setcc( func, inst, cc_NotLessThanEqual );
       break;
 
    case TGSI_OPCODE_SIN:
@@ -2303,11 +2304,11 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_SLE:
-      return 0;
+      emit_setcc( func, inst, cc_LessThanEqual );
       break;
 
    case TGSI_OPCODE_SNE:
-      return 0;
+      emit_setcc( func, inst, cc_NotEqual );
       break;
 
    case TGSI_OPCODE_STR:
@@ -2371,7 +2372,6 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_SSG:
-   /* TGSI_OPCODE_SGN */
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( func, *inst, 0, 0, chan_index );
          emit_sgn( func, 0, 0 );
@@ -2929,6 +2929,22 @@ tgsi_emit_sse2(
                          parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
                          "vertex shader" : "fragment shader");
 	 }
+
+         if (tgsi_check_soa_dependencies(&parse.FullToken.FullInstruction)) {
+            uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
+
+            /* XXX: we only handle src/dst aliasing in a few opcodes
+             * currently.  Need to use an additional temporay to hold
+             * the result in the cases where the code is too opaque to
+             * fix.
+             */
+            if (opcode != TGSI_OPCODE_MOV &&
+                opcode != TGSI_OPCODE_SWZ) {
+               debug_printf("Warning: src/dst aliasing in instruction"
+                            " is not handled:\n");
+               tgsi_dump_instruction(&parse.FullToken.FullInstruction, 1);
+            }
+         }
          break;
 
       case TGSI_TOKEN_TYPE_IMMEDIATE:
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index f7096bd8e2..654426a903 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -31,6 +31,7 @@
 #include "tgsi/tgsi_ureg.h"
 #include "tgsi/tgsi_info.h"
 #include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_sanity.h"
 #include "util/u_memory.h"
 #include "util/u_math.h"
 
@@ -70,6 +71,7 @@ struct ureg_tokens {
 
 #define UREG_MAX_INPUT PIPE_MAX_ATTRIBS
 #define UREG_MAX_OUTPUT PIPE_MAX_ATTRIBS
+#define UREG_MAX_CONSTANT_RANGE 32
 #define UREG_MAX_IMMEDIATE 32
 #define UREG_MAX_TEMP 256
 #define UREG_MAX_ADDR 2
@@ -86,8 +88,10 @@ struct ureg_program
       unsigned semantic_name;
       unsigned semantic_index;
       unsigned interp;
-   } input[UREG_MAX_INPUT];
-   unsigned nr_inputs;
+   } fs_input[UREG_MAX_INPUT];
+   unsigned nr_fs_inputs;
+
+   unsigned vs_inputs[UREG_MAX_INPUT/32];
 
    struct {
       unsigned semantic_name;
@@ -107,9 +111,13 @@ struct ureg_program
    unsigned temps_active[UREG_MAX_TEMP / 32];
    unsigned nr_temps;
 
-   unsigned nr_addrs;
+   struct {
+      unsigned first;
+      unsigned last;
+   } constant_range[UREG_MAX_CONSTANT_RANGE];
+   unsigned nr_constant_ranges;
 
-   unsigned nr_constants;
+   unsigned nr_addrs;
    unsigned nr_instructions;
 
    struct ureg_tokens domain[2];
@@ -119,6 +127,9 @@ static union tgsi_any_token error_tokens[32];
 
 static void tokens_error( struct ureg_tokens *tokens )
 {
+   if (tokens->tokens && tokens->tokens != error_tokens)
+      FREE(tokens->tokens);
+
    tokens->tokens = error_tokens;
    tokens->size = Elements(error_tokens);
    tokens->count = 0;
@@ -228,25 +239,25 @@ ureg_src_register( unsigned file,
 
 
 
-static struct ureg_src 
-ureg_DECL_input( struct ureg_program *ureg,
-                 unsigned name,
-                 unsigned index,
-                 unsigned interp_mode )
+struct ureg_src 
+ureg_DECL_fs_input( struct ureg_program *ureg,
+                    unsigned name,
+                    unsigned index,
+                    unsigned interp_mode )
 {
    unsigned i;
 
-   for (i = 0; i < ureg->nr_inputs; i++) {
-      if (ureg->input[i].semantic_name == name &&
-          ureg->input[i].semantic_index == index) 
+   for (i = 0; i < ureg->nr_fs_inputs; i++) {
+      if (ureg->fs_input[i].semantic_name == name &&
+          ureg->fs_input[i].semantic_index == index) 
          goto out;
    }
 
-   if (ureg->nr_inputs < UREG_MAX_INPUT) {
-      ureg->input[i].semantic_name = name;
-      ureg->input[i].semantic_index = index;
-      ureg->input[i].interp = interp_mode;
-      ureg->nr_inputs++;
+   if (ureg->nr_fs_inputs < UREG_MAX_INPUT) {
+      ureg->fs_input[i].semantic_name = name;
+      ureg->fs_input[i].semantic_index = index;
+      ureg->fs_input[i].interp = interp_mode;
+      ureg->nr_fs_inputs++;
    }
    else {
       set_bad( ureg );
@@ -257,25 +268,14 @@ out:
 }
 
 
-
-struct ureg_src 
-ureg_DECL_fs_input( struct ureg_program *ureg,
-                    unsigned name,
-                    unsigned index,
-                    unsigned interp )
-{
-   assert(ureg->processor == TGSI_PROCESSOR_FRAGMENT);
-   return ureg_DECL_input( ureg, name, index, interp );
-}
-
-
 struct ureg_src 
 ureg_DECL_vs_input( struct ureg_program *ureg,
-                    unsigned name,
                     unsigned index )
 {
    assert(ureg->processor == TGSI_PROCESSOR_VERTEX);
-   return ureg_DECL_input( ureg, name, index, TGSI_INTERPOLATE_CONSTANT );
+   
+   ureg->vs_inputs[index/32] |= 1 << (index % 32);
+   return ureg_src_register( TGSI_FILE_INPUT, index );
 }
 
 
@@ -313,9 +313,57 @@ out:
  * value or manage any constant_buffer contents -- that's the
  * resposibility of the calling code.
  */
-struct ureg_src ureg_DECL_constant(struct ureg_program *ureg )
+struct ureg_src ureg_DECL_constant(struct ureg_program *ureg, 
+                                   unsigned index )
 {
-   return ureg_src_register( TGSI_FILE_CONSTANT, ureg->nr_constants++ );
+   unsigned minconst = index, maxconst = index;
+   unsigned i;
+
+   /* Inside existing range?
+    */
+   for (i = 0; i < ureg->nr_constant_ranges; i++) {
+      if (ureg->constant_range[i].first <= index &&
+          ureg->constant_range[i].last >= index)
+         goto out;
+   }
+
+   /* Extend existing range?
+    */
+   for (i = 0; i < ureg->nr_constant_ranges; i++) {
+      if (ureg->constant_range[i].last == index - 1) {
+         ureg->constant_range[i].last = index;
+         goto out;
+      }
+
+      if (ureg->constant_range[i].first == index + 1) {
+         ureg->constant_range[i].first = index;
+         goto out;
+      }
+
+      minconst = MIN2(minconst, ureg->constant_range[i].first);
+      maxconst = MAX2(maxconst, ureg->constant_range[i].last);
+   }
+
+   /* Create new range?
+    */
+   if (ureg->nr_constant_ranges < UREG_MAX_CONSTANT_RANGE) {
+      i = ureg->nr_constant_ranges++;
+      ureg->constant_range[i].first = index;
+      ureg->constant_range[i].last = index;
+   }
+
+   /* Collapse all ranges down to one:
+    */
+   i = 0;
+   ureg->constant_range[0].first = minconst;
+   ureg->constant_range[0].last = maxconst;
+   ureg->nr_constant_ranges = 1;
+
+out:
+   assert(i < ureg->nr_constant_ranges);
+   assert(ureg->constant_range[i].first <= index);
+   assert(ureg->constant_range[i].last >= index);
+   return ureg_src_register( TGSI_FILE_CONSTANT, index );
 }
 
 
@@ -566,6 +614,19 @@ ureg_emit_dst( struct ureg_program *ureg,
 }
 
 
+static void validate( unsigned opcode,
+                      unsigned nr_dst,
+                      unsigned nr_src )
+{
+#ifdef DEBUG
+   const struct tgsi_opcode_info *info = tgsi_get_opcode_info( opcode );
+   assert(info);
+   if(info) {
+      assert(nr_dst == info->num_dst);
+      assert(nr_src == info->num_src);
+   }
+#endif
+}
 
 unsigned
 ureg_emit_insn(struct ureg_program *ureg,
@@ -576,6 +637,8 @@ ureg_emit_insn(struct ureg_program *ureg,
 {
    union tgsi_any_token *out;
 
+   validate( opcode, num_dst, num_src );
+   
    out = get_tokens( ureg, DOMAIN_INSN, 1 );
    out[0].value = 0;
    out[0].insn.Type = TGSI_TOKEN_TYPE_INSTRUCTION;
@@ -678,17 +741,6 @@ ureg_insn(struct ureg_program *ureg,
    unsigned insn, i;
    boolean saturate;
 
-#ifdef DEBUG
-   {
-      const struct tgsi_opcode_info *info = tgsi_get_opcode_info( opcode );
-      assert(info);
-      if(info) {
-         assert(nr_dst == info->num_dst);
-         assert(nr_src == info->num_src);
-      }
-   }
-#endif
-   
    saturate = nr_dst ? dst[0].Saturate : FALSE;
 
    insn = ureg_emit_insn( ureg, opcode, saturate, nr_dst, nr_src );
@@ -702,6 +754,53 @@ ureg_insn(struct ureg_program *ureg,
    ureg_fixup_insn_size( ureg, insn );
 }
 
+void
+ureg_tex_insn(struct ureg_program *ureg,
+              unsigned opcode,
+              const struct ureg_dst *dst,
+              unsigned nr_dst,
+              unsigned target,
+              const struct ureg_src *src,
+              unsigned nr_src )
+{
+   unsigned insn, i;
+   boolean saturate;
+
+   saturate = nr_dst ? dst[0].Saturate : FALSE;
+
+   insn = ureg_emit_insn( ureg, opcode, saturate, nr_dst, nr_src );
+
+   ureg_emit_texture( ureg, insn, target );                             \
+
+   for (i = 0; i < nr_dst; i++)
+      ureg_emit_dst( ureg, dst[i] );
+
+   for (i = 0; i < nr_src; i++)
+      ureg_emit_src( ureg, src[i] );
+
+   ureg_fixup_insn_size( ureg, insn );
+}
+
+
+void
+ureg_label_insn(struct ureg_program *ureg,
+                unsigned opcode,
+                const struct ureg_src *src,
+                unsigned nr_src,
+                unsigned *label_token )
+{
+   unsigned insn, i;
+
+   insn = ureg_emit_insn( ureg, opcode, FALSE, 0, nr_src );
+
+   ureg_emit_label( ureg, insn, label_token );                  \
+
+   for (i = 0; i < nr_src; i++)
+      ureg_emit_src( ureg, src[i] );
+
+   ureg_fixup_insn_size( ureg, insn );
+}
+
 
 
 static void emit_decl( struct ureg_program *ureg,
@@ -777,13 +876,22 @@ static void emit_decls( struct ureg_program *ureg )
 {
    unsigned i;
 
-   for (i = 0; i < ureg->nr_inputs; i++) {
-      emit_decl( ureg, 
-                 TGSI_FILE_INPUT, 
-                 i,
-                 ureg->input[i].semantic_name,
-                 ureg->input[i].semantic_index,
-                 ureg->input[i].interp );
+   if (ureg->processor == TGSI_PROCESSOR_VERTEX) {
+      for (i = 0; i < UREG_MAX_INPUT; i++) {
+         if (ureg->vs_inputs[i/32] & (1 << (i%32))) {
+            emit_decl_range( ureg, TGSI_FILE_INPUT, i, 1 );
+         }
+      }
+   }
+   else {
+      for (i = 0; i < ureg->nr_fs_inputs; i++) {
+         emit_decl( ureg, 
+                    TGSI_FILE_INPUT, 
+                    i,
+                    ureg->fs_input[i].semantic_name,
+                    ureg->fs_input[i].semantic_index,
+                    ureg->fs_input[i].interp );
+      }
    }
 
    for (i = 0; i < ureg->nr_outputs; i++) {
@@ -801,10 +909,13 @@ static void emit_decls( struct ureg_program *ureg )
                        ureg->sampler[i].Index, 1 );
    }
 
-   if (ureg->nr_constants) {
-      emit_decl_range( ureg,
-                       TGSI_FILE_CONSTANT,
-                       0, ureg->nr_constants );
+   if (ureg->nr_constant_ranges) {
+      for (i = 0; i < ureg->nr_constant_ranges; i++)
+         emit_decl_range( ureg,
+                          TGSI_FILE_CONSTANT,
+                          ureg->constant_range[i].first, 
+                          (ureg->constant_range[i].last + 1 -
+                           ureg->constant_range[i].first) );
    }
 
    if (ureg->nr_temps) {
@@ -890,6 +1001,15 @@ const struct tgsi_token *ureg_finalize( struct ureg_program *ureg )
                    ureg->domain[DOMAIN_DECL].count);
       tgsi_dump( tokens, 0 );
    }
+
+#if DEBUG
+   if (tokens && !tgsi_sanity_check(tokens)) {
+      debug_printf("tgsi_ureg.c, sanity check failed on generated tokens:\n");
+      tgsi_dump(tokens, 0);
+      assert(0);
+   }
+#endif
+
    
    return tokens;
 }
@@ -911,6 +1031,25 @@ void *ureg_create_shader( struct ureg_program *ureg,
 }
 
 
+const struct tgsi_token *ureg_get_tokens( struct ureg_program *ureg,
+                                          unsigned *nr_tokens )
+{
+   const struct tgsi_token *tokens;
+
+   ureg_finalize(ureg);
+
+   tokens = &ureg->domain[DOMAIN_DECL].tokens[0].token;
+
+   if (nr_tokens) 
+      *nr_tokens = ureg->domain[DOMAIN_DECL].size;
+
+   ureg->domain[DOMAIN_DECL].tokens = 0;
+   ureg->domain[DOMAIN_DECL].size = 0;
+   ureg->domain[DOMAIN_DECL].order = 0;
+   ureg->domain[DOMAIN_DECL].count = 0;
+
+   return tokens;
+}
 
 
 struct ureg_program *ureg_create( unsigned processor )
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
index acbca59040..f04f443b9e 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
@@ -82,10 +82,21 @@ ureg_create( unsigned processor );
 const struct tgsi_token *
 ureg_finalize( struct ureg_program * );
 
+/* Create and return a shader:
+ */
 void *
 ureg_create_shader( struct ureg_program *,
                     struct pipe_context *pipe );
 
+
+/* Alternately, return the built token stream and hand ownership of
+ * that memory to the caller:
+ */
+const struct tgsi_token *
+ureg_get_tokens( struct ureg_program *ureg,
+                 unsigned *nr_tokens );
+
+
 void 
 ureg_destroy( struct ureg_program * );
 
@@ -116,8 +127,7 @@ ureg_DECL_fs_input( struct ureg_program *,
 
 struct ureg_src
 ureg_DECL_vs_input( struct ureg_program *,
-                    unsigned semantic_name,
-                    unsigned semantic_index );
+                    unsigned index );
 
 struct ureg_dst
 ureg_DECL_output( struct ureg_program *,
@@ -130,7 +140,8 @@ ureg_DECL_immediate( struct ureg_program *,
                      unsigned nr );
 
 struct ureg_src
-ureg_DECL_constant( struct ureg_program * );
+ureg_DECL_constant( struct ureg_program *,
+                    unsigned index );
 
 struct ureg_dst
 ureg_DECL_temporary( struct ureg_program * );
@@ -233,6 +244,24 @@ ureg_insn(struct ureg_program *ureg,
           unsigned nr_src );
 
 
+void
+ureg_tex_insn(struct ureg_program *ureg,
+              unsigned opcode,
+              const struct ureg_dst *dst,
+              unsigned nr_dst,
+              unsigned target,
+              const struct ureg_src *src,
+              unsigned nr_src );
+
+
+void
+ureg_label_insn(struct ureg_program *ureg,
+                unsigned opcode,
+                const struct ureg_src *src,
+                unsigned nr_src,
+                unsigned *label);
+
+
 /***********************************************************************
  * Internal instruction helpers, don't call these directly:
  */
diff --git a/src/gallium/auxiliary/util/Makefile b/src/gallium/auxiliary/util/Makefile
index ae8d330a78..1d8bb55bbd 100644
--- a/src/gallium/auxiliary/util/Makefile
+++ b/src/gallium/auxiliary/util/Makefile
@@ -10,6 +10,7 @@ C_SOURCES = \
 	u_debug_stack.c \
 	u_blit.c \
 	u_cache.c \
+	u_cpu_detect.c \
 	u_draw_quad.c \
 	u_format.c \
 	u_format_access.c \
diff --git a/src/gallium/auxiliary/util/SConscript b/src/gallium/auxiliary/util/SConscript
index 28a5ab4256..2187935fa4 100644
--- a/src/gallium/auxiliary/util/SConscript
+++ b/src/gallium/auxiliary/util/SConscript
@@ -24,6 +24,7 @@ util = env.ConvenienceLibrary(
 		'u_bitmask.c',
 		'u_blit.c',
 		'u_cache.c',
+		'u_cpu_detect.c',
 		'u_debug.c',
 		'u_debug_dump.c',
 		'u_debug_memory.c',
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c b/src/gallium/auxiliary/util/u_cpu_detect.c
index d9f2f8fc28..ecfb96138d 100644
--- a/src/gallium/auxiliary/util/u_cpu_detect.c
+++ b/src/gallium/auxiliary/util/u_cpu_detect.c
@@ -24,23 +24,21 @@
  * 
  **************************************************************************/
 
-/*
- * Based on the work of Eric Anholt <anholt@FreeBSD.org>
+/**
+ * @file
+ * CPU feature detection.
+ *
+ * @author Dennis Smit
+ * @author Based on the work of Eric Anholt <anholt@FreeBSD.org>
  */
 
-/* FIXME: clean this entire file up */
+#include "pipe/p_config.h"
 
+#include "u_debug.h"
 #include "u_cpu_detect.h"
 
-#ifdef __linux__
-#define OS_LINUX
-#endif
-#ifdef WIN32
-#define OS_WIN32
-#endif
-
-#if defined(ARCH_POWERPC)
-#if defined(OS_DARWIN)
+#if defined(PIPE_ARCH_PPC)
+#if defined(PIPE_OS_DARWIN)
 #include <sys/sysctl.h>
 #else
 #include <signal.h>
@@ -48,137 +46,140 @@
 #endif
 #endif
 
-#if defined(OS_NETBSD) || defined(OS_OPENBSD)
+#if defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD)
 #include <sys/param.h>
 #include <sys/sysctl.h>
 #include <machine/cpu.h>
 #endif
 
-#if defined(OS_FREEBSD)
+#if defined(PIPE_OS_FREEBSD)
 #include <sys/types.h>
 #include <sys/sysctl.h>
 #endif
 
-#if defined(OS_LINUX)
+#if defined(PIPE_OS_LINUX)
 #include <signal.h>
 #endif
 
-#if defined(OS_WIN32)
-#include <windows.h>
+#ifdef PIPE_OS_UNIX
+#include <unistd.h>
 #endif
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <string.h>
+#if defined(PIPE_OS_WINDOWS)
+#include <windows.h>
+#endif
 
 
-static struct cpu_detect_caps __cpu_detect_caps;
-static int __cpu_detect_initialized = 0;
+struct util_cpu_caps util_cpu_caps;
 
 static int has_cpuid(void);
 static int cpuid(unsigned int ax, unsigned int *p);
 
+#if defined(PIPE_ARCH_X86)
+
 /* The sigill handlers */
-#if defined(ARCH_X86) /* x86 (linux katmai handler check thing) */
-#if defined(OS_LINUX) && defined(_POSIX_SOURCE) && defined(X86_FXSR_MAGIC)
-static void sigill_handler_sse(int signal, struct sigcontext sc)
+#if defined(PIPE_OS_LINUX) //&& defined(_POSIX_SOURCE) && defined(X86_FXSR_MAGIC)
+static void
+sigill_handler_sse(int signal, struct sigcontext sc)
 {
-	/* Both the "xorps %%xmm0,%%xmm0" and "divps %xmm0,%%xmm1"
-	 * instructions are 3 bytes long.  We must increment the instruction
-	 * pointer manually to avoid repeated execution of the offending
-	 * instruction.
-	 *
-	 * If the SIGILL is caused by a divide-by-zero when unmasked
-	 * exceptions aren't supported, the SIMD FPU status and control
-	 * word will be restored at the end of the test, so we don't need
-	 * to worry about doing it here.  Besides, we may not be able to...
-	 */
-	sc.eip += 3;
-
-	__cpu_detect_caps.hasSSE=0;
+   /* Both the "xorps %%xmm0,%%xmm0" and "divps %xmm0,%%xmm1"
+    * instructions are 3 bytes long.  We must increment the instruction
+    * pointer manually to avoid repeated execution of the offending
+    * instruction.
+    *
+    * If the SIGILL is caused by a divide-by-zero when unmasked
+    * exceptions aren't supported, the SIMD FPU status and control
+    * word will be restored at the end of the test, so we don't need
+    * to worry about doing it here.  Besides, we may not be able to...
+    */
+   sc.eip += 3;
+
+   util_cpu_caps.has_sse=0;
 }
 
-static void sigfpe_handler_sse(int signal, struct sigcontext sc)
+static void
+sigfpe_handler_sse(int signal, struct sigcontext sc)
 {
-	if (sc.fpstate->magic != 0xffff) {
-		/* Our signal context has the extended FPU state, so reset the
-		 * divide-by-zero exception mask and clear the divide-by-zero
-		 * exception bit.
-		 */
-		sc.fpstate->mxcsr |= 0x00000200;
-		sc.fpstate->mxcsr &= 0xfffffffb;
-	} else {
-		/* If we ever get here, we're completely hosed.
-		*/
-	}
+   if (sc.fpstate->magic != 0xffff) {
+      /* Our signal context has the extended FPU state, so reset the
+       * divide-by-zero exception mask and clear the divide-by-zero
+       * exception bit.
+       */
+      sc.fpstate->mxcsr |= 0x00000200;
+      sc.fpstate->mxcsr &= 0xfffffffb;
+   } else {
+      /* If we ever get here, we're completely hosed.
+      */
+   }
 }
-#endif
-#endif /* OS_LINUX && _POSIX_SOURCE && X86_FXSR_MAGIC */
+#endif /* PIPE_OS_LINUX && _POSIX_SOURCE && X86_FXSR_MAGIC */
 
-#if defined(OS_WIN32)
-LONG CALLBACK win32_sig_handler_sse(EXCEPTION_POINTERS* ep)
+#if defined(PIPE_OS_WINDOWS)
+static LONG CALLBACK
+win32_sig_handler_sse(EXCEPTION_POINTERS* ep)
 {
-	if(ep->ExceptionRecord->ExceptionCode==EXCEPTION_ILLEGAL_INSTRUCTION){
-		ep->ContextRecord->Eip +=3;
-		__cpu_detect_caps.hasSSE=0;
-		return EXCEPTION_CONTINUE_EXECUTION;
-	}
-	return EXCEPTION_CONTINUE_SEARCH;
+   if(ep->ExceptionRecord->ExceptionCode==EXCEPTION_ILLEGAL_INSTRUCTION){
+      ep->ContextRecord->Eip +=3;
+      util_cpu_caps.has_sse=0;
+      return EXCEPTION_CONTINUE_EXECUTION;
+   }
+   return EXCEPTION_CONTINUE_SEARCH;
 }
-#endif /* OS_WIN32 */
+#endif /* PIPE_OS_WINDOWS */
+
+#endif /* PIPE_ARCH_X86 */
 
 
-#if defined(ARCH_POWERPC) && !defined(OS_DARWIN)
+#if defined(PIPE_ARCH_PPC) && !defined(PIPE_OS_DARWIN)
 static sigjmp_buf __lv_powerpc_jmpbuf;
 static volatile sig_atomic_t __lv_powerpc_canjump = 0;
 
-static void sigill_handler (int sig);
-
-static void sigill_handler (int sig)
+static void
+sigill_handler(int sig)
 {
-	if (!__lv_powerpc_canjump) {
-		signal (sig, SIG_DFL);
-		raise (sig);
-	}
+   if (!__lv_powerpc_canjump) {
+      signal (sig, SIG_DFL);
+      raise (sig);
+   }
 
-	__lv_powerpc_canjump = 0;
-	siglongjmp(__lv_powerpc_jmpbuf, 1);
+   __lv_powerpc_canjump = 0;
+   siglongjmp(__lv_powerpc_jmpbuf, 1);
 }
 
-static void check_os_altivec_support(void)
+static void
+check_os_altivec_support(void)
 {
-#if defined(OS_DARWIN)
-	int sels[2] = {CTL_HW, HW_VECTORUNIT};
-	int has_vu = 0;
-	int len = sizeof (has_vu);
-	int err;
-
-	err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
-
-	if (err == 0) {
-		if (has_vu != 0) {
-			__cpu_detect_caps.hasAltiVec = 1;
-		}
-	}
-#else /* !OS_DARWIN */
-	/* no Darwin, do it the brute-force way */
-	/* this is borrowed from the libmpeg2 library */
-	signal(SIGILL, sigill_handler);
-	if (sigsetjmp(__lv_powerpc_jmpbuf, 1)) {
-		signal(SIGILL, SIG_DFL);
-	} else {
-		__lv_powerpc_canjump = 1;
-
-		__asm __volatile
-			("mtspr 256, %0\n\t"
-			 "vand %%v0, %%v0, %%v0"
-			 :
-			 : "r" (-1));
-
-		signal(SIGILL, SIG_DFL);
-		__cpu_detect_caps.hasAltiVec = 1;
-	}
+#if defined(PIPE_OS_DARWIN)
+   int sels[2] = {CTL_HW, HW_VECTORUNIT};
+   int has_vu = 0;
+   int len = sizeof (has_vu);
+   int err;
+
+   err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
+
+   if (err == 0) {
+      if (has_vu != 0) {
+         util_cpu_caps.has_altivec = 1;
+      }
+   }
+#else /* !PIPE_OS_DARWIN */
+   /* no Darwin, do it the brute-force way */
+   /* this is borrowed from the libmpeg2 library */
+   signal(SIGILL, sigill_handler);
+   if (sigsetjmp(__lv_powerpc_jmpbuf, 1)) {
+      signal(SIGILL, SIG_DFL);
+   } else {
+      __lv_powerpc_canjump = 1;
+
+      __asm __volatile
+         ("mtspr 256, %0\n\t"
+          "vand %%v0, %%v0, %%v0"
+          :
+          : "r" (-1));
+
+      signal(SIGILL, SIG_DFL);
+      util_cpu_caps.has_altivec = 1;
+   }
 #endif
 }
 #endif
@@ -189,318 +190,312 @@ static void check_os_altivec_support(void)
  * and RedHat patched 2.2 kernels that have broken exception handling
  * support for user space apps that do SSE.
  */
-static void check_os_katmai_support(void)
+static void
+check_os_katmai_support(void)
 {
-#if defined(ARCH_X86)
-#if defined(OS_FREEBSD)
-	int has_sse=0, ret;
-	int len = sizeof (has_sse);
-
-	ret = sysctlbyname("hw.instruction_sse", &has_sse, &len, NULL, 0);
-	if (ret || !has_sse)
-		__cpu_detect_caps.hasSSE=0;
-
-#elif defined(OS_NETBSD) || defined(OS_OPENBSD)
-	int has_sse, has_sse2, ret, mib[2];
-	int varlen;
-
-	mib[0] = CTL_MACHDEP;
-	mib[1] = CPU_SSE;
-	varlen = sizeof (has_sse);
-
-	ret = sysctl(mib, 2, &has_sse, &varlen, NULL, 0);
-	if (ret < 0 || !has_sse) {
-		__cpu_detect_caps.hasSSE = 0;
-	} else {
-		__cpu_detect_caps.hasSSE = 1;
-	}
-
-	mib[1] = CPU_SSE2;
-	varlen = sizeof (has_sse2);
-	ret = sysctl(mib, 2, &has_sse2, &varlen, NULL, 0);
-	if (ret < 0 || !has_sse2) {
-		__cpu_detect_caps.hasSSE2 = 0;
-	} else {
-		__cpu_detect_caps.hasSSE2 = 1;
-	}
-	__cpu_detect_caps.hasSSE = 0; /* FIXME ?!?!? */
-
-#elif defined(OS_WIN32)
-	LPTOP_LEVEL_EXCEPTION_FILTER exc_fil;
-	if (__cpu_detect_caps.hasSSE) {
-		exc_fil = SetUnhandledExceptionFilter(win32_sig_handler_sse);
-		__asm __volatile ("xorps %xmm0, %xmm0");
-		SetUnhandledExceptionFilter(exc_fil);
-	}
-#elif defined(OS_LINUX)
-	struct sigaction saved_sigill;
-	struct sigaction saved_sigfpe;
-
-	/* Save the original signal handlers.
-	*/
-	sigaction(SIGILL, NULL, &saved_sigill);
-	sigaction(SIGFPE, NULL, &saved_sigfpe);
-
-	signal(SIGILL, (void (*)(int))sigill_handler_sse);
-	signal(SIGFPE, (void (*)(int))sigfpe_handler_sse);
-
-	/* Emulate test for OSFXSR in CR4.  The OS will set this bit if it
-	 * supports the extended FPU save and restore required for SSE.  If
-	 * we execute an SSE instruction on a PIII and get a SIGILL, the OS
-	 * doesn't support Streaming SIMD Exceptions, even if the processor
-	 * does.
-	 */
-	if (__cpu_detect_caps.hasSSE) {
-		__asm __volatile ("xorps %xmm1, %xmm0");
-	}
-
-	/* Emulate test for OSXMMEXCPT in CR4.  The OS will set this bit if
-	 * it supports unmasked SIMD FPU exceptions.  If we unmask the
-	 * exceptions, do a SIMD divide-by-zero and get a SIGILL, the OS
-	 * doesn't support unmasked SIMD FPU exceptions.  If we get a SIGFPE
-	 * as expected, we're okay but we need to clean up after it.
-	 *
-	 * Are we being too stringent in our requirement that the OS support
-	 * unmasked exceptions?  Certain RedHat 2.2 kernels enable SSE by
-	 * setting CR4.OSFXSR but don't support unmasked exceptions.  Win98
-	 * doesn't even support them.  We at least know the user-space SSE
-	 * support is good in kernels that do support unmasked exceptions,
-	 * and therefore to be safe I'm going to leave this test in here.
-	 */
-	if (__cpu_detect_caps.hasSSE) {
-		//      test_os_katmai_exception_support();
-	}
-
-	/* Restore the original signal handlers.
-	*/
-	sigaction(SIGILL, &saved_sigill, NULL);
-	sigaction(SIGFPE, &saved_sigfpe, NULL);
+#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_OS_FREEBSD)
+   int has_sse=0, ret;
+   int len = sizeof (has_sse);
+
+   ret = sysctlbyname("hw.instruction_sse", &has_sse, &len, NULL, 0);
+   if (ret || !has_sse)
+      util_cpu_caps.has_sse=0;
+
+#elif defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD)
+   int has_sse, has_sse2, ret, mib[2];
+   int varlen;
+
+   mib[0] = CTL_MACHDEP;
+   mib[1] = CPU_SSE;
+   varlen = sizeof (has_sse);
+
+   ret = sysctl(mib, 2, &has_sse, &varlen, NULL, 0);
+   if (ret < 0 || !has_sse) {
+      util_cpu_caps.has_sse = 0;
+   } else {
+      util_cpu_caps.has_sse = 1;
+   }
+
+   mib[1] = CPU_SSE2;
+   varlen = sizeof (has_sse2);
+   ret = sysctl(mib, 2, &has_sse2, &varlen, NULL, 0);
+   if (ret < 0 || !has_sse2) {
+      util_cpu_caps.has_sse2 = 0;
+   } else {
+      util_cpu_caps.has_sse2 = 1;
+   }
+   util_cpu_caps.has_sse = 0; /* FIXME ?!?!? */
+
+#elif defined(PIPE_OS_WINDOWS)
+   LPTOP_LEVEL_EXCEPTION_FILTER exc_fil;
+   if (util_cpu_caps.has_sse) {
+      exc_fil = SetUnhandledExceptionFilter(win32_sig_handler_sse);
+#if defined(PIPE_CC_GCC)
+      __asm __volatile ("xorps %xmm0, %xmm0");
+#elif defined(PIPE_CC_MSVC)
+      __asm {
+          xorps xmm0, xmm0        // executing SSE instruction
+      }
+#else
+#error Unsupported compiler
+#endif
+      SetUnhandledExceptionFilter(exc_fil);
+   }
+#elif defined(PIPE_OS_LINUX)
+   struct sigaction saved_sigill;
+   struct sigaction saved_sigfpe;
+
+   /* Save the original signal handlers.
+   */
+   sigaction(SIGILL, NULL, &saved_sigill);
+   sigaction(SIGFPE, NULL, &saved_sigfpe);
+
+   signal(SIGILL, (void (*)(int))sigill_handler_sse);
+   signal(SIGFPE, (void (*)(int))sigfpe_handler_sse);
+
+   /* Emulate test for OSFXSR in CR4.  The OS will set this bit if it
+    * supports the extended FPU save and restore required for SSE.  If
+    * we execute an SSE instruction on a PIII and get a SIGILL, the OS
+    * doesn't support Streaming SIMD Exceptions, even if the processor
+    * does.
+    */
+   if (util_cpu_caps.has_sse) {
+      __asm __volatile ("xorps %xmm1, %xmm0");
+   }
+
+   /* Emulate test for OSXMMEXCPT in CR4.  The OS will set this bit if
+    * it supports unmasked SIMD FPU exceptions.  If we unmask the
+    * exceptions, do a SIMD divide-by-zero and get a SIGILL, the OS
+    * doesn't support unmasked SIMD FPU exceptions.  If we get a SIGFPE
+    * as expected, we're okay but we need to clean up after it.
+    *
+    * Are we being too stringent in our requirement that the OS support
+    * unmasked exceptions?  Certain RedHat 2.2 kernels enable SSE by
+    * setting CR4.OSFXSR but don't support unmasked exceptions.  Win98
+    * doesn't even support them.  We at least know the user-space SSE
+    * support is good in kernels that do support unmasked exceptions,
+    * and therefore to be safe I'm going to leave this test in here.
+    */
+   if (util_cpu_caps.has_sse) {
+      //      test_os_katmai_exception_support();
+   }
+
+   /* Restore the original signal handlers.
+   */
+   sigaction(SIGILL, &saved_sigill, NULL);
+   sigaction(SIGFPE, &saved_sigfpe, NULL);
 
 #else
-	/* We can't use POSIX signal handling to test the availability of
-	 * SSE, so we disable it by default.
-	 */
-	__cpu_detect_caps.hasSSE = 0;
+   /* We can't use POSIX signal handling to test the availability of
+    * SSE, so we disable it by default.
+    */
+   util_cpu_caps.has_sse = 0;
 #endif /* __linux__ */
 #endif
+
+#if defined(PIPE_ARCH_X86_64)
+   util_cpu_caps.has_sse = 1;
+#endif
 }
 
 
 static int has_cpuid(void)
 {
-#if defined(ARCH_X86)
-	int a, c;
-
-	__asm __volatile
-		("pushf\n"
-		 "popl %0\n"
-		 "movl %0, %1\n"
-		 "xorl $0x200000, %0\n"
-		 "push %0\n"
-		 "popf\n"
-		 "pushf\n"
-		 "popl %0\n"
-		 : "=a" (a), "=c" (c)
-		 :
-		 : "cc");
-
-	return a != c;
+#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_OS_GCC)
+   int a, c;
+
+   __asm __volatile
+      ("pushf\n"
+       "popl %0\n"
+       "movl %0, %1\n"
+       "xorl $0x200000, %0\n"
+       "push %0\n"
+       "popf\n"
+       "pushf\n"
+       "popl %0\n"
+       : "=a" (a), "=c" (c)
+       :
+       : "cc");
+
+   return a != c;
+#else
+   /* FIXME */
+   return 1;
+#endif
+#elif defined(PIPE_ARCH_X86_64)
+   return 1;
 #else
-	return 0;
+   return 0;
 #endif
 }
 
-static int cpuid(unsigned int ax, unsigned int *p)
+static INLINE int
+cpuid(unsigned int ax, unsigned int *p)
 {
-#if defined(ARCH_X86)
-	unsigned int flags;
-
-	__asm __volatile
-		("movl %%ebx, %%esi\n\t"
-		 "cpuid\n\t"
-		 "xchgl %%ebx, %%esi"
-		 : "=a" (p[0]), "=S" (p[1]),
-		 "=c" (p[2]), "=d" (p[3])
-		 : "0" (ax));
-
-	return 0;
-#else
-	return -1;
+   int ret = -1;
+
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+#if defined(PIPE_CC_GCC)
+   __asm __volatile
+      ("movl %%ebx, %%esi\n\t"
+       "cpuid\n\t"
+       "xchgl %%ebx, %%esi"
+       : "=a" (p[0]), "=S" (p[1]),
+       "=c" (p[2]), "=d" (p[3])
+       : "0" (ax));
+
+   ret = 0;
+#elif defined(PIPE_CC_MSVC)
+   __cpuid(ax, p);
+
+   ret = 0;
+#endif
 #endif
+
+   return ret;
 }
 
-void cpu_detect_initialize()
+void
+util_cpu_detect(void)
 {
-	unsigned int regs[4];
-	unsigned int regs2[4];
-
-	int mib[2], ncpu;
-	int len;
-
-	memset(&__cpu_detect_caps, 0, sizeof (struct cpu_detect_caps));
-
-	/* Check for arch type */
-#if defined(ARCH_MIPS)
-	__cpu_detect_caps.type = CPU_DETECT_TYPE_MIPS;
-#elif defined(ARCH_ALPHA)
-	__cpu_detect_caps.type = CPU_DETECT_TYPE_ALPHA;
-#elif defined(ARCH_SPARC)
-	__cpu_detect_caps.type = CPU_DETECT_TYPE_SPARC;
-#elif defined(ARCH_X86)
-	__cpu_detect_caps.type = CPU_DETECT_TYPE_X86;
-#elif defined(ARCH_POWERPC)
-	__cpu_detect_caps.type = CPU_DETECT_TYPE_POWERPC;
+   static boolean util_cpu_detect_initialized = FALSE;
+
+   if(util_cpu_detect_initialized)
+      return;
+
+   memset(&util_cpu_caps, 0, sizeof util_cpu_caps);
+
+   /* Check for arch type */
+#if defined(PIPE_ARCH_MIPS)
+   util_cpu_caps.arch = UTIL_CPU_ARCH_MIPS;
+#elif defined(PIPE_ARCH_ALPHA)
+   util_cpu_caps.arch = UTIL_CPU_ARCH_ALPHA;
+#elif defined(PIPE_ARCH_SPARC)
+   util_cpu_caps.arch = UTIL_CPU_ARCH_SPARC;
+#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   util_cpu_caps.arch = UTIL_CPU_ARCH_X86;
+#elif defined(PIPE_ARCH_PPC)
+   util_cpu_caps.arch = UTIL_CPU_ARCH_POWERPC;
 #else
-	__cpu_detect_caps.type = CPU_DETECT_TYPE_OTHER;
+   util_cpu_caps.arch = UTIL_CPU_ARCH_UNKNOWN;
 #endif
 
-	/* Count the number of CPUs in system */
-#if !defined(OS_WIN32) && !defined(OS_UNKNOWN) && defined(_SC_NPROCESSORS_ONLN)
-	__cpu_detect_caps.nrcpu = sysconf(_SC_NPROCESSORS_ONLN);
-	if (__cpu_detect_caps.nrcpu == -1)
-		__cpu_detect_caps.nrcpu = 1;
-
-#elif defined(OS_NETBSD) || defined(OS_FREEBSD) || defined(OS_OPENBSD)
+   /* Count the number of CPUs in system */
+#if !defined(PIPE_OS_WINDOWS) && !defined(PIPE_OS_UNKNOWN) && defined(_SC_NPROCESSORS_ONLN)
+   util_cpu_caps.nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+   if (util_cpu_caps.nr_cpus == -1)
+      util_cpu_caps.nr_cpus = 1;
 
-	mib[0] = CTL_HW;
-	mib[1] = HW_NCPU;
+#elif defined(PIPE_OS_NETBSD) || defined(PIPE_OS_FREEBSD) || defined(PIPE_OS_OPENBSD)
+   {
+      int mib[2], ncpu;
+      int len;
 
-	len = sizeof (ncpu);
-	sysctl(mib, 2, &ncpu, &len, NULL, 0);
-	__cpu_detect_caps.nrcpu = ncpu;
+      mib[0] = CTL_HW;
+      mib[1] = HW_NCPU;
 
+      len = sizeof (ncpu);
+      sysctl(mib, 2, &ncpu, &len, NULL, 0);
+      util_cpu_caps.nr_cpus = ncpu;
+   }
 #else
-	__cpu_detect_caps.nrcpu = 1;
+   util_cpu_caps.nr_cpus = 1;
 #endif
 
-#if defined(ARCH_X86)
-	/* No cpuid, old 486 or lower */
-	if (has_cpuid() == 0)
-		return;
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   if (has_cpuid()) {
+      unsigned int regs[4];
+      unsigned int regs2[4];
 
-	__cpu_detect_caps.cacheline = 32;
+      util_cpu_caps.cacheline = 32;
 
-	/* Get max cpuid level */
-	cpuid(0x00000000, regs);
+      /* Get max cpuid level */
+      cpuid(0x00000000, regs);
 
-	if (regs[0] >= 0x00000001) {
-		unsigned int cacheline;
+      if (regs[0] >= 0x00000001) {
+         unsigned int cacheline;
 
-		cpuid (0x00000001, regs2);
+         cpuid (0x00000001, regs2);
 
-		__cpu_detect_caps.x86cpuType = (regs2[0] >> 8) & 0xf;
-		if (__cpu_detect_caps.x86cpuType == 0xf)
-		    __cpu_detect_caps.x86cpuType = 8 + ((regs2[0] >> 20) & 255); /* use extended family (P4, IA64) */
+         util_cpu_caps.x86_cpu_type = (regs2[0] >> 8) & 0xf;
+         if (util_cpu_caps.x86_cpu_type == 0xf)
+             util_cpu_caps.x86_cpu_type = 8 + ((regs2[0] >> 20) & 255); /* use extended family (P4, IA64) */
 
-		/* general feature flags */
-		__cpu_detect_caps.hasTSC  = (regs2[3] & (1 << 8  )) >>  8; /* 0x0000010 */
-		__cpu_detect_caps.hasMMX  = (regs2[3] & (1 << 23 )) >> 23; /* 0x0800000 */
-		__cpu_detect_caps.hasSSE  = (regs2[3] & (1 << 25 )) >> 25; /* 0x2000000 */
-		__cpu_detect_caps.hasSSE2 = (regs2[3] & (1 << 26 )) >> 26; /* 0x4000000 */
-		__cpu_detect_caps.hasSSE3 = (regs2[2] & (1));	       /* 0x0000001 */
-		__cpu_detect_caps.hasSSSE3 = (regs2[2] & (1 << 9 )) >> 9;   /* 0x0000020 */
-		__cpu_detect_caps.hasMMX2 = __cpu_detect_caps.hasSSE; /* SSE cpus supports mmxext too */
+         /* general feature flags */
+         util_cpu_caps.has_tsc    = (regs2[3] & (1 << 8  )) >>  8; /* 0x0000010 */
+         util_cpu_caps.has_mmx    = (regs2[3] & (1 << 23 )) >> 23; /* 0x0800000 */
+         util_cpu_caps.has_sse    = (regs2[3] & (1 << 25 )) >> 25; /* 0x2000000 */
+         util_cpu_caps.has_sse2   = (regs2[3] & (1 << 26 )) >> 26; /* 0x4000000 */
+         util_cpu_caps.has_sse3   = (regs2[2] & (1));          /* 0x0000001 */
+         util_cpu_caps.has_ssse3  = (regs2[2] & (1 << 9 )) >> 9;   /* 0x0000020 */
+         util_cpu_caps.has_sse4_1 = (regs2[2] & (1 << 19)) >> 19;
+         util_cpu_caps.has_mmx2   = util_cpu_caps.has_sse; /* SSE cpus supports mmxext too */
 
-		cacheline = ((regs2[1] >> 8) & 0xFF) * 8;
-		if (cacheline > 0)
-			__cpu_detect_caps.cacheline = cacheline;
-	}
+         cacheline = ((regs2[1] >> 8) & 0xFF) * 8;
+         if (cacheline > 0)
+            util_cpu_caps.cacheline = cacheline;
+      }
 
-	cpuid(0x80000000, regs);
+      cpuid(0x80000000, regs);
 
-	if (regs[0] >= 0x80000001) {
+      if (regs[0] >= 0x80000001) {
 
-		cpuid(0x80000001, regs2);
+         cpuid(0x80000001, regs2);
 
-		__cpu_detect_caps.hasMMX  |= (regs2[3] & (1 << 23 )) >> 23; /* 0x0800000 */
-		__cpu_detect_caps.hasMMX2 |= (regs2[3] & (1 << 22 )) >> 22; /* 0x400000 */
-		__cpu_detect_caps.has3DNow    = (regs2[3] & (1 << 31 )) >> 31; /* 0x80000000 */
-		__cpu_detect_caps.has3DNowExt = (regs2[3] & (1 << 30 )) >> 30;
-	}
+         util_cpu_caps.has_mmx  |= (regs2[3] & (1 << 23 )) >> 23; /* 0x0800000 */
+         util_cpu_caps.has_mmx2 |= (regs2[3] & (1 << 22 )) >> 22; /* 0x400000 */
+         util_cpu_caps.has_3dnow    = (regs2[3] & (1 << 31 )) >> 31; /* 0x80000000 */
+         util_cpu_caps.has_3dnow_ext = (regs2[3] & (1 << 30 )) >> 30;
+      }
 
-	if (regs[0] >= 0x80000006) {
-		cpuid(0x80000006, regs2);
-		__cpu_detect_caps.cacheline = regs2[2] & 0xFF;
-	}
+      if (regs[0] >= 0x80000006) {
+         cpuid(0x80000006, regs2);
+         util_cpu_caps.cacheline = regs2[2] & 0xFF;
+      }
 
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_FREEBSD) || defined(PIPE_OS_NETBSD) || defined(PIPE_OS_CYGWIN) || defined(PIPE_OS_OPENBSD)
+      if (util_cpu_caps.has_sse)
+         check_os_katmai_support();
 
-#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_CYGWIN) || defined(OS_OPENBSD)
-	if (__cpu_detect_caps.hasSSE)
-		check_os_katmai_support();
-
-	if (!__cpu_detect_caps.hasSSE) {
-		__cpu_detect_caps.hasSSE2 = 0;
-		__cpu_detect_caps.hasSSE3 = 0;
-		__cpu_detect_caps.hasSSSE3 = 0;
-	}
+      if (!util_cpu_caps.has_sse) {
+         util_cpu_caps.has_sse2 = 0;
+         util_cpu_caps.has_sse3 = 0;
+         util_cpu_caps.has_ssse3 = 0;
+      }
 #else
-	__cpu_detect_caps.hasSSE = 0;
-	__cpu_detect_caps.hasSSE2 = 0;
-	__cpu_detect_caps.hasSSE3 = 0;
-	__cpu_detect_caps.hasSSSE3 = 0;
+      util_cpu_caps.has_sse = 0;
+      util_cpu_caps.has_sse2 = 0;
+      util_cpu_caps.has_sse3 = 0;
+      util_cpu_caps.has_ssse3 = 0;
+#endif
+   }
+#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */
+
+#if defined(PIPE_ARCH_PPC)
+   check_os_altivec_support();
+#endif /* PIPE_ARCH_PPC */
+
+#ifdef DEBUG
+   debug_printf("util_cpu_caps.arch = %i\n", util_cpu_caps.arch);
+   debug_printf("util_cpu_caps.nr_cpus = %u\n", util_cpu_caps.nr_cpus);
+
+   debug_printf("util_cpu_caps.x86_cpu_type = %u\n", util_cpu_caps.x86_cpu_type);
+   debug_printf("util_cpu_caps.cacheline = %u\n", util_cpu_caps.cacheline);
+
+   debug_printf("util_cpu_caps.has_tsc = %u\n", util_cpu_caps.has_tsc);
+   debug_printf("util_cpu_caps.has_mmx = %u\n", util_cpu_caps.has_mmx);
+   debug_printf("util_cpu_caps.has_mmx2 = %u\n", util_cpu_caps.has_mmx2);
+   debug_printf("util_cpu_caps.has_sse = %u\n", util_cpu_caps.has_sse);
+   debug_printf("util_cpu_caps.has_sse2 = %u\n", util_cpu_caps.has_sse2);
+   debug_printf("util_cpu_caps.has_sse3 = %u\n", util_cpu_caps.has_sse3);
+   debug_printf("util_cpu_caps.has_ssse3 = %u\n", util_cpu_caps.has_ssse3);
+   debug_printf("util_cpu_caps.has_sse4_1 = %u\n", util_cpu_caps.has_sse4_1);
+   debug_printf("util_cpu_caps.has_3dnow = %u\n", util_cpu_caps.has_3dnow);
+   debug_printf("util_cpu_caps.has_3dnow_ext = %u\n", util_cpu_caps.has_3dnow_ext);
+   debug_printf("util_cpu_caps.has_altivec = %u\n", util_cpu_caps.has_altivec);
 #endif
-#endif /* ARCH_X86 */
-
-#if defined(ARCH_POWERPC)
-	check_os_altivec_support();
-#endif /* ARCH_POWERPC */
-
-	__cpu_detect_initialized = 1;
-}
-
-struct cpu_detect_caps *cpu_detect_get_caps()
-{
-	return &__cpu_detect_caps;
-}
-
-/* The getters and setters for feature flags */
-int cpu_detect_get_tsc()
-{
-	return __cpu_detect_caps.hasTSC;
-}
-
-int cpu_detect_get_mmx()
-{
-	return __cpu_detect_caps.hasMMX;
-}
-
-int cpu_detect_get_mmx2()
-{
-	return __cpu_detect_caps.hasMMX2;
-}
-
-int cpu_detect_get_sse()
-{
-	return __cpu_detect_caps.hasSSE;
-}
-
-int cpu_detect_get_sse2()
-{
-	return __cpu_detect_caps.hasSSE2;
-}
-
-int cpu_detect_get_sse3()
-{
-	return __cpu_detect_caps.hasSSE3;
-}
-
-int cpu_detect_get_ssse3()
-{
-	return __cpu_detect_caps.hasSSSE3;
-}
-
-int cpu_detect_get_3dnow()
-{
-	return __cpu_detect_caps.has3DNow;
-}
-
-int cpu_detect_get_3dnow2()
-{
-	return __cpu_detect_caps.has3DNowExt;
-}
 
-int cpu_detect_get_altivec()
-{
-	return __cpu_detect_caps.hasAltiVec;
+   util_cpu_detect_initialized = TRUE;
 }
-
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.h b/src/gallium/auxiliary/util/u_cpu_detect.h
index 1612d49286..7ea0121c07 100644
--- a/src/gallium/auxiliary/util/u_cpu_detect.h
+++ b/src/gallium/auxiliary/util/u_cpu_detect.h
@@ -24,55 +24,53 @@
  *
  ***************************************************************************/
 
-/*
- * Based on the work of Eric Anholt <anholt@FreeBSD.org>
+/**
+ * @file
+ * CPU feature detection.
+ *
+ * @author Dennis Smit
+ * @author Based on the work of Eric Anholt <anholt@FreeBSD.org>
  */
 
-#ifndef _CPU_DETECT_H
-#define _CPU_DETECT_H
+#ifndef _UTIL_CPU_DETECT_H
+#define _UTIL_CPU_DETECT_H
+
+#include "pipe/p_compiler.h"
 
-typedef enum {
-	CPU_DETECT_TYPE_MIPS,
-	CPU_DETECT_TYPE_ALPHA,
-	CPU_DETECT_TYPE_SPARC,
-	CPU_DETECT_TYPE_X86,
-	CPU_DETECT_TYPE_POWERPC,
-	CPU_DETECT_TYPE_OTHER
-} cpu_detect_type;
+enum util_cpu_arch {
+   UTIL_CPU_ARCH_UNKNOWN = 0,
+   UTIL_CPU_ARCH_MIPS,
+   UTIL_CPU_ARCH_ALPHA,
+   UTIL_CPU_ARCH_SPARC,
+   UTIL_CPU_ARCH_X86,
+   UTIL_CPU_ARCH_POWERPC
+};
 
-struct cpu_detect_caps {
-	cpu_detect_type	type;
-	int		nrcpu;
+struct util_cpu_caps {
+   enum util_cpu_arch arch;
+   unsigned nr_cpus;
 
-	/* Feature flags */
-	int		x86cpuType;
-	int		cacheline;
+   /* Feature flags */
+   int x86_cpu_type;
+   unsigned cacheline;
 
-	int		hasTSC;
-	int		hasMMX;
-	int		hasMMX2;
-	int		hasSSE;
-	int		hasSSE2;
-	int		hasSSE3;
-	int		hasSSSE3;
-	int		has3DNow;
-	int		has3DNowExt;
-	int		hasAltiVec;
+   unsigned has_tsc:1;
+   unsigned has_mmx:1;
+   unsigned has_mmx2:1;
+   unsigned has_sse:1;
+   unsigned has_sse2:1;
+   unsigned has_sse3:1;
+   unsigned has_ssse3:1;
+   unsigned has_sse4_1:1;
+   unsigned has_3dnow:1;
+   unsigned has_3dnow_ext:1;
+   unsigned has_altivec:1;
 };
 
-/* prototypes */
-void cpu_detect_initialize(void);
-struct cpu_detect_caps *cpu_detect_get_caps(void);
+extern struct util_cpu_caps
+util_cpu_caps;
+
+void util_cpu_detect(void);
 
-int cpu_detect_get_tsc(void);
-int cpu_detect_get_mmx(void);
-int cpu_detect_get_mmx2(void);
-int cpu_detect_get_sse(void);
-int cpu_detect_get_sse2(void);
-int cpu_detect_get_sse3(void);
-int cpu_detect_get_ssse3(void);
-int cpu_detect_get_3dnow(void);
-int cpu_detect_get_3dnow2(void);
-int cpu_detect_get_altivec(void);
 
-#endif /* _CPU_DETECT_H */
+#endif /* _UTIL_CPU_DETECT_H */
diff --git a/src/gallium/auxiliary/util/u_debug.h b/src/gallium/auxiliary/util/u_debug.h
index 1380d98d7e..b82e7cb4d4 100644
--- a/src/gallium/auxiliary/util/u_debug.h
+++ b/src/gallium/auxiliary/util/u_debug.h
@@ -65,6 +65,11 @@ extern "C" {
 #define __FUNCTION__ "???"
 #endif
 
+#if defined(__GNUC__)
+#define _util_printf_format(fmt, list) __attribute__ ((format (printf, fmt, list)))
+#else
+#define _util_printf_format(fmt, list)
+#endif
 
 void _debug_vprintf(const char *format, va_list ap);
    
@@ -82,14 +87,17 @@ _debug_printf(const char *format, ...)
 /**
  * Print debug messages.
  *
- * The actual channel used to output debug message is platform specific. To 
- * avoid misformating or truncation, follow these rules of thumb:   
+ * The actual channel used to output debug message is platform specific. To
+ * avoid misformating or truncation, follow these rules of thumb:
  * - output whole lines
- * - avoid outputing large strings (512 bytes is the current maximum length 
+ * - avoid outputing large strings (512 bytes is the current maximum length
  * that is guaranteed to be printed in all platforms)
  */
 #if !defined(PIPE_OS_HAIKU)
 static INLINE void
+debug_printf(const char *format, ...) _util_printf_format(1,2);
+
+static INLINE void
 debug_printf(const char *format, ...)
 {
 #ifdef DEBUG
diff --git a/src/gallium/auxiliary/util/u_fifo.h b/src/gallium/auxiliary/util/u_fifo.h
new file mode 100644
index 0000000000..9e007de1ad
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_fifo.h
@@ -0,0 +1,94 @@
+/**************************************************************************
+ *
+ * Copyright © 2009 Jakob Bornecrantz
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef U_FIFO_H
+#define U_FIFO_H
+
+#include "util/u_memory.h"
+
+struct util_fifo
+{
+   size_t head;
+   size_t tail;
+   size_t num;
+   size_t size;
+};
+
+static INLINE struct util_fifo *
+u_fifo_create(size_t size)
+{
+   struct util_fifo *fifo;
+   fifo = MALLOC(sizeof(*fifo) + size * sizeof(void*));
+
+   fifo->head = 0;
+   fifo->tail = 0;
+   fifo->num = 0;
+   fifo->size = size;
+
+   return fifo;
+}
+
+static INLINE boolean
+u_fifo_add(struct util_fifo *fifo, void *ptr)
+{
+   void **array = (void**)&fifo[1];
+   if (fifo->num >= fifo->size)
+      return FALSE;
+
+   if (++fifo->head >= fifo->size)
+      fifo->head = 0;
+
+   array[fifo->head] = ptr;
+
+   ++fifo->num;
+
+   return TRUE;
+}
+
+static INLINE boolean
+u_fifo_pop(struct util_fifo *fifo, void **ptr)
+{
+   void **array = (void**)&fifo[1];
+
+   if (!fifo->num)
+      return FALSE;
+
+   if (++fifo->tail >= fifo->size)
+      fifo->tail = 0;
+
+   *ptr = array[fifo->tail];
+
+   ++fifo->num;
+
+   return TRUE;
+}
+
+static INLINE void
+u_fifo_destroy(struct util_fifo *fifo)
+{
+   FREE(fifo);
+}
+
+#endif
diff --git a/src/gallium/auxiliary/util/u_format.csv b/src/gallium/auxiliary/util/u_format.csv
index 00a46d0cc4..f1bf94f17d 100644
--- a/src/gallium/auxiliary/util/u_format.csv
+++ b/src/gallium/auxiliary/util/u_format.csv
@@ -2,7 +2,7 @@ PIPE_FORMAT_A8R8G8B8_UNORM        , arith , 1, 1, un8 , un8 , un8 , un8 , zyxw,
 PIPE_FORMAT_X8R8G8B8_UNORM        , arith , 1, 1, un8 , un8 , un8 , un8 , zyx1, rgb
 PIPE_FORMAT_B8G8R8A8_UNORM        , arith , 1, 1, un8 , un8 , un8 , un8 , yzwx, rgb
 PIPE_FORMAT_B8G8R8X8_UNORM        , arith , 1, 1, un8 , un8 , un8 , un8 , yzw1, rgb
-PIPE_FORMAT_A1R5G5B5_UNORM        , arith , 1, 1, un1 , un5 , un5 , un5 , zyxw, rgb
+PIPE_FORMAT_A1R5G5B5_UNORM        , arith , 1, 1, un5 , un5 , un5 , un1 , zyxw, rgb
 PIPE_FORMAT_A4R4G4B4_UNORM        , arith , 1, 1, un4 , un4 , un4 , un4 , zyxw, rgb
 PIPE_FORMAT_R5G6B5_UNORM          , arith , 1, 1, un5 , un6 , un5 ,     , zyx1, rgb
 PIPE_FORMAT_A2B10G10R10_UNORM     , arith , 1, 1, un10, un10, un10, un2 , xyzw, rgb
@@ -14,10 +14,10 @@ PIPE_FORMAT_L16_UNORM             , arith , 1, 1, un16,     ,     ,     , xxx1,
 PIPE_FORMAT_Z16_UNORM             , array , 1, 1, un16,     ,     ,     , x___, zs 
 PIPE_FORMAT_Z32_UNORM             , array , 1, 1, un32,     ,     ,     , x___, zs 
 PIPE_FORMAT_Z32_FLOAT             , array , 1, 1, f32 ,     ,     ,     , x___, zs 
-PIPE_FORMAT_S8Z24_UNORM           , arith , 1, 1, un8 , un24,     ,     , yx__, zs 
-PIPE_FORMAT_Z24S8_UNORM           , arith , 1, 1, un24, un8 ,     ,     , xy__, zs 
-PIPE_FORMAT_X8Z24_UNORM           , arith , 1, 1, un8 , un24,     ,     , y___, zs 
-PIPE_FORMAT_Z24X8_UNORM           , arith , 1, 1, un24, un8 ,     ,     , x___, zs 
+PIPE_FORMAT_S8Z24_UNORM           , arith , 1, 1, un24, un8 ,     ,     , xy__, zs 
+PIPE_FORMAT_Z24S8_UNORM           , arith , 1, 1, un8 , un24,     ,     , yx__, zs 
+PIPE_FORMAT_X8Z24_UNORM           , arith , 1, 1, un24, un8 ,     ,     , x___, zs 
+PIPE_FORMAT_Z24X8_UNORM           , arith , 1, 1, un8 , un24,     ,     , y___, zs 
 PIPE_FORMAT_S8_UNORM              , array , 1, 1, un8 ,     ,     ,     , _x__, zs 
 PIPE_FORMAT_R64_FLOAT             , array , 1, 1, f64 ,     ,     ,     , x001, rgb
 PIPE_FORMAT_R64G64_FLOAT          , array , 1, 1, f64 , f64 ,     ,     , xy01, rgb
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index 4c6c2bc00e..75b075f160 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -283,6 +283,14 @@ util_fast_pow(float x, float y)
    return util_fast_exp2(util_fast_log2(x) * y);
 }
 
+/* Note that this counts zero as a power of two.
+ */
+static INLINE boolean
+util_is_power_of_two( unsigned v )
+{
+   return (v & (v-1)) == 0;
+}
+
 
 /**
  * Floor(x), returned as int.
@@ -341,10 +349,22 @@ util_is_inf_or_nan(float x)
 
 
 /**
+ * Test whether x is a power of two.
+ */
+static INLINE boolean
+util_is_pot(unsigned x)
+{
+   return (x & (x - 1)) == 0;
+}
+
+
+/**
  * Find first bit set in word.  Least significant bit is 1.
  * Return 0 if no bits set.
  */
-#if defined(_MSC_VER) && _MSC_VER >= 1300
+#if defined(_MSC_VER) && _MSC_VER >= 1300 && (_M_IX86 || _M_AMD64 || _M_IA64)
+unsigned char _BitScanForward(unsigned long* Index, unsigned long Mask);
+#pragma intrinsic(_BitScanForward)
 static INLINE
 unsigned long ffs( unsigned long u )
 {
@@ -451,6 +471,26 @@ util_logbase2(unsigned n)
 
 
 /**
+ * Returns the smallest power of two >= x
+ */
+static INLINE unsigned
+util_next_power_of_two(unsigned x)
+{
+   unsigned i;
+
+   if (x == 0)
+      return 1;
+
+   --x;
+
+   for (i = 1; i < sizeof(unsigned) * 8; i <<= 1)
+      x |= x >> i;
+
+   return x + 1;
+}
+
+
+/**
  * Clamp X to [MIN, MAX].
  * This is a macro to allow float, int, uint, etc. types.
  */
diff --git a/src/gallium/auxiliary/util/u_simple_screen.c b/src/gallium/auxiliary/util/u_simple_screen.c
index f01296b40f..5238299015 100644
--- a/src/gallium/auxiliary/util/u_simple_screen.c
+++ b/src/gallium/auxiliary/util/u_simple_screen.c
@@ -52,8 +52,7 @@ pass_user_buffer_create(struct pipe_screen *screen,
                         unsigned bytes)
 {
    struct pipe_buffer *buffer =
-      screen->winsys->user_buffer_create(screen->winsys,
-                                             ptr, bytes);
+      screen->winsys->user_buffer_create(screen->winsys, ptr, bytes);
 
    buffer->screen = screen;
 
@@ -69,9 +68,8 @@ pass_surface_buffer_create(struct pipe_screen *screen,
                            unsigned *stride)
 {
    struct pipe_buffer *buffer =
-      screen->winsys->surface_buffer_create(screen->winsys,
-                                                width, height,
-                                                format, usage, tex_usage, stride);
+      screen->winsys->surface_buffer_create(screen->winsys, width, height,
+                                            format, usage, tex_usage, stride);
 
    buffer->screen = screen;
 
@@ -83,8 +81,7 @@ pass_buffer_map(struct pipe_screen *screen,
                 struct pipe_buffer *buf,
                 unsigned usage)
 {
-   return screen->winsys->buffer_map(screen->winsys,
-                                     buf, usage);
+   return screen->winsys->buffer_map(screen->winsys, buf, usage);
 }
 
 static void
@@ -106,8 +103,7 @@ pass_flush_frontbuffer(struct pipe_screen *screen,
                        struct pipe_surface *surf,
                        void *context_private)
 {
-   screen->winsys->flush_frontbuffer(screen->winsys,
-                                     surf, context_private);
+   screen->winsys->flush_frontbuffer(screen->winsys, surf, context_private);
 }
 
 static void
@@ -115,8 +111,7 @@ pass_fence_reference(struct pipe_screen *screen,
                      struct pipe_fence_handle **ptr,
                      struct pipe_fence_handle *fence)
 {
-   screen->winsys->fence_reference(screen->winsys,
-                                   ptr, fence);
+   screen->winsys->fence_reference(screen->winsys, ptr, fence);
 }
 
 static int
@@ -124,8 +119,7 @@ pass_fence_signalled(struct pipe_screen *screen,
                      struct pipe_fence_handle *fence,
                      unsigned flag)
 {
-   return screen->winsys->fence_signalled(screen->winsys,
-                                          fence, flag);
+   return screen->winsys->fence_signalled(screen->winsys, fence, flag);
 }
 
 static int
@@ -133,11 +127,11 @@ pass_fence_finish(struct pipe_screen *screen,
                   struct pipe_fence_handle *fence,
                   unsigned flag)
 {
-   return screen->winsys->fence_finish(screen->winsys,
-                                       fence, flag);
+   return screen->winsys->fence_finish(screen->winsys, fence, flag);
 }
 
-void u_simple_screen_init(struct pipe_screen *screen)
+void
+u_simple_screen_init(struct pipe_screen *screen)
 {
    screen->buffer_create = pass_buffer_create;
    screen->user_buffer_create = pass_user_buffer_create;
@@ -152,7 +146,8 @@ void u_simple_screen_init(struct pipe_screen *screen)
    screen->fence_finish = pass_fence_finish;
 }
 
-const char* u_simple_screen_winsys_name(struct pipe_screen *screen)
+const char *
+u_simple_screen_winsys_name(struct pipe_screen *screen)
 {
    return screen->winsys->get_name(screen->winsys);
 }
diff --git a/src/gallium/auxiliary/util/u_simple_shaders.c b/src/gallium/auxiliary/util/u_simple_shaders.c
index ab754296fa..0d706f9449 100644
--- a/src/gallium/auxiliary/util/u_simple_shaders.c
+++ b/src/gallium/auxiliary/util/u_simple_shaders.c
@@ -34,14 +34,8 @@
 
 
 #include "pipe/p_context.h"
-#include "util/u_debug.h"
-#include "pipe/p_defines.h"
-#include "pipe/p_screen.h"
 #include "pipe/p_shader_tokens.h"
-
-#include "util/u_memory.h"
 #include "util/u_simple_shaders.h"
-
 #include "tgsi/tgsi_ureg.h"
 
 
@@ -67,9 +61,7 @@ util_make_vertex_passthrough_shader(struct pipe_context *pipe,
       struct ureg_src src;
       struct ureg_dst dst;
 
-      src = ureg_DECL_vs_input( ureg,
-                                semantic_names[i],
-                                semantic_indexes[i]);
+      src = ureg_DECL_vs_input( ureg, i );
       
       dst = ureg_DECL_output( ureg,
                               semantic_names[i],
diff --git a/src/gallium/auxiliary/util/u_tile.c b/src/gallium/auxiliary/util/u_tile.c
index 1235a67d26..0d6489c26e 100644
--- a/src/gallium/auxiliary/util/u_tile.c
+++ b/src/gallium/auxiliary/util/u_tile.c
@@ -170,7 +170,7 @@ x8r8g8b8_get_tile_rgba(const unsigned *src,
          pRow[0] = ubyte_to_float((pixel >> 16) & 0xff);
          pRow[1] = ubyte_to_float((pixel >>  8) & 0xff);
          pRow[2] = ubyte_to_float((pixel >>  0) & 0xff);
-         pRow[3] = ubyte_to_float(0xff);
+         pRow[3] = 1.0F;
       }
       p += dst_stride;
    }
@@ -394,6 +394,52 @@ r5g6b5_put_tile_rgba(ushort *dst,
 
 
 
+/*** PIPE_FORMAT_R8G8B8_UNORM ***/
+
+static void
+r8g8b8_get_tile_rgba(const ubyte *src,
+                     unsigned w, unsigned h,
+                     float *p,
+                     unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         pRow[0] = ubyte_to_float(src[0]);
+         pRow[1] = ubyte_to_float(src[1]);
+         pRow[2] = ubyte_to_float(src[2]);
+         pRow[3] = 1.0f;
+         src += 3;
+      }
+      p += dst_stride;
+   }
+}
+
+
+static void
+r8g8b8_put_tile_rgba(ubyte *dst,
+                     unsigned w, unsigned h,
+                     const float *p,
+                     unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         dst[0] = float_to_ubyte(pRow[0]);
+         dst[1] = float_to_ubyte(pRow[1]);
+         dst[2] = float_to_ubyte(pRow[2]);
+         dst += 3;
+      }
+      p += src_stride;
+   }
+}
+
+
+
 /*** PIPE_FORMAT_Z16_UNORM ***/
 
 /**
@@ -1106,6 +1152,9 @@ pipe_tile_raw_to_rgba(enum pipe_format format,
    case PIPE_FORMAT_R5G6B5_UNORM:
       r5g6b5_get_tile_rgba((ushort *) src, w, h, dst, dst_stride);
       break;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      r8g8b8_get_tile_rgba((ubyte *) src, w, h, dst, dst_stride);
+      break;
    case PIPE_FORMAT_L8_UNORM:
       l8_get_tile_rgba((ubyte *) src, w, h, dst, dst_stride);
       break;
@@ -1222,6 +1271,9 @@ pipe_put_tile_rgba(struct pipe_transfer *pt,
    case PIPE_FORMAT_R5G6B5_UNORM:
       r5g6b5_put_tile_rgba((ushort *) packed, w, h, p, src_stride);
       break;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      r8g8b8_put_tile_rgba((ubyte *) packed, w, h, p, src_stride);
+      break;
    case PIPE_FORMAT_R8G8B8A8_UNORM:
       assert(0);
       break;
diff --git a/src/gallium/auxiliary/vl/Makefile b/src/gallium/auxiliary/vl/Makefile
new file mode 100644
index 0000000000..71bfb937ad
--- /dev/null
+++ b/src/gallium/auxiliary/vl/Makefile
@@ -0,0 +1,12 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = vl
+
+C_SOURCES = \
+	vl_bitstream_parser.c \
+	vl_mpeg12_mc_renderer.c \
+	vl_compositor.c \
+	vl_shader_build.c
+
+include ../../Makefile.template
diff --git a/src/gallium/auxiliary/vl/SConscript b/src/gallium/auxiliary/vl/SConscript
new file mode 100644
index 0000000000..eb50940c35
--- /dev/null
+++ b/src/gallium/auxiliary/vl/SConscript
@@ -0,0 +1,12 @@
+Import('*')
+
+vl = env.ConvenienceLibrary(
+	target = 'vl',
+	source = [
+		'vl_bitstream_parser.c',
+		'vl_mpeg12_mc_renderer.c',
+		'vl_compositor.c',
+		'vl_shader_build.c',
+	])
+
+auxiliaries.insert(0, vl)
diff --git a/src/gallium/auxiliary/vl/vl_bitstream_parser.c b/src/gallium/auxiliary/vl/vl_bitstream_parser.c
new file mode 100644
index 0000000000..45826bad45
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_bitstream_parser.c
@@ -0,0 +1,140 @@
+#include "vl_bitstream_parser.h"
+#include <assert.h>
+#include <limits.h>
+#include <util/u_memory.h>
+
+static unsigned
+grab_bits(unsigned cursor, unsigned how_many_bits, unsigned bitstream_elt)
+{
+   unsigned excess_bits = sizeof(unsigned) * CHAR_BIT - how_many_bits - cursor;
+	
+   assert(cursor < sizeof(unsigned) * CHAR_BIT);
+   assert(how_many_bits > 0 && how_many_bits <= sizeof(unsigned) * CHAR_BIT);
+   assert(cursor + how_many_bits <= sizeof(unsigned) * CHAR_BIT);
+
+   return (bitstream_elt << excess_bits) >> (excess_bits + cursor);
+}
+
+static unsigned
+show_bits(unsigned cursor, unsigned how_many_bits, const unsigned *bitstream)
+{	
+   unsigned cur_int = cursor / (sizeof(unsigned) * CHAR_BIT);
+   unsigned cur_bit = cursor % (sizeof(unsigned) * CHAR_BIT);
+	
+   assert(bitstream);
+	
+   if (cur_bit + how_many_bits > sizeof(unsigned) * CHAR_BIT) {
+      unsigned lower = grab_bits(cur_bit, sizeof(unsigned) * CHAR_BIT - cur_bit,
+                                 bitstream[cur_int]);
+      unsigned upper = grab_bits(0, cur_bit + how_many_bits - sizeof(unsigned) * CHAR_BIT,
+                                 bitstream[cur_int + 1]);
+      return lower | upper << (sizeof(unsigned) * CHAR_BIT - cur_bit);
+   }
+   else
+      return grab_bits(cur_bit, how_many_bits, bitstream[cur_int]);
+}
+
+bool vl_bitstream_parser_init(struct vl_bitstream_parser *parser,
+                              unsigned num_bitstreams,
+                              const void **bitstreams,
+                              const unsigned *sizes)
+{
+   assert(parser);
+   assert(num_bitstreams);
+   assert(bitstreams);
+   assert(sizes);
+
+   parser->num_bitstreams = num_bitstreams;
+   parser->bitstreams = (const unsigned**)bitstreams;
+   parser->sizes = sizes;
+   parser->cur_bitstream = 0;
+   parser->cursor = 0;
+
+   return true;
+}
+
+void vl_bitstream_parser_cleanup(struct vl_bitstream_parser *parser)
+{
+   assert(parser);
+}
+
+unsigned
+vl_bitstream_parser_get_bits(struct vl_bitstream_parser *parser,
+                             unsigned how_many_bits)
+{
+   unsigned bits;
+
+   assert(parser);
+
+   bits = vl_bitstream_parser_show_bits(parser, how_many_bits);
+
+   vl_bitstream_parser_forward(parser, how_many_bits);
+
+   return bits;
+}
+
+unsigned
+vl_bitstream_parser_show_bits(struct vl_bitstream_parser *parser,
+                              unsigned how_many_bits)
+{	
+   unsigned bits = 0;
+   unsigned shift = 0;
+   unsigned cursor;
+   unsigned cur_bitstream;
+
+   assert(parser);
+
+   cursor = parser->cursor;
+   cur_bitstream = parser->cur_bitstream;
+
+   while (1) {
+      unsigned bits_left = parser->sizes[cur_bitstream] * CHAR_BIT - cursor;
+      unsigned bits_to_show = how_many_bits > bits_left ? bits_left : how_many_bits;
+
+      bits |= show_bits(cursor, bits_to_show,
+                        parser->bitstreams[cur_bitstream]) << shift;
+		
+      if (how_many_bits > bits_to_show) {
+         how_many_bits -= bits_to_show;
+         cursor = 0;
+         ++cur_bitstream;
+         shift += bits_to_show;
+      }
+      else
+         break;
+   }
+
+   return bits;
+}
+
+void vl_bitstream_parser_forward(struct vl_bitstream_parser *parser,
+                                 unsigned how_many_bits)
+{
+   assert(parser);
+   assert(how_many_bits);
+
+   parser->cursor += how_many_bits;
+
+   while (parser->cursor > parser->sizes[parser->cur_bitstream] * CHAR_BIT) {
+      parser->cursor -= parser->sizes[parser->cur_bitstream++] * CHAR_BIT;
+      assert(parser->cur_bitstream < parser->num_bitstreams);
+   }
+}
+
+void vl_bitstream_parser_rewind(struct vl_bitstream_parser *parser,
+                                unsigned how_many_bits)
+{
+   signed c;
+	
+   assert(parser);
+   assert(how_many_bits);
+	
+   c = parser->cursor - how_many_bits;
+
+   while (c < 0) {
+      c += parser->sizes[parser->cur_bitstream--] * CHAR_BIT;
+      assert(parser->cur_bitstream < parser->num_bitstreams);
+   }
+
+   parser->cursor = (unsigned)c;
+}
diff --git a/src/gallium/auxiliary/vl/vl_bitstream_parser.h b/src/gallium/auxiliary/vl/vl_bitstream_parser.h
new file mode 100644
index 0000000000..91ebaab45b
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_bitstream_parser.h
@@ -0,0 +1,36 @@
+#ifndef vl_bitstream_parser_h
+#define vl_bitstream_parser_h
+
+#include "pipe/p_compiler.h"
+
+struct vl_bitstream_parser
+{
+   unsigned num_bitstreams;
+   const unsigned **bitstreams;
+   const unsigned *sizes;
+   unsigned cur_bitstream;
+   unsigned cursor;
+};
+
+bool vl_bitstream_parser_init(struct vl_bitstream_parser *parser,
+                              unsigned num_bitstreams,
+                              const void **bitstreams,
+                              const unsigned *sizes);
+
+void vl_bitstream_parser_cleanup(struct vl_bitstream_parser *parser);
+
+unsigned
+vl_bitstream_parser_get_bits(struct vl_bitstream_parser *parser,
+                             unsigned how_many_bits);
+
+unsigned
+vl_bitstream_parser_show_bits(struct vl_bitstream_parser *parser,
+                              unsigned how_many_bits);
+
+void vl_bitstream_parser_forward(struct vl_bitstream_parser *parser,
+                                 unsigned how_many_bits);
+
+void vl_bitstream_parser_rewind(struct vl_bitstream_parser *parser,
+                                unsigned how_many_bits);
+
+#endif /* vl_bitstream_parser_h */
diff --git a/src/gallium/auxiliary/vl/vl_compositor.c b/src/gallium/auxiliary/vl/vl_compositor.c
new file mode 100644
index 0000000000..6431da6611
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_compositor.c
@@ -0,0 +1,590 @@
+#include "vl_compositor.h"
+#include <assert.h>
+#include <pipe/p_context.h>
+#include <pipe/p_inlines.h>
+#include <tgsi/tgsi_parse.h>
+#include <tgsi/tgsi_build.h>
+#include <util/u_memory.h>
+#include "vl_shader_build.h"
+
+struct vertex2f
+{
+   float x, y;
+};
+
+struct vertex4f
+{
+   float x, y, z, w;
+};
+
+struct vertex_shader_consts
+{
+   struct vertex4f dst_scale;
+   struct vertex4f dst_trans;
+   struct vertex4f src_scale;
+   struct vertex4f src_trans;
+};
+
+struct fragment_shader_consts
+{
+   struct vertex4f bias;
+   float matrix[16];
+};
+
+/*
+ * Represents 2 triangles in a strip in normalized coords.
+ * Used to render the surface onto the frame buffer.
+ */
+static const struct vertex2f surface_verts[4] =
+{
+   {0.0f, 0.0f},
+   {0.0f, 1.0f},
+   {1.0f, 0.0f},
+   {1.0f, 1.0f}
+};
+
+/*
+ * Represents texcoords for the above. We can use the position values directly.
+ * TODO: Duplicate these in the shader, no need to create a buffer.
+ */
+static const struct vertex2f *surface_texcoords = surface_verts;
+
+/*
+ * Identity color conversion constants, for debugging
+ */
+static const struct fragment_shader_consts identity =
+{
+   {
+      0.0f, 0.0f, 0.0f, 0.0f
+   },
+   {
+      1.0f, 0.0f, 0.0f, 0.0f,
+      0.0f, 1.0f, 0.0f, 0.0f,
+      0.0f, 0.0f, 1.0f, 0.0f,
+      0.0f, 0.0f, 0.0f, 1.0f
+   }
+};
+
+/*
+ * Converts ITU-R BT.601 YCbCr pixels to RGB pixels where:
+ * Y is in [16,235], Cb and Cr are in [16,240]
+ * R, G, and B are in [16,235]
+ */
+static const struct fragment_shader_consts bt_601 =
+{
+   {
+      0.0f, 0.501960784f, 0.501960784f,	0.0f
+   },
+   {
+      1.0f, 0.0f,   1.371f,   0.0f,
+      1.0f, -0.336f, -0.698f, 0.0f,
+      1.0f, 1.732f, 0.0f,     0.0f,
+      0.0f, 0.0f,   0.0f,     1.0f
+   }
+};
+
+/*
+ * Converts ITU-R BT.601 YCbCr pixels to RGB pixels where:
+ * Y is in [16,235], Cb and Cr are in [16,240]
+ * R, G, and B are in [0,255]
+ */
+static const struct fragment_shader_consts bt_601_full =
+{
+   {
+      0.062745098f, 0.501960784f, 0.501960784f,	0.0f
+   },
+   {
+      1.164f, 0.0f,    1.596f,  0.0f,
+      1.164f, -0.391f, -0.813f, 0.0f,
+      1.164f, 2.018f,  0.0f,    0.0f,
+      0.0f,   0.0f,    0.0f,    1.0f
+   }
+};
+
+/*
+ * Converts ITU-R BT.709 YCbCr pixels to RGB pixels where:
+ * Y is in [16,235], Cb and Cr are in [16,240]
+ * R, G, and B are in [16,235]
+ */
+static const struct fragment_shader_consts bt_709 =
+{
+   {
+      0.0f, 0.501960784f, 0.501960784f,	0.0f
+   },
+   {
+      1.0f, 0.0f,    1.540f,  0.0f,
+      1.0f, -0.183f, -0.459f, 0.0f,
+      1.0f, 1.816f,  0.0f,    0.0f,
+      0.0f, 0.0f,    0.0f,    1.0f
+   }
+};
+
+/*
+ * Converts ITU-R BT.709 YCbCr pixels to RGB pixels where:
+ * Y is in [16,235], Cb and Cr are in [16,240]
+ * R, G, and B are in [0,255]
+ */
+const struct fragment_shader_consts bt_709_full =
+{
+   {
+      0.062745098f, 0.501960784f, 0.501960784f,	0.0f
+   },
+   {
+      1.164f, 0.0f,    1.793f,  0.0f,
+      1.164f, -0.213f, -0.534f,	0.0f,
+      1.164f, 2.115f,  0.0f,    0.0f,
+      0.0f,   0.0f,    0.0f,    1.0f
+   }
+};
+
+static void
+create_vert_shader(struct vl_compositor *c)
+{
+   const unsigned max_tokens = 50;
+
+   struct pipe_shader_state vs;
+   struct tgsi_token *tokens;
+   struct tgsi_header *header;
+
+   struct tgsi_full_declaration decl;
+   struct tgsi_full_instruction inst;
+
+   unsigned ti;
+
+   unsigned i;
+
+   assert(c);
+
+   tokens = (struct tgsi_token*)MALLOC(max_tokens * sizeof(struct tgsi_token));
+   *(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+   header = (struct tgsi_header*)&tokens[1];
+   *header = tgsi_build_header();
+   *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+   ti = 3;
+
+   /*
+    * decl i0             ; Vertex pos
+    * decl i1             ; Vertex texcoords
+    */
+   for (i = 0; i < 2; i++) {
+      decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * decl c0             ; Scaling vector to scale vertex pos rect to destination size
+    * decl c1             ; Translation vector to move vertex pos rect into position
+    * decl c2             ; Scaling vector to scale texcoord rect to source size
+    * decl c3             ; Translation vector to move texcoord rect into position
+    */
+   decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 3);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /*
+    * decl o0             ; Vertex pos
+    * decl o1             ; Vertex texcoords
+    */
+   for (i = 0; i < 2; i++) {
+      decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* decl t0, t1 */
+   decl = vl_decl_temps(0, 1);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /*
+    * mad o0, i0, c0, c1  ; Scale and translate unit output rect to destination size and pos
+    * mad o1, i1, c2, c3  ; Scale and translate unit texcoord rect to source size and pos
+    */
+   for (i = 0; i < 2; ++i) {
+      inst = vl_inst4(TGSI_OPCODE_MAD, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i, TGSI_FILE_CONSTANT, i * 2, TGSI_FILE_CONSTANT, i * 2 + 1);
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* end */
+   inst = vl_end();
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   assert(ti <= max_tokens);
+
+   vs.tokens = tokens;
+   c->vertex_shader = c->pipe->create_vs_state(c->pipe, &vs);
+   FREE(tokens);
+}
+
+static void
+create_frag_shader(struct vl_compositor *c)
+{
+   const unsigned max_tokens = 50;
+
+   struct pipe_shader_state fs;
+   struct tgsi_token *tokens;
+   struct tgsi_header *header;
+
+   struct tgsi_full_declaration decl;
+   struct tgsi_full_instruction inst;
+
+   unsigned ti;
+
+   unsigned i;
+
+   assert(c);
+
+   tokens = (struct tgsi_token*)MALLOC(max_tokens * sizeof(struct tgsi_token));
+   *(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+   header = (struct tgsi_header*)&tokens[1];
+   *header = tgsi_build_header();
+   *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+
+   ti = 3;
+
+   /* decl i0             ; Texcoords for s0 */
+   decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, 1, 0, 0, TGSI_INTERPOLATE_LINEAR);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /*
+    * decl c0             ; Bias vector for CSC
+    * decl c1-c4          ; CSC matrix c1-c4
+    */
+   decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 4);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* decl o0             ; Fragment color */
+   decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* decl t0 */
+   decl = vl_decl_temps(0, 0);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* decl s0             ; Sampler for tex containing picture to display */
+   decl = vl_decl_samplers(0, 0);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* tex2d t0, i0, s0    ; Read src pixel */
+   inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_SAMPLER, 0);
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   /* sub t0, t0, c0      ; Subtract bias vector from pixel */
+   inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   /*
+    * dp4 o0.x, t0, c1    ; Multiply pixel by the color conversion matrix
+    * dp4 o0.y, t0, c2
+    * dp4 o0.z, t0, c3
+    */
+   for (i = 0; i < 3; ++i) {
+      inst = vl_inst3(TGSI_OPCODE_DP4, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, i + 1);
+      inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* end */
+   inst = vl_end();
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	
+   assert(ti <= max_tokens);
+
+   fs.tokens = tokens;
+   c->fragment_shader = c->pipe->create_fs_state(c->pipe, &fs);
+   FREE(tokens);
+}
+
+static bool
+init_pipe_state(struct vl_compositor *c)
+{
+   struct pipe_sampler_state sampler;
+
+   assert(c);
+
+   c->fb_state.nr_cbufs = 1;
+   c->fb_state.zsbuf = NULL;
+
+   sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   sampler.min_img_filter = PIPE_TEX_FILTER_LINEAR;
+   sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
+   sampler.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
+   sampler.compare_mode = PIPE_TEX_COMPARE_NONE;
+   sampler.compare_func = PIPE_FUNC_ALWAYS;
+   sampler.normalized_coords = 1;
+   /*sampler.prefilter = ;*/
+   /*sampler.lod_bias = ;*/
+   /*sampler.min_lod = ;*/
+   /*sampler.max_lod = ;*/
+   /*sampler.border_color[i] = ;*/
+   /*sampler.max_anisotropy = ;*/
+   c->sampler = c->pipe->create_sampler_state(c->pipe, &sampler);
+	
+   return true;
+}
+
+static void cleanup_pipe_state(struct vl_compositor *c)
+{
+   assert(c);
+	
+   c->pipe->delete_sampler_state(c->pipe, c->sampler);
+}
+
+static bool
+init_shaders(struct vl_compositor *c)
+{
+   assert(c);
+
+   create_vert_shader(c);
+   create_frag_shader(c);
+
+   return true;
+}
+
+static void cleanup_shaders(struct vl_compositor *c)
+{
+   assert(c);
+	
+   c->pipe->delete_vs_state(c->pipe, c->vertex_shader);
+   c->pipe->delete_fs_state(c->pipe, c->fragment_shader);
+}
+
+static bool
+init_buffers(struct vl_compositor *c)
+{
+   assert(c);
+	
+   /*
+    * Create our vertex buffer and vertex buffer element
+    * VB contains 4 vertices that render a quad covering the entire window
+    * to display a rendered surface
+    * Quad is rendered as a tri strip
+    */
+   c->vertex_bufs[0].stride = sizeof(struct vertex2f);
+   c->vertex_bufs[0].max_index = 3;
+   c->vertex_bufs[0].buffer_offset = 0;
+   c->vertex_bufs[0].buffer = pipe_buffer_create
+   (
+      c->pipe->screen,
+      1,
+      PIPE_BUFFER_USAGE_VERTEX,
+      sizeof(struct vertex2f) * 4
+   );
+
+   memcpy
+   (
+      pipe_buffer_map(c->pipe->screen, c->vertex_bufs[0].buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
+      surface_verts,
+      sizeof(struct vertex2f) * 4
+   );
+
+   pipe_buffer_unmap(c->pipe->screen, c->vertex_bufs[0].buffer);
+
+   c->vertex_elems[0].src_offset = 0;
+   c->vertex_elems[0].vertex_buffer_index = 0;
+   c->vertex_elems[0].nr_components = 2;
+   c->vertex_elems[0].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   /*
+    * Create our texcoord buffer and texcoord buffer element
+    * Texcoord buffer contains the TCs for mapping the rendered surface to the 4 vertices
+    */
+   c->vertex_bufs[1].stride = sizeof(struct vertex2f);
+   c->vertex_bufs[1].max_index = 3;
+   c->vertex_bufs[1].buffer_offset = 0;
+   c->vertex_bufs[1].buffer = pipe_buffer_create
+   (
+      c->pipe->screen,
+      1,
+      PIPE_BUFFER_USAGE_VERTEX,
+      sizeof(struct vertex2f) * 4
+   );
+
+   memcpy
+   (
+      pipe_buffer_map(c->pipe->screen, c->vertex_bufs[1].buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
+      surface_texcoords,
+      sizeof(struct vertex2f) * 4
+   );
+
+   pipe_buffer_unmap(c->pipe->screen, c->vertex_bufs[1].buffer);
+
+   c->vertex_elems[1].src_offset = 0;
+   c->vertex_elems[1].vertex_buffer_index = 1;
+   c->vertex_elems[1].nr_components = 2;
+   c->vertex_elems[1].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   /*
+    * Create our vertex shader's constant buffer
+    * Const buffer contains scaling and translation vectors
+    */
+   c->vs_const_buf.buffer = pipe_buffer_create
+   (
+      c->pipe->screen,
+      1,
+      PIPE_BUFFER_USAGE_CONSTANT | PIPE_BUFFER_USAGE_DISCARD,
+      sizeof(struct vertex_shader_consts)
+   );
+
+   /*
+    * Create our fragment shader's constant buffer
+    * Const buffer contains the color conversion matrix and bias vectors
+    */
+   c->fs_const_buf.buffer = pipe_buffer_create
+   (
+      c->pipe->screen,
+      1,
+      PIPE_BUFFER_USAGE_CONSTANT,
+      sizeof(struct fragment_shader_consts)
+   );
+
+   /*
+    * TODO: Refactor this into a seperate function,
+    * allow changing the CSC matrix at runtime to switch between regular & full versions
+    */
+   memcpy
+   (
+      pipe_buffer_map(c->pipe->screen, c->fs_const_buf.buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
+      &bt_601_full,
+      sizeof(struct fragment_shader_consts)
+   );
+
+   pipe_buffer_unmap(c->pipe->screen, c->fs_const_buf.buffer);
+
+   return true;
+}
+
+static void
+cleanup_buffers(struct vl_compositor *c)
+{
+   unsigned i;
+
+   assert(c);
+	
+   for (i = 0; i < 2; ++i)
+      pipe_buffer_reference(&c->vertex_bufs[i].buffer, NULL);
+
+   pipe_buffer_reference(&c->vs_const_buf.buffer, NULL);
+   pipe_buffer_reference(&c->fs_const_buf.buffer, NULL);
+}
+
+bool vl_compositor_init(struct vl_compositor *compositor, struct pipe_context *pipe)
+{
+   assert(compositor);
+
+   memset(compositor, 0, sizeof(struct vl_compositor));
+
+   compositor->pipe = pipe;
+
+   if (!init_pipe_state(compositor))
+      return false;
+   if (!init_shaders(compositor)) {
+      cleanup_pipe_state(compositor);
+      return false;
+   }
+   if (!init_buffers(compositor)) {
+      cleanup_shaders(compositor);
+      cleanup_pipe_state(compositor);
+      return false;
+   }
+
+   return true;
+}
+
+void vl_compositor_cleanup(struct vl_compositor *compositor)
+{
+   assert(compositor);
+	
+   cleanup_buffers(compositor);
+   cleanup_shaders(compositor);
+   cleanup_pipe_state(compositor);
+}
+
+void vl_compositor_render(struct vl_compositor          *compositor,
+                          /*struct pipe_texture         *backround,
+                          struct pipe_video_rect        *backround_area,*/
+                          struct pipe_texture           *src_surface,
+                          enum pipe_mpeg12_picture_type picture_type,
+                          /*unsigned                    num_past_surfaces,
+                          struct pipe_texture           *past_surfaces,
+                          unsigned                      num_future_surfaces,
+                          struct pipe_texture           *future_surfaces,*/
+                          struct pipe_video_rect        *src_area,
+                          struct pipe_texture           *dst_surface,
+                          struct pipe_video_rect        *dst_area,
+                          /*unsigned                      num_layers,
+                          struct pipe_texture           *layers,
+                          struct pipe_video_rect        *layer_src_areas,
+                          struct pipe_video_rect        *layer_dst_areas*/
+                          struct pipe_fence_handle      **fence)
+{
+   struct vertex_shader_consts *vs_consts;
+
+   assert(compositor);
+   assert(src_surface);
+   assert(src_area);
+   assert(dst_surface);
+   assert(dst_area);
+   assert(picture_type == PIPE_MPEG12_PICTURE_TYPE_FRAME);
+
+   compositor->fb_state.width = dst_surface->width[0];
+   compositor->fb_state.height = dst_surface->height[0];
+   compositor->fb_state.cbufs[0] = compositor->pipe->screen->get_tex_surface
+   (
+      compositor->pipe->screen,
+      dst_surface,
+      0, 0, 0, PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE
+   );
+
+   compositor->viewport.scale[0] = compositor->fb_state.width;
+   compositor->viewport.scale[1] = compositor->fb_state.height;
+   compositor->viewport.scale[2] = 1;
+   compositor->viewport.scale[3] = 1;
+   compositor->viewport.translate[0] = 0;
+   compositor->viewport.translate[1] = 0;
+   compositor->viewport.translate[2] = 0;
+   compositor->viewport.translate[3] = 0;
+
+   compositor->pipe->set_framebuffer_state(compositor->pipe, &compositor->fb_state);
+   compositor->pipe->set_viewport_state(compositor->pipe, &compositor->viewport);
+   compositor->pipe->bind_sampler_states(compositor->pipe, 1, &compositor->sampler);
+   compositor->pipe->set_sampler_textures(compositor->pipe, 1, &src_surface);
+   compositor->pipe->bind_vs_state(compositor->pipe, compositor->vertex_shader);
+   compositor->pipe->bind_fs_state(compositor->pipe, compositor->fragment_shader);
+   compositor->pipe->set_vertex_buffers(compositor->pipe, 2, compositor->vertex_bufs);
+   compositor->pipe->set_vertex_elements(compositor->pipe, 2, compositor->vertex_elems);
+   compositor->pipe->set_constant_buffer(compositor->pipe, PIPE_SHADER_VERTEX, 0, &compositor->vs_const_buf);
+   compositor->pipe->set_constant_buffer(compositor->pipe, PIPE_SHADER_FRAGMENT, 0, &compositor->fs_const_buf);
+
+   vs_consts = pipe_buffer_map
+   (
+      compositor->pipe->screen,
+      compositor->vs_const_buf.buffer,
+      PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_DISCARD
+   );
+
+   vs_consts->dst_scale.x = dst_area->w / (float)compositor->fb_state.cbufs[0]->width;
+   vs_consts->dst_scale.y = dst_area->h / (float)compositor->fb_state.cbufs[0]->height;
+   vs_consts->dst_scale.z = 1;
+   vs_consts->dst_scale.w = 1;
+   vs_consts->dst_trans.x = dst_area->x / (float)compositor->fb_state.cbufs[0]->width;
+   vs_consts->dst_trans.y = dst_area->y / (float)compositor->fb_state.cbufs[0]->height;
+   vs_consts->dst_trans.z = 0;
+   vs_consts->dst_trans.w = 0;
+
+   vs_consts->src_scale.x = src_area->w / (float)src_surface->width[0];
+   vs_consts->src_scale.y = src_area->h / (float)src_surface->height[0];
+   vs_consts->src_scale.z = 1;
+   vs_consts->src_scale.w = 1;
+   vs_consts->src_trans.x = src_area->x / (float)src_surface->width[0];
+   vs_consts->src_trans.y = src_area->y / (float)src_surface->height[0];
+   vs_consts->src_trans.z = 0;
+   vs_consts->src_trans.w = 0;
+
+   pipe_buffer_unmap(compositor->pipe->screen, compositor->vs_const_buf.buffer);
+
+   compositor->pipe->draw_arrays(compositor->pipe, PIPE_PRIM_TRIANGLE_STRIP, 0, 4);
+   compositor->pipe->flush(compositor->pipe, PIPE_FLUSH_RENDER_CACHE, fence);
+
+   pipe_surface_reference(&compositor->fb_state.cbufs[0], NULL);
+}
diff --git a/src/gallium/auxiliary/vl/vl_compositor.h b/src/gallium/auxiliary/vl/vl_compositor.h
new file mode 100644
index 0000000000..19ad66d9c6
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_compositor.h
@@ -0,0 +1,47 @@
+#ifndef vl_compositor_h
+#define vl_compositor_h
+
+#include <pipe/p_compiler.h>
+#include <pipe/p_state.h>
+#include <pipe/p_video_state.h>
+
+struct pipe_context;
+struct pipe_texture;
+
+struct vl_compositor
+{
+   struct pipe_context *pipe;
+
+   struct pipe_framebuffer_state fb_state;
+   void *sampler;
+   void *vertex_shader;
+   void *fragment_shader;
+   struct pipe_viewport_state viewport;
+   struct pipe_vertex_buffer vertex_bufs[2];
+   struct pipe_vertex_element vertex_elems[2];
+   struct pipe_constant_buffer vs_const_buf, fs_const_buf;
+};
+
+bool vl_compositor_init(struct vl_compositor *compositor, struct pipe_context *pipe);
+
+void vl_compositor_cleanup(struct vl_compositor *compositor);
+
+void vl_compositor_render(struct vl_compositor          *compositor,
+                          /*struct pipe_texture         *backround,
+                          struct pipe_video_rect        *backround_area,*/
+                          struct pipe_texture           *src_surface,
+                          enum pipe_mpeg12_picture_type picture_type,
+                          /*unsigned                    num_past_surfaces,
+                          struct pipe_texture           *past_surfaces,
+                          unsigned                      num_future_surfaces,
+                          struct pipe_texture           *future_surfaces,*/
+                          struct pipe_video_rect        *src_area,
+                          struct pipe_texture           *dst_surface,
+                          struct pipe_video_rect        *dst_area,
+                          /*unsigned                      num_layers,
+                          struct pipe_texture           *layers,
+                          struct pipe_video_rect        *layer_src_areas,
+                          struct pipe_video_rect        *layer_dst_areas,*/
+                          struct pipe_fence_handle      **fence);
+
+#endif /* vl_compositor_h */
diff --git a/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.c b/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.c
new file mode 100644
index 0000000000..9b69f2956c
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.c
@@ -0,0 +1,1627 @@
+#include "vl_mpeg12_mc_renderer.h"
+#include <assert.h>
+#include <pipe/p_context.h>
+#include <pipe/p_inlines.h>
+#include <util/u_math.h>
+#include <util/u_memory.h>
+#include <tgsi/tgsi_parse.h>
+#include <tgsi/tgsi_build.h>
+#include "vl_shader_build.h"
+
+#define DEFAULT_BUF_ALIGNMENT 1
+#define MACROBLOCK_WIDTH 16
+#define MACROBLOCK_HEIGHT 16
+#define BLOCK_WIDTH 8
+#define BLOCK_HEIGHT 8
+#define ZERO_BLOCK_NIL -1.0f
+#define ZERO_BLOCK_IS_NIL(zb) ((zb).x < 0.0f)
+
+struct vertex2f
+{
+   float x, y;
+};
+
+struct vertex4f
+{
+   float x, y, z, w;
+};
+
+struct vertex_shader_consts
+{
+   struct vertex4f denorm;
+};
+
+struct fragment_shader_consts
+{
+   struct vertex4f multiplier;
+   struct vertex4f div;
+};
+
+/*
+ * Muliplier renormalizes block samples from 16 bits to 12 bits.
+ * Divider is used when calculating Y % 2 for choosing top or bottom
+ * field for P or B macroblocks.
+ * TODO: Use immediates.
+ */
+static const struct fragment_shader_consts fs_consts = {
+   {32767.0f / 255.0f, 32767.0f / 255.0f, 32767.0f / 255.0f, 0.0f},
+   {0.5f, 2.0f, 0.0f, 0.0f}
+};
+
+struct vert_stream_0
+{
+   struct vertex2f pos;
+   struct vertex2f luma_tc;
+   struct vertex2f cb_tc;
+   struct vertex2f cr_tc;
+};
+
+enum MACROBLOCK_TYPE
+{
+   MACROBLOCK_TYPE_INTRA,
+   MACROBLOCK_TYPE_FWD_FRAME_PRED,
+   MACROBLOCK_TYPE_FWD_FIELD_PRED,
+   MACROBLOCK_TYPE_BKWD_FRAME_PRED,
+   MACROBLOCK_TYPE_BKWD_FIELD_PRED,
+   MACROBLOCK_TYPE_BI_FRAME_PRED,
+   MACROBLOCK_TYPE_BI_FIELD_PRED,
+
+   NUM_MACROBLOCK_TYPES
+};
+
+static void
+create_intra_vert_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   const unsigned max_tokens = 50;
+
+   struct pipe_shader_state vs;
+   struct tgsi_token *tokens;
+   struct tgsi_header *header;
+
+   struct tgsi_full_declaration decl;
+   struct tgsi_full_instruction inst;
+
+   unsigned ti;
+
+   unsigned i;
+
+   assert(r);
+
+   tokens = (struct tgsi_token *) malloc(max_tokens * sizeof(struct tgsi_token));
+   *(struct tgsi_version *) &tokens[0] = tgsi_build_version();
+   header = (struct tgsi_header *) &tokens[1];
+   *header = tgsi_build_header();
+   *(struct tgsi_processor *) &tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+   ti = 3;
+
+   /*
+    * decl i0              ; Vertex pos
+    * decl i1              ; Luma texcoords
+    * decl i2              ; Chroma Cb texcoords
+    * decl i3              ; Chroma Cr texcoords
+    */
+   for (i = 0; i < 4; i++) {
+      decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * decl o0              ; Vertex pos
+    * decl o1              ; Luma texcoords
+    * decl o2              ; Chroma Cb texcoords
+    * decl o3              ; Chroma Cr texcoords
+    */
+   for (i = 0; i < 4; i++) {
+      decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * mov o0, i0           ; Move input vertex pos to output
+    * mov o1, i1           ; Move input luma texcoords to output
+    * mov o2, i2           ; Move input chroma Cb texcoords to output
+    * mov o3, i3           ; Move input chroma Cr texcoords to output
+    */
+   for (i = 0; i < 4; ++i) {
+      inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* end */
+   inst = vl_end();
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   assert(ti <= max_tokens);
+
+   vs.tokens = tokens;
+   r->i_vs = r->pipe->create_vs_state(r->pipe, &vs);
+   free(tokens);
+}
+
+static void
+create_intra_frag_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   const unsigned max_tokens = 100;
+
+   struct pipe_shader_state fs;
+   struct tgsi_token *tokens;
+   struct tgsi_header *header;
+
+   struct tgsi_full_declaration decl;
+   struct tgsi_full_instruction inst;
+
+   unsigned ti;
+
+   unsigned i;
+
+   assert(r);
+
+   tokens = (struct tgsi_token *) malloc(max_tokens * sizeof(struct tgsi_token));
+   *(struct tgsi_version *) &tokens[0] = tgsi_build_version();
+   header = (struct tgsi_header *) &tokens[1];
+   *header = tgsi_build_header();
+   *(struct tgsi_processor *) &tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+
+   ti = 3;
+
+   /*
+    * decl i0                      ; Luma texcoords
+    * decl i1                      ; Chroma Cb texcoords
+    * decl i2                      ; Chroma Cr texcoords
+    */
+   for (i = 0; i < 3; ++i) {
+      decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* decl c0                      ; Scaling factor, rescales 16-bit snorm to 9-bit snorm */
+   decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* decl o0                      ; Fragment color */
+   decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* decl t0, t1 */
+   decl = vl_decl_temps(0, 1);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /*
+    * decl s0                      ; Sampler for luma texture
+    * decl s1                      ; Sampler for chroma Cb texture
+    * decl s2                      ; Sampler for chroma Cr texture
+    */
+   for (i = 0; i < 3; ++i) {
+      decl = vl_decl_samplers(i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * tex2d t1, i0, s0             ; Read texel from luma texture
+    * mov t0.x, t1.x               ; Move luma sample into .x component
+    * tex2d t1, i1, s1             ; Read texel from chroma Cb texture
+    * mov t0.y, t1.x               ; Move Cb sample into .y component
+    * tex2d t1, i2, s2             ; Read texel from chroma Cr texture
+    * mov t0.z, t1.x               ; Move Cr sample into .z component
+    */
+   for (i = 0; i < 3; ++i) {
+      inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i, TGSI_FILE_SAMPLER, i);
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+      inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+      inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+      inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+      inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+      inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* mul o0, t0, c0               ; Rescale texel to correct range */
+   inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   /* end */
+   inst = vl_end();
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   assert(ti <= max_tokens);
+
+   fs.tokens = tokens;
+   r->i_fs = r->pipe->create_fs_state(r->pipe, &fs);
+   free(tokens);
+}
+
+static void
+create_frame_pred_vert_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   const unsigned max_tokens = 100;
+
+   struct pipe_shader_state vs;
+   struct tgsi_token *tokens;
+   struct tgsi_header *header;
+
+   struct tgsi_full_declaration decl;
+   struct tgsi_full_instruction inst;
+
+   unsigned ti;
+
+   unsigned i;
+
+   assert(r);
+
+   tokens = (struct tgsi_token *) malloc(max_tokens * sizeof(struct tgsi_token));
+   *(struct tgsi_version *) &tokens[0] = tgsi_build_version();
+   header = (struct tgsi_header *) &tokens[1];
+   *header = tgsi_build_header();
+   *(struct tgsi_processor *) &tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+   ti = 3;
+
+   /*
+    * decl i0              ; Vertex pos
+    * decl i1              ; Luma texcoords
+    * decl i2              ; Chroma Cb texcoords
+    * decl i3              ; Chroma Cr texcoords
+    * decl i4              ; Ref surface top field texcoords
+    * decl i5              ; Ref surface bottom field texcoords (unused, packed in the same stream)
+    */
+   for (i = 0; i < 6; i++) {
+      decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * decl o0              ; Vertex pos
+    * decl o1              ; Luma texcoords
+    * decl o2              ; Chroma Cb texcoords
+    * decl o3              ; Chroma Cr texcoords
+    * decl o4              ; Ref macroblock texcoords
+    */
+   for (i = 0; i < 5; i++) {
+      decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * mov o0, i0           ; Move input vertex pos to output
+    * mov o1, i1           ; Move input luma texcoords to output
+    * mov o2, i2           ; Move input chroma Cb texcoords to output
+    * mov o3, i3           ; Move input chroma Cr texcoords to output
+    */
+   for (i = 0; i < 4; ++i) {
+        inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
+        ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* add o4, i0, i4       ; Translate vertex pos by motion vec to form ref macroblock texcoords */
+   inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 4, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, 4);
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   /* end */
+   inst = vl_end();
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   assert(ti <= max_tokens);
+
+   vs.tokens = tokens;
+   r->p_vs[0] = r->pipe->create_vs_state(r->pipe, &vs);
+   free(tokens);
+}
+
+static void
+create_field_pred_vert_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   assert(false);
+}
+
+static void
+create_frame_pred_frag_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   const unsigned max_tokens = 100;
+
+   struct pipe_shader_state fs;
+   struct tgsi_token *tokens;
+   struct tgsi_header *header;
+
+   struct tgsi_full_declaration decl;
+   struct tgsi_full_instruction inst;
+
+   unsigned ti;
+
+   unsigned i;
+
+   assert(r);
+
+   tokens = (struct tgsi_token *) malloc(max_tokens * sizeof(struct tgsi_token));
+   *(struct tgsi_version *) &tokens[0] = tgsi_build_version();
+   header = (struct tgsi_header *) &tokens[1];
+   *header = tgsi_build_header();
+   *(struct tgsi_processor *) &tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+
+   ti = 3;
+
+   /*
+    * decl i0                      ; Luma texcoords
+    * decl i1                      ; Chroma Cb texcoords
+    * decl i2                      ; Chroma Cr texcoords
+    * decl i3                      ; Ref macroblock texcoords
+    */
+   for (i = 0; i < 4; ++i) {
+      decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* decl c0                      ; Scaling factor, rescales 16-bit snorm to 9-bit snorm */
+   decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* decl o0                      ; Fragment color */
+   decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* decl t0, t1 */
+   decl = vl_decl_temps(0, 1);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /*
+    * decl s0                      ; Sampler for luma texture
+    * decl s1                      ; Sampler for chroma Cb texture
+    * decl s2                      ; Sampler for chroma Cr texture
+    * decl s3                      ; Sampler for ref surface texture
+    */
+   for (i = 0; i < 4; ++i) {
+      decl = vl_decl_samplers(i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * tex2d t1, i0, s0             ; Read texel from luma texture
+    * mov t0.x, t1.x               ; Move luma sample into .x component
+    * tex2d t1, i1, s1             ; Read texel from chroma Cb texture
+    * mov t0.y, t1.x               ; Move Cb sample into .y component
+    * tex2d t1, i2, s2             ; Read texel from chroma Cr texture
+    * mov t0.z, t1.x               ; Move Cr sample into .z component
+    */
+   for (i = 0; i < 3; ++i) {
+      inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i, TGSI_FILE_SAMPLER, i);
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+      inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+      inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+      inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+      inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+      inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* mul t0, t0, c0               ; Rescale texel to correct range */
+   inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   /* tex2d t1, i3, s3             ; Read texel from ref macroblock */
+   inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, 3, TGSI_FILE_SAMPLER, 3);
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   /* add o0, t0, t1               ; Add ref and differential to form final output */
+   inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   /* end */
+   inst = vl_end();
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   assert(ti <= max_tokens);
+
+   fs.tokens = tokens;
+   r->p_fs[0] = r->pipe->create_fs_state(r->pipe, &fs);
+   free(tokens);
+}
+
+static void
+create_field_pred_frag_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   assert(false);
+}
+
+static void
+create_frame_bi_pred_vert_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   const unsigned max_tokens = 100;
+
+   struct pipe_shader_state vs;
+   struct tgsi_token *tokens;
+   struct tgsi_header *header;
+
+   struct tgsi_full_declaration decl;
+   struct tgsi_full_instruction inst;
+
+   unsigned ti;
+
+   unsigned i;
+
+   assert(r);
+
+   tokens = (struct tgsi_token *) malloc(max_tokens * sizeof(struct tgsi_token));
+   *(struct tgsi_version *) &tokens[0] = tgsi_build_version();
+   header = (struct tgsi_header *) &tokens[1];
+   *header = tgsi_build_header();
+   *(struct tgsi_processor *) &tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+   ti = 3;
+
+   /*
+    * decl i0              ; Vertex pos
+    * decl i1              ; Luma texcoords
+    * decl i2              ; Chroma Cb texcoords
+    * decl i3              ; Chroma Cr texcoords
+    * decl i4              ; First ref macroblock top field texcoords
+    * decl i5              ; First ref macroblock bottom field texcoords (unused, packed in the same stream)
+    * decl i6              ; Second ref macroblock top field texcoords
+    * decl i7              ; Second ref macroblock bottom field texcoords (unused, packed in the same stream)
+    */
+   for (i = 0; i < 8; i++) {
+      decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * decl o0              ; Vertex pos
+    * decl o1              ; Luma texcoords
+    * decl o2              ; Chroma Cb texcoords
+    * decl o3              ; Chroma Cr texcoords
+    * decl o4              ; First ref macroblock texcoords
+    * decl o5              ; Second ref macroblock texcoords
+    */
+   for (i = 0; i < 6; i++) {
+      decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * mov o0, i0           ; Move input vertex pos to output
+    * mov o1, i1           ; Move input luma texcoords to output
+    * mov o2, i2           ; Move input chroma Cb texcoords to output
+    * mov o3, i3           ; Move input chroma Cr texcoords to output
+    */
+   for (i = 0; i < 4; ++i) {
+      inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * add o4, i0, i4       ; Translate vertex pos by motion vec to form first ref macroblock texcoords
+    * add o5, i0, i6       ; Translate vertex pos by motion vec to form second ref macroblock texcoords
+    */
+   for (i = 0; i < 2; ++i) {
+      inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 4, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, (i + 2) * 2);
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* end */
+   inst = vl_end();
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   assert(ti <= max_tokens);
+
+   vs.tokens = tokens;
+   r->b_vs[0] = r->pipe->create_vs_state(r->pipe, &vs);
+   free(tokens);
+}
+
+static void
+create_field_bi_pred_vert_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   assert(false);
+}
+
+static void
+create_frame_bi_pred_frag_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   const unsigned max_tokens = 100;
+
+   struct pipe_shader_state fs;
+   struct tgsi_token *tokens;
+   struct tgsi_header *header;
+
+   struct tgsi_full_declaration decl;
+   struct tgsi_full_instruction inst;
+
+   unsigned ti;
+
+   unsigned i;
+
+   assert(r);
+
+   tokens = (struct tgsi_token *) malloc(max_tokens * sizeof(struct tgsi_token));
+   *(struct tgsi_version *) &tokens[0] = tgsi_build_version();
+   header = (struct tgsi_header *) &tokens[1];
+   *header = tgsi_build_header();
+   *(struct tgsi_processor *) &tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+
+   ti = 3;
+
+   /*
+    * decl i0                      ; Luma texcoords
+    * decl i1                      ; Chroma Cb texcoords
+    * decl i2                      ; Chroma Cr texcoords
+    * decl i3                      ; First ref macroblock texcoords
+    * decl i4                      ; Second ref macroblock texcoords
+    */
+   for (i = 0; i < 5; ++i) {
+      decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * decl c0                      ; Scaling factor, rescales 16-bit snorm to 9-bit snorm
+    * decl c1                      ; Constant 1/2 in .x channel to use as weight to blend past and future texels
+    */
+   decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* decl o0                      ; Fragment color */
+   decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* decl t0-t2 */
+   decl = vl_decl_temps(0, 2);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /*
+    * decl s0                      ; Sampler for luma texture
+    * decl s1                      ; Sampler for chroma Cb texture
+    * decl s2                      ; Sampler for chroma Cr texture
+    * decl s3                      ; Sampler for first ref surface texture
+    * decl s4                      ; Sampler for second ref surface texture
+    */
+   for (i = 0; i < 5; ++i) {
+      decl = vl_decl_samplers(i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * tex2d t1, i0, s0             ; Read texel from luma texture
+    * mov t0.x, t1.x               ; Move luma sample into .x component
+    * tex2d t1, i1, s1             ; Read texel from chroma Cb texture
+    * mov t0.y, t1.x               ; Move Cb sample into .y component
+    * tex2d t1, i2, s2             ; Read texel from chroma Cr texture
+    * mov t0.z, t1.x               ; Move Cr sample into .z component
+    */
+   for (i = 0; i < 3; ++i) {
+      inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i, TGSI_FILE_SAMPLER, i);
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+      inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+      inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+      inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+      inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+      inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* mul t0, t0, c0               ; Rescale texel to correct range */
+   inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   /*
+    * tex2d t1, i3, s3             ; Read texel from first ref macroblock
+    * tex2d t2, i4, s4             ; Read texel from second ref macroblock
+    */
+   for (i = 0; i < 2; ++i) {
+      inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 3, TGSI_FILE_SAMPLER, i + 3);
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* lerp t1, c1.x, t1, t2        ; Blend past and future texels */
+   inst = vl_inst4(TGSI_OPCODE_LRP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 1, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
+   inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+   inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+   inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+   inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   /* add o0, t0, t1               ; Add past/future ref and differential to form final output */
+   inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   /* end */
+   inst = vl_end();
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   assert(ti <= max_tokens);
+
+   fs.tokens = tokens;
+   r->b_fs[0] = r->pipe->create_fs_state(r->pipe, &fs);
+   free(tokens);
+}
+
+static void
+create_field_bi_pred_frag_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   assert(false);
+}
+
+static void
+xfer_buffers_map(struct vl_mpeg12_mc_renderer *r)
+{
+   unsigned i;
+
+   assert(r);
+
+   for (i = 0; i < 3; ++i) {
+      r->tex_transfer[i] = r->pipe->screen->get_tex_transfer
+      (
+         r->pipe->screen, r->textures.all[i],
+         0, 0, 0, PIPE_TRANSFER_WRITE, 0, 0,
+         r->textures.all[i]->width[0], r->textures.all[i]->height[0]
+      );
+
+      r->texels[i] = r->pipe->screen->transfer_map(r->pipe->screen, r->tex_transfer[i]);
+   }
+}
+
+static void
+xfer_buffers_unmap(struct vl_mpeg12_mc_renderer *r)
+{
+   unsigned i;
+
+   assert(r);
+
+   for (i = 0; i < 3; ++i) {
+      r->pipe->screen->transfer_unmap(r->pipe->screen, r->tex_transfer[i]);
+      r->pipe->screen->tex_transfer_destroy(r->tex_transfer[i]);
+   }
+}
+
+static bool
+init_pipe_state(struct vl_mpeg12_mc_renderer *r)
+{
+   struct pipe_sampler_state sampler;
+   unsigned filters[5];
+   unsigned i;
+
+   assert(r);
+
+   r->viewport.scale[0] = r->pot_buffers ?
+      util_next_power_of_two(r->picture_width) : r->picture_width;
+   r->viewport.scale[1] = r->pot_buffers ?
+      util_next_power_of_two(r->picture_height) : r->picture_height;
+   r->viewport.scale[2] = 1;
+   r->viewport.scale[3] = 1;
+   r->viewport.translate[0] = 0;
+   r->viewport.translate[1] = 0;
+   r->viewport.translate[2] = 0;
+   r->viewport.translate[3] = 0;
+
+   r->fb_state.width = r->pot_buffers ?
+      util_next_power_of_two(r->picture_width) : r->picture_width;
+   r->fb_state.height = r->pot_buffers ?
+      util_next_power_of_two(r->picture_height) : r->picture_height;
+   r->fb_state.nr_cbufs = 1;
+   r->fb_state.zsbuf = NULL;
+
+   /* Luma filter */
+   filters[0] = PIPE_TEX_FILTER_NEAREST;
+   /* Chroma filters */
+   if (r->chroma_format == PIPE_VIDEO_CHROMA_FORMAT_444 ||
+       r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ONE) {
+      filters[1] = PIPE_TEX_FILTER_NEAREST;
+      filters[2] = PIPE_TEX_FILTER_NEAREST;
+   }
+   else {
+      filters[1] = PIPE_TEX_FILTER_LINEAR;
+      filters[2] = PIPE_TEX_FILTER_LINEAR;
+   }
+   /* Fwd, bkwd ref filters */
+   filters[3] = PIPE_TEX_FILTER_LINEAR;
+   filters[4] = PIPE_TEX_FILTER_LINEAR;
+
+   for (i = 0; i < 5; ++i) {
+      sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+      sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+      sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+      sampler.min_img_filter = filters[i];
+      sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
+      sampler.mag_img_filter = filters[i];
+      sampler.compare_mode = PIPE_TEX_COMPARE_NONE;
+      sampler.compare_func = PIPE_FUNC_ALWAYS;
+      sampler.normalized_coords = 1;
+      /*sampler.prefilter = ; */
+      /*sampler.shadow_ambient = ; */
+      /*sampler.lod_bias = ; */
+      sampler.min_lod = 0;
+      /*sampler.max_lod = ; */
+      /*sampler.border_color[i] = ; */
+      /*sampler.max_anisotropy = ; */
+      r->samplers.all[i] = r->pipe->create_sampler_state(r->pipe, &sampler);
+   }
+
+   return true;
+}
+
+static void
+cleanup_pipe_state(struct vl_mpeg12_mc_renderer *r)
+{
+   unsigned i;
+
+   assert(r);
+
+   for (i = 0; i < 5; ++i)
+      r->pipe->delete_sampler_state(r->pipe, r->samplers.all[i]);
+}
+
+static bool
+init_shaders(struct vl_mpeg12_mc_renderer *r)
+{
+   assert(r);
+
+   create_intra_vert_shader(r);
+   create_intra_frag_shader(r);
+   create_frame_pred_vert_shader(r);
+   create_frame_pred_frag_shader(r);
+   create_frame_bi_pred_vert_shader(r);
+   create_frame_bi_pred_frag_shader(r);
+
+   return true;
+}
+
+static void
+cleanup_shaders(struct vl_mpeg12_mc_renderer *r)
+{
+   assert(r);
+
+   r->pipe->delete_vs_state(r->pipe, r->i_vs);
+   r->pipe->delete_fs_state(r->pipe, r->i_fs);
+   r->pipe->delete_vs_state(r->pipe, r->p_vs[0]);
+   r->pipe->delete_fs_state(r->pipe, r->p_fs[0]);
+   r->pipe->delete_vs_state(r->pipe, r->b_vs[0]);
+   r->pipe->delete_fs_state(r->pipe, r->b_fs[0]);
+}
+
+static bool
+init_buffers(struct vl_mpeg12_mc_renderer *r)
+{
+   struct pipe_texture template;
+
+   const unsigned mbw =
+      align(r->picture_width, MACROBLOCK_WIDTH) / MACROBLOCK_WIDTH;
+   const unsigned mbh =
+      align(r->picture_height, MACROBLOCK_HEIGHT) / MACROBLOCK_HEIGHT;
+
+   unsigned i;
+
+   assert(r);
+
+   r->macroblocks_per_batch =
+      mbw * (r->bufmode == VL_MPEG12_MC_RENDERER_BUFFER_PICTURE ? mbh : 1);
+   r->num_macroblocks = 0;
+   r->macroblock_buf = MALLOC(r->macroblocks_per_batch * sizeof(struct pipe_mpeg12_macroblock));
+
+   memset(&template, 0, sizeof(struct pipe_texture));
+   template.target = PIPE_TEXTURE_2D;
+   /* TODO: Accomodate HW that can't do this and also for cases when this isn't precise enough */
+   template.format = PIPE_FORMAT_R16_SNORM;
+   template.last_level = 0;
+   template.width[0] = r->pot_buffers ?
+      util_next_power_of_two(r->picture_width) : r->picture_width;
+   template.height[0] = r->pot_buffers ?
+      util_next_power_of_two(r->picture_height) : r->picture_height;
+   template.depth[0] = 1;
+   pf_get_block(template.format, &template.block);
+   template.tex_usage = PIPE_TEXTURE_USAGE_SAMPLER | PIPE_TEXTURE_USAGE_DYNAMIC;
+
+   r->textures.individual.y = r->pipe->screen->texture_create(r->pipe->screen, &template);
+
+   if (r->chroma_format == PIPE_VIDEO_CHROMA_FORMAT_420) {
+      template.width[0] = r->pot_buffers ?
+         util_next_power_of_two(r->picture_width / 2) :
+         r->picture_width / 2;
+      template.height[0] = r->pot_buffers ?
+         util_next_power_of_two(r->picture_height / 2) :
+         r->picture_height / 2;
+   }
+   else if (r->chroma_format == PIPE_VIDEO_CHROMA_FORMAT_422)
+      template.height[0] = r->pot_buffers ?
+         util_next_power_of_two(r->picture_height / 2) :
+         r->picture_height / 2;
+
+   r->textures.individual.cb =
+      r->pipe->screen->texture_create(r->pipe->screen, &template);
+   r->textures.individual.cr =
+      r->pipe->screen->texture_create(r->pipe->screen, &template);
+
+   r->vertex_bufs.individual.ycbcr.stride = sizeof(struct vertex2f) * 4;
+   r->vertex_bufs.individual.ycbcr.max_index = 24 * r->macroblocks_per_batch - 1;
+   r->vertex_bufs.individual.ycbcr.buffer_offset = 0;
+   r->vertex_bufs.individual.ycbcr.buffer = pipe_buffer_create
+   (
+      r->pipe->screen,
+      DEFAULT_BUF_ALIGNMENT,
+      PIPE_BUFFER_USAGE_VERTEX | PIPE_BUFFER_USAGE_DISCARD,
+      sizeof(struct vertex2f) * 4 * 24 * r->macroblocks_per_batch
+   );
+
+   for (i = 1; i < 3; ++i) {
+      r->vertex_bufs.all[i].stride = sizeof(struct vertex2f) * 2;
+      r->vertex_bufs.all[i].max_index = 24 * r->macroblocks_per_batch - 1;
+      r->vertex_bufs.all[i].buffer_offset = 0;
+      r->vertex_bufs.all[i].buffer = pipe_buffer_create
+      (
+         r->pipe->screen,
+         DEFAULT_BUF_ALIGNMENT,
+         PIPE_BUFFER_USAGE_VERTEX | PIPE_BUFFER_USAGE_DISCARD,
+         sizeof(struct vertex2f) * 2 * 24 * r->macroblocks_per_batch
+      );
+   }
+
+   /* Position element */
+   r->vertex_elems[0].src_offset = 0;
+   r->vertex_elems[0].vertex_buffer_index = 0;
+   r->vertex_elems[0].nr_components = 2;
+   r->vertex_elems[0].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   /* Luma, texcoord element */
+   r->vertex_elems[1].src_offset = sizeof(struct vertex2f);
+   r->vertex_elems[1].vertex_buffer_index = 0;
+   r->vertex_elems[1].nr_components = 2;
+   r->vertex_elems[1].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   /* Chroma Cr texcoord element */
+   r->vertex_elems[2].src_offset = sizeof(struct vertex2f) * 2;
+   r->vertex_elems[2].vertex_buffer_index = 0;
+   r->vertex_elems[2].nr_components = 2;
+   r->vertex_elems[2].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   /* Chroma Cb texcoord element */
+   r->vertex_elems[3].src_offset = sizeof(struct vertex2f) * 3;
+   r->vertex_elems[3].vertex_buffer_index = 0;
+   r->vertex_elems[3].nr_components = 2;
+   r->vertex_elems[3].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   /* First ref surface top field texcoord element */
+   r->vertex_elems[4].src_offset = 0;
+   r->vertex_elems[4].vertex_buffer_index = 1;
+   r->vertex_elems[4].nr_components = 2;
+   r->vertex_elems[4].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   /* First ref surface bottom field texcoord element */
+   r->vertex_elems[5].src_offset = sizeof(struct vertex2f);
+   r->vertex_elems[5].vertex_buffer_index = 1;
+   r->vertex_elems[5].nr_components = 2;
+   r->vertex_elems[5].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   /* Second ref surface top field texcoord element */
+   r->vertex_elems[6].src_offset = 0;
+   r->vertex_elems[6].vertex_buffer_index = 2;
+   r->vertex_elems[6].nr_components = 2;
+   r->vertex_elems[6].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   /* Second ref surface bottom field texcoord element */
+   r->vertex_elems[7].src_offset = sizeof(struct vertex2f);
+   r->vertex_elems[7].vertex_buffer_index = 2;
+   r->vertex_elems[7].nr_components = 2;
+   r->vertex_elems[7].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   r->vs_const_buf.buffer = pipe_buffer_create
+   (
+      r->pipe->screen,
+      DEFAULT_BUF_ALIGNMENT,
+      PIPE_BUFFER_USAGE_CONSTANT | PIPE_BUFFER_USAGE_DISCARD,
+      sizeof(struct vertex_shader_consts)
+   );
+
+   r->fs_const_buf.buffer = pipe_buffer_create
+   (
+      r->pipe->screen,
+      DEFAULT_BUF_ALIGNMENT,
+      PIPE_BUFFER_USAGE_CONSTANT, sizeof(struct fragment_shader_consts)
+   );
+
+   memcpy
+   (
+      pipe_buffer_map(r->pipe->screen, r->fs_const_buf.buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
+      &fs_consts, sizeof(struct fragment_shader_consts)
+   );
+
+   pipe_buffer_unmap(r->pipe->screen, r->fs_const_buf.buffer);
+
+   return true;
+}
+
+static void
+cleanup_buffers(struct vl_mpeg12_mc_renderer *r)
+{
+   unsigned i;
+
+   assert(r);
+
+   pipe_buffer_reference(&r->vs_const_buf.buffer, NULL);
+   pipe_buffer_reference(&r->fs_const_buf.buffer, NULL);
+
+   for (i = 0; i < 3; ++i)
+      pipe_buffer_reference(&r->vertex_bufs.all[i].buffer, NULL);
+
+   for (i = 0; i < 3; ++i)
+      pipe_texture_reference(&r->textures.all[i], NULL);
+
+   FREE(r->macroblock_buf);
+}
+
+static enum MACROBLOCK_TYPE
+get_macroblock_type(struct pipe_mpeg12_macroblock *mb)
+{
+   assert(mb);
+
+   switch (mb->mb_type) {
+      case PIPE_MPEG12_MACROBLOCK_TYPE_INTRA:
+         return MACROBLOCK_TYPE_INTRA;
+      case PIPE_MPEG12_MACROBLOCK_TYPE_FWD:
+         return mb->mo_type == PIPE_MPEG12_MOTION_TYPE_FRAME ?
+            MACROBLOCK_TYPE_FWD_FRAME_PRED : MACROBLOCK_TYPE_FWD_FIELD_PRED;
+      case PIPE_MPEG12_MACROBLOCK_TYPE_BKWD:
+         return mb->mo_type == PIPE_MPEG12_MOTION_TYPE_FRAME ?
+            MACROBLOCK_TYPE_BKWD_FRAME_PRED : MACROBLOCK_TYPE_BKWD_FIELD_PRED;
+      case PIPE_MPEG12_MACROBLOCK_TYPE_BI:
+         return mb->mo_type == PIPE_MPEG12_MOTION_TYPE_FRAME ?
+            MACROBLOCK_TYPE_BI_FRAME_PRED : MACROBLOCK_TYPE_BI_FIELD_PRED;
+      default:
+         assert(0);
+   }
+
+   /* Unreachable */
+   return -1;
+}
+
+/* XXX: One of these days this will have to be killed with fire */
+#define SET_BLOCK(vb, cbp, mbx, mby, unitx, unity, ofsx, ofsy, hx, hy, lm, cbm, crm, use_zb, zb)				\
+	do {															\
+	(vb)[0].pos.x = (mbx) * (unitx) + (ofsx);		(vb)[0].pos.y = (mby) * (unity) + (ofsy);			\
+	(vb)[1].pos.x = (mbx) * (unitx) + (ofsx);		(vb)[1].pos.y = (mby) * (unity) + (ofsy) + (hy);		\
+	(vb)[2].pos.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[2].pos.y = (mby) * (unity) + (ofsy);			\
+	(vb)[3].pos.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[3].pos.y = (mby) * (unity) + (ofsy);			\
+	(vb)[4].pos.x = (mbx) * (unitx) + (ofsx);		(vb)[4].pos.y = (mby) * (unity) + (ofsy) + (hy);		\
+	(vb)[5].pos.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[5].pos.y = (mby) * (unity) + (ofsy) + (hy);		\
+																\
+	if (!use_zb || (cbp) & (lm))												\
+	{															\
+		(vb)[0].luma_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[0].luma_tc.y = (mby) * (unity) + (ofsy);		\
+		(vb)[1].luma_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[1].luma_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
+		(vb)[2].luma_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[2].luma_tc.y = (mby) * (unity) + (ofsy);		\
+		(vb)[3].luma_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[3].luma_tc.y = (mby) * (unity) + (ofsy);		\
+		(vb)[4].luma_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[4].luma_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
+		(vb)[5].luma_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[5].luma_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
+	}															\
+	else															\
+	{															\
+		(vb)[0].luma_tc.x = (zb)[0].x;		(vb)[0].luma_tc.y = (zb)[0].y;						\
+		(vb)[1].luma_tc.x = (zb)[0].x;		(vb)[1].luma_tc.y = (zb)[0].y + (hy);					\
+		(vb)[2].luma_tc.x = (zb)[0].x + (hx);	(vb)[2].luma_tc.y = (zb)[0].y;						\
+		(vb)[3].luma_tc.x = (zb)[0].x + (hx);	(vb)[3].luma_tc.y = (zb)[0].y;						\
+		(vb)[4].luma_tc.x = (zb)[0].x;		(vb)[4].luma_tc.y = (zb)[0].y + (hy);					\
+		(vb)[5].luma_tc.x = (zb)[0].x + (hx);	(vb)[5].luma_tc.y = (zb)[0].y + (hy);					\
+	}															\
+																\
+	if (!use_zb || (cbp) & (cbm))												\
+	{															\
+		(vb)[0].cb_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[0].cb_tc.y = (mby) * (unity) + (ofsy);		\
+		(vb)[1].cb_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[1].cb_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
+		(vb)[2].cb_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[2].cb_tc.y = (mby) * (unity) + (ofsy);		\
+		(vb)[3].cb_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[3].cb_tc.y = (mby) * (unity) + (ofsy);		\
+		(vb)[4].cb_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[4].cb_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
+		(vb)[5].cb_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[5].cb_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
+	}															\
+	else															\
+	{															\
+		(vb)[0].cb_tc.x = (zb)[1].x;		(vb)[0].cb_tc.y = (zb)[1].y;						\
+		(vb)[1].cb_tc.x = (zb)[1].x;		(vb)[1].cb_tc.y = (zb)[1].y + (hy);					\
+		(vb)[2].cb_tc.x = (zb)[1].x + (hx);	(vb)[2].cb_tc.y = (zb)[1].y;						\
+		(vb)[3].cb_tc.x = (zb)[1].x + (hx);	(vb)[3].cb_tc.y = (zb)[1].y;						\
+		(vb)[4].cb_tc.x = (zb)[1].x;		(vb)[4].cb_tc.y = (zb)[1].y + (hy);					\
+		(vb)[5].cb_tc.x = (zb)[1].x + (hx);	(vb)[5].cb_tc.y = (zb)[1].y + (hy);					\
+	}															\
+																\
+	if (!use_zb || (cbp) & (crm))												\
+	{															\
+		(vb)[0].cr_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[0].cr_tc.y = (mby) * (unity) + (ofsy);		\
+		(vb)[1].cr_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[1].cr_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
+		(vb)[2].cr_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[2].cr_tc.y = (mby) * (unity) + (ofsy);		\
+		(vb)[3].cr_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[3].cr_tc.y = (mby) * (unity) + (ofsy);		\
+		(vb)[4].cr_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[4].cr_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
+		(vb)[5].cr_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[5].cr_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
+	}															\
+	else															\
+	{															\
+		(vb)[0].cr_tc.x = (zb)[2].x;		(vb)[0].cr_tc.y = (zb)[2].y;						\
+		(vb)[1].cr_tc.x = (zb)[2].x;		(vb)[1].cr_tc.y = (zb)[2].y + (hy);					\
+		(vb)[2].cr_tc.x = (zb)[2].x + (hx);	(vb)[2].cr_tc.y = (zb)[2].y;						\
+		(vb)[3].cr_tc.x = (zb)[2].x + (hx);	(vb)[3].cr_tc.y = (zb)[2].y;						\
+		(vb)[4].cr_tc.x = (zb)[2].x;		(vb)[4].cr_tc.y = (zb)[2].y + (hy);					\
+		(vb)[5].cr_tc.x = (zb)[2].x + (hx);	(vb)[5].cr_tc.y = (zb)[2].y + (hy);					\
+	}															\
+	} while (0)
+
+static void
+gen_macroblock_verts(struct vl_mpeg12_mc_renderer *r,
+                     struct pipe_mpeg12_macroblock *mb, unsigned pos,
+                     struct vert_stream_0 *ycbcr_vb, struct vertex2f **ref_vb)
+{
+   struct vertex2f mo_vec[2];
+
+   unsigned i;
+
+   assert(r);
+   assert(mb);
+   assert(ycbcr_vb);
+   assert(pos < r->macroblocks_per_batch);
+
+   switch (mb->mb_type) {
+      case PIPE_MPEG12_MACROBLOCK_TYPE_BI:
+      {
+         struct vertex2f *vb;
+
+         assert(ref_vb && ref_vb[1]);
+
+         vb = ref_vb[1] + pos * 2 * 24;
+
+         mo_vec[0].x = mb->pmv[0][1][0] * 0.5f * r->surface_tex_inv_size.x;
+         mo_vec[0].y = mb->pmv[0][1][1] * 0.5f * r->surface_tex_inv_size.y;
+
+         if (mb->mo_type == PIPE_MPEG12_MOTION_TYPE_FRAME) {
+            for (i = 0; i < 24 * 2; i += 2) {
+               vb[i].x = mo_vec[0].x;
+               vb[i].y = mo_vec[0].y;
+            }
+         }
+         else {
+            mo_vec[1].x = mb->pmv[1][1][0] * 0.5f * r->surface_tex_inv_size.x;
+            mo_vec[1].y = mb->pmv[1][1][1] * 0.5f * r->surface_tex_inv_size.y;
+
+            for (i = 0; i < 24 * 2; i += 2) {
+               vb[i].x = mo_vec[0].x;
+               vb[i].y = mo_vec[0].y;
+               vb[i + 1].x = mo_vec[1].x;
+               vb[i + 1].y = mo_vec[1].y;
+            }
+         }
+
+         /* fall-through */
+      }
+      case PIPE_MPEG12_MACROBLOCK_TYPE_FWD:
+      case PIPE_MPEG12_MACROBLOCK_TYPE_BKWD:
+      {
+         struct vertex2f *vb;
+
+         assert(ref_vb && ref_vb[0]);
+
+         vb = ref_vb[0] + pos * 2 * 24;
+
+         if (mb->mb_type == PIPE_MPEG12_MACROBLOCK_TYPE_BKWD) {
+             mo_vec[0].x = mb->pmv[0][1][0] * 0.5f * r->surface_tex_inv_size.x;
+             mo_vec[0].y = mb->pmv[0][1][1] * 0.5f * r->surface_tex_inv_size.y;
+
+             if (mb->mo_type == PIPE_MPEG12_MOTION_TYPE_FIELD) {
+                mo_vec[1].x = mb->pmv[1][1][0] * 0.5f * r->surface_tex_inv_size.x;
+                mo_vec[1].y = mb->pmv[1][1][1] * 0.5f * r->surface_tex_inv_size.y;
+             }
+         }
+         else {
+            mo_vec[0].x = mb->pmv[0][0][0] * 0.5f * r->surface_tex_inv_size.x;
+            mo_vec[0].y = mb->pmv[0][0][1] * 0.5f * r->surface_tex_inv_size.y;
+
+            if (mb->mo_type == PIPE_MPEG12_MOTION_TYPE_FIELD) {
+               mo_vec[1].x = mb->pmv[1][0][0] * 0.5f * r->surface_tex_inv_size.x;
+               mo_vec[1].y = mb->pmv[1][0][1] * 0.5f * r->surface_tex_inv_size.y;
+            }
+         }
+
+         if (mb->mb_type == PIPE_MPEG12_MOTION_TYPE_FRAME) {
+            for (i = 0; i < 24 * 2; i += 2) {
+               vb[i].x = mo_vec[0].x;
+               vb[i].y = mo_vec[0].y;
+            }
+         }
+         else {
+            for (i = 0; i < 24 * 2; i += 2) {
+               vb[i].x = mo_vec[0].x;
+               vb[i].y = mo_vec[0].y;
+               vb[i + 1].x = mo_vec[1].x;
+               vb[i + 1].y = mo_vec[1].y;
+            }
+         }
+
+         /* fall-through */
+      }
+      case PIPE_MPEG12_MACROBLOCK_TYPE_INTRA:
+      {
+         const struct vertex2f unit =
+         {
+            r->surface_tex_inv_size.x * MACROBLOCK_WIDTH,
+            r->surface_tex_inv_size.y * MACROBLOCK_HEIGHT
+         };
+         const struct vertex2f half =
+         {
+            r->surface_tex_inv_size.x * (MACROBLOCK_WIDTH / 2),
+            r->surface_tex_inv_size.y * (MACROBLOCK_HEIGHT / 2)
+         };
+         const bool use_zb = r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ONE;
+
+         struct vert_stream_0 *vb = ycbcr_vb + pos * 24;
+
+         SET_BLOCK(vb, mb->cbp, mb->mbx, mb->mby,
+                   unit.x, unit.y, 0, 0, half.x, half.y,
+                   32, 2, 1, use_zb, r->zero_block);
+
+         SET_BLOCK(vb + 6, mb->cbp, mb->mbx, mb->mby,
+                   unit.x, unit.y, half.x, 0, half.x, half.y,
+                   16, 2, 1, use_zb, r->zero_block);
+
+         SET_BLOCK(vb + 12, mb->cbp, mb->mbx, mb->mby,
+                   unit.x, unit.y, 0, half.y, half.x, half.y,
+                   8, 2, 1, use_zb, r->zero_block);
+
+         SET_BLOCK(vb + 18, mb->cbp, mb->mbx, mb->mby,
+                   unit.x, unit.y, half.x, half.y, half.x, half.y,
+                   4, 2, 1, use_zb, r->zero_block);
+
+         break;
+      }
+      default:
+         assert(0);
+   }
+}
+
+static void
+gen_macroblock_stream(struct vl_mpeg12_mc_renderer *r,
+                      unsigned *num_macroblocks)
+{
+   unsigned offset[NUM_MACROBLOCK_TYPES];
+   struct vert_stream_0 *ycbcr_vb;
+   struct vertex2f *ref_vb[2];
+   unsigned i;
+
+   assert(r);
+   assert(num_macroblocks);
+
+   for (i = 0; i < r->num_macroblocks; ++i) {
+      enum MACROBLOCK_TYPE mb_type = get_macroblock_type(&r->macroblock_buf[i]);
+      ++num_macroblocks[mb_type];
+   }
+
+   offset[0] = 0;
+
+   for (i = 1; i < NUM_MACROBLOCK_TYPES; ++i)
+      offset[i] = offset[i - 1] + num_macroblocks[i - 1];
+
+   ycbcr_vb = (struct vert_stream_0 *)pipe_buffer_map
+   (
+      r->pipe->screen,
+      r->vertex_bufs.individual.ycbcr.buffer,
+      PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_DISCARD
+   );
+
+   for (i = 0; i < 2; ++i)
+      ref_vb[i] = (struct vertex2f *)pipe_buffer_map
+      (
+         r->pipe->screen,
+         r->vertex_bufs.individual.ref[i].buffer,
+         PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_DISCARD
+      );
+
+   for (i = 0; i < r->num_macroblocks; ++i) {
+      enum MACROBLOCK_TYPE mb_type = get_macroblock_type(&r->macroblock_buf[i]);
+
+      gen_macroblock_verts(r, &r->macroblock_buf[i], offset[mb_type],
+                           ycbcr_vb, ref_vb);
+
+      ++offset[mb_type];
+   }
+
+   pipe_buffer_unmap(r->pipe->screen, r->vertex_bufs.individual.ycbcr.buffer);
+   for (i = 0; i < 2; ++i)
+      pipe_buffer_unmap(r->pipe->screen, r->vertex_bufs.individual.ref[i].buffer);
+}
+
+static void
+flush(struct vl_mpeg12_mc_renderer *r)
+{
+   unsigned num_macroblocks[NUM_MACROBLOCK_TYPES] = { 0 };
+   unsigned vb_start = 0;
+   struct vertex_shader_consts *vs_consts;
+   unsigned i;
+
+   assert(r);
+   assert(r->num_macroblocks == r->macroblocks_per_batch);
+
+   gen_macroblock_stream(r, num_macroblocks);
+
+   r->fb_state.cbufs[0] = r->pipe->screen->get_tex_surface
+   (
+      r->pipe->screen, r->surface,
+      0, 0, 0, PIPE_BUFFER_USAGE_GPU_WRITE
+   );
+
+   r->pipe->set_framebuffer_state(r->pipe, &r->fb_state);
+   r->pipe->set_viewport_state(r->pipe, &r->viewport);
+
+   vs_consts = pipe_buffer_map
+   (
+      r->pipe->screen, r->vs_const_buf.buffer,
+      PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_DISCARD
+   );
+
+   vs_consts->denorm.x = r->surface->width[0];
+   vs_consts->denorm.y = r->surface->height[0];
+
+   pipe_buffer_unmap(r->pipe->screen, r->vs_const_buf.buffer);
+
+   r->pipe->set_constant_buffer(r->pipe, PIPE_SHADER_VERTEX, 0,
+                                &r->vs_const_buf);
+   r->pipe->set_constant_buffer(r->pipe, PIPE_SHADER_FRAGMENT, 0,
+                                &r->fs_const_buf);
+
+   if (num_macroblocks[MACROBLOCK_TYPE_INTRA] > 0) {
+      r->pipe->set_vertex_buffers(r->pipe, 1, r->vertex_bufs.all);
+      r->pipe->set_vertex_elements(r->pipe, 4, r->vertex_elems);
+      r->pipe->set_sampler_textures(r->pipe, 3, r->textures.all);
+      r->pipe->bind_sampler_states(r->pipe, 3, r->samplers.all);
+      r->pipe->bind_vs_state(r->pipe, r->i_vs);
+      r->pipe->bind_fs_state(r->pipe, r->i_fs);
+
+      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
+                           num_macroblocks[MACROBLOCK_TYPE_INTRA] * 24);
+      vb_start += num_macroblocks[MACROBLOCK_TYPE_INTRA] * 24;
+   }
+
+   if (num_macroblocks[MACROBLOCK_TYPE_FWD_FRAME_PRED] > 0) {
+      r->pipe->set_vertex_buffers(r->pipe, 2, r->vertex_bufs.all);
+      r->pipe->set_vertex_elements(r->pipe, 6, r->vertex_elems);
+      r->textures.individual.ref[0] = r->past;
+      r->pipe->set_sampler_textures(r->pipe, 4, r->textures.all);
+      r->pipe->bind_sampler_states(r->pipe, 4, r->samplers.all);
+      r->pipe->bind_vs_state(r->pipe, r->p_vs[0]);
+      r->pipe->bind_fs_state(r->pipe, r->p_fs[0]);
+
+      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
+                           num_macroblocks[MACROBLOCK_TYPE_FWD_FRAME_PRED] * 24);
+      vb_start += num_macroblocks[MACROBLOCK_TYPE_FWD_FRAME_PRED] * 24;
+   }
+
+   if (false /*num_macroblocks[MACROBLOCK_TYPE_FWD_FIELD_PRED] > 0 */ ) {
+      r->pipe->set_vertex_buffers(r->pipe, 2, r->vertex_bufs.all);
+      r->pipe->set_vertex_elements(r->pipe, 6, r->vertex_elems);
+      r->textures.individual.ref[0] = r->past;
+      r->pipe->set_sampler_textures(r->pipe, 4, r->textures.all);
+      r->pipe->bind_sampler_states(r->pipe, 4, r->samplers.all);
+      r->pipe->bind_vs_state(r->pipe, r->p_vs[1]);
+      r->pipe->bind_fs_state(r->pipe, r->p_fs[1]);
+
+      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
+                           num_macroblocks[MACROBLOCK_TYPE_FWD_FIELD_PRED] * 24);
+      vb_start += num_macroblocks[MACROBLOCK_TYPE_FWD_FIELD_PRED] * 24;
+   }
+
+   if (num_macroblocks[MACROBLOCK_TYPE_BKWD_FRAME_PRED] > 0) {
+      r->pipe->set_vertex_buffers(r->pipe, 2, r->vertex_bufs.all);
+      r->pipe->set_vertex_elements(r->pipe, 6, r->vertex_elems);
+      r->textures.individual.ref[0] = r->future;
+      r->pipe->set_sampler_textures(r->pipe, 4, r->textures.all);
+      r->pipe->bind_sampler_states(r->pipe, 4, r->samplers.all);
+      r->pipe->bind_vs_state(r->pipe, r->p_vs[0]);
+      r->pipe->bind_fs_state(r->pipe, r->p_fs[0]);
+
+      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
+                           num_macroblocks[MACROBLOCK_TYPE_BKWD_FRAME_PRED] * 24);
+      vb_start += num_macroblocks[MACROBLOCK_TYPE_BKWD_FRAME_PRED] * 24;
+   }
+
+   if (false /*num_macroblocks[MACROBLOCK_TYPE_BKWD_FIELD_PRED] > 0 */ ) {
+      r->pipe->set_vertex_buffers(r->pipe, 2, r->vertex_bufs.all);
+      r->pipe->set_vertex_elements(r->pipe, 6, r->vertex_elems);
+      r->textures.individual.ref[0] = r->future;
+      r->pipe->set_sampler_textures(r->pipe, 4, r->textures.all);
+      r->pipe->bind_sampler_states(r->pipe, 4, r->samplers.all);
+      r->pipe->bind_vs_state(r->pipe, r->p_vs[1]);
+      r->pipe->bind_fs_state(r->pipe, r->p_fs[1]);
+
+      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
+                           num_macroblocks[MACROBLOCK_TYPE_BKWD_FIELD_PRED] * 24);
+      vb_start += num_macroblocks[MACROBLOCK_TYPE_BKWD_FIELD_PRED] * 24;
+   }
+
+   if (num_macroblocks[MACROBLOCK_TYPE_BI_FRAME_PRED] > 0) {
+      r->pipe->set_vertex_buffers(r->pipe, 3, r->vertex_bufs.all);
+      r->pipe->set_vertex_elements(r->pipe, 8, r->vertex_elems);
+      r->textures.individual.ref[0] = r->past;
+      r->textures.individual.ref[1] = r->future;
+      r->pipe->set_sampler_textures(r->pipe, 5, r->textures.all);
+      r->pipe->bind_sampler_states(r->pipe, 5, r->samplers.all);
+      r->pipe->bind_vs_state(r->pipe, r->b_vs[0]);
+      r->pipe->bind_fs_state(r->pipe, r->b_fs[0]);
+
+      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
+                           num_macroblocks[MACROBLOCK_TYPE_BI_FRAME_PRED] * 24);
+      vb_start += num_macroblocks[MACROBLOCK_TYPE_BI_FRAME_PRED] * 24;
+   }
+
+   if (false /*num_macroblocks[MACROBLOCK_TYPE_BI_FIELD_PRED] > 0 */ ) {
+      r->pipe->set_vertex_buffers(r->pipe, 3, r->vertex_bufs.all);
+      r->pipe->set_vertex_elements(r->pipe, 8, r->vertex_elems);
+      r->textures.individual.ref[0] = r->past;
+      r->textures.individual.ref[1] = r->future;
+      r->pipe->set_sampler_textures(r->pipe, 5, r->textures.all);
+      r->pipe->bind_sampler_states(r->pipe, 5, r->samplers.all);
+      r->pipe->bind_vs_state(r->pipe, r->b_vs[1]);
+      r->pipe->bind_fs_state(r->pipe, r->b_fs[1]);
+
+      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
+                           num_macroblocks[MACROBLOCK_TYPE_BI_FIELD_PRED] * 24);
+      vb_start += num_macroblocks[MACROBLOCK_TYPE_BI_FIELD_PRED] * 24;
+   }
+
+   r->pipe->flush(r->pipe, PIPE_FLUSH_RENDER_CACHE, r->fence);
+   pipe_surface_reference(&r->fb_state.cbufs[0], NULL);
+
+   if (r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ONE)
+      for (i = 0; i < 3; ++i)
+         r->zero_block[i].x = ZERO_BLOCK_NIL;
+
+   r->num_macroblocks = 0;
+}
+
+static void
+grab_frame_coded_block(short *src, short *dst, unsigned dst_pitch)
+{
+   unsigned y;
+
+   assert(src);
+   assert(dst);
+
+   for (y = 0; y < BLOCK_HEIGHT; ++y)
+      memcpy(dst + y * dst_pitch, src + y * BLOCK_WIDTH, BLOCK_WIDTH * 2);
+}
+
+static void
+grab_field_coded_block(short *src, short *dst, unsigned dst_pitch)
+{
+   unsigned y;
+
+   assert(src);
+   assert(dst);
+
+   for (y = 0; y < BLOCK_HEIGHT; ++y)
+      memcpy(dst + y * dst_pitch * 2, src + y * BLOCK_WIDTH, BLOCK_WIDTH * 2);
+}
+
+static void
+fill_zero_block(short *dst, unsigned dst_pitch)
+{
+   unsigned y;
+
+   assert(dst);
+
+   for (y = 0; y < BLOCK_HEIGHT; ++y)
+      memset(dst + y * dst_pitch, 0, BLOCK_WIDTH * 2);
+}
+
+static void
+grab_blocks(struct vl_mpeg12_mc_renderer *r, unsigned mbx, unsigned mby,
+            enum pipe_mpeg12_dct_type dct_type, unsigned cbp, short *blocks)
+{
+   unsigned tex_pitch;
+   short *texels;
+   unsigned tb = 0, sb = 0;
+   unsigned mbpx = mbx * MACROBLOCK_WIDTH, mbpy = mby * MACROBLOCK_HEIGHT;
+   unsigned x, y;
+
+   assert(r);
+   assert(blocks);
+
+   tex_pitch = r->tex_transfer[0]->stride / r->tex_transfer[0]->block.size;
+   texels = r->texels[0] + mbpy * tex_pitch + mbpx;
+
+   for (y = 0; y < 2; ++y) {
+      for (x = 0; x < 2; ++x, ++tb) {
+         if ((cbp >> (5 - tb)) & 1) {
+            if (dct_type == PIPE_MPEG12_DCT_TYPE_FRAME) {
+               grab_frame_coded_block(blocks + sb * BLOCK_WIDTH * BLOCK_HEIGHT,
+                                      texels + y * tex_pitch * BLOCK_WIDTH +
+                                      x * BLOCK_WIDTH, tex_pitch);
+            }
+            else {
+               grab_field_coded_block(blocks + sb * BLOCK_WIDTH * BLOCK_HEIGHT,
+                                      texels + y * tex_pitch + x * BLOCK_WIDTH,
+                                      tex_pitch);
+            }
+
+            ++sb;
+         }
+         else if (r->eb_handling != VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_NONE) {
+            if (r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ALL ||
+                ZERO_BLOCK_IS_NIL(r->zero_block[0])) {
+               fill_zero_block(texels + y * tex_pitch * BLOCK_WIDTH + x * BLOCK_WIDTH, tex_pitch);
+               if (r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ONE) {
+                  r->zero_block[0].x = (mbpx + x * 8) * r->surface_tex_inv_size.x;
+                  r->zero_block[0].y = (mbpy + y * 8) * r->surface_tex_inv_size.y;
+               }
+            }
+         }
+      }
+   }
+
+   /* TODO: Implement 422, 444 */
+   assert(r->chroma_format == PIPE_VIDEO_CHROMA_FORMAT_420);
+
+   mbpx /= 2;
+   mbpy /= 2;
+
+   for (tb = 0; tb < 2; ++tb) {
+      tex_pitch = r->tex_transfer[tb + 1]->stride / r->tex_transfer[tb + 1]->block.size;
+      texels = r->texels[tb + 1] + mbpy * tex_pitch + mbpx;
+
+      if ((cbp >> (1 - tb)) & 1) {
+         grab_frame_coded_block(blocks + sb * BLOCK_WIDTH * BLOCK_HEIGHT, texels, tex_pitch);
+         ++sb;
+      }
+      else if (r->eb_handling != VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_NONE) {
+         if (r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ALL ||
+             ZERO_BLOCK_IS_NIL(r->zero_block[tb + 1])) {
+            fill_zero_block(texels, tex_pitch);
+            if (r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ONE) {
+               r->zero_block[tb + 1].x = (mbpx << 1) * r->surface_tex_inv_size.x;
+               r->zero_block[tb + 1].y = (mbpy << 1) * r->surface_tex_inv_size.y;
+            }
+         }
+      }
+   }
+}
+
+static void
+grab_macroblock(struct vl_mpeg12_mc_renderer *r,
+                struct pipe_mpeg12_macroblock *mb)
+{
+   assert(r);
+   assert(mb);
+   assert(r->num_macroblocks < r->macroblocks_per_batch);
+
+   memcpy(&r->macroblock_buf[r->num_macroblocks], mb,
+          sizeof(struct pipe_mpeg12_macroblock));
+
+   grab_blocks(r, mb->mbx, mb->mby, mb->dct_type, mb->cbp, mb->blocks);
+
+   ++r->num_macroblocks;
+}
+
+bool
+vl_mpeg12_mc_renderer_init(struct vl_mpeg12_mc_renderer *renderer,
+                           struct pipe_context *pipe,
+                           unsigned picture_width,
+                           unsigned picture_height,
+                           enum pipe_video_chroma_format chroma_format,
+                           enum VL_MPEG12_MC_RENDERER_BUFFER_MODE bufmode,
+                           enum VL_MPEG12_MC_RENDERER_EMPTY_BLOCK eb_handling,
+                           bool pot_buffers)
+{
+   unsigned i;
+
+   assert(renderer);
+   assert(pipe);
+   /* TODO: Implement other policies */
+   assert(bufmode == VL_MPEG12_MC_RENDERER_BUFFER_PICTURE);
+   /* TODO: Implement this */
+   /* XXX: XFER_ALL sampling issue at block edges when using bilinear filtering */
+   assert(eb_handling != VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_NONE);
+   /* TODO: Non-pot buffers untested, probably doesn't work without changes to texcoord generation, vert shader, etc */
+   assert(pot_buffers);
+
+   memset(renderer, 0, sizeof(struct vl_mpeg12_mc_renderer));
+
+   renderer->pipe = pipe;
+   renderer->picture_width = picture_width;
+   renderer->picture_height = picture_height;
+   renderer->chroma_format = chroma_format;
+   renderer->bufmode = bufmode;
+   renderer->eb_handling = eb_handling;
+   renderer->pot_buffers = pot_buffers;
+
+   if (!init_pipe_state(renderer))
+      return false;
+   if (!init_shaders(renderer)) {
+      cleanup_pipe_state(renderer);
+      return false;
+   }
+   if (!init_buffers(renderer)) {
+      cleanup_shaders(renderer);
+      cleanup_pipe_state(renderer);
+      return false;
+   }
+
+   renderer->surface = NULL;
+   renderer->past = NULL;
+   renderer->future = NULL;
+   for (i = 0; i < 3; ++i)
+      renderer->zero_block[i].x = ZERO_BLOCK_NIL;
+   renderer->num_macroblocks = 0;
+
+   xfer_buffers_map(renderer);
+
+   return true;
+}
+
+void
+vl_mpeg12_mc_renderer_cleanup(struct vl_mpeg12_mc_renderer *renderer)
+{
+   assert(renderer);
+
+   xfer_buffers_unmap(renderer);
+
+   cleanup_pipe_state(renderer);
+   cleanup_shaders(renderer);
+   cleanup_buffers(renderer);
+}
+
+void
+vl_mpeg12_mc_renderer_render_macroblocks(struct vl_mpeg12_mc_renderer
+                                         *renderer,
+                                         struct pipe_texture *surface,
+                                         struct pipe_texture *past,
+                                         struct pipe_texture *future,
+                                         unsigned num_macroblocks,
+                                         struct pipe_mpeg12_macroblock
+                                         *mpeg12_macroblocks,
+                                         struct pipe_fence_handle **fence)
+{
+   bool new_surface = false;
+
+   assert(renderer);
+   assert(surface);
+   assert(num_macroblocks);
+   assert(mpeg12_macroblocks);
+
+   if (renderer->surface) {
+      if (surface != renderer->surface) {
+         if (renderer->num_macroblocks > 0) {
+            xfer_buffers_unmap(renderer);
+            flush(renderer);
+         }
+         
+         new_surface = true;
+      }
+
+      /* If the surface we're rendering hasn't changed the ref frames shouldn't change. */
+      assert(surface != renderer->surface || renderer->past == past);
+      assert(surface != renderer->surface || renderer->future == future);
+   }
+   else
+      new_surface = true;
+
+   if (new_surface) {
+      renderer->surface = surface;
+      renderer->past = past;
+      renderer->future = future;
+      renderer->fence = fence;
+      renderer->surface_tex_inv_size.x = 1.0f / surface->width[0];
+      renderer->surface_tex_inv_size.y = 1.0f / surface->height[0];
+   }
+
+   while (num_macroblocks) {
+      unsigned left_in_batch = renderer->macroblocks_per_batch - renderer->num_macroblocks;
+      unsigned num_to_submit = MIN2(num_macroblocks, left_in_batch);
+      unsigned i;
+
+      for (i = 0; i < num_to_submit; ++i) {
+         assert(mpeg12_macroblocks[i].base.codec == PIPE_VIDEO_CODEC_MPEG12);
+         grab_macroblock(renderer, &mpeg12_macroblocks[i]);
+      }
+
+      num_macroblocks -= num_to_submit;
+
+      if (renderer->num_macroblocks == renderer->macroblocks_per_batch) {
+         xfer_buffers_unmap(renderer);
+         flush(renderer);
+         xfer_buffers_map(renderer);
+         /* Next time we get this surface it may have new ref frames */
+         renderer->surface = NULL;
+      }
+   }
+}
diff --git a/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.h b/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.h
new file mode 100644
index 0000000000..0c2f679664
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.h
@@ -0,0 +1,93 @@
+#ifndef vl_mpeg12_mc_renderer_h
+#define vl_mpeg12_mc_renderer_h
+
+#include <pipe/p_compiler.h>
+#include <pipe/p_state.h>
+#include <pipe/p_video_state.h>
+
+struct pipe_context;
+struct pipe_video_surface;
+struct pipe_macroblock;
+
+/* A slice is video-width (rounded up to a multiple of macroblock width) x macroblock height */
+enum VL_MPEG12_MC_RENDERER_BUFFER_MODE
+{
+   VL_MPEG12_MC_RENDERER_BUFFER_SLICE,  /* Saves memory at the cost of smaller batches */
+   VL_MPEG12_MC_RENDERER_BUFFER_PICTURE /* Larger batches, more memory */
+};
+
+enum VL_MPEG12_MC_RENDERER_EMPTY_BLOCK
+{
+   VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ALL, /* Waste of memory bandwidth */
+   VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ONE, /* Can only do point-filtering when interpolating subsampled chroma channels */
+   VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_NONE /* Needs conditional texel fetch! */
+};
+
+struct vl_mpeg12_mc_renderer
+{
+   struct pipe_context *pipe;
+   unsigned picture_width;
+   unsigned picture_height;
+   enum pipe_video_chroma_format chroma_format;
+   enum VL_MPEG12_MC_RENDERER_BUFFER_MODE bufmode;
+   enum VL_MPEG12_MC_RENDERER_EMPTY_BLOCK eb_handling;
+   bool pot_buffers;
+   unsigned macroblocks_per_batch;
+
+   struct pipe_viewport_state viewport;
+   struct pipe_constant_buffer vs_const_buf;
+   struct pipe_constant_buffer fs_const_buf;
+   struct pipe_framebuffer_state fb_state;
+   struct pipe_vertex_element vertex_elems[8];
+	
+   union
+   {
+      void *all[5];
+      struct { void *y, *cb, *cr, *ref[2]; } individual;
+   } samplers;
+	
+   void *i_vs, *p_vs[2], *b_vs[2];
+   void *i_fs, *p_fs[2], *b_fs[2];
+	
+   union
+   {
+      struct pipe_texture *all[5];
+      struct { struct pipe_texture *y, *cb, *cr, *ref[2]; } individual;
+   } textures;
+
+   union
+   {
+      struct pipe_vertex_buffer all[3];
+      struct { struct pipe_vertex_buffer ycbcr, ref[2]; } individual;
+   } vertex_bufs;
+	
+   struct pipe_texture *surface, *past, *future;
+   struct pipe_fence_handle **fence;
+   unsigned num_macroblocks;
+   struct pipe_mpeg12_macroblock *macroblock_buf;
+   struct pipe_transfer *tex_transfer[3];
+   short *texels[3];
+   struct { float x, y; } surface_tex_inv_size;
+   struct { float x, y; } zero_block[3];
+};
+
+bool vl_mpeg12_mc_renderer_init(struct vl_mpeg12_mc_renderer *renderer,
+                                struct pipe_context *pipe,
+                                unsigned picture_width,
+                                unsigned picture_height,
+                                enum pipe_video_chroma_format chroma_format,
+                                enum VL_MPEG12_MC_RENDERER_BUFFER_MODE bufmode,
+                                enum VL_MPEG12_MC_RENDERER_EMPTY_BLOCK eb_handling,
+                                bool pot_buffers);
+
+void vl_mpeg12_mc_renderer_cleanup(struct vl_mpeg12_mc_renderer *renderer);
+
+void vl_mpeg12_mc_renderer_render_macroblocks(struct vl_mpeg12_mc_renderer *renderer,
+                                              struct pipe_texture *surface,
+                                              struct pipe_texture *past,
+                                              struct pipe_texture *future,
+                                              unsigned num_macroblocks,
+                                              struct pipe_mpeg12_macroblock *mpeg12_macroblocks,
+                                              struct pipe_fence_handle **fence);
+
+#endif /* vl_mpeg12_mc_renderer_h */
diff --git a/src/gallium/auxiliary/vl/vl_shader_build.c b/src/gallium/auxiliary/vl/vl_shader_build.c
new file mode 100644
index 0000000000..9ad1e052c6
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_shader_build.c
@@ -0,0 +1,215 @@
+#include "vl_shader_build.h"
+#include <assert.h>
+#include <tgsi/tgsi_parse.h>
+#include <tgsi/tgsi_build.h>
+
+struct tgsi_full_declaration vl_decl_input(unsigned int name, unsigned int index, unsigned int first, unsigned int last)
+{
+   struct tgsi_full_declaration decl = tgsi_default_full_declaration();
+
+   decl.Declaration.File = TGSI_FILE_INPUT;
+   decl.Declaration.Semantic = 1;
+   decl.Semantic.SemanticName = name;
+   decl.Semantic.SemanticIndex = index;
+   decl.DeclarationRange.First = first;
+   decl.DeclarationRange.Last = last;
+
+   return decl;
+}
+
+struct tgsi_full_declaration vl_decl_interpolated_input
+(
+   unsigned int name,
+   unsigned int index,
+   unsigned int first,
+   unsigned int last,
+   int interpolation
+)
+{
+   struct tgsi_full_declaration decl = tgsi_default_full_declaration();
+
+   assert
+   (
+      interpolation == TGSI_INTERPOLATE_CONSTANT ||
+      interpolation == TGSI_INTERPOLATE_LINEAR ||
+      interpolation == TGSI_INTERPOLATE_PERSPECTIVE
+   );
+
+   decl.Declaration.File = TGSI_FILE_INPUT;
+   decl.Declaration.Semantic = 1;
+   decl.Semantic.SemanticName = name;
+   decl.Semantic.SemanticIndex = index;
+   decl.Declaration.Interpolate = interpolation;;
+   decl.DeclarationRange.First = first;
+   decl.DeclarationRange.Last = last;
+
+   return decl;
+}
+
+struct tgsi_full_declaration vl_decl_constants(unsigned int name, unsigned int index, unsigned int first, unsigned int last)
+{
+   struct tgsi_full_declaration decl = tgsi_default_full_declaration();
+
+   decl.Declaration.File = TGSI_FILE_CONSTANT;
+   decl.Declaration.Semantic = 1;
+   decl.Semantic.SemanticName = name;
+   decl.Semantic.SemanticIndex = index;
+   decl.DeclarationRange.First = first;
+   decl.DeclarationRange.Last = last;
+
+   return decl;
+}
+
+struct tgsi_full_declaration vl_decl_output(unsigned int name, unsigned int index, unsigned int first, unsigned int last)
+{
+   struct tgsi_full_declaration decl = tgsi_default_full_declaration();
+
+   decl.Declaration.File = TGSI_FILE_OUTPUT;
+   decl.Declaration.Semantic = 1;
+   decl.Semantic.SemanticName = name;
+   decl.Semantic.SemanticIndex = index;
+   decl.DeclarationRange.First = first;
+   decl.DeclarationRange.Last = last;
+
+   return decl;
+}
+
+struct tgsi_full_declaration vl_decl_temps(unsigned int first, unsigned int last)
+{
+   struct tgsi_full_declaration decl = tgsi_default_full_declaration();
+
+   decl = tgsi_default_full_declaration();
+   decl.Declaration.File = TGSI_FILE_TEMPORARY;
+   decl.DeclarationRange.First = first;
+   decl.DeclarationRange.Last = last;
+
+   return decl;
+}
+
+struct tgsi_full_declaration vl_decl_samplers(unsigned int first, unsigned int last)
+{
+   struct tgsi_full_declaration decl = tgsi_default_full_declaration();
+
+   decl = tgsi_default_full_declaration();
+   decl.Declaration.File = TGSI_FILE_SAMPLER;
+   decl.DeclarationRange.First = first;
+   decl.DeclarationRange.Last = last;
+
+   return decl;
+}
+
+struct tgsi_full_instruction vl_inst2
+(
+   int opcode,
+   enum tgsi_file_type dst_file,
+   unsigned int dst_index,
+   enum tgsi_file_type src_file,
+   unsigned int src_index
+)
+{
+   struct tgsi_full_instruction inst = tgsi_default_full_instruction();
+
+   inst.Instruction.Opcode = opcode;
+   inst.Instruction.NumDstRegs = 1;
+   inst.FullDstRegisters[0].DstRegister.File = dst_file;
+   inst.FullDstRegisters[0].DstRegister.Index = dst_index;
+   inst.Instruction.NumSrcRegs = 1;
+   inst.FullSrcRegisters[0].SrcRegister.File = src_file;
+   inst.FullSrcRegisters[0].SrcRegister.Index = src_index;
+
+   return inst;
+}
+
+struct tgsi_full_instruction vl_inst3
+(
+   int opcode,
+   enum tgsi_file_type dst_file,
+   unsigned int dst_index,
+   enum tgsi_file_type src1_file,
+   unsigned int src1_index,
+   enum tgsi_file_type src2_file,
+   unsigned int src2_index
+)
+{
+   struct tgsi_full_instruction inst = tgsi_default_full_instruction();
+
+   inst.Instruction.Opcode = opcode;
+   inst.Instruction.NumDstRegs = 1;
+   inst.FullDstRegisters[0].DstRegister.File = dst_file;
+   inst.FullDstRegisters[0].DstRegister.Index = dst_index;
+   inst.Instruction.NumSrcRegs = 2;
+   inst.FullSrcRegisters[0].SrcRegister.File = src1_file;
+   inst.FullSrcRegisters[0].SrcRegister.Index = src1_index;
+   inst.FullSrcRegisters[1].SrcRegister.File = src2_file;
+   inst.FullSrcRegisters[1].SrcRegister.Index = src2_index;
+
+   return inst;
+}
+
+struct tgsi_full_instruction vl_tex
+(
+   int tex,
+   enum tgsi_file_type dst_file,
+   unsigned int dst_index,
+   enum tgsi_file_type src1_file,
+   unsigned int src1_index,
+   enum tgsi_file_type src2_file,
+   unsigned int src2_index
+)
+{
+   struct tgsi_full_instruction inst = tgsi_default_full_instruction();
+
+   inst.Instruction.Opcode = TGSI_OPCODE_TEX;
+   inst.Instruction.NumDstRegs = 1;
+   inst.FullDstRegisters[0].DstRegister.File = dst_file;
+   inst.FullDstRegisters[0].DstRegister.Index = dst_index;
+   inst.Instruction.NumSrcRegs = 2;
+   inst.InstructionExtTexture.Texture = tex;
+   inst.FullSrcRegisters[0].SrcRegister.File = src1_file;
+   inst.FullSrcRegisters[0].SrcRegister.Index = src1_index;
+   inst.FullSrcRegisters[1].SrcRegister.File = src2_file;
+   inst.FullSrcRegisters[1].SrcRegister.Index = src2_index;
+
+   return inst;
+}
+
+struct tgsi_full_instruction vl_inst4
+(
+   int opcode,
+   enum tgsi_file_type dst_file,
+   unsigned int dst_index,
+   enum tgsi_file_type src1_file,
+   unsigned int src1_index,
+   enum tgsi_file_type src2_file,
+   unsigned int src2_index,
+   enum tgsi_file_type src3_file,
+   unsigned int src3_index
+)
+{
+   struct tgsi_full_instruction inst = tgsi_default_full_instruction();
+
+   inst.Instruction.Opcode = opcode;
+   inst.Instruction.NumDstRegs = 1;
+   inst.FullDstRegisters[0].DstRegister.File = dst_file;
+   inst.FullDstRegisters[0].DstRegister.Index = dst_index;
+   inst.Instruction.NumSrcRegs = 3;
+   inst.FullSrcRegisters[0].SrcRegister.File = src1_file;
+   inst.FullSrcRegisters[0].SrcRegister.Index = src1_index;
+   inst.FullSrcRegisters[1].SrcRegister.File = src2_file;
+   inst.FullSrcRegisters[1].SrcRegister.Index = src2_index;
+   inst.FullSrcRegisters[2].SrcRegister.File = src3_file;
+   inst.FullSrcRegisters[2].SrcRegister.Index = src3_index;
+
+   return inst;
+}
+
+struct tgsi_full_instruction vl_end(void)
+{
+   struct tgsi_full_instruction inst = tgsi_default_full_instruction();
+
+   inst.Instruction.Opcode = TGSI_OPCODE_END;
+   inst.Instruction.NumDstRegs = 0;
+   inst.Instruction.NumSrcRegs = 0;
+
+   return inst;
+}
diff --git a/src/gallium/auxiliary/vl/vl_shader_build.h b/src/gallium/auxiliary/vl/vl_shader_build.h
new file mode 100644
index 0000000000..c6c60b5552
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_shader_build.h
@@ -0,0 +1,61 @@
+#ifndef vl_shader_build_h
+#define vl_shader_build_h
+
+#include <pipe/p_shader_tokens.h>
+
+struct tgsi_full_declaration vl_decl_input(unsigned int name, unsigned int index, unsigned int first, unsigned int last);
+struct tgsi_full_declaration vl_decl_interpolated_input
+(
+   unsigned int name,
+   unsigned int index,
+   unsigned int first,
+   unsigned int last,
+   int interpolation
+);
+struct tgsi_full_declaration vl_decl_constants(unsigned int name, unsigned int index, unsigned int first, unsigned int last);
+struct tgsi_full_declaration vl_decl_output(unsigned int name, unsigned int index, unsigned int first, unsigned int last);
+struct tgsi_full_declaration vl_decl_temps(unsigned int first, unsigned int last);
+struct tgsi_full_declaration vl_decl_samplers(unsigned int first, unsigned int last);
+struct tgsi_full_instruction vl_inst2
+(
+   int opcode,
+   enum tgsi_file_type dst_file,
+   unsigned int dst_index,
+   enum tgsi_file_type src_file,
+   unsigned int src_index
+);
+struct tgsi_full_instruction vl_inst3
+(
+   int opcode,
+   enum tgsi_file_type dst_file,
+   unsigned int dst_index,
+   enum tgsi_file_type src1_file,
+   unsigned int src1_index,
+   enum tgsi_file_type src2_file,
+   unsigned int src2_index
+);
+struct tgsi_full_instruction vl_tex
+(
+   int tex,
+   enum tgsi_file_type dst_file,
+   unsigned int dst_index,
+   enum tgsi_file_type src1_file,
+   unsigned int src1_index,
+   enum tgsi_file_type src2_file,
+   unsigned int src2_index
+);
+struct tgsi_full_instruction vl_inst4
+(
+   int opcode,
+   enum tgsi_file_type dst_file,
+   unsigned int dst_index,
+   enum tgsi_file_type src1_file,
+   unsigned int src1_index,
+   enum tgsi_file_type src2_file,
+   unsigned int src2_index,
+   enum tgsi_file_type src3_file,
+   unsigned int src3_index
+);
+struct tgsi_full_instruction vl_end(void);
+
+#endif
diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c
index bd48ce7005..d185c6b849 100644
--- a/src/gallium/drivers/cell/ppu/cell_screen.c
+++ b/src/gallium/drivers/cell/ppu/cell_screen.c
@@ -41,7 +41,7 @@
 static const char *
 cell_get_vendor(struct pipe_screen *screen)
 {
-   return "Tungsten Graphics, Inc.";
+   return "VMware, Inc.";
 }
 
 
@@ -64,8 +64,6 @@ cell_get_param(struct pipe_screen *screen, int param)
       return 1;
    case PIPE_CAP_GLSL:
       return 1;
-   case PIPE_CAP_S3TC:
-      return 0;
    case PIPE_CAP_ANISOTROPIC_FILTER:
       return 0;
    case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/i915simple/i915_context.c b/src/gallium/drivers/i915simple/i915_context.c
index b43f735245..e745f3342d 100644
--- a/src/gallium/drivers/i915simple/i915_context.c
+++ b/src/gallium/drivers/i915simple/i915_context.c
@@ -175,12 +175,19 @@ i915_is_buffer_referenced(struct pipe_context *pipe,
 static void i915_destroy(struct pipe_context *pipe)
 {
    struct i915_context *i915 = i915_context(pipe);
+   int i;
 
    draw_destroy(i915->draw);
    
    if(i915->batch)
       i915->iws->batchbuffer_destroy(i915->batch);
 
+   /* unbind framebuffer */
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+      pipe_surface_reference(&i915->framebuffer.cbufs[i], NULL);
+   }
+   pipe_surface_reference(&i915->framebuffer.zsbuf, NULL);
+
    FREE(i915);
 }
 
diff --git a/src/gallium/drivers/i915simple/i915_prim_vbuf.c b/src/gallium/drivers/i915simple/i915_prim_vbuf.c
index 508f4560e4..8a3e466c84 100644
--- a/src/gallium/drivers/i915simple/i915_prim_vbuf.c
+++ b/src/gallium/drivers/i915simple/i915_prim_vbuf.c
@@ -44,6 +44,7 @@
 #include "pipe/p_inlines.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "util/u_fifo.h"
 
 #include "i915_context.h"
 #include "i915_reg.h"
@@ -76,8 +77,13 @@ struct i915_vbuf_render {
    size_t vbo_size;
    size_t vbo_offset;
    void *vbo_ptr;
-   size_t vbo_alloc_size;
    size_t vbo_max_used;
+
+   /* stuff for the pool */
+   struct util_fifo *pool_fifo;
+   unsigned pool_used;
+   unsigned pool_buffer_size;
+   boolean pool_not_used;
 };
 
 
@@ -106,33 +112,72 @@ i915_vbuf_render_get_vertex_info(struct vbuf_render *render)
 }
 
 static boolean
+i915_vbuf_render_reserve(struct i915_vbuf_render *i915_render, size_t size)
+{
+   struct i915_context *i915 = i915_render->i915;
+
+   if (i915_render->vbo_size < size + i915_render->vbo_offset)
+      return FALSE;
+
+   if (i915->vbo_flushed)
+      return FALSE;
+
+   return TRUE;
+}
+
+static void
+i915_vbuf_render_new_buf(struct i915_vbuf_render *i915_render, size_t size)
+{
+   struct i915_context *i915 = i915_render->i915;
+   struct intel_winsys *iws = i915->iws;
+
+   if (i915_render->vbo) {
+      if (i915_render->pool_not_used)
+         iws->buffer_destroy(iws, i915_render->vbo);
+      else
+         u_fifo_add(i915_render->pool_fifo, i915_render->vbo);
+      i915_render->vbo = NULL;
+   }
+
+   i915->vbo_flushed = 0;
+
+   i915_render->vbo_size = MAX2(size, i915_render->pool_buffer_size);
+   i915_render->vbo_offset = 0;
+
+   if (i915_render->vbo_size != i915_render->pool_buffer_size) {
+      i915_render->pool_not_used = TRUE;
+      i915_render->vbo = iws->buffer_create(iws, i915_render->vbo_size, 64,
+            INTEL_NEW_VERTEX);
+   } else {
+      i915_render->pool_not_used = FALSE;
+
+      if (i915_render->pool_used >= 2) {
+         FLUSH_BATCH(NULL);
+         i915->vbo_flushed = 0;
+         i915_render->pool_used = 0;
+      }
+      u_fifo_pop(i915_render->pool_fifo, (void**)&i915_render->vbo);
+   }
+}
+
+static boolean
 i915_vbuf_render_allocate_vertices(struct vbuf_render *render,
                                    ushort vertex_size,
                                    ushort nr_vertices)
 {
    struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
    struct i915_context *i915 = i915_render->i915;
-   struct intel_winsys *iws = i915->iws;
    size_t size = (size_t)vertex_size * (size_t)nr_vertices;
 
    /* FIXME: handle failure */
    assert(!i915->vbo);
 
-   if (i915_render->vbo_size > size + i915_render->vbo_offset && !i915->vbo_flushed) {
-   } else {
-      i915->vbo_flushed = 0;
-      if (i915_render->vbo) {
-         iws->buffer_destroy(iws, i915_render->vbo);
-         i915_render->vbo = NULL;
-      }
-   }
+   if (!i915_vbuf_render_reserve(i915_render, size)) {
 
-   if (!i915_render->vbo) {
-      i915_render->vbo_size = MAX2(size, i915_render->vbo_alloc_size);
-      i915_render->vbo_offset = 0;
-      i915_render->vbo = iws->buffer_create(iws, i915_render->vbo_size, 64,
-                                            INTEL_NEW_VERTEX);
+      if (i915->vbo_flushed)
+         i915_render->pool_used = 0;
 
+      i915_vbuf_render_new_buf(i915_render, size);
    }
 
    i915_render->vertex_size = vertex_size;
@@ -153,7 +198,7 @@ i915_vbuf_render_map_vertices(struct vbuf_render *render)
    struct intel_winsys *iws = i915->iws;
 
    if (i915->vbo_flushed)
-      debug_printf("%s bad vbo flush occured stalling on hw\n");
+      debug_printf("%s bad vbo flush occured stalling on hw\n", __FUNCTION__);
 
    i915_render->vbo_ptr = iws->buffer_map(iws, i915_render->vbo, TRUE);
 
@@ -344,14 +389,43 @@ i915_vbuf_render_draw_arrays(struct vbuf_render *render,
                              uint nr)
 {
    struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
+   struct i915_context *i915 = i915_render->i915;
 
    if (i915_render->fallback) {
       draw_arrays_fallback(render, start, nr);
       return;
    }
 
-   /* JB: TODO submit direct cmds */
-   draw_arrays_fallback(render, start, nr);
+   if (i915->dirty)
+      i915_update_derived(i915);
+
+   if (i915->hardware_dirty)
+      i915_emit_hardware_state(i915);
+
+   if (!BEGIN_BATCH(2, 0)) {
+      FLUSH_BATCH(NULL);
+
+      /* Make sure state is re-emitted after a flush:
+       */
+      i915_update_derived(i915);
+      i915_emit_hardware_state(i915);
+      i915->vbo_flushed = 1;
+
+      if (!BEGIN_BATCH(2, 0)) {
+         assert(0);
+         goto out;
+      }
+   }
+
+   OUT_BATCH(_3DPRIMITIVE |
+             PRIM_INDIRECT |
+             PRIM_INDIRECT_SEQUENTIAL |
+             i915_render->hwprim |
+             nr);
+   OUT_BATCH(start); /* Beginning vertex index */
+
+out:
+   return;
 }
 
 /**
@@ -504,6 +578,7 @@ i915_vbuf_render_create(struct i915_context *i915)
 {
    struct i915_vbuf_render *i915_render = CALLOC_STRUCT(i915_vbuf_render);
    struct intel_winsys *iws = i915->iws;
+   int i;
 
    i915_render->i915 = i915;
    
@@ -524,14 +599,24 @@ i915_vbuf_render_create(struct i915_context *i915)
    i915_render->base.release_vertices = i915_vbuf_render_release_vertices;
    i915_render->base.destroy = i915_vbuf_render_destroy;
 
-   i915_render->vbo_alloc_size = 128 * 4096;
-   i915_render->vbo_size = i915_render->vbo_alloc_size;
+
+   i915_render->vbo = NULL;
+   i915_render->vbo_size = 0;
    i915_render->vbo_offset = 0;
-   i915_render->vbo = iws->buffer_create(iws, i915_render->vbo_size, 64,
-                                         INTEL_NEW_VERTEX);
+
+   i915_render->pool_used = FALSE;
+   i915_render->pool_buffer_size = 128 * 4096;
+   i915_render->pool_fifo = u_fifo_create(6);
+   for (i = 0; i < 6; i++)
+      u_fifo_add(i915_render->pool_fifo,
+                 iws->buffer_create(iws, i915_render->pool_buffer_size, 64,
+                                    INTEL_NEW_VERTEX));
+
+#if 0
    /* TODO JB: is this realy needed? */
    i915_render->vbo_ptr = iws->buffer_map(iws, i915_render->vbo, TRUE);
    iws->buffer_unmap(iws, i915_render->vbo);
+#endif
 
    return &i915_render->base;
 }
diff --git a/src/gallium/drivers/i915simple/i915_screen.c b/src/gallium/drivers/i915simple/i915_screen.c
index 9f017a14cc..c66558c320 100644
--- a/src/gallium/drivers/i915simple/i915_screen.c
+++ b/src/gallium/drivers/i915simple/i915_screen.c
@@ -46,7 +46,7 @@
 static const char *
 i915_get_vendor(struct pipe_screen *screen)
 {
-   return "Tungsten Graphics, Inc.";
+   return "VMware, Inc.";
 }
 
 static const char *
@@ -101,8 +101,6 @@ i915_get_param(struct pipe_screen *screen, int param)
       return 1;
    case PIPE_CAP_GLSL:
       return 0;
-   case PIPE_CAP_S3TC:
-      return 0;
    case PIPE_CAP_ANISOTROPIC_FILTER:
       return 0;
    case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/i915simple/i915_state.c b/src/gallium/drivers/i915simple/i915_state.c
index 0087dfa410..7d48e6e84d 100644
--- a/src/gallium/drivers/i915simple/i915_state.c
+++ b/src/gallium/drivers/i915simple/i915_state.c
@@ -588,9 +588,17 @@ static void i915_set_framebuffer_state(struct pipe_context *pipe,
 				       const struct pipe_framebuffer_state *fb)
 {
    struct i915_context *i915 = i915_context(pipe);
+   int i;
+
    draw_flush(i915->draw);
 
-   i915->framebuffer = *fb; /* struct copy */
+   i915->framebuffer.width = fb->width;
+   i915->framebuffer.height = fb->height;
+   i915->framebuffer.nr_cbufs = fb->nr_cbufs;
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+      pipe_surface_reference(&i915->framebuffer.cbufs[i], fb->cbufs[i]);
+   }
+   pipe_surface_reference(&i915->framebuffer.zsbuf, fb->zsbuf);
 
    i915->dirty |= I915_NEW_FRAMEBUFFER;
 }
diff --git a/src/gallium/drivers/i915simple/i915_texture.c b/src/gallium/drivers/i915simple/i915_texture.c
index 6a6c654271..15ccc1fc73 100644
--- a/src/gallium/drivers/i915simple/i915_texture.c
+++ b/src/gallium/drivers/i915simple/i915_texture.c
@@ -165,7 +165,7 @@ i915_scanout_layout(struct i915_texture *tex)
    struct pipe_texture *pt = &tex->base;
 
    if (pt->last_level > 0 || pt->block.size != 4)
-      return 0;
+      return FALSE;
 
    i915_miptree_set_level_info(tex, 0, 1,
                                tex->base.width[0],
@@ -191,6 +191,38 @@ i915_scanout_layout(struct i915_texture *tex)
    return TRUE;
 }
 
+/**
+ * Special case to deal with shared textures.
+ */
+static boolean
+i915_display_target_layout(struct i915_texture *tex)
+{
+   struct pipe_texture *pt = &tex->base;
+
+   if (pt->last_level > 0 || pt->block.size != 4)
+      return FALSE;
+
+   /* fallback to normal textures for small textures */
+   if (tex->base.width[0] < 240)
+      return FALSE;
+
+   i915_miptree_set_level_info(tex, 0, 1,
+                               tex->base.width[0],
+                               tex->base.height[0],
+                               1);
+   i915_miptree_set_image_offset(tex, 0, 0, 0, 0);
+
+   tex->stride = power_of_two(tex->base.nblocksx[0] * pt->block.size);
+   tex->total_nblocksy = round_up(tex->base.nblocksy[0], 8);
+   tex->hw_tiled = INTEL_TILE_X;
+
+   debug_printf("%s size: %d,%d,%d offset %d,%d (0x%x)\n", __FUNCTION__,
+      tex->base.width[0], tex->base.height[0], pt->block.size,
+      tex->stride, tex->total_nblocksy, tex->stride * tex->total_nblocksy);
+
+   return TRUE;
+}
+
 static void
 i915_miptree_layout_2d(struct i915_texture *tex)
 {
@@ -201,6 +233,16 @@ i915_miptree_layout_2d(struct i915_texture *tex)
    unsigned nblocksx = pt->nblocksx[0];
    unsigned nblocksy = pt->nblocksy[0];
 
+   /* used for scanouts that need special layouts */
+   if (tex->base.tex_usage & PIPE_TEXTURE_USAGE_PRIMARY)
+      if (i915_scanout_layout(tex))
+         return;
+
+   /* for shared buffers we use some very like scanout */
+   if (tex->base.tex_usage & PIPE_TEXTURE_USAGE_DISPLAY_TARGET)
+      if (i915_display_target_layout(tex))
+         return;
+
    tex->stride = round_up(pt->nblocksx[0] * pt->block.size, 4);
    tex->total_nblocksy = 0;
 
@@ -351,6 +393,11 @@ i945_miptree_layout_2d(struct i915_texture *tex)
       if (i915_scanout_layout(tex))
          return;
 
+   /* for shared buffers we use some very like scanout */
+   if (tex->base.tex_usage & PIPE_TEXTURE_USAGE_DISPLAY_TARGET)
+      if (i915_display_target_layout(tex))
+         return;
+
    tex->stride = round_up(pt->nblocksx[0] * pt->block.size, 4);
 
    /* May need to adjust pitch to accomodate the placement of
diff --git a/src/gallium/drivers/i915simple/intel_winsys.h b/src/gallium/drivers/i915simple/intel_winsys.h
index f949f52a9c..42c5e7470e 100644
--- a/src/gallium/drivers/i915simple/intel_winsys.h
+++ b/src/gallium/drivers/i915simple/intel_winsys.h
@@ -150,6 +150,17 @@ struct intel_winsys {
    void (*buffer_unmap)(struct intel_winsys *iws,
                         struct intel_buffer *buffer);
 
+   /**
+    * Write to a buffer.
+    *
+    * Arguments follows pwrite(2)
+    */
+   int (*buffer_write)(struct intel_winsys *iws,
+                       struct intel_buffer *dst,
+                       const void *src,
+                       size_t size,
+                       size_t offset);
+
    void (*buffer_destroy)(struct intel_winsys *iws,
                           struct intel_buffer *buffer);
    /*@}*/
diff --git a/src/gallium/drivers/i965simple/brw_screen.c b/src/gallium/drivers/i965simple/brw_screen.c
index b22e105f10..4a84c4db23 100644
--- a/src/gallium/drivers/i965simple/brw_screen.c
+++ b/src/gallium/drivers/i965simple/brw_screen.c
@@ -39,7 +39,7 @@
 static const char *
 brw_get_vendor( struct pipe_screen *screen )
 {
-   return "Tungsten Graphics, Inc.";
+   return "VMware, Inc.";
 }
 
 
@@ -85,8 +85,6 @@ brw_get_param(struct pipe_screen *screen, int param)
       return 1;
    case PIPE_CAP_GLSL:
       return 0;
-   case PIPE_CAP_S3TC:
-      return 0;
    case PIPE_CAP_ANISOTROPIC_FILTER:
       return 0;
    case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/llvmpipe/Makefile b/src/gallium/drivers/llvmpipe/Makefile
index 5ac09de79e..cd7b6356d2 100644
--- a/src/gallium/drivers/llvmpipe/Makefile
+++ b/src/gallium/drivers/llvmpipe/Makefile
@@ -17,9 +17,11 @@ C_SOURCES = \
 	lp_bld_depth.c \
 	lp_bld_flow.c \
 	lp_bld_format_aos.c \
+	lp_bld_format_soa.c \
 	lp_bld_interp.c \
 	lp_bld_intr.c \
 	lp_bld_logic.c \
+	lp_bld_sample_soa.c \
 	lp_bld_swizzle.c \
 	lp_bld_struct.c \
 	lp_bld_tgsi_soa.c \
@@ -46,7 +48,8 @@ C_SOURCES = \
 	lp_state_vs.c \
 	lp_surface.c \
 	lp_tex_cache.c \
-	lp_tex_sample.c \
+	lp_tex_sample_c.c \
+	lp_tex_sample_llvm.c \
 	lp_texture.c \
 	lp_tile_cache.c \
 	lp_tile_soa.c
diff --git a/src/gallium/drivers/llvmpipe/README b/src/gallium/drivers/llvmpipe/README
index 498d21dea6..89d08834a3 100644
--- a/src/gallium/drivers/llvmpipe/README
+++ b/src/gallium/drivers/llvmpipe/README
@@ -8,13 +8,16 @@ Done so far is:
 
  - the whole fragment pipeline is code generated in a single function
  
+   - input interpolation
+   
    - depth testing
  
+   - texture sampling (not all state/formats are supported) 
+   
    - fragment shader TGSI translation
      - same level of support as the TGSI SSE2 exec machine, with the exception
        we don't fallback to TGSI interpretation when an unsupported opcode is
        found, but just ignore it
-     - texture sampling via an intrinsic call
      - done in SoA layout
      - input interpolation also code generated
  
@@ -28,16 +31,17 @@ Done so far is:
      any width and length
    - not all operations are implemented for these types yet though
 
-Most mesa/progs/demos/* work. Speed is on par with Keith's softpipe-opt branch,
-which includes hand written fast implementations for common cases.
+Most mesa/progs/demos/* work. 
 
 To do (probably by this order):
 
  - code generate stipple and stencil testing
 
- - code generate texture sampling
+ - translate the remaining bits of texture sampling state
 
  - translate TGSI control flow instructions, and all other remaining opcodes
+ 
+ - integrate with the draw module for VS code generation
 
  - code generate the triangle setup and rasterization
 
@@ -93,7 +97,7 @@ Alternatively, you can build it with GNU make, if you prefer, by invoking it as
 
   make linux-llvm
 
-but the rest of these instructions assume scons is used.
+but the rest of these instructions assume that scons is used.
 
 
 Using
@@ -108,6 +112,9 @@ or
 
   export LD_LIBRARY_PATH=$PWD/build/linux-x86-debug/lib:$LD_LIBRARY_PATH
 
+For performance evaluation pass debug=no to scons, and use the corresponding
+lib directory without the "-debug" suffix.
+
 
 Unit testing
 ============
@@ -119,7 +126,7 @@ build/linux-???-debug/gallium/drivers/llvmpipe:
  - lp_test_conv: SIMD vector conversion
  - lp_test_format: pixel unpacking/packing
 
-Some of this tests can output results and benchmarks to a tab-seperated-file
+Some of this tests can output results and benchmarks to a tab-separated-file
 for posterior analysis, e.g.:
 
   build/linux-x86_64-debug/gallium/drivers/llvmpipe/lp_test_blend -o blend.tsv
@@ -133,10 +140,10 @@ Development Notes
   at the top of the lp_bld_*.c functions.  
 
 - All lp_bld_*.[ch] are isolated from the rest of the driver, and could/may be 
-  put in a standalone Gallium state -> LLVM IR translation module.
+  put in a stand-alone Gallium state -> LLVM IR translation module.
 
 - We use LLVM-C bindings for now. They are not documented, but follow the C++
   interfaces very closely, and appear to be complete enough for code
   generation. See 
   http://npcontemplation.blogspot.com/2008/06/secret-of-llvm-c-bindings.html
-  for a standalone example.
+  for a stand-alone example.
diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript
index 5c29bdac56..f4a9a3b22e 100644
--- a/src/gallium/drivers/llvmpipe/SConscript
+++ b/src/gallium/drivers/llvmpipe/SConscript
@@ -3,7 +3,7 @@ Import('*')
 env = env.Clone()
 
 env.Tool('llvm')
-if 'LLVM_VERSION' not in env:
+if not env.has_key('LLVM_VERSION'):
     print 'warning: LLVM not found: not building llvmpipe'
     Return()
 
@@ -23,8 +23,10 @@ llvmpipe = env.ConvenienceLibrary(
 		'lp_bld_depth.c',
 		'lp_bld_flow.c',
 		'lp_bld_format_aos.c',
+		'lp_bld_format_soa.c',
 		'lp_bld_interp.c',
 		'lp_bld_intr.c',
+		'lp_bld_sample_soa.c',
 		'lp_bld_struct.c',
 		'lp_bld_logic.c',
 		'lp_bld_swizzle.c',
@@ -52,7 +54,8 @@ llvmpipe = env.ConvenienceLibrary(
 		'lp_state_vs.c',
 		'lp_surface.c',
 		'lp_tex_cache.c',
-		'lp_tex_sample.c',
+		'lp_tex_sample_c.c',
+		'lp_tex_sample_llvm.c',
 		'lp_texture.c',
 		'lp_tile_cache.c',
 		'lp_tile_soa.c',
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_alpha.c b/src/gallium/drivers/llvmpipe/lp_bld_alpha.c
index 49c2f911af..2b4bc5c819 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_alpha.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_alpha.c
@@ -45,7 +45,7 @@
 void
 lp_build_alpha_test(LLVMBuilderRef builder,
                     const struct pipe_alpha_state *state,
-                    union lp_type type,
+                    struct lp_type type,
                     struct lp_build_mask_context *mask,
                     LLVMValueRef alpha,
                     LLVMValueRef ref)
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_alpha.h b/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
index 9dbcdb4daa..634575670d 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
@@ -38,14 +38,14 @@
 #include <llvm-c/Core.h>  
 
 struct pipe_alpha_state;
-union lp_type;
+struct lp_type;
 struct lp_build_mask_context;
 
 
 void
 lp_build_alpha_test(LLVMBuilderRef builder,
                     const struct pipe_alpha_state *state,
-                    union lp_type type,
+                    struct lp_type type,
                     struct lp_build_mask_context *mask,
                     LLVMValueRef alpha,
                     LLVMValueRef ref);
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_arit.c b/src/gallium/drivers/llvmpipe/lp_bld_arit.c
index 09a57ff33d..e8c5fa3c2a 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_arit.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_arit.c
@@ -48,6 +48,7 @@
 #include "util/u_memory.h"
 #include "util/u_debug.h"
 #include "util/u_string.h"
+#include "util/u_cpu_detect.h"
 
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
@@ -65,7 +66,7 @@ lp_build_min_simple(struct lp_build_context *bld,
                     LLVMValueRef a,
                     LLVMValueRef b)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    const char *intrinsic = NULL;
    LLVMValueRef cond;
 
@@ -113,36 +114,34 @@ lp_build_max_simple(struct lp_build_context *bld,
                     LLVMValueRef a,
                     LLVMValueRef b)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    const char *intrinsic = NULL;
    LLVMValueRef cond;
 
    /* TODO: optimize the constant case */
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    if(type.width * type.length == 128) {
       if(type.floating) {
-         if(type.width == 32)
+         if(type.width == 32 && util_cpu_caps.has_sse)
             intrinsic = "llvm.x86.sse.max.ps";
-         if(type.width == 64)
+         if(type.width == 64 && util_cpu_caps.has_sse2)
             intrinsic = "llvm.x86.sse2.max.pd";
       }
       else {
-         if(type.width == 8 && !type.sign)
+         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
             intrinsic = "llvm.x86.sse2.pmaxu.b";
-         if(type.width == 8 && type.sign)
+         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pmaxsb";
-         if(type.width == 16 && !type.sign)
+         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pmaxuw";
-         if(type.width == 16 && type.sign)
+         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
             intrinsic = "llvm.x86.sse2.pmaxs.w";
-         if(type.width == 32 && !type.sign)
+         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pmaxud";
-         if(type.width == 32 && type.sign)
+         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pmaxsd";
       }
    }
-#endif
 
    if(intrinsic)
       return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
@@ -159,7 +158,7 @@ LLVMValueRef
 lp_build_comp(struct lp_build_context *bld,
               LLVMValueRef a)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
 
    if(a == bld->one)
       return bld->zero;
@@ -188,7 +187,7 @@ lp_build_add(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMValueRef res;
 
    if(a == bld->zero)
@@ -204,15 +203,14 @@ lp_build_add(struct lp_build_context *bld,
       if(a == bld->one || b == bld->one)
         return bld->one;
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-      if(type.width * type.length == 128 &&
+      if(util_cpu_caps.has_sse2 &&
+         type.width * type.length == 128 &&
          !type.floating && !type.fixed) {
          if(type.width == 8)
             intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
          if(type.width == 16)
             intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
       }
-#endif
    
       if(intrinsic)
          return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
@@ -241,7 +239,7 @@ lp_build_sub(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMValueRef res;
 
    if(b == bld->zero)
@@ -257,15 +255,14 @@ lp_build_sub(struct lp_build_context *bld,
       if(b == bld->one)
         return bld->zero;
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-      if(type.width * type.length == 128 &&
+      if(util_cpu_caps.has_sse2 &&
+         type.width * type.length == 128 &&
          !type.floating && !type.fixed) {
          if(type.width == 8)
             intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
          if(type.width == 16)
             intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
       }
-#endif
    
       if(intrinsic)
          return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
@@ -405,7 +402,7 @@ lp_build_mul(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
 
    if(a == bld->zero)
       return bld->zero;
@@ -419,8 +416,7 @@ lp_build_mul(struct lp_build_context *bld,
       return bld->undef;
 
    if(!type.floating && !type.fixed && type.norm) {
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-      if(type.width == 8 && type.length == 16) {
+      if(util_cpu_caps.has_sse2 && type.width == 8 && type.length == 16) {
          LLVMTypeRef i16x8 = LLVMVectorType(LLVMInt16Type(), 8);
          LLVMTypeRef i8x16 = LLVMVectorType(LLVMInt8Type(), 16);
          static LLVMValueRef ml = NULL;
@@ -456,7 +452,6 @@ lp_build_mul(struct lp_build_context *bld,
          
          return ab;
       }
-#endif
 
       /* FIXME */
       assert(0);
@@ -477,7 +472,7 @@ lp_build_div(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
 
    if(a == bld->zero)
       return bld->zero;
@@ -493,15 +488,38 @@ lp_build_div(struct lp_build_context *bld,
    if(LLVMIsConstant(a) && LLVMIsConstant(b))
       return LLVMConstFDiv(a, b);
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(type.width == 32 && type.length == 4)
+   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
       return lp_build_mul(bld, a, lp_build_rcp(bld, b));
-#endif
 
    return LLVMBuildFDiv(bld->builder, a, b, "");
 }
 
 
+LLVMValueRef
+lp_build_lerp(struct lp_build_context *bld,
+              LLVMValueRef x,
+              LLVMValueRef v0,
+              LLVMValueRef v1)
+{
+   return lp_build_add(bld, v0, lp_build_mul(bld, x, lp_build_sub(bld, v1, v0)));
+}
+
+
+LLVMValueRef
+lp_build_lerp_2d(struct lp_build_context *bld,
+                 LLVMValueRef x,
+                 LLVMValueRef y,
+                 LLVMValueRef v00,
+                 LLVMValueRef v01,
+                 LLVMValueRef v10,
+                 LLVMValueRef v11)
+{
+   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
+   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
+   return lp_build_lerp(bld, y, v0, v1);
+}
+
+
 /**
  * Generate min(a, b)
  * Do checks for special cases.
@@ -565,33 +583,216 @@ LLVMValueRef
 lp_build_abs(struct lp_build_context *bld,
              LLVMValueRef a)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
 
    if(!type.sign)
       return a;
 
-   /* XXX: is this really necessary? */
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(!type.floating && type.width*type.length == 128) {
-      LLVMTypeRef vec_type = lp_build_vec_type(type);
-      if(type.width == 8)
+   if(type.floating) {
+      /* Mask out the sign bit */
+      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+      LLVMValueRef mask = lp_build_int_const_scalar(type, ((unsigned long long)1 << type.width) - 1);
+      a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      a = LLVMBuildAnd(bld->builder, a, mask, "");
+      a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
+      return a;
+   }
+
+   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
+      switch(type.width) {
+      case 8:
          return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
-      if(type.width == 16)
+      case 16:
          return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
-      if(type.width == 32)
+      case 32:
          return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
+      }
    }
-#endif
 
    return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
 }
 
 
 LLVMValueRef
+lp_build_sgn(struct lp_build_context *bld,
+             LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
+   LLVMValueRef cond;
+   LLVMValueRef res;
+
+   /* Handle non-zero case */
+   if(!type.sign) {
+      /* if not zero then sign must be positive */
+      res = bld->one;
+   }
+   else if(type.floating) {
+      /* Take the sign bit and add it to 1 constant */
+      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+      LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+      LLVMValueRef sign;
+      LLVMValueRef one;
+      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
+      one = LLVMConstBitCast(bld->one, int_vec_type);
+      res = LLVMBuildOr(bld->builder, sign, one, "");
+      res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
+   }
+   else
+   {
+      LLVMValueRef minus_one = lp_build_const_scalar(type, -1.0);
+      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
+      res = lp_build_select(bld, cond, bld->one, minus_one);
+   }
+
+   /* Handle zero */
+   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
+   res = lp_build_select(bld, cond, bld->zero, bld->one);
+
+   return res;
+}
+
+
+enum lp_build_round_sse41_mode
+{
+   LP_BUILD_ROUND_SSE41_NEAREST = 0,
+   LP_BUILD_ROUND_SSE41_FLOOR = 1,
+   LP_BUILD_ROUND_SSE41_CEIL = 2,
+   LP_BUILD_ROUND_SSE41_TRUNCATE = 3
+};
+
+
+static INLINE LLVMValueRef
+lp_build_round_sse41(struct lp_build_context *bld,
+                     LLVMValueRef a,
+                     enum lp_build_round_sse41_mode mode)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
+   const char *intrinsic;
+
+   assert(type.floating);
+   assert(type.width*type.length == 128);
+
+   switch(type.width) {
+   case 32:
+      intrinsic = "llvm.x86.sse41.round.ps";
+      break;
+   case 64:
+      intrinsic = "llvm.x86.sse41.round.pd";
+      break;
+   default:
+      assert(0);
+      return bld->undef;
+   }
+
+   return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
+                                    LLVMConstInt(LLVMInt32Type(), mode, 0));
+}
+
+
+LLVMValueRef
+lp_build_round(struct lp_build_context *bld,
+               LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+
+   assert(type.floating);
+
+   if(util_cpu_caps.has_sse4_1)
+      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
+
+   /* FIXME */
+   assert(0);
+   return bld->undef;
+}
+
+
+LLVMValueRef
+lp_build_floor(struct lp_build_context *bld,
+               LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+
+   assert(type.floating);
+
+   if(util_cpu_caps.has_sse4_1)
+      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
+
+   /* FIXME */
+   assert(0);
+   return bld->undef;
+}
+
+
+LLVMValueRef
+lp_build_ceil(struct lp_build_context *bld,
+              LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+
+   assert(type.floating);
+
+   if(util_cpu_caps.has_sse4_1)
+      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
+
+   /* FIXME */
+   assert(0);
+   return bld->undef;
+}
+
+
+LLVMValueRef
+lp_build_trunc(struct lp_build_context *bld,
+               LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+
+   assert(type.floating);
+
+   if(util_cpu_caps.has_sse4_1)
+      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
+
+   /* FIXME */
+   assert(0);
+   return bld->undef;
+}
+
+
+/**
+ * Convert to integer, through whichever rounding method that's fastest,
+ * typically truncating to zero.
+ */
+LLVMValueRef
+lp_build_int(struct lp_build_context *bld,
+             LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+
+   assert(type.floating);
+
+   return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
+}
+
+
+LLVMValueRef
+lp_build_ifloor(struct lp_build_context *bld,
+                LLVMValueRef a)
+{
+   a = lp_build_floor(bld, a);
+   a = lp_build_int(bld, a);
+   return a;
+}
+
+
+LLVMValueRef
 lp_build_sqrt(struct lp_build_context *bld,
               LLVMValueRef a)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    char intrinsic[32];
 
@@ -609,7 +810,7 @@ LLVMValueRef
 lp_build_rcp(struct lp_build_context *bld,
              LLVMValueRef a)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
 
    if(a == bld->zero)
       return bld->undef;
@@ -623,11 +824,9 @@ lp_build_rcp(struct lp_build_context *bld,
    if(LLVMIsConstant(a))
       return LLVMConstFDiv(bld->one, a);
 
-   /* XXX: is this really necessary? */
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(type.width == 32 && type.length == 4)
+   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
+      /* FIXME: improve precision */
       return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
-#endif
 
    return LLVMBuildFDiv(bld->builder, bld->one, a, "");
 }
@@ -640,15 +839,12 @@ LLVMValueRef
 lp_build_rsqrt(struct lp_build_context *bld,
                LLVMValueRef a)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
 
    assert(type.floating);
 
-   /* XXX: is this really necessary? */
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(type.width == 32 && type.length == 4)
+   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
       return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
-#endif
 
    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
 }
@@ -661,7 +857,7 @@ LLVMValueRef
 lp_build_cos(struct lp_build_context *bld,
               LLVMValueRef a)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    char intrinsic[32];
 
@@ -681,7 +877,7 @@ LLVMValueRef
 lp_build_sin(struct lp_build_context *bld,
               LLVMValueRef a)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    char intrinsic[32];
 
@@ -704,7 +900,8 @@ lp_build_pow(struct lp_build_context *bld,
 {
    /* TODO: optimize the constant case */
    if(LLVMIsConstant(x) && LLVMIsConstant(y))
-      debug_printf("%s: inefficient/imprecise constant arithmetic\n");
+      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
+                   __FUNCTION__);
 
    return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
 }
@@ -752,13 +949,14 @@ lp_build_polynomial(struct lp_build_context *bld,
                     const double *coeffs,
                     unsigned num_coeffs)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMValueRef res = NULL;
    unsigned i;
 
    /* TODO: optimize the constant case */
    if(LLVMIsConstant(x))
-      debug_printf("%s: inefficient/imprecise constant arithmetic\n");
+      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
+                   __FUNCTION__);
 
    for (i = num_coeffs; i--; ) {
       LLVMValueRef coeff = lp_build_const_scalar(type, coeffs[i]);
@@ -800,7 +998,7 @@ lp_build_exp2_approx(struct lp_build_context *bld,
                      LLVMValueRef *p_frac_part,
                      LLVMValueRef *p_exp2)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
    LLVMValueRef ipart = NULL;
@@ -812,7 +1010,8 @@ lp_build_exp2_approx(struct lp_build_context *bld,
    if(p_exp2_int_part || p_frac_part || p_exp2) {
       /* TODO: optimize the constant case */
       if(LLVMIsConstant(x))
-         debug_printf("%s: inefficient/imprecise constant arithmetic\n");
+         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
+                      __FUNCTION__);
 
       assert(type.floating && type.width == 32);
 
@@ -893,7 +1092,7 @@ lp_build_log2_approx(struct lp_build_context *bld,
                      LLVMValueRef *p_floor_log2,
                      LLVMValueRef *p_log2)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
 
@@ -911,7 +1110,8 @@ lp_build_log2_approx(struct lp_build_context *bld,
    if(p_exp || p_floor_log2 || p_log2) {
       /* TODO: optimize the constant case */
       if(LLVMIsConstant(x))
-         debug_printf("%s: inefficient/imprecise constant arithmetic\n");
+         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
+                      __FUNCTION__);
 
       assert(type.floating && type.width == 32);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_arit.h b/src/gallium/drivers/llvmpipe/lp_bld_arit.h
index fc8cb25966..d68a97c4b8 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_arit.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_arit.h
@@ -40,7 +40,7 @@
 #include <llvm-c/Core.h>  
 
 
-union lp_type type;
+struct lp_type type;
 struct lp_build_context;
 
 
@@ -72,6 +72,26 @@ lp_build_div(struct lp_build_context *bld,
              LLVMValueRef b);
 
 LLVMValueRef
+lp_build_lerp(struct lp_build_context *bld,
+              LLVMValueRef x,
+              LLVMValueRef v0,
+              LLVMValueRef v1);
+
+/**
+ * Bilinear interpolation.
+ *
+ * Values indices are in v_{yx}.
+ */
+LLVMValueRef
+lp_build_lerp_2d(struct lp_build_context *bld,
+                 LLVMValueRef x,
+                 LLVMValueRef y,
+                 LLVMValueRef v00,
+                 LLVMValueRef v01,
+                 LLVMValueRef v10,
+                 LLVMValueRef v11);
+
+LLVMValueRef
 lp_build_min(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b);
@@ -86,6 +106,34 @@ lp_build_abs(struct lp_build_context *bld,
              LLVMValueRef a);
 
 LLVMValueRef
+lp_build_sgn(struct lp_build_context *bld,
+             LLVMValueRef a);
+
+LLVMValueRef
+lp_build_round(struct lp_build_context *bld,
+               LLVMValueRef a);
+
+LLVMValueRef
+lp_build_floor(struct lp_build_context *bld,
+               LLVMValueRef a);
+
+LLVMValueRef
+lp_build_ceil(struct lp_build_context *bld,
+              LLVMValueRef a);
+
+LLVMValueRef
+lp_build_trunc(struct lp_build_context *bld,
+               LLVMValueRef a);
+
+LLVMValueRef
+lp_build_int(struct lp_build_context *bld,
+             LLVMValueRef a);
+
+LLVMValueRef
+lp_build_ifloor(struct lp_build_context *bld,
+                LLVMValueRef a);
+
+LLVMValueRef
 lp_build_sqrt(struct lp_build_context *bld,
               LLVMValueRef a);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend.h b/src/gallium/drivers/llvmpipe/lp_bld_blend.h
index d19e18846c..da272e549f 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend.h
@@ -46,7 +46,7 @@
 
 
 struct pipe_blend_state;
-union lp_type;
+struct lp_type;
 struct lp_build_context;
 
 
@@ -74,7 +74,7 @@ lp_build_blend_func(struct lp_build_context *bld,
 LLVMValueRef
 lp_build_blend_aos(LLVMBuilderRef builder,
                    const struct pipe_blend_state *blend,
-                   union lp_type type,
+                   struct lp_type type,
                    LLVMValueRef src,
                    LLVMValueRef dst,
                    LLVMValueRef const_,
@@ -84,7 +84,7 @@ lp_build_blend_aos(LLVMBuilderRef builder,
 void
 lp_build_blend_soa(LLVMBuilderRef builder,
                    const struct pipe_blend_state *blend,
-                   union lp_type type,
+                   struct lp_type type,
                    LLVMValueRef src[4],
                    LLVMValueRef dst[4],
                    LLVMValueRef const_[4],
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
index c11a9398f8..d14f468ba9 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
@@ -303,7 +303,7 @@ lp_build_blend_func(struct lp_build_context *bld,
 LLVMValueRef
 lp_build_blend_aos(LLVMBuilderRef builder,
                    const struct pipe_blend_state *blend,
-                   union lp_type type,
+                   struct lp_type type,
                    LLVMValueRef src,
                    LLVMValueRef dst,
                    LLVMValueRef const_,
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c
index b92254a7d6..9511299d55 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c
@@ -199,7 +199,7 @@ lp_build_blend_soa_factor(struct lp_build_blend_soa_context *bld,
 void
 lp_build_blend_soa(LLVMBuilderRef builder,
                    const struct pipe_blend_state *blend,
-                   union lp_type type,
+                   struct lp_type type,
                    LLVMValueRef src[4],
                    LLVMValueRef dst[4],
                    LLVMValueRef con[4],
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_const.c b/src/gallium/drivers/llvmpipe/lp_bld_const.c
index 21487365ea..c8eaa8c394 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_const.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_const.c
@@ -42,7 +42,7 @@
 
 
 unsigned
-lp_mantissa(union lp_type type)
+lp_mantissa(struct lp_type type)
 {
    assert(type.floating);
 
@@ -72,7 +72,7 @@ lp_mantissa(union lp_type type)
  * Same as lp_const_scale(), but in terms of shifts.
  */
 unsigned
-lp_const_shift(union lp_type type)
+lp_const_shift(struct lp_type type)
 {
    if(type.floating)
       return 0;
@@ -86,7 +86,7 @@ lp_const_shift(union lp_type type)
 
 
 unsigned
-lp_const_offset(union lp_type type)
+lp_const_offset(struct lp_type type)
 {
    if(type.floating || type.fixed)
       return 0;
@@ -104,7 +104,7 @@ lp_const_offset(union lp_type type)
  * else for the fixed points types and normalized integers.
  */
 double
-lp_const_scale(union lp_type type)
+lp_const_scale(struct lp_type type)
 {
    unsigned long long llscale;
    double dscale;
@@ -122,7 +122,7 @@ lp_const_scale(union lp_type type)
  * Minimum value representable by the type.
  */
 double
-lp_const_min(union lp_type type)
+lp_const_min(struct lp_type type)
 {
    unsigned bits;
 
@@ -158,7 +158,7 @@ lp_const_min(union lp_type type)
  * Maximum value representable by the type.
  */
 double
-lp_const_max(union lp_type type)
+lp_const_max(struct lp_type type)
 {
    unsigned bits;
 
@@ -190,7 +190,7 @@ lp_const_max(union lp_type type)
 
 
 double
-lp_const_eps(union lp_type type)
+lp_const_eps(struct lp_type type)
 {
    if (type.floating) {
       switch(type.width) {
@@ -211,7 +211,7 @@ lp_const_eps(union lp_type type)
 
 
 LLVMValueRef
-lp_build_undef(union lp_type type)
+lp_build_undef(struct lp_type type)
 {
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    return LLVMGetUndef(vec_type);
@@ -219,7 +219,7 @@ lp_build_undef(union lp_type type)
                
 
 LLVMValueRef
-lp_build_zero(union lp_type type)
+lp_build_zero(struct lp_type type)
 {
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    return LLVMConstNull(vec_type);
@@ -227,7 +227,7 @@ lp_build_zero(union lp_type type)
                
 
 LLVMValueRef
-lp_build_one(union lp_type type)
+lp_build_one(struct lp_type type)
 {
    LLVMTypeRef elem_type;
    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
@@ -269,7 +269,7 @@ lp_build_one(union lp_type type)
                
 
 LLVMValueRef
-lp_build_const_scalar(union lp_type type,
+lp_build_const_scalar(struct lp_type type,
                       double val)
 {
    LLVMTypeRef elem_type = lp_build_elem_type(type);
@@ -295,7 +295,7 @@ lp_build_const_scalar(union lp_type type,
 
 
 LLVMValueRef
-lp_build_int_const_scalar(union lp_type type,
+lp_build_int_const_scalar(struct lp_type type,
                           long long val)
 {
    LLVMTypeRef elem_type = lp_build_int_elem_type(type);
@@ -312,7 +312,7 @@ lp_build_int_const_scalar(union lp_type type,
 
 
 LLVMValueRef
-lp_build_const_aos(union lp_type type, 
+lp_build_const_aos(struct lp_type type, 
                    double r, double g, double b, double a, 
                    const unsigned char *swizzle)
 {
@@ -352,8 +352,8 @@ lp_build_const_aos(union lp_type type,
 
 
 LLVMValueRef
-lp_build_const_mask_aos(union lp_type type,
-                        boolean cond[4])
+lp_build_const_mask_aos(struct lp_type type,
+                        const boolean cond[4])
 {
    LLVMTypeRef elem_type = LLVMIntType(type.width);
    LLVMValueRef masks[LP_MAX_VECTOR_LENGTH];
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_const.h b/src/gallium/drivers/llvmpipe/lp_bld_const.h
index 1934530ea3..ffb302f736 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_const.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_const.h
@@ -42,67 +42,67 @@
 #include <pipe/p_compiler.h>
 
 
-union lp_type type;
+struct lp_type type;
 
 
 unsigned
-lp_mantissa(union lp_type type);
+lp_mantissa(struct lp_type type);
 
 
 unsigned
-lp_const_shift(union lp_type type);
+lp_const_shift(struct lp_type type);
 
 
 unsigned
-lp_const_offset(union lp_type type);
+lp_const_offset(struct lp_type type);
 
 
 double
-lp_const_scale(union lp_type type);
+lp_const_scale(struct lp_type type);
 
 double
-lp_const_min(union lp_type type);
+lp_const_min(struct lp_type type);
 
 
 double
-lp_const_max(union lp_type type);
+lp_const_max(struct lp_type type);
 
 
 double
-lp_const_eps(union lp_type type);
+lp_const_eps(struct lp_type type);
 
 
 LLVMValueRef
-lp_build_undef(union lp_type type);
+lp_build_undef(struct lp_type type);
 
 
 LLVMValueRef
-lp_build_zero(union lp_type type);
+lp_build_zero(struct lp_type type);
 
 
 LLVMValueRef
-lp_build_one(union lp_type type);
+lp_build_one(struct lp_type type);
 
 
 LLVMValueRef
-lp_build_const_scalar(union lp_type type,
+lp_build_const_scalar(struct lp_type type,
                       double val);
 
 
 LLVMValueRef
-lp_build_int_const_scalar(union lp_type type,
+lp_build_int_const_scalar(struct lp_type type,
                           long long val);
 
 
 LLVMValueRef
-lp_build_const_aos(union lp_type type, 
+lp_build_const_aos(struct lp_type type, 
                    double r, double g, double b, double a, 
                    const unsigned char *swizzle);
 
 
 LLVMValueRef
-lp_build_const_mask_aos(union lp_type type,
-                        boolean cond[4]);
+lp_build_const_mask_aos(struct lp_type type,
+                        const boolean cond[4]);
 
 
 #endif /* !LP_BLD_CONST_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_conv.c b/src/gallium/drivers/llvmpipe/lp_bld_conv.c
index c8954c8a34..20c8710214 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_conv.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_conv.c
@@ -63,6 +63,7 @@
 
 #include "util/u_debug.h"
 #include "util/u_math.h"
+#include "util/u_cpu_detect.h"
 
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
@@ -86,7 +87,7 @@
  */
 LLVMValueRef
 lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
-                                        union lp_type src_type,
+                                        struct lp_type src_type,
                                         unsigned dst_width,
                                         LLVMValueRef src)
 {
@@ -122,7 +123,7 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
       int shift = dst_width - n;
       res = LLVMBuildShl(builder, res, lp_build_int_const_scalar(src_type, shift), "");
 
-      /* Fill in the empty lower bits for added precision? */
+      /* TODO: Fill in the empty lower bits for additional precision? */
 #if 0
       {
          LLVMValueRef msb;
@@ -152,7 +153,7 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
 LLVMValueRef
 lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
                                 unsigned src_width,
-                                union lp_type dst_type,
+                                struct lp_type dst_type,
                                 LLVMValueRef src)
 {
    LLVMTypeRef vec_type = lp_build_vec_type(dst_type);
@@ -244,12 +245,12 @@ lp_build_const_pack_shuffle(unsigned n)
  * Expand the bit width.
  *
  * This will only change the number of bits the values are represented, not the
- * values themselved.
+ * values themselves.
  */
 static void
 lp_build_expand(LLVMBuilderRef builder,
-               union lp_type src_type,
-               union lp_type dst_type,
+               struct lp_type src_type,
+               struct lp_type dst_type,
                LLVMValueRef src,
                LLVMValueRef *dst, unsigned num_dsts)
 {
@@ -266,7 +267,7 @@ lp_build_expand(LLVMBuilderRef builder,
    dst[0] = src;
 
    while(src_type.width < dst_type.width) {
-      union lp_type new_type = src_type;
+      struct lp_type new_type = src_type;
       LLVMTypeRef new_vec_type;
 
       new_type.width *= 2;
@@ -314,8 +315,8 @@ lp_build_expand(LLVMBuilderRef builder,
  */
 static LLVMValueRef
 lp_build_pack2(LLVMBuilderRef builder,
-               union lp_type src_type,
-               union lp_type dst_type,
+               struct lp_type src_type,
+               struct lp_type dst_type,
                boolean clamped,
                LLVMValueRef lo,
                LLVMValueRef hi)
@@ -334,8 +335,7 @@ lp_build_pack2(LLVMBuilderRef builder,
    assert(!src_type.floating);
    assert(!dst_type.floating);
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(src_type.width * src_type.length == 128) {
+   if(util_cpu_caps.has_sse2 && src_type.width * src_type.length == 128) {
       /* All X86 non-interleaved pack instructions all take signed inputs and
        * saturate them, so saturate beforehand. */
       if(!src_type.sign && !clamped) {
@@ -349,7 +349,7 @@ lp_build_pack2(LLVMBuilderRef builder,
 
       switch(src_type.width) {
       case 32:
-         if(dst_type.sign)
+         if(dst_type.sign || !util_cpu_caps.has_sse4_1)
             res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", src_vec_type, lo, hi);
          else
             /* PACKUSDW is the only instrinsic with a consistent signature */
@@ -372,7 +372,6 @@ lp_build_pack2(LLVMBuilderRef builder,
       res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
       return res;
    }
-#endif
 
    lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
    hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");
@@ -391,11 +390,11 @@ lp_build_pack2(LLVMBuilderRef builder,
  * TODO: Handle saturation consistently.
  */
 static LLVMValueRef
-lp_build_trunc(LLVMBuilderRef builder,
-               union lp_type src_type,
-               union lp_type dst_type,
-               boolean clamped,
-               const LLVMValueRef *src, unsigned num_srcs)
+lp_build_pack(LLVMBuilderRef builder,
+              struct lp_type src_type,
+              struct lp_type dst_type,
+              boolean clamped,
+              const LLVMValueRef *src, unsigned num_srcs)
 {
    LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
    unsigned i;
@@ -410,7 +409,7 @@ lp_build_trunc(LLVMBuilderRef builder,
       tmp[i] = src[i];
 
    while(src_type.width > dst_type.width) {
-      union lp_type new_type = src_type;
+      struct lp_type new_type = src_type;
 
       new_type.width /= 2;
       new_type.length *= 2;
@@ -442,12 +441,12 @@ lp_build_trunc(LLVMBuilderRef builder,
  */
 void
 lp_build_conv(LLVMBuilderRef builder,
-              union lp_type src_type,
-              union lp_type dst_type,
+              struct lp_type src_type,
+              struct lp_type dst_type,
               const LLVMValueRef *src, unsigned num_srcs,
               LLVMValueRef *dst, unsigned num_dsts)
 {
-   union lp_type tmp_type;
+   struct lp_type tmp_type;
    LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
    unsigned num_tmps;
    unsigned i;
@@ -470,7 +469,7 @@ lp_build_conv(LLVMBuilderRef builder,
     * Clamp if necessary
     */
 
-   if(src_type.value != dst_type.value) {
+   if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
       struct lp_build_context bld;
       double src_min = lp_const_min(src_type);
       double dst_min = lp_const_min(dst_type);
@@ -565,7 +564,7 @@ lp_build_conv(LLVMBuilderRef builder,
 
    if(tmp_type.width > dst_type.width) {
       assert(num_dsts == 1);
-      tmp[0] = lp_build_trunc(builder, tmp_type, dst_type, TRUE, tmp, num_tmps);
+      tmp[0] = lp_build_pack(builder, tmp_type, dst_type, TRUE, tmp, num_tmps);
       tmp_type.width = dst_type.width;
       tmp_type.length = dst_type.length;
       num_tmps = 1;
@@ -656,8 +655,8 @@ lp_build_conv(LLVMBuilderRef builder,
  */
 void
 lp_build_conv_mask(LLVMBuilderRef builder,
-                   union lp_type src_type,
-                   union lp_type dst_type,
+                   struct lp_type src_type,
+                   struct lp_type dst_type,
                    const LLVMValueRef *src, unsigned num_srcs,
                    LLVMValueRef *dst, unsigned num_dsts)
 {
@@ -689,7 +688,7 @@ lp_build_conv_mask(LLVMBuilderRef builder,
 
    if(src_type.width > dst_type.width) {
       assert(num_dsts == 1);
-      dst[0] = lp_build_trunc(builder, src_type, dst_type, TRUE, src, num_srcs);
+      dst[0] = lp_build_pack(builder, src_type, dst_type, TRUE, src, num_srcs);
    }
    else if(src_type.width < dst_type.width) {
       assert(num_srcs == 1);
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_conv.h b/src/gallium/drivers/llvmpipe/lp_bld_conv.h
index 05c1ef2a10..ca378804d2 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_conv.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_conv.h
@@ -40,33 +40,33 @@
 #include <llvm-c/Core.h>  
 
 
-union lp_type type;
+struct lp_type type;
 
 
 LLVMValueRef
 lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
-                                        union lp_type src_type,
+                                        struct lp_type src_type,
                                         unsigned dst_width,
                                         LLVMValueRef src);
 
 LLVMValueRef
 lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
                                 unsigned src_width,
-                                union lp_type dst_type,
+                                struct lp_type dst_type,
                                 LLVMValueRef src);
 
 
 void
 lp_build_conv(LLVMBuilderRef builder,
-              union lp_type src_type,
-              union lp_type dst_type,
+              struct lp_type src_type,
+              struct lp_type dst_type,
               const LLVMValueRef *srcs, unsigned num_srcs,
               LLVMValueRef *dsts, unsigned num_dsts);
 
 void
 lp_build_conv_mask(LLVMBuilderRef builder,
-                   union lp_type src_type,
-                   union lp_type dst_type,
+                   struct lp_type src_type,
+                   struct lp_type dst_type,
                    const LLVMValueRef *src, unsigned num_srcs,
                    LLVMValueRef *dst, unsigned num_dsts);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_debug.c b/src/gallium/drivers/llvmpipe/lp_bld_debug.c
index 30925b5f41..59d8f492e6 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_debug.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_debug.c
@@ -30,10 +30,27 @@
 #include <udis86.h>
 #endif
 
+#include "util/u_math.h"
 #include "util/u_debug.h"
 #include "lp_bld_debug.h"
 
 
+/**
+ * Check alignment.
+ *
+ * It is important that this check is not implemented as a macro or inlined
+ * function, as the compiler assumptions in respect to alignment of global
+ * and stack variables would often make the check a no op, defeating the
+ * whole purpose of the exercise.
+ */
+boolean
+lp_check_alignment(const void *ptr, unsigned alignment)
+{
+   assert(util_is_pot(alignment));
+   return ((uintptr_t)ptr & (alignment - 1)) == 0;
+}
+
+
 void
 lp_disassemble(const void* func)
 {
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_debug.h b/src/gallium/drivers/llvmpipe/lp_bld_debug.h
index ecdafef76d..583e6132b4 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_debug.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_debug.h
@@ -53,6 +53,10 @@ lp_build_name(LLVMValueRef val, const char *format, ...)
 }
 
 
+boolean
+lp_check_alignment(const void *ptr, unsigned alignment);
+
+
 void
 lp_disassemble(const void* func);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index 2cd6e6b921..21c665c4d4 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -71,11 +71,11 @@
 /**
  * Return a type appropriate for depth/stencil testing.
  */
-union lp_type
+struct lp_type
 lp_depth_type(const struct util_format_description *format_desc,
               unsigned length)
 {
-   union lp_type type;
+   struct lp_type type;
    unsigned swizzle;
 
    assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
@@ -85,7 +85,7 @@ lp_depth_type(const struct util_format_description *format_desc,
    swizzle = format_desc->swizzle[0];
    assert(swizzle < 4);
 
-   type.value = 0;
+   memset(&type, 0, sizeof type);
    type.width = format_desc->block.bits;
 
    if(format_desc->channel[swizzle].type == UTIL_FORMAT_TYPE_FLOAT) {
@@ -114,7 +114,7 @@ lp_depth_type(const struct util_format_description *format_desc,
 void
 lp_build_depth_test(LLVMBuilderRef builder,
                     const struct pipe_depth_state *state,
-                    union lp_type type,
+                    struct lp_type type,
                     const struct util_format_description *format_desc,
                     struct lp_build_mask_context *mask,
                     LLVMValueRef src,
@@ -179,12 +179,13 @@ lp_build_depth_test(LLVMBuilderRef builder,
       padding_right = 0;
       for(chan = 0; chan < z_swizzle; ++chan)
          padding_right += format_desc->channel[chan].size;
-      padding_left = format_desc->block.bits - format_desc->channel[z_swizzle].size;
+      padding_left = format_desc->block.bits -
+                     (padding_right + format_desc->channel[z_swizzle].size);
 
       if(padding_left || padding_right) {
-         const long long mask_left = ((long long)1 << (format_desc->block.bits - padding_left)) - 1;
-         const long long mask_right = ((long long)1 << (padding_right)) - 1;
-         z_bitmask = lp_build_int_const_scalar(type, mask_left & mask_right);
+         const unsigned long long mask_left = ((unsigned long long)1 << (format_desc->block.bits - padding_left)) - 1;
+         const unsigned long long mask_right = ((unsigned long long)1 << (padding_right)) - 1;
+         z_bitmask = lp_build_int_const_scalar(type, mask_left ^ mask_right);
       }
 
       if(padding_left)
@@ -210,5 +211,6 @@ lp_build_depth_test(LLVMBuilderRef builder,
       LLVMBuildStore(builder, dst, dst_ptr);
    }
 
+   /* FIXME */
    assert(!state->occlusion_count);
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.h b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
index 5d2e042fcc..79d6981bb5 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
@@ -41,11 +41,11 @@
  
 struct pipe_depth_state;
 struct util_format_description;
-union lp_type;
+struct lp_type;
 struct lp_build_mask_context;
 
 
-union lp_type
+struct lp_type
 lp_depth_type(const struct util_format_description *format_desc,
               unsigned length);
 
@@ -53,7 +53,7 @@ lp_depth_type(const struct util_format_description *format_desc,
 void
 lp_build_depth_test(LLVMBuilderRef builder,
                     const struct pipe_depth_state *state,
-                    union lp_type type,
+                    struct lp_type type,
                     const struct util_format_description *format_desc,
                     struct lp_build_mask_context *mask,
                     LLVMValueRef src,
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_flow.c b/src/gallium/drivers/llvmpipe/lp_bld_flow.c
index 9d99e1a9d9..dcc25fbff8 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_flow.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_flow.c
@@ -32,59 +32,261 @@
  */
 
 #include "util/u_debug.h"
+#include "util/u_memory.h"
 
 #include "lp_bld_type.h"
 #include "lp_bld_flow.h"
 
 
+#define LP_BUILD_FLOW_MAX_VARIABLES 32
+#define LP_BUILD_FLOW_MAX_DEPTH 32
+
+
+/**
+ * Enumeration of all possible flow constructs.
+ */
+enum lp_build_flow_construct_kind {
+   lP_BUILD_FLOW_SCOPE,
+   LP_BUILD_FLOW_SKIP,
+};
+
+
+/**
+ * Variable declaration scope.
+ */
+struct lp_build_flow_scope
+{
+   /** Number of variables declared in this scope */
+   unsigned num_variables;
+};
+
+
+/**
+ * Early exit. Useful to skip to the end of a function or block when
+ * the execution mask becomes zero or when there is an error condition.
+ */
+struct lp_build_flow_skip
+{
+   /** Block to skip to */
+   LLVMBasicBlockRef block;
+
+   /** Number of variables declared at the beginning */
+   unsigned num_variables;
+
+   LLVMValueRef *phi;
+};
+
+
+/**
+ * Union of all possible flow constructs' data
+ */
+union lp_build_flow_construct_data
+{
+   struct lp_build_flow_scope scope;
+   struct lp_build_flow_skip skip;
+};
+
+
+/**
+ * Element of the flow construct stack.
+ */
+struct lp_build_flow_construct
+{
+   enum lp_build_flow_construct_kind kind;
+   union lp_build_flow_construct_data data;
+};
+
+
+/**
+ * All necessary data to generate LLVM control flow constructs.
+ *
+ * Besides keeping track of the control flow construct themselves we also
+ * need to keep track of variables in order to generate SSA Phi values.
+ */
+struct lp_build_flow_context
+{
+   LLVMBuilderRef builder;
+
+   /**
+    * Control flow stack.
+    */
+   struct lp_build_flow_construct constructs[LP_BUILD_FLOW_MAX_DEPTH];
+   unsigned num_constructs;
+
+   /**
+    * Variable stack
+    */
+   LLVMValueRef *variables[LP_BUILD_FLOW_MAX_VARIABLES];
+   unsigned num_variables;
+};
+
+
+struct lp_build_flow_context *
+lp_build_flow_create(LLVMBuilderRef builder)
+{
+   struct lp_build_flow_context *flow;
+
+   flow = CALLOC_STRUCT(lp_build_flow_context);
+   if(!flow)
+      return NULL;
+
+   flow->builder = builder;
+
+   return flow;
+}
+
+
 void
-lp_build_mask_begin(struct lp_build_mask_context *mask,
-                    LLVMBuilderRef builder,
-                    union lp_type type,
-                    LLVMValueRef value)
+lp_build_flow_destroy(struct lp_build_flow_context *flow)
 {
-   memset(mask, 0, sizeof *mask);
+   assert(flow->num_constructs == 0);
+   assert(flow->num_variables == 0);
+   FREE(flow);
+}
 
-   mask->builder = builder;
-   mask->reg_type = LLVMIntType(type.width * type.length);
-   mask->value = value;
+
+static union lp_build_flow_construct_data *
+lp_build_flow_push(struct lp_build_flow_context *flow,
+                   enum lp_build_flow_construct_kind kind)
+{
+   assert(flow->num_constructs < LP_BUILD_FLOW_MAX_DEPTH);
+   if(flow->num_constructs >= LP_BUILD_FLOW_MAX_DEPTH)
+      return NULL;
+
+   flow->constructs[flow->num_constructs].kind = kind;
+   return &flow->constructs[flow->num_constructs++].data;
+}
+
+
+static union lp_build_flow_construct_data *
+lp_build_flow_peek(struct lp_build_flow_context *flow,
+                   enum lp_build_flow_construct_kind kind)
+{
+   assert(flow->num_constructs);
+   if(!flow->num_constructs)
+      return NULL;
+
+   assert(flow->constructs[flow->num_constructs - 1].kind == kind);
+   if(flow->constructs[flow->num_constructs - 1].kind != kind)
+      return NULL;
+
+   return &flow->constructs[flow->num_constructs - 1].data;
 }
 
 
+static union lp_build_flow_construct_data *
+lp_build_flow_pop(struct lp_build_flow_context *flow,
+                  enum lp_build_flow_construct_kind kind)
+{
+   assert(flow->num_constructs);
+   if(!flow->num_constructs)
+      return NULL;
+
+   assert(flow->constructs[flow->num_constructs - 1].kind == kind);
+   if(flow->constructs[flow->num_constructs - 1].kind != kind)
+      return NULL;
+
+   return &flow->constructs[--flow->num_constructs].data;
+}
+
+
+/**
+ * Begin a variable scope.
+ *
+ *
+ */
 void
-lp_build_mask_update(struct lp_build_mask_context *mask,
-                     LLVMValueRef value)
+lp_build_flow_scope_begin(struct lp_build_flow_context *flow)
 {
+   struct lp_build_flow_scope *scope;
 
-   LLVMValueRef cond;
-   LLVMBasicBlockRef current_block;
-   LLVMBasicBlockRef next_block;
-   LLVMBasicBlockRef new_block;
+   scope = &lp_build_flow_push(flow, lP_BUILD_FLOW_SCOPE)->scope;
+   if(!scope)
+      return;
 
-   if(mask->value)
-      mask->value = LLVMBuildAnd(mask->builder, mask->value, value, "");
-   else
-      mask->value = value;
+   scope->num_variables = 0;
+}
 
-   /* FIXME: disabled until we have proper control flow helpers */
-#if 0
-   cond = LLVMBuildICmp(mask->builder,
-                        LLVMIntEQ,
-                        LLVMBuildBitCast(mask->builder, mask->value, mask->reg_type, ""),
-                        LLVMConstNull(mask->reg_type),
-                        "");
 
-   current_block = LLVMGetInsertBlock(mask->builder);
+/**
+ * Declare a variable.
+ *
+ * A variable is a named entity which can have different LLVMValueRef's at
+ * different points of the program. This is relevant for control flow because
+ * when there are mutiple branches to a same location we need to replace
+ * the variable's value with a Phi function as explained in
+ * http://en.wikipedia.org/wiki/Static_single_assignment_form .
+ *
+ * We keep track of variables by keeping around a pointer to where their
+ * current.
+ *
+ * There are a few cautions to observe:
+ *
+ * - Variable's value must not be NULL. If there is no initial value then
+ *   LLVMGetUndef() should be used.
+ *
+ * - Variable's value must be kept up-to-date. If the variable is going to be
+ *   modified by a function then a pointer should be passed so that its value
+ *   is accurate. Failure to do this will cause some of the variables'
+ *   transient values to be lost, leading to wrong results.
+ *
+ * - A program should be written from top to bottom, by always appending
+ *   instructions to the bottom with a single LLVMBuilderRef. Inserting and/or
+ *   modifying existing statements will most likely lead to wrong results.
+ *
+ */
+void
+lp_build_flow_scope_declare(struct lp_build_flow_context *flow,
+                            LLVMValueRef *variable)
+{
+   struct lp_build_flow_scope *scope;
+
+   scope = &lp_build_flow_peek(flow, lP_BUILD_FLOW_SCOPE)->scope;
+   if(!scope)
+      return;
 
-   if(!mask->skip_block) {
-      LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
-      mask->skip_block = LLVMAppendBasicBlock(function, "skip");
+   assert(*variable);
+   if(!*variable)
+      return;
+
+   assert(flow->num_variables < LP_BUILD_FLOW_MAX_VARIABLES);
+   if(flow->num_variables >= LP_BUILD_FLOW_MAX_VARIABLES)
+      return;
+
+   flow->variables[flow->num_variables++] = variable;
+   ++scope->num_variables;
+}
+
+
+void
+lp_build_flow_scope_end(struct lp_build_flow_context *flow)
+{
+   struct lp_build_flow_scope *scope;
+
+   scope = &lp_build_flow_pop(flow, lP_BUILD_FLOW_SCOPE)->scope;
+   if(!scope)
+      return;
 
-      mask->phi = LLVMBuildPhi(mask->builder, LLVMTypeOf(mask->value), "");
+   assert(flow->num_variables >= scope->num_variables);
+   if(flow->num_variables < scope->num_variables) {
+      flow->num_variables = 0;
+      return;
    }
 
+   flow->num_variables -= scope->num_variables;
+}
+
+
+static LLVMBasicBlockRef
+lp_build_flow_insert_block(struct lp_build_flow_context *flow)
+{
+   LLVMBasicBlockRef current_block;
+   LLVMBasicBlockRef next_block;
+   LLVMBasicBlockRef new_block;
+
+   current_block = LLVMGetInsertBlock(flow->builder);
+
    next_block = LLVMGetNextBasicBlock(current_block);
-   assert(next_block);
    if(next_block) {
       new_block = LLVMInsertBasicBlock(next_block, "");
    }
@@ -93,30 +295,148 @@ lp_build_mask_update(struct lp_build_mask_context *mask,
       new_block = LLVMAppendBasicBlock(function, "");
    }
 
-   LLVMAddIncoming(mask->phi, &mask->value, &current_block, 1);
-   LLVMBuildCondBr(mask->builder, cond, mask->skip_block, new_block);
+   return new_block;
+}
+
+void
+lp_build_flow_skip_begin(struct lp_build_flow_context *flow)
+{
+   struct lp_build_flow_skip *skip;
+   LLVMBuilderRef builder;
+   unsigned i;
+
+   skip = &lp_build_flow_push(flow, LP_BUILD_FLOW_SKIP)->skip;
+   if(!skip)
+      return;
+
+   skip->block = lp_build_flow_insert_block(flow);
+   skip->num_variables = flow->num_variables;
+   if(!skip->num_variables) {
+      skip->phi = NULL;
+      return;
+   }
 
-   LLVMPositionBuilderAtEnd(mask->builder, new_block);
-#endif
+   skip->phi = MALLOC(skip->num_variables * sizeof *skip->phi);
+   if(!skip->phi) {
+      skip->num_variables = 0;
+      return;
+   }
+
+   builder = LLVMCreateBuilder();
+   LLVMPositionBuilderAtEnd(builder, skip->block);
+
+   for(i = 0; i < skip->num_variables; ++i)
+      skip->phi[i] = LLVMBuildPhi(builder, LLVMTypeOf(*flow->variables[i]), "");
+
+   LLVMDisposeBuilder(builder);
 }
 
 
-LLVMValueRef
-lp_build_mask_end(struct lp_build_mask_context *mask)
+void
+lp_build_flow_skip_cond_break(struct lp_build_flow_context *flow,
+                              LLVMValueRef cond)
+{
+   struct lp_build_flow_skip *skip;
+   LLVMBasicBlockRef current_block;
+   LLVMBasicBlockRef new_block;
+   unsigned i;
+
+   skip = &lp_build_flow_peek(flow, LP_BUILD_FLOW_SKIP)->skip;
+   if(!skip)
+      return;
+
+   current_block = LLVMGetInsertBlock(flow->builder);
+
+   new_block = lp_build_flow_insert_block(flow);
+
+   for(i = 0; i < skip->num_variables; ++i) {
+      assert(*flow->variables[i]);
+      LLVMAddIncoming(skip->phi[i], flow->variables[i], &current_block, 1);
+   }
+
+   LLVMBuildCondBr(flow->builder, cond, skip->block, new_block);
+
+   LLVMPositionBuilderAtEnd(flow->builder, new_block);
+ }
+
+
+void
+lp_build_flow_skip_end(struct lp_build_flow_context *flow)
 {
-   if(mask->skip_block) {
-      LLVMBasicBlockRef current_block = LLVMGetInsertBlock(mask->builder);
+   struct lp_build_flow_skip *skip;
+   LLVMBasicBlockRef current_block;
+   unsigned i;
 
-      LLVMAddIncoming(mask->phi, &mask->value, &current_block, 1);
-      LLVMBuildBr(mask->builder, mask->skip_block);
+   skip = &lp_build_flow_pop(flow, LP_BUILD_FLOW_SKIP)->skip;
+   if(!skip)
+      return;
 
-      LLVMPositionBuilderAtEnd(mask->builder, mask->skip_block);
+   current_block = LLVMGetInsertBlock(flow->builder);
 
-      mask->value = mask->phi;
-      mask->phi = NULL;
-      mask->skip_block = NULL;
+   for(i = 0; i < skip->num_variables; ++i) {
+      assert(*flow->variables[i]);
+      LLVMAddIncoming(skip->phi[i], flow->variables[i], &current_block, 1);
+      *flow->variables[i] = skip->phi[i];
    }
 
+   LLVMBuildBr(flow->builder, skip->block);
+   LLVMPositionBuilderAtEnd(flow->builder, skip->block);
+
+   FREE(skip->phi);
+}
+
+
+static void
+lp_build_mask_check(struct lp_build_mask_context *mask)
+{
+   LLVMBuilderRef builder = mask->flow->builder;
+   LLVMValueRef cond;
+
+   cond = LLVMBuildICmp(builder,
+                        LLVMIntEQ,
+                        LLVMBuildBitCast(builder, mask->value, mask->reg_type, ""),
+                        LLVMConstNull(mask->reg_type),
+                        "");
+
+   lp_build_flow_skip_cond_break(mask->flow, cond);
+}
+
+
+void
+lp_build_mask_begin(struct lp_build_mask_context *mask,
+                    struct lp_build_flow_context *flow,
+                    struct lp_type type,
+                    LLVMValueRef value)
+{
+   memset(mask, 0, sizeof *mask);
+
+   mask->flow = flow;
+   mask->reg_type = LLVMIntType(type.width * type.length);
+   mask->value = value;
+
+   lp_build_flow_scope_begin(flow);
+   lp_build_flow_scope_declare(flow, &mask->value);
+   lp_build_flow_skip_begin(flow);
+
+   lp_build_mask_check(mask);
+}
+
+
+void
+lp_build_mask_update(struct lp_build_mask_context *mask,
+                     LLVMValueRef value)
+{
+   mask->value = LLVMBuildAnd( mask->flow->builder, mask->value, value, "");
+
+   lp_build_mask_check(mask);
+}
+
+
+LLVMValueRef
+lp_build_mask_end(struct lp_build_mask_context *mask)
+{
+   lp_build_flow_skip_end(mask->flow);
+   lp_build_flow_scope_end(mask->flow);
    return mask->value;
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_flow.h b/src/gallium/drivers/llvmpipe/lp_bld_flow.h
index 1b634ff038..e61999ff06 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_flow.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_flow.h
@@ -38,27 +38,53 @@
 #include <llvm-c/Core.h>  
 
 
-union lp_type;
+struct lp_type;
+
+
+struct lp_build_flow_context;
+
+
+struct lp_build_flow_context *
+lp_build_flow_create(LLVMBuilderRef builder);
+
+void
+lp_build_flow_destroy(struct lp_build_flow_context *flow);
+
+void
+lp_build_flow_scope_begin(struct lp_build_flow_context *flow);
+
+void
+lp_build_flow_scope_declare(struct lp_build_flow_context *flow,
+                            LLVMValueRef *variable);
+
+void
+lp_build_flow_scope_end(struct lp_build_flow_context *flow);
+
+void
+lp_build_flow_skip_begin(struct lp_build_flow_context *flow);
+
+void
+lp_build_flow_skip_cond_break(struct lp_build_flow_context *flow,
+                              LLVMValueRef cond);
+
+void
+lp_build_flow_skip_end(struct lp_build_flow_context *flow);
 
 
 struct lp_build_mask_context
 {
-   LLVMBuilderRef builder;
+   struct lp_build_flow_context *flow;
 
    LLVMTypeRef reg_type;
 
    LLVMValueRef value;
-
-   LLVMValueRef phi;
-
-   LLVMBasicBlockRef skip_block;
 };
 
 
 void
 lp_build_mask_begin(struct lp_build_mask_context *mask,
-                    LLVMBuilderRef builder,
-                    union lp_type type,
+                    struct lp_build_flow_context *flow,
+                    struct lp_type type,
                     LLVMValueRef value);
 
 /**
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_format.h b/src/gallium/drivers/llvmpipe/lp_bld_format.h
index 01c8a752d1..6d3f692619 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_format.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_format.h
@@ -31,21 +31,15 @@
 
 /**
  * @file
- * LLVM IR building helpers interfaces.
- *
- * We use LLVM-C bindings for now. They are not documented, but follow the C++
- * interfaces very closely, and appear to be complete enough for code
- * genration. See
- * http://npcontemplation.blogspot.com/2008/06/secret-of-llvm-c-bindings.html
- * for a standalone example.
+ * Pixel format helpers.
  */
 
 #include <llvm-c/Core.h>  
- 
-#include "pipe/p_format.h"
 
+#include "pipe/p_format.h"
 
-union lp_type;
+struct util_format_description;
+struct lp_type;
 
 
 /**
@@ -56,9 +50,9 @@ union lp_type;
  * @return RGBA in a 4 floats vector.
  */
 LLVMValueRef
-lp_build_unpack_rgba(LLVMBuilderRef builder,
-                     enum pipe_format format, 
-                     LLVMValueRef packed);
+lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
+                         enum pipe_format format,
+                         LLVMValueRef packed);
 
 
 /**
@@ -67,9 +61,9 @@ lp_build_unpack_rgba(LLVMBuilderRef builder,
  * @param rgba 4 float vector with the unpacked components.
  */
 LLVMValueRef
-lp_build_pack_rgba(LLVMBuilderRef builder,
-                   enum pipe_format format,
-                   LLVMValueRef rgba);
+lp_build_pack_rgba_aos(LLVMBuilderRef builder,
+                       enum pipe_format format,
+                       LLVMValueRef rgba);
 
 
 /**
@@ -81,9 +75,9 @@ lp_build_pack_rgba(LLVMBuilderRef builder,
  * @return RGBA in a 4 floats vector.
  */
 LLVMValueRef
-lp_build_load_rgba(LLVMBuilderRef builder,
-                   enum pipe_format format, 
-                   LLVMValueRef ptr);
+lp_build_load_rgba_aos(LLVMBuilderRef builder,
+                       enum pipe_format format,
+                       LLVMValueRef ptr);
 
 
 /**
@@ -92,10 +86,34 @@ lp_build_load_rgba(LLVMBuilderRef builder,
  * @param rgba 4 float vector with the unpacked components.
  */
 void 
-lp_build_store_rgba(LLVMBuilderRef builder,
-                    enum pipe_format format,
-                    LLVMValueRef ptr,
-                    LLVMValueRef rgba);
+lp_build_store_rgba_aos(LLVMBuilderRef builder,
+                        enum pipe_format format,
+                        LLVMValueRef ptr,
+                        LLVMValueRef rgba);
 
+LLVMValueRef
+lp_build_gather(LLVMBuilderRef builder,
+                unsigned length,
+                unsigned src_width,
+                unsigned dst_width,
+                LLVMValueRef base_ptr,
+                LLVMValueRef offsets);
+
+
+void
+lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
+                         const struct util_format_description *format_desc,
+                         struct lp_type type,
+                         LLVMValueRef packed,
+                         LLVMValueRef *rgba);
+
+
+void
+lp_build_load_rgba_soa(LLVMBuilderRef builder,
+                       const struct util_format_description *format_desc,
+                       struct lp_type type,
+                       LLVMValueRef base_ptr,
+                       LLVMValueRef offsets,
+                       LLVMValueRef *rgba);
 
 #endif /* !LP_BLD_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_format_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_format_aos.c
index dcbc0076c7..b9b5d84bed 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_format_aos.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_format_aos.c
@@ -32,9 +32,9 @@
 
 
 LLVMValueRef
-lp_build_unpack_rgba(LLVMBuilderRef builder,
-                     enum pipe_format format,
-                     LLVMValueRef packed)
+lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
+                         enum pipe_format format,
+                         LLVMValueRef packed)
 {
    const struct util_format_description *desc;
    LLVMTypeRef type;
@@ -152,9 +152,9 @@ lp_build_unpack_rgba(LLVMBuilderRef builder,
 
 
 LLVMValueRef
-lp_build_pack_rgba(LLVMBuilderRef builder,
-                   enum pipe_format format,
-                   LLVMValueRef rgba)
+lp_build_pack_rgba_aos(LLVMBuilderRef builder,
+                       enum pipe_format format,
+                       LLVMValueRef rgba)
 {
    const struct util_format_description *desc;
    LLVMTypeRef type;
@@ -250,9 +250,9 @@ lp_build_pack_rgba(LLVMBuilderRef builder,
 
 
 LLVMValueRef
-lp_build_load_rgba(LLVMBuilderRef builder,
-                   enum pipe_format format,
-                   LLVMValueRef ptr)
+lp_build_load_rgba_aos(LLVMBuilderRef builder,
+                       enum pipe_format format,
+                       LLVMValueRef ptr)
 {
    const struct util_format_description *desc;
    LLVMTypeRef type;
@@ -272,15 +272,15 @@ lp_build_load_rgba(LLVMBuilderRef builder,
 
    packed = LLVMBuildLoad(builder, ptr, "");
 
-   return lp_build_unpack_rgba(builder, format, packed);
+   return lp_build_unpack_rgba_aos(builder, format, packed);
 }
 
 
 void
-lp_build_store_rgba(LLVMBuilderRef builder,
-                    enum pipe_format format,
-                    LLVMValueRef ptr,
-                    LLVMValueRef rgba)
+lp_build_store_rgba_aos(LLVMBuilderRef builder,
+                        enum pipe_format format,
+                        LLVMValueRef ptr,
+                        LLVMValueRef rgba)
 {
    const struct util_format_description *desc;
    LLVMTypeRef type;
@@ -294,7 +294,7 @@ lp_build_store_rgba(LLVMBuilderRef builder,
 
    type = LLVMIntType(desc->block.bits);
 
-   packed = lp_build_pack_rgba(builder, format, rgba);
+   packed = lp_build_pack_rgba_aos(builder, format, rgba);
 
    ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, 0), "");
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_format_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_format_soa.c
new file mode 100644
index 0000000000..b5ff434e1a
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_format_soa.c
@@ -0,0 +1,208 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "util/u_format.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_conv.h"
+#include "lp_bld_format.h"
+
+
+/**
+ * Gather elements from scatter positions in memory into a single vector.
+ *
+ * @param src_width src element width
+ * @param dst_width result element width (source will be expanded to fit)
+ * @param length length of the offsets,
+ * @param base_ptr base pointer, should be a i8 pointer type.
+ * @param offsets vector with offsets
+ */
+LLVMValueRef
+lp_build_gather(LLVMBuilderRef builder,
+                unsigned length,
+                unsigned src_width,
+                unsigned dst_width,
+                LLVMValueRef base_ptr,
+                LLVMValueRef offsets)
+{
+   LLVMTypeRef src_type = LLVMIntType(src_width);
+   LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
+   LLVMTypeRef dst_elem_type = LLVMIntType(dst_width);
+   LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length);
+   LLVMValueRef res;
+   unsigned i;
+
+   res = LLVMGetUndef(dst_vec_type);
+   for(i = 0; i < length; ++i) {
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      LLVMValueRef elem_offset;
+      LLVMValueRef elem_ptr;
+      LLVMValueRef elem;
+
+      elem_offset = LLVMBuildExtractElement(builder, offsets, index, "");
+      elem_ptr = LLVMBuildGEP(builder, base_ptr, &elem_offset, 1, "");
+      elem_ptr = LLVMBuildBitCast(builder, elem_ptr, src_ptr_type, "");
+      elem = LLVMBuildLoad(builder, elem_ptr, "");
+
+      assert(src_width <= dst_width);
+      if(src_width > dst_width)
+         elem = LLVMBuildTrunc(builder, elem, dst_elem_type, "");
+      if(src_width < dst_width)
+         elem = LLVMBuildZExt(builder, elem, dst_elem_type, "");
+
+      res = LLVMBuildInsertElement(builder, res, elem, index, "");
+   }
+
+   return res;
+}
+
+
+static LLVMValueRef
+lp_build_format_swizzle(struct lp_type type,
+                        const LLVMValueRef *inputs,
+                        enum util_format_swizzle swizzle)
+{
+   switch (swizzle) {
+   case UTIL_FORMAT_SWIZZLE_X:
+   case UTIL_FORMAT_SWIZZLE_Y:
+   case UTIL_FORMAT_SWIZZLE_Z:
+   case UTIL_FORMAT_SWIZZLE_W:
+      return inputs[swizzle];
+   case UTIL_FORMAT_SWIZZLE_0:
+      return lp_build_zero(type);
+   case UTIL_FORMAT_SWIZZLE_1:
+      return lp_build_one(type);
+   case UTIL_FORMAT_SWIZZLE_NONE:
+      return lp_build_undef(type);
+   default:
+      assert(0);
+      return lp_build_undef(type);
+   }
+}
+
+
+void
+lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
+                         const struct util_format_description *format_desc,
+                         struct lp_type type,
+                         LLVMValueRef packed,
+                         LLVMValueRef *rgba)
+{
+   LLVMValueRef inputs[4];
+   unsigned start;
+   unsigned chan;
+
+   /* FIXME: Support more formats */
+   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_ARITH);
+   assert(format_desc->block.width == 1);
+   assert(format_desc->block.height == 1);
+   assert(format_desc->block.bits <= 32);
+
+   /* Decode the input vector components */
+   start = 0;
+   for (chan = 0; chan < 4; ++chan) {
+      unsigned width = format_desc->channel[chan].size;
+      unsigned stop = start + width;
+      LLVMValueRef input;
+
+      input = packed;
+
+      switch(format_desc->channel[chan].type) {
+      case UTIL_FORMAT_TYPE_VOID:
+         input = NULL;
+         break;
+
+      case UTIL_FORMAT_TYPE_UNSIGNED:
+         if(type.floating) {
+            if(start)
+               input = LLVMBuildLShr(builder, input, lp_build_int_const_scalar(type, start), "");
+            if(stop < format_desc->block.bits) {
+               unsigned mask = ((unsigned long long)1 << width) - 1;
+               input = LLVMBuildAnd(builder, input, lp_build_int_const_scalar(type, mask), "");
+            }
+
+            if(format_desc->channel[chan].normalized)
+               input = lp_build_unsigned_norm_to_float(builder, width, type, input);
+            else
+               input = LLVMBuildFPToSI(builder, input, lp_build_vec_type(type), "");
+         }
+         else {
+            /* FIXME */
+            assert(0);
+            input = lp_build_undef(type);
+         }
+         break;
+
+      default:
+         /* fall through */
+         input = lp_build_undef(type);
+         break;
+      }
+
+      inputs[chan] = input;
+
+      start = stop;
+   }
+
+   if(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
+      enum util_format_swizzle swizzle = format_desc->swizzle[0];
+      LLVMValueRef depth = lp_build_format_swizzle(type, inputs, swizzle);
+      rgba[2] = rgba[1] = rgba[0] = depth;
+      rgba[3] = lp_build_one(type);
+   }
+   else {
+      for (chan = 0; chan < 4; ++chan) {
+         enum util_format_swizzle swizzle = format_desc->swizzle[chan];
+         rgba[chan] = lp_build_format_swizzle(type, inputs, swizzle);
+      }
+   }
+}
+
+
+void
+lp_build_load_rgba_soa(LLVMBuilderRef builder,
+                       const struct util_format_description *format_desc,
+                       struct lp_type type,
+                       LLVMValueRef base_ptr,
+                       LLVMValueRef offsets,
+                       LLVMValueRef *rgba)
+{
+   LLVMValueRef packed;
+
+   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_ARITH);
+   assert(format_desc->block.width == 1);
+   assert(format_desc->block.height == 1);
+   assert(format_desc->block.bits <= 32);
+
+   packed = lp_build_gather(builder,
+                            type.length, format_desc->block.bits, type.width,
+                            base_ptr, offsets);
+
+   lp_build_unpack_rgba_soa(builder, format_desc, type, packed, rgba);
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
index cfe20a0d75..338dbca6d1 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
@@ -292,7 +292,7 @@ void
 lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
                          const struct tgsi_token *tokens,
                          LLVMBuilderRef builder,
-                         union lp_type type,
+                         struct lp_type type,
                          LLVMValueRef a0_ptr,
                          LLVMValueRef dadx_ptr,
                          LLVMValueRef dady_ptr,
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.h b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
index 9194f6233a..9c57a10879 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
@@ -83,7 +83,7 @@ void
 lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
                          const struct tgsi_token *tokens,
                          LLVMBuilderRef builder,
-                         union lp_type type,
+                         struct lp_type type,
                          LLVMValueRef a0_ptr,
                          LLVMValueRef dadx_ptr,
                          LLVMValueRef dady_ptr,
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_logic.c b/src/gallium/drivers/llvmpipe/lp_bld_logic.c
index 8631efd6c3..db22a8028a 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_logic.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_logic.c
@@ -33,6 +33,8 @@
  */
 
 
+#include "util/u_cpu_detect.h"
+
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
 #include "lp_bld_intr.h"
@@ -45,7 +47,7 @@ lp_build_cmp(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
    LLVMValueRef zeros = LLVMConstNull(int_vec_type);
@@ -65,7 +67,7 @@ lp_build_cmp(struct lp_build_context *bld,
 
 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    if(type.width * type.length == 128) {
-      if(type.floating) {
+      if(type.floating && util_cpu_caps.has_sse) {
          LLVMValueRef args[3];
          unsigned cc;
          boolean swap;
@@ -114,7 +116,7 @@ lp_build_cmp(struct lp_build_context *bld,
          res = LLVMBuildBitCast(bld->builder, res, int_vec_type, "");
          return res;
       }
-      else {
+      else if(util_cpu_caps.has_sse2) {
          static const struct {
             unsigned swap:1;
             unsigned eq:1;
@@ -301,7 +303,7 @@ lp_build_select(struct lp_build_context *bld,
                 LLVMValueRef a,
                 LLVMValueRef b)
 {
-   union lp_type type = bld->type;
+   struct lp_type type = bld->type;
    LLVMValueRef res;
 
    if(a == b)
@@ -313,8 +315,6 @@ lp_build_select(struct lp_build_context *bld,
       b = LLVMBuildBitCast(bld->builder, b, int_vec_type, "");
    }
 
-   /* TODO: On SSE4 we could do this with a single instruction -- PBLENDVB */
-
    a = LLVMBuildAnd(bld->builder, a, mask, "");
 
    /* This often gets translated to PANDN, but sometimes the NOT is
@@ -339,9 +339,9 @@ LLVMValueRef
 lp_build_select_aos(struct lp_build_context *bld,
                     LLVMValueRef a,
                     LLVMValueRef b,
-                    boolean cond[4])
+                    const boolean cond[4])
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    const unsigned n = type.length;
    unsigned i, j;
 
@@ -376,9 +376,9 @@ lp_build_select_aos(struct lp_build_context *bld,
 
       return LLVMBuildShuffleVector(bld->builder, a, b, LLVMConstVector(shuffles, n), "");
    }
+   else {
 #if 0
-   else if(0) {
-      /* FIXME: Unfortunately select of vectors do not work */
+      /* XXX: Unfortunately select of vectors do not work */
       /* Use a select */
       LLVMTypeRef elem_type = LLVMInt1Type();
       LLVMValueRef cond[LP_MAX_VECTOR_LENGTH];
@@ -388,10 +388,9 @@ lp_build_select_aos(struct lp_build_context *bld,
             cond[j + i] = LLVMConstInt(elem_type, cond[i] ? 1 : 0, 0);
 
       return LLVMBuildSelect(bld->builder, LLVMConstVector(cond, n), a, b, "");
-   }
-#endif
-   else {
+#else
       LLVMValueRef mask = lp_build_const_mask_aos(type, cond);
       return lp_build_select(bld, mask, a, b);
+#endif
    }
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_logic.h b/src/gallium/drivers/llvmpipe/lp_bld_logic.h
index 29b9e1c45b..a4ee7723b5 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_logic.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_logic.h
@@ -42,7 +42,7 @@
 #include "pipe/p_defines.h" /* For PIPE_FUNC_xxx */
 
 
-union lp_type type;
+struct lp_type type;
 struct lp_build_context;
 
 
@@ -66,7 +66,7 @@ LLVMValueRef
 lp_build_select_aos(struct lp_build_context *bld,
                     LLVMValueRef a,
                     LLVMValueRef b,
-                    boolean cond[4]);
+                    const boolean cond[4]);
 
 
 #endif /* !LP_BLD_LOGIC_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_sample.h b/src/gallium/drivers/llvmpipe/lp_bld_sample.h
new file mode 100644
index 0000000000..403d0e4836
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_sample.h
@@ -0,0 +1,135 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Texture sampling.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#ifndef LP_BLD_SAMPLE_H
+#define LP_BLD_SAMPLE_H
+
+
+#include <llvm-c/Core.h>
+
+struct pipe_texture;
+struct pipe_sampler_state;
+struct lp_type;
+
+
+/**
+ * Sampler static state.
+ *
+ * These are the bits of state from pipe_texture and pipe_sampler_state that
+ * are embedded in the generated code.
+ */
+struct lp_sampler_static_state
+{
+   /* pipe_texture's state */
+   enum pipe_format format;
+   unsigned target:2;
+   unsigned pot_width:1;
+   unsigned pot_height:1;
+   unsigned pot_depth:1;
+
+   /* pipe_sampler_state's state */
+   unsigned wrap_s:3;
+   unsigned wrap_t:3;
+   unsigned wrap_r:3;
+   unsigned min_img_filter:2;
+   unsigned min_mip_filter:2;
+   unsigned mag_img_filter:2;
+   unsigned compare_mode:1;
+   unsigned compare_func:3;
+   unsigned normalized_coords:1;
+   unsigned prefilter:4;
+};
+
+
+/**
+ * Sampler dynamic state.
+ *
+ * These are the bits of state from pipe_texture and pipe_sampler_state that
+ * are computed in runtime.
+ *
+ * There are obtained through callbacks, as we don't want to tie the texture
+ * sampling code generation logic to any particular texture layout or pipe
+ * driver.
+ */
+struct lp_sampler_dynamic_state
+{
+
+   /** Obtain the base texture width. */
+   LLVMValueRef
+   (*width)( struct lp_sampler_dynamic_state *state,
+             LLVMBuilderRef builder,
+             unsigned unit);
+
+   /** Obtain the base texture height. */
+   LLVMValueRef
+   (*height)( struct lp_sampler_dynamic_state *state,
+              LLVMBuilderRef builder,
+              unsigned unit);
+
+   LLVMValueRef
+   (*stride)( struct lp_sampler_dynamic_state *state,
+              LLVMBuilderRef builder,
+              unsigned unit);
+
+   LLVMValueRef
+   (*data_ptr)( struct lp_sampler_dynamic_state *state,
+                LLVMBuilderRef builder,
+                unsigned unit);
+
+};
+
+
+/**
+ * Derive the sampler static state.
+ */
+void
+lp_sampler_static_state(struct lp_sampler_static_state *state,
+                        const struct pipe_texture *texture,
+                        const struct pipe_sampler_state *sampler);
+
+
+void
+lp_build_sample_soa(LLVMBuilderRef builder,
+                    const struct lp_sampler_static_state *static_state,
+                    struct lp_sampler_dynamic_state *dynamic_state,
+                    struct lp_type fp_type,
+                    unsigned unit,
+                    unsigned num_coords,
+                    const LLVMValueRef *coords,
+                    LLVMValueRef lodbias,
+                    LLVMValueRef *texel);
+
+
+
+#endif /* LP_BLD_SAMPLE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c
new file mode 100644
index 0000000000..8ca1be6f1b
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c
@@ -0,0 +1,416 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Texture sampling.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/u_debug.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "util/u_format.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_swizzle.h"
+#include "lp_bld_format.h"
+#include "lp_bld_sample.h"
+
+
+void
+lp_sampler_static_state(struct lp_sampler_static_state *state,
+                        const struct pipe_texture *texture,
+                        const struct pipe_sampler_state *sampler)
+{
+   memset(state, 0, sizeof *state);
+
+   if(!texture)
+      return;
+
+   if(!sampler)
+      return;
+
+   state->format            = texture->format;
+   state->target            = texture->target;
+   state->pot_width         = util_is_pot(texture->width[0]);
+   state->pot_height        = util_is_pot(texture->height[0]);
+   state->pot_depth         = util_is_pot(texture->depth[0]);
+
+   state->wrap_s            = sampler->wrap_s;
+   state->wrap_t            = sampler->wrap_t;
+   state->wrap_r            = sampler->wrap_r;
+   state->min_img_filter    = sampler->min_img_filter;
+   state->min_mip_filter    = sampler->min_mip_filter;
+   state->mag_img_filter    = sampler->mag_img_filter;
+   if(sampler->compare_mode) {
+      state->compare_mode      = sampler->compare_mode;
+      state->compare_func      = sampler->compare_func;
+   }
+   state->normalized_coords = sampler->normalized_coords;
+   state->prefilter         = sampler->prefilter;
+}
+
+
+
+/**
+ * Keep all information for sampling code generation in a single place.
+ */
+struct lp_build_sample_context
+{
+   LLVMBuilderRef builder;
+
+   const struct lp_sampler_static_state *static_state;
+
+   struct lp_sampler_dynamic_state *dynamic_state;
+
+   const struct util_format_description *format_desc;
+
+   /** Incoming coordinates type and build context */
+   struct lp_type coord_type;
+   struct lp_build_context coord_bld;
+
+   /** Integer coordinates */
+   struct lp_type int_coord_type;
+   struct lp_build_context int_coord_bld;
+
+   /** Output texels type and build context */
+   struct lp_type texel_type;
+   struct lp_build_context texel_bld;
+};
+
+
+static void
+lp_build_sample_texel(struct lp_build_sample_context *bld,
+                      LLVMValueRef x,
+                      LLVMValueRef y,
+                      LLVMValueRef y_stride,
+                      LLVMValueRef data_ptr,
+                      LLVMValueRef *texel)
+{
+   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
+   LLVMValueRef x_stride;
+   LLVMValueRef offset;
+
+   x_stride = lp_build_const_scalar(bld->int_coord_type, bld->format_desc->block.bits/8);
+
+   if(bld->format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
+      LLVMValueRef x_lo, x_hi;
+      LLVMValueRef y_lo, y_hi;
+      LLVMValueRef x_stride_lo, x_stride_hi;
+      LLVMValueRef y_stride_lo, y_stride_hi;
+      LLVMValueRef x_offset_lo, x_offset_hi;
+      LLVMValueRef y_offset_lo, y_offset_hi;
+      LLVMValueRef offset_lo, offset_hi;
+
+      x_lo = LLVMBuildAnd(bld->builder, x, int_coord_bld->one, "");
+      y_lo = LLVMBuildAnd(bld->builder, y, int_coord_bld->one, "");
+
+      x_hi = LLVMBuildLShr(bld->builder, x, int_coord_bld->one, "");
+      y_hi = LLVMBuildLShr(bld->builder, y, int_coord_bld->one, "");
+
+      x_stride_lo = x_stride;
+      y_stride_lo = lp_build_const_scalar(bld->int_coord_type, 2*bld->format_desc->block.bits/8);
+
+      x_stride_hi = lp_build_const_scalar(bld->int_coord_type, 4*bld->format_desc->block.bits/8);
+      y_stride_hi = LLVMBuildShl(bld->builder, y_stride, int_coord_bld->one, "");
+
+      x_offset_lo = lp_build_mul(int_coord_bld, x_lo, x_stride_lo);
+      y_offset_lo = lp_build_mul(int_coord_bld, y_lo, y_stride_lo);
+      offset_lo = lp_build_add(int_coord_bld, x_offset_lo, y_offset_lo);
+
+      x_offset_hi = lp_build_mul(int_coord_bld, x_hi, x_stride_hi);
+      y_offset_hi = lp_build_mul(int_coord_bld, y_hi, y_stride_hi);
+      offset_hi = lp_build_add(int_coord_bld, x_offset_hi, y_offset_hi);
+
+      offset = lp_build_add(int_coord_bld, offset_hi, offset_lo);
+   }
+   else {
+      LLVMValueRef x_offset;
+      LLVMValueRef y_offset;
+
+      x_offset = lp_build_mul(int_coord_bld, x, x_stride);
+      y_offset = lp_build_mul(int_coord_bld, y, y_stride);
+
+      offset = lp_build_add(int_coord_bld, x_offset, y_offset);
+   }
+
+   lp_build_load_rgba_soa(bld->builder,
+                          bld->format_desc,
+                          bld->texel_type,
+                          data_ptr,
+                          offset,
+                          texel);
+}
+
+
+static LLVMValueRef
+lp_build_sample_wrap(struct lp_build_sample_context *bld,
+                     LLVMValueRef coord,
+                     LLVMValueRef length,
+                     boolean is_pot,
+                     unsigned wrap_mode)
+{
+   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
+   LLVMValueRef length_minus_one;
+
+   length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
+
+   switch(wrap_mode) {
+   case PIPE_TEX_WRAP_REPEAT:
+      if(is_pot)
+         coord = LLVMBuildAnd(bld->builder, coord, length_minus_one, "");
+      else
+         /* Signed remainder won't give the right results for negative
+          * dividends but unsigned remainder does.*/
+         coord = LLVMBuildURem(bld->builder, coord, length, "");
+      break;
+
+   case PIPE_TEX_WRAP_CLAMP:
+      coord = lp_build_max(int_coord_bld, coord, int_coord_bld->zero);
+      coord = lp_build_min(int_coord_bld, coord, length_minus_one);
+      break;
+
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+      /* FIXME */
+      _debug_printf("warning: failed to translate texture wrap mode %u\n", wrap_mode);
+      coord = lp_build_max(int_coord_bld, coord, int_coord_bld->zero);
+      coord = lp_build_min(int_coord_bld, coord, length_minus_one);
+      break;
+
+   default:
+      assert(0);
+   }
+
+   return coord;
+}
+
+
+static void
+lp_build_sample_2d_nearest_soa(struct lp_build_sample_context *bld,
+                               LLVMValueRef s,
+                               LLVMValueRef t,
+                               LLVMValueRef width,
+                               LLVMValueRef height,
+                               LLVMValueRef stride,
+                               LLVMValueRef data_ptr,
+                               LLVMValueRef *texel)
+{
+   LLVMValueRef x;
+   LLVMValueRef y;
+
+   x = lp_build_ifloor(&bld->coord_bld, s);
+   y = lp_build_ifloor(&bld->coord_bld, t);
+
+   x = lp_build_sample_wrap(bld, x, width,  bld->static_state->pot_width,  bld->static_state->wrap_s);
+   y = lp_build_sample_wrap(bld, y, height, bld->static_state->pot_height, bld->static_state->wrap_t);
+
+   lp_build_sample_texel(bld, x, y, stride, data_ptr, texel);
+}
+
+
+static void
+lp_build_sample_2d_linear_soa(struct lp_build_sample_context *bld,
+                              LLVMValueRef s,
+                              LLVMValueRef t,
+                              LLVMValueRef width,
+                              LLVMValueRef height,
+                              LLVMValueRef stride,
+                              LLVMValueRef data_ptr,
+                              LLVMValueRef *texel)
+{
+   LLVMValueRef half;
+   LLVMValueRef s_ipart;
+   LLVMValueRef t_ipart;
+   LLVMValueRef s_fpart;
+   LLVMValueRef t_fpart;
+   LLVMValueRef x0, x1;
+   LLVMValueRef y0, y1;
+   LLVMValueRef neighbors[2][2][4];
+   unsigned chan;
+
+   half = lp_build_const_scalar(bld->coord_type, 0.5);
+   s = lp_build_sub(&bld->coord_bld, s, half);
+   t = lp_build_sub(&bld->coord_bld, t, half);
+
+   s_ipart = lp_build_floor(&bld->coord_bld, s);
+   t_ipart = lp_build_floor(&bld->coord_bld, t);
+
+   s_fpart = lp_build_sub(&bld->coord_bld, s, s_ipart);
+   t_fpart = lp_build_sub(&bld->coord_bld, t, t_ipart);
+
+   x0 = lp_build_int(&bld->coord_bld, s_ipart);
+   y0 = lp_build_int(&bld->coord_bld, t_ipart);
+
+   x0 = lp_build_sample_wrap(bld, x0, width,  bld->static_state->pot_width,  bld->static_state->wrap_s);
+   y0 = lp_build_sample_wrap(bld, y0, height, bld->static_state->pot_height, bld->static_state->wrap_t);
+
+   x1 = lp_build_add(&bld->int_coord_bld, x0, bld->int_coord_bld.one);
+   y1 = lp_build_add(&bld->int_coord_bld, y0, bld->int_coord_bld.one);
+
+   x1 = lp_build_sample_wrap(bld, x1, width,  bld->static_state->pot_width,  bld->static_state->wrap_s);
+   y1 = lp_build_sample_wrap(bld, y1, height, bld->static_state->pot_height, bld->static_state->wrap_t);
+
+   lp_build_sample_texel(bld, x0, y0, stride, data_ptr, neighbors[0][0]);
+   lp_build_sample_texel(bld, x1, y0, stride, data_ptr, neighbors[0][1]);
+   lp_build_sample_texel(bld, x0, y1, stride, data_ptr, neighbors[1][0]);
+   lp_build_sample_texel(bld, x1, y1, stride, data_ptr, neighbors[1][1]);
+
+   /* TODO: Don't interpolate missing channels */
+   for(chan = 0; chan < 4; ++chan) {
+      texel[chan] = lp_build_lerp_2d(&bld->texel_bld,
+                                     s_fpart, t_fpart,
+                                     neighbors[0][0][chan],
+                                     neighbors[0][1][chan],
+                                     neighbors[1][0][chan],
+                                     neighbors[1][1][chan]);
+   }
+}
+
+
+static void
+lp_build_sample_compare(struct lp_build_sample_context *bld,
+                        LLVMValueRef p,
+                        LLVMValueRef *texel)
+{
+   struct lp_build_context *texel_bld = &bld->texel_bld;
+   LLVMValueRef res;
+   unsigned chan;
+
+   if(!bld->static_state->compare_mode)
+      return;
+
+   /* TODO: Compare before swizzling, to avoid redundant computations */
+   res = NULL;
+   for(chan = 0; chan < 4; ++chan) {
+      LLVMValueRef cmp;
+      cmp = lp_build_cmp(texel_bld, bld->static_state->compare_func, p, texel[chan]);
+      cmp = lp_build_select(texel_bld, cmp, texel_bld->one, texel_bld->zero);
+
+      if(res)
+         res = lp_build_add(texel_bld, res, cmp);
+      else
+         res = cmp;
+   }
+
+   assert(res);
+   res = lp_build_mul(texel_bld, res, lp_build_const_scalar(texel_bld->type, 0.25));
+
+   /* XXX returning result for default GL_DEPTH_TEXTURE_MODE = GL_LUMINANCE */
+   for(chan = 0; chan < 3; ++chan)
+      texel[chan] = res;
+   texel[3] = texel_bld->one;
+}
+
+
+void
+lp_build_sample_soa(LLVMBuilderRef builder,
+                    const struct lp_sampler_static_state *static_state,
+                    struct lp_sampler_dynamic_state *dynamic_state,
+                    struct lp_type type,
+                    unsigned unit,
+                    unsigned num_coords,
+                    const LLVMValueRef *coords,
+                    LLVMValueRef lodbias,
+                    LLVMValueRef *texel)
+{
+   struct lp_build_sample_context bld;
+   LLVMValueRef width;
+   LLVMValueRef height;
+   LLVMValueRef stride;
+   LLVMValueRef data_ptr;
+   LLVMValueRef s;
+   LLVMValueRef t;
+   LLVMValueRef p;
+
+   /* Setup our build context */
+   memset(&bld, 0, sizeof bld);
+   bld.builder = builder;
+   bld.static_state = static_state;
+   bld.dynamic_state = dynamic_state;
+   bld.format_desc = util_format_description(static_state->format);
+   bld.coord_type = type;
+   bld.int_coord_type = lp_int_type(type);
+   bld.texel_type = type;
+   lp_build_context_init(&bld.coord_bld, builder, bld.coord_type);
+   lp_build_context_init(&bld.int_coord_bld, builder, bld.int_coord_type);
+   lp_build_context_init(&bld.texel_bld, builder, bld.texel_type);
+
+   /* Get the dynamic state */
+   width = dynamic_state->width(dynamic_state, builder, unit);
+   height = dynamic_state->height(dynamic_state, builder, unit);
+   stride = dynamic_state->stride(dynamic_state, builder, unit);
+   data_ptr = dynamic_state->data_ptr(dynamic_state, builder, unit);
+
+   s = coords[0];
+   t = coords[1];
+   p = coords[2];
+
+   width = lp_build_broadcast_scalar(&bld.int_coord_bld, width);
+   height = lp_build_broadcast_scalar(&bld.int_coord_bld, height);
+   stride = lp_build_broadcast_scalar(&bld.int_coord_bld, stride);
+
+   if(static_state->target == PIPE_TEXTURE_1D)
+      t = bld.coord_bld.zero;
+
+   if(static_state->normalized_coords) {
+      LLVMTypeRef coord_vec_type = lp_build_vec_type(bld.coord_type);
+      LLVMValueRef fp_width = LLVMBuildSIToFP(builder, width, coord_vec_type, "");
+      LLVMValueRef fp_height = LLVMBuildSIToFP(builder, height, coord_vec_type, "");
+      s = lp_build_mul(&bld.coord_bld, s, fp_width);
+      t = lp_build_mul(&bld.coord_bld, t, fp_height);
+   }
+
+   switch (static_state->min_img_filter) {
+   case PIPE_TEX_FILTER_NEAREST:
+      lp_build_sample_2d_nearest_soa(&bld, s, t, width, height, stride, data_ptr, texel);
+      break;
+   case PIPE_TEX_FILTER_LINEAR:
+   case PIPE_TEX_FILTER_ANISO:
+      lp_build_sample_2d_linear_soa(&bld, s, t, width, height, stride, data_ptr, texel);
+      break;
+   default:
+      assert(0);
+   }
+
+   /* FIXME: respect static_state->min_mip_filter */;
+   /* FIXME: respect static_state->mag_img_filter */;
+   /* FIXME: respect static_state->prefilter */;
+
+   lp_build_sample_compare(&bld, p, texel);
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_struct.c b/src/gallium/drivers/llvmpipe/lp_bld_struct.c
index 14d2b10df9..3998ac374f 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_struct.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_struct.c
@@ -42,17 +42,30 @@
 
 
 LLVMValueRef
+lp_build_struct_get_ptr(LLVMBuilderRef builder,
+                        LLVMValueRef ptr,
+                        unsigned member,
+                        const char *name)
+{
+   LLVMValueRef indices[2];
+   LLVMValueRef member_ptr;
+   indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   indices[1] = LLVMConstInt(LLVMInt32Type(), member, 0);
+   member_ptr = LLVMBuildGEP(builder, ptr, indices, Elements(indices), "");
+   lp_build_name(member_ptr, "%s.%s_ptr", LLVMGetValueName(ptr), name);
+   return member_ptr;
+}
+
+
+LLVMValueRef
 lp_build_struct_get(LLVMBuilderRef builder,
                     LLVMValueRef ptr,
                     unsigned member,
                     const char *name)
 {
-   LLVMValueRef indices[2];
    LLVMValueRef member_ptr;
    LLVMValueRef res;
-   indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
-   indices[1] = LLVMConstInt(LLVMInt32Type(), member, 0);
-   member_ptr = LLVMBuildGEP(builder, ptr, indices, Elements(indices), "");
+   member_ptr = lp_build_struct_get_ptr(builder, ptr, member, name);
    res = LLVMBuildLoad(builder, member_ptr, "");
    lp_build_name(res, "%s.%s", LLVMGetValueName(ptr), name);
    return res;
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_struct.h b/src/gallium/drivers/llvmpipe/lp_bld_struct.h
index cbefdc9f81..740392f561 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_struct.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_struct.h
@@ -53,6 +53,18 @@
              offsetof(_ctype, _cmember))
 
 
+/**
+ * Get value pointer to a structure member.
+ */
+LLVMValueRef
+lp_build_struct_get_ptr(LLVMBuilderRef builder,
+                        LLVMValueRef ptr,
+                        unsigned member,
+                        const char *name);
+
+/**
+ * Get the value of a structure member.
+ */
 LLVMValueRef
 lp_build_struct_get(LLVMBuilderRef builder,
                     LLVMValueRef ptr,
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_swizzle.c b/src/gallium/drivers/llvmpipe/lp_bld_swizzle.c
index ac7eed9379..64e81f7b1f 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_swizzle.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_swizzle.c
@@ -64,7 +64,7 @@ LLVMValueRef
 lp_build_broadcast_scalar(struct lp_build_context *bld,
                           LLVMValueRef scalar)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMValueRef res;
    unsigned i;
 
@@ -83,7 +83,7 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
                        LLVMValueRef a,
                        unsigned channel)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    const unsigned n = type.length;
    unsigned i, j;
 
@@ -115,7 +115,7 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
        *   YY00 YY00 .... YY00
        *   YYYY YYYY .... YYYY  <= output
        */
-      union lp_type type4 = type;
+      struct lp_type type4 = type;
       const char shifts[4][2] = {
          { 1,  2},
          {-1,  2},
@@ -161,7 +161,7 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
 LLVMValueRef
 lp_build_swizzle1_aos(struct lp_build_context *bld,
                       LLVMValueRef a,
-                      unsigned char swizzle[4])
+                      const unsigned char swizzle[4])
 {
    const unsigned n = bld->type.length;
    unsigned i, j;
@@ -192,7 +192,7 @@ LLVMValueRef
 lp_build_swizzle2_aos(struct lp_build_context *bld,
                       LLVMValueRef a,
                       LLVMValueRef b,
-                      unsigned char swizzle[4])
+                      const unsigned char swizzle[4])
 {
    const unsigned n = bld->type.length;
    unsigned i, j;
@@ -201,11 +201,12 @@ lp_build_swizzle2_aos(struct lp_build_context *bld,
       return lp_build_swizzle1_aos(bld, a, swizzle);
 
    if(a == b) {
-      swizzle[0] %= 4;
-      swizzle[1] %= 4;
-      swizzle[2] %= 4;
-      swizzle[3] %= 4;
-      return lp_build_swizzle1_aos(bld, a, swizzle);
+      unsigned char swizzle1[4];
+      swizzle1[0] = swizzle[0] % 4;
+      swizzle1[1] = swizzle[1] % 4;
+      swizzle1[2] = swizzle[2] % 4;
+      swizzle1[3] = swizzle[3] % 4;
+      return lp_build_swizzle1_aos(bld, a, swizzle1);
    }
 
    if(swizzle[0] % 4 == 0 &&
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_swizzle.h b/src/gallium/drivers/llvmpipe/lp_bld_swizzle.h
index d7dd6a8a60..1f6da80448 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_swizzle.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_swizzle.h
@@ -40,7 +40,7 @@
 #include <llvm-c/Core.h>  
 
 
-union lp_type type;
+struct lp_type type;
 struct lp_build_context;
 
 
@@ -73,7 +73,7 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
 LLVMValueRef
 lp_build_swizzle1_aos(struct lp_build_context *bld,
                       LLVMValueRef a,
-                      unsigned char swizzle[4]);
+                      const unsigned char swizzle[4]);
 
 
 /**
@@ -85,7 +85,7 @@ LLVMValueRef
 lp_build_swizzle2_aos(struct lp_build_context *bld,
                       LLVMValueRef a,
                       LLVMValueRef b,
-                      unsigned char swizzle[4]);
+                      const unsigned char swizzle[4]);
 
 
 #endif /* !LP_BLD_SWIZZLE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_tgsi.h b/src/gallium/drivers/llvmpipe/lp_bld_tgsi.h
index 912db24aec..eddb7a83fa 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_tgsi.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_tgsi.h
@@ -39,31 +39,46 @@
 
 
 struct tgsi_token;
-union lp_type;
+struct lp_type;
 struct lp_build_context;
 struct lp_build_mask_context;
 
 
-typedef void
-(*lp_emit_fetch_texel_soa_callback)( LLVMBuilderRef builder,
-                                void *context,
-                                unsigned unit,
-                                unsigned num_coords,
-                                const LLVMValueRef *coords,
-                                LLVMValueRef lodbias,
-                                LLVMValueRef *texel);
+/**
+ * Sampler code generation interface.
+ *
+ * Although texture sampling is a requirement for TGSI translation, it is
+ * a very different problem with several different approaches to it. This
+ * structure establishes an interface for texture sampling code generation, so
+ * that we can easily use different texture sampling strategies.
+ */
+struct lp_build_sampler_soa
+{
+   void
+   (*destroy)( struct lp_build_sampler_soa *sampler );
+
+   void
+   (*emit_fetch_texel)( struct lp_build_sampler_soa *sampler,
+                        LLVMBuilderRef builder,
+                        struct lp_type type,
+                        unsigned unit,
+                        unsigned num_coords,
+                        const LLVMValueRef *coords,
+                        LLVMValueRef lodbias,
+                        LLVMValueRef *texel);
+};
+
 
 void
 lp_build_tgsi_soa(LLVMBuilderRef builder,
                   const struct tgsi_token *tokens,
-                  union lp_type type,
+                  struct lp_type type,
                   struct lp_build_mask_context *mask,
                   LLVMValueRef consts_ptr,
                   const LLVMValueRef *pos,
                   const LLVMValueRef (*inputs)[4],
                   LLVMValueRef (*outputs)[4],
-                  lp_emit_fetch_texel_soa_callback emit_fetch_texel,
-                  void *emit_fetch_texel_context);
+                  struct lp_build_sampler_soa *sampler);
 
 
 #endif /* LP_BLD_TGSI_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
index d4d18febec..adc81569ed 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
@@ -78,6 +78,11 @@
 #define CHAN_Z 2
 #define CHAN_W 3
 
+#define QUAD_TOP_LEFT     0
+#define QUAD_TOP_RIGHT    1
+#define QUAD_BOTTOM_LEFT  2
+#define QUAD_BOTTOM_RIGHT 3
+
 
 struct lp_build_tgsi_soa_context
 {
@@ -88,8 +93,7 @@ struct lp_build_tgsi_soa_context
    const LLVMValueRef (*inputs)[NUM_CHANNELS];
    LLVMValueRef (*outputs)[NUM_CHANNELS];
 
-   lp_emit_fetch_texel_soa_callback emit_fetch_texel;
-   void *emit_fetch_texel_context;
+   struct lp_build_sampler_soa *sampler;
 
    LLVMValueRef immediates[LP_MAX_IMMEDIATES][NUM_CHANNELS];
    LLVMValueRef temps[LP_MAX_TEMPS][NUM_CHANNELS];
@@ -98,6 +102,51 @@ struct lp_build_tgsi_soa_context
 };
 
 
+static const unsigned char
+swizzle_left[4] = {
+   QUAD_TOP_LEFT,     QUAD_TOP_LEFT,
+   QUAD_BOTTOM_LEFT,  QUAD_BOTTOM_LEFT
+};
+
+static const unsigned char
+swizzle_right[4] = {
+   QUAD_TOP_RIGHT,    QUAD_TOP_RIGHT,
+   QUAD_BOTTOM_RIGHT, QUAD_BOTTOM_RIGHT
+};
+
+static const unsigned char
+swizzle_top[4] = {
+   QUAD_TOP_LEFT,     QUAD_TOP_RIGHT,
+   QUAD_TOP_LEFT,     QUAD_TOP_RIGHT
+};
+
+static const unsigned char
+swizzle_bottom[4] = {
+   QUAD_BOTTOM_LEFT,  QUAD_BOTTOM_RIGHT,
+   QUAD_BOTTOM_LEFT,  QUAD_BOTTOM_RIGHT
+};
+
+
+static LLVMValueRef
+emit_ddx(struct lp_build_tgsi_soa_context *bld,
+         LLVMValueRef src)
+{
+   LLVMValueRef src_left  = lp_build_swizzle1_aos(&bld->base, src, swizzle_left);
+   LLVMValueRef src_right = lp_build_swizzle1_aos(&bld->base, src, swizzle_right);
+   return lp_build_sub(&bld->base, src_right, src_left);
+}
+
+
+static LLVMValueRef
+emit_ddy(struct lp_build_tgsi_soa_context *bld,
+         LLVMValueRef src)
+{
+   LLVMValueRef src_top    = lp_build_swizzle1_aos(&bld->base, src, swizzle_top);
+   LLVMValueRef src_bottom = lp_build_swizzle1_aos(&bld->base, src, swizzle_bottom);
+   return lp_build_sub(&bld->base, src_top, src_bottom);
+}
+
+
 /**
  * Register fetch.
  */
@@ -168,6 +217,7 @@ emit_fetch(
       break;
 
    case TGSI_UTIL_SIGN_SET:
+      /* TODO: Use bitwese OR for floating point */
       res = lp_build_abs( &bld->base, res );
       res = LLVMBuildNeg( bld->base.builder, res, "" );
       break;
@@ -185,6 +235,36 @@ emit_fetch(
 
 
 /**
+ * Register fetch with derivatives.
+ */
+static void
+emit_fetch_deriv(
+   struct lp_build_tgsi_soa_context *bld,
+   const struct tgsi_full_instruction *inst,
+   unsigned index,
+   const unsigned chan_index,
+   LLVMValueRef *res,
+   LLVMValueRef *ddx,
+   LLVMValueRef *ddy)
+{
+   LLVMValueRef src;
+
+   src = emit_fetch(bld, inst, index, chan_index);
+
+   if(res)
+      *res = src;
+
+   /* TODO: use interpolation coeffs for inputs */
+
+   if(ddx)
+      *ddx = emit_ddx(bld, src);
+
+   if(ddy)
+      *ddy = emit_ddy(bld, src);
+}
+
+
+/**
  * Register store.
  */
 static void
@@ -239,17 +319,18 @@ emit_store(
  * High-level instruction translators.
  */
 
+
 static void
 emit_tex( struct lp_build_tgsi_soa_context *bld,
           const struct tgsi_full_instruction *inst,
           boolean apply_lodbias,
-          boolean projected)
+          boolean projected,
+          LLVMValueRef *texel)
 {
    const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
    LLVMValueRef lodbias;
    LLVMValueRef oow;
    LLVMValueRef coords[3];
-   LLVMValueRef texel[4];
    unsigned num_coords;
    unsigned i;
 
@@ -289,12 +370,11 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
          coords[i] = lp_build_mul(&bld->base, coords[i], oow);
    }
 
-   bld->emit_fetch_texel(bld->base.builder, bld->emit_fetch_texel_context,
-                         unit, num_coords, coords, lodbias, texel);
-
-   FOR_EACH_DST0_ENABLED_CHANNEL( inst, i ) {
-      emit_store( bld, inst, 0, i, texel[i] );
-   }
+   bld->sampler->emit_fetch_texel(bld->sampler,
+                                  bld->base.builder,
+                                  bld->base.type,
+                                  unit, num_coords, coords, lodbias,
+                                  texel);
 }
 
 
@@ -347,14 +427,6 @@ emit_kil(
 }
 
 
-static void
-emit_kilp(
-   struct lp_build_tgsi_soa_context *bld )
-{
-   /* XXX todo / fix me */
-}
-
-
 /**
  * Check if inst src/dest regs use indirect addressing into temporary
  * register file.
@@ -382,25 +454,35 @@ indirect_temp_reference(const struct tgsi_full_instruction *inst)
 static int
 emit_instruction(
    struct lp_build_tgsi_soa_context *bld,
-   struct tgsi_full_instruction *inst )
+   const struct tgsi_full_instruction *inst,
+   const struct tgsi_opcode_info *info)
 {
    unsigned chan_index;
    LLVMValueRef src0, src1, src2;
    LLVMValueRef tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-   LLVMValueRef dst0;
+   LLVMValueRef res;
+   LLVMValueRef dst0[NUM_CHANNELS];
 
    /* we can't handle indirect addressing into temp register file yet */
    if (indirect_temp_reference(inst))
       return FALSE;
 
+   assert(info->num_dst <= 1);
+   if(info->num_dst) {
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = bld->base.undef;
+      }
+   }
+
    switch (inst->Instruction.Opcode) {
 #if 0
    case TGSI_OPCODE_ARL:
+      /* FIXME */
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          tmp0 = emit_fetch( bld, inst, 0, chan_index );
          emit_flr(bld, 0, 0);
          emit_f2it( bld, 0 );
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
 #endif
@@ -408,19 +490,17 @@ emit_instruction(
    case TGSI_OPCODE_MOV:
    case TGSI_OPCODE_SWZ:
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = emit_fetch( bld, inst, 0, chan_index );
       }
       break;
 
    case TGSI_OPCODE_LIT:
       if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ) {
-         emit_store( bld, inst, 0, CHAN_X, bld->base.one);
+         dst0[CHAN_X] = bld->base.one;
       }
       if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ) {
          src0 = emit_fetch( bld, inst, 0, CHAN_X );
-         dst0 = lp_build_max( &bld->base, src0, bld->base.zero);
-         emit_store( bld, inst, 0, CHAN_Y, dst0);
+         dst0[CHAN_Y] = lp_build_max( &bld->base, src0, bld->base.zero);
       }
       if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) ) {
          /* XMM[1] = SrcReg[0].yyyy */
@@ -432,20 +512,19 @@ emit_instruction(
          tmp1 = lp_build_pow( &bld->base, tmp1, tmp2);
          tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
          tmp2 = lp_build_cmp(&bld->base, PIPE_FUNC_GREATER, tmp0, bld->base.zero);
-         dst0 = lp_build_select(&bld->base, tmp2, tmp1, bld->base.zero);
-         emit_store( bld, inst, 0, CHAN_Z, dst0);
+         dst0[CHAN_Z] = lp_build_select(&bld->base, tmp2, tmp1, bld->base.zero);
       }
       if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) ) {
-         emit_store( bld, inst, 0, CHAN_W, bld->base.one);
+         dst0[CHAN_W] = bld->base.one;
       }
       break;
 
    case TGSI_OPCODE_RCP:
    /* TGSI_OPCODE_RECIP */
       src0 = emit_fetch( bld, inst, 0, CHAN_X );
-      dst0 = lp_build_rcp(&bld->base, src0);
+      res = lp_build_rcp(&bld->base, src0);
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, dst0 );
+         dst0[chan_index] = res;
       }
       break;
 
@@ -453,9 +532,9 @@ emit_instruction(
    /* TGSI_OPCODE_RECIPSQRT */
       src0 = emit_fetch( bld, inst, 0, CHAN_X );
       src0 = lp_build_abs(&bld->base, src0);
-      dst0 = lp_build_rsqrt(&bld->base, src0);
+      res = lp_build_rsqrt(&bld->base, src0);
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, dst0 );
+         dst0[chan_index] = res;
       }
       break;
 
@@ -479,16 +558,15 @@ emit_instruction(
          lp_build_exp2_approx(&bld->base, src0, p_exp2_int_part, p_frac_part, p_exp2);
 
          if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
-            emit_store( bld, inst, 0, CHAN_X, tmp0);
+            dst0[CHAN_X] = tmp0;
          if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ))
-            emit_store( bld, inst, 0, CHAN_Y, tmp1);
+            dst0[CHAN_Y] = tmp1;
          if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
-            emit_store( bld, inst, 0, CHAN_Z, tmp2);
+            dst0[CHAN_Z] = tmp2;
       }
       /* dst.w = 1.0 */
       if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_W )) {
-         tmp0 = bld->base.one;
-         emit_store( bld, inst, 0, CHAN_W, tmp0);
+         dst0[CHAN_W] = bld->base.one;
       }
       break;
 
@@ -514,20 +592,18 @@ emit_instruction(
 
          /* dst.x = floor(lg2(abs(src.x))) */
          if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
-            emit_store( bld, inst, 0, CHAN_X, tmp0);
+            dst0[CHAN_X] = tmp0;
          /* dst.y = abs(src)/ex2(floor(lg2(abs(src.x)))) */
          if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y )) {
-            tmp1 = lp_build_div( &bld->base, src0, tmp1);
-            emit_store( bld, inst, 0, CHAN_Y, tmp1);
+            dst0[CHAN_Y] = lp_build_div( &bld->base, src0, tmp1);
          }
          /* dst.z = lg2(abs(src.x)) */
          if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
-            emit_store( bld, inst, 0, CHAN_Z, tmp2);
+            dst0[CHAN_Z] = tmp2;
       }
       /* dst.w = 1.0 */
       if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_W )) {
-         tmp0 = bld->base.one;
-         emit_store( bld, inst, 0, CHAN_W, tmp0);
+         dst0[CHAN_W] = bld->base.one;
       }
       break;
 
@@ -535,8 +611,7 @@ emit_instruction(
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
-         dst0 = lp_build_mul(&bld->base, src0, src1);
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_mul(&bld->base, src0, src1);
       }
       break;
 
@@ -544,8 +619,7 @@ emit_instruction(
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
-         dst0 = lp_build_add(&bld->base, src0, src1);
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_add(&bld->base, src0, src1);
       }
       break;
 
@@ -563,7 +637,7 @@ emit_instruction(
       tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
       tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
 
@@ -585,28 +659,24 @@ emit_instruction(
       tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
       tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
 
    case TGSI_OPCODE_DST:
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) {
-         tmp0 = bld->base.one;
-         emit_store( bld, inst, 0, CHAN_X, tmp0);
+         dst0[CHAN_X] = bld->base.one;
       }
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) {
          tmp0 = emit_fetch( bld, inst, 0, CHAN_Y );
          tmp1 = emit_fetch( bld, inst, 1, CHAN_Y );
-         tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
-         emit_store( bld, inst, 0, CHAN_Y, tmp0);
+         dst0[CHAN_Y] = lp_build_mul( &bld->base, tmp0, tmp1);
       }
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) {
-         tmp0 = emit_fetch( bld, inst, 0, CHAN_Z );
-         emit_store( bld, inst, 0, CHAN_Z, tmp0);
+         dst0[CHAN_Z] = emit_fetch( bld, inst, 0, CHAN_Z );
       }
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) {
-         tmp0 = emit_fetch( bld, inst, 1, CHAN_W );
-         emit_store( bld, inst, 0, CHAN_W, tmp0);
+         dst0[CHAN_W] = emit_fetch( bld, inst, 1, CHAN_W );
       }
       break;
 
@@ -614,8 +684,7 @@ emit_instruction(
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
-         dst0 = lp_build_min( &bld->base, src0, src1 );
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_min( &bld->base, src0, src1 );
       }
       break;
 
@@ -623,8 +692,7 @@ emit_instruction(
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
-         dst0 = lp_build_max( &bld->base, src0, src1 );
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_max( &bld->base, src0, src1 );
       }
       break;
 
@@ -634,8 +702,7 @@ emit_instruction(
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
          tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_LESS, src0, src1 );
-         dst0 = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
       }
       break;
 
@@ -645,8 +712,7 @@ emit_instruction(
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
          tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GEQUAL, src0, src1 );
-         dst0 = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
       }
       break;
 
@@ -658,7 +724,7 @@ emit_instruction(
          tmp2 = emit_fetch( bld, inst, 2, chan_index );
          tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
          tmp0 = lp_build_add( &bld->base, tmp0, tmp2);
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
 
@@ -666,8 +732,7 @@ emit_instruction(
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          tmp0 = emit_fetch( bld, inst, 0, chan_index );
          tmp1 = emit_fetch( bld, inst, 1, chan_index );
-         tmp0 = lp_build_sub( &bld->base, tmp0, tmp1);
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = lp_build_sub( &bld->base, tmp0, tmp1);
       }
       break;
 
@@ -678,13 +743,19 @@ emit_instruction(
          src2 = emit_fetch( bld, inst, 2, chan_index );
          tmp0 = lp_build_sub( &bld->base, src1, src2 );
          tmp0 = lp_build_mul( &bld->base, src0, tmp0 );
-         dst0 = lp_build_add( &bld->base, tmp0, src2 );
-         emit_store( bld, inst, 0, chan_index, dst0 );
+         dst0[chan_index] = lp_build_add( &bld->base, tmp0, src2 );
       }
       break;
 
    case TGSI_OPCODE_CND:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         src2 = emit_fetch( bld, inst, 2, chan_index );
+         tmp1 = lp_build_const_scalar(bld->base.type, 0.5);
+         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GREATER, src2, tmp1);
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, src0, src1 );
+      }
       break;
 
    case TGSI_OPCODE_DP2A:
@@ -698,45 +769,49 @@ emit_instruction(
       tmp1 = emit_fetch( bld, inst, 2, CHAN_X );  /* xmm1 = src[2].x */
       tmp0 = lp_build_add( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 + xmm1 */
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, tmp0);  /* dest[ch] = xmm0 */
+         dst0[chan_index] = tmp0;  /* dest[ch] = xmm0 */
       }
       break;
 
-#if 0
    case TGSI_OPCODE_FRC:
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         emit_frc( bld, 0, 0 );
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         tmp0 = lp_build_floor(&bld->base, src0);
+         tmp0 = lp_build_sub(&bld->base, tmp0, src0);
+         dst0[chan_index] = tmp0;
       }
       break;
 
    case TGSI_OPCODE_CLAMP:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         src2 = emit_fetch( bld, inst, 2, chan_index );
+         tmp0 = lp_build_max(&bld->base, tmp0, src1);
+         tmp0 = lp_build_min(&bld->base, tmp0, src2);
+         dst0[chan_index] = tmp0;
+      }
       break;
 
    case TGSI_OPCODE_FLR:
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         emit_flr( bld, 0, 0 );
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = lp_build_floor(&bld->base, tmp0);
       }
       break;
 
    case TGSI_OPCODE_ROUND:
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         emit_rnd( bld, 0, 0 );
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = lp_build_round(&bld->base, tmp0);
       }
       break;
-#endif
 
    case TGSI_OPCODE_EX2: {
       tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
       tmp0 = lp_build_exp2( &bld->base, tmp0);
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
    }
@@ -745,16 +820,16 @@ emit_instruction(
       tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
       tmp0 = lp_build_log2( &bld->base, tmp0);
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
 
    case TGSI_OPCODE_POW:
       src0 = emit_fetch( bld, inst, 0, CHAN_X );
       src1 = emit_fetch( bld, inst, 1, CHAN_X );
-      dst0 = lp_build_pow( &bld->base, src0, src1 );
+      res = lp_build_pow( &bld->base, src0, src1 );
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, dst0 );
+         dst0[chan_index] = res;
       }
       break;
 
@@ -775,7 +850,7 @@ emit_instruction(
          tmp5 = tmp3;
          tmp5 = lp_build_mul( &bld->base, tmp5, tmp4);
          tmp2 = lp_build_sub( &bld->base, tmp2, tmp5);
-         emit_store( bld, inst, 0, CHAN_X, tmp2);
+         dst0[CHAN_X] = tmp2;
       }
       if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ||
           IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) ) {
@@ -786,31 +861,30 @@ emit_instruction(
          tmp3 = lp_build_mul( &bld->base, tmp3, tmp2);
          tmp1 = lp_build_mul( &bld->base, tmp1, tmp5);
          tmp3 = lp_build_sub( &bld->base, tmp3, tmp1);
-         emit_store( bld, inst, 0, CHAN_Y, tmp3);
+         dst0[CHAN_Y] = tmp3;
       }
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) {
          tmp5 = lp_build_mul( &bld->base, tmp5, tmp4);
          tmp0 = lp_build_mul( &bld->base, tmp0, tmp2);
          tmp5 = lp_build_sub( &bld->base, tmp5, tmp0);
-         emit_store( bld, inst, 0, CHAN_Z, tmp5);
+         dst0[CHAN_Z] = tmp5;
       }
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) {
-         tmp0 = bld->base.one;
-         emit_store( bld, inst, 0, CHAN_W, tmp0);
+         dst0[CHAN_W] = bld->base.one;
       }
       break;
 
    case TGSI_OPCODE_ABS:
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         tmp0 = lp_build_abs( &bld->base, tmp0 ) ;
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = lp_build_abs( &bld->base, tmp0 );
       }
       break;
 
    case TGSI_OPCODE_RCC:
+      /* deprecated? */
+      assert(0);
       return 0;
-      break;
 
    case TGSI_OPCODE_DPH:
       tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
@@ -827,7 +901,7 @@ emit_instruction(
       tmp1 = emit_fetch( bld, inst, 1, CHAN_W );
       tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
 
@@ -835,25 +909,27 @@ emit_instruction(
       tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
       tmp0 = lp_build_cos( &bld->base, tmp0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
 
    case TGSI_OPCODE_DDX:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         emit_fetch_deriv( bld, inst, 0, chan_index, NULL, &dst0[chan_index], NULL);
+      }
       break;
 
    case TGSI_OPCODE_DDY:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         emit_fetch_deriv( bld, inst, 0, chan_index, NULL, NULL, &dst0[chan_index]);
+      }
       break;
 
-#if 0
    case TGSI_OPCODE_KILP:
       /* predicated kill */
-      emit_kilp( bld );
-      return 0; /* XXX fix me */
+      /* FIXME */
+      return 0;
       break;
-#endif
 
    case TGSI_OPCODE_KIL:
       /* conditional kill */
@@ -885,13 +961,14 @@ emit_instruction(
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
          tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_EQUAL, src0, src1 );
-         dst0 = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
       }
       break;
 
    case TGSI_OPCODE_SFL:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = bld->base.zero;
+      }
       break;
 
    case TGSI_OPCODE_SGT:
@@ -899,8 +976,7 @@ emit_instruction(
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
          tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GREATER, src0, src1 );
-         dst0 = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
       }
       break;
 
@@ -908,7 +984,7 @@ emit_instruction(
       tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
       tmp0 = lp_build_sin( &bld->base, tmp0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
 
@@ -917,8 +993,7 @@ emit_instruction(
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
          tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_LEQUAL, src0, src1 );
-         dst0 = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
       }
       break;
 
@@ -927,85 +1002,99 @@ emit_instruction(
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
          tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_NOTEQUAL, src0, src1 );
-         dst0 = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
       }
       break;
 
    case TGSI_OPCODE_STR:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = bld->base.one;
+      }
       break;
 
    case TGSI_OPCODE_TEX:
-      emit_tex( bld, inst, FALSE, FALSE );
+      emit_tex( bld, inst, FALSE, FALSE, dst0 );
       break;
 
    case TGSI_OPCODE_TXD:
+      /* FIXME */
       return 0;
       break;
 
    case TGSI_OPCODE_UP2H:
+      /* deprecated */
+      assert (0);
       return 0;
       break;
 
    case TGSI_OPCODE_UP2US:
+      /* deprecated */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_UP4B:
+      /* deprecated */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_UP4UB:
+      /* deprecated */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_X2D:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_ARA:
+      /* deprecated */
+      assert(0);
       return 0;
       break;
 
 #if 0
    case TGSI_OPCODE_ARR:
+      /* FIXME */
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          tmp0 = emit_fetch( bld, inst, 0, chan_index );
          emit_rnd( bld, 0, 0 );
          emit_f2it( bld, 0 );
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
 #endif
 
    case TGSI_OPCODE_BRA:
+      /* deprecated */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_CAL:
+      /* FIXME */
       return 0;
       break;
 
-#if 0
    case TGSI_OPCODE_RET:
-      emit_ret( bld );
+      /* FIXME */
+      return 0;
       break;
-#endif
 
    case TGSI_OPCODE_END:
       break;
 
-#if 0
    case TGSI_OPCODE_SSG:
    /* TGSI_OPCODE_SGN */
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         emit_sgn( bld, 0, 0 );
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = lp_build_sgn( &bld->base, tmp0 );
       }
       break;
-#endif
 
    case TGSI_OPCODE_CMP:
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
@@ -1013,34 +1102,29 @@ emit_instruction(
          src1 = emit_fetch( bld, inst, 1, chan_index );
          src2 = emit_fetch( bld, inst, 2, chan_index );
          tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_LESS, src0, bld->base.zero );
-         dst0 = lp_build_select( &bld->base, tmp0, src1, src2);
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, src1, src2);
       }
       break;
 
    case TGSI_OPCODE_SCS:
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) {
          tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
-         tmp0 = lp_build_cos( &bld->base, tmp0 );
-         emit_store( bld, inst, 0, CHAN_X, tmp0);
+         dst0[CHAN_X] = lp_build_cos( &bld->base, tmp0 );
       }
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) {
          tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
-         tmp0 = lp_build_sin( &bld->base, tmp0 );
-         emit_store( bld, inst, 0, CHAN_Y, tmp0);
+         dst0[CHAN_Y] = lp_build_sin( &bld->base, tmp0 );
       }
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) {
-         tmp0 = bld->base.zero;
-         emit_store( bld, inst, 0, CHAN_Z, tmp0);
+         dst0[CHAN_Z] = bld->base.zero;
       }
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) {
-         tmp0 = bld->base.one;
-         emit_store( bld, inst, 0, CHAN_W, tmp0);
+         dst0[CHAN_W] = bld->base.one;
       }
       break;
 
    case TGSI_OPCODE_TXB:
-      emit_tex( bld, inst, TRUE, FALSE );
+      emit_tex( bld, inst, TRUE, FALSE, dst0 );
       break;
 
    case TGSI_OPCODE_NRM:
@@ -1099,38 +1183,35 @@ emit_instruction(
 
             /* dst.x = xmm1 * src.x */
             if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X)) {
-               tmp4 = lp_build_mul( &bld->base, tmp4, tmp1);
-               emit_store(bld, inst, 0, CHAN_X, tmp4);
+               dst0[CHAN_X] = lp_build_mul( &bld->base, tmp4, tmp1);
             }
 
             /* dst.y = xmm1 * src.y */
             if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Y)) {
-               tmp5 = lp_build_mul( &bld->base, tmp5, tmp1);
-               emit_store(bld, inst, 0, CHAN_Y, tmp5);
+               dst0[CHAN_Y] = lp_build_mul( &bld->base, tmp5, tmp1);
             }
 
             /* dst.z = xmm1 * src.z */
             if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Z)) {
-               tmp6 = lp_build_mul( &bld->base, tmp6, tmp1);
-               emit_store(bld, inst, 0, CHAN_Z, tmp6);
+               dst0[CHAN_Z] = lp_build_mul( &bld->base, tmp6, tmp1);
             }
 
             /* dst.w = xmm1 * src.w */
             if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X) && dims == 4) {
-               tmp7 = lp_build_mul( &bld->base, tmp7, tmp1);
-               emit_store(bld, inst, 0, CHAN_W, tmp7);
+               dst0[CHAN_W] = lp_build_mul( &bld->base, tmp7, tmp1);
             }
          }
 
-         /* dst0.w = 1.0 */
+         /* dst.w = 1.0 */
          if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_W) && dims == 3) {
-            tmp0 = bld->base.one;
-            emit_store(bld, inst, 0, CHAN_W, tmp0);
+            dst0[CHAN_W] = bld->base.one;
          }
       }
       break;
 
    case TGSI_OPCODE_DIV:
+      /* deprecated */
+      assert( 0 );
       return 0;
       break;
 
@@ -1143,118 +1224,157 @@ emit_instruction(
       tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);              /* xmm1 = xmm1 * xmm2 */
       tmp0 = lp_build_add( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 + xmm1 */
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, tmp0);  /* dest[ch] = xmm0 */
+         dst0[chan_index] = tmp0;  /* dest[ch] = xmm0 */
       }
       break;
 
    case TGSI_OPCODE_TXL:
-      emit_tex( bld, inst, TRUE, FALSE );
+      emit_tex( bld, inst, TRUE, FALSE, dst0 );
       break;
 
    case TGSI_OPCODE_TXP:
-      emit_tex( bld, inst, FALSE, TRUE );
+      emit_tex( bld, inst, FALSE, TRUE, dst0 );
       break;
       
    case TGSI_OPCODE_BRK:
+      /* FIXME */
       return 0;
       break;
 
    case TGSI_OPCODE_IF:
+      /* FIXME */
       return 0;
       break;
 
    case TGSI_OPCODE_BGNFOR:
+      /* deprecated */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_REP:
+      /* deprecated */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_ELSE:
+      /* FIXME */
       return 0;
       break;
 
    case TGSI_OPCODE_ENDIF:
+      /* FIXME */
       return 0;
       break;
 
    case TGSI_OPCODE_ENDFOR:
+      /* deprecated */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_ENDREP:
+      /* deprecated */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_PUSHA:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_POPA:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_CEIL:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         dst0[chan_index] = lp_build_ceil(&bld->base, tmp0);
+      }
       break;
 
    case TGSI_OPCODE_I2F:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_NOT:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
-#if 0
    case TGSI_OPCODE_TRUNC:
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         emit_f2it( bld, 0 );
-         emit_i2f( bld, 0 );
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = lp_build_trunc(&bld->base, tmp0);
       }
       break;
-#endif
 
    case TGSI_OPCODE_SHL:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_SHR:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_AND:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_OR:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_MOD:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_XOR:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_SAD:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_TXF:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_TXQ:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_CONT:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
@@ -1266,10 +1386,28 @@ emit_instruction(
       return 0;
       break;
 
+   case TGSI_OPCODE_NOISE1:
+   case TGSI_OPCODE_NOISE2:
+   case TGSI_OPCODE_NOISE3:
+   case TGSI_OPCODE_NOISE4:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = bld->base.zero;
+      }
+      break;
+
+   case TGSI_OPCODE_NOP:
+      break;
+
    default:
       return 0;
    }
    
+   if(info->num_dst) {
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         emit_store( bld, inst, 0, chan_index, dst0[chan_index]);
+      }
+   }
+
    return 1;
 }
 
@@ -1277,14 +1415,13 @@ emit_instruction(
 void
 lp_build_tgsi_soa(LLVMBuilderRef builder,
                   const struct tgsi_token *tokens,
-                  union lp_type type,
+                  struct lp_type type,
                   struct lp_build_mask_context *mask,
                   LLVMValueRef consts_ptr,
                   const LLVMValueRef *pos,
                   const LLVMValueRef (*inputs)[NUM_CHANNELS],
                   LLVMValueRef (*outputs)[NUM_CHANNELS],
-                  lp_emit_fetch_texel_soa_callback emit_fetch_texel,
-                  void *emit_fetch_texel_context)
+                  struct lp_build_sampler_soa *sampler)
 {
    struct lp_build_tgsi_soa_context bld;
    struct tgsi_parse_context parse;
@@ -1299,8 +1436,7 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
    bld.inputs = inputs;
    bld.outputs = outputs;
    bld.consts_ptr = consts_ptr;
-   bld.emit_fetch_texel = emit_fetch_texel;
-   bld.emit_fetch_texel_context = emit_fetch_texel_context;
+   bld.sampler = sampler;
 
    tgsi_parse_init( &parse, tokens );
 
@@ -1309,16 +1445,18 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
 
       switch( parse.FullToken.Token.Type ) {
       case TGSI_TOKEN_TYPE_DECLARATION:
-         /* Input already interpolated */
+         /* Inputs already interpolated */
          break;
 
       case TGSI_TOKEN_TYPE_INSTRUCTION:
-         if (!emit_instruction( &bld, &parse.FullToken.FullInstruction )) {
+         {
             unsigned opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
             const struct tgsi_opcode_info *info = tgsi_get_opcode_info(opcode);
-	    _debug_printf("warning: failed to translate tgsi opcode %s to LLVM\n",
-	                  info ? info->mnemonic : "<invalid>");
-	 }
+            if (!emit_instruction( &bld, &parse.FullToken.FullInstruction, info ))
+               _debug_printf("warning: failed to translate tgsi opcode %s to LLVM\n",
+                             info ? info->mnemonic : "<invalid>");
+         }
+
          break;
 
       case TGSI_TOKEN_TYPE_IMMEDIATE:
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_type.c b/src/gallium/drivers/llvmpipe/lp_bld_type.c
index 8e0026fd97..606243d6c5 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_type.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_type.c
@@ -33,7 +33,7 @@
 
 
 LLVMTypeRef
-lp_build_elem_type(union lp_type type)
+lp_build_elem_type(struct lp_type type)
 {
    if (type.floating) {
       switch(type.width) {
@@ -55,7 +55,7 @@ lp_build_elem_type(union lp_type type)
 
 
 LLVMTypeRef
-lp_build_vec_type(union lp_type type)
+lp_build_vec_type(struct lp_type type)
 {
    LLVMTypeRef elem_type = lp_build_elem_type(type);
    return LLVMVectorType(elem_type, type.length);
@@ -69,7 +69,7 @@ lp_build_vec_type(union lp_type type)
  * type and check for identity.
  */
 boolean
-lp_check_elem_type(union lp_type type, LLVMTypeRef elem_type) 
+lp_check_elem_type(struct lp_type type, LLVMTypeRef elem_type) 
 {
    LLVMTypeKind elem_kind;
 
@@ -107,7 +107,7 @@ lp_check_elem_type(union lp_type type, LLVMTypeRef elem_type)
 
 
 boolean
-lp_check_vec_type(union lp_type type, LLVMTypeRef vec_type) 
+lp_check_vec_type(struct lp_type type, LLVMTypeRef vec_type) 
 {
    LLVMTypeRef elem_type;
 
@@ -128,7 +128,7 @@ lp_check_vec_type(union lp_type type, LLVMTypeRef vec_type)
 
 
 boolean
-lp_check_value(union lp_type type, LLVMValueRef val) 
+lp_check_value(struct lp_type type, LLVMValueRef val) 
 {
    LLVMTypeRef vec_type;
 
@@ -143,24 +143,36 @@ lp_check_value(union lp_type type, LLVMValueRef val)
 
 
 LLVMTypeRef
-lp_build_int_elem_type(union lp_type type)
+lp_build_int_elem_type(struct lp_type type)
 {
    return LLVMIntType(type.width);
 }
 
 
 LLVMTypeRef
-lp_build_int_vec_type(union lp_type type)
+lp_build_int_vec_type(struct lp_type type)
 {
    LLVMTypeRef elem_type = lp_build_int_elem_type(type);
    return LLVMVectorType(elem_type, type.length);
 }
 
 
+struct lp_type
+lp_int_type(struct lp_type type)
+{
+   struct lp_type int_type;
+
+   memset(&int_type, 0, sizeof int_type);
+   int_type.width = type.width;
+   int_type.length = type.length;
+   return int_type;
+}
+
+
 void
 lp_build_context_init(struct lp_build_context *bld,
                       LLVMBuilderRef builder,
-                      union lp_type type)
+                      struct lp_type type)
 {
    bld->builder = builder;
    bld->type = type;
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_type.h b/src/gallium/drivers/llvmpipe/lp_bld_type.h
index 3ce566be64..ee5ca3483c 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_type.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_type.h
@@ -56,58 +56,55 @@
  * on the types used for intermediate computations, such as signed vs unsigned,
  * normalized values, or fixed point.
  */
-union lp_type {
-   struct {
-      /** 
-       * Floating-point. Cannot be used with fixed. Integer numbers are
-       * represented by this zero.
-       */
-      unsigned floating:1;
-
-      /** 
-       * Fixed-point. Cannot be used with floating. Integer numbers are
-       * represented by this zero.
-       */
-      unsigned fixed:1;
-      
-      /** 
-       * Whether it can represent negative values or not.
-       *
-       * If this is not set for floating point, it means that all values are
-       * assumed to be positive.
-       */
-      unsigned sign:1;
-
-      /**
-       * Whether values are normalized to fit [0, 1] interval, or [-1, 1]
-       * interval for signed types.
-       *
-       * For integer types it means the representable integer range should be
-       * interpreted as the interval above.
-       *
-       * For floating and fixed point formats it means the values should be
-       * clamped to the interval above.
-       */
-      unsigned norm:1;
-
-      /**
-       * Element width.
-       *
-       * For fixed point values, the fixed point is assumed to be at half the
-       * width.
-       */
-      unsigned width:14;
-
-      /** 
-       * Vector length.
-       *
-       * width*length should be a power of two greater or equal to eight.
-       *
-       * @sa LP_MAX_VECTOR_LENGTH
-       */
-      unsigned length:14;
-   };
-   uint32_t value;
+struct lp_type {
+   /**
+    * Floating-point. Cannot be used with fixed. Integer numbers are
+    * represented by this zero.
+    */
+   unsigned floating:1;
+
+   /**
+    * Fixed-point. Cannot be used with floating. Integer numbers are
+    * represented by this zero.
+    */
+   unsigned fixed:1;
+
+   /**
+    * Whether it can represent negative values or not.
+    *
+    * If this is not set for floating point, it means that all values are
+    * assumed to be positive.
+    */
+   unsigned sign:1;
+
+   /**
+    * Whether values are normalized to fit [0, 1] interval, or [-1, 1]
+    * interval for signed types.
+    *
+    * For integer types it means the representable integer range should be
+    * interpreted as the interval above.
+    *
+    * For floating and fixed point formats it means the values should be
+    * clamped to the interval above.
+    */
+   unsigned norm:1;
+
+   /**
+    * Element width.
+    *
+    * For fixed point values, the fixed point is assumed to be at half the
+    * width.
+    */
+   unsigned width:14;
+
+   /**
+    * Vector length.
+    *
+    * width*length should be a power of two greater or equal to eight.
+    *
+    * @sa LP_MAX_VECTOR_LENGTH
+    */
+   unsigned length:14;
 };
 
 
@@ -124,7 +121,7 @@ struct lp_build_context
     * This not only describes the input/output LLVM types, but also whether
     * to normalize/clamp the results.
     */
-   union lp_type type;
+   struct lp_type type;
 
    /** Same as lp_build_undef(type) */
    LLVMValueRef undef;
@@ -138,37 +135,41 @@ struct lp_build_context
 
 
 LLVMTypeRef
-lp_build_elem_type(union lp_type type);
+lp_build_elem_type(struct lp_type type);
 
 
 LLVMTypeRef
-lp_build_vec_type(union lp_type type);
+lp_build_vec_type(struct lp_type type);
 
 
 boolean
-lp_check_elem_type(union lp_type type, LLVMTypeRef elem_type);
+lp_check_elem_type(struct lp_type type, LLVMTypeRef elem_type);
 
 
 boolean
-lp_check_vec_type(union lp_type type, LLVMTypeRef vec_type);
+lp_check_vec_type(struct lp_type type, LLVMTypeRef vec_type);
 
 
 boolean
-lp_check_value(union lp_type type, LLVMValueRef val);
+lp_check_value(struct lp_type type, LLVMValueRef val);
 
 
 LLVMTypeRef
-lp_build_int_elem_type(union lp_type type);
+lp_build_int_elem_type(struct lp_type type);
 
 
 LLVMTypeRef
-lp_build_int_vec_type(union lp_type type);
+lp_build_int_vec_type(struct lp_type type);
+
+
+struct lp_type
+lp_int_type(struct lp_type type);
 
 
 void
 lp_build_context_init(struct lp_build_context *bld,
                       LLVMBuilderRef builder,
-                      union lp_type type);
+                      struct lp_type type);
 
 
 #endif /* !LP_BLD_TYPE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_clear.c b/src/gallium/drivers/llvmpipe/lp_clear.c
index 580cca5b46..bdcff94b9b 100644
--- a/src/gallium/drivers/llvmpipe/lp_clear.c
+++ b/src/gallium/drivers/llvmpipe/lp_clear.c
@@ -67,6 +67,7 @@ llvmpipe_clear(struct pipe_context *pipe, unsigned buffers, const float *rgba,
          util_pack_color(rgba, ps->format, &cv);
          lp_tile_cache_clear(llvmpipe->cbuf_cache[i], rgba, cv);
       }
+      llvmpipe->dirty_render_cache = TRUE;
    }
 
    if (buffers & PIPE_CLEAR_DEPTHSTENCIL) {
diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c
index 233d1df0e1..202cb8ef43 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.c
+++ b/src/gallium/drivers/llvmpipe/lp_context.c
@@ -107,11 +107,16 @@ static void llvmpipe_destroy( struct pipe_context *pipe )
    if (llvmpipe->draw)
       draw_destroy( llvmpipe->draw );
 
-   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++)
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
       lp_destroy_tile_cache(llvmpipe->cbuf_cache[i]);
+      pipe_surface_reference(&llvmpipe->framebuffer.cbufs[i], NULL);
+   }
+   pipe_surface_reference(&llvmpipe->framebuffer.zsbuf, NULL);
 
-   for (i = 0; i < PIPE_MAX_SAMPLERS; i++)
+   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
       lp_destroy_tex_tile_cache(llvmpipe->tex_cache[i]);
+      pipe_texture_reference(&llvmpipe->texture[i], NULL);
+   }
 
    for (i = 0; i < Elements(llvmpipe->constants); i++) {
       if (llvmpipe->constants[i].buffer) {
@@ -141,8 +146,6 @@ llvmpipe_is_texture_referenced( struct pipe_context *pipe,
          return PIPE_REFERENCED_FOR_WRITE;
    }
    
-   /* FIXME: we also need to do the same for the texture cache */
-   
    return PIPE_UNREFERENCED;
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c
index d288460a1b..5d2cf01e5b 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.c
+++ b/src/gallium/drivers/llvmpipe/lp_jit.c
@@ -36,6 +36,7 @@
 #include <llvm-c/Transforms/Scalar.h>
 
 #include "util/u_memory.h"
+#include "util/u_cpu_detect.h"
 #include "lp_screen.h"
 #include "lp_bld_intr.h"
 #include "lp_jit.h"
@@ -44,15 +45,47 @@
 static void
 lp_jit_init_globals(struct llvmpipe_screen *screen)
 {
-   /* struct lp_jit_context */
+   LLVMTypeRef texture_type;
+
+   /* struct lp_jit_texture */
    {
       LLVMTypeRef elem_types[4];
+
+      elem_types[LP_JIT_TEXTURE_WIDTH]  = LLVMInt32Type();
+      elem_types[LP_JIT_TEXTURE_HEIGHT] = LLVMInt32Type();
+      elem_types[LP_JIT_TEXTURE_STRIDE] = LLVMInt32Type();
+      elem_types[LP_JIT_TEXTURE_DATA]   = LLVMPointerType(LLVMInt8Type(), 0);
+
+      texture_type = LLVMStructType(elem_types, Elements(elem_types), 0);
+
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, width,
+                             screen->target, texture_type,
+                             LP_JIT_TEXTURE_WIDTH);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, height,
+                             screen->target, texture_type,
+                             LP_JIT_TEXTURE_HEIGHT);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, stride,
+                             screen->target, texture_type,
+                             LP_JIT_TEXTURE_STRIDE);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, data,
+                             screen->target, texture_type,
+                             LP_JIT_TEXTURE_DATA);
+      LP_CHECK_STRUCT_SIZE(struct lp_jit_texture,
+                           screen->target, texture_type);
+
+      LLVMAddTypeName(screen->module, "texture", texture_type);
+   }
+
+   /* struct lp_jit_context */
+   {
+      LLVMTypeRef elem_types[5];
       LLVMTypeRef context_type;
 
       elem_types[0] = LLVMPointerType(LLVMFloatType(), 0); /* constants */
       elem_types[1] = LLVMPointerType(LLVMInt8Type(), 0);  /* samplers */
       elem_types[2] = LLVMFloatType();                     /* alpha_ref_value */
       elem_types[3] = LLVMPointerType(LLVMInt8Type(), 0);  /* blend_color */
+      elem_types[4] = LLVMArrayType(texture_type, PIPE_MAX_SAMPLERS); /* textures */
 
       context_type = LLVMStructType(elem_types, Elements(elem_types), 0);
 
@@ -64,6 +97,9 @@ lp_jit_init_globals(struct llvmpipe_screen *screen)
                              screen->target, context_type, 2);
       LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, blend_color,
                              screen->target, context_type, 3);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, textures,
+                             screen->target, context_type,
+                             LP_JIT_CONTEXT_TEXTURES_INDEX);
       LP_CHECK_STRUCT_SIZE(struct lp_jit_context,
                            screen->target, context_type);
 
@@ -112,12 +148,19 @@ lp_jit_screen_init(struct llvmpipe_screen *screen)
 {
    char *error = NULL;
 
+   util_cpu_detect();
+
+#ifdef LLVM_NATIVE_ARCH
+   LLVMLinkInJIT();
+   LLVMInitializeNativeTarget();
+#endif
+
    screen->module = LLVMModuleCreateWithName("llvmpipe");
 
    screen->provider = LLVMCreateModuleProviderForExistingModule(screen->module);
 
    if (LLVMCreateJITCompiler(&screen->engine, screen->provider, 1, &error)) {
-      fprintf(stderr, "%s\n", error);
+      _debug_printf("%s\n", error);
       LLVMDisposeMessage(error);
       abort();
    }
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h
index a7fb60f9f5..58f716ede2 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.h
+++ b/src/gallium/drivers/llvmpipe/lp_jit.h
@@ -38,11 +38,31 @@
 
 #include "lp_bld_struct.h"
 
+#include "pipe/p_state.h"
+
 
 struct tgsi_sampler;
 struct llvmpipe_screen;
 
 
+struct lp_jit_texture
+{
+   uint32_t width;
+   uint32_t height;
+   uint32_t stride;
+   const void *data;
+};
+
+
+enum {
+   LP_JIT_TEXTURE_WIDTH = 0,
+   LP_JIT_TEXTURE_HEIGHT,
+   LP_JIT_TEXTURE_STRIDE,
+   LP_JIT_TEXTURE_DATA
+};
+
+
+
 /**
  * This structure is passed directly to the generated fragment shader.
  *
@@ -60,11 +80,12 @@ struct lp_jit_context
 
    struct tgsi_sampler **samplers;
 
-   /* TODO: alpha reference value */
    float alpha_ref_value;
 
-   /* TODO: blend constant color */
+   /* FIXME: store (also?) in floats */
    uint8_t *blend_color;
+
+   struct lp_jit_texture textures[PIPE_MAX_SAMPLERS];
 };
 
 
@@ -80,6 +101,11 @@ struct lp_jit_context
 #define lp_jit_context_blend_color(_builder, _ptr) \
    lp_build_struct_get(_builder, _ptr, 3, "blend_color")
 
+#define LP_JIT_CONTEXT_TEXTURES_INDEX 4
+
+#define lp_jit_context_textures(_builder, _ptr) \
+   lp_build_struct_get_ptr(_builder, _ptr, LP_JIT_CONTEXT_TEXTURES_INDEX, "textures")
+
 
 typedef void
 (*lp_jit_frag_func)(struct lp_jit_context *context,
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 125035771e..0518927458 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -27,8 +27,6 @@
 
 
 #include "util/u_memory.h"
-#include "util/u_simple_screen.h"
-#include "pipe/internal/p_winsys_screen.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_screen.h"
 
@@ -67,8 +65,6 @@ llvmpipe_get_param(struct pipe_screen *screen, int param)
       return 1;
    case PIPE_CAP_GLSL:
       return 1;
-   case PIPE_CAP_S3TC:
-      return 0;
    case PIPE_CAP_ANISOTROPIC_FILTER:
       return 0;
    case PIPE_CAP_POINT_SPRITE:
@@ -86,7 +82,7 @@ llvmpipe_get_param(struct pipe_screen *screen, int param)
    case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
       return 13; /* max 4Kx4K */
    case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
-      return 8;  /* max 128x128x128 */
+      return 9;  /* max 256x256x256 */
    case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
       return 13; /* max 4Kx4K */
    case PIPE_CAP_TGSI_CONT_SUPPORTED:
@@ -196,8 +192,7 @@ static void
 llvmpipe_destroy_screen( struct pipe_screen *_screen )
 {
    struct llvmpipe_screen *screen = llvmpipe_screen(_screen);
-
-   struct pipe_winsys *winsys = _screen->winsys;
+   struct llvmpipe_winsys *winsys = screen->winsys;
 
    lp_jit_screen_cleanup(screen);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index d145f6d6bb..2d2fc19a65 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -44,6 +44,7 @@
 #include "pipe/p_thread.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "lp_bld_debug.h"
 #include "lp_tile_cache.h"
 #include "lp_tile_soa.h"
 
@@ -162,12 +163,12 @@ shade_quads(struct llvmpipe_context *llvmpipe,
    else
       depth = NULL;
 
-   /* TODO: blend color */
+   /* XXX: This will most likely fail on 32bit x86 without -mstackrealign */
+   assert(lp_check_alignment(mask, 16));
 
-   assert((((uintptr_t)mask) & 0xf) == 0);
-   assert((((uintptr_t)depth) & 0xf) == 0);
-   assert((((uintptr_t)color) & 0xf) == 0);
-   assert((((uintptr_t)llvmpipe->jit_context.blend_color) & 0xf) == 0);
+   assert(lp_check_alignment(depth, 16));
+   assert(lp_check_alignment(color, 16));
+   assert(lp_check_alignment(llvmpipe->jit_context.blend_color, 16));
 
    /* run shader */
    fs->current->jit_function( &llvmpipe->jit_context,
diff --git a/src/gallium/drivers/llvmpipe/lp_state.h b/src/gallium/drivers/llvmpipe/lp_state.h
index fb10329887..7b26ce61a3 100644
--- a/src/gallium/drivers/llvmpipe/lp_state.h
+++ b/src/gallium/drivers/llvmpipe/lp_state.h
@@ -36,6 +36,7 @@
 #include "pipe/p_state.h"
 #include "tgsi/tgsi_scan.h"
 #include "lp_jit.h"
+#include "lp_bld_sample.h" /* for struct lp_sampler_static_state */
 
 
 #define LP_NEW_VIEWPORT      0x1
@@ -57,16 +58,20 @@
 
 struct tgsi_sampler;
 struct vertex_info;
-
+struct pipe_context;
+struct llvmpipe_context;
 
 struct lp_fragment_shader;
 
 
 struct lp_fragment_shader_variant_key
 {
+   enum pipe_format zsbuf_format;
    struct pipe_depth_state depth;
    struct pipe_alpha_state alpha;
    struct pipe_blend_state blend;
+
+   struct lp_sampler_static_state sampler[PIPE_MAX_SAMPLERS];
 };
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index 6fbb057937..30fb41ea65 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -93,6 +93,23 @@ llvmpipe_get_vertex_info(struct llvmpipe_context *llvmpipe)
       vinfo->num_attribs = 0;
       for (i = 0; i < lpfs->info.num_inputs; i++) {
          int src;
+         enum interp_mode interp;
+
+         switch (lpfs->info.input_interpolate[i]) {
+         case TGSI_INTERPOLATE_CONSTANT:
+            interp = INTERP_CONSTANT;
+            break;
+         case TGSI_INTERPOLATE_LINEAR:
+            interp = INTERP_LINEAR;
+            break;
+         case TGSI_INTERPOLATE_PERSPECTIVE:
+            interp = INTERP_PERSPECTIVE;
+            break;
+         default:
+            assert(0);
+            interp = INTERP_LINEAR;
+         }
+
          switch (lpfs->info.input_semantic_name[i]) {
          case TGSI_SEMANTIC_POSITION:
             src = draw_find_vs_output(llvmpipe->draw,
@@ -108,7 +125,7 @@ llvmpipe_get_vertex_info(struct llvmpipe_context *llvmpipe)
 
          case TGSI_SEMANTIC_FOG:
             src = draw_find_vs_output(llvmpipe->draw, TGSI_SEMANTIC_FOG, 0);
-            draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
+            draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src);
             break;
 
          case TGSI_SEMANTIC_GENERIC:
@@ -116,7 +133,7 @@ llvmpipe_get_vertex_info(struct llvmpipe_context *llvmpipe)
             /* this includes texcoords and varying vars */
             src = draw_find_vs_output(llvmpipe->draw, TGSI_SEMANTIC_GENERIC,
                                       lpfs->info.input_semantic_index[i]);
-            draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
+            draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src);
             break;
 
          default:
@@ -250,7 +267,9 @@ void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe )
 
    if (llvmpipe->dirty & (LP_NEW_FS |
                           LP_NEW_BLEND |
-                          LP_NEW_DEPTH_STENCIL_ALPHA))
+                          LP_NEW_DEPTH_STENCIL_ALPHA |
+                          LP_NEW_SAMPLER |
+                          LP_NEW_TEXTURE))
       llvmpipe_update_fs( llvmpipe );
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 94170bd716..9faed5a0b1 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -85,6 +85,7 @@
 #include "lp_context.h"
 #include "lp_state.h"
 #include "lp_quad.h"
+#include "lp_tex_sample.h"
 
 
 static const unsigned char quad_offset_x[4] = {0, 1, 0, 1};
@@ -130,21 +131,20 @@ generate_pos0(LLVMBuilderRef builder,
  * Generate the depth test.
  */
 static void
-generate_depth(struct llvmpipe_context *lp,
-               LLVMBuilderRef builder,
-               const struct pipe_depth_state *state,
-               union lp_type src_type,
+generate_depth(LLVMBuilderRef builder,
+               const struct lp_fragment_shader_variant_key *key,
+               struct lp_type src_type,
                struct lp_build_mask_context *mask,
                LLVMValueRef src,
                LLVMValueRef dst_ptr)
 {
    const struct util_format_description *format_desc;
-   union lp_type dst_type;
+   struct lp_type dst_type;
 
-   if(!lp->framebuffer.zsbuf)
+   if(!key->depth.enabled)
       return;
 
-   format_desc = util_format_description(lp->framebuffer.zsbuf->format);
+   format_desc = util_format_description(key->zsbuf_format);
    assert(format_desc);
 
    /* Pick the depth type. */
@@ -164,7 +164,7 @@ generate_depth(struct llvmpipe_context *lp,
 #endif
 
    lp_build_depth_test(builder,
-                       state,
+                       &key->depth,
                        dst_type,
                        format_desc,
                        mask,
@@ -173,107 +173,6 @@ generate_depth(struct llvmpipe_context *lp,
 }
 
 
-struct build_fetch_texel_context
-{
-   LLVMValueRef context_ptr;
-
-   LLVMValueRef samplers_ptr;
-
-   /** Coords/texels store */
-   LLVMValueRef store_ptr;
-};
-
-
-void PIPE_CDECL
-lp_fetch_texel_soa( struct tgsi_sampler **samplers,
-                    uint32_t unit,
-                    float *store )
-{
-   struct tgsi_sampler *sampler = samplers[unit];
-
-#if 0
-   uint j;
-
-   debug_printf("%s sampler: %p (%p) store: %p\n",
-                __FUNCTION__,
-                sampler, *sampler,
-                store );
-
-   debug_printf("lodbias %f\n", store[12]);
-
-   for (j = 0; j < 4; j++)
-      debug_printf("sample %d texcoord %f %f\n",
-                   j,
-                   store[0+j],
-                   store[4+j]);
-#endif
-
-   {
-      float rgba[NUM_CHANNELS][QUAD_SIZE];
-      sampler->get_samples(sampler,
-                           &store[0],
-                           &store[4],
-                           &store[8],
-                           0.0f, /*store[12],  lodbias */
-                           rgba);
-      memcpy(store, rgba, sizeof rgba);
-   }
-
-#if 0
-   for (j = 0; j < 4; j++)
-      debug_printf("sample %d result %f %f %f %f\n",
-                   j,
-                   store[0+j],
-                   store[4+j],
-                   store[8+j],
-                   store[12+j]);
-#endif
-}
-
-
-static void
-emit_fetch_texel( LLVMBuilderRef builder,
-                  void *context,
-                  unsigned unit,
-                  unsigned num_coords,
-                  const LLVMValueRef *coords,
-                  LLVMValueRef lodbias,
-                  LLVMValueRef *texel)
-{
-   struct build_fetch_texel_context *bld = context;
-   LLVMTypeRef vec_type = LLVMTypeOf(coords[0]);
-   LLVMValueRef args[3];
-   unsigned i;
-
-   if(!bld->samplers_ptr)
-      bld->samplers_ptr = lp_jit_context_samplers(builder, bld->context_ptr);
-
-   if(!bld->store_ptr)
-      bld->store_ptr = LLVMBuildArrayAlloca(builder,
-                                            vec_type,
-                                            LLVMConstInt(LLVMInt32Type(), 4, 0),
-                                            "texel_store");
-
-   for (i = 0; i < num_coords; i++) {
-      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-      LLVMValueRef coord_ptr = LLVMBuildGEP(builder, bld->store_ptr, &index, 1, "");
-      LLVMBuildStore(builder, coords[i], coord_ptr);
-   }
-
-   args[0] = bld->samplers_ptr;
-   args[1] = LLVMConstInt(LLVMInt32Type(), unit, 0);
-   args[2] = bld->store_ptr;
-
-   lp_build_intrinsic(builder, "fetch_texel", LLVMVoidType(), args, 3);
-
-   for (i = 0; i < NUM_CHANNELS; ++i) {
-      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-      LLVMValueRef texel_ptr = LLVMBuildGEP(builder, bld->store_ptr, &index, 1, "");
-      texel[i] = LLVMBuildLoad(builder, texel_ptr, "");
-   }
-}
-
-
 /**
  * Generate the fragment shader, depth/stencil test, and alpha tests.
  */
@@ -282,11 +181,11 @@ generate_fs(struct llvmpipe_context *lp,
             struct lp_fragment_shader *shader,
             const struct lp_fragment_shader_variant_key *key,
             LLVMBuilderRef builder,
-            union lp_type type,
+            struct lp_type type,
             LLVMValueRef context_ptr,
             unsigned i,
             const struct lp_build_interp_soa_context *interp,
-            struct build_fetch_texel_context *sampler,
+            struct lp_build_sampler_soa *sampler,
             LLVMValueRef *pmask,
             LLVMValueRef *color,
             LLVMValueRef depth_ptr)
@@ -298,6 +197,7 @@ generate_fs(struct llvmpipe_context *lp,
    LLVMValueRef consts_ptr;
    LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS];
    LLVMValueRef z = interp->pos[2];
+   struct lp_build_flow_context *flow;
    struct lp_build_mask_context mask;
    boolean early_depth_test;
    unsigned attrib;
@@ -309,25 +209,35 @@ generate_fs(struct llvmpipe_context *lp,
 
    consts_ptr = lp_jit_context_constants(builder, context_ptr);
 
-   lp_build_mask_begin(&mask, builder, type, *pmask);
+   flow = lp_build_flow_create(builder);
+
+   memset(outputs, 0, sizeof outputs);
+
+   lp_build_flow_scope_begin(flow);
+
+   /* Declare the color and z variables */
+   for(chan = 0; chan < NUM_CHANNELS; ++chan) {
+      color[chan] = LLVMGetUndef(vec_type);
+      lp_build_flow_scope_declare(flow, &color[chan]);
+   }
+   lp_build_flow_scope_declare(flow, &z);
+
+   lp_build_mask_begin(&mask, flow, type, *pmask);
 
    early_depth_test =
-      lp->depth_stencil->depth.enabled &&
-      lp->framebuffer.zsbuf &&
-      !lp->depth_stencil->alpha.enabled &&
-      !lp->fs->info.uses_kill &&
-      !lp->fs->info.writes_z;
+      key->depth.enabled &&
+      !key->alpha.enabled &&
+      !shader->info.uses_kill &&
+      !shader->info.writes_z;
 
    if(early_depth_test)
-      generate_depth(lp, builder, &key->depth,
+      generate_depth(builder, key,
                      type, &mask,
                      z, depth_ptr);
 
-   memset(outputs, 0, sizeof outputs);
-
    lp_build_tgsi_soa(builder, tokens, type, &mask,
                      consts_ptr, interp->pos, interp->inputs,
-                     outputs, emit_fetch_texel, sampler);
+                     outputs, sampler);
 
    for (attrib = 0; attrib < shader->info.num_outputs; ++attrib) {
       for(chan = 0; chan < NUM_CHANNELS; ++chan) {
@@ -368,12 +278,16 @@ generate_fs(struct llvmpipe_context *lp,
    }
 
    if(!early_depth_test)
-      generate_depth(lp, builder, &key->depth,
+      generate_depth(builder, key,
                      type, &mask,
                      z, depth_ptr);
 
    lp_build_mask_end(&mask);
 
+   lp_build_flow_scope_end(flow);
+
+   lp_build_flow_destroy(flow);
+
    *pmask = mask.value;
 
 }
@@ -385,13 +299,15 @@ generate_fs(struct llvmpipe_context *lp,
 static void
 generate_blend(const struct pipe_blend_state *blend,
                LLVMBuilderRef builder,
-               union lp_type type,
+               struct lp_type type,
                LLVMValueRef context_ptr,
                LLVMValueRef mask,
                LLVMValueRef *src,
                LLVMValueRef dst_ptr)
 {
    struct lp_build_context bld;
+   struct lp_build_flow_context *flow;
+   struct lp_build_mask_context mask_ctx;
    LLVMTypeRef vec_type;
    LLVMTypeRef int_vec_type;
    LLVMValueRef const_ptr;
@@ -400,11 +316,14 @@ generate_blend(const struct pipe_blend_state *blend,
    LLVMValueRef res[4];
    unsigned chan;
 
+   lp_build_context_init(&bld, builder, type);
+
+   flow = lp_build_flow_create(builder);
+   lp_build_mask_begin(&mask_ctx, flow, type, mask);
+
    vec_type = lp_build_vec_type(type);
    int_vec_type = lp_build_int_vec_type(type);
 
-   lp_build_context_init(&bld, builder, type);
-
    const_ptr = lp_jit_context_blend_color(builder, context_ptr);
    const_ptr = LLVMBuildBitCast(builder, const_ptr,
                                 LLVMPointerType(vec_type, 0), "");
@@ -422,11 +341,16 @@ generate_blend(const struct pipe_blend_state *blend,
    lp_build_blend_soa(builder, blend, type, src, dst, con, res);
 
    for(chan = 0; chan < 4; ++chan) {
-      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), chan, 0);
-      lp_build_name(res[chan], "res.%c", "rgba"[chan]);
-      res[chan] = lp_build_select(&bld, mask, res[chan], dst[chan]);
-      LLVMBuildStore(builder, res[chan], LLVMBuildGEP(builder, dst_ptr, &index, 1, ""));
+      if(blend->colormask & (1 << chan)) {
+         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), chan, 0);
+         lp_build_name(res[chan], "res.%c", "rgba"[chan]);
+         res[chan] = lp_build_select(&bld, mask, res[chan], dst[chan]);
+         LLVMBuildStore(builder, res[chan], LLVMBuildGEP(builder, dst_ptr, &index, 1, ""));
+      }
    }
+
+   lp_build_mask_end(&mask_ctx);
+   lp_build_flow_destroy(flow);
 }
 
 
@@ -440,8 +364,8 @@ generate_fragment(struct llvmpipe_context *lp,
 {
    struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
    struct lp_fragment_shader_variant *variant;
-   union lp_type fs_type;
-   union lp_type blend_type;
+   struct lp_type fs_type;
+   struct lp_type blend_type;
    LLVMTypeRef fs_elem_type;
    LLVMTypeRef fs_vec_type;
    LLVMTypeRef fs_int_vec_type;
@@ -462,7 +386,7 @@ generate_fragment(struct llvmpipe_context *lp,
    LLVMBuilderRef builder;
    LLVMValueRef x0;
    LLVMValueRef y0;
-   struct build_fetch_texel_context sampler;
+   struct lp_build_sampler_soa *sampler;
    struct lp_build_interp_soa_context interp;
    LLVMValueRef fs_mask[LP_MAX_VECTOR_LENGTH];
    LLVMValueRef fs_out_color[NUM_CHANNELS][LP_MAX_VECTOR_LENGTH];
@@ -507,7 +431,7 @@ generate_fragment(struct llvmpipe_context *lp,
    /* TODO: actually pick these based on the fs and color buffer
     * characteristics. */
 
-   fs_type.value = 0;
+   memset(&fs_type, 0, sizeof fs_type);
    fs_type.floating = TRUE; /* floating point values */
    fs_type.sign = TRUE;     /* values are signed */
    fs_type.norm = FALSE;    /* values are not limited to [0,1] or [-1,1] */
@@ -515,7 +439,7 @@ generate_fragment(struct llvmpipe_context *lp,
    fs_type.length = 4;      /* 4 element per vector */
    num_fs = 4;
 
-   blend_type.value = 0;
+   memset(&blend_type, 0, sizeof blend_type);
    blend_type.floating = FALSE; /* values are integers */
    blend_type.sign = FALSE;     /* values are unsigned */
    blend_type.norm = TRUE;      /* values are in [0,1] or [-1,1] */
@@ -586,8 +510,13 @@ generate_fragment(struct llvmpipe_context *lp,
                             a0_ptr, dadx_ptr, dady_ptr,
                             x0, y0, 2, 0);
 
-   memset(&sampler, 0, sizeof sampler);
-   sampler.context_ptr = context_ptr;
+#if 0
+   /* C texture sampling */
+   sampler = lp_c_sampler_soa_create(context_ptr);
+#else
+   /* code generated texture sampling */
+   sampler = lp_llvm_sampler_soa_create(key->sampler, context_ptr);
+#endif
 
    for(i = 0; i < num_fs; ++i) {
       LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
@@ -606,7 +535,7 @@ generate_fragment(struct llvmpipe_context *lp,
                   context_ptr,
                   i,
                   &interp,
-                  &sampler,
+                  sampler,
                   &fs_mask[i],
                   out_color,
                   depth_ptr_i);
@@ -615,6 +544,8 @@ generate_fragment(struct llvmpipe_context *lp,
          fs_out_color[chan][i] = out_color[chan];
    }
 
+   sampler->destroy(sampler);
+
    /* 
     * Convert the fs's output color and mask to fit to the blending type. 
     */
@@ -765,18 +696,45 @@ llvmpipe_set_constant_buffer(struct pipe_context *pipe,
  */
 static void
 make_variant_key(struct llvmpipe_context *lp,
+                 struct lp_fragment_shader *shader,
                  struct lp_fragment_shader_variant_key *key)
 {
+   unsigned i;
+
    memset(key, 0, sizeof *key);
 
-   memcpy(&key->depth, &lp->depth_stencil->depth, sizeof key->depth);
+   if(lp->framebuffer.zsbuf &&
+      lp->depth_stencil->depth.enabled) {
+      key->zsbuf_format = lp->framebuffer.zsbuf->format;
+      memcpy(&key->depth, &lp->depth_stencil->depth, sizeof key->depth);
+   }
 
    key->alpha.enabled = lp->depth_stencil->alpha.enabled;
    if(key->alpha.enabled)
       key->alpha.func = lp->depth_stencil->alpha.func;
    /* alpha.ref_value is passed in jit_context */
 
-   memcpy(&key->blend, lp->blend, sizeof key->blend);
+   if(lp->framebuffer.cbufs[0]) {
+      const struct util_format_description *format_desc;
+      unsigned chan;
+
+      memcpy(&key->blend, lp->blend, sizeof key->blend);
+
+      format_desc = util_format_description(lp->framebuffer.cbufs[0]->format);
+      assert(format_desc->layout == UTIL_FORMAT_COLORSPACE_RGB ||
+             format_desc->layout == UTIL_FORMAT_COLORSPACE_SRGB);
+
+      /* mask out color channels not present in the color buffer */
+      for(chan = 0; chan < 4; ++chan) {
+         enum util_format_swizzle swizzle = format_desc->swizzle[chan];
+         if(swizzle > 4)
+            key->blend.colormask &= ~(1 << chan);
+      }
+   }
+
+   for(i = 0; i < PIPE_MAX_SAMPLERS; ++i)
+      if(shader->info.file_mask[TGSI_FILE_SAMPLER] & (1 << i))
+         lp_sampler_static_state(&key->sampler[i], lp->texture[i], lp->sampler[i]);
 }
 
 
@@ -787,7 +745,7 @@ llvmpipe_update_fs(struct llvmpipe_context *lp)
    struct lp_fragment_shader_variant_key key;
    struct lp_fragment_shader_variant *variant;
 
-   make_variant_key(lp, &key);
+   make_variant_key(lp, shader, &key);
 
    variant = shader->variants;
    while(variant) {
diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
index 4fef541b1e..c69d90c723 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
@@ -98,6 +98,16 @@ llvmpipe_set_sampler_textures(struct pipe_context *pipe,
 
       pipe_texture_reference(&llvmpipe->texture[i], tex);
       lp_tex_tile_cache_set_texture(llvmpipe->tex_cache[i], tex);
+
+      if(tex) {
+         struct llvmpipe_texture *lp_tex = llvmpipe_texture(tex);
+         struct lp_jit_texture *jit_tex = &llvmpipe->jit_context.textures[i];
+         jit_tex->width = tex->width[0];
+         jit_tex->height = tex->height[0];
+         jit_tex->stride = lp_tex->stride[0];
+         if(!lp_tex->dt)
+            jit_tex->data = lp_tex->data;
+      }
    }
 
    llvmpipe->num_textures = num;
diff --git a/src/gallium/drivers/llvmpipe/lp_state_surface.c b/src/gallium/drivers/llvmpipe/lp_state_surface.c
index 177a26b7b1..2c29144c03 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_surface.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_surface.c
@@ -56,7 +56,7 @@ llvmpipe_set_framebuffer_state(struct pipe_context *pipe,
          lp_flush_tile_cache(lp->cbuf_cache[i]);
 
          /* assign new */
-         lp->framebuffer.cbufs[i] = fb->cbufs[i];
+         pipe_surface_reference(&lp->framebuffer.cbufs[i], fb->cbufs[i]);
 
          /* update cache */
          lp_tile_cache_set_surface(lp->cbuf_cache[i], fb->cbufs[i]);
@@ -81,7 +81,7 @@ llvmpipe_set_framebuffer_state(struct pipe_context *pipe,
       }
 
       /* assign new */
-      lp->framebuffer.zsbuf = fb->zsbuf;
+      pipe_surface_reference(&lp->framebuffer.zsbuf, fb->zsbuf);
 
       /* Tell draw module how deep the Z/depth buffer is */
       if (lp->framebuffer.zsbuf) {
diff --git a/src/gallium/drivers/llvmpipe/lp_test.h b/src/gallium/drivers/llvmpipe/lp_test.h
index 69aaae26e0..a88e110c66 100644
--- a/src/gallium/drivers/llvmpipe/lp_test.h
+++ b/src/gallium/drivers/llvmpipe/lp_test.h
@@ -86,43 +86,43 @@ random_float(void);
 
 
 void
-dump_type(FILE *fp, union lp_type type);
+dump_type(FILE *fp, struct lp_type type);
 
 
 double
-read_elem(union lp_type type, const void *src, unsigned index);
+read_elem(struct lp_type type, const void *src, unsigned index);
 
 
 void
-write_elem(union lp_type type, void *dst, unsigned index, double src);
+write_elem(struct lp_type type, void *dst, unsigned index, double src);
 
 
 void
-random_elem(union lp_type type, void *dst, unsigned index);
+random_elem(struct lp_type type, void *dst, unsigned index);
 
 
 void
-read_vec(union lp_type type, const void *src, double *dst);
+read_vec(struct lp_type type, const void *src, double *dst);
 
 
 void
-write_vec(union lp_type type, void *dst, const double *src);
+write_vec(struct lp_type type, void *dst, const double *src);
 
 
 void
-random_vec(union lp_type type, void *dst);
+random_vec(struct lp_type type, void *dst);
 
 
 boolean
-compare_vec_with_eps(union lp_type type, const void *res, const void *ref, double eps);
+compare_vec_with_eps(struct lp_type type, const void *res, const void *ref, double eps);
 
 
 boolean
-compare_vec(union lp_type type, const void *res, const void *ref);
+compare_vec(struct lp_type type, const void *res, const void *ref);
 
 
 void
-dump_vec(FILE *fp, union lp_type type, const void *src);
+dump_vec(FILE *fp, struct lp_type type, const void *src);
 
 
 #endif /* !LP_TEST_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_test_blend.c b/src/gallium/drivers/llvmpipe/lp_test_blend.c
index 8dfad468e3..94b661dcba 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_blend.c
@@ -80,7 +80,7 @@ static void
 write_tsv_row(FILE *fp,
               const struct pipe_blend_state *blend,
               enum vector_mode mode,
-              union lp_type type,
+              struct lp_type type,
               double cycles,
               boolean success)
 {
@@ -125,7 +125,7 @@ static void
 dump_blend_type(FILE *fp,
                 const struct pipe_blend_state *blend,
                 enum vector_mode mode,
-                union lp_type type)
+                struct lp_type type)
 {
    fprintf(fp, "%s", mode ? "soa" : "aos");
 
@@ -153,7 +153,7 @@ static LLVMValueRef
 add_blend_test(LLVMModuleRef module,
                const struct pipe_blend_state *blend,
                enum vector_mode mode,
-               union lp_type type)
+               struct lp_type type)
 {
    LLVMTypeRef ret_type;
    LLVMTypeRef vec_type;
@@ -467,7 +467,7 @@ test_one(unsigned verbose,
          FILE *fp,
          const struct pipe_blend_state *blend,
          enum vector_mode mode,
-         union lp_type type)
+         struct lp_type type)
 {
    LLVMModuleRef module = NULL;
    LLVMValueRef func = NULL;
@@ -765,10 +765,10 @@ blend_funcs[] = {
 };
 
 
-const union lp_type blend_types[] = {
+const struct lp_type blend_types[] = {
    /* float, fixed,  sign,  norm, width, len */
-   {{  TRUE, FALSE, FALSE,  TRUE,    32,   4 }}, /* f32 x 4 */
-   {{ FALSE, FALSE, FALSE,  TRUE,     8,  16 }}, /* u8n x 16 */
+   {   TRUE, FALSE, FALSE,  TRUE,    32,   4 }, /* f32 x 4 */
+   {  FALSE, FALSE, FALSE,  TRUE,     8,  16 }, /* u8n x 16 */
 };
 
 
@@ -788,7 +788,7 @@ test_all(unsigned verbose, FILE *fp)
    const unsigned *alpha_dst_factor;
    struct pipe_blend_state blend;
    enum vector_mode mode;
-   const union lp_type *type;
+   const struct lp_type *type;
    bool success = TRUE;
 
    for(rgb_func = blend_funcs; rgb_func < &blend_funcs[num_funcs]; ++rgb_func) {
@@ -841,27 +841,27 @@ test_some(unsigned verbose, FILE *fp, unsigned long n)
    const unsigned *alpha_dst_factor;
    struct pipe_blend_state blend;
    enum vector_mode mode;
-   const union lp_type *type;
+   const struct lp_type *type;
    unsigned long i;
    bool success = TRUE;
 
    for(i = 0; i < n; ++i) {
-      rgb_func = &blend_funcs[random() % num_funcs];
-      alpha_func = &blend_funcs[random() % num_funcs];
-      rgb_src_factor = &blend_factors[random() % num_factors];
-      alpha_src_factor = &blend_factors[random() % num_factors];
+      rgb_func = &blend_funcs[rand() % num_funcs];
+      alpha_func = &blend_funcs[rand() % num_funcs];
+      rgb_src_factor = &blend_factors[rand() % num_factors];
+      alpha_src_factor = &blend_factors[rand() % num_factors];
       
       do {
-         rgb_dst_factor = &blend_factors[random() % num_factors];
+         rgb_dst_factor = &blend_factors[rand() % num_factors];
       } while(*rgb_dst_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE);
 
       do {
-         alpha_dst_factor = &blend_factors[random() % num_factors];
+         alpha_dst_factor = &blend_factors[rand() % num_factors];
       } while(*alpha_dst_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE);
 
-      mode = random() & 1;
+      mode = rand() & 1;
 
-      type = &blend_types[random() % num_types];
+      type = &blend_types[rand() % num_types];
 
       memset(&blend, 0, sizeof blend);
       blend.blend_enable      = 1;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_conv.c b/src/gallium/drivers/llvmpipe/lp_test_conv.c
index e6489834af..9dcf58e5dc 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_conv.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_conv.c
@@ -59,8 +59,8 @@ write_tsv_header(FILE *fp)
 
 static void
 write_tsv_row(FILE *fp,
-              union lp_type src_type,
-              union lp_type dst_type,
+              struct lp_type src_type,
+              struct lp_type dst_type,
               double cycles,
               boolean success)
 {
@@ -80,8 +80,8 @@ write_tsv_row(FILE *fp,
 
 static void
 dump_conv_types(FILE *fp,
-               union lp_type src_type,
-               union lp_type dst_type)
+               struct lp_type src_type,
+               struct lp_type dst_type)
 {
    fprintf(fp, "src_type=");
    dump_type(fp, src_type);
@@ -96,8 +96,8 @@ dump_conv_types(FILE *fp,
 
 static LLVMValueRef
 add_conv_test(LLVMModuleRef module,
-              union lp_type src_type, unsigned num_srcs,
-              union lp_type dst_type, unsigned num_dsts)
+              struct lp_type src_type, unsigned num_srcs,
+              struct lp_type dst_type, unsigned num_dsts)
 {
    LLVMTypeRef args[2];
    LLVMValueRef func;
@@ -145,8 +145,8 @@ add_conv_test(LLVMModuleRef module,
 static boolean
 test_one(unsigned verbose,
          FILE *fp,
-         union lp_type src_type,
-         union lp_type dst_type)
+         struct lp_type src_type,
+         struct lp_type dst_type)
 {
    LLVMModuleRef module = NULL;
    LLVMValueRef func = NULL;
@@ -343,35 +343,35 @@ test_one(unsigned verbose,
 }
 
 
-const union lp_type conv_types[] = {
+const struct lp_type conv_types[] = {
    /* float, fixed,  sign,  norm, width, len */
 
-   {{  TRUE, FALSE,  TRUE,  TRUE,    32,   4 }},
-   {{  TRUE, FALSE,  TRUE, FALSE,    32,   4 }},
-   {{  TRUE, FALSE, FALSE,  TRUE,    32,   4 }},
-   {{  TRUE, FALSE, FALSE, FALSE,    32,   4 }},
+   {   TRUE, FALSE,  TRUE,  TRUE,    32,   4 },
+   {   TRUE, FALSE,  TRUE, FALSE,    32,   4 },
+   {   TRUE, FALSE, FALSE,  TRUE,    32,   4 },
+   {   TRUE, FALSE, FALSE, FALSE,    32,   4 },
 
    /* TODO: test fixed formats too */
 
-   {{ FALSE, FALSE,  TRUE,  TRUE,    16,   8 }},
-   {{ FALSE, FALSE,  TRUE, FALSE,    16,   8 }},
-   {{ FALSE, FALSE, FALSE,  TRUE,    16,   8 }},
-   {{ FALSE, FALSE, FALSE, FALSE,    16,   8 }},
-
-   {{ FALSE, FALSE,  TRUE,  TRUE,    32,   4 }},
-   {{ FALSE, FALSE,  TRUE, FALSE,    32,   4 }},
-   {{ FALSE, FALSE, FALSE,  TRUE,    32,   4 }},
-   {{ FALSE, FALSE, FALSE, FALSE,    32,   4 }},
-
-   {{ FALSE, FALSE,  TRUE,  TRUE,    16,   8 }},
-   {{ FALSE, FALSE,  TRUE, FALSE,    16,   8 }},
-   {{ FALSE, FALSE, FALSE,  TRUE,    16,   8 }},
-   {{ FALSE, FALSE, FALSE, FALSE,    16,   8 }},
-
-   {{ FALSE, FALSE,  TRUE,  TRUE,     8,  16 }},
-   {{ FALSE, FALSE,  TRUE, FALSE,     8,  16 }},
-   {{ FALSE, FALSE, FALSE,  TRUE,     8,  16 }},
-   {{ FALSE, FALSE, FALSE, FALSE,     8,  16 }},
+   {  FALSE, FALSE,  TRUE,  TRUE,    16,   8 },
+   {  FALSE, FALSE,  TRUE, FALSE,    16,   8 },
+   {  FALSE, FALSE, FALSE,  TRUE,    16,   8 },
+   {  FALSE, FALSE, FALSE, FALSE,    16,   8 },
+
+   {  FALSE, FALSE,  TRUE,  TRUE,    32,   4 },
+   {  FALSE, FALSE,  TRUE, FALSE,    32,   4 },
+   {  FALSE, FALSE, FALSE,  TRUE,    32,   4 },
+   {  FALSE, FALSE, FALSE, FALSE,    32,   4 },
+
+   {  FALSE, FALSE,  TRUE,  TRUE,    16,   8 },
+   {  FALSE, FALSE,  TRUE, FALSE,    16,   8 },
+   {  FALSE, FALSE, FALSE,  TRUE,    16,   8 },
+   {  FALSE, FALSE, FALSE, FALSE,    16,   8 },
+
+   {  FALSE, FALSE,  TRUE,  TRUE,     8,  16 },
+   {  FALSE, FALSE,  TRUE, FALSE,     8,  16 },
+   {  FALSE, FALSE, FALSE,  TRUE,     8,  16 },
+   {  FALSE, FALSE, FALSE, FALSE,     8,  16 },
 };
 
 
@@ -381,8 +381,8 @@ const unsigned num_types = sizeof(conv_types)/sizeof(conv_types[0]);
 boolean
 test_all(unsigned verbose, FILE *fp)
 {
-   const union lp_type *src_type;
-   const union lp_type *dst_type;
+   const struct lp_type *src_type;
+   const struct lp_type *dst_type;
    bool success = TRUE;
 
    for(src_type = conv_types; src_type < &conv_types[num_types]; ++src_type) {
@@ -407,16 +407,16 @@ test_all(unsigned verbose, FILE *fp)
 boolean
 test_some(unsigned verbose, FILE *fp, unsigned long n)
 {
-   const union lp_type *src_type;
-   const union lp_type *dst_type;
+   const struct lp_type *src_type;
+   const struct lp_type *dst_type;
    unsigned long i;
    bool success = TRUE;
 
    for(i = 0; i < n; ++i) {
-      src_type = &conv_types[random() % num_types];
+      src_type = &conv_types[rand() % num_types];
       
       do {
-         dst_type = &conv_types[random() % num_types];
+         dst_type = &conv_types[rand() % num_types];
       } while (src_type == dst_type || src_type->norm != dst_type->norm);
 
       if(!test_one(verbose, fp, *src_type, *dst_type))
diff --git a/src/gallium/drivers/llvmpipe/lp_test_format.c b/src/gallium/drivers/llvmpipe/lp_test_format.c
index 1d192355ee..7d83f899e6 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_format.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_format.c
@@ -119,7 +119,7 @@ add_load_rgba_test(LLVMModuleRef module,
 
    lp_build_loop_begin(builder, LLVMConstInt(LLVMInt32Type(), 1, 0), &loop);
 
-   rgba = lp_build_load_rgba(builder, format, ptr);
+   rgba = lp_build_load_rgba_aos(builder, format, ptr);
    LLVMBuildStore(builder, rgba, rgba_ptr);
 
    lp_build_loop_end(builder, LLVMConstInt(LLVMInt32Type(), 4, 0), NULL, &loop);
@@ -160,7 +160,7 @@ add_store_rgba_test(LLVMModuleRef module,
 
    rgba = LLVMBuildLoad(builder, rgba_ptr, "");
 
-   lp_build_store_rgba(builder, format, ptr, rgba);
+   lp_build_store_rgba_aos(builder, format, ptr, rgba);
 
    LLVMBuildRetVoid(builder);
 
@@ -264,6 +264,11 @@ int main(int argc, char **argv)
    unsigned i;
    int ret;
 
+#ifdef LLVM_NATIVE_ARCH
+   LLVMLinkInJIT();
+   LLVMInitializeNativeTarget();
+#endif
+
    for (i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); ++i)
       if(!test_format(&test_cases[i]))
         ret = 1;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_main.c b/src/gallium/drivers/llvmpipe/lp_test_main.c
index 49213fb4f0..f07fa256f1 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_main.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_main.c
@@ -40,7 +40,7 @@
 
 void
 dump_type(FILE *fp,
-          union lp_type type)
+          struct lp_type type)
 {
    fprintf(fp, "%s%s%u%sx%u",
            type.sign ? (type.floating || type.fixed ? "" : "s") : "u",
@@ -52,7 +52,7 @@ dump_type(FILE *fp,
 
 
 double
-read_elem(union lp_type type, const void *src, unsigned index)
+read_elem(struct lp_type type, const void *src, unsigned index)
 {
    double scale = lp_const_scale(type);
    double value;
@@ -115,7 +115,7 @@ read_elem(union lp_type type, const void *src, unsigned index)
 
 
 void
-write_elem(union lp_type type, void *dst, unsigned index, double value)
+write_elem(struct lp_type type, void *dst, unsigned index, double value)
 {
    assert(index < type.length);
    if(!type.sign && value < 0.0)
@@ -184,11 +184,11 @@ write_elem(union lp_type type, void *dst, unsigned index, double value)
 
 
 void
-random_elem(union lp_type type, void *dst, unsigned index)
+random_elem(struct lp_type type, void *dst, unsigned index)
 {
    double value;
    assert(index < type.length);
-   value = (double)random()/(double)RAND_MAX;
+   value = (double)rand()/(double)RAND_MAX;
    if(!type.norm) {
       unsigned long long mask;
       if (type.floating)
@@ -199,17 +199,17 @@ random_elem(union lp_type type, void *dst, unsigned index)
          mask = ((unsigned long long)1 << (type.width - 1)) - 1;
       else
          mask = ((unsigned long long)1 << type.width) - 1;
-      value += (double)(mask & random());
+      value += (double)(mask & rand());
    }
    if(!type.sign)
-      if(random() & 1)
+      if(rand() & 1)
          value = -value;
    write_elem(type, dst, index, value);
 }
 
 
 void
-read_vec(union lp_type type, const void *src, double *dst)
+read_vec(struct lp_type type, const void *src, double *dst)
 {
    unsigned i;
    for (i = 0; i < type.length; ++i)
@@ -218,7 +218,7 @@ read_vec(union lp_type type, const void *src, double *dst)
 
 
 void
-write_vec(union lp_type type, void *dst, const double *src)
+write_vec(struct lp_type type, void *dst, const double *src)
 {
    unsigned i;
    for (i = 0; i < type.length; ++i)
@@ -229,12 +229,12 @@ write_vec(union lp_type type, void *dst, const double *src)
 float
 random_float(void)
 {
-    return (float)((double)random()/(double)RAND_MAX);
+    return (float)((double)rand()/(double)RAND_MAX);
 }
 
 
 void
-random_vec(union lp_type type, void *dst)
+random_vec(struct lp_type type, void *dst)
 {
    unsigned i;
    for (i = 0; i < type.length; ++i)
@@ -243,7 +243,7 @@ random_vec(union lp_type type, void *dst)
 
 
 boolean
-compare_vec_with_eps(union lp_type type, const void *res, const void *ref, double eps)
+compare_vec_with_eps(struct lp_type type, const void *res, const void *ref, double eps)
 {
    unsigned i;
    for (i = 0; i < type.length; ++i) {
@@ -259,7 +259,7 @@ compare_vec_with_eps(union lp_type type, const void *res, const void *ref, doubl
 
 
 boolean
-compare_vec(union lp_type type, const void *res, const void *ref)
+compare_vec(struct lp_type type, const void *res, const void *ref)
 {
    double eps = lp_const_eps(type);
    return compare_vec_with_eps(type, res, ref, eps);
@@ -267,7 +267,7 @@ compare_vec(union lp_type type, const void *res, const void *ref)
 
 
 void
-dump_vec(FILE *fp, union lp_type type, const void *src)
+dump_vec(FILE *fp, struct lp_type type, const void *src)
 {
    unsigned i;
    for (i = 0; i < type.length; ++i) {
@@ -365,6 +365,11 @@ int main(int argc, char **argv)
          n = atoi(argv[i]);
    }
 
+#ifdef LLVM_NATIVE_ARCH
+   LLVMLinkInJIT();
+   LLVMInitializeNativeTarget();
+#endif
+
    if(fp) {
       /* Warm up the caches */
       test_some(0, NULL, 100);
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_cache.c b/src/gallium/drivers/llvmpipe/lp_tex_cache.c
index 23a94b5b0d..773e848242 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_cache.c
+++ b/src/gallium/drivers/llvmpipe/lp_tex_cache.c
@@ -154,7 +154,7 @@ lp_tex_tile_cache_validate_texture(struct llvmpipe_tex_tile_cache *tc)
       if (lpt->timestamp != tc->timestamp) {
          /* texture was modified, invalidate all cached tiles */
          uint i;
-         _debug_printf("INV %d %d\n", tc->timestamp, lpt->timestamp);
+         debug_printf("INV %d %d\n", tc->timestamp, lpt->timestamp);
          for (i = 0; i < NUM_ENTRIES; i++) {
             tc->entries[i].addr.bits.invalid = 1;
          }
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.h b/src/gallium/drivers/llvmpipe/lp_tex_sample.h
index 628ec3f1ef..9ad1bde956 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample.h
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.h
@@ -29,10 +29,13 @@
 #define LP_TEX_SAMPLE_H
 
 
+#include <llvm-c/Core.h>
+
 #include "tgsi/tgsi_exec.h"
 
 
 struct llvmpipe_tex_tile_cache;
+struct lp_sampler_static_state;
 
 
 /**
@@ -75,4 +78,24 @@ lp_get_samples(struct tgsi_sampler *tgsi_sampler,
                float rgba[NUM_CHANNELS][QUAD_SIZE]);
 
 
+/**
+ * Texture sampling code generator that just calls lp_get_samples C function
+ * for the actual sampling computation.
+ *
+ * @param context_ptr LLVM value with the pointer to the struct lp_jit_context.
+ */
+struct lp_build_sampler_soa *
+lp_c_sampler_soa_create(LLVMValueRef context_ptr);
+
+
+/**
+ * Pure-LLVM texture sampling code generator.
+ *
+ * @param context_ptr LLVM value with the pointer to the struct lp_jit_context.
+ */
+struct lp_build_sampler_soa *
+lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *key,
+                           LLVMValueRef context_ptr);
+
+
 #endif /* LP_TEX_SAMPLE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.c b/src/gallium/drivers/llvmpipe/lp_tex_sample_c.c
index 94eb6dad5a..a1365a045f 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample.c
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample_c.c
@@ -1578,3 +1578,136 @@ out:
    tgsi_sampler->get_samples( tgsi_sampler, s, t, p, lodbias, rgba );
 }
 
+
+void PIPE_CDECL
+lp_fetch_texel_soa( struct tgsi_sampler **samplers,
+                    uint32_t unit,
+                    float *store )
+{
+   struct tgsi_sampler *sampler = samplers[unit];
+
+#if 0
+   uint j;
+
+   debug_printf("%s sampler: %p (%p) store: %p\n",
+                __FUNCTION__,
+                sampler, *sampler,
+                store );
+
+   debug_printf("lodbias %f\n", store[12]);
+
+   for (j = 0; j < 4; j++)
+      debug_printf("sample %d texcoord %f %f\n",
+                   j,
+                   store[0+j],
+                   store[4+j]);
+#endif
+
+   {
+      float rgba[NUM_CHANNELS][QUAD_SIZE];
+      sampler->get_samples(sampler,
+                           &store[0],
+                           &store[4],
+                           &store[8],
+                           0.0f, /*store[12],  lodbias */
+                           rgba);
+      memcpy(store, rgba, sizeof rgba);
+   }
+
+#if 0
+   for (j = 0; j < 4; j++)
+      debug_printf("sample %d result %f %f %f %f\n",
+                   j,
+                   store[0+j],
+                   store[4+j],
+                   store[8+j],
+                   store[12+j]);
+#endif
+}
+
+
+#include "lp_bld_type.h"
+#include "lp_bld_intr.h"
+#include "lp_bld_tgsi.h"
+
+
+struct lp_c_sampler_soa
+{
+   struct lp_build_sampler_soa base;
+
+   LLVMValueRef context_ptr;
+
+   LLVMValueRef samplers_ptr;
+
+   /** Coords/texels store */
+   LLVMValueRef store_ptr;
+};
+
+
+static void
+lp_c_sampler_soa_destroy(struct lp_build_sampler_soa *sampler)
+{
+   FREE(sampler);
+}
+
+
+static void
+lp_c_sampler_soa_emit_fetch_texel(struct lp_build_sampler_soa *_sampler,
+                                  LLVMBuilderRef builder,
+                                  struct lp_type type,
+                                  unsigned unit,
+                                  unsigned num_coords,
+                                  const LLVMValueRef *coords,
+                                  LLVMValueRef lodbias,
+                                  LLVMValueRef *texel)
+{
+   struct lp_c_sampler_soa *sampler = (struct lp_c_sampler_soa *)_sampler;
+   LLVMTypeRef vec_type = LLVMTypeOf(coords[0]);
+   LLVMValueRef args[3];
+   unsigned i;
+
+   if(!sampler->samplers_ptr)
+      sampler->samplers_ptr = lp_jit_context_samplers(builder, sampler->context_ptr);
+
+   if(!sampler->store_ptr)
+      sampler->store_ptr = LLVMBuildArrayAlloca(builder,
+                                            vec_type,
+                                            LLVMConstInt(LLVMInt32Type(), 4, 0),
+                                            "texel_store");
+
+   for (i = 0; i < num_coords; i++) {
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      LLVMValueRef coord_ptr = LLVMBuildGEP(builder, sampler->store_ptr, &index, 1, "");
+      LLVMBuildStore(builder, coords[i], coord_ptr);
+   }
+
+   args[0] = sampler->samplers_ptr;
+   args[1] = LLVMConstInt(LLVMInt32Type(), unit, 0);
+   args[2] = sampler->store_ptr;
+
+   lp_build_intrinsic(builder, "fetch_texel", LLVMVoidType(), args, 3);
+
+   for (i = 0; i < NUM_CHANNELS; ++i) {
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      LLVMValueRef texel_ptr = LLVMBuildGEP(builder, sampler->store_ptr, &index, 1, "");
+      texel[i] = LLVMBuildLoad(builder, texel_ptr, "");
+   }
+}
+
+
+struct lp_build_sampler_soa *
+lp_c_sampler_soa_create(LLVMValueRef context_ptr)
+{
+   struct lp_c_sampler_soa *sampler;
+
+   sampler = CALLOC_STRUCT(lp_c_sampler_soa);
+   if(!sampler)
+      return NULL;
+
+   sampler->base.destroy = lp_c_sampler_soa_destroy;
+   sampler->base.emit_fetch_texel = lp_c_sampler_soa_emit_fetch_texel;
+   sampler->context_ptr = context_ptr;
+
+   return &sampler->base;
+}
+
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample_llvm.c b/src/gallium/drivers/llvmpipe/lp_tex_sample_llvm.c
new file mode 100644
index 0000000000..d2a6ae21f5
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample_llvm.c
@@ -0,0 +1,196 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 VMware, Inc.
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Texture sampling code generation
+ *
+ * This file is nothing more than ugly glue between three largely independent
+ * entities:
+ * - TGSI -> LLVM translation (i.e., lp_build_tgsi_soa)
+ * - texture sampling code generation (i.e., lp_build_sample_soa)
+ * - LLVM pipe driver
+ *
+ * All interesting code is in the functions mentioned above. There is really
+ * nothing to see here.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_shader_tokens.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_type.h"
+#include "lp_bld_intr.h"
+#include "lp_bld_sample.h"
+#include "lp_bld_tgsi.h"
+#include "lp_state.h"
+#include "lp_tex_sample.h"
+
+
+/**
+ * This provides the bridge between the sampler state store in lp_jit_context
+ * and lp_jit_texture and the sampler code generator. It provides the
+ * texture layout information required by the texture sampler code generator
+ * in terms of the state stored in lp_jit_context and lp_jit_texture in runtime.
+ */
+struct llvmpipe_sampler_dynamic_state
+{
+   struct lp_sampler_dynamic_state base;
+
+   const struct lp_sampler_static_state *static_state;
+
+   LLVMValueRef context_ptr;
+};
+
+
+/**
+ * This is the bridge between our sampler and the TGSI translator.
+ */
+struct lp_llvm_sampler_soa
+{
+   struct lp_build_sampler_soa base;
+
+   struct llvmpipe_sampler_dynamic_state dynamic_state;
+};
+
+
+/**
+ * Fetch the specified member of the lp_jit_texture structure.
+ *
+ * @sa http://llvm.org/docs/GetElementPtr.html
+ */
+static LLVMValueRef
+lp_llvm_texture_member(struct lp_sampler_dynamic_state *base,
+                       LLVMBuilderRef builder,
+                       unsigned unit,
+                       unsigned member_index,
+                       const char *member_name)
+{
+   struct llvmpipe_sampler_dynamic_state *state = (struct llvmpipe_sampler_dynamic_state *)base;
+   LLVMValueRef indices[4];
+   LLVMValueRef ptr;
+   LLVMValueRef res;
+
+   assert(unit < PIPE_MAX_SAMPLERS);
+
+   /* context[0] */
+   indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   /* context[0].textures */
+   indices[1] = LLVMConstInt(LLVMInt32Type(), LP_JIT_CONTEXT_TEXTURES_INDEX, 0);
+   /* context[0].textures[unit] */
+   indices[2] = LLVMConstInt(LLVMInt32Type(), unit, 0);
+   /* context[0].textures[unit].member */
+   indices[3] = LLVMConstInt(LLVMInt32Type(), member_index, 0);
+
+   ptr = LLVMBuildGEP(builder, state->context_ptr, indices, Elements(indices), "");
+
+   res = LLVMBuildLoad(builder, ptr, "");
+
+   lp_build_name(res, "context.texture%u.%s", unit, member_name);
+
+   return res;
+}
+
+
+/**
+ * Helper macro to instantiate the functions that generate the code to fetch
+ * the members of lp_jit_texture to fulfill the sampler code generator requests.
+ *
+ * This complexity is the price we have to pay to keep the texture sampler code
+ * generator a reusable module without dependencies to llvmpipe internals.
+ */
+#define LP_LLVM_TEXTURE_MEMBER(_name, _index) \
+   static LLVMValueRef \
+   lp_llvm_texture_##_name( struct lp_sampler_dynamic_state *base, \
+                            LLVMBuilderRef builder, \
+                            unsigned unit) \
+   { \
+      return lp_llvm_texture_member(base, builder, unit, _index, #_name ); \
+   }
+
+
+LP_LLVM_TEXTURE_MEMBER(width,    LP_JIT_TEXTURE_WIDTH)
+LP_LLVM_TEXTURE_MEMBER(height,   LP_JIT_TEXTURE_HEIGHT)
+LP_LLVM_TEXTURE_MEMBER(stride,   LP_JIT_TEXTURE_STRIDE)
+LP_LLVM_TEXTURE_MEMBER(data_ptr, LP_JIT_TEXTURE_DATA)
+
+
+static void
+lp_llvm_sampler_soa_destroy(struct lp_build_sampler_soa *sampler)
+{
+   FREE(sampler);
+}
+
+
+static void
+lp_llvm_sampler_soa_emit_fetch_texel(struct lp_build_sampler_soa *base,
+                                     LLVMBuilderRef builder,
+                                     struct lp_type type,
+                                     unsigned unit,
+                                     unsigned num_coords,
+                                     const LLVMValueRef *coords,
+                                     LLVMValueRef lodbias,
+                                     LLVMValueRef *texel)
+{
+   struct lp_llvm_sampler_soa *sampler = (struct lp_llvm_sampler_soa *)base;
+
+   assert(unit < PIPE_MAX_SAMPLERS);
+
+   lp_build_sample_soa(builder,
+                       &sampler->dynamic_state.static_state[unit],
+                       &sampler->dynamic_state.base,
+                       type,
+                       unit,
+                       num_coords,
+                       coords,
+                       lodbias,
+                       texel);
+}
+
+
+struct lp_build_sampler_soa *
+lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state,
+                           LLVMValueRef context_ptr)
+{
+   struct lp_llvm_sampler_soa *sampler;
+
+   sampler = CALLOC_STRUCT(lp_llvm_sampler_soa);
+   if(!sampler)
+      return NULL;
+
+   sampler->base.destroy = lp_llvm_sampler_soa_destroy;
+   sampler->base.emit_fetch_texel = lp_llvm_sampler_soa_emit_fetch_texel;
+   sampler->dynamic_state.base.width = lp_llvm_texture_width;
+   sampler->dynamic_state.base.height = lp_llvm_texture_height;
+   sampler->dynamic_state.base.stride = lp_llvm_texture_stride;
+   sampler->dynamic_state.base.data_ptr = lp_llvm_texture_data_ptr;
+   sampler->dynamic_state.static_state = static_state;
+   sampler->dynamic_state.context_ptr = context_ptr;
+
+   return &sampler->base;
+}
+
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_cache.c b/src/gallium/drivers/llvmpipe/lp_tile_cache.c
index 143afec3d3..0c06b659a1 100644
--- a/src/gallium/drivers/llvmpipe/lp_tile_cache.c
+++ b/src/gallium/drivers/llvmpipe/lp_tile_cache.c
@@ -44,10 +44,53 @@
 #include "lp_tile_cache.h"
 
 
+#define MAX_WIDTH 4096
+#define MAX_HEIGHT 4096
+
+
+enum llvmpipe_tile_status
+{
+   LP_TILE_STATUS_UNDEFINED = 0,
+   LP_TILE_STATUS_CLEAR = 1,
+   LP_TILE_STATUS_DEFINED = 2
+};
+
+
+struct llvmpipe_cached_tile
+{
+   enum llvmpipe_tile_status status;
+
+   /** color in SOA format */
+   uint8_t *color;
+};
+
+
+struct llvmpipe_tile_cache
+{
+   struct pipe_screen *screen;
+   struct pipe_surface *surface;  /**< the surface we're caching */
+   struct pipe_transfer *transfer;
+   void *transfer_map;
+
+   struct llvmpipe_cached_tile entries[MAX_WIDTH/TILE_SIZE][MAX_HEIGHT/TILE_SIZE];
+
+   uint8_t clear_color[4];  /**< for color bufs */
+   uint clear_val;        /**< for z+stencil, or packed color clear value */
+
+   struct llvmpipe_cached_tile *last_tile;  /**< most recently retrieved tile */
+};
+
+
 struct llvmpipe_tile_cache *
 lp_create_tile_cache( struct pipe_screen *screen )
 {
    struct llvmpipe_tile_cache *tc;
+   int maxLevels, maxTexSize;
+
+   /* sanity checking: max sure MAX_WIDTH/HEIGHT >= largest texture image */
+   maxLevels = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_LEVELS);
+   maxTexSize = 1 << (maxLevels - 1);
+   assert(MAX_WIDTH >= maxTexSize);
 
    tc = CALLOC_STRUCT( llvmpipe_tile_cache );
    if(!tc)
@@ -225,11 +268,14 @@ lp_flush_tile_cache(struct llvmpipe_tile_cache *tc)
                            tc->clear_val);
 
             screen->transfer_unmap(screen, pt);
+
+            tile->status = LP_TILE_STATUS_UNDEFINED;
             break;
          }
 
          case LP_TILE_STATUS_DEFINED:
             lp_put_tile_rgba_soa(pt, x, y, tile->color);
+            tile->status = LP_TILE_STATUS_UNDEFINED;
             break;
          }
       }
@@ -257,7 +303,7 @@ lp_get_cached_tile(struct llvmpipe_tile_cache *tc,
 
    case LP_TILE_STATUS_UNDEFINED:
       /* get new tile data from transfer */
-      lp_get_tile_rgba_soa(pt, x, y, tile->color);
+      lp_get_tile_rgba_soa(pt, x & ~(TILE_SIZE - 1), y & ~(TILE_SIZE - 1), tile->color);
       tile->status = LP_TILE_STATUS_DEFINED;
       break;
 
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_cache.h b/src/gallium/drivers/llvmpipe/lp_tile_cache.h
index 6d8ba5ece7..161bab3799 100644
--- a/src/gallium/drivers/llvmpipe/lp_tile_cache.h
+++ b/src/gallium/drivers/llvmpipe/lp_tile_cache.h
@@ -33,42 +33,7 @@
 #include "lp_tile_soa.h"
 
 
-enum llvmpipe_tile_status
-{
-   LP_TILE_STATUS_UNDEFINED = 0,
-   LP_TILE_STATUS_CLEAR = 1,
-   LP_TILE_STATUS_DEFINED = 2
-};
-
-
-struct llvmpipe_cached_tile
-{
-   enum llvmpipe_tile_status status;
-
-   /** color in SOA format */
-   uint8_t *color;
-};
-
-
-/** XXX move these */
-#define MAX_WIDTH 2048
-#define MAX_HEIGHT 2048
-
-
-struct llvmpipe_tile_cache
-{
-   struct pipe_screen *screen;
-   struct pipe_surface *surface;  /**< the surface we're caching */
-   struct pipe_transfer *transfer;
-   void *transfer_map;
-
-   struct llvmpipe_cached_tile entries[MAX_WIDTH/TILE_SIZE][MAX_HEIGHT/TILE_SIZE];
-
-   uint8_t clear_color[4];  /**< for color bufs */
-   uint clear_val;        /**< for z+stencil, or packed color clear value */
-
-   struct llvmpipe_cached_tile *last_tile;  /**< most recently retrieved tile */
-};
+struct llvmpipe_tile_cache;  /* opaque */
 
 
 extern struct llvmpipe_tile_cache *
diff --git a/src/gallium/drivers/nv04/nv04_screen.c b/src/gallium/drivers/nv04/nv04_screen.c
index ff2febb668..170ce3eb7e 100644
--- a/src/gallium/drivers/nv04/nv04_screen.c
+++ b/src/gallium/drivers/nv04/nv04_screen.c
@@ -16,8 +16,6 @@ nv04_screen_get_param(struct pipe_screen *screen, int param)
 		return 0;
 	case PIPE_CAP_GLSL:
 		return 0;
-	case PIPE_CAP_S3TC:
-		return 0;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 		return 0;
 	case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/nv10/nv10_screen.c b/src/gallium/drivers/nv10/nv10_screen.c
index 4469b22d91..ee5901e743 100644
--- a/src/gallium/drivers/nv10/nv10_screen.c
+++ b/src/gallium/drivers/nv10/nv10_screen.c
@@ -15,8 +15,6 @@ nv10_screen_get_param(struct pipe_screen *screen, int param)
 		return 0;
 	case PIPE_CAP_GLSL:
 		return 0;
-	case PIPE_CAP_S3TC:
-		return 0;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 		return 1;
 	case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/nv20/nv20_screen.c b/src/gallium/drivers/nv20/nv20_screen.c
index e6924ad71e..4eeacd1afd 100644
--- a/src/gallium/drivers/nv20/nv20_screen.c
+++ b/src/gallium/drivers/nv20/nv20_screen.c
@@ -15,8 +15,6 @@ nv20_screen_get_param(struct pipe_screen *screen, int param)
 		return 0;
 	case PIPE_CAP_GLSL:
 		return 0;
-	case PIPE_CAP_S3TC:
-		return 0;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 		return 1;
 	case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/nv30/nv30_screen.c b/src/gallium/drivers/nv30/nv30_screen.c
index f8285e4455..41af38450b 100644
--- a/src/gallium/drivers/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nv30/nv30_screen.c
@@ -22,8 +22,6 @@ nv30_screen_get_param(struct pipe_screen *pscreen, int param)
 		return 1;
 	case PIPE_CAP_GLSL:
 		return 0;
-	case PIPE_CAP_S3TC:
-		return 0;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 		return 1;
 	case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/nv40/nv40_screen.c b/src/gallium/drivers/nv40/nv40_screen.c
index 5d2a4216c5..bd13dfddd1 100644
--- a/src/gallium/drivers/nv40/nv40_screen.c
+++ b/src/gallium/drivers/nv40/nv40_screen.c
@@ -21,8 +21,6 @@ nv40_screen_get_param(struct pipe_screen *pscreen, int param)
 		return 1;
 	case PIPE_CAP_GLSL:
 		return 0;
-	case PIPE_CAP_S3TC:
-		return 1;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 		return 1;
 	case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/nv50/nv50_context.c b/src/gallium/drivers/nv50/nv50_context.c
index 6e8f4f9750..fca078b174 100644
--- a/src/gallium/drivers/nv50/nv50_context.c
+++ b/src/gallium/drivers/nv50/nv50_context.c
@@ -37,11 +37,12 @@ nv50_flush(struct pipe_context *pipe, unsigned flags,
 
 	/* We need this in the ddx for reliable composite, not sure what we're
 	 * actually flushing. We generate all our own flushes with flags = 0. */
-	WAIT_RING(chan, 3);
+	WAIT_RING(chan, 2);
 	BEGIN_RING(chan, eng2d, 0x0110, 1);
 	OUT_RING  (chan, 0);
 
-	FIRE_RING(chan);
+	if (flags & PIPE_FLUSH_FRAME)
+		FIRE_RING(chan);
 }
 
 static void
@@ -110,6 +111,9 @@ nv50_create(struct pipe_screen *pscreen, unsigned pctx_id)
 	nv50->pipe.is_texture_referenced = nv50_is_texture_referenced;
 	nv50->pipe.is_buffer_referenced = nv50_is_buffer_referenced;
 
+	screen->base.channel->user_private = nv50;
+	screen->base.channel->flush_notify = nv50_state_flush_notify;
+
 	nv50_init_surface_functions(nv50);
 	nv50_init_state_functions(nv50);
 	nv50_init_query_functions(nv50);
diff --git a/src/gallium/drivers/nv50/nv50_context.h b/src/gallium/drivers/nv50/nv50_context.h
index 1e9e8e49bf..4608854d71 100644
--- a/src/gallium/drivers/nv50/nv50_context.h
+++ b/src/gallium/drivers/nv50/nv50_context.h
@@ -116,6 +116,7 @@ struct nv50_state {
 	unsigned miptree_nr;
 	struct nouveau_stateobj *vertprog;
 	struct nouveau_stateobj *fragprog;
+	struct nouveau_stateobj *programs;
 	struct nouveau_stateobj *vtxfmt;
 	struct nouveau_stateobj *vtxbuf;
 	struct nouveau_stateobj *vtxattr;
@@ -190,10 +191,12 @@ extern void nv50_clear(struct pipe_context *pipe, unsigned buffers,
 /* nv50_program.c */
 extern void nv50_vertprog_validate(struct nv50_context *nv50);
 extern void nv50_fragprog_validate(struct nv50_context *nv50);
+extern void nv50_linkage_validate(struct nv50_context *nv50);
 extern void nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p);
 
 /* nv50_state_validate.c */
 extern boolean nv50_state_validate(struct nv50_context *nv50);
+extern void nv50_state_flush_notify(struct nouveau_channel *chan);
 
 /* nv50_tex.c */
 extern void nv50_tex_validate(struct nv50_context *);
diff --git a/src/gallium/drivers/nv50/nv50_miptree.c b/src/gallium/drivers/nv50/nv50_miptree.c
index 03b9243b82..93479a0314 100644
--- a/src/gallium/drivers/nv50/nv50_miptree.c
+++ b/src/gallium/drivers/nv50/nv50_miptree.c
@@ -148,6 +148,7 @@ nv50_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt,
 	mt->image_nr = 1;
 	mt->level[0].pitch = *stride;
 	mt->level[0].image_offset = CALLOC(1, sizeof(unsigned));
+	mt->level[0].tile_mode = bo->tile_mode;
 
 	nouveau_bo_ref(bo, &mt->base.bo);
 	return &mt->base.base;
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index 4a838529de..576d075318 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -90,6 +90,10 @@ struct nv50_reg {
 	int acc; /* instruction where this reg is last read (first insn == 1) */
 };
 
+/* arbitrary limits */
+#define MAX_IF_DEPTH 4
+#define MAX_LOOP_DEPTH 4
+
 struct nv50_pc {
 	struct nv50_program *p;
 
@@ -112,11 +116,22 @@ struct nv50_pc {
 	struct nv50_reg *temp_temp[16];
 	unsigned temp_temp_nr;
 
+	/* broadcast and destination replacement regs */
+	struct nv50_reg *r_brdc;
+	struct nv50_reg *r_dst[4];
+
 	unsigned interp_mode[32];
 	/* perspective interpolation registers */
 	struct nv50_reg *iv_p;
 	struct nv50_reg *iv_c;
 
+	struct nv50_program_exec *if_cond;
+	struct nv50_program_exec *if_insn[MAX_IF_DEPTH];
+	struct nv50_program_exec *br_join[MAX_IF_DEPTH];
+	struct nv50_program_exec *br_loop[MAX_LOOP_DEPTH]; /* for BRK branch */
+	int if_lvl, loop_lvl;
+	unsigned loop_pos[MAX_LOOP_DEPTH];
+
 	/* current instruction and total number of insns */
 	unsigned insn_cur;
 	unsigned insn_nr;
@@ -124,6 +139,25 @@ struct nv50_pc {
 	boolean allow32;
 };
 
+static INLINE void
+ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
+{
+	reg->type = type;
+	reg->index = index;
+	reg->hw = hw;
+	reg->neg = 0;
+	reg->rhw = -1;
+	reg->acc = 0;
+}
+
+static INLINE unsigned
+popcnt4(uint32_t val)
+{
+	static const unsigned cnt[16]
+	= { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
+	return cnt[val & 0xf];
+}
+
 static void
 alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
 {
@@ -173,6 +207,10 @@ alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
 	assert(0);
 }
 
+/* XXX: For shaders that aren't executed linearly (e.g. shaders that
+ * contain loops), we need to assign all hw regs to TGSI TEMPs early,
+ * lest we risk temp_temps overwriting regs alloc'd "later".
+ */
 static struct nv50_reg *
 alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
 {
@@ -184,11 +222,8 @@ alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
 
 	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
 		if (!pc->r_temp[i]) {
-			r = CALLOC_STRUCT(nv50_reg);
-			r->type = P_TEMP;
-			r->index = -1;
-			r->hw = i;
-			r->rhw = -1;
+			r = MALLOC_STRUCT(nv50_reg);
+			ctor_reg(r, P_TEMP, -1, i);
 			pc->r_temp[i] = r;
 			return r;
 		}
@@ -254,10 +289,8 @@ alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
 		return alloc_temp4(pc, dst, idx + 4);
 
 	for (i = 0; i < 4; i++) {
-		dst[i] = CALLOC_STRUCT(nv50_reg);
-		dst[i]->type = P_TEMP;
-		dst[i]->index = -1;
-		dst[i]->hw = idx + i;
+		dst[i] = MALLOC_STRUCT(nv50_reg);
+		ctor_reg(dst[i], P_TEMP, -1, idx + i);
 		pc->r_temp[idx + i] = dst[i];
 	}
 
@@ -309,7 +342,7 @@ ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
 static struct nv50_reg *
 alloc_immd(struct nv50_pc *pc, float f)
 {
-	struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
+	struct nv50_reg *r = MALLOC_STRUCT(nv50_reg);
 	unsigned hw;
 
 	for (hw = 0; hw < pc->immd_nr * 4; hw++)
@@ -319,9 +352,7 @@ alloc_immd(struct nv50_pc *pc, float f)
 	if (hw == pc->immd_nr * 4)
 		hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;
 
-	r->type = P_IMMD;
-	r->hw = hw;
-	r->index = -1;
+	ctor_reg(r, P_IMMD, -1, hw);
 	return r;
 }
 
@@ -543,6 +574,22 @@ check_swap_src_0_1(struct nv50_pc *pc,
 }
 
 static void
+set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src,
+		     struct nv50_program_exec *e)
+{
+	struct nv50_reg *temp;
+
+	if (src->type != P_TEMP) {
+		temp = temp_temp(pc);
+		emit_mov(pc, temp, src);
+		src = temp;
+	}
+
+	alloc_reg(pc, src);
+	e->inst[0] |= (src->hw << 9);
+}
+
+static void
 set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
 {
 	if (src->type == P_ATTR) {
@@ -744,7 +791,11 @@ emit_flop(struct nv50_pc *pc, unsigned sub,
 	}
 
 	set_dst(pc, dst, e);
-	set_src_0(pc, src, e);
+
+	if (sub == 0 || sub == 2)
+		set_src_0_restricted(pc, src, e);
+	else
+		set_src_0(pc, src, e);
 
 	emit(pc, e);
 }
@@ -786,16 +837,20 @@ emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 #define CVTOP_SAT	0x08
 #define CVTOP_ABS	0x10
 
+/* 0x04 == 32 bit */
+/* 0x40 == dst is float */
+/* 0x80 == src is float */
 #define CVT_F32_F32 0xc4
 #define CVT_F32_S32 0x44
 #define CVT_F32_U32 0x64
 #define CVT_S32_F32 0x8c
 #define CVT_S32_S32 0x0c
-#define CVT_F32_F32_ROP 0xcc
+#define CVT_NEG     0x20
+#define CVT_RI      0x08
 
 static void
 emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
-	 int wp, unsigned cop, unsigned fmt)
+	 int wp, unsigned cvn, unsigned fmt)
 {
 	struct nv50_program_exec *e;
 
@@ -804,7 +859,7 @@ emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
 
 	e->inst[0] |= 0xa0000000;
 	e->inst[1] |= 0x00004000;
-	e->inst[1] |= (cop << 16);
+	e->inst[1] |= (cvn << 16);
 	e->inst[1] |= (fmt << 24);
 	set_src_0(pc, src, e);
 
@@ -821,53 +876,85 @@ emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
 	emit(pc, e);
 }
 
+/* nv50 Condition codes:
+ *  0x1 = LT
+ *  0x2 = EQ
+ *  0x3 = LE
+ *  0x4 = GT
+ *  0x5 = NE
+ *  0x6 = GE
+ *  0x7 = set condition code ? (used before bra.lt/le/gt/ge)
+ *  0x8 = unordered bit (allows NaN)
+ */
 static void
-emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
+emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
 	 struct nv50_reg *src0, struct nv50_reg *src1)
 {
+	static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
+
 	struct nv50_program_exec *e = exec(pc);
-	unsigned inv_cop[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
 	struct nv50_reg *rdst;
 
-	assert(c_op <= 7);
+	assert(ccode < 16);
 	if (check_swap_src_0_1(pc, &src0, &src1))
-		c_op = inv_cop[c_op];
+		ccode = cc_swapped[ccode & 7] | (ccode & 8);
 
 	rdst = dst;
-	if (dst->type != P_TEMP)
+	if (dst && dst->type != P_TEMP)
 		dst = alloc_temp(pc, NULL);
 
 	/* set.u32 */
 	set_long(pc, e);
 	e->inst[0] |= 0xb0000000;
-	e->inst[1] |= (3 << 29);
-	e->inst[1] |= (c_op << 14);
-	/*XXX: breaks things, .u32 by default?
-	 *     decuda will disasm as .u16 and use .lo/.hi regs, but this
-	 *     doesn't seem to match what the hw actually does.
-	inst[1] |= 0x04000000; << breaks things.. .u32 by default?
+	e->inst[1] |= 0x60000000 | (ccode << 14);
+
+	/* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but
+	 * that doesn't seem to match what the hw actually does
+	e->inst[1] |= 0x04000000; << breaks things, u32 by default ?
 	 */
-	set_dst(pc, dst, e);
+
+	if (wp >= 0)
+		set_pred_wr(pc, 1, wp, e);
+	if (dst)
+		set_dst(pc, dst, e);
+	else {
+		e->inst[0] |= 0x000001fc;
+		e->inst[1] |= 0x00000008;
+	}
+
 	set_src_0(pc, src0, e);
 	set_src_1(pc, src1, e);
-	emit(pc, e);
 
-	/* cvt.f32.u32 */
-	e = exec(pc);
-	e->inst[0] = 0xa0000001;
-	e->inst[1] = 0x64014780;
-	set_dst(pc, rdst, e);
-	set_src_0(pc, dst, e);
 	emit(pc, e);
+	pc->if_cond = pc->p->exec_tail; /* record for OPCODE_IF */
 
-	if (dst != rdst)
+	/* cvt.f32.u32/s32 (?) if we didn't only write the predicate */
+	if (rdst)
+		emit_cvt(pc, rdst, dst, -1, CVTOP_ABS | CVTOP_RN, CVT_F32_S32);
+	if (rdst && rdst != dst)
 		free_temp(pc, dst);
 }
 
+static INLINE unsigned
+map_tgsi_setop_cc(unsigned op)
+{
+	switch (op) {
+	case TGSI_OPCODE_SLT: return 0x1;
+	case TGSI_OPCODE_SGE: return 0x6;
+	case TGSI_OPCODE_SEQ: return 0x2;
+	case TGSI_OPCODE_SGT: return 0x4;
+	case TGSI_OPCODE_SLE: return 0x3;
+	case TGSI_OPCODE_SNE: return 0xd;
+	default:
+		assert(0);
+		return 0;
+	}
+}
+
 static INLINE void
 emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 {
-	emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32_ROP);
+	emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32 | CVT_RI);
 }
 
 static void
@@ -890,6 +977,12 @@ emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 	emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
 }
 
+static INLINE void
+emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+	emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32);
+}
+
 static void
 emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
 	 struct nv50_reg **src)
@@ -1073,10 +1166,11 @@ emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
 	emit(pc, e);
 
 #if 1
-	if (mask & 1) emit_mov(pc, dst[0], t[0]);
-	if (mask & 2) emit_mov(pc, dst[1], t[1]);
-	if (mask & 4) emit_mov(pc, dst[2], t[2]);
-	if (mask & 8) emit_mov(pc, dst[3], t[3]);
+	c = 0;
+	if (mask & 1) emit_mov(pc, dst[0], t[c++]);
+	if (mask & 2) emit_mov(pc, dst[1], t[c++]);
+	if (mask & 4) emit_mov(pc, dst[2], t[c++]);
+	if (mask & 8) emit_mov(pc, dst[3], t[c]);
 
 	free_temp4(pc, t);
 #else
@@ -1093,6 +1187,38 @@ emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
 }
 
 static void
+emit_branch(struct nv50_pc *pc, int pred, unsigned cc,
+	    struct nv50_program_exec **join)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	if (join) {
+		set_long(pc, e);
+		e->inst[0] |= 0xa0000002;
+		emit(pc, e);
+		*join = e;
+		e = exec(pc);
+	}
+
+	set_long(pc, e);
+	e->inst[0] |= 0x10000002;
+	if (pred >= 0)
+		set_pred(pc, cc, pred, e);
+	emit(pc, e);
+}
+
+static void
+emit_nop(struct nv50_pc *pc)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0xf0000000;
+	set_long(pc, e);
+	e->inst[1] = 0xe0000000;
+	emit(pc, e);
+}
+
+static void
 convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
 {
 	unsigned q = 0, m = ~0;
@@ -1159,6 +1285,70 @@ negate_supported(const struct tgsi_full_instruction *insn, int i)
 	}
 }
 
+/* Return a read mask for source registers deduced from opcode & write mask. */
+static unsigned
+nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
+{
+	unsigned x, mask = insn->FullDstRegisters[0].DstRegister.WriteMask;
+
+	switch (insn->Instruction.Opcode) {
+	case TGSI_OPCODE_COS:
+	case TGSI_OPCODE_SIN:
+		return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
+	case TGSI_OPCODE_DP3:
+		return 0x7;
+	case TGSI_OPCODE_DP4:
+	case TGSI_OPCODE_DPH:
+	case TGSI_OPCODE_KIL: /* WriteMask ignored */
+		return 0xf;
+	case TGSI_OPCODE_DST:
+		return mask & (c ? 0xa : 0x6);
+	case TGSI_OPCODE_EX2:
+	case TGSI_OPCODE_LG2:
+	case TGSI_OPCODE_POW:
+	case TGSI_OPCODE_RCP:
+	case TGSI_OPCODE_RSQ:
+	case TGSI_OPCODE_SCS:
+		return 0x1;
+	case TGSI_OPCODE_LIT:
+		return 0xb;
+	case TGSI_OPCODE_TEX:
+	case TGSI_OPCODE_TXP:
+	{
+		const struct tgsi_instruction_ext_texture *tex;
+
+		assert(insn->Instruction.Extended);
+		tex = &insn->InstructionExtTexture;
+
+		mask = 0x7;
+		if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
+			mask |= 0x8;
+
+		switch (tex->Texture) {
+		case TGSI_TEXTURE_1D:
+			mask &= 0x9;
+			break;
+		case TGSI_TEXTURE_2D:
+			mask &= 0xb;
+			break;
+		default:
+			break;
+		}
+	}
+		return mask;
+	case TGSI_OPCODE_XPD:
+		x = 0;
+		if (mask & 1) x |= 0x6;
+		if (mask & 2) x |= 0x5;
+		if (mask & 4) x |= 0x3;
+		return x;
+	default:
+		break;
+	}
+
+	return mask;
+}
+
 static struct nv50_reg *
 tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
 {
@@ -1258,93 +1448,175 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
 	return r;
 }
 
-/* returns TRUE if instruction can overwrite sources before they're read */
+/* return TRUE for ops that produce only a single result */
 static boolean
-direct2dest_op(const struct tgsi_full_instruction *insn)
+is_scalar_op(unsigned op)
 {
-	if (insn->Instruction.Saturate)
-		return FALSE;
-
-	switch (insn->Instruction.Opcode) {
+	switch (op) {
 	case TGSI_OPCODE_COS:
+	case TGSI_OPCODE_DP2:
 	case TGSI_OPCODE_DP3:
 	case TGSI_OPCODE_DP4:
 	case TGSI_OPCODE_DPH:
-	case TGSI_OPCODE_KIL:
-	case TGSI_OPCODE_LIT:
+	case TGSI_OPCODE_EX2:
+	case TGSI_OPCODE_LG2:
 	case TGSI_OPCODE_POW:
 	case TGSI_OPCODE_RCP:
 	case TGSI_OPCODE_RSQ:
-	case TGSI_OPCODE_SCS:
 	case TGSI_OPCODE_SIN:
+		/*
+	case TGSI_OPCODE_KIL:
+	case TGSI_OPCODE_LIT:
+	case TGSI_OPCODE_SCS:
+		*/
+		return TRUE;
+	default:
+		return FALSE;
+	}
+}
+
+/* Returns a bitmask indicating which dst components depend
+ * on source s, component c (reverse of nv50_tgsi_src_mask).
+ */
+static unsigned
+nv50_tgsi_dst_revdep(unsigned op, int s, int c)
+{
+	if (is_scalar_op(op))
+		return 0x1;
+
+	switch (op) {
+	case TGSI_OPCODE_DST:
+		return (1 << c) & (s ? 0xa : 0x6);
+	case TGSI_OPCODE_XPD:
+		switch (c) {
+		case 0: return 0x6;
+		case 1: return 0x5;
+		case 2: return 0x3;
+		case 3: return 0x0;
+		default:
+			assert(0);
+			return 0x0;
+		}
+	case TGSI_OPCODE_LIT:
+	case TGSI_OPCODE_SCS:
 	case TGSI_OPCODE_TEX:
 	case TGSI_OPCODE_TXP:
-		return FALSE;
+		/* these take care of dangerous swizzles themselves */
+		return 0x0;
+	case TGSI_OPCODE_IF:
+	case TGSI_OPCODE_KIL:
+		/* don't call this function for these ops */
+		assert(0);
+		return 0;
 	default:
-		return TRUE;
+		/* linear vector instruction */
+		return (1 << c);
 	}
 }
 
+static INLINE boolean
+has_pred(struct nv50_program_exec *e, unsigned cc)
+{
+	if (!is_long(e) || is_immd(e))
+		return FALSE;
+	return ((e->inst[1] & 0x780) == (cc << 7));
+}
+
+/* on ENDIF see if we can do "@p0.neu single_op" instead of:
+ *        join_at ENDIF
+ *        @p0.eq bra ENDIF
+ *        single_op
+ * ENDIF: nop.join
+ */
 static boolean
-nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
+nv50_kill_branch(struct nv50_pc *pc)
 {
-	const struct tgsi_full_instruction *inst = &tok->FullInstruction;
-	struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;
+	int lvl = pc->if_lvl;
+
+	if (pc->if_insn[lvl]->next != pc->p->exec_tail)
+		return FALSE;
+
+	/* if ccode == 'true', the BRA is from an ELSE and the predicate
+	 * reg may no longer be valid, since we currently always use $p0
+	 */
+	if (has_pred(pc->if_insn[lvl], 0xf))
+		return FALSE;
+	assert(pc->if_insn[lvl] && pc->br_join[lvl]);
+
+	/* We'll use the exec allocated for JOIN_AT (as we can't easily
+	 * update prev's next); if exec_tail is BRK, update the pointer.
+	 */
+	if (pc->loop_lvl && pc->br_loop[pc->loop_lvl - 1] == pc->p->exec_tail)
+		pc->br_loop[pc->loop_lvl - 1] = pc->br_join[lvl];
+
+	pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */
+
+	*pc->br_join[lvl] = *pc->p->exec_tail;
+
+	FREE(pc->if_insn[lvl]);
+	FREE(pc->p->exec_tail);
+
+	pc->p->exec_tail = pc->br_join[lvl];
+	pc->p->exec_tail->next = NULL;
+	set_pred(pc, 0xd, 0, pc->p->exec_tail);
+
+	return TRUE;
+}
+
+static boolean
+nv50_program_tx_insn(struct nv50_pc *pc,
+		     const struct tgsi_full_instruction *inst)
+{
+	struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp;
 	unsigned mask, sat, unit;
-	boolean assimilate = FALSE;
 	int i, c;
 
 	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
 	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
 
+	memset(src, 0, sizeof(src));
+
 	for (c = 0; c < 4; c++) {
-		if (mask & (1 << c))
+		if ((mask & (1 << c)) && !pc->r_dst[c])
 			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
 		else
-			dst[c] = NULL;
-		rdst[c] = NULL;
-		src[0][c] = NULL;
-		src[1][c] = NULL;
-		src[2][c] = NULL;
+			dst[c] = pc->r_dst[c];
+		rdst[c] = dst[c];
 	}
 
 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
 		const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
+		unsigned src_mask;
+		boolean neg_supp;
+
+		src_mask = nv50_tgsi_src_mask(inst, i);
+		neg_supp = negate_supported(inst, i);
 
 		if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
 			unit = fs->SrcRegister.Index;
 
 		for (c = 0; c < 4; c++)
-			src[i][c] = tgsi_src(pc, c, fs,
-					     negate_supported(inst, i));
+			if (src_mask & (1 << c))
+				src[i][c] = tgsi_src(pc, c, fs, neg_supp);
 	}
 
-	if (sat) {
-		for (c = 0; c < 4; c++) {
-			rdst[c] = dst[c];
-			dst[c] = temp_temp(pc);
-		}
+	brdc = temp = pc->r_brdc;
+	if (brdc && brdc->type != P_TEMP) {
+		temp = temp_temp(pc);
+		if (sat)
+			brdc = temp;
 	} else
-	if (direct2dest_op(inst)) {
+	if (sat) {
 		for (c = 0; c < 4; c++) {
-			if (!dst[c] || dst[c]->type != P_TEMP)
-				continue;
-
-			for (i = c + 1; i < 4; i++) {
-				if (dst[c] == src[0][i] ||
-				    dst[c] == src[1][i] ||
-				    dst[c] == src[2][i])
-					break;
-			}
-			if (i == 4)
+			if (!(mask & (1 << c)) || dst[c]->type == P_TEMP)
 				continue;
-
-			assimilate = TRUE;
 			rdst[c] = dst[c];
-			dst[c] = alloc_temp(pc, NULL);
+			dst[c] = temp_temp(pc);
 		}
 	}
 
+	assert(brdc || !is_scalar_op(inst->Instruction.Opcode));
+
 	switch (inst->Instruction.Opcode) {
 	case TGSI_OPCODE_ABS:
 		for (c = 0; c < 4; c++) {
@@ -1360,74 +1632,91 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 			emit_add(pc, dst[c], src[0][c], src[1][c]);
 		}
 		break;
-	case TGSI_OPCODE_COS:
-		temp = temp_temp(pc);
-		emit_precossin(pc, temp, src[0][0]);
-		emit_flop(pc, 5, temp, temp);
+	case TGSI_OPCODE_BGNLOOP:
+		pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size;
+		break;
+	case TGSI_OPCODE_BRK:
+		emit_branch(pc, -1, 0, NULL);
+		assert(pc->loop_lvl > 0);
+		pc->br_loop[pc->loop_lvl - 1] = pc->p->exec_tail;
+		break;
+	case TGSI_OPCODE_CEIL:
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			emit_mov(pc, dst[c], temp);
+			emit_cvt(pc, dst[c], src[0][c], -1,
+				 CVTOP_CEIL, CVT_F32_F32 | CVT_RI);
+		}
+		break;
+	case TGSI_OPCODE_COS:
+		if (mask & 8) {
+			emit_precossin(pc, temp, src[0][3]);
+			emit_flop(pc, 5, dst[3], temp);
+			if (!(mask &= 7))
+				break;
+			if (temp == dst[3])
+				temp = brdc = temp_temp(pc);
 		}
+		emit_precossin(pc, temp, src[0][0]);
+		emit_flop(pc, 5, brdc, temp);
 		break;
 	case TGSI_OPCODE_DP3:
-		temp = temp_temp(pc);
 		emit_mul(pc, temp, src[0][0], src[1][0]);
 		emit_mad(pc, temp, src[0][1], src[1][1], temp);
-		emit_mad(pc, temp, src[0][2], src[1][2], temp);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_mad(pc, brdc, src[0][2], src[1][2], temp);
 		break;
 	case TGSI_OPCODE_DP4:
-		temp = temp_temp(pc);
 		emit_mul(pc, temp, src[0][0], src[1][0]);
 		emit_mad(pc, temp, src[0][1], src[1][1], temp);
 		emit_mad(pc, temp, src[0][2], src[1][2], temp);
-		emit_mad(pc, temp, src[0][3], src[1][3], temp);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_mad(pc, brdc, src[0][3], src[1][3], temp);
 		break;
 	case TGSI_OPCODE_DPH:
-		temp = temp_temp(pc);
 		emit_mul(pc, temp, src[0][0], src[1][0]);
 		emit_mad(pc, temp, src[0][1], src[1][1], temp);
 		emit_mad(pc, temp, src[0][2], src[1][2], temp);
-		emit_add(pc, temp, src[1][3], temp);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_add(pc, brdc, src[1][3], temp);
 		break;
 	case TGSI_OPCODE_DST:
-	{
-		struct nv50_reg *one = alloc_immd(pc, 1.0);
-		if (mask & (1 << 0))
-			emit_mov(pc, dst[0], one);
 		if (mask & (1 << 1))
 			emit_mul(pc, dst[1], src[0][1], src[1][1]);
 		if (mask & (1 << 2))
 			emit_mov(pc, dst[2], src[0][2]);
 		if (mask & (1 << 3))
 			emit_mov(pc, dst[3], src[1][3]);
-		FREE(one);
-	}
+		if (mask & (1 << 0))
+			emit_mov_immdval(pc, dst[0], 1.0f);
+		break;
+	case TGSI_OPCODE_ELSE:
+		emit_branch(pc, -1, 0, NULL);
+		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
+		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
+		break;
+	case TGSI_OPCODE_ENDIF:
+		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
+
+		/* try to replace branch over 1 insn with a predicated insn */
+		if (nv50_kill_branch(pc) == TRUE)
+			break;
+
+		if (pc->br_join[pc->if_lvl]) {
+			pc->br_join[pc->if_lvl]->param.index = pc->p->exec_size;
+			pc->br_join[pc->if_lvl] = NULL;
+		}
+		/* emit a NOP as join point, we could set it on the next
+		 * one, but would have to make sure it is long and !immd
+		 */
+		emit_nop(pc);
+		pc->p->exec_tail->inst[1] |= 2;
+		break;
+	case TGSI_OPCODE_ENDLOOP:
+		emit_branch(pc, -1, 0, NULL);
+		pc->p->exec_tail->param.index = pc->loop_pos[--pc->loop_lvl];
+		pc->br_loop[pc->loop_lvl]->param.index = pc->p->exec_size;
 		break;
 	case TGSI_OPCODE_EX2:
-		temp = temp_temp(pc);
 		emit_preex2(pc, temp, src[0][0]);
-		emit_flop(pc, 6, temp, temp);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_flop(pc, 6, brdc, temp);
 		break;
 	case TGSI_OPCODE_FLR:
 		for (c = 0; c < 4; c++) {
@@ -1445,24 +1734,24 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 			emit_sub(pc, dst[c], src[0][c], temp);
 		}
 		break;
+	case TGSI_OPCODE_IF:
+		/* emitting a join_at may not be necessary */
+		assert(pc->if_lvl < MAX_IF_DEPTH);
+		set_pred_wr(pc, 1, 0, pc->if_cond);
+		emit_branch(pc, 0, 2, &pc->br_join[pc->if_lvl]);
+		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
+		break;
 	case TGSI_OPCODE_KIL:
 		emit_kil(pc, src[0][0]);
 		emit_kil(pc, src[0][1]);
 		emit_kil(pc, src[0][2]);
 		emit_kil(pc, src[0][3]);
-		pc->p->cfg.fp.regs[2] |= 0x00100000;
 		break;
 	case TGSI_OPCODE_LIT:
 		emit_lit(pc, &dst[0], mask, &src[0][0]);
 		break;
 	case TGSI_OPCODE_LG2:
-		temp = temp_temp(pc);
-		emit_flop(pc, 3, temp, src[0][0]);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_flop(pc, 3, brdc, src[0][0]);
 		break;
 	case TGSI_OPCODE_LRP:
 		temp = temp_temp(pc);
@@ -1510,31 +1799,18 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		}
 		break;
 	case TGSI_OPCODE_POW:
-		temp = temp_temp(pc);
-		emit_pow(pc, temp, src[0][0], src[1][0]);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_pow(pc, brdc, src[0][0], src[1][0]);
 		break;
 	case TGSI_OPCODE_RCP:
-		for (c = 3; c >= 0; c--) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_flop(pc, 0, dst[c], src[0][0]);
-		}
+		emit_flop(pc, 0, brdc, src[0][0]);
 		break;
 	case TGSI_OPCODE_RSQ:
-		for (c = 3; c >= 0; c--) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_flop(pc, 2, dst[c], src[0][0]);
-		}
+		emit_flop(pc, 2, brdc, src[0][0]);
 		break;
 	case TGSI_OPCODE_SCS:
 		temp = temp_temp(pc);
-		emit_precossin(pc, temp, src[0][0]);
+		if (mask & 3)
+			emit_precossin(pc, temp, src[0][0]);
 		if (mask & (1 << 0))
 			emit_flop(pc, 5, dst[0], temp);
 		if (mask & (1 << 1))
@@ -1544,28 +1820,29 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		if (mask & (1 << 3))
 			emit_mov_immdval(pc, dst[3], 1.0);
 		break;
-	case TGSI_OPCODE_SGE:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_set(pc, 6, dst[c], src[0][c], src[1][c]);
-		}
-		break;
 	case TGSI_OPCODE_SIN:
-		temp = temp_temp(pc);
-		emit_precossin(pc, temp, src[0][0]);
-		emit_flop(pc, 4, temp, temp);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
+		if (mask & 8) {
+			emit_precossin(pc, temp, src[0][3]);
+			emit_flop(pc, 4, dst[3], temp);
+			if (!(mask &= 7))
+				break;
+			if (temp == dst[3])
+				temp = brdc = temp_temp(pc);
 		}
+		emit_precossin(pc, temp, src[0][0]);
+		emit_flop(pc, 4, brdc, temp);
 		break;
 	case TGSI_OPCODE_SLT:
+	case TGSI_OPCODE_SGE:
+	case TGSI_OPCODE_SEQ:
+	case TGSI_OPCODE_SGT:
+	case TGSI_OPCODE_SLE:
+	case TGSI_OPCODE_SNE:
+		i = map_tgsi_setop_cc(inst->Instruction.Opcode);
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			emit_set(pc, 1, dst[c], src[0][c], src[1][c]);
+			emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]);
 		}
 		break;
 	case TGSI_OPCODE_SUB:
@@ -1583,6 +1860,14 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		emit_tex(pc, dst, mask, src[0], unit,
 			 inst->InstructionExtTexture.Texture, TRUE);
 		break;
+	case TGSI_OPCODE_TRUNC:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_cvt(pc, dst[c], src[0][c], -1,
+				 CVTOP_TRUNC, CVT_F32_F32 | CVT_RI);
+		}
+		break;
 	case TGSI_OPCODE_XPD:
 		temp = temp_temp(pc);
 		if (mask & (1 << 0)) {
@@ -1607,17 +1892,22 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		return FALSE;
 	}
 
+	if (brdc) {
+		if (sat)
+			emit_sat(pc, brdc, brdc);
+		for (c = 0; c < 4; c++)
+			if ((mask & (1 << c)) && dst[c] != brdc)
+				emit_mov(pc, dst[c], brdc);
+	} else
 	if (sat) {
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			emit_cvt(pc, rdst[c], dst[c], -1, CVTOP_SAT,
-				 CVT_F32_F32);
+			/* in this case we saturate later */
+			if (dst[c]->type == P_TEMP && dst[c]->index < 0)
+				continue;
+			emit_sat(pc, rdst[c], dst[c]);
 		}
-	} else if (assimilate) {
-		for (c = 0; c < 4; c++)
-			if (rdst[c])
-				assimilate_temp(pc, rdst[c], dst[c]);
 	}
 
 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
@@ -1626,9 +1916,6 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 				continue;
 			if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
 				FREE(src[i][c]);
-			else
-			if (src[i][c]->acc == pc->insn_cur)
-				release_hw(pc, src[i][c]);
 		}
 	}
 
@@ -1636,180 +1923,271 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 	return TRUE;
 }
 
-/* Adjust a bitmask that indicates what components of a source are used,
- * we use this in tx_prep so we only load interpolants that are needed.
- */
-static void
-insn_adjust_mask(const struct tgsi_full_instruction *insn, unsigned *mask)
-{
-	const struct tgsi_instruction_ext_texture *tex;
-
-	switch (insn->Instruction.Opcode) {
-	case TGSI_OPCODE_DP3:
-		*mask = 0x7;
-		break;
-	case TGSI_OPCODE_DP4:
-	case TGSI_OPCODE_DPH:
-		*mask = 0xF;
-		break;
-	case TGSI_OPCODE_LIT:
-		*mask = 0xB;
-		break;
-	case TGSI_OPCODE_RCP:
-	case TGSI_OPCODE_RSQ:
-		*mask = 0x1;
-		break;
-	case TGSI_OPCODE_TEX:
-	case TGSI_OPCODE_TXP:
-		assert(insn->Instruction.Extended);
-		tex = &insn->InstructionExtTexture;
-
-		*mask = 0x7;
-		if (tex->Texture == TGSI_TEXTURE_1D)
-			*mask = 0x1;
-		else
-		if (tex->Texture == TGSI_TEXTURE_2D)
-			*mask = 0x3;
-
-		if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
-			*mask |= 0x8;
-		break;
-	default:
-		break;
-	}
-}
-
 static void
-prep_inspect_insn(struct nv50_pc *pc, const union tgsi_full_token *tok,
-		  unsigned *r_usage[2])
+prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
 {
-	const struct tgsi_full_instruction *insn;
+	struct nv50_reg *reg = NULL;
 	const struct tgsi_full_src_register *src;
 	const struct tgsi_dst_register *dst;
+	unsigned i, c, k, mask;
 
-	unsigned i, c, k, n, mask, *acc_p;
-
-	insn = &tok->FullInstruction;
 	dst = &insn->FullDstRegisters[0].DstRegister;
 	mask = dst->WriteMask;
 
-	if (!r_usage[0])
-		r_usage[0] = CALLOC(pc->temp_nr * 4, sizeof(unsigned));
-	if (!r_usage[1])
-		r_usage[1] = CALLOC(pc->attr_nr * 4, sizeof(unsigned));
+        if (dst->File == TGSI_FILE_TEMPORARY)
+                reg = pc->temp;
+        else
+        if (dst->File == TGSI_FILE_OUTPUT)
+                reg = pc->result;
 
-	if (dst->File == TGSI_FILE_TEMPORARY) {
+	if (reg) {
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			r_usage[0][dst->Index * 4 + c] = pc->insn_nr;
+			reg[dst->Index * 4 + c].acc = pc->insn_nr;
 		}
 	}
 
 	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
 		src = &insn->FullSrcRegisters[i];
 
-		switch (src->SrcRegister.File) {
-		case TGSI_FILE_TEMPORARY:
-			acc_p = r_usage[0];
-			break;
-		case TGSI_FILE_INPUT:
-			acc_p = r_usage[1];
-			break;
-		default:
+		if (src->SrcRegister.File == TGSI_FILE_TEMPORARY)
+			reg = pc->temp;
+		else
+		if (src->SrcRegister.File == TGSI_FILE_INPUT)
+			reg = pc->attr;
+		else
 			continue;
-		}
 
-		insn_adjust_mask(insn, &mask);
+		mask = nv50_tgsi_src_mask(insn, i);
 
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-
 			k = tgsi_util_get_full_src_register_extswizzle(src, c);
-			switch (k) {
-			case TGSI_EXTSWIZZLE_X:
-			case TGSI_EXTSWIZZLE_Y:
-			case TGSI_EXTSWIZZLE_Z:
-			case TGSI_EXTSWIZZLE_W:
-				n = src->SrcRegister.Index * 4 + k;
-				acc_p[n] = pc->insn_nr;
-				break;
-			default:
-				break;
-			}
+
+			if (k > TGSI_EXTSWIZZLE_W)
+				continue;
+
+			reg[src->SrcRegister.Index * 4 + k].acc = pc->insn_nr;
 		}
 	}
 }
 
+/* Returns a bitmask indicating which dst components need to be
+ * written to temporaries first to avoid 'corrupting' sources.
+ *
+ * m[i]   (out) indicate component to write in the i-th position
+ * rdep[c] (in) bitmasks of dst[i] that require dst[c] as source
+ */
+static unsigned
+nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
+{
+	unsigned i, c, x, unsafe;
+
+	for (c = 0; c < 4; c++)
+		m[c] = c;
+
+	/* Swap as long as a dst component written earlier is depended on
+	 * by one written later, but the next one isn't depended on by it.
+	 */
+	for (c = 0; c < 3; c++) {
+		if (rdep[m[c + 1]] & (1 << m[c]))
+			continue; /* if next one is depended on by us */
+		for (i = c + 1; i < 4; i++)
+			/* if we are depended on by a later one */
+			if (rdep[m[c]] & (1 << m[i]))
+				break;
+		if (i == 4)
+			continue;
+		/* now, swap */
+		x = m[c];
+		m[c] = m[c + 1];
+		m[c + 1] = x;
+
+		/* restart */
+		c = 0;
+	}
+
+	/* mark dependencies that could not be resolved by reordering */
+	for (i = 0; i < 3; ++i)
+		for (c = i + 1; c < 4; ++c)
+			if (rdep[m[i]] & (1 << m[c]))
+				unsafe |= (1 << i);
+
+	/* NOTE: $unsafe is with respect to order, not component */
+	return unsafe;
+}
+
+/* Select a suitable dst register for broadcasting scalar results,
+ * or return NULL if we have to allocate an extra TEMP.
+ *
+ * If e.g. only 1 component is written, we may also emit the final
+ * result to a write-only register.
+ */
+static struct nv50_reg *
+tgsi_broadcast_dst(struct nv50_pc *pc,
+		   const struct tgsi_full_dst_register *fd, unsigned mask)
+{
+	if (fd->DstRegister.File == TGSI_FILE_TEMPORARY) {
+		int c = ffs(~mask & fd->DstRegister.WriteMask);
+		if (c)
+			return tgsi_dst(pc, c - 1, fd);
+	} else {
+		int c = ffs(fd->DstRegister.WriteMask) - 1;
+		if ((1 << c) == fd->DstRegister.WriteMask)
+			return tgsi_dst(pc, c, fd);
+	}
+
+	return NULL;
+}
+
+/* Scan source swizzles and return a bitmask indicating dst regs that
+ * also occur among the src regs, and fill rdep for nv50_revdep_reoder.
+ */
 static unsigned
-load_fp_attrib(struct nv50_pc *pc, int i, unsigned *acc, int *mid,
-	       int *aid, int *p_oid)
+nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
+		       unsigned rdep[4])
 {
-	struct nv50_reg *iv;
-	int oid, c, n;
-	unsigned mask = 0;
+	const struct tgsi_full_dst_register *fd = &insn->FullDstRegisters[0];
+	const struct tgsi_full_src_register *fs;
+	unsigned i, deqs = 0;
 
-	iv = (pc->interp_mode[i] & INTERP_CENTROID) ? pc->iv_c : pc->iv_p;
+	for (i = 0; i < 4; ++i)
+		rdep[i] = 0;
 
-	for (c = 0, n = i * 4; c < 4; c++, n++) {
-		oid = (*p_oid)++;
-		pc->attr[n].type = P_TEMP;
-		pc->attr[n].index = i;
+	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
+		unsigned chn, mask = nv50_tgsi_src_mask(insn, i);
+		boolean neg_supp = negate_supported(insn, i);
 
-		if (pc->attr[n].acc == acc[n])
+		fs = &insn->FullSrcRegisters[i];
+		if (fs->SrcRegister.File != fd->DstRegister.File ||
+		    fs->SrcRegister.Index != fd->DstRegister.Index)
 			continue;
-		mask |= (1 << c);
 
-		pc->attr[n].acc = acc[n];
-		pc->attr[n].rhw = pc->attr[n].hw = -1;
-		alloc_reg(pc, &pc->attr[n]);
+		for (chn = 0; chn < 4; ++chn) {
+			unsigned s, c;
+
+			if (!(mask & (1 << chn))) /* src is not read */
+				continue;
+			c = tgsi_util_get_full_src_register_extswizzle(fs, chn);
+			s = tgsi_util_get_full_src_register_sign_mode(fs, chn);
 
-		pc->attr[n].rhw = (*aid)++;
-		emit_interp(pc, &pc->attr[n], iv, pc->interp_mode[i]);
+			if (c > TGSI_EXTSWIZZLE_W ||
+			    !(fd->DstRegister.WriteMask & (1 << c)))
+				continue;
 
-		pc->p->cfg.fp.map[(*mid) / 4] |= oid << (8 * ((*mid) % 4));
-		(*mid)++;
-		pc->p->cfg.fp.regs[1] += 0x00010001;
+			/* no danger if src is copied to TEMP first */
+			if ((s != TGSI_UTIL_SIGN_KEEP) &&
+			    (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp))
+				continue;
+
+			rdep[c] |= nv50_tgsi_dst_revdep(
+				insn->Instruction.Opcode, i, chn);
+			deqs |= (1 << c);
+		}
 	}
 
-	return mask;
+	return deqs;
 }
 
 static boolean
-nv50_program_tx_prep(struct nv50_pc *pc)
+nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 {
-	struct tgsi_parse_context p;
-	boolean ret = FALSE;
-	unsigned i, c;
-	unsigned fcol, bcol, fcrd, depr;
+	struct tgsi_full_instruction insn = tok->FullInstruction;
+	const struct tgsi_full_dst_register *fd;
+	unsigned i, deqs, rdep[4], m[4];
+
+	fd = &tok->FullInstruction.FullDstRegisters[0];
+	deqs = nv50_tgsi_scan_swizzle(&insn, rdep);
+
+	if (is_scalar_op(insn.Instruction.Opcode)) {
+		pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
+		if (!pc->r_brdc)
+			pc->r_brdc = temp_temp(pc);
+		return nv50_program_tx_insn(pc, &insn);
+	}
+	pc->r_brdc = NULL;
+
+	if (!deqs)
+		return nv50_program_tx_insn(pc, &insn);
+
+	deqs = nv50_revdep_reorder(m, rdep);
+
+	for (i = 0; i < 4; ++i) {
+		assert(pc->r_dst[m[i]] == NULL);
+
+		insn.FullDstRegisters[0].DstRegister.WriteMask =
+			fd->DstRegister.WriteMask & (1 << m[i]);
+
+		if (!insn.FullDstRegisters[0].DstRegister.WriteMask)
+			continue;
+
+		if (deqs & (1 << i))
+			pc->r_dst[m[i]] = alloc_temp(pc, NULL);
+
+		if (!nv50_program_tx_insn(pc, &insn))
+			return FALSE;
+	}
 
-	/* count (centroid) perspective interpolations */
-	unsigned centroid_loads = 0;
-	unsigned perspect_loads = 0;
+	for (i = 0; i < 4; i++) {
+		struct nv50_reg *reg = pc->r_dst[i];
+		if (!reg)
+			continue;
+		pc->r_dst[i] = NULL;
+
+		if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE)
+			emit_sat(pc, tgsi_dst(pc, i, fd), reg);
+		else
+			emit_mov(pc, tgsi_dst(pc, i, fd), reg);
+		free_temp(pc, reg);
+	}
 
-	/* track register access for temps and attrs */
-	unsigned *r_usage[2];
-	r_usage[0] = NULL;
-	r_usage[1] = NULL;
+	return TRUE;
+}
 
-	depr = fcol = bcol = fcrd = 0xffff;
+static void
+load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
+{
+	struct nv50_reg *iv, **ppiv;
+	unsigned mode = pc->interp_mode[reg->index];
 
-	if (pc->p->type == PIPE_SHADER_FRAGMENT) {
-		pc->p->cfg.fp.regs[0] = 0x01000404;
-		pc->p->cfg.fp.regs[1] = 0x00000400;
+	ppiv = (mode & INTERP_CENTROID) ? &pc->iv_c : &pc->iv_p;
+	iv = *ppiv;
+
+	if ((mode & INTERP_PERSPECTIVE) && !iv) {
+		iv = *ppiv = alloc_temp(pc, NULL);
+		iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1;
+
+		emit_interp(pc, iv, NULL, mode & INTERP_CENTROID);
+		emit_flop(pc, 0, iv, iv);
+
+		/* XXX: when loading interpolants dynamically, move these
+		 * to the program head, or make sure it can't be skipped.
+		 */
 	}
 
-	tgsi_parse_init(&p, pc->p->pipe.tokens);
-	while (!tgsi_parse_end_of_tokens(&p)) {
-		const union tgsi_full_token *tok = &p.FullToken;
+	emit_interp(pc, reg, iv, mode);
+}
+
+static boolean
+nv50_program_tx_prep(struct nv50_pc *pc)
+{
+	struct tgsi_parse_context tp;
+	struct nv50_program *p = pc->p;
+	boolean ret = FALSE;
+	unsigned i, c, flat_nr = 0;
+
+	tgsi_parse_init(&tp, pc->p->pipe.tokens);
+	while (!tgsi_parse_end_of_tokens(&tp)) {
+		const union tgsi_full_token *tok = &tp.FullToken;
 
-		tgsi_parse_token(&p);
+		tgsi_parse_token(&tp);
 		switch (tok->Token.Type) {
 		case TGSI_TOKEN_TYPE_IMMEDIATE:
 		{
 			const struct tgsi_full_immediate *imm =
-				&p.FullToken.FullImmediate;
+				&tp.FullToken.FullImmediate;
 
 			ctor_immd(pc, imm->u[0].Float,
 				      imm->u[1].Float,
@@ -1820,78 +2198,61 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 		case TGSI_TOKEN_TYPE_DECLARATION:
 		{
 			const struct tgsi_full_declaration *d;
-			unsigned last, first, mode;
+			unsigned si, last, first, mode;
 
-			d = &p.FullToken.FullDeclaration;
+			d = &tp.FullToken.FullDeclaration;
 			first = d->DeclarationRange.First;
 			last = d->DeclarationRange.Last;
 
 			switch (d->Declaration.File) {
 			case TGSI_FILE_TEMPORARY:
-				if (pc->temp_nr < (last + 1))
-					pc->temp_nr = last + 1;
 				break;
 			case TGSI_FILE_OUTPUT:
-				if (pc->result_nr < (last + 1))
-					pc->result_nr = last + 1;
-
-				if (!d->Declaration.Semantic)
+				if (!d->Declaration.Semantic ||
+				    p->type == PIPE_SHADER_FRAGMENT)
 					break;
 
+				si = d->Semantic.SemanticIndex;
 				switch (d->Semantic.SemanticName) {
-				case TGSI_SEMANTIC_POSITION:
-					depr = first;
-					pc->p->cfg.fp.regs[2] |= 0x00000100;
-					pc->p->cfg.fp.regs[3] |= 0x00000011;
+				case TGSI_SEMANTIC_BCOLOR:
+					p->cfg.two_side[si].hw = first;
+					if (p->cfg.io_nr > first)
+						p->cfg.io_nr = first;
+					break;
+				case TGSI_SEMANTIC_PSIZE:
+					p->cfg.psiz = first;
+					if (p->cfg.io_nr > first)
+						p->cfg.io_nr = first;
 					break;
+					/*
+				case TGSI_SEMANTIC_CLIP_DISTANCE:
+					p->cfg.clpd = MIN2(p->cfg.clpd, first);
+					break;
+					*/
 				default:
 					break;
 				}
-
 				break;
 			case TGSI_FILE_INPUT:
 			{
-				if (pc->attr_nr < (last + 1))
-					pc->attr_nr = last + 1;
-
-				if (pc->p->type != PIPE_SHADER_FRAGMENT)
+				if (p->type != PIPE_SHADER_FRAGMENT)
 					break;
 
 				switch (d->Declaration.Interpolate) {
 				case TGSI_INTERPOLATE_CONSTANT:
 					mode = INTERP_FLAT;
+					flat_nr++;
 					break;
 				case TGSI_INTERPOLATE_PERSPECTIVE:
 					mode = INTERP_PERSPECTIVE;
+					p->cfg.regs[1] |= 0x08 << 24;
 					break;
 				default:
 					mode = INTERP_LINEAR;
 					break;
 				}
-
-				if (d->Declaration.Semantic) {
-					switch (d->Semantic.SemanticName) {
-					case TGSI_SEMANTIC_POSITION:
-						fcrd = first;
-						break;
-					case TGSI_SEMANTIC_COLOR:
-						fcol = first;
-						mode = INTERP_PERSPECTIVE;
-						break;
-					case TGSI_SEMANTIC_BCOLOR:
-						bcol = first;
-						mode = INTERP_PERSPECTIVE;
-						break;
-					}
-				}
-
-				if (d->Declaration.Centroid) {
+				if (d->Declaration.Centroid)
 					mode |= INTERP_CENTROID;
-					if (mode & INTERP_PERSPECTIVE)
-						centroid_loads++;
-				} else
-				if (mode & INTERP_PERSPECTIVE)
-					perspect_loads++;
 
 				assert(last < 32);
 				for (i = first; i <= last; i++)
@@ -1899,8 +2260,6 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 			}
 				break;
 			case TGSI_FILE_CONSTANT:
-				if (pc->param_nr < (last + 1))
-					pc->param_nr = last + 1;
 				break;
 			case TGSI_FILE_SAMPLER:
 				break;
@@ -1913,182 +2272,157 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 			break;
 		case TGSI_TOKEN_TYPE_INSTRUCTION:
 			pc->insn_nr++;
-			prep_inspect_insn(pc, tok, r_usage);
+			prep_inspect_insn(pc, &tok->FullInstruction);
 			break;
 		default:
 			break;
 		}
 	}
 
-	if (pc->temp_nr) {
-		pc->temp = CALLOC(pc->temp_nr * 4, sizeof(struct nv50_reg));
-		if (!pc->temp)
-			goto out_err;
+	if (p->type == PIPE_SHADER_VERTEX) {
+		int rid = 0;
 
-		for (i = 0; i < pc->temp_nr; i++) {
-			for (c = 0; c < 4; c++) {
-				pc->temp[i*4+c].type = P_TEMP;
-				pc->temp[i*4+c].hw = -1;
-				pc->temp[i*4+c].rhw = -1;
-				pc->temp[i*4+c].index = i;
-				pc->temp[i*4+c].acc = r_usage[0][i*4+c];
+		for (i = 0; i < pc->attr_nr * 4; ++i) {
+			if (pc->attr[i].acc) {
+				pc->attr[i].hw = rid++;
+				p->cfg.attr[i / 32] |= 1 << (i % 32);
 			}
 		}
-	}
-
-	if (pc->attr_nr) {
-		int oid = 4, mid = 4, aid = 0;
-		/* oid = VP output id
-		 * aid = FP attribute/interpolant id
-		 * mid = VP output mapping field ID
-		 */
 
-		pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg));
-		if (!pc->attr)
-			goto out_err;
-
-		if (pc->p->type == PIPE_SHADER_FRAGMENT) {
-			/* position should be loaded first */
-			if (fcrd != 0xffff) {
-				unsigned mask;
-				mid = 0;
-				mask = load_fp_attrib(pc, fcrd, r_usage[1],
-						      &mid, &aid, &oid);
-				oid = 0;
-				pc->p->cfg.fp.regs[1] |= (mask << 24);
-				pc->p->cfg.fp.map[0] = 0x04040404 * fcrd;
-			}
-			pc->p->cfg.fp.map[0] += 0x03020100;
-
-			/* should do MAD fcrd.xy, fcrd, SOME_CONST, fcrd */
-
-			if (perspect_loads) {
-				pc->iv_p = alloc_temp(pc, NULL);
-
-				if (!(pc->p->cfg.fp.regs[1] & 0x08000000)) {
-					pc->p->cfg.fp.regs[1] |= 0x08000000;
-					pc->iv_p->rhw = aid++;
-					emit_interp(pc, pc->iv_p, NULL,
-						    INTERP_LINEAR);
-					emit_flop(pc, 0, pc->iv_p, pc->iv_p);
-				} else {
-					pc->iv_p->rhw = aid - 1;
-					emit_flop(pc, 0, pc->iv_p,
-						  &pc->attr[fcrd * 4 + 3]);
-				}
-			}
+		for (i = 0, rid = 0; i < pc->result_nr; ++i) {
+			p->cfg.io[i].hw = rid;
+			p->cfg.io[i].id_vp = i;
 
-			if (centroid_loads) {
-				pc->iv_c = alloc_temp(pc, NULL);
-				pc->iv_c->rhw = pc->iv_p ? aid - 1 : aid++;
-				emit_interp(pc, pc->iv_c, NULL,
-					    INTERP_CENTROID);
-				emit_flop(pc, 0, pc->iv_c, pc->iv_c);
-				pc->p->cfg.fp.regs[1] |= 0x08000000;
+			for (c = 0; c < 4; ++c) {
+				int n = i * 4 + c;
+				if (!pc->result[n].acc)
+					continue;
+				pc->result[n].hw = rid++;
+				p->cfg.io[i].mask |= 1 << c;
 			}
+		}
 
-			for (c = 0; c < 4; c++) {
-				/* I don't know what these values do, but
-				 * let's set them like the blob does:
-				 */
-				if (fcol != 0xffff && r_usage[1][fcol * 4 + c])
-					pc->p->cfg.fp.regs[0] += 0x00010000;
-				if (bcol != 0xffff && r_usage[1][bcol * 4 + c])
-					pc->p->cfg.fp.regs[0] += 0x00010000;
-			}
+		for (c = 0; c < 2; ++c)
+			if (p->cfg.two_side[c].hw < 0x40)
+				p->cfg.two_side[c] = p->cfg.io[
+					p->cfg.two_side[c].hw];
 
-			for (i = 0; i < pc->attr_nr; i++)
-				load_fp_attrib(pc, i, r_usage[1],
-					       &mid, &aid, &oid);
+		if (p->cfg.psiz < 0x40)
+			p->cfg.psiz = p->cfg.io[p->cfg.psiz].hw;
+	} else
+	if (p->type == PIPE_SHADER_FRAGMENT) {
+		int rid, aid;
+		unsigned n = 0, m = pc->attr_nr - flat_nr;
 
-			if (pc->iv_p)
-				free_temp(pc, pc->iv_p);
-			if (pc->iv_c)
-				free_temp(pc, pc->iv_c);
+		int base = (TGSI_SEMANTIC_POSITION ==
+			    p->info.input_semantic_name[0]) ? 0 : 1;
 
-			pc->p->cfg.fp.high_map = (mid / 4);
-			pc->p->cfg.fp.high_map += ((mid % 4) ? 1 : 0);
-		} else {
-			/* vertex program */
-			for (i = 0; i < pc->attr_nr * 4; i++) {
-				pc->p->cfg.vp.attr[aid / 32] |=
-					(1 << (aid % 32));
-				pc->attr[i].type = P_ATTR;
-				pc->attr[i].hw = aid++;
-				pc->attr[i].index = i / 4;
+		/* non-flat interpolants have to be mapped to
+		 * the lower hardware IDs, so sort them:
+		 */
+		for (i = 0; i < pc->attr_nr; i++) {
+			if (pc->interp_mode[i] == INTERP_FLAT) {
+				p->cfg.io[m].id_vp = i + base;
+				p->cfg.io[m++].id_fp = i;
+			} else {
+				if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE))
+					p->cfg.io[n].linear = TRUE;
+				p->cfg.io[n].id_vp = i + base;
+				p->cfg.io[n++].id_fp = i;
 			}
 		}
-	}
 
-	if (pc->result_nr) {
-		int rid = 0;
+		if (!base) /* set w-coordinate mask from perspective interp */
+			p->cfg.io[0].mask |= p->cfg.regs[1] >> 24;
 
-		pc->result = CALLOC(pc->result_nr * 4, sizeof(struct nv50_reg));
-		if (!pc->result)
-			goto out_err;
+		aid = popcnt4( /* if fcrd isn't contained in cfg.io */
+			base ? (p->cfg.regs[1] >> 24) : p->cfg.io[0].mask);
 
-		for (i = 0; i < pc->result_nr; i++) {
-			for (c = 0; c < 4; c++) {
-				if (pc->p->type == PIPE_SHADER_FRAGMENT) {
-					pc->result[i*4+c].type = P_TEMP;
-					pc->result[i*4+c].hw = -1;
-					pc->result[i*4+c].rhw = (i == depr) ?
-						-1 : rid++;
-				} else {
-					pc->result[i*4+c].type = P_RESULT;
-					pc->result[i*4+c].hw = rid++;
-				}
-				pc->result[i*4+c].index = i;
-			}
+		for (n = 0; n < pc->attr_nr; ++n) {
+			p->cfg.io[n].hw = rid = aid;
+			i = p->cfg.io[n].id_fp;
 
-			if (pc->p->type == PIPE_SHADER_FRAGMENT &&
-			    depr != 0xffff) {
-				pc->result[depr * 4 + 2].rhw =
-					(pc->result_nr - 1) * 4;
+			for (c = 0; c < 4; ++c) {
+				if (!pc->attr[i * 4 + c].acc)
+					continue;
+				pc->attr[i * 4 + c].rhw = rid++;
+				p->cfg.io[n].mask |= 1 << c;
+
+				load_interpolant(pc, &pc->attr[i * 4 + c]);
 			}
+			aid += popcnt4(p->cfg.io[n].mask);
 		}
-	}
 
-	if (pc->param_nr) {
-		int rid = 0;
+		if (!base)
+			p->cfg.regs[1] |= p->cfg.io[0].mask << 24;
 
-		pc->param = CALLOC(pc->param_nr * 4, sizeof(struct nv50_reg));
-		if (!pc->param)
-			goto out_err;
+		m = popcnt4(p->cfg.regs[1] >> 24);
+
+		/* set count of non-position inputs and of non-flat
+		 * non-position inputs for FP_INTERPOLANT_CTRL
+		 */
+		p->cfg.regs[1] |= aid - m;
+
+		if (flat_nr) {
+			i = p->cfg.io[pc->attr_nr - flat_nr].hw;
+			p->cfg.regs[1] |= (i - m) << 16;
+		} else
+			p->cfg.regs[1] |= p->cfg.regs[1] << 16;
+
+		/* mark color semantic for light-twoside */
+		n = 0x40;
+		for (i = 0; i < pc->attr_nr; i++) {
+			ubyte si, sn;
 
-		for (i = 0; i < pc->param_nr; i++) {
-			for (c = 0; c < 4; c++) {
-				pc->param[i*4+c].type = P_CONST;
-				pc->param[i*4+c].hw = rid++;
-				pc->param[i*4+c].index = i;
+			sn = p->info.input_semantic_name[p->cfg.io[i].id_fp];
+			si = p->info.input_semantic_index[p->cfg.io[i].id_fp];
+
+			if (sn == TGSI_SEMANTIC_COLOR) {
+				p->cfg.two_side[si] = p->cfg.io[i];
+
+				/* increase colour count */
+				p->cfg.regs[0] += popcnt4(
+					p->cfg.two_side[si].mask) << 16;
+
+				n = MIN2(n, p->cfg.io[i].hw - m);
 			}
 		}
+		if (n < 0x40)
+			p->cfg.regs[0] += n;
+
+		/* Initialize FP results:
+		 * FragDepth is always first TGSI and last hw output
+		 */
+		i = p->info.writes_z ? 4 : 0;
+		for (rid = 0; i < pc->result_nr * 4; i++)
+			pc->result[i].rhw = rid++;
+		if (p->info.writes_z)
+			pc->result[2].rhw = rid;
+
+		p->cfg.high_result = rid;
 	}
 
 	if (pc->immd_nr) {
 		int rid = 0;
 
-		pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg));
+		pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg));
 		if (!pc->immd)
 			goto out_err;
 
 		for (i = 0; i < pc->immd_nr; i++) {
-			for (c = 0; c < 4; c++) {
-				pc->immd[i*4+c].type = P_IMMD;
-				pc->immd[i*4+c].hw = rid++;
-				pc->immd[i*4+c].index = i;
-			}
+			for (c = 0; c < 4; c++, rid++)
+				ctor_reg(&pc->immd[rid], P_IMMD, i, rid);
 		}
 	}
 
 	ret = TRUE;
 out_err:
-	if (r_usage[0])
-		FREE(r_usage[0]);
-	if (r_usage[1])
-		FREE(r_usage[1]);
+	if (pc->iv_p)
+		free_temp(pc, pc->iv_p);
+	if (pc->iv_c)
+		free_temp(pc, pc->iv_c);
 
-	tgsi_parse_free(&p);
+	tgsi_parse_free(&tp);
 	return ret;
 }
 
@@ -2110,18 +2444,165 @@ free_nv50_pc(struct nv50_pc *pc)
 }
 
 static boolean
+ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
+{
+	int i, c;
+	unsigned rtype[2] = { P_ATTR, P_RESULT };
+
+	pc->p = p;
+	pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1;
+	pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1;
+	pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1;
+	pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1;
+
+	p->cfg.high_temp = 4;
+
+	p->cfg.two_side[0].hw = 0x40;
+	p->cfg.two_side[1].hw = 0x40;
+
+	switch (p->type) {
+	case PIPE_SHADER_VERTEX:
+		p->cfg.psiz = 0x40;
+		p->cfg.clpd = 0x40;
+		p->cfg.io_nr = pc->result_nr;
+		break;
+	case PIPE_SHADER_FRAGMENT:
+		rtype[0] = rtype[1] = P_TEMP;
+
+		p->cfg.regs[0] = 0x01000004;
+		p->cfg.io_nr = pc->attr_nr;
+
+		if (p->info.writes_z) {
+			p->cfg.regs[2] |= 0x00000100;
+			p->cfg.regs[3] |= 0x00000011;
+		}
+		if (p->info.uses_kill)
+			p->cfg.regs[2] |= 0x00100000;
+		break;
+	}
+
+	if (pc->temp_nr) {
+		pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg));
+		if (!pc->temp)
+			return FALSE;
+
+		for (i = 0; i < pc->temp_nr * 4; ++i)
+			ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1);
+	}
+
+	if (pc->attr_nr) {
+		pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg));
+		if (!pc->attr)
+			return FALSE;
+
+		for (i = 0; i < pc->attr_nr * 4; ++i)
+			ctor_reg(&pc->attr[i], rtype[0], i / 4, -1);
+	}
+
+	if (pc->result_nr) {
+		unsigned nr = pc->result_nr * 4;
+
+		pc->result = MALLOC(nr * sizeof(struct nv50_reg));
+		if (!pc->result)
+			return FALSE;
+
+		for (i = 0; i < nr; ++i)
+			ctor_reg(&pc->result[i], rtype[1], i / 4, -1);
+	}
+
+	if (pc->param_nr) {
+		int rid = 0;
+
+		pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg));
+		if (!pc->param)
+			return FALSE;
+
+		for (i = 0; i < pc->param_nr; ++i)
+			for (c = 0; c < 4; ++c, ++rid)
+				ctor_reg(&pc->param[rid], P_CONST, i, rid);
+	}
+
+	return TRUE;
+}
+
+static void
+nv50_fp_move_results(struct nv50_pc *pc)
+{
+	struct nv50_reg reg;
+	unsigned i;
+
+	ctor_reg(&reg, P_TEMP, -1, -1);
+
+	for (i = 0; i < pc->result_nr * 4; ++i) {
+		if (pc->result[i].rhw < 0 || pc->result[i].hw < 0)
+			continue;
+		if (pc->result[i].rhw != pc->result[i].hw) {
+			reg.hw = pc->result[i].rhw;
+			emit_mov(pc, &reg, &pc->result[i]);
+		}
+	}
+}
+
+static void
+nv50_program_fixup_insns(struct nv50_pc *pc)
+{
+	struct nv50_program_exec *e, *prev = NULL, **bra_list;
+	unsigned i, n, pos;
+
+	bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *));
+
+	/* Collect branch instructions, we need to adjust their offsets
+	 * when converting 32 bit instructions to 64 bit ones
+	 */
+	for (n = 0, e = pc->p->exec_head; e; e = e->next)
+		if (e->param.index >= 0 && !e->param.mask)
+			bra_list[n++] = e;
+
+	/* Make sure we don't have any single 32 bit instructions. */
+	for (e = pc->p->exec_head, pos = 0; e; e = e->next) {
+		pos += is_long(e) ? 2 : 1;
+
+		if ((pos & 1) && (!e->next || is_long(e->next))) {
+			for (i = 0; i < n; ++i)
+				if (bra_list[i]->param.index >= pos)
+					bra_list[i]->param.index += 1;
+			convert_to_long(pc, e);
+			++pos;
+		}
+		if (e->next)
+			prev = e;
+	}
+
+	assert(!is_immd(pc->p->exec_head));
+	assert(!is_immd(pc->p->exec_tail));
+
+	/* last instruction must be long so it can have the end bit set */
+	if (!is_long(pc->p->exec_tail)) {
+		convert_to_long(pc, pc->p->exec_tail);
+		if (prev)
+			convert_to_long(pc, prev);
+	}
+	assert(!(pc->p->exec_tail->inst[1] & 2));
+	/* set the end-bit */
+	pc->p->exec_tail->inst[1] |= 1;
+
+	FREE(bra_list);
+}
+
+static boolean
 nv50_program_tx(struct nv50_program *p)
 {
 	struct tgsi_parse_context parse;
 	struct nv50_pc *pc;
-	unsigned k;
 	boolean ret;
 
 	pc = CALLOC_STRUCT(nv50_pc);
 	if (!pc)
 		return FALSE;
-	pc->p = p;
-	pc->p->cfg.high_temp = 4;
+
+	ret = ctor_nv50_pc(pc, p);
+	if (ret == FALSE)
+		goto out_cleanup;
 
 	ret = nv50_program_tx_prep(pc);
 	if (ret == FALSE)
@@ -2141,7 +2622,7 @@ nv50_program_tx(struct nv50_program *p)
 		switch (tok->Token.Type) {
 		case TGSI_TOKEN_TYPE_INSTRUCTION:
 			++pc->insn_cur;
-			ret = nv50_program_tx_insn(pc, tok);
+			ret = nv50_tgsi_insn(pc, tok);
 			if (ret == FALSE)
 				goto out_err;
 			break;
@@ -2150,48 +2631,10 @@ nv50_program_tx(struct nv50_program *p)
 		}
 	}
 
-	if (p->type == PIPE_SHADER_FRAGMENT) {
-		struct nv50_reg out;
-
-		out.type = P_TEMP;
-		for (k = 0; k < pc->result_nr * 4; k++) {
-			if (pc->result[k].rhw == -1)
-				continue;
-			if (pc->result[k].hw != pc->result[k].rhw) {
-				out.hw = pc->result[k].rhw;
-				emit_mov(pc, &out, &pc->result[k]);
-			}
-			if (pc->p->cfg.high_result < (pc->result[k].rhw + 1))
-				pc->p->cfg.high_result = pc->result[k].rhw + 1;
-		}
-	}
-
-	/* look for single half instructions and make them long */
-	struct nv50_program_exec *e, *e_prev;
+	if (pc->p->type == PIPE_SHADER_FRAGMENT)
+		nv50_fp_move_results(pc);
 
-	for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) {
-		if (!is_long(e))
-			k++;
-
-		if (!e->next || is_long(e->next)) {
-			if (k & 1)
-				convert_to_long(pc, e);
-			k = 0;
-		}
-
-		if (e->next)
-			e_prev = e;
-	}
-
-	if (!is_long(pc->p->exec_tail)) {
-		/* this may occur if moving FP results */
-		assert(e_prev && !is_long(e_prev));
-		convert_to_long(pc, e_prev);
-		convert_to_long(pc, pc->p->exec_tail);
-	}
-
-	assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));
-	pc->p->exec_tail->inst[1] |= 0x00000001;
+	nv50_program_fixup_insns(pc);
 
 	p->param_nr = pc->param_nr * 4;
 	p->immd_nr = pc->immd_nr * 4;
@@ -2258,30 +2701,19 @@ nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
 					 p->immd_nr, NV50_CB_PMISC);
 	}
 
-	if (!p->data[1] && p->param_nr) {
-		struct nouveau_resource *heap =
-			nv50->screen->parm_heap[p->type];
-
-		if (nouveau_resource_alloc(heap, p->param_nr, p, &p->data[1])) {
-			while (heap->next && heap->size < p->param_nr) {
-				struct nv50_program *evict = heap->next->priv;
-				nouveau_resource_free(&evict->data[1]);
-			}
-
-			if (nouveau_resource_alloc(heap, p->param_nr, p,
-						   &p->data[1]))
-				assert(0);
-		}
-	}
+	assert(p->param_nr <= 128);
 
 	if (p->param_nr) {
-		unsigned cbuf = NV50_CB_PVP;
+		unsigned cb;
 		float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
 					     PIPE_BUFFER_USAGE_CPU_READ);
-		if (p->type == PIPE_SHADER_FRAGMENT)
-			cbuf = NV50_CB_PFP;
-		nv50_program_upload_data(nv50, map, p->data[1]->start,
-					 p->param_nr, cbuf);
+
+		if (p->type == PIPE_SHADER_VERTEX)
+			cb = NV50_CB_PVP;
+		else
+			cb = NV50_CB_PFP;
+
+		nv50_program_upload_data(nv50, map, 0, p->param_nr, cb);
 		pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
 	}
 }
@@ -2303,32 +2735,41 @@ nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
 		upload = TRUE;
 	}
 
-	if ((p->data[0] && p->data[0]->start != p->data_start[0]) ||
-		(p->data[1] && p->data[1]->start != p->data_start[1])) {
-		for (e = p->exec_head; e; e = e->next) {
-			unsigned ei, ci, bs;
+	if (p->data[0] && p->data[0]->start != p->data_start[0])
+		upload = TRUE;
 
-			if (e->param.index < 0)
-				continue;
-			bs = (e->inst[1] >> 22) & 0x07;
-			assert(bs < 2);
-			ei = e->param.shift >> 5;
-			ci = e->param.index + p->data[bs]->start;
+	if (!upload)
+		return;
+
+	for (e = p->exec_head; e; e = e->next) {
+		unsigned ei, ci, bs;
+
+		if (e->param.index < 0)
+			continue;
+
+		if (e->param.mask == 0) {
+			assert(!(e->param.index & 1));
+			/* seem to be 8 byte steps */
+			ei = (e->param.index >> 1) + 0 /* START_ID */;
 
-			e->inst[ei] &= ~e->param.mask;
-			e->inst[ei] |= (ci << e->param.shift);
+			e->inst[0] &= 0xf0000fff;
+			e->inst[0] |= ei << 12;
+			continue;
 		}
 
-		if (p->data[0])
-			p->data_start[0] = p->data[0]->start;
-		if (p->data[1])
-			p->data_start[1] = p->data[1]->start;
+		bs = (e->inst[1] >> 22) & 0x07;
+		assert(bs < 2);
+		ei = e->param.shift >> 5;
+		ci = e->param.index;
+		if (bs == 0)
+			ci += p->data[bs]->start;
 
-		upload = TRUE;
+		e->inst[ei] &= ~e->param.mask;
+		e->inst[ei] |= (ci << e->param.shift);
 	}
 
-	if (!upload)
-		return;
+	if (p->data[0])
+		p->data_start[0] = p->data[0]->start;
 
 #ifdef NV50_PROGRAM_DUMP
 	NOUVEAU_ERR("-------\n");
@@ -2402,8 +2843,8 @@ nv50_vertprog_validate(struct nv50_context *nv50)
 	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
 		      NOUVEAU_BO_LOW, 0, 0);
 	so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
-	so_data  (so, p->cfg.vp.attr[0]);
-	so_data  (so, p->cfg.vp.attr[1]);
+	so_data  (so, p->cfg.attr[0]);
+	so_data  (so, p->cfg.attr[1]);
 	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
 	so_data  (so, p->cfg.high_result);
 	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2);
@@ -2421,7 +2862,6 @@ nv50_fragprog_validate(struct nv50_context *nv50)
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 	struct nv50_program *p = nv50->fragprog;
 	struct nouveau_stateobj *so;
-	unsigned i;
 
 	if (!p->translated) {
 		nv50_program_validate(nv50, p);
@@ -2438,29 +2878,186 @@ nv50_fragprog_validate(struct nv50_context *nv50)
 		      NOUVEAU_BO_HIGH, 0, 0);
 	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
 		      NOUVEAU_BO_LOW, 0, 0);
-	so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
-	so_data  (so, p->cfg.fp.regs[0]); /* 0x01000404 / 0x00040404 */
-	so_data  (so, 0x00000004);
-	so_data  (so, 0x00000000);
-	so_data  (so, 0x00000000);
-	so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), p->cfg.fp.high_map);
-	for (i = 0; i < p->cfg.fp.high_map; i++)
-		so_data(so, p->cfg.fp.map[i]);
-	so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 2);
-	so_data  (so, p->cfg.fp.regs[1]); /* 0x08040404 / 0x0f000401 */
+	so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1);
 	so_data  (so, p->cfg.high_temp);
 	so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
 	so_data  (so, p->cfg.high_result);
 	so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1);
-	so_data  (so, p->cfg.fp.regs[2]);
+	so_data  (so, p->cfg.regs[2]);
 	so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
-	so_data  (so, p->cfg.fp.regs[3]);
+	so_data  (so, p->cfg.regs[3]);
 	so_method(so, tesla, NV50TCL_FP_START_ID, 1);
 	so_data  (so, 0); /* program start offset */
 	so_ref(so, &nv50->state.fragprog);
 	so_ref(NULL, &so);
 }
 
+static void
+nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
+{
+	struct nv50_program *fp = nv50->fragprog;
+	struct nv50_program *vp = nv50->vertprog;
+	unsigned i, c, m = base;
+
+	/* XXX: This can't work correctly in all cases yet, we either
+	 * have to create TGSI_SEMANTIC_PNTC or sprite_coord_mode has
+	 * to be per FP input instead of per VP output
+	 */
+	memset(pntc, 0, 8 * sizeof(uint32_t));
+
+	for (i = 0; i < fp->cfg.io_nr; i++) {
+		uint8_t sn, si;
+		uint8_t j = fp->cfg.io[i].id_vp, k = fp->cfg.io[i].id_fp;
+		unsigned n = popcnt4(fp->cfg.io[i].mask);
+
+		if (fp->info.input_semantic_name[k] != TGSI_SEMANTIC_GENERIC) {
+			m += n;
+			continue;
+		}
+
+		sn = vp->info.input_semantic_name[j];
+		si = vp->info.input_semantic_index[j];
+
+		if (j < fp->cfg.io_nr && sn == TGSI_SEMANTIC_GENERIC) {
+			ubyte mode =
+				nv50->rasterizer->pipe.sprite_coord_mode[si];
+
+			if (mode == PIPE_SPRITE_COORD_NONE) {
+				m += n;
+				continue;
+			}
+		}
+
+		/* this is either PointCoord or replaced by sprite coords */
+		for (c = 0; c < 4; c++) {
+			if (!(fp->cfg.io[i].mask & (1 << c)))
+				continue;
+			pntc[m / 8] |= (c + 1) << ((m % 8) * 4);
+			++m;
+		}
+	}
+}
+
+static int
+nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4],
+	       struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo)
+{
+	int c;
+	uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw;
+	uint8_t *map = (uint8_t *)p_map;
+
+	for (c = 0; c < 4; ++c) {
+		if (mf & 1) {
+			if (fpi->linear == TRUE)
+				lin[mid / 32] |= 1 << (mid % 32);
+			map[mid++] = (mv & 1) ? oid : ((c == 3) ? 0x41 : 0x40);
+		}
+
+		oid += mv & 1;
+		mf >>= 1;
+		mv >>= 1;
+	}
+
+	return mid;
+}
+
+void
+nv50_linkage_validate(struct nv50_context *nv50)
+{
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nv50_program *vp = nv50->vertprog;
+	struct nv50_program *fp = nv50->fragprog;
+	struct nouveau_stateobj *so;
+	struct nv50_sreg4 dummy, *vpo;
+	int i, n, c, m = 0;
+	uint32_t map[16], lin[4], reg[5], pcrd[8];
+
+	memset(map, 0, sizeof(map));
+	memset(lin, 0, sizeof(lin));
+
+	reg[1] = 0x00000004; /* low and high clip distance map ids */
+	reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */
+	reg[3] = 0x00000000; /* point size map id & enable */
+	reg[0] = fp->cfg.regs[0]; /* colour semantic reg */
+	reg[4] = fp->cfg.regs[1]; /* interpolant info */
+
+	dummy.linear = FALSE;
+	dummy.mask = 0xf; /* map all components of HPOS */
+	m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]);
+
+	dummy.mask = 0x0;
+
+	if (vp->cfg.clpd < 0x40) {
+		for (c = 0; c < vp->cfg.clpd_nr; ++c)
+			map[m++] = vp->cfg.clpd + c;
+		reg[1] = (m << 8);
+	}
+
+	reg[0] |= m << 8; /* adjust BFC0 id */
+
+	/* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */
+	if (nv50->rasterizer->pipe.light_twoside) {
+		vpo = &vp->cfg.two_side[0];
+
+		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[0], &vpo[0]);
+		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[1], &vpo[1]);
+	}
+
+	reg[0] += m - 4; /* adjust FFC0 id */
+	reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */
+
+	i = 0;
+	if (fp->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION)
+		i = 1;
+	for (; i < fp->cfg.io_nr; i++) {
+		ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id_fp];
+		ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id_fp];
+
+		n = fp->cfg.io[i].id_vp;
+		if (n >= vp->cfg.io_nr ||
+		    vp->info.output_semantic_name[n] != sn ||
+		    vp->info.output_semantic_index[n] != si)
+			vpo = &dummy;
+		else
+			vpo = &vp->cfg.io[n];
+
+		m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo);
+	}
+
+	if (nv50->rasterizer->pipe.point_size_per_vertex) {
+		map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8);
+		reg[3] = (m++ << 4) | 1;
+	}
+
+	/* now fill the stateobj */
+	so = so_new(64, 0);
+
+	n = (m + 3) / 4;
+	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
+	so_data  (so, m);
+	so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n);
+	so_datap (so, map, n);
+
+	so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
+	so_datap (so, reg, 4);
+
+	so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1);
+	so_data  (so, reg[4]);
+
+	so_method(so, tesla, 0x1540, 4);
+	so_datap (so, lin, 4);
+
+	if (nv50->rasterizer->pipe.point_sprite) {
+		nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff);
+
+		so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8);
+		so_datap (so, pcrd, 8);
+	}
+
+        so_ref(so, &nv50->state.programs);
+        so_ref(NULL, &so);
+}
+
 void
 nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
 {
@@ -2476,7 +3073,6 @@ nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
 	nouveau_bo_ref(NULL, &p->bo);
 
 	nouveau_resource_free(&p->data[0]);
-	nouveau_resource_free(&p->data[1]);
 
 	p->translated = 0;
 }
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h
index 096e0476aa..d78dee083f 100644
--- a/src/gallium/drivers/nv50/nv50_program.h
+++ b/src/gallium/drivers/nv50/nv50_program.h
@@ -15,6 +15,15 @@ struct nv50_program_exec {
 	} param;
 };
 
+struct nv50_sreg4 {
+	uint8_t hw;
+	uint8_t id_vp;
+	uint8_t id_fp;
+
+	uint8_t mask;
+	boolean linear;
+};
+
 struct nv50_program {
 	struct pipe_shader_state pipe;
 	struct tgsi_shader_info info;
@@ -24,8 +33,8 @@ struct nv50_program {
 	struct nv50_program_exec *exec_head;
 	struct nv50_program_exec *exec_tail;
 	unsigned exec_size;
-	struct nouveau_resource *data[2];
-	unsigned data_start[2];
+	struct nouveau_resource *data[1];
+	unsigned data_start[1];
 
 	struct nouveau_bo *bo;
 
@@ -36,14 +45,20 @@ struct nv50_program {
 	struct {
 		unsigned high_temp;
 		unsigned high_result;
-		struct {
-			unsigned attr[2];
-		} vp;
-		struct {
-			unsigned regs[4];
-			unsigned map[5];
-			unsigned high_map;
-		} fp;
+
+		uint32_t attr[2];
+		uint32_t regs[4];
+
+		/* for VPs, io_nr doesn't count 'private' results (PSIZ etc.) */
+		unsigned io_nr;
+		struct nv50_sreg4 io[PIPE_MAX_SHADER_OUTPUTS];
+
+		/* FP colour inputs, VP/GP back colour outputs */
+		struct nv50_sreg4 two_side[2];
+
+		/* VP only */
+		uint8_t clpd, clpd_nr;
+		uint8_t psiz;
 	} cfg;
 };
 
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
index c7f80a2203..3b08e1b89f 100644
--- a/src/gallium/drivers/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -87,12 +87,10 @@ nv50_screen_get_param(struct pipe_screen *pscreen, int param)
 		return 1;
 	case PIPE_CAP_GLSL:
 		return 0;
-	case PIPE_CAP_S3TC:
-		return 1;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 		return 1;
 	case PIPE_CAP_POINT_SPRITE:
-		return 0;
+		return 1;
 	case PIPE_CAP_MAX_RENDER_TARGETS:
 		return 8;
 	case PIPE_CAP_OCCLUSION_QUERY:
diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c
index 4283808ed9..81fa3e34c5 100644
--- a/src/gallium/drivers/nv50/nv50_state.c
+++ b/src/gallium/drivers/nv50/nv50_state.c
@@ -276,6 +276,9 @@ nv50_rasterizer_state_create(struct pipe_context *pipe,
 	so_method(so, tesla, 0x1684, 1);
 	so_data  (so, cso->flatshade_first ? 0 : 1);
 
+	so_method(so, tesla, NV50TCL_VERTEX_TWO_SIDE_ENABLE, 1);
+	so_data  (so, cso->light_twoside);
+
 	so_method(so, tesla, NV50TCL_LINE_WIDTH, 1);
 	so_data  (so, fui(cso->line_width));
 	so_method(so, tesla, NV50TCL_LINE_SMOOTH_ENABLE, 1);
@@ -294,6 +297,9 @@ nv50_rasterizer_state_create(struct pipe_context *pipe,
 	so_method(so, tesla, NV50TCL_POINT_SIZE, 1);
 	so_data  (so, fui(cso->point_size));
 
+	so_method(so, tesla, NV50TCL_POINT_SPRITE_ENABLE, 1);
+	so_data  (so, cso->point_sprite);
+
 	so_method(so, tesla, NV50TCL_POLYGON_MODE_FRONT, 3);
 	if (cso->front_winding == PIPE_WINDING_CCW) {
 		so_data(so, nvgl_polygon_mode(cso->fill_ccw));
diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c
index 344c2cf6dd..4ed76973c4 100644
--- a/src/gallium/drivers/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nv50/nv50_state_validate.c
@@ -66,7 +66,8 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 			so_data(so, NV50TCL_RT_FORMAT_X8R8G8B8_UNORM);
 			break;
 		}
-		so_data(so, bo->tile_mode << 4);
+		so_data(so, nv50_miptree(pt)->
+				level[fb->cbufs[i]->level].tile_mode << 4);
 		so_data(so, 0x00000000);
 
 		so_method(so, tesla, 0x1224, 1);
@@ -110,7 +111,8 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 			so_data(so, NV50TCL_ZETA_FORMAT_S8Z24_UNORM);
 			break;
 		}
-		so_data(so, bo->tile_mode << 4);
+		so_data(so, nv50_miptree(pt)->
+				level[fb->zsbuf->level].tile_mode << 4);
 		so_data(so, 0x00000000);
 
 		so_method(so, tesla, 0x1538, 1);
@@ -187,6 +189,8 @@ nv50_state_emit(struct nv50_context *nv50)
 		so_emit(chan, nv50->state.vertprog);
 	if (nv50->state.dirty & NV50_NEW_FRAGPROG)
 		so_emit(chan, nv50->state.fragprog);
+	if (nv50->state.dirty & (NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG))
+		so_emit(chan, nv50->state.programs);
 	if (nv50->state.dirty & NV50_NEW_RASTERIZER)
 		so_emit(chan, nv50->state.rast);
 	if (nv50->state.dirty & NV50_NEW_BLEND_COLOUR)
@@ -208,6 +212,12 @@ nv50_state_emit(struct nv50_context *nv50)
 			so_emit(chan, nv50->state.vtxattr);
 	}
 	nv50->state.dirty = 0;
+}
+
+void
+nv50_state_flush_notify(struct nouveau_channel *chan)
+{
+	struct nv50_context *nv50 = chan->user_private;
 
 	so_emit_reloc_markers(chan, nv50->state.fb);
 	so_emit_reloc_markers(chan, nv50->state.vertprog);
@@ -238,6 +248,9 @@ nv50_state_validate(struct nv50_context *nv50)
 	if (nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_FRAGPROG_CB))
 		nv50_fragprog_validate(nv50);
 
+	if (nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG))
+		nv50_linkage_validate(nv50);
+
 	if (nv50->dirty & NV50_NEW_RASTERIZER)
 		so_ref(nv50->rasterizer->so, &nv50->state.rast);
 
@@ -299,7 +312,7 @@ scissor_uptodate:
 			goto viewport_uptodate;
 		nv50->state.viewport_bypass = bypass;
 
-		so = so_new(12, 0);
+		so = so_new(14, 0);
 		if (!bypass) {
 			so_method(so, tesla, NV50TCL_VIEWPORT_TRANSLATE(0), 3);
 			so_data  (so, fui(nv50->viewport.translate[0]));
@@ -312,12 +325,21 @@ scissor_uptodate:
 
 			so_method(so, tesla, NV50TCL_VIEWPORT_TRANSFORM_EN, 1);
 			so_data  (so, 1);
+			/* 0x0000 = remove whole primitive only (xyz)
+			 * 0x1018 = remove whole primitive only (xy), clamp z
+			 * 0x1080 = clip primitive (xyz)
+			 * 0x1098 = clip primitive (xy), clamp z
+			 */
+			so_method(so, tesla, NV50TCL_VIEW_VOLUME_CLIP_CTRL, 1);
+			so_data  (so, 0x1080);
 			/* no idea what 0f90 does */
 			so_method(so, tesla, 0x0f90, 1);
 			so_data  (so, 0);
 		} else {
 			so_method(so, tesla, NV50TCL_VIEWPORT_TRANSFORM_EN, 1);
 			so_data  (so, 0);
+			so_method(so, tesla, NV50TCL_VIEW_VOLUME_CLIP_CTRL, 1);
+			so_data  (so, 0x0000);
 			so_method(so, tesla, 0x0f90, 1);
 			so_data  (so, 1);
 		}
diff --git a/src/gallium/drivers/nv50/nv50_surface.c b/src/gallium/drivers/nv50/nv50_surface.c
index b266324f58..6bf6f773b0 100644
--- a/src/gallium/drivers/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nv50/nv50_surface.c
@@ -60,13 +60,13 @@ nv50_surface_set(struct nv50_screen *screen, struct pipe_surface *ps, int dst)
  	format = nv50_format(ps->format);
  	if (format < 0)
  		return 1;
-  
+
  	if (!bo->tile_flags) {
  		BEGIN_RING(chan, eng2d, mthd, 2);
  		OUT_RING  (chan, format);
  		OUT_RING  (chan, 1);
  		BEGIN_RING(chan, eng2d, mthd + 0x14, 5);
- 		OUT_RING  (chan, mt->level[0].pitch);
+		OUT_RING  (chan, mt->level[ps->level].pitch);
  		OUT_RING  (chan, ps->width);
  		OUT_RING  (chan, ps->height);
  		OUT_RELOCh(chan, bo, ps->offset, flags);
@@ -75,7 +75,7 @@ nv50_surface_set(struct nv50_screen *screen, struct pipe_surface *ps, int dst)
  		BEGIN_RING(chan, eng2d, mthd, 5);
  		OUT_RING  (chan, format);
  		OUT_RING  (chan, 0);
- 		OUT_RING  (chan, bo->tile_mode << 4);
+		OUT_RING  (chan, mt->level[ps->level].tile_mode << 4);
  		OUT_RING  (chan, 1);
  		OUT_RING  (chan, 0);
  		BEGIN_RING(chan, eng2d, mthd + 0x18, 4);
diff --git a/src/gallium/drivers/nv50/nv50_transfer.c b/src/gallium/drivers/nv50/nv50_transfer.c
index e9c3562194..bb7731855c 100644
--- a/src/gallium/drivers/nv50/nv50_transfer.c
+++ b/src/gallium/drivers/nv50/nv50_transfer.c
@@ -89,14 +89,14 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen,
 		if (src_bo->tile_flags) {
 			BEGIN_RING(chan, m2mf,
 				NV50_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_IN, 1);
-			OUT_RING  (chan, (sy << 16) | sx);
+			OUT_RING  (chan, (sy << 16) | (sx * cpp));
 		} else {
 			src_offset += (line_count * src_pitch);
 		}
 		if (dst_bo->tile_flags) {
 			BEGIN_RING(chan, m2mf,
 				NV50_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_OUT, 1);
-			OUT_RING  (chan, (dy << 16) | dx);
+			OUT_RING  (chan, (dy << 16) | (dx * cpp));
 		} else {
 			dst_offset += (line_count * dst_pitch);
 		}
diff --git a/src/gallium/drivers/r300/Makefile b/src/gallium/drivers/r300/Makefile
index d7a2c8c462..93c2152edc 100644
--- a/src/gallium/drivers/r300/Makefile
+++ b/src/gallium/drivers/r300/Makefile
@@ -9,6 +9,7 @@ C_SOURCES = \
 	r300_chipset.c \
 	r300_clear.c \
 	r300_context.c \
+	r300_debug.c \
 	r300_emit.c \
 	r300_flush.c \
 	r300_fs.c \
diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
index da67bc29b8..9cc455135d 100644
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -22,6 +22,9 @@
 
 #include "r300_context.h"
 
+#include "r300_flush.h"
+#include "r300_state_invariant.h"
+
 static boolean r300_draw_range_elements(struct pipe_context* pipe,
                                         struct pipe_buffer* indexBuffer,
                                         unsigned indexSize,
@@ -146,6 +149,8 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
     r300->context.winsys = (struct pipe_winsys*)r300_winsys;
     r300->context.screen = r300_screen(screen);
 
+    r300_init_debug(r300);
+
     r300->context.destroy = r300_destroy_context;
 
     r300->context.clear = r300_clear;
diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h
index f78492d4aa..52b1c9a6b2 100644
--- a/src/gallium/drivers/r300/r300_context.h
+++ b/src/gallium/drivers/r300/r300_context.h
@@ -184,8 +184,15 @@ struct r300_texture {
     /* Offsets into the buffer. */
     unsigned offset[PIPE_MAX_TEXTURE_LEVELS];
 
-    /* Stride (pitch?) of this texture in bytes */
-    unsigned stride;
+    /**
+     * If non-zero, override the natural texture layout with
+     * a custom stride (in bytes).
+     *
+     * \note Mipmapping fails for textures with a non-natural layout!
+     *
+     * \sa r300_texture_get_stride
+     */
+    unsigned stride_override;
 
     /* Total size of this texture, in bytes. */
     unsigned size;
@@ -211,10 +218,7 @@ struct r300_vertex_format {
     int fs_tab[16];
 };
 
-static struct pipe_viewport_state r300_viewport_identity = {
-    .scale = {1.0, 1.0, 1.0, 1.0},
-    .translate = {0.0, 0.0, 0.0, 0.0},
-};
+extern struct pipe_viewport_state r300_viewport_identity;
 
 struct r300_context {
     /* Parent class */
@@ -275,6 +279,9 @@ struct r300_context {
     uint32_t dirty_state;
     /* Flag indicating whether or not the HW is dirty. */
     uint32_t dirty_hw;
+
+    /** Combination of DBG_xxx flags */
+    unsigned debug;
 };
 
 /* Convenience cast wrapper. */
@@ -288,4 +295,40 @@ struct draw_stage* r300_draw_stage(struct r300_context* r300);
 void r300_init_state_functions(struct r300_context* r300);
 void r300_init_surface_functions(struct r300_context* r300);
 
+/* Debug functionality. */
+
+/**
+ * Debug flags to disable/enable certain groups of debugging outputs.
+ *
+ * \note These may be rather coarse, and the grouping may be impractical.
+ * If you find, while debugging the driver, that a different grouping
+ * of these flags would be beneficial, just feel free to change them
+ * but make sure to update the documentation in r300_debug.c to reflect
+ * those changes.
+ */
+/*@{*/
+#define DBG_HELP    0x0000001
+#define DBG_FP      0x0000002
+#define DBG_VP      0x0000004
+#define DBG_CS      0x0000008
+#define DBG_DRAW    0x0000010
+/*@}*/
+
+static INLINE boolean DBG_ON(struct r300_context * ctx, unsigned flags)
+{
+    return (ctx->debug & flags) ? true : false;
+}
+
+static INLINE void DBG(struct r300_context * ctx, unsigned flags, const char * fmt, ...)
+{
+    if (DBG_ON(ctx, flags)) {
+        va_list va;
+        va_start(va, fmt);
+        debug_vprintf(fmt, va);
+        va_end(va);
+    }
+}
+
+void r300_init_debug(struct r300_context * ctx);
+
 #endif /* R300_CONTEXT_H */
diff --git a/src/gallium/drivers/r300/r300_cs.h b/src/gallium/drivers/r300/r300_cs.h
index 71b142c0db..883f0a02dc 100644
--- a/src/gallium/drivers/r300/r300_cs.h
+++ b/src/gallium/drivers/r300/r300_cs.h
@@ -49,7 +49,8 @@
     (RADEON_CP_PACKET0 | ((count) << 16) | ((register) >> 2))
 
 #define CS_LOCALS(context) \
-    struct r300_winsys* cs_winsys = context->winsys; \
+    struct r300_context* const cs_context_copy = (context); \
+    struct r300_winsys* cs_winsys = cs_context_copy->winsys; \
     int cs_count = 0;
 
 #define CHECK_CS(size) \
@@ -58,7 +59,7 @@
 #define BEGIN_CS(size) do { \
     CHECK_CS(size); \
     if (VERY_VERBOSE_CS) { \
-        debug_printf("r300: BEGIN_CS, count %d, in %s (%s:%d)\n", \
+        DBG(cs_context_copy, DBG_CS, "r300: BEGIN_CS, count %d, in %s (%s:%d)\n", \
                 size, __FUNCTION__, __FILE__, __LINE__); \
     } \
     cs_winsys->begin_cs(cs_winsys, (size), \
@@ -67,47 +68,55 @@
 } while (0)
 
 #define OUT_CS(value) do { \
+    if (VERY_VERBOSE_CS || VERY_VERBOSE_REGISTERS) { \
+        DBG(cs_context_copy, DBG_CS, "r300: writing %08x\n", value); \
+    } \
     cs_winsys->write_cs_dword(cs_winsys, (value)); \
     cs_count--; \
 } while (0)
 
 #define OUT_CS_32F(value) do { \
+    if (VERY_VERBOSE_CS || VERY_VERBOSE_REGISTERS) { \
+        DBG(cs_context_copy, DBG_CS, "r300: writing %f\n", value); \
+    } \
     cs_winsys->write_cs_dword(cs_winsys, fui(value)); \
     cs_count--; \
 } while (0)
 
 #define OUT_CS_REG(register, value) do { \
     if (VERY_VERBOSE_REGISTERS) \
-        debug_printf("r300: writing 0x%08X to register 0x%04X\n", \
+        DBG(cs_context_copy, DBG_CS, "r300: writing 0x%08X to register 0x%04X\n", \
             value, register); \
     assert(register); \
-    OUT_CS(CP_PACKET0(register, 0)); \
-    OUT_CS(value); \
+    cs_winsys->write_cs_dword(cs_winsys, CP_PACKET0(register, 0)); \
+    cs_winsys->write_cs_dword(cs_winsys, value); \
+    cs_count -= 2; \
 } while (0)
 
 /* Note: This expects count to be the number of registers,
  * not the actual packet0 count! */
 #define OUT_CS_REG_SEQ(register, count) do { \
     if (VERY_VERBOSE_REGISTERS) \
-        debug_printf("r300: writing register sequence of %d to 0x%04X\n", \
+        DBG(cs_context_copy, DBG_CS, "r300: writing register sequence of %d to 0x%04X\n", \
             count, register); \
     assert(register); \
-    OUT_CS(CP_PACKET0(register, ((count) - 1))); \
+    cs_winsys->write_cs_dword(cs_winsys, CP_PACKET0((register), ((count) - 1))); \
+    cs_count--; \
 } while (0)
 
 #define OUT_CS_RELOC(bo, offset, rd, wd, flags) do { \
-    debug_printf("r300: writing relocation for buffer %p, offset %d, " \
+    DBG(cs_context_copy, DBG_CS, "r300: writing relocation for buffer %p, offset %d, " \
             "domains (%d, %d, %d)\n", \
         bo, offset, rd, wd, flags); \
     assert(bo); \
-    OUT_CS(offset); \
+    cs_winsys->write_cs_dword(cs_winsys, offset); \
     cs_winsys->write_cs_reloc(cs_winsys, bo, rd, wd, flags); \
-    cs_count -= 2; \
+    cs_count -= 3; \
 } while (0)
 
 #define END_CS do { \
     if (VERY_VERBOSE_CS) { \
-        debug_printf("r300: END_CS in %s (%s:%d)\n", __FUNCTION__, \
+        DBG(cs_context_copy, DBG_CS, "r300: END_CS in %s (%s:%d)\n", __FUNCTION__, \
                 __FILE__, __LINE__); \
     } \
     if (cs_count != 0) \
@@ -117,7 +126,7 @@
 
 #define FLUSH_CS do { \
     if (VERY_VERBOSE_CS) { \
-        debug_printf("r300: FLUSH_CS in %s (%s:%d)\n\n", __FUNCTION__, \
+        DBG(cs_context_copy, DBG_CS, "r300: FLUSH_CS in %s (%s:%d)\n\n", __FUNCTION__, \
                 __FILE__, __LINE__); \
     } \
     cs_winsys->flush_cs(cs_winsys); \
@@ -127,27 +136,29 @@
 
 #define OUT_CS_ONE_REG(register, count) do { \
     if (VERY_VERBOSE_REGISTERS) \
-        debug_printf("r300: writing data sequence of %d to 0x%04X\n", \
+        DBG(cs_context_copy, DBG_CS, "r300: writing data sequence of %d to 0x%04X\n", \
             count, register); \
     assert(register); \
-    OUT_CS(CP_PACKET0(register, ((count) - 1)) | RADEON_ONE_REG_WR); \
+    cs_winsys->write_cs_dword(cs_winsys, CP_PACKET0((register), ((count) - 1)) | RADEON_ONE_REG_WR); \
+    cs_count--; \
 } while (0)
 
 #define CP_PACKET3(op, count) \
     (RADEON_CP_PACKET3 | (op) | ((count) << 16))
 
 #define OUT_CS_PKT3(op, count) do { \
-    OUT_CS(CP_PACKET3(op, count)); \
+    cs_winsys->write_cs_dword(cs_winsys, CP_PACKET3(op, count)); \
+    cs_count--; \
 } while (0)
 
 #define OUT_CS_INDEX_RELOC(bo, offset, count, rd, wd, flags) do { \
-    debug_printf("r300: writing relocation for index buffer %p," \
+    DBG(cs_context_copy, DBG_CS, "r300: writing relocation for index buffer %p," \
             "offset %d\n", bo, offset); \
     assert(bo); \
-    OUT_CS(offset); \
-    OUT_CS(count); \
+    cs_winsys->write_cs_dword(cs_winsys, offset); \
+    cs_winsys->write_cs_dword(cs_winsys, count); \
     cs_winsys->write_cs_reloc(cs_winsys, bo, rd, wd, flags); \
-    cs_count -= 2; \
+    cs_count -= 4; \
 } while (0)
 
 #endif /* R300_CS_H */
diff --git a/src/gallium/drivers/r300/r300_debug.c b/src/gallium/drivers/r300/r300_debug.c
new file mode 100644
index 0000000000..15308dda1d
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_debug.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2009 Nicolai Haehnle <nhaehnle@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "r300_context.h"
+
+#include <ctype.h>
+
+
+struct debug_option {
+    const char * name;
+    unsigned flag;
+    const char * description;
+};
+
+static struct debug_option debug_options[] = {
+    { "help", DBG_HELP, "Helpful meta-information about the driver" },
+    { "fp", DBG_FP, "Fragment program handling" },
+    { "vp", DBG_VP, "Vertex program handling" },
+    { "cs", DBG_CS, "Command submissions" },
+    { "draw", DBG_DRAW, "Draw and emit" },
+
+    { "all", ~0, "Convenience option that enables all debug flags" },
+
+    /* must be last */
+    { 0, 0, 0 }
+};
+
+void r300_init_debug(struct r300_context * ctx)
+{
+    const char * options = debug_get_option("RADEON_DEBUG", 0);
+    boolean printhint = false;
+
+    if (options) {
+        while(*options) {
+            if (*options == ' ' || *options == ',') {
+                options++;
+                continue;
+            }
+
+            size_t length = strcspn(options, " ,");
+            struct debug_option * opt;
+
+            for(opt = debug_options; opt->name; ++opt) {
+                if (!strncmp(options, opt->name, length)) {
+                    ctx->debug |= opt->flag;
+                    break;
+                }
+            }
+
+            if (!opt->name) {
+                debug_printf("Unknown debug option: %s\n", options);
+                printhint = true;
+            }
+
+            options += length;
+        }
+
+        if (!ctx->debug)
+            printhint = true;
+    }
+
+    if (printhint || ctx->debug & DBG_HELP) {
+        debug_printf("You can enable debug output by setting the RADEON_DEBUG environment variable\n"
+                     "to a comma-separated list of debug options. Available options are:\n");
+        for(struct debug_option * opt = debug_options; opt->name; ++opt) {
+            debug_printf("    %s: %s\n", opt->name, opt->description);
+        }
+    }
+}
diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
index bd4d59e6f1..77ce431cdc 100644
--- a/src/gallium/drivers/r300/r300_emit.c
+++ b/src/gallium/drivers/r300/r300_emit.c
@@ -25,6 +25,7 @@
 #include "r300_emit.h"
 
 #include "r300_fs.h"
+#include "r300_state_derived.h"
 #include "r300_vs.h"
 
 void r300_emit_blend_state(struct r300_context* r300,
@@ -282,7 +283,7 @@ void r300_emit_fb_state(struct r300_context* r300,
     for (i = 0; i < fb->nr_cbufs; i++) {
         tex = (struct r300_texture*)fb->cbufs[i]->texture;
         assert(tex && tex->buffer && "cbuf is marked, but NULL!");
-        pixpitch = tex->stride / tex->tex.block.size;
+        pixpitch = r300_texture_get_stride(tex, 0) / tex->tex.block.size;
 
         OUT_CS_REG_SEQ(R300_RB3D_COLOROFFSET0 + (4 * i), 1);
         OUT_CS_RELOC(tex->buffer, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
@@ -299,7 +300,7 @@ void r300_emit_fb_state(struct r300_context* r300,
     if (fb->zsbuf) {
         tex = (struct r300_texture*)fb->zsbuf->texture;
         assert(tex && tex->buffer && "zsbuf is marked, but NULL!");
-        pixpitch = tex->stride / tex->tex.block.size;
+        pixpitch = r300_texture_get_stride(tex, 0) / tex->tex.block.size;
 
         OUT_CS_REG_SEQ(R300_ZB_DEPTHOFFSET, 1);
         OUT_CS_RELOC(tex->buffer, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
@@ -380,6 +381,7 @@ void r300_emit_query_end(struct r300_context* r300,
             OUT_CS_REG_SEQ(R300_ZB_ZPASS_ADDR, 1);
             OUT_CS_RELOC(r300->oqbo, query->offset + (sizeof(uint32_t) * 0),
                     0, RADEON_GEM_DOMAIN_GTT, 0);
+	    break;
         default:
             debug_printf("r300: Implementation error: Chipset reports %d"
                     " pixel pipes!\n", caps->num_frag_pipes);
@@ -490,7 +492,7 @@ void r300_emit_vertex_buffer(struct r300_context* r300)
 {
     CS_LOCALS(r300);
 
-    debug_printf("r300: Preparing vertex buffer %p for render, "
+    DBG(r300, DBG_DRAW, "r300: Preparing vertex buffer %p for render, "
             "vertex size %d\n", r300->vbo,
             r300->vertex_info.vinfo.size);
     /* Set the pointer to our vertex buffer. The emitted values are this:
diff --git a/src/gallium/drivers/r300/r300_emit.h b/src/gallium/drivers/r300/r300_emit.h
index 350691d592..c4002b8e5d 100644
--- a/src/gallium/drivers/r300/r300_emit.h
+++ b/src/gallium/drivers/r300/r300_emit.h
@@ -56,6 +56,11 @@ void r500_emit_fragment_program_code(struct r300_context* r300,
 void r300_emit_fb_state(struct r300_context* r300,
                         struct pipe_framebuffer_state* fb);
 
+void r300_emit_query_begin(struct r300_context* r300,
+                           struct r300_query* query);
+void r300_emit_query_end(struct r300_context* r300,
+                         struct r300_query* query);
+
 void r300_emit_rs_state(struct r300_context* r300, struct r300_rs_state* rs);
 
 void r300_emit_rs_block_state(struct r300_context* r300,
diff --git a/src/gallium/drivers/r300/r300_fs.c b/src/gallium/drivers/r300/r300_fs.c
index 36463b9a2e..a0e848a59a 100644
--- a/src/gallium/drivers/r300/r300_fs.c
+++ b/src/gallium/drivers/r300/r300_fs.c
@@ -96,7 +96,7 @@ void r300_translate_fragment_shader(struct r300_context* r300,
 
     memset(&compiler, 0, sizeof(compiler));
     rc_init(&compiler.Base);
-    compiler.Base.Debug = 1;
+    compiler.Base.Debug = DBG_ON(r300, DBG_FP);
 
     compiler.code = &fs->code;
     compiler.is_r500 = r300_screen(r300->context.screen)->caps->is_r500;
diff --git a/src/gallium/drivers/r300/r300_query.c b/src/gallium/drivers/r300/r300_query.c
index 1d5185b417..2880d34877 100644
--- a/src/gallium/drivers/r300/r300_query.c
+++ b/src/gallium/drivers/r300/r300_query.c
@@ -22,6 +22,8 @@
 
 #include "r300_query.h"
 
+#include "r300_emit.h"
+
 static struct pipe_query* r300_create_query(struct pipe_context* pipe,
                                             unsigned query_type)
 {
diff --git a/src/gallium/drivers/r300/r300_reg.h b/src/gallium/drivers/r300/r300_reg.h
index 03cd219cde..3abff5db62 100644
--- a/src/gallium/drivers/r300/r300_reg.h
+++ b/src/gallium/drivers/r300/r300_reg.h
@@ -1478,6 +1478,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_TX_PITCH_EN                  (1 << 31)
 #       define R300_TX_WIDTH(x)                  ((x) << 0)
 #       define R300_TX_HEIGHT(x)                 ((x) << 11)
+#       define R300_TX_DEPTH(x)                  ((x) << 22)
 #       define R300_TX_NUM_LEVELS(x)             ((x) << 26)
 
 #define R300_TX_FORMAT1_0                   0x44C0
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index cd458d019a..16f6404012 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -26,15 +26,17 @@
 
 #include "r300_cs.h"
 #include "r300_context.h"
+#include "r300_emit.h"
 #include "r300_reg.h"
 #include "r300_state_derived.h"
 
 /* r300_render: Vertex and index buffer primitive emission. */
+#define R300_MAX_VBO_SIZE  (1024 * 1024)
 
 struct r300_render {
     /* Parent class */
     struct vbuf_render base;
-    
+
     /* Pipe context */
     struct r300_context* r300;
 
@@ -45,7 +47,10 @@ struct r300_render {
 
     /* VBO */
     struct pipe_buffer* vbo;
-    size_t vbo_alloc_size;
+    size_t vbo_size;
+    size_t vbo_offset;
+    size_t vbo_max_used;
+    void * vbo_ptr;
 };
 
 static INLINE struct r300_render*
@@ -74,19 +79,19 @@ static boolean r300_render_allocate_vertices(struct vbuf_render* render,
     struct pipe_screen* screen = r300->context.screen;
     size_t size = (size_t)vertex_size * (size_t)count;
 
-    if (r300render->vbo && (size > r300render->vbo_alloc_size)) {
-        pipe_buffer_reference(&r300render->vbo, NULL);
-    }
-    
-    if (!r300render->vbo) {
+    if (size + r300render->vbo_offset > r300render->vbo_size) 
+    {
+        pipe_buffer_reference(&r300->vbo, NULL);
         r300render->vbo = pipe_buffer_create(screen,
                                              64,
                                              PIPE_BUFFER_USAGE_VERTEX,
-                                             size);
+                                             R300_MAX_VBO_SIZE);
+        r300render->vbo_size = R300_MAX_VBO_SIZE;
     }
 
-    r300render->vbo_alloc_size = MAX2(size, r300render->vbo_alloc_size);
     r300render->vertex_size = vertex_size;
+    r300->vbo = r300render->vbo;
+    r300->vbo_offset = r300render->vbo_offset;
 
     return (r300render->vbo) ? TRUE : FALSE;
 }
@@ -96,8 +101,10 @@ static void* r300_render_map_vertices(struct vbuf_render* render)
     struct r300_render* r300render = r300_render(render);
     struct pipe_screen* screen = r300render->r300->context.screen;
 
-    return (unsigned char*)pipe_buffer_map(screen, r300render->vbo,
-                                           PIPE_BUFFER_USAGE_CPU_WRITE);
+    r300render->vbo_ptr = pipe_buffer_map(screen, r300render->vbo,
+                                          PIPE_BUFFER_USAGE_CPU_WRITE);
+
+    return (r300render->vbo_ptr + r300render->vbo_offset);
 }
 
 static void r300_render_unmap_vertices(struct vbuf_render* render,
@@ -106,15 +113,23 @@ static void r300_render_unmap_vertices(struct vbuf_render* render,
 {
     struct r300_render* r300render = r300_render(render);
     struct pipe_screen* screen = r300render->r300->context.screen;
+    CS_LOCALS(r300render->r300);
+    BEGIN_CS(2);
+    OUT_CS_REG(R300_VAP_VF_MAX_VTX_INDX, max);
+    END_CS;
 
+    r300render->vbo_max_used = MAX2(r300render->vbo_max_used, 
+                                    r300render->vertex_size * (max + 1));
     pipe_buffer_unmap(screen, r300render->vbo);
 }
 
 static void r300_render_release_vertices(struct vbuf_render* render)
 {
     struct r300_render* r300render = r300_render(render);
+    struct r300_context* r300 = r300render->r300;
 
-    pipe_buffer_reference(&r300render->vbo, NULL);
+    r300render->vbo_offset += r300render->vbo_max_used;
+    r300render->vbo_max_used = 0;
 }
 
 static boolean r300_render_set_primitive(struct vbuf_render* render,
@@ -162,14 +177,12 @@ static boolean r300_render_set_primitive(struct vbuf_render* render,
     return TRUE;
 }
 
-static void prepare_render(struct r300_render* render, unsigned count)
+static void r300_prepare_render(struct r300_render* render, unsigned count)
 {
     struct r300_context* r300 = render->r300;
 
     CS_LOCALS(r300);
 
-    r300->vbo = render->vbo;
-
     r300_emit_dirty_state(r300);
 }
 
@@ -182,9 +195,9 @@ static void r300_render_draw_arrays(struct vbuf_render* render,
 
     CS_LOCALS(r300);
 
-    prepare_render(r300render, count);
+    r300_prepare_render(r300render, count);
 
-    debug_printf("r300: Doing vbuf render, count %d\n", count);
+    DBG(r300, DBG_DRAW, "r300: Doing vbuf render, count %d\n", count);
 
     BEGIN_CS(2);
     OUT_CS_PKT3(R300_PACKET3_3D_DRAW_VBUF_2, 0);
@@ -207,7 +220,7 @@ static void r300_render_draw(struct vbuf_render* render,
 
     CS_LOCALS(r300);
 
-    prepare_render(r300render, count);
+    r300_prepare_render(r300render, count);
 
     /* Send our indices into an index buffer. */
     index_buffer = pipe_buffer_create(screen, 64, PIPE_BUFFER_USAGE_VERTEX,
@@ -216,23 +229,6 @@ static void r300_render_draw(struct vbuf_render* render,
         return;
     }
 
-/*
-    index_map = pipe_buffer_map(screen, index_buffer,
-                                PIPE_BUFFER_USAGE_CPU_WRITE);
-    memcpy(index_map, indices, count);
-    pipe_buffer_unmap(screen, index_buffer);
-
-    debug_printf("r300: Doing indexbuf render, count %d\n", count);
-
-    BEGIN_CS(8);
-    OUT_CS_PKT3(R300_PACKET3_3D_DRAW_INDX_2, 0);
-    OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (count << 16) |
-           r300render->hwprim);
-    OUT_CS_PKT3(R300_PACKET3_INDX_BUFFER, 2);
-    OUT_CS(R300_INDX_BUFFER_ONE_REG_WR | (R300_VAP_PORT_IDX0 >> 2));
-    OUT_CS_INDEX_RELOC(index_buffer, 0, count, RADEON_GEM_DOMAIN_GTT, 0, 0);
-    END_CS; */
-
     BEGIN_CS(2 + (count+1)/2);
     OUT_CS_PKT3(R300_PACKET3_3D_DRAW_INDX_2, (count+1)/2);
     OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (count << 16) |
@@ -271,6 +267,10 @@ static struct vbuf_render* r300_render_create(struct r300_context* r300)
     r300render->base.release_vertices = r300_render_release_vertices;
     r300render->base.destroy = r300_render_destroy;
 
+    r300render->vbo = NULL;
+    r300render->vbo_size = 0;
+    r300render->vbo_offset = 0;
+
     return &r300render->base;
 }
 
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 15740f6125..8296d56840 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -93,8 +93,6 @@ static int r300_get_param(struct pipe_screen* pscreen, int param)
             } else {
                 return 0;
             }
-        case PIPE_CAP_S3TC:
-            return 1;
         case PIPE_CAP_ANISOTROPIC_FILTER:
             return 1;
         case PIPE_CAP_POINT_SPRITE:
@@ -103,11 +101,9 @@ static int r300_get_param(struct pipe_screen* pscreen, int param)
         case PIPE_CAP_MAX_RENDER_TARGETS:
             return 4;
         case PIPE_CAP_OCCLUSION_QUERY:
-            /* IN THEORY */
-            return 0;
+            return 1;
         case PIPE_CAP_TEXTURE_SHADOW_MAP:
-            /* IN THEORY */
-            return 0;
+            return 1;
         case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
             if (r300screen->caps->is_r500) {
                 /* 13 == 4096x4096 */
@@ -323,13 +319,14 @@ r300_get_tex_transfer(struct pipe_screen *screen,
     if (trans) {
         pipe_texture_reference(&trans->transfer.texture, texture);
         trans->transfer.format = texture->format;
+        trans->transfer.x = x;
+        trans->transfer.y = y;
         trans->transfer.width = w;
         trans->transfer.height = h;
         trans->transfer.block = texture->block;
         trans->transfer.nblocksx = texture->nblocksx[level];
         trans->transfer.nblocksy = texture->nblocksy[level];
-        trans->transfer.stride = align(pf_get_stride(&trans->transfer.block,
-                                                     texture->width[level]), 32);
+        trans->transfer.stride = r300_texture_get_stride(tex, level);
         trans->transfer.usage = usage;
         trans->offset = offset;
     }
@@ -356,7 +353,7 @@ static void* r300_transfer_map(struct pipe_screen* screen,
     if (transfer->usage != PIPE_TRANSFER_READ) {
         flags |= PIPE_BUFFER_USAGE_CPU_WRITE;
     }
-    
+
     map = pipe_buffer_map(screen, tex->buffer, flags);
 
     if (!map) {
diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c
index c16cadd040..88cb9af6fb 100644
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -20,10 +20,11 @@
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
 
+#include "util/u_debug.h"
 #include "util/u_math.h"
 #include "util/u_pack_color.h"
 
-#include "util/u_debug.h"
+#include "tgsi/tgsi_parse.h"
 
 #include "pipe/p_config.h"
 #include "pipe/internal/p_winsys_screen.h"
@@ -429,6 +430,9 @@ static void r300_bind_rs_state(struct pipe_context* pipe, void* state)
 
     r300->rs_state = rs;
     r300->dirty_state |= R300_NEW_RASTERIZER;
+    r300->dirty_state |= R300_NEW_RS_BLOCK;
+    r300->dirty_state |= R300_NEW_SCISSOR;
+    r300->dirty_state |= R300_NEW_VIEWPORT;
 }
 
 /* Free rasterizer state. */
diff --git a/src/gallium/drivers/r300/r300_state_derived.c b/src/gallium/drivers/r300/r300_state_derived.c
index c01e61a9b1..02b7ab9107 100644
--- a/src/gallium/drivers/r300/r300_state_derived.c
+++ b/src/gallium/drivers/r300/r300_state_derived.c
@@ -23,6 +23,7 @@
 #include "r300_state_derived.h"
 
 #include "r300_fs.h"
+#include "r300_state_inlines.h"
 #include "r300_vs.h"
 
 /* r300_state_derived: Various bits of state which are dependent upon
@@ -52,7 +53,7 @@ static void r300_vs_tab_routes(struct r300_context* r300,
     if (!r300screen->caps->has_tcl || !r300->rs_state->enable_vte)
     {
         for (i = 0; i < info->num_inputs; i++) {
-            switch (info->input_semantic_name[i]) {
+            switch (r300->vs->code.inputs[i]) {
                 case TGSI_SEMANTIC_POSITION:
                     pos = TRUE;
                     tab[i] = 0;
@@ -62,10 +63,12 @@ static void r300_vs_tab_routes(struct r300_context* r300,
                     cols++;
                     break;
                 case TGSI_SEMANTIC_PSIZE:
+                    assert(psize == FALSE);
                     psize = TRUE;
                     tab[i] = 15;
                     break;
                 case TGSI_SEMANTIC_FOG:
+                    assert(fog == FALSE);
                     fog = TRUE;
                     /* Fall through */
                 case TGSI_SEMANTIC_GENERIC:
@@ -124,7 +127,9 @@ static void r300_vs_tab_routes(struct r300_context* r300,
 
     vinfo->hwfmt[0] = 0x5555; /* XXX this is classic Mesa bonghits */
 
-    if (!pos) {
+    /* We need to add vertex position attribute only for SW TCL case,
+     * for HW TCL case it could be generated by vertex shader */
+    if (!pos && !r300screen->caps->has_tcl) {
         debug_printf("r300: Forcing vertex position attribute emit...\n");
         /* Make room for the position attribute
          * at the beginning of the tab. */
@@ -163,7 +168,7 @@ static void r300_vs_tab_routes(struct r300_context* r300,
         vinfo->hwfmt[3] |= (4 << (3 * i));
     }
 
-    for (i; i < texs; i++) {
+    for (; i < texs; i++) {
         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE,
             draw_find_vs_output(r300->draw, TGSI_SEMANTIC_GENERIC, i));
         vinfo->hwfmt[1] |= (R300_INPUT_CNTL_TC0 << i);
@@ -195,13 +200,13 @@ static void r300_vertex_psc(struct r300_context* r300,
      * and not on attrib information. */
     if (r300screen->caps->has_tcl) {
         attrib_count = r300->vs->info.num_inputs;
-        debug_printf("r300: routing %d attribs in psc for vs\n",
+        DBG(r300, DBG_DRAW, "r300: routing %d attribs in psc for vs\n",
                 attrib_count);
     } else {
         attrib_count = vinfo->num_attribs;
-        debug_printf("r300: attrib count: %d\n", attrib_count);
+        DBG(r300, DBG_DRAW, "r300: attrib count: %d\n", attrib_count);
         for (i = 0; i < attrib_count; i++) {
-            debug_printf("r300: attrib: offset %d, interp %d, size %d,"
+            DBG(r300, DBG_DRAW, "r300: attrib: offset %d, interp %d, size %d,"
                    " tab %d\n", vinfo->attrib[i].src_index,
                    vinfo->attrib[i].interp_mode, vinfo->attrib[i].emit,
                    tab[i]);
@@ -299,18 +304,18 @@ static void r300_update_fs_tab(struct r300_context* r300)
     }
 
     /* Now that we know where everything is... */
-    debug_printf("r300: fp input count: %d\n", info->num_inputs);
+    DBG(r300, DBG_DRAW, "r300: fp input count: %d\n", info->num_inputs);
     for (i = 0; i < info->num_inputs; i++) {
         switch (tab[i]) {
             case INTERP_LINEAR:
-                debug_printf("r300: attrib: "
+                DBG(r300, DBG_DRAW, "r300: attrib: "
                         "stack offset %d, color,    tab %d\n",
                         i, cols_emitted);
                 tab[i] = cols_emitted;
                 cols_emitted++;
                 break;
             case INTERP_PERSPECTIVE:
-                debug_printf("r300: attrib: "
+                DBG(r300, DBG_DRAW, "r300: attrib: "
                         "stack offset %d, texcoord, tab %d\n",
                         i, cols + texs);
                 tab[i] = cols + texs;
@@ -333,48 +338,37 @@ static void r300_update_rs_block(struct r300_context* r300)
     struct r300_rs_block* rs = r300->rs_block;
     struct tgsi_shader_info* info = &r300->fs->info;
     int* tab = r300->vertex_info.fs_tab;
-    int col_count = 0, fp_offset = 0, i, memory_pos, tex_count = 0;
-
+    int col_count = 0, fp_offset = 0, i, tex_count = 0;
+    int rs_tex_comp = 0;
     memset(rs, 0, sizeof(struct r300_rs_block));
 
     if (r300_screen(r300->context.screen)->caps->is_r500) {
         for (i = 0; i < info->num_inputs; i++) {
             assert(tab[i] != -1);
-            memory_pos = tab[i] * 4;
             switch (info->input_semantic_name[i]) {
                 case TGSI_SEMANTIC_COLOR:
                     rs->ip[col_count] |=
-                        R500_RS_COL_PTR(memory_pos) |
+                        R500_RS_COL_PTR(col_count) |
                         R500_RS_COL_FMT(R300_RS_COL_FMT_RGBA);
                     col_count++;
                     break;
                 case TGSI_SEMANTIC_GENERIC:
                     rs->ip[tex_count] |=
-                        R500_RS_SEL_S(memory_pos) |
-                        R500_RS_SEL_T(memory_pos + 1) |
-                        R500_RS_SEL_R(memory_pos + 2) |
-                        R500_RS_SEL_Q(memory_pos + 3);
+                        R500_RS_SEL_S(rs_tex_comp) |
+                        R500_RS_SEL_T(rs_tex_comp + 1) |
+                        R500_RS_SEL_R(rs_tex_comp + 2) |
+                        R500_RS_SEL_Q(rs_tex_comp + 3);
                     tex_count++;
+                    rs_tex_comp += 4;
                     break;
                 default:
                     break;
             }
         }
 
-        if (col_count == 0) {
-            rs->ip[0] |= R500_RS_COL_FMT(R300_RS_COL_FMT_0001);
-        }
-
-        if (tex_count == 0) {
-            rs->ip[0] |=
-                R500_RS_SEL_S(R500_RS_IP_PTR_K0) |
-                R500_RS_SEL_T(R500_RS_IP_PTR_K0) |
-                R500_RS_SEL_R(R500_RS_IP_PTR_K0) |
-                R500_RS_SEL_Q(R500_RS_IP_PTR_K1);
-        }
-
         /* Rasterize at least one color, or bad things happen. */
         if ((col_count == 0) && (tex_count == 0)) {
+            rs->ip[0] |= R500_RS_COL_FMT(R300_RS_COL_FMT_0001);
             col_count++;
         }
 
@@ -392,22 +386,22 @@ static void r300_update_rs_block(struct r300_context* r300)
     } else {
         for (i = 0; i < info->num_inputs; i++) {
             assert(tab[i] != -1);
-            memory_pos = tab[i] * 4;
             switch (info->input_semantic_name[i]) {
                 case TGSI_SEMANTIC_COLOR:
                     rs->ip[col_count] |=
-                        R300_RS_COL_PTR(memory_pos) |
+                        R300_RS_COL_PTR(col_count) |
                         R300_RS_COL_FMT(R300_RS_COL_FMT_RGBA);
                     col_count++;
                     break;
                 case TGSI_SEMANTIC_GENERIC:
                     rs->ip[tex_count] |=
-                        R300_RS_TEX_PTR(memory_pos) |
+                        R300_RS_TEX_PTR(rs_tex_comp) |
                         R300_RS_SEL_S(R300_RS_SEL_C0) |
                         R300_RS_SEL_T(R300_RS_SEL_C1) |
                         R300_RS_SEL_R(R300_RS_SEL_C2) |
                         R300_RS_SEL_Q(R300_RS_SEL_C3);
                     tex_count++;
+                    rs_tex_comp+=4;
                     break;
                 default:
                     break;
@@ -444,7 +438,7 @@ static void r300_update_rs_block(struct r300_context* r300)
         }
     }
 
-    rs->count = (tex_count * 4) | (col_count << R300_IC_COUNT_SHIFT) |
+    rs->count = (rs_tex_comp) | (col_count << R300_IC_COUNT_SHIFT) |
         R300_HIRES_EN;
 
     rs->inst_count = MAX2(MAX2(col_count - 1, tex_count - 1), 0);
diff --git a/src/gallium/drivers/r300/r300_state_derived.h b/src/gallium/drivers/r300/r300_state_derived.h
index 63ae8eb8d0..71a4a47b00 100644
--- a/src/gallium/drivers/r300/r300_state_derived.h
+++ b/src/gallium/drivers/r300/r300_state_derived.h
@@ -23,11 +23,7 @@
 #ifndef R300_STATE_DERIVED_H
 #define R300_STATE_DERIVED_H
 
-#include "draw/draw_vertex.h"
-
-#include "r300_context.h"
-#include "r300_reg.h"
-#include "r300_state_inlines.h"
+struct r300_context;
 
 void r300_update_derived_state(struct r300_context* r300);
 
diff --git a/src/gallium/drivers/r300/r300_state_invariant.c b/src/gallium/drivers/r300/r300_state_invariant.c
index 7d822fec48..3865730d63 100644
--- a/src/gallium/drivers/r300/r300_state_invariant.c
+++ b/src/gallium/drivers/r300/r300_state_invariant.c
@@ -23,6 +23,12 @@
 
 #include "r300_state_invariant.h"
 
+
+struct pipe_viewport_state r300_viewport_identity = {
+    .scale = {1.0, 1.0, 1.0, 1.0},
+    .translate = {0.0, 0.0, 0.0, 0.0},
+};
+
 /* Calculate and emit invariant state. This is data that the 3D engine
  * will probably want at the beginning of every CS, but it's not currently
  * handled by any CSO setup, and in addition it doesn't really change much.
diff --git a/src/gallium/drivers/r300/r300_surface.c b/src/gallium/drivers/r300/r300_surface.c
index 96e6e4a77d..cc6288cb51 100644
--- a/src/gallium/drivers/r300/r300_surface.c
+++ b/src/gallium/drivers/r300/r300_surface.c
@@ -29,7 +29,7 @@ static void r300_surface_setup(struct r300_context* r300,
                                unsigned w, unsigned h)
 {
     struct r300_capabilities* caps = r300_screen(r300->context.screen)->caps;
-    unsigned pixpitch = dest->stride / dest->tex.block.size;
+    unsigned pixpitch = r300_texture_get_stride(dest, 0) / dest->tex.block.size;
     CS_LOCALS(r300);
 
     r300_emit_blend_state(r300, &blend_clear_state);
@@ -100,7 +100,7 @@ static void r300_surface_fill(struct pipe_context* pipe,
     struct r300_context* r300 = r300_context(pipe);
     struct r300_capabilities* caps = r300_screen(pipe->screen)->caps;
     struct r300_texture* tex = (struct r300_texture*)dest->texture;
-    unsigned pixpitch = tex->stride / tex->tex.block.size;
+    unsigned pixpitch = r300_texture_get_stride(tex, 0) / tex->tex.block.size;
     boolean invalid = FALSE;
     CS_LOCALS(r300);
 
@@ -233,7 +233,7 @@ static void r300_surface_copy(struct pipe_context* pipe,
     struct r300_capabilities* caps = r300_screen(pipe->screen)->caps;
     struct r300_texture* srctex = (struct r300_texture*)src->texture;
     struct r300_texture* desttex = (struct r300_texture*)dest->texture;
-    unsigned pixpitch = srctex->stride / srctex->tex.block.size;
+    unsigned pixpitch = r300_texture_get_stride(srctex, 0) / srctex->tex.block.size;
     boolean invalid = FALSE;
     float fsrcx = srcx, fsrcy = srcy, fdestx = destx, fdesty = desty;
     CS_LOCALS(r300);
diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c
index 590052509c..ce60ded7ca 100644
--- a/src/gallium/drivers/r300/r300_texture.c
+++ b/src/gallium/drivers/r300/r300_texture.c
@@ -22,41 +22,62 @@
 
 #include "r300_texture.h"
 
-static void r300_setup_texture_state(struct r300_texture* tex,
-                                     unsigned width,
-                                     unsigned height,
-                                     unsigned pitch,
-                                     unsigned levels)
+static void r300_setup_texture_state(struct r300_texture* tex)
 {
     struct r300_texture_state* state = &tex->state;
+    struct pipe_texture *pt = &tex->tex;
 
-    state->format0 = R300_TX_WIDTH((width - 1) & 0x7ff) |
-        R300_TX_HEIGHT((height - 1) & 0x7ff) |
-        R300_TX_NUM_LEVELS(levels) |
+    state->format0 = R300_TX_WIDTH((pt->width[0] - 1) & 0x7ff) |
+        R300_TX_HEIGHT((pt->height[0] - 1) & 0x7ff) |
+        R300_TX_DEPTH(util_logbase2(pt->depth[0]) & 0xf) |
+        R300_TX_NUM_LEVELS(pt->last_level) |
         R300_TX_PITCH_EN;
 
     /* XXX */
-    state->format1 = r300_translate_texformat(tex->tex.format);
+    state->format1 = r300_translate_texformat(pt->format);
+    if (pt->target == PIPE_TEXTURE_CUBE) {
+	state->format1 |= R300_TX_FORMAT_CUBIC_MAP;
+    }
+    if (pt->target == PIPE_TEXTURE_3D) {
+	state->format1 |= R300_TX_FORMAT_3D;
+    }
 
-    state->format2 = pitch - 1;
+    state->format2 = (r300_texture_get_stride(tex, 0) / pt->block.size) - 1;
 
     /* Assume (somewhat foolishly) that oversized textures will
      * not be permitted by the state tracker. */
-    if (width > 2048) {
+    if (pt->width[0] > 2048) {
         state->format2 |= R500_TXWIDTH_BIT11;
     }
-    if (height > 2048) {
+    if (pt->height[0] > 2048) {
         state->format2 |= R500_TXHEIGHT_BIT11;
     }
 
-    debug_printf("r300: Set texture state (%dx%d, pitch %d, %d levels)\n",
-            width, height, pitch, levels);
+    debug_printf("r300: Set texture state (%dx%d, %d levels)\n",
+		 pt->width[0], pt->height[0], pt->last_level);
+}
+
+/**
+ * Return the stride, in bytes, of the texture images of the given texture
+ * at the given level.
+ */
+unsigned r300_texture_get_stride(struct r300_texture* tex, unsigned level)
+{
+    if (tex->stride_override)
+        return tex->stride_override;
+
+    if (level > tex->tex.last_level) {
+        debug_printf("%s: level (%u) > last_level (%u)\n", __FUNCTION__, level, tex->tex.last_level);
+        return 0;
+    }
+
+    return align(pf_get_stride(&tex->tex.block, tex->tex.width[level]), 32);
 }
 
 static void r300_setup_miptree(struct r300_texture* tex)
 {
     struct pipe_texture* base = &tex->tex;
-    int stride, size, offset;
+    int stride, size;
     int i;
 
     for (i = 0; i <= base->last_level; i++) {
@@ -74,7 +95,7 @@ static void r300_setup_miptree(struct r300_texture* tex)
          * XXX
          * POT, uncompressed, unmippmapped textures can be aligned to 32,
          * instead of 64. */
-        stride = align(pf_get_stride(&base->block, base->width[i]), 32);
+        stride = r300_texture_get_stride(tex, i);
         size = stride * base->nblocksy[i] * base->depth[i];
 
         tex->offset[i] = align(tex->size, 32);
@@ -84,10 +105,6 @@ static void r300_setup_miptree(struct r300_texture* tex)
                 "(%dx%dx%d px, pitch %d bytes)\n",
                 i, base->width[i], base->height[i], base->depth[i],
                 stride);
-        /* Save stride of first level to the texture. */
-        if (i == 0) {
-            tex->stride = stride;
-        }
     }
 }
 
@@ -108,8 +125,7 @@ static struct pipe_texture*
 
     r300_setup_miptree(tex);
 
-    r300_setup_texture_state(tex, template->width[0], template->height[0],
-            template->width[0], template->last_level);
+    r300_setup_texture_state(tex);
 
     tex->buffer = screen->buffer_create(screen, 1024,
                                         PIPE_BUFFER_USAGE_PIXEL,
@@ -189,11 +205,10 @@ static struct pipe_texture*
     pipe_reference_init(&tex->tex.reference, 1);
     tex->tex.screen = screen;
 
-    tex->stride = *stride;
+    tex->stride_override = *stride;
 
     /* XXX */
-    r300_setup_texture_state(tex, tex->tex.width[0], tex->tex.height[0],
-            tex->stride, 0);
+    r300_setup_texture_state(tex);
 
     pipe_buffer_reference(&tex->buffer, buffer);
 
@@ -221,7 +236,7 @@ boolean r300_get_texture_buffer(struct pipe_texture* texture,
     pipe_buffer_reference(buffer, tex->buffer);
 
     if (stride) {
-        *stride = tex->stride;
+        *stride = r300_texture_get_stride(tex, 0);
     }
 
     return TRUE;
diff --git a/src/gallium/drivers/r300/r300_texture.h b/src/gallium/drivers/r300/r300_texture.h
index 3b56f0307c..78ee0f1611 100644
--- a/src/gallium/drivers/r300/r300_texture.h
+++ b/src/gallium/drivers/r300/r300_texture.h
@@ -30,8 +30,12 @@
 #include "r300_context.h"
 #include "r300_reg.h"
 
+struct r300_texture;
+
 void r300_init_screen_texture_functions(struct pipe_screen* screen);
 
+unsigned r300_texture_get_stride(struct r300_texture* tex, unsigned level);
+
 /* Note the signature of R300_EASY_TX_FORMAT(A, R, G, B, FORMAT)... */
 static INLINE uint32_t r300_translate_texformat(enum pipe_format format)
 {
@@ -68,6 +72,11 @@ static INLINE uint32_t r300_translate_texformat(enum pipe_format format)
         /* W24_FP */
         case PIPE_FORMAT_Z24S8_UNORM:
             return R300_EASY_TX_FORMAT(X, X, X, X, W24_FP);
+	/* Z5_Y6_X5 */
+        case PIPE_FORMAT_R16_SNORM:
+            return R300_EASY_TX_FORMAT(X, X, X, X, Z5Y6X5);
+        case PIPE_FORMAT_Z16_UNORM:
+	    return R300_EASY_TX_FORMAT(X, X, X, X, X16);
         default:
             debug_printf("r300: Implementation error: "
                 "Got unsupported texture format %s in %s\n",
diff --git a/src/gallium/drivers/r300/r300_tgsi_to_rc.c b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
index d68a104106..0913ca1bd5 100644
--- a/src/gallium/drivers/r300/r300_tgsi_to_rc.c
+++ b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
@@ -158,7 +158,6 @@ static unsigned translate_saturate(unsigned saturate)
     switch(saturate) {
         case TGSI_SAT_NONE: return SATURATE_OFF;
         case TGSI_SAT_ZERO_ONE: return SATURATE_ZERO_ONE;
-        case TGSI_SAT_MINUS_PLUS_ONE: return SATURATE_PLUS_MINUS_ONE;
     }
 
     fprintf(stderr, "Unknown saturate mode: %i\n", saturate);
diff --git a/src/gallium/drivers/r300/r300_vs.c b/src/gallium/drivers/r300/r300_vs.c
index 2cb903bba2..12a6e37be6 100644
--- a/src/gallium/drivers/r300/r300_vs.c
+++ b/src/gallium/drivers/r300/r300_vs.c
@@ -116,7 +116,7 @@ void r300_translate_vertex_shader(struct r300_context* r300,
     /* Setup the compiler */
     rc_init(&compiler.Base);
 
-    compiler.Base.Debug = 1;
+    compiler.Base.Debug = DBG_ON(r300, DBG_VP);
     compiler.code = &vs->code;
     compiler.UserData = vs;
 
diff --git a/src/gallium/drivers/softpipe/Makefile b/src/gallium/drivers/softpipe/Makefile
index 516e3992fd..bcb887a0b2 100644
--- a/src/gallium/drivers/softpipe/Makefile
+++ b/src/gallium/drivers/softpipe/Makefile
@@ -6,26 +6,17 @@ LIBNAME = softpipe
 C_SOURCES = \
 	sp_fs_exec.c \
 	sp_fs_sse.c \
-	sp_fs_llvm.c \
 	sp_clear.c \
 	sp_flush.c \
 	sp_query.c \
 	sp_context.c \
 	sp_draw_arrays.c \
-	sp_prim_setup.c \
 	sp_prim_vbuf.c \
 	sp_quad_pipe.c \
-	sp_quad_alpha_test.c \
-	sp_quad_blend.c \
-	sp_quad_colormask.c \
-	sp_quad_coverage.c \
+	sp_quad_stipple.c \
 	sp_quad_depth_test.c \
-	sp_quad_earlyz.c \
 	sp_quad_fs.c \
-	sp_quad_occlusion.c \
-	sp_quad_output.c \
-	sp_quad_stencil.c \
-	sp_quad_stipple.c \
+	sp_quad_blend.c \
 	sp_screen.c \
         sp_setup.c \
 	sp_state_blend.c \
@@ -38,7 +29,9 @@ C_SOURCES = \
 	sp_state_vertex.c \
 	sp_texture.c \
 	sp_tex_sample.c \
+	sp_tex_tile_cache.c \
 	sp_tile_cache.c \
-	sp_surface.c 
+	sp_surface.c \
+	sp_video_context.c
 
 include ../../Makefile.template
diff --git a/src/gallium/drivers/softpipe/SConscript b/src/gallium/drivers/softpipe/SConscript
index f8720638a7..aac9edf44e 100644
--- a/src/gallium/drivers/softpipe/SConscript
+++ b/src/gallium/drivers/softpipe/SConscript
@@ -7,25 +7,16 @@ softpipe = env.ConvenienceLibrary(
 	source = [
 		'sp_fs_exec.c',
 		'sp_fs_sse.c',
-		'sp_fs_llvm.c',
 		'sp_clear.c',
 		'sp_context.c',
 		'sp_draw_arrays.c',
 		'sp_flush.c',
-		'sp_prim_setup.c',
 		'sp_prim_vbuf.c',
 		'sp_setup.c',
-		'sp_quad_alpha_test.c',
 		'sp_quad_blend.c',
 		'sp_quad_pipe.c',
-		'sp_quad_colormask.c',
-		'sp_quad_coverage.c',
 		'sp_quad_depth_test.c',
-		'sp_quad_earlyz.c',
 		'sp_quad_fs.c',
-		'sp_quad_occlusion.c',
-		'sp_quad_output.c',
-		'sp_quad_stencil.c',
 		'sp_quad_stipple.c',
 		'sp_query.c',
 		'sp_screen.c',
@@ -39,8 +30,10 @@ softpipe = env.ConvenienceLibrary(
 		'sp_state_vertex.c',
 		'sp_surface.c',
 		'sp_tex_sample.c',
+		'sp_tex_tile_cache.c',
 		'sp_texture.c',
 		'sp_tile_cache.c',
+		'sp_video_context.c',
 	])
 
-Export('softpipe')
-\ No newline at end of file
+Export('softpipe')
diff --git a/src/gallium/drivers/softpipe/sp_clear.c b/src/gallium/drivers/softpipe/sp_clear.c
index d3af18e162..8fac8e6e05 100644
--- a/src/gallium/drivers/softpipe/sp_clear.c
+++ b/src/gallium/drivers/softpipe/sp_clear.c
@@ -36,8 +36,6 @@
 #include "util/u_pack_color.h"
 #include "sp_clear.h"
 #include "sp_context.h"
-#include "sp_surface.h"
-#include "sp_state.h"
 #include "sp_tile_cache.h"
 
 
diff --git a/src/gallium/drivers/softpipe/sp_clear.h b/src/gallium/drivers/softpipe/sp_clear.h
index 2e450672f5..9be3b86fe9 100644
--- a/src/gallium/drivers/softpipe/sp_clear.h
+++ b/src/gallium/drivers/softpipe/sp_clear.h
@@ -32,7 +32,6 @@
 #ifndef SP_CLEAR_H
 #define SP_CLEAR_H
 
-#include "pipe/p_state.h"
 struct pipe_context;
 
 extern void
diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c
index b4650c0dc5..94d000a5ac 100644
--- a/src/gallium/drivers/softpipe/sp_context.c
+++ b/src/gallium/drivers/softpipe/sp_context.c
@@ -31,17 +31,18 @@
  */
 
 #include "draw/draw_context.h"
+#include "draw/draw_vbuf.h"
 #include "pipe/p_defines.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 #include "sp_clear.h"
 #include "sp_context.h"
 #include "sp_flush.h"
-#include "sp_prim_setup.h"
 #include "sp_prim_vbuf.h"
 #include "sp_state.h"
 #include "sp_surface.h"
 #include "sp_tile_cache.h"
+#include "sp_tex_tile_cache.h"
 #include "sp_texture.h"
 #include "sp_winsys.h"
 #include "sp_query.h"
@@ -72,18 +73,16 @@ softpipe_unmap_transfers(struct softpipe_context *sp)
 {
    uint i;
 
-   for (i = 0; i < sp->framebuffer.nr_cbufs; i++)
-      sp_flush_tile_cache(sp, sp->cbuf_cache[i]);
-   sp_flush_tile_cache(sp, sp->zsbuf_cache);
-
    for (i = 0; i < sp->framebuffer.nr_cbufs; i++) {
       sp_tile_cache_unmap_transfers(sp->cbuf_cache[i]);
    }
+
    sp_tile_cache_unmap_transfers(sp->zsbuf_cache);
 }
 
 
-static void softpipe_destroy( struct pipe_context *pipe )
+static void
+softpipe_destroy( struct pipe_context *pipe )
 {
    struct softpipe_context *softpipe = softpipe_context( pipe );
    uint i;
@@ -91,19 +90,9 @@ static void softpipe_destroy( struct pipe_context *pipe )
    if (softpipe->draw)
       draw_destroy( softpipe->draw );
 
-   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) {
-      softpipe->quad[i].polygon_stipple->destroy( softpipe->quad[i].polygon_stipple );
-      softpipe->quad[i].earlyz->destroy( softpipe->quad[i].earlyz );
-      softpipe->quad[i].shade->destroy( softpipe->quad[i].shade );
-      softpipe->quad[i].alpha_test->destroy( softpipe->quad[i].alpha_test );
-      softpipe->quad[i].depth_test->destroy( softpipe->quad[i].depth_test );
-      softpipe->quad[i].stencil_test->destroy( softpipe->quad[i].stencil_test );
-      softpipe->quad[i].occlusion->destroy( softpipe->quad[i].occlusion );
-      softpipe->quad[i].coverage->destroy( softpipe->quad[i].coverage );
-      softpipe->quad[i].blend->destroy( softpipe->quad[i].blend );
-      softpipe->quad[i].colormask->destroy( softpipe->quad[i].colormask );
-      softpipe->quad[i].output->destroy( softpipe->quad[i].output );
-   }
+      softpipe->quad.shade->destroy( softpipe->quad.shade );
+      softpipe->quad.depth_test->destroy( softpipe->quad.depth_test );
+      softpipe->quad.blend->destroy( softpipe->quad.blend );
 
    for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
       sp_destroy_tile_cache(softpipe->cbuf_cache[i]);
@@ -113,7 +102,7 @@ static void softpipe_destroy( struct pipe_context *pipe )
    pipe_surface_reference(&softpipe->framebuffer.zsbuf, NULL);
 
    for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
-      sp_destroy_tile_cache(softpipe->tex_cache[i]);
+      sp_destroy_tex_tile_cache(softpipe->tex_cache[i]);
       pipe_texture_reference(&softpipe->texture[i], NULL);
    }
 
@@ -126,6 +115,15 @@ static void softpipe_destroy( struct pipe_context *pipe )
    FREE( softpipe );
 }
 
+
+/**
+ * if (the texture is being used as a framebuffer surface)
+ *    return PIPE_REFERENCED_FOR_WRITE
+ * else if (the texture is a bound texture source)
+ *    return PIPE_REFERENCED_FOR_READ  XXX not done yet
+ * else
+ *    return PIPE_UNREFERENCED
+ */
 static unsigned int
 softpipe_is_texture_referenced( struct pipe_context *pipe,
 				struct pipe_texture *texture,
@@ -134,15 +132,17 @@ softpipe_is_texture_referenced( struct pipe_context *pipe,
    struct softpipe_context *softpipe = softpipe_context( pipe );
    unsigned i;
 
-   if(softpipe->dirty_render_cache) {
+   if (softpipe->dirty_render_cache) {
       for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++) {
-         if(softpipe->framebuffer.cbufs[i] && 
-            softpipe->framebuffer.cbufs[i]->texture == texture)
+         if (softpipe->framebuffer.cbufs[i] && 
+             softpipe->framebuffer.cbufs[i]->texture == texture) {
             return PIPE_REFERENCED_FOR_WRITE;
+         }
       }
-      if(softpipe->framebuffer.zsbuf && 
-         softpipe->framebuffer.zsbuf->texture == texture)
+      if (softpipe->framebuffer.zsbuf && 
+          softpipe->framebuffer.zsbuf->texture == texture) {
          return PIPE_REFERENCED_FOR_WRITE;
+      }
    }
    
    /* FIXME: we also need to do the same for the texture cache */
@@ -150,6 +150,7 @@ softpipe_is_texture_referenced( struct pipe_context *pipe,
    return PIPE_UNREFERENCED;
 }
 
+
 static unsigned int
 softpipe_is_buffer_referenced( struct pipe_context *pipe,
 			       struct pipe_buffer *buf)
@@ -157,6 +158,7 @@ softpipe_is_buffer_referenced( struct pipe_context *pipe,
    return PIPE_UNREFERENCED;
 }
 
+
 struct pipe_context *
 softpipe_create( struct pipe_screen *screen )
 {
@@ -227,7 +229,6 @@ softpipe_create( struct pipe_screen *screen )
    softpipe->pipe.is_buffer_referenced = softpipe_is_buffer_referenced;
 
    softpipe_init_query_funcs( softpipe );
-   softpipe_init_texture_funcs( softpipe );
 
    /*
     * Alloc caches for accessing drawing surfaces and textures.
@@ -238,41 +239,14 @@ softpipe_create( struct pipe_screen *screen )
    softpipe->zsbuf_cache = sp_create_tile_cache( screen );
 
    for (i = 0; i < PIPE_MAX_SAMPLERS; i++)
-      softpipe->tex_cache[i] = sp_create_tile_cache( screen );
+      softpipe->tex_cache[i] = sp_create_tex_tile_cache( screen );
 
 
    /* setup quad rendering stages */
-   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) {
-      softpipe->quad[i].polygon_stipple = sp_quad_polygon_stipple_stage(softpipe);
-      softpipe->quad[i].earlyz = sp_quad_earlyz_stage(softpipe);
-      softpipe->quad[i].shade = sp_quad_shade_stage(softpipe);
-      softpipe->quad[i].alpha_test = sp_quad_alpha_test_stage(softpipe);
-      softpipe->quad[i].depth_test = sp_quad_depth_test_stage(softpipe);
-      softpipe->quad[i].stencil_test = sp_quad_stencil_test_stage(softpipe);
-      softpipe->quad[i].occlusion = sp_quad_occlusion_stage(softpipe);
-      softpipe->quad[i].coverage = sp_quad_coverage_stage(softpipe);
-      softpipe->quad[i].blend = sp_quad_blend_stage(softpipe);
-      softpipe->quad[i].colormask = sp_quad_colormask_stage(softpipe);
-      softpipe->quad[i].output = sp_quad_output_stage(softpipe);
-   }
-
-   /* vertex shader samplers */
-   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
-      softpipe->tgsi.vert_samplers[i].base.get_samples = sp_get_samples_vertex;
-      softpipe->tgsi.vert_samplers[i].unit = i;
-      softpipe->tgsi.vert_samplers[i].sp = softpipe;
-      softpipe->tgsi.vert_samplers[i].cache = softpipe->tex_cache[i];
-      softpipe->tgsi.vert_samplers_list[i] = &softpipe->tgsi.vert_samplers[i];
-   }
+      softpipe->quad.shade = sp_quad_shade_stage(softpipe);
+      softpipe->quad.depth_test = sp_quad_depth_test_stage(softpipe);
+      softpipe->quad.blend = sp_quad_blend_stage(softpipe);
 
-   /* fragment shader samplers */
-   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
-      softpipe->tgsi.frag_samplers[i].base.get_samples = sp_get_samples_fragment;
-      softpipe->tgsi.frag_samplers[i].unit = i;
-      softpipe->tgsi.frag_samplers[i].sp = softpipe;
-      softpipe->tgsi.frag_samplers[i].cache = softpipe->tex_cache[i];
-      softpipe->tgsi.frag_samplers_list[i] = &softpipe->tgsi.frag_samplers[i];
-   }
 
    /*
     * Create drawing context and plug our rendering stage into it.
@@ -286,30 +260,28 @@ softpipe_create( struct pipe_screen *screen )
                          (struct tgsi_sampler **)
                             softpipe->tgsi.vert_samplers_list);
 
-   softpipe->setup = sp_draw_render_stage(softpipe);
-   if (!softpipe->setup)
-      goto fail;
-
    if (debug_get_bool_option( "SP_NO_RAST", FALSE ))
       softpipe->no_rast = TRUE;
 
-   if (debug_get_bool_option( "SP_NO_VBUF", FALSE )) {
-      /* Deprecated path -- vbuf is the intended interface to the draw module:
-       */
-      draw_set_rasterize_stage(softpipe->draw, softpipe->setup);
-   }
-   else {
-      sp_init_vbuf(softpipe);
-   }
+   softpipe->vbuf_backend = sp_create_vbuf_backend(softpipe);
+   if (!softpipe->vbuf_backend)
+      goto fail;
+
+   softpipe->vbuf = draw_vbuf_stage(softpipe->draw, softpipe->vbuf_backend);
+   if (!softpipe->vbuf)
+      goto fail;
+
+   draw_set_rasterize_stage(softpipe->draw, softpipe->vbuf);
+   draw_set_render(softpipe->draw, softpipe->vbuf_backend);
+
+
 
    /* plug in AA line/point stages */
    draw_install_aaline_stage(softpipe->draw, &softpipe->pipe);
    draw_install_aapoint_stage(softpipe->draw, &softpipe->pipe);
 
-#if USE_DRAW_STAGE_PSTIPPLE
    /* Do polygon stipple w/ texture map + frag prog? */
    draw_install_pstipple_stage(softpipe->draw, &softpipe->pipe);
-#endif
 
    sp_init_surface_functions(softpipe);
 
diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index 7888c2f644..43a195c8ef 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -36,24 +36,13 @@
 #include "draw/draw_vertex.h"
 
 #include "sp_quad_pipe.h"
-#include "sp_tex_sample.h"
 
 
-/**
- * This is a temporary variable for testing draw-stage polygon stipple.
- * If zero, do stipple in sp_quad_stipple.c
- */
-#define USE_DRAW_STAGE_PSTIPPLE 1
-
-/* Number of threads working on individual quads.
- * Setting to 1 disables this feature.
- */
-#define SP_NUM_QUAD_THREADS 1
-
 struct softpipe_vbuf_render;
 struct draw_context;
 struct draw_stage;
 struct softpipe_tile_cache;
+struct softpipe_tex_tile_cache;
 struct sp_fragment_shader;
 struct sp_vertex_shader;
 
@@ -62,12 +51,12 @@ struct softpipe_context {
    struct pipe_context pipe;  /**< base class */
 
    /** Constant state objects */
-   const struct pipe_blend_state *blend;
-   const struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS];
-   const struct pipe_depth_stencil_alpha_state *depth_stencil;
-   const struct pipe_rasterizer_state *rasterizer;
-   const struct sp_fragment_shader *fs;
-   const struct sp_vertex_shader *vs;
+   struct pipe_blend_state *blend;
+   struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS];
+   struct pipe_depth_stencil_alpha_state *depth_stencil;
+   struct pipe_rasterizer_state *rasterizer;
+   struct sp_fragment_shader *fs;
+   struct sp_vertex_shader *vs;
 
    /** Other rendering state */
    struct pipe_blend_color blend_color;
@@ -107,7 +96,16 @@ struct softpipe_context {
    /** Which vertex shader output slot contains point size */
    int psize_slot;
 
-   unsigned reduced_api_prim;  /**< PIPE_PRIM_POINTS, _LINES or _TRIANGLES */
+   /* The reduced version of the primitive supplied by the state
+    * tracker.
+    */
+   unsigned reduced_api_prim;
+
+   /* The reduced primitive after unfilled triangles, wide-line
+    * decomposition, etc, are taken into account.  This is the
+    * primitive actually rasterized.
+    */
+   unsigned reduced_prim;
 
    /** Derived from scissor and surface bounds: */
    struct pipe_scissor_state cliprect;
@@ -116,41 +114,33 @@ struct softpipe_context {
 
    /** Software quad rendering pipeline */
    struct {
-      struct quad_stage *polygon_stipple;
-      struct quad_stage *earlyz;
       struct quad_stage *shade;
-      struct quad_stage *alpha_test;
-      struct quad_stage *stencil_test;
       struct quad_stage *depth_test;
-      struct quad_stage *occlusion;
-      struct quad_stage *coverage;
       struct quad_stage *blend;
-      struct quad_stage *colormask;
-      struct quad_stage *output;
 
       struct quad_stage *first; /**< points to one of the above stages */
-   } quad[SP_NUM_QUAD_THREADS];
+   } quad;
 
    /** TGSI exec things */
    struct {
-      struct sp_shader_sampler vert_samplers[PIPE_MAX_SAMPLERS];
-      struct sp_shader_sampler *vert_samplers_list[PIPE_MAX_SAMPLERS];
-      struct sp_shader_sampler frag_samplers[PIPE_MAX_SAMPLERS];
-      struct sp_shader_sampler *frag_samplers_list[PIPE_MAX_SAMPLERS];
+      struct sp_sampler_varient *vert_samplers_list[PIPE_MAX_SAMPLERS];
+      struct sp_sampler_varient *frag_samplers_list[PIPE_MAX_SAMPLERS];
    } tgsi;
 
    /** The primitive drawing context */
    struct draw_context *draw;
-   struct draw_stage *setup;
+
+   /** Draw module backend */
+   struct vbuf_render *vbuf_backend;
    struct draw_stage *vbuf;
-   struct softpipe_vbuf_render *vbuf_render;
 
    boolean dirty_render_cache;
    
    struct softpipe_tile_cache *cbuf_cache[PIPE_MAX_COLOR_BUFS];
    struct softpipe_tile_cache *zsbuf_cache;
-
-   struct softpipe_tile_cache *tex_cache[PIPE_MAX_SAMPLERS];
+   
+   unsigned tex_timestamp;
+   struct softpipe_tex_tile_cache *tex_cache[PIPE_MAX_SAMPLERS];
 
    unsigned use_sse : 1;
    unsigned dump_fs : 1;
@@ -164,5 +154,9 @@ softpipe_context( struct pipe_context *pipe )
    return (struct softpipe_context *)pipe;
 }
 
+void
+softpipe_reset_sampler_varients(struct softpipe_context *softpipe);
+
+
 #endif /* SP_CONTEXT_H */
 
diff --git a/src/gallium/drivers/softpipe/sp_flush.c b/src/gallium/drivers/softpipe/sp_flush.c
index 4a14d49686..e38b767cf2 100644
--- a/src/gallium/drivers/softpipe/sp_flush.c
+++ b/src/gallium/drivers/softpipe/sp_flush.c
@@ -37,6 +37,7 @@
 #include "sp_surface.h"
 #include "sp_state.h"
 #include "sp_tile_cache.h"
+#include "sp_tex_tile_cache.h"
 #include "sp_winsys.h"
 
 
@@ -52,17 +53,19 @@ softpipe_flush( struct pipe_context *pipe,
 
    if (flags & PIPE_FLUSH_TEXTURE_CACHE) {
       for (i = 0; i < softpipe->num_textures; i++) {
-         sp_flush_tile_cache(softpipe, softpipe->tex_cache[i]);
+         sp_flush_tex_tile_cache(softpipe->tex_cache[i]);
       }
    }
 
-   if (flags & PIPE_FLUSH_RENDER_CACHE) {
+   if (flags & PIPE_FLUSH_SWAPBUFFERS) {
+      /* If this is a swapbuffers, just flush color buffers.
+       *
+       * The zbuffer changes are not discarded, but held in the cache
+       * in the hope that a later clear will wipe them out.
+       */
       for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++)
          if (softpipe->cbuf_cache[i])
-            sp_flush_tile_cache(softpipe, softpipe->cbuf_cache[i]);
-
-      if (softpipe->zsbuf_cache)
-         sp_flush_tile_cache(softpipe, softpipe->zsbuf_cache);
+            sp_flush_tile_cache(softpipe->cbuf_cache[i]);
 
       /* Need this call for hardware buffers before swapbuffers.
        *
@@ -71,7 +74,15 @@ softpipe_flush( struct pipe_context *pipe,
        * to unmap surfaces when flushing.
        */
       softpipe_unmap_transfers(softpipe);
-      
+   }
+   else if (flags & PIPE_FLUSH_RENDER_CACHE) {
+      for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++)
+         if (softpipe->cbuf_cache[i])
+            sp_flush_tile_cache(softpipe->cbuf_cache[i]);
+
+      if (softpipe->zsbuf_cache)
+         sp_flush_tile_cache(softpipe->zsbuf_cache);
+     
       softpipe->dirty_render_cache = FALSE;
    }
 
diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
index 9ee86fe787..4076114d39 100644
--- a/src/gallium/drivers/softpipe/sp_fs_exec.c
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -59,15 +59,34 @@ sp_exec_fragment_shader(const struct sp_fragment_shader *base)
 }
 
 
+static void
+exec_prepare( const struct sp_fragment_shader *base,
+	      struct tgsi_exec_machine *machine,
+	      struct tgsi_sampler **samplers )
+{
+   /*
+    * Bind tokens/shader to the interpreter's machine state.
+    * Avoid redundant binding.
+    */
+   if (machine->Tokens != base->shader.tokens) {
+      tgsi_exec_machine_bind_shader( machine,
+                                     base->shader.tokens,
+                                     PIPE_MAX_SAMPLERS,
+                                     samplers );
+   }
+}
+
+
+
 /**
  * Compute quad X,Y,Z,W for the four fragments in a quad.
  *
  * This should really be part of the compiled shader.
  */
-void
-sp_setup_pos_vector(const struct tgsi_interp_coef *coef,
-		    float x, float y,
-		    struct tgsi_exec_vector *quadpos)
+static void
+setup_pos_vector(const struct tgsi_interp_coef *coef,
+                 float x, float y,
+                 struct tgsi_exec_vector *quadpos)
 {
    uint chan;
    /* do X */
@@ -95,24 +114,6 @@ sp_setup_pos_vector(const struct tgsi_interp_coef *coef,
 }
 
 
-static void
-exec_prepare( const struct sp_fragment_shader *base,
-	      struct tgsi_exec_machine *machine,
-	      struct tgsi_sampler **samplers )
-{
-   /*
-    * Bind tokens/shader to the interpreter's machine state.
-    * Avoid redundant binding.
-    */
-   if (machine->Tokens != base->shader.tokens) {
-      tgsi_exec_machine_bind_shader( machine,
-                                     base->shader.tokens,
-                                     PIPE_MAX_SAMPLERS,
-                                     samplers );
-   }
-}
-
-
 /* TODO: hide the machine struct in here somewhere, remove from this
  * interface:
  */
@@ -122,11 +123,43 @@ exec_run( const struct sp_fragment_shader *base,
 	  struct quad_header *quad )
 {
    /* Compute X, Y, Z, W vals for this quad */
-   sp_setup_pos_vector(quad->posCoef, 
-		       (float)quad->input.x0, (float)quad->input.y0, 
-		       &machine->QuadPos);
+   setup_pos_vector(quad->posCoef, 
+                    (float)quad->input.x0, (float)quad->input.y0, 
+                    &machine->QuadPos);
    
-   return tgsi_exec_machine_run( machine );
+   quad->inout.mask &= tgsi_exec_machine_run( machine );
+   if (quad->inout.mask == 0)
+      return FALSE;
+
+   /* store outputs */
+   {
+      const ubyte *sem_name = base->info.output_semantic_name;
+      const ubyte *sem_index = base->info.output_semantic_index;
+      const uint n = base->info.num_outputs;
+      uint i;
+      for (i = 0; i < n; i++) {
+         switch (sem_name[i]) {
+         case TGSI_SEMANTIC_COLOR:
+            {
+               uint cbuf = sem_index[i];
+               memcpy(quad->output.color[cbuf],
+                      &machine->Outputs[i].xyzw[0].f[0],
+                      sizeof(quad->output.color[0]) );
+            }
+            break;
+         case TGSI_SEMANTIC_POSITION:
+            {
+               uint j;
+               for (j = 0; j < 4; j++) {
+                  quad->output.depth[j] = machine->Outputs[i].xyzw[2].f[j];
+               }
+            }
+            break;
+         }
+      }
+   }
+
+   return TRUE;
 }
 
 
diff --git a/src/gallium/drivers/softpipe/sp_fs_llvm.c b/src/gallium/drivers/softpipe/sp_fs_llvm.c
deleted file mode 100644
index 95c0d982d1..0000000000
--- a/src/gallium/drivers/softpipe/sp_fs_llvm.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * Execute fragment shader using LLVM code generation.
- * Authors:
- *   Zack Rusin
- */
-
-#include "sp_context.h"
-#include "sp_state.h"
-#include "sp_fs.h"
-
-#include "pipe/p_state.h"
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-#include "tgsi/tgsi_sse2.h"
-
-#if 0
-
-/**
- * Subclass of sp_fragment_shader
- */
-struct sp_llvm_fragment_shader
-{
-   struct sp_fragment_shader base;
-   struct gallivm_prog *llvm_prog;
-};
-
-
-static void
-shade_quad_llvm(struct quad_stage *qs,
-                struct quad_header *quad)
-{
-   struct quad_shade_stage *qss = quad_shade_stage(qs);
-   struct softpipe_context *softpipe = qs->softpipe;
-   float dests[4][16][4] ALIGN16_ATTRIB;
-   float inputs[4][16][4] ALIGN16_ATTRIB;
-   const float fx = (float) quad->x0;
-   const float fy = (float) quad->y0;
-   struct gallivm_prog *llvm = qss->llvm_prog;
-
-   inputs[0][0][0] = fx;
-   inputs[1][0][0] = fx + 1.0f;
-   inputs[2][0][0] = fx;
-   inputs[3][0][0] = fx + 1.0f;
-
-   inputs[0][0][1] = fy;
-   inputs[1][0][1] = fy;
-   inputs[2][0][1] = fy + 1.0f;
-   inputs[3][0][1] = fy + 1.0f;
-
-
-   gallivm_prog_inputs_interpolate(llvm, inputs, quad->coef);
-
-#if DLLVM
-   debug_printf("MASK = %d\n", quad->mask);
-   for (int i = 0; i < 4; ++i) {
-      for (int j = 0; j < 2; ++j) {
-         debug_printf("IN(%d,%d) [%f %f %f %f]\n", i, j, 
-                inputs[i][j][0], inputs[i][j][1], inputs[i][j][2], inputs[i][j][3]);
-      }
-   }
-#endif
-
-   quad->mask &=
-      gallivm_fragment_shader_exec(llvm, fx, fy, dests, inputs,
-                                   softpipe->mapped_constants[PIPE_SHADER_FRAGMENT],
-                                   qss->samplers);
-#if DLLVM
-   debug_printf("OUT LLVM = 1[%f %f %f %f], 2[%f %f %f %f]\n",
-          dests[0][0][0], dests[0][0][1], dests[0][0][2], dests[0][0][3], 
-          dests[0][1][0], dests[0][1][1], dests[0][1][2], dests[0][1][3]);
-#endif
-
-   /* store result color */
-   if (qss->colorOutSlot >= 0) {
-      unsigned i;
-      /* XXX need to handle multiple color outputs someday */
-      allvmrt(qss->stage.softpipe->fs->info.output_semantic_name[qss->colorOutSlot]
-             == TGSI_SEMANTIC_COLOR);
-      for (i = 0; i < QUAD_SIZE; ++i) {
-         quad->outputs.color[0][0][i] = dests[i][qss->colorOutSlot][0];
-         quad->outputs.color[0][1][i] = dests[i][qss->colorOutSlot][1];
-         quad->outputs.color[0][2][i] = dests[i][qss->colorOutSlot][2];
-         quad->outputs.color[0][3][i] = dests[i][qss->colorOutSlot][3];
-      }
-   }
-#if DLLVM
-   for (int i = 0; i < QUAD_SIZE; ++i) {
-      debug_printf("QLLVM%d(%d) [%f, %f, %f, %f]\n", i, qss->colorOutSlot,
-             quad->outputs.color[0][0][i],
-             quad->outputs.color[0][1][i],
-             quad->outputs.color[0][2][i],
-             quad->outputs.color[0][3][i]);
-   }
-#endif
-
-   /* store result Z */
-   if (qss->depthOutSlot >= 0) {
-      /* output[slot] is new Z */
-      uint i;
-      for (i = 0; i < 4; i++) {
-         quad->outputs.depth[i] = dests[i][0][2];
-      }
-   }
-   else {
-      /* copy input Z (which was interpolated by the executor) to output Z */
-      uint i;
-      for (i = 0; i < 4; i++) {
-         quad->outputs.depth[i] = inputs[i][0][2];
-      }
-   }
-#if DLLVM
-   debug_printf("D [%f, %f, %f, %f] mask = %d\n",
-             quad->outputs.depth[0],
-             quad->outputs.depth[1],
-             quad->outputs.depth[2],
-             quad->outputs.depth[3], quad->mask);
-#endif
-
-   /* shader may cull fragments */
-   if( quad->mask ) {
-      qs->next->run( qs->next, quad );
-   }
-}
-
-
-unsigned 
-run_llvm_fs( const struct sp_fragment_shader *base,
-	     struct foo *machine )
-{
-}
-
-
-void 
-delete_llvm_fs( struct sp_fragment_shader *base )
-{
-   FREE(base);
-}
-
-
-struct sp_fragment_shader *
-softpipe_create_fs_llvm(struct softpipe_context *softpipe,
-                        const struct pipe_shader_state *templ)
-{
-   struct sp_llvm_fragment_shader *shader = NULL;
-
-   /* LLVM fragment shaders currently disabled:
-    */
-   state = CALLOC_STRUCT(sp_llvm_shader_state);
-   if (!state)
-      return NULL;
-
-   state->llvm_prog = 0;
-
-   if (!gallivm_global_cpu_engine()) {
-      gallivm_cpu_engine_create(state->llvm_prog);
-   }
-   else
-      gallivm_cpu_jit_compile(gallivm_global_cpu_engine(), state->llvm_prog);
-   
-   if (shader) {
-      shader->base.run = run_llvm_fs;
-      shader->base.delete = delete_llvm_fs;
-   }
-
-   return shader;
-}
-
-
-#else
-
-struct sp_fragment_shader *
-softpipe_create_fs_llvm(struct softpipe_context *softpipe,
-		       const struct pipe_shader_state *templ)
-{
-   return NULL;
-}
-
-#endif
diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c
index f4fa0905d7..9d3e4670ee 100644
--- a/src/gallium/drivers/softpipe/sp_fs_sse.c
+++ b/src/gallium/drivers/softpipe/sp_fs_sse.c
@@ -76,6 +76,43 @@ fs_sse_prepare( const struct sp_fragment_shader *base,
 }
 
 
+
+/**
+ * Compute quad X,Y,Z,W for the four fragments in a quad.
+ *
+ * This should really be part of the compiled shader.
+ */
+static void
+setup_pos_vector(const struct tgsi_interp_coef *coef,
+		    float x, float y,
+		    struct tgsi_exec_vector *quadpos)
+{
+   uint chan;
+   /* do X */
+   quadpos->xyzw[0].f[0] = x;
+   quadpos->xyzw[0].f[1] = x + 1;
+   quadpos->xyzw[0].f[2] = x;
+   quadpos->xyzw[0].f[3] = x + 1;
+
+   /* do Y */
+   quadpos->xyzw[1].f[0] = y;
+   quadpos->xyzw[1].f[1] = y;
+   quadpos->xyzw[1].f[2] = y + 1;
+   quadpos->xyzw[1].f[3] = y + 1;
+
+   /* do Z and W for all fragments in the quad */
+   for (chan = 2; chan < 4; chan++) {
+      const float dadx = coef->dadx[chan];
+      const float dady = coef->dady[chan];
+      const float a0 = coef->a0[chan] + dadx * x + dady * y;
+      quadpos->xyzw[chan].f[0] = a0;
+      quadpos->xyzw[chan].f[1] = a0 + dadx;
+      quadpos->xyzw[chan].f[2] = a0 + dady;
+      quadpos->xyzw[chan].f[3] = a0 + dadx + dady;
+   }
+}
+
+
 /* TODO: codegenerate the whole run function, skip this wrapper.
  * TODO: break dependency on tgsi_exec_machine struct
  * TODO: push Position calculation into the generated shader
@@ -89,9 +126,9 @@ fs_sse_run( const struct sp_fragment_shader *base,
    struct sp_sse_fragment_shader *shader = sp_sse_fragment_shader(base);
 
    /* Compute X, Y, Z, W vals for this quad -- place in temp[0] for now */
-   sp_setup_pos_vector(quad->posCoef, 
-		       (float)quad->input.x0, (float)quad->input.y0, 
-		       machine->Temps);
+   setup_pos_vector(quad->posCoef, 
+                    (float)quad->input.x0, (float)quad->input.y0, 
+                    machine->Temps);
 
    /* init kill mask */
    tgsi_set_kill_mask(machine, 0x0);
@@ -104,7 +141,39 @@ fs_sse_run( const struct sp_fragment_shader *base,
 		 //	 , &machine->QuadPos
       );
 
-   return ~(machine->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0]);
+   quad->inout.mask &= ~(machine->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0]);
+   if (quad->inout.mask == 0)
+      return FALSE;
+
+   /* store outputs */
+   {
+      const ubyte *sem_name = base->info.output_semantic_name;
+      const ubyte *sem_index = base->info.output_semantic_index;
+      const uint n = base->info.num_outputs;
+      uint i;
+      for (i = 0; i < n; i++) {
+         switch (sem_name[i]) {
+         case TGSI_SEMANTIC_COLOR:
+            {
+               uint cbuf = sem_index[i];
+               memcpy(quad->output.color[cbuf],
+                      &machine->Outputs[i].xyzw[0].f[0],
+                      sizeof(quad->output.color[0]) );
+            }
+            break;
+         case TGSI_SEMANTIC_POSITION:
+            {
+               uint j;
+               for (j = 0; j < 4; j++) {
+                  quad->output.depth[j] = machine->Outputs[0].xyzw[2].f[j];
+               }
+            }
+            break;
+         }
+      }
+   }
+
+   return TRUE;
 }
 
 
diff --git a/src/gallium/drivers/softpipe/sp_prim_setup.c b/src/gallium/drivers/softpipe/sp_prim_setup.c
deleted file mode 100644
index 038ff04d4f..0000000000
--- a/src/gallium/drivers/softpipe/sp_prim_setup.c
+++ /dev/null
@@ -1,190 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * \brief A draw stage that drives our triangle setup routines from
- * within the draw pipeline.  One of two ways to drive setup, the
- * other being in sp_prim_vbuf.c.
- *
- * \author  Keith Whitwell <keith@tungstengraphics.com>
- * \author  Brian Paul
- */
-
-
-#include "sp_context.h"
-#include "sp_setup.h"
-#include "sp_state.h"
-#include "sp_prim_setup.h"
-#include "draw/draw_pipe.h"
-#include "draw/draw_vertex.h"
-#include "util/u_memory.h"
-
-/**
- * Triangle setup info (derived from draw_stage).
- * Also used for line drawing (taking some liberties).
- */
-struct setup_stage {
-   struct draw_stage stage; /**< This must be first (base class) */
-
-   struct setup_context *setup;
-};
-
-
-
-/**
- * Basically a cast wrapper.
- */
-static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
-{
-   return (struct setup_stage *)stage;
-}
-
-
-typedef const float (*cptrf4)[4];
-
-static void
-do_tri(struct draw_stage *stage, struct prim_header *prim)
-{
-   struct setup_stage *setup = setup_stage( stage );
-   
-   setup_tri( setup->setup,
-              (cptrf4)prim->v[0]->data,
-              (cptrf4)prim->v[1]->data,
-              (cptrf4)prim->v[2]->data );
-}
-
-static void
-do_line(struct draw_stage *stage, struct prim_header *prim)
-{
-   struct setup_stage *setup = setup_stage( stage );
-
-   setup_line( setup->setup,
-               (cptrf4)prim->v[0]->data,
-               (cptrf4)prim->v[1]->data );
-}
-
-static void
-do_point(struct draw_stage *stage, struct prim_header *prim)
-{
-   struct setup_stage *setup = setup_stage( stage );
-
-   setup_point( setup->setup,
-                (cptrf4)prim->v[0]->data );
-}
-
-
-
-
-static void setup_begin( struct draw_stage *stage )
-{
-   struct setup_stage *setup = setup_stage(stage);
-
-   setup_prepare( setup->setup );
-
-   stage->point = do_point;
-   stage->line = do_line;
-   stage->tri = do_tri;
-}
-
-
-static void setup_first_point( struct draw_stage *stage,
-			       struct prim_header *header )
-{
-   setup_begin(stage);
-   stage->point( stage, header );
-}
-
-static void setup_first_line( struct draw_stage *stage,
-			       struct prim_header *header )
-{
-   setup_begin(stage);
-   stage->line( stage, header );
-}
-
-
-static void setup_first_tri( struct draw_stage *stage,
-			       struct prim_header *header )
-{
-   setup_begin(stage);
-   stage->tri( stage, header );
-}
-
-
-
-static void setup_flush( struct draw_stage *stage,
-			 unsigned flags )
-{
-   stage->point = setup_first_point;
-   stage->line = setup_first_line;
-   stage->tri = setup_first_tri;
-}
-
-
-static void reset_stipple_counter( struct draw_stage *stage )
-{
-}
-
-
-static void render_destroy( struct draw_stage *stage )
-{
-   struct setup_stage *ssetup = setup_stage(stage);
-   setup_destroy_context(ssetup->setup);
-   FREE( stage );
-}
-
-
-/**
- * Create a new primitive setup/render stage.
- */
-struct draw_stage *sp_draw_render_stage( struct softpipe_context *softpipe )
-{
-   struct setup_stage *sstage = CALLOC_STRUCT(setup_stage);
-
-   sstage->setup = setup_create_context(softpipe);
-   sstage->stage.draw = softpipe->draw;
-   sstage->stage.point = setup_first_point;
-   sstage->stage.line = setup_first_line;
-   sstage->stage.tri = setup_first_tri;
-   sstage->stage.flush = setup_flush;
-   sstage->stage.reset_stipple_counter = reset_stipple_counter;
-   sstage->stage.destroy = render_destroy;
-
-   return (struct draw_stage *)sstage;
-}
-
-struct setup_context *
-sp_draw_setup_context( struct draw_stage *stage )
-{
-   struct setup_stage *ssetup = setup_stage(stage);
-   return ssetup->setup;
-}
-
-void
-sp_draw_flush( struct draw_stage *stage )
-{
-   stage->flush( stage, 0 );
-}
diff --git a/src/gallium/drivers/softpipe/sp_prim_setup.h b/src/gallium/drivers/softpipe/sp_prim_setup.h
deleted file mode 100644
index 49bdd98ed8..0000000000
--- a/src/gallium/drivers/softpipe/sp_prim_setup.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-#ifndef SP_PRIM_SETUP_H
-#define SP_PRIM_SETUP_H
-
-
-/**
- * vbuf is a special stage to gather the stream of triangles, lines, points
- * together and reconstruct vertex buffers for hardware upload.
- *
- * First attempt, work in progress.
- * 
- * TODO:
- *    - separate out vertex buffer building and primitive emit, ie >1 draw per vb.
- *    - tell vbuf stage how to build hw vertices directly
- *    - pass vbuf stage a buffer pointer for direct emit to agp/vram.
- *
- *
- *
- * Vertices are just an array of floats, with all the attributes
- * packed.  We currently assume a layout like:
- *
- * attr[0][0..3] - window position
- * attr[1..n][0..3] - remaining attributes.
- *
- * Attributes are assumed to be 4 floats wide but are packed so that
- * all the enabled attributes run contiguously.
- */
-
-
-struct draw_stage;
-struct softpipe_context;
-
-
-typedef void (*vbuf_draw_func)( struct pipe_context *pipe,
-                                unsigned prim,
-                                const ushort *elements,
-                                unsigned nr_elements,
-                                const void *vertex_buffer,
-                                unsigned nr_vertices );
-
-
-extern struct draw_stage *
-sp_draw_render_stage( struct softpipe_context *softpipe );
-
-extern struct setup_context *
-sp_draw_setup_context( struct draw_stage * );
-
-extern void
-sp_draw_flush( struct draw_stage * );
-
-
-extern struct draw_stage *
-sp_draw_vbuf_stage( struct draw_context *draw_context,
-                    struct pipe_context *pipe,
-                    vbuf_draw_func draw );
-
-
-#endif /* SP_PRIM_SETUP_H */
diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
index 42021789ea..e603c20fc4 100644
--- a/src/gallium/drivers/softpipe/sp_prim_vbuf.c
+++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
@@ -37,13 +37,13 @@
 
 
 #include "sp_context.h"
+#include "sp_setup.h"
 #include "sp_state.h"
 #include "sp_prim_vbuf.h"
-#include "sp_prim_setup.h"
-#include "sp_setup.h"
 #include "draw/draw_context.h"
 #include "draw/draw_vbuf.h"
 #include "util/u_memory.h"
+#include "util/u_prim.h"
 
 
 #define SP_MAX_VBUF_INDEXES 1024
@@ -58,6 +58,8 @@ struct softpipe_vbuf_render
 {
    struct vbuf_render base;
    struct softpipe_context *softpipe;
+   struct setup_context *setup;
+
    uint prim;
    uint vertex_size;
    uint nr_vertices;
@@ -74,6 +76,11 @@ softpipe_vbuf_render(struct vbuf_render *vbr)
 }
 
 
+
+
+
+
+
 static const struct vertex_info *
 sp_vbuf_get_vertex_info(struct vbuf_render *vbr)
 {
@@ -104,36 +111,6 @@ sp_vbuf_allocate_vertices(struct vbuf_render *vbr,
 static void
 sp_vbuf_release_vertices(struct vbuf_render *vbr)
 {
-#if 0
-   {
-      struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
-      const struct vertex_info *info = 
-         softpipe_get_vbuf_vertex_info(cvbr->softpipe);
-      const float *vtx = (const float *) cvbr->vertex_buffer;
-      uint i, j;
-      debug_printf("%s (vtx_size = %u,  vtx_used = %u)\n",
-             __FUNCTION__, cvbr->vertex_size, cvbr->nr_vertices);
-      for (i = 0; i < cvbr->nr_vertices; i++) {
-         for (j = 0; j < info->num_attribs; j++) {
-            uint k;
-            switch (info->attrib[j].emit) {
-            case EMIT_4F:  k = 4;   break;
-            case EMIT_3F:  k = 3;   break;
-            case EMIT_2F:  k = 2;   break;
-            case EMIT_1F:  k = 1;   break;
-            default: assert(0);
-            }
-            debug_printf("Vert %u attr %u: ", i, j);
-            while (k-- > 0) {
-               debug_printf("%g ", vtx[0]);
-               vtx++;
-            }
-            debug_printf("\n");
-         }
-      }
-   }
-#endif
-
    /* keep the old allocation for next time */
 }
 
@@ -159,14 +136,11 @@ static boolean
 sp_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim)
 {
    struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
-
-   /* XXX: break this dependency - make setup_context live under
-    * softpipe, rename the old "setup" draw stage to something else.
-    */
-   struct setup_context *setup_ctx = sp_draw_setup_context(cvbr->softpipe->setup);
+   struct setup_context *setup_ctx = cvbr->setup;
    
    setup_prepare( setup_ctx );
 
+   cvbr->softpipe->reduced_prim = u_reduced_prim(prim);
    cvbr->prim = prim;
    return TRUE;
 
@@ -191,14 +165,9 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
    struct softpipe_context *softpipe = cvbr->softpipe;
    const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float);
    const void *vertex_buffer = cvbr->vertex_buffer;
+   struct setup_context *setup_ctx = cvbr->setup;
    unsigned i;
 
-   /* XXX: break this dependency - make setup_context live under
-    * softpipe, rename the old "setup" draw stage to something else.
-    */
-   struct draw_stage *setup = softpipe->setup;
-   struct setup_context *setup_ctx = sp_draw_setup_context(setup);
-
    switch (cvbr->prim) {
    case PIPE_PRIM_POINTS:
       for (i = 0; i < nr; i++) {
@@ -237,14 +206,16 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
       break;
 
    case PIPE_PRIM_TRIANGLES:
-      for (i = 2; i < nr; i += 3) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i += 3) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-1], stride),
                        get_vert(vertex_buffer, indices[i-0], stride),
                        get_vert(vertex_buffer, indices[i-2], stride) );
          }
-         else {
+      }
+      else {
+         for (i = 2; i < nr; i += 3) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-2], stride),
                        get_vert(vertex_buffer, indices[i-1], stride),
@@ -254,14 +225,16 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
       break;
 
    case PIPE_PRIM_TRIANGLE_STRIP:
-      for (i = 2; i < nr; i += 1) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i += 1) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i+(i&1)-1], stride),
                        get_vert(vertex_buffer, indices[i-(i&1)], stride),
                        get_vert(vertex_buffer, indices[i-2], stride) );
          }
-         else {
+      }
+      else {
+         for (i = 2; i < nr; i += 1) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i+(i&1)-2], stride),
                        get_vert(vertex_buffer, indices[i-(i&1)-1], stride),
@@ -271,14 +244,16 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
       break;
 
    case PIPE_PRIM_TRIANGLE_FAN:
-      for (i = 2; i < nr; i += 1) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i += 1) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-0], stride),
                        get_vert(vertex_buffer, indices[0], stride),
                        get_vert(vertex_buffer, indices[i-1], stride) );
          }
-         else {
+      }
+      else {
+         for (i = 2; i < nr; i += 1) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[0], stride),
                        get_vert(vertex_buffer, indices[i-1], stride),
@@ -288,8 +263,8 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
       break;
 
    case PIPE_PRIM_QUADS:
-      for (i = 3; i < nr; i += 4) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 3; i < nr; i += 4) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-2], stride),
                        get_vert(vertex_buffer, indices[i-1], stride),
@@ -299,7 +274,9 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
                        get_vert(vertex_buffer, indices[i-0], stride),
                        get_vert(vertex_buffer, indices[i-3], stride) );
          }
-         else {
+      }
+      else {
+         for (i = 3; i < nr; i += 4) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-3], stride),
                        get_vert(vertex_buffer, indices[i-2], stride),
@@ -314,8 +291,8 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
       break;
 
    case PIPE_PRIM_QUAD_STRIP:
-      for (i = 3; i < nr; i += 2) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 3; i < nr; i += 2) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-0], stride),
                        get_vert(vertex_buffer, indices[i-1], stride),
@@ -325,7 +302,9 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
                        get_vert(vertex_buffer, indices[i-0], stride),
                        get_vert(vertex_buffer, indices[i-3], stride) );
          }
-         else {
+      }
+      else {
+         for (i = 3; i < nr; i += 2) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-3], stride),
                        get_vert(vertex_buffer, indices[i-2], stride),
@@ -355,11 +334,6 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
    default:
       assert(0);
    }
-
-   /* XXX: why are we calling this???  If we had to call something, it
-    * would be a function in sp_setup.c:
-    */
-   sp_draw_flush( setup );
 }
 
 
@@ -372,17 +346,12 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
 {
    struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
    struct softpipe_context *softpipe = cvbr->softpipe;
+   struct setup_context *setup_ctx = cvbr->setup;
    const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float);
    const void *vertex_buffer =
       (void *) get_vert(cvbr->vertex_buffer, start, stride);
    unsigned i;
 
-   /* XXX: break this dependency - make setup_context live under
-    * softpipe, rename the old "setup" draw stage to something else.
-    */
-   struct draw_stage *setup = softpipe->setup;
-   struct setup_context *setup_ctx = sp_draw_setup_context(setup);
-
    switch (cvbr->prim) {
    case PIPE_PRIM_POINTS:
       for (i = 0; i < nr; i++) {
@@ -421,14 +390,16 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
       break;
 
    case PIPE_PRIM_TRIANGLES:
-      for (i = 2; i < nr; i += 3) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i += 3) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-1, stride),
                        get_vert(vertex_buffer, i-0, stride),
                        get_vert(vertex_buffer, i-2, stride) );
          }
-         else {
+      }
+      else {
+         for (i = 2; i < nr; i += 3) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-2, stride),
                        get_vert(vertex_buffer, i-1, stride),
@@ -438,14 +409,16 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
       break;
 
    case PIPE_PRIM_TRIANGLE_STRIP:
-      for (i = 2; i < nr; i++) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i++) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i+(i&1)-1, stride),
                        get_vert(vertex_buffer, i-(i&1), stride),
                        get_vert(vertex_buffer, i-2, stride) );
          }
-         else {
+      }
+      else {
+         for (i = 2; i < nr; i++) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i+(i&1)-2, stride),
                        get_vert(vertex_buffer, i-(i&1)-1, stride),
@@ -455,14 +428,16 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
       break;
 
    case PIPE_PRIM_TRIANGLE_FAN:
-      for (i = 2; i < nr; i += 1) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i += 1) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-0, stride),
                        get_vert(vertex_buffer, 0, stride),
                        get_vert(vertex_buffer, i-1, stride) );
          }
-         else {
+      }
+      else {
+         for (i = 2; i < nr; i += 1) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, 0, stride),
                        get_vert(vertex_buffer, i-1, stride),
@@ -472,8 +447,8 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
       break;
 
    case PIPE_PRIM_QUADS:
-      for (i = 3; i < nr; i += 4) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 3; i < nr; i += 4) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-2, stride),
                        get_vert(vertex_buffer, i-1, stride),
@@ -483,7 +458,9 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
                        get_vert(vertex_buffer, i-0, stride),
                        get_vert(vertex_buffer, i-3, stride) );
          }
-         else {
+      }
+      else {
+         for (i = 3; i < nr; i += 4) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-3, stride),
                        get_vert(vertex_buffer, i-2, stride),
@@ -497,8 +474,8 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
       break;
 
    case PIPE_PRIM_QUAD_STRIP:
-      for (i = 3; i < nr; i += 2) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 3; i < nr; i += 2) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-0, stride),
                        get_vert(vertex_buffer, i-1, stride),
@@ -508,7 +485,9 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
                        get_vert(vertex_buffer, i-0, stride),
                        get_vert(vertex_buffer, i-3, stride) );
          }
-         else {
+      }
+      else {
+         for (i = 3; i < nr; i += 2) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-3, stride),
                        get_vert(vertex_buffer, i-2, stride),
@@ -546,40 +525,38 @@ static void
 sp_vbuf_destroy(struct vbuf_render *vbr)
 {
    struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
-   cvbr->softpipe->vbuf_render = NULL;
+   setup_destroy_context(cvbr->setup);
    FREE(cvbr);
 }
 
 
 /**
- * Initialize the post-transform vertex buffer information for the given
- * context.
+ * Create the post-transform vertex handler for the given context.
  */
-void
-sp_init_vbuf(struct softpipe_context *sp)
+struct vbuf_render *
+sp_create_vbuf_backend(struct softpipe_context *sp)
 {
-   assert(sp->draw);
+   struct softpipe_vbuf_render *cvbr = CALLOC_STRUCT(softpipe_vbuf_render);
 
-   sp->vbuf_render = CALLOC_STRUCT(softpipe_vbuf_render);
+   assert(sp->draw);
 
-   sp->vbuf_render->base.max_indices = SP_MAX_VBUF_INDEXES;
-   sp->vbuf_render->base.max_vertex_buffer_bytes = SP_MAX_VBUF_SIZE;
 
-   sp->vbuf_render->base.get_vertex_info = sp_vbuf_get_vertex_info;
-   sp->vbuf_render->base.allocate_vertices = sp_vbuf_allocate_vertices;
-   sp->vbuf_render->base.map_vertices = sp_vbuf_map_vertices;
-   sp->vbuf_render->base.unmap_vertices = sp_vbuf_unmap_vertices;
-   sp->vbuf_render->base.set_primitive = sp_vbuf_set_primitive;
-   sp->vbuf_render->base.draw = sp_vbuf_draw;
-   sp->vbuf_render->base.draw_arrays = sp_vbuf_draw_arrays;
-   sp->vbuf_render->base.release_vertices = sp_vbuf_release_vertices;
-   sp->vbuf_render->base.destroy = sp_vbuf_destroy;
+   cvbr->base.max_indices = SP_MAX_VBUF_INDEXES;
+   cvbr->base.max_vertex_buffer_bytes = SP_MAX_VBUF_SIZE;
 
-   sp->vbuf_render->softpipe = sp;
+   cvbr->base.get_vertex_info = sp_vbuf_get_vertex_info;
+   cvbr->base.allocate_vertices = sp_vbuf_allocate_vertices;
+   cvbr->base.map_vertices = sp_vbuf_map_vertices;
+   cvbr->base.unmap_vertices = sp_vbuf_unmap_vertices;
+   cvbr->base.set_primitive = sp_vbuf_set_primitive;
+   cvbr->base.draw = sp_vbuf_draw;
+   cvbr->base.draw_arrays = sp_vbuf_draw_arrays;
+   cvbr->base.release_vertices = sp_vbuf_release_vertices;
+   cvbr->base.destroy = sp_vbuf_destroy;
 
-   sp->vbuf = draw_vbuf_stage(sp->draw, &sp->vbuf_render->base);
+   cvbr->softpipe = sp;
 
-   draw_set_rasterize_stage(sp->draw, sp->vbuf);
+   cvbr->setup = setup_create_context(cvbr->softpipe);
 
-   draw_set_render(sp->draw, &sp->vbuf_render->base);
+   return &cvbr->base;
 }
diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.h b/src/gallium/drivers/softpipe/sp_prim_vbuf.h
index 1de9cc2a89..ad01cc2f28 100644
--- a/src/gallium/drivers/softpipe/sp_prim_vbuf.h
+++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.h
@@ -31,8 +31,8 @@
 
 struct softpipe_context;
 
-extern void
-sp_init_vbuf(struct softpipe_context *softpipe);
+extern struct vbuf_render *
+sp_create_vbuf_backend(struct softpipe_context *softpipe);
 
 
 #endif /* SP_VBUF_H */
diff --git a/src/gallium/drivers/softpipe/sp_quad.h b/src/gallium/drivers/softpipe/sp_quad.h
index bd6c6cb912..a3236bd116 100644
--- a/src/gallium/drivers/softpipe/sp_quad.h
+++ b/src/gallium/drivers/softpipe/sp_quad.h
@@ -97,10 +97,10 @@ struct quad_header {
    struct quad_header_inout inout;
    struct quad_header_output output;
 
-   const struct tgsi_interp_coef *coef;
+   /* Redundant/duplicated:
+    */
    const struct tgsi_interp_coef *posCoef;
-
-   unsigned nr_attrs;
+   const struct tgsi_interp_coef *coef;
 };
 
 #endif /* SP_QUAD_H */
diff --git a/src/gallium/drivers/softpipe/sp_quad_alpha_test.c b/src/gallium/drivers/softpipe/sp_quad_alpha_test.c
deleted file mode 100644
index 0845bae0e6..0000000000
--- a/src/gallium/drivers/softpipe/sp_quad_alpha_test.c
+++ /dev/null
@@ -1,108 +0,0 @@
-
-/**
- * quad alpha test
- */
-
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_quad_pipe.h"
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-
-
-static void
-alpha_test_quad(struct quad_stage *qs, struct quad_header *quad)
-{
-   struct softpipe_context *softpipe = qs->softpipe;
-   const float ref = softpipe->depth_stencil->alpha.ref_value;
-   unsigned passMask = 0x0, j;
-   const uint cbuf = 0; /* only output[0].alpha is tested */
-   const float *aaaa = quad->output.color[cbuf][3];
-
-   switch (softpipe->depth_stencil->alpha.func) {
-   case PIPE_FUNC_NEVER:
-      break;
-   case PIPE_FUNC_LESS:
-      /*
-       * If mask were an array [4] we could do this SIMD-style:
-       * passMask = (quad->outputs.color[0][3] <= vec4(ref));
-       */
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] < ref) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_EQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] == ref) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_LEQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] <= ref) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_GREATER:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] > ref) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_NOTEQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] != ref) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_GEQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] >= ref) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_ALWAYS:
-      passMask = MASK_ALL;
-      break;
-   default:
-      assert(0);
-   }
-
-   quad->inout.mask &= passMask;
-
-   if (quad->inout.mask)
-      qs->next->run(qs->next, quad);
-}
-
-
-static void alpha_test_begin(struct quad_stage *qs)
-{
-   qs->next->begin(qs->next);
-}
-
-
-static void alpha_test_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
-
-struct quad_stage *
-sp_quad_alpha_test_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
-
-   stage->softpipe = softpipe;
-   stage->begin = alpha_test_begin;
-   stage->run = alpha_test_quad;
-   stage->destroy = alpha_test_destroy;
-
-   return stage;
-}
diff --git a/src/gallium/drivers/softpipe/sp_quad_blend.c b/src/gallium/drivers/softpipe/sp_quad_blend.c
index b1e18805c7..e243c63fa2 100644
--- a/src/gallium/drivers/softpipe/sp_quad_blend.c
+++ b/src/gallium/drivers/softpipe/sp_quad_blend.c
@@ -117,644 +117,865 @@ do { \
 
 
 static void
-logicop_quad(struct quad_stage *qs, struct quad_header *quad)
+logicop_quad(struct quad_stage *qs, 
+             float (*quadColor)[4],
+             float (*dest)[4])
 {
    struct softpipe_context *softpipe = qs->softpipe;
-   uint cbuf;
+   ubyte src[4][4], dst[4][4], res[4][4];
+   uint *src4 = (uint *) src;
+   uint *dst4 = (uint *) dst;
+   uint *res4 = (uint *) res;
+   uint j;
+
+
+   /* convert to ubyte */
+   for (j = 0; j < 4; j++) { /* loop over R,G,B,A channels */
+      dst[j][0] = float_to_ubyte(dest[j][0]); /* P0 */
+      dst[j][1] = float_to_ubyte(dest[j][1]); /* P1 */
+      dst[j][2] = float_to_ubyte(dest[j][2]); /* P2 */
+      dst[j][3] = float_to_ubyte(dest[j][3]); /* P3 */
+
+      src[j][0] = float_to_ubyte(quadColor[j][0]); /* P0 */
+      src[j][1] = float_to_ubyte(quadColor[j][1]); /* P1 */
+      src[j][2] = float_to_ubyte(quadColor[j][2]); /* P2 */
+      src[j][3] = float_to_ubyte(quadColor[j][3]); /* P3 */
+   }
 
-   /* loop over colorbuffer outputs */
-   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) {
-      float dest[4][QUAD_SIZE];
-      ubyte src[4][4], dst[4][4], res[4][4];
-      uint *src4 = (uint *) src;
-      uint *dst4 = (uint *) dst;
-      uint *res4 = (uint *) res;
-      struct softpipe_cached_tile *
-         tile = sp_get_cached_tile(softpipe,
-                                   softpipe->cbuf_cache[cbuf],
-                                   quad->input.x0, quad->input.y0);
-      float (*quadColor)[4] = quad->output.color[cbuf];
-      uint i, j;
+   switch (softpipe->blend->logicop_func) {
+   case PIPE_LOGICOP_CLEAR:
+      for (j = 0; j < 4; j++)
+         res4[j] = 0;
+      break;
+   case PIPE_LOGICOP_NOR:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~(src4[j] | dst4[j]);
+      break;
+   case PIPE_LOGICOP_AND_INVERTED:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~src4[j] & dst4[j];
+      break;
+   case PIPE_LOGICOP_COPY_INVERTED:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~src4[j];
+      break;
+   case PIPE_LOGICOP_AND_REVERSE:
+      for (j = 0; j < 4; j++)
+         res4[j] = src4[j] & ~dst4[j];
+      break;
+   case PIPE_LOGICOP_INVERT:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~dst4[j];
+      break;
+   case PIPE_LOGICOP_XOR:
+      for (j = 0; j < 4; j++)
+         res4[j] = dst4[j] ^ src4[j];
+      break;
+   case PIPE_LOGICOP_NAND:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~(src4[j] & dst4[j]);
+      break;
+   case PIPE_LOGICOP_AND:
+      for (j = 0; j < 4; j++)
+         res4[j] = src4[j] & dst4[j];
+      break;
+   case PIPE_LOGICOP_EQUIV:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~(src4[j] ^ dst4[j]);
+      break;
+   case PIPE_LOGICOP_NOOP:
+      for (j = 0; j < 4; j++)
+         res4[j] = dst4[j];
+      break;
+   case PIPE_LOGICOP_OR_INVERTED:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~src4[j] | dst4[j];
+      break;
+   case PIPE_LOGICOP_COPY:
+      for (j = 0; j < 4; j++)
+         res4[j] = src4[j];
+      break;
+   case PIPE_LOGICOP_OR_REVERSE:
+      for (j = 0; j < 4; j++)
+         res4[j] = src4[j] | ~dst4[j];
+      break;
+   case PIPE_LOGICOP_OR:
+      for (j = 0; j < 4; j++)
+         res4[j] = src4[j] | dst4[j];
+      break;
+   case PIPE_LOGICOP_SET:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~0;
+      break;
+   default:
+      assert(0);
+   }
 
-      /* get/swizzle dest colors */
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = (quad->input.x0 & (TILE_SIZE-1)) + (j & 1);
-         int y = (quad->input.y0 & (TILE_SIZE-1)) + (j >> 1);
-         for (i = 0; i < 4; i++) {
-            dest[i][j] = tile->data.color[y][x][i];
-         }
-      }
+   for (j = 0; j < 4; j++) {
+      quadColor[j][0] = ubyte_to_float(res[j][0]);
+      quadColor[j][1] = ubyte_to_float(res[j][1]);
+      quadColor[j][2] = ubyte_to_float(res[j][2]);
+      quadColor[j][3] = ubyte_to_float(res[j][3]);
+   }
+}
 
-      /* convert to ubyte */
-      for (j = 0; j < 4; j++) { /* loop over R,G,B,A channels */
-         dst[j][0] = float_to_ubyte(dest[j][0]); /* P0 */
-         dst[j][1] = float_to_ubyte(dest[j][1]); /* P1 */
-         dst[j][2] = float_to_ubyte(dest[j][2]); /* P2 */
-         dst[j][3] = float_to_ubyte(dest[j][3]); /* P3 */
-
-         src[j][0] = float_to_ubyte(quadColor[j][0]); /* P0 */
-         src[j][1] = float_to_ubyte(quadColor[j][1]); /* P1 */
-         src[j][2] = float_to_ubyte(quadColor[j][2]); /* P2 */
-         src[j][3] = float_to_ubyte(quadColor[j][3]); /* P3 */
-      }
 
-      switch (softpipe->blend->logicop_func) {
-      case PIPE_LOGICOP_CLEAR:
-         for (j = 0; j < 4; j++)
-            res4[j] = 0;
-         break;
-      case PIPE_LOGICOP_NOR:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~(src4[j] | dst4[j]);
-         break;
-      case PIPE_LOGICOP_AND_INVERTED:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~src4[j] & dst4[j];
-         break;
-      case PIPE_LOGICOP_COPY_INVERTED:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~src4[j];
-         break;
-      case PIPE_LOGICOP_AND_REVERSE:
-         for (j = 0; j < 4; j++)
-            res4[j] = src4[j] & ~dst4[j];
-         break;
-      case PIPE_LOGICOP_INVERT:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~dst4[j];
-         break;
-      case PIPE_LOGICOP_XOR:
-         for (j = 0; j < 4; j++)
-            res4[j] = dst4[j] ^ src4[j];
-         break;
-      case PIPE_LOGICOP_NAND:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~(src4[j] & dst4[j]);
-         break;
-      case PIPE_LOGICOP_AND:
-         for (j = 0; j < 4; j++)
-            res4[j] = src4[j] & dst4[j];
-         break;
-      case PIPE_LOGICOP_EQUIV:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~(src4[j] ^ dst4[j]);
-         break;
-      case PIPE_LOGICOP_NOOP:
-         for (j = 0; j < 4; j++)
-            res4[j] = dst4[j];
-         break;
-      case PIPE_LOGICOP_OR_INVERTED:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~src4[j] | dst4[j];
-         break;
-      case PIPE_LOGICOP_COPY:
-         for (j = 0; j < 4; j++)
-            res4[j] = src4[j];
-         break;
-      case PIPE_LOGICOP_OR_REVERSE:
-         for (j = 0; j < 4; j++)
-            res4[j] = src4[j] | ~dst4[j];
-         break;
-      case PIPE_LOGICOP_OR:
-         for (j = 0; j < 4; j++)
-            res4[j] = src4[j] | dst4[j];
-         break;
-      case PIPE_LOGICOP_SET:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~0;
-         break;
-      default:
-         assert(0);
-      }
 
-      for (j = 0; j < 4; j++) {
-         quadColor[j][0] = ubyte_to_float(res[j][0]);
-         quadColor[j][1] = ubyte_to_float(res[j][1]);
-         quadColor[j][2] = ubyte_to_float(res[j][2]);
-         quadColor[j][3] = ubyte_to_float(res[j][3]);
-      }
+static void
+blend_quad(struct quad_stage *qs, 
+           float (*quadColor)[4],
+           float (*dest)[4])
+{
+   static const float zero[4] = { 0, 0, 0, 0 };
+   static const float one[4] = { 1, 1, 1, 1 };
+   struct softpipe_context *softpipe = qs->softpipe;
+   float source[4][QUAD_SIZE];
+
+   /*
+    * Compute src/first term RGB
+    */
+   switch (softpipe->blend->rgb_src_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      VEC4_COPY(source[0], quadColor[0]); /* R */
+      VEC4_COPY(source[1], quadColor[1]); /* G */
+      VEC4_COPY(source[2], quadColor[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      VEC4_MUL(source[0], quadColor[0], quadColor[0]); /* R */
+      VEC4_MUL(source[1], quadColor[1], quadColor[1]); /* G */
+      VEC4_MUL(source[2], quadColor[2], quadColor[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+   {
+      const float *alpha = quadColor[3];
+      VEC4_MUL(source[0], quadColor[0], alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      VEC4_MUL(source[0], quadColor[0], dest[0]); /* R */
+      VEC4_MUL(source[1], quadColor[1], dest[1]); /* G */
+      VEC4_MUL(source[2], quadColor[2], dest[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+   {
+      const float *alpha = dest[3];
+      VEC4_MUL(source[0], quadColor[0], alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+   {
+      const float *alpha = quadColor[3];
+      float diff[4], temp[4];
+      VEC4_SUB(diff, one, dest[3]);
+      VEC4_MIN(temp, alpha, diff);
+      VEC4_MUL(source[0], quadColor[0], temp); /* R */
+      VEC4_MUL(source[1], quadColor[1], temp); /* G */
+      VEC4_MUL(source[2], quadColor[2], temp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+   {
+      float comp[4];
+      VEC4_SCALAR(comp, softpipe->blend_color.color[0]); /* R */
+      VEC4_MUL(source[0], quadColor[0], comp); /* R */
+      VEC4_SCALAR(comp, softpipe->blend_color.color[1]); /* G */
+      VEC4_MUL(source[1], quadColor[1], comp); /* G */
+      VEC4_SCALAR(comp, softpipe->blend_color.color[2]); /* B */
+      VEC4_MUL(source[2], quadColor[2], comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+   {
+      float alpha[4];
+      VEC4_SCALAR(alpha, softpipe->blend_color.color[3]);
+      VEC4_MUL(source[0], quadColor[0], alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+      assert(0); /* to do */
+      break;
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+      assert(0); /* to do */
+      break;
+   case PIPE_BLENDFACTOR_ZERO:
+      VEC4_COPY(source[0], zero); /* R */
+      VEC4_COPY(source[1], zero); /* G */
+      VEC4_COPY(source[2], zero); /* B */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, quadColor[0]); /* R */
+      VEC4_MUL(source[0], quadColor[0], inv_comp); /* R */
+      VEC4_SUB(inv_comp, one, quadColor[1]); /* G */
+      VEC4_MUL(source[1], quadColor[1], inv_comp); /* G */
+      VEC4_SUB(inv_comp, one, quadColor[2]); /* B */
+      VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+   {
+      float inv_alpha[4];
+      VEC4_SUB(inv_alpha, one, quadColor[3]);
+      VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+   {
+      float inv_alpha[4];
+      VEC4_SUB(inv_alpha, one, dest[3]);
+      VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, dest[0]); /* R */
+      VEC4_MUL(source[0], quadColor[0], inv_comp); /* R */
+      VEC4_SUB(inv_comp, one, dest[1]); /* G */
+      VEC4_MUL(source[1], quadColor[1], inv_comp); /* G */
+      VEC4_SUB(inv_comp, one, dest[2]); /* B */
+      VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+   {
+      float inv_comp[4];
+      /* R */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[0]);
+      VEC4_MUL(source[0], quadColor[0], inv_comp);
+      /* G */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[1]);
+      VEC4_MUL(source[1], quadColor[1], inv_comp);
+      /* B */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[2]);
+      VEC4_MUL(source[2], quadColor[2], inv_comp);
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+   {
+      float inv_alpha[4];
+      VEC4_SCALAR(inv_alpha, 1.0f - softpipe->blend_color.color[3]);
+      VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+      assert(0); /* to do */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+      assert(0); /* to do */
+      break;
+   default:
+      assert(0);
+   }
+
+   /*
+    * Compute src/first term A
+    */
+   switch (softpipe->blend->alpha_src_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      VEC4_COPY(source[3], quadColor[3]); /* A */
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+   {
+      const float *alpha = quadColor[3];
+      VEC4_MUL(source[3], quadColor[3], alpha); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      VEC4_MUL(source[3], quadColor[3], dest[3]); /* A */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      /* multiply alpha by 1.0 */
+      VEC4_COPY(source[3], quadColor[3]); /* A */
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+   {
+      float comp[4];
+      VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
+      VEC4_MUL(source[3], quadColor[3], comp); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_ZERO:
+      VEC4_COPY(source[3], zero); /* A */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+   {
+      float inv_alpha[4];
+      VEC4_SUB(inv_alpha, one, quadColor[3]);
+      VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+   {
+      float inv_alpha[4];
+      VEC4_SUB(inv_alpha, one, dest[3]);
+      VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+   {
+      float inv_comp[4];
+      /* A */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
+      VEC4_MUL(source[3], quadColor[3], inv_comp);
+   }
+   break;
+   default:
+      assert(0);
    }
 
-   /* pass quad to next stage */
-   qs->next->run(qs->next, quad);
-}
 
+   /*
+    * Compute dest/second term RGB
+    */
+   switch (softpipe->blend->rgb_dst_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      /* dest = dest * 1   NO-OP, leave dest as-is */
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      VEC4_MUL(dest[0], dest[0], quadColor[0]); /* R */
+      VEC4_MUL(dest[1], dest[1], quadColor[1]); /* G */
+      VEC4_MUL(dest[2], dest[2], quadColor[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      VEC4_MUL(dest[0], dest[0], quadColor[3]); /* R * A */
+      VEC4_MUL(dest[1], dest[1], quadColor[3]); /* G * A */
+      VEC4_MUL(dest[2], dest[2], quadColor[3]); /* B * A */
+      break;
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      VEC4_MUL(dest[0], dest[0], dest[3]); /* R * A */
+      VEC4_MUL(dest[1], dest[1], dest[3]); /* G * A */
+      VEC4_MUL(dest[2], dest[2], dest[3]); /* B * A */
+      break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      VEC4_MUL(dest[0], dest[0], dest[0]); /* R */
+      VEC4_MUL(dest[1], dest[1], dest[1]); /* G */
+      VEC4_MUL(dest[2], dest[2], dest[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      assert(0); /* illegal */
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+   {
+      float comp[4];
+      VEC4_SCALAR(comp, softpipe->blend_color.color[0]); /* R */
+      VEC4_MUL(dest[0], dest[0], comp); /* R */
+      VEC4_SCALAR(comp, softpipe->blend_color.color[1]); /* G */
+      VEC4_MUL(dest[1], dest[1], comp); /* G */
+      VEC4_SCALAR(comp, softpipe->blend_color.color[2]); /* B */
+      VEC4_MUL(dest[2], dest[2], comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+   {
+      float comp[4];
+      VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
+      VEC4_MUL(dest[0], dest[0], comp); /* R */
+      VEC4_MUL(dest[1], dest[1], comp); /* G */
+      VEC4_MUL(dest[2], dest[2], comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_ZERO:
+      VEC4_COPY(dest[0], zero); /* R */
+      VEC4_COPY(dest[1], zero); /* G */
+      VEC4_COPY(dest[2], zero); /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+      /* XXX what are these? */
+      assert(0);
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, quadColor[0]); /* R */
+      VEC4_MUL(dest[0], inv_comp, dest[0]); /* R */
+      VEC4_SUB(inv_comp, one, quadColor[1]); /* G */
+      VEC4_MUL(dest[1], inv_comp, dest[1]); /* G */
+      VEC4_SUB(inv_comp, one, quadColor[2]); /* B */
+      VEC4_MUL(dest[2], inv_comp, dest[2]); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+   {
+      float one_minus_alpha[QUAD_SIZE];
+      VEC4_SUB(one_minus_alpha, one, quadColor[3]);
+      VEC4_MUL(dest[0], dest[0], one_minus_alpha); /* R */
+      VEC4_MUL(dest[1], dest[1], one_minus_alpha); /* G */
+      VEC4_MUL(dest[2], dest[2], one_minus_alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, dest[3]); /* A */
+      VEC4_MUL(dest[0], inv_comp, dest[0]); /* R */
+      VEC4_MUL(dest[1], inv_comp, dest[1]); /* G */
+      VEC4_MUL(dest[2], inv_comp, dest[2]); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, dest[0]); /* R */
+      VEC4_MUL(dest[0], dest[0], inv_comp); /* R */
+      VEC4_SUB(inv_comp, one, dest[1]); /* G */
+      VEC4_MUL(dest[1], dest[1], inv_comp); /* G */
+      VEC4_SUB(inv_comp, one, dest[2]); /* B */
+      VEC4_MUL(dest[2], dest[2], inv_comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+   {
+      float inv_comp[4];
+      /* R */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[0]);
+      VEC4_MUL(dest[0], dest[0], inv_comp);
+      /* G */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[1]);
+      VEC4_MUL(dest[1], dest[1], inv_comp);
+      /* B */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[2]);
+      VEC4_MUL(dest[2], dest[2], inv_comp);
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+   {
+      float inv_comp[4];
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
+      VEC4_MUL(dest[0], dest[0], inv_comp);
+      VEC4_MUL(dest[1], dest[1], inv_comp);
+      VEC4_MUL(dest[2], dest[2], inv_comp);
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+      /* XXX what are these? */
+      assert(0);
+      break;
+   default:
+      assert(0);
+   }
 
+   /*
+    * Compute dest/second term A
+    */
+   switch (softpipe->blend->alpha_dst_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      /* dest = dest * 1   NO-OP, leave dest as-is */
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      VEC4_MUL(dest[3], dest[3], quadColor[3]); /* A * A */
+      break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      VEC4_MUL(dest[3], dest[3], dest[3]); /* A */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      assert(0); /* illegal */
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+   {
+      float comp[4];
+      VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
+      VEC4_MUL(dest[3], dest[3], comp); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_ZERO:
+      VEC4_COPY(dest[3], zero); /* A */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+   {
+      float one_minus_alpha[QUAD_SIZE];
+      VEC4_SUB(one_minus_alpha, one, quadColor[3]);
+      VEC4_MUL(dest[3], dest[3], one_minus_alpha); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, dest[3]); /* A */
+      VEC4_MUL(dest[3], inv_comp, dest[3]); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+   {
+      float inv_comp[4];
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
+      VEC4_MUL(dest[3], dest[3], inv_comp);
+   }
+   break;
+   default:
+      assert(0);
+   }
 
+   /*
+    * Combine RGB terms
+    */
+   switch (softpipe->blend->rgb_func) {
+   case PIPE_BLEND_ADD:
+      VEC4_ADD_SAT(quadColor[0], source[0], dest[0]); /* R */
+      VEC4_ADD_SAT(quadColor[1], source[1], dest[1]); /* G */
+      VEC4_ADD_SAT(quadColor[2], source[2], dest[2]); /* B */
+      break;
+   case PIPE_BLEND_SUBTRACT:
+      VEC4_SUB_SAT(quadColor[0], source[0], dest[0]); /* R */
+      VEC4_SUB_SAT(quadColor[1], source[1], dest[1]); /* G */
+      VEC4_SUB_SAT(quadColor[2], source[2], dest[2]); /* B */
+      break;
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      VEC4_SUB_SAT(quadColor[0], dest[0], source[0]); /* R */
+      VEC4_SUB_SAT(quadColor[1], dest[1], source[1]); /* G */
+      VEC4_SUB_SAT(quadColor[2], dest[2], source[2]); /* B */
+      break;
+   case PIPE_BLEND_MIN:
+      VEC4_MIN(quadColor[0], source[0], dest[0]); /* R */
+      VEC4_MIN(quadColor[1], source[1], dest[1]); /* G */
+      VEC4_MIN(quadColor[2], source[2], dest[2]); /* B */
+      break;
+   case PIPE_BLEND_MAX:
+      VEC4_MAX(quadColor[0], source[0], dest[0]); /* R */
+      VEC4_MAX(quadColor[1], source[1], dest[1]); /* G */
+      VEC4_MAX(quadColor[2], source[2], dest[2]); /* B */
+      break;
+   default:
+      assert(0);
+   }
+
+   /*
+    * Combine A terms
+    */
+   switch (softpipe->blend->alpha_func) {
+   case PIPE_BLEND_ADD:
+      VEC4_ADD_SAT(quadColor[3], source[3], dest[3]); /* A */
+      break;
+   case PIPE_BLEND_SUBTRACT:
+      VEC4_SUB_SAT(quadColor[3], source[3], dest[3]); /* A */
+      break;
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      VEC4_SUB_SAT(quadColor[3], dest[3], source[3]); /* A */
+      break;
+   case PIPE_BLEND_MIN:
+      VEC4_MIN(quadColor[3], source[3], dest[3]); /* A */
+      break;
+   case PIPE_BLEND_MAX:
+      VEC4_MAX(quadColor[3], source[3], dest[3]); /* A */
+      break;
+   default:
+      assert(0);
+   }
+}
 
 static void
-blend_quad(struct quad_stage *qs, struct quad_header *quad)
+colormask_quad(struct quad_stage *qs,
+               float (*quadColor)[4],
+               float (*dest)[4])
 {
-   static const float zero[4] = { 0, 0, 0, 0 };
-   static const float one[4] = { 1, 1, 1, 1 };
+   struct softpipe_context *softpipe = qs->softpipe;
 
+   /* R */
+   if (!(softpipe->blend->colormask & PIPE_MASK_R))
+      COPY_4V(quadColor[0], dest[0]);
+
+   /* G */
+   if (!(softpipe->blend->colormask & PIPE_MASK_G))
+      COPY_4V(quadColor[1], dest[1]);
+
+   /* B */
+   if (!(softpipe->blend->colormask & PIPE_MASK_B))
+      COPY_4V(quadColor[2], dest[2]);
+
+   /* A */
+   if (!(softpipe->blend->colormask & PIPE_MASK_A))
+      COPY_4V(quadColor[3], dest[3]);
+}
+
+
+static void
+blend_fallback(struct quad_stage *qs, 
+               struct quad_header *quads[],
+               unsigned nr)
+{
    struct softpipe_context *softpipe = qs->softpipe;
-   uint cbuf;
+   const struct pipe_blend_state *blend = softpipe->blend;
+   unsigned cbuf;
+
+   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) 
+   {
+      float dest[4][QUAD_SIZE];
+      struct softpipe_cached_tile *tile
+         = sp_get_cached_tile(softpipe->cbuf_cache[cbuf],
+                              quads[0]->input.x0, 
+                              quads[0]->input.y0);
+      uint q, i, j;
+
+      for (q = 0; q < nr; q++) {
+         struct quad_header *quad = quads[q];
+         float (*quadColor)[4] = quad->output.color[cbuf];
+         const int itx = (quad->input.x0 & (TILE_SIZE-1));
+         const int ity = (quad->input.y0 & (TILE_SIZE-1));
+
+         /* get/swizzle dest colors 
+          */
+         for (j = 0; j < QUAD_SIZE; j++) {
+            int x = itx + (j & 1);
+            int y = ity + (j >> 1);
+            for (i = 0; i < 4; i++) {
+               dest[i][j] = tile->data.color[y][x][i];
+            }
+         }
 
-   if (softpipe->blend->logicop_enable) {
-      logicop_quad(qs, quad);
-      return;
+
+         if (blend->logicop_enable) {
+            logicop_quad( qs, quadColor, dest );
+         }
+         else if (blend->blend_enable) {
+            blend_quad( qs, quadColor, dest );
+         }
+
+         if (blend->colormask != 0xf)
+            colormask_quad( qs, quadColor, dest );
+   
+         /* Output color values
+          */
+         for (j = 0; j < QUAD_SIZE; j++) {
+            if (quad->inout.mask & (1 << j)) {
+               int x = itx + (j & 1);
+               int y = ity + (j >> 1);
+               for (i = 0; i < 4; i++) { /* loop over color chans */
+                  tile->data.color[y][x][i] = quadColor[i][j];
+               }
+            }
+         }
+      }
    }
+}
 
-   /* loop over colorbuffer outputs */
-   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) {
-      float source[4][QUAD_SIZE], dest[4][QUAD_SIZE];
-      struct softpipe_cached_tile *tile
-         = sp_get_cached_tile(softpipe,
-                              softpipe->cbuf_cache[cbuf],
-                              quad->input.x0, quad->input.y0);
-      float (*quadColor)[4] = quad->output.color[cbuf];
-      uint i, j;
 
+static void
+blend_single_add_src_alpha_inv_src_alpha(struct quad_stage *qs, 
+                                         struct quad_header *quads[],
+                                         unsigned nr)
+{
+   static const float one[4] = { 1, 1, 1, 1 };
+   float one_minus_alpha[QUAD_SIZE];
+   float dest[4][QUAD_SIZE];
+   float source[4][QUAD_SIZE];
+   uint i, j, q;
+
+   struct softpipe_cached_tile *tile
+      = sp_get_cached_tile(qs->softpipe->cbuf_cache[0],
+                           quads[0]->input.x0, 
+                           quads[0]->input.y0);
+
+   for (q = 0; q < nr; q++) {
+      struct quad_header *quad = quads[q];
+      float (*quadColor)[4] = quad->output.color[0];
+      const float *alpha = quadColor[3];
+      const int itx = (quad->input.x0 & (TILE_SIZE-1));
+      const int ity = (quad->input.y0 & (TILE_SIZE-1));
+      
       /* get/swizzle dest colors */
       for (j = 0; j < QUAD_SIZE; j++) {
-         int x = (quad->input.x0 & (TILE_SIZE-1)) + (j & 1);
-         int y = (quad->input.y0 & (TILE_SIZE-1)) + (j >> 1);
+         int x = itx + (j & 1);
+         int y = ity + (j >> 1);
          for (i = 0; i < 4; i++) {
             dest[i][j] = tile->data.color[y][x][i];
          }
       }
 
-      /*
-       * Compute src/first term RGB
-       */
-      switch (softpipe->blend->rgb_src_factor) {
-      case PIPE_BLENDFACTOR_ONE:
-         VEC4_COPY(source[0], quadColor[0]); /* R */
-         VEC4_COPY(source[1], quadColor[1]); /* G */
-         VEC4_COPY(source[2], quadColor[2]); /* B */
-         break;
-      case PIPE_BLENDFACTOR_SRC_COLOR:
-         VEC4_MUL(source[0], quadColor[0], quadColor[0]); /* R */
-         VEC4_MUL(source[1], quadColor[1], quadColor[1]); /* G */
-         VEC4_MUL(source[2], quadColor[2], quadColor[2]); /* B */
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA:
-         {
-            const float *alpha = quadColor[3];
-            VEC4_MUL(source[0], quadColor[0], alpha); /* R */
-            VEC4_MUL(source[1], quadColor[1], alpha); /* G */
-            VEC4_MUL(source[2], quadColor[2], alpha); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_DST_COLOR:
-         VEC4_MUL(source[0], quadColor[0], dest[0]); /* R */
-         VEC4_MUL(source[1], quadColor[1], dest[1]); /* G */
-         VEC4_MUL(source[2], quadColor[2], dest[2]); /* B */
-         break;
-      case PIPE_BLENDFACTOR_DST_ALPHA:
-         {
-            const float *alpha = dest[3];
-            VEC4_MUL(source[0], quadColor[0], alpha); /* R */
-            VEC4_MUL(source[1], quadColor[1], alpha); /* G */
-            VEC4_MUL(source[2], quadColor[2], alpha); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-         {
-            const float *alpha = quadColor[3];
-            float diff[4], temp[4];
-            VEC4_SUB(diff, one, dest[3]);
-            VEC4_MIN(temp, alpha, diff);
-            VEC4_MUL(source[0], quadColor[0], temp); /* R */
-            VEC4_MUL(source[1], quadColor[1], temp); /* G */
-            VEC4_MUL(source[2], quadColor[2], temp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_CONST_COLOR:
-         {
-            float comp[4];
-            VEC4_SCALAR(comp, softpipe->blend_color.color[0]); /* R */
-            VEC4_MUL(source[0], quadColor[0], comp); /* R */
-            VEC4_SCALAR(comp, softpipe->blend_color.color[1]); /* G */
-            VEC4_MUL(source[1], quadColor[1], comp); /* G */
-            VEC4_SCALAR(comp, softpipe->blend_color.color[2]); /* B */
-            VEC4_MUL(source[2], quadColor[2], comp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_CONST_ALPHA:
-         {
-            float alpha[4];
-            VEC4_SCALAR(alpha, softpipe->blend_color.color[3]);
-            VEC4_MUL(source[0], quadColor[0], alpha); /* R */
-            VEC4_MUL(source[1], quadColor[1], alpha); /* G */
-            VEC4_MUL(source[2], quadColor[2], alpha); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_SRC1_COLOR:
-         assert(0); /* to do */
-         break;
-      case PIPE_BLENDFACTOR_SRC1_ALPHA:
-         assert(0); /* to do */
-         break;
-      case PIPE_BLENDFACTOR_ZERO:
-         VEC4_COPY(source[0], zero); /* R */
-         VEC4_COPY(source[1], zero); /* G */
-         VEC4_COPY(source[2], zero); /* B */
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-         {
-            float inv_comp[4];
-            VEC4_SUB(inv_comp, one, quadColor[0]); /* R */
-            VEC4_MUL(source[0], quadColor[0], inv_comp); /* R */
-            VEC4_SUB(inv_comp, one, quadColor[1]); /* G */
-            VEC4_MUL(source[1], quadColor[1], inv_comp); /* G */
-            VEC4_SUB(inv_comp, one, quadColor[2]); /* B */
-            VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-         {
-            float inv_alpha[4];
-            VEC4_SUB(inv_alpha, one, quadColor[3]);
-            VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
-            VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
-            VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-         {
-            float inv_alpha[4];
-            VEC4_SUB(inv_alpha, one, dest[3]);
-            VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
-            VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
-            VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_DST_COLOR:
-         {
-            float inv_comp[4];
-            VEC4_SUB(inv_comp, one, dest[0]); /* R */
-            VEC4_MUL(source[0], quadColor[0], inv_comp); /* R */
-            VEC4_SUB(inv_comp, one, dest[1]); /* G */
-            VEC4_MUL(source[1], quadColor[1], inv_comp); /* G */
-            VEC4_SUB(inv_comp, one, dest[2]); /* B */
-            VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-         {
-            float inv_comp[4];
-            /* R */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[0]);
-            VEC4_MUL(source[0], quadColor[0], inv_comp);
-            /* G */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[1]);
-            VEC4_MUL(source[1], quadColor[1], inv_comp);
-            /* B */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[2]);
-            VEC4_MUL(source[2], quadColor[2], inv_comp);
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-         {
-            float inv_alpha[4];
-            VEC4_SCALAR(inv_alpha, 1.0f - softpipe->blend_color.color[3]);
-            VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
-            VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
-            VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
+      VEC4_MUL(source[0], quadColor[0], alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], alpha); /* B */
+      VEC4_MUL(source[3], quadColor[3], alpha); /* A */
+
+      VEC4_SUB(one_minus_alpha, one, alpha);
+      VEC4_MUL(dest[0], dest[0], one_minus_alpha); /* R */
+      VEC4_MUL(dest[1], dest[1], one_minus_alpha); /* G */
+      VEC4_MUL(dest[2], dest[2], one_minus_alpha); /* B */
+      VEC4_MUL(dest[3], dest[3], one_minus_alpha); /* B */
+
+      VEC4_ADD_SAT(quadColor[0], source[0], dest[0]); /* R */
+      VEC4_ADD_SAT(quadColor[1], source[1], dest[1]); /* G */
+      VEC4_ADD_SAT(quadColor[2], source[2], dest[2]); /* B */
+      VEC4_ADD_SAT(quadColor[3], source[3], dest[3]); /* A */
+
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (quad->inout.mask & (1 << j)) {
+            int x = itx + (j & 1);
+            int y = ity + (j >> 1);
+            for (i = 0; i < 4; i++) { /* loop over color chans */
+               tile->data.color[y][x][i] = quadColor[i][j];
+            }
          }
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
-         assert(0); /* to do */
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
-         assert(0); /* to do */
-         break;
-      default:
-         assert(0);
       }
+   }
+}
 
-      /*
-       * Compute src/first term A
-       */
-      switch (softpipe->blend->alpha_src_factor) {
-      case PIPE_BLENDFACTOR_ONE:
-         VEC4_COPY(source[3], quadColor[3]); /* A */
-         break;
-      case PIPE_BLENDFACTOR_SRC_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_SRC_ALPHA:
-         {
-            const float *alpha = quadColor[3];
-            VEC4_MUL(source[3], quadColor[3], alpha); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_DST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_DST_ALPHA:
-         VEC4_MUL(source[3], quadColor[3], dest[3]); /* A */
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-         /* multiply alpha by 1.0 */
-         VEC4_COPY(source[3], quadColor[3]); /* A */
-         break;
-      case PIPE_BLENDFACTOR_CONST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_CONST_ALPHA:
-         {
-            float comp[4];
-            VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
-            VEC4_MUL(source[3], quadColor[3], comp); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_ZERO:
-         VEC4_COPY(source[3], zero); /* A */
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-         {
-            float inv_alpha[4];
-            VEC4_SUB(inv_alpha, one, quadColor[3]);
-            VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_DST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-         {
-            float inv_alpha[4];
-            VEC4_SUB(inv_alpha, one, dest[3]);
-            VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-         {
-            float inv_comp[4];
-            /* A */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
-            VEC4_MUL(source[3], quadColor[3], inv_comp);
+static void
+blend_single_add_one_one(struct quad_stage *qs, 
+                         struct quad_header *quads[],
+                         unsigned nr)
+{
+   float dest[4][QUAD_SIZE];
+   uint i, j, q;
+
+   struct softpipe_cached_tile *tile
+      = sp_get_cached_tile(qs->softpipe->cbuf_cache[0],
+                           quads[0]->input.x0, 
+                           quads[0]->input.y0);
+
+   for (q = 0; q < nr; q++) {
+      struct quad_header *quad = quads[q];
+      float (*quadColor)[4] = quad->output.color[0];
+      const int itx = (quad->input.x0 & (TILE_SIZE-1));
+      const int ity = (quad->input.y0 & (TILE_SIZE-1));
+      
+      /* get/swizzle dest colors */
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = itx + (j & 1);
+         int y = ity + (j >> 1);
+         for (i = 0; i < 4; i++) {
+            dest[i][j] = tile->data.color[y][x][i];
          }
-         break;
-      default:
-         assert(0);
       }
+     
+      VEC4_ADD_SAT(quadColor[0], quadColor[0], dest[0]); /* R */
+      VEC4_ADD_SAT(quadColor[1], quadColor[1], dest[1]); /* G */
+      VEC4_ADD_SAT(quadColor[2], quadColor[2], dest[2]); /* B */
+      VEC4_ADD_SAT(quadColor[3], quadColor[3], dest[3]); /* A */
 
-
-      /*
-       * Compute dest/second term RGB
-       */
-      switch (softpipe->blend->rgb_dst_factor) {
-      case PIPE_BLENDFACTOR_ONE:
-         /* dest = dest * 1   NO-OP, leave dest as-is */
-         break;
-      case PIPE_BLENDFACTOR_SRC_COLOR:
-         VEC4_MUL(dest[0], dest[0], quadColor[0]); /* R */
-         VEC4_MUL(dest[1], dest[1], quadColor[1]); /* G */
-         VEC4_MUL(dest[2], dest[2], quadColor[2]); /* B */
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA:
-         VEC4_MUL(dest[0], dest[0], quadColor[3]); /* R * A */
-         VEC4_MUL(dest[1], dest[1], quadColor[3]); /* G * A */
-         VEC4_MUL(dest[2], dest[2], quadColor[3]); /* B * A */
-         break;
-      case PIPE_BLENDFACTOR_DST_ALPHA:
-         VEC4_MUL(dest[0], dest[0], dest[3]); /* R * A */
-         VEC4_MUL(dest[1], dest[1], dest[3]); /* G * A */
-         VEC4_MUL(dest[2], dest[2], dest[3]); /* B * A */
-         break;
-      case PIPE_BLENDFACTOR_DST_COLOR:
-         VEC4_MUL(dest[0], dest[0], dest[0]); /* R */
-         VEC4_MUL(dest[1], dest[1], dest[1]); /* G */
-         VEC4_MUL(dest[2], dest[2], dest[2]); /* B */
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-         assert(0); /* illegal */
-         break;
-      case PIPE_BLENDFACTOR_CONST_COLOR:
-         {
-            float comp[4];
-            VEC4_SCALAR(comp, softpipe->blend_color.color[0]); /* R */
-            VEC4_MUL(dest[0], dest[0], comp); /* R */
-            VEC4_SCALAR(comp, softpipe->blend_color.color[1]); /* G */
-            VEC4_MUL(dest[1], dest[1], comp); /* G */
-            VEC4_SCALAR(comp, softpipe->blend_color.color[2]); /* B */
-            VEC4_MUL(dest[2], dest[2], comp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_CONST_ALPHA:
-         {
-            float comp[4];
-            VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
-            VEC4_MUL(dest[0], dest[0], comp); /* R */
-            VEC4_MUL(dest[1], dest[1], comp); /* G */
-            VEC4_MUL(dest[2], dest[2], comp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_ZERO:
-         VEC4_COPY(dest[0], zero); /* R */
-         VEC4_COPY(dest[1], zero); /* G */
-         VEC4_COPY(dest[2], zero); /* B */
-         break;
-      case PIPE_BLENDFACTOR_SRC1_COLOR:
-      case PIPE_BLENDFACTOR_SRC1_ALPHA:
-         /* XXX what are these? */
-         assert(0);
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-         {
-            float inv_comp[4];
-            VEC4_SUB(inv_comp, one, quadColor[0]); /* R */
-            VEC4_MUL(dest[0], inv_comp, dest[0]); /* R */
-            VEC4_SUB(inv_comp, one, quadColor[1]); /* G */
-            VEC4_MUL(dest[1], inv_comp, dest[1]); /* G */
-            VEC4_SUB(inv_comp, one, quadColor[2]); /* B */
-            VEC4_MUL(dest[2], inv_comp, dest[2]); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-         {
-            float one_minus_alpha[QUAD_SIZE];
-            VEC4_SUB(one_minus_alpha, one, quadColor[3]);
-            VEC4_MUL(dest[0], dest[0], one_minus_alpha); /* R */
-            VEC4_MUL(dest[1], dest[1], one_minus_alpha); /* G */
-            VEC4_MUL(dest[2], dest[2], one_minus_alpha); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-         {
-            float inv_comp[4];
-            VEC4_SUB(inv_comp, one, dest[3]); /* A */
-            VEC4_MUL(dest[0], inv_comp, dest[0]); /* R */
-            VEC4_MUL(dest[1], inv_comp, dest[1]); /* G */
-            VEC4_MUL(dest[2], inv_comp, dest[2]); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_DST_COLOR:
-         {
-            float inv_comp[4];
-            VEC4_SUB(inv_comp, one, dest[0]); /* R */
-            VEC4_MUL(dest[0], dest[0], inv_comp); /* R */
-            VEC4_SUB(inv_comp, one, dest[1]); /* G */
-            VEC4_MUL(dest[1], dest[1], inv_comp); /* G */
-            VEC4_SUB(inv_comp, one, dest[2]); /* B */
-            VEC4_MUL(dest[2], dest[2], inv_comp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-         {
-            float inv_comp[4];
-            /* R */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[0]);
-            VEC4_MUL(dest[0], dest[0], inv_comp);
-            /* G */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[1]);
-            VEC4_MUL(dest[1], dest[1], inv_comp);
-            /* B */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[2]);
-            VEC4_MUL(dest[2], dest[2], inv_comp);
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-         {
-            float inv_comp[4];
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
-            VEC4_MUL(dest[0], dest[0], inv_comp);
-            VEC4_MUL(dest[1], dest[1], inv_comp);
-            VEC4_MUL(dest[2], dest[2], inv_comp);
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (quad->inout.mask & (1 << j)) {
+            int x = itx + (j & 1);
+            int y = ity + (j >> 1);
+            for (i = 0; i < 4; i++) { /* loop over color chans */
+               tile->data.color[y][x][i] = quadColor[i][j];
+            }
          }
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
-      case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
-         /* XXX what are these? */
-         assert(0);
-         break;
-      default:
-         assert(0);
       }
+   }
+}
 
-      /*
-       * Compute dest/second term A
-       */
-      switch (softpipe->blend->alpha_dst_factor) {
-      case PIPE_BLENDFACTOR_ONE:
-         /* dest = dest * 1   NO-OP, leave dest as-is */
-         break;
-      case PIPE_BLENDFACTOR_SRC_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_SRC_ALPHA:
-         VEC4_MUL(dest[3], dest[3], quadColor[3]); /* A * A */
-         break;
-      case PIPE_BLENDFACTOR_DST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_DST_ALPHA:
-         VEC4_MUL(dest[3], dest[3], dest[3]); /* A */
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-         assert(0); /* illegal */
-         break;
-      case PIPE_BLENDFACTOR_CONST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_CONST_ALPHA:
-         {
-            float comp[4];
-            VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
-            VEC4_MUL(dest[3], dest[3], comp); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_ZERO:
-         VEC4_COPY(dest[3], zero); /* A */
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-         {
-            float one_minus_alpha[QUAD_SIZE];
-            VEC4_SUB(one_minus_alpha, one, quadColor[3]);
-            VEC4_MUL(dest[3], dest[3], one_minus_alpha); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_DST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-         {
-            float inv_comp[4];
-            VEC4_SUB(inv_comp, one, dest[3]); /* A */
-            VEC4_MUL(dest[3], inv_comp, dest[3]); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-         {
-            float inv_comp[4];
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
-            VEC4_MUL(dest[3], dest[3], inv_comp);
+
+static void
+single_output_color(struct quad_stage *qs, 
+                    struct quad_header *quads[],
+                    unsigned nr)
+{
+   uint i, j, q;
+
+   struct softpipe_cached_tile *tile
+      = sp_get_cached_tile(qs->softpipe->cbuf_cache[0],
+                           quads[0]->input.x0, 
+                           quads[0]->input.y0);
+
+   for (q = 0; q < nr; q++) {
+      struct quad_header *quad = quads[q];
+      float (*quadColor)[4] = quad->output.color[0];
+      const int itx = (quad->input.x0 & (TILE_SIZE-1));
+      const int ity = (quad->input.y0 & (TILE_SIZE-1));
+      
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (quad->inout.mask & (1 << j)) {
+            int x = itx + (j & 1);
+            int y = ity + (j >> 1);
+            for (i = 0; i < 4; i++) { /* loop over color chans */
+               tile->data.color[y][x][i] = quadColor[i][j];
+            }
          }
-         break;
-      default:
-         assert(0);
       }
+   }
+}
+
+static void
+blend_noop(struct quad_stage *qs, 
+           struct quad_header *quads[],
+           unsigned nr)
+{
+}
 
-      /*
-       * Combine RGB terms
-       */
-      switch (softpipe->blend->rgb_func) {
-      case PIPE_BLEND_ADD:
-         VEC4_ADD_SAT(quadColor[0], source[0], dest[0]); /* R */
-         VEC4_ADD_SAT(quadColor[1], source[1], dest[1]); /* G */
-         VEC4_ADD_SAT(quadColor[2], source[2], dest[2]); /* B */
-         break;
-      case PIPE_BLEND_SUBTRACT:
-         VEC4_SUB_SAT(quadColor[0], source[0], dest[0]); /* R */
-         VEC4_SUB_SAT(quadColor[1], source[1], dest[1]); /* G */
-         VEC4_SUB_SAT(quadColor[2], source[2], dest[2]); /* B */
-         break;
-      case PIPE_BLEND_REVERSE_SUBTRACT:
-         VEC4_SUB_SAT(quadColor[0], dest[0], source[0]); /* R */
-         VEC4_SUB_SAT(quadColor[1], dest[1], source[1]); /* G */
-         VEC4_SUB_SAT(quadColor[2], dest[2], source[2]); /* B */
-         break;
-      case PIPE_BLEND_MIN:
-         VEC4_MIN(quadColor[0], source[0], dest[0]); /* R */
-         VEC4_MIN(quadColor[1], source[1], dest[1]); /* G */
-         VEC4_MIN(quadColor[2], source[2], dest[2]); /* B */
-         break;
-      case PIPE_BLEND_MAX:
-         VEC4_MAX(quadColor[0], source[0], dest[0]); /* R */
-         VEC4_MAX(quadColor[1], source[1], dest[1]); /* G */
-         VEC4_MAX(quadColor[2], source[2], dest[2]); /* B */
-         break;
-      default:
-         assert(0);
-      }
 
-      /*
-       * Combine A terms
-       */
-      switch (softpipe->blend->alpha_func) {
-      case PIPE_BLEND_ADD:
-         VEC4_ADD_SAT(quadColor[3], source[3], dest[3]); /* A */
-         break;
-      case PIPE_BLEND_SUBTRACT:
-         VEC4_SUB_SAT(quadColor[3], source[3], dest[3]); /* A */
-         break;
-      case PIPE_BLEND_REVERSE_SUBTRACT:
-         VEC4_SUB_SAT(quadColor[3], dest[3], source[3]); /* A */
-         break;
-      case PIPE_BLEND_MIN:
-         VEC4_MIN(quadColor[3], source[3], dest[3]); /* A */
-         break;
-      case PIPE_BLEND_MAX:
-         VEC4_MAX(quadColor[3], source[3], dest[3]); /* A */
-         break;
-      default:
-         assert(0);
+static void
+choose_blend_quad(struct quad_stage *qs, 
+                  struct quad_header *quads[],
+                  unsigned nr)
+{
+   struct softpipe_context *softpipe = qs->softpipe;
+   const struct pipe_blend_state *blend = softpipe->blend;
+
+   qs->run = blend_fallback;
+   
+   if (softpipe->framebuffer.nr_cbufs == 0) {
+      qs->run = blend_noop;
+   }
+   else if (!softpipe->blend->logicop_enable &&
+            softpipe->blend->colormask == 0xf) 
+   {
+      if (!blend->blend_enable) {
+         qs->run = single_output_color;
       }
+      else if (blend->rgb_src_factor == blend->alpha_src_factor &&
+               blend->rgb_dst_factor == blend->alpha_dst_factor &&
+               blend->rgb_func == blend->alpha_func &&
+               softpipe->framebuffer.nr_cbufs == 1)
+      {
+         if (blend->alpha_func == PIPE_BLEND_ADD) {
+            if (blend->rgb_src_factor == PIPE_BLENDFACTOR_ONE &&
+                blend->rgb_dst_factor == PIPE_BLENDFACTOR_ONE) {
+               qs->run = blend_single_add_one_one;
+            }
+            else if (blend->rgb_src_factor == PIPE_BLENDFACTOR_SRC_ALPHA &&
+                blend->rgb_dst_factor == PIPE_BLENDFACTOR_INV_SRC_ALPHA)
+               qs->run = blend_single_add_src_alpha_inv_src_alpha;
 
-   } /* cbuf loop */
+         }
+      }
+   }
 
-   /* pass blended quad to next stage */
-   qs->next->run(qs->next, quad);
+   qs->run(qs, quads, nr);
 }
 
 
 static void blend_begin(struct quad_stage *qs)
 {
-   qs->next->begin(qs->next);
+   qs->run = choose_blend_quad;
 }
 
 
@@ -770,7 +991,7 @@ struct quad_stage *sp_quad_blend_stage( struct softpipe_context *softpipe )
 
    stage->softpipe = softpipe;
    stage->begin = blend_begin;
-   stage->run = blend_quad;
+   stage->run = choose_blend_quad;
    stage->destroy = blend_destroy;
 
    return stage;
diff --git a/src/gallium/drivers/softpipe/sp_quad_bufloop.c b/src/gallium/drivers/softpipe/sp_quad_bufloop.c
deleted file mode 100644
index 953d8516b9..0000000000
--- a/src/gallium/drivers/softpipe/sp_quad_bufloop.c
+++ /dev/null
@@ -1,74 +0,0 @@
-
-#include "util/u_memory.h"
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_surface.h"
-#include "sp_quad_pipe.h"
-
-
-/**
- * Loop over colorbuffers, passing quad to next stage each time.
- */
-static void
-cbuf_loop_quad(struct quad_stage *qs, struct quad_header *quad)
-{
-   struct softpipe_context *softpipe = qs->softpipe;
-   float tmp[PIPE_MAX_COLOR_BUFS][4][QUAD_SIZE];
-   unsigned i;
-
-   assert(sizeof(quad->outputs.color) == sizeof(tmp));
-   assert(softpipe->framebuffer.nr_cbufs <= PIPE_MAX_COLOR_BUFS);
-
-   /* make copy of original colors since they can get modified
-    * by blending and masking.
-    * XXX we won't have to do this if the fragment program actually emits
-    * N separate colors and we're drawing to N color buffers (MRT).
-    * But if we emitted one color and glDrawBuffer(GL_FRONT_AND_BACK) is
-    * in effect, we need to save/restore colors like this.
-    */
-   memcpy(tmp, quad->outputs.color, sizeof(tmp));
-
-   for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++) {
-      /* set current cbuffer */
-#if 0 /* obsolete & going away */
-      softpipe->current_cbuf = i;
-#endif
-
-      /* pass blended quad to next stage */
-      qs->next->run(qs->next, quad);
-
-      /* restore quad's colors for next buffer */
-      memcpy(quad->outputs.color, tmp, sizeof(tmp));
-   }
-}
-
-
-static void cbuf_loop_begin(struct quad_stage *qs)
-{
-   qs->next->begin(qs->next);
-}
-
-
-static void cbuf_loop_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
-
-/**
- * Create the colorbuffer loop stage.
- * This is used to implement multiple render targets and GL_FRONT_AND_BACK
- * rendering.
- */
-struct quad_stage *sp_quad_bufloop_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
-
-   stage->softpipe = softpipe;
-   stage->begin = cbuf_loop_begin;
-   stage->run = cbuf_loop_quad;
-   stage->destroy = cbuf_loop_destroy;
-
-   return stage;
-}
-
diff --git a/src/gallium/drivers/softpipe/sp_quad_colormask.c b/src/gallium/drivers/softpipe/sp_quad_colormask.c
deleted file mode 100644
index dc90e5d5e9..0000000000
--- a/src/gallium/drivers/softpipe/sp_quad_colormask.c
+++ /dev/null
@@ -1,116 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * \brief  quad colormask stage
- * \author Brian Paul
- */
-
-#include "pipe/p_defines.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_surface.h"
-#include "sp_quad_pipe.h"
-#include "sp_tile_cache.h"
-
-
-
-/**
- * XXX colormask could be rolled into blending...
- */
-static void
-colormask_quad(struct quad_stage *qs, struct quad_header *quad)
-{
-   struct softpipe_context *softpipe = qs->softpipe;
-   uint cbuf;
-
-   /* loop over colorbuffer outputs */
-   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) {
-      float dest[4][QUAD_SIZE];
-      struct softpipe_cached_tile *tile
-         = sp_get_cached_tile(softpipe,
-                              softpipe->cbuf_cache[cbuf],
-                              quad->input.x0, quad->input.y0);
-      float (*quadColor)[4] = quad->output.color[cbuf];
-      uint i, j;
-
-      /* get/swizzle dest colors */
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = (quad->input.x0 & (TILE_SIZE-1)) + (j & 1);
-         int y = (quad->input.y0 & (TILE_SIZE-1)) + (j >> 1);
-         for (i = 0; i < 4; i++) {
-            dest[i][j] = tile->data.color[y][x][i];
-         }
-      }
-
-      /* R */
-      if (!(softpipe->blend->colormask & PIPE_MASK_R))
-          COPY_4V(quadColor[0], dest[0]);
-
-      /* G */
-      if (!(softpipe->blend->colormask & PIPE_MASK_G))
-          COPY_4V(quadColor[1], dest[1]);
-
-      /* B */
-      if (!(softpipe->blend->colormask & PIPE_MASK_B))
-          COPY_4V(quadColor[2], dest[2]);
-
-      /* A */
-      if (!(softpipe->blend->colormask & PIPE_MASK_A))
-          COPY_4V(quadColor[3], dest[3]);
-   }
-
-   /* pass quad to next stage */
-   qs->next->run(qs->next, quad);
-}
-
-
-static void colormask_begin(struct quad_stage *qs)
-{
-   qs->next->begin(qs->next);
-}
-
-
-static void colormask_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
-
-struct quad_stage *sp_quad_colormask_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
-
-   stage->softpipe = softpipe;
-   stage->begin = colormask_begin;
-   stage->run = colormask_quad;
-   stage->destroy = colormask_destroy;
-
-   return stage;
-}
diff --git a/src/gallium/drivers/softpipe/sp_quad_coverage.c b/src/gallium/drivers/softpipe/sp_quad_coverage.c
deleted file mode 100644
index 4aeee85870..0000000000
--- a/src/gallium/drivers/softpipe/sp_quad_coverage.c
+++ /dev/null
@@ -1,94 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-/**
- * \brief  Apply AA coverage to quad alpha valus
- * \author  Brian Paul
- */
-
-
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_quad_pipe.h"
-
-
-/**
- * Multiply quad's alpha values by the fragment coverage.
- */
-static void
-coverage_quad(struct quad_stage *qs, struct quad_header *quad)
-{
-   struct softpipe_context *softpipe = qs->softpipe;
-   const uint prim = quad->input.prim;
-
-   if ((softpipe->rasterizer->poly_smooth && prim == QUAD_PRIM_TRI) ||
-       (softpipe->rasterizer->line_smooth && prim == QUAD_PRIM_LINE) ||
-       (softpipe->rasterizer->point_smooth && prim == QUAD_PRIM_POINT)) {
-      uint cbuf;
-
-      /* loop over colorbuffer outputs */
-      for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) {
-         float (*quadColor)[4] = quad->output.color[cbuf];
-         unsigned j;
-         for (j = 0; j < QUAD_SIZE; j++) {
-            assert(quad->input.coverage[j] >= 0.0);
-            assert(quad->input.coverage[j] <= 1.0);
-         quadColor[3][j] *= quad->input.coverage[j];
-         }
-      }
-   }
-
-   qs->next->run(qs->next, quad);
-}
-
-
-static void coverage_begin(struct quad_stage *qs)
-{
-   qs->next->begin(qs->next);
-}
-
-
-static void coverage_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
-
-struct quad_stage *sp_quad_coverage_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
-
-   stage->softpipe = softpipe;
-   stage->begin = coverage_begin;
-   stage->run = coverage_quad;
-   stage->destroy = coverage_destroy;
-
-   return stage;
-}
diff --git a/src/gallium/drivers/softpipe/sp_quad_depth_test.c b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
index d463930bae..0ca86c4e1c 100644
--- a/src/gallium/drivers/softpipe/sp_quad_depth_test.c
+++ b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
@@ -31,61 +31,109 @@
 
 #include "pipe/p_defines.h"
 #include "util/u_memory.h"
+#include "tgsi/tgsi_scan.h"
 #include "sp_context.h"
 #include "sp_quad.h"
 #include "sp_surface.h"
 #include "sp_quad_pipe.h"
 #include "sp_tile_cache.h"
+#include "sp_state.h"           /* for sp_fragment_shader */
 
 
-/**
- * Do depth testing for a quad.
- * Not static since it's used by the stencil code.
- */
+struct depth_data {
+   struct pipe_surface *ps;
+   enum pipe_format format;
+   unsigned bzzzz[QUAD_SIZE];  /**< Z values fetched from depth buffer */
+   unsigned qzzzz[QUAD_SIZE];  /**< Z values from the quad */
+   ubyte stencilVals[QUAD_SIZE];
+   struct softpipe_cached_tile *tile;
+};
 
-/*
- * To increase efficiency, we should probably have multiple versions
- * of this function that are specifically for Z16, Z32 and FP Z buffers.
- * Try to effectively do that with codegen...
- */
 
-void
-sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
+
+static void
+get_depth_stencil_values( struct depth_data *data,
+                          const struct quad_header *quad )
 {
-   struct softpipe_context *softpipe = qs->softpipe;
-   struct pipe_surface *ps = softpipe->framebuffer.zsbuf;
-   const enum pipe_format format = ps->format;
-   unsigned bzzzz[QUAD_SIZE];  /**< Z values fetched from depth buffer */
-   unsigned qzzzz[QUAD_SIZE];  /**< Z values from the quad */
-   unsigned zmask = 0;
    unsigned j;
-   struct softpipe_cached_tile *tile
-      = sp_get_cached_tile(softpipe, softpipe->zsbuf_cache, quad->input.x0, quad->input.y0);
+   const struct softpipe_cached_tile *tile = data->tile;
+
+   switch (data->format) {
+   case PIPE_FORMAT_Z16_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         data->bzzzz[j] = tile->data.depth16[y][x];
+      }
+      break;
+   case PIPE_FORMAT_Z32_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         data->bzzzz[j] = tile->data.depth32[y][x];
+      }
+      break;
+   case PIPE_FORMAT_X8Z24_UNORM:
+   case PIPE_FORMAT_S8Z24_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         data->bzzzz[j] = tile->data.depth32[y][x] & 0xffffff;
+         data->stencilVals[j] = tile->data.depth32[y][x] >> 24;
+      }
+   break;
+   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_Z24S8_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         data->bzzzz[j] = tile->data.depth32[y][x] >> 8;
+         data->stencilVals[j] = tile->data.depth32[y][x] & 0xff;
+      }
+      break;
+   default:
+      assert(0);
+   }
+}
 
-   assert(ps); /* shouldn't get here if there's no zbuffer */
+/* If the shader has not been run, interpolate the depth values
+ * ourselves.
+ */
+static void
+interpolate_quad_depth( struct quad_header *quad )
+{
+   const float fx = (float) quad->input.x0;
+   const float fy = (float) quad->input.y0;
+   const float dzdx = quad->posCoef->dadx[2];
+   const float dzdy = quad->posCoef->dady[2];
+   const float z0 = quad->posCoef->a0[2] + dzdx * fx + dzdy * fy;
 
-   /*
-    * Convert quad's float depth values to int depth values (qzzzz).
+   quad->output.depth[0] = z0;
+   quad->output.depth[1] = z0 + dzdx;
+   quad->output.depth[2] = z0 + dzdy;
+   quad->output.depth[3] = z0 + dzdx + dzdy;
+}
+
+
+static void
+convert_quad_depth( struct depth_data *data, 
+                    const struct quad_header *quad )
+{
+   unsigned j;
+
+   /* Convert quad's float depth values to int depth values (qzzzz).
     * If the Z buffer stores integer values, we _have_ to do the depth
     * compares with integers (not floats).  Otherwise, the float->int->float
     * conversion of Z values (which isn't an identity function) will cause
     * Z-fighting errors.
-    *
-    * Also, get the zbuffer values (bzzzz) from the cached tile.
     */
-   switch (format) {
+   switch (data->format) {
    case PIPE_FORMAT_Z16_UNORM:
       {
          float scale = 65535.0;
 
          for (j = 0; j < QUAD_SIZE; j++) {
-            qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
-         }
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            bzzzz[j] = tile->data.depth16[y][x];
+            data->qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
          }
       }
       break;
@@ -94,47 +142,247 @@ sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
          double scale = (double) (uint) ~0UL;
 
          for (j = 0; j < QUAD_SIZE; j++) {
-            qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
-         }
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            bzzzz[j] = tile->data.depth32[y][x];
+            data->qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
          }
       }
       break;
    case PIPE_FORMAT_X8Z24_UNORM:
-      /* fall-through */
    case PIPE_FORMAT_S8Z24_UNORM:
       {
          float scale = (float) ((1 << 24) - 1);
 
          for (j = 0; j < QUAD_SIZE; j++) {
-            qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
-         }
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            bzzzz[j] = tile->data.depth32[y][x] & 0xffffff;
+            data->qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
          }
       }
       break;
    case PIPE_FORMAT_Z24X8_UNORM:
-      /* fall-through */
    case PIPE_FORMAT_Z24S8_UNORM:
       {
          float scale = (float) ((1 << 24) - 1);
 
          for (j = 0; j < QUAD_SIZE; j++) {
-            qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
+            data->qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
          }
+      }
+      break;
+   default:
+      assert(0);
+   }
+}
 
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            bzzzz[j] = tile->data.depth32[y][x] >> 8;
+
+
+static void
+write_depth_stencil_values( struct depth_data *data,
+                            struct quad_header *quad )
+{
+   struct softpipe_cached_tile *tile = data->tile;
+   unsigned j;
+
+   /* put updated Z values back into cached tile */
+   switch (data->format) {
+   case PIPE_FORMAT_Z16_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         tile->data.depth16[y][x] = (ushort) data->bzzzz[j];
+      }
+      break;
+   case PIPE_FORMAT_X8Z24_UNORM:
+   case PIPE_FORMAT_Z32_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         tile->data.depth32[y][x] = data->bzzzz[j];
+      }
+      break;
+   case PIPE_FORMAT_S8Z24_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         tile->data.depth32[y][x] = (data->stencilVals[j] << 24) | data->bzzzz[j];
+      }
+      break;
+   case PIPE_FORMAT_Z24S8_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         tile->data.depth32[y][x] = (data->bzzzz[j] << 8) | data->stencilVals[j];
+      }
+      break;
+   case PIPE_FORMAT_Z24X8_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         tile->data.depth32[y][x] = data->bzzzz[j] << 8;
+      }
+      break;
+   default:
+      assert(0);
+   }
+}
+
+
+
+
+/** Only 8-bit stencil supported */
+#define STENCIL_MAX 0xff
+
+
+/**
+ * Do the basic stencil test (compare stencil buffer values against the
+ * reference value.
+ *
+ * \param data->stencilVals  the stencil values from the stencil buffer
+ * \param func  the stencil func (PIPE_FUNC_x)
+ * \param ref  the stencil reference value
+ * \param valMask  the stencil value mask indicating which bits of the stencil
+ *                 values and ref value are to be used.
+ * \return mask indicating which pixels passed the stencil test
+ */
+static unsigned
+do_stencil_test(struct depth_data *data,
+                unsigned func,
+                unsigned ref, unsigned valMask)
+{
+   unsigned passMask = 0x0;
+   unsigned j;
+
+   ref &= valMask;
+
+   switch (func) {
+   case PIPE_FUNC_NEVER:
+      /* passMask = 0x0 */
+      break;
+   case PIPE_FUNC_LESS:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref < (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_EQUAL:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref == (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_LEQUAL:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref <= (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_GREATER:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref > (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_NOTEQUAL:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref != (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_GEQUAL:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref >= (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_ALWAYS:
+      passMask = MASK_ALL;
+      break;
+   default:
+      assert(0);
+   }
+
+   return passMask;
+}
+
+
+/**
+ * Apply the stencil operator to stencil values.
+ *
+ * \param data->stencilVals  the stencil buffer values (read and written)
+ * \param mask  indicates which pixels to update
+ * \param op  the stencil operator (PIPE_STENCIL_OP_x)
+ * \param ref  the stencil reference value
+ * \param wrtMask  writemask controlling which bits are changed in the
+ *                 stencil values
+ */
+static void
+apply_stencil_op(struct depth_data *data,
+                 unsigned mask, unsigned op, ubyte ref, ubyte wrtMask)
+{
+   unsigned j;
+   ubyte newstencil[QUAD_SIZE];
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      newstencil[j] = data->stencilVals[j];
+   }
+
+   switch (op) {
+   case PIPE_STENCIL_OP_KEEP:
+      /* no-op */
+      break;
+   case PIPE_STENCIL_OP_ZERO:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            newstencil[j] = 0;
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_REPLACE:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            newstencil[j] = ref;
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_INCR:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            if (data->stencilVals[j] < STENCIL_MAX) {
+               newstencil[j] = data->stencilVals[j] + 1;
+            }
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_DECR:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            if (data->stencilVals[j] > 0) {
+               newstencil[j] = data->stencilVals[j] - 1;
+            }
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_INCR_WRAP:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            newstencil[j] = data->stencilVals[j] + 1;
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_DECR_WRAP:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            newstencil[j] = data->stencilVals[j] - 1;
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_INVERT:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            newstencil[j] = ~data->stencilVals[j];
          }
       }
       break;
@@ -142,6 +390,39 @@ sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
       assert(0);
    }
 
+   /*
+    * update the stencil values
+    */
+   if (wrtMask != STENCIL_MAX) {
+      /* apply bit-wise stencil buffer writemask */
+      for (j = 0; j < QUAD_SIZE; j++) {
+         data->stencilVals[j] = (wrtMask & newstencil[j]) | (~wrtMask & data->stencilVals[j]);
+      }
+   }
+   else {
+      for (j = 0; j < QUAD_SIZE; j++) {
+         data->stencilVals[j] = newstencil[j];
+      }
+   }
+}
+
+   
+
+/*
+ * To increase efficiency, we should probably have multiple versions
+ * of this function that are specifically for Z16, Z32 and FP Z buffers.
+ * Try to effectively do that with codegen...
+ */
+
+static boolean
+depth_test_quad(struct quad_stage *qs, 
+                struct depth_data *data,
+                struct quad_header *quad)
+{
+   struct softpipe_context *softpipe = qs->softpipe;
+   unsigned zmask = 0;
+   unsigned j;
+
    switch (softpipe->depth_stencil->depth.func) {
    case PIPE_FUNC_NEVER:
       /* zmask = 0 */
@@ -151,37 +432,37 @@ sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
        * Like this:  quad->mask &= (quad->outputs.depth < zzzz);
        */
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (qzzzz[j] < bzzzz[j]) 
+	 if (data->qzzzz[j] < data->bzzzz[j]) 
 	    zmask |= 1 << j;
       }
       break;
    case PIPE_FUNC_EQUAL:
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (qzzzz[j] == bzzzz[j]) 
+	 if (data->qzzzz[j] == data->bzzzz[j]) 
 	    zmask |= 1 << j;
       }
       break;
    case PIPE_FUNC_LEQUAL:
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (qzzzz[j] <= bzzzz[j]) 
+	 if (data->qzzzz[j] <= data->bzzzz[j]) 
 	    zmask |= (1 << j);
       }
       break;
    case PIPE_FUNC_GREATER:
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (qzzzz[j] > bzzzz[j]) 
+	 if (data->qzzzz[j] > data->bzzzz[j]) 
 	    zmask |= (1 << j);
       }
       break;
    case PIPE_FUNC_NOTEQUAL:
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (qzzzz[j] != bzzzz[j]) 
+	 if (data->qzzzz[j] != data->bzzzz[j]) 
 	    zmask |= (1 << j);
       }
       break;
    case PIPE_FUNC_GEQUAL:
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (qzzzz[j] >= bzzzz[j]) 
+	 if (data->qzzzz[j] >= data->bzzzz[j]) 
 	    zmask |= (1 << j);
       }
       break;
@@ -193,80 +474,480 @@ sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
    }
 
    quad->inout.mask &= zmask;
+   if (quad->inout.mask == 0)
+      return FALSE;
 
+   /* Update our internal copy only if writemask set.  Even if
+    * depth.writemask is FALSE, may still need to write out buffer
+    * data due to stencil changes.
+    */
    if (softpipe->depth_stencil->depth.writemask) {
-      
-      /* This is also efficient with sse / spe instructions: 
-       */
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (quad->inout.mask & (1 << j)) {
-	    bzzzz[j] = qzzzz[j];
-	 }
+         if (quad->inout.mask & (1 << j)) {
+            data->bzzzz[j] = data->qzzzz[j];
+         }
       }
+   }
 
-      /* put updated Z values back into cached tile */
-      switch (format) {
-      case PIPE_FORMAT_Z16_UNORM:
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            tile->data.depth16[y][x] = (ushort) bzzzz[j];
+   return TRUE;
+}
+
+
+
+/**
+ * Do stencil (and depth) testing.  Stenciling depends on the outcome of
+ * depth testing.
+ */
+static void
+depth_stencil_test_quad(struct quad_stage *qs, 
+                        struct depth_data *data,
+                        struct quad_header *quad)
+{
+   struct softpipe_context *softpipe = qs->softpipe;
+   unsigned func, zFailOp, zPassOp, failOp;
+   ubyte ref, wrtMask, valMask;
+   uint face = quad->input.facing;
+
+   if (!softpipe->depth_stencil->stencil[1].enabled) {
+      /* single-sided stencil test, use front (face=0) state */
+      face = 0;
+   }
+
+   /* choose front or back face function, operator, etc */
+   /* XXX we could do these initializations once per primitive */
+   func    = softpipe->depth_stencil->stencil[face].func;
+   failOp  = softpipe->depth_stencil->stencil[face].fail_op;
+   zFailOp = softpipe->depth_stencil->stencil[face].zfail_op;
+   zPassOp = softpipe->depth_stencil->stencil[face].zpass_op;
+   ref     = softpipe->depth_stencil->stencil[face].ref_value;
+   wrtMask = softpipe->depth_stencil->stencil[face].writemask;
+   valMask = softpipe->depth_stencil->stencil[face].valuemask;
+
+
+   /* do the stencil test first */
+   {
+      unsigned passMask, failMask;
+      passMask = do_stencil_test(data, func, ref, valMask);
+      failMask = quad->inout.mask & ~passMask;
+      quad->inout.mask &= passMask;
+
+      if (failOp != PIPE_STENCIL_OP_KEEP) {
+         apply_stencil_op(data, failMask, failOp, ref, wrtMask);
+      }
+   }
+
+   if (quad->inout.mask) {
+      /* now the pixels that passed the stencil test are depth tested */
+      if (softpipe->depth_stencil->depth.enabled) {
+         const unsigned origMask = quad->inout.mask;
+
+         depth_test_quad(qs, data, quad);  /* quad->mask is updated */
+
+         /* update stencil buffer values according to z pass/fail result */
+         if (zFailOp != PIPE_STENCIL_OP_KEEP) {
+            const unsigned zFailMask = origMask & ~quad->inout.mask;
+            apply_stencil_op(data, zFailMask, zFailOp, ref, wrtMask);
          }
-         break;
-      case PIPE_FORMAT_X8Z24_UNORM:
-         /* fall-through */
-         /* (yes, this falls through to a different case than above) */
-      case PIPE_FORMAT_Z32_UNORM:
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            tile->data.depth32[y][x] = bzzzz[j];
+
+         if (zPassOp != PIPE_STENCIL_OP_KEEP) {
+            const unsigned zPassMask = origMask & quad->inout.mask;
+            apply_stencil_op(data, zPassMask, zPassOp, ref, wrtMask);
          }
-         break;
-      case PIPE_FORMAT_S8Z24_UNORM:
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            uint s8z24 = tile->data.depth32[y][x];
-            s8z24 = (s8z24 & 0xff000000) | bzzzz[j];
-            tile->data.depth32[y][x] = s8z24;
+      }
+      else {
+         /* no depth test, apply Zpass operator to stencil buffer values */
+         apply_stencil_op(data, quad->inout.mask, zPassOp, ref, wrtMask);
+      }
+   }
+}
+
+
+#define ALPHATEST( FUNC, COMP )                                         \
+   static int                                                          \
+   alpha_test_quads_##FUNC( struct quad_stage *qs,                      \
+                           struct quad_header *quads[],                 \
+                           unsigned nr )                                \
+   {                                                                    \
+      const float ref = qs->softpipe->depth_stencil->alpha.ref_value;   \
+      const uint cbuf = 0; /* only output[0].alpha is tested */         \
+      unsigned pass_nr = 0;                                             \
+      unsigned i;                                                       \
+                                                                        \
+      for (i = 0; i < nr; i++) {                                        \
+         const float *aaaa = quads[i]->output.color[cbuf][3];           \
+         unsigned passMask = 0;                                         \
+                                                                        \
+         if (aaaa[0] COMP ref) passMask |= (1 << 0);                    \
+         if (aaaa[1] COMP ref) passMask |= (1 << 1);                    \
+         if (aaaa[2] COMP ref) passMask |= (1 << 2);                    \
+         if (aaaa[3] COMP ref) passMask |= (1 << 3);                    \
+                                                                        \
+         quads[i]->inout.mask &= passMask;                              \
+                                                                        \
+         if (quads[i]->inout.mask)                                      \
+            quads[pass_nr++] = quads[i];                                \
+      }                                                                 \
+                                                                        \
+      return pass_nr;                                                   \
+   }
+
+
+ALPHATEST( LESS,     < )
+ALPHATEST( EQUAL,    == )
+ALPHATEST( LEQUAL,   <= )
+ALPHATEST( GREATER,  > )
+ALPHATEST( NOTEQUAL, != )
+ALPHATEST( GEQUAL,   >= )
+
+
+/* XXX: Incorporate into shader using KILP.
+ */
+static int
+alpha_test_quads(struct quad_stage *qs, 
+                 struct quad_header *quads[], 
+                 unsigned nr)
+{
+   switch (qs->softpipe->depth_stencil->alpha.func) {
+   case PIPE_FUNC_LESS:
+      return alpha_test_quads_LESS( qs, quads, nr );
+   case PIPE_FUNC_EQUAL:
+      return alpha_test_quads_EQUAL( qs, quads, nr );
+      break;
+   case PIPE_FUNC_LEQUAL:
+      return alpha_test_quads_LEQUAL( qs, quads, nr );
+   case PIPE_FUNC_GREATER:
+      return alpha_test_quads_GREATER( qs, quads, nr );
+   case PIPE_FUNC_NOTEQUAL:
+      return alpha_test_quads_NOTEQUAL( qs, quads, nr );
+   case PIPE_FUNC_GEQUAL:
+      return alpha_test_quads_GEQUAL( qs, quads, nr );
+   case PIPE_FUNC_ALWAYS:
+      return nr;
+   case PIPE_FUNC_NEVER:
+   default:
+      return 0;
+   }
+}
+
+static unsigned mask_count[16] = 
+{
+   0,                           /* 0x0 */
+   1,                           /* 0x1 */
+   1,                           /* 0x2 */
+   2,                           /* 0x3 */
+   1,                           /* 0x4 */
+   2,                           /* 0x5 */
+   2,                           /* 0x6 */
+   3,                           /* 0x7 */
+   1,                           /* 0x8 */
+   2,                           /* 0x9 */
+   2,                           /* 0xa */
+   3,                           /* 0xb */
+   2,                           /* 0xc */
+   3,                           /* 0xd */
+   3,                           /* 0xe */
+   4,                           /* 0xf */
+};
+
+
+
+static void
+depth_test_quads_fallback(struct quad_stage *qs, 
+                          struct quad_header *quads[],
+                          unsigned nr)
+{
+   unsigned i, pass = 0;
+   const struct sp_fragment_shader *fs = qs->softpipe->fs;
+   boolean interp_depth = !fs->info.writes_z;
+   struct depth_data data;
+
+
+   if (qs->softpipe->depth_stencil->alpha.enabled) {
+      nr = alpha_test_quads(qs, quads, nr);
+   }
+
+   if (qs->softpipe->framebuffer.zsbuf && 
+       (qs->softpipe->depth_stencil->depth.enabled ||
+        qs->softpipe->depth_stencil->stencil[0].enabled)) {
+
+      data.ps = qs->softpipe->framebuffer.zsbuf;
+      data.format = data.ps->format;
+      data.tile = sp_get_cached_tile(qs->softpipe->zsbuf_cache, 
+                                     quads[0]->input.x0, 
+                                     quads[0]->input.y0);
+
+      for (i = 0; i < nr; i++) {
+         get_depth_stencil_values(&data, quads[i]);
+
+         if (qs->softpipe->depth_stencil->depth.enabled) {
+            if (interp_depth)
+               interpolate_quad_depth(quads[i]);
+
+            convert_quad_depth(&data, quads[i]);
          }
-         break;
-      case PIPE_FORMAT_Z24S8_UNORM:
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            uint z24s8 = tile->data.depth32[y][x];
-            z24s8 = (z24s8 & 0xff) | (bzzzz[j] << 8);
-            tile->data.depth32[y][x] = z24s8;
+
+         if (qs->softpipe->depth_stencil->stencil[0].enabled) {
+            depth_stencil_test_quad(qs, &data, quads[i]);
+            write_depth_stencil_values(&data, quads[i]);
          }
-         break;
-      case PIPE_FORMAT_Z24X8_UNORM:
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            tile->data.depth32[y][x] = bzzzz[j] << 8;
+         else {
+            if (!depth_test_quad(qs, &data, quads[i]))
+               continue;
+
+            if (qs->softpipe->depth_stencil->depth.writemask)
+               write_depth_stencil_values(&data, quads[i]);
          }
-         break;
-      default:
-         assert(0);
+
+
+         quads[pass++] = quads[i];
+      }
+
+      nr = pass;
+   }
+
+   if (qs->softpipe->active_query_count) {
+      for (i = 0; i < nr; i++) 
+         qs->softpipe->occlusion_count += mask_count[quads[i]->inout.mask];
+   }
+
+   if (nr)
+      qs->next->run(qs->next, quads, nr);
+}
+
+/* XXX: this function assumes setup function actually emits linear
+ * spans of quads.  It seems a lot more natural to do (early)
+ * depth-testing on spans rather than quads.
+ */
+static void
+depth_interp_z16_less_write(struct quad_stage *qs, 
+                            struct quad_header *quads[],
+                            unsigned nr)
+{
+   unsigned i, pass = 0;
+   const unsigned ix = quads[0]->input.x0;
+   const unsigned iy = quads[0]->input.y0;
+   const float fx = (float) ix;
+   const float fy = (float) iy;
+   const float dzdx = quads[0]->posCoef->dadx[2];
+   const float dzdy = quads[0]->posCoef->dady[2];
+   const float z0 = quads[0]->posCoef->a0[2] + dzdx * fx + dzdy * fy;
+   struct softpipe_cached_tile *tile;
+   ushort (*depth16)[TILE_SIZE];
+   ushort idepth[4], depth_step;
+   const float scale = 65535.0;
+
+   idepth[0] = (ushort)((z0) * scale);
+   idepth[1] = (ushort)((z0 + dzdx) * scale);
+   idepth[2] = (ushort)((z0 + dzdy) * scale);
+   idepth[3] = (ushort)((z0 + dzdx + dzdy) * scale);
+
+   depth_step = (ushort)(dzdx * 2 * scale);
+
+   tile = sp_get_cached_tile(qs->softpipe->zsbuf_cache, ix, iy);
+
+   depth16 = (ushort (*)[TILE_SIZE])
+      &tile->data.depth16[iy % TILE_SIZE][ix % TILE_SIZE];
+
+   for (i = 0; i < nr; i++) {
+      unsigned outmask = quads[i]->inout.mask;
+      unsigned mask = 0;
+      
+      if ((outmask & 1) && idepth[0] < depth16[0][0]) {
+         depth16[0][0] = idepth[0];
+         mask |= (1 << 0);
+      }
+
+      if ((outmask & 2) && idepth[1] < depth16[0][1]) {
+         depth16[0][1] = idepth[1];
+         mask |= (1 << 1);
+      }
+
+      if ((outmask & 4) && idepth[2] < depth16[1][0]) {
+         depth16[1][0] = idepth[2];
+         mask |= (1 << 2);
+      }
+
+      if ((outmask & 8) && idepth[3] < depth16[1][1]) {
+         depth16[1][1] = idepth[3];
+         mask |= (1 << 3);
       }
+
+      idepth[0] += depth_step;
+      idepth[1] += depth_step;
+      idepth[2] += depth_step;
+      idepth[3] += depth_step;
+
+      depth16 = (ushort (*)[TILE_SIZE]) &depth16[0][2];
+
+      quads[i]->inout.mask = mask;
+      if (quads[i]->inout.mask)
+         quads[pass++] = quads[i];
    }
+
+   if (pass)
+      qs->next->run(qs->next, quads, pass);
+
 }
 
 
 static void
-depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
+depth_interp_z16_lequal_write(struct quad_stage *qs, 
+                            struct quad_header *quads[],
+                            unsigned nr)
 {
-   sp_depth_test_quad(qs, quad);
+   unsigned i, pass = 0;
+   const unsigned ix = quads[0]->input.x0;
+   const unsigned iy = quads[0]->input.y0;
+   const float fx = (float) ix;
+   const float fy = (float) iy;
+   const float dzdx = quads[0]->posCoef->dadx[2];
+   const float dzdy = quads[0]->posCoef->dady[2];
+   const float z0 = quads[0]->posCoef->a0[2] + dzdx * fx + dzdy * fy;
+   struct softpipe_cached_tile *tile;
+   ushort (*depth16)[TILE_SIZE];
+   ushort idepth[4], depth_step;
+   const float scale = 65535.0;
+
+   idepth[0] = (ushort)((z0) * scale);
+   idepth[1] = (ushort)((z0 + dzdx) * scale);
+   idepth[2] = (ushort)((z0 + dzdy) * scale);
+   idepth[3] = (ushort)((z0 + dzdx + dzdy) * scale);
+
+   depth_step = (ushort)(dzdx * 2 * scale);
+
+   tile = sp_get_cached_tile(qs->softpipe->zsbuf_cache, ix, iy);
+
+   depth16 = (ushort (*)[TILE_SIZE])
+      &tile->data.depth16[iy % TILE_SIZE][ix % TILE_SIZE];
+
+   for (i = 0; i < nr; i++) {
+      unsigned outmask = quads[i]->inout.mask;
+      unsigned mask = 0;
+      
+      if ((outmask & 1) && idepth[0] <= depth16[0][0]) {
+         depth16[0][0] = idepth[0];
+         mask |= (1 << 0);
+      }
+
+      if ((outmask & 2) && idepth[1] <= depth16[0][1]) {
+         depth16[0][1] = idepth[1];
+         mask |= (1 << 1);
+      }
+
+      if ((outmask & 4) && idepth[2] <= depth16[1][0]) {
+         depth16[1][0] = idepth[2];
+         mask |= (1 << 2);
+      }
+
+      if ((outmask & 8) && idepth[3] <= depth16[1][1]) {
+         depth16[1][1] = idepth[3];
+         mask |= (1 << 3);
+      }
+
+      idepth[0] += depth_step;
+      idepth[1] += depth_step;
+      idepth[2] += depth_step;
+      idepth[3] += depth_step;
+
+      depth16 = (ushort (*)[TILE_SIZE]) &depth16[0][2];
+
+      quads[i]->inout.mask = mask;
+      if (quads[i]->inout.mask)
+         quads[pass++] = quads[i];
+   }
+
+   if (pass)
+      qs->next->run(qs->next, quads, pass);
 
-   if (quad->inout.mask)
-      qs->next->run(qs->next, quad);
 }
 
 
+
+
+
+static void
+depth_noop(struct quad_stage *qs, 
+           struct quad_header *quads[],
+           unsigned nr)
+{
+   qs->next->run(qs->next, quads, nr);
+}
+
+
+
+static void
+choose_depth_test(struct quad_stage *qs, 
+                  struct quad_header *quads[],
+                  unsigned nr)
+{
+   boolean interp_depth = !qs->softpipe->fs->info.writes_z;
+
+   boolean alpha = qs->softpipe->depth_stencil->alpha.enabled;
+
+   boolean depth = (qs->softpipe->framebuffer.zsbuf && 
+                    qs->softpipe->depth_stencil->depth.enabled);
+
+   unsigned depthfunc = qs->softpipe->depth_stencil->depth.func;
+
+   boolean stencil = qs->softpipe->depth_stencil->stencil[0].enabled;
+
+   boolean depthwrite = qs->softpipe->depth_stencil->depth.writemask;
+
+   boolean occlusion = qs->softpipe->active_query_count;
+
+
+   if (!alpha &&
+       !depth &&
+       !stencil) {
+      qs->run = depth_noop;
+   }
+   else if (!alpha && 
+            interp_depth && 
+            depth && 
+            depthwrite && 
+            !occlusion &&
+            !stencil) 
+   {
+      switch (depthfunc) {
+      case PIPE_FUNC_LESS:
+         switch (qs->softpipe->framebuffer.zsbuf->format) {
+         case PIPE_FORMAT_Z16_UNORM:
+            qs->run = depth_interp_z16_less_write;
+            break;
+         default:
+            qs->run = depth_test_quads_fallback;
+            break;
+         }
+         break;
+      case PIPE_FUNC_LEQUAL:
+         switch (qs->softpipe->framebuffer.zsbuf->format) {
+         case PIPE_FORMAT_Z16_UNORM:
+            qs->run = depth_interp_z16_lequal_write;
+            break;
+         default:
+            qs->run = depth_test_quads_fallback;
+            break;
+         }
+         break;
+      default:
+         qs->run = depth_test_quads_fallback;
+      }
+   }
+   else {
+      qs->run = depth_test_quads_fallback;
+   }
+
+
+   qs->run( qs, quads, nr );
+}
+
+
+
+
+
 static void depth_test_begin(struct quad_stage *qs)
 {
+   qs->run = choose_depth_test;
    qs->next->begin(qs->next);
 }
 
@@ -283,7 +964,7 @@ struct quad_stage *sp_quad_depth_test_stage( struct softpipe_context *softpipe )
 
    stage->softpipe = softpipe;
    stage->begin = depth_test_begin;
-   stage->run = depth_test_quad;
+   stage->run = choose_depth_test;
    stage->destroy = depth_test_destroy;
 
    return stage;
diff --git a/src/gallium/drivers/softpipe/sp_quad_earlyz.c b/src/gallium/drivers/softpipe/sp_quad_earlyz.c
deleted file mode 100644
index 496fd39ed1..0000000000
--- a/src/gallium/drivers/softpipe/sp_quad_earlyz.c
+++ /dev/null
@@ -1,88 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * \brief  Quad early-z testing
- */
-
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-#include "sp_quad.h"
-#include "sp_quad_pipe.h"
-
-
-/**
- * All this stage does is compute the quad's Z values (which is normally
- * done by the shading stage).
- * The next stage will do the actual depth test.
- */
-static void
-earlyz_quad(
-   struct quad_stage    *qs,
-   struct quad_header   *quad )
-{
-   const float fx = (float) quad->input.x0;
-   const float fy = (float) quad->input.y0;
-   const float dzdx = quad->posCoef->dadx[2];
-   const float dzdy = quad->posCoef->dady[2];
-   const float z0 = quad->posCoef->a0[2] + dzdx * fx + dzdy * fy;
-
-   quad->output.depth[0] = z0;
-   quad->output.depth[1] = z0 + dzdx;
-   quad->output.depth[2] = z0 + dzdy;
-   quad->output.depth[3] = z0 + dzdx + dzdy;
-
-   qs->next->run( qs->next, quad );
-}
-
-static void
-earlyz_begin(
-   struct quad_stage *qs )
-{
-   qs->next->begin( qs->next );
-}
-
-static void
-earlyz_destroy(
-   struct quad_stage *qs )
-{
-   FREE( qs );
-}
-
-struct quad_stage *
-sp_quad_earlyz_stage(
-   struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT( quad_stage );
-
-   stage->softpipe = softpipe;
-   stage->begin = earlyz_begin;
-   stage->run = earlyz_quad;
-   stage->destroy = earlyz_destroy;
-
-   return stage;
-}
diff --git a/src/gallium/drivers/softpipe/sp_quad_fs.c b/src/gallium/drivers/softpipe/sp_quad_fs.c
index 28f8d1a60e..1e7533d0f9 100644
--- a/src/gallium/drivers/softpipe/sp_quad_fs.c
+++ b/src/gallium/drivers/softpipe/sp_quad_fs.c
@@ -68,72 +68,69 @@ quad_shade_stage(struct quad_stage *qs)
 /**
  * Execute fragment shader for the four fragments in the quad.
  */
-static void
+static INLINE boolean
 shade_quad(struct quad_stage *qs, struct quad_header *quad)
 {
    struct quad_shade_stage *qss = quad_shade_stage( qs );
    struct softpipe_context *softpipe = qs->softpipe;
    struct tgsi_exec_machine *machine = qss->machine;
-   boolean z_written;
-   
-   /* Consts do not require 16 byte alignment. */
-   machine->Consts = softpipe->mapped_constants[PIPE_SHADER_FRAGMENT];
-
-   machine->InterpCoefs = quad->coef;
 
    /* run shader */
-   quad->inout.mask &= softpipe->fs->run( softpipe->fs, machine, quad );
-
-   /* store outputs */
-   z_written = FALSE;
-   {
-      const ubyte *sem_name = softpipe->fs->info.output_semantic_name;
-      const ubyte *sem_index = softpipe->fs->info.output_semantic_index;
-      const uint n = qss->stage.softpipe->fs->info.num_outputs;
-      uint i;
-      for (i = 0; i < n; i++) {
-         switch (sem_name[i]) {
-         case TGSI_SEMANTIC_COLOR:
-            {
-               uint cbuf = sem_index[i];
-               memcpy(quad->output.color[cbuf],
-                      &machine->Outputs[i].xyzw[0].f[0],
-                      sizeof(quad->output.color[0]) );
-            }
-            break;
-         case TGSI_SEMANTIC_POSITION:
-            {
-               uint j;
-               for (j = 0; j < 4; j++) {
-                  quad->output.depth[j] = machine->Outputs[0].xyzw[2].f[j];
-               }
-               z_written = TRUE;
-            }
-            break;
-         }
+   return softpipe->fs->run( softpipe->fs, machine, quad );
+}
+
+
+
+static void
+coverage_quad(struct quad_stage *qs, struct quad_header *quad)
+{
+   struct softpipe_context *softpipe = qs->softpipe;
+   uint cbuf;
+
+   /* loop over colorbuffer outputs */
+   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) {
+      float (*quadColor)[4] = quad->output.color[cbuf];
+      unsigned j;
+      for (j = 0; j < QUAD_SIZE; j++) {
+         assert(quad->input.coverage[j] >= 0.0);
+         assert(quad->input.coverage[j] <= 1.0);
+         quadColor[3][j] *= quad->input.coverage[j];
       }
    }
+}
+
 
-   if (!z_written) {
-      /* compute Z values now, as in the quad earlyz stage */
-      /* XXX we should really only do this if the earlyz stage is not used */
-      const float fx = (float) quad->input.x0;
-      const float fy = (float) quad->input.y0;
-      const float dzdx = quad->posCoef->dadx[2];
-      const float dzdy = quad->posCoef->dady[2];
-      const float z0 = quad->posCoef->a0[2] + dzdx * fx + dzdy * fy;
-
-      quad->output.depth[0] = z0;
-      quad->output.depth[1] = z0 + dzdx;
-      quad->output.depth[2] = z0 + dzdy;
-      quad->output.depth[3] = z0 + dzdx + dzdy;
-   }
 
-   /* shader may cull fragments */
-   if (quad->inout.mask) {
-      qs->next->run( qs->next, quad );
+static void
+shade_quads(struct quad_stage *qs, 
+                 struct quad_header *quads[],
+                 unsigned nr)
+{
+   struct quad_shade_stage *qss = quad_shade_stage( qs );
+   struct softpipe_context *softpipe = qs->softpipe;
+   struct tgsi_exec_machine *machine = qss->machine;
+
+   unsigned i, pass = 0;
+   
+   machine->Consts = softpipe->mapped_constants[PIPE_SHADER_FRAGMENT];
+   machine->InterpCoefs = quads[0]->coef;
+
+   for (i = 0; i < nr; i++) {
+      if (!shade_quad(qs, quads[i]))
+         continue;
+
+      if (/*do_coverage*/ 0)
+         coverage_quad( qs, quads[i] );
+
+      quads[pass++] = quads[i];
    }
+   
+   if (pass)
+      qs->next->run(qs->next, quads, pass);
 }
+   
+
+
 
 
 /**
@@ -174,7 +171,7 @@ sp_quad_shade_stage( struct softpipe_context *softpipe )
 
    qss->stage.softpipe = softpipe;
    qss->stage.begin = shade_begin;
-   qss->stage.run = shade_quad;
+   qss->stage.run = shade_quads;
    qss->stage.destroy = shade_destroy;
 
    qss->machine = tgsi_exec_machine_create();
diff --git a/src/gallium/drivers/softpipe/sp_quad_occlusion.c b/src/gallium/drivers/softpipe/sp_quad_occlusion.c
deleted file mode 100644
index dfa7ff3b1d..0000000000
--- a/src/gallium/drivers/softpipe/sp_quad_occlusion.c
+++ /dev/null
@@ -1,85 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-/**
- * \brief  Quad occlusion counter stage
- * \author  Brian Paul
- */
-
-
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_surface.h"
-#include "sp_quad_pipe.h"
-
-static unsigned count_bits( unsigned val )
-{
-   unsigned i;
-
-   for (i = 0; val ; val >>= 1)
-      i += (val & 1);
-
-   return i;
-}
-
-static void
-occlusion_count_quad(struct quad_stage *qs, struct quad_header *quad)
-{
-   struct softpipe_context *softpipe = qs->softpipe;
-
-   softpipe->occlusion_count += count_bits(quad->inout.mask);
-
-   qs->next->run(qs->next, quad);
-}
-
-
-static void occlusion_begin(struct quad_stage *qs)
-{
-   qs->next->begin(qs->next);
-}
-
-
-static void occlusion_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
-
-struct quad_stage *sp_quad_occlusion_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
-
-   stage->softpipe = softpipe;
-   stage->begin = occlusion_begin;
-   stage->run = occlusion_count_quad;
-   stage->destroy = occlusion_destroy;
-
-   return stage;
-}
diff --git a/src/gallium/drivers/softpipe/sp_quad_output.c b/src/gallium/drivers/softpipe/sp_quad_output.c
deleted file mode 100644
index 92d5f9f3c1..0000000000
--- a/src/gallium/drivers/softpipe/sp_quad_output.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "util/u_memory.h"
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_surface.h"
-#include "sp_quad_pipe.h"
-#include "sp_tile_cache.h"
-
-
-/**
- * Last step of quad processing: write quad colors to the framebuffer,
- * taking mask into account.
- */
-static void
-output_quad(struct quad_stage *qs, struct quad_header *quad)
-{
-   /* in-tile pos: */
-   const int itx = quad->input.x0 % TILE_SIZE;
-   const int ity = quad->input.y0 % TILE_SIZE;
-
-   struct softpipe_context *softpipe = qs->softpipe;
-   uint cbuf;
-
-   /* loop over colorbuffer outputs */
-   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) {
-      struct softpipe_cached_tile *tile
-         = sp_get_cached_tile(softpipe,
-                              softpipe->cbuf_cache[cbuf],
-                              quad->input.x0, quad->input.y0);
-      float (*quadColor)[4] = quad->output.color[cbuf];
-      int i, j;
-
-      /* get/swizzle dest colors */
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (quad->inout.mask & (1 << j)) {
-            int x = itx + (j & 1);
-            int y = ity + (j >> 1);
-            for (i = 0; i < 4; i++) { /* loop over color chans */
-               tile->data.color[y][x][i] = quadColor[i][j];
-            }
-            if (0) {
-               debug_printf("sp write pixel %d,%d: %g, %g, %g\n",
-                            quad->input.x0 + x,
-                            quad->input.y0 + y,
-                            quadColor[0][j],
-                            quadColor[1][j],
-                            quadColor[2][j]);
-            }
-         }
-      }
-   }
-}
-
-
-static void output_begin(struct quad_stage *qs)
-{
-   assert(qs->next == NULL);
-}
-
-
-static void output_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
-
-struct quad_stage *sp_quad_output_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
-
-   stage->softpipe = softpipe;
-   stage->begin = output_begin;
-   stage->run = output_quad;
-   stage->destroy = output_destroy;
-
-   return stage;
-}
diff --git a/src/gallium/drivers/softpipe/sp_quad_pipe.c b/src/gallium/drivers/softpipe/sp_quad_pipe.c
index b5f69b7426..1b5bab4eca 100644
--- a/src/gallium/drivers/softpipe/sp_quad_pipe.c
+++ b/src/gallium/drivers/softpipe/sp_quad_pipe.c
@@ -31,88 +31,33 @@
 #include "pipe/p_shader_tokens.h"
 
 static void
-sp_push_quad_first(
-   struct softpipe_context *sp,
-   struct quad_stage *quad,
-   uint i )
+sp_push_quad_first( struct softpipe_context *sp,
+                    struct quad_stage *quad )
 {
-   quad->next = sp->quad[i].first;
-   sp->quad[i].first = quad;
+   quad->next = sp->quad.first;
+   sp->quad.first = quad;
 }
 
-static void
-sp_build_depth_stencil(
-   struct softpipe_context *sp,
-   uint i )
-{
-   if (sp->depth_stencil->stencil[0].enabled ||
-       sp->depth_stencil->stencil[1].enabled) {
-      sp_push_quad_first( sp, sp->quad[i].stencil_test, i );
-   }
-   else if (sp->depth_stencil->depth.enabled &&
-            sp->framebuffer.zsbuf) {
-      sp_push_quad_first( sp, sp->quad[i].depth_test, i );
-   }
-}
 
 void
 sp_build_quad_pipeline(struct softpipe_context *sp)
 {
-   uint i;
-
    boolean early_depth_test =
-               sp->depth_stencil->depth.enabled &&
-               sp->framebuffer.zsbuf &&
-               !sp->depth_stencil->alpha.enabled &&
-               !sp->fs->info.uses_kill &&
-               !sp->fs->info.writes_z;
-
-   /* build up the pipeline in reverse order... */
-   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) {
-      sp->quad[i].first = sp->quad[i].output;
-
-      if (sp->blend->colormask != 0xf) {
-         sp_push_quad_first( sp, sp->quad[i].colormask, i );
-      }
+      sp->depth_stencil->depth.enabled &&
+      sp->framebuffer.zsbuf &&
+      !sp->depth_stencil->alpha.enabled &&
+      !sp->fs->info.uses_kill &&
+      !sp->fs->info.writes_z;
 
-      if (sp->blend->blend_enable ||
-          sp->blend->logicop_enable) {
-         sp_push_quad_first( sp, sp->quad[i].blend, i );
-      }
+   sp->quad.first = sp->quad.blend;
 
-      if (sp->active_query_count) {
-         sp_push_quad_first( sp, sp->quad[i].occlusion, i );
-      }
-
-      if (sp->rasterizer->poly_smooth ||
-          sp->rasterizer->line_smooth ||
-          sp->rasterizer->point_smooth) {
-         sp_push_quad_first( sp, sp->quad[i].coverage, i );
-      }
-
-      if (!early_depth_test) {
-         sp_build_depth_stencil( sp, i );
-      }
-
-      if (sp->depth_stencil->alpha.enabled) {
-         sp_push_quad_first( sp, sp->quad[i].alpha_test, i );
-      }
-
-      /* XXX always enable shader? */
-      if (1) {
-         sp_push_quad_first( sp, sp->quad[i].shade, i );
-      }
-
-      if (early_depth_test) {
-         sp_build_depth_stencil( sp, i );
-         sp_push_quad_first( sp, sp->quad[i].earlyz, i );
-      }
-
-#if !USE_DRAW_STAGE_PSTIPPLE
-      if (sp->rasterizer->poly_stipple_enable) {
-         sp_push_quad_first( sp, sp->quad[i].polygon_stipple, i );
-      }
-#endif
+   if (early_depth_test) {
+      sp_push_quad_first( sp, sp->quad.shade );
+      sp_push_quad_first( sp, sp->quad.depth_test );
+   }
+   else {
+      sp_push_quad_first( sp, sp->quad.depth_test );
+      sp_push_quad_first( sp, sp->quad.shade );
    }
 }
 
diff --git a/src/gallium/drivers/softpipe/sp_quad_pipe.h b/src/gallium/drivers/softpipe/sp_quad_pipe.h
index 0e40586ffc..c0aa134831 100644
--- a/src/gallium/drivers/softpipe/sp_quad_pipe.h
+++ b/src/gallium/drivers/softpipe/sp_quad_pipe.h
@@ -49,7 +49,7 @@ struct quad_stage {
    void (*begin)(struct quad_stage *qs);
 
    /** the stage action */
-   void (*run)(struct quad_stage *qs, struct quad_header *quad);
+   void (*run)(struct quad_stage *qs, struct quad_header *quad[], unsigned nr);
 
    void (*destroy)(struct quad_stage *qs);
 };
@@ -69,6 +69,4 @@ struct quad_stage *sp_quad_output_stage( struct softpipe_context *softpipe );
 
 void sp_build_quad_pipeline(struct softpipe_context *sp);
 
-void sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad);
-
 #endif /* SP_QUAD_PIPE_H */
diff --git a/src/gallium/drivers/softpipe/sp_quad_stencil.c b/src/gallium/drivers/softpipe/sp_quad_stencil.c
deleted file mode 100644
index 5e9d447737..0000000000
--- a/src/gallium/drivers/softpipe/sp_quad_stencil.c
+++ /dev/null
@@ -1,352 +0,0 @@
-
-/**
- * \brief Quad stencil testing
- */
-
-
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_surface.h"
-#include "sp_tile_cache.h"
-#include "sp_quad_pipe.h"
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-
-
-/** Only 8-bit stencil supported */
-#define STENCIL_MAX 0xff
-
-
-/**
- * Do the basic stencil test (compare stencil buffer values against the
- * reference value.
- *
- * \param stencilVals  the stencil values from the stencil buffer
- * \param func  the stencil func (PIPE_FUNC_x)
- * \param ref  the stencil reference value
- * \param valMask  the stencil value mask indicating which bits of the stencil
- *                 values and ref value are to be used.
- * \return mask indicating which pixels passed the stencil test
- */
-static unsigned
-do_stencil_test(const ubyte stencilVals[QUAD_SIZE], unsigned func,
-                unsigned ref, unsigned valMask)
-{
-   unsigned passMask = 0x0;
-   unsigned j;
-
-   ref &= valMask;
-
-   switch (func) {
-   case PIPE_FUNC_NEVER:
-      /* passMask = 0x0 */
-      break;
-   case PIPE_FUNC_LESS:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref < (stencilVals[j] & valMask)) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_EQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref == (stencilVals[j] & valMask)) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_LEQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref <= (stencilVals[j] & valMask)) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_GREATER:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref > (stencilVals[j] & valMask)) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_NOTEQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref != (stencilVals[j] & valMask)) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_GEQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref >= (stencilVals[j] & valMask)) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_ALWAYS:
-      passMask = MASK_ALL;
-      break;
-   default:
-      assert(0);
-   }
-
-   return passMask;
-}
-
-
-/**
- * Apply the stencil operator to stencil values.
- *
- * \param stencilVals  the stencil buffer values (read and written)
- * \param mask  indicates which pixels to update
- * \param op  the stencil operator (PIPE_STENCIL_OP_x)
- * \param ref  the stencil reference value
- * \param wrtMask  writemask controlling which bits are changed in the
- *                 stencil values
- */
-static void
-apply_stencil_op(ubyte stencilVals[QUAD_SIZE],
-                 unsigned mask, unsigned op, ubyte ref, ubyte wrtMask)
-{
-   unsigned j;
-   ubyte newstencil[QUAD_SIZE];
-
-   for (j = 0; j < QUAD_SIZE; j++) {
-      newstencil[j] = stencilVals[j];
-   }
-
-   switch (op) {
-   case PIPE_STENCIL_OP_KEEP:
-      /* no-op */
-      break;
-   case PIPE_STENCIL_OP_ZERO:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            newstencil[j] = 0;
-         }
-      }
-      break;
-   case PIPE_STENCIL_OP_REPLACE:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            newstencil[j] = ref;
-         }
-      }
-      break;
-   case PIPE_STENCIL_OP_INCR:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            if (stencilVals[j] < STENCIL_MAX) {
-               newstencil[j] = stencilVals[j] + 1;
-            }
-         }
-      }
-      break;
-   case PIPE_STENCIL_OP_DECR:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            if (stencilVals[j] > 0) {
-               newstencil[j] = stencilVals[j] - 1;
-            }
-         }
-      }
-      break;
-   case PIPE_STENCIL_OP_INCR_WRAP:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            newstencil[j] = stencilVals[j] + 1;
-         }
-      }
-      break;
-   case PIPE_STENCIL_OP_DECR_WRAP:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            newstencil[j] = stencilVals[j] - 1;
-         }
-      }
-      break;
-   case PIPE_STENCIL_OP_INVERT:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            newstencil[j] = ~stencilVals[j];
-         }
-      }
-      break;
-   default:
-      assert(0);
-   }
-
-   /*
-    * update the stencil values
-    */
-   if (wrtMask != STENCIL_MAX) {
-      /* apply bit-wise stencil buffer writemask */
-      for (j = 0; j < QUAD_SIZE; j++) {
-         stencilVals[j] = (wrtMask & newstencil[j]) | (~wrtMask & stencilVals[j]);
-      }
-   }
-   else {
-      for (j = 0; j < QUAD_SIZE; j++) {
-         stencilVals[j] = newstencil[j];
-      }
-   }
-}
-
-
-/**
- * Do stencil (and depth) testing.  Stenciling depends on the outcome of
- * depth testing.
- */
-static void
-stencil_test_quad(struct quad_stage *qs, struct quad_header *quad)
-{
-   struct softpipe_context *softpipe = qs->softpipe;
-   struct pipe_surface *ps = softpipe->framebuffer.zsbuf;
-   unsigned func, zFailOp, zPassOp, failOp;
-   ubyte ref, wrtMask, valMask;
-   ubyte stencilVals[QUAD_SIZE];
-   struct softpipe_cached_tile *tile
-      = sp_get_cached_tile(softpipe, softpipe->zsbuf_cache, quad->input.x0, quad->input.y0);
-   uint j;
-   uint face = quad->input.facing;
-
-   if (!softpipe->depth_stencil->stencil[1].enabled) {
-      /* single-sided stencil test, use front (face=0) state */
-      face = 0;
-   }
-
-   /* choose front or back face function, operator, etc */
-   /* XXX we could do these initializations once per primitive */
-   func    = softpipe->depth_stencil->stencil[face].func;
-   failOp  = softpipe->depth_stencil->stencil[face].fail_op;
-   zFailOp = softpipe->depth_stencil->stencil[face].zfail_op;
-   zPassOp = softpipe->depth_stencil->stencil[face].zpass_op;
-   ref     = softpipe->depth_stencil->stencil[face].ref_value;
-   wrtMask = softpipe->depth_stencil->stencil[face].writemask;
-   valMask = softpipe->depth_stencil->stencil[face].valuemask;
-
-   assert(ps); /* shouldn't get here if there's no stencil buffer */
-
-   /* get stencil values from cached tile */
-   switch (ps->format) {
-   case PIPE_FORMAT_S8Z24_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         stencilVals[j] = tile->data.depth32[y][x] >> 24;
-      }
-      break;
-   case PIPE_FORMAT_Z24S8_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         stencilVals[j] = tile->data.depth32[y][x] & 0xff;
-      }
-      break;
-   case PIPE_FORMAT_S8_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         stencilVals[j] = tile->data.stencil8[y][x];
-      }
-      break;
-   default:
-      assert(0);
-   }
-
-   /* do the stencil test first */
-   {
-      unsigned passMask, failMask;
-      passMask = do_stencil_test(stencilVals, func, ref, valMask);
-      failMask = quad->inout.mask & ~passMask;
-      quad->inout.mask &= passMask;
-
-      if (failOp != PIPE_STENCIL_OP_KEEP) {
-         apply_stencil_op(stencilVals, failMask, failOp, ref, wrtMask);
-      }
-   }
-
-   if (quad->inout.mask) {
-      /* now the pixels that passed the stencil test are depth tested */
-      if (softpipe->depth_stencil->depth.enabled) {
-         const unsigned origMask = quad->inout.mask;
-
-         sp_depth_test_quad(qs, quad);  /* quad->mask is updated */
-
-         /* update stencil buffer values according to z pass/fail result */
-         if (zFailOp != PIPE_STENCIL_OP_KEEP) {
-            const unsigned failMask = origMask & ~quad->inout.mask;
-            apply_stencil_op(stencilVals, failMask, zFailOp, ref, wrtMask);
-         }
-
-         if (zPassOp != PIPE_STENCIL_OP_KEEP) {
-            const unsigned passMask = origMask & quad->inout.mask;
-            apply_stencil_op(stencilVals, passMask, zPassOp, ref, wrtMask);
-         }
-      }
-      else {
-         /* no depth test, apply Zpass operator to stencil buffer values */
-         apply_stencil_op(stencilVals, quad->inout.mask, zPassOp, ref, wrtMask);
-      }
-
-   }
-
-   /* put new stencil values into cached tile */
-   switch (ps->format) {
-   case PIPE_FORMAT_S8Z24_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         uint s8z24 = tile->data.depth32[y][x];
-         s8z24 = (stencilVals[j] << 24) | (s8z24 & 0xffffff);
-         tile->data.depth32[y][x] = s8z24;
-      }
-      break;
-   case PIPE_FORMAT_Z24S8_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         uint z24s8 = tile->data.depth32[y][x];
-         z24s8 = (z24s8 & 0xffffff00) | stencilVals[j];
-         tile->data.depth32[y][x] = z24s8;
-      }
-      break;
-   case PIPE_FORMAT_S8_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         tile->data.stencil8[y][x] = stencilVals[j];
-      }
-      break;
-   default:
-      assert(0);
-   }
-
-   if (quad->inout.mask)
-      qs->next->run(qs->next, quad);
-}
-
-
-static void stencil_begin(struct quad_stage *qs)
-{
-   qs->next->begin(qs->next);
-}
-
-
-static void stencil_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
-
-struct quad_stage *sp_quad_stencil_test_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
-
-   stage->softpipe = softpipe;
-   stage->begin = stencil_begin;
-   stage->run = stencil_test_quad;
-   stage->destroy = stencil_destroy;
-
-   return stage;
-}
diff --git a/src/gallium/drivers/softpipe/sp_quad_stipple.c b/src/gallium/drivers/softpipe/sp_quad_stipple.c
index 07162db7b6..a0527a596a 100644
--- a/src/gallium/drivers/softpipe/sp_quad_stipple.c
+++ b/src/gallium/drivers/softpipe/sp_quad_stipple.c
@@ -14,14 +14,20 @@
  * Apply polygon stipple to quads produced by triangle rasterization
  */
 static void
-stipple_quad(struct quad_stage *qs, struct quad_header *quad)
+stipple_quad(struct quad_stage *qs, struct quad_header *quads[], unsigned nr)
 {
    static const uint bit31 = 1 << 31;
    static const uint bit30 = 1 << 30;
+   unsigned pass = nr;
+
+   struct softpipe_context *softpipe = qs->softpipe;
+   unsigned q;
+
+   pass = 0;
+
+   for (q = 0; q < nr; q++)  {
+      struct quad_header *quad = quads[q];
 
-   if (quad->input.prim == QUAD_PRIM_TRI) {
-      struct softpipe_context *softpipe = qs->softpipe;
-      /* need to invert Y to index into OpenGL's stipple pattern */
       const int col0 = quad->input.x0 % 32;
       const int y0 = quad->input.y0;
       const int y1 = y0 + 1;
@@ -41,13 +47,11 @@ stipple_quad(struct quad_stage *qs, struct quad_header *quad)
       if ((stipple1 & (bit30 >> col0)) == 0)
          quad->inout.mask &= ~MASK_BOTTOM_RIGHT;
 
-      if (!quad->inout.mask) {
-         /* all fragments failed stipple test, end of quad pipeline */
-         return;
-      }
+      if (quad->inout.mask)
+         quads[pass++] = quad;
    }
 
-   qs->next->run(qs->next, quad);
+   qs->next->run(qs->next, quads, pass);
 }
 
 
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index 769425bd12..81fb7aa20c 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -40,7 +40,7 @@
 static const char *
 softpipe_get_vendor(struct pipe_screen *screen)
 {
-   return "Tungsten Graphics, Inc.";
+   return "VMware, Inc.";
 }
 
 
@@ -65,8 +65,6 @@ softpipe_get_param(struct pipe_screen *screen, int param)
       return 1;
    case PIPE_CAP_GLSL:
       return 1;
-   case PIPE_CAP_S3TC:
-      return 0;
    case PIPE_CAP_ANISOTROPIC_FILTER:
       return 0;
    case PIPE_CAP_POINT_SPRITE:
@@ -137,10 +135,14 @@ softpipe_is_format_supported( struct pipe_screen *screen,
           target == PIPE_TEXTURE_CUBE);
 
    switch(format) {
+   case PIPE_FORMAT_L16_UNORM:
+   case PIPE_FORMAT_YCBCR_REV:
+   case PIPE_FORMAT_YCBCR:
    case PIPE_FORMAT_DXT1_RGB:
    case PIPE_FORMAT_DXT1_RGBA:
    case PIPE_FORMAT_DXT3_RGBA:
    case PIPE_FORMAT_DXT5_RGBA:
+   case PIPE_FORMAT_Z32_FLOAT:
       return FALSE;
    default:
       return TRUE;
diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c
index de3ae3c369..ade125662a 100644
--- a/src/gallium/drivers/softpipe/sp_setup.c
+++ b/src/gallium/drivers/softpipe/sp_setup.c
@@ -33,7 +33,6 @@
  */
 
 #include "sp_context.h"
-#include "sp_prim_setup.h"
 #include "sp_quad.h"
 #include "sp_quad_pipe.h"
 #include "sp_setup.h"
@@ -61,87 +60,9 @@ struct edge {
    int lines;		/**< number of lines on this edge */
 };
 
-#if SP_NUM_QUAD_THREADS > 1
 
-/* Set to 1 if you want other threads to be instantly
- * notified of pending jobs.
- */
-#define INSTANT_NOTEMPTY_NOTIFY 0
-
-struct thread_info
-{
-   struct setup_context *setup;
-   uint id;
-   pipe_thread handle;
-};
-
-struct quad_job;
-
-typedef void (* quad_job_routine)( struct setup_context *setup, uint thread, struct quad_job *job );
-
-struct quad_job
-{
-   struct quad_header_input input;
-   struct quad_header_inout inout;
-   quad_job_routine routine;
-};
-
-#define NUM_QUAD_JOBS 64
-
-struct quad_job_que
-{
-   struct quad_job jobs[NUM_QUAD_JOBS];
-   uint first;
-   uint last;
-   pipe_mutex que_mutex;
-   pipe_condvar que_notfull_condvar;
-   pipe_condvar que_notempty_condvar;
-   uint jobs_added;
-   uint jobs_done;
-   pipe_condvar que_done_condvar;
-};
-
-static void
-add_quad_job( struct quad_job_que *que, struct quad_header *quad, quad_job_routine routine )
-{
-#if INSTANT_NOTEMPTY_NOTIFY
-   boolean empty;
-#endif
-
-   /* Wait for empty slot, see if the que is empty.
-    */
-   pipe_mutex_lock( que->que_mutex );
-   while ((que->last + 1) % NUM_QUAD_JOBS == que->first) {
-#if !INSTANT_NOTEMPTY_NOTIFY
-      pipe_condvar_broadcast( que->que_notempty_condvar );
-#endif
-      pipe_condvar_wait( que->que_notfull_condvar, que->que_mutex );
-   }
-#if INSTANT_NOTEMPTY_NOTIFY
-   empty = que->last == que->first;
-#endif
-   que->jobs_added++;
-   pipe_mutex_unlock( que->que_mutex );
+#define MAX_QUADS 16
 
-   /* Submit new job.
-    */
-   que->jobs[que->last].input = quad->input;
-   que->jobs[que->last].inout = quad->inout;
-   que->jobs[que->last].routine = routine;
-   que->last = (que->last + 1) % NUM_QUAD_JOBS;
-
-#if INSTANT_NOTEMPTY_NOTIFY
-   /* If the que was empty, notify consumers there's a job to be done.
-    */
-   if (empty) {
-      pipe_mutex_lock( que->que_mutex );
-      pipe_condvar_broadcast( que->que_notempty_condvar );
-      pipe_mutex_unlock( que->que_mutex );
-   }
-#endif
-}
-
-#endif
 
 /**
  * Triangle setup info (derived from draw_stage).
@@ -164,22 +85,19 @@ struct setup_context {
    struct edge emaj;
 
    float oneoverarea;
+   int facing;
+
+   struct quad_header quad[MAX_QUADS];
+   struct quad_header *quad_ptrs[MAX_QUADS];
+   unsigned count;
 
    struct tgsi_interp_coef coef[PIPE_MAX_SHADER_INPUTS];
    struct tgsi_interp_coef posCoef;  /* For Z, W */
-   struct quad_header quad;
-
-#if SP_NUM_QUAD_THREADS > 1
-   struct quad_job_que que;
-   struct thread_info threads[SP_NUM_QUAD_THREADS];
-#endif
 
    struct {
       int left[2];   /**< [0] = row0, [1] = row1 */
       int right[2];
       int y;
-      unsigned y_flags;
-      unsigned mask;     /**< mask of MASK_BOTTOM/TOP_LEFT/RIGHT bits */
    } span;
 
 #if DEBUG_FRAGS
@@ -190,67 +108,6 @@ struct setup_context {
    unsigned winding;		/* which winding to cull */
 };
 
-#if SP_NUM_QUAD_THREADS > 1
-
-static PIPE_THREAD_ROUTINE( quad_thread, param )
-{
-   struct thread_info *info = (struct thread_info *) param;
-   struct quad_job_que *que = &info->setup->que;
-
-   for (;;) {
-      struct quad_job job;
-      boolean full;
-
-      /* Wait for an available job.
-       */
-      pipe_mutex_lock( que->que_mutex );
-      while (que->last == que->first)
-         pipe_condvar_wait( que->que_notempty_condvar, que->que_mutex );
-
-      /* See if the que is full.
-       */
-      full = (que->last + 1) % NUM_QUAD_JOBS == que->first;
-
-      /* Take a job and remove it from que.
-       */
-      job = que->jobs[que->first];
-      que->first = (que->first + 1) % NUM_QUAD_JOBS;
-
-      /* Notify the producer if the que is not full.
-       */
-      if (full)
-         pipe_condvar_signal( que->que_notfull_condvar );
-      pipe_mutex_unlock( que->que_mutex );
-
-      job.routine( info->setup, info->id, &job );
-
-      /* Notify the producer if that's the last finished job.
-       */
-      pipe_mutex_lock( que->que_mutex );
-      que->jobs_done++;
-      if (que->jobs_added == que->jobs_done)
-         pipe_condvar_signal( que->que_done_condvar );
-      pipe_mutex_unlock( que->que_mutex );
-   }
-
-   return NULL;
-}
-
-#define WAIT_FOR_COMPLETION(setup) \
-   do {\
-      pipe_mutex_lock( setup->que.que_mutex );\
-      if (!INSTANT_NOTEMPTY_NOTIFY)\
-         pipe_condvar_broadcast( setup->que.que_notempty_condvar );\
-      while (setup->que.jobs_added != setup->que.jobs_done)\
-         pipe_condvar_wait( setup->que.que_done_condvar, setup->que.que_mutex );\
-      pipe_mutex_unlock( setup->que.que_mutex );\
-   } while (0)
-
-#else
-
-#define WAIT_FOR_COMPLETION(setup) ((void) 0)
-
-#endif
 
 
 
@@ -313,98 +170,18 @@ quad_clip( struct setup_context *setup, struct quad_header *quad )
  * Emit a quad (pass to next stage) with clipping.
  */
 static INLINE void
-clip_emit_quad( struct setup_context *setup, struct quad_header *quad, uint thread )
+clip_emit_quad( struct setup_context *setup, struct quad_header *quad )
 {
    quad_clip( setup, quad );
+
    if (quad->inout.mask) {
       struct softpipe_context *sp = setup->softpipe;
 
-      sp->quad[thread].first->run( sp->quad[thread].first, quad );
+      sp->quad.first->run( sp->quad.first, &quad, 1 );
    }
 }
 
-#if SP_NUM_QUAD_THREADS > 1
-
-static void
-clip_emit_quad_job( struct setup_context *setup, uint thread, struct quad_job *job )
-{
-   struct quad_header quad;
-
-   quad.input = job->input;
-   quad.inout = job->inout;
-   quad.coef = setup->quad.coef;
-   quad.posCoef = setup->quad.posCoef;
-   quad.nr_attrs = setup->quad.nr_attrs;
-   clip_emit_quad( setup, &quad, thread );
-}
-
-#define CLIP_EMIT_QUAD(setup) add_quad_job( &setup->que, &setup->quad, clip_emit_quad_job )
-
-#else
-
-#define CLIP_EMIT_QUAD(setup) clip_emit_quad( setup, &setup->quad, 0 )
-
-#endif
-
-/**
- * Emit a quad (pass to next stage).  No clipping is done.
- */
-static INLINE void
-emit_quad( struct setup_context *setup, struct quad_header *quad, uint thread )
-{
-   struct softpipe_context *sp = setup->softpipe;
-#if DEBUG_FRAGS
-   uint mask = quad->inout.mask;
-#endif
-
-#if DEBUG_FRAGS
-   if (mask & 1) setup->numFragsEmitted++;
-   if (mask & 2) setup->numFragsEmitted++;
-   if (mask & 4) setup->numFragsEmitted++;
-   if (mask & 8) setup->numFragsEmitted++;
-#endif
-   sp->quad[thread].first->run( sp->quad[thread].first, quad );
-#if DEBUG_FRAGS
-   mask = quad->inout.mask;
-   if (mask & 1) setup->numFragsWritten++;
-   if (mask & 2) setup->numFragsWritten++;
-   if (mask & 4) setup->numFragsWritten++;
-   if (mask & 8) setup->numFragsWritten++;
-#endif
-}
-
-#if SP_NUM_QUAD_THREADS > 1
 
-static void
-emit_quad_job( struct setup_context *setup, uint thread, struct quad_job *job )
-{
-   struct quad_header quad;
-
-   quad.input = job->input;
-   quad.inout = job->inout;
-   quad.coef = setup->quad.coef;
-   quad.posCoef = setup->quad.posCoef;
-   quad.nr_attrs = setup->quad.nr_attrs;
-   emit_quad( setup, &quad, thread );
-}
-
-#define EMIT_QUAD(setup,x,y,mask) do {\
-      setup->quad.input.x0 = x;\
-      setup->quad.input.y0 = y;\
-      setup->quad.inout.mask = mask;\
-      add_quad_job( &setup->que, &setup->quad, emit_quad_job );\
-   } while (0)
-
-#else
-
-#define EMIT_QUAD(setup,x,y,mask) do {\
-      setup->quad.input.x0 = x;\
-      setup->quad.input.y0 = y;\
-      setup->quad.inout.mask = mask;\
-      emit_quad( setup, &setup->quad, 0 );\
-   } while (0)
-
-#endif
 
 /**
  * Given an X or Y coordinate, return the block/quad coordinate that it
@@ -412,7 +189,12 @@ emit_quad_job( struct setup_context *setup, uint thread, struct quad_job *job )
  */
 static INLINE int block( int x )
 {
-   return x & ~1;
+   return x & ~(2-1);
+}
+
+static INLINE int block_x( int x )
+{
+   return x & ~(16-1);
 }
 
 
@@ -421,72 +203,63 @@ static INLINE int block( int x )
  */
 static void flush_spans( struct setup_context *setup )
 {
+   const int step = 16;
    const int xleft0 = setup->span.left[0];
    const int xleft1 = setup->span.left[1];
    const int xright0 = setup->span.right[0];
    const int xright1 = setup->span.right[1];
-   int minleft, maxright;
+   struct quad_stage *pipe = setup->softpipe->quad.first;
+
+
+   int minleft = block_x(MIN2(xleft0, xleft1));
+   int maxright = MAX2(xright0, xright1);
    int x;
 
-   switch (setup->span.y_flags) {
-   case 0x3:
-      /* both odd and even lines written (both quad rows) */
-      minleft = block(MIN2(xleft0, xleft1));
-      maxright = block(MAX2(xright0, xright1));
-      for (x = minleft; x <= maxright; x += 2) {
-         /* determine which of the four pixels is inside the span bounds */
-         uint mask = 0x0;
-         if (x >= xleft0 && x < xright0)
-            mask |= MASK_TOP_LEFT;
-         if (x >= xleft1 && x < xright1)
-            mask |= MASK_BOTTOM_LEFT;
-         if (x+1 >= xleft0 && x+1 < xright0)
-            mask |= MASK_TOP_RIGHT;
-         if (x+1 >= xleft1 && x+1 < xright1)
-            mask |= MASK_BOTTOM_RIGHT;
-         if (mask)
-            EMIT_QUAD( setup, x, setup->span.y, mask );
-      }
-      break;
-
-   case 0x1:
-      /* only even line written (quad top row) */
-      minleft = block(xleft0);
-      maxright = block(xright0);
-      for (x = minleft; x <= maxright; x += 2) {
-         uint mask = 0x0;
-         if (x >= xleft0 && x < xright0)
-            mask |= MASK_TOP_LEFT;
-         if (x+1 >= xleft0 && x+1 < xright0)
-            mask |= MASK_TOP_RIGHT;
-         if (mask)
-            EMIT_QUAD( setup, x, setup->span.y, mask );
-      }
-      break;
-
-   case 0x2:
-      /* only odd line written (quad bottom row) */
-      minleft = block(xleft1);
-      maxright = block(xright1);
-      for (x = minleft; x <= maxright; x += 2) {
-         uint mask = 0x0;
-         if (x >= xleft1 && x < xright1)
-            mask |= MASK_BOTTOM_LEFT;
-         if (x+1 >= xleft1 && x+1 < xright1)
-            mask |= MASK_BOTTOM_RIGHT;
-         if (mask)
-            EMIT_QUAD( setup, x, setup->span.y, mask );
-      }
-      break;
+   for (x = minleft; x < maxright; x += step) {
+      unsigned skip_left0 = CLAMP(xleft0 - x, 0, step);
+      unsigned skip_left1 = CLAMP(xleft1 - x, 0, step);
+      unsigned skip_right0 = CLAMP(x + step - xright0, 0, step);
+      unsigned skip_right1 = CLAMP(x + step - xright1, 0, step);
+      unsigned lx = x;
+      unsigned q = 0;
 
-   default:
-      return;
+      unsigned skipmask_left0 = (1U << skip_left0) - 1U;
+      unsigned skipmask_left1 = (1U << skip_left1) - 1U;
+
+      /* These calculations fail when step == 32 and skip_right == 0.
+       */
+      unsigned skipmask_right0 = ~0U << (unsigned)(step - skip_right0);
+      unsigned skipmask_right1 = ~0U << (unsigned)(step - skip_right1);
+
+      unsigned mask0 = ~skipmask_left0 & ~skipmask_right0;
+      unsigned mask1 = ~skipmask_left1 & ~skipmask_right1;
+
+      if (mask0 | mask1) {
+         do {
+            unsigned quadmask = (mask0 & 3) | ((mask1 & 3) << 2);
+            if (quadmask) {
+               setup->quad[q].input.x0 = lx;
+               setup->quad[q].input.y0 = setup->span.y;
+               setup->quad[q].input.facing = setup->facing;
+               setup->quad[q].inout.mask = quadmask;
+               setup->quad_ptrs[q] = &setup->quad[q];
+               q++;
+            }
+            mask0 >>= 2;
+            mask1 >>= 2;
+            lx += 2;
+         } while (mask0 | mask1);
+
+         pipe->run( pipe, setup->quad_ptrs, q );
+      }
    }
 
+
    setup->span.y = 0;
-   setup->span.y_flags = 0;
    setup->span.right[0] = 0;
    setup->span.right[1] = 0;
+   setup->span.left[0] = 1000000;     /* greater than right[0] */
+   setup->span.left[1] = 1000000;     /* greater than right[1] */
 }
 
 
@@ -496,7 +269,7 @@ static void print_vertex(const struct setup_context *setup,
 {
    int i;
    debug_printf("   Vertex: (%p)\n", v);
-   for (i = 0; i < setup->quad.nr_attrs; i++) {
+   for (i = 0; i < setup->quad[0].nr_attrs; i++) {
       debug_printf("     %d: %f %f %f %f\n",  i,
               v[i][0], v[i][1], v[i][2], v[i][3]);
       if (util_is_inf_or_nan(v[i][0])) {
@@ -601,7 +374,9 @@ static boolean setup_sort_vertices( struct setup_context *setup,
     *  - the GLSL gl_FrontFacing fragment attribute (bool)
     *  - two-sided stencil test
     */
-   setup->quad.input.facing = (det > 0.0) ^ (setup->softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
+   setup->facing = 
+      ((det > 0.0) ^ 
+       (setup->softpipe->rasterizer->front_winding == PIPE_WINDING_CW));
 
    return TRUE;
 }
@@ -788,7 +563,7 @@ static void setup_tri_coefficients( struct setup_context *setup )
       }
 
       if (spfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
-         setup->coef[fragSlot].a0[0] = 1.0f - setup->quad.input.facing;
+         setup->coef[fragSlot].a0[0] = 1.0f - setup->facing;
          setup->coef[fragSlot].dadx[0] = 0.0;
          setup->coef[fragSlot].dady[0] = 0.0;
       }
@@ -844,11 +619,10 @@ static void subtriangle( struct setup_context *setup,
 
    /* clip top/bottom */
    start_y = sy;
-   finish_y = sy + lines;
-
    if (start_y < miny)
       start_y = miny;
 
+   finish_y = sy + lines;
    if (finish_y > maxy)
       finish_y = maxy;
 
@@ -885,7 +659,6 @@ static void subtriangle( struct setup_context *setup,
 
          setup->span.left[_y&1] = left;
          setup->span.right[_y&1] = right;
-         setup->span.y_flags |= 1<<(_y&1);
       }
    }
 
@@ -958,10 +731,9 @@ void setup_tri( struct setup_context *setup,
    setup_tri_coefficients( setup );
    setup_tri_edges( setup );
 
-   setup->quad.input.prim = QUAD_PRIM_TRI;
+   assert(setup->softpipe->reduced_prim == PIPE_PRIM_TRIANGLES);
 
    setup->span.y = 0;
-   setup->span.y_flags = 0;
    setup->span.right[0] = 0;
    setup->span.right[1] = 0;
    /*   setup->span.z_mode = tri_z_mode( setup->ctx ); */
@@ -983,8 +755,6 @@ void setup_tri( struct setup_context *setup,
 
    flush_spans( setup );
 
-   WAIT_FOR_COMPLETION(setup);
-
 #if DEBUG_FRAGS
    printf("Tri: %u frags emitted, %u written\n",
           setup->numFragsEmitted,
@@ -1101,7 +871,7 @@ setup_line_coefficients(struct setup_context *setup,
       }
 
       if (spfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
-         setup->coef[fragSlot].a0[0] = 1.0f - setup->quad.input.facing;
+         setup->coef[fragSlot].a0[0] = 1.0f - setup->facing;
          setup->coef[fragSlot].dadx[0] = 0.0;
          setup->coef[fragSlot].dady[0] = 0.0;
       }
@@ -1122,20 +892,20 @@ plot(struct setup_context *setup, int x, int y)
    const int quadY = y - iy;
    const int mask = (1 << ix) << (2 * iy);
 
-   if (quadX != setup->quad.input.x0 ||
-       quadY != setup->quad.input.y0)
+   if (quadX != setup->quad[0].input.x0 ||
+       quadY != setup->quad[0].input.y0)
    {
       /* flush prev quad, start new quad */
 
-      if (setup->quad.input.x0 != -1)
-         CLIP_EMIT_QUAD(setup);
+      if (setup->quad[0].input.x0 != -1)
+         clip_emit_quad( setup, &setup->quad[0] );
 
-      setup->quad.input.x0 = quadX;
-      setup->quad.input.y0 = quadY;
-      setup->quad.inout.mask = 0x0;
+      setup->quad[0].input.x0 = quadX;
+      setup->quad[0].input.y0 = quadY;
+      setup->quad[0].inout.mask = 0x0;
    }
 
-   setup->quad.inout.mask |= mask;
+   setup->quad[0].inout.mask |= mask;
 }
 
 
@@ -1195,17 +965,18 @@ setup_line(struct setup_context *setup,
 
    assert(dx >= 0);
    assert(dy >= 0);
+   assert(setup->softpipe->reduced_prim == PIPE_PRIM_LINES);
+
+   setup->quad[0].input.x0 = setup->quad[0].input.y0 = -1;
+   setup->quad[0].inout.mask = 0x0;
 
-   setup->quad.input.x0 = setup->quad.input.y0 = -1;
-   setup->quad.inout.mask = 0x0;
-   setup->quad.input.prim = QUAD_PRIM_LINE;
    /* XXX temporary: set coverage to 1.0 so the line appears
     * if AA mode happens to be enabled.
     */
-   setup->quad.input.coverage[0] =
-   setup->quad.input.coverage[1] =
-   setup->quad.input.coverage[2] =
-   setup->quad.input.coverage[3] = 1.0;
+   setup->quad[0].input.coverage[0] =
+   setup->quad[0].input.coverage[1] =
+   setup->quad[0].input.coverage[2] =
+   setup->quad[0].input.coverage[3] = 1.0;
 
    if (dx > dy) {
       /*** X-major line ***/
@@ -1249,11 +1020,9 @@ setup_line(struct setup_context *setup,
    }
 
    /* draw final quad */
-   if (setup->quad.inout.mask) {
-      CLIP_EMIT_QUAD(setup);
+   if (setup->quad[0].inout.mask) {
+      clip_emit_quad( setup, &setup->quad[0] );
    }
-
-   WAIT_FOR_COMPLETION(setup);
 }
 
 
@@ -1300,6 +1069,8 @@ setup_point( struct setup_context *setup,
    if (softpipe->no_rast)
       return;
 
+   assert(setup->softpipe->reduced_prim == PIPE_PRIM_POINTS);
+
    /* For points, all interpolants are constant-valued.
     * However, for point sprites, we'll need to setup texcoords appropriately.
     * XXX: which coefficients are the texcoords???
@@ -1346,22 +1117,21 @@ setup_point( struct setup_context *setup,
       }
 
       if (spfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
-         setup->coef[fragSlot].a0[0] = 1.0f - setup->quad.input.facing;
+         setup->coef[fragSlot].a0[0] = 1.0f - setup->facing;
          setup->coef[fragSlot].dadx[0] = 0.0;
          setup->coef[fragSlot].dady[0] = 0.0;
       }
    }
 
-   setup->quad.input.prim = QUAD_PRIM_POINT;
 
    if (halfSize <= 0.5 && !round) {
       /* special case for 1-pixel points */
       const int ix = ((int) x) & 1;
       const int iy = ((int) y) & 1;
-      setup->quad.input.x0 = (int) x - ix;
-      setup->quad.input.y0 = (int) y - iy;
-      setup->quad.inout.mask = (1 << ix) << (2 * iy);
-      CLIP_EMIT_QUAD(setup);
+      setup->quad[0].input.x0 = (int) x - ix;
+      setup->quad[0].input.y0 = (int) y - iy;
+      setup->quad[0].inout.mask = (1 << ix) << (2 * iy);
+      clip_emit_quad( setup, &setup->quad[0] );
    }
    else {
       if (round) {
@@ -1381,15 +1151,15 @@ setup_point( struct setup_context *setup,
             for (ix = ixmin; ix <= ixmax; ix += 2) {
                float dx, dy, dist2, cover;
 
-               setup->quad.inout.mask = 0x0;
+               setup->quad[0].inout.mask = 0x0;
 
                dx = (ix + 0.5f) - x;
                dy = (iy + 0.5f) - y;
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.input.coverage[QUAD_TOP_LEFT] = MIN2(cover, 1.0f);
-                  setup->quad.inout.mask |= MASK_TOP_LEFT;
+                  setup->quad[0].input.coverage[QUAD_TOP_LEFT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_TOP_LEFT;
                }
 
                dx = (ix + 1.5f) - x;
@@ -1397,8 +1167,8 @@ setup_point( struct setup_context *setup,
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.input.coverage[QUAD_TOP_RIGHT] = MIN2(cover, 1.0f);
-                  setup->quad.inout.mask |= MASK_TOP_RIGHT;
+                  setup->quad[0].input.coverage[QUAD_TOP_RIGHT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_TOP_RIGHT;
                }
 
                dx = (ix + 0.5f) - x;
@@ -1406,8 +1176,8 @@ setup_point( struct setup_context *setup,
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.input.coverage[QUAD_BOTTOM_LEFT] = MIN2(cover, 1.0f);
-                  setup->quad.inout.mask |= MASK_BOTTOM_LEFT;
+                  setup->quad[0].input.coverage[QUAD_BOTTOM_LEFT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_BOTTOM_LEFT;
                }
 
                dx = (ix + 1.5f) - x;
@@ -1415,14 +1185,14 @@ setup_point( struct setup_context *setup,
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.input.coverage[QUAD_BOTTOM_RIGHT] = MIN2(cover, 1.0f);
-                  setup->quad.inout.mask |= MASK_BOTTOM_RIGHT;
+                  setup->quad[0].input.coverage[QUAD_BOTTOM_RIGHT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_BOTTOM_RIGHT;
                }
 
-               if (setup->quad.inout.mask) {
-                  setup->quad.input.x0 = ix;
-                  setup->quad.input.y0 = iy;
-                  CLIP_EMIT_QUAD(setup);
+               if (setup->quad[0].inout.mask) {
+                  setup->quad[0].input.x0 = ix;
+                  setup->quad[0].input.y0 = iy;
+                  clip_emit_quad( setup, &setup->quad[0] );
                }
             }
          }
@@ -1466,33 +1236,25 @@ setup_point( struct setup_context *setup,
                   mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
                }
 
-               setup->quad.inout.mask = mask;
-               setup->quad.input.x0 = ix;
-               setup->quad.input.y0 = iy;
-               CLIP_EMIT_QUAD(setup);
+               setup->quad[0].inout.mask = mask;
+               setup->quad[0].input.x0 = ix;
+               setup->quad[0].input.y0 = iy;
+               clip_emit_quad( setup, &setup->quad[0] );
             }
          }
       }
    }
-
-   WAIT_FOR_COMPLETION(setup);
 }
 
 void setup_prepare( struct setup_context *setup )
 {
    struct softpipe_context *sp = setup->softpipe;
-   unsigned i;
 
    if (sp->dirty) {
       softpipe_update_derived(sp);
    }
 
-   /* Note: nr_attrs is only used for debugging (vertex printing) */
-   setup->quad.nr_attrs = draw_num_vs_outputs(sp->draw);
-
-   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) {
-      sp->quad[i].first->begin( sp->quad[i].first );
-   }
+   sp->quad.first->begin( sp->quad.first );
 
    if (sp->reduced_api_prim == PIPE_PRIM_TRIANGLES &&
        sp->rasterizer->fill_cw == PIPE_POLYGON_MODE_FILL &&
@@ -1520,30 +1282,17 @@ void setup_destroy_context( struct setup_context *setup )
 struct setup_context *setup_create_context( struct softpipe_context *softpipe )
 {
    struct setup_context *setup = CALLOC_STRUCT(setup_context);
-#if SP_NUM_QUAD_THREADS > 1
-   uint i;
-#endif
+   unsigned i;
 
    setup->softpipe = softpipe;
 
-   setup->quad.coef = setup->coef;
-   setup->quad.posCoef = &setup->posCoef;
-
-#if SP_NUM_QUAD_THREADS > 1
-   setup->que.first = 0;
-   setup->que.last = 0;
-   pipe_mutex_init( setup->que.que_mutex );
-   pipe_condvar_init( setup->que.que_notfull_condvar );
-   pipe_condvar_init( setup->que.que_notempty_condvar );
-   setup->que.jobs_added = 0;
-   setup->que.jobs_done = 0;
-   pipe_condvar_init( setup->que.que_done_condvar );
-   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) {
-      setup->threads[i].setup = setup;
-      setup->threads[i].id = i;
-      setup->threads[i].handle = pipe_thread_create( quad_thread, &setup->threads[i] );
+   for (i = 0; i < MAX_QUADS; i++) {
+      setup->quad[i].coef = setup->coef;
+      setup->quad[i].posCoef = &setup->posCoef;
    }
-#endif
+
+   setup->span.left[0] = 1000000;     /* greater than right[0] */
+   setup->span.left[1] = 1000000;     /* greater than right[1] */
 
    return setup;
 }
diff --git a/src/gallium/drivers/softpipe/sp_state.h b/src/gallium/drivers/softpipe/sp_state.h
index 9776e978e3..77ee3c1136 100644
--- a/src/gallium/drivers/softpipe/sp_state.h
+++ b/src/gallium/drivers/softpipe/sp_state.h
@@ -87,6 +87,7 @@ struct sp_fragment_shader {
 struct sp_vertex_shader {
    struct pipe_shader_state shader;
    struct draw_vertex_shader *draw_data;
+   int max_sampler;             /* -1 if no samplers */
 };
 
 
diff --git a/src/gallium/drivers/softpipe/sp_state_blend.c b/src/gallium/drivers/softpipe/sp_state_blend.c
index 384fe559af..efed082f82 100644
--- a/src/gallium/drivers/softpipe/sp_state_blend.c
+++ b/src/gallium/drivers/softpipe/sp_state_blend.c
@@ -45,7 +45,7 @@ void softpipe_bind_blend_state( struct pipe_context *pipe,
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
 
-   softpipe->blend = (const struct pipe_blend_state *)blend;
+   softpipe->blend = (struct pipe_blend_state *)blend;
 
    softpipe->dirty |= SP_NEW_BLEND;
 }
@@ -86,7 +86,7 @@ softpipe_bind_depth_stencil_state(struct pipe_context *pipe,
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
 
-   softpipe->depth_stencil = (const struct pipe_depth_stencil_alpha_state *)depth_stencil;
+   softpipe->depth_stencil = (struct pipe_depth_stencil_alpha_state *)depth_stencil;
 
    softpipe->dirty |= SP_NEW_DEPTH_STENCIL_ALPHA;
 }
diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c
index 75551000c9..1faeca1c2a 100644
--- a/src/gallium/drivers/softpipe/sp_state_derived.c
+++ b/src/gallium/drivers/softpipe/sp_state_derived.c
@@ -32,7 +32,10 @@
 #include "draw/draw_vertex.h"
 #include "draw/draw_private.h"
 #include "sp_context.h"
+#include "sp_screen.h"
 #include "sp_state.h"
+#include "sp_texture.h"
+#include "sp_tex_tile_cache.h"
 
 
 /**
@@ -65,24 +68,19 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
       const struct sp_fragment_shader *spfs = softpipe->fs;
       const enum interp_mode colorInterp
          = softpipe->rasterizer->flatshade ? INTERP_CONSTANT : INTERP_LINEAR;
+      struct vertex_info *vinfo_vbuf = &softpipe->vertex_info_vbuf;
+      const uint num = draw_num_vs_outputs(softpipe->draw);
       uint i;
 
-      if (softpipe->vbuf) {
-         /* if using the post-transform vertex buffer, tell draw_vbuf to
-          * simply emit the whole post-xform vertex as-is:
-          */
-         struct vertex_info *vinfo_vbuf = &softpipe->vertex_info_vbuf;
-         const uint num = draw_num_vs_outputs(softpipe->draw);
-         uint i;
-
-         /* No longer any need to try and emit draw vertex_header info.
-          */
-         vinfo_vbuf->num_attribs = 0;
-         for (i = 0; i < num; i++) {
-            draw_emit_vertex_attr(vinfo_vbuf, EMIT_4F, INTERP_PERSPECTIVE, i);
-         }
-         draw_compute_vertex_size(vinfo_vbuf);
+      /* Tell draw_vbuf to simply emit the whole post-xform vertex
+       * as-is.  No longer any need to try and emit draw vertex_header
+       * info.
+       */
+      vinfo_vbuf->num_attribs = 0;
+      for (i = 0; i < num; i++) {
+	 draw_emit_vertex_attr(vinfo_vbuf, EMIT_4F, INTERP_PERSPECTIVE, i);
       }
+      draw_compute_vertex_size(vinfo_vbuf);
 
       /*
        * Loop over fragment shader inputs, searching for the matching output
@@ -91,6 +89,23 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
       vinfo->num_attribs = 0;
       for (i = 0; i < spfs->info.num_inputs; i++) {
          int src;
+         enum interp_mode interp;
+
+         switch (spfs->info.input_interpolate[i]) {
+         case TGSI_INTERPOLATE_CONSTANT:
+            interp = INTERP_CONSTANT;
+            break;
+         case TGSI_INTERPOLATE_LINEAR:
+            interp = INTERP_LINEAR;
+            break;
+         case TGSI_INTERPOLATE_PERSPECTIVE:
+            interp = INTERP_PERSPECTIVE;
+            break;
+         default:
+            assert(0);
+            interp = INTERP_LINEAR;
+         }
+
          switch (spfs->info.input_semantic_name[i]) {
          case TGSI_SEMANTIC_POSITION:
             src = draw_find_vs_output(softpipe->draw,
@@ -106,7 +121,7 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
 
          case TGSI_SEMANTIC_FOG:
             src = draw_find_vs_output(softpipe->draw, TGSI_SEMANTIC_FOG, 0);
-            draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
+            draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src);
             break;
 
          case TGSI_SEMANTIC_GENERIC:
@@ -114,7 +129,7 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
             /* this includes texcoords and varying vars */
             src = draw_find_vs_output(softpipe->draw, TGSI_SEMANTIC_GENERIC,
                                       spfs->info.input_semantic_index[i]);
-            draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
+            draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src);
             break;
 
          default:
@@ -164,11 +179,19 @@ softpipe_get_vbuf_vertex_info(struct softpipe_context *softpipe)
 static void
 compute_cliprect(struct softpipe_context *sp)
 {
+   /* SP_NEW_FRAMEBUFFER
+    */
    uint surfWidth = sp->framebuffer.width;
    uint surfHeight = sp->framebuffer.height;
 
+   /* SP_NEW_RASTERIZER
+    */
    if (sp->rasterizer->scissor) {
-      /* clip to scissor rect */
+
+      /* SP_NEW_SCISSOR
+       *
+       * clip to scissor rect:
+       */
       sp->cliprect.minx = MAX2(sp->scissor.minx, 0);
       sp->cliprect.miny = MAX2(sp->scissor.miny, 0);
       sp->cliprect.maxx = MIN2(sp->scissor.maxx, surfWidth);
@@ -184,27 +207,63 @@ compute_cliprect(struct softpipe_context *sp)
 }
 
 
+static void
+update_tgsi_samplers( struct softpipe_context *softpipe )
+{
+   unsigned i;
+
+   softpipe_reset_sampler_varients( softpipe );
+
+   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
+      struct softpipe_tex_tile_cache *tc = softpipe->tex_cache[i];
+      if (tc->texture) {
+         struct softpipe_texture *spt = softpipe_texture(tc->texture);
+         if (spt->timestamp != tc->timestamp) {
+	    sp_tex_tile_cache_validate_texture( tc );
+            /*
+            _debug_printf("INV %d %d\n", tc->timestamp, spt->timestamp);
+            */
+            tc->timestamp = spt->timestamp;
+         }
+      }
+   }
+}
+
+
 /* Hopefully this will remain quite simple, otherwise need to pull in
  * something like the state tracker mechanism.
  */
 void softpipe_update_derived( struct softpipe_context *softpipe )
 {
+   struct softpipe_screen *sp_screen = softpipe_screen(softpipe->pipe.screen);
+
+   /* Check for updated textures.
+    */
+   if (softpipe->tex_timestamp != sp_screen->timestamp) {
+      softpipe->tex_timestamp = sp_screen->timestamp;
+      softpipe->dirty |= SP_NEW_TEXTURE;
+   }
+      
+   if (softpipe->dirty & (SP_NEW_SAMPLER |
+                          SP_NEW_TEXTURE |
+                          SP_NEW_FS | 
+                          SP_NEW_VS))
+      update_tgsi_samplers( softpipe );
+
    if (softpipe->dirty & (SP_NEW_RASTERIZER |
                           SP_NEW_FS |
                           SP_NEW_VS))
       invalidate_vertex_layout( softpipe );
 
    if (softpipe->dirty & (SP_NEW_SCISSOR |
-                          SP_NEW_DEPTH_STENCIL_ALPHA |
+                          SP_NEW_RASTERIZER |
                           SP_NEW_FRAMEBUFFER))
       compute_cliprect(softpipe);
 
    if (softpipe->dirty & (SP_NEW_BLEND |
                           SP_NEW_DEPTH_STENCIL_ALPHA |
                           SP_NEW_FRAMEBUFFER |
-                          SP_NEW_RASTERIZER |
-                          SP_NEW_FS | 
-			  SP_NEW_QUERY))
+                          SP_NEW_FS))
       sp_build_quad_pipeline(softpipe);
 
    softpipe->dirty = 0;
diff --git a/src/gallium/drivers/softpipe/sp_state_fs.c b/src/gallium/drivers/softpipe/sp_state_fs.c
index 4330c20393..256faa94b8 100644
--- a/src/gallium/drivers/softpipe/sp_state_fs.c
+++ b/src/gallium/drivers/softpipe/sp_state_fs.c
@@ -31,9 +31,8 @@
 
 #include "pipe/p_defines.h"
 #include "util/u_memory.h"
-#include "pipe/internal/p_winsys_screen.h"
-#include "pipe/p_shader_tokens.h"
 #include "draw/draw_context.h"
+#include "draw/draw_vs.h"
 #include "tgsi/tgsi_dump.h"
 #include "tgsi/tgsi_scan.h"
 #include "tgsi/tgsi_parse.h"
@@ -51,12 +50,9 @@ softpipe_create_fs_state(struct pipe_context *pipe,
       tgsi_dump(templ->tokens, 0);
 
    /* codegen */
-   state = softpipe_create_fs_llvm( softpipe, templ );
+   state = softpipe_create_fs_sse( softpipe, templ );
    if (!state) {
-      state = softpipe_create_fs_sse( softpipe, templ );
-      if (!state) {
-         state = softpipe_create_fs_exec( softpipe, templ );
-      }
+      state = softpipe_create_fs_exec( softpipe, templ );
    }
 
    assert(state);
@@ -111,6 +107,8 @@ softpipe_create_vs_state(struct pipe_context *pipe,
    if (state->draw_data == NULL) 
       goto fail;
 
+   state->max_sampler = state->draw_data->info.file_max[TGSI_FILE_SAMPLER];
+
    return state;
 
 fail:
@@ -128,7 +126,7 @@ softpipe_bind_vs_state(struct pipe_context *pipe, void *vs)
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
 
-   softpipe->vs = (const struct sp_vertex_shader *)vs;
+   softpipe->vs = (struct sp_vertex_shader *) vs;
 
    draw_bind_vertex_shader(softpipe->draw,
                            (softpipe->vs ? softpipe->vs->draw_data : NULL));
@@ -142,8 +140,7 @@ softpipe_delete_vs_state(struct pipe_context *pipe, void *vs)
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
 
-   struct sp_vertex_shader *state =
-      (struct sp_vertex_shader *)vs;
+   struct sp_vertex_shader *state = (struct sp_vertex_shader *) vs;
 
    draw_delete_vertex_shader(softpipe->draw, state->draw_data);
    FREE( state );
diff --git a/src/gallium/drivers/softpipe/sp_state_sampler.c b/src/gallium/drivers/softpipe/sp_state_sampler.c
index cb517b02e4..db0b8ab76b 100644
--- a/src/gallium/drivers/softpipe/sp_state_sampler.c
+++ b/src/gallium/drivers/softpipe/sp_state_sampler.c
@@ -32,21 +32,37 @@
 #include "util/u_memory.h"
 
 #include "draw/draw_context.h"
+#include "draw/draw_context.h"
 
 #include "sp_context.h"
-#include "sp_context.h"
 #include "sp_state.h"
 #include "sp_texture.h"
-#include "sp_tile_cache.h"
-#include "draw/draw_context.h"
+#include "sp_tex_sample.h"
+#include "sp_tex_tile_cache.h"
 
 
+struct sp_sampler {
+   struct pipe_sampler_state base;
+   struct sp_sampler_varient *varients;
+   struct sp_sampler_varient *current;
+};
+
+static struct sp_sampler *sp_sampler( struct pipe_sampler_state *sampler )
+{
+   return (struct sp_sampler *)sampler;
+}
+
 
 void *
 softpipe_create_sampler_state(struct pipe_context *pipe,
                               const struct pipe_sampler_state *sampler)
 {
-   return mem_dup(sampler, sizeof(*sampler));
+   struct sp_sampler *sp_sampler = CALLOC_STRUCT(sp_sampler);
+
+   sp_sampler->base = *sampler;
+   sp_sampler->varients = NULL;
+
+   return (void *)sp_sampler;
 }
 
 
@@ -97,7 +113,7 @@ softpipe_set_sampler_textures(struct pipe_context *pipe,
       struct pipe_texture *tex = i < num ? texture[i] : NULL;
 
       pipe_texture_reference(&softpipe->texture[i], tex);
-      sp_tile_cache_set_texture(pipe, softpipe->tex_cache[i], tex);
+      sp_tex_tile_cache_set_texture(softpipe->tex_cache[i], tex);
    }
 
    softpipe->num_textures = num;
@@ -106,10 +122,111 @@ softpipe_set_sampler_textures(struct pipe_context *pipe,
 }
 
 
+/**
+ * Find/create an sp_sampler_varient object for sampling the given texture,
+ * sampler and tex unit.
+ *
+ * Note that the tex unit is significant.  We can't re-use a sampler
+ * varient for multiple texture units because the sampler varient contains
+ * the texture object pointer.  If the texture object pointer were stored
+ * somewhere outside the sampler varient, we could re-use samplers for
+ * multiple texture units.
+ */
+static struct sp_sampler_varient *
+get_sampler_varient( unsigned unit,
+                     struct sp_sampler *sampler,
+                     struct pipe_texture *texture,
+                     unsigned processor )
+{
+   struct softpipe_texture *sp_texture = softpipe_texture(texture);
+   struct sp_sampler_varient *v = NULL;
+   union sp_sampler_key key;
+
+   /* if this fails, widen the key.unit field and update this assertion */
+   assert(PIPE_MAX_SAMPLERS <= 16);
+
+   key.bits.target = sp_texture->base.target;
+   key.bits.is_pot = sp_texture->pot;
+   key.bits.processor = processor;
+   key.bits.unit = unit;
+   key.bits.pad = 0;
+
+   if (sampler->current && 
+       key.value == sampler->current->key.value) {
+      v = sampler->current;
+   }
+
+   if (v == NULL) {
+      for (v = sampler->varients; v; v = v->next)
+         if (v->key.value == key.value)
+            break;
+
+      if (v == NULL) {
+         v = sp_create_sampler_varient( &sampler->base, key );
+         v->next = sampler->varients;
+         sampler->varients = v;
+      }
+   }
+   
+   sampler->current = v;
+   return v;
+}
+
+
+
+
+void
+softpipe_reset_sampler_varients(struct softpipe_context *softpipe)
+{
+   int i;
+
+   /* It's a bit hard to build these samplers ahead of time -- don't
+    * really know which samplers are going to be used for vertex and
+    * fragment programs.
+    */
+   for (i = 0; i <= softpipe->vs->max_sampler; i++) {
+      if (softpipe->sampler[i]) {
+         softpipe->tgsi.vert_samplers_list[i] = 
+            get_sampler_varient( i,
+                                 sp_sampler(softpipe->sampler[i]),
+                                 softpipe->texture[i],
+                                 TGSI_PROCESSOR_VERTEX );
+
+         sp_sampler_varient_bind_texture( softpipe->tgsi.vert_samplers_list[i], 
+                                          softpipe->tex_cache[i],
+                                          softpipe->texture[i] );
+      }
+   }
+
+   for (i = 0; i <= softpipe->fs->info.file_max[TGSI_FILE_SAMPLER]; i++) {
+      if (softpipe->sampler[i]) {
+         softpipe->tgsi.frag_samplers_list[i] =
+            get_sampler_varient( i,
+                                 sp_sampler(softpipe->sampler[i]),
+                                 softpipe->texture[i],
+                                 TGSI_PROCESSOR_FRAGMENT );
+
+         sp_sampler_varient_bind_texture( softpipe->tgsi.frag_samplers_list[i], 
+                                          softpipe->tex_cache[i],
+                                          softpipe->texture[i] );
+      }
+   }
+}
+
+
+
 void
 softpipe_delete_sampler_state(struct pipe_context *pipe,
                               void *sampler)
 {
+   struct sp_sampler *sp_sampler = (struct sp_sampler *)sampler;
+   struct sp_sampler_varient *v, *tmp;
+
+   for (v = sp_sampler->varients; v; v = tmp) {
+      tmp = v->next;
+      sp_sampler_varient_destroy(v);
+   }
+
    FREE( sampler );
 }
 
diff --git a/src/gallium/drivers/softpipe/sp_state_surface.c b/src/gallium/drivers/softpipe/sp_state_surface.c
index 181bff8f75..bc0e201130 100644
--- a/src/gallium/drivers/softpipe/sp_state_surface.c
+++ b/src/gallium/drivers/softpipe/sp_state_surface.c
@@ -53,7 +53,7 @@ softpipe_set_framebuffer_state(struct pipe_context *pipe,
       /* check if changing cbuf */
       if (sp->framebuffer.cbufs[i] != fb->cbufs[i]) {
          /* flush old */
-         sp_flush_tile_cache(sp, sp->cbuf_cache[i]);
+         sp_flush_tile_cache(sp->cbuf_cache[i]);
 
          /* assign new */
          pipe_surface_reference(&sp->framebuffer.cbufs[i], fb->cbufs[i]);
@@ -68,58 +68,28 @@ softpipe_set_framebuffer_state(struct pipe_context *pipe,
    /* zbuf changing? */
    if (sp->framebuffer.zsbuf != fb->zsbuf) {
       /* flush old */
-      sp_flush_tile_cache(sp, sp->zsbuf_cache);
+      sp_flush_tile_cache(sp->zsbuf_cache);
 
       /* assign new */
       pipe_surface_reference(&sp->framebuffer.zsbuf, fb->zsbuf);
 
       /* update cache */
       sp_tile_cache_set_surface(sp->zsbuf_cache, fb->zsbuf);
-   }
-
-#if 0
-   /* XXX combined depth/stencil here */
-
-   /* sbuf changing? */
-   if (sp->framebuffer.sbuf != fb->sbuf) {
-      /* flush old */
-      sp_flush_tile_cache(sp, sp->sbuf_cache_sep);
-
-      /* assign new */
-      sp->framebuffer.sbuf = fb->sbuf;
-
-      /* update cache */
-      if (fb->sbuf != fb->zbuf) {
-         /* separate stencil buf */
-         sp->sbuf_cache = sp->sbuf_cache_sep;
-         sp_tile_cache_set_surface(sp->sbuf_cache, fb->sbuf);
-      }
-      else {
-         /* combined depth/stencil */
-         sp->sbuf_cache = sp->zbuf_cache;
-         sp_tile_cache_set_surface(sp->sbuf_cache, fb->sbuf);
-      }
-   }
-#endif
 
-   /* Tell draw module how deep the Z/depth buffer is */
-   {
-      int depth_bits;
-      double mrd;
+      /* Tell draw module how deep the Z/depth buffer is */
       if (sp->framebuffer.zsbuf) {
+         int depth_bits;
+         double mrd;
          depth_bits = pf_get_component_bits(sp->framebuffer.zsbuf->format,
                                             PIPE_FORMAT_COMP_Z);
+         if (depth_bits > 16) {
+            mrd = 0.0000001;
+         }
+         else {
+            mrd = 0.00002;
+         }
+         draw_set_mrd(sp->draw, mrd);
       }
-      else {
-         depth_bits = 0;
-      }
-      if (depth_bits > 16) {
-         mrd = 0.0000001;
-      }
-      else {
-         mrd = 0.00002;
-      }
-      draw_set_mrd(sp->draw, mrd);
    }
 
    sp->framebuffer.width = fb->width;
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c
index f99a30277d..c22ee86b66 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -31,29 +31,33 @@
  *
  * Authors:
  *   Brian Paul
+ *   Keith Whitwell
  */
 
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_surface.h"
-#include "sp_texture.h"
-#include "sp_tex_sample.h"
-#include "sp_tile_cache.h"
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
+#include "pipe/p_shader_tokens.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "sp_quad.h"   /* only for #define QUAD_* tokens */
+#include "sp_tex_sample.h"
+#include "sp_tex_tile_cache.h"
 
 
 
 /*
- * Note, the FRAC macro has to work perfectly.  Otherwise you'll sometimes
- * see 1-pixel bands of improperly weighted linear-filtered textures.
+ * Return fractional part of 'f'.  Used for computing interpolation weights.
+ * Need to be careful with negative values.
+ * Note, if this function isn't perfect you'll sometimes see 1-pixel bands
+ * of improperly weighted linear-filtered textures.
  * The tests/texwrap.c demo is a good test.
- * Also note, FRAC(x) doesn't truly return the fractional part of x for x < 0.
- * Instead, if x < 0 then FRAC(x) = 1 - true_frac(x).
  */
-#define FRAC(f)  ((f) - util_ifloor(f))
+static INLINE float
+frac(float f)
+{
+   return f - util_ifloor(f);
+}
+
 
 
 /**
@@ -100,10 +104,16 @@ lerp_3d(float a, float b, float c,
 
 
 /**
- * If A is a signed integer, A % B doesn't give the right value for A < 0
- * (in terms of texture repeat).  Just casting to unsigned fixes that.
+ * Compute coord % size for repeat wrap modes.
+ * Note that if coord is a signed integer, coord % size doesn't give
+ * the right value for coord < 0 (in terms of texture repeat).  Just
+ * casting to unsigned fixes that.
  */
-#define REMAINDER(A, B) ((unsigned) (A) % (unsigned) (B))
+static INLINE int
+repeat(int coord, unsigned size)
+{
+   return (int) ((unsigned) coord % size);
+}
 
 
 /**
@@ -115,133 +125,153 @@ lerp_3d(float a, float b, float c,
  * \param icoord  returns the integer texcoords
  * \return  integer texture index
  */
-static INLINE void
-nearest_texcoord_4(unsigned wrapMode, const float s[4], unsigned size,
-                   int icoord[4])
+static void
+wrap_nearest_repeat(const float s[4], unsigned size, int icoord[4])
 {
    uint ch;
-   switch (wrapMode) {
-   case PIPE_TEX_WRAP_REPEAT:
-      /* s limited to [0,1) */
-      /* i limited to [0,size-1] */
-      for (ch = 0; ch < 4; ch++) {
-         int i = util_ifloor(s[ch] * size);
-         icoord[ch] = REMAINDER(i, size);
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP:
+   /* s limited to [0,1) */
+   /* i limited to [0,size-1] */
+   for (ch = 0; ch < 4; ch++) {
+      int i = util_ifloor(s[ch] * size);
+      icoord[ch] = repeat(i, size);
+   }
+}
+
+
+static void
+wrap_nearest_clamp(const float s[4], unsigned size, int icoord[4])
+{
+   uint ch;
+   /* s limited to [0,1] */
+   /* i limited to [0,size-1] */
+   for (ch = 0; ch < 4; ch++) {
+      if (s[ch] <= 0.0F)
+         icoord[ch] = 0;
+      else if (s[ch] >= 1.0F)
+         icoord[ch] = size - 1;
+      else
+         icoord[ch] = util_ifloor(s[ch] * size);
+   }
+}
+
+
+static void
+wrap_nearest_clamp_to_edge(const float s[4], unsigned size, int icoord[4])
+{
+   uint ch;
+   /* s limited to [min,max] */
+   /* i limited to [0, size-1] */
+   const float min = 1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   for (ch = 0; ch < 4; ch++) {
+      if (s[ch] < min)
+         icoord[ch] = 0;
+      else if (s[ch] > max)
+         icoord[ch] = size - 1;
+      else
+         icoord[ch] = util_ifloor(s[ch] * size);
+   }
+}
+
+
+static void
+wrap_nearest_clamp_to_border(const float s[4], unsigned size, int icoord[4])
+{
+   uint ch;
+   /* s limited to [min,max] */
+   /* i limited to [-1, size] */
+   const float min = -1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   for (ch = 0; ch < 4; ch++) {
+      if (s[ch] <= min)
+         icoord[ch] = -1;
+      else if (s[ch] >= max)
+         icoord[ch] = size;
+      else
+         icoord[ch] = util_ifloor(s[ch] * size);
+   }
+}
+
+
+static void
+wrap_nearest_mirror_repeat(const float s[4], unsigned size, int icoord[4])
+{
+   uint ch;
+   const float min = 1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   for (ch = 0; ch < 4; ch++) {
+      const int flr = util_ifloor(s[ch]);
+      float u;
+      if (flr & 1)
+         u = 1.0F - (s[ch] - (float) flr);
+      else
+         u = s[ch] - (float) flr;
+      if (u < min)
+         icoord[ch] = 0;
+      else if (u > max)
+         icoord[ch] = size - 1;
+      else
+         icoord[ch] = util_ifloor(u * size);
+   }
+}
+
+
+static void
+wrap_nearest_mirror_clamp(const float s[4], unsigned size, int icoord[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
       /* s limited to [0,1] */
       /* i limited to [0,size-1] */
-      for (ch = 0; ch < 4; ch++) {
-         if (s[ch] <= 0.0F)
-            icoord[ch] = 0;
-         else if (s[ch] >= 1.0F)
-            icoord[ch] = size - 1;
-         else
-            icoord[ch] = util_ifloor(s[ch] * size);
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      {
-         /* s limited to [min,max] */
-         /* i limited to [0, size-1] */
-         const float min = 1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            if (s[ch] < min)
-               icoord[ch] = 0;
-            else if (s[ch] > max)
-               icoord[ch] = size - 1;
-            else
-               icoord[ch] = util_ifloor(s[ch] * size);
-         }
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-      {
-         /* s limited to [min,max] */
-         /* i limited to [-1, size] */
-         const float min = -1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            if (s[ch] <= min)
-               icoord[ch] = -1;
-            else if (s[ch] >= max)
-               icoord[ch] = size;
-            else
-               icoord[ch] = util_ifloor(s[ch] * size);
-         }
-      }
-      return;
-   case PIPE_TEX_WRAP_MIRROR_REPEAT:
-      {
-         const float min = 1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            const int flr = util_ifloor(s[ch]);
-            float u;
-            if (flr & 1)
-               u = 1.0F - (s[ch] - (float) flr);
-            else
-               u = s[ch] - (float) flr;
-            if (u < min)
-               icoord[ch] = 0;
-            else if (u > max)
-               icoord[ch] = size - 1;
-            else
-               icoord[ch] = util_ifloor(u * size);
-         }
-      }
-      return;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         /* s limited to [0,1] */
-         /* i limited to [0,size-1] */
-         const float u = fabsf(s[ch]);
-         if (u <= 0.0F)
-            icoord[ch] = 0;
-         else if (u >= 1.0F)
-            icoord[ch] = size - 1;
-         else
-            icoord[ch] = util_ifloor(u * size);
-      }
-      return;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
-      {
-         /* s limited to [min,max] */
-         /* i limited to [0, size-1] */
-         const float min = 1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            const float u = fabsf(s[ch]);
-            if (u < min)
-               icoord[ch] = 0;
-            else if (u > max)
-               icoord[ch] = size - 1;
-            else
-               icoord[ch] = util_ifloor(u * size);
-         }
-      }
-      return;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
-      {
-         /* s limited to [min,max] */
-         /* i limited to [0, size-1] */
-         const float min = -1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            const float u = fabsf(s[ch]);
-            if (u < min)
-               icoord[ch] = -1;
-            else if (u > max)
-               icoord[ch] = size;
-            else
-               icoord[ch] = util_ifloor(u * size);
-         }
-      }
-      return;
-   default:
-      assert(0);
+      const float u = fabsf(s[ch]);
+      if (u <= 0.0F)
+         icoord[ch] = 0;
+      else if (u >= 1.0F)
+         icoord[ch] = size - 1;
+      else
+         icoord[ch] = util_ifloor(u * size);
+   }
+}
+
+
+static void
+wrap_nearest_mirror_clamp_to_edge(const float s[4], unsigned size,
+                                  int icoord[4])
+{
+   uint ch;
+   /* s limited to [min,max] */
+   /* i limited to [0, size-1] */
+   const float min = 1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   for (ch = 0; ch < 4; ch++) {
+      const float u = fabsf(s[ch]);
+      if (u < min)
+         icoord[ch] = 0;
+      else if (u > max)
+         icoord[ch] = size - 1;
+      else
+         icoord[ch] = util_ifloor(u * size);
+   }
+}
+
+
+static void
+wrap_nearest_mirror_clamp_to_border(const float s[4], unsigned size,
+                                    int icoord[4])
+{
+   uint ch;
+   /* s limited to [min,max] */
+   /* i limited to [0, size-1] */
+   const float min = -1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   for (ch = 0; ch < 4; ch++) {
+      const float u = fabsf(s[ch]);
+      if (u < min)
+         icoord[ch] = -1;
+      else if (u > max)
+         icoord[ch] = size;
+      else
+         icoord[ch] = util_ifloor(u * size);
    }
 }
 
@@ -256,125 +286,156 @@ nearest_texcoord_4(unsigned wrapMode, const float s[4], unsigned size,
  * \param w  returns blend factor/weight between texture indexes
  * \param icoord  returns the computed integer texture coords
  */
-static INLINE void
-linear_texcoord_4(unsigned wrapMode, const float s[4], unsigned size,
+static void
+wrap_linear_repeat(const float s[4], unsigned size,
+                   int icoord0[4], int icoord1[4], float w[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = s[ch] * size - 0.5F;
+      icoord0[ch] = repeat(util_ifloor(u), size);
+      icoord1[ch] = repeat(icoord0[ch] + 1, size);
+      w[ch] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_clamp(const float s[4], unsigned size,
                   int icoord0[4], int icoord1[4], float w[4])
 {
    uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = CLAMP(s[ch], 0.0F, 1.0F);
+      u = u * size - 0.5f;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      w[ch] = frac(u);
+   }
+}
 
-   switch (wrapMode) {
-   case PIPE_TEX_WRAP_REPEAT:
-      for (ch = 0; ch < 4; ch++) {
-         float u = s[ch] * size - 0.5F;
-         icoord0[ch] = REMAINDER(util_ifloor(u), size);
-         icoord1[ch] = REMAINDER(icoord0[ch] + 1, size);
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         float u = CLAMP(s[ch], 0.0F, 1.0F);
-         u = u * size - 0.5f;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      for (ch = 0; ch < 4; ch++) {
-         float u = CLAMP(s[ch], 0.0F, 1.0F);
-         u = u * size - 0.5f;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         if (icoord0[ch] < 0)
-            icoord0[ch] = 0;
-         if (icoord1[ch] >= (int) size)
-            icoord1[ch] = size - 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-      {
-         const float min = -1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            float u = CLAMP(s[ch], min, max);
-            u = u * size - 0.5f;
-            icoord0[ch] = util_ifloor(u);
-            icoord1[ch] = icoord0[ch] + 1;
-            w[ch] = FRAC(u);
-         }
-      }
-      break;;
-   case PIPE_TEX_WRAP_MIRROR_REPEAT:
-      for (ch = 0; ch < 4; ch++) {
-         const int flr = util_ifloor(s[ch]);
-         float u;
-         if (flr & 1)
-            u = 1.0F - (s[ch] - (float) flr);
-         else
-            u = s[ch] - (float) flr;
-         u = u * size - 0.5F;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         if (icoord0[ch] < 0)
-            icoord0[ch] = 0;
-         if (icoord1[ch] >= (int) size)
-            icoord1[ch] = size - 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         float u = fabsf(s[ch]);
-         if (u >= 1.0F)
-            u = (float) size;
-         else
-            u *= size;
-         u -= 0.5F;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
-      for (ch = 0; ch < 4; ch++) {
-         float u = fabsf(s[ch]);
-         if (u >= 1.0F)
-            u = (float) size;
-         else
-            u *= size;
-         u -= 0.5F;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         if (icoord0[ch] < 0)
-            icoord0[ch] = 0;
-         if (icoord1[ch] >= (int) size)
-            icoord1[ch] = size - 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
-      {
-         const float min = -1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            float u = fabsf(s[ch]);
-            if (u <= min)
-               u = min * size;
-            else if (u >= max)
-               u = max * size;
-            else
-               u *= size;
-            u -= 0.5F;
-            icoord0[ch] = util_ifloor(u);
-            icoord1[ch] = icoord0[ch] + 1;
-            w[ch] = FRAC(u);
-         }
-      }
-      break;;
-   default:
-      assert(0);
+
+static void
+wrap_linear_clamp_to_edge(const float s[4], unsigned size,
+                          int icoord0[4], int icoord1[4], float w[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = CLAMP(s[ch], 0.0F, 1.0F);
+      u = u * size - 0.5f;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      if (icoord0[ch] < 0)
+         icoord0[ch] = 0;
+      if (icoord1[ch] >= (int) size)
+         icoord1[ch] = size - 1;
+      w[ch] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_clamp_to_border(const float s[4], unsigned size,
+                            int icoord0[4], int icoord1[4], float w[4])
+{
+   const float min = -1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = CLAMP(s[ch], min, max);
+      u = u * size - 0.5f;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      w[ch] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_mirror_repeat(const float s[4], unsigned size,
+                          int icoord0[4], int icoord1[4], float w[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      const int flr = util_ifloor(s[ch]);
+      float u;
+      if (flr & 1)
+         u = 1.0F - (s[ch] - (float) flr);
+      else
+         u = s[ch] - (float) flr;
+      u = u * size - 0.5F;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      if (icoord0[ch] < 0)
+         icoord0[ch] = 0;
+      if (icoord1[ch] >= (int) size)
+         icoord1[ch] = size - 1;
+      w[ch] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_mirror_clamp(const float s[4], unsigned size,
+                         int icoord0[4], int icoord1[4], float w[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = fabsf(s[ch]);
+      if (u >= 1.0F)
+         u = (float) size;
+      else
+         u *= size;
+      u -= 0.5F;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      w[ch] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_mirror_clamp_to_edge(const float s[4], unsigned size,
+                                 int icoord0[4], int icoord1[4], float w[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = fabsf(s[ch]);
+      if (u >= 1.0F)
+         u = (float) size;
+      else
+         u *= size;
+      u -= 0.5F;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      if (icoord0[ch] < 0)
+         icoord0[ch] = 0;
+      if (icoord1[ch] >= (int) size)
+         icoord1[ch] = size - 1;
+      w[ch] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_mirror_clamp_to_border(const float s[4], unsigned size,
+                                   int icoord0[4], int icoord1[4], float w[4])
+{
+   const float min = -1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = fabsf(s[ch]);
+      if (u <= min)
+         u = min * size;
+      else if (u >= max)
+         u = max * size;
+      else
+         u *= size;
+      u -= 0.5F;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      w[ch] = frac(u);
    }
 }
 
@@ -383,27 +444,27 @@ linear_texcoord_4(unsigned wrapMode, const float s[4], unsigned size,
  * For RECT textures / unnormalized texcoords
  * Only a subset of wrap modes supported.
  */
-static INLINE void
-nearest_texcoord_unnorm_4(unsigned wrapMode, const float s[4], unsigned size,
-                          int icoord[4])
+static void
+wrap_nearest_unorm_clamp(const float s[4], unsigned size, int icoord[4])
 {
    uint ch;
-   switch (wrapMode) {
-   case PIPE_TEX_WRAP_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         int i = util_ifloor(s[ch]);
-         icoord[ch]= CLAMP(i, 0, (int) size-1);
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      /* fall-through */
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-      for (ch = 0; ch < 4; ch++) {
-         icoord[ch]= util_ifloor( CLAMP(s[ch], 0.5F, (float) size - 0.5F) );
-      }
-      return;
-   default:
-      assert(0);
+   for (ch = 0; ch < 4; ch++) {
+      int i = util_ifloor(s[ch]);
+      icoord[ch]= CLAMP(i, 0, (int) size-1);
+   }
+}
+
+
+/**
+ * Handles clamp_to_edge and clamp_to_border:
+ */
+static void
+wrap_nearest_unorm_clamp_to_border(const float s[4], unsigned size,
+                                   int icoord[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      icoord[ch]= util_ifloor( CLAMP(s[ch], 0.5F, (float) size - 0.5F) );
    }
 }
 
@@ -412,358 +473,971 @@ nearest_texcoord_unnorm_4(unsigned wrapMode, const float s[4], unsigned size,
  * For RECT textures / unnormalized texcoords.
  * Only a subset of wrap modes supported.
  */
-static INLINE void
-linear_texcoord_unnorm_4(unsigned wrapMode, const float s[4], unsigned size,
-                         int icoord0[4], int icoord1[4], float w[4])
+static void
+wrap_linear_unorm_clamp(const float s[4], unsigned size,
+                        int icoord0[4], int icoord1[4], float w[4])
 {
    uint ch;
-   switch (wrapMode) {
-   case PIPE_TEX_WRAP_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         /* Not exactly what the spec says, but it matches NVIDIA output */
-         float u = CLAMP(s[ch] - 0.5F, 0.0f, (float) size - 1.0f);
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         w[ch] = FRAC(u);
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      /* fall-through */
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-      for (ch = 0; ch < 4; ch++) {
-         float u = CLAMP(s[ch], 0.5F, (float) size - 0.5F);
-         u -= 0.5F;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         if (icoord1[ch] > (int) size - 1)
-            icoord1[ch] = size - 1;
-         w[ch] = FRAC(u);
-      }
-      break;
-   default:
-      assert(0);
+   for (ch = 0; ch < 4; ch++) {
+      /* Not exactly what the spec says, but it matches NVIDIA output */
+      float u = CLAMP(s[ch] - 0.5F, 0.0f, (float) size - 1.0f);
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      w[ch] = frac(u);
    }
 }
 
 
-static unsigned
-choose_cube_face(float rx, float ry, float rz, float *newS, float *newT)
+static void
+wrap_linear_unorm_clamp_to_border(const float s[4], unsigned size,
+                                  int icoord0[4], int icoord1[4], float w[4])
 {
-   /*
-      major axis
-      direction     target                             sc     tc    ma
-      ----------    -------------------------------    ---    ---   ---
-       +rx          TEXTURE_CUBE_MAP_POSITIVE_X_EXT    -rz    -ry   rx
-       -rx          TEXTURE_CUBE_MAP_NEGATIVE_X_EXT    +rz    -ry   rx
-       +ry          TEXTURE_CUBE_MAP_POSITIVE_Y_EXT    +rx    +rz   ry
-       -ry          TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT    +rx    -rz   ry
-       +rz          TEXTURE_CUBE_MAP_POSITIVE_Z_EXT    +rx    -ry   rz
-       -rz          TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT    -rx    -ry   rz
-   */
-   const float arx = fabsf(rx), ary = fabsf(ry), arz = fabsf(rz);
-   unsigned face;
-   float sc, tc, ma;
-
-   if (arx >= ary && arx >= arz) {
-      if (rx >= 0.0F) {
-         face = PIPE_TEX_FACE_POS_X;
-         sc = -rz;
-         tc = -ry;
-         ma = arx;
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = CLAMP(s[ch], 0.5F, (float) size - 0.5F);
+      u -= 0.5F;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      if (icoord1[ch] > (int) size - 1)
+         icoord1[ch] = size - 1;
+      w[ch] = frac(u);
+   }
+}
+
+
+
+/**
+ * Examine the quad's texture coordinates to compute the partial
+ * derivatives w.r.t X and Y, then compute lambda (level of detail).
+ */
+static float
+compute_lambda_1d(const struct sp_sampler_varient *samp,
+                  const float s[QUAD_SIZE],
+                  const float t[QUAD_SIZE],
+                  const float p[QUAD_SIZE],
+                  float lodbias)
+{
+   const struct pipe_texture *texture = samp->texture;
+   const struct pipe_sampler_state *sampler = samp->sampler;
+   float dsdx = fabsf(s[QUAD_BOTTOM_RIGHT] - s[QUAD_BOTTOM_LEFT]);
+   float dsdy = fabsf(s[QUAD_TOP_LEFT]     - s[QUAD_BOTTOM_LEFT]);
+   float rho = MAX2(dsdx, dsdy) * texture->width[0];
+   float lambda;
+
+   lambda = util_fast_log2(rho);
+   lambda += lodbias + sampler->lod_bias;
+   lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
+
+   return lambda;
+}
+
+
+static float
+compute_lambda_2d(const struct sp_sampler_varient *samp,
+                  const float s[QUAD_SIZE],
+                  const float t[QUAD_SIZE],
+                  const float p[QUAD_SIZE],
+                  float lodbias)
+{
+   const struct pipe_texture *texture = samp->texture;
+   const struct pipe_sampler_state *sampler = samp->sampler;
+   float dsdx = fabsf(s[QUAD_BOTTOM_RIGHT] - s[QUAD_BOTTOM_LEFT]);
+   float dsdy = fabsf(s[QUAD_TOP_LEFT]     - s[QUAD_BOTTOM_LEFT]);
+   float dtdx = fabsf(t[QUAD_BOTTOM_RIGHT] - t[QUAD_BOTTOM_LEFT]);
+   float dtdy = fabsf(t[QUAD_TOP_LEFT]     - t[QUAD_BOTTOM_LEFT]);
+   float maxx = MAX2(dsdx, dsdy) * texture->width[0];
+   float maxy = MAX2(dtdx, dtdy) * texture->height[0];
+   float rho  = MAX2(maxx, maxy);
+   float lambda;
+
+   lambda = util_fast_log2(rho);
+   lambda += lodbias + sampler->lod_bias;
+   lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
+
+   return lambda;
+}
+
+
+static float
+compute_lambda_3d(const struct sp_sampler_varient *samp,
+                  const float s[QUAD_SIZE],
+                  const float t[QUAD_SIZE],
+                  const float p[QUAD_SIZE],
+                  float lodbias)
+{
+   const struct pipe_texture *texture = samp->texture;
+   const struct pipe_sampler_state *sampler = samp->sampler;
+   float dsdx = fabsf(s[QUAD_BOTTOM_RIGHT] - s[QUAD_BOTTOM_LEFT]);
+   float dsdy = fabsf(s[QUAD_TOP_LEFT]     - s[QUAD_BOTTOM_LEFT]);
+   float dtdx = fabsf(t[QUAD_BOTTOM_RIGHT] - t[QUAD_BOTTOM_LEFT]);
+   float dtdy = fabsf(t[QUAD_TOP_LEFT]     - t[QUAD_BOTTOM_LEFT]);
+   float dpdx = fabsf(p[QUAD_BOTTOM_RIGHT] - p[QUAD_BOTTOM_LEFT]);
+   float dpdy = fabsf(p[QUAD_TOP_LEFT]     - p[QUAD_BOTTOM_LEFT]);
+   float maxx = MAX2(dsdx, dsdy) * texture->width[0];
+   float maxy = MAX2(dtdx, dtdy) * texture->height[0];
+   float maxz = MAX2(dpdx, dpdy) * texture->depth[0];
+   float rho, lambda;
+
+   rho = MAX2(maxx, maxy);
+   rho = MAX2(rho, maxz);
+
+   lambda = util_fast_log2(rho);
+   lambda += lodbias + sampler->lod_bias;
+   lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
+
+   return lambda;
+}
+
+
+/**
+ * Compute lambda for a vertex texture sampler.
+ * Since there aren't derivatives to use, just return the LOD bias.
+ */
+static float
+compute_lambda_vert(const struct sp_sampler_varient *samp,
+                    const float s[QUAD_SIZE],
+                    const float t[QUAD_SIZE],
+                    const float p[QUAD_SIZE],
+                    float lodbias)
+{
+   return lodbias;
+}
+
+
+
+/**
+ * Get a texel from a texture, using the texture tile cache.
+ *
+ * \param addr  the template tex address containing cube, z, face info.
+ * \param x  the x coord of texel within 2D image
+ * \param y  the y coord of texel within 2D image
+ * \param rgba  the quad to put the texel/color into
+ *
+ * XXX maybe move this into sp_tex_tile_cache.c and merge with the
+ * sp_get_cached_tile_tex() function.  Also, get 4 texels instead of 1...
+ */
+
+
+
+
+static INLINE const float *
+get_texel_2d_no_border(const struct sp_sampler_varient *samp,
+		       union tex_tile_address addr, int x, int y)
+{
+   const struct softpipe_tex_cached_tile *tile;
+
+   addr.bits.x = x / TILE_SIZE;
+   addr.bits.y = y / TILE_SIZE;
+   y %= TILE_SIZE;
+   x %= TILE_SIZE;
+
+   tile = sp_get_cached_tile_tex(samp->cache, addr);
+
+   return &tile->data.color[y][x][0];
+}
+
+
+static INLINE const float *
+get_texel_2d(const struct sp_sampler_varient *samp,
+	     union tex_tile_address addr, int x, int y)
+{
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level = addr.bits.level;
+
+   if (x < 0 || x >= (int) texture->width[level] ||
+       y < 0 || y >= (int) texture->height[level]) {
+      return samp->sampler->border_color;
+   }
+   else {
+      return get_texel_2d_no_border( samp, addr, x, y );
+   }
+}
+
+
+/* Gather a quad of adjacent texels within a tile:
+ */
+static INLINE void
+get_texel_quad_2d_no_border_single_tile(const struct sp_sampler_varient *samp,
+					union tex_tile_address addr, 
+					unsigned x, unsigned y, 
+					const float *out[4])
+{
+   const struct softpipe_tex_cached_tile *tile;
+
+   addr.bits.x = x / TILE_SIZE;
+   addr.bits.y = y / TILE_SIZE;
+   y %= TILE_SIZE;
+   x %= TILE_SIZE;
+
+   tile = sp_get_cached_tile_tex(samp->cache, addr);
+      
+   out[0] = &tile->data.color[y  ][x  ][0];
+   out[1] = &tile->data.color[y  ][x+1][0];
+   out[2] = &tile->data.color[y+1][x  ][0];
+   out[3] = &tile->data.color[y+1][x+1][0];
+}
+
+
+/* Gather a quad of potentially non-adjacent texels:
+ */
+static INLINE void
+get_texel_quad_2d_no_border(const struct sp_sampler_varient *samp,
+			    union tex_tile_address addr,
+			    int x0, int y0, 
+			    int x1, int y1,
+			    const float *out[4])
+{
+   out[0] = get_texel_2d_no_border( samp, addr, x0, y0 );
+   out[1] = get_texel_2d_no_border( samp, addr, x1, y0 );
+   out[2] = get_texel_2d_no_border( samp, addr, x0, y1 );
+   out[3] = get_texel_2d_no_border( samp, addr, x1, y1 );
+}
+
+/* Can involve a lot of unnecessary checks for border color:
+ */
+static INLINE void
+get_texel_quad_2d(const struct sp_sampler_varient *samp,
+		  union tex_tile_address addr,
+		  int x0, int y0, 
+		  int x1, int y1,
+		  const float *out[4])
+{
+   out[0] = get_texel_2d( samp, addr, x0, y0 );
+   out[1] = get_texel_2d( samp, addr, x1, y0 );
+   out[3] = get_texel_2d( samp, addr, x1, y1 );
+   out[2] = get_texel_2d( samp, addr, x0, y1 );
+}
+
+
+
+/* 3d varients:
+ */
+static INLINE const float *
+get_texel_3d_no_border(const struct sp_sampler_varient *samp,
+                       union tex_tile_address addr, int x, int y, int z)
+{
+   const struct softpipe_tex_cached_tile *tile;
+
+   addr.bits.x = x / TILE_SIZE;
+   addr.bits.y = y / TILE_SIZE;
+   addr.bits.z = z;
+   y %= TILE_SIZE;
+   x %= TILE_SIZE;
+
+   tile = sp_get_cached_tile_tex(samp->cache, addr);
+
+   return &tile->data.color[y][x][0];
+}
+
+
+static INLINE const float *
+get_texel_3d(const struct sp_sampler_varient *samp,
+	     union tex_tile_address addr, int x, int y, int z)
+{
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level = addr.bits.level;
+
+   if (x < 0 || x >= (int) texture->width[level] ||
+       y < 0 || y >= (int) texture->height[level] ||
+       z < 0 || z >= (int) texture->depth[level]) {
+      return samp->sampler->border_color;
+   }
+   else {
+      return get_texel_3d_no_border( samp, addr, x, y, z );
+   }
+}
+
+
+/**
+ * Given the logbase2 of a mipmap's base level size and a mipmap level,
+ * return the size (in texels) of that mipmap level.
+ * For example, if level[0].width = 256 then base_pot will be 8.
+ * If level = 2, then we'll return 64 (the width at level=2).
+ * Return 1 if level > base_pot.
+ */
+static INLINE unsigned
+pot_level_size(unsigned base_pot, unsigned level)
+{
+   return (base_pot >= level) ? (1 << (base_pot - level)) : 1;
+}
+
+
+/* Some image-filter fastpaths:
+ */
+static INLINE void
+img_filter_2d_linear_repeat_POT(struct tgsi_sampler *tgsi_sampler,
+                                const float s[QUAD_SIZE],
+                                const float t[QUAD_SIZE],
+                                const float p[QUAD_SIZE],
+                                float lodbias,
+                                float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   unsigned  j;
+   unsigned level = samp->level;
+   unsigned xpot = pot_level_size(samp->xpot, level);
+   unsigned ypot = pot_level_size(samp->ypot, level);
+   unsigned xmax = (xpot - 1) & (TILE_SIZE - 1); /* MIN2(TILE_SIZE, xpot) - 1; */
+   unsigned ymax = (ypot - 1) & (TILE_SIZE - 1); /* MIN2(TILE_SIZE, ypot) - 1; */
+   union tex_tile_address addr;
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      int c;
+
+      float u = s[j] * xpot - 0.5F;
+      float v = t[j] * ypot - 0.5F;
+
+      int uflr = util_ifloor(u);
+      int vflr = util_ifloor(v);
+
+      float xw = u - (float)uflr;
+      float yw = v - (float)vflr;
+
+      int x0 = uflr & (xpot - 1);
+      int y0 = vflr & (ypot - 1);
+
+      const float *tx[4];      
+      
+      /* Can we fetch all four at once:
+       */
+      if (x0 < xmax && y0 < ymax) {
+         get_texel_quad_2d_no_border_single_tile(samp, addr, x0, y0, tx);
       }
       else {
-         face = PIPE_TEX_FACE_NEG_X;
-         sc = rz;
-         tc = -ry;
-         ma = arx;
+         unsigned x1 = (x0 + 1) & (xpot - 1);
+         unsigned y1 = (y0 + 1) & (ypot - 1);
+         get_texel_quad_2d_no_border(samp, addr, x0, y0, x1, y1, tx);
+      }
+
+      /* interpolate R, G, B, A */
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = lerp_2d(xw, yw, 
+                              tx[0][c], tx[1][c], 
+                              tx[2][c], tx[3][c]);
       }
    }
-   else if (ary >= arx && ary >= arz) {
-      if (ry >= 0.0F) {
-         face = PIPE_TEX_FACE_POS_Y;
-         sc = rx;
-         tc = rz;
-         ma = ary;
+}
+
+
+static INLINE void
+img_filter_2d_nearest_repeat_POT(struct tgsi_sampler *tgsi_sampler,
+                                 const float s[QUAD_SIZE],
+                                 const float t[QUAD_SIZE],
+                                 const float p[QUAD_SIZE],
+                                 float lodbias,
+                                 float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   unsigned  j;
+   unsigned level = samp->level;
+   unsigned xpot = pot_level_size(samp->xpot, level);
+   unsigned ypot = pot_level_size(samp->ypot, level);
+   union tex_tile_address addr;
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      int c;
+
+      float u = s[j] * xpot;
+      float v = t[j] * ypot;
+
+      int uflr = util_ifloor(u);
+      int vflr = util_ifloor(v);
+
+      int x0 = uflr & (xpot - 1);
+      int y0 = vflr & (ypot - 1);
+
+      const float *out = get_texel_2d_no_border(samp, addr, x0, y0);
+
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = out[c];
       }
-      else {
-         face = PIPE_TEX_FACE_NEG_Y;
-         sc = rx;
-         tc = -rz;
-         ma = ary;
+   }
+}
+
+
+static INLINE void
+img_filter_2d_nearest_clamp_POT(struct tgsi_sampler *tgsi_sampler,
+                                const float s[QUAD_SIZE],
+                                const float t[QUAD_SIZE],
+                                const float p[QUAD_SIZE],
+                                float lodbias,
+                                float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   unsigned  j;
+   unsigned level = samp->level;
+   unsigned xpot = pot_level_size(samp->xpot, level);
+   unsigned ypot = pot_level_size(samp->ypot, level);
+   union tex_tile_address addr;
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      int c;
+
+      float u = s[j] * xpot;
+      float v = t[j] * ypot;
+
+      int x0, y0;
+      const float *out;
+
+      x0 = util_ifloor(u);
+      if (x0 < 0) 
+         x0 = 0;
+      else if (x0 > xpot - 1)
+         x0 = xpot - 1;
+
+      y0 = util_ifloor(v);
+      if (y0 < 0) 
+         y0 = 0;
+      else if (y0 > ypot - 1)
+         y0 = ypot - 1;
+      
+      out = get_texel_2d_no_border(samp, addr, x0, y0);
+
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = out[c];
       }
    }
-   else {
-      if (rz > 0.0F) {
-         face = PIPE_TEX_FACE_POS_Z;
-         sc = rx;
-         tc = -ry;
-         ma = arz;
+}
+
+
+static void
+img_filter_1d_nearest(struct tgsi_sampler *tgsi_sampler,
+                        const float s[QUAD_SIZE],
+                        const float t[QUAD_SIZE],
+                        const float p[QUAD_SIZE],
+                        float lodbias,
+                        float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level0, j;
+   int width;
+   int x[4];
+   union tex_tile_address addr;
+
+   level0 = samp->level;
+   width = texture->width[level0];
+
+   assert(width > 0);
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   samp->nearest_texcoord_s(s, width, x);
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      const float *out = get_texel_2d(samp, addr, x[j], 0);
+      int c;
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = out[c];
       }
-      else {
-         face = PIPE_TEX_FACE_NEG_Z;
-         sc = -rx;
-         tc = -ry;
-         ma = arz;
+   }
+}
+
+
+static void
+img_filter_2d_nearest(struct tgsi_sampler *tgsi_sampler,
+                      const float s[QUAD_SIZE],
+                      const float t[QUAD_SIZE],
+                      const float p[QUAD_SIZE],
+                      float lodbias,
+                      float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level0, j;
+   int width, height;
+   int x[4], y[4];
+   union tex_tile_address addr;
+
+
+   level0 = samp->level;
+   width = texture->width[level0];
+   height = texture->height[level0];
+
+   assert(width > 0);
+   assert(height > 0);
+ 
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   samp->nearest_texcoord_s(s, width, x);
+   samp->nearest_texcoord_t(t, height, y);
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      const float *out = get_texel_2d(samp, addr, x[j], y[j]);
+      int c;
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = out[c];
       }
    }
+}
 
-   *newS = ( sc / ma + 1.0F ) * 0.5F;
-   *newT = ( tc / ma + 1.0F ) * 0.5F;
 
-   return face;
+static INLINE union tex_tile_address
+face(union tex_tile_address addr, unsigned face )
+{
+   addr.bits.face = face;
+   return addr;
 }
 
 
-/**
- * Examine the quad's texture coordinates to compute the partial
- * derivatives w.r.t X and Y, then compute lambda (level of detail).
- *
- * This is only done for fragment shaders, not vertex shaders.
- */
-static float
-compute_lambda(const struct pipe_texture *tex,
-               const struct pipe_sampler_state *sampler,
-               const float s[QUAD_SIZE],
-               const float t[QUAD_SIZE],
-               const float p[QUAD_SIZE],
-               float lodbias)
+static void
+img_filter_cube_nearest(struct tgsi_sampler *tgsi_sampler,
+                        const float s[QUAD_SIZE],
+                        const float t[QUAD_SIZE],
+                        const float p[QUAD_SIZE],
+                        float lodbias,
+                        float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
-   float rho, lambda;
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   const unsigned *faces = samp->faces; /* zero when not cube-mapping */
+   unsigned level0, j;
+   int width, height;
+   int x[4], y[4];
+   union tex_tile_address addr;
 
-   assert(sampler->normalized_coords);
+   level0 = samp->level;
+   width = texture->width[level0];
+   height = texture->height[level0];
 
-   assert(s);
-   {
-      float dsdx = s[QUAD_BOTTOM_RIGHT] - s[QUAD_BOTTOM_LEFT];
-      float dsdy = s[QUAD_TOP_LEFT]     - s[QUAD_BOTTOM_LEFT];
-      dsdx = fabsf(dsdx);
-      dsdy = fabsf(dsdy);
-      rho = MAX2(dsdx, dsdy) * tex->width[0];
-   }
-   if (t) {
-      float dtdx = t[QUAD_BOTTOM_RIGHT] - t[QUAD_BOTTOM_LEFT];
-      float dtdy = t[QUAD_TOP_LEFT]     - t[QUAD_BOTTOM_LEFT];
-      float max;
-      dtdx = fabsf(dtdx);
-      dtdy = fabsf(dtdy);
-      max = MAX2(dtdx, dtdy) * tex->height[0];
-      rho = MAX2(rho, max);
+   assert(width > 0);
+   assert(height > 0);
+ 
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   samp->nearest_texcoord_s(s, width, x);
+   samp->nearest_texcoord_t(t, height, y);
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      const float *out = get_texel_2d(samp, face(addr, faces[j]), x[j], y[j]);
+      int c;
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = out[c];
+      }
    }
-   if (p) {
-      float dpdx = p[QUAD_BOTTOM_RIGHT] - p[QUAD_BOTTOM_LEFT];
-      float dpdy = p[QUAD_TOP_LEFT]     - p[QUAD_BOTTOM_LEFT];
-      float max;
-      dpdx = fabsf(dpdx);
-      dpdy = fabsf(dpdy);
-      max = MAX2(dpdx, dpdy) * tex->depth[0];
-      rho = MAX2(rho, max);
+}
+
+
+static void
+img_filter_3d_nearest(struct tgsi_sampler *tgsi_sampler,
+                      const float s[QUAD_SIZE],
+                      const float t[QUAD_SIZE],
+                      const float p[QUAD_SIZE],
+                      float lodbias,
+                      float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level0, j;
+   int width, height, depth;
+   int x[4], y[4], z[4];
+   union tex_tile_address addr;
+
+   level0 = samp->level;
+   width = texture->width[level0];
+   height = texture->height[level0];
+   depth = texture->depth[level0];
+
+   assert(width > 0);
+   assert(height > 0);
+   assert(depth > 0);
+
+   samp->nearest_texcoord_s(s, width,  x);
+   samp->nearest_texcoord_t(t, height, y);
+   samp->nearest_texcoord_p(p, depth,  z);
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      const float *out = get_texel_3d(samp, addr, x[j], y[j], z[j]);
+      int c;
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = out[c];
+      }      
    }
+}
 
-   lambda = util_fast_log2(rho);
-   lambda += lodbias + sampler->lod_bias;
-   lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
 
-   return lambda;
+static void
+img_filter_1d_linear(struct tgsi_sampler *tgsi_sampler,
+                     const float s[QUAD_SIZE],
+                     const float t[QUAD_SIZE],
+                     const float p[QUAD_SIZE],
+                     float lodbias,
+                     float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level0, j;
+   int width;
+   int x0[4], x1[4];
+   float xw[4]; /* weights */
+   union tex_tile_address addr;
+
+   level0 = samp->level;
+   width = texture->width[level0];
+
+   assert(width > 0);
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   samp->linear_texcoord_s(s, width, x0, x1, xw);
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      const float *tx0 = get_texel_2d(samp, addr, x0[j], 0);
+      const float *tx1 = get_texel_2d(samp, addr, x1[j], 0);
+      int c;
+
+      /* interpolate R, G, B, A */
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = lerp(xw[j], tx0[c], tx1[c]);
+      }
+   }
 }
 
 
-/**
- * Do several things here:
- * 1. Compute lambda from the texcoords, if needed
- * 2. Determine if we're minifying or magnifying
- * 3. If minifying, choose mipmap levels
- * 4. Return image filter to use within mipmap images
- * \param level0  Returns first mipmap level to sample from
- * \param level1  Returns second mipmap level to sample from
- * \param levelBlend  Returns blend factor between levels, in [0,1]
- * \param imgFilter  Returns either the min or mag filter, depending on lambda
- */
 static void
-choose_mipmap_levels(const struct pipe_texture *texture,
-                     const struct pipe_sampler_state *sampler,
+img_filter_2d_linear(struct tgsi_sampler *tgsi_sampler,
                      const float s[QUAD_SIZE],
                      const float t[QUAD_SIZE],
                      const float p[QUAD_SIZE],
-                     boolean computeLambda,
                      float lodbias,
-                     unsigned *level0, unsigned *level1, float *levelBlend,
-                     unsigned *imgFilter)
-{
-   if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) {
-      /* no mipmap selection needed */
-      *level0 = *level1 = CLAMP((int) sampler->min_lod,
-                                0, (int) texture->last_level);
-
-      if (sampler->min_img_filter != sampler->mag_img_filter) {
-         /* non-mipmapped texture, but still need to determine if doing
-          * minification or magnification.
-          */
-         float lambda = compute_lambda(texture, sampler, s, t, p, lodbias);
-         if (lambda <= 0.0) {
-            *imgFilter = sampler->mag_img_filter;
-         }
-         else {
-            *imgFilter = sampler->min_img_filter;
-         }
+                     float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level0, j;
+   int width, height;
+   int x0[4], y0[4], x1[4], y1[4];
+   float xw[4], yw[4]; /* weights */
+   union tex_tile_address addr;
+
+   level0 = samp->level;
+   width = texture->width[level0];
+   height = texture->height[level0];
+
+   assert(width > 0);
+   assert(height > 0);
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   samp->linear_texcoord_s(s, width,  x0, x1, xw);
+   samp->linear_texcoord_t(t, height, y0, y1, yw);
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      const float *tx0 = get_texel_2d(samp, addr, x0[j], y0[j]);
+      const float *tx1 = get_texel_2d(samp, addr, x1[j], y0[j]);
+      const float *tx2 = get_texel_2d(samp, addr, x0[j], y1[j]);
+      const float *tx3 = get_texel_2d(samp, addr, x1[j], y1[j]);
+      int c;
+
+      /* interpolate R, G, B, A */
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = lerp_2d(xw[j], yw[j],
+                              tx0[c], tx1[c],
+                              tx2[c], tx3[c]);
       }
-      else {
-         *imgFilter = sampler->mag_img_filter;
+   }
+}
+
+
+static void
+img_filter_cube_linear(struct tgsi_sampler *tgsi_sampler,
+                       const float s[QUAD_SIZE],
+                       const float t[QUAD_SIZE],
+                       const float p[QUAD_SIZE],
+                       float lodbias,
+                       float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   const unsigned *faces = samp->faces; /* zero when not cube-mapping */
+   unsigned level0, j;
+   int width, height;
+   int x0[4], y0[4], x1[4], y1[4];
+   float xw[4], yw[4]; /* weights */
+   union tex_tile_address addr;
+
+   level0 = samp->level;
+   width = texture->width[level0];
+   height = texture->height[level0];
+
+   assert(width > 0);
+   assert(height > 0);
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   samp->linear_texcoord_s(s, width,  x0, x1, xw);
+   samp->linear_texcoord_t(t, height, y0, y1, yw);
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      union tex_tile_address addrj = face(addr, faces[j]);
+      const float *tx0 = get_texel_2d(samp, addrj, x0[j], y0[j]);
+      const float *tx1 = get_texel_2d(samp, addrj, x1[j], y0[j]);
+      const float *tx2 = get_texel_2d(samp, addrj, x0[j], y1[j]);
+      const float *tx3 = get_texel_2d(samp, addrj, x1[j], y1[j]);
+      int c;
+
+      /* interpolate R, G, B, A */
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = lerp_2d(xw[j], yw[j],
+                              tx0[c], tx1[c],
+                              tx2[c], tx3[c]);
       }
    }
-   else {
-      float lambda;
+}
 
-      if (computeLambda)
-         /* fragment shader */
-         lambda = compute_lambda(texture, sampler, s, t, p, lodbias);
-      else
-         /* vertex shader */
-         lambda = lodbias; /* not really a bias, but absolute LOD */
 
-      if (lambda <= 0.0) { /* XXX threshold depends on the filter */
-         /* magnifying */
-         *imgFilter = sampler->mag_img_filter;
-         *level0 = *level1 = 0;
+static void
+img_filter_3d_linear(struct tgsi_sampler *tgsi_sampler,
+                     const float s[QUAD_SIZE],
+                     const float t[QUAD_SIZE],
+                     const float p[QUAD_SIZE],
+                     float lodbias,
+                     float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level0, j;
+   int width, height, depth;
+   int x0[4], x1[4], y0[4], y1[4], z0[4], z1[4];
+   float xw[4], yw[4], zw[4]; /* interpolation weights */
+   union tex_tile_address addr;
+
+   level0 = samp->level;
+   width = texture->width[level0];
+   height = texture->height[level0];
+   depth = texture->depth[level0];
+
+   addr.value = 0;
+   addr.bits.level = level0;
+
+   assert(width > 0);
+   assert(height > 0);
+   assert(depth > 0);
+
+   samp->linear_texcoord_s(s, width,  x0, x1, xw);
+   samp->linear_texcoord_t(t, height, y0, y1, yw);
+   samp->linear_texcoord_p(p, depth,  z0, z1, zw);
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      int c;
+
+      const float *tx00 = get_texel_3d(samp, addr, x0[j], y0[j], z0[j]);
+      const float *tx01 = get_texel_3d(samp, addr, x1[j], y0[j], z0[j]);
+      const float *tx02 = get_texel_3d(samp, addr, x0[j], y1[j], z0[j]);
+      const float *tx03 = get_texel_3d(samp, addr, x1[j], y1[j], z0[j]);
+      
+      const float *tx10 = get_texel_3d(samp, addr, x0[j], y0[j], z1[j]);
+      const float *tx11 = get_texel_3d(samp, addr, x1[j], y0[j], z1[j]);
+      const float *tx12 = get_texel_3d(samp, addr, x0[j], y1[j], z1[j]);
+      const float *tx13 = get_texel_3d(samp, addr, x1[j], y1[j], z1[j]);
+      
+      /* interpolate R, G, B, A */
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = lerp_3d(xw[j], yw[j], zw[j],
+                              tx00[c], tx01[c],
+                              tx02[c], tx03[c],
+                              tx10[c], tx11[c],
+                              tx12[c], tx13[c]);
       }
-      else {
-         /* minifying */
-         *imgFilter = sampler->min_img_filter;
-
-         /* choose mipmap level(s) and compute the blend factor between them */
-         if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
-            /* Nearest mipmap level */
-            const int lvl = (int) (lambda + 0.5);
-            *level0 =
-            *level1 = CLAMP(lvl, 0, (int) texture->last_level);
-         }
-         else {
-            /* Linear interpolation between mipmap levels */
-            const int lvl = (int) lambda;
-            *level0 = CLAMP(lvl,     0, (int) texture->last_level);
-            *level1 = CLAMP(lvl + 1, 0, (int) texture->last_level);
-            *levelBlend = FRAC(lambda);  /* blending weight between levels */
+   }
+}
+
+
+static void
+mip_filter_linear(struct tgsi_sampler *tgsi_sampler,
+                  const float s[QUAD_SIZE],
+                  const float t[QUAD_SIZE],
+                  const float p[QUAD_SIZE],
+                  float lodbias,
+                  float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   int level0;
+   float lambda;
+
+   lambda = samp->compute_lambda(samp, s, t, p, lodbias);
+   level0 = (int)lambda;
+
+   if (lambda < 0.0) { 
+      samp->level = 0;
+      samp->mag_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+   }
+   else if (level0 >= texture->last_level) {
+      samp->level = texture->last_level;
+      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+   }
+   else {
+      float levelBlend = lambda - level0;
+      float rgba0[4][4];
+      float rgba1[4][4];
+      int c,j;
+
+      samp->level = level0;
+      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba0 );
+
+      samp->level = level0+1;
+      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba1 );
+
+      for (j = 0; j < QUAD_SIZE; j++) {
+         for (c = 0; c < 4; c++) {
+            rgba[c][j] = lerp(levelBlend, rgba0[c][j], rgba1[c][j]);
          }
       }
    }
 }
 
 
-/**
- * Get a texel from a texture, using the texture tile cache.
- *
- * \param face  the cube face in 0..5
- * \param level  the mipmap level
- * \param x  the x coord of texel within 2D image
- * \param y  the y coord of texel within 2D image
- * \param z  which slice of a 3D texture
- * \param rgba  the quad to put the texel/color into
- * \param j  which element of the rgba quad to write to
- *
- * XXX maybe move this into sp_tile_cache.c and merge with the
- * sp_get_cached_tile_tex() function.  Also, get 4 texels instead of 1...
- */
 static void
-get_texel(const struct tgsi_sampler *tgsi_sampler,
-          unsigned face, unsigned level, int x, int y, int z,
-          float rgba[NUM_CHANNELS][QUAD_SIZE], unsigned j)
+mip_filter_nearest(struct tgsi_sampler *tgsi_sampler,
+                   const float s[QUAD_SIZE],
+                   const float t[QUAD_SIZE],
+                   const float p[QUAD_SIZE],
+                   float lodbias,
+                   float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
-   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler);
-   struct softpipe_context *sp = samp->sp;
-   const uint unit = samp->unit;
-   const struct pipe_texture *texture = sp->texture[unit];
-   const struct pipe_sampler_state *sampler = sp->sampler[unit];
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   float lambda;
 
-   if (x < 0 || x >= (int) texture->width[level] ||
-       y < 0 || y >= (int) texture->height[level] ||
-       z < 0 || z >= (int) texture->depth[level]) {
-      rgba[0][j] = sampler->border_color[0];
-      rgba[1][j] = sampler->border_color[1];
-      rgba[2][j] = sampler->border_color[2];
-      rgba[3][j] = sampler->border_color[3];
+   lambda = samp->compute_lambda(samp, s, t, p, lodbias);
+
+   if (lambda < 0.0) { 
+      samp->level = 0;
+      samp->mag_img_filter( tgsi_sampler, s, t, p, 0, rgba );
    }
    else {
-      const int tx = x % TILE_SIZE;
-      const int ty = y % TILE_SIZE;
-      const struct softpipe_cached_tile *tile
-         = sp_get_cached_tile_tex(sp, samp->cache,
-                                  x, y, z, face, level);
-      rgba[0][j] = tile->data.color[ty][tx][0];
-      rgba[1][j] = tile->data.color[ty][tx][1];
-      rgba[2][j] = tile->data.color[ty][tx][2];
-      rgba[3][j] = tile->data.color[ty][tx][3];
-      if (0)
-      {
-         debug_printf("Get texel %f %f %f %f from %s\n",
-                      rgba[0][j], rgba[1][j], rgba[2][j], rgba[3][j],
-                      pf_name(texture->format));
-      }
+      samp->level = (int)(lambda + 0.5) ;
+      samp->level = MIN2(samp->level, (int)texture->last_level);
+      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+   }
+
+#if 0
+   printf("RGBA %g %g %g %g, %g %g %g %g, %g %g %g %g, %g %g %g %g\n",
+          rgba[0][0], rgba[1][0], rgba[2][0], rgba[3][0],
+          rgba[0][1], rgba[1][1], rgba[2][1], rgba[3][1],
+          rgba[0][2], rgba[1][2], rgba[2][2], rgba[3][2],
+          rgba[0][3], rgba[1][3], rgba[2][3], rgba[3][3]);
+#endif
+}
+
+
+static void
+mip_filter_none(struct tgsi_sampler *tgsi_sampler,
+                const float s[QUAD_SIZE],
+                const float t[QUAD_SIZE],
+                const float p[QUAD_SIZE],
+                float lodbias,
+                float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   float lambda = samp->compute_lambda(samp, s, t, p, lodbias);
+
+   if (lambda < 0.0) { 
+      samp->mag_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+   }
+   else {
+      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba );
    }
 }
 
 
+
 /**
- * Compare texcoord 'p' (aka R) against texture value 'rgba[0]'
- * When we sampled the depth texture, the depth value was put into all
- * RGBA channels.  We look at the red channel here.
- * \param rgba  quad of (depth) texel values
- * \param p  texture 'P' components for four pixels in quad
- * \param j  which pixel in the quad to test [0..3]
+ * Specialized version of mip_filter_linear with hard-wired calls to
+ * 2d lambda calculation and 2d_linear_repeat_POT img filters.
  */
-static INLINE void
-shadow_compare(const struct pipe_sampler_state *sampler,
-               float rgba[NUM_CHANNELS][QUAD_SIZE],
-               const float p[QUAD_SIZE],
-               uint j)
+static void
+mip_filter_linear_2d_linear_repeat_POT(
+   struct tgsi_sampler *tgsi_sampler,
+   const float s[QUAD_SIZE],
+   const float t[QUAD_SIZE],
+   const float p[QUAD_SIZE],
+   float lodbias,
+   float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
-   int k;
-   switch (sampler->compare_func) {
-   case PIPE_FUNC_LESS:
-      k = p[j] < rgba[0][j];
-      break;
-   case PIPE_FUNC_LEQUAL:
-      k = p[j] <= rgba[0][j];
-      break;
-   case PIPE_FUNC_GREATER:
-      k = p[j] > rgba[0][j];
-      break;
-   case PIPE_FUNC_GEQUAL:
-      k = p[j] >= rgba[0][j];
-      break;
-   case PIPE_FUNC_EQUAL:
-      k = p[j] == rgba[0][j];
-      break;
-   case PIPE_FUNC_NOTEQUAL:
-      k = p[j] != rgba[0][j];
-      break;
-   case PIPE_FUNC_ALWAYS:
-      k = 1;
-      break;
-   case PIPE_FUNC_NEVER:
-      k = 0;
-      break;
-   default:
-      k = 0;
-      assert(0);
-      break;
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   int level0;
+   float lambda;
+
+   lambda = compute_lambda_2d(samp, s, t, p, lodbias);
+   level0 = (int)lambda;
+
+   /* Catches both negative and large values of level0:
+    */
+   if ((unsigned)level0 >= texture->last_level) { 
+      if (level0 < 0)
+         samp->level = 0;
+      else
+         samp->level = texture->last_level;
+
+      img_filter_2d_linear_repeat_POT( tgsi_sampler, s, t, p, 0, rgba );
    }
+   else {
+      float levelBlend = lambda - level0;
+      float rgba0[4][4];
+      float rgba1[4][4];
+      int c,j;
 
-   /* XXX returning result for default GL_DEPTH_TEXTURE_MODE = GL_LUMINANCE */
-   rgba[0][j] = rgba[1][j] = rgba[2][j] = (float) k;
-   rgba[3][j] = 1.0F;
+      samp->level = level0;
+      img_filter_2d_linear_repeat_POT( tgsi_sampler, s, t, p, 0, rgba0 );
+
+      samp->level = level0+1;
+      img_filter_2d_linear_repeat_POT( tgsi_sampler, s, t, p, 0, rgba1 );
+
+      for (j = 0; j < QUAD_SIZE; j++) {
+         for (c = 0; c < 4; c++) {
+            rgba[c][j] = lerp(levelBlend, rgba0[c][j], rgba1[c][j]);
+         }
+      }
+   }
 }
 
 
+
 /**
- * As above, but do four z/texture comparisons.
+ * Do shadow/depth comparisons.
  */
-static INLINE void
-shadow_compare4(const struct pipe_sampler_state *sampler,
-                float rgba[NUM_CHANNELS][QUAD_SIZE],
-                const float p[QUAD_SIZE])
+static void
+sample_compare(struct tgsi_sampler *tgsi_sampler,
+               const float s[QUAD_SIZE],
+               const float t[QUAD_SIZE],
+               const float p[QUAD_SIZE],
+               float lodbias,
+               float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_sampler_state *sampler = samp->sampler;
    int j, k0, k1, k2, k3;
    float val;
 
+   samp->mip_filter( tgsi_sampler, s, t, p, lodbias, rgba );
+
+   /**
+    * Compare texcoord 'p' (aka R) against texture value 'rgba[0]'
+    * When we sampled the depth texture, the depth value was put into all
+    * RGBA channels.  We look at the red channel here.
+    */
+
    /* compare four texcoords vs. four texture samples */
    switch (sampler->compare_func) {
    case PIPE_FUNC_LESS:
@@ -826,470 +1500,392 @@ shadow_compare4(const struct pipe_sampler_state *sampler,
 
 
 /**
- * Common code for sampling 1D/2D/cube textures.
- * Could probably extend for 3D...
+ * Compute which cube face is referenced by each texcoord and put that
+ * info into the sampler faces[] array.  Then sample the cube faces
  */
 static void
-sp_get_samples_2d_common(const struct tgsi_sampler *tgsi_sampler,
-                         const float s[QUAD_SIZE],
-                         const float t[QUAD_SIZE],
-                         const float p[QUAD_SIZE],
-                         boolean computeLambda,
-                         float lodbias,
-                         float rgba[NUM_CHANNELS][QUAD_SIZE],
-                         const unsigned faces[4])
-{
-   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler);
-   const struct softpipe_context *sp = samp->sp;
-   const uint unit = samp->unit;
-   const struct pipe_texture *texture = sp->texture[unit];
-   const struct pipe_sampler_state *sampler = sp->sampler[unit];
-   unsigned level0, level1, j, imgFilter;
-   int width, height;
-   float levelBlend;
-
-   choose_mipmap_levels(texture, sampler, s, t, p, computeLambda, lodbias,
-                        &level0, &level1, &levelBlend, &imgFilter);
-
-   assert(sampler->normalized_coords);
-
-   width = texture->width[level0];
-   height = texture->height[level0];
-
-   assert(width > 0);
-
-   switch (imgFilter) {
-   case PIPE_TEX_FILTER_NEAREST:
-      {
-         int x[4], y[4];
-         nearest_texcoord_4(sampler->wrap_s, s, width, x);
-         nearest_texcoord_4(sampler->wrap_t, t, height, y);
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            get_texel(tgsi_sampler, faces[j], level0, x[j], y[j], 0, rgba, j);
-            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-               shadow_compare(sampler, rgba, p, j);
-            }
+sample_cube(struct tgsi_sampler *tgsi_sampler,
+            const float s[QUAD_SIZE],
+            const float t[QUAD_SIZE],
+            const float p[QUAD_SIZE],
+            float lodbias,
+            float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   unsigned j;
+   float ssss[4], tttt[4];
 
-            if (level0 != level1) {
-               /* get texels from second mipmap level and blend */
-               float rgba2[4][4];
-               unsigned c;
-               x[j] /= 2;
-               y[j] /= 2;
-               get_texel(tgsi_sampler, faces[j], level1, x[j], y[j], 0,
-                         rgba2, j);
-               if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){
-                  shadow_compare(sampler, rgba2, p, j);
-               }
-
-               for (c = 0; c < NUM_CHANNELS; c++) {
-                  rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]);
-               }
-            }
+   /*
+     major axis
+     direction     target                             sc     tc    ma
+     ----------    -------------------------------    ---    ---   ---
+     +rx          TEXTURE_CUBE_MAP_POSITIVE_X_EXT    -rz    -ry   rx
+     -rx          TEXTURE_CUBE_MAP_NEGATIVE_X_EXT    +rz    -ry   rx
+     +ry          TEXTURE_CUBE_MAP_POSITIVE_Y_EXT    +rx    +rz   ry
+     -ry          TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT    +rx    -rz   ry
+     +rz          TEXTURE_CUBE_MAP_POSITIVE_Z_EXT    +rx    -ry   rz
+     -rz          TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT    -rx    -ry   rz
+   */
+   for (j = 0; j < QUAD_SIZE; j++) {
+      float rx = s[j];
+      float ry = t[j];
+      float rz = p[j];
+      const float arx = fabsf(rx), ary = fabsf(ry), arz = fabsf(rz);
+      unsigned face;
+      float sc, tc, ma;
+
+      if (arx >= ary && arx >= arz) {
+         if (rx >= 0.0F) {
+            face = PIPE_TEX_FACE_POS_X;
+            sc = -rz;
+            tc = -ry;
+            ma = arx;
+         }
+         else {
+            face = PIPE_TEX_FACE_NEG_X;
+            sc = rz;
+            tc = -ry;
+            ma = arx;
          }
       }
-      break;
-   case PIPE_TEX_FILTER_LINEAR:
-   case PIPE_TEX_FILTER_ANISO:
+      else if (ary >= arx && ary >= arz) {
+         if (ry >= 0.0F) {
+            face = PIPE_TEX_FACE_POS_Y;
+            sc = rx;
+            tc = rz;
+            ma = ary;
+         }
+         else {
+            face = PIPE_TEX_FACE_NEG_Y;
+            sc = rx;
+            tc = -rz;
+            ma = ary;
+         }
+      }
+      else {
+         if (rz > 0.0F) {
+            face = PIPE_TEX_FACE_POS_Z;
+            sc = rx;
+            tc = -ry;
+            ma = arz;
+         }
+         else {
+            face = PIPE_TEX_FACE_NEG_Z;
+            sc = -rx;
+            tc = -ry;
+            ma = arz;
+         }
+      }
+
       {
-         int x0[4], y0[4], x1[4], y1[4];
-         float xw[4], yw[4]; /* weights */
-
-         linear_texcoord_4(sampler->wrap_s, s, width, x0, x1, xw);
-         linear_texcoord_4(sampler->wrap_t, t, height, y0, y1, yw);
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            float tx[4][4]; /* texels */
-            int c;
-            get_texel(tgsi_sampler, faces[j], level0, x0[j], y0[j], 0, tx, 0);
-            get_texel(tgsi_sampler, faces[j], level0, x1[j], y0[j], 0, tx, 1);
-            get_texel(tgsi_sampler, faces[j], level0, x0[j], y1[j], 0, tx, 2);
-            get_texel(tgsi_sampler, faces[j], level0, x1[j], y1[j], 0, tx, 3);
-            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-               shadow_compare4(sampler, tx, p);
-            }
+	 const float ima = 1.0 / ma;
+	 ssss[j] = ( sc * ima + 1.0F ) * 0.5F;
+	 tttt[j] = ( tc * ima + 1.0F ) * 0.5F;
+	 samp->faces[j] = face;
+      }
+   }
 
-            /* interpolate R, G, B, A */
-            for (c = 0; c < 4; c++) {
-               rgba[c][j] = lerp_2d(xw[j], yw[j],
-                                    tx[c][0], tx[c][1],
-                                    tx[c][2], tx[c][3]);
-            }
+   /* In our little pipeline, the compare stage is next.  If compare
+    * is not active, this will point somewhere deeper into the
+    * pipeline, eg. to mip_filter or even img_filter.
+    */
+   samp->compare(tgsi_sampler, ssss, tttt, NULL, lodbias, rgba);
+}
 
-            if (level0 != level1) {
-               /* get texels from second mipmap level and blend */
-               float rgba2[4][4];
-               x0[j] /= 2;
-               y0[j] /= 2;
-               x1[j] /= 2;
-               y1[j] /= 2;
-               get_texel(tgsi_sampler, faces[j], level1, x0[j], y0[j], 0, tx, 0);
-               get_texel(tgsi_sampler, faces[j], level1, x1[j], y0[j], 0, tx, 1);
-               get_texel(tgsi_sampler, faces[j], level1, x0[j], y1[j], 0, tx, 2);
-               get_texel(tgsi_sampler, faces[j], level1, x1[j], y1[j], 0, tx, 3);
-               if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){
-                  shadow_compare4(sampler, tx, p);
-               }
-
-               /* interpolate R, G, B, A */
-               for (c = 0; c < 4; c++) {
-                  rgba2[c][j] = lerp_2d(xw[j], yw[j],
-                                        tx[c][0], tx[c][1], tx[c][2], tx[c][3]);
-               }
-
-               for (c = 0; c < NUM_CHANNELS; c++) {
-                  rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]);
-               }
-            }
-         }
-      }
-      break;
+
+
+static wrap_nearest_func
+get_nearest_unorm_wrap(unsigned mode)
+{
+   switch (mode) {
+   case PIPE_TEX_WRAP_CLAMP:
+      return wrap_nearest_unorm_clamp;
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      return wrap_nearest_unorm_clamp_to_border;
    default:
       assert(0);
+      return wrap_nearest_unorm_clamp;
    }
 }
 
 
-static INLINE void
-sp_get_samples_1d(const struct tgsi_sampler *sampler,
-                  const float s[QUAD_SIZE],
-                  const float t[QUAD_SIZE],
-                  const float p[QUAD_SIZE],
-                  boolean computeLambda,
-                  float lodbias,
-                  float rgba[NUM_CHANNELS][QUAD_SIZE])
+static wrap_nearest_func
+get_nearest_wrap(unsigned mode)
 {
-   static const unsigned faces[4] = {0, 0, 0, 0};
-   static const float tzero[4] = {0, 0, 0, 0};
-   sp_get_samples_2d_common(sampler, s, tzero, NULL,
-                            computeLambda, lodbias, rgba, faces);
+   switch (mode) {
+   case PIPE_TEX_WRAP_REPEAT:
+      return wrap_nearest_repeat;
+   case PIPE_TEX_WRAP_CLAMP:
+      return wrap_nearest_clamp;
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      return wrap_nearest_clamp_to_edge;
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      return wrap_nearest_clamp_to_border;
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+      return wrap_nearest_mirror_repeat;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+      return wrap_nearest_mirror_clamp;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+      return wrap_nearest_mirror_clamp_to_edge;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+      return wrap_nearest_mirror_clamp_to_border;
+   default:
+      assert(0);
+      return wrap_nearest_repeat;
+   }
 }
 
 
-static INLINE void
-sp_get_samples_2d(const struct tgsi_sampler *sampler,
-                  const float s[QUAD_SIZE],
-                  const float t[QUAD_SIZE],
-                  const float p[QUAD_SIZE],
-                  boolean computeLambda,
-                  float lodbias,
-                  float rgba[NUM_CHANNELS][QUAD_SIZE])
+static wrap_linear_func
+get_linear_unorm_wrap(unsigned mode)
 {
-   static const unsigned faces[4] = {0, 0, 0, 0};
-   sp_get_samples_2d_common(sampler, s, t, p,
-                            computeLambda, lodbias, rgba, faces);
+   switch (mode) {
+   case PIPE_TEX_WRAP_CLAMP:
+      return wrap_linear_unorm_clamp;
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      return wrap_linear_unorm_clamp_to_border;
+   default:
+      assert(0);
+      return wrap_linear_unorm_clamp;
+   }
 }
 
 
-static INLINE void
-sp_get_samples_3d(const struct tgsi_sampler *tgsi_sampler,
-                  const float s[QUAD_SIZE],
-                  const float t[QUAD_SIZE],
-                  const float p[QUAD_SIZE],
-                  boolean computeLambda,
-                  float lodbias,
-                  float rgba[NUM_CHANNELS][QUAD_SIZE])
+static wrap_linear_func
+get_linear_wrap(unsigned mode)
 {
-   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler);
-   const struct softpipe_context *sp = samp->sp;
-   const uint unit = samp->unit;
-   const struct pipe_texture *texture = sp->texture[unit];
-   const struct pipe_sampler_state *sampler = sp->sampler[unit];
-   /* get/map pipe_surfaces corresponding to 3D tex slices */
-   unsigned level0, level1, j, imgFilter;
-   int width, height, depth;
-   float levelBlend;
-   const uint face = 0;
-
-   choose_mipmap_levels(texture, sampler, s, t, p, computeLambda, lodbias,
-                        &level0, &level1, &levelBlend, &imgFilter);
-
-   assert(sampler->normalized_coords);
-
-   width = texture->width[level0];
-   height = texture->height[level0];
-   depth = texture->depth[level0];
-
-   assert(width > 0);
-   assert(height > 0);
-   assert(depth > 0);
-
-   switch (imgFilter) {
-   case PIPE_TEX_FILTER_NEAREST:
-      {
-         int x[4], y[4], z[4];
-         nearest_texcoord_4(sampler->wrap_s, s, width, x);
-         nearest_texcoord_4(sampler->wrap_t, t, height, y);
-         nearest_texcoord_4(sampler->wrap_r, p, depth, z);
-         for (j = 0; j < QUAD_SIZE; j++) {
-            get_texel(tgsi_sampler, face, level0, x[j], y[j], z[j], rgba, j);
-            if (level0 != level1) {
-               /* get texels from second mipmap level and blend */
-               float rgba2[4][4];
-               unsigned c;
-               x[j] /= 2;
-               y[j] /= 2;
-               z[j] /= 2;
-               get_texel(tgsi_sampler, face, level1, x[j], y[j], z[j], rgba2, j);
-               for (c = 0; c < NUM_CHANNELS; c++) {
-                  rgba[c][j] = lerp(levelBlend, rgba2[c][j], rgba[c][j]);
-               }
-            }
-         }
-      }
-      break;
-   case PIPE_TEX_FILTER_LINEAR:
-   case PIPE_TEX_FILTER_ANISO:
-      {
-         int x0[4], x1[4], y0[4], y1[4], z0[4], z1[4];
-         float xw[4], yw[4], zw[4]; /* interpolation weights */
-         linear_texcoord_4(sampler->wrap_s, s, width,  x0, x1, xw);
-         linear_texcoord_4(sampler->wrap_t, t, height, y0, y1, yw);
-         linear_texcoord_4(sampler->wrap_r, p, depth,  z0, z1, zw);
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int c;
-            float tx0[4][4], tx1[4][4];
-            get_texel(tgsi_sampler, face, level0, x0[j], y0[j], z0[j], tx0, 0);
-            get_texel(tgsi_sampler, face, level0, x1[j], y0[j], z0[j], tx0, 1);
-            get_texel(tgsi_sampler, face, level0, x0[j], y1[j], z0[j], tx0, 2);
-            get_texel(tgsi_sampler, face, level0, x1[j], y1[j], z0[j], tx0, 3);
-            get_texel(tgsi_sampler, face, level0, x0[j], y0[j], z1[j], tx1, 0);
-            get_texel(tgsi_sampler, face, level0, x1[j], y0[j], z1[j], tx1, 1);
-            get_texel(tgsi_sampler, face, level0, x0[j], y1[j], z1[j], tx1, 2);
-            get_texel(tgsi_sampler, face, level0, x1[j], y1[j], z1[j], tx1, 3);
-
-            /* interpolate R, G, B, A */
-            for (c = 0; c < 4; c++) {
-               rgba[c][j] = lerp_3d(xw[j], yw[j], zw[j],
-                                    tx0[c][0], tx0[c][1],
-                                    tx0[c][2], tx0[c][3],
-                                    tx1[c][0], tx1[c][1],
-                                    tx1[c][2], tx1[c][3]);
-            }
-
-            if (level0 != level1) {
-               /* get texels from second mipmap level and blend */
-               float rgba2[4][4];
-               x0[j] /= 2;
-               y0[j] /= 2;
-               z0[j] /= 2;
-               x1[j] /= 2;
-               y1[j] /= 2;
-               z1[j] /= 2;
-               get_texel(tgsi_sampler, face, level1, x0[j], y0[j], z0[j], tx0, 0);
-               get_texel(tgsi_sampler, face, level1, x1[j], y0[j], z0[j], tx0, 1);
-               get_texel(tgsi_sampler, face, level1, x0[j], y1[j], z0[j], tx0, 2);
-               get_texel(tgsi_sampler, face, level1, x1[j], y1[j], z0[j], tx0, 3);
-               get_texel(tgsi_sampler, face, level1, x0[j], y0[j], z1[j], tx1, 0);
-               get_texel(tgsi_sampler, face, level1, x1[j], y0[j], z1[j], tx1, 1);
-               get_texel(tgsi_sampler, face, level1, x0[j], y1[j], z1[j], tx1, 2);
-               get_texel(tgsi_sampler, face, level1, x1[j], y1[j], z1[j], tx1, 3);
-
-               /* interpolate R, G, B, A */
-               for (c = 0; c < 4; c++) {
-                  rgba2[c][j] = lerp_3d(xw[j], yw[j], zw[j],
-                                        tx0[c][0], tx0[c][1],
-                                        tx0[c][2], tx0[c][3],
-                                        tx1[c][0], tx1[c][1],
-                                        tx1[c][2], tx1[c][3]);
-               }
-
-               /* blend mipmap levels */
-               for (c = 0; c < NUM_CHANNELS; c++) {
-                  rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]);
-               }
-            }
-         }
-      }
-      break;
+   switch (mode) {
+   case PIPE_TEX_WRAP_REPEAT:
+      return wrap_linear_repeat;
+   case PIPE_TEX_WRAP_CLAMP:
+      return wrap_linear_clamp;
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      return wrap_linear_clamp_to_edge;
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      return wrap_linear_clamp_to_border;
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+      return wrap_linear_mirror_repeat;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+      return wrap_linear_mirror_clamp;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+      return wrap_linear_mirror_clamp_to_edge;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+      return wrap_linear_mirror_clamp_to_border;
    default:
       assert(0);
+      return wrap_linear_repeat;
    }
 }
 
 
-static void
-sp_get_samples_cube(const struct tgsi_sampler *sampler,
-                    const float s[QUAD_SIZE],
-                    const float t[QUAD_SIZE],
-                    const float p[QUAD_SIZE],
-                    boolean computeLambda,
-                    float lodbias,
-                    float rgba[NUM_CHANNELS][QUAD_SIZE])
+static compute_lambda_func
+get_lambda_func(const union sp_sampler_key key)
 {
-   unsigned faces[QUAD_SIZE], j;
-   float ssss[4], tttt[4];
-   for (j = 0; j < QUAD_SIZE; j++) {
-      faces[j] = choose_cube_face(s[j], t[j], p[j], ssss + j, tttt + j);
+   if (key.bits.processor == TGSI_PROCESSOR_VERTEX)
+      return compute_lambda_vert;
+   
+   switch (key.bits.target) {
+   case PIPE_TEXTURE_1D:
+      return compute_lambda_1d;
+   case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_CUBE:
+      return compute_lambda_2d;
+   case PIPE_TEXTURE_3D:
+      return compute_lambda_3d;
+   default:
+      assert(0);
+      return compute_lambda_1d;
    }
-   sp_get_samples_2d_common(sampler, ssss, tttt, NULL,
-                            computeLambda, lodbias, rgba, faces);
 }
 
 
-static void
-sp_get_samples_rect(const struct tgsi_sampler *tgsi_sampler,
-                    const float s[QUAD_SIZE],
-                    const float t[QUAD_SIZE],
-                    const float p[QUAD_SIZE],
-                    boolean computeLambda,
-                    float lodbias,
-                    float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
-   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler);
-   const struct softpipe_context *sp = samp->sp;
-   const uint unit = samp->unit;
-   const struct pipe_texture *texture = sp->texture[unit];
-   const struct pipe_sampler_state *sampler = sp->sampler[unit];
-   const uint face = 0;
-   unsigned level0, level1, j, imgFilter;
-   int width, height;
-   float levelBlend;
-
-   choose_mipmap_levels(texture, sampler, s, t, p, computeLambda, lodbias,
-                        &level0, &level1, &levelBlend, &imgFilter);
-
-   /* texture RECTS cannot be mipmapped */
-   assert(level0 == level1);
-
-   width = texture->width[level0];
-   height = texture->height[level0];
-
-   assert(width > 0);
-
-   switch (imgFilter) {
-   case PIPE_TEX_FILTER_NEAREST:
-      {
-         int x[4], y[4];
-         nearest_texcoord_unnorm_4(sampler->wrap_s, s, width, x);
-         nearest_texcoord_unnorm_4(sampler->wrap_t, t, height, y);
-         for (j = 0; j < QUAD_SIZE; j++) {
-            get_texel(tgsi_sampler, face, level0, x[j], y[j], 0, rgba, j);
-            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-               shadow_compare(sampler, rgba, p, j);
-            }
-         }
-      }
+static filter_func
+get_img_filter(const union sp_sampler_key key,
+               unsigned filter,
+               const struct pipe_sampler_state *sampler)
+{
+   switch (key.bits.target) {
+   case PIPE_TEXTURE_1D:
+      if (filter == PIPE_TEX_FILTER_NEAREST) 
+         return img_filter_1d_nearest;
+      else
+         return img_filter_1d_linear;
       break;
-   case PIPE_TEX_FILTER_LINEAR:
-   case PIPE_TEX_FILTER_ANISO:
+   case PIPE_TEXTURE_2D:
+      /* Try for fast path:
+       */
+      if (key.bits.is_pot &&
+          sampler->wrap_s == sampler->wrap_t &&
+          sampler->normalized_coords) 
       {
-         int x0[4], y0[4], x1[4], y1[4];
-         float xw[4], yw[4]; /* weights */
-         linear_texcoord_unnorm_4(sampler->wrap_s, s, width,  x0, x1, xw);
-         linear_texcoord_unnorm_4(sampler->wrap_t, t, height, y0, y1, yw);
-         for (j = 0; j < QUAD_SIZE; j++) {
-            float tx[4][4]; /* texels */
-            int c;
-            get_texel(tgsi_sampler, face, level0, x0[j], y0[j], 0, tx, 0);
-            get_texel(tgsi_sampler, face, level0, x1[j], y0[j], 0, tx, 1);
-            get_texel(tgsi_sampler, face, level0, x0[j], y1[j], 0, tx, 2);
-            get_texel(tgsi_sampler, face, level0, x1[j], y1[j], 0, tx, 3);
-            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-               shadow_compare4(sampler, tx, p);
+         switch (sampler->wrap_s) {
+         case PIPE_TEX_WRAP_REPEAT:
+            switch (filter) {
+            case PIPE_TEX_FILTER_NEAREST:
+               return img_filter_2d_nearest_repeat_POT;
+            case PIPE_TEX_FILTER_LINEAR:
+               return img_filter_2d_linear_repeat_POT;
+            default:
+               break;
             }
-            for (c = 0; c < 4; c++) {
-               rgba[c][j] = lerp_2d(xw[j], yw[j],
-                                    tx[c][0], tx[c][1], tx[c][2], tx[c][3]);
+            break;
+         case PIPE_TEX_WRAP_CLAMP:
+            switch (filter) {
+            case PIPE_TEX_FILTER_NEAREST:
+               return img_filter_2d_nearest_clamp_POT;
+            default:
+               break;
             }
          }
       }
+      /* Otherwise use default versions:
+       */
+      if (filter == PIPE_TEX_FILTER_NEAREST) 
+         return img_filter_2d_nearest;
+      else
+         return img_filter_2d_linear;
+      break;
+   case PIPE_TEXTURE_CUBE:
+      if (filter == PIPE_TEX_FILTER_NEAREST) 
+         return img_filter_cube_nearest;
+      else
+         return img_filter_cube_linear;
+      break;
+   case PIPE_TEXTURE_3D:
+      if (filter == PIPE_TEX_FILTER_NEAREST) 
+         return img_filter_3d_nearest;
+      else
+         return img_filter_3d_linear;
       break;
    default:
       assert(0);
+      return img_filter_1d_nearest;
    }
 }
 
 
 /**
- * Common code for vertex/fragment program texture sampling.
+ * Bind the given texture object and texture cache to the sampler varient.
  */
-static INLINE void
-sp_get_samples(struct tgsi_sampler *tgsi_sampler,
-               const float s[QUAD_SIZE],
-               const float t[QUAD_SIZE],
-               const float p[QUAD_SIZE],
-               boolean computeLambda,
-               float lodbias,
-               float rgba[NUM_CHANNELS][QUAD_SIZE])
+void
+sp_sampler_varient_bind_texture( struct sp_sampler_varient *samp,
+                                 struct softpipe_tex_tile_cache *tex_cache,
+                                 const struct pipe_texture *texture )
 {
-   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler);
-   const struct softpipe_context *sp = samp->sp;
-   const uint unit = samp->unit;
-   const struct pipe_texture *texture = sp->texture[unit];
-   const struct pipe_sampler_state *sampler = sp->sampler[unit];
-
-   if (!texture)
-      return;
+   const struct pipe_sampler_state *sampler = samp->sampler;
 
-   switch (texture->target) {
-   case PIPE_TEXTURE_1D:
-      assert(sampler->normalized_coords);
-      sp_get_samples_1d(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba);
-      break;
-   case PIPE_TEXTURE_2D:
-      if (sampler->normalized_coords)
-         sp_get_samples_2d(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba);
-      else
-         sp_get_samples_rect(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba);
-      break;
-   case PIPE_TEXTURE_3D:
-      assert(sampler->normalized_coords);
-      sp_get_samples_3d(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba);
-      break;
-   case PIPE_TEXTURE_CUBE:
-      assert(sampler->normalized_coords);
-      sp_get_samples_cube(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba);
-      break;
-   default:
-      assert(0);
-   }
-
-#if 0 /* DEBUG */
-   {
-      int i;
-      printf("Sampled at %f, %f, %f:\n", s[0], t[0], p[0]);
-      for (i = 0; i < 4; i++) {
-         printf("Frag %d: %f %f %f %f\n", i,
-                rgba[0][i],
-                rgba[1][i],
-                rgba[2][i],
-                rgba[3][i]);
-      }
-   }
-#endif
+   samp->texture = texture;
+   samp->cache = tex_cache;
+   samp->xpot = util_unsigned_logbase2( texture->width[0] );
+   samp->ypot = util_unsigned_logbase2( texture->height[0] );
+   samp->level = CLAMP((int) sampler->min_lod, 0, (int) texture->last_level);
 }
 
 
-/**
- * Called via tgsi_sampler::get_samples() when running a fragment shader.
- * Get four filtered RGBA values from the sampler's texture.
- */
 void
-sp_get_samples_fragment(struct tgsi_sampler *tgsi_sampler,
-                        const float s[QUAD_SIZE],
-                        const float t[QUAD_SIZE],
-                        const float p[QUAD_SIZE],
-                        float lodbias,
-                        float rgba[NUM_CHANNELS][QUAD_SIZE])
+sp_sampler_varient_destroy( struct sp_sampler_varient *samp )
 {
-   sp_get_samples(tgsi_sampler, s, t, p, TRUE, lodbias, rgba);
+   FREE(samp);
 }
 
 
 /**
- * Called via tgsi_sampler::get_samples() when running a vertex shader.
- * Get four filtered RGBA values from the sampler's texture.
+ * Create a sampler varient for a given set of non-orthogonal state.
  */
-void
-sp_get_samples_vertex(struct tgsi_sampler *tgsi_sampler,
-                      const float s[QUAD_SIZE],
-                      const float t[QUAD_SIZE],
-                      const float p[QUAD_SIZE],
-                      float lodbias,
-                      float rgba[NUM_CHANNELS][QUAD_SIZE])
+struct sp_sampler_varient *
+sp_create_sampler_varient( const struct pipe_sampler_state *sampler,
+                           const union sp_sampler_key key )
 {
-   sp_get_samples(tgsi_sampler, s, t, p, FALSE, lodbias, rgba);
+   struct sp_sampler_varient *samp = CALLOC_STRUCT(sp_sampler_varient);
+   if (!samp)
+      return NULL;
+
+   samp->sampler = sampler;
+   samp->key = key;
+
+   /* Note that (for instance) linear_texcoord_s and
+    * nearest_texcoord_s may be active at the same time, if the
+    * sampler min_img_filter differs from its mag_img_filter.
+    */
+   if (sampler->normalized_coords) {
+      samp->linear_texcoord_s = get_linear_wrap( sampler->wrap_s );
+      samp->linear_texcoord_t = get_linear_wrap( sampler->wrap_t );
+      samp->linear_texcoord_p = get_linear_wrap( sampler->wrap_r );
+      
+      samp->nearest_texcoord_s = get_nearest_wrap( sampler->wrap_s );
+      samp->nearest_texcoord_t = get_nearest_wrap( sampler->wrap_t );
+      samp->nearest_texcoord_p = get_nearest_wrap( sampler->wrap_r );
+   }
+   else {
+      samp->linear_texcoord_s = get_linear_unorm_wrap( sampler->wrap_s );
+      samp->linear_texcoord_t = get_linear_unorm_wrap( sampler->wrap_t );
+      samp->linear_texcoord_p = get_linear_unorm_wrap( sampler->wrap_r );
+      
+      samp->nearest_texcoord_s = get_nearest_unorm_wrap( sampler->wrap_s );
+      samp->nearest_texcoord_t = get_nearest_unorm_wrap( sampler->wrap_t );
+      samp->nearest_texcoord_p = get_nearest_unorm_wrap( sampler->wrap_r );
+   }
+   
+   samp->compute_lambda = get_lambda_func( key );
+
+   samp->min_img_filter = get_img_filter(key, sampler->min_img_filter, sampler);
+   samp->mag_img_filter = get_img_filter(key, sampler->mag_img_filter, sampler);
+
+   switch (sampler->min_mip_filter) {
+   case PIPE_TEX_MIPFILTER_NONE:
+      if (sampler->min_img_filter == sampler->mag_img_filter) 
+         samp->mip_filter = samp->min_img_filter;         
+      else
+         samp->mip_filter = mip_filter_none;
+      break;
+
+   case PIPE_TEX_MIPFILTER_NEAREST:
+      samp->mip_filter = mip_filter_nearest;
+      break;
+
+   case PIPE_TEX_MIPFILTER_LINEAR:
+      if (key.bits.is_pot &&
+          sampler->min_img_filter == sampler->mag_img_filter &&
+          sampler->normalized_coords &&
+          sampler->wrap_s == PIPE_TEX_WRAP_REPEAT &&
+          sampler->wrap_t == PIPE_TEX_WRAP_REPEAT &&
+          sampler->min_img_filter == PIPE_TEX_FILTER_LINEAR)
+      {
+         samp->mip_filter = mip_filter_linear_2d_linear_repeat_POT;
+      }
+      else 
+      {
+         samp->mip_filter = mip_filter_linear;
+      }
+      break;
+   }
+
+   if (sampler->compare_mode != FALSE) {
+      samp->compare = sample_compare;
+   }
+   else {
+      /* Skip compare operation by promoting the mip_filter function
+       * pointer:
+       */
+      samp->compare = samp->mip_filter;
+   }
+   
+   if (key.bits.target == PIPE_TEXTURE_CUBE) {
+      samp->base.get_samples = sample_cube;
+   }
+   else {
+      samp->faces[0] = 0;
+      samp->faces[1] = 0;
+      samp->faces[2] = 0;
+      samp->faces[3] = 0;
+
+      /* Skip cube face determination by promoting the compare
+       * function pointer:
+       */
+      samp->base.get_samples = samp->compare;
+   }
+
+   return samp;
 }
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.h b/src/gallium/drivers/softpipe/sp_tex_sample.h
index 40d8eb2c2a..b0797711d3 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.h
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.h
@@ -31,43 +31,122 @@
 
 #include "tgsi/tgsi_exec.h"
 
+struct sp_sampler_varient;
+
+typedef void (*wrap_nearest_func)(const float s[4],
+                                  unsigned size,
+                                  int icoord[4]);
+
+typedef void (*wrap_linear_func)(const float s[4], 
+                                 unsigned size,
+                                 int icoord0[4],
+                                 int icoord1[4],
+                                 float w[4]);
+
+typedef float (*compute_lambda_func)(const struct sp_sampler_varient *sampler,
+                                     const float s[QUAD_SIZE],
+                                     const float t[QUAD_SIZE],
+                                     const float p[QUAD_SIZE],
+                                     float lodbias);
+
+typedef void (*filter_func)(struct tgsi_sampler *tgsi_sampler,
+                            const float s[QUAD_SIZE],
+                            const float t[QUAD_SIZE],
+                            const float p[QUAD_SIZE],
+                            float lodbias,
+                            float rgba[NUM_CHANNELS][QUAD_SIZE]);
+
+
+union sp_sampler_key {
+   struct {
+      unsigned target:3;
+      unsigned is_pot:1;
+      unsigned processor:2;
+      unsigned unit:4;
+      unsigned pad:22;
+   } bits;
+   unsigned value;
+};
 
 /**
  * Subclass of tgsi_sampler
  */
-struct sp_shader_sampler
+struct sp_sampler_varient
 {
    struct tgsi_sampler base;  /**< base class */
 
-   uint unit;
-   struct softpipe_context *sp;
-   struct softpipe_tile_cache *cache;
+   union sp_sampler_key key;
+
+   /* The owner of this struct:
+    */
+   const struct pipe_sampler_state *sampler;
+
+
+   /* Currently bound texture:
+    */
+   const struct pipe_texture *texture;
+   struct softpipe_tex_tile_cache *cache;
+
+   unsigned processor;
+
+   /* For sp_get_samples_2d_linear_POT:
+    */
+   unsigned xpot;
+   unsigned ypot;
+   unsigned level;
+
+   unsigned faces[4];
+   
+   wrap_nearest_func nearest_texcoord_s;
+   wrap_nearest_func nearest_texcoord_t;
+   wrap_nearest_func nearest_texcoord_p;
+
+   wrap_linear_func linear_texcoord_s;
+   wrap_linear_func linear_texcoord_t;
+   wrap_linear_func linear_texcoord_p;
+
+   filter_func min_img_filter;
+   filter_func mag_img_filter;
+
+   compute_lambda_func compute_lambda;
+
+   filter_func mip_filter;
+   filter_func compare;
+   
+   /* Linked list:
+    */
+   struct sp_sampler_varient *next;
 };
 
+struct sp_sampler;
 
+/* Create a sampler varient for a given set of non-orthogonal state.  Currently the 
+ */
+struct sp_sampler_varient *
+sp_create_sampler_varient( const struct pipe_sampler_state *sampler,
+                           const union sp_sampler_key key );
 
-static INLINE const struct sp_shader_sampler *
-sp_shader_sampler(const struct tgsi_sampler *sampler)
-{
-   return (const struct sp_shader_sampler *) sampler;
-}
+void sp_sampler_varient_bind_texture( struct sp_sampler_varient *varient,
+                                      struct softpipe_tex_tile_cache *tex_cache,
+                                      const struct pipe_texture *tex );
 
+void sp_sampler_varient_destroy( struct sp_sampler_varient * );
 
-extern void
-sp_get_samples_fragment(struct tgsi_sampler *tgsi_sampler,
-                        const float s[QUAD_SIZE],
-                        const float t[QUAD_SIZE],
-                        const float p[QUAD_SIZE],
-                        float lodbias,
-                        float rgba[NUM_CHANNELS][QUAD_SIZE]);
+
+
+static INLINE struct sp_sampler_varient *
+sp_sampler_varient(const struct tgsi_sampler *sampler)
+{
+   return (struct sp_sampler_varient *) sampler;
+}
 
 extern void
-sp_get_samples_vertex(struct tgsi_sampler *tgsi_sampler,
-                      const float s[QUAD_SIZE],
-                      const float t[QUAD_SIZE],
-                      const float p[QUAD_SIZE],
-                      float lodbias,
-                      float rgba[NUM_CHANNELS][QUAD_SIZE]);
+sp_get_samples(struct tgsi_sampler *tgsi_sampler,
+               const float s[QUAD_SIZE],
+               const float t[QUAD_SIZE],
+               const float p[QUAD_SIZE],
+               float lodbias,
+               float rgba[NUM_CHANNELS][QUAD_SIZE]);
 
 
 #endif /* SP_TEX_SAMPLE_H */
diff --git a/src/gallium/drivers/softpipe/sp_tex_tile_cache.c b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
new file mode 100644
index 0000000000..407a22a9f4
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
@@ -0,0 +1,273 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Texture tile caching.
+ *
+ * Author:
+ *    Brian Paul
+ */
+
+#include "pipe/p_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_tile.h"
+#include "sp_context.h"
+#include "sp_surface.h"
+#include "sp_texture.h"
+#include "sp_tex_tile_cache.h"
+
+   
+
+struct softpipe_tex_tile_cache *
+sp_create_tex_tile_cache( struct pipe_screen *screen )
+{
+   struct softpipe_tex_tile_cache *tc;
+   uint pos;
+
+   tc = CALLOC_STRUCT( softpipe_tex_tile_cache );
+   if (tc) {
+      tc->screen = screen;
+      for (pos = 0; pos < NUM_ENTRIES; pos++) {
+         tc->entries[pos].addr.bits.invalid = 1;
+      }
+      tc->last_tile = &tc->entries[0]; /* any tile */
+   }
+   return tc;
+}
+
+
+void
+sp_destroy_tex_tile_cache(struct softpipe_tex_tile_cache *tc)
+{
+   struct pipe_screen *screen;
+   uint pos;
+
+   for (pos = 0; pos < NUM_ENTRIES; pos++) {
+      /*assert(tc->entries[pos].x < 0);*/
+   }
+   if (tc->transfer) {
+      screen = tc->transfer->texture->screen;
+      screen->tex_transfer_destroy(tc->transfer);
+   }
+   if (tc->tex_trans) {
+      screen = tc->tex_trans->texture->screen;
+      screen->tex_transfer_destroy(tc->tex_trans);
+   }
+
+   FREE( tc );
+}
+
+
+
+
+void
+sp_tex_tile_cache_map_transfers(struct softpipe_tex_tile_cache *tc)
+{
+   if (tc->tex_trans && !tc->tex_trans_map)
+      tc->tex_trans_map = tc->screen->transfer_map(tc->screen, tc->tex_trans);
+}
+
+
+void
+sp_tex_tile_cache_unmap_transfers(struct softpipe_tex_tile_cache *tc)
+{
+   if (tc->tex_trans_map) {
+      tc->screen->transfer_unmap(tc->screen, tc->tex_trans);
+      tc->tex_trans_map = NULL;
+   }
+}
+
+/**
+ * Invalidate all cached tiles for the cached texture.
+ * Should be called when the texture is modified.
+ */
+void
+sp_tex_tile_cache_validate_texture(struct softpipe_tex_tile_cache *tc)
+{
+   unsigned i;
+
+   assert(tc);
+   assert(tc->texture);
+
+   for (i = 0; i < NUM_ENTRIES; i++) {
+      tc->entries[i].addr.bits.invalid = 1;
+   }
+}
+
+/**
+ * Specify the texture to cache.
+ */
+void
+sp_tex_tile_cache_set_texture(struct softpipe_tex_tile_cache *tc,
+                          struct pipe_texture *texture)
+{
+   uint i;
+
+   assert(!tc->transfer);
+
+   if (tc->texture != texture) {
+      pipe_texture_reference(&tc->texture, texture);
+
+      if (tc->tex_trans) {
+         struct pipe_screen *screen = tc->tex_trans->texture->screen;
+         
+         if (tc->tex_trans_map) {
+            screen->transfer_unmap(screen, tc->tex_trans);
+            tc->tex_trans_map = NULL;
+         }
+
+         screen->tex_transfer_destroy(tc->tex_trans);
+         tc->tex_trans = NULL;
+      }
+
+      /* mark as entries as invalid/empty */
+      /* XXX we should try to avoid this when the teximage hasn't changed */
+      for (i = 0; i < NUM_ENTRIES; i++) {
+         tc->entries[i].addr.bits.invalid = 1;
+      }
+
+      tc->tex_face = -1; /* any invalid value here */
+   }
+}
+
+
+
+
+/**
+ * Flush the tile cache: write all dirty tiles back to the transfer.
+ * any tiles "flagged" as cleared will be "really" cleared.
+ */
+void
+sp_flush_tex_tile_cache(struct softpipe_tex_tile_cache *tc)
+{
+   int pos;
+
+   if (tc->texture) {
+      /* caching a texture, mark all entries as empty */
+      for (pos = 0; pos < NUM_ENTRIES; pos++) {
+         tc->entries[pos].addr.bits.invalid = 1;
+      }
+      tc->tex_face = -1;
+   }
+
+}
+
+
+/**
+ * Given the texture face, level, zslice, x and y values, compute
+ * the cache entry position/index where we'd hope to find the
+ * cached texture tile.
+ * This is basically a direct-map cache.
+ * XXX There's probably lots of ways in which we can improve this.
+ */
+static INLINE uint
+tex_cache_pos( union tex_tile_address addr )
+{
+   uint entry = (addr.bits.x + 
+                 addr.bits.y * 9 + 
+                 addr.bits.z * 3 + 
+                 addr.bits.face + 
+                 addr.bits.level * 7);
+
+   return entry % NUM_ENTRIES;
+}
+
+/**
+ * Similar to sp_get_cached_tile() but for textures.
+ * Tiles are read-only and indexed with more params.
+ */
+const struct softpipe_tex_cached_tile *
+sp_find_cached_tile_tex(struct softpipe_tex_tile_cache *tc, 
+                        union tex_tile_address addr )
+{
+   struct pipe_screen *screen = tc->screen;
+   struct softpipe_tex_cached_tile *tile;
+   
+   tile = tc->entries + tex_cache_pos( addr );
+
+   if (addr.value != tile->addr.value) {
+
+      /* cache miss.  Most misses are because we've invaldiated the
+       * texture cache previously -- most commonly on binding a new
+       * texture.  Currently we effectively flush the cache on texture
+       * bind.
+       */
+#if 0
+      _debug_printf("miss at %u:  x=%d y=%d z=%d face=%d level=%d\n"
+                    "   tile %u:  x=%d y=%d z=%d face=%d level=%d\n",
+                    pos, x/TILE_SIZE, y/TILE_SIZE, z, face, level,
+                    pos, tile->addr.bits.x, tile->addr.bits.y, tile->z, tile->face, tile->level);
+#endif
+
+      /* check if we need to get a new transfer */
+      if (!tc->tex_trans ||
+          tc->tex_face != addr.bits.face ||
+          tc->tex_level != addr.bits.level ||
+          tc->tex_z != addr.bits.z) {
+         /* get new transfer (view into texture) */
+
+         if (tc->tex_trans) {
+            if (tc->tex_trans_map) {
+               tc->screen->transfer_unmap(tc->screen, tc->tex_trans);
+               tc->tex_trans_map = NULL;
+            }
+
+            screen->tex_transfer_destroy(tc->tex_trans);
+            tc->tex_trans = NULL;
+         }
+
+         tc->tex_trans = 
+            screen->get_tex_transfer(screen, tc->texture, 
+                                     addr.bits.face, 
+                                     addr.bits.level, 
+                                     addr.bits.z, 
+                                     PIPE_TRANSFER_READ, 0, 0,
+                                     tc->texture->width[addr.bits.level],
+                                     tc->texture->height[addr.bits.level]);
+
+         tc->tex_trans_map = screen->transfer_map(screen, tc->tex_trans);
+
+         tc->tex_face = addr.bits.face;
+         tc->tex_level = addr.bits.level;
+         tc->tex_z = addr.bits.z;
+      }
+
+      /* get tile from the transfer (view into texture) */
+      pipe_get_tile_rgba(tc->tex_trans,
+                         addr.bits.x * TILE_SIZE, 
+                         addr.bits.y * TILE_SIZE,
+                         TILE_SIZE, TILE_SIZE,
+                         (float *) tile->data.color);
+      tile->addr = addr;
+   }
+
+   tc->last_tile = tile;
+   return tile;
+}
+
+
+
diff --git a/src/gallium/drivers/softpipe/sp_tex_tile_cache.h b/src/gallium/drivers/softpipe/sp_tex_tile_cache.h
new file mode 100644
index 0000000000..ac6886a3df
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_tex_tile_cache.h
@@ -0,0 +1,155 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SP_TEX_TILE_CACHE_H
+#define SP_TEX_TILE_CACHE_H
+
+
+#include "pipe/p_compiler.h"
+
+
+struct softpipe_context;
+struct softpipe_tex_tile_cache;
+
+
+/**
+ * Cache tile size (width and height). This needs to be a power of two.
+ */
+#define TILE_SIZE 64
+
+
+/* If we need to support > 4096, just expand this to be a 64 bit
+ * union, or consider tiling in Z as well.
+ */
+union tex_tile_address {
+   struct {
+      unsigned x:6;             /* 4096 / TILE_SIZE */
+      unsigned y:6;             /* 4096 / TILE_SIZE */
+      unsigned z:12;            /* 4096 -- z not tiled */
+      unsigned face:3;
+      unsigned level:4;
+      unsigned invalid:1;
+   } bits;
+   unsigned value;
+};
+
+
+struct softpipe_tex_cached_tile
+{
+   union tex_tile_address addr;
+   union {
+      float color[TILE_SIZE][TILE_SIZE][4];
+   } data;
+};
+
+#define NUM_ENTRIES 50
+
+struct softpipe_tex_tile_cache
+{
+   struct pipe_screen *screen;
+   struct pipe_transfer *transfer;
+   void *transfer_map;
+
+   struct pipe_texture *texture;  /**< if caching a texture */
+   unsigned timestamp;
+
+   struct softpipe_tex_cached_tile entries[NUM_ENTRIES];
+
+   struct pipe_transfer *tex_trans;
+   void *tex_trans_map;
+   int tex_face, tex_level, tex_z;
+
+   struct softpipe_tex_cached_tile *last_tile;  /**< most recently retrieved tile */
+};
+
+
+extern struct softpipe_tex_tile_cache *
+sp_create_tex_tile_cache( struct pipe_screen *screen );
+
+extern void
+sp_destroy_tex_tile_cache(struct softpipe_tex_tile_cache *tc);
+
+
+extern void
+sp_tex_tile_cache_map_transfers(struct softpipe_tex_tile_cache *tc);
+
+extern void
+sp_tex_tile_cache_unmap_transfers(struct softpipe_tex_tile_cache *tc);
+
+extern void
+sp_tex_tile_cache_set_texture(struct softpipe_tex_tile_cache *tc,
+                          struct pipe_texture *texture);
+
+void
+sp_tex_tile_cache_validate_texture(struct softpipe_tex_tile_cache *tc);
+
+extern void
+sp_flush_tex_tile_cache(struct softpipe_tex_tile_cache *tc);
+
+
+
+extern const struct softpipe_tex_cached_tile *
+sp_find_cached_tile_tex(struct softpipe_tex_tile_cache *tc, 
+                         union tex_tile_address addr );
+
+static INLINE union tex_tile_address
+tex_tile_address( unsigned x,
+		  unsigned y,
+		  unsigned z,
+		  unsigned face,
+		  unsigned level )
+{
+   union tex_tile_address addr;
+
+   addr.value = 0;
+   addr.bits.x = x / TILE_SIZE;
+   addr.bits.y = y / TILE_SIZE;
+   addr.bits.z = z;
+   addr.bits.face = face;
+   addr.bits.level = level;
+      
+   return addr;
+}
+
+/* Quickly retrieve tile if it matches last lookup.
+ */
+static INLINE const struct softpipe_tex_cached_tile *
+sp_get_cached_tile_tex(struct softpipe_tex_tile_cache *tc, 
+                         union tex_tile_address addr )
+{
+   if (tc->last_tile->addr.value == addr.value)
+      return tc->last_tile;
+
+   return sp_find_cached_tile_tex( tc, addr );
+}
+
+
+
+
+
+#endif /* SP_TEX_TILE_CACHE_H */
+
diff --git a/src/gallium/drivers/softpipe/sp_texture.c b/src/gallium/drivers/softpipe/sp_texture.c
index 70f0932431..1c64d58372 100644
--- a/src/gallium/drivers/softpipe/sp_texture.c
+++ b/src/gallium/drivers/softpipe/sp_texture.c
@@ -30,26 +30,21 @@
   *   Michel Dänzer <michel@tungstengraphics.com>
   */
 
-#include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/internal/p_winsys_screen.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 
 #include "sp_context.h"
 #include "sp_state.h"
 #include "sp_texture.h"
-#include "sp_tile_cache.h"
 #include "sp_screen.h"
 #include "sp_winsys.h"
 
 
-/* Simple, maximally packed layout.
- */
-
-
-/* Conventional allocation path for non-display textures:
+/**
+ * Conventional allocation path for non-display textures:
+ * Use a simple, maximally packed layout.
  */
 static boolean
 softpipe_texture_layout(struct pipe_screen *screen,
@@ -89,6 +84,10 @@ softpipe_texture_layout(struct pipe_screen *screen,
    return spt->buffer != NULL;
 }
 
+
+/**
+ * Texture layout for simple color buffers.
+ */
 static boolean
 softpipe_displaytarget_layout(struct pipe_screen *screen,
                               struct softpipe_texture * spt)
@@ -112,21 +111,22 @@ softpipe_displaytarget_layout(struct pipe_screen *screen,
 }
 
 
-
-
-
 static struct pipe_texture *
 softpipe_texture_create(struct pipe_screen *screen,
-                        const struct pipe_texture *templat)
+                        const struct pipe_texture *template)
 {
    struct softpipe_texture *spt = CALLOC_STRUCT(softpipe_texture);
    if (!spt)
       return NULL;
 
-   spt->base = *templat;
+   spt->base = *template;
    pipe_reference_init(&spt->base.reference, 1);
    spt->base.screen = screen;
 
+   spt->pot = (util_is_power_of_two(template->width[0]) &&
+               util_is_power_of_two(template->height[0]) &&
+               util_is_power_of_two(template->depth[0]));
+
    if (spt->base.tex_usage & (PIPE_TEXTURE_USAGE_DISPLAY_TARGET |
                               PIPE_TEXTURE_USAGE_PRIMARY)) {
       if (!softpipe_displaytarget_layout(screen, spt))
@@ -222,6 +222,13 @@ softpipe_get_tex_surface(struct pipe_screen *screen,
       if (ps->usage & PIPE_BUFFER_USAGE_GPU_READ)
          ps->usage |= PIPE_BUFFER_USAGE_CPU_READ;
 
+      if (ps->usage & (PIPE_BUFFER_USAGE_CPU_WRITE |
+                       PIPE_BUFFER_USAGE_GPU_WRITE)) {
+         /* Mark the surface as dirty.  The tile cache will look for this. */
+         spt->timestamp++;
+         softpipe_screen(screen)->timestamp++;
+      }
+
       ps->face = face;
       ps->level = level;
       ps->zslice = zslice;
@@ -342,14 +349,13 @@ softpipe_transfer_map( struct pipe_screen *screen,
    /* May want to different things here depending on read/write nature
     * of the map:
     */
-   if (transfer->texture && transfer->usage != PIPE_TRANSFER_READ) 
-   {
+   if (transfer->texture && transfer->usage != PIPE_TRANSFER_READ) {
       /* Do something to notify sharing contexts of a texture change.
        * In softpipe, that would mean flushing the texture cache.
        */
       softpipe_screen(screen)->timestamp++;
    }
-   
+
    xfer_map = map + softpipe_transfer(transfer)->offset +
       transfer->y / transfer->block.height * transfer->stride +
       transfer->x / transfer->block.width * transfer->block.size;
@@ -360,7 +366,7 @@ softpipe_transfer_map( struct pipe_screen *screen,
 
 static void
 softpipe_transfer_unmap(struct pipe_screen *screen,
-                       struct pipe_transfer *transfer)
+                        struct pipe_transfer *transfer)
 {
    struct softpipe_texture *spt;
 
@@ -371,14 +377,60 @@ softpipe_transfer_unmap(struct pipe_screen *screen,
 
    if (transfer->usage != PIPE_TRANSFER_READ) {
       /* Mark the texture as dirty to expire the tile caches. */
-      spt->modified = TRUE;
+      spt->timestamp++;
    }
 }
 
+static struct pipe_video_surface*
+softpipe_video_surface_create(struct pipe_screen *screen,
+                              enum pipe_video_chroma_format chroma_format,
+                              unsigned width, unsigned height)
+{
+   struct softpipe_video_surface *sp_vsfc;
+   struct pipe_texture template;
 
-void
-softpipe_init_texture_funcs(struct softpipe_context *sp)
+   assert(screen);
+   assert(width && height);
+
+   sp_vsfc = CALLOC_STRUCT(softpipe_video_surface);
+   if (!sp_vsfc)
+      return NULL;
+
+   pipe_reference_init(&sp_vsfc->base.reference, 1);
+   sp_vsfc->base.screen = screen;
+   sp_vsfc->base.chroma_format = chroma_format;
+   /*sp_vsfc->base.surface_format = PIPE_VIDEO_SURFACE_FORMAT_VUYA;*/
+   sp_vsfc->base.width = width;
+   sp_vsfc->base.height = height;
+
+   memset(&template, 0, sizeof(struct pipe_texture));
+   template.target = PIPE_TEXTURE_2D;
+   template.format = PIPE_FORMAT_X8R8G8B8_UNORM;
+   template.last_level = 0;
+   /* vl_mpeg12_mc_renderer expects this when it's initialized with pot_buffers=true */
+   template.width[0] = util_next_power_of_two(width);
+   template.height[0] = util_next_power_of_two(height);
+   template.depth[0] = 1;
+   pf_get_block(template.format, &template.block);
+   template.tex_usage = PIPE_TEXTURE_USAGE_SAMPLER | PIPE_TEXTURE_USAGE_RENDER_TARGET;
+
+   sp_vsfc->tex = screen->texture_create(screen, &template);
+   if (!sp_vsfc->tex) {
+      FREE(sp_vsfc);
+      return NULL;
+   }
+
+   return &sp_vsfc->base;
+}
+
+
+static void
+softpipe_video_surface_destroy(struct pipe_video_surface *vsfc)
 {
+   struct softpipe_video_surface *sp_vsfc = softpipe_video_surface(vsfc);
+
+   pipe_texture_reference(&sp_vsfc->tex, NULL);
+   FREE(sp_vsfc);
 }
 
 
@@ -396,6 +448,9 @@ softpipe_init_screen_texture_funcs(struct pipe_screen *screen)
    screen->tex_transfer_destroy = softpipe_tex_transfer_destroy;
    screen->transfer_map = softpipe_transfer_map;
    screen->transfer_unmap = softpipe_transfer_unmap;
+
+   screen->video_surface_create = softpipe_video_surface_create;
+   screen->video_surface_destroy = softpipe_video_surface_destroy;
 }
 
 
@@ -404,7 +459,7 @@ softpipe_get_texture_buffer( struct pipe_texture *texture,
                              struct pipe_buffer **buf,
                              unsigned *stride )
 {
-   struct softpipe_texture *tex = (struct softpipe_texture *)texture;
+   struct softpipe_texture *tex = (struct softpipe_texture *) texture;
 
    if (!tex)
       return FALSE;
diff --git a/src/gallium/drivers/softpipe/sp_texture.h b/src/gallium/drivers/softpipe/sp_texture.h
index 893aa7d11d..2ef64e1e7c 100644
--- a/src/gallium/drivers/softpipe/sp_texture.h
+++ b/src/gallium/drivers/softpipe/sp_texture.h
@@ -30,6 +30,7 @@
 
 
 #include "pipe/p_state.h"
+#include "pipe/p_video_state.h"
 
 
 struct pipe_context;
@@ -48,7 +49,11 @@ struct softpipe_texture
     */
    struct pipe_buffer *buffer;
 
-   boolean modified;
+   /* True if texture images are power-of-two in all dimensions:
+    */
+   boolean pot;
+
+   unsigned timestamp;
 };
 
 struct softpipe_transfer
@@ -58,6 +63,15 @@ struct softpipe_transfer
    unsigned long offset;
 };
 
+struct softpipe_video_surface
+{
+   struct pipe_video_surface base;
+
+   /* The data is held here:
+    */
+   struct pipe_texture *tex;
+};
+
 
 /** cast wrappers */
 static INLINE struct softpipe_texture *
@@ -72,9 +86,12 @@ softpipe_transfer(struct pipe_transfer *pt)
    return (struct softpipe_transfer *) pt;
 }
 
+static INLINE struct softpipe_video_surface *
+softpipe_video_surface(struct pipe_video_surface *pvs)
+{
+   return (struct softpipe_video_surface *) pvs;
+}
 
-extern void
-softpipe_init_texture_funcs( struct softpipe_context *softpipe );
 
 extern void
 softpipe_init_screen_texture_funcs(struct pipe_screen *screen);
diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.c b/src/gallium/drivers/softpipe/sp_tile_cache.c
index 5f7864e671..83fb4e0d15 100644
--- a/src/gallium/drivers/softpipe/sp_tile_cache.c
+++ b/src/gallium/drivers/softpipe/sp_tile_cache.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 /**
- * Texture tile caching.
+ * Render target tile caching.
  *
  * Author:
  *    Brian Paul
@@ -35,38 +35,8 @@
 #include "pipe/p_inlines.h"
 #include "util/u_memory.h"
 #include "util/u_tile.h"
-#include "sp_context.h"
-#include "sp_surface.h"
-#include "sp_texture.h"
 #include "sp_tile_cache.h"
 
-#define NUM_ENTRIES 50
-
-
-/** XXX move these */
-#define MAX_WIDTH 4096
-#define MAX_HEIGHT 4096
-
-
-struct softpipe_tile_cache
-{
-   struct pipe_screen *screen;
-   struct pipe_surface *surface;  /**< the surface we're caching */
-   struct pipe_transfer *transfer;
-   void *transfer_map;
-   struct pipe_texture *texture;  /**< if caching a texture */
-   struct softpipe_cached_tile entries[NUM_ENTRIES];
-   uint clear_flags[(MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE) / 32];
-   float clear_color[4];  /**< for color bufs */
-   uint clear_val;        /**< for z+stencil, or packed color clear value */
-   boolean depth_stencil; /**< Is the surface a depth/stencil format? */
-
-   struct pipe_transfer *tex_trans;
-   void *tex_trans_map;
-   int tex_face, tex_level, tex_z;
-
-   struct softpipe_cached_tile tile;  /**< scratch tile for clears */
-};
 
 
 /**
@@ -76,7 +46,7 @@ struct softpipe_tile_cache
  * a LRU replacement policy.
  */
 #define CACHE_POS(x, y) \
-   (((x) / TILE_SIZE + ((y) / TILE_SIZE) * 5) % NUM_ENTRIES)
+   (((x) + (y) * 5) % NUM_ENTRIES)
 
 
 
@@ -84,12 +54,10 @@ struct softpipe_tile_cache
  * Is the tile at (x,y) in cleared state?
  */
 static INLINE uint
-is_clear_flag_set(const uint *bitvec, int x, int y)
+is_clear_flag_set(const uint *bitvec, union tile_address addr)
 {
    int pos, bit;
-   x /= TILE_SIZE;
-   y /= TILE_SIZE;
-   pos = y * (MAX_WIDTH / TILE_SIZE) + x;
+   pos = addr.bits.y * (MAX_WIDTH / TILE_SIZE) + addr.bits.x;
    assert(pos / 32 < (MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE) / 32);
    bit = bitvec[pos / 32] & (1 << (pos & 31));
    return bit;
@@ -100,12 +68,10 @@ is_clear_flag_set(const uint *bitvec, int x, int y)
  * Mark the tile at (x,y) as not cleared.
  */
 static INLINE void
-clear_clear_flag(uint *bitvec, int x, int y)
+clear_clear_flag(uint *bitvec, union tile_address addr)
 {
    int pos;
-   x /= TILE_SIZE;
-   y /= TILE_SIZE;
-   pos = y * (MAX_WIDTH / TILE_SIZE) + x;
+   pos = addr.bits.y * (MAX_WIDTH / TILE_SIZE) + addr.bits.x;
    assert(pos / 32 < (MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE) / 32);
    bitvec[pos / 32] &= ~(1 << (pos & 31));
 }
@@ -127,9 +93,9 @@ sp_create_tile_cache( struct pipe_screen *screen )
    if (tc) {
       tc->screen = screen;
       for (pos = 0; pos < NUM_ENTRIES; pos++) {
-         tc->entries[pos].x =
-         tc->entries[pos].y = -1;
+         tc->entries[pos].addr.bits.invalid = 1;
       }
+      tc->last_tile = &tc->entries[0]; /* any tile */
 
 #if TILE_CLEAR_OPTIMIZATION
       /* set flags to indicate all the tiles are cleared */
@@ -153,10 +119,6 @@ sp_destroy_tile_cache(struct softpipe_tile_cache *tc)
       screen = tc->transfer->texture->screen;
       screen->tex_transfer_destroy(tc->transfer);
    }
-   if (tc->tex_trans) {
-      screen = tc->tex_trans->texture->screen;
-      screen->tex_transfer_destroy(tc->tex_trans);
-   }
 
    FREE( tc );
 }
@@ -169,8 +131,6 @@ void
 sp_tile_cache_set_surface(struct softpipe_tile_cache *tc,
                           struct pipe_surface *ps)
 {
-   assert(!tc->texture);
-
    if (tc->transfer) {
       struct pipe_screen *screen = tc->transfer->texture->screen;
 
@@ -222,9 +182,6 @@ sp_tile_cache_map_transfers(struct softpipe_tile_cache *tc)
 {
    if (tc->transfer && !tc->transfer_map)
       tc->transfer_map = tc->screen->transfer_map(tc->screen, tc->transfer);
-
-   if (tc->tex_trans && !tc->tex_trans_map)
-      tc->tex_trans_map = tc->screen->transfer_map(tc->screen, tc->tex_trans);
 }
 
 
@@ -235,47 +192,6 @@ sp_tile_cache_unmap_transfers(struct softpipe_tile_cache *tc)
       tc->screen->transfer_unmap(tc->screen, tc->transfer);
       tc->transfer_map = NULL;
    }
-
-   if (tc->tex_trans_map) {
-      tc->screen->transfer_unmap(tc->screen, tc->tex_trans);
-      tc->tex_trans_map = NULL;
-   }
-}
-
-
-/**
- * Specify the texture to cache.
- */
-void
-sp_tile_cache_set_texture(struct pipe_context *pipe,
-                          struct softpipe_tile_cache *tc,
-                          struct pipe_texture *texture)
-{
-   uint i;
-
-   assert(!tc->transfer);
-
-   pipe_texture_reference(&tc->texture, texture);
-
-   if (tc->tex_trans) {
-      struct pipe_screen *screen = tc->tex_trans->texture->screen;
-
-      if (tc->tex_trans_map) {
-         screen->transfer_unmap(screen, tc->tex_trans);
-         tc->tex_trans_map = NULL;
-      }
-
-      screen->tex_transfer_destroy(tc->tex_trans);
-      tc->tex_trans = NULL;
-   }
-
-   /* mark as entries as invalid/empty */
-   /* XXX we should try to avoid this when the teximage hasn't changed */
-   for (i = 0; i < NUM_ENTRIES; i++) {
-      tc->entries[i].x = -1;
-   }
-
-   tc->tex_face = -1; /* any invalid value here */
 }
 
 
@@ -319,7 +235,7 @@ clear_tile(struct softpipe_cached_tile *tile,
 
    switch (pf_get_size(format)) {
    case 1:
-      memset(tile->data.any, 0, TILE_SIZE * TILE_SIZE);
+      memset(tile->data.any, clear_value, TILE_SIZE * TILE_SIZE);
       break;
    case 2:
       if (clear_value == 0) {
@@ -355,8 +271,7 @@ clear_tile(struct softpipe_cached_tile *tile,
  * Actually clear the tiles which were flagged as being in a clear state.
  */
 static void
-sp_tile_cache_flush_clear(struct pipe_context *pipe,
-                          struct softpipe_tile_cache *tc)
+sp_tile_cache_flush_clear(struct softpipe_tile_cache *tc)
 {
    struct pipe_transfer *pt = tc->transfer;
    const uint w = tc->transfer->width;
@@ -370,13 +285,15 @@ sp_tile_cache_flush_clear(struct pipe_context *pipe,
    /* push the tile to all positions marked as clear */
    for (y = 0; y < h; y += TILE_SIZE) {
       for (x = 0; x < w; x += TILE_SIZE) {
-         if (is_clear_flag_set(tc->clear_flags, x, y)) {
+         union tile_address addr = tile_address(x, y);
+
+         if (is_clear_flag_set(tc->clear_flags, addr)) {
             pipe_put_tile_raw(pt,
                               x, y, TILE_SIZE, TILE_SIZE,
                               tc->tile.data.color32, 0/*STRIDE*/);
 
             /* do this? */
-            clear_clear_flag(tc->clear_flags, x, y);
+            clear_clear_flag(tc->clear_flags, addr);
 
             numCleared++;
          }
@@ -393,8 +310,7 @@ sp_tile_cache_flush_clear(struct pipe_context *pipe,
  * any tiles "flagged" as cleared will be "really" cleared.
  */
 void
-sp_flush_tile_cache(struct softpipe_context *softpipe,
-                    struct softpipe_tile_cache *tc)
+sp_flush_tile_cache(struct softpipe_tile_cache *tc)
 {
    struct pipe_transfer *pt = tc->transfer;
    int inuse = 0, pos;
@@ -403,33 +319,30 @@ sp_flush_tile_cache(struct softpipe_context *softpipe,
       /* caching a drawing transfer */
       for (pos = 0; pos < NUM_ENTRIES; pos++) {
          struct softpipe_cached_tile *tile = tc->entries + pos;
-         if (tile->x >= 0) {
+         if (!tile->addr.bits.invalid) {
             if (tc->depth_stencil) {
                pipe_put_tile_raw(pt,
-                                 tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                                 tile->addr.bits.x * TILE_SIZE, 
+                                 tile->addr.bits.y * TILE_SIZE, 
+                                 TILE_SIZE, TILE_SIZE,
                                  tile->data.depth32, 0/*STRIDE*/);
             }
             else {
                pipe_put_tile_rgba(pt,
-                                  tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                                  tile->addr.bits.x * TILE_SIZE, 
+                                  tile->addr.bits.y * TILE_SIZE, 
+                                  TILE_SIZE, TILE_SIZE,
                                   (float *) tile->data.color);
             }
-            tile->x = tile->y = -1;  /* mark as empty */
+            tile->addr.bits.invalid = 1;  /* mark as empty */
             inuse++;
          }
       }
 
 #if TILE_CLEAR_OPTIMIZATION
-      sp_tile_cache_flush_clear(&softpipe->pipe, tc);
+      sp_tile_cache_flush_clear(tc);
 #endif
    }
-   else if (tc->texture) {
-      /* caching a texture, mark all entries as empty */
-      for (pos = 0; pos < NUM_ENTRIES; pos++) {
-         tc->entries[pos].x = -1;
-      }
-      tc->tex_face = -1;
-   }
 
 #if 0
    debug_printf("flushed tiles in use: %d\n", inuse);
@@ -442,40 +355,39 @@ sp_flush_tile_cache(struct softpipe_context *softpipe,
  * \param x, y  position of tile, in pixels
  */
 struct softpipe_cached_tile *
-sp_get_cached_tile(struct softpipe_context *softpipe,
-                   struct softpipe_tile_cache *tc, int x, int y)
+sp_find_cached_tile(struct softpipe_tile_cache *tc, 
+                    union tile_address addr )
 {
    struct pipe_transfer *pt = tc->transfer;
-
-   /* tile pos in framebuffer: */
-   const int tile_x = x & ~(TILE_SIZE - 1);
-   const int tile_y = y & ~(TILE_SIZE - 1);
-
+   
    /* cache pos/entry: */
-   const int pos = CACHE_POS(x, y);
+   const int pos = CACHE_POS(addr.bits.x,
+                             addr.bits.y);
    struct softpipe_cached_tile *tile = tc->entries + pos;
 
-   if (tile_x != tile->x ||
-       tile_y != tile->y) {
+   if (addr.value != tile->addr.value) {
 
-      if (tile->x != -1) {
+      if (tile->addr.bits.invalid == 0) {
          /* put dirty tile back in framebuffer */
          if (tc->depth_stencil) {
             pipe_put_tile_raw(pt,
-                              tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                              tile->addr.bits.x * TILE_SIZE,
+                              tile->addr.bits.y * TILE_SIZE,
+                              TILE_SIZE, TILE_SIZE,
                               tile->data.depth32, 0/*STRIDE*/);
          }
          else {
             pipe_put_tile_rgba(pt,
-                               tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                               tile->addr.bits.x * TILE_SIZE,
+                               tile->addr.bits.y * TILE_SIZE,
+                               TILE_SIZE, TILE_SIZE,
                                (float *) tile->data.color);
          }
       }
 
-      tile->x = tile_x;
-      tile->y = tile_y;
+      tile->addr = addr;
 
-      if (is_clear_flag_set(tc->clear_flags, x, y)) {
+      if (is_clear_flag_set(tc->clear_flags, addr)) {
          /* don't get tile from framebuffer, just clear it */
          if (tc->depth_stencil) {
             clear_tile(tile, pt->format, tc->clear_val);
@@ -483,125 +395,33 @@ sp_get_cached_tile(struct softpipe_context *softpipe,
          else {
             clear_tile_rgba(tile, pt->format, tc->clear_color);
          }
-         clear_clear_flag(tc->clear_flags, x, y);
+         clear_clear_flag(tc->clear_flags, addr);
       }
       else {
          /* get new tile data from transfer */
          if (tc->depth_stencil) {
             pipe_get_tile_raw(pt,
-                              tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                              tile->addr.bits.x * TILE_SIZE, 
+                              tile->addr.bits.y * TILE_SIZE, 
+                              TILE_SIZE, TILE_SIZE,
                               tile->data.depth32, 0/*STRIDE*/);
          }
          else {
             pipe_get_tile_rgba(pt,
-                               tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                               tile->addr.bits.x * TILE_SIZE, 
+                               tile->addr.bits.y * TILE_SIZE,
+                               TILE_SIZE, TILE_SIZE,
                                (float *) tile->data.color);
          }
       }
    }
 
+   tc->last_tile = tile;
    return tile;
 }
 
 
-/**
- * Given the texture face, level, zslice, x and y values, compute
- * the cache entry position/index where we'd hope to find the
- * cached texture tile.
- * This is basically a direct-map cache.
- * XXX There's probably lots of ways in which we can improve this.
- */
-static INLINE uint
-tex_cache_pos(int x, int y, int z, int face, int level)
-{
-   uint entry = x + y * 9 + z * 3 + face + level * 7;
-   return entry % NUM_ENTRIES;
-}
-
 
-/**
- * Similar to sp_get_cached_tile() but for textures.
- * Tiles are read-only and indexed with more params.
- */
-const struct softpipe_cached_tile *
-sp_get_cached_tile_tex(struct softpipe_context *sp,
-                       struct softpipe_tile_cache *tc, int x, int y, int z,
-                       int face, int level)
-{
-   struct pipe_screen *screen = sp->pipe.screen;
-   /* tile pos in framebuffer: */
-   const int tile_x = x & ~(TILE_SIZE - 1);
-   const int tile_y = y & ~(TILE_SIZE - 1);
-   /* cache pos/entry: */
-   const uint pos = tex_cache_pos(x / TILE_SIZE, y / TILE_SIZE, z,
-                                  face, level);
-   struct softpipe_cached_tile *tile = tc->entries + pos;
-
-   if (tc->texture) {
-      struct softpipe_texture *spt = softpipe_texture(tc->texture);
-      if (spt->modified) {
-         /* texture was modified, invalidate all cached tiles */
-         uint p;
-         for (p = 0; p < NUM_ENTRIES; p++) {
-            tile = tc->entries + p;
-            tile->x = -1;
-         }
-         spt->modified = FALSE;
-      }
-   }
-
-   if (tile_x != tile->x ||
-       tile_y != tile->y ||
-       z != tile->z ||
-       face != tile->face ||
-       level != tile->level) {
-      /* cache miss */
-
-#if 0
-      printf("miss at %u  x=%d y=%d z=%d face=%d level=%d\n", pos,
-             x/TILE_SIZE, y/TILE_SIZE, z, face, level);
-#endif
-      /* check if we need to get a new transfer */
-      if (!tc->tex_trans ||
-          tc->tex_face != face ||
-          tc->tex_level != level ||
-          tc->tex_z != z) {
-         /* get new transfer (view into texture) */
-
-         if (tc->tex_trans) {
-            if (tc->tex_trans_map) {
-               tc->screen->transfer_unmap(tc->screen, tc->tex_trans);
-               tc->tex_trans_map = NULL;
-            }
-
-            screen->tex_transfer_destroy(tc->tex_trans);
-            tc->tex_trans = NULL;
-         }
-
-         tc->tex_trans = screen->get_tex_transfer(screen, tc->texture, face, level, z, 
-                                                  PIPE_TRANSFER_READ, 0, 0,
-                                                  tc->texture->width[level],
-                                                  tc->texture->height[level]);
-         tc->tex_trans_map = screen->transfer_map(screen, tc->tex_trans);
-
-         tc->tex_face = face;
-         tc->tex_level = level;
-         tc->tex_z = z;
-      }
-
-      /* get tile from the transfer (view into texture) */
-      pipe_get_tile_rgba(tc->tex_trans,
-                         tile_x, tile_y, TILE_SIZE, TILE_SIZE,
-                         (float *) tile->data.color);
-      tile->x = tile_x;
-      tile->y = tile_y;
-      tile->z = z;
-      tile->face = face;
-      tile->level = level;
-   }
-
-   return tile;
-}
 
 
 /**
@@ -632,6 +452,6 @@ sp_tile_cache_clear(struct softpipe_tile_cache *tc, const float *rgba,
 
    for (pos = 0; pos < NUM_ENTRIES; pos++) {
       struct softpipe_cached_tile *tile = tc->entries + pos;
-      tile->x = tile->y = -1;
+      tile->addr.bits.invalid = 1;
    }
 }
diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.h b/src/gallium/drivers/softpipe/sp_tile_cache.h
index 8f247d0e58..a12092702a 100644
--- a/src/gallium/drivers/softpipe/sp_tile_cache.h
+++ b/src/gallium/drivers/softpipe/sp_tile_cache.h
@@ -34,7 +34,6 @@
 #include "pipe/p_compiler.h"
 
 
-struct softpipe_context;
 struct softpipe_tile_cache;
 
 
@@ -44,11 +43,23 @@ struct softpipe_tile_cache;
 #define TILE_SIZE 64
 
 
+/* If we need to support > 4096, just expand this to be a 64 bit
+ * union, or consider tiling in Z as well.
+ */
+union tile_address {
+   struct {
+      unsigned x:6;             /* 4096 / TILE_SIZE */
+      unsigned y:6;             /* 4096 / TILE_SIZE */
+      unsigned invalid:1;
+      unsigned pad:19;
+   } bits;
+   unsigned value;
+};
+
 
 struct softpipe_cached_tile
 {
-   int x, y;           /**< pos of tile in window coords */
-   int z, face, level; /**< Extra texture indexes */
+   union tile_address addr;
    union {
       float color[TILE_SIZE][TILE_SIZE][4];
       uint color32[TILE_SIZE][TILE_SIZE];
@@ -59,6 +70,32 @@ struct softpipe_cached_tile
    } data;
 };
 
+#define NUM_ENTRIES 50
+
+
+/** XXX move these */
+#define MAX_WIDTH 4096
+#define MAX_HEIGHT 4096
+
+
+struct softpipe_tile_cache
+{
+   struct pipe_screen *screen;
+   struct pipe_surface *surface;  /**< the surface we're caching */
+   struct pipe_transfer *transfer;
+   void *transfer_map;
+
+   struct softpipe_cached_tile entries[NUM_ENTRIES];
+   uint clear_flags[(MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE) / 32];
+   float clear_color[4];  /**< for color bufs */
+   uint clear_val;        /**< for z+stencil, or packed color clear value */
+   boolean depth_stencil; /**< Is the surface a depth/stencil format? */
+
+   struct softpipe_cached_tile tile;  /**< scratch tile for clears */
+
+   struct softpipe_cached_tile *last_tile;  /**< most recently retrieved tile */
+};
+
 
 extern struct softpipe_tile_cache *
 sp_create_tile_cache( struct pipe_screen *screen );
@@ -80,26 +117,45 @@ extern void
 sp_tile_cache_unmap_transfers(struct softpipe_tile_cache *tc);
 
 extern void
-sp_tile_cache_set_texture(struct pipe_context *pipe,
-                          struct softpipe_tile_cache *tc,
-                          struct pipe_texture *texture);
-
-extern void
-sp_flush_tile_cache(struct softpipe_context *softpipe,
-                    struct softpipe_tile_cache *tc);
+sp_flush_tile_cache(struct softpipe_tile_cache *tc);
 
 extern void
 sp_tile_cache_clear(struct softpipe_tile_cache *tc, const float *rgba,
                     uint clearValue);
 
 extern struct softpipe_cached_tile *
-sp_get_cached_tile(struct softpipe_context *softpipe,
-                   struct softpipe_tile_cache *tc, int x, int y);
+sp_find_cached_tile(struct softpipe_tile_cache *tc, 
+                    union tile_address addr );
+
+
+static INLINE union tile_address
+tile_address( unsigned x,
+              unsigned y )
+{
+   union tile_address addr;
+
+   addr.value = 0;
+   addr.bits.x = x / TILE_SIZE;
+   addr.bits.y = y / TILE_SIZE;
+      
+   return addr;
+}
+
+/* Quickly retrieve tile if it matches last lookup.
+ */
+static INLINE struct softpipe_cached_tile *
+sp_get_cached_tile(struct softpipe_tile_cache *tc, 
+                   int x, int y )
+{
+   union tile_address addr = tile_address( x, y );
+
+   if (tc->last_tile->addr.value == addr.value)
+      return tc->last_tile;
+
+   return sp_find_cached_tile( tc, addr );
+}
+
 
-extern const struct softpipe_cached_tile *
-sp_get_cached_tile_tex(struct softpipe_context *softpipe,
-                       struct softpipe_tile_cache *tc, int x, int y, int z,
-                       int face, int level);
 
 
 #endif /* SP_TILE_CACHE_H */
diff --git a/src/gallium/drivers/softpipe/sp_video_context.c b/src/gallium/drivers/softpipe/sp_video_context.c
new file mode 100644
index 0000000000..ccb29726b6
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_video_context.c
@@ -0,0 +1,268 @@
+#include "sp_video_context.h"
+#include <pipe/p_inlines.h>
+#include <util/u_memory.h>
+#include "softpipe/sp_winsys.h"
+#include "softpipe/sp_texture.h"
+
+static void
+sp_mpeg12_destroy(struct pipe_video_context *vpipe)
+{
+   struct sp_mpeg12_context *ctx = (struct sp_mpeg12_context*)vpipe;
+
+   assert(vpipe);
+	
+   /* Asserted in softpipe_delete_fs_state() for some reason */
+   ctx->pipe->bind_vs_state(ctx->pipe, NULL);
+   ctx->pipe->bind_fs_state(ctx->pipe, NULL);
+
+   ctx->pipe->delete_blend_state(ctx->pipe, ctx->blend);
+   ctx->pipe->delete_rasterizer_state(ctx->pipe, ctx->rast);
+   ctx->pipe->delete_depth_stencil_alpha_state(ctx->pipe, ctx->dsa);
+
+   pipe_video_surface_reference(&ctx->decode_target, NULL);
+   vl_compositor_cleanup(&ctx->compositor);
+   vl_mpeg12_mc_renderer_cleanup(&ctx->mc_renderer);
+   ctx->pipe->destroy(ctx->pipe);
+
+   FREE(ctx);
+}
+
+static void
+sp_mpeg12_decode_macroblocks(struct pipe_video_context *vpipe,
+                             struct pipe_video_surface *past,
+                             struct pipe_video_surface *future,
+                             unsigned num_macroblocks,
+                             struct pipe_macroblock *macroblocks,
+                             struct pipe_fence_handle **fence)
+{
+   struct sp_mpeg12_context *ctx = (struct sp_mpeg12_context*)vpipe;
+   struct pipe_mpeg12_macroblock *mpeg12_macroblocks = (struct pipe_mpeg12_macroblock*)macroblocks;
+
+   assert(vpipe);
+   assert(num_macroblocks);
+   assert(macroblocks);
+   assert(macroblocks->codec == PIPE_VIDEO_CODEC_MPEG12);
+   assert(ctx->decode_target);
+
+   vl_mpeg12_mc_renderer_render_macroblocks(&ctx->mc_renderer,
+                                            softpipe_video_surface(ctx->decode_target)->tex,
+                                            past ? softpipe_video_surface(past)->tex : NULL,
+                                            future ? softpipe_video_surface(future)->tex : NULL,
+                                            num_macroblocks, mpeg12_macroblocks, fence);
+}
+
+static void
+sp_mpeg12_clear_surface(struct pipe_video_context *vpipe,
+                        unsigned x, unsigned y,
+                        unsigned width, unsigned height,
+                        unsigned value,
+                        struct pipe_surface *surface)
+{
+   struct sp_mpeg12_context *ctx = (struct sp_mpeg12_context*)vpipe;
+
+   assert(vpipe);
+   assert(surface);
+
+   ctx->pipe->surface_fill(ctx->pipe, surface, x, y, width, height, value);
+}
+
+static void
+sp_mpeg12_render_picture(struct pipe_video_context     *vpipe,
+                         /*struct pipe_surface         *backround,
+                         struct pipe_video_rect        *backround_area,*/
+                         struct pipe_video_surface     *src_surface,
+                         enum pipe_mpeg12_picture_type picture_type,
+                         /*unsigned                    num_past_surfaces,
+                         struct pipe_video_surface     *past_surfaces,
+                         unsigned                      num_future_surfaces,
+                         struct pipe_video_surface     *future_surfaces,*/
+                         struct pipe_video_rect        *src_area,
+                         struct pipe_surface           *dst_surface,
+                         struct pipe_video_rect        *dst_area,
+                         /*unsigned                      num_layers,
+                         struct pipe_surface           *layers,
+                         struct pipe_video_rect        *layer_src_areas,
+                         struct pipe_video_rect        *layer_dst_areas*/
+                         struct pipe_fence_handle      **fence)
+{
+   struct sp_mpeg12_context *ctx = (struct sp_mpeg12_context*)vpipe;
+	
+   assert(vpipe);
+   assert(src_surface);
+   assert(src_area);
+   assert(dst_surface);
+   assert(dst_area);
+	
+   vl_compositor_render(&ctx->compositor, softpipe_video_surface(src_surface)->tex,
+                        picture_type, src_area, dst_surface->texture, dst_area, fence);
+}
+
+static void
+sp_mpeg12_set_decode_target(struct pipe_video_context *vpipe,
+                            struct pipe_video_surface *dt)
+{
+   struct sp_mpeg12_context *ctx = (struct sp_mpeg12_context*)vpipe;
+
+   assert(vpipe);
+   assert(dt);
+
+   pipe_video_surface_reference(&ctx->decode_target, dt);
+}
+
+static bool
+init_pipe_state(struct sp_mpeg12_context *ctx)
+{
+   struct pipe_rasterizer_state rast;
+   struct pipe_blend_state blend;
+   struct pipe_depth_stencil_alpha_state dsa;
+   unsigned i;
+
+   assert(ctx);
+	
+   rast.flatshade = 1;
+   rast.flatshade_first = 0;
+   rast.light_twoside = 0;
+   rast.front_winding = PIPE_WINDING_CCW;
+   rast.cull_mode = PIPE_WINDING_CW;
+   rast.fill_cw = PIPE_POLYGON_MODE_FILL;
+   rast.fill_ccw = PIPE_POLYGON_MODE_FILL;
+   rast.offset_cw = 0;
+   rast.offset_ccw = 0;
+   rast.scissor = 0;
+   rast.poly_smooth = 0;
+   rast.poly_stipple_enable = 0;
+   rast.point_sprite = 0;
+   rast.point_size_per_vertex = 0;
+   rast.multisample = 0;
+   rast.line_smooth = 0;
+   rast.line_stipple_enable = 0;
+   rast.line_stipple_factor = 0;
+   rast.line_stipple_pattern = 0;
+   rast.line_last_pixel = 0;
+   rast.bypass_vs_clip_and_viewport = 0;
+   rast.line_width = 1;
+   rast.point_smooth = 0;
+   rast.point_size = 1;
+   rast.offset_units = 1;
+   rast.offset_scale = 1;
+   /*rast.sprite_coord_mode[i] = ;*/
+   ctx->rast = ctx->pipe->create_rasterizer_state(ctx->pipe, &rast);
+   ctx->pipe->bind_rasterizer_state(ctx->pipe, ctx->rast);
+
+   blend.blend_enable = 0;
+   blend.rgb_func = PIPE_BLEND_ADD;
+   blend.rgb_src_factor = PIPE_BLENDFACTOR_ONE;
+   blend.rgb_dst_factor = PIPE_BLENDFACTOR_ONE;
+   blend.alpha_func = PIPE_BLEND_ADD;
+   blend.alpha_src_factor = PIPE_BLENDFACTOR_ONE;
+   blend.alpha_dst_factor = PIPE_BLENDFACTOR_ONE;
+   blend.logicop_enable = 0;
+   blend.logicop_func = PIPE_LOGICOP_CLEAR;
+   /* Needed to allow color writes to FB, even if blending disabled */
+   blend.colormask = PIPE_MASK_RGBA;
+   blend.dither = 0;
+   ctx->blend = ctx->pipe->create_blend_state(ctx->pipe, &blend);
+   ctx->pipe->bind_blend_state(ctx->pipe, ctx->blend);
+
+   dsa.depth.enabled = 0;
+   dsa.depth.writemask = 0;
+   dsa.depth.func = PIPE_FUNC_ALWAYS;
+   dsa.depth.occlusion_count = 0;
+   for (i = 0; i < 2; ++i) {
+      dsa.stencil[i].enabled = 0;
+      dsa.stencil[i].func = PIPE_FUNC_ALWAYS;
+      dsa.stencil[i].fail_op = PIPE_STENCIL_OP_KEEP;
+      dsa.stencil[i].zpass_op = PIPE_STENCIL_OP_KEEP;
+      dsa.stencil[i].zfail_op = PIPE_STENCIL_OP_KEEP;
+      dsa.stencil[i].ref_value = 0;
+      dsa.stencil[i].valuemask = 0;
+      dsa.stencil[i].writemask = 0;
+   }
+   dsa.alpha.enabled = 0;
+   dsa.alpha.func = PIPE_FUNC_ALWAYS;
+   dsa.alpha.ref_value = 0;
+   ctx->dsa = ctx->pipe->create_depth_stencil_alpha_state(ctx->pipe, &dsa);
+   ctx->pipe->bind_depth_stencil_alpha_state(ctx->pipe, ctx->dsa);
+	
+   return true;
+}
+
+static struct pipe_video_context *
+sp_mpeg12_create(struct pipe_screen *screen, enum pipe_video_profile profile,
+                 enum pipe_video_chroma_format chroma_format,
+                 unsigned width, unsigned height)
+{
+   struct sp_mpeg12_context *ctx;
+
+   assert(u_reduce_video_profile(profile) == PIPE_VIDEO_CODEC_MPEG12);
+
+   ctx = CALLOC_STRUCT(sp_mpeg12_context);
+
+   if (!ctx)
+      return NULL;
+
+   ctx->base.profile = profile;
+   ctx->base.chroma_format = chroma_format;
+   ctx->base.width = width;
+   ctx->base.height = height;
+
+   ctx->base.screen = screen;
+   ctx->base.destroy = sp_mpeg12_destroy;
+   ctx->base.decode_macroblocks = sp_mpeg12_decode_macroblocks;
+   ctx->base.clear_surface = sp_mpeg12_clear_surface;
+   ctx->base.render_picture = sp_mpeg12_render_picture;
+   ctx->base.set_decode_target = sp_mpeg12_set_decode_target;
+
+   ctx->pipe = softpipe_create(screen);
+   if (!ctx->pipe) {
+      FREE(ctx);
+      return NULL;
+   }
+
+   /* TODO: Use slice buffering for softpipe when implemented, no advantage to buffering an entire picture */
+   if (!vl_mpeg12_mc_renderer_init(&ctx->mc_renderer, ctx->pipe,
+                                   width, height, chroma_format,
+                                   VL_MPEG12_MC_RENDERER_BUFFER_PICTURE,
+                                   /* TODO: Use XFER_NONE when implemented */
+                                   VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ONE,
+                                   true)) {
+      ctx->pipe->destroy(ctx->pipe);
+      FREE(ctx);
+      return NULL;
+   }
+	
+   if (!vl_compositor_init(&ctx->compositor, ctx->pipe)) {
+      vl_mpeg12_mc_renderer_cleanup(&ctx->mc_renderer);
+      ctx->pipe->destroy(ctx->pipe);
+      FREE(ctx);
+      return NULL;
+   }
+	
+   if (!init_pipe_state(ctx)) {
+      vl_compositor_cleanup(&ctx->compositor);
+      vl_mpeg12_mc_renderer_cleanup(&ctx->mc_renderer);
+      ctx->pipe->destroy(ctx->pipe);
+      FREE(ctx);
+      return NULL;
+   }
+
+   return &ctx->base;
+}
+
+struct pipe_video_context *
+sp_video_create(struct pipe_screen *screen, enum pipe_video_profile profile,
+                enum pipe_video_chroma_format chroma_format,
+                unsigned width, unsigned height)
+{
+   assert(screen);
+   assert(width && height);
+
+   switch (u_reduce_video_profile(profile)) {
+      case PIPE_VIDEO_CODEC_MPEG12:
+         return sp_mpeg12_create(screen, profile,
+                                 chroma_format,
+                                 width, height);
+      default:
+         return NULL;
+   }
+}
diff --git a/src/gallium/drivers/softpipe/sp_video_context.h b/src/gallium/drivers/softpipe/sp_video_context.h
new file mode 100644
index 0000000000..2c7691c7cb
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_video_context.h
@@ -0,0 +1,30 @@
+#ifndef SP_VIDEO_CONTEXT_H
+#define SP_VIDEO_CONTEXT_H
+
+#include <pipe/p_video_context.h>
+#include <vl/vl_mpeg12_mc_renderer.h>
+#include <vl/vl_compositor.h>
+
+struct pipe_screen;
+struct pipe_context;
+struct pipe_video_surface;
+
+struct sp_mpeg12_context
+{
+   struct pipe_video_context base;
+   struct pipe_context *pipe;
+   struct pipe_video_surface *decode_target;
+   struct vl_mpeg12_mc_renderer mc_renderer;
+   struct vl_compositor compositor;
+
+   void *rast;
+   void *dsa;
+   void *blend;
+};
+
+struct pipe_video_context *
+sp_video_create(struct pipe_screen *screen, enum pipe_video_profile profile,
+                enum pipe_video_chroma_format chroma_format,
+                unsigned width, unsigned height);
+
+#endif /* SP_VIDEO_CONTEXT_H */
diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c
index ae0af4d055..bf470b46ae 100644
--- a/src/gallium/drivers/trace/tr_context.c
+++ b/src/gallium/drivers/trace/tr_context.c
@@ -125,11 +125,11 @@ trace_context_draw_block(struct trace_context *tr_ctx, int flag)
    } else if ((tr_ctx->draw_rule.blocker & flag) &&
               (tr_ctx->draw_blocker & 4)) {
       boolean block = FALSE;
-      debug_printf("%s (%lu %lu) (%lu %lu) (%lu %u) (%lu %u)\n", __FUNCTION__,
-					tr_ctx->draw_rule.fs, tr_ctx->curr.fs,
-					tr_ctx->draw_rule.vs, tr_ctx->curr.vs,
-					tr_ctx->draw_rule.surf, 0,
-					tr_ctx->draw_rule.tex, 0);
+      debug_printf("%s (%p %p) (%p %p) (%p %u) (%p %u)\n", __FUNCTION__,
+                   (void *) tr_ctx->draw_rule.fs, (void *) tr_ctx->curr.fs,
+                   (void *) tr_ctx->draw_rule.vs, (void *) tr_ctx->curr.vs,
+                   (void *) tr_ctx->draw_rule.surf, 0,
+                   (void *) tr_ctx->draw_rule.tex, 0);
       if (tr_ctx->draw_rule.fs &&
           tr_ctx->draw_rule.fs == tr_ctx->curr.fs)
          block = TRUE;
diff --git a/src/gallium/include/pipe/p_config.h b/src/gallium/include/pipe/p_config.h
index de99957d9d..78fe1f4c87 100644
--- a/src/gallium/include/pipe/p_config.h
+++ b/src/gallium/include/pipe/p_config.h
@@ -122,18 +122,22 @@
 
 #if defined(__linux__)
 #define PIPE_OS_LINUX
+#define PIPE_OS_UNIX
 #endif
 
 #if defined(__FreeBSD__)
 #define PIPE_OS_BSD
+#define PIPE_OS_UNIX
 #endif
 
 #if defined(__sun)
 #define PIPE_OS_SOLARIS
+#define PIPE_OS_UNIX
 #endif
 
 #if defined(__APPLE__)
 #define PIPE_OS_APPLE
+#define PIPE_OS_UNIX
 #endif
 
 #if defined(_WIN32) || defined(WIN32)
@@ -142,6 +146,7 @@
 
 #if defined(__HAIKU__)
 #define PIPE_OS_HAIKU
+#define PIPE_OS_UNIX
 #endif
 
 /*
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index b01ab6d137..ad42beff47 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -281,7 +281,7 @@ enum pipe_transfer_usage {
 #define PIPE_CAP_NPOT_TEXTURES           2
 #define PIPE_CAP_TWO_SIDED_STENCIL       3
 #define PIPE_CAP_GLSL                    4  /* XXX need something better */
-#define PIPE_CAP_S3TC                    5
+#define PIPE_CAP_S3TC                    5  /* XXX: deprecated; cap determined via supported sampler formats */
 #define PIPE_CAP_ANISOTROPIC_FILTER      6
 #define PIPE_CAP_POINT_SPRITE            7
 #define PIPE_CAP_MAX_RENDER_TARGETS      8
@@ -315,6 +315,31 @@ enum pipe_transfer_usage {
 #define PIPE_REFERENCED_FOR_READ  (1 << 0)
 #define PIPE_REFERENCED_FOR_WRITE (1 << 1)
 
+
+enum pipe_video_codec
+{
+   PIPE_VIDEO_CODEC_UNKNOWN = 0,
+   PIPE_VIDEO_CODEC_MPEG12,   /**< MPEG1, MPEG2 */
+   PIPE_VIDEO_CODEC_MPEG4,    /**< DIVX, XVID */
+   PIPE_VIDEO_CODEC_VC1,      /**< WMV */
+   PIPE_VIDEO_CODEC_MPEG4_AVC /**< H.264 */
+};
+
+enum pipe_video_profile
+{
+   PIPE_VIDEO_PROFILE_MPEG1,
+   PIPE_VIDEO_PROFILE_MPEG2_SIMPLE,
+   PIPE_VIDEO_PROFILE_MPEG2_MAIN,
+   PIPE_VIDEO_PROFILE_MPEG4_SIMPLE,
+   PIPE_VIDEO_PROFILE_MPEG4_ADVANCED_SIMPLE,
+   PIPE_VIDEO_PROFILE_VC1_SIMPLE,
+   PIPE_VIDEO_PROFILE_VC1_MAIN,
+   PIPE_VIDEO_PROFILE_VC1_ADVANCED,
+   PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE,
+   PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN,
+   PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH
+};
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gallium/include/pipe/p_format.h b/src/gallium/include/pipe/p_format.h
index c4469d4a9e..af23080920 100644
--- a/src/gallium/include/pipe/p_format.h
+++ b/src/gallium/include/pipe/p_format.h
@@ -613,6 +613,24 @@ pf_has_alpha( enum pipe_format format )
    }
 }
 
+enum pipe_video_chroma_format
+{
+   PIPE_VIDEO_CHROMA_FORMAT_420,
+   PIPE_VIDEO_CHROMA_FORMAT_422,
+   PIPE_VIDEO_CHROMA_FORMAT_444
+};
+
+#if 0
+enum pipe_video_surface_format
+{
+   PIPE_VIDEO_SURFACE_FORMAT_NV12,  /**< Planar; Y plane, UV plane */
+   PIPE_VIDEO_SURFACE_FORMAT_YV12,  /**< Planar; Y plane, U plane, V plane */
+   PIPE_VIDEO_SURFACE_FORMAT_YUYV,  /**< Interleaved; Y,U,Y,V,Y,U,Y,V */
+   PIPE_VIDEO_SURFACE_FORMAT_UYVY,  /**< Interleaved; U,Y,V,Y,U,Y,V,Y */
+   PIPE_VIDEO_SURFACE_FORMAT_VUYA   /**< Packed; A31-24|Y23-16|U15-8|V7-0 */
+};
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gallium/include/pipe/p_inlines.h b/src/gallium/include/pipe/p_inlines.h
index a5c1e8270a..30a4aaf409 100644
--- a/src/gallium/include/pipe/p_inlines.h
+++ b/src/gallium/include/pipe/p_inlines.h
@@ -118,7 +118,7 @@ pipe_buffer_write(struct pipe_screen *screen,
                   unsigned offset, unsigned size,
                   const void *data)
 {
-   uint8_t *map;
+   void *map;
    
    assert(offset < buf->size);
    assert(offset + size <= buf->size);
@@ -129,7 +129,7 @@ pipe_buffer_write(struct pipe_screen *screen,
                                PIPE_BUFFER_USAGE_FLUSH_EXPLICIT);
    assert(map);
    if(map) {
-      memcpy(map + offset, data, size);
+      memcpy((uint8_t *)map + offset, data, size);
       pipe_buffer_flush_mapped_range(screen, buf, offset, size);
       pipe_buffer_unmap(screen, buf);
    }
@@ -141,7 +141,7 @@ pipe_buffer_read(struct pipe_screen *screen,
                  unsigned offset, unsigned size,
                  void *data)
 {
-   uint8_t *map;
+   void *map;
    
    assert(offset < buf->size);
    assert(offset + size <= buf->size);
@@ -150,11 +150,31 @@ pipe_buffer_read(struct pipe_screen *screen,
    map = pipe_buffer_map_range(screen, buf, offset, size, PIPE_BUFFER_USAGE_CPU_READ);
    assert(map);
    if(map) {
-      memcpy(data, map + offset, size);
+      memcpy(data, (const uint8_t *)map + offset, size);
       pipe_buffer_unmap(screen, buf);
    }
 }
 
+static INLINE void *
+pipe_transfer_map( struct pipe_transfer *transf )
+{
+   struct pipe_screen *screen = transf->texture->screen;
+   return screen->transfer_map(screen, transf);
+}
+
+static INLINE void
+pipe_transfer_unmap( struct pipe_transfer *transf )
+{
+   struct pipe_screen *screen = transf->texture->screen;
+   screen->transfer_unmap(screen, transf);
+}
+
+static INLINE void
+pipe_transfer_destroy( struct pipe_transfer *transf )
+{
+   struct pipe_screen *screen = transf->texture->screen;
+   screen->tex_transfer_destroy(transf);
+}
 
 #ifdef __cplusplus
 }
diff --git a/src/gallium/include/pipe/p_screen.h b/src/gallium/include/pipe/p_screen.h
index 3f30c52a16..f0a4de5df3 100644
--- a/src/gallium/include/pipe/p_screen.h
+++ b/src/gallium/include/pipe/p_screen.h
@@ -53,7 +53,10 @@ extern "C" {
 struct pipe_fence_handle;
 struct pipe_winsys;
 struct pipe_buffer;
-
+struct pipe_texture;
+struct pipe_surface;
+struct pipe_video_surface;
+struct pipe_transfer;
 
 
 /**
@@ -252,6 +255,17 @@ struct pipe_screen {
 
    void (*buffer_destroy)( struct pipe_buffer *buf );
 
+   /**
+    * Create a video surface suitable for use as a decoding target by the
+    * driver's pipe_video_context.
+    */
+   struct pipe_video_surface*
+   (*video_surface_create)( struct pipe_screen *screen,
+                            enum pipe_video_chroma_format chroma_format,
+                            unsigned width, unsigned height );
+
+   void (*video_surface_destroy)( struct pipe_video_surface *vsfc );
+
 
    /**
     * Do any special operations to ensure frontbuffer contents are
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index 626bedb35a..b59d6b7ae3 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -114,11 +114,29 @@ struct pipe_rasterizer_state
     * the vertex shader, clipping and viewport processing.  Note that
     * a vertex shader is still needed though, to indicate the mapping
     * from vertex elements to fragment shader input semantics.
+    *
+    * XXX: considered for removal.
     */
    unsigned bypass_vs_clip_and_viewport:1;
 
-   unsigned flatshade_first:1;   /**< take color attribute from the first vertex of a primitive */
-   unsigned gl_rasterization_rules:1; /**< enable tweaks for GL rasterization?  */
+   /** 
+    * Use the first vertex of a primitive as the provoking vertex for
+    * flat shading.
+    */
+   unsigned flatshade_first:1;   
+
+   /** 
+    * When true, triangle rasterization uses (0.5, 0.5) pixel centers
+    * for determining pixel ownership.
+    *
+    * When false, triangle rasterization uses (0,0) pixel centers for
+    * determining pixel ownership.
+    *
+    * Triangle rasterization always uses a 'top,left' rule for pixel
+    * ownership, this just alters which point we consider the pixel
+    * center for that test.
+    */
+   unsigned gl_rasterization_rules:1;
 
    float line_width;
    float point_size;           /**< used when no per-vertex size */
@@ -307,7 +325,7 @@ struct pipe_transfer
    unsigned nblocksx;            /**< allocated width in blocks */
    unsigned nblocksy;            /**< allocated height in blocks */
    unsigned stride;              /**< stride in bytes between rows of blocks */
-   unsigned usage;               /**< PIPE_TRANSFER_*  */
+   enum pipe_transfer_usage usage; /**< PIPE_TRANSFER_*  */
 
    struct pipe_texture *texture; /**< texture to transfer to/from  */
    unsigned face;
diff --git a/src/gallium/include/pipe/p_video_context.h b/src/gallium/include/pipe/p_video_context.h
new file mode 100644
index 0000000000..937705ac50
--- /dev/null
+++ b/src/gallium/include/pipe/p_video_context.h
@@ -0,0 +1,92 @@
+#ifndef PIPE_VIDEO_CONTEXT_H
+#define PIPE_VIDEO_CONTEXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <pipe/p_video_state.h>
+
+struct pipe_screen;
+struct pipe_buffer;
+struct pipe_surface;
+struct pipe_video_surface;
+struct pipe_macroblock;
+struct pipe_picture_desc;
+struct pipe_fence_handle;
+
+/**
+ * Gallium video rendering context
+ */
+struct pipe_video_context
+{
+   struct pipe_screen *screen;
+   enum pipe_video_profile profile;
+   enum pipe_video_chroma_format chroma_format;
+   unsigned width;
+   unsigned height;
+
+   void *priv; /**< context private data (for DRI for example) */
+
+   void (*destroy)(struct pipe_video_context *vpipe);
+
+   /**
+    * Picture decoding and displaying
+    */
+   /*@{*/
+   void (*decode_bitstream)(struct pipe_video_context *vpipe,
+                            unsigned num_bufs,
+                            struct pipe_buffer **bitstream_buf);
+
+   void (*decode_macroblocks)(struct pipe_video_context *vpipe,
+                              struct pipe_video_surface *past,
+                              struct pipe_video_surface *future,
+                              unsigned num_macroblocks,
+                              struct pipe_macroblock *macroblocks,
+                              struct pipe_fence_handle **fence);
+
+   void (*clear_surface)(struct pipe_video_context *vpipe,
+                         unsigned x, unsigned y,
+                         unsigned width, unsigned height,
+                         unsigned value,
+                         struct pipe_surface *surface);
+
+   void (*render_picture)(struct pipe_video_context     *vpipe,
+                          /*struct pipe_surface         *backround,
+                          struct pipe_video_rect        *backround_area,*/
+                          struct pipe_video_surface     *src_surface,
+                          enum pipe_mpeg12_picture_type picture_type,
+                          /*unsigned                    num_past_surfaces,
+                          struct pipe_video_surface     *past_surfaces,
+                          unsigned                      num_future_surfaces,
+                          struct pipe_video_surface     *future_surfaces,*/
+                          struct pipe_video_rect        *src_area,
+                          struct pipe_surface           *dst_surface,
+                          struct pipe_video_rect        *dst_area,
+                          /*unsigned                      num_layers,
+                          struct pipe_texture           *layers,
+                          struct pipe_video_rect        *layer_src_areas,
+                          struct pipe_video_rect        *layer_dst_areas,*/
+                          struct pipe_fence_handle      **fence);
+   /*@}*/
+
+   /**
+    * Parameter-like states (or properties)
+    */
+   /*@{*/
+   void (*set_picture_desc)(struct pipe_video_context *vpipe,
+                            const struct pipe_picture_desc *desc);
+
+   void (*set_decode_target)(struct pipe_video_context *vpipe,
+                             struct pipe_video_surface *dt);
+
+   /* TODO: Interface for CSC matrix, scaling modes, post-processing, etc. */
+   /*@}*/
+};
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* PIPE_VIDEO_CONTEXT_H */
diff --git a/src/gallium/include/pipe/p_video_state.h b/src/gallium/include/pipe/p_video_state.h
new file mode 100644
index 0000000000..2a7422bf04
--- /dev/null
+++ b/src/gallium/include/pipe/p_video_state.h
@@ -0,0 +1,157 @@
+#ifndef PIPE_VIDEO_STATE_H
+#define PIPE_VIDEO_STATE_H
+
+/* u_reduce_video_profile() needs these */
+#include <pipe/p_compiler.h>
+#include <util/u_debug.h>
+
+#include <pipe/p_defines.h>
+#include <pipe/p_format.h>
+#include <pipe/p_refcnt.h>
+#include <pipe/p_screen.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct pipe_video_surface
+{
+   struct pipe_reference reference;
+   struct pipe_screen *screen;
+   enum pipe_video_chroma_format chroma_format;
+   /*enum pipe_video_surface_format surface_format;*/
+   unsigned width;
+   unsigned height;
+};
+
+static INLINE void
+pipe_video_surface_reference(struct pipe_video_surface **ptr, struct pipe_video_surface *surf)
+{
+   struct pipe_video_surface *old_surf = *ptr;
+
+   if (pipe_reference((struct pipe_reference **)ptr, &surf->reference))
+      old_surf->screen->video_surface_destroy(old_surf);
+}
+
+struct pipe_video_rect
+{
+   unsigned x, y, w, h;
+};
+
+static INLINE enum pipe_video_codec
+u_reduce_video_profile(enum pipe_video_profile profile)
+{
+   switch (profile)
+   {
+      case PIPE_VIDEO_PROFILE_MPEG1:
+      case PIPE_VIDEO_PROFILE_MPEG2_SIMPLE:
+      case PIPE_VIDEO_PROFILE_MPEG2_MAIN:
+         return PIPE_VIDEO_CODEC_MPEG12;
+
+      case PIPE_VIDEO_PROFILE_MPEG4_SIMPLE:
+      case PIPE_VIDEO_PROFILE_MPEG4_ADVANCED_SIMPLE:
+         return PIPE_VIDEO_CODEC_MPEG4;
+
+      case PIPE_VIDEO_PROFILE_VC1_SIMPLE:
+      case PIPE_VIDEO_PROFILE_VC1_MAIN:
+      case PIPE_VIDEO_PROFILE_VC1_ADVANCED:
+         return PIPE_VIDEO_CODEC_VC1;
+
+      case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE:
+      case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN:
+      case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH:
+         return PIPE_VIDEO_CODEC_MPEG4_AVC;
+
+      default:
+         assert(0);
+         return PIPE_VIDEO_CODEC_UNKNOWN;
+   }
+}
+
+enum pipe_mpeg12_picture_type
+{
+   PIPE_MPEG12_PICTURE_TYPE_FIELD_TOP,
+   PIPE_MPEG12_PICTURE_TYPE_FIELD_BOTTOM,
+   PIPE_MPEG12_PICTURE_TYPE_FRAME
+};
+
+enum pipe_mpeg12_macroblock_type
+{
+   PIPE_MPEG12_MACROBLOCK_TYPE_INTRA,
+   PIPE_MPEG12_MACROBLOCK_TYPE_FWD,
+   PIPE_MPEG12_MACROBLOCK_TYPE_BKWD,
+   PIPE_MPEG12_MACROBLOCK_TYPE_BI,
+	
+   PIPE_MPEG12_MACROBLOCK_NUM_TYPES
+};
+
+enum pipe_mpeg12_motion_type
+{
+   PIPE_MPEG12_MOTION_TYPE_FIELD,
+   PIPE_MPEG12_MOTION_TYPE_FRAME,
+   PIPE_MPEG12_MOTION_TYPE_DUALPRIME,
+   PIPE_MPEG12_MOTION_TYPE_16x8
+};
+
+enum pipe_mpeg12_dct_type
+{
+   PIPE_MPEG12_DCT_TYPE_FIELD,
+   PIPE_MPEG12_DCT_TYPE_FRAME
+};
+
+struct pipe_macroblock
+{
+   enum pipe_video_codec codec;
+};
+
+struct pipe_mpeg12_macroblock
+{
+   struct pipe_macroblock base;
+
+   unsigned mbx;
+   unsigned mby;
+   enum pipe_mpeg12_macroblock_type mb_type;
+   enum pipe_mpeg12_motion_type mo_type;
+   enum pipe_mpeg12_dct_type dct_type;
+   signed pmv[2][2][2];
+   unsigned cbp;
+   void *blocks;
+};
+
+#if 0
+struct pipe_picture_desc
+{
+   enum pipe_video_format format;
+};
+
+struct pipe_mpeg12_picture_desc
+{
+   struct pipe_picture_desc base;
+
+   /* TODO: Use bitfields where possible? */
+   struct pipe_surface *forward_reference;
+   struct pipe_surface *backward_reference;
+   unsigned picture_coding_type;
+   unsigned fcode;
+   unsigned intra_dc_precision;
+   unsigned picture_structure;
+   unsigned top_field_first;
+   unsigned frame_pred_frame_dct;
+   unsigned concealment_motion_vectors;
+   unsigned q_scale_type;
+   unsigned intra_vlc_format;
+   unsigned alternate_scan;
+   unsigned full_pel_forward_vector;
+   unsigned full_pel_backward_vector;
+   struct pipe_buffer *intra_quantizer_matrix;
+   struct pipe_buffer *non_intra_quantizer_matrix;
+   struct pipe_buffer *chroma_intra_quantizer_matrix;
+   struct pipe_buffer *chroma_non_intra_quantizer_matrix;
+};
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* PIPE_VIDEO_STATE_H */
diff --git a/src/gallium/state_trackers/egl/egl_surface.c b/src/gallium/state_trackers/egl/egl_surface.c
index 69e2d6b708..542ac56121 100644
--- a/src/gallium/state_trackers/egl/egl_surface.c
+++ b/src/gallium/state_trackers/egl/egl_surface.c
@@ -152,7 +152,6 @@ drm_takedown_shown_screen(_EGLDisplay *dpy, struct drm_screen *screen)
 
 	pipe_surface_reference(&screen->surface, NULL);
 	pipe_texture_reference(&screen->tex, NULL);
-	pipe_buffer_reference(&screen->buffer, NULL);
 
 	screen->shown = 0;
 }
@@ -250,8 +249,8 @@ drm_show_screen_surface_mesa(_EGLDriver *drv, _EGLDisplay *dpy,
 
 
 	drm_create_texture(dpy, scrn, mode->Width, mode->Height);
-	if (!scrn->buffer)
-		return EGL_FALSE;
+	if (!scrn->tex)
+		goto err_tex;
 
 	ret = drmModeAddFB(dev->drmFD,
 	                   scrn->front.width, scrn->front.height,
@@ -325,8 +324,8 @@ err_fb:
 err_bo:
 	pipe_surface_reference(&scrn->surface, NULL);
 	pipe_texture_reference(&scrn->tex, NULL);
-	pipe_buffer_reference(&scrn->buffer, NULL);
 
+err_tex:
 	return EGL_FALSE;
 }
 
diff --git a/src/gallium/state_trackers/egl/egl_tracker.h b/src/gallium/state_trackers/egl/egl_tracker.h
index dd4730f957..f280748d65 100644
--- a/src/gallium/state_trackers/egl/egl_tracker.h
+++ b/src/gallium/state_trackers/egl/egl_tracker.h
@@ -94,7 +94,6 @@ struct drm_screen
 	 * pipe
 	 */
 
-	struct pipe_buffer *buffer;
 	struct pipe_texture *tex;
 	struct pipe_surface *surface;
 
diff --git a/src/gallium/state_trackers/g3dvl/vl_basic_csc.c b/src/gallium/state_trackers/g3dvl/vl_basic_csc.c
index 20d682de3f..b1683b891b 100644
--- a/src/gallium/state_trackers/g3dvl/vl_basic_csc.c
+++ b/src/gallium/state_trackers/g3dvl/vl_basic_csc.c
@@ -51,6 +51,7 @@ static int vlResizeFrameBuffer
 	struct vlBasicCSC	*basic_csc;
 	struct pipe_context	*pipe;
 	struct pipe_texture	template;
+	float			clear_color[4];
 
 	assert(csc);
 
@@ -68,7 +69,12 @@ static int vlResizeFrameBuffer
 	basic_csc->viewport.translate[1] = 0;
 	basic_csc->viewport.translate[2] = 0;
 	basic_csc->viewport.translate[3] = 0;
-	
+
+	clear_color[0]	= 0.0f;
+	clear_color[1]  = 0.0f;
+	clear_color[2]  = 0.0f;
+	clear_color[3]  = 0.0f;
+
 	if (basic_csc->framebuffer_tex)
 	{
 		pipe_surface_reference(&basic_csc->framebuffer.cbufs[0], NULL);
@@ -98,7 +104,7 @@ static int vlResizeFrameBuffer
 
 	/* Clear to black, in case video doesn't fill the entire window */
 	pipe->set_framebuffer_state(pipe, &basic_csc->framebuffer);
-	pipe->clear(pipe, PIPE_CLEAR_COLOR, 0, 0.0f, 0);
+	pipe->clear(pipe, PIPE_CLEAR_COLOR, clear_color, 0.0f, 0);
 
 	return 0;
 }
@@ -425,7 +431,7 @@ static int vlCreateVertexShader
 	 */
 	for (i = 0; i < 2; ++i)
 	{
-		inst = vl_inst4(TGSI_OPCODE_MADD, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i, TGSI_FILE_CONSTANT, i * 2, TGSI_FILE_CONSTANT, i * 2 + 1);
+		inst = vl_inst4(TGSI_OPCODE_MAD, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i, TGSI_FILE_CONSTANT, i * 2, TGSI_FILE_CONSTANT, i * 2 + 1);
 		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
 	}
 
diff --git a/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf_shaders.inc b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf_shaders.inc
index ef4a4b2add..34d93e1df0 100644
--- a/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf_shaders.inc
+++ b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf_shaders.inc
@@ -615,7 +615,7 @@ static int vlCreateFragmentShaderFieldPMB
 	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
 
 	/* floor t3, t3			; Get rid of fractional part */
-	inst = vl_inst2(TGSI_OPCODE_FLOOR, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3);
+	inst = vl_inst2(TGSI_OPCODE_FLR, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3);
 	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
 
 	/* mul t3, t3, c1.y		; Multiply by 2 */
@@ -632,7 +632,7 @@ static int vlCreateFragmentShaderFieldPMB
 
 	/* TODO: Move to conditional tex fetch on t3 instead of lerp */
 	/* lerp t1, t3, t1, t2		; Choose between top and bottom fields based on Y % 2 */
-	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
+	inst = vl_inst4(TGSI_OPCODE_LRP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
 	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
 
 	/* add o0, t0, t1		; Add ref and differential to form final output */
@@ -969,7 +969,7 @@ static int vlCreateFragmentShaderFrameBMB
 	}
 
 	/* lerp t1, c1.x, t1, t2	; Blend past and future texels */
-	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 1, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
+	inst = vl_inst4(TGSI_OPCODE_LRP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 1, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
 	inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
 	inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
 	inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
@@ -1116,7 +1116,7 @@ static int vlCreateFragmentShaderFieldBMB
 	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
 
 	/* floor t3, t3			; Get rid of fractional part */
-	inst = vl_inst2(TGSI_OPCODE_FLOOR, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3);
+	inst = vl_inst2(TGSI_OPCODE_FLR, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3);
 	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
 
 	/* mul t3, t3, c1.y		; Multiply by 2 */
@@ -1143,7 +1143,7 @@ static int vlCreateFragmentShaderFieldBMB
 
 	/* TODO: Move to conditional tex fetch on t3 instead of lerp */
 	/* lerp t1, t3, t1, t2		; Choose between top and bottom fields based on Y % 2 */
-	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
+	inst = vl_inst4(TGSI_OPCODE_LRP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
 	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
 
 	/*
@@ -1158,11 +1158,11 @@ static int vlCreateFragmentShaderFieldBMB
 
 	/* TODO: Move to conditional tex fetch on t3 instead of lerp */
 	/* lerp t2, t3, t4, t5		; Choose between top and bottom fields based on Y % 2 */
-	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 2, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 5);
+	inst = vl_inst4(TGSI_OPCODE_LRP, TGSI_FILE_TEMPORARY, 2, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 5);
 	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
 
 	/* lerp t1, c1.x, t1, t2	; Blend past and future texels */
-	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 1, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
+	inst = vl_inst4(TGSI_OPCODE_LRP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 1, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
 	inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
 	inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
 	inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
diff --git a/src/gallium/state_trackers/python/st_softpipe_winsys.c b/src/gallium/state_trackers/python/st_softpipe_winsys.c
index f0a4826a00..f0abd12e3d 100644
--- a/src/gallium/state_trackers/python/st_softpipe_winsys.c
+++ b/src/gallium/state_trackers/python/st_softpipe_winsys.c
@@ -172,6 +172,7 @@ st_softpipe_surface_buffer_create(struct pipe_winsys *winsys,
                                   unsigned width, unsigned height,
                                   enum pipe_format format,
                                   unsigned usage,
+                                  unsigned tex_usage,
                                   unsigned *stride)
 {
    const unsigned alignment = 64;
diff --git a/src/gallium/state_trackers/wgl/SConscript b/src/gallium/state_trackers/wgl/SConscript
index 69b88618ec..b05944a33b 100644
--- a/src/gallium/state_trackers/wgl/SConscript
+++ b/src/gallium/state_trackers/wgl/SConscript
@@ -18,20 +18,17 @@ if env['platform'] in ['windows']:
     ])
      
     sources = [
-        'icd/stw_icd.c',
-
-        'wgl/stw_wgl.c',
-
-        'shared/stw_context.c',
-        'shared/stw_device.c',
-        'shared/stw_framebuffer.c',
-        'shared/stw_pixelformat.c',
-        'shared/stw_extensionsstring.c',
-        'shared/stw_extswapinterval.c',
-        'shared/stw_getprocaddress.c',
-        'shared/stw_extgallium.c',
-        'shared/stw_arbpixelformat.c',
-        'shared/stw_tls.c',
+        'stw_context.c',
+        'stw_device.c',
+        'stw_ext_extensionsstring.c',
+        'stw_ext_gallium.c',
+        'stw_ext_pixelformat.c',
+        'stw_ext_swapinterval.c',
+        'stw_framebuffer.c',
+        'stw_getprocaddress.c',
+        'stw_pixelformat.c',
+        'stw_tls.c',
+        'stw_wgl.c',
     ]
 
     wgl = env.ConvenienceLibrary(
diff --git a/src/gallium/state_trackers/wgl/opengl32.def b/src/gallium/state_trackers/wgl/opengl32.def
index 596417ed84..5daa6ddd41 100644
--- a/src/gallium/state_trackers/wgl/opengl32.def
+++ b/src/gallium/state_trackers/wgl/opengl32.def
@@ -376,6 +376,7 @@ EXPORTS
 	DrvDescribePixelFormat
 	DrvGetLayerPaletteEntries
 	DrvGetProcAddress
+	DrvPresentBuffers
 	DrvRealizeLayerPalette
 	DrvReleaseContext
 	DrvSetCallbackProcs
diff --git a/src/gallium/state_trackers/wgl/opengl32.mingw.def b/src/gallium/state_trackers/wgl/opengl32.mingw.def
index 1f03ea3b37..6ebb31a6f1 100644
--- a/src/gallium/state_trackers/wgl/opengl32.mingw.def
+++ b/src/gallium/state_trackers/wgl/opengl32.mingw.def
@@ -375,6 +375,7 @@ EXPORTS
 	DrvDescribePixelFormat = DrvDescribePixelFormat@16
 	DrvGetLayerPaletteEntries = DrvGetLayerPaletteEntries@20
 	DrvGetProcAddress = DrvGetProcAddress@4
+	DrvPresentBuffers = DrvPresentBuffers@8
 	DrvRealizeLayerPalette = DrvRealizeLayerPalette@12
 	DrvReleaseContext = DrvReleaseContext@4
 	DrvSetCallbackProcs = DrvSetCallbackProcs@8
diff --git a/src/gallium/state_trackers/wgl/shared/stw_context.c b/src/gallium/state_trackers/wgl/shared/stw_context.c
deleted file mode 100644
index 4968ecc692..0000000000
--- a/src/gallium/state_trackers/wgl/shared/stw_context.c
+++ /dev/null
@@ -1,382 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#include <windows.h>
-
-#include "main/mtypes.h"
-#include "main/context.h"
-#include "pipe/p_compiler.h"
-#include "pipe/p_context.h"
-#include "state_tracker/st_context.h"
-#include "state_tracker/st_public.h"
-
-#ifdef DEBUG
-#include "trace/tr_screen.h"
-#include "trace/tr_context.h"
-#endif
-
-#include "shared/stw_device.h"
-#include "shared/stw_winsys.h"
-#include "shared/stw_framebuffer.h"
-#include "shared/stw_pixelformat.h"
-#include "stw_public.h"
-#include "stw_context.h"
-#include "stw_tls.h"
-
-
-static INLINE struct stw_context *
-stw_context(GLcontext *glctx)
-{
-   if(!glctx)
-      return NULL;
-   assert(glctx->DriverCtx);
-   return (struct stw_context *)glctx->DriverCtx;
-}
-
-static INLINE struct stw_context *
-stw_current_context(void)
-{
-   /* We must check if multiple threads are being used or GET_CURRENT_CONTEXT 
-    * might return the current context of the thread first seen. */
-   _glapi_check_multithread();
-
-   {
-      GET_CURRENT_CONTEXT( glctx );
-      return stw_context(glctx);
-   }
-}
-
-BOOL
-stw_copy_context(
-   UINT_PTR hglrcSrc,
-   UINT_PTR hglrcDst,
-   UINT mask )
-{
-   struct stw_context *src;
-   struct stw_context *dst;
-   BOOL ret = FALSE;
-
-   pipe_mutex_lock( stw_dev->ctx_mutex );
-   
-   src = stw_lookup_context_locked( hglrcSrc );
-   dst = stw_lookup_context_locked( hglrcDst );
-
-   if (src && dst) { 
-      /* FIXME */
-      assert(0);
-      (void) src;
-      (void) dst;
-      (void) mask;
-   }
-
-   pipe_mutex_unlock( stw_dev->ctx_mutex );
-   
-   return ret;
-}
-
-BOOL
-stw_share_lists(
-   UINT_PTR hglrc1, 
-   UINT_PTR hglrc2 )
-{
-   struct stw_context *ctx1;
-   struct stw_context *ctx2;
-   BOOL ret = FALSE;
-
-   pipe_mutex_lock( stw_dev->ctx_mutex );
-   
-   ctx1 = stw_lookup_context_locked( hglrc1 );
-   ctx2 = stw_lookup_context_locked( hglrc2 );
-
-   if (ctx1 && ctx2 &&
-       ctx1->iPixelFormat == ctx2->iPixelFormat) { 
-      ret = _mesa_share_state(ctx2->st->ctx, ctx1->st->ctx);
-   }
-
-   pipe_mutex_unlock( stw_dev->ctx_mutex );
-   
-   return ret;
-}
-
-static void
-stw_viewport(GLcontext * glctx, GLint x, GLint y,
-             GLsizei width, GLsizei height)
-{
-   struct stw_context *ctx = (struct stw_context *)glctx->DriverCtx;
-   struct stw_framebuffer *fb;
-   
-   fb = stw_framebuffer_from_hdc( ctx->hdc );
-   if(fb) {
-      stw_framebuffer_update(fb);
-      stw_framebuffer_release(fb);
-   }
-}
-
-UINT_PTR
-stw_create_layer_context(
-   HDC hdc,
-   int iLayerPlane )
-{
-   int iPixelFormat;
-   const struct stw_pixelformat_info *pfi;
-   GLvisual visual;
-   struct stw_context *ctx = NULL;
-   struct pipe_screen *screen = NULL;
-   struct pipe_context *pipe = NULL;
-   
-   if(!stw_dev)
-      return 0;
-   
-   if (iLayerPlane != 0)
-      return 0;
-
-   iPixelFormat = GetPixelFormat(hdc);
-   if(!iPixelFormat)
-      return 0;
-   
-   pfi = stw_pixelformat_get_info( iPixelFormat - 1 );
-   stw_pixelformat_visual(&visual, pfi);
-   
-   ctx = CALLOC_STRUCT( stw_context );
-   if (ctx == NULL)
-      goto no_ctx;
-
-   ctx->hdc = hdc;
-   ctx->iPixelFormat = iPixelFormat;
-
-   screen = stw_dev->screen;
-
-#ifdef DEBUG
-   /* Unwrap screen */
-   if(stw_dev->trace_running)
-      screen = trace_screen(screen)->screen;
-#endif
-
-   pipe = stw_dev->stw_winsys->create_context( screen );
-   if (pipe == NULL) 
-      goto no_pipe;
-
-#ifdef DEBUG
-   /* Wrap context */
-   if(stw_dev->trace_running)
-      pipe = trace_context_create(stw_dev->screen, pipe);
-#endif
-
-   /* pass to stw_flush_frontbuffer as context_private */
-   assert(!pipe->priv);
-   pipe->priv = hdc;
-
-   ctx->st = st_create_context( pipe, &visual, NULL );
-   if (ctx->st == NULL) 
-      goto no_st_ctx;
-
-   ctx->st->ctx->DriverCtx = ctx;
-   ctx->st->ctx->Driver.Viewport = stw_viewport;
-
-   pipe_mutex_lock( stw_dev->ctx_mutex );
-   ctx->hglrc = handle_table_add(stw_dev->ctx_table, ctx);
-   pipe_mutex_unlock( stw_dev->ctx_mutex );
-   if (!ctx->hglrc)
-      goto no_hglrc;
-
-   return ctx->hglrc;
-
-no_hglrc:
-   st_destroy_context(ctx->st);
-   goto no_pipe; /* st_context_destroy already destroys pipe */
-no_st_ctx:
-   pipe->destroy( pipe );
-no_pipe:
-   FREE(ctx);
-no_ctx:
-   return 0;
-}
-
-BOOL
-stw_delete_context(
-   UINT_PTR hglrc )
-{
-   struct stw_context *ctx ;
-   BOOL ret = FALSE;
-   
-   if (!stw_dev)
-      return FALSE;
-
-   pipe_mutex_lock( stw_dev->ctx_mutex );
-   ctx = stw_lookup_context_locked(hglrc);
-   handle_table_remove(stw_dev->ctx_table, hglrc);
-   pipe_mutex_unlock( stw_dev->ctx_mutex );
-
-   if (ctx) {
-      struct stw_context *curctx = stw_current_context();
-      
-      /* Unbind current if deleting current context. */
-      if (curctx == ctx)
-         st_make_current( NULL, NULL, NULL );
-
-      st_destroy_context(ctx->st);
-      FREE(ctx);
-
-      ret = TRUE;
-   }
-
-   return ret;
-}
-
-BOOL
-stw_release_context(
-   UINT_PTR hglrc )
-{
-   struct stw_context *ctx;
-
-   if (!stw_dev)
-      return FALSE;
-
-   pipe_mutex_lock( stw_dev->ctx_mutex );
-   ctx = stw_lookup_context_locked( hglrc );
-   pipe_mutex_unlock( stw_dev->ctx_mutex );
-
-   if (!ctx)
-      return FALSE;
-   
-   /* The expectation is that ctx is the same context which is
-    * current for this thread.  We should check that and return False
-    * if not the case.
-    */
-   if (ctx != stw_current_context())
-      return FALSE;
-
-   if (stw_make_current( NULL, 0 ) == FALSE)
-      return FALSE;
-
-   return TRUE;
-}
-
-
-UINT_PTR
-stw_get_current_context( void )
-{
-   struct stw_context *ctx;
-
-   ctx = stw_current_context();
-   if(!ctx)
-      return 0;
-   
-   return ctx->hglrc;
-}
-
-HDC
-stw_get_current_dc( void )
-{
-   struct stw_context *ctx;
-
-   ctx = stw_current_context();
-   if(!ctx)
-      return NULL;
-   
-   return ctx->hdc;
-}
-
-BOOL
-stw_make_current(
-   HDC hdc,
-   UINT_PTR hglrc )
-{
-   struct stw_context *curctx = NULL;
-   struct stw_context *ctx = NULL;
-   struct stw_framebuffer *fb = NULL;
-
-   if (!stw_dev)
-      goto fail;
-
-   curctx = stw_current_context();
-   if (curctx != NULL) {
-      if (curctx->hglrc != hglrc)
-	 st_flush(curctx->st, PIPE_FLUSH_RENDER_CACHE, NULL);
-      
-      /* Return if already current. */
-      if (curctx->hglrc == hglrc && curctx->hdc == hdc) {
-         ctx = curctx;
-         fb = stw_framebuffer_from_hdc( hdc );
-         goto success;
-      }
-   }
-
-   if (hdc == NULL || hglrc == 0) {
-      return st_make_current( NULL, NULL, NULL );
-   }
-
-   pipe_mutex_lock( stw_dev->ctx_mutex ); 
-   ctx = stw_lookup_context_locked( hglrc );
-   pipe_mutex_unlock( stw_dev->ctx_mutex ); 
-   if(!ctx)
-      goto fail;
-
-   fb = stw_framebuffer_from_hdc( hdc );
-   if(!fb) { 
-      /* Applications should call SetPixelFormat before creating a context,
-       * but not all do, and the opengl32 runtime seems to use a default pixel
-       * format in some cases, so we must create a framebuffer for those here
-       */
-      int iPixelFormat = GetPixelFormat(hdc);
-      if(iPixelFormat)
-         fb = stw_framebuffer_create( hdc, iPixelFormat );
-      if(!fb) 
-         goto fail;
-   }
-   
-   if(fb->iPixelFormat != ctx->iPixelFormat)
-      goto fail;
-
-   /* Lazy allocation of the frame buffer */
-   if(!stw_framebuffer_allocate(fb))
-      goto fail;
-
-   /* Bind the new framebuffer */
-   ctx->hdc = hdc;
-   
-   /* pass to stw_flush_frontbuffer as context_private */
-   ctx->st->pipe->priv = hdc;
-   
-   if(!st_make_current( ctx->st, fb->stfb, fb->stfb ))
-      goto fail;
-
-success:
-   assert(fb);
-   if(fb) {
-      stw_framebuffer_update(fb);
-      stw_framebuffer_release(fb);
-   }
-   
-   return TRUE;
-
-fail:
-   if(fb)
-      stw_framebuffer_release(fb);
-   st_make_current( NULL, NULL, NULL );
-   return FALSE;
-}
diff --git a/src/gallium/state_trackers/wgl/shared/stw_public.h b/src/gallium/state_trackers/wgl/shared/stw_public.h
deleted file mode 100644
index 7fe9cfb356..0000000000
--- a/src/gallium/state_trackers/wgl/shared/stw_public.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#ifndef STW_PUBLIC_H
-#define STW_PUBLIC_H
-
-#include <windows.h>
-
-BOOL stw_copy_context( UINT_PTR hglrcSrc,
-                       UINT_PTR hglrcDst,
-                       UINT mask );
-
-UINT_PTR stw_create_layer_context( HDC hdc, 
-                                   int iLayerPlane );
-
-BOOL stw_share_lists( UINT_PTR hglrc1, UINT_PTR hglrc2 );
-
-BOOL stw_delete_context( UINT_PTR hglrc );
-
-BOOL
-stw_release_context( UINT_PTR dhglrc );
-
-UINT_PTR stw_get_current_context( void );
-
-HDC stw_get_current_dc( void );
-
-BOOL stw_make_current( HDC hdc, UINT_PTR hglrc );
-
-BOOL stw_swap_buffers( HDC hdc );
-
-BOOL
-stw_swap_layer_buffers( HDC hdc, UINT fuPlanes );
-
-PROC stw_get_proc_address( LPCSTR lpszProc );
-
-int stw_pixelformat_describe( HDC hdc,
-                              int iPixelFormat,
-                              UINT nBytes,
-                              LPPIXELFORMATDESCRIPTOR ppfd );
-
-int stw_pixelformat_get( HDC hdc );
-
-BOOL stw_pixelformat_set( HDC hdc,
-                          int iPixelFormat );
-
-int stw_pixelformat_choose( HDC hdc,
-                            CONST PIXELFORMATDESCRIPTOR *ppfd );
-
-#endif
diff --git a/src/gallium/state_trackers/wgl/icd/stw_icd.c b/src/gallium/state_trackers/wgl/stw_context.c
index 347f40aa06..f2f0264844 100644
--- a/src/gallium/state_trackers/wgl/icd/stw_icd.c
+++ b/src/gallium/state_trackers/wgl/stw_context.c
@@ -26,18 +26,49 @@
  **************************************************************************/
 
 #include <windows.h>
-#include <stdio.h>
 
-#include "GL/gl.h"
-
-#include "util/u_debug.h"
-#include "pipe/p_thread.h"
-
-#include "shared/stw_public.h"
-#include "icd/stw_icd.h"
+#include "main/mtypes.h"
+#include "main/context.h"
+#include "pipe/p_compiler.h"
+#include "pipe/p_context.h"
+#include "state_tracker/st_context.h"
+#include "state_tracker/st_public.h"
+
+#ifdef DEBUG
+#include "trace/tr_screen.h"
+#include "trace/tr_context.h"
+#endif
+
+#include "stw_icd.h"
+#include "stw_device.h"
+#include "stw_winsys.h"
+#include "stw_framebuffer.h"
+#include "stw_pixelformat.h"
+#include "stw_context.h"
+#include "stw_tls.h"
+
+
+static INLINE struct stw_context *
+stw_context(GLcontext *glctx)
+{
+   if(!glctx)
+      return NULL;
+   assert(glctx->DriverCtx);
+   return (struct stw_context *)glctx->DriverCtx;
+}
 
-#define DBG 0
+static INLINE struct stw_context *
+stw_current_context(void)
+{
+   /* We must check if multiple threads are being used or GET_CURRENT_CONTEXT 
+    * might return the current context of the thread first seen. */
+   _glapi_check_multithread();
 
+   {
+      GET_CURRENT_CONTEXT( glctx );
+      return stw_context(glctx);
+   }
+}
 
 BOOL APIENTRY
 DrvCopyContext(
@@ -45,24 +76,64 @@ DrvCopyContext(
    DHGLRC dhrcDest,
    UINT fuMask )
 {
-   return stw_copy_context(dhrcSource, dhrcDest, fuMask);
-}
+   struct stw_context *src;
+   struct stw_context *dst;
+   BOOL ret = FALSE;
 
+   pipe_mutex_lock( stw_dev->ctx_mutex );
+   
+   src = stw_lookup_context_locked( dhrcSource );
+   dst = stw_lookup_context_locked( dhrcDest );
+
+   if (src && dst) { 
+      /* FIXME */
+      assert(0);
+      (void) src;
+      (void) dst;
+      (void) fuMask;
+   }
 
-DHGLRC APIENTRY
-DrvCreateLayerContext(
-   HDC hdc,
-   INT iLayerPlane )
+   pipe_mutex_unlock( stw_dev->ctx_mutex );
+   
+   return ret;
+}
+
+BOOL APIENTRY
+DrvShareLists(
+   DHGLRC dhglrc1,
+   DHGLRC dhglrc2 )
 {
-   DHGLRC r;
+   struct stw_context *ctx1;
+   struct stw_context *ctx2;
+   BOOL ret = FALSE;
+
+   pipe_mutex_lock( stw_dev->ctx_mutex );
    
-   r = stw_create_layer_context( hdc, iLayerPlane );
+   ctx1 = stw_lookup_context_locked( dhglrc1 );
+   ctx2 = stw_lookup_context_locked( dhglrc2 );
+
+   if (ctx1 && ctx2 &&
+       ctx1->iPixelFormat == ctx2->iPixelFormat) { 
+      ret = _mesa_share_state(ctx2->st->ctx, ctx1->st->ctx);
+   }
+
+   pipe_mutex_unlock( stw_dev->ctx_mutex );
    
-   if (DBG)
-      debug_printf( "%s( %p, %i ) = %u\n",
-                    __FUNCTION__, hdc, iLayerPlane, r );
+   return ret;
+}
+
+static void
+stw_viewport(GLcontext * glctx, GLint x, GLint y,
+             GLsizei width, GLsizei height)
+{
+   struct stw_context *ctx = (struct stw_context *)glctx->DriverCtx;
+   struct stw_framebuffer *fb;
    
-   return r;
+   fb = stw_framebuffer_from_hdc( ctx->hdc );
+   if(fb) {
+      stw_framebuffer_update(fb);
+      stw_framebuffer_release(fb);
+   }
 }
 
 DHGLRC APIENTRY
@@ -72,114 +143,253 @@ DrvCreateContext(
    return DrvCreateLayerContext( hdc, 0 );
 }
 
-BOOL APIENTRY
-DrvDeleteContext(
-   DHGLRC dhglrc )
+DHGLRC APIENTRY
+DrvCreateLayerContext(
+   HDC hdc,
+   INT iLayerPlane )
 {
-   BOOL r;
+   int iPixelFormat;
+   const struct stw_pixelformat_info *pfi;
+   GLvisual visual;
+   struct stw_context *ctx = NULL;
+   struct pipe_screen *screen = NULL;
+   struct pipe_context *pipe = NULL;
    
-   r = stw_delete_context( dhglrc );
+   if(!stw_dev)
+      return 0;
    
-   if (DBG)
-      debug_printf( "%s( %u ) = %u\n",
-                    __FUNCTION__, dhglrc, r );
+   if (iLayerPlane != 0)
+      return 0;
+
+   iPixelFormat = GetPixelFormat(hdc);
+   if(!iPixelFormat)
+      return 0;
    
-   return r;
+   pfi = stw_pixelformat_get_info( iPixelFormat - 1 );
+   stw_pixelformat_visual(&visual, pfi);
+   
+   ctx = CALLOC_STRUCT( stw_context );
+   if (ctx == NULL)
+      goto no_ctx;
+
+   ctx->hdc = hdc;
+   ctx->iPixelFormat = iPixelFormat;
+
+   screen = stw_dev->screen;
+
+#ifdef DEBUG
+   /* Unwrap screen */
+   if(stw_dev->trace_running)
+      screen = trace_screen(screen)->screen;
+#endif
+
+   pipe = stw_dev->stw_winsys->create_context( screen );
+   if (pipe == NULL) 
+      goto no_pipe;
+
+#ifdef DEBUG
+   /* Wrap context */
+   if(stw_dev->trace_running)
+      pipe = trace_context_create(stw_dev->screen, pipe);
+#endif
+
+   /* pass to stw_flush_frontbuffer as context_private */
+   assert(!pipe->priv);
+   pipe->priv = hdc;
+
+   ctx->st = st_create_context( pipe, &visual, NULL );
+   if (ctx->st == NULL) 
+      goto no_st_ctx;
+
+   ctx->st->ctx->DriverCtx = ctx;
+   ctx->st->ctx->Driver.Viewport = stw_viewport;
+
+   pipe_mutex_lock( stw_dev->ctx_mutex );
+   ctx->dhglrc = handle_table_add(stw_dev->ctx_table, ctx);
+   pipe_mutex_unlock( stw_dev->ctx_mutex );
+   if (!ctx->dhglrc)
+      goto no_hglrc;
+
+   return ctx->dhglrc;
+
+no_hglrc:
+   st_destroy_context(ctx->st);
+   goto no_pipe; /* st_context_destroy already destroys pipe */
+no_st_ctx:
+   pipe->destroy( pipe );
+no_pipe:
+   FREE(ctx);
+no_ctx:
+   return 0;
 }
 
 BOOL APIENTRY
-DrvDescribeLayerPlane(
-   HDC hdc,
-   INT iPixelFormat,
-   INT iLayerPlane,
-   UINT nBytes,
-   LPLAYERPLANEDESCRIPTOR plpd )
+DrvDeleteContext(
+   DHGLRC dhglrc )
 {
-   if (DBG) 
-      debug_printf( "%s\n", __FUNCTION__ );
+   struct stw_context *ctx ;
+   BOOL ret = FALSE;
+   
+   if (!stw_dev)
+      return FALSE;
 
-   return FALSE;
-}
+   pipe_mutex_lock( stw_dev->ctx_mutex );
+   ctx = stw_lookup_context_locked(dhglrc);
+   handle_table_remove(stw_dev->ctx_table, dhglrc);
+   pipe_mutex_unlock( stw_dev->ctx_mutex );
 
-LONG APIENTRY
-DrvDescribePixelFormat(
-   HDC hdc,
-   INT iPixelFormat,
-   ULONG cjpfd,
-   PIXELFORMATDESCRIPTOR *ppfd )
-{
-   LONG r;
+   if (ctx) {
+      struct stw_context *curctx = stw_current_context();
+      
+      /* Unbind current if deleting current context. */
+      if (curctx == ctx)
+         st_make_current( NULL, NULL, NULL );
 
-   r = stw_pixelformat_describe( hdc, iPixelFormat, cjpfd, ppfd );
+      st_destroy_context(ctx->st);
+      FREE(ctx);
 
-   if (DBG)
-      debug_printf( "%s( %p, %d, %u, %p ) = %d\n",
-                    __FUNCTION__, hdc, iPixelFormat, cjpfd, ppfd, r );
+      ret = TRUE;
+   }
 
-   return r;
+   return ret;
 }
 
-int APIENTRY
-DrvGetLayerPaletteEntries(
-   HDC hdc,
-   INT iLayerPlane,
-   INT iStart,
-   INT cEntries,
-   COLORREF *pcr )
+BOOL APIENTRY
+DrvReleaseContext(
+   DHGLRC dhglrc )
 {
-   if (DBG)
-      debug_printf( "%s\n", __FUNCTION__ );
+   struct stw_context *ctx;
 
-   return 0;
-}
+   if (!stw_dev)
+      return FALSE;
 
-PROC APIENTRY
-DrvGetProcAddress(
-   LPCSTR lpszProc )
-{
-   PROC r;
+   pipe_mutex_lock( stw_dev->ctx_mutex );
+   ctx = stw_lookup_context_locked( dhglrc );
+   pipe_mutex_unlock( stw_dev->ctx_mutex );
 
-   r = stw_get_proc_address( lpszProc );
+   if (!ctx)
+      return FALSE;
+   
+   /* The expectation is that ctx is the same context which is
+    * current for this thread.  We should check that and return False
+    * if not the case.
+    */
+   if (ctx != stw_current_context())
+      return FALSE;
 
-   if (DBG)
-      debug_printf( "%s( \"%s\" ) = %p\n", __FUNCTION__, lpszProc, r );
+   if (stw_make_current( NULL, 0 ) == FALSE)
+      return FALSE;
 
-   return r;
+   return TRUE;
 }
 
-BOOL APIENTRY
-DrvRealizeLayerPalette(
-   HDC hdc,
-   INT iLayerPlane,
-   BOOL bRealize )
+
+DHGLRC
+stw_get_current_context( void )
 {
-   if (DBG)
-      debug_printf( "%s\n", __FUNCTION__ );
+   struct stw_context *ctx;
 
-   return FALSE;
+   ctx = stw_current_context();
+   if(!ctx)
+      return 0;
+   
+   return ctx->dhglrc;
 }
 
-BOOL APIENTRY
-DrvReleaseContext(
-   DHGLRC dhglrc )
+HDC
+stw_get_current_dc( void )
 {
-   return stw_release_context(dhglrc);
+   struct stw_context *ctx;
+
+   ctx = stw_current_context();
+   if(!ctx)
+      return NULL;
+   
+   return ctx->hdc;
 }
 
-void APIENTRY
-DrvSetCallbackProcs(
-   INT nProcs,
-   PROC *pProcs )
+BOOL
+stw_make_current(
+   HDC hdc,
+   DHGLRC dhglrc )
 {
-   if (DBG)
-      debug_printf( "%s( %d, %p )\n", __FUNCTION__, nProcs, pProcs );
+   struct stw_context *curctx = NULL;
+   struct stw_context *ctx = NULL;
+   struct stw_framebuffer *fb = NULL;
 
-   return;
-}
+   if (!stw_dev)
+      goto fail;
+
+   curctx = stw_current_context();
+   if (curctx != NULL) {
+      if (curctx->dhglrc != dhglrc)
+	 st_flush(curctx->st, PIPE_FLUSH_RENDER_CACHE, NULL);
+      
+      /* Return if already current. */
+      if (curctx->dhglrc == dhglrc && curctx->hdc == hdc) {
+         ctx = curctx;
+         fb = stw_framebuffer_from_hdc( hdc );
+         goto success;
+      }
+   }
+
+   if (hdc == NULL || dhglrc == 0) {
+      return st_make_current( NULL, NULL, NULL );
+   }
+
+   pipe_mutex_lock( stw_dev->ctx_mutex ); 
+   ctx = stw_lookup_context_locked( dhglrc );
+   pipe_mutex_unlock( stw_dev->ctx_mutex ); 
+   if(!ctx)
+      goto fail;
+
+   fb = stw_framebuffer_from_hdc( hdc );
+   if(!fb) { 
+      /* Applications should call SetPixelFormat before creating a context,
+       * but not all do, and the opengl32 runtime seems to use a default pixel
+       * format in some cases, so we must create a framebuffer for those here
+       */
+      int iPixelFormat = GetPixelFormat(hdc);
+      if(iPixelFormat)
+         fb = stw_framebuffer_create( hdc, iPixelFormat );
+      if(!fb) 
+         goto fail;
+   }
+   
+   if(fb->iPixelFormat != ctx->iPixelFormat)
+      goto fail;
+
+   /* Lazy allocation of the frame buffer */
+   if(!stw_framebuffer_allocate(fb))
+      goto fail;
+
+   /* Bind the new framebuffer */
+   ctx->hdc = hdc;
+   
+   /* pass to stw_flush_frontbuffer as context_private */
+   ctx->st->pipe->priv = hdc;
+   
+   if(!st_make_current( ctx->st, fb->stfb, fb->stfb ))
+      goto fail;
+
+success:
+   assert(fb);
+   if(fb) {
+      stw_framebuffer_update(fb);
+      stw_framebuffer_release(fb);
+   }
+   
+   return TRUE;
 
+fail:
+   if(fb)
+      stw_framebuffer_release(fb);
+   st_make_current( NULL, NULL, NULL );
+   return FALSE;
+}
 
 /**
- * Although WGL allows different dispatch entrypoints per context 
+ * Although WGL allows different dispatch entrypoints per context
  */
 static const GLCLTPROCTABLE cpt =
 {
@@ -524,7 +734,6 @@ static const GLCLTPROCTABLE cpt =
    }
 };
 
-
 PGLCLTPROCTABLE APIENTRY
 DrvSetContext(
    HDC hdc,
@@ -532,86 +741,9 @@ DrvSetContext(
    PFN_SETPROCTABLE pfnSetProcTable )
 {
    PGLCLTPROCTABLE r = (PGLCLTPROCTABLE)&cpt;
-   
+
    if (!stw_make_current( hdc, dhglrc ))
       r = NULL;
-      
-   if (DBG)
-      debug_printf( "%s( 0x%p, %u, 0x%p ) = %p\n", 
-                    __FUNCTION__, hdc, dhglrc, pfnSetProcTable, r );
-
-   return r;
-}
-
-int APIENTRY
-DrvSetLayerPaletteEntries(
-   HDC hdc,
-   INT iLayerPlane,
-   INT iStart,
-   INT cEntries,
-   CONST COLORREF *pcr )
-{
-   if (DBG)
-      debug_printf( "%s\n", __FUNCTION__ );
-
-   return 0;
-}
-
-BOOL APIENTRY
-DrvSetPixelFormat(
-   HDC hdc,
-   LONG iPixelFormat )
-{
-   BOOL r;
-
-   r = stw_pixelformat_set( hdc, iPixelFormat );
-
-   if (DBG)
-      debug_printf( "%s( %p, %d ) = %s\n", __FUNCTION__, hdc, iPixelFormat, r ? "TRUE" : "FALSE" );
 
    return r;
 }
-
-BOOL APIENTRY
-DrvShareLists(
-   DHGLRC dhglrc1,
-   DHGLRC dhglrc2 )
-{
-   if (DBG)
-      debug_printf( "%s\n", __FUNCTION__ );
-
-   return stw_share_lists(dhglrc1, dhglrc2);
-}
-
-BOOL APIENTRY
-DrvSwapBuffers(
-   HDC hdc )
-{
-   if (DBG)
-      debug_printf( "%s( %p )\n", __FUNCTION__, hdc );
-
-   return stw_swap_buffers( hdc );
-}
-
-BOOL APIENTRY
-DrvSwapLayerBuffers(
-   HDC hdc,
-   UINT fuPlanes )
-{
-   if (DBG)
-      debug_printf( "%s\n", __FUNCTION__ );
-
-   return stw_swap_layer_buffers( hdc, fuPlanes );
-}
-
-BOOL APIENTRY
-DrvValidateVersion(
-   ULONG ulVersion )
-{
-   if (DBG)
-      debug_printf( "%s( %u )\n", __FUNCTION__, ulVersion );
-
-   /* TODO: get the expected version from the winsys */
-   
-   return ulVersion == 1;
-}
diff --git a/src/gallium/state_trackers/wgl/shared/stw_context.h b/src/gallium/state_trackers/wgl/stw_context.h
index 166471de5e..256c27e21e 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_context.h
+++ b/src/gallium/state_trackers/wgl/stw_context.h
@@ -35,9 +35,15 @@ struct st_context;
 struct stw_context
 {
    struct st_context *st;
-   UINT_PTR hglrc;
+   DHGLRC dhglrc;
    int iPixelFormat;
    HDC hdc;
 };
 
+DHGLRC stw_get_current_context( void );
+
+HDC stw_get_current_dc( void );
+
+BOOL stw_make_current( HDC hdc, DHGLRC dhglrc );
+
 #endif /* STW_CONTEXT_H */
diff --git a/src/gallium/state_trackers/wgl/shared/stw_device.c b/src/gallium/state_trackers/wgl/stw_device.c
index 0b6954915a..985b8f0456 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_device.c
+++ b/src/gallium/state_trackers/wgl/stw_device.c
@@ -29,6 +29,7 @@
 
 #include "glapi/glthread.h"
 #include "util/u_debug.h"
+#include "util/u_math.h"
 #include "pipe/p_screen.h"
 #include "state_tracker/st_public.h"
 
@@ -37,12 +38,12 @@
 #include "trace/tr_texture.h"
 #endif
 
-#include "shared/stw_device.h"
-#include "shared/stw_winsys.h"
-#include "shared/stw_pixelformat.h"
-#include "shared/stw_public.h"
-#include "shared/stw_tls.h"
-#include "shared/stw_framebuffer.h"
+#include "stw_device.h"
+#include "stw_winsys.h"
+#include "stw_pixelformat.h"
+#include "stw_icd.h"
+#include "stw_tls.h"
+#include "stw_framebuffer.h"
 
 #ifdef WIN32_THREADS
 extern _glthread_Mutex OneTimeLock;
@@ -62,38 +63,28 @@ stw_flush_frontbuffer(struct pipe_screen *screen,
                      struct pipe_surface *surface,
                      void *context_private )
 {
-   const struct stw_winsys *stw_winsys = stw_dev->stw_winsys;
    HDC hdc = (HDC)context_private;
    struct stw_framebuffer *fb;
    
    fb = stw_framebuffer_from_hdc( hdc );
-   /* fb can be NULL if window was destroyed already */
-   if (fb) {
+   if (!fb) {
+      /* fb can be NULL if window was destroyed already */
+      return;
+   }
+
 #if DEBUG
-      {
-         struct pipe_surface *surface2;
-   
-         if(!st_get_framebuffer_surface( fb->stfb, ST_SURFACE_FRONT_LEFT, &surface2 ))
-            assert(0);
-         else
-            assert(surface2 == surface);
-      }
-#endif
+   {
+      /* ensure that a random surface was not passed to us */
+      struct pipe_surface *surface2;
 
-#ifdef DEBUG
-      if(stw_dev->trace_running) {
-         screen = trace_screen(screen)->screen;
-         surface = trace_surface(surface)->surface;
-      }
-#endif
-   }
-   
-   stw_winsys->flush_frontbuffer(screen, surface, hdc);
-   
-   if(fb) {
-      stw_framebuffer_update(fb);
-      stw_framebuffer_release(fb);
+      if(!st_get_framebuffer_surface( fb->stfb, ST_SURFACE_FRONT_LEFT, &surface2 ))
+         assert(0);
+      else
+         assert(surface2 == surface);
    }
+#endif
+
+   stw_framebuffer_present_locked(hdc, fb, ST_SURFACE_FRONT_LEFT);
 }
 
 
@@ -126,6 +117,9 @@ stw_init(const struct stw_winsys *stw_winsys)
    if(!screen)
       goto error1;
 
+   if(stw_winsys->get_adapter_luid)
+      stw_winsys->get_adapter_luid(screen, &stw_dev->AdapterLuid);
+
 #ifdef DEBUG
    stw_dev->screen = trace_screen_create(screen);
    stw_dev->trace_running = stw_dev->screen != screen ? TRUE : FALSE;
@@ -182,7 +176,7 @@ stw_cleanup(void)
       /* Ensure all contexts are destroyed */
       i = handle_table_get_first_handle(stw_dev->ctx_table);
       while (i) {
-         stw_delete_context(i);
+         DrvDeleteContext(i);
          i = handle_table_get_next_handle(stw_dev->ctx_table, i);
       }
       handle_table_destroy(stw_dev->ctx_table);
@@ -212,7 +206,7 @@ stw_cleanup(void)
 
 
 struct stw_context *
-stw_lookup_context_locked( UINT_PTR dhglrc )
+stw_lookup_context_locked( DHGLRC dhglrc )
 {
    if (dhglrc == 0)
       return NULL;
@@ -223,3 +217,28 @@ stw_lookup_context_locked( UINT_PTR dhglrc )
    return (struct stw_context *) handle_table_get(stw_dev->ctx_table, dhglrc);
 }
 
+
+void APIENTRY
+DrvSetCallbackProcs(
+   INT nProcs,
+   PROC *pProcs )
+{
+   size_t size;
+
+   if (stw_dev == NULL)
+      return;
+
+   size = MIN2(nProcs * sizeof *pProcs, sizeof stw_dev->callbacks);
+   memcpy(&stw_dev->callbacks, pProcs, size);
+
+   return;
+}
+
+
+BOOL APIENTRY
+DrvValidateVersion(
+   ULONG ulVersion )
+{
+   /* TODO: get the expected version from the winsys */
+   return ulVersion == 1;
+}
diff --git a/src/gallium/state_trackers/wgl/shared/stw_device.h b/src/gallium/state_trackers/wgl/stw_device.h
index e1bb9518dd..0bf3b0da82 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_device.h
+++ b/src/gallium/state_trackers/wgl/stw_device.h
@@ -29,11 +29,10 @@
 #define STW_DEVICE_H_
 
 
-#include <windows.h>
-
 #include "pipe/p_compiler.h"
 #include "pipe/p_thread.h"
 #include "util/u_handle_table.h"
+#include "stw_icd.h"
 #include "stw_pixelformat.h"
 
 
@@ -53,10 +52,14 @@ struct stw_device
    boolean trace_running;
 #endif
 
+   LUID AdapterLuid;
+
    struct stw_pixelformat_info pixelformats[STW_MAX_PIXELFORMATS];
    unsigned pixelformat_count;
    unsigned pixelformat_extended_count;
 
+   GLCALLBACKTABLE callbacks;
+
    pipe_mutex ctx_mutex;
    struct handle_table *ctx_table;
    
@@ -69,7 +72,7 @@ struct stw_device
 };
 
 struct stw_context *
-stw_lookup_context_locked( UINT_PTR hglrc );
+stw_lookup_context_locked( DHGLRC hglrc );
 
 extern struct stw_device *stw_dev;
 
diff --git a/src/gallium/state_trackers/wgl/shared/stw_extensionsstring.c b/src/gallium/state_trackers/wgl/stw_ext_extensionsstring.c
index 62c859e1f9..62c859e1f9 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_extensionsstring.c
+++ b/src/gallium/state_trackers/wgl/stw_ext_extensionsstring.c
diff --git a/src/gallium/state_trackers/wgl/shared/stw_extgallium.c b/src/gallium/state_trackers/wgl/stw_ext_gallium.c
index fc22737d7e..fb30ec5dba 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_extgallium.c
+++ b/src/gallium/state_trackers/wgl/stw_ext_gallium.c
@@ -27,9 +27,9 @@
 
 
 #include "pipe/p_screen.h"
-#include "stw_public.h"
 #include "stw_device.h"
 #include "stw_winsys.h"
+#include "stw_ext_gallium.h"
 
 #ifdef DEBUG
 #include "trace/tr_screen.h"
diff --git a/src/gallium/state_trackers/wgl/shared/stw_extgallium.h b/src/gallium/state_trackers/wgl/stw_ext_gallium.h
index cc35f2bb7f..cc35f2bb7f 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_extgallium.h
+++ b/src/gallium/state_trackers/wgl/stw_ext_gallium.h
diff --git a/src/gallium/state_trackers/wgl/shared/stw_arbpixelformat.c b/src/gallium/state_trackers/wgl/stw_ext_pixelformat.c
index 0e2d407699..8a9995aba8 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_arbpixelformat.c
+++ b/src/gallium/state_trackers/wgl/stw_ext_pixelformat.c
@@ -43,7 +43,6 @@
 
 #include "pipe/p_compiler.h"
 #include "util/u_memory.h"
-#include "stw_public.h"
 #include "stw_pixelformat.h"
 
 
diff --git a/src/gallium/state_trackers/wgl/shared/stw_extswapinterval.c b/src/gallium/state_trackers/wgl/stw_ext_swapinterval.c
index 9eac6a1d09..9eac6a1d09 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_extswapinterval.c
+++ b/src/gallium/state_trackers/wgl/stw_ext_swapinterval.c
diff --git a/src/gallium/state_trackers/wgl/shared/stw_framebuffer.c b/src/gallium/state_trackers/wgl/stw_framebuffer.c
index b8956bb550..8a3e11b6b4 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_framebuffer.c
+++ b/src/gallium/state_trackers/wgl/stw_framebuffer.c
@@ -1,8 +1,8 @@
 /**************************************************************************
- * 
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ *
+ * Copyright 2008-2009 Vmware, Inc.
  * All Rights Reserved.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -10,19 +10,19 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
  **************************************************************************/
 
 #include <windows.h>
@@ -38,9 +38,9 @@
 #include "trace/tr_texture.h"
 #endif
 
+#include "stw_icd.h"
 #include "stw_framebuffer.h"
 #include "stw_device.h"
-#include "stw_public.h"
 #include "stw_winsys.h"
 #include "stw_tls.h"
 
@@ -83,6 +83,9 @@ stw_framebuffer_destroy_locked(
    *link = fb->next;
    fb->next = NULL;
 
+   if(fb->shared_surface)
+      stw_dev->stw_winsys->shared_surface_close(stw_dev->screen, fb->shared_surface);
+
    st_unreference_framebuffer(fb->stfb);
    
    pipe_mutex_unlock( fb->mutex );
@@ -106,13 +109,18 @@ static INLINE void
 stw_framebuffer_get_size( struct stw_framebuffer *fb )
 {
    unsigned width, height;
-   RECT rect;
+   RECT client_rect;
+   RECT window_rect;
+   POINT client_pos;
 
    assert(fb->hWnd);
    
-   GetClientRect( fb->hWnd, &rect );
-   width = rect.right - rect.left;
-   height = rect.bottom - rect.top;
+   /* Get the client area size. */
+   GetClientRect( fb->hWnd, &client_rect );
+   assert(client_rect.left == 0);
+   assert(client_rect.top == 0);
+   width = client_rect.right - client_rect.left;
+   height = client_rect.bottom - client_rect.top;
 
    if(width < 1)
       width = 1;
@@ -124,6 +132,31 @@ stw_framebuffer_get_size( struct stw_framebuffer *fb )
       fb->width = width; 
       fb->height = height; 
    }
+
+   client_pos.x = 0;
+   client_pos.y = 0;
+   ClientToScreen(fb->hWnd, &client_pos);
+
+   GetWindowRect(fb->hWnd, &window_rect);
+
+   fb->client_rect.left = client_pos.x - window_rect.left;
+   fb->client_rect.top =  client_pos.y - window_rect.top;
+   fb->client_rect.right = fb->client_rect.left + fb->width;
+   fb->client_rect.bottom = fb->client_rect.top + fb->height;
+
+#if 0
+   debug_printf("\n");
+   debug_printf("%s: client_position = (%i, %i)\n",
+                __FUNCTION__, client_pos.x, client_pos.y);
+   debug_printf("%s: window_rect = (%i, %i) - (%i, %i)\n",
+                __FUNCTION__,
+                window_rect.left, window_rect.top,
+                window_rect.right, window_rect.bottom);
+   debug_printf("%s: client_rect = (%i, %i) - (%i, %i)\n",
+                __FUNCTION__,
+                fb->client_rect.left, fb->client_rect.top,
+                fb->client_rect.right, fb->client_rect.bottom);
+#endif
 }
 
 
@@ -155,6 +188,7 @@ stw_call_window_proc(
        * can be masked out by the application. */
       LPWINDOWPOS lpWindowPos = (LPWINDOWPOS)pParams->lParam;
       if((lpWindowPos->flags & SWP_SHOWWINDOW) || 
+         !(lpWindowPos->flags & SWP_NOMOVE) ||
          !(lpWindowPos->flags & SWP_NOSIZE)) {
          fb = stw_framebuffer_from_hwnd( pParams->hwnd );
          if(fb) {
@@ -379,10 +413,10 @@ stw_framebuffer_from_hwnd(
 }
 
 
-BOOL
-stw_pixelformat_set(
+BOOL APIENTRY
+DrvSetPixelFormat(
    HDC hdc,
-   int iPixelFormat )
+   LONG iPixelFormat )
 {
    uint count;
    uint index;
@@ -435,35 +469,24 @@ stw_pixelformat_get(
 }
 
 
-BOOL
-stw_swap_buffers(
-   HDC hdc )
+BOOL APIENTRY
+DrvPresentBuffers(HDC hdc, PGLPRESENTBUFFERSDATA data)
 {
    struct stw_framebuffer *fb;
    struct pipe_screen *screen;
    struct pipe_surface *surface;
+   unsigned surface_index;
+   BOOL ret = FALSE;
 
    fb = stw_framebuffer_from_hdc( hdc );
    if (fb == NULL)
       return FALSE;
 
-   if (!(fb->pfi->pfd.dwFlags & PFD_DOUBLEBUFFER)) {
-      stw_framebuffer_release(fb);
-      return TRUE;
-   }
-
-   /* If we're swapping the buffer associated with the current context
-    * we have to flush any pending rendering commands first.
-    */
-   st_notify_swapbuffers( fb->stfb );
-
    screen = stw_dev->screen;
-   
-   if(!st_get_framebuffer_surface( fb->stfb, ST_SURFACE_BACK_LEFT, &surface )) {
-      /* FIXME: this shouldn't happen, but does on glean */
-      stw_framebuffer_release(fb);
-      return FALSE;
-   }
+
+   surface_index = (unsigned)(uintptr_t)data->pPrivateData;
+   if(!st_get_framebuffer_surface( fb->stfb, surface_index, &surface ))
+      goto fail;
 
 #ifdef DEBUG
    if(stw_dev->trace_running) {
@@ -472,22 +495,127 @@ stw_swap_buffers(
    }
 #endif
 
-   stw_dev->stw_winsys->flush_frontbuffer( screen, surface, hdc );
-   
+   if(data->hSharedSurface != fb->hSharedSurface) {
+      if(fb->shared_surface) {
+         stw_dev->stw_winsys->shared_surface_close(screen, fb->shared_surface);
+         fb->shared_surface = NULL;
+      }
+
+      fb->hSharedSurface = data->hSharedSurface;
+
+      if(data->hSharedSurface &&
+         stw_dev->stw_winsys->shared_surface_open) {
+         fb->shared_surface = stw_dev->stw_winsys->shared_surface_open(screen, fb->hSharedSurface);
+      }
+   }
+
+   if(fb->shared_surface) {
+      stw_dev->stw_winsys->compose(screen,
+                                   surface,
+                                   fb->shared_surface,
+                                   &fb->client_rect,
+                                   data->PresentHistoryToken);
+   }
+   else {
+      stw_dev->stw_winsys->present( screen, surface, hdc );
+   }
+
+   ret = TRUE;
+
+fail:
+
    stw_framebuffer_update(fb);
+
    stw_framebuffer_release(fb);
-   
-   return TRUE;
+
+   return ret;
 }
 
 
+/**
+ * Queue a composition.
+ *
+ * It will drop the lock on success.
+ */
 BOOL
-stw_swap_layer_buffers(
+stw_framebuffer_present_locked(HDC hdc,
+                               struct stw_framebuffer *fb,
+                               unsigned surface_index)
+{
+   if(stw_dev->callbacks.wglCbPresentBuffers &&
+      stw_dev->stw_winsys->compose) {
+      GLCBPRESENTBUFFERSDATA data;
+
+      memset(&data, 0, sizeof data);
+      data.magic1 = 2;
+      data.magic2 = 0;
+      data.AdapterLuid = stw_dev->AdapterLuid;
+      data.rect = fb->client_rect;
+      data.pPrivateData = (void *)(uintptr_t)surface_index;
+
+      stw_framebuffer_release(fb);
+
+      return stw_dev->callbacks.wglCbPresentBuffers(hdc, &data);
+   }
+   else {
+      struct pipe_screen *screen = stw_dev->screen;
+      struct pipe_surface *surface;
+
+      if(!st_get_framebuffer_surface( fb->stfb, surface_index, &surface )) {
+         /* FIXME: this shouldn't happen, but does on glean */
+         stw_framebuffer_release(fb);
+         return FALSE;
+      }
+
+#ifdef DEBUG
+      if(stw_dev->trace_running) {
+         screen = trace_screen(screen)->screen;
+         surface = trace_surface(surface)->surface;
+      }
+#endif
+
+      stw_dev->stw_winsys->present( screen, surface, hdc );
+
+      stw_framebuffer_update(fb);
+
+      stw_framebuffer_release(fb);
+
+      return TRUE;
+   }
+}
+
+
+BOOL APIENTRY
+DrvSwapBuffers(
+   HDC hdc )
+{
+   struct stw_framebuffer *fb;
+
+   fb = stw_framebuffer_from_hdc( hdc );
+   if (fb == NULL)
+      return FALSE;
+
+   if (!(fb->pfi->pfd.dwFlags & PFD_DOUBLEBUFFER)) {
+      stw_framebuffer_release(fb);
+      return TRUE;
+   }
+
+   /* If we're swapping the buffer associated with the current context
+    * we have to flush any pending rendering commands first.
+    */
+   st_notify_swapbuffers( fb->stfb );
+
+   return stw_framebuffer_present_locked(hdc, fb, ST_SURFACE_BACK_LEFT);
+}
+
+
+BOOL APIENTRY
+DrvSwapLayerBuffers(
    HDC hdc,
    UINT fuPlanes )
 {
    if(fuPlanes & WGL_SWAP_MAIN_PLANE)
-      return stw_swap_buffers(hdc);
+      return DrvSwapBuffers(hdc);
 
    return FALSE;
 }
diff --git a/src/gallium/state_trackers/wgl/shared/stw_framebuffer.h b/src/gallium/state_trackers/wgl/stw_framebuffer.h
index 13d29f37e4..5afbe74908 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_framebuffer.h
+++ b/src/gallium/state_trackers/wgl/stw_framebuffer.h
@@ -73,9 +73,20 @@ struct stw_framebuffer
    
    /* FIXME: Make this work for multiple contexts bound to the same framebuffer */
    boolean must_resize;
+
    unsigned width;
    unsigned height;
    
+   /**
+    * Client area rectangle, relative to the window upper-left corner.
+    *
+    * @sa GLCBPRESENTBUFFERSDATA::rect.
+    */
+   RECT client_rect;
+
+   HANDLE hSharedSurface;
+   struct stw_shared_surface *shared_surface;
+
    /** 
     * This is protected by stw_device::fb_mutex, not the mutex above.
     * 
@@ -126,6 +137,11 @@ BOOL
 stw_framebuffer_allocate(
    struct stw_framebuffer *fb );
 
+BOOL
+stw_framebuffer_present_locked(HDC hdc,
+                               struct stw_framebuffer *fb,
+                               unsigned surface_index);
+
 void
 stw_framebuffer_update(
    struct stw_framebuffer *fb);
diff --git a/src/gallium/state_trackers/wgl/shared/stw_getprocaddress.c b/src/gallium/state_trackers/wgl/stw_getprocaddress.c
index 879ced925a..8875dc22f3 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_getprocaddress.c
+++ b/src/gallium/state_trackers/wgl/stw_getprocaddress.c
@@ -33,8 +33,7 @@
 #include <GL/wglext.h>
 
 #include "glapi/glapi.h"
-#include "stw_public.h"
-#include "stw_extgallium.h"
+#include "stw_ext_gallium.h"
 
 struct stw_extension_entry
 {
@@ -68,8 +67,8 @@ static const struct stw_extension_entry stw_extension_entries[] = {
    { NULL, NULL }
 };
 
-PROC
-stw_get_proc_address(
+PROC APIENTRY
+DrvGetProcAddress(
    LPCSTR lpszProc )
 {
    const struct stw_extension_entry *entry;
diff --git a/src/gallium/state_trackers/wgl/icd/stw_icd.h b/src/gallium/state_trackers/wgl/stw_icd.h
index cbc1a66548..02eb543fef 100644
--- a/src/gallium/state_trackers/wgl/icd/stw_icd.h
+++ b/src/gallium/state_trackers/wgl/stw_icd.h
@@ -1,6 +1,6 @@
 /**************************************************************************
  *
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2008-2009 Vmware, Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -18,7 +18,7 @@
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -388,6 +388,113 @@ typedef struct _GLCLTPROCTABLE
 
 typedef VOID (APIENTRY * PFN_SETPROCTABLE)(PGLCLTPROCTABLE);
 
+/**
+ * Presentation data passed to opengl32!wglCbPresentBuffers.
+ *
+ * Pure software drivers don't need to worry about this -- if they stick to the
+ * GDI API then will integrate with the Desktop Window Manager (DWM) without
+ * problems. Hardware drivers, however, cannot present directly to the primary
+ * surface while the DWM is active, as DWM gets exclusive access to the primary
+ * surface.
+ *
+ * Proper DWM integration requires:
+ * - advertise the PFD_SUPPORT_COMPOSITION flag
+ * - redirect glFlush/glfinish/wglSwapBuffers into a surface shared with the
+ * DWM process.
+ *
+ * @sa http://www.opengl.org/pipeline/article/vol003_7/
+ * @sa http://blogs.msdn.com/greg_schechter/archive/2006/05/02/588934.aspx
+ */
+typedef struct _GLCBPRESENTBUFFERSDATA
+{
+   /**
+    * wglCbPresentBuffers enforces this to be 2.
+    */
+   DWORD magic1;
+
+   /**
+    * wglCbPresentBuffers enforces to be 0 or 1, but it is most commonly
+    * set to 0.
+    */
+   DWORD magic2;
+
+   /**
+    * Locally unique identifier (LUID) of the graphics adapter.
+    *
+    * This should contain the value returned by D3DKMTOpenAdapterFromHdc. It
+    * is passed to dwmapi!DwmpDxGetWindowSharedSurface in order to obtain
+    * the shared surface handle for the bound drawable (window).
+    *
+    * @sa http://msdn.microsoft.com/en-us/library/ms799177.aspx
+    */
+   LUID AdapterLuid;
+
+   /**
+    * This is passed unmodified to DrvPresentBuffers
+    */
+   LPVOID pPrivateData;
+
+   /**
+    * Client area rectangle to update, relative to the window upper-left corner.
+    */
+   RECT rect;
+} GLCBPRESENTBUFFERSDATA, *PGLCBPRESENTBUFFERSDATA;
+
+/**
+ * Callbacks supplied to DrvSetCallbackProcs by the OpenGL runtime.
+ *
+ * Pointers to several callback functions in opengl32.dll.
+ */
+typedef struct _GLCALLBACKTABLE
+{
+   /** Unused */
+   PROC wglCbSetCurrentValue;
+
+   /** Unused */
+   PROC wglCbGetCurrentValue;
+
+   /** Unused */
+   PROC wglCbGetDhglrc;
+
+   /** Unused */
+   PROC wglCbGetDdHandle;
+
+   /**
+    * Queue a present composition.
+    *
+    * Makes the runtime call DrvPresentBuffers with the composition information.
+    */
+   BOOL (APIENTRY *wglCbPresentBuffers)(HDC hdc, PGLCBPRESENTBUFFERSDATA data);
+
+} GLCALLBACKTABLE;
+
+typedef struct _GLPRESENTBUFFERSDATA
+{
+   /**
+    * The shared surface handle.
+    *
+    * Return by dwmapi!DwmpDxGetWindowSharedSurface.
+    *
+    * @sa http://channel9.msdn.com/forums/TechOff/251261-Help-Getting-the-shared-window-texture-out-of-DWM-/
+    */
+   HANDLE hSharedSurface;
+
+   LUID AdapterLuid;
+
+   /**
+    * Present history token.
+    *
+    * This is returned by dwmapi!DwmpDxGetWindowSharedSurface and
+    * should be passed to D3DKMTRender in D3DKMT_RENDER::PresentHistoryToken.
+    *
+    * @sa http://msdn.microsoft.com/en-us/library/ms799176.aspx
+    */
+   ULONGLONG PresentHistoryToken;
+
+   /** Same as GLCBPRESENTBUFFERSDATA::pPrivateData */
+   LPVOID pPrivateData;
+} GLPRESENTBUFFERSDATA, *PGLPRESENTBUFFERSDATA;
+
 BOOL APIENTRY
 DrvCopyContext(
    DHGLRC dhrcSource,
@@ -435,6 +542,9 @@ DrvGetProcAddress(
    LPCSTR lpszProc );
 
 BOOL APIENTRY
+DrvPresentBuffers(HDC hdc, PGLPRESENTBUFFERSDATA data);
+
+BOOL APIENTRY
 DrvRealizeLayerPalette(
    HDC hdc,
    INT iLayerPlane,
diff --git a/src/gallium/state_trackers/wgl/shared/stw_pixelformat.c b/src/gallium/state_trackers/wgl/stw_pixelformat.c
index c296744838..7abe5d9f7f 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_pixelformat.c
+++ b/src/gallium/state_trackers/wgl/stw_pixelformat.c
@@ -34,9 +34,9 @@
 
 #include "util/u_debug.h"
 
+#include "stw_icd.h"
 #include "stw_device.h"
 #include "stw_pixelformat.h"
-#include "stw_public.h"
 #include "stw_tls.h"
 
 
@@ -154,8 +154,11 @@ stw_pixelformat_add(
    pfi->pfd.dwFlags = PFD_SUPPORT_OPENGL;
    
    /* TODO: also support non-native pixel formats */
-   pfi->pfd.dwFlags |= PFD_DRAW_TO_WINDOW ;
-   
+   pfi->pfd.dwFlags |= PFD_DRAW_TO_WINDOW;
+
+   /* See http://www.opengl.org/pipeline/article/vol003_7/ */
+   pfi->pfd.dwFlags |= PFD_SUPPORT_COMPOSITION;
+
    if (doublebuffer)
       pfi->pfd.dwFlags |= PFD_DOUBLEBUFFER | PFD_SWAP_COPY;
    
@@ -288,12 +291,12 @@ stw_pixelformat_visual(GLvisual *visual,
 }
 
 
-int
-stw_pixelformat_describe(
+LONG APIENTRY
+DrvDescribePixelFormat(
    HDC hdc,
-   int iPixelFormat,
-   UINT nBytes,
-   LPPIXELFORMATDESCRIPTOR ppfd )
+   INT iPixelFormat,
+   ULONG cjpfd,
+   PIXELFORMATDESCRIPTOR *ppfd )
 {
    uint count;
    uint index;
@@ -306,7 +309,7 @@ stw_pixelformat_describe(
 
    if (ppfd == NULL)
       return count;
-   if (index >= count || nBytes != sizeof( PIXELFORMATDESCRIPTOR ))
+   if (index >= count || cjpfd != sizeof( PIXELFORMATDESCRIPTOR ))
       return 0;
 
    pfi = stw_pixelformat_get_info( index );
@@ -316,6 +319,52 @@ stw_pixelformat_describe(
    return count;
 }
 
+BOOL APIENTRY
+DrvDescribeLayerPlane(
+   HDC hdc,
+   INT iPixelFormat,
+   INT iLayerPlane,
+   UINT nBytes,
+   LPLAYERPLANEDESCRIPTOR plpd )
+{
+   assert(0);
+   return FALSE;
+}
+
+int APIENTRY
+DrvGetLayerPaletteEntries(
+   HDC hdc,
+   INT iLayerPlane,
+   INT iStart,
+   INT cEntries,
+   COLORREF *pcr )
+{
+   assert(0);
+   return 0;
+}
+
+int APIENTRY
+DrvSetLayerPaletteEntries(
+   HDC hdc,
+   INT iLayerPlane,
+   INT iStart,
+   INT cEntries,
+   CONST COLORREF *pcr )
+{
+   assert(0);
+   return 0;
+}
+
+BOOL APIENTRY
+DrvRealizeLayerPalette(
+   HDC hdc,
+   INT iLayerPlane,
+   BOOL bRealize )
+{
+   assert(0);
+   return FALSE;
+}
+
 /* Only used by the wgl code, but have it here to avoid exporting the
  * pixelformat.h functionality.
  */
diff --git a/src/gallium/state_trackers/wgl/shared/stw_pixelformat.h b/src/gallium/state_trackers/wgl/stw_pixelformat.h
index bec429231b..3a690b35ba 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_pixelformat.h
+++ b/src/gallium/state_trackers/wgl/stw_pixelformat.h
@@ -30,6 +30,10 @@
 
 #include <windows.h>
 
+#ifndef PFD_SUPPORT_COMPOSITION
+#define PFD_SUPPORT_COMPOSITION 0x00008000
+#endif
+
 #include "main/mtypes.h"
 
 #include "pipe/p_compiler.h"
@@ -62,4 +66,11 @@ void
 stw_pixelformat_visual(GLvisual *visual, 
                        const struct stw_pixelformat_info *pfi );
 
+int
+stw_pixelformat_choose( HDC hdc,
+                        CONST PIXELFORMATDESCRIPTOR *ppfd );
+
+int
+stw_pixelformat_get(HDC hdc);
+
 #endif /* STW_PIXELFORMAT_H */
diff --git a/src/gallium/state_trackers/wgl/shared/stw_tls.c b/src/gallium/state_trackers/wgl/stw_tls.c
index 4bd6a9289c..4bd6a9289c 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_tls.c
+++ b/src/gallium/state_trackers/wgl/stw_tls.c
diff --git a/src/gallium/state_trackers/wgl/shared/stw_tls.h b/src/gallium/state_trackers/wgl/stw_tls.h
index fbf8b1cbee..fbf8b1cbee 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_tls.h
+++ b/src/gallium/state_trackers/wgl/stw_tls.h
diff --git a/src/gallium/state_trackers/wgl/wgl/stw_wgl.c b/src/gallium/state_trackers/wgl/stw_wgl.c
index a131292f7a..bb199fdd25 100644
--- a/src/gallium/state_trackers/wgl/wgl/stw_wgl.c
+++ b/src/gallium/state_trackers/wgl/stw_wgl.c
@@ -28,7 +28,9 @@
 #include <windows.h>
 
 #include "util/u_debug.h"
-#include "shared/stw_public.h"
+#include "stw_icd.h"
+#include "stw_context.h"
+#include "stw_pixelformat.h"
 #include "stw_wgl.h"
 
 
@@ -38,16 +40,16 @@ wglCopyContext(
    HGLRC hglrcDst,
    UINT mask )
 {
-   return stw_copy_context( (UINT_PTR)hglrcSrc, 
-                            (UINT_PTR)hglrcDst, 
-                            mask );
+   return DrvCopyContext( (DHGLRC)(UINT_PTR)hglrcSrc,
+                          (DHGLRC)(UINT_PTR)hglrcDst,
+                          mask );
 }
 
 WINGDIAPI HGLRC APIENTRY
 wglCreateContext(
    HDC hdc )
 {
-   return wglCreateLayerContext(hdc, 0);
+   return (HGLRC) DrvCreateContext(hdc);
 }
 
 WINGDIAPI HGLRC APIENTRY
@@ -55,21 +57,21 @@ wglCreateLayerContext(
    HDC hdc,
    int iLayerPlane )
 {
-   return (HGLRC) stw_create_layer_context( hdc, iLayerPlane );
+   return (HGLRC) DrvCreateLayerContext( hdc, iLayerPlane );
 }
 
 WINGDIAPI BOOL APIENTRY
 wglDeleteContext(
    HGLRC hglrc )
 {
-   return stw_delete_context( (UINT_PTR)hglrc );
+   return DrvDeleteContext((DHGLRC)(UINT_PTR)hglrc );
 }
 
 
 WINGDIAPI HGLRC APIENTRY
 wglGetCurrentContext( VOID )
 {
-   return (HGLRC)stw_get_current_context();
+   return (HGLRC)(UINT_PTR)stw_get_current_context();
 }
 
 WINGDIAPI HDC APIENTRY
@@ -83,7 +85,7 @@ wglMakeCurrent(
    HDC hdc,
    HGLRC hglrc )
 {
-   return stw_make_current( hdc, (UINT_PTR)hglrc );
+   return DrvSetContext( hdc, (DHGLRC)(UINT_PTR)hglrc, NULL ) ? TRUE : FALSE;
 }
 
 
@@ -91,7 +93,7 @@ WINGDIAPI BOOL APIENTRY
 wglSwapBuffers(
    HDC hdc )
 {
-   return stw_swap_buffers( hdc );
+   return DrvSwapBuffers( hdc );
 }
 
 
@@ -100,14 +102,14 @@ wglSwapLayerBuffers(
    HDC hdc,
    UINT fuPlanes )
 {
-   return stw_swap_layer_buffers( hdc, fuPlanes );
+   return DrvSwapLayerBuffers( hdc, fuPlanes );
 }
 
 WINGDIAPI PROC APIENTRY
 wglGetProcAddress(
     LPCSTR lpszProc )
 {
-   return stw_get_proc_address( lpszProc );
+   return DrvGetProcAddress( lpszProc );
 }
 
 
@@ -141,7 +143,7 @@ wglDescribePixelFormat(
    UINT nBytes,
    LPPIXELFORMATDESCRIPTOR ppfd )
 {
-   return stw_pixelformat_describe( hdc, iPixelFormat, nBytes, ppfd );
+   return DrvDescribePixelFormat( hdc, iPixelFormat, nBytes, ppfd );
 }
 
 WINGDIAPI int APIENTRY
@@ -160,7 +162,7 @@ wglSetPixelFormat(
    if (ppfd->nSize != sizeof( PIXELFORMATDESCRIPTOR ))
       return FALSE;
 
-   return stw_pixelformat_set( hdc, iPixelFormat );
+   return DrvSetPixelFormat( hdc, iPixelFormat );
 }
 
 
@@ -186,7 +188,8 @@ wglShareLists(
    HGLRC hglrc1,
    HGLRC hglrc2 )
 {
-   return stw_share_lists( (UINT_PTR)hglrc1, (UINT_PTR)hglrc2);;
+   return DrvShareLists((DHGLRC)(UINT_PTR)hglrc1,
+                        (DHGLRC)(UINT_PTR)hglrc2);
 }
 
 WINGDIAPI BOOL APIENTRY
@@ -264,15 +267,7 @@ wglDescribeLayerPlane(
    UINT nBytes,
    LPLAYERPLANEDESCRIPTOR plpd )
 {
-   (void) hdc;
-   (void) iPixelFormat;
-   (void) iLayerPlane;
-   (void) nBytes;
-   (void) plpd;
-
-   assert( 0 );
-
-   return FALSE;
+   return DrvDescribeLayerPlane(hdc, iPixelFormat, iLayerPlane, nBytes, plpd);
 }
 
 WINGDIAPI int APIENTRY
@@ -283,15 +278,7 @@ wglSetLayerPaletteEntries(
    int cEntries,
    CONST COLORREF *pcr )
 {
-   (void) hdc;
-   (void) iLayerPlane;
-   (void) iStart;
-   (void) cEntries;
-   (void) pcr;
-
-   assert( 0 );
-
-   return 0;
+   return DrvSetLayerPaletteEntries(hdc, iLayerPlane, iStart, cEntries, pcr);
 }
 
 WINGDIAPI int APIENTRY
@@ -302,15 +289,7 @@ wglGetLayerPaletteEntries(
    int cEntries,
    COLORREF *pcr )
 {
-   (void) hdc;
-   (void) iLayerPlane;
-   (void) iStart;
-   (void) cEntries;
-   (void) pcr;
-
-   assert( 0 );
-
-   return 0;
+   return DrvGetLayerPaletteEntries(hdc, iLayerPlane, iStart, cEntries, pcr);
 }
 
 WINGDIAPI BOOL APIENTRY
diff --git a/src/gallium/state_trackers/wgl/wgl/stw_wgl.h b/src/gallium/state_trackers/wgl/stw_wgl.h
index a98179944a..a98179944a 100644
--- a/src/gallium/state_trackers/wgl/wgl/stw_wgl.h
+++ b/src/gallium/state_trackers/wgl/stw_wgl.h
diff --git a/src/gallium/state_trackers/wgl/shared/stw_winsys.h b/src/gallium/state_trackers/wgl/stw_winsys.h
index c0bf82c9ed..1ead47d6e6 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_winsys.h
+++ b/src/gallium/state_trackers/wgl/stw_winsys.h
@@ -1,6 +1,6 @@
 /**************************************************************************
  *
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2008-2009 Vmware, Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -18,7 +18,7 @@
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -36,6 +36,8 @@ struct pipe_screen;
 struct pipe_context;
 struct pipe_surface;
 
+struct stw_shared_surface;
+
 struct stw_winsys
 {
    struct pipe_screen *
@@ -44,10 +46,52 @@ struct stw_winsys
    struct pipe_context *
    (*create_context)( struct pipe_screen *screen );
 
+   /**
+    * Present the color buffer to the window associated with the device context.
+    */
+   void
+   (*present)( struct pipe_screen *screen,
+               struct pipe_surface *surf,
+               HDC hDC );
+
+   /**
+    * Locally unique identifier (LUID) of the graphics adapter.
+    *
+    * @sa GLCBPRESENTBUFFERSDATA::AdapterLuid;
+    */
+   boolean
+   (*get_adapter_luid)( struct pipe_screen *screen,
+                        LUID *pAdapterLuid );
+
+   /**
+    * Open a shared surface (optional).
+    *
+    * @sa GLCBPRESENTBUFFERSDATA::hSharedSurface;
+    */
+   struct stw_shared_surface *
+   (*shared_surface_open)(struct pipe_screen *screen,
+                          HANDLE hSharedSurface);
+
+   /**
+    * Open a shared surface (optional).
+    */
+   void
+   (*shared_surface_close)(struct pipe_screen *screen,
+                           struct stw_shared_surface *surface);
+
+   /**
+    * Compose into a shared (optional).
+    *
+    * Blit the color buffer into a shared surface.
+    *
+    * @sa GLPRESENTBUFFERSDATA::PresentHistoryToken.
+    */
    void
-   (*flush_frontbuffer)( struct pipe_screen *screen,
-                         struct pipe_surface *surf,
-                         HDC hDC );
+   (*compose)( struct pipe_screen *screen,
+               struct pipe_surface *src,
+               struct stw_shared_surface *dest,
+               LPCRECT pRect,
+               ULONGLONG PresentHistoryToken );
 };
 
 boolean
diff --git a/src/gallium/state_trackers/xorg/xorg_composite.c b/src/gallium/state_trackers/xorg/xorg_composite.c
index c708ac3170..7037d17e43 100644
--- a/src/gallium/state_trackers/xorg/xorg_composite.c
+++ b/src/gallium/state_trackers/xorg/xorg_composite.c
@@ -4,6 +4,7 @@
 
 #include "cso_cache/cso_context.h"
 #include "util/u_draw_quad.h"
+#include "util/u_math.h"
 
 #include "pipe/p_inlines.h"
 
@@ -11,9 +12,9 @@ struct xorg_composite_blend {
    int op:8;
 
    unsigned rgb_src_factor:5;    /**< PIPE_BLENDFACTOR_x */
-   unsigned rgb_dst_factor:5;    /**< PIPE_BLENDFACTOR_x */
-
    unsigned alpha_src_factor:5;  /**< PIPE_BLENDFACTOR_x */
+
+   unsigned rgb_dst_factor:5;    /**< PIPE_BLENDFACTOR_x */
    unsigned alpha_dst_factor:5;  /**< PIPE_BLENDFACTOR_x */
 };
 
@@ -40,22 +41,20 @@ static const struct xorg_composite_blend xorg_blends[] = {
      PIPE_BLENDFACTOR_INV_SRC_ALPHA, PIPE_BLENDFACTOR_INV_SRC_ALPHA },
 };
 
+
 static INLINE void
-pixel_to_float4(PictFormatPtr format,
-                CARD32 pixel, float *color)
+pixel_to_float4(Pixel pixel, float *color)
 {
    CARD32	    r, g, b, a;
 
-   debug_assert(format->type == PictTypeDirect);
-
-   r = (pixel >> format->direct.red) & format->direct.redMask;
-   g = (pixel >> format->direct.green) & format->direct.greenMask;
-   b = (pixel >> format->direct.blue) & format->direct.blueMask;
-   a = (pixel >> format->direct.alpha) & format->direct.alphaMask;
-   color[0] = ((float)r) / ((float)format->direct.redMask);
-   color[1] = ((float)g) / ((float)format->direct.greenMask);
-   color[2] = ((float)b) / ((float)format->direct.blueMask);
-   color[3] = ((float)a) / ((float)format->direct.alphaMask);
+   a = (pixel >> 24) & 0xff;
+   r = (pixel >> 16) & 0xff;
+   g = (pixel >>  8) & 0xff;
+   b = (pixel >>  0) & 0xff;
+   color[0] = ((float)r) / 255.;
+   color[1] = ((float)g) / 255.;
+   color[2] = ((float)b) / 255.;
+   color[3] = ((float)a) / 255.;
 }
 
 struct acceleration_info {
@@ -133,24 +132,22 @@ setup_vertex_data0(struct exa_context *ctx,
                    int srcX, int srcY, int maskX, int maskY,
                    int dstX, int dstY, int width, int height)
 {
-   float vertices[4][2][4];
-
    /* 1st vertex */
-   setup_vertex0(vertices[0], dstX, dstY,
+   setup_vertex0(ctx->vertices2[0], dstX, dstY,
                  ctx->solid_color);
    /* 2nd vertex */
-   setup_vertex0(vertices[1], dstX + width, dstY,
+   setup_vertex0(ctx->vertices2[1], dstX + width, dstY,
                  ctx->solid_color);
    /* 3rd vertex */
-   setup_vertex0(vertices[2], dstX + width, dstY + height,
+   setup_vertex0(ctx->vertices2[2], dstX + width, dstY + height,
                  ctx->solid_color);
    /* 4th vertex */
-   setup_vertex0(vertices[3], dstX, dstY + height,
+   setup_vertex0(ctx->vertices2[3], dstX, dstY + height,
                  ctx->solid_color);
 
-   return pipe_user_buffer_create(ctx->ctx->screen,
-                                  vertices,
-                                  sizeof(vertices));
+   return pipe_user_buffer_create(ctx->pipe->screen,
+                                  ctx->vertices2,
+                                  sizeof(ctx->vertices2));
 }
 
 static INLINE void
@@ -172,7 +169,6 @@ setup_vertex_data1(struct exa_context *ctx,
                    int srcX, int srcY, int maskX, int maskY,
                    int dstX, int dstY, int width, int height)
 {
-   float vertices[4][2][4];
    float s0, t0, s1, t1;
    struct pipe_texture *src = ctx->bound_textures[0];
 
@@ -182,24 +178,49 @@ setup_vertex_data1(struct exa_context *ctx,
    t1 = srcY + height / src->height[0];
 
    /* 1st vertex */
-   setup_vertex1(vertices[0], dstX, dstY,
+   setup_vertex1(ctx->vertices2[0], dstX, dstY,
+                 s0, t0);
+   /* 2nd vertex */
+   setup_vertex1(ctx->vertices2[1], dstX + width, dstY,
+                 s1, t0);
+   /* 3rd vertex */
+   setup_vertex1(ctx->vertices2[2], dstX + width, dstY + height,
+                 s1, t1);
+   /* 4th vertex */
+   setup_vertex1(ctx->vertices2[3], dstX, dstY + height,
+                 s0, t1);
+
+   return pipe_user_buffer_create(ctx->pipe->screen,
+                                  ctx->vertices2,
+                                  sizeof(ctx->vertices2));
+}
+
+static struct pipe_buffer *
+setup_vertex_data_tex(struct exa_context *ctx,
+                      float x0, float y0, float x1, float y1,
+                      float s0, float t0, float s1, float t1,
+                      float z)
+{
+   /* 1st vertex */
+   setup_vertex1(ctx->vertices2[0], x0, y0,
                  s0, t0);
    /* 2nd vertex */
-   setup_vertex1(vertices[1], dstX + width, dstY,
+   setup_vertex1(ctx->vertices2[1], x1, y0,
                  s1, t0);
    /* 3rd vertex */
-   setup_vertex1(vertices[2], dstX + width, dstY + height,
+   setup_vertex1(ctx->vertices2[2], x1, y1,
                  s1, t1);
    /* 4th vertex */
-   setup_vertex1(vertices[3], dstX, dstY + height,
+   setup_vertex1(ctx->vertices2[3], x0, y1,
                  s0, t1);
 
-   return pipe_user_buffer_create(ctx->ctx->screen,
-                                  vertices,
-                                  sizeof(vertices));
+   return pipe_user_buffer_create(ctx->pipe->screen,
+                                  ctx->vertices2,
+                                  sizeof(ctx->vertices2));
 }
 
 
+
 static INLINE void
 setup_vertex2(float vertex[3][4], float x, float y,
               float s0, float t0, float s1, float t1)
@@ -225,7 +246,6 @@ setup_vertex_data2(struct exa_context *ctx,
                    int srcX, int srcY, int maskX, int maskY,
                    int dstX, int dstY, int width, int height)
 {
-   float vertices[4][3][4];
    float st0[4], st1[4];
    struct pipe_texture *src = ctx->bound_textures[0];
    struct pipe_texture *mask = ctx->bound_textures[0];
@@ -241,21 +261,21 @@ setup_vertex_data2(struct exa_context *ctx,
    st1[3] = maskY + height / mask->height[0];
 
    /* 1st vertex */
-   setup_vertex2(vertices[0], dstX, dstY,
+   setup_vertex2(ctx->vertices3[0], dstX, dstY,
                  st0[0], st0[1], st1[0], st1[1]);
    /* 2nd vertex */
-   setup_vertex2(vertices[1], dstX + width, dstY,
+   setup_vertex2(ctx->vertices3[1], dstX + width, dstY,
                  st0[2], st0[1], st1[2], st1[1]);
    /* 3rd vertex */
-   setup_vertex2(vertices[2], dstX + width, dstY + height,
+   setup_vertex2(ctx->vertices3[2], dstX + width, dstY + height,
                  st0[2], st0[3], st1[2], st1[3]);
    /* 4th vertex */
-   setup_vertex2(vertices[3], dstX, dstY + height,
+   setup_vertex2(ctx->vertices3[3], dstX, dstY + height,
                  st0[0], st0[3], st1[0], st1[3]);
 
-   return pipe_user_buffer_create(ctx->ctx->screen,
-                                  vertices,
-                                  sizeof(vertices));
+   return pipe_user_buffer_create(ctx->pipe->screen,
+                                  ctx->vertices3,
+                                  sizeof(ctx->vertices3));
 }
 
 boolean xorg_composite_accelerated(int op,
@@ -267,22 +287,25 @@ boolean xorg_composite_accelerated(int op,
    unsigned accel_ops_count =
       sizeof(accelerated_ops)/sizeof(struct acceleration_info);
 
-
-   /*FIXME: currently accel is disabled */
-   return FALSE;
-
-   if (pSrcPicture) {
-      /* component alpha not supported */
-      if (pSrcPicture->componentAlpha)
+   if (pSrcPicture->pSourcePict) {
+      /* Gradients not yet supported */
+      if (pSrcPicture->pSourcePict->type != SourcePictTypeSolidFill)
          return FALSE;
-      /* fills not supported */
-      if (pSrcPicture->pSourcePict)
+
+      /* Solid source with mask not yet handled properly */
+      if (pMaskPicture)
          return FALSE;
    }
 
    for (i = 0; i < accel_ops_count; ++i) {
       if (op == accelerated_ops[i].op) {
-         if (pMaskPicture && !accelerated_ops[i].with_mask)
+         /* Check for unsupported component alpha */
+         if ((pSrcPicture->componentAlpha &&
+              !accelerated_ops[i].component_alpha) ||
+             (pMaskPicture &&
+              (!accelerated_ops[i].with_mask ||
+               (pMaskPicture->componentAlpha &&
+                !accelerated_ops[i].component_alpha))))
             return FALSE;
          return TRUE;
       }
@@ -291,16 +314,20 @@ boolean xorg_composite_accelerated(int op,
 }
 
 static void
-bind_framebuffer_state(struct exa_context *exa, PicturePtr pDstPicture,
-                       struct exa_pixmap_priv *pDst)
+bind_clip_state(struct exa_context *exa)
+{
+}
+
+static void
+bind_framebuffer_state(struct exa_context *exa, struct exa_pixmap_priv *pDst)
 {
    unsigned i;
    struct pipe_framebuffer_state state;
    struct pipe_surface *surface = exa_gpu_surface(exa, pDst);
    memset(&state, 0, sizeof(struct pipe_framebuffer_state));
 
-   state.width  = pDstPicture->pDrawable->width;
-   state.height = pDstPicture->pDrawable->height;
+   state.width  = pDst->tex->width[0];
+   state.height = pDst->tex->height[0];
 
    state.nr_cbufs = 1;
    state.cbufs[0] = surface;
@@ -311,6 +338,9 @@ bind_framebuffer_state(struct exa_context *exa, PicturePtr pDstPicture,
    state.zsbuf = 0;
 
    cso_set_framebuffer(exa->cso, &state);
+
+   /* we do fire and forget for the framebuffer, this is the forget part */
+   pipe_surface_reference(&surface, NULL);
 }
 
 enum AxisOrientation {
@@ -338,10 +368,12 @@ set_viewport(struct exa_context *exa, int width, int height,
 }
 
 static void
-bind_viewport_state(struct exa_context *exa, PicturePtr pDstPicture)
+bind_viewport_state(struct exa_context *exa, struct exa_pixmap_priv *pDst)
 {
-   int width = pDstPicture->pDrawable->width;
-   int height = pDstPicture->pDrawable->height;
+   int width = pDst->tex->width[0];
+   int height = pDst->tex->height[0];
+
+   /*debug_printf("Bind viewport (%d, %d)\n", width, height);*/
 
    set_viewport(exa, width, height, Y0_TOP);
 }
@@ -350,13 +382,9 @@ static void
 bind_blend_state(struct exa_context *exa, int op,
                  PicturePtr pSrcPicture, PicturePtr pMaskPicture)
 {
-   boolean component_alpha = pSrcPicture->componentAlpha;
    struct xorg_composite_blend blend_opt;
    struct pipe_blend_state blend;
 
-   if (component_alpha) {
-      op = PictOpOver;
-   }
    blend_opt = blend_for_op(op);
 
    memset(&blend, 0, sizeof(struct pipe_blend_state));
@@ -390,14 +418,17 @@ bind_shaders(struct exa_context *exa, int op,
    unsigned vs_traits = 0, fs_traits = 0;
    struct xorg_shader shader;
 
+   exa->has_solid_color = FALSE;
+
    if (pSrcPicture) {
       if (pSrcPicture->pSourcePict) {
          if (pSrcPicture->pSourcePict->type == SourcePictTypeSolidFill) {
             fs_traits |= FS_SOLID_FILL;
             vs_traits |= VS_SOLID_FILL;
-            pixel_to_float4(pSrcPicture->pFormat,
-                            pSrcPicture->pSourcePict->solidFill.color,
+            debug_assert(pSrcPicture->format == PICT_a8r8g8b8);
+            pixel_to_float4(pSrcPicture->pSourcePict->solidFill.color,
                             exa->solid_color);
+            exa->has_solid_color = TRUE;
          } else {
             debug_assert("!gradients not supported");
          }
@@ -434,6 +465,12 @@ bind_samplers(struct exa_context *exa, int op,
    memset(&src_sampler, 0, sizeof(struct pipe_sampler_state));
    memset(&mask_sampler, 0, sizeof(struct pipe_sampler_state));
 
+   if ((pSrc && exa->pipe->is_texture_referenced(exa->pipe, pSrc->tex, 0, 0) &
+        PIPE_REFERENCED_FOR_WRITE) ||
+       (pMask && exa->pipe->is_texture_referenced(exa->pipe, pMask->tex, 0, 0) &
+        PIPE_REFERENCED_FOR_WRITE))
+      exa->pipe->flush(exa->pipe, PIPE_FLUSH_RENDER_CACHE, NULL);
+
    if (pSrcPicture && pSrc) {
       unsigned src_wrap = render_repeat_to_gallium(
          pSrcPicture->repeatType);
@@ -478,15 +515,15 @@ setup_vs_constant_buffer(struct exa_context *exa,
    struct pipe_constant_buffer *cbuf = &exa->vs_const_buffer;
 
    pipe_buffer_reference(&cbuf->buffer, NULL);
-   cbuf->buffer = pipe_buffer_create(exa->ctx->screen, 16,
+   cbuf->buffer = pipe_buffer_create(exa->pipe->screen, 16,
                                      PIPE_BUFFER_USAGE_CONSTANT,
                                      param_bytes);
 
    if (cbuf->buffer) {
-      pipe_buffer_write(exa->ctx->screen, cbuf->buffer,
+      pipe_buffer_write(exa->pipe->screen, cbuf->buffer,
                         0, param_bytes, vs_consts);
    }
-   exa->ctx->set_constant_buffer(exa->ctx, PIPE_SHADER_VERTEX, 0, cbuf);
+   exa->pipe->set_constant_buffer(exa->pipe, PIPE_SHADER_VERTEX, 0, cbuf);
 }
 
 
@@ -500,22 +537,22 @@ setup_fs_constant_buffer(struct exa_context *exa)
    struct pipe_constant_buffer *cbuf = &exa->fs_const_buffer;
 
    pipe_buffer_reference(&cbuf->buffer, NULL);
-   cbuf->buffer = pipe_buffer_create(exa->ctx->screen, 16,
+   cbuf->buffer = pipe_buffer_create(exa->pipe->screen, 16,
                                      PIPE_BUFFER_USAGE_CONSTANT,
                                      param_bytes);
 
    if (cbuf->buffer) {
-      pipe_buffer_write(exa->ctx->screen, cbuf->buffer,
+      pipe_buffer_write(exa->pipe->screen, cbuf->buffer,
                         0, param_bytes, fs_consts);
    }
-   exa->ctx->set_constant_buffer(exa->ctx, PIPE_SHADER_FRAGMENT, 0, cbuf);
+   exa->pipe->set_constant_buffer(exa->pipe, PIPE_SHADER_FRAGMENT, 0, cbuf);
 }
 
 static void
-setup_constant_buffers(struct exa_context *exa, PicturePtr pDstPicture)
+setup_constant_buffers(struct exa_context *exa, struct exa_pixmap_priv *pDst)
 {
-   int width = pDstPicture->pDrawable->width;
-   int height = pDstPicture->pDrawable->height;
+   int width = pDst->tex->width[0];
+   int height = pDst->tex->height[0];
 
    setup_vs_constant_buffer(exa, width, height);
    setup_fs_constant_buffer(exa);
@@ -530,15 +567,15 @@ boolean xorg_composite_bind_state(struct exa_context *exa,
                                   struct exa_pixmap_priv *pMask,
                                   struct exa_pixmap_priv *pDst)
 {
-   bind_framebuffer_state(exa, pDstPicture, pDst);
-   bind_viewport_state(exa, pDstPicture);
+   bind_framebuffer_state(exa, pDst);
+   bind_viewport_state(exa, pDst);
    bind_blend_state(exa, op, pSrcPicture, pMaskPicture);
    bind_rasterizer_state(exa);
    bind_shaders(exa, op, pSrcPicture, pMaskPicture);
    bind_samplers(exa, op, pSrcPicture, pMaskPicture,
                  pDstPicture, pSrc, pMask, pDst);
-
-   setup_constant_buffers(exa, pDstPicture);
+   bind_clip_state(exa);
+   setup_constant_buffers(exa, pDst);
 
    return FALSE;
 }
@@ -548,7 +585,7 @@ void xorg_composite(struct exa_context *exa,
                     int srcX, int srcY, int maskX, int maskY,
                     int dstX, int dstY, int width, int height)
 {
-   struct pipe_context *pipe = exa->ctx;
+   struct pipe_context *pipe = exa->pipe;
    struct pipe_buffer *buf = 0;
 
    if (exa->num_bound_samplers == 0 ) { /* solid fill */
@@ -573,12 +610,457 @@ void xorg_composite(struct exa_context *exa,
    }
 
    if (buf) {
+      int num_attribs = 1; /*pos*/
+      num_attribs += exa->num_bound_samplers;
+      if (exa->has_solid_color)
+         ++num_attribs;
+
       util_draw_vertex_buffer(pipe, buf, 0,
                               PIPE_PRIM_TRIANGLE_FAN,
                               4,  /* verts */
-                              1 + exa->num_bound_samplers); /* attribs/vert */
+                              num_attribs); /* attribs/vert */
 
       pipe_buffer_reference(&buf, NULL);
    }
 }
 
+boolean xorg_solid_bind_state(struct exa_context *exa,
+                              struct exa_pixmap_priv *pixmap,
+                              Pixel fg)
+{
+   unsigned vs_traits, fs_traits;
+   struct xorg_shader shader;
+
+   pixel_to_float4(fg, exa->solid_color);
+   exa->has_solid_color = TRUE;
+
+   exa->solid_color[3] = 1.f;
+
+#if 0
+   debug_printf("Color Pixel=(%d, %d, %d, %d), RGBA=(%f, %f, %f, %f)\n",
+                (fg >> 24) & 0xff, (fg >> 16) & 0xff,
+                (fg >> 8) & 0xff,  (fg >> 0) & 0xff,
+                exa->solid_color[0], exa->solid_color[1],
+                exa->solid_color[2], exa->solid_color[3]);
+#endif
+
+   vs_traits = VS_SOLID_FILL;
+   fs_traits = FS_SOLID_FILL;
+
+   bind_framebuffer_state(exa, pixmap);
+   bind_viewport_state(exa, pixmap);
+   bind_rasterizer_state(exa);
+   bind_blend_state(exa, PictOpSrc, NULL, NULL);
+   setup_constant_buffers(exa, pixmap);
+   bind_clip_state(exa);
+
+   shader = xorg_shaders_get(exa->shaders, vs_traits, fs_traits);
+   cso_set_vertex_shader_handle(exa->cso, shader.vs);
+   cso_set_fragment_shader_handle(exa->cso, shader.fs);
+
+   return TRUE;
+}
+
+void xorg_solid(struct exa_context *exa,
+                struct exa_pixmap_priv *pixmap,
+                int x0, int y0, int x1, int y1)
+{
+   struct pipe_context *pipe = exa->pipe;
+   struct pipe_buffer *buf = 0;
+
+   /* 1st vertex */
+   setup_vertex0(exa->vertices2[0], x0, y0,
+                 exa->solid_color);
+   /* 2nd vertex */
+   setup_vertex0(exa->vertices2[1], x1, y0,
+                 exa->solid_color);
+   /* 3rd vertex */
+   setup_vertex0(exa->vertices2[2], x1, y1,
+                 exa->solid_color);
+   /* 4th vertex */
+   setup_vertex0(exa->vertices2[3], x0, y1,
+                 exa->solid_color);
+
+   buf = pipe_user_buffer_create(exa->pipe->screen,
+                                 exa->vertices2,
+                                 sizeof(exa->vertices2));
+
+
+   if (buf) {
+      util_draw_vertex_buffer(pipe, buf, 0,
+                              PIPE_PRIM_TRIANGLE_FAN,
+                              4,  /* verts */
+                              2); /* attribs/vert */
+
+      pipe_buffer_reference(&buf, NULL);
+   }
+}
+
+
+static INLINE void shift_rectx(float coords[4],
+                               const float *bounds,
+                               const float shift)
+{
+   coords[0] += shift;
+   coords[2] -= shift;
+   if (bounds) {
+      coords[2] = MIN2(coords[2], bounds[2]);
+      /* bound x/y + width/height */
+      if ((coords[0] + coords[2]) > (bounds[0] + bounds[2])) {
+         coords[2] = (bounds[0] + bounds[2]) - coords[0];
+      }
+   }
+}
+
+static INLINE void shift_recty(float coords[4],
+                               const float *bounds,
+                               const float shift)
+{
+   coords[1] += shift;
+   coords[3] -= shift;
+   if (bounds) {
+      coords[3] = MIN2(coords[3], bounds[3]);
+      if ((coords[1] + coords[3]) > (bounds[1] + bounds[3])) {
+         coords[3] = (bounds[1] + bounds[3]) - coords[1];
+      }
+   }
+}
+
+static INLINE void bound_rect(float coords[4],
+                              const float bounds[4],
+                              float shift[4])
+{
+   /* if outside the bounds */
+   if (coords[0] > (bounds[0] + bounds[2]) ||
+       coords[1] > (bounds[1] + bounds[3]) ||
+       (coords[0] + coords[2]) < bounds[0] ||
+       (coords[1] + coords[3]) < bounds[1]) {
+      coords[0] = 0.f;
+      coords[1] = 0.f;
+      coords[2] = 0.f;
+      coords[3] = 0.f;
+      shift[0] = 0.f;
+      shift[1] = 0.f;
+      return;
+   }
+
+   /* bound x */
+   if (coords[0] < bounds[0]) {
+      shift[0] = bounds[0] - coords[0];
+      coords[2] -= shift[0];
+      coords[0] = bounds[0];
+   } else
+      shift[0] = 0.f;
+
+   /* bound y */
+   if (coords[1] < bounds[1]) {
+      shift[1] = bounds[1] - coords[1];
+      coords[3] -= shift[1];
+      coords[1] = bounds[1];
+   } else
+      shift[1] = 0.f;
+
+   shift[2] = bounds[2] - coords[2];
+   shift[3] = bounds[3] - coords[3];
+   /* bound width/height */
+   coords[2] = MIN2(coords[2], bounds[2]);
+   coords[3] = MIN2(coords[3], bounds[3]);
+
+   /* bound x/y + width/height */
+   if ((coords[0] + coords[2]) > (bounds[0] + bounds[2])) {
+      coords[2] = (bounds[0] + bounds[2]) - coords[0];
+   }
+   if ((coords[1] + coords[3]) > (bounds[1] + bounds[3])) {
+      coords[3] = (bounds[1] + bounds[3]) - coords[1];
+   }
+
+   /* if outside the bounds */
+   if ((coords[0] + coords[2]) < bounds[0] ||
+       (coords[1] + coords[3]) < bounds[1]) {
+      coords[0] = 0.f;
+      coords[1] = 0.f;
+      coords[2] = 0.f;
+      coords[3] = 0.f;
+      return;
+   }
+}
+
+static INLINE void sync_size(float *src_loc, float *dst_loc)
+{
+   src_loc[2] = MIN2(src_loc[2], dst_loc[2]);
+   src_loc[3] = MIN2(src_loc[3], dst_loc[3]);
+   dst_loc[2] = src_loc[2];
+   dst_loc[3] = src_loc[3];
+}
+
+
+static void renderer_copy_texture(struct exa_context *exa,
+                                  struct pipe_texture *src,
+                                  float sx1, float sy1,
+                                  float sx2, float sy2,
+                                  struct pipe_texture *dst,
+                                  float dx1, float dy1,
+                                  float dx2, float dy2)
+{
+   struct pipe_context *pipe = exa->pipe;
+   struct pipe_screen *screen = pipe->screen;
+   struct pipe_buffer *buf;
+   struct pipe_surface *dst_surf = screen->get_tex_surface(
+      screen, dst, 0, 0, 0,
+      PIPE_BUFFER_USAGE_GPU_WRITE);
+   struct pipe_framebuffer_state fb;
+   float s0, t0, s1, t1;
+   struct xorg_shader shader;
+
+   assert(src->width[0] != 0);
+   assert(src->height[0] != 0);
+   assert(dst->width[0] != 0);
+   assert(dst->height[0] != 0);
+
+#if 1
+   s0 = sx1 / src->width[0];
+   s1 = sx2 / src->width[0];
+   t0 = sy1 / src->height[0];
+   t1 = sy2 / src->height[0];
+#else
+   s0 = 0;
+   s1 = 1;
+   t0 = 0;
+   t1 = 1;
+#endif
+
+#if 0
+   debug_printf("copy texture src=[%f, %f, %f, %f], dst=[%f, %f, %f, %f], tex=[%f, %f, %f, %f]\n",
+                sx1, sy1, sx2, sy2, dx1, dy1, dx2, dy2,
+                s0, t0, s1, t1);
+#endif
+
+   assert(screen->is_format_supported(screen, dst_surf->format,
+                                      PIPE_TEXTURE_2D,
+                                      PIPE_TEXTURE_USAGE_RENDER_TARGET,
+                                      0));
+
+   /* save state (restored below) */
+   cso_save_blend(exa->cso);
+   cso_save_samplers(exa->cso);
+   cso_save_sampler_textures(exa->cso);
+   cso_save_framebuffer(exa->cso);
+   cso_save_fragment_shader(exa->cso);
+   cso_save_vertex_shader(exa->cso);
+
+   cso_save_viewport(exa->cso);
+
+
+   /* set misc state we care about */
+   {
+      struct pipe_blend_state blend;
+      memset(&blend, 0, sizeof(blend));
+      blend.rgb_src_factor = PIPE_BLENDFACTOR_ONE;
+      blend.alpha_src_factor = PIPE_BLENDFACTOR_ONE;
+      blend.rgb_dst_factor = PIPE_BLENDFACTOR_ZERO;
+      blend.alpha_dst_factor = PIPE_BLENDFACTOR_ZERO;
+      blend.colormask = PIPE_MASK_RGBA;
+      cso_set_blend(exa->cso, &blend);
+   }
+
+   /* sampler */
+   {
+      struct pipe_sampler_state sampler;
+      memset(&sampler, 0, sizeof(sampler));
+      sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+      sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+      sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+      sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
+      sampler.min_img_filter = PIPE_TEX_FILTER_NEAREST;
+      sampler.mag_img_filter = PIPE_TEX_FILTER_NEAREST;
+      sampler.normalized_coords = 1;
+      cso_single_sampler(exa->cso, 0, &sampler);
+      cso_single_sampler_done(exa->cso);
+   }
+
+   set_viewport(exa, dst_surf->width, dst_surf->height, Y0_TOP);
+
+   /* texture */
+   cso_set_sampler_textures(exa->cso, 1, &src);
+
+   bind_rasterizer_state(exa);
+
+   /* shaders */
+   shader = xorg_shaders_get(exa->shaders,
+                             VS_COMPOSITE,
+                             FS_COMPOSITE);
+   cso_set_vertex_shader_handle(exa->cso, shader.vs);
+   cso_set_fragment_shader_handle(exa->cso, shader.fs);
+
+   /* drawing dest */
+   memset(&fb, 0, sizeof(fb));
+   fb.width = dst_surf->width;
+   fb.height = dst_surf->height;
+   fb.nr_cbufs = 1;
+   fb.cbufs[0] = dst_surf;
+   {
+      int i;
+      for (i = 1; i < PIPE_MAX_COLOR_BUFS; ++i)
+         fb.cbufs[i] = 0;
+   }
+   cso_set_framebuffer(exa->cso, &fb);
+   setup_vs_constant_buffer(exa, fb.width, fb.height);
+   setup_fs_constant_buffer(exa);
+
+   /* draw quad */
+   buf = setup_vertex_data_tex(exa,
+                               dx1, dy1,
+                               dx2, dy2,
+                               s0, t0, s1, t1,
+                               0.0f);
+
+   if (buf) {
+      util_draw_vertex_buffer(exa->pipe, buf, 0,
+                              PIPE_PRIM_TRIANGLE_FAN,
+                              4,  /* verts */
+                              2); /* attribs/vert */
+
+      pipe_buffer_reference(&buf, NULL);
+   }
+
+   /* restore state we changed */
+   cso_restore_blend(exa->cso);
+   cso_restore_samplers(exa->cso);
+   cso_restore_sampler_textures(exa->cso);
+   cso_restore_framebuffer(exa->cso);
+   cso_restore_vertex_shader(exa->cso);
+   cso_restore_fragment_shader(exa->cso);
+   cso_restore_viewport(exa->cso);
+
+   pipe_surface_reference(&dst_surf, NULL);
+}
+
+
+static struct pipe_texture *
+create_sampler_texture(struct exa_context *ctx,
+                       struct pipe_texture *src)
+{
+   enum pipe_format format;
+   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_screen *screen = pipe->screen;
+   struct pipe_texture *pt;
+   struct pipe_texture templ;
+
+   pipe->flush(pipe, PIPE_FLUSH_RENDER_CACHE, NULL);
+
+   /* the coming in texture should already have that invariance */
+   debug_assert(screen->is_format_supported(screen, src->format,
+                                            PIPE_TEXTURE_2D,
+                                            PIPE_TEXTURE_USAGE_SAMPLER, 0));
+
+   format = src->format;
+
+   memset(&templ, 0, sizeof(templ));
+   templ.target = PIPE_TEXTURE_2D;
+   templ.format = format;
+   templ.last_level = 0;
+   templ.width[0] = src->width[0];
+   templ.height[0] = src->height[0];
+   templ.depth[0] = 1;
+   pf_get_block(format, &templ.block);
+   templ.tex_usage = PIPE_TEXTURE_USAGE_SAMPLER;
+
+   pt = screen->texture_create(screen, &templ);
+
+   debug_assert(!pt || pipe_is_referenced(&pt->reference));
+
+   if (!pt)
+      return NULL;
+
+   {
+      /* copy source framebuffer surface into texture */
+      struct pipe_surface *ps_read = screen->get_tex_surface(
+         screen, src, 0, 0, 0, PIPE_BUFFER_USAGE_GPU_READ);
+      struct pipe_surface *ps_tex = screen->get_tex_surface(
+         screen, pt, 0, 0, 0, PIPE_BUFFER_USAGE_GPU_WRITE );
+      pipe->surface_copy(pipe,
+			 ps_tex, /* dest */
+			 0, 0, /* destx/y */
+			 ps_read,
+			 0, 0, src->width[0], src->height[0]);
+      pipe_surface_reference(&ps_read, NULL);
+      pipe_surface_reference(&ps_tex, NULL);
+   }
+
+   return pt;
+}
+
+void xorg_copy_pixmap(struct exa_context *ctx,
+                      struct exa_pixmap_priv *dst_priv, int dx, int dy,
+                      struct exa_pixmap_priv *src_priv, int sx, int sy,
+                      int width, int height)
+{
+   float dst_loc[4], src_loc[4];
+   float dst_bounds[4], src_bounds[4];
+   float src_shift[4], dst_shift[4], shift[4];
+   struct pipe_texture *dst = dst_priv->tex;
+   struct pipe_texture *src = src_priv->tex;
+
+   if (ctx->pipe->is_texture_referenced(ctx->pipe, src, 0, 0) &
+       PIPE_REFERENCED_FOR_WRITE)
+      ctx->pipe->flush(ctx->pipe, PIPE_FLUSH_RENDER_CACHE, NULL);
+
+   dst_loc[0] = dx;
+   dst_loc[1] = dy;
+   dst_loc[2] = width;
+   dst_loc[3] = height;
+   dst_bounds[0] = 0.f;
+   dst_bounds[1] = 0.f;
+   dst_bounds[2] = dst->width[0];
+   dst_bounds[3] = dst->height[0];
+
+   src_loc[0] = sx;
+   src_loc[1] = sy;
+   src_loc[2] = width;
+   src_loc[3] = height;
+   src_bounds[0] = 0.f;
+   src_bounds[1] = 0.f;
+   src_bounds[2] = src->width[0];
+   src_bounds[3] = src->height[0];
+
+   bound_rect(src_loc, src_bounds, src_shift);
+   bound_rect(dst_loc, dst_bounds, dst_shift);
+   shift[0] = src_shift[0] - dst_shift[0];
+   shift[1] = src_shift[1] - dst_shift[1];
+
+   if (shift[0] < 0)
+      shift_rectx(src_loc, src_bounds, -shift[0]);
+   else
+      shift_rectx(dst_loc, dst_bounds, shift[0]);
+
+   if (shift[1] < 0)
+      shift_recty(src_loc, src_bounds, -shift[1]);
+   else
+      shift_recty(dst_loc, dst_bounds, shift[1]);
+
+   sync_size(src_loc, dst_loc);
+
+   if (src_loc[2] >= 0 && src_loc[3] >= 0 &&
+       dst_loc[2] >= 0 && dst_loc[3] >= 0) {
+      struct pipe_texture *temp_src = src;
+
+      if (src == dst)
+         temp_src = create_sampler_texture(ctx, src);
+
+      renderer_copy_texture(ctx,
+                            temp_src,
+                            src_loc[0],
+                            src_loc[1],
+                            src_loc[0] + src_loc[2],
+                            src_loc[1] + src_loc[3],
+                            dst,
+                            dst_loc[0],
+                            dst_loc[1],
+                            dst_loc[0] + dst_loc[2],
+                            dst_loc[1] + dst_loc[3]);
+
+      if (src == dst)
+         pipe_texture_reference(&temp_src, NULL);
+   }
+}
+
diff --git a/src/gallium/state_trackers/xorg/xorg_composite.h b/src/gallium/state_trackers/xorg/xorg_composite.h
index 17dfcb199e..e73f1c704a 100644
--- a/src/gallium/state_trackers/xorg/xorg_composite.h
+++ b/src/gallium/state_trackers/xorg/xorg_composite.h
@@ -22,4 +22,16 @@ void xorg_composite(struct exa_context *exa,
                     int srcX, int srcY, int maskX, int maskY,
                     int dstX, int dstY, int width, int height);
 
+boolean xorg_solid_bind_state(struct exa_context *exa,
+                              struct exa_pixmap_priv *pixmap,
+                              Pixel fg);
+void xorg_solid(struct exa_context *exa,
+                struct exa_pixmap_priv *pixmap,
+                int x0, int y0, int x1, int y1);
+
+void xorg_copy_pixmap(struct exa_context *ctx,
+                      struct exa_pixmap_priv *dst, int dx, int dy,
+                      struct exa_pixmap_priv *src, int sx, int sy,
+                      int width, int height);
+
 #endif
diff --git a/src/gallium/state_trackers/xorg/xorg_crtc.c b/src/gallium/state_trackers/xorg/xorg_crtc.c
index fe08bde9ef..67fe29a69d 100644
--- a/src/gallium/state_trackers/xorg/xorg_crtc.c
+++ b/src/gallium/state_trackers/xorg/xorg_crtc.c
@@ -42,8 +42,12 @@
 #include "xorg_tracker.h"
 #include "xf86Modes.h"
 
+#ifdef HAVE_XEXTPROTO_71
+#include <X11/extensions/dpmsconst.h>
+#else
 #define DPMS_SERVER
 #include <X11/extensions/dpms.h>
+#endif
 
 #include "pipe/p_inlines.h"
 #include "util/u_rect.h"
diff --git a/src/gallium/state_trackers/xorg/xorg_dri2.c b/src/gallium/state_trackers/xorg/xorg_dri2.c
index 6431a0fe25..8a362596c7 100644
--- a/src/gallium/state_trackers/xorg/xorg_dri2.c
+++ b/src/gallium/state_trackers/xorg/xorg_dri2.c
@@ -118,6 +118,7 @@ driDoCreateBuffer(DrawablePtr pDraw, DRI2BufferPtr buffer, unsigned int format)
     }
 
     if (!tex) {
+	exaMoveInPixmap(private->pPixmap);
 	xorg_exa_set_shared_usage(private->pPixmap);
 	pScreen->ModifyPixmapHeader(private->pPixmap, 0, 0, 0, 0, 0, NULL);
 	tex = xorg_exa_get_texture(private->pPixmap);
diff --git a/src/gallium/state_trackers/xorg/xorg_driver.c b/src/gallium/state_trackers/xorg/xorg_driver.c
index 643b6b3b9e..3dff8d859e 100644
--- a/src/gallium/state_trackers/xorg/xorg_driver.c
+++ b/src/gallium/state_trackers/xorg/xorg_driver.c
@@ -558,6 +558,7 @@ ScreenInit(int scrnIndex, ScreenPtr pScreen, int argc, char **argv)
     xf86SetBlackWhitePixels(pScreen);
 
     ms->exa = xorg_exa_init(pScrn);
+    ms->debug_fallback = debug_get_option_bool("XORG_DEBUG_FALLBACK", TRUE);
 
     miInitializeBackingStore(pScreen);
     xf86SetBackingStore(pScreen);
diff --git a/src/gallium/state_trackers/xorg/xorg_exa.c b/src/gallium/state_trackers/xorg/xorg_exa.c
index a17a71f23a..b54e31a701 100644
--- a/src/gallium/state_trackers/xorg/xorg_exa.c
+++ b/src/gallium/state_trackers/xorg/xorg_exa.c
@@ -47,6 +47,10 @@
 
 #include "util/u_rect.h"
 
+#define DEBUG_PRINT 0
+#define DEBUG_SOLID 0
+#define DISABLE_ACCEL 0
+
 /*
  * Helper functions
  */
@@ -72,6 +76,9 @@ exa_get_pipe_format(int depth, enum pipe_format *format, int *bbp)
 	assert(*bbp == 16);
 	break;
     case 8:
+	*format = PIPE_FORMAT_I8_UNORM;
+	assert(*bbp == 8);
+	break;
     case 4:
     case 1:
 	*format = PIPE_FORMAT_A8R8G8B8_UNORM; /* bad bad bad */
@@ -82,6 +89,25 @@ exa_get_pipe_format(int depth, enum pipe_format *format, int *bbp)
     }
 }
 
+static void
+xorg_exa_init_state(struct exa_context *exa)
+{
+   struct pipe_depth_stencil_alpha_state dsa;
+
+   /* set common initial clip state */
+   memset(&dsa, 0, sizeof(struct pipe_depth_stencil_alpha_state));
+   cso_set_depth_stencil_alpha(exa->cso, &dsa);
+}
+
+static void
+xorg_exa_common_done(struct exa_context *exa)
+{
+   exa->copy.src = NULL;
+   exa->copy.dst = NULL;
+   exa->has_solid_color = FALSE;
+   exa->num_bound_samplers = 0;
+}
+
 /*
  * Static exported EXA functions
  */
@@ -111,9 +137,9 @@ ExaDownloadFromScreen(PixmapPtr pPix, int x,  int y, int w,  int h, char *dst,
     if (!priv || !priv->tex)
 	return FALSE;
 
-    if (exa->ctx->is_texture_referenced(exa->ctx, priv->tex, 0, 0) &
+    if (exa->pipe->is_texture_referenced(exa->pipe, priv->tex, 0, 0) &
 	PIPE_REFERENCED_FOR_WRITE)
-	exa->ctx->flush(exa->ctx, 0, NULL);
+	exa->pipe->flush(exa->pipe, 0, NULL);
 
     transfer = exa->scrn->get_tex_transfer(exa->scrn, priv->tex, 0, 0, 0,
 					   PIPE_TRANSFER_READ, x, y, w, h);
@@ -178,9 +204,9 @@ ExaPrepareAccess(PixmapPtr pPix, int index)
 
     if (priv->map_count++ == 0)
     {
-	if (exa->ctx->is_texture_referenced(exa->ctx, priv->tex, 0, 0) &
+	if (exa->pipe->is_texture_referenced(exa->pipe, priv->tex, 0, 0) &
 	    PIPE_REFERENCED_FOR_WRITE)
-	    exa->ctx->flush(exa->ctx, 0, NULL);
+	    exa->pipe->flush(exa->pipe, 0, NULL);
 
 	priv->map_transfer =
 	    exa->scrn->get_tex_transfer(exa->scrn, priv->tex, 0, 0, 0,
@@ -231,15 +257,22 @@ ExaDone(PixmapPtr pPixmap)
     if (!priv)
 	return;
 
-    if (priv->src_surf)
-	exa->scrn->tex_surface_destroy(priv->src_surf);
-    priv->src_surf = NULL;
+#if 1
+    xorg_exa_flush(exa, PIPE_FLUSH_RENDER_CACHE, NULL);
+#else
+    xorg_exa_finish(exa);
+#endif
+    xorg_exa_common_done(exa);
 }
 
 static void
 ExaDoneComposite(PixmapPtr pPixmap)
 {
+   ScrnInfoPtr pScrn = xf86Screens[pPixmap->drawable.pScreen->myNum];
+   modesettingPtr ms = modesettingPTR(pScrn);
+   struct exa_context *exa = ms->exa;
 
+   xorg_exa_common_done(exa);
 }
 
 static Bool
@@ -250,24 +283,36 @@ ExaPrepareSolid(PixmapPtr pPixmap, int alu, Pixel planeMask, Pixel fg)
     struct exa_pixmap_priv *priv = exaGetPixmapDriverPrivate(pPixmap);
     struct exa_context *exa = ms->exa;
 
-    if (pPixmap->drawable.depth < 15)
-	return FALSE;
-
-    if (!EXA_PM_IS_SOLID(&pPixmap->drawable, planeMask))
-	return FALSE;
+#if DEBUG_PRINT
+    debug_printf("ExaPrepareSolid(0x%x)\n", fg);
+#endif
+    if (!exa->pipe)
+	XORG_FALLBACK("solid accle not enabled");
 
     if (!priv || !priv->tex)
-	return FALSE;
+	XORG_FALLBACK("solid !priv || !priv->tex");
+
+    if (!EXA_PM_IS_SOLID(&pPixmap->drawable, planeMask))
+	XORG_FALLBACK("solid planeMask is not solid");
 
     if (alu != GXcopy)
-	return FALSE;
+	XORG_FALLBACK("solid not GXcopy");
 
-    if (!exa->ctx || !exa->ctx->surface_fill)
-	return FALSE;
+    if (!exa->scrn->is_format_supported(exa->scrn, priv->tex->format,
+                                        priv->tex->target,
+                                        PIPE_TEXTURE_USAGE_RENDER_TARGET, 0)) {
+	XORG_FALLBACK("solid bad format %s", pf_name(priv->tex->format));
+    }
 
-    priv->color = fg;
+#if DEBUG_SOLID
+    fg = 0xffff0000;
+#endif
 
-    return TRUE;
+#if DISABLE_ACCEL
+    return FALSE;
+#else
+    return xorg_solid_bind_state(exa, priv, fg);
+#endif
 }
 
 static void
@@ -277,12 +322,51 @@ ExaSolid(PixmapPtr pPixmap, int x0, int y0, int x1, int y1)
     modesettingPtr ms = modesettingPTR(pScrn);
     struct exa_context *exa = ms->exa;
     struct exa_pixmap_priv *priv = exaGetPixmapDriverPrivate(pPixmap);
-    struct pipe_surface *surf = exa_gpu_surface(exa, priv);
 
-    exa->ctx->surface_fill(exa->ctx, surf, x0, y0, x1 - x0, y1 - y0,
-			   priv->color);
+#if DEBUG_PRINT
+    debug_printf("\tExaSolid(%d, %d, %d, %d)\n", x0, y0, x1, y1);
+#endif
+
+#if 0
+    if (x0 == 0 && y0 == 0 &&
+        x1 == priv->tex->width[0] &&
+        y1 == priv->tex->height[0]) {
+       exa->ctx->clear(exa->pipe, PIPE_CLEAR_COLOR,
+                       exa->solid_color, 1., 0);
+    } else
+#endif
 
-    exa->scrn->tex_surface_destroy(surf);
+#if DEBUG_SOLID
+       exa->solid_color[0] = 0.f;
+       exa->solid_color[1] = 1.f;
+       exa->solid_color[2] = 0.f;
+       exa->solid_color[3] = 1.f;
+    xorg_solid(exa, priv, 0, 0, 1024, 768);
+       exa->solid_color[0] = 1.f;
+       exa->solid_color[1] = 0.f;
+       exa->solid_color[2] = 0.f;
+       exa->solid_color[3] = 1.f;
+       xorg_solid(exa, priv, 0, 0, 300, 300);
+       xorg_solid(exa, priv, 300, 300, 350, 350);
+       xorg_solid(exa, priv, 350, 350, 500, 500);
+
+       xorg_solid(exa, priv,
+               priv->tex->width[0] - 10,
+               priv->tex->height[0] - 10,
+               priv->tex->width[0],
+               priv->tex->height[0]);
+
+    exa->solid_color[0] = 0.f;
+    exa->solid_color[1] = 0.f;
+    exa->solid_color[2] = 1.f;
+    exa->solid_color[3] = 1.f;
+
+    exa->has_solid_color = FALSE;
+    ExaPrepareCopy(pPixmap, pPixmap, 0, 0, GXcopy, 0xffffffff);
+    ExaCopy(pPixmap, 0, 0, 50, 50, 500, 500);
+#else
+    xorg_solid(exa, priv, x0, y0, x1, y1) ;
+#endif
 }
 
 static Bool
@@ -295,42 +379,63 @@ ExaPrepareCopy(PixmapPtr pSrcPixmap, PixmapPtr pDstPixmap, int xdir,
     struct exa_pixmap_priv *priv = exaGetPixmapDriverPrivate(pDstPixmap);
     struct exa_pixmap_priv *src_priv = exaGetPixmapDriverPrivate(pSrcPixmap);
 
-    if (alu != GXcopy)
-	return FALSE;
+#if DEBUG_PRINT
+    debug_printf("ExaPrepareCopy\n");
+#endif
+    if (!exa->pipe)
+	XORG_FALLBACK("copy accle not enabled");
 
-    if (pSrcPixmap->drawable.depth < 15 || pDstPixmap->drawable.depth < 15)
-	return FALSE;
+    if (!priv || !src_priv)
+	XORG_FALLBACK("copy !priv || !src_priv");
+
+    if (!priv->tex || !src_priv->tex)
+	XORG_FALLBACK("copy !priv->tex || !src_priv->tex");
 
     if (!EXA_PM_IS_SOLID(&pSrcPixmap->drawable, planeMask))
-	return FALSE;
+	XORG_FALLBACK("copy planeMask is not solid");
 
-    if (!priv || !src_priv)
-	return FALSE;
+    if (alu != GXcopy)
+	XORG_FALLBACK("copy alu not GXcopy");
 
-    if (!priv->tex || !src_priv->tex)
-	return FALSE;
+    if (!exa->scrn->is_format_supported(exa->scrn, priv->tex->format,
+                                        priv->tex->target,
+                                        PIPE_TEXTURE_USAGE_RENDER_TARGET, 0))
+	XORG_FALLBACK("copy pDst format %s", pf_name(priv->tex->format));
 
-    if (!exa->ctx || !exa->ctx->surface_copy)
-	return FALSE;
+    if (!exa->scrn->is_format_supported(exa->scrn, src_priv->tex->format,
+                                        src_priv->tex->target,
+                                        PIPE_TEXTURE_USAGE_SAMPLER, 0))
+	XORG_FALLBACK("copy pSrc format %s", pf_name(src_priv->tex->format));
 
-    priv->src_surf = exa_gpu_surface(exa, src_priv);
+    exa->copy.src = src_priv;
+    exa->copy.dst = priv;
 
+#if DISABLE_ACCEL
+    return FALSE;
+#else
     return TRUE;
+#endif
 }
 
 static void
 ExaCopy(PixmapPtr pDstPixmap, int srcX, int srcY, int dstX, int dstY,
 	int width, int height)
 {
-    ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
-    modesettingPtr ms = modesettingPTR(pScrn);
-    struct exa_context *exa = ms->exa;
-    struct exa_pixmap_priv *priv = exaGetPixmapDriverPrivate(pDstPixmap);
-    struct pipe_surface *surf = exa_gpu_surface(exa, priv);
+   ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
+   modesettingPtr ms = modesettingPTR(pScrn);
+   struct exa_context *exa = ms->exa;
+   struct exa_pixmap_priv *priv = exaGetPixmapDriverPrivate(pDstPixmap);
+
+#if DEBUG_PRINT
+   debug_printf("\tExaCopy(srcx=%d, srcy=%d, dstX=%d, dstY=%d, w=%d, h=%d)\n",
+                srcX, srcY, dstX, dstY, width, height);
+#endif
+
+   debug_assert(priv == exa->copy.dst);
 
-    exa->ctx->surface_copy(exa->ctx, surf, dstX, dstY, priv->src_surf,
-			   srcX, srcY, width, height);
-    exa->scrn->tex_surface_destroy(surf);
+   xorg_copy_pixmap(exa, exa->copy.dst, dstX, dstY,
+                    exa->copy.src, srcX, srcY,
+                    width, height);
 }
 
 static Bool
@@ -341,12 +446,55 @@ ExaPrepareComposite(int op, PicturePtr pSrcPicture,
    ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
    modesettingPtr ms = modesettingPTR(pScrn);
    struct exa_context *exa = ms->exa;
+   struct exa_pixmap_priv *priv;
+
+#if DEBUG_PRINT
+   debug_printf("ExaPrepareComposite\n");
+#endif
+   if (!exa->pipe)
+      XORG_FALLBACK("comp accle not enabled");
+
+   priv = exaGetPixmapDriverPrivate(pDst);
+   if (!priv || !priv->tex)
+      XORG_FALLBACK("comp pDst %s", !priv ? "!priv" : "!priv->tex");
+
+   if (!exa->scrn->is_format_supported(exa->scrn, priv->tex->format,
+                                       priv->tex->target,
+                                       PIPE_TEXTURE_USAGE_RENDER_TARGET, 0))
+      XORG_FALLBACK("copy pDst format: %s", pf_name(priv->tex->format));
+
+   if (pSrc) {
+      priv = exaGetPixmapDriverPrivate(pSrc);
+      if (!priv || !priv->tex)
+         XORG_FALLBACK("comp pSrc %s", !priv ? "!priv" : "!priv->tex");
+
+      if (!exa->scrn->is_format_supported(exa->scrn, priv->tex->format,
+                                          priv->tex->target,
+                                          PIPE_TEXTURE_USAGE_SAMPLER, 0))
+         XORG_FALLBACK("copy pSrc format: %s", pf_name(priv->tex->format));
+   }
+
+   if (pMask) {
+      priv = exaGetPixmapDriverPrivate(pMask);
+      if (!priv || !priv->tex)
+         XORG_FALLBACK("comp pMask %s", !priv ? "!priv" : "!priv->tex");
+
+      if (!exa->scrn->is_format_supported(exa->scrn, priv->tex->format,
+                                          priv->tex->target,
+                                          PIPE_TEXTURE_USAGE_SAMPLER, 0))
+         XORG_FALLBACK("copy pMask format: %s", pf_name(priv->tex->format));
+   }
 
+#if DISABLE_ACCEL
+   (void) exa;
+   return FALSE;
+#else
    return xorg_composite_bind_state(exa, op, pSrcPicture, pMaskPicture,
                                     pDstPicture,
-                                    exaGetPixmapDriverPrivate(pSrc),
-                                    exaGetPixmapDriverPrivate(pMask),
+                                    pSrc ? exaGetPixmapDriverPrivate(pSrc) : NULL,
+                                    pMask ? exaGetPixmapDriverPrivate(pMask) : NULL,
                                     exaGetPixmapDriverPrivate(pDst));
+#endif
 }
 
 static void
@@ -358,6 +506,10 @@ ExaComposite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
    struct exa_context *exa = ms->exa;
    struct exa_pixmap_priv *priv = exaGetPixmapDriverPrivate(pDst);
 
+#if DEBUG_PRINT
+   debug_printf("\tExaComposite\n");
+#endif
+
    xorg_composite(exa, priv, srcX, srcY, maskX, maskY,
                   dstX, dstY, width, height);
 }
@@ -367,10 +519,15 @@ ExaCheckComposite(int op,
 		  PicturePtr pSrcPicture, PicturePtr pMaskPicture,
 		  PicturePtr pDstPicture)
 {
-   return xorg_composite_accelerated(op,
-                                     pSrcPicture,
-                                     pMaskPicture,
-                                     pDstPicture);
+   boolean accelerated = xorg_composite_accelerated(op,
+                                                    pSrcPicture,
+                                                    pMaskPicture,
+                                                    pDstPicture);
+#if DEBUG_PRINT
+   debug_printf("ExaCheckComposite(%d, %p, %p, %p) = %d\n",
+                op, pSrcPicture, pMaskPicture, pDstPicture, accelerated);
+#endif
+   return accelerated;
 }
 
 static void *
@@ -389,14 +546,11 @@ static void
 ExaDestroyPixmap(ScreenPtr pScreen, void *dPriv)
 {
     struct exa_pixmap_priv *priv = (struct exa_pixmap_priv *)dPriv;
-    ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
-    modesettingPtr ms = modesettingPTR(pScrn);
 
     if (!priv)
 	return;
 
-    if (priv->tex)
-	ms->screen->texture_destroy(priv->tex);
+    pipe_texture_reference(&priv->tex, NULL);
 
     xfree(priv);
 }
@@ -537,16 +691,16 @@ ExaModifyPixmapHeader(PixmapPtr pPixmap, int width, int height,
 
 	    if (priv->tex) {
 		struct pipe_surface *dst_surf;
-
-		dst_surf = exa->scrn->get_tex_surface(exa->scrn, texture, 0, 0, 0,
-						      PIPE_BUFFER_USAGE_GPU_WRITE);
-		priv->src_surf = exa_gpu_surface(exa, priv);
-		exa->ctx->surface_copy(exa->ctx, dst_surf, 0, 0, priv->src_surf,
-				       0, 0, min(width, texture->width[0]),
-				       min(height, texture->height[0]));
+                struct pipe_surface *src_surf;
+
+		dst_surf = exa->scrn->get_tex_surface(
+                   exa->scrn, texture, 0, 0, 0, PIPE_BUFFER_USAGE_GPU_WRITE);
+		src_surf = exa_gpu_surface(exa, priv);
+		exa->pipe->surface_copy(exa->pipe, dst_surf, 0, 0, src_surf,
+                                        0, 0, min(width, texture->width[0]),
+                                        min(height, texture->height[0]));
 		exa->scrn->tex_surface_destroy(dst_surf);
-		exa->scrn->tex_surface_destroy(priv->src_surf);
-		priv->src_surf = NULL;
+		exa->scrn->tex_surface_destroy(src_surf);
 	    } else if (pPixmap->devPrivate.ptr) {
 		struct pipe_transfer *transfer;
 
@@ -563,6 +717,9 @@ ExaModifyPixmapHeader(PixmapPtr pPixmap, int width, int height,
 			       pPixmap->devKind, 0, 0);
 		exa->scrn->transfer_unmap(exa->scrn, transfer);
 		exa->scrn->tex_transfer_destroy(transfer);
+
+		xfree(pPixmap->devPrivate.ptr);
+		pPixmap->devPrivate.ptr = NULL;
 	    }
 	}
 #ifdef DRM_MODE_FEATURE_DIRTYFB
@@ -574,6 +731,8 @@ ExaModifyPixmapHeader(PixmapPtr pPixmap, int width, int height,
 #endif
 
 	pipe_texture_reference(&priv->tex, texture);
+	/* the texture we create has one reference */
+	pipe_texture_reference(&texture, NULL);
     }
 
     return TRUE;
@@ -611,8 +770,8 @@ xorg_exa_close(ScrnInfoPtr pScrn)
       cso_destroy_context(exa->cso);
    }
 
-   if (exa->ctx)
-      exa->ctx->destroy(exa->ctx);
+   if (exa->pipe)
+      exa->pipe->destroy(exa->pipe);
 
    exaDriverFini(pScrn->pScreen);
    xfree(exa);
@@ -645,6 +804,12 @@ xorg_exa_init(ScrnInfoPtr pScrn)
    pExa->pixmapOffsetAlign = 0;
    pExa->pixmapPitchAlign  = 1;
    pExa->flags             = EXA_OFFSCREEN_PIXMAPS | EXA_HANDLES_PIXMAPS;
+#ifdef EXA_SUPPORTS_PREPARE_AUX
+   pExa->flags            |= EXA_SUPPORTS_PREPARE_AUX;
+#endif
+#ifdef EXA_MIXED_PIXMAPS
+   pExa->flags            |= EXA_MIXED_PIXMAPS;
+#endif
    pExa->maxX              = 8191; /* FIXME */
    pExa->maxY              = 8191; /* FIXME */
 
@@ -674,13 +839,15 @@ xorg_exa_init(ScrnInfoPtr pScrn)
    }
 
    exa->scrn = ms->screen;
-   exa->ctx = ms->api->create_context(ms->api, exa->scrn);
+   exa->pipe = ms->api->create_context(ms->api, exa->scrn);
    /* Share context with DRI */
-   ms->ctx = exa->ctx;
+   ms->ctx = exa->pipe;
 
-   exa->cso = cso_create_context(exa->ctx);
+   exa->cso = cso_create_context(exa->pipe);
    exa->shaders = xorg_shaders_create(exa);
 
+   xorg_exa_init_state(exa);
+
    return (void *)exa;
 
 out_err:
@@ -698,3 +865,19 @@ exa_gpu_surface(struct exa_context *exa, struct exa_pixmap_priv *priv)
 
 }
 
+void xorg_exa_flush(struct exa_context *exa, uint pipeFlushFlags,
+                    struct pipe_fence_handle **fence)
+{
+   exa->pipe->flush(exa->pipe, pipeFlushFlags, fence);
+}
+
+void xorg_exa_finish(struct exa_context *exa)
+{
+   struct pipe_fence_handle *fence = NULL;
+
+   xorg_exa_flush(exa, PIPE_FLUSH_RENDER_CACHE, &fence);
+
+   exa->pipe->screen->fence_finish(exa->pipe->screen, fence, 0);
+   exa->pipe->screen->fence_reference(exa->pipe->screen, &fence, NULL);
+}
+
diff --git a/src/gallium/state_trackers/xorg/xorg_exa.h b/src/gallium/state_trackers/xorg/xorg_exa.h
index 5b515be139..d3f25ca844 100644
--- a/src/gallium/state_trackers/xorg/xorg_exa.h
+++ b/src/gallium/state_trackers/xorg/xorg_exa.h
@@ -14,7 +14,7 @@ struct xorg_shaders;
 struct exa_context
 {
    ExaDriverPtr pExa;
-   struct pipe_context *ctx;
+   struct pipe_context *pipe;
    struct pipe_screen *scrn;
    struct cso_context *cso;
    struct xorg_shaders *shaders;
@@ -26,8 +26,17 @@ struct exa_context
    int num_bound_samplers;
 
    float solid_color[4];
-};
+   boolean has_solid_color;
+
+   struct {
+      struct exa_pixmap_priv *src;
+      struct exa_pixmap_priv *dst;
+   } copy;
 
+   /* we should combine these two */
+   float vertices2[4][2][4];
+   float vertices3[4][3][4];
+};
 
 struct exa_pixmap_priv
 {
@@ -36,15 +45,25 @@ struct exa_pixmap_priv
 
    struct pipe_texture *tex;
    struct pipe_texture *depth_stencil_tex;
-   unsigned int color;
-   struct pipe_surface *src_surf; /* for copies */
 
    struct pipe_transfer *map_transfer;
    unsigned map_count;
 };
 
+#define XORG_FALLBACK(s, arg...)              \
+do {                                          \
+   if (ms->debug_fallback) {                  \
+      xf86DrvMsg(pScrn->scrnIndex, X_INFO,    \
+                 "fallback: " s "\n", ##arg); \
+   }                                          \
+   return FALSE;                              \
+} while(0)
+
 struct pipe_surface *
 exa_gpu_surface(struct exa_context *exa, struct exa_pixmap_priv *priv);
 
+void xorg_exa_flush(struct exa_context *exa, uint pipeFlushFlags,
+                    struct pipe_fence_handle **fence);
+void xorg_exa_finish(struct exa_context *exa);
 
 #endif
diff --git a/src/gallium/state_trackers/xorg/xorg_exa_tgsi.c b/src/gallium/state_trackers/xorg/xorg_exa_tgsi.c
index cfee10c3b3..bb5a42af37 100644
--- a/src/gallium/state_trackers/xorg/xorg_exa_tgsi.c
+++ b/src/gallium/state_trackers/xorg/xorg_exa_tgsi.c
@@ -52,8 +52,7 @@ struct xorg_shaders {
 
 static const char over_op[] =
    "SUB TEMP[3], CONST[0].wwww, TEMP[1].wwww\n"
-   "MUL TEMP[3], TEMP[0], TEMP[3]\n"
-   "ADD TEMP[0], TEMP[3], TEMP[0]\n";
+   "MAD TEMP[3], TEMP[0], TEMP[3], TEMP[0]\n";
 
 
 static INLINE void
@@ -79,8 +78,7 @@ vs_normalize_coords(struct ureg_program *ureg, struct ureg_src coords,
 {
    struct ureg_dst tmp = ureg_DECL_temporary(ureg);
    struct ureg_src ret;
-   ureg_MUL(ureg, tmp, coords, const0);
-   ureg_ADD(ureg, tmp, ureg_src(tmp), const1);
+   ureg_MAD(ureg, tmp, coords, const0, const1);
    ret = ureg_src(tmp);
    ureg_release_temporary(ureg, tmp);
    return ret;
@@ -241,42 +239,39 @@ create_vs(struct pipe_context *pipe,
    boolean is_fill = vs_traits & VS_FILL;
    boolean is_composite = vs_traits & VS_COMPOSITE;
    boolean has_mask = vs_traits & VS_MASK;
+   unsigned input_slot = 0;
 
    ureg = ureg_create(TGSI_PROCESSOR_VERTEX);
    if (ureg == NULL)
       return 0;
 
-   const0 = ureg_DECL_constant(ureg);
-   const1 = ureg_DECL_constant(ureg);
+   const0 = ureg_DECL_constant(ureg, 0);
+   const1 = ureg_DECL_constant(ureg, 1);
 
    /* it has to be either a fill or a composite op */
    debug_assert(is_fill ^ is_composite);
 
-   src = ureg_DECL_vs_input(ureg,
-                            TGSI_SEMANTIC_POSITION, 0);
+   src = ureg_DECL_vs_input(ureg, input_slot++);
    dst = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0);
    src = vs_normalize_coords(ureg, src,
                              const0, const1);
    ureg_MOV(ureg, dst, src);
 
-
    if (is_composite) {
-      src = ureg_DECL_vs_input(ureg,
-                               TGSI_SEMANTIC_GENERIC, 1);
-      dst = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 1);
+      src = ureg_DECL_vs_input(ureg, input_slot++);
+      dst = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 0);
       ureg_MOV(ureg, dst, src);
    }
+
    if (is_fill) {
-      src = ureg_DECL_vs_input(ureg,
-                               TGSI_SEMANTIC_COLOR, 1);
-      dst = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 1);
+      src = ureg_DECL_vs_input(ureg, input_slot++);
+      dst = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0);
       ureg_MOV(ureg, dst, src);
    }
 
    if (has_mask) {
-      src = ureg_DECL_vs_input(ureg,
-                               TGSI_SEMANTIC_GENERIC, 2);
-      dst = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 2);
+      src = ureg_DECL_vs_input(ureg, input_slot++);
+      dst = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 2);
       ureg_MOV(ureg, dst, src);
    }
 
@@ -315,11 +310,11 @@ create_fs(struct pipe_context *pipe,
    if (is_composite) {
       src_sampler = ureg_DECL_sampler(ureg, 0);
       src_input = ureg_DECL_fs_input(ureg,
-                                     TGSI_SEMANTIC_POSITION,
+                                     TGSI_SEMANTIC_GENERIC,
                                      0,
                                      TGSI_INTERPOLATE_PERSPECTIVE);
-   }
-   if (is_fill) {
+   } else {
+      debug_assert(is_fill);
       if (is_solid)
          src_input = ureg_DECL_fs_input(ureg,
                                         TGSI_SEMANTIC_COLOR,
@@ -335,7 +330,7 @@ create_fs(struct pipe_context *pipe,
    if (has_mask) {
       mask_sampler = ureg_DECL_sampler(ureg, 1);
       mask_pos = ureg_DECL_fs_input(ureg,
-                                    TGSI_SEMANTIC_POSITION,
+                                    TGSI_SEMANTIC_GENERIC,
                                     1,
                                     TGSI_INTERPOLATE_PERSPECTIVE);
    }
@@ -370,11 +365,11 @@ create_fs(struct pipe_context *pipe,
          else
             src = out;
 
-         coords = ureg_DECL_constant(ureg);
-         const0124 = ureg_DECL_constant(ureg);
-         matrow0 = ureg_DECL_constant(ureg);
-         matrow1 = ureg_DECL_constant(ureg);
-         matrow2 = ureg_DECL_constant(ureg);
+         coords = ureg_DECL_constant(ureg, 0);
+         const0124 = ureg_DECL_constant(ureg, 1);
+         matrow0 = ureg_DECL_constant(ureg, 2);
+         matrow1 = ureg_DECL_constant(ureg, 3);
+         matrow2 = ureg_DECL_constant(ureg, 4);
 
          if (is_lingrad) {
             linear_gradient(ureg, src,
@@ -470,12 +465,12 @@ struct xorg_shader xorg_shaders_get(struct xorg_shaders *sc,
                                     unsigned vs_traits,
                                     unsigned fs_traits)
 {
-   struct xorg_shader shader = {0};
+   struct xorg_shader shader = { NULL, NULL };
    void *vs, *fs;
 
-   vs = shader_from_cache(sc->exa->ctx, PIPE_SHADER_VERTEX,
+   vs = shader_from_cache(sc->exa->pipe, PIPE_SHADER_VERTEX,
                           sc->vs_hash, vs_traits);
-   fs = shader_from_cache(sc->exa->ctx, PIPE_SHADER_FRAGMENT,
+   fs = shader_from_cache(sc->exa->pipe, PIPE_SHADER_FRAGMENT,
                           sc->fs_hash, fs_traits);
 
    debug_assert(vs && fs);
diff --git a/src/gallium/state_trackers/xorg/xorg_output.c b/src/gallium/state_trackers/xorg/xorg_output.c
index 950af942f5..26f45f8d64 100644
--- a/src/gallium/state_trackers/xorg/xorg_output.c
+++ b/src/gallium/state_trackers/xorg/xorg_output.c
@@ -42,8 +42,12 @@
 #include <sys/stat.h>
 #include <sys/types.h>
 
+#ifdef HAVE_XEXTPROTO_71
+#include <X11/extensions/dpmsconst.h>
+#else
 #define DPMS_SERVER
 #include <X11/extensions/dpms.h>
+#endif
 
 #include "X11/Xatom.h"
 
diff --git a/src/gallium/state_trackers/xorg/xorg_tracker.h b/src/gallium/state_trackers/xorg/xorg_tracker.h
index b1ab783a15..db2f16f63e 100644
--- a/src/gallium/state_trackers/xorg/xorg_tracker.h
+++ b/src/gallium/state_trackers/xorg/xorg_tracker.h
@@ -94,6 +94,7 @@ typedef struct _modesettingRec
     /* exa */
     void *exa;
     Bool noEvict;
+    Bool debug_fallback;
 
 #ifdef DRM_MODE_FEATURE_DIRTYFB
     DamagePtr damage;
@@ -151,5 +152,11 @@ crtc_cursor_destroy(xf86CrtcPtr crtc);
 void
 output_init(ScrnInfoPtr pScrn);
 
+/***********************************************************************
+ * xorg_xv.c
+ */
+void
+xorg_init_video(ScreenPtr pScreen);
+
 
 #endif /* _XORG_TRACKER_H_ */
diff --git a/src/gallium/state_trackers/xorg/xorg_xv.c b/src/gallium/state_trackers/xorg/xorg_xv.c
new file mode 100644
index 0000000000..88955d47fd
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xorg_xv.c
@@ -0,0 +1,212 @@
+#include "xorg_tracker.h"
+
+#include <xf86xv.h>
+#include <X11/extensions/Xv.h>
+#include <fourcc.h>
+
+/*XXX get these from pipe's texture limits */
+#define IMAGE_MAX_WIDTH		2048
+#define IMAGE_MAX_HEIGHT	2048
+
+#define MAKE_ATOM(a) MakeAtom(a, sizeof(a) - 1, TRUE)
+
+static Atom xvBrightness, xvContrast;
+
+#define NUM_TEXTURED_ATTRIBUTES 2
+static XF86AttributeRec TexturedAttributes[NUM_TEXTURED_ATTRIBUTES] = {
+   {XvSettable | XvGettable, -128, 127, "XV_BRIGHTNESS"},
+   {XvSettable | XvGettable, 0, 255, "XV_CONTRAST"}
+};
+
+#define NUM_FORMATS 3
+static XF86VideoFormatRec Formats[NUM_FORMATS] = {
+   {15, TrueColor}, {16, TrueColor}, {24, TrueColor}
+};
+
+static XF86VideoEncodingRec DummyEncoding[1] = {
+   {
+      0,
+      "XV_IMAGE",
+      IMAGE_MAX_WIDTH, IMAGE_MAX_HEIGHT,
+      {1, 1}
+   }
+};
+
+#define NUM_IMAGES 2
+static XF86ImageRec Images[NUM_IMAGES] = {
+   XVIMAGE_UYVY,
+   XVIMAGE_YUY2,
+};
+
+struct xorg_xv_port_priv {
+   RegionRec clip;
+};
+
+
+static void
+stop_video(ScrnInfoPtr pScrn, pointer data, Bool shutdown)
+{
+}
+
+static int
+set_port_attribute(ScrnInfoPtr pScrn,
+                   Atom attribute, INT32 value, pointer data)
+{
+   return 0;
+}
+
+static int
+get_port_attribute(ScrnInfoPtr pScrn,
+                   Atom attribute, INT32 * value, pointer data)
+{
+   return 0;
+}
+
+static void
+query_best_size(ScrnInfoPtr pScrn,
+                Bool motion,
+                short vid_w, short vid_h,
+                short drw_w, short drw_h,
+                unsigned int *p_w, unsigned int *p_h, pointer data)
+{
+}
+
+static int
+put_image(ScrnInfoPtr pScrn,
+          short src_x, short src_y,
+          short drw_x, short drw_y,
+          short src_w, short src_h,
+          short drw_w, short drw_h,
+          int id, unsigned char *buf,
+          short width, short height,
+          Bool sync, RegionPtr clipBoxes, pointer data,
+          DrawablePtr pDraw)
+{
+   return 0;
+}
+
+static int
+query_image_attributes(ScrnInfoPtr pScrn,
+                       int id,
+                       unsigned short *w, unsigned short *h,
+                       int *pitches, int *offsets)
+{
+   return 0;
+}
+
+static struct xorg_xv_port_priv *
+port_priv_create(ScreenPtr pScreen)
+{
+   /*ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];*/
+   /*modesettingPtr ms = modesettingPTR(pScrn);*/
+   struct xorg_xv_port_priv *priv = NULL;
+
+   priv = calloc(1, sizeof(struct xorg_xv_port_priv));
+
+   if (!priv)
+      return NULL;
+
+    REGION_NULL(pScreen, &priv->clip);
+
+   return priv;
+}
+
+static XF86VideoAdaptorPtr
+xorg_setup_textured_adapter(ScreenPtr pScreen)
+{
+   /*ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];*/
+   /*modesettingPtr ms = modesettingPTR(pScrn);*/
+   XF86VideoAdaptorPtr adapt;
+   XF86AttributePtr attrs;
+   DevUnion *dev_unions;
+   int nports = 16, i;
+   int nattributes;
+
+   nattributes = NUM_TEXTURED_ATTRIBUTES;
+
+   adapt = calloc(1, sizeof(XF86VideoAdaptorRec));
+   dev_unions = calloc(nports, sizeof(DevUnion));
+   attrs = calloc(nattributes, sizeof(XF86AttributeRec));
+   if (adapt == NULL || dev_unions == NULL || attrs == NULL) {
+      free(adapt);
+      free(dev_unions);
+      free(attrs);
+      return NULL;
+   }
+
+   adapt->type = XvWindowMask | XvInputMask | XvImageMask;
+   adapt->flags = 0;
+   adapt->name = "Gallium3D Textured Video";
+   adapt->nEncodings = 1;
+   adapt->pEncodings = DummyEncoding;
+   adapt->nFormats = NUM_FORMATS;
+   adapt->pFormats = Formats;
+   adapt->nPorts = 0;
+   adapt->pPortPrivates = dev_unions;
+   adapt->nAttributes = nattributes;
+   adapt->pAttributes = attrs;
+   memcpy(attrs, TexturedAttributes, nattributes * sizeof(XF86AttributeRec));
+   adapt->nImages = NUM_IMAGES;
+   adapt->pImages = Images;
+   adapt->PutVideo = NULL;
+   adapt->PutStill = NULL;
+   adapt->GetVideo = NULL;
+   adapt->GetStill = NULL;
+   adapt->StopVideo = stop_video;
+   adapt->SetPortAttribute = set_port_attribute;
+   adapt->GetPortAttribute = get_port_attribute;
+   adapt->QueryBestSize = query_best_size;
+   adapt->PutImage = put_image;
+   adapt->QueryImageAttributes = query_image_attributes;
+
+   for (i = 0; i < nports; i++) {
+      struct xorg_xv_port_priv *priv =
+         port_priv_create(pScreen);
+
+      adapt->pPortPrivates[i].ptr = (pointer) (priv);
+      adapt->nPorts++;
+   }
+
+   return adapt;
+}
+
+void
+xorg_init_video(ScreenPtr pScreen)
+{
+   ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
+   /*modesettingPtr ms = modesettingPTR(pScrn);*/
+   XF86VideoAdaptorPtr *adaptors, *new_adaptors = NULL;
+   XF86VideoAdaptorPtr textured_adapter;
+   int num_adaptors;
+
+   num_adaptors = xf86XVListGenericAdaptors(pScrn, &adaptors);
+   new_adaptors = malloc((num_adaptors + 1) * sizeof(XF86VideoAdaptorPtr *));
+   if (new_adaptors == NULL)
+      return;
+
+   memcpy(new_adaptors, adaptors, num_adaptors * sizeof(XF86VideoAdaptorPtr));
+   adaptors = new_adaptors;
+
+   /* Add the adaptors supported by our hardware.  First, set up the atoms
+    * that will be used by both output adaptors.
+    */
+   xvBrightness = MAKE_ATOM("XV_BRIGHTNESS");
+   xvContrast = MAKE_ATOM("XV_CONTRAST");
+
+   textured_adapter = xorg_setup_textured_adapter(pScreen);
+
+   debug_assert(textured_adapter);
+
+   if (textured_adapter) {
+      adaptors[num_adaptors++] = textured_adapter;
+   }
+
+   if (num_adaptors) {
+      xf86XVScreenInit(pScreen, adaptors, num_adaptors);
+   } else {
+      xf86DrvMsg(pScrn->scrnIndex, X_WARNING,
+                 "Disabling Xv because no adaptors could be initialized.\n");
+   }
+
+   free(adaptors);
+}
diff --git a/src/gallium/state_trackers/xorg/xvmc/Makefile b/src/gallium/state_trackers/xorg/xvmc/Makefile
new file mode 100644
index 0000000000..126dc6d58f
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/Makefile
@@ -0,0 +1,16 @@
+TOP = ../../../../..
+include $(TOP)/configs/current
+
+LIBNAME = xvmctracker
+
+LIBRARY_INCLUDES = \
+	$(shell pkg-config --cflags-only-I xvmc) \
+	-I$(TOP)/src/gallium/winsys/g3dvl
+
+C_SOURCES = block.c \
+            surface.c \
+            context.c \
+            subpicture.c \
+            attributes.c
+
+include ../../../Makefile.template
diff --git a/src/gallium/state_trackers/xorg/xvmc/SConscript b/src/gallium/state_trackers/xorg/xvmc/SConscript
new file mode 100644
index 0000000000..cb25d68bd8
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/SConscript
@@ -0,0 +1,27 @@
+#######################################################################
+# SConscript for xvmc state_tracker
+
+Import('*')
+
+if 'xorg/xvmc' in env['statetrackers']:
+
+    env = env.Clone()
+    
+    env.Append(CPPPATH = [
+	'#/src/gallium/include',
+	'#/src/gallium/auxiliary',
+	'#/src/gallium/winsys/g3dvl',
+    ])
+
+    env.ParseConfig('pkg-config --cflags --libs xvmc')
+
+    st_xvmc = env.ConvenienceLibrary(
+	target = 'st_xvmc',
+	source = [ 'block.c',
+		'surface.c',
+		'context.c',
+		'subpicture.c',
+		'attributes.c',
+		]
+    )
+    Export('st_xvmc')
diff --git a/src/gallium/state_trackers/xorg/xvmc/attributes.c b/src/gallium/state_trackers/xorg/xvmc/attributes.c
new file mode 100644
index 0000000000..638da0b577
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/attributes.c
@@ -0,0 +1,19 @@
+#include <assert.h>
+#include <X11/Xlib.h>
+#include <X11/extensions/Xvlib.h>
+#include <X11/extensions/XvMClib.h>
+
+XvAttribute* XvMCQueryAttributes(Display *dpy, XvMCContext *context, int *number)
+{
+   return NULL;
+}
+
+Status XvMCSetAttribute(Display *dpy, XvMCContext *context, Atom attribute, int value)
+{
+   return BadImplementation;
+}
+
+Status XvMCGetAttribute(Display *dpy, XvMCContext *context, Atom attribute, int *value)
+{
+   return BadImplementation;
+}
diff --git a/src/gallium/state_trackers/xorg/xvmc/block.c b/src/gallium/state_trackers/xorg/xvmc/block.c
new file mode 100644
index 0000000000..78fddfb79e
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/block.c
@@ -0,0 +1,61 @@
+#include <assert.h>
+#include <X11/Xlib.h>
+#include <X11/extensions/XvMClib.h>
+#include <util/u_memory.h>
+#include "xvmc_private.h"
+
+Status XvMCCreateBlocks(Display *dpy, XvMCContext *context, unsigned int num_blocks, XvMCBlockArray *blocks)
+{
+   assert(dpy);
+
+   if (!context)
+      return XvMCBadContext;
+   if (num_blocks == 0)
+      return BadValue;
+
+   assert(blocks);
+
+   blocks->context_id = context->context_id;
+   blocks->num_blocks = num_blocks;
+   blocks->blocks = MALLOC(BLOCK_SIZE_BYTES * num_blocks);
+   blocks->privData = NULL;
+
+   return Success;
+}
+
+Status XvMCDestroyBlocks(Display *dpy, XvMCBlockArray *blocks)
+{
+   assert(dpy);
+   assert(blocks);
+   FREE(blocks->blocks);
+
+   return Success;
+}
+
+Status XvMCCreateMacroBlocks(Display *dpy, XvMCContext *context, unsigned int num_blocks, XvMCMacroBlockArray *blocks)
+{
+   assert(dpy);
+
+   if (!context)
+      return XvMCBadContext;
+   if (num_blocks == 0)
+      return BadValue;
+
+   assert(blocks);
+
+   blocks->context_id = context->context_id;
+   blocks->num_blocks = num_blocks;
+   blocks->macro_blocks = MALLOC(sizeof(XvMCMacroBlock) * num_blocks);
+   blocks->privData = NULL;
+
+   return Success;
+}
+
+Status XvMCDestroyMacroBlocks(Display *dpy, XvMCMacroBlockArray *blocks)
+{
+   assert(dpy);
+   assert(blocks);
+   FREE(blocks->macro_blocks);
+
+   return Success;
+}
diff --git a/src/gallium/state_trackers/xorg/xvmc/context.c b/src/gallium/state_trackers/xorg/xvmc/context.c
new file mode 100644
index 0000000000..6d90dfc367
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/context.c
@@ -0,0 +1,214 @@
+#include <assert.h>
+#include <X11/Xlibint.h>
+#include <X11/extensions/XvMClib.h>
+#include <pipe/p_screen.h>
+#include <pipe/p_video_context.h>
+#include <pipe/p_video_state.h>
+#include <pipe/p_state.h>
+#include <vl_winsys.h>
+#include <util/u_memory.h>
+#include <util/u_debug.h>
+#include "xvmc_private.h"
+
+static Status Validate(Display *dpy, XvPortID port, int surface_type_id,
+                       unsigned int width, unsigned int height, int flags,
+                       bool *found_port, int *screen, int *chroma_format,
+                       int *mc_type, int *surface_flags)
+{
+   bool found_surface = false;
+   XvAdaptorInfo *adaptor_info;
+   unsigned int num_adaptors;
+   int num_types;
+   unsigned int max_width, max_height;
+   Status ret;
+
+   assert(dpy);
+   assert(found_port);
+   assert(screen);
+   assert(chroma_format);
+   assert(mc_type);
+   assert(surface_flags);
+
+   *found_port = false;
+
+   for (unsigned int i = 0; i < XScreenCount(dpy); ++i) {
+      ret = XvQueryAdaptors(dpy, XRootWindow(dpy, i), &num_adaptors, &adaptor_info);
+      if (ret != Success)
+         return ret;
+
+      for (unsigned int j = 0; j < num_adaptors && !*found_port; ++j) {
+         for (unsigned int k = 0; k < adaptor_info[j].num_ports && !*found_port; ++k) {
+            XvMCSurfaceInfo *surface_info;
+
+            if (adaptor_info[j].base_id + k != port)
+               continue;
+
+            *found_port = true;
+
+            surface_info = XvMCListSurfaceTypes(dpy, adaptor_info[j].base_id, &num_types);
+            if (!surface_info) {
+               XvFreeAdaptorInfo(adaptor_info);
+               return BadAlloc;
+            }
+
+            for (unsigned int l = 0; l < num_types && !found_surface; ++l) {
+               if (surface_info[l].surface_type_id != surface_type_id)
+                  continue;
+
+               found_surface = true;
+               max_width = surface_info[l].max_width;
+               max_height = surface_info[l].max_height;
+               *chroma_format = surface_info[l].chroma_format;
+               *mc_type = surface_info[l].mc_type;
+               *surface_flags = surface_info[l].flags;
+               *screen = i;
+            }
+
+            XFree(surface_info);
+         }
+      }
+
+      XvFreeAdaptorInfo(adaptor_info);
+   }
+
+   if (!*found_port)
+      return XvBadPort;
+   if (!found_surface)
+      return BadMatch;
+   if (width > max_width || height > max_height)
+      return BadValue;
+   if (flags != XVMC_DIRECT && flags != 0)
+      return BadValue;
+
+   return Success;
+}
+
+static enum pipe_video_profile ProfileToPipe(int xvmc_profile)
+{
+   if (xvmc_profile & XVMC_MPEG_1)
+      assert(0);
+   if (xvmc_profile & XVMC_MPEG_2)
+      return PIPE_VIDEO_PROFILE_MPEG2_MAIN;
+   if (xvmc_profile & XVMC_H263)
+      assert(0);
+   if (xvmc_profile & XVMC_MPEG_4)
+      assert(0);
+	
+   assert(0);
+
+   return -1;
+}
+
+static enum pipe_video_chroma_format FormatToPipe(int xvmc_format)
+{
+   switch (xvmc_format) {
+      case XVMC_CHROMA_FORMAT_420:
+         return PIPE_VIDEO_CHROMA_FORMAT_420;
+      case XVMC_CHROMA_FORMAT_422:
+         return PIPE_VIDEO_CHROMA_FORMAT_422;
+      case XVMC_CHROMA_FORMAT_444:
+         return PIPE_VIDEO_CHROMA_FORMAT_444;
+      default:
+         assert(0);
+   }
+
+   return -1;
+}
+
+Status XvMCCreateContext(Display *dpy, XvPortID port, int surface_type_id,
+                         int width, int height, int flags, XvMCContext *context)
+{
+   bool found_port;
+   int scrn;
+   int chroma_format;
+   int mc_type;
+   int surface_flags;
+   Status ret;
+   struct pipe_screen *screen;
+   struct pipe_video_context *vpipe;
+   XvMCContextPrivate *context_priv;
+
+   assert(dpy);
+
+   if (!context)
+      return XvMCBadContext;
+
+   ret = Validate(dpy, port, surface_type_id, width, height, flags,
+                  &found_port, &scrn, &chroma_format, &mc_type, &surface_flags);
+
+   /* Success and XvBadPort have the same value */
+   if (ret != Success || !found_port)
+      return ret;
+
+   /* XXX: Current limits */
+   if (chroma_format != XVMC_CHROMA_FORMAT_420) {
+      debug_printf("[XvMCg3dvl] Cannot decode requested surface type. Unsupported chroma format.\n");
+      return BadImplementation;
+   }
+   if (mc_type != (XVMC_MOCOMP | XVMC_MPEG_2)) {
+      debug_printf("[XvMCg3dvl] Cannot decode requested surface type. Non-MPEG2/Mocomp acceleration unsupported.\n");
+      return BadImplementation;
+   }
+   if (!(surface_flags & XVMC_INTRA_UNSIGNED)) {
+      debug_printf("[XvMCg3dvl] Cannot decode requested surface type. Signed intra unsupported.\n");
+      return BadImplementation;
+   }
+
+   context_priv = CALLOC(1, sizeof(XvMCContextPrivate));
+   if (!context_priv)
+      return BadAlloc;
+
+   /* TODO: Reuse screen if process creates another context */
+   screen = vl_screen_create(dpy, scrn);
+
+   if (!screen) {
+      FREE(context_priv);
+      return BadAlloc;
+   }
+
+   vpipe = vl_video_create(screen, ProfileToPipe(mc_type),
+                           FormatToPipe(chroma_format), width, height);
+
+   if (!vpipe) {
+      screen->destroy(screen);
+      FREE(context_priv);
+      return BadAlloc;
+   }
+
+   context_priv->vpipe = vpipe;
+
+   context->context_id = XAllocID(dpy);
+   context->surface_type_id = surface_type_id;
+   context->width = width;
+   context->height = height;
+   context->flags = flags;
+   context->port = port;
+   context->privData = context_priv;
+	
+   SyncHandle();
+
+   return Success;
+}
+
+Status XvMCDestroyContext(Display *dpy, XvMCContext *context)
+{
+   struct pipe_screen *screen;
+   struct pipe_video_context *vpipe;
+   XvMCContextPrivate *context_priv;
+
+   assert(dpy);
+
+   if (!context || !context->privData)
+      return XvMCBadContext;
+
+   context_priv = context->privData;
+   vpipe = context_priv->vpipe;
+   pipe_surface_reference(&context_priv->backbuffer, NULL);
+   screen = vpipe->screen;
+   vpipe->destroy(vpipe);
+   screen->destroy(screen);
+   FREE(context_priv);
+   context->privData = NULL;
+
+   return Success;
+}
diff --git a/src/gallium/state_trackers/xorg/xvmc/subpicture.c b/src/gallium/state_trackers/xorg/xvmc/subpicture.c
new file mode 100644
index 0000000000..78ba618f5a
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/subpicture.c
@@ -0,0 +1,168 @@
+#include <assert.h>
+#include <X11/Xlibint.h>
+#include <X11/extensions/XvMClib.h>
+
+Status XvMCCreateSubpicture(Display *dpy, XvMCContext *context, XvMCSubpicture *subpicture,
+                            unsigned short width, unsigned short height, int xvimage_id)
+{
+   assert(dpy);
+
+   if (!context)
+      return XvMCBadContext;
+
+   assert(subpicture);
+
+   /*if (width > || height > )
+      return BadValue;*/
+
+   /*if (xvimage_id != )
+      return BadMatch;*/
+
+   subpicture->subpicture_id = XAllocID(dpy);
+   subpicture->context_id = context->context_id;
+   subpicture->xvimage_id = xvimage_id;
+   subpicture->width = width;
+   subpicture->height = height;
+   subpicture->num_palette_entries = 0;
+   subpicture->entry_bytes = 0;
+   subpicture->component_order[0] = 0;
+   subpicture->component_order[1] = 0;
+   subpicture->component_order[2] = 0;
+   subpicture->component_order[3] = 0;
+   /* TODO: subpicture->privData = ;*/
+
+   SyncHandle();
+
+   return Success;
+}
+
+Status XvMCClearSubpicture(Display *dpy, XvMCSubpicture *subpicture, short x, short y,
+                           unsigned short width, unsigned short height, unsigned int color)
+{
+   assert(dpy);
+
+   if (!subpicture)
+      return XvMCBadSubpicture;
+
+   /* TODO: Assert clear rect is within bounds? Or clip? */
+
+   return Success;
+}
+
+Status XvMCCompositeSubpicture(Display *dpy, XvMCSubpicture *subpicture, XvImage *image,
+                               short srcx, short srcy, unsigned short width, unsigned short height,
+                               short dstx, short dsty)
+{
+   assert(dpy);
+
+   if (!subpicture)
+      return XvMCBadSubpicture;
+
+   assert(image);
+
+   if (subpicture->xvimage_id != image->id)
+      return BadMatch;
+
+   /* TODO: Assert rects are within bounds? Or clip? */
+
+   return Success;
+}
+
+Status XvMCDestroySubpicture(Display *dpy, XvMCSubpicture *subpicture)
+{
+   assert(dpy);
+
+   if (!subpicture)
+      return XvMCBadSubpicture;
+
+   return BadImplementation;
+}
+
+Status XvMCSetSubpicturePalette(Display *dpy, XvMCSubpicture *subpicture, unsigned char *palette)
+{
+   assert(dpy);
+
+   if (!subpicture)
+      return XvMCBadSubpicture;
+
+   assert(palette);
+
+   /* We don't support paletted subpictures */
+   return BadMatch;
+}
+
+Status XvMCBlendSubpicture(Display *dpy, XvMCSurface *target_surface, XvMCSubpicture *subpicture,
+                           short subx, short suby, unsigned short subw, unsigned short subh,
+                           short surfx, short surfy, unsigned short surfw, unsigned short surfh)
+{
+   assert(dpy);
+
+   if (!target_surface)
+      return XvMCBadSurface;
+
+   if (!subpicture)
+      return XvMCBadSubpicture;
+
+   if (target_surface->context_id != subpicture->context_id)
+      return BadMatch;
+
+   /* TODO: Assert rects are within bounds? Or clip? */
+   return Success;
+}
+
+Status XvMCBlendSubpicture2(Display *dpy, XvMCSurface *source_surface, XvMCSurface *target_surface,
+                            XvMCSubpicture *subpicture, short subx, short suby, unsigned short subw, unsigned short subh,
+                            short surfx, short surfy, unsigned short surfw, unsigned short surfh)
+{
+   assert(dpy);
+
+   if (!source_surface || !target_surface)
+      return XvMCBadSurface;
+
+   if (!subpicture)
+      return XvMCBadSubpicture;
+
+   if (source_surface->context_id != subpicture->context_id)
+      return BadMatch;
+
+   if (source_surface->context_id != subpicture->context_id)
+      return BadMatch;
+
+   /* TODO: Assert rects are within bounds? Or clip? */
+   return Success;
+}
+
+Status XvMCSyncSubpicture(Display *dpy, XvMCSubpicture *subpicture)
+{
+   assert(dpy);
+
+   if (!subpicture)
+      return XvMCBadSubpicture;
+
+   return Success;
+}
+
+Status XvMCFlushSubpicture(Display *dpy, XvMCSubpicture *subpicture)
+{
+   assert(dpy);
+
+   if (!subpicture)
+      return XvMCBadSubpicture;
+
+   return Success;
+}
+
+Status XvMCGetSubpictureStatus(Display *dpy, XvMCSubpicture *subpicture, int *status)
+{
+   assert(dpy);
+
+   if (!subpicture)
+      return XvMCBadSubpicture;
+
+   assert(status);
+
+   /* TODO */
+   *status = 0;
+
+   return Success;
+}
diff --git a/src/gallium/state_trackers/xorg/xvmc/surface.c b/src/gallium/state_trackers/xorg/xvmc/surface.c
new file mode 100644
index 0000000000..6b7dbf11dc
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/surface.c
@@ -0,0 +1,382 @@
+#include <assert.h>
+#include <X11/Xlibint.h>
+#include <pipe/p_video_context.h>
+#include <pipe/p_video_state.h>
+#include <pipe/p_state.h>
+#include <util/u_memory.h>
+#include "xvmc_private.h"
+
+static enum pipe_mpeg12_macroblock_type TypeToPipe(int xvmc_mb_type)
+{
+   if (xvmc_mb_type & XVMC_MB_TYPE_INTRA)
+      return PIPE_MPEG12_MACROBLOCK_TYPE_INTRA;
+   if ((xvmc_mb_type & (XVMC_MB_TYPE_MOTION_FORWARD | XVMC_MB_TYPE_MOTION_BACKWARD)) == XVMC_MB_TYPE_MOTION_FORWARD)
+      return PIPE_MPEG12_MACROBLOCK_TYPE_FWD;
+   if ((xvmc_mb_type & (XVMC_MB_TYPE_MOTION_FORWARD | XVMC_MB_TYPE_MOTION_BACKWARD)) == XVMC_MB_TYPE_MOTION_BACKWARD)
+      return PIPE_MPEG12_MACROBLOCK_TYPE_BKWD;
+   if ((xvmc_mb_type & (XVMC_MB_TYPE_MOTION_FORWARD | XVMC_MB_TYPE_MOTION_BACKWARD)) == (XVMC_MB_TYPE_MOTION_FORWARD | XVMC_MB_TYPE_MOTION_BACKWARD))
+      return PIPE_MPEG12_MACROBLOCK_TYPE_BI;
+
+   assert(0);
+
+   return -1;
+}
+
+static enum pipe_mpeg12_picture_type PictureToPipe(int xvmc_pic)
+{
+   switch (xvmc_pic) {
+      case XVMC_TOP_FIELD:
+         return PIPE_MPEG12_PICTURE_TYPE_FIELD_TOP;
+      case XVMC_BOTTOM_FIELD:
+         return PIPE_MPEG12_PICTURE_TYPE_FIELD_BOTTOM;
+      case XVMC_FRAME_PICTURE:
+         return PIPE_MPEG12_PICTURE_TYPE_FRAME;
+      default:
+         assert(0);
+   }
+
+   return -1;
+}
+
+static enum pipe_mpeg12_motion_type MotionToPipe(int xvmc_motion_type, int xvmc_dct_type)
+{
+   switch (xvmc_motion_type) {
+      case XVMC_PREDICTION_FRAME:
+         return xvmc_dct_type == XVMC_DCT_TYPE_FIELD ?
+            PIPE_MPEG12_MOTION_TYPE_16x8 : PIPE_MPEG12_MOTION_TYPE_FRAME;
+      case XVMC_PREDICTION_FIELD:
+         return PIPE_MPEG12_MOTION_TYPE_FIELD;
+      case XVMC_PREDICTION_DUAL_PRIME:
+         return PIPE_MPEG12_MOTION_TYPE_DUALPRIME;
+      default:
+         assert(0);
+   }
+
+   return -1;
+}
+
+static bool
+CreateOrResizeBackBuffer(struct pipe_video_context *vpipe, unsigned int width, unsigned int height,
+                         struct pipe_surface **backbuffer)
+{
+   struct pipe_texture template;
+   struct pipe_texture *tex;
+
+   assert(vpipe);
+
+   if (*backbuffer) {
+      if ((*backbuffer)->width != width || (*backbuffer)->height != height)
+         pipe_surface_reference(backbuffer, NULL);
+      else
+         return true;
+   }
+
+   memset(&template, 0, sizeof(struct pipe_texture));
+   template.target = PIPE_TEXTURE_2D;
+   /* XXX: Needs to match the drawable's format? */
+   template.format = PIPE_FORMAT_X8R8G8B8_UNORM;
+   template.last_level = 0;
+   template.width[0] = width;
+   template.height[0] = height;
+   template.depth[0] = 1;
+   pf_get_block(template.format, &template.block);
+   template.tex_usage = PIPE_TEXTURE_USAGE_DISPLAY_TARGET;
+
+   tex = vpipe->screen->texture_create(vpipe->screen, &template);
+   if (!tex)
+      return false;
+
+   *backbuffer = vpipe->screen->get_tex_surface(vpipe->screen, tex, 0, 0, 0,
+                                                PIPE_BUFFER_USAGE_GPU_READ |
+                                                PIPE_BUFFER_USAGE_GPU_WRITE);
+   pipe_texture_reference(&tex, NULL);
+
+   if (!*backbuffer)
+      return false;
+
+   /* Clear the backbuffer in case the video doesn't cover the whole window */
+   /* FIXME: Need to clear every time a frame moves and leaves dirty rects */
+   vpipe->clear_surface(vpipe, 0, 0, width, height, 0, *backbuffer);
+
+   return true;
+}
+
+static void
+MacroBlocksToPipe(const XvMCMacroBlockArray *xvmc_macroblocks,
+                  const XvMCBlockArray *xvmc_blocks,
+                  unsigned int first_macroblock,
+                  unsigned int num_macroblocks,
+                  struct pipe_mpeg12_macroblock *pipe_macroblocks)
+{
+   unsigned int i, j, k, l;
+   XvMCMacroBlock *xvmc_mb;
+
+   assert(xvmc_macroblocks);
+   assert(xvmc_blocks);
+   assert(pipe_macroblocks);
+   assert(num_macroblocks);
+
+   xvmc_mb = xvmc_macroblocks->macro_blocks + first_macroblock;
+
+   for (i = 0; i < num_macroblocks; ++i) {
+      pipe_macroblocks->base.codec = PIPE_VIDEO_CODEC_MPEG12;
+      pipe_macroblocks->mbx = xvmc_mb->x;
+      pipe_macroblocks->mby = xvmc_mb->y;
+      pipe_macroblocks->mb_type = TypeToPipe(xvmc_mb->macroblock_type);
+      if (pipe_macroblocks->mb_type != PIPE_MPEG12_MACROBLOCK_TYPE_INTRA)
+         pipe_macroblocks->mo_type = MotionToPipe(xvmc_mb->motion_type, xvmc_mb->dct_type);
+      /* Get rid of Valgrind 'undefined' warnings */
+      else
+         pipe_macroblocks->mo_type = -1;
+      pipe_macroblocks->dct_type = xvmc_mb->dct_type == XVMC_DCT_TYPE_FIELD ?
+         PIPE_MPEG12_DCT_TYPE_FIELD : PIPE_MPEG12_DCT_TYPE_FRAME;
+
+      for (j = 0; j < 2; ++j)
+         for (k = 0; k < 2; ++k)
+            for (l = 0; l < 2; ++l)
+               pipe_macroblocks->pmv[j][k][l] = xvmc_mb->PMV[j][k][l];
+
+      pipe_macroblocks->cbp = xvmc_mb->coded_block_pattern;
+      pipe_macroblocks->blocks = xvmc_blocks->blocks + xvmc_mb->index * BLOCK_SIZE_SAMPLES;
+
+      ++pipe_macroblocks;
+      ++xvmc_mb;
+   }
+}
+
+Status XvMCCreateSurface(Display *dpy, XvMCContext *context, XvMCSurface *surface)
+{
+   XvMCContextPrivate *context_priv;
+   struct pipe_video_context *vpipe;
+   XvMCSurfacePrivate *surface_priv;
+   struct pipe_video_surface *vsfc;
+
+   assert(dpy);
+
+   if (!context)
+      return XvMCBadContext;
+   if (!surface)
+      return XvMCBadSurface;
+
+   context_priv = context->privData;
+   vpipe = context_priv->vpipe;
+
+   surface_priv = CALLOC(1, sizeof(XvMCSurfacePrivate));
+   if (!surface_priv)
+      return BadAlloc;
+
+   vsfc = vpipe->screen->video_surface_create(vpipe->screen, vpipe->chroma_format,
+                                              vpipe->width, vpipe->height);
+   if (!vsfc) {
+      FREE(surface_priv);
+      return BadAlloc;
+   }
+
+   surface_priv->pipe_vsfc = vsfc;
+   surface_priv->context = context;
+
+   surface->surface_id = XAllocID(dpy);
+   surface->context_id = context->context_id;
+   surface->surface_type_id = context->surface_type_id;
+   surface->width = context->width;
+   surface->height = context->height;
+   surface->privData = surface_priv;
+
+   SyncHandle();
+
+   return Success;
+}
+
+Status XvMCRenderSurface(Display *dpy, XvMCContext *context, unsigned int picture_structure,
+                         XvMCSurface *target_surface, XvMCSurface *past_surface, XvMCSurface *future_surface,
+                         unsigned int flags, unsigned int num_macroblocks, unsigned int first_macroblock,
+                         XvMCMacroBlockArray *macroblocks, XvMCBlockArray *blocks
+)
+{
+   struct pipe_video_context *vpipe;
+   struct pipe_surface *t_vsfc;
+   struct pipe_surface *p_vsfc;
+   struct pipe_surface *f_vsfc;
+   XvMCContextPrivate *context_priv;
+   XvMCSurfacePrivate *target_surface_priv;
+   XvMCSurfacePrivate *past_surface_priv;
+   XvMCSurfacePrivate *future_surface_priv;
+   struct pipe_mpeg12_macroblock pipe_macroblocks[num_macroblocks];
+
+   assert(dpy);
+
+   if (!context || !context->privData)
+      return XvMCBadContext;
+   if (!target_surface || !target_surface->privData)
+      return XvMCBadSurface;
+
+   if (picture_structure != XVMC_TOP_FIELD &&
+       picture_structure != XVMC_BOTTOM_FIELD &&
+       picture_structure != XVMC_FRAME_PICTURE)
+      return BadValue;
+   /* Bkwd pred equivalent to fwd (past && !future) */
+   if (future_surface && !past_surface)
+      return BadMatch;
+
+   assert(context->context_id == target_surface->context_id);
+   assert(!past_surface || context->context_id == past_surface->context_id);
+   assert(!future_surface || context->context_id == future_surface->context_id);
+
+   assert(macroblocks);
+   assert(blocks);
+
+   assert(macroblocks->context_id == context->context_id);
+   assert(blocks->context_id == context->context_id);
+
+   assert(flags == 0 || flags == XVMC_SECOND_FIELD);
+
+   target_surface_priv = target_surface->privData;
+   past_surface_priv = past_surface ? past_surface->privData : NULL;
+   future_surface_priv = future_surface ? future_surface->privData : NULL;
+
+   assert(target_surface_priv->context == context);
+   assert(!past_surface || past_surface_priv->context == context);
+   assert(!future_surface || future_surface_priv->context == context);
+
+   context_priv = context->privData;
+   vpipe = context_priv->vpipe;
+
+   t_vsfc = target_surface_priv->pipe_vsfc;
+   p_vsfc = past_surface ? past_surface_priv->pipe_vsfc : NULL;
+   f_vsfc = future_surface ? future_surface_priv->pipe_vsfc : NULL;
+
+   MacroBlocksToPipe(macroblocks, blocks, first_macroblock,
+                     num_macroblocks, pipe_macroblocks);
+
+   vpipe->set_decode_target(vpipe, t_vsfc);
+   vpipe->decode_macroblocks(vpipe, p_vsfc, f_vsfc, num_macroblocks,
+                             &pipe_macroblocks->base, target_surface_priv->render_fence);
+
+   return Success;
+}
+
+Status XvMCFlushSurface(Display *dpy, XvMCSurface *surface)
+{
+   assert(dpy);
+
+   if (!surface)
+      return XvMCBadSurface;
+
+   return Success;
+}
+
+Status XvMCSyncSurface(Display *dpy, XvMCSurface *surface)
+{
+   assert(dpy);
+
+   if (!surface)
+      return XvMCBadSurface;
+
+   return Success;
+}
+
+Status XvMCPutSurface(Display *dpy, XvMCSurface *surface, Drawable drawable,
+                      short srcx, short srcy, unsigned short srcw, unsigned short srch,
+                      short destx, short desty, unsigned short destw, unsigned short desth,
+                      int flags)
+{
+   Window root;
+   int x, y;
+   unsigned int width, height;
+   unsigned int border_width;
+   unsigned int depth;
+   struct pipe_video_context *vpipe;
+   XvMCSurfacePrivate *surface_priv;
+   XvMCContextPrivate *context_priv;
+   XvMCContext *context;
+   struct pipe_video_rect src_rect = {srcx, srcy, srcw, srch};
+   struct pipe_video_rect dst_rect = {destx, desty, destw, desth};
+
+   assert(dpy);
+
+   if (!surface || !surface->privData)
+      return XvMCBadSurface;
+
+   if (XGetGeometry(dpy, drawable, &root, &x, &y, &width, &height, &border_width, &depth) == BadDrawable)
+      return BadDrawable;
+
+   assert(flags == XVMC_TOP_FIELD || flags == XVMC_BOTTOM_FIELD || flags == XVMC_FRAME_PICTURE);
+   assert(srcx + srcw - 1 < surface->width);
+   assert(srcy + srch - 1 < surface->height);
+   /*
+    * Some apps (mplayer) hit these asserts because they call
+    * this function after the window has been resized by the WM
+    * but before they've handled the corresponding XEvent and
+    * know about the new dimensions. The output should be clipped
+    * until the app updates destw and desth.
+    */
+   /*
+   assert(destx + destw - 1 < width);
+   assert(desty + desth - 1 < height);
+    */
+
+   surface_priv = surface->privData;
+   context = surface_priv->context;
+   context_priv = context->privData;
+   vpipe = context_priv->vpipe;
+
+   if (!CreateOrResizeBackBuffer(vpipe, width, height, &context_priv->backbuffer))
+      return BadAlloc;
+
+   vpipe->render_picture(vpipe, surface_priv->pipe_vsfc, PictureToPipe(flags), &src_rect,
+                         context_priv->backbuffer, &dst_rect, surface_priv->disp_fence);
+
+   vl_video_bind_drawable(vpipe, drawable);
+	
+   vpipe->screen->flush_frontbuffer
+   (
+      vpipe->screen,
+      context_priv->backbuffer,
+      vpipe->priv
+   );
+
+   return Success;
+}
+
+Status XvMCGetSurfaceStatus(Display *dpy, XvMCSurface *surface, int *status)
+{
+   assert(dpy);
+
+   if (!surface)
+      return XvMCBadSurface;
+
+   assert(status);
+
+   *status = 0;
+
+   return Success;
+}
+
+Status XvMCDestroySurface(Display *dpy, XvMCSurface *surface)
+{
+   XvMCSurfacePrivate *surface_priv;
+
+   assert(dpy);
+
+   if (!surface || !surface->privData)
+      return XvMCBadSurface;
+
+   surface_priv = surface->privData;
+   pipe_video_surface_reference(&surface_priv->pipe_vsfc, NULL);
+   FREE(surface_priv);
+   surface->privData = NULL;
+
+   return Success;
+}
+
+Status XvMCHideSurface(Display *dpy, XvMCSurface *surface)
+{
+   assert(dpy);
+
+   if (!surface || !surface->privData)
+      return XvMCBadSurface;
+
+   /* No op, only for overlaid rendering */
+
+   return Success;
+}
diff --git a/src/gallium/state_trackers/xorg/xvmc/tests/.gitignore b/src/gallium/state_trackers/xorg/xvmc/tests/.gitignore
new file mode 100644
index 0000000000..e1d2f9023d
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/tests/.gitignore
@@ -0,0 +1,5 @@
+test_context
+test_surface
+test_blocks
+test_rendering
+xvmc_bench
diff --git a/src/gallium/state_trackers/xorg/xvmc/tests/Makefile b/src/gallium/state_trackers/xorg/xvmc/tests/Makefile
new file mode 100644
index 0000000000..c875dd7605
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/tests/Makefile
@@ -0,0 +1,28 @@
+TOP = ../../../../../..
+include $(TOP)/configs/current
+
+LIBS = -lXvMCW -lXvMC -lXv -lX11
+
+#############################################
+
+.PHONY: default clean
+
+default: test_context test_surface test_blocks test_rendering xvmc_bench
+
+test_context: test_context.o testlib.o
+	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)
+
+test_surface: test_surface.o testlib.o
+	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)
+
+test_blocks: test_blocks.o testlib.o
+	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)
+
+test_rendering: test_rendering.o testlib.o
+	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)
+
+xvmc_bench: xvmc_bench.o testlib.o
+	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)
+
+clean:
+	$(RM) -rf *.o test_context test_surface test_blocks test_rendering xvmc_bench
diff --git a/src/gallium/state_trackers/xorg/xvmc/tests/test_blocks.c b/src/gallium/state_trackers/xorg/xvmc/tests/test_blocks.c
new file mode 100644
index 0000000000..dc80adfa65
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/tests/test_blocks.c
@@ -0,0 +1,84 @@
+#include <assert.h>
+#include <error.h>
+#include "testlib.h"
+
+int main(int argc, char **argv)
+{
+	const unsigned int	width = 16, height = 16;
+	const unsigned int	min_required_blocks = 1, min_required_macroblocks = 1;
+	const unsigned int	mc_types[2] = {XVMC_MOCOMP | XVMC_MPEG_2, XVMC_IDCT | XVMC_MPEG_2};
+
+	Display			*display;
+	XvPortID		port_num;
+	int			surface_type_id;
+	unsigned int		is_overlay, intra_unsigned;
+	int			colorkey;
+	XvMCContext		context;
+	XvMCSurface		surface;
+	XvMCBlockArray		blocks = {0};
+	XvMCMacroBlockArray	macroblocks = {0};
+
+	display = XOpenDisplay(NULL);
+
+	if (!GetPort
+	(
+		display,
+		width,
+		height,
+		XVMC_CHROMA_FORMAT_420,
+    		mc_types,
+    		2,
+    		&port_num,
+    		&surface_type_id,
+    		&is_overlay,
+    		&intra_unsigned
+	))
+	{
+		XCloseDisplay(display);
+		error(1, 0, "Error, unable to find a good port.\n");
+	}
+
+	if (is_overlay)
+	{
+		Atom xv_colorkey = XInternAtom(display, "XV_COLORKEY", 0);
+		XvGetPortAttribute(display, port_num, xv_colorkey, &colorkey);
+	}
+
+	assert(XvMCCreateContext(display, port_num, surface_type_id, width, height, XVMC_DIRECT, &context) == Success);
+	assert(XvMCCreateSurface(display, &context, &surface) == Success);
+
+	/* Test NULL context */
+	assert(XvMCCreateBlocks(display, NULL, 1, &blocks) == XvMCBadContext);
+	/* Test 0 blocks */
+	assert(XvMCCreateBlocks(display, &context, 0, &blocks) == BadValue);
+	/* Test valid params */
+	assert(XvMCCreateBlocks(display, &context, min_required_blocks, &blocks) == Success);
+	/* Test context id assigned and correct */
+	assert(blocks.context_id == context.context_id);
+	/* Test number of blocks assigned and correct */
+	assert(blocks.num_blocks == min_required_blocks);
+	/* Test block pointer valid */
+	assert(blocks.blocks != NULL);
+	/* Test NULL context */
+	assert(XvMCCreateMacroBlocks(display, NULL, 1, &macroblocks) == XvMCBadContext);
+	/* Test 0 macroblocks */
+	assert(XvMCCreateMacroBlocks(display, &context, 0, &macroblocks) == BadValue);
+	/* Test valid params */
+	assert(XvMCCreateMacroBlocks(display, &context, min_required_macroblocks, &macroblocks) == Success);
+	/* Test context id assigned and correct */
+	assert(macroblocks.context_id == context.context_id);
+	/* Test macroblock pointer valid */
+	assert(macroblocks.macro_blocks != NULL);
+	/* Test valid params */
+	assert(XvMCDestroyMacroBlocks(display, &macroblocks) == Success);
+	/* Test valid params */
+	assert(XvMCDestroyBlocks(display, &blocks) == Success);
+
+	assert(XvMCDestroySurface(display, &surface) == Success);
+	assert(XvMCDestroyContext(display, &context) == Success);
+
+	XvUngrabPort(display, port_num, CurrentTime);
+	XCloseDisplay(display);
+
+	return 0;
+}
diff --git a/src/gallium/state_trackers/xorg/xvmc/tests/test_context.c b/src/gallium/state_trackers/xorg/xvmc/tests/test_context.c
new file mode 100644
index 0000000000..53f7449cd0
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/tests/test_context.c
@@ -0,0 +1,92 @@
+#include <assert.h>
+#include <error.h>
+#include "testlib.h"
+
+int main(int argc, char **argv)
+{
+	const unsigned int	width = 16, height = 16;
+	const unsigned int	mc_types[2] = {XVMC_MOCOMP | XVMC_MPEG_2, XVMC_IDCT | XVMC_MPEG_2};
+
+	Display			*display;
+	XvPortID		port_num;
+	int			surface_type_id;
+	unsigned int		is_overlay, intra_unsigned;
+	int			colorkey;
+	XvMCContext		context = {0};
+
+	display = XOpenDisplay(NULL);
+
+	if (!GetPort
+	(
+		display,
+		width,
+		height,
+		XVMC_CHROMA_FORMAT_420,
+    		mc_types,
+    		2,
+    		&port_num,
+    		&surface_type_id,
+    		&is_overlay,
+    		&intra_unsigned
+	))
+	{
+		XCloseDisplay(display);
+		error(1, 0, "Error, unable to find a good port.\n");
+	}
+
+	if (is_overlay)
+	{
+		Atom xv_colorkey = XInternAtom(display, "XV_COLORKEY", 0);
+		XvGetPortAttribute(display, port_num, xv_colorkey, &colorkey);
+	}
+
+	/* Test NULL context */
+	/* XXX: XvMCBadContext not a valid return for XvMCCreateContext in the XvMC API, but openChrome driver returns it */
+	assert(XvMCCreateContext(display, port_num, surface_type_id, width, height, XVMC_DIRECT, NULL) == XvMCBadContext);
+	/* Test invalid port */
+	/* XXX: Success and XvBadPort have the same value, if this call actually gets passed the validation step as of now we'll crash later */
+	assert(XvMCCreateContext(display, -1, surface_type_id, width, height, XVMC_DIRECT, &context) == XvBadPort);
+	/* Test invalid surface */
+	assert(XvMCCreateContext(display, port_num, -1, width, height, XVMC_DIRECT, &context) == BadMatch);
+	/* Test invalid flags */
+	assert(XvMCCreateContext(display, port_num, surface_type_id, width, height, -1, &context) == BadValue);
+	/* Test huge width */
+	assert(XvMCCreateContext(display, port_num, surface_type_id, 16384, height, XVMC_DIRECT, &context) == BadValue);
+	/* Test huge height */
+	assert(XvMCCreateContext(display, port_num, surface_type_id, width, 16384, XVMC_DIRECT, &context) == BadValue);
+	/* Test huge width & height */
+	assert(XvMCCreateContext(display, port_num, surface_type_id, 16384, 16384, XVMC_DIRECT, &context) == BadValue);
+	/* Test valid params */
+	assert(XvMCCreateContext(display, port_num, surface_type_id, width, height, XVMC_DIRECT, &context) == Success);
+	/* Test context id assigned */
+	assert(context.context_id != 0);
+	/* Test surface type id assigned and correct */
+	assert(context.surface_type_id == surface_type_id);
+	/* Test width & height assigned and correct */
+	assert(context.width == width && context.height == height);
+	/* Test port assigned and correct */
+	assert(context.port == port_num);
+	/* Test flags assigned and correct */
+	assert(context.flags == XVMC_DIRECT);
+	/* Test NULL context */
+	assert(XvMCDestroyContext(display, NULL) == XvMCBadContext);
+	/* Test valid params */
+	assert(XvMCDestroyContext(display, &context) == Success);
+	/* Test awkward but valid width */
+	assert(XvMCCreateContext(display, port_num, surface_type_id, width + 1, height, XVMC_DIRECT, &context) == Success);
+	assert(context.width >= width + 1);
+	assert(XvMCDestroyContext(display, &context) == Success);
+	/* Test awkward but valid height */
+	assert(XvMCCreateContext(display, port_num, surface_type_id, width, height + 1, XVMC_DIRECT, &context) == Success);
+	assert(context.height >= height + 1);
+	assert(XvMCDestroyContext(display, &context) == Success);
+	/* Test awkward but valid width & height */
+	assert(XvMCCreateContext(display, port_num, surface_type_id, width + 1, height + 1, XVMC_DIRECT, &context) == Success);
+	assert(context.width >= width + 1 && context.height >= height + 1);
+	assert(XvMCDestroyContext(display, &context) == Success);
+
+	XvUngrabPort(display, port_num, CurrentTime);
+	XCloseDisplay(display);
+
+	return 0;
+}
diff --git a/src/gallium/state_trackers/xorg/xvmc/tests/test_rendering.c b/src/gallium/state_trackers/xorg/xvmc/tests/test_rendering.c
new file mode 100644
index 0000000000..6d720dfcdc
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/tests/test_rendering.c
@@ -0,0 +1,290 @@
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <error.h>
+#include "testlib.h"
+
+#define BLOCK_WIDTH			8
+#define BLOCK_HEIGHT			8
+#define BLOCK_SIZE			(BLOCK_WIDTH * BLOCK_HEIGHT)
+#define MACROBLOCK_WIDTH		16
+#define MACROBLOCK_HEIGHT		16
+#define MACROBLOCK_WIDTH_IN_BLOCKS	(MACROBLOCK_WIDTH / BLOCK_WIDTH)
+#define MACROBLOCK_HEIGHT_IN_BLOCKS	(MACROBLOCK_HEIGHT / BLOCK_HEIGHT)
+#define BLOCKS_PER_MACROBLOCK		6
+
+#define INPUT_WIDTH			16
+#define INPUT_HEIGHT			16
+#define INPUT_WIDTH_IN_MACROBLOCKS	(INPUT_WIDTH / MACROBLOCK_WIDTH)
+#define INPUT_HEIGHT_IN_MACROBLOCKS	(INPUT_HEIGHT / MACROBLOCK_HEIGHT)
+#define NUM_MACROBLOCKS			(INPUT_WIDTH_IN_MACROBLOCKS * INPUT_HEIGHT_IN_MACROBLOCKS)
+
+#define DEFAULT_OUTPUT_WIDTH		INPUT_WIDTH
+#define DEFAULT_OUTPUT_HEIGHT		INPUT_HEIGHT
+#define DEFAULT_ACCEPTABLE_ERR		0.01
+
+void ParseArgs(int argc, char **argv, unsigned int *output_width, unsigned int *output_height, double *acceptable_error, int *prompt);
+void Gradient(short *block, unsigned int start, unsigned int stop, int horizontal);
+
+void ParseArgs(int argc, char **argv, unsigned int *output_width, unsigned int *output_height, double *acceptable_error, int *prompt)
+{
+	int fail = 0;
+	int i;
+
+	*output_width = DEFAULT_OUTPUT_WIDTH;
+	*output_height = DEFAULT_OUTPUT_WIDTH;
+	*acceptable_error = DEFAULT_ACCEPTABLE_ERR;
+	*prompt = 1;
+
+	for (i = 1; i < argc && !fail; ++i)
+	{
+		if (!strcmp(argv[i], "-w"))
+		{
+			if (sscanf(argv[++i], "%u", output_width) != 1)
+				fail = 1;
+		}
+		else if (!strcmp(argv[i], "-h"))
+		{
+			if (sscanf(argv[++i], "%u", output_height) != 1)
+				fail = 1;
+		}
+		else if (!strcmp(argv[i], "-e"))
+		{
+			if (sscanf(argv[++i], "%lf", acceptable_error) != 1)
+				fail = 1;
+		}
+		else if (strcmp(argv[i], "-n"))
+			*prompt = 0;
+		else
+			fail = 1;
+	}
+
+	if (fail)
+		error
+		(
+			1, 0,
+			"Bad argument.\n"
+			"\n"
+			"Usage: %s [options]\n"
+			"\t-w <width>\tOutput width\n"
+			"\t-h <height>\tOutput height\n"
+			"\t-e <error>\tAcceptable margin of error per pixel, from 0 to 1\n"
+			"\t-n\tDon't prompt for quit\n",
+			argv[0]
+		);
+}
+
+void Gradient(short *block, unsigned int start, unsigned int stop, int horizontal)
+{
+	unsigned int x, y;
+	unsigned int range = stop - start;
+
+	if (horizontal)
+	{
+		for (y = 0; y < BLOCK_HEIGHT; ++y)
+			for (x = 0; x < BLOCK_WIDTH; ++x)
+				block[y * BLOCK_WIDTH + x] = (short)(start + range * (x / (float)(BLOCK_WIDTH - 1)));
+	}
+	else
+	{
+		for (y = 0; y < BLOCK_HEIGHT; ++y)
+			for (x = 0; x < BLOCK_WIDTH; ++x)
+				block[y * BLOCK_WIDTH + x] = (short)(start + range * (y / (float)(BLOCK_HEIGHT - 1)));
+	}
+}
+
+int main(int argc, char **argv)
+{
+	unsigned int		output_width;
+	unsigned int		output_height;
+	double			acceptable_error;
+	int			prompt;
+	Display			*display;
+	Window			root, window;
+	const unsigned int	mc_types[2] = {XVMC_MOCOMP | XVMC_MPEG_2, XVMC_IDCT | XVMC_MPEG_2};
+	XvPortID		port_num;
+	int			surface_type_id;
+	unsigned int		is_overlay, intra_unsigned;
+	int			colorkey;
+	XvMCContext		context;
+	XvMCSurface		surface;
+	XvMCBlockArray		block_array;
+	XvMCMacroBlockArray	mb_array;
+	int			mbx, mby, bx, by;
+	XvMCMacroBlock		*mb;
+	short			*blocks;
+	int			quit = 0;
+
+	ParseArgs(argc, argv, &output_width, &output_height, &acceptable_error, &prompt);
+
+	display = XOpenDisplay(NULL);
+
+	if (!GetPort
+	(
+		display,
+		INPUT_WIDTH,
+		INPUT_HEIGHT,
+		XVMC_CHROMA_FORMAT_420,
+    		mc_types,
+    		2,
+    		&port_num,
+    		&surface_type_id,
+    		&is_overlay,
+    		&intra_unsigned
+	))
+	{
+		XCloseDisplay(display);
+		error(1, 0, "Error, unable to find a good port.\n");
+	}
+
+	if (is_overlay)
+	{
+		Atom xv_colorkey = XInternAtom(display, "XV_COLORKEY", 0);
+		XvGetPortAttribute(display, port_num, xv_colorkey, &colorkey);
+	}
+
+	root = XDefaultRootWindow(display);
+	window = XCreateSimpleWindow(display, root, 0, 0, output_width, output_height, 0, 0, colorkey);
+
+	assert(XvMCCreateContext(display, port_num, surface_type_id, INPUT_WIDTH, INPUT_HEIGHT, XVMC_DIRECT, &context) == Success);
+	assert(XvMCCreateSurface(display, &context, &surface) == Success);
+	assert(XvMCCreateBlocks(display, &context, NUM_MACROBLOCKS * BLOCKS_PER_MACROBLOCK, &block_array) == Success);
+	assert(XvMCCreateMacroBlocks(display, &context, NUM_MACROBLOCKS, &mb_array) == Success);
+
+	mb = mb_array.macro_blocks;
+	blocks = block_array.blocks;
+
+	for (mby = 0; mby < INPUT_HEIGHT_IN_MACROBLOCKS; ++mby)
+		for (mbx = 0; mbx < INPUT_WIDTH_IN_MACROBLOCKS; ++mbx)
+		{
+			mb->x = mbx;
+			mb->y = mby;
+			mb->macroblock_type = XVMC_MB_TYPE_INTRA;
+			/*mb->motion_type = ;*/
+			/*mb->motion_vertical_field_select = ;*/
+			mb->dct_type = XVMC_DCT_TYPE_FRAME;
+			/*mb->PMV[0][0][0] = ;
+			mb->PMV[0][0][1] = ;
+			mb->PMV[0][1][0] = ;
+			mb->PMV[0][1][1] = ;
+			mb->PMV[1][0][0] = ;
+			mb->PMV[1][0][1] = ;
+			mb->PMV[1][1][0] = ;
+			mb->PMV[1][1][1] = ;*/
+			mb->index = (mby * INPUT_WIDTH_IN_MACROBLOCKS + mbx) * BLOCKS_PER_MACROBLOCK;
+			mb->coded_block_pattern = 0x3F;
+
+			mb++;
+
+			for (by = 0; by < MACROBLOCK_HEIGHT_IN_BLOCKS; ++by)
+				for (bx = 0; bx < MACROBLOCK_WIDTH_IN_BLOCKS; ++bx)
+				{
+					const int start = 16, stop = 235, range = stop - start;
+
+					Gradient
+					(
+						blocks,
+						(short)(start + range * ((mbx * MACROBLOCK_WIDTH + bx * BLOCK_WIDTH) / (float)(INPUT_WIDTH - 1))),
+						(short)(start + range * ((mbx * MACROBLOCK_WIDTH + bx * BLOCK_WIDTH + BLOCK_WIDTH - 1) / (float)(INPUT_WIDTH - 1))),
+						1
+					);
+
+					blocks += BLOCK_SIZE;
+				}
+
+			for (by = 0; by < MACROBLOCK_HEIGHT_IN_BLOCKS / 2; ++by)
+				for (bx = 0; bx < MACROBLOCK_WIDTH_IN_BLOCKS / 2; ++bx)
+				{
+					const int start = 16, stop = 240, range = stop - start;
+
+					Gradient
+					(
+						blocks,
+						(short)(start + range * ((mbx * MACROBLOCK_WIDTH + bx * BLOCK_WIDTH) / (float)(INPUT_WIDTH - 1))),
+						(short)(start + range * ((mbx * MACROBLOCK_WIDTH + bx * BLOCK_WIDTH + BLOCK_WIDTH - 1) / (float)(INPUT_WIDTH - 1))),
+						1
+					);
+
+					blocks += BLOCK_SIZE;
+
+					Gradient
+					(
+						blocks,
+						(short)(start + range * ((mbx * MACROBLOCK_WIDTH + bx * BLOCK_WIDTH) / (float)(INPUT_WIDTH - 1))),
+						(short)(start + range * ((mbx * MACROBLOCK_WIDTH + bx * BLOCK_WIDTH + BLOCK_WIDTH - 1) / (float)(INPUT_WIDTH - 1))),
+						1
+					);
+
+					blocks += BLOCK_SIZE;
+				}
+		}
+
+	XSelectInput(display, window, ExposureMask | KeyPressMask);
+	XMapWindow(display, window);
+	XSync(display, 0);
+
+	/* Test NULL context */
+	assert(XvMCRenderSurface(display, NULL, XVMC_FRAME_PICTURE, &surface, NULL, NULL, 0, NUM_MACROBLOCKS, 0, &mb_array, &block_array) == XvMCBadContext);
+	/* Test NULL surface */
+	assert(XvMCRenderSurface(display, &context, XVMC_FRAME_PICTURE, NULL, NULL, NULL, 0, NUM_MACROBLOCKS, 0, &mb_array, &block_array) == XvMCBadSurface);
+	/* Test bad picture structure */
+	assert(XvMCRenderSurface(display, &context, 0, &surface, NULL, NULL, 0, NUM_MACROBLOCKS, 0, &mb_array, &block_array) == BadValue);
+	/* Test valid params */
+	assert(XvMCRenderSurface(display, &context, XVMC_FRAME_PICTURE, &surface, NULL, NULL, 0, NUM_MACROBLOCKS, 0, &mb_array, &block_array) == Success);
+
+	/* Test NULL surface */
+	assert(XvMCPutSurface(display, NULL, window, 0, 0, INPUT_WIDTH, INPUT_HEIGHT, 0, 0, output_width, output_height, XVMC_FRAME_PICTURE) == XvMCBadSurface);
+	/* Test bad window */
+	/* XXX: X halts with a bad drawable for some reason, doesn't return BadDrawable as expected */
+	/*assert(XvMCPutSurface(display, &surface, 0, 0, 0, width, height, 0, 0, width, height, XVMC_FRAME_PICTURE) == BadDrawable);*/
+
+	if (prompt)
+	{
+		puts("Press any button to quit...");
+
+		while (!quit)
+		{
+			if (XPending(display) > 0)
+			{
+				XEvent event;
+
+				XNextEvent(display, &event);
+
+				switch (event.type)
+				{
+					case Expose:
+					{
+						/* Test valid params */
+						assert
+						(
+							XvMCPutSurface
+							(
+								display, &surface, window,
+								0, 0, INPUT_WIDTH, INPUT_HEIGHT,
+								0, 0, output_width, output_height,
+								XVMC_FRAME_PICTURE
+							) == Success
+						);
+						break;
+					}
+					case KeyPress:
+					{
+						quit = 1;
+						break;
+					}
+				}
+			}
+		}
+	}
+
+	assert(XvMCDestroyBlocks(display, &block_array) == Success);
+	assert(XvMCDestroyMacroBlocks(display, &mb_array) == Success);
+	assert(XvMCDestroySurface(display, &surface) == Success);
+	assert(XvMCDestroyContext(display, &context) == Success);
+
+	XvUngrabPort(display, port_num, CurrentTime);
+	XDestroyWindow(display, window);
+	XCloseDisplay(display);
+
+	return 0;
+}
diff --git a/src/gallium/state_trackers/xorg/xvmc/tests/test_surface.c b/src/gallium/state_trackers/xorg/xvmc/tests/test_surface.c
new file mode 100644
index 0000000000..06948201ac
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/tests/test_surface.c
@@ -0,0 +1,71 @@
+#include <assert.h>
+#include <error.h>
+#include "testlib.h"
+
+int main(int argc, char **argv)
+{
+	const unsigned int	width = 16, height = 16;
+	const unsigned int	mc_types[2] = {XVMC_MOCOMP | XVMC_MPEG_2, XVMC_IDCT | XVMC_MPEG_2};
+
+	Display			*display;
+	XvPortID		port_num;
+	int			surface_type_id;
+	unsigned int		is_overlay, intra_unsigned;
+	int			colorkey;
+	XvMCContext		context;
+	XvMCSurface		surface = {0};
+
+	display = XOpenDisplay(NULL);
+
+	if (!GetPort
+	(
+		display,
+		width,
+		height,
+		XVMC_CHROMA_FORMAT_420,
+    		mc_types,
+    		2,
+    		&port_num,
+    		&surface_type_id,
+    		&is_overlay,
+    		&intra_unsigned
+	))
+	{
+		XCloseDisplay(display);
+		error(1, 0, "Error, unable to find a good port.\n");
+	}
+
+	if (is_overlay)
+	{
+		Atom xv_colorkey = XInternAtom(display, "XV_COLORKEY", 0);
+		XvGetPortAttribute(display, port_num, xv_colorkey, &colorkey);
+	}
+
+	assert(XvMCCreateContext(display, port_num, surface_type_id, width, height, XVMC_DIRECT, &context) == Success);
+
+	/* Test NULL context */
+	assert(XvMCCreateSurface(display, NULL, &surface) == XvMCBadContext);
+	/* Test NULL surface */
+	assert(XvMCCreateSurface(display, &context, NULL) == XvMCBadSurface);
+	/* Test valid params */
+	assert(XvMCCreateSurface(display, &context, &surface) == Success);
+	/* Test surface id assigned */
+	assert(surface.surface_id != 0);
+	/* Test context id assigned and correct */
+	assert(surface.context_id == context.context_id);
+	/* Test surface type id assigned and correct */
+	assert(surface.surface_type_id == surface_type_id);
+	/* Test width & height assigned and correct */
+	assert(surface.width == width && surface.height == height);
+	/* Test valid params */
+	assert(XvMCDestroySurface(display, &surface) == Success);
+	/* Test NULL surface */
+	assert(XvMCDestroySurface(display, NULL) == XvMCBadSurface);
+
+	assert(XvMCDestroyContext(display, &context) == Success);
+
+	XvUngrabPort(display, port_num, CurrentTime);
+	XCloseDisplay(display);
+
+	return 0;
+}
diff --git a/src/gallium/state_trackers/xorg/xvmc/tests/testlib.c b/src/gallium/state_trackers/xorg/xvmc/tests/testlib.c
new file mode 100644
index 0000000000..59a03ca813
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/tests/testlib.c
@@ -0,0 +1,119 @@
+#include "testlib.h"
+#include <stdio.h>
+
+/*
+void test(int pred, const char *pred_string, const char *doc_string, const char *file, unsigned int line)
+{
+	fputs(doc_string, stderr);
+	if (!pred)
+		fprintf(stderr, " FAIL!\n\t\"%s\" at %s:%u\n", pred_string, file, line);
+	else
+		fputs(" PASS!\n", stderr);
+}
+*/
+
+int GetPort
+(
+	Display *display,
+	unsigned int width,
+	unsigned int height,
+	unsigned int chroma_format,
+	const unsigned int *mc_types,
+	unsigned int num_mc_types,
+	XvPortID *port_id,
+	int *surface_type_id,
+	unsigned int *is_overlay,
+	unsigned int *intra_unsigned
+)
+{
+	unsigned int	found_port = 0;
+	XvAdaptorInfo	*adaptor_info;
+	unsigned int	num_adaptors;
+	int		num_types;
+	int		ev_base, err_base;
+	unsigned int	i, j, k, l;
+
+	if (!XvMCQueryExtension(display, &ev_base, &err_base))
+		return 0;
+	if (XvQueryAdaptors(display, XDefaultRootWindow(display), &num_adaptors, &adaptor_info) != Success)
+		return 0;
+
+	for (i = 0; i < num_adaptors && !found_port; ++i)
+	{
+		if (adaptor_info[i].type & XvImageMask)
+		{
+			XvMCSurfaceInfo *surface_info = XvMCListSurfaceTypes(display, adaptor_info[i].base_id, &num_types);
+
+			if (surface_info)
+			{
+				for (j = 0; j < num_types && !found_port; ++j)
+				{
+					if
+					(
+						surface_info[j].chroma_format == chroma_format &&
+						surface_info[j].max_width >= width &&
+						surface_info[j].max_height >= height
+					)
+					{
+						for (k = 0; k < num_mc_types && !found_port; ++k)
+						{
+							if (surface_info[j].mc_type == mc_types[k])
+							{
+								for (l = 0; l < adaptor_info[i].num_ports && !found_port; ++l)
+								{
+									if (XvGrabPort(display, adaptor_info[i].base_id + l, CurrentTime) == Success)
+									{
+										*port_id = adaptor_info[i].base_id + l;
+										*surface_type_id = surface_info[j].surface_type_id;
+										*is_overlay = surface_info[j].flags & XVMC_OVERLAID_SURFACE;
+										*intra_unsigned = surface_info[j].flags & XVMC_INTRA_UNSIGNED;
+										found_port = 1;
+									}
+								}
+							}
+						}
+					}
+				}
+
+				XFree(surface_info);
+			}
+		}
+	}
+
+	XvFreeAdaptorInfo(adaptor_info);
+
+	return found_port;
+}
+
+unsigned int align(unsigned int value, unsigned int alignment)
+{
+	return (value + alignment - 1) & ~(alignment - 1);
+}
+
+/* From the glibc manual */
+int timeval_subtract(struct timeval *result, struct timeval *x, struct timeval *y)
+{
+	/* Perform the carry for the later subtraction by updating y. */
+	if (x->tv_usec < y->tv_usec)
+	{
+		int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1;
+		y->tv_usec -= 1000000 * nsec;
+		y->tv_sec += nsec;
+	}
+	if (x->tv_usec - y->tv_usec > 1000000)
+	{
+		int nsec = (x->tv_usec - y->tv_usec) / 1000000;
+		y->tv_usec += 1000000 * nsec;
+		y->tv_sec -= nsec;
+	}
+
+	/*
+	 * Compute the time remaining to wait.
+	 * tv_usec is certainly positive.
+	 */
+	result->tv_sec = x->tv_sec - y->tv_sec;
+	result->tv_usec = x->tv_usec - y->tv_usec;
+
+	/* Return 1 if result is negative. */
+	return x->tv_sec < y->tv_sec;
+}
diff --git a/src/gallium/state_trackers/xorg/xvmc/tests/testlib.h b/src/gallium/state_trackers/xorg/xvmc/tests/testlib.h
new file mode 100644
index 0000000000..af71ad74e1
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/tests/testlib.h
@@ -0,0 +1,42 @@
+#ifndef testlib_h
+#define testlib_h
+
+/*
+#define TEST(pred, doc)	test(pred, #pred, doc, __FILE__, __LINE__)
+
+void test(int pred, const char *pred_string, const char *doc_string, const char *file, unsigned int line);
+*/
+
+#include <sys/time.h>
+#include <X11/Xlib.h>
+#include <X11/extensions/XvMClib.h>
+
+/*
+ * display: IN			A valid X display
+ * width, height: IN		Surface size that the port must display
+ * chroma_format: IN		Chroma format that the port must display
+ * mc_types, num_mc_types: IN	List of MC types that the port must support, first port that matches the first mc_type will be returned
+ * port_id: OUT			Your port's ID
+ * surface_type_id: OUT		Your port's surface ID
+ * is_overlay: OUT		If 1, port uses overlay surfaces, you need to set a colorkey
+ * intra_unsigned: OUT		If 1, port uses unsigned values for intra-coded blocks
+ */
+int GetPort
+(
+	Display *display,
+	unsigned int width,
+	unsigned int height,
+	unsigned int chroma_format,
+	const unsigned int *mc_types,
+	unsigned int num_mc_types,
+	XvPortID *port_id,
+	int *surface_type_id,
+	unsigned int *is_overlay,
+	unsigned int *intra_unsigned
+);
+
+unsigned int align(unsigned int value, unsigned int alignment);
+
+int timeval_subtract(struct timeval *result, struct timeval *x, struct timeval *y);
+
+#endif
diff --git a/src/gallium/state_trackers/xorg/xvmc/tests/xvmc_bench.c b/src/gallium/state_trackers/xorg/xvmc/tests/xvmc_bench.c
new file mode 100644
index 0000000000..97adcfc58a
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/tests/xvmc_bench.c
@@ -0,0 +1,273 @@
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <error.h>
+#include <sys/time.h>
+#include "testlib.h"
+
+#define MACROBLOCK_WIDTH		16
+#define MACROBLOCK_HEIGHT		16
+#define BLOCKS_PER_MACROBLOCK		6
+
+#define DEFAULT_INPUT_WIDTH		720
+#define DEFAULT_INPUT_HEIGHT		480
+#define DEFAULT_REPS			100
+
+#define PIPELINE_STEP_MC		1
+#define PIPELINE_STEP_CSC		2
+#define PIPELINE_STEP_SWAP		4
+
+#define MB_TYPE_I			1
+#define MB_TYPE_P			2
+#define MB_TYPE_B			4
+
+struct Config
+{
+	unsigned int input_width;
+	unsigned int input_height;
+	unsigned int output_width;
+	unsigned int output_height;
+	unsigned int pipeline;
+	unsigned int mb_types;
+	unsigned int reps;
+};
+
+void ParseArgs(int argc, char **argv, struct Config *config);
+
+void ParseArgs(int argc, char **argv, struct Config *config)
+{
+	int fail = 0;
+	int i;
+
+	config->input_width = DEFAULT_INPUT_WIDTH;
+	config->input_height = DEFAULT_INPUT_HEIGHT;
+	config->output_width = 0;
+	config->output_height = 0;
+	config->pipeline = 0;
+	config->mb_types = 0;
+	config->reps = DEFAULT_REPS;
+
+	for (i = 1; i < argc && !fail; ++i)
+	{
+		if (!strcmp(argv[i], "-iw"))
+		{
+			if (sscanf(argv[++i], "%u", &config->input_width) != 1)
+				fail = 1;
+		}
+		else if (!strcmp(argv[i], "-ih"))
+		{
+			if (sscanf(argv[++i], "%u", &config->input_height) != 1)
+				fail = 1;
+		}
+		else if (!strcmp(argv[i], "-ow"))
+		{
+			if (sscanf(argv[++i], "%u", &config->output_width) != 1)
+				fail = 1;
+		}
+		else if (!strcmp(argv[i], "-oh"))
+		{
+			if (sscanf(argv[++i], "%u", &config->output_height) != 1)
+				fail = 1;
+		}
+		else if (!strcmp(argv[i], "-p"))
+		{
+			char *token = strtok(argv[++i], ",");
+
+			while (token && !fail)
+			{
+				if (!strcmp(token, "mc"))
+					config->pipeline |= PIPELINE_STEP_MC;
+				else if (!strcmp(token, "csc"))
+					config->pipeline |= PIPELINE_STEP_CSC;
+				else if (!strcmp(token, "swp"))
+					config->pipeline |= PIPELINE_STEP_SWAP;
+				else
+					fail = 1;
+
+				if (!fail)
+					token = strtok(NULL, ",");
+			}
+		}
+		else if (!strcmp(argv[i], "-mb"))
+		{
+			char *token = strtok(argv[++i], ",");
+
+			while (token && !fail)
+			{
+				if (strcmp(token, "i"))
+					config->mb_types |= MB_TYPE_I;
+				else if (strcmp(token, "p"))
+					config->mb_types |= MB_TYPE_P;
+				else if (strcmp(token, "b"))
+					config->mb_types |= MB_TYPE_B;
+				else
+					fail = 1;
+
+				if (!fail)
+					token = strtok(NULL, ",");
+			}
+		}
+		else if (!strcmp(argv[i], "-r"))
+		{
+			if (sscanf(argv[++i], "%u", &config->reps) != 1)
+				fail = 1;
+		}
+		else
+			fail = 1;
+	}
+
+	if (fail)
+		error
+		(
+			1, 0,
+			"Bad argument.\n"
+			"\n"
+			"Usage: %s [options]\n"
+			"\t-iw <width>\tInput width\n"
+			"\t-ih <height>\tInput height\n"
+			"\t-ow <width>\tOutput width\n"
+			"\t-oh <height>\tOutput height\n"
+			"\t-p <pipeline>\tPipeline to test\n"
+			"\t-mb <mb type>\tMacroBlock types to use\n"
+			"\t-r <reps>\tRepetitions\n\n"
+			"\tPipeline steps: mc,csc,swap\n"
+			"\tMB types: i,p,b\n",
+			argv[0]
+		);
+
+	if (config->output_width == 0)
+		config->output_width = config->input_width;
+	if (config->output_height == 0)
+		config->output_height = config->input_height;
+	if (!config->pipeline)
+		config->pipeline = PIPELINE_STEP_MC | PIPELINE_STEP_CSC | PIPELINE_STEP_SWAP;
+	if (!config->mb_types)
+		config->mb_types = MB_TYPE_I | MB_TYPE_P | MB_TYPE_B;
+}
+
+int main(int argc, char **argv)
+{
+	struct Config		config;
+	Display			*display;
+	Window			root, window;
+	const unsigned int	mc_types[2] = {XVMC_MOCOMP | XVMC_MPEG_2, XVMC_IDCT | XVMC_MPEG_2};
+	XvPortID		port_num;
+	int			surface_type_id;
+	unsigned int		is_overlay, intra_unsigned;
+	int			colorkey;
+	XvMCContext		context;
+	XvMCSurface		surface;
+	XvMCBlockArray		block_array;
+	XvMCMacroBlockArray	mb_array;
+	unsigned int		mbw, mbh;
+	unsigned int		mbx, mby;
+	unsigned int		reps;
+	struct timeval		start, stop, diff;
+	double			diff_secs;
+
+	ParseArgs(argc, argv, &config);
+
+	mbw = align(config.input_width, MACROBLOCK_WIDTH) / MACROBLOCK_WIDTH;
+	mbh = align(config.input_height, MACROBLOCK_HEIGHT) / MACROBLOCK_HEIGHT;
+
+	display = XOpenDisplay(NULL);
+
+	if (!GetPort
+	(
+		display,
+		config.input_width,
+		config.input_height,
+		XVMC_CHROMA_FORMAT_420,
+    		mc_types,
+    		2,
+    		&port_num,
+    		&surface_type_id,
+    		&is_overlay,
+    		&intra_unsigned
+	))
+	{
+		XCloseDisplay(display);
+		error(1, 0, "Error, unable to find a good port.\n");
+	}
+
+	if (is_overlay)
+	{
+		Atom xv_colorkey = XInternAtom(display, "XV_COLORKEY", 0);
+		XvGetPortAttribute(display, port_num, xv_colorkey, &colorkey);
+	}
+
+	root = XDefaultRootWindow(display);
+	window = XCreateSimpleWindow(display, root, 0, 0, config.output_width, config.output_height, 0, 0, colorkey);
+
+	assert(XvMCCreateContext(display, port_num, surface_type_id, config.input_width, config.input_height, XVMC_DIRECT, &context) == Success);
+	assert(XvMCCreateSurface(display, &context, &surface) == Success);
+	assert(XvMCCreateBlocks(display, &context, mbw * mbh * BLOCKS_PER_MACROBLOCK, &block_array) == Success);
+	assert(XvMCCreateMacroBlocks(display, &context, mbw * mbh, &mb_array) == Success);
+
+	for (mby = 0; mby < mbh; ++mby)
+		for (mbx = 0; mbx < mbw; ++mbx)
+		{
+			mb_array.macro_blocks[mby * mbw + mbx].x = mbx;
+			mb_array.macro_blocks[mby * mbw + mbx].y = mby;
+			mb_array.macro_blocks[mby * mbw + mbx].macroblock_type = XVMC_MB_TYPE_INTRA;
+			/*mb->motion_type = ;*/
+			/*mb->motion_vertical_field_select = ;*/
+			mb_array.macro_blocks[mby * mbw + mbx].dct_type = XVMC_DCT_TYPE_FRAME;
+			/*mb->PMV[0][0][0] = ;
+			mb->PMV[0][0][1] = ;
+			mb->PMV[0][1][0] = ;
+			mb->PMV[0][1][1] = ;
+			mb->PMV[1][0][0] = ;
+			mb->PMV[1][0][1] = ;
+			mb->PMV[1][1][0] = ;
+			mb->PMV[1][1][1] = ;*/
+			mb_array.macro_blocks[mby * mbw + mbx].index = (mby * mbw + mbx) * BLOCKS_PER_MACROBLOCK;
+			mb_array.macro_blocks[mby * mbw + mbx].coded_block_pattern = 0x3F;
+		}
+
+	XSelectInput(display, window, ExposureMask | KeyPressMask);
+	XMapWindow(display, window);
+	XSync(display, 0);
+
+	gettimeofday(&start, NULL);
+
+	for (reps = 0; reps < config.reps; ++reps)
+	{
+		if (config.pipeline & PIPELINE_STEP_MC)
+		{
+			assert(XvMCRenderSurface(display, &context, XVMC_FRAME_PICTURE, &surface, NULL, NULL, 0, mbw * mbh, 0, &mb_array, &block_array) == Success);
+			assert(XvMCFlushSurface(display, &surface) == Success);
+		}
+		if (config.pipeline & PIPELINE_STEP_CSC)
+			assert(XvMCPutSurface(display, &surface, window, 0, 0, config.input_width, config.input_height, 0, 0, config.output_width, config.output_height, XVMC_FRAME_PICTURE) == Success);
+	}
+
+	gettimeofday(&stop, NULL);
+
+	timeval_subtract(&diff, &stop, &start);
+	diff_secs = (double)diff.tv_sec + (double)diff.tv_usec / 1000000.0;
+
+	printf("XvMC Benchmark\n");
+	printf("Input: %u,%u\nOutput: %u,%u\n", config.input_width, config.input_height, config.output_width, config.output_height);
+	printf("Pipeline: ");
+	if (config.pipeline & PIPELINE_STEP_MC)
+		printf("|mc|");
+	if (config.pipeline & PIPELINE_STEP_CSC)
+		printf("|csc|");
+	if (config.pipeline & PIPELINE_STEP_SWAP)
+		printf("|swap|");
+	printf("\n");
+	printf("Reps: %u\n", config.reps);
+	printf("Total time: %.2lf (%.2lf reps / sec)\n", diff_secs, config.reps / diff_secs);
+
+	assert(XvMCDestroyBlocks(display, &block_array) == Success);
+	assert(XvMCDestroyMacroBlocks(display, &mb_array) == Success);
+	assert(XvMCDestroySurface(display, &surface) == Success);
+	assert(XvMCDestroyContext(display, &context) == Success);
+
+	XvUngrabPort(display, port_num, CurrentTime);
+	XDestroyWindow(display, window);
+	XCloseDisplay(display);
+
+	return 0;
+}
diff --git a/src/gallium/state_trackers/xorg/xvmc/xvmc_private.h b/src/gallium/state_trackers/xorg/xvmc/xvmc_private.h
new file mode 100644
index 0000000000..1e3dd561c6
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/xvmc_private.h
@@ -0,0 +1,31 @@
+#ifndef xvmc_private_h
+#define xvmc_private_h
+
+#include <X11/Xlib.h>
+#include <X11/extensions/XvMClib.h>
+
+#define BLOCK_SIZE_SAMPLES 64
+#define BLOCK_SIZE_BYTES (BLOCK_SIZE_SAMPLES * 2)
+
+struct pipe_video_context;
+struct pipe_surface;
+struct pipe_fence_handle;
+
+typedef struct
+{
+	struct pipe_video_context *vpipe;
+	struct pipe_surface *backbuffer;
+} XvMCContextPrivate;
+
+typedef struct
+{
+	struct pipe_video_surface *pipe_vsfc;
+	struct pipe_fence_handle *render_fence;
+	struct pipe_fence_handle *disp_fence;
+	
+	/* Some XvMC functions take a surface but not a context,
+	   so we keep track of which context each surface belongs to. */
+	XvMCContext *context;
+} XvMCSurfacePrivate;
+
+#endif /* xvmc_private_h */
diff --git a/src/gallium/winsys/drm/intel/dri/SConscript b/src/gallium/winsys/drm/intel/dri/SConscript
index 6c00861f51..f973811072 100644
--- a/src/gallium/winsys/drm/intel/dri/SConscript
+++ b/src/gallium/winsys/drm/intel/dri/SConscript
@@ -12,8 +12,9 @@ drivers = [
     trace,
 ]
 
-env.SharedLibrary(
+env.LoadableModule(
     target ='i915_dri.so',
     source = COMMON_GALLIUM_SOURCES,
     LIBS = drivers + mesa + auxiliaries + env['LIBS'],
+    SHLIBPREFIX = '',
 )
diff --git a/src/gallium/winsys/drm/intel/gem/intel_drm_api.c b/src/gallium/winsys/drm/intel/gem/intel_drm_api.c
index 4c5a1d2ea8..8b647a769b 100644
--- a/src/gallium/winsys/drm/intel/gem/intel_drm_api.c
+++ b/src/gallium/winsys/drm/intel/gem/intel_drm_api.c
@@ -7,6 +7,7 @@
 #include "i915simple/i915_context.h"
 #include "i915simple/i915_screen.h"
 
+#include "trace/tr_drm.h"
 
 /*
  * Helper functions
@@ -40,6 +41,7 @@ intel_drm_buffer_from_handle(struct intel_drm_winsys *idws,
                              const char* name, unsigned handle)
 {
    struct intel_drm_buffer *buf = CALLOC_STRUCT(intel_drm_buffer);
+   uint32_t tile = 0, swizzle = 0;
 
    if (!buf)
       return NULL;
@@ -52,6 +54,10 @@ intel_drm_buffer_from_handle(struct intel_drm_winsys *idws,
    if (!buf->bo)
       goto err;
 
+   drm_intel_bo_get_tiling(buf->bo, &tile, &swizzle);
+   if (tile != INTEL_TILE_NONE)
+      buf->map_gtt = TRUE;
+
    return (struct intel_buffer *)buf;
 
 err:
@@ -166,6 +172,7 @@ intel_drm_create_screen(struct drm_api *api, int drmFD,
    idws->base.destroy = intel_drm_winsys_destroy;
 
    idws->pools.gem = drm_intel_bufmgr_gem_init(idws->fd, idws->max_batch_size);
+   drm_intel_bufmgr_gem_enable_reuse(idws->pools.gem);
 
    idws->softpipe = FALSE;
    idws->dump_cmd = debug_get_bool_option("INTEL_DUMP_CMD", FALSE);
@@ -198,5 +205,5 @@ struct drm_api intel_drm_api =
 struct drm_api *
 drm_api_create()
 {
-   return &intel_drm_api;
+   return trace_drm_create(&intel_drm_api);
 }
diff --git a/src/gallium/winsys/drm/intel/gem/intel_drm_batchbuffer.c b/src/gallium/winsys/drm/intel/gem/intel_drm_batchbuffer.c
index 5ca3ad9762..ebd1b607b7 100644
--- a/src/gallium/winsys/drm/intel/gem/intel_drm_batchbuffer.c
+++ b/src/gallium/winsys/drm/intel/gem/intel_drm_batchbuffer.c
@@ -13,6 +13,7 @@
 #define INTEL_BATCH_CLIPRECTS    0x2
 
 #undef INTEL_RUN_SYNC
+#undef INTEL_MAP_BATCHBUFFER
 
 struct intel_drm_batchbuffer
 {
@@ -40,8 +41,11 @@ intel_drm_batchbuffer_reset(struct intel_drm_batchbuffer *batch)
                                   "gallium3d_batchbuffer",
                                   batch->actual_size,
                                   4096);
+
+#ifdef INTEL_MAP_BATCHBUFFER
    drm_intel_bo_map(batch->bo, TRUE);
    batch->base.map = batch->bo->virtual;
+#endif
 
    memset(batch->base.map, 0, batch->actual_size);
    batch->base.ptr = batch->base.map;
@@ -55,7 +59,13 @@ intel_drm_batchbuffer_create(struct intel_winsys *iws)
    struct intel_drm_winsys *idws = intel_drm_winsys(iws);
    struct intel_drm_batchbuffer *batch = CALLOC_STRUCT(intel_drm_batchbuffer);
 
+   batch->actual_size = idws->max_batch_size;
+
+#ifdef INTEL_MAP_BATCHBUFFER
    batch->base.map = NULL;
+#else
+   batch->base.map = MALLOC(batch->actual_size);
+#endif
    batch->base.ptr = NULL;
    batch->base.size = 0;
 
@@ -64,8 +74,6 @@ intel_drm_batchbuffer_create(struct intel_winsys *iws)
 
    batch->base.iws = iws;
 
-   batch->actual_size = idws->max_batch_size;
-
    intel_drm_batchbuffer_reset(batch);
 
    return &batch->base;
@@ -156,7 +164,11 @@ intel_drm_batchbuffer_flush(struct intel_batchbuffer *ibatch,
 
    used = batch->base.ptr - batch->base.map;
 
+#ifdef INTEL_MAP_BATCHBUFFER
    drm_intel_bo_unmap(batch->bo);
+#else
+   drm_intel_bo_subdata(batch->bo, 0, used, batch->base.map);
+#endif
 
    /* Do the sending to HW */
    ret = drm_intel_bo_exec(batch->bo, used, NULL, 0, 0);
@@ -202,7 +214,10 @@ intel_drm_batchbuffer_destroy(struct intel_batchbuffer *ibatch)
    if (batch->bo)
       drm_intel_bo_unreference(batch->bo);
 
-   free(batch);
+#ifndef INTEL_MAP_BATCHBUFFER
+   FREE(batch->base.map);
+#endif
+   FREE(batch);
 }
 
 void intel_drm_winsys_init_batchbuffer_functions(struct intel_drm_winsys *idws)
diff --git a/src/gallium/winsys/drm/intel/gem/intel_drm_buffer.c b/src/gallium/winsys/drm/intel/gem/intel_drm_buffer.c
index e017cd2e98..327e19fcd6 100644
--- a/src/gallium/winsys/drm/intel/gem/intel_drm_buffer.c
+++ b/src/gallium/winsys/drm/intel/gem/intel_drm_buffer.c
@@ -28,6 +28,7 @@ intel_drm_buffer_create(struct intel_winsys *iws,
    } else if (type == INTEL_NEW_VERTEX) {
       name = "gallium3d_vertex";
       pool = idws->pools.gem;
+      buf->map_gtt = TRUE;
    } else if (type == INTEL_NEW_SCANOUT) {
       name = "gallium3d_scanout";
       pool = idws->pools.gem;
@@ -57,11 +58,17 @@ intel_drm_buffer_set_fence_reg(struct intel_winsys *iws,
                                unsigned stride,
                                enum intel_buffer_tile tile)
 {
+   struct intel_drm_buffer *buf = intel_drm_buffer(buffer);
    assert(I915_TILING_NONE == INTEL_TILE_NONE);
    assert(I915_TILING_X == INTEL_TILE_X);
    assert(I915_TILING_Y == INTEL_TILE_Y);
 
-   return drm_intel_bo_set_tiling(intel_bo(buffer), &tile, stride);
+   if (tile != INTEL_TILE_NONE) {
+      assert(buf->map_count == 0);
+      buf->map_gtt = TRUE;
+   }
+
+   return drm_intel_bo_set_tiling(buf->bo, &tile, stride);
 }
 
 static void *
@@ -109,6 +116,18 @@ intel_drm_buffer_unmap(struct intel_winsys *iws,
       drm_intel_bo_unmap(intel_bo(buffer));
 }
 
+static int
+intel_drm_buffer_write(struct intel_winsys *iws,
+                       struct intel_buffer *buffer,
+                       const void *data,
+                       size_t size,
+                       size_t offset)
+{
+   struct intel_drm_buffer *buf = intel_drm_buffer(buffer);
+
+   return drm_intel_bo_subdata(buf->bo, offset, size, (void*)data);
+}
+
 static void
 intel_drm_buffer_destroy(struct intel_winsys *iws,
                          struct intel_buffer *buffer)
@@ -130,5 +149,6 @@ intel_drm_winsys_init_buffer_functions(struct intel_drm_winsys *idws)
    idws->base.buffer_set_fence_reg = intel_drm_buffer_set_fence_reg;
    idws->base.buffer_map = intel_drm_buffer_map;
    idws->base.buffer_unmap = intel_drm_buffer_unmap;
+   idws->base.buffer_write = intel_drm_buffer_write;
    idws->base.buffer_destroy = intel_drm_buffer_destroy;
 }
diff --git a/src/gallium/winsys/drm/nouveau/drm/nouveau_drm_api.c b/src/gallium/winsys/drm/nouveau/drm/nouveau_drm_api.c
index 091cbbcfed..117ca6059b 100644
--- a/src/gallium/winsys/drm/nouveau/drm/nouveau_drm_api.c
+++ b/src/gallium/winsys/drm/nouveau/drm/nouveau_drm_api.c
@@ -112,7 +112,7 @@ nouveau_drm_create_screen(struct drm_api *api, int fd,
 		return NULL;
 	}
 
-	if (arg->mode == DRM_CREATE_DRI1) {
+	if (arg && arg->mode == DRM_CREATE_DRI1) {
 		struct nouveau_dri *nvdri = dri1->ddx_info;
 		enum pipe_format format;
 
diff --git a/src/gallium/winsys/drm/nouveau/xorg/Makefile b/src/gallium/winsys/drm/nouveau/xorg/Makefile
new file mode 100644
index 0000000000..f0d3b337e8
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/xorg/Makefile
@@ -0,0 +1,61 @@
+TARGET     = modesetting_drv.so
+CFILES     = $(wildcard ./*.c)
+OBJECTS    = $(patsubst ./%.c,./%.o,$(CFILES))
+TOP        = ../../../../../..
+
+include $(TOP)/configs/current
+
+INCLUDES = \
+	$(shell pkg-config --cflags-only-I pixman-1 xorg-server libdrm xproto) \
+	-I../gem \
+	-I$(TOP)/src/gallium/include \
+	-I$(TOP)/src/gallium/drivers \
+	-I$(TOP)/src/gallium/auxiliary \
+	-I$(TOP)/src/mesa \
+	-I$(TOP)/include \
+	-I$(TOP)/src/egl/main
+
+LIBS = \
+	$(TOP)/src/gallium/state_trackers/xorg/libxorgtracker.a \
+	$(TOP)/src/gallium/winsys/drm/nouveau/drm/libnouveaudrm.a \
+	$(TOP)/src/gallium/drivers/nv04/libnv04.a \
+	$(TOP)/src/gallium/drivers/nv10/libnv10.a \
+	$(TOP)/src/gallium/drivers/nv20/libnv20.a \
+	$(TOP)/src/gallium/drivers/nv30/libnv30.a \
+	$(TOP)/src/gallium/drivers/nv40/libnv40.a \
+	$(TOP)/src/gallium/drivers/nv50/libnv50.a \
+	$(TOP)/src/gallium/drivers/nouveau/libnouveau.a \
+	$(GALLIUM_AUXILIARIES)
+
+DRIVER_DEFINES = \
+	-DHAVE_CONFIG_H
+
+
+#############################################
+
+
+
+all default: $(TARGET)
+
+$(TARGET): $(OBJECTS) Makefile $(TOP)/src/gallium/state_trackers/xorg/libxorgtracker.a $(LIBS)
+	$(TOP)/bin/mklib -noprefix -o $@ \
+	$(OBJECTS) $(LIBS) $(shell pkg-config --libs libdrm) -ldrm_nouveau
+
+clean:
+	rm -rf $(OBJECTS) $(TARGET)
+
+install:
+	$(INSTALL) -d $(DESTDIR)/$(XORG_DRIVER_INSTALL_DIR)
+	$(MINSTALL) -m 755 $(TARGET) $(DESTDIR)/$(XORG_DRIVER_INSTALL_DIR)
+
+
+##############################################
+
+
+.c.o:
+	$(CC) -c $(CFLAGS) $(INCLUDES) $(DRIVER_DEFINES) $< -o $@
+
+
+##############################################
+
+.PHONY	= all clean install
diff --git a/src/gallium/winsys/drm/nouveau/xorg/nouveau_xorg.c b/src/gallium/winsys/drm/nouveau/xorg/nouveau_xorg.c
new file mode 100644
index 0000000000..a669b3080a
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/xorg/nouveau_xorg.c
@@ -0,0 +1,149 @@
+/*
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ *
+ * Author: Alan Hourihane <alanh@tungstengraphics.com>
+ * Author: Jakob Bornecrantz <wallbraker@gmail.com>
+ *
+ */
+
+#include "../../../../state_trackers/xorg/xorg_winsys.h"
+
+static void nouveau_xorg_identify(int flags);
+static Bool nouveau_xorg_pci_probe(DriverPtr driver, int entity_num,
+				   struct pci_device *device,
+				   intptr_t match_data);
+
+static const struct pci_id_match nouveau_xorg_device_match[] = {
+    { 0x10de, PCI_MATCH_ANY, PCI_MATCH_ANY, PCI_MATCH_ANY,
+      0x00030000, 0x00ffffff, 0 },
+    { 0x12d2, PCI_MATCH_ANY, PCI_MATCH_ANY, PCI_MATCH_ANY,
+      0x00030000, 0x00ffffff, 0 },
+    {0, 0, 0},
+};
+
+static SymTabRec nouveau_xorg_chipsets[] = {
+    {PCI_MATCH_ANY, "NVIDIA Graphics Device"},
+    {-1, NULL}
+};
+
+static PciChipsets nouveau_xorg_pci_devices[] = {
+    {PCI_MATCH_ANY, PCI_MATCH_ANY, NULL},
+    {-1, -1, NULL}
+};
+
+static XF86ModuleVersionInfo nouveau_xorg_version = {
+    "modesetting",
+    MODULEVENDORSTRING,
+    MODINFOSTRING1,
+    MODINFOSTRING2,
+    XORG_VERSION_CURRENT,
+    0, 1, 0, /* major, minor, patch */
+    ABI_CLASS_VIDEODRV,
+    ABI_VIDEODRV_VERSION,
+    MOD_CLASS_VIDEODRV,
+    {0, 0, 0, 0}
+};
+
+/*
+ * Xorg driver exported structures
+ */
+
+_X_EXPORT DriverRec modesetting = {
+    1,
+    "modesetting",
+    nouveau_xorg_identify,
+    NULL,
+    xorg_tracker_available_options,
+    NULL,
+    0,
+    NULL,
+    nouveau_xorg_device_match,
+    nouveau_xorg_pci_probe
+};
+
+static MODULESETUPPROTO(nouveau_xorg_setup);
+
+_X_EXPORT XF86ModuleData modesettingModuleData = {
+    &nouveau_xorg_version,
+    nouveau_xorg_setup,
+    NULL
+};
+
+/*
+ * Xorg driver functions
+ */
+
+static pointer
+nouveau_xorg_setup(pointer module, pointer opts, int *errmaj, int *errmin)
+{
+    static Bool setupDone = 0;
+
+    /* This module should be loaded only once, but check to be sure.
+     */
+    if (!setupDone) {
+	setupDone = 1;
+	xf86AddDriver(&modesetting, module, HaveDriverFuncs);
+
+	/*
+	 * The return value must be non-NULL on success even though there
+	 * is no TearDownProc.
+	 */
+	return (pointer) 1;
+    } else {
+	if (errmaj)
+	    *errmaj = LDR_ONCEONLY;
+	return NULL;
+    }
+}
+
+static void
+nouveau_xorg_identify(int flags)
+{
+    xf86PrintChipsets("modesetting", "Driver for Modesetting Kernel Drivers",
+		      nouveau_xorg_chipsets);
+}
+
+static Bool
+nouveau_xorg_pci_probe(DriverPtr driver,
+	  int entity_num, struct pci_device *device, intptr_t match_data)
+{
+    ScrnInfoPtr scrn = NULL;
+    EntityInfoPtr entity;
+
+    scrn = xf86ConfigPciEntity(scrn, 0, entity_num, nouveau_xorg_pci_devices,
+			       NULL, NULL, NULL, NULL, NULL);
+    if (scrn != NULL) {
+	scrn->driverVersion = 1;
+	scrn->driverName = "i915";
+	scrn->name = "modesetting";
+	scrn->Probe = NULL;
+
+	entity = xf86GetEntityInfo(entity_num);
+
+	/* Use all the functions from the xorg tracker */
+	xorg_tracker_set_functions(scrn);
+    }
+    return scrn != NULL;
+}
diff --git a/src/gallium/winsys/drm/radeon/core/radeon_buffer.c b/src/gallium/winsys/drm/radeon/core/radeon_buffer.c
index 07551e7cd1..7bf23cba23 100644
--- a/src/gallium/winsys/drm/radeon/core/radeon_buffer.c
+++ b/src/gallium/winsys/drm/radeon/core/radeon_buffer.c
@@ -32,6 +32,8 @@
 
 #include "radeon_buffer.h"
 
+#include "radeon_bo_gem.h"
+
 static const char *radeon_get_name(struct pipe_winsys *ws)
 {
     return "Radeon/GEM+KMS";
@@ -99,6 +101,7 @@ static struct pipe_buffer *radeon_surface_buffer_create(struct pipe_winsys *ws,
                                                         unsigned height,
                                                         enum pipe_format format,
                                                         unsigned usage,
+                                                        unsigned tex_usage,
                                                         unsigned *stride)
 {
     struct pipe_format_block block;
@@ -134,8 +137,11 @@ static void *radeon_buffer_map(struct pipe_winsys *ws,
         (struct radeon_pipe_buffer*)buffer;
     int write = 0;
 
-    if (!(flags & PIPE_BUFFER_USAGE_DONTBLOCK)) {
-        radeon_bo_wait(radeon_buffer->bo);
+    if (flags & PIPE_BUFFER_USAGE_DONTBLOCK) {
+        uint32_t domain;
+
+        if (radeon_bo_is_busy(radeon_buffer->bo, &domain))
+            return NULL;
     }
     if (flags & PIPE_BUFFER_USAGE_CPU_WRITE) {
         write = 1;
@@ -187,7 +193,6 @@ static void radeon_flush_frontbuffer(struct pipe_winsys *pipe_winsys,
 struct radeon_winsys* radeon_pipe_winsys(int fd)
 {
     struct radeon_winsys* radeon_ws;
-    struct radeon_bo_manager* bom;
 
     radeon_ws = CALLOC_STRUCT(radeon_winsys);
     if (radeon_ws == NULL) {
diff --git a/src/gallium/winsys/drm/radeon/core/radeon_drm.c b/src/gallium/winsys/drm/radeon/core/radeon_drm.c
index 47376a0f07..a4011db0b8 100644
--- a/src/gallium/winsys/drm/radeon/core/radeon_drm.c
+++ b/src/gallium/winsys/drm/radeon/core/radeon_drm.c
@@ -98,7 +98,7 @@ struct pipe_buffer* radeon_buffer_from_handle(struct drm_api* api,
     return &radeon_buffer->base;
 }
 
-struct pipe_texture*
+static struct pipe_texture*
 radeon_texture_from_shared_handle(struct drm_api *api,
                                   struct pipe_screen *screen,
                                   struct pipe_texture *templ,
@@ -116,20 +116,22 @@ radeon_texture_from_shared_handle(struct drm_api *api,
     return screen->texture_blanket(screen, templ, &stride, buffer);
 }
 
-boolean radeon_shared_handle_from_texture(struct drm_api *api,
-                                          struct pipe_screen *screen,
-                                          struct pipe_texture *texture,
-                                          unsigned *stride,
-                                          unsigned *handle)
+static boolean radeon_shared_handle_from_texture(struct drm_api *api,
+                                                 struct pipe_screen *screen,
+                                                 struct pipe_texture *texture,
+                                                 unsigned *stride,
+                                                 unsigned *handle)
 {
     int retval, fd;
     struct drm_gem_flink flink;
     struct radeon_pipe_buffer* radeon_buffer;
-    struct pipe_buffer* buffer = &radeon_buffer->base;
-    if (!radeon_buffer_from_texture(api, texture, buffer, stride)) {
+    struct pipe_buffer *buffer;
+
+    if (!radeon_buffer_from_texture(api, texture, &buffer, stride)) {
         return FALSE;
     }
 
+    radeon_buffer = (struct radeon_pipe_buffer*)buffer;
     if (!radeon_buffer->flinked) {
         fd = ((struct radeon_winsys*)screen->winsys)->priv->fd;
 
@@ -150,11 +152,11 @@ boolean radeon_shared_handle_from_texture(struct drm_api *api,
     return TRUE;
 }
 
-boolean radeon_local_handle_from_texture(struct drm_api *api,
-                                         struct pipe_screen *screen,
-                                         struct pipe_texture *texture,
-                                         unsigned *stride,
-                                         unsigned *handle)
+static boolean radeon_local_handle_from_texture(struct drm_api *api,
+                                                struct pipe_screen *screen,
+                                                struct pipe_texture *texture,
+                                                unsigned *stride,
+                                                unsigned *handle)
 {
     struct pipe_buffer *buffer;
     if (!radeon_buffer_from_texture(api, texture, &buffer, stride)) {
diff --git a/src/gallium/winsys/drm/radeon/core/radeon_r300.c b/src/gallium/winsys/drm/radeon/core/radeon_r300.c
index d723876221..d2d84f1a8f 100644
--- a/src/gallium/winsys/drm/radeon/core/radeon_r300.c
+++ b/src/gallium/winsys/drm/radeon/core/radeon_r300.c
@@ -137,7 +137,7 @@ static void do_ioctls(struct r300_winsys* winsys, int fd)
     int target = 0;
     int retval;
 
-    info.value = &target;
+    info.value = (unsigned long)&target;
 
     /* First, get the number of pixel pipes */
     info.request = RADEON_INFO_NUM_GB_PIPES;
diff --git a/src/gallium/winsys/g3dvl/Makefile b/src/gallium/winsys/g3dvl/Makefile
new file mode 100644
index 0000000000..424ddea87a
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/Makefile
@@ -0,0 +1,11 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+SUBDIRS = $(GALLIUM_WINSYS_DIRS)
+
+default install clean:
+	@for dir in $(SUBDIRS) ; do \
+		if [ -d $$dir ] ; then \
+			(cd $$dir && $(MAKE) $@) || exit 1; \
+		fi \
+	done
diff --git a/src/gallium/winsys/g3dvl/vl_winsys.h b/src/gallium/winsys/g3dvl/vl_winsys.h
index c83db28dd9..4f7a243361 100644
--- a/src/gallium/winsys/g3dvl/vl_winsys.h
+++ b/src/gallium/winsys/g3dvl/vl_winsys.h
@@ -2,13 +2,22 @@
 #define vl_winsys_h
 
 #include <X11/Xlib.h>
+#include <pipe/p_defines.h>
+#include <pipe/p_format.h>
 
-struct pipe_context;
+struct pipe_screen;
+struct pipe_video_context;
 
-struct pipe_context* create_pipe_context(Display *display, int screen);
-int destroy_pipe_context(struct pipe_context *pipe);
-int bind_pipe_drawable(struct pipe_context *pipe, Drawable drawable);
-int unbind_pipe_drawable(struct pipe_context *pipe);
+struct pipe_screen*
+vl_screen_create(Display *display, int screen);
 
-#endif
+struct pipe_video_context*
+vl_video_create(struct pipe_screen *screen,
+                enum pipe_video_profile profile,
+                enum pipe_video_chroma_format chroma_format,
+                unsigned width, unsigned height);
+
+Drawable
+vl_video_bind_drawable(struct pipe_video_context *vpipe, Drawable drawable);
 
+#endif
diff --git a/src/gallium/winsys/g3dvl/xlib/Makefile b/src/gallium/winsys/g3dvl/xlib/Makefile
new file mode 100644
index 0000000000..cf765ef51a
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/xlib/Makefile
@@ -0,0 +1,74 @@
+# This makefile produces a "stand-alone" libXvMCg3dvl.so which is
+# based on Xlib (no DRI HW acceleration)
+
+TOP = ../../../../..
+include $(TOP)/configs/current
+
+XVMC_MAJOR = 1
+XVMC_MINOR = 0
+XVMC_LIB = XvMCg3dvl
+XVMC_LIB_NAME = lib$(XVMC_LIB).so
+XVMC_LIB_DEPS = $(EXTRA_LIB_PATH) -lXvMC -lXv -lX11 -lm
+
+INCLUDES = -I$(TOP)/src/gallium/include \
+           -I$(TOP)/src/gallium/auxiliary \
+           -I$(TOP)/src/gallium/drivers \
+           -I$(TOP)/src/gallium/winsys/g3dvl
+
+DEFINES += -DGALLIUM_SOFTPIPE \
+	   -DGALLIUM_TRACE
+
+SOURCES = xsp_winsys.c
+
+# XXX: Hack, if we include libxvmctracker.a in LIBS none of the symbols are
+# pulled in by the linker because xsp_winsys.c doesn't refer to them
+OBJECTS = $(SOURCES:.c=.o) $(TOP)/src/gallium/state_trackers/xorg/xvmc/*.o
+
+LIBS = $(TOP)/src/gallium/drivers/softpipe/libsoftpipe.a \
+       $(TOP)/src/gallium/auxiliary/vl/libvl.a \
+       $(TOP)/src/gallium/auxiliary/tgsi/libtgsi.a \
+       $(TOP)/src/gallium/auxiliary/draw/libdraw.a \
+       $(TOP)/src/gallium/auxiliary/translate/libtranslate.a \
+       $(TOP)/src/gallium/auxiliary/cso_cache/libcso_cache.a \
+       $(TOP)/src/gallium/auxiliary/rtasm/librtasm.a \
+       $(TOP)/src/gallium/auxiliary/util/libutil.a
+
+.c.o:
+	$(CC) -c $(INCLUDES) $(DEFINES) $(CFLAGS) $< -o $@
+
+.S.o:
+	$(CC) -c $(INCLUDES) $(DEFINES) $(CFLAGS) $< -o $@
+
+.PHONY: default $(TOP)/$(LIB_DIR)/gallium clean
+
+default: depend $(TOP)/$(LIB_DIR)/gallium $(TOP)/$(LIB_DIR)/gallium/$(XVMC_LIB_NAME)
+
+$(TOP)/$(LIB_DIR)/gallium:
+	@mkdir -p $(TOP)/$(LIB_DIR)/gallium
+
+# Make the libXvMCg3dvl.so library
+$(TOP)/$(LIB_DIR)/gallium/$(XVMC_LIB_NAME): $(OBJECTS) $(LIBS) Makefile
+	$(MKLIB) -o $(XVMC_LIB) -linker '$(CC)' -ldflags '$(LDFLAGS)' \
+		-major $(XVMC_MAJOR) -minor $(XVMC_MINOR) $(MKLIB_OPTIONS) \
+		-install $(TOP)/$(LIB_DIR)/gallium -id $(INSTALL_LIB_DIR)/lib$(XVMC_LIB).1.dylib \
+		$(XVMC_LIB_DEPS) $(OBJECTS) $(LIBS)
+
+depend: $(SOURCES) Makefile
+	$(RM) depend
+	touch depend
+	$(MKDEP) $(MKDEP_OPTIONS) $(DEFINES) $(INCLUDES) $(SOURCES)
+
+#install: default
+#	$(INSTALL) -d $(INSTALL_DIR)/include/GL
+#	$(INSTALL) -d $(INSTALL_DIR)/$(LIB_DIR)
+#	$(INSTALL) -m 644 $(TOP)/include/GL/*.h $(INSTALL_DIR)/include/GL
+#	@if [ -e $(TOP)/$(LIB_DIR)/$(GL_LIB_NAME) ]; then \
+#		$(INSTALL) $(TOP)/$(LIB_DIR)/libGL* $(INSTALL_DIR)/$(LIB_DIR); \
+#	fi
+
+clean: Makefile
+	$(RM) $(TOP)/$(LIB_DIR)/gallium/$(XVMC_LIB_NAME)
+	$(RM) *.o *~
+	$(RM) depend depend.bak
+
+-include depend
diff --git a/src/gallium/winsys/g3dvl/xlib/xsp_winsys.c b/src/gallium/winsys/g3dvl/xlib/xsp_winsys.c
new file mode 100644
index 0000000000..37eee79c5d
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/xlib/xsp_winsys.c
@@ -0,0 +1,307 @@
+#include <vl_winsys.h>
+#include <X11/Xutil.h>
+#include <pipe/internal/p_winsys_screen.h>
+#include <pipe/p_state.h>
+#include <pipe/p_inlines.h>
+#include <util/u_memory.h>
+#include <util/u_math.h>
+#include <softpipe/sp_winsys.h>
+#include <softpipe/sp_video_context.h>
+#include <softpipe/sp_texture.h>
+
+/* pipe_winsys implementation */
+
+struct xsp_pipe_winsys
+{
+	struct pipe_winsys	base;
+	Display			*display;
+	int			screen;
+	XImage			*fbimage;
+};
+
+struct xsp_context
+{
+	Drawable		drawable;
+
+	void (*pipe_destroy)(struct pipe_video_context *vpipe);
+};
+
+struct xsp_buffer
+{
+	struct pipe_buffer	base;
+	boolean			is_user_buffer;
+	void			*data;
+	void			*mapped_data;
+};
+
+static struct pipe_buffer* xsp_buffer_create(struct pipe_winsys *pws, unsigned alignment, unsigned usage, unsigned size)
+{
+	struct xsp_buffer *buffer;
+
+	assert(pws);
+
+	buffer = calloc(1, sizeof(struct xsp_buffer));
+	pipe_reference_init(&buffer->base.reference, 1);
+	buffer->base.alignment = alignment;
+	buffer->base.usage = usage;
+	buffer->base.size = size;
+	buffer->data = align_malloc(size, alignment);
+
+	return (struct pipe_buffer*)buffer;
+}
+
+static struct pipe_buffer* xsp_user_buffer_create(struct pipe_winsys *pws, void *data, unsigned size)
+{
+	struct xsp_buffer *buffer;
+
+	assert(pws);
+
+	buffer = calloc(1, sizeof(struct xsp_buffer));
+	pipe_reference_init(&buffer->base.reference, 1);
+	buffer->base.size = size;
+	buffer->is_user_buffer = TRUE;
+	buffer->data = data;
+
+	return (struct pipe_buffer*)buffer;
+}
+
+static void* xsp_buffer_map(struct pipe_winsys *pws, struct pipe_buffer *buffer, unsigned flags)
+{
+	struct xsp_buffer *xsp_buf = (struct xsp_buffer*)buffer;
+
+	assert(pws);
+	assert(buffer);
+
+	xsp_buf->mapped_data = xsp_buf->data;
+
+	return xsp_buf->mapped_data;
+}
+
+static void xsp_buffer_unmap(struct pipe_winsys *pws, struct pipe_buffer *buffer)
+{
+	struct xsp_buffer *xsp_buf = (struct xsp_buffer*)buffer;
+
+	assert(pws);
+	assert(buffer);
+
+	xsp_buf->mapped_data = NULL;
+}
+
+static void xsp_buffer_destroy(struct pipe_buffer *buffer)
+{
+	struct xsp_buffer *xsp_buf = (struct xsp_buffer*)buffer;
+
+	assert(buffer);
+
+	if (!xsp_buf->is_user_buffer)
+		align_free(xsp_buf->data);
+
+	free(xsp_buf);
+}
+
+static struct pipe_buffer* xsp_surface_buffer_create
+(
+	struct pipe_winsys *pws,
+	unsigned width,
+	unsigned height,
+	enum pipe_format format,
+	unsigned usage,
+	unsigned tex_usage,
+	unsigned *stride
+)
+{
+	const unsigned int ALIGNMENT = 1;
+	struct pipe_format_block block;
+	unsigned nblocksx, nblocksy;
+
+	pf_get_block(format, &block);
+	nblocksx = pf_get_nblocksx(&block, width);
+	nblocksy = pf_get_nblocksy(&block, height);
+	*stride = align(nblocksx * block.size, ALIGNMENT);
+
+	return pws->buffer_create(pws, ALIGNMENT,
+				  usage,
+				  *stride * nblocksy);
+}
+
+static void xsp_fence_reference(struct pipe_winsys *pws, struct pipe_fence_handle **ptr, struct pipe_fence_handle *fence)
+{
+	assert(pws);
+	assert(ptr);
+	assert(fence);
+}
+
+static int xsp_fence_signalled(struct pipe_winsys *pws, struct pipe_fence_handle *fence, unsigned flag)
+{
+	assert(pws);
+	assert(fence);
+
+	return 0;
+}
+
+static int xsp_fence_finish(struct pipe_winsys *pws, struct pipe_fence_handle *fence, unsigned flag)
+{
+	assert(pws);
+	assert(fence);
+
+	return 0;
+}
+
+static void xsp_flush_frontbuffer(struct pipe_winsys *pws, struct pipe_surface *surface, void *context_private)
+{
+	struct xsp_pipe_winsys	*xsp_winsys;
+	struct xsp_context	*xsp_context;
+
+	assert(pws);
+	assert(surface);
+	assert(context_private);
+
+	xsp_winsys = (struct xsp_pipe_winsys*)pws;
+	xsp_context = (struct xsp_context*)context_private;
+	xsp_winsys->fbimage->width = surface->width;
+	xsp_winsys->fbimage->height = surface->height;
+	xsp_winsys->fbimage->bytes_per_line = surface->width * (xsp_winsys->fbimage->bits_per_pixel >> 3);
+	xsp_winsys->fbimage->data = (char*)((struct xsp_buffer *)softpipe_texture(surface->texture)->buffer)->data + surface->offset;
+
+	XPutImage
+	(
+		xsp_winsys->display, xsp_context->drawable,
+		XDefaultGC(xsp_winsys->display, xsp_winsys->screen),
+		xsp_winsys->fbimage, 0, 0, 0, 0,
+		surface->width, surface->height
+	);
+	XFlush(xsp_winsys->display);
+}
+
+static const char* xsp_get_name(struct pipe_winsys *pws)
+{
+	assert(pws);
+	return "X11 SoftPipe";
+}
+
+static void xsp_destroy(struct pipe_winsys *pws)
+{
+	struct xsp_pipe_winsys *xsp_winsys = (struct xsp_pipe_winsys*)pws;
+
+	assert(pws);
+
+	/* XDestroyImage() wants to free the data as well */
+	xsp_winsys->fbimage->data = NULL;
+
+	XDestroyImage(xsp_winsys->fbimage);
+	FREE(xsp_winsys);
+}
+
+/* Called through pipe_video_context::destroy() */
+static void xsp_pipe_destroy(struct pipe_video_context *vpipe)
+{
+	struct xsp_context *xsp_context;
+
+	assert(vpipe);
+
+	xsp_context = vpipe->priv;
+
+	/* Call the original destroy */
+	xsp_context->pipe_destroy(vpipe);
+
+	FREE(xsp_context);
+}
+
+/* Show starts here */
+
+Drawable
+vl_video_bind_drawable(struct pipe_video_context *vpipe, Drawable drawable)
+{
+	struct xsp_context *xsp_context;
+	Drawable old_drawable;
+
+	assert(vpipe);
+
+	xsp_context = vpipe->priv;
+	old_drawable = xsp_context->drawable;
+	xsp_context->drawable = drawable;
+
+	return old_drawable;
+}
+
+struct pipe_screen*
+vl_screen_create(Display *display, int screen)
+{
+	struct xsp_pipe_winsys *xsp_winsys;
+
+	assert(display);
+
+	xsp_winsys = CALLOC_STRUCT(xsp_pipe_winsys);
+	if (!xsp_winsys)
+		return NULL;
+
+	xsp_winsys->base.buffer_create = xsp_buffer_create;
+	xsp_winsys->base.user_buffer_create = xsp_user_buffer_create;
+	xsp_winsys->base.buffer_map = xsp_buffer_map;
+	xsp_winsys->base.buffer_unmap = xsp_buffer_unmap;
+	xsp_winsys->base.buffer_destroy = xsp_buffer_destroy;
+	xsp_winsys->base.surface_buffer_create = xsp_surface_buffer_create;
+	xsp_winsys->base.fence_reference = xsp_fence_reference;
+	xsp_winsys->base.fence_signalled = xsp_fence_signalled;
+	xsp_winsys->base.fence_finish = xsp_fence_finish;
+	xsp_winsys->base.flush_frontbuffer = xsp_flush_frontbuffer;
+	xsp_winsys->base.get_name = xsp_get_name;
+	xsp_winsys->base.destroy = xsp_destroy;
+	xsp_winsys->display = display;
+	xsp_winsys->screen = screen;
+	xsp_winsys->fbimage = XCreateImage
+	(
+		display,
+		XDefaultVisual(display, screen),
+		XDefaultDepth(display, screen),
+		ZPixmap,
+		0,
+		NULL,
+		0,	/* Don't know the width and height until flush_frontbuffer */
+		0,
+		32,
+		0
+	);
+
+	if (!xsp_winsys->fbimage)
+	{
+		FREE(xsp_winsys);
+		return NULL;
+	}
+
+	XInitImage(xsp_winsys->fbimage);
+
+	return softpipe_create_screen(&xsp_winsys->base);
+}
+
+struct pipe_video_context*
+vl_video_create(struct pipe_screen *screen,
+                enum pipe_video_profile profile,
+                enum pipe_video_chroma_format chroma_format,
+                unsigned width, unsigned height)
+{
+	struct pipe_video_context	*vpipe;
+	struct xsp_context		*xsp_context;
+
+	assert(screen);
+	assert(width && height);
+
+	vpipe = sp_video_create(screen, profile, chroma_format, width, height);
+	if (!vpipe)
+		return NULL;
+
+	xsp_context = CALLOC_STRUCT(xsp_context);
+	if (!xsp_context)
+	{
+		vpipe->destroy(vpipe);
+		return NULL;
+	}
+
+	/* Override this so we can free our xsp_context when the pipe is freed */
+	xsp_context->pipe_destroy = vpipe->destroy;
+	vpipe->destroy = xsp_pipe_destroy;
+
+	vpipe->priv = xsp_context;
+
+	return vpipe;
+}
diff --git a/src/gallium/winsys/g3dvl/xsp_winsys.c b/src/gallium/winsys/g3dvl/xsp_winsys.c
index 698c2856a4..37d60ce540 100644
--- a/src/gallium/winsys/g3dvl/xsp_winsys.c
+++ b/src/gallium/winsys/g3dvl/xsp_winsys.c
@@ -105,6 +105,7 @@ static struct pipe_buffer* xsp_surface_buffer_create
 	unsigned height,
 	enum pipe_format format,
 	unsigned usage,
+	unsigned tex_usage,
 	unsigned *stride
 )
 {
diff --git a/src/gallium/winsys/gdi/SConscript b/src/gallium/winsys/gdi/SConscript
index 86eb9ef55e..f5e6d36d89 100644
--- a/src/gallium/winsys/gdi/SConscript
+++ b/src/gallium/winsys/gdi/SConscript
@@ -5,35 +5,39 @@ Import('*')
 
 if env['platform'] == 'windows':
 
-	env = env.Clone()
+    env = env.Clone()
 
-	env.Append(CPPPATH = [
-		'#src/gallium/state_trackers/wgl',
-	])
+    env.Append(CPPPATH = [
+        '#src/gallium/state_trackers/wgl',
+    ])
 
-	env.Append(LIBS = [
-		'gdi32',
-		'user32',
-		'kernel32',
-		'ws2_32',
-	])
+    env.Append(LIBS = [
+        'gdi32',
+        'user32',
+        'kernel32',
+        'ws2_32',
+    ])
 
-	sources = [
-		'gdi_softpipe_winsys.c',
-	]
-	
-	if env['gcc']:
-		sources += ['#src/gallium/state_trackers/wgl/opengl32.mingw.def']
-	else:
-		sources += ['#src/gallium/state_trackers/wgl/opengl32.def']
-		
-	drivers = [
-		trace,
-		softpipe,
-	]
+    if 'llvmpipe' in env['drivers']:
+        sources = ['gdi_llvmpipe_winsys.c']
+        drivers = [llvmpipe]
+        env.Tool('llvm')
+    elif 'softpipe' in env['drivers']:
+        sources = ['gdi_softpipe_winsys.c']
+        drivers = [softpipe]
+    else:
+        print 'warning: softpipe or llvmpipe not selected, gdi winsys disabled'
+        Return()
+    
+    if env['gcc']:
+        sources += ['#src/gallium/state_trackers/wgl/opengl32.mingw.def']
+    else:
+        sources += ['#src/gallium/state_trackers/wgl/opengl32.def']
+        
+    drivers += [trace]
 
-	env.SharedLibrary(
-		target ='opengl32',
-		source = sources,
-		LIBS = wgl + glapi + mesa + drivers + auxiliaries + env['LIBS'],
-	)
+    env.SharedLibrary(
+        target ='opengl32',
+        source = sources,
+        LIBS = wgl + glapi + mesa + drivers + auxiliaries + env['LIBS'],
+    )
diff --git a/src/gallium/winsys/gdi/gdi_llvmpipe_winsys.c b/src/gallium/winsys/gdi/gdi_llvmpipe_winsys.c
new file mode 100644
index 0000000000..e8bc0f55ac
--- /dev/null
+++ b/src/gallium/winsys/gdi/gdi_llvmpipe_winsys.c
@@ -0,0 +1,288 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * LLVMpipe support.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include <windows.h>
+
+#include "pipe/p_format.h"
+#include "pipe/p_context.h"
+#include "pipe/p_inlines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "llvmpipe/lp_winsys.h"
+#include "llvmpipe/lp_texture.h"
+#include "stw_winsys.h"
+
+
+struct gdi_llvmpipe_displaytarget
+{
+   enum pipe_format format;
+   struct pipe_format_block block;
+   unsigned width;
+   unsigned height;
+   unsigned stride;
+
+   unsigned size;
+
+   void *data;
+
+   BITMAPINFO bmi;
+};
+
+
+/** Cast wrapper */
+static INLINE struct gdi_llvmpipe_displaytarget *
+gdi_llvmpipe_displaytarget( struct llvmpipe_displaytarget *buf )
+{
+   return (struct gdi_llvmpipe_displaytarget *)buf;
+}
+
+
+static boolean
+gdi_llvmpipe_is_displaytarget_format_supported( struct llvmpipe_winsys *ws,
+                                                enum pipe_format format )
+{
+   switch(format) {
+   case PIPE_FORMAT_X8R8G8B8_UNORM:
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      return TRUE;
+
+   /* TODO: Support other formats possible with BMPs, as described in 
+    * http://msdn.microsoft.com/en-us/library/dd183376(VS.85).aspx */
+      
+   default:
+      return FALSE;
+   }
+}
+
+
+static void *
+gdi_llvmpipe_displaytarget_map(struct llvmpipe_winsys *ws,
+                               struct llvmpipe_displaytarget *dt,
+                               unsigned flags )
+{
+   struct gdi_llvmpipe_displaytarget *gdt = gdi_llvmpipe_displaytarget(dt);
+
+   return gdt->data;
+}
+
+
+static void
+gdi_llvmpipe_displaytarget_unmap(struct llvmpipe_winsys *ws,
+                                 struct llvmpipe_displaytarget *dt )
+{
+
+}
+
+
+static void
+gdi_llvmpipe_displaytarget_destroy(struct llvmpipe_winsys *winsys,
+                                   struct llvmpipe_displaytarget *dt)
+{
+   struct gdi_llvmpipe_displaytarget *gdt = gdi_llvmpipe_displaytarget(dt);
+
+   align_free(gdt->data);
+   FREE(gdt);
+}
+
+
+/**
+ * Round n up to next multiple.
+ */
+static INLINE unsigned
+round_up(unsigned n, unsigned multiple)
+{
+   return (n + multiple - 1) & ~(multiple - 1);
+}
+
+
+static struct llvmpipe_displaytarget *
+gdi_llvmpipe_displaytarget_create(struct llvmpipe_winsys *winsys,
+                                  enum pipe_format format,
+                                  unsigned width, unsigned height,
+                                  unsigned alignment,
+                                  unsigned *stride)
+{
+   struct gdi_llvmpipe_displaytarget *gdt;
+   unsigned cpp;
+   unsigned bpp;
+   
+   gdt = CALLOC_STRUCT(gdi_llvmpipe_displaytarget);
+   if(!gdt)
+      goto no_gdt;
+
+   gdt->format = format;
+   gdt->width = width;
+   gdt->height = height;
+
+   bpp = pf_get_bits(format);
+   cpp = pf_get_size(format);
+   
+   gdt->stride = round_up(width * cpp, alignment);
+   gdt->size = gdt->stride * height;
+   
+   gdt->data = align_malloc(gdt->size, alignment);
+   if(!gdt->data)
+      goto no_data;
+
+   gdt->bmi.bmiHeader.biSize = sizeof(BITMAPINFOHEADER);
+   gdt->bmi.bmiHeader.biWidth = gdt->stride / cpp;
+   gdt->bmi.bmiHeader.biHeight= -(long)height;
+   gdt->bmi.bmiHeader.biPlanes = 1;
+   gdt->bmi.bmiHeader.biBitCount = bpp;
+   gdt->bmi.bmiHeader.biCompression = BI_RGB;
+   gdt->bmi.bmiHeader.biSizeImage = 0;
+   gdt->bmi.bmiHeader.biXPelsPerMeter = 0;
+   gdt->bmi.bmiHeader.biYPelsPerMeter = 0;
+   gdt->bmi.bmiHeader.biClrUsed = 0;
+   gdt->bmi.bmiHeader.biClrImportant = 0;
+
+   *stride = gdt->stride;
+   return (struct llvmpipe_displaytarget *)gdt;
+
+no_data:
+   FREE(gdt);
+no_gdt:
+   return NULL;
+}
+
+
+static void
+gdi_llvmpipe_displaytarget_display(struct llvmpipe_winsys *winsys, 
+                                   struct llvmpipe_displaytarget *dt,
+                                   void *context_private)
+{
+   assert(0);
+}
+
+
+static void
+gdi_llvmpipe_destroy(struct llvmpipe_winsys *winsys)
+{
+   FREE(winsys);
+}
+
+
+static struct pipe_screen *
+gdi_llvmpipe_screen_create(void)
+{
+   static struct llvmpipe_winsys *winsys;
+   struct pipe_screen *screen;
+
+   winsys = CALLOC_STRUCT(llvmpipe_winsys);
+   if(!winsys)
+      goto no_winsys;
+
+   winsys->destroy = gdi_llvmpipe_destroy;
+   winsys->is_displaytarget_format_supported = gdi_llvmpipe_is_displaytarget_format_supported;
+   winsys->displaytarget_create = gdi_llvmpipe_displaytarget_create;
+   winsys->displaytarget_map = gdi_llvmpipe_displaytarget_map;
+   winsys->displaytarget_unmap = gdi_llvmpipe_displaytarget_unmap;
+   winsys->displaytarget_display = gdi_llvmpipe_displaytarget_display;
+   winsys->displaytarget_destroy = gdi_llvmpipe_displaytarget_destroy;
+
+   screen = llvmpipe_create_screen(winsys);
+   if(!screen)
+      goto no_screen;
+
+   return screen;
+   
+no_screen:
+   FREE(winsys);
+no_winsys:
+   return NULL;
+}
+
+
+static struct pipe_context *
+gdi_llvmpipe_context_create(struct pipe_screen *screen)
+{
+   return llvmpipe_create(screen);
+}
+
+
+static void
+gdi_llvmpipe_present(struct pipe_screen *screen,
+                     struct pipe_surface *surface,
+                     HDC hDC)
+{
+    struct llvmpipe_texture *texture;
+    struct gdi_llvmpipe_displaytarget *gdt;
+
+    texture = llvmpipe_texture(surface->texture);
+    gdt = gdi_llvmpipe_displaytarget(texture->dt);
+
+    StretchDIBits(hDC,
+                  0, 0, gdt->width, gdt->height,
+                  0, 0, gdt->width, gdt->height,
+                  gdt->data, &gdt->bmi, 0, SRCCOPY);
+}
+
+
+static const struct stw_winsys stw_winsys = {
+   &gdi_llvmpipe_screen_create,
+   &gdi_llvmpipe_context_create,
+   &gdi_llvmpipe_present,
+   NULL, /* get_adapter_luid */
+   NULL, /* shared_surface_open */
+   NULL, /* shared_surface_close */
+   NULL  /* compose */
+};
+
+
+BOOL WINAPI
+DllMain(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpReserved)
+{
+   switch (fdwReason) {
+   case DLL_PROCESS_ATTACH:
+      if (!stw_init(&stw_winsys)) {
+         return FALSE;
+      }
+      return stw_init_thread();
+
+   case DLL_THREAD_ATTACH:
+      return stw_init_thread();
+
+   case DLL_THREAD_DETACH:
+      stw_cleanup_thread();
+      break;
+
+   case DLL_PROCESS_DETACH:
+      stw_cleanup_thread();
+      stw_cleanup();
+      break;
+   }
+   return TRUE;
+}
diff --git a/src/gallium/winsys/gdi/gdi_softpipe_winsys.c b/src/gallium/winsys/gdi/gdi_softpipe_winsys.c
index 33826524d7..5e0ccf32f4 100644
--- a/src/gallium/winsys/gdi/gdi_softpipe_winsys.c
+++ b/src/gallium/winsys/gdi/gdi_softpipe_winsys.c
@@ -46,7 +46,7 @@
 #include "util/u_memory.h"
 #include "softpipe/sp_winsys.h"
 #include "softpipe/sp_texture.h"
-#include "shared/stw_winsys.h"
+#include "stw_winsys.h"
 
 
 struct gdi_softpipe_buffer
@@ -166,6 +166,7 @@ gdi_softpipe_surface_buffer_create(struct pipe_winsys *winsys,
                                    unsigned width, unsigned height,
                                    enum pipe_format format,
                                    unsigned usage,
+                                   unsigned tex_usage,
                                    unsigned *stride)
 {
    const unsigned alignment = 64;
@@ -268,9 +269,9 @@ gdi_softpipe_context_create(struct pipe_screen *screen)
 
 
 static void
-gdi_softpipe_flush_frontbuffer(struct pipe_screen *screen,
-                               struct pipe_surface *surface,
-                               HDC hDC)
+gdi_softpipe_present(struct pipe_screen *screen,
+                     struct pipe_surface *surface,
+                     HDC hDC)
 {
     struct softpipe_texture *texture;
     struct gdi_softpipe_buffer *buffer;
@@ -303,7 +304,11 @@ gdi_softpipe_flush_frontbuffer(struct pipe_screen *screen,
 static const struct stw_winsys stw_winsys = {
    &gdi_softpipe_screen_create,
    &gdi_softpipe_context_create,
-   &gdi_softpipe_flush_frontbuffer
+   &gdi_softpipe_present,
+   NULL, /* get_adapter_luid */
+   NULL, /* shared_surface_open */
+   NULL, /* shared_surface_close */
+   NULL  /* compose */
 };
 
 
diff --git a/src/gallium/winsys/xlib/xlib_softpipe.c b/src/gallium/winsys/xlib/xlib_softpipe.c
index 67fea023a3..260b39e2a0 100644
--- a/src/gallium/winsys/xlib/xlib_softpipe.c
+++ b/src/gallium/winsys/xlib/xlib_softpipe.c
@@ -75,9 +75,6 @@ struct xmesa_pipe_winsys
 {
    struct pipe_winsys base;
 /*   struct xmesa_visual *xm_visual; */
-#ifdef USE_XSHM
-   int shm;
-#endif
 };
 
 
@@ -93,11 +90,6 @@ xm_buffer( struct pipe_buffer *buf )
 /**
  * X Shared Memory Image extension code
  */
-#ifdef USE_XSHM
-#define XSHM_ENABLED(b) ((b)->shm)
-#else
-#define XSHM_ENABLED(b) 0
-#endif
 
 #ifdef USE_XSHM
 
@@ -116,23 +108,23 @@ mesaHandleXError(Display *dpy, XErrorEvent *event)
 }
 
 
-static GLboolean alloc_shm(struct xm_buffer *buf, unsigned size)
+static char *alloc_shm(struct xm_buffer *buf, unsigned size)
 {
    XShmSegmentInfo *const shminfo = & buf->shminfo;
 
    shminfo->shmid = shmget(IPC_PRIVATE, size, IPC_CREAT|0777);
    if (shminfo->shmid < 0) {
-      return GL_FALSE;
+      return NULL;
    }
 
    shminfo->shmaddr = (char *) shmat(shminfo->shmid, 0, 0);
    if (shminfo->shmaddr == (char *) -1) {
       shmctl(shminfo->shmid, IPC_RMID, 0);
-      return GL_FALSE;
+      return NULL;
    }
 
    shminfo->readOnly = False;
-   return GL_TRUE;
+   return shminfo->shmaddr;
 }
 
 
@@ -258,25 +250,30 @@ xlib_softpipe_display_surface(struct xmesa_buffer *b,
       return;
 
 #ifdef USE_XSHM
-   if (XSHM_ENABLED(xm_buf) && (xm_buf->tempImage == NULL)) {
-      assert(surf->texture->block.width == 1);
-      assert(surf->texture->block.height == 1);
-      alloc_shm_ximage(xm_buf, b, spt->stride[surf->level] /
-                       surf->texture->block.size, surf->height);
-   }
-#endif
+   if (xm_buf->shm)
+   {
+      if (xm_buf->tempImage == NULL) 
+      {
+         assert(surf->texture->block.width == 1);
+         assert(surf->texture->block.height == 1);
+         alloc_shm_ximage(xm_buf, b, spt->stride[surf->level] /
+                          surf->texture->block.size, surf->height);
+      }
 
-   ximage = (XSHM_ENABLED(xm_buf)) ? xm_buf->tempImage : b->tempImage;
-   ximage->data = xm_buf->data;
+      ximage = xm_buf->tempImage;
+      ximage->data = xm_buf->data;
 
-   /* display image in Window */
-#ifdef USE_XSHM
-   if (XSHM_ENABLED(xm_buf)) {
+      /* _debug_printf("XSHM\n"); */
       XShmPutImage(b->xm_visual->display, b->drawable, b->gc,
                    ximage, 0, 0, 0, 0, surf->width, surf->height, False);
-   } else
+   }
+   else
 #endif
    {
+      /* display image in Window */
+      ximage = b->tempImage;
+      ximage->data = xm_buf->data;
+
       /* check that the XImage has been previously initialized */
       assert(ximage->format);
       assert(ximage->bitmap_unit);
@@ -286,6 +283,7 @@ xlib_softpipe_display_surface(struct xmesa_buffer *b,
       ximage->height = surf->height;
       ximage->bytes_per_line = spt->stride[surf->level];
 
+      /* _debug_printf("XPUT\n"); */
       XPutImage(b->xm_visual->display, b->drawable, b->gc,
                 ximage, 0, 0, 0, 0, surf->width, surf->height);
    }
@@ -322,21 +320,6 @@ xm_buffer_create(struct pipe_winsys *pws,
                  unsigned size)
 {
    struct xm_buffer *buffer = CALLOC_STRUCT(xm_buffer);
-#ifdef USE_XSHM
-   struct xmesa_pipe_winsys *xpws = (struct xmesa_pipe_winsys *) pws;
-
-   buffer->shminfo.shmid = -1;
-   buffer->shminfo.shmaddr = (char *) -1;
-
-   if (xpws->shm && (usage & PIPE_BUFFER_USAGE_PIXEL) != 0) {
-      buffer->shm = xpws->shm;
-
-      if (alloc_shm(buffer, size)) {
-         buffer->data = buffer->shminfo.shmaddr;
-         buffer->shm = 1;
-      }
-   }
-#endif
 
    pipe_reference_init(&buffer->base.reference, 1);
    buffer->base.alignment = alignment;
@@ -363,9 +346,6 @@ xm_user_buffer_create(struct pipe_winsys *pws, void *ptr, unsigned bytes)
    buffer->base.size = bytes;
    buffer->userBuffer = TRUE;
    buffer->data = ptr;
-#ifdef USE_XSHM
-   buffer->shm = 0;
-#endif
 
    return &buffer->base;
 }
@@ -381,16 +361,44 @@ xm_surface_buffer_create(struct pipe_winsys *winsys,
 {
    const unsigned alignment = 64;
    struct pipe_format_block block;
-   unsigned nblocksx, nblocksy;
+   unsigned nblocksx, nblocksy, size;
 
    pf_get_block(format, &block);
    nblocksx = pf_get_nblocksx(&block, width);
    nblocksy = pf_get_nblocksy(&block, height);
    *stride = align(nblocksx * block.size, alignment);
+   size = *stride * nblocksy;
+
+#ifdef USE_XSHM
+   if (!debug_get_bool_option("XLIB_NO_SHM", FALSE))
+   {
+      struct xm_buffer *buffer = CALLOC_STRUCT(xm_buffer);
+
+      pipe_reference_init(&buffer->base.reference, 1);
+      buffer->base.alignment = alignment;
+      buffer->base.usage = usage;
+      buffer->base.size = size;
+      buffer->userBuffer = FALSE;
+      buffer->shminfo.shmid = -1;
+      buffer->shminfo.shmaddr = (char *) -1;
+      buffer->shm = TRUE;
+         
+      buffer->data = alloc_shm(buffer, size);
+      if (!buffer->data)
+         goto out;
+
+      return &buffer->base;
+         
+   out:
+      if (buffer)
+         FREE(buffer);
+   }
+#endif
+   
 
    return winsys->buffer_create(winsys, alignment,
                                 usage,
-                                *stride * nblocksy);
+                                size);
 }