67 files changed, 4977 insertions, 2192 deletions
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.h b/src/gallium/auxiliary/cso_cache/cso_context.h
index b04e98bfa1..69630e98ba 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.h
+++ b/src/gallium/auxiliary/cso_cache/cso_context.h
@@ -31,7 +31,7 @@
 
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
-#include "pipe/p_error.h"
+#include "pipe/p_defines.h"
 
 
 #ifdef	__cplusplus
diff --git a/src/gallium/auxiliary/draw/draw_pt_post_vs.c b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
index 78953bccfc..6c1cb48e8b 100644
--- a/src/gallium/auxiliary/draw/draw_pt_post_vs.c
+++ b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
@@ -104,7 +104,7 @@ static boolean post_vs_cliptest_viewport_gl( struct pt_post_vs *pvs,
    unsigned clipped = 0;
    unsigned j;
 
-   if (0) debug_printf("%s\n");
+   if (0) debug_printf("%s\n", __FUNCTION__);
 
    for (j = 0; j < count; j++) {
       float *position = out->data[pos];
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c
index 645d7cccba..88bc790b62 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.c
@@ -537,19 +537,10 @@ static struct x86_reg fetch_src( struct aos_compilation *cp,
    unsigned abs = 0;
 
    for (i = 0; i < 4; i++) {
-      unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, i );
+      unsigned swizzle = tgsi_util_get_full_src_register_swizzle( src, i );
       unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, i );
 
-      switch (swizzle) {
-      case TGSI_EXTSWIZZLE_ZERO:
-      case TGSI_EXTSWIZZLE_ONE:
-         AOS_ERROR(cp, "not supporting full swizzles yet in tgsi_aos_sse2");
-         break;
-
-      default:
-         swz |= (swizzle & 0x3) << (i * 2);
-         break;
-      }
+      swz |= (swizzle & 0x3) << (i * 2);
 
       switch (neg) {
       case TGSI_UTIL_SIGN_TOGGLE:
@@ -632,23 +623,10 @@ static void x87_fld_src( struct aos_compilation *cp,
                                                 src->SrcRegister.File, 
                                                 src->SrcRegister.Index);
 
-   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, channel );
+   unsigned swizzle = tgsi_util_get_full_src_register_swizzle( src, channel );
    unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, channel );
 
-   switch (swizzle) {
-   case TGSI_EXTSWIZZLE_ZERO:
-      x87_fldz( cp->func );
-      break;
-
-   case TGSI_EXTSWIZZLE_ONE:
-      x87_fld1( cp->func );
-      break;
-
-   default:
-      x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) );
-      break;
-   }
-   
+   x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) );
 
    switch (neg) {
    case TGSI_UTIL_SIGN_TOGGLE:
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer.h b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
index 2590546cb4..4ef372233f 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
@@ -46,7 +46,7 @@
 
 #include "pipe/p_compiler.h"
 #include "util/u_debug.h"
-#include "pipe/p_error.h"
+#include "pipe/p_defines.h"
 #include "pipe/p_state.h"
 
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
index 0d30363484..2ef4293d4d 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
@@ -42,7 +42,7 @@
 #endif
 
 #include "pipe/p_compiler.h"
-#include "pipe/p_error.h"
+#include "pipe/p_defines.h"
 #include "util/u_debug.h"
 #include "pipe/p_thread.h"
 #include "util/u_memory.h"
@@ -540,9 +540,9 @@ fenced_buffer_list_dump(struct fenced_buffer_list *fenced_list)
       fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
       assert(!fenced_buf->fence);
       debug_printf("%10p %7u %7u\n",
-                   fenced_buf,
+                   (void *) fenced_buf,
                    fenced_buf->base.base.size,
-                   fenced_buf->base.base.reference.count);
+                   p_atomic_read(&fenced_buf->base.base.reference.count));
       curr = next; 
       next = curr->next;
    }
@@ -554,10 +554,10 @@ fenced_buffer_list_dump(struct fenced_buffer_list *fenced_list)
       fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
       signaled = ops->fence_signalled(ops, fenced_buf->fence, 0);
       debug_printf("%10p %7u %7u %10p %s\n",
-                   fenced_buf,
+                   (void *) fenced_buf,
                    fenced_buf->base.base.size,
-                   fenced_buf->base.base.reference.count,
-                   fenced_buf->fence,
+                   p_atomic_read(&fenced_buf->base.base.reference.count),
+                   (void *) fenced_buf->fence,
                    signaled == 0 ? "y" : "n");
       curr = next; 
       next = curr->next;
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
index 39ab8e722c..8c8d713078 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
@@ -51,7 +51,7 @@
 
 
 #include "pipe/p_compiler.h"
-#include "pipe/p_error.h"
+#include "pipe/p_defines.h"
 
 
 #ifdef __cplusplus
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
index 1b4df28c70..6e3214ca9c 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
@@ -350,7 +350,7 @@ pb_debug_manager_dump(struct pb_debug_manager *mgr)
       buf = LIST_ENTRY(struct pb_debug_buffer, curr, head);
 
       debug_printf("buffer = %p\n", buf);
-      debug_printf("    .size = %p\n", buf->base.base.size);
+      debug_printf("    .size = 0x%x\n", buf->base.base.size);
       debug_backtrace_dump(buf->create_backtrace, PB_DEBUG_CREATE_BACKTRACE);
       
       curr = next; 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
index e7352e90db..d21910d0bf 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
@@ -37,7 +37,6 @@
  */
 
 #include "pipe/p_compiler.h"
-#include "pipe/p_error.h"
 #include "util/u_debug.h"
 #include "pipe/p_thread.h"
 #include "pipe/p_defines.h"
diff --git a/src/gallium/auxiliary/pipebuffer/pb_validate.c b/src/gallium/auxiliary/pipebuffer/pb_validate.c
index 150fd50618..ce40c0cf0e 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_validate.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_validate.c
@@ -34,7 +34,7 @@
 
 
 #include "pipe/p_compiler.h"
-#include "pipe/p_error.h"
+#include "pipe/p_defines.h"
 #include "util/u_memory.h"
 #include "util/u_debug.h"
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_validate.h b/src/gallium/auxiliary/pipebuffer/pb_validate.h
index dfb84df1ce..3c93f30f20 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_validate.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_validate.h
@@ -37,7 +37,7 @@
 
 
 #include "pipe/p_compiler.h"
-#include "pipe/p_error.h"
+#include "pipe/p_defines.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/gallium/auxiliary/rbug/README b/src/gallium/auxiliary/rbug/README
index 33d76371de..d984067893 100644
--- a/src/gallium/auxiliary/rbug/README
+++ b/src/gallium/auxiliary/rbug/README
@@ -16,6 +16,10 @@ for information about applications look in:
 
 progs/rbug/README
 
+for a GUI see:
+
+  http://cgit.freedesktop.org/mesa/rbug-gui
+
 
 --
 Jakob Bornecrantz <jakob@vmware.com>
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c
index e0cfc54420..4fa10e2f7e 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -472,9 +472,9 @@ tgsi_default_full_instruction( void )
    unsigned i;
 
    full_instruction.Instruction = tgsi_default_instruction();
-   full_instruction.InstructionExtNv = tgsi_default_instruction_ext_nv();
    full_instruction.InstructionExtLabel = tgsi_default_instruction_ext_label();
    full_instruction.InstructionExtTexture = tgsi_default_instruction_ext_texture();
+   full_instruction.InstructionExtPredicate = tgsi_default_instruction_ext_predicate();
    for( i = 0;  i < TGSI_FULL_MAX_DST_REGISTERS; i++ ) {
       full_instruction.FullDstRegisters[i] = tgsi_default_full_dst_register();
    }
@@ -512,34 +512,6 @@ tgsi_build_full_instruction(
       header );
    prev_token = (struct tgsi_token  *) instruction;
 
-   if( tgsi_compare_instruction_ext_nv(
-         full_inst->InstructionExtNv,
-         tgsi_default_instruction_ext_nv() ) ) {
-      struct tgsi_instruction_ext_nv *instruction_ext_nv;
-
-      if( maxsize <= size )
-         return 0;
-      instruction_ext_nv =
-         (struct  tgsi_instruction_ext_nv *) &tokens[size];
-      size++;
-
-      *instruction_ext_nv  = tgsi_build_instruction_ext_nv(
-         full_inst->InstructionExtNv.Precision,
-         full_inst->InstructionExtNv.CondDstIndex,
-         full_inst->InstructionExtNv.CondFlowIndex,
-         full_inst->InstructionExtNv.CondMask,
-         full_inst->InstructionExtNv.CondSwizzleX,
-         full_inst->InstructionExtNv.CondSwizzleY,
-         full_inst->InstructionExtNv.CondSwizzleZ,
-         full_inst->InstructionExtNv.CondSwizzleW,
-         full_inst->InstructionExtNv.CondDstUpdate,
-         full_inst->InstructionExtNv.CondFlowEnable,
-         prev_token,
-         instruction,
-         header );
-      prev_token = (struct tgsi_token  *) instruction_ext_nv;
-   }
-
    if( tgsi_compare_instruction_ext_label(
          full_inst->InstructionExtLabel,
          tgsi_default_instruction_ext_label() ) ) {
@@ -578,6 +550,29 @@ tgsi_build_full_instruction(
       prev_token = (struct tgsi_token  *) instruction_ext_texture;
    }
 
+   if (tgsi_compare_instruction_ext_predicate(full_inst->InstructionExtPredicate,
+                                              tgsi_default_instruction_ext_predicate())) {
+      struct tgsi_instruction_ext_predicate *instruction_ext_predicate;
+
+      if (maxsize <= size) {
+         return 0;
+      }
+      instruction_ext_predicate = (struct tgsi_instruction_ext_predicate *)&tokens[size];
+      size++;
+
+      *instruction_ext_predicate =
+         tgsi_build_instruction_ext_predicate(full_inst->InstructionExtPredicate.SrcIndex,
+                                              full_inst->InstructionExtPredicate.Negate,
+                                              full_inst->InstructionExtPredicate.SwizzleX,
+                                              full_inst->InstructionExtPredicate.SwizzleY,
+                                              full_inst->InstructionExtPredicate.SwizzleZ,
+                                              full_inst->InstructionExtPredicate.SwizzleW,
+                                              prev_token,
+                                              instruction,
+                                              header);
+      prev_token = (struct tgsi_token *)instruction_ext_predicate;
+   }
+
    for( i = 0;  i <   full_inst->Instruction.NumDstRegs; i++ ) {
       const struct tgsi_full_dst_register *reg = &full_inst->FullDstRegisters[i];
       struct tgsi_dst_register *dst_register;
@@ -597,30 +592,6 @@ tgsi_build_full_instruction(
          header );
       prev_token = (struct tgsi_token  *) dst_register;
 
-      if( tgsi_compare_dst_register_ext_concode(
-            reg->DstRegisterExtConcode,
-            tgsi_default_dst_register_ext_concode() ) ) {
-         struct tgsi_dst_register_ext_concode *dst_register_ext_concode;
-
-         if( maxsize <= size )
-            return 0;
-         dst_register_ext_concode =
-            (struct  tgsi_dst_register_ext_concode *) &tokens[size];
-         size++;
-
-         *dst_register_ext_concode =   tgsi_build_dst_register_ext_concode(
-            reg->DstRegisterExtConcode.CondMask,
-            reg->DstRegisterExtConcode.CondSwizzleX,
-            reg->DstRegisterExtConcode.CondSwizzleY,
-            reg->DstRegisterExtConcode.CondSwizzleZ,
-            reg->DstRegisterExtConcode.CondSwizzleW,
-            reg->DstRegisterExtConcode.CondSrcIndex,
-            prev_token,
-            instruction,
-            header );
-         prev_token = (struct tgsi_token  *) dst_register_ext_concode;
-      }
-
       if( tgsi_compare_dst_register_ext_modulate(
             reg->DstRegisterExtModulate,
             tgsi_default_dst_register_ext_modulate() ) ) {
@@ -687,40 +658,6 @@ tgsi_build_full_instruction(
          header );
       prev_token = (struct tgsi_token  *) src_register;
 
-      if( tgsi_compare_src_register_ext_swz(
-            reg->SrcRegisterExtSwz,
-            tgsi_default_src_register_ext_swz() ) ) {
-         struct tgsi_src_register_ext_swz *src_register_ext_swz;
-
-         /* Use of the extended swizzle requires the simple swizzle to be identity.
-          */
-         assert( reg->SrcRegister.SwizzleX == TGSI_SWIZZLE_X );
-         assert( reg->SrcRegister.SwizzleY == TGSI_SWIZZLE_Y );
-         assert( reg->SrcRegister.SwizzleZ == TGSI_SWIZZLE_Z );
-         assert( reg->SrcRegister.SwizzleW == TGSI_SWIZZLE_W );
-         assert( reg->SrcRegister.Negate == FALSE );
-
-         if( maxsize <= size )
-            return 0;
-         src_register_ext_swz =
-            (struct  tgsi_src_register_ext_swz *) &tokens[size];
-         size++;
-
-         *src_register_ext_swz = tgsi_build_src_register_ext_swz(
-            reg->SrcRegisterExtSwz.ExtSwizzleX,
-            reg->SrcRegisterExtSwz.ExtSwizzleY,
-            reg->SrcRegisterExtSwz.ExtSwizzleZ,
-            reg->SrcRegisterExtSwz.ExtSwizzleW,
-            reg->SrcRegisterExtSwz.NegateX,
-            reg->SrcRegisterExtSwz.NegateY,
-            reg->SrcRegisterExtSwz.NegateZ,
-            reg->SrcRegisterExtSwz.NegateW,
-            prev_token,
-            instruction,
-            header );
-         prev_token = (struct tgsi_token  *) src_register_ext_swz;
-      }
-
       if( tgsi_compare_src_register_ext_mod(
             reg->SrcRegisterExtMod,
             tgsi_default_src_register_ext_mod() ) ) {
@@ -809,29 +746,6 @@ tgsi_build_full_instruction(
    return size;
 }
 
-struct tgsi_instruction_ext_nv
-tgsi_default_instruction_ext_nv( void )
-{
-   struct tgsi_instruction_ext_nv instruction_ext_nv;
-
-   instruction_ext_nv.Type = TGSI_INSTRUCTION_EXT_TYPE_NV;
-   instruction_ext_nv.Precision = TGSI_PRECISION_DEFAULT;
-   instruction_ext_nv.CondDstIndex = 0;
-   instruction_ext_nv.CondFlowIndex = 0;
-   instruction_ext_nv.CondMask = TGSI_CC_TR;
-   instruction_ext_nv.CondSwizzleX = TGSI_SWIZZLE_X;
-   instruction_ext_nv.CondSwizzleY = TGSI_SWIZZLE_Y;
-   instruction_ext_nv.CondSwizzleZ = TGSI_SWIZZLE_Z;
-   instruction_ext_nv.CondSwizzleW = TGSI_SWIZZLE_W;
-   instruction_ext_nv.CondDstUpdate = 0;
-   instruction_ext_nv.CondFlowEnable = 0;
-   instruction_ext_nv.Padding = 0;
-   instruction_ext_nv.Extended = 0;
-
-   return instruction_ext_nv;
-}
-
-
 /** test for inequality of 32-bit values pointed to by a and b */
 static INLINE boolean
 compare32(const void *a, const void *b)
@@ -839,53 +753,6 @@ compare32(const void *a, const void *b)
    return *((uint32_t *) a) != *((uint32_t *) b);
 }
 
-
-unsigned
-tgsi_compare_instruction_ext_nv(
-   struct tgsi_instruction_ext_nv a,
-   struct tgsi_instruction_ext_nv b )
-{
-   a.Padding = b.Padding = 0;
-   a.Extended = b.Extended = 0;
-   return compare32(&a, &b);
-}
-
-struct tgsi_instruction_ext_nv
-tgsi_build_instruction_ext_nv(
-   unsigned precision,
-   unsigned cond_dst_index,
-   unsigned cond_flow_index,
-   unsigned cond_mask,
-   unsigned cond_swizzle_x,
-   unsigned cond_swizzle_y,
-   unsigned cond_swizzle_z,
-   unsigned cond_swizzle_w,
-   unsigned cond_dst_update,
-   unsigned cond_flow_enable,
-   struct tgsi_token *prev_token,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header )
-{
-   struct tgsi_instruction_ext_nv instruction_ext_nv;
-
-   instruction_ext_nv = tgsi_default_instruction_ext_nv();
-   instruction_ext_nv.Precision = precision;
-   instruction_ext_nv.CondDstIndex = cond_dst_index;
-   instruction_ext_nv.CondFlowIndex = cond_flow_index;
-   instruction_ext_nv.CondMask = cond_mask;
-   instruction_ext_nv.CondSwizzleX = cond_swizzle_x;
-   instruction_ext_nv.CondSwizzleY = cond_swizzle_y;
-   instruction_ext_nv.CondSwizzleZ = cond_swizzle_z;
-   instruction_ext_nv.CondSwizzleW = cond_swizzle_w;
-   instruction_ext_nv.CondDstUpdate = cond_dst_update;
-   instruction_ext_nv.CondFlowEnable = cond_flow_enable;
-
-   prev_token->Extended = 1;
-   instruction_grow( instruction, header );
-
-   return instruction_ext_nv;
-}
-
 struct tgsi_instruction_ext_label
 tgsi_default_instruction_ext_label( void )
 {
@@ -968,6 +835,60 @@ tgsi_build_instruction_ext_texture(
    return instruction_ext_texture;
 }
 
+/** Default predicate extension: identity swizzle, Negate = 0, SrcIndex = 0. */
+struct tgsi_instruction_ext_predicate
+tgsi_default_instruction_ext_predicate(void)
+{
+   struct tgsi_instruction_ext_predicate pred;
+
+   pred.Type = TGSI_INSTRUCTION_EXT_TYPE_PREDICATE;
+   pred.SrcIndex = 0;
+   pred.Negate = 0;
+   pred.SwizzleX = TGSI_SWIZZLE_X;
+   pred.SwizzleY = TGSI_SWIZZLE_Y;
+   pred.SwizzleZ = TGSI_SWIZZLE_Z;
+   pred.SwizzleW = TGSI_SWIZZLE_W;
+   pred.Padding = 0;
+   pred.Extended = 0;
+   return pred;
+}
+
+unsigned
+tgsi_compare_instruction_ext_predicate(struct tgsi_instruction_ext_predicate a,
+                                       struct tgsi_instruction_ext_predicate b)
+{
+   a.Padding = 0;  b.Padding = 0;    /* a and b are by-value copies, so   */
+   a.Extended = 0; b.Extended = 0;   /* masking them is invisible outside */
+   return compare32(&a, &b);         /* nonzero when the 32-bit words differ */
+}
+
+struct tgsi_instruction_ext_predicate
+tgsi_build_instruction_ext_predicate(unsigned index,
+                                     unsigned negate,
+                                     unsigned swizzleX,
+                                     unsigned swizzleY,
+                                     unsigned swizzleZ,
+                                     unsigned swizzleW,
+                                     struct tgsi_token *prev_token,
+                                     struct tgsi_instruction *instruction,
+                                     struct tgsi_header *header)
+{
+   /* Start from the defaults; only swizzle, Negate and SrcIndex are overridden. */
+   struct tgsi_instruction_ext_predicate pred = tgsi_default_instruction_ext_predicate();
+
+   pred.SrcIndex = index;
+   pred.Negate = negate;
+   pred.SwizzleX = swizzleX;
+   pred.SwizzleY = swizzleY;
+   pred.SwizzleZ = swizzleZ;
+   pred.SwizzleW = swizzleW;
+
+   /* Set the previous token's Extended bit, then call instruction_grow(). */
+   prev_token->Extended = 1;
+   instruction_grow(instruction, header);
+   return pred;
+}
+
 struct tgsi_src_register
 tgsi_default_src_register( void )
 {
@@ -1033,7 +954,6 @@ tgsi_default_full_src_register( void )
    struct tgsi_full_src_register full_src_register;
 
    full_src_register.SrcRegister = tgsi_default_src_register();
-   full_src_register.SrcRegisterExtSwz = tgsi_default_src_register_ext_swz();
    full_src_register.SrcRegisterExtMod = tgsi_default_src_register_ext_mod();
    full_src_register.SrcRegisterInd = tgsi_default_src_register();
    full_src_register.SrcRegisterDim = tgsi_default_dimension();
@@ -1042,76 +962,6 @@ tgsi_default_full_src_register( void )
    return full_src_register;
 }
 
-struct tgsi_src_register_ext_swz
-tgsi_default_src_register_ext_swz( void )
-{
-   struct tgsi_src_register_ext_swz src_register_ext_swz;
-
-   src_register_ext_swz.Type = TGSI_SRC_REGISTER_EXT_TYPE_SWZ;
-   src_register_ext_swz.ExtSwizzleX = TGSI_EXTSWIZZLE_X;
-   src_register_ext_swz.ExtSwizzleY = TGSI_EXTSWIZZLE_Y;
-   src_register_ext_swz.ExtSwizzleZ = TGSI_EXTSWIZZLE_Z;
-   src_register_ext_swz.ExtSwizzleW = TGSI_EXTSWIZZLE_W;
-   src_register_ext_swz.NegateX = 0;
-   src_register_ext_swz.NegateY = 0;
-   src_register_ext_swz.NegateZ = 0;
-   src_register_ext_swz.NegateW = 0;
-   src_register_ext_swz.Padding = 0;
-   src_register_ext_swz.Extended = 0;
-
-   return src_register_ext_swz;
-}
-
-unsigned
-tgsi_compare_src_register_ext_swz(
-   struct tgsi_src_register_ext_swz a,
-   struct tgsi_src_register_ext_swz b )
-{
-   a.Padding = b.Padding = 0;
-   a.Extended = b.Extended = 0;
-   return compare32(&a, &b);
-}
-
-struct tgsi_src_register_ext_swz
-tgsi_build_src_register_ext_swz(
-   unsigned ext_swizzle_x,
-   unsigned ext_swizzle_y,
-   unsigned ext_swizzle_z,
-   unsigned ext_swizzle_w,
-   unsigned negate_x,
-   unsigned negate_y,
-   unsigned negate_z,
-   unsigned negate_w,
-   struct tgsi_token *prev_token,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header )
-{
-   struct tgsi_src_register_ext_swz src_register_ext_swz;
-
-   assert( ext_swizzle_x <= TGSI_EXTSWIZZLE_ONE );
-   assert( ext_swizzle_y <= TGSI_EXTSWIZZLE_ONE );
-   assert( ext_swizzle_z <= TGSI_EXTSWIZZLE_ONE );
-   assert( ext_swizzle_w <= TGSI_EXTSWIZZLE_ONE );
-   assert( negate_x <= 1 );
-   assert( negate_y <= 1 );
-   assert( negate_z <= 1 );
-   assert( negate_w <= 1 );
-
-   src_register_ext_swz = tgsi_default_src_register_ext_swz();
-   src_register_ext_swz.ExtSwizzleX = ext_swizzle_x;
-   src_register_ext_swz.ExtSwizzleY = ext_swizzle_y;
-   src_register_ext_swz.ExtSwizzleZ = ext_swizzle_z;
-   src_register_ext_swz.ExtSwizzleW = ext_swizzle_w;
-   src_register_ext_swz.NegateX = negate_x;
-   src_register_ext_swz.NegateY = negate_y;
-   src_register_ext_swz.NegateZ = negate_z;
-   src_register_ext_swz.NegateW = negate_w;
-
-   prev_token->Extended = 1;
-   instruction_grow( instruction, header );
-
-   return src_register_ext_swz;
-}
 
 struct tgsi_src_register_ext_mod
 tgsi_default_src_register_ext_mod( void )
@@ -1253,77 +1103,12 @@ tgsi_default_full_dst_register( void )
 
    full_dst_register.DstRegister = tgsi_default_dst_register();
    full_dst_register.DstRegisterInd = tgsi_default_src_register();
-   full_dst_register.DstRegisterExtConcode =
-      tgsi_default_dst_register_ext_concode();
    full_dst_register.DstRegisterExtModulate =
       tgsi_default_dst_register_ext_modulate();
 
    return full_dst_register;
 }
 
-struct tgsi_dst_register_ext_concode
-tgsi_default_dst_register_ext_concode( void )
-{
-   struct tgsi_dst_register_ext_concode dst_register_ext_concode;
-
-   dst_register_ext_concode.Type = TGSI_DST_REGISTER_EXT_TYPE_CONDCODE;
-   dst_register_ext_concode.CondMask = TGSI_CC_TR;
-   dst_register_ext_concode.CondSwizzleX = TGSI_SWIZZLE_X;
-   dst_register_ext_concode.CondSwizzleY = TGSI_SWIZZLE_Y;
-   dst_register_ext_concode.CondSwizzleZ = TGSI_SWIZZLE_Z;
-   dst_register_ext_concode.CondSwizzleW = TGSI_SWIZZLE_W;
-   dst_register_ext_concode.CondSrcIndex = 0;
-   dst_register_ext_concode.Padding = 0;
-   dst_register_ext_concode.Extended = 0;
-
-   return dst_register_ext_concode;
-}
-
-unsigned
-tgsi_compare_dst_register_ext_concode(
-   struct tgsi_dst_register_ext_concode a,
-   struct tgsi_dst_register_ext_concode b )
-{
-   a.Padding = b.Padding = 0;
-   a.Extended = b.Extended = 0;
-   return compare32(&a, &b);
-}
-
-struct tgsi_dst_register_ext_concode
-tgsi_build_dst_register_ext_concode(
-   unsigned cc,
-   unsigned swizzle_x,
-   unsigned swizzle_y,
-   unsigned swizzle_z,
-   unsigned swizzle_w,
-   int index,
-   struct tgsi_token *prev_token,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header )
-{
-   struct tgsi_dst_register_ext_concode dst_register_ext_concode;
-
-   assert( cc <= TGSI_CC_FL );
-   assert( swizzle_x <= TGSI_SWIZZLE_W );
-   assert( swizzle_y <= TGSI_SWIZZLE_W );
-   assert( swizzle_z <= TGSI_SWIZZLE_W );
-   assert( swizzle_w <= TGSI_SWIZZLE_W );
-   assert( index >= -32768 && index <= 32767 );
-
-   dst_register_ext_concode = tgsi_default_dst_register_ext_concode();
-   dst_register_ext_concode.CondMask = cc;
-   dst_register_ext_concode.CondSwizzleX = swizzle_x;
-   dst_register_ext_concode.CondSwizzleY = swizzle_y;
-   dst_register_ext_concode.CondSwizzleZ = swizzle_z;
-   dst_register_ext_concode.CondSwizzleW = swizzle_w;
-   dst_register_ext_concode.CondSrcIndex = index;
-
-   prev_token->Extended = 1;
-   instruction_grow( instruction, header );
-
-   return dst_register_ext_concode;
-}
-
 struct tgsi_dst_register_ext_modulate
 tgsi_default_dst_register_ext_modulate( void )
 {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.h b/src/gallium/auxiliary/tgsi/tgsi_build.h
index 17d977b059..669712eb8f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.h
@@ -157,30 +157,6 @@ tgsi_build_full_instruction(
    struct tgsi_header *header,
    unsigned maxsize );
 
-struct tgsi_instruction_ext_nv
-tgsi_default_instruction_ext_nv( void );
-
-unsigned
-tgsi_compare_instruction_ext_nv(
-   struct tgsi_instruction_ext_nv a,
-   struct tgsi_instruction_ext_nv b );
-
-struct tgsi_instruction_ext_nv
-tgsi_build_instruction_ext_nv(
-   unsigned precision,
-   unsigned cond_dst_index,
-   unsigned cond_flow_index,
-   unsigned cond_mask,
-   unsigned cond_swizzle_x,
-   unsigned cond_swizzle_y,
-   unsigned cond_swizzle_z,
-   unsigned cond_swizzle_w,
-   unsigned cond_dst_update,
-   unsigned cond_flow_enable,
-   struct tgsi_token *prev_token,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header );
-
 struct tgsi_instruction_ext_label
 tgsi_default_instruction_ext_label( void );
 
@@ -211,6 +187,24 @@ tgsi_build_instruction_ext_texture(
    struct tgsi_instruction *instruction,
    struct tgsi_header *header );
 
+struct tgsi_instruction_ext_predicate
+tgsi_default_instruction_ext_predicate(void);   /* identity swizzle, Negate = 0, SrcIndex = 0 */
+
+unsigned
+tgsi_compare_instruction_ext_predicate(struct tgsi_instruction_ext_predicate a,
+                                       struct tgsi_instruction_ext_predicate b);   /* nonzero if different; ignores Padding and Extended */
+
+struct tgsi_instruction_ext_predicate
+tgsi_build_instruction_ext_predicate(unsigned index,   /* stored in SrcIndex */
+                                     unsigned negate,   /* stored in Negate */
+                                     unsigned swizzleX,
+                                     unsigned swizzleY,
+                                     unsigned swizzleZ,
+                                     unsigned swizzleW,
+                                     struct tgsi_token *prev_token,   /* its Extended bit is set to 1 */
+                                     struct tgsi_instruction *instruction,   /* passed to instruction_grow() */
+                                     struct tgsi_header *header);   /* passed to instruction_grow() */
+
 struct tgsi_src_register
 tgsi_default_src_register( void );
 
@@ -231,28 +225,6 @@ tgsi_build_src_register(
 struct tgsi_full_src_register
 tgsi_default_full_src_register( void );
 
-struct tgsi_src_register_ext_swz
-tgsi_default_src_register_ext_swz( void );
-
-unsigned
-tgsi_compare_src_register_ext_swz(
-   struct tgsi_src_register_ext_swz a,
-   struct tgsi_src_register_ext_swz b );
-
-struct tgsi_src_register_ext_swz
-tgsi_build_src_register_ext_swz(
-   unsigned ext_swizzle_x,
-   unsigned ext_swizzle_y,
-   unsigned ext_swizzle_z,
-   unsigned ext_swizzle_w,
-   unsigned negate_x,
-   unsigned negate_y,
-   unsigned negate_z,
-   unsigned negate_w,
-   struct tgsi_token *prev_token,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header );
-
 struct tgsi_src_register_ext_mod
 tgsi_default_src_register_ext_mod( void );
 
@@ -297,26 +269,6 @@ tgsi_build_dst_register(
 struct tgsi_full_dst_register
 tgsi_default_full_dst_register( void );
 
-struct tgsi_dst_register_ext_concode
-tgsi_default_dst_register_ext_concode( void );
-
-unsigned
-tgsi_compare_dst_register_ext_concode(
-   struct tgsi_dst_register_ext_concode a,
-   struct tgsi_dst_register_ext_concode b );
-
-struct tgsi_dst_register_ext_concode
-tgsi_build_dst_register_ext_concode(
-   unsigned cc,
-   unsigned swizzle_x,
-   unsigned swizzle_y,
-   unsigned swizzle_z,
-   unsigned swizzle_w,
-   int index,
-   struct tgsi_token *prev_token,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header );
-
 struct tgsi_dst_register_ext_modulate
 tgsi_default_dst_register_ext_modulate( void );
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index 111d95b666..d16e64f9c5 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -100,7 +100,8 @@ static const char *file_names[TGSI_FILE_COUNT] =
    "SAMP",
    "ADDR",
    "IMM",
-   "LOOP"
+   "LOOP",
+   "PRED"
 };
 
 static const char *interpolate_names[] =
@@ -148,15 +149,6 @@ static const char *texture_names[] =
    "SHADOWRECT"
 };
 
-static const char *extswizzle_names[] =
-{
-   "x",
-   "y",
-   "z",
-   "w",
-   "0",
-   "1"
-};
 
 static const char *modulate_names[TGSI_MODULATE_COUNT] =
 {
@@ -446,24 +438,6 @@ iter_instruction(
          ENM( src->SrcRegister.SwizzleZ, swizzle_names );
          ENM( src->SrcRegister.SwizzleW, swizzle_names );
       }
-      if (src->SrcRegisterExtSwz.ExtSwizzleX != TGSI_EXTSWIZZLE_X ||
-          src->SrcRegisterExtSwz.ExtSwizzleY != TGSI_EXTSWIZZLE_Y ||
-          src->SrcRegisterExtSwz.ExtSwizzleZ != TGSI_EXTSWIZZLE_Z ||
-          src->SrcRegisterExtSwz.ExtSwizzleW != TGSI_EXTSWIZZLE_W) {
-         CHR( '.' );
-         if (src->SrcRegisterExtSwz.NegateX)
-            TXT("-");
-         ENM( src->SrcRegisterExtSwz.ExtSwizzleX, extswizzle_names );
-         if (src->SrcRegisterExtSwz.NegateY)
-            TXT("-");
-         ENM( src->SrcRegisterExtSwz.ExtSwizzleY, extswizzle_names );
-         if (src->SrcRegisterExtSwz.NegateZ)
-            TXT("-");
-         ENM( src->SrcRegisterExtSwz.ExtSwizzleZ, extswizzle_names );
-         if (src->SrcRegisterExtSwz.NegateW)
-            TXT("-");
-         ENM( src->SrcRegisterExtSwz.ExtSwizzleW, extswizzle_names );
-      }
 
       if (src->SrcRegisterExtMod.Complement)
          CHR( ')' );
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump_c.c b/src/gallium/auxiliary/tgsi/tgsi_dump_c.c
index 4a9c02b141..4648051e29 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump_c.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump_c.c
@@ -79,7 +79,8 @@ static const char *TGSI_FILES[TGSI_FILE_COUNT] =
    "FILE_SAMPLER",
    "FILE_ADDRESS",
    "FILE_IMMEDIATE",
-   "FILE_LOOP"
+   "FILE_LOOP",
+   "FILE_PREDICATE"
 };
 
 static const char *TGSI_INTERPOLATES[] =
@@ -114,32 +115,11 @@ static const char *TGSI_SATS[] =
 
 static const char *TGSI_INSTRUCTION_EXTS[] =
 {
-   "INSTRUCTION_EXT_TYPE_NV",
+   "",
    "INSTRUCTION_EXT_TYPE_LABEL",
    "INSTRUCTION_EXT_TYPE_TEXTURE"
 };
 
-static const char *TGSI_PRECISIONS[] =
-{
-   "PRECISION_DEFAULT",
-   "PRECISION_FLOAT32",
-   "PRECISION_FLOAT16",
-   "PRECISION_FIXED12"
-};
-
-static const char *TGSI_CCS[] =
-{
-   "CC_GT",
-   "CC_EQ",
-   "CC_LT",
-   "CC_UN",
-   "CC_GE",
-   "CC_LE",
-   "CC_NE",
-   "CC_TR",
-   "CC_FL"
-};
-
 static const char *TGSI_SWIZZLES[] =
 {
    "SWIZZLE_X",
@@ -163,20 +143,10 @@ static const char *TGSI_TEXTURES[] =
 
 static const char *TGSI_SRC_REGISTER_EXTS[] =
 {
-   "SRC_REGISTER_EXT_TYPE_SWZ",
+   "",
    "SRC_REGISTER_EXT_TYPE_MOD"
 };
 
-static const char *TGSI_EXTSWIZZLES[] =
-{
-   "EXTSWIZZLE_X",
-   "EXTSWIZZLE_Y",
-   "EXTSWIZZLE_Z",
-   "EXTSWIZZLE_W",
-   "EXTSWIZZLE_ZERO",
-   "EXTSWIZZLE_ONE"
-};
-
 static const char *TGSI_WRITEMASKS[] =
 {
    "0",
@@ -199,7 +169,7 @@ static const char *TGSI_WRITEMASKS[] =
 
 static const char *TGSI_DST_REGISTER_EXTS[] =
 {
-   "DST_REGISTER_EXT_TYPE_CONDCODE",
+   "",
    "DST_REGISTER_EXT_TYPE_MODULATE"
 };
 
@@ -327,60 +297,6 @@ dump_instruction_verbose(
       UIX( inst->Instruction.Padding );
    }
 
-   if( deflt || tgsi_compare_instruction_ext_nv( inst->InstructionExtNv, fi->InstructionExtNv ) ) {
-      EOL();
-      TXT( "\nType          : " );
-      ENM( inst->InstructionExtNv.Type, TGSI_INSTRUCTION_EXTS );
-      if( deflt || fi->InstructionExtNv.Precision != inst->InstructionExtNv.Precision ) {
-         TXT( "\nPrecision     : " );
-         ENM( inst->InstructionExtNv.Precision, TGSI_PRECISIONS );
-      }
-      if( deflt || fi->InstructionExtNv.CondDstIndex != inst->InstructionExtNv.CondDstIndex ) {
-         TXT( "\nCondDstIndex  : " );
-         UID( inst->InstructionExtNv.CondDstIndex );
-      }
-      if( deflt || fi->InstructionExtNv.CondFlowIndex != inst->InstructionExtNv.CondFlowIndex ) {
-         TXT( "\nCondFlowIndex : " );
-         UID( inst->InstructionExtNv.CondFlowIndex );
-      }
-      if( deflt || fi->InstructionExtNv.CondMask != inst->InstructionExtNv.CondMask ) {
-         TXT( "\nCondMask      : " );
-         ENM( inst->InstructionExtNv.CondMask, TGSI_CCS );
-      }
-      if( deflt || fi->InstructionExtNv.CondSwizzleX != inst->InstructionExtNv.CondSwizzleX ) {
-         TXT( "\nCondSwizzleX  : " );
-         ENM( inst->InstructionExtNv.CondSwizzleX, TGSI_SWIZZLES );
-      }
-      if( deflt || fi->InstructionExtNv.CondSwizzleY != inst->InstructionExtNv.CondSwizzleY ) {
-         TXT( "\nCondSwizzleY  : " );
-         ENM( inst->InstructionExtNv.CondSwizzleY, TGSI_SWIZZLES );
-      }
-      if( deflt || fi->InstructionExtNv.CondSwizzleZ != inst->InstructionExtNv.CondSwizzleZ ) {
-         TXT( "\nCondSwizzleZ  : " );
-         ENM( inst->InstructionExtNv.CondSwizzleZ, TGSI_SWIZZLES );
-      }
-      if( deflt || fi->InstructionExtNv.CondSwizzleW != inst->InstructionExtNv.CondSwizzleW ) {
-         TXT( "\nCondSwizzleW  : " );
-         ENM( inst->InstructionExtNv.CondSwizzleW, TGSI_SWIZZLES );
-      }
-      if( deflt || fi->InstructionExtNv.CondDstUpdate != inst->InstructionExtNv.CondDstUpdate ) {
-         TXT( "\nCondDstUpdate : " );
-         UID( inst->InstructionExtNv.CondDstUpdate );
-      }
-      if( deflt || fi->InstructionExtNv.CondFlowEnable != inst->InstructionExtNv.CondFlowEnable ) {
-         TXT( "\nCondFlowEnable: " );
-         UID( inst->InstructionExtNv.CondFlowEnable );
-      }
-      if( ignored ) {
-         TXT( "\nPadding       : " );
-         UIX( inst->InstructionExtNv.Padding );
-         if( deflt || fi->InstructionExtNv.Extended != inst->InstructionExtNv.Extended ) {
-            TXT( "\nExtended      : " );
-            UID( inst->InstructionExtNv.Extended );
-         }
-      }
-   }
-
    if( deflt || tgsi_compare_instruction_ext_label( inst->InstructionExtLabel, fi->InstructionExtLabel ) ) {
       EOL();
       TXT( "\nType    : " );
@@ -451,44 +367,6 @@ dump_instruction_verbose(
          }
       }
 
-      if( deflt || tgsi_compare_dst_register_ext_concode( dst->DstRegisterExtConcode, fd->DstRegisterExtConcode ) ) {
-         EOL();
-         TXT( "\nType        : " );
-         ENM( dst->DstRegisterExtConcode.Type, TGSI_DST_REGISTER_EXTS );
-         if( deflt || fd->DstRegisterExtConcode.CondMask != dst->DstRegisterExtConcode.CondMask ) {
-            TXT( "\nCondMask    : " );
-            ENM( dst->DstRegisterExtConcode.CondMask, TGSI_CCS );
-         }
-         if( deflt || fd->DstRegisterExtConcode.CondSwizzleX != dst->DstRegisterExtConcode.CondSwizzleX ) {
-            TXT( "\nCondSwizzleX: " );
-            ENM( dst->DstRegisterExtConcode.CondSwizzleX, TGSI_SWIZZLES );
-         }
-         if( deflt || fd->DstRegisterExtConcode.CondSwizzleY != dst->DstRegisterExtConcode.CondSwizzleY ) {
-            TXT( "\nCondSwizzleY: " );
-            ENM( dst->DstRegisterExtConcode.CondSwizzleY, TGSI_SWIZZLES );
-         }
-         if( deflt || fd->DstRegisterExtConcode.CondSwizzleZ != dst->DstRegisterExtConcode.CondSwizzleZ ) {
-            TXT( "\nCondSwizzleZ: " );
-            ENM( dst->DstRegisterExtConcode.CondSwizzleZ, TGSI_SWIZZLES );
-         }
-         if( deflt || fd->DstRegisterExtConcode.CondSwizzleW != dst->DstRegisterExtConcode.CondSwizzleW ) {
-            TXT( "\nCondSwizzleW: " );
-            ENM( dst->DstRegisterExtConcode.CondSwizzleW, TGSI_SWIZZLES );
-         }
-         if( deflt || fd->DstRegisterExtConcode.CondSrcIndex != dst->DstRegisterExtConcode.CondSrcIndex ) {
-            TXT( "\nCondSrcIndex: " );
-            UID( dst->DstRegisterExtConcode.CondSrcIndex );
-         }
-         if( ignored ) {
-            TXT( "\nPadding     : " );
-            UIX( dst->DstRegisterExtConcode.Padding );
-            if( deflt || fd->DstRegisterExtConcode.Extended != dst->DstRegisterExtConcode.Extended ) {
-               TXT( "\nExtended    : " );
-               UID( dst->DstRegisterExtConcode.Extended );
-            }
-         }
-      }
-
       if( deflt || tgsi_compare_dst_register_ext_modulate( dst->DstRegisterExtModulate, fd->DstRegisterExtModulate ) ) {
          EOL();
          TXT( "\nType    : " );
@@ -556,52 +434,6 @@ dump_instruction_verbose(
          }
       }
 
-      if( deflt || tgsi_compare_src_register_ext_swz( src->SrcRegisterExtSwz, fs->SrcRegisterExtSwz ) ) {
-         EOL();
-         TXT( "\nType       : " );
-         ENM( src->SrcRegisterExtSwz.Type, TGSI_SRC_REGISTER_EXTS );
-         if( deflt || fs->SrcRegisterExtSwz.ExtSwizzleX != src->SrcRegisterExtSwz.ExtSwizzleX ) {
-            TXT( "\nExtSwizzleX: " );
-            ENM( src->SrcRegisterExtSwz.ExtSwizzleX, TGSI_EXTSWIZZLES );
-         }
-         if( deflt || fs->SrcRegisterExtSwz.ExtSwizzleY != src->SrcRegisterExtSwz.ExtSwizzleY ) {
-            TXT( "\nExtSwizzleY: " );
-            ENM( src->SrcRegisterExtSwz.ExtSwizzleY, TGSI_EXTSWIZZLES );
-         }
-         if( deflt || fs->SrcRegisterExtSwz.ExtSwizzleZ != src->SrcRegisterExtSwz.ExtSwizzleZ ) {
-            TXT( "\nExtSwizzleZ: " );
-            ENM( src->SrcRegisterExtSwz.ExtSwizzleZ, TGSI_EXTSWIZZLES );
-         }
-         if( deflt || fs->SrcRegisterExtSwz.ExtSwizzleW != src->SrcRegisterExtSwz.ExtSwizzleW ) {
-            TXT( "\nExtSwizzleW: " );
-            ENM( src->SrcRegisterExtSwz.ExtSwizzleW, TGSI_EXTSWIZZLES );
-         }
-         if( deflt || fs->SrcRegisterExtSwz.NegateX != src->SrcRegisterExtSwz.NegateX ) {
-            TXT( "\nNegateX   : " );
-            UID( src->SrcRegisterExtSwz.NegateX );
-         }
-         if( deflt || fs->SrcRegisterExtSwz.NegateY != src->SrcRegisterExtSwz.NegateY ) {
-            TXT( "\nNegateY   : " );
-            UID( src->SrcRegisterExtSwz.NegateY );
-         }
-         if( deflt || fs->SrcRegisterExtSwz.NegateZ != src->SrcRegisterExtSwz.NegateZ ) {
-            TXT( "\nNegateZ   : " );
-            UID( src->SrcRegisterExtSwz.NegateZ );
-         }
-         if( deflt || fs->SrcRegisterExtSwz.NegateW != src->SrcRegisterExtSwz.NegateW ) {
-            TXT( "\nNegateW   : " );
-            UID( src->SrcRegisterExtSwz.NegateW );
-         }
-         if( ignored ) {
-            TXT( "\nPadding   : " );
-            UIX( src->SrcRegisterExtSwz.Padding );
-            if( deflt || fs->SrcRegisterExtSwz.Extended != src->SrcRegisterExtSwz.Extended ) {
-               TXT( "\nExtended   : " );
-               UID( src->SrcRegisterExtSwz.Extended );
-            }
-         }
-      }
-
       if( deflt || tgsi_compare_src_register_ext_mod( src->SrcRegisterExtMod, fs->SrcRegisterExtMod ) ) {
          EOL();
          TXT( "\nType     : " );
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index c79c56debd..b7569e74d4 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -107,6 +107,7 @@
 #define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
 #define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C
 #define TEMP_R0            TGSI_EXEC_TEMP_R0
+#define TEMP_P0            TGSI_EXEC_TEMP_P0
 
 #define IS_CHANNEL_ENABLED(INST, CHAN)\
    ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
@@ -210,9 +211,8 @@ tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
          uint channelsWritten = 0x0;
          FOR_EACH_ENABLED_CHANNEL(*inst, chan) {
             /* check if we're reading a channel that's been written */
-            uint swizzle = tgsi_util_get_full_src_register_extswizzle(&inst->FullSrcRegisters[i], chan);
-            if (swizzle <= TGSI_SWIZZLE_W &&
-                (channelsWritten & (1 << swizzle))) {
+            uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->FullSrcRegisters[i], chan);
+            if (channelsWritten & (1 << swizzle)) {
                return TRUE;
             }
 
@@ -338,7 +338,7 @@ tgsi_exec_machine_bind_shader(
-            /* XXX we only handle SOA dependencies properly for MOV/SWZ
+            /* XXX we only handle SOA dependencies properly for MOV
              * at this time!
              */
-            if (opcode != TGSI_OPCODE_MOV && opcode != TGSI_OPCODE_SWZ) {
+            if (opcode != TGSI_OPCODE_MOV) {
                debug_printf("Warning: SOA dependency in instruction"
                             " is not handled:\n");
                tgsi_dump_instruction(&parse.FullToken.FullInstruction,
@@ -1130,10 +1130,10 @@ fetch_src_file_channel(
    union tgsi_exec_channel *chan )
 {
    switch( swizzle ) {
-   case TGSI_EXTSWIZZLE_X:
-   case TGSI_EXTSWIZZLE_Y:
-   case TGSI_EXTSWIZZLE_Z:
-   case TGSI_EXTSWIZZLE_W:
+   case TGSI_SWIZZLE_X:
+   case TGSI_SWIZZLE_Y:
+   case TGSI_SWIZZLE_Z:
+   case TGSI_SWIZZLE_W:
       switch( file ) {
       case TGSI_FILE_CONSTANT:
          assert(mach->Consts);
@@ -1188,6 +1188,17 @@ fetch_src_file_channel(
          chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
          break;
 
+      case TGSI_FILE_PREDICATE: /* stored in Temps[TEMP_P0..], not in Addrs */
+         assert(index->i[0] < TGSI_EXEC_NUM_PREDS);
+         assert(index->i[1] < TGSI_EXEC_NUM_PREDS);
+         assert(index->i[2] < TGSI_EXEC_NUM_PREDS);
+         assert(index->i[3] < TGSI_EXEC_NUM_PREDS);
+         chan->u[0] = mach->Temps[TEMP_P0 + index->i[0]].xyzw[swizzle].u[0];
+         chan->u[1] = mach->Temps[TEMP_P0 + index->i[1]].xyzw[swizzle].u[1];
+         chan->u[2] = mach->Temps[TEMP_P0 + index->i[2]].xyzw[swizzle].u[2];
+         chan->u[3] = mach->Temps[TEMP_P0 + index->i[3]].xyzw[swizzle].u[3];
+         break;
+
       case TGSI_FILE_OUTPUT:
          /* vertex/fragment output vars can be read too */
          chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
@@ -1201,14 +1212,6 @@ fetch_src_file_channel(
       }
       break;
 
-   case TGSI_EXTSWIZZLE_ZERO:
-      *chan = mach->Temps[TEMP_0_I].xyzw[TEMP_0_C];
-      break;
-
-   case TGSI_EXTSWIZZLE_ONE:
-      *chan = mach->Temps[TEMP_1_I].xyzw[TEMP_1_C];
-      break;
-
    default:
       assert( 0 );
    }
@@ -1367,7 +1370,7 @@ fetch_source(
        */
    }
 
-   swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
+   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
    fetch_src_file_channel(
       mach,
       reg->SrcRegister.File,
@@ -1475,119 +1478,17 @@ store_dest(
       dst = &mach->Addrs[index].xyzw[chan_index];
       break;
 
+   case TGSI_FILE_PREDICATE: /* must match the slot read in fetch_src_file_channel */
+      index = reg->DstRegister.Index;
+      assert(index < TGSI_EXEC_NUM_PREDS);
+      dst = &mach->Temps[TEMP_P0 + index].xyzw[chan_index];
+      break;
+
    default:
       assert( 0 );
       return;
    }
 
-   if (inst->InstructionExtNv.CondFlowEnable) {
-      union tgsi_exec_channel *cc = &mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C];
-      uint swizzle;
-      uint shift;
-      uint mask;
-      uint test;
-
-      /* Only CC0 supported.
-       */
-      assert( inst->InstructionExtNv.CondFlowIndex < 1 );
-
-      switch (chan_index) {
-      case CHAN_X:
-         swizzle = inst->InstructionExtNv.CondSwizzleX;
-         break;
-      case CHAN_Y:
-         swizzle = inst->InstructionExtNv.CondSwizzleY;
-         break;
-      case CHAN_Z:
-         swizzle = inst->InstructionExtNv.CondSwizzleZ;
-         break;
-      case CHAN_W:
-         swizzle = inst->InstructionExtNv.CondSwizzleW;
-         break;
-      default:
-         assert( 0 );
-         return;
-      }
-
-      switch (swizzle) {
-      case TGSI_SWIZZLE_X:
-         shift = TGSI_EXEC_CC_X_SHIFT;
-         mask = TGSI_EXEC_CC_X_MASK;
-         break;
-      case TGSI_SWIZZLE_Y:
-         shift = TGSI_EXEC_CC_Y_SHIFT;
-         mask = TGSI_EXEC_CC_Y_MASK;
-         break;
-      case TGSI_SWIZZLE_Z:
-         shift = TGSI_EXEC_CC_Z_SHIFT;
-         mask = TGSI_EXEC_CC_Z_MASK;
-         break;
-      case TGSI_SWIZZLE_W:
-         shift = TGSI_EXEC_CC_W_SHIFT;
-         mask = TGSI_EXEC_CC_W_MASK;
-         break;
-      default:
-         assert( 0 );
-         return;
-      }
-
-      switch (inst->InstructionExtNv.CondMask) {
-      case TGSI_CC_GT:
-         test = ~(TGSI_EXEC_CC_GT << shift) & mask;
-         for (i = 0; i < QUAD_SIZE; i++)
-            if (cc->u[i] & test)
-               execmask &= ~(1 << i);
-         break;
-
-      case TGSI_CC_EQ:
-         test = ~(TGSI_EXEC_CC_EQ << shift) & mask;
-         for (i = 0; i < QUAD_SIZE; i++)
-            if (cc->u[i] & test)
-               execmask &= ~(1 << i);
-         break;
-
-      case TGSI_CC_LT:
-         test = ~(TGSI_EXEC_CC_LT << shift) & mask;
-         for (i = 0; i < QUAD_SIZE; i++)
-            if (cc->u[i] & test)
-               execmask &= ~(1 << i);
-         break;
-
-      case TGSI_CC_GE:
-         test = ~((TGSI_EXEC_CC_GT | TGSI_EXEC_CC_EQ) << shift) & mask;
-         for (i = 0; i < QUAD_SIZE; i++)
-            if (cc->u[i] & test)
-               execmask &= ~(1 << i);
-         break;
-
-      case TGSI_CC_LE:
-         test = ~((TGSI_EXEC_CC_LT | TGSI_EXEC_CC_EQ) << shift) & mask;
-         for (i = 0; i < QUAD_SIZE; i++)
-            if (cc->u[i] & test)
-               execmask &= ~(1 << i);
-         break;
-
-      case TGSI_CC_NE:
-         test = ~((TGSI_EXEC_CC_GT | TGSI_EXEC_CC_LT | TGSI_EXEC_CC_UN) << shift) & mask;
-         for (i = 0; i < QUAD_SIZE; i++)
-            if (cc->u[i] & test)
-               execmask &= ~(1 << i);
-         break;
-
-      case TGSI_CC_TR:
-         break;
-
-      case TGSI_CC_FL:
-         for (i = 0; i < QUAD_SIZE; i++)
-            execmask &= ~(1 << i);
-         break;
-
-      default:
-         assert( 0 );
-         return;
-      }
-   }
-
    switch (inst->Instruction.Saturate) {
    case TGSI_SAT_NONE:
       for (i = 0; i < QUAD_SIZE; i++)
@@ -1622,51 +1523,6 @@ store_dest(
    default:
       assert( 0 );
    }
-
-   if (inst->InstructionExtNv.CondDstUpdate) {
-      union tgsi_exec_channel *cc = &mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C];
-      uint shift;
-      uint mask;
-
-      /* Only CC0 supported.
-       */
-      assert( inst->InstructionExtNv.CondDstIndex < 1 );
-
-      switch (chan_index) {
-      case CHAN_X:
-         shift = TGSI_EXEC_CC_X_SHIFT;
-         mask = ~TGSI_EXEC_CC_X_MASK;
-         break;
-      case CHAN_Y:
-         shift = TGSI_EXEC_CC_Y_SHIFT;
-         mask = ~TGSI_EXEC_CC_Y_MASK;
-         break;
-      case CHAN_Z:
-         shift = TGSI_EXEC_CC_Z_SHIFT;
-         mask = ~TGSI_EXEC_CC_Z_MASK;
-         break;
-      case CHAN_W:
-         shift = TGSI_EXEC_CC_W_SHIFT;
-         mask = ~TGSI_EXEC_CC_W_MASK;
-         break;
-      default:
-         assert( 0 );
-         return;
-      }
-
-      for (i = 0; i < QUAD_SIZE; i++)
-         if (execmask & (1 << i)) {
-            cc->u[i] &= mask;
-            if (dst->f[i] < 0.0f)
-               cc->u[i] |= TGSI_EXEC_CC_LT << shift;
-            else if (dst->f[i] > 0.0f)
-               cc->u[i] |= TGSI_EXEC_CC_GT << shift;
-            else if (dst->f[i] == 0.0f)
-               cc->u[i] |= TGSI_EXEC_CC_EQ << shift;
-            else
-               cc->u[i] |= TGSI_EXEC_CC_UN << shift;
-         }
-   }
 }
 
 #define FETCH(VAL,INDEX,CHAN)\
@@ -1689,10 +1545,8 @@ exec_kil(struct tgsi_exec_machine *mach,
    uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
    union tgsi_exec_channel r[1];
 
-   /* This mask stores component bits that were already tested. Note that
-    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
-    * tested. */
-   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
+   /* This mask stores component bits that were already tested. */
+   uniquemask = 0;
 
    for (chan_index = 0; chan_index < 4; chan_index++)
    {
@@ -1700,7 +1554,7 @@ exec_kil(struct tgsi_exec_machine *mach,
       uint i;
 
       /* unswizzle channel */
-      swizzle = tgsi_util_get_full_src_register_extswizzle (
+      swizzle = tgsi_util_get_full_src_register_swizzle (
                         &inst->FullSrcRegisters[0],
                         chan_index);
 
@@ -1728,32 +1582,8 @@ exec_kilp(struct tgsi_exec_machine *mach,
 {
    uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
 
-   if (inst->InstructionExtNv.CondFlowEnable) {
-      uint swizzle[4];
-      uint chan_index;
-
-      kilmask = 0x0;
-
-      swizzle[0] = inst->InstructionExtNv.CondSwizzleX;
-      swizzle[1] = inst->InstructionExtNv.CondSwizzleY;
-      swizzle[2] = inst->InstructionExtNv.CondSwizzleZ;
-      swizzle[3] = inst->InstructionExtNv.CondSwizzleW;
-
-      for (chan_index = 0; chan_index < 4; chan_index++)
-      {
-         uint i;
-
-         for (i = 0; i < 4; i++) {
-            /* TODO: evaluate the condition code */
-            if (0)
-               kilmask |= 1 << i;
-         }
-      }
-   }
-   else {
-      /* "unconditional" kil */
-      kilmask = mach->ExecMask;
-   }
+   /* "unconditional" kil */
+   kilmask = mach->ExecMask;
    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
 }
 
@@ -1981,8 +1811,8 @@ exec_declaration(
             break;
 
          default:
-            eval = NULL;
             assert( 0 );
+            return;
          }
 
          if( mask == TGSI_WRITEMASK_XYZW ) {
@@ -2031,7 +1861,6 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_MOV:
-   case TGSI_OPCODE_SWZ:
       if (inst->Flags & SOA_DEPENDENCY_FLAG) {
          /* Do all fetches into temp regs, then do all stores to avoid
           * intermediate/accidental clobbering.  This could be done all the
@@ -3223,22 +3052,6 @@ exec_instruction(
       /* no-op */
       break;
 
-   case TGSI_OPCODE_NOISE1:
-      assert( 0 );
-      break;
-
-   case TGSI_OPCODE_NOISE2:
-      assert( 0 );
-      break;
-
-   case TGSI_OPCODE_NOISE3:
-      assert( 0 );
-      break;
-
-   case TGSI_OPCODE_NOISE4:
-      assert( 0 );
-      break;
-
    case TGSI_OPCODE_NOP:
       break;
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index c72f76809d..471f591dd6 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -168,13 +168,18 @@ struct tgsi_exec_labels
 
 #define TGSI_EXEC_TEMP_ADDR         (TGSI_EXEC_NUM_TEMPS + 8)
 #define TGSI_EXEC_NUM_ADDRS         1
-#define TGSI_EXEC_NUM_TEMP_EXTRAS   9
 
+/* predicate register */
+#define TGSI_EXEC_TEMP_P0           (TGSI_EXEC_NUM_TEMPS + 9)
+#define TGSI_EXEC_NUM_PREDS         1
 
+#define TGSI_EXEC_NUM_TEMP_EXTRAS   10
 
-#define TGSI_EXEC_MAX_COND_NESTING  20
-#define TGSI_EXEC_MAX_LOOP_NESTING  20
-#define TGSI_EXEC_MAX_CALL_NESTING  20
+
+
+#define TGSI_EXEC_MAX_COND_NESTING  32
+#define TGSI_EXEC_MAX_LOOP_NESTING  32
+#define TGSI_EXEC_MAX_CALL_NESTING  32
 
 /* The maximum number of input attributes per vertex. For 2D
  * input register files, this is the stride between two 1D
diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c
index 17af4cb7ad..be375cabb8 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -134,10 +134,10 @@ static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] =
    { 0, 0, 0, 0, 0, 1, "BGNSUB", TGSI_OPCODE_BGNSUB },
    { 0, 0, 0, 1, 1, 0, "ENDLOOP", TGSI_OPCODE_ENDLOOP },
    { 0, 0, 0, 0, 1, 0, "ENDSUB", TGSI_OPCODE_ENDSUB },
-   { 1, 1, 0, 0, 0, 0, "NOISE1", TGSI_OPCODE_NOISE1 },
-   { 1, 1, 0, 0, 0, 0, "NOISE2", TGSI_OPCODE_NOISE2 },
-   { 1, 1, 0, 0, 0, 0, "NOISE3", TGSI_OPCODE_NOISE3 },
-   { 1, 1, 0, 0, 0, 0, "NOISE4", TGSI_OPCODE_NOISE4 },
+   { 0, 0, 0, 0, 0, 0, "", 103 },     /* removed */
+   { 0, 0, 0, 0, 0, 0, "", 104 },     /* removed */
+   { 0, 0, 0, 0, 0, 0, "", 105 },     /* removed */
+   { 0, 0, 0, 0, 0, 0, "", 106 },     /* removed */
    { 0, 0, 0, 0, 0, 0, "NOP", TGSI_OPCODE_NOP },
    { 0, 0, 0, 0, 0, 0, "", 108 },     /* removed */
    { 0, 0, 0, 0, 0, 0, "", 109 },     /* removed */
@@ -149,7 +149,7 @@ static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] =
    { 0, 1, 0, 0, 0, 0, "BREAKC", TGSI_OPCODE_BREAKC },
    { 0, 1, 0, 0, 0, 0, "KIL", TGSI_OPCODE_KIL },
    { 0, 0, 0, 0, 0, 0, "END", TGSI_OPCODE_END },
-   { 1, 1, 0, 0, 0, 0, "SWZ", TGSI_OPCODE_SWZ }
+   { 0, 0, 0, 0, 0, 0, "", 118 }      /* removed */
 };
 
 const struct tgsi_opcode_info *
diff --git a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
index e7bcf4bf75..b34263da48 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
@@ -139,10 +139,6 @@ OP00_LBL(BGNLOOP)
 OP00(BGNSUB)
 OP00_LBL(ENDLOOP)
 OP00(ENDSUB)
-OP11(NOISE1)
-OP11(NOISE2)
-OP11(NOISE3)
-OP11(NOISE4)
 OP00(NOP)
 OP11(NRM4)
 OP01(CALLNZ)
@@ -150,7 +146,6 @@ OP01(IFC)
 OP01(BREAKC)
 OP01(KIL)
 OP00(END)
-OP11(SWZ)
 
 
 #undef OP00
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
index 4870f82b6b..83f9df1183 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -181,10 +181,6 @@ tgsi_parse_token(
          next_token( ctx, &token );
 
          switch( token.Type ) {
-         case TGSI_INSTRUCTION_EXT_TYPE_NV:
-            copy_token(&inst->InstructionExtNv, &token);
-            break;
-
          case TGSI_INSTRUCTION_EXT_TYPE_LABEL:
             copy_token(&inst->InstructionExtLabel, &token);
             break;
@@ -193,6 +189,10 @@ tgsi_parse_token(
             copy_token(&inst->InstructionExtTexture, &token);
             break;
 
+         case TGSI_INSTRUCTION_EXT_TYPE_PREDICATE:
+            copy_token(&inst->InstructionExtPredicate, &token);
+            break;
+
          default:
             assert( 0 );
          }
@@ -220,11 +220,6 @@ tgsi_parse_token(
             next_token( ctx, &token );
 
             switch( token.Type ) {
-            case TGSI_DST_REGISTER_EXT_TYPE_CONDCODE:
-               copy_token(&inst->FullDstRegisters[i].DstRegisterExtConcode,
-                          &token);
-               break;
-
             case TGSI_DST_REGISTER_EXT_TYPE_MODULATE:
                copy_token(&inst->FullDstRegisters[i].DstRegisterExtModulate,
                           &token);
@@ -264,11 +259,6 @@ tgsi_parse_token(
             next_token( ctx, &token );
 
             switch( token.Type ) {
-            case TGSI_SRC_REGISTER_EXT_TYPE_SWZ:
-               copy_token(&inst->FullSrcRegisters[i].SrcRegisterExtSwz,
-                          &token);
-               break;
-
             case TGSI_SRC_REGISTER_EXT_TYPE_MOD:
                copy_token(&inst->FullSrcRegisters[i].SrcRegisterExtMod,
                           &token);
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.h b/src/gallium/auxiliary/tgsi/tgsi_parse.h
index a26ee5ba86..76f1676d85 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.h
@@ -49,14 +49,12 @@ struct tgsi_full_dst_register
 {
    struct tgsi_dst_register               DstRegister;
    struct tgsi_src_register               DstRegisterInd;
-   struct tgsi_dst_register_ext_concode   DstRegisterExtConcode;
    struct tgsi_dst_register_ext_modulate  DstRegisterExtModulate;
 };
 
 struct tgsi_full_src_register
 {
    struct tgsi_src_register         SrcRegister;
-   struct tgsi_src_register_ext_swz SrcRegisterExtSwz;
    struct tgsi_src_register_ext_mod SrcRegisterExtMod;
    struct tgsi_src_register         SrcRegisterInd;
    struct tgsi_dimension            SrcRegisterDim;
@@ -82,9 +80,9 @@ struct tgsi_full_immediate
 struct tgsi_full_instruction
 {
    struct tgsi_instruction             Instruction;
-   struct tgsi_instruction_ext_nv      InstructionExtNv;
    struct tgsi_instruction_ext_label   InstructionExtLabel;
    struct tgsi_instruction_ext_texture InstructionExtTexture;
+   struct tgsi_instruction_ext_predicate InstructionExtPredicate;
    struct tgsi_full_dst_register       FullDstRegisters[TGSI_FULL_MAX_DST_REGISTERS];
    struct tgsi_full_src_register       FullSrcRegisters[TGSI_FULL_MAX_SRC_REGISTERS];
    uint Flags;  /**< user-defined usage */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 4b1c7d4e01..617fd7f6be 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -283,14 +283,14 @@ emit_fetch(struct gen_context *gen,
            const struct tgsi_full_src_register *reg,
            const unsigned chan_index)
 {
-   uint swizzle = tgsi_util_get_full_src_register_extswizzle(reg, chan_index);
+   uint swizzle = tgsi_util_get_full_src_register_swizzle(reg, chan_index);
    int dst_vec = -1;
 
    switch (swizzle) {
-   case TGSI_EXTSWIZZLE_X:
-   case TGSI_EXTSWIZZLE_Y:
-   case TGSI_EXTSWIZZLE_Z:
-   case TGSI_EXTSWIZZLE_W:
+   case TGSI_SWIZZLE_X:
+   case TGSI_SWIZZLE_Y:
+   case TGSI_SWIZZLE_Z:
+   case TGSI_SWIZZLE_W:
       switch (reg->SrcRegister.File) {
       case TGSI_FILE_INPUT:
          {
@@ -349,16 +349,6 @@ emit_fetch(struct gen_context *gen,
          assert( 0 );
       }
       break;
-   case TGSI_EXTSWIZZLE_ZERO:
-      ppc_vzero(gen->f, dst_vec);
-      break;
-   case TGSI_EXTSWIZZLE_ONE:
-      {
-         int one_vec = gen_one_vec(gen);
-         dst_vec = ppc_allocate_vec_register(gen->f);
-         ppc_vmove(gen->f, dst_vec, one_vec);
-      }
-      break;
    default:
       assert( 0 );
    }
@@ -418,8 +408,8 @@ equal_src_locs(const struct tgsi_full_src_register *a, uint chan_a,
       return FALSE;
    if (a->SrcRegister.Index != b->SrcRegister.Index)
       return FALSE;
-   swz_a = tgsi_util_get_full_src_register_extswizzle(a, chan_a);
-   swz_b = tgsi_util_get_full_src_register_extswizzle(b, chan_b);
+   swz_a = tgsi_util_get_full_src_register_swizzle(a, chan_a);
+   swz_b = tgsi_util_get_full_src_register_swizzle(b, chan_b);
    if (swz_a != swz_b)
       return FALSE;
    sign_a = tgsi_util_get_full_src_register_sign_mode(a, chan_a);
@@ -635,7 +625,6 @@ emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
          ppc_vlogefp(gen->f, v1, v0);      /* v1 = log2(v0) */
          break;
       case TGSI_OPCODE_MOV:
-      case TGSI_OPCODE_SWZ:
          if (v0 != v1)
             ppc_vmove(gen->f, v1, v0);
          break;
@@ -1119,7 +1108,6 @@ emit_instruction(struct gen_context *gen,
 
    switch (inst->Instruction.Opcode) {
    case TGSI_OPCODE_MOV:
-   case TGSI_OPCODE_SWZ:
    case TGSI_OPCODE_ABS:
    case TGSI_OPCODE_FLR:
    case TGSI_OPCODE_FRC:
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.c b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
index 8a13885da9..36e27ea52f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sanity.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
@@ -141,7 +141,8 @@ static const char *file_names[TGSI_FILE_COUNT] =
    "SAMP",
    "ADDR",
    "IMM",
-   "LOOP"
+   "LOOP",
+   "PRED"
 };
 
 static boolean
@@ -358,7 +359,7 @@ epilog(
 
 boolean
 tgsi_sanity_check(
-   struct tgsi_token *tokens )
+   const struct tgsi_token *tokens )
 {
    struct sanity_check_ctx ctx;
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.h b/src/gallium/auxiliary/tgsi/tgsi_sanity.h
index ca45e94c7a..52263ff883 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sanity.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.h
@@ -40,7 +40,7 @@ extern "C" {
  */
 boolean
 tgsi_sanity_check(
-   struct tgsi_token *tokens );
+   const struct tgsi_token *tokens );
 
 #if defined __cplusplus
 }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index c535788819..f9c16f1b6c 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -132,6 +132,7 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                if (file == TGSI_FILE_INPUT) {
                   info->input_semantic_name[reg] = (ubyte)fulldecl->Semantic.SemanticName;
                   info->input_semantic_index[reg] = (ubyte)fulldecl->Semantic.SemanticIndex;
+                  info->input_interpolate[reg] = (ubyte)fulldecl->Declaration.Interpolate;
                   info->num_inputs++;
                }
                else if (file == TGSI_FILE_OUTPUT) {
@@ -227,11 +228,6 @@ tgsi_is_passthrough_shader(const struct tgsi_token *tokens)
                 src->SrcRegister.SwizzleZ != TGSI_SWIZZLE_Z ||
                 src->SrcRegister.SwizzleW != TGSI_SWIZZLE_W ||
 
-                src->SrcRegisterExtSwz.ExtSwizzleX != TGSI_EXTSWIZZLE_X ||
-                src->SrcRegisterExtSwz.ExtSwizzleY != TGSI_EXTSWIZZLE_Y ||
-                src->SrcRegisterExtSwz.ExtSwizzleZ != TGSI_EXTSWIZZLE_Z ||
-                src->SrcRegisterExtSwz.ExtSwizzleW != TGSI_EXTSWIZZLE_W ||
-
                 dst->DstRegister.WriteMask != TGSI_WRITEMASK_XYZW)
             {
                tgsi_parse_free(&parse);
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index 2c1a75bc81..8a7ee0c7e4 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -45,6 +45,7 @@ struct tgsi_shader_info
    ubyte num_outputs;
    ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; /**< TGSI_SEMANTIC_x */
    ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS];
+   ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS];
    ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */
    ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS];
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index 1e719940ec..a96fc94c7a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -39,8 +39,9 @@
 #include "tgsi/tgsi_info.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
-#include "tgsi_exec.h"
-#include "tgsi_sse2.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_exec.h"
+#include "tgsi/tgsi_sse2.h"
 
 #include "rtasm/rtasm_x86sse.h"
 
@@ -1259,13 +1260,13 @@ emit_fetch(
    const struct tgsi_full_src_register *reg,
    const unsigned chan_index )
 {
-   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
+   unsigned swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
 
    switch (swizzle) {
-   case TGSI_EXTSWIZZLE_X:
-   case TGSI_EXTSWIZZLE_Y:
-   case TGSI_EXTSWIZZLE_Z:
-   case TGSI_EXTSWIZZLE_W:
+   case TGSI_SWIZZLE_X:
+   case TGSI_SWIZZLE_Y:
+   case TGSI_SWIZZLE_Z:
+   case TGSI_SWIZZLE_W:
       switch (reg->SrcRegister.File) {
       case TGSI_FILE_CONSTANT:
          emit_const(
@@ -1307,22 +1308,6 @@ emit_fetch(
       }
       break;
 
-   case TGSI_EXTSWIZZLE_ZERO:
-      emit_tempf(
-         func,
-         xmm,
-         TGSI_EXEC_TEMP_00000000_I,
-         TGSI_EXEC_TEMP_00000000_C );
-      break;
-
-   case TGSI_EXTSWIZZLE_ONE:
-      emit_tempf(
-         func,
-         xmm,
-         TEMP_ONE_I,
-         TEMP_ONE_C );
-      break;
-
    default:
       assert( 0 );
    }
@@ -1360,6 +1345,32 @@ emit_store(
    const struct tgsi_full_instruction *inst,
    unsigned chan_index )
 {
+   switch( inst->Instruction.Saturate ) {
+   case TGSI_SAT_NONE:
+      break;
+
+   case TGSI_SAT_ZERO_ONE:
+      sse_maxps(
+         func,
+         make_xmm( xmm ),
+         get_temp(
+            TGSI_EXEC_TEMP_00000000_I,
+            TGSI_EXEC_TEMP_00000000_C ) );
+
+      sse_minps(
+         func,
+         make_xmm( xmm ),
+         get_temp(
+            TGSI_EXEC_TEMP_ONE_I,
+            TGSI_EXEC_TEMP_ONE_C ) );
+      break;
+
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      assert( 0 );
+      break;
+   }
+
+
    switch( reg->DstRegister.File ) {
    case TGSI_FILE_OUTPUT:
       emit_output(
@@ -1388,19 +1399,6 @@ emit_store(
    default:
       assert( 0 );
    }
-
-   switch( inst->Instruction.Saturate ) {
-   case TGSI_SAT_NONE:
-      break;
-
-   case TGSI_SAT_ZERO_ONE:
-      /* assert( 0 ); */
-      break;
-
-   case TGSI_SAT_MINUS_PLUS_ONE:
-      assert( 0 );
-      break;
-   }
 }
 
 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
@@ -1568,13 +1566,13 @@ emit_kil(
    /* This mask stores component bits that were already tested. Note that
     * we test if the value is less than zero, so 1.0 and 0.0 need not to be
     * tested. */
-   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
+   uniquemask = 0;
 
    FOR_EACH_CHANNEL( chan_index ) {
       unsigned swizzle;
 
       /* unswizzle channel */
-      swizzle = tgsi_util_get_full_src_register_extswizzle(
+      swizzle = tgsi_util_get_full_src_register_swizzle(
          reg,
          chan_index );
 
@@ -1747,14 +1745,6 @@ emit_instruction(
    if (indirect_temp_reference(inst))
       return FALSE;
 
-   /* we don't handle saturation/clamping yet */
-   if (inst->Instruction.Saturate != TGSI_SAT_NONE)
-      return FALSE;
-
-   /* need to use extra temps to fix SOA dependencies : */
-   if (tgsi_check_soa_dependencies(inst))
-      return FALSE;
-
    switch (inst->Instruction.Opcode) {
    case TGSI_OPCODE_ARL:
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
@@ -1766,10 +1756,11 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_MOV:
-   case TGSI_OPCODE_SWZ:
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         STORE( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 4 + chan_index, 0, chan_index );
+      }
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 4 + chan_index, 0, chan_index );
       }
       break;
 
@@ -1847,7 +1838,6 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_RCP:
-   /* TGSI_OPCODE_RECIP */
       FETCH( func, *inst, 0, 0, CHAN_X );
       emit_rcp( func, 0, 0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
@@ -1856,7 +1846,6 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_RSQ:
-   /* TGSI_OPCODE_RECIPSQRT */
       FETCH( func, *inst, 0, 0, CHAN_X );
       emit_abs( func, 0 );
       emit_rsqrt( func, 1, 0 );
@@ -1954,7 +1943,6 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_DP3:
-   /* TGSI_OPCODE_DOT3 */
       FETCH( func, *inst, 0, 0, CHAN_X );
       FETCH( func, *inst, 1, 1, CHAN_X );
       emit_mul( func, 0, 1 );
@@ -1972,7 +1960,6 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_DP4:
-   /* TGSI_OPCODE_DOT4 */
       FETCH( func, *inst, 0, 0, CHAN_X );
       FETCH( func, *inst, 1, 1, CHAN_X );
       emit_mul( func, 0, 1 );
@@ -2043,17 +2030,14 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_SLT:
-   /* TGSI_OPCODE_SETLT */
       emit_setcc( func, inst, cc_LessThan );
       break;
 
    case TGSI_OPCODE_SGE:
-   /* TGSI_OPCODE_SETGE */
       emit_setcc( func, inst, cc_NotLessThan );
       break;
 
    case TGSI_OPCODE_MAD:
-   /* TGSI_OPCODE_MADD */
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( func, *inst, 0, 0, chan_index );
          FETCH( func, *inst, 1, 1, chan_index );
@@ -2283,7 +2267,7 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_SEQ:
-      return 0;
+      emit_setcc( func, inst, cc_Equal );
       break;
 
    case TGSI_OPCODE_SFL:
@@ -2291,7 +2275,7 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_SGT:
-      return 0;
+      emit_setcc( func, inst, cc_NotLessThanEqual );
       break;
 
    case TGSI_OPCODE_SIN:
@@ -2303,11 +2287,11 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_SLE:
-      return 0;
+      emit_setcc( func, inst, cc_LessThanEqual );
       break;
 
    case TGSI_OPCODE_SNE:
-      return 0;
+      emit_setcc( func, inst, cc_NotEqual );
       break;
 
    case TGSI_OPCODE_STR:
@@ -2371,7 +2355,6 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_SSG:
-   /* TGSI_OPCODE_SGN */
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( func, *inst, 0, 0, chan_index );
          emit_sgn( func, 0, 0 );
@@ -2929,6 +2912,21 @@ tgsi_emit_sse2(
                          parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
                          "vertex shader" : "fragment shader");
 	 }
+
+         if (tgsi_check_soa_dependencies(&parse.FullToken.FullInstruction)) {
+            uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
+
+            /* XXX: we only handle src/dst aliasing in a few opcodes
+             * currently.  Need to use an additional temporary to hold
+             * the result in the cases where the code is too opaque to
+             * fix.
+             */
+            if (opcode != TGSI_OPCODE_MOV) {
+               debug_printf("Warning: src/dst aliasing in instruction"
+                            " is not handled:\n");
+               tgsi_dump_instruction(&parse.FullToken.FullInstruction, 1);
+            }
+         }
          break;
 
       case TGSI_TOKEN_TYPE_IMMEDIATE:
diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.c b/src/gallium/auxiliary/tgsi/tgsi_text.c
index d438450b1e..d2b03ffb2f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_text.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_text.c
@@ -232,7 +232,8 @@ static const char *file_names[TGSI_FILE_COUNT] =
    "SAMP",
    "ADDR",
    "IMM",
-   "LOOP"
+   "LOOP",
+   "PRED"
 };
 
 static boolean
@@ -538,13 +539,11 @@ static boolean
 parse_optional_swizzle(
    struct translate_ctx *ctx,
    uint swizzle[4],
-   boolean *parsed_swizzle,
-   boolean *parsed_extswizzle )
+   boolean *parsed_swizzle )
 {
    const char *cur = ctx->cur;
 
    *parsed_swizzle = FALSE;
-   *parsed_extswizzle = FALSE;
 
    eat_opt_white( &cur );
    if (*cur == '.') {
@@ -562,15 +561,8 @@ parse_optional_swizzle(
          else if (uprcase( *cur ) == 'W')
             swizzle[i] = TGSI_SWIZZLE_W;
          else {
-            if (*cur == '0')
-               swizzle[i] = TGSI_EXTSWIZZLE_ZERO;
-            else if (*cur == '1')
-               swizzle[i] = TGSI_EXTSWIZZLE_ONE;
-            else {
-               report_error( ctx, "Expected register swizzle component `x', `y', `z', `w', `0' or `1'" );
-               return FALSE;
-            }
-            *parsed_extswizzle = TRUE;
+            report_error( ctx, "Expected register swizzle component `x', `y', `z' or `w'" );
+            return FALSE;
          }
          cur++;
       }
@@ -595,7 +587,6 @@ parse_src_operand(
    uint swizzle[4];
    boolean parsed_ext_negate_paren = FALSE;
    boolean parsed_swizzle;
-   boolean parsed_extswizzle;
 
    if (*ctx->cur == '-') {
       cur = ctx->cur;
@@ -690,16 +681,8 @@ parse_src_operand(
 
    /* Parse optional swizzle.
     */
-   if (parse_optional_swizzle( ctx, swizzle, &parsed_swizzle, &parsed_extswizzle )) {
-      if (parsed_extswizzle) {
-         assert( parsed_swizzle );
-
-         src->SrcRegisterExtSwz.ExtSwizzleX = swizzle[0];
-         src->SrcRegisterExtSwz.ExtSwizzleY = swizzle[1];
-         src->SrcRegisterExtSwz.ExtSwizzleZ = swizzle[2];
-         src->SrcRegisterExtSwz.ExtSwizzleW = swizzle[3];
-      }
-      else if (parsed_swizzle) {
+   if (parse_optional_swizzle( ctx, swizzle, &parsed_swizzle )) {
+      if (parsed_swizzle) {
          src->SrcRegister.SwizzleX = swizzle[0];
          src->SrcRegister.SwizzleY = swizzle[1];
          src->SrcRegister.SwizzleZ = swizzle[2];
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index f7096bd8e2..3f752e9352 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -29,8 +29,10 @@
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
 #include "tgsi/tgsi_ureg.h"
+#include "tgsi/tgsi_build.h"
 #include "tgsi/tgsi_info.h"
 #include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_sanity.h"
 #include "util/u_memory.h"
 #include "util/u_math.h"
 
@@ -45,18 +47,14 @@ union tgsi_any_token {
    struct tgsi_immediate imm;
    union  tgsi_immediate_data imm_data;
    struct tgsi_instruction insn;
-   struct tgsi_instruction_ext_nv insn_ext_nv;
    struct tgsi_instruction_ext_label insn_ext_label;
    struct tgsi_instruction_ext_texture insn_ext_texture;
    struct tgsi_instruction_ext_predicate insn_ext_predicate;
    struct tgsi_src_register src;
-   struct tgsi_src_register_ext_swz src_ext_swz;
    struct tgsi_src_register_ext_mod src_ext_mod;
    struct tgsi_dimension dim;
    struct tgsi_dst_register dst;
-   struct tgsi_dst_register_ext_concode dst_ext_code;
    struct tgsi_dst_register_ext_modulate dst_ext_mod;
-   struct tgsi_dst_register_ext_predicate dst_ext_pred;
    unsigned value;
 };
 
@@ -70,9 +68,11 @@ struct ureg_tokens {
 
 #define UREG_MAX_INPUT PIPE_MAX_ATTRIBS
 #define UREG_MAX_OUTPUT PIPE_MAX_ATTRIBS
+#define UREG_MAX_CONSTANT_RANGE 32
 #define UREG_MAX_IMMEDIATE 32
 #define UREG_MAX_TEMP 256
 #define UREG_MAX_ADDR 2
+#define UREG_MAX_PRED 1
 
 #define DOMAIN_DECL 0
 #define DOMAIN_INSN 1
@@ -86,8 +86,10 @@ struct ureg_program
       unsigned semantic_name;
       unsigned semantic_index;
       unsigned interp;
-   } input[UREG_MAX_INPUT];
-   unsigned nr_inputs;
+   } fs_input[UREG_MAX_INPUT];
+   unsigned nr_fs_inputs;
+
+   unsigned vs_inputs[UREG_MAX_INPUT/32];
 
    struct {
       unsigned semantic_name;
@@ -107,9 +109,14 @@ struct ureg_program
    unsigned temps_active[UREG_MAX_TEMP / 32];
    unsigned nr_temps;
 
-   unsigned nr_addrs;
+   struct {
+      unsigned first;
+      unsigned last;
+   } constant_range[UREG_MAX_CONSTANT_RANGE];
+   unsigned nr_constant_ranges;
 
-   unsigned nr_constants;
+   unsigned nr_addrs;
+   unsigned nr_preds;
    unsigned nr_instructions;
 
    struct ureg_tokens domain[2];
@@ -119,6 +126,9 @@ static union tgsi_any_token error_tokens[32];
 
 static void tokens_error( struct ureg_tokens *tokens )
 {
+   if (tokens->tokens && tokens->tokens != error_tokens)
+      FREE(tokens->tokens);
+
    tokens->tokens = error_tokens;
    tokens->size = Elements(error_tokens);
    tokens->count = 0;
@@ -130,8 +140,9 @@ static void tokens_expand( struct ureg_tokens *tokens,
 {
    unsigned old_size = tokens->size * sizeof(unsigned);
 
-   if (tokens->tokens == error_tokens)
-      goto fail;
+   if (tokens->tokens == error_tokens) {
+      return;
+   }
 
    while (tokens->count + count > tokens->size) {
       tokens->size = (1 << ++tokens->order);
@@ -140,13 +151,9 @@ static void tokens_expand( struct ureg_tokens *tokens,
    tokens->tokens = REALLOC(tokens->tokens, 
                             old_size,
                             tokens->size * sizeof(unsigned));
-   if (tokens->tokens == NULL) 
-      goto fail;
-
-   return;
-          
-fail:
-   tokens_error(tokens);
+   if (tokens->tokens == NULL) {
+      tokens_error(tokens);
+   }
 }
 
 static void set_bad( struct ureg_program *ureg )
@@ -196,9 +203,13 @@ ureg_dst_register( unsigned file,
    dst.IndirectIndex = 0;
    dst.IndirectSwizzle = 0;
    dst.Saturate  = 0;
+   dst.Predicate = 0;
+   dst.PredNegate = 0;
+   dst.PredSwizzleX = TGSI_SWIZZLE_X;
+   dst.PredSwizzleY = TGSI_SWIZZLE_Y;
+   dst.PredSwizzleZ = TGSI_SWIZZLE_Z;
+   dst.PredSwizzleW = TGSI_SWIZZLE_W;
    dst.Index     = index;
-   dst.Pad1      = 0;
-   dst.Pad2      = 0;
 
    return dst;
 }
@@ -228,25 +239,25 @@ ureg_src_register( unsigned file,
 
 
 
-static struct ureg_src 
-ureg_DECL_input( struct ureg_program *ureg,
-                 unsigned name,
-                 unsigned index,
-                 unsigned interp_mode )
+struct ureg_src 
+ureg_DECL_fs_input( struct ureg_program *ureg,
+                    unsigned name,
+                    unsigned index,
+                    unsigned interp_mode )
 {
    unsigned i;
 
-   for (i = 0; i < ureg->nr_inputs; i++) {
-      if (ureg->input[i].semantic_name == name &&
-          ureg->input[i].semantic_index == index) 
+   for (i = 0; i < ureg->nr_fs_inputs; i++) {
+      if (ureg->fs_input[i].semantic_name == name &&
+          ureg->fs_input[i].semantic_index == index) 
          goto out;
    }
 
-   if (ureg->nr_inputs < UREG_MAX_INPUT) {
-      ureg->input[i].semantic_name = name;
-      ureg->input[i].semantic_index = index;
-      ureg->input[i].interp = interp_mode;
-      ureg->nr_inputs++;
+   if (ureg->nr_fs_inputs < UREG_MAX_INPUT) {
+      ureg->fs_input[i].semantic_name = name;
+      ureg->fs_input[i].semantic_index = index;
+      ureg->fs_input[i].interp = interp_mode;
+      ureg->nr_fs_inputs++;
    }
    else {
       set_bad( ureg );
@@ -257,25 +268,14 @@ out:
 }
 
 
-
-struct ureg_src 
-ureg_DECL_fs_input( struct ureg_program *ureg,
-                    unsigned name,
-                    unsigned index,
-                    unsigned interp )
-{
-   assert(ureg->processor == TGSI_PROCESSOR_FRAGMENT);
-   return ureg_DECL_input( ureg, name, index, interp );
-}
-
-
 struct ureg_src 
 ureg_DECL_vs_input( struct ureg_program *ureg,
-                    unsigned name,
                     unsigned index )
 {
    assert(ureg->processor == TGSI_PROCESSOR_VERTEX);
-   return ureg_DECL_input( ureg, name, index, TGSI_INTERPOLATE_CONSTANT );
+   assert(index < UREG_MAX_INPUT);
+   ureg->vs_inputs[index/32] |= 1u << (index % 32);
+   return ureg_src_register( TGSI_FILE_INPUT, index );
 }
 
 
@@ -313,9 +313,57 @@ out:
  * value or manage any constant_buffer contents -- that's the
  * resposibility of the calling code.
  */
-struct ureg_src ureg_DECL_constant(struct ureg_program *ureg )
+struct ureg_src ureg_DECL_constant(struct ureg_program *ureg, 
+                                   unsigned index )
 {
-   return ureg_src_register( TGSI_FILE_CONSTANT, ureg->nr_constants++ );
+   unsigned minconst = index, maxconst = index;
+   unsigned i;
+
+   /* Inside existing range?
+    */
+   for (i = 0; i < ureg->nr_constant_ranges; i++) {
+      if (ureg->constant_range[i].first <= index &&
+          ureg->constant_range[i].last >= index)
+         goto out;
+   }
+
+   /* Extend existing range?
+    */
+   for (i = 0; i < ureg->nr_constant_ranges; i++) {
+      if (ureg->constant_range[i].last == index - 1) {
+         ureg->constant_range[i].last = index;
+         goto out;
+      }
+
+      if (ureg->constant_range[i].first == index + 1) {
+         ureg->constant_range[i].first = index;
+         goto out;
+      }
+
+      minconst = MIN2(minconst, ureg->constant_range[i].first);
+      maxconst = MAX2(maxconst, ureg->constant_range[i].last);
+   }
+
+   /* Create new range?
+    */
+   if (ureg->nr_constant_ranges < UREG_MAX_CONSTANT_RANGE) {
+      i = ureg->nr_constant_ranges++;
+      ureg->constant_range[i].first = index;
+      ureg->constant_range[i].last = index;
+      goto out;
+   }
+
+   /* No free range slot left: collapse all ranges down to one. */
+   i = 0;
+   ureg->constant_range[0].first = minconst;
+   ureg->constant_range[0].last = maxconst;
+   ureg->nr_constant_ranges = 1;
+
+out:
+   assert(i < ureg->nr_constant_ranges);
+   assert(ureg->constant_range[i].first <= index);
+   assert(ureg->constant_range[i].last >= index);
+   return ureg_src_register( TGSI_FILE_CONSTANT, index );
 }
 
 
@@ -369,6 +417,19 @@ struct ureg_dst ureg_DECL_address( struct ureg_program *ureg )
    return ureg_dst_register( TGSI_FILE_ADDRESS, 0 );
 }
 
+/* Allocate a new predicate register.
+ */
+struct ureg_dst
+ureg_DECL_predicate(struct ureg_program *ureg)
+{
+   if (ureg->nr_preds < UREG_MAX_PRED) {
+      return ureg_dst_register(TGSI_FILE_PREDICATE, ureg->nr_preds++);
+   }
+
+   assert(0);
+   return ureg_dst_register(TGSI_FILE_PREDICATE, 0);
+}
+
 /* Allocate a new sampler.
  */
 struct ureg_src ureg_DECL_sampler( struct ureg_program *ureg,
@@ -566,17 +627,40 @@ ureg_emit_dst( struct ureg_program *ureg,
 }
 
 
+static void validate( unsigned opcode,
+                      unsigned nr_dst,
+                      unsigned nr_src )
+{
+#ifdef DEBUG
+   const struct tgsi_opcode_info *info = tgsi_get_opcode_info( opcode );
+   assert(info);
+   if(info) {
+      assert(nr_dst == info->num_dst);
+      assert(nr_src == info->num_src);
+   }
+#endif
+}
 
-unsigned
+struct ureg_emit_insn_result
 ureg_emit_insn(struct ureg_program *ureg,
                unsigned opcode,
                boolean saturate,
+               boolean predicate,
+               boolean pred_negate,
+               unsigned pred_swizzle_x,
+               unsigned pred_swizzle_y,
+               unsigned pred_swizzle_z,
+               unsigned pred_swizzle_w,
                unsigned num_dst,
                unsigned num_src )
 {
    union tgsi_any_token *out;
+   uint count = predicate ? 2 : 1;
+   struct ureg_emit_insn_result result;
 
-   out = get_tokens( ureg, DOMAIN_INSN, 1 );
+   validate( opcode, num_dst, num_src );
+   
+   out = get_tokens( ureg, DOMAIN_INSN, count );
    out[0].value = 0;
    out[0].insn.Type = TGSI_TOKEN_TYPE_INSTRUCTION;
    out[0].insn.NrTokens = 0;
@@ -585,17 +669,34 @@ ureg_emit_insn(struct ureg_program *ureg,
    out[0].insn.NumDstRegs = num_dst;
    out[0].insn.NumSrcRegs = num_src;
    out[0].insn.Padding = 0;
-   out[0].insn.Extended = 0;
-   
+
+   result.insn_token = ureg->domain[DOMAIN_INSN].count - count;
+
+   if (predicate) {
+      out[0].insn.Extended = 1;
+      out[1].insn_ext_predicate = tgsi_default_instruction_ext_predicate();
+      out[1].insn_ext_predicate.Negate = pred_negate;
+      out[1].insn_ext_predicate.SwizzleX = pred_swizzle_x;
+      out[1].insn_ext_predicate.SwizzleY = pred_swizzle_y;
+      out[1].insn_ext_predicate.SwizzleZ = pred_swizzle_z;
+      out[1].insn_ext_predicate.SwizzleW = pred_swizzle_w;
+
+      result.extended_token = result.insn_token + 1;
+   } else {
+      out[0].insn.Extended = 0;
+
+      result.extended_token = result.insn_token;
+   }
+
    ureg->nr_instructions++;
    
-   return ureg->domain[DOMAIN_INSN].count - 1;
+   return result;
 }
 
 
 void
 ureg_emit_label(struct ureg_program *ureg,
-                unsigned insn_token,
+                unsigned extended_token,
                 unsigned *label_token )
 {
    union tgsi_any_token *out, *insn;
@@ -604,9 +705,9 @@ ureg_emit_label(struct ureg_program *ureg,
       return;
 
    out = get_tokens( ureg, DOMAIN_INSN, 1 );
-   insn = retrieve_token( ureg, DOMAIN_INSN, insn_token );
+   insn = retrieve_token( ureg, DOMAIN_INSN, extended_token );
 
-   insn->insn.Extended = 1;
+   insn->token.Extended = 1;
 
    out[0].value = 0;
    out[0].insn_ext_label.Type = TGSI_INSTRUCTION_EXT_TYPE_LABEL;
@@ -640,15 +741,15 @@ ureg_fixup_label(struct ureg_program *ureg,
 
 void
 ureg_emit_texture(struct ureg_program *ureg,
-                  unsigned insn_token,
+                  unsigned extended_token,
                   unsigned target )
 {
    union tgsi_any_token *out, *insn;
 
    out = get_tokens( ureg, DOMAIN_INSN, 1 );
-   insn = retrieve_token( ureg, DOMAIN_INSN, insn_token );
+   insn = retrieve_token( ureg, DOMAIN_INSN, extended_token );
 
-   insn->insn.Extended = 1;
+   insn->token.Extended = 1;
 
    out[0].value = 0;
    out[0].insn_ext_texture.Type = TGSI_INSTRUCTION_EXT_TYPE_TEXTURE;
@@ -675,23 +776,83 @@ ureg_insn(struct ureg_program *ureg,
           const struct ureg_src *src,
           unsigned nr_src )
 {
-   unsigned insn, i;
+   struct ureg_emit_insn_result insn;
+   unsigned i;
    boolean saturate;
+   boolean predicate;
+   boolean negate = FALSE;
+   unsigned swizzle[4] = { 0 };
 
-#ifdef DEBUG
-   {
-      const struct tgsi_opcode_info *info = tgsi_get_opcode_info( opcode );
-      assert(info);
-      if(info) {
-         assert(nr_dst == info->num_dst);
-         assert(nr_src == info->num_src);
-      }
+   saturate = nr_dst ? dst[0].Saturate : FALSE;
+   predicate = nr_dst ? dst[0].Predicate : FALSE;
+   if (predicate) {
+      negate = dst[0].PredNegate;
+      swizzle[0] = dst[0].PredSwizzleX;
+      swizzle[1] = dst[0].PredSwizzleY;
+      swizzle[2] = dst[0].PredSwizzleZ;
+      swizzle[3] = dst[0].PredSwizzleW;
    }
-#endif
-   
+
+   insn = ureg_emit_insn(ureg,
+                         opcode,
+                         saturate,
+                         predicate,
+                         negate,
+                         swizzle[0],
+                         swizzle[1],
+                         swizzle[2],
+                         swizzle[3],
+                         nr_dst,
+                         nr_src);
+
+   for (i = 0; i < nr_dst; i++)
+      ureg_emit_dst( ureg, dst[i] );
+
+   for (i = 0; i < nr_src; i++)
+      ureg_emit_src( ureg, src[i] );
+
+   ureg_fixup_insn_size( ureg, insn.insn_token );
+}
+
+void
+ureg_tex_insn(struct ureg_program *ureg,
+              unsigned opcode,
+              const struct ureg_dst *dst,
+              unsigned nr_dst,
+              unsigned target,
+              const struct ureg_src *src,
+              unsigned nr_src )
+{
+   struct ureg_emit_insn_result insn;
+   unsigned i;
+   boolean saturate;
+   boolean predicate;
+   boolean negate = FALSE;
+   unsigned swizzle[4] = { 0 };
+
    saturate = nr_dst ? dst[0].Saturate : FALSE;
+   predicate = nr_dst ? dst[0].Predicate : FALSE;
+   if (predicate) {
+      negate = dst[0].PredNegate;
+      swizzle[0] = dst[0].PredSwizzleX;
+      swizzle[1] = dst[0].PredSwizzleY;
+      swizzle[2] = dst[0].PredSwizzleZ;
+      swizzle[3] = dst[0].PredSwizzleW;
+   }
+
+   insn = ureg_emit_insn(ureg,
+                         opcode,
+                         saturate,
+                         predicate,
+                         negate,
+                         swizzle[0],
+                         swizzle[1],
+                         swizzle[2],
+                         swizzle[3],
+                         nr_dst,
+                         nr_src);
 
-   insn = ureg_emit_insn( ureg, opcode, saturate, nr_dst, nr_src );
+   ureg_emit_texture( ureg, insn.extended_token, target );
 
    for (i = 0; i < nr_dst; i++)
       ureg_emit_dst( ureg, dst[i] );
@@ -699,7 +860,38 @@ ureg_insn(struct ureg_program *ureg,
    for (i = 0; i < nr_src; i++)
       ureg_emit_src( ureg, src[i] );
 
-   ureg_fixup_insn_size( ureg, insn );
+   ureg_fixup_insn_size( ureg, insn.insn_token );
+}
+
+
+void
+ureg_label_insn(struct ureg_program *ureg,
+                unsigned opcode,
+                const struct ureg_src *src,
+                unsigned nr_src,
+                unsigned *label_token )
+{
+   struct ureg_emit_insn_result insn;
+   unsigned i;
+
+   insn = ureg_emit_insn(ureg,
+                         opcode,
+                         FALSE,
+                         FALSE,
+                         FALSE,
+                         TGSI_SWIZZLE_X,
+                         TGSI_SWIZZLE_Y,
+                         TGSI_SWIZZLE_Z,
+                         TGSI_SWIZZLE_W,
+                         0,
+                         nr_src);
+
+   ureg_emit_label( ureg, insn.extended_token, label_token );
+
+   for (i = 0; i < nr_src; i++)
+      ureg_emit_src( ureg, src[i] );
+
+   ureg_fixup_insn_size( ureg, insn.insn_token );
 }
 
 
@@ -777,13 +969,22 @@ static void emit_decls( struct ureg_program *ureg )
 {
    unsigned i;
 
-   for (i = 0; i < ureg->nr_inputs; i++) {
-      emit_decl( ureg, 
-                 TGSI_FILE_INPUT, 
-                 i,
-                 ureg->input[i].semantic_name,
-                 ureg->input[i].semantic_index,
-                 ureg->input[i].interp );
+   if (ureg->processor == TGSI_PROCESSOR_VERTEX) {
+      for (i = 0; i < UREG_MAX_INPUT; i++) {
+         if (ureg->vs_inputs[i/32] & (1u << (i%32))) {
+            emit_decl_range( ureg, TGSI_FILE_INPUT, i, 1 );
+         }
+      }
+   }
+   else {
+      for (i = 0; i < ureg->nr_fs_inputs; i++) {
+         emit_decl( ureg, 
+                    TGSI_FILE_INPUT, 
+                    i,
+                    ureg->fs_input[i].semantic_name,
+                    ureg->fs_input[i].semantic_index,
+                    ureg->fs_input[i].interp );
+      }
    }
 
    for (i = 0; i < ureg->nr_outputs; i++) {
@@ -801,10 +1002,13 @@ static void emit_decls( struct ureg_program *ureg )
                        ureg->sampler[i].Index, 1 );
    }
 
-   if (ureg->nr_constants) {
-      emit_decl_range( ureg,
-                       TGSI_FILE_CONSTANT,
-                       0, ureg->nr_constants );
+   if (ureg->nr_constant_ranges) {
+      for (i = 0; i < ureg->nr_constant_ranges; i++)
+         emit_decl_range( ureg,
+                          TGSI_FILE_CONSTANT,
+                          ureg->constant_range[i].first, 
+                          (ureg->constant_range[i].last + 1 -
+                           ureg->constant_range[i].first) );
    }
 
    if (ureg->nr_temps) {
@@ -819,6 +1023,13 @@ static void emit_decls( struct ureg_program *ureg )
                        0, ureg->nr_addrs );
    }
 
+   if (ureg->nr_preds) {
+      emit_decl_range(ureg,
+                      TGSI_FILE_PREDICATE,
+                      0,
+                      ureg->nr_preds);
+   }
+
    for (i = 0; i < ureg->nr_immediates; i++) {
       emit_immediate( ureg,
                       ureg->immediate[i].v );
@@ -890,6 +1101,15 @@ const struct tgsi_token *ureg_finalize( struct ureg_program *ureg )
                    ureg->domain[DOMAIN_DECL].count);
       tgsi_dump( tokens, 0 );
    }
+
+#ifdef DEBUG
+   if (tokens && !tgsi_sanity_check(tokens)) {
+      debug_printf("tgsi_ureg.c, sanity check failed on generated tokens:\n");
+      tgsi_dump(tokens, 0);
+      assert(0);
+   }
+#endif
+
    
    return tokens;
 }
@@ -911,6 +1131,25 @@ void *ureg_create_shader( struct ureg_program *ureg,
 }
 
 
+const struct tgsi_token *ureg_get_tokens( struct ureg_program *ureg,
+                                          unsigned *nr_tokens )
+{
+   const struct tgsi_token *tokens;
+
+   ureg_finalize(ureg);
+
+   tokens = &ureg->domain[DOMAIN_DECL].tokens[0].token;
+
+   if (nr_tokens) 
+      *nr_tokens = ureg->domain[DOMAIN_DECL].count;
+   /* Detach the stream from ureg; the caller owns the memory from here on. */
+   ureg->domain[DOMAIN_DECL].tokens = 0;
+   ureg->domain[DOMAIN_DECL].size = 0;
+   ureg->domain[DOMAIN_DECL].order = 0;
+   ureg->domain[DOMAIN_DECL].count = 0;
+
+   return tokens;
+}
 
 
 struct ureg_program *ureg_create( unsigned processor )
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
index acbca59040..dae4291194 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
@@ -67,9 +67,13 @@ struct ureg_dst
    unsigned WriteMask   : 4;  /* TGSI_WRITEMASK_ */
    unsigned Indirect    : 1;  /* BOOL */
    unsigned Saturate    : 1;  /* BOOL */
+   unsigned Predicate   : 1;
+   unsigned PredNegate  : 1;  /* BOOL */
+   unsigned PredSwizzleX: 2;  /* TGSI_SWIZZLE_ */
+   unsigned PredSwizzleY: 2;  /* TGSI_SWIZZLE_ */
+   unsigned PredSwizzleZ: 2;  /* TGSI_SWIZZLE_ */
+   unsigned PredSwizzleW: 2;  /* TGSI_SWIZZLE_ */
    int      Index       : 16; /* SINT */
-   unsigned Pad1        : 5;
-   unsigned Pad2        : 1;  /* BOOL */
    int      IndirectIndex   : 16; /* SINT */
    int      IndirectSwizzle : 2;  /* TGSI_SWIZZLE_ */
 };
@@ -82,10 +86,21 @@ ureg_create( unsigned processor );
 const struct tgsi_token *
 ureg_finalize( struct ureg_program * );
 
+/* Create and return a shader:
+ */
 void *
 ureg_create_shader( struct ureg_program *,
                     struct pipe_context *pipe );
 
+
+/* Alternately, return the built token stream and hand ownership of
+ * that memory to the caller:
+ */
+const struct tgsi_token *
+ureg_get_tokens( struct ureg_program *ureg,
+                 unsigned *nr_tokens );
+
+
 void 
 ureg_destroy( struct ureg_program * );
 
@@ -116,8 +131,7 @@ ureg_DECL_fs_input( struct ureg_program *,
 
 struct ureg_src
 ureg_DECL_vs_input( struct ureg_program *,
-                    unsigned semantic_name,
-                    unsigned semantic_index );
+                    unsigned index );
 
 struct ureg_dst
 ureg_DECL_output( struct ureg_program *,
@@ -130,7 +144,8 @@ ureg_DECL_immediate( struct ureg_program *,
                      unsigned nr );
 
 struct ureg_src
-ureg_DECL_constant( struct ureg_program * );
+ureg_DECL_constant( struct ureg_program *,
+                    unsigned index );
 
 struct ureg_dst
 ureg_DECL_temporary( struct ureg_program * );
@@ -142,6 +157,9 @@ ureg_release_temporary( struct ureg_program *ureg,
 struct ureg_dst
 ureg_DECL_address( struct ureg_program * );
 
+struct ureg_dst
+ureg_DECL_predicate(struct ureg_program *);
+
 /* Supply an index to the sampler declaration as this is the hook to
  * the external pipe_sampler state.  Users of this function probably
  * don't want just any sampler, but a specific one which they've set
@@ -233,14 +251,43 @@ ureg_insn(struct ureg_program *ureg,
           unsigned nr_src );
 
 
+void
+ureg_tex_insn(struct ureg_program *ureg,
+              unsigned opcode,
+              const struct ureg_dst *dst,
+              unsigned nr_dst,
+              unsigned target,
+              const struct ureg_src *src,
+              unsigned nr_src );
+
+
+void
+ureg_label_insn(struct ureg_program *ureg,
+                unsigned opcode,
+                const struct ureg_src *src,
+                unsigned nr_src,
+                unsigned *label);
+
+
 /***********************************************************************
  * Internal instruction helpers, don't call these directly:
  */
 
-unsigned
+struct ureg_emit_insn_result {
+   unsigned insn_token;       /**< Used to fixup insn size. */
+   unsigned extended_token;   /**< Used to set the Extended bit, usually the same as insn_token. */
+};
+
+struct ureg_emit_insn_result
 ureg_emit_insn(struct ureg_program *ureg,
                unsigned opcode,
                boolean saturate,
+               boolean predicate,
+               boolean pred_negate,
+               unsigned pred_swizzle_x,
+               unsigned pred_swizzle_y,
+               unsigned pred_swizzle_z,
+               unsigned pred_swizzle_w,
                unsigned num_dst,
                unsigned num_src );
 
@@ -271,7 +318,17 @@ ureg_fixup_insn_size(struct ureg_program *ureg,
 static INLINE void ureg_##op( struct ureg_program *ureg )       \
 {                                                               \
    unsigned opcode = TGSI_OPCODE_##op;                          \
-   unsigned insn = ureg_emit_insn( ureg, opcode, FALSE, 0, 0 ); \
+   unsigned insn = ureg_emit_insn(ureg,                         \
+                                  opcode,                       \
+                                  FALSE,                        \
+                                  FALSE,                        \
+                                  FALSE,                        \
+                                  TGSI_SWIZZLE_X,               \
+                                  TGSI_SWIZZLE_Y,               \
+                                  TGSI_SWIZZLE_Z,               \
+                                  TGSI_SWIZZLE_W,               \
+                                  0,                            \
+                                  0).insn_token;                \
    ureg_fixup_insn_size( ureg, insn );                          \
 }
 
@@ -280,7 +337,17 @@ static INLINE void ureg_##op( struct ureg_program *ureg,        \
                               struct ureg_src src )             \
 {                                                               \
    unsigned opcode = TGSI_OPCODE_##op;                          \
-   unsigned insn = ureg_emit_insn( ureg, opcode, FALSE, 0, 1 ); \
+   unsigned insn = ureg_emit_insn(ureg,                         \
+                                  opcode,                       \
+                                  FALSE,                        \
+                                  FALSE,                        \
+                                  FALSE,                        \
+                                  TGSI_SWIZZLE_X,               \
+                                  TGSI_SWIZZLE_Y,               \
+                                  TGSI_SWIZZLE_Z,               \
+                                  TGSI_SWIZZLE_W,               \
+                                  0,                            \
+                                  1).insn_token;                \
    ureg_emit_src( ureg, src );                                  \
    ureg_fixup_insn_size( ureg, insn );                          \
 }
@@ -290,9 +357,20 @@ static INLINE void ureg_##op( struct ureg_program *ureg,        \
                               unsigned *label_token )           \
 {                                                               \
    unsigned opcode = TGSI_OPCODE_##op;                          \
-   unsigned insn = ureg_emit_insn( ureg, opcode, FALSE, 0, 0 ); \
-   ureg_emit_label( ureg, insn, label_token );                  \
-   ureg_fixup_insn_size( ureg, insn );                          \
+   struct ureg_emit_insn_result insn;                           \
+   insn = ureg_emit_insn(ureg,                                  \
+                         opcode,                                \
+                         FALSE,                                 \
+                         FALSE,                                 \
+                         FALSE,                                 \
+                         TGSI_SWIZZLE_X,                        \
+                         TGSI_SWIZZLE_Y,                        \
+                         TGSI_SWIZZLE_Z,                        \
+                         TGSI_SWIZZLE_W,                        \
+                         0,                                     \
+                         0);                                    \
+   ureg_emit_label( ureg, insn.extended_token, label_token );   \
+   ureg_fixup_insn_size( ureg, insn.insn_token );               \
 }
 
 #define OP01_LBL( op )                                          \
@@ -301,10 +379,21 @@ static INLINE void ureg_##op( struct ureg_program *ureg,        \
                               unsigned *label_token )          \
 {                                                               \
    unsigned opcode = TGSI_OPCODE_##op;                          \
-   unsigned insn = ureg_emit_insn( ureg, opcode, FALSE, 0, 1 ); \
-   ureg_emit_label( ureg, insn, label_token );                  \
+   struct ureg_emit_insn_result insn;                           \
+   insn = ureg_emit_insn(ureg,                                  \
+                         opcode,                                \
+                         FALSE,                                 \
+                         FALSE,                                 \
+                         FALSE,                                 \
+                         TGSI_SWIZZLE_X,                        \
+                         TGSI_SWIZZLE_Y,                        \
+                         TGSI_SWIZZLE_Z,                        \
+                         TGSI_SWIZZLE_W,                        \
+                         0,                                     \
+                         1);                                    \
+   ureg_emit_label( ureg, insn.extended_token, label_token );   \
    ureg_emit_src( ureg, src );                                  \
-   ureg_fixup_insn_size( ureg, insn );                          \
+   ureg_fixup_insn_size( ureg, insn.insn_token );               \
 }
 
 #define OP10( op )                                                      \
@@ -312,7 +401,17 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_dst dst )                     \
 {                                                                       \
    unsigned opcode = TGSI_OPCODE_##op;                                  \
-   unsigned insn = ureg_emit_insn( ureg, opcode, dst.Saturate, 1, 0 );  \
+   unsigned insn = ureg_emit_insn(ureg,                                 \
+                                  opcode,                               \
+                                  dst.Saturate,                         \
+                                  dst.Predicate,                        \
+                                  dst.PredNegate,                       \
+                                  dst.PredSwizzleX,                     \
+                                  dst.PredSwizzleY,                     \
+                                  dst.PredSwizzleZ,                     \
+                                  dst.PredSwizzleW,                     \
+                                  1,                                    \
+                                  0).insn_token;                        \
    ureg_emit_dst( ureg, dst );                                          \
    ureg_fixup_insn_size( ureg, insn );                                  \
 }
@@ -324,7 +423,17 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_src src )                     \
 {                                                                       \
    unsigned opcode = TGSI_OPCODE_##op;                                  \
-   unsigned insn = ureg_emit_insn( ureg, opcode, dst.Saturate, 1, 1 );  \
+   unsigned insn = ureg_emit_insn(ureg,                                 \
+                                  opcode,                               \
+                                  dst.Saturate,                         \
+                                  dst.Predicate,                        \
+                                  dst.PredNegate,                       \
+                                  dst.PredSwizzleX,                     \
+                                  dst.PredSwizzleY,                     \
+                                  dst.PredSwizzleZ,                     \
+                                  dst.PredSwizzleW,                     \
+                                  1,                                    \
+                                  1).insn_token;                        \
    ureg_emit_dst( ureg, dst );                                          \
    ureg_emit_src( ureg, src );                                          \
    ureg_fixup_insn_size( ureg, insn );                                  \
@@ -337,7 +446,17 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_src src1 )                    \
 {                                                                       \
    unsigned opcode = TGSI_OPCODE_##op;                                  \
-   unsigned insn = ureg_emit_insn( ureg, opcode, dst.Saturate, 1, 2 );  \
+   unsigned insn = ureg_emit_insn(ureg,                                 \
+                                  opcode,                               \
+                                  dst.Saturate,                         \
+                                  dst.Predicate,                        \
+                                  dst.PredNegate,                       \
+                                  dst.PredSwizzleX,                     \
+                                  dst.PredSwizzleY,                     \
+                                  dst.PredSwizzleZ,                     \
+                                  dst.PredSwizzleW,                     \
+                                  1,                                    \
+                                  2).insn_token;                        \
    ureg_emit_dst( ureg, dst );                                          \
    ureg_emit_src( ureg, src0 );                                         \
    ureg_emit_src( ureg, src1 );                                         \
@@ -352,12 +471,23 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_src src1 )                    \
 {                                                                       \
    unsigned opcode = TGSI_OPCODE_##op;                                  \
-   unsigned insn = ureg_emit_insn( ureg, opcode, dst.Saturate, 1, 2 );  \
-   ureg_emit_texture( ureg, insn, target );                             \
+   struct ureg_emit_insn_result insn;                                   \
+   insn = ureg_emit_insn(ureg,                                          \
+                         opcode,                                        \
+                         dst.Saturate,                                  \
+                         dst.Predicate,                                 \
+                         dst.PredNegate,                                \
+                         dst.PredSwizzleX,                              \
+                         dst.PredSwizzleY,                              \
+                         dst.PredSwizzleZ,                              \
+                         dst.PredSwizzleW,                              \
+                         1,                                             \
+                         2);                                            \
+   ureg_emit_texture( ureg, insn.extended_token, target );              \
    ureg_emit_dst( ureg, dst );                                          \
    ureg_emit_src( ureg, src0 );                                         \
    ureg_emit_src( ureg, src1 );                                         \
-   ureg_fixup_insn_size( ureg, insn );                                  \
+   ureg_fixup_insn_size( ureg, insn.insn_token );                       \
 }
 
 #define OP13( op )                                                      \
@@ -368,7 +498,17 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_src src2 )                    \
 {                                                                       \
    unsigned opcode = TGSI_OPCODE_##op;                                  \
-   unsigned insn = ureg_emit_insn( ureg, opcode, dst.Saturate, 1, 3 );  \
+   unsigned insn = ureg_emit_insn(ureg,                                 \
+                                  opcode,                               \
+                                  dst.Saturate,                         \
+                                  dst.Predicate,                        \
+                                  dst.PredNegate,                       \
+                                  dst.PredSwizzleX,                     \
+                                  dst.PredSwizzleY,                     \
+                                  dst.PredSwizzleZ,                     \
+                                  dst.PredSwizzleW,                     \
+                                  1,                                    \
+                                  3).insn_token;                        \
    ureg_emit_dst( ureg, dst );                                          \
    ureg_emit_src( ureg, src0 );                                         \
    ureg_emit_src( ureg, src1 );                                         \
@@ -386,14 +526,25 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_src src3 )                    \
 {                                                                       \
    unsigned opcode = TGSI_OPCODE_##op;                                  \
-   unsigned insn = ureg_emit_insn( ureg, opcode, dst.Saturate, 1, 4 );  \
-   ureg_emit_texture( ureg, insn, target );                             \
+   struct ureg_emit_insn_result insn;                                   \
+   insn = ureg_emit_insn(ureg,                                          \
+                         opcode,                                        \
+                         dst.Saturate,                                  \
+                         dst.Predicate,                                 \
+                         dst.PredNegate,                                \
+                         dst.PredSwizzleX,                              \
+                         dst.PredSwizzleY,                              \
+                         dst.PredSwizzleZ,                              \
+                         dst.PredSwizzleW,                              \
+                         1,                                             \
+                         4);                                            \
+   ureg_emit_texture( ureg, insn.extended_token, target );              \
    ureg_emit_dst( ureg, dst );                                          \
    ureg_emit_src( ureg, src0 );                                         \
    ureg_emit_src( ureg, src1 );                                         \
    ureg_emit_src( ureg, src2 );                                         \
    ureg_emit_src( ureg, src3 );                                         \
-   ureg_fixup_insn_size( ureg, insn );                                  \
+   ureg_fixup_insn_size( ureg, insn.insn_token );                       \
 }
 
 
@@ -468,6 +619,24 @@ ureg_saturate( struct ureg_dst reg )
    return reg;
 }
 
+static INLINE struct ureg_dst
+ureg_predicate(struct ureg_dst reg,
+               boolean negate,
+               unsigned swizzle_x,
+               unsigned swizzle_y,
+               unsigned swizzle_z,
+               unsigned swizzle_w)
+{
+   assert(reg.File != TGSI_FILE_NULL);
+   reg.Predicate = 1;
+   reg.PredNegate = negate;
+   reg.PredSwizzleX = swizzle_x;
+   reg.PredSwizzleY = swizzle_y;
+   reg.PredSwizzleZ = swizzle_z;
+   reg.PredSwizzleW = swizzle_w;
+   return reg;
+}
+
 static INLINE struct ureg_dst 
 ureg_dst_indirect( struct ureg_dst reg, struct ureg_src addr )
 {
@@ -501,9 +670,13 @@ ureg_dst( struct ureg_src src )
    dst.IndirectIndex = src.IndirectIndex;
    dst.IndirectSwizzle = src.IndirectSwizzle;
    dst.Saturate  = 0;
+   dst.Predicate = 0;
+   dst.PredNegate = 0;
+   dst.PredSwizzleX = TGSI_SWIZZLE_X;
+   dst.PredSwizzleY = TGSI_SWIZZLE_Y;
+   dst.PredSwizzleZ = TGSI_SWIZZLE_Z;
+   dst.PredSwizzleW = TGSI_SWIZZLE_W;
    dst.Index     = src.Index;
-   dst.Pad1      = 0;
-   dst.Pad2      = 0;
 
    return dst;
 }
@@ -542,9 +715,13 @@ ureg_dst_undef( void )
    dst.IndirectIndex = 0;
    dst.IndirectSwizzle = 0;
    dst.Saturate  = 0;
+   dst.Predicate = 0;
+   dst.PredNegate = 0;
+   dst.PredSwizzleX = TGSI_SWIZZLE_X;
+   dst.PredSwizzleY = TGSI_SWIZZLE_Y;
+   dst.PredSwizzleZ = TGSI_SWIZZLE_Z;
+   dst.PredSwizzleW = TGSI_SWIZZLE_W;
    dst.Index     = 0;
-   dst.Pad1      = 0;
-   dst.Pad2      = 0;
 
    return dst;
 }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.c b/src/gallium/auxiliary/tgsi/tgsi_util.c
index 71f8a6ca40..4dee1be9e8 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_util.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_util.c
@@ -69,59 +69,15 @@ tgsi_util_get_src_register_swizzle(
    return 0;
 }
 
-unsigned
-tgsi_util_get_src_register_extswizzle(
-   const struct tgsi_src_register_ext_swz *reg,
-   unsigned component )
-{
-   switch( component ) {
-   case 0:
-      return reg->ExtSwizzleX;
-   case 1:
-      return reg->ExtSwizzleY;
-   case 2:
-      return reg->ExtSwizzleZ;
-   case 3:
-      return reg->ExtSwizzleW;
-   default:
-      assert( 0 );
-   }
-   return 0;
-}
 
 unsigned
-tgsi_util_get_full_src_register_extswizzle(
+tgsi_util_get_full_src_register_swizzle(
    const struct tgsi_full_src_register  *reg,
    unsigned component )
 {
-   unsigned swizzle;
-
-   /*
-    * First, calculate  the   extended swizzle for a given channel. This will give
-    * us either a channel index into the simple swizzle or  a constant 1 or   0.
-    */
-   swizzle = tgsi_util_get_src_register_extswizzle(
-      &reg->SrcRegisterExtSwz,
+   return tgsi_util_get_src_register_swizzle(
+      &reg->SrcRegister,
       component );
-
-   assert (TGSI_SWIZZLE_X == TGSI_EXTSWIZZLE_X);
-   assert (TGSI_SWIZZLE_Y == TGSI_EXTSWIZZLE_Y);
-   assert (TGSI_SWIZZLE_Z == TGSI_EXTSWIZZLE_Z);
-   assert (TGSI_SWIZZLE_W == TGSI_EXTSWIZZLE_W);
-   assert (TGSI_EXTSWIZZLE_ZERO > TGSI_SWIZZLE_W);
-   assert (TGSI_EXTSWIZZLE_ONE > TGSI_SWIZZLE_W);
-
-   /*
-    * Second, calculate the simple  swizzle  for   the   unswizzled channel index.
-    * Leave the constants intact, they are   not   affected by the   simple swizzle.
-    */
-   if( swizzle <= TGSI_SWIZZLE_W ) {
-      swizzle = tgsi_util_get_src_register_swizzle(
-         &reg->SrcRegister,
-         swizzle );
-   }
-
-   return swizzle;
 }
 
 void
@@ -148,74 +104,6 @@ tgsi_util_set_src_register_swizzle(
    }
 }
 
-void
-tgsi_util_set_src_register_extswizzle(
-   struct tgsi_src_register_ext_swz *reg,
-   unsigned swizzle,
-   unsigned component )
-{
-   switch( component ) {
-   case 0:
-      reg->ExtSwizzleX = swizzle;
-      break;
-   case 1:
-      reg->ExtSwizzleY = swizzle;
-      break;
-   case 2:
-      reg->ExtSwizzleZ = swizzle;
-      break;
-   case 3:
-      reg->ExtSwizzleW = swizzle;
-      break;
-   default:
-      assert( 0 );
-   }
-}
-
-unsigned
-tgsi_util_get_src_register_extnegate(
-   const  struct tgsi_src_register_ext_swz *reg,
-   unsigned component )
-{
-   switch( component ) {
-   case 0:
-      return reg->NegateX;
-   case 1:
-      return reg->NegateY;
-   case 2:
-      return reg->NegateZ;
-   case 3:
-      return reg->NegateW;
-   default:
-      assert( 0 );
-   }
-   return 0;
-}
-
-void
-tgsi_util_set_src_register_extnegate(
-   struct tgsi_src_register_ext_swz *reg,
-   unsigned negate,
-   unsigned component )
-{
-   switch( component ) {
-   case 0:
-      reg->NegateX = negate;
-      break;
-   case 1:
-      reg->NegateY = negate;
-      break;
-   case 2:
-      reg->NegateZ = negate;
-      break;
-   case 3:
-      reg->NegateW = negate;
-      break;
-   default:
-      assert( 0 );
-   }
-}
-
 unsigned
 tgsi_util_get_full_src_register_sign_mode(
    const struct  tgsi_full_src_register *reg,
@@ -239,9 +127,7 @@ tgsi_util_get_full_src_register_sign_mode(
       unsigned negate;
 
       negate = reg->SrcRegister.Negate;
-      if( tgsi_util_get_src_register_extnegate( &reg->SrcRegisterExtSwz, component ) ) {
-         negate = !negate;
-      }
+
       if( reg->SrcRegisterExtMod.Negate ) {
          negate = !negate;
       }
@@ -262,11 +148,6 @@ tgsi_util_set_full_src_register_sign_mode(
    struct tgsi_full_src_register *reg,
    unsigned sign_mode )
 {
-   reg->SrcRegisterExtSwz.NegateX = 0;
-   reg->SrcRegisterExtSwz.NegateY = 0;
-   reg->SrcRegisterExtSwz.NegateZ = 0;
-   reg->SrcRegisterExtSwz.NegateW = 0;
-
    switch (sign_mode)
    {
    case TGSI_UTIL_SIGN_CLEAR:
diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.h b/src/gallium/auxiliary/tgsi/tgsi_util.h
index 21eb656327..19ee2e7cf2 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_util.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_util.h
@@ -33,7 +33,6 @@ extern "C" {
 #endif
 
 struct tgsi_src_register;
-struct tgsi_src_register_ext_swz;
 struct tgsi_full_src_register;
 
 void *
@@ -45,13 +44,9 @@ tgsi_util_get_src_register_swizzle(
    const struct tgsi_src_register *reg,
    unsigned component );
 
-unsigned
-tgsi_util_get_src_register_extswizzle(
-   const struct tgsi_src_register_ext_swz *reg,
-   unsigned component);
 
 unsigned
-tgsi_util_get_full_src_register_extswizzle(
+tgsi_util_get_full_src_register_swizzle(
    const struct tgsi_full_src_register *reg,
    unsigned component );
 
@@ -61,23 +56,6 @@ tgsi_util_set_src_register_swizzle(
    unsigned swizzle,
    unsigned component );
 
-void
-tgsi_util_set_src_register_extswizzle(
-   struct tgsi_src_register_ext_swz *reg,
-   unsigned swizzle,
-   unsigned component );
-
-unsigned
-tgsi_util_get_src_register_extnegate(
-   const struct tgsi_src_register_ext_swz *reg,
-   unsigned component );
-
-void
-tgsi_util_set_src_register_extnegate(
-   struct tgsi_src_register_ext_swz *reg,
-   unsigned negate,
-   unsigned component );
-
 #define TGSI_UTIL_SIGN_CLEAR    0   /* Force positive */
 #define TGSI_UTIL_SIGN_SET      1   /* Force negative */
 #define TGSI_UTIL_SIGN_TOGGLE   2   /* Negate */
diff --git a/src/gallium/auxiliary/util/Makefile b/src/gallium/auxiliary/util/Makefile
index ae8d330a78..1d8bb55bbd 100644
--- a/src/gallium/auxiliary/util/Makefile
+++ b/src/gallium/auxiliary/util/Makefile
@@ -10,6 +10,7 @@ C_SOURCES = \
 	u_debug_stack.c \
 	u_blit.c \
 	u_cache.c \
+	u_cpu_detect.c \
 	u_draw_quad.c \
 	u_format.c \
 	u_format_access.c \
diff --git a/src/gallium/auxiliary/util/SConscript b/src/gallium/auxiliary/util/SConscript
index 28a5ab4256..8d99106d0b 100644
--- a/src/gallium/auxiliary/util/SConscript
+++ b/src/gallium/auxiliary/util/SConscript
@@ -24,10 +24,10 @@ util = env.ConvenienceLibrary(
 		'u_bitmask.c',
 		'u_blit.c',
 		'u_cache.c',
+		'u_cpu_detect.c',
 		'u_debug.c',
 		'u_debug_dump.c',
 		'u_debug_memory.c',
-		'u_debug_profile.c',
 		'u_debug_stack.c',
 		'u_debug_symbol.c',
 		'u_draw_quad.c',
diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c
index c516317d70..5038642599 100644
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -46,6 +46,7 @@
 #include "util/u_memory.h"
 #include "util/u_simple_shaders.h"
 #include "util/u_surface.h"
+#include "util/u_rect.h"
 
 #include "cso_cache/cso_context.h"
 
@@ -182,47 +183,7 @@ get_next_slot( struct blit_state *ctx )
 }
                                
 
-/**
- * Setup vertex data for the textured quad we'll draw.
- * Note: y=0=top
- */
-static unsigned
-setup_vertex_data(struct blit_state *ctx,
-                  float x0, float y0, float x1, float y1, float z)
-{
-   unsigned offset;
-
-   ctx->vertices[0][0][0] = x0;
-   ctx->vertices[0][0][1] = y0;
-   ctx->vertices[0][0][2] = z;
-   ctx->vertices[0][1][0] = 0.0f; /*s*/
-   ctx->vertices[0][1][1] = 0.0f; /*t*/
-
-   ctx->vertices[1][0][0] = x1;
-   ctx->vertices[1][0][1] = y0;
-   ctx->vertices[1][0][2] = z;
-   ctx->vertices[1][1][0] = 1.0f; /*s*/
-   ctx->vertices[1][1][1] = 0.0f; /*t*/
-
-   ctx->vertices[2][0][0] = x1;
-   ctx->vertices[2][0][1] = y1;
-   ctx->vertices[2][0][2] = z;
-   ctx->vertices[2][1][0] = 1.0f;
-   ctx->vertices[2][1][1] = 1.0f;
 
-   ctx->vertices[3][0][0] = x0;
-   ctx->vertices[3][0][1] = y1;
-   ctx->vertices[3][0][2] = z;
-   ctx->vertices[3][1][0] = 0.0f;
-   ctx->vertices[3][1][1] = 1.0f;
-
-   offset = get_next_slot( ctx );
-
-   pipe_buffer_write(ctx->pipe->screen, ctx->vbuf,
-                     offset, sizeof(ctx->vertices), ctx->vertices);
-
-   return offset;
-}
 
 
 /**
@@ -315,15 +276,13 @@ util_blit_pixels_writemask(struct blit_state *ctx,
 {
    struct pipe_context *pipe = ctx->pipe;
    struct pipe_screen *screen = pipe->screen;
-   struct pipe_texture texTemp, *tex;
-   struct pipe_surface *texSurf;
+   struct pipe_texture *tex = NULL;
    struct pipe_framebuffer_state fb;
    const int srcW = abs(srcX1 - srcX0);
    const int srcH = abs(srcY1 - srcY0);
-   const int srcLeft = MIN2(srcX0, srcX1);
-   const int srcTop = MIN2(srcY0, srcY1);
    unsigned offset;
    boolean overlap;
+   float s0, t0, s1, t1;
 
    assert(filter == PIPE_TEX_MIPFILTER_NEAREST ||
           filter == PIPE_TEX_MIPFILTER_LINEAR);
@@ -343,7 +302,8 @@ util_blit_pixels_writemask(struct blit_state *ctx,
     * no overlapping.
     * Filter mode should not matter since there's no stretching.
     */
-   if (dst->format == src->format &&
+   if (pipe->surface_copy &&
+       dst->format == src->format &&
        srcX0 < srcX1 &&
        dstX0 < dstX1 &&
        srcY0 < srcY1 &&
@@ -358,54 +318,83 @@ util_blit_pixels_writemask(struct blit_state *ctx,
       return;
    }
    
-   if (srcLeft != srcX0) {
-      /* left-right flip */
-      int tmp = dstX0;
-      dstX0 = dstX1;
-      dstX1 = tmp;
-   }
-
-   if (srcTop != srcY0) {
-      /* up-down flip */
-      int tmp = dstY0;
-      dstY0 = dstY1;
-      dstY1 = tmp;
-   }
-
    assert(screen->is_format_supported(screen, dst->format, PIPE_TEXTURE_2D,
                                       PIPE_TEXTURE_USAGE_RENDER_TARGET, 0));
 
-   /*
-    * XXX for now we're always creating a temporary texture.
-    * Strictly speaking that's not always needed.
+   /* Create a temporary texture when src and dest alias or when src
+    * is anything other than a single-level 2d texture.
+    * 
+    * This can still be improved upon.
     */
+   if (util_same_surface(src, dst) ||
+       src->texture->target != PIPE_TEXTURE_2D ||
+       src->texture->last_level != 0)
+   {
+      struct pipe_texture texTemp;
+      struct pipe_surface *texSurf;
+      const int srcLeft = MIN2(srcX0, srcX1);
+      const int srcTop = MIN2(srcY0, srcY1);
+
+      if (srcLeft != srcX0) {
+         /* left-right flip */
+         int tmp = dstX0;
+         dstX0 = dstX1;
+         dstX1 = tmp;
+      }
+
+      if (srcTop != srcY0) {
+         /* up-down flip */
+         int tmp = dstY0;
+         dstY0 = dstY1;
+         dstY1 = tmp;
+      }
+
+      /* create temp texture */
+      memset(&texTemp, 0, sizeof(texTemp));
+      texTemp.target = PIPE_TEXTURE_2D;
+      texTemp.format = src->format;
+      texTemp.last_level = 0;
+      texTemp.width[0] = srcW;
+      texTemp.height[0] = srcH;
+      texTemp.depth[0] = 1;
+      pf_get_block(src->format, &texTemp.block);
+
+      tex = screen->texture_create(screen, &texTemp);
+      if (!tex)
+         return;
+
+      texSurf = screen->get_tex_surface(screen, tex, 0, 0, 0, 
+                                        PIPE_BUFFER_USAGE_GPU_WRITE);
+
+      /* load temp texture */
+      if (pipe->surface_copy) {
+         pipe->surface_copy(pipe,
+                            texSurf, 0, 0,   /* dest */
+                            src, srcLeft, srcTop, /* src */
+                            srcW, srcH);     /* size */
+      } else {
+         util_surface_copy(pipe, FALSE,
+                           texSurf, 0, 0,   /* dest */
+                           src, srcLeft, srcTop, /* src */
+                           srcW, srcH);     /* size */
+      }
+
+      /* free the surface, update the texture if necessary.
+       */
+      pipe_surface_reference(&texSurf, NULL);
+      s0 = 0.0f; 
+      s1 = 1.0f;
+      t0 = 0.0f;
+      t1 = 1.0f;
+   }
+   else {
+      pipe_texture_reference(&tex, src->texture);
+      s0 = srcX0 / (float)tex->width[0];
+      s1 = srcX1 / (float)tex->width[0];
+      t0 = srcY0 / (float)tex->height[0];
+      t1 = srcY1 / (float)tex->height[0];
+   }
 
-   /* create temp texture */
-   memset(&texTemp, 0, sizeof(texTemp));
-   texTemp.target = PIPE_TEXTURE_2D;
-   texTemp.format = src->format;
-   texTemp.last_level = 0;
-   texTemp.width[0] = srcW;
-   texTemp.height[0] = srcH;
-   texTemp.depth[0] = 1;
-   pf_get_block(src->format, &texTemp.block);
-
-   tex = screen->texture_create(screen, &texTemp);
-   if (!tex)
-      return;
-
-   texSurf = screen->get_tex_surface(screen, tex, 0, 0, 0, 
-                                     PIPE_BUFFER_USAGE_GPU_WRITE);
-
-   /* load temp texture */
-   pipe->surface_copy(pipe,
-                      texSurf, 0, 0,   /* dest */
-                      src, srcLeft, srcTop, /* src */
-                      srcW, srcH);     /* size */
-
-   /* free the surface, update the texture if necessary.
-    */
-   pipe_surface_reference(&texSurf, NULL);
 
    /* save state (restored below) */
    cso_save_blend(ctx->cso);
@@ -447,9 +436,12 @@ util_blit_pixels_writemask(struct blit_state *ctx,
    cso_set_framebuffer(ctx->cso, &fb);
 
    /* draw quad */
-   offset = setup_vertex_data(ctx,
-                              (float) dstX0, (float) dstY0, 
-                              (float) dstX1, (float) dstY1, z);
+   offset = setup_vertex_data_tex(ctx,
+                                  (float) dstX0, (float) dstY0, 
+                                  (float) dstX1, (float) dstY1,
+                                  s0, t0,
+                                  s1, t1,
+                                  z);
 
    util_draw_vertex_buffer(ctx->pipe, ctx->vbuf, offset,
                            PIPE_PRIM_TRIANGLE_FAN,
diff --git a/src/gallium/auxiliary/util/u_clear.h b/src/gallium/auxiliary/util/u_clear.h
index 7c16b32cf9..1e65a035ae 100644
--- a/src/gallium/auxiliary/util/u_clear.h
+++ b/src/gallium/auxiliary/util/u_clear.h
@@ -32,6 +32,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
 #include "util/u_pack_color.h"
+#include "util/u_rect.h"
 
 
 /**
@@ -48,13 +49,22 @@ util_clear(struct pipe_context *pipe,
       unsigned color;
 
       util_pack_color(rgba, ps->format, &color);
-      pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, color);
+      if (pipe->surface_fill) {
+         pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, color);
+      } else {
+         util_surface_fill(pipe, ps, 0, 0, ps->width, ps->height, color);
+      }
    }
 
    if (buffers & PIPE_CLEAR_DEPTHSTENCIL) {
       struct pipe_surface *ps = framebuffer->zsbuf;
 
-      pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height,
-                         util_pack_z_stencil(ps->format, depth, stencil));
+      if (pipe->surface_fill) {
+         pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height,
+                            util_pack_z_stencil(ps->format, depth, stencil));
+      } else {
+         util_surface_fill(pipe, ps, 0, 0, ps->width, ps->height,
+                           util_pack_z_stencil(ps->format, depth, stencil));
+      }
    }
 }
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c b/src/gallium/auxiliary/util/u_cpu_detect.c
index f78706f447..a08241971c 100644
--- a/src/gallium/auxiliary/util/u_cpu_detect.c
+++ b/src/gallium/auxiliary/util/u_cpu_detect.c
@@ -24,23 +24,21 @@
  * 
  **************************************************************************/
 
-/*
- * Based on the work of Eric Anholt <anholt@FreeBSD.org>
+/**
+ * @file
+ * CPU feature detection.
+ *
+ * @author Dennis Smit
+ * @author Based on the work of Eric Anholt <anholt@FreeBSD.org>
  */
 
-/* FIXME: clean this entire file up */
+#include "pipe/p_config.h"
 
+#include "u_debug.h"
 #include "u_cpu_detect.h"
 
-#ifdef __linux__
-#define OS_LINUX
-#endif
-#ifdef WIN32
-#define OS_WIN32
-#endif
-
-#if defined(ARCH_POWERPC)
-#if defined(OS_DARWIN)
+#if defined(PIPE_ARCH_PPC)
+#if defined(PIPE_OS_DARWIN)
 #include <sys/sysctl.h>
 #else
 #include <signal.h>
@@ -48,140 +46,147 @@
 #endif
 #endif
 
-#if defined(OS_NETBSD) || defined(OS_OPENBSD)
+#if defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD)
 #include <sys/param.h>
 #include <sys/sysctl.h>
 #include <machine/cpu.h>
 #endif
 
-#if defined(OS_FREEBSD)
+#if defined(PIPE_OS_FREEBSD)
 #include <sys/types.h>
 #include <sys/sysctl.h>
 #endif
 
-#if defined(OS_LINUX)
+#if defined(PIPE_OS_LINUX)
 #include <signal.h>
 #endif
 
-#if defined(OS_WIN32)
-#include <windows.h>
+#ifdef PIPE_OS_UNIX
+#include <unistd.h>
 #endif
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <string.h>
+#if defined(PIPE_OS_WINDOWS)
+#include <windows.h>
+#if defined(PIPE_CC_MSVC) /* same compiler macro that guards the __cpuid() call in cpuid() */
+#include <intrin.h>
+#endif
+#endif
 
 
-static struct cpu_detect_caps __cpu_detect_caps;
-static int __cpu_detect_initialized = 0;
+struct util_cpu_caps util_cpu_caps;
 
 static int has_cpuid(void);
-static int cpuid(unsigned int ax, unsigned int *p);
+
+#if defined(PIPE_ARCH_X86)
 
 /* The sigill handlers */
-#if defined(ARCH_X86) /* x86 (linux katmai handler check thing) */
-#if defined(OS_LINUX) && defined(_POSIX_SOURCE) && defined(X86_FXSR_MAGIC)
-static void sigill_handler_sse(int signal, struct sigcontext sc)
+#if defined(PIPE_OS_LINUX) /*&& defined(_POSIX_SOURCE) && defined(X86_FXSR_MAGIC)*/
+static void
+sigill_handler_sse(int signal, struct sigcontext sc)
 {
-	/* Both the "xorps %%xmm0,%%xmm0" and "divps %xmm0,%%xmm1"
-	 * instructions are 3 bytes long.  We must increment the instruction
-	 * pointer manually to avoid repeated execution of the offending
-	 * instruction.
-	 *
-	 * If the SIGILL is caused by a divide-by-zero when unmasked
-	 * exceptions aren't supported, the SIMD FPU status and control
-	 * word will be restored at the end of the test, so we don't need
-	 * to worry about doing it here.  Besides, we may not be able to...
-	 */
-	sc.eip += 3;
-
-	__cpu_detect_caps.hasSSE=0;
+   /* Both the "xorps %%xmm0,%%xmm0" and "divps %xmm0,%%xmm1"
+    * instructions are 3 bytes long.  We must increment the instruction
+    * pointer manually to avoid repeated execution of the offending
+    * instruction.
+    *
+    * If the SIGILL is caused by a divide-by-zero when unmasked
+    * exceptions aren't supported, the SIMD FPU status and control
+    * word will be restored at the end of the test, so we don't need
+    * to worry about doing it here.  Besides, we may not be able to...
+    */
+   sc.eip += 3;
+
+   util_cpu_caps.has_sse=0;
 }
 
-static void sigfpe_handler_sse(int signal, struct sigcontext sc)
+static void
+sigfpe_handler_sse(int signal, struct sigcontext sc)
 {
-	if (sc.fpstate->magic != 0xffff) {
-		/* Our signal context has the extended FPU state, so reset the
-		 * divide-by-zero exception mask and clear the divide-by-zero
-		 * exception bit.
-		 */
-		sc.fpstate->mxcsr |= 0x00000200;
-		sc.fpstate->mxcsr &= 0xfffffffb;
-	} else {
-		/* If we ever get here, we're completely hosed.
-		*/
-	}
+   if (sc.fpstate->magic != 0xffff) {
+      /* Our signal context has the extended FPU state, so reset the
+       * divide-by-zero exception mask and clear the divide-by-zero
+       * exception bit.
+       */
+      sc.fpstate->mxcsr |= 0x00000200;
+      sc.fpstate->mxcsr &= 0xfffffffb;
+   } else {
+      /* If we ever get here, we're completely hosed.
+      */
+   }
 }
-#endif
-#endif /* OS_LINUX && _POSIX_SOURCE && X86_FXSR_MAGIC */
+#endif /* PIPE_OS_LINUX && _POSIX_SOURCE && X86_FXSR_MAGIC */
 
-#if defined(OS_WIN32)
-LONG CALLBACK win32_sig_handler_sse(EXCEPTION_POINTERS* ep)
+#if defined(PIPE_OS_WINDOWS)
+static LONG CALLBACK
+win32_sig_handler_sse(EXCEPTION_POINTERS* ep)
 {
-	if(ep->ExceptionRecord->ExceptionCode==EXCEPTION_ILLEGAL_INSTRUCTION){
-		ep->ContextRecord->Eip +=3;
-		__cpu_detect_caps.hasSSE=0;
-		return EXCEPTION_CONTINUE_EXECUTION;
-	}
-	return EXCEPTION_CONTINUE_SEARCH;
+   if(ep->ExceptionRecord->ExceptionCode==EXCEPTION_ILLEGAL_INSTRUCTION){
+      ep->ContextRecord->Eip +=3;
+      util_cpu_caps.has_sse=0;
+      return EXCEPTION_CONTINUE_EXECUTION;
+   }
+   return EXCEPTION_CONTINUE_SEARCH;
 }
-#endif /* OS_WIN32 */
+#endif /* PIPE_OS_WINDOWS */
 
+#endif /* PIPE_ARCH_X86 */
 
-#if defined(ARCH_POWERPC) && !defined(OS_DARWIN)
-static sigjmp_buf __lv_powerpc_jmpbuf;
-static volatile sig_atomic_t __lv_powerpc_canjump = 0;
 
-static void sigill_handler (int sig);
+#if defined(PIPE_ARCH_PPC) && !defined(PIPE_OS_DARWIN)
+static jmp_buf  __lv_powerpc_jmpbuf;
+static volatile sig_atomic_t __lv_powerpc_canjump = 0;
 
-static void sigill_handler (int sig)
+static void
+sigill_handler(int sig)
 {
-	if (!__lv_powerpc_canjump) {
-		signal (sig, SIG_DFL);
-		raise (sig);
-	}
+   if (!__lv_powerpc_canjump) {
+      signal (sig, SIG_DFL);
+      raise (sig);
+   }
 
-	__lv_powerpc_canjump = 0;
-	siglongjmp(__lv_powerpc_jmpbuf, 1);
+   __lv_powerpc_canjump = 0;
+   longjmp(__lv_powerpc_jmpbuf, 1);
 }
+#endif
 
-static void check_os_altivec_support(void)
+#if defined(PIPE_ARCH_PPC)
+static void
+check_os_altivec_support(void)
 {
-#if defined(OS_DARWIN)
-	int sels[2] = {CTL_HW, HW_VECTORUNIT};
-	int has_vu = 0;
-	int len = sizeof (has_vu);
-	int err;
-
-	err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
-
-	if (err == 0) {
-		if (has_vu != 0) {
-			__cpu_detect_caps.hasAltiVec = 1;
-		}
-	}
-#else /* !OS_DARWIN */
-	/* no Darwin, do it the brute-force way */
-	/* this is borrowed from the libmpeg2 library */
-	signal(SIGILL, sigill_handler);
-	if (sigsetjmp(__lv_powerpc_jmpbuf, 1)) {
-		signal(SIGILL, SIG_DFL);
-	} else {
-		__lv_powerpc_canjump = 1;
-
-		__asm __volatile
-			("mtspr 256, %0\n\t"
-			 "vand %%v0, %%v0, %%v0"
-			 :
-			 : "r" (-1));
-
-		signal(SIGILL, SIG_DFL);
-		__cpu_detect_caps.hasAltiVec = 1;
-	}
-#endif
+#if defined(PIPE_OS_DARWIN)
+   int sels[2] = {CTL_HW, HW_VECTORUNIT};
+   int has_vu = 0;
+   size_t len = sizeof (has_vu); /* sysctl() takes a size_t *, not an int * */
+   int err;
+
+   err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
+
+   if (err == 0) {
+      if (has_vu != 0) {
+         util_cpu_caps.has_altivec = 1;
+      }
+   }
+#else /* !PIPE_OS_DARWIN */
+   /* no Darwin, do it the brute-force way */
+   /* this is borrowed from the libmpeg2 library */
+   signal(SIGILL, sigill_handler);
+   if (setjmp(__lv_powerpc_jmpbuf)) {
+      signal(SIGILL, SIG_DFL);
+   } else {
+      __lv_powerpc_canjump = 1;
+
+      __asm __volatile
+         ("mtspr 256, %0\n\t"
+          "vand %%v0, %%v0, %%v0"
+          :
+          : "r" (-1));
+
+      signal(SIGILL, SIG_DFL);
+      util_cpu_caps.has_altivec = 1;
+   }
+#endif /* PIPE_OS_DARWIN */
 }
-#endif
+#endif /* PIPE_ARCH_PPC */
 
 /* If we're running on a processor that can do SSE, let's see if we
  * are allowed to or not.  This will catch 2.4.0 or later kernels that
@@ -189,318 +194,327 @@ static void check_os_altivec_support(void)
  * and RedHat patched 2.2 kernels that have broken exception handling
  * support for user space apps that do SSE.
  */
-static void check_os_katmai_support(void)
+#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64)
+static void
+check_os_katmai_support(void)
 {
-#if defined(ARCH_X86)
-#if defined(OS_FREEBSD)
-	int has_sse=0, ret;
-	int len = sizeof (has_sse);
-
-	ret = sysctlbyname("hw.instruction_sse", &has_sse, &len, NULL, 0);
-	if (ret || !has_sse)
-		__cpu_detect_caps.hasSSE=0;
-
-#elif defined(OS_NETBSD) || defined(OS_OPENBSD)
-	int has_sse, has_sse2, ret, mib[2];
-	int varlen;
-
-	mib[0] = CTL_MACHDEP;
-	mib[1] = CPU_SSE;
-	varlen = sizeof (has_sse);
-
-	ret = sysctl(mib, 2, &has_sse, &varlen, NULL, 0);
-	if (ret < 0 || !has_sse) {
-		__cpu_detect_caps.hasSSE = 0;
-	} else {
-		__cpu_detect_caps.hasSSE = 1;
-	}
-
-	mib[1] = CPU_SSE2;
-	varlen = sizeof (has_sse2);
-	ret = sysctl(mib, 2, &has_sse2, &varlen, NULL, 0);
-	if (ret < 0 || !has_sse2) {
-		__cpu_detect_caps.hasSSE2 = 0;
-	} else {
-		__cpu_detect_caps.hasSSE2 = 1;
-	}
-	__cpu_detect_caps.hasSSE = 0; /* FIXME ?!?!? */
-
-#elif defined(OS_WIN32)
-	LPTOP_LEVEL_EXCEPTION_FILTER exc_fil;
-	if (__cpu_detect_caps.hasSSE) {
-		exc_fil = SetUnhandledExceptionFilter(win32_sig_handler_sse);
-		__asm __volatile ("xorps %xmm0, %xmm0");
-		SetUnhandledExceptionFilter(exc_fil);
-	}
-#elif defined(OS_LINUX)
-	struct sigaction saved_sigill;
-	struct sigaction saved_sigfpe;
-
-	/* Save the original signal handlers.
-	*/
-	sigaction(SIGILL, NULL, &saved_sigill);
-	sigaction(SIGFPE, NULL, &saved_sigfpe);
-
-	signal(SIGILL, (void (*)(int))sigill_handler_sse);
-	signal(SIGFPE, (void (*)(int))sigfpe_handler_sse);
-
-	/* Emulate test for OSFXSR in CR4.  The OS will set this bit if it
-	 * supports the extended FPU save and restore required for SSE.  If
-	 * we execute an SSE instruction on a PIII and get a SIGILL, the OS
-	 * doesn't support Streaming SIMD Exceptions, even if the processor
-	 * does.
-	 */
-	if (__cpu_detect_caps.hasSSE) {
-		__asm __volatile ("xorps %xmm1, %xmm0");
-	}
-
-	/* Emulate test for OSXMMEXCPT in CR4.  The OS will set this bit if
-	 * it supports unmasked SIMD FPU exceptions.  If we unmask the
-	 * exceptions, do a SIMD divide-by-zero and get a SIGILL, the OS
-	 * doesn't support unmasked SIMD FPU exceptions.  If we get a SIGFPE
-	 * as expected, we're okay but we need to clean up after it.
-	 *
-	 * Are we being too stringent in our requirement that the OS support
-	 * unmasked exceptions?  Certain RedHat 2.2 kernels enable SSE by
-	 * setting CR4.OSFXSR but don't support unmasked exceptions.  Win98
-	 * doesn't even support them.  We at least know the user-space SSE
-	 * support is good in kernels that do support unmasked exceptions,
-	 * and therefore to be safe I'm going to leave this test in here.
-	 */
-	if (__cpu_detect_caps.hasSSE) {
-           /* test_os_katmai_exception_support(); */
-	}
-
-	/* Restore the original signal handlers.
-	*/
-	sigaction(SIGILL, &saved_sigill, NULL);
-	sigaction(SIGFPE, &saved_sigfpe, NULL);
+#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_OS_FREEBSD)
+   int has_sse=0, ret;
+   size_t len = sizeof (has_sse); /* sysctlbyname() takes a size_t *, not an int * */
+
+   ret = sysctlbyname("hw.instruction_sse", &has_sse, &len, NULL, 0);
+   if (ret || !has_sse)
+      util_cpu_caps.has_sse=0;
+
+#elif defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD)
+   int has_sse, has_sse2, ret, mib[2];
+   size_t varlen; /* sysctl() takes a size_t *, not an int * */
+
+   mib[0] = CTL_MACHDEP;
+   mib[1] = CPU_SSE;
+   varlen = sizeof (has_sse);
+
+   ret = sysctl(mib, 2, &has_sse, &varlen, NULL, 0);
+   if (ret < 0 || !has_sse) {
+      util_cpu_caps.has_sse = 0;
+   } else {
+      util_cpu_caps.has_sse = 1;
+   }
+
+   mib[1] = CPU_SSE2;
+   varlen = sizeof (has_sse2);
+   ret = sysctl(mib, 2, &has_sse2, &varlen, NULL, 0);
+   if (ret < 0 || !has_sse2) {
+      util_cpu_caps.has_sse2 = 0;
+   } else {
+      util_cpu_caps.has_sse2 = 1;
+   }
+   util_cpu_caps.has_sse = 0; /* FIXME ?!?!? */
+
+#elif defined(PIPE_OS_WINDOWS)
+   LPTOP_LEVEL_EXCEPTION_FILTER exc_fil;
+   if (util_cpu_caps.has_sse) {
+      exc_fil = SetUnhandledExceptionFilter(win32_sig_handler_sse);
+#if defined(PIPE_CC_GCC)
+      __asm __volatile ("xorps %xmm0, %xmm0");
+#elif defined(PIPE_CC_MSVC)
+      __asm {
+          xorps xmm0, xmm0        /* executing SSE instruction */
+      }
+#else
+#error Unsupported compiler
+#endif
+      SetUnhandledExceptionFilter(exc_fil);
+   }
+#elif defined(PIPE_OS_LINUX)
+   struct sigaction saved_sigill;
+   struct sigaction saved_sigfpe;
+
+   /* Save the original signal handlers.
+   */
+   sigaction(SIGILL, NULL, &saved_sigill);
+   sigaction(SIGFPE, NULL, &saved_sigfpe);
+
+   signal(SIGILL, (void (*)(int))sigill_handler_sse);
+   signal(SIGFPE, (void (*)(int))sigfpe_handler_sse);
+
+   /* Emulate test for OSFXSR in CR4.  The OS will set this bit if it
+    * supports the extended FPU save and restore required for SSE.  If
+    * we execute an SSE instruction on a PIII and get a SIGILL, the OS
+    * doesn't support Streaming SIMD Extensions, even if the processor
+    * does.
+    */
+   if (util_cpu_caps.has_sse) {
+      __asm __volatile ("xorps %xmm1, %xmm0");
+   }
+
+   /* Emulate test for OSXMMEXCPT in CR4.  The OS will set this bit if
+    * it supports unmasked SIMD FPU exceptions.  If we unmask the
+    * exceptions, do a SIMD divide-by-zero and get a SIGILL, the OS
+    * doesn't support unmasked SIMD FPU exceptions.  If we get a SIGFPE
+    * as expected, we're okay but we need to clean up after it.
+    *
+    * Are we being too stringent in our requirement that the OS support
+    * unmasked exceptions?  Certain RedHat 2.2 kernels enable SSE by
+    * setting CR4.OSFXSR but don't support unmasked exceptions.  Win98
+    * doesn't even support them.  We at least know the user-space SSE
+    * support is good in kernels that do support unmasked exceptions,
+    * and therefore to be safe I'm going to leave this test in here.
+    */
+   if (util_cpu_caps.has_sse) {
+      /* test_os_katmai_exception_support(); */
+   }
+
+   /* Restore the original signal handlers.
+   */
+   sigaction(SIGILL, &saved_sigill, NULL);
+   sigaction(SIGFPE, &saved_sigfpe, NULL);
 
 #else
-	/* We can't use POSIX signal handling to test the availability of
-	 * SSE, so we disable it by default.
-	 */
-	__cpu_detect_caps.hasSSE = 0;
+   /* We can't use POSIX signal handling to test the availability of
+    * SSE, so we disable it by default.
+    */
+   util_cpu_caps.has_sse = 0;
 #endif /* __linux__ */
 #endif
+
+#if defined(PIPE_ARCH_X86_64)
+   util_cpu_caps.has_sse = 1;
+#endif
 }
 
 
 static int has_cpuid(void)
 {
-#if defined(ARCH_X86)
-	int a, c;
-
-	__asm __volatile
-		("pushf\n"
-		 "popl %0\n"
-		 "movl %0, %1\n"
-		 "xorl $0x200000, %0\n"
-		 "push %0\n"
-		 "popf\n"
-		 "pushf\n"
-		 "popl %0\n"
-		 : "=a" (a), "=c" (c)
-		 :
-		 : "cc");
-
-	return a != c;
+#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_CC_GCC)
+   int a, c;
+   /* Try to flip the ID flag (bit 21, 0x200000) in EFLAGS; CPUID exists only if the change sticks */
+   __asm __volatile
+      ("pushf\n"
+       "popl %0\n"
+       "movl %0, %1\n"
+       "xorl $0x200000, %0\n"
+       "push %0\n"
+       "popf\n"
+       "pushf\n"
+       "popl %0\n"
+       : "=a" (a), "=c" (c)
+       :
+       : "cc");
+
+   return a != c;
+#else
+   /* FIXME: no EFLAGS probe for this compiler; assume CPUID is available */
+   return 1;
+#endif
+#elif defined(PIPE_ARCH_X86_64)
+   return 1;
 #else
-	return 0;
+   return 0;
 #endif
 }
 
-static int cpuid(unsigned int ax, unsigned int *p)
+
+/**
+ * @sa cpuid.h included in gcc-4.3 onwards.
+ * @sa http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
+ */
+static INLINE void
+cpuid(uint32_t ax, uint32_t *p)
 {
-#if defined(ARCH_X86)
-	unsigned int flags;
-
-	__asm __volatile
-		("movl %%ebx, %%esi\n\t"
-		 "cpuid\n\t"
-		 "xchgl %%ebx, %%esi"
-		 : "=a" (p[0]), "=S" (p[1]),
-		 "=c" (p[2]), "=d" (p[3])
-		 : "0" (ax));
-
-	return 0;
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86)
+   __asm __volatile (
+     "xchgl %%ebx, %1\n\t"
+     "cpuid\n\t"
+     "xchgl %%ebx, %1"
+     : "=a" (p[0]),
+       "=S" (p[1]),
+       "=c" (p[2]),
+       "=d" (p[3])
+     : "0" (ax)
+   );
+#elif defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86_64)
+   __asm __volatile (
+     "cpuid\n\t"
+     : "=a" (p[0]),
+       "=b" (p[1]),
+       "=c" (p[2]),
+       "=d" (p[3])
+     : "0" (ax)
+   );
+#elif defined(PIPE_CC_MSVC)
+   __cpuid(p, ax);
 #else
-	return -1;
+   p[0] = 0;
+   p[1] = 0;
+   p[2] = 0;
+   p[3] = 0;
 #endif
 }
+#endif /* X86 or X86_64 */
 
-void cpu_detect_initialize()
+void
+util_cpu_detect(void)
 {
-	unsigned int regs[4];
-	unsigned int regs2[4];
-
-	int mib[2], ncpu;
-	int len;
-
-	memset(&__cpu_detect_caps, 0, sizeof (struct cpu_detect_caps));
-
-	/* Check for arch type */
-#if defined(ARCH_MIPS)
-	__cpu_detect_caps.type = CPU_DETECT_TYPE_MIPS;
-#elif defined(ARCH_ALPHA)
-	__cpu_detect_caps.type = CPU_DETECT_TYPE_ALPHA;
-#elif defined(ARCH_SPARC)
-	__cpu_detect_caps.type = CPU_DETECT_TYPE_SPARC;
-#elif defined(ARCH_X86)
-	__cpu_detect_caps.type = CPU_DETECT_TYPE_X86;
-#elif defined(ARCH_POWERPC)
-	__cpu_detect_caps.type = CPU_DETECT_TYPE_POWERPC;
+   static boolean util_cpu_detect_initialized = FALSE;
+
+   if(util_cpu_detect_initialized)
+      return;
+
+   memset(&util_cpu_caps, 0, sizeof util_cpu_caps);
+
+   /* Check for arch type */
+#if defined(PIPE_ARCH_MIPS)
+   util_cpu_caps.arch = UTIL_CPU_ARCH_MIPS;
+#elif defined(PIPE_ARCH_ALPHA)
+   util_cpu_caps.arch = UTIL_CPU_ARCH_ALPHA;
+#elif defined(PIPE_ARCH_SPARC)
+   util_cpu_caps.arch = UTIL_CPU_ARCH_SPARC;
+#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   util_cpu_caps.arch = UTIL_CPU_ARCH_X86;
+   util_cpu_caps.little_endian = 1;
+#elif defined(PIPE_ARCH_PPC)
+   util_cpu_caps.arch = UTIL_CPU_ARCH_POWERPC;
+   util_cpu_caps.little_endian = 0;
 #else
-	__cpu_detect_caps.type = CPU_DETECT_TYPE_OTHER;
+   util_cpu_caps.arch = UTIL_CPU_ARCH_UNKNOWN;
 #endif
 
-	/* Count the number of CPUs in system */
-#if !defined(OS_WIN32) && !defined(OS_UNKNOWN) && defined(_SC_NPROCESSORS_ONLN)
-	__cpu_detect_caps.nrcpu = sysconf(_SC_NPROCESSORS_ONLN);
-	if (__cpu_detect_caps.nrcpu == -1)
-		__cpu_detect_caps.nrcpu = 1;
-
-#elif defined(OS_NETBSD) || defined(OS_FREEBSD) || defined(OS_OPENBSD)
-
-	mib[0] = CTL_HW;
-	mib[1] = HW_NCPU;
-
-	len = sizeof (ncpu);
-	sysctl(mib, 2, &ncpu, &len, NULL, 0);
-	__cpu_detect_caps.nrcpu = ncpu;
-
+   /* Count the number of CPUs in system */
+#if defined(PIPE_OS_WINDOWS)
+   {
+      SYSTEM_INFO system_info;
+      GetSystemInfo(&system_info);
+      util_cpu_caps.nr_cpus = system_info.dwNumberOfProcessors;
+   }
+#elif defined(PIPE_OS_UNIX) && defined(_SC_NPROCESSORS_ONLN)
+   {
+      /* sysconf() returns a signed long, -1 on error: test it before storing */
+      long nr_online = sysconf(_SC_NPROCESSORS_ONLN);
+      util_cpu_caps.nr_cpus = nr_online < 1 ? 1 : (unsigned)nr_online;
+   }
+#elif defined(PIPE_OS_BSD)
+   {
+      int mib[2] = { CTL_HW, HW_NCPU };
+      int ncpu = 1;
+      size_t len = sizeof (ncpu); /* sysctl() takes a size_t *, not an int * */
+
+      if (sysctl(mib, 2, &ncpu, &len, NULL, 0) != 0 || ncpu < 1)
+         ncpu = 1; /* query failed or returned nonsense */
+      util_cpu_caps.nr_cpus = ncpu;
+   }
 #else
-	__cpu_detect_caps.nrcpu = 1;
+   util_cpu_caps.nr_cpus = 1;
 #endif
 
-#if defined(ARCH_X86)
-	/* No cpuid, old 486 or lower */
-	if (has_cpuid() == 0)
-		return;
-
-	__cpu_detect_caps.cacheline = 32;
-
-	/* Get max cpuid level */
-	cpuid(0x00000000, regs);
-
-	if (regs[0] >= 0x00000001) {
-		unsigned int cacheline;
-
-		cpuid (0x00000001, regs2);
-
-		__cpu_detect_caps.x86cpuType = (regs2[0] >> 8) & 0xf;
-		if (__cpu_detect_caps.x86cpuType == 0xf)
-		    __cpu_detect_caps.x86cpuType = 8 + ((regs2[0] >> 20) & 255); /* use extended family (P4, IA64) */
-
-		/* general feature flags */
-		__cpu_detect_caps.hasTSC  = (regs2[3] & (1 << 8  )) >>  8; /* 0x0000010 */
-		__cpu_detect_caps.hasMMX  = (regs2[3] & (1 << 23 )) >> 23; /* 0x0800000 */
-		__cpu_detect_caps.hasSSE  = (regs2[3] & (1 << 25 )) >> 25; /* 0x2000000 */
-		__cpu_detect_caps.hasSSE2 = (regs2[3] & (1 << 26 )) >> 26; /* 0x4000000 */
-		__cpu_detect_caps.hasSSE3 = (regs2[2] & (1));	       /* 0x0000001 */
-		__cpu_detect_caps.hasSSSE3 = (regs2[2] & (1 << 9 )) >> 9;   /* 0x0000020 */
-		__cpu_detect_caps.hasMMX2 = __cpu_detect_caps.hasSSE; /* SSE cpus supports mmxext too */
-
-		cacheline = ((regs2[1] >> 8) & 0xFF) * 8;
-		if (cacheline > 0)
-			__cpu_detect_caps.cacheline = cacheline;
-	}
-
-	cpuid(0x80000000, regs);
-
-	if (regs[0] >= 0x80000001) {
-
-		cpuid(0x80000001, regs2);
-
-		__cpu_detect_caps.hasMMX  |= (regs2[3] & (1 << 23 )) >> 23; /* 0x0800000 */
-		__cpu_detect_caps.hasMMX2 |= (regs2[3] & (1 << 22 )) >> 22; /* 0x400000 */
-		__cpu_detect_caps.has3DNow    = (regs2[3] & (1 << 31 )) >> 31; /* 0x80000000 */
-		__cpu_detect_caps.has3DNowExt = (regs2[3] & (1 << 30 )) >> 30;
-	}
-
-	if (regs[0] >= 0x80000006) {
-		cpuid(0x80000006, regs2);
-		__cpu_detect_caps.cacheline = regs2[2] & 0xFF;
-	}
-
-
-#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_CYGWIN) || defined(OS_OPENBSD)
-	if (__cpu_detect_caps.hasSSE)
-		check_os_katmai_support();
-
-	if (!__cpu_detect_caps.hasSSE) {
-		__cpu_detect_caps.hasSSE2 = 0;
-		__cpu_detect_caps.hasSSE3 = 0;
-		__cpu_detect_caps.hasSSSE3 = 0;
-	}
-#else
-	__cpu_detect_caps.hasSSE = 0;
-	__cpu_detect_caps.hasSSE2 = 0;
-	__cpu_detect_caps.hasSSE3 = 0;
-	__cpu_detect_caps.hasSSSE3 = 0;
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   if (has_cpuid()) {
+      uint32_t regs[4];
+      uint32_t regs2[4];
+
+      util_cpu_caps.cacheline = 32;
+
+      /* Get max cpuid level */
+      cpuid(0x00000000, regs);
+
+      if (regs[0] >= 0x00000001) {
+         unsigned int cacheline;
+
+         cpuid (0x00000001, regs2);
+
+         util_cpu_caps.x86_cpu_type = (regs2[0] >> 8) & 0xf;
+         if (util_cpu_caps.x86_cpu_type == 0xf)
+             util_cpu_caps.x86_cpu_type = 8 + ((regs2[0] >> 20) & 255); /* use extended family (P4, IA64) */
+
+         /* general feature flags: regs2[3] holds EDX and regs2[2] holds ECX of CPUID leaf 1 */
+         util_cpu_caps.has_tsc    = (regs2[3] >>  4) & 1; /* 0x0000010 */
+         util_cpu_caps.has_mmx    = (regs2[3] >> 23) & 1; /* 0x0800000 */
+         util_cpu_caps.has_sse    = (regs2[3] >> 25) & 1; /* 0x2000000 */
+         util_cpu_caps.has_sse2   = (regs2[3] >> 26) & 1; /* 0x4000000 */
+         util_cpu_caps.has_sse3   = (regs2[2] >>  0) & 1; /* 0x0000001 */
+         util_cpu_caps.has_ssse3  = (regs2[2] >>  9) & 1; /* 0x0000200 */
+         util_cpu_caps.has_sse4_1 = (regs2[2] >> 19) & 1; /* 0x0080000 */
+         util_cpu_caps.has_mmx2   = util_cpu_caps.has_sse; /* SSE CPUs support mmxext too */
+
+         cacheline = ((regs2[1] >> 8) & 0xFF) * 8;
+         if (cacheline > 0)
+            util_cpu_caps.cacheline = cacheline;
+      }
+
+      cpuid(0x80000000, regs);
+
+      if (regs[0] >= 0x80000001) {
+
+         cpuid(0x80000001, regs2);
+
+         util_cpu_caps.has_mmx  |= (regs2[3] >> 23) & 1; /* 0x0800000 */
+         util_cpu_caps.has_mmx2 |= (regs2[3] >> 22) & 1; /* 0x0400000 */
+         util_cpu_caps.has_3dnow    = (regs2[3] >> 31) & 1; /* 0x80000000; shift first, 1 << 31 overflows int */
+         util_cpu_caps.has_3dnow_ext = (regs2[3] >> 30) & 1; /* 0x40000000 */
+      }
+
+      if (regs[0] >= 0x80000006) {
+         cpuid(0x80000006, regs2);
+         util_cpu_caps.cacheline = regs2[2] & 0xFF;
+      }
+
+      if (util_cpu_caps.has_sse)
+         check_os_katmai_support();
+
+      if (!util_cpu_caps.has_sse) {
+         util_cpu_caps.has_sse2 = 0;
+         util_cpu_caps.has_sse3 = 0;
+         util_cpu_caps.has_ssse3 = 0;
+         util_cpu_caps.has_sse4_1 = 0;
+      }
+   }
+#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */
+
+#if defined(PIPE_ARCH_PPC)
+   check_os_altivec_support();
+#endif /* PIPE_ARCH_PPC */
+
+#ifdef DEBUG
+   debug_printf("util_cpu_caps.arch = %i\n", util_cpu_caps.arch);
+   debug_printf("util_cpu_caps.nr_cpus = %u\n", util_cpu_caps.nr_cpus);
+
+   debug_printf("util_cpu_caps.x86_cpu_type = %u\n", util_cpu_caps.x86_cpu_type);
+   debug_printf("util_cpu_caps.cacheline = %u\n", util_cpu_caps.cacheline);
+
+   debug_printf("util_cpu_caps.has_tsc = %u\n", util_cpu_caps.has_tsc);
+   debug_printf("util_cpu_caps.has_mmx = %u\n", util_cpu_caps.has_mmx);
+   debug_printf("util_cpu_caps.has_mmx2 = %u\n", util_cpu_caps.has_mmx2);
+   debug_printf("util_cpu_caps.has_sse = %u\n", util_cpu_caps.has_sse);
+   debug_printf("util_cpu_caps.has_sse2 = %u\n", util_cpu_caps.has_sse2);
+   debug_printf("util_cpu_caps.has_sse3 = %u\n", util_cpu_caps.has_sse3);
+   debug_printf("util_cpu_caps.has_ssse3 = %u\n", util_cpu_caps.has_ssse3);
+   debug_printf("util_cpu_caps.has_sse4_1 = %u\n", util_cpu_caps.has_sse4_1);
+   debug_printf("util_cpu_caps.has_3dnow = %u\n", util_cpu_caps.has_3dnow);
+   debug_printf("util_cpu_caps.has_3dnow_ext = %u\n", util_cpu_caps.has_3dnow_ext);
+   debug_printf("util_cpu_caps.has_altivec = %u\n", util_cpu_caps.has_altivec);
 #endif
-#endif /* ARCH_X86 */
-
-#if defined(ARCH_POWERPC)
-	check_os_altivec_support();
-#endif /* ARCH_POWERPC */
-
-	__cpu_detect_initialized = 1;
-}
-
-struct cpu_detect_caps *cpu_detect_get_caps()
-{
-	return &__cpu_detect_caps;
-}
-
-/* The getters and setters for feature flags */
-int cpu_detect_get_tsc()
-{
-	return __cpu_detect_caps.hasTSC;
-}
-
-int cpu_detect_get_mmx()
-{
-	return __cpu_detect_caps.hasMMX;
-}
-
-int cpu_detect_get_mmx2()
-{
-	return __cpu_detect_caps.hasMMX2;
-}
 
-int cpu_detect_get_sse()
-{
-	return __cpu_detect_caps.hasSSE;
-}
-
-int cpu_detect_get_sse2()
-{
-	return __cpu_detect_caps.hasSSE2;
-}
-
-int cpu_detect_get_sse3()
-{
-	return __cpu_detect_caps.hasSSE3;
-}
-
-int cpu_detect_get_ssse3()
-{
-	return __cpu_detect_caps.hasSSSE3;
+   util_cpu_detect_initialized = TRUE;
 }
-
-int cpu_detect_get_3dnow()
-{
-	return __cpu_detect_caps.has3DNow;
-}
-
-int cpu_detect_get_3dnow2()
-{
-	return __cpu_detect_caps.has3DNowExt;
-}
-
-int cpu_detect_get_altivec()
-{
-	return __cpu_detect_caps.hasAltiVec;
-}
-
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.h b/src/gallium/auxiliary/util/u_cpu_detect.h
index 1612d49286..4b3dc39c34 100644
--- a/src/gallium/auxiliary/util/u_cpu_detect.h
+++ b/src/gallium/auxiliary/util/u_cpu_detect.h
@@ -24,55 +24,55 @@
  *
  ***************************************************************************/
 
-/*
- * Based on the work of Eric Anholt <anholt@FreeBSD.org>
+/**
+ * @file
+ * CPU feature detection.
+ *
+ * @author Dennis Smit
+ * @author Based on the work of Eric Anholt <anholt@FreeBSD.org>
  */
 
-#ifndef _CPU_DETECT_H
-#define _CPU_DETECT_H
+#ifndef _UTIL_CPU_DETECT_H
+#define _UTIL_CPU_DETECT_H
+
+#include "pipe/p_compiler.h"
 
-typedef enum {
-	CPU_DETECT_TYPE_MIPS,
-	CPU_DETECT_TYPE_ALPHA,
-	CPU_DETECT_TYPE_SPARC,
-	CPU_DETECT_TYPE_X86,
-	CPU_DETECT_TYPE_POWERPC,
-	CPU_DETECT_TYPE_OTHER
-} cpu_detect_type;
+enum util_cpu_arch {
+   UTIL_CPU_ARCH_UNKNOWN = 0,
+   UTIL_CPU_ARCH_MIPS,
+   UTIL_CPU_ARCH_ALPHA,
+   UTIL_CPU_ARCH_SPARC,
+   UTIL_CPU_ARCH_X86,
+   UTIL_CPU_ARCH_POWERPC
+};
 
-struct cpu_detect_caps {
-	cpu_detect_type	type;
-	int		nrcpu;
+struct util_cpu_caps {
+   enum util_cpu_arch arch;
+   unsigned nr_cpus;
 
-	/* Feature flags */
-	int		x86cpuType;
-	int		cacheline;
+   /* Feature flags */
+   int x86_cpu_type;
+   unsigned cacheline;
 
-	int		hasTSC;
-	int		hasMMX;
-	int		hasMMX2;
-	int		hasSSE;
-	int		hasSSE2;
-	int		hasSSE3;
-	int		hasSSSE3;
-	int		has3DNow;
-	int		has3DNowExt;
-	int		hasAltiVec;
+   unsigned little_endian:1;
+
+   unsigned has_tsc:1;
+   unsigned has_mmx:1;
+   unsigned has_mmx2:1;
+   unsigned has_sse:1;
+   unsigned has_sse2:1;
+   unsigned has_sse3:1;
+   unsigned has_ssse3:1;
+   unsigned has_sse4_1:1;
+   unsigned has_3dnow:1;
+   unsigned has_3dnow_ext:1;
+   unsigned has_altivec:1;
 };
 
-/* prototypes */
-void cpu_detect_initialize(void);
-struct cpu_detect_caps *cpu_detect_get_caps(void);
+extern struct util_cpu_caps
+util_cpu_caps;
+
+void util_cpu_detect(void);
 
-int cpu_detect_get_tsc(void);
-int cpu_detect_get_mmx(void);
-int cpu_detect_get_mmx2(void);
-int cpu_detect_get_sse(void);
-int cpu_detect_get_sse2(void);
-int cpu_detect_get_sse3(void);
-int cpu_detect_get_ssse3(void);
-int cpu_detect_get_3dnow(void);
-int cpu_detect_get_3dnow2(void);
-int cpu_detect_get_altivec(void);
 
-#endif /* _CPU_DETECT_H */
+#endif /* _UTIL_CPU_DETECT_H */
diff --git a/src/gallium/auxiliary/util/u_debug.h b/src/gallium/auxiliary/util/u_debug.h
index 1380d98d7e..abd834c741 100644
--- a/src/gallium/auxiliary/util/u_debug.h
+++ b/src/gallium/auxiliary/util/u_debug.h
@@ -65,6 +65,11 @@ extern "C" {
 #define __FUNCTION__ "???"
 #endif
 
+#if defined(__GNUC__)
+#define _util_printf_format(fmt, list) __attribute__ ((format (printf, fmt, list)))
+#else
+#define _util_printf_format(fmt, list)
+#endif
 
 void _debug_vprintf(const char *format, va_list ap);
    
@@ -82,14 +87,17 @@ _debug_printf(const char *format, ...)
 /**
  * Print debug messages.
  *
- * The actual channel used to output debug message is platform specific. To 
- * avoid misformating or truncation, follow these rules of thumb:   
+ * The actual channel used to output debug messages is platform specific. To
+ * avoid misformatting or truncation, follow these rules of thumb:
  * - output whole lines
- * - avoid outputing large strings (512 bytes is the current maximum length 
+ * - avoid outputting large strings (512 bytes is the current maximum length
  * that is guaranteed to be printed in all platforms)
  */
 #if !defined(PIPE_OS_HAIKU)
 static INLINE void
+debug_printf(const char *format, ...) _util_printf_format(1,2);
+
+static INLINE void
 debug_printf(const char *format, ...)
 {
 #ifdef DEBUG
@@ -173,11 +181,14 @@ void _debug_assert_fail(const char *expr,
  * 
  * Do not expect that the assert call terminates -- errors must be handled 
  * regardless of assert behavior.
+ *
+ * For non debug builds the assert macro will expand to a no-op, so do not
+ * call functions with side effects in the assert expression.
  */
 #ifdef DEBUG
 #define debug_assert(expr) ((expr) ? (void)0 : _debug_assert_fail(#expr, __FILE__, __LINE__, __FUNCTION__))
 #else
-#define debug_assert(expr) ((void)(expr))
+#define debug_assert(expr) ((void)0)
 #endif
 
 
@@ -340,17 +351,6 @@ void
 debug_memory_end(unsigned long beginning);
 
 
-#if defined(PROFILE) && defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
-
-void
-debug_profile_start(void);
-
-void 
-debug_profile_stop(void);
-
-#endif
-
-
 #ifdef DEBUG
 struct pipe_surface;
 struct pipe_transfer;
diff --git a/src/gallium/auxiliary/util/u_debug_dump.c b/src/gallium/auxiliary/util/u_debug_dump.c
index 6bdecde048..09866880ae 100644
--- a/src/gallium/auxiliary/util/u_debug_dump.c
+++ b/src/gallium/auxiliary/util/u_debug_dump.c
@@ -187,3 +187,83 @@ debug_dump_func_short_names[] = {
 };
 
 DEFINE_DEBUG_DUMP_CONTINUOUS(func)
+
+
+static const char *
+debug_dump_tex_target_names[] = {
+   "PIPE_TEXTURE_1D",
+   "PIPE_TEXTURE_2D",
+   "PIPE_TEXTURE_3D",
+   "PIPE_TEXTURE_CUBE"
+};
+
+static const char *
+debug_dump_tex_target_short_names[] = {
+   "1d",
+   "2d",
+   "3d",
+   "cube"
+};
+
+DEFINE_DEBUG_DUMP_CONTINUOUS(tex_target)
+
+
+static const char *
+debug_dump_tex_wrap_names[] = {
+   "PIPE_TEX_WRAP_REPEAT",
+   "PIPE_TEX_WRAP_CLAMP",
+   "PIPE_TEX_WRAP_CLAMP_TO_EDGE",
+   "PIPE_TEX_WRAP_CLAMP_TO_BORDER",
+   "PIPE_TEX_WRAP_MIRROR_REPEAT",
+   "PIPE_TEX_WRAP_MIRROR_CLAMP",
+   "PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE",
+   "PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER"
+};
+
+static const char *
+debug_dump_tex_wrap_short_names[] = {
+   "repeat",
+   "clamp",
+   "clamp_to_edge",
+   "clamp_to_border",
+   "mirror_repeat",
+   "mirror_clamp",
+   "mirror_clamp_to_edge",
+   "mirror_clamp_to_border"
+};
+
+DEFINE_DEBUG_DUMP_CONTINUOUS(tex_wrap)
+
+
+static const char *
+debug_dump_tex_mipfilter_names[] = {
+   "PIPE_TEX_MIPFILTER_NEAREST",
+   "PIPE_TEX_MIPFILTER_LINEAR",
+   "PIPE_TEX_MIPFILTER_NONE"
+};
+
+static const char *
+debug_dump_tex_mipfilter_short_names[] = {
+   "nearest",
+   "linear",
+   "none"
+};
+
+DEFINE_DEBUG_DUMP_CONTINUOUS(tex_mipfilter)
+
+
+static const char *
+debug_dump_tex_filter_names[] = {
+   "PIPE_TEX_FILTER_NEAREST",
+   "PIPE_TEX_FILTER_LINEAR",
+   "PIPE_TEX_FILTER_ANISO"
+};
+
+static const char *
+debug_dump_tex_filter_short_names[] = {
+   "nearest",
+   "linear",
+   "aniso"
+};
+
+DEFINE_DEBUG_DUMP_CONTINUOUS(tex_filter)
diff --git a/src/gallium/auxiliary/util/u_debug_dump.h b/src/gallium/auxiliary/util/u_debug_dump.h
index 102935559c..19b130ad18 100644
--- a/src/gallium/auxiliary/util/u_debug_dump.h
+++ b/src/gallium/auxiliary/util/u_debug_dump.h
@@ -54,6 +54,18 @@ debug_dump_blend_func(unsigned value, boolean shortened);
 const char *
 debug_dump_func(unsigned value, boolean shortened);
 
+const char *
+debug_dump_tex_target(unsigned value, boolean shortened);
+
+const char *
+debug_dump_tex_wrap(unsigned value, boolean shortened);
+
+const char *
+debug_dump_tex_mipfilter(unsigned value, boolean shortened);
+
+const char *
+debug_dump_tex_filter(unsigned value, boolean shortened);
+
 
 /* FIXME: Move the other debug_dump_xxx functions out of u_debug.h into here. */
 
diff --git a/src/gallium/auxiliary/util/u_debug_profile.c b/src/gallium/auxiliary/util/u_debug_profile.c
deleted file mode 100644
index d765b50144..0000000000
--- a/src/gallium/auxiliary/util/u_debug_profile.c
+++ /dev/null
@@ -1,320 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * @file
- * Poor-man profiling.
- * 
- * @author José Fonseca <jrfonseca@tungstengraphics.com>
- * 
- * @sa http://blogs.msdn.com/joshpoley/archive/2008/03/12/poor-man-s-profiler.aspx
- * @sa http://www.johnpanzer.com/aci_cuj/index.html
- */
-
-#include "pipe/p_config.h" 
-
-#if defined(PROFILE) && defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
-
-#include <windows.h>
-#include <winddi.h>
-
-#include "util/u_debug.h" 
-#include "util/u_string.h" 
-
-
-#define PROFILE_TABLE_SIZE (1024*1024)
-#define FILE_NAME_SIZE 256
-
-struct debug_profile_entry
-{
-   uintptr_t caller;
-   uintptr_t callee;
-   uint64_t samples;
-};
-
-static unsigned long enabled = 0;
-
-static WCHAR wFileName[FILE_NAME_SIZE] = L"\\??\\c:\\00000000.prof";
-static ULONG_PTR iFile = 0;
-
-static struct debug_profile_entry *table = NULL;
-static unsigned long free_table_entries = 0;
-static unsigned long max_table_entries = 0;
-
-uint64_t start_stamp = 0;
-uint64_t end_stamp = 0;
-
-
-static void
-debug_profile_entry(uintptr_t caller, uintptr_t callee, uint64_t samples)
-{
-   unsigned hash = ( caller + callee ) & PROFILE_TABLE_SIZE - 1;
-   
-   while(1) {
-      if(table[hash].caller == 0 && table[hash].callee == 0) {
-         table[hash].caller = caller;
-         table[hash].callee = callee;
-         table[hash].samples = samples;
-         --free_table_entries;
-         break;
-      }
-      else if(table[hash].caller == caller && table[hash].callee == callee) {
-         table[hash].samples += samples;
-         break;
-      }
-      else {
-         ++hash;
-      }
-   }
-}
-
-
-static uintptr_t caller_stack[1024];
-static unsigned last_caller = 0;
-
-
-static int64_t delta(void) {
-   int64_t result = end_stamp - start_stamp;
-   if(result > UINT64_C(0xffffffff))
-      result = 0;
-   return result;
-}
-
-
-static void __cdecl 
-debug_profile_enter(uintptr_t callee)
-{
-   uintptr_t caller = last_caller ? caller_stack[last_caller - 1] : 0;
-                
-   if (caller)
-      debug_profile_entry(caller, 0, delta());
-   debug_profile_entry(caller, callee, 1);
-   caller_stack[last_caller++] = callee;
-}
-
-
-static void __cdecl
-debug_profile_exit(uintptr_t callee)
-{
-   debug_profile_entry(callee, 0, delta());
-   if(last_caller)
-      --last_caller;
-}
-   
-   
-/**
- * Called at the start of every method or function.
- * 
- * @sa http://msdn.microsoft.com/en-us/library/c63a9b7h.aspx
- */
-void __declspec(naked) __cdecl 
-_penter(void) {
-   _asm {
-      push eax
-      mov eax, [enabled]
-      test eax, eax
-      jz skip
-
-      push edx
-      
-      rdtsc
-      mov dword ptr [end_stamp], eax
-      mov dword ptr [end_stamp+4], edx
-
-      xor eax, eax
-      mov [enabled], eax
-
-      mov eax, [esp+8]
-
-      push ebx
-      push ecx
-      push ebp
-      push edi
-      push esi
-
-      push eax
-      call debug_profile_enter
-      add esp, 4
-
-      pop esi
-      pop edi
-      pop ebp
-      pop ecx
-      pop ebx
-
-      mov eax, 1
-      mov [enabled], eax 
-
-      rdtsc
-      mov dword ptr [start_stamp], eax
-      mov dword ptr [start_stamp+4], edx
-      
-      pop edx
-skip:
-      pop eax
-      ret
-   }
-}
-
-
-/**
- * Called at the end of Calls the end of every method or function.
- * 
- * @sa http://msdn.microsoft.com/en-us/library/xc11y76y.aspx
- */
-void __declspec(naked) __cdecl 
-_pexit(void) {
-   _asm {
-      push eax
-      mov eax, [enabled]
-      test eax, eax
-      jz skip
-
-      push edx
-      
-      rdtsc
-      mov dword ptr [end_stamp], eax
-      mov dword ptr [end_stamp+4], edx
-
-      xor eax, eax
-      mov [enabled], eax
-
-      mov eax, [esp+8]
-
-      push ebx
-      push ecx
-      push ebp
-      push edi
-      push esi
-
-      push eax
-      call debug_profile_exit
-      add esp, 4
-
-      pop esi
-      pop edi
-      pop ebp
-      pop ecx
-      pop ebx
-
-      mov eax, 1
-      mov [enabled], eax 
-
-      rdtsc
-      mov dword ptr [start_stamp], eax
-      mov dword ptr [start_stamp+4], edx
-      
-      pop edx
-skip:
-      pop eax
-      ret
-   }
-}
-
-
-/**
- * Reference function for calibration. 
- */
-void __declspec(naked) 
-__debug_profile_reference(void) {
-   _asm {
-      call _penter
-      call _pexit
-      ret
-   }
-}
-
-
-void
-debug_profile_start(void)
-{
-   WCHAR *p;
-
-   /* increment starting from the less significant digit */
-   p = &wFileName[14];
-   while(1) {
-      if(*p == '9') {
-         *p-- = '0';
-      }
-      else {
-         *p += 1;
-         break;
-      }
-   }
-
-   table = EngMapFile(wFileName, 
-                      PROFILE_TABLE_SIZE*sizeof(struct debug_profile_entry), 
-                      &iFile);
-   if(table) {
-      unsigned i;
-      
-      free_table_entries = max_table_entries = PROFILE_TABLE_SIZE;
-      memset(table, 0, PROFILE_TABLE_SIZE*sizeof(struct debug_profile_entry));
-      
-      table[0].caller = (uintptr_t)&__debug_profile_reference;
-      table[0].callee = 0;
-      table[0].samples = 0;
-      --free_table_entries;
-
-      _asm {
-         push edx
-         push eax
-      
-         rdtsc
-         mov dword ptr [start_stamp], eax
-         mov dword ptr [start_stamp+4], edx
-         
-         pop edx
-         pop eax
-      }
-
-      last_caller = 0;
-      
-      enabled = 1;
-
-      for(i = 0; i < 8; ++i) {
-         _asm {
-            call __debug_profile_reference
-         }
-      }
-   }
-}
-
-
-void 
-debug_profile_stop(void)
-{
-   enabled = 0;
-
-   if(iFile)
-      EngUnmapFile(iFile);
-   iFile = 0;
-   table = NULL;
-   free_table_entries = max_table_entries = 0;
-}
-
-#endif /* PROFILE */
diff --git a/src/gallium/auxiliary/util/u_fifo.h b/src/gallium/auxiliary/util/u_fifo.h
new file mode 100644
index 0000000000..9e007de1ad
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_fifo.h
@@ -0,0 +1,94 @@
+/**************************************************************************
+ *
+ * Copyright © 2009 Jakob Bornecrantz
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef U_FIFO_H
+#define U_FIFO_H
+
+#include "util/u_memory.h"
+
+struct util_fifo   /* header of a pointer queue; the void* slots are stored right after it */
+{
+   size_t head;   /* slot index that received the most recent u_fifo_add() */
+   size_t tail;   /* slot index that was read by the most recent u_fifo_pop() */
+   size_t num;    /* entry count; u_fifo_add() refuses once it reaches size */
+   size_t size;   /* number of slots, as passed to u_fifo_create() */
+};
+
+static INLINE struct util_fifo *
+u_fifo_create(size_t size)
+{
+   /* Header plus 'size' pointer slots in one allocation; returns NULL if it fails. */
+   struct util_fifo *fifo = MALLOC(sizeof(*fifo) + size * sizeof(void*));
+   if (!fifo)
+      return NULL;
+   fifo->head = 0;
+   fifo->tail = 0;
+   fifo->num = 0;
+   fifo->size = size;
+   return fifo;
+}
+
+static INLINE boolean
+u_fifo_add(struct util_fifo *fifo, void *ptr)
+{
+   void **array = (void**)&fifo[1]; /* slots start right after the header */
+   if (fifo->num >= fifo->size)
+      return FALSE; /* full: ptr is not stored */
+   /* Queue ptr: advance head (wrapping to 0 at size), then store into that slot. */
+   if (++fifo->head >= fifo->size)
+      fifo->head = 0;
+
+   array[fifo->head] = ptr;
+
+   ++fifo->num;
+
+   return TRUE;
+}
+
+static INLINE boolean
+u_fifo_pop(struct util_fifo *fifo, void **ptr)
+{
+   void **array = (void**)&fifo[1]; /* slots start right after the header */
+   /* Dequeue the oldest pointer into *ptr; returns FALSE (*ptr untouched) when empty. */
+   if (!fifo->num)
+      return FALSE;
+
+   if (++fifo->tail >= fifo->size)
+      fifo->tail = 0;
+
+   *ptr = array[fifo->tail];
+   /* One entry left the queue: shrink the count so the slot can be reused by u_fifo_add(). */
+   --fifo->num;
+
+   return TRUE;
+}
+
+static INLINE void
+u_fifo_destroy(struct util_fifo *fifo)
+{
+   FREE(fifo); /* releases the fifo itself; pointers still queued are not freed here */
+}
+
+#endif
diff --git a/src/gallium/auxiliary/util/u_format.csv b/src/gallium/auxiliary/util/u_format.csv
index 00a46d0cc4..f1bf94f17d 100644
--- a/src/gallium/auxiliary/util/u_format.csv
+++ b/src/gallium/auxiliary/util/u_format.csv
@@ -2,7 +2,7 @@ PIPE_FORMAT_A8R8G8B8_UNORM        , arith , 1, 1, un8 , un8 , un8 , un8 , zyxw,
 PIPE_FORMAT_X8R8G8B8_UNORM        , arith , 1, 1, un8 , un8 , un8 , un8 , zyx1, rgb
 PIPE_FORMAT_B8G8R8A8_UNORM        , arith , 1, 1, un8 , un8 , un8 , un8 , yzwx, rgb
 PIPE_FORMAT_B8G8R8X8_UNORM        , arith , 1, 1, un8 , un8 , un8 , un8 , yzw1, rgb
-PIPE_FORMAT_A1R5G5B5_UNORM        , arith , 1, 1, un1 , un5 , un5 , un5 , zyxw, rgb
+PIPE_FORMAT_A1R5G5B5_UNORM        , arith , 1, 1, un5 , un5 , un5 , un1 , zyxw, rgb
 PIPE_FORMAT_A4R4G4B4_UNORM        , arith , 1, 1, un4 , un4 , un4 , un4 , zyxw, rgb
 PIPE_FORMAT_R5G6B5_UNORM          , arith , 1, 1, un5 , un6 , un5 ,     , zyx1, rgb
 PIPE_FORMAT_A2B10G10R10_UNORM     , arith , 1, 1, un10, un10, un10, un2 , xyzw, rgb
@@ -14,10 +14,10 @@ PIPE_FORMAT_L16_UNORM             , arith , 1, 1, un16,     ,     ,     , xxx1,
 PIPE_FORMAT_Z16_UNORM             , array , 1, 1, un16,     ,     ,     , x___, zs 
 PIPE_FORMAT_Z32_UNORM             , array , 1, 1, un32,     ,     ,     , x___, zs 
 PIPE_FORMAT_Z32_FLOAT             , array , 1, 1, f32 ,     ,     ,     , x___, zs 
-PIPE_FORMAT_S8Z24_UNORM           , arith , 1, 1, un8 , un24,     ,     , yx__, zs 
-PIPE_FORMAT_Z24S8_UNORM           , arith , 1, 1, un24, un8 ,     ,     , xy__, zs 
-PIPE_FORMAT_X8Z24_UNORM           , arith , 1, 1, un8 , un24,     ,     , y___, zs 
-PIPE_FORMAT_Z24X8_UNORM           , arith , 1, 1, un24, un8 ,     ,     , x___, zs 
+PIPE_FORMAT_S8Z24_UNORM           , arith , 1, 1, un24, un8 ,     ,     , xy__, zs 
+PIPE_FORMAT_Z24S8_UNORM           , arith , 1, 1, un8 , un24,     ,     , yx__, zs 
+PIPE_FORMAT_X8Z24_UNORM           , arith , 1, 1, un24, un8 ,     ,     , x___, zs 
+PIPE_FORMAT_Z24X8_UNORM           , arith , 1, 1, un8 , un24,     ,     , y___, zs 
 PIPE_FORMAT_S8_UNORM              , array , 1, 1, un8 ,     ,     ,     , _x__, zs 
 PIPE_FORMAT_R64_FLOAT             , array , 1, 1, f64 ,     ,     ,     , x001, rgb
 PIPE_FORMAT_R64G64_FLOAT          , array , 1, 1, f64 , f64 ,     ,     , xy01, rgb
diff --git a/src/gallium/auxiliary/util/u_hash_table.c b/src/gallium/auxiliary/util/u_hash_table.c
index 8c2a8f454c..5604e3ac37 100644
--- a/src/gallium/auxiliary/util/u_hash_table.c
+++ b/src/gallium/auxiliary/util/u_hash_table.c
@@ -47,7 +47,7 @@
 #include "util/u_hash_table.h"
 
 
-struct hash_table
+struct util_hash_table
 {
    struct cso_hash *cso;   
    
@@ -61,27 +61,27 @@ struct hash_table
 };
 
 
-struct hash_table_item
+struct util_hash_table_item
 {
    void *key;
    void *value;
 };
 
 
-static INLINE struct hash_table_item *
-hash_table_item(struct cso_hash_iter iter)
+static INLINE struct util_hash_table_item *
+util_hash_table_item(struct cso_hash_iter iter)
 {
-   return (struct hash_table_item *)cso_hash_iter_data(iter);
+   return (struct util_hash_table_item *)cso_hash_iter_data(iter);
 }
 
 
-struct hash_table *
-hash_table_create(unsigned (*hash)(void *key),
-                  int (*compare)(void *key1, void *key2))
+struct util_hash_table *
+util_hash_table_create(unsigned (*hash)(void *key),
+                       int (*compare)(void *key1, void *key2))
 {
-   struct hash_table *ht;
+   struct util_hash_table *ht;
    
-   ht = MALLOC_STRUCT(hash_table);
+   ht = MALLOC_STRUCT(util_hash_table);
    if(!ht)
       return NULL;
    
@@ -99,16 +99,16 @@ hash_table_create(unsigned (*hash)(void *key),
 
 
 static INLINE struct cso_hash_iter
-hash_table_find_iter(struct hash_table *ht,
-                     void *key, 
-                     unsigned key_hash)
+util_hash_table_find_iter(struct util_hash_table *ht,
+                          void *key,
+                          unsigned key_hash)
 {
    struct cso_hash_iter iter;
-   struct hash_table_item *item;
+   struct util_hash_table_item *item;
    
    iter = cso_hash_find(ht->cso, key_hash);
    while (!cso_hash_iter_is_null(iter)) {
-      item = (struct hash_table_item *)cso_hash_iter_data(iter);
+      item = (struct util_hash_table_item *)cso_hash_iter_data(iter);
       if (!ht->compare(item->key, key))
          break;
       iter = cso_hash_iter_next(iter);
@@ -118,17 +118,17 @@ hash_table_find_iter(struct hash_table *ht,
 }
 
 
-static INLINE struct hash_table_item *
-hash_table_find_item(struct hash_table *ht,
-                     void *key, 
-                     unsigned key_hash)
+static INLINE struct util_hash_table_item *
+util_hash_table_find_item(struct util_hash_table *ht,
+                          void *key,
+                          unsigned key_hash)
 {
    struct cso_hash_iter iter;
-   struct hash_table_item *item;
+   struct util_hash_table_item *item;
    
    iter = cso_hash_find(ht->cso, key_hash);
    while (!cso_hash_iter_is_null(iter)) {
-      item = (struct hash_table_item *)cso_hash_iter_data(iter);
+      item = (struct util_hash_table_item *)cso_hash_iter_data(iter);
       if (!ht->compare(item->key, key))
          return item;
       iter = cso_hash_iter_next(iter);
@@ -139,12 +139,12 @@ hash_table_find_item(struct hash_table *ht,
 
 
 enum pipe_error
-hash_table_set(struct hash_table *ht,
-               void *key,
-               void *value)
+util_hash_table_set(struct util_hash_table *ht,
+                    void *key,
+                    void *value)
 {
    unsigned key_hash;
-   struct hash_table_item *item;
+   struct util_hash_table_item *item;
    struct cso_hash_iter iter;
 
    assert(ht);
@@ -153,14 +153,14 @@ hash_table_set(struct hash_table *ht,
 
    key_hash = ht->hash(key);
 
-   item = hash_table_find_item(ht, key, key_hash);
+   item = util_hash_table_find_item(ht, key, key_hash);
    if(item) {
       /* TODO: key/value destruction? */
       item->value = value;
       return PIPE_OK;
    }
    
-   item = MALLOC_STRUCT(hash_table_item);
+   item = MALLOC_STRUCT(util_hash_table_item);
    if(!item)
       return PIPE_ERROR_OUT_OF_MEMORY;
    
@@ -178,11 +178,11 @@ hash_table_set(struct hash_table *ht,
 
 
 void *
-hash_table_get(struct hash_table *ht, 
-               void *key)
+util_hash_table_get(struct util_hash_table *ht,
+                    void *key)
 {
    unsigned key_hash;
-   struct hash_table_item *item;
+   struct util_hash_table_item *item;
 
    assert(ht);
    if (!ht)
@@ -190,7 +190,7 @@ hash_table_get(struct hash_table *ht,
 
    key_hash = ht->hash(key);
 
-   item = hash_table_find_item(ht, key, key_hash);
+   item = util_hash_table_find_item(ht, key, key_hash);
    if(!item)
       return NULL;
    
@@ -199,12 +199,12 @@ hash_table_get(struct hash_table *ht,
 
 
 void
-hash_table_remove(struct hash_table *ht, 
-                  void *key)
+util_hash_table_remove(struct util_hash_table *ht,
+                       void *key)
 {
    unsigned key_hash;
    struct cso_hash_iter iter;
-   struct hash_table_item *item;
+   struct util_hash_table_item *item;
 
    assert(ht);
    if (!ht)
@@ -212,11 +212,11 @@ hash_table_remove(struct hash_table *ht,
 
    key_hash = ht->hash(key);
 
-   iter = hash_table_find_iter(ht, key, key_hash);
+   iter = util_hash_table_find_iter(ht, key, key_hash);
    if(cso_hash_iter_is_null(iter))
       return;
    
-   item = hash_table_item(iter);
+   item = util_hash_table_item(iter);
    assert(item);
    FREE(item);
    
@@ -225,10 +225,10 @@ hash_table_remove(struct hash_table *ht,
 
 
 void 
-hash_table_clear(struct hash_table *ht)
+util_hash_table_clear(struct util_hash_table *ht)
 {
    struct cso_hash_iter iter;
-   struct hash_table_item *item;
+   struct util_hash_table_item *item;
 
    assert(ht);
    if (!ht)
@@ -236,7 +236,7 @@ hash_table_clear(struct hash_table *ht)
 
    iter = cso_hash_first_node(ht->cso);
    while (!cso_hash_iter_is_null(iter)) {
-      item = (struct hash_table_item *)cso_hash_take(ht->cso, cso_hash_iter_key(iter));
+      item = (struct util_hash_table_item *)cso_hash_take(ht->cso, cso_hash_iter_key(iter));
       FREE(item);
       iter = cso_hash_first_node(ht->cso);
    }
@@ -244,12 +244,13 @@ hash_table_clear(struct hash_table *ht)
 
 
 enum pipe_error
-hash_table_foreach(struct hash_table *ht,
-                   enum pipe_error (*callback)(void *key, void *value, void *data),
-                   void *data)
+util_hash_table_foreach(struct util_hash_table *ht,
+                     enum pipe_error (*callback)
+                        (void *key, void *value, void *data),
+                     void *data)
 {
    struct cso_hash_iter iter;
-   struct hash_table_item *item;
+   struct util_hash_table_item *item;
    enum pipe_error result;
 
    assert(ht);
@@ -258,7 +259,7 @@ hash_table_foreach(struct hash_table *ht,
 
    iter = cso_hash_first_node(ht->cso);
    while (!cso_hash_iter_is_null(iter)) {
-      item = (struct hash_table_item *)cso_hash_iter_data(iter);
+      item = (struct util_hash_table_item *)cso_hash_iter_data(iter);
       result = callback(item->key, item->value, data);
       if(result != PIPE_OK)
 	 return result;
@@ -270,10 +271,10 @@ hash_table_foreach(struct hash_table *ht,
 
 
 void
-hash_table_destroy(struct hash_table *ht)
+util_hash_table_destroy(struct util_hash_table *ht)
 {
    struct cso_hash_iter iter;
-   struct hash_table_item *item;
+   struct util_hash_table_item *item;
 
    assert(ht);
    if (!ht)
@@ -281,7 +282,7 @@ hash_table_destroy(struct hash_table *ht)
 
    iter = cso_hash_first_node(ht->cso);
    while (!cso_hash_iter_is_null(iter)) {
-      item = (struct hash_table_item *)cso_hash_iter_data(iter);
+      item = (struct util_hash_table_item *)cso_hash_iter_data(iter);
       FREE(item);
       iter = cso_hash_iter_next(iter);
    }
diff --git a/src/gallium/auxiliary/util/u_hash_table.h b/src/gallium/auxiliary/util/u_hash_table.h
index feee881582..51ec10a804 100644
--- a/src/gallium/auxiliary/util/u_hash_table.h
+++ b/src/gallium/auxiliary/util/u_hash_table.h
@@ -35,7 +35,7 @@
 #define U_HASH_TABLE_H_
 
 
-#include "pipe/p_error.h"
+#include "pipe/p_defines.h"
 
 
 #ifdef __cplusplus
@@ -46,7 +46,7 @@ extern "C" {
 /**
  * Generic purpose hash table.
  */
-struct hash_table;
+struct util_hash_table;
 
 
 /**
@@ -55,37 +55,38 @@ struct hash_table;
  * @param hash hash function
  * @param compare should return 0 for two equal keys.
  */
-struct hash_table *
-hash_table_create(unsigned (*hash)(void *key),
-                  int (*compare)(void *key1, void *key2));
+struct util_hash_table *
+util_hash_table_create(unsigned (*hash)(void *key),
+                       int (*compare)(void *key1, void *key2));
 
 
 enum pipe_error
-hash_table_set(struct hash_table *ht,
-               void *key,
-               void *value);
+util_hash_table_set(struct util_hash_table *ht,
+                    void *key,
+                    void *value);
 
 void *
-hash_table_get(struct hash_table *ht, 
-               void *key);
+util_hash_table_get(struct util_hash_table *ht,
+                    void *key);
 
 
 void
-hash_table_remove(struct hash_table *ht, 
-                  void *key);
+util_hash_table_remove(struct util_hash_table *ht,
+                       void *key);
 
 
 void
-hash_table_clear(struct hash_table *ht);
+util_hash_table_clear(struct util_hash_table *ht);
 
 
 enum pipe_error
-hash_table_foreach(struct hash_table *ht,
-                   enum pipe_error (*callback)(void *key, void *value, void *data),
-                   void *data);
+util_hash_table_foreach(struct util_hash_table *ht,
+                        enum pipe_error (*callback)
+                        (void *key, void *value, void *data),
+                        void *data);
 
 void
-hash_table_destroy(struct hash_table *ht);
+util_hash_table_destroy(struct util_hash_table *ht);
 
 
 #ifdef __cplusplus
diff --git a/src/gallium/auxiliary/util/u_keymap.c b/src/gallium/auxiliary/util/u_keymap.c
index 508a2ee063..c4b9eb3d9b 100644
--- a/src/gallium/auxiliary/util/u_keymap.c
+++ b/src/gallium/auxiliary/util/u_keymap.c
@@ -28,7 +28,7 @@
 /**
  * Key lookup/associative container.
  *
- * Like Jose's u_hash_table, based on CSO cache code for now.
+ * Like Jose's util_hash_table, based on CSO cache code for now.
  *
  * Author: Brian Paul
  */
@@ -36,7 +36,7 @@
 
 #include "pipe/p_compiler.h"
 #include "util/u_debug.h"
-#include "pipe/p_error.h"
+#include "pipe/p_defines.h"
 
 #include "cso_cache/cso_hash.h"
 
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index 4c6c2bc00e..75b075f160 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -283,6 +283,14 @@ util_fast_pow(float x, float y)
    return util_fast_exp2(util_fast_log2(x) * y);
 }
 
+/* Note that this counts zero as a power of two.
+ */
+static INLINE boolean
+util_is_power_of_two( unsigned v )
+{
+   return (v & (v-1)) == 0; /* v & (v-1) clears the lowest set bit; zero left means at most one bit was set */
+}
+
 
 /**
  * Floor(x), returned as int.
@@ -341,10 +349,22 @@ util_is_inf_or_nan(float x)
 
 
 /**
+ * Test whether x is a power of two (note: x == 0 also passes this test).
+ */
+static INLINE boolean
+util_is_pot(unsigned x)
+{
+   return (x & (x - 1)) == 0;
+}
+
+
+/**
  * Find first bit set in word.  Least significant bit is 1.
  * Return 0 if no bits set.
  */
-#if defined(_MSC_VER) && _MSC_VER >= 1300
+#if defined(_MSC_VER) && _MSC_VER >= 1300 && (_M_IX86 || _M_AMD64 || _M_IA64)
+unsigned char _BitScanForward(unsigned long* Index, unsigned long Mask);
+#pragma intrinsic(_BitScanForward)
 static INLINE
 unsigned long ffs( unsigned long u )
 {
@@ -451,6 +471,26 @@ util_logbase2(unsigned n)
 
 
 /**
+ * Returns the smallest power of two >= x
+ */
+static INLINE unsigned
+util_next_power_of_two(unsigned x)
+{
+   unsigned i;
+   /* Zero is rounded up to 1. */
+   if (x == 0)
+      return 1;
+   /* Decrement first so that an exact power of two maps to itself. */
+   --x;
+   /* OR in copies shifted by 1, 2, 4, ... bits: every bit below the top set bit becomes 1. */
+   for (i = 1; i < sizeof(unsigned) * 8; i <<= 1)
+      x |= x >> i;
+   /* Adding one carries into the next bit; wraps to 0 if the result does not fit in unsigned. */
+   return x + 1;
+}
+
+
+/**
  * Clamp X to [MIN, MAX].
  * This is a macro to allow float, int, uint, etc. types.
  */
diff --git a/src/gallium/auxiliary/util/u_mm.c b/src/gallium/auxiliary/util/u_mm.c
index 4b75d4ba1d..82f83702d1 100644
--- a/src/gallium/auxiliary/util/u_mm.c
+++ b/src/gallium/auxiliary/util/u_mm.c
@@ -39,13 +39,20 @@ u_mmDumpMemInfo(const struct mem_block *heap)
    }
    else {
       const struct mem_block *p;
+      int total_used = 0, total_free = 0;
 
       for (p = heap->next; p != heap; p = p->next) {
 	 debug_printf("  Offset:%08x, Size:%08x, %c%c\n", p->ofs, p->size,
                       p->free ? 'F':'.',
                       p->reserved ? 'R':'.');
+         if (p->free)
+            total_free += p->size;
+         else
+            total_used += p->size;
       }
 
+      debug_printf("\nMemory stats: total = %d, used = %d, free = %d\n",
+                   total_used + total_free, total_used, total_free); /* sizes summed over the block list */
       debug_printf("\nFree list:\n");
 
       for (p = heap->next_free; p != heap; p = p->next_free) {
diff --git a/src/gallium/auxiliary/util/u_network.c b/src/gallium/auxiliary/util/u_network.c
index bc4b758406..6269c72e12 100644
--- a/src/gallium/auxiliary/util/u_network.c
+++ b/src/gallium/auxiliary/util/u_network.c
@@ -6,7 +6,7 @@
 #if defined(PIPE_SUBSYSTEM_WINDOWS_USER)
 #  include <winsock2.h>
 #  include <windows.h>
-#elif defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU)
+#elif defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_BSD)
 #  include <sys/socket.h>
 #  include <netinet/in.h>
 #  include <unistd.h>
@@ -54,7 +54,7 @@ u_socket_close(int s)
    if (s < 0)
       return;
 
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU)
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_BSD)
    shutdown(s, SHUT_RDWR);
    close(s);
 #elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
@@ -169,7 +169,7 @@ u_socket_listen_on_port(uint16_t portnum)
 void
 u_socket_block(int s, boolean block)
 {
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU)
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_BSD)
    int old = fcntl(s, F_GETFL, 0);
    if (old == -1)
       return;
diff --git a/src/gallium/auxiliary/util/u_network.h b/src/gallium/auxiliary/util/u_network.h
index 8c778f492c..0aa898b967 100644
--- a/src/gallium/auxiliary/util/u_network.h
+++ b/src/gallium/auxiliary/util/u_network.h
@@ -6,7 +6,7 @@
 
 #if defined(PIPE_SUBSYSTEM_WINDOWS_USER)
 #  define PIPE_HAVE_SOCKETS
-#elif defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU)
+#elif defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_BSD)
 #  define PIPE_HAVE_SOCKETS
 #endif
 
diff --git a/src/gallium/auxiliary/util/u_simple_screen.c b/src/gallium/auxiliary/util/u_simple_screen.c
index f01296b40f..5238299015 100644
--- a/src/gallium/auxiliary/util/u_simple_screen.c
+++ b/src/gallium/auxiliary/util/u_simple_screen.c
@@ -52,8 +52,7 @@ pass_user_buffer_create(struct pipe_screen *screen,
                         unsigned bytes)
 {
    struct pipe_buffer *buffer =
-      screen->winsys->user_buffer_create(screen->winsys,
-                                             ptr, bytes);
+      screen->winsys->user_buffer_create(screen->winsys, ptr, bytes);
 
    buffer->screen = screen;
 
@@ -69,9 +68,8 @@ pass_surface_buffer_create(struct pipe_screen *screen,
                            unsigned *stride)
 {
    struct pipe_buffer *buffer =
-      screen->winsys->surface_buffer_create(screen->winsys,
-                                                width, height,
-                                                format, usage, tex_usage, stride);
+      screen->winsys->surface_buffer_create(screen->winsys, width, height,
+                                            format, usage, tex_usage, stride);
 
    buffer->screen = screen;
 
@@ -83,8 +81,7 @@ pass_buffer_map(struct pipe_screen *screen,
                 struct pipe_buffer *buf,
                 unsigned usage)
 {
-   return screen->winsys->buffer_map(screen->winsys,
-                                     buf, usage);
+   return screen->winsys->buffer_map(screen->winsys, buf, usage);
 }
 
 static void
@@ -106,8 +103,7 @@ pass_flush_frontbuffer(struct pipe_screen *screen,
                        struct pipe_surface *surf,
                        void *context_private)
 {
-   screen->winsys->flush_frontbuffer(screen->winsys,
-                                     surf, context_private);
+   screen->winsys->flush_frontbuffer(screen->winsys, surf, context_private);
 }
 
 static void
@@ -115,8 +111,7 @@ pass_fence_reference(struct pipe_screen *screen,
                      struct pipe_fence_handle **ptr,
                      struct pipe_fence_handle *fence)
 {
-   screen->winsys->fence_reference(screen->winsys,
-                                   ptr, fence);
+   screen->winsys->fence_reference(screen->winsys, ptr, fence);
 }
 
 static int
@@ -124,8 +119,7 @@ pass_fence_signalled(struct pipe_screen *screen,
                      struct pipe_fence_handle *fence,
                      unsigned flag)
 {
-   return screen->winsys->fence_signalled(screen->winsys,
-                                          fence, flag);
+   return screen->winsys->fence_signalled(screen->winsys, fence, flag);
 }
 
 static int
@@ -133,11 +127,11 @@ pass_fence_finish(struct pipe_screen *screen,
                   struct pipe_fence_handle *fence,
                   unsigned flag)
 {
-   return screen->winsys->fence_finish(screen->winsys,
-                                       fence, flag);
+   return screen->winsys->fence_finish(screen->winsys, fence, flag);
 }
 
-void u_simple_screen_init(struct pipe_screen *screen)
+void
+u_simple_screen_init(struct pipe_screen *screen)
 {
    screen->buffer_create = pass_buffer_create;
    screen->user_buffer_create = pass_user_buffer_create;
@@ -152,7 +146,8 @@ void u_simple_screen_init(struct pipe_screen *screen)
    screen->fence_finish = pass_fence_finish;
 }
 
-const char* u_simple_screen_winsys_name(struct pipe_screen *screen)
+const char *
+u_simple_screen_winsys_name(struct pipe_screen *screen)
 {
    return screen->winsys->get_name(screen->winsys);
 }
diff --git a/src/gallium/auxiliary/util/u_simple_shaders.c b/src/gallium/auxiliary/util/u_simple_shaders.c
index ab754296fa..1c8b157d91 100644
--- a/src/gallium/auxiliary/util/u_simple_shaders.c
+++ b/src/gallium/auxiliary/util/u_simple_shaders.c
@@ -34,14 +34,8 @@
 
 
 #include "pipe/p_context.h"
-#include "util/u_debug.h"
-#include "pipe/p_defines.h"
-#include "pipe/p_screen.h"
 #include "pipe/p_shader_tokens.h"
-
-#include "util/u_memory.h"
 #include "util/u_simple_shaders.h"
-
 #include "tgsi/tgsi_ureg.h"
 
 
@@ -67,9 +61,7 @@ util_make_vertex_passthrough_shader(struct pipe_context *pipe,
       struct ureg_src src;
       struct ureg_dst dst;
 
-      src = ureg_DECL_vs_input( ureg,
-                                semantic_names[i],
-                                semantic_indexes[i]);
+      src = ureg_DECL_vs_input( ureg, i );
       
       dst = ureg_DECL_output( ureg,
                               semantic_names[i],
@@ -116,7 +108,15 @@ util_make_fragment_tex_shader_writemask(struct pipe_context *pipe,
                            TGSI_SEMANTIC_COLOR,
                            0 );
 
-   ureg_TEX( ureg, out, TGSI_TEXTURE_2D, tex, sampler );
+   if (writemask != TGSI_WRITEMASK_XYZW) {
+      struct ureg_src imm = ureg_imm4f( ureg, 0, 0, 0, 1 );
+
+      ureg_MOV( ureg, out, imm );
+   }
+
+   ureg_TEX( ureg, 
+             ureg_writemask(out, writemask),
+             TGSI_TEXTURE_2D, tex, sampler );
    ureg_END( ureg );
 
    return ureg_create_shader_and_destroy( ureg, pipe );
diff --git a/src/gallium/auxiliary/util/u_tile.c b/src/gallium/auxiliary/util/u_tile.c
index 1235a67d26..8a22f584be 100644
--- a/src/gallium/auxiliary/util/u_tile.c
+++ b/src/gallium/auxiliary/util/u_tile.c
@@ -170,7 +170,7 @@ x8r8g8b8_get_tile_rgba(const unsigned *src,
          pRow[0] = ubyte_to_float((pixel >> 16) & 0xff);
          pRow[1] = ubyte_to_float((pixel >>  8) & 0xff);
          pRow[2] = ubyte_to_float((pixel >>  0) & 0xff);
-         pRow[3] = ubyte_to_float(0xff);
+         pRow[3] = 1.0F;
       }
       p += dst_stride;
    }
@@ -394,6 +394,52 @@ r5g6b5_put_tile_rgba(ushort *dst,
 
 
 
+/*** PIPE_FORMAT_R8G8B8_UNORM ***/
+
+static void
+r8g8b8_get_tile_rgba(const ubyte *src,
+                     unsigned w, unsigned h,
+                     float *p,
+                     unsigned dst_stride)
+{
+   unsigned row, col;
+
+   /* Source texels are read back to back, three bytes each; alpha is 1.0. */
+   for (row = 0; row < h; row++, p += dst_stride) {
+      for (col = 0; col < w; col++, src += 3) {
+         float *rgba = p + 4 * col;
+
+         rgba[0] = ubyte_to_float(src[0]);
+         rgba[1] = ubyte_to_float(src[1]);
+         rgba[2] = ubyte_to_float(src[2]);
+         rgba[3] = 1.0f;
+      }
+   }
+}
+
+
+static void
+r8g8b8_put_tile_rgba(ubyte *dst,
+                     unsigned w, unsigned h,
+                     const float *p,
+                     unsigned src_stride)
+{
+   unsigned row, col;
+
+   /* Pack R, G, B back to back as bytes; the alpha float is not stored. */
+   for (row = 0; row < h; row++, p += src_stride) {
+      for (col = 0; col < w; col++, dst += 3) {
+         const float *rgba = p + 4 * col;
+
+         dst[0] = float_to_ubyte(rgba[0]);
+         dst[1] = float_to_ubyte(rgba[1]);
+         dst[2] = float_to_ubyte(rgba[2]);
+      }
+   }
+}
+
+
+
 /*** PIPE_FORMAT_Z16_UNORM ***/
 
 /**
@@ -1106,6 +1152,9 @@ pipe_tile_raw_to_rgba(enum pipe_format format,
    case PIPE_FORMAT_R5G6B5_UNORM:
       r5g6b5_get_tile_rgba((ushort *) src, w, h, dst, dst_stride);
       break;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      r8g8b8_get_tile_rgba((ubyte *) src, w, h, dst, dst_stride);
+      break;
    case PIPE_FORMAT_L8_UNORM:
       l8_get_tile_rgba((ubyte *) src, w, h, dst, dst_stride);
       break;
@@ -1222,6 +1271,9 @@ pipe_put_tile_rgba(struct pipe_transfer *pt,
    case PIPE_FORMAT_R5G6B5_UNORM:
       r5g6b5_put_tile_rgba((ushort *) packed, w, h, p, src_stride);
       break;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      r8g8b8_put_tile_rgba((ubyte *) packed, w, h, p, src_stride);
+      break;
    case PIPE_FORMAT_R8G8B8A8_UNORM:
       assert(0);
       break;
@@ -1400,7 +1452,7 @@ pipe_put_tile_z(struct pipe_transfer *pt,
    case PIPE_FORMAT_S8Z24_UNORM:
       {
          uint *pDest = (uint *) (map + y * pt->stride + x*4);
-         assert(pt->usage == PIPE_TRANSFER_READ_WRITE);
+         assert((pt->usage & PIPE_TRANSFER_READ_WRITE) == PIPE_TRANSFER_READ_WRITE);
          for (i = 0; i < h; i++) {
             for (j = 0; j < w; j++) {
                /* convert 32-bit Z to 24-bit Z, preserve stencil */
@@ -1427,7 +1479,7 @@ pipe_put_tile_z(struct pipe_transfer *pt,
    case PIPE_FORMAT_Z24S8_UNORM:
       {
          uint *pDest = (uint *) (map + y * pt->stride + x*4);
-         assert(pt->usage == PIPE_TRANSFER_READ_WRITE);
+         assert((pt->usage & PIPE_TRANSFER_READ_WRITE) == PIPE_TRANSFER_READ_WRITE);
          for (i = 0; i < h; i++) {
             for (j = 0; j < w; j++) {
                /* convert 32-bit Z to 24-bit Z, preserve stencil */
diff --git a/src/gallium/auxiliary/util/u_upload_mgr.c b/src/gallium/auxiliary/util/u_upload_mgr.c
index eb635c9f14..975ee89c45 100644
--- a/src/gallium/auxiliary/util/u_upload_mgr.c
+++ b/src/gallium/auxiliary/util/u_upload_mgr.c
@@ -29,7 +29,7 @@
  * coalescing small buffers into larger ones.
  */
 
-#include "pipe/p_error.h"
+#include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_screen.h"
 #include "util/u_memory.h"
diff --git a/src/gallium/auxiliary/vl/Makefile b/src/gallium/auxiliary/vl/Makefile
new file mode 100644
index 0000000000..4314c1e8d6
--- /dev/null
+++ b/src/gallium/auxiliary/vl/Makefile
@@ -0,0 +1,13 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = vl
+
+C_SOURCES = \
+	vl_bitstream_parser.c \
+	vl_mpeg12_mc_renderer.c \
+	vl_compositor.c \
+        vl_csc.c \
+	vl_shader_build.c
+
+include ../../Makefile.template
diff --git a/src/gallium/auxiliary/vl/SConscript b/src/gallium/auxiliary/vl/SConscript
new file mode 100644
index 0000000000..aed69f5efe
--- /dev/null
+++ b/src/gallium/auxiliary/vl/SConscript
@@ -0,0 +1,13 @@
+Import('*')
+
+vl = env.ConvenienceLibrary(
+	target = 'vl',
+	source = [
+		'vl_bitstream_parser.c',
+		'vl_mpeg12_mc_renderer.c',
+		'vl_compositor.c',
+                'vl_csc.c',
+		'vl_shader_build.c',
+	])
+
+auxiliaries.insert(0, vl)
diff --git a/src/gallium/auxiliary/vl/vl_bitstream_parser.c b/src/gallium/auxiliary/vl/vl_bitstream_parser.c
new file mode 100644
index 0000000000..3193ea5f41
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_bitstream_parser.c
@@ -0,0 +1,167 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "vl_bitstream_parser.h"
+#include <assert.h>
+#include <limits.h>
+#include <util/u_memory.h>
+
+/* Return how_many_bits bits of bitstream_elt, starting cursor bits above the LSB. */
+static unsigned
+grab_bits(unsigned cursor, unsigned how_many_bits, unsigned bitstream_elt)
+{
+   const unsigned word_bits = sizeof(unsigned) * CHAR_BIT;
+   const unsigned high_pad = word_bits - how_many_bits - cursor;
+   assert(cursor < word_bits);
+   assert(how_many_bits > 0 && how_many_bits <= word_bits);
+   assert(cursor + how_many_bits <= word_bits);
+   return (bitstream_elt << high_pad) >> (high_pad + cursor);
+}
+
+/* Read how_many_bits bits at bit offset cursor; the field may span two words. */
+static unsigned
+show_bits(unsigned cursor, unsigned how_many_bits, const unsigned *bitstream)
+{
+   const unsigned word_bits = sizeof(unsigned) * CHAR_BIT;
+   const unsigned word = cursor / word_bits;
+   const unsigned bit = cursor % word_bits;
+   unsigned low_count, lower, upper;
+
+   assert(bitstream);
+   if (bit + how_many_bits <= word_bits)
+      return grab_bits(bit, how_many_bits, bitstream[word]);
+   /* Straddles a word boundary: low part from this word, rest from the next. */
+   low_count = word_bits - bit;
+   lower = grab_bits(bit, low_count, bitstream[word]);
+   upper = grab_bits(0, how_many_bits - low_count, bitstream[word + 1]);
+   return lower | upper << low_count;
+}
+
+/* Store the caller's bitstream pointers (nothing is copied); always returns true. */
+bool vl_bitstream_parser_init(struct vl_bitstream_parser *parser,
+                              unsigned num_bitstreams,
+                              const void **bitstreams,
+                              const unsigned *sizes)
+{
+   assert(parser);
+   assert(num_bitstreams);
+   assert(bitstreams);
+   assert(sizes);
+   /* Reading starts at bit 0 of the first bitstream. */
+   parser->cur_bitstream = 0;
+   parser->cursor = 0;
+   parser->sizes = sizes;
+   parser->bitstreams = (const unsigned**)bitstreams;
+   parser->num_bitstreams = num_bitstreams;
+   return true;
+}
+
+void vl_bitstream_parser_cleanup(struct vl_bitstream_parser *parser)
+{
+   assert(parser); /* nothing else to do: no resources are released here */
+}
+
+/* Read how_many_bits bits at the current position, then advance past them. */
+unsigned
+vl_bitstream_parser_get_bits(struct vl_bitstream_parser *parser,
+                             unsigned how_many_bits)
+{
+   unsigned peeked;
+
+   assert(parser);
+
+   /* Peek first: forward() moves the cursor that show_bits() reads from. */
+   peeked = vl_bitstream_parser_show_bits(parser, how_many_bits);
+   vl_bitstream_parser_forward(parser, how_many_bits);
+   return peeked;
+}
+
+/* Peek at the next how_many_bits bits without moving the parser position;
+ * bits the current bitstream cannot supply come from the following ones. */
+unsigned
+vl_bitstream_parser_show_bits(struct vl_bitstream_parser *parser,
+                              unsigned how_many_bits)
+{
+   unsigned result = 0;
+   unsigned out_pos = 0;
+   unsigned pos;
+   unsigned stream;
+
+   assert(parser);
+
+   /* Work on copies so the parser state itself is left untouched. */
+   pos = parser->cursor;
+   stream = parser->cur_bitstream;
+
+   for (;;) {
+      /* sizes[] is scaled by CHAR_BIT to compare against the bit cursor. */
+      const unsigned avail = parser->sizes[stream] * CHAR_BIT - pos;
+      const unsigned take = how_many_bits > avail ? avail : how_many_bits;
+
+      result |= show_bits(pos, take, parser->bitstreams[stream]) << out_pos;
+      if (how_many_bits <= take)
+         return result;
+
+      /* The rest comes from the start of the next bitstream. */
+      how_many_bits -= take;
+      out_pos += take;
+      pos = 0;
+      ++stream;
+   }
+}
+
+/* Advance the position by how_many_bits, moving into later bitstreams as needed. */
+void vl_bitstream_parser_forward(struct vl_bitstream_parser *parser,
+                                 unsigned how_many_bits)
+{
+   assert(parser);
+   assert(how_many_bits);
+   parser->cursor += how_many_bits;
+   while (parser->cursor > parser->sizes[parser->cur_bitstream] * CHAR_BIT) {
+      parser->cursor -= parser->sizes[parser->cur_bitstream] * CHAR_BIT;
+      ++parser->cur_bitstream;
+      assert(parser->cur_bitstream < parser->num_bitstreams);
+   }
+}
+
+/* Move the position back by how_many_bits, stepping into earlier bitstreams. */
+void vl_bitstream_parser_rewind(struct vl_bitstream_parser *parser,
+                                unsigned how_many_bits)
+{
+   signed c;
+
+   assert(parser);
+   assert(how_many_bits);
+   c = parser->cursor - how_many_bits;
+   while (c < 0) {
+      /* The cursor is relative to the bitstream we land in, so add ITS size. */
+      assert(parser->cur_bitstream > 0);
+      c += parser->sizes[--parser->cur_bitstream] * CHAR_BIT;
+   }
+
+   parser->cursor = (unsigned)c;
+}
diff --git a/src/gallium/auxiliary/vl/vl_bitstream_parser.h b/src/gallium/auxiliary/vl/vl_bitstream_parser.h
new file mode 100644
index 0000000000..30ec743fa7
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_bitstream_parser.h
@@ -0,0 +1,63 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef vl_bitstream_parser_h
+#define vl_bitstream_parser_h
+
+#include "pipe/p_compiler.h"
+
+struct vl_bitstream_parser
+{
+   unsigned num_bitstreams;     /* number of bitstreams the parser may step through */
+   const unsigned **bitstreams; /* per-bitstream data, read as unsigned words */
+   const unsigned *sizes;       /* per-bitstream size; multiplied by CHAR_BIT to get bits */
+   unsigned cur_bitstream;      /* index of the bitstream currently being read */
+   unsigned cursor;             /* bit offset into the current bitstream */
+};
+
+bool vl_bitstream_parser_init(struct vl_bitstream_parser *parser,
+                              unsigned num_bitstreams,
+                              const void **bitstreams,
+                              const unsigned *sizes);
+
+void vl_bitstream_parser_cleanup(struct vl_bitstream_parser *parser);
+
+unsigned
+vl_bitstream_parser_get_bits(struct vl_bitstream_parser *parser,
+                             unsigned how_many_bits);
+
+unsigned
+vl_bitstream_parser_show_bits(struct vl_bitstream_parser *parser,
+                              unsigned how_many_bits);
+
+void vl_bitstream_parser_forward(struct vl_bitstream_parser *parser,
+                                 unsigned how_many_bits);
+
+void vl_bitstream_parser_rewind(struct vl_bitstream_parser *parser,
+                                unsigned how_many_bits);
+
+#endif /* vl_bitstream_parser_h */
diff --git a/src/gallium/auxiliary/vl/vl_compositor.c b/src/gallium/auxiliary/vl/vl_compositor.c
new file mode 100644
index 0000000000..cda6dc134a
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_compositor.c
@@ -0,0 +1,536 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "vl_compositor.h"
+#include <assert.h>
+#include <pipe/p_context.h>
+#include <pipe/p_inlines.h>
+#include <tgsi/tgsi_parse.h>
+#include <tgsi/tgsi_build.h>
+#include <util/u_memory.h>
+#include "vl_csc.h"
+#include "vl_shader_build.h"
+
+struct vertex2f
+{
+   float x, y; /* two floats; sizeof(struct vertex2f) is the vertex buffer stride */
+};
+
+struct vertex4f
+{
+   float x, y, z, w; /* four floats; used for each vertex shader constant below */
+};
+
+struct vertex_shader_consts
+{
+   struct vertex4f dst_scale; /* c0: scales the unit position rect to the destination size */
+   struct vertex4f dst_trans; /* c1: moves the position rect into place */
+   struct vertex4f src_scale; /* c2: scales the unit texcoord rect to the source size */
+   struct vertex4f src_trans; /* c3: moves the texcoord rect into place */
+};
+
+struct fragment_shader_consts
+{
+   float matrix[16]; /* colour conversion matrix; read as c0-c3 by the fragment shader */
+};
+
+/*
+ * Represents 2 triangles in a strip in normalized coords.
+ * Used to render the surface onto the frame buffer.
+ */
+static const struct vertex2f surface_verts[4] =
+{
+   {0.0f, 0.0f},
+   {0.0f, 1.0f},
+   {1.0f, 0.0f},
+   {1.0f, 1.0f}
+};
+
+/*
+ * Represents texcoords for the above. We can use the position values directly.
+ * TODO: Duplicate these in the shader, no need to create a buffer.
+ */
+static const struct vertex2f *surface_texcoords = surface_verts;
+
+static void
+create_vert_shader(struct vl_compositor *c)
+{
+   const unsigned max_tokens = 50;
+
+   struct pipe_shader_state vs;
+   struct tgsi_token *tokens;
+   struct tgsi_header *header;
+
+   struct tgsi_full_declaration decl;
+   struct tgsi_full_instruction inst;
+
+   unsigned ti; /* index of the next free slot in tokens[] */
+
+   unsigned i;
+   /* Builds the vertex shader (o0 = i0*c0 + c1, o1 = i1*c2 + c3) into c->vertex_shader. */
+   assert(c);
+   /* NOTE(review): the MALLOC() result is written to without a NULL check. */
+   tokens = (struct tgsi_token*)MALLOC(max_tokens * sizeof(struct tgsi_token));
+   *(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+   header = (struct tgsi_header*)&tokens[1];
+   *header = tgsi_build_header();
+   *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+   /* Slots 0-2 now hold the version, header and processor tokens. */
+   ti = 3;
+
+   /*
+    * decl i0             ; Vertex pos
+    * decl i1             ; Vertex texcoords
+    */
+   for (i = 0; i < 2; i++) {
+      decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * decl c0             ; Scaling vector to scale vertex pos rect to destination size
+    * decl c1             ; Translation vector to move vertex pos rect into position
+    * decl c2             ; Scaling vector to scale texcoord rect to source size
+    * decl c3             ; Translation vector to move texcoord rect into position
+    */
+   decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 3);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /*
+    * decl o0             ; Vertex pos
+    * decl o1             ; Vertex texcoords
+    */
+   for (i = 0; i < 2; i++) {
+      decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* decl t0, t1 */
+   decl = vl_decl_temps(0, 1);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /*
+    * mad o0, i0, c0, c1  ; Scale and translate unit output rect to destination size and pos
+    * mad o1, i1, c2, c3  ; Scale and translate unit texcoord rect to source size and pos
+    */
+   for (i = 0; i < 2; ++i) {
+      inst = vl_inst4(TGSI_OPCODE_MAD, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i, TGSI_FILE_CONSTANT, i * 2, TGSI_FILE_CONSTANT, i * 2 + 1);
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* end */
+   inst = vl_end();
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   assert(ti <= max_tokens);
+   /* Create the shader from the tokens, then free the local token buffer. */
+   vs.tokens = tokens;
+   c->vertex_shader = c->pipe->create_vs_state(c->pipe, &vs);
+   FREE(tokens);
+}
+
+static void
+create_frag_shader(struct vl_compositor *c)
+{
+   const unsigned max_tokens = 50;
+
+   struct pipe_shader_state fs;
+   struct tgsi_token *tokens;
+   struct tgsi_header *header;
+
+   struct tgsi_full_declaration decl;
+   struct tgsi_full_instruction inst;
+
+   unsigned ti; /* index of the next free slot in tokens[] */
+
+   unsigned i;
+   /* Builds the fragment shader (sample s0, then one dp4 per c0-c3) into c->fragment_shader. */
+   assert(c);
+   /* NOTE(review): the MALLOC() result is written to without a NULL check. */
+   tokens = (struct tgsi_token*)MALLOC(max_tokens * sizeof(struct tgsi_token));
+   *(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+   header = (struct tgsi_header*)&tokens[1];
+   *header = tgsi_build_header();
+   *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+   /* Slots 0-2 now hold the version, header and processor tokens. */
+   ti = 3;
+
+   /* decl i0             ; Texcoords for s0 */
+   decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, 1, 0, 0, TGSI_INTERPOLATE_LINEAR);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /*
+    * decl c0-c3          ; CSC matrix c0-c3
+    */
+   decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 3);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* decl o0             ; Fragment color */
+   decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* decl t0 */
+   decl = vl_decl_temps(0, 0);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* decl s0             ; Sampler for tex containing picture to display */
+   decl = vl_decl_samplers(0, 0);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* tex2d t0, i0, s0    ; Read src pixel */
+   inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_SAMPLER, 0);
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   /*
+    * dp4 o0.x, t0, c0    ; Multiply pixel by the color conversion matrix
+    * dp4 o0.y, t0, c1
+    * dp4 o0.z, t0, c2
+    * dp4 o0.w, t0, c3
+    */
+   for (i = 0; i < 4; ++i) {
+      inst = vl_inst3(TGSI_OPCODE_DP4, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, i);
+      inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* end */
+   inst = vl_end();
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   assert(ti <= max_tokens);
+   /* Create the shader from the tokens, then free the local token buffer. */
+   fs.tokens = tokens;
+   c->fragment_shader = c->pipe->create_fs_state(c->pipe, &fs);
+   FREE(tokens);
+}
+
+/* Fixed state: one colour buffer, no depth/stencil, and the source sampler. */
+static bool
+init_pipe_state(struct vl_compositor *c)
+{
+   struct pipe_sampler_state sampler;
+
+   assert(c);
+
+   c->fb_state.nr_cbufs = 1;
+   c->fb_state.zsbuf = NULL;
+
+   /*
+    * Zero the whole template first so the fields not set below (LOD range
+    * and bias, border colour, anisotropy, ...) are not stack garbage.
+    */
+   memset(&sampler, 0, sizeof(sampler));
+   sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   sampler.min_img_filter = PIPE_TEX_FILTER_LINEAR;
+   sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
+   sampler.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
+   sampler.compare_mode = PIPE_TEX_COMPARE_NONE;
+   sampler.compare_func = PIPE_FUNC_ALWAYS;
+   sampler.normalized_coords = 1;
+   c->sampler = c->pipe->create_sampler_state(c->pipe, &sampler);
+
+   return true;
+}
+
+/* Delete the sampler state object held in c->sampler. */
+static void cleanup_pipe_state(struct vl_compositor *c)
+{
+   assert(c);
+   c->pipe->delete_sampler_state(c->pipe, c->sampler);
+}
+
+/* Create the vertex and fragment shaders; always returns true. */
+static bool
+init_shaders(struct vl_compositor *c)
+{
+   assert(c);
+
+   create_vert_shader(c);
+   create_frag_shader(c);
+   return true;
+}
+
+/* Delete the vertex and fragment shader objects held by the compositor. */
+static void cleanup_shaders(struct vl_compositor *c)
+{
+   assert(c);
+   c->pipe->delete_vs_state(c->pipe, c->vertex_shader);
+   c->pipe->delete_fs_state(c->pipe, c->fragment_shader);
+}
+
+static bool
+init_buffers(struct vl_compositor *c)
+{
+   struct fragment_shader_consts fsc;
+
+   assert(c);
+   /* NOTE(review): no buffer create/map result is checked; this always returns true. */
+   /*
+    * Create our vertex buffer and vertex buffer element
+    * VB contains 4 vertices that render a quad covering the entire window
+    * to display a rendered surface
+    * Quad is rendered as a tri strip
+    */
+   c->vertex_bufs[0].stride = sizeof(struct vertex2f);
+   c->vertex_bufs[0].max_index = 3;
+   c->vertex_bufs[0].buffer_offset = 0;
+   c->vertex_bufs[0].buffer = pipe_buffer_create
+   (
+      c->pipe->screen,
+      1,
+      PIPE_BUFFER_USAGE_VERTEX,
+      sizeof(struct vertex2f) * 4
+   );
+
+   memcpy
+   (
+      pipe_buffer_map(c->pipe->screen, c->vertex_bufs[0].buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
+      surface_verts,
+      sizeof(struct vertex2f) * 4
+   );
+
+   pipe_buffer_unmap(c->pipe->screen, c->vertex_bufs[0].buffer);
+
+   c->vertex_elems[0].src_offset = 0;
+   c->vertex_elems[0].vertex_buffer_index = 0;
+   c->vertex_elems[0].nr_components = 2;
+   c->vertex_elems[0].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   /*
+    * Create our texcoord buffer and texcoord buffer element
+    * Texcoord buffer contains the TCs for mapping the rendered surface to the 4 vertices
+    */
+   c->vertex_bufs[1].stride = sizeof(struct vertex2f);
+   c->vertex_bufs[1].max_index = 3;
+   c->vertex_bufs[1].buffer_offset = 0;
+   c->vertex_bufs[1].buffer = pipe_buffer_create
+   (
+      c->pipe->screen,
+      1,
+      PIPE_BUFFER_USAGE_VERTEX,
+      sizeof(struct vertex2f) * 4
+   );
+
+   memcpy
+   (
+      pipe_buffer_map(c->pipe->screen, c->vertex_bufs[1].buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
+      surface_texcoords,
+      sizeof(struct vertex2f) * 4
+   );
+
+   pipe_buffer_unmap(c->pipe->screen, c->vertex_bufs[1].buffer);
+
+   c->vertex_elems[1].src_offset = 0;
+   c->vertex_elems[1].vertex_buffer_index = 1;
+   c->vertex_elems[1].nr_components = 2;
+   c->vertex_elems[1].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   /*
+    * Create our vertex shader's constant buffer
+    * Const buffer contains scaling and translation vectors
+    */
+   c->vs_const_buf.buffer = pipe_buffer_create
+   (
+      c->pipe->screen,
+      1,
+      PIPE_BUFFER_USAGE_CONSTANT | PIPE_BUFFER_USAGE_DISCARD,
+      sizeof(struct vertex_shader_consts)
+   );
+
+   /*
+    * Create our fragment shader's constant buffer
+    * Const buffer contains the color conversion matrix and bias vectors
+    */
+   c->fs_const_buf.buffer = pipe_buffer_create
+   (
+      c->pipe->screen,
+      1,
+      PIPE_BUFFER_USAGE_CONSTANT,
+      sizeof(struct fragment_shader_consts)
+   );
+   /* Upload an initial matrix obtained for VL_CSC_COLOR_STANDARD_IDENTITY. */
+   vl_csc_get_matrix(VL_CSC_COLOR_STANDARD_IDENTITY, NULL, true, fsc.matrix);
+
+   vl_compositor_set_csc_matrix(c, fsc.matrix);
+
+   return true;
+}
+
+/* Drop the compositor's references to its vertex and constant buffers. */
+static void
+cleanup_buffers(struct vl_compositor *c)
+{
+   assert(c);
+
+   /* Position and texcoord vertex buffers. */
+   pipe_buffer_reference(&c->vertex_bufs[0].buffer, NULL);
+   pipe_buffer_reference(&c->vertex_bufs[1].buffer, NULL);
+   /* Vertex and fragment shader constants. */
+   pipe_buffer_reference(&c->vs_const_buf.buffer, NULL);
+   pipe_buffer_reference(&c->fs_const_buf.buffer, NULL);
+}
+
+/* Initialise *compositor for rendering with pipe; returns false if a stage fails. */
+bool vl_compositor_init(struct vl_compositor *compositor, struct pipe_context *pipe)
+{
+   assert(compositor);
+
+   memset(compositor, 0, sizeof(*compositor));
+   compositor->pipe = pipe;
+
+   if (!init_pipe_state(compositor))
+      goto fail;
+   if (!init_shaders(compositor))
+      goto fail_pipe_state;
+   if (!init_buffers(compositor))
+      goto fail_shaders;
+   return true;
+fail_shaders:
+   cleanup_shaders(compositor);
+fail_pipe_state:
+   cleanup_pipe_state(compositor);
+fail:
+   return false;
+}
+
+void vl_compositor_cleanup(struct vl_compositor *compositor)
+{
+   assert(compositor);
+   /* Tear down in the reverse order of vl_compositor_init(). */
+   cleanup_buffers(compositor);
+   cleanup_shaders(compositor);
+   cleanup_pipe_state(compositor);
+}
+
+void vl_compositor_render(struct vl_compositor          *compositor,
+                          /*struct pipe_texture         *background,
+                          struct pipe_video_rect        *background_area,*/
+                          struct pipe_texture           *src_surface,
+                          enum pipe_mpeg12_picture_type picture_type,
+                          /*unsigned                    num_past_surfaces,
+                          struct pipe_texture           *past_surfaces,
+                          unsigned                      num_future_surfaces,
+                          struct pipe_texture           *future_surfaces,*/
+                          struct pipe_video_rect        *src_area,
+                          struct pipe_texture           *dst_surface,
+                          struct pipe_video_rect        *dst_area,
+                          /*unsigned                      num_layers,
+                          struct pipe_texture           *layers,
+                          struct pipe_video_rect        *layer_src_areas,
+                          struct pipe_video_rect        *layer_dst_areas*/
+                          struct pipe_fence_handle      **fence)
+{
+   struct vertex_shader_consts *vs_consts;
+   /* Draws src_area of src_surface into dst_area of dst_surface, then flushes. */
+   assert(compositor);
+   assert(src_surface);
+   assert(src_area);
+   assert(dst_surface);
+   assert(dst_area);
+   assert(picture_type == PIPE_MPEG12_PICTURE_TYPE_FRAME);
+   /* NOTE(review): get_tex_surface() and pipe_buffer_map() results are used unchecked. */
+   compositor->fb_state.width = dst_surface->width[0];
+   compositor->fb_state.height = dst_surface->height[0];
+   compositor->fb_state.cbufs[0] = compositor->pipe->screen->get_tex_surface
+   (
+      compositor->pipe->screen,
+      dst_surface,
+      0, 0, 0, PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE
+   );
+
+   compositor->viewport.scale[0] = compositor->fb_state.width;
+   compositor->viewport.scale[1] = compositor->fb_state.height;
+   compositor->viewport.scale[2] = 1;
+   compositor->viewport.scale[3] = 1;
+   compositor->viewport.translate[0] = 0;
+   compositor->viewport.translate[1] = 0;
+   compositor->viewport.translate[2] = 0;
+   compositor->viewport.translate[3] = 0;
+
+   compositor->scissor.maxx = compositor->fb_state.width;
+   compositor->scissor.maxy = compositor->fb_state.height;
+
+   compositor->pipe->set_framebuffer_state(compositor->pipe, &compositor->fb_state);
+   compositor->pipe->set_viewport_state(compositor->pipe, &compositor->viewport);
+   compositor->pipe->set_scissor_state(compositor->pipe, &compositor->scissor);
+   compositor->pipe->bind_sampler_states(compositor->pipe, 1, &compositor->sampler);
+   compositor->pipe->set_sampler_textures(compositor->pipe, 1, &src_surface);
+   compositor->pipe->bind_vs_state(compositor->pipe, compositor->vertex_shader);
+   compositor->pipe->bind_fs_state(compositor->pipe, compositor->fragment_shader);
+   compositor->pipe->set_vertex_buffers(compositor->pipe, 2, compositor->vertex_bufs);
+   compositor->pipe->set_vertex_elements(compositor->pipe, 2, compositor->vertex_elems);
+   compositor->pipe->set_constant_buffer(compositor->pipe, PIPE_SHADER_VERTEX, 0, &compositor->vs_const_buf);
+   compositor->pipe->set_constant_buffer(compositor->pipe, PIPE_SHADER_FRAGMENT, 0, &compositor->fs_const_buf);
+
+   vs_consts = pipe_buffer_map
+   (
+      compositor->pipe->screen,
+      compositor->vs_const_buf.buffer,
+      PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_DISCARD
+   );
+
+   vs_consts->dst_scale.x = dst_area->w / (float)compositor->fb_state.cbufs[0]->width;
+   vs_consts->dst_scale.y = dst_area->h / (float)compositor->fb_state.cbufs[0]->height;
+   vs_consts->dst_scale.z = 1;
+   vs_consts->dst_scale.w = 1;
+   vs_consts->dst_trans.x = dst_area->x / (float)compositor->fb_state.cbufs[0]->width;
+   vs_consts->dst_trans.y = dst_area->y / (float)compositor->fb_state.cbufs[0]->height;
+   vs_consts->dst_trans.z = 0;
+   vs_consts->dst_trans.w = 0;
+
+   vs_consts->src_scale.x = src_area->w / (float)src_surface->width[0];
+   vs_consts->src_scale.y = src_area->h / (float)src_surface->height[0];
+   vs_consts->src_scale.z = 1;
+   vs_consts->src_scale.w = 1;
+   vs_consts->src_trans.x = src_area->x / (float)src_surface->width[0];
+   vs_consts->src_trans.y = src_area->y / (float)src_surface->height[0];
+   vs_consts->src_trans.z = 0;
+   vs_consts->src_trans.w = 0;
+
+   pipe_buffer_unmap(compositor->pipe->screen, compositor->vs_const_buf.buffer);
+
+   compositor->pipe->draw_arrays(compositor->pipe, PIPE_PRIM_TRIANGLE_STRIP, 0, 4);
+   compositor->pipe->flush(compositor->pipe, PIPE_FLUSH_RENDER_CACHE, fence);
+   /* Drop the reference to the destination surface obtained above. */
+   pipe_surface_reference(&compositor->fb_state.cbufs[0], NULL);
+}
+
+/* Copy the matrix at mat into the fragment shader constant buffer. */
+void vl_compositor_set_csc_matrix(struct vl_compositor *compositor, const float *mat)
+{
+   void *consts;
+
+   assert(compositor);
+
+   consts = pipe_buffer_map(compositor->pipe->screen, compositor->fs_const_buf.buffer,
+                            PIPE_BUFFER_USAGE_CPU_WRITE);
+   /* The copy covers the whole struct, i.e. all 16 floats of its matrix. */
+   memcpy(consts, mat, sizeof(struct fragment_shader_consts));
+   pipe_buffer_unmap(compositor->pipe->screen, compositor->fs_const_buf.buffer);
+}
diff --git a/src/gallium/auxiliary/vl/vl_compositor.h b/src/gallium/auxiliary/vl/vl_compositor.h
new file mode 100644
index 0000000000..f441901a75
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_compositor.h
@@ -0,0 +1,77 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef vl_compositor_h
+#define vl_compositor_h
+
+#include <pipe/p_compiler.h>
+#include <pipe/p_state.h>
+#include <pipe/p_video_state.h>
+
+struct pipe_context;
+struct pipe_texture;
+
+struct vl_compositor /* pipe state shared by the vl_compositor_* functions */
+{
+   struct pipe_context *pipe;               /* context used to map buffers (via pipe->screen), draw and flush */
+
+   struct pipe_framebuffer_state fb_state;  /* cbufs[0] is read for width/height while rendering and unreferenced afterwards */
+   void *sampler;
+   void *vertex_shader;
+   void *fragment_shader;
+   struct pipe_viewport_state viewport;
+   struct pipe_scissor_state scissor;
+   struct pipe_vertex_buffer vertex_bufs[2];
+   struct pipe_vertex_element vertex_elems[2];
+   struct pipe_constant_buffer vs_const_buf, fs_const_buf; /* vs: dst/src scale and translate; fs: CSC matrix */
+};
+
+bool vl_compositor_init(struct vl_compositor *compositor, struct pipe_context *pipe);
+
+void vl_compositor_cleanup(struct vl_compositor *compositor);
+/* Maps src_area (relative to src_surface) onto dst_area, draws a 4-vertex triangle strip and flushes with fence. */
+void vl_compositor_render(struct vl_compositor          *compositor,
+                          /*struct pipe_texture         *background,
+                          struct pipe_video_rect        *background_area,*/
+                          struct pipe_texture           *src_surface,
+                          enum pipe_mpeg12_picture_type picture_type,
+                          /*unsigned                    num_past_surfaces,
+                          struct pipe_texture           *past_surfaces,
+                          unsigned                      num_future_surfaces,
+                          struct pipe_texture           *future_surfaces,*/
+                          struct pipe_video_rect        *src_area,
+                          struct pipe_texture           *dst_surface,
+                          struct pipe_video_rect        *dst_area,
+                          /*unsigned                      num_layers,
+                          struct pipe_texture           *layers,
+                          struct pipe_video_rect        *layer_src_areas,
+                          struct pipe_video_rect        *layer_dst_areas,*/
+                          struct pipe_fence_handle      **fence);
+/* Copies mat into the fragment shader constant buffer; compositor must not be NULL. */
+void vl_compositor_set_csc_matrix(struct vl_compositor *compositor, const float *mat);
+
+#endif /* vl_compositor_h */
diff --git a/src/gallium/auxiliary/vl/vl_csc.c b/src/gallium/auxiliary/vl/vl_csc.c
new file mode 100644
index 0000000000..5ecc43a5fa
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_csc.c
@@ -0,0 +1,206 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "vl_csc.h"
+#include <util/u_math.h>
+#include <util/u_debug.h>
+
+/*
+ * Color space conversion formulas
+ *
+ * To convert YCbCr to RGB,
+ *    vec4  ycbcr, rgb
+ *    mat44 csc
+ *    rgb = csc * ycbcr
+ *
+ * To calculate the color space conversion matrix csc with ProcAmp adjustments,
+ *    mat44 csc, cstd, procamp, bias
+ *    csc = cstd * (procamp * bias)
+ *
+ * Where cstd is a matrix corresponding to one of the color standards (BT.601, BT.709, etc)
+ * adjusted for the kind of YCbCr -> RGB mapping wanted (1:1, full),
+ * bias is a matrix corresponding to the kind of YCbCr -> RGB mapping wanted (1:1, full)
+ *
+ * To calculate procamp,
+ *    mat44 procamp, hue, saturation, brightness, contrast
+ *    procamp = brightness * (saturation * (contrast * hue))
+ * Alternatively,
+ *    procamp = saturation * (brightness * (contrast * hue))
+ *
+ * contrast
+ * [ c, 0, 0, 0]
+ * [ 0, c, 0, 0]
+ * [ 0, 0, c, 0]
+ * [ 0, 0, 0, 1]
+ *
+ * brightness
+ * [ 1, 0, 0, b]
+ * [ 0, 1, 0, 0]
+ * [ 0, 0, 1, 0]
+ * [ 0, 0, 0, 1]
+ *
+ * saturation
+ * [ 1, 0, 0, 0]
+ * [ 0, s, 0, 0]
+ * [ 0, 0, s, 0]
+ * [ 0, 0, 0, 1]
+ *
+ * hue
+ * [ 1,       0,      0, 0]
+ * [ 0,  cos(h), sin(h), 0]
+ * [ 0, -sin(h), cos(h), 0]
+ * [ 0,       0,      0, 1]
+ *
+ * procamp
+ * [ c,           0,          0, b]
+ * [ 0,  c*s*cos(h), c*s*sin(h), 0]
+ * [ 0, -c*s*sin(h), c*s*cos(h), 0]
+ * [ 0,           0,          0, 1]
+ *
+ * bias
+ * [ 1, 0, 0,  ybias]
+ * [ 0, 1, 0, cbbias]
+ * [ 0, 0, 1, crbias]
+ * [ 0, 0, 0,      1]
+ *
+ * csc
+ * [ c*cstd[ 0], c*cstd[ 1]*s*cos(h) - c*cstd[ 2]*s*sin(h), c*cstd[ 2]*s*cos(h) + c*cstd[ 1]*s*sin(h), cstd[ 3] + cstd[ 0]*(b + c*ybias) + cstd[ 1]*(c*cbbias*s*cos(h) + c*crbias*s*sin(h)) + cstd[ 2]*(c*crbias*s*cos(h) - c*cbbias*s*sin(h))]
+ * [ c*cstd[ 4], c*cstd[ 5]*s*cos(h) - c*cstd[ 6]*s*sin(h), c*cstd[ 6]*s*cos(h) + c*cstd[ 5]*s*sin(h), cstd[ 7] + cstd[ 4]*(b + c*ybias) + cstd[ 5]*(c*cbbias*s*cos(h) + c*crbias*s*sin(h)) + cstd[ 6]*(c*crbias*s*cos(h) - c*cbbias*s*sin(h))]
+ * [ c*cstd[ 8], c*cstd[ 9]*s*cos(h) - c*cstd[10]*s*sin(h), c*cstd[10]*s*cos(h) + c*cstd[ 9]*s*sin(h), cstd[11] + cstd[ 8]*(b + c*ybias) + cstd[ 9]*(c*cbbias*s*cos(h) + c*crbias*s*sin(h)) + cstd[10]*(c*crbias*s*cos(h) - c*cbbias*s*sin(h))]
+ * [ c*cstd[12], c*cstd[13]*s*cos(h) - c*cstd[14]*s*sin(h), c*cstd[14]*s*cos(h) + c*cstd[13]*s*sin(h), cstd[15] + cstd[12]*(b + c*ybias) + cstd[13]*(c*cbbias*s*cos(h) + c*crbias*s*sin(h)) + cstd[14]*(c*crbias*s*cos(h) - c*cbbias*s*sin(h))]
+ */
+
+/*
+ * Converts ITU-R BT.601 YCbCr pixels to RGB pixels where:
+ * Y is in [16,235], Cb and Cr are in [16,240]
+ * R, G, and B are in [16,235]
+ */
+static const float bt_601[16] =
+{
+   1.0f,  0.0f,    1.371f, 0.0f,   /* R = Y            + 1.371*Cr */
+   1.0f, -0.336f, -0.698f, 0.0f,   /* G = Y - 0.336*Cb - 0.698*Cr */
+   1.0f,  1.732f,  0.0f,   0.0f,   /* B = Y + 1.732*Cb            */
+   0.0f,  0.0f,    0.0f,   1.0f    /* fourth component passes through */
+};
+
+/*
+ * Converts ITU-R BT.601 YCbCr pixels to RGB pixels where:
+ * Y is in [16,235], Cb and Cr are in [16,240]
+ * R, G, and B are in [0,255]
+ */
+static const float bt_601_full[16] =
+{
+   1.164f,  0.0f,    1.596f, 0.0f,   /* R = 1.164*Y            + 1.596*Cr */
+   1.164f, -0.391f, -0.813f, 0.0f,   /* G = 1.164*Y - 0.391*Cb - 0.813*Cr */
+   1.164f,  2.018f,  0.0f,   0.0f,   /* B = 1.164*Y + 2.018*Cb            */
+   0.0f,    0.0f,    0.0f,   1.0f    /* fourth component passes through */
+};
+
+/*
+ * Converts ITU-R BT.709 YCbCr pixels to RGB pixels where:
+ * Y is in [16,235], Cb and Cr are in [16,240]
+ * R, G, and B are in [16,235]
+ */
+static const float bt_709[16] =
+{
+   1.0f,  0.0f,    1.540f, 0.0f,   /* R = Y            + 1.540*Cr */
+   1.0f, -0.183f, -0.459f, 0.0f,   /* G = Y - 0.183*Cb - 0.459*Cr */
+   1.0f,  1.816f,  0.0f,   0.0f,   /* B = Y + 1.816*Cb            */
+   0.0f,  0.0f,    0.0f,   1.0f    /* fourth component passes through */
+};
+
+/*
+ * Converts ITU-R BT.709 YCbCr pixels to RGB pixels where:
+ * Y is in [16,235], Cb and Cr are in [16,240]
+ * R, G, and B are in [0,255]
+ */
+static const float bt_709_full[16] =
+{
+   1.164f,  0.0f,    1.793f, 0.0f,   /* R = 1.164*Y            + 1.793*Cr */
+   1.164f, -0.213f, -0.534f, 0.0f,   /* G = 1.164*Y - 0.213*Cb - 0.534*Cr */
+   1.164f,  2.115f,  0.0f,   0.0f,   /* B = 1.164*Y + 2.115*Cb            */
+   0.0f,    0.0f,    0.0f,   1.0f    /* fourth component passes through */
+};
+
+static const float identity[16] = /* copied out verbatim for VL_CSC_COLOR_STANDARD_IDENTITY */
+{
+   1.0f, 0.0f, 0.0f, 0.0f,
+   0.0f, 1.0f, 0.0f, 0.0f,
+   0.0f, 0.0f, 1.0f, 0.0f,
+   0.0f, 0.0f, 0.0f, 1.0f
+};
+
+/*
+ * Writes the row-major 4x4 YCbCr -> RGB matrix for color standard cs into matrix
+ * (16 floats, must not be NULL). A NULL procamp selects neutral adjustments
+ * (c = s = 1, b = h = 0). The identity standard ignores procamp and full_range.
+ */
+void vl_csc_get_matrix(enum VL_CSC_COLOR_STANDARD cs,
+                       struct vl_procamp *procamp,
+                       bool full_range,
+                       float *matrix)
+{
+   float ybias = full_range ? -16.0f/255.0f : 0.0f;
+   float cbbias = -128.0f/255.0f;
+   float crbias = -128.0f/255.0f;
+   float c = procamp ? procamp->contrast : 1.0f;
+   float s = procamp ? procamp->saturation : 1.0f;
+   float b = procamp ? procamp->brightness : 0.0f;
+   float h = procamp ? procamp->hue : 0.0f;
+   /* Evaluate the hue rotation once, in single precision, for all four rows. */
+   float cos_h = cosf(h);
+   float sin_h = sinf(h);
+   const float *cstd;
+   unsigned row;
+
+   assert(matrix);
+
+   switch (cs) {
+      case VL_CSC_COLOR_STANDARD_BT_601:
+         cstd = full_range ? &bt_601_full[0] : &bt_601[0];
+         break;
+      case VL_CSC_COLOR_STANDARD_BT_709:
+         cstd = full_range ? &bt_709_full[0] : &bt_709[0];
+         break;
+      case VL_CSC_COLOR_STANDARD_IDENTITY:
+      default:
+         assert(cs == VL_CSC_COLOR_STANDARD_IDENTITY);
+         memcpy(matrix, &identity[0], sizeof(float) * 16);
+         return;
+   }
+
+   /* csc = cstd * (procamp * bias); every row of the product has the same shape. */
+   for (row = 0; row < 4; ++row) {
+      const float *in = &cstd[row * 4];
+      float *out = &matrix[row * 4];
+
+      out[0] = c*in[0];
+      out[1] = c*in[1]*s*cos_h - c*in[2]*s*sin_h;
+      out[2] = c*in[2]*s*cos_h + c*in[1]*s*sin_h;
+      out[3] = in[3] + in[0]*(b + c*ybias) + in[1]*(c*cbbias*s*cos_h + c*crbias*s*sin_h) + in[2]*(c*crbias*s*cos_h - c*cbbias*s*sin_h);
+   }
+}
diff --git a/src/gallium/auxiliary/vl/vl_csc.h b/src/gallium/auxiliary/vl/vl_csc.h
new file mode 100644
index 0000000000..722ca35f33
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_csc.h
@@ -0,0 +1,53 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef vl_csc_h
+#define vl_csc_h
+
+#include <pipe/p_compiler.h>
+
+struct vl_procamp
+{
+   float brightness;  /* b: added to luma; 0 is neutral */
+   float contrast;    /* c: scales Y, Cb and Cr; 1 is neutral */
+   float saturation;  /* s: scales Cb and Cr; 1 is neutral */
+   float hue;         /* h: chroma rotation angle in radians (argument to sin/cos); 0 is neutral */
+};
+
+enum VL_CSC_COLOR_STANDARD
+{
+   VL_CSC_COLOR_STANDARD_IDENTITY,  /* vl_csc_get_matrix() returns the identity matrix */
+   VL_CSC_COLOR_STANDARD_BT_601,    /* ITU-R BT.601 coefficients */
+   VL_CSC_COLOR_STANDARD_BT_709     /* ITU-R BT.709 coefficients */
+};
+
+void vl_csc_get_matrix(enum VL_CSC_COLOR_STANDARD cs,   /* which coefficient table to use */
+                       struct vl_procamp *procamp,      /* may be NULL: neutral adjustments are used */
+                       bool full_range,                 /* true: RGB in [0,255]; false: RGB in [16,235] */
+                       float *matrix);                  /* out: 16 floats, row-major 4x4; must not be NULL */
+
+#endif /* vl_csc_h */
diff --git a/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.c b/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.c
new file mode 100644
index 0000000000..c4ba69817f
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.c
@@ -0,0 +1,1660 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "vl_mpeg12_mc_renderer.h"
+#include <assert.h>
+#include <pipe/p_context.h>
+#include <pipe/p_inlines.h>
+#include <util/u_math.h>
+#include <util/u_memory.h>
+#include <tgsi/tgsi_parse.h>
+#include <tgsi/tgsi_build.h>
+#include "vl_shader_build.h"
+
+#define DEFAULT_BUF_ALIGNMENT 1
+#define MACROBLOCK_WIDTH 16   /* NOTE(review): presumably in luma pixels -- confirm */
+#define MACROBLOCK_HEIGHT 16
+#define BLOCK_WIDTH 8
+#define BLOCK_HEIGHT 8
+#define ZERO_BLOCK_NIL -1.0f                   /* sentinel: satisfies ZERO_BLOCK_IS_NIL when stored in .x */
+#define ZERO_BLOCK_IS_NIL(zb) ((zb).x < 0.0f)  /* true for any zb whose .x is negative */
+
+struct vertex2f /* two-component float vector */
+{
+   float x, y;
+};
+
+struct vertex4f /* four-component float vector */
+{
+   float x, y, z, w;
+};
+
+struct vertex_shader_consts /* NOTE(review): presumably the vertex shader constant layout -- confirm at the upload site */
+{
+   struct vertex4f denorm; /* NOTE(review): name suggests a denormalization scale -- confirm where it is written */
+};
+
+struct fragment_shader_consts /* type of fs_consts: two vec4 constants */
+{
+   struct vertex4f multiplier; /* sample rescale factor (see the fs_consts comment) */
+   struct vertex4f div;        /* used for the Y % 2 field selection (see the fs_consts comment) */
+};
+
+/*
+ * Multiplier renormalizes block samples from 16 bits to 12 bits.
+ * NOTE(review): the shader comments on c0 say "9-bit snorm" instead -- confirm.
+ * Divider is used when calculating Y % 2 for choosing top or bottom field for
+ * P or B macroblocks. TODO: Use immediates.
+ */
+static const struct fragment_shader_consts fs_consts = {
+   {32767.0f / 255.0f, 32767.0f / 255.0f, 32767.0f / 255.0f, 0.0f}, /* multiplier */
+   {0.5f, 2.0f, 0.0f, 0.0f}                                         /* div */
+};
+
+struct vert_stream_0 /* NOTE(review): presumably one vertex of stream 0, feeding shader inputs i0..i3 -- confirm */
+{
+   struct vertex2f pos;
+   struct vertex2f luma_tc;
+   struct vertex2f cb_tc;
+   struct vertex2f cr_tc;
+};
+
+enum MACROBLOCK_TYPE
+{
+   MACROBLOCK_TYPE_INTRA,
+   MACROBLOCK_TYPE_FWD_FRAME_PRED,
+   MACROBLOCK_TYPE_FWD_FIELD_PRED,
+   MACROBLOCK_TYPE_BKWD_FRAME_PRED,
+   MACROBLOCK_TYPE_BKWD_FIELD_PRED,
+   MACROBLOCK_TYPE_BI_FRAME_PRED,
+   MACROBLOCK_TYPE_BI_FIELD_PRED,
+
+   NUM_MACROBLOCK_TYPES /* count of the enumerators above; must stay last */
+};
+
+/* Builds the intra vertex shader (copies inputs 0..3 to outputs 0..3) and stores its handle in r->i_vs. */
+static void
+create_intra_vert_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   const unsigned max_tokens = 50;
+   struct pipe_shader_state vs;
+   struct tgsi_token *tokens;
+   struct tgsi_header *header;
+   struct tgsi_full_declaration decl;
+   struct tgsi_full_instruction inst;
+   unsigned ti, i;
+
+   assert(r);
+
+   r->i_vs = NULL;
+   tokens = malloc(max_tokens * sizeof(*tokens));
+   if (!tokens)
+      return; /* out of memory: leave r->i_vs NULL rather than write through a NULL pointer */
+
+   *(struct tgsi_version *) &tokens[0] = tgsi_build_version();
+   header = (struct tgsi_header *) &tokens[1];
+   *header = tgsi_build_header();
+   *(struct tgsi_processor *) &tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+   ti = 3;
+
+   /*
+    * decl i0              ; Vertex pos
+    * decl i1              ; Luma texcoords
+    * decl i2              ; Chroma Cb texcoords
+    * decl i3              ; Chroma Cr texcoords
+    */
+   for (i = 0; i < 4; i++) {
+      decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * decl o0              ; Vertex pos
+    * decl o1              ; Luma texcoords
+    * decl o2              ; Chroma Cb texcoords
+    * decl o3              ; Chroma Cr texcoords
+    */
+   for (i = 0; i < 4; i++) {
+      decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * mov o0, i0           ; Move input vertex pos to output
+    * mov o1, i1           ; Move input luma texcoords to output
+    * mov o2, i2           ; Move input chroma Cb texcoords to output
+    * mov o3, i3           ; Move input chroma Cr texcoords to output
+    */
+   for (i = 0; i < 4; ++i) {
+      inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* end */
+   inst = vl_end();
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   assert(ti <= max_tokens);
+
+   vs.tokens = tokens;
+   r->i_vs = r->pipe->create_vs_state(r->pipe, &vs);
+   free(tokens);
+}
+
+/* Builds the intra fragment shader (samples Y/Cb/Cr, scales by c0) and stores its handle in r->i_fs. */
+static void
+create_intra_frag_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   const unsigned max_tokens = 100;
+   struct pipe_shader_state fs;
+   struct tgsi_token *tokens;
+   struct tgsi_header *header;
+   struct tgsi_full_declaration decl;
+   struct tgsi_full_instruction inst;
+   unsigned ti, i;
+
+   assert(r);
+
+   r->i_fs = NULL;
+   tokens = malloc(max_tokens * sizeof(*tokens));
+   if (!tokens)
+      return; /* out of memory: leave r->i_fs NULL rather than write through a NULL pointer */
+
+   *(struct tgsi_version *) &tokens[0] = tgsi_build_version();
+   header = (struct tgsi_header *) &tokens[1];
+   *header = tgsi_build_header();
+   *(struct tgsi_processor *) &tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+
+   ti = 3;
+
+   /*
+    * decl i0                      ; Luma texcoords
+    * decl i1                      ; Chroma Cb texcoords
+    * decl i2                      ; Chroma Cr texcoords
+    */
+   for (i = 0; i < 3; ++i) {
+      decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* decl c0                      ; Scaling factor, rescales 16-bit snorm to 9-bit snorm */
+   decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* decl o0                      ; Fragment color */
+   decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* decl t0, t1 */
+   decl = vl_decl_temps(0, 1);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /*
+    * decl s0                      ; Sampler for luma texture
+    * decl s1                      ; Sampler for chroma Cb texture
+    * decl s2                      ; Sampler for chroma Cr texture
+    */
+   for (i = 0; i < 3; ++i) {
+      decl = vl_decl_samplers(i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * tex2d t1, i0, s0             ; Read texel from luma texture
+    * mov t0.x, t1.x               ; Move luma sample into .x component
+    * tex2d t1, i1, s1             ; Read texel from chroma Cb texture
+    * mov t0.y, t1.x               ; Move Cb sample into .y component
+    * tex2d t1, i2, s2             ; Read texel from chroma Cr texture
+    * mov t0.z, t1.x               ; Move Cr sample into .z component
+    */
+   for (i = 0; i < 3; ++i) {
+      inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i, TGSI_FILE_SAMPLER, i);
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+      inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+      inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+      inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+      inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+      inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* mul o0, t0, c0               ; Rescale texel to correct range */
+   inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   /* end */
+   inst = vl_end();
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   assert(ti <= max_tokens);
+
+   fs.tokens = tokens;
+   r->i_fs = r->pipe->create_fs_state(r->pipe, &fs);
+   free(tokens);
+}
+
+/* Builds the frame-prediction vertex shader (pass-through plus o4 = i0 + i4) and stores its handle in r->p_vs[0]. */
+static void
+create_frame_pred_vert_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   const unsigned max_tokens = 100;
+   struct pipe_shader_state vs;
+   struct tgsi_token *tokens;
+   struct tgsi_header *header;
+   struct tgsi_full_declaration decl;
+   struct tgsi_full_instruction inst;
+   unsigned ti, i;
+
+   assert(r);
+
+   r->p_vs[0] = NULL;
+   tokens = malloc(max_tokens * sizeof(*tokens));
+   if (!tokens)
+      return; /* out of memory: leave r->p_vs[0] NULL rather than write through a NULL pointer */
+
+   *(struct tgsi_version *) &tokens[0] = tgsi_build_version();
+   header = (struct tgsi_header *) &tokens[1];
+   *header = tgsi_build_header();
+   *(struct tgsi_processor *) &tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+   ti = 3;
+
+   /*
+    * decl i0              ; Vertex pos
+    * decl i1              ; Luma texcoords
+    * decl i2              ; Chroma Cb texcoords
+    * decl i3              ; Chroma Cr texcoords
+    * decl i4              ; Ref surface top field texcoords
+    * decl i5              ; Ref surface bottom field texcoords (unused, packed in the same stream)
+    */
+   for (i = 0; i < 6; i++) {
+      decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * decl o0              ; Vertex pos
+    * decl o1              ; Luma texcoords
+    * decl o2              ; Chroma Cb texcoords
+    * decl o3              ; Chroma Cr texcoords
+    * decl o4              ; Ref macroblock texcoords
+    */
+   for (i = 0; i < 5; i++) {
+      decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * mov o0, i0           ; Move input vertex pos to output
+    * mov o1, i1           ; Move input luma texcoords to output
+    * mov o2, i2           ; Move input chroma Cb texcoords to output
+    * mov o3, i3           ; Move input chroma Cr texcoords to output
+    */
+   for (i = 0; i < 4; ++i) {
+      inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* add o4, i0, i4       ; Translate vertex pos by motion vec to form ref macroblock texcoords */
+   inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 4, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, 4);
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   /* end */
+   inst = vl_end();
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   assert(ti <= max_tokens);
+
+   vs.tokens = tokens;
+   r->p_vs[0] = r->pipe->create_vs_state(r->pipe, &vs);
+   free(tokens);
+}
+
+static void
+create_field_pred_vert_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   assert(false); /* stub: no shader is built and r is not touched */
+}
+
+/* Builds the frame-prediction fragment shader (scaled Y/Cb/Cr plus ref texel) and stores its handle in r->p_fs[0]. */
+static void
+create_frame_pred_frag_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   const unsigned max_tokens = 100;
+   struct pipe_shader_state fs;
+   struct tgsi_token *tokens;
+   struct tgsi_header *header;
+   struct tgsi_full_declaration decl;
+   struct tgsi_full_instruction inst;
+   unsigned ti, i;
+
+   assert(r);
+
+   r->p_fs[0] = NULL;
+   tokens = malloc(max_tokens * sizeof(*tokens));
+   if (!tokens)
+      return; /* out of memory: leave r->p_fs[0] NULL rather than write through a NULL pointer */
+
+   *(struct tgsi_version *) &tokens[0] = tgsi_build_version();
+   header = (struct tgsi_header *) &tokens[1];
+   *header = tgsi_build_header();
+   *(struct tgsi_processor *) &tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+
+   ti = 3;
+
+   /*
+    * decl i0                      ; Luma texcoords
+    * decl i1                      ; Chroma Cb texcoords
+    * decl i2                      ; Chroma Cr texcoords
+    * decl i3                      ; Ref macroblock texcoords
+    */
+   for (i = 0; i < 4; ++i) {
+      decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* decl c0                      ; Scaling factor, rescales 16-bit snorm to 9-bit snorm */
+   decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* decl o0                      ; Fragment color */
+   decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* decl t0, t1 */
+   decl = vl_decl_temps(0, 1);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /*
+    * decl s0                      ; Sampler for luma texture
+    * decl s1                      ; Sampler for chroma Cb texture
+    * decl s2                      ; Sampler for chroma Cr texture
+    * decl s3                      ; Sampler for ref surface texture
+    */
+   for (i = 0; i < 4; ++i) {
+      decl = vl_decl_samplers(i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * tex2d t1, i0, s0             ; Read texel from luma texture
+    * mov t0.x, t1.x               ; Move luma sample into .x component
+    * tex2d t1, i1, s1             ; Read texel from chroma Cb texture
+    * mov t0.y, t1.x               ; Move Cb sample into .y component
+    * tex2d t1, i2, s2             ; Read texel from chroma Cr texture
+    * mov t0.z, t1.x               ; Move Cr sample into .z component
+    */
+   for (i = 0; i < 3; ++i) {
+      inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i, TGSI_FILE_SAMPLER, i);
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+      inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+      inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+      inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+      inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+      inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* mul t0, t0, c0               ; Rescale texel to correct range */
+   inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   /* tex2d t1, i3, s3             ; Read texel from ref macroblock */
+   inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, 3, TGSI_FILE_SAMPLER, 3);
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   /* add o0, t0, t1               ; Add ref and differential to form final output */
+   inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   /* end */
+   inst = vl_end();
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   assert(ti <= max_tokens);
+
+   fs.tokens = tokens;
+   r->p_fs[0] = r->pipe->create_fs_state(r->pipe, &fs);
+   free(tokens);
+}
+
+static void
+create_field_pred_frag_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   assert(false); /* stub: no shader is built and r is not touched */
+}
+
+/* Builds the bi-directional frame-prediction vertex shader and stores its handle in r->b_vs[0]. */
+static void
+create_frame_bi_pred_vert_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   const unsigned max_tokens = 100;
+   struct pipe_shader_state vs;
+   struct tgsi_token *tokens;
+   struct tgsi_header *header;
+   struct tgsi_full_declaration decl;
+   struct tgsi_full_instruction inst;
+   unsigned ti, i;
+
+   assert(r);
+
+   r->b_vs[0] = NULL;
+   tokens = malloc(max_tokens * sizeof(*tokens));
+   if (!tokens)
+      return; /* out of memory: leave r->b_vs[0] NULL rather than write through a NULL pointer */
+
+   *(struct tgsi_version *) &tokens[0] = tgsi_build_version();
+   header = (struct tgsi_header *) &tokens[1];
+   *header = tgsi_build_header();
+   *(struct tgsi_processor *) &tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+   ti = 3;
+
+   /*
+    * decl i0              ; Vertex pos
+    * decl i1              ; Luma texcoords
+    * decl i2              ; Chroma Cb texcoords
+    * decl i3              ; Chroma Cr texcoords
+    * decl i4              ; First ref macroblock top field texcoords
+    * decl i5              ; First ref macroblock bottom field texcoords (unused, packed in the same stream)
+    * decl i6              ; Second ref macroblock top field texcoords
+    * decl i7              ; Second ref macroblock bottom field texcoords (unused, packed in the same stream)
+    */
+   for (i = 0; i < 8; i++) {
+      decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * decl o0              ; Vertex pos
+    * decl o1              ; Luma texcoords
+    * decl o2              ; Chroma Cb texcoords
+    * decl o3              ; Chroma Cr texcoords
+    * decl o4              ; First ref macroblock texcoords
+    * decl o5              ; Second ref macroblock texcoords
+    */
+   for (i = 0; i < 6; i++) {
+      decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * mov o0, i0           ; Move input vertex pos to output
+    * mov o1, i1           ; Move input luma texcoords to output
+    * mov o2, i2           ; Move input chroma Cb texcoords to output
+    * mov o3, i3           ; Move input chroma Cr texcoords to output
+    */
+   for (i = 0; i < 4; ++i) {
+      inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * add o4, i0, i4       ; Translate vertex pos by motion vec to form first ref macroblock texcoords
+    * add o5, i0, i6       ; Translate vertex pos by motion vec to form second ref macroblock texcoords
+    */
+   for (i = 0; i < 2; ++i) {
+      inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 4, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, (i + 2) * 2);
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* end */
+   inst = vl_end();
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   assert(ti <= max_tokens);
+
+   vs.tokens = tokens;
+   r->b_vs[0] = r->pipe->create_vs_state(r->pipe, &vs);
+   free(tokens);
+}
+
+static void
+create_field_bi_pred_vert_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   assert(false); /* Stub for the field bi-predicted vertex shader: builds nothing, only trips this assert (a no-op when NDEBUG is defined) */
+}
+
+static void
+create_frame_bi_pred_frag_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   const unsigned max_tokens = 100; /* capacity, in tokens, of the buffer the shader is assembled in */
+   /* Builds the TGSI fragment shader for frame-based bi-predicted macroblocks; the driver object is stored in r->b_fs[0]. */
+   struct pipe_shader_state fs; /* NOTE(review): only fs.tokens is assigned below; any other member stays uninitialized */
+   struct tgsi_token *tokens;
+   struct tgsi_header *header;
+
+   struct tgsi_full_declaration decl;
+   struct tgsi_full_instruction inst;
+
+   unsigned ti; /* index of the next free slot in tokens[] */
+
+   unsigned i;
+
+   assert(r);
+   /* NOTE(review): the malloc() result below is written through without a NULL check */
+   tokens = (struct tgsi_token *) malloc(max_tokens * sizeof(struct tgsi_token));
+   *(struct tgsi_version *) &tokens[0] = tgsi_build_version();
+   header = (struct tgsi_header *) &tokens[1];
+   *header = tgsi_build_header();
+   *(struct tgsi_processor *) &tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+   /* tokens[0..2] now hold the version, header and processor words; everything else is appended from index 3 */
+   ti = 3;
+
+   /*
+    * decl i0                      ; Luma texcoords
+    * decl i1                      ; Chroma Cb texcoords
+    * decl i2                      ; Chroma Cr texcoords
+    * decl i3                      ; First ref macroblock texcoords
+    * decl i4                      ; Second ref macroblock texcoords
+    */
+   for (i = 0; i < 5; ++i) {
+      decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR); /* 2nd argument is i + 1: presumably the semantic index, pairing input i with vertex shader output i + 1 -- TODO confirm */
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * decl c0                      ; Scaling factor, rescales 16-bit snorm to 9-bit snorm
+    * decl c1                      ; Constant 1/2 in .x channel to use as weight to blend past and future texels
+    */
+   decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* decl o0                      ; Fragment color */
+   decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* decl t0-t2                   ; t0 collects the Y/Cb/Cr differential, t1 and t2 receive fetched texels */
+   decl = vl_decl_temps(0, 2);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /*
+    * decl s0                      ; Sampler for luma texture
+    * decl s1                      ; Sampler for chroma Cb texture
+    * decl s2                      ; Sampler for chroma Cr texture
+    * decl s3                      ; Sampler for first ref surface texture
+    * decl s4                      ; Sampler for second ref surface texture
+    */
+   for (i = 0; i < 5; ++i) {
+      decl = vl_decl_samplers(i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * tex2d t1, i0, s0             ; Read texel from luma texture
+    * mov t0.x, t1.x               ; Move luma sample into .x component
+    * tex2d t1, i1, s1             ; Read texel from chroma Cb texture
+    * mov t0.y, t1.x               ; Move Cb sample into .y component
+    * tex2d t1, i2, s2             ; Read texel from chroma Cr texture
+    * mov t0.z, t1.x               ; Move Cr sample into .z component
+    */
+   for (i = 0; i < 3; ++i) {
+      inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i, TGSI_FILE_SAMPLER, i);
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+      /* Broadcast t1.x through the X/Y/Z source swizzles (W is left as vl_inst2 set it) and write only channel i of t0 */
+      inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+      inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+      inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+      inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+      inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* mul t0, t0, c0               ; Rescale texel to correct range */
+   inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   /*
+    * tex2d t1, i3, s3             ; Read texel from first ref macroblock
+    * tex2d t2, i4, s4             ; Read texel from second ref macroblock
+    */
+   for (i = 0; i < 2; ++i) {
+      inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 3, TGSI_FILE_SAMPLER, i + 3);
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* lerp t1, c1.x, t1, t2        ; Blend past and future texels */
+   inst = vl_inst4(TGSI_OPCODE_LRP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 1, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
+   inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+   inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+   inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+   inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   /* add o0, t0, t1               ; Add past/future ref and differential to form final output */
+   inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   /* end */
+   inst = vl_end();
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   /* Each builder call above was handed the remaining capacity (max_tokens - ti); this is the only check on the final count */
+   assert(ti <= max_tokens);
+   /* tokens is freed right after create_fs_state(); presumably the driver keeps its own copy -- TODO confirm */
+   fs.tokens = tokens;
+   r->b_fs[0] = r->pipe->create_fs_state(r->pipe, &fs);
+   free(tokens);
+}
+
+static void
+create_field_bi_pred_frag_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   assert(false); /* Stub for the field bi-predicted fragment shader: builds nothing, only trips this assert (a no-op when NDEBUG is defined) */
+}
+
+static void
+xfer_buffers_map(struct vl_mpeg12_mc_renderer *r)
+{
+   /* Open a write transfer covering the whole of level 0 of each of the first three textures in */
+   /* r->textures.all[] and map it: transfers are kept in r->tex_transfer[], mapped pointers in r->texels[]. */
+   unsigned plane;
+
+   assert(r);
+
+   for (plane = 0; plane < 3; ++plane) {
+      struct pipe_texture *tex = r->textures.all[plane];
+
+      r->tex_transfer[plane] = r->pipe->screen->get_tex_transfer(r->pipe->screen, tex, 0, 0, 0,
+                                                                 PIPE_TRANSFER_WRITE, 0, 0,
+                                                                 tex->width[0], tex->height[0]);
+      r->texels[plane] = r->pipe->screen->transfer_map(r->pipe->screen, r->tex_transfer[plane]);
+   }
+}
+
+static void
+xfer_buffers_unmap(struct vl_mpeg12_mc_renderer *r)
+{
+   unsigned plane = 0;
+   /* Release the three transfers held in r->tex_transfer[]: each one is unmapped first, then destroyed */
+   assert(r);
+   while (plane < 3) {
+      r->pipe->screen->transfer_unmap(r->pipe->screen, r->tex_transfer[plane]);
+      r->pipe->screen->tex_transfer_destroy(r->tex_transfer[plane]);
+      ++plane;
+   }
+}
+
+static bool
+init_pipe_state(struct vl_mpeg12_mc_renderer *r)
+{
+   /*
+    * Fill in r->viewport, r->scissor and r->fb_state, then create the five sampler states in r->samplers.all[]
+    * (luma, Cb, Cr, two reference surfaces). Always returns true; create_sampler_state() results are not checked.
+    */
+   struct pipe_sampler_state sampler;
+   unsigned filters[5];
+   unsigned width, height;
+   unsigned i;
+
+   assert(r);
+
+   /* Render target extent: the picture size, rounded up to a power of two when r->pot_buffers is set */
+   width = r->pot_buffers ? util_next_power_of_two(r->picture_width) : r->picture_width;
+   height = r->pot_buffers ? util_next_power_of_two(r->picture_height) : r->picture_height;
+
+   r->viewport.scale[0] = width;
+   r->viewport.scale[1] = height;
+   r->viewport.scale[2] = 1;
+   r->viewport.scale[3] = 1;
+   r->viewport.translate[0] = 0;
+   r->viewport.translate[1] = 0;
+   r->viewport.translate[2] = 0;
+   r->viewport.translate[3] = 0;
+
+   r->scissor.maxx = width;
+   r->scissor.maxy = height;
+
+   r->fb_state.width = width;
+   r->fb_state.height = height;
+   r->fb_state.nr_cbufs = 1;
+   r->fb_state.zsbuf = NULL;
+
+   /* Luma filter */
+   filters[0] = PIPE_TEX_FILTER_NEAREST;
+   /* Chroma filters */
+   if (r->chroma_format == PIPE_VIDEO_CHROMA_FORMAT_444 ||
+       r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ONE) {
+      filters[1] = PIPE_TEX_FILTER_NEAREST;
+      filters[2] = PIPE_TEX_FILTER_NEAREST;
+   }
+   else {
+      filters[1] = PIPE_TEX_FILTER_LINEAR;
+      filters[2] = PIPE_TEX_FILTER_LINEAR;
+   }
+   /* Fwd, bkwd ref filters */
+   filters[3] = PIPE_TEX_FILTER_LINEAR;
+   filters[4] = PIPE_TEX_FILTER_LINEAR;
+
+   for (i = 0; i < 5; ++i) {
+      /* Zero the template first: prefilter, shadow_ambient, lod_bias, max_lod, border_color and max_anisotropy
+       * are never assigned here and would otherwise be handed to the driver holding indeterminate stack values. */
+      memset(&sampler, 0, sizeof(sampler));
+      sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+      sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+      sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+      sampler.min_img_filter = filters[i];
+      sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
+      sampler.mag_img_filter = filters[i];
+      sampler.compare_mode = PIPE_TEX_COMPARE_NONE;
+      sampler.compare_func = PIPE_FUNC_ALWAYS;
+      sampler.normalized_coords = 1;
+      sampler.min_lod = 0;
+      r->samplers.all[i] = r->pipe->create_sampler_state(r->pipe, &sampler);
+   }
+
+   return true;
+}
+
+static void
+cleanup_pipe_state(struct vl_mpeg12_mc_renderer *r)
+{
+   unsigned idx = 0;
+
+   assert(r);
+   /* Delete the five sampler states held in r->samplers.all[], in ascending index order */
+   while (idx < 5)
+      r->pipe->delete_sampler_state(r->pipe, r->samplers.all[idx++]);
+}
+
+static bool
+init_shaders(struct vl_mpeg12_mc_renderer *r)
+{
+   assert(r);
+   /* Build the intra, frame-predicted and frame bi-predicted shader pairs; no create_field_* function is called here */
+   create_intra_vert_shader(r);
+   create_intra_frag_shader(r);
+   create_frame_pred_vert_shader(r);
+   create_frame_pred_frag_shader(r);
+   create_frame_bi_pred_vert_shader(r);
+   create_frame_bi_pred_frag_shader(r);
+   /* No result of the create_* calls is examined, so this always reports success */
+   return true;
+}
+
+static void
+cleanup_shaders(struct vl_mpeg12_mc_renderer *r)
+{
+   assert(r);
+   /* Delete the six shader objects i_vs/i_fs, p_vs[0]/p_fs[0] and b_vs[0]/b_fs[0]; index [1] of the p_ and b_ arrays is not touched */
+   r->pipe->delete_vs_state(r->pipe, r->i_vs);
+   r->pipe->delete_fs_state(r->pipe, r->i_fs);
+   r->pipe->delete_vs_state(r->pipe, r->p_vs[0]);
+   r->pipe->delete_fs_state(r->pipe, r->p_fs[0]);
+   r->pipe->delete_vs_state(r->pipe, r->b_vs[0]);
+   r->pipe->delete_fs_state(r->pipe, r->b_fs[0]);
+}
+
+static bool
+init_buffers(struct vl_mpeg12_mc_renderer *r)
+{
+   struct pipe_texture template;
+   /* Allocates the macroblock batch array, the Y/Cb/Cr textures, three vertex buffers and both constant buffers, and fills r->vertex_elems[0..7]. */
+   const unsigned mbw =
+      align(r->picture_width, MACROBLOCK_WIDTH) / MACROBLOCK_WIDTH;
+   const unsigned mbh =
+      align(r->picture_height, MACROBLOCK_HEIGHT) / MACROBLOCK_HEIGHT;
+   /* mbw, mbh: picture size in macroblocks (presumably rounded up by align() -- TODO confirm) */
+   unsigned i;
+   /* NOTE(review): r has already been dereferenced by the mbw/mbh initializers above, before this assert runs */
+   assert(r);
+   /* A batch holds mbw * mbh macroblocks in BUFFER_PICTURE mode and mbw macroblocks in any other buffer mode */
+   r->macroblocks_per_batch =
+      mbw * (r->bufmode == VL_MPEG12_MC_RENDERER_BUFFER_PICTURE ? mbh : 1);
+   r->num_macroblocks = 0;
+   r->macroblock_buf = MALLOC(r->macroblocks_per_batch * sizeof(struct pipe_mpeg12_macroblock));
+   /* NOTE(review): no allocation or creation call in this function is checked for failure; it always returns true */
+   memset(&template, 0, sizeof(struct pipe_texture));
+   template.target = PIPE_TEXTURE_2D;
+   /* TODO: Accommodate HW that can't do this and also for cases when this isn't precise enough */
+   template.format = PIPE_FORMAT_R16_SNORM;
+   template.last_level = 0;
+   template.width[0] = r->pot_buffers ?
+      util_next_power_of_two(r->picture_width) : r->picture_width;
+   template.height[0] = r->pot_buffers ?
+      util_next_power_of_two(r->picture_height) : r->picture_height;
+   template.depth[0] = 1;
+   pf_get_block(template.format, &template.block);
+   template.tex_usage = PIPE_TEXTURE_USAGE_SAMPLER | PIPE_TEXTURE_USAGE_DYNAMIC;
+   /* Luma texture: picture size, or the next power of two in each dimension when r->pot_buffers is set */
+   r->textures.individual.y = r->pipe->screen->texture_create(r->pipe->screen, &template);
+   /* Chroma textures reuse the template: 4:2:0 halves both dimensions, any format not tested below keeps the luma size */
+   if (r->chroma_format == PIPE_VIDEO_CHROMA_FORMAT_420) {
+      template.width[0] = r->pot_buffers ?
+         util_next_power_of_two(r->picture_width / 2) :
+         r->picture_width / 2;
+      template.height[0] = r->pot_buffers ?
+         util_next_power_of_two(r->picture_height / 2) :
+         r->picture_height / 2;
+   }
+   else if (r->chroma_format == PIPE_VIDEO_CHROMA_FORMAT_422) /* NOTE(review): only the height is halved here -- confirm this is intended rather than the width */
+      template.height[0] = r->pot_buffers ?
+         util_next_power_of_two(r->picture_height / 2) :
+         r->picture_height / 2;
+   /* Cb and Cr are created from the same template */
+   r->textures.individual.cb =
+      r->pipe->screen->texture_create(r->pipe->screen, &template);
+   r->textures.individual.cr =
+      r->pipe->screen->texture_create(r->pipe->screen, &template);
+   /* Vertex stream 0: four vertex2f per vertex (elements 0-3 below), 24 vertices per macroblock */
+   r->vertex_bufs.individual.ycbcr.stride = sizeof(struct vertex2f) * 4;
+   r->vertex_bufs.individual.ycbcr.max_index = 24 * r->macroblocks_per_batch - 1;
+   r->vertex_bufs.individual.ycbcr.buffer_offset = 0;
+   r->vertex_bufs.individual.ycbcr.buffer = pipe_buffer_create
+   (
+      r->pipe->screen,
+      DEFAULT_BUF_ALIGNMENT,
+      PIPE_BUFFER_USAGE_VERTEX | PIPE_BUFFER_USAGE_DISCARD,
+      sizeof(struct vertex2f) * 4 * 24 * r->macroblocks_per_batch
+   );
+   /* Vertex streams 1 and 2: two vertex2f per vertex (elements 4-5 and 6-7 below), one stream per reference surface */
+   for (i = 1; i < 3; ++i) {
+      r->vertex_bufs.all[i].stride = sizeof(struct vertex2f) * 2;
+      r->vertex_bufs.all[i].max_index = 24 * r->macroblocks_per_batch - 1;
+      r->vertex_bufs.all[i].buffer_offset = 0;
+      r->vertex_bufs.all[i].buffer = pipe_buffer_create
+      (
+         r->pipe->screen,
+         DEFAULT_BUF_ALIGNMENT,
+         PIPE_BUFFER_USAGE_VERTEX | PIPE_BUFFER_USAGE_DISCARD,
+         sizeof(struct vertex2f) * 2 * 24 * r->macroblocks_per_batch
+      );
+   }
+
+   /* Position element */
+   r->vertex_elems[0].src_offset = 0;
+   r->vertex_elems[0].vertex_buffer_index = 0;
+   r->vertex_elems[0].nr_components = 2;
+   r->vertex_elems[0].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   /* Luma, texcoord element */
+   r->vertex_elems[1].src_offset = sizeof(struct vertex2f);
+   r->vertex_elems[1].vertex_buffer_index = 0;
+   r->vertex_elems[1].nr_components = 2;
+   r->vertex_elems[1].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   /* Chroma Cr texcoord element -- NOTE(review): the vertex shader comments in this file call input 2 Cb and input 3 Cr; confirm against struct vert_stream_0 */
+   r->vertex_elems[2].src_offset = sizeof(struct vertex2f) * 2;
+   r->vertex_elems[2].vertex_buffer_index = 0;
+   r->vertex_elems[2].nr_components = 2;
+   r->vertex_elems[2].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   /* Chroma Cb texcoord element -- NOTE(review): possibly Cr, see the note on element 2 */
+   r->vertex_elems[3].src_offset = sizeof(struct vertex2f) * 3;
+   r->vertex_elems[3].vertex_buffer_index = 0;
+   r->vertex_elems[3].nr_components = 2;
+   r->vertex_elems[3].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   /* First ref surface top field texcoord element */
+   r->vertex_elems[4].src_offset = 0;
+   r->vertex_elems[4].vertex_buffer_index = 1;
+   r->vertex_elems[4].nr_components = 2;
+   r->vertex_elems[4].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   /* First ref surface bottom field texcoord element */
+   r->vertex_elems[5].src_offset = sizeof(struct vertex2f);
+   r->vertex_elems[5].vertex_buffer_index = 1;
+   r->vertex_elems[5].nr_components = 2;
+   r->vertex_elems[5].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   /* Second ref surface top field texcoord element */
+   r->vertex_elems[6].src_offset = 0;
+   r->vertex_elems[6].vertex_buffer_index = 2;
+   r->vertex_elems[6].nr_components = 2;
+   r->vertex_elems[6].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   /* Second ref surface bottom field texcoord element */
+   r->vertex_elems[7].src_offset = sizeof(struct vertex2f);
+   r->vertex_elems[7].vertex_buffer_index = 2;
+   r->vertex_elems[7].nr_components = 2;
+   r->vertex_elems[7].src_format = PIPE_FORMAT_R32G32_FLOAT;
+   /* Vertex shader constants: created empty here; flush() maps and rewrites this buffer */
+   r->vs_const_buf.buffer = pipe_buffer_create
+   (
+      r->pipe->screen,
+      DEFAULT_BUF_ALIGNMENT,
+      PIPE_BUFFER_USAGE_CONSTANT | PIPE_BUFFER_USAGE_DISCARD,
+      sizeof(struct vertex_shader_consts)
+   );
+   /* Fragment shader constants: created and then filled once, just below, from fs_consts */
+   r->fs_const_buf.buffer = pipe_buffer_create
+   (
+      r->pipe->screen,
+      DEFAULT_BUF_ALIGNMENT,
+      PIPE_BUFFER_USAGE_CONSTANT, sizeof(struct fragment_shader_consts)
+   );
+   /* NOTE(review): the pointer returned by pipe_buffer_map() is handed to memcpy() without a NULL check */
+   memcpy
+   (
+      pipe_buffer_map(r->pipe->screen, r->fs_const_buf.buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
+      &fs_consts, sizeof(struct fragment_shader_consts)
+   );
+
+   pipe_buffer_unmap(r->pipe->screen, r->fs_const_buf.buffer);
+
+   return true;
+}
+
+static void
+cleanup_buffers(struct vl_mpeg12_mc_renderer *r)
+{
+   /*
+    * Drop the renderer's references to both constant buffers, the three vertex buffers and
+    * the first three textures in r->textures.all[], then free the macroblock batch array.
+    */
+   unsigned n;
+
+   assert(r);
+   pipe_buffer_reference(&r->vs_const_buf.buffer, NULL);
+   pipe_buffer_reference(&r->fs_const_buf.buffer, NULL);
+   for (n = 0; n != 3; n++)
+      pipe_buffer_reference(&r->vertex_bufs.all[n].buffer, NULL);
+   for (n = 0; n != 3; n++)
+      pipe_texture_reference(&r->textures.all[n], NULL);
+   FREE(r->macroblock_buf);
+}
+
+static enum MACROBLOCK_TYPE
+get_macroblock_type(struct pipe_mpeg12_macroblock *mb)
+{
+   /* Translate mb's (mb_type, mo_type) pair into the renderer's MACROBLOCK_TYPE. Any motion type other than */
+   /* FRAME selects the *_FIELD_PRED variant; an unrecognised mb_type asserts and yields -1. */
+   enum MACROBLOCK_TYPE type = -1;
+   bool frame;
+
+   assert(mb);
+   frame = mb->mo_type == PIPE_MPEG12_MOTION_TYPE_FRAME;
+
+   if (mb->mb_type == PIPE_MPEG12_MACROBLOCK_TYPE_INTRA)
+      type = MACROBLOCK_TYPE_INTRA;
+   else if (mb->mb_type == PIPE_MPEG12_MACROBLOCK_TYPE_FWD)
+      type = frame ? MACROBLOCK_TYPE_FWD_FRAME_PRED : MACROBLOCK_TYPE_FWD_FIELD_PRED;
+   else if (mb->mb_type == PIPE_MPEG12_MACROBLOCK_TYPE_BKWD)
+      type = frame ? MACROBLOCK_TYPE_BKWD_FRAME_PRED : MACROBLOCK_TYPE_BKWD_FIELD_PRED;
+   else if (mb->mb_type == PIPE_MPEG12_MACROBLOCK_TYPE_BI)
+      type = frame ? MACROBLOCK_TYPE_BI_FRAME_PRED : MACROBLOCK_TYPE_BI_FIELD_PRED;
+   else
+      assert(0);
+
+   return type;
+}
+
+/* XXX: One of these days this will have to be killed with fire. Fills vb[0..5], the two triangles (0,1,2) and (3,4,5) of one block: pos always, then luma/Cb/Cr texcoords taken either from the block position or, when use_zb is set and the cbp bit selected by lm/cbm/crm is clear, from zb[0]/zb[1]/zb[2]. Arguments are expanded many times, so pass only side-effect-free expressions. */
+#define SET_BLOCK(vb, cbp, mbx, mby, unitx, unity, ofsx, ofsy, hx, hy, lm, cbm, crm, use_zb, zb)				\
+	do {															\
+	(vb)[0].pos.x = (mbx) * (unitx) + (ofsx);		(vb)[0].pos.y = (mby) * (unity) + (ofsy);			\
+	(vb)[1].pos.x = (mbx) * (unitx) + (ofsx);		(vb)[1].pos.y = (mby) * (unity) + (ofsy) + (hy);		\
+	(vb)[2].pos.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[2].pos.y = (mby) * (unity) + (ofsy);			\
+	(vb)[3].pos.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[3].pos.y = (mby) * (unity) + (ofsy);			\
+	(vb)[4].pos.x = (mbx) * (unitx) + (ofsx);		(vb)[4].pos.y = (mby) * (unity) + (ofsy) + (hy);		\
+	(vb)[5].pos.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[5].pos.y = (mby) * (unity) + (ofsy) + (hy);		\
+	/* luma texcoords: block position, or zb[0] when substituted */								\
+	if (!use_zb || (cbp) & (lm))												\
+	{															\
+		(vb)[0].luma_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[0].luma_tc.y = (mby) * (unity) + (ofsy);		\
+		(vb)[1].luma_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[1].luma_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
+		(vb)[2].luma_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[2].luma_tc.y = (mby) * (unity) + (ofsy);		\
+		(vb)[3].luma_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[3].luma_tc.y = (mby) * (unity) + (ofsy);		\
+		(vb)[4].luma_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[4].luma_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
+		(vb)[5].luma_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[5].luma_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
+	}															\
+	else															\
+	{															\
+		(vb)[0].luma_tc.x = (zb)[0].x;		(vb)[0].luma_tc.y = (zb)[0].y;						\
+		(vb)[1].luma_tc.x = (zb)[0].x;		(vb)[1].luma_tc.y = (zb)[0].y + (hy);					\
+		(vb)[2].luma_tc.x = (zb)[0].x + (hx);	(vb)[2].luma_tc.y = (zb)[0].y;						\
+		(vb)[3].luma_tc.x = (zb)[0].x + (hx);	(vb)[3].luma_tc.y = (zb)[0].y;						\
+		(vb)[4].luma_tc.x = (zb)[0].x;		(vb)[4].luma_tc.y = (zb)[0].y + (hy);					\
+		(vb)[5].luma_tc.x = (zb)[0].x + (hx);	(vb)[5].luma_tc.y = (zb)[0].y + (hy);					\
+	}															\
+	/* Cb texcoords: block position, or zb[1] when substituted */								\
+	if (!use_zb || (cbp) & (cbm))												\
+	{															\
+		(vb)[0].cb_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[0].cb_tc.y = (mby) * (unity) + (ofsy);		\
+		(vb)[1].cb_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[1].cb_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
+		(vb)[2].cb_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[2].cb_tc.y = (mby) * (unity) + (ofsy);		\
+		(vb)[3].cb_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[3].cb_tc.y = (mby) * (unity) + (ofsy);		\
+		(vb)[4].cb_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[4].cb_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
+		(vb)[5].cb_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[5].cb_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
+	}															\
+	else															\
+	{															\
+		(vb)[0].cb_tc.x = (zb)[1].x;		(vb)[0].cb_tc.y = (zb)[1].y;						\
+		(vb)[1].cb_tc.x = (zb)[1].x;		(vb)[1].cb_tc.y = (zb)[1].y + (hy);					\
+		(vb)[2].cb_tc.x = (zb)[1].x + (hx);	(vb)[2].cb_tc.y = (zb)[1].y;						\
+		(vb)[3].cb_tc.x = (zb)[1].x + (hx);	(vb)[3].cb_tc.y = (zb)[1].y;						\
+		(vb)[4].cb_tc.x = (zb)[1].x;		(vb)[4].cb_tc.y = (zb)[1].y + (hy);					\
+		(vb)[5].cb_tc.x = (zb)[1].x + (hx);	(vb)[5].cb_tc.y = (zb)[1].y + (hy);					\
+	}															\
+	/* Cr texcoords: block position, or zb[2] when substituted */								\
+	if (!use_zb || (cbp) & (crm))												\
+	{															\
+		(vb)[0].cr_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[0].cr_tc.y = (mby) * (unity) + (ofsy);		\
+		(vb)[1].cr_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[1].cr_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
+		(vb)[2].cr_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[2].cr_tc.y = (mby) * (unity) + (ofsy);		\
+		(vb)[3].cr_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[3].cr_tc.y = (mby) * (unity) + (ofsy);		\
+		(vb)[4].cr_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[4].cr_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
+		(vb)[5].cr_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[5].cr_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
+	}															\
+	else															\
+	{															\
+		(vb)[0].cr_tc.x = (zb)[2].x;		(vb)[0].cr_tc.y = (zb)[2].y;						\
+		(vb)[1].cr_tc.x = (zb)[2].x;		(vb)[1].cr_tc.y = (zb)[2].y + (hy);					\
+		(vb)[2].cr_tc.x = (zb)[2].x + (hx);	(vb)[2].cr_tc.y = (zb)[2].y;						\
+		(vb)[3].cr_tc.x = (zb)[2].x + (hx);	(vb)[3].cr_tc.y = (zb)[2].y;						\
+		(vb)[4].cr_tc.x = (zb)[2].x;		(vb)[4].cr_tc.y = (zb)[2].y + (hy);					\
+		(vb)[5].cr_tc.x = (zb)[2].x + (hx);	(vb)[5].cr_tc.y = (zb)[2].y + (hy);					\
+	}															\
+	} while (0)
+
+static void
+gen_macroblock_verts(struct vl_mpeg12_mc_renderer *r,
+                     struct pipe_mpeg12_macroblock *mb, unsigned pos,
+                     struct vert_stream_0 *ycbcr_vb, struct vertex2f **ref_vb)
+{
+   /* Write the vertex data of macroblock mb into slot pos of the mapped buffers: 24 vertices in ycbcr_vb and,
+    * for predicted blocks, 24 (even, odd) motion-vector pairs in ref_vb[0] (plus ref_vb[1] when bi-predicted). */
+   struct vertex2f mo_vec[2]; /* [0] is stored in the even entry of each pair, [1] in the odd entry */
+   unsigned i;
+   assert(r);
+   assert(mb);
+   assert(ycbcr_vb);
+   assert(pos < r->macroblocks_per_batch);
+
+   switch (mb->mb_type) {
+      case PIPE_MPEG12_MACROBLOCK_TYPE_BI:
+      {
+         struct vertex2f *vb;
+
+         assert(ref_vb && ref_vb[1]);
+
+         vb = ref_vb[1] + pos * 2 * 24;
+         /* Second reference stream: vectors are read from pmv[..][1] and scaled by 0.5 and the inverse surface size */
+         mo_vec[0].x = mb->pmv[0][1][0] * 0.5f * r->surface_tex_inv_size.x;
+         mo_vec[0].y = mb->pmv[0][1][1] * 0.5f * r->surface_tex_inv_size.y;
+
+         if (mb->mo_type == PIPE_MPEG12_MOTION_TYPE_FRAME) {
+            for (i = 0; i < 24 * 2; i += 2) {
+               vb[i].x = mo_vec[0].x;
+               vb[i].y = mo_vec[0].y;
+            }
+         }
+         else {
+            mo_vec[1].x = mb->pmv[1][1][0] * 0.5f * r->surface_tex_inv_size.x;
+            mo_vec[1].y = mb->pmv[1][1][1] * 0.5f * r->surface_tex_inv_size.y;
+
+            for (i = 0; i < 24 * 2; i += 2) {
+               vb[i].x = mo_vec[0].x;
+               vb[i].y = mo_vec[0].y;
+               vb[i + 1].x = mo_vec[1].x;
+               vb[i + 1].y = mo_vec[1].y;
+            }
+         }
+
+         /* fall-through: a bi-predicted block also needs its first-reference vectors */
+      }
+      case PIPE_MPEG12_MACROBLOCK_TYPE_FWD:
+      case PIPE_MPEG12_MACROBLOCK_TYPE_BKWD:
+      {
+         struct vertex2f *vb;
+
+         assert(ref_vb && ref_vb[0]);
+
+         vb = ref_vb[0] + pos * 2 * 24;
+         /* First reference stream: backward blocks read pmv[..][1], forward and bi-predicted blocks read pmv[..][0] */
+         if (mb->mb_type == PIPE_MPEG12_MACROBLOCK_TYPE_BKWD) {
+            mo_vec[0].x = mb->pmv[0][1][0] * 0.5f * r->surface_tex_inv_size.x;
+            mo_vec[0].y = mb->pmv[0][1][1] * 0.5f * r->surface_tex_inv_size.y;
+
+            if (mb->mo_type == PIPE_MPEG12_MOTION_TYPE_FIELD) {
+               mo_vec[1].x = mb->pmv[1][1][0] * 0.5f * r->surface_tex_inv_size.x;
+               mo_vec[1].y = mb->pmv[1][1][1] * 0.5f * r->surface_tex_inv_size.y;
+            }
+         }
+         else {
+            mo_vec[0].x = mb->pmv[0][0][0] * 0.5f * r->surface_tex_inv_size.x;
+            mo_vec[0].y = mb->pmv[0][0][1] * 0.5f * r->surface_tex_inv_size.y;
+
+            if (mb->mo_type == PIPE_MPEG12_MOTION_TYPE_FIELD) {
+               mo_vec[1].x = mb->pmv[1][0][0] * 0.5f * r->surface_tex_inv_size.x;
+               mo_vec[1].y = mb->pmv[1][0][1] * 0.5f * r->surface_tex_inv_size.y;
+            }
+         }
+         /* Frame vs. field is decided by the motion type (mo_type), not by the macroblock type, as in the BI case */
+         if (mb->mo_type == PIPE_MPEG12_MOTION_TYPE_FRAME) {
+            for (i = 0; i < 24 * 2; i += 2) {
+               vb[i].x = mo_vec[0].x;
+               vb[i].y = mo_vec[0].y;
+            }
+         }
+         else {
+            for (i = 0; i < 24 * 2; i += 2) {
+               vb[i].x = mo_vec[0].x;
+               vb[i].y = mo_vec[0].y;
+               vb[i + 1].x = mo_vec[1].x;
+               vb[i + 1].y = mo_vec[1].y;
+            }
+         }
+
+         /* fall-through: every macroblock type needs the position and Y/Cb/Cr texcoords below */
+      }
+      case PIPE_MPEG12_MACROBLOCK_TYPE_INTRA:
+      {
+         const struct vertex2f unit =
+         {
+            r->surface_tex_inv_size.x * MACROBLOCK_WIDTH,
+            r->surface_tex_inv_size.y * MACROBLOCK_HEIGHT
+         };
+         const struct vertex2f half =
+         {
+            r->surface_tex_inv_size.x * (MACROBLOCK_WIDTH / 2),
+            r->surface_tex_inv_size.y * (MACROBLOCK_HEIGHT / 2)
+         };
+         const bool use_zb = r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ONE;
+
+         struct vert_stream_0 *vb = ycbcr_vb + pos * 24;
+         /* Four quadrants of 6 vertices each (TL, TR, BL, BR); 32/16/8/4 are the luma cbp masks, 2 and 1 the Cb/Cr masks */
+         SET_BLOCK(vb, mb->cbp, mb->mbx, mb->mby,
+                   unit.x, unit.y, 0, 0, half.x, half.y,
+                   32, 2, 1, use_zb, r->zero_block);
+
+         SET_BLOCK(vb + 6, mb->cbp, mb->mbx, mb->mby,
+                   unit.x, unit.y, half.x, 0, half.x, half.y,
+                   16, 2, 1, use_zb, r->zero_block);
+
+         SET_BLOCK(vb + 12, mb->cbp, mb->mbx, mb->mby,
+                   unit.x, unit.y, 0, half.y, half.x, half.y,
+                   8, 2, 1, use_zb, r->zero_block);
+
+         SET_BLOCK(vb + 18, mb->cbp, mb->mbx, mb->mby,
+                   unit.x, unit.y, half.x, half.y, half.x, half.y,
+                   4, 2, 1, use_zb, r->zero_block);
+
+         break;
+      }
+      default:
+         assert(0);
+   }
+}
+
+static void
+gen_macroblock_stream(struct vl_mpeg12_mc_renderer *r,
+                      unsigned *num_macroblocks)
+{
+   /*
+    * Sort the batched macroblocks by MACROBLOCK_TYPE while generating their vertices, so that
+    * every type ends up in one contiguous run of the vertex buffers (runs ordered by enum value).
+    * num_macroblocks[] receives the per-type counts; it is incremented, not reset, so the
+    * caller has to pass it in zeroed.
+    */
+   unsigned next_slot[NUM_MACROBLOCK_TYPES];
+   struct vert_stream_0 *ycbcr_verts;
+   struct vertex2f *ref_verts[2];
+   unsigned n;
+
+   assert(r);
+   assert(num_macroblocks);
+
+   /* Pass 1: count the macroblocks of each type */
+   for (n = 0; n < r->num_macroblocks; ++n)
+      ++num_macroblocks[get_macroblock_type(&r->macroblock_buf[n])];
+
+   /* Exclusive prefix sum of the counts: the first slot of each type's run */
+   next_slot[0] = 0;
+   for (n = 1; n < NUM_MACROBLOCK_TYPES; ++n)
+      next_slot[n] = next_slot[n - 1] + num_macroblocks[n - 1];
+
+   /* Map all three vertex streams for CPU writes */
+   ycbcr_verts = (struct vert_stream_0 *)
+      pipe_buffer_map(r->pipe->screen, r->vertex_bufs.individual.ycbcr.buffer,
+                      PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_DISCARD);
+   for (n = 0; n < 2; ++n)
+      ref_verts[n] = (struct vertex2f *)
+         pipe_buffer_map(r->pipe->screen, r->vertex_bufs.individual.ref[n].buffer,
+                         PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_DISCARD);
+
+   /* Pass 2: emit each macroblock into the next free slot of its type's run */
+   for (n = 0; n < r->num_macroblocks; ++n) {
+      struct pipe_mpeg12_macroblock *mb = &r->macroblock_buf[n];
+      unsigned *slot = &next_slot[get_macroblock_type(mb)];
+
+      gen_macroblock_verts(r, mb, *slot, ycbcr_verts, ref_verts);
+      ++*slot;
+   }
+
+   /* Writing is finished: unmap the streams in the order they were mapped */
+   pipe_buffer_unmap(r->pipe->screen, r->vertex_bufs.individual.ycbcr.buffer);
+   for (n = 0; n < 2; ++n)
+      pipe_buffer_unmap(r->pipe->screen, r->vertex_bufs.individual.ref[n].buffer);
+}
+
+static void
+flush(struct vl_mpeg12_mc_renderer *r)
+{
+   unsigned num_macroblocks[NUM_MACROBLOCK_TYPES] = { 0 };
+   unsigned vb_start = 0;
+   struct vertex_shader_consts *vs_consts;
+   unsigned i;
+
+   assert(r);
+   assert(r->num_macroblocks == r->macroblocks_per_batch);
+
+   gen_macroblock_stream(r, num_macroblocks);
+
+   r->fb_state.cbufs[0] = r->pipe->screen->get_tex_surface
+   (
+      r->pipe->screen, r->surface,
+      0, 0, 0, PIPE_BUFFER_USAGE_GPU_WRITE
+   );
+
+   r->pipe->set_framebuffer_state(r->pipe, &r->fb_state);
+   r->pipe->set_viewport_state(r->pipe, &r->viewport);
+   r->pipe->set_scissor_state(r->pipe, &r->scissor);
+
+   vs_consts = pipe_buffer_map
+   (
+      r->pipe->screen, r->vs_const_buf.buffer,
+      PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_DISCARD
+   );
+
+   vs_consts->denorm.x = r->surface->width[0];
+   vs_consts->denorm.y = r->surface->height[0];
+
+   pipe_buffer_unmap(r->pipe->screen, r->vs_const_buf.buffer);
+
+   r->pipe->set_constant_buffer(r->pipe, PIPE_SHADER_VERTEX, 0,
+                                &r->vs_const_buf);
+   r->pipe->set_constant_buffer(r->pipe, PIPE_SHADER_FRAGMENT, 0,
+                                &r->fs_const_buf);
+
+   if (num_macroblocks[MACROBLOCK_TYPE_INTRA] > 0) {
+      r->pipe->set_vertex_buffers(r->pipe, 1, r->vertex_bufs.all);
+      r->pipe->set_vertex_elements(r->pipe, 4, r->vertex_elems);
+      r->pipe->set_sampler_textures(r->pipe, 3, r->textures.all);
+      r->pipe->bind_sampler_states(r->pipe, 3, r->samplers.all);
+      r->pipe->bind_vs_state(r->pipe, r->i_vs);
+      r->pipe->bind_fs_state(r->pipe, r->i_fs);
+
+      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
+                           num_macroblocks[MACROBLOCK_TYPE_INTRA] * 24);
+      vb_start += num_macroblocks[MACROBLOCK_TYPE_INTRA] * 24;
+   }
+
+   if (num_macroblocks[MACROBLOCK_TYPE_FWD_FRAME_PRED] > 0) {
+      r->pipe->set_vertex_buffers(r->pipe, 2, r->vertex_bufs.all);
+      r->pipe->set_vertex_elements(r->pipe, 6, r->vertex_elems);
+      r->textures.individual.ref[0] = r->past;
+      r->pipe->set_sampler_textures(r->pipe, 4, r->textures.all);
+      r->pipe->bind_sampler_states(r->pipe, 4, r->samplers.all);
+      r->pipe->bind_vs_state(r->pipe, r->p_vs[0]);
+      r->pipe->bind_fs_state(r->pipe, r->p_fs[0]);
+
+      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
+                           num_macroblocks[MACROBLOCK_TYPE_FWD_FRAME_PRED] * 24);
+      vb_start += num_macroblocks[MACROBLOCK_TYPE_FWD_FRAME_PRED] * 24;
+   }
+
+   if (false /*num_macroblocks[MACROBLOCK_TYPE_FWD_FIELD_PRED] > 0 */ ) {
+      r->pipe->set_vertex_buffers(r->pipe, 2, r->vertex_bufs.all);
+      r->pipe->set_vertex_elements(r->pipe, 6, r->vertex_elems);
+      r->textures.individual.ref[0] = r->past;
+      r->pipe->set_sampler_textures(r->pipe, 4, r->textures.all);
+      r->pipe->bind_sampler_states(r->pipe, 4, r->samplers.all);
+      r->pipe->bind_vs_state(r->pipe, r->p_vs[1]);
+      r->pipe->bind_fs_state(r->pipe, r->p_fs[1]);
+
+      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
+                           num_macroblocks[MACROBLOCK_TYPE_FWD_FIELD_PRED] * 24);
+      vb_start += num_macroblocks[MACROBLOCK_TYPE_FWD_FIELD_PRED] * 24;
+   }
+
+   if (num_macroblocks[MACROBLOCK_TYPE_BKWD_FRAME_PRED] > 0) {
+      r->pipe->set_vertex_buffers(r->pipe, 2, r->vertex_bufs.all);
+      r->pipe->set_vertex_elements(r->pipe, 6, r->vertex_elems);
+      r->textures.individual.ref[0] = r->future;
+      r->pipe->set_sampler_textures(r->pipe, 4, r->textures.all);
+      r->pipe->bind_sampler_states(r->pipe, 4, r->samplers.all);
+      r->pipe->bind_vs_state(r->pipe, r->p_vs[0]);
+      r->pipe->bind_fs_state(r->pipe, r->p_fs[0]);
+
+      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
+                           num_macroblocks[MACROBLOCK_TYPE_BKWD_FRAME_PRED] * 24);
+      vb_start += num_macroblocks[MACROBLOCK_TYPE_BKWD_FRAME_PRED] * 24;
+   }
+
+   if (false /*num_macroblocks[MACROBLOCK_TYPE_BKWD_FIELD_PRED] > 0 */ ) {
+      r->pipe->set_vertex_buffers(r->pipe, 2, r->vertex_bufs.all);
+      r->pipe->set_vertex_elements(r->pipe, 6, r->vertex_elems);
+      r->textures.individual.ref[0] = r->future;
+      r->pipe->set_sampler_textures(r->pipe, 4, r->textures.all);
+      r->pipe->bind_sampler_states(r->pipe, 4, r->samplers.all);
+      r->pipe->bind_vs_state(r->pipe, r->p_vs[1]);
+      r->pipe->bind_fs_state(r->pipe, r->p_fs[1]);
+
+      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
+                           num_macroblocks[MACROBLOCK_TYPE_BKWD_FIELD_PRED] * 24);
+      vb_start += num_macroblocks[MACROBLOCK_TYPE_BKWD_FIELD_PRED] * 24;
+   }
+
+   if (num_macroblocks[MACROBLOCK_TYPE_BI_FRAME_PRED] > 0) {
+      r->pipe->set_vertex_buffers(r->pipe, 3, r->vertex_bufs.all);
+      r->pipe->set_vertex_elements(r->pipe, 8, r->vertex_elems);
+      r->textures.individual.ref[0] = r->past;
+      r->textures.individual.ref[1] = r->future;
+      r->pipe->set_sampler_textures(r->pipe, 5, r->textures.all);
+      r->pipe->bind_sampler_states(r->pipe, 5, r->samplers.all);
+      r->pipe->bind_vs_state(r->pipe, r->b_vs[0]);
+      r->pipe->bind_fs_state(r->pipe, r->b_fs[0]);
+
+      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
+                           num_macroblocks[MACROBLOCK_TYPE_BI_FRAME_PRED] * 24);
+      vb_start += num_macroblocks[MACROBLOCK_TYPE_BI_FRAME_PRED] * 24;
+   }
+
+   if (false /*num_macroblocks[MACROBLOCK_TYPE_BI_FIELD_PRED] > 0 */ ) {
+      r->pipe->set_vertex_buffers(r->pipe, 3, r->vertex_bufs.all);
+      r->pipe->set_vertex_elements(r->pipe, 8, r->vertex_elems);
+      r->textures.individual.ref[0] = r->past;
+      r->textures.individual.ref[1] = r->future;
+      r->pipe->set_sampler_textures(r->pipe, 5, r->textures.all);
+      r->pipe->bind_sampler_states(r->pipe, 5, r->samplers.all);
+      r->pipe->bind_vs_state(r->pipe, r->b_vs[1]);
+      r->pipe->bind_fs_state(r->pipe, r->b_fs[1]);
+
+      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
+                           num_macroblocks[MACROBLOCK_TYPE_BI_FIELD_PRED] * 24);
+      vb_start += num_macroblocks[MACROBLOCK_TYPE_BI_FIELD_PRED] * 24;
+   }
+
+   r->pipe->flush(r->pipe, PIPE_FLUSH_RENDER_CACHE, r->fence);
+   pipe_surface_reference(&r->fb_state.cbufs[0], NULL);
+
+   if (r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ONE)
+      for (i = 0; i < 3; ++i)
+         r->zero_block[i].x = ZERO_BLOCK_NIL;
+
+   r->num_macroblocks = 0;
+}
+
+static void
+grab_frame_coded_block(short *src, short *dst, unsigned dst_pitch)
+{
+   /* Copy BLOCK_HEIGHT packed source rows to dst rows spaced dst_pitch shorts apart. */
+   unsigned row;
+
+   assert(src);
+   assert(dst);
+   for (row = 0; row < BLOCK_HEIGHT; ++row)
+      memcpy(&dst[row * dst_pitch], &src[row * BLOCK_WIDTH], BLOCK_WIDTH * 2);
+}
+
+static void
+grab_field_coded_block(short *src, short *dst, unsigned dst_pitch)
+{
+   /* Copy BLOCK_HEIGHT packed source rows, writing each to every second dst row. */
+   unsigned row;
+
+   assert(src);
+   assert(dst);
+   for (row = 0; row < BLOCK_HEIGHT; ++row)
+      memcpy(&dst[row * dst_pitch * 2], &src[row * BLOCK_WIDTH], BLOCK_WIDTH * 2);
+}
+
+static void
+fill_zero_block(short *dst, unsigned dst_pitch)
+{
+   /* Clear BLOCK_WIDTH * 2 bytes in each of BLOCK_HEIGHT rows, dst_pitch shorts apart. */
+   unsigned row;
+
+   assert(dst);
+   for (row = 0; row < BLOCK_HEIGHT; ++row)
+      memset(&dst[row * dst_pitch], 0, BLOCK_WIDTH * 2);
+}
+
+static void
+grab_blocks(struct vl_mpeg12_mc_renderer *r, unsigned mbx, unsigned mby,
+            enum pipe_mpeg12_dct_type dct_type, unsigned cbp, short *blocks)
+{
+   unsigned tex_pitch;
+   short *texels;
+   unsigned tb = 0, sb = 0;
+   unsigned mbpx = mbx * MACROBLOCK_WIDTH, mbpy = mby * MACROBLOCK_HEIGHT;
+   unsigned x, y;
+   /* Copies the blocks flagged in cbp from blocks into r->texels[0..2]. */
+   assert(r);
+   assert(blocks);
+   /* Row pitch of plane 0 (stride / block size), then the macroblock's first texel. */
+   tex_pitch = r->tex_transfer[0]->stride / r->tex_transfer[0]->block.size;
+   texels = r->texels[0] + mbpy * tex_pitch + mbpx;
+   /* Plane 0 holds a 2x2 grid of blocks, flagged by cbp bits 5 down to 2. */
+   for (y = 0; y < 2; ++y) {
+      for (x = 0; x < 2; ++x, ++tb) {
+         if ((cbp >> (5 - tb)) & 1) {
+            if (dct_type == PIPE_MPEG12_DCT_TYPE_FRAME) {
+               grab_frame_coded_block(blocks + sb * BLOCK_WIDTH * BLOCK_HEIGHT,
+                                      texels + y * tex_pitch * BLOCK_WIDTH + /* NOTE(review): BLOCK_HEIGHT intended? */
+                                      x * BLOCK_WIDTH, tex_pitch);
+            }
+            else {
+               grab_field_coded_block(blocks + sb * BLOCK_WIDTH * BLOCK_HEIGHT,
+                                      texels + y * tex_pitch + x * BLOCK_WIDTH,
+                                      tex_pitch);
+            }
+            /* sb advances only for coded blocks, which are packed back to back in blocks. */
+            ++sb;
+         }
+         else if (r->eb_handling != VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_NONE) {
+            if (r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ALL ||
+                ZERO_BLOCK_IS_NIL(r->zero_block[0])) {
+               fill_zero_block(texels + y * tex_pitch * BLOCK_WIDTH + x * BLOCK_WIDTH, tex_pitch);
+               if (r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ONE) {
+                  r->zero_block[0].x = (mbpx + x * 8) * r->surface_tex_inv_size.x;
+                  r->zero_block[0].y = (mbpy + y * 8) * r->surface_tex_inv_size.y;
+               }
+            }
+         }
+      }
+   }
+
+   /* TODO: Implement 422, 444 */
+   assert(r->chroma_format == PIPE_VIDEO_CHROMA_FORMAT_420);
+   /* Planes 1 and 2 use half the macroblock offset in both directions. */
+   mbpx /= 2;
+   mbpy /= 2;
+   /* One block per plane here, flagged by cbp bits 1 and 0. */
+   for (tb = 0; tb < 2; ++tb) {
+      tex_pitch = r->tex_transfer[tb + 1]->stride / r->tex_transfer[tb + 1]->block.size;
+      texels = r->texels[tb + 1] + mbpy * tex_pitch + mbpx;
+
+      if ((cbp >> (1 - tb)) & 1) {
+         grab_frame_coded_block(blocks + sb * BLOCK_WIDTH * BLOCK_HEIGHT, texels, tex_pitch);
+         ++sb;
+      }
+      else if (r->eb_handling != VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_NONE) {
+         if (r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ALL ||
+             ZERO_BLOCK_IS_NIL(r->zero_block[tb + 1])) {
+            fill_zero_block(texels, tex_pitch);
+            if (r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ONE) {
+               r->zero_block[tb + 1].x = (mbpx << 1) * r->surface_tex_inv_size.x;
+               r->zero_block[tb + 1].y = (mbpy << 1) * r->surface_tex_inv_size.y;
+            }
+         }
+      }
+   }
+}
+
+static void
+grab_macroblock(struct vl_mpeg12_mc_renderer *r,
+                struct pipe_mpeg12_macroblock *mb)
+{
+   /* Append a copy of *mb to the current batch, then transfer its blocks. */
+   assert(r);
+   assert(mb);
+   assert(r->num_macroblocks < r->macroblocks_per_batch);
+
+   memcpy(r->macroblock_buf + r->num_macroblocks, mb, sizeof(*mb));
+
+   grab_blocks(r, mb->mbx, mb->mby, mb->dct_type, mb->cbp, mb->blocks);
+
+   r->num_macroblocks++;
+}
+
+bool
+vl_mpeg12_mc_renderer_init(struct vl_mpeg12_mc_renderer *renderer,
+                           struct pipe_context *pipe,
+                           unsigned picture_width,
+                           unsigned picture_height,
+                           enum pipe_video_chroma_format chroma_format,
+                           enum VL_MPEG12_MC_RENDERER_BUFFER_MODE bufmode,
+                           enum VL_MPEG12_MC_RENDERER_EMPTY_BLOCK eb_handling,
+                           bool pot_buffers)
+{
+   unsigned plane;
+
+   assert(renderer);
+   assert(pipe);
+   /* TODO: Only picture-sized buffering is implemented */
+   assert(bufmode == VL_MPEG12_MC_RENDERER_BUFFER_PICTURE);
+   /* TODO: XFER_NONE is not implemented */
+   /* XXX: XFER_ALL sampling issue at block edges when using bilinear filtering */
+   assert(eb_handling != VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_NONE);
+   /* TODO: Non-pot buffers untested, probably doesn't work without changes to texcoord generation, vert shader, etc */
+   assert(pot_buffers);
+
+   /* Zero everything, then record the requested configuration. */
+   memset(renderer, 0, sizeof(*renderer));
+   renderer->pipe = pipe;
+   renderer->picture_width = picture_width;
+   renderer->picture_height = picture_height;
+   renderer->chroma_format = chroma_format;
+   renderer->bufmode = bufmode;
+   renderer->eb_handling = eb_handling;
+   renderer->pot_buffers = pot_buffers;
+
+   if (!init_pipe_state(renderer))
+      return false;
+   if (!init_shaders(renderer))
+      goto fail_shaders;
+   if (!init_buffers(renderer))
+      goto fail_buffers;
+
+   renderer->surface = NULL;
+   renderer->past = NULL;
+   renderer->future = NULL;
+   renderer->num_macroblocks = 0;
+   for (plane = 0; plane < 3; ++plane)
+      renderer->zero_block[plane].x = ZERO_BLOCK_NIL;
+
+   xfer_buffers_map(renderer);
+   return true;
+
+fail_buffers:
+   cleanup_shaders(renderer);
+fail_shaders:
+   cleanup_pipe_state(renderer);
+   return false;
+}
+
+void
+vl_mpeg12_mc_renderer_cleanup(struct vl_mpeg12_mc_renderer *renderer)
+{
+   assert(renderer);
+   /* Unmap first: vl_mpeg12_mc_renderer_init() returns with the buffers mapped. */
+   xfer_buffers_unmap(renderer);
+   /* Presumably releases what the matching init_* helpers created - confirm against them. */
+   cleanup_pipe_state(renderer);
+   cleanup_shaders(renderer);
+   cleanup_buffers(renderer);
+}
+
+/* Queue macroblocks for surface, flushing on a full batch or a surface change. */
+void
+vl_mpeg12_mc_renderer_render_macroblocks(struct vl_mpeg12_mc_renderer *renderer,
+                                         struct pipe_texture *surface,
+                                         struct pipe_texture *past,
+                                         struct pipe_texture *future,
+                                         unsigned num_macroblocks,
+                                         struct pipe_mpeg12_macroblock *mpeg12_macroblocks,
+                                         struct pipe_fence_handle **fence)
+{
+   bool new_surface = false;
+
+   assert(renderer);
+   assert(surface);
+   assert(num_macroblocks);
+   assert(mpeg12_macroblocks);
+
+   if (renderer->surface) {
+      if (surface != renderer->surface) {
+         if (renderer->num_macroblocks > 0) {
+            xfer_buffers_unmap(renderer);
+            flush(renderer);
+            xfer_buffers_map(renderer); /* as after a full batch: grab_blocks() needs the mapping */
+         }
+         new_surface = true;
+      }
+
+      /* If the surface we're rendering hasn't changed the ref frames shouldn't change. */
+      assert(surface != renderer->surface || renderer->past == past);
+      assert(surface != renderer->surface || renderer->future == future);
+   }
+   else
+      new_surface = true;
+
+   if (new_surface) {
+      renderer->surface = surface;
+      renderer->past = past;
+      renderer->future = future;
+      renderer->fence = fence;
+      renderer->surface_tex_inv_size.x = 1.0f / surface->width[0];
+      renderer->surface_tex_inv_size.y = 1.0f / surface->height[0];
+   }
+
+   while (num_macroblocks) {
+      unsigned left_in_batch = renderer->macroblocks_per_batch - renderer->num_macroblocks;
+      unsigned num_to_submit = MIN2(num_macroblocks, left_in_batch);
+      unsigned i;
+
+      for (i = 0; i < num_to_submit; ++i) {
+         assert(mpeg12_macroblocks[i].base.codec == PIPE_VIDEO_CODEC_MPEG12);
+         grab_macroblock(renderer, &mpeg12_macroblocks[i]);
+      }
+
+      mpeg12_macroblocks += num_to_submit; /* next batch continues after the queued entries */
+      num_macroblocks -= num_to_submit;
+
+      if (renderer->num_macroblocks == renderer->macroblocks_per_batch) {
+         xfer_buffers_unmap(renderer);
+         flush(renderer);
+         xfer_buffers_map(renderer);
+         /* Next time we get this surface it may have new ref frames */
+         renderer->surface = NULL;
+      }
+   }
+}
diff --git a/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.h b/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.h
new file mode 100644
index 0000000000..64184337a0
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.h
@@ -0,0 +1,121 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef vl_mpeg12_mc_renderer_h
+#define vl_mpeg12_mc_renderer_h
+
+#include <pipe/p_compiler.h>
+#include <pipe/p_state.h>
+#include <pipe/p_video_state.h>
+
+struct pipe_context;
+struct pipe_video_surface;
+struct pipe_macroblock;
+
+/* Buffering granularity. A slice is video-width (rounded up to a multiple of macroblock width) x macroblock height */
+enum VL_MPEG12_MC_RENDERER_BUFFER_MODE
+{
+   VL_MPEG12_MC_RENDERER_BUFFER_SLICE,  /* Saves memory at the cost of smaller batches */
+   VL_MPEG12_MC_RENDERER_BUFFER_PICTURE /* Larger batches, more memory; the only mode init currently accepts */
+};
+
+enum VL_MPEG12_MC_RENDERER_EMPTY_BLOCK /* How blocks not flagged in the coded block pattern are handled */
+{
+   VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ALL, /* Every empty block is zero-filled; waste of memory bandwidth */
+   VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ONE, /* One zero block per plane is filled and its position recorded; can only do point-filtering when interpolating subsampled chroma channels */
+   VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_NONE /* Needs conditional texel fetch! Currently rejected by an assert in init */
+};
+
+struct vl_mpeg12_mc_renderer /* State of one motion compensation renderer instance */
+{
+   struct pipe_context *pipe;
+   unsigned picture_width;
+   unsigned picture_height;
+   enum pipe_video_chroma_format chroma_format;
+   enum VL_MPEG12_MC_RENDERER_BUFFER_MODE bufmode;
+   enum VL_MPEG12_MC_RENDERER_EMPTY_BLOCK eb_handling;
+   bool pot_buffers;
+   unsigned macroblocks_per_batch; /* Batch capacity; a flush happens when num_macroblocks reaches it */
+
+   struct pipe_viewport_state viewport;
+   struct pipe_scissor_state scissor;
+   struct pipe_constant_buffer vs_const_buf;
+   struct pipe_constant_buffer fs_const_buf;
+   struct pipe_framebuffer_state fb_state;
+   struct pipe_vertex_element vertex_elems[8];
+   /* Sampler states, addressable as one array or by role */
+   union
+   {
+      void *all[5];
+      struct { void *y, *cb, *cr, *ref[2]; } individual;
+   } samplers;
+   /* Shader handles: i_* for intra, p_*[] for single-reference, b_*[] for bi-predicted macroblocks */
+   void *i_vs, *p_vs[2], *b_vs[2];
+   void *i_fs, *p_fs[2], *b_fs[2];
+   /* Textures bound for sampling; ref[] is pointed at past/future before each draw */
+   union
+   {
+      struct pipe_texture *all[5];
+      struct { struct pipe_texture *y, *cb, *cr, *ref[2]; } individual;
+   } textures;
+   /* Vertex buffers, addressable as one array or by role */
+   union
+   {
+      struct pipe_vertex_buffer all[3];
+      struct { struct pipe_vertex_buffer ycbcr, ref[2]; } individual;
+   } vertex_bufs;
+   /* Current batch: target surface, reference frames and queued macroblocks */
+   struct pipe_texture *surface, *past, *future;
+   struct pipe_fence_handle **fence;
+   unsigned num_macroblocks; /* Macroblocks queued in macroblock_buf so far */
+   struct pipe_mpeg12_macroblock *macroblock_buf;
+   struct pipe_transfer *tex_transfer[3];
+   short *texels[3]; /* Per-plane destinations written by grab_blocks() */
+   struct { float x, y; } surface_tex_inv_size; /* 1 / surface width and 1 / surface height */
+   struct { float x, y; } zero_block[3]; /* Per plane; x is ZERO_BLOCK_NIL until a zero block is recorded */
+};
+
+bool vl_mpeg12_mc_renderer_init(struct vl_mpeg12_mc_renderer *renderer,
+                                struct pipe_context *pipe,
+                                unsigned picture_width,
+                                unsigned picture_height,
+                                enum pipe_video_chroma_format chroma_format,
+                                enum VL_MPEG12_MC_RENDERER_BUFFER_MODE bufmode,
+                                enum VL_MPEG12_MC_RENDERER_EMPTY_BLOCK eb_handling,
+                                bool pot_buffers); /* Returns false if any initialization step fails */
+/* Unmaps the transfer buffers, then runs the pipe state, shader and buffer cleanup. */
+void vl_mpeg12_mc_renderer_cleanup(struct vl_mpeg12_mc_renderer *renderer);
+/* Queues macroblocks for surface; flushes when a batch fills up or the surface changes. */
+void vl_mpeg12_mc_renderer_render_macroblocks(struct vl_mpeg12_mc_renderer *renderer,
+                                              struct pipe_texture *surface,
+                                              struct pipe_texture *past,
+                                              struct pipe_texture *future,
+                                              unsigned num_macroblocks,
+                                              struct pipe_mpeg12_macroblock *mpeg12_macroblocks,
+                                              struct pipe_fence_handle **fence);
+
+#endif /* vl_mpeg12_mc_renderer_h */
diff --git a/src/gallium/auxiliary/vl/vl_shader_build.c b/src/gallium/auxiliary/vl/vl_shader_build.c
new file mode 100644
index 0000000000..faa20a903c
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_shader_build.c
@@ -0,0 +1,242 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "vl_shader_build.h"
+#include <assert.h>
+#include <tgsi/tgsi_parse.h>
+#include <tgsi/tgsi_build.h>
+
+struct tgsi_full_declaration vl_decl_input(unsigned int name, unsigned int index, unsigned int first, unsigned int last)
+{
+   /* Input-file registers [first, last], tagged with the given semantic name/index. */
+   struct tgsi_full_declaration input = tgsi_default_full_declaration();
+
+   input.DeclarationRange.First = first;
+   input.DeclarationRange.Last = last;
+   input.Declaration.File = TGSI_FILE_INPUT;
+   input.Declaration.Semantic = 1;
+   input.Semantic.SemanticName = name;
+   input.Semantic.SemanticIndex = index;
+   return input;
+}
+
+struct tgsi_full_declaration vl_decl_interpolated_input
+(
+   unsigned int name,
+   unsigned int index,
+   unsigned int first,
+   unsigned int last,
+   int interpolation
+)
+{
+   /* Input-file registers [first, last] with a semantic and an interpolation mode. */
+   struct tgsi_full_declaration input = tgsi_default_full_declaration();
+
+   /* Only constant, linear and perspective interpolation are accepted. */
+   assert(interpolation == TGSI_INTERPOLATE_CONSTANT ||
+          interpolation == TGSI_INTERPOLATE_LINEAR ||
+          interpolation == TGSI_INTERPOLATE_PERSPECTIVE);
+
+   input.DeclarationRange.First = first;
+   input.DeclarationRange.Last = last;
+
+   input.Declaration.File = TGSI_FILE_INPUT;
+   input.Declaration.Interpolate = interpolation;
+   input.Declaration.Semantic = 1;
+   input.Semantic.SemanticName = name;
+   input.Semantic.SemanticIndex = index;
+
+   return input;
+}
+
+struct tgsi_full_declaration vl_decl_constants(unsigned int name, unsigned int index, unsigned int first, unsigned int last)
+{
+   /* Constant-file registers [first, last], tagged with the given semantic name/index. */
+   struct tgsi_full_declaration consts = tgsi_default_full_declaration();
+
+   consts.DeclarationRange.First = first;
+   consts.DeclarationRange.Last = last;
+   consts.Declaration.File = TGSI_FILE_CONSTANT;
+   consts.Declaration.Semantic = 1;
+   consts.Semantic.SemanticName = name;
+   consts.Semantic.SemanticIndex = index;
+   return consts;
+}
+
+struct tgsi_full_declaration vl_decl_output(unsigned int name, unsigned int index, unsigned int first, unsigned int last)
+{
+   /* Output-file registers [first, last], tagged with the given semantic name/index. */
+   struct tgsi_full_declaration output = tgsi_default_full_declaration();
+
+   output.DeclarationRange.First = first;
+   output.DeclarationRange.Last = last;
+   output.Declaration.File = TGSI_FILE_OUTPUT;
+   output.Declaration.Semantic = 1;
+   output.Semantic.SemanticName = name;
+   output.Semantic.SemanticIndex = index;
+   return output;
+}
+
+struct tgsi_full_declaration vl_decl_temps(unsigned int first, unsigned int last)
+{
+   /* Temporary-file registers [first, last]; no semantic is set on them. */
+   struct tgsi_full_declaration decl = tgsi_default_full_declaration();
+
+   decl.Declaration.File = TGSI_FILE_TEMPORARY;
+   decl.DeclarationRange.First = first;
+   decl.DeclarationRange.Last = last;
+
+   return decl;
+}
+
+struct tgsi_full_declaration vl_decl_samplers(unsigned int first, unsigned int last)
+{
+   /* Sampler-file registers [first, last]; no semantic is set on them. */
+   struct tgsi_full_declaration decl = tgsi_default_full_declaration();
+
+   decl.Declaration.File = TGSI_FILE_SAMPLER;
+   decl.DeclarationRange.First = first;
+   decl.DeclarationRange.Last = last;
+
+   return decl;
+}
+
+struct tgsi_full_instruction vl_inst2
+(
+   int opcode,
+   enum tgsi_file_type dst_file,
+   unsigned int dst_index,
+   enum tgsi_file_type src_file,
+   unsigned int src_index
+)
+{
+   /* Builds "opcode dst, src": one destination register, one source register. */
+   struct tgsi_full_instruction insn = tgsi_default_full_instruction();
+
+   insn.Instruction.Opcode = opcode;
+   insn.Instruction.NumDstRegs = 1;
+   insn.Instruction.NumSrcRegs = 1;
+   insn.FullDstRegisters[0].DstRegister.File = dst_file;
+   insn.FullDstRegisters[0].DstRegister.Index = dst_index;
+   insn.FullSrcRegisters[0].SrcRegister.File = src_file;
+   insn.FullSrcRegisters[0].SrcRegister.Index = src_index;
+   return insn;
+}
+
+struct tgsi_full_instruction vl_inst3
+(
+   int opcode,
+   enum tgsi_file_type dst_file,
+   unsigned int dst_index,
+   enum tgsi_file_type src1_file,
+   unsigned int src1_index,
+   enum tgsi_file_type src2_file,
+   unsigned int src2_index
+)
+{
+   /* Builds "opcode dst, src1, src2": one destination register, two sources. */
+   struct tgsi_full_instruction insn = tgsi_default_full_instruction();
+
+   insn.Instruction.Opcode = opcode;
+   insn.Instruction.NumDstRegs = 1;
+   insn.Instruction.NumSrcRegs = 2;
+   insn.FullDstRegisters[0].DstRegister.File = dst_file;
+   insn.FullDstRegisters[0].DstRegister.Index = dst_index;
+   insn.FullSrcRegisters[0].SrcRegister.File = src1_file;
+   insn.FullSrcRegisters[0].SrcRegister.Index = src1_index;
+   insn.FullSrcRegisters[1].SrcRegister.File = src2_file;
+   insn.FullSrcRegisters[1].SrcRegister.Index = src2_index;
+   return insn;
+}
+
+struct tgsi_full_instruction vl_tex
+(
+   int tex,
+   enum tgsi_file_type dst_file,
+   unsigned int dst_index,
+   enum tgsi_file_type src1_file,
+   unsigned int src1_index,
+   enum tgsi_file_type src2_file,
+   unsigned int src2_index
+)
+{
+   /* Builds a TEX instruction with texture target tex, one destination and two sources. */
+   struct tgsi_full_instruction insn = tgsi_default_full_instruction();
+
+   insn.Instruction.Opcode = TGSI_OPCODE_TEX;
+   insn.InstructionExtTexture.Texture = tex;
+   insn.Instruction.NumDstRegs = 1;
+   insn.Instruction.NumSrcRegs = 2;
+   insn.FullDstRegisters[0].DstRegister.File = dst_file;
+   insn.FullDstRegisters[0].DstRegister.Index = dst_index;
+   insn.FullSrcRegisters[0].SrcRegister.File = src1_file;
+   insn.FullSrcRegisters[0].SrcRegister.Index = src1_index;
+   insn.FullSrcRegisters[1].SrcRegister.File = src2_file;
+   insn.FullSrcRegisters[1].SrcRegister.Index = src2_index;
+   return insn;
+}
+
+struct tgsi_full_instruction vl_inst4
+(
+   int opcode,
+   enum tgsi_file_type dst_file,
+   unsigned int dst_index,
+   enum tgsi_file_type src1_file,
+   unsigned int src1_index,
+   enum tgsi_file_type src2_file,
+   unsigned int src2_index,
+   enum tgsi_file_type src3_file,
+   unsigned int src3_index
+)
+{
+   /* Builds "opcode dst, src1, src2, src3": one destination register, three sources. */
+   struct tgsi_full_instruction insn = tgsi_default_full_instruction();
+
+   insn.Instruction.Opcode = opcode;
+   insn.Instruction.NumDstRegs = 1;
+   insn.Instruction.NumSrcRegs = 3;
+   insn.FullDstRegisters[0].DstRegister.File = dst_file;
+   insn.FullDstRegisters[0].DstRegister.Index = dst_index;
+   insn.FullSrcRegisters[0].SrcRegister.File = src1_file;
+   insn.FullSrcRegisters[0].SrcRegister.Index = src1_index;
+   insn.FullSrcRegisters[1].SrcRegister.File = src2_file;
+   insn.FullSrcRegisters[1].SrcRegister.Index = src2_index;
+   insn.FullSrcRegisters[2].SrcRegister.File = src3_file;
+   insn.FullSrcRegisters[2].SrcRegister.Index = src3_index;
+   return insn;
+}
+
+struct tgsi_full_instruction vl_end(void)
+{
+   /* Builds an END instruction; both register counts are set to zero. */
+   struct tgsi_full_instruction end = tgsi_default_full_instruction();
+
+   end.Instruction.NumSrcRegs = 0;
+   end.Instruction.NumDstRegs = 0;
+   end.Instruction.Opcode = TGSI_OPCODE_END;
+   return end;
+}
diff --git a/src/gallium/auxiliary/vl/vl_shader_build.h b/src/gallium/auxiliary/vl/vl_shader_build.h
new file mode 100644
index 0000000000..5da71f8e13
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_shader_build.h
@@ -0,0 +1,88 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef vl_shader_build_h
+#define vl_shader_build_h
+
+#include <pipe/p_shader_tokens.h>
+
+struct tgsi_full_declaration vl_decl_input(unsigned int name, unsigned int index, unsigned int first, unsigned int last); /* input registers [first, last] with a semantic */
+struct tgsi_full_declaration vl_decl_interpolated_input /* as vl_decl_input, plus a TGSI_INTERPOLATE_* mode */
+(
+   unsigned int name,
+   unsigned int index,
+   unsigned int first,
+   unsigned int last,
+   int interpolation
+);
+struct tgsi_full_declaration vl_decl_constants(unsigned int name, unsigned int index, unsigned int first, unsigned int last); /* constant registers [first, last] with a semantic */
+struct tgsi_full_declaration vl_decl_output(unsigned int name, unsigned int index, unsigned int first, unsigned int last); /* output registers [first, last] with a semantic */
+struct tgsi_full_declaration vl_decl_temps(unsigned int first, unsigned int last); /* temporary registers [first, last] */
+struct tgsi_full_declaration vl_decl_samplers(unsigned int first, unsigned int last); /* sampler registers [first, last] */
+struct tgsi_full_instruction vl_inst2 /* opcode with one destination and one source */
+(
+   int opcode,
+   enum tgsi_file_type dst_file,
+   unsigned int dst_index,
+   enum tgsi_file_type src_file,
+   unsigned int src_index
+);
+struct tgsi_full_instruction vl_inst3 /* opcode with one destination and two sources */
+(
+   int opcode,
+   enum tgsi_file_type dst_file,
+   unsigned int dst_index,
+   enum tgsi_file_type src1_file,
+   unsigned int src1_index,
+   enum tgsi_file_type src2_file,
+   unsigned int src2_index
+);
+struct tgsi_full_instruction vl_tex /* TGSI_OPCODE_TEX with texture target tex, one destination, two sources */
+(
+   int tex,
+   enum tgsi_file_type dst_file,
+   unsigned int dst_index,
+   enum tgsi_file_type src1_file,
+   unsigned int src1_index,
+   enum tgsi_file_type src2_file,
+   unsigned int src2_index
+);
+struct tgsi_full_instruction vl_inst4 /* opcode with one destination and three sources */
+(
+   int opcode,
+   enum tgsi_file_type dst_file,
+   unsigned int dst_index,
+   enum tgsi_file_type src1_file,
+   unsigned int src1_index,
+   enum tgsi_file_type src2_file,
+   unsigned int src2_index,
+   enum tgsi_file_type src3_file,
+   unsigned int src3_index
+);
+struct tgsi_full_instruction vl_end(void); /* TGSI_OPCODE_END with no operands */
+
+#endif