207 files changed, 7856 insertions, 5381 deletions
diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
index 5cafe8c3f0..8f7d3b7100 100644
--- a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
+++ b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
@@ -552,7 +552,7 @@ translate_instruction(llvm::Module *module,
       break;
    case TGSI_OPCODE_SHL:
       break;
-   case TGSI_OPCODE_SHR:
+   case TGSI_OPCODE_ISHR:
       break;
    case TGSI_OPCODE_AND:
       break;
@@ -919,7 +919,7 @@ translate_instructionir(llvm::Module *module,
       break;
    case TGSI_OPCODE_SHL:
       break;
-   case TGSI_OPCODE_SHR:
+   case TGSI_OPCODE_ISHR:
       break;
    case TGSI_OPCODE_AND:
       break;
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
index a9375abd21..ba6f7b15f9 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
@@ -80,7 +80,7 @@ struct fenced_buffer_list
  */
 struct fenced_buffer
 {
-   /* 
+   /*
     * Immutable members.
     */
 
@@ -126,8 +126,8 @@ fenced_buffer(struct pb_buffer *buf)
 /**
  * Add the buffer to the fenced list.
  * 
- * fenced_buffer_list::mutex and fenced_buffer::mutex must be held, in this 
- * order before calling this function.
+ * fenced_buffer_list::mutex and fenced_buffer::mutex must be held, in this
+ * order, before calling this function.
  * 
  * Reference count should be incremented before calling this function.
  */
@@ -191,7 +191,7 @@ fenced_buffer_remove_locked(struct fenced_buffer_list *fenced_list,
  * Wait for the fence to expire, and remove it from the fenced list.
  * 
  * fenced_buffer::mutex must be held. fenced_buffer_list::mutex must not be 
- * held -- it will
+ * held -- it will be acquired internally.
  */
 static INLINE enum pipe_error
 fenced_buffer_finish_locked(struct fenced_buffer_list *fenced_list,
@@ -207,7 +207,10 @@ fenced_buffer_finish_locked(struct fenced_buffer_list *fenced_list,
    assert(pipe_is_referenced(&fenced_buf->base.base.reference));
    assert(fenced_buf->fence);
 
-   /* Acquire the global lock */
+   /*
+    * Acquire the global lock. Must release buffer mutex first to preserve
+    * lock order.
+    */
    pipe_mutex_unlock(fenced_buf->mutex);
    pipe_mutex_lock(fenced_list->mutex);
    pipe_mutex_lock(fenced_buf->mutex);
@@ -217,7 +220,7 @@ fenced_buffer_finish_locked(struct fenced_buffer_list *fenced_list,
          /* Remove from the fenced list */
          /* TODO: remove consequents */
          fenced_buffer_remove_locked(fenced_list, fenced_buf);
-         
+
          p_atomic_dec(&fenced_buf->base.base.reference.count);
          assert(pipe_is_referenced(&fenced_buf->base.base.reference));
 
@@ -238,7 +241,7 @@ fenced_buffer_finish_locked(struct fenced_buffer_list *fenced_list,
  */
 static void
 fenced_buffer_list_check_free_locked(struct fenced_buffer_list *fenced_list, 
-                               int wait)
+                                     int wait)
 {
    struct pb_fence_ops *ops = fenced_list->ops;
    struct list_head *curr, *next;
@@ -274,7 +277,6 @@ fenced_buffer_list_check_free_locked(struct fenced_buffer_list *fenced_list,
 
       pb_buf = &fenced_buf->base;
       pb_reference(&pb_buf, NULL);
-      
 
       curr = next; 
       next = curr->next;
@@ -329,7 +331,7 @@ fenced_buffer_map(struct pb_buffer *buf,
       if((flags & PIPE_BUFFER_USAGE_DONTBLOCK) &&
           ops->fence_signalled(ops, fenced_buf->fence, 0) == 0) {
          /* Don't wait for the GPU to finish writing */
-         goto finish;
+         goto done;
       }
 
       /* Wait for the GPU to finish writing */
@@ -350,7 +352,7 @@ fenced_buffer_map(struct pb_buffer *buf,
       fenced_buf->flags |= flags & PIPE_BUFFER_USAGE_CPU_READ_WRITE;
    }
 
-finish:
+done:
    pipe_mutex_unlock(fenced_buf->mutex);
    
    return map;
@@ -391,7 +393,7 @@ fenced_buffer_validate(struct pb_buffer *buf,
       fenced_buf->vl = NULL;
       fenced_buf->validation_flags = 0;
       ret = PIPE_OK;
-      goto finish;
+      goto done;
    }
    
    assert(flags & PIPE_BUFFER_USAGE_GPU_READ_WRITE);
@@ -401,7 +403,7 @@ fenced_buffer_validate(struct pb_buffer *buf,
    /* Buffer cannot be validated in two different lists */ 
    if(fenced_buf->vl && fenced_buf->vl != vl) {
       ret = PIPE_ERROR_RETRY;
-      goto finish;
+      goto done;
    }
    
 #if 0
@@ -409,7 +411,7 @@ fenced_buffer_validate(struct pb_buffer *buf,
    if(fenced_buf->flags & PIPE_BUFFER_USAGE_CPU_READ_WRITE) {
       /* TODO: wait for the thread that mapped the buffer to unmap it */
       ret = PIPE_ERROR_RETRY;
-      goto finish;
+      goto done;
    }
    /* Final sanity checking */
    assert(!(fenced_buf->flags & PIPE_BUFFER_USAGE_CPU_READ_WRITE));
@@ -420,17 +422,17 @@ fenced_buffer_validate(struct pb_buffer *buf,
       (fenced_buf->validation_flags & flags) == flags) {
       /* Nothing to do -- buffer already validated */
       ret = PIPE_OK;
-      goto finish;
+      goto done;
    }
    
    ret = pb_validate(fenced_buf->buffer, vl, flags);
    if (ret != PIPE_OK)
-      goto finish;
+      goto done;
    
    fenced_buf->vl = vl;
    fenced_buf->validation_flags |= flags;
    
-finish:
+done:
    pipe_mutex_unlock(fenced_buf->mutex);
 
    return ret;
diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
index 01811d5011..ffed768f97 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_execmem.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
@@ -41,6 +41,12 @@
 #define MAP_ANONYMOUS MAP_ANON
 #endif
 
+#if defined(PIPE_OS_WINDOWS)
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN 1
+#endif
+#include <windows.h>
+#endif
 
 #if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS)
 
@@ -118,7 +124,29 @@ rtasm_exec_free(void *addr)
 }
 
 
-#else /* PIPE_OS_LINUX || PIPE_OS_BSD || PIPE_OS_SOLARIS */
+#elif defined(PIPE_OS_WINDOWS)
+
+
+/*
+ * Avoid Data Execution Prevention.
+ */
+
+void *
+rtasm_exec_malloc(size_t size)
+{
+   return VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
+}
+
+
+void
+rtasm_exec_free(void *addr)
+{
+   VirtualFree(addr, 0, MEM_RELEASE);
+}
+
+
+#else
+
 
 /*
  * Just use regular memory.
@@ -138,4 +166,4 @@ rtasm_exec_free(void *addr)
 }
 
 
-#endif /* PIPE_OS_LINUX || PIPE_OS_BSD || PIPE_OS_SOLARIS */
+#endif
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index 2c65ff16d8..e2e5394f86 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -128,7 +128,9 @@ static const char *semantic_names[] =
 
 static const char *immediate_type_names[] =
 {
-   "FLT32"
+   "FLT32",
+   "UINT32",
+   "INT32"
 };
 
 static const char *swizzle_names[] =
@@ -412,6 +414,12 @@ iter_immediate(
       case TGSI_IMM_FLOAT32:
          FLT( imm->u[i].Float );
          break;
+      case TGSI_IMM_UINT32:
+         UID(imm->u[i].Uint);
+         break;
+      case TGSI_IMM_INT32:
+         SID(imm->u[i].Int);
+         break;
       default:
          assert( 0 );
       }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index ba89f2fbc3..2bcb33392a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -2,6 +2,7 @@
  * 
  * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
+ * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
  * 
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
@@ -60,6 +61,7 @@
 #include "util/u_memory.h"
 #include "util/u_math.h"
 
+
 #define FAST_MATH 1
 
 #define TILE_TOP_LEFT     0
@@ -67,11 +69,329 @@
 #define TILE_BOTTOM_LEFT  2
 #define TILE_BOTTOM_RIGHT 3
 
+static void
+micro_abs(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = fabsf(src->f[0]);
+   dst->f[1] = fabsf(src->f[1]);
+   dst->f[2] = fabsf(src->f[2]);
+   dst->f[3] = fabsf(src->f[3]);
+}
+
+static void
+micro_arl(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->i[0] = (int)floorf(src->f[0]);
+   dst->i[1] = (int)floorf(src->f[1]);
+   dst->i[2] = (int)floorf(src->f[2]);
+   dst->i[3] = (int)floorf(src->f[3]);
+}
+
+static void
+micro_arr(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->i[0] = (int)floorf(src->f[0] + 0.5f);
+   dst->i[1] = (int)floorf(src->f[1] + 0.5f);
+   dst->i[2] = (int)floorf(src->f[2] + 0.5f);
+   dst->i[3] = (int)floorf(src->f[3] + 0.5f);
+}
+
+static void
+micro_ceil(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->f[0] = ceilf(src->f[0]);
+   dst->f[1] = ceilf(src->f[1]);
+   dst->f[2] = ceilf(src->f[2]);
+   dst->f[3] = ceilf(src->f[3]);
+}
+
+static void
+micro_cos(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = cosf(src->f[0]);
+   dst->f[1] = cosf(src->f[1]);
+   dst->f[2] = cosf(src->f[2]);
+   dst->f[3] = cosf(src->f[3]);
+}
+
+static void
+micro_ddx(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] =
+   dst->f[1] =
+   dst->f[2] =
+   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
+}
+
+static void
+micro_ddy(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] =
+   dst->f[1] =
+   dst->f[2] =
+   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
+}
+
+static void
+micro_exp2(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+#if FAST_MATH
+   dst->f[0] = util_fast_exp2(src->f[0]);
+   dst->f[1] = util_fast_exp2(src->f[1]);
+   dst->f[2] = util_fast_exp2(src->f[2]);
+   dst->f[3] = util_fast_exp2(src->f[3]);
+#else
+#if DEBUG
+   /* Inf is okay for this instruction, so clamp it to silence assertions. */
+   uint i;
+   union tgsi_exec_channel clamped;
+
+   for (i = 0; i < 4; i++) {
+      if (src->f[i] > 127.99999f) {
+         clamped.f[i] = 127.99999f;
+      } else if (src->f[i] < -126.99999f) {
+         clamped.f[i] = -126.99999f;
+      } else {
+         clamped.f[i] = src->f[i];
+      }
+   }
+   src = &clamped;
+#endif /* DEBUG */
+
+   dst->f[0] = powf(2.0f, src->f[0]);
+   dst->f[1] = powf(2.0f, src->f[1]);
+   dst->f[2] = powf(2.0f, src->f[2]);
+   dst->f[3] = powf(2.0f, src->f[3]);
+#endif /* FAST_MATH */
+}
+
+static void
+micro_flr(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = floorf(src->f[0]);
+   dst->f[1] = floorf(src->f[1]);
+   dst->f[2] = floorf(src->f[2]);
+   dst->f[3] = floorf(src->f[3]);
+}
+
+static void
+micro_frc(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = src->f[0] - floorf(src->f[0]);
+   dst->f[1] = src->f[1] - floorf(src->f[1]);
+   dst->f[2] = src->f[2] - floorf(src->f[2]);
+   dst->f[3] = src->f[3] - floorf(src->f[3]);
+}
+
+static void
+micro_iabs(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
+   dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
+   dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
+   dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
+}
+
+static void
+micro_ineg(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->i[0] = -src->i[0];
+   dst->i[1] = -src->i[1];
+   dst->i[2] = -src->i[2];
+   dst->i[3] = -src->i[3];
+}
+
+static void
+micro_lg2(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+#if FAST_MATH
+   dst->f[0] = util_fast_log2(src->f[0]);
+   dst->f[1] = util_fast_log2(src->f[1]);
+   dst->f[2] = util_fast_log2(src->f[2]);
+   dst->f[3] = util_fast_log2(src->f[3]);
+#else
+   dst->f[0] = logf(src->f[0]) * 1.442695f;
+   dst->f[1] = logf(src->f[1]) * 1.442695f;
+   dst->f[2] = logf(src->f[2]) * 1.442695f;
+   dst->f[3] = logf(src->f[3]) * 1.442695f;
+#endif
+}
+
+static void
+micro_lrp(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = src[0].f[0] * (src[1].f[0] - src[2].f[0]) + src[2].f[0];
+   dst->f[1] = src[0].f[1] * (src[1].f[1] - src[2].f[1]) + src[2].f[1];
+   dst->f[2] = src[0].f[2] * (src[1].f[2] - src[2].f[2]) + src[2].f[2];
+   dst->f[3] = src[0].f[3] * (src[1].f[3] - src[2].f[3]) + src[2].f[3];
+}
+
+static void
+micro_mad(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = src[0].f[0] * src[1].f[0] + src[2].f[0];
+   dst->f[1] = src[0].f[1] * src[1].f[1] + src[2].f[1];
+   dst->f[2] = src[0].f[2] * src[1].f[2] + src[2].f[2];
+   dst->f[3] = src[0].f[3] * src[1].f[3] + src[2].f[3];
+}
+
+static void
+micro_mov(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src->u[0];
+   dst->u[1] = src->u[1];
+   dst->u[2] = src->u[2];
+   dst->u[3] = src->u[3];
+}
+
+static void
+micro_rcp(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = 1.0f / src->f[0];
+   dst->f[1] = 1.0f / src->f[1];
+   dst->f[2] = 1.0f / src->f[2];
+   dst->f[3] = 1.0f / src->f[3];
+}
+
+static void
+micro_rnd(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = floorf(src->f[0] + 0.5f);
+   dst->f[1] = floorf(src->f[1] + 0.5f);
+   dst->f[2] = floorf(src->f[2] + 0.5f);
+   dst->f[3] = floorf(src->f[3] + 0.5f);
+}
+
+static void
+micro_rsq(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = 1.0f / sqrtf(fabsf(src->f[0]));
+   dst->f[1] = 1.0f / sqrtf(fabsf(src->f[1]));
+   dst->f[2] = 1.0f / sqrtf(fabsf(src->f[2]));
+   dst->f[3] = 1.0f / sqrtf(fabsf(src->f[3]));
+}
+
+static void
+micro_seq(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = src[0].f[0] == src[1].f[0] ? 1.0f : 0.0f;
+   dst->f[1] = src[0].f[1] == src[1].f[1] ? 1.0f : 0.0f;
+   dst->f[2] = src[0].f[2] == src[1].f[2] ? 1.0f : 0.0f;
+   dst->f[3] = src[0].f[3] == src[1].f[3] ? 1.0f : 0.0f;
+}
+
+static void
+micro_sge(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = src[0].f[0] >= src[1].f[0] ? 1.0f : 0.0f;
+   dst->f[1] = src[0].f[1] >= src[1].f[1] ? 1.0f : 0.0f;
+   dst->f[2] = src[0].f[2] >= src[1].f[2] ? 1.0f : 0.0f;
+   dst->f[3] = src[0].f[3] >= src[1].f[3] ? 1.0f : 0.0f;
+}
+
+static void
+micro_sgn(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
+   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
+   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
+   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
+}
+
+static void
+micro_sgt(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = src[0].f[0] > src[1].f[0] ? 1.0f : 0.0f;
+   dst->f[1] = src[0].f[1] > src[1].f[1] ? 1.0f : 0.0f;
+   dst->f[2] = src[0].f[2] > src[1].f[2] ? 1.0f : 0.0f;
+   dst->f[3] = src[0].f[3] > src[1].f[3] ? 1.0f : 0.0f;
+}
+
+static void
+micro_sin(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = sinf(src->f[0]);
+   dst->f[1] = sinf(src->f[1]);
+   dst->f[2] = sinf(src->f[2]);
+   dst->f[3] = sinf(src->f[3]);
+}
+
+static void
+micro_sle(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = src[0].f[0] <= src[1].f[0] ? 1.0f : 0.0f;
+   dst->f[1] = src[0].f[1] <= src[1].f[1] ? 1.0f : 0.0f;
+   dst->f[2] = src[0].f[2] <= src[1].f[2] ? 1.0f : 0.0f;
+   dst->f[3] = src[0].f[3] <= src[1].f[3] ? 1.0f : 0.0f;
+}
+
+static void
+micro_slt(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = src[0].f[0] < src[1].f[0] ? 1.0f : 0.0f;
+   dst->f[1] = src[0].f[1] < src[1].f[1] ? 1.0f : 0.0f;
+   dst->f[2] = src[0].f[2] < src[1].f[2] ? 1.0f : 0.0f;
+   dst->f[3] = src[0].f[3] < src[1].f[3] ? 1.0f : 0.0f;
+}
+
+static void
+micro_sne(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = src[0].f[0] != src[1].f[0] ? 1.0f : 0.0f;
+   dst->f[1] = src[0].f[1] != src[1].f[1] ? 1.0f : 0.0f;
+   dst->f[2] = src[0].f[2] != src[1].f[2] ? 1.0f : 0.0f;
+   dst->f[3] = src[0].f[3] != src[1].f[3] ? 1.0f : 0.0f;
+}
+
+static void
+micro_trunc(union tgsi_exec_channel *dst,
+            const union tgsi_exec_channel *src)
+{
+   dst->f[0] = (float)(int)src->f[0];
+   dst->f[1] = (float)(int)src->f[1];
+   dst->f[2] = (float)(int)src->f[2];
+   dst->f[3] = (float)(int)src->f[3];
+}
+
+
 #define CHAN_X  0
 #define CHAN_Y  1
 #define CHAN_Z  2
 #define CHAN_W  3
 
+enum tgsi_exec_datatype {
+   TGSI_EXEC_DATA_FLOAT,
+   TGSI_EXEC_DATA_INT,
+   TGSI_EXEC_DATA_UINT
+};
+
 /*
  * Shorthand locations of various utility registers (_I = Index, _C = Channel)
  */
@@ -123,23 +443,19 @@
 
 /** The execution mask depends on the conditional mask and the loop mask */
 #define UPDATE_EXEC_MASK(MACH) \
-      MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask
+      MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->Switch.mask & MACH->FuncMask
 
 
 static const union tgsi_exec_channel ZeroVec =
    { { 0.0, 0.0, 0.0, 0.0 } };
 
 
-#ifdef DEBUG
-static void
-check_inf_or_nan(const union tgsi_exec_channel *chan)
-{
-   assert(!util_is_inf_or_nan(chan->f[0]));
-   assert(!util_is_inf_or_nan(chan->f[1]));
-   assert(!util_is_inf_or_nan(chan->f[2]));
-   assert(!util_is_inf_or_nan(chan->f[3]));
-}
-#endif
+#define CHECK_INF_OR_NAN(chan) do {\
+      assert(!util_is_inf_or_nan((chan)->f[0]));\
+      assert(!util_is_inf_or_nan((chan)->f[1]));\
+      assert(!util_is_inf_or_nan((chan)->f[2]));\
+      assert(!util_is_inf_or_nan((chan)->f[3]));\
+   } while (0)
 
 
 #ifdef DEBUG
@@ -422,18 +738,6 @@ tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
    align_free(mach);
 }
 
-
-static void
-micro_abs(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = fabsf( src->f[0] );
-   dst->f[1] = fabsf( src->f[1] );
-   dst->f[2] = fabsf( src->f[2] );
-   dst->f[3] = fabsf( src->f[3] );
-}
-
 static void
 micro_add(
    union tgsi_exec_channel *dst,
@@ -446,76 +750,6 @@ micro_add(
    dst->f[3] = src0->f[3] + src1->f[3];
 }
 
-#if 0
-static void
-micro_iadd(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] + src1->i[0];
-   dst->i[1] = src0->i[1] + src1->i[1];
-   dst->i[2] = src0->i[2] + src1->i[2];
-   dst->i[3] = src0->i[3] + src1->i[3];
-}
-#endif
-
-static void
-micro_and(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] & src1->u[0];
-   dst->u[1] = src0->u[1] & src1->u[1];
-   dst->u[2] = src0->u[2] & src1->u[2];
-   dst->u[3] = src0->u[3] & src1->u[3];
-}
-
-static void
-micro_ceil(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = ceilf( src->f[0] );
-   dst->f[1] = ceilf( src->f[1] );
-   dst->f[2] = ceilf( src->f[2] );
-   dst->f[3] = ceilf( src->f[3] );
-}
-
-static void
-micro_cos(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = cosf( src->f[0] );
-   dst->f[1] = cosf( src->f[1] );
-   dst->f[2] = cosf( src->f[2] );
-   dst->f[3] = cosf( src->f[3] );
-}
-
-static void
-micro_ddx(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] =
-   dst->f[1] =
-   dst->f[2] =
-   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
-}
-
-static void
-micro_ddy(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] =
-   dst->f[1] =
-   dst->f[2] =
-   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
-}
-
 static void
 micro_div(
    union tgsi_exec_channel *dst,
@@ -536,99 +770,6 @@ micro_div(
    }
 }
 
-#if 0
-static void
-micro_udiv(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] / src1->u[0];
-   dst->u[1] = src0->u[1] / src1->u[1];
-   dst->u[2] = src0->u[2] / src1->u[2];
-   dst->u[3] = src0->u[3] / src1->u[3];
-}
-#endif
-
-static void
-micro_eq(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1,
-   const union tgsi_exec_channel *src2,
-   const union tgsi_exec_channel *src3 )
-{
-   dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0];
-   dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1];
-   dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2];
-   dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
-}
-
-#if 0
-static void
-micro_ieq(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1,
-   const union tgsi_exec_channel *src2,
-   const union tgsi_exec_channel *src3 )
-{
-   dst->i[0] = src0->i[0] == src1->i[0] ? src2->i[0] : src3->i[0];
-   dst->i[1] = src0->i[1] == src1->i[1] ? src2->i[1] : src3->i[1];
-   dst->i[2] = src0->i[2] == src1->i[2] ? src2->i[2] : src3->i[2];
-   dst->i[3] = src0->i[3] == src1->i[3] ? src2->i[3] : src3->i[3];
-}
-#endif
-
-static void
-micro_exp2(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src)
-{
-#if FAST_MATH
-   dst->f[0] = util_fast_exp2( src->f[0] );
-   dst->f[1] = util_fast_exp2( src->f[1] );
-   dst->f[2] = util_fast_exp2( src->f[2] );
-   dst->f[3] = util_fast_exp2( src->f[3] );
-#else
-
-#if DEBUG
-   /* Inf is okay for this instruction, so clamp it to silence assertions. */
-   uint i;
-   union tgsi_exec_channel clamped;
-
-   for (i = 0; i < 4; i++) {
-      if (src->f[i] > 127.99999f) {
-         clamped.f[i] = 127.99999f;
-      } else if (src->f[i] < -126.99999f) {
-         clamped.f[i] = -126.99999f;
-      } else {
-         clamped.f[i] = src->f[i];
-      }
-   }
-   src = &clamped;
-#endif
-
-   dst->f[0] = powf( 2.0f, src->f[0] );
-   dst->f[1] = powf( 2.0f, src->f[1] );
-   dst->f[2] = powf( 2.0f, src->f[2] );
-   dst->f[3] = powf( 2.0f, src->f[3] );
-#endif
-}
-
-#if 0
-static void
-micro_f2ut(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->u[0] = (uint) src->f[0];
-   dst->u[1] = (uint) src->f[1];
-   dst->u[2] = (uint) src->f[2];
-   dst->u[3] = (uint) src->f[3];
-}
-#endif
-
 static void
 micro_float_clamp(union tgsi_exec_channel *dst,
                   const union tgsi_exec_channel *src)
@@ -656,71 +797,6 @@ micro_float_clamp(union tgsi_exec_channel *dst,
 }
 
 static void
-micro_flr(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = floorf( src->f[0] );
-   dst->f[1] = floorf( src->f[1] );
-   dst->f[2] = floorf( src->f[2] );
-   dst->f[3] = floorf( src->f[3] );
-}
-
-static void
-micro_frc(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = src->f[0] - floorf( src->f[0] );
-   dst->f[1] = src->f[1] - floorf( src->f[1] );
-   dst->f[2] = src->f[2] - floorf( src->f[2] );
-   dst->f[3] = src->f[3] - floorf( src->f[3] );
-}
-
-static void
-micro_i2f(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = (float) src->i[0];
-   dst->f[1] = (float) src->i[1];
-   dst->f[2] = (float) src->i[2];
-   dst->f[3] = (float) src->i[3];
-}
-
-static void
-micro_lg2(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-#if FAST_MATH
-   dst->f[0] = util_fast_log2( src->f[0] );
-   dst->f[1] = util_fast_log2( src->f[1] );
-   dst->f[2] = util_fast_log2( src->f[2] );
-   dst->f[3] = util_fast_log2( src->f[3] );
-#else
-   dst->f[0] = logf( src->f[0] ) * 1.442695f;
-   dst->f[1] = logf( src->f[1] ) * 1.442695f;
-   dst->f[2] = logf( src->f[2] ) * 1.442695f;
-   dst->f[3] = logf( src->f[3] ) * 1.442695f;
-#endif
-}
-
-static void
-micro_le(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1,
-   const union tgsi_exec_channel *src2,
-   const union tgsi_exec_channel *src3 )
-{
-   dst->f[0] = src0->f[0] <= src1->f[0] ? src2->f[0] : src3->f[0];
-   dst->f[1] = src0->f[1] <= src1->f[1] ? src2->f[1] : src3->f[1];
-   dst->f[2] = src0->f[2] <= src1->f[2] ? src2->f[2] : src3->f[2];
-   dst->f[3] = src0->f[3] <= src1->f[3] ? src2->f[3] : src3->f[3];
-}
-
-static void
 micro_lt(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src0,
@@ -734,38 +810,6 @@ micro_lt(
    dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
 }
 
-#if 0
-static void
-micro_ilt(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1,
-   const union tgsi_exec_channel *src2,
-   const union tgsi_exec_channel *src3 )
-{
-   dst->i[0] = src0->i[0] < src1->i[0] ? src2->i[0] : src3->i[0];
-   dst->i[1] = src0->i[1] < src1->i[1] ? src2->i[1] : src3->i[1];
-   dst->i[2] = src0->i[2] < src1->i[2] ? src2->i[2] : src3->i[2];
-   dst->i[3] = src0->i[3] < src1->i[3] ? src2->i[3] : src3->i[3];
-}
-#endif
-
-#if 0
-static void
-micro_ult(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1,
-   const union tgsi_exec_channel *src2,
-   const union tgsi_exec_channel *src3 )
-{
-   dst->u[0] = src0->u[0] < src1->u[0] ? src2->u[0] : src3->u[0];
-   dst->u[1] = src0->u[1] < src1->u[1] ? src2->u[1] : src3->u[1];
-   dst->u[2] = src0->u[2] < src1->u[2] ? src2->u[2] : src3->u[2];
-   dst->u[3] = src0->u[3] < src1->u[3] ? src2->u[3] : src3->u[3];
-}
-#endif
-
 static void
 micro_max(
    union tgsi_exec_channel *dst,
@@ -778,34 +822,6 @@ micro_max(
    dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
 }
 
-#if 0
-static void
-micro_imax(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
-   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
-   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
-   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
-}
-#endif
-
-#if 0
-static void
-micro_umax(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
-   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
-   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
-   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
-}
-#endif
-
 static void
 micro_min(
    union tgsi_exec_channel *dst,
@@ -818,48 +834,6 @@ micro_min(
    dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
 }
 
-#if 0
-static void
-micro_imin(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
-   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
-   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
-   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
-}
-#endif
-
-#if 0
-static void
-micro_umin(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
-   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
-   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
-   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
-}
-#endif
-
-#if 0
-static void
-micro_umod(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] % src1->u[0];
-   dst->u[1] = src0->u[1] % src1->u[1];
-   dst->u[2] = src0->u[2] % src1->u[2];
-   dst->u[3] = src0->u[3] % src1->u[3];
-}
-#endif
-
 static void
 micro_mul(
    union tgsi_exec_channel *dst,
@@ -874,20 +848,6 @@ micro_mul(
 
 #if 0
 static void
-micro_imul(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] * src1->i[0];
-   dst->i[1] = src0->i[1] * src1->i[1];
-   dst->i[2] = src0->i[2] * src1->i[2];
-   dst->i[3] = src0->i[3] * src1->i[3];
-}
-#endif
-
-#if 0
-static void
 micro_imul64(
    union tgsi_exec_channel *dst0,
    union tgsi_exec_channel *dst1,
@@ -951,42 +911,6 @@ micro_neg(
    dst->f[3] = -src->f[3];
 }
 
-#if 0
-static void
-micro_ineg(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->i[0] = -src->i[0];
-   dst->i[1] = -src->i[1];
-   dst->i[2] = -src->i[2];
-   dst->i[3] = -src->i[3];
-}
-#endif
-
-static void
-micro_not(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->u[0] = ~src->u[0];
-   dst->u[1] = ~src->u[1];
-   dst->u[2] = ~src->u[2];
-   dst->u[3] = ~src->u[3];
-}
-
-static void
-micro_or(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] | src1->u[0];
-   dst->u[1] = src0->u[1] | src1->u[1];
-   dst->u[2] = src0->u[2] | src1->u[2];
-   dst->u[3] = src0->u[3] | src1->u[3];
-}
-
 static void
 micro_pow(
    union tgsi_exec_channel *dst,
@@ -1007,88 +931,6 @@ micro_pow(
 }
 
 static void
-micro_rnd(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = floorf( src->f[0] + 0.5f );
-   dst->f[1] = floorf( src->f[1] + 0.5f );
-   dst->f[2] = floorf( src->f[2] + 0.5f );
-   dst->f[3] = floorf( src->f[3] + 0.5f );
-}
-
-static void
-micro_sgn(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
-   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
-   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
-   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
-}
-
-static void
-micro_shl(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] << src1->i[0];
-   dst->i[1] = src0->i[1] << src1->i[1];
-   dst->i[2] = src0->i[2] << src1->i[2];
-   dst->i[3] = src0->i[3] << src1->i[3];
-}
-
-static void
-micro_ishr(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] >> src1->i[0];
-   dst->i[1] = src0->i[1] >> src1->i[1];
-   dst->i[2] = src0->i[2] >> src1->i[2];
-   dst->i[3] = src0->i[3] >> src1->i[3];
-}
-
-static void
-micro_trunc(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0 )
-{
-   dst->f[0] = (float) (int) src0->f[0];
-   dst->f[1] = (float) (int) src0->f[1];
-   dst->f[2] = (float) (int) src0->f[2];
-   dst->f[3] = (float) (int) src0->f[3];
-}
-
-#if 0
-static void
-micro_ushr(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] >> src1->u[0];
-   dst->u[1] = src0->u[1] >> src1->u[1];
-   dst->u[2] = src0->u[2] >> src1->u[2];
-   dst->u[3] = src0->u[3] >> src1->u[3];
-}
-#endif
-
-static void
-micro_sin(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = sinf( src->f[0] );
-   dst->f[1] = sinf( src->f[1] );
-   dst->f[2] = sinf( src->f[2] );
-   dst->f[3] = sinf( src->f[3] );
-}
-
-static void
 micro_sqrt( union tgsi_exec_channel *dst,
             const union tgsi_exec_channel *src )
 {
@@ -1110,31 +952,6 @@ micro_sub(
    dst->f[3] = src0->f[3] - src1->f[3];
 }
 
-#if 0
-static void
-micro_u2f(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = (float) src->u[0];
-   dst->f[1] = (float) src->u[1];
-   dst->f[2] = (float) src->u[2];
-   dst->f[3] = (float) src->u[3];
-}
-#endif
-
-static void
-micro_xor(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] ^ src1->u[0];
-   dst->u[1] = src0->u[1] ^ src1->u[1];
-   dst->u[2] = src0->u[2] ^ src1->u[2];
-   dst->u[3] = src0->u[3] ^ src1->u[3];
-}
-
 static void
 fetch_src_file_channel(
    const struct tgsi_exec_machine *mach,
@@ -1233,11 +1050,11 @@ fetch_src_file_channel(
 }
 
 static void
-fetch_source(
-   const struct tgsi_exec_machine *mach,
-   union tgsi_exec_channel *chan,
-   const struct tgsi_full_src_register *reg,
-   const uint chan_index )
+fetch_source(const struct tgsi_exec_machine *mach,
+             union tgsi_exec_channel *chan,
+             const struct tgsi_full_src_register *reg,
+             const uint chan_index,
+             enum tgsi_exec_datatype src_datatype)
 {
    union tgsi_exec_channel index;
    uint swizzle;
@@ -1286,10 +1103,10 @@ fetch_source(
          &indir_index );
 
       /* add value of address register to the offset */
-      index.i[0] += (int) indir_index.f[0];
-      index.i[1] += (int) indir_index.f[1];
-      index.i[2] += (int) indir_index.f[2];
-      index.i[3] += (int) indir_index.f[3];
+      index.i[0] += indir_index.i[0];
+      index.i[1] += indir_index.i[1];
+      index.i[2] += indir_index.i[2];
+      index.i[3] += indir_index.i[3];
 
       /* for disabled execution channels, zero-out the index to
        * avoid using a potential garbage value.
@@ -1366,10 +1183,10 @@ fetch_source(
             &index2,
             &indir_index );
 
-         index.i[0] += (int) indir_index.f[0];
-         index.i[1] += (int) indir_index.f[1];
-         index.i[2] += (int) indir_index.f[2];
-         index.i[3] += (int) indir_index.f[3];
+         index.i[0] += indir_index.i[0];
+         index.i[1] += indir_index.i[1];
+         index.i[2] += indir_index.i[2];
+         index.i[3] += indir_index.i[3];
 
          /* for disabled execution channels, zero-out the index to
           * avoid using a potential garbage value.
@@ -1394,32 +1211,30 @@ fetch_source(
       &index,
       chan );
 
-   switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
-   case TGSI_UTIL_SIGN_CLEAR:
-      micro_abs( chan, chan );
-      break;
-
-   case TGSI_UTIL_SIGN_SET:
-      micro_abs( chan, chan );
-      micro_neg( chan, chan );
-      break;
-
-   case TGSI_UTIL_SIGN_TOGGLE:
-      micro_neg( chan, chan );
-      break;
+   if (reg->Register.Absolute) {
+      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
+         micro_abs(chan, chan);
+      } else {
+         micro_iabs(chan, chan);
+      }
+   }
 
-   case TGSI_UTIL_SIGN_KEEP:
-      break;
+   if (reg->Register.Negate) {
+      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
+         micro_neg(chan, chan);
+      } else {
+         micro_ineg(chan, chan);
+      }
    }
 }
 
 static void
-store_dest(
-   struct tgsi_exec_machine *mach,
-   const union tgsi_exec_channel *chan,
-   const struct tgsi_full_dst_register *reg,
-   const struct tgsi_full_instruction *inst,
-   uint chan_index )
+store_dest(struct tgsi_exec_machine *mach,
+           const union tgsi_exec_channel *chan,
+           const struct tgsi_full_dst_register *reg,
+           const struct tgsi_full_instruction *inst,
+           uint chan_index,
+           enum tgsi_exec_datatype dst_datatype)
 {
    uint i;
    union tgsi_exec_channel null;
@@ -1428,9 +1243,9 @@ store_dest(
    int offset = 0;  /* indirection offset */
    int index;
 
-#ifdef DEBUG
-   check_inf_or_nan(chan);
-#endif
+   if (dst_datatype == TGSI_EXEC_DATA_FLOAT) {
+      CHECK_INF_OR_NAN(chan);
+   }
 
    /* There is an extra source register that indirectly subscripts
     * a register file. The direct index now becomes an offset
@@ -1465,7 +1280,7 @@ store_dest(
          &indir_index );
 
       /* save indirection offset */
-      offset = (int) indir_index.f[0];
+      offset = indir_index.i[0];
    }
 
    switch (reg->Register.File) {
@@ -1595,10 +1410,10 @@ store_dest(
 }
 
 #define FETCH(VAL,INDEX,CHAN)\
-    fetch_source (mach, VAL, &inst->Src[INDEX], CHAN)
+    fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_FLOAT)
 
 #define STORE(VAL,INDEX,CHAN)\
-    store_dest (mach, VAL, &inst->Dst[INDEX], inst, CHAN )
+   store_dest(mach, VAL, &inst->Dst[INDEX], inst, CHAN, TGSI_EXEC_DATA_FLOAT)
 
 
 /**
@@ -1694,7 +1509,8 @@ fetch_texel( struct tgsi_sampler *sampler,
              const union tgsi_exec_channel *s,
              const union tgsi_exec_channel *t,
              const union tgsi_exec_channel *p,
-             float lodbias,  /* XXX should be float[4] */
+             const union tgsi_exec_channel *c0,
+             enum tgsi_sampler_control control,
              union tgsi_exec_channel *r,
              union tgsi_exec_channel *g,
              union tgsi_exec_channel *b,
@@ -1703,7 +1519,7 @@ fetch_texel( struct tgsi_sampler *sampler,
    uint j;
    float rgba[NUM_CHANNELS][QUAD_SIZE];
 
-   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba);
+   sampler->get_samples(sampler, s->f, t->f, p->f, c0->f, control, rgba);
 
    for (j = 0; j < 4; j++) {
       r->f[j] = rgba[0][j];
@@ -1714,102 +1530,95 @@ fetch_texel( struct tgsi_sampler *sampler,
 }
 
 
+#define TEX_MODIFIER_NONE           0
+#define TEX_MODIFIER_PROJECTED      1
+#define TEX_MODIFIER_LOD_BIAS       2
+#define TEX_MODIFIER_EXPLICIT_LOD   3
+
+
 static void
 exec_tex(struct tgsi_exec_machine *mach,
          const struct tgsi_full_instruction *inst,
-         boolean biasLod,
-         boolean projected)
+         uint modifier)
 {
    const uint unit = inst->Src[1].Register.Index;
    union tgsi_exec_channel r[4];
+   const union tgsi_exec_channel *lod = &ZeroVec;
+   enum tgsi_sampler_control control;
    uint chan_index;
-   float lodBias;
 
-   /*   debug_printf("Sampler %u unit %u\n", sampler, unit); */
+   if (modifier != TEX_MODIFIER_NONE) {
+      FETCH(&r[3], 0, CHAN_W);
+      if (modifier != TEX_MODIFIER_PROJECTED) {
+         lod = &r[3];
+      }
+   }
+
+   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
+      control = tgsi_sampler_lod_explicit;
+   } else {
+      control = tgsi_sampler_lod_bias;
+   }
 
    switch (inst->Texture.Texture) {
    case TGSI_TEXTURE_1D:
    case TGSI_TEXTURE_SHADOW1D:
-
       FETCH(&r[0], 0, CHAN_X);
 
-      if (projected) {
-         FETCH(&r[1], 0, CHAN_W);
-         micro_div( &r[0], &r[0], &r[1] );
-      }
-
-      if (biasLod) {
-         FETCH(&r[1], 0, CHAN_W);
-         lodBias = r[2].f[0];
+      if (modifier == TEX_MODIFIER_PROJECTED) {
+         micro_div(&r[0], &r[0], &r[3]);
       }
-      else
-         lodBias = 0.0;
 
       fetch_texel(mach->Samplers[unit],
-                  &r[0], &ZeroVec, &ZeroVec, lodBias,  /* S, T, P, BIAS */
-                  &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
+                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
+                  control,
+                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
       break;
 
    case TGSI_TEXTURE_2D:
    case TGSI_TEXTURE_RECT:
    case TGSI_TEXTURE_SHADOW2D:
    case TGSI_TEXTURE_SHADOWRECT:
-
       FETCH(&r[0], 0, CHAN_X);
       FETCH(&r[1], 0, CHAN_Y);
       FETCH(&r[2], 0, CHAN_Z);
 
-      if (projected) {
-         FETCH(&r[3], 0, CHAN_W);
-         micro_div( &r[0], &r[0], &r[3] );
-         micro_div( &r[1], &r[1], &r[3] );
-         micro_div( &r[2], &r[2], &r[3] );
-      }
-
-      if (biasLod) {
-         FETCH(&r[3], 0, CHAN_W);
-         lodBias = r[3].f[0];
+      if (modifier == TEX_MODIFIER_PROJECTED) {
+         micro_div(&r[0], &r[0], &r[3]);
+         micro_div(&r[1], &r[1], &r[3]);
+         micro_div(&r[2], &r[2], &r[3]);
       }
-      else
-         lodBias = 0.0;
 
       fetch_texel(mach->Samplers[unit],
-                  &r[0], &r[1], &r[2], lodBias,  /* inputs */
+                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
+                  control,
                   &r[0], &r[1], &r[2], &r[3]);  /* outputs */
       break;
 
    case TGSI_TEXTURE_3D:
    case TGSI_TEXTURE_CUBE:
-
       FETCH(&r[0], 0, CHAN_X);
       FETCH(&r[1], 0, CHAN_Y);
       FETCH(&r[2], 0, CHAN_Z);
 
-      if (projected) {
-         FETCH(&r[3], 0, CHAN_W);
-         micro_div( &r[0], &r[0], &r[3] );
-         micro_div( &r[1], &r[1], &r[3] );
-         micro_div( &r[2], &r[2], &r[3] );
-      }
-
-      if (biasLod) {
-         FETCH(&r[3], 0, CHAN_W);
-         lodBias = r[3].f[0];
+      if (modifier == TEX_MODIFIER_PROJECTED) {
+         micro_div(&r[0], &r[0], &r[3]);
+         micro_div(&r[1], &r[1], &r[3]);
+         micro_div(&r[2], &r[2], &r[3]);
       }
-      else
-         lodBias = 0.0;
 
       fetch_texel(mach->Samplers[unit],
-                  &r[0], &r[1], &r[2], lodBias,
+                  &r[0], &r[1], &r[2], lod,
+                  control,
                   &r[0], &r[1], &r[2], &r[3]);
       break;
 
    default:
-      assert (0);
+      assert(0);
    }
 
-   FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-      STORE( &r[chan_index], 0, chan_index );
+   FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+      STORE(&r[chan_index], 0, chan_index);
    }
 }
 
@@ -1832,8 +1641,9 @@ exec_txd(struct tgsi_exec_machine *mach,
       FETCH(&r[0], 0, CHAN_X);
 
       fetch_texel(mach->Samplers[unit],
-                  &r[0], &ZeroVec, &ZeroVec, 0.0f,  /* S, T, P, BIAS */
-                  &r[0], &r[1], &r[2], &r[3]);      /* R, G, B, A */
+                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
+                  tgsi_sampler_lod_bias,
+                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
       break;
 
    case TGSI_TEXTURE_2D:
@@ -1846,8 +1656,9 @@ exec_txd(struct tgsi_exec_machine *mach,
       FETCH(&r[2], 0, CHAN_Z);
 
       fetch_texel(mach->Samplers[unit],
-                  &r[0], &r[1], &r[2], 0.0f,    /* inputs */
-                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
+                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
+                  tgsi_sampler_lod_bias,
+                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
       break;
 
    case TGSI_TEXTURE_3D:
@@ -1858,7 +1669,8 @@ exec_txd(struct tgsi_exec_machine *mach,
       FETCH(&r[2], 0, CHAN_Z);
 
       fetch_texel(mach->Samplers[unit],
-                  &r[0], &r[1], &r[2], 0.0f,
+                  &r[0], &r[1], &r[2], &ZeroVec,
+                  tgsi_sampler_lod_bias,
                   &r[0], &r[1], &r[2], &r[3]);
       break;
 
@@ -1955,7 +1767,7 @@ exec_declaration(struct tgsi_exec_machine *mach,
          if (decl->Semantic.Name == TGSI_SEMANTIC_POSITION) {
             assert(decl->Semantic.Index == 0);
             assert(first == last);
-            assert(mask = TGSI_WRITEMASK_XYZW);
+            assert(mask == TGSI_WRITEMASK_XYZW);
 
             mach->Inputs[first] = mach->QuadPos;
          } else if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
@@ -2001,6 +1813,585 @@ exec_declaration(struct tgsi_exec_machine *mach,
    }
 }
 
+typedef void (* micro_op)(union tgsi_exec_channel *dst,
+                          const union tgsi_exec_channel *src);
+
+static void
+exec_scalar_unary(struct tgsi_exec_machine *mach,
+                  const struct tgsi_full_instruction *inst,
+                  micro_op op,
+                  enum tgsi_exec_datatype dst_datatype,
+                  enum tgsi_exec_datatype src_datatype)
+{
+   unsigned int chan;
+   union tgsi_exec_channel src;
+   union tgsi_exec_channel dst;
+
+   fetch_source(mach, &src, &inst->Src[0], CHAN_X, src_datatype);
+   op(&dst, &src);
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
+      }
+   }
+}
+
+static void
+exec_vector_unary(struct tgsi_exec_machine *mach,
+                  const struct tgsi_full_instruction *inst,
+                  micro_op op,
+                  enum tgsi_exec_datatype dst_datatype,
+                  enum tgsi_exec_datatype src_datatype)
+{
+   unsigned int chan;
+   struct tgsi_exec_vector dst;
+
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         union tgsi_exec_channel src;
+
+         fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
+         op(&dst.xyzw[chan], &src);
+      }
+   }
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
+      }
+   }
+}
+
+static void
+exec_vector_binary(struct tgsi_exec_machine *mach,
+                   const struct tgsi_full_instruction *inst,
+                   micro_op op,
+                   enum tgsi_exec_datatype dst_datatype,
+                   enum tgsi_exec_datatype src_datatype)
+{
+   unsigned int chan;
+   struct tgsi_exec_vector dst;
+
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         union tgsi_exec_channel src[2];
+
+         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
+         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
+         op(&dst.xyzw[chan], src);
+      }
+   }
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
+      }
+   }
+}
+
+static void
+exec_vector_trinary(struct tgsi_exec_machine *mach,
+                    const struct tgsi_full_instruction *inst,
+                    micro_op op,
+                    enum tgsi_exec_datatype dst_datatype,
+                    enum tgsi_exec_datatype src_datatype)
+{
+   unsigned int chan;
+   struct tgsi_exec_vector dst;
+
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         union tgsi_exec_channel src[3];
+
+         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
+         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
+         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
+         op(&dst.xyzw[chan], src);
+      }
+   }
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
+      }
+   }
+}
+
+static void
+exec_dp3(struct tgsi_exec_machine *mach,
+         const struct tgsi_full_instruction *inst)
+{
+   unsigned int chan;
+   union tgsi_exec_channel arg[3];
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   micro_mul(&arg[2], &arg[0], &arg[1]);
+
+   for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
+      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
+      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
+      micro_mad(&arg[2], arg);
+   }
+
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+      }
+   }
+}
+
+static void
+exec_dp4(struct tgsi_exec_machine *mach,
+         const struct tgsi_full_instruction *inst)
+{
+   unsigned int chan;
+   union tgsi_exec_channel arg[3];
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   micro_mul(&arg[2], &arg[0], &arg[1]);
+
+   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
+      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
+      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
+      micro_mad(&arg[2], arg);
+   }
+
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+      }
+   }
+}
+
+static void
+exec_dp2a(struct tgsi_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   unsigned int chan;
+   union tgsi_exec_channel arg[3];
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   micro_mul(&arg[2], &arg[0], &arg[1]);
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
+   micro_mad(&arg[0], arg);
+
+   fetch_source(mach, &arg[1], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   micro_add(&arg[0], &arg[0], &arg[1]);
+
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+      }
+   }
+}
+
+static void
+exec_dph(struct tgsi_exec_machine *mach,
+         const struct tgsi_full_instruction *inst)
+{
+   unsigned int chan;
+   union tgsi_exec_channel arg[3];
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   micro_mul(&arg[2], &arg[0], &arg[1]);
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
+   micro_mad(&arg[2], arg);
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
+   micro_mad(&arg[0], arg);
+
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
+   micro_add(&arg[0], &arg[0], &arg[1]);
+
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+      }
+   }
+}
+
+static void
+exec_dp2(struct tgsi_exec_machine *mach,
+         const struct tgsi_full_instruction *inst)
+{
+   unsigned int chan;
+   union tgsi_exec_channel arg[3];
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   micro_mul(&arg[2], &arg[0], &arg[1]);
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
+   micro_mad(&arg[2], arg);
+
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+      }
+   }
+}
+
+static void
+exec_break(struct tgsi_exec_machine *mach)
+{
+   if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
+      /* turn off loop channels for each enabled exec channel */
+      mach->LoopMask &= ~mach->ExecMask;
+      /* Todo: if mach->LoopMask == 0, jump to end of loop */
+      UPDATE_EXEC_MASK(mach);
+   } else {
+      assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
+
+      mach->Switch.mask = 0x0;
+
+      UPDATE_EXEC_MASK(mach);
+   }
+}
+
+static void
+exec_switch(struct tgsi_exec_machine *mach,
+            const struct tgsi_full_instruction *inst)
+{
+   assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
+   assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
+
+   mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
+   fetch_source(mach, &mach->Switch.selector, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
+   mach->Switch.mask = 0x0;
+   mach->Switch.defaultMask = 0x0;
+
+   mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
+   mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
+
+   UPDATE_EXEC_MASK(mach);
+}
+
+static void
+exec_case(struct tgsi_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
+   union tgsi_exec_channel src;
+   uint mask = 0;
+
+   fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
+
+   if (mach->Switch.selector.u[0] == src.u[0]) {
+      mask |= 0x1;
+   }
+   if (mach->Switch.selector.u[1] == src.u[1]) {
+      mask |= 0x2;
+   }
+   if (mach->Switch.selector.u[2] == src.u[2]) {
+      mask |= 0x4;
+   }
+   if (mach->Switch.selector.u[3] == src.u[3]) {
+      mask |= 0x8;
+   }
+
+   mach->Switch.defaultMask |= mask;
+
+   mach->Switch.mask |= mask & prevMask;
+
+   UPDATE_EXEC_MASK(mach);
+}
+
+static void
+exec_default(struct tgsi_exec_machine *mach)
+{
+   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
+
+   mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
+
+   UPDATE_EXEC_MASK(mach);
+}
+
+static void
+exec_endswitch(struct tgsi_exec_machine *mach)
+{
+   mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
+   mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
+
+   UPDATE_EXEC_MASK(mach);
+}
+
+static void
+micro_i2f(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = (float)src->i[0];
+   dst->f[1] = (float)src->i[1];
+   dst->f[2] = (float)src->i[2];
+   dst->f[3] = (float)src->i[3];
+}
+
+static void
+micro_not(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->u[0] = ~src->u[0];
+   dst->u[1] = ~src->u[1];
+   dst->u[2] = ~src->u[2];
+   dst->u[3] = ~src->u[3];
+}
+
+static void
+micro_shl(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] << src[1].u[0];
+   dst->u[1] = src[0].u[1] << src[1].u[1];
+   dst->u[2] = src[0].u[2] << src[1].u[2];
+   dst->u[3] = src[0].u[3] << src[1].u[3];
+}
+
+static void
+micro_and(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] & src[1].u[0];
+   dst->u[1] = src[0].u[1] & src[1].u[1];
+   dst->u[2] = src[0].u[2] & src[1].u[2];
+   dst->u[3] = src[0].u[3] & src[1].u[3];
+}
+
+static void
+micro_or(union tgsi_exec_channel *dst,
+         const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] | src[1].u[0];
+   dst->u[1] = src[0].u[1] | src[1].u[1];
+   dst->u[2] = src[0].u[2] | src[1].u[2];
+   dst->u[3] = src[0].u[3] | src[1].u[3];
+}
+
+static void
+micro_xor(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] ^ src[1].u[0];
+   dst->u[1] = src[0].u[1] ^ src[1].u[1];
+   dst->u[2] = src[0].u[2] ^ src[1].u[2];
+   dst->u[3] = src[0].u[3] ^ src[1].u[3];
+}
+
+static void
+micro_f2i(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->i[0] = (int)src->f[0];
+   dst->i[1] = (int)src->f[1];
+   dst->i[2] = (int)src->f[2];
+   dst->i[3] = (int)src->f[3];
+}
+
+static void
+micro_idiv(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->i[0] = src[0].i[0] / src[1].i[0];
+   dst->i[1] = src[0].i[1] / src[1].i[1];
+   dst->i[2] = src[0].i[2] / src[1].i[2];
+   dst->i[3] = src[0].i[3] / src[1].i[3];
+}
+
+static void
+micro_imax(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->i[0] = src[0].i[0] > src[1].i[0] ? src[0].i[0] : src[1].i[0];
+   dst->i[1] = src[0].i[1] > src[1].i[1] ? src[0].i[1] : src[1].i[1];
+   dst->i[2] = src[0].i[2] > src[1].i[2] ? src[0].i[2] : src[1].i[2];
+   dst->i[3] = src[0].i[3] > src[1].i[3] ? src[0].i[3] : src[1].i[3];
+}
+
+static void
+micro_imin(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->i[0] = src[0].i[0] < src[1].i[0] ? src[0].i[0] : src[1].i[0];
+   dst->i[1] = src[0].i[1] < src[1].i[1] ? src[0].i[1] : src[1].i[1];
+   dst->i[2] = src[0].i[2] < src[1].i[2] ? src[0].i[2] : src[1].i[2];
+   dst->i[3] = src[0].i[3] < src[1].i[3] ? src[0].i[3] : src[1].i[3];
+}
+
+static void
+micro_isge(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->i[0] = src[0].i[0] >= src[1].i[0] ? -1 : 0;
+   dst->i[1] = src[0].i[1] >= src[1].i[1] ? -1 : 0;
+   dst->i[2] = src[0].i[2] >= src[1].i[2] ? -1 : 0;
+   dst->i[3] = src[0].i[3] >= src[1].i[3] ? -1 : 0;
+}
+
+static void
+micro_ishr(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->i[0] = src[0].i[0] >> src[1].i[0];
+   dst->i[1] = src[0].i[1] >> src[1].i[1];
+   dst->i[2] = src[0].i[2] >> src[1].i[2];
+   dst->i[3] = src[0].i[3] >> src[1].i[3];
+}
+
+static void
+micro_islt(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->i[0] = src[0].i[0] < src[1].i[0] ? -1 : 0;
+   dst->i[1] = src[0].i[1] < src[1].i[1] ? -1 : 0;
+   dst->i[2] = src[0].i[2] < src[1].i[2] ? -1 : 0;
+   dst->i[3] = src[0].i[3] < src[1].i[3] ? -1 : 0;
+}
+
+static void
+micro_f2u(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->u[0] = (uint)src->f[0];
+   dst->u[1] = (uint)src->f[1];
+   dst->u[2] = (uint)src->f[2];
+   dst->u[3] = (uint)src->f[3];
+}
+
+static void
+micro_u2f(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = (float)src->u[0];
+   dst->f[1] = (float)src->u[1];
+   dst->f[2] = (float)src->u[2];
+   dst->f[3] = (float)src->u[3];
+}
+
+static void
+micro_uadd(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] + src[1].u[0];
+   dst->u[1] = src[0].u[1] + src[1].u[1];
+   dst->u[2] = src[0].u[2] + src[1].u[2];
+   dst->u[3] = src[0].u[3] + src[1].u[3];
+}
+
+static void
+micro_udiv(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] / src[1].u[0];
+   dst->u[1] = src[0].u[1] / src[1].u[1];
+   dst->u[2] = src[0].u[2] / src[1].u[2];
+   dst->u[3] = src[0].u[3] / src[1].u[3];
+}
+
+static void
+micro_umad(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] * src[1].u[0] + src[2].u[0];
+   dst->u[1] = src[0].u[1] * src[1].u[1] + src[2].u[1];
+   dst->u[2] = src[0].u[2] * src[1].u[2] + src[2].u[2];
+   dst->u[3] = src[0].u[3] * src[1].u[3] + src[2].u[3];
+}
+
+static void
+micro_umax(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] > src[1].u[0] ? src[0].u[0] : src[1].u[0];
+   dst->u[1] = src[0].u[1] > src[1].u[1] ? src[0].u[1] : src[1].u[1];
+   dst->u[2] = src[0].u[2] > src[1].u[2] ? src[0].u[2] : src[1].u[2];
+   dst->u[3] = src[0].u[3] > src[1].u[3] ? src[0].u[3] : src[1].u[3];
+}
+
+static void
+micro_umin(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] < src[1].u[0] ? src[0].u[0] : src[1].u[0];
+   dst->u[1] = src[0].u[1] < src[1].u[1] ? src[0].u[1] : src[1].u[1];
+   dst->u[2] = src[0].u[2] < src[1].u[2] ? src[0].u[2] : src[1].u[2];
+   dst->u[3] = src[0].u[3] < src[1].u[3] ? src[0].u[3] : src[1].u[3];
+}
+
+static void
+micro_umod(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] % src[1].u[0];
+   dst->u[1] = src[0].u[1] % src[1].u[1];
+   dst->u[2] = src[0].u[2] % src[1].u[2];
+   dst->u[3] = src[0].u[3] % src[1].u[3];
+}
+
+static void
+micro_umul(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] * src[1].u[0];
+   dst->u[1] = src[0].u[1] * src[1].u[1];
+   dst->u[2] = src[0].u[2] * src[1].u[2];
+   dst->u[3] = src[0].u[3] * src[1].u[3];
+}
+
+static void
+micro_useq(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] == src[1].u[0] ? ~0 : 0;
+   dst->u[1] = src[0].u[1] == src[1].u[1] ? ~0 : 0;
+   dst->u[2] = src[0].u[2] == src[1].u[2] ? ~0 : 0;
+   dst->u[3] = src[0].u[3] == src[1].u[3] ? ~0 : 0;
+}
+
+static void
+micro_usge(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] >= src[1].u[0] ? ~0 : 0;
+   dst->u[1] = src[0].u[1] >= src[1].u[1] ? ~0 : 0;
+   dst->u[2] = src[0].u[2] >= src[1].u[2] ? ~0 : 0;
+   dst->u[3] = src[0].u[3] >= src[1].u[3] ? ~0 : 0;
+}
+
+static void
+micro_ushr(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] >> src[1].u[0];
+   dst->u[1] = src[0].u[1] >> src[1].u[1];
+   dst->u[2] = src[0].u[2] >> src[1].u[2];
+   dst->u[3] = src[0].u[3] >> src[1].u[3];
+}
+
+static void
+micro_uslt(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] < src[1].u[0] ? ~0 : 0;
+   dst->u[1] = src[0].u[1] < src[1].u[1] ? ~0 : 0;
+   dst->u[2] = src[0].u[2] < src[1].u[2] ? ~0 : 0;
+   dst->u[3] = src[0].u[3] < src[1].u[3] ? ~0 : 0;
+}
+
+static void
+micro_usne(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] != src[1].u[0] ? ~0 : 0;
+   dst->u[1] = src[0].u[1] != src[1].u[1] ? ~0 : 0;
+   dst->u[2] = src[0].u[2] != src[1].u[2] ? ~0 : 0;
+   dst->u[3] = src[0].u[3] != src[1].u[3] ? ~0 : 0;
+}
+
 static void
 exec_instruction(
    struct tgsi_exec_machine *mach,
@@ -2015,23 +2406,11 @@ exec_instruction(
 
    switch (inst->Instruction.Opcode) {
    case TGSI_OPCODE_ARL:
-   case TGSI_OPCODE_FLR:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_flr(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_MOV:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH(&d[chan_index], 0, chan_index);
-      }
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_LIT:
@@ -2068,23 +2447,11 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_RCP:
-   /* TGSI_OPCODE_RECIP */
-      FETCH( &r[0], 0, CHAN_X );
-      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_RSQ:
-   /* TGSI_OPCODE_RECIPSQRT */
-      FETCH( &r[0], 0, CHAN_X );
-      micro_abs( &r[0], &r[0] );
-      micro_sqrt( &r[0], &r[0] );
-      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_EXP:
@@ -2151,54 +2518,11 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_DP3:
-   /* TGSI_OPCODE_DOT3 */
-      FETCH( &r[0], 0, CHAN_X );
-      FETCH( &r[1], 1, CHAN_X );
-      micro_mul( &r[0], &r[0], &r[1] );
-
-      FETCH( &r[1], 0, CHAN_Y );
-      FETCH( &r[2], 1, CHAN_Y );
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
-
-      FETCH( &r[1], 0, CHAN_Z );
-      FETCH( &r[2], 1, CHAN_Z );
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_dp3(mach, inst);
       break;
 
-    case TGSI_OPCODE_DP4:
-    /* TGSI_OPCODE_DOT4 */
-       FETCH(&r[0], 0, CHAN_X);
-       FETCH(&r[1], 1, CHAN_X);
-
-       micro_mul( &r[0], &r[0], &r[1] );
-
-       FETCH(&r[1], 0, CHAN_Y);
-       FETCH(&r[2], 1, CHAN_Y);
-
-       micro_mul( &r[1], &r[1], &r[2] );
-       micro_add( &r[0], &r[0], &r[1] );
-
-       FETCH(&r[1], 0, CHAN_Z);
-       FETCH(&r[2], 1, CHAN_Z);
-
-       micro_mul( &r[1], &r[1], &r[2] );
-       micro_add( &r[0], &r[0], &r[1] );
-
-       FETCH(&r[1], 0, CHAN_W);
-       FETCH(&r[2], 1, CHAN_W);
-
-       micro_mul( &r[1], &r[1], &r[2] );
-       micro_add( &r[0], &r[0], &r[1] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+   case TGSI_OPCODE_DP4:
+      exec_dp4(mach, inst);
       break;
 
    case TGSI_OPCODE_DST:
@@ -2255,41 +2579,15 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_SLT:
-   /* TGSI_OPCODE_SETLT */
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_lt(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_SGE:
-   /* TGSI_OPCODE_SETGE */
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_le(&d[chan_index], &r[1], &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_MAD:
-   /* TGSI_OPCODE_MADD */
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_mul( &r[0], &r[0], &r[1] );
-         FETCH( &r[1], 2, chan_index );
-         micro_add(&d[chan_index], &r[0], &r[1]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_SUB:
@@ -2304,17 +2602,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_LRP:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH(&r[0], 0, chan_index);
-         FETCH(&r[1], 1, chan_index);
-         FETCH(&r[2], 2, chan_index);
-         micro_sub( &r[1], &r[1], &r[2] );
-         micro_mul( &r[0], &r[0], &r[1] );
-         micro_add(&d[chan_index], &r[0], &r[2]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_CND:
@@ -2330,31 +2618,11 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_DP2A:
-      FETCH( &r[0], 0, CHAN_X );
-      FETCH( &r[1], 1, CHAN_X );
-      micro_mul( &r[0], &r[0], &r[1] );
-
-      FETCH( &r[1], 0, CHAN_Y );
-      FETCH( &r[2], 1, CHAN_Y );
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
-
-      FETCH( &r[2], 2, CHAN_X );
-      micro_add( &r[0], &r[0], &r[2] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_dp2a(mach, inst);
       break;
 
    case TGSI_OPCODE_FRC:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_frc(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_CLAMP:
@@ -2370,33 +2638,20 @@ exec_instruction(
       }
       break;
 
+   case TGSI_OPCODE_FLR:
+      exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
+      break;
+
    case TGSI_OPCODE_ROUND:
-   case TGSI_OPCODE_ARR:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_rnd(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_EX2:
-      FETCH(&r[0], 0, CHAN_X);
-
-      micro_exp2( &r[0], &r[0] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_LG2:
-      FETCH( &r[0], 0, CHAN_X );
-      micro_lg2( &r[0], &r[0] );
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_POW:
@@ -2449,15 +2704,9 @@ exec_instruction(
       }
       break;
 
-    case TGSI_OPCODE_ABS:
-       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-          FETCH(&r[0], 0, chan_index);
-          micro_abs(&d[chan_index], &r[0]);
-       }
-       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
-       break;
+   case TGSI_OPCODE_ABS:
+      exec_vector_unary(mach, inst, micro_abs, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
+      break;
 
    case TGSI_OPCODE_RCC:
       FETCH(&r[0], 0, CHAN_X);
@@ -2469,60 +2718,19 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_DPH:
-      FETCH(&r[0], 0, CHAN_X);
-      FETCH(&r[1], 1, CHAN_X);
-
-      micro_mul( &r[0], &r[0], &r[1] );
-
-      FETCH(&r[1], 0, CHAN_Y);
-      FETCH(&r[2], 1, CHAN_Y);
-
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
-
-      FETCH(&r[1], 0, CHAN_Z);
-      FETCH(&r[2], 1, CHAN_Z);
-
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
-
-      FETCH(&r[1], 1, CHAN_W);
-
-      micro_add( &r[0], &r[0], &r[1] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_dph(mach, inst);
       break;
 
    case TGSI_OPCODE_COS:
-      FETCH(&r[0], 0, CHAN_X);
-
-      micro_cos( &r[0], &r[0] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_DDX:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_ddx(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_DDY:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_ddy(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_KILP:
@@ -2599,14 +2807,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_SEQ:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_eq(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_SFL:
@@ -2616,44 +2817,19 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_SGT:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_le(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_SIN:
-      FETCH( &r[0], 0, CHAN_X );
-      micro_sin( &r[0], &r[0] );
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_SLE:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_le(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_SNE:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_eq(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_STR:
@@ -2666,14 +2842,14 @@ exec_instruction(
       /* simple texture lookup */
       /* src[0] = texcoord */
       /* src[1] = sampler unit */
-      exec_tex(mach, inst, FALSE, FALSE);
+      exec_tex(mach, inst, TEX_MODIFIER_NONE);
       break;
 
    case TGSI_OPCODE_TXB:
       /* Texture lookup with lod bias */
       /* src[0] = texcoord (src[0].w = LOD bias) */
       /* src[1] = sampler unit */
-      exec_tex(mach, inst, TRUE, FALSE);
+      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS);
       break;
 
    case TGSI_OPCODE_TXD:
@@ -2689,14 +2865,14 @@ exec_instruction(
       /* Texture lookup with explit LOD */
       /* src[0] = texcoord (src[0].w = LOD) */
       /* src[1] = sampler unit */
-      exec_tex(mach, inst, TRUE, FALSE);
+      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
       break;
 
    case TGSI_OPCODE_TXP:
       /* Texture lookup with projection */
       /* src[0] = texcoord (src[0].w = projection) */
       /* src[1] = sampler unit */
-      exec_tex(mach, inst, FALSE, TRUE);
+      exec_tex(mach, inst, TEX_MODIFIER_PROJECTED);
       break;
 
    case TGSI_OPCODE_UP2H:
@@ -2758,6 +2934,10 @@ exec_instruction(
       assert (0);
       break;
 
+   case TGSI_OPCODE_ARR:
+      exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
+      break;
+
    case TGSI_OPCODE_BRA:
       assert (0);
       break;
@@ -2777,6 +2957,8 @@ exec_instruction(
          mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
          mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
          mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
+         mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
+         mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
          /* note that PC was already incremented above */
          mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
 
@@ -2784,12 +2966,17 @@ exec_instruction(
 
          /* Second, push the Cond, Loop, Cont, Func stacks */
          assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
-         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
          assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
-         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
          assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
-         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
+         assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
+         assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
          assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
+
+         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
+         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
+         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
+         mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
+         mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
          mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
 
          /* Finally, jump to the subroutine */
@@ -2822,6 +3009,12 @@ exec_instruction(
          mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
          mach->ContMask = mach->ContStack[mach->ContStackTop];
 
+         mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
+         mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
+
+         mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
+         mach->BreakType = mach->BreakStack[mach->BreakStackTop];
+
          assert(mach->FuncStackTop > 0);
          mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
 
@@ -2832,14 +3025,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_SSG:
-   /* TGSI_OPCODE_SGN */
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_sgn(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_CMP:
@@ -2946,18 +3132,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_DP2:
-      FETCH( &r[0], 0, CHAN_X );
-      FETCH( &r[1], 1, CHAN_X );
-      micro_mul( &r[0], &r[0], &r[1] );
-
-      FETCH( &r[1], 0, CHAN_Y );
-      FETCH( &r[2], 1, CHAN_Y );
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_dp2(mach, inst);
       break;
 
    case TGSI_OPCODE_IF:
@@ -3023,87 +3198,31 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_CEIL:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_ceil(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_I2F:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_i2f(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
       break;
 
    case TGSI_OPCODE_NOT:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_not(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
       break;
 
    case TGSI_OPCODE_TRUNC:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_trunc(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_SHL:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_shl(&d[chan_index], &r[0], &r[1]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
-      break;
-
-   case TGSI_OPCODE_SHR:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_ishr(&d[chan_index], &r[0], &r[1]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
       break;
 
    case TGSI_OPCODE_AND:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_and(&d[chan_index], &r[0], &r[1]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
       break;
 
    case TGSI_OPCODE_OR:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_or(&d[chan_index], &r[0], &r[1]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
       break;
 
    case TGSI_OPCODE_MOD:
@@ -3111,14 +3230,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_XOR:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_xor(&d[chan_index], &r[0], &r[1]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
       break;
 
    case TGSI_OPCODE_SAD:
@@ -3167,11 +3279,15 @@ exec_instruction(
    case TGSI_OPCODE_BGNLOOP:
       /* push LoopMask and ContMasks */
       assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
-      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
       assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
-      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
       assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+      assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
+
+      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
+      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
       mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
+      mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
+      mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
       break;
 
    case TGSI_OPCODE_ENDFOR:
@@ -3218,6 +3334,8 @@ exec_instruction(
          --mach->LoopLabelStackTop;
          assert(mach->LoopCounterStackTop > 0);
          --mach->LoopCounterStackTop;
+
+         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
       }
       UPDATE_EXEC_MASK(mach);
       break;
@@ -3241,15 +3359,14 @@ exec_instruction(
          mach->ContMask = mach->ContStack[--mach->ContStackTop];
          assert(mach->LoopLabelStackTop > 0);
          --mach->LoopLabelStackTop;
+
+         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
       }
       UPDATE_EXEC_MASK(mach);
       break;
 
    case TGSI_OPCODE_BRK:
-      /* turn off loop channels for each enabled exec channel */
-      mach->LoopMask &= ~mach->ExecMask;
-      /* Todo: if mach->LoopMask == 0, jump to end of loop */
-      UPDATE_EXEC_MASK(mach);
+      exec_break(mach);
       break;
 
    case TGSI_OPCODE_CONT:
@@ -3280,6 +3397,12 @@ exec_instruction(
       mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
       mach->ContMask = mach->ContStack[mach->ContStackTop];
 
+      mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
+      mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
+
+      mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
+      mach->BreakType = mach->BreakStack[mach->BreakStackTop];
+
       assert(mach->FuncStackTop > 0);
       mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
 
@@ -3310,11 +3433,116 @@ exec_instruction(
       UPDATE_EXEC_MASK(mach);
       break;
 
+   case TGSI_OPCODE_F2I:
+      exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
+      break;
+
+   case TGSI_OPCODE_IDIV:
+      exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
+      break;
+
+   case TGSI_OPCODE_IMAX:
+      exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
+      break;
+
+   case TGSI_OPCODE_IMIN:
+      exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
+      break;
+
+   case TGSI_OPCODE_INEG:
+      exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
+      break;
+
+   case TGSI_OPCODE_ISGE:
+      exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
+      break;
+
+   case TGSI_OPCODE_ISHR:
+      exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
+      break;
+
+   case TGSI_OPCODE_ISLT:
+      exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
+      break;
+
+   case TGSI_OPCODE_F2U:
+      exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
+      break;
+
+   case TGSI_OPCODE_U2F:
+      exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_UADD:
+      exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_UDIV:
+      exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_UMAD:
+      exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_UMAX:
+      exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_UMIN:
+      exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_UMOD:
+      exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_UMUL:
+      exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_USEQ:
+      exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_USGE:
+      exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_USHR:
+      exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_USLT:
+      exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_USNE:
+      exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_SWITCH:
+      exec_switch(mach, inst);
+      break;
+
+   case TGSI_OPCODE_CASE:
+      exec_case(mach, inst);
+      break;
+
+   case TGSI_OPCODE_DEFAULT:
+      exec_default(mach);
+      break;
+
+   case TGSI_OPCODE_ENDSWITCH:
+      exec_endswitch(mach);
+      break;
+
    default:
       assert( 0 );
    }
 }
 
+
 #define DEBUG_EXECUTION 0
 
 
@@ -3334,9 +3562,13 @@ tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
    mach->FuncMask = 0xf;
    mach->ExecMask = 0xf;
 
+   mach->Switch.mask = 0xf;
+
    assert(mach->CondStackTop == 0);
    assert(mach->LoopStackTop == 0);
    assert(mach->ContStackTop == 0);
+   assert(mach->SwitchStackTop == 0);
+   assert(mach->BreakStackTop == 0);
    assert(mach->CallStackTop == 0);
 
    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
@@ -3393,11 +3625,11 @@ tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
                   if (j > 0) {
                      debug_printf("           ");
                   }
-                  debug_printf("(%6f, %6f, %6f, %6f)\n",
-                               temps[i].xyzw[0].f[j],
-                               temps[i].xyzw[1].f[j],
-                               temps[i].xyzw[2].f[j],
-                               temps[i].xyzw[3].f[j]);
+                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
+                               temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
+                               temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
+                               temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
+                               temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
                }
             }
          }
@@ -3411,11 +3643,11 @@ tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
                   if (j > 0) {
                      debug_printf("           ");
                   }
-                  debug_printf("{%6f, %6f, %6f, %6f}\n",
-                               outputs[i].xyzw[0].f[j],
-                               outputs[i].xyzw[1].f[j],
-                               outputs[i].xyzw[2].f[j],
-                               outputs[i].xyzw[3].f[j]);
+                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
+                               outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
+                               outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
+                               outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
+                               outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
                }
             }
          }
@@ -3437,6 +3669,8 @@ tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
    assert(mach->CondStackTop == 0);
    assert(mach->LoopStackTop == 0);
    assert(mach->ContStackTop == 0);
+   assert(mach->SwitchStackTop == 0);
+   assert(mach->BreakStackTop == 0);
    assert(mach->CallStackTop == 0);
 
    return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index afaf5c39c4..59e3b445cc 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -2,6 +2,7 @@
  * 
  * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
+ * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
  * 
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
@@ -35,11 +36,13 @@
 extern "C" {
 #endif
 
+
 #define MAX_LABELS (4 * 1024)  /**< basically, max instructions */
 
 #define NUM_CHANNELS 4  /* R,G,B,A */
 #define QUAD_SIZE    4  /* 4 pixel/quad */
 
+
 /**
   * Registers may be treated as float, signed int or unsigned int.
   */
@@ -69,6 +72,11 @@ struct tgsi_interp_coef
    float dady[NUM_CHANNELS];
 };
 
+enum tgsi_sampler_control {
+   tgsi_sampler_lod_bias,
+   tgsi_sampler_lod_explicit
+};
+
 /**
  * Information for sampling textures, which must be implemented
  * by code outside the TGSI executor.
@@ -80,7 +88,8 @@ struct tgsi_sampler
                        const float s[QUAD_SIZE],
                        const float t[QUAD_SIZE],
                        const float p[QUAD_SIZE],
-                       float lodbias,
+                       const float c0[QUAD_SIZE],
+                       enum tgsi_sampler_control control,
                        float rgba[NUM_CHANNELS][QUAD_SIZE]);
 };
 
@@ -179,6 +188,7 @@ struct tgsi_exec_labels
 
 #define TGSI_EXEC_MAX_COND_NESTING  32
 #define TGSI_EXEC_MAX_LOOP_NESTING  32
+#define TGSI_EXEC_MAX_SWITCH_NESTING 32
 #define TGSI_EXEC_MAX_CALL_NESTING  32
 
 /* The maximum number of input attributes per vertex. For 2D
@@ -206,9 +216,29 @@ struct tgsi_call_record
    uint CondStackTop;
    uint LoopStackTop;
    uint ContStackTop;
+   int SwitchStackTop;
+   int BreakStackTop;
    uint ReturnAddr;
 };
 
+
+/* Switch-case block state. */
+struct tgsi_switch_record {
+   uint mask;                          /**< execution mask */
+   union tgsi_exec_channel selector;   /**< a value case statements are compared to */
+   uint defaultMask;                   /**< non-execute mask for default case */
+};
+
+
+enum tgsi_break_type {
+   TGSI_EXEC_BREAK_INSIDE_LOOP,
+   TGSI_EXEC_BREAK_INSIDE_SWITCH
+};
+
+
+#define TGSI_EXEC_MAX_BREAK_STACK (TGSI_EXEC_MAX_LOOP_NESTING + TGSI_EXEC_MAX_SWITCH_NESTING)
+
+
 /**
  * Run-time virtual machine state for executing TGSI shader.
  */
@@ -251,6 +281,12 @@ struct tgsi_exec_machine
    uint FuncMask;  /**< For function calls */
    uint ExecMask;  /**< = CondMask & LoopMask */
 
+   /* Current switch-case state. */
+   struct tgsi_switch_record Switch;
+
+   /* Current break type. */
+   enum tgsi_break_type BreakType;
+
    /** Condition mask stack (for nested conditionals) */
    uint CondStack[TGSI_EXEC_MAX_COND_NESTING];
    int CondStackTop;
@@ -271,6 +307,13 @@ struct tgsi_exec_machine
    uint ContStack[TGSI_EXEC_MAX_LOOP_NESTING];
    int ContStackTop;
 
+   /** Switch case stack */
+   struct tgsi_switch_record SwitchStack[TGSI_EXEC_MAX_SWITCH_NESTING];
+   int SwitchStackTop;
+
+   enum tgsi_break_type BreakStack[TGSI_EXEC_MAX_BREAK_STACK];
+   int BreakStackTop;
+
    /** Function execution mask stack (for executing subroutine code) */
    uint FuncStack[TGSI_EXEC_MAX_CALL_NESTING];
    int FuncStackTop;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c
index be375cabb8..de0e09cdba 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -119,7 +119,7 @@ static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] =
    { 1, 1, 0, 0, 0, 0, "NOT", TGSI_OPCODE_NOT },
    { 1, 1, 0, 0, 0, 0, "TRUNC", TGSI_OPCODE_TRUNC },
    { 1, 2, 0, 0, 0, 0, "SHL", TGSI_OPCODE_SHL },
-   { 1, 2, 0, 0, 0, 0, "SHR", TGSI_OPCODE_SHR },
+   { 0, 0, 0, 0, 0, 0, "", 88 },      /* removed */
    { 1, 2, 0, 0, 0, 0, "AND", TGSI_OPCODE_AND },
    { 1, 2, 0, 0, 0, 0, "OR", TGSI_OPCODE_OR },
    { 1, 2, 0, 0, 0, 0, "MOD", TGSI_OPCODE_MOD },
@@ -149,7 +149,33 @@ static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] =
    { 0, 1, 0, 0, 0, 0, "BREAKC", TGSI_OPCODE_BREAKC },
    { 0, 1, 0, 0, 0, 0, "KIL", TGSI_OPCODE_KIL },
    { 0, 0, 0, 0, 0, 0, "END", TGSI_OPCODE_END },
-   { 0, 0, 0, 0, 0, 0, "", 118 }      /* removed */
+   { 0, 0, 0, 0, 0, 0, "", 118 },     /* removed */
+   { 1, 1, 0, 0, 0, 0, "F2I", TGSI_OPCODE_F2I },
+   { 1, 2, 0, 0, 0, 0, "IDIV", TGSI_OPCODE_IDIV },
+   { 1, 2, 0, 0, 0, 0, "IMAX", TGSI_OPCODE_IMAX },
+   { 1, 2, 0, 0, 0, 0, "IMIN", TGSI_OPCODE_IMIN },
+   { 1, 1, 0, 0, 0, 0, "INEG", TGSI_OPCODE_INEG },
+   { 1, 2, 0, 0, 0, 0, "ISGE", TGSI_OPCODE_ISGE },
+   { 1, 2, 0, 0, 0, 0, "ISHR", TGSI_OPCODE_ISHR },
+   { 1, 2, 0, 0, 0, 0, "ISLT", TGSI_OPCODE_ISLT },
+   { 1, 1, 0, 0, 0, 0, "F2U", TGSI_OPCODE_F2U },
+   { 1, 1, 0, 0, 0, 0, "U2F", TGSI_OPCODE_U2F },
+   { 1, 2, 0, 0, 0, 0, "UADD", TGSI_OPCODE_UADD },
+   { 1, 2, 0, 0, 0, 0, "UDIV", TGSI_OPCODE_UDIV },
+   { 1, 3, 0, 0, 0, 0, "UMAD", TGSI_OPCODE_UMAD },
+   { 1, 2, 0, 0, 0, 0, "UMAX", TGSI_OPCODE_UMAX },
+   { 1, 2, 0, 0, 0, 0, "UMIN", TGSI_OPCODE_UMIN },
+   { 1, 2, 0, 0, 0, 0, "UMOD", TGSI_OPCODE_UMOD },
+   { 1, 2, 0, 0, 0, 0, "UMUL", TGSI_OPCODE_UMUL },
+   { 1, 2, 0, 0, 0, 0, "USEQ", TGSI_OPCODE_USEQ },
+   { 1, 2, 0, 0, 0, 0, "USGE", TGSI_OPCODE_USGE },
+   { 1, 2, 0, 0, 0, 0, "USHR", TGSI_OPCODE_USHR },
+   { 1, 2, 0, 0, 0, 0, "USLT", TGSI_OPCODE_USLT },
+   { 1, 2, 0, 0, 0, 0, "USNE", TGSI_OPCODE_USNE },
+   { 0, 1, 0, 0, 0, 0, "SWITCH", TGSI_OPCODE_SWITCH },
+   { 0, 1, 0, 0, 0, 0, "CASE", TGSI_OPCODE_CASE },
+   { 0, 0, 0, 0, 0, 0, "DEFAULT", TGSI_OPCODE_DEFAULT },
+   { 0, 0, 0, 0, 0, 0, "ENDSWITCH", TGSI_OPCODE_ENDSWITCH }
 };
 
 const struct tgsi_opcode_info *
diff --git a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
index b34263da48..e4af15c156 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
@@ -124,7 +124,6 @@ OP11(I2F)
 OP11(NOT)
 OP11(TRUNC)
 OP12(SHL)
-OP12(SHR)
 OP12(AND)
 OP12(OR)
 OP12(MOD)
@@ -146,6 +145,28 @@ OP01(IFC)
 OP01(BREAKC)
 OP01(KIL)
 OP00(END)
+OP11(F2I)
+OP12(IDIV)
+OP12(IMAX)
+OP12(IMIN)
+OP11(INEG)
+OP12(ISGE)
+OP12(ISHR)
+OP12(ISLT)
+OP11(F2U)
+OP11(U2F)
+OP12(UADD)
+OP12(UDIV)
+OP13(UMAD)
+OP12(UMAX)
+OP12(UMIN)
+OP12(UMOD)
+OP12(UMUL)
+OP12(USEQ)
+OP12(USGE)
+OP12(USHR)
+OP12(USLT)
+OP12(USNE)
 
 
 #undef OP00
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
index fa65ecb997..8c7062d850 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -119,17 +119,29 @@ tgsi_parse_token(
    case TGSI_TOKEN_TYPE_IMMEDIATE:
    {
       struct tgsi_full_immediate *imm = &ctx->FullToken.FullImmediate;
+      uint imm_count;
 
       memset(imm, 0, sizeof *imm);
       copy_token(&imm->Immediate, &token);
 
+      imm_count = imm->Immediate.NrTokens - 1;
+
       switch (imm->Immediate.DataType) {
       case TGSI_IMM_FLOAT32:
-         {
-            uint imm_count = imm->Immediate.NrTokens - 1;
-            for (i = 0; i < imm_count; i++) {
-               next_token(ctx, &imm->u[i]);
-            }
+         for (i = 0; i < imm_count; i++) {
+            next_token(ctx, &imm->u[i].Float);
+         }
+         break;
+
+      case TGSI_IMM_UINT32:
+         for (i = 0; i < imm_count; i++) {
+            next_token(ctx, &imm->u[i].Uint);
+         }
+         break;
+
+      case TGSI_IMM_INT32:
+         for (i = 0; i < imm_count; i++) {
+            next_token(ctx, &imm->u[i].Int);
          }
          break;
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.c b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
index 16b8ec6051..7f1c8e5dd6 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sanity.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
@@ -195,7 +195,7 @@ is_any_register_declared(
    struct cso_hash_iter iter =
       cso_hash_first_node(ctx->regs_decl);
 
-   while (cso_hash_iter_is_null(iter)) {
+   while (!cso_hash_iter_is_null(iter)) {
       scan_register *reg = (scan_register *)cso_hash_iter_data(iter);
       if (reg->file == file)
          return TRUE;
@@ -247,7 +247,7 @@ check_register_usage(
    boolean indirect_access )
 {
    if (!check_file_name( ctx, reg->file )) {
-      free(reg);
+      FREE(reg);
       return FALSE;
    }
 
@@ -261,21 +261,23 @@ check_register_usage(
       if (!is_ind_register_used(ctx, reg))
          cso_hash_insert(ctx->regs_ind_used, reg->file, reg);
       else
-         free(reg);
+         FREE(reg);
    }
    else {
       if (!is_register_declared( ctx, reg )) {
-         if (reg->dimensions == 2)
+         if (reg->dimensions == 2) {
             report_error( ctx, "%s[%d][%d]: Undeclared %s register", file_names[reg->file],
                           reg->indices[0], reg->indices[1], name );
-         else
+         }
+         else {
             report_error( ctx, "%s[%d]: Undeclared %s register", file_names[reg->file],
                           reg->indices[0], name );
          }
+      }
       if (!is_register_used( ctx, reg ))
          cso_hash_insert(ctx->regs_used, scan_register_key(reg), reg);
       else
-         free(reg);
+         FREE(reg);
    }
    return TRUE;
 }
@@ -333,15 +335,15 @@ iter_instruction(
          fill_scan_register1d(ind_reg,
                               inst->Src[i].Indirect.File,
                               inst->Src[i].Indirect.Index);
+         if (!(reg->file == TGSI_FILE_ADDRESS || reg->file == TGSI_FILE_LOOP) ||
+             reg->indices[0] != 0) {
+            report_warning(ctx, "Indirect register neither ADDR[0] nor LOOP[0]");
+         }
          check_register_usage(
             ctx,
             reg,
             "indirect",
             FALSE );
-         if (!(reg->file == TGSI_FILE_ADDRESS || reg->file == TGSI_FILE_LOOP) ||
-             reg->indices[0] != 0) {
-            report_warning(ctx, "Indirect register neither ADDR[0] nor LOOP[0]");
-         }
       }
    }
 
@@ -445,7 +447,9 @@ iter_immediate(
 
    /* Check data type validity.
     */
-   if (imm->Immediate.DataType != TGSI_IMM_FLOAT32) {
+   if (imm->Immediate.DataType != TGSI_IMM_FLOAT32 &&
+       imm->Immediate.DataType != TGSI_IMM_UINT32 &&
+       imm->Immediate.DataType != TGSI_IMM_INT32) {
       report_error( ctx, "(%u): Invalid immediate data type", imm->Immediate.DataType );
       return TRUE;
    }
@@ -486,7 +490,7 @@ epilog(
       struct cso_hash_iter iter =
          cso_hash_first_node(ctx->regs_decl);
 
-      while (cso_hash_iter_is_null(iter)) {
+      while (!cso_hash_iter_is_null(iter)) {
          scan_register *reg = (scan_register *)cso_hash_iter_data(iter);
          if (!is_register_used(ctx, reg) && !is_ind_register_used(ctx, reg)) {
             report_warning( ctx, "%s[%u]: Register never used",
@@ -511,7 +515,8 @@ regs_hash_destroy(struct cso_hash *hash)
    while (!cso_hash_iter_is_null(iter)) {
       scan_register *reg = (scan_register *)cso_hash_iter_data(iter);
       iter = cso_hash_erase(hash, iter);
-      free(reg);
+      assert(reg->file < TGSI_FILE_COUNT);
+      FREE(reg);
    }
    cso_hash_delete(hash);
 }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index d63c75dafb..a85cc4659e 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -2,6 +2,7 @@
  * 
  * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
+ * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
  * 
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
@@ -1418,13 +1419,13 @@ fetch_texel( struct tgsi_sampler **sampler,
                 sampler, *sampler,
                 store );
 
-   debug_printf("lodbias %f\n", store[12]);
-
    for (j = 0; j < 4; j++)
-      debug_printf("sample %d texcoord %f %f\n", 
+      debug_printf("sample %d texcoord %f %f %f lodbias %f\n",
                    j, 
                    store[0+j],
-                   store[4+j]);
+                   store[4+j],
+                   store[8 + j],
+                   store[12 + j]);
 #endif
 
    {
@@ -1433,7 +1434,8 @@ fetch_texel( struct tgsi_sampler **sampler,
                               &store[0],  /* s */
                               &store[4],  /* t */
                               &store[8],  /* r */
-                              store[12],  /* lodbias */
+                              &store[12], /* lodbias */
+                              tgsi_sampler_lod_bias,
                               rgba);      /* results */
 
       memcpy( store, rgba, 16 * sizeof(float));
@@ -2144,40 +2146,50 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_XPD:
+      /* Note: we do all stores after all operands have been fetched
+       * to avoid src/dst register aliasing issues for an instruction
+       * such as:  XPD TEMP[2].xyz, TEMP[0], TEMP[2];
+       */
       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
-         FETCH( func, *inst, 1, 1, CHAN_Z );
-         FETCH( func, *inst, 3, 0, CHAN_Z );
+         FETCH( func, *inst, 1, 1, CHAN_Z ); /* xmm[1] = src[1].z */
+         FETCH( func, *inst, 3, 0, CHAN_Z ); /* xmm[3] = src[0].z */
       }
       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-         FETCH( func, *inst, 0, 0, CHAN_Y );
-         FETCH( func, *inst, 4, 1, CHAN_Y );
+         FETCH( func, *inst, 0, 0, CHAN_Y ); /* xmm[0] = src[0].y */
+         FETCH( func, *inst, 4, 1, CHAN_Y ); /* xmm[4] = src[1].y */
       }
       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
-         emit_MOV( func, 2, 0 );
-         emit_mul( func, 2, 1 );
-         emit_MOV( func, 5, 3 );
-         emit_mul( func, 5, 4 );
-         emit_sub( func, 2, 5 );
-         STORE( func, *inst, 2, 0, CHAN_X );
+         emit_MOV( func, 7, 0 );  /* xmm[7] = xmm[0] */
+         emit_mul( func, 7, 1 );  /* xmm[7] = xmm[2] * xmm[1] */
+         emit_MOV( func, 5, 3 );  /* xmm[5] = xmm[3] */
+         emit_mul( func, 5, 4 );  /* xmm[5] = xmm[5] * xmm[4] */
+         emit_sub( func, 7, 5 );  /* xmm[7] = xmm[2] - xmm[5] */
+         /* store xmm[7] in dst.x below */
       }
       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-         FETCH( func, *inst, 2, 1, CHAN_X );
-         FETCH( func, *inst, 5, 0, CHAN_X );
+         FETCH( func, *inst, 2, 1, CHAN_X ); /* xmm[2] = src[1].x */
+         FETCH( func, *inst, 5, 0, CHAN_X ); /* xmm[5] = src[0].x */
       }
       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
-         emit_mul( func, 3, 2 );
-         emit_mul( func, 1, 5 );
-         emit_sub( func, 3, 1 );
-         STORE( func, *inst, 3, 0, CHAN_Y );
+         emit_mul( func, 3, 2 );  /* xmm[3] = xmm[3] * xmm[2] */
+         emit_mul( func, 1, 5 );  /* xmm[1] = xmm[1] * xmm[5] */
+         emit_sub( func, 3, 1 );  /* xmm[3] = xmm[3] - xmm[1] */
+         /* store xmm[3] in dst.y below */
       }
       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
-         emit_mul( func, 5, 4 );
-         emit_mul( func, 0, 2 );
-         emit_sub( func, 5, 0 );
-         STORE( func, *inst, 5, 0, CHAN_Z );
+         emit_mul( func, 5, 4 );  /* xmm[5] = xmm[5] * xmm[4] */
+         emit_mul( func, 0, 2 );  /* xmm[0] = xmm[0] * xmm[2] */
+         emit_sub( func, 5, 0 );  /* xmm[5] = xmm[5] - xmm[0] */
+         STORE( func, *inst, 5, 0, CHAN_Z ); /* dst.z = xmm[5] */
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
+         STORE( func, *inst, 7, 0, CHAN_X ); /* dst.x = xmm[7] */
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
+         STORE( func, *inst, 3, 0, CHAN_Y ); /* dst.y = xmm[3] */
       }
       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
 	 emit_tempf(
@@ -2506,7 +2518,7 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_TXL:
-      emit_tex( func, inst, TRUE, FALSE );
+      return 0;
       break;
 
    case TGSI_OPCODE_TXP:
@@ -2578,7 +2590,7 @@ emit_instruction(
       return 0;
       break;
 
-   case TGSI_OPCODE_SHR:
+   case TGSI_OPCODE_ISHR:
       return 0;
       break;
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index 6a0af664dd..e64e2b731d 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -101,8 +101,13 @@ struct ureg_program
    unsigned nr_outputs;
 
    struct {
-      float v[4];
+      union {
+         float f[4];
+         unsigned u[4];
+         int i[4];
+      } value;
       unsigned nr;
+      unsigned type;
    } immediate[UREG_MAX_IMMEDIATE];
    unsigned nr_immediates;
 
@@ -486,22 +491,22 @@ struct ureg_src ureg_DECL_sampler( struct ureg_program *ureg,
 }
 
 
-
-
-static int match_or_expand_immediate( const float *v,
-                                      unsigned nr,
-                                      float *v2,
-                                      unsigned *nr2,
-                                      unsigned *swizzle )
+static int
+match_or_expand_immediate( const unsigned *v,
+                           unsigned nr,
+                           unsigned *v2,
+                           unsigned *pnr2,
+                           unsigned *swizzle )
 {
+   unsigned nr2 = *pnr2;
    unsigned i, j;
-   
+
    *swizzle = 0;
 
    for (i = 0; i < nr; i++) {
       boolean found = FALSE;
 
-      for (j = 0; j < *nr2 && !found; j++) {
+      for (j = 0; j < nr2 && !found; j++) {
          if (v[i] == v2[j]) {
             *swizzle |= j << (i * 2);
             found = TRUE;
@@ -509,24 +514,28 @@ static int match_or_expand_immediate( const float *v,
       }
 
       if (!found) {
-         if (*nr2 >= 4) 
+         if (nr2 >= 4) {
             return FALSE;
+         }
 
-         v2[*nr2] = v[i];
-         *swizzle |= *nr2 << (i * 2);
-         (*nr2)++;
+         v2[nr2] = v[i];
+         *swizzle |= nr2 << (i * 2);
+         nr2++;
       }
    }
 
+   /* Actually expand immediate only when fully succeeded.
+    */
+   *pnr2 = nr2;
    return TRUE;
 }
 
 
-
-
-struct ureg_src ureg_DECL_immediate( struct ureg_program *ureg, 
-                                     const float *v,
-                                     unsigned nr )
+static struct ureg_src
+decl_immediate( struct ureg_program *ureg,
+                const unsigned *v,
+                unsigned nr,
+                unsigned type )
 {
    unsigned i, j;
    unsigned swizzle;
@@ -536,38 +545,82 @@ struct ureg_src ureg_DECL_immediate( struct ureg_program *ureg,
     */
 
    for (i = 0; i < ureg->nr_immediates; i++) {
-      if (match_or_expand_immediate( v, 
-                                     nr,
-                                     ureg->immediate[i].v,
-                                     &ureg->immediate[i].nr, 
-                                     &swizzle ))
+      if (ureg->immediate[i].type != type) {
+         continue;
+      }
+      if (match_or_expand_immediate(v,
+                                    nr,
+                                    ureg->immediate[i].value.u,
+                                    &ureg->immediate[i].nr,
+                                    &swizzle)) {
          goto out;
+      }
    }
 
    if (ureg->nr_immediates < UREG_MAX_IMMEDIATE) {
       i = ureg->nr_immediates++;
-      if (match_or_expand_immediate( v,
-                                     nr,
-                                     ureg->immediate[i].v,
-                                     &ureg->immediate[i].nr, 
-                                     &swizzle ))
+      ureg->immediate[i].type = type;
+      if (match_or_expand_immediate(v,
+                                    nr,
+                                    ureg->immediate[i].value.u,
+                                    &ureg->immediate[i].nr,
+                                    &swizzle)) {
          goto out;
+      }
    }
 
-   set_bad( ureg );
+   set_bad(ureg);
 
 out:
    /* Make sure that all referenced elements are from this immediate.
     * Has the effect of making size-one immediates into scalars.
     */
-   for (j = nr; j < 4; j++)
+   for (j = nr; j < 4; j++) {
       swizzle |= (swizzle & 0x3) << (j * 2);
+   }
+
+   return ureg_swizzle(ureg_src_register(TGSI_FILE_IMMEDIATE, i),
+                       (swizzle >> 0) & 0x3,
+                       (swizzle >> 2) & 0x3,
+                       (swizzle >> 4) & 0x3,
+                       (swizzle >> 6) & 0x3);
+}
+
+
+struct ureg_src
+ureg_DECL_immediate( struct ureg_program *ureg,
+                     const float *v,
+                     unsigned nr )
+{
+   union {
+      float f[4];
+      unsigned u[4];
+   } fu;
+   unsigned int i;
+
+   for (i = 0; i < nr; i++) {
+      fu.f[i] = v[i];
+   }
+
+   return decl_immediate(ureg, fu.u, nr, TGSI_IMM_FLOAT32);
+}
+
 
-   return ureg_swizzle( ureg_src_register( TGSI_FILE_IMMEDIATE, i ),
-                        (swizzle >> 0) & 0x3,
-                        (swizzle >> 2) & 0x3,
-                        (swizzle >> 4) & 0x3,
-                        (swizzle >> 6) & 0x3);
+struct ureg_src
+ureg_DECL_immediate_uint( struct ureg_program *ureg,
+                          const unsigned *v,
+                          unsigned nr )
+{
+   return decl_immediate(ureg, v, nr, TGSI_IMM_UINT32);
+}
+
+
+struct ureg_src
+ureg_DECL_immediate_int( struct ureg_program *ureg,
+                         const int *v,
+                         unsigned nr )
+{
+   return decl_immediate(ureg, (const unsigned *)v, nr, TGSI_IMM_INT32);
 }
 
 
@@ -955,21 +1008,23 @@ static void emit_decl_range( struct ureg_program *ureg,
    out[1].decl_range.Last = first + count - 1;
 }
 
-static void emit_immediate( struct ureg_program *ureg,
-                            const float *v )
+static void
+emit_immediate( struct ureg_program *ureg,
+                const unsigned *v,
+                unsigned type )
 {
    union tgsi_any_token *out = get_tokens( ureg, DOMAIN_DECL, 5 );
 
    out[0].value = 0;
    out[0].imm.Type = TGSI_TOKEN_TYPE_IMMEDIATE;
    out[0].imm.NrTokens = 5;
-   out[0].imm.DataType = TGSI_IMM_FLOAT32;
+   out[0].imm.DataType = type;
    out[0].imm.Padding = 0;
 
-   out[1].imm_data.Float = v[0];
-   out[2].imm_data.Float = v[1];
-   out[3].imm_data.Float = v[2];
-   out[4].imm_data.Float = v[3];
+   out[1].imm_data.Uint = v[0];
+   out[2].imm_data.Uint = v[1];
+   out[3].imm_data.Uint = v[2];
+   out[4].imm_data.Uint = v[3];
 }
 
 
@@ -1055,7 +1110,8 @@ static void emit_decls( struct ureg_program *ureg )
 
    for (i = 0; i < ureg->nr_immediates; i++) {
       emit_immediate( ureg,
-                      ureg->immediate[i].v );
+                      ureg->immediate[i].value.u,
+                      ureg->immediate[i].type );
    }
 }
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
index 7e3e7bcf1d..6f11273320 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
@@ -148,6 +148,16 @@ ureg_DECL_immediate( struct ureg_program *,
                      unsigned nr );
 
 struct ureg_src
+ureg_DECL_immediate_uint( struct ureg_program *,
+                          const unsigned *v,
+                          unsigned nr );
+
+struct ureg_src
+ureg_DECL_immediate_int( struct ureg_program *,
+                         const int *v,
+                         unsigned nr );
+
+struct ureg_src
 ureg_DECL_constant( struct ureg_program *,
                     unsigned index );
 
@@ -221,6 +231,90 @@ ureg_imm1f( struct ureg_program *ureg,
    return ureg_DECL_immediate( ureg, v, 1 );
 }
 
+static INLINE struct ureg_src
+ureg_imm4u( struct ureg_program *ureg,
+            unsigned a, unsigned b,
+            unsigned c, unsigned d)
+{
+   unsigned v[4];
+   v[0] = a;
+   v[1] = b;
+   v[2] = c;
+   v[3] = d;
+   return ureg_DECL_immediate_uint( ureg, v, 4 );
+}
+
+static INLINE struct ureg_src
+ureg_imm3u( struct ureg_program *ureg,
+            unsigned a, unsigned b,
+            unsigned c)
+{
+   unsigned v[3];
+   v[0] = a;
+   v[1] = b;
+   v[2] = c;
+   return ureg_DECL_immediate_uint( ureg, v, 3 );
+}
+
+static INLINE struct ureg_src
+ureg_imm2u( struct ureg_program *ureg,
+            unsigned a, unsigned b)
+{
+   unsigned v[2];
+   v[0] = a;
+   v[1] = b;
+   return ureg_DECL_immediate_uint( ureg, v, 2 );
+}
+
+static INLINE struct ureg_src
+ureg_imm1u( struct ureg_program *ureg,
+            unsigned a)
+{
+   return ureg_DECL_immediate_uint( ureg, &a, 1 );
+}
+
+static INLINE struct ureg_src
+ureg_imm4i( struct ureg_program *ureg,
+            int a, int b,
+            int c, int d)
+{
+   int v[4];
+   v[0] = a;
+   v[1] = b;
+   v[2] = c;
+   v[3] = d;
+   return ureg_DECL_immediate_int( ureg, v, 4 );
+}
+
+static INLINE struct ureg_src
+ureg_imm3i( struct ureg_program *ureg,
+            int a, int b,
+            int c)
+{
+   int v[3];
+   v[0] = a;
+   v[1] = b;
+   v[2] = c;
+   return ureg_DECL_immediate_int( ureg, v, 3 );
+}
+
+static INLINE struct ureg_src
+ureg_imm2i( struct ureg_program *ureg,
+            int a, int b)
+{
+   int v[2];
+   v[0] = a;
+   v[1] = b;
+   return ureg_DECL_immediate_int( ureg, v, 2 );
+}
+
+static INLINE struct ureg_src
+ureg_imm1i( struct ureg_program *ureg,
+            int a)
+{
+   return ureg_DECL_immediate_int( ureg, &a, 1 );
+}
+
 /***********************************************************************
  * Functions for patching up labels
  */
diff --git a/src/gallium/auxiliary/util/u_bitmask.c b/src/gallium/auxiliary/util/u_bitmask.c
index 77587c07ec..23c93a3ebc 100644
--- a/src/gallium/auxiliary/util/u_bitmask.c
+++ b/src/gallium/auxiliary/util/u_bitmask.c
@@ -97,12 +97,12 @@ util_bitmask_resize(struct util_bitmask *bm,
    if(!minimum_size)
       return FALSE;
       
-   if(bm->size > minimum_size)
+   if(bm->size >= minimum_size)
       return TRUE;
 
    assert(bm->size % UTIL_BITMASK_BITS_PER_WORD == 0);
    new_size = bm->size;
-   while(!(new_size > minimum_size)) {
+   while(new_size < minimum_size) {
       new_size *= 2;
       /* Check integer overflow */
       if(new_size < bm->size)
@@ -136,7 +136,7 @@ util_bitmask_filled_set(struct util_bitmask *bm,
                         unsigned index)
 {
    assert(bm->filled <= bm->size);
-   assert(index <= bm->size);
+   assert(index < bm->size);
    
    if(index == bm->filled) {
       ++bm->filled;
@@ -149,7 +149,7 @@ util_bitmask_filled_unset(struct util_bitmask *bm,
                           unsigned index)
 {
    assert(bm->filled <= bm->size);
-   assert(index <= bm->size);
+   assert(index < bm->size);
    
    if(index < bm->filled)
       bm->filled = index;
@@ -182,7 +182,7 @@ util_bitmask_add(struct util_bitmask *bm)
       mask = 1;
    }
 found:
-   
+
    /* grow the bitmask if necessary */
    if(!util_bitmask_resize(bm, bm->filled))
       return UTIL_BITMASK_INVALID_INDEX;
@@ -198,9 +198,9 @@ unsigned
 util_bitmask_set(struct util_bitmask *bm, 
                  unsigned index)
 {
-   unsigned word = index / UTIL_BITMASK_BITS_PER_WORD;
-   unsigned bit  = index % UTIL_BITMASK_BITS_PER_WORD;
-   util_bitmask_word mask = 1 << bit;
+   unsigned word;
+   unsigned bit;
+   util_bitmask_word mask;
    
    assert(bm);
    
@@ -208,6 +208,10 @@ util_bitmask_set(struct util_bitmask *bm,
    if(!util_bitmask_resize(bm, index))
       return UTIL_BITMASK_INVALID_INDEX;
 
+   word = index / UTIL_BITMASK_BITS_PER_WORD;
+   bit  = index % UTIL_BITMASK_BITS_PER_WORD;
+   mask = 1 << bit;
+
    bm->words[word] |= mask;
 
    util_bitmask_filled_set(bm, index);
@@ -220,15 +224,19 @@ void
 util_bitmask_clear(struct util_bitmask *bm, 
                    unsigned index)
 {
-   unsigned word = index / UTIL_BITMASK_BITS_PER_WORD;
-   unsigned bit  = index % UTIL_BITMASK_BITS_PER_WORD;
-   util_bitmask_word mask = 1 << bit;
+   unsigned word;
+   unsigned bit;
+   util_bitmask_word mask;
    
    assert(bm);
    
    if(index >= bm->size)
       return;
 
+   word = index / UTIL_BITMASK_BITS_PER_WORD;
+   bit  = index % UTIL_BITMASK_BITS_PER_WORD;
+   mask = 1 << bit;
+
    bm->words[word] &= ~mask;
    
    util_bitmask_filled_unset(bm, index);
@@ -250,7 +258,7 @@ util_bitmask_get(struct util_bitmask *bm,
       return TRUE;
    }
 
-   if(index > bm->size)
+   if(index >= bm->size)
       return FALSE;
 
    if(bm->words[word] & mask) {
diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c
index 1f794d39a1..46b4706b76 100644
--- a/src/gallium/auxiliary/util/u_blitter.c
+++ b/src/gallium/auxiliary/util/u_blitter.c
@@ -48,6 +48,8 @@
 #include "util/u_simple_shaders.h"
 #include "util/u_texture.h"
 
+#define INVALID_PTR ((void*)~0)
+
 struct blitter_context_priv
 {
    struct blitter_context blitter;
@@ -110,6 +112,11 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe)
    ctx->pipe = pipe;
 
    /* init state objects for them to be considered invalid */
+   ctx->blitter.saved_blend_state = INVALID_PTR;
+   ctx->blitter.saved_dsa_state = INVALID_PTR;
+   ctx->blitter.saved_rs_state = INVALID_PTR;
+   ctx->blitter.saved_fs = INVALID_PTR;
+   ctx->blitter.saved_vs = INVALID_PTR;
    ctx->blitter.saved_fb_state.nr_cbufs = ~0;
    ctx->blitter.saved_num_textures = ~0;
    ctx->blitter.saved_num_sampler_states = ~0;
@@ -156,6 +163,7 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe)
    rs_state.cull_mode = PIPE_WINDING_NONE;
    rs_state.bypass_vs_clip_and_viewport = 1;
    rs_state.gl_rasterization_rules = 1;
+   rs_state.flatshade = 1;
    ctx->rs_state = pipe->create_rasterizer_state(pipe, &rs_state);
 
    /* fragment shaders are created on-demand */
@@ -234,11 +242,11 @@ void util_blitter_destroy(struct blitter_context *blitter)
 static void blitter_check_saved_CSOs(struct blitter_context_priv *ctx)
 {
    /* make sure these CSOs have been saved */
-   assert(ctx->blitter.saved_blend_state &&
-          ctx->blitter.saved_dsa_state &&
-          ctx->blitter.saved_rs_state &&
-          ctx->blitter.saved_fs &&
-          ctx->blitter.saved_vs);
+   assert(ctx->blitter.saved_blend_state != INVALID_PTR &&
+          ctx->blitter.saved_dsa_state != INVALID_PTR &&
+          ctx->blitter.saved_rs_state != INVALID_PTR &&
+          ctx->blitter.saved_fs != INVALID_PTR &&
+          ctx->blitter.saved_vs != INVALID_PTR);
 }
 
 static void blitter_restore_CSOs(struct blitter_context_priv *ctx)
@@ -252,11 +260,11 @@ static void blitter_restore_CSOs(struct blitter_context_priv *ctx)
    pipe->bind_fs_state(pipe, ctx->blitter.saved_fs);
    pipe->bind_vs_state(pipe, ctx->blitter.saved_vs);
 
-   ctx->blitter.saved_blend_state = 0;
-   ctx->blitter.saved_dsa_state = 0;
-   ctx->blitter.saved_rs_state = 0;
-   ctx->blitter.saved_fs = 0;
-   ctx->blitter.saved_vs = 0;
+   ctx->blitter.saved_blend_state = INVALID_PTR;
+   ctx->blitter.saved_dsa_state = INVALID_PTR;
+   ctx->blitter.saved_rs_state = INVALID_PTR;
+   ctx->blitter.saved_fs = INVALID_PTR;
+   ctx->blitter.saved_vs = INVALID_PTR;
 
    /* restore the state objects which are required to be saved before copy/fill
     */
@@ -560,45 +568,29 @@ void util_blitter_clear(struct blitter_context *blitter,
    blitter_restore_CSOs(ctx);
 }
 
-void util_blitter_copy(struct blitter_context *blitter,
-                       struct pipe_surface *dst,
-                       unsigned dstx, unsigned dsty,
-                       struct pipe_surface *src,
-                       unsigned srcx, unsigned srcy,
-                       unsigned width, unsigned height,
-                       boolean ignore_stencil)
+static boolean
+is_overlap(unsigned sx1, unsigned sx2, unsigned sy1, unsigned sy2,
+           unsigned dx1, unsigned dx2, unsigned dy1, unsigned dy2)
+{
+    if (sx1 >= dx2 || sx2 <= dx1 || sy1 >= dy2 || sy2 <= dy1) {
+        return FALSE;
+    } else {
+        return TRUE;
+    }
+}
+
+static void util_blitter_do_copy(struct blitter_context *blitter,
+				 struct pipe_surface *dst,
+				 unsigned dstx, unsigned dsty,
+				 struct pipe_surface *src,
+				 unsigned srcx, unsigned srcy,
+				 unsigned width, unsigned height,
+				 boolean is_depth)
 {
    struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
    struct pipe_context *pipe = ctx->pipe;
-   struct pipe_screen *screen = pipe->screen;
    struct pipe_framebuffer_state fb_state;
-   boolean is_stencil, is_depth;
-   unsigned dst_tex_usage;
-
-   /* give up if textures are not set */
-   assert(dst->texture && src->texture);
-   if (!dst->texture || !src->texture)
-      return;
 
-   is_depth = util_format_get_component_bits(src->format, UTIL_FORMAT_COLORSPACE_ZS, 0) != 0;
-   is_stencil = util_format_get_component_bits(src->format, UTIL_FORMAT_COLORSPACE_ZS, 1) != 0;
-   dst_tex_usage = is_depth || is_stencil ? PIPE_TEXTURE_USAGE_DEPTH_STENCIL :
-                                            PIPE_TEXTURE_USAGE_RENDER_TARGET;
-
-   /* check if we can sample from and render to the surfaces */
-   /* (assuming copying a stencil buffer is not possible) */
-   if ((!ignore_stencil && is_stencil) ||
-       !screen->is_format_supported(screen, dst->format, dst->texture->target,
-                                    dst_tex_usage, 0) ||
-       !screen->is_format_supported(screen, src->format, src->texture->target,
-                                    PIPE_TEXTURE_USAGE_SAMPLER, 0)) {
-      util_surface_copy(pipe, FALSE, dst, dstx, dsty, src, srcx, srcy,
-                        width, height);
-      return;
-   }
-
-   /* check whether the states are properly saved */
-   blitter_check_saved_CSOs(ctx);
    assert(blitter->saved_fb_state.nr_cbufs != ~0);
    assert(blitter->saved_num_textures != ~0);
    assert(blitter->saved_num_sampler_states != ~0);
@@ -656,6 +648,108 @@ void util_blitter_copy(struct blitter_context *blitter,
 
    blitter_set_rectangle(ctx, dstx, dsty, dstx+width, dsty+height, 0);
    blitter_draw_quad(ctx);
+
+}
+
+static void util_blitter_overlap_copy(struct blitter_context *blitter,
+				      struct pipe_surface *dst,
+				      unsigned dstx, unsigned dsty,
+				      struct pipe_surface *src,
+				      unsigned srcx, unsigned srcy,
+				      unsigned width, unsigned height)
+{
+   struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
+   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_screen *screen = pipe->screen;
+
+   struct pipe_texture texTemp;
+   struct pipe_texture *texture;
+   struct pipe_surface *tex_surf;
+
+   /* check whether the states are properly saved */
+   blitter_check_saved_CSOs(ctx);
+
+   memset(&texTemp, 0, sizeof(texTemp));
+   texTemp.target = PIPE_TEXTURE_2D;
+   texTemp.format = dst->texture->format; /* XXX verify supported by driver! */
+   texTemp.last_level = 0;
+   texTemp.width0 = width;
+   texTemp.height0 = height;
+   texTemp.depth0 = 1;
+
+   texture = screen->texture_create(screen, &texTemp);
+   if (!texture)
+      return;
+
+   tex_surf = screen->get_tex_surface(screen, texture, 0, 0, 0,
+				      PIPE_BUFFER_USAGE_GPU_READ | 
+				      PIPE_BUFFER_USAGE_GPU_WRITE);
+
+   /* blit from the src to the temp */
+   util_blitter_do_copy(blitter, tex_surf, 0, 0,
+			src, srcx, srcy,
+			width, height,
+			FALSE);
+   util_blitter_do_copy(blitter, dst, dstx, dsty,
+			tex_surf, 0, 0,
+			width, height,
+			FALSE);
+   pipe_surface_reference(&tex_surf, NULL);
+   pipe_texture_reference(&texture, NULL);
+   blitter_restore_CSOs(ctx);
+}
+
+void util_blitter_copy(struct blitter_context *blitter,
+                       struct pipe_surface *dst,
+                       unsigned dstx, unsigned dsty,
+                       struct pipe_surface *src,
+                       unsigned srcx, unsigned srcy,
+                       unsigned width, unsigned height,
+                       boolean ignore_stencil)
+{
+   struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
+   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_screen *screen = pipe->screen;
+   boolean is_stencil, is_depth;
+   unsigned dst_tex_usage;
+
+   /* give up if textures are not set */
+   assert(dst->texture && src->texture);
+   if (!dst->texture || !src->texture)
+      return;
+
+   if (dst->texture == src->texture) {
+      if (is_overlap(srcx, srcx + width, srcy, srcy + height,
+		             dstx, dstx + width, dsty, dsty + height)) {
+         util_blitter_overlap_copy(blitter, dst, dstx, dsty, src, srcx, srcy,
+                                   width, height);
+         return;
+      }
+   }
+		   
+   is_depth = util_format_get_component_bits(src->format, UTIL_FORMAT_COLORSPACE_ZS, 0) != 0;
+   is_stencil = util_format_get_component_bits(src->format, UTIL_FORMAT_COLORSPACE_ZS, 1) != 0;
+   dst_tex_usage = is_depth || is_stencil ? PIPE_TEXTURE_USAGE_DEPTH_STENCIL :
+                                            PIPE_TEXTURE_USAGE_RENDER_TARGET;
+
+   /* check if we can sample from and render to the surfaces */
+   /* (assuming copying a stencil buffer is not possible) */
+   if ((!ignore_stencil && is_stencil) ||
+       !screen->is_format_supported(screen, dst->format, dst->texture->target,
+                                    dst_tex_usage, 0) ||
+       !screen->is_format_supported(screen, src->format, src->texture->target,
+                                    PIPE_TEXTURE_USAGE_SAMPLER, 0)) {
+      util_surface_copy(pipe, FALSE, dst, dstx, dsty, src, srcx, srcy,
+                        width, height);
+      return;
+   }
+
+   /* check whether the states are properly saved */
+   blitter_check_saved_CSOs(ctx);
+   util_blitter_do_copy(blitter,
+			dst, dstx, dsty,
+			src, srcx, srcy,
+			width, height, is_depth);
    blitter_restore_CSOs(ctx);
 }
 
diff --git a/src/gallium/auxiliary/util/u_debug_dump.c b/src/gallium/auxiliary/util/u_debug_dump.c
index 09866880ae..61624d05c0 100644
--- a/src/gallium/auxiliary/util/u_debug_dump.c
+++ b/src/gallium/auxiliary/util/u_debug_dump.c
@@ -255,15 +255,13 @@ DEFINE_DEBUG_DUMP_CONTINUOUS(tex_mipfilter)
 static const char *
 debug_dump_tex_filter_names[] = {
    "PIPE_TEX_FILTER_NEAREST",
-   "PIPE_TEX_FILTER_LINEAR",
-   "PIPE_TEX_FILTER_ANISO"
+   "PIPE_TEX_FILTER_LINEAR"
 };
 
 static const char *
 debug_dump_tex_filter_short_names[] = {
    "nearest",
-   "linear",
-   "aniso"
+   "linear"
 };
 
 DEFINE_DEBUG_DUMP_CONTINUOUS(tex_filter)
diff --git a/src/gallium/auxiliary/util/u_debug_memory.c b/src/gallium/auxiliary/util/u_debug_memory.c
index 7623cb9398..d6484f4ad5 100644
--- a/src/gallium/auxiliary/util/u_debug_memory.c
+++ b/src/gallium/auxiliary/util/u_debug_memory.c
@@ -297,9 +297,9 @@ debug_memory_end(unsigned long start_no)
 
       if((start_no <= hdr->no && hdr->no < last_no) ||
 	 (last_no < start_no && (hdr->no < last_no || start_no <= hdr->no))) {
-	 debug_printf("%s:%u:%s: %u bytes at %p not freed\n",
+	 debug_printf("%s:%u:%s: %lu bytes at %p not freed\n",
 		      hdr->file, hdr->line, hdr->function,
-		      hdr->size, ptr);
+		      (unsigned long) hdr->size, ptr);
 #if DEBUG_MEMORY_STACK
 	 debug_backtrace_dump(hdr->backtrace, DEBUG_MEMORY_STACK);
 #endif
@@ -315,8 +315,8 @@ debug_memory_end(unsigned long start_no)
    }
 
    if(total_size) {
-      debug_printf("Total of %u KB of system memory apparently leaked\n",
-		   (total_size + 1023)/1024);
+      debug_printf("Total of %lu KB of system memory apparently leaked\n",
+		   (unsigned long) (total_size + 1023)/1024);
    }
    else {
       debug_printf("No memory leaks detected.\n");
diff --git a/src/gallium/auxiliary/util/u_format.csv b/src/gallium/auxiliary/util/u_format.csv
index 866b18ff16..9f16b42944 100644
--- a/src/gallium/auxiliary/util/u_format.csv
+++ b/src/gallium/auxiliary/util/u_format.csv
@@ -76,9 +76,9 @@ PIPE_FORMAT_R8G8_SNORM            , array , 1, 1, sn8 , sn8 ,     ,     , xy01,
 PIPE_FORMAT_R8G8B8_SNORM          , array , 1, 1, sn8 , sn8 , sn8 ,     , xyz1, rgb
 PIPE_FORMAT_R8G8B8A8_SNORM        , array , 1, 1, sn8 , sn8 , sn8 , sn8 , xyzw, rgb
 PIPE_FORMAT_R8G8B8X8_SNORM        , array , 1, 1, sn8 , sn8 , sn8 , sn8 , xyz1, rgb
-PIPE_FORMAT_B6G5R5_SNORM          , arith , 1, 1, sn5 , sn5 , sn6 ,     , zyx1, rgb
-PIPE_FORMAT_A8B8G8R8_SNORM        , arith , 1, 1, sn8 , sn8 , sn8 , sn8 , zyxw, rgb
-PIPE_FORMAT_X8B8G8R8_SNORM        , arith , 1, 1, sn8 , sn8 , sn8 , sn8 , zyx1, rgb
+PIPE_FORMAT_B6G5R5_SNORM          , arith , 1, 1, sn5 , sn5 , sn6 ,     , xyz1, rgb
+PIPE_FORMAT_A8B8G8R8_SNORM        , array , 1, 1, sn8 , sn8 , sn8 , sn8 , wzyx, rgb
+PIPE_FORMAT_X8B8G8R8_SNORM        , array , 1, 1, sn8 , sn8 , sn8 , sn8 , wzy1, rgb
 PIPE_FORMAT_R8_SSCALED            , array , 1, 1, s8  ,     ,     ,     , x001, rgb
 PIPE_FORMAT_R8G8_SSCALED          , array , 1, 1, s8  , s8  ,     ,     , xy01, rgb
 PIPE_FORMAT_R8G8B8_SSCALED        , array , 1, 1, s8  , s8  , s8  ,     , xyz1, rgb
@@ -90,14 +90,14 @@ PIPE_FORMAT_R32G32B32_FIXED       , array , 1, 1, h32 , h32 , h32 ,     , xyz1,
 PIPE_FORMAT_R32G32B32A32_FIXED    , array , 1, 1, h32 , h32 , h32 , h32 , xyzw, rgb
 PIPE_FORMAT_L8_SRGB               , arith , 1, 1, u8  ,     ,     ,     , xxx1, srgb 
 PIPE_FORMAT_A8L8_SRGB             , arith , 1, 1, u8  , u8  ,     ,     , xxxy, srgb 
-PIPE_FORMAT_R8G8B8_SRGB           , arith , 1, 1, u8  , u8  , u8  ,     , xyz1, srgb 
-PIPE_FORMAT_R8G8B8A8_SRGB         , arith , 1, 1, u8  , u8  , u8  , u8  , xyzw, srgb 
-PIPE_FORMAT_R8G8B8X8_SRGB         , arith , 1, 1, u8  , u8  , u8  , u8  , xyz1, srgb 
-PIPE_FORMAT_A8R8G8B8_SRGB         , arith , 1, 1, u8  , u8  , u8  , u8  , wxyz, srgb 
-PIPE_FORMAT_X8R8G8B8_SRGB         , arith , 1, 1, u8  , u8  , u8  , u8  , 1xyz, srgb 
-PIPE_FORMAT_B8G8R8A8_SRGB         , arith , 1, 1, u8  , u8  , u8  , u8  , zyxw, srgb 
-PIPE_FORMAT_B8G8R8X8_SRGB         , arith , 1, 1, u8  , u8  , u8  , u8  , zyx1, srgb 
-PIPE_FORMAT_X8UB8UG8SR8S_NORM     , arith , 1, 1, sn8 , sn8 , un8 , x8  , 1zyx, rgb
+PIPE_FORMAT_R8G8B8_SRGB           , array , 1, 1, u8  , u8  , u8  ,     , xyz1, srgb 
+PIPE_FORMAT_R8G8B8A8_SRGB         , array , 1, 1, u8  , u8  , u8  , u8  , xyzw, srgb 
+PIPE_FORMAT_R8G8B8X8_SRGB         , array , 1, 1, u8  , u8  , u8  , u8  , xyz1, srgb 
+PIPE_FORMAT_A8R8G8B8_SRGB         , array , 1, 1, u8  , u8  , u8  , u8  , yzwx, srgb 
+PIPE_FORMAT_X8R8G8B8_SRGB         , array , 1, 1, u8  , u8  , u8  , u8  , yzw1, srgb 
+PIPE_FORMAT_B8G8R8A8_SRGB         , array , 1, 1, u8  , u8  , u8  , u8  , zyxw, srgb 
+PIPE_FORMAT_B8G8R8X8_SRGB         , array , 1, 1, u8  , u8  , u8  , u8  , zyx1, srgb 
+PIPE_FORMAT_X8UB8UG8SR8S_NORM     , array , 1, 1, sn8 , sn8 , un8 , x8  , wzy1, rgb
 PIPE_FORMAT_B6UG5SR5S_NORM        , arith , 1, 1, sn5 , sn5 , un6 ,     , xyz1, rgb
 PIPE_FORMAT_DXT1_RGB              , dxt   , 4, 4, x64 ,     ,     ,     , xyz1, rgb
 PIPE_FORMAT_DXT1_RGBA             , dxt   , 4, 4, x64 ,     ,     ,     , xyzw, rgb
diff --git a/src/gallium/auxiliary/util/u_network.c b/src/gallium/auxiliary/util/u_network.c
index 9eb8f309cd..87ee0e4768 100644
--- a/src/gallium/auxiliary/util/u_network.c
+++ b/src/gallium/auxiliary/util/u_network.c
@@ -117,7 +117,7 @@ u_socket_connect(const char *hostname, uint16_t port)
    if (!host)
       return -1;
 
-   memcpy((char *)&sa.sin_addr,host->h_addr,host->h_length);
+   memcpy((char *)&sa.sin_addr,host->h_addr_list[0],host->h_length);
    sa.sin_family= host->h_addrtype;
    sa.sin_port = htons(port);
 
diff --git a/src/gallium/auxiliary/util/u_rect.c b/src/gallium/auxiliary/util/u_rect.c
index 298fbacecb..8479161c74 100644
--- a/src/gallium/auxiliary/util/u_rect.c
+++ b/src/gallium/auxiliary/util/u_rect.c
@@ -41,7 +41,7 @@
 /**
  * Copy 2D rect from one place to another.
  * Position and sizes are in pixels.
- * src_pitch may be negative to do vertical flip of pixels from source.
+ * src_stride may be negative to do vertical flip of pixels from source.
  */
 void
 util_copy_rect(ubyte * dst,
@@ -54,7 +54,7 @@ util_copy_rect(ubyte * dst,
                const ubyte * src,
                int src_stride,
                unsigned src_x, 
-               int src_y)
+               unsigned src_y)
 {
    unsigned i;
    int src_stride_pos = src_stride < 0 ? -src_stride : src_stride;
@@ -65,10 +65,6 @@ util_copy_rect(ubyte * dst,
    assert(blocksize > 0);
    assert(blockwidth > 0);
    assert(blockheight > 0);
-   assert(src_x >= 0);
-   assert(src_y >= 0);
-   assert(dst_x >= 0);
-   assert(dst_y >= 0);
 
    dst_x /= blockwidth;
    dst_y /= blockheight;
@@ -113,8 +109,6 @@ util_fill_rect(ubyte * dst,
    assert(blocksize > 0);
    assert(blockwidth > 0);
    assert(blockheight > 0);
-   assert(dst_x >= 0);
-   assert(dst_y >= 0);
 
    dst_x /= blockwidth;
    dst_y /= blockheight;
diff --git a/src/gallium/auxiliary/util/u_rect.h b/src/gallium/auxiliary/util/u_rect.h
index 5e444ffae2..b44d821904 100644
--- a/src/gallium/auxiliary/util/u_rect.h
+++ b/src/gallium/auxiliary/util/u_rect.h
@@ -45,7 +45,7 @@ extern void
 util_copy_rect(ubyte * dst, enum pipe_format format,
                unsigned dst_stride, unsigned dst_x, unsigned dst_y,
                unsigned width, unsigned height, const ubyte * src,
-               int src_stride, unsigned src_x, int src_y);
+               int src_stride, unsigned src_x, unsigned src_y);
 
 extern void
 util_fill_rect(ubyte * dst, enum pipe_format format,
diff --git a/src/gallium/auxiliary/util/u_simple_shaders.c b/src/gallium/auxiliary/util/u_simple_shaders.c
index 8172ead020..b751e29ab6 100644
--- a/src/gallium/auxiliary/util/u_simple_shaders.c
+++ b/src/gallium/auxiliary/util/u_simple_shaders.c
@@ -44,13 +44,15 @@
 
 /**
  * Make simple vertex pass-through shader.
+ * \param num_attribs  number of attributes to pass through
+ * \param semantic_names  array of semantic names for each attribute
+ * \param semantic_indexes  array of semantic indexes for each attribute
  */
 void *
 util_make_vertex_passthrough_shader(struct pipe_context *pipe,
                                     uint num_attribs,
                                     const uint *semantic_names,
                                     const uint *semantic_indexes)
-                                    
 {
    struct ureg_program *ureg;
    uint i;
@@ -78,8 +80,6 @@ util_make_vertex_passthrough_shader(struct pipe_context *pipe,
 }
 
 
-
-
 /**
  * Make simple fragment texture shader:
  *  IMM {0,0,0,1}                         // (if writemask != 0xf)
@@ -125,6 +125,12 @@ util_make_fragment_tex_shader_writemask(struct pipe_context *pipe,
    return ureg_create_shader_and_destroy( ureg, pipe );
 }
 
+
+/**
+ * Make a simple fragment shader that sets the output color to a color
+ * taken from a texture.
+ * \param tex_target  one of PIPE_TEXTURE_x
+ */
 void *
 util_make_fragment_tex_shader(struct pipe_context *pipe, unsigned tex_target )
 {
@@ -133,6 +139,7 @@ util_make_fragment_tex_shader(struct pipe_context *pipe, unsigned tex_target )
                                                    TGSI_WRITEMASK_XYZW );
 }
 
+
 /**
  * Make a simple fragment texture shader which reads an X component from
  * a texture and writes it as depth.
@@ -177,6 +184,7 @@ util_make_fragment_tex_shader_writedepth(struct pipe_context *pipe,
    return ureg_create_shader_and_destroy( ureg, pipe );
 }
 
+
 /**
  * Make simple fragment color pass-through shader.
  */
@@ -186,15 +194,19 @@ util_make_fragment_passthrough_shader(struct pipe_context *pipe)
    return util_make_fragment_clonecolor_shader(pipe, 1);
 }
 
+
+/**
+ * Make a fragment shader that copies the input color to N output colors.
+ */
 void *
 util_make_fragment_clonecolor_shader(struct pipe_context *pipe, int num_cbufs)
 {
    struct ureg_program *ureg;
    struct ureg_src src;
-   struct ureg_dst dst[8];
+   struct ureg_dst dst[PIPE_MAX_COLOR_BUFS];
    int i;
 
-   assert(num_cbufs <= 8);
+   assert(num_cbufs <= PIPE_MAX_COLOR_BUFS);
 
    ureg = ureg_create( TGSI_PROCESSOR_FRAGMENT );
    if (ureg == NULL)
diff --git a/src/gallium/auxiliary/util/u_tile.c b/src/gallium/auxiliary/util/u_tile.c
index 5b8dd1abb9..1ba82bb21f 100644
--- a/src/gallium/auxiliary/util/u_tile.c
+++ b/src/gallium/auxiliary/util/u_tile.c
@@ -1155,27 +1155,6 @@ ycbcr_get_tile_rgba(const ushort *src,
 }
 
 
-static void
-fake_get_tile_rgba(const ushort *src,
-                   unsigned w, unsigned h,
-                   float *p,
-                   unsigned dst_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         pRow[0] =
-         pRow[1] =
-         pRow[2] =
-         pRow[3] = (i ^ j) & 1 ? 1.0f : 0.0f;
-      }
-      p += dst_stride;
-   }
-}
-
-
 void
 pipe_tile_raw_to_rgba(enum pipe_format format,
                       void *src,
@@ -1258,8 +1237,10 @@ pipe_tile_raw_to_rgba(enum pipe_format format,
       ycbcr_get_tile_rgba((ushort *) src, w, h, dst, dst_stride, TRUE);
       break;
    default:
-      debug_printf("%s: unsupported format %s\n", __FUNCTION__, pf_name(format));
-      fake_get_tile_rgba(src, w, h, dst, dst_stride);
+      util_format_read_4f(format,
+                          dst, dst_stride * sizeof(float),
+                          src, util_format_get_stride(format, w),
+                          0, 0, w, h);
    }
 }
 
diff --git a/src/gallium/docs/Makefile b/src/gallium/docs/Makefile
new file mode 100644
index 0000000000..d4a5be4192
--- /dev/null
+++ b/src/gallium/docs/Makefile
@@ -0,0 +1,89 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = build
+
+# Internal variables.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
+
+.PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest
+
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html      to make standalone HTML files"
+	@echo "  dirhtml   to make HTML files named index.html in directories"
+	@echo "  pickle    to make pickle files"
+	@echo "  json      to make JSON files"
+	@echo "  htmlhelp  to make HTML files and a HTML help project"
+	@echo "  qthelp    to make HTML files and a qthelp project"
+	@echo "  latex     to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  changes   to make an overview of all changed/added/deprecated items"
+	@echo "  linkcheck to check all external links for integrity"
+	@echo "  doctest   to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+	-rm -rf $(BUILDDIR)/*
+
+html:
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+pickle:
+	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+json:
+	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+	@echo
+	@echo "Build finished; now you can run HTML Help Workshop with the" \
+	      ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+	@echo
+	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
+	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Gallium.qhcp"
+	@echo "To view the help file:"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Gallium.qhc"
+
+latex:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo
+	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+	@echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \
+	      "run these through (pdf)latex."
+
+changes:
+	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+	@echo
+	@echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+	@echo
+	@echo "Link check complete; look for any errors in the above output " \
+	      "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+	@echo "Testing of doctests in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/doctest/output.txt."
diff --git a/src/gallium/docs/make.bat b/src/gallium/docs/make.bat
new file mode 100644
index 0000000000..6f97e0730a
--- /dev/null
+++ b/src/gallium/docs/make.bat
@@ -0,0 +1,113 @@
+@ECHO OFF
+
+REM Command file for Sphinx documentation
+
+set SPHINXBUILD=sphinx-build
+set BUILDDIR=build
+set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source
+if NOT "%PAPER%" == "" (
+	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
+)
+
+if "%1" == "" goto help
+
+if "%1" == "help" (
+	:help
+	echo.Please use `make ^<target^>` where ^<target^> is one of
+	echo.  html      to make standalone HTML files
+	echo.  dirhtml   to make HTML files named index.html in directories
+	echo.  pickle    to make pickle files
+	echo.  json      to make JSON files
+	echo.  htmlhelp  to make HTML files and a HTML help project
+	echo.  qthelp    to make HTML files and a qthelp project
+	echo.  latex     to make LaTeX files, you can set PAPER=a4 or PAPER=letter
+	echo.  changes   to make an overview over all changed/added/deprecated items
+	echo.  linkcheck to check all external links for integrity
+	echo.  doctest   to run all doctests embedded in the documentation if enabled
+	goto end
+)
+
+if "%1" == "clean" (
+	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
+	del /q /s %BUILDDIR%\*
+	goto end
+)
+
+if "%1" == "html" (
+	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
+	echo.
+	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
+	goto end
+)
+
+if "%1" == "dirhtml" (
+	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
+	echo.
+	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
+	goto end
+)
+
+if "%1" == "pickle" (
+	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
+	echo.
+	echo.Build finished; now you can process the pickle files.
+	goto end
+)
+
+if "%1" == "json" (
+	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
+	echo.
+	echo.Build finished; now you can process the JSON files.
+	goto end
+)
+
+if "%1" == "htmlhelp" (
+	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
+	echo.
+	echo.Build finished; now you can run HTML Help Workshop with the ^
+.hhp project file in %BUILDDIR%/htmlhelp.
+	goto end
+)
+
+if "%1" == "qthelp" (
+	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
+	echo.
+	echo.Build finished; now you can run "qcollectiongenerator" with the ^
+.qhcp project file in %BUILDDIR%/qthelp, like this:
+	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Gallium.qhcp
+	echo.To view the help file:
+	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Gallium.ghc
+	goto end
+)
+
+if "%1" == "latex" (
+	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+	echo.
+	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
+	goto end
+)
+
+if "%1" == "changes" (
+	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
+	echo.
+	echo.The overview file is in %BUILDDIR%/changes.
+	goto end
+)
+
+if "%1" == "linkcheck" (
+	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
+	echo.
+	echo.Link check complete; look for any errors in the above output ^
+or in %BUILDDIR%/linkcheck/output.txt.
+	goto end
+)
+
+if "%1" == "doctest" (
+	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
+	echo.
+	echo.Testing of doctests in the sources finished, look at the ^
+results in %BUILDDIR%/doctest/output.txt.
+	goto end
+)
+
+:end
diff --git a/src/gallium/docs/source/conf.py b/src/gallium/docs/source/conf.py
new file mode 100644
index 0000000000..9b0c86babd
--- /dev/null
+++ b/src/gallium/docs/source/conf.py
@@ -0,0 +1,197 @@
+# -*- coding: utf-8 -*-
+#
+# Gallium documentation build configuration file, created by
+# sphinx-quickstart on Sun Dec 20 14:09:05 2009.
+#
+# This file is execfile()d with the current directory set to its containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys, os
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.append(os.path.abspath('.'))
+
+# -- General configuration -----------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be extensions
+# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
+extensions = ['sphinx.ext.pngmath']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'Gallium'
+copyright = u'2009, VMWare, X.org, Nouveau'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '0.3'
+# The full version, including alpha/beta/rc tags.
+release = '0.3'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of documents that shouldn't be included in the build.
+#unused_docs = []
+
+# List of directories, relative to source directory, that shouldn't be searched
+# for source files.
+exclude_trees = []
+
+# The reST default role (used for this markup: `text`) to use for all documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# The language for highlighting source code.
+highlight_language = 'c'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+
+# -- Options for HTML output ---------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  Major themes that come with
+# Sphinx are currently 'default' and 'sphinxdoc'.
+html_theme = 'default'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+
+# The name for this set of Sphinx documents.  If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_use_modindex = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = ''
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'Galliumdoc'
+
+
+# -- Options for LaTeX output --------------------------------------------------
+
+# The paper size ('letter' or 'a4').
+#latex_paper_size = 'letter'
+
+# The font size ('10pt', '11pt' or '12pt').
+#latex_font_size = '10pt'
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title, author, documentclass [howto/manual]).
+latex_documents = [
+  ('index', 'Gallium.tex', u'Gallium Documentation',
+   u'VMWare, X.org, Nouveau', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# Additional stuff for the LaTeX preamble.
+#latex_preamble = ''
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_use_modindex = True
diff --git a/src/gallium/docs/source/context.rst b/src/gallium/docs/source/context.rst
new file mode 100644
index 0000000000..21f5f9111a
--- /dev/null
+++ b/src/gallium/docs/source/context.rst
@@ -0,0 +1,128 @@
+Context
+=======
+
+The context object represents the purest, most directly accessible, abilities
+of the device's 3D rendering pipeline.
+
+Methods
+-------
+
+CSO State
+^^^^^^^^^
+
+All CSO state is created, bound, and destroyed, with triplets of methods that
+all follow a specific naming scheme. For example, ``create_blend_state``,
+``bind_blend_state``, and ``destroy_blend_state``.
+
+CSO objects handled by the context object:
+
+* :ref:`Blend`: ``*_blend_state``
+* :ref:`Sampler`: These are special; they can be bound to either vertex or
+  fragment samplers, and they are bound in groups.
+  ``bind_fragment_sampler_states``, ``bind_vertex_sampler_states``
+* :ref:`Rasterizer`: ``*_rasterizer_state``
+* :ref:`Depth, Stencil, & Alpha`: ``*_depth_stencil_alpha_state``
+* :ref:`Shader`: These have two sets of methods. ``*_fs_state`` is for
+  fragment shaders, and ``*_vs_state`` is for vertex shaders.
+
+
+Resource Binding State
+^^^^^^^^^^^^^^^^^^^^^^
+
+This state describes how resources in various flavours (textures,
+buffers, surfaces) are bound to the driver.
+
+
+* ``set_constant_buffer``
+* ``set_framebuffer_state``
+* ``set_fragment_sampler_textures``
+* ``set_vertex_sampler_textures``
+* ``set_vertex_buffers``
+
+
+Non-CSO State
+^^^^^^^^^^^^^
+
+These pieces of state are too small, variable, and/or trivial to have CSO
+objects. They all follow simple, one-method binding calls, e.g.
+``set_edgeflags``.
+
+* ``set_edgeflags``
+* ``set_blend_color``
+* ``set_clip_state``
+* ``set_polygon_stipple``
+* ``set_scissor_state``
+* ``set_viewport_state``
+* ``set_vertex_elements``
+
+
+Clearing
+^^^^^^^^
+
+``clear`` initializes some or all of the surfaces currently bound to
+the framebuffer to particular RGBA, depth, or stencil values.
+
+Clear is one of the most difficult concepts to nail down to a single
+interface and it seems likely that we will want to add additional
+clear paths, for instance clearing surfaces not bound to the
+framebuffer, or read-modify-write clears such as depth-only or
+stencil-only clears of packed depth-stencil buffers.  
+
+
+Drawing
+^^^^^^^
+
+``draw_arrays``
+
+``draw_elements``
+
+``draw_range_elements``
+
+
+Queries
+^^^^^^^
+
+Queries gather some statistic from the 3D pipeline over one or more
+draws.  Queries may be nested, though no state tracker currently
+exercises this.  
+
+Queries can be created with ``create_query`` and deleted with
+``destroy_query``. To enable a query, use ``begin_query``, and when finished,
+use ``end_query`` to stop the query. Finally, ``get_query_result`` is used
+to retrieve the results.
+
+Flushing
+^^^^^^^^
+
+``flush``
+
+
+Resource Busy Queries
+^^^^^^^^^^^^^^^^^^^^^
+
+``is_texture_referenced``
+
+``is_buffer_referenced``
+
+
+
+Blitting
+^^^^^^^^
+
+These methods emulate classic blitter controls. They are not guaranteed to be
+available; if they are set to NULL, then they are not present.
+
+These methods operate directly on ``pipe_surface`` objects, and stand
+apart from any 3D state in the context.  Blitting functionality may be
+moved to a separate abstraction at some point in the future.
+
+``surface_fill`` performs a fill operation on a section of a surface.
+
+``surface_copy`` blits a region of a surface to a region of another surface,
+provided that both surfaces are the same format. The source and destination
+may be the same surface, and overlapping blits are permitted.
+
+The interfaces to these calls are likely to change to make it easier
+for a driver to batch multiple blits with the same source and
+destination.
+
diff --git a/src/gallium/docs/source/cso.rst b/src/gallium/docs/source/cso.rst
new file mode 100644
index 0000000000..dab1ee50f3
--- /dev/null
+++ b/src/gallium/docs/source/cso.rst
@@ -0,0 +1,14 @@
+CSO
+===
+
+CSO, Constant State Objects, are a core part of Gallium's API.
+
+CSO work on the principle of reusable state; they are created by filling
+out a state object with the desired properties, then passing that object
+to a context. The context returns an opaque context-specific handle which
+can be bound at any time for the desired effect.
+
+.. toctree::
+   :glob:
+
+   cso/*
diff --git a/src/gallium/docs/source/cso/blend.rst b/src/gallium/docs/source/cso/blend.rst
new file mode 100644
index 0000000000..fd9e4a1e2d
--- /dev/null
+++ b/src/gallium/docs/source/cso/blend.rst
@@ -0,0 +1,14 @@
+.. _blend:
+
+Blend
+=====
+
+This state controls blending of the final fragments into the target rendering
+buffers.
+
+XXX it is unresolved what behavior should result if blend_enable is off.
+
+Members
+-------
+
+XXX undocumented members
diff --git a/src/gallium/docs/source/cso/dsa.rst b/src/gallium/docs/source/cso/dsa.rst
new file mode 100644
index 0000000000..12abaa9d6f
--- /dev/null
+++ b/src/gallium/docs/source/cso/dsa.rst
@@ -0,0 +1,58 @@
+.. _depth,stencil,&alpha:
+
+Depth, Stencil, & Alpha
+=======================
+
+These three states control the depth, stencil, and alpha tests, used to
+discard fragments that have passed through the fragment shader.
+
+Traditionally, these three tests have been clumped together in hardware, so
+they are all stored in one structure.
+
+During actual execution, the order of operations done on fragments is always:
+
+* Stencil
+* Depth
+* Alpha
+
+Depth Members
+-------------
+
+enabled
+    Whether the depth test is enabled.
+writemask
+    Whether the depth buffer receives depth writes.
+func
+    The depth test function. One of PIPE_FUNC.
+
+Stencil Members
+---------------
+
+XXX document valuemask, writemask
+
+enabled
+    Whether the stencil test is enabled. For the second stencil, whether the
+    two-sided stencil is enabled.
+func
+    The stencil test function. One of PIPE_FUNC.
+ref_value
+    Stencil test reference value; used for certain functions.
+fail_op
+    The operation to carry out if the stencil test fails. One of
+    PIPE_STENCIL_OP.
+zfail_op
+    The operation to carry out if the stencil test passes but the depth test
+    fails. One of PIPE_STENCIL_OP.
+zpass_op
+    The operation to carry out if the stencil test and depth test both pass.
+    One of PIPE_STENCIL_OP.
+
+Alpha Members
+-------------
+
+enabled
+    Whether the alpha test is enabled.
+func
+    The alpha test function. One of PIPE_FUNC.
+ref_value
+    Alpha test reference value; used for certain functions.
diff --git a/src/gallium/docs/source/cso/rasterizer.rst b/src/gallium/docs/source/cso/rasterizer.rst
new file mode 100644
index 0000000000..4d8e1708e7
--- /dev/null
+++ b/src/gallium/docs/source/cso/rasterizer.rst
@@ -0,0 +1,152 @@
+.. _rasterizer:
+
+Rasterizer
+==========
+
+The rasterizer state controls the rendering of points, lines and triangles.
+Attributes include polygon culling state, line width, line stipple,
+multisample state, scissoring and flat/smooth shading.
+
+
+Members
+-------
+
+flatshade
+    If set, the provoking vertex of each polygon is used to determine the
+    color of the entire polygon.  If not set, fragment colors will be
+    interpolated between the vertex colors.
+    Note that this is separate from the fragment shader input attributes
+    CONSTANT, LINEAR and PERSPECTIVE.  We need the flatshade state at
+    clipping time to determine how to set the color of new vertices.
+    Also note that the draw module can implement flat shading by copying
+    the provoking vertex color to all the other vertices in the primitive.
+
+flatshade_first
+    Whether the first vertex should be the provoking vertex, for most
+    primitives. If not set, the last vertex is the provoking vertex.
+
+light_twoside
+    If set, there are per-vertex back-facing colors.  The draw module
+    uses this state along with the front/back information to set the
+    final vertex colors prior to rasterization.
+
+front_winding
+    Indicates the window order of front-facing polygons, either
+    PIPE_WINDING_CW or PIPE_WINDING_CCW
+cull_mode
+    Indicates which polygons to cull, either PIPE_WINDING_NONE (cull no
+    polygons), PIPE_WINDING_CW (cull clockwise-winding polygons),
+    PIPE_WINDING_CCW (cull counter clockwise-winding polygons), or
+    PIPE_WINDING_BOTH (cull all polygons).
+
+fill_cw
+    Indicates how to fill clockwise polygons, either PIPE_POLYGON_MODE_FILL,
+    PIPE_POLYGON_MODE_LINE or PIPE_POLYGON_MODE_POINT.
+fill_ccw
+    Indicates how to fill counter clockwise polygons, either
+    PIPE_POLYGON_MODE_FILL, PIPE_POLYGON_MODE_LINE or PIPE_POLYGON_MODE_POINT.
+
+poly_stipple_enable
+    Whether polygon stippling is enabled.
+poly_smooth
+    Controls OpenGL-style polygon smoothing/antialiasing
+offset_cw
+    If set, clockwise polygons will have polygon offset factors applied
+offset_ccw
+    If set, counter clockwise polygons will have polygon offset factors applied
+offset_units
+    Specifies the polygon offset bias
+offset_scale
+    Specifies the polygon offset scale
+
+line_width
+    The width of lines.
+line_smooth
+    Whether lines should be smoothed. Line smoothing is simply anti-aliasing.
+line_stipple_enable
+    Whether line stippling is enabled.
+line_stipple_pattern
+    16-bit bitfield of on/off flags, used to pattern the line stipple.
+line_stipple_factor
+    When drawinga stippled line, each bit in the stipple pattern is
+    repeated N times, where N = line_stipple_factor + 1.
+line_last_pixel
+    Controls whether the last pixel in a line is drawn or not.  OpenGL
+    omits the last pixel to avoid double-drawing pixels at the ends of lines
+    when drawing connected lines.
+
+point_smooth
+    Whether points should be smoothed. Point smoothing turns rectangular
+    points into circles or ovals.
+point_size_per_vertex
+    Whether vertices have a point size element.
+point_size
+    The size of points, if not specified per-vertex.
+point_size_min
+    The minimum size of points.
+point_size_max
+    The maximum size of points.
+point_sprite
+    Whether points are drawn as sprites (textured quads)
+sprite_coord_mode
+    Specifies how the value for each shader output should be computed when
+    drawing sprites.  If PIPE_SPRITE_COORD_NONE, don't change the vertex
+    shader output.  Otherwise, the four vertices of the resulting quad will
+    be assigned texture coordinates.  For PIPE_SPRITE_COORD_LOWER_LEFT, the
+    lower left vertex will have coordinate (0,0,0,1).
+    For PIPE_SPRITE_COORD_UPPER_LEFT, the upper-left vertex will have
+    coordinate (0,0,0,1).
+    This state is needed by the 'draw' module because that's where each
+    point vertex is converted into four quad vertices.  There's no other
+    place to emit the new vertex texture coordinates which are required for
+    sprite rendering.
+    Note that when geometry shaders are available, this state could be
+    removed.  A special geometry shader defined by the state tracker could
+    converts the incoming points into quads with the proper texture coords.
+
+scissor
+    Whether the scissor test is enabled.
+
+multisample
+    Whether :ref:`MSAA` is enabled.
+
+bypass_vs_clip_and_viewport
+    Whether the entire TCL pipeline should be bypassed. This implies that
+    vertices are pre-transformed for the viewport, and will not be run
+    through the vertex shader. Note that implementations may still clip away
+    vertices that are not in the viewport.
+
+gl_rasterization_rules
+    Whether the rasterizer should use (0.5, 0.5) pixel centers. When not set,
+    the rasterizer will use (0, 0) for pixel centers.
+
+
+Notes
+-----
+
+flatshade
+^^^^^^^^^
+
+The actual interpolated shading algorithm is obviously
+implementation-dependent, but will usually be Gourard for most hardware.
+
+bypass_vs_clip_and_viewport
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+When set, this implies that vertices are pre-transformed for the viewport, and
+will not be run through the vertex shader. Note that implementations may still
+clip away vertices that are not visible.
+
+flatshade_first
+^^^^^^^^^^^^^^^
+
+There are several important exceptions to the specification of this rule.
+
+* ``PIPE_PRIMITIVE_POLYGON``: The provoking vertex is always the first
+  vertex. If the caller wishes to change the provoking vertex, they merely
+  need to rotate the vertices themselves.
+* ``PIPE_PRIMITIVE_QUAD``, ``PIPE_PRIMITIVE_QUAD_STRIP``: This option has no
+  effect; the provoking vertex is always the last vertex.
+* ``PIPE_PRIMITIVE_TRIANGLE_FAN``: When set, the provoking vertex is the
+  second vertex, not the first. This permits each segment of the fan to have
+  a different color.
diff --git a/src/gallium/docs/source/cso/sampler.rst b/src/gallium/docs/source/cso/sampler.rst
new file mode 100644
index 0000000000..e3f1757f57
--- /dev/null
+++ b/src/gallium/docs/source/cso/sampler.rst
@@ -0,0 +1,46 @@
+.. _sampler:
+
+Sampler
+=======
+
+Texture units have many options for selecting texels from loaded textures;
+this state controls an individual texture unit's texel-sampling settings.
+
+Texture coordinates are always treated as four-dimensional, and referred to
+with the traditional (S, T, R, Q) notation.
+
+Members
+-------
+
+XXX undocumented compare_mode, compare_func
+
+wrap_s
+    How to wrap the S coordinate. One of PIPE_TEX_WRAP.
+wrap_t
+    How to wrap the T coordinate. One of PIPE_TEX_WRAP.
+wrap_r
+    How to wrap the R coordinate. One of PIPE_TEX_WRAP.
+min_img_filter
+    The filter to use when minifying texels. One of PIPE_TEX_FILTER.
+min_mip_filter
+    The filter to use when minifying mipmapped textures. One of
+    PIPE_TEX_FILTER.
+mag_img_filter
+    The filter to use when magnifying texels. One of PIPE_TEX_FILTER.
+normalized_coords
+    Whether the texture coordinates are normalized. If normalized, they will
+    always be in [0, 1]. If not, they will be in the range of each dimension
+    of the loaded texture.
+prefilter
+    XXX From the Doxy, "weird sampling state exposed by some APIs." Refine.
+lod_bias
+    The bias to apply to the level of detail.
+min_lod
+    Minimum level of detail, used to clamp LoD after bias.
+max_lod
+    Maximum level of detail, used to clamp LoD after bias.
+border_color
+    RGBA color used for out-of-bounds coordinates.
+max_anisotropy
+    Maximum filtering to apply anisotropically to textures. Setting this to
+    1.0 effectively disables anisotropic filtering.
diff --git a/src/gallium/docs/source/cso/shader.rst b/src/gallium/docs/source/cso/shader.rst
new file mode 100644
index 0000000000..0ee42c8787
--- /dev/null
+++ b/src/gallium/docs/source/cso/shader.rst
@@ -0,0 +1,12 @@
+.. _shader:
+
+Shader
+======
+
+One of the two types of shaders supported by Gallium.
+
+Members
+-------
+
+tokens
+    A list of tgsi_tokens.
diff --git a/src/gallium/docs/source/distro.rst b/src/gallium/docs/source/distro.rst
new file mode 100644
index 0000000000..33e846e33d
--- /dev/null
+++ b/src/gallium/docs/source/distro.rst
@@ -0,0 +1,141 @@
+Distribution
+============
+
+Along with the interface definitions, the following drivers, state trackers,
+and auxiliary modules are shipped in the standard Gallium distribution.
+
+Drivers
+-------
+
+Cell
+^^^^
+
+Failover
+^^^^^^^^
+
+Deprecated.
+
+Intel i915
+^^^^^^^^^^
+
+Intel i965
+^^^^^^^^^^
+
+Highly experimental.
+
+Identity
+^^^^^^^^
+
+Wrapper driver.
+
+LLVM Softpipe
+^^^^^^^^^^^^^
+
+nVidia nv04
+^^^^^^^^^^^
+
+Deprecated.
+
+nVidia nv10
+^^^^^^^^^^^
+
+Deprecated.
+
+nVidia nv20
+^^^^^^^^^^^
+
+Deprecated.
+
+nVidia nv30
+^^^^^^^^^^^
+
+nVidia nv40
+^^^^^^^^^^^
+
+nVidia nv50
+^^^^^^^^^^^
+
+VMWare SVGA
+^^^^^^^^^^^
+
+ATI r300
+^^^^^^^^
+
+AMD/ATI r600
+^^^^^^^^^^^^
+
+Highly experimental.
+
+Softpipe
+^^^^^^^^
+
+Reference software rasterizer.
+
+Trace
+^^^^^
+
+Wrapper driver.
+
+State Trackers
+--------------
+
+Direct Rendering Infrastructure
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+EGL
+^^^
+
+GLX
+^^^
+
+MesaGL
+^^^^^^
+
+Python
+^^^^^^
+
+OpenVG
+^^^^^^
+
+WGL
+^^^
+
+Xorg XFree86 DDX
+^^^^^^^^^^^^^^^^
+
+Auxiliary
+---------
+
+CSO Cache
+^^^^^^^^^
+
+Draw
+^^^^
+
+Gallivm
+^^^^^^^
+
+Indices
+^^^^^^^
+
+Pipe Buffer Manager
+^^^^^^^^^^^^^^^^^^^
+
+Remote Debugger
+^^^^^^^^^^^^^^^
+
+Runtime Assembly Emission
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Surface Context Tracker
+^^^^^^^^^^^^^^^^^^^^^^^
+
+TGSI
+^^^^
+
+Translate
+^^^^^^^^^
+
+Util
+^^^^
+
diff --git a/src/gallium/docs/source/glossary.rst b/src/gallium/docs/source/glossary.rst
new file mode 100644
index 0000000000..6a9110ce78
--- /dev/null
+++ b/src/gallium/docs/source/glossary.rst
@@ -0,0 +1,10 @@
+Glossary
+========
+
+.. glossary::
+   :sorted:
+
+   MSAA
+      Multi-Sampled Anti-Aliasing. A basic anti-aliasing technique that takes
+      multiple samples of the depth buffer, and uses this information to
+      smooth the edges of polygons.
diff --git a/src/gallium/docs/source/index.rst b/src/gallium/docs/source/index.rst
new file mode 100644
index 0000000000..54bc883fce
--- /dev/null
+++ b/src/gallium/docs/source/index.rst
@@ -0,0 +1,28 @@
+.. Gallium documentation master file, created by
+   sphinx-quickstart on Sun Dec 20 14:09:05 2009.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to Gallium's documentation!
+===================================
+
+Contents:
+
+.. toctree::
+   :maxdepth: 2
+
+   intro
+   tgsi
+   screen
+   context
+   cso
+   distro
+   glossary
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+
diff --git a/src/gallium/docs/source/intro.rst b/src/gallium/docs/source/intro.rst
new file mode 100644
index 0000000000..1ea103840a
--- /dev/null
+++ b/src/gallium/docs/source/intro.rst
@@ -0,0 +1,9 @@
+Introduction
+============
+
+What is Gallium?
+----------------
+
+Gallium is essentially an API for writing graphics drivers in a largely
+device-agnostic fashion. It provides several objects which encapsulate the
+core services of graphics hardware in a straightforward manner.
diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
new file mode 100644
index 0000000000..9631e6967e
--- /dev/null
+++ b/src/gallium/docs/source/screen.rst
@@ -0,0 +1,39 @@
+Screen
+======
+
+A screen is an object representing the context-independent part of a device.
+
+Methods
+-------
+
+XXX moar; got bored
+
+get_name
+^^^^^^^^
+
+Returns an identifying name for the screen.
+
+get_vendor
+^^^^^^^^^^
+
+Returns the screen vendor.
+
+get_param
+^^^^^^^^^
+
+Get an integer/boolean screen parameter.
+
+get_paramf
+^^^^^^^^^^
+
+Get a floating-point screen parameter.
+
+is_format_supported
+^^^^^^^^^^^^^^^^^^^
+
+See if a format can be used in a specific manner.
+
+texture_create
+^^^^^^^^^^^^^^
+
+Given a template of texture setup, create a BO-backed texture.
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
new file mode 100644
index 0000000000..ebee4902b0
--- /dev/null
+++ b/src/gallium/docs/source/tgsi.rst
@@ -0,0 +1,1270 @@
+TGSI
+====
+
+TGSI, Tungsten Graphics Shader Infrastructure, is an intermediate language
+for describing shaders. Since Gallium is inherently shaderful, shaders are
+an important part of the API. TGSI is the only intermediate representation
+used by all drivers.
+
+Instruction Set
+---------------
+
+From GL_NV_vertex_program
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+
+ARL - Address Register Load
+
+.. math::
+
+  dst.x = \lfloor src.x\rfloor
+
+  dst.y = \lfloor src.y\rfloor
+
+  dst.z = \lfloor src.z\rfloor
+
+  dst.w = \lfloor src.w\rfloor
+
+
+MOV - Move
+
+.. math::
+
+  dst.x = src.x
+
+  dst.y = src.y
+
+  dst.z = src.z
+
+  dst.w = src.w
+
+
+LIT - Light Coefficients
+
+.. math::
+
+  dst.x = 1
+
+  dst.y = max(src.x, 0)
+
+  dst.z = (src.x > 0) ? max(src.y, 0)^{clamp(src.w, -128, 128))} : 0
+
+  dst.w = 1
+
+
+RCP - Reciprocal
+
+.. math::
+
+  dst.x = \frac{1}{src.x}
+
+  dst.y = \frac{1}{src.x}
+
+  dst.z = \frac{1}{src.x}
+
+  dst.w = \frac{1}{src.x}
+
+
+RSQ - Reciprocal Square Root
+
+.. math::
+
+  dst.x = \frac{1}{\sqrt{|src.x|}}
+
+  dst.y = \frac{1}{\sqrt{|src.x|}}
+
+  dst.z = \frac{1}{\sqrt{|src.x|}}
+
+  dst.w = \frac{1}{\sqrt{|src.x|}}
+
+
+EXP - Approximate Exponential Base 2
+
+.. math::
+
+  dst.x = 2^{\lfloor src.x\rfloor}
+
+  dst.y = src.x - \lfloor src.x\rfloor
+
+  dst.z = 2^{src.x}
+
+  dst.w = 1
+
+
+LOG - Approximate Logarithm Base 2
+
+.. math::
+
+  dst.x = \lfloor\log_2{|src.x|}\rfloor
+
+  dst.y = \frac{|src.x|}{2^{\lfloor\log_2{|src.x|}\rfloor}}
+
+  dst.z = \log_2{|src.x|}
+
+  dst.w = 1
+
+
+MUL - Multiply
+
+.. math::
+
+  dst.x = src0.x \times src1.x
+
+  dst.y = src0.y \times src1.y
+
+  dst.z = src0.z \times src1.z
+
+  dst.w = src0.w \times src1.w
+
+
+ADD - Add
+
+.. math::
+
+  dst.x = src0.x + src1.x
+
+  dst.y = src0.y + src1.y
+
+  dst.z = src0.z + src1.z
+
+  dst.w = src0.w + src1.w
+
+
+DP3 - 3-component Dot Product
+
+.. math::
+
+  dst.x = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z
+
+  dst.y = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z
+
+  dst.z = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z
+
+  dst.w = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z
+
+
+DP4 - 4-component Dot Product
+
+.. math::
+
+  dst.x = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z + src0.w \times src1.w
+
+  dst.y = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z + src0.w \times src1.w
+
+  dst.z = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z + src0.w \times src1.w
+
+  dst.w = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z + src0.w \times src1.w
+
+
+DST - Distance Vector
+
+.. math::
+
+  dst.x = 1
+
+  dst.y = src0.y \times src1.y
+
+  dst.z = src0.z
+
+  dst.w = src1.w
+
+
+MIN - Minimum
+
+.. math::
+
+  dst.x = min(src0.x, src1.x)
+
+  dst.y = min(src0.y, src1.y)
+
+  dst.z = min(src0.z, src1.z)
+
+  dst.w = min(src0.w, src1.w)
+
+
+MAX - Maximum
+
+.. math::
+
+  dst.x = max(src0.x, src1.x)
+
+  dst.y = max(src0.y, src1.y)
+
+  dst.z = max(src0.z, src1.z)
+
+  dst.w = max(src0.w, src1.w)
+
+
+SLT - Set On Less Than
+
+.. math::
+
+  dst.x = (src0.x < src1.x) ? 1 : 0
+
+  dst.y = (src0.y < src1.y) ? 1 : 0
+
+  dst.z = (src0.z < src1.z) ? 1 : 0
+
+  dst.w = (src0.w < src1.w) ? 1 : 0
+
+
+SGE - Set On Greater Equal Than
+
+.. math::
+
+  dst.x = (src0.x >= src1.x) ? 1 : 0
+
+  dst.y = (src0.y >= src1.y) ? 1 : 0
+
+  dst.z = (src0.z >= src1.z) ? 1 : 0
+
+  dst.w = (src0.w >= src1.w) ? 1 : 0
+
+
+MAD - Multiply And Add
+
+.. math::
+
+  dst.x = src0.x \times src1.x + src2.x
+
+  dst.y = src0.y \times src1.y + src2.y
+
+  dst.z = src0.z \times src1.z + src2.z
+
+  dst.w = src0.w \times src1.w + src2.w
+
+
+SUB - Subtract
+
+.. math::
+
+  dst.x = src0.x - src1.x
+
+  dst.y = src0.y - src1.y
+
+  dst.z = src0.z - src1.z
+
+  dst.w = src0.w - src1.w
+
+
+LRP - Linear Interpolate
+
+.. math::
+
+  dst.x = src0.x \times src1.x + (1 - src0.x) \times src2.x
+
+  dst.y = src0.y \times src1.y + (1 - src0.y) \times src2.y
+
+  dst.z = src0.z \times src1.z + (1 - src0.z) \times src2.z
+
+  dst.w = src0.w \times src1.w + (1 - src0.w) \times src2.w
+
+
+CND - Condition
+
+.. math::
+
+  dst.x = (src2.x > 0.5) ? src0.x : src1.x
+
+  dst.y = (src2.y > 0.5) ? src0.y : src1.y
+
+  dst.z = (src2.z > 0.5) ? src0.z : src1.z
+
+  dst.w = (src2.w > 0.5) ? src0.w : src1.w
+
+
+DP2A - 2-component Dot Product And Add
+
+.. math::
+
+  dst.x = src0.x \times src1.x + src0.y \times src1.y + src2.x
+
+  dst.y = src0.x \times src1.x + src0.y \times src1.y + src2.x
+
+  dst.z = src0.x \times src1.x + src0.y \times src1.y + src2.x
+
+  dst.w = src0.x \times src1.x + src0.y \times src1.y + src2.x
+
+
+FRAC - Fraction
+
+.. math::
+
+  dst.x = src.x - \lfloor src.x\rfloor
+
+  dst.y = src.y - \lfloor src.y\rfloor
+
+  dst.z = src.z - \lfloor src.z\rfloor
+
+  dst.w = src.w - \lfloor src.w\rfloor
+
+
+CLAMP - Clamp
+
+.. math::
+
+  dst.x = clamp(src0.x, src1.x, src2.x)
+
+  dst.y = clamp(src0.y, src1.y, src2.y)
+
+  dst.z = clamp(src0.z, src1.z, src2.z)
+
+  dst.w = clamp(src0.w, src1.w, src2.w)
+
+
+FLR - Floor
+
+This is identical to ARL.
+
+.. math::
+
+  dst.x = \lfloor src.x\rfloor
+
+  dst.y = \lfloor src.y\rfloor
+
+  dst.z = \lfloor src.z\rfloor
+
+  dst.w = \lfloor src.w\rfloor
+
+
+ROUND - Round
+
+.. math::
+
+  dst.x = round(src.x)
+
+  dst.y = round(src.y)
+
+  dst.z = round(src.z)
+
+  dst.w = round(src.w)
+
+
+EX2 - Exponential Base 2
+
+.. math::
+
+  dst.x = 2^{src.x}
+
+  dst.y = 2^{src.x}
+
+  dst.z = 2^{src.x}
+
+  dst.w = 2^{src.x}
+
+
+LG2 - Logarithm Base 2
+
+.. math::
+
+  dst.x = \log_2{src.x}
+
+  dst.y = \log_2{src.x}
+
+  dst.z = \log_2{src.x}
+
+  dst.w = \log_2{src.x}
+
+
+POW - Power
+
+.. math::
+
+  dst.x = src0.x^{src1.x}
+
+  dst.y = src0.x^{src1.x}
+
+  dst.z = src0.x^{src1.x}
+
+  dst.w = src0.x^{src1.x}
+
+XPD - Cross Product
+
+.. math::
+
+  dst.x = src0.y \times src1.z - src1.y \times src0.z
+
+  dst.y = src0.z \times src1.x - src1.z \times src0.x
+
+  dst.z = src0.x \times src1.y - src1.x \times src0.y
+
+  dst.w = 1
+
+
+ABS - Absolute
+
+.. math::
+
+  dst.x = |src.x|
+
+  dst.y = |src.y|
+
+  dst.z = |src.z|
+
+  dst.w = |src.w|
+
+
+RCC - Reciprocal Clamped
+
+XXX cleanup on aisle three
+
+.. math::
+
+  dst.x = (1 / src.x) > 0 ? clamp(1 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1 / src.x, -1.884467e+019, -5.42101e-020)
+
+  dst.y = (1 / src.x) > 0 ? clamp(1 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1 / src.x, -1.884467e+019, -5.42101e-020)
+
+  dst.z = (1 / src.x) > 0 ? clamp(1 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1 / src.x, -1.884467e+019, -5.42101e-020)
+
+  dst.w = (1 / src.x) > 0 ? clamp(1 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1 / src.x, -1.884467e+019, -5.42101e-020)
+
+
+DPH - Homogeneous Dot Product
+
+.. math::
+
+  dst.x = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z + src1.w
+
+  dst.y = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z + src1.w
+
+  dst.z = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z + src1.w
+
+  dst.w = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z + src1.w
+
+
+COS - Cosine
+
+.. math::
+
+  dst.x = \cos{src.x}
+
+  dst.y = \cos{src.x}
+
+  dst.z = \cos{src.x}
+
+  dst.w = \cos{src.x}
+
+
+DDX - Derivative Relative To X
+
+.. math::
+
+  dst.x = partialx(src.x)
+
+  dst.y = partialx(src.y)
+
+  dst.z = partialx(src.z)
+
+  dst.w = partialx(src.w)
+
+
+DDY - Derivative Relative To Y
+
+.. math::
+
+  dst.x = partialy(src.x)
+
+  dst.y = partialy(src.y)
+
+  dst.z = partialy(src.z)
+
+  dst.w = partialy(src.w)
+
+
+KILP - Predicated Discard
+
+  discard
+
+
+PK2H - Pack Two 16-bit Floats
+
+  TBD
+
+
+PK2US - Pack Two Unsigned 16-bit Scalars
+
+  TBD
+
+
+PK4B - Pack Four Signed 8-bit Scalars
+
+  TBD
+
+
+PK4UB - Pack Four Unsigned 8-bit Scalars
+
+  TBD
+
+
+RFL - Reflection Vector
+
+.. math::
+
+  dst.x = 2 \times (src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z) / (src0.x \times src0.x + src0.y \times src0.y + src0.z \times src0.z) \times src0.x - src1.x
+
+  dst.y = 2 \times (src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z) / (src0.x \times src0.x + src0.y \times src0.y + src0.z \times src0.z) \times src0.y - src1.y
+
+  dst.z = 2 \times (src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z) / (src0.x \times src0.x + src0.y \times src0.y + src0.z \times src0.z) \times src0.z - src1.z
+
+  dst.w = 1
+
+Considered for removal.
+
+
+SEQ - Set On Equal
+
+.. math::
+
+  dst.x = (src0.x == src1.x) ? 1 : 0
+  dst.y = (src0.y == src1.y) ? 1 : 0
+  dst.z = (src0.z == src1.z) ? 1 : 0
+  dst.w = (src0.w == src1.w) ? 1 : 0
+
+
+SFL - Set On False
+
+.. math::
+
+  dst.x = 0
+  dst.y = 0
+  dst.z = 0
+  dst.w = 0
+
+Considered for removal.
+
+SGT - Set On Greater Than
+
+.. math::
+
+  dst.x = (src0.x > src1.x) ? 1 : 0
+  dst.y = (src0.y > src1.y) ? 1 : 0
+  dst.z = (src0.z > src1.z) ? 1 : 0
+  dst.w = (src0.w > src1.w) ? 1 : 0
+
+
+SIN - Sine
+
+.. math::
+
+  dst.x = \sin{src.x}
+
+  dst.y = \sin{src.x}
+
+  dst.z = \sin{src.x}
+
+  dst.w = \sin{src.x}
+
+
+SLE - Set On Less Equal Than
+
+.. math::
+
+  dst.x = (src0.x <= src1.x) ? 1 : 0
+  dst.y = (src0.y <= src1.y) ? 1 : 0
+  dst.z = (src0.z <= src1.z) ? 1 : 0
+  dst.w = (src0.w <= src1.w) ? 1 : 0
+
+
+SNE - Set On Not Equal
+
+.. math::
+
+  dst.x = (src0.x != src1.x) ? 1 : 0
+  dst.y = (src0.y != src1.y) ? 1 : 0
+  dst.z = (src0.z != src1.z) ? 1 : 0
+  dst.w = (src0.w != src1.w) ? 1 : 0
+
+
+STR - Set On True
+
+.. math::
+
+  dst.x = 1
+  dst.y = 1
+  dst.z = 1
+  dst.w = 1
+
+
+TEX - Texture Lookup
+
+  TBD
+
+
+TXD - Texture Lookup with Derivatives
+
+  TBD
+
+
+TXP - Projective Texture Lookup
+
+  TBD
+
+
+UP2H - Unpack Two 16-Bit Floats
+
+  TBD
+
+  Considered for removal.
+
+UP2US - Unpack Two Unsigned 16-Bit Scalars
+
+  TBD
+
+  Considered for removal.
+
+UP4B - Unpack Four Signed 8-Bit Values
+
+  TBD
+
+  Considered for removal.
+
+UP4UB - Unpack Four Unsigned 8-Bit Scalars
+
+  TBD
+
+  Considered for removal.
+
+X2D - 2D Coordinate Transformation
+
+.. math::
+
+  dst.x = src0.x + src1.x \times src2.x + src1.y \times src2.y
+  dst.y = src0.y + src1.x \times src2.z + src1.y \times src2.w
+  dst.z = src0.x + src1.x \times src2.x + src1.y \times src2.y
+  dst.w = src0.y + src1.x \times src2.z + src1.y \times src2.w
+
+Considered for removal.
+
+
+From GL_NV_vertex_program2
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+
+ARA - Address Register Add
+
+  TBD
+
+  Considered for removal.
+
+ARR - Address Register Load With Round
+
+.. math::
+
+  dst.x = round(src.x)
+
+  dst.y = round(src.y)
+
+  dst.z = round(src.z)
+
+  dst.w = round(src.w)
+
+
+BRA - Branch
+
+  pc = target
+
+  Considered for removal.
+
+CAL - Subroutine Call
+
+  push(pc)
+  pc = target
+
+
+RET - Subroutine Call Return
+
+  pc = pop()
+
+  Potential restrictions:  
+  * Only occurs at end of function.
+
+SSG - Set Sign
+
+.. math::
+
+  dst.x = (src.x > 0) ? 1 : (src.x < 0) ? -1 : 0
+
+  dst.y = (src.y > 0) ? 1 : (src.y < 0) ? -1 : 0
+
+  dst.z = (src.z > 0) ? 1 : (src.z < 0) ? -1 : 0
+
+  dst.w = (src.w > 0) ? 1 : (src.w < 0) ? -1 : 0
+
+
+CMP - Compare
+
+.. math::
+
+  dst.x = (src0.x < 0) ? src1.x : src2.x
+
+  dst.y = (src0.y < 0) ? src1.y : src2.y
+
+  dst.z = (src0.z < 0) ? src1.z : src2.z
+
+  dst.w = (src0.w < 0) ? src1.w : src2.w
+
+
+KIL - Conditional Discard
+
+.. math::
+
+  if (src.x < 0 || src.y < 0 || src.z < 0 || src.w < 0)
+    discard
+  endif
+
+
+SCS - Sine Cosine
+
+.. math::
+
+  dst.x = \cos{src.x}
+
+  dst.y = \sin{src.x}
+
+  dst.z = 0
+
+  dst.y = 1
+
+
+TXB - Texture Lookup With Bias
+
+  TBD
+
+
+NRM - 3-component Vector Normalise
+
+.. math::
+
+  dst.x = src.x / (src.x \times src.x + src.y \times src.y + src.z \times src.z)
+
+  dst.y = src.y / (src.x \times src.x + src.y \times src.y + src.z \times src.z)
+
+  dst.z = src.z / (src.x \times src.x + src.y \times src.y + src.z \times src.z)
+
+  dst.w = 1
+
+
+DIV - Divide
+
+.. math::
+
+  dst.x = \frac{src0.x}{src1.x}
+
+  dst.y = \frac{src0.y}{src1.y}
+
+  dst.z = \frac{src0.z}{src1.z}
+
+  dst.w = \frac{src0.w}{src1.w}
+
+
+DP2 - 2-component Dot Product
+
+.. math::
+
+  dst.x = src0.x \times src1.x + src0.y \times src1.y
+
+  dst.y = src0.x \times src1.x + src0.y \times src1.y
+
+  dst.z = src0.x \times src1.x + src0.y \times src1.y
+
+  dst.w = src0.x \times src1.x + src0.y \times src1.y
+
+
+TXL - Texture Lookup With LOD
+
+  TBD
+
+
+BRK - Break
+
+  TBD
+
+
+IF - If
+
+  TBD
+
+
+BGNFOR - Begin a For-Loop
+
+  dst.x = floor(src.x)
+  dst.y = floor(src.y)
+  dst.z = floor(src.z)
+
+  if (dst.y <= 0)
+    pc = [matching ENDFOR] + 1
+  endif
+
+  Note: The destination must be a loop register.
+        The source must be a constant register.
+
+  Considered for cleanup / removal.
+
+
+REP - Repeat
+
+  TBD
+
+
+ELSE - Else
+
+  TBD
+
+
+ENDIF - End If
+
+  TBD
+
+
+ENDFOR - End a For-Loop
+
+  dst.x = dst.x + dst.z
+  dst.y = dst.y - 1.0
+
+  if (dst.y > 0)
+    pc = [matching BGNFOR instruction] + 1
+  endif
+
+  Note: The destination must be a loop register.
+
+  Considered for cleanup / removal.
+
+ENDREP - End Repeat
+
+  TBD
+
+
+PUSHA - Push Address Register On Stack
+
+  push(src.x)
+  push(src.y)
+  push(src.z)
+  push(src.w)
+
+  Considered for cleanup / removal.
+
+POPA - Pop Address Register From Stack
+
+  dst.w = pop()
+  dst.z = pop()
+  dst.y = pop()
+  dst.x = pop()
+
+  Considered for cleanup / removal.
+
+
+From GL_NV_gpu_program4
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+Support for these opcodes indicated by a special pipe capability bit (TBD).
+
+CEIL - Ceiling
+
+.. math::
+
+  dst.x = \lceil src.x\rceil
+
+  dst.y = \lceil src.y\rceil
+
+  dst.z = \lceil src.z\rceil
+
+  dst.w = \lceil src.w\rceil
+
+
+I2F - Integer To Float
+
+.. math::
+
+  dst.x = (float) src.x
+
+  dst.y = (float) src.y
+
+  dst.z = (float) src.z
+
+  dst.w = (float) src.w
+
+
+NOT - Bitwise Not
+
+.. math::
+
+  dst.x = ~src.x
+
+  dst.y = ~src.y
+
+  dst.z = ~src.z
+
+  dst.w = ~src.w
+
+
+TRUNC - Truncate
+
+.. math::
+
+  dst.x = trunc(src.x)
+
+  dst.y = trunc(src.y)
+
+  dst.z = trunc(src.z)
+
+  dst.w = trunc(src.w)
+
+
+SHL - Shift Left
+
+.. math::
+
+  dst.x = src0.x << src1.x
+
+  dst.y = src0.y << src1.x
+
+  dst.z = src0.z << src1.x
+
+  dst.w = src0.w << src1.x
+
+
+SHR - Shift Right
+
+.. math::
+
+  dst.x = src0.x >> src1.x
+
+  dst.y = src0.y >> src1.x
+
+  dst.z = src0.z >> src1.x
+
+  dst.w = src0.w >> src1.x
+
+
+AND - Bitwise And
+
+.. math::
+
+  dst.x = src0.x & src1.x
+
+  dst.y = src0.y & src1.y
+
+  dst.z = src0.z & src1.z
+
+  dst.w = src0.w & src1.w
+
+
+OR - Bitwise Or
+
+.. math::
+
+  dst.x = src0.x | src1.x
+
+  dst.y = src0.y | src1.y
+
+  dst.z = src0.z | src1.z
+
+  dst.w = src0.w | src1.w
+
+
+MOD - Modulus
+
+.. math::
+
+  dst.x = src0.x \bmod src1.x
+
+  dst.y = src0.y \bmod src1.y
+
+  dst.z = src0.z \bmod src1.z
+
+  dst.w = src0.w \bmod src1.w
+
+
+XOR - Bitwise Xor
+
+.. math::
+
+  dst.x = src0.x ^ src1.x
+
+  dst.y = src0.y ^ src1.y
+
+  dst.z = src0.z ^ src1.z
+
+  dst.w = src0.w ^ src1.w
+
+
+SAD - Sum Of Absolute Differences
+
+.. math::
+
+  dst.x = |src0.x - src1.x| + src2.x
+
+  dst.y = |src0.y - src1.y| + src2.y
+
+  dst.z = |src0.z - src1.z| + src2.z
+
+  dst.w = |src0.w - src1.w| + src2.w
+
+
+TXF - Texel Fetch
+
+  TBD
+
+
+TXQ - Texture Size Query
+
+  TBD
+
+
+CONT - Continue
+
+  TBD
+
+
+From GL_NV_geometry_program4
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+
+EMIT - Emit
+
+  TBD
+
+
+ENDPRIM - End Primitive
+
+  TBD
+
+
+From GLSL
+^^^^^^^^^^
+
+
+BGNLOOP - Begin a Loop
+
+  TBD
+
+
+BGNSUB - Begin Subroutine
+
+  TBD
+
+
+ENDLOOP - End a Loop
+
+  TBD
+
+
+ENDSUB - End Subroutine
+
+  TBD
+
+
+NOP - No Operation
+
+  Do nothing.
+
+
+NRM4 - 4-component Vector Normalise
+
+.. math::
+
+  dst.x = \frac{src.x}{src.x \times src.x + src.y \times src.y + src.z \times src.z + src.w \times src.w}
+
+  dst.y = \frac{src.y}{src.x \times src.x + src.y \times src.y + src.z \times src.z + src.w \times src.w}
+
+  dst.z = \frac{src.z}{src.x \times src.x + src.y \times src.y + src.z \times src.z + src.w \times src.w}
+
+  dst.w = \frac{src.w}{src.x \times src.x + src.y \times src.y + src.z \times src.z + src.w \times src.w}
+
+
+ps_2_x
+^^^^^^^^^^^^
+
+
+CALLNZ - Subroutine Call If Not Zero
+
+  TBD
+
+
+IFC - If
+
+  TBD
+
+
+BREAKC - Break Conditional
+
+  TBD
+
+
+Explanation of symbols used
+------------------------------
+
+
+Functions
+^^^^^^^^^^^^^^
+
+
+  :math:`|x|`       Absolute value of `x`.
+
+  :math:`\lceil x \rceil` Ceiling of `x`.
+
+  clamp(x,y,z)      Clamp x between y and z.
+                    (x < y) ? y : (x > z) ? z : x
+
+  :math:`\lfloor x\rfloor` Floor of `x`.
+
+  :math:`\log_2{x}` Logarithm of `x`, base 2.
+
+  max(x,y)          Maximum of x and y.
+                    (x > y) ? x : y
+
+  min(x,y)          Minimum of x and y.
+                    (x < y) ? x : y
+
+  partialx(x)       Derivative of x relative to fragment's X.
+
+  partialy(x)       Derivative of x relative to fragment's Y.
+
+  pop()             Pop from stack.
+
+  :math:`x^y`       `x` to the power `y`.
+
+  push(x)           Push x on stack.
+
+  round(x)          Round x.
+
+  trunc(x)          Truncate x, i.e. drop the fraction bits.
+
+
+Keywords
+^^^^^^^^^^^^^
+
+
+  discard           Discard fragment.
+
+  dst               First destination register.
+
+  dst0              First destination register.
+
+  pc                Program counter.
+
+  src               First source register.
+
+  src0              First source register.
+
+  src1              Second source register.
+
+  src2              Third source register.
+
+  target            Label of target instruction.
+
+
+Other tokens
+---------------
+
+
+Declaration Semantic
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+
+  Follows Declaration token if Semantic bit is set.
+
+  Since its purpose is to link a shader with other stages of the pipeline,
+  it is valid to follow only those Declaration tokens that declare a register
+  either in INPUT or OUTPUT file.
+
+  SemanticName field contains the semantic name of the register being declared.
+  There is no default value.
+
+  SemanticIndex is an optional subscript that can be used to distinguish
+  different register declarations with the same semantic name. The default value
+  is 0.
+
+  The meanings of the individual semantic names are explained in the following
+  sections.
+
+TGSI_SEMANTIC_POSITION
+""""""""""""""""""""""
+
+Position, sometimes known as HPOS or WPOS for historical reasons, is the
+location of the vertex in space, in ``(x, y, z, w)`` format. ``x``, ``y``, and ``z``
+are the Cartesian coordinates, and ``w`` is the homogenous coordinate and used
+for the perspective divide, if enabled.
+
+As a vertex shader output, position should be scaled to the viewport. When
+used in fragment shaders, position will ---
+
+XXX --- wait a minute. Should position be in [0,1] for x and y?
+
+XXX additionally, is there a way to configure the perspective divide? it's
+accelerated on most chipsets AFAIK...
+
+Position, if not specified, usually defaults to ``(0, 0, 0, 1)``, and can
+be partially specified as ``(x, y, 0, 1)`` or ``(x, y, z, 1)``.
+
+XXX usually? can we solidify that?
+
+TGSI_SEMANTIC_COLOR
+"""""""""""""""""""
+
+Colors are used to, well, color the primitives. Colors are always in
+``(r, g, b, a)`` format.
+
+If alpha is not specified, it defaults to 1.
+
+TGSI_SEMANTIC_BCOLOR
+""""""""""""""""""""
+
+Back-facing colors are only used for back-facing polygons, and are only valid
+in vertex shader outputs. After rasterization, all polygons are front-facing
+and COLOR and BCOLOR end up occupying the same slots in the fragment, so
+all BCOLORs effectively become regular COLORs in the fragment shader.
+
+TGSI_SEMANTIC_FOG
+"""""""""""""""""
+
+The fog coordinate historically has been used to replace the depth coordinate
+for generation of fog in dedicated fog blocks. Gallium, however, does not use
+dedicated fog acceleration, placing it entirely in the fragment shader
+instead.
+
+The fog coordinate should be written in ``(f, 0, 0, 1)`` format. Only the first
+component matters when writing from the vertex shader; the driver will ensure
+that the coordinate is in this format when used as a fragment shader input.
+
+TGSI_SEMANTIC_PSIZE
+"""""""""""""""""""
+
+PSIZE, or point size, is used to specify point sizes per-vertex. It should
+be in ``(p, n, x, f)`` format, where ``p`` is the point size, ``n`` is the minimum
+size, ``x`` is the maximum size, and ``f`` is the fade threshold.
+
+XXX this is arb_vp. is this what we actually do? should double-check...
+
+When using this semantic, be sure to set the appropriate state in the
+:ref:`rasterizer` first.
+
+TGSI_SEMANTIC_GENERIC
+"""""""""""""""""""""
+
+Generic semantics are nearly always used for texture coordinate attributes,
+in ``(s, t, r, q)`` format. ``t`` and ``r`` may be unused for certain kinds
+of lookups, and ``q`` is the level-of-detail bias for biased sampling.
+
+These attributes are called "generic" because they may be used for anything
+else, including parameters, texture generation information, or anything that
+can be stored inside a four-component vector.
+
+TGSI_SEMANTIC_NORMAL
+""""""""""""""""""""
+
+Vertex normal; could be used to implement per-pixel lighting for legacy APIs
+that allow mixing fixed-function and programmable stages.
+
+TGSI_SEMANTIC_FACE
+""""""""""""""""""
+
+FACE is the facing bit, to store the facing information for the fragment
+shader. ``(f, 0, 0, 1)`` is the format. The first component will be positive
+when the fragment is front-facing, and negative when the component is
+back-facing.
+
+TGSI_SEMANTIC_EDGEFLAG
+""""""""""""""""""""""
+
+XXX no clue
diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
index 01bea0f8cc..3fa8b975d3 100644
--- a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
+++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
@@ -85,7 +85,7 @@ cell_unmap_constant_buffers(struct cell_context *sp)
  *
  * XXX should the element buffer be specified/bound with a separate function?
  */
-static boolean
+static void
 cell_draw_range_elements(struct pipe_context *pipe,
                          struct pipe_buffer *indexBuffer,
                          unsigned indexSize,
@@ -145,29 +145,27 @@ cell_draw_range_elements(struct pipe_context *pipe,
 
    /* Note: leave drawing surfaces mapped */
    cell_unmap_constant_buffers(sp);
-
-   return TRUE;
 }
 
 
-static boolean
+static void
 cell_draw_elements(struct pipe_context *pipe,
                    struct pipe_buffer *indexBuffer,
                    unsigned indexSize,
                    unsigned mode, unsigned start, unsigned count)
 {
-   return cell_draw_range_elements( pipe, indexBuffer,
-                                    indexSize,
-                                    0, 0xffffffff,
-                                    mode, start, count );
+   cell_draw_range_elements( pipe, indexBuffer,
+                             indexSize,
+                             0, 0xffffffff,
+                             mode, start, count );
 }
 
 
-static boolean
+static void
 cell_draw_arrays(struct pipe_context *pipe, unsigned mode,
                      unsigned start, unsigned count)
 {
-   return cell_draw_elements(pipe, NULL, 0, mode, start, count);
+   cell_draw_elements(pipe, NULL, 0, mode, start, count);
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 5c0179d954..12b855a3db 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -405,8 +405,6 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
    case PIPE_TEX_FILTER_LINEAR:
       spu.min_sample_texture_2d[unit] = sample_texture_2d_bilinear;
       break;
-   case PIPE_TEX_FILTER_ANISO:
-      /* fall-through, for now */
    case PIPE_TEX_FILTER_NEAREST:
       spu.min_sample_texture_2d[unit] = sample_texture_2d_nearest;
       break;
@@ -418,8 +416,6 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
    case PIPE_TEX_FILTER_LINEAR:
       spu.mag_sample_texture_2d[unit] = sample_texture_2d_bilinear;
       break;
-   case PIPE_TEX_FILTER_ANISO:
-      /* fall-through, for now */
    case PIPE_TEX_FILTER_NEAREST:
       spu.mag_sample_texture_2d[unit] = sample_texture_2d_nearest;
       break;
diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index 5ed330aa6e..d86d8e09a5 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -1681,7 +1681,7 @@ exec_instruction(
       }
       break;
 
-   case TGSI_OPCODE_SHR:
+   case TGSI_OPCODE_ISHR:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
diff --git a/src/gallium/drivers/failover/fo_context.c b/src/gallium/drivers/failover/fo_context.c
index 37184eac7b..46e4338d98 100644
--- a/src/gallium/drivers/failover/fo_context.c
+++ b/src/gallium/drivers/failover/fo_context.c
@@ -44,11 +44,19 @@ static void failover_destroy( struct pipe_context *pipe )
 }
 
 
+void failover_fail_over( struct failover_context *failover )
+{
+   failover->dirty = TRUE;
+   failover->mode = FO_SW;
+}
 
-static boolean failover_draw_elements( struct pipe_context *pipe,
-				       struct pipe_buffer *indexBuffer,
-				       unsigned indexSize,
-				       unsigned prim, unsigned start, unsigned count)
+
+static void failover_draw_elements( struct pipe_context *pipe,
+                                    struct pipe_buffer *indexBuffer,
+                                    unsigned indexSize,
+                                    unsigned prim, 
+                                    unsigned start, 
+                                    unsigned count)
 {
    struct failover_context *failover = failover_context( pipe );
 
@@ -62,24 +70,22 @@ static boolean failover_draw_elements( struct pipe_context *pipe,
    /* Try hardware:
     */
    if (failover->mode == FO_HW) {
-      if (!failover->hw->draw_elements( failover->hw, 
-					indexBuffer, 
-					indexSize, 
-					prim, 
-					start, 
-					count )) {
-
-	 failover->hw->flush( failover->hw, ~0, NULL );
-	 failover->mode = FO_SW;
-      }
+      failover->hw->draw_elements( failover->hw, 
+                                   indexBuffer, 
+                                   indexSize, 
+                                   prim, 
+                                   start, 
+                                   count );
    }
 
    /* Possibly try software:
     */
    if (failover->mode == FO_SW) {
 
-      if (failover->dirty) 
+      if (failover->dirty) {
+         failover->hw->flush( failover->hw, ~0, NULL );
 	 failover_state_emit( failover );
+      }
 
       failover->sw->draw_elements( failover->sw, 
 				   indexBuffer, 
@@ -94,15 +100,13 @@ static boolean failover_draw_elements( struct pipe_context *pipe,
        */
       failover->sw->flush( failover->sw, ~0, NULL );
    }
-
-   return TRUE;
 }
 
 
-static boolean failover_draw_arrays( struct pipe_context *pipe,
+static void failover_draw_arrays( struct pipe_context *pipe,
 				     unsigned prim, unsigned start, unsigned count)
 {
-   return failover_draw_elements(pipe, NULL, 0, prim, start, count);
+   failover_draw_elements(pipe, NULL, 0, prim, start, count);
 }
 
 static unsigned int
diff --git a/src/gallium/drivers/failover/fo_winsys.h b/src/gallium/drivers/failover/fo_winsys.h
index a8ce997a1f..533122b69d 100644
--- a/src/gallium/drivers/failover/fo_winsys.h
+++ b/src/gallium/drivers/failover/fo_winsys.h
@@ -36,10 +36,13 @@
 
 
 struct pipe_context;
+struct failover_context;
 
 
 struct pipe_context *failover_create( struct pipe_context *hw,
 				      struct pipe_context *sw );
 
 
+void failover_fail_over( struct failover_context *failover );
+
 #endif /* FO_WINSYS_H */
diff --git a/src/gallium/drivers/i915/i915_context.c b/src/gallium/drivers/i915/i915_context.c
index 949f046350..89feeade75 100644
--- a/src/gallium/drivers/i915/i915_context.c
+++ b/src/gallium/drivers/i915/i915_context.c
@@ -45,7 +45,7 @@
  */
 
 
-static boolean
+static void
 i915_draw_range_elements(struct pipe_context *pipe,
                          struct pipe_buffer *indexBuffer,
                          unsigned indexSize,
@@ -106,27 +106,25 @@ i915_draw_range_elements(struct pipe_context *pipe,
       pipe_buffer_unmap(pipe->screen, indexBuffer);
       draw_set_mapped_element_buffer_range(draw, 0, start, start + count - 1, NULL);
    }
-
-   return TRUE;
 }
 
-static boolean
+static void
 i915_draw_elements(struct pipe_context *pipe,
                    struct pipe_buffer *indexBuffer,
                    unsigned indexSize,
                    unsigned prim, unsigned start, unsigned count)
 {
-   return i915_draw_range_elements(pipe, indexBuffer,
-                                   indexSize,
-                                   0, 0xffffffff,
-                                   prim, start, count);
+   i915_draw_range_elements(pipe, indexBuffer,
+                            indexSize,
+                            0, 0xffffffff,
+                            prim, start, count);
 }
 
-static boolean
+static void
 i915_draw_arrays(struct pipe_context *pipe,
                  unsigned prim, unsigned start, unsigned count)
 {
-   return i915_draw_elements(pipe, NULL, 0, prim, start, count);
+   i915_draw_elements(pipe, NULL, 0, prim, start, count);
 }
 
 
diff --git a/src/gallium/drivers/i915/i915_state.c b/src/gallium/drivers/i915/i915_state.c
index 1528afc859..5f5b6f8e18 100644
--- a/src/gallium/drivers/i915/i915_state.c
+++ b/src/gallium/drivers/i915/i915_state.c
@@ -74,8 +74,6 @@ static unsigned translate_img_filter( unsigned filter )
       return FILTER_NEAREST;
    case PIPE_TEX_FILTER_LINEAR:
       return FILTER_LINEAR;
-   case PIPE_TEX_FILTER_ANISO:
-      return FILTER_ANISOTROPIC;
    default:
       assert(0);
       return FILTER_NEAREST;
@@ -221,6 +219,9 @@ i915_create_sampler_state(struct pipe_context *pipe,
    minFilt = translate_img_filter( sampler->min_img_filter );
    magFilt = translate_img_filter( sampler->mag_img_filter );
    
+   if (sampler->max_anisotropy > 1.0)
+      minFilt = magFilt = FILTER_ANISOTROPIC;
+
    if (sampler->max_anisotropy > 2.0) {
       cso->state[0] |= SS2_MAX_ANISO_4;
    }
diff --git a/src/gallium/drivers/i965/brw_draw.c b/src/gallium/drivers/i965/brw_draw.c
index 852fd22982..ea8d39adaf 100644
--- a/src/gallium/drivers/i965/brw_draw.c
+++ b/src/gallium/drivers/i965/brw_draw.c
@@ -176,7 +176,7 @@ try_draw_range_elements(struct brw_context *brw,
 }
 
 
-static boolean
+static void
 brw_draw_range_elements(struct pipe_context *pipe,
 			struct pipe_buffer *index_buffer,
 			unsigned index_size,
@@ -228,29 +228,27 @@ brw_draw_range_elements(struct pipe_context *pipe,
       ret = try_draw_range_elements(brw, index_buffer, hw_prim, start, count );
       assert(ret == 0);
    }
-
-   return TRUE;
 }
 
-static boolean
+static void
 brw_draw_elements(struct pipe_context *pipe,
 		  struct pipe_buffer *index_buffer,
 		  unsigned index_size,
 		  unsigned mode, 
 		  unsigned start, unsigned count)
 {
-   return brw_draw_range_elements( pipe, index_buffer,
-				   index_size,
-				   0, 0xffffffff,
-				   mode, 
-				   start, count );
+   brw_draw_range_elements( pipe, index_buffer,
+                            index_size,
+                            0, 0xffffffff,
+                            mode, 
+                            start, count );
 }
 
-static boolean
+static void
 brw_draw_arrays(struct pipe_context *pipe, unsigned mode,
                      unsigned start, unsigned count)
 {
-   return brw_draw_elements(pipe, NULL, 0, mode, start, count);
+   brw_draw_elements(pipe, NULL, 0, mode, start, count);
 }
 
 
diff --git a/src/gallium/drivers/i965/brw_eu_emit.c b/src/gallium/drivers/i965/brw_eu_emit.c
index 4fe7b6acc1..00d8eaccbc 100644
--- a/src/gallium/drivers/i965/brw_eu_emit.c
+++ b/src/gallium/drivers/i965/brw_eu_emit.c
@@ -860,7 +860,7 @@ void brw_land_fwd_jump(struct brw_compile *p,
        jmpi = 2;
 
    assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
-   assert(jmp_insn->bits1.da1.src1_reg_file = BRW_IMMEDIATE_VALUE);
+   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
 
    jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1);
 }
diff --git a/src/gallium/drivers/i965/brw_pipe_sampler.c b/src/gallium/drivers/i965/brw_pipe_sampler.c
index 5ddc63f57e..81712798a5 100644
--- a/src/gallium/drivers/i965/brw_pipe_sampler.c
+++ b/src/gallium/drivers/i965/brw_pipe_sampler.c
@@ -48,8 +48,6 @@ static GLuint translate_img_filter( unsigned filter )
       return BRW_MAPFILTER_NEAREST;
    case PIPE_TEX_FILTER_LINEAR:
       return BRW_MAPFILTER_LINEAR;
-   case PIPE_TEX_FILTER_ANISO:
-      return BRW_MAPFILTER_ANISOTROPIC;
    default:
       assert(0);
       return BRW_MAPFILTER_NEAREST;
diff --git a/src/gallium/drivers/i965/brw_wm_emit.c b/src/gallium/drivers/i965/brw_wm_emit.c
index 7e57d0306b..8f983a60ae 100644
--- a/src/gallium/drivers/i965/brw_wm_emit.c
+++ b/src/gallium/drivers/i965/brw_wm_emit.c
@@ -691,7 +691,7 @@ static void emit_xpd( struct brw_compile *p,
 {
    GLuint i;
 
-   assert(!(mask & BRW_WRITEMASK_W) == BRW_WRITEMASK_X);
+   assert((mask & BRW_WRITEMASK_W) != BRW_WRITEMASK_W);
    
    for (i = 0 ; i < 3; i++) {
       if (mask & (1<<i)) {
diff --git a/src/gallium/drivers/identity/id_context.c b/src/gallium/drivers/identity/id_context.c
index bdbaae5987..9f5b4e6323 100644
--- a/src/gallium/drivers/identity/id_context.c
+++ b/src/gallium/drivers/identity/id_context.c
@@ -45,7 +45,7 @@ identity_destroy(struct pipe_context *_pipe)
    free(id_pipe);
 }
 
-static boolean
+static void
 identity_draw_arrays(struct pipe_context *_pipe,
                      unsigned prim,
                      unsigned start,
@@ -54,13 +54,13 @@ identity_draw_arrays(struct pipe_context *_pipe,
    struct identity_context *id_pipe = identity_context(_pipe);
    struct pipe_context *pipe = id_pipe->pipe;
 
-   return pipe->draw_arrays(pipe,
-                            prim,
-                            start,
-                            count);
+   pipe->draw_arrays(pipe,
+                     prim,
+                     start,
+                     count);
 }
 
-static boolean
+static void
 identity_draw_elements(struct pipe_context *_pipe,
                        struct pipe_buffer *_indexBuffer,
                        unsigned indexSize,
@@ -73,15 +73,15 @@ identity_draw_elements(struct pipe_context *_pipe,
    struct pipe_context *pipe = id_pipe->pipe;
    struct pipe_buffer *indexBuffer = id_buffer->buffer;
 
-   return pipe->draw_elements(pipe,
-                              indexBuffer,
-                              indexSize,
-                              prim,
-                              start,
-                              count);
+   pipe->draw_elements(pipe,
+                       indexBuffer,
+                       indexSize,
+                       prim,
+                       start,
+                       count);
 }
 
-static boolean
+static void
 identity_draw_range_elements(struct pipe_context *_pipe,
                              struct pipe_buffer *_indexBuffer,
                              unsigned indexSize,
@@ -96,14 +96,14 @@ identity_draw_range_elements(struct pipe_context *_pipe,
    struct pipe_context *pipe = id_pipe->pipe;
    struct pipe_buffer *indexBuffer = id_buffer->buffer;
 
-   return pipe->draw_range_elements(pipe,
-                                    indexBuffer,
-                                    indexSize,
-                                    minIndex,
-                                    maxIndex,
-                                    mode,
-                                    start,
-                                    count);
+   pipe->draw_range_elements(pipe,
+                             indexBuffer,
+                             indexSize,
+                             minIndex,
+                             maxIndex,
+                             mode,
+                             start,
+                             count);
 }
 
 static struct pipe_query *
diff --git a/src/gallium/drivers/llvmpipe/Makefile b/src/gallium/drivers/llvmpipe/Makefile
index e038a5229e..7c6e46006b 100644
--- a/src/gallium/drivers/llvmpipe/Makefile
+++ b/src/gallium/drivers/llvmpipe/Makefile
@@ -50,7 +50,6 @@ C_SOURCES = \
 	lp_state_vs.c \
 	lp_surface.c \
 	lp_tex_cache.c \
-	lp_tex_sample_c.c \
 	lp_tex_sample_llvm.c \
 	lp_texture.c \
 	lp_tile_cache.c \
diff --git a/src/gallium/drivers/llvmpipe/README b/src/gallium/drivers/llvmpipe/README
index 0c3f00fd58..72d9f39658 100644
--- a/src/gallium/drivers/llvmpipe/README
+++ b/src/gallium/drivers/llvmpipe/README
@@ -59,27 +59,16 @@ Requirements
    
    See /proc/cpuinfo to know what your CPU supports.
  
- - LLVM 2.5 or greater. LLVM 2.6 is preferred.
+ - LLVM 2.6.
  
-   On Debian based distributions do:
+   For Linux, on a recent Debian based distribution do:
  
      aptitude install llvm-dev
 
-   There is a typo in one of the llvm 2.5 headers, that may cause compilation
-   errors. To fix it apply the change:
+   For Windows download pre-built MSVC 9.0 or MinGW binaries from
+   http://people.freedesktop.org/~jrfonseca/llvm/ and set the LLVM environment
+   variable to the extracted path.
 
-     --- /usr/include/llvm-c/Core.h.orig	2009-08-10 15:38:54.000000000 +0100
-     +++ /usr/include/llvm-c/Core.h	2009-08-10 15:38:25.000000000 +0100
-     @@ -831,7 +831,7 @@
-        template<typename T>
-        inline T **unwrap(LLVMValueRef *Vals, unsigned Length) {
-          #if DEBUG
-     -    for (LLVMValueRef *I = Vals, E = Vals + Length; I != E; ++I)
-     +    for (LLVMValueRef *I = Vals, *E = Vals + Length; I != E; ++I)
-            cast<T>(*I);
-          #endif
-          return reinterpret_cast<T**>(Vals);
- 
  - scons (optional)
 
  - udis86, http://udis86.sourceforge.net/ (optional):
@@ -95,9 +84,9 @@ Requirements
 Building
 ========
 
-To build everything invoke scons as:
+To build everything on Linux invoke scons as:
 
-  scons debug=yes statetrackers=mesa drivers=llvmpipe winsys=xlib dri=false -k
+  scons debug=yes statetrackers=mesa drivers=trace,llvmpipe winsys=xlib dri=false
 
 Alternatively, you can build it with GNU make, if you prefer, by invoking it as
 
@@ -105,12 +94,15 @@ Alternatively, you can build it with GNU make, if you prefer, by invoking it as
 
 but the rest of these instructions assume that scons is used.
 
+For windows is everything the except except the winsys:
+
+  scons debug=yes statetrackers=mesa drivers=trace,llvmpipe winsys=gdi dri=false
 
 Using
 =====
 
-Building will create a drop-in alternative for libGL.so. To use it set the
-environment variables:
+On Linux, building will create a drop-in alternative for libGL.so. To use it
+set the environment variables:
 
   export LD_LIBRARY_PATH=$PWD/build/linux-x86_64-debug/lib:$LD_LIBRARY_PATH
 
@@ -121,6 +113,11 @@ or
 For performance evaluation pass debug=no to scons, and use the corresponding
 lib directory without the "-debug" suffix.
 
+On Windows, building will create a drop-in alternative for opengl32.dll. To use
+it put it in the same directory as the application. It can also be used by
+replacing the native ICD driver, but it's quite an advanced usage, so if you
+need to ask, don't even try it.
+
 
 Unit testing
 ============
diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript
index 3ca676647c..6bb545a501 100644
--- a/src/gallium/drivers/llvmpipe/SConscript
+++ b/src/gallium/drivers/llvmpipe/SConscript
@@ -66,7 +66,6 @@ llvmpipe = env.ConvenienceLibrary(
 		'lp_state_vs.c',
 		'lp_surface.c',
 		'lp_tex_cache.c',
-		'lp_tex_sample_c.c',
 		'lp_tex_sample_llvm.c',
 		'lp_texture.c',
 		'lp_tile_cache.c',
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
index d14f468ba9..ced7b9c11d 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
@@ -142,7 +142,7 @@ lp_build_blend_factor_unswizzled(struct lp_build_blend_aos_context *bld,
 
 enum lp_build_blend_swizzle {
    LP_BUILD_BLEND_SWIZZLE_RGBA = 0,
-   LP_BUILD_BLEND_SWIZZLE_AAAA = 1,
+   LP_BUILD_BLEND_SWIZZLE_AAAA = 1
 };
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_flow.c b/src/gallium/drivers/llvmpipe/lp_bld_flow.c
index dcc25fbff8..25c10af29f 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_flow.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_flow.c
@@ -47,7 +47,7 @@
  */
 enum lp_build_flow_construct_kind {
    lP_BUILD_FLOW_SCOPE,
-   LP_BUILD_FLOW_SKIP,
+   LP_BUILD_FLOW_SKIP
 };
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_misc.cpp b/src/gallium/drivers/llvmpipe/lp_bld_misc.cpp
index d3f78c06d9..6e79438ead 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_misc.cpp
+++ b/src/gallium/drivers/llvmpipe/lp_bld_misc.cpp
@@ -59,3 +59,17 @@ LLVMInitializeNativeTarget(void)
 
 
 #endif
+
+
+/* 
+ * Hack to allow the linking of release LLVM static libraries on a debug build.
+ *
+ * See also:
+ * - http://social.msdn.microsoft.com/Forums/en-US/vclanguage/thread/7234ea2b-0042-42ed-b4e2-5d8644dfb57d
+ */
+#if defined(_MSC_VER) && defined(_DEBUG)
+#include <crtdefs.h>
+extern "C" {
+   _CRTIMP void __cdecl _invalid_parameter_noinfo(void) {}
+}
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_sample.c b/src/gallium/drivers/llvmpipe/lp_bld_sample.c
index af70ddc6ab..9003e108c1 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_sample.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_sample.c
@@ -69,8 +69,8 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
    state->min_img_filter    = sampler->min_img_filter;
    state->min_mip_filter    = sampler->min_mip_filter;
    state->mag_img_filter    = sampler->mag_img_filter;
-   if(sampler->compare_mode) {
-      state->compare_mode      = sampler->compare_mode;
+   state->compare_mode      = sampler->compare_mode;
+   if(sampler->compare_mode != PIPE_TEX_COMPARE_NONE) {
       state->compare_func      = sampler->compare_func;
    }
    state->normalized_coords = sampler->normalized_coords;
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c
index 47b68b71e2..5ee8d556a6 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c
@@ -488,7 +488,7 @@ lp_build_sample_compare(struct lp_build_sample_context *bld,
    LLVMValueRef res;
    unsigned chan;
 
-   if(!bld->static_state->compare_mode)
+   if(bld->static_state->compare_mode == PIPE_TEX_COMPARE_NONE)
       return;
 
    /* TODO: Compare before swizzling, to avoid redundant computations */
@@ -577,7 +577,6 @@ lp_build_sample_soa(LLVMBuilderRef builder,
       lp_build_sample_2d_nearest_soa(&bld, s, t, width, height, stride, data_ptr, texel);
       break;
    case PIPE_TEX_FILTER_LINEAR:
-   case PIPE_TEX_FILTER_ANISO:
       if(lp_format_is_rgba8(bld.format_desc))
          lp_build_sample_2d_linear_aos(&bld, s, t, width, height, stride, data_ptr, texel);
       else
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
index 7cfa4cc59a..fb1eda4423 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
@@ -361,6 +361,9 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
       if (projected)
          coords[i] = lp_build_mul(&bld->base, coords[i], oow);
    }
+   for (i = num_coords; i < 3; i++) {
+      coords[i] = bld->base.undef;
+   }
 
    bld->sampler->emit_fetch_texel(bld->sampler,
                                   bld->base.builder,
@@ -1315,7 +1318,7 @@ emit_instruction(
       return 0;
       break;
 
-   case TGSI_OPCODE_SHR:
+   case TGSI_OPCODE_ISHR:
       /* deprecated? */
       assert(0);
       return 0;
diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c
index 37587d4f79..1cc3c9227c 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.c
+++ b/src/gallium/drivers/llvmpipe/lp_context.c
@@ -256,22 +256,6 @@ llvmpipe_create( struct pipe_screen *screen )
       llvmpipe->vertex_tex_cache[i] = lp_create_tex_tile_cache(screen);
 
 
-   /* vertex shader samplers */
-   for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
-      llvmpipe->tgsi.vert_samplers[i].base.get_samples = lp_get_samples;
-      llvmpipe->tgsi.vert_samplers[i].processor = TGSI_PROCESSOR_VERTEX;
-      llvmpipe->tgsi.vert_samplers[i].cache = llvmpipe->vertex_tex_cache[i];
-      llvmpipe->tgsi.vert_samplers_list[i] = &llvmpipe->tgsi.vert_samplers[i];
-   }
-
-   /* fragment shader samplers */
-   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
-      llvmpipe->tgsi.frag_samplers[i].base.get_samples = lp_get_samples;
-      llvmpipe->tgsi.frag_samplers[i].processor = TGSI_PROCESSOR_FRAGMENT;
-      llvmpipe->tgsi.frag_samplers[i].cache = llvmpipe->tex_cache[i];
-      llvmpipe->tgsi.frag_samplers_list[i] = &llvmpipe->tgsi.frag_samplers[i];
-   }
-
    /*
     * Create drawing context and plug our rendering stage into it.
     */
@@ -279,10 +263,7 @@ llvmpipe_create( struct pipe_screen *screen )
    if (!llvmpipe->draw) 
       goto fail;
 
-   draw_texture_samplers(llvmpipe->draw,
-                         PIPE_MAX_VERTEX_SAMPLERS,
-                         (struct tgsi_sampler **)
-                            llvmpipe->tgsi.vert_samplers_list);
+   /* FIXME: devise alternative to draw_texture_samplers */
 
    if (debug_get_bool_option( "LP_NO_RAST", FALSE ))
       llvmpipe->no_rast = TRUE;
diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h
index cc4d5ad5fd..6411797cf5 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -115,14 +115,6 @@ struct llvmpipe_context {
 
    unsigned line_stipple_counter;
 
-   /** TGSI exec things */
-   struct {
-      struct lp_shader_sampler vert_samplers[PIPE_MAX_SAMPLERS];
-      struct lp_shader_sampler *vert_samplers_list[PIPE_MAX_SAMPLERS];
-      struct lp_shader_sampler frag_samplers[PIPE_MAX_SAMPLERS];
-      struct lp_shader_sampler *frag_samplers_list[PIPE_MAX_SAMPLERS];
-   } tgsi;
-
    /** The primitive drawing context */
    struct draw_context *draw;
 
diff --git a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
index a96c2cad9d..c152b4413f 100644
--- a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
+++ b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
@@ -45,11 +45,11 @@
 
 
 
-boolean
+void
 llvmpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
                      unsigned start, unsigned count)
 {
-   return llvmpipe_draw_elements(pipe, NULL, 0, mode, start, count);
+   llvmpipe_draw_elements(pipe, NULL, 0, mode, start, count);
 }
 
 
@@ -58,7 +58,7 @@ llvmpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
  * Basically, map the vertex buffers (and drawing surfaces), then hand off
  * the drawing to the 'draw' module.
  */
-boolean
+void
 llvmpipe_draw_range_elements(struct pipe_context *pipe,
                              struct pipe_buffer *indexBuffer,
                              unsigned indexSize,
@@ -122,20 +122,18 @@ llvmpipe_draw_range_elements(struct pipe_context *pipe,
    /* Note: leave drawing surfaces mapped */
 
    lp->dirty_render_cache = TRUE;
-   
-   return TRUE;
 }
 
 
-boolean
+void
 llvmpipe_draw_elements(struct pipe_context *pipe,
                        struct pipe_buffer *indexBuffer,
                        unsigned indexSize,
                        unsigned mode, unsigned start, unsigned count)
 {
-   return llvmpipe_draw_range_elements( pipe, indexBuffer,
-                                        indexSize,
-                                        0, 0xffffffff,
-                                        mode, start, count );
+   llvmpipe_draw_range_elements( pipe, indexBuffer,
+                                 indexSize,
+                                 0, 0xffffffff,
+                                 mode, start, count );
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c
index bce3baec16..4ef0783f3e 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.c
+++ b/src/gallium/drivers/llvmpipe/lp_jit.c
@@ -79,25 +79,22 @@ lp_jit_init_globals(struct llvmpipe_screen *screen)
 
    /* struct lp_jit_context */
    {
-      LLVMTypeRef elem_types[5];
+      LLVMTypeRef elem_types[4];
       LLVMTypeRef context_type;
 
       elem_types[0] = LLVMPointerType(LLVMFloatType(), 0); /* constants */
-      elem_types[1] = LLVMPointerType(LLVMInt8Type(), 0);  /* samplers */
-      elem_types[2] = LLVMFloatType();                     /* alpha_ref_value */
-      elem_types[3] = LLVMPointerType(LLVMInt8Type(), 0);  /* blend_color */
-      elem_types[4] = LLVMArrayType(texture_type, PIPE_MAX_SAMPLERS); /* textures */
+      elem_types[1] = LLVMFloatType();                     /* alpha_ref_value */
+      elem_types[2] = LLVMPointerType(LLVMInt8Type(), 0);  /* blend_color */
+      elem_types[3] = LLVMArrayType(texture_type, PIPE_MAX_SAMPLERS); /* textures */
 
       context_type = LLVMStructType(elem_types, Elements(elem_types), 0);
 
       LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, constants,
                              screen->target, context_type, 0);
-      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, samplers,
-                             screen->target, context_type, 1);
       LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, alpha_ref_value,
-                             screen->target, context_type, 2);
+                             screen->target, context_type, 1);
       LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, blend_color,
-                             screen->target, context_type, 3);
+                             screen->target, context_type, 2);
       LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, textures,
                              screen->target, context_type,
                              LP_JIT_CONTEXT_TEXTURES_INDEX);
@@ -109,24 +106,6 @@ lp_jit_init_globals(struct llvmpipe_screen *screen)
       screen->context_ptr_type = LLVMPointerType(context_type, 0);
    }
 
-   /* fetch_texel
-    */
-   {
-      LLVMTypeRef ret_type;
-      LLVMTypeRef arg_types[3];
-      LLVMValueRef fetch_texel;
-
-      ret_type = LLVMVoidType();
-      arg_types[0] = LLVMPointerType(LLVMInt8Type(), 0);  /* samplers */
-      arg_types[1] = LLVMInt32Type();                     /* unit */
-      arg_types[2] = LLVMPointerType(LLVMVectorType(LLVMFloatType(), 4), 0); /* store */
-
-      fetch_texel = lp_declare_intrinsic(screen->module, "fetch_texel",
-                                         ret_type, arg_types, Elements(arg_types));
-
-      LLVMAddGlobalMapping(screen->engine, fetch_texel, lp_fetch_texel_soa);
-   }
-
 #ifdef DEBUG
    LLVMDumpModule(screen->module);
 #endif
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h
index 58f716ede2..277b690c02 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.h
+++ b/src/gallium/drivers/llvmpipe/lp_jit.h
@@ -41,7 +41,6 @@
 #include "pipe/p_state.h"
 
 
-struct tgsi_sampler;
 struct llvmpipe_screen;
 
 
@@ -78,8 +77,6 @@ struct lp_jit_context
 {
    const float *constants;
 
-   struct tgsi_sampler **samplers;
-
    float alpha_ref_value;
 
    /* FIXME: store (also?) in floats */
@@ -92,16 +89,13 @@ struct lp_jit_context
 #define lp_jit_context_constants(_builder, _ptr) \
    lp_build_struct_get(_builder, _ptr, 0, "constants")
 
-#define lp_jit_context_samplers(_builder, _ptr) \
-   lp_build_struct_get(_builder, _ptr, 1, "samplers")
-
 #define lp_jit_context_alpha_ref_value(_builder, _ptr) \
-   lp_build_struct_get(_builder, _ptr, 2, "alpha_ref_value")
+   lp_build_struct_get(_builder, _ptr, 1, "alpha_ref_value")
 
 #define lp_jit_context_blend_color(_builder, _ptr) \
-   lp_build_struct_get(_builder, _ptr, 3, "blend_color")
+   lp_build_struct_get(_builder, _ptr, 2, "blend_color")
 
-#define LP_JIT_CONTEXT_TEXTURES_INDEX 4
+#define LP_JIT_CONTEXT_TEXTURES_INDEX 3
 
 #define lp_jit_context_textures(_builder, _ptr) \
    lp_build_struct_get_ptr(_builder, _ptr, LP_JIT_CONTEXT_TEXTURES_INDEX, "textures")
@@ -118,12 +112,6 @@ typedef void
                     void *color,
                     void *depth);
 
-void PIPE_CDECL
-lp_fetch_texel_soa( struct tgsi_sampler **samplers,
-                    uint32_t unit,
-                    float *store );
-
-
 void
 lp_jit_screen_cleanup(struct llvmpipe_screen *screen);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state.h b/src/gallium/drivers/llvmpipe/lp_state.h
index 5cee7bf74b..7020da145f 100644
--- a/src/gallium/drivers/llvmpipe/lp_state.h
+++ b/src/gallium/drivers/llvmpipe/lp_state.h
@@ -56,7 +56,6 @@
 #define LP_NEW_QUERY         0x4000
 
 
-struct tgsi_sampler;
 struct vertex_info;
 struct pipe_context;
 struct llvmpipe_context;
@@ -197,14 +196,14 @@ void llvmpipe_update_fs(struct llvmpipe_context *lp);
 void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe );
 
 
-boolean llvmpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
+void llvmpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
 			     unsigned start, unsigned count);
 
-boolean llvmpipe_draw_elements(struct pipe_context *pipe,
+void llvmpipe_draw_elements(struct pipe_context *pipe,
 			       struct pipe_buffer *indexBuffer,
 			       unsigned indexSize,
 			       unsigned mode, unsigned start, unsigned count);
-boolean
+void
 llvmpipe_draw_range_elements(struct pipe_context *pipe,
                              struct pipe_buffer *indexBuffer,
                              unsigned indexSize,
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index acfd7be5f7..6c1ef6bc42 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -192,36 +192,6 @@ compute_cliprect(struct llvmpipe_context *lp)
 }
 
 
-static void
-update_tgsi_samplers( struct llvmpipe_context *llvmpipe )
-{
-   unsigned i;
-
-   /* vertex shader samplers */
-   for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
-      llvmpipe->tgsi.vert_samplers[i].sampler = llvmpipe->vertex_samplers[i];
-      llvmpipe->tgsi.vert_samplers[i].texture = llvmpipe->vertex_textures[i];
-      llvmpipe->tgsi.vert_samplers[i].base.get_samples = lp_get_samples;
-   }
-
-   for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
-      lp_tex_tile_cache_validate_texture( llvmpipe->vertex_tex_cache[i] );
-   }
-
-   /* fragment shader samplers */
-   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
-      llvmpipe->tgsi.frag_samplers[i].sampler = llvmpipe->sampler[i];
-      llvmpipe->tgsi.frag_samplers[i].texture = llvmpipe->texture[i];
-      llvmpipe->tgsi.frag_samplers[i].base.get_samples = lp_get_samples;
-   }
-
-   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
-      lp_tex_tile_cache_validate_texture( llvmpipe->tex_cache[i] );
-   }
-
-   llvmpipe->jit_context.samplers = (struct tgsi_sampler **)llvmpipe->tgsi.frag_samplers_list;
-}
-
 /* Hopefully this will remain quite simple, otherwise need to pull in
  * something like the state tracker mechanism.
  */
@@ -237,8 +207,9 @@ void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe )
    }
       
    if (llvmpipe->dirty & (LP_NEW_SAMPLER |
-                          LP_NEW_TEXTURE))
-      update_tgsi_samplers( llvmpipe );
+                          LP_NEW_TEXTURE)) {
+      /* TODO */
+   }
 
    if (llvmpipe->dirty & (LP_NEW_RASTERIZER |
                           LP_NEW_FS |
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index f2b8c36264..b73ca2d41e 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -453,8 +453,8 @@ generate_fragment(struct llvmpipe_context *lp,
                          debug_dump_tex_mipfilter(key->sampler[i].min_mip_filter, TRUE));
             debug_printf("  .mag_img_filter = %s\n",
                          debug_dump_tex_filter(key->sampler[i].mag_img_filter, TRUE));
-            if(key->sampler[i].compare_mode)
-               debug_printf("  .compare_mode = %s\n", debug_dump_func(key->sampler[i].compare_func, TRUE));
+            if(key->sampler[i].compare_mode != PIPE_TEX_COMPARE_NONE)
+               debug_printf("  .compare_func = %s\n", debug_dump_func(key->sampler[i].compare_func, TRUE));
             debug_printf("  .normalized_coords = %u\n", key->sampler[i].normalized_coords);
             debug_printf("  .prefilter = %u\n", key->sampler[i].prefilter);
          }
@@ -550,13 +550,8 @@ generate_fragment(struct llvmpipe_context *lp,
                             a0_ptr, dadx_ptr, dady_ptr,
                             x0, y0, 2, 0);
 
-#if 0
-   /* C texture sampling */
-   sampler = lp_c_sampler_soa_create(context_ptr);
-#else
    /* code generated texture sampling */
    sampler = lp_llvm_sampler_soa_create(key->sampler, context_ptr);
-#endif
 
    for(i = 0; i < num_fs; ++i) {
       LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
diff --git a/src/gallium/drivers/llvmpipe/lp_test_conv.c b/src/gallium/drivers/llvmpipe/lp_test_conv.c
index 968c7a2d4a..faddfb9677 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_conv.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_conv.c
@@ -330,7 +330,7 @@ test_one(unsigned verbose,
          fprintf(stderr, "conv.bc written\n");
          fprintf(stderr, "Invoke as \"llc -o - conv.bc\"\n");
          firsttime = FALSE;
-         //abort();
+         /* abort(); */
       }
    }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.h b/src/gallium/drivers/llvmpipe/lp_tex_sample.h
index 9ad1bde956..cb59a94464 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample.h
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.h
@@ -31,64 +31,11 @@
 
 #include <llvm-c/Core.h>
 
-#include "tgsi/tgsi_exec.h"
 
-
-struct llvmpipe_tex_tile_cache;
 struct lp_sampler_static_state;
 
 
 /**
- * Subclass of tgsi_sampler
- */
-struct lp_shader_sampler
-{
-   struct tgsi_sampler base;  /**< base class */
-
-   unsigned processor;
-
-   /* For lp_get_samples_2d_linear_POT:
-    */
-   unsigned xpot;
-   unsigned ypot;
-   unsigned level;
-
-   const struct pipe_texture *texture;
-   const struct pipe_sampler_state *sampler;
-
-   struct llvmpipe_tex_tile_cache *cache;
-};
-
-
-
-static INLINE struct lp_shader_sampler *
-lp_shader_sampler(const struct tgsi_sampler *sampler)
-{
-   return (struct lp_shader_sampler *) sampler;
-}
-
-
-
-extern void
-lp_get_samples(struct tgsi_sampler *tgsi_sampler,
-               const float s[QUAD_SIZE],
-               const float t[QUAD_SIZE],
-               const float p[QUAD_SIZE],
-               float lodbias,
-               float rgba[NUM_CHANNELS][QUAD_SIZE]);
-
-
-/**
- * Texture sampling code generator that just calls lp_get_samples C function
- * for the actual sampling computation.
- *
- * @param context_ptr LLVM value with the pointer to the struct lp_jit_context.
- */
-struct lp_build_sampler_soa *
-lp_c_sampler_soa_create(LLVMValueRef context_ptr);
-
-
-/**
  * Pure-LLVM texture sampling code generator.
  *
  * @param context_ptr LLVM value with the pointer to the struct lp_jit_context.
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample_c.c b/src/gallium/drivers/llvmpipe/lp_tex_sample_c.c
deleted file mode 100644
index 68520fa4f0..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample_c.c
+++ /dev/null
@@ -1,1713 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * Copyright 2008 VMware, Inc.  All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * Texture sampling
- *
- * Authors:
- *   Brian Paul
- */
-
-#include "lp_context.h"
-#include "lp_quad.h"
-#include "lp_surface.h"
-#include "lp_texture.h"
-#include "lp_tex_sample.h"
-#include "lp_tex_cache.h"
-#include "pipe/p_context.h"
-#include "pipe/p_defines.h"
-#include "pipe/p_shader_tokens.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
-
-
-
-/*
- * Note, the FRAC macro has to work perfectly.  Otherwise you'll sometimes
- * see 1-pixel bands of improperly weighted linear-filtered textures.
- * The tests/texwrap.c demo is a good test.
- * Also note, FRAC(x) doesn't truly return the fractional part of x for x < 0.
- * Instead, if x < 0 then FRAC(x) = 1 - true_frac(x).
- */
-#define FRAC(f)  ((f) - util_ifloor(f))
-
-
-/**
- * Linear interpolation macro
- */
-static INLINE float
-lerp(float a, float v0, float v1)
-{
-   return v0 + a * (v1 - v0);
-}
-
-
-/**
- * Do 2D/biliner interpolation of float values.
- * v00, v10, v01 and v11 are typically four texture samples in a square/box.
- * a and b are the horizontal and vertical interpolants.
- * It's important that this function is inlined when compiled with
- * optimization!  If we find that's not true on some systems, convert
- * to a macro.
- */
-static INLINE float
-lerp_2d(float a, float b,
-        float v00, float v10, float v01, float v11)
-{
-   const float temp0 = lerp(a, v00, v10);
-   const float temp1 = lerp(a, v01, v11);
-   return lerp(b, temp0, temp1);
-}
-
-
-/**
- * As above, but 3D interpolation of 8 values.
- */
-static INLINE float
-lerp_3d(float a, float b, float c,
-        float v000, float v100, float v010, float v110,
-        float v001, float v101, float v011, float v111)
-{
-   const float temp0 = lerp_2d(a, b, v000, v100, v010, v110);
-   const float temp1 = lerp_2d(a, b, v001, v101, v011, v111);
-   return lerp(c, temp0, temp1);
-}
-
-
-
-/**
- * If A is a signed integer, A % B doesn't give the right value for A < 0
- * (in terms of texture repeat).  Just casting to unsigned fixes that.
- */
-#define REMAINDER(A, B) ((unsigned) (A) % (unsigned) (B))
-
-
-/**
- * Apply texture coord wrapping mode and return integer texture indexes
- * for a vector of four texcoords (S or T or P).
- * \param wrapMode  PIPE_TEX_WRAP_x
- * \param s  the incoming texcoords
- * \param size  the texture image size
- * \param icoord  returns the integer texcoords
- * \return  integer texture index
- */
-static INLINE void
-nearest_texcoord_4(unsigned wrapMode, const float s[4], unsigned size,
-                   int icoord[4])
-{
-   uint ch;
-   switch (wrapMode) {
-   case PIPE_TEX_WRAP_REPEAT:
-      /* s limited to [0,1) */
-      /* i limited to [0,size-1] */
-      for (ch = 0; ch < 4; ch++) {
-         int i = util_ifloor(s[ch] * size);
-         icoord[ch] = REMAINDER(i, size);
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP:
-      /* s limited to [0,1] */
-      /* i limited to [0,size-1] */
-      for (ch = 0; ch < 4; ch++) {
-         if (s[ch] <= 0.0F)
-            icoord[ch] = 0;
-         else if (s[ch] >= 1.0F)
-            icoord[ch] = size - 1;
-         else
-            icoord[ch] = util_ifloor(s[ch] * size);
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      {
-         /* s limited to [min,max] */
-         /* i limited to [0, size-1] */
-         const float min = 1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            if (s[ch] < min)
-               icoord[ch] = 0;
-            else if (s[ch] > max)
-               icoord[ch] = size - 1;
-            else
-               icoord[ch] = util_ifloor(s[ch] * size);
-         }
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-      {
-         /* s limited to [min,max] */
-         /* i limited to [-1, size] */
-         const float min = -1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            if (s[ch] <= min)
-               icoord[ch] = -1;
-            else if (s[ch] >= max)
-               icoord[ch] = size;
-            else
-               icoord[ch] = util_ifloor(s[ch] * size);
-         }
-      }
-      return;
-   case PIPE_TEX_WRAP_MIRROR_REPEAT:
-      {
-         const float min = 1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            const int flr = util_ifloor(s[ch]);
-            float u;
-            if (flr & 1)
-               u = 1.0F - (s[ch] - (float) flr);
-            else
-               u = s[ch] - (float) flr;
-            if (u < min)
-               icoord[ch] = 0;
-            else if (u > max)
-               icoord[ch] = size - 1;
-            else
-               icoord[ch] = util_ifloor(u * size);
-         }
-      }
-      return;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         /* s limited to [0,1] */
-         /* i limited to [0,size-1] */
-         const float u = fabsf(s[ch]);
-         if (u <= 0.0F)
-            icoord[ch] = 0;
-         else if (u >= 1.0F)
-            icoord[ch] = size - 1;
-         else
-            icoord[ch] = util_ifloor(u * size);
-      }
-      return;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
-      {
-         /* s limited to [min,max] */
-         /* i limited to [0, size-1] */
-         const float min = 1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            const float u = fabsf(s[ch]);
-            if (u < min)
-               icoord[ch] = 0;
-            else if (u > max)
-               icoord[ch] = size - 1;
-            else
-               icoord[ch] = util_ifloor(u * size);
-         }
-      }
-      return;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
-      {
-         /* s limited to [min,max] */
-         /* i limited to [0, size-1] */
-         const float min = -1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            const float u = fabsf(s[ch]);
-            if (u < min)
-               icoord[ch] = -1;
-            else if (u > max)
-               icoord[ch] = size;
-            else
-               icoord[ch] = util_ifloor(u * size);
-         }
-      }
-      return;
-   default:
-      assert(0);
-   }
-}
-
-
-/**
- * Used to compute texel locations for linear sampling for four texcoords.
- * \param wrapMode  PIPE_TEX_WRAP_x
- * \param s  the texcoords
- * \param size  the texture image size
- * \param icoord0  returns first texture indexes
- * \param icoord1  returns second texture indexes (usually icoord0 + 1)
- * \param w  returns blend factor/weight between texture indexes
- * \param icoord  returns the computed integer texture coords
- */
-static INLINE void
-linear_texcoord_4(unsigned wrapMode, const float s[4], unsigned size,
-                  int icoord0[4], int icoord1[4], float w[4])
-{
-   uint ch;
-
-   switch (wrapMode) {
-   case PIPE_TEX_WRAP_REPEAT:
-      for (ch = 0; ch < 4; ch++) {
-         float u = s[ch] * size - 0.5F;
-         icoord0[ch] = REMAINDER(util_ifloor(u), size);
-         icoord1[ch] = REMAINDER(icoord0[ch] + 1, size);
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         float u = CLAMP(s[ch], 0.0F, 1.0F);
-         u = u * size - 0.5f;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      for (ch = 0; ch < 4; ch++) {
-         float u = CLAMP(s[ch], 0.0F, 1.0F);
-         u = u * size - 0.5f;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         if (icoord0[ch] < 0)
-            icoord0[ch] = 0;
-         if (icoord1[ch] >= (int) size)
-            icoord1[ch] = size - 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-      {
-         const float min = -1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            float u = CLAMP(s[ch], min, max);
-            u = u * size - 0.5f;
-            icoord0[ch] = util_ifloor(u);
-            icoord1[ch] = icoord0[ch] + 1;
-            w[ch] = FRAC(u);
-         }
-      }
-      break;;
-   case PIPE_TEX_WRAP_MIRROR_REPEAT:
-      for (ch = 0; ch < 4; ch++) {
-         const int flr = util_ifloor(s[ch]);
-         float u;
-         if (flr & 1)
-            u = 1.0F - (s[ch] - (float) flr);
-         else
-            u = s[ch] - (float) flr;
-         u = u * size - 0.5F;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         if (icoord0[ch] < 0)
-            icoord0[ch] = 0;
-         if (icoord1[ch] >= (int) size)
-            icoord1[ch] = size - 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         float u = fabsf(s[ch]);
-         if (u >= 1.0F)
-            u = (float) size;
-         else
-            u *= size;
-         u -= 0.5F;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
-      for (ch = 0; ch < 4; ch++) {
-         float u = fabsf(s[ch]);
-         if (u >= 1.0F)
-            u = (float) size;
-         else
-            u *= size;
-         u -= 0.5F;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         if (icoord0[ch] < 0)
-            icoord0[ch] = 0;
-         if (icoord1[ch] >= (int) size)
-            icoord1[ch] = size - 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
-      {
-         const float min = -1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            float u = fabsf(s[ch]);
-            if (u <= min)
-               u = min * size;
-            else if (u >= max)
-               u = max * size;
-            else
-               u *= size;
-            u -= 0.5F;
-            icoord0[ch] = util_ifloor(u);
-            icoord1[ch] = icoord0[ch] + 1;
-            w[ch] = FRAC(u);
-         }
-      }
-      break;;
-   default:
-      assert(0);
-   }
-}
-
-
-/**
- * For RECT textures / unnormalized texcoords
- * Only a subset of wrap modes supported.
- */
-static INLINE void
-nearest_texcoord_unnorm_4(unsigned wrapMode, const float s[4], unsigned size,
-                          int icoord[4])
-{
-   uint ch;
-   switch (wrapMode) {
-   case PIPE_TEX_WRAP_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         int i = util_ifloor(s[ch]);
-         icoord[ch]= CLAMP(i, 0, (int) size-1);
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      /* fall-through */
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-      for (ch = 0; ch < 4; ch++) {
-         icoord[ch]= util_ifloor( CLAMP(s[ch], 0.5F, (float) size - 0.5F) );
-      }
-      return;
-   default:
-      assert(0);
-   }
-}
-
-
-/**
- * For RECT textures / unnormalized texcoords.
- * Only a subset of wrap modes supported.
- */
-static INLINE void
-linear_texcoord_unnorm_4(unsigned wrapMode, const float s[4], unsigned size,
-                         int icoord0[4], int icoord1[4], float w[4])
-{
-   uint ch;
-   switch (wrapMode) {
-   case PIPE_TEX_WRAP_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         /* Not exactly what the spec says, but it matches NVIDIA output */
-         float u = CLAMP(s[ch] - 0.5F, 0.0f, (float) size - 1.0f);
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         w[ch] = FRAC(u);
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      /* fall-through */
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-      for (ch = 0; ch < 4; ch++) {
-         float u = CLAMP(s[ch], 0.5F, (float) size - 0.5F);
-         u -= 0.5F;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         if (icoord1[ch] > (int) size - 1)
-            icoord1[ch] = size - 1;
-         w[ch] = FRAC(u);
-      }
-      break;
-   default:
-      assert(0);
-   }
-}
-
-
-static unsigned
-choose_cube_face(float rx, float ry, float rz, float *newS, float *newT)
-{
-   /*
-      major axis
-      direction     target                             sc     tc    ma
-      ----------    -------------------------------    ---    ---   ---
-       +rx          TEXTURE_CUBE_MAP_POSITIVE_X_EXT    -rz    -ry   rx
-       -rx          TEXTURE_CUBE_MAP_NEGATIVE_X_EXT    +rz    -ry   rx
-       +ry          TEXTURE_CUBE_MAP_POSITIVE_Y_EXT    +rx    +rz   ry
-       -ry          TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT    +rx    -rz   ry
-       +rz          TEXTURE_CUBE_MAP_POSITIVE_Z_EXT    +rx    -ry   rz
-       -rz          TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT    -rx    -ry   rz
-   */
-   const float arx = fabsf(rx), ary = fabsf(ry), arz = fabsf(rz);
-   unsigned face;
-   float sc, tc, ma;
-
-   if (arx > ary && arx > arz) {
-      if (rx >= 0.0F) {
-         face = PIPE_TEX_FACE_POS_X;
-         sc = -rz;
-         tc = -ry;
-         ma = arx;
-      }
-      else {
-         face = PIPE_TEX_FACE_NEG_X;
-         sc = rz;
-         tc = -ry;
-         ma = arx;
-      }
-   }
-   else if (ary > arx && ary > arz) {
-      if (ry >= 0.0F) {
-         face = PIPE_TEX_FACE_POS_Y;
-         sc = rx;
-         tc = rz;
-         ma = ary;
-      }
-      else {
-         face = PIPE_TEX_FACE_NEG_Y;
-         sc = rx;
-         tc = -rz;
-         ma = ary;
-      }
-   }
-   else {
-      if (rz > 0.0F) {
-         face = PIPE_TEX_FACE_POS_Z;
-         sc = rx;
-         tc = -ry;
-         ma = arz;
-      }
-      else {
-         face = PIPE_TEX_FACE_NEG_Z;
-         sc = -rx;
-         tc = -ry;
-         ma = arz;
-      }
-   }
-
-   *newS = ( sc / ma + 1.0F ) * 0.5F;
-   *newT = ( tc / ma + 1.0F ) * 0.5F;
-
-   return face;
-}
-
-
-/**
- * Examine the quad's texture coordinates to compute the partial
- * derivatives w.r.t X and Y, then compute lambda (level of detail).
- *
- * This is only done for fragment shaders, not vertex shaders.
- */
-static float
-compute_lambda(struct tgsi_sampler *tgsi_sampler,
-               const float s[QUAD_SIZE],
-               const float t[QUAD_SIZE],
-               const float p[QUAD_SIZE],
-               float lodbias)
-{
-   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-   const struct pipe_texture *texture = samp->texture;
-   const struct pipe_sampler_state *sampler = samp->sampler;
-   float rho, lambda;
-
-   if (samp->processor == TGSI_PROCESSOR_VERTEX)
-      return lodbias;
-
-   assert(sampler->normalized_coords);
-
-   assert(s);
-   {
-      float dsdx = s[QUAD_BOTTOM_RIGHT] - s[QUAD_BOTTOM_LEFT];
-      float dsdy = s[QUAD_TOP_LEFT]     - s[QUAD_BOTTOM_LEFT];
-      dsdx = fabsf(dsdx);
-      dsdy = fabsf(dsdy);
-      rho = MAX2(dsdx, dsdy) * texture->width0;
-   }
-   if (t) {
-      float dtdx = t[QUAD_BOTTOM_RIGHT] - t[QUAD_BOTTOM_LEFT];
-      float dtdy = t[QUAD_TOP_LEFT]     - t[QUAD_BOTTOM_LEFT];
-      float max;
-      dtdx = fabsf(dtdx);
-      dtdy = fabsf(dtdy);
-      max = MAX2(dtdx, dtdy) * texture->height0;
-      rho = MAX2(rho, max);
-   }
-   if (p) {
-      float dpdx = p[QUAD_BOTTOM_RIGHT] - p[QUAD_BOTTOM_LEFT];
-      float dpdy = p[QUAD_TOP_LEFT]     - p[QUAD_BOTTOM_LEFT];
-      float max;
-      dpdx = fabsf(dpdx);
-      dpdy = fabsf(dpdy);
-      max = MAX2(dpdx, dpdy) * texture->depth0;
-      rho = MAX2(rho, max);
-   }
-
-   lambda = util_fast_log2(rho);
-   lambda += lodbias + sampler->lod_bias;
-   lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
-
-   return lambda;
-}
-
-
-/**
- * Do several things here:
- * 1. Compute lambda from the texcoords, if needed
- * 2. Determine if we're minifying or magnifying
- * 3. If minifying, choose mipmap levels
- * 4. Return image filter to use within mipmap images
- * \param level0  Returns first mipmap level to sample from
- * \param level1  Returns second mipmap level to sample from
- * \param levelBlend  Returns blend factor between levels, in [0,1]
- * \param imgFilter  Returns either the min or mag filter, depending on lambda
- */
-static void
-choose_mipmap_levels(struct tgsi_sampler *tgsi_sampler,
-                     const float s[QUAD_SIZE],
-                     const float t[QUAD_SIZE],
-                     const float p[QUAD_SIZE],
-                     float lodbias,
-                     unsigned *level0, unsigned *level1, float *levelBlend,
-                     unsigned *imgFilter)
-{
-   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-   const struct pipe_texture *texture = samp->texture;
-   const struct pipe_sampler_state *sampler = samp->sampler;
-
-   if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) {
-      /* no mipmap selection needed */
-      *level0 = *level1 = CLAMP((int) sampler->min_lod,
-                                0, (int) texture->last_level);
-
-      if (sampler->min_img_filter != sampler->mag_img_filter) {
-         /* non-mipmapped texture, but still need to determine if doing
-          * minification or magnification.
-          */
-         float lambda = compute_lambda(tgsi_sampler, s, t, p, lodbias);
-         if (lambda <= 0.0) {
-            *imgFilter = sampler->mag_img_filter;
-         }
-         else {
-            *imgFilter = sampler->min_img_filter;
-         }
-      }
-      else {
-         *imgFilter = sampler->mag_img_filter;
-      }
-   }
-   else {
-      float lambda = compute_lambda(tgsi_sampler, s, t, p, lodbias);
-
-      if (lambda <= 0.0) { /* XXX threshold depends on the filter */
-         /* magnifying */
-         *imgFilter = sampler->mag_img_filter;
-         *level0 = *level1 = 0;
-      }
-      else {
-         /* minifying */
-         *imgFilter = sampler->min_img_filter;
-
-         /* choose mipmap level(s) and compute the blend factor between them */
-         if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
-            /* Nearest mipmap level */
-            const int lvl = (int) (lambda + 0.5);
-            *level0 =
-            *level1 = CLAMP(lvl, 0, (int) texture->last_level);
-         }
-         else {
-            /* Linear interpolation between mipmap levels */
-            const int lvl = (int) lambda;
-            *level0 = CLAMP(lvl,     0, (int) texture->last_level);
-            *level1 = CLAMP(lvl + 1, 0, (int) texture->last_level);
-            *levelBlend = FRAC(lambda);  /* blending weight between levels */
-         }
-      }
-   }
-}
-
-
-/**
- * Get a texel from a texture, using the texture tile cache.
- *
- * \param face  the cube face in 0..5
- * \param level  the mipmap level
- * \param x  the x coord of texel within 2D image
- * \param y  the y coord of texel within 2D image
- * \param z  which slice of a 3D texture
- * \param rgba  the quad to put the texel/color into
- * \param j  which element of the rgba quad to write to
- *
- * XXX maybe move this into lp_tile_cache.c and merge with the
- * lp_get_cached_tile_tex() function.  Also, get 4 texels instead of 1...
- */
-static void
-get_texel_quad_2d(const struct tgsi_sampler *tgsi_sampler,
-                  unsigned face, unsigned level, int x, int y, 
-                  const uint8_t *out[4])
-{
-   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-
-   const struct llvmpipe_cached_tex_tile *tile
-      = lp_get_cached_tex_tile(samp->cache,
-                               tex_tile_address(x, y, 0, face, level));
-
-   y %= TEX_TILE_SIZE;
-   x %= TEX_TILE_SIZE;
-      
-   out[0] = &tile->color[y  ][x  ][0];
-   out[1] = &tile->color[y  ][x+1][0];
-   out[2] = &tile->color[y+1][x  ][0];
-   out[3] = &tile->color[y+1][x+1][0];
-}
-
-static INLINE const uint8_t *
-get_texel_2d_ptr(const struct tgsi_sampler *tgsi_sampler,
-                 unsigned face, unsigned level, int x, int y)
-{
-   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-
-   const struct llvmpipe_cached_tex_tile *tile
-      = lp_get_cached_tex_tile(samp->cache,
-                               tex_tile_address(x, y, 0, face, level));
-
-   y %= TEX_TILE_SIZE;
-   x %= TEX_TILE_SIZE;
-
-   return &tile->color[y][x][0];
-}
-
-
-static void
-get_texel_quad_2d_mt(const struct tgsi_sampler *tgsi_sampler,
-                     unsigned face, unsigned level, 
-                     int x0, int y0, 
-                     int x1, int y1,
-                     const uint8_t *out[4])
-{
-   unsigned i;
-
-   for (i = 0; i < 4; i++) {
-      unsigned tx = (i & 1) ? x1 : x0;
-      unsigned ty = (i >> 1) ? y1 : y0;
-
-      out[i] = get_texel_2d_ptr( tgsi_sampler, face, level, tx, ty );
-   }
-}
-
-static void
-get_texel(const struct tgsi_sampler *tgsi_sampler,
-                 unsigned face, unsigned level, int x, int y, int z,
-                 float rgba[NUM_CHANNELS][QUAD_SIZE], unsigned j)
-{
-   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-   const struct pipe_texture *texture = samp->texture;
-   const struct pipe_sampler_state *sampler = samp->sampler;
-
-   if (x < 0 || x >= (int) u_minify(texture->width0, level) ||
-       y < 0 || y >= (int) u_minify(texture->height0, level) ||
-       z < 0 || z >= (int) u_minify(texture->depth0, level)) {
-      rgba[0][j] = sampler->border_color[0];
-      rgba[1][j] = sampler->border_color[1];
-      rgba[2][j] = sampler->border_color[2];
-      rgba[3][j] = sampler->border_color[3];
-   }
-   else {
-      const unsigned tx = x % TEX_TILE_SIZE;
-      const unsigned ty = y % TEX_TILE_SIZE;
-      const struct llvmpipe_cached_tex_tile *tile;
-
-      tile = lp_get_cached_tex_tile(samp->cache,
-                                    tex_tile_address(x, y, z, face, level));
-
-      rgba[0][j] = ubyte_to_float(tile->color[ty][tx][0]);
-      rgba[1][j] = ubyte_to_float(tile->color[ty][tx][1]);
-      rgba[2][j] = ubyte_to_float(tile->color[ty][tx][2]);
-      rgba[3][j] = ubyte_to_float(tile->color[ty][tx][3]);
-      if (0)
-      {
-         debug_printf("Get texel %f %f %f %f from %s\n",
-                      rgba[0][j], rgba[1][j], rgba[2][j], rgba[3][j],
-                      pf_name(texture->format));
-      }
-   }
-}
-
-
-/**
- * Compare texcoord 'p' (aka R) against texture value 'rgba[0]'
- * When we sampled the depth texture, the depth value was put into all
- * RGBA channels.  We look at the red channel here.
- * \param rgba  quad of (depth) texel values
- * \param p  texture 'P' components for four pixels in quad
- * \param j  which pixel in the quad to test [0..3]
- */
-static INLINE void
-shadow_compare(const struct pipe_sampler_state *sampler,
-               float rgba[NUM_CHANNELS][QUAD_SIZE],
-               const float p[QUAD_SIZE],
-               uint j)
-{
-   int k;
-   switch (sampler->compare_func) {
-   case PIPE_FUNC_LESS:
-      k = p[j] < rgba[0][j];
-      break;
-   case PIPE_FUNC_LEQUAL:
-      k = p[j] <= rgba[0][j];
-      break;
-   case PIPE_FUNC_GREATER:
-      k = p[j] > rgba[0][j];
-      break;
-   case PIPE_FUNC_GEQUAL:
-      k = p[j] >= rgba[0][j];
-      break;
-   case PIPE_FUNC_EQUAL:
-      k = p[j] == rgba[0][j];
-      break;
-   case PIPE_FUNC_NOTEQUAL:
-      k = p[j] != rgba[0][j];
-      break;
-   case PIPE_FUNC_ALWAYS:
-      k = 1;
-      break;
-   case PIPE_FUNC_NEVER:
-      k = 0;
-      break;
-   default:
-      k = 0;
-      assert(0);
-      break;
-   }
-
-   /* XXX returning result for default GL_DEPTH_TEXTURE_MODE = GL_LUMINANCE */
-   rgba[0][j] = rgba[1][j] = rgba[2][j] = (float) k;
-   rgba[3][j] = 1.0F;
-}
-
-
-/**
- * As above, but do four z/texture comparisons.
- */
-static INLINE void
-shadow_compare4(const struct pipe_sampler_state *sampler,
-                float rgba[NUM_CHANNELS][QUAD_SIZE],
-                const float p[QUAD_SIZE])
-{
-   int j, k0, k1, k2, k3;
-   float val;
-
-   /* compare four texcoords vs. four texture samples */
-   switch (sampler->compare_func) {
-   case PIPE_FUNC_LESS:
-      k0 = p[0] < rgba[0][0];
-      k1 = p[1] < rgba[0][1];
-      k2 = p[2] < rgba[0][2];
-      k3 = p[3] < rgba[0][3];
-      break;
-   case PIPE_FUNC_LEQUAL:
-      k0 = p[0] <= rgba[0][0];
-      k1 = p[1] <= rgba[0][1];
-      k2 = p[2] <= rgba[0][2];
-      k3 = p[3] <= rgba[0][3];
-      break;
-   case PIPE_FUNC_GREATER:
-      k0 = p[0] > rgba[0][0];
-      k1 = p[1] > rgba[0][1];
-      k2 = p[2] > rgba[0][2];
-      k3 = p[3] > rgba[0][3];
-      break;
-   case PIPE_FUNC_GEQUAL:
-      k0 = p[0] >= rgba[0][0];
-      k1 = p[1] >= rgba[0][1];
-      k2 = p[2] >= rgba[0][2];
-      k3 = p[3] >= rgba[0][3];
-      break;
-   case PIPE_FUNC_EQUAL:
-      k0 = p[0] == rgba[0][0];
-      k1 = p[1] == rgba[0][1];
-      k2 = p[2] == rgba[0][2];
-      k3 = p[3] == rgba[0][3];
-      break;
-   case PIPE_FUNC_NOTEQUAL:
-      k0 = p[0] != rgba[0][0];
-      k1 = p[1] != rgba[0][1];
-      k2 = p[2] != rgba[0][2];
-      k3 = p[3] != rgba[0][3];
-      break;
-   case PIPE_FUNC_ALWAYS:
-      k0 = k1 = k2 = k3 = 1;
-      break;
-   case PIPE_FUNC_NEVER:
-      k0 = k1 = k2 = k3 = 0;
-      break;
-   default:
-      k0 = k1 = k2 = k3 = 0;
-      assert(0);
-      break;
-   }
-
-   /* convert four pass/fail values to an intensity in [0,1] */
-   val = 0.25F * (k0 + k1 + k2 + k3);
-
-   /* XXX returning result for default GL_DEPTH_TEXTURE_MODE = GL_LUMINANCE */
-   for (j = 0; j < 4; j++) {
-      rgba[0][j] = rgba[1][j] = rgba[2][j] = val;
-      rgba[3][j] = 1.0F;
-   }
-}
-
-
-
-static void
-lp_get_samples_2d_linear_repeat_POT(struct tgsi_sampler *tgsi_sampler,
-                                    const float s[QUAD_SIZE],
-                                    const float t[QUAD_SIZE],
-                                    const float p[QUAD_SIZE],
-                                    float lodbias,
-                                    float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
-   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-   unsigned  j;
-   unsigned level = samp->level;
-   unsigned xpot = 1 << (samp->xpot - level);
-   unsigned ypot = 1 << (samp->ypot - level);
-   unsigned xmax = (xpot - 1) & (TEX_TILE_SIZE - 1); /* MIN2(TEX_TILE_SIZE, xpot) - 1; */
-   unsigned ymax = (ypot - 1) & (TEX_TILE_SIZE - 1); /* MIN2(TEX_TILE_SIZE, ypot) - 1; */
-      
-   for (j = 0; j < QUAD_SIZE; j++) {
-      int c;
-
-      float u = s[j] * xpot - 0.5F;
-      float v = t[j] * ypot - 0.5F;
-
-      int uflr = util_ifloor(u);
-      int vflr = util_ifloor(v);
-
-      float xw = u - (float)uflr;
-      float yw = v - (float)vflr;
-
-      int x0 = uflr & (xpot - 1);
-      int y0 = vflr & (ypot - 1);
-
-      const uint8_t *tx[4];
-      
-
-      /* Can we fetch all four at once:
-       */
-      if (x0 < xmax && y0 < ymax)
-      {
-         get_texel_quad_2d(tgsi_sampler, 0, level, x0, y0, tx);
-      }
-      else 
-      {
-         unsigned x1 = (x0 + 1) & (xpot - 1);
-         unsigned y1 = (y0 + 1) & (ypot - 1);
-         get_texel_quad_2d_mt(tgsi_sampler, 0, level, 
-                              x0, y0, x1, y1, tx);
-      }
-
-
-      /* interpolate R, G, B, A */
-      for (c = 0; c < 4; c++) {
-         rgba[c][j] = lerp_2d(xw, yw, 
-                              ubyte_to_float(tx[0][c]), ubyte_to_float(tx[1][c]),
-                              ubyte_to_float(tx[2][c]), ubyte_to_float(tx[3][c]));
-      }
-   }
-}
-
-
-static void
-lp_get_samples_2d_nearest_repeat_POT(struct tgsi_sampler *tgsi_sampler,
-                                     const float s[QUAD_SIZE],
-                                     const float t[QUAD_SIZE],
-                                     const float p[QUAD_SIZE],
-                                     float lodbias,
-                                     float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
-   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-   unsigned  j;
-   unsigned level = samp->level;
-   unsigned xpot = 1 << (samp->xpot - level);
-   unsigned ypot = 1 << (samp->ypot - level);
-
-   for (j = 0; j < QUAD_SIZE; j++) {
-      int c;
-
-      float u = s[j] * xpot;
-      float v = t[j] * ypot;
-
-      int uflr = util_ifloor(u);
-      int vflr = util_ifloor(v);
-
-      int x0 = uflr & (xpot - 1);
-      int y0 = vflr & (ypot - 1);
-
-      const uint8_t *out = get_texel_2d_ptr(tgsi_sampler, 0, level, x0, y0);
-
-      for (c = 0; c < 4; c++) {
-         rgba[c][j] = ubyte_to_float(out[c]);
-      }
-   }
-}
-
-
-static void
-lp_get_samples_2d_nearest_clamp_POT(struct tgsi_sampler *tgsi_sampler,
-                                     const float s[QUAD_SIZE],
-                                     const float t[QUAD_SIZE],
-                                     const float p[QUAD_SIZE],
-                                     float lodbias,
-                                     float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
-   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-   unsigned  j;
-   unsigned level = samp->level;
-   unsigned xpot = 1 << (samp->xpot - level);
-   unsigned ypot = 1 << (samp->ypot - level);
-
-   for (j = 0; j < QUAD_SIZE; j++) {
-      int c;
-
-      float u = s[j] * xpot;
-      float v = t[j] * ypot;
-
-      int x0, y0;
-      const uint8_t *out;
-
-      x0 = util_ifloor(u);
-      if (x0 < 0) 
-         x0 = 0;
-      else if (x0 > xpot - 1)
-         x0 = xpot - 1;
-
-      y0 = util_ifloor(v);
-      if (y0 < 0) 
-         y0 = 0;
-      else if (y0 > ypot - 1)
-         y0 = ypot - 1;
-      
-      out = get_texel_2d_ptr(tgsi_sampler, 0, level, x0, y0);
-
-      for (c = 0; c < 4; c++) {
-         rgba[c][j] = ubyte_to_float(out[c]);
-      }
-   }
-}
-
-
-static void
-lp_get_samples_2d_linear_mip_linear_repeat_POT(struct tgsi_sampler *tgsi_sampler,
-                                               const float s[QUAD_SIZE],
-                                               const float t[QUAD_SIZE],
-                                               const float p[QUAD_SIZE],
-                                               float lodbias,
-                                               float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
-   struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-   const struct pipe_texture *texture = samp->texture;
-   int level0;
-   float lambda;
-
-   lambda = compute_lambda(tgsi_sampler, s, t, p, lodbias);
-   level0 = (int)lambda;
-
-   if (lambda < 0.0) { 
-      samp->level = 0;
-      lp_get_samples_2d_linear_repeat_POT( tgsi_sampler,
-                                           s, t, p, 0, rgba );
-   }
-   else if (level0 >= texture->last_level) {
-      samp->level = texture->last_level;
-      lp_get_samples_2d_linear_repeat_POT( tgsi_sampler,
-                                           s, t, p, 0, rgba );
-   }
-   else {
-      float levelBlend = lambda - level0;
-      float rgba0[4][4];
-      float rgba1[4][4];
-      int c,j;
-
-      samp->level = level0;
-      lp_get_samples_2d_linear_repeat_POT( tgsi_sampler,
-                                           s, t, p, 0, rgba0 );
-
-      samp->level = level0+1;
-      lp_get_samples_2d_linear_repeat_POT( tgsi_sampler,
-                                           s, t, p, 0, rgba1 );
-
-      for (j = 0; j < QUAD_SIZE; j++) {
-         for (c = 0; c < 4; c++) {
-            rgba[c][j] = lerp(levelBlend, rgba0[c][j], rgba1[c][j]);
-         }
-      }
-   }
-}
-
-/**
- * Common code for sampling 1D/2D/cube textures.
- * Could probably extend for 3D...
- */
-static void
-lp_get_samples_2d_common(struct tgsi_sampler *tgsi_sampler,
-                         const float s[QUAD_SIZE],
-                         const float t[QUAD_SIZE],
-                         const float p[QUAD_SIZE],
-                         float lodbias,
-                         float rgba[NUM_CHANNELS][QUAD_SIZE],
-                         const unsigned faces[4])
-{
-   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-   const struct pipe_texture *texture = samp->texture;
-   const struct pipe_sampler_state *sampler = samp->sampler;
-   unsigned level0, level1, j, imgFilter;
-   int width, height;
-   float levelBlend = 0.0f;
-
-   choose_mipmap_levels(tgsi_sampler, s, t, p, 
-                        lodbias,
-                        &level0, &level1, &levelBlend, &imgFilter);
-
-   assert(sampler->normalized_coords);
-
-   width = u_minify(texture->width0, level0);
-   height = u_minify(texture->height0, level0);
-
-   assert(width > 0);
-
-   switch (imgFilter) {
-   case PIPE_TEX_FILTER_NEAREST:
-      {
-         int x[4], y[4];
-         nearest_texcoord_4(sampler->wrap_s, s, width, x);
-         nearest_texcoord_4(sampler->wrap_t, t, height, y);
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            get_texel(tgsi_sampler, faces[j], level0, x[j], y[j], 0, rgba, j);
-            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-               shadow_compare(sampler, rgba, p, j);
-            }
-
-            if (level0 != level1) {
-               /* get texels from second mipmap level and blend */
-               float rgba2[4][4];
-               unsigned c;
-               x[j] /= 2;
-               y[j] /= 2;
-               get_texel(tgsi_sampler, faces[j], level1, x[j], y[j], 0,
-                         rgba2, j);
-               if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){
-                  shadow_compare(sampler, rgba2, p, j);
-               }
-
-               for (c = 0; c < NUM_CHANNELS; c++) {
-                  rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]);
-               }
-            }
-         }
-      }
-      break;
-   case PIPE_TEX_FILTER_LINEAR:
-   case PIPE_TEX_FILTER_ANISO:
-      {
-         int x0[4], y0[4], x1[4], y1[4];
-         float xw[4], yw[4]; /* weights */
-
-         linear_texcoord_4(sampler->wrap_s, s, width, x0, x1, xw);
-         linear_texcoord_4(sampler->wrap_t, t, height, y0, y1, yw);
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            float tx[4][4]; /* texels */
-            int c;
-            get_texel(tgsi_sampler, faces[j], level0, x0[j], y0[j], 0, tx, 0);
-            get_texel(tgsi_sampler, faces[j], level0, x1[j], y0[j], 0, tx, 1);
-            get_texel(tgsi_sampler, faces[j], level0, x0[j], y1[j], 0, tx, 2);
-            get_texel(tgsi_sampler, faces[j], level0, x1[j], y1[j], 0, tx, 3);
-            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-               shadow_compare4(sampler, tx, p);
-            }
-
-            /* interpolate R, G, B, A */
-            for (c = 0; c < 4; c++) {
-               rgba[c][j] = lerp_2d(xw[j], yw[j],
-                                    tx[c][0], tx[c][1],
-                                    tx[c][2], tx[c][3]);
-            }
-
-            if (level0 != level1) {
-               /* get texels from second mipmap level and blend */
-               float rgba2[4][4];
-
-               /* XXX: This is incorrect -- will often end up with (x0
-                *  == x1 && y0 == y1), meaning that we fetch the same
-                *  texel four times and linearly interpolate between
-                *  identical values.  The correct approach would be to
-                *  call linear_texcoord again for the second level.
-                */
-               x0[j] /= 2;
-               y0[j] /= 2;
-               x1[j] /= 2;
-               y1[j] /= 2;
-               get_texel(tgsi_sampler, faces[j], level1, x0[j], y0[j], 0, tx, 0);
-               get_texel(tgsi_sampler, faces[j], level1, x1[j], y0[j], 0, tx, 1);
-               get_texel(tgsi_sampler, faces[j], level1, x0[j], y1[j], 0, tx, 2);
-               get_texel(tgsi_sampler, faces[j], level1, x1[j], y1[j], 0, tx, 3);
-               if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){
-                  shadow_compare4(sampler, tx, p);
-               }
-
-               /* interpolate R, G, B, A */
-               for (c = 0; c < 4; c++) {
-                  rgba2[c][j] = lerp_2d(xw[j], yw[j],
-                                        tx[c][0], tx[c][1], tx[c][2], tx[c][3]);
-               }
-
-               for (c = 0; c < NUM_CHANNELS; c++) {
-                  rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]);
-               }
-            }
-         }
-      }
-      break;
-   default:
-      assert(0);
-   }
-}
-
-
-static INLINE void
-lp_get_samples_1d(struct tgsi_sampler *sampler,
-                  const float s[QUAD_SIZE],
-                  const float t[QUAD_SIZE],
-                  const float p[QUAD_SIZE],
-                  float lodbias,
-                  float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
-   static const unsigned faces[4] = {0, 0, 0, 0};
-   static const float tzero[4] = {0, 0, 0, 0};
-   lp_get_samples_2d_common(sampler, s, tzero, NULL,
-                            lodbias, rgba, faces);
-}
-
-
-static INLINE void
-lp_get_samples_2d(struct tgsi_sampler *sampler,
-                  const float s[QUAD_SIZE],
-                  const float t[QUAD_SIZE],
-                  const float p[QUAD_SIZE],
-                  float lodbias,
-                  float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
-   static const unsigned faces[4] = {0, 0, 0, 0};
-   lp_get_samples_2d_common(sampler, s, t, p,
-                            lodbias, rgba, faces);
-}
-
-
-static INLINE void
-lp_get_samples_3d(struct tgsi_sampler *tgsi_sampler,
-                  const float s[QUAD_SIZE],
-                  const float t[QUAD_SIZE],
-                  const float p[QUAD_SIZE],
-                  float lodbias,
-                  float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
-   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-   const struct pipe_texture *texture = samp->texture;
-   const struct pipe_sampler_state *sampler = samp->sampler;
-   /* get/map pipe_surfaces corresponding to 3D tex slices */
-   unsigned level0, level1, j, imgFilter;
-   int width, height, depth;
-   float levelBlend = 0.0f;
-   const uint face = 0;
-
-   choose_mipmap_levels(tgsi_sampler, s, t, p, 
-                        lodbias,
-                        &level0, &level1, &levelBlend, &imgFilter);
-
-   assert(sampler->normalized_coords);
-
-   width = u_minify(texture->width0, level0);
-   height = u_minify(texture->height0, level0);
-   depth = u_minify(texture->depth0, level0);
-
-   assert(width > 0);
-   assert(height > 0);
-   assert(depth > 0);
-
-   switch (imgFilter) {
-   case PIPE_TEX_FILTER_NEAREST:
-      {
-         int x[4], y[4], z[4];
-         nearest_texcoord_4(sampler->wrap_s, s, width, x);
-         nearest_texcoord_4(sampler->wrap_t, t, height, y);
-         nearest_texcoord_4(sampler->wrap_r, p, depth, z);
-         for (j = 0; j < QUAD_SIZE; j++) {
-            get_texel(tgsi_sampler, face, level0, x[j], y[j], z[j], rgba, j);
-            if (level0 != level1) {
-               /* get texels from second mipmap level and blend */
-               float rgba2[4][4];
-               unsigned c;
-               x[j] /= 2;
-               y[j] /= 2;
-               z[j] /= 2;
-               get_texel(tgsi_sampler, face, level1, x[j], y[j], z[j], rgba2, j);
-               for (c = 0; c < NUM_CHANNELS; c++) {
-                  rgba[c][j] = lerp(levelBlend, rgba2[c][j], rgba[c][j]);
-               }
-            }
-         }
-      }
-      break;
-   case PIPE_TEX_FILTER_LINEAR:
-   case PIPE_TEX_FILTER_ANISO:
-      {
-         int x0[4], x1[4], y0[4], y1[4], z0[4], z1[4];
-         float xw[4], yw[4], zw[4]; /* interpolation weights */
-         linear_texcoord_4(sampler->wrap_s, s, width,  x0, x1, xw);
-         linear_texcoord_4(sampler->wrap_t, t, height, y0, y1, yw);
-         linear_texcoord_4(sampler->wrap_r, p, depth,  z0, z1, zw);
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int c;
-            float tx0[4][4], tx1[4][4];
-            get_texel(tgsi_sampler, face, level0, x0[j], y0[j], z0[j], tx0, 0);
-            get_texel(tgsi_sampler, face, level0, x1[j], y0[j], z0[j], tx0, 1);
-            get_texel(tgsi_sampler, face, level0, x0[j], y1[j], z0[j], tx0, 2);
-            get_texel(tgsi_sampler, face, level0, x1[j], y1[j], z0[j], tx0, 3);
-            get_texel(tgsi_sampler, face, level0, x0[j], y0[j], z1[j], tx1, 0);
-            get_texel(tgsi_sampler, face, level0, x1[j], y0[j], z1[j], tx1, 1);
-            get_texel(tgsi_sampler, face, level0, x0[j], y1[j], z1[j], tx1, 2);
-            get_texel(tgsi_sampler, face, level0, x1[j], y1[j], z1[j], tx1, 3);
-
-            /* interpolate R, G, B, A */
-            for (c = 0; c < 4; c++) {
-               rgba[c][j] = lerp_3d(xw[j], yw[j], zw[j],
-                                    tx0[c][0], tx0[c][1],
-                                    tx0[c][2], tx0[c][3],
-                                    tx1[c][0], tx1[c][1],
-                                    tx1[c][2], tx1[c][3]);
-            }
-
-            if (level0 != level1) {
-               /* get texels from second mipmap level and blend */
-               float rgba2[4][4];
-               x0[j] /= 2;
-               y0[j] /= 2;
-               z0[j] /= 2;
-               x1[j] /= 2;
-               y1[j] /= 2;
-               z1[j] /= 2;
-               get_texel(tgsi_sampler, face, level1, x0[j], y0[j], z0[j], tx0, 0);
-               get_texel(tgsi_sampler, face, level1, x1[j], y0[j], z0[j], tx0, 1);
-               get_texel(tgsi_sampler, face, level1, x0[j], y1[j], z0[j], tx0, 2);
-               get_texel(tgsi_sampler, face, level1, x1[j], y1[j], z0[j], tx0, 3);
-               get_texel(tgsi_sampler, face, level1, x0[j], y0[j], z1[j], tx1, 0);
-               get_texel(tgsi_sampler, face, level1, x1[j], y0[j], z1[j], tx1, 1);
-               get_texel(tgsi_sampler, face, level1, x0[j], y1[j], z1[j], tx1, 2);
-               get_texel(tgsi_sampler, face, level1, x1[j], y1[j], z1[j], tx1, 3);
-
-               /* interpolate R, G, B, A */
-               for (c = 0; c < 4; c++) {
-                  rgba2[c][j] = lerp_3d(xw[j], yw[j], zw[j],
-                                        tx0[c][0], tx0[c][1],
-                                        tx0[c][2], tx0[c][3],
-                                        tx1[c][0], tx1[c][1],
-                                        tx1[c][2], tx1[c][3]);
-               }
-
-               /* blend mipmap levels */
-               for (c = 0; c < NUM_CHANNELS; c++) {
-                  rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]);
-               }
-            }
-         }
-      }
-      break;
-   default:
-      assert(0);
-   }
-}
-
-
-static void
-lp_get_samples_cube(struct tgsi_sampler *sampler,
-                    const float s[QUAD_SIZE],
-                    const float t[QUAD_SIZE],
-                    const float p[QUAD_SIZE],
-                    float lodbias,
-                    float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
-   unsigned faces[QUAD_SIZE], j;
-   float ssss[4], tttt[4];
-   for (j = 0; j < QUAD_SIZE; j++) {
-      faces[j] = choose_cube_face(s[j], t[j], p[j], ssss + j, tttt + j);
-   }
-   lp_get_samples_2d_common(sampler, ssss, tttt, NULL,
-                            lodbias, rgba, faces);
-}
-
-
-static void
-lp_get_samples_rect(struct tgsi_sampler *tgsi_sampler,
-                    const float s[QUAD_SIZE],
-                    const float t[QUAD_SIZE],
-                    const float p[QUAD_SIZE],
-                    float lodbias,
-                    float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
-   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-   const struct pipe_texture *texture = samp->texture;
-   const struct pipe_sampler_state *sampler = samp->sampler;
-   const uint face = 0;
-   unsigned level0, level1, j, imgFilter;
-   int width, height;
-   float levelBlend;
-
-   choose_mipmap_levels(tgsi_sampler, s, t, p, 
-                        lodbias,
-                        &level0, &level1, &levelBlend, &imgFilter);
-
-   /* texture RECTS cannot be mipmapped */
-   assert(level0 == level1);
-
-   width = u_minify(texture->width0, level0);
-   height = u_minify(texture->height0, level0);
-
-   assert(width > 0);
-
-   switch (imgFilter) {
-   case PIPE_TEX_FILTER_NEAREST:
-      {
-         int x[4], y[4];
-         nearest_texcoord_unnorm_4(sampler->wrap_s, s, width, x);
-         nearest_texcoord_unnorm_4(sampler->wrap_t, t, height, y);
-         for (j = 0; j < QUAD_SIZE; j++) {
-            get_texel(tgsi_sampler, face, level0, x[j], y[j], 0, rgba, j);
-            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-               shadow_compare(sampler, rgba, p, j);
-            }
-         }
-      }
-      break;
-   case PIPE_TEX_FILTER_LINEAR:
-   case PIPE_TEX_FILTER_ANISO:
-      {
-         int x0[4], y0[4], x1[4], y1[4];
-         float xw[4], yw[4]; /* weights */
-         linear_texcoord_unnorm_4(sampler->wrap_s, s, width,  x0, x1, xw);
-         linear_texcoord_unnorm_4(sampler->wrap_t, t, height, y0, y1, yw);
-         for (j = 0; j < QUAD_SIZE; j++) {
-            float tx[4][4]; /* texels */
-            int c;
-            get_texel(tgsi_sampler, face, level0, x0[j], y0[j], 0, tx, 0);
-            get_texel(tgsi_sampler, face, level0, x1[j], y0[j], 0, tx, 1);
-            get_texel(tgsi_sampler, face, level0, x0[j], y1[j], 0, tx, 2);
-            get_texel(tgsi_sampler, face, level0, x1[j], y1[j], 0, tx, 3);
-            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-               shadow_compare4(sampler, tx, p);
-            }
-            for (c = 0; c < 4; c++) {
-               rgba[c][j] = lerp_2d(xw[j], yw[j],
-                                    tx[c][0], tx[c][1], tx[c][2], tx[c][3]);
-            }
-         }
-      }
-      break;
-   default:
-      assert(0);
-   }
-}
-
-
-/**
- * Error condition handler
- */
-static INLINE void
-lp_get_samples_null(struct tgsi_sampler *tgsi_sampler,
-                    const float s[QUAD_SIZE],
-                    const float t[QUAD_SIZE],
-                    const float p[QUAD_SIZE],
-                    float lodbias,
-                    float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
-   int i,j;
-
-   for (i = 0; i < 4; i++)
-      for (j = 0; j < 4; j++)
-         rgba[i][j] = 1.0;
-}
-
-/**
- * Called via tgsi_sampler::get_samples() when using a sampler for the
- * first time.  Determine the actual sampler function, link it in and
- * call it.
- */
-void
-lp_get_samples(struct tgsi_sampler *tgsi_sampler,
-               const float s[QUAD_SIZE],
-               const float t[QUAD_SIZE],
-               const float p[QUAD_SIZE],
-               float lodbias,
-               float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
-   struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-   const struct pipe_texture *texture = samp->texture;
-   const struct pipe_sampler_state *sampler = samp->sampler;
-
-   /* Default to the 'undefined' case:
-    */
-   tgsi_sampler->get_samples = lp_get_samples_null;
-
-   if (!texture) {
-      assert(0);                /* is this legal?? */
-      goto out;
-   }
-
-   if (!sampler->normalized_coords) {
-      assert (texture->target == PIPE_TEXTURE_2D);
-      tgsi_sampler->get_samples = lp_get_samples_rect;
-      goto out;
-   }
-
-   switch (texture->target) {
-   case PIPE_TEXTURE_1D:
-      tgsi_sampler->get_samples = lp_get_samples_1d;
-      break;
-   case PIPE_TEXTURE_2D:
-      tgsi_sampler->get_samples = lp_get_samples_2d;
-      break;
-   case PIPE_TEXTURE_3D:
-      tgsi_sampler->get_samples = lp_get_samples_3d;
-      break;
-   case PIPE_TEXTURE_CUBE:
-      tgsi_sampler->get_samples = lp_get_samples_cube;
-      break;
-   default:
-      assert(0);
-      break;
-   }
-
-   /* Do this elsewhere: 
-    */
-   samp->xpot = util_unsigned_logbase2( samp->texture->width0 );
-   samp->ypot = util_unsigned_logbase2( samp->texture->height0 );
-
-   /* Try to hook in a faster sampler.  Ultimately we'll have to
-    * code-generate these.  Luckily most of this looks like it is
-    * orthogonal state within the sampler.
-    */
-   if (texture->target == PIPE_TEXTURE_2D &&
-       sampler->min_img_filter == sampler->mag_img_filter &&
-       sampler->wrap_s == sampler->wrap_t &&
-       sampler->compare_mode == FALSE &&
-       sampler->normalized_coords) 
-   {
-      if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) {
-         samp->level = CLAMP((int) sampler->min_lod,
-                             0, (int) texture->last_level);
-
-         if (sampler->wrap_s == PIPE_TEX_WRAP_REPEAT) {
-            switch (sampler->min_img_filter) {
-            case PIPE_TEX_FILTER_NEAREST:
-               tgsi_sampler->get_samples = lp_get_samples_2d_nearest_repeat_POT;
-               break;
-            case PIPE_TEX_FILTER_LINEAR:
-               tgsi_sampler->get_samples = lp_get_samples_2d_linear_repeat_POT;
-               break;
-            default:
-               break;
-            }
-         } 
-         else if (sampler->wrap_s == PIPE_TEX_WRAP_CLAMP) {
-            switch (sampler->min_img_filter) {
-            case PIPE_TEX_FILTER_NEAREST:
-               tgsi_sampler->get_samples = lp_get_samples_2d_nearest_clamp_POT;
-               break;
-            default:
-               break;
-            }
-         }
-      }
-      else if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
-         if (sampler->wrap_s == PIPE_TEX_WRAP_REPEAT) {
-            switch (sampler->min_img_filter) {
-            case PIPE_TEX_FILTER_LINEAR:
-               tgsi_sampler->get_samples = lp_get_samples_2d_linear_mip_linear_repeat_POT;
-               break;
-            default:
-               break;
-            }
-         } 
-      }
-   }
-   else if (0) {
-      _debug_printf("target %d/%d min_mip %d/%d min_img %d/%d wrap %d/%d compare %d/%d norm %d/%d\n",
-                    texture->target, PIPE_TEXTURE_2D,
-                    sampler->min_mip_filter, PIPE_TEX_MIPFILTER_NONE,
-                    sampler->min_img_filter, sampler->mag_img_filter,
-                    sampler->wrap_s, sampler->wrap_t,
-                    sampler->compare_mode, FALSE,
-                    sampler->normalized_coords, TRUE);
-   }
-
-out:
-   tgsi_sampler->get_samples( tgsi_sampler, s, t, p, lodbias, rgba );
-}
-
-
-void PIPE_CDECL
-lp_fetch_texel_soa( struct tgsi_sampler **samplers,
-                    uint32_t unit,
-                    float *store )
-{
-   struct tgsi_sampler *sampler = samplers[unit];
-
-#if 0
-   uint j;
-
-   debug_printf("%s sampler: %p (%p) store: %p\n",
-                __FUNCTION__,
-                sampler, *sampler,
-                store );
-
-   debug_printf("lodbias %f\n", store[12]);
-
-   for (j = 0; j < 4; j++)
-      debug_printf("sample %d texcoord %f %f\n",
-                   j,
-                   store[0+j],
-                   store[4+j]);
-#endif
-
-   {
-      float rgba[NUM_CHANNELS][QUAD_SIZE];
-      sampler->get_samples(sampler,
-                           &store[0],
-                           &store[4],
-                           &store[8],
-                           0.0f, /*store[12],  lodbias */
-                           rgba);
-      memcpy(store, rgba, sizeof rgba);
-   }
-
-#if 0
-   for (j = 0; j < 4; j++)
-      debug_printf("sample %d result %f %f %f %f\n",
-                   j,
-                   store[0+j],
-                   store[4+j],
-                   store[8+j],
-                   store[12+j]);
-#endif
-}
-
-
-#include "lp_bld_type.h"
-#include "lp_bld_intr.h"
-#include "lp_bld_tgsi.h"
-
-
-struct lp_c_sampler_soa
-{
-   struct lp_build_sampler_soa base;
-
-   LLVMValueRef context_ptr;
-
-   LLVMValueRef samplers_ptr;
-
-   /** Coords/texels store */
-   LLVMValueRef store_ptr;
-};
-
-
-static void
-lp_c_sampler_soa_destroy(struct lp_build_sampler_soa *sampler)
-{
-   FREE(sampler);
-}
-
-
-static void
-lp_c_sampler_soa_emit_fetch_texel(struct lp_build_sampler_soa *_sampler,
-                                  LLVMBuilderRef builder,
-                                  struct lp_type type,
-                                  unsigned unit,
-                                  unsigned num_coords,
-                                  const LLVMValueRef *coords,
-                                  LLVMValueRef lodbias,
-                                  LLVMValueRef *texel)
-{
-   struct lp_c_sampler_soa *sampler = (struct lp_c_sampler_soa *)_sampler;
-   LLVMTypeRef vec_type = LLVMTypeOf(coords[0]);
-   LLVMValueRef args[3];
-   unsigned i;
-
-   if(!sampler->samplers_ptr)
-      sampler->samplers_ptr = lp_jit_context_samplers(builder, sampler->context_ptr);
-
-   if(!sampler->store_ptr)
-      sampler->store_ptr = LLVMBuildArrayAlloca(builder,
-                                            vec_type,
-                                            LLVMConstInt(LLVMInt32Type(), 4, 0),
-                                            "texel_store");
-
-   for (i = 0; i < num_coords; i++) {
-      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-      LLVMValueRef coord_ptr = LLVMBuildGEP(builder, sampler->store_ptr, &index, 1, "");
-      LLVMBuildStore(builder, coords[i], coord_ptr);
-   }
-
-   args[0] = sampler->samplers_ptr;
-   args[1] = LLVMConstInt(LLVMInt32Type(), unit, 0);
-   args[2] = sampler->store_ptr;
-
-   lp_build_intrinsic(builder, "fetch_texel", LLVMVoidType(), args, 3);
-
-   for (i = 0; i < NUM_CHANNELS; ++i) {
-      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-      LLVMValueRef texel_ptr = LLVMBuildGEP(builder, sampler->store_ptr, &index, 1, "");
-      texel[i] = LLVMBuildLoad(builder, texel_ptr, "");
-   }
-}
-
-
-struct lp_build_sampler_soa *
-lp_c_sampler_soa_create(LLVMValueRef context_ptr)
-{
-   struct lp_c_sampler_soa *sampler;
-
-   sampler = CALLOC_STRUCT(lp_c_sampler_soa);
-   if(!sampler)
-      return NULL;
-
-   sampler->base.destroy = lp_c_sampler_soa_destroy;
-   sampler->base.emit_fetch_texel = lp_c_sampler_soa_emit_fetch_texel;
-   sampler->context_ptr = context_ptr;
-
-   return &sampler->base;
-}
-
diff --git a/src/gallium/drivers/nouveau/nouveau_push.h b/src/gallium/drivers/nouveau/nouveau_push.h
deleted file mode 100644
index 9c235080a5..0000000000
--- a/src/gallium/drivers/nouveau/nouveau_push.h
+++ /dev/null
@@ -1,93 +0,0 @@
-#ifndef __NOUVEAU_PUSH_H__
-#define __NOUVEAU_PUSH_H__
-
-#include "nouveau/nouveau_winsys.h"
-
-#ifndef NOUVEAU_PUSH_CONTEXT
-#error undefined push context
-#endif
-
-#define OUT_RING(data) do {                                                    \
-	NOUVEAU_PUSH_CONTEXT(pc);                                              \
-	(*pc->base.channel->pushbuf->cur++) = (data);                          \
-} while(0)
-
-#define OUT_RINGp(src,size) do {                                               \
-	NOUVEAU_PUSH_CONTEXT(pc);                                              \
-	memcpy(pc->base.channel->pushbuf->cur, (src), (size) * 4);             \
-	pc->base.channel->pushbuf->cur += (size);                              \
-} while(0)
-
-#define OUT_RINGf(data) do {                                                   \
-	union { float v; uint32_t u; } c;                                      \
-	c.v = (data);                                                          \
-	OUT_RING(c.u);                                                         \
-} while(0)
-
-#define BEGIN_RING(obj,mthd,size) do {                                         \
-	NOUVEAU_PUSH_CONTEXT(pc);                                              \
-	struct nouveau_channel *chan = pc->base.channel;                       \
-	if (chan->pushbuf->remaining < ((size) + 1))                           \
-		nouveau_pushbuf_flush(chan, ((size) + 1));                     \
-	OUT_RING((pc->obj->subc << 13) | ((size) << 18) | (mthd));             \
-	chan->pushbuf->remaining -= ((size) + 1);                              \
-} while(0)
-
-#define BEGIN_RING_NI(obj,mthd,size) do {                                      \
-	BEGIN_RING(obj, (mthd) | 0x40000000, (size));                          \
-} while(0)
-
-static inline void
-DO_FIRE_RING(struct nouveau_channel *chan, struct pipe_fence_handle **fence)
-{
-	nouveau_pushbuf_flush(chan, 0);
-	if (fence)
-		*fence = NULL;
-}
-
-#define FIRE_RING(fence) do {                                                  \
-	NOUVEAU_PUSH_CONTEXT(pc);                                              \
-	DO_FIRE_RING(pc->base.channel, fence);                                 \
-} while(0)
-
-#define OUT_RELOC(bo,data,flags,vor,tor) do {                                  \
-	NOUVEAU_PUSH_CONTEXT(pc);                                              \
-	struct nouveau_channel *chan = pc->base.channel;                       \
-	nouveau_pushbuf_emit_reloc(chan, chan->pushbuf->cur++, nouveau_bo(bo), \
-				   (data), 0, (flags), (vor), (tor));          \
-} while(0)
-
-/* Raw data + flags depending on FB/TT buffer */
-#define OUT_RELOCd(bo,data,flags,vor,tor) do {                                 \
-	OUT_RELOC((bo), (data), (flags) | NOUVEAU_BO_OR, (vor), (tor));        \
-} while(0)
-
-/* FB/TT object handle */
-#define OUT_RELOCo(bo,flags) do {                                              \
-	OUT_RELOC((bo), 0, (flags) | NOUVEAU_BO_OR,                            \
-		  pc->base.channel->vram->handle,                              \
-		  pc->base.channel->gart->handle);                             \
-} while(0)
-
-/* Low 32-bits of offset */
-#define OUT_RELOCl(bo,delta,flags) do {                                        \
-	OUT_RELOC((bo), (delta), (flags) | NOUVEAU_BO_LOW, 0, 0);              \
-} while(0)
-
-/* High 32-bits of offset */
-#define OUT_RELOCh(bo,delta,flags) do {                                        \
-	OUT_RELOC((bo), (delta), (flags) | NOUVEAU_BO_HIGH, 0, 0);             \
-} while(0)
-
-/* A reloc which'll recombine into a NV_DMA_METHOD packet header */
-#define OUT_RELOCm(bo, flags, obj, mthd, size) do {                            \
-	NOUVEAU_PUSH_CONTEXT(pc);                                              \
-	struct nouveau_channel *chan = pc->base.channel;                       \
-	if (chan->pushbuf->remaining < ((size) + 1))                           \
-		nouveau_pushbuf_flush(chan, ((size) + 1));                     \
-	OUT_RELOCd((bo), (pc->obj->subc << 13) | ((size) << 18) | (mthd),      \
-		   (flags), 0, 0);                                             \
-	chan->pushbuf->remaining -= ((size) + 1);                              \
-} while(0)
-
-#endif
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
index 0437af3725..7ebc94ed6c 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.c
+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
@@ -127,8 +127,18 @@ nouveau_screen_bo_map(struct pipe_screen *pscreen, struct pipe_buffer *pb,
 		      unsigned usage)
 {
 	struct nouveau_bo *bo = nouveau_bo(pb);
+	struct nouveau_screen *nscreen = nouveau_screen(pscreen);
 	int ret;
 
+	if (nscreen->pre_pipebuffer_map_callback) {
+		ret = nscreen->pre_pipebuffer_map_callback(pscreen, pb, usage);
+		if (ret) {
+			debug_printf("pre_pipebuffer_map_callback failed %d\n",
+				ret);
+			return NULL;
+		}
+	}
+
 	ret = nouveau_bo_map(bo, nouveau_screen_map_flags(usage));
 	if (ret) {
 		debug_printf("map failed: %d\n", ret);
@@ -143,11 +153,22 @@ nouveau_screen_bo_map_range(struct pipe_screen *pscreen, struct pipe_buffer *pb,
 			    unsigned offset, unsigned length, unsigned usage)
 {
 	struct nouveau_bo *bo = nouveau_bo(pb);
+	struct nouveau_screen *nscreen = nouveau_screen(pscreen);
 	uint32_t flags = nouveau_screen_map_flags(usage);
 	int ret;
 
+	if (nscreen->pre_pipebuffer_map_callback) {
+		ret = nscreen->pre_pipebuffer_map_callback(pscreen, pb, usage);
+		if (ret) {
+			debug_printf("pre_pipebuffer_map_callback failed %d\n",
+				ret);
+			return NULL;
+		}
+	}
+
 	ret = nouveau_bo_map_range(bo, offset, length, flags);
 	if (ret) {
+		nouveau_bo_unmap(bo);
 		if (!(flags & NOUVEAU_BO_NOWAIT) || ret != -EBUSY)
 			debug_printf("map_range failed: %d\n", ret);
 		return NULL;
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.h b/src/gallium/drivers/nouveau/nouveau_screen.h
index ebfc67ad1c..a7927d88df 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.h
+++ b/src/gallium/drivers/nouveau/nouveau_screen.h
@@ -5,6 +5,9 @@ struct nouveau_screen {
 	struct pipe_screen base;
 	struct nouveau_device *device;
 	struct nouveau_channel *channel;
+
+	int (*pre_pipebuffer_map_callback) (struct pipe_screen *pscreen,
+		struct pipe_buffer *pb, unsigned usage);
 };
 
 static inline struct nouveau_screen *
diff --git a/src/gallium/drivers/nouveau/nouveau_stateobj.h b/src/gallium/drivers/nouveau/nouveau_stateobj.h
index 9aee9e4956..e844f6abb3 100644
--- a/src/gallium/drivers/nouveau/nouveau_stateobj.h
+++ b/src/gallium/drivers/nouveau/nouveau_stateobj.h
@@ -3,41 +3,95 @@
 
 #include "util/u_debug.h"
 
+#ifdef DEBUG
+#define DEBUG_NOUVEAU_STATEOBJ
+#endif /* DEBUG */
+
 struct nouveau_stateobj_reloc {
 	struct nouveau_bo *bo;
 
-	unsigned offset;
-	unsigned packet;
+	struct nouveau_grobj *gr;
+	uint32_t push_offset;
+	uint32_t mthd;
 
-	unsigned data;
+	uint32_t data;
 	unsigned flags;
 	unsigned vor;
 	unsigned tor;
 };
 
+struct nouveau_stateobj_start {
+	struct nouveau_grobj *gr;
+	uint32_t mthd;
+	uint32_t size;
+	unsigned offset;
+};
+
 struct nouveau_stateobj {
 	struct pipe_reference reference;
 
-	unsigned *push;
+	struct nouveau_stateobj_start *start;
 	struct nouveau_stateobj_reloc *reloc;
 
-	unsigned *cur;
-	unsigned cur_packet;
+	/* Common memory pool for data. */
+	uint32_t *pool;
+	unsigned pool_cur;
+
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+	unsigned start_alloc;
+	unsigned reloc_alloc;
+	unsigned pool_alloc;
+#endif  /* DEBUG_NOUVEAU_STATEOBJ */
+
+	unsigned total; /* includes begin_ring */
+	unsigned cur; /* excludes begin_ring, offset from "cur_start" */
+	unsigned cur_start;
 	unsigned cur_reloc;
 };
 
+static INLINE void
+so_dump(struct nouveau_stateobj *so)
+{
+	unsigned i, nr, total = 0;
+
+	for (i = 0; i < so->cur_start; i++) {
+		if (so->start[i].gr->subc > -1)
+			debug_printf("+0x%04x: 0x%08x\n", total++,
+				(so->start[i].size << 18) | (so->start[i].gr->subc << 13)
+				| so->start[i].mthd);
+		else
+			debug_printf("+0x%04x: 0x%08x\n", total++,
+				(so->start[i].size << 18) | so->start[i].mthd);
+		for (nr = 0; nr < so->start[i].size; nr++, total++)
+			debug_printf("+0x%04x: 0x%08x\n", total,
+				so->pool[so->start[i].offset + nr]);
+	}
+}
+
 static INLINE struct nouveau_stateobj *
-so_new(unsigned push, unsigned reloc)
+so_new(unsigned start, unsigned push, unsigned reloc)
 {
 	struct nouveau_stateobj *so;
 
 	so = MALLOC(sizeof(struct nouveau_stateobj));
 	pipe_reference_init(&so->reference, 1);
-	so->push = MALLOC(sizeof(unsigned) * push);
-	so->reloc = MALLOC(sizeof(struct nouveau_stateobj_reloc) * reloc);
+	so->total = so->cur = so->cur_start = so->cur_reloc = 0;
 
-	so->cur = so->push;
-	so->cur_reloc = so->cur_packet = 0;
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+	so->start_alloc = start;
+	so->reloc_alloc = reloc;
+	so->pool_alloc = push;
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+
+	so->start = MALLOC(start * sizeof(struct nouveau_stateobj_start));
+	so->reloc = MALLOC(reloc * sizeof(struct nouveau_stateobj_reloc));
+	so->pool = MALLOC(push * sizeof(uint32_t));
+	so->pool_cur = 0;
+
+	if (!so->start || !so->reloc || !so->pool) {
+		debug_printf("malloc failed\n");
+		assert(0);
+	}
 
 	return so;
 }
@@ -48,63 +102,128 @@ so_ref(struct nouveau_stateobj *ref, struct nouveau_stateobj **pso)
 	struct nouveau_stateobj *so = *pso;
 	int i;
 
-        if (pipe_reference(&(*pso)->reference, &ref->reference)) {
-		free(so->push);
+	if (pipe_reference(&(*pso)->reference, &ref->reference)) {
+		FREE(so->start);
 		for (i = 0; i < so->cur_reloc; i++)
 			nouveau_bo_ref(NULL, &so->reloc[i].bo);
-		free(so->reloc);
-		free(so);
+		FREE(so->reloc);
+		FREE(so->pool);
+		FREE(so);
 	}
 	*pso = ref;
 }
 
 static INLINE void
-so_data(struct nouveau_stateobj *so, unsigned data)
+so_data(struct nouveau_stateobj *so, uint32_t data)
 {
-	(*so->cur++) = (data);
-	so->cur_packet += 4;
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+	if (so->cur >= so->start[so->cur_start - 1].size) {
+		debug_printf("exceeding specified size\n");
+		assert(0);
+	}
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+
+	so->pool[so->start[so->cur_start - 1].offset + so->cur++] = data;
 }
 
 static INLINE void
-so_datap(struct nouveau_stateobj *so, unsigned *data, unsigned size)
+so_datap(struct nouveau_stateobj *so, uint32_t *data, unsigned size)
 {
-	so->cur_packet += (4 * size);
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+	if ((so->cur + size) > so->start[so->cur_start - 1].size) {
+		debug_printf("exceeding specified size\n");
+		assert(0);
+	}
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+
 	while (size--)
-		(*so->cur++) = (*data++);
+		so->pool[so->start[so->cur_start - 1].offset + so->cur++] =
+			*data++;
 }
 
 static INLINE void
 so_method(struct nouveau_stateobj *so, struct nouveau_grobj *gr,
 	  unsigned mthd, unsigned size)
 {
-	so->cur_packet = (gr->subc << 13) | (1 << 18) | (mthd - 4);
-	so_data(so, (gr->subc << 13) | (size << 18) | mthd);
+	struct nouveau_stateobj_start *start;
+
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+	if (so->start_alloc <= so->cur_start) {
+		debug_printf("exceeding num_start size\n");
+		assert(0);
+	} else
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+		start = so->start;
+
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+	if (so->cur_start > 0 && start[so->cur_start - 1].size > so->cur) {
+		debug_printf("previous so_method was not filled\n");
+		assert(0);
+	}
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+
+	so->start = start;
+	start[so->cur_start].gr = gr;
+	start[so->cur_start].mthd = mthd;
+	start[so->cur_start].size = size;
+
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+	if (so->pool_alloc < (size + so->pool_cur)) {
+		debug_printf("exceeding num_pool size\n");
+		assert(0);
+	}
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+
+	start[so->cur_start].offset = so->pool_cur;
+	so->pool_cur += size;
+
+	so->cur_start++;
+	/* The 1 is for *this* begin_ring. */
+	so->total += so->cur + 1;
+	so->cur = 0;
 }
 
 static INLINE void
 so_reloc(struct nouveau_stateobj *so, struct nouveau_bo *bo,
 	 unsigned data, unsigned flags, unsigned vor, unsigned tor)
 {
-	struct nouveau_stateobj_reloc *r = &so->reloc[so->cur_reloc++];
-	
-	r->bo = NULL;
-	nouveau_bo_ref(bo, &r->bo);
-	r->offset = so->cur - so->push;
-	r->packet = so->cur_packet;
-	r->data = data;
-	r->flags = flags;
-	r->vor = vor;
-	r->tor = tor;
+	struct nouveau_stateobj_reloc *r;
+
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+	if (so->reloc_alloc <= so->cur_reloc) {
+		debug_printf("exceeding num_reloc size\n");
+		assert(0);
+	} else
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+		r = so->reloc;
+
+	so->reloc = r;
+	r[so->cur_reloc].bo = NULL;
+	nouveau_bo_ref(bo, &(r[so->cur_reloc].bo));
+	r[so->cur_reloc].gr = so->start[so->cur_start-1].gr;
+	r[so->cur_reloc].push_offset = so->total + so->cur;
+	r[so->cur_reloc].data = data;
+	r[so->cur_reloc].flags = flags;
+	r[so->cur_reloc].mthd = so->start[so->cur_start-1].mthd +
+							(so->cur << 2);
+	r[so->cur_reloc].vor = vor;
+	r[so->cur_reloc].tor = tor;
+
 	so_data(so, data);
+	so->cur_reloc++;
 }
 
-static INLINE void
-so_dump(struct nouveau_stateobj *so)
+/* Determine if this buffer object is referenced by this state object. */
+static INLINE boolean
+so_bo_is_reloc(struct nouveau_stateobj *so, struct nouveau_bo *bo)
 {
-	unsigned i, nr = so->cur - so->push;
+	int i;
+
+	for (i = 0; i < so->cur_reloc; i++)
+		if (so->reloc[i].bo == bo)
+			return true;
 
-	for (i = 0; i < nr; i++)
-		debug_printf("+0x%04x: 0x%08x\n", i, so->push[i]);
+	return false;
 }
 
 static INLINE void
@@ -114,75 +233,93 @@ so_emit(struct nouveau_channel *chan, struct nouveau_stateobj *so)
 	unsigned nr, i;
 	int ret = 0;
 
-	nr = so->cur - so->push;
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+	if (so->start[so->cur_start - 1].size > so->cur) {
+		debug_printf("emit: previous so_method was not filled\n");
+		assert(0);
+	}
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+
+	/* We cannot update total in case we so_emit again. */
+	nr = so->total + so->cur;
+
 	/* This will flush if we need space.
 	 * We don't actually need the marker.
 	 */
 	if ((ret = nouveau_pushbuf_marker_emit(chan, nr, so->cur_reloc))) {
 		debug_printf("so_emit failed marker emit with error %d\n", ret);
-		return;
+		assert(0);
+	}
+
+	/* Submit data. This will ensure proper binding of objects. */
+	for (i = 0; i < so->cur_start; i++) {
+		BEGIN_RING(chan, so->start[i].gr, so->start[i].mthd, so->start[i].size);
+		OUT_RINGp(chan, &(so->pool[so->start[i].offset]), so->start[i].size);
 	}
-	pb->remaining -= nr;
 
-	memcpy(pb->cur, so->push, nr * 4);
 	for (i = 0; i < so->cur_reloc; i++) {
 		struct nouveau_stateobj_reloc *r = &so->reloc[i];
 
-		if ((ret = nouveau_pushbuf_emit_reloc(chan, pb->cur + r->offset,
-					   r->bo, r->data, 0, r->flags,
-					   r->vor, r->tor))) {
+		if ((ret = nouveau_pushbuf_emit_reloc(chan, pb->cur - nr +
+						r->push_offset, r->bo, r->data,
+						0, r->flags, r->vor, r->tor))) {
 			debug_printf("so_emit failed reloc with error %d\n", ret);
-			goto out;
+			assert(0);
 		}
 	}
-out:
-	pb->cur += nr;
 }
 
 static INLINE void
 so_emit_reloc_markers(struct nouveau_channel *chan, struct nouveau_stateobj *so)
 {
 	struct nouveau_pushbuf *pb = chan->pushbuf;
+	struct nouveau_grobj *gr = NULL;
 	unsigned i;
 	int ret = 0;
 
 	if (!so)
 		return;
 
-	i = so->cur_reloc << 1;
-	/* This will flush if we need space.
-	 * We don't actually need the marker.
-	 */
-	if ((ret = nouveau_pushbuf_marker_emit(chan, i, i))) {
-		debug_printf("so_emit_reloc_markers failed marker emit with" \
-			"error %d\n", ret);
-		return;
-	}
-	pb->remaining -= i;
-
+	/* If we need to flush in flush notify, then we have a problem anyway. */
 	for (i = 0; i < so->cur_reloc; i++) {
 		struct nouveau_stateobj_reloc *r = &so->reloc[i];
 
-		if ((ret = nouveau_pushbuf_emit_reloc(chan, pb->cur++, r->bo,
-					   r->packet, 0,
-					   (r->flags & (NOUVEAU_BO_VRAM |
-							NOUVEAU_BO_GART |
-							NOUVEAU_BO_RDWR)) |
-					   NOUVEAU_BO_DUMMY, 0, 0))) {
-			debug_printf("so_emit_reloc_markers failed reloc" \
-						"with error %d\n", ret);
-			pb->remaining += ((so->cur_reloc - i) << 1);
-			return;
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+		if (r->mthd & 0x40000000) {
+			debug_printf("error: NI mthd 0x%08X\n", r->mthd);
+			continue;
 		}
-		if ((ret = nouveau_pushbuf_emit_reloc(chan, pb->cur++, r->bo,
-					   r->data, 0,
-					   r->flags | NOUVEAU_BO_DUMMY,
-					   r->vor, r->tor))) {
-			debug_printf("so_emit_reloc_markers failed reloc" \
-						"with error %d\n", ret);
-			pb->remaining += ((so->cur_reloc - i) << 1) - 1;
-			return;
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+
+		/* The object needs to be bound and the system must know the
+		 * subchannel is being used. Otherwise it will discard it.
+		 */
+		if (gr != r->gr) {
+			BEGIN_RING(chan, r->gr, 0x100, 1);
+			OUT_RING(chan, 0);
+			gr = r->gr;
+		}
+
+		/* Some relocs really don't like to be hammered,
+		 * NOUVEAU_BO_DUMMY makes sure it only
+		 * happens when needed.
+		 */
+		ret = OUT_RELOC(chan, r->bo, (r->gr->subc << 13) | (1<< 18) |
+			r->mthd, (r->flags & (NOUVEAU_BO_VRAM | NOUVEAU_BO_GART
+				| NOUVEAU_BO_RDWR)) | NOUVEAU_BO_DUMMY, 0, 0);
+		if (ret) {
+			debug_printf("OUT_RELOC failed %d\n", ret);
+			assert(0);
 		}
+
+		ret = OUT_RELOC(chan, r->bo, r->data, r->flags |
+			NOUVEAU_BO_DUMMY, r->vor, r->tor);
+		if (ret) {
+			debug_printf("OUT_RELOC failed %d\n", ret);
+			assert(0);
+		}
+
+		pb->remaining -= 2;
 	}
 }
 
diff --git a/src/gallium/drivers/nv04/nv04_context.c b/src/gallium/drivers/nv04/nv04_context.c
index 770733a4a1..edd96859cf 100644
--- a/src/gallium/drivers/nv04/nv04_context.c
+++ b/src/gallium/drivers/nv04/nv04_context.c
@@ -10,10 +10,14 @@ nv04_flush(struct pipe_context *pipe, unsigned flags,
 	   struct pipe_fence_handle **fence)
 {
 	struct nv04_context *nv04 = nv04_context(pipe);
+	struct nv04_screen *screen = nv04->screen;
+	struct nouveau_channel *chan = screen->base.channel;
 
 	draw_flush(nv04->draw);
 
-	FIRE_RING(fence);
+	FIRE_RING(chan);
+	if (fence)
+		*fence = NULL;
 }
 
 static void
@@ -30,32 +34,36 @@ nv04_destroy(struct pipe_context *pipe)
 static boolean
 nv04_init_hwctx(struct nv04_context *nv04)
 {
+	struct nv04_screen *screen = nv04->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *fahrenheit = screen->fahrenheit;
+
 	// requires a valid handle
-//	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_NOTIFY, 1);
+//	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_NOTIFY, 1);
 //	OUT_RING(0);
-	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_NOP, 1);
-	OUT_RING(0);
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_NOP, 1);
+	OUT_RING(chan, 0);
 
-	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_CONTROL, 1);
-	OUT_RING(0x40182800);
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_CONTROL, 1);
+	OUT_RING(chan, 0x40182800);
 //	OUT_RING(1<<20/*no cull*/);
-	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_BLEND, 1);
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_BLEND, 1);
 //	OUT_RING(0x24|(1<<6)|(1<<8));
-	OUT_RING(0x120001a4);
-	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_FORMAT, 1);
-	OUT_RING(0x332213a1);
-	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_FILTER, 1);
-	OUT_RING(0x11001010);
-	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_COLORKEY, 1);
-	OUT_RING(0x0);
-//	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_OFFSET, 1);
+	OUT_RING(chan, 0x120001a4);
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_FORMAT, 1);
+	OUT_RING(chan, 0x332213a1);
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_FILTER, 1);
+	OUT_RING(chan, 0x11001010);
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_COLORKEY, 1);
+	OUT_RING(chan, 0x0);
+//	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_OFFSET, 1);
 //	OUT_RING(SCREEN_OFFSET);
-	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_FOGCOLOR, 1);
-	OUT_RING(0xff000000);
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_FOGCOLOR, 1);
+	OUT_RING(chan, 0xff000000);
 
 
 
-	FIRE_RING (NULL);
+	FIRE_RING (chan);
 	return TRUE;
 }
 
diff --git a/src/gallium/drivers/nv04/nv04_context.h b/src/gallium/drivers/nv04/nv04_context.h
index 55326c787a..fe3b527423 100644
--- a/src/gallium/drivers/nv04/nv04_context.h
+++ b/src/gallium/drivers/nv04/nv04_context.h
@@ -15,10 +15,6 @@
 #include "nouveau/nouveau_gldefs.h"
 #include "nouveau/nouveau_context.h"
 
-#define NOUVEAU_PUSH_CONTEXT(ctx)                                              \
-	struct nv04_screen *ctx = nv04->screen
-#include "nouveau/nouveau_push.h"
-
 #include "nv04_state.h"
 
 #define NOUVEAU_ERR(fmt, args...) \
@@ -141,9 +137,9 @@ extern void nv04_emit_hw_state(struct nv04_context *nv04);
 extern void nv04_state_tex_update(struct nv04_context *nv04);
 
 /* nv04_vbo.c */
-extern boolean nv04_draw_arrays(struct pipe_context *, unsigned mode,
+extern void nv04_draw_arrays(struct pipe_context *, unsigned mode,
 				unsigned start, unsigned count);
-extern boolean nv04_draw_elements( struct pipe_context *pipe,
+extern void nv04_draw_elements( struct pipe_context *pipe,
                     struct pipe_buffer *indexBuffer,
                     unsigned indexSize,
                     unsigned prim, unsigned start, unsigned count);
diff --git a/src/gallium/drivers/nv04/nv04_prim_vbuf.c b/src/gallium/drivers/nv04/nv04_prim_vbuf.c
index 25395edfd7..0b795ea243 100644
--- a/src/gallium/drivers/nv04/nv04_prim_vbuf.c
+++ b/src/gallium/drivers/nv04/nv04_prim_vbuf.c
@@ -93,33 +93,45 @@ nv04_vbuf_render_set_primitive( struct vbuf_render *render,
 
 static INLINE void nv04_2triangles(struct nv04_context* nv04, unsigned char* buffer, ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5)
 {
-	BEGIN_RING(fahrenheit,NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0xA),49);
-	OUT_RINGp(buffer + VERTEX_SIZE * v0,8);
-	OUT_RINGp(buffer + VERTEX_SIZE * v1,8);
-	OUT_RINGp(buffer + VERTEX_SIZE * v2,8);
-	OUT_RINGp(buffer + VERTEX_SIZE * v3,8);
-	OUT_RINGp(buffer + VERTEX_SIZE * v4,8);
-	OUT_RINGp(buffer + VERTEX_SIZE * v5,8);
-	OUT_RING(0xFEDCBA);
+	struct nv04_screen *screen = nv04->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *fahrenheit = screen->fahrenheit;
+
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0xA), 49);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * v0,8);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * v1,8);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * v2,8);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * v3,8);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * v4,8);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * v5,8);
+	OUT_RING(chan, 0xFEDCBA);
 }
 
 static INLINE void nv04_1triangle(struct nv04_context* nv04, unsigned char* buffer, ushort v0, ushort v1, ushort v2)
 {
-	BEGIN_RING(fahrenheit,NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0xD),25);
-	OUT_RINGp(buffer + VERTEX_SIZE * v0,8);
-	OUT_RINGp(buffer + VERTEX_SIZE * v1,8);
-	OUT_RINGp(buffer + VERTEX_SIZE * v2,8);
-	OUT_RING(0xFED);
+	struct nv04_screen *screen = nv04->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *fahrenheit = screen->fahrenheit;
+
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0xD), 25);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * v0,8);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * v1,8);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * v2,8);
+	OUT_RING(chan, 0xFED);
 }
 
 static INLINE void nv04_1quad(struct nv04_context* nv04, unsigned char* buffer, ushort v0, ushort v1, ushort v2, ushort v3)
 {
-	BEGIN_RING(fahrenheit,NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0xC),33);
-	OUT_RINGp(buffer + VERTEX_SIZE * v0,8);
-	OUT_RINGp(buffer + VERTEX_SIZE * v1,8);
-	OUT_RINGp(buffer + VERTEX_SIZE * v2,8);
-	OUT_RINGp(buffer + VERTEX_SIZE * v3,8);
-	OUT_RING(0xFECEDC);
+	struct nv04_screen *screen = nv04->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *fahrenheit = screen->fahrenheit;
+
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0xC), 33);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * v0,8);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * v1,8);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * v2,8);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * v3,8);
+	OUT_RING(chan, 0xFECEDC);
 }
 
 static void nv04_vbuf_render_triangles_elts(struct nv04_vbuf_render * render, const ushort * indices, uint nr_indices)
@@ -156,7 +168,10 @@ static void nv04_vbuf_render_tri_strip_elts(struct nv04_vbuf_render* render, con
 {
 	const uint32_t striptbl[]={0x321210,0x543432,0x765654,0x987876,0xBA9A98,0xDCBCBA,0xFEDEDC};
 	unsigned char* buffer = render->buffer;
-	struct nv04_context* nv04 = render->nv04;
+	struct nv04_context *nv04 = render->nv04;
+	struct nv04_screen *screen = nv04->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *fahrenheit = screen->fahrenheit;
 	int i,j;
 
 	for(i = 0; i<nr_indices; i+=14) 
@@ -166,15 +181,15 @@ static void nv04_vbuf_render_tri_strip_elts(struct nv04_vbuf_render* render, con
 		if (numvert<3)
 			break;
 
-		BEGIN_RING( fahrenheit, NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0x0), numvert*8 );
+		BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0x0), numvert*8);
 		for(j = 0; j<numvert; j++)
-			OUT_RINGp( buffer + VERTEX_SIZE * indices [i+j], 8 );
+			OUT_RINGp(chan, buffer + VERTEX_SIZE * indices [i+j], 8 );
 
-		BEGIN_RING_NI( fahrenheit, NV04_TEXTURED_TRIANGLE_DRAWPRIMITIVE(0), (numtri+1)/2 );
+		BEGIN_RING_NI(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_DRAWPRIMITIVE(0), (numtri+1)/2 );
 		for(j = 0; j<numtri/2; j++ )
-			OUT_RING(striptbl[j]);
+			OUT_RING(chan, striptbl[j]);
 		if (numtri%2)
-			OUT_RING(striptbl[numtri/2]&0xFFF);
+			OUT_RING(chan, striptbl[numtri/2]&0xFFF);
 	}
 }
 
@@ -182,11 +197,14 @@ static void nv04_vbuf_render_tri_fan_elts(struct nv04_vbuf_render* render, const
 {
 	const uint32_t fantbl[]={0x320210,0x540430,0x760650,0x980870,0xBA0A90,0xDC0CB0,0xFE0ED0};
 	unsigned char* buffer = render->buffer;
-	struct nv04_context* nv04 = render->nv04;
+	struct nv04_context *nv04 = render->nv04;
+	struct nv04_screen *screen = nv04->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *fahrenheit = screen->fahrenheit;
 	int i,j;
 
-	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0x0), 8);
-	OUT_RINGp(buffer + VERTEX_SIZE * indices[0], 8);
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0x0), 8);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * indices[0], 8);
 
 	for(i = 1; i<nr_indices; i+=14)
 	{
@@ -195,16 +213,16 @@ static void nv04_vbuf_render_tri_fan_elts(struct nv04_vbuf_render* render, const
 		if (numvert < 3)
 			break;
 
-		BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0x1), numvert*8);
+		BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0x1), numvert*8);
 
 		for(j=0;j<numvert;j++)
-			OUT_RINGp( buffer + VERTEX_SIZE * indices[ i+j ], 8 );
+			OUT_RINGp(chan, buffer + VERTEX_SIZE * indices[ i+j ], 8 );
 
-		BEGIN_RING_NI(fahrenheit, NV04_TEXTURED_TRIANGLE_DRAWPRIMITIVE(0), (numtri+1)/2);
+		BEGIN_RING_NI(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_DRAWPRIMITIVE(0), (numtri+1)/2);
 		for(j = 0; j<numtri/2; j++)
-			OUT_RING(fantbl[j]);
+			OUT_RING(chan, fantbl[j]);
 		if (numtri%2)
-			OUT_RING(fantbl[numtri/2]&0xFFF);
+			OUT_RING(chan, fantbl[numtri/2]&0xFFF);
 	}
 }
 
diff --git a/src/gallium/drivers/nv04/nv04_state_emit.c b/src/gallium/drivers/nv04/nv04_state_emit.c
index bd98ae091f..b8d6dc560f 100644
--- a/src/gallium/drivers/nv04/nv04_state_emit.c
+++ b/src/gallium/drivers/nv04/nv04_state_emit.c
@@ -57,13 +57,19 @@ static uint32_t nv04_blend_func(uint32_t f)
 static void nv04_emit_control(struct nv04_context* nv04)
 {
 	uint32_t control = nv04->dsa->control;
+	struct nv04_screen *screen = nv04->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *fahrenheit = screen->fahrenheit;
 
-	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_CONTROL, 1);
-	OUT_RING(control);
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_CONTROL, 1);
+	OUT_RING(chan, control);
 }
 
 static void nv04_emit_blend(struct nv04_context* nv04)
 {
+	struct nv04_screen *screen = nv04->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *fahrenheit = screen->fahrenheit;
 	uint32_t blend;
 
 	blend=0x4; // texture MODULATE_ALPHA
@@ -75,19 +81,23 @@ static void nv04_emit_blend(struct nv04_context* nv04)
 	blend|=(nv04_blend_func(nv04->blend->b_src)<<24);
 	blend|=(nv04_blend_func(nv04->blend->b_dst)<<28);
 
-	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_BLEND, 1);
-	OUT_RING(blend);
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_BLEND, 1);
+	OUT_RING(chan, blend);
 }
 
 static void nv04_emit_sampler(struct nv04_context *nv04, int unit)
 {
 	struct nv04_miptree *nv04mt = nv04->tex_miptree[unit];
 	struct pipe_texture *pt = &nv04mt->base;
+	struct nv04_screen *screen = nv04->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *fahrenheit = screen->fahrenheit;
+	struct nouveau_bo *bo = nouveau_bo(nv04mt->buffer);
 
-	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_OFFSET, 3);
-	OUT_RELOCl(nv04mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
-	OUT_RELOCd(nv04mt->buffer, (nv04->fragtex.format | nv04->sampler[unit]->format), NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
-	OUT_RING(nv04->sampler[unit]->filter);
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_OFFSET, 3);
+	OUT_RELOCl(chan, bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+	OUT_RELOCd(chan, bo, (nv04->fragtex.format | nv04->sampler[unit]->format), NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
+	OUT_RING(chan, nv04->sampler[unit]->filter);
 }
 
 static void nv04_state_emit_framebuffer(struct nv04_context* nv04)
@@ -97,6 +107,10 @@ static void nv04_state_emit_framebuffer(struct nv04_context* nv04)
 	uint32_t rt_format, w, h;
 	int colour_format = 0, zeta_format = 0;
 	struct nv04_miptree *nv04mt = 0;
+	struct nv04_screen *screen = nv04->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *context_surfaces_3d = screen->context_surfaces_3d;
+	struct nouveau_bo *bo;
 
 	w = fb->cbufs[0]->width;
 	h = fb->cbufs[0]->height;
@@ -128,24 +142,29 @@ static void nv04_state_emit_framebuffer(struct nv04_context* nv04)
 		assert(0);
 	}
 
-	BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_FORMAT, 1);
-	OUT_RING(rt_format);
+	BEGIN_RING(chan, context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_FORMAT, 1);
+	OUT_RING(chan, rt_format);
 
 	nv04mt = (struct nv04_miptree *)rt->base.texture;
+	bo = nouveau_bo(nv04mt->buffer);
 	/* FIXME pitches have to be aligned ! */
-	BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_PITCH, 2);
-	OUT_RING(rt->pitch|(zeta->pitch<<16));
-	OUT_RELOCl(nv04mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(chan, context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_PITCH, 2);
+	OUT_RING(chan, rt->pitch|(zeta->pitch<<16));
+	OUT_RELOCl(chan, bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 	if (fb->zsbuf) {
 		nv04mt = (struct nv04_miptree *)zeta->base.texture;
-		BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_OFFSET_ZETA, 1);
-		OUT_RELOCl(nv04mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		BEGIN_RING(chan, context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_OFFSET_ZETA, 1);
+		OUT_RELOCl(chan, bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 	}
 }
 
 void
 nv04_emit_hw_state(struct nv04_context *nv04)
 {
+	struct nv04_screen *screen = nv04->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *fahrenheit = screen->fahrenheit;
+	struct nouveau_grobj *context_surfaces_3d = screen->context_surfaces_3d;
 	int i;
 
 	if (nv04->dirty & NV04_NEW_VERTPROG) {
@@ -163,8 +182,8 @@ nv04_emit_hw_state(struct nv04_context *nv04)
 	if (nv04->dirty & NV04_NEW_CONTROL) {
 		nv04->dirty &= ~NV04_NEW_CONTROL;
 
-		BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_CONTROL, 1);
-		OUT_RING(nv04->dsa->control);
+		BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_CONTROL, 1);
+		OUT_RING(chan, nv04->dsa->control);
 	}
 
 	if (nv04->dirty & NV04_NEW_BLEND) {
@@ -205,12 +224,12 @@ nv04_emit_hw_state(struct nv04_context *nv04)
 	unsigned rt_pitch = ((struct nv04_surface *)nv04->rt)->pitch;
 	unsigned zeta_pitch = ((struct nv04_surface *)nv04->zeta)->pitch;
 
-	BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_PITCH, 2);
-	OUT_RING(rt_pitch|(zeta_pitch<<16));
-	OUT_RELOCl(nv04->rt, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(chan, context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_PITCH, 2);
+	OUT_RING(chan, rt_pitch|(zeta_pitch<<16));
+	OUT_RELOCl(chan, nouveau_bo(nv04->rt), 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 	if (nv04->zeta) {
-		BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_OFFSET_ZETA, 1);
-		OUT_RELOCl(nv04->zeta, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		BEGIN_RING(chan, context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_OFFSET_ZETA, 1);
+		OUT_RELOCl(chan, nouveau_bo(nv04->zeta), 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 	}
 
 	/* Texture images */
@@ -218,9 +237,10 @@ nv04_emit_hw_state(struct nv04_context *nv04)
 		if (!(nv04->fp_samplers & (1 << i)))
 			continue;
 		struct nv04_miptree *nv04mt = nv04->tex_miptree[i];
-		BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_OFFSET, 2);
-		OUT_RELOCl(nv04mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
-		OUT_RELOCd(nv04mt->buffer, (nv04->fragtex.format | nv04->sampler[i]->format), NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
+		struct nouveau_bo *bo = nouveau_bo(nv04mt->buffer);
+		BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_OFFSET, 2);
+		OUT_RELOCl(chan, bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+		OUT_RELOCd(chan, bo, (nv04->fragtex.format | nv04->sampler[i]->format), NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
 	}
 }
 
diff --git a/src/gallium/drivers/nv04/nv04_vbo.c b/src/gallium/drivers/nv04/nv04_vbo.c
index 099ab10043..3484771814 100644
--- a/src/gallium/drivers/nv04/nv04_vbo.c
+++ b/src/gallium/drivers/nv04/nv04_vbo.c
@@ -9,7 +9,7 @@
 #include "nouveau/nouveau_channel.h"
 #include "nouveau/nouveau_pushbuf.h"
 
-boolean nv04_draw_elements( struct pipe_context *pipe,
+void nv04_draw_elements( struct pipe_context *pipe,
                     struct pipe_buffer *indexBuffer,
                     unsigned indexSize,
                     unsigned prim, unsigned start, unsigned count)
@@ -65,15 +65,13 @@ boolean nv04_draw_elements( struct pipe_context *pipe,
 		pipe_buffer_unmap(pscreen, indexBuffer);
 		draw_set_mapped_element_buffer(draw, 0, NULL);
 	}
-
-	return TRUE;
 }
 
-boolean nv04_draw_arrays( struct pipe_context *pipe,
-				 unsigned prim, unsigned start, unsigned count)
+void nv04_draw_arrays( struct pipe_context *pipe,
+                       unsigned prim, unsigned start, unsigned count)
 {
 	printf("coucou in draw arrays\n");
-	return nv04_draw_elements(pipe, NULL, 0, prim, start, count);
+	nv04_draw_elements(pipe, NULL, 0, prim, start, count);
 }
 
 
diff --git a/src/gallium/drivers/nv10/nv10_context.c b/src/gallium/drivers/nv10/nv10_context.c
index 0dadeb03dd..1ecb73d06e 100644
--- a/src/gallium/drivers/nv10/nv10_context.c
+++ b/src/gallium/drivers/nv10/nv10_context.c
@@ -10,10 +10,14 @@ nv10_flush(struct pipe_context *pipe, unsigned flags,
 	   struct pipe_fence_handle **fence)
 {
 	struct nv10_context *nv10 = nv10_context(pipe);
+	struct nv10_screen *screen = nv10->screen;
+	struct nouveau_channel *chan = screen->base.channel;
 
 	draw_flush(nv10->draw);
 
-	FIRE_RING(fence);
+	FIRE_RING(chan);
+	if (fence)
+		*fence = NULL;
 }
 
 static void
@@ -31,225 +35,226 @@ static void nv10_init_hwctx(struct nv10_context *nv10)
 {
 	struct nv10_screen *screen = nv10->screen;
 	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *celsius = screen->celsius;
 	int i;
 	float projectionmatrix[16];
 
-	BEGIN_RING(celsius, NV10TCL_DMA_NOTIFY, 1);
-	OUT_RING  (screen->sync->handle);
-	BEGIN_RING(celsius, NV10TCL_DMA_IN_MEMORY0, 2);
-	OUT_RING  (chan->vram->handle);
-	OUT_RING  (chan->gart->handle);
-	BEGIN_RING(celsius, NV10TCL_DMA_IN_MEMORY2, 2);
-	OUT_RING  (chan->vram->handle);
-	OUT_RING  (chan->vram->handle);
+	BEGIN_RING(chan, celsius, NV10TCL_DMA_NOTIFY, 1);
+	OUT_RING  (chan, screen->sync->handle);
+	BEGIN_RING(chan, celsius, NV10TCL_DMA_IN_MEMORY0, 2);
+	OUT_RING  (chan, chan->vram->handle);
+	OUT_RING  (chan, chan->gart->handle);
+	BEGIN_RING(chan, celsius, NV10TCL_DMA_IN_MEMORY2, 2);
+	OUT_RING  (chan, chan->vram->handle);
+	OUT_RING  (chan, chan->vram->handle);
 
-	BEGIN_RING(celsius, NV10TCL_NOP, 1);
-	OUT_RING  (0);
+	BEGIN_RING(chan, celsius, NV10TCL_NOP, 1);
+	OUT_RING  (chan, 0);
 
-	BEGIN_RING(celsius, NV10TCL_RT_HORIZ, 2);
-	OUT_RING  (0);
-	OUT_RING  (0);
+	BEGIN_RING(chan, celsius, NV10TCL_RT_HORIZ, 2);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
 
-	BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(0), 1);
-	OUT_RING  ((0x7ff<<16)|0x800);
-	BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_VERT(0), 1);
-	OUT_RING  ((0x7ff<<16)|0x800);
+	BEGIN_RING(chan, celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(0), 1);
+	OUT_RING  (chan, (0x7ff<<16)|0x800);
+	BEGIN_RING(chan, celsius, NV10TCL_VIEWPORT_CLIP_VERT(0), 1);
+	OUT_RING  (chan, (0x7ff<<16)|0x800);
 
 	for (i=1;i<8;i++) {
-		BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(i), 1);
-		OUT_RING  (0);
-		BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_VERT(i), 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(i), 1);
+		OUT_RING  (chan, 0);
+		BEGIN_RING(chan, celsius, NV10TCL_VIEWPORT_CLIP_VERT(i), 1);
+		OUT_RING  (chan, 0);
 	}
 
-	BEGIN_RING(celsius, 0x290, 1);
-	OUT_RING  ((0x10<<16)|1);
-	BEGIN_RING(celsius, 0x3f4, 1);
-	OUT_RING  (0);
+	BEGIN_RING(chan, celsius, 0x290, 1);
+	OUT_RING  (chan, (0x10<<16)|1);
+	BEGIN_RING(chan, celsius, 0x3f4, 1);
+	OUT_RING  (chan, 0);
 
-	BEGIN_RING(celsius, NV10TCL_NOP, 1);
-	OUT_RING  (0);
+	BEGIN_RING(chan, celsius, NV10TCL_NOP, 1);
+	OUT_RING  (chan, 0);
 
 	if (nv10->screen->celsius->grclass != NV10TCL) {
 		/* For nv11, nv17 */
-		BEGIN_RING(celsius, 0x120, 3);
-		OUT_RING  (0);
-		OUT_RING  (1);
-		OUT_RING  (2);
+		BEGIN_RING(chan, celsius, 0x120, 3);
+		OUT_RING  (chan, 0);
+		OUT_RING  (chan, 1);
+		OUT_RING  (chan, 2);
 
-		BEGIN_RING(celsius, NV10TCL_NOP, 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, celsius, NV10TCL_NOP, 1);
+		OUT_RING  (chan, 0);
 	}
 
-	BEGIN_RING(celsius, NV10TCL_NOP, 1);
-	OUT_RING  (0);
+	BEGIN_RING(chan, celsius, NV10TCL_NOP, 1);
+	OUT_RING  (chan, 0);
 
 	/* Set state */
-	BEGIN_RING(celsius, NV10TCL_FOG_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_FUNC, 2);
-	OUT_RING  (0x207);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_TX_ENABLE(0), 2);
-	OUT_RING  (0);
-	OUT_RING  (0);
+	BEGIN_RING(chan, celsius, NV10TCL_FOG_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_ALPHA_FUNC_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_ALPHA_FUNC_FUNC, 2);
+	OUT_RING  (chan, 0x207);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_TX_ENABLE(0), 2);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
 
-	BEGIN_RING(celsius, NV10TCL_RC_IN_ALPHA(0), 12);
-	OUT_RING  (0x30141010);
-	OUT_RING  (0);
-	OUT_RING  (0x20040000);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	OUT_RING  (0x00000c00);
-	OUT_RING  (0);
-	OUT_RING  (0x00000c00);
-	OUT_RING  (0x18000000);
-	OUT_RING  (0x300e0300);
-	OUT_RING  (0x0c091c80);
+	BEGIN_RING(chan, celsius, NV10TCL_RC_IN_ALPHA(0), 12);
+	OUT_RING  (chan, 0x30141010);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0x20040000);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0x00000c00);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0x00000c00);
+	OUT_RING  (chan, 0x18000000);
+	OUT_RING  (chan, 0x300e0300);
+	OUT_RING  (chan, 0x0c091c80);
 
-	BEGIN_RING(celsius, NV10TCL_BLEND_FUNC_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_DITHER_ENABLE, 2);
-	OUT_RING  (1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_LINE_SMOOTH_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_VERTEX_WEIGHT_ENABLE, 2);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_BLEND_FUNC_SRC, 4);
-	OUT_RING  (1);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	OUT_RING  (0x8006);
-	BEGIN_RING(celsius, NV10TCL_STENCIL_MASK, 8);
-	OUT_RING  (0xff);
-	OUT_RING  (0x207);
-	OUT_RING  (0);
-	OUT_RING  (0xff);
-	OUT_RING  (0x1e00);
-	OUT_RING  (0x1e00);
-	OUT_RING  (0x1e00);
-	OUT_RING  (0x1d01);
-	BEGIN_RING(celsius, NV10TCL_NORMALIZE_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_FOG_ENABLE, 2);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_LIGHT_MODEL, 1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_COLOR_CONTROL, 1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_ENABLED_LIGHTS, 1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_POLYGON_OFFSET_POINT_ENABLE, 3);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_DEPTH_FUNC, 1);
-	OUT_RING  (0x201);
-	BEGIN_RING(celsius, NV10TCL_DEPTH_WRITE_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_DEPTH_TEST_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_POLYGON_OFFSET_FACTOR, 2);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_POINT_SIZE, 1);
-	OUT_RING  (8);
-	BEGIN_RING(celsius, NV10TCL_POINT_PARAMETERS_ENABLE, 2);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_LINE_WIDTH, 1);
-	OUT_RING  (8);
-	BEGIN_RING(celsius, NV10TCL_LINE_SMOOTH_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_POLYGON_MODE_FRONT, 2);
-	OUT_RING  (0x1b02);
-	OUT_RING  (0x1b02);
-	BEGIN_RING(celsius, NV10TCL_CULL_FACE, 2);
-	OUT_RING  (0x405);
-	OUT_RING  (0x901);
-	BEGIN_RING(celsius, NV10TCL_POLYGON_SMOOTH_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_CULL_FACE_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_TX_GEN_S(0), 8);
+	BEGIN_RING(chan, celsius, NV10TCL_BLEND_FUNC_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_DITHER_ENABLE, 2);
+	OUT_RING  (chan, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_LINE_SMOOTH_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_VERTEX_WEIGHT_ENABLE, 2);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_BLEND_FUNC_SRC, 4);
+	OUT_RING  (chan, 1);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0x8006);
+	BEGIN_RING(chan, celsius, NV10TCL_STENCIL_MASK, 8);
+	OUT_RING  (chan, 0xff);
+	OUT_RING  (chan, 0x207);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0xff);
+	OUT_RING  (chan, 0x1e00);
+	OUT_RING  (chan, 0x1e00);
+	OUT_RING  (chan, 0x1e00);
+	OUT_RING  (chan, 0x1d01);
+	BEGIN_RING(chan, celsius, NV10TCL_NORMALIZE_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_FOG_ENABLE, 2);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_LIGHT_MODEL, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_COLOR_CONTROL, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_ENABLED_LIGHTS, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_POLYGON_OFFSET_POINT_ENABLE, 3);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_DEPTH_FUNC, 1);
+	OUT_RING  (chan, 0x201);
+	BEGIN_RING(chan, celsius, NV10TCL_DEPTH_WRITE_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_DEPTH_TEST_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_POLYGON_OFFSET_FACTOR, 2);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_POINT_SIZE, 1);
+	OUT_RING  (chan, 8);
+	BEGIN_RING(chan, celsius, NV10TCL_POINT_PARAMETERS_ENABLE, 2);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_LINE_WIDTH, 1);
+	OUT_RING  (chan, 8);
+	BEGIN_RING(chan, celsius, NV10TCL_LINE_SMOOTH_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_POLYGON_MODE_FRONT, 2);
+	OUT_RING  (chan, 0x1b02);
+	OUT_RING  (chan, 0x1b02);
+	BEGIN_RING(chan, celsius, NV10TCL_CULL_FACE, 2);
+	OUT_RING  (chan, 0x405);
+	OUT_RING  (chan, 0x901);
+	BEGIN_RING(chan, celsius, NV10TCL_POLYGON_SMOOTH_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_CULL_FACE_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_TX_GEN_S(0), 8);
 	for (i=0;i<8;i++) {
-		OUT_RING  (0);
+		OUT_RING  (chan, 0);
 	}
-	BEGIN_RING(celsius, NV10TCL_FOG_EQUATION_CONSTANT, 3);
-	OUT_RING  (0x3fc00000);	/* -1.50 */
-	OUT_RING  (0xbdb8aa0a);	/* -0.09 */
-	OUT_RING  (0);		/*  0.00 */
+	BEGIN_RING(chan, celsius, NV10TCL_FOG_EQUATION_CONSTANT, 3);
+	OUT_RING  (chan, 0x3fc00000);	/* -1.50 */
+	OUT_RING  (chan, 0xbdb8aa0a);	/* -0.09 */
+	OUT_RING  (chan, 0);		/*  0.00 */
 
-	BEGIN_RING(celsius, NV10TCL_NOP, 1);
-	OUT_RING  (0);
+	BEGIN_RING(chan, celsius, NV10TCL_NOP, 1);
+	OUT_RING  (chan, 0);
 
-	BEGIN_RING(celsius, NV10TCL_FOG_MODE, 2);
-	OUT_RING  (0x802);
-	OUT_RING  (2);
+	BEGIN_RING(chan, celsius, NV10TCL_FOG_MODE, 2);
+	OUT_RING  (chan, 0x802);
+	OUT_RING  (chan, 2);
 	/* for some reason VIEW_MATRIX_ENABLE need to be 6 instead of 4 when
 	 * using texturing, except when using the texture matrix
 	 */
-	BEGIN_RING(celsius, NV10TCL_VIEW_MATRIX_ENABLE, 1);
-	OUT_RING  (6);
-	BEGIN_RING(celsius, NV10TCL_COLOR_MASK, 1);
-	OUT_RING  (0x01010101);
+	BEGIN_RING(chan, celsius, NV10TCL_VIEW_MATRIX_ENABLE, 1);
+	OUT_RING  (chan, 6);
+	BEGIN_RING(chan, celsius, NV10TCL_COLOR_MASK, 1);
+	OUT_RING  (chan, 0x01010101);
 
 	/* Set vertex component */
-	BEGIN_RING(celsius, NV10TCL_VERTEX_COL_4F_R, 4);
-	OUT_RINGf (1.0);
-	OUT_RINGf (1.0);
-	OUT_RINGf (1.0);
-	OUT_RINGf (1.0);
-	BEGIN_RING(celsius, NV10TCL_VERTEX_COL2_3F_R, 3);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_VERTEX_NOR_3F_X, 3);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	OUT_RINGf (1.0);
-	BEGIN_RING(celsius, NV10TCL_VERTEX_TX0_4F_S, 4);
-	OUT_RINGf (0.0);
-	OUT_RINGf (0.0);
-	OUT_RINGf (0.0);
-	OUT_RINGf (1.0);
-	BEGIN_RING(celsius, NV10TCL_VERTEX_TX1_4F_S, 4);
-	OUT_RINGf (0.0);
-	OUT_RINGf (0.0);
-	OUT_RINGf (0.0);
-	OUT_RINGf (1.0);
-	BEGIN_RING(celsius, NV10TCL_VERTEX_FOG_1F, 1);
-	OUT_RINGf (0.0);
-	BEGIN_RING(celsius, NV10TCL_EDGEFLAG_ENABLE, 1);
-	OUT_RING  (1);
+	BEGIN_RING(chan, celsius, NV10TCL_VERTEX_COL_4F_R, 4);
+	OUT_RINGf (chan, 1.0);
+	OUT_RINGf (chan, 1.0);
+	OUT_RINGf (chan, 1.0);
+	OUT_RINGf (chan, 1.0);
+	BEGIN_RING(chan, celsius, NV10TCL_VERTEX_COL2_3F_R, 3);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_VERTEX_NOR_3F_X, 3);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	OUT_RINGf (chan, 1.0);
+	BEGIN_RING(chan, celsius, NV10TCL_VERTEX_TX0_4F_S, 4);
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 1.0);
+	BEGIN_RING(chan, celsius, NV10TCL_VERTEX_TX1_4F_S, 4);
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 1.0);
+	BEGIN_RING(chan, celsius, NV10TCL_VERTEX_FOG_1F, 1);
+	OUT_RINGf (chan, 0.0);
+	BEGIN_RING(chan, celsius, NV10TCL_EDGEFLAG_ENABLE, 1);
+	OUT_RING  (chan, 1);
 
 	memset(projectionmatrix, 0, sizeof(projectionmatrix));
-	BEGIN_RING(celsius, NV10TCL_PROJECTION_MATRIX(0), 16);
+	BEGIN_RING(chan, celsius, NV10TCL_PROJECTION_MATRIX(0), 16);
 	projectionmatrix[0*4+0] = 1.0;
 	projectionmatrix[1*4+1] = 1.0;
 	projectionmatrix[2*4+2] = 1.0;
 	projectionmatrix[3*4+3] = 1.0;
 	for (i=0;i<16;i++) {
-		OUT_RINGf  (projectionmatrix[i]);
+		OUT_RINGf  (chan, projectionmatrix[i]);
 	}
 
-	BEGIN_RING(celsius, NV10TCL_DEPTH_RANGE_NEAR, 2);
-	OUT_RING  (0.0);
-	OUT_RINGf  (16777216.0);
+	BEGIN_RING(chan, celsius, NV10TCL_DEPTH_RANGE_NEAR, 2);
+	OUT_RING  (chan, 0.0);
+	OUT_RINGf  (chan, 16777216.0);
 
-	BEGIN_RING(celsius, NV10TCL_VIEWPORT_TRANSLATE_X, 4);
-	OUT_RINGf  (-2048.0);
-	OUT_RINGf  (-2048.0);
-	OUT_RINGf  (16777215.0 * 0.5);
-	OUT_RING  (0);
+	BEGIN_RING(chan, celsius, NV10TCL_VIEWPORT_TRANSLATE_X, 4);
+	OUT_RINGf  (chan, -2048.0);
+	OUT_RINGf  (chan, -2048.0);
+	OUT_RINGf  (chan, 16777215.0 * 0.5);
+	OUT_RING  (chan, 0);
 
-	FIRE_RING (NULL);
+	FIRE_RING (chan);
 }
 
 struct pipe_context *
diff --git a/src/gallium/drivers/nv10/nv10_context.h b/src/gallium/drivers/nv10/nv10_context.h
index 36a6aa7a74..ab4b825487 100644
--- a/src/gallium/drivers/nv10/nv10_context.h
+++ b/src/gallium/drivers/nv10/nv10_context.h
@@ -15,10 +15,6 @@
 #include "nouveau/nouveau_gldefs.h"
 #include "nouveau/nouveau_context.h"
 
-#define NOUVEAU_PUSH_CONTEXT(ctx)                                              \
-	struct nv10_screen *ctx = nv10->screen
-#include "nouveau/nouveau_push.h"
-
 #include "nv10_state.h"
 
 #define NOUVEAU_ERR(fmt, args...) \
@@ -144,9 +140,9 @@ extern void nv10_emit_hw_state(struct nv10_context *nv10);
 extern void nv10_state_tex_update(struct nv10_context *nv10);
 
 /* nv10_vbo.c */
-extern boolean nv10_draw_arrays(struct pipe_context *, unsigned mode,
+extern void nv10_draw_arrays(struct pipe_context *, unsigned mode,
 				unsigned start, unsigned count);
-extern boolean nv10_draw_elements( struct pipe_context *pipe,
+extern void nv10_draw_elements( struct pipe_context *pipe,
                     struct pipe_buffer *indexBuffer,
                     unsigned indexSize,
                     unsigned prim, unsigned start, unsigned count);
diff --git a/src/gallium/drivers/nv10/nv10_fragtex.c b/src/gallium/drivers/nv10/nv10_fragtex.c
index 906fdfeeb9..c1f7ccb9ab 100644
--- a/src/gallium/drivers/nv10/nv10_fragtex.c
+++ b/src/gallium/drivers/nv10/nv10_fragtex.c
@@ -52,6 +52,9 @@ nv10_fragtex_build(struct nv10_context *nv10, int unit)
 	struct nv10_miptree *nv10mt = nv10->tex_miptree[unit];
 	struct pipe_texture *pt = &nv10mt->base;
 	struct nv10_texture_format *tf;
+	struct nv10_screen *screen = nv10->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *celsius = screen->celsius;
 	uint32_t txf, txs, txp;
 
 	tf = nv10_fragtex_format(pt->format);
@@ -82,15 +85,15 @@ nv10_fragtex_build(struct nv10_context *nv10, int unit)
 		return;
 	}
 
-	BEGIN_RING(celsius, NV10TCL_TX_OFFSET(unit), 8);
-	OUT_RELOCl(nv10mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
-	OUT_RELOCd(nv10mt->buffer,txf,NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
-	OUT_RING  (ps->wrap);
-	OUT_RING  (0x40000000); /* enable */
-	OUT_RING  (txs);
-	OUT_RING  (ps->filt | 0x2000 /* magic */);
-	OUT_RING  ((pt->width0 << 16) | pt->height0);
-	OUT_RING  (ps->bcol);
+	BEGIN_RING(chan, celsius, NV10TCL_TX_OFFSET(unit), 8);
+	OUT_RELOCl(chan, nouveau_bo(nv10mt->buffer), 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+	OUT_RELOCd(chan, nouveau_bo(nv10mt->buffer),txf,NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
+	OUT_RING  (chan, ps->wrap);
+	OUT_RING  (chan, 0x40000000); /* enable */
+	OUT_RING  (chan, txs);
+	OUT_RING  (chan, ps->filt | 0x2000 /* magic */);
+	OUT_RING  (chan, (pt->width0 << 16) | pt->height0);
+	OUT_RING  (chan, ps->bcol);
 #endif
 }
 
@@ -99,6 +102,9 @@ nv10_fragtex_bind(struct nv10_context *nv10)
 {
 #if 0
 	struct nv10_fragment_program *fp = nv10->fragprog.active;
+	struct nv10_screen *screen = nv10->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *celsius = screen->celsius;
 	unsigned samplers, unit;
 
 	samplers = nv10->fp_samplers & ~fp->samplers;
@@ -106,8 +112,8 @@ nv10_fragtex_bind(struct nv10_context *nv10)
 		unit = ffs(samplers) - 1;
 		samplers &= ~(1 << unit);
 
-		BEGIN_RING(celsius, NV10TCL_TX_ENABLE(unit), 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, celsius, NV10TCL_TX_ENABLE(unit), 1);
+		OUT_RING  (chan, 0);
 	}
 
 	samplers = nv10->dirty_samplers & fp->samplers;
diff --git a/src/gallium/drivers/nv10/nv10_prim_vbuf.c b/src/gallium/drivers/nv10/nv10_prim_vbuf.c
index 7ba9777a22..c5dbe43dbc 100644
--- a/src/gallium/drivers/nv10/nv10_prim_vbuf.c
+++ b/src/gallium/drivers/nv10/nv10_prim_vbuf.c
@@ -67,12 +67,15 @@ struct nv10_vbuf_render {
 
 void nv10_vtxbuf_bind( struct nv10_context* nv10 )
 {
+	struct nv10_screen *screen = nv10->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *celsius = screen->celsius;
 	int i;
 	for(i = 0; i < 8; i++) {
-		BEGIN_RING(celsius, NV10TCL_VTXBUF_ADDRESS(i), 1);
-		OUT_RING(0/*nv10->vtxbuf*/);
-		BEGIN_RING(celsius, NV10TCL_VTXFMT(i), 1);
-		OUT_RING(0/*XXX*/);
+		BEGIN_RING(chan, celsius, NV10TCL_VTXBUF_ADDRESS(i), 1);
+		OUT_RING(chan, 0/*nv10->vtxbuf*/);
+		BEGIN_RING(chan, celsius, NV10TCL_VTXFMT(i), 1);
+		OUT_RING(chan, 0/*XXX*/);
 	}
 }
 
@@ -163,19 +166,22 @@ nv10_vbuf_render_draw( struct vbuf_render *render,
 {
 	struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render);
 	struct nv10_context *nv10 = nv10_render->nv10;
+	struct nv10_screen *screen = nv10->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *celsius = screen->celsius;
 	int push, i;
 
 	nv10_emit_hw_state(nv10);
 
-	BEGIN_RING(celsius, NV10TCL_VERTEX_ARRAY_OFFSET_POS, 1);
-	OUT_RELOCl(nv10_render->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+	BEGIN_RING(chan, celsius, NV10TCL_VERTEX_ARRAY_OFFSET_POS, 1);
+	OUT_RELOCl(chan, nouveau_bo(nv10_render->buffer), 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
 
-	BEGIN_RING(celsius, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
-	OUT_RING(nv10_render->hwprim);
+	BEGIN_RING(chan, celsius, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
+	OUT_RING(chan, nv10_render->hwprim);
 
 	if (nr_indices & 1) {
-		BEGIN_RING(celsius, NV10TCL_VB_ELEMENT_U32, 1);
-		OUT_RING  (indices[0]);
+		BEGIN_RING(chan, celsius, NV10TCL_VB_ELEMENT_U32, 1);
+		OUT_RING  (chan, indices[0]);
 		indices++; nr_indices--;
 	}
 
@@ -183,16 +189,16 @@ nv10_vbuf_render_draw( struct vbuf_render *render,
 		// XXX too big/small ? check the size
 		push = MIN2(nr_indices, 1200 * 2);
 
-		BEGIN_RING_NI(celsius, NV10TCL_VB_ELEMENT_U16, push >> 1);
+		BEGIN_RING_NI(chan, celsius, NV10TCL_VB_ELEMENT_U16, push >> 1);
 		for (i = 0; i < push; i+=2)
-			OUT_RING((indices[i+1] << 16) | indices[i]);
+			OUT_RING(chan, (indices[i+1] << 16) | indices[i]);
 
 		nr_indices -= push;
 		indices  += push;
 	}
 
-	BEGIN_RING(celsius, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
-	OUT_RING  (0);
+	BEGIN_RING(chan, celsius, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
+	OUT_RING  (chan, 0);
 }
 
 
diff --git a/src/gallium/drivers/nv10/nv10_screen.c b/src/gallium/drivers/nv10/nv10_screen.c
index 6a39ddeaac..69a6dab866 100644
--- a/src/gallium/drivers/nv10/nv10_screen.c
+++ b/src/gallium/drivers/nv10/nv10_screen.c
@@ -180,7 +180,6 @@ nv10_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
 		return FALSE;
 	}
-	BIND_RING(chan, screen->celsius, 7);
 
 	/* 2D engine setup */
 	screen->eng2d = nv04_surface_2d_init(&screen->base);
diff --git a/src/gallium/drivers/nv10/nv10_state_emit.c b/src/gallium/drivers/nv10/nv10_state_emit.c
index 2577ab73b5..30a596ca60 100644
--- a/src/gallium/drivers/nv10/nv10_state_emit.c
+++ b/src/gallium/drivers/nv10/nv10_state_emit.c
@@ -4,25 +4,32 @@
 static void nv10_state_emit_blend(struct nv10_context* nv10)
 {
 	struct nv10_blend_state *b = nv10->blend;
+	struct nv10_screen *screen = nv10->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *celsius = screen->celsius;
 
-	BEGIN_RING(celsius, NV10TCL_DITHER_ENABLE, 1);
-	OUT_RING  (b->d_enable);
+	BEGIN_RING(chan, celsius, NV10TCL_DITHER_ENABLE, 1);
+	OUT_RING  (chan, b->d_enable);
 
-	BEGIN_RING(celsius, NV10TCL_BLEND_FUNC_ENABLE, 3);
-	OUT_RING  (b->b_enable);
-	OUT_RING  (b->b_srcfunc);
-	OUT_RING  (b->b_dstfunc);
+	BEGIN_RING(chan, celsius, NV10TCL_BLEND_FUNC_ENABLE, 3);
+	OUT_RING  (chan, b->b_enable);
+	OUT_RING  (chan, b->b_srcfunc);
+	OUT_RING  (chan, b->b_dstfunc);
 
-	BEGIN_RING(celsius, NV10TCL_COLOR_MASK, 1);
-	OUT_RING  (b->c_mask);
+	BEGIN_RING(chan, celsius, NV10TCL_COLOR_MASK, 1);
+	OUT_RING  (chan, b->c_mask);
 }
 
 static void nv10_state_emit_blend_color(struct nv10_context* nv10)
 {
 	struct pipe_blend_color *c = nv10->blend_color;
+	struct nv10_screen *screen = nv10->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *celsius = screen->celsius;
 
-	BEGIN_RING(celsius, NV10TCL_BLEND_COLOR, 1);
-	OUT_RING  ((float_to_ubyte(c->color[3]) << 24)|
+	BEGIN_RING(chan, celsius, NV10TCL_BLEND_COLOR, 1);
+	OUT_RING  (chan,
+		   (float_to_ubyte(c->color[3]) << 24)|
 		   (float_to_ubyte(c->color[0]) << 16)|
 		   (float_to_ubyte(c->color[1]) << 8) |
 		   (float_to_ubyte(c->color[2]) << 0));
@@ -31,60 +38,66 @@ static void nv10_state_emit_blend_color(struct nv10_context* nv10)
 static void nv10_state_emit_rast(struct nv10_context* nv10)
 {
 	struct nv10_rasterizer_state *r = nv10->rast;
+	struct nv10_screen *screen = nv10->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *celsius = screen->celsius;
 
-	BEGIN_RING(celsius, NV10TCL_SHADE_MODEL, 2);
-	OUT_RING  (r->shade_model);
-	OUT_RING  (r->line_width);
+	BEGIN_RING(chan, celsius, NV10TCL_SHADE_MODEL, 2);
+	OUT_RING  (chan, r->shade_model);
+	OUT_RING  (chan, r->line_width);
 
 
-	BEGIN_RING(celsius, NV10TCL_POINT_SIZE, 1);
-	OUT_RING  (r->point_size);
+	BEGIN_RING(chan, celsius, NV10TCL_POINT_SIZE, 1);
+	OUT_RING  (chan, r->point_size);
 
-	BEGIN_RING(celsius, NV10TCL_POLYGON_MODE_FRONT, 2);
-	OUT_RING  (r->poly_mode_front);
-	OUT_RING  (r->poly_mode_back);
+	BEGIN_RING(chan, celsius, NV10TCL_POLYGON_MODE_FRONT, 2);
+	OUT_RING  (chan, r->poly_mode_front);
+	OUT_RING  (chan, r->poly_mode_back);
 
 
-	BEGIN_RING(celsius, NV10TCL_CULL_FACE, 2);
-	OUT_RING  (r->cull_face);
-	OUT_RING  (r->front_face);
+	BEGIN_RING(chan, celsius, NV10TCL_CULL_FACE, 2);
+	OUT_RING  (chan, r->cull_face);
+	OUT_RING  (chan, r->front_face);
 
-	BEGIN_RING(celsius, NV10TCL_LINE_SMOOTH_ENABLE, 2);
-	OUT_RING  (r->line_smooth_en);
-	OUT_RING  (r->poly_smooth_en);
+	BEGIN_RING(chan, celsius, NV10TCL_LINE_SMOOTH_ENABLE, 2);
+	OUT_RING  (chan, r->line_smooth_en);
+	OUT_RING  (chan, r->poly_smooth_en);
 
-	BEGIN_RING(celsius, NV10TCL_CULL_FACE_ENABLE, 1);
-	OUT_RING  (r->cull_face_en);
+	BEGIN_RING(chan, celsius, NV10TCL_CULL_FACE_ENABLE, 1);
+	OUT_RING  (chan, r->cull_face_en);
 }
 
 static void nv10_state_emit_dsa(struct nv10_context* nv10)
 {
 	struct nv10_depth_stencil_alpha_state *d = nv10->dsa;
+	struct nv10_screen *screen = nv10->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *celsius = screen->celsius;
 
-	BEGIN_RING(celsius, NV10TCL_DEPTH_FUNC, 1);
-	OUT_RING (d->depth.func);
+	BEGIN_RING(chan, celsius, NV10TCL_DEPTH_FUNC, 1);
+	OUT_RING (chan, d->depth.func);
 
-	BEGIN_RING(celsius, NV10TCL_DEPTH_WRITE_ENABLE, 1);
-	OUT_RING (d->depth.write_enable);
+	BEGIN_RING(chan, celsius, NV10TCL_DEPTH_WRITE_ENABLE, 1);
+	OUT_RING (chan, d->depth.write_enable);
 
-	BEGIN_RING(celsius, NV10TCL_DEPTH_TEST_ENABLE, 1);
-	OUT_RING (d->depth.test_enable);
+	BEGIN_RING(chan, celsius, NV10TCL_DEPTH_TEST_ENABLE, 1);
+	OUT_RING (chan, d->depth.test_enable);
 
 #if 0
-	BEGIN_RING(celsius, NV10TCL_STENCIL_ENABLE, 1);
-	OUT_RING (d->stencil.enable);
-	BEGIN_RING(celsius, NV10TCL_STENCIL_MASK, 7);
-	OUT_RINGp ((uint32_t *)&(d->stencil.wmask), 7);
+	BEGIN_RING(chan, celsius, NV10TCL_STENCIL_ENABLE, 1);
+	OUT_RING (chan, d->stencil.enable);
+	BEGIN_RING(chan, celsius, NV10TCL_STENCIL_MASK, 7);
+	OUT_RINGp (chan, (uint32_t *)&(d->stencil.wmask), 7);
 #endif
 
-	BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_ENABLE, 1);
-	OUT_RING (d->alpha.enabled);
+	BEGIN_RING(chan, celsius, NV10TCL_ALPHA_FUNC_ENABLE, 1);
+	OUT_RING (chan, d->alpha.enabled);
 
-	BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_FUNC, 1);
-	OUT_RING (d->alpha.func);
+	BEGIN_RING(chan, celsius, NV10TCL_ALPHA_FUNC_FUNC, 1);
+	OUT_RING (chan, d->alpha.func);
 
-	BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_REF, 1);
-	OUT_RING (d->alpha.ref);
+	BEGIN_RING(chan, celsius, NV10TCL_ALPHA_FUNC_REF, 1);
+	OUT_RING (chan, d->alpha.ref);
 }
 
 static void nv10_state_emit_viewport(struct nv10_context* nv10)
@@ -108,6 +121,10 @@ static void nv10_state_emit_framebuffer(struct nv10_context* nv10)
 	int colour_format = 0, zeta_format = 0;
         struct nv10_miptree *nv10mt = 0;
 
+	struct nv10_screen *screen = nv10->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *celsius = screen->celsius;
+
 	w = fb->cbufs[0]->width;
 	h = fb->cbufs[0]->height;
 	colour_format = fb->cbufs[0]->format;
@@ -144,11 +161,11 @@ static void nv10_state_emit_framebuffer(struct nv10_context* nv10)
 	}
 
 	if (zeta) {
-		BEGIN_RING(celsius, NV10TCL_RT_PITCH, 1);
-		OUT_RING  (rt->pitch | (zeta->pitch << 16));
+		BEGIN_RING(chan, celsius, NV10TCL_RT_PITCH, 1);
+		OUT_RING  (chan, rt->pitch | (zeta->pitch << 16));
 	} else {
-		BEGIN_RING(celsius, NV10TCL_RT_PITCH, 1);
-		OUT_RING  (rt->pitch | (rt->pitch << 16));
+		BEGIN_RING(chan, celsius, NV10TCL_RT_PITCH, 1);
+		OUT_RING  (chan, rt->pitch | (rt->pitch << 16));
 	}
 
 	nv10mt = (struct nv10_miptree *)rt->base.texture;
@@ -160,13 +177,13 @@ static void nv10_state_emit_framebuffer(struct nv10_context* nv10)
 		nv10->zeta = nv10mt->buffer;
 	}
 
-	BEGIN_RING(celsius, NV10TCL_RT_HORIZ, 3);
-	OUT_RING  ((w << 16) | 0);
-	OUT_RING  ((h << 16) | 0);
-	OUT_RING  (rt_format);
-	BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(0), 2);
-	OUT_RING  (((w - 1) << 16) | 0 | 0x08000800);
-	OUT_RING  (((h - 1) << 16) | 0 | 0x08000800);
+	BEGIN_RING(chan, celsius, NV10TCL_RT_HORIZ, 3);
+	OUT_RING  (chan, (w << 16) | 0);
+	OUT_RING  (chan, (h << 16) | 0);
+	OUT_RING  (chan, rt_format);
+	BEGIN_RING(chan, celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(0), 2);
+	OUT_RING  (chan, ((w - 1) << 16) | 0 | 0x08000800);
+	OUT_RING  (chan, ((h - 1) << 16) | 0 | 0x08000800);
 }
 
 static void nv10_vertex_layout(struct nv10_context *nv10)
@@ -201,6 +218,10 @@ static void nv10_vertex_layout(struct nv10_context *nv10)
 void
 nv10_emit_hw_state(struct nv10_context *nv10)
 {
+	struct nv10_screen *screen = nv10->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *celsius = screen->celsius;
+	struct nouveau_bo *rt_bo;
 	int i;
 
 	if (nv10->dirty & NV10_NEW_VERTPROG) {
@@ -269,38 +290,41 @@ nv10_emit_hw_state(struct nv10_context *nv10)
 	 */
 
 	/* Render target */
+	rt_bo = nouveau_bo(nv10->rt[0]);
 // XXX figre out who's who for NV10TCL_DMA_* and fill accordingly
-//	BEGIN_RING(celsius, NV10TCL_DMA_COLOR0, 1);
-//	OUT_RELOCo(nv10->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-	BEGIN_RING(celsius, NV10TCL_COLOR_OFFSET, 1);
-	OUT_RELOCl(nv10->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+//	BEGIN_RING(chan, celsius, NV10TCL_DMA_COLOR0, 1);
+//	OUT_RELOCo(chan, rt_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(chan, celsius, NV10TCL_COLOR_OFFSET, 1);
+	OUT_RELOCl(chan, rt_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 
 	if (nv10->zeta) {
+		struct nouveau_bo *zeta_bo = nouveau_bo(nv10->zeta);
 // XXX
-//		BEGIN_RING(celsius, NV10TCL_DMA_ZETA, 1);
-//		OUT_RELOCo(nv10->zeta, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-		BEGIN_RING(celsius, NV10TCL_ZETA_OFFSET, 1);
-		OUT_RELOCl(nv10->zeta, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+//		BEGIN_RING(chan, celsius, NV10TCL_DMA_ZETA, 1);
+//		OUT_RELOCo(chan, zeta_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		BEGIN_RING(chan, celsius, NV10TCL_ZETA_OFFSET, 1);
+		OUT_RELOCl(chan, zeta_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 		/* XXX for when we allocate LMA on nv17 */
-/*		BEGIN_RING(celsius, NV10TCL_LMA_DEPTH_BUFFER_OFFSET, 1);
-		OUT_RELOCl(nv10->zeta + lma_offset);*/
+/*		BEGIN_RING(chan, celsius, NV10TCL_LMA_DEPTH_BUFFER_OFFSET, 1);
+		OUT_RELOCl(chan, nouveau_bo(nv10->zeta + lma_offset));*/
 	}
 
 	/* Vertex buffer */
-	BEGIN_RING(celsius, NV10TCL_DMA_VTXBUF0, 1);
-	OUT_RELOCo(nv10->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-	BEGIN_RING(celsius, NV10TCL_COLOR_OFFSET, 1);
-	OUT_RELOCl(nv10->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(chan, celsius, NV10TCL_DMA_VTXBUF0, 1);
+	OUT_RELOCo(chan, rt_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(chan, celsius, NV10TCL_COLOR_OFFSET, 1);
+	OUT_RELOCl(chan, rt_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 
 	/* Texture images */
 	for (i = 0; i < 2; i++) {
 		if (!(nv10->fp_samplers & (1 << i)))
 			continue;
-		BEGIN_RING(celsius, NV10TCL_TX_OFFSET(i), 1);
-		OUT_RELOCl(nv10->tex[i].buffer, 0, NOUVEAU_BO_VRAM |
+		struct nouveau_bo *bo = nouveau_bo(nv10->tex[i].buffer);
+		BEGIN_RING(chan, celsius, NV10TCL_TX_OFFSET(i), 1);
+		OUT_RELOCl(chan, bo, 0, NOUVEAU_BO_VRAM |
 			   NOUVEAU_BO_GART | NOUVEAU_BO_RD);
-		BEGIN_RING(celsius, NV10TCL_TX_FORMAT(i), 1);
-		OUT_RELOCd(nv10->tex[i].buffer, nv10->tex[i].format,
+		BEGIN_RING(chan, celsius, NV10TCL_TX_FORMAT(i), 1);
+		OUT_RELOCd(chan, bo, nv10->tex[i].format,
 			   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
 			   NOUVEAU_BO_OR, NV10TCL_TX_FORMAT_DMA0,
 			   NV10TCL_TX_FORMAT_DMA1);
diff --git a/src/gallium/drivers/nv10/nv10_vbo.c b/src/gallium/drivers/nv10/nv10_vbo.c
index 0d26141248..9180c72c9b 100644
--- a/src/gallium/drivers/nv10/nv10_vbo.c
+++ b/src/gallium/drivers/nv10/nv10_vbo.c
@@ -9,7 +9,7 @@
 #include "nouveau/nouveau_channel.h"
 #include "nouveau/nouveau_pushbuf.h"
 
-boolean nv10_draw_elements( struct pipe_context *pipe,
+void nv10_draw_elements( struct pipe_context *pipe,
                     struct pipe_buffer *indexBuffer,
                     unsigned indexSize,
                     unsigned prim, unsigned start, unsigned count)
@@ -65,14 +65,12 @@ boolean nv10_draw_elements( struct pipe_context *pipe,
 		pipe_buffer_unmap(pscreen, indexBuffer);
 		draw_set_mapped_element_buffer(draw, 0, NULL);
 	}
-
-	return TRUE;
 }
 
-boolean nv10_draw_arrays( struct pipe_context *pipe,
-				 unsigned prim, unsigned start, unsigned count)
+void nv10_draw_arrays( struct pipe_context *pipe,
+                       unsigned prim, unsigned start, unsigned count)
 {
-	return nv10_draw_elements(pipe, NULL, 0, prim, start, count);
+	nv10_draw_elements(pipe, NULL, 0, prim, start, count);
 }
 
 
diff --git a/src/gallium/drivers/nv20/nv20_context.c b/src/gallium/drivers/nv20/nv20_context.c
index 6a147a4159..5b80af2d22 100644
--- a/src/gallium/drivers/nv20/nv20_context.c
+++ b/src/gallium/drivers/nv20/nv20_context.c
@@ -10,10 +10,14 @@ nv20_flush(struct pipe_context *pipe, unsigned flags,
 	   struct pipe_fence_handle **fence)
 {
 	struct nv20_context *nv20 = nv20_context(pipe);
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
 
 	draw_flush(nv20->draw);
 
-	FIRE_RING(fence);
+	FIRE_RING(chan);
+	if (fence)
+		*fence = NULL;
 }
 
 static void
@@ -31,348 +35,352 @@ static void nv20_init_hwctx(struct nv20_context *nv20)
 {
 	struct nv20_screen *screen = nv20->screen;
 	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
 	int i;
 	float projectionmatrix[16];
-	const boolean is_nv25tcl = (nv20->screen->kelvin->grclass == NV25TCL);
+	const boolean is_nv25tcl = (kelvin->grclass == NV25TCL);
 
-	BEGIN_RING(kelvin, NV20TCL_DMA_NOTIFY, 1);
-	OUT_RING  (screen->sync->handle);
-	BEGIN_RING(kelvin, NV20TCL_DMA_TEXTURE0, 2);
-	OUT_RING  (chan->vram->handle);
-	OUT_RING  (chan->gart->handle); /* TEXTURE1 */
-	BEGIN_RING(kelvin, NV20TCL_DMA_COLOR, 2);
-	OUT_RING  (chan->vram->handle);
-	OUT_RING  (chan->vram->handle); /* ZETA */
+	BEGIN_RING(chan, kelvin, NV20TCL_DMA_NOTIFY, 1);
+	OUT_RING  (chan, screen->sync->handle);
+	BEGIN_RING(chan, kelvin, NV20TCL_DMA_TEXTURE0, 2);
+	OUT_RING  (chan, chan->vram->handle);
+	OUT_RING  (chan, chan->gart->handle); /* TEXTURE1 */
+	BEGIN_RING(chan, kelvin, NV20TCL_DMA_COLOR, 2);
+	OUT_RING  (chan, chan->vram->handle);
+	OUT_RING  (chan, chan->vram->handle); /* ZETA */
 
-	BEGIN_RING(kelvin, NV20TCL_DMA_QUERY, 1);
-	OUT_RING  (0); /* renouveau: beef0351, unique */
+	BEGIN_RING(chan, kelvin, NV20TCL_DMA_QUERY, 1);
+	OUT_RING  (chan, 0); /* renouveau: beef0351, unique */
 
-	BEGIN_RING(kelvin, NV20TCL_RT_HORIZ, 2);
-	OUT_RING  (0);
-	OUT_RING  (0);
+	BEGIN_RING(chan, kelvin, NV20TCL_RT_HORIZ, 2);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
 
-	BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_HORIZ(0), 1);
-	OUT_RING  ((0xfff << 16) | 0x0);
-	BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_VERT(0), 1);
-	OUT_RING  ((0xfff << 16) | 0x0);
+	BEGIN_RING(chan, kelvin, NV20TCL_VIEWPORT_CLIP_HORIZ(0), 1);
+	OUT_RING  (chan, (0xfff << 16) | 0x0);
+	BEGIN_RING(chan, kelvin, NV20TCL_VIEWPORT_CLIP_VERT(0), 1);
+	OUT_RING  (chan, (0xfff << 16) | 0x0);
 
 	for (i = 1; i < NV20TCL_VIEWPORT_CLIP_HORIZ__SIZE; i++) {
-		BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_HORIZ(i), 1);
-		OUT_RING  (0);
-		BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_VERT(i), 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, kelvin, NV20TCL_VIEWPORT_CLIP_HORIZ(i), 1);
+		OUT_RING  (chan, 0);
+		BEGIN_RING(chan, kelvin, NV20TCL_VIEWPORT_CLIP_VERT(i), 1);
+		OUT_RING  (chan, 0);
 	}
 
-	BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_MODE, 1);
-	OUT_RING  (0);
+	BEGIN_RING(chan, kelvin, NV20TCL_VIEWPORT_CLIP_MODE, 1);
+	OUT_RING  (chan, 0);
 
-	BEGIN_RING(kelvin, 0x17e0, 3);
-	OUT_RINGf (0.0);
-	OUT_RINGf (0.0);
-	OUT_RINGf (1.0);
+	BEGIN_RING(chan, kelvin, 0x17e0, 3);
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 1.0);
 
 	if (is_nv25tcl) {
-		BEGIN_RING(kelvin, NV20TCL_TX_RCOMP, 1);
-		OUT_RING  (NV20TCL_TX_RCOMP_LEQUAL | 0xdb0);
+		BEGIN_RING(chan, kelvin, NV20TCL_TX_RCOMP, 1);
+		OUT_RING  (chan, NV20TCL_TX_RCOMP_LEQUAL | 0xdb0);
 	} else {
-		BEGIN_RING(kelvin, 0x1e68, 1);
-		OUT_RING  (0x4b800000); /* 16777216.000000 */
-		BEGIN_RING(kelvin, NV20TCL_TX_RCOMP, 1);
-		OUT_RING  (NV20TCL_TX_RCOMP_LEQUAL);
+		BEGIN_RING(chan, kelvin, 0x1e68, 1);
+		OUT_RING  (chan, 0x4b800000); /* 16777216.000000 */
+		BEGIN_RING(chan, kelvin, NV20TCL_TX_RCOMP, 1);
+		OUT_RING  (chan, NV20TCL_TX_RCOMP_LEQUAL);
 	}
 
-	BEGIN_RING(kelvin, 0x290, 1);
-	OUT_RING  ((0x10 << 16) | 1);
-	BEGIN_RING(kelvin, 0x9fc, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, 0x1d80, 1);
-	OUT_RING  (1);
-	BEGIN_RING(kelvin, 0x9f8, 1);
-	OUT_RING  (4);
-	BEGIN_RING(kelvin, 0x17ec, 3);
-	OUT_RINGf (0.0);
-	OUT_RINGf (1.0);
-	OUT_RINGf (0.0);
+	BEGIN_RING(chan, kelvin, 0x290, 1);
+	OUT_RING  (chan, (0x10 << 16) | 1);
+	BEGIN_RING(chan, kelvin, 0x9fc, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, 0x1d80, 1);
+	OUT_RING  (chan, 1);
+	BEGIN_RING(chan, kelvin, 0x9f8, 1);
+	OUT_RING  (chan, 4);
+	BEGIN_RING(chan, kelvin, 0x17ec, 3);
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 1.0);
+	OUT_RINGf (chan, 0.0);
 
 	if (is_nv25tcl) {
-		BEGIN_RING(kelvin, 0x1d88, 1);
-		OUT_RING  (3);
+		BEGIN_RING(chan, kelvin, 0x1d88, 1);
+		OUT_RING  (chan, 3);
 
-		BEGIN_RING(kelvin, NV25TCL_DMA_IN_MEMORY9, 1);
-		OUT_RING  (chan->vram->handle);
-		BEGIN_RING(kelvin, NV25TCL_DMA_IN_MEMORY8, 1);
-		OUT_RING  (chan->vram->handle);
+		BEGIN_RING(chan, kelvin, NV25TCL_DMA_IN_MEMORY9, 1);
+		OUT_RING  (chan, chan->vram->handle);
+		BEGIN_RING(chan, kelvin, NV25TCL_DMA_IN_MEMORY8, 1);
+		OUT_RING  (chan, chan->vram->handle);
 	}
-	BEGIN_RING(kelvin, NV20TCL_DMA_FENCE, 1);
-	OUT_RING  (0);	/* renouveau: beef1e10 */
+	BEGIN_RING(chan, kelvin, NV20TCL_DMA_FENCE, 1);
+	OUT_RING  (chan, 0);	/* renouveau: beef1e10 */
 
-	BEGIN_RING(kelvin, 0x1e98, 1);
-	OUT_RING  (0);
+	BEGIN_RING(chan, kelvin, 0x1e98, 1);
+	OUT_RING  (chan, 0);
 #if 0
 	if (is_nv25tcl) {
-		BEGIN_RING(NvSub3D, NV25TCL_DMA_IN_MEMORY4, 2);
-		OUT_RING  (NvDmaTT);	/* renouveau: beef0202 */
-		OUT_RING  (NvDmaFB);	/* renouveau: beef0201 */
+		BEGIN_RING(chan, NvSub3D, NV25TCL_DMA_IN_MEMORY4, 2);
+		OUT_RING  (chan, NvDmaTT);	/* renouveau: beef0202 */
+		OUT_RING  (chan, NvDmaFB);	/* renouveau: beef0201 */
 
-		BEGIN_RING(NvSub3D, NV20TCL_DMA_TEXTURE1, 1);
-		OUT_RING  (NvDmaTT);	/* renouveau: beef0202 */
+		BEGIN_RING(chan, NvSub3D, NV20TCL_DMA_TEXTURE1, 1);
+		OUT_RING  (chan, NvDmaTT);	/* renouveau: beef0202 */
 	}
 #endif
-	BEGIN_RING(kelvin, NV20TCL_NOTIFY, 1);
-	OUT_RING  (0);
+	BEGIN_RING(chan, kelvin, NV20TCL_NOTIFY, 1);
+	OUT_RING  (chan, 0);
 
-	BEGIN_RING(kelvin, 0x120, 3);
-	OUT_RING  (0);
-	OUT_RING  (1);
-	OUT_RING  (2);
+	BEGIN_RING(chan, kelvin, 0x120, 3);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 1);
+	OUT_RING  (chan, 2);
 
 /* error: ILLEGAL_MTHD, PROTECTION_FAULT
-	BEGIN_RING(kelvin, NV20TCL_VIEWPORT_TRANSLATE_X, 4);
-	OUT_RINGf (0.0);
-	OUT_RINGf (512.0);
-	OUT_RINGf (0.0);
-	OUT_RINGf (0.0);
+	BEGIN_RING(chan, kelvin, NV20TCL_VIEWPORT_TRANSLATE_X, 4);
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 512.0);
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 0.0);
 */
 
 	if (is_nv25tcl) {
-		BEGIN_RING(kelvin, 0x022c, 2);
-		OUT_RING  (0x280);
-		OUT_RING  (0x07d28000);
+		BEGIN_RING(chan, kelvin, 0x022c, 2);
+		OUT_RING  (chan, 0x280);
+		OUT_RING  (chan, 0x07d28000);
 	}
 
 /* * illegal method, protection fault
-	BEGIN_RING(NvSub3D, 0x1c2c, 1);
-	OUT_RING  (0); */
+	BEGIN_RING(chan, NvSub3D, 0x1c2c, 1);
+	OUT_RING  (chan, 0); */
 
 	if (is_nv25tcl) {
-		BEGIN_RING(kelvin, 0x1da4, 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, kelvin, 0x1da4, 1);
+		OUT_RING  (chan, 0);
 	}
 
 /* * crashes with illegal method, protection fault
-	BEGIN_RING(NvSub3D, 0x1c18, 1);
-	OUT_RING  (0x200); */
+	BEGIN_RING(chan, NvSub3D, 0x1c18, 1);
+	OUT_RING  (chan, 0x200); */
 
-	BEGIN_RING(kelvin, NV20TCL_RT_HORIZ, 2);
-	OUT_RING  ((0 << 16) | 0);
-	OUT_RING  ((0 << 16) | 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_RT_HORIZ, 2);
+	OUT_RING  (chan, (0 << 16) | 0);
+	OUT_RING  (chan, (0 << 16) | 0);
 
 	/* *** Set state *** */
 
-	BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_FUNC, 2);
-	OUT_RING  (NV20TCL_ALPHA_FUNC_FUNC_ALWAYS);
-	OUT_RING  (0);			/* NV20TCL_ALPHA_FUNC_REF */
+	BEGIN_RING(chan, kelvin, NV20TCL_ALPHA_FUNC_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_ALPHA_FUNC_FUNC, 2);
+	OUT_RING  (chan, NV20TCL_ALPHA_FUNC_FUNC_ALWAYS);
+	OUT_RING  (chan, 0);			/* NV20TCL_ALPHA_FUNC_REF */
 
 	for (i = 0; i < NV20TCL_TX_ENABLE__SIZE; ++i) {
-		BEGIN_RING(kelvin, NV20TCL_TX_ENABLE(i), 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, kelvin, NV20TCL_TX_ENABLE(i), 1);
+		OUT_RING  (chan, 0);
 	}
-	BEGIN_RING(kelvin, NV20TCL_TX_SHADER_OP, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_TX_SHADER_CULL_MODE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_RC_IN_ALPHA(0), 4);
-	OUT_RING  (0x30d410d0);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_RC_OUT_RGB(0), 4);
-	OUT_RING  (0x00000c00);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_RC_ENABLE, 1);
-	OUT_RING  (0x00011101);
-	BEGIN_RING(kelvin, NV20TCL_RC_FINAL0, 2);
-	OUT_RING  (0x130e0300);
-	OUT_RING  (0x0c091c80);
-	BEGIN_RING(kelvin, NV20TCL_RC_OUT_ALPHA(0), 4);
-	OUT_RING  (0x00000c00);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_RC_IN_RGB(0), 4);
-	OUT_RING  (0x20c400c0);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_RC_COLOR0, 2);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_RC_CONSTANT_COLOR0(0), 4);
-	OUT_RING  (0x035125a0);
-	OUT_RING  (0);
-	OUT_RING  (0x40002000);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_MULTISAMPLE_CONTROL, 1);
-	OUT_RING  (0xffff0000);
+	BEGIN_RING(chan, kelvin, NV20TCL_TX_SHADER_OP, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_TX_SHADER_CULL_MODE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_RC_IN_ALPHA(0), 4);
+	OUT_RING  (chan, 0x30d410d0);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_RC_OUT_RGB(0), 4);
+	OUT_RING  (chan, 0x00000c00);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_RC_ENABLE, 1);
+	OUT_RING  (chan, 0x00011101);
+	BEGIN_RING(chan, kelvin, NV20TCL_RC_FINAL0, 2);
+	OUT_RING  (chan, 0x130e0300);
+	OUT_RING  (chan, 0x0c091c80);
+	BEGIN_RING(chan, kelvin, NV20TCL_RC_OUT_ALPHA(0), 4);
+	OUT_RING  (chan, 0x00000c00);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_RC_IN_RGB(0), 4);
+	OUT_RING  (chan, 0x20c400c0);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_RC_COLOR0, 2);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_RC_CONSTANT_COLOR0(0), 4);
+	OUT_RING  (chan, 0x035125a0);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0x40002000);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_MULTISAMPLE_CONTROL, 1);
+	OUT_RING  (chan, 0xffff0000);
 
-	BEGIN_RING(kelvin, NV20TCL_BLEND_FUNC_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_DITHER_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_STENCIL_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_BLEND_FUNC_SRC, 4);
-	OUT_RING  (NV20TCL_BLEND_FUNC_SRC_ONE);
-	OUT_RING  (NV20TCL_BLEND_FUNC_DST_ZERO);
-	OUT_RING  (0);			/* NV20TCL_BLEND_COLOR */
-	OUT_RING  (NV20TCL_BLEND_EQUATION_FUNC_ADD);
-	BEGIN_RING(kelvin, NV20TCL_STENCIL_MASK, 7);
-	OUT_RING  (0xff);
-	OUT_RING  (NV20TCL_STENCIL_FUNC_FUNC_ALWAYS);
-	OUT_RING  (0);			/* NV20TCL_STENCIL_FUNC_REF */
-	OUT_RING  (0xff);		/* NV20TCL_STENCIL_FUNC_MASK */
-	OUT_RING  (NV20TCL_STENCIL_OP_FAIL_KEEP);
-	OUT_RING  (NV20TCL_STENCIL_OP_ZFAIL_KEEP);
-	OUT_RING  (NV20TCL_STENCIL_OP_ZPASS_KEEP);
+	BEGIN_RING(chan, kelvin, NV20TCL_BLEND_FUNC_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_DITHER_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_STENCIL_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_BLEND_FUNC_SRC, 4);
+	OUT_RING  (chan, NV20TCL_BLEND_FUNC_SRC_ONE);
+	OUT_RING  (chan, NV20TCL_BLEND_FUNC_DST_ZERO);
+	OUT_RING  (chan, 0);			/* NV20TCL_BLEND_COLOR */
+	OUT_RING  (chan, NV20TCL_BLEND_EQUATION_FUNC_ADD);
+	BEGIN_RING(chan, kelvin, NV20TCL_STENCIL_MASK, 7);
+	OUT_RING  (chan, 0xff);
+	OUT_RING  (chan, NV20TCL_STENCIL_FUNC_FUNC_ALWAYS);
+	OUT_RING  (chan, 0);			/* NV20TCL_STENCIL_FUNC_REF */
+	OUT_RING  (chan, 0xff);		/* NV20TCL_STENCIL_FUNC_MASK */
+	OUT_RING  (chan, NV20TCL_STENCIL_OP_FAIL_KEEP);
+	OUT_RING  (chan, NV20TCL_STENCIL_OP_ZFAIL_KEEP);
+	OUT_RING  (chan, NV20TCL_STENCIL_OP_ZPASS_KEEP);
 
-	BEGIN_RING(kelvin, NV20TCL_COLOR_LOGIC_OP_ENABLE, 2);
-	OUT_RING  (0);
-	OUT_RING  (NV20TCL_COLOR_LOGIC_OP_OP_COPY);
-	BEGIN_RING(kelvin, 0x17cc, 1);
-	OUT_RING  (0);
+	BEGIN_RING(chan, kelvin, NV20TCL_COLOR_LOGIC_OP_ENABLE, 2);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, NV20TCL_COLOR_LOGIC_OP_OP_COPY);
+	BEGIN_RING(chan, kelvin, 0x17cc, 1);
+	OUT_RING  (chan, 0);
 	if (is_nv25tcl) {
-		BEGIN_RING(kelvin, 0x1d84, 1);
-		OUT_RING  (1);
+		BEGIN_RING(chan, kelvin, 0x1d84, 1);
+		OUT_RING  (chan, 1);
 	}
-	BEGIN_RING(kelvin, NV20TCL_LIGHTING_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_LIGHT_CONTROL, 1);
-	OUT_RING  (0x00020000);
-	BEGIN_RING(kelvin, NV20TCL_SEPARATE_SPECULAR_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_LIGHT_MODEL_TWO_SIDE_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_ENABLED_LIGHTS, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_NORMALIZE_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_POLYGON_STIPPLE_PATTERN(0),
+	BEGIN_RING(chan, kelvin, NV20TCL_LIGHTING_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_LIGHT_CONTROL, 1);
+	OUT_RING  (chan, 0x00020000);
+	BEGIN_RING(chan, kelvin, NV20TCL_SEPARATE_SPECULAR_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_LIGHT_MODEL_TWO_SIDE_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_ENABLED_LIGHTS, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_NORMALIZE_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_POLYGON_STIPPLE_PATTERN(0),
 					NV20TCL_POLYGON_STIPPLE_PATTERN__SIZE);
 	for (i = 0; i < NV20TCL_POLYGON_STIPPLE_PATTERN__SIZE; ++i) {
-		OUT_RING(0xffffffff);
+		OUT_RING(chan, 0xffffffff);
 	}
 
-	BEGIN_RING(kelvin, NV20TCL_POLYGON_OFFSET_POINT_ENABLE, 3);
-	OUT_RING  (0);
-	OUT_RING  (0);		/* NV20TCL.POLYGON_OFFSET_LINE_ENABLE */
-	OUT_RING  (0);		/* NV20TCL.POLYGON_OFFSET_FILL_ENABLE */
-	BEGIN_RING(kelvin, NV20TCL_DEPTH_FUNC, 1);
-	OUT_RING  (NV20TCL_DEPTH_FUNC_LESS);
-	BEGIN_RING(kelvin, NV20TCL_DEPTH_WRITE_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_DEPTH_TEST_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_POLYGON_OFFSET_FACTOR, 2);
-	OUT_RINGf (0.0);
-	OUT_RINGf (0.0);	/* NV20TCL.POLYGON_OFFSET_UNITS */
-	BEGIN_RING(kelvin, NV20TCL_DEPTH_UNK17D8, 1);
-	OUT_RING  (1);
+	BEGIN_RING(chan, kelvin, NV20TCL_POLYGON_OFFSET_POINT_ENABLE, 3);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);		/* NV20TCL.POLYGON_OFFSET_LINE_ENABLE */
+	OUT_RING  (chan, 0);		/* NV20TCL.POLYGON_OFFSET_FILL_ENABLE */
+	BEGIN_RING(chan, kelvin, NV20TCL_DEPTH_FUNC, 1);
+	OUT_RING  (chan, NV20TCL_DEPTH_FUNC_LESS);
+	BEGIN_RING(chan, kelvin, NV20TCL_DEPTH_WRITE_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_DEPTH_TEST_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_POLYGON_OFFSET_FACTOR, 2);
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 0.0);	/* NV20TCL.POLYGON_OFFSET_UNITS */
+	BEGIN_RING(chan, kelvin, NV20TCL_DEPTH_UNK17D8, 1);
+	OUT_RING  (chan, 1);
 	if (!is_nv25tcl) {
-		BEGIN_RING(kelvin, 0x1d84, 1);
-		OUT_RING  (3);
+		BEGIN_RING(chan, kelvin, 0x1d84, 1);
+		OUT_RING  (chan, 3);
 	}
-	BEGIN_RING(kelvin, NV20TCL_POINT_SIZE, 1);
+	BEGIN_RING(chan, kelvin, NV20TCL_POINT_SIZE, 1);
 	if (!is_nv25tcl) {
-		OUT_RING  (8);
+		OUT_RING  (chan, 8);
 	} else {
-		OUT_RINGf (1.0);
+		OUT_RINGf (chan, 1.0);
 	}
 	if (!is_nv25tcl) {
-		BEGIN_RING(kelvin, NV20TCL_POINT_PARAMETERS_ENABLE, 2);
-		OUT_RING  (0);
-		OUT_RING  (0);		/* NV20TCL.POINT_SMOOTH_ENABLE */
+		BEGIN_RING(chan, kelvin, NV20TCL_POINT_PARAMETERS_ENABLE, 2);
+		OUT_RING  (chan, 0);
+		OUT_RING  (chan, 0);		/* NV20TCL.POINT_SMOOTH_ENABLE */
 	} else {
-		BEGIN_RING(kelvin, NV20TCL_POINT_PARAMETERS_ENABLE, 1);
-		OUT_RING  (0);
-		BEGIN_RING(kelvin, 0x0a1c, 1);
-		OUT_RING  (0x800);
+		BEGIN_RING(chan, kelvin, NV20TCL_POINT_PARAMETERS_ENABLE, 1);
+		OUT_RING  (chan, 0);
+		BEGIN_RING(chan, kelvin, 0x0a1c, 1);
+		OUT_RING  (chan, 0x800);
 	}
-	BEGIN_RING(kelvin, NV20TCL_LINE_WIDTH, 1);
-	OUT_RING  (8);
-	BEGIN_RING(kelvin, NV20TCL_LINE_SMOOTH_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_POLYGON_MODE_FRONT, 2);
-	OUT_RING  (NV20TCL_POLYGON_MODE_FRONT_FILL);
-	OUT_RING  (NV20TCL_POLYGON_MODE_BACK_FILL);
-	BEGIN_RING(kelvin, NV20TCL_CULL_FACE, 2);
-	OUT_RING  (NV20TCL_CULL_FACE_BACK);
-	OUT_RING  (NV20TCL_FRONT_FACE_CCW);
-	BEGIN_RING(kelvin, NV20TCL_POLYGON_SMOOTH_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_CULL_FACE_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_SHADE_MODEL, 1);
-	OUT_RING  (NV20TCL_SHADE_MODEL_SMOOTH);
-	BEGIN_RING(kelvin, NV20TCL_POLYGON_STIPPLE_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_TX_GEN_S(0), 4 * NV20TCL_TX_GEN_S__SIZE);
+	BEGIN_RING(chan, kelvin, NV20TCL_LINE_WIDTH, 1);
+	OUT_RING  (chan, 8);
+	BEGIN_RING(chan, kelvin, NV20TCL_LINE_SMOOTH_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_POLYGON_MODE_FRONT, 2);
+	OUT_RING  (chan, NV20TCL_POLYGON_MODE_FRONT_FILL);
+	OUT_RING  (chan, NV20TCL_POLYGON_MODE_BACK_FILL);
+	BEGIN_RING(chan, kelvin, NV20TCL_CULL_FACE, 2);
+	OUT_RING  (chan, NV20TCL_CULL_FACE_BACK);
+	OUT_RING  (chan, NV20TCL_FRONT_FACE_CCW);
+	BEGIN_RING(chan, kelvin, NV20TCL_POLYGON_SMOOTH_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_CULL_FACE_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_SHADE_MODEL, 1);
+	OUT_RING  (chan, NV20TCL_SHADE_MODEL_SMOOTH);
+	BEGIN_RING(chan, kelvin, NV20TCL_POLYGON_STIPPLE_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_TX_GEN_S(0), 4 * NV20TCL_TX_GEN_S__SIZE);
 	for (i=0; i < 4 * NV20TCL_TX_GEN_S__SIZE; ++i) {
-		OUT_RING(0);
+		OUT_RING(chan, 0);
 	}
-	BEGIN_RING(kelvin, NV20TCL_FOG_EQUATION_CONSTANT, 3);
-	OUT_RINGf (1.5);
-	OUT_RINGf (-0.090168);		/* NV20TCL.FOG_EQUATION_LINEAR */
-	OUT_RINGf (0.0);		/* NV20TCL.FOG_EQUATION_QUADRATIC */
-	BEGIN_RING(kelvin, NV20TCL_FOG_MODE, 2);
-	OUT_RING  (NV20TCL_FOG_MODE_EXP_2);
-	OUT_RING  (NV20TCL_FOG_COORD_DIST_COORD_FOG);
-	BEGIN_RING(kelvin, NV20TCL_FOG_ENABLE, 2);
-	OUT_RING  (0);
-	OUT_RING  (0);			/* NV20TCL.FOG_COLOR */
-	BEGIN_RING(kelvin, NV20TCL_ENGINE, 1);
-	OUT_RING  (NV20TCL_ENGINE_FIXED);
+	BEGIN_RING(chan, kelvin, NV20TCL_FOG_EQUATION_CONSTANT, 3);
+	OUT_RINGf (chan, 1.5);
+	OUT_RINGf (chan, -0.090168);		/* NV20TCL.FOG_EQUATION_LINEAR */
+	OUT_RINGf (chan, 0.0);		/* NV20TCL.FOG_EQUATION_QUADRATIC */
+	BEGIN_RING(chan, kelvin, NV20TCL_FOG_MODE, 2);
+	OUT_RING  (chan, NV20TCL_FOG_MODE_EXP_SIGNED);
+	OUT_RING  (chan, NV20TCL_FOG_COORD_FOG);
+	BEGIN_RING(chan, kelvin, NV20TCL_FOG_ENABLE, 2);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);			/* NV20TCL.FOG_COLOR */
+	BEGIN_RING(chan, kelvin, NV20TCL_ENGINE, 1);
+	OUT_RING  (chan, NV20TCL_ENGINE_FIXED);
 
 	for (i = 0; i < NV20TCL_TX_MATRIX_ENABLE__SIZE; ++i) {
-		BEGIN_RING(kelvin, NV20TCL_TX_MATRIX_ENABLE(i), 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, kelvin, NV20TCL_TX_MATRIX_ENABLE(i), 1);
+		OUT_RING  (chan, 0);
 	}
 
-	BEGIN_RING(kelvin, NV20TCL_VTX_ATTR_4F_X(1), 4 * 15);
-	OUT_RINGf(1.0); OUT_RINGf(0.0); OUT_RINGf(0.0); OUT_RINGf(1.0);
-	OUT_RINGf(0.0); OUT_RINGf(0.0); OUT_RINGf(1.0); OUT_RINGf(1.0);
-	OUT_RINGf(1.0); OUT_RINGf(1.0); OUT_RINGf(1.0); OUT_RINGf(1.0);
+	BEGIN_RING(chan, kelvin, NV20TCL_VTX_ATTR_4F_X(1), 4 * 15);
+	OUT_RINGf(chan, 1.0); OUT_RINGf(chan, 0.0); OUT_RINGf(chan, 0.0); OUT_RINGf(chan, 1.0);
+	OUT_RINGf(chan, 0.0); OUT_RINGf(chan, 0.0); OUT_RINGf(chan, 1.0); OUT_RINGf(chan, 1.0);
+	OUT_RINGf(chan, 1.0); OUT_RINGf(chan, 1.0); OUT_RINGf(chan, 1.0); OUT_RINGf(chan, 1.0);
 	for (i = 4; i < 16; ++i) {
-		OUT_RINGf(0.0); OUT_RINGf(0.0); OUT_RINGf(0.0);	OUT_RINGf(1.0);
+		OUT_RINGf(chan, 0.0);
+		OUT_RINGf(chan, 0.0);
+		OUT_RINGf(chan, 0.0);
+		OUT_RINGf(chan, 1.0);
 	}
 
-	BEGIN_RING(kelvin, NV20TCL_EDGEFLAG_ENABLE, 1);
-	OUT_RING  (1);
-	BEGIN_RING(kelvin, NV20TCL_COLOR_MASK, 1);
-	OUT_RING (0x00010101);
-	BEGIN_RING(kelvin, NV20TCL_CLEAR_VALUE, 1);
-	OUT_RING (0);
+	BEGIN_RING(chan, kelvin, NV20TCL_EDGEFLAG_ENABLE, 1);
+	OUT_RING  (chan, 1);
+	BEGIN_RING(chan, kelvin, NV20TCL_COLOR_MASK, 1);
+	OUT_RING (chan, 0x00010101);
+	BEGIN_RING(chan, kelvin, NV20TCL_CLEAR_VALUE, 1);
+	OUT_RING (chan, 0);
 
 	memset(projectionmatrix, 0, sizeof(projectionmatrix));
 	projectionmatrix[0*4+0] = 1.0;
 	projectionmatrix[1*4+1] = 1.0;
 	projectionmatrix[2*4+2] = 16777215.0;
 	projectionmatrix[3*4+3] = 1.0;
-	BEGIN_RING(kelvin, NV20TCL_PROJECTION_MATRIX(0), 16);
+	BEGIN_RING(chan, kelvin, NV20TCL_PROJECTION_MATRIX(0), 16);
 	for (i = 0; i < 16; i++) {
-		OUT_RINGf  (projectionmatrix[i]);
+		OUT_RINGf  (chan, projectionmatrix[i]);
 	}
 
-	BEGIN_RING(kelvin, NV20TCL_DEPTH_RANGE_NEAR, 2);
-	OUT_RINGf (0.0);
-	OUT_RINGf (16777216.0); /* [0, 1] scaled approx to [0, 2^24] */
+	BEGIN_RING(chan, kelvin, NV20TCL_DEPTH_RANGE_NEAR, 2);
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 16777216.0); /* [0, 1] scaled approx to [0, 2^24] */
 
-	BEGIN_RING(kelvin, NV20TCL_VIEWPORT_TRANSLATE_X, 4);
-	OUT_RINGf (0.0); /* x-offset, w/2 + 1.031250 */
-	OUT_RINGf (0.0); /* y-offset, h/2 + 0.030762 */
-	OUT_RINGf (0.0);
-	OUT_RINGf (16777215.0);
+	BEGIN_RING(chan, kelvin, NV20TCL_VIEWPORT_TRANSLATE_X, 4);
+	OUT_RINGf (chan, 0.0); /* x-offset, w/2 + 1.031250 */
+	OUT_RINGf (chan, 0.0); /* y-offset, h/2 + 0.030762 */
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 16777215.0);
 
-	BEGIN_RING(kelvin, NV20TCL_VIEWPORT_SCALE_X, 4);
-	OUT_RINGf (0.0); /* no effect?, w/2 */
-	OUT_RINGf (0.0); /* no effect?, h/2 */
-	OUT_RINGf (16777215.0 * 0.5);
-	OUT_RINGf (65535.0);
+	BEGIN_RING(chan, kelvin, NV20TCL_VIEWPORT_SCALE_X, 4);
+	OUT_RINGf (chan, 0.0); /* no effect?, w/2 */
+	OUT_RINGf (chan, 0.0); /* no effect?, h/2 */
+	OUT_RINGf (chan, 16777215.0 * 0.5);
+	OUT_RINGf (chan, 65535.0);
 
-	FIRE_RING (NULL);
+	FIRE_RING (chan);
 }
 
 struct pipe_context *
diff --git a/src/gallium/drivers/nv20/nv20_context.h b/src/gallium/drivers/nv20/nv20_context.h
index a4eaa95660..c7dfadaa31 100644
--- a/src/gallium/drivers/nv20/nv20_context.h
+++ b/src/gallium/drivers/nv20/nv20_context.h
@@ -15,10 +15,6 @@
 #include "nouveau/nouveau_gldefs.h"
 #include "nouveau/nouveau_context.h"
 
-#define NOUVEAU_PUSH_CONTEXT(ctx)                                              \
-	struct nv20_screen *ctx = nv20->screen
-#include "nouveau/nouveau_push.h"
-
 #include "nv20_state.h"
 
 #define NOUVEAU_ERR(fmt, args...) \
@@ -143,9 +139,9 @@ extern void nv20_emit_hw_state(struct nv20_context *nv20);
 extern void nv20_state_tex_update(struct nv20_context *nv20);
 
 /* nv20_vbo.c */
-extern boolean nv20_draw_arrays(struct pipe_context *, unsigned mode,
+extern void nv20_draw_arrays(struct pipe_context *, unsigned mode,
 				unsigned start, unsigned count);
-extern boolean nv20_draw_elements( struct pipe_context *pipe,
+extern void nv20_draw_elements( struct pipe_context *pipe,
                     struct pipe_buffer *indexBuffer,
                     unsigned indexSize,
                     unsigned prim, unsigned start, unsigned count);
diff --git a/src/gallium/drivers/nv20/nv20_fragtex.c b/src/gallium/drivers/nv20/nv20_fragtex.c
index 2db4a4015a..dedbec73f3 100644
--- a/src/gallium/drivers/nv20/nv20_fragtex.c
+++ b/src/gallium/drivers/nv20/nv20_fragtex.c
@@ -52,6 +52,9 @@ nv20_fragtex_build(struct nv20_context *nv20, int unit)
 	struct nv20_miptree *nv20mt = nv20->tex_miptree[unit];
 	struct pipe_texture *pt = &nv20mt->base;
 	struct nv20_texture_format *tf;
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
 	uint32_t txf, txs, txp;
 
 	tf = nv20_fragtex_format(pt->format);
@@ -82,15 +85,15 @@ nv20_fragtex_build(struct nv20_context *nv20, int unit)
 		return;
 	}
 
-	BEGIN_RING(kelvin, NV10TCL_TX_OFFSET(unit), 8);
-	OUT_RELOCl(nv20mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
-	OUT_RELOCd(nv20mt->buffer,txf,NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
-	OUT_RING  (ps->wrap);
-	OUT_RING  (0x40000000); /* enable */
-	OUT_RING  (txs);
-	OUT_RING  (ps->filt | 0x2000 /* magic */);
-	OUT_RING  ((pt->width0 << 16) | pt->height0);
-	OUT_RING  (ps->bcol);
+	BEGIN_RING(chan, kelvin, NV10TCL_TX_OFFSET(unit), 8);
+	OUT_RELOCl(chan, nouveau_bo(nv20mt->buffer), 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+	OUT_RELOCd(chan, nouveau_bo(nv20mt->buffer),txf,NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
+	OUT_RING  (chan, ps->wrap);
+	OUT_RING  (chan, 0x40000000); /* enable */
+	OUT_RING  (chan, txs);
+	OUT_RING  (chan, ps->filt | 0x2000 /* magic */);
+	OUT_RING  (chan, (pt->width0 << 16) | pt->height0);
+	OUT_RING  (chan, ps->bcol);
 #endif
 }
 
@@ -99,6 +102,9 @@ nv20_fragtex_bind(struct nv20_context *nv20)
 {
 #if 0
 	struct nv20_fragment_program *fp = nv20->fragprog.active;
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
 	unsigned samplers, unit;
 
 	samplers = nv20->fp_samplers & ~fp->samplers;
@@ -106,8 +112,8 @@ nv20_fragtex_bind(struct nv20_context *nv20)
 		unit = ffs(samplers) - 1;
 		samplers &= ~(1 << unit);
 
-		BEGIN_RING(kelvin, NV10TCL_TX_ENABLE(unit), 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, kelvin, NV10TCL_TX_ENABLE(unit), 1);
+		OUT_RING  (chan, 0);
 	}
 
 	samplers = nv20->dirty_samplers & fp->samplers;
diff --git a/src/gallium/drivers/nv20/nv20_prim_vbuf.c b/src/gallium/drivers/nv20/nv20_prim_vbuf.c
index ddfcdb8057..2e145672da 100644
--- a/src/gallium/drivers/nv20/nv20_prim_vbuf.c
+++ b/src/gallium/drivers/nv20/nv20_prim_vbuf.c
@@ -81,12 +81,15 @@ nv20_vbuf_render(struct vbuf_render *render)
 void nv20_vtxbuf_bind( struct nv20_context* nv20 )
 {
 #if 0
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
 	int i;
 	for(i = 0; i < NV20TCL_VTXBUF_ADDRESS__SIZE; i++) {
-		BEGIN_RING(kelvin, NV20TCL_VTXBUF_ADDRESS(i), 1);
-		OUT_RING(0/*nv20->vtxbuf*/);
-		BEGIN_RING(kelvin, NV20TCL_VTXFMT(i) ,1);
-		OUT_RING(0/*XXX*/);
+		BEGIN_RING(chan, kelvin, NV20TCL_VTXBUF_ADDRESS(i), 1);
+		OUT_RING(chan, 0/*nv20->vtxbuf*/);
+		BEGIN_RING(chan, kelvin, NV20TCL_VTXFMT(i) ,1);
+		OUT_RING(chan, 0/*XXX*/);
 	}
 #endif
 }
@@ -202,6 +205,9 @@ nv20__vtxhwformat(unsigned stride, unsigned fields, unsigned type)
 static unsigned
 nv20__emit_format(struct nv20_context *nv20, enum attrib_emit type, int hwattr)
 {
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
 	uint32_t hwfmt = 0;
 	unsigned fields;
 
@@ -231,8 +237,8 @@ nv20__emit_format(struct nv20_context *nv20, enum attrib_emit type, int hwattr)
 		return 0;
 	}
 
-	BEGIN_RING(kelvin, NV20TCL_VTXFMT(hwattr), 1);
-	OUT_RING(hwfmt);
+	BEGIN_RING(chan, kelvin, NV20TCL_VTXFMT(hwattr), 1);
+	OUT_RING(chan, hwfmt);
 	return fields;
 }
 
@@ -262,6 +268,9 @@ nv20__draw_mbuffer(struct nv20_vbuf_render *nv20_render,
 		uint nr_indices)
 {
 	struct nv20_context *nv20 = nv20_render->nv20;
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
 	struct vertex_info *vinfo = &nv20->vertex_info;
 	unsigned nr_fields;
 	int max_push;
@@ -270,29 +279,29 @@ nv20__draw_mbuffer(struct nv20_vbuf_render *nv20_render,
 
 	nr_fields = nv20__emit_vertex_array_format(nv20);
 
-	BEGIN_RING(kelvin, NV20TCL_VERTEX_BEGIN_END, 1);
-	OUT_RING(nv20_render->hwprim);
+	BEGIN_RING(chan, kelvin, NV20TCL_VERTEX_BEGIN_END, 1);
+	OUT_RING(chan, nv20_render->hwprim);
 
 	max_push = 1200 / nr_fields;
 	while (nr_indices) {
 		int i;
 		int push = MIN2(nr_indices, max_push);
 
-		BEGIN_RING_NI(kelvin, NV20TCL_VERTEX_DATA, push * nr_fields);
+		BEGIN_RING_NI(chan, kelvin, NV20TCL_VERTEX_DATA, push * nr_fields);
 		for (i = 0; i < push; i++) {
 			/* XXX: fixme to handle other than floats? */
 			int f = nr_fields;
 			float *attrv = (float*)&data[indices[i] * vsz];
 			while (f-- > 0)
-				OUT_RINGf(*attrv++);
+				OUT_RINGf(chan, *attrv++);
 		}
 
 		nr_indices -= push;
 		indices += push;
 	}
 
-	BEGIN_RING(kelvin, NV20TCL_VERTEX_BEGIN_END, 1);
-	OUT_RING(NV20TCL_VERTEX_BEGIN_END_STOP);
+	BEGIN_RING(chan, kelvin, NV20TCL_VERTEX_BEGIN_END, 1);
+	OUT_RING(chan, NV20TCL_VERTEX_BEGIN_END_STOP);
 }
 
 static void
@@ -301,20 +310,23 @@ nv20__draw_pbuffer(struct nv20_vbuf_render *nv20_render,
 		uint nr_indices)
 {
 	struct nv20_context *nv20 = nv20_render->nv20;
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
 	int push, i;
 
 	NOUVEAU_ERR("nv20__draw_pbuffer: this path is broken.\n");
 
-	BEGIN_RING(kelvin, NV10TCL_VERTEX_ARRAY_OFFSET_POS, 1);
-	OUT_RELOCl(nv20_render->pbuffer, 0,
+	BEGIN_RING(chan, kelvin, NV10TCL_VERTEX_ARRAY_OFFSET_POS, 1);
+	OUT_RELOCl(chan, nouveau_bo(nv20_render->pbuffer), 0,
 			NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
 
-	BEGIN_RING(kelvin, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
-	OUT_RING(nv20_render->hwprim);
+	BEGIN_RING(chan, kelvin, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
+	OUT_RING(chan, nv20_render->hwprim);
 
 	if (nr_indices & 1) {
-		BEGIN_RING(kelvin, NV10TCL_VB_ELEMENT_U32, 1);
-		OUT_RING  (indices[0]);
+		BEGIN_RING(chan, kelvin, NV10TCL_VB_ELEMENT_U32, 1);
+		OUT_RING  (chan, indices[0]);
 		indices++; nr_indices--;
 	}
 
@@ -322,16 +334,16 @@ nv20__draw_pbuffer(struct nv20_vbuf_render *nv20_render,
 		// XXX too big/small ? check the size
 		push = MIN2(nr_indices, 1200 * 2);
 
-		BEGIN_RING_NI(kelvin, NV10TCL_VB_ELEMENT_U16, push >> 1);
+		BEGIN_RING_NI(chan, kelvin, NV10TCL_VB_ELEMENT_U16, push >> 1);
 		for (i = 0; i < push; i+=2)
-			OUT_RING((indices[i+1] << 16) | indices[i]);
+			OUT_RING(chan, (indices[i+1] << 16) | indices[i]);
 
 		nr_indices -= push;
 		indices  += push;
 	}
 
-	BEGIN_RING(kelvin, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
-	OUT_RING  (0);
+	BEGIN_RING(chan, kelvin, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
+	OUT_RING  (chan, 0);
 }
 
 static void
diff --git a/src/gallium/drivers/nv20/nv20_screen.c b/src/gallium/drivers/nv20/nv20_screen.c
index a0973f1ebd..d091335063 100644
--- a/src/gallium/drivers/nv20/nv20_screen.c
+++ b/src/gallium/drivers/nv20/nv20_screen.c
@@ -176,7 +176,6 @@ nv20_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
 		return FALSE;
 	}
-	BIND_RING(chan, screen->kelvin, 7);
 
 	/* 2D engine setup */
 	screen->eng2d = nv04_surface_2d_init(&screen->base);
diff --git a/src/gallium/drivers/nv20/nv20_state_emit.c b/src/gallium/drivers/nv20/nv20_state_emit.c
index 63cba1f412..6bbd1fdae9 100644
--- a/src/gallium/drivers/nv20/nv20_state_emit.c
+++ b/src/gallium/drivers/nv20/nv20_state_emit.c
@@ -5,27 +5,34 @@
 static void nv20_state_emit_blend(struct nv20_context* nv20)
 {
 	struct nv20_blend_state *b = nv20->blend;
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
 
-	BEGIN_RING(kelvin, NV20TCL_DITHER_ENABLE, 1);
-	OUT_RING  (b->d_enable);
+	BEGIN_RING(chan, kelvin, NV20TCL_DITHER_ENABLE, 1);
+	OUT_RING  (chan, b->d_enable);
 
-	BEGIN_RING(kelvin, NV20TCL_BLEND_FUNC_ENABLE, 1);
-	OUT_RING  (b->b_enable);
+	BEGIN_RING(chan, kelvin, NV20TCL_BLEND_FUNC_ENABLE, 1);
+	OUT_RING  (chan, b->b_enable);
 
-	BEGIN_RING(kelvin, NV20TCL_BLEND_FUNC_SRC, 2);
-	OUT_RING  (b->b_srcfunc);
-	OUT_RING  (b->b_dstfunc);
+	BEGIN_RING(chan, kelvin, NV20TCL_BLEND_FUNC_SRC, 2);
+	OUT_RING  (chan, b->b_srcfunc);
+	OUT_RING  (chan, b->b_dstfunc);
 
-	BEGIN_RING(kelvin, NV20TCL_COLOR_MASK, 1);
-	OUT_RING  (b->c_mask);
+	BEGIN_RING(chan, kelvin, NV20TCL_COLOR_MASK, 1);
+	OUT_RING  (chan, b->c_mask);
 }
 
 static void nv20_state_emit_blend_color(struct nv20_context* nv20)
 {
 	struct pipe_blend_color *c = nv20->blend_color;
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
 
-	BEGIN_RING(kelvin, NV20TCL_BLEND_COLOR, 1);
-	OUT_RING  ((float_to_ubyte(c->color[3]) << 24)|
+	BEGIN_RING(chan, kelvin, NV20TCL_BLEND_COLOR, 1);
+	OUT_RING  (chan,
+		   (float_to_ubyte(c->color[3]) << 24)|
 		   (float_to_ubyte(c->color[0]) << 16)|
 		   (float_to_ubyte(c->color[1]) << 8) |
 		   (float_to_ubyte(c->color[2]) << 0));
@@ -34,63 +41,69 @@ static void nv20_state_emit_blend_color(struct nv20_context* nv20)
 static void nv20_state_emit_rast(struct nv20_context* nv20)
 {
 	struct nv20_rasterizer_state *r = nv20->rast;
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
 
-	BEGIN_RING(kelvin, NV20TCL_SHADE_MODEL, 2);
-	OUT_RING  (r->shade_model);
-	OUT_RING  (r->line_width);
+	BEGIN_RING(chan, kelvin, NV20TCL_SHADE_MODEL, 2);
+	OUT_RING  (chan, r->shade_model);
+	OUT_RING  (chan, r->line_width);
 
 
-	BEGIN_RING(kelvin, NV20TCL_POINT_SIZE, 1);
-	OUT_RING  (r->point_size);
+	BEGIN_RING(chan, kelvin, NV20TCL_POINT_SIZE, 1);
+	OUT_RING  (chan, r->point_size);
 
-	BEGIN_RING(kelvin, NV20TCL_POLYGON_MODE_FRONT, 2);
-	OUT_RING  (r->poly_mode_front);
-	OUT_RING  (r->poly_mode_back);
+	BEGIN_RING(chan, kelvin, NV20TCL_POLYGON_MODE_FRONT, 2);
+	OUT_RING  (chan, r->poly_mode_front);
+	OUT_RING  (chan, r->poly_mode_back);
 
 
-	BEGIN_RING(kelvin, NV20TCL_CULL_FACE, 2);
-	OUT_RING  (r->cull_face);
-	OUT_RING  (r->front_face);
+	BEGIN_RING(chan, kelvin, NV20TCL_CULL_FACE, 2);
+	OUT_RING  (chan, r->cull_face);
+	OUT_RING  (chan, r->front_face);
 
-	BEGIN_RING(kelvin, NV20TCL_LINE_SMOOTH_ENABLE, 2);
-	OUT_RING  (r->line_smooth_en);
-	OUT_RING  (r->poly_smooth_en);
+	BEGIN_RING(chan, kelvin, NV20TCL_LINE_SMOOTH_ENABLE, 2);
+	OUT_RING  (chan, r->line_smooth_en);
+	OUT_RING  (chan, r->poly_smooth_en);
 
-	BEGIN_RING(kelvin, NV20TCL_CULL_FACE_ENABLE, 1);
-	OUT_RING  (r->cull_face_en);
+	BEGIN_RING(chan, kelvin, NV20TCL_CULL_FACE_ENABLE, 1);
+	OUT_RING  (chan, r->cull_face_en);
 }
 
 static void nv20_state_emit_dsa(struct nv20_context* nv20)
 {
 	struct nv20_depth_stencil_alpha_state *d = nv20->dsa;
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
 
-	BEGIN_RING(kelvin, NV20TCL_DEPTH_FUNC, 1);
-	OUT_RING (d->depth.func);
+	BEGIN_RING(chan, kelvin, NV20TCL_DEPTH_FUNC, 1);
+	OUT_RING (chan, d->depth.func);
 
-	BEGIN_RING(kelvin, NV20TCL_DEPTH_WRITE_ENABLE, 1);
-	OUT_RING (d->depth.write_enable);
+	BEGIN_RING(chan, kelvin, NV20TCL_DEPTH_WRITE_ENABLE, 1);
+	OUT_RING (chan, d->depth.write_enable);
 
-	BEGIN_RING(kelvin, NV20TCL_DEPTH_TEST_ENABLE, 1);
-	OUT_RING (d->depth.test_enable);
+	BEGIN_RING(chan, kelvin, NV20TCL_DEPTH_TEST_ENABLE, 1);
+	OUT_RING (chan, d->depth.test_enable);
 
-	BEGIN_RING(kelvin, NV20TCL_DEPTH_UNK17D8, 1);
-	OUT_RING (1);
+	BEGIN_RING(chan, kelvin, NV20TCL_DEPTH_UNK17D8, 1);
+	OUT_RING (chan, 1);
 
 #if 0
-	BEGIN_RING(kelvin, NV20TCL_STENCIL_ENABLE, 1);
-	OUT_RING (d->stencil.enable);
-	BEGIN_RING(kelvin, NV20TCL_STENCIL_MASK, 7);
-	OUT_RINGp ((uint32_t *)&(d->stencil.wmask), 7);
+	BEGIN_RING(chan, kelvin, NV20TCL_STENCIL_ENABLE, 1);
+	OUT_RING (chan, d->stencil.enable);
+	BEGIN_RING(chan, kelvin, NV20TCL_STENCIL_MASK, 7);
+	OUT_RINGp (chan, (uint32_t *)&(d->stencil.wmask), 7);
 #endif
 
-	BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_ENABLE, 1);
-	OUT_RING (d->alpha.enabled);
+	BEGIN_RING(chan, kelvin, NV20TCL_ALPHA_FUNC_ENABLE, 1);
+	OUT_RING (chan, d->alpha.enabled);
 
-	BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_FUNC, 1);
-	OUT_RING (d->alpha.func);
+	BEGIN_RING(chan, kelvin, NV20TCL_ALPHA_FUNC_FUNC, 1);
+	OUT_RING (chan, d->alpha.func);
 
-	BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_REF, 1);
-	OUT_RING (d->alpha.ref);
+	BEGIN_RING(chan, kelvin, NV20TCL_ALPHA_FUNC_REF, 1);
+	OUT_RING (chan, d->alpha.ref);
 }
 
 static void nv20_state_emit_viewport(struct nv20_context* nv20)
@@ -101,9 +114,13 @@ static void nv20_state_emit_scissor(struct nv20_context* nv20)
 {
 	/* NV20TCL_SCISSOR_* is probably a software method */
 /*	struct pipe_scissor_state *s = nv20->scissor;
-	BEGIN_RING(kelvin, NV20TCL_SCISSOR_HORIZ, 2);
-	OUT_RING  (((s->maxx - s->minx) << 16) | s->minx);
-	OUT_RING  (((s->maxy - s->miny) << 16) | s->miny);*/
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
+
+	BEGIN_RING(chan, kelvin, NV20TCL_SCISSOR_HORIZ, 2);
+	OUT_RING  (chan, ((s->maxx - s->minx) << 16) | s->minx);
+	OUT_RING  (chan, ((s->maxy - s->miny) << 16) | s->miny);*/
 }
 
 static void nv20_state_emit_framebuffer(struct nv20_context* nv20)
@@ -113,6 +130,9 @@ static void nv20_state_emit_framebuffer(struct nv20_context* nv20)
 	uint32_t rt_format, w, h;
 	int colour_format = 0, zeta_format = 0;
 	struct nv20_miptree *nv20mt = 0;
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
 
 	w = fb->cbufs[0]->width;
 	h = fb->cbufs[0]->height;
@@ -150,11 +170,11 @@ static void nv20_state_emit_framebuffer(struct nv20_context* nv20)
 	}
 
 	if (zeta) {
-		BEGIN_RING(kelvin, NV20TCL_RT_PITCH, 1);
-		OUT_RING  (rt->pitch | (zeta->pitch << 16));
+		BEGIN_RING(chan, kelvin, NV20TCL_RT_PITCH, 1);
+		OUT_RING  (chan, rt->pitch | (zeta->pitch << 16));
 	} else {
-		BEGIN_RING(kelvin, NV20TCL_RT_PITCH, 1);
-		OUT_RING  (rt->pitch | (rt->pitch << 16));
+		BEGIN_RING(chan, kelvin, NV20TCL_RT_PITCH, 1);
+		OUT_RING  (chan, rt->pitch | (rt->pitch << 16));
 	}
 
 	nv20mt = (struct nv20_miptree *)rt->base.texture;
@@ -166,13 +186,13 @@ static void nv20_state_emit_framebuffer(struct nv20_context* nv20)
 		nv20->zeta = nv20mt->buffer;
 	}
 
-	BEGIN_RING(kelvin, NV20TCL_RT_HORIZ, 3);
-	OUT_RING  ((w << 16) | 0);
-	OUT_RING  ((h << 16) | 0); /*NV20TCL_RT_VERT */
-	OUT_RING  (rt_format); /* NV20TCL_RT_FORMAT */
-	BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_HORIZ(0), 2);
-	OUT_RING  (((w - 1) << 16) | 0);
-	OUT_RING  (((h - 1) << 16) | 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_RT_HORIZ, 3);
+	OUT_RING  (chan, (w << 16) | 0);
+	OUT_RING  (chan, (h << 16) | 0); /*NV20TCL_RT_VERT */
+	OUT_RING  (chan, rt_format); /* NV20TCL_RT_FORMAT */
+	BEGIN_RING(chan, kelvin, NV20TCL_VIEWPORT_CLIP_HORIZ(0), 2);
+	OUT_RING  (chan, ((w - 1) << 16) | 0);
+	OUT_RING  (chan, ((h - 1) << 16) | 0);
 }
 
 static void nv20_vertex_layout(struct nv20_context *nv20)
@@ -293,6 +313,10 @@ static void nv20_vertex_layout(struct nv20_context *nv20)
 void
 nv20_emit_hw_state(struct nv20_context *nv20)
 {
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
+	struct nouveau_bo *rt_bo;
 	int i;
 
 	if (nv20->dirty & NV20_NEW_VERTPROG) {
@@ -361,36 +385,39 @@ nv20_emit_hw_state(struct nv20_context *nv20)
 	 */
 
 	/* Render target */
-	BEGIN_RING(kelvin, NV20TCL_DMA_COLOR, 1);
-	OUT_RELOCo(nv20->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-	BEGIN_RING(kelvin, NV20TCL_COLOR_OFFSET, 1);
-	OUT_RELOCl(nv20->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	rt_bo = nouveau_bo(nv20->rt[0]);
+	BEGIN_RING(chan, kelvin, NV20TCL_DMA_COLOR, 1);
+	OUT_RELOCo(chan, rt_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(chan, kelvin, NV20TCL_COLOR_OFFSET, 1);
+	OUT_RELOCl(chan, rt_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 
 	if (nv20->zeta) {
-		BEGIN_RING(kelvin, NV20TCL_DMA_ZETA, 1);
-		OUT_RELOCo(nv20->zeta, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-		BEGIN_RING(kelvin, NV20TCL_ZETA_OFFSET, 1);
-		OUT_RELOCl(nv20->zeta, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		struct nouveau_bo *zeta_bo = nouveau_bo(nv20->zeta);
+		BEGIN_RING(chan, kelvin, NV20TCL_DMA_ZETA, 1);
+		OUT_RELOCo(chan, zeta_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		BEGIN_RING(chan, kelvin, NV20TCL_ZETA_OFFSET, 1);
+		OUT_RELOCl(chan, zeta_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 		/* XXX for when we allocate LMA on nv17 */
-/*		BEGIN_RING(kelvin, NV10TCL_LMA_DEPTH_BUFFER_OFFSET, 1);
-		OUT_RELOCl(nv20->zeta + lma_offset);*/
+/*		BEGIN_RING(chan, kelvin, NV10TCL_LMA_DEPTH_BUFFER_OFFSET, 1);
+		OUT_RELOCl(chan, nouveau_bo(nv20->zeta + lma_offset));*/
 	}
 
 	/* Vertex buffer */
-	BEGIN_RING(kelvin, NV20TCL_DMA_VTXBUF0, 1);
-	OUT_RELOCo(nv20->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-	BEGIN_RING(kelvin, NV20TCL_COLOR_OFFSET, 1);
-	OUT_RELOCl(nv20->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(chan, kelvin, NV20TCL_DMA_VTXBUF0, 1);
+	OUT_RELOCo(chan, rt_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(chan, kelvin, NV20TCL_COLOR_OFFSET, 1);
+	OUT_RELOCl(chan, rt_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 
 	/* Texture images */
 	for (i = 0; i < 2; i++) {
 		if (!(nv20->fp_samplers & (1 << i)))
 			continue;
-		BEGIN_RING(kelvin, NV20TCL_TX_OFFSET(i), 1);
-		OUT_RELOCl(nv20->tex[i].buffer, 0, NOUVEAU_BO_VRAM |
+		struct nouveau_bo *bo = nouveau_bo(nv20->tex[i].buffer);
+		BEGIN_RING(chan, kelvin, NV20TCL_TX_OFFSET(i), 1);
+		OUT_RELOCl(chan, bo, 0, NOUVEAU_BO_VRAM |
 			   NOUVEAU_BO_GART | NOUVEAU_BO_RD);
-		BEGIN_RING(kelvin, NV20TCL_TX_FORMAT(i), 1);
-		OUT_RELOCd(nv20->tex[i].buffer, nv20->tex[i].format,
+		BEGIN_RING(chan, kelvin, NV20TCL_TX_FORMAT(i), 1);
+		OUT_RELOCd(chan, bo, nv20->tex[i].format,
 			   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
 			   NOUVEAU_BO_OR, NV20TCL_TX_FORMAT_DMA0,
 			   NV20TCL_TX_FORMAT_DMA1);
diff --git a/src/gallium/drivers/nv20/nv20_vbo.c b/src/gallium/drivers/nv20/nv20_vbo.c
index 4bf461eba9..52991a0d85 100644
--- a/src/gallium/drivers/nv20/nv20_vbo.c
+++ b/src/gallium/drivers/nv20/nv20_vbo.c
@@ -9,7 +9,7 @@
 #include "nouveau/nouveau_channel.h"
 #include "nouveau/nouveau_pushbuf.h"
 
-boolean nv20_draw_elements( struct pipe_context *pipe,
+void nv20_draw_elements( struct pipe_context *pipe,
                     struct pipe_buffer *indexBuffer,
                     unsigned indexSize,
                     unsigned prim, unsigned start, unsigned count)
@@ -67,13 +67,12 @@ boolean nv20_draw_elements( struct pipe_context *pipe,
 	}
 
 	draw_flush(nv20->draw);
-	return TRUE;
 }
 
-boolean nv20_draw_arrays( struct pipe_context *pipe,
+void nv20_draw_arrays( struct pipe_context *pipe,
 				 unsigned prim, unsigned start, unsigned count)
 {
-	return nv20_draw_elements(pipe, NULL, 0, prim, start, count);
+	nv20_draw_elements(pipe, NULL, 0, prim, start, count);
 }
 
 
diff --git a/src/gallium/drivers/nv30/nv30_context.c b/src/gallium/drivers/nv30/nv30_context.c
index 38b39159f1..54572e9ab3 100644
--- a/src/gallium/drivers/nv30/nv30_context.c
+++ b/src/gallium/drivers/nv30/nv30_context.c
@@ -10,15 +10,20 @@ nv30_flush(struct pipe_context *pipe, unsigned flags,
 	   struct pipe_fence_handle **fence)
 {
 	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nv30_screen *screen = nv30->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *rankine = screen->rankine;
 
 	if (flags & PIPE_FLUSH_TEXTURE_CACHE) {
-		BEGIN_RING(rankine, 0x1fd8, 1);
-		OUT_RING  (2);
-		BEGIN_RING(rankine, 0x1fd8, 1);
-		OUT_RING  (1);
+		BEGIN_RING(chan, rankine, 0x1fd8, 1);
+		OUT_RING  (chan, 2);
+		BEGIN_RING(chan, rankine, 0x1fd8, 1);
+		OUT_RING  (chan, 1);
 	}
 
-	FIRE_RING(fence);
+	FIRE_RING(chan);
+	if (fence)
+		*fence = NULL;
 }
 
 static void
diff --git a/src/gallium/drivers/nv30/nv30_context.h b/src/gallium/drivers/nv30/nv30_context.h
index 864ddaeb59..e59449287b 100644
--- a/src/gallium/drivers/nv30/nv30_context.h
+++ b/src/gallium/drivers/nv30/nv30_context.h
@@ -14,10 +14,6 @@
 #include "nouveau/nouveau_winsys.h"
 #include "nouveau/nouveau_gldefs.h"
 #include "nouveau/nouveau_context.h"
-
-#define NOUVEAU_PUSH_CONTEXT(ctx)                                              \
-	struct nv30_screen *ctx = nv30->screen
-#include "nouveau/nouveau_push.h"
 #include "nouveau/nouveau_stateobj.h"
 
 #include "nv30_state.h"
@@ -198,9 +194,9 @@ extern struct nv30_state_entry nv30_state_fragtex;
 extern struct nv30_state_entry nv30_state_vbo;
 
 /* nv30_vbo.c */
-extern boolean nv30_draw_arrays(struct pipe_context *, unsigned mode,
+extern void nv30_draw_arrays(struct pipe_context *, unsigned mode,
 				unsigned start, unsigned count);
-extern boolean nv30_draw_elements(struct pipe_context *pipe,
+extern void nv30_draw_elements(struct pipe_context *pipe,
 				  struct pipe_buffer *indexBuffer,
 				  unsigned indexSize,
 				  unsigned mode, unsigned start,
diff --git a/src/gallium/drivers/nv30/nv30_fragprog.c b/src/gallium/drivers/nv30/nv30_fragprog.c
index d1ff18e2df..2d565cb631 100644
--- a/src/gallium/drivers/nv30/nv30_fragprog.c
+++ b/src/gallium/drivers/nv30/nv30_fragprog.c
@@ -837,7 +837,7 @@ nv30_fragprog_validate(struct nv30_context *nv30)
 	fp->buffer = pscreen->buffer_create(pscreen, 0x100, 0, fp->insn_len * 4);
 	nv30_fragprog_upload(nv30, fp);
 
-	so = so_new(8, 1);
+	so = so_new(4, 4, 1);
 	so_method(so, nv30->screen->rankine, NV34TCL_FP_ACTIVE_PROGRAM, 1);
 	so_reloc (so, nouveau_bo(fp->buffer), 0, NOUVEAU_BO_VRAM |
 		      NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW |
diff --git a/src/gallium/drivers/nv30/nv30_fragtex.c b/src/gallium/drivers/nv30/nv30_fragtex.c
index b3293ee700..9893567891 100644
--- a/src/gallium/drivers/nv30/nv30_fragtex.c
+++ b/src/gallium/drivers/nv30/nv30_fragtex.c
@@ -106,7 +106,7 @@ nv30_fragtex_build(struct nv30_context *nv30, int unit)
 
 	txs = tf->swizzle;
 
-	so = so_new(16, 2);
+	so = so_new(1, 8, 2);
 	so_method(so, nv30->screen->rankine, NV34TCL_TX_OFFSET(unit), 8);
 	so_reloc (so, bo, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0);
 	so_reloc (so, bo, txf, tex_flags | NOUVEAU_BO_OR,
@@ -135,7 +135,7 @@ nv30_fragtex_validate(struct nv30_context *nv30)
 		unit = ffs(samplers) - 1;
 		samplers &= ~(1 << unit);
 
-		so = so_new(2, 0);
+		so = so_new(1, 1, 0);
 		so_method(so, nv30->screen->rankine, NV34TCL_TX_ENABLE(unit), 1);
 		so_data  (so, 0);
 		so_ref(so, &nv30->state.hw[NV30_STATE_FRAGTEX0 + unit]);
diff --git a/src/gallium/drivers/nv30/nv30_query.c b/src/gallium/drivers/nv30/nv30_query.c
index 1d1c8a484e..e27e9ccbf6 100644
--- a/src/gallium/drivers/nv30/nv30_query.c
+++ b/src/gallium/drivers/nv30/nv30_query.c
@@ -41,6 +41,9 @@ nv30_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
 {
 	struct nv30_context *nv30 = nv30_context(pipe);
 	struct nv30_query *q = nv30_query(pq);
+	struct nv30_screen *screen = nv30->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *rankine = screen->rankine;
 
 	assert(q->type == PIPE_QUERY_OCCLUSION_COUNTER);
 
@@ -57,10 +60,10 @@ nv30_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
 		assert(0);
 	nouveau_notifier_reset(nv30->screen->query, q->object->start);
 
-	BEGIN_RING(rankine, NV34TCL_QUERY_RESET, 1);
-	OUT_RING  (1);
-	BEGIN_RING(rankine, NV34TCL_QUERY_UNK17CC, 1);
-	OUT_RING  (1);
+	BEGIN_RING(chan, rankine, NV34TCL_QUERY_RESET, 1);
+	OUT_RING  (chan, 1);
+	BEGIN_RING(chan, rankine, NV34TCL_QUERY_UNK17CC, 1);
+	OUT_RING  (chan, 1);
 
 	q->ready = FALSE;
 }
@@ -69,12 +72,15 @@ static void
 nv30_query_end(struct pipe_context *pipe, struct pipe_query *pq)
 {
 	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nv30_screen *screen = nv30->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *rankine = screen->rankine;
 	struct nv30_query *q = nv30_query(pq);
 
-	BEGIN_RING(rankine, NV34TCL_QUERY_GET, 1);
-	OUT_RING  ((0x01 << NV34TCL_QUERY_GET_UNK24_SHIFT) |
+	BEGIN_RING(chan, rankine, NV34TCL_QUERY_GET, 1);
+	OUT_RING  (chan, (0x01 << NV34TCL_QUERY_GET_UNK24_SHIFT) |
 		   ((q->object->start * 32) << NV34TCL_QUERY_GET_OFFSET_SHIFT));
-	FIRE_RING(NULL);
+	FIRE_RING(chan);
 }
 
 static boolean
diff --git a/src/gallium/drivers/nv30/nv30_screen.c b/src/gallium/drivers/nv30/nv30_screen.c
index 760467f736..9ed48178dc 100644
--- a/src/gallium/drivers/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nv30/nv30_screen.c
@@ -233,7 +233,6 @@ nv30_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
 		return FALSE;
 	}
-	BIND_RING(chan, screen->rankine, 7);
 
 	/* 2D engine setup */
 	screen->eng2d = nv04_surface_2d_init(&screen->base);
@@ -270,7 +269,7 @@ nv30_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	}
 
 	/* Static rankine initialisation */
-	so = so_new(128, 0);
+	so = so_new(36, 60, 0);
 	so_method(so, screen->rankine, NV34TCL_DMA_NOTIFY, 1);
 	so_data  (so, screen->sync->handle);
 	so_method(so, screen->rankine, NV34TCL_DMA_TEXTURE0, 2);
diff --git a/src/gallium/drivers/nv30/nv30_state.c b/src/gallium/drivers/nv30/nv30_state.c
index e6321b480f..a80dfb0488 100644
--- a/src/gallium/drivers/nv30/nv30_state.c
+++ b/src/gallium/drivers/nv30/nv30_state.c
@@ -14,7 +14,7 @@ nv30_blend_state_create(struct pipe_context *pipe,
 	struct nv30_context *nv30 = nv30_context(pipe);
 	struct nouveau_grobj *rankine = nv30->screen->rankine;
 	struct nv30_blend_state *bso = CALLOC(1, sizeof(*bso));
-	struct nouveau_stateobj *so = so_new(16, 0);
+	struct nouveau_stateobj *so = so_new(5, 8, 0);
 
 	if (cso->blend_enable) {
 		so_method(so, rankine, NV34TCL_BLEND_FUNC_ENABLE, 3);
@@ -300,7 +300,7 @@ nv30_rasterizer_state_create(struct pipe_context *pipe,
 {
 	struct nv30_context *nv30 = nv30_context(pipe);
 	struct nv30_rasterizer_state *rsso = CALLOC(1, sizeof(*rsso));
-	struct nouveau_stateobj *so = so_new(32, 0);
+	struct nouveau_stateobj *so = so_new(9, 19, 0);
 	struct nouveau_grobj *rankine = nv30->screen->rankine;
 
 	/*XXX: ignored:
@@ -435,7 +435,7 @@ nv30_depth_stencil_alpha_state_create(struct pipe_context *pipe,
 {
 	struct nv30_context *nv30 = nv30_context(pipe);
 	struct nv30_zsa_state *zsaso = CALLOC(1, sizeof(*zsaso));
-	struct nouveau_stateobj *so = so_new(32, 0);
+	struct nouveau_stateobj *so = so_new(5, 21, 0);
 	struct nouveau_grobj *rankine = nv30->screen->rankine;
 
 	so_method(so, rankine, NV34TCL_DEPTH_FUNC, 3);
diff --git a/src/gallium/drivers/nv30/nv30_state_blend.c b/src/gallium/drivers/nv30/nv30_state_blend.c
index 64cf9ae93a..c36d58c040 100644
--- a/src/gallium/drivers/nv30/nv30_state_blend.c
+++ b/src/gallium/drivers/nv30/nv30_state_blend.c
@@ -18,7 +18,7 @@ struct nv30_state_entry nv30_state_blend = {
 static boolean
 nv30_state_blend_colour_validate(struct nv30_context *nv30)
 {
-	struct nouveau_stateobj *so = so_new(2, 0);
+	struct nouveau_stateobj *so = so_new(1, 1, 0);
 	struct pipe_blend_color *bcol = &nv30->blend_colour;
 
 	so_method(so, nv30->screen->rankine, NV34TCL_BLEND_COLOR, 1);
diff --git a/src/gallium/drivers/nv30/nv30_state_fb.c b/src/gallium/drivers/nv30/nv30_state_fb.c
index 6f6d1740d6..2ed2ea55e8 100644
--- a/src/gallium/drivers/nv30/nv30_state_fb.c
+++ b/src/gallium/drivers/nv30/nv30_state_fb.c
@@ -10,7 +10,7 @@ nv30_state_framebuffer_validate(struct nv30_context *nv30)
 	struct nv04_surface *rt[2], *zeta = NULL;
 	uint32_t rt_enable = 0, rt_format = 0;
 	int i, colour_format = 0, zeta_format = 0, depth_only = 0;
-	struct nouveau_stateobj *so = so_new(64, 10);
+	struct nouveau_stateobj *so = so_new(12, 18, 10);
 	unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM;
 	unsigned w = fb->width;
 	unsigned h = fb->height;
diff --git a/src/gallium/drivers/nv30/nv30_state_scissor.c b/src/gallium/drivers/nv30/nv30_state_scissor.c
index 3ac7a8471e..ba61a9e24a 100644
--- a/src/gallium/drivers/nv30/nv30_state_scissor.c
+++ b/src/gallium/drivers/nv30/nv30_state_scissor.c
@@ -12,7 +12,7 @@ nv30_state_scissor_validate(struct nv30_context *nv30)
 		return FALSE;
 	nv30->state.scissor_enabled = rast->scissor;
 
-	so = so_new(3, 0);
+	so = so_new(1, 2, 0);
 	so_method(so, nv30->screen->rankine, NV34TCL_SCISSOR_HORIZ, 2);
 	if (nv30->state.scissor_enabled) {
 		so_data  (so, ((s->maxx - s->minx) << 16) | s->minx);
diff --git a/src/gallium/drivers/nv30/nv30_state_stipple.c b/src/gallium/drivers/nv30/nv30_state_stipple.c
index d0c791ac08..ed520a4f43 100644
--- a/src/gallium/drivers/nv30/nv30_state_stipple.c
+++ b/src/gallium/drivers/nv30/nv30_state_stipple.c
@@ -14,14 +14,14 @@ nv30_state_stipple_validate(struct nv30_context *nv30)
 	if (rast->poly_stipple_enable) {
 		unsigned i;
 
-		so = so_new(35, 0);
+		so = so_new(2, 33, 0);
 		so_method(so, rankine, NV34TCL_POLYGON_STIPPLE_ENABLE, 1);
 		so_data  (so, 1);
 		so_method(so, rankine, NV34TCL_POLYGON_STIPPLE_PATTERN(0), 32);
 		for (i = 0; i < 32; i++)
 			so_data(so, nv30->stipple[i]);
 	} else {
-		so = so_new(2, 0);
+		so = so_new(1, 1, 0);
 		so_method(so, rankine, NV34TCL_POLYGON_STIPPLE_ENABLE, 1);
 		so_data  (so, 0);
 	}
diff --git a/src/gallium/drivers/nv30/nv30_state_viewport.c b/src/gallium/drivers/nv30/nv30_state_viewport.c
index c3eb413dac..2d7781292b 100644
--- a/src/gallium/drivers/nv30/nv30_state_viewport.c
+++ b/src/gallium/drivers/nv30/nv30_state_viewport.c
@@ -19,7 +19,7 @@ nv30_state_viewport_validate(struct nv30_context *nv30)
 		return FALSE;
 	nv30->state.viewport_bypass = bypass;
 
-	so = so_new(11, 0);
+	so = so_new(3, 10, 0);
 	if (!bypass) {
 		so_method(so, nv30->screen->rankine,
 			  NV34TCL_VIEWPORT_TRANSLATE_X, 8);
diff --git a/src/gallium/drivers/nv30/nv30_vbo.c b/src/gallium/drivers/nv30/nv30_vbo.c
index e32b8141af..1c5db03ea2 100644
--- a/src/gallium/drivers/nv30/nv30_vbo.c
+++ b/src/gallium/drivers/nv30/nv30_vbo.c
@@ -163,19 +163,21 @@ nv30_vbo_static_attrib(struct nv30_context *nv30, struct nouveau_stateobj *so,
 	return TRUE;
 }
 
-boolean
+void
 nv30_draw_arrays(struct pipe_context *pipe,
 		 unsigned mode, unsigned start, unsigned count)
 {
 	struct nv30_context *nv30 = nv30_context(pipe);
-	struct nouveau_channel *chan = nv30->screen->base.channel;
+	struct nv30_screen *screen = nv30->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *rankine = screen->rankine;
 	unsigned restart = 0;
 
 	nv30_vbo_set_idxbuf(nv30, NULL, 0);
 	if (FORCE_SWTNL || !nv30_state_validate(nv30)) {
 		/*return nv30_draw_elements_swtnl(pipe, NULL, 0,
 						mode, start, count);*/
-		return FALSE;
+		return;
 	}
 
 	while (count) {
@@ -186,17 +188,17 @@ nv30_draw_arrays(struct pipe_context *pipe,
 		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256,
 					mode, start, count, &restart);
 		if (!vc) {
-			FIRE_RING(NULL);
+			FIRE_RING(chan);
 			continue;
 		}
 
-		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
-		OUT_RING  (nvgl_primitive(mode));
+		BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, nvgl_primitive(mode));
 
 		nr = (vc & 0xff);
 		if (nr) {
-			BEGIN_RING(rankine, NV34TCL_VB_VERTEX_BATCH, 1);
-			OUT_RING  (((nr - 1) << 24) | start);
+			BEGIN_RING(chan, rankine, NV34TCL_VB_VERTEX_BATCH, 1);
+			OUT_RING  (chan, ((nr - 1) << 24) | start);
 			start += nr;
 		}
 
@@ -206,15 +208,15 @@ nv30_draw_arrays(struct pipe_context *pipe,
 
 			nr -= push;
 
-			BEGIN_RING_NI(rankine, NV34TCL_VB_VERTEX_BATCH, push);
+			BEGIN_RING_NI(chan, rankine, NV34TCL_VB_VERTEX_BATCH, push);
 			while (push--) {
-				OUT_RING(((0x100 - 1) << 24) | start);
+				OUT_RING(chan, ((0x100 - 1) << 24) | start);
 				start += 0x100;
 			}
 		}
 
-		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, 0);
 
 		count -= vc;
 		start = restart;
@@ -228,7 +230,9 @@ static INLINE void
 nv30_draw_elements_u08(struct nv30_context *nv30, void *ib,
 		       unsigned mode, unsigned start, unsigned count)
 {
-	struct nouveau_channel *chan = nv30->screen->base.channel;
+	struct nv30_screen *screen = nv30->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *rankine = screen->rankine;
 
 	while (count) {
 		uint8_t *elts = (uint8_t *)ib + start;
@@ -239,17 +243,17 @@ nv30_draw_elements_u08(struct nv30_context *nv30, void *ib,
 		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2,
 					mode, start, count, &restart);
 		if (vc == 0) {
-			FIRE_RING(NULL);
+			FIRE_RING(chan);
 			continue;
 		}
 		count -= vc;
 
-		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
-		OUT_RING  (nvgl_primitive(mode));
+		BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, nvgl_primitive(mode));
 
 		if (vc & 1) {
-			BEGIN_RING(rankine, NV34TCL_VB_ELEMENT_U32, 1);
-			OUT_RING  (elts[0]);
+			BEGIN_RING(chan, rankine, NV34TCL_VB_ELEMENT_U32, 1);
+			OUT_RING  (chan, elts[0]);
 			elts++; vc--;
 		}
 
@@ -258,16 +262,16 @@ nv30_draw_elements_u08(struct nv30_context *nv30, void *ib,
 
 			push = MIN2(vc, 2047 * 2);
 
-			BEGIN_RING_NI(rankine, NV34TCL_VB_ELEMENT_U16, push >> 1);
+			BEGIN_RING_NI(chan, rankine, NV34TCL_VB_ELEMENT_U16, push >> 1);
 			for (i = 0; i < push; i+=2)
-				OUT_RING((elts[i+1] << 16) | elts[i]);
+				OUT_RING(chan, (elts[i+1] << 16) | elts[i]);
 
 			vc -= push;
 			elts += push;
 		}
 
-		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, 0);
 
 		start = restart;
 	}
@@ -277,7 +281,9 @@ static INLINE void
 nv30_draw_elements_u16(struct nv30_context *nv30, void *ib,
 		       unsigned mode, unsigned start, unsigned count)
 {
-	struct nouveau_channel *chan = nv30->screen->base.channel;
+	struct nv30_screen *screen = nv30->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *rankine = screen->rankine;
 
 	while (count) {
 		uint16_t *elts = (uint16_t *)ib + start;
@@ -288,17 +294,17 @@ nv30_draw_elements_u16(struct nv30_context *nv30, void *ib,
 		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2,
 					mode, start, count, &restart);
 		if (vc == 0) {
-			FIRE_RING(NULL);
+			FIRE_RING(chan);
 			continue;
 		}
 		count -= vc;
 
-		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
-		OUT_RING  (nvgl_primitive(mode));
+		BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, nvgl_primitive(mode));
 
 		if (vc & 1) {
-			BEGIN_RING(rankine, NV34TCL_VB_ELEMENT_U32, 1);
-			OUT_RING  (elts[0]);
+			BEGIN_RING(chan, rankine, NV34TCL_VB_ELEMENT_U32, 1);
+			OUT_RING  (chan, elts[0]);
 			elts++; vc--;
 		}
 
@@ -307,16 +313,16 @@ nv30_draw_elements_u16(struct nv30_context *nv30, void *ib,
 
 			push = MIN2(vc, 2047 * 2);
 
-			BEGIN_RING_NI(rankine, NV34TCL_VB_ELEMENT_U16, push >> 1);
+			BEGIN_RING_NI(chan, rankine, NV34TCL_VB_ELEMENT_U16, push >> 1);
 			for (i = 0; i < push; i+=2)
-				OUT_RING((elts[i+1] << 16) | elts[i]);
+				OUT_RING(chan, (elts[i+1] << 16) | elts[i]);
 
 			vc -= push;
 			elts += push;
 		}
 
-		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, 0);
 
 		start = restart;
 	}
@@ -326,7 +332,9 @@ static INLINE void
 nv30_draw_elements_u32(struct nv30_context *nv30, void *ib,
 		       unsigned mode, unsigned start, unsigned count)
 {
-	struct nouveau_channel *chan = nv30->screen->base.channel;
+	struct nv30_screen *screen = nv30->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *rankine = screen->rankine;
 
 	while (count) {
 		uint32_t *elts = (uint32_t *)ib + start;
@@ -337,32 +345,32 @@ nv30_draw_elements_u32(struct nv30_context *nv30, void *ib,
 		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 5, 1,
 					mode, start, count, &restart);
 		if (vc == 0) {
-			FIRE_RING(NULL);
+			FIRE_RING(chan);
 			continue;
 		}
 		count -= vc;
 
-		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
-		OUT_RING  (nvgl_primitive(mode));
+		BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, nvgl_primitive(mode));
 
 		while (vc) {
 			push = MIN2(vc, 2047);
 
-			BEGIN_RING_NI(rankine, NV34TCL_VB_ELEMENT_U32, push);
-			OUT_RINGp    (elts, push);
+			BEGIN_RING_NI(chan, rankine, NV34TCL_VB_ELEMENT_U32, push);
+			OUT_RINGp    (chan, elts, push);
 
 			vc -= push;
 			elts += push;
 		}
 
-		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, 0);
 
 		start = restart;
 	}
 }
 
-static boolean
+static void
 nv30_draw_elements_inline(struct pipe_context *pipe,
 			  struct pipe_buffer *ib, unsigned ib_size,
 			  unsigned mode, unsigned start, unsigned count)
@@ -393,15 +401,16 @@ nv30_draw_elements_inline(struct pipe_context *pipe,
 	}
 
 	pipe_buffer_unmap(pscreen, ib);
-	return TRUE;
 }
 
-static boolean
+static void
 nv30_draw_elements_vbo(struct pipe_context *pipe,
 		       unsigned mode, unsigned start, unsigned count)
 {
 	struct nv30_context *nv30 = nv30_context(pipe);
-	struct nouveau_channel *chan = nv30->screen->base.channel;
+	struct nv30_screen *screen = nv30->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *rankine = screen->rankine;
 	unsigned restart = 0;
 
 	while (count) {
@@ -412,17 +421,17 @@ nv30_draw_elements_vbo(struct pipe_context *pipe,
 		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256,
 					mode, start, count, &restart);
 		if (!vc) {
-			FIRE_RING(NULL);
+			FIRE_RING(chan);
 			continue;
 		}
 		
-		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
-		OUT_RING  (nvgl_primitive(mode));
+		BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, nvgl_primitive(mode));
 
 		nr = (vc & 0xff);
 		if (nr) {
-			BEGIN_RING(rankine, NV34TCL_VB_INDEX_BATCH, 1);
-			OUT_RING  (((nr - 1) << 24) | start);
+			BEGIN_RING(chan, rankine, NV34TCL_VB_INDEX_BATCH, 1);
+			OUT_RING  (chan, ((nr - 1) << 24) | start);
 			start += nr;
 		}
 
@@ -432,24 +441,22 @@ nv30_draw_elements_vbo(struct pipe_context *pipe,
 
 			nr -= push;
 
-			BEGIN_RING_NI(rankine, NV34TCL_VB_INDEX_BATCH, push);
+			BEGIN_RING_NI(chan, rankine, NV34TCL_VB_INDEX_BATCH, push);
 			while (push--) {
-				OUT_RING(((0x100 - 1) << 24) | start);
+				OUT_RING(chan, ((0x100 - 1) << 24) | start);
 				start += 0x100;
 			}
 		}
 
-		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, 0);
 
 		count -= vc;
 		start = restart;
 	}
-
-	return TRUE;
 }
 
-boolean
+void
 nv30_draw_elements(struct pipe_context *pipe,
 		   struct pipe_buffer *indexBuffer, unsigned indexSize,
 		   unsigned mode, unsigned start, unsigned count)
@@ -461,7 +468,7 @@ nv30_draw_elements(struct pipe_context *pipe,
 	if (FORCE_SWTNL || !nv30_state_validate(nv30)) {
 		/*return nv30_draw_elements_swtnl(pipe, NULL, 0,
 						mode, start, count);*/
-		return FALSE;	
+		return;	
 	}
 
 	if (idxbuf) {
@@ -472,7 +479,6 @@ nv30_draw_elements(struct pipe_context *pipe,
 	}
 
 	pipe->flush(pipe, 0, NULL);
-	return TRUE;
 }
 
 static boolean
@@ -485,9 +491,9 @@ nv30_vbo_validate(struct nv30_context *nv30)
 	unsigned vb_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
 	int hw;
 
-	vtxbuf = so_new(20, 18);
+	vtxbuf = so_new(3, 17, 18);
 	so_method(vtxbuf, rankine, NV34TCL_VTXBUF_ADDRESS(0), nv30->vtxelt_nr);
-	vtxfmt = so_new(17, 0);
+	vtxfmt = so_new(1, 16, 0);
 	so_method(vtxfmt, rankine, NV34TCL_VTXFMT(0), nv30->vtxelt_nr);
 
 	for (hw = 0; hw < nv30->vtxelt_nr; hw++) {
@@ -500,7 +506,7 @@ nv30_vbo_validate(struct nv30_context *nv30)
 
 		if (!vb->stride) {
 			if (!sattr)
-				sattr = so_new(16 * 5, 0);
+				sattr = so_new(16, 16 * 4, 0);
 
 			if (nv30_vbo_static_attrib(nv30, sattr, hw, ve, vb)) {
 				so_data(vtxbuf, 0);
diff --git a/src/gallium/drivers/nv30/nv30_vertprog.c b/src/gallium/drivers/nv30/nv30_vertprog.c
index 5d60984622..e77a5be3f2 100644
--- a/src/gallium/drivers/nv30/nv30_vertprog.c
+++ b/src/gallium/drivers/nv30/nv30_vertprog.c
@@ -650,7 +650,9 @@ static boolean
 nv30_vertprog_validate(struct nv30_context *nv30)
 { 
 	struct pipe_screen *pscreen = nv30->pipe.screen;
-	struct nouveau_grobj *rankine = nv30->screen->rankine;
+	struct nv30_screen *screen = nv30->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *rankine = screen->rankine;
 	struct nv30_vertex_program *vp;
 	struct pipe_buffer *constbuf;
 	boolean upload_code = FALSE, upload_data = FALSE;
@@ -684,7 +686,7 @@ nv30_vertprog_validate(struct nv30_context *nv30)
 				assert(0);
 		}
 
-		so = so_new(2, 0);
+		so = so_new(1, 1, 0);
 		so_method(so, rankine, NV34TCL_VP_START_FROM_ID, 1);
 		so_data  (so, vp->exec->start);
 		so_ref(so, &vp->so);
@@ -770,9 +772,9 @@ nv30_vertprog_validate(struct nv30_context *nv30)
 				       4 * sizeof(float));
 			}
 
-			BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_CONST_ID, 5);
-			OUT_RING  (i + vp->data->start);
-			OUT_RINGp ((uint32_t *)vpd->value, 4);
+			BEGIN_RING(chan, rankine, NV34TCL_VP_UPLOAD_CONST_ID, 5);
+			OUT_RING  (chan, i + vp->data->start);
+			OUT_RINGp (chan, (uint32_t *)vpd->value, 4);
 		}
 
 		if (constbuf)
@@ -788,11 +790,11 @@ nv30_vertprog_validate(struct nv30_context *nv30)
 				vp->insns[i].data[2], vp->insns[i].data[3]);
 		}
 #endif
-		BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_FROM_ID, 1);
-		OUT_RING  (vp->exec->start);
+		BEGIN_RING(chan, rankine, NV34TCL_VP_UPLOAD_FROM_ID, 1);
+		OUT_RING  (chan, vp->exec->start);
 		for (i = 0; i < vp->nr_insns; i++) {
-			BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_INST(0), 4);
-			OUT_RINGp (vp->insns[i].data, 4);
+			BEGIN_RING(chan, rankine, NV34TCL_VP_UPLOAD_INST(0), 4);
+			OUT_RINGp (chan, vp->insns[i].data, 4);
 		}
 	}
 
diff --git a/src/gallium/drivers/nv40/nv40_context.c b/src/gallium/drivers/nv40/nv40_context.c
index d56c7a6b49..f79ae4db84 100644
--- a/src/gallium/drivers/nv40/nv40_context.c
+++ b/src/gallium/drivers/nv40/nv40_context.c
@@ -10,15 +10,20 @@ nv40_flush(struct pipe_context *pipe, unsigned flags,
 	   struct pipe_fence_handle **fence)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nv40_screen *screen = nv40->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *curie = screen->curie;
 
 	if (flags & PIPE_FLUSH_TEXTURE_CACHE) {
-		BEGIN_RING(curie, 0x1fd8, 1);
-		OUT_RING  (2);
-		BEGIN_RING(curie, 0x1fd8, 1);
-		OUT_RING  (1);
+		BEGIN_RING(chan, curie, 0x1fd8, 1);
+		OUT_RING  (chan, 2);
+		BEGIN_RING(chan, curie, 0x1fd8, 1);
+		OUT_RING  (chan, 1);
 	}
 
-	FIRE_RING(fence);
+	FIRE_RING(chan);
+	if (fence)
+		*fence = NULL;
 }
 
 static void
diff --git a/src/gallium/drivers/nv40/nv40_context.h b/src/gallium/drivers/nv40/nv40_context.h
index 83fcf1785d..e219bb537a 100644
--- a/src/gallium/drivers/nv40/nv40_context.h
+++ b/src/gallium/drivers/nv40/nv40_context.h
@@ -14,10 +14,6 @@
 #include "nouveau/nouveau_winsys.h"
 #include "nouveau/nouveau_gldefs.h"
 #include "nouveau/nouveau_context.h"
-
-#define NOUVEAU_PUSH_CONTEXT(ctx)                                              \
-	struct nv40_screen *ctx = nv40->screen
-#include "nouveau/nouveau_push.h"
 #include "nouveau/nouveau_stateobj.h"
 
 #include "nv40_state.h"
@@ -183,7 +179,7 @@ extern void nv40_screen_init_miptree_functions(struct pipe_screen *pscreen);
 
 /* nv40_draw.c */
 extern struct draw_stage *nv40_draw_render_stage(struct nv40_context *nv40);
-extern boolean nv40_draw_elements_swtnl(struct pipe_context *pipe,
+extern void nv40_draw_elements_swtnl(struct pipe_context *pipe,
 					struct pipe_buffer *idxbuf,
 					unsigned ib_size, unsigned mode,
 					unsigned start, unsigned count);
@@ -219,9 +215,9 @@ extern struct nv40_state_entry nv40_state_vbo;
 extern struct nv40_state_entry nv40_state_vtxfmt;
 
 /* nv40_vbo.c */
-extern boolean nv40_draw_arrays(struct pipe_context *, unsigned mode,
+extern void nv40_draw_arrays(struct pipe_context *, unsigned mode,
 				unsigned start, unsigned count);
-extern boolean nv40_draw_elements(struct pipe_context *pipe,
+extern void nv40_draw_elements(struct pipe_context *pipe,
 				  struct pipe_buffer *indexBuffer,
 				  unsigned indexSize,
 				  unsigned mode, unsigned start,
diff --git a/src/gallium/drivers/nv40/nv40_draw.c b/src/gallium/drivers/nv40/nv40_draw.c
index 3875bc3545..d826f8c2f5 100644
--- a/src/gallium/drivers/nv40/nv40_draw.c
+++ b/src/gallium/drivers/nv40/nv40_draw.c
@@ -31,6 +31,9 @@ nv40_render_stage(struct draw_stage *stage)
 static INLINE void
 nv40_render_vertex(struct nv40_context *nv40, const struct vertex_header *v)
 {
+	struct nv40_screen *screen = nv40->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *curie = screen->curie;
 	unsigned i;
 
 	for (i = 0; i < nv40->swtnl.nr_attribs; i++) {
@@ -41,30 +44,30 @@ nv40_render_vertex(struct nv40_context *nv40, const struct vertex_header *v)
 		case EMIT_OMIT:
 			break;
 		case EMIT_1F:
-			BEGIN_RING(curie, NV40TCL_VTX_ATTR_1F(hw), 1);
-			OUT_RING  (fui(v->data[idx][0]));
+			BEGIN_RING(chan, curie, NV40TCL_VTX_ATTR_1F(hw), 1);
+			OUT_RING  (chan, fui(v->data[idx][0]));
 			break;
 		case EMIT_2F:
-			BEGIN_RING(curie, NV40TCL_VTX_ATTR_2F_X(hw), 2);
-			OUT_RING  (fui(v->data[idx][0]));
-			OUT_RING  (fui(v->data[idx][1]));
+			BEGIN_RING(chan, curie, NV40TCL_VTX_ATTR_2F_X(hw), 2);
+			OUT_RING  (chan, fui(v->data[idx][0]));
+			OUT_RING  (chan, fui(v->data[idx][1]));
 			break;
 		case EMIT_3F:
-			BEGIN_RING(curie, NV40TCL_VTX_ATTR_3F_X(hw), 3);
-			OUT_RING  (fui(v->data[idx][0]));
-			OUT_RING  (fui(v->data[idx][1]));
-			OUT_RING  (fui(v->data[idx][2]));
+			BEGIN_RING(chan, curie, NV40TCL_VTX_ATTR_3F_X(hw), 3);
+			OUT_RING  (chan, fui(v->data[idx][0]));
+			OUT_RING  (chan, fui(v->data[idx][1]));
+			OUT_RING  (chan, fui(v->data[idx][2]));
 			break;
 		case EMIT_4F:
-			BEGIN_RING(curie, NV40TCL_VTX_ATTR_4F_X(hw), 4);
-			OUT_RING  (fui(v->data[idx][0]));
-			OUT_RING  (fui(v->data[idx][1]));
-			OUT_RING  (fui(v->data[idx][2]));
-			OUT_RING  (fui(v->data[idx][3]));
+			BEGIN_RING(chan, curie, NV40TCL_VTX_ATTR_4F_X(hw), 4);
+			OUT_RING  (chan, fui(v->data[idx][0]));
+			OUT_RING  (chan, fui(v->data[idx][1]));
+			OUT_RING  (chan, fui(v->data[idx][2]));
+			OUT_RING  (chan, fui(v->data[idx][3]));
 			break;
 		case EMIT_4UB:
-			BEGIN_RING(curie, NV40TCL_VTX_ATTR_4UB(hw), 1);
-			OUT_RING  (pack_ub4(float_to_ubyte(v->data[idx][0]),
+			BEGIN_RING(chan, curie, NV40TCL_VTX_ATTR_4UB(hw), 1);
+			OUT_RING  (chan, pack_ub4(float_to_ubyte(v->data[idx][0]),
 					    float_to_ubyte(v->data[idx][1]),
 					    float_to_ubyte(v->data[idx][2]),
 					    float_to_ubyte(v->data[idx][3])));
@@ -82,7 +85,11 @@ nv40_render_prim(struct draw_stage *stage, struct prim_header *prim,
 {
 	struct nv40_render_stage *rs = nv40_render_stage(stage);
 	struct nv40_context *nv40 = rs->nv40;
-	struct nouveau_pushbuf *pb = nv40->screen->base.channel->pushbuf;
+
+	struct nv40_screen *screen = nv40->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_pushbuf *pb = chan->pushbuf;
+	struct nouveau_grobj *curie = screen->curie;
 	unsigned i;
 
 	/* Ensure there's room for 4xfloat32 + potentially 3 begin/end */
@@ -91,19 +98,19 @@ nv40_render_prim(struct draw_stage *stage, struct prim_header *prim,
 			NOUVEAU_ERR("AIII, missed flush\n");
 			assert(0);
 		}
-		FIRE_RING(NULL);
+		FIRE_RING(chan);
 		nv40_state_emit(nv40);
 	}
 
 	/* Switch primitive modes if necessary */
 	if (rs->prim != mode) {
 		if (rs->prim != NV40TCL_BEGIN_END_STOP) {
-			BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-			OUT_RING  (NV40TCL_BEGIN_END_STOP);	
+			BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+			OUT_RING  (chan, NV40TCL_BEGIN_END_STOP);
 		}
 
-		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-		OUT_RING  (mode);
+		BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (chan, mode);
 		rs->prim = mode;
 	}
 
@@ -115,8 +122,8 @@ nv40_render_prim(struct draw_stage *stage, struct prim_header *prim,
 	 * off the primitive now.
 	 */
 	if (pb->remaining < ((count * 20) + 6)) {
-		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-		OUT_RING  (NV40TCL_BEGIN_END_STOP);
+		BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (chan, NV40TCL_BEGIN_END_STOP);
 		rs->prim = NV40TCL_BEGIN_END_STOP;
 	}
 }
@@ -144,10 +151,13 @@ nv40_render_flush(struct draw_stage *draw, unsigned flags)
 {
 	struct nv40_render_stage *rs = nv40_render_stage(draw);
 	struct nv40_context *nv40 = rs->nv40;
+	struct nv40_screen *screen = nv40->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *curie = screen->curie;
 
 	if (rs->prim != NV40TCL_BEGIN_END_STOP) {
-		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-		OUT_RING  (NV40TCL_BEGIN_END_STOP);
+		BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (chan, NV40TCL_BEGIN_END_STOP);
 		rs->prim = NV40TCL_BEGIN_END_STOP;
 	}
 }
@@ -226,7 +236,7 @@ nv40_draw_render_stage(struct nv40_context *nv40)
 	return &render->stage;
 }
 
-boolean
+void
 nv40_draw_elements_swtnl(struct pipe_context *pipe,
 			 struct pipe_buffer *idxbuf, unsigned idxbuf_size,
 			 unsigned mode, unsigned start, unsigned count)
@@ -237,7 +247,7 @@ nv40_draw_elements_swtnl(struct pipe_context *pipe,
 	void *map;
 
 	if (!nv40_state_validate_swtnl(nv40))
-		return FALSE;
+		return;
 	nv40->state.dirty &= ~(1ULL << NV40_STATE_VTXBUF);
 	nv40_state_emit(nv40);
 
@@ -278,8 +288,6 @@ nv40_draw_elements_swtnl(struct pipe_context *pipe,
 
 	draw_flush(nv40->draw);
 	pipe->flush(pipe, 0, NULL);
-
-	return TRUE;
 }
 
 static INLINE void
diff --git a/src/gallium/drivers/nv40/nv40_fragprog.c b/src/gallium/drivers/nv40/nv40_fragprog.c
index bb9c85cc43..1237066c39 100644
--- a/src/gallium/drivers/nv40/nv40_fragprog.c
+++ b/src/gallium/drivers/nv40/nv40_fragprog.c
@@ -919,7 +919,7 @@ nv40_fragprog_validate(struct nv40_context *nv40)
 	fp->buffer = pscreen->buffer_create(pscreen, 0x100, 0, fp->insn_len * 4);
 	nv40_fragprog_upload(nv40, fp);
 
-	so = so_new(4, 1);
+	so = so_new(2, 2, 1);
 	so_method(so, nv40->screen->curie, NV40TCL_FP_ADDRESS, 1);
 	so_reloc (so, nouveau_bo(fp->buffer), 0, NOUVEAU_BO_VRAM |
 		      NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW |
diff --git a/src/gallium/drivers/nv40/nv40_fragtex.c b/src/gallium/drivers/nv40/nv40_fragtex.c
index 44abc84596..aad9198210 100644
--- a/src/gallium/drivers/nv40/nv40_fragtex.c
+++ b/src/gallium/drivers/nv40/nv40_fragtex.c
@@ -108,7 +108,7 @@ nv40_fragtex_build(struct nv40_context *nv40, int unit)
 
 	txs = tf->swizzle;
 
-	so = so_new(16, 2);
+	so = so_new(2, 9, 2);
 	so_method(so, nv40->screen->curie, NV40TCL_TEX_OFFSET(unit), 8);
 	so_reloc (so, bo, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0);
 	so_reloc (so, bo, txf, tex_flags | NOUVEAU_BO_OR,
@@ -139,7 +139,7 @@ nv40_fragtex_validate(struct nv40_context *nv40)
 		unit = ffs(samplers) - 1;
 		samplers &= ~(1 << unit);
 
-		so = so_new(2, 0);
+		so = so_new(1, 1, 0);
 		so_method(so, nv40->screen->curie, NV40TCL_TEX_ENABLE(unit), 1);
 		so_data  (so, 0);
 		so_ref(so, &nv40->state.hw[NV40_STATE_FRAGTEX0 + unit]);
diff --git a/src/gallium/drivers/nv40/nv40_query.c b/src/gallium/drivers/nv40/nv40_query.c
index 7874aedd42..8ed4a67dd0 100644
--- a/src/gallium/drivers/nv40/nv40_query.c
+++ b/src/gallium/drivers/nv40/nv40_query.c
@@ -41,6 +41,9 @@ nv40_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
 	struct nv40_query *q = nv40_query(pq);
+	struct nv40_screen *screen = nv40->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *curie = screen->curie;
 
 	assert(q->type == PIPE_QUERY_OCCLUSION_COUNTER);
 
@@ -57,10 +60,10 @@ nv40_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
 		assert(0);
 	nouveau_notifier_reset(nv40->screen->query, q->object->start);
 
-	BEGIN_RING(curie, NV40TCL_QUERY_RESET, 1);
-	OUT_RING  (1);
-	BEGIN_RING(curie, NV40TCL_QUERY_UNK17CC, 1);
-	OUT_RING  (1);
+	BEGIN_RING(chan, curie, NV40TCL_QUERY_RESET, 1);
+	OUT_RING  (chan, 1);
+	BEGIN_RING(chan, curie, NV40TCL_QUERY_UNK17CC, 1);
+	OUT_RING  (chan, 1);
 
 	q->ready = FALSE;
 }
@@ -70,11 +73,14 @@ nv40_query_end(struct pipe_context *pipe, struct pipe_query *pq)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
 	struct nv40_query *q = nv40_query(pq);
+	struct nv40_screen *screen = nv40->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *curie = screen->curie;
 
-	BEGIN_RING(curie, NV40TCL_QUERY_GET, 1);
-	OUT_RING  ((0x01 << NV40TCL_QUERY_GET_UNK24_SHIFT) |
+	BEGIN_RING(chan, curie, NV40TCL_QUERY_GET, 1);
+	OUT_RING  (chan, (0x01 << NV40TCL_QUERY_GET_UNK24_SHIFT) |
 		   ((q->object->start * 32) << NV40TCL_QUERY_GET_OFFSET_SHIFT));
-	FIRE_RING(NULL);
+	FIRE_RING(chan);
 }
 
 static boolean
diff --git a/src/gallium/drivers/nv40/nv40_screen.c b/src/gallium/drivers/nv40/nv40_screen.c
index d01e712805..9e55e5a089 100644
--- a/src/gallium/drivers/nv40/nv40_screen.c
+++ b/src/gallium/drivers/nv40/nv40_screen.c
@@ -215,7 +215,6 @@ nv40_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
 		return FALSE;
 	}
-	BIND_RING(chan, screen->curie, 7);
 
 	/* 2D engine setup */
 	screen->eng2d = nv04_surface_2d_init(&screen->base);
@@ -252,7 +251,7 @@ nv40_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	}
 
 	/* Static curie initialisation */
-	so = so_new(128, 0);
+	so = so_new(16, 25, 0);
 	so_method(so, screen->curie, NV40TCL_DMA_NOTIFY, 1);
 	so_data  (so, screen->sync->handle);
 	so_method(so, screen->curie, NV40TCL_DMA_TEXTURE0, 2);
diff --git a/src/gallium/drivers/nv40/nv40_state.c b/src/gallium/drivers/nv40/nv40_state.c
index ed55d29aff..ed0ca9e02c 100644
--- a/src/gallium/drivers/nv40/nv40_state.c
+++ b/src/gallium/drivers/nv40/nv40_state.c
@@ -16,7 +16,7 @@ nv40_blend_state_create(struct pipe_context *pipe,
 	struct nv40_context *nv40 = nv40_context(pipe);
 	struct nouveau_grobj *curie = nv40->screen->curie;
 	struct nv40_blend_state *bso = CALLOC(1, sizeof(*bso));
-	struct nouveau_stateobj *so = so_new(16, 0);
+	struct nouveau_stateobj *so = so_new(5, 8, 0);
 
 	if (cso->blend_enable) {
 		so_method(so, curie, NV40TCL_BLEND_ENABLE, 3);
@@ -310,7 +310,7 @@ nv40_rasterizer_state_create(struct pipe_context *pipe,
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
 	struct nv40_rasterizer_state *rsso = CALLOC(1, sizeof(*rsso));
-	struct nouveau_stateobj *so = so_new(32, 0);
+	struct nouveau_stateobj *so = so_new(8, 18, 0);
 	struct nouveau_grobj *curie = nv40->screen->curie;
 
 	/*XXX: ignored:
@@ -445,7 +445,7 @@ nv40_depth_stencil_alpha_state_create(struct pipe_context *pipe,
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
 	struct nv40_zsa_state *zsaso = CALLOC(1, sizeof(*zsaso));
-	struct nouveau_stateobj *so = so_new(32, 0);
+	struct nouveau_stateobj *so = so_new(4, 21, 0);
 	struct nouveau_grobj *curie = nv40->screen->curie;
 
 	so_method(so, curie, NV40TCL_DEPTH_FUNC, 3);
diff --git a/src/gallium/drivers/nv40/nv40_state_blend.c b/src/gallium/drivers/nv40/nv40_state_blend.c
index 8cd05ce66e..3ff00a37f6 100644
--- a/src/gallium/drivers/nv40/nv40_state_blend.c
+++ b/src/gallium/drivers/nv40/nv40_state_blend.c
@@ -18,7 +18,7 @@ struct nv40_state_entry nv40_state_blend = {
 static boolean
 nv40_state_blend_colour_validate(struct nv40_context *nv40)
 {
-	struct nouveau_stateobj *so = so_new(2, 0);
+	struct nouveau_stateobj *so = so_new(1, 1, 0);
 	struct pipe_blend_color *bcol = &nv40->blend_colour;
 
 	so_method(so, nv40->screen->curie, NV40TCL_BLEND_COLOR, 1);
diff --git a/src/gallium/drivers/nv40/nv40_state_emit.c b/src/gallium/drivers/nv40/nv40_state_emit.c
index 789ed16126..13fe854915 100644
--- a/src/gallium/drivers/nv40/nv40_state_emit.c
+++ b/src/gallium/drivers/nv40/nv40_state_emit.c
@@ -54,9 +54,10 @@ nv40_state_do_validate(struct nv40_context *nv40,
 void
 nv40_state_emit(struct nv40_context *nv40)
 {
-	struct nouveau_channel *chan = nv40->screen->base.channel;
 	struct nv40_state *state = &nv40->state;
 	struct nv40_screen *screen = nv40->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *curie = screen->curie;
 	unsigned i;
 	uint64_t states;
 
@@ -80,10 +81,10 @@ nv40_state_emit(struct nv40_context *nv40)
 
 	if (state->dirty & ((1ULL << NV40_STATE_FRAGPROG) |
 			    (1ULL << NV40_STATE_FRAGTEX0))) {
-		BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1);
-		OUT_RING  (2);
-		BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1);
-		OUT_RING  (1);
+		BEGIN_RING(chan, curie, NV40TCL_TEX_CACHE_CTL, 1);
+		OUT_RING  (chan, 2);
+		BEGIN_RING(chan, curie, NV40TCL_TEX_CACHE_CTL, 1);
+		OUT_RING  (chan, 1);
 	}
 
 	state->dirty = 0;
diff --git a/src/gallium/drivers/nv40/nv40_state_fb.c b/src/gallium/drivers/nv40/nv40_state_fb.c
index 1c7a7cd64f..a58fe9ddb1 100644
--- a/src/gallium/drivers/nv40/nv40_state_fb.c
+++ b/src/gallium/drivers/nv40/nv40_state_fb.c
@@ -19,7 +19,7 @@ nv40_state_framebuffer_validate(struct nv40_context *nv40)
 	struct nv04_surface *rt[4], *zeta;
 	uint32_t rt_enable, rt_format;
 	int i, colour_format = 0, zeta_format = 0;
-	struct nouveau_stateobj *so = so_new(64, 10);
+	struct nouveau_stateobj *so = so_new(18, 24, 10);
 	unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM;
 	unsigned w = fb->width;
 	unsigned h = fb->height;
diff --git a/src/gallium/drivers/nv40/nv40_state_scissor.c b/src/gallium/drivers/nv40/nv40_state_scissor.c
index cf58d33906..753a505e93 100644
--- a/src/gallium/drivers/nv40/nv40_state_scissor.c
+++ b/src/gallium/drivers/nv40/nv40_state_scissor.c
@@ -12,7 +12,7 @@ nv40_state_scissor_validate(struct nv40_context *nv40)
 		return FALSE;
 	nv40->state.scissor_enabled = rast->scissor;
 
-	so = so_new(3, 0);
+	so = so_new(1, 2, 0);
 	so_method(so, nv40->screen->curie, NV40TCL_SCISSOR_HORIZ, 2);
 	if (nv40->state.scissor_enabled) {
 		so_data  (so, ((s->maxx - s->minx) << 16) | s->minx);
diff --git a/src/gallium/drivers/nv40/nv40_state_stipple.c b/src/gallium/drivers/nv40/nv40_state_stipple.c
index b51024ad9b..2b371ebfec 100644
--- a/src/gallium/drivers/nv40/nv40_state_stipple.c
+++ b/src/gallium/drivers/nv40/nv40_state_stipple.c
@@ -14,14 +14,14 @@ nv40_state_stipple_validate(struct nv40_context *nv40)
 	if (rast->poly_stipple_enable) {
 		unsigned i;
 
-		so = so_new(35, 0);
+		so = so_new(2, 33, 0);
 		so_method(so, curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1);
 		so_data  (so, 1);
 		so_method(so, curie, NV40TCL_POLYGON_STIPPLE_PATTERN(0), 32);
 		for (i = 0; i < 32; i++)
 			so_data(so, nv40->stipple[i]);
 	} else {
-		so = so_new(2, 0);
+		so = so_new(1, 1, 0);
 		so_method(so, curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1);
 		so_data  (so, 0);
 	}
diff --git a/src/gallium/drivers/nv40/nv40_state_viewport.c b/src/gallium/drivers/nv40/nv40_state_viewport.c
index 665d2d5fca..9919ba1d0b 100644
--- a/src/gallium/drivers/nv40/nv40_state_viewport.c
+++ b/src/gallium/drivers/nv40/nv40_state_viewport.c
@@ -19,7 +19,7 @@ nv40_state_viewport_validate(struct nv40_context *nv40)
 		return FALSE;
 	nv40->state.viewport_bypass = bypass;
 
-	so = so_new(11, 0);
+	so = so_new(2, 9, 0);
 	if (!bypass) {
 		so_method(so, nv40->screen->curie,
 			  NV40TCL_VIEWPORT_TRANSLATE_X, 8);
diff --git a/src/gallium/drivers/nv40/nv40_vbo.c b/src/gallium/drivers/nv40/nv40_vbo.c
index af3fcf6a34..a777898f68 100644
--- a/src/gallium/drivers/nv40/nv40_vbo.c
+++ b/src/gallium/drivers/nv40/nv40_vbo.c
@@ -164,18 +164,21 @@ nv40_vbo_static_attrib(struct nv40_context *nv40, struct nouveau_stateobj *so,
 	return TRUE;
 }
 
-boolean
+void
 nv40_draw_arrays(struct pipe_context *pipe,
 		 unsigned mode, unsigned start, unsigned count)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
-	struct nouveau_channel *chan = nv40->screen->base.channel;
+	struct nv40_screen *screen = nv40->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *curie = screen->curie;
 	unsigned restart;
 
 	nv40_vbo_set_idxbuf(nv40, NULL, 0);
 	if (FORCE_SWTNL || !nv40_state_validate(nv40)) {
-		return nv40_draw_elements_swtnl(pipe, NULL, 0,
-						mode, start, count);
+		nv40_draw_elements_swtnl(pipe, NULL, 0,
+                                         mode, start, count);
+                return;
 	}
 
 	while (count) {
@@ -186,17 +189,17 @@ nv40_draw_arrays(struct pipe_context *pipe,
 		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256,
 					mode, start, count, &restart);
 		if (!vc) {
-			FIRE_RING(NULL);
+			FIRE_RING(chan);
 			continue;
 		}
 
-		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-		OUT_RING  (nvgl_primitive(mode));
+		BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (chan, nvgl_primitive(mode));
 
 		nr = (vc & 0xff);
 		if (nr) {
-			BEGIN_RING(curie, NV40TCL_VB_VERTEX_BATCH, 1);
-			OUT_RING  (((nr - 1) << 24) | start);
+			BEGIN_RING(chan, curie, NV40TCL_VB_VERTEX_BATCH, 1);
+			OUT_RING  (chan, ((nr - 1) << 24) | start);
 			start += nr;
 		}
 
@@ -206,29 +209,30 @@ nv40_draw_arrays(struct pipe_context *pipe,
 
 			nr -= push;
 
-			BEGIN_RING_NI(curie, NV40TCL_VB_VERTEX_BATCH, push);
+			BEGIN_RING_NI(chan, curie, NV40TCL_VB_VERTEX_BATCH, push);
 			while (push--) {
-				OUT_RING(((0x100 - 1) << 24) | start);
+				OUT_RING(chan, ((0x100 - 1) << 24) | start);
 				start += 0x100;
 			}
 		}
 
-		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (chan, 0);
 
 		count -= vc;
 		start = restart;
 	}
 
 	pipe->flush(pipe, 0, NULL);
-	return TRUE;
 }
 
 static INLINE void
 nv40_draw_elements_u08(struct nv40_context *nv40, void *ib,
 		       unsigned mode, unsigned start, unsigned count)
 {
-	struct nouveau_channel *chan = nv40->screen->base.channel;
+	struct nv40_screen *screen = nv40->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *curie = screen->curie;
 
 	while (count) {
 		uint8_t *elts = (uint8_t *)ib + start;
@@ -239,17 +243,17 @@ nv40_draw_elements_u08(struct nv40_context *nv40, void *ib,
 		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2,
 					mode, start, count, &restart);
 		if (vc == 0) {
-			FIRE_RING(NULL);
+			FIRE_RING(chan);
 			continue;
 		}
 		count -= vc;
 
-		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-		OUT_RING  (nvgl_primitive(mode));
+		BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (chan, nvgl_primitive(mode));
 
 		if (vc & 1) {
-			BEGIN_RING(curie, NV40TCL_VB_ELEMENT_U32, 1);
-			OUT_RING  (elts[0]);
+			BEGIN_RING(chan, curie, NV40TCL_VB_ELEMENT_U32, 1);
+			OUT_RING  (chan, elts[0]);
 			elts++; vc--;
 		}
 
@@ -258,16 +262,16 @@ nv40_draw_elements_u08(struct nv40_context *nv40, void *ib,
 
 			push = MIN2(vc, 2047 * 2);
 
-			BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U16, push >> 1);
+			BEGIN_RING_NI(chan, curie, NV40TCL_VB_ELEMENT_U16, push >> 1);
 			for (i = 0; i < push; i+=2)
-				OUT_RING((elts[i+1] << 16) | elts[i]);
+				OUT_RING(chan, (elts[i+1] << 16) | elts[i]);
 
 			vc -= push;
 			elts += push;
 		}
 
-		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (chan, 0);
 
 		start = restart;
 	}
@@ -277,7 +281,9 @@ static INLINE void
 nv40_draw_elements_u16(struct nv40_context *nv40, void *ib,
 		       unsigned mode, unsigned start, unsigned count)
 {
-	struct nouveau_channel *chan = nv40->screen->base.channel;
+	struct nv40_screen *screen = nv40->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *curie = screen->curie;
 
 	while (count) {
 		uint16_t *elts = (uint16_t *)ib + start;
@@ -288,17 +294,17 @@ nv40_draw_elements_u16(struct nv40_context *nv40, void *ib,
 		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2,
 					mode, start, count, &restart);
 		if (vc == 0) {
-			FIRE_RING(NULL);
+			FIRE_RING(chan);
 			continue;
 		}
 		count -= vc;
 
-		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-		OUT_RING  (nvgl_primitive(mode));
+		BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (chan, nvgl_primitive(mode));
 
 		if (vc & 1) {
-			BEGIN_RING(curie, NV40TCL_VB_ELEMENT_U32, 1);
-			OUT_RING  (elts[0]);
+			BEGIN_RING(chan, curie, NV40TCL_VB_ELEMENT_U32, 1);
+			OUT_RING  (chan, elts[0]);
 			elts++; vc--;
 		}
 
@@ -307,16 +313,16 @@ nv40_draw_elements_u16(struct nv40_context *nv40, void *ib,
 
 			push = MIN2(vc, 2047 * 2);
 
-			BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U16, push >> 1);
+			BEGIN_RING_NI(chan, curie, NV40TCL_VB_ELEMENT_U16, push >> 1);
 			for (i = 0; i < push; i+=2)
-				OUT_RING((elts[i+1] << 16) | elts[i]);
+				OUT_RING(chan, (elts[i+1] << 16) | elts[i]);
 
 			vc -= push;
 			elts += push;
 		}
 
-		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (chan, 0);
 
 		start = restart;
 	}
@@ -326,7 +332,9 @@ static INLINE void
 nv40_draw_elements_u32(struct nv40_context *nv40, void *ib,
 		       unsigned mode, unsigned start, unsigned count)
 {
-	struct nouveau_channel *chan = nv40->screen->base.channel;
+	struct nv40_screen *screen = nv40->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *curie = screen->curie;
 
 	while (count) {
 		uint32_t *elts = (uint32_t *)ib + start;
@@ -337,32 +345,32 @@ nv40_draw_elements_u32(struct nv40_context *nv40, void *ib,
 		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 5, 1,
 					mode, start, count, &restart);
 		if (vc == 0) {
-			FIRE_RING(NULL);
+			FIRE_RING(chan);
 			continue;
 		}
 		count -= vc;
 
-		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-		OUT_RING  (nvgl_primitive(mode));
+		BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (chan, nvgl_primitive(mode));
 
 		while (vc) {
 			push = MIN2(vc, 2047);
 
-			BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U32, push);
-			OUT_RINGp    (elts, push);
+			BEGIN_RING_NI(chan, curie, NV40TCL_VB_ELEMENT_U32, push);
+			OUT_RINGp    (chan, elts, push);
 
 			vc -= push;
 			elts += push;
 		}
 
-		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (chan, 0);
 
 		start = restart;
 	}
 }
 
-static boolean
+static void
 nv40_draw_elements_inline(struct pipe_context *pipe,
 			  struct pipe_buffer *ib, unsigned ib_size,
 			  unsigned mode, unsigned start, unsigned count)
@@ -393,15 +401,16 @@ nv40_draw_elements_inline(struct pipe_context *pipe,
 	}
 
 	pipe_buffer_unmap(pscreen, ib);
-	return TRUE;
 }
 
-static boolean
+static void
 nv40_draw_elements_vbo(struct pipe_context *pipe,
 		       unsigned mode, unsigned start, unsigned count)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
-	struct nouveau_channel *chan = nv40->screen->base.channel;
+	struct nv40_screen *screen = nv40->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *curie = screen->curie;
 	unsigned restart;
 
 	while (count) {
@@ -412,17 +421,17 @@ nv40_draw_elements_vbo(struct pipe_context *pipe,
 		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256,
 					mode, start, count, &restart);
 		if (!vc) {
-			FIRE_RING(NULL);
+			FIRE_RING(chan);
 			continue;
 		}
 		
-		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-		OUT_RING  (nvgl_primitive(mode));
+		BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (chan, nvgl_primitive(mode));
 
 		nr = (vc & 0xff);
 		if (nr) {
-			BEGIN_RING(curie, NV40TCL_VB_INDEX_BATCH, 1);
-			OUT_RING  (((nr - 1) << 24) | start);
+			BEGIN_RING(chan, curie, NV40TCL_VB_INDEX_BATCH, 1);
+			OUT_RING  (chan, ((nr - 1) << 24) | start);
 			start += nr;
 		}
 
@@ -432,24 +441,22 @@ nv40_draw_elements_vbo(struct pipe_context *pipe,
 
 			nr -= push;
 
-			BEGIN_RING_NI(curie, NV40TCL_VB_INDEX_BATCH, push);
+			BEGIN_RING_NI(chan, curie, NV40TCL_VB_INDEX_BATCH, push);
 			while (push--) {
-				OUT_RING(((0x100 - 1) << 24) | start);
+				OUT_RING(chan, ((0x100 - 1) << 24) | start);
 				start += 0x100;
 			}
 		}
 
-		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (chan, 0);
 
 		count -= vc;
 		start = restart;
 	}
-
-	return TRUE;
 }
 
-boolean
+void
 nv40_draw_elements(struct pipe_context *pipe,
 		   struct pipe_buffer *indexBuffer, unsigned indexSize,
 		   unsigned mode, unsigned start, unsigned count)
@@ -459,8 +466,9 @@ nv40_draw_elements(struct pipe_context *pipe,
 
 	idxbuf = nv40_vbo_set_idxbuf(nv40, indexBuffer, indexSize);
 	if (FORCE_SWTNL || !nv40_state_validate(nv40)) {
-		return nv40_draw_elements_swtnl(pipe, NULL, 0,
-						mode, start, count);
+		nv40_draw_elements_swtnl(pipe, NULL, 0,
+                                         mode, start, count);
+                return;
 	}
 
 	if (idxbuf) {
@@ -471,7 +479,6 @@ nv40_draw_elements(struct pipe_context *pipe,
 	}
 
 	pipe->flush(pipe, 0, NULL);
-	return TRUE;
 }
 
 static boolean
@@ -484,9 +491,9 @@ nv40_vbo_validate(struct nv40_context *nv40)
 	unsigned vb_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
 	int hw;
 
-	vtxbuf = so_new(20, 18);
+	vtxbuf = so_new(3, 17, 18);
 	so_method(vtxbuf, curie, NV40TCL_VTXBUF_ADDRESS(0), nv40->vtxelt_nr);
-	vtxfmt = so_new(17, 0);
+	vtxfmt = so_new(1, 16, 0);
 	so_method(vtxfmt, curie, NV40TCL_VTXFMT(0), nv40->vtxelt_nr);
 
 	for (hw = 0; hw < nv40->vtxelt_nr; hw++) {
@@ -499,7 +506,7 @@ nv40_vbo_validate(struct nv40_context *nv40)
 
 		if (!vb->stride) {
 			if (!sattr)
-				sattr = so_new(16 * 5, 0);
+				sattr = so_new(16, 16 * 4, 0);
 
 			if (nv40_vbo_static_attrib(nv40, sattr, hw, ve, vb)) {
 				so_data(vtxbuf, 0);
diff --git a/src/gallium/drivers/nv40/nv40_vertprog.c b/src/gallium/drivers/nv40/nv40_vertprog.c
index d9fc31006f..8d80fcad38 100644
--- a/src/gallium/drivers/nv40/nv40_vertprog.c
+++ b/src/gallium/drivers/nv40/nv40_vertprog.c
@@ -834,7 +834,9 @@ static boolean
 nv40_vertprog_validate(struct nv40_context *nv40)
 { 
 	struct pipe_screen *pscreen = nv40->pipe.screen;
-	struct nouveau_grobj *curie = nv40->screen->curie;
+	struct nv40_screen *screen = nv40->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *curie = screen->curie;
 	struct nv40_vertex_program *vp;
 	struct pipe_buffer *constbuf;
 	boolean upload_code = FALSE, upload_data = FALSE;
@@ -884,7 +886,7 @@ check_gpu_resources:
 				assert(0);
 		}
 
-		so = so_new(7, 0);
+		so = so_new(3, 4, 0);
 		so_method(so, curie, NV40TCL_VP_START_FROM_ID, 1);
 		so_data  (so, vp->exec->start);
 		so_method(so, curie, NV40TCL_VP_ATTRIB_EN, 2);
@@ -974,9 +976,9 @@ check_gpu_resources:
 				       4 * sizeof(float));
 			}
 
-			BEGIN_RING(curie, NV40TCL_VP_UPLOAD_CONST_ID, 5);
-			OUT_RING  (i + vp->data->start);
-			OUT_RINGp ((uint32_t *)vpd->value, 4);
+			BEGIN_RING(chan, curie, NV40TCL_VP_UPLOAD_CONST_ID, 5);
+			OUT_RING  (chan, i + vp->data->start);
+			OUT_RINGp (chan, (uint32_t *)vpd->value, 4);
 		}
 
 		if (constbuf)
@@ -993,11 +995,11 @@ check_gpu_resources:
 			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[3]);
 		}
 #endif
-		BEGIN_RING(curie, NV40TCL_VP_UPLOAD_FROM_ID, 1);
-		OUT_RING  (vp->exec->start);
+		BEGIN_RING(chan, curie, NV40TCL_VP_UPLOAD_FROM_ID, 1);
+		OUT_RING  (chan, vp->exec->start);
 		for (i = 0; i < vp->nr_insns; i++) {
-			BEGIN_RING(curie, NV40TCL_VP_UPLOAD_INST(0), 4);
-			OUT_RINGp (vp->insns[i].data, 4);
+			BEGIN_RING(chan, curie, NV40TCL_VP_UPLOAD_INST(0), 4);
+			OUT_RINGp (chan, vp->insns[i].data, 4);
 		}
 	}
 
diff --git a/src/gallium/drivers/nv50/nv50_context.h b/src/gallium/drivers/nv50/nv50_context.h
index 5578a5838f..cbd4c3ff86 100644
--- a/src/gallium/drivers/nv50/nv50_context.h
+++ b/src/gallium/drivers/nv50/nv50_context.h
@@ -191,9 +191,9 @@ nv50_surface_do_copy(struct nv50_screen *screen, struct pipe_surface *dst,
 extern struct draw_stage *nv50_draw_render_stage(struct nv50_context *nv50);
 
 /* nv50_vbo.c */
-extern boolean nv50_draw_arrays(struct pipe_context *, unsigned mode,
+extern void nv50_draw_arrays(struct pipe_context *, unsigned mode,
 				unsigned start, unsigned count);
-extern boolean nv50_draw_elements(struct pipe_context *pipe,
+extern void nv50_draw_elements(struct pipe_context *pipe,
 				  struct pipe_buffer *indexBuffer,
 				  unsigned indexSize,
 				  unsigned mode, unsigned start,
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index 2d0b1818ef..e16fa479e5 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -96,7 +96,11 @@ struct nv50_reg {
 
 #define NV50_MOD_NEG 1
 #define NV50_MOD_ABS 2
+#define NV50_MOD_NEG_ABS (NV50_MOD_NEG | NV50_MOD_ABS)
 #define NV50_MOD_SAT 4
+#define NV50_MOD_I32 8
+
+/* NV50_MOD_I32 is used to indicate integer mode for neg/abs */
 
 /* STACK: Conditionals and loops have to use the (per warp) stack.
  * Stack entries consist of an entry type (divergent path, join at),
@@ -134,6 +138,7 @@ struct nv50_pc {
 	uint8_t addr_alloc; /* set bit indicates used for TGSI_FILE_ADDRESS */
 
 	struct nv50_reg *temp_temp[16];
+	struct nv50_program_exec *temp_temp_exec[16];
 	unsigned temp_temp_nr;
 
 	/* broadcast and destination replacement regs */
@@ -241,7 +246,8 @@ alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
 		}
 	}
 
-	assert(0);
+	NOUVEAU_ERR("out of registers\n");
+	abort();
 }
 
 static INLINE struct nv50_reg *
@@ -281,7 +287,8 @@ alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
 		}
 	}
 
-	assert(0);
+	NOUVEAU_ERR("out of registers\n");
+	abort();
 	return NULL;
 }
 
@@ -343,23 +350,29 @@ free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
 }
 
 static struct nv50_reg *
-temp_temp(struct nv50_pc *pc)
+temp_temp(struct nv50_pc *pc, struct nv50_program_exec *e)
 {
 	if (pc->temp_temp_nr >= 16)
 		assert(0);
 
 	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
+	pc->temp_temp_exec[pc->temp_temp_nr] = e;
 	return pc->temp_temp[pc->temp_temp_nr++];
 }
 
+/* This *must* be called for all nv50_program_exec that have been
+ * given as argument to temp_temp, or the temps will be leaked !
+ */
 static void
-kill_temp_temp(struct nv50_pc *pc)
+kill_temp_temp(struct nv50_pc *pc, struct nv50_program_exec *e)
 {
 	int i;
 
 	for (i = 0; i < pc->temp_temp_nr; i++)
-		free_temp(pc, pc->temp_temp[i]);
-	pc->temp_temp_nr = 0;
+		if (pc->temp_temp_exec[i] == e)
+			free_temp(pc, pc->temp_temp[i]);
+	if (!e)
+		pc->temp_temp_nr = 0;
 }
 
 static int
@@ -421,6 +434,8 @@ emit(struct nv50_pc *pc, struct nv50_program_exec *e)
 		p->exec_head = e;
 	p->exec_tail = e;
 	p->exec_size += (e->inst[0] & 1) ? 2 : 1;
+
+	kill_temp_temp(pc, e);
 }
 
 static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
@@ -776,7 +791,7 @@ set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src,
 	struct nv50_reg *temp;
 
 	if (src->type != P_TEMP) {
-		temp = temp_temp(pc);
+		temp = temp_temp(pc, e);
 		emit_mov(pc, temp, src);
 		src = temp;
 	}
@@ -795,7 +810,7 @@ set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
 		e->inst[1] |= 0x00200000;
 	} else
 	if (src->type == P_CONST || src->type == P_IMMD) {
-		struct nv50_reg *temp = temp_temp(pc);
+		struct nv50_reg *temp = temp_temp(pc, e);
 
 		emit_mov(pc, temp, src);
 		src = temp;
@@ -811,7 +826,7 @@ static void
 set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
 {
 	if (src->type == P_ATTR) {
-		struct nv50_reg *temp = temp_temp(pc);
+		struct nv50_reg *temp = temp_temp(pc, e);
 
 		emit_mov(pc, temp, src);
 		src = temp;
@@ -819,7 +834,7 @@ set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
 	if (src->type == P_CONST || src->type == P_IMMD) {
 		assert(!(e->inst[0] & 0x00800000));
 		if (e->inst[0] & 0x01000000) {
-			struct nv50_reg *temp = temp_temp(pc);
+			struct nv50_reg *temp = temp_temp(pc, e);
 
 			emit_mov(pc, temp, src);
 			src = temp;
@@ -841,7 +856,7 @@ set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
 	set_long(pc, e);
 
 	if (src->type == P_ATTR) {
-		struct nv50_reg *temp = temp_temp(pc);
+		struct nv50_reg *temp = temp_temp(pc, e);
 
 		emit_mov(pc, temp, src);
 		src = temp;
@@ -849,7 +864,7 @@ set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
 	if (src->type == P_CONST || src->type == P_IMMD) {
 		assert(!(e->inst[0] & 0x01000000));
 		if (e->inst[0] & 0x00800000) {
-			struct nv50_reg *temp = temp_temp(pc);
+			struct nv50_reg *temp = temp_temp(pc, e);
 
 			emit_mov(pc, temp, src);
 			src = temp;
@@ -864,6 +879,26 @@ set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
 }
 
 static void
+set_half_src(struct nv50_pc *pc, struct nv50_reg *src, int lh,
+	     struct nv50_program_exec *e, int pos)
+{
+	struct nv50_reg *r = src;
+
+	alloc_reg(pc, r);
+	if (r->type != P_TEMP) {
+		r = temp_temp(pc, e);
+		emit_mov(pc, r, src);
+	}
+
+	if (r->hw > (NV50_SU_MAX_TEMP / 2)) {
+		NOUVEAU_ERR("out of low GPRs\n");
+		abort();
+	}
+
+	e->inst[pos / 32] |= ((src->hw * 2) + lh) << (pos % 32);
+}
+
+static void
 emit_mov_from_pred(struct nv50_pc *pc, struct nv50_reg *dst, int pred)
 {
 	struct nv50_program_exec *e = exec(pc);
@@ -967,6 +1002,13 @@ emit_arl(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
 	emit(pc, e);
 }
 
+#define NV50_MAX_F32 0x880
+#define NV50_MAX_S32 0x08c
+#define NV50_MAX_U32 0x084
+#define NV50_MIN_F32 0x8a0
+#define NV50_MIN_S32 0x0ac
+#define NV50_MIN_U32 0x0a4
+
 static void
 emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
 	    struct nv50_reg *src0, struct nv50_reg *src1)
@@ -974,8 +1016,8 @@ emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
 	struct nv50_program_exec *e = exec(pc);
 
 	set_long(pc, e);
-	e->inst[0] |= 0xb0000000;
-	e->inst[1] |= (sub << 29);
+	e->inst[0] |= 0x30000000 | ((sub & 0x800) << 20);
+	e->inst[1] |= (sub << 24);
 
 	check_swap_src_0_1(pc, &src0, &src1);
 	set_dst(pc, dst, e);
@@ -1039,6 +1081,69 @@ emit_bitop2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
 }
 
 static void
+emit_not(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0xd0000000;
+	e->inst[1] = 0x0402c000;
+	set_long(pc, e);
+	set_dst(pc, dst, e);
+	set_src_1(pc, src, e);
+
+	emit(pc, e);
+}
+
+static void
+emit_shift(struct nv50_pc *pc, struct nv50_reg *dst,
+	   struct nv50_reg *src0, struct nv50_reg *src1, unsigned dir)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0x30000000;
+	e->inst[1] = 0xc4000000;
+
+	set_long(pc, e);
+	set_dst(pc, dst, e);
+	set_src_0(pc, src0, e);
+
+	if (src1->type == P_IMMD) {
+		e->inst[1] |= (1 << 20);
+		e->inst[0] |= (pc->immd_buf[src1->hw] & 0x7f) << 16;
+	} else
+		set_src_1(pc, src1, e);
+
+	if (dir != TGSI_OPCODE_SHL)
+		e->inst[1] |= (1 << 29);
+
+	if (dir == TGSI_OPCODE_ISHR)
+		e->inst[1] |= (1 << 27);
+
+	emit(pc, e);
+}
+
+static void
+emit_shl_imm(struct nv50_pc *pc, struct nv50_reg *dst,
+	     struct nv50_reg *src, int s)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0x30000000;
+	e->inst[1] = 0xc4100000;
+	if (s < 0) {
+		e->inst[1] |= 1 << 29;
+		s = -s;
+	}
+	e->inst[1] |= ((s & 0x7f) << 16);
+
+	set_long(pc, e);
+	set_dst(pc, dst, e);
+	set_src_0(pc, src, e);
+
+	emit(pc, e);
+}
+
+static void
 emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
 	 struct nv50_reg *src1, struct nv50_reg *src2)
 {
@@ -1142,36 +1247,41 @@ emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 	emit(pc, e);
 }
 
-#define CVTOP_RN	0x01
-#define CVTOP_FLOOR	0x03
-#define CVTOP_CEIL	0x05
-#define CVTOP_TRUNC	0x07
-#define CVTOP_SAT	0x08
-#define CVTOP_ABS	0x10
+#define CVT_RN    (0x00 << 16)
+#define CVT_FLOOR (0x02 << 16)
+#define CVT_CEIL  (0x04 << 16)
+#define CVT_TRUNC (0x06 << 16)
+#define CVT_SAT   (0x08 << 16)
+#define CVT_ABS   (0x10 << 16)
 
-/* 0x04 == 32 bit dst */
-/* 0x40 == dst is float */
-/* 0x80 == src is float */
-#define CVT_F32_F32 0xc4
-#define CVT_F32_S32 0x44
-#define CVT_S32_F32 0x8c
-#define CVT_S32_S32 0x0c
-#define CVT_NEG     0x20
-#define CVT_RI      0x08
+#define CVT_X32_X32 0x04004000
+#define CVT_X32_S32 0x04014000
+#define CVT_F32_F32 ((0xc0 << 24) | CVT_X32_X32)
+#define CVT_S32_F32 ((0x88 << 24) | CVT_X32_X32)
+#define CVT_U32_F32 ((0x80 << 24) | CVT_X32_X32)
+#define CVT_F32_S32 ((0x40 << 24) | CVT_X32_S32)
+#define CVT_F32_U32 ((0x40 << 24) | CVT_X32_X32)
+#define CVT_S32_S32 ((0x08 << 24) | CVT_X32_S32)
+#define CVT_S32_U32 ((0x08 << 24) | CVT_X32_X32)
+#define CVT_U32_S32 ((0x00 << 24) | CVT_X32_S32)
+
+#define CVT_NEG 0x20000000
+#define CVT_RI  0x08000000
 
 static void
 emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
-	 int wp, unsigned cvn, unsigned fmt)
+	 int wp, uint32_t cvn)
 {
 	struct nv50_program_exec *e;
 
 	e = exec(pc);
-	set_long(pc, e);
 
-	e->inst[0] |= 0xa0000000;
-	e->inst[1] |= 0x00004000; /* 32 bit src */
-	e->inst[1] |= (cvn << 16);
-	e->inst[1] |= (fmt << 24);
+	if (src->mod & NV50_MOD_NEG) cvn |= CVT_NEG;
+	if (src->mod & NV50_MOD_ABS) cvn |= CVT_ABS;
+
+	e->inst[0] = 0xa0000000;
+	e->inst[1] = cvn;
+	set_long(pc, e);
 	set_src_0(pc, src, e);
 
 	if (wp >= 0)
@@ -1196,10 +1306,12 @@ emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
  *  0x6 = GE
  *  0x7 = set condition code ? (used before bra.lt/le/gt/ge)
  *  0x8 = unordered bit (allows NaN)
+ *
+ *  mode = 0x04 (u32), 0x0c (s32), 0x80 (f32)
  */
 static void
 emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
-	 struct nv50_reg *src0, struct nv50_reg *src1)
+	 struct nv50_reg *src0, struct nv50_reg *src1, uint8_t mode)
 {
 	static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
 
@@ -1214,16 +1326,10 @@ emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
 	if (dst && dst->type != P_TEMP)
 		dst = alloc_temp(pc, NULL);
 
-	/* set.u32 */
 	set_long(pc, e);
-	e->inst[0] |= 0xb0000000;
+	e->inst[0] |= 0x30000000 | (mode << 24);
 	e->inst[1] |= 0x60000000 | (ccode << 14);
 
-	/* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but
-	 * that doesn't seem to match what the hw actually does
-	e->inst[1] |= 0x04000000; << breaks things, u32 by default ?
-	 */
-
 	if (wp >= 0)
 		set_pred_wr(pc, 1, wp, e);
 	if (dst)
@@ -1238,33 +1344,146 @@ emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
 
 	emit(pc, e);
 
-	/* cvt.f32.u32/s32 (?) if we didn't only write the predicate */
-	if (rdst)
-		emit_cvt(pc, rdst, dst, -1, CVTOP_ABS | CVTOP_RN, CVT_F32_S32);
+	if (rdst && mode == 0x80) /* convert to float ? */
+		emit_cvt(pc, rdst, dst, -1, CVT_ABS | CVT_F32_S32);
 	if (rdst && rdst != dst)
 		free_temp(pc, dst);
 }
 
-static INLINE unsigned
-map_tgsi_setop_cc(unsigned op)
+static INLINE void
+map_tgsi_setop_hw(unsigned op, uint8_t *cc, uint8_t *ty)
 {
 	switch (op) {
-	case TGSI_OPCODE_SLT: return 0x1;
-	case TGSI_OPCODE_SGE: return 0x6;
-	case TGSI_OPCODE_SEQ: return 0x2;
-	case TGSI_OPCODE_SGT: return 0x4;
-	case TGSI_OPCODE_SLE: return 0x3;
-	case TGSI_OPCODE_SNE: return 0xd;
+	case TGSI_OPCODE_SLT: *cc = 0x1; *ty = 0x80; break;
+	case TGSI_OPCODE_SGE: *cc = 0x6; *ty = 0x80; break;
+	case TGSI_OPCODE_SEQ: *cc = 0x2; *ty = 0x80; break;
+	case TGSI_OPCODE_SGT: *cc = 0x4; *ty = 0x80; break;
+	case TGSI_OPCODE_SLE: *cc = 0x3; *ty = 0x80; break;
+	case TGSI_OPCODE_SNE: *cc = 0xd; *ty = 0x80; break;
+
+	case TGSI_OPCODE_ISLT: *cc = 0x1; *ty = 0x0c; break;
+	case TGSI_OPCODE_ISGE: *cc = 0x6; *ty = 0x0c; break;
+	case TGSI_OPCODE_USEQ: *cc = 0x2; *ty = 0x04; break;
+	case TGSI_OPCODE_USGE: *cc = 0x6; *ty = 0x04; break;
+	case TGSI_OPCODE_USLT: *cc = 0x1; *ty = 0x04; break;
+	case TGSI_OPCODE_USNE: *cc = 0x5; *ty = 0x04; break;
 	default:
 		assert(0);
-		return 0;
+		return;
+	}
+}
+
+static void
+emit_add_b32(struct nv50_pc *pc, struct nv50_reg *dst,
+	     struct nv50_reg *src0, struct nv50_reg *rsrc1)
+{
+	struct nv50_program_exec *e = exec(pc);
+	struct nv50_reg *src1;
+
+	e->inst[0] = 0x20000000;
+
+	alloc_reg(pc, rsrc1);
+	check_swap_src_0_1(pc, &src0, &rsrc1);
+
+	src1 = rsrc1;
+	if (src0->mod & rsrc1->mod & NV50_MOD_NEG) {
+		src1 = temp_temp(pc, e);
+		emit_cvt(pc, src1, rsrc1, -1, CVT_S32_S32);
+	}
+
+	if (!pc->allow32 || src1->hw > 63 ||
+	    (src1->type != P_TEMP && src1->type != P_IMMD))
+		set_long(pc, e);
+
+	set_dst(pc, dst, e);
+	set_src_0(pc, src0, e);
+
+	if (is_long(e)) {
+		e->inst[1] |= 1 << 26;
+		set_src_2(pc, src1, e);
+	} else {
+		e->inst[0] |= 0x8000;
+		if (src1->type == P_IMMD)
+			set_immd(pc, src1, e);
+		else
+			set_src_1(pc, src1, e);
 	}
+
+	if (src0->mod & NV50_MOD_NEG)
+		e->inst[0] |= 1 << 28;
+	else
+	if (src1->mod & NV50_MOD_NEG)
+		e->inst[0] |= 1 << 22;
+
+	emit(pc, e);
+}
+
+static void
+emit_mad_u16(struct nv50_pc *pc, struct nv50_reg *dst,
+	     struct nv50_reg *src0, int lh_0, struct nv50_reg *src1, int lh_1,
+	     struct nv50_reg *src2)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0x60000000;
+	if (!pc->allow32)
+		set_long(pc, e);
+	set_dst(pc, dst, e);
+
+	set_half_src(pc, src0, lh_0, e, 9);
+	set_half_src(pc, src1, lh_1, e, 16);
+	alloc_reg(pc, src2);
+	if (is_long(e) || (src2->type != P_TEMP) || (src2->hw != dst->hw))
+		set_src_2(pc, src2, e);
+
+	emit(pc, e);
+}
+
+static void
+emit_mul_u16(struct nv50_pc *pc, struct nv50_reg *dst,
+	     struct nv50_reg *src0, int lh_0, struct nv50_reg *src1, int lh_1)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0x40000000;
+	set_long(pc, e);
+	set_dst(pc, dst, e);
+
+	set_half_src(pc, src0, lh_0, e, 9);
+	set_half_src(pc, src1, lh_1, e, 16);
+
+	emit(pc, e);
+}
+
+static void
+emit_sad(struct nv50_pc *pc, struct nv50_reg *dst,
+	 struct nv50_reg *src0, struct nv50_reg *src1, struct nv50_reg *src2)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0x50000000;
+	if (!pc->allow32)
+		set_long(pc, e);
+	check_swap_src_0_1(pc, &src0, &src1);
+	set_dst(pc, dst, e);
+	set_src_0(pc, src0, e);
+	set_src_1(pc, src1, e);
+	alloc_reg(pc, src2);
+	if (is_long(e) || (src2->type != dst->type) || (src2->hw != dst->hw))
+		set_src_2(pc, src2, e);
+
+	if (is_long(e))
+		e->inst[1] |= 0x0c << 24;
+	else
+		e->inst[0] |= 0x81 << 8;
+
+	emit(pc, e);
 }
 
 static INLINE void
 emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 {
-	emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32 | CVT_RI);
+	emit_cvt(pc, dst, src, -1, CVT_FLOOR | CVT_F32_F32 | CVT_RI);
 }
 
 static void
@@ -1282,15 +1501,9 @@ emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
 }
 
 static INLINE void
-emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
-{
-	emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
-}
-
-static INLINE void
 emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 {
-	emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32);
+	emit_cvt(pc, dst, src, -1, CVT_SAT | CVT_F32_F32);
 }
 
 static void
@@ -1308,18 +1521,18 @@ emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
 
 	if (mask & (3 << 1)) {
 		tmp[0] = alloc_temp(pc, NULL);
-		emit_minmax(pc, 4, tmp[0], src[0], zero);
+		emit_minmax(pc, NV50_MAX_F32, tmp[0], src[0], zero);
 	}
 
 	if (mask & (1 << 2)) {
 		set_pred_wr(pc, 1, 0, pc->p->exec_tail);
 
-		tmp[1] = temp_temp(pc);
-		emit_minmax(pc, 4, tmp[1], src[1], zero);
+		tmp[1] = temp_temp(pc, NULL);
+		emit_minmax(pc, NV50_MAX_F32, tmp[1], src[1], zero);
 
-		tmp[3] = temp_temp(pc);
-		emit_minmax(pc, 4, tmp[3], src[3], neg128);
-		emit_minmax(pc, 5, tmp[3], tmp[3], pos128);
+		tmp[3] = temp_temp(pc, NULL);
+		emit_minmax(pc, NV50_MAX_F32, tmp[3], src[3], neg128);
+		emit_minmax(pc, NV50_MIN_F32, tmp[3], tmp[3], pos128);
 
 		emit_pow(pc, dst[2], tmp[1], tmp[3]);
 		emit_mov(pc, dst[2], zero);
@@ -1347,12 +1560,6 @@ emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
 	FREE(one);
 }
 
-static INLINE void
-emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
-{
-	emit_cvt(pc, dst, src, -1, CVTOP_RN, CVT_F32_F32 | CVT_NEG);
-}
-
 static void
 emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
 {
@@ -1364,14 +1571,9 @@ emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
 	set_long(pc, e); /* sets cond code to ALWAYS */
 
 	if (src) {
-		unsigned cvn = CVT_F32_F32;
-
 		set_pred(pc, 0x1 /* cc = LT */, r_pred, e);
-
-		if (src->mod & NV50_MOD_NEG)
-			cvn |= CVT_NEG;
-		/* write predicate reg */
-		emit_cvt(pc, NULL, src, r_pred, CVTOP_RN, cvn);
+		/* write to predicate reg */
+		emit_cvt(pc, NULL, src, r_pred, CVT_F32_F32);
 	}
 
 	emit(pc, e);
@@ -1474,8 +1676,8 @@ load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
 	src[1]->mod |= NV50_MOD_ABS;
 	src[2]->mod |= NV50_MOD_ABS;
 
-	emit_minmax(pc, 4, t[2], src[0], src[1]);
-	emit_minmax(pc, 4, t[2], src[2], t[2]);
+	emit_minmax(pc, NV50_MAX_F32, t[2], src[0], src[1]);
+	emit_minmax(pc, NV50_MAX_F32, t[2], src[2], t[2]);
 
 	src[0]->mod = mod[0];
 	src[1]->mod = mod[1];
@@ -1778,6 +1980,21 @@ convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
 		q = 0x0403c000;
 		m = 0xffff7fff;
 		break;
+	case 0x2:
+	case 0x3:
+		/* ADD, SUB, SUBR b32 */
+		m = ~(0x8000 | (127 << 16));
+		q = ((e->inst[0] & (~m)) >> 2) | (1 << 26);
+		break;
+	case 0x5:
+		/* SAD */
+		m = ~(0x81 << 8);
+		q = (0x0c << 24) | ((e->inst[0] & (0x7f << 2)) << 12);
+		break;
+	case 0x6:
+		/* MAD u16 */
+		q = (e->inst[0] & (0x7f << 2)) << 12;
+		break;
 	case 0x8:
 		/* INTERP (move centroid, perspective and flat bits) */
 		m = ~0x03000100;
@@ -1814,8 +2031,8 @@ convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
 }
 
 /* Some operations support an optional negation flag. */
-static boolean
-negate_supported(const struct tgsi_full_instruction *insn, int i)
+static int
+get_supported_mods(const struct tgsi_full_instruction *insn, int i)
 {
 	switch (insn->Instruction.Opcode) {
 	case TGSI_OPCODE_ADD:
@@ -1835,9 +2052,36 @@ negate_supported(const struct tgsi_full_instruction *insn, int i)
 	case TGSI_OPCODE_SCS:
 	case TGSI_OPCODE_SIN:
 	case TGSI_OPCODE_SUB:
-		return TRUE;
+		return NV50_MOD_NEG;
+	case TGSI_OPCODE_MAX:
+	case TGSI_OPCODE_MIN:
+	case TGSI_OPCODE_INEG: /* tgsi src sign toggle/set would be stupid */
+		return NV50_MOD_ABS;
+	case TGSI_OPCODE_CEIL:
+	case TGSI_OPCODE_FLR:
+	case TGSI_OPCODE_TRUNC:
+		return NV50_MOD_NEG | NV50_MOD_ABS;
+	case TGSI_OPCODE_F2I:
+	case TGSI_OPCODE_F2U:
+	case TGSI_OPCODE_I2F:
+	case TGSI_OPCODE_U2F:
+		return NV50_MOD_NEG | NV50_MOD_ABS | NV50_MOD_I32;
+	case TGSI_OPCODE_UADD:
+		return NV50_MOD_NEG | NV50_MOD_I32;
+	case TGSI_OPCODE_SAD:
+	case TGSI_OPCODE_SHL:
+	case TGSI_OPCODE_IMAX:
+	case TGSI_OPCODE_IMIN:
+	case TGSI_OPCODE_ISHR:
+	case TGSI_OPCODE_NOT:
+	case TGSI_OPCODE_UMAD:
+	case TGSI_OPCODE_UMAX:
+	case TGSI_OPCODE_UMIN:
+	case TGSI_OPCODE_UMUL:
+	case TGSI_OPCODE_USHR:
+		return NV50_MOD_I32;
 	default:
-		return FALSE;
+		return 0;
 	}
 }
 
@@ -1944,11 +2188,11 @@ tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
 
 static struct nv50_reg *
 tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
-	 boolean neg)
+	 int mod)
 {
 	struct nv50_reg *r = NULL;
-	struct nv50_reg *temp;
-	unsigned sgn, c, swz;
+	struct nv50_reg *temp = NULL;
+	unsigned sgn, c, swz, cvn;
 
 	if (src->Register.File != TGSI_FILE_CONSTANT)
 		assert(!src->Register.Indirect);
@@ -1988,7 +2232,7 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
 			r = &pc->immd[src->Register.Index * 4 + c];
 			break;
 		case TGSI_FILE_SAMPLER:
-			break;
+			return NULL;
 		case TGSI_FILE_ADDRESS:
 			r = pc->addr[src->Register.Index * 4 + c];
 			assert(r);
@@ -2003,35 +2247,34 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
 		break;
 	}
 
+	cvn = (mod & NV50_MOD_I32) ? CVT_S32_S32 : CVT_F32_F32;
+
 	switch (sgn) {
-	case TGSI_UTIL_SIGN_KEEP:
-		break;
 	case TGSI_UTIL_SIGN_CLEAR:
-		temp = temp_temp(pc);
-		emit_abs(pc, temp, r);
-		r = temp;
-		break;
-	case TGSI_UTIL_SIGN_TOGGLE:
-		if (neg)
-			r->mod = NV50_MOD_NEG;
-		else {
-			temp = temp_temp(pc);
-			emit_neg(pc, temp, r);
-			r = temp;
-		}
+		r->mod = NV50_MOD_ABS;
 		break;
 	case TGSI_UTIL_SIGN_SET:
-		temp = temp_temp(pc);
-		emit_cvt(pc, temp, r, -1, CVTOP_ABS, CVT_F32_F32 | CVT_NEG);
-		r = temp;
+		r->mod = NV50_MOD_NEG_ABS;
+		break;
+	case TGSI_UTIL_SIGN_TOGGLE:
+		r->mod = NV50_MOD_NEG;
 		break;
 	default:
-		assert(0);
+		assert(!r->mod && sgn == TGSI_UTIL_SIGN_KEEP);
 		break;
 	}
 
-	if (r && r->acc >= 0 && r != temp)
-		return reg_instance(pc, r);
+	if ((r->mod & mod) != r->mod) {
+		temp = temp_temp(pc, NULL);
+		emit_cvt(pc, temp, r, -1, cvn);
+		r->mod = 0;
+		r = temp;
+	} else
+		r->mod |= mod & NV50_MOD_I32;
+
+	assert(r);
+	if (r->acc >= 0 && r != temp)
+		return reg_instance(pc, r); /* will clear r->mod */
 	return r;
 }
 
@@ -2195,22 +2438,22 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
 		const struct tgsi_full_src_register *fs = &inst->Src[i];
 		unsigned src_mask;
-		boolean neg_supp;
+		int mod_supp;
 
 		src_mask = nv50_tgsi_src_mask(inst, i);
-		neg_supp = negate_supported(inst, i);
+		mod_supp = get_supported_mods(inst, i);
 
 		if (fs->Register.File == TGSI_FILE_SAMPLER)
 			unit = fs->Register.Index;
 
 		for (c = 0; c < 4; c++)
 			if (src_mask & (1 << c))
-				src[i][c] = tgsi_src(pc, c, fs, neg_supp);
+				src[i][c] = tgsi_src(pc, c, fs, mod_supp);
 	}
 
 	brdc = temp = pc->r_brdc;
 	if (brdc && brdc->type != P_TEMP) {
-		temp = temp_temp(pc);
+		temp = temp_temp(pc, NULL);
 		if (sat)
 			brdc = temp;
 	} else
@@ -2219,7 +2462,7 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 			if (!(mask & (1 << c)) || dst[c]->type == P_TEMP)
 				continue;
 			/* rdst[c] = dst[c]; */ /* done above */
-			dst[c] = temp_temp(pc);
+			dst[c] = temp_temp(pc, NULL);
 		}
 	}
 
@@ -2230,7 +2473,8 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			emit_abs(pc, dst[c], src[0][c]);
+			emit_cvt(pc, dst[c], src[0][c], -1,
+				 CVT_ABS | CVT_F32_F32);
 		}
 		break;
 	case TGSI_OPCODE_ADD:
@@ -2252,8 +2496,8 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		break;
 	case TGSI_OPCODE_ARL:
 		assert(src[0][0]);
-		temp = temp_temp(pc);
-		emit_cvt(pc, temp, src[0][0], -1, CVTOP_FLOOR, CVT_S32_F32);
+		temp = temp_temp(pc, NULL);
+		emit_cvt(pc, temp, src[0][0], -1, CVT_FLOOR | CVT_S32_F32);
 		emit_arl(pc, dst[0], temp, 4);
 		break;
 	case TGSI_OPCODE_BGNLOOP:
@@ -2282,7 +2526,7 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 			if (!(mask & (1 << c)))
 				continue;
 			emit_cvt(pc, dst[c], src[0][c], -1,
-				 CVTOP_CEIL, CVT_F32_F32 | CVT_RI);
+				 CVT_CEIL | CVT_F32_F32 | CVT_RI);
 		}
 		break;
 	case TGSI_OPCODE_CMP:
@@ -2290,7 +2534,7 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			emit_cvt(pc, NULL, src[0][c], 1, CVTOP_RN, CVT_F32_F32);
+			emit_cvt(pc, NULL, src[0][c], 1, CVT_F32_F32);
 			emit_mov(pc, dst[c], src[1][c]);
 			set_pred(pc, 0x1, 1, pc->p->exec_tail); /* @SF */
 			emit_mov(pc, dst[c], src[2][c]);
@@ -2309,7 +2553,7 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 			if (!(mask &= 7))
 				break;
 			if (temp == dst[3])
-				temp = brdc = temp_temp(pc);
+				temp = brdc = temp_temp(pc, NULL);
 		}
 		emit_precossin(pc, temp, src[0][0]);
 		emit_flop(pc, NV50_FLOP_COS, brdc, temp);
@@ -2397,8 +2641,8 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		struct nv50_reg *t[2];
 
 		assert(!temp);
-		t[0] = temp_temp(pc);
-		t[1] = temp_temp(pc);
+		t[0] = temp_temp(pc, NULL);
+		t[1] = temp_temp(pc, NULL);
 
 		if (mask & 0x6)
 			emit_mov(pc, t[0], src[0][0]);
@@ -2419,6 +2663,22 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 			emit_mov_immdval(pc, dst[3], 1.0f);
 	}
 		break;
+	case TGSI_OPCODE_F2I:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_cvt(pc, dst[c], src[0][c], -1,
+				 CVT_TRUNC | CVT_S32_F32);
+		}
+		break;
+	case TGSI_OPCODE_F2U:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_cvt(pc, dst[c], src[0][c], -1,
+				 CVT_TRUNC | CVT_U32_F32);
+		}
+		break;
 	case TGSI_OPCODE_FLR:
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
@@ -2427,7 +2687,7 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		}
 		break;
 	case TGSI_OPCODE_FRC:
-		temp = temp_temp(pc);
+		temp = temp_temp(pc, NULL);
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
@@ -2435,14 +2695,42 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 			emit_sub(pc, dst[c], src[0][c], temp);
 		}
 		break;
+	case TGSI_OPCODE_I2F:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_cvt(pc, dst[c], src[0][c], -1, CVT_F32_S32);
+		}
+		break;
 	case TGSI_OPCODE_IF:
 		assert(pc->if_lvl < NV50_MAX_COND_NESTING);
-		emit_cvt(pc, NULL, src[0][0], 0, CVTOP_ABS | CVTOP_RN,
-			 CVT_F32_F32);
+		emit_cvt(pc, NULL, src[0][0], 0, CVT_ABS | CVT_F32_F32);
 		pc->if_join[pc->if_lvl] = emit_joinat(pc);
 		pc->if_insn[pc->if_lvl++] = emit_branch(pc, 0, 2);;
 		terminate_mbb(pc);
 		break;
+	case TGSI_OPCODE_IMAX:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_minmax(pc, 0x08c, dst[c], src[0][c], src[1][c]);
+		}
+		break;
+	case TGSI_OPCODE_IMIN:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_minmax(pc, 0x0ac, dst[c], src[0][c], src[1][c]);
+		}
+		break;
+	case TGSI_OPCODE_INEG:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_cvt(pc, dst[c], src[0][c], -1,
+				 CVT_S32_S32 | CVT_NEG);
+		}
+		break;
 	case TGSI_OPCODE_KIL:
 		assert(src[0][0] && src[0][1] && src[0][2] && src[0][3]);
 		emit_kil(pc, src[0][0]);
@@ -2463,13 +2751,13 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 	{
 		struct nv50_reg *t[2];
 
-		t[0] = temp_temp(pc);
+		t[0] = temp_temp(pc, NULL);
 		if (mask & (1 << 1))
-			t[1] = temp_temp(pc);
+			t[1] = temp_temp(pc, NULL);
 		else
 			t[1] = t[0];
 
-		emit_abs(pc, t[0], src[0][0]);
+		emit_cvt(pc, t[0], src[0][0], -1, CVT_ABS | CVT_F32_F32);
 		emit_flop(pc, NV50_FLOP_LG2, t[1], t[0]);
 		if (mask & (1 << 2))
 			emit_mov(pc, dst[2], t[1]);
@@ -2488,7 +2776,7 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 	}
 		break;
 	case TGSI_OPCODE_LRP:
-		temp = temp_temp(pc);
+		temp = temp_temp(pc, NULL);
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
@@ -2507,14 +2795,14 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
+			emit_minmax(pc, 0x880, dst[c], src[0][c], src[1][c]);
 		}
 		break;
 	case TGSI_OPCODE_MIN:
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
+			emit_minmax(pc, 0x8a0, dst[c], src[0][c], src[1][c]);
 		}
 		break;
 	case TGSI_OPCODE_MOV:
@@ -2531,10 +2819,19 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 			emit_mul(pc, dst[c], src[0][c], src[1][c]);
 		}
 		break;
+	case TGSI_OPCODE_NOT:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_not(pc, dst[c], src[0][c]);
+		}
+		break;
 	case TGSI_OPCODE_POW:
 		emit_pow(pc, brdc, src[0][0], src[1][0]);
 		break;
 	case TGSI_OPCODE_RCP:
+		if (!sat && popcnt4(mask) == 1)
+			brdc = dst[ffs(mask) - 1];
 		emit_flop(pc, NV50_FLOP_RCP, brdc, src[0][0]);
 		break;
 	case TGSI_OPCODE_RET:
@@ -2543,11 +2840,20 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		emit_ret(pc, -1, 0);
 		break;
 	case TGSI_OPCODE_RSQ:
+		if (!sat && popcnt4(mask) == 1)
+			brdc = dst[ffs(mask) - 1];
 		src[0][0]->mod |= NV50_MOD_ABS;
 		emit_flop(pc, NV50_FLOP_RSQ, brdc, src[0][0]);
 		break;
+	case TGSI_OPCODE_SAD:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_sad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
+		}
+		break;
 	case TGSI_OPCODE_SCS:
-		temp = temp_temp(pc);
+		temp = temp_temp(pc, NULL);
 		if (mask & 3)
 			emit_precossin(pc, temp, src[0][0]);
 		if (mask & (1 << 0))
@@ -2559,6 +2865,16 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		if (mask & (1 << 3))
 			emit_mov_immdval(pc, dst[3], 1.0);
 		break;
+	case TGSI_OPCODE_SHL:
+	case TGSI_OPCODE_ISHR:
+	case TGSI_OPCODE_USHR:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_shift(pc, dst[c], src[0][c], src[1][c],
+				   inst->Instruction.Opcode);
+		}
+		break;
 	case TGSI_OPCODE_SIN:
 		if (mask & 8) {
 			emit_precossin(pc, temp, src[0][3]);
@@ -2566,7 +2882,7 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 			if (!(mask &= 7))
 				break;
 			if (temp == dst[3])
-				temp = brdc = temp_temp(pc);
+				temp = brdc = temp_temp(pc, NULL);
 		}
 		emit_precossin(pc, temp, src[0][0]);
 		emit_flop(pc, NV50_FLOP_SIN, brdc, temp);
@@ -2577,12 +2893,23 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 	case TGSI_OPCODE_SGT:
 	case TGSI_OPCODE_SLE:
 	case TGSI_OPCODE_SNE:
-		i = map_tgsi_setop_cc(inst->Instruction.Opcode);
+	case TGSI_OPCODE_ISLT:
+	case TGSI_OPCODE_ISGE:
+	case TGSI_OPCODE_USEQ:
+	case TGSI_OPCODE_USGE:
+	case TGSI_OPCODE_USLT:
+	case TGSI_OPCODE_USNE:
+	{
+		uint8_t cc, ty;
+
+		map_tgsi_setop_hw(inst->Instruction.Opcode, &cc, &ty);
+
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]);
+			emit_set(pc, cc, dst[c], -1, src[0][c], src[1][c], ty);
 		}
+	}
 		break;
 	case TGSI_OPCODE_SUB:
 		for (c = 0; c < 4; c++) {
@@ -2612,11 +2939,72 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 			if (!(mask & (1 << c)))
 				continue;
 			emit_cvt(pc, dst[c], src[0][c], -1,
-				 CVTOP_TRUNC, CVT_F32_F32 | CVT_RI);
+				 CVT_TRUNC | CVT_F32_F32 | CVT_RI);
 		}
 		break;
+	case TGSI_OPCODE_U2F:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_cvt(pc, dst[c], src[0][c], -1, CVT_F32_U32);
+		}
+		break;
+	case TGSI_OPCODE_UADD:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_add_b32(pc, dst[c], src[0][c], src[1][c]);
+		}
+		break;
+	case TGSI_OPCODE_UMAX:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_minmax(pc, 0x084, dst[c], src[0][c], src[1][c]);
+		}
+		break;
+	case TGSI_OPCODE_UMIN:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_minmax(pc, 0x0a4, dst[c], src[0][c], src[1][c]);
+		}
+		break;
+	case TGSI_OPCODE_UMAD:
+	{
+		assert(!temp);
+		temp = temp_temp(pc, NULL);
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_mul_u16(pc, temp, src[0][c], 0, src[1][c], 1);
+			emit_mad_u16(pc, temp, src[0][c], 1, src[1][c], 0,
+				     temp);
+			emit_shl_imm(pc, temp, temp, 16);
+			emit_mad_u16(pc, temp, src[0][c], 0, src[1][c], 0,
+				     temp);
+			emit_add_b32(pc, dst[c], temp, src[2][c]);
+		}
+	}
+		break;
+	case TGSI_OPCODE_UMUL:
+	{
+		assert(!temp);
+		temp = temp_temp(pc, NULL);
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_mul_u16(pc, temp, src[0][c], 0, src[1][c], 1);
+			emit_mad_u16(pc, temp, src[0][c], 1, src[1][c], 0,
+				     temp);
+			emit_shl_imm(pc, temp, temp, 16);
+			emit_mad_u16(pc, dst[c], src[0][c], 0, src[1][c], 0,
+				     temp);
+		}
+	}
+		break;
 	case TGSI_OPCODE_XPD:
-		temp = temp_temp(pc);
+		temp = temp_temp(pc, NULL);
 		if (mask & (1 << 0)) {
 			emit_mul(pc, temp, src[0][2], src[1][1]);
 			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
@@ -2670,7 +3058,7 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		}
 	}
 
-	kill_temp_temp(pc);
+	kill_temp_temp(pc, NULL);
 	pc->reg_instance_nr = 0;
 
 	return TRUE;
@@ -2679,7 +3067,7 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 static void
 prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
 {
-	struct nv50_reg *reg = NULL;
+	struct nv50_reg *r, *reg = NULL;
 	const struct tgsi_full_src_register *src;
 	const struct tgsi_dst_register *dst;
 	unsigned i, c, k, mask;
@@ -2725,7 +3113,15 @@ prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
 				continue;
 			k = tgsi_util_get_full_src_register_swizzle(src, c);
 
-			reg[src->Register.Index * 4 + k].acc = pc->insn_nr;
+			r = &reg[src->Register.Index * 4 + k];
+
+			/* If used before written, pre-allocate the reg,
+			 * lest we overwrite results from a subroutine.
+			 */
+			if (!r->acc && r->type == P_TEMP)
+				alloc_reg(pc, r);
+
+			r->acc = pc->insn_nr;
 		}
 	}
 }
@@ -2814,7 +3210,7 @@ nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
 
 	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
 		unsigned chn, mask = nv50_tgsi_src_mask(insn, i);
-		boolean neg_supp = negate_supported(insn, i);
+		int ms = get_supported_mods(insn, i);
 
 		fs = &insn->Src[i];
 		if (fs->Register.File != fd->Register.File ||
@@ -2832,10 +3228,12 @@ nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
 			if (!(fd->Register.WriteMask & (1 << c)))
 				continue;
 
-			/* no danger if src is copied to TEMP first */
-			if ((s != TGSI_UTIL_SIGN_KEEP) &&
-			    (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp))
-				continue;
+			if (s == TGSI_UTIL_SIGN_TOGGLE && !(ms & NV50_MOD_NEG))
+					continue;
+			if (s == TGSI_UTIL_SIGN_CLEAR && !(ms & NV50_MOD_ABS))
+					continue;
+			if ((s == TGSI_UTIL_SIGN_SET) && ((ms & 3) != 3))
+					continue;
 
 			rdep[c] |= nv50_tgsi_dst_revdep(
 				insn->Instruction.Opcode, i, chn);
@@ -2859,7 +3257,7 @@ nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 	if (is_scalar_op(insn.Instruction.Opcode)) {
 		pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
 		if (!pc->r_brdc)
-			pc->r_brdc = temp_temp(pc);
+			pc->r_brdc = temp_temp(pc, NULL);
 		return nv50_program_tx_insn(pc, &insn);
 	}
 	pc->r_brdc = NULL;
@@ -3579,7 +3977,7 @@ nv50_vertprog_validate(struct nv50_context *nv50)
 	nv50_program_validate_data(nv50, p);
 	nv50_program_validate_code(nv50, p);
 
-	so = so_new(13, 2);
+	so = so_new(5, 8, 2);
 	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
 	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
 		      NOUVEAU_BO_HIGH, 0, 0);
@@ -3615,7 +4013,7 @@ nv50_fragprog_validate(struct nv50_context *nv50)
 	nv50_program_validate_data(nv50, p);
 	nv50_program_validate_code(nv50, p);
 
-	so = so_new(64, 2);
+	so = so_new(6, 7, 2);
 	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
 	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
 		      NOUVEAU_BO_HIGH, 0, 0);
@@ -3635,12 +4033,13 @@ nv50_fragprog_validate(struct nv50_context *nv50)
 	so_ref(NULL, &so);
 }
 
-static void
+static uint32_t
 nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
 {
 	struct nv50_program *fp = nv50->fragprog;
 	struct nv50_program *vp = nv50->vertprog;
 	unsigned i, c, m = base;
+	uint32_t origin = 0x00000010;
 
 	/* XXX: this might not work correctly in all cases yet - we'll
 	 * just assume that an FP generic input that is not written in
@@ -3674,7 +4073,9 @@ nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
 			if (mode == PIPE_SPRITE_COORD_NONE) {
 				m += n;
 				continue;
-			}
+			} else
+			if (mode == PIPE_SPRITE_COORD_LOWER_LEFT)
+				origin = 0;
 		}
 
 		/* this is either PointCoord or replaced by sprite coords */
@@ -3685,6 +4086,7 @@ nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
 			++m;
 		}
 	}
+	return origin;
 }
 
 static int
@@ -3783,7 +4185,7 @@ nv50_linkage_validate(struct nv50_context *nv50)
 	}
 
 	/* now fill the stateobj */
-	so = so_new(64, 0);
+	so = so_new(7, 57, 0);
 
 	n = (m + 3) / 4;
 	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
@@ -3801,7 +4203,9 @@ nv50_linkage_validate(struct nv50_context *nv50)
 	so_datap (so, lin, 4);
 
 	if (nv50->rasterizer->pipe.point_sprite) {
-		nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff);
+		so_method(so, tesla, NV50TCL_POINT_SPRITE_CTRL, 1);
+		so_data  (so,
+			  nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff));
 
 		so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8);
 		so_datap (so, pcrd, 8);
diff --git a/src/gallium/drivers/nv50/nv50_query.c b/src/gallium/drivers/nv50/nv50_query.c
index 5d9e18218a..5a4ab3508b 100644
--- a/src/gallium/drivers/nv50/nv50_query.c
+++ b/src/gallium/drivers/nv50/nv50_query.c
@@ -111,7 +111,7 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
 
 	if (!q->ready) {
 		ret = nouveau_bo_map(q->bo, NOUVEAU_BO_RD |
-				     wait ? 0 : NOUVEAU_BO_NOWAIT);
+				     (wait ? 0 : NOUVEAU_BO_NOWAIT));
 		if (ret)
 			return false;
 		q->result = ((uint32_t *)q->bo->map)[1];
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
index 7e039ea82e..28e2b35dea 100644
--- a/src/gallium/drivers/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -189,6 +189,28 @@ nv50_screen_destroy(struct pipe_screen *pscreen)
 	FREE(screen);
 }
 
+static int
+nv50_pre_pipebuffer_map(struct pipe_screen *pscreen, struct pipe_buffer *pb,
+	unsigned usage)
+{
+	struct nv50_screen *screen = nv50_screen(pscreen);
+	struct nv50_context *ctx = screen->cur_ctx;
+
+	if (!(pb->usage & PIPE_BUFFER_USAGE_VERTEX))
+		return 0;
+
+	/* Our vtxbuf got mapped, it can no longer be considered part of current
+	 * state, remove it to avoid emitting reloc markers.
+	 */
+	if (ctx && ctx->state.vtxbuf && so_bo_is_reloc(ctx->state.vtxbuf,
+			nouveau_bo(pb))) {
+		so_ref(NULL, &ctx->state.vtxbuf);
+		ctx->dirty |= NV50_NEW_ARRAYS;
+	}
+
+	return 0;
+}
+
 struct pipe_screen *
 nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 {
@@ -216,6 +238,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	pscreen->get_param = nv50_screen_get_param;
 	pscreen->get_paramf = nv50_screen_get_paramf;
 	pscreen->is_format_supported = nv50_screen_is_format_supported;
+	screen->base.pre_pipebuffer_map_callback = nv50_pre_pipebuffer_map;
 
 	nv50_screen_init_miptree_functions(pscreen);
 	nv50_transfer_init_screen_functions(pscreen);
@@ -228,7 +251,6 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 		nv50_screen_destroy(pscreen);
 		return NULL;
 	}
-	BIND_RING(chan, screen->m2mf, 1);
 
 	/* 2D object */
 	ret = nouveau_grobj_alloc(chan, 0xbeef502d, NV50_2D, &screen->eng2d);
@@ -237,7 +259,6 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 		nv50_screen_destroy(pscreen);
 		return NULL;
 	}
-	BIND_RING(chan, screen->eng2d, 2);
 
 	/* 3D object */
 	switch (chipset & 0xf0) {
@@ -273,7 +294,6 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 		nv50_screen_destroy(pscreen);
 		return NULL;
 	}
-	BIND_RING(chan, screen->tesla, 3);
 
 	/* Sync notifier */
 	ret = nouveau_notifier_alloc(chan, 0xbeef0301, 1, &screen->sync);
@@ -284,7 +304,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	}
 
 	/* Static M2MF init */
-	so = so_new(32, 0);
+	so = so_new(1, 3, 0);
 	so_method(so, screen->m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_NOTIFY, 3);
 	so_data  (so, screen->sync->handle);
 	so_data  (so, chan->vram->handle);
@@ -293,7 +313,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	so_ref (NULL, &so);
 
 	/* Static 2D init */
-	so = so_new(64, 0);
+	so = so_new(4, 7, 0);
 	so_method(so, screen->eng2d, NV50_2D_DMA_NOTIFY, 4);
 	so_data  (so, screen->sync->handle);
 	so_data  (so, chan->vram->handle);
@@ -309,7 +329,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	so_ref(NULL, &so);
 
 	/* Static tesla init */
-	so = so_new(256, 20);
+	so = so_new(40, 84, 20);
 
 	so_method(so, screen->tesla, NV50TCL_COND_MODE, 1);
 	so_data  (so, NV50TCL_COND_MODE_ALWAYS);
diff --git a/src/gallium/drivers/nv50/nv50_screen.h b/src/gallium/drivers/nv50/nv50_screen.h
index 61e24a5b57..a038a4e3c2 100644
--- a/src/gallium/drivers/nv50/nv50_screen.h
+++ b/src/gallium/drivers/nv50/nv50_screen.h
@@ -2,6 +2,7 @@
 #define __NV50_SCREEN_H__
 
 #include "nouveau/nouveau_screen.h"
+#include "nv50_context.h"
 
 struct nv50_screen {
 	struct nouveau_screen base;
@@ -9,6 +10,7 @@ struct nv50_screen {
 	struct nouveau_winsys *nvws;
 
 	unsigned cur_pctx;
+	struct nv50_context *cur_ctx;
 
 	struct nouveau_grobj *tesla;
 	struct nouveau_grobj *eng2d;
diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c
index 30b2b0f91b..1f67df814b 100644
--- a/src/gallium/drivers/nv50/nv50_state.c
+++ b/src/gallium/drivers/nv50/nv50_state.c
@@ -35,7 +35,7 @@ static void *
 nv50_blend_state_create(struct pipe_context *pipe,
 			const struct pipe_blend_state *cso)
 {
-	struct nouveau_stateobj *so = so_new(64, 0);
+	struct nouveau_stateobj *so = so_new(5, 24, 0);
 	struct nouveau_grobj *tesla = nv50_context(pipe)->screen->tesla;
 	struct nv50_blend_stateobj *bso = CALLOC_STRUCT(nv50_blend_stateobj);
 	unsigned cmask = 0, i;
@@ -146,7 +146,6 @@ nv50_sampler_state_create(struct pipe_context *pipe,
 		  (wrap_mode(cso->wrap_r) << 6));
 
 	switch (cso->mag_img_filter) {
-	case PIPE_TEX_FILTER_ANISO:
 	case PIPE_TEX_FILTER_LINEAR:
 		tsc[1] |= NV50TSC_1_1_MAGF_LINEAR;
 		break;
@@ -157,7 +156,6 @@ nv50_sampler_state_create(struct pipe_context *pipe,
 	}
 
 	switch (cso->min_img_filter) {
-	case PIPE_TEX_FILTER_ANISO:
 	case PIPE_TEX_FILTER_LINEAR:
 		tsc[1] |= NV50TSC_1_1_MINF_LINEAR;
 		break;
@@ -280,7 +278,7 @@ static void *
 nv50_rasterizer_state_create(struct pipe_context *pipe,
 			     const struct pipe_rasterizer_state *cso)
 {
-	struct nouveau_stateobj *so = so_new(64, 0);
+	struct nouveau_stateobj *so = so_new(15, 21, 0);
 	struct nouveau_grobj *tesla = nv50_context(pipe)->screen->tesla;
 	struct nv50_rasterizer_stateobj *rso =
 		CALLOC_STRUCT(nv50_rasterizer_stateobj);
@@ -425,7 +423,7 @@ nv50_depth_stencil_alpha_state_create(struct pipe_context *pipe,
 {
 	struct nouveau_grobj *tesla = nv50_context(pipe)->screen->tesla;
 	struct nv50_zsa_stateobj *zsa = CALLOC_STRUCT(nv50_zsa_stateobj);
-	struct nouveau_stateobj *so = so_new(64, 0);
+	struct nouveau_stateobj *so = so_new(8, 22, 0);
 
 	so_method(so, tesla, NV50TCL_DEPTH_WRITE_ENABLE, 1);
 	so_data  (so, cso->depth.writemask ? 1 : 0);
diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c
index c8bdf9dc27..f83232f43c 100644
--- a/src/gallium/drivers/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nv50/nv50_state_validate.c
@@ -33,7 +33,7 @@ static void
 nv50_state_validate_fb(struct nv50_context *nv50)
 {
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
-	struct nouveau_stateobj *so = so_new(128, 18);
+	struct nouveau_stateobj *so = so_new(32, 79, 18);
 	struct pipe_framebuffer_state *fb = &nv50->framebuffer;
 	unsigned i, w, h, gw = 0;
 
@@ -185,6 +185,9 @@ nv50_state_emit(struct nv50_context *nv50)
 	struct nv50_screen *screen = nv50->screen;
 	struct nouveau_channel *chan = screen->base.channel;
 
+	/* I don't want to copy headers from the winsys. */
+	screen->cur_ctx = nv50;
+
 	if (nv50->pctx_id != screen->cur_pctx) {
 		if (nv50->state.fb)
 			nv50->state.dirty |= NV50_NEW_FRAMEBUFFER;
@@ -296,7 +299,7 @@ nv50_state_validate(struct nv50_context *nv50)
 		so_ref(nv50->rasterizer->so, &nv50->state.rast);
 
 	if (nv50->dirty & NV50_NEW_BLEND_COLOUR) {
-		so = so_new(5, 0);
+		so = so_new(1, 4, 0);
 		so_method(so, tesla, NV50TCL_BLEND_COLOR(0), 4);
 		so_data  (so, fui(nv50->blend_colour.color[0]));
 		so_data  (so, fui(nv50->blend_colour.color[1]));
@@ -307,7 +310,7 @@ nv50_state_validate(struct nv50_context *nv50)
 	}
 
 	if (nv50->dirty & NV50_NEW_STIPPLE) {
-		so = so_new(33, 0);
+		so = so_new(1, 32, 0);
 		so_method(so, tesla, NV50TCL_POLYGON_STIPPLE_PATTERN(0), 32);
 		for (i = 0; i < 32; i++)
 			so_data(so, util_bswap32(nv50->stipple.stipple[i]));
@@ -324,7 +327,7 @@ nv50_state_validate(struct nv50_context *nv50)
 			goto scissor_uptodate;
 		nv50->state.scissor_enabled = rast->scissor;
 
-		so = so_new(3, 0);
+		so = so_new(1, 2, 0);
 		so_method(so, tesla, NV50TCL_SCISSOR_HORIZ(0), 2);
 		if (nv50->state.scissor_enabled) {
 			so_data(so, (s->maxx << 16) | s->minx);
@@ -353,7 +356,7 @@ scissor_uptodate:
 			goto viewport_uptodate;
 		nv50->state.viewport_bypass = bypass;
 
-		so = so_new(14, 0);
+		so = so_new(5, 9, 0);
 		if (!bypass) {
 			so_method(so, tesla, NV50TCL_VIEWPORT_TRANSLATE_X(0), 3);
 			so_data  (so, fui(nv50->viewport.translate[0]));
@@ -397,7 +400,8 @@ viewport_uptodate:
 		for (i = 0; i < PIPE_SHADER_TYPES; ++i)
 			nr += nv50->sampler_nr[i];
 
-		so = so_new(nr * 8 + 24 * PIPE_SHADER_TYPES + 2, 4);
+		so = so_new(1+ 5 * PIPE_SHADER_TYPES, 1+ 19 * PIPE_SHADER_TYPES
+					+ nr * 8, PIPE_SHADER_TYPES * 2);
 
 		nv50_validate_samplers(nv50, so, PIPE_SHADER_VERTEX);
 		nv50_validate_samplers(nv50, so, PIPE_SHADER_FRAGMENT);
diff --git a/src/gallium/drivers/nv50/nv50_tex.c b/src/gallium/drivers/nv50/nv50_tex.c
index c4ca096d6a..bef548b728 100644
--- a/src/gallium/drivers/nv50/nv50_tex.c
+++ b/src/gallium/drivers/nv50/nv50_tex.c
@@ -199,16 +199,18 @@ nv50_tex_validate(struct nv50_context *nv50)
 {
 	struct nouveau_stateobj *so;
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
-	unsigned p, push, nrlc;
+	unsigned p, start, push, nrlc;
 
-	for (nrlc = 0, push = 0, p = 0; p < PIPE_SHADER_TYPES; ++p) {
+	for (nrlc = 0, start = 0, push = 0, p = 0; p < PIPE_SHADER_TYPES; ++p) {
+		start += MAX2(nv50->miptree_nr[p], nv50->state.miptree_nr[p]);
 		push += MAX2(nv50->miptree_nr[p], nv50->state.miptree_nr[p]);
 		nrlc += nv50->miptree_nr[p];
 	}
-	push = push * 11 + 23 * PIPE_SHADER_TYPES + 4;
+	start = start * 2 + 4 * PIPE_SHADER_TYPES + 2;
+	push = push * 9 + 19 * PIPE_SHADER_TYPES + 2;
 	nrlc = nrlc * 2 + 2 * PIPE_SHADER_TYPES;
 
-	so = so_new(push, nrlc);
+	so = so_new(start, push, nrlc);
 
 	if (nv50_validate_textures(nv50, so, PIPE_SHADER_VERTEX) == FALSE ||
 	    nv50_validate_textures(nv50, so, PIPE_SHADER_FRAGMENT) == FALSE) {
diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c
index 602adfc50d..f2e510fba6 100644
--- a/src/gallium/drivers/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nv50/nv50_vbo.c
@@ -152,7 +152,7 @@ nv50_vbo_vtxelt_to_hw(struct pipe_vertex_element *ve)
 	return (hw_type | hw_size);
 }
 
-boolean
+void
 nv50_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start,
 		 unsigned count)
 {
@@ -182,7 +182,9 @@ nv50_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start,
 	BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
 	OUT_RING  (chan, 0);
 
-	return ret;
+        /* XXX: not sure what to do if ret != TRUE: flush and retry?
+         */
+        assert(ret);
 }
 
 static INLINE boolean
@@ -275,7 +277,7 @@ nv50_draw_elements_inline_u32(struct nv50_context *nv50, uint32_t *map,
 	return TRUE;
 }
 
-boolean
+void
 nv50_draw_elements(struct pipe_context *pipe,
 		   struct pipe_buffer *indexBuffer, unsigned indexSize,
 		   unsigned mode, unsigned start, unsigned count)
@@ -317,8 +319,10 @@ nv50_draw_elements(struct pipe_context *pipe,
 	OUT_RING  (chan, 0);
 
 	pipe_buffer_unmap(pscreen, indexBuffer);
-
-	return ret;
+        
+        /* XXX: what to do if ret != TRUE?  Flush and retry?
+         */
+	assert(ret);
 }
 
 static INLINE boolean
@@ -350,7 +354,7 @@ nv50_vbo_static_attrib(struct nv50_context *nv50, unsigned attrib,
 
 	so = *pso;
 	if (!so)
-		*pso = so = so_new(nv50->vtxelt_nr * 5, 0);
+		*pso = so = so_new(nv50->vtxelt_nr, nv50->vtxelt_nr * 4, 0);
 
 	switch (ve->nr_components) {
 	case 4:
@@ -411,8 +415,8 @@ nv50_vbo_validate(struct nv50_context *nv50)
 	n_ve = MAX2(nv50->vtxelt_nr, nv50->state.vtxelt_nr);
 
 	vtxattr = NULL;
-	vtxbuf = so_new(n_ve * 7, nv50->vtxelt_nr * 4);
-	vtxfmt = so_new(n_ve + 1, 0);
+	vtxbuf = so_new(n_ve * 2, n_ve * 5, nv50->vtxelt_nr * 4);
+	vtxfmt = so_new(1, n_ve, 0);
 	so_method(vtxfmt, tesla, NV50TCL_VERTEX_ARRAY_ATTRIB(0), n_ve);
 
 	for (i = 0; i < nv50->vtxelt_nr; i++) {
diff --git a/src/gallium/drivers/r300/r300_blit.c b/src/gallium/drivers/r300/r300_blit.c
index ffe066d536..c14414fff6 100644
--- a/src/gallium/drivers/r300/r300_blit.c
+++ b/src/gallium/drivers/r300/r300_blit.c
@@ -27,9 +27,9 @@
 
 static void r300_blitter_save_states(struct r300_context* r300)
 {
-    util_blitter_save_blend(r300->blitter, r300->blend_state);
-    util_blitter_save_depth_stencil_alpha(r300->blitter, r300->dsa_state);
-    util_blitter_save_rasterizer(r300->blitter, r300->rs_state);
+    util_blitter_save_blend(r300->blitter, r300->blend_state.state);
+    util_blitter_save_depth_stencil_alpha(r300->blitter, r300->dsa_state.state);
+    util_blitter_save_rasterizer(r300->blitter, r300->rs_state.state);
     util_blitter_save_fragment_shader(r300->blitter, r300->fs);
     util_blitter_save_vertex_shader(r300->blitter, r300->vs);
 }
diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
index d5c2d63d39..af95bbe789 100644
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -30,6 +30,7 @@
 
 #include "r300_blit.h"
 #include "r300_context.h"
+#include "r300_emit.h"
 #include "r300_flush.h"
 #include "r300_query.h"
 #include "r300_render.h"
@@ -69,11 +70,13 @@ static void r300_destroy_context(struct pipe_context* context)
         FREE(query);
     }
 
-    FREE(r300->blend_color_state);
+    FREE(r300->blend_color_state.state);
+    FREE(r300->clip_state.state);
     FREE(r300->rs_block);
-    FREE(r300->scissor_state);
+    FREE(r300->scissor_state.state);
     FREE(r300->vertex_info);
-    FREE(r300->viewport_state);
+    FREE(r300->viewport_state.state);
+    FREE(r300->ztop_state.state);
     FREE(r300);
 }
 
@@ -107,6 +110,25 @@ static void r300_flush_cb(void *data)
     cs_context_copy->context.flush(&cs_context_copy->context, 0, NULL);
 }
 
+#define R300_INIT_ATOM(name) \
+    r300->name##_state.state = NULL; \
+    r300->name##_state.emit = r300_emit_##name##_state; \
+    r300->name##_state.dirty = FALSE; \
+    insert_at_tail(&r300->atom_list, &r300->name##_state);
+
+static void r300_setup_atoms(struct r300_context* r300)
+{
+    make_empty_list(&r300->atom_list);
+    R300_INIT_ATOM(ztop);
+    R300_INIT_ATOM(blend);
+    R300_INIT_ATOM(blend_color);
+    R300_INIT_ATOM(clip);
+    R300_INIT_ATOM(dsa);
+    R300_INIT_ATOM(rs);
+    R300_INIT_ATOM(scissor);
+    R300_INIT_ATOM(viewport);
+}
+
 struct pipe_context* r300_create_context(struct pipe_screen* screen,
                                          struct radeon_winsys* radeon_winsys)
 {
@@ -155,11 +177,15 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
     r300->shader_hash_table = util_hash_table_create(r300_shader_key_hash,
         r300_shader_key_compare);
 
-    r300->blend_color_state = CALLOC_STRUCT(r300_blend_color_state);
+    r300_setup_atoms(r300);
+
+    r300->blend_color_state.state = CALLOC_STRUCT(r300_blend_color_state);
+    r300->clip_state.state = CALLOC_STRUCT(pipe_clip_state);
     r300->rs_block = CALLOC_STRUCT(r300_rs_block);
-    r300->scissor_state = CALLOC_STRUCT(r300_scissor_state);
+    r300->scissor_state.state = CALLOC_STRUCT(r300_scissor_state);
     r300->vertex_info = CALLOC_STRUCT(r300_vertex_info);
-    r300->viewport_state = CALLOC_STRUCT(r300_viewport_state);
+    r300->viewport_state.state = CALLOC_STRUCT(r300_viewport_state);
+    r300->ztop_state.state = CALLOC_STRUCT(r300_ztop_state);
 
     /* Open up the OQ BO. */
     r300->oqbo = screen->buffer_create(screen, 4096,
diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h
index 232530b7dc..5937f0e2cc 100644
--- a/src/gallium/drivers/r300/r300_context.h
+++ b/src/gallium/drivers/r300/r300_context.h
@@ -30,9 +30,18 @@
 #include "pipe/p_context.h"
 #include "pipe/p_inlines.h"
 
+struct r300_context;
+
 struct r300_fragment_shader;
 struct r300_vertex_shader;
 
+struct r300_atom {
+    struct r300_atom *prev, *next;
+    void* state;
+    void (*emit)(struct r300_context*, void*);
+    boolean dirty;
+};
+
 struct r300_blend_state {
     uint32_t blend_control;       /* R300_RB3D_CBLEND: 0x4e04 */
     uint32_t alpha_blend_control; /* R300_RB3D_ABLEND: 0x4e08 */
@@ -62,11 +71,6 @@ struct r300_rs_state {
     /* Draw-specific rasterizer state */
     struct pipe_rasterizer_state rs;
 
-    /* Whether or not to enable the VTE. This is referenced at the very
-     * last moment during emission of VTE state, to decide whether or not
-     * the VTE should be used for transformation. */
-    boolean enable_vte;
-
     uint32_t vap_control_status;    /* R300_VAP_CNTL_STATUS: 0x2140 */
     uint32_t point_size;            /* R300_GA_POINT_SIZE: 0x421c */
     uint32_t point_minmax;          /* R300_GA_POINT_MINMAX: 0x4230 */
@@ -105,9 +109,6 @@ struct r300_sampler_state {
 struct r300_scissor_regs {
     uint32_t top_left;     /* R300_SC_SCISSORS_TL: 0x43e0 */
     uint32_t bottom_right; /* R300_SC_SCISSORS_BR: 0x43e4 */
-
-    /* Whether everything is culled by scissoring. */
-    boolean empty_area;
 };
 
 struct r300_scissor_state {
@@ -135,24 +136,17 @@ struct r300_ztop_state {
     uint32_t z_buffer_top;      /* R300_ZB_ZTOP: 0x4f14 */
 };
 
-#define R300_NEW_BLEND           0x00000001
-#define R300_NEW_BLEND_COLOR     0x00000002
-#define R300_NEW_CLIP            0x00000004
-#define R300_NEW_DSA             0x00000008
 #define R300_NEW_FRAMEBUFFERS    0x00000010
 #define R300_NEW_FRAGMENT_SHADER 0x00000020
 #define R300_NEW_FRAGMENT_SHADER_CONSTANTS    0x00000040
-#define R300_NEW_RASTERIZER      0x00000080
 #define R300_NEW_RS_BLOCK        0x00000100
 #define R300_NEW_SAMPLER         0x00000200
 #define R300_ANY_NEW_SAMPLERS    0x0001fe00
-#define R300_NEW_SCISSOR         0x00020000
 #define R300_NEW_TEXTURE         0x00040000
 #define R300_ANY_NEW_TEXTURES    0x03fc0000
 #define R300_NEW_VERTEX_FORMAT   0x04000000
 #define R300_NEW_VERTEX_SHADER   0x08000000
 #define R300_NEW_VERTEX_SHADER_CONSTANTS    0x10000000
-#define R300_NEW_VIEWPORT        0x20000000
 #define R300_NEW_QUERY           0x40000000
 #define R300_NEW_KITCHEN_SINK    0x7fffffff
 
@@ -273,38 +267,40 @@ struct r300_context {
     struct r300_vertex_info* vertex_info;
 
     /* Various CSO state objects. */
+    /* Beginning of atom list. */
+    struct r300_atom atom_list;
     /* Blend state. */
-    struct r300_blend_state* blend_state;
+    struct r300_atom blend_state;
     /* Blend color state. */
-    struct r300_blend_color_state* blend_color_state;
+    struct r300_atom blend_color_state;
     /* User clip planes. */
-    struct pipe_clip_state clip_state;
+    struct r300_atom clip_state;
     /* Shader constants. */
     struct r300_constant_buffer shader_constants[PIPE_SHADER_TYPES];
     /* Depth, stencil, and alpha state. */
-    struct r300_dsa_state* dsa_state;
+    struct r300_atom dsa_state;
     /* Fragment shader. */
     struct r300_fragment_shader* fs;
     /* Framebuffer state. We currently don't need our own version of this. */
     struct pipe_framebuffer_state framebuffer_state;
     /* Rasterizer state. */
-    struct r300_rs_state* rs_state;
+    struct r300_atom rs_state;
     /* RS block state. */
     struct r300_rs_block* rs_block;
     /* Sampler states. */
     struct r300_sampler_state* sampler_states[8];
     int sampler_count;
     /* Scissor state. */
-    struct r300_scissor_state* scissor_state;
+    struct r300_atom scissor_state;
     /* Texture states. */
     struct r300_texture* textures[8];
     int texture_count;
     /* Vertex shader. */
     struct r300_vertex_shader* vs;
     /* Viewport state. */
-    struct r300_viewport_state* viewport_state;
+    struct r300_atom viewport_state;
     /* ZTOP state. */
-    struct r300_ztop_state ztop_state;
+    struct r300_atom ztop_state;
 
     /* Vertex buffers for Gallium. */
     struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
@@ -317,6 +313,8 @@ struct r300_context {
     uint32_t dirty_state;
     /* Flag indicating whether or not the HW is dirty. */
     uint32_t dirty_hw;
+    /* Whether the TCL engine should be in bypass mode. */
+    boolean tcl_bypass;
 
     /** Combination of DBG_xxx flags */
     unsigned debug;
diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
index 199ce3a945..0e5533c790 100644
--- a/src/gallium/drivers/r300/r300_emit.c
+++ b/src/gallium/drivers/r300/r300_emit.c
@@ -25,6 +25,7 @@
 
 #include "util/u_format.h"
 #include "util/u_math.h"
+#include "util/u_simple_list.h"
 
 #include "r300_context.h"
 #include "r300_cs.h"
@@ -36,11 +37,12 @@
 #include "r300_texture.h"
 #include "r300_vs.h"
 
-void r300_emit_blend_state(struct r300_context* r300,
-                           struct r300_blend_state* blend)
+void r300_emit_blend_state(struct r300_context* r300, void* state)
 {
+    struct r300_blend_state* blend = (struct r300_blend_state*)state;
     CS_LOCALS(r300);
     BEGIN_CS(8);
+    OUT_CS_REG(R300_RB3D_ROPCNTL, blend->rop);
     OUT_CS_REG_SEQ(R300_RB3D_CBLEND, 3);
     if (r300->framebuffer_state.nr_cbufs) {
         OUT_CS(blend->blend_control);
@@ -52,14 +54,13 @@ void r300_emit_blend_state(struct r300_context* r300,
         OUT_CS(0);
         /* XXX also disable fastfill here once it's supported */
     }
-    OUT_CS_REG(R300_RB3D_ROPCNTL, blend->rop);
     OUT_CS_REG(R300_RB3D_DITHER_CTL, blend->dither);
     END_CS;
 }
 
-void r300_emit_blend_color_state(struct r300_context* r300,
-                                 struct r300_blend_color_state* bc)
+void r300_emit_blend_color_state(struct r300_context* r300, void* state)
 {
+    struct r300_blend_color_state* bc = (struct r300_blend_color_state*)state;
     struct r300_screen* r300screen = r300_screen(r300->context.screen);
     CS_LOCALS(r300);
 
@@ -76,9 +77,9 @@ void r300_emit_blend_color_state(struct r300_context* r300,
     }
 }
 
-void r300_emit_clip_state(struct r300_context* r300,
-                          struct pipe_clip_state* clip)
+void r300_emit_clip_state(struct r300_context* r300, void* state)
 {
+    struct pipe_clip_state* clip = (struct pipe_clip_state*)state;
     int i;
     struct r300_screen* r300screen = r300_screen(r300->context.screen);
     CS_LOCALS(r300);
@@ -106,13 +107,13 @@ void r300_emit_clip_state(struct r300_context* r300,
 
 }
 
-void r300_emit_dsa_state(struct r300_context* r300,
-                           struct r300_dsa_state* dsa)
+void r300_emit_dsa_state(struct r300_context* r300, void* state)
 {
+    struct r300_dsa_state* dsa = (struct r300_dsa_state*)state;
     struct r300_screen* r300screen = r300_screen(r300->context.screen);
     CS_LOCALS(r300);
 
-    BEGIN_CS(r300screen->caps->is_r500 ? 10 : 8);
+    BEGIN_CS(r300screen->caps->is_r500 ? 8 : 6);
     OUT_CS_REG(R300_FG_ALPHA_FUNC, dsa->alpha_function);
 
     /* not needed since we use the 8bit alpha ref */
@@ -121,10 +122,16 @@ void r300_emit_dsa_state(struct r300_context* r300,
     }*/
 
     OUT_CS_REG_SEQ(R300_ZB_CNTL, 3);
-    OUT_CS(dsa->z_buffer_control);
-    OUT_CS(dsa->z_stencil_control);
+
+    if (r300->framebuffer_state.zsbuf) {
+        OUT_CS(dsa->z_buffer_control);
+        OUT_CS(dsa->z_stencil_control);
+    } else {
+        OUT_CS(0);
+        OUT_CS(0);
+    }
+
     OUT_CS(dsa->stencil_ref_mask);
-    OUT_CS_REG(R300_ZB_ZTOP, r300->ztop_state.z_buffer_top);
 
     /* XXX it seems r3xx doesn't support STENCILREFMASK_BF */
     if (r300screen->caps->is_r500) {
@@ -138,6 +145,8 @@ static const float * get_shader_constant(
     struct rc_constant * constant,
     struct r300_constant_buffer * externals)
 {
+    struct r300_viewport_state* viewport =
+        (struct r300_viewport_state*)r300->viewport_state.state;
     static float vec[4] = { 0.0, 0.0, 0.0, 1.0 };
     struct pipe_texture *tex;
 
@@ -160,11 +169,31 @@ static const float * get_shader_constant(
 
                 /* Texture compare-fail value. */
                 /* XXX Since Gallium doesn't support GL_ARB_shadow_ambient,
-                 * this is always (0,0,0,0). */
+                 * this is always (0,0,0,0), right? */
                 case RC_STATE_SHADOW_AMBIENT:
                     vec[3] = 0;
                     break;
 
+                case RC_STATE_R300_VIEWPORT_SCALE:
+                    if (r300->tcl_bypass) {
+                        vec[0] = 1;
+                        vec[1] = 1;
+                        vec[2] = 1;
+                    } else {
+                        vec[0] = viewport->xscale;
+                        vec[1] = viewport->yscale;
+                        vec[2] = viewport->zscale;
+                    }
+                    break;
+
+                case RC_STATE_R300_VIEWPORT_OFFSET:
+                    if (!r300->tcl_bypass) {
+                        vec[0] = viewport->xoffset;
+                        vec[1] = viewport->yoffset;
+                        vec[2] = viewport->zoffset;
+                    }
+                    break;
+
                 default:
                     debug_printf("r300: Implementation error: "
                         "Unknown RC_CONSTANT type %d\n", constant->u.State[0]);
@@ -283,6 +312,22 @@ void r300_emit_fs_constant_buffer(struct r300_context* r300,
     END_CS;
 }
 
+static void r300_emit_fragment_depth_config(struct r300_context* r300,
+                                            struct r300_fragment_shader* fs)
+{
+    CS_LOCALS(r300);
+
+    BEGIN_CS(4);
+    if (r300_fragment_shader_writes_depth(fs)) {
+        OUT_CS_REG(R300_FG_DEPTH_SRC, R300_FG_DEPTH_SRC_SHADER);
+        OUT_CS_REG(R300_US_W_FMT, R300_W_FMT_W24 | R300_W_SRC_US);
+    } else {
+        OUT_CS_REG(R300_FG_DEPTH_SRC, R300_FG_DEPTH_SRC_SCAN);
+        OUT_CS_REG(R300_US_W_FMT, R300_W_FMT_W0 | R300_W_SRC_US);
+    }
+    END_CS;
+}
+
 void r500_emit_fragment_program_code(struct r300_context* r300,
                                      struct rX00_fragment_program_code* generic_code)
 {
@@ -531,8 +576,9 @@ void r300_emit_query_end(struct r300_context* r300)
         r300_emit_query_finish(r300, query);
 }
 
-void r300_emit_rs_state(struct r300_context* r300, struct r300_rs_state* rs)
+void r300_emit_rs_state(struct r300_context* r300, void* state)
 {
+    struct r300_rs_state* rs = (struct r300_rs_state*)state;
     CS_LOCALS(r300);
 
     BEGIN_CS(22);
@@ -607,10 +653,11 @@ static void r300_emit_scissor_regs(struct r300_context* r300,
     END_CS;
 }
 
-void r300_emit_scissor_state(struct r300_context* r300,
-                             struct r300_scissor_state* scissor)
+void r300_emit_scissor_state(struct r300_context* r300, void* state)
 {
-    if (r300->rs_state->rs.scissor) {
+    struct r300_scissor_state* scissor = (struct r300_scissor_state*)state;
+    /* XXX argfl! */
+    if (((struct r300_rs_state*)r300->rs_state.state)->rs.scissor) {
         r300_emit_scissor_regs(r300, &scissor->scissor);
     } else {
         r300_emit_scissor_regs(r300, &scissor->framebuffer);
@@ -867,26 +914,27 @@ void r300_emit_vs_constant_buffer(struct r300_context* r300,
     END_CS;
 }
 
-void r300_emit_viewport_state(struct r300_context* r300,
-                              struct r300_viewport_state* viewport)
+void r300_emit_viewport_state(struct r300_context* r300, void* state)
 {
+    struct r300_viewport_state* viewport = (struct r300_viewport_state*)state;
     CS_LOCALS(r300);
 
-    BEGIN_CS(9);
-    OUT_CS_REG_SEQ(R300_SE_VPORT_XSCALE, 6);
-    OUT_CS_32F(viewport->xscale);
-    OUT_CS_32F(viewport->xoffset);
-    OUT_CS_32F(viewport->yscale);
-    OUT_CS_32F(viewport->yoffset);
-    OUT_CS_32F(viewport->zscale);
-    OUT_CS_32F(viewport->zoffset);
-
-    if (r300->rs_state->enable_vte) {
-        OUT_CS_REG(R300_VAP_VTE_CNTL, viewport->vte_control);
-    } else {
+    if (r300->tcl_bypass) {
+        BEGIN_CS(2);
         OUT_CS_REG(R300_VAP_VTE_CNTL, 0);
+        END_CS;
+    } else {
+        BEGIN_CS(9);
+        OUT_CS_REG_SEQ(R300_SE_VPORT_XSCALE, 6);
+        OUT_CS_32F(viewport->xscale);
+        OUT_CS_32F(viewport->xoffset);
+        OUT_CS_32F(viewport->yscale);
+        OUT_CS_32F(viewport->yoffset);
+        OUT_CS_32F(viewport->zscale);
+        OUT_CS_32F(viewport->zoffset);
+        OUT_CS_REG(R300_VAP_VTE_CNTL, viewport->vte_control);
+        END_CS;
     }
-    END_CS;
 }
 
 void r300_emit_texture_count(struct r300_context* r300)
@@ -910,6 +958,16 @@ void r300_emit_texture_count(struct r300_context* r300)
 
 }
 
+void r300_emit_ztop_state(struct r300_context* r300, void* state)
+{
+    struct r300_ztop_state* ztop = (struct r300_ztop_state*)state;
+    CS_LOCALS(r300);
+
+    BEGIN_CS(2);
+    OUT_CS_REG(R300_ZB_ZTOP, ztop->z_buffer_top);
+    END_CS;
+}
+
 void r300_flush_textures(struct r300_context* r300)
 {
     CS_LOCALS(r300);
@@ -933,13 +991,10 @@ void r300_emit_dirty_state(struct r300_context* r300)
 {
     struct r300_screen* r300screen = r300_screen(r300->context.screen);
     struct r300_texture* tex;
+    struct r300_atom* atom;
     int i, dirty_tex = 0;
     boolean invalid = FALSE;
 
-    if (!(r300->dirty_state)) {
-        return;
-    }
-
     /* Check size of CS. */
     /* Make sure we have at least 8*1024 spare dwords. */
     /* XXX It would be nice to know the number of dwords we really need to
@@ -997,7 +1052,7 @@ validate:
             goto validate;
         }
     } else {
-        // debug_printf("No VBO while emitting dirty state!\n");
+        /* debug_printf("No VBO while emitting dirty state!\n"); */
     }
     if (!r300->winsys->validate(r300->winsys)) {
         r300->context.flush(&r300->context, 0, NULL);
@@ -1015,27 +1070,15 @@ validate:
         r300->dirty_state &= ~R300_NEW_QUERY;
     }
 
-    if (r300->dirty_state & R300_NEW_BLEND) {
-        r300_emit_blend_state(r300, r300->blend_state);
-        r300->dirty_state &= ~R300_NEW_BLEND;
-    }
-
-    if (r300->dirty_state & R300_NEW_BLEND_COLOR) {
-        r300_emit_blend_color_state(r300, r300->blend_color_state);
-        r300->dirty_state &= ~R300_NEW_BLEND_COLOR;
-    }
-
-    if (r300->dirty_state & R300_NEW_CLIP) {
-        r300_emit_clip_state(r300, &r300->clip_state);
-        r300->dirty_state &= ~R300_NEW_CLIP;
-    }
-
-    if (r300->dirty_state & R300_NEW_DSA) {
-        r300_emit_dsa_state(r300, r300->dsa_state);
-        r300->dirty_state &= ~R300_NEW_DSA;
+    foreach(atom, &r300->atom_list) {
+        if (atom->dirty) {
+            atom->emit(r300, atom->state);
+            atom->dirty = FALSE;
+        }
     }
 
     if (r300->dirty_state & R300_NEW_FRAGMENT_SHADER) {
+        r300_emit_fragment_depth_config(r300, r300->fs);
         if (r300screen->caps->is_r500) {
             r500_emit_fragment_program_code(r300, &r300->fs->shader->code);
         } else {
@@ -1060,21 +1103,11 @@ validate:
         r300->dirty_state &= ~R300_NEW_FRAMEBUFFERS;
     }
 
-    if (r300->dirty_state & R300_NEW_RASTERIZER) {
-        r300_emit_rs_state(r300, r300->rs_state);
-        r300->dirty_state &= ~R300_NEW_RASTERIZER;
-    }
-
     if (r300->dirty_state & R300_NEW_RS_BLOCK) {
         r300_emit_rs_block_state(r300, r300->rs_block);
         r300->dirty_state &= ~R300_NEW_RS_BLOCK;
     }
 
-    if (r300->dirty_state & R300_NEW_SCISSOR) {
-        r300_emit_scissor_state(r300, r300->scissor_state);
-        r300->dirty_state &= ~R300_NEW_SCISSOR;
-    }
-
     /* Samplers and textures are tracked separately but emitted together. */
     if (r300->dirty_state &
             (R300_ANY_NEW_SAMPLERS | R300_ANY_NEW_TEXTURES)) {
@@ -1096,11 +1129,6 @@ validate:
         r300->dirty_state &= ~(R300_ANY_NEW_SAMPLERS | R300_ANY_NEW_TEXTURES);
     }
 
-    if (r300->dirty_state & R300_NEW_VIEWPORT) {
-        r300_emit_viewport_state(r300, r300->viewport_state);
-        r300->dirty_state &= ~R300_NEW_VIEWPORT;
-    }
-
     if (dirty_tex) {
         r300_flush_textures(r300);
     }
@@ -1129,7 +1157,7 @@ validate:
     */
 
     /* Finally, emit the VBO. */
-    //r300_emit_vertex_buffer(r300);
+    /* r300_emit_vertex_buffer(r300); */
 
     r300->dirty_hw++;
 }
diff --git a/src/gallium/drivers/r300/r300_emit.h b/src/gallium/drivers/r300/r300_emit.h
index 3797d3d332..05a6bfeae8 100644
--- a/src/gallium/drivers/r300/r300_emit.h
+++ b/src/gallium/drivers/r300/r300_emit.h
@@ -31,17 +31,13 @@ struct r300_vertex_program_code;
 
 void r300_emit_aos(struct r300_context* r300, unsigned offset);
 
-void r300_emit_blend_state(struct r300_context* r300,
-                           struct r300_blend_state* blend);
+void r300_emit_blend_state(struct r300_context* r300, void* state);
 
-void r300_emit_blend_color_state(struct r300_context* r300,
-                                 struct r300_blend_color_state* bc);
+void r300_emit_blend_color_state(struct r300_context* r300, void* state);
 
-void r300_emit_clip_state(struct r300_context* r300,
-                          struct pipe_clip_state* clip);
+void r300_emit_clip_state(struct r300_context* r300, void* state);
 
-void r300_emit_dsa_state(struct r300_context* r300,
-                         struct r300_dsa_state* dsa);
+void r300_emit_dsa_state(struct r300_context* r300, void* state);
 
 void r300_emit_fragment_program_code(struct r300_context* r300,
                                      struct rX00_fragment_program_code* generic_code);
@@ -63,13 +59,12 @@ void r300_emit_query_begin(struct r300_context* r300,
 
 void r300_emit_query_end(struct r300_context* r300);
 
-void r300_emit_rs_state(struct r300_context* r300, struct r300_rs_state* rs);
+void r300_emit_rs_state(struct r300_context* r300, void* state);
 
 void r300_emit_rs_block_state(struct r300_context* r300,
                               struct r300_rs_block* rs);
 
-void r300_emit_scissor_state(struct r300_context* r300,
-                             struct r300_scissor_state* scissor);
+void r300_emit_scissor_state(struct r300_context* r300, void* state);
 
 void r300_emit_texture(struct r300_context* r300,
                        struct r300_sampler_state* sampler,
@@ -89,11 +84,12 @@ void r300_emit_vs_constant_buffer(struct r300_context* r300,
 void r300_emit_vertex_shader(struct r300_context* r300,
                              struct r300_vertex_shader* vs);
 
-void r300_emit_viewport_state(struct r300_context* r300,
-                              struct r300_viewport_state* viewport);
+void r300_emit_viewport_state(struct r300_context* r300, void* state);
 
 void r300_emit_texture_count(struct r300_context* r300);
 
+void r300_emit_ztop_state(struct r300_context* r300, void* state);
+
 void r300_flush_textures(struct r300_context* r300);
 
 /* Emit all dirty state. */
diff --git a/src/gallium/drivers/r300/r300_fs.c b/src/gallium/drivers/r300/r300_fs.c
index 4e1b61ca40..60ea9c171d 100644
--- a/src/gallium/drivers/r300/r300_fs.c
+++ b/src/gallium/drivers/r300/r300_fs.c
@@ -63,6 +63,11 @@ void r300_shader_read_fs_inputs(struct tgsi_shader_info* info,
                 fs_inputs->fog = i;
                 break;
 
+            case TGSI_SEMANTIC_POSITION:
+                assert(index == 0);
+                fs_inputs->wpos = i;
+                break;
+
             default:
                 assert(0);
         }
@@ -114,6 +119,9 @@ static void allocate_hardware_inputs(
     if (inputs->fog != ATTR_UNUSED) {
         allocate(mydata, inputs->fog, reg++);
     }
+    if (inputs->wpos != ATTR_UNUSED) {
+        allocate(mydata, inputs->wpos, reg++);
+    }
 }
 
 static void get_compare_state(
@@ -144,6 +152,7 @@ static void r300_translate_fragment_shader(
     struct r300_fragment_shader* fs = r300->fs;
     struct r300_fragment_program_compiler compiler;
     struct tgsi_to_rc ttr;
+    int wpos = fs->inputs.wpos;
 
     /* Setup the compiler. */
     memset(&compiler, 0, sizeof(compiler));
@@ -171,6 +180,18 @@ static void r300_translate_fragment_shader(
 
     fs->shadow_samplers = compiler.Base.Program.ShadowSamplers;
 
+    /**
+     * Transform the program to support WPOS.
+     *
+     * Introduce a small fragment at the start of the program that will be
+     * the only code that directly reads the WPOS input.
+     * All other code pieces that reference that input will be rewritten
+     * to read from a newly allocated temporary. */
+    if (wpos != ATTR_UNUSED) {
+        /* Moving the input to some other reg is not really necessary. */
+        rc_transform_fragment_wpos(&compiler.Base, wpos, wpos, TRUE);
+    }
+
     /* Invoke the compiler */
     r3xx_compile_fragment_program(&compiler);
     if (compiler.Base.Error) {
diff --git a/src/gallium/drivers/r300/r300_reg.h b/src/gallium/drivers/r300/r300_reg.h
index d8d08fbe26..034bfc15cf 100644
--- a/src/gallium/drivers/r300/r300_reg.h
+++ b/src/gallium/drivers/r300/r300_reg.h
@@ -2186,6 +2186,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_DISCARD_SRC_PIXELS_SRC_ALPHA_1     (4 << 3)
 #       define R300_DISCARD_SRC_PIXELS_SRC_COLOR_1     (5 << 3)
 #       define R300_DISCARD_SRC_PIXELS_SRC_ALPHA_COLOR_1     (6 << 3)
+#       define R500_SRC_ALPHA_0_NO_READ                (1 << 30)
+#       define R500_SRC_ALPHA_1_NO_READ                (1 << 31)
 
 /* the following are shared between CBLEND and ABLEND */
 #       define R300_FCN_MASK                         (3  << 12)
@@ -2638,7 +2640,7 @@ enum {
 	VE_COND_MUX_GTE			= 25,
 	VE_SET_GREATER_THAN		= 26,
 	VE_SET_EQUAL			= 27,
-	VE_SET_NOT_EQUAL		= 28,
+	VE_SET_NOT_EQUAL		= 28
 };
 
 enum {
@@ -2672,20 +2674,20 @@ enum {
 	ME_PRED_SET_CLR			= 25,
 	ME_PRED_SET_INV			= 26,
 	ME_PRED_SET_POP			= 27,
-	ME_PRED_SET_RESTORE		= 28,
+	ME_PRED_SET_RESTORE		= 28
 };
 
 enum {
 	/* R3XX */
 	PVS_MACRO_OP_2CLK_MADD		= 0,
-	PVS_MACRO_OP_2CLK_M2X_ADD	= 1,
+	PVS_MACRO_OP_2CLK_M2X_ADD	= 1
 };
 
 enum {
 	PVS_SRC_REG_TEMPORARY		= 0,	/* Intermediate Storage */
 	PVS_SRC_REG_INPUT		= 1,	/* Input Vertex Storage */
 	PVS_SRC_REG_CONSTANT		= 2,	/* Constant State Storage */
-	PVS_SRC_REG_ALT_TEMPORARY	= 3,	/* Alternate Intermediate Storage */
+	PVS_SRC_REG_ALT_TEMPORARY	= 3	/* Alternate Intermediate Storage */
 };
 
 enum {
@@ -2694,7 +2696,7 @@ enum {
 	PVS_DST_REG_OUT			= 2,	/* Output Memory. Used for all outputs */
 	PVS_DST_REG_OUT_REPL_X		= 3,	/* Output Memory & Replicate X to all channels */
 	PVS_DST_REG_ALT_TEMPORARY	= 4,	/* Alternate Intermediate Storage */
-	PVS_DST_REG_INPUT		= 5,	/* Output Memory & Replicate X to all channels */
+	PVS_DST_REG_INPUT		= 5	/* Output Memory & Replicate X to all channels */
 };
 
 enum {
@@ -2703,7 +2705,7 @@ enum {
 	PVS_SRC_SELECT_Z		= 2,	/* Select Z Component */
 	PVS_SRC_SELECT_W		= 3,	/* Select W Component */
 	PVS_SRC_SELECT_FORCE_0		= 4,	/* Force Component to 0.0 */
-	PVS_SRC_SELECT_FORCE_1		= 5,	/* Force Component to 1.0 */
+	PVS_SRC_SELECT_FORCE_1		= 5	/* Force Component to 1.0 */
 };
 
 /* PVS Opcode & Destination Operand Description */
@@ -2742,7 +2744,7 @@ enum {
 	PVS_DST_ADDR_SEL_MASK		= 0x3,
 	PVS_DST_ADDR_SEL_SHIFT		= 29,
 	PVS_DST_ADDR_MODE_0_MASK	= 0x1,
-	PVS_DST_ADDR_MODE_0_SHIFT	= 31,
+	PVS_DST_ADDR_MODE_0_SHIFT	= 31
 };
 
 /* PVS Source Operand Description */
@@ -2777,7 +2779,7 @@ enum {
 	PVS_SRC_ADDR_SEL_MASK		= 0x3,
 	PVS_SRC_ADDR_SEL_SHIFT		= 29,
 	PVS_SRC_ADDR_MODE_1_MASK	= 0x0,
-	PVS_SRC_ADDR_MODE_1_SHIFT	= 32,
+	PVS_SRC_ADDR_MODE_1_SHIFT	= 32
 };
 
 /*\}*/
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index a89cb633e0..ee43421cdb 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -69,16 +69,11 @@ uint32_t r300_translate_primitive(unsigned prim)
     }
 }
 
-static boolean r300_nothing_to_draw(struct r300_context *r300)
-{
-    return r300->rs_state->rs.scissor &&
-           r300->scissor_state->scissor.empty_area;
-}
-
 static uint32_t r300_provoking_vertex_fixes(struct r300_context *r300,
                                             unsigned mode)
 {
-    uint32_t color_control = r300->rs_state->color_control;
+    struct r300_rs_state* rs = (struct r300_rs_state*)r300->rs_state.state;
+    uint32_t color_control = rs->color_control;
 
     /* By default (see r300_state.c:r300_create_rs_state) color_control is
      * initialized to provoking the first vertex.
@@ -98,7 +93,7 @@ static uint32_t r300_provoking_vertex_fixes(struct r300_context *r300,
      * ~ C.
      */
 
-    if (r300->rs_state->rs.flatshade_first) {
+    if (rs->rs.flatshade_first) {
         switch (mode) {
             case PIPE_PRIM_TRIANGLE_FAN:
                 color_control |= R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_SECOND;
@@ -213,7 +208,7 @@ validate:
 }
 
 /* This is the fast-path drawing & emission for HW TCL. */
-boolean r300_draw_range_elements(struct pipe_context* pipe,
+void r300_draw_range_elements(struct pipe_context* pipe,
                                  struct pipe_buffer* indexBuffer,
                                  unsigned indexSize,
                                  unsigned minIndex,
@@ -225,30 +220,29 @@ boolean r300_draw_range_elements(struct pipe_context* pipe,
     struct r300_context* r300 = r300_context(pipe);
 
     if (!u_trim_pipe_prim(mode, &count)) {
-        return FALSE;
+        return;
     }
 
     if (count > 65535) {
-        return FALSE;
-    }
-
-    if (r300_nothing_to_draw(r300)) {
-        return TRUE;
+       /* XXX: use aux/indices functions to split this into smaller
+        * primitives.
+        */
+        return;
     }
 
     r300_update_derived_state(r300);
 
     if (!r300_setup_vertex_buffers(r300)) {
-        return FALSE;
+        return;
     }
 
     if (!r300->winsys->add_buffer(r300->winsys, indexBuffer,
                                   RADEON_GEM_DOMAIN_GTT, 0)) {
-        return FALSE;
+        return;
     }
 
     if (!r300->winsys->validate(r300->winsys)) {
-        return FALSE;
+        return;
     }
 
     r300_emit_dirty_state(r300);
@@ -257,41 +251,38 @@ boolean r300_draw_range_elements(struct pipe_context* pipe,
 
     r300_emit_draw_elements(r300, indexBuffer, indexSize, minIndex, maxIndex,
                             mode, start, count);
-
-    return TRUE;
 }
 
 /* Simple helpers for context setup. Should probably be moved to util. */
-boolean r300_draw_elements(struct pipe_context* pipe,
-                           struct pipe_buffer* indexBuffer,
-                           unsigned indexSize, unsigned mode,
-                           unsigned start, unsigned count)
+void r300_draw_elements(struct pipe_context* pipe,
+                        struct pipe_buffer* indexBuffer,
+                        unsigned indexSize, unsigned mode,
+                        unsigned start, unsigned count)
 {
-    return pipe->draw_range_elements(pipe, indexBuffer, indexSize, 0, ~0,
-                                     mode, start, count);
+   pipe->draw_range_elements(pipe, indexBuffer, indexSize, 0, ~0,
+                             mode, start, count);
 }
 
-boolean r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
+void r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
                          unsigned start, unsigned count)
 {
     struct r300_context* r300 = r300_context(pipe);
 
     if (!u_trim_pipe_prim(mode, &count)) {
-        return FALSE;
+        return;
     }
 
     if (count > 65535) {
-        return FALSE;
-    }
-
-    if (r300_nothing_to_draw(r300)) {
-        return TRUE;
+        /* XXX: driver needs to handle this -- use the functions in
+         * aux/indices to split this into several smaller primitives.
+         */
+        return;
     }
 
     r300_update_derived_state(r300);
 
     if (!r300_setup_vertex_buffers(r300)) {
-        return FALSE;
+        return;
     }
 
     r300_emit_dirty_state(r300);
@@ -299,8 +290,6 @@ boolean r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
     r300_emit_aos(r300, start);
 
     r300_emit_draw_arrays(r300, mode, count);
-
-    return TRUE;
 }
 
 /****************************************************************************
@@ -309,7 +298,7 @@ boolean r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
  ***************************************************************************/
 
 /* SW TCL arrays, using Draw. */
-boolean r300_swtcl_draw_arrays(struct pipe_context* pipe,
+void r300_swtcl_draw_arrays(struct pipe_context* pipe,
                                unsigned mode,
                                unsigned start,
                                unsigned count)
@@ -318,11 +307,7 @@ boolean r300_swtcl_draw_arrays(struct pipe_context* pipe,
     int i;
 
     if (!u_trim_pipe_prim(mode, &count)) {
-        return FALSE;
-    }
-
-    if (r300_nothing_to_draw(r300)) {
-        return TRUE;
+        return;
     }
 
     for (i = 0; i < r300->vertex_buffer_count; i++) {
@@ -346,12 +331,10 @@ boolean r300_swtcl_draw_arrays(struct pipe_context* pipe,
         pipe_buffer_unmap(pipe->screen, r300->vertex_buffer[i].buffer);
         draw_set_mapped_vertex_buffer(r300->draw, i, NULL);
     }
-
-    return TRUE;
 }
 
 /* SW TCL elements, using Draw. */
-boolean r300_swtcl_draw_range_elements(struct pipe_context* pipe,
+void r300_swtcl_draw_range_elements(struct pipe_context* pipe,
                                        struct pipe_buffer* indexBuffer,
                                        unsigned indexSize,
                                        unsigned minIndex,
@@ -365,11 +348,7 @@ boolean r300_swtcl_draw_range_elements(struct pipe_context* pipe,
     void* indices;
 
     if (!u_trim_pipe_prim(mode, &count)) {
-        return FALSE;
-    }
-
-    if (r300_nothing_to_draw(r300)) {
-        return TRUE;
+        return;
     }
 
     for (i = 0; i < r300->vertex_buffer_count; i++) {
@@ -400,8 +379,6 @@ boolean r300_swtcl_draw_range_elements(struct pipe_context* pipe,
     pipe_buffer_unmap(pipe->screen, indexBuffer);
     draw_set_mapped_element_buffer_range(r300->draw, 0, start,
                                          start + count - 1, NULL);
-
-    return TRUE;
 }
 
 /* Object for rendering using Draw. */
diff --git a/src/gallium/drivers/r300/r300_render.h b/src/gallium/drivers/r300/r300_render.h
index da83069083..27b5e6a963 100644
--- a/src/gallium/drivers/r300/r300_render.h
+++ b/src/gallium/drivers/r300/r300_render.h
@@ -25,35 +25,35 @@
 
 uint32_t r300_translate_primitive(unsigned prim);
 
-boolean r300_draw_range_elements(struct pipe_context* pipe,
-                                 struct pipe_buffer* indexBuffer,
-                                 unsigned indexSize,
-                                 unsigned minIndex,
-                                 unsigned maxIndex,
-                                 unsigned mode,
-                                 unsigned start,
-                                 unsigned count);
+void r300_draw_range_elements(struct pipe_context* pipe,
+                              struct pipe_buffer* indexBuffer,
+                              unsigned indexSize,
+                              unsigned minIndex,
+                              unsigned maxIndex,
+                              unsigned mode,
+                              unsigned start,
+                              unsigned count);
 
-boolean r300_draw_elements(struct pipe_context* pipe,
-                           struct pipe_buffer* indexBuffer,
-                           unsigned indexSize, unsigned mode,
-                           unsigned start, unsigned count);
+void r300_draw_elements(struct pipe_context* pipe,
+                        struct pipe_buffer* indexBuffer,
+                        unsigned indexSize, unsigned mode,
+                        unsigned start, unsigned count);
 
-boolean r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
-                         unsigned start, unsigned count);
+void r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
+                      unsigned start, unsigned count);
 
-boolean r300_swtcl_draw_arrays(struct pipe_context* pipe,
-                               unsigned mode,
-                               unsigned start,
-                               unsigned count);
+void r300_swtcl_draw_arrays(struct pipe_context* pipe,
+                            unsigned mode,
+                            unsigned start,
+                            unsigned count);
 
-boolean r300_swtcl_draw_range_elements(struct pipe_context* pipe,
-                                       struct pipe_buffer* indexBuffer,
-                                       unsigned indexSize,
-                                       unsigned minIndex,
-                                       unsigned maxIndex,
-                                       unsigned mode,
-                                       unsigned start,
-                                       unsigned count);
+void r300_swtcl_draw_range_elements(struct pipe_context* pipe,
+                                    struct pipe_buffer* indexBuffer,
+                                    unsigned indexSize,
+                                    unsigned minIndex,
+                                    unsigned maxIndex,
+                                    unsigned mode,
+                                    unsigned start,
+                                    unsigned count);
 
 #endif /* R300_RENDER_H */
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 2a8667d483..287664b1d2 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -83,6 +83,7 @@ static int r300_get_param(struct pipe_screen* pscreen, int param)
 
     switch (param) {
         case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+        case PIPE_CAP_MAX_COMBINED_SAMPLERS:
             /* XXX I'm told this goes up to 16 */
             return 8;
         case PIPE_CAP_NPOT_TEXTURES:
@@ -143,9 +144,11 @@ static int r300_get_param(struct pipe_screen* pscreen, int param)
         case PIPE_CAP_BLEND_EQUATION_SEPARATE:
             return 1;
         case PIPE_CAP_SM3:
-            return 1;
-        case PIPE_CAP_MAX_COMBINED_SAMPLERS:
-            return 8;
+            if (r300screen->caps->is_r500) {
+                return 1;
+            } else {
+                return 0;
+            }
         default:
             debug_printf("r300: Implementation error: Bad param %d\n",
                 param);
diff --git a/src/gallium/drivers/r300/r300_shader_semantics.h b/src/gallium/drivers/r300/r300_shader_semantics.h
index 85184e2cfd..6796841b29 100644
--- a/src/gallium/drivers/r300/r300_shader_semantics.h
+++ b/src/gallium/drivers/r300/r300_shader_semantics.h
@@ -40,6 +40,7 @@ struct r300_shader_semantics {
     int bcolor[ATTR_COLOR_COUNT];
     int generic[ATTR_GENERIC_COUNT];
     int fog;
+    int wpos;
 };
 
 static INLINE void r300_shader_semantics_reset(
@@ -50,6 +51,7 @@ static INLINE void r300_shader_semantics_reset(
     info->pos = ATTR_UNUSED;
     info->psize = ATTR_UNUSED;
     info->fog = ATTR_UNUSED;
+    info->wpos = ATTR_UNUSED;
 
     for (i = 0; i < ATTR_COLOR_COUNT; i++) {
         info->color[i] = ATTR_UNUSED;
diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c
index 49072462ec..78764ddc98 100644
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -1,5 +1,6 @@
 /*
  * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * Copyright 2009 Marek Olšák <maraeo@gmail.com>
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -41,6 +42,120 @@
 /* r300_state: Functions used to intialize state context by translating
  * Gallium state objects into semi-native r300 state objects. */
 
+static boolean blend_discard_if_src_alpha_0(unsigned srcRGB, unsigned srcA,
+                                            unsigned dstRGB, unsigned dstA)
+{
+    /* If the blend equation is ADD or REVERSE_SUBTRACT,
+     * SRC_ALPHA == 0, and the following state is set, the colorbuffer
+     * will not be changed.
+     * Notice that the dst factors are the src factors inverted. */
+    return (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
+            srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
+            srcRGB == PIPE_BLENDFACTOR_ZERO) &&
+           (srcA == PIPE_BLENDFACTOR_SRC_COLOR ||
+            srcA == PIPE_BLENDFACTOR_SRC_ALPHA ||
+            srcA == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
+            srcA == PIPE_BLENDFACTOR_ZERO) &&
+           (dstRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+            dstRGB == PIPE_BLENDFACTOR_ONE) &&
+           (dstA == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+            dstA == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+            dstA == PIPE_BLENDFACTOR_ONE);
+}
+
+static boolean blend_discard_if_src_alpha_1(unsigned srcRGB, unsigned srcA,
+                                            unsigned dstRGB, unsigned dstA)
+{
+    /* If the blend equation is ADD or REVERSE_SUBTRACT,
+     * SRC_ALPHA == 1, and the following state is set, the colorbuffer
+     * will not be changed.
+     * Notice that the dst factors are the src factors inverted. */
+    return (srcRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+            srcRGB == PIPE_BLENDFACTOR_ZERO) &&
+           (srcA == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+            srcA == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+            srcA == PIPE_BLENDFACTOR_ZERO) &&
+           (dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
+            dstRGB == PIPE_BLENDFACTOR_ONE) &&
+           (dstA == PIPE_BLENDFACTOR_SRC_COLOR ||
+            dstA == PIPE_BLENDFACTOR_SRC_ALPHA ||
+            dstA == PIPE_BLENDFACTOR_ONE);
+}
+
+static boolean blend_discard_if_src_color_0(unsigned srcRGB, unsigned srcA,
+                                            unsigned dstRGB, unsigned dstA)
+{
+    /* If the blend equation is ADD or REVERSE_SUBTRACT,
+     * SRC_COLOR == (0,0,0), and the following state is set, the colorbuffer
+     * will not be changed.
+     * Notice that the dst factors are the src factors inverted. */
+    return (srcRGB == PIPE_BLENDFACTOR_SRC_COLOR ||
+            srcRGB == PIPE_BLENDFACTOR_ZERO) &&
+           (srcA == PIPE_BLENDFACTOR_ZERO) &&
+           (dstRGB == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+            dstRGB == PIPE_BLENDFACTOR_ONE) &&
+           (dstA == PIPE_BLENDFACTOR_ONE);
+}
+
+static boolean blend_discard_if_src_color_1(unsigned srcRGB, unsigned srcA,
+                                            unsigned dstRGB, unsigned dstA)
+{
+    /* If the blend equation is ADD or REVERSE_SUBTRACT,
+     * SRC_COLOR == (1,1,1), and the following state is set, the colorbuffer
+     * will not be changed.
+     * Notice that the dst factors are the src factors inverted. */
+    return (srcRGB == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+            srcRGB == PIPE_BLENDFACTOR_ZERO) &&
+           (srcA == PIPE_BLENDFACTOR_ZERO) &&
+           (dstRGB == PIPE_BLENDFACTOR_SRC_COLOR ||
+            dstRGB == PIPE_BLENDFACTOR_ONE) &&
+           (dstA == PIPE_BLENDFACTOR_ONE);
+}
+
+static boolean blend_discard_if_src_alpha_color_0(unsigned srcRGB, unsigned srcA,
+                                                  unsigned dstRGB, unsigned dstA)
+{
+    /* If the blend equation is ADD or REVERSE_SUBTRACT,
+     * SRC_ALPHA_COLOR == (0,0,0,0), and the following state is set,
+     * the colorbuffer will not be changed.
+     * Notice that the dst factors are the src factors inverted. */
+    return (srcRGB == PIPE_BLENDFACTOR_SRC_COLOR ||
+            srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
+            srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
+            srcRGB == PIPE_BLENDFACTOR_ZERO) &&
+           (srcA == PIPE_BLENDFACTOR_SRC_COLOR ||
+            srcA == PIPE_BLENDFACTOR_SRC_ALPHA ||
+            srcA == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
+            srcA == PIPE_BLENDFACTOR_ZERO) &&
+           (dstRGB == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+            dstRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+            dstRGB == PIPE_BLENDFACTOR_ONE) &&
+           (dstA == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+            dstA == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+            dstA == PIPE_BLENDFACTOR_ONE);
+}
+
+static boolean blend_discard_if_src_alpha_color_1(unsigned srcRGB, unsigned srcA,
+                                                  unsigned dstRGB, unsigned dstA)
+{
+    /* If the blend equation is ADD or REVERSE_SUBTRACT,
+     * SRC_ALPHA_COLOR == (1,1,1,1), and the following state is set,
+     * the colorbuffer will not be changed.
+     * Notice that the dst factors are the src factors inverted. */
+    return (srcRGB == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+            srcRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+            srcRGB == PIPE_BLENDFACTOR_ZERO) &&
+           (srcA == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+            srcA == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+            srcA == PIPE_BLENDFACTOR_ZERO) &&
+           (dstRGB == PIPE_BLENDFACTOR_SRC_COLOR ||
+            dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
+            dstRGB == PIPE_BLENDFACTOR_ONE) &&
+           (dstA == PIPE_BLENDFACTOR_SRC_COLOR ||
+            dstA == PIPE_BLENDFACTOR_SRC_ALPHA ||
+            dstA == PIPE_BLENDFACTOR_ONE);
+}
+
 /* Create a new blend state based on the CSO blend state.
  *
  * This encompasses alpha blending, logic/raster ops, and blend dithering. */
@@ -66,7 +181,11 @@ static void* r300_create_blend_state(struct pipe_context* pipe,
             ( r300_translate_blend_factor(srcRGB) << R300_SRC_BLEND_SHIFT) |
             ( r300_translate_blend_factor(dstRGB) << R300_DST_BLEND_SHIFT);
 
-        /* optimization: some operations do not require the destination color */
+        /* Optimization: some operations do not require the destination color.
+         *
+         * When SRC_ALPHA_SATURATE is used, colorbuffer reads must be enabled,
+         * otherwise blending gives incorrect results. It seems to be
+         * a hardware bug. */
         if (eqRGB == PIPE_BLEND_MIN || eqA == PIPE_BLEND_MIN ||
             eqRGB == PIPE_BLEND_MAX || eqA == PIPE_BLEND_MAX ||
             dstRGB != PIPE_BLENDFACTOR_ZERO ||
@@ -78,11 +197,81 @@ static void* r300_create_blend_state(struct pipe_context* pipe,
             srcA == PIPE_BLENDFACTOR_DST_COLOR ||
             srcA == PIPE_BLENDFACTOR_DST_ALPHA ||
             srcA == PIPE_BLENDFACTOR_INV_DST_COLOR ||
-            srcA == PIPE_BLENDFACTOR_INV_DST_ALPHA)
+            srcA == PIPE_BLENDFACTOR_INV_DST_ALPHA ||
+            srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE) {
+            /* Enable reading from the colorbuffer. */
             blend->blend_control |= R300_READ_ENABLE;
 
-        /* XXX implement the optimization with DISCARD_SRC_PIXELS*/
-        /* XXX implement the optimization with SRC_ALPHA_?_NO_READ */
+            if (r300_screen(r300_context(pipe)->context.screen)->caps->is_r500) {
+                /* Optimization: Depending on incoming pixels, we can
+                 * conditionally disable the reading in hardware... */
+                if (eqRGB != PIPE_BLEND_MIN && eqA != PIPE_BLEND_MIN &&
+                    eqRGB != PIPE_BLEND_MAX && eqA != PIPE_BLEND_MAX) {
+                    /* Disable reading if SRC_ALPHA == 0. */
+                    if ((dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
+                         dstRGB == PIPE_BLENDFACTOR_ZERO) &&
+                        (dstA == PIPE_BLENDFACTOR_SRC_COLOR ||
+                         dstA == PIPE_BLENDFACTOR_SRC_ALPHA ||
+                         dstA == PIPE_BLENDFACTOR_ZERO)) {
+                         blend->blend_control |= R500_SRC_ALPHA_0_NO_READ;
+                    }
+
+                    /* Disable reading if SRC_ALPHA == 1. */
+                    if ((dstRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+                         dstRGB == PIPE_BLENDFACTOR_ZERO) &&
+                        (dstA == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+                         dstA == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+                         dstA == PIPE_BLENDFACTOR_ZERO)) {
+                         blend->blend_control |= R500_SRC_ALPHA_1_NO_READ;
+                    }
+                }
+            }
+        }
+
+        /* Optimization: discard pixels which don't change the colorbuffer.
+         *
+         * The code below is non-trivial and some math is involved.
+         *
+         * Discarding pixels must be disabled when FP16 AA is enabled.
+         * This is a hardware bug. Also, this implementation wouldn't work
+         * with FP blending enabled and equation clamping disabled.
+         *
+         * Equations other than ADD are rarely used and therefore won't be
+         * optimized. */
+        if ((eqRGB == PIPE_BLEND_ADD || eqRGB == PIPE_BLEND_REVERSE_SUBTRACT) &&
+            (eqA == PIPE_BLEND_ADD || eqA == PIPE_BLEND_REVERSE_SUBTRACT)) {
+            /* ADD: X+Y
+             * REVERSE_SUBTRACT: Y-X
+             *
+             * The idea is:
+             * If X = src*srcFactor = 0 and Y = dst*dstFactor = 1,
+             * then CB will not be changed.
+             *
+             * Given the srcFactor and dstFactor variables, we can derive
+             * what src and dst should be equal to and discard appropriate
+             * pixels.
+             */
+            if (blend_discard_if_src_alpha_0(srcRGB, srcA, dstRGB, dstA)) {
+                blend->blend_control |= R300_DISCARD_SRC_PIXELS_SRC_ALPHA_0;
+            } else if (blend_discard_if_src_alpha_1(srcRGB, srcA,
+                                                    dstRGB, dstA)) {
+                blend->blend_control |= R300_DISCARD_SRC_PIXELS_SRC_ALPHA_1;
+            } else if (blend_discard_if_src_color_0(srcRGB, srcA,
+                                                    dstRGB, dstA)) {
+                blend->blend_control |= R300_DISCARD_SRC_PIXELS_SRC_COLOR_0;
+            } else if (blend_discard_if_src_color_1(srcRGB, srcA,
+                                                    dstRGB, dstA)) {
+                blend->blend_control |= R300_DISCARD_SRC_PIXELS_SRC_COLOR_1;
+            } else if (blend_discard_if_src_alpha_color_0(srcRGB, srcA,
+                                                          dstRGB, dstA)) {
+                blend->blend_control |=
+                    R300_DISCARD_SRC_PIXELS_SRC_ALPHA_COLOR_0;
+            } else if (blend_discard_if_src_alpha_color_1(srcRGB, srcA,
+                                                          dstRGB, dstA)) {
+                blend->blend_control |=
+                    R300_DISCARD_SRC_PIXELS_SRC_ALPHA_COLOR_1;
+            }
+        }
 
         /* separate alpha */
         if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) {
@@ -128,8 +317,8 @@ static void r300_bind_blend_state(struct pipe_context* pipe,
 {
     struct r300_context* r300 = r300_context(pipe);
 
-    r300->blend_state = (struct r300_blend_state*)state;
-    r300->dirty_state |= R300_NEW_BLEND;
+    r300->blend_state.state = state;
+    r300->blend_state.dirty = TRUE;
 }
 
 /* Free blend state. */
@@ -151,20 +340,22 @@ static void r300_set_blend_color(struct pipe_context* pipe,
                                  const struct pipe_blend_color* color)
 {
     struct r300_context* r300 = r300_context(pipe);
+    struct r300_blend_color_state* state =
+        (struct r300_blend_color_state*)r300->blend_color_state.state;
     union util_color uc;
 
     util_pack_color(color->color, PIPE_FORMAT_A8R8G8B8_UNORM, &uc);
-    r300->blend_color_state->blend_color = uc.ui;
+    state->blend_color = uc.ui;
 
     /* XXX if FP16 blending is enabled, we should use the FP16 format */
-    r300->blend_color_state->blend_color_red_alpha =
+    state->blend_color_red_alpha =
         float_to_fixed10(color->color[0]) |
         (float_to_fixed10(color->color[3]) << 16);
-    r300->blend_color_state->blend_color_green_blue =
+    state->blend_color_green_blue =
         float_to_fixed10(color->color[2]) |
         (float_to_fixed10(color->color[1]) << 16);
 
-    r300->dirty_state |= R300_NEW_BLEND_COLOR;
+    r300->blend_color_state.dirty = TRUE;
 }
 
 static void r300_set_clip_state(struct pipe_context* pipe,
@@ -173,8 +364,8 @@ static void r300_set_clip_state(struct pipe_context* pipe,
     struct r300_context* r300 = r300_context(pipe);
 
     if (r300_screen(pipe->screen)->caps->has_tcl) {
-        r300->clip_state = *state;
-        r300->dirty_state |= R300_NEW_CLIP;
+        memcpy(r300->clip_state.state, state, sizeof(struct pipe_clip_state));
+        r300->clip_state.dirty = TRUE;
     } else {
         draw_flush(r300->draw);
         draw_set_clip_state(r300->draw, state);
@@ -272,8 +463,8 @@ static void r300_bind_dsa_state(struct pipe_context* pipe,
 {
     struct r300_context* r300 = r300_context(pipe);
 
-    r300->dsa_state = (struct r300_dsa_state*)state;
-    r300->dirty_state |= R300_NEW_DSA;
+    r300->dsa_state.state = state;
+    r300->dsa_state.dirty = TRUE;
 }
 
 /* Free DSA state. */
@@ -303,9 +494,6 @@ static void r300_set_scissor_regs(const struct pipe_scissor_state* state,
             (((state->maxx - 1) + 1440) << R300_SCISSORS_X_SHIFT) |
             (((state->maxy - 1) + 1440) << R300_SCISSORS_Y_SHIFT);
     }
-
-    scissor->empty_area = state->minx >= state->maxx ||
-                          state->miny >= state->maxy;
 }
 
 static void
@@ -313,7 +501,9 @@ static void
                                const struct pipe_framebuffer_state* state)
 {
     struct r300_context* r300 = r300_context(pipe);
-    struct pipe_scissor_state scissor;
+    struct r300_scissor_state* scissor =
+        (struct r300_scissor_state*)r300->scissor_state.state;
+    struct pipe_scissor_state pscissor;
 
     if (r300->draw) {
         draw_flush(r300->draw);
@@ -321,18 +511,19 @@ static void
 
     r300->framebuffer_state = *state;
 
-    scissor.minx = scissor.miny = 0;
-    scissor.maxx = state->width;
-    scissor.maxy = state->height;
-    r300_set_scissor_regs(&scissor, &r300->scissor_state->framebuffer,
+    /* XXX Arg. This is silly. */
+    pscissor.minx = pscissor.miny = 0;
+    pscissor.maxx = state->width;
+    pscissor.maxy = state->height;
+    r300_set_scissor_regs(&pscissor, &scissor->framebuffer,
                           r300_screen(r300->context.screen)->caps->is_r500);
 
     /* Don't rely on the order of states being set for the first time. */
-    if (!r300->rs_state || !r300->rs_state->rs.scissor) {
-        r300->dirty_state |= R300_NEW_SCISSOR;
-    }
     r300->dirty_state |= R300_NEW_FRAMEBUFFERS;
-    r300->dirty_state |= R300_NEW_BLEND;
+
+    r300->blend_state.dirty = TRUE;
+    r300->dsa_state.dirty = TRUE;
+    r300->scissor_state.dirty = TRUE;
 }
 
 /* Create fragment shader state. */
@@ -367,6 +558,10 @@ static void r300_bind_fs_state(struct pipe_context* pipe, void* shader)
     r300->fs = fs;
     r300_pick_fragment_shader(r300);
 
+    if (r300->vs && r300_vertex_shader_setup_wpos(r300)) {
+        r300->dirty_state |= R300_NEW_VERTEX_FORMAT;
+    }
+
     r300->dirty_state |= R300_NEW_FRAGMENT_SHADER | R300_NEW_FRAGMENT_SHADER_CONSTANTS;
 }
 
@@ -407,8 +602,6 @@ static void* r300_create_rs_state(struct pipe_context* pipe,
     /* Copy rasterizer state for Draw. */
     rs->rs = *state;
 
-    rs->enable_vte = !state->bypass_vs_clip_and_viewport;
-
 #ifdef PIPE_ARCH_LITTLE_ENDIAN
     rs->vap_control_status = R300_VC_NO_SWAP;
 #else
@@ -524,12 +717,19 @@ static void r300_bind_rs_state(struct pipe_context* pipe, void* state)
         draw_set_rasterizer_state(r300->draw, &rs->rs);
     }
 
-    r300->rs_state = rs;
+    r300->tcl_bypass = rs->rs.bypass_vs_clip_and_viewport;
+
+    r300->rs_state.state = rs;
+    r300->rs_state.dirty = TRUE;
+    /* XXX Why is this still needed, dammit!? */
+    r300->scissor_state.dirty = TRUE;
+    r300->viewport_state.dirty = TRUE;
+
     /* XXX Clean these up when we move to atom emits */
-    r300->dirty_state |= R300_NEW_RASTERIZER;
     r300->dirty_state |= R300_NEW_RS_BLOCK;
-    r300->dirty_state |= R300_NEW_SCISSOR;
-    r300->dirty_state |= R300_NEW_VIEWPORT;
+    if (r300->fs && r300->fs->inputs.wpos != ATTR_UNUSED) {
+        r300->dirty_state |= R300_NEW_FRAGMENT_SHADER_CONSTANTS;
+    }
 }
 
 /* Free rasterizer state. */
@@ -556,7 +756,8 @@ static void*
 
     sampler->filter0 |= r300_translate_tex_filters(state->min_img_filter,
                                                    state->mag_img_filter,
-                                                   state->min_mip_filter);
+                                                   state->min_mip_filter,
+                                                   state->max_anisotropy > 1.0);
 
     /* Unfortunately, r300-r500 don't support floating-point mipmap lods. */
     /* We must pass these to the emit function to clamp them properly. */
@@ -663,50 +864,54 @@ static void r300_set_scissor_state(struct pipe_context* pipe,
                                    const struct pipe_scissor_state* state)
 {
     struct r300_context* r300 = r300_context(pipe);
+    struct r300_scissor_state* scissor =
+        (struct r300_scissor_state*)r300->scissor_state.state;
 
-    r300_set_scissor_regs(state, &r300->scissor_state->scissor,
+    r300_set_scissor_regs(state, &scissor->scissor,
                           r300_screen(r300->context.screen)->caps->is_r500);
 
-    /* Don't rely on the order of states being set for the first time. */
-    if (!r300->rs_state || r300->rs_state->rs.scissor) {
-        r300->dirty_state |= R300_NEW_SCISSOR;
-    }
+    r300->scissor_state.dirty = TRUE;
 }
 
 static void r300_set_viewport_state(struct pipe_context* pipe,
                                     const struct pipe_viewport_state* state)
 {
     struct r300_context* r300 = r300_context(pipe);
+    struct r300_viewport_state* viewport =
+        (struct r300_viewport_state*)r300->viewport_state.state;
 
     /* Do the transform in HW. */
-    r300->viewport_state->vte_control = R300_VTX_W0_FMT;
+    viewport->vte_control = R300_VTX_W0_FMT;
 
     if (state->scale[0] != 1.0f) {
-        r300->viewport_state->xscale = state->scale[0];
-        r300->viewport_state->vte_control |= R300_VPORT_X_SCALE_ENA;
+        viewport->xscale = state->scale[0];
+        viewport->vte_control |= R300_VPORT_X_SCALE_ENA;
     }
     if (state->scale[1] != 1.0f) {
-        r300->viewport_state->yscale = state->scale[1];
-        r300->viewport_state->vte_control |= R300_VPORT_Y_SCALE_ENA;
+        viewport->yscale = state->scale[1];
+        viewport->vte_control |= R300_VPORT_Y_SCALE_ENA;
     }
     if (state->scale[2] != 1.0f) {
-        r300->viewport_state->zscale = state->scale[2];
-        r300->viewport_state->vte_control |= R300_VPORT_Z_SCALE_ENA;
+        viewport->zscale = state->scale[2];
+        viewport->vte_control |= R300_VPORT_Z_SCALE_ENA;
     }
     if (state->translate[0] != 0.0f) {
-        r300->viewport_state->xoffset = state->translate[0];
-        r300->viewport_state->vte_control |= R300_VPORT_X_OFFSET_ENA;
+        viewport->xoffset = state->translate[0];
+        viewport->vte_control |= R300_VPORT_X_OFFSET_ENA;
     }
     if (state->translate[1] != 0.0f) {
-        r300->viewport_state->yoffset = state->translate[1];
-        r300->viewport_state->vte_control |= R300_VPORT_Y_OFFSET_ENA;
+        viewport->yoffset = state->translate[1];
+        viewport->vte_control |= R300_VPORT_Y_OFFSET_ENA;
     }
     if (state->translate[2] != 0.0f) {
-        r300->viewport_state->zoffset = state->translate[2];
-        r300->viewport_state->vte_control |= R300_VPORT_Z_OFFSET_ENA;
+        viewport->zoffset = state->translate[2];
+        viewport->vte_control |= R300_VPORT_Z_OFFSET_ENA;
     }
 
-    r300->dirty_state |= R300_NEW_VIEWPORT;
+    r300->viewport_state.dirty = TRUE;
+    if (r300->fs && r300->fs->inputs.wpos != ATTR_UNUSED) {
+        r300->dirty_state |= R300_NEW_FRAGMENT_SHADER_CONSTANTS;
+    }
 }
 
 static void r300_set_vertex_buffers(struct pipe_context* pipe,
@@ -778,7 +983,13 @@ static void r300_bind_vs_state(struct pipe_context* pipe, void* shader)
         }
 
         r300->vs = vs;
-        r300->dirty_state |= R300_NEW_VERTEX_SHADER | R300_NEW_VERTEX_SHADER_CONSTANTS;
+        if (r300->fs) {
+            r300_vertex_shader_setup_wpos(r300);
+        }
+
+        r300->dirty_state |=
+            R300_NEW_VERTEX_SHADER | R300_NEW_VERTEX_SHADER_CONSTANTS |
+            R300_NEW_VERTEX_FORMAT;
     } else {
         draw_flush(r300->draw);
         draw_bind_vertex_shader(r300->draw,
diff --git a/src/gallium/drivers/r300/r300_state_derived.c b/src/gallium/drivers/r300/r300_state_derived.c
index 727ae7ade6..192846411b 100644
--- a/src/gallium/drivers/r300/r300_state_derived.c
+++ b/src/gallium/drivers/r300/r300_state_derived.c
@@ -139,10 +139,10 @@ static void r300_vertex_psc(struct r300_context* r300)
 
     /* If TCL is bypassed, map vertex streams to equivalent VS output
      * locations. */
-    if (r300->rs_state->enable_vte) {
-        stream_tab = identity;
-    } else {
+    if (r300->tcl_bypass) {
         stream_tab = r300->vs->stream_loc_notcl;
+    } else {
+        stream_tab = identity;
     }
 
     /* Vertex shaders have no semantics on their inputs,
@@ -333,6 +333,8 @@ static void r300_update_rs_block(struct r300_context* r300,
     void (*rX00_rs_col_write)(struct r300_rs_block*, int, int);
     void (*rX00_rs_tex)(struct r300_rs_block*, int, int, boolean);
     void (*rX00_rs_tex_write)(struct r300_rs_block*, int, int);
+    boolean any_bcolor_used = vs_outputs->bcolor[0] != ATTR_UNUSED ||
+                              vs_outputs->bcolor[1] != ATTR_UNUSED;
 
     if (r300_screen(r300->context.screen)->caps->is_r500) {
         rX00_rs_col       = r500_rs_col;
@@ -348,7 +350,7 @@ static void r300_update_rs_block(struct r300_context* r300,
 
     /* Rasterize colors. */
     for (i = 0; i < ATTR_COLOR_COUNT; i++) {
-        if (vs_outputs->color[i] != ATTR_UNUSED) {
+        if (vs_outputs->color[i] != ATTR_UNUSED || any_bcolor_used) {
             /* Always rasterize if it's written by the VS,
              * otherwise it locks up. */
             rX00_rs_col(rs, col_count, i, FALSE);
@@ -410,6 +412,16 @@ static void r300_update_rs_block(struct r300_context* r300,
         }
     }
 
+    /* Rasterize WPOS. */
+    /* If the FS doesn't need it, it's not written by the VS. */
+    if (fs_inputs->wpos != ATTR_UNUSED) {
+        rX00_rs_tex(rs, tex_count, tex_count, FALSE);
+        rX00_rs_tex_write(rs, tex_count, fp_offset);
+
+        fp_offset++;
+        tex_count++;
+    }
+
     /* Rasterize at least one color, or bad things happen. */
     if (col_count == 0 && tex_count == 0) {
         rX00_rs_col(rs, 0, 0, TRUE);
@@ -496,7 +508,8 @@ static boolean r300_dsa_alpha_test_enabled(struct r300_dsa_state* dsa)
 
 static void r300_update_ztop(struct r300_context* r300)
 {
-    r300->ztop_state.z_buffer_top = R300_ZTOP_ENABLE;
+    struct r300_ztop_state* ztop_state =
+        (struct r300_ztop_state*)r300->ztop_state.state;
 
     /* This is important enough that I felt it warranted a comment.
      *
@@ -518,31 +531,37 @@ static void r300_update_ztop(struct r300_context* r300)
      * 5) Depth writes in fragment shader
      * 6) Outstanding occlusion queries
      *
+     * This register causes stalls all the way from SC to CB when changed,
+     * but it is buffered on-chip so it does not hurt to write it if it has
+     * not changed.
+     *
      * ~C.
      */
 
     /* ZS writes */
-    if (r300_dsa_writes_depth_stencil(r300->dsa_state) &&
-           (r300_dsa_alpha_test_enabled(r300->dsa_state) ||   /* (1) */
-            r300->fs->info.uses_kill)) {                      /* (2) */
-        r300->ztop_state.z_buffer_top = R300_ZTOP_DISABLE;
-    } else if (r300_fragment_shader_writes_depth(r300->fs)) { /* (5) */
-        r300->ztop_state.z_buffer_top = R300_ZTOP_DISABLE;
-    } else if (r300->query_current) {                         /* (6) */
-        r300->ztop_state.z_buffer_top = R300_ZTOP_DISABLE;
+    if (r300_dsa_writes_depth_stencil(r300->dsa_state.state) &&
+           (r300_dsa_alpha_test_enabled(r300->dsa_state.state) ||/* (1) */
+            r300->fs->info.uses_kill)) {                         /* (2) */
+        ztop_state->z_buffer_top = R300_ZTOP_DISABLE;
+    } else if (r300_fragment_shader_writes_depth(r300->fs)) {    /* (5) */
+        ztop_state->z_buffer_top = R300_ZTOP_DISABLE;
+    } else if (r300->query_current) {                            /* (6) */
+        ztop_state->z_buffer_top = R300_ZTOP_DISABLE;
+    } else {
+        ztop_state->z_buffer_top = R300_ZTOP_ENABLE;
     }
+
+    r300->ztop_state.dirty = TRUE;
 }
 
 void r300_update_derived_state(struct r300_context* r300)
 {
+    /* XXX */
     if (r300->dirty_state &
         (R300_NEW_FRAGMENT_SHADER | R300_NEW_VERTEX_SHADER |
-         R300_NEW_VERTEX_FORMAT)) {
+         R300_NEW_VERTEX_FORMAT) || r300->rs_state.dirty) {
         r300_update_derived_shader_state(r300);
     }
 
-    if (r300->dirty_state &
-            (R300_NEW_DSA | R300_NEW_FRAGMENT_SHADER | R300_NEW_QUERY)) {
-        r300_update_ztop(r300);
-    }
+    r300_update_ztop(r300);
 }
diff --git a/src/gallium/drivers/r300/r300_state_inlines.h b/src/gallium/drivers/r300/r300_state_inlines.h
index dbe42edd91..35be00e1b0 100644
--- a/src/gallium/drivers/r300/r300_state_inlines.h
+++ b/src/gallium/drivers/r300/r300_state_inlines.h
@@ -257,38 +257,37 @@ static INLINE uint32_t r300_translate_wrap(int wrap)
     }
 }
 
-static INLINE uint32_t r300_translate_tex_filters(int min, int mag, int mip)
+static INLINE uint32_t r300_translate_tex_filters(int min, int mag, int mip,
+                                                  int is_anisotropic)
 {
     uint32_t retval = 0;
-    switch (min) {
+    if (is_anisotropic)
+        retval |= R300_TX_MIN_FILTER_ANISO | R300_TX_MAG_FILTER_ANISO;
+    else {
+        switch (min) {
         case PIPE_TEX_FILTER_NEAREST:
             retval |= R300_TX_MIN_FILTER_NEAREST;
             break;
         case PIPE_TEX_FILTER_LINEAR:
             retval |= R300_TX_MIN_FILTER_LINEAR;
             break;
-        case PIPE_TEX_FILTER_ANISO:
-            retval |= R300_TX_MIN_FILTER_ANISO;
-            break;
         default:
             debug_printf("r300: Unknown texture filter %d\n", min);
             assert(0);
             break;
-    }
-    switch (mag) {
+        }
+        switch (mag) {
         case PIPE_TEX_FILTER_NEAREST:
             retval |= R300_TX_MAG_FILTER_NEAREST;
             break;
         case PIPE_TEX_FILTER_LINEAR:
             retval |= R300_TX_MAG_FILTER_LINEAR;
             break;
-        case PIPE_TEX_FILTER_ANISO:
-            retval |= R300_TX_MAG_FILTER_ANISO;
-            break;
         default:
             debug_printf("r300: Unknown texture filter %d\n", mag);
             assert(0);
             break;
+        }
     }
     switch (mip) {
         case PIPE_TEX_MIPFILTER_NONE:
diff --git a/src/gallium/drivers/r300/r300_state_invariant.c b/src/gallium/drivers/r300/r300_state_invariant.c
index bcd4c030f9..f25f3ca217 100644
--- a/src/gallium/drivers/r300/r300_state_invariant.c
+++ b/src/gallium/drivers/r300/r300_state_invariant.c
@@ -43,7 +43,7 @@ void r300_emit_invariant_state(struct r300_context* r300)
     struct r300_capabilities* caps = r300_screen(r300->context.screen)->caps;
     CS_LOCALS(r300);
 
-    BEGIN_CS(20 + (caps->has_tcl ? 2: 0));
+    BEGIN_CS(16 + (caps->has_tcl ? 2: 0));
 
     /*** Graphics Backend (GB) ***/
     /* Various GB enables */
@@ -66,8 +66,6 @@ void r300_emit_invariant_state(struct r300_context* r300)
     OUT_CS_REG(R300_FG_FOG_COLOR_R, 0x0);
     OUT_CS_REG(R300_FG_FOG_COLOR_G, 0x0);
     OUT_CS_REG(R300_FG_FOG_COLOR_B, 0x0);
-    OUT_CS_REG(R300_FG_DEPTH_SRC, 0x0);
-    OUT_CS_REG(R300_US_W_FMT, 0x0);
 
     /*** VAP ***/
     /* Sign/normalize control */
@@ -118,8 +116,8 @@ void r300_emit_invariant_state(struct r300_context* r300)
     OUT_CS_REG(R300_SC_EDGERULE, 0x2DA49525);
     OUT_CS_REG(R300_RB3D_AARESOLVE_CTL, 0x00000000);
     if (caps->is_r500) {
-        OUT_CS_REG(R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD, 0x00000000);
-        OUT_CS_REG(R500_RB3D_DISCARD_SRC_PIXEL_GTE_THRESHOLD, 0xFFFFFFFF);
+        OUT_CS_REG(R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD, 0x01010101);
+        OUT_CS_REG(R500_RB3D_DISCARD_SRC_PIXEL_GTE_THRESHOLD, 0xFEFEFEFE);
     }
     OUT_CS_REG(R300_ZB_BW_CNTL, 0x00000000);
     OUT_CS_REG(R300_ZB_DEPTHCLEARVALUE, 0x00000000);
diff --git a/src/gallium/drivers/r300/r300_tgsi_to_rc.c b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
index 096cdb20bb..a792c2cf98 100644
--- a/src/gallium/drivers/r300/r300_tgsi_to_rc.c
+++ b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
@@ -120,7 +120,7 @@ static unsigned translate_opcode(unsigned opcode)
      /* case TGSI_OPCODE_NOT: return RC_OPCODE_NOT; */
      /* case TGSI_OPCODE_TRUNC: return RC_OPCODE_TRUNC; */
      /* case TGSI_OPCODE_SHL: return RC_OPCODE_SHL; */
-     /* case TGSI_OPCODE_SHR: return RC_OPCODE_SHR; */
+     /* case TGSI_OPCODE_ISHR: return RC_OPCODE_SHR; */
      /* case TGSI_OPCODE_AND: return RC_OPCODE_AND; */
      /* case TGSI_OPCODE_OR: return RC_OPCODE_OR; */
      /* case TGSI_OPCODE_MOD: return RC_OPCODE_MOD; */
diff --git a/src/gallium/drivers/r300/r300_vs.c b/src/gallium/drivers/r300/r300_vs.c
index c4ed0d712f..68aef70872 100644
--- a/src/gallium/drivers/r300/r300_vs.c
+++ b/src/gallium/drivers/r300/r300_vs.c
@@ -22,6 +22,7 @@
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
 
 #include "r300_vs.h"
+#include "r300_fs.h"
 
 #include "r300_context.h"
 #include "r300_screen.h"
@@ -33,6 +34,8 @@
 
 #include "radeon_compiler.h"
 
+#include "util/u_math.h"
+
 /* Convert info about VS output semantics into r300_shader_semantics. */
 static void r300_shader_read_vs_outputs(
     struct tgsi_shader_info* info,
@@ -88,11 +91,13 @@ static void r300_shader_read_vs_outputs(
     }
 }
 
-static void r300_shader_vap_output_fmt(
-    struct r300_shader_semantics* vs_outputs,
-    uint* hwfmt)
+static void r300_shader_vap_output_fmt(struct r300_vertex_shader* vs)
 {
+    struct r300_shader_semantics* vs_outputs = &vs->outputs;
+    uint32_t* hwfmt = vs->hwfmt;
     int i, gen_count;
+    boolean any_bcolor_used = vs_outputs->bcolor[0] != ATTR_UNUSED ||
+                              vs_outputs->bcolor[1] != ATTR_UNUSED;
 
     /* Do the actual vertex_info setup.
      *
@@ -119,13 +124,19 @@ static void r300_shader_vap_output_fmt(
 
     /* Colors. */
     for (i = 0; i < ATTR_COLOR_COUNT; i++) {
-        if (vs_outputs->color[i] != ATTR_UNUSED) {
+        if (vs_outputs->color[i] != ATTR_UNUSED || any_bcolor_used) {
             hwfmt[1] |= R300_INPUT_CNTL_COLOR;
             hwfmt[2] |= R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT << i;
         }
     }
 
-    /* XXX Back-face colors. */
+    /* Back-face colors. */
+    if (any_bcolor_used) {
+        for (i = 0; i < ATTR_COLOR_COUNT; i++) {
+            hwfmt[1] |= R300_INPUT_CNTL_COLOR;
+            hwfmt[2] |= R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT << (2+i);
+        }
+    }
 
     /* Texture coordinates. */
     gen_count = 0;
@@ -146,6 +157,9 @@ static void r300_shader_vap_output_fmt(
 
     /* XXX magic */
     assert(gen_count <= 8);
+
+    /* WPOS. */
+    vs->wpos_tex_output = gen_count;
 }
 
 /* Sets up stream mapping to equivalent VS outputs if TCL is bypassed
@@ -155,6 +169,8 @@ static void r300_stream_locations_notcl(
     int* stream_loc)
 {
     int i, tabi = 0, gen_count;
+    boolean any_bcolor_used = vs_outputs->bcolor[0] != ATTR_UNUSED ||
+                              vs_outputs->bcolor[1] != ATTR_UNUSED;
 
     /* Position. */
     stream_loc[tabi++] = 0;
@@ -166,14 +182,14 @@ static void r300_stream_locations_notcl(
 
     /* Colors. */
     for (i = 0; i < ATTR_COLOR_COUNT; i++) {
-        if (vs_outputs->color[i] != ATTR_UNUSED) {
+        if (vs_outputs->color[i] != ATTR_UNUSED || any_bcolor_used) {
             stream_loc[tabi++] = 2 + i;
         }
     }
 
     /* Back-face colors. */
-    for (i = 0; i < ATTR_COLOR_COUNT; i++) {
-        if (vs_outputs->bcolor[i] != ATTR_UNUSED) {
+    if (any_bcolor_used) {
+        for (i = 0; i < ATTR_COLOR_COUNT; i++) {
             stream_loc[tabi++] = 4 + i;
         }
     }
@@ -181,7 +197,7 @@ static void r300_stream_locations_notcl(
     /* Texture coordinates. */
     gen_count = 0;
     for (i = 0; i < ATTR_GENERIC_COUNT; i++) {
-        if (vs_outputs->bcolor[i] != ATTR_UNUSED) {
+        if (vs_outputs->generic[i] != ATTR_UNUSED) {
             assert(tabi < 16);
             stream_loc[tabi++] = 6 + gen_count;
             gen_count++;
@@ -195,8 +211,12 @@ static void r300_stream_locations_notcl(
         gen_count++;
     }
 
-    /* XXX magic */
-    assert(gen_count <= 8);
+    /* WPOS. */
+    if (vs_outputs->wpos != ATTR_UNUSED) {
+        assert(tabi < 16);
+        stream_loc[tabi++] = 6 + gen_count;
+        gen_count++;
+    }
 
     for (; tabi < 16;) {
         stream_loc[tabi++] = -1;
@@ -209,6 +229,8 @@ static void set_vertex_inputs_outputs(struct r300_vertex_program_compiler * c)
     struct r300_shader_semantics* outputs = &vs->outputs;
     struct tgsi_shader_info* info = &vs->info;
     int i, reg = 0;
+    boolean any_bcolor_used = outputs->bcolor[0] != ATTR_UNUSED ||
+                              outputs->bcolor[1] != ATTR_UNUSED;
 
     /* Fill in the input mapping */
     for (i = 0; i < info->num_inputs; i++)
@@ -226,14 +248,30 @@ static void set_vertex_inputs_outputs(struct r300_vertex_program_compiler * c)
         c->code->outputs[outputs->psize] = reg++;
     }
 
+    /* If we're writing back facing colors we need to send
+     * four colors to make front/back face colors selection work.
+     * If the vertex program doesn't write all 4 colors, lets
+     * pretend it does by skipping output index reg so the colors
+     * get written into appropriate output vectors.
+     */
+
     /* Colors. */
     for (i = 0; i < ATTR_COLOR_COUNT; i++) {
         if (outputs->color[i] != ATTR_UNUSED) {
             c->code->outputs[outputs->color[i]] = reg++;
+        } else if (any_bcolor_used) {
+            reg++;
         }
     }
 
-    /* XXX Back-face colors. */
+    /* Back-face colors. */
+    for (i = 0; i < ATTR_COLOR_COUNT; i++) {
+        if (outputs->bcolor[i] != ATTR_UNUSED) {
+            c->code->outputs[outputs->bcolor[i]] = reg++;
+        } else if (any_bcolor_used) {
+            reg++;
+        }
+    }
 
     /* Texture coordinates. */
     for (i = 0; i < ATTR_GENERIC_COUNT; i++) {
@@ -246,6 +284,33 @@ static void set_vertex_inputs_outputs(struct r300_vertex_program_compiler * c)
     if (outputs->fog != ATTR_UNUSED) {
         c->code->outputs[outputs->fog] = reg++;
     }
+
+    /* WPOS. */
+    if (outputs->wpos != ATTR_UNUSED) {
+        c->code->outputs[outputs->wpos] = reg++;
+    }
+}
+
+static void r300_insert_wpos(struct r300_vertex_program_compiler* c,
+                             struct r300_shader_semantics* outputs)
+{
+    int i, lastOutput = 0;
+
+    /* Find the max output index. */
+    lastOutput = MAX2(lastOutput, outputs->psize);
+    for (i = 0; i < ATTR_COLOR_COUNT; i++) {
+        lastOutput = MAX2(lastOutput, outputs->color[i]);
+        lastOutput = MAX2(lastOutput, outputs->bcolor[i]);
+    }
+    for (i = 0; i < ATTR_GENERIC_COUNT; i++) {
+        lastOutput = MAX2(lastOutput, outputs->generic[i]);
+    }
+    lastOutput = MAX2(lastOutput, outputs->fog);
+
+    /* Set WPOS after the last output. */
+    lastOutput++;
+    rc_copy_output(&c->Base, 0, lastOutput); /* out[lastOutput] = out[0]; */
+    outputs->wpos = lastOutput;
 }
 
 void r300_translate_vertex_shader(struct r300_context* r300,
@@ -256,8 +321,6 @@ void r300_translate_vertex_shader(struct r300_context* r300,
 
     /* Initialize. */
     r300_shader_read_vs_outputs(&vs->info, &vs->outputs);
-    r300_shader_vap_output_fmt(&vs->outputs, vs->hwfmt);
-    r300_stream_locations_notcl(&vs->outputs, vs->stream_loc_notcl);
 
     /* Setup the compiler */
     rc_init(&compiler.Base);
@@ -277,9 +340,15 @@ void r300_translate_vertex_shader(struct r300_context* r300,
 
     r300_tgsi_to_rc(&ttr, vs->state.tokens);
 
-    compiler.RequiredOutputs = ~(~0 << vs->info.num_outputs);
+    compiler.RequiredOutputs = ~(~0 << (vs->info.num_outputs+1));
     compiler.SetHwInputOutput = &set_vertex_inputs_outputs;
 
+    /* Insert the WPOS output. */
+    r300_insert_wpos(&compiler, &vs->outputs);
+
+    r300_shader_vap_output_fmt(vs);
+    r300_stream_locations_notcl(&vs->outputs, vs->stream_loc_notcl);
+
     /* Invoke the compiler */
     r3xx_compile_vertex_program(&compiler);
     if (compiler.Base.Error) {
@@ -292,3 +361,30 @@ void r300_translate_vertex_shader(struct r300_context* r300,
     rc_destroy(&compiler.Base);
     vs->translated = TRUE;
 }
+
+boolean r300_vertex_shader_setup_wpos(struct r300_context* r300)
+{
+    struct r300_vertex_shader* vs = r300->vs;
+    int tex_output = r300->vs->wpos_tex_output;
+    uint32_t tex_fmt = R300_INPUT_CNTL_TC0 << tex_output;
+    uint32_t* hwfmt = vs->hwfmt;
+
+    if (r300->fs->inputs.wpos != ATTR_UNUSED) {
+        /* Enable WPOS in VAP. */
+        if (!(hwfmt[1] & tex_fmt)) {
+            hwfmt[1] |= tex_fmt;
+            hwfmt[3] |= (4 << (3 * tex_output));
+
+            assert(tex_output < 8);
+            return TRUE;
+        }
+    } else {
+        /* Disable WPOS in VAP. */
+        if (hwfmt[1] & tex_fmt) {
+            hwfmt[1] &= ~tex_fmt;
+            hwfmt[3] &= ~(4 << (3 * tex_output));
+            return TRUE;
+        }
+    }
+    return FALSE;
+}
diff --git a/src/gallium/drivers/r300/r300_vs.h b/src/gallium/drivers/r300/r300_vs.h
index 67e9db5366..18cfeee3cd 100644
--- a/src/gallium/drivers/r300/r300_vs.h
+++ b/src/gallium/drivers/r300/r300_vs.h
@@ -43,6 +43,9 @@ struct r300_vertex_shader {
     /* Stream locations for SWTCL or if TCL is bypassed. */
     int stream_loc_notcl[16];
 
+    /* Output stream location for WPOS. */
+    int wpos_tex_output;
+
     /* Has this shader been translated yet? */
     boolean translated;
 
@@ -53,4 +56,7 @@ struct r300_vertex_shader {
 void r300_translate_vertex_shader(struct r300_context* r300,
                                   struct r300_vertex_shader* vs);
 
+/* Return TRUE if VAP (hwfmt) needs to be re-emitted. */
+boolean r300_vertex_shader_setup_wpos(struct r300_context* r300);
+
 #endif /* R300_VS_H */
diff --git a/src/gallium/drivers/softpipe/sp_clear.c b/src/gallium/drivers/softpipe/sp_clear.c
index f98087deb8..5f130453c3 100644
--- a/src/gallium/drivers/softpipe/sp_clear.c
+++ b/src/gallium/drivers/softpipe/sp_clear.c
@@ -36,6 +36,7 @@
 #include "util/u_pack_color.h"
 #include "sp_clear.h"
 #include "sp_context.h"
+#include "sp_query.h"
 #include "sp_tile_cache.h"
 
 
@@ -55,6 +56,9 @@ softpipe_clear(struct pipe_context *pipe, unsigned buffers, const float *rgba,
    if (softpipe->no_rast)
       return;
 
+   if (!softpipe_check_render_cond(softpipe))
+      return;
+
 #if 0
    softpipe_update_derived(softpipe); /* not needed?? */
 #endif
diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c
index 82173a3c2a..f3ac6760db 100644
--- a/src/gallium/drivers/softpipe/sp_context.c
+++ b/src/gallium/drivers/softpipe/sp_context.c
@@ -176,6 +176,19 @@ softpipe_is_buffer_referenced( struct pipe_context *pipe,
 }
 
 
+static void
+softpipe_render_condition( struct pipe_context *pipe,
+                           struct pipe_query *query,
+                           uint mode )
+{
+   struct softpipe_context *softpipe = softpipe_context( pipe );
+
+   softpipe->render_cond_query = query;
+   softpipe->render_cond_mode = mode;
+}
+
+
+
 struct pipe_context *
 softpipe_create( struct pipe_screen *screen )
 {
@@ -252,6 +265,8 @@ softpipe_create( struct pipe_screen *screen )
 
    softpipe_init_query_funcs( softpipe );
 
+   softpipe->pipe.render_condition = softpipe_render_condition;
+
    /*
     * Alloc caches for accessing drawing surfaces and textures.
     * Must be before quad stage setup!
diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index 6a89bd4b06..73fa744f9d 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -116,6 +116,10 @@ struct softpipe_context {
 
    unsigned line_stipple_counter;
 
+   /** Conditional query object and mode */
+   struct pipe_query *render_cond_query;
+   uint render_cond_mode;
+
    /** Software quad rendering pipeline */
    struct {
       struct quad_stage *shade;
diff --git a/src/gallium/drivers/softpipe/sp_draw_arrays.c b/src/gallium/drivers/softpipe/sp_draw_arrays.c
index 9ea5d6fb9f..03d35fb3cb 100644
--- a/src/gallium/drivers/softpipe/sp_draw_arrays.c
+++ b/src/gallium/drivers/softpipe/sp_draw_arrays.c
@@ -38,6 +38,7 @@
 #include "util/u_prim.h"
 
 #include "sp_context.h"
+#include "sp_query.h"
 #include "sp_state.h"
 
 #include "draw/draw_context.h"
@@ -97,11 +98,11 @@ softpipe_unmap_constant_buffers(struct softpipe_context *sp)
 }
 
 
-boolean
+void
 softpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
                      unsigned start, unsigned count)
 {
-   return softpipe_draw_elements(pipe, NULL, 0, mode, start, count);
+   softpipe_draw_elements(pipe, NULL, 0, mode, start, count);
 }
 
 
@@ -110,7 +111,7 @@ softpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
  * Basically, map the vertex buffers (and drawing surfaces), then hand off
  * the drawing to the 'draw' module.
  */
-boolean
+void
 softpipe_draw_range_elements(struct pipe_context *pipe,
                              struct pipe_buffer *indexBuffer,
                              unsigned indexSize,
@@ -122,6 +123,9 @@ softpipe_draw_range_elements(struct pipe_context *pipe,
    struct draw_context *draw = sp->draw;
    unsigned i;
 
+   if (!softpipe_check_render_cond(sp))
+      return;
+
    sp->reduced_api_prim = u_reduced_prim(mode);
 
    if (sp->dirty)
@@ -177,19 +181,17 @@ softpipe_draw_range_elements(struct pipe_context *pipe,
    softpipe_unmap_constant_buffers(sp);
 
    sp->dirty_render_cache = TRUE;
-   
-   return TRUE;
 }
 
 
-boolean
+void
 softpipe_draw_elements(struct pipe_context *pipe,
                        struct pipe_buffer *indexBuffer,
                        unsigned indexSize,
                        unsigned mode, unsigned start, unsigned count)
 {
-   return softpipe_draw_range_elements( pipe, indexBuffer,
-                                        indexSize,
-                                        0, 0xffffffff,
-                                        mode, start, count );
+   softpipe_draw_range_elements( pipe, indexBuffer,
+                                 indexSize,
+                                 0, 0xffffffff,
+                                 mode, start, count );
 }
diff --git a/src/gallium/drivers/softpipe/sp_query.c b/src/gallium/drivers/softpipe/sp_query.c
index 379cf4ad06..4ef5d9f7b1 100644
--- a/src/gallium/drivers/softpipe/sp_query.c
+++ b/src/gallium/drivers/softpipe/sp_query.c
@@ -99,6 +99,32 @@ softpipe_get_query_result(struct pipe_context *pipe,
 }
 
 
+/**
+ * Called by rendering function to check rendering is conditional.
+ * \return TRUE if we should render, FALSE if we should skip rendering
+ */
+boolean
+softpipe_check_render_cond(struct softpipe_context *sp)
+{
+   struct pipe_context *pipe = &sp->pipe;
+   boolean b, wait;
+   uint64_t result;
+
+   if (!sp->render_cond_query) {
+      return TRUE;  /* no query predicate, draw normally */
+   }
+
+   wait = (sp->render_cond_mode == PIPE_RENDER_COND_WAIT ||
+           sp->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT);
+
+   b = pipe->get_query_result(pipe, sp->render_cond_query, wait, &result);
+   if (b)
+      return result > 0;
+   else
+      return TRUE;
+}
+
+
 void softpipe_init_query_funcs(struct softpipe_context *softpipe )
 {
    softpipe->pipe.create_query = softpipe_create_query;
diff --git a/src/gallium/drivers/softpipe/sp_query.h b/src/gallium/drivers/softpipe/sp_query.h
index 05060a4575..736c033897 100644
--- a/src/gallium/drivers/softpipe/sp_query.h
+++ b/src/gallium/drivers/softpipe/sp_query.h
@@ -32,6 +32,10 @@
 #ifndef SP_QUERY_H
 #define SP_QUERY_H
 
+extern boolean
+softpipe_check_render_cond(struct softpipe_context *sp);
+
+
 struct softpipe_context;
 extern void softpipe_init_query_funcs(struct softpipe_context * );
 
diff --git a/src/gallium/drivers/softpipe/sp_state.h b/src/gallium/drivers/softpipe/sp_state.h
index 5a32d211d6..9b18dac67b 100644
--- a/src/gallium/drivers/softpipe/sp_state.h
+++ b/src/gallium/drivers/softpipe/sp_state.h
@@ -184,14 +184,14 @@ void softpipe_set_vertex_buffers(struct pipe_context *,
 void softpipe_update_derived( struct softpipe_context *softpipe );
 
 
-boolean softpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
-			     unsigned start, unsigned count);
+void softpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
+                          unsigned start, unsigned count);
 
-boolean softpipe_draw_elements(struct pipe_context *pipe,
-			       struct pipe_buffer *indexBuffer,
-			       unsigned indexSize,
-			       unsigned mode, unsigned start, unsigned count);
-boolean
+void softpipe_draw_elements(struct pipe_context *pipe,
+                            struct pipe_buffer *indexBuffer,
+                            unsigned indexSize,
+                            unsigned mode, unsigned start, unsigned count);
+void
 softpipe_draw_range_elements(struct pipe_context *pipe,
                              struct pipe_buffer *indexBuffer,
                              unsigned indexSize,
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c
index e26153b1d9..1ae8fecacf 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -2,7 +2,7 @@
  * 
  * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
- * Copyright 2008 VMware, Inc.  All rights reserved.
+ * Copyright 2008-2010 VMware, Inc.  All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
@@ -514,21 +514,15 @@ static float
 compute_lambda_1d(const struct sp_sampler_varient *samp,
                   const float s[QUAD_SIZE],
                   const float t[QUAD_SIZE],
-                  const float p[QUAD_SIZE],
-                  float lodbias)
+                  const float p[QUAD_SIZE])
 {
    const struct pipe_texture *texture = samp->texture;
    const struct pipe_sampler_state *sampler = samp->sampler;
    float dsdx = fabsf(s[QUAD_BOTTOM_RIGHT] - s[QUAD_BOTTOM_LEFT]);
    float dsdy = fabsf(s[QUAD_TOP_LEFT]     - s[QUAD_BOTTOM_LEFT]);
    float rho = MAX2(dsdx, dsdy) * texture->width0;
-   float lambda;
-
-   lambda = util_fast_log2(rho);
-   lambda += lodbias + sampler->lod_bias;
-   lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
 
-   return lambda;
+   return util_fast_log2(rho);
 }
 
 
@@ -536,8 +530,7 @@ static float
 compute_lambda_2d(const struct sp_sampler_varient *samp,
                   const float s[QUAD_SIZE],
                   const float t[QUAD_SIZE],
-                  const float p[QUAD_SIZE],
-                  float lodbias)
+                  const float p[QUAD_SIZE])
 {
    const struct pipe_texture *texture = samp->texture;
    const struct pipe_sampler_state *sampler = samp->sampler;
@@ -548,13 +541,8 @@ compute_lambda_2d(const struct sp_sampler_varient *samp,
    float maxx = MAX2(dsdx, dsdy) * texture->width0;
    float maxy = MAX2(dtdx, dtdy) * texture->height0;
    float rho  = MAX2(maxx, maxy);
-   float lambda;
 
-   lambda = util_fast_log2(rho);
-   lambda += lodbias + sampler->lod_bias;
-   lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
-
-   return lambda;
+   return util_fast_log2(rho);
 }
 
 
@@ -562,8 +550,7 @@ static float
 compute_lambda_3d(const struct sp_sampler_varient *samp,
                   const float s[QUAD_SIZE],
                   const float t[QUAD_SIZE],
-                  const float p[QUAD_SIZE],
-                  float lodbias)
+                  const float p[QUAD_SIZE])
 {
    const struct pipe_texture *texture = samp->texture;
    const struct pipe_sampler_state *sampler = samp->sampler;
@@ -576,31 +563,26 @@ compute_lambda_3d(const struct sp_sampler_varient *samp,
    float maxx = MAX2(dsdx, dsdy) * texture->width0;
    float maxy = MAX2(dtdx, dtdy) * texture->height0;
    float maxz = MAX2(dpdx, dpdy) * texture->depth0;
-   float rho, lambda;
+   float rho;
 
    rho = MAX2(maxx, maxy);
    rho = MAX2(rho, maxz);
 
-   lambda = util_fast_log2(rho);
-   lambda += lodbias + sampler->lod_bias;
-   lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
-
-   return lambda;
+   return util_fast_log2(rho);
 }
 
 
 /**
  * Compute lambda for a vertex texture sampler.
- * Since there aren't derivatives to use, just return the LOD bias.
+ * Since there aren't derivatives to use, just return 0.
  */
 static float
 compute_lambda_vert(const struct sp_sampler_varient *samp,
                     const float s[QUAD_SIZE],
                     const float t[QUAD_SIZE],
-                    const float p[QUAD_SIZE],
-                    float lodbias)
+                    const float p[QUAD_SIZE])
 {
-   return lodbias;
+   return 0.0f;
 }
 
 
@@ -769,7 +751,8 @@ img_filter_2d_linear_repeat_POT(struct tgsi_sampler *tgsi_sampler,
                                 const float s[QUAD_SIZE],
                                 const float t[QUAD_SIZE],
                                 const float p[QUAD_SIZE],
-                                float lodbias,
+                                const float c0[QUAD_SIZE],
+                                enum tgsi_sampler_control control,
                                 float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -827,7 +810,8 @@ img_filter_2d_nearest_repeat_POT(struct tgsi_sampler *tgsi_sampler,
                                  const float s[QUAD_SIZE],
                                  const float t[QUAD_SIZE],
                                  const float p[QUAD_SIZE],
-                                 float lodbias,
+                                 const float c0[QUAD_SIZE],
+                                 enum tgsi_sampler_control control,
                                  float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -866,7 +850,8 @@ img_filter_2d_nearest_clamp_POT(struct tgsi_sampler *tgsi_sampler,
                                 const float s[QUAD_SIZE],
                                 const float t[QUAD_SIZE],
                                 const float p[QUAD_SIZE],
-                                float lodbias,
+                                const float c0[QUAD_SIZE],
+                                enum tgsi_sampler_control control,
                                 float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -914,7 +899,8 @@ img_filter_1d_nearest(struct tgsi_sampler *tgsi_sampler,
                         const float s[QUAD_SIZE],
                         const float t[QUAD_SIZE],
                         const float p[QUAD_SIZE],
-                        float lodbias,
+                        const float c0[QUAD_SIZE],
+                        enum tgsi_sampler_control control,
                         float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -949,7 +935,8 @@ img_filter_2d_nearest(struct tgsi_sampler *tgsi_sampler,
                       const float s[QUAD_SIZE],
                       const float t[QUAD_SIZE],
                       const float p[QUAD_SIZE],
-                      float lodbias,
+                      const float c0[QUAD_SIZE],
+                      enum tgsi_sampler_control control,
                       float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -996,7 +983,8 @@ img_filter_cube_nearest(struct tgsi_sampler *tgsi_sampler,
                         const float s[QUAD_SIZE],
                         const float t[QUAD_SIZE],
                         const float p[QUAD_SIZE],
-                        float lodbias,
+                        const float c0[QUAD_SIZE],
+                        enum tgsi_sampler_control control,
                         float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -1035,7 +1023,8 @@ img_filter_3d_nearest(struct tgsi_sampler *tgsi_sampler,
                       const float s[QUAD_SIZE],
                       const float t[QUAD_SIZE],
                       const float p[QUAD_SIZE],
-                      float lodbias,
+                      const float c0[QUAD_SIZE],
+                      enum tgsi_sampler_control control,
                       float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -1076,7 +1065,8 @@ img_filter_1d_linear(struct tgsi_sampler *tgsi_sampler,
                      const float s[QUAD_SIZE],
                      const float t[QUAD_SIZE],
                      const float p[QUAD_SIZE],
-                     float lodbias,
+                     const float c0[QUAD_SIZE],
+                     enum tgsi_sampler_control control,
                      float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -1115,7 +1105,8 @@ img_filter_2d_linear(struct tgsi_sampler *tgsi_sampler,
                      const float s[QUAD_SIZE],
                      const float t[QUAD_SIZE],
                      const float p[QUAD_SIZE],
-                     float lodbias,
+                     const float c0[QUAD_SIZE],
+                     enum tgsi_sampler_control control,
                      float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -1161,7 +1152,8 @@ img_filter_cube_linear(struct tgsi_sampler *tgsi_sampler,
                        const float s[QUAD_SIZE],
                        const float t[QUAD_SIZE],
                        const float p[QUAD_SIZE],
-                       float lodbias,
+                       const float c0[QUAD_SIZE],
+                       enum tgsi_sampler_control control,
                        float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -1209,7 +1201,8 @@ img_filter_3d_linear(struct tgsi_sampler *tgsi_sampler,
                      const float s[QUAD_SIZE],
                      const float t[QUAD_SIZE],
                      const float p[QUAD_SIZE],
-                     float lodbias,
+                     const float c0[QUAD_SIZE],
+                     enum tgsi_sampler_control control,
                      float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -1261,29 +1254,60 @@ img_filter_3d_linear(struct tgsi_sampler *tgsi_sampler,
 }
 
 
+/* Calculate level of detail for every fragment.
+ * Note that lambda has already been biased by global LOD bias.
+ */
+static INLINE void
+compute_lod(const struct pipe_sampler_state *sampler,
+            const float biased_lambda,
+            const float lodbias[QUAD_SIZE],
+            float lod[QUAD_SIZE])
+{
+   uint i;
+
+   for (i = 0; i < QUAD_SIZE; i++) {
+      lod[i] = biased_lambda + lodbias[i];
+      lod[i] = CLAMP(lod[i], sampler->min_lod, sampler->max_lod);
+   }
+}
+
+
 static void
 mip_filter_linear(struct tgsi_sampler *tgsi_sampler,
                   const float s[QUAD_SIZE],
                   const float t[QUAD_SIZE],
                   const float p[QUAD_SIZE],
-                  float lodbias,
+                  const float c0[QUAD_SIZE],
+                  enum tgsi_sampler_control control,
                   float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
    const struct pipe_texture *texture = samp->texture;
    int level0;
    float lambda;
+   float lod[QUAD_SIZE];
+
+   if (control == tgsi_sampler_lod_bias) {
+      lambda = samp->compute_lambda(samp, s, t, p) + samp->sampler->lod_bias;
+      compute_lod(samp->sampler, lambda, c0, lod);
+   } else {
+      assert(control == tgsi_sampler_lod_explicit);
 
-   lambda = samp->compute_lambda(samp, s, t, p, lodbias);
+      memcpy(lod, c0, sizeof(lod));
+   }
+
+   /* XXX: Take into account all lod values.
+    */
+   lambda = lod[0];
    level0 = (int)lambda;
 
    if (lambda < 0.0) { 
       samp->level = 0;
-      samp->mag_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+      samp->mag_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
    }
    else if (level0 >= texture->last_level) {
       samp->level = texture->last_level;
-      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+      samp->min_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
    }
    else {
       float levelBlend = lambda - level0;
@@ -1292,10 +1316,10 @@ mip_filter_linear(struct tgsi_sampler *tgsi_sampler,
       int c,j;
 
       samp->level = level0;
-      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba0 );
+      samp->min_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba0);
 
       samp->level = level0+1;
-      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba1 );
+      samp->min_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba1);
 
       for (j = 0; j < QUAD_SIZE; j++) {
          for (c = 0; c < 4; c++) {
@@ -1311,23 +1335,36 @@ mip_filter_nearest(struct tgsi_sampler *tgsi_sampler,
                    const float s[QUAD_SIZE],
                    const float t[QUAD_SIZE],
                    const float p[QUAD_SIZE],
-                   float lodbias,
+                   const float c0[QUAD_SIZE],
+                   enum tgsi_sampler_control control,
                    float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
    const struct pipe_texture *texture = samp->texture;
    float lambda;
+   float lod[QUAD_SIZE];
 
-   lambda = samp->compute_lambda(samp, s, t, p, lodbias);
+   if (control == tgsi_sampler_lod_bias) {
+      lambda = samp->compute_lambda(samp, s, t, p) + samp->sampler->lod_bias;
+      compute_lod(samp->sampler, lambda, c0, lod);
+   } else {
+      assert(control == tgsi_sampler_lod_explicit);
+
+      memcpy(lod, c0, sizeof(lod));
+   }
+
+   /* XXX: Take into account all lod values.
+    */
+   lambda = lod[0];
 
    if (lambda < 0.0) { 
       samp->level = 0;
-      samp->mag_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+      samp->mag_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
    }
    else {
       samp->level = (int)(lambda + 0.5) ;
       samp->level = MIN2(samp->level, (int)texture->last_level);
-      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+      samp->min_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
    }
 
 #if 0
@@ -1345,17 +1382,32 @@ mip_filter_none(struct tgsi_sampler *tgsi_sampler,
                 const float s[QUAD_SIZE],
                 const float t[QUAD_SIZE],
                 const float p[QUAD_SIZE],
-                float lodbias,
+                const float c0[QUAD_SIZE],
+                enum tgsi_sampler_control control,
                 float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
-   float lambda = samp->compute_lambda(samp, s, t, p, lodbias);
+   float lambda;
+   float lod[QUAD_SIZE];
+
+   if (control == tgsi_sampler_lod_bias) {
+      lambda = samp->compute_lambda(samp, s, t, p) + samp->sampler->lod_bias;
+      compute_lod(samp->sampler, lambda, c0, lod);
+   } else {
+      assert(control == tgsi_sampler_lod_explicit);
+
+      memcpy(lod, c0, sizeof(lod));
+   }
+
+   /* XXX: Take into account all lod values.
+    */
+   lambda = lod[0];
 
    if (lambda < 0.0) { 
-      samp->mag_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+      samp->mag_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
    }
    else {
-      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+      samp->min_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
    }
 }
 
@@ -1371,15 +1423,28 @@ mip_filter_linear_2d_linear_repeat_POT(
    const float s[QUAD_SIZE],
    const float t[QUAD_SIZE],
    const float p[QUAD_SIZE],
-   float lodbias,
+   const float c0[QUAD_SIZE],
+   enum tgsi_sampler_control control,
    float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
    const struct pipe_texture *texture = samp->texture;
    int level0;
    float lambda;
+   float lod[QUAD_SIZE];
 
-   lambda = compute_lambda_2d(samp, s, t, p, lodbias);
+   if (control == tgsi_sampler_lod_bias) {
+      lambda = samp->compute_lambda(samp, s, t, p) + samp->sampler->lod_bias;
+      compute_lod(samp->sampler, lambda, c0, lod);
+   } else {
+      assert(control == tgsi_sampler_lod_explicit);
+
+      memcpy(lod, c0, sizeof(lod));
+   }
+
+   /* XXX: Take into account all lod values.
+    */
+   lambda = lod[0];
    level0 = (int)lambda;
 
    /* Catches both negative and large values of level0:
@@ -1390,7 +1455,7 @@ mip_filter_linear_2d_linear_repeat_POT(
       else
          samp->level = texture->last_level;
 
-      img_filter_2d_linear_repeat_POT( tgsi_sampler, s, t, p, 0, rgba );
+      img_filter_2d_linear_repeat_POT(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
    }
    else {
       float levelBlend = lambda - level0;
@@ -1399,10 +1464,10 @@ mip_filter_linear_2d_linear_repeat_POT(
       int c,j;
 
       samp->level = level0;
-      img_filter_2d_linear_repeat_POT( tgsi_sampler, s, t, p, 0, rgba0 );
+      img_filter_2d_linear_repeat_POT(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba0);
 
       samp->level = level0+1;
-      img_filter_2d_linear_repeat_POT( tgsi_sampler, s, t, p, 0, rgba1 );
+      img_filter_2d_linear_repeat_POT(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba1);
 
       for (j = 0; j < QUAD_SIZE; j++) {
          for (c = 0; c < 4; c++) {
@@ -1422,7 +1487,8 @@ sample_compare(struct tgsi_sampler *tgsi_sampler,
                const float s[QUAD_SIZE],
                const float t[QUAD_SIZE],
                const float p[QUAD_SIZE],
-               float lodbias,
+               const float c0[QUAD_SIZE],
+               enum tgsi_sampler_control control,
                float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -1430,7 +1496,7 @@ sample_compare(struct tgsi_sampler *tgsi_sampler,
    int j, k0, k1, k2, k3;
    float val;
 
-   samp->mip_filter( tgsi_sampler, s, t, p, lodbias, rgba );
+   samp->mip_filter(tgsi_sampler, s, t, p, c0, control, rgba);
 
    /**
     * Compare texcoord 'p' (aka R) against texture value 'rgba[0]'
@@ -1508,7 +1574,8 @@ sample_cube(struct tgsi_sampler *tgsi_sampler,
             const float s[QUAD_SIZE],
             const float t[QUAD_SIZE],
             const float p[QUAD_SIZE],
-            float lodbias,
+            const float c0[QUAD_SIZE],
+            enum tgsi_sampler_control control,
             float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -1589,7 +1656,7 @@ sample_cube(struct tgsi_sampler *tgsi_sampler,
     * is not active, this will point somewhere deeper into the
     * pipeline, eg. to mip_filter or even img_filter.
     */
-   samp->compare(tgsi_sampler, ssss, tttt, NULL, lodbias, rgba);
+   samp->compare(tgsi_sampler, ssss, tttt, NULL, c0, control, rgba);
 }
 
 
@@ -1862,7 +1929,7 @@ sp_create_sampler_varient( const struct pipe_sampler_state *sampler,
       break;
    }
 
-   if (sampler->compare_mode != FALSE) {
+   if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) {
       samp->compare = sample_compare;
    }
    else {
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.h b/src/gallium/drivers/softpipe/sp_tex_sample.h
index b0797711d3..b6e66c998a 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.h
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.h
@@ -2,6 +2,7 @@
  * 
  * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
+ * Copyright 2010 VMware, Inc.  All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
@@ -46,14 +47,14 @@ typedef void (*wrap_linear_func)(const float s[4],
 typedef float (*compute_lambda_func)(const struct sp_sampler_varient *sampler,
                                      const float s[QUAD_SIZE],
                                      const float t[QUAD_SIZE],
-                                     const float p[QUAD_SIZE],
-                                     float lodbias);
+                                     const float p[QUAD_SIZE]);
 
 typedef void (*filter_func)(struct tgsi_sampler *tgsi_sampler,
                             const float s[QUAD_SIZE],
                             const float t[QUAD_SIZE],
                             const float p[QUAD_SIZE],
-                            float lodbias,
+                            const float c0[QUAD_SIZE],
+                            enum tgsi_sampler_control control,
                             float rgba[NUM_CHANNELS][QUAD_SIZE]);
 
 
diff --git a/src/gallium/drivers/svga/svga_context.c b/src/gallium/drivers/svga/svga_context.c
index c3de12b4a3..af99c9de37 100644
--- a/src/gallium/drivers/svga/svga_context.c
+++ b/src/gallium/drivers/svga/svga_context.c
@@ -29,6 +29,7 @@
 #include "pipe/p_inlines.h"
 #include "pipe/p_screen.h"
 #include "util/u_memory.h"
+#include "util/u_bitmask.h"
 #include "util/u_upload_mgr.h"
 
 #include "svga_context.h"
@@ -61,6 +62,9 @@ static void svga_destroy( struct pipe_context *pipe )
    u_upload_destroy( svga->upload_vb );
    u_upload_destroy( svga->upload_ib );
 
+   util_bitmask_destroy( svga->vs_bm );
+   util_bitmask_destroy( svga->fs_bm );
+
    for(shader = 0; shader < PIPE_SHADER_TYPES; ++shader)
       pipe_buffer_reference( &svga->curr.cb[shader], NULL );
 
@@ -130,7 +134,7 @@ struct pipe_context *svga_context_create( struct pipe_screen *screen )
 
    svga = CALLOC_STRUCT(svga_context);
    if (svga == NULL)
-      goto error1;
+      goto no_svga;
 
    svga->pipe.winsys = screen->winsys;
    svga->pipe.screen = screen;
@@ -142,7 +146,7 @@ struct pipe_context *svga_context_create( struct pipe_screen *screen )
 
    svga->swc = svgascreen->sws->context_create(svgascreen->sws);
    if(!svga->swc)
-      goto error2;
+      goto no_swc;
 
    svga_init_blend_functions(svga);
    svga_init_blit_functions(svga);
@@ -165,32 +169,40 @@ struct pipe_context *svga_context_create( struct pipe_screen *screen )
    svga->debug.disable_shader = debug_get_num_option("SVGA_DISABLE_SHADER", ~0);
 
    if (!svga_init_swtnl(svga))
-      goto error3;
+      goto no_swtnl;
+
+   svga->fs_bm = util_bitmask_create();
+   if (svga->fs_bm == NULL)
+      goto no_fs_bm;
+
+   svga->vs_bm = util_bitmask_create();
+   if (svga->vs_bm == NULL)
+      goto no_vs_bm;
 
    svga->upload_ib = u_upload_create( svga->pipe.screen,
                                       32 * 1024,
                                       16,
                                       PIPE_BUFFER_USAGE_INDEX );
    if (svga->upload_ib == NULL)
-      goto error4;
+      goto no_upload_ib;
 
    svga->upload_vb = u_upload_create( svga->pipe.screen,
                                       128 * 1024,
                                       16,
                                       PIPE_BUFFER_USAGE_VERTEX );
    if (svga->upload_vb == NULL)
-      goto error5;
+      goto no_upload_vb;
 
    svga->hwtnl = svga_hwtnl_create( svga,
                                     svga->upload_ib,
                                     svga->swc );
    if (svga->hwtnl == NULL)
-      goto error6;
+      goto no_hwtnl;
 
 
    ret = svga_emit_initial_state( svga );
    if (ret)
-      goto error7;
+      goto no_state;
    
    /* Avoid shortcircuiting state with initial value of zero.
     */
@@ -209,19 +221,23 @@ struct pipe_context *svga_context_create( struct pipe_screen *screen )
 
    return &svga->pipe;
 
-error7:
+no_state:
    svga_hwtnl_destroy( svga->hwtnl );
-error6:
+no_hwtnl:
    u_upload_destroy( svga->upload_vb );
-error5:
+no_upload_vb:
    u_upload_destroy( svga->upload_ib );
-error4:
+no_upload_ib:
+   util_bitmask_destroy( svga->vs_bm );
+no_vs_bm:
+   util_bitmask_destroy( svga->fs_bm );
+no_fs_bm:
    svga_destroy_swtnl(svga);
-error3:
+no_swtnl:
    svga->swc->destroy(svga->swc);
-error2:
+no_swc:
    FREE(svga);
-error1:
+no_svga:
    return NULL;
 }
 
diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h
index 0885d9ca74..66259fd010 100644
--- a/src/gallium/drivers/svga/svga_context.h
+++ b/src/gallium/drivers/svga/svga_context.h
@@ -41,6 +41,7 @@
 struct draw_vertex_shader;
 struct svga_shader_result;
 struct SVGACmdMemory;
+struct util_bitmask;
 struct u_upload_mgr;
 
 
@@ -265,8 +266,6 @@ struct svga_hw_draw_state
    unsigned ts[16][TS_MAX];
    float cb[PIPE_SHADER_TYPES][CB_MAX][4];
 
-   unsigned shader_id[PIPE_SHADER_TYPES];
-   
    struct svga_shader_result *fs;
    struct svga_shader_result *vs;
    struct svga_hw_view_state views[PIPE_MAX_SAMPLERS];
@@ -319,12 +318,14 @@ struct svga_context
       boolean new_vdecl;
    } swtnl;
 
+   /* Bitmask of used shader IDs */
+   struct util_bitmask *fs_bm;
+   struct util_bitmask *vs_bm;
+
    struct {
       unsigned dirty[4];
 
       unsigned texture_timestamp;
-      unsigned next_fs_id;
-      unsigned next_vs_id;
 
       /* Internally generated shaders:
        */
diff --git a/src/gallium/drivers/svga/svga_draw.c b/src/gallium/drivers/svga/svga_draw.c
index 8db40d0fd5..ca73cf9d5a 100644
--- a/src/gallium/drivers/svga/svga_draw.c
+++ b/src/gallium/drivers/svga/svga_draw.c
@@ -164,7 +164,8 @@ svga_hwtnl_flush( struct svga_hwtnl *hwtnl )
       }
 
       SVGA_DBG(DEBUG_DMA, "draw to sid %p, %d prims\n",
-               svga_surface(svga->curr.framebuffer.cbufs[0])->handle,
+               svga->curr.framebuffer.cbufs[0] ?
+               svga_surface(svga->curr.framebuffer.cbufs[0])->handle : NULL,
                hwtnl->cmd.prim_count);
 
       ret = SVGA3D_BeginDrawPrimitives(swc, 
diff --git a/src/gallium/drivers/svga/svga_pipe_draw.c b/src/gallium/drivers/svga/svga_pipe_draw.c
index 71a552862e..0f24ef4ee8 100644
--- a/src/gallium/drivers/svga/svga_pipe_draw.c
+++ b/src/gallium/drivers/svga/svga_pipe_draw.c
@@ -149,7 +149,7 @@ retry:
 
 
 
-static boolean
+static void
 svga_draw_range_elements( struct pipe_context *pipe,
                           struct pipe_buffer *index_buffer,
                           unsigned index_size,
@@ -162,7 +162,7 @@ svga_draw_range_elements( struct pipe_context *pipe,
    enum pipe_error ret = 0;
 
    if (!u_trim_pipe_prim( prim, &count ))
-      return TRUE;
+      return;
 
    /*
     * Mark currently bound target surfaces as dirty
@@ -183,7 +183,7 @@ svga_draw_range_elements( struct pipe_context *pipe,
 #ifdef DEBUG
    if (svga->curr.vs->base.id == svga->debug.disable_shader ||
        svga->curr.fs->base.id == svga->debug.disable_shader)
-      return 0;
+      return;
 #endif
 
    if (svga->state.sw.need_swtnl)
@@ -225,31 +225,29 @@ svga_draw_range_elements( struct pipe_context *pipe,
       svga_hwtnl_flush_retry( svga );
       svga_context_flush(svga, NULL);
    }
-
-   return ret == PIPE_OK;
 }
 
 
-static boolean 
+static void
 svga_draw_elements( struct pipe_context *pipe,
                     struct pipe_buffer *index_buffer,
                     unsigned index_size,
                     unsigned prim, unsigned start, unsigned count)
 {
-   return svga_draw_range_elements( pipe, index_buffer,
-                                    index_size,
-                                    0, 0xffffffff,
-                                    prim, start, count );
+   svga_draw_range_elements( pipe, index_buffer,
+                             index_size,
+                             0, 0xffffffff,
+                             prim, start, count );
 }
 
-static boolean 
+static void
 svga_draw_arrays( struct pipe_context *pipe,
                   unsigned prim, unsigned start, unsigned count)
 {
-   return svga_draw_range_elements(pipe, NULL, 0, 
-                                   start, start + count - 1, 
-                                   prim, 
-                                   start, count);
+   svga_draw_range_elements(pipe, NULL, 0, 
+                            start, start + count - 1, 
+                            prim, 
+                            start, count);
 }
 
 
diff --git a/src/gallium/drivers/svga/svga_pipe_fs.c b/src/gallium/drivers/svga/svga_pipe_fs.c
index e3be840d92..5f1213e46a 100644
--- a/src/gallium/drivers/svga/svga_pipe_fs.c
+++ b/src/gallium/drivers/svga/svga_pipe_fs.c
@@ -26,6 +26,7 @@
 #include "pipe/p_inlines.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "util/u_bitmask.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_text.h"
 
@@ -107,7 +108,16 @@ void svga_delete_fs_state(struct pipe_context *pipe, void *shader)
          assert(ret == PIPE_OK);
       }
 
+      util_bitmask_clear( svga->fs_bm, result->id );
+
       svga_destroy_shader_result( result );
+
+      /*
+       * Remove stale references to this result to ensure a new result on the
+       * same address will be detected as a change.
+       */
+      if(result == svga->state.hw_draw.fs)
+         svga->state.hw_draw.fs = NULL;
    }
 
    FREE((void *)fs->base.tokens);
diff --git a/src/gallium/drivers/svga/svga_pipe_sampler.c b/src/gallium/drivers/svga/svga_pipe_sampler.c
index 78053e755e..460a101f8c 100644
--- a/src/gallium/drivers/svga/svga_pipe_sampler.c
+++ b/src/gallium/drivers/svga/svga_pipe_sampler.c
@@ -76,7 +76,6 @@ static INLINE unsigned translate_img_filter( unsigned filter )
    switch (filter) {
    case PIPE_TEX_FILTER_NEAREST: return SVGA3D_TEX_FILTER_NEAREST;
    case PIPE_TEX_FILTER_LINEAR:  return SVGA3D_TEX_FILTER_LINEAR;
-   case PIPE_TEX_FILTER_ANISO:   return SVGA3D_TEX_FILTER_ANISOTROPIC;
    default:
       assert(0);
       return SVGA3D_TEX_FILTER_NEAREST;
@@ -107,6 +106,8 @@ svga_create_sampler_state(struct pipe_context *pipe,
    cso->magfilter = translate_img_filter( sampler->mag_img_filter );
    cso->minfilter = translate_img_filter( sampler->min_img_filter );
    cso->aniso_level = MAX2( (unsigned) sampler->max_anisotropy, 1 );
+   if(cso->aniso_level != 1)
+      cso->magfilter = cso->minfilter = SVGA3D_TEX_FILTER_ANISOTROPIC;
    cso->lod_bias = sampler->lod_bias;
    cso->addressu = translate_wrap_mode(sampler->wrap_s);
    cso->addressv = translate_wrap_mode(sampler->wrap_t);
diff --git a/src/gallium/drivers/svga/svga_pipe_vs.c b/src/gallium/drivers/svga/svga_pipe_vs.c
index c104c41f5f..7e6ab576ad 100644
--- a/src/gallium/drivers/svga/svga_pipe_vs.c
+++ b/src/gallium/drivers/svga/svga_pipe_vs.c
@@ -27,6 +27,7 @@
 #include "pipe/p_inlines.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "util/u_bitmask.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_text.h"
 
@@ -172,7 +173,16 @@ static void svga_delete_vs_state(struct pipe_context *pipe, void *shader)
          assert(ret == PIPE_OK);
       }
 
+      util_bitmask_clear( svga->vs_bm, result->id );
+
       svga_destroy_shader_result( result );
+
+      /*
+       * Remove stale references to this result to ensure a new result on the
+       * same address will be detected as a change.
+       */
+      if(result == svga->state.hw_draw.vs)
+         svga->state.hw_draw.vs = NULL;
    }
 
    FREE((void *)vs->base.tokens);
diff --git a/src/gallium/drivers/svga/svga_state_fs.c b/src/gallium/drivers/svga/svga_state_fs.c
index 6ec38ed3e4..d29f3762d2 100644
--- a/src/gallium/drivers/svga/svga_state_fs.c
+++ b/src/gallium/drivers/svga/svga_state_fs.c
@@ -26,6 +26,7 @@
 #include "pipe/p_inlines.h"
 #include "pipe/p_defines.h"
 #include "util/u_math.h"
+#include "util/u_bitmask.h"
 
 #include "svga_context.h"
 #include "svga_state.h"
@@ -39,8 +40,13 @@
 static INLINE int compare_fs_keys( const struct svga_fs_compile_key *a,
                                    const struct svga_fs_compile_key *b )
 {
-   unsigned keysize = svga_fs_key_size( a );
-   return memcmp( a, b, keysize );
+   unsigned keysize_a = svga_fs_key_size( a );
+   unsigned keysize_b = svga_fs_key_size( b );
+
+   if (keysize_a != keysize_b) {
+      return (int)(keysize_a - keysize_b);
+   }
+   return memcmp( a, b, keysize_a );
 }
 
 
@@ -66,7 +72,7 @@ static enum pipe_error compile_fs( struct svga_context *svga,
                                    struct svga_shader_result **out_result )
 {
    struct svga_shader_result *result;
-   enum pipe_error ret;
+   enum pipe_error ret = PIPE_ERROR;
 
    result = svga_translate_fragment_program( fs, key );
    if (result == NULL) {
@@ -74,9 +80,12 @@ static enum pipe_error compile_fs( struct svga_context *svga,
       goto fail;
    }
 
+   result->id = util_bitmask_add(svga->fs_bm);
+   if(result->id == UTIL_BITMASK_INVALID_INDEX)
+      goto fail;
 
    ret = SVGA3D_DefineShader(svga->swc, 
-                             svga->state.next_fs_id,
+                             result->id,
                              SVGA3D_SHADERTYPE_PS,
                              result->tokens, 
                              result->nr_tokens * sizeof result->tokens[0]);
@@ -84,14 +93,16 @@ static enum pipe_error compile_fs( struct svga_context *svga,
       goto fail;
 
    *out_result = result;
-   result->id = svga->state.next_fs_id++;
    result->next = fs->base.results;
    fs->base.results = result;
    return PIPE_OK;
 
 fail:
-   if (result)
+   if (result) {
+      if (result->id != UTIL_BITMASK_INVALID_INDEX)
+         util_bitmask_clear( svga->fs_bm, result->id );
       svga_destroy_shader_result( result );
+   }
    return ret;
 }
 
@@ -116,7 +127,7 @@ fail:
  */
 static int emit_white_fs( struct svga_context *svga )
 {
-   int ret;
+   int ret = PIPE_ERROR;
 
    /* ps_3_0
     * def c0, 1.000000, 0.000000, 0.000000, 1.000000
@@ -137,16 +148,26 @@ static int emit_white_fs( struct svga_context *svga )
       0x0000ffff,
    };
 
+   assert(SVGA3D_INVALID_ID == UTIL_BITMASK_INVALID_INDEX);
+   svga->state.white_fs_id = util_bitmask_add(svga->fs_bm);
+   if(svga->state.white_fs_id == SVGA3D_INVALID_ID)
+      goto no_fs_id;
+
    ret = SVGA3D_DefineShader(svga->swc, 
-                             svga->state.next_fs_id,
+                             svga->state.white_fs_id,
                              SVGA3D_SHADERTYPE_PS,
                              white_tokens, 
                              sizeof(white_tokens));
    if (ret)
-      return ret;
+      goto no_definition;
 
-   svga->state.white_fs_id = svga->state.next_fs_id++;
    return 0;
+
+no_definition:
+   util_bitmask_clear(svga->fs_bm, svga->state.white_fs_id);
+   svga->state.white_fs_id = SVGA3D_INVALID_ID;
+no_fs_id:
+   return ret;
 }
 
 
@@ -251,15 +272,14 @@ static int emit_hw_fs( struct svga_context *svga,
 
    assert(id != SVGA3D_INVALID_ID);
 
-   if (id != svga->state.hw_draw.shader_id[PIPE_SHADER_FRAGMENT]) {
-      ret = SVGA3D_SetShader(svga->swc, 
-                             SVGA3D_SHADERTYPE_PS, 
+   if (result != svga->state.hw_draw.fs) {
+      ret = SVGA3D_SetShader(svga->swc,
+                             SVGA3D_SHADERTYPE_PS,
                              id );
       if (ret)
          return ret;
 
       svga->dirty |= SVGA_NEW_FS_RESULT;
-      svga->state.hw_draw.shader_id[PIPE_SHADER_FRAGMENT] = id;
       svga->state.hw_draw.fs = result;      
    }
 
diff --git a/src/gallium/drivers/svga/svga_state_vs.c b/src/gallium/drivers/svga/svga_state_vs.c
index 44b7ceb4fa..ae1e77e7d4 100644
--- a/src/gallium/drivers/svga/svga_state_vs.c
+++ b/src/gallium/drivers/svga/svga_state_vs.c
@@ -27,6 +27,7 @@
 #include "pipe/p_defines.h"
 #include "util/u_format.h"
 #include "util/u_math.h"
+#include "util/u_bitmask.h"
 #include "translate/translate.h"
 
 #include "svga_context.h"
@@ -78,8 +79,12 @@ static enum pipe_error compile_vs( struct svga_context *svga,
       goto fail;
    }
 
+   result->id = util_bitmask_add(svga->vs_bm);
+   if(result->id == UTIL_BITMASK_INVALID_INDEX)
+      goto fail;
+
    ret = SVGA3D_DefineShader(svga->swc, 
-                             svga->state.next_vs_id,
+                             result->id,
                              SVGA3D_SHADERTYPE_VS,
                              result->tokens, 
                              result->nr_tokens * sizeof result->tokens[0]);
@@ -87,14 +92,16 @@ static enum pipe_error compile_vs( struct svga_context *svga,
       goto fail;
 
    *out_result = result;
-   result->id = svga->state.next_vs_id++;
    result->next = vs->base.results;
    vs->base.results = result;
    return PIPE_OK;
 
 fail:
-   if (result)
+   if (result) {
+      if (result->id != UTIL_BITMASK_INVALID_INDEX)
+         util_bitmask_clear( svga->vs_bm, result->id );
       svga_destroy_shader_result( result );
+   }
    return ret;
 }
 
@@ -142,15 +149,14 @@ static int emit_hw_vs( struct svga_context *svga,
       id = result->id;
    }
 
-   if (id != svga->state.hw_draw.shader_id[PIPE_SHADER_VERTEX]) {
-      ret = SVGA3D_SetShader(svga->swc, 
-                             SVGA3D_SHADERTYPE_VS, 
+   if (result != svga->state.hw_draw.vs) {
+      ret = SVGA3D_SetShader(svga->swc,
+                             SVGA3D_SHADERTYPE_VS,
                              id );
       if (ret)
          return ret;
 
       svga->dirty |= SVGA_NEW_VS_RESULT;
-      svga->state.hw_draw.shader_id[PIPE_SHADER_VERTEX] = id;
       svga->state.hw_draw.vs = result;      
    }
 
diff --git a/src/gallium/drivers/svga/svga_tgsi.c b/src/gallium/drivers/svga/svga_tgsi.c
index b8ef137c01..0cd620189b 100644
--- a/src/gallium/drivers/svga/svga_tgsi.c
+++ b/src/gallium/drivers/svga/svga_tgsi.c
@@ -31,6 +31,7 @@
 #include "tgsi/tgsi_dump.h"
 #include "tgsi/tgsi_scan.h"
 #include "util/u_memory.h"
+#include "util/u_bitmask.h"
 
 #include "svgadump/svga_shader_dump.h"
 
@@ -221,6 +222,7 @@ svga_tgsi_translate( const struct svga_shader *shader,
    result->tokens = (const unsigned *)emit.buf;
    result->nr_tokens = (emit.ptr - emit.buf) / sizeof(unsigned);
    memcpy(&result->key, &key, sizeof key);
+   result->id = UTIL_BITMASK_INVALID_INDEX;
 
    if (SVGA_DEBUG & DEBUG_TGSI) 
    {
diff --git a/src/gallium/drivers/svga/svga_tgsi.h b/src/gallium/drivers/svga/svga_tgsi.h
index 896c90a89a..737a2213af 100644
--- a/src/gallium/drivers/svga/svga_tgsi.h
+++ b/src/gallium/drivers/svga/svga_tgsi.h
@@ -39,26 +39,24 @@ struct tgsi_token;
 
 struct svga_vs_compile_key
 {
-   ubyte need_prescale:1;
-   ubyte allow_psiz:1;
    unsigned zero_stride_vertex_elements;
-   ubyte num_zero_stride_vertex_elements:6;
+   unsigned need_prescale:1;
+   unsigned allow_psiz:1;
+   unsigned num_zero_stride_vertex_elements:6;
 };
 
 struct svga_fs_compile_key
 {
-   boolean light_twoside:1;
-   boolean front_cw:1;
-   ubyte num_textures;
-   ubyte num_unnormalized_coords;
+   unsigned light_twoside:1;
+   unsigned front_cw:1;
+   unsigned num_textures:8;
+   unsigned num_unnormalized_coords:8;
    struct {
-      ubyte compare_mode       : 1;
-      ubyte compare_func       : 3;
-      ubyte unnormalized       : 1;
-
-      ubyte width_height_idx   : 7;
-
-      ubyte texture_target;
+      unsigned compare_mode:1;
+      unsigned compare_func:3;
+      unsigned unnormalized:1;
+      unsigned width_height_idx:7;
+      unsigned texture_target:8;
    } tex[PIPE_MAX_SAMPLERS];
 };
 
@@ -121,8 +119,7 @@ static INLINE unsigned svga_vs_key_size( const struct svga_vs_compile_key *key )
 
 static INLINE unsigned svga_fs_key_size( const struct svga_fs_compile_key *key )
 {
-   return (const char *)&key->tex[key->num_textures].texture_target -
-      (const char *)key;
+   return (const char *)&key->tex[key->num_textures] - (const char *)key;
 }
 
 struct svga_shader_result *
diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c
index 1670da8bfa..dc5eb8fc60 100644
--- a/src/gallium/drivers/svga/svga_tgsi_insn.c
+++ b/src/gallium/drivers/svga/svga_tgsi_insn.c
@@ -2109,7 +2109,7 @@ static boolean svga_emit_instruction( struct svga_shader_emitter *emit,
    case TGSI_OPCODE_I2F:
    case TGSI_OPCODE_NOT:
    case TGSI_OPCODE_SHL:
-   case TGSI_OPCODE_SHR:
+   case TGSI_OPCODE_ISHR:
    case TGSI_OPCODE_XOR:
       return FALSE;
 
diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c
index ad47a56fba..075e4f9a0b 100644
--- a/src/gallium/drivers/trace/tr_context.c
+++ b/src/gallium/drivers/trace/tr_context.c
@@ -161,16 +161,15 @@ trace_context_draw_block(struct trace_context *tr_ctx, int flag)
    pipe_mutex_unlock(tr_ctx->draw_mutex);
 }
 
-static INLINE boolean
+static INLINE void
 trace_context_draw_arrays(struct pipe_context *_pipe,
                           unsigned mode, unsigned start, unsigned count)
 {
    struct trace_context *tr_ctx = trace_context(_pipe);
    struct pipe_context *pipe = tr_ctx->pipe;
-   boolean result;
 
    if (tr_ctx->curr.fs->disabled || tr_ctx->curr.vs->disabled)
-      return 0;
+      return;
 
    trace_context_draw_block(tr_ctx, 1);
 
@@ -181,19 +180,15 @@ trace_context_draw_arrays(struct pipe_context *_pipe,
    trace_dump_arg(uint, start);
    trace_dump_arg(uint, count);
 
-   result = pipe->draw_arrays(pipe, mode, start, count);
-
-   trace_dump_ret(bool, result);
+   pipe->draw_arrays(pipe, mode, start, count);
 
    trace_dump_call_end();
 
    trace_context_draw_block(tr_ctx, 2);
-
-   return result;
 }
 
 
-static INLINE boolean
+static INLINE void
 trace_context_draw_elements(struct pipe_context *_pipe,
                           struct pipe_buffer *_indexBuffer,
                           unsigned indexSize,
@@ -203,10 +198,9 @@ trace_context_draw_elements(struct pipe_context *_pipe,
    struct trace_buffer *tr_buf = trace_buffer(_indexBuffer);
    struct pipe_context *pipe = tr_ctx->pipe;
    struct pipe_buffer *indexBuffer = tr_buf->buffer;
-   boolean result;
 
    if (tr_ctx->curr.fs->disabled || tr_ctx->curr.vs->disabled)
-      return 0;
+      return;
 
    trace_context_draw_block(tr_ctx, 1);
 
@@ -221,19 +215,15 @@ trace_context_draw_elements(struct pipe_context *_pipe,
    trace_dump_arg(uint, start);
    trace_dump_arg(uint, count);
 
-   result = pipe->draw_elements(pipe, indexBuffer, indexSize, mode, start, count);
-
-   trace_dump_ret(bool, result);
+   pipe->draw_elements(pipe, indexBuffer, indexSize, mode, start, count);
 
    trace_dump_call_end();
 
    trace_context_draw_block(tr_ctx, 2);
-
-   return result;
 }
 
 
-static INLINE boolean
+static INLINE void
 trace_context_draw_range_elements(struct pipe_context *_pipe,
                                   struct pipe_buffer *_indexBuffer,
                                   unsigned indexSize,
@@ -247,10 +237,9 @@ trace_context_draw_range_elements(struct pipe_context *_pipe,
    struct trace_buffer *tr_buf = trace_buffer(_indexBuffer);
    struct pipe_context *pipe = tr_ctx->pipe;
    struct pipe_buffer *indexBuffer = tr_buf->buffer;
-   boolean result;
 
    if (tr_ctx->curr.fs->disabled || tr_ctx->curr.vs->disabled)
-      return 0;
+      return;
 
    trace_context_draw_block(tr_ctx, 1);
 
@@ -267,18 +256,14 @@ trace_context_draw_range_elements(struct pipe_context *_pipe,
    trace_dump_arg(uint, start);
    trace_dump_arg(uint, count);
 
-   result = pipe->draw_range_elements(pipe,
-                                      indexBuffer,
-                                      indexSize, minIndex, maxIndex,
-                                      mode, start, count);
-
-   trace_dump_ret(bool, result);
+   pipe->draw_range_elements(pipe,
+                             indexBuffer,
+                             indexSize, minIndex, maxIndex,
+                             mode, start, count);
 
    trace_dump_call_end();
 
    trace_context_draw_block(tr_ctx, 2);
-
-   return result;
 }
 
 
diff --git a/src/gallium/drivers/trace/tr_dump_state.c b/src/gallium/drivers/trace/tr_dump_state.c
index 0102cc1876..86237e03bc 100644
--- a/src/gallium/drivers/trace/tr_dump_state.c
+++ b/src/gallium/drivers/trace/tr_dump_state.c
@@ -409,7 +409,7 @@ void trace_dump_sampler_state(const struct pipe_sampler_state *state)
    trace_dump_member(uint, state, min_img_filter);
    trace_dump_member(uint, state, min_mip_filter);
    trace_dump_member(uint, state, mag_img_filter);
-   trace_dump_member(bool, state, compare_mode);
+   trace_dump_member(uint, state, compare_mode);
    trace_dump_member(uint, state, compare_func);
    trace_dump_member(bool, state, normalized_coords);
    trace_dump_member(uint, state, prefilter);
diff --git a/src/gallium/drivers/trace/tr_state.h b/src/gallium/drivers/trace/tr_state.h
index 1c16042ee5..e2f981d051 100644
--- a/src/gallium/drivers/trace/tr_state.h
+++ b/src/gallium/drivers/trace/tr_state.h
@@ -32,7 +32,7 @@ struct tgsi_token;
 enum trace_shader_type {
    TRACE_SHADER_FRAGMENT = 0,
    TRACE_SHADER_VERTEX   = 1,
-   TRACE_SHADER_GEOMETRY = 2,
+   TRACE_SHADER_GEOMETRY = 2
 };
 
 struct trace_shader
diff --git a/src/gallium/include/pipe/p_compiler.h b/src/gallium/include/pipe/p_compiler.h
index f7368bb95b..26a940593f 100644
--- a/src/gallium/include/pipe/p_compiler.h
+++ b/src/gallium/include/pipe/p_compiler.h
@@ -52,45 +52,15 @@
 #endif /* _MSC_VER */
 
 
-#if defined(_MSC_VER)
-
-typedef __int8             int8_t;
-typedef unsigned __int8    uint8_t;
-typedef __int16            int16_t;
-typedef unsigned __int16   uint16_t;
-#ifndef __eglplatform_h_
-typedef __int32            int32_t;
-#endif
-typedef unsigned __int32   uint32_t;
-typedef __int64            int64_t;
-typedef unsigned __int64   uint64_t;
-
-#if defined(_WIN64)
-typedef __int64            intptr_t;
-typedef unsigned __int64   uintptr_t;
-#else
-typedef __int32            intptr_t;
-typedef unsigned __int32   uintptr_t;
-#endif
-
-#define INT64_C(__val) __val##i64
-#define UINT64_C(__val) __val##ui64
-
-#ifndef __cplusplus
-#define false   0
-#define true    1
-#define bool    _Bool
-typedef int     _Bool;
-#define __bool_true_false_are_defined   1
-#endif /* !__cplusplus */
-
-#else
+/*
+ * Alternative stdint.h and stdbool.h headers are supplied in include/c99 for
+ * systems that lack it.
+ */
 #ifndef __STDC_LIMIT_MACROS
 #define __STDC_LIMIT_MACROS 1
 #endif
 #include <stdint.h>
 #include <stdbool.h>
-#endif
 
 
 #ifndef __HAIKU__
@@ -99,11 +69,7 @@ typedef unsigned short     ushort;
 #endif
 typedef unsigned char      ubyte;
 
-#if 0
-#define boolean bool
-#else
 typedef unsigned char boolean;
-#endif
 #ifndef TRUE
 #define TRUE  true
 #endif
@@ -135,6 +101,17 @@ typedef unsigned char boolean;
 #  endif
 #endif
 
+
+/* Function visibility */
+#ifndef PUBLIC
+#  if defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 303
+#    define PUBLIC __attribute__((visibility("default")))
+#  else
+#    define PUBLIC
+#  endif
+#endif
+
+
 /* The __FUNCTION__ gcc variable is generally only used for debugging.
  * If we're not using gcc, define __FUNCTION__ as a cpp symbol here.
  */
diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h
index 6c06fb9027..d2f8085b42 100644
--- a/src/gallium/include/pipe/p_context.h
+++ b/src/gallium/include/pipe/p_context.h
@@ -61,29 +61,37 @@ struct pipe_context {
     * VBO drawing (return false on fallbacks (temporary??))
     */
    /*@{*/
-   boolean (*draw_arrays)( struct pipe_context *pipe,
-			   unsigned mode, unsigned start, unsigned count);
+   void (*draw_arrays)( struct pipe_context *pipe,
+                        unsigned mode, unsigned start, unsigned count);
 
-   boolean (*draw_elements)( struct pipe_context *pipe,
-			     struct pipe_buffer *indexBuffer,
-			     unsigned indexSize,
-			     unsigned mode, unsigned start, unsigned count);
+   void (*draw_elements)( struct pipe_context *pipe,
+                          struct pipe_buffer *indexBuffer,
+                          unsigned indexSize,
+                          unsigned mode, unsigned start, unsigned count);
 
    /* XXX: this is (probably) a temporary entrypoint, as the range
     * information should be available from the vertex_buffer state.
     * Using this to quickly evaluate a specialized path in the draw
     * module.
     */
-   boolean (*draw_range_elements)( struct pipe_context *pipe,
-                                   struct pipe_buffer *indexBuffer,
-                                   unsigned indexSize,
-                                   unsigned minIndex,
-                                   unsigned maxIndex,
-                                   unsigned mode, 
-                                   unsigned start, 
-                                   unsigned count);
+   void (*draw_range_elements)( struct pipe_context *pipe,
+                                struct pipe_buffer *indexBuffer,
+                                unsigned indexSize,
+                                unsigned minIndex,
+                                unsigned maxIndex,
+                                unsigned mode, 
+                                unsigned start, 
+                                unsigned count);
    /*@}*/
 
+   /**
+    * Predicate subsequent rendering on occlusion query result
+    * \param query  the query predicate, or NULL if no predicate
+    * \param mode  one of PIPE_COND_RENDER_x
+    */
+   void (*render_condition)( struct pipe_context *pipe,
+                             struct pipe_query *query,
+                             uint mode );
 
    /**
     * Query objects
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index 2cda408fec..35f3830ebc 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -171,8 +171,6 @@ enum pipe_texture_target {
  */
 #define PIPE_TEX_FILTER_NEAREST      0
 #define PIPE_TEX_FILTER_LINEAR       1
-#define PIPE_TEX_FILTER_ANISO        2 
-
 
 #define PIPE_TEX_COMPARE_NONE          0
 #define PIPE_TEX_COMPARE_R_TO_TEXTURE  1
@@ -355,6 +353,15 @@ enum pipe_transfer_usage {
 
 
 /**
+ * Conditional rendering modes
+ */
+#define PIPE_RENDER_COND_WAIT              0
+#define PIPE_RENDER_COND_NO_WAIT           1
+#define PIPE_RENDER_COND_BY_REGION_WAIT    2
+#define PIPE_RENDER_COND_BY_REGION_NO_WAIT 3
+
+
+/**
  * Point sprite coord modes
  */
 #define PIPE_SPRITE_COORD_NONE       0
diff --git a/src/gallium/include/pipe/p_screen.h b/src/gallium/include/pipe/p_screen.h
index f0a4de5df3..b8e001a6b0 100644
--- a/src/gallium/include/pipe/p_screen.h
+++ b/src/gallium/include/pipe/p_screen.h
@@ -266,6 +266,11 @@ struct pipe_screen {
 
    void (*video_surface_destroy)( struct pipe_video_surface *vsfc );
 
+   /**
+    * Do any special operations to ensure buffer size is correct
+    */
+   void (*update_buffer)( struct pipe_screen *ws,
+                          void *context_private );
 
    /**
     * Do any special operations to ensure frontbuffer contents are
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index 7b19364b97..550e2abc32 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -141,6 +141,8 @@ struct tgsi_declaration_semantic
 };
 
 #define TGSI_IMM_FLOAT32   0
+#define TGSI_IMM_UINT32    1
+#define TGSI_IMM_INT32     2
 
 struct tgsi_immediate
 {
@@ -153,6 +155,8 @@ struct tgsi_immediate
 union tgsi_immediate_data
 {
    float Float;
+   unsigned Uint;
+   int Int;
 };
 
 #define TGSI_PROPERTY_GS_INPUT_PRIM          0
@@ -264,7 +268,7 @@ struct tgsi_property_data {
 #define TGSI_OPCODE_NOT                 85
 #define TGSI_OPCODE_TRUNC               86
 #define TGSI_OPCODE_SHL                 87
-#define TGSI_OPCODE_SHR                 88
+                                /* gap */
 #define TGSI_OPCODE_AND                 89
 #define TGSI_OPCODE_OR                  90
 #define TGSI_OPCODE_MOD                 91
@@ -289,7 +293,33 @@ struct tgsi_property_data {
 #define TGSI_OPCODE_KIL                 116  /* conditional kill */
 #define TGSI_OPCODE_END                 117  /* aka HALT */
                                 /* gap */
-#define TGSI_OPCODE_LAST                119
+#define TGSI_OPCODE_F2I                 119
+#define TGSI_OPCODE_IDIV                120
+#define TGSI_OPCODE_IMAX                121
+#define TGSI_OPCODE_IMIN                122
+#define TGSI_OPCODE_INEG                123
+#define TGSI_OPCODE_ISGE                124
+#define TGSI_OPCODE_ISHR                125
+#define TGSI_OPCODE_ISLT                126
+#define TGSI_OPCODE_F2U                 127
+#define TGSI_OPCODE_U2F                 128
+#define TGSI_OPCODE_UADD                129
+#define TGSI_OPCODE_UDIV                130
+#define TGSI_OPCODE_UMAD                131
+#define TGSI_OPCODE_UMAX                132
+#define TGSI_OPCODE_UMIN                133
+#define TGSI_OPCODE_UMOD                134
+#define TGSI_OPCODE_UMUL                135
+#define TGSI_OPCODE_USEQ                136
+#define TGSI_OPCODE_USGE                137
+#define TGSI_OPCODE_USHR                138
+#define TGSI_OPCODE_USLT                139
+#define TGSI_OPCODE_USNE                140
+#define TGSI_OPCODE_SWITCH              141
+#define TGSI_OPCODE_CASE                142
+#define TGSI_OPCODE_DEFAULT             143
+#define TGSI_OPCODE_ENDSWITCH           144
+#define TGSI_OPCODE_LAST                145
 
 #define TGSI_SAT_NONE            0  /* do not saturate */
 #define TGSI_SAT_ZERO_ONE        1  /* clamp to [0,1] */
diff --git a/src/gallium/state_trackers/dri/dri_context.c b/src/gallium/state_trackers/dri/dri_context.c
index 8819936fca..f2e5f3fb23 100644
--- a/src/gallium/state_trackers/dri/dri_context.c
+++ b/src/gallium/state_trackers/dri/dri_context.c
@@ -44,9 +44,9 @@
 
 GLboolean
 dri_create_context(const __GLcontextModes * visual,
-		   __DRIcontextPrivate * cPriv, void *sharedContextPrivate)
+		   __DRIcontext * cPriv, void *sharedContextPrivate)
 {
-   __DRIscreenPrivate *sPriv = cPriv->driScreenPriv;
+   __DRIscreen *sPriv = cPriv->driScreenPriv;
    struct dri_screen *screen = dri_screen(sPriv);
    struct dri_context *ctx = NULL;
    struct st_context *st_share = NULL;
@@ -97,7 +97,7 @@ dri_create_context(const __GLcontextModes * visual,
 }
 
 void
-dri_destroy_context(__DRIcontextPrivate * cPriv)
+dri_destroy_context(__DRIcontext * cPriv)
 {
    struct dri_context *ctx = dri_context(cPriv);
 
@@ -116,7 +116,7 @@ dri_destroy_context(__DRIcontextPrivate * cPriv)
 }
 
 GLboolean
-dri_unbind_context(__DRIcontextPrivate * cPriv)
+dri_unbind_context(__DRIcontext * cPriv)
 {
    if (cPriv) {
       struct dri_context *ctx = dri_context(cPriv);
@@ -133,9 +133,9 @@ dri_unbind_context(__DRIcontextPrivate * cPriv)
 }
 
 GLboolean
-dri_make_current(__DRIcontextPrivate * cPriv,
-		 __DRIdrawablePrivate * driDrawPriv,
-		 __DRIdrawablePrivate * driReadPriv)
+dri_make_current(__DRIcontext * cPriv,
+		 __DRIdrawable * driDrawPriv,
+		 __DRIdrawable * driReadPriv)
 {
    if (cPriv) {
       struct dri_context *ctx = dri_context(cPriv);
diff --git a/src/gallium/state_trackers/dri/dri_context.h b/src/gallium/state_trackers/dri/dri_context.h
index 4650178734..13f497462f 100644
--- a/src/gallium/state_trackers/dri/dri_context.h
+++ b/src/gallium/state_trackers/dri/dri_context.h
@@ -44,10 +44,10 @@ struct dri_drawable;
 struct dri_context
 {
    /* dri */
-   __DRIscreenPrivate *sPriv;
-   __DRIcontextPrivate *cPriv;
-   __DRIdrawablePrivate *dPriv;
-   __DRIdrawablePrivate *rPriv;
+   __DRIscreen *sPriv;
+   __DRIcontext *cPriv;
+   __DRIdrawable *dPriv;
+   __DRIdrawable *rPriv;
 
    driOptionCache optionCache;
 
@@ -67,7 +67,7 @@ struct dri_context
 };
 
 static INLINE struct dri_context *
-dri_context(__DRIcontextPrivate * driContextPriv)
+dri_context(__DRIcontext * driContextPriv)
 {
    return (struct dri_context *)driContextPriv->driverPrivate;
 }
@@ -99,18 +99,18 @@ dri_unlock(struct dri_context *ctx)
  */
 extern struct dri1_api_lock_funcs dri1_lf;
 
-void dri_destroy_context(__DRIcontextPrivate * driContextPriv);
+void dri_destroy_context(__DRIcontext * driContextPriv);
 
-boolean dri_unbind_context(__DRIcontextPrivate * driContextPriv);
+boolean dri_unbind_context(__DRIcontext * driContextPriv);
 
 boolean
-dri_make_current(__DRIcontextPrivate * driContextPriv,
-		 __DRIdrawablePrivate * driDrawPriv,
-		 __DRIdrawablePrivate * driReadPriv);
+dri_make_current(__DRIcontext * driContextPriv,
+		 __DRIdrawable * driDrawPriv,
+		 __DRIdrawable * driReadPriv);
 
 boolean
 dri_create_context(const __GLcontextModes * visual,
-		   __DRIcontextPrivate * driContextPriv,
+		   __DRIcontext * driContextPriv,
 		   void *sharedContextPrivate);
 
 /***********************************************************************
diff --git a/src/gallium/state_trackers/dri/dri_drawable.c b/src/gallium/state_trackers/dri/dri_drawable.c
index 4b12243ddf..f131e77ac5 100644
--- a/src/gallium/state_trackers/dri/dri_drawable.c
+++ b/src/gallium/state_trackers/dri/dri_drawable.c
@@ -118,7 +118,7 @@ dri2_check_if_pixmap(__DRIbuffer *buffers, int count)
  * This will be called a drawable is known to have been resized.
  */
 void
-dri_get_buffers(__DRIdrawablePrivate * dPriv)
+dri_get_buffers(__DRIdrawable * dPriv)
 {
 
    struct dri_drawable *drawable = dri_drawable(dPriv);
@@ -268,6 +268,14 @@ void dri2_set_tex_buffer(__DRIcontext *pDRICtx, GLint target,
 }
 
 void
+dri_update_buffer(struct pipe_screen *screen, void *context_private)
+{
+   struct dri_context *ctx = (struct dri_context *)context_private;
+
+   dri_get_buffers(ctx->dPriv);
+}
+
+void
 dri_flush_frontbuffer(struct pipe_screen *screen,
 		      struct pipe_surface *surf, void *context_private)
 {
@@ -299,8 +307,8 @@ dri_flush_frontbuffer(struct pipe_screen *screen,
  * This is called when we need to set up GL rendering to a new X window.
  */
 boolean
-dri_create_buffer(__DRIscreenPrivate * sPriv,
-		  __DRIdrawablePrivate * dPriv,
+dri_create_buffer(__DRIscreen * sPriv,
+		  __DRIdrawable * dPriv,
 		  const __GLcontextModes * visual, boolean isPixmap)
 {
    struct dri_screen *screen = sPriv->private;
@@ -416,7 +424,7 @@ dri_swap_fences_push_back(struct dri_drawable *draw,
 }
 
 void
-dri_destroy_buffer(__DRIdrawablePrivate * dPriv)
+dri_destroy_buffer(__DRIdrawable * dPriv)
 {
    struct dri_drawable *drawable = dri_drawable(dPriv);
    struct pipe_fence_handle *fence;
@@ -434,8 +442,8 @@ dri_destroy_buffer(__DRIdrawablePrivate * dPriv)
 
 static void
 dri1_update_drawables_locked(struct dri_context *ctx,
-			     __DRIdrawablePrivate * driDrawPriv,
-			     __DRIdrawablePrivate * driReadPriv)
+			     __DRIdrawable * driDrawPriv,
+			     __DRIdrawable * driReadPriv)
 {
    if (ctx->stLostLock) {
       ctx->stLostLock = FALSE;
@@ -458,8 +466,8 @@ dri1_update_drawables_locked(struct dri_context *ctx,
 static void
 dri1_propagate_drawable_change(struct dri_context *ctx)
 {
-   __DRIdrawablePrivate *dPriv = ctx->dPriv;
-   __DRIdrawablePrivate *rPriv = ctx->rPriv;
+   __DRIdrawable *dPriv = ctx->dPriv;
+   __DRIdrawable *rPriv = ctx->rPriv;
    boolean flushed = FALSE;
 
    if (dPriv && ctx->d_stamp != dPriv->lastStamp) {
@@ -532,7 +540,7 @@ static void
 dri1_swap_copy(struct dri_context *ctx,
 	       struct pipe_surface *dst,
 	       struct pipe_surface *src,
-	       __DRIdrawablePrivate * dPriv, const struct drm_clip_rect *bbox)
+	       __DRIdrawable * dPriv, const struct drm_clip_rect *bbox)
 {
    struct pipe_context *pipe = ctx->pipe;
    struct drm_clip_rect clip;
@@ -563,7 +571,7 @@ dri1_swap_copy(struct dri_context *ctx,
 static void
 dri1_copy_to_front(struct dri_context *ctx,
 		   struct pipe_surface *surf,
-		   __DRIdrawablePrivate * dPriv,
+		   __DRIdrawable * dPriv,
 		   const struct drm_clip_rect *sub_box,
 		   struct pipe_fence_handle **fence)
 {
@@ -636,7 +644,7 @@ dri1_flush_frontbuffer(struct pipe_screen *screen,
 }
 
 void
-dri_swap_buffers(__DRIdrawablePrivate * dPriv)
+dri_swap_buffers(__DRIdrawable * dPriv)
 {
    struct dri_context *ctx;
    struct pipe_surface *back_surf;
@@ -668,7 +676,7 @@ dri_swap_buffers(__DRIdrawablePrivate * dPriv)
 }
 
 void
-dri_copy_sub_buffer(__DRIdrawablePrivate * dPriv, int x, int y, int w, int h)
+dri_copy_sub_buffer(__DRIdrawable * dPriv, int x, int y, int w, int h)
 {
    struct pipe_screen *screen = dri_screen(dPriv->driScreenPriv)->pipe_screen;
    struct drm_clip_rect sub_bbox;
diff --git a/src/gallium/state_trackers/dri/dri_drawable.h b/src/gallium/state_trackers/dri/dri_drawable.h
index b910930db4..8bc59cb4c3 100644
--- a/src/gallium/state_trackers/dri/dri_drawable.h
+++ b/src/gallium/state_trackers/dri/dri_drawable.h
@@ -41,8 +41,8 @@ struct dri_context;
 struct dri_drawable
 {
    /* dri */
-   __DRIdrawablePrivate *dPriv;
-   __DRIscreenPrivate *sPriv;
+   __DRIdrawable *dPriv;
+   __DRIscreen *sPriv;
 
    unsigned attachments[8];
    unsigned num_attachments;
@@ -67,7 +67,7 @@ struct dri_drawable
 };
 
 static INLINE struct dri_drawable *
-dri_drawable(__DRIdrawablePrivate * driDrawPriv)
+dri_drawable(__DRIdrawable * driDrawPriv)
 {
    return (struct dri_drawable *)driDrawPriv->driverPrivate;
 }
@@ -76,22 +76,25 @@ dri_drawable(__DRIdrawablePrivate * driDrawPriv)
  * dri_drawable.c
  */
 boolean
-dri_create_buffer(__DRIscreenPrivate * sPriv,
-		  __DRIdrawablePrivate * dPriv,
+dri_create_buffer(__DRIscreen * sPriv,
+		  __DRIdrawable * dPriv,
 		  const __GLcontextModes * visual, boolean isPixmap);
 
 void
+dri_update_buffer(struct pipe_screen *screen, void *context_private);
+
+void
 dri_flush_frontbuffer(struct pipe_screen *screen,
 		      struct pipe_surface *surf, void *context_private);
 
-void dri_swap_buffers(__DRIdrawablePrivate * dPriv);
+void dri_swap_buffers(__DRIdrawable * dPriv);
 
 void
-dri_copy_sub_buffer(__DRIdrawablePrivate * dPriv, int x, int y, int w, int h);
+dri_copy_sub_buffer(__DRIdrawable * dPriv, int x, int y, int w, int h);
 
-void dri_get_buffers(__DRIdrawablePrivate * dPriv);
+void dri_get_buffers(__DRIdrawable * dPriv);
 
-void dri_destroy_buffer(__DRIdrawablePrivate * dPriv);
+void dri_destroy_buffer(__DRIdrawable * dPriv);
 
 void dri2_set_tex_buffer2(__DRIcontext *pDRICtx, GLint target,
                           GLint glx_texture_format, __DRIdrawable *dPriv);
diff --git a/src/gallium/state_trackers/dri/dri_screen.c b/src/gallium/state_trackers/dri/dri_screen.c
index cb864d45d5..793db087ee 100644
--- a/src/gallium/state_trackers/dri/dri_screen.c
+++ b/src/gallium/state_trackers/dri/dri_screen.c
@@ -202,7 +202,7 @@ dri_fill_in_modes(struct dri_screen *screen,
  * Get information about previous buffer swaps.
  */
 static int
-dri_get_swap_info(__DRIdrawablePrivate * dPriv, __DRIswapInfo * sInfo)
+dri_get_swap_info(__DRIdrawable * dPriv, __DRIswapInfo * sInfo)
 {
    if (dPriv == NULL || dPriv->driverPrivate == NULL || sInfo == NULL)
       return -1;
@@ -220,7 +220,7 @@ dri_copy_version(struct dri1_api_version *dst,
 }
 
 static const __DRIconfig **
-dri_init_screen(__DRIscreenPrivate * sPriv)
+dri_init_screen(__DRIscreen * sPriv)
 {
    struct dri_screen *screen;
    const __DRIconfig **configs;
@@ -285,7 +285,7 @@ dri_init_screen(__DRIscreenPrivate * sPriv)
  * Returns the __GLcontextModes supported by this driver.
  */
 static const __DRIconfig **
-dri_init_screen2(__DRIscreenPrivate * sPriv)
+dri_init_screen2(__DRIscreen * sPriv)
 {
    struct dri_screen *screen;
    struct drm_create_screen_arg arg;
@@ -308,6 +308,7 @@ dri_init_screen2(__DRIscreenPrivate * sPriv)
    }
 
    /* We need to hook in here */
+   screen->pipe_screen->update_buffer = dri_update_buffer;
    screen->pipe_screen->flush_frontbuffer = dri_flush_frontbuffer;
 
    driParseOptionInfo(&screen->optionCache,
@@ -319,7 +320,7 @@ dri_init_screen2(__DRIscreenPrivate * sPriv)
 }
 
 static void
-dri_destroy_screen(__DRIscreenPrivate * sPriv)
+dri_destroy_screen(__DRIscreen * sPriv)
 {
    struct dri_screen *screen = dri_screen(sPriv);
 
@@ -346,4 +347,12 @@ PUBLIC const struct __DriverAPIRec driDriverAPI = {
    .InitScreen2 = dri_init_screen2,
 };
 
+/* This is the table of extensions that the loader will dlsym() for. */
+PUBLIC const __DRIextension *__driDriverExtensions[] = {
+    &driCoreExtension.base,
+    &driLegacyExtension.base,
+    &driDRI2Extension.base,
+    NULL
+};
+
 /* vim: set sw=3 ts=8 sts=3 expandtab: */
diff --git a/src/gallium/state_trackers/dri/dri_screen.h b/src/gallium/state_trackers/dri/dri_screen.h
index f6c56d0f0c..03387a0e81 100644
--- a/src/gallium/state_trackers/dri/dri_screen.h
+++ b/src/gallium/state_trackers/dri/dri_screen.h
@@ -42,7 +42,7 @@
 struct dri_screen
 {
    /* dri */
-   __DRIscreenPrivate *sPriv;
+   __DRIscreen *sPriv;
 
    /**
     * Configuration cache with default values for all contexts
@@ -63,7 +63,7 @@ struct dri_screen
 
 /** cast wrapper */
 static INLINE struct dri_screen *
-dri_screen(__DRIscreenPrivate * sPriv)
+dri_screen(__DRIscreen * sPriv)
 {
    return (struct dri_screen *)sPriv->private;
 }
diff --git a/src/gallium/state_trackers/glx/xlib/glx_api.c b/src/gallium/state_trackers/glx/xlib/glx_api.c
index 228ac9a20e..3caf56e924 100644
--- a/src/gallium/state_trackers/glx/xlib/glx_api.c
+++ b/src/gallium/state_trackers/glx/xlib/glx_api.c
@@ -1007,7 +1007,7 @@ choose_visual( Display *dpy, int screen, const int *list, GLboolean fbConfig )
 }
 
 
-XVisualInfo *
+PUBLIC XVisualInfo *
 glXChooseVisual( Display *dpy, int screen, int *list )
 {
    XMesaVisual xmvis;
@@ -1029,7 +1029,7 @@ glXChooseVisual( Display *dpy, int screen, int *list )
 }
 
 
-GLXContext
+PUBLIC GLXContext
 glXCreateContext( Display *dpy, XVisualInfo *visinfo,
                   GLXContext share_list, Bool direct )
 {
@@ -1084,7 +1084,7 @@ static XMesaBuffer MakeCurrent_PrevReadBuffer = 0;
 
 
 /* GLX 1.3 and later */
-Bool
+PUBLIC Bool
 glXMakeContextCurrent( Display *dpy, GLXDrawable draw,
                        GLXDrawable read, GLXContext ctx )
 {
@@ -1180,21 +1180,21 @@ glXMakeContextCurrent( Display *dpy, GLXDrawable draw,
 }
 
 
-Bool
+PUBLIC Bool
 glXMakeCurrent( Display *dpy, GLXDrawable drawable, GLXContext ctx )
 {
    return glXMakeContextCurrent( dpy, drawable, drawable, ctx );
 }
 
 
-GLXContext
+PUBLIC GLXContext
 glXGetCurrentContext(void)
 {
    return GetCurrentContext();
 }
 
 
-Display *
+PUBLIC Display *
 glXGetCurrentDisplay(void)
 {
    GLXContext glxCtx = glXGetCurrentContext();
@@ -1203,14 +1203,14 @@ glXGetCurrentDisplay(void)
 }
 
 
-Display *
+PUBLIC Display *
 glXGetCurrentDisplayEXT(void)
 {
    return glXGetCurrentDisplay();
 }
 
 
-GLXDrawable
+PUBLIC GLXDrawable
 glXGetCurrentDrawable(void)
 {
    GLXContext gc = glXGetCurrentContext();
@@ -1218,7 +1218,7 @@ glXGetCurrentDrawable(void)
 }
 
 
-GLXDrawable
+PUBLIC GLXDrawable
 glXGetCurrentReadDrawable(void)
 {
    GLXContext gc = glXGetCurrentContext();
@@ -1226,14 +1226,14 @@ glXGetCurrentReadDrawable(void)
 }
 
 
-GLXDrawable
+PUBLIC GLXDrawable
 glXGetCurrentReadDrawableSGI(void)
 {
    return glXGetCurrentReadDrawable();
 }
 
 
-GLXPixmap
+PUBLIC GLXPixmap
 glXCreateGLXPixmap( Display *dpy, XVisualInfo *visinfo, Pixmap pixmap )
 {
    XMesaVisual v;
@@ -1258,7 +1258,7 @@ glXCreateGLXPixmap( Display *dpy, XVisualInfo *visinfo, Pixmap pixmap )
 
 /*** GLX_MESA_pixmap_colormap ***/
 
-GLXPixmap
+PUBLIC GLXPixmap
 glXCreateGLXPixmapMESA( Display *dpy, XVisualInfo *visinfo,
                         Pixmap pixmap, Colormap cmap )
 {
@@ -1282,7 +1282,7 @@ glXCreateGLXPixmapMESA( Display *dpy, XVisualInfo *visinfo,
 }
 
 
-void
+PUBLIC void
 glXDestroyGLXPixmap( Display *dpy, GLXPixmap pixmap )
 {
    XMesaBuffer b = XMesaFindBuffer(dpy, pixmap);
@@ -1295,7 +1295,7 @@ glXDestroyGLXPixmap( Display *dpy, GLXPixmap pixmap )
 }
 
 
-void
+PUBLIC void
 glXCopyContext( Display *dpy, GLXContext src, GLXContext dst,
                 unsigned long mask )
 {
@@ -1309,7 +1309,7 @@ glXCopyContext( Display *dpy, GLXContext src, GLXContext dst,
 }
 
 
-Bool
+PUBLIC Bool
 glXQueryExtension( Display *dpy, int *errorBase, int *eventBase )
 {
    int op, ev, err;
@@ -1324,7 +1324,7 @@ glXQueryExtension( Display *dpy, int *errorBase, int *eventBase )
 }
 
 
-void
+PUBLIC void
 glXDestroyContext( Display *dpy, GLXContext ctx )
 {
    GLXContext glxCtx = ctx;
@@ -1340,7 +1340,7 @@ glXDestroyContext( Display *dpy, GLXContext ctx )
 }
 
 
-Bool
+PUBLIC Bool
 glXIsDirect( Display *dpy, GLXContext ctx )
 {
    GLXContext glxCtx = ctx;
@@ -1350,7 +1350,7 @@ glXIsDirect( Display *dpy, GLXContext ctx )
 
 
 
-void
+PUBLIC void
 glXSwapBuffers( Display *dpy, GLXDrawable drawable )
 {
    XMesaBuffer buffer = XMesaFindBuffer( dpy, drawable );
@@ -1377,7 +1377,7 @@ glXSwapBuffers( Display *dpy, GLXDrawable drawable )
 
 /*** GLX_MESA_copy_sub_buffer ***/
 
-void
+PUBLIC void
 glXCopySubBufferMESA( Display *dpy, GLXDrawable drawable,
                            int x, int y, int width, int height )
 {
@@ -1391,7 +1391,7 @@ glXCopySubBufferMESA( Display *dpy, GLXDrawable drawable,
 }
 
 
-Bool
+PUBLIC Bool
 glXQueryVersion( Display *dpy, int *maj, int *min )
 {
    (void) dpy;
@@ -1608,7 +1608,7 @@ get_config( XMesaVisual xmvis, int attrib, int *value, GLboolean fbconfig )
 }
 
 
-int
+PUBLIC int
 glXGetConfig( Display *dpy, XVisualInfo *visinfo,
                    int attrib, int *value )
 {
@@ -1638,7 +1638,7 @@ glXGetConfig( Display *dpy, XVisualInfo *visinfo,
 }
 
 
-void
+PUBLIC void
 glXWaitGL( void )
 {
    XMesaContext xmesa = XMesaGetCurrentContext();
@@ -1647,7 +1647,7 @@ glXWaitGL( void )
 
 
 
-void
+PUBLIC void
 glXWaitX( void )
 {
    XMesaContext xmesa = XMesaGetCurrentContext();
@@ -1664,7 +1664,7 @@ get_extensions( void )
 
 
 /* GLX 1.1 and later */
-const char *
+PUBLIC const char *
 glXQueryExtensionsString( Display *dpy, int screen )
 {
    (void) dpy;
@@ -1675,7 +1675,7 @@ glXQueryExtensionsString( Display *dpy, int screen )
 
 
 /* GLX 1.1 and later */
-const char *
+PUBLIC const char *
 glXQueryServerString( Display *dpy, int screen, int name )
 {
    static char version[1000];
@@ -1700,7 +1700,7 @@ glXQueryServerString( Display *dpy, int screen, int name )
 
 
 /* GLX 1.1 and later */
-const char *
+PUBLIC const char *
 glXGetClientString( Display *dpy, int name )
 {
    static char version[1000];
@@ -1728,7 +1728,7 @@ glXGetClientString( Display *dpy, int name )
  */
 
 
-int
+PUBLIC int
 glXGetFBConfigAttrib( Display *dpy, GLXFBConfig config,
                            int attribute, int *value )
 {
@@ -1743,7 +1743,7 @@ glXGetFBConfigAttrib( Display *dpy, GLXFBConfig config,
 }
 
 
-GLXFBConfig *
+PUBLIC GLXFBConfig *
 glXGetFBConfigs( Display *dpy, int screen, int *nelements )
 {
    XVisualInfo *visuals, visTemplate;
@@ -1769,7 +1769,7 @@ glXGetFBConfigs( Display *dpy, int screen, int *nelements )
 }
 
 
-GLXFBConfig *
+PUBLIC GLXFBConfig *
 glXChooseFBConfig( Display *dpy, int screen,
                         const int *attribList, int *nitems )
 {
@@ -1798,7 +1798,7 @@ glXChooseFBConfig( Display *dpy, int screen,
 }
 
 
-XVisualInfo *
+PUBLIC XVisualInfo *
 glXGetVisualFromFBConfig( Display *dpy, GLXFBConfig config )
 {
    if (dpy && config) {
@@ -1820,7 +1820,7 @@ glXGetVisualFromFBConfig( Display *dpy, GLXFBConfig config )
 }
 
 
-GLXWindow
+PUBLIC GLXWindow
 glXCreateWindow( Display *dpy, GLXFBConfig config, Window win,
                       const int *attribList )
 {
@@ -1840,7 +1840,7 @@ glXCreateWindow( Display *dpy, GLXFBConfig config, Window win,
 }
 
 
-void
+PUBLIC void
 glXDestroyWindow( Display *dpy, GLXWindow window )
 {
    XMesaBuffer b = XMesaFindBuffer(dpy, (Drawable) window);
@@ -1851,7 +1851,7 @@ glXDestroyWindow( Display *dpy, GLXWindow window )
 
 
 /* XXX untested */
-GLXPixmap
+PUBLIC GLXPixmap
 glXCreatePixmap( Display *dpy, GLXFBConfig config, Pixmap pixmap,
                       const int *attribList )
 {
@@ -1961,7 +1961,7 @@ glXCreatePixmap( Display *dpy, GLXFBConfig config, Pixmap pixmap,
 }
 
 
-void
+PUBLIC void
 glXDestroyPixmap( Display *dpy, GLXPixmap pixmap )
 {
    XMesaBuffer b = XMesaFindBuffer(dpy, (Drawable)pixmap);
@@ -1971,7 +1971,7 @@ glXDestroyPixmap( Display *dpy, GLXPixmap pixmap )
 }
 
 
-GLXPbuffer
+PUBLIC GLXPbuffer
 glXCreatePbuffer( Display *dpy, GLXFBConfig config,
                        const int *attribList )
 {
@@ -2034,7 +2034,7 @@ glXCreatePbuffer( Display *dpy, GLXFBConfig config,
 }
 
 
-void
+PUBLIC void
 glXDestroyPbuffer( Display *dpy, GLXPbuffer pbuf )
 {
    XMesaBuffer b = XMesaFindBuffer(dpy, pbuf);
@@ -2044,7 +2044,7 @@ glXDestroyPbuffer( Display *dpy, GLXPbuffer pbuf )
 }
 
 
-void
+PUBLIC void
 glXQueryDrawable( Display *dpy, GLXDrawable draw, int attribute,
                        unsigned int *value )
 {
@@ -2090,7 +2090,7 @@ glXQueryDrawable( Display *dpy, GLXDrawable draw, int attribute,
 }
 
 
-GLXContext
+PUBLIC GLXContext
 glXCreateNewContext( Display *dpy, GLXFBConfig config,
                           int renderType, GLXContext shareList, Bool direct )
 {
@@ -2124,7 +2124,7 @@ glXCreateNewContext( Display *dpy, GLXFBConfig config,
 }
 
 
-int
+PUBLIC int
 glXQueryContext( Display *dpy, GLXContext ctx, int attribute, int *value )
 {
    GLXContext glxCtx = ctx;
@@ -2153,7 +2153,7 @@ glXQueryContext( Display *dpy, GLXContext ctx, int attribute, int *value )
 }
 
 
-void
+PUBLIC void
 glXSelectEvent( Display *dpy, GLXDrawable drawable, unsigned long mask )
 {
    XMesaBuffer xmbuf = XMesaFindBuffer(dpy, drawable);
@@ -2162,7 +2162,7 @@ glXSelectEvent( Display *dpy, GLXDrawable drawable, unsigned long mask )
 }
 
 
-void
+PUBLIC void
 glXGetSelectedEvent( Display *dpy, GLXDrawable drawable,
                           unsigned long *mask )
 {
@@ -2177,7 +2177,7 @@ glXGetSelectedEvent( Display *dpy, GLXDrawable drawable,
 
 /*** GLX_SGI_swap_control ***/
 
-int
+PUBLIC int
 glXSwapIntervalSGI(int interval)
 {
    (void) interval;
@@ -2190,7 +2190,7 @@ glXSwapIntervalSGI(int interval)
 
 static unsigned int FrameCounter = 0;
 
-int
+PUBLIC int
 glXGetVideoSyncSGI(unsigned int *count)
 {
    /* this is a bogus implementation */
@@ -2198,7 +2198,7 @@ glXGetVideoSyncSGI(unsigned int *count)
    return 0;
 }
 
-int
+PUBLIC int
 glXWaitVideoSyncSGI(int divisor, int remainder, unsigned int *count)
 {
    if (divisor <= 0 || remainder < 0)
@@ -2215,7 +2215,7 @@ glXWaitVideoSyncSGI(int divisor, int remainder, unsigned int *count)
 
 /*** GLX_SGI_make_current_read ***/
 
-Bool
+PUBLIC Bool
 glXMakeCurrentReadSGI(Display *dpy, GLXDrawable draw, GLXDrawable read, GLXContext ctx)
 {
    return glXMakeContextCurrent( dpy, draw, read, ctx );
@@ -2233,7 +2233,7 @@ glXGetCurrentReadDrawableSGI(void)
 /*** GLX_SGIX_video_source ***/
 #if defined(_VL_H)
 
-GLXVideoSourceSGIX
+PUBLIC GLXVideoSourceSGIX
 glXCreateGLXVideoSourceSGIX(Display *dpy, int screen, VLServer server, VLPath path, int nodeClass, VLNode drainNode)
 {
    (void) dpy;
@@ -2245,7 +2245,7 @@ glXCreateGLXVideoSourceSGIX(Display *dpy, int screen, VLServer server, VLPath pa
    return 0;
 }
 
-void
+PUBLIC void
 glXDestroyGLXVideoSourceSGIX(Display *dpy, GLXVideoSourceSGIX src)
 {
    (void) dpy;
@@ -2257,21 +2257,21 @@ glXDestroyGLXVideoSourceSGIX(Display *dpy, GLXVideoSourceSGIX src)
 
 /*** GLX_EXT_import_context ***/
 
-void
+PUBLIC void
 glXFreeContextEXT(Display *dpy, GLXContext context)
 {
    (void) dpy;
    (void) context;
 }
 
-GLXContextID
+PUBLIC GLXContextID
 glXGetContextIDEXT(const GLXContext context)
 {
    (void) context;
    return 0;
 }
 
-GLXContext
+PUBLIC GLXContext
 glXImportContextEXT(Display *dpy, GLXContextID contextID)
 {
    (void) dpy;
@@ -2279,7 +2279,7 @@ glXImportContextEXT(Display *dpy, GLXContextID contextID)
    return 0;
 }
 
-int
+PUBLIC int
 glXQueryContextInfoEXT(Display *dpy, GLXContext context, int attribute, int *value)
 {
    (void) dpy;
@@ -2293,20 +2293,20 @@ glXQueryContextInfoEXT(Display *dpy, GLXContext context, int attribute, int *val
 
 /*** GLX_SGIX_fbconfig ***/
 
-int
+PUBLIC int
 glXGetFBConfigAttribSGIX(Display *dpy, GLXFBConfigSGIX config, int attribute, int *value)
 {
    return glXGetFBConfigAttrib(dpy, config, attribute, value);
 }
 
-GLXFBConfigSGIX *
+PUBLIC GLXFBConfigSGIX *
 glXChooseFBConfigSGIX(Display *dpy, int screen, int *attrib_list, int *nelements)
 {
    return (GLXFBConfig *) glXChooseFBConfig(dpy, screen, attrib_list, nelements);
 }
 
 
-GLXPixmap
+PUBLIC GLXPixmap
 glXCreateGLXPixmapWithConfigSGIX(Display *dpy, GLXFBConfigSGIX config, Pixmap pixmap)
 {
    XMesaVisual xmvis = (XMesaVisual) config;
@@ -2315,7 +2315,7 @@ glXCreateGLXPixmapWithConfigSGIX(Display *dpy, GLXFBConfigSGIX config, Pixmap pi
 }
 
 
-GLXContext
+PUBLIC GLXContext
 glXCreateContextWithConfigSGIX(Display *dpy, GLXFBConfigSGIX config, int render_type, GLXContext share_list, Bool direct)
 {
    XMesaVisual xmvis = (XMesaVisual) config;
@@ -2344,14 +2344,14 @@ glXCreateContextWithConfigSGIX(Display *dpy, GLXFBConfigSGIX config, int render_
 }
 
 
-XVisualInfo *
+PUBLIC XVisualInfo *
 glXGetVisualFromFBConfigSGIX(Display *dpy, GLXFBConfigSGIX config)
 {
    return glXGetVisualFromFBConfig(dpy, config);
 }
 
 
-GLXFBConfigSGIX
+PUBLIC GLXFBConfigSGIX
 glXGetFBConfigFromVisualSGIX(Display *dpy, XVisualInfo *vis)
 {
    XMesaVisual xmvis = find_glx_visual(dpy, vis);
@@ -2367,7 +2367,7 @@ glXGetFBConfigFromVisualSGIX(Display *dpy, XVisualInfo *vis)
 
 /*** GLX_SGIX_pbuffer ***/
 
-GLXPbufferSGIX
+PUBLIC GLXPbufferSGIX
 glXCreateGLXPbufferSGIX(Display *dpy, GLXFBConfigSGIX config,
                              unsigned int width, unsigned int height,
                              int *attribList)
@@ -2406,7 +2406,7 @@ glXCreateGLXPbufferSGIX(Display *dpy, GLXFBConfigSGIX config,
 }
 
 
-void
+PUBLIC void
 glXDestroyGLXPbufferSGIX(Display *dpy, GLXPbufferSGIX pbuf)
 {
    XMesaBuffer xmbuf = XMesaFindBuffer(dpy, pbuf);
@@ -2416,7 +2416,7 @@ glXDestroyGLXPbufferSGIX(Display *dpy, GLXPbufferSGIX pbuf)
 }
 
 
-int
+PUBLIC int
 glXQueryGLXPbufferSGIX(Display *dpy, GLXPbufferSGIX pbuf, int attribute, unsigned int *value)
 {
    const XMesaBuffer xmbuf = XMesaFindBuffer(dpy, pbuf);
@@ -2449,7 +2449,7 @@ glXQueryGLXPbufferSGIX(Display *dpy, GLXPbufferSGIX pbuf, int attribute, unsigne
 }
 
 
-void
+PUBLIC void
 glXSelectEventSGIX(Display *dpy, GLXDrawable drawable, unsigned long mask)
 {
    XMesaBuffer xmbuf = XMesaFindBuffer(dpy, drawable);
@@ -2460,7 +2460,7 @@ glXSelectEventSGIX(Display *dpy, GLXDrawable drawable, unsigned long mask)
 }
 
 
-void
+PUBLIC void
 glXGetSelectedEventSGIX(Display *dpy, GLXDrawable drawable, unsigned long *mask)
 {
    XMesaBuffer xmbuf = XMesaFindBuffer(dpy, drawable);
@@ -2476,7 +2476,7 @@ glXGetSelectedEventSGIX(Display *dpy, GLXDrawable drawable, unsigned long *mask)
 
 /*** GLX_SGI_cushion ***/
 
-void
+PUBLIC void
 glXCushionSGI(Display *dpy, Window win, float cushion)
 {
    (void) dpy;
@@ -2488,7 +2488,7 @@ glXCushionSGI(Display *dpy, Window win, float cushion)
 
 /*** GLX_SGIX_video_resize ***/
 
-int
+PUBLIC int
 glXBindChannelToWindowSGIX(Display *dpy, int screen, int channel , Window window)
 {
    (void) dpy;
@@ -2498,7 +2498,7 @@ glXBindChannelToWindowSGIX(Display *dpy, int screen, int channel , Window window
    return 0;
 }
 
-int
+PUBLIC int
 glXChannelRectSGIX(Display *dpy, int screen, int channel, int x, int y, int w, int h)
 {
    (void) dpy;
@@ -2511,7 +2511,7 @@ glXChannelRectSGIX(Display *dpy, int screen, int channel, int x, int y, int w, i
    return 0;
 }
 
-int
+PUBLIC int
 glXQueryChannelRectSGIX(Display *dpy, int screen, int channel, int *x, int *y, int *w, int *h)
 {
    (void) dpy;
@@ -2524,7 +2524,7 @@ glXQueryChannelRectSGIX(Display *dpy, int screen, int channel, int *x, int *y, i
    return 0;
 }
 
-int
+PUBLIC int
 glXQueryChannelDeltasSGIX(Display *dpy, int screen, int channel, int *dx, int *dy, int *dw, int *dh)
 {
    (void) dpy;
@@ -2537,7 +2537,7 @@ glXQueryChannelDeltasSGIX(Display *dpy, int screen, int channel, int *dx, int *d
    return 0;
 }
 
-int
+PUBLIC int
 glXChannelRectSyncSGIX(Display *dpy, int screen, int channel, GLenum synctype)
 {
    (void) dpy;
@@ -2552,7 +2552,7 @@ glXChannelRectSyncSGIX(Display *dpy, int screen, int channel, GLenum synctype)
 /*** GLX_SGIX_dmbuffer **/
 
 #if defined(_DM_BUFFER_H_)
-Bool
+PUBLIC Bool
 glXAssociateDMPbufferSGIX(Display *dpy, GLXPbufferSGIX pbuffer, DMparams *params, DMbuffer dmbuffer)
 {
    (void) dpy;
@@ -2566,7 +2566,7 @@ glXAssociateDMPbufferSGIX(Display *dpy, GLXPbufferSGIX pbuffer, DMparams *params
 
 /*** GLX_SGIX_swap_group ***/
 
-void
+PUBLIC void
 glXJoinSwapGroupSGIX(Display *dpy, GLXDrawable drawable, GLXDrawable member)
 {
    (void) dpy;
@@ -2578,7 +2578,7 @@ glXJoinSwapGroupSGIX(Display *dpy, GLXDrawable drawable, GLXDrawable member)
 
 /*** GLX_SGIX_swap_barrier ***/
 
-void
+PUBLIC void
 glXBindSwapBarrierSGIX(Display *dpy, GLXDrawable drawable, int barrier)
 {
    (void) dpy;
@@ -2586,7 +2586,7 @@ glXBindSwapBarrierSGIX(Display *dpy, GLXDrawable drawable, int barrier)
    (void) barrier;
 }
 
-Bool
+PUBLIC Bool
 glXQueryMaxSwapBarriersSGIX(Display *dpy, int screen, int *max)
 {
    (void) dpy;
@@ -2599,7 +2599,7 @@ glXQueryMaxSwapBarriersSGIX(Display *dpy, int screen, int *max)
 
 /*** GLX_SUN_get_transparent_index ***/
 
-Status
+PUBLIC Status
 glXGetTransparentIndexSUN(Display *dpy, Window overlay, Window underlay, long *pTransparent)
 {
    (void) dpy;
@@ -2617,7 +2617,7 @@ glXGetTransparentIndexSUN(Display *dpy, Window overlay, Window underlay, long *p
  * Release the depth, stencil, accum buffers attached to a GLXDrawable
  * (a window or pixmap) prior to destroying the GLXDrawable.
  */
-Bool
+PUBLIC Bool
 glXReleaseBuffersMESA( Display *dpy, GLXDrawable d )
 {
    XMesaBuffer b = XMesaFindBuffer(dpy, d);
@@ -2630,7 +2630,7 @@ glXReleaseBuffersMESA( Display *dpy, GLXDrawable d )
 
 /*** GLX_EXT_texture_from_pixmap ***/
 
-void
+PUBLIC void
 glXBindTexImageEXT(Display *dpy, GLXDrawable drawable, int buffer,
                         const int *attrib_list)
 {
@@ -2639,7 +2639,7 @@ glXBindTexImageEXT(Display *dpy, GLXDrawable drawable, int buffer,
       XMesaBindTexImage(dpy, b, buffer, attrib_list);
 }
 
-void
+PUBLIC void
 glXReleaseTexImageEXT(Display *dpy, GLXDrawable drawable, int buffer)
 {
    XMesaBuffer b = XMesaFindBuffer(dpy, drawable);
diff --git a/src/gallium/state_trackers/glx/xlib/glx_getproc.c b/src/gallium/state_trackers/glx/xlib/glx_getproc.c
index ca7d88c922..84d47b12ed 100644
--- a/src/gallium/state_trackers/glx/xlib/glx_getproc.c
+++ b/src/gallium/state_trackers/glx/xlib/glx_getproc.c
@@ -34,6 +34,7 @@
 #include <string.h>
 #include "GL/glx.h"
 #include "glapi/glapi.h"
+#include "pipe/p_compiler.h"
 
 
 struct name_address_pair {
@@ -208,6 +209,7 @@ glXGetProcAddressARB(const GLubyte *procName)
 
 
 /* GLX 1.4 */
+PUBLIC
 void (*glXGetProcAddress(const GLubyte *procName))()
 {
    return glXGetProcAddressARB(procName);
diff --git a/src/gallium/state_trackers/glx/xlib/glx_usefont.c b/src/gallium/state_trackers/glx/xlib/glx_usefont.c
index acc64df62b..16e5ce642f 100644
--- a/src/gallium/state_trackers/glx/xlib/glx_usefont.c
+++ b/src/gallium/state_trackers/glx/xlib/glx_usefont.c
@@ -33,6 +33,7 @@
 #include "main/context.h"
 #include "main/imports.h"
 #include <GL/glx.h>
+#include "pipe/p_compiler.h"
 
 
 /* Some debugging info.  */
@@ -210,7 +211,7 @@ isvalid(XFontStruct * fs, unsigned int which)
 }
 
 
-void
+PUBLIC void
 glXUseXFont(Font font, int first, int count, int listbase)
 {
    Display *dpy;
diff --git a/src/gallium/state_trackers/python/SConscript b/src/gallium/state_trackers/python/SConscript
index d4fdd43688..8498a90812 100644
--- a/src/gallium/state_trackers/python/SConscript
+++ b/src/gallium/state_trackers/python/SConscript
@@ -28,14 +28,27 @@ if 'python' in env['statetrackers']:
             'X11',
         ])
 
+    sources = [
+        'gallium.i',
+        'st_device.c',
+        'st_sample.c',
+    ]
+
+    drivers = [
+        trace
+    ]
+
+    if 'llvmpipe' in env['drivers']:
+        env.Tool('llvm')
+        sources += ['st_llvmpipe_winsys.c']
+        drivers += [llvmpipe]
+    else:
+        sources += ['st_softpipe_winsys.c']
+        drivers += [softpipe]
+
     pyst = env.ConvenienceLibrary(
         target = 'pyst',
-        source = [
-            'gallium.i',
-            'st_device.c',
-            'st_sample.c',
-            'st_softpipe_winsys.c',
-        ],
+        source = sources,
     )
 
     env['no_import_lib'] = 1
@@ -45,5 +58,5 @@ if 'python' in env['statetrackers']:
         source = [
             'st_hardpipe_winsys.c',
         ],
-        LIBS = [pyst, softpipe, trace] + gallium + env['LIBS'],
+        LIBS = [pyst] + drivers + gallium + env['LIBS'],
     )
diff --git a/src/gallium/state_trackers/python/p_device.i b/src/gallium/state_trackers/python/p_device.i
index 2dc995adb0..0eba488a07 100644
--- a/src/gallium/state_trackers/python/p_device.i
+++ b/src/gallium/state_trackers/python/p_device.i
@@ -87,6 +87,10 @@ struct st_device {
                             enum pipe_texture_target target,
                             unsigned tex_usage, 
                             unsigned geom_flags ) {
+      /* We can't really display surfaces with the python statetracker so mask
+       * out that usage */
+      tex_usage &= ~PIPE_TEXTURE_USAGE_DISPLAY_TARGET;
+
       return $self->screen->is_format_supported( $self->screen, 
                                                  format, 
                                                  target, 
@@ -110,6 +114,11 @@ struct st_device {
          unsigned tex_usage = 0
       ) {
       struct pipe_texture templat;
+
+      /* We can't really display surfaces with the python statetracker so mask
+       * out that usage */
+      tex_usage &= ~PIPE_TEXTURE_USAGE_DISPLAY_TARGET;
+
       memset(&templat, 0, sizeof(templat));
       templat.format = format;
       templat.width0 = width;
@@ -118,6 +127,7 @@ struct st_device {
       templat.last_level = last_level;
       templat.target = target;
       templat.tex_usage = tex_usage;
+
       return $self->screen->texture_create($self->screen, &templat);
    }
    
diff --git a/src/gallium/state_trackers/python/samples/gs.py b/src/gallium/state_trackers/python/samples/gs.py
index 1ceead5f17..a07cf557f2 100644
--- a/src/gallium/state_trackers/python/samples/gs.py
+++ b/src/gallium/state_trackers/python/samples/gs.py
@@ -136,10 +136,10 @@ def test(dev):
     cbuf = dev.texture_create(
         PIPE_FORMAT_X8R8G8B8_UNORM,
         width, height,
-        tex_usage=PIPE_TEXTURE_USAGE_DISPLAY_TARGET,
+        tex_usage=PIPE_TEXTURE_USAGE_RENDER_TARGET,
     ).get_surface()
     zbuf = dev.texture_create(
-        PIPE_FORMAT_Z16_UNORM,
+        PIPE_FORMAT_Z32_UNORM,
         width, height,
         tex_usage=PIPE_TEXTURE_USAGE_DEPTH_STENCIL,
     ).get_surface()
diff --git a/src/gallium/state_trackers/python/samples/tri.py b/src/gallium/state_trackers/python/samples/tri.py
index af80426dc6..e5e168bdc8 100644
--- a/src/gallium/state_trackers/python/samples/tri.py
+++ b/src/gallium/state_trackers/python/samples/tri.py
@@ -136,10 +136,10 @@ def test(dev):
     cbuf = dev.texture_create(
         PIPE_FORMAT_X8R8G8B8_UNORM, 
         width, height,
-        tex_usage=PIPE_TEXTURE_USAGE_DISPLAY_TARGET,
+        tex_usage=PIPE_TEXTURE_USAGE_RENDER_TARGET,
     ).get_surface()
     zbuf = dev.texture_create(
-        PIPE_FORMAT_Z16_UNORM, 
+        PIPE_FORMAT_Z32_UNORM,
         width, height,
         tex_usage=PIPE_TEXTURE_USAGE_DEPTH_STENCIL,
     ).get_surface()
diff --git a/src/gallium/state_trackers/python/st_llvmpipe_winsys.c b/src/gallium/state_trackers/python/st_llvmpipe_winsys.c
new file mode 100644
index 0000000000..0096b18c99
--- /dev/null
+++ b/src/gallium/state_trackers/python/st_llvmpipe_winsys.c
@@ -0,0 +1,148 @@
+/**************************************************************************
+ * 
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * 
+ **************************************************************************/
+
+/**
+ * @file
+ * Llvmpipe support. 
+ * 
+ * @author Jose Fonseca
+ */
+
+
+#include "pipe/p_format.h"
+#include "pipe/p_context.h"
+#include "pipe/p_inlines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "llvmpipe/lp_winsys.h"
+#include "st_winsys.h"
+
+
+static boolean
+llvmpipe_ws_is_displaytarget_format_supported( struct llvmpipe_winsys *ws,
+      enum pipe_format format )
+{
+   return FALSE;
+}
+
+
+static void *
+llvmpipe_ws_displaytarget_map(struct llvmpipe_winsys *ws,
+                              struct llvmpipe_displaytarget *dt,
+                              unsigned flags )
+{
+   assert(0);
+   return NULL;
+}
+
+
+static void
+llvmpipe_ws_displaytarget_unmap(struct llvmpipe_winsys *ws,
+                                struct llvmpipe_displaytarget *dt )
+{
+   assert(0);
+}
+
+
+static void
+llvmpipe_ws_displaytarget_destroy(struct llvmpipe_winsys *winsys,
+                                  struct llvmpipe_displaytarget *dt)
+{
+   assert(0);
+}
+
+
+static struct llvmpipe_displaytarget *
+llvmpipe_ws_displaytarget_create(struct llvmpipe_winsys *winsys,
+                                 enum pipe_format format,
+                                 unsigned width, unsigned height,
+                                 unsigned alignment,
+                                 unsigned *stride)
+{
+   return NULL;
+}
+
+
+static void
+llvmpipe_ws_displaytarget_display(struct llvmpipe_winsys *winsys,
+                                  struct llvmpipe_displaytarget *dt,
+                                  void *context_private)
+{
+   assert(0);
+}
+
+
+static void
+llvmpipe_ws_destroy(struct llvmpipe_winsys *winsys)
+{
+   FREE(winsys);
+}
+
+
+static struct pipe_screen *
+st_llvmpipe_screen_create(void)
+{
+   static struct llvmpipe_winsys *winsys;
+   struct pipe_screen *screen;
+
+   winsys = CALLOC_STRUCT(llvmpipe_winsys);
+   if (!winsys)
+      goto no_winsys;
+
+   winsys->destroy = llvmpipe_ws_destroy;
+   winsys->is_displaytarget_format_supported = llvmpipe_ws_is_displaytarget_format_supported;
+   winsys->displaytarget_create = llvmpipe_ws_displaytarget_create;
+   winsys->displaytarget_map = llvmpipe_ws_displaytarget_map;
+   winsys->displaytarget_unmap = llvmpipe_ws_displaytarget_unmap;
+   winsys->displaytarget_display = llvmpipe_ws_displaytarget_display;
+   winsys->displaytarget_destroy = llvmpipe_ws_displaytarget_destroy;
+
+   screen = llvmpipe_create_screen(winsys);
+   if (!screen)
+      goto no_screen;
+
+   return screen;
+
+no_screen:
+   FREE(winsys);
+no_winsys:
+   return NULL;
+}
+
+
+static struct pipe_context *
+st_llvmpipe_context_create(struct pipe_screen *screen)
+{
+   return llvmpipe_create(screen);
+}
+
+
+const struct st_winsys st_softpipe_winsys = {
+   &st_llvmpipe_screen_create,
+   &st_llvmpipe_context_create,
+};
diff --git a/src/gallium/state_trackers/python/tests/regress/fragment-shader/fragment-shader.py b/src/gallium/state_trackers/python/tests/regress/fragment-shader/fragment-shader.py
index eed6cdd1e6..8d3bf9d4d7 100644
--- a/src/gallium/state_trackers/python/tests/regress/fragment-shader/fragment-shader.py
+++ b/src/gallium/state_trackers/python/tests/regress/fragment-shader/fragment-shader.py
@@ -114,7 +114,7 @@ def test(dev, name):
     cbuf = dev.texture_create(
         PIPE_FORMAT_X8R8G8B8_UNORM,
         width, height,
-        tex_usage=PIPE_TEXTURE_USAGE_DISPLAY_TARGET,
+        tex_usage=PIPE_TEXTURE_USAGE_RENDER_TARGET,
     ).get_surface()
     fb = Framebuffer()
     fb.width = width
diff --git a/src/gallium/state_trackers/python/tests/regress/vertex-shader/vertex-shader.py b/src/gallium/state_trackers/python/tests/regress/vertex-shader/vertex-shader.py
index 41bebd0604..01bf5a3210 100644
--- a/src/gallium/state_trackers/python/tests/regress/vertex-shader/vertex-shader.py
+++ b/src/gallium/state_trackers/python/tests/regress/vertex-shader/vertex-shader.py
@@ -114,7 +114,7 @@ def test(dev, name):
     cbuf = dev.texture_create(
         PIPE_FORMAT_X8R8G8B8_UNORM,
         width, height,
-        tex_usage=PIPE_TEXTURE_USAGE_DISPLAY_TARGET,
+        tex_usage=PIPE_TEXTURE_USAGE_RENDER_TARGET,
     ).get_surface()
     fb = Framebuffer()
     fb.width = width
diff --git a/src/gallium/state_trackers/vega/image.c b/src/gallium/state_trackers/vega/image.c
index 278ba6d46e..1112ad9839 100644
--- a/src/gallium/state_trackers/vega/image.c
+++ b/src/gallium/state_trackers/vega/image.c
@@ -644,7 +644,7 @@ VGint image_sampler_filter(struct vg_context *ctx)
        return PIPE_TEX_FILTER_NEAREST;
        break;
     case VG_IMAGE_QUALITY_BETTER:
-       /*return PIPE_TEX_FILTER_ANISO;*/
+       /* possibly use anisotropic filtering */
        return PIPE_TEX_FILTER_LINEAR;
        break;
     default:
diff --git a/src/gallium/state_trackers/vega/vg_tracker.h b/src/gallium/state_trackers/vega/vg_tracker.h
index 5457631106..0f0c27f455 100644
--- a/src/gallium/state_trackers/vega/vg_tracker.h
+++ b/src/gallium/state_trackers/vega/vg_tracker.h
@@ -45,15 +45,19 @@ struct pipe_fence_handle;
 struct pipe_surface;
 
 
+PUBLIC
 struct vg_context *st_create_context(struct pipe_context *pipe,
                                      const void *visual,
                                      struct vg_context *share);
 
+PUBLIC
 void st_destroy_context( struct vg_context *st );
 
+PUBLIC
 void st_copy_context_state(struct vg_context *dst, struct vg_context *src,
                            uint mask);
 
+PUBLIC
 struct st_framebuffer *st_create_framebuffer(const void *visual,
                                              enum pipe_format colorFormat,
                                              enum pipe_format depthFormat,
@@ -61,47 +65,63 @@ struct st_framebuffer *st_create_framebuffer(const void *visual,
                                              uint width, uint height,
                                              void *privateData);
 
+PUBLIC
 void st_resize_framebuffer(struct st_framebuffer *stfb,
                            uint width, uint height);
 
+PUBLIC
 void st_set_framebuffer_surface(struct st_framebuffer *stfb,
                                 uint surfIndex, struct pipe_surface *surf);
 
+PUBLIC
 void st_get_framebuffer_dimensions( struct st_framebuffer *stfb,
 				    uint *width, uint *height);
 
+PUBLIC
 int st_bind_texture_surface(struct pipe_surface *ps, int target, int level,
                             enum pipe_format format);
 
+PUBLIC
 int st_unbind_texture_surface(struct pipe_surface *ps, int target, int level);
 
+PUBLIC
 int st_get_framebuffer_surface(struct st_framebuffer *stfb,
                                uint surfIndex, struct pipe_surface **surf);
 
+PUBLIC
 int st_get_framebuffer_texture(struct st_framebuffer *stfb,
                                uint surfIndex, struct pipe_texture **tex);
 
+PUBLIC
 void *st_framebuffer_private(struct st_framebuffer *stfb);
 
+PUBLIC
 void st_unreference_framebuffer(struct st_framebuffer *stfb);
 
+PUBLIC
 void st_make_current(struct vg_context *st,
                      struct st_framebuffer *draw,
                      struct st_framebuffer *read);
 
+PUBLIC
 struct vg_context *st_get_current(void);
 
+PUBLIC
 void st_flush(struct vg_context *st, uint pipeFlushFlags,
                struct pipe_fence_handle **fence);
+PUBLIC
 void st_finish(struct vg_context *st);
 
+PUBLIC
 void st_notify_swapbuffers(struct st_framebuffer *stfb);
+PUBLIC
 void st_notify_swapbuffers_complete(struct st_framebuffer *stfb);
 
 
 /** Generic function type */
 typedef void (*st_proc)();
 
+PUBLIC
 st_proc st_get_proc_address(const char *procname);
 
 #endif
diff --git a/src/gallium/state_trackers/xorg/xorg_crtc.c b/src/gallium/state_trackers/xorg/xorg_crtc.c
index e390ce29ae..650d2c0d1d 100644
--- a/src/gallium/state_trackers/xorg/xorg_crtc.c
+++ b/src/gallium/state_trackers/xorg/xorg_crtc.c
@@ -123,7 +123,8 @@ crtc_set_mode_major(xf86CrtcPtr crtc, DisplayModePtr mode,
     drm_mode.vrefresh = mode->VRefresh;
     if (!mode->name)
 	xf86SetModeDefaultName(mode);
-    strncpy(drm_mode.name, mode->name, DRM_DISPLAY_MODE_LEN);
+    strncpy(drm_mode.name, mode->name, DRM_DISPLAY_MODE_LEN - 1);
+    drm_mode.name[DRM_DISPLAY_MODE_LEN - 1] = '\0';
 
     ret = drmModeSetCrtc(ms->fd, drm_crtc->crtc_id, ms->fb_id, x, y,
 			 &drm_connector->connector_id, 1, &drm_mode);
diff --git a/src/gallium/state_trackers/xorg/xorg_driver.c b/src/gallium/state_trackers/xorg/xorg_driver.c
index 4d169a1d14..b02fe68f31 100644
--- a/src/gallium/state_trackers/xorg/xorg_driver.c
+++ b/src/gallium/state_trackers/xorg/xorg_driver.c
@@ -181,8 +181,7 @@ drv_crtc_resize(ScrnInfoPtr pScrn, int width, int height)
     if (!pScreen->ModifyPixmapHeader(rootPixmap, width, height, -1, -1, -1, NULL))
 	return FALSE;
 
-    /* HW dependent - FIXME */
-    pScrn->displayWidth = pScrn->virtualX;
+    pScrn->displayWidth = rootPixmap->devKind / (rootPixmap->drawable.bitsPerPixel / 8);
 
     /* now create new frontbuffer */
     return ms->create_front_buffer(pScrn) && ms->bind_front_buffer(pScrn);
diff --git a/src/gallium/state_trackers/xorg/xorg_exa.c b/src/gallium/state_trackers/xorg/xorg_exa.c
index aa68570b9c..d9432babf1 100644
--- a/src/gallium/state_trackers/xorg/xorg_exa.c
+++ b/src/gallium/state_trackers/xorg/xorg_exa.c
@@ -811,34 +811,7 @@ xorg_exa_set_shared_usage(PixmapPtr pPixmap)
     return 0;
 }
 
-unsigned
-xorg_exa_get_pixmap_handle(PixmapPtr pPixmap, unsigned *stride_out)
-{
-    ScreenPtr pScreen = pPixmap->drawable.pScreen;
-    ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
-    modesettingPtr ms = modesettingPTR(pScrn);
-    struct exa_pixmap_priv *priv;
-    unsigned handle;
-    unsigned stride;
 
-    if (!ms->exa) {
-	FatalError("NO MS->EXA\n");
-	return 0;
-    }
-
-    priv = exaGetPixmapDriverPrivate(pPixmap);
-
-    if (!priv) {
-	FatalError("NO PIXMAP PRIVATE\n");
-	return 0;
-    }
-
-    ms->api->local_handle_from_texture(ms->api, ms->screen, priv->tex, &stride, &handle);
-    if (stride_out)
-	*stride_out = stride;
-
-    return handle;
-}
 
 static Bool
 size_match( int width, int tex_width )
diff --git a/src/gallium/state_trackers/xorg/xorg_tracker.h b/src/gallium/state_trackers/xorg/xorg_tracker.h
index c0cfbe6061..4d5d4780dc 100644
--- a/src/gallium/state_trackers/xorg/xorg_tracker.h
+++ b/src/gallium/state_trackers/xorg/xorg_tracker.h
@@ -135,9 +135,6 @@ typedef struct _modesettingRec
 struct pipe_texture *
 xorg_exa_get_texture(PixmapPtr pPixmap);
 
-unsigned
-xorg_exa_get_pixmap_handle(PixmapPtr pPixmap, unsigned *stride);
-
 int
 xorg_exa_set_displayed_usage(PixmapPtr pPixmap);
 
diff --git a/src/gallium/winsys/drm/nouveau/drm/nouveau_drm_api.c b/src/gallium/winsys/drm/nouveau/drm/nouveau_drm_api.c
index 7106a06492..e5912ef77f 100644
--- a/src/gallium/winsys/drm/nouveau/drm/nouveau_drm_api.c
+++ b/src/gallium/winsys/drm/nouveau/drm/nouveau_drm_api.c
@@ -87,6 +87,7 @@ nouveau_drm_create_screen(struct drm_api *api, int fd,
 	case 0x60:
 		init = nv40_screen_create;
 		break;
+	case 0x50:
 	case 0x80:
 	case 0x90:
 	case 0xa0:
@@ -164,6 +165,7 @@ nouveau_drm_create_context(struct drm_api *api, struct pipe_screen *pscreen)
 	case 0x60:
 		init = nv40_create;
 		break;
+	case 0x50:
 	case 0x80:
 	case 0x90:
 	case 0xa0:
diff --git a/src/gallium/winsys/drm/radeon/core/radeon_buffer.c b/src/gallium/winsys/drm/radeon/core/radeon_buffer.c
index d2367b245a..385fa857b5 100644
--- a/src/gallium/winsys/drm/radeon/core/radeon_buffer.c
+++ b/src/gallium/winsys/drm/radeon/core/radeon_buffer.c
@@ -146,16 +146,17 @@ static void *radeon_buffer_map(struct pipe_winsys *ws,
         (struct radeon_pipe_buffer*)buffer;
     int write = 0;
 
-    if (radeon_bo_is_referenced_by_cs(radeon_buffer->bo, priv->cs)) {
-        priv->flush_cb(priv->flush_data);
-    }
-
     if (flags & PIPE_BUFFER_USAGE_DONTBLOCK) {
         uint32_t domain;
 
         if (radeon_bo_is_busy(radeon_buffer->bo, &domain))
             return NULL;
     }
+
+    if (radeon_bo_is_referenced_by_cs(radeon_buffer->bo, priv->cs)) {
+        priv->flush_cb(priv->flush_data);
+    }
+
     if (flags & PIPE_BUFFER_USAGE_CPU_WRITE) {
         write = 1;
     }
@@ -280,58 +281,3 @@ struct radeon_winsys* radeon_pipe_winsys(int fd)
 
     return radeon_ws;
 }
-#if 0
-static struct pipe_buffer *radeon_buffer_from_handle(struct radeon_screen *radeon_screen,
-                                                  uint32_t handle)
-{
-    struct radeon_pipe_buffer *radeon_buffer;
-    struct radeon_bo *bo = NULL;
-
-    bo = radeon_bo_open(radeon_screen->bom, handle, 0, 0, 0, 0);
-    if (bo == NULL) {
-        return NULL;
-    }
-    radeon_buffer = calloc(1, sizeof(struct radeon_pipe_buffer));
-    if (radeon_buffer == NULL) {
-        radeon_bo_unref(bo);
-        return NULL;
-    }
-    pipe_reference_init(&radeon_buffer->base.reference, 1);
-    radeon_buffer->base.usage = PIPE_BUFFER_USAGE_PIXEL;
-    radeon_buffer->bo = bo;
-    return &radeon_buffer->base;
-}
-
-struct pipe_surface *radeon_surface_from_handle(struct radeon_context *radeon_context,
-                                             uint32_t handle,
-                                             enum pipe_format format,
-                                             int w, int h, int pitch)
-{
-    struct pipe_screen *pipe_screen = radeon_context->pipe_screen;
-    struct pipe_winsys *pipe_winsys = radeon_context->pipe_winsys;
-    struct pipe_texture tmpl;
-    struct pipe_surface *ps;
-    struct pipe_texture *pt;
-    struct pipe_buffer *pb;
-
-    pb = radeon_buffer_from_handle(radeon_context->radeon_screen, handle);
-    if (pb == NULL) {
-        return NULL;
-    }
-    memset(&tmpl, 0, sizeof(tmpl));
-    tmpl.tex_usage = PIPE_TEXTURE_USAGE_DISPLAY_TARGET;
-    tmpl.target = PIPE_TEXTURE_2D;
-    tmpl.width0 = w;
-    tmpl.height0 = h;
-    tmpl.depth0 = 1;
-    tmpl.format = format;
-
-    pt = pipe_screen->texture_blanket(pipe_screen, &tmpl, &pitch, pb);
-    if (pt == NULL) {
-        pipe_buffer_reference(&pb, NULL);
-    }
-    ps = pipe_screen->get_tex_surface(pipe_screen, pt, 0, 0, 0,
-                                      PIPE_BUFFER_USAGE_GPU_WRITE);
-    return ps;
-}
-#endif
diff --git a/src/gallium/winsys/drm/vmware/xorg/SConscript b/src/gallium/winsys/drm/vmware/xorg/SConscript
index f7ce400a7a..1e5d8ff7fe 100644
--- a/src/gallium/winsys/drm/vmware/xorg/SConscript
+++ b/src/gallium/winsys/drm/vmware/xorg/SConscript
@@ -44,6 +44,7 @@ if env['platform'] == 'linux':
 	sources = [
 		'vmw_ioctl.c',
 		'vmw_screen.c',
+		'vmw_video.c',
 		'vmw_xorg.c',
 	]
 
diff --git a/src/gallium/winsys/egl_xlib/egl_xlib.c b/src/gallium/winsys/egl_xlib/egl_xlib.c
index 599973ce12..420dccc92c 100644
--- a/src/gallium/winsys/egl_xlib/egl_xlib.c
+++ b/src/gallium/winsys/egl_xlib/egl_xlib.c
@@ -751,24 +751,18 @@ xlib_eglReleaseTexImage(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surface,
 static EGLBoolean
 xlib_eglSwapBuffers(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *draw)
 {
-   /* error checking step: */
-   if (!_eglSwapBuffers(drv, dpy, draw))
-      return EGL_FALSE;
-
-   {
-      struct xlib_egl_surface *xsurf = lookup_surface(draw);
-      struct pipe_winsys *pws = xsurf->winsys;
-      struct pipe_surface *psurf;
+   struct xlib_egl_surface *xsurf = lookup_surface(draw);
+   struct pipe_winsys *pws = xsurf->winsys;
+   struct pipe_surface *psurf;
 
-      st_get_framebuffer_surface(xsurf->Framebuffer, ST_SURFACE_BACK_LEFT,
-                                 &psurf);
+   st_get_framebuffer_surface(xsurf->Framebuffer, ST_SURFACE_BACK_LEFT,
+         &psurf);
 
-      st_notify_swapbuffers(xsurf->Framebuffer);
+   st_notify_swapbuffers(xsurf->Framebuffer);
 
-      display_surface(pws, psurf, xsurf);
+   display_surface(pws, psurf, xsurf);
 
-      check_and_update_buffer_size(xsurf);
-   }
+   check_and_update_buffer_size(xsurf);
 
    return EGL_TRUE;
 }
diff --git a/src/gallium/winsys/xlib/SConscript b/src/gallium/winsys/xlib/SConscript
index 713841aeb1..a4dabb7804 100644
--- a/src/gallium/winsys/xlib/SConscript
+++ b/src/gallium/winsys/xlib/SConscript
@@ -3,50 +3,66 @@
 
 Import('*')
 
-if env['platform'] == 'linux' \
-        and 'mesa' in env['statetrackers'] \
-        and set(('softpipe', 'llvmpipe', 'i915', 'trace')).intersection(env['drivers']) \
-        and not env['dri']:
+if env['platform'] != 'linux':
+    Return()
 
-    env = env.Clone()
+if 'mesa' not in env['statetrackers']:
+    print 'warning: Mesa state tracker disabled: skipping build of xlib libGL.so'
+    Return()
 
-    env.Append(CPPPATH = [
-        '#/src/mesa',
-        '#/src/mesa/main',
-        '#src/gallium/state_trackers/glx/xlib',
-    ])
+if env['dri']:
+    print 'warning: DRI enabled: skipping build of xlib libGL.so'
+    Return()
 
-    env.Append(CPPDEFINES = ['USE_XSHM'])
+if 'trace' not in env['drivers']:
+    print 'warning: trace pipe driver disabled: skipping build of xlib libGL.so'
+    Return()
 
-    sources = [
-        'xlib.c',
-    ]
+if not set(('softpipe', 'llvmpipe', 'trace')).intersection(env['drivers']):
+    print 'warning: no supported pipe driver: skipping build of xlib libGL.so'
+    Return()
 
-    drivers = [trace]
-        
-    if 'softpipe' in env['drivers']:
-        env.Append(CPPDEFINES = 'GALLIUM_SOFTPIPE')
-        sources += ['xlib_softpipe.c']
-        drivers += [softpipe]
+env = env.Clone()
 
-    if 'llvmpipe' in env['drivers']:
-        env.Tool('llvm')
-        if 'LLVM_VERSION' in env:
-            env.Append(CPPDEFINES = 'GALLIUM_LLVMPIPE')
-            env.Tool('udis86')
-            sources += ['xlib_llvmpipe.c']
-            drivers += [llvmpipe]
-        
-    if 'cell' in env['drivers']:
-        env.Append(CPPDEFINES = 'GALLIUM_CELL')
-        sources += ['xlib_cell.c']
-        drivers += [cell]
+env.Append(CPPPATH = [
+    '#/src/mesa',
+    '#/src/mesa/main',
+    '#src/gallium/state_trackers/glx/xlib',
+])
 
-    # TODO: write a wrapper function http://www.scons.org/wiki/WrapperFunctions
-    libgl = env.SharedLibrary(
-        target ='GL',
-        source = sources,
-        LIBS = st_xlib + glapi + mesa + glsl + drivers + gallium + env['LIBS'],
-    )
+env.Append(CPPDEFINES = ['USE_XSHM'])
 
+sources = [
+    'xlib.c',
+]
+
+drivers = [trace]
+    
+if 'softpipe' in env['drivers']:
+    env.Append(CPPDEFINES = 'GALLIUM_SOFTPIPE')
+    sources += ['xlib_softpipe.c']
+    drivers += [softpipe]
+
+if 'llvmpipe' in env['drivers']:
+    env.Tool('llvm')
+    if 'LLVM_VERSION' in env:
+        env.Append(CPPDEFINES = 'GALLIUM_LLVMPIPE')
+        env.Tool('udis86')
+        sources += ['xlib_llvmpipe.c']
+        drivers += [llvmpipe]
+    
+if 'cell' in env['drivers']:
+    env.Append(CPPDEFINES = 'GALLIUM_CELL')
+    sources += ['xlib_cell.c']
+    drivers += [cell]
+
+# TODO: write a wrapper function http://www.scons.org/wiki/WrapperFunctions
+libgl = env.SharedLibrary(
+    target ='GL',
+    source = sources,
+    LIBS = st_xlib + glapi + mesa + glsl + drivers + gallium + env['LIBS'],
+)
+
+if not env['dri']:
+    # Only install this libGL.so if DRI not enabled
     env.InstallSharedLibrary(libgl, version=(1, 5))