23 files changed, 1822 insertions, 1194 deletions
diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
index 5cafe8c3f0..8f7d3b7100 100644
--- a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
+++ b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
@@ -552,7 +552,7 @@ translate_instruction(llvm::Module *module,
       break;
    case TGSI_OPCODE_SHL:
       break;
-   case TGSI_OPCODE_SHR:
+   case TGSI_OPCODE_ISHR:
       break;
    case TGSI_OPCODE_AND:
       break;
@@ -919,7 +919,7 @@ translate_instructionir(llvm::Module *module,
       break;
    case TGSI_OPCODE_SHL:
       break;
-   case TGSI_OPCODE_SHR:
+   case TGSI_OPCODE_ISHR:
       break;
    case TGSI_OPCODE_AND:
       break;
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
index a9375abd21..ba6f7b15f9 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
@@ -80,7 +80,7 @@ struct fenced_buffer_list
  */
 struct fenced_buffer
 {
-   /* 
+   /*
     * Immutable members.
     */
 
@@ -126,8 +126,8 @@ fenced_buffer(struct pb_buffer *buf)
 /**
  * Add the buffer to the fenced list.
  * 
- * fenced_buffer_list::mutex and fenced_buffer::mutex must be held, in this 
- * order before calling this function.
+ * fenced_buffer_list::mutex and fenced_buffer::mutex must be held, in this
+ * order, before calling this function.
  * 
  * Reference count should be incremented before calling this function.
  */
@@ -191,7 +191,7 @@ fenced_buffer_remove_locked(struct fenced_buffer_list *fenced_list,
  * Wait for the fence to expire, and remove it from the fenced list.
  * 
  * fenced_buffer::mutex must be held. fenced_buffer_list::mutex must not be 
- * held -- it will
+ * held -- it will be acquired internally.
  */
 static INLINE enum pipe_error
 fenced_buffer_finish_locked(struct fenced_buffer_list *fenced_list,
@@ -207,7 +207,10 @@ fenced_buffer_finish_locked(struct fenced_buffer_list *fenced_list,
    assert(pipe_is_referenced(&fenced_buf->base.base.reference));
    assert(fenced_buf->fence);
 
-   /* Acquire the global lock */
+   /*
+    * Acquire the global lock. Must release buffer mutex first to preserve
+    * lock order.
+    */
    pipe_mutex_unlock(fenced_buf->mutex);
    pipe_mutex_lock(fenced_list->mutex);
    pipe_mutex_lock(fenced_buf->mutex);
@@ -217,7 +220,7 @@ fenced_buffer_finish_locked(struct fenced_buffer_list *fenced_list,
          /* Remove from the fenced list */
          /* TODO: remove consequents */
          fenced_buffer_remove_locked(fenced_list, fenced_buf);
-         
+
          p_atomic_dec(&fenced_buf->base.base.reference.count);
          assert(pipe_is_referenced(&fenced_buf->base.base.reference));
 
@@ -238,7 +241,7 @@ fenced_buffer_finish_locked(struct fenced_buffer_list *fenced_list,
  */
 static void
 fenced_buffer_list_check_free_locked(struct fenced_buffer_list *fenced_list, 
-                               int wait)
+                                     int wait)
 {
    struct pb_fence_ops *ops = fenced_list->ops;
    struct list_head *curr, *next;
@@ -274,7 +277,6 @@ fenced_buffer_list_check_free_locked(struct fenced_buffer_list *fenced_list,
 
       pb_buf = &fenced_buf->base;
       pb_reference(&pb_buf, NULL);
-      
 
       curr = next; 
       next = curr->next;
@@ -329,7 +331,7 @@ fenced_buffer_map(struct pb_buffer *buf,
       if((flags & PIPE_BUFFER_USAGE_DONTBLOCK) &&
           ops->fence_signalled(ops, fenced_buf->fence, 0) == 0) {
          /* Don't wait for the GPU to finish writing */
-         goto finish;
+         goto done;
       }
 
       /* Wait for the GPU to finish writing */
@@ -350,7 +352,7 @@ fenced_buffer_map(struct pb_buffer *buf,
       fenced_buf->flags |= flags & PIPE_BUFFER_USAGE_CPU_READ_WRITE;
    }
 
-finish:
+done:
    pipe_mutex_unlock(fenced_buf->mutex);
    
    return map;
@@ -391,7 +393,7 @@ fenced_buffer_validate(struct pb_buffer *buf,
       fenced_buf->vl = NULL;
       fenced_buf->validation_flags = 0;
       ret = PIPE_OK;
-      goto finish;
+      goto done;
    }
    
    assert(flags & PIPE_BUFFER_USAGE_GPU_READ_WRITE);
@@ -401,7 +403,7 @@ fenced_buffer_validate(struct pb_buffer *buf,
    /* Buffer cannot be validated in two different lists */ 
    if(fenced_buf->vl && fenced_buf->vl != vl) {
       ret = PIPE_ERROR_RETRY;
-      goto finish;
+      goto done;
    }
    
 #if 0
@@ -409,7 +411,7 @@ fenced_buffer_validate(struct pb_buffer *buf,
    if(fenced_buf->flags & PIPE_BUFFER_USAGE_CPU_READ_WRITE) {
       /* TODO: wait for the thread that mapped the buffer to unmap it */
       ret = PIPE_ERROR_RETRY;
-      goto finish;
+      goto done;
    }
    /* Final sanity checking */
    assert(!(fenced_buf->flags & PIPE_BUFFER_USAGE_CPU_READ_WRITE));
@@ -420,17 +422,17 @@ fenced_buffer_validate(struct pb_buffer *buf,
       (fenced_buf->validation_flags & flags) == flags) {
       /* Nothing to do -- buffer already validated */
       ret = PIPE_OK;
-      goto finish;
+      goto done;
    }
    
    ret = pb_validate(fenced_buf->buffer, vl, flags);
    if (ret != PIPE_OK)
-      goto finish;
+      goto done;
    
    fenced_buf->vl = vl;
    fenced_buf->validation_flags |= flags;
    
-finish:
+done:
    pipe_mutex_unlock(fenced_buf->mutex);
 
    return ret;
diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
index 01811d5011..ffed768f97 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_execmem.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
@@ -41,6 +41,12 @@
 #define MAP_ANONYMOUS MAP_ANON
 #endif
 
+#if defined(PIPE_OS_WINDOWS)
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN 1
+#endif
+#include <windows.h>
+#endif
 
 #if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS)
 
@@ -118,7 +124,29 @@ rtasm_exec_free(void *addr)
 }
 
 
-#else /* PIPE_OS_LINUX || PIPE_OS_BSD || PIPE_OS_SOLARIS */
+#elif defined(PIPE_OS_WINDOWS)
+
+
+/*
+ * Avoid Data Execution Prevention.
+ */
+
+void *
+rtasm_exec_malloc(size_t size)
+{
+   return VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
+}
+
+
+void
+rtasm_exec_free(void *addr)
+{
+   VirtualFree(addr, 0, MEM_RELEASE);
+}
+
+
+#else
+
 
 /*
  * Just use regular memory.
@@ -138,4 +166,4 @@ rtasm_exec_free(void *addr)
 }
 
 
-#endif /* PIPE_OS_LINUX || PIPE_OS_BSD || PIPE_OS_SOLARIS */
+#endif
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index 2c65ff16d8..e2e5394f86 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -128,7 +128,9 @@ static const char *semantic_names[] =
 
 static const char *immediate_type_names[] =
 {
-   "FLT32"
+   "FLT32",
+   "UINT32",
+   "INT32"
 };
 
 static const char *swizzle_names[] =
@@ -412,6 +414,12 @@ iter_immediate(
       case TGSI_IMM_FLOAT32:
          FLT( imm->u[i].Float );
          break;
+      case TGSI_IMM_UINT32:
+         UID(imm->u[i].Uint);
+         break;
+      case TGSI_IMM_INT32:
+         SID(imm->u[i].Int);
+         break;
       default:
          assert( 0 );
       }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index ba89f2fbc3..2bcb33392a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -2,6 +2,7 @@
  * 
  * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
+ * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
  * 
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
@@ -60,6 +61,7 @@
 #include "util/u_memory.h"
 #include "util/u_math.h"
 
+
 #define FAST_MATH 1
 
 #define TILE_TOP_LEFT     0
@@ -67,11 +69,329 @@
 #define TILE_BOTTOM_LEFT  2
 #define TILE_BOTTOM_RIGHT 3
 
+static void
+micro_abs(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = fabsf(src->f[0]);
+   dst->f[1] = fabsf(src->f[1]);
+   dst->f[2] = fabsf(src->f[2]);
+   dst->f[3] = fabsf(src->f[3]);
+}
+
+static void
+micro_arl(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->i[0] = (int)floorf(src->f[0]);
+   dst->i[1] = (int)floorf(src->f[1]);
+   dst->i[2] = (int)floorf(src->f[2]);
+   dst->i[3] = (int)floorf(src->f[3]);
+}
+
+static void
+micro_arr(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->i[0] = (int)floorf(src->f[0] + 0.5f);
+   dst->i[1] = (int)floorf(src->f[1] + 0.5f);
+   dst->i[2] = (int)floorf(src->f[2] + 0.5f);
+   dst->i[3] = (int)floorf(src->f[3] + 0.5f);
+}
+
+static void
+micro_ceil(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->f[0] = ceilf(src->f[0]);
+   dst->f[1] = ceilf(src->f[1]);
+   dst->f[2] = ceilf(src->f[2]);
+   dst->f[3] = ceilf(src->f[3]);
+}
+
+static void
+micro_cos(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = cosf(src->f[0]);
+   dst->f[1] = cosf(src->f[1]);
+   dst->f[2] = cosf(src->f[2]);
+   dst->f[3] = cosf(src->f[3]);
+}
+
+static void
+micro_ddx(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] =
+   dst->f[1] =
+   dst->f[2] =
+   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
+}
+
+static void
+micro_ddy(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] =
+   dst->f[1] =
+   dst->f[2] =
+   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
+}
+
+static void
+micro_exp2(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+#if FAST_MATH
+   dst->f[0] = util_fast_exp2(src->f[0]);
+   dst->f[1] = util_fast_exp2(src->f[1]);
+   dst->f[2] = util_fast_exp2(src->f[2]);
+   dst->f[3] = util_fast_exp2(src->f[3]);
+#else
+#if DEBUG
+   /* Inf is okay for this instruction, so clamp it to silence assertions. */
+   uint i;
+   union tgsi_exec_channel clamped;
+
+   for (i = 0; i < 4; i++) {
+      if (src->f[i] > 127.99999f) {
+         clamped.f[i] = 127.99999f;
+      } else if (src->f[i] < -126.99999f) {
+         clamped.f[i] = -126.99999f;
+      } else {
+         clamped.f[i] = src->f[i];
+      }
+   }
+   src = &clamped;
+#endif /* DEBUG */
+
+   dst->f[0] = powf(2.0f, src->f[0]);
+   dst->f[1] = powf(2.0f, src->f[1]);
+   dst->f[2] = powf(2.0f, src->f[2]);
+   dst->f[3] = powf(2.0f, src->f[3]);
+#endif /* FAST_MATH */
+}
+
+static void
+micro_flr(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = floorf(src->f[0]);
+   dst->f[1] = floorf(src->f[1]);
+   dst->f[2] = floorf(src->f[2]);
+   dst->f[3] = floorf(src->f[3]);
+}
+
+static void
+micro_frc(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = src->f[0] - floorf(src->f[0]);
+   dst->f[1] = src->f[1] - floorf(src->f[1]);
+   dst->f[2] = src->f[2] - floorf(src->f[2]);
+   dst->f[3] = src->f[3] - floorf(src->f[3]);
+}
+
+static void
+micro_iabs(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
+   dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
+   dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
+   dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
+}
+
+static void
+micro_ineg(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->i[0] = -src->i[0];
+   dst->i[1] = -src->i[1];
+   dst->i[2] = -src->i[2];
+   dst->i[3] = -src->i[3];
+}
+
+static void
+micro_lg2(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+#if FAST_MATH
+   dst->f[0] = util_fast_log2(src->f[0]);
+   dst->f[1] = util_fast_log2(src->f[1]);
+   dst->f[2] = util_fast_log2(src->f[2]);
+   dst->f[3] = util_fast_log2(src->f[3]);
+#else
+   dst->f[0] = logf(src->f[0]) * 1.442695f;
+   dst->f[1] = logf(src->f[1]) * 1.442695f;
+   dst->f[2] = logf(src->f[2]) * 1.442695f;
+   dst->f[3] = logf(src->f[3]) * 1.442695f;
+#endif
+}
+
+static void
+micro_lrp(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = src[0].f[0] * (src[1].f[0] - src[2].f[0]) + src[2].f[0];
+   dst->f[1] = src[0].f[1] * (src[1].f[1] - src[2].f[1]) + src[2].f[1];
+   dst->f[2] = src[0].f[2] * (src[1].f[2] - src[2].f[2]) + src[2].f[2];
+   dst->f[3] = src[0].f[3] * (src[1].f[3] - src[2].f[3]) + src[2].f[3];
+}
+
+static void
+micro_mad(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = src[0].f[0] * src[1].f[0] + src[2].f[0];
+   dst->f[1] = src[0].f[1] * src[1].f[1] + src[2].f[1];
+   dst->f[2] = src[0].f[2] * src[1].f[2] + src[2].f[2];
+   dst->f[3] = src[0].f[3] * src[1].f[3] + src[2].f[3];
+}
+
+static void
+micro_mov(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src->u[0];
+   dst->u[1] = src->u[1];
+   dst->u[2] = src->u[2];
+   dst->u[3] = src->u[3];
+}
+
+static void
+micro_rcp(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = 1.0f / src->f[0];
+   dst->f[1] = 1.0f / src->f[1];
+   dst->f[2] = 1.0f / src->f[2];
+   dst->f[3] = 1.0f / src->f[3];
+}
+
+static void
+micro_rnd(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = floorf(src->f[0] + 0.5f);
+   dst->f[1] = floorf(src->f[1] + 0.5f);
+   dst->f[2] = floorf(src->f[2] + 0.5f);
+   dst->f[3] = floorf(src->f[3] + 0.5f);
+}
+
+static void
+micro_rsq(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = 1.0f / sqrtf(fabsf(src->f[0]));
+   dst->f[1] = 1.0f / sqrtf(fabsf(src->f[1]));
+   dst->f[2] = 1.0f / sqrtf(fabsf(src->f[2]));
+   dst->f[3] = 1.0f / sqrtf(fabsf(src->f[3]));
+}
+
+static void
+micro_seq(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = src[0].f[0] == src[1].f[0] ? 1.0f : 0.0f;
+   dst->f[1] = src[0].f[1] == src[1].f[1] ? 1.0f : 0.0f;
+   dst->f[2] = src[0].f[2] == src[1].f[2] ? 1.0f : 0.0f;
+   dst->f[3] = src[0].f[3] == src[1].f[3] ? 1.0f : 0.0f;
+}
+
+static void
+micro_sge(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = src[0].f[0] >= src[1].f[0] ? 1.0f : 0.0f;
+   dst->f[1] = src[0].f[1] >= src[1].f[1] ? 1.0f : 0.0f;
+   dst->f[2] = src[0].f[2] >= src[1].f[2] ? 1.0f : 0.0f;
+   dst->f[3] = src[0].f[3] >= src[1].f[3] ? 1.0f : 0.0f;
+}
+
+static void
+micro_sgn(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
+   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
+   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
+   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
+}
+
+static void
+micro_sgt(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = src[0].f[0] > src[1].f[0] ? 1.0f : 0.0f;
+   dst->f[1] = src[0].f[1] > src[1].f[1] ? 1.0f : 0.0f;
+   dst->f[2] = src[0].f[2] > src[1].f[2] ? 1.0f : 0.0f;
+   dst->f[3] = src[0].f[3] > src[1].f[3] ? 1.0f : 0.0f;
+}
+
+static void
+micro_sin(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = sinf(src->f[0]);
+   dst->f[1] = sinf(src->f[1]);
+   dst->f[2] = sinf(src->f[2]);
+   dst->f[3] = sinf(src->f[3]);
+}
+
+static void
+micro_sle(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = src[0].f[0] <= src[1].f[0] ? 1.0f : 0.0f;
+   dst->f[1] = src[0].f[1] <= src[1].f[1] ? 1.0f : 0.0f;
+   dst->f[2] = src[0].f[2] <= src[1].f[2] ? 1.0f : 0.0f;
+   dst->f[3] = src[0].f[3] <= src[1].f[3] ? 1.0f : 0.0f;
+}
+
+static void
+micro_slt(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = src[0].f[0] < src[1].f[0] ? 1.0f : 0.0f;
+   dst->f[1] = src[0].f[1] < src[1].f[1] ? 1.0f : 0.0f;
+   dst->f[2] = src[0].f[2] < src[1].f[2] ? 1.0f : 0.0f;
+   dst->f[3] = src[0].f[3] < src[1].f[3] ? 1.0f : 0.0f;
+}
+
+static void
+micro_sne(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = src[0].f[0] != src[1].f[0] ? 1.0f : 0.0f;
+   dst->f[1] = src[0].f[1] != src[1].f[1] ? 1.0f : 0.0f;
+   dst->f[2] = src[0].f[2] != src[1].f[2] ? 1.0f : 0.0f;
+   dst->f[3] = src[0].f[3] != src[1].f[3] ? 1.0f : 0.0f;
+}
+
+static void
+micro_trunc(union tgsi_exec_channel *dst,
+            const union tgsi_exec_channel *src)
+{
+   dst->f[0] = (float)(int)src->f[0];
+   dst->f[1] = (float)(int)src->f[1];
+   dst->f[2] = (float)(int)src->f[2];
+   dst->f[3] = (float)(int)src->f[3];
+}
+
+
 #define CHAN_X  0
 #define CHAN_Y  1
 #define CHAN_Z  2
 #define CHAN_W  3
 
+enum tgsi_exec_datatype {
+   TGSI_EXEC_DATA_FLOAT,
+   TGSI_EXEC_DATA_INT,
+   TGSI_EXEC_DATA_UINT
+};
+
 /*
  * Shorthand locations of various utility registers (_I = Index, _C = Channel)
  */
@@ -123,23 +443,19 @@
 
 /** The execution mask depends on the conditional mask and the loop mask */
 #define UPDATE_EXEC_MASK(MACH) \
-      MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask
+      MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->Switch.mask & MACH->FuncMask
 
 
 static const union tgsi_exec_channel ZeroVec =
    { { 0.0, 0.0, 0.0, 0.0 } };
 
 
-#ifdef DEBUG
-static void
-check_inf_or_nan(const union tgsi_exec_channel *chan)
-{
-   assert(!util_is_inf_or_nan(chan->f[0]));
-   assert(!util_is_inf_or_nan(chan->f[1]));
-   assert(!util_is_inf_or_nan(chan->f[2]));
-   assert(!util_is_inf_or_nan(chan->f[3]));
-}
-#endif
+#define CHECK_INF_OR_NAN(chan) do {\
+      assert(!util_is_inf_or_nan((chan)->f[0]));\
+      assert(!util_is_inf_or_nan((chan)->f[1]));\
+      assert(!util_is_inf_or_nan((chan)->f[2]));\
+      assert(!util_is_inf_or_nan((chan)->f[3]));\
+   } while (0)
 
 
 #ifdef DEBUG
@@ -422,18 +738,6 @@ tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
    align_free(mach);
 }
 
-
-static void
-micro_abs(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = fabsf( src->f[0] );
-   dst->f[1] = fabsf( src->f[1] );
-   dst->f[2] = fabsf( src->f[2] );
-   dst->f[3] = fabsf( src->f[3] );
-}
-
 static void
 micro_add(
    union tgsi_exec_channel *dst,
@@ -446,76 +750,6 @@ micro_add(
    dst->f[3] = src0->f[3] + src1->f[3];
 }
 
-#if 0
-static void
-micro_iadd(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] + src1->i[0];
-   dst->i[1] = src0->i[1] + src1->i[1];
-   dst->i[2] = src0->i[2] + src1->i[2];
-   dst->i[3] = src0->i[3] + src1->i[3];
-}
-#endif
-
-static void
-micro_and(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] & src1->u[0];
-   dst->u[1] = src0->u[1] & src1->u[1];
-   dst->u[2] = src0->u[2] & src1->u[2];
-   dst->u[3] = src0->u[3] & src1->u[3];
-}
-
-static void
-micro_ceil(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = ceilf( src->f[0] );
-   dst->f[1] = ceilf( src->f[1] );
-   dst->f[2] = ceilf( src->f[2] );
-   dst->f[3] = ceilf( src->f[3] );
-}
-
-static void
-micro_cos(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = cosf( src->f[0] );
-   dst->f[1] = cosf( src->f[1] );
-   dst->f[2] = cosf( src->f[2] );
-   dst->f[3] = cosf( src->f[3] );
-}
-
-static void
-micro_ddx(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] =
-   dst->f[1] =
-   dst->f[2] =
-   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
-}
-
-static void
-micro_ddy(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] =
-   dst->f[1] =
-   dst->f[2] =
-   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
-}
-
 static void
 micro_div(
    union tgsi_exec_channel *dst,
@@ -536,99 +770,6 @@ micro_div(
    }
 }
 
-#if 0
-static void
-micro_udiv(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] / src1->u[0];
-   dst->u[1] = src0->u[1] / src1->u[1];
-   dst->u[2] = src0->u[2] / src1->u[2];
-   dst->u[3] = src0->u[3] / src1->u[3];
-}
-#endif
-
-static void
-micro_eq(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1,
-   const union tgsi_exec_channel *src2,
-   const union tgsi_exec_channel *src3 )
-{
-   dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0];
-   dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1];
-   dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2];
-   dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
-}
-
-#if 0
-static void
-micro_ieq(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1,
-   const union tgsi_exec_channel *src2,
-   const union tgsi_exec_channel *src3 )
-{
-   dst->i[0] = src0->i[0] == src1->i[0] ? src2->i[0] : src3->i[0];
-   dst->i[1] = src0->i[1] == src1->i[1] ? src2->i[1] : src3->i[1];
-   dst->i[2] = src0->i[2] == src1->i[2] ? src2->i[2] : src3->i[2];
-   dst->i[3] = src0->i[3] == src1->i[3] ? src2->i[3] : src3->i[3];
-}
-#endif
-
-static void
-micro_exp2(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src)
-{
-#if FAST_MATH
-   dst->f[0] = util_fast_exp2( src->f[0] );
-   dst->f[1] = util_fast_exp2( src->f[1] );
-   dst->f[2] = util_fast_exp2( src->f[2] );
-   dst->f[3] = util_fast_exp2( src->f[3] );
-#else
-
-#if DEBUG
-   /* Inf is okay for this instruction, so clamp it to silence assertions. */
-   uint i;
-   union tgsi_exec_channel clamped;
-
-   for (i = 0; i < 4; i++) {
-      if (src->f[i] > 127.99999f) {
-         clamped.f[i] = 127.99999f;
-      } else if (src->f[i] < -126.99999f) {
-         clamped.f[i] = -126.99999f;
-      } else {
-         clamped.f[i] = src->f[i];
-      }
-   }
-   src = &clamped;
-#endif
-
-   dst->f[0] = powf( 2.0f, src->f[0] );
-   dst->f[1] = powf( 2.0f, src->f[1] );
-   dst->f[2] = powf( 2.0f, src->f[2] );
-   dst->f[3] = powf( 2.0f, src->f[3] );
-#endif
-}
-
-#if 0
-static void
-micro_f2ut(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->u[0] = (uint) src->f[0];
-   dst->u[1] = (uint) src->f[1];
-   dst->u[2] = (uint) src->f[2];
-   dst->u[3] = (uint) src->f[3];
-}
-#endif
-
 static void
 micro_float_clamp(union tgsi_exec_channel *dst,
                   const union tgsi_exec_channel *src)
@@ -656,71 +797,6 @@ micro_float_clamp(union tgsi_exec_channel *dst,
 }
 
 static void
-micro_flr(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = floorf( src->f[0] );
-   dst->f[1] = floorf( src->f[1] );
-   dst->f[2] = floorf( src->f[2] );
-   dst->f[3] = floorf( src->f[3] );
-}
-
-static void
-micro_frc(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = src->f[0] - floorf( src->f[0] );
-   dst->f[1] = src->f[1] - floorf( src->f[1] );
-   dst->f[2] = src->f[2] - floorf( src->f[2] );
-   dst->f[3] = src->f[3] - floorf( src->f[3] );
-}
-
-static void
-micro_i2f(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = (float) src->i[0];
-   dst->f[1] = (float) src->i[1];
-   dst->f[2] = (float) src->i[2];
-   dst->f[3] = (float) src->i[3];
-}
-
-static void
-micro_lg2(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-#if FAST_MATH
-   dst->f[0] = util_fast_log2( src->f[0] );
-   dst->f[1] = util_fast_log2( src->f[1] );
-   dst->f[2] = util_fast_log2( src->f[2] );
-   dst->f[3] = util_fast_log2( src->f[3] );
-#else
-   dst->f[0] = logf( src->f[0] ) * 1.442695f;
-   dst->f[1] = logf( src->f[1] ) * 1.442695f;
-   dst->f[2] = logf( src->f[2] ) * 1.442695f;
-   dst->f[3] = logf( src->f[3] ) * 1.442695f;
-#endif
-}
-
-static void
-micro_le(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1,
-   const union tgsi_exec_channel *src2,
-   const union tgsi_exec_channel *src3 )
-{
-   dst->f[0] = src0->f[0] <= src1->f[0] ? src2->f[0] : src3->f[0];
-   dst->f[1] = src0->f[1] <= src1->f[1] ? src2->f[1] : src3->f[1];
-   dst->f[2] = src0->f[2] <= src1->f[2] ? src2->f[2] : src3->f[2];
-   dst->f[3] = src0->f[3] <= src1->f[3] ? src2->f[3] : src3->f[3];
-}
-
-static void
 micro_lt(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src0,
@@ -734,38 +810,6 @@ micro_lt(
    dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
 }
 
-#if 0
-static void
-micro_ilt(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1,
-   const union tgsi_exec_channel *src2,
-   const union tgsi_exec_channel *src3 )
-{
-   dst->i[0] = src0->i[0] < src1->i[0] ? src2->i[0] : src3->i[0];
-   dst->i[1] = src0->i[1] < src1->i[1] ? src2->i[1] : src3->i[1];
-   dst->i[2] = src0->i[2] < src1->i[2] ? src2->i[2] : src3->i[2];
-   dst->i[3] = src0->i[3] < src1->i[3] ? src2->i[3] : src3->i[3];
-}
-#endif
-
-#if 0
-static void
-micro_ult(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1,
-   const union tgsi_exec_channel *src2,
-   const union tgsi_exec_channel *src3 )
-{
-   dst->u[0] = src0->u[0] < src1->u[0] ? src2->u[0] : src3->u[0];
-   dst->u[1] = src0->u[1] < src1->u[1] ? src2->u[1] : src3->u[1];
-   dst->u[2] = src0->u[2] < src1->u[2] ? src2->u[2] : src3->u[2];
-   dst->u[3] = src0->u[3] < src1->u[3] ? src2->u[3] : src3->u[3];
-}
-#endif
-
 static void
 micro_max(
    union tgsi_exec_channel *dst,
@@ -778,34 +822,6 @@ micro_max(
    dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
 }
 
-#if 0
-static void
-micro_imax(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
-   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
-   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
-   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
-}
-#endif
-
-#if 0
-static void
-micro_umax(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
-   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
-   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
-   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
-}
-#endif
-
 static void
 micro_min(
    union tgsi_exec_channel *dst,
@@ -818,48 +834,6 @@ micro_min(
    dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
 }
 
-#if 0
-static void
-micro_imin(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
-   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
-   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
-   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
-}
-#endif
-
-#if 0
-static void
-micro_umin(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
-   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
-   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
-   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
-}
-#endif
-
-#if 0
-static void
-micro_umod(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] % src1->u[0];
-   dst->u[1] = src0->u[1] % src1->u[1];
-   dst->u[2] = src0->u[2] % src1->u[2];
-   dst->u[3] = src0->u[3] % src1->u[3];
-}
-#endif
-
 static void
 micro_mul(
    union tgsi_exec_channel *dst,
@@ -874,20 +848,6 @@ micro_mul(
 
 #if 0
 static void
-micro_imul(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] * src1->i[0];
-   dst->i[1] = src0->i[1] * src1->i[1];
-   dst->i[2] = src0->i[2] * src1->i[2];
-   dst->i[3] = src0->i[3] * src1->i[3];
-}
-#endif
-
-#if 0
-static void
 micro_imul64(
    union tgsi_exec_channel *dst0,
    union tgsi_exec_channel *dst1,
@@ -951,42 +911,6 @@ micro_neg(
    dst->f[3] = -src->f[3];
 }
 
-#if 0
-static void
-micro_ineg(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->i[0] = -src->i[0];
-   dst->i[1] = -src->i[1];
-   dst->i[2] = -src->i[2];
-   dst->i[3] = -src->i[3];
-}
-#endif
-
-static void
-micro_not(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->u[0] = ~src->u[0];
-   dst->u[1] = ~src->u[1];
-   dst->u[2] = ~src->u[2];
-   dst->u[3] = ~src->u[3];
-}
-
-static void
-micro_or(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] | src1->u[0];
-   dst->u[1] = src0->u[1] | src1->u[1];
-   dst->u[2] = src0->u[2] | src1->u[2];
-   dst->u[3] = src0->u[3] | src1->u[3];
-}
-
 static void
 micro_pow(
    union tgsi_exec_channel *dst,
@@ -1007,88 +931,6 @@ micro_pow(
 }
 
 static void
-micro_rnd(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = floorf( src->f[0] + 0.5f );
-   dst->f[1] = floorf( src->f[1] + 0.5f );
-   dst->f[2] = floorf( src->f[2] + 0.5f );
-   dst->f[3] = floorf( src->f[3] + 0.5f );
-}
-
-static void
-micro_sgn(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
-   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
-   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
-   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
-}
-
-static void
-micro_shl(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] << src1->i[0];
-   dst->i[1] = src0->i[1] << src1->i[1];
-   dst->i[2] = src0->i[2] << src1->i[2];
-   dst->i[3] = src0->i[3] << src1->i[3];
-}
-
-static void
-micro_ishr(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] >> src1->i[0];
-   dst->i[1] = src0->i[1] >> src1->i[1];
-   dst->i[2] = src0->i[2] >> src1->i[2];
-   dst->i[3] = src0->i[3] >> src1->i[3];
-}
-
-static void
-micro_trunc(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0 )
-{
-   dst->f[0] = (float) (int) src0->f[0];
-   dst->f[1] = (float) (int) src0->f[1];
-   dst->f[2] = (float) (int) src0->f[2];
-   dst->f[3] = (float) (int) src0->f[3];
-}
-
-#if 0
-static void
-micro_ushr(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] >> src1->u[0];
-   dst->u[1] = src0->u[1] >> src1->u[1];
-   dst->u[2] = src0->u[2] >> src1->u[2];
-   dst->u[3] = src0->u[3] >> src1->u[3];
-}
-#endif
-
-static void
-micro_sin(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = sinf( src->f[0] );
-   dst->f[1] = sinf( src->f[1] );
-   dst->f[2] = sinf( src->f[2] );
-   dst->f[3] = sinf( src->f[3] );
-}
-
-static void
 micro_sqrt( union tgsi_exec_channel *dst,
             const union tgsi_exec_channel *src )
 {
@@ -1110,31 +952,6 @@ micro_sub(
    dst->f[3] = src0->f[3] - src1->f[3];
 }
 
-#if 0
-static void
-micro_u2f(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = (float) src->u[0];
-   dst->f[1] = (float) src->u[1];
-   dst->f[2] = (float) src->u[2];
-   dst->f[3] = (float) src->u[3];
-}
-#endif
-
-static void
-micro_xor(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] ^ src1->u[0];
-   dst->u[1] = src0->u[1] ^ src1->u[1];
-   dst->u[2] = src0->u[2] ^ src1->u[2];
-   dst->u[3] = src0->u[3] ^ src1->u[3];
-}
-
 static void
 fetch_src_file_channel(
    const struct tgsi_exec_machine *mach,
@@ -1233,11 +1050,11 @@ fetch_src_file_channel(
 }
 
 static void
-fetch_source(
-   const struct tgsi_exec_machine *mach,
-   union tgsi_exec_channel *chan,
-   const struct tgsi_full_src_register *reg,
-   const uint chan_index )
+fetch_source(const struct tgsi_exec_machine *mach,
+             union tgsi_exec_channel *chan,
+             const struct tgsi_full_src_register *reg,
+             const uint chan_index,
+             enum tgsi_exec_datatype src_datatype)
 {
    union tgsi_exec_channel index;
    uint swizzle;
@@ -1286,10 +1103,10 @@ fetch_source(
          &indir_index );
 
       /* add value of address register to the offset */
-      index.i[0] += (int) indir_index.f[0];
-      index.i[1] += (int) indir_index.f[1];
-      index.i[2] += (int) indir_index.f[2];
-      index.i[3] += (int) indir_index.f[3];
+      index.i[0] += indir_index.i[0];
+      index.i[1] += indir_index.i[1];
+      index.i[2] += indir_index.i[2];
+      index.i[3] += indir_index.i[3];
 
       /* for disabled execution channels, zero-out the index to
        * avoid using a potential garbage value.
@@ -1366,10 +1183,10 @@ fetch_source(
             &index2,
             &indir_index );
 
-         index.i[0] += (int) indir_index.f[0];
-         index.i[1] += (int) indir_index.f[1];
-         index.i[2] += (int) indir_index.f[2];
-         index.i[3] += (int) indir_index.f[3];
+         index.i[0] += indir_index.i[0];
+         index.i[1] += indir_index.i[1];
+         index.i[2] += indir_index.i[2];
+         index.i[3] += indir_index.i[3];
 
          /* for disabled execution channels, zero-out the index to
           * avoid using a potential garbage value.
@@ -1394,32 +1211,30 @@ fetch_source(
       &index,
       chan );
 
-   switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
-   case TGSI_UTIL_SIGN_CLEAR:
-      micro_abs( chan, chan );
-      break;
-
-   case TGSI_UTIL_SIGN_SET:
-      micro_abs( chan, chan );
-      micro_neg( chan, chan );
-      break;
-
-   case TGSI_UTIL_SIGN_TOGGLE:
-      micro_neg( chan, chan );
-      break;
+   if (reg->Register.Absolute) {
+      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
+         micro_abs(chan, chan);
+      } else {
+         micro_iabs(chan, chan);
+      }
+   }
 
-   case TGSI_UTIL_SIGN_KEEP:
-      break;
+   if (reg->Register.Negate) {
+      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
+         micro_neg(chan, chan);
+      } else {
+         micro_ineg(chan, chan);
+      }
    }
 }
 
 static void
-store_dest(
-   struct tgsi_exec_machine *mach,
-   const union tgsi_exec_channel *chan,
-   const struct tgsi_full_dst_register *reg,
-   const struct tgsi_full_instruction *inst,
-   uint chan_index )
+store_dest(struct tgsi_exec_machine *mach,
+           const union tgsi_exec_channel *chan,
+           const struct tgsi_full_dst_register *reg,
+           const struct tgsi_full_instruction *inst,
+           uint chan_index,
+           enum tgsi_exec_datatype dst_datatype)
 {
    uint i;
    union tgsi_exec_channel null;
@@ -1428,9 +1243,9 @@ store_dest(
    int offset = 0;  /* indirection offset */
    int index;
 
-#ifdef DEBUG
-   check_inf_or_nan(chan);
-#endif
+   if (dst_datatype == TGSI_EXEC_DATA_FLOAT) {
+      CHECK_INF_OR_NAN(chan);
+   }
 
    /* There is an extra source register that indirectly subscripts
     * a register file. The direct index now becomes an offset
@@ -1465,7 +1280,7 @@ store_dest(
          &indir_index );
 
       /* save indirection offset */
-      offset = (int) indir_index.f[0];
+      offset = indir_index.i[0];
    }
 
    switch (reg->Register.File) {
@@ -1595,10 +1410,10 @@ store_dest(
 }
 
 #define FETCH(VAL,INDEX,CHAN)\
-    fetch_source (mach, VAL, &inst->Src[INDEX], CHAN)
+    fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_FLOAT)
 
 #define STORE(VAL,INDEX,CHAN)\
-    store_dest (mach, VAL, &inst->Dst[INDEX], inst, CHAN )
+   store_dest(mach, VAL, &inst->Dst[INDEX], inst, CHAN, TGSI_EXEC_DATA_FLOAT)
 
 
 /**
@@ -1694,7 +1509,8 @@ fetch_texel( struct tgsi_sampler *sampler,
              const union tgsi_exec_channel *s,
              const union tgsi_exec_channel *t,
              const union tgsi_exec_channel *p,
-             float lodbias,  /* XXX should be float[4] */
+             const union tgsi_exec_channel *c0,
+             enum tgsi_sampler_control control,
              union tgsi_exec_channel *r,
              union tgsi_exec_channel *g,
              union tgsi_exec_channel *b,
@@ -1703,7 +1519,7 @@ fetch_texel( struct tgsi_sampler *sampler,
    uint j;
    float rgba[NUM_CHANNELS][QUAD_SIZE];
 
-   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba);
+   sampler->get_samples(sampler, s->f, t->f, p->f, c0->f, control, rgba);
 
    for (j = 0; j < 4; j++) {
       r->f[j] = rgba[0][j];
@@ -1714,102 +1530,95 @@ fetch_texel( struct tgsi_sampler *sampler,
 }
 
 
+#define TEX_MODIFIER_NONE           0
+#define TEX_MODIFIER_PROJECTED      1
+#define TEX_MODIFIER_LOD_BIAS       2
+#define TEX_MODIFIER_EXPLICIT_LOD   3
+
+
 static void
 exec_tex(struct tgsi_exec_machine *mach,
          const struct tgsi_full_instruction *inst,
-         boolean biasLod,
-         boolean projected)
+         uint modifier)
 {
    const uint unit = inst->Src[1].Register.Index;
    union tgsi_exec_channel r[4];
+   const union tgsi_exec_channel *lod = &ZeroVec;
+   enum tgsi_sampler_control control;
    uint chan_index;
-   float lodBias;
 
-   /*   debug_printf("Sampler %u unit %u\n", sampler, unit); */
+   if (modifier != TEX_MODIFIER_NONE) {
+      FETCH(&r[3], 0, CHAN_W);
+      if (modifier != TEX_MODIFIER_PROJECTED) {
+         lod = &r[3];
+      }
+   }
+
+   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
+      control = tgsi_sampler_lod_explicit;
+   } else {
+      control = tgsi_sampler_lod_bias;
+   }
 
    switch (inst->Texture.Texture) {
    case TGSI_TEXTURE_1D:
    case TGSI_TEXTURE_SHADOW1D:
-
       FETCH(&r[0], 0, CHAN_X);
 
-      if (projected) {
-         FETCH(&r[1], 0, CHAN_W);
-         micro_div( &r[0], &r[0], &r[1] );
-      }
-
-      if (biasLod) {
-         FETCH(&r[1], 0, CHAN_W);
-         lodBias = r[2].f[0];
+      if (modifier == TEX_MODIFIER_PROJECTED) {
+         micro_div(&r[0], &r[0], &r[3]);
       }
-      else
-         lodBias = 0.0;
 
       fetch_texel(mach->Samplers[unit],
-                  &r[0], &ZeroVec, &ZeroVec, lodBias,  /* S, T, P, BIAS */
-                  &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
+                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
+                  control,
+                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
       break;
 
    case TGSI_TEXTURE_2D:
    case TGSI_TEXTURE_RECT:
    case TGSI_TEXTURE_SHADOW2D:
    case TGSI_TEXTURE_SHADOWRECT:
-
       FETCH(&r[0], 0, CHAN_X);
       FETCH(&r[1], 0, CHAN_Y);
       FETCH(&r[2], 0, CHAN_Z);
 
-      if (projected) {
-         FETCH(&r[3], 0, CHAN_W);
-         micro_div( &r[0], &r[0], &r[3] );
-         micro_div( &r[1], &r[1], &r[3] );
-         micro_div( &r[2], &r[2], &r[3] );
-      }
-
-      if (biasLod) {
-         FETCH(&r[3], 0, CHAN_W);
-         lodBias = r[3].f[0];
+      if (modifier == TEX_MODIFIER_PROJECTED) {
+         micro_div(&r[0], &r[0], &r[3]);
+         micro_div(&r[1], &r[1], &r[3]);
+         micro_div(&r[2], &r[2], &r[3]);
       }
-      else
-         lodBias = 0.0;
 
       fetch_texel(mach->Samplers[unit],
-                  &r[0], &r[1], &r[2], lodBias,  /* inputs */
+                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
+                  control,
                   &r[0], &r[1], &r[2], &r[3]);  /* outputs */
       break;
 
    case TGSI_TEXTURE_3D:
    case TGSI_TEXTURE_CUBE:
-
       FETCH(&r[0], 0, CHAN_X);
       FETCH(&r[1], 0, CHAN_Y);
       FETCH(&r[2], 0, CHAN_Z);
 
-      if (projected) {
-         FETCH(&r[3], 0, CHAN_W);
-         micro_div( &r[0], &r[0], &r[3] );
-         micro_div( &r[1], &r[1], &r[3] );
-         micro_div( &r[2], &r[2], &r[3] );
-      }
-
-      if (biasLod) {
-         FETCH(&r[3], 0, CHAN_W);
-         lodBias = r[3].f[0];
+      if (modifier == TEX_MODIFIER_PROJECTED) {
+         micro_div(&r[0], &r[0], &r[3]);
+         micro_div(&r[1], &r[1], &r[3]);
+         micro_div(&r[2], &r[2], &r[3]);
       }
-      else
-         lodBias = 0.0;
 
       fetch_texel(mach->Samplers[unit],
-                  &r[0], &r[1], &r[2], lodBias,
+                  &r[0], &r[1], &r[2], lod,
+                  control,
                   &r[0], &r[1], &r[2], &r[3]);
       break;
 
    default:
-      assert (0);
+      assert(0);
    }
 
-   FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-      STORE( &r[chan_index], 0, chan_index );
+   FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+      STORE(&r[chan_index], 0, chan_index);
    }
 }
 
@@ -1832,8 +1641,9 @@ exec_txd(struct tgsi_exec_machine *mach,
       FETCH(&r[0], 0, CHAN_X);
 
       fetch_texel(mach->Samplers[unit],
-                  &r[0], &ZeroVec, &ZeroVec, 0.0f,  /* S, T, P, BIAS */
-                  &r[0], &r[1], &r[2], &r[3]);      /* R, G, B, A */
+                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
+                  tgsi_sampler_lod_bias,
+                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
       break;
 
    case TGSI_TEXTURE_2D:
@@ -1846,8 +1656,9 @@ exec_txd(struct tgsi_exec_machine *mach,
       FETCH(&r[2], 0, CHAN_Z);
 
       fetch_texel(mach->Samplers[unit],
-                  &r[0], &r[1], &r[2], 0.0f,    /* inputs */
-                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
+                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
+                  tgsi_sampler_lod_bias,
+                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
       break;
 
    case TGSI_TEXTURE_3D:
@@ -1858,7 +1669,8 @@ exec_txd(struct tgsi_exec_machine *mach,
       FETCH(&r[2], 0, CHAN_Z);
 
       fetch_texel(mach->Samplers[unit],
-                  &r[0], &r[1], &r[2], 0.0f,
+                  &r[0], &r[1], &r[2], &ZeroVec,
+                  tgsi_sampler_lod_bias,
                   &r[0], &r[1], &r[2], &r[3]);
       break;
 
@@ -1955,7 +1767,7 @@ exec_declaration(struct tgsi_exec_machine *mach,
          if (decl->Semantic.Name == TGSI_SEMANTIC_POSITION) {
             assert(decl->Semantic.Index == 0);
             assert(first == last);
-            assert(mask = TGSI_WRITEMASK_XYZW);
+            assert(mask == TGSI_WRITEMASK_XYZW);
 
             mach->Inputs[first] = mach->QuadPos;
          } else if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
@@ -2001,6 +1813,585 @@ exec_declaration(struct tgsi_exec_machine *mach,
    }
 }
 
+typedef void (* micro_op)(union tgsi_exec_channel *dst,
+                          const union tgsi_exec_channel *src);
+
+static void
+exec_scalar_unary(struct tgsi_exec_machine *mach,
+                  const struct tgsi_full_instruction *inst,
+                  micro_op op,
+                  enum tgsi_exec_datatype dst_datatype,
+                  enum tgsi_exec_datatype src_datatype)
+{
+   unsigned int chan;
+   union tgsi_exec_channel src;
+   union tgsi_exec_channel dst;
+
+   fetch_source(mach, &src, &inst->Src[0], CHAN_X, src_datatype);
+   op(&dst, &src);
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
+      }
+   }
+}
+
+static void
+exec_vector_unary(struct tgsi_exec_machine *mach,
+                  const struct tgsi_full_instruction *inst,
+                  micro_op op,
+                  enum tgsi_exec_datatype dst_datatype,
+                  enum tgsi_exec_datatype src_datatype)
+{
+   unsigned int chan;
+   struct tgsi_exec_vector dst;
+
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         union tgsi_exec_channel src;
+
+         fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
+         op(&dst.xyzw[chan], &src);
+      }
+   }
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
+      }
+   }
+}
+
+static void
+exec_vector_binary(struct tgsi_exec_machine *mach,
+                   const struct tgsi_full_instruction *inst,
+                   micro_op op,
+                   enum tgsi_exec_datatype dst_datatype,
+                   enum tgsi_exec_datatype src_datatype)
+{
+   unsigned int chan;
+   struct tgsi_exec_vector dst;
+
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         union tgsi_exec_channel src[2];
+
+         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
+         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
+         op(&dst.xyzw[chan], src);
+      }
+   }
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
+      }
+   }
+}
+
+static void
+exec_vector_trinary(struct tgsi_exec_machine *mach,
+                    const struct tgsi_full_instruction *inst,
+                    micro_op op,
+                    enum tgsi_exec_datatype dst_datatype,
+                    enum tgsi_exec_datatype src_datatype)
+{
+   unsigned int chan;
+   struct tgsi_exec_vector dst;
+
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         union tgsi_exec_channel src[3];
+
+         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
+         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
+         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
+         op(&dst.xyzw[chan], src);
+      }
+   }
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
+      }
+   }
+}
+
+static void
+exec_dp3(struct tgsi_exec_machine *mach,
+         const struct tgsi_full_instruction *inst)
+{
+   unsigned int chan;
+   union tgsi_exec_channel arg[3];
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   micro_mul(&arg[2], &arg[0], &arg[1]);
+
+   for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
+      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
+      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
+      micro_mad(&arg[2], arg);
+   }
+
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+      }
+   }
+}
+
+static void
+exec_dp4(struct tgsi_exec_machine *mach,
+         const struct tgsi_full_instruction *inst)
+{
+   unsigned int chan;
+   union tgsi_exec_channel arg[3];
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   micro_mul(&arg[2], &arg[0], &arg[1]);
+
+   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
+      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
+      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
+      micro_mad(&arg[2], arg);
+   }
+
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+      }
+   }
+}
+
+static void
+exec_dp2a(struct tgsi_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   unsigned int chan;
+   union tgsi_exec_channel arg[3];
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   micro_mul(&arg[2], &arg[0], &arg[1]);
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
+   micro_mad(&arg[0], arg);
+
+   fetch_source(mach, &arg[1], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   micro_add(&arg[0], &arg[0], &arg[1]);
+
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+      }
+   }
+}
+
+static void
+exec_dph(struct tgsi_exec_machine *mach,
+         const struct tgsi_full_instruction *inst)
+{
+   unsigned int chan;
+   union tgsi_exec_channel arg[3];
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   micro_mul(&arg[2], &arg[0], &arg[1]);
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
+   micro_mad(&arg[2], arg);
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
+   micro_mad(&arg[0], arg);
+
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
+   micro_add(&arg[0], &arg[0], &arg[1]);
+
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+      }
+   }
+}
+
+static void
+exec_dp2(struct tgsi_exec_machine *mach,
+         const struct tgsi_full_instruction *inst)
+{
+   unsigned int chan;
+   union tgsi_exec_channel arg[3];
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   micro_mul(&arg[2], &arg[0], &arg[1]);
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
+   micro_mad(&arg[2], arg);
+
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+      }
+   }
+}
+
+static void
+exec_break(struct tgsi_exec_machine *mach)
+{
+   if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
+      /* turn off loop channels for each enabled exec channel */
+      mach->LoopMask &= ~mach->ExecMask;
+      /* Todo: if mach->LoopMask == 0, jump to end of loop */
+      UPDATE_EXEC_MASK(mach);
+   } else {
+      assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
+
+      mach->Switch.mask = 0x0;
+
+      UPDATE_EXEC_MASK(mach);
+   }
+}
+
+static void
+exec_switch(struct tgsi_exec_machine *mach,
+            const struct tgsi_full_instruction *inst)
+{
+   assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
+   assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
+
+   mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
+   fetch_source(mach, &mach->Switch.selector, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
+   mach->Switch.mask = 0x0;
+   mach->Switch.defaultMask = 0x0;
+
+   mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
+   mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
+
+   UPDATE_EXEC_MASK(mach);
+}
+
+static void
+exec_case(struct tgsi_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
+   union tgsi_exec_channel src;
+   uint mask = 0;
+
+   fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
+
+   if (mach->Switch.selector.u[0] == src.u[0]) {
+      mask |= 0x1;
+   }
+   if (mach->Switch.selector.u[1] == src.u[1]) {
+      mask |= 0x2;
+   }
+   if (mach->Switch.selector.u[2] == src.u[2]) {
+      mask |= 0x4;
+   }
+   if (mach->Switch.selector.u[3] == src.u[3]) {
+      mask |= 0x8;
+   }
+
+   mach->Switch.defaultMask |= mask;
+
+   mach->Switch.mask |= mask & prevMask;
+
+   UPDATE_EXEC_MASK(mach);
+}
+
+static void
+exec_default(struct tgsi_exec_machine *mach)
+{
+   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
+
+   mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
+
+   UPDATE_EXEC_MASK(mach);
+}
+
+static void
+exec_endswitch(struct tgsi_exec_machine *mach)
+{
+   mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
+   mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
+
+   UPDATE_EXEC_MASK(mach);
+}
+
+static void
+micro_i2f(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = (float)src->i[0];
+   dst->f[1] = (float)src->i[1];
+   dst->f[2] = (float)src->i[2];
+   dst->f[3] = (float)src->i[3];
+}
+
+static void
+micro_not(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->u[0] = ~src->u[0];
+   dst->u[1] = ~src->u[1];
+   dst->u[2] = ~src->u[2];
+   dst->u[3] = ~src->u[3];
+}
+
+static void
+micro_shl(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] << src[1].u[0];
+   dst->u[1] = src[0].u[1] << src[1].u[1];
+   dst->u[2] = src[0].u[2] << src[1].u[2];
+   dst->u[3] = src[0].u[3] << src[1].u[3];
+}
+
+static void
+micro_and(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] & src[1].u[0];
+   dst->u[1] = src[0].u[1] & src[1].u[1];
+   dst->u[2] = src[0].u[2] & src[1].u[2];
+   dst->u[3] = src[0].u[3] & src[1].u[3];
+}
+
+static void
+micro_or(union tgsi_exec_channel *dst,
+         const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] | src[1].u[0];
+   dst->u[1] = src[0].u[1] | src[1].u[1];
+   dst->u[2] = src[0].u[2] | src[1].u[2];
+   dst->u[3] = src[0].u[3] | src[1].u[3];
+}
+
+static void
+micro_xor(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] ^ src[1].u[0];
+   dst->u[1] = src[0].u[1] ^ src[1].u[1];
+   dst->u[2] = src[0].u[2] ^ src[1].u[2];
+   dst->u[3] = src[0].u[3] ^ src[1].u[3];
+}
+
+static void
+micro_f2i(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->i[0] = (int)src->f[0];
+   dst->i[1] = (int)src->f[1];
+   dst->i[2] = (int)src->f[2];
+   dst->i[3] = (int)src->f[3];
+}
+
+static void
+micro_idiv(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->i[0] = src[0].i[0] / src[1].i[0];
+   dst->i[1] = src[0].i[1] / src[1].i[1];
+   dst->i[2] = src[0].i[2] / src[1].i[2];
+   dst->i[3] = src[0].i[3] / src[1].i[3];
+}
+
+static void
+micro_imax(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->i[0] = src[0].i[0] > src[1].i[0] ? src[0].i[0] : src[1].i[0];
+   dst->i[1] = src[0].i[1] > src[1].i[1] ? src[0].i[1] : src[1].i[1];
+   dst->i[2] = src[0].i[2] > src[1].i[2] ? src[0].i[2] : src[1].i[2];
+   dst->i[3] = src[0].i[3] > src[1].i[3] ? src[0].i[3] : src[1].i[3];
+}
+
+static void
+micro_imin(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->i[0] = src[0].i[0] < src[1].i[0] ? src[0].i[0] : src[1].i[0];
+   dst->i[1] = src[0].i[1] < src[1].i[1] ? src[0].i[1] : src[1].i[1];
+   dst->i[2] = src[0].i[2] < src[1].i[2] ? src[0].i[2] : src[1].i[2];
+   dst->i[3] = src[0].i[3] < src[1].i[3] ? src[0].i[3] : src[1].i[3];
+}
+
+static void
+micro_isge(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->i[0] = src[0].i[0] >= src[1].i[0] ? -1 : 0;
+   dst->i[1] = src[0].i[1] >= src[1].i[1] ? -1 : 0;
+   dst->i[2] = src[0].i[2] >= src[1].i[2] ? -1 : 0;
+   dst->i[3] = src[0].i[3] >= src[1].i[3] ? -1 : 0;
+}
+
+static void
+micro_ishr(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->i[0] = src[0].i[0] >> src[1].i[0];
+   dst->i[1] = src[0].i[1] >> src[1].i[1];
+   dst->i[2] = src[0].i[2] >> src[1].i[2];
+   dst->i[3] = src[0].i[3] >> src[1].i[3];
+}
+
+static void
+micro_islt(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->i[0] = src[0].i[0] < src[1].i[0] ? -1 : 0;
+   dst->i[1] = src[0].i[1] < src[1].i[1] ? -1 : 0;
+   dst->i[2] = src[0].i[2] < src[1].i[2] ? -1 : 0;
+   dst->i[3] = src[0].i[3] < src[1].i[3] ? -1 : 0;
+}
+
+static void
+micro_f2u(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->u[0] = (uint)src->f[0];
+   dst->u[1] = (uint)src->f[1];
+   dst->u[2] = (uint)src->f[2];
+   dst->u[3] = (uint)src->f[3];
+}
+
+static void
+micro_u2f(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   dst->f[0] = (float)src->u[0];
+   dst->f[1] = (float)src->u[1];
+   dst->f[2] = (float)src->u[2];
+   dst->f[3] = (float)src->u[3];
+}
+
+static void
+micro_uadd(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] + src[1].u[0];
+   dst->u[1] = src[0].u[1] + src[1].u[1];
+   dst->u[2] = src[0].u[2] + src[1].u[2];
+   dst->u[3] = src[0].u[3] + src[1].u[3];
+}
+
+static void
+micro_udiv(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] / src[1].u[0];
+   dst->u[1] = src[0].u[1] / src[1].u[1];
+   dst->u[2] = src[0].u[2] / src[1].u[2];
+   dst->u[3] = src[0].u[3] / src[1].u[3];
+}
+
+static void
+micro_umad(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] * src[1].u[0] + src[2].u[0];
+   dst->u[1] = src[0].u[1] * src[1].u[1] + src[2].u[1];
+   dst->u[2] = src[0].u[2] * src[1].u[2] + src[2].u[2];
+   dst->u[3] = src[0].u[3] * src[1].u[3] + src[2].u[3];
+}
+
+static void
+micro_umax(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] > src[1].u[0] ? src[0].u[0] : src[1].u[0];
+   dst->u[1] = src[0].u[1] > src[1].u[1] ? src[0].u[1] : src[1].u[1];
+   dst->u[2] = src[0].u[2] > src[1].u[2] ? src[0].u[2] : src[1].u[2];
+   dst->u[3] = src[0].u[3] > src[1].u[3] ? src[0].u[3] : src[1].u[3];
+}
+
+static void
+micro_umin(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] < src[1].u[0] ? src[0].u[0] : src[1].u[0];
+   dst->u[1] = src[0].u[1] < src[1].u[1] ? src[0].u[1] : src[1].u[1];
+   dst->u[2] = src[0].u[2] < src[1].u[2] ? src[0].u[2] : src[1].u[2];
+   dst->u[3] = src[0].u[3] < src[1].u[3] ? src[0].u[3] : src[1].u[3];
+}
+
+static void
+micro_umod(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] % src[1].u[0];
+   dst->u[1] = src[0].u[1] % src[1].u[1];
+   dst->u[2] = src[0].u[2] % src[1].u[2];
+   dst->u[3] = src[0].u[3] % src[1].u[3];
+}
+
+static void
+micro_umul(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] * src[1].u[0];
+   dst->u[1] = src[0].u[1] * src[1].u[1];
+   dst->u[2] = src[0].u[2] * src[1].u[2];
+   dst->u[3] = src[0].u[3] * src[1].u[3];
+}
+
+static void
+micro_useq(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] == src[1].u[0] ? ~0 : 0;
+   dst->u[1] = src[0].u[1] == src[1].u[1] ? ~0 : 0;
+   dst->u[2] = src[0].u[2] == src[1].u[2] ? ~0 : 0;
+   dst->u[3] = src[0].u[3] == src[1].u[3] ? ~0 : 0;
+}
+
+static void
+micro_usge(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] >= src[1].u[0] ? ~0 : 0;
+   dst->u[1] = src[0].u[1] >= src[1].u[1] ? ~0 : 0;
+   dst->u[2] = src[0].u[2] >= src[1].u[2] ? ~0 : 0;
+   dst->u[3] = src[0].u[3] >= src[1].u[3] ? ~0 : 0;
+}
+
+static void
+micro_ushr(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] >> src[1].u[0];
+   dst->u[1] = src[0].u[1] >> src[1].u[1];
+   dst->u[2] = src[0].u[2] >> src[1].u[2];
+   dst->u[3] = src[0].u[3] >> src[1].u[3];
+}
+
+static void
+micro_uslt(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] < src[1].u[0] ? ~0 : 0;
+   dst->u[1] = src[0].u[1] < src[1].u[1] ? ~0 : 0;
+   dst->u[2] = src[0].u[2] < src[1].u[2] ? ~0 : 0;
+   dst->u[3] = src[0].u[3] < src[1].u[3] ? ~0 : 0;
+}
+
+static void
+micro_usne(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   dst->u[0] = src[0].u[0] != src[1].u[0] ? ~0 : 0;
+   dst->u[1] = src[0].u[1] != src[1].u[1] ? ~0 : 0;
+   dst->u[2] = src[0].u[2] != src[1].u[2] ? ~0 : 0;
+   dst->u[3] = src[0].u[3] != src[1].u[3] ? ~0 : 0;
+}
+
 static void
 exec_instruction(
    struct tgsi_exec_machine *mach,
@@ -2015,23 +2406,11 @@ exec_instruction(
 
    switch (inst->Instruction.Opcode) {
    case TGSI_OPCODE_ARL:
-   case TGSI_OPCODE_FLR:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_flr(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_MOV:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH(&d[chan_index], 0, chan_index);
-      }
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_LIT:
@@ -2068,23 +2447,11 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_RCP:
-   /* TGSI_OPCODE_RECIP */
-      FETCH( &r[0], 0, CHAN_X );
-      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_RSQ:
-   /* TGSI_OPCODE_RECIPSQRT */
-      FETCH( &r[0], 0, CHAN_X );
-      micro_abs( &r[0], &r[0] );
-      micro_sqrt( &r[0], &r[0] );
-      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_EXP:
@@ -2151,54 +2518,11 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_DP3:
-   /* TGSI_OPCODE_DOT3 */
-      FETCH( &r[0], 0, CHAN_X );
-      FETCH( &r[1], 1, CHAN_X );
-      micro_mul( &r[0], &r[0], &r[1] );
-
-      FETCH( &r[1], 0, CHAN_Y );
-      FETCH( &r[2], 1, CHAN_Y );
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
-
-      FETCH( &r[1], 0, CHAN_Z );
-      FETCH( &r[2], 1, CHAN_Z );
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_dp3(mach, inst);
       break;
 
-    case TGSI_OPCODE_DP4:
-    /* TGSI_OPCODE_DOT4 */
-       FETCH(&r[0], 0, CHAN_X);
-       FETCH(&r[1], 1, CHAN_X);
-
-       micro_mul( &r[0], &r[0], &r[1] );
-
-       FETCH(&r[1], 0, CHAN_Y);
-       FETCH(&r[2], 1, CHAN_Y);
-
-       micro_mul( &r[1], &r[1], &r[2] );
-       micro_add( &r[0], &r[0], &r[1] );
-
-       FETCH(&r[1], 0, CHAN_Z);
-       FETCH(&r[2], 1, CHAN_Z);
-
-       micro_mul( &r[1], &r[1], &r[2] );
-       micro_add( &r[0], &r[0], &r[1] );
-
-       FETCH(&r[1], 0, CHAN_W);
-       FETCH(&r[2], 1, CHAN_W);
-
-       micro_mul( &r[1], &r[1], &r[2] );
-       micro_add( &r[0], &r[0], &r[1] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+   case TGSI_OPCODE_DP4:
+      exec_dp4(mach, inst);
       break;
 
    case TGSI_OPCODE_DST:
@@ -2255,41 +2579,15 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_SLT:
-   /* TGSI_OPCODE_SETLT */
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_lt(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_SGE:
-   /* TGSI_OPCODE_SETGE */
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_le(&d[chan_index], &r[1], &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_MAD:
-   /* TGSI_OPCODE_MADD */
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_mul( &r[0], &r[0], &r[1] );
-         FETCH( &r[1], 2, chan_index );
-         micro_add(&d[chan_index], &r[0], &r[1]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_SUB:
@@ -2304,17 +2602,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_LRP:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH(&r[0], 0, chan_index);
-         FETCH(&r[1], 1, chan_index);
-         FETCH(&r[2], 2, chan_index);
-         micro_sub( &r[1], &r[1], &r[2] );
-         micro_mul( &r[0], &r[0], &r[1] );
-         micro_add(&d[chan_index], &r[0], &r[2]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_CND:
@@ -2330,31 +2618,11 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_DP2A:
-      FETCH( &r[0], 0, CHAN_X );
-      FETCH( &r[1], 1, CHAN_X );
-      micro_mul( &r[0], &r[0], &r[1] );
-
-      FETCH( &r[1], 0, CHAN_Y );
-      FETCH( &r[2], 1, CHAN_Y );
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
-
-      FETCH( &r[2], 2, CHAN_X );
-      micro_add( &r[0], &r[0], &r[2] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_dp2a(mach, inst);
       break;
 
    case TGSI_OPCODE_FRC:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_frc(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_CLAMP:
@@ -2370,33 +2638,20 @@ exec_instruction(
       }
       break;
 
+   case TGSI_OPCODE_FLR:
+      exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
+      break;
+
    case TGSI_OPCODE_ROUND:
-   case TGSI_OPCODE_ARR:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_rnd(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_EX2:
-      FETCH(&r[0], 0, CHAN_X);
-
-      micro_exp2( &r[0], &r[0] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_LG2:
-      FETCH( &r[0], 0, CHAN_X );
-      micro_lg2( &r[0], &r[0] );
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_POW:
@@ -2449,15 +2704,9 @@ exec_instruction(
       }
       break;
 
-    case TGSI_OPCODE_ABS:
-       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-          FETCH(&r[0], 0, chan_index);
-          micro_abs(&d[chan_index], &r[0]);
-       }
-       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
-       break;
+   case TGSI_OPCODE_ABS:
+      exec_vector_unary(mach, inst, micro_abs, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
+      break;
 
    case TGSI_OPCODE_RCC:
       FETCH(&r[0], 0, CHAN_X);
@@ -2469,60 +2718,19 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_DPH:
-      FETCH(&r[0], 0, CHAN_X);
-      FETCH(&r[1], 1, CHAN_X);
-
-      micro_mul( &r[0], &r[0], &r[1] );
-
-      FETCH(&r[1], 0, CHAN_Y);
-      FETCH(&r[2], 1, CHAN_Y);
-
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
-
-      FETCH(&r[1], 0, CHAN_Z);
-      FETCH(&r[2], 1, CHAN_Z);
-
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
-
-      FETCH(&r[1], 1, CHAN_W);
-
-      micro_add( &r[0], &r[0], &r[1] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_dph(mach, inst);
       break;
 
    case TGSI_OPCODE_COS:
-      FETCH(&r[0], 0, CHAN_X);
-
-      micro_cos( &r[0], &r[0] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_DDX:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_ddx(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_DDY:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_ddy(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_KILP:
@@ -2599,14 +2807,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_SEQ:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_eq(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_SFL:
@@ -2616,44 +2817,19 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_SGT:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_le(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_SIN:
-      FETCH( &r[0], 0, CHAN_X );
-      micro_sin( &r[0], &r[0] );
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_SLE:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_le(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_SNE:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_eq(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_STR:
@@ -2666,14 +2842,14 @@ exec_instruction(
       /* simple texture lookup */
       /* src[0] = texcoord */
       /* src[1] = sampler unit */
-      exec_tex(mach, inst, FALSE, FALSE);
+      exec_tex(mach, inst, TEX_MODIFIER_NONE);
       break;
 
    case TGSI_OPCODE_TXB:
       /* Texture lookup with lod bias */
       /* src[0] = texcoord (src[0].w = LOD bias) */
       /* src[1] = sampler unit */
-      exec_tex(mach, inst, TRUE, FALSE);
+      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS);
       break;
 
    case TGSI_OPCODE_TXD:
@@ -2689,14 +2865,14 @@ exec_instruction(
       /* Texture lookup with explit LOD */
       /* src[0] = texcoord (src[0].w = LOD) */
       /* src[1] = sampler unit */
-      exec_tex(mach, inst, TRUE, FALSE);
+      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
       break;
 
    case TGSI_OPCODE_TXP:
       /* Texture lookup with projection */
       /* src[0] = texcoord (src[0].w = projection) */
       /* src[1] = sampler unit */
-      exec_tex(mach, inst, FALSE, TRUE);
+      exec_tex(mach, inst, TEX_MODIFIER_PROJECTED);
       break;
 
    case TGSI_OPCODE_UP2H:
@@ -2758,6 +2934,10 @@ exec_instruction(
       assert (0);
       break;
 
+   case TGSI_OPCODE_ARR:
+      exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
+      break;
+
    case TGSI_OPCODE_BRA:
       assert (0);
       break;
@@ -2777,6 +2957,8 @@ exec_instruction(
          mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
          mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
          mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
+         mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
+         mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
          /* note that PC was already incremented above */
          mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
 
@@ -2784,12 +2966,17 @@ exec_instruction(
 
          /* Second, push the Cond, Loop, Cont, Func stacks */
          assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
-         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
          assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
-         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
          assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
-         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
+         assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
+         assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
          assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
+
+         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
+         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
+         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
+         mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
+         mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
          mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
 
          /* Finally, jump to the subroutine */
@@ -2822,6 +3009,12 @@ exec_instruction(
          mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
          mach->ContMask = mach->ContStack[mach->ContStackTop];
 
+         mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
+         mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
+
+         mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
+         mach->BreakType = mach->BreakStack[mach->BreakStackTop];
+
          assert(mach->FuncStackTop > 0);
          mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
 
@@ -2832,14 +3025,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_SSG:
-   /* TGSI_OPCODE_SGN */
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_sgn(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_CMP:
@@ -2946,18 +3132,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_DP2:
-      FETCH( &r[0], 0, CHAN_X );
-      FETCH( &r[1], 1, CHAN_X );
-      micro_mul( &r[0], &r[0], &r[1] );
-
-      FETCH( &r[1], 0, CHAN_Y );
-      FETCH( &r[2], 1, CHAN_Y );
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_dp2(mach, inst);
       break;
 
    case TGSI_OPCODE_IF:
@@ -3023,87 +3198,31 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_CEIL:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_ceil(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_I2F:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_i2f(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
       break;
 
    case TGSI_OPCODE_NOT:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_not(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
       break;
 
    case TGSI_OPCODE_TRUNC:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_trunc(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_SHL:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_shl(&d[chan_index], &r[0], &r[1]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
-      break;
-
-   case TGSI_OPCODE_SHR:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_ishr(&d[chan_index], &r[0], &r[1]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
       break;
 
    case TGSI_OPCODE_AND:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_and(&d[chan_index], &r[0], &r[1]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
       break;
 
    case TGSI_OPCODE_OR:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_or(&d[chan_index], &r[0], &r[1]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
       break;
 
    case TGSI_OPCODE_MOD:
@@ -3111,14 +3230,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_XOR:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_xor(&d[chan_index], &r[0], &r[1]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
       break;
 
    case TGSI_OPCODE_SAD:
@@ -3167,11 +3279,15 @@ exec_instruction(
    case TGSI_OPCODE_BGNLOOP:
       /* push LoopMask and ContMasks */
       assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
-      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
       assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
-      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
       assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+      assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
+
+      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
+      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
       mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
+      mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
+      mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
       break;
 
    case TGSI_OPCODE_ENDFOR:
@@ -3218,6 +3334,8 @@ exec_instruction(
          --mach->LoopLabelStackTop;
          assert(mach->LoopCounterStackTop > 0);
          --mach->LoopCounterStackTop;
+
+         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
       }
       UPDATE_EXEC_MASK(mach);
       break;
@@ -3241,15 +3359,14 @@ exec_instruction(
          mach->ContMask = mach->ContStack[--mach->ContStackTop];
          assert(mach->LoopLabelStackTop > 0);
          --mach->LoopLabelStackTop;
+
+         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
       }
       UPDATE_EXEC_MASK(mach);
       break;
 
    case TGSI_OPCODE_BRK:
-      /* turn off loop channels for each enabled exec channel */
-      mach->LoopMask &= ~mach->ExecMask;
-      /* Todo: if mach->LoopMask == 0, jump to end of loop */
-      UPDATE_EXEC_MASK(mach);
+      exec_break(mach);
       break;
 
    case TGSI_OPCODE_CONT:
@@ -3280,6 +3397,12 @@ exec_instruction(
       mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
       mach->ContMask = mach->ContStack[mach->ContStackTop];
 
+      mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
+      mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
+
+      mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
+      mach->BreakType = mach->BreakStack[mach->BreakStackTop];
+
       assert(mach->FuncStackTop > 0);
       mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
 
@@ -3310,11 +3433,116 @@ exec_instruction(
       UPDATE_EXEC_MASK(mach);
       break;
 
+   case TGSI_OPCODE_F2I:
+      exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
+      break;
+
+   case TGSI_OPCODE_IDIV:
+      exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
+      break;
+
+   case TGSI_OPCODE_IMAX:
+      exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
+      break;
+
+   case TGSI_OPCODE_IMIN:
+      exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
+      break;
+
+   case TGSI_OPCODE_INEG:
+      exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
+      break;
+
+   case TGSI_OPCODE_ISGE:
+      exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
+      break;
+
+   case TGSI_OPCODE_ISHR:
+      exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
+      break;
+
+   case TGSI_OPCODE_ISLT:
+      exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
+      break;
+
+   case TGSI_OPCODE_F2U:
+      exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
+      break;
+
+   case TGSI_OPCODE_U2F:
+      exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_UADD:
+      exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_UDIV:
+      exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_UMAD:
+      exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_UMAX:
+      exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_UMIN:
+      exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_UMOD:
+      exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_UMUL:
+      exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_USEQ:
+      exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_USGE:
+      exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_USHR:
+      exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_USLT:
+      exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_USNE:
+      exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_SWITCH:
+      exec_switch(mach, inst);
+      break;
+
+   case TGSI_OPCODE_CASE:
+      exec_case(mach, inst);
+      break;
+
+   case TGSI_OPCODE_DEFAULT:
+      exec_default(mach);
+      break;
+
+   case TGSI_OPCODE_ENDSWITCH:
+      exec_endswitch(mach);
+      break;
+
    default:
       assert( 0 );
    }
 }
 
+
 #define DEBUG_EXECUTION 0
 
 
@@ -3334,9 +3562,13 @@ tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
    mach->FuncMask = 0xf;
    mach->ExecMask = 0xf;
 
+   mach->Switch.mask = 0xf;
+
    assert(mach->CondStackTop == 0);
    assert(mach->LoopStackTop == 0);
    assert(mach->ContStackTop == 0);
+   assert(mach->SwitchStackTop == 0);
+   assert(mach->BreakStackTop == 0);
    assert(mach->CallStackTop == 0);
 
    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
@@ -3393,11 +3625,11 @@ tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
                   if (j > 0) {
                      debug_printf("           ");
                   }
-                  debug_printf("(%6f, %6f, %6f, %6f)\n",
-                               temps[i].xyzw[0].f[j],
-                               temps[i].xyzw[1].f[j],
-                               temps[i].xyzw[2].f[j],
-                               temps[i].xyzw[3].f[j]);
+                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
+                               temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
+                               temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
+                               temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
+                               temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
                }
             }
          }
@@ -3411,11 +3643,11 @@ tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
                   if (j > 0) {
                      debug_printf("           ");
                   }
-                  debug_printf("{%6f, %6f, %6f, %6f}\n",
-                               outputs[i].xyzw[0].f[j],
-                               outputs[i].xyzw[1].f[j],
-                               outputs[i].xyzw[2].f[j],
-                               outputs[i].xyzw[3].f[j]);
+                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
+                               outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
+                               outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
+                               outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
+                               outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
                }
             }
          }
@@ -3437,6 +3669,8 @@ tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
    assert(mach->CondStackTop == 0);
    assert(mach->LoopStackTop == 0);
    assert(mach->ContStackTop == 0);
+   assert(mach->SwitchStackTop == 0);
+   assert(mach->BreakStackTop == 0);
    assert(mach->CallStackTop == 0);
 
    return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index afaf5c39c4..59e3b445cc 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -2,6 +2,7 @@
  * 
  * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
+ * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
  * 
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
@@ -35,11 +36,13 @@
 extern "C" {
 #endif
 
+
 #define MAX_LABELS (4 * 1024)  /**< basically, max instructions */
 
 #define NUM_CHANNELS 4  /* R,G,B,A */
 #define QUAD_SIZE    4  /* 4 pixel/quad */
 
+
 /**
   * Registers may be treated as float, signed int or unsigned int.
   */
@@ -69,6 +72,11 @@ struct tgsi_interp_coef
    float dady[NUM_CHANNELS];
 };
 
+enum tgsi_sampler_control {
+   tgsi_sampler_lod_bias,
+   tgsi_sampler_lod_explicit
+};
+
 /**
  * Information for sampling textures, which must be implemented
  * by code outside the TGSI executor.
@@ -80,7 +88,8 @@ struct tgsi_sampler
                        const float s[QUAD_SIZE],
                        const float t[QUAD_SIZE],
                        const float p[QUAD_SIZE],
-                       float lodbias,
+                       const float c0[QUAD_SIZE],
+                       enum tgsi_sampler_control control,
                        float rgba[NUM_CHANNELS][QUAD_SIZE]);
 };
 
@@ -179,6 +188,7 @@ struct tgsi_exec_labels
 
 #define TGSI_EXEC_MAX_COND_NESTING  32
 #define TGSI_EXEC_MAX_LOOP_NESTING  32
+#define TGSI_EXEC_MAX_SWITCH_NESTING 32
 #define TGSI_EXEC_MAX_CALL_NESTING  32
 
 /* The maximum number of input attributes per vertex. For 2D
@@ -206,9 +216,29 @@ struct tgsi_call_record
    uint CondStackTop;
    uint LoopStackTop;
    uint ContStackTop;
+   int SwitchStackTop;
+   int BreakStackTop;
    uint ReturnAddr;
 };
 
+
+/* Switch-case block state. */
+struct tgsi_switch_record {
+   uint mask;                          /**< execution mask */
+   union tgsi_exec_channel selector;   /**< a value case statements are compared to */
+   uint defaultMask;                   /**< non-execute mask for default case */
+};
+
+
+enum tgsi_break_type {
+   TGSI_EXEC_BREAK_INSIDE_LOOP,
+   TGSI_EXEC_BREAK_INSIDE_SWITCH
+};
+
+
+#define TGSI_EXEC_MAX_BREAK_STACK (TGSI_EXEC_MAX_LOOP_NESTING + TGSI_EXEC_MAX_SWITCH_NESTING)
+
+
 /**
  * Run-time virtual machine state for executing TGSI shader.
  */
@@ -251,6 +281,12 @@ struct tgsi_exec_machine
    uint FuncMask;  /**< For function calls */
    uint ExecMask;  /**< = CondMask & LoopMask */
 
+   /* Current switch-case state. */
+   struct tgsi_switch_record Switch;
+
+   /* Current break type. */
+   enum tgsi_break_type BreakType;
+
    /** Condition mask stack (for nested conditionals) */
    uint CondStack[TGSI_EXEC_MAX_COND_NESTING];
    int CondStackTop;
@@ -271,6 +307,13 @@ struct tgsi_exec_machine
    uint ContStack[TGSI_EXEC_MAX_LOOP_NESTING];
    int ContStackTop;
 
+   /** Switch case stack */
+   struct tgsi_switch_record SwitchStack[TGSI_EXEC_MAX_SWITCH_NESTING];
+   int SwitchStackTop;
+
+   enum tgsi_break_type BreakStack[TGSI_EXEC_MAX_BREAK_STACK];
+   int BreakStackTop;
+
    /** Function execution mask stack (for executing subroutine code) */
    uint FuncStack[TGSI_EXEC_MAX_CALL_NESTING];
    int FuncStackTop;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c
index be375cabb8..de0e09cdba 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -119,7 +119,7 @@ static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] =
    { 1, 1, 0, 0, 0, 0, "NOT", TGSI_OPCODE_NOT },
    { 1, 1, 0, 0, 0, 0, "TRUNC", TGSI_OPCODE_TRUNC },
    { 1, 2, 0, 0, 0, 0, "SHL", TGSI_OPCODE_SHL },
-   { 1, 2, 0, 0, 0, 0, "SHR", TGSI_OPCODE_SHR },
+   { 0, 0, 0, 0, 0, 0, "", 88 },      /* removed */
    { 1, 2, 0, 0, 0, 0, "AND", TGSI_OPCODE_AND },
    { 1, 2, 0, 0, 0, 0, "OR", TGSI_OPCODE_OR },
    { 1, 2, 0, 0, 0, 0, "MOD", TGSI_OPCODE_MOD },
@@ -149,7 +149,33 @@ static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] =
    { 0, 1, 0, 0, 0, 0, "BREAKC", TGSI_OPCODE_BREAKC },
    { 0, 1, 0, 0, 0, 0, "KIL", TGSI_OPCODE_KIL },
    { 0, 0, 0, 0, 0, 0, "END", TGSI_OPCODE_END },
-   { 0, 0, 0, 0, 0, 0, "", 118 }      /* removed */
+   { 0, 0, 0, 0, 0, 0, "", 118 },     /* removed */
+   { 1, 1, 0, 0, 0, 0, "F2I", TGSI_OPCODE_F2I },
+   { 1, 2, 0, 0, 0, 0, "IDIV", TGSI_OPCODE_IDIV },
+   { 1, 2, 0, 0, 0, 0, "IMAX", TGSI_OPCODE_IMAX },
+   { 1, 2, 0, 0, 0, 0, "IMIN", TGSI_OPCODE_IMIN },
+   { 1, 1, 0, 0, 0, 0, "INEG", TGSI_OPCODE_INEG },
+   { 1, 2, 0, 0, 0, 0, "ISGE", TGSI_OPCODE_ISGE },
+   { 1, 2, 0, 0, 0, 0, "ISHR", TGSI_OPCODE_ISHR },
+   { 1, 2, 0, 0, 0, 0, "ISLT", TGSI_OPCODE_ISLT },
+   { 1, 1, 0, 0, 0, 0, "F2U", TGSI_OPCODE_F2U },
+   { 1, 1, 0, 0, 0, 0, "U2F", TGSI_OPCODE_U2F },
+   { 1, 2, 0, 0, 0, 0, "UADD", TGSI_OPCODE_UADD },
+   { 1, 2, 0, 0, 0, 0, "UDIV", TGSI_OPCODE_UDIV },
+   { 1, 3, 0, 0, 0, 0, "UMAD", TGSI_OPCODE_UMAD },
+   { 1, 2, 0, 0, 0, 0, "UMAX", TGSI_OPCODE_UMAX },
+   { 1, 2, 0, 0, 0, 0, "UMIN", TGSI_OPCODE_UMIN },
+   { 1, 2, 0, 0, 0, 0, "UMOD", TGSI_OPCODE_UMOD },
+   { 1, 2, 0, 0, 0, 0, "UMUL", TGSI_OPCODE_UMUL },
+   { 1, 2, 0, 0, 0, 0, "USEQ", TGSI_OPCODE_USEQ },
+   { 1, 2, 0, 0, 0, 0, "USGE", TGSI_OPCODE_USGE },
+   { 1, 2, 0, 0, 0, 0, "USHR", TGSI_OPCODE_USHR },
+   { 1, 2, 0, 0, 0, 0, "USLT", TGSI_OPCODE_USLT },
+   { 1, 2, 0, 0, 0, 0, "USNE", TGSI_OPCODE_USNE },
+   { 0, 1, 0, 0, 0, 0, "SWITCH", TGSI_OPCODE_SWITCH },
+   { 0, 1, 0, 0, 0, 0, "CASE", TGSI_OPCODE_CASE },
+   { 0, 0, 0, 0, 0, 0, "DEFAULT", TGSI_OPCODE_DEFAULT },
+   { 0, 0, 0, 0, 0, 0, "ENDSWITCH", TGSI_OPCODE_ENDSWITCH }
 };
 
 const struct tgsi_opcode_info *
diff --git a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
index b34263da48..e4af15c156 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
@@ -124,7 +124,6 @@ OP11(I2F)
 OP11(NOT)
 OP11(TRUNC)
 OP12(SHL)
-OP12(SHR)
 OP12(AND)
 OP12(OR)
 OP12(MOD)
@@ -146,6 +145,28 @@ OP01(IFC)
 OP01(BREAKC)
 OP01(KIL)
 OP00(END)
+OP11(F2I)
+OP12(IDIV)
+OP12(IMAX)
+OP12(IMIN)
+OP11(INEG)
+OP12(ISGE)
+OP12(ISHR)
+OP12(ISLT)
+OP11(F2U)
+OP11(U2F)
+OP12(UADD)
+OP12(UDIV)
+OP13(UMAD)
+OP12(UMAX)
+OP12(UMIN)
+OP12(UMOD)
+OP12(UMUL)
+OP12(USEQ)
+OP12(USGE)
+OP12(USHR)
+OP12(USLT)
+OP12(USNE)
 
 
 #undef OP00
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
index fa65ecb997..8c7062d850 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -119,17 +119,29 @@ tgsi_parse_token(
    case TGSI_TOKEN_TYPE_IMMEDIATE:
    {
       struct tgsi_full_immediate *imm = &ctx->FullToken.FullImmediate;
+      uint imm_count;
 
       memset(imm, 0, sizeof *imm);
       copy_token(&imm->Immediate, &token);
 
+      imm_count = imm->Immediate.NrTokens - 1;
+
       switch (imm->Immediate.DataType) {
       case TGSI_IMM_FLOAT32:
-         {
-            uint imm_count = imm->Immediate.NrTokens - 1;
-            for (i = 0; i < imm_count; i++) {
-               next_token(ctx, &imm->u[i]);
-            }
+         for (i = 0; i < imm_count; i++) {
+            next_token(ctx, &imm->u[i].Float);
+         }
+         break;
+
+      case TGSI_IMM_UINT32:
+         for (i = 0; i < imm_count; i++) {
+            next_token(ctx, &imm->u[i].Uint);
+         }
+         break;
+
+      case TGSI_IMM_INT32:
+         for (i = 0; i < imm_count; i++) {
+            next_token(ctx, &imm->u[i].Int);
          }
          break;
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.c b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
index 16b8ec6051..7f1c8e5dd6 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sanity.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
@@ -195,7 +195,7 @@ is_any_register_declared(
    struct cso_hash_iter iter =
       cso_hash_first_node(ctx->regs_decl);
 
-   while (cso_hash_iter_is_null(iter)) {
+   while (!cso_hash_iter_is_null(iter)) {
       scan_register *reg = (scan_register *)cso_hash_iter_data(iter);
       if (reg->file == file)
          return TRUE;
@@ -247,7 +247,7 @@ check_register_usage(
    boolean indirect_access )
 {
    if (!check_file_name( ctx, reg->file )) {
-      free(reg);
+      FREE(reg);
       return FALSE;
    }
 
@@ -261,21 +261,23 @@ check_register_usage(
       if (!is_ind_register_used(ctx, reg))
          cso_hash_insert(ctx->regs_ind_used, reg->file, reg);
       else
-         free(reg);
+         FREE(reg);
    }
    else {
       if (!is_register_declared( ctx, reg )) {
-         if (reg->dimensions == 2)
+         if (reg->dimensions == 2) {
             report_error( ctx, "%s[%d][%d]: Undeclared %s register", file_names[reg->file],
                           reg->indices[0], reg->indices[1], name );
-         else
+         }
+         else {
             report_error( ctx, "%s[%d]: Undeclared %s register", file_names[reg->file],
                           reg->indices[0], name );
          }
+      }
       if (!is_register_used( ctx, reg ))
          cso_hash_insert(ctx->regs_used, scan_register_key(reg), reg);
       else
-         free(reg);
+         FREE(reg);
    }
    return TRUE;
 }
@@ -333,15 +335,15 @@ iter_instruction(
          fill_scan_register1d(ind_reg,
                               inst->Src[i].Indirect.File,
                               inst->Src[i].Indirect.Index);
+         if (!(reg->file == TGSI_FILE_ADDRESS || reg->file == TGSI_FILE_LOOP) ||
+             reg->indices[0] != 0) {
+            report_warning(ctx, "Indirect register neither ADDR[0] nor LOOP[0]");
+         }
          check_register_usage(
             ctx,
             reg,
             "indirect",
             FALSE );
-         if (!(reg->file == TGSI_FILE_ADDRESS || reg->file == TGSI_FILE_LOOP) ||
-             reg->indices[0] != 0) {
-            report_warning(ctx, "Indirect register neither ADDR[0] nor LOOP[0]");
-         }
       }
    }
 
@@ -445,7 +447,9 @@ iter_immediate(
 
    /* Check data type validity.
     */
-   if (imm->Immediate.DataType != TGSI_IMM_FLOAT32) {
+   if (imm->Immediate.DataType != TGSI_IMM_FLOAT32 &&
+       imm->Immediate.DataType != TGSI_IMM_UINT32 &&
+       imm->Immediate.DataType != TGSI_IMM_INT32) {
       report_error( ctx, "(%u): Invalid immediate data type", imm->Immediate.DataType );
       return TRUE;
    }
@@ -486,7 +490,7 @@ epilog(
       struct cso_hash_iter iter =
          cso_hash_first_node(ctx->regs_decl);
 
-      while (cso_hash_iter_is_null(iter)) {
+      while (!cso_hash_iter_is_null(iter)) {
          scan_register *reg = (scan_register *)cso_hash_iter_data(iter);
          if (!is_register_used(ctx, reg) && !is_ind_register_used(ctx, reg)) {
             report_warning( ctx, "%s[%u]: Register never used",
@@ -511,7 +515,8 @@ regs_hash_destroy(struct cso_hash *hash)
    while (!cso_hash_iter_is_null(iter)) {
       scan_register *reg = (scan_register *)cso_hash_iter_data(iter);
       iter = cso_hash_erase(hash, iter);
-      free(reg);
+      assert(reg->file < TGSI_FILE_COUNT);
+      FREE(reg);
    }
    cso_hash_delete(hash);
 }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index d63c75dafb..a85cc4659e 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -2,6 +2,7 @@
  * 
  * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
+ * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
  * 
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
@@ -1418,13 +1419,13 @@ fetch_texel( struct tgsi_sampler **sampler,
                 sampler, *sampler,
                 store );
 
-   debug_printf("lodbias %f\n", store[12]);
-
    for (j = 0; j < 4; j++)
-      debug_printf("sample %d texcoord %f %f\n", 
+      debug_printf("sample %d texcoord %f %f %f lodbias %f\n",
                    j, 
                    store[0+j],
-                   store[4+j]);
+                   store[4+j],
+                   store[8 + j],
+                   store[12 + j]);
 #endif
 
    {
@@ -1433,7 +1434,8 @@ fetch_texel( struct tgsi_sampler **sampler,
                               &store[0],  /* s */
                               &store[4],  /* t */
                               &store[8],  /* r */
-                              store[12],  /* lodbias */
+                              &store[12], /* lodbias */
+                              tgsi_sampler_lod_bias,
                               rgba);      /* results */
 
       memcpy( store, rgba, 16 * sizeof(float));
@@ -2144,40 +2146,50 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_XPD:
+      /* Note: we do all stores after all operands have been fetched
+       * to avoid src/dst register aliasing issues for an instruction
+       * such as:  XPD TEMP[2].xyz, TEMP[0], TEMP[2];
+       */
       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
-         FETCH( func, *inst, 1, 1, CHAN_Z );
-         FETCH( func, *inst, 3, 0, CHAN_Z );
+         FETCH( func, *inst, 1, 1, CHAN_Z ); /* xmm[1] = src[1].z */
+         FETCH( func, *inst, 3, 0, CHAN_Z ); /* xmm[3] = src[0].z */
       }
       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-         FETCH( func, *inst, 0, 0, CHAN_Y );
-         FETCH( func, *inst, 4, 1, CHAN_Y );
+         FETCH( func, *inst, 0, 0, CHAN_Y ); /* xmm[0] = src[0].y */
+         FETCH( func, *inst, 4, 1, CHAN_Y ); /* xmm[4] = src[1].y */
       }
       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
-         emit_MOV( func, 2, 0 );
-         emit_mul( func, 2, 1 );
-         emit_MOV( func, 5, 3 );
-         emit_mul( func, 5, 4 );
-         emit_sub( func, 2, 5 );
-         STORE( func, *inst, 2, 0, CHAN_X );
+         emit_MOV( func, 7, 0 );  /* xmm[7] = xmm[0] */
+         emit_mul( func, 7, 1 );  /* xmm[7] = xmm[2] * xmm[1] */
+         emit_MOV( func, 5, 3 );  /* xmm[5] = xmm[3] */
+         emit_mul( func, 5, 4 );  /* xmm[5] = xmm[5] * xmm[4] */
+         emit_sub( func, 7, 5 );  /* xmm[7] = xmm[2] - xmm[5] */
+         /* store xmm[7] in dst.x below */
       }
       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-         FETCH( func, *inst, 2, 1, CHAN_X );
-         FETCH( func, *inst, 5, 0, CHAN_X );
+         FETCH( func, *inst, 2, 1, CHAN_X ); /* xmm[2] = src[1].x */
+         FETCH( func, *inst, 5, 0, CHAN_X ); /* xmm[5] = src[0].x */
       }
       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
-         emit_mul( func, 3, 2 );
-         emit_mul( func, 1, 5 );
-         emit_sub( func, 3, 1 );
-         STORE( func, *inst, 3, 0, CHAN_Y );
+         emit_mul( func, 3, 2 );  /* xmm[3] = xmm[3] * xmm[2] */
+         emit_mul( func, 1, 5 );  /* xmm[1] = xmm[1] * xmm[5] */
+         emit_sub( func, 3, 1 );  /* xmm[3] = xmm[3] - xmm[1] */
+         /* store xmm[3] in dst.y below */
       }
       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
-         emit_mul( func, 5, 4 );
-         emit_mul( func, 0, 2 );
-         emit_sub( func, 5, 0 );
-         STORE( func, *inst, 5, 0, CHAN_Z );
+         emit_mul( func, 5, 4 );  /* xmm[5] = xmm[5] * xmm[4] */
+         emit_mul( func, 0, 2 );  /* xmm[0] = xmm[0] * xmm[2] */
+         emit_sub( func, 5, 0 );  /* xmm[5] = xmm[5] - xmm[0] */
+         STORE( func, *inst, 5, 0, CHAN_Z ); /* dst.z = xmm[5] */
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
+         STORE( func, *inst, 7, 0, CHAN_X ); /* dst.x = xmm[7] */
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
+         STORE( func, *inst, 3, 0, CHAN_Y ); /* dst.y = xmm[3] */
       }
       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
 	 emit_tempf(
@@ -2506,7 +2518,7 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_TXL:
-      emit_tex( func, inst, TRUE, FALSE );
+      return 0;
       break;
 
    case TGSI_OPCODE_TXP:
@@ -2578,7 +2590,7 @@ emit_instruction(
       return 0;
       break;
 
-   case TGSI_OPCODE_SHR:
+   case TGSI_OPCODE_ISHR:
       return 0;
       break;
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index 6a0af664dd..e64e2b731d 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -101,8 +101,13 @@ struct ureg_program
    unsigned nr_outputs;
 
    struct {
-      float v[4];
+      union {
+         float f[4];
+         unsigned u[4];
+         int i[4];
+      } value;
       unsigned nr;
+      unsigned type;
    } immediate[UREG_MAX_IMMEDIATE];
    unsigned nr_immediates;
 
@@ -486,22 +491,22 @@ struct ureg_src ureg_DECL_sampler( struct ureg_program *ureg,
 }
 
 
-
-
-static int match_or_expand_immediate( const float *v,
-                                      unsigned nr,
-                                      float *v2,
-                                      unsigned *nr2,
-                                      unsigned *swizzle )
+static int
+match_or_expand_immediate( const unsigned *v,
+                           unsigned nr,
+                           unsigned *v2,
+                           unsigned *pnr2,
+                           unsigned *swizzle )
 {
+   unsigned nr2 = *pnr2;
    unsigned i, j;
-   
+
    *swizzle = 0;
 
    for (i = 0; i < nr; i++) {
       boolean found = FALSE;
 
-      for (j = 0; j < *nr2 && !found; j++) {
+      for (j = 0; j < nr2 && !found; j++) {
          if (v[i] == v2[j]) {
             *swizzle |= j << (i * 2);
             found = TRUE;
@@ -509,24 +514,28 @@ static int match_or_expand_immediate( const float *v,
       }
 
       if (!found) {
-         if (*nr2 >= 4) 
+         if (nr2 >= 4) {
             return FALSE;
+         }
 
-         v2[*nr2] = v[i];
-         *swizzle |= *nr2 << (i * 2);
-         (*nr2)++;
+         v2[nr2] = v[i];
+         *swizzle |= nr2 << (i * 2);
+         nr2++;
       }
    }
 
+   /* Actually expand immediate only when fully succeeded.
+    */
+   *pnr2 = nr2;
    return TRUE;
 }
 
 
-
-
-struct ureg_src ureg_DECL_immediate( struct ureg_program *ureg, 
-                                     const float *v,
-                                     unsigned nr )
+static struct ureg_src
+decl_immediate( struct ureg_program *ureg,
+                const unsigned *v,
+                unsigned nr,
+                unsigned type )
 {
    unsigned i, j;
    unsigned swizzle;
@@ -536,38 +545,82 @@ struct ureg_src ureg_DECL_immediate( struct ureg_program *ureg,
     */
 
    for (i = 0; i < ureg->nr_immediates; i++) {
-      if (match_or_expand_immediate( v, 
-                                     nr,
-                                     ureg->immediate[i].v,
-                                     &ureg->immediate[i].nr, 
-                                     &swizzle ))
+      if (ureg->immediate[i].type != type) {
+         continue;
+      }
+      if (match_or_expand_immediate(v,
+                                    nr,
+                                    ureg->immediate[i].value.u,
+                                    &ureg->immediate[i].nr,
+                                    &swizzle)) {
          goto out;
+      }
    }
 
    if (ureg->nr_immediates < UREG_MAX_IMMEDIATE) {
       i = ureg->nr_immediates++;
-      if (match_or_expand_immediate( v,
-                                     nr,
-                                     ureg->immediate[i].v,
-                                     &ureg->immediate[i].nr, 
-                                     &swizzle ))
+      ureg->immediate[i].type = type;
+      if (match_or_expand_immediate(v,
+                                    nr,
+                                    ureg->immediate[i].value.u,
+                                    &ureg->immediate[i].nr,
+                                    &swizzle)) {
          goto out;
+      }
    }
 
-   set_bad( ureg );
+   set_bad(ureg);
 
 out:
    /* Make sure that all referenced elements are from this immediate.
     * Has the effect of making size-one immediates into scalars.
     */
-   for (j = nr; j < 4; j++)
+   for (j = nr; j < 4; j++) {
       swizzle |= (swizzle & 0x3) << (j * 2);
+   }
+
+   return ureg_swizzle(ureg_src_register(TGSI_FILE_IMMEDIATE, i),
+                       (swizzle >> 0) & 0x3,
+                       (swizzle >> 2) & 0x3,
+                       (swizzle >> 4) & 0x3,
+                       (swizzle >> 6) & 0x3);
+}
+
+
+struct ureg_src
+ureg_DECL_immediate( struct ureg_program *ureg,
+                     const float *v,
+                     unsigned nr )
+{
+   union {
+      float f[4];
+      unsigned u[4];
+   } fu;
+   unsigned int i;
+
+   for (i = 0; i < nr; i++) {
+      fu.f[i] = v[i];
+   }
+
+   return decl_immediate(ureg, fu.u, nr, TGSI_IMM_FLOAT32);
+}
+
 
-   return ureg_swizzle( ureg_src_register( TGSI_FILE_IMMEDIATE, i ),
-                        (swizzle >> 0) & 0x3,
-                        (swizzle >> 2) & 0x3,
-                        (swizzle >> 4) & 0x3,
-                        (swizzle >> 6) & 0x3);
+struct ureg_src
+ureg_DECL_immediate_uint( struct ureg_program *ureg,
+                          const unsigned *v,
+                          unsigned nr )
+{
+   return decl_immediate(ureg, v, nr, TGSI_IMM_UINT32);
+}
+
+
+struct ureg_src
+ureg_DECL_immediate_int( struct ureg_program *ureg,
+                         const int *v,
+                         unsigned nr )
+{
+   return decl_immediate(ureg, (const unsigned *)v, nr, TGSI_IMM_INT32);
 }
 
 
@@ -955,21 +1008,23 @@ static void emit_decl_range( struct ureg_program *ureg,
    out[1].decl_range.Last = first + count - 1;
 }
 
-static void emit_immediate( struct ureg_program *ureg,
-                            const float *v )
+static void
+emit_immediate( struct ureg_program *ureg,
+                const unsigned *v,
+                unsigned type )
 {
    union tgsi_any_token *out = get_tokens( ureg, DOMAIN_DECL, 5 );
 
    out[0].value = 0;
    out[0].imm.Type = TGSI_TOKEN_TYPE_IMMEDIATE;
    out[0].imm.NrTokens = 5;
-   out[0].imm.DataType = TGSI_IMM_FLOAT32;
+   out[0].imm.DataType = type;
    out[0].imm.Padding = 0;
 
-   out[1].imm_data.Float = v[0];
-   out[2].imm_data.Float = v[1];
-   out[3].imm_data.Float = v[2];
-   out[4].imm_data.Float = v[3];
+   out[1].imm_data.Uint = v[0];
+   out[2].imm_data.Uint = v[1];
+   out[3].imm_data.Uint = v[2];
+   out[4].imm_data.Uint = v[3];
 }
 
 
@@ -1055,7 +1110,8 @@ static void emit_decls( struct ureg_program *ureg )
 
    for (i = 0; i < ureg->nr_immediates; i++) {
       emit_immediate( ureg,
-                      ureg->immediate[i].v );
+                      ureg->immediate[i].value.u,
+                      ureg->immediate[i].type );
    }
 }
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
index 7e3e7bcf1d..6f11273320 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
@@ -148,6 +148,16 @@ ureg_DECL_immediate( struct ureg_program *,
                      unsigned nr );
 
 struct ureg_src
+ureg_DECL_immediate_uint( struct ureg_program *,
+                          const unsigned *v,
+                          unsigned nr );
+
+struct ureg_src
+ureg_DECL_immediate_int( struct ureg_program *,
+                         const int *v,
+                         unsigned nr );
+
+struct ureg_src
 ureg_DECL_constant( struct ureg_program *,
                     unsigned index );
 
@@ -221,6 +231,90 @@ ureg_imm1f( struct ureg_program *ureg,
    return ureg_DECL_immediate( ureg, v, 1 );
 }
 
+static INLINE struct ureg_src
+ureg_imm4u( struct ureg_program *ureg,
+            unsigned a, unsigned b,
+            unsigned c, unsigned d)
+{
+   unsigned v[4];
+   v[0] = a;
+   v[1] = b;
+   v[2] = c;
+   v[3] = d;
+   return ureg_DECL_immediate_uint( ureg, v, 4 );
+}
+
+static INLINE struct ureg_src
+ureg_imm3u( struct ureg_program *ureg,
+            unsigned a, unsigned b,
+            unsigned c)
+{
+   unsigned v[3];
+   v[0] = a;
+   v[1] = b;
+   v[2] = c;
+   return ureg_DECL_immediate_uint( ureg, v, 3 );
+}
+
+static INLINE struct ureg_src
+ureg_imm2u( struct ureg_program *ureg,
+            unsigned a, unsigned b)
+{
+   unsigned v[2];
+   v[0] = a;
+   v[1] = b;
+   return ureg_DECL_immediate_uint( ureg, v, 2 );
+}
+
+static INLINE struct ureg_src
+ureg_imm1u( struct ureg_program *ureg,
+            unsigned a)
+{
+   return ureg_DECL_immediate_uint( ureg, &a, 1 );
+}
+
+static INLINE struct ureg_src
+ureg_imm4i( struct ureg_program *ureg,
+            int a, int b,
+            int c, int d)
+{
+   int v[4];
+   v[0] = a;
+   v[1] = b;
+   v[2] = c;
+   v[3] = d;
+   return ureg_DECL_immediate_int( ureg, v, 4 );
+}
+
+static INLINE struct ureg_src
+ureg_imm3i( struct ureg_program *ureg,
+            int a, int b,
+            int c)
+{
+   int v[3];
+   v[0] = a;
+   v[1] = b;
+   v[2] = c;
+   return ureg_DECL_immediate_int( ureg, v, 3 );
+}
+
+static INLINE struct ureg_src
+ureg_imm2i( struct ureg_program *ureg,
+            int a, int b)
+{
+   int v[2];
+   v[0] = a;
+   v[1] = b;
+   return ureg_DECL_immediate_int( ureg, v, 2 );
+}
+
+static INLINE struct ureg_src
+ureg_imm1i( struct ureg_program *ureg,
+            int a)
+{
+   return ureg_DECL_immediate_int( ureg, &a, 1 );
+}
+
 /***********************************************************************
  * Functions for patching up labels
  */
diff --git a/src/gallium/auxiliary/util/u_bitmask.c b/src/gallium/auxiliary/util/u_bitmask.c
index 77587c07ec..23c93a3ebc 100644
--- a/src/gallium/auxiliary/util/u_bitmask.c
+++ b/src/gallium/auxiliary/util/u_bitmask.c
@@ -97,12 +97,12 @@ util_bitmask_resize(struct util_bitmask *bm,
    if(!minimum_size)
       return FALSE;
       
-   if(bm->size > minimum_size)
+   if(bm->size >= minimum_size)
       return TRUE;
 
    assert(bm->size % UTIL_BITMASK_BITS_PER_WORD == 0);
    new_size = bm->size;
-   while(!(new_size > minimum_size)) {
+   while(new_size < minimum_size) {
       new_size *= 2;
       /* Check integer overflow */
       if(new_size < bm->size)
@@ -136,7 +136,7 @@ util_bitmask_filled_set(struct util_bitmask *bm,
                         unsigned index)
 {
    assert(bm->filled <= bm->size);
-   assert(index <= bm->size);
+   assert(index < bm->size);
    
    if(index == bm->filled) {
       ++bm->filled;
@@ -149,7 +149,7 @@ util_bitmask_filled_unset(struct util_bitmask *bm,
                           unsigned index)
 {
    assert(bm->filled <= bm->size);
-   assert(index <= bm->size);
+   assert(index < bm->size);
    
    if(index < bm->filled)
       bm->filled = index;
@@ -182,7 +182,7 @@ util_bitmask_add(struct util_bitmask *bm)
       mask = 1;
    }
 found:
-   
+
    /* grow the bitmask if necessary */
    if(!util_bitmask_resize(bm, bm->filled))
       return UTIL_BITMASK_INVALID_INDEX;
@@ -198,9 +198,9 @@ unsigned
 util_bitmask_set(struct util_bitmask *bm, 
                  unsigned index)
 {
-   unsigned word = index / UTIL_BITMASK_BITS_PER_WORD;
-   unsigned bit  = index % UTIL_BITMASK_BITS_PER_WORD;
-   util_bitmask_word mask = 1 << bit;
+   unsigned word;
+   unsigned bit;
+   util_bitmask_word mask;
    
    assert(bm);
    
@@ -208,6 +208,10 @@ util_bitmask_set(struct util_bitmask *bm,
    if(!util_bitmask_resize(bm, index))
       return UTIL_BITMASK_INVALID_INDEX;
 
+   word = index / UTIL_BITMASK_BITS_PER_WORD;
+   bit  = index % UTIL_BITMASK_BITS_PER_WORD;
+   mask = 1 << bit;
+
    bm->words[word] |= mask;
 
    util_bitmask_filled_set(bm, index);
@@ -220,15 +224,19 @@ void
 util_bitmask_clear(struct util_bitmask *bm, 
                    unsigned index)
 {
-   unsigned word = index / UTIL_BITMASK_BITS_PER_WORD;
-   unsigned bit  = index % UTIL_BITMASK_BITS_PER_WORD;
-   util_bitmask_word mask = 1 << bit;
+   unsigned word;
+   unsigned bit;
+   util_bitmask_word mask;
    
    assert(bm);
    
    if(index >= bm->size)
       return;
 
+   word = index / UTIL_BITMASK_BITS_PER_WORD;
+   bit  = index % UTIL_BITMASK_BITS_PER_WORD;
+   mask = 1 << bit;
+
    bm->words[word] &= ~mask;
    
    util_bitmask_filled_unset(bm, index);
@@ -250,7 +258,7 @@ util_bitmask_get(struct util_bitmask *bm,
       return TRUE;
    }
 
-   if(index > bm->size)
+   if(index >= bm->size)
       return FALSE;
 
    if(bm->words[word] & mask) {
diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c
index 1f794d39a1..46b4706b76 100644
--- a/src/gallium/auxiliary/util/u_blitter.c
+++ b/src/gallium/auxiliary/util/u_blitter.c
@@ -48,6 +48,8 @@
 #include "util/u_simple_shaders.h"
 #include "util/u_texture.h"
 
+#define INVALID_PTR ((void*)~0)
+
 struct blitter_context_priv
 {
    struct blitter_context blitter;
@@ -110,6 +112,11 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe)
    ctx->pipe = pipe;
 
    /* init state objects for them to be considered invalid */
+   ctx->blitter.saved_blend_state = INVALID_PTR;
+   ctx->blitter.saved_dsa_state = INVALID_PTR;
+   ctx->blitter.saved_rs_state = INVALID_PTR;
+   ctx->blitter.saved_fs = INVALID_PTR;
+   ctx->blitter.saved_vs = INVALID_PTR;
    ctx->blitter.saved_fb_state.nr_cbufs = ~0;
    ctx->blitter.saved_num_textures = ~0;
    ctx->blitter.saved_num_sampler_states = ~0;
@@ -156,6 +163,7 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe)
    rs_state.cull_mode = PIPE_WINDING_NONE;
    rs_state.bypass_vs_clip_and_viewport = 1;
    rs_state.gl_rasterization_rules = 1;
+   rs_state.flatshade = 1;
    ctx->rs_state = pipe->create_rasterizer_state(pipe, &rs_state);
 
    /* fragment shaders are created on-demand */
@@ -234,11 +242,11 @@ void util_blitter_destroy(struct blitter_context *blitter)
 static void blitter_check_saved_CSOs(struct blitter_context_priv *ctx)
 {
    /* make sure these CSOs have been saved */
-   assert(ctx->blitter.saved_blend_state &&
-          ctx->blitter.saved_dsa_state &&
-          ctx->blitter.saved_rs_state &&
-          ctx->blitter.saved_fs &&
-          ctx->blitter.saved_vs);
+   assert(ctx->blitter.saved_blend_state != INVALID_PTR &&
+          ctx->blitter.saved_dsa_state != INVALID_PTR &&
+          ctx->blitter.saved_rs_state != INVALID_PTR &&
+          ctx->blitter.saved_fs != INVALID_PTR &&
+          ctx->blitter.saved_vs != INVALID_PTR);
 }
 
 static void blitter_restore_CSOs(struct blitter_context_priv *ctx)
@@ -252,11 +260,11 @@ static void blitter_restore_CSOs(struct blitter_context_priv *ctx)
    pipe->bind_fs_state(pipe, ctx->blitter.saved_fs);
    pipe->bind_vs_state(pipe, ctx->blitter.saved_vs);
 
-   ctx->blitter.saved_blend_state = 0;
-   ctx->blitter.saved_dsa_state = 0;
-   ctx->blitter.saved_rs_state = 0;
-   ctx->blitter.saved_fs = 0;
-   ctx->blitter.saved_vs = 0;
+   ctx->blitter.saved_blend_state = INVALID_PTR;
+   ctx->blitter.saved_dsa_state = INVALID_PTR;
+   ctx->blitter.saved_rs_state = INVALID_PTR;
+   ctx->blitter.saved_fs = INVALID_PTR;
+   ctx->blitter.saved_vs = INVALID_PTR;
 
    /* restore the state objects which are required to be saved before copy/fill
     */
@@ -560,45 +568,29 @@ void util_blitter_clear(struct blitter_context *blitter,
    blitter_restore_CSOs(ctx);
 }
 
-void util_blitter_copy(struct blitter_context *blitter,
-                       struct pipe_surface *dst,
-                       unsigned dstx, unsigned dsty,
-                       struct pipe_surface *src,
-                       unsigned srcx, unsigned srcy,
-                       unsigned width, unsigned height,
-                       boolean ignore_stencil)
+static boolean
+is_overlap(unsigned sx1, unsigned sx2, unsigned sy1, unsigned sy2,
+           unsigned dx1, unsigned dx2, unsigned dy1, unsigned dy2)
+{
+    if (sx1 >= dx2 || sx2 <= dx1 || sy1 >= dy2 || sy2 <= dy1) {
+        return FALSE;
+    } else {
+        return TRUE;
+    }
+}
+
+static void util_blitter_do_copy(struct blitter_context *blitter,
+				 struct pipe_surface *dst,
+				 unsigned dstx, unsigned dsty,
+				 struct pipe_surface *src,
+				 unsigned srcx, unsigned srcy,
+				 unsigned width, unsigned height,
+				 boolean is_depth)
 {
    struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
    struct pipe_context *pipe = ctx->pipe;
-   struct pipe_screen *screen = pipe->screen;
    struct pipe_framebuffer_state fb_state;
-   boolean is_stencil, is_depth;
-   unsigned dst_tex_usage;
-
-   /* give up if textures are not set */
-   assert(dst->texture && src->texture);
-   if (!dst->texture || !src->texture)
-      return;
 
-   is_depth = util_format_get_component_bits(src->format, UTIL_FORMAT_COLORSPACE_ZS, 0) != 0;
-   is_stencil = util_format_get_component_bits(src->format, UTIL_FORMAT_COLORSPACE_ZS, 1) != 0;
-   dst_tex_usage = is_depth || is_stencil ? PIPE_TEXTURE_USAGE_DEPTH_STENCIL :
-                                            PIPE_TEXTURE_USAGE_RENDER_TARGET;
-
-   /* check if we can sample from and render to the surfaces */
-   /* (assuming copying a stencil buffer is not possible) */
-   if ((!ignore_stencil && is_stencil) ||
-       !screen->is_format_supported(screen, dst->format, dst->texture->target,
-                                    dst_tex_usage, 0) ||
-       !screen->is_format_supported(screen, src->format, src->texture->target,
-                                    PIPE_TEXTURE_USAGE_SAMPLER, 0)) {
-      util_surface_copy(pipe, FALSE, dst, dstx, dsty, src, srcx, srcy,
-                        width, height);
-      return;
-   }
-
-   /* check whether the states are properly saved */
-   blitter_check_saved_CSOs(ctx);
    assert(blitter->saved_fb_state.nr_cbufs != ~0);
    assert(blitter->saved_num_textures != ~0);
    assert(blitter->saved_num_sampler_states != ~0);
@@ -656,6 +648,108 @@ void util_blitter_copy(struct blitter_context *blitter,
 
    blitter_set_rectangle(ctx, dstx, dsty, dstx+width, dsty+height, 0);
    blitter_draw_quad(ctx);
+
+}
+
+static void util_blitter_overlap_copy(struct blitter_context *blitter,
+				      struct pipe_surface *dst,
+				      unsigned dstx, unsigned dsty,
+				      struct pipe_surface *src,
+				      unsigned srcx, unsigned srcy,
+				      unsigned width, unsigned height)
+{
+   struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
+   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_screen *screen = pipe->screen;
+
+   struct pipe_texture texTemp;
+   struct pipe_texture *texture;
+   struct pipe_surface *tex_surf;
+
+   /* check whether the states are properly saved */
+   blitter_check_saved_CSOs(ctx);
+
+   memset(&texTemp, 0, sizeof(texTemp));
+   texTemp.target = PIPE_TEXTURE_2D;
+   texTemp.format = dst->texture->format; /* XXX verify supported by driver! */
+   texTemp.last_level = 0;
+   texTemp.width0 = width;
+   texTemp.height0 = height;
+   texTemp.depth0 = 1;
+
+   texture = screen->texture_create(screen, &texTemp);
+   if (!texture)
+      return;
+
+   tex_surf = screen->get_tex_surface(screen, texture, 0, 0, 0,
+				      PIPE_BUFFER_USAGE_GPU_READ | 
+				      PIPE_BUFFER_USAGE_GPU_WRITE);
+
+   /* blit from the src to the temp */
+   util_blitter_do_copy(blitter, tex_surf, 0, 0,
+			src, srcx, srcy,
+			width, height,
+			FALSE);
+   util_blitter_do_copy(blitter, dst, dstx, dsty,
+			tex_surf, 0, 0,
+			width, height,
+			FALSE);
+   pipe_surface_reference(&tex_surf, NULL);
+   pipe_texture_reference(&texture, NULL);
+   blitter_restore_CSOs(ctx);
+}
+
+void util_blitter_copy(struct blitter_context *blitter,
+                       struct pipe_surface *dst,
+                       unsigned dstx, unsigned dsty,
+                       struct pipe_surface *src,
+                       unsigned srcx, unsigned srcy,
+                       unsigned width, unsigned height,
+                       boolean ignore_stencil)
+{
+   struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
+   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_screen *screen = pipe->screen;
+   boolean is_stencil, is_depth;
+   unsigned dst_tex_usage;
+
+   /* give up if textures are not set */
+   assert(dst->texture && src->texture);
+   if (!dst->texture || !src->texture)
+      return;
+
+   if (dst->texture == src->texture) {
+      if (is_overlap(srcx, srcx + width, srcy, srcy + height,
+		             dstx, dstx + width, dsty, dsty + height)) {
+         util_blitter_overlap_copy(blitter, dst, dstx, dsty, src, srcx, srcy,
+                                   width, height);
+         return;
+      }
+   }
+		   
+   is_depth = util_format_get_component_bits(src->format, UTIL_FORMAT_COLORSPACE_ZS, 0) != 0;
+   is_stencil = util_format_get_component_bits(src->format, UTIL_FORMAT_COLORSPACE_ZS, 1) != 0;
+   dst_tex_usage = is_depth || is_stencil ? PIPE_TEXTURE_USAGE_DEPTH_STENCIL :
+                                            PIPE_TEXTURE_USAGE_RENDER_TARGET;
+
+   /* check if we can sample from and render to the surfaces */
+   /* (assuming copying a stencil buffer is not possible) */
+   if ((!ignore_stencil && is_stencil) ||
+       !screen->is_format_supported(screen, dst->format, dst->texture->target,
+                                    dst_tex_usage, 0) ||
+       !screen->is_format_supported(screen, src->format, src->texture->target,
+                                    PIPE_TEXTURE_USAGE_SAMPLER, 0)) {
+      util_surface_copy(pipe, FALSE, dst, dstx, dsty, src, srcx, srcy,
+                        width, height);
+      return;
+   }
+
+   /* check whether the states are properly saved */
+   blitter_check_saved_CSOs(ctx);
+   util_blitter_do_copy(blitter,
+			dst, dstx, dsty,
+			src, srcx, srcy,
+			width, height, is_depth);
    blitter_restore_CSOs(ctx);
 }
 
diff --git a/src/gallium/auxiliary/util/u_debug_dump.c b/src/gallium/auxiliary/util/u_debug_dump.c
index 09866880ae..61624d05c0 100644
--- a/src/gallium/auxiliary/util/u_debug_dump.c
+++ b/src/gallium/auxiliary/util/u_debug_dump.c
@@ -255,15 +255,13 @@ DEFINE_DEBUG_DUMP_CONTINUOUS(tex_mipfilter)
 static const char *
 debug_dump_tex_filter_names[] = {
    "PIPE_TEX_FILTER_NEAREST",
-   "PIPE_TEX_FILTER_LINEAR",
-   "PIPE_TEX_FILTER_ANISO"
+   "PIPE_TEX_FILTER_LINEAR"
 };
 
 static const char *
 debug_dump_tex_filter_short_names[] = {
    "nearest",
-   "linear",
-   "aniso"
+   "linear"
 };
 
 DEFINE_DEBUG_DUMP_CONTINUOUS(tex_filter)
diff --git a/src/gallium/auxiliary/util/u_debug_memory.c b/src/gallium/auxiliary/util/u_debug_memory.c
index 7623cb9398..d6484f4ad5 100644
--- a/src/gallium/auxiliary/util/u_debug_memory.c
+++ b/src/gallium/auxiliary/util/u_debug_memory.c
@@ -297,9 +297,9 @@ debug_memory_end(unsigned long start_no)
 
       if((start_no <= hdr->no && hdr->no < last_no) ||
 	 (last_no < start_no && (hdr->no < last_no || start_no <= hdr->no))) {
-	 debug_printf("%s:%u:%s: %u bytes at %p not freed\n",
+	 debug_printf("%s:%u:%s: %lu bytes at %p not freed\n",
 		      hdr->file, hdr->line, hdr->function,
-		      hdr->size, ptr);
+		      (unsigned long) hdr->size, ptr);
 #if DEBUG_MEMORY_STACK
 	 debug_backtrace_dump(hdr->backtrace, DEBUG_MEMORY_STACK);
 #endif
@@ -315,8 +315,8 @@ debug_memory_end(unsigned long start_no)
    }
 
    if(total_size) {
-      debug_printf("Total of %u KB of system memory apparently leaked\n",
-		   (total_size + 1023)/1024);
+      debug_printf("Total of %lu KB of system memory apparently leaked\n",
+		   (unsigned long) (total_size + 1023)/1024);
    }
    else {
       debug_printf("No memory leaks detected.\n");
diff --git a/src/gallium/auxiliary/util/u_format.csv b/src/gallium/auxiliary/util/u_format.csv
index 866b18ff16..9f16b42944 100644
--- a/src/gallium/auxiliary/util/u_format.csv
+++ b/src/gallium/auxiliary/util/u_format.csv
@@ -76,9 +76,9 @@ PIPE_FORMAT_R8G8_SNORM            , array , 1, 1, sn8 , sn8 ,     ,     , xy01,
 PIPE_FORMAT_R8G8B8_SNORM          , array , 1, 1, sn8 , sn8 , sn8 ,     , xyz1, rgb
 PIPE_FORMAT_R8G8B8A8_SNORM        , array , 1, 1, sn8 , sn8 , sn8 , sn8 , xyzw, rgb
 PIPE_FORMAT_R8G8B8X8_SNORM        , array , 1, 1, sn8 , sn8 , sn8 , sn8 , xyz1, rgb
-PIPE_FORMAT_B6G5R5_SNORM          , arith , 1, 1, sn5 , sn5 , sn6 ,     , zyx1, rgb
-PIPE_FORMAT_A8B8G8R8_SNORM        , arith , 1, 1, sn8 , sn8 , sn8 , sn8 , zyxw, rgb
-PIPE_FORMAT_X8B8G8R8_SNORM        , arith , 1, 1, sn8 , sn8 , sn8 , sn8 , zyx1, rgb
+PIPE_FORMAT_B6G5R5_SNORM          , arith , 1, 1, sn5 , sn5 , sn6 ,     , xyz1, rgb
+PIPE_FORMAT_A8B8G8R8_SNORM        , array , 1, 1, sn8 , sn8 , sn8 , sn8 , wzyx, rgb
+PIPE_FORMAT_X8B8G8R8_SNORM        , array , 1, 1, sn8 , sn8 , sn8 , sn8 , wzy1, rgb
 PIPE_FORMAT_R8_SSCALED            , array , 1, 1, s8  ,     ,     ,     , x001, rgb
 PIPE_FORMAT_R8G8_SSCALED          , array , 1, 1, s8  , s8  ,     ,     , xy01, rgb
 PIPE_FORMAT_R8G8B8_SSCALED        , array , 1, 1, s8  , s8  , s8  ,     , xyz1, rgb
@@ -90,14 +90,14 @@ PIPE_FORMAT_R32G32B32_FIXED       , array , 1, 1, h32 , h32 , h32 ,     , xyz1,
 PIPE_FORMAT_R32G32B32A32_FIXED    , array , 1, 1, h32 , h32 , h32 , h32 , xyzw, rgb
 PIPE_FORMAT_L8_SRGB               , arith , 1, 1, u8  ,     ,     ,     , xxx1, srgb 
 PIPE_FORMAT_A8L8_SRGB             , arith , 1, 1, u8  , u8  ,     ,     , xxxy, srgb 
-PIPE_FORMAT_R8G8B8_SRGB           , arith , 1, 1, u8  , u8  , u8  ,     , xyz1, srgb 
-PIPE_FORMAT_R8G8B8A8_SRGB         , arith , 1, 1, u8  , u8  , u8  , u8  , xyzw, srgb 
-PIPE_FORMAT_R8G8B8X8_SRGB         , arith , 1, 1, u8  , u8  , u8  , u8  , xyz1, srgb 
-PIPE_FORMAT_A8R8G8B8_SRGB         , arith , 1, 1, u8  , u8  , u8  , u8  , wxyz, srgb 
-PIPE_FORMAT_X8R8G8B8_SRGB         , arith , 1, 1, u8  , u8  , u8  , u8  , 1xyz, srgb 
-PIPE_FORMAT_B8G8R8A8_SRGB         , arith , 1, 1, u8  , u8  , u8  , u8  , zyxw, srgb 
-PIPE_FORMAT_B8G8R8X8_SRGB         , arith , 1, 1, u8  , u8  , u8  , u8  , zyx1, srgb 
-PIPE_FORMAT_X8UB8UG8SR8S_NORM     , arith , 1, 1, sn8 , sn8 , un8 , x8  , 1zyx, rgb
+PIPE_FORMAT_R8G8B8_SRGB           , array , 1, 1, u8  , u8  , u8  ,     , xyz1, srgb 
+PIPE_FORMAT_R8G8B8A8_SRGB         , array , 1, 1, u8  , u8  , u8  , u8  , xyzw, srgb 
+PIPE_FORMAT_R8G8B8X8_SRGB         , array , 1, 1, u8  , u8  , u8  , u8  , xyz1, srgb 
+PIPE_FORMAT_A8R8G8B8_SRGB         , array , 1, 1, u8  , u8  , u8  , u8  , yzwx, srgb 
+PIPE_FORMAT_X8R8G8B8_SRGB         , array , 1, 1, u8  , u8  , u8  , u8  , yzw1, srgb 
+PIPE_FORMAT_B8G8R8A8_SRGB         , array , 1, 1, u8  , u8  , u8  , u8  , zyxw, srgb 
+PIPE_FORMAT_B8G8R8X8_SRGB         , array , 1, 1, u8  , u8  , u8  , u8  , zyx1, srgb 
+PIPE_FORMAT_X8UB8UG8SR8S_NORM     , array , 1, 1, sn8 , sn8 , un8 , x8  , wzy1, rgb
 PIPE_FORMAT_B6UG5SR5S_NORM        , arith , 1, 1, sn5 , sn5 , un6 ,     , xyz1, rgb
 PIPE_FORMAT_DXT1_RGB              , dxt   , 4, 4, x64 ,     ,     ,     , xyz1, rgb
 PIPE_FORMAT_DXT1_RGBA             , dxt   , 4, 4, x64 ,     ,     ,     , xyzw, rgb
diff --git a/src/gallium/auxiliary/util/u_network.c b/src/gallium/auxiliary/util/u_network.c
index 9eb8f309cd..87ee0e4768 100644
--- a/src/gallium/auxiliary/util/u_network.c
+++ b/src/gallium/auxiliary/util/u_network.c
@@ -117,7 +117,7 @@ u_socket_connect(const char *hostname, uint16_t port)
    if (!host)
       return -1;
 
-   memcpy((char *)&sa.sin_addr,host->h_addr,host->h_length);
+   memcpy((char *)&sa.sin_addr,host->h_addr_list[0],host->h_length);
    sa.sin_family= host->h_addrtype;
    sa.sin_port = htons(port);
 
diff --git a/src/gallium/auxiliary/util/u_rect.c b/src/gallium/auxiliary/util/u_rect.c
index 298fbacecb..8479161c74 100644
--- a/src/gallium/auxiliary/util/u_rect.c
+++ b/src/gallium/auxiliary/util/u_rect.c
@@ -41,7 +41,7 @@
 /**
  * Copy 2D rect from one place to another.
  * Position and sizes are in pixels.
- * src_pitch may be negative to do vertical flip of pixels from source.
+ * src_stride may be negative to do vertical flip of pixels from source.
  */
 void
 util_copy_rect(ubyte * dst,
@@ -54,7 +54,7 @@ util_copy_rect(ubyte * dst,
                const ubyte * src,
                int src_stride,
                unsigned src_x, 
-               int src_y)
+               unsigned src_y)
 {
    unsigned i;
    int src_stride_pos = src_stride < 0 ? -src_stride : src_stride;
@@ -65,10 +65,6 @@ util_copy_rect(ubyte * dst,
    assert(blocksize > 0);
    assert(blockwidth > 0);
    assert(blockheight > 0);
-   assert(src_x >= 0);
-   assert(src_y >= 0);
-   assert(dst_x >= 0);
-   assert(dst_y >= 0);
 
    dst_x /= blockwidth;
    dst_y /= blockheight;
@@ -113,8 +109,6 @@ util_fill_rect(ubyte * dst,
    assert(blocksize > 0);
    assert(blockwidth > 0);
    assert(blockheight > 0);
-   assert(dst_x >= 0);
-   assert(dst_y >= 0);
 
    dst_x /= blockwidth;
    dst_y /= blockheight;
diff --git a/src/gallium/auxiliary/util/u_rect.h b/src/gallium/auxiliary/util/u_rect.h
index 5e444ffae2..b44d821904 100644
--- a/src/gallium/auxiliary/util/u_rect.h
+++ b/src/gallium/auxiliary/util/u_rect.h
@@ -45,7 +45,7 @@ extern void
 util_copy_rect(ubyte * dst, enum pipe_format format,
                unsigned dst_stride, unsigned dst_x, unsigned dst_y,
                unsigned width, unsigned height, const ubyte * src,
-               int src_stride, unsigned src_x, int src_y);
+               int src_stride, unsigned src_x, unsigned src_y);
 
 extern void
 util_fill_rect(ubyte * dst, enum pipe_format format,
diff --git a/src/gallium/auxiliary/util/u_simple_shaders.c b/src/gallium/auxiliary/util/u_simple_shaders.c
index 8172ead020..b751e29ab6 100644
--- a/src/gallium/auxiliary/util/u_simple_shaders.c
+++ b/src/gallium/auxiliary/util/u_simple_shaders.c
@@ -44,13 +44,15 @@
 
 /**
  * Make simple vertex pass-through shader.
+ * \param num_attribs  number of attributes to pass through
+ * \param semantic_names  array of semantic names for each attribute
+ * \param semantic_indexes  array of semantic indexes for each attribute
  */
 void *
 util_make_vertex_passthrough_shader(struct pipe_context *pipe,
                                     uint num_attribs,
                                     const uint *semantic_names,
                                     const uint *semantic_indexes)
-                                    
 {
    struct ureg_program *ureg;
    uint i;
@@ -78,8 +80,6 @@ util_make_vertex_passthrough_shader(struct pipe_context *pipe,
 }
 
 
-
-
 /**
  * Make simple fragment texture shader:
  *  IMM {0,0,0,1}                         // (if writemask != 0xf)
@@ -125,6 +125,12 @@ util_make_fragment_tex_shader_writemask(struct pipe_context *pipe,
    return ureg_create_shader_and_destroy( ureg, pipe );
 }
 
+
+/**
+ * Make a simple fragment shader that sets the output color to a color
+ * taken from a texture.
+ * \param tex_target  one of PIPE_TEXTURE_x
+ */
 void *
 util_make_fragment_tex_shader(struct pipe_context *pipe, unsigned tex_target )
 {
@@ -133,6 +139,7 @@ util_make_fragment_tex_shader(struct pipe_context *pipe, unsigned tex_target )
                                                    TGSI_WRITEMASK_XYZW );
 }
 
+
 /**
  * Make a simple fragment texture shader which reads an X component from
  * a texture and writes it as depth.
@@ -177,6 +184,7 @@ util_make_fragment_tex_shader_writedepth(struct pipe_context *pipe,
    return ureg_create_shader_and_destroy( ureg, pipe );
 }
 
+
 /**
  * Make simple fragment color pass-through shader.
  */
@@ -186,15 +194,19 @@ util_make_fragment_passthrough_shader(struct pipe_context *pipe)
    return util_make_fragment_clonecolor_shader(pipe, 1);
 }
 
+
+/**
+ * Make a fragment shader that copies the input color to N output colors.
+ */
 void *
 util_make_fragment_clonecolor_shader(struct pipe_context *pipe, int num_cbufs)
 {
    struct ureg_program *ureg;
    struct ureg_src src;
-   struct ureg_dst dst[8];
+   struct ureg_dst dst[PIPE_MAX_COLOR_BUFS];
    int i;
 
-   assert(num_cbufs <= 8);
+   assert(num_cbufs <= PIPE_MAX_COLOR_BUFS);
 
    ureg = ureg_create( TGSI_PROCESSOR_FRAGMENT );
    if (ureg == NULL)
diff --git a/src/gallium/auxiliary/util/u_tile.c b/src/gallium/auxiliary/util/u_tile.c
index 5b8dd1abb9..1ba82bb21f 100644
--- a/src/gallium/auxiliary/util/u_tile.c
+++ b/src/gallium/auxiliary/util/u_tile.c
@@ -1155,27 +1155,6 @@ ycbcr_get_tile_rgba(const ushort *src,
 }
 
 
-static void
-fake_get_tile_rgba(const ushort *src,
-                   unsigned w, unsigned h,
-                   float *p,
-                   unsigned dst_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         pRow[0] =
-         pRow[1] =
-         pRow[2] =
-         pRow[3] = (i ^ j) & 1 ? 1.0f : 0.0f;
-      }
-      p += dst_stride;
-   }
-}
-
-
 void
 pipe_tile_raw_to_rgba(enum pipe_format format,
                       void *src,
@@ -1258,8 +1237,10 @@ pipe_tile_raw_to_rgba(enum pipe_format format,
       ycbcr_get_tile_rgba((ushort *) src, w, h, dst, dst_stride, TRUE);
       break;
    default:
-      debug_printf("%s: unsupported format %s\n", __FUNCTION__, pf_name(format));
-      fake_get_tile_rgba(src, w, h, dst, dst_stride);
+      util_format_read_4f(format,
+                          dst, dst_stride * sizeof(float),
+                          src, util_format_get_stride(format, w),
+                          0, 0, w, h);
    }
 }