199 files changed, 27746 insertions, 4173 deletions
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 1f6860da11..d5f5c7bbba 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -327,7 +327,7 @@ struct cell_command_sampler
    opcode_t opcode;         /**< CELL_CMD_STATE_SAMPLER */
    uint unit;
    struct pipe_sampler_state state;
-   uint32_t pad_[1];
+   uint32_t pad_[2];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 7cd5656a7e..312621fd53 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -1767,7 +1767,7 @@ emit_instruction(struct codegen *gen,
    case TGSI_OPCODE_MAD:
       return emit_MAD(gen, inst);
    case TGSI_OPCODE_LERP:
-      return emit_LERP(gen, inst);
+      return emit_LRP(gen, inst);
    case TGSI_OPCODE_DP3:
       return emit_DP3(gen, inst);
    case TGSI_OPCODE_DP4:
@@ -1810,9 +1810,9 @@ emit_instruction(struct codegen *gen,
       return emit_function_call(gen, inst, "spu_sin", 1, TRUE);
    case TGSI_OPCODE_POW:
       return emit_function_call(gen, inst, "spu_pow", 2, TRUE);
-   case TGSI_OPCODE_EXPBASE2:
+   case TGSI_OPCODE_EX2:
       return emit_function_call(gen, inst, "spu_exp2", 1, TRUE);
-   case TGSI_OPCODE_LOGBASE2:
+   case TGSI_OPCODE_LG2:
       return emit_function_call(gen, inst, "spu_log2", 1, TRUE);
    case TGSI_OPCODE_TEX:
       /* fall-through for now */
@@ -1834,9 +1834,9 @@ emit_instruction(struct codegen *gen,
    case TGSI_OPCODE_ENDIF:
       return emit_ENDIF(gen, inst);
 
-   case TGSI_OPCODE_BGNLOOP2:
+   case TGSI_OPCODE_BGNLOOP:
       return emit_BGNLOOP(gen, inst);
-   case TGSI_OPCODE_ENDLOOP2:
+   case TGSI_OPCODE_ENDLOOP:
       return emit_ENDLOOP(gen, inst);
    case TGSI_OPCODE_BRK:
       return emit_BRK(gen, inst);
diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c
index bd48ce7005..d185c6b849 100644
--- a/src/gallium/drivers/cell/ppu/cell_screen.c
+++ b/src/gallium/drivers/cell/ppu/cell_screen.c
@@ -41,7 +41,7 @@
 static const char *
 cell_get_vendor(struct pipe_screen *screen)
 {
-   return "Tungsten Graphics, Inc.";
+   return "VMware, Inc.";
 }
 
 
@@ -64,8 +64,6 @@ cell_get_param(struct pipe_screen *screen, int param)
       return 1;
    case PIPE_CAP_GLSL:
       return 1;
-   case PIPE_CAP_S3TC:
-      return 0;
    case PIPE_CAP_ANISOTROPIC_FILTER:
       return 0;
    case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index e26594448f..6a63a0e6ce 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -44,13 +44,6 @@
 
 
 
-static unsigned
-minify(unsigned d)
-{
-   return MAX2(1, d>>1);
-}
-
-
 static void
 cell_texture_layout(struct cell_texture *ct)
 {
@@ -424,7 +417,8 @@ cell_transfer_map(struct pipe_screen *screen, struct pipe_transfer *transfer)
    if (!ctrans->map)
       return NULL; /* out of memory */
 
-   if (transfer->usage & PIPE_TRANSFER_READ) {
+   if (transfer->usage == PIPE_TRANSFER_READ ||
+       transfer->usage == PIPE_TRANSFER_READ_WRITE) {
       /* need to untwiddle the texture to make a linear version */
       const uint bpp = pf_get_size(ct->base.format);
       if (bpp == 4) {
@@ -465,7 +459,8 @@ cell_transfer_unmap(struct pipe_screen *screen,
                                    PIPE_BUFFER_USAGE_CPU_READ);
    }
 
-   if (transfer->usage & PIPE_TRANSFER_WRITE) {
+   if (transfer->usage == PIPE_TRANSFER_WRITE ||
+       transfer->usage == PIPE_TRANSFER_READ_WRITE) {
       /* The user wrote new texture data into the mapped buffer.
        * We need to convert the new linear data into the twiddled/tiled format.
        */
diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index e27df2dfb3..0eaae2e451 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -952,7 +952,6 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_RCP:
-   /* TGSI_OPCODE_RECIP */
       FETCH( &r[0], 0, CHAN_X );
       r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q);
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
@@ -961,7 +960,6 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_RSQ:
-   /* TGSI_OPCODE_RECIPSQRT */
       FETCH( &r[0], 0, CHAN_X );
       r[0].q = micro_sqrt(r[0].q);
       r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q);
@@ -1115,7 +1113,6 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_MAD:
-   /* TGSI_OPCODE_MADD */
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
@@ -1136,8 +1133,7 @@ exec_instruction(
       }
       break;
 
-   case TGSI_OPCODE_LERP:
-   /* TGSI_OPCODE_LRP */
+   case TGSI_OPCODE_LRP:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH(&r[0], 0, chan_index);
          FETCH(&r[1], 1, chan_index);
@@ -1154,25 +1150,11 @@ exec_instruction(
       ASSERT (0);
       break;
 
-   case TGSI_OPCODE_CND0:
+   case TGSI_OPCODE_DP2A:
       ASSERT (0);
       break;
 
-   case TGSI_OPCODE_DOT2ADD:
-      /* TGSI_OPCODE_DP2A */
-      ASSERT (0);
-      break;
-
-   case TGSI_OPCODE_INDEX:
-      ASSERT (0);
-      break;
-
-   case TGSI_OPCODE_NEGATE:
-      ASSERT (0);
-      break;
-
-   case TGSI_OPCODE_FRAC:
-   /* TGSI_OPCODE_FRC */
+   case TGSI_OPCODE_FRC:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          r[0].q = micro_frc(r[0].q);
@@ -1184,8 +1166,7 @@ exec_instruction(
       ASSERT (0);
       break;
 
-   case TGSI_OPCODE_FLOOR:
-   /* TGSI_OPCODE_FLR */
+   case TGSI_OPCODE_FLR:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          r[0].q = micro_flr(r[0].q);
@@ -1201,8 +1182,7 @@ exec_instruction(
       }
       break;
 
-   case TGSI_OPCODE_EXPBASE2:
-    /* TGSI_OPCODE_EX2 */
+   case TGSI_OPCODE_EX2:
       FETCH(&r[0], 0, CHAN_X);
 
       r[0].q = micro_pow(mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q, r[0].q);
@@ -1212,8 +1192,7 @@ exec_instruction(
       }
       break;
 
-   case TGSI_OPCODE_LOGBASE2:
-   /* TGSI_OPCODE_LG2 */
+   case TGSI_OPCODE_LG2:
       FETCH( &r[0], 0, CHAN_X );
       r[0].q = micro_lg2(r[0].q);
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
@@ -1221,8 +1200,7 @@ exec_instruction(
       }
       break;
 
-   case TGSI_OPCODE_POWER:
-      /* TGSI_OPCODE_POW */
+   case TGSI_OPCODE_POW:
       FETCH(&r[0], 0, CHAN_X);
       FETCH(&r[1], 1, CHAN_X);
 
@@ -1233,7 +1211,7 @@ exec_instruction(
       }
       break;
 
-   case TGSI_OPCODE_CROSSPRODUCT:
+   case TGSI_OPCODE_XPD:
       /* TGSI_OPCODE_XPD */
       FETCH(&r[0], 0, CHAN_Y);
       FETCH(&r[1], 1, CHAN_Z);
@@ -1275,10 +1253,6 @@ exec_instruction(
       }
       break;
 
-    case TGSI_OPCODE_MULTIPLYMATRIX:
-       ASSERT (0);
-       break;
-
     case TGSI_OPCODE_ABS:
        FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
           FETCH(&r[0], 0, chan_index);
@@ -1780,9 +1754,9 @@ exec_instruction(
       mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0;
       break;
 
-   case TGSI_OPCODE_LOOP:
+   case TGSI_OPCODE_BGNFOR:
       /* fall-through (for now) */
-   case TGSI_OPCODE_BGNLOOP2:
+   case TGSI_OPCODE_BGNLOOP:
       /* push LoopMask and ContMasks */
       ASSERT(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
       mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
@@ -1790,9 +1764,9 @@ exec_instruction(
       mach->ContStack[mach->ContStackTop++] = mach->ContMask;
       break;
 
-   case TGSI_OPCODE_ENDLOOP:
+   case TGSI_OPCODE_ENDFOR:
       /* fall-through (for now at least) */
-   case TGSI_OPCODE_ENDLOOP2:
+   case TGSI_OPCODE_ENDLOOP:
       /* Restore ContMask, but don't pop */
       ASSERT(mach->ContStackTop > 0);
       mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
diff --git a/src/gallium/drivers/i915simple/Makefile b/src/gallium/drivers/i915simple/Makefile
index 8870b39866..fb533c1796 100644
--- a/src/gallium/drivers/i915simple/Makefile
+++ b/src/gallium/drivers/i915simple/Makefile
@@ -5,6 +5,7 @@ LIBNAME = i915simple
 
 C_SOURCES = \
 	i915_blit.c \
+	i915_buffer.c \
 	i915_clear.c \
 	i915_flush.c \
 	i915_context.c \
diff --git a/src/gallium/drivers/i915simple/SConscript b/src/gallium/drivers/i915simple/SConscript
index 2366e1247f..778c4ed0fd 100644
--- a/src/gallium/drivers/i915simple/SConscript
+++ b/src/gallium/drivers/i915simple/SConscript
@@ -6,6 +6,7 @@ i915simple = env.ConvenienceLibrary(
 	target = 'i915simple',
 	source = [
 		'i915_blit.c',
+		'i915_buffer.c',
 		'i915_clear.c',
 		'i915_context.c',
 		'i915_debug.c',
diff --git a/src/gallium/drivers/i915simple/i915_batch.h b/src/gallium/drivers/i915simple/i915_batch.h
index a433cf054d..b813784723 100644
--- a/src/gallium/drivers/i915simple/i915_batch.h
+++ b/src/gallium/drivers/i915simple/i915_batch.h
@@ -28,89 +28,20 @@
 #ifndef I915_BATCH_H
 #define I915_BATCH_H
 
-#include "i915_winsys.h"
+#include "intel_batchbuffer.h"
 
-struct i915_batchbuffer
-{
-   struct pipe_buffer *buffer;
-   struct i915_winsys *winsys;
+#define BEGIN_BATCH(dwords, relocs) \
+   (intel_batchbuffer_check(i915->batch, dwords, relocs))
 
-   unsigned char *map;
-   unsigned char *ptr;
+#define OUT_BATCH(dword) \
+   intel_batchbuffer_dword(i915->batch, dword)
 
-   size_t size;
-   size_t actual_size;
+#define OUT_RELOC(buf, usage, offset) \
+   intel_batchbuffer_reloc(i915->batch, buf, usage, offset)
 
-   size_t relocs;
-   size_t max_relocs;
-};
-
-static INLINE boolean
-i915_batchbuffer_check( struct i915_batchbuffer *batch,
-			size_t dwords,
-			size_t relocs )
-{
-   /** TODO JB: Check relocs */
-   return dwords * 4 <= batch->size - (batch->ptr - batch->map);
-}
-
-static INLINE size_t
-i915_batchbuffer_space( struct i915_batchbuffer *batch )
-{
-   return batch->size - (batch->ptr - batch->map);
-}
-
-static INLINE void
-i915_batchbuffer_dword( struct i915_batchbuffer *batch,
-			unsigned dword )
-{
-   if (i915_batchbuffer_space(batch) < 4)
-      return;
-
-   *(unsigned *)batch->ptr = dword;
-   batch->ptr += 4;
-}
-
-static INLINE void
-i915_batchbuffer_write( struct i915_batchbuffer *batch,
-			void *data,
-			size_t size )
-{
-   if (i915_batchbuffer_space(batch) < size)
-      return;
-
-   memcpy(data, batch->ptr, size);
-   batch->ptr += size;
-}
-
-static INLINE void
-i915_batchbuffer_reloc( struct i915_batchbuffer *batch,
-			struct pipe_buffer *buffer,
-			size_t flags,
-			size_t offset )
-{
-   batch->winsys->batch_reloc( batch->winsys, buffer, flags, offset );
-}
-
-static INLINE void
-i915_batchbuffer_flush( struct i915_batchbuffer *batch,
-			struct pipe_fence_handle **fence )
-{
-   batch->winsys->batch_flush( batch->winsys, fence );
-}
-
-#define BEGIN_BATCH( dwords, relocs ) \
-   (i915_batchbuffer_check( i915->batch, dwords, relocs ))
-
-#define OUT_BATCH( dword ) \
-   i915_batchbuffer_dword( i915->batch, dword )
-
-#define OUT_RELOC( buf, flags, delta ) \
-   i915_batchbuffer_reloc( i915->batch, buf, flags, delta )
-
-#define FLUSH_BATCH(fence) do {				\
-   i915->winsys->batch_flush( i915->winsys, fence );	\
-   i915->hardware_dirty = ~0;				\
+#define FLUSH_BATCH(fence) do {                 \
+   intel_batchbuffer_flush(i915->batch, fence); \
+   i915->hardware_dirty = ~0;                   \
 } while (0)
 
 #endif
diff --git a/src/gallium/drivers/i915simple/i915_blit.c b/src/gallium/drivers/i915simple/i915_blit.c
index 448a4708ce..83dfc33528 100644
--- a/src/gallium/drivers/i915simple/i915_blit.c
+++ b/src/gallium/drivers/i915simple/i915_blit.c
@@ -26,8 +26,6 @@
  **************************************************************************/
 
 
-#include "i915_context.h"
-#include "i915_winsys.h"
 #include "i915_blit.h"
 #include "i915_reg.h"
 #include "i915_batch.h"
@@ -37,33 +35,33 @@
 
 void
 i915_fill_blit(struct i915_context *i915,
-	       unsigned cpp,
-	       unsigned short dst_pitch,
-	       struct pipe_buffer *dst_buffer,
-	       unsigned dst_offset,
-	       short x, short y, 
-	       short w, short h, 
-	       unsigned color)
+               unsigned cpp,
+               unsigned short dst_pitch,
+               struct intel_buffer *dst_buffer,
+               unsigned dst_offset,
+               short x, short y, 
+               short w, short h, 
+               unsigned color)
 {
    unsigned BR13, CMD;
 
 
    I915_DBG(i915,
-       "%s dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
-       __FUNCTION__,
-       dst_buffer, dst_pitch, dst_offset, x, y, w, h);
+      "%s dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
+      __FUNCTION__,
+      dst_buffer, dst_pitch, dst_offset, x, y, w, h);
 
    switch (cpp) {
    case 1:
    case 2:
    case 3:
       BR13 = (((int) dst_pitch) & 0xffff) |
-	 (0xF0 << 16) | (1 << 24);
+         (0xF0 << 16) | (1 << 24);
       CMD = XY_COLOR_BLT_CMD;
       break;
    case 4:
       BR13 = (((int) dst_pitch) & 0xffff) |
-	 (0xF0 << 16) | (1 << 24) | (1 << 25);
+         (0xF0 << 16) | (1 << 24) | (1 << 25);
       CMD = (XY_COLOR_BLT_CMD | XY_COLOR_BLT_WRITE_ALPHA |
              XY_COLOR_BLT_WRITE_RGB);
       break;
@@ -79,25 +77,24 @@ i915_fill_blit(struct i915_context *i915,
    OUT_BATCH(BR13);
    OUT_BATCH((y << 16) | x);
    OUT_BATCH(((y + h) << 16) | (x + w));
-   OUT_RELOC( dst_buffer, I915_BUFFER_ACCESS_WRITE, dst_offset);
+   OUT_RELOC(dst_buffer, INTEL_USAGE_2D_TARGET, dst_offset);
    OUT_BATCH(color);
    FLUSH_BATCH(NULL);
 }
 
-
 void
-i915_copy_blit( struct i915_context *i915,
-                  unsigned do_flip,
-                  unsigned cpp,
-                  unsigned short src_pitch,
-                  struct pipe_buffer *src_buffer,
-                  unsigned src_offset,
-                  unsigned short dst_pitch,
-                  struct pipe_buffer *dst_buffer,
-                  unsigned dst_offset,
-                  short src_x, short src_y,
-                  short dst_x, short dst_y, 
-		  short w, short h )
+i915_copy_blit(struct i915_context *i915,
+               unsigned do_flip,
+               unsigned cpp,
+               unsigned short src_pitch,
+               struct intel_buffer *src_buffer,
+               unsigned src_offset,
+               unsigned short dst_pitch,
+               struct intel_buffer *dst_buffer,
+               unsigned dst_offset,
+               short src_x, short src_y,
+               short dst_x, short dst_y, 
+               short w, short h)
 {
    unsigned CMD, BR13;
    int dst_y2 = dst_y + h;
@@ -105,32 +102,30 @@ i915_copy_blit( struct i915_context *i915,
 
 
    I915_DBG(i915,
-       "%s src:buf(%p)/%d+%d %d,%d dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
-       __FUNCTION__,
-       src_buffer, src_pitch, src_offset, src_x, src_y,
-       dst_buffer, dst_pitch, dst_offset, dst_x, dst_y, w, h);
+      "%s src:buf(%p)/%d+%d %d,%d dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
+      __FUNCTION__,
+      src_buffer, src_pitch, src_offset, src_x, src_y,
+      dst_buffer, dst_pitch, dst_offset, dst_x, dst_y, w, h);
 
    switch (cpp) {
    case 1:
    case 2:
    case 3:
       BR13 = (((int) dst_pitch) & 0xffff) |
-	 (0xCC << 16) | (1 << 24);
+         (0xCC << 16) | (1 << 24);
       CMD = XY_SRC_COPY_BLT_CMD;
       break;
    case 4:
       BR13 = (((int) dst_pitch) & 0xffff) |
-	 (0xCC << 16) | (1 << 24) | (1 << 25);
-      CMD =
-         (XY_SRC_COPY_BLT_CMD | XY_SRC_COPY_BLT_WRITE_ALPHA |
-          XY_SRC_COPY_BLT_WRITE_RGB);
+             (0xCC << 16) | (1 << 24) | (1 << 25);
+      CMD = (XY_SRC_COPY_BLT_CMD | XY_SRC_COPY_BLT_WRITE_ALPHA |
+            XY_SRC_COPY_BLT_WRITE_RGB);
       break;
    default:
       return;
    }
 
-   if (dst_y2 < dst_y || 
-       dst_x2 < dst_x) {
+   if (dst_y2 < dst_y || dst_x2 < dst_x) {
       return;
    }
 
@@ -140,7 +135,6 @@ i915_copy_blit( struct i915_context *i915,
     */
    assert (dst_pitch > 0 && src_pitch > 0);
 
-
    if (!BEGIN_BATCH(8, 2)) {
       FLUSH_BATCH(NULL);
       assert(BEGIN_BATCH(8, 2));
@@ -149,11 +143,9 @@ i915_copy_blit( struct i915_context *i915,
    OUT_BATCH(BR13);
    OUT_BATCH((dst_y << 16) | dst_x);
    OUT_BATCH((dst_y2 << 16) | dst_x2);
-   OUT_RELOC(dst_buffer, I915_BUFFER_ACCESS_WRITE, dst_offset);
+   OUT_RELOC(dst_buffer, INTEL_USAGE_2D_TARGET, dst_offset);
    OUT_BATCH((src_y << 16) | src_x);
    OUT_BATCH(((int) src_pitch & 0xffff));
-   OUT_RELOC(src_buffer, I915_BUFFER_ACCESS_READ, src_offset);
+   OUT_RELOC(src_buffer, INTEL_USAGE_2D_SOURCE, src_offset);
    FLUSH_BATCH(NULL);
 }
-
-
diff --git a/src/gallium/drivers/i915simple/i915_blit.h b/src/gallium/drivers/i915simple/i915_blit.h
index 0bb3453861..8ce3220cfd 100644
--- a/src/gallium/drivers/i915simple/i915_blit.h
+++ b/src/gallium/drivers/i915simple/i915_blit.h
@@ -32,24 +32,24 @@
 
 extern void i915_copy_blit(struct i915_context *i915,
                            unsigned do_flip,
-			   unsigned cpp,
-			   unsigned short src_pitch,
-			   struct pipe_buffer *src_buffer,
-			   unsigned src_offset,
-			   unsigned short dst_pitch,
-			   struct pipe_buffer *dst_buffer,
-			   unsigned dst_offset,
-			   short srcx, short srcy,
-			   short dstx, short dsty,
-			   short w, short h );
+                           unsigned cpp,
+                           unsigned short src_pitch,
+                           struct intel_buffer *src_buffer,
+                           unsigned src_offset,
+                           unsigned short dst_pitch,
+                           struct intel_buffer *dst_buffer,
+                           unsigned dst_offset,
+                           short srcx, short srcy,
+                           short dstx, short dsty,
+                           short w, short h);
 
 extern void i915_fill_blit(struct i915_context *i915,
-			   unsigned cpp,
-			   unsigned short dst_pitch,
-			   struct pipe_buffer *dst_buffer,
-			   unsigned dst_offset,
-			   short x, short y,
-			   short w, short h, unsigned color);
+                           unsigned cpp,
+                           unsigned short dst_pitch,
+                           struct intel_buffer *dst_buffer,
+                           unsigned dst_offset,
+                           short x, short y,
+                           short w, short h, unsigned color);
 
 
 #endif
diff --git a/src/gallium/drivers/i915simple/i915_buffer.c b/src/gallium/drivers/i915simple/i915_buffer.c
new file mode 100644
index 0000000000..effeba1297
--- /dev/null
+++ b/src/gallium/drivers/i915simple/i915_buffer.c
@@ -0,0 +1,136 @@
+/**************************************************************************
+ *
+ * Copyright © 2009 Jakob Bornecrantz
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "util/u_memory.h"
+#include "i915_screen.h"
+#include "i915_buffer.h"
+
+struct intel_buffer;
+
+struct i915_buffer
+{
+   struct pipe_buffer base; /**< kept first: i915_buffer() casts a pipe_buffer pointer to this struct */
+
+   struct intel_buffer *ibuf; /**< hw buffer */
+
+   void *data; /**< user and malloc data */
+   boolean own; /**< we own the data in case of malloc */
+};
+/** Downcast a pipe_buffer to the i915_buffer that embeds it as its `base` member. */
+static INLINE struct i915_buffer *
+i915_buffer(struct pipe_buffer *buffer)
+{
+   return (struct i915_buffer *)buffer;
+}
+/** Create a buffer backed by MALLOC'ed memory; returns NULL if either allocation fails. */
+static struct pipe_buffer *
+i915_buffer_create(struct pipe_screen *screen,
+                   unsigned alignment,
+                   unsigned usage,
+                   unsigned size)
+{
+   struct i915_buffer *buf = CALLOC_STRUCT(i915_buffer);
+
+   if (!buf)
+      return NULL;
+
+   pipe_reference_init(&buf->base.reference, 1);
+   buf->base.alignment = alignment; /* recorded only; not passed to MALLOC() below */
+   buf->base.screen = screen;
+   buf->base.usage = usage;
+   buf->base.size = size;
+   buf->data = MALLOC(size);
+   buf->own = TRUE; /* i915_buffer_destroy() frees data when own is set */
+
+   if (!buf->data)
+      goto err;
+
+   return &buf->base;
+
+err:
+   FREE(buf);
+   return NULL;
+}
+/** Wrap caller-provided memory in a buffer; ptr is stored as-is and not freed by this file. */
+static struct pipe_buffer *
+i915_user_buffer_create(struct pipe_screen *screen,
+                        void *ptr,
+                        unsigned bytes)
+{
+   struct i915_buffer *buf = CALLOC_STRUCT(i915_buffer);
+
+   if (!buf)
+      return NULL;
+
+   pipe_reference_init(&buf->base.reference, 1);
+   buf->base.alignment = 0;
+   buf->base.screen = screen;
+   buf->base.usage = 0;
+   buf->base.size = bytes;
+   buf->data = ptr;
+   buf->own = FALSE; /* i915_buffer_destroy() skips FREE(data) when own is FALSE */
+
+   return &buf->base;
+}
+/** Return the buffer's CPU data pointer; asserts that no hw buffer is attached. */
+static void *
+i915_buffer_map(struct pipe_screen *screen,
+                struct pipe_buffer *buffer,
+                unsigned usage)
+{
+   struct i915_buffer *buf = i915_buffer(buffer);
+   assert(!buf->ibuf);
+   return buf->data; /* screen and usage are not consulted */
+}
+/** Unmap does nothing beyond asserting that no hw buffer is attached. */
+static void
+i915_buffer_unmap(struct pipe_screen *screen,
+                  struct pipe_buffer *buffer)
+{
+   struct i915_buffer *buf = i915_buffer(buffer);
+   assert(!buf->ibuf);
+}
+/** Free the buffer, plus its data when own is set (as i915_buffer_create() does). */
+static void
+i915_buffer_destroy(struct pipe_buffer *buffer)
+{
+   struct i915_buffer *buf = i915_buffer(buffer);
+   assert(!buf->ibuf);
+
+   if (buf->own) /* user-supplied pointers (own == FALSE) are left untouched */
+      FREE(buf->data);
+   FREE(buf);
+}
+/** Install this file's buffer callbacks on the screen; the two range hooks are set to NULL. */
+void i915_init_screen_buffer_functions(struct i915_screen *screen)
+{
+   screen->base.buffer_create = i915_buffer_create;
+   screen->base.user_buffer_create = i915_user_buffer_create;
+   screen->base.buffer_map = i915_buffer_map;
+   screen->base.buffer_map_range = NULL;
+   screen->base.buffer_flush_mapped_range = NULL;
+   screen->base.buffer_unmap = i915_buffer_unmap;
+   screen->base.buffer_destroy = i915_buffer_destroy;
+}
diff --git a/src/gallium/drivers/i915simple/i915_buffer.h b/src/gallium/drivers/i915simple/i915_buffer.h
new file mode 100644
index 0000000000..80fda7c62f
--- /dev/null
+++ b/src/gallium/drivers/i915simple/i915_buffer.h
@@ -0,0 +1,31 @@
+/**************************************************************************
+ *
+ * Copyright © 2009 Jakob Bornecrantz
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef I915_BUFFER_H
+#define I915_BUFFER_H
+
+void i915_init_screen_buffer_functions(struct i915_screen *screen);
+
+#endif
diff --git a/src/gallium/drivers/i915simple/i915_context.c b/src/gallium/drivers/i915simple/i915_context.c
index ccf9bb31fb..e745f3342d 100644
--- a/src/gallium/drivers/i915simple/i915_context.c
+++ b/src/gallium/drivers/i915simple/i915_context.c
@@ -26,8 +26,8 @@
  **************************************************************************/
 
 #include "i915_context.h"
-#include "i915_winsys.h"
 #include "i915_state.h"
+#include "i915_screen.h"
 #include "i915_batch.h"
 #include "i915_texture.h"
 #include "i915_reg.h"
@@ -40,66 +40,58 @@
 #include "pipe/p_screen.h"
 
 
-static void i915_destroy( struct pipe_context *pipe )
-{
-   struct i915_context *i915 = i915_context( pipe );
-
-   draw_destroy( i915->draw );
-   
-   if(i915->winsys->destroy)
-      i915->winsys->destroy(i915->winsys);
-
-   FREE( i915 );
-}
+/*
+ * Draw functions
+ */
 
 
 static boolean
 i915_draw_range_elements(struct pipe_context *pipe,
-			     struct pipe_buffer *indexBuffer,
-			     unsigned indexSize,
-			     unsigned min_index,
-			     unsigned max_index,
-			     unsigned prim, unsigned start, unsigned count)
+                         struct pipe_buffer *indexBuffer,
+                         unsigned indexSize,
+                         unsigned min_index,
+                         unsigned max_index,
+                         unsigned prim, unsigned start, unsigned count)
 {
-   struct i915_context *i915 = i915_context( pipe );
+   struct i915_context *i915 = i915_context(pipe);
    struct draw_context *draw = i915->draw;
    unsigned i;
 
    if (i915->dirty)
-      i915_update_derived( i915 );
+      i915_update_derived(i915);
 
    /*
     * Map vertex buffers
     */
    for (i = 0; i < i915->num_vertex_buffers; i++) {
-      void *buf
-         = pipe_buffer_map(pipe->screen,
-                                    i915->vertex_buffer[i].buffer,
-                                    PIPE_BUFFER_USAGE_CPU_READ);
+      void *buf = pipe_buffer_map(pipe->screen, i915->vertex_buffer[i].buffer,
+                                  PIPE_BUFFER_USAGE_CPU_READ);
       draw_set_mapped_vertex_buffer(draw, i, buf);
    }
-   /* Map index buffer, if present */
+
+   /*
+    * Map index buffer, if present
+    */
    if (indexBuffer) {
-      void *mapped_indexes
-         = pipe_buffer_map(pipe->screen, indexBuffer,
-                                    PIPE_BUFFER_USAGE_CPU_READ);
+      void *mapped_indexes = pipe_buffer_map(pipe->screen, indexBuffer,
+                                             PIPE_BUFFER_USAGE_CPU_READ);
       draw_set_mapped_element_buffer_range(draw, indexSize,
-					   min_index,
-					   max_index,
-					   mapped_indexes);
-   }
-   else {
-      /* no index/element buffer */
+                                           min_index,
+                                           max_index,
+                                           mapped_indexes);
+   } else {
       draw_set_mapped_element_buffer(draw, 0, NULL);
    }
 
 
    draw_set_mapped_constant_buffer(draw,
                                    i915->current.constants[PIPE_SHADER_VERTEX],
-                                   ( i915->current.num_user_constants[PIPE_SHADER_VERTEX] * 
-                                     4 * sizeof(float) ));
+                                   (i915->current.num_user_constants[PIPE_SHADER_VERTEX] * 
+                                      4 * sizeof(float)));
 
-   /* draw! */
+   /*
+    * Do the drawing
+    */
    draw_arrays(i915->draw, prim, start, count);
 
    /*
@@ -109,6 +101,7 @@ i915_draw_range_elements(struct pipe_context *pipe,
       pipe_buffer_unmap(pipe->screen, i915->vertex_buffer[i].buffer);
       draw_set_mapped_vertex_buffer(draw, i, NULL);
    }
+
    if (indexBuffer) {
       pipe_buffer_unmap(pipe->screen, indexBuffer);
       draw_set_mapped_element_buffer_range(draw, 0, start, start + count - 1, NULL);
@@ -118,51 +111,88 @@ i915_draw_range_elements(struct pipe_context *pipe,
 }
 
 static boolean
-i915_draw_elements( struct pipe_context *pipe,
-                    struct pipe_buffer *indexBuffer,
-                    unsigned indexSize,
-                    unsigned prim, unsigned start, unsigned count)
+i915_draw_elements(struct pipe_context *pipe,
+                   struct pipe_buffer *indexBuffer,
+                   unsigned indexSize,
+                   unsigned prim, unsigned start, unsigned count)
 {
-   return i915_draw_range_elements( pipe, indexBuffer,
-					indexSize,
-					0, 0xffffffff,
-					prim, start, count );
+   return i915_draw_range_elements(pipe, indexBuffer,
+                                   indexSize,
+                                   0, 0xffffffff,
+                                   prim, start, count);
 }
 
-static boolean i915_draw_arrays( struct pipe_context *pipe,
-				 unsigned prim, unsigned start, unsigned count)
+static boolean
+i915_draw_arrays(struct pipe_context *pipe,
+                 unsigned prim, unsigned start, unsigned count)
 {
    return i915_draw_elements(pipe, NULL, 0, prim, start, count);
 }
 
 
+/*
+ * Is referenced functions
+ */
+
+
 static unsigned int
-i915_is_texture_referenced( struct pipe_context *pipe,
-			    struct pipe_texture *texture,
-			    unsigned face, unsigned level)
+i915_is_texture_referenced(struct pipe_context *pipe,
+                           struct pipe_texture *texture,
+                           unsigned face, unsigned level)
 {
    /**
-    * FIXME: Optimize.
+    * FIXME: Return the correct result. We can't always return referenced
+    *        since it causes a double flush within the vbo module.
     */
-
+#if 0
    return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
+#else
+   return 0;
+#endif
 }
 
 static unsigned int
-i915_is_buffer_referenced( struct pipe_context *pipe,
-			   struct pipe_buffer *buf)
+i915_is_buffer_referenced(struct pipe_context *pipe,
+                          struct pipe_buffer *buf)
 {
    /**
-    * FIXME: Optimize.
+    * FIXME: Return the correct result. We can't always return referenced
+    *        since it causes a double flush within the vbo module.
     */
-
+#if 0
    return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
+#else
+   return 0;
+#endif
 }
 
 
-struct pipe_context *i915_create_context( struct pipe_screen *screen,
-                                          struct pipe_winsys *pipe_winsys,
-                                          struct i915_winsys *i915_winsys )
+/*
+ * Generic context functions
+ */
+
+/** Tear down the context: draw module, batchbuffer (if any), framebuffer surface references, then the context itself. */
+static void i915_destroy(struct pipe_context *pipe)
+{
+   struct i915_context *i915 = i915_context(pipe);
+   int i;
+
+   draw_destroy(i915->draw);
+   
+   if(i915->batch) /* destroyed through the winsys (iws) callback */
+      i915->iws->batchbuffer_destroy(i915->batch);
+
+   /* unbind framebuffer */
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+      pipe_surface_reference(&i915->framebuffer.cbufs[i], NULL);
+   }
+   pipe_surface_reference(&i915->framebuffer.zsbuf, NULL);
+
+   FREE(i915);
+}
+
+struct pipe_context *
+i915_create_context(struct pipe_screen *screen)
 {
    struct i915_context *i915;
 
@@ -170,21 +200,20 @@ struct pipe_context *i915_create_context( struct pipe_screen *screen,
    if (i915 == NULL)
       return NULL;
 
-   i915->winsys = i915_winsys;
-   i915->pipe.winsys = pipe_winsys;
-   i915->pipe.screen = screen;
+   i915->iws = i915_screen(screen)->iws;
+   i915->base.winsys = NULL;
+   i915->base.screen = screen;
 
-   i915->pipe.destroy = i915_destroy;
+   i915->base.destroy = i915_destroy;
 
-   i915->pipe.clear = i915_clear;
+   i915->base.clear = i915_clear;
 
+   i915->base.draw_arrays = i915_draw_arrays;
+   i915->base.draw_elements = i915_draw_elements;
+   i915->base.draw_range_elements = i915_draw_range_elements;
 
-   i915->pipe.draw_arrays = i915_draw_arrays;
-   i915->pipe.draw_elements = i915_draw_elements;
-   i915->pipe.draw_range_elements = i915_draw_range_elements;
-
-   i915->pipe.is_texture_referenced = i915_is_texture_referenced;
-   i915->pipe.is_buffer_referenced = i915_is_buffer_referenced;
+   i915->base.is_texture_referenced = i915_is_texture_referenced;
+   i915->base.is_buffer_referenced = i915_is_buffer_referenced;
 
    /*
     * Create drawing context and plug our rendering stage into it.
@@ -193,27 +222,23 @@ struct pipe_context *i915_create_context( struct pipe_screen *screen,
    assert(i915->draw);
    if (!debug_get_bool_option("I915_NO_VBUF", FALSE)) {
       draw_set_rasterize_stage(i915->draw, i915_draw_vbuf_stage(i915));
-   }
-   else {
+   } else {
       draw_set_rasterize_stage(i915->draw, i915_draw_render_stage(i915));
    }
 
    i915_init_surface_functions(i915);
    i915_init_state_functions(i915);
    i915_init_flush_functions(i915);
-   i915_init_texture_functions(i915);
 
-   draw_install_aaline_stage(i915->draw, &i915->pipe);
-   draw_install_aapoint_stage(i915->draw, &i915->pipe);
+   draw_install_aaline_stage(i915->draw, &i915->base);
+   draw_install_aapoint_stage(i915->draw, &i915->base);
 
    i915->dirty = ~0;
    i915->hardware_dirty = ~0;
 
    /* Batch stream debugging is a bit hacked up at the moment:
     */
-   i915->batch = i915_winsys->batch_get(i915_winsys);
-   i915->batch->winsys = i915_winsys;
+   i915->batch = i915->iws->batchbuffer_create(i915->iws);
 
-   return &i915->pipe;
+   return &i915->base;
 }
-
diff --git a/src/gallium/drivers/i915simple/i915_context.h b/src/gallium/drivers/i915simple/i915_context.h
index b6983ba86e..234b441ce6 100644
--- a/src/gallium/drivers/i915simple/i915_context.h
+++ b/src/gallium/drivers/i915simple/i915_context.h
@@ -38,6 +38,11 @@
 #include "tgsi/tgsi_scan.h"
 
 
+struct intel_winsys;
+struct intel_buffer;
+struct intel_batchbuffer;
+
+
 #define I915_TEX_UNITS 8
 
 #define I915_DYNAMIC_MODES4       0
@@ -182,7 +187,6 @@ struct i915_sampler_state {
    unsigned maxlod;
 };
 
-
 struct i915_texture {
    struct pipe_texture base;
 
@@ -192,7 +196,8 @@ struct i915_texture {
    unsigned depth_stride;          /* per-image on i945? */
    unsigned total_nblocksy;
 
-   unsigned tiled;
+   unsigned sw_tiled; /**< tiled with software flags */
+   unsigned hw_tiled; /**< tiled with hardware fences */
 
    unsigned nr_images[PIPE_MAX_TEXTURE_LEVELS];
 
@@ -206,15 +211,15 @@ struct i915_texture {
 
    /* The data is held here:
     */
-   struct pipe_buffer *buffer;
+   struct intel_buffer *buffer;
 };
 
-struct i915_batchbuffer;
-
 struct i915_context
 {
-   struct pipe_context pipe;
-   struct i915_winsys *winsys;
+   struct pipe_context base;
+
+   struct intel_winsys *iws;
+
    struct draw_context *draw;
 
    /* The most recent drawing state as set by the driver:
@@ -243,10 +248,10 @@ struct i915_context
    unsigned num_vertex_elements;
    unsigned num_vertex_buffers;
 
-   struct i915_batchbuffer *batch;
+   struct intel_batchbuffer *batch;
 
    /** Vertex buffer */
-   struct pipe_buffer *vbo;
+   struct intel_buffer *vbo;
    size_t vbo_offset;
    unsigned vbo_flushed;
 
diff --git a/src/gallium/drivers/i915simple/i915_debug.c b/src/gallium/drivers/i915simple/i915_debug.c
index e08582efab..ce92d1af9a 100644
--- a/src/gallium/drivers/i915simple/i915_debug.c
+++ b/src/gallium/drivers/i915simple/i915_debug.c
@@ -27,7 +27,6 @@
 
 #include "i915_reg.h"
 #include "i915_context.h"
-#include "i915_winsys.h"
 #include "i915_debug.h"
 #include "i915_batch.h"
 #include "pipe/internal/p_winsys_screen.h"
@@ -864,7 +863,7 @@ static boolean i915_debug_packet( struct debug_stream *stream )
 
 
 void
-i915_dump_batchbuffer( struct i915_batchbuffer *batch )
+i915_dump_batchbuffer( struct intel_batchbuffer *batch )
 {
    struct debug_stream stream;
    unsigned *start = (unsigned*)batch->map;
diff --git a/src/gallium/drivers/i915simple/i915_debug.h b/src/gallium/drivers/i915simple/i915_debug.h
index 16ca7277c7..dd9b86e17b 100644
--- a/src/gallium/drivers/i915simple/i915_debug.h
+++ b/src/gallium/drivers/i915simple/i915_debug.h
@@ -104,9 +104,9 @@ I915_DBG(
 #endif
 
 
-struct i915_batchbuffer;
+struct intel_batchbuffer;
 
-void i915_dump_batchbuffer( struct i915_batchbuffer *i915 );
+void i915_dump_batchbuffer( struct intel_batchbuffer *i915 );
 
 void i915_debug_init( struct i915_context *i915 );
 
diff --git a/src/gallium/drivers/i915simple/i915_flush.c b/src/gallium/drivers/i915simple/i915_flush.c
index 472e0ab774..1582168eba 100644
--- a/src/gallium/drivers/i915simple/i915_flush.c
+++ b/src/gallium/drivers/i915simple/i915_flush.c
@@ -45,6 +45,7 @@ static void i915_flush( struct pipe_context *pipe,
 
    draw_flush(i915->draw);
 
+#if 0
    /* Do we need to emit an MI_FLUSH command to flush the hardware
     * caches?
     */
@@ -63,6 +64,13 @@ static void i915_flush( struct pipe_context *pipe,
       }
       OUT_BATCH( flush );
    }
+#endif
+
+#if 0
+   if (i915->batch->map == i915->batch->ptr) {
+      return;
+   }
+#endif
 
    /* If there are no flags, just flush pending commands to hardware:
     */
@@ -74,5 +82,5 @@ static void i915_flush( struct pipe_context *pipe,
 
 void i915_init_flush_functions( struct i915_context *i915 )
 {
-   i915->pipe.flush = i915_flush;
+   i915->base.flush = i915_flush;
 }
diff --git a/src/gallium/drivers/i915simple/i915_prim_emit.c b/src/gallium/drivers/i915simple/i915_prim_emit.c
index 8f1f58b2dd..d9a5c40ab9 100644
--- a/src/gallium/drivers/i915simple/i915_prim_emit.c
+++ b/src/gallium/drivers/i915simple/i915_prim_emit.c
@@ -32,7 +32,6 @@
 #include "util/u_pack_color.h"
 
 #include "i915_context.h"
-#include "i915_winsys.h"
 #include "i915_reg.h"
 #include "i915_state.h"
 #include "i915_batch.h"
diff --git a/src/gallium/drivers/i915simple/i915_prim_vbuf.c b/src/gallium/drivers/i915simple/i915_prim_vbuf.c
index 9bdd91f288..b3a7774fd6 100644
--- a/src/gallium/drivers/i915simple/i915_prim_vbuf.c
+++ b/src/gallium/drivers/i915simple/i915_prim_vbuf.c
@@ -42,13 +42,12 @@
 #include "draw/draw_vbuf.h"
 #include "util/u_debug.h"
 #include "pipe/p_inlines.h"
-#include "pipe/internal/p_winsys_screen.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "util/u_fifo.h"
 
 #include "i915_context.h"
 #include "i915_reg.h"
-#include "i915_winsys.h"
 #include "i915_batch.h"
 #include "i915_state.h"
 
@@ -59,7 +58,7 @@
 struct i915_vbuf_render {
    struct vbuf_render base;
 
-   struct i915_context *i915;   
+   struct i915_context *i915;
 
    /** Vertex size in bytes */
    size_t vertex_size;
@@ -74,12 +73,17 @@ struct i915_vbuf_render {
    unsigned fallback;
 
    /* Stuff for the vbo */
-   struct pipe_buffer *vbo;
+   struct intel_buffer *vbo;
    size_t vbo_size;
    size_t vbo_offset;
    void *vbo_ptr;
-   size_t vbo_alloc_size;
    size_t vbo_max_used;
+
+   /* stuff for the pool */
+   struct util_fifo *pool_fifo;
+   unsigned pool_used;
+   unsigned pool_buffer_size;
+   boolean pool_not_used;
 };
 
 
@@ -87,56 +91,93 @@ struct i915_vbuf_render {
  * Basically a cast wrapper.
  */
 static INLINE struct i915_vbuf_render *
-i915_vbuf_render( struct vbuf_render *render )
+i915_vbuf_render(struct vbuf_render *render)
 {
    assert(render);
    return (struct i915_vbuf_render *)render;
 }
 
-
 static const struct vertex_info *
-i915_vbuf_render_get_vertex_info( struct vbuf_render *render )
+i915_vbuf_render_get_vertex_info(struct vbuf_render *render)
 {
    struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
    struct i915_context *i915 = i915_render->i915;
 
    if (i915->dirty) {
       /* make sure we have up to date vertex layout */
-      i915_update_derived( i915 );
+      i915_update_derived(i915);
    }
 
    return &i915->current.vertex_info;
 }
 
+static boolean
+i915_vbuf_render_reserve(struct i915_vbuf_render *i915_render, size_t size)
+{
+   struct i915_context *i915 = i915_render->i915;
+
+   if (i915_render->vbo_size < size + i915_render->vbo_offset)
+      return FALSE;
+
+   if (i915->vbo_flushed)
+      return FALSE;
+
+   return TRUE;
+}
+
+static void
+i915_vbuf_render_new_buf(struct i915_vbuf_render *i915_render, size_t size)
+{
+   struct i915_context *i915 = i915_render->i915;
+   struct intel_winsys *iws = i915->iws;
+
+   if (i915_render->vbo) {
+      if (i915_render->pool_not_used)
+         iws->buffer_destroy(iws, i915_render->vbo);
+      else
+         u_fifo_add(i915_render->pool_fifo, i915_render->vbo);
+      i915_render->vbo = NULL;
+   }
+
+   i915->vbo_flushed = 0;
+
+   i915_render->vbo_size = MAX2(size, i915_render->pool_buffer_size);
+   i915_render->vbo_offset = 0;
+
+   if (i915_render->vbo_size != i915_render->pool_buffer_size) {
+      i915_render->pool_not_used = TRUE;
+      i915_render->vbo = iws->buffer_create(iws, i915_render->vbo_size, 64,
+            INTEL_NEW_VERTEX);
+   } else {
+      i915_render->pool_not_used = FALSE;
+
+      if (i915_render->pool_used >= 2) {
+         FLUSH_BATCH(NULL);
+         i915->vbo_flushed = 0;
+         i915_render->pool_used = 0;
+      }
+      u_fifo_pop(i915_render->pool_fifo, (void**)&i915_render->vbo);
+   }
+}
 
 static boolean
-i915_vbuf_render_allocate_vertices( struct vbuf_render *render,
-                                    ushort vertex_size,
-                                    ushort nr_vertices )
+i915_vbuf_render_allocate_vertices(struct vbuf_render *render,
+                                   ushort vertex_size,
+                                   ushort nr_vertices)
 {
    struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
    struct i915_context *i915 = i915_render->i915;
-   struct pipe_screen *screen = i915->pipe.screen;
    size_t size = (size_t)vertex_size * (size_t)nr_vertices;
 
    /* FIXME: handle failure */
    assert(!i915->vbo);
 
-   if (i915_render->vbo_size > size + i915_render->vbo_offset && !i915->vbo_flushed) {
-   } else {
-      i915->vbo_flushed = 0;
-      if (i915_render->vbo)
-         pipe_buffer_reference(&i915_render->vbo, NULL);
-   }
+   if (!i915_vbuf_render_reserve(i915_render, size)) {
 
-   if (!i915_render->vbo) {
-      i915_render->vbo_size = MAX2(size, i915_render->vbo_alloc_size);
-      i915_render->vbo_offset = 0;
-      i915_render->vbo = pipe_buffer_create(screen,
-                                            64,
-                                            I915_BUFFER_USAGE_LIT_VERTEX,
-                                            i915_render->vbo_size);
+      if (i915->vbo_flushed)
+         i915_render->pool_used = 0;
 
+      i915_vbuf_render_new_buf(i915_render, size);
    }
 
    i915_render->vertex_size = vertex_size;
@@ -149,40 +190,37 @@ i915_vbuf_render_allocate_vertices( struct vbuf_render *render,
    return TRUE;
 }
 
-
 static void *
-i915_vbuf_render_map_vertices( struct vbuf_render *render )
+i915_vbuf_render_map_vertices(struct vbuf_render *render)
 {
    struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
    struct i915_context *i915 = i915_render->i915;
-   struct pipe_screen *screen = i915->pipe.screen;
+   struct intel_winsys *iws = i915->iws;
 
    if (i915->vbo_flushed)
       debug_printf("%s bad vbo flush occured stalling on hw\n");
 
-   i915_render->vbo_ptr = pipe_buffer_map(screen,
-                                          i915_render->vbo,
-                                          PIPE_BUFFER_USAGE_CPU_WRITE);
+   i915_render->vbo_ptr = iws->buffer_map(iws, i915_render->vbo, TRUE);
 
    return (unsigned char *)i915_render->vbo_ptr + i915->vbo_offset;
 }
 
 static void
-i915_vbuf_render_unmap_vertices( struct vbuf_render *render,
-                                 ushort min_index,
-                                 ushort max_index )
+i915_vbuf_render_unmap_vertices(struct vbuf_render *render,
+                                ushort min_index,
+                                ushort max_index)
 {
    struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
    struct i915_context *i915 = i915_render->i915;
-   struct pipe_screen *screen = i915->pipe.screen;
+   struct intel_winsys *iws = i915->iws;
 
    i915_render->vbo_max_used = MAX2(i915_render->vbo_max_used, i915_render->vertex_size * (max_index + 1));
-   pipe_buffer_unmap(screen, i915_render->vbo);
+   iws->buffer_unmap(iws, i915_render->vbo);
 }
 
 static boolean
-i915_vbuf_render_set_primitive( struct vbuf_render *render, 
-                                unsigned prim )
+i915_vbuf_render_set_primitive(struct vbuf_render *render, 
+                               unsigned prim)
 {
    struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
    i915_render->prim = prim;
@@ -234,15 +272,13 @@ i915_vbuf_render_set_primitive( struct vbuf_render *render,
    }
 }
 
-
-
 /**
  * Used for fallbacks in draw_arrays
  */
 static void
-draw_arrays_generate_indices( struct vbuf_render *render,
-                              unsigned start, uint nr,
-                              unsigned type )
+draw_arrays_generate_indices(struct vbuf_render *render,
+                             unsigned start, uint nr,
+                             unsigned type)
 {
    struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
    struct i915_context *i915 = i915_render->i915;
@@ -251,29 +287,29 @@ draw_arrays_generate_indices( struct vbuf_render *render,
    switch(type) {
    case 0:
       for (i = start; i+1 < end; i += 2)
-	 OUT_BATCH( (i+0) | (i+1) << 16 );
+         OUT_BATCH((i+0) | (i+1) << 16);
       if (i < end)
-	 OUT_BATCH( i );
+         OUT_BATCH(i);
       break;
    case PIPE_PRIM_LINE_LOOP:
       if (nr >= 2) {
-	 for (i = start + 1; i < end; i++)
-	    OUT_BATCH( (i-0) | (i+0) << 16 );
-	 OUT_BATCH( (i-0) | (  start) << 16 );
+         for (i = start + 1; i < end; i++)
+            OUT_BATCH((i-0) | (i+0) << 16);
+         OUT_BATCH((i-0) | ( start) << 16);
       }
       break;
    case PIPE_PRIM_QUADS:
       for (i = start; i + 3 < end; i += 4) {
-	 OUT_BATCH( (i+0) | (i+1) << 16 );
-	 OUT_BATCH( (i+3) | (i+1) << 16 );
-	 OUT_BATCH( (i+2) | (i+3) << 16 );
+         OUT_BATCH((i+0) | (i+1) << 16);
+         OUT_BATCH((i+3) | (i+1) << 16);
+         OUT_BATCH((i+2) | (i+3) << 16);
       }
       break;
    case PIPE_PRIM_QUAD_STRIP:
       for (i = start; i + 3 < end; i += 2) {
-	 OUT_BATCH( (i+0) | (i+1) << 16 );
-	 OUT_BATCH( (i+3) | (i+2) << 16 );
-	 OUT_BATCH( (i+0) | (i+3) << 16 );
+         OUT_BATCH((i+0) | (i+1) << 16);
+         OUT_BATCH((i+3) | (i+2) << 16);
+         OUT_BATCH((i+0) | (i+3) << 16);
       }
       break;
    default:
@@ -282,16 +318,16 @@ draw_arrays_generate_indices( struct vbuf_render *render,
 }
 
 static unsigned
-draw_arrays_calc_nr_indices( uint nr, unsigned type )
+draw_arrays_calc_nr_indices(uint nr, unsigned type)
 {
    switch (type) {
    case 0:
       return nr;
    case PIPE_PRIM_LINE_LOOP:
       if (nr >= 2)
-	 return nr * 2;
+         return nr * 2;
       else
-	 return 0;
+         return 0;
    case PIPE_PRIM_QUADS:
       return (nr / 4) * 6;
    case PIPE_PRIM_QUAD_STRIP:
@@ -303,64 +339,64 @@ draw_arrays_calc_nr_indices( uint nr, unsigned type )
 }
 
 static void
-draw_arrays_fallback( struct vbuf_render *render,
-                      unsigned start,
-                      uint nr )
+draw_arrays_fallback(struct vbuf_render *render,
+                     unsigned start,
+                     uint nr)
 {
    struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
    struct i915_context *i915 = i915_render->i915;
    unsigned nr_indices;
 
    if (i915->dirty)
-      i915_update_derived( i915 );
+      i915_update_derived(i915);
 
    if (i915->hardware_dirty)
-      i915_emit_hardware_state( i915 );
+      i915_emit_hardware_state(i915);
 
-   nr_indices = draw_arrays_calc_nr_indices( nr, i915_render->fallback );
+   nr_indices = draw_arrays_calc_nr_indices(nr, i915_render->fallback);
    if (!nr_indices)
       return;
 
-   if (!BEGIN_BATCH( 1 + (nr_indices + 1)/2, 1 )) {
+   if (!BEGIN_BATCH(1 + (nr_indices + 1)/2, 1)) {
       FLUSH_BATCH(NULL);
 
       /* Make sure state is re-emitted after a flush:
        */
-      i915_update_derived( i915 );
-      i915_emit_hardware_state( i915 );
+      i915_update_derived(i915);
+      i915_emit_hardware_state(i915);
       i915->vbo_flushed = 1;
 
-      if (!BEGIN_BATCH( 1 + (nr_indices + 1)/2, 1 )) {
-	 assert(0);
-	 goto out;
+      if (!BEGIN_BATCH(1 + (nr_indices + 1)/2, 1)) {
+         assert(0);
+         goto out;
       }
    }
-   OUT_BATCH( _3DPRIMITIVE |
-	      PRIM_INDIRECT |
-	      i915_render->hwprim |
-	      PRIM_INDIRECT_ELTS |
-	      nr_indices );
+   OUT_BATCH(_3DPRIMITIVE |
+             PRIM_INDIRECT |
+             i915_render->hwprim |
+             PRIM_INDIRECT_ELTS |
+             nr_indices);
 
-   draw_arrays_generate_indices( render, start, nr, i915_render->fallback );
+   draw_arrays_generate_indices(render, start, nr, i915_render->fallback);
 
 out:
    return;
 }
 
 static void
-i915_vbuf_render_draw_arrays( struct vbuf_render *render,
-                              unsigned start,
-                              uint nr )
+i915_vbuf_render_draw_arrays(struct vbuf_render *render,
+                             unsigned start,
+                             uint nr)
 {
    struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
 
    if (i915_render->fallback) {
-      draw_arrays_fallback( render, start, nr );
+      draw_arrays_fallback(render, start, nr);
       return;
    }
 
    /* JB: TODO submit direct cmds */
-   draw_arrays_fallback( render, start, nr );
+   draw_arrays_fallback(render, start, nr);
 }
 
 /**
@@ -368,10 +404,10 @@ i915_vbuf_render_draw_arrays( struct vbuf_render *render,
  * If type is zero normal operation assumed.
  */
 static void
-draw_generate_indices( struct vbuf_render *render,
-                       const ushort *indices,
-                       uint nr_indices,
-                       unsigned type )
+draw_generate_indices(struct vbuf_render *render,
+                      const ushort *indices,
+                      uint nr_indices,
+                      unsigned type)
 {
    struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
    struct i915_context *i915 = i915_render->i915;
@@ -380,31 +416,31 @@ draw_generate_indices( struct vbuf_render *render,
    switch(type) {
    case 0:
       for (i = 0; i + 1 < nr_indices; i += 2) {
-	 OUT_BATCH( indices[i] | indices[i+1] << 16 );
+         OUT_BATCH(indices[i] | indices[i+1] << 16);
       }
       if (i < nr_indices) {
-	 OUT_BATCH( indices[i] );
+         OUT_BATCH(indices[i]);
       }
       break;
    case PIPE_PRIM_LINE_LOOP:
       if (nr_indices >= 2) {
-	 for (i = 1; i < nr_indices; i++)
-	    OUT_BATCH( indices[i-1] | indices[i] << 16 );
-	 OUT_BATCH( indices[i-1] | indices[0] << 16 );
+         for (i = 1; i < nr_indices; i++)
+            OUT_BATCH(indices[i-1] | indices[i] << 16);
+         OUT_BATCH(indices[i-1] | indices[0] << 16);
       }
       break;
    case PIPE_PRIM_QUADS:
       for (i = 0; i + 3 < nr_indices; i += 4) {
-	 OUT_BATCH( indices[i+0] | indices[i+1] << 16 );
-	 OUT_BATCH( indices[i+3] | indices[i+1] << 16 );
-	 OUT_BATCH( indices[i+2] | indices[i+3] << 16 );
+         OUT_BATCH(indices[i+0] | indices[i+1] << 16);
+         OUT_BATCH(indices[i+3] | indices[i+1] << 16);
+         OUT_BATCH(indices[i+2] | indices[i+3] << 16);
       }
       break;
    case PIPE_PRIM_QUAD_STRIP:
       for (i = 0; i + 3 < nr_indices; i += 2) {
-	 OUT_BATCH( indices[i+0] | indices[i+1] << 16 );
-	 OUT_BATCH( indices[i+3] | indices[i+2] << 16 );
-	 OUT_BATCH( indices[i+0] | indices[i+3] << 16 );
+         OUT_BATCH(indices[i+0] | indices[i+1] << 16);
+         OUT_BATCH(indices[i+3] | indices[i+2] << 16);
+         OUT_BATCH(indices[i+0] | indices[i+3] << 16);
       }
       break;
    default:
@@ -414,16 +450,16 @@ draw_generate_indices( struct vbuf_render *render,
 }
 
 static unsigned
-draw_calc_nr_indices( uint nr_indices, unsigned type )
+draw_calc_nr_indices(uint nr_indices, unsigned type)
 {
    switch (type) {
    case 0:
       return nr_indices;
    case PIPE_PRIM_LINE_LOOP:
       if (nr_indices >= 2)
-	 return nr_indices * 2;
+         return nr_indices * 2;
       else
-	 return 0;
+         return 0;
    case PIPE_PRIM_QUADS:
       return (nr_indices / 4) * 6;
    case PIPE_PRIM_QUAD_STRIP:
@@ -435,9 +471,9 @@ draw_calc_nr_indices( uint nr_indices, unsigned type )
 }
 
 static void 
-i915_vbuf_render_draw( struct vbuf_render *render,
-                       const ushort *indices,
-                       uint nr_indices)
+i915_vbuf_render_draw(struct vbuf_render *render,
+                      const ushort *indices,
+                      uint nr_indices)
 {
    struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
    struct i915_context *i915 = i915_render->i915;
@@ -445,48 +481,47 @@ i915_vbuf_render_draw( struct vbuf_render *render,
 
    save_nr_indices = nr_indices;
 
-   nr_indices = draw_calc_nr_indices( nr_indices, i915_render->fallback );
+   nr_indices = draw_calc_nr_indices(nr_indices, i915_render->fallback);
    if (!nr_indices)
       return;
 
    if (i915->dirty)
-      i915_update_derived( i915 );
+      i915_update_derived(i915);
 
    if (i915->hardware_dirty)
-      i915_emit_hardware_state( i915 );
+      i915_emit_hardware_state(i915);
 
-   if (!BEGIN_BATCH( 1 + (nr_indices + 1)/2, 1 )) {
+   if (!BEGIN_BATCH(1 + (nr_indices + 1)/2, 1)) {
       FLUSH_BATCH(NULL);
 
       /* Make sure state is re-emitted after a flush: 
        */
-      i915_update_derived( i915 );
-      i915_emit_hardware_state( i915 );
+      i915_update_derived(i915);
+      i915_emit_hardware_state(i915);
       i915->vbo_flushed = 1;
 
-      if (!BEGIN_BATCH( 1 + (nr_indices + 1)/2, 1 )) {
-	 assert(0);
-     goto out;
+      if (!BEGIN_BATCH(1 + (nr_indices + 1)/2, 1)) {
+         assert(0);
+         goto out;
       }
    }
 
-   OUT_BATCH( _3DPRIMITIVE |
-	      PRIM_INDIRECT |
-	      i915_render->hwprim |
-	      PRIM_INDIRECT_ELTS |
-	      nr_indices );
-   draw_generate_indices( render,
-			  indices,
-			  save_nr_indices,
-			  i915_render->fallback );
+   OUT_BATCH(_3DPRIMITIVE |
+             PRIM_INDIRECT |
+             i915_render->hwprim |
+             PRIM_INDIRECT_ELTS |
+             nr_indices);
+   draw_generate_indices(render,
+                         indices,
+                         save_nr_indices,
+                         i915_render->fallback);
 
 out:
    return;
 }
 
-
 static void
-i915_vbuf_render_release_vertices( struct vbuf_render *render )
+i915_vbuf_render_release_vertices(struct vbuf_render *render)
 {
    struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
    struct i915_context *i915 = i915_render->i915;
@@ -499,23 +534,22 @@ i915_vbuf_render_release_vertices( struct vbuf_render *render )
    i915->dirty |= I915_NEW_VBO;
 }
 
-
 static void
-i915_vbuf_render_destroy( struct vbuf_render *render )
+i915_vbuf_render_destroy(struct vbuf_render *render)
 {
    struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
    FREE(i915_render);
 }
 
-
 /**
  * Create a new primitive render.
  */
 static struct vbuf_render *
-i915_vbuf_render_create( struct i915_context *i915 )
+i915_vbuf_render_create(struct i915_context *i915)
 {
    struct i915_vbuf_render *i915_render = CALLOC_STRUCT(i915_vbuf_render);
-   struct pipe_screen *screen = i915->pipe.screen;
+   struct intel_winsys *iws = i915->iws;
+   int i;
 
    i915_render->i915 = i915;
    
@@ -536,26 +570,32 @@ i915_vbuf_render_create( struct i915_context *i915 )
    i915_render->base.release_vertices = i915_vbuf_render_release_vertices;
    i915_render->base.destroy = i915_vbuf_render_destroy;
 
-   i915_render->vbo_alloc_size = 128 * 4096;
-   i915_render->vbo_size = i915_render->vbo_alloc_size;
+
+   i915_render->vbo = NULL;
+   i915_render->vbo_size = 0;
    i915_render->vbo_offset = 0;
-   i915_render->vbo = pipe_buffer_create(screen,
-                                         64,
-                                         I915_BUFFER_USAGE_LIT_VERTEX,
-                                         i915_render->vbo_size);
-   i915_render->vbo_ptr = pipe_buffer_map(screen,
-                                          i915_render->vbo,
-                                          PIPE_BUFFER_USAGE_CPU_WRITE);
-   pipe_buffer_unmap(screen, i915_render->vbo);
+
+   i915_render->pool_used = FALSE;
+   i915_render->pool_buffer_size = 128 * 4096;
+   i915_render->pool_fifo = u_fifo_create(6);
+   for (i = 0; i < 6; i++)
+      u_fifo_add(i915_render->pool_fifo,
+                 iws->buffer_create(iws, i915_render->pool_buffer_size, 64,
+                                    INTEL_NEW_VERTEX));
+
+#if 0
+   /* TODO JB: is this really needed? */
+   i915_render->vbo_ptr = iws->buffer_map(iws, i915_render->vbo, TRUE);
+   iws->buffer_unmap(iws, i915_render->vbo);
+#endif
 
    return &i915_render->base;
 }
 
-
 /**
  * Create a new primitive vbuf/render stage.
  */
-struct draw_stage *i915_draw_vbuf_stage( struct i915_context *i915 )
+struct draw_stage *i915_draw_vbuf_stage(struct i915_context *i915)
 {
    struct vbuf_render *render;
    struct draw_stage *stage;
@@ -564,7 +604,7 @@ struct draw_stage *i915_draw_vbuf_stage( struct i915_context *i915 )
    if(!render)
       return NULL;
    
-   stage = draw_vbuf_stage( i915->draw, render );
+   stage = draw_vbuf_stage(i915->draw, render);
    if(!stage) {
       render->destroy(render);
       return NULL;
diff --git a/src/gallium/drivers/i915simple/i915_screen.c b/src/gallium/drivers/i915simple/i915_screen.c
index f4aa8e60d8..c66558c320 100644
--- a/src/gallium/drivers/i915simple/i915_screen.c
+++ b/src/gallium/drivers/i915simple/i915_screen.c
@@ -26,33 +26,36 @@
  **************************************************************************/
 
 
-#include "util/u_memory.h"
-#include "util/u_simple_screen.h"
-#include "pipe/internal/p_winsys_screen.h"
 #include "pipe/p_inlines.h"
+#include "util/u_memory.h"
 #include "util/u_string.h"
 
 #include "i915_reg.h"
 #include "i915_context.h"
 #include "i915_screen.h"
+#include "i915_buffer.h"
 #include "i915_texture.h"
-#include "i915_winsys.h"
+#include "intel_winsys.h"
+
+
+/*
+ * Probe functions
+ */
 
 
 static const char *
-i915_get_vendor( struct pipe_screen *pscreen )
+i915_get_vendor(struct pipe_screen *screen)
 {
-   return "Tungsten Graphics, Inc.";
+   return "VMware, Inc.";
 }
 
-
 static const char *
-i915_get_name( struct pipe_screen *pscreen )
+i915_get_name(struct pipe_screen *screen)
 {
    static char buffer[128];
    const char *chipset;
 
-   switch (i915_screen(pscreen)->pci_id) {
+   switch (i915_screen(screen)->pci_id) {
    case PCI_CHIP_I915_G:
       chipset = "915G";
       break;
@@ -86,7 +89,6 @@ i915_get_name( struct pipe_screen *pscreen )
    return buffer;
 }
 
-
 static int
 i915_get_param(struct pipe_screen *screen, int param)
 {
@@ -99,8 +101,6 @@ i915_get_param(struct pipe_screen *screen, int param)
       return 1;
    case PIPE_CAP_GLSL:
       return 0;
-   case PIPE_CAP_S3TC:
-      return 0;
    case PIPE_CAP_ANISOTROPIC_FILTER:
       return 0;
    case PIPE_CAP_POINT_SPRITE:
@@ -122,7 +122,6 @@ i915_get_param(struct pipe_screen *screen, int param)
    }
 }
 
-
 static float
 i915_get_paramf(struct pipe_screen *screen, int param)
 {
@@ -148,13 +147,12 @@ i915_get_paramf(struct pipe_screen *screen, int param)
    }
 }
 
-
 static boolean
-i915_is_format_supported( struct pipe_screen *screen,
-                          enum pipe_format format, 
-                          enum pipe_texture_target target,
-                          unsigned tex_usage, 
-                          unsigned geom_flags )
+i915_is_format_supported(struct pipe_screen *screen,
+                         enum pipe_format format, 
+                         enum pipe_texture_target target,
+                         unsigned tex_usage, 
+                         unsigned geom_flags)
 {
    static const enum pipe_format tex_supported[] = {
       PIPE_FORMAT_R8G8B8A8_UNORM,
@@ -173,7 +171,6 @@ i915_is_format_supported( struct pipe_screen *screen,
       PIPE_FORMAT_A8R8G8B8_UNORM,
       PIPE_FORMAT_R5G6B5_UNORM,
       PIPE_FORMAT_S8Z24_UNORM,
-      /*PIPE_FORMAT_R16G16B16A16_SNORM,*/
       PIPE_FORMAT_NONE  /* list terminator */
    };
    const enum pipe_format *list;
@@ -193,120 +190,73 @@ i915_is_format_supported( struct pipe_screen *screen,
 }
 
 
-static void
-i915_destroy_screen( struct pipe_screen *screen )
-{
-   struct pipe_winsys *winsys = screen->winsys;
-
-   if(winsys->destroy)
-      winsys->destroy(winsys);
-
-   FREE(screen);
-}
+/*
+ * Fence functions
+ */
 
 
-static struct pipe_transfer*
-i915_get_tex_transfer(struct pipe_screen *screen,
-                      struct pipe_texture *texture,
-                      unsigned face, unsigned level, unsigned zslice,
-                      enum pipe_transfer_usage usage, unsigned x, unsigned y,
-                      unsigned w, unsigned h)
+static void
+i915_fence_reference(struct pipe_screen *screen,
+                     struct pipe_fence_handle **ptr,
+                     struct pipe_fence_handle *fence)
 {
-   struct i915_texture *tex = (struct i915_texture *)texture;
-   struct i915_transfer *trans;
-   unsigned offset;  /* in bytes */
+   struct i915_screen *is = i915_screen(screen);
 
-   if (texture->target == PIPE_TEXTURE_CUBE) {
-      offset = tex->image_offset[level][face];
-   }
-   else if (texture->target == PIPE_TEXTURE_3D) {
-      offset = tex->image_offset[level][zslice];
-   }
-   else {
-      offset = tex->image_offset[level][0];
-      assert(face == 0);
-      assert(zslice == 0);
-   }
-
-   trans = CALLOC_STRUCT(i915_transfer);
-   if (trans) {
-      pipe_texture_reference(&trans->base.texture, texture);
-      trans->base.format = trans->base.format;
-      trans->base.width = w;
-      trans->base.height = h;
-      trans->base.block = texture->block;
-      trans->base.nblocksx = texture->nblocksx[level];
-      trans->base.nblocksy = texture->nblocksy[level];
-      trans->base.stride = tex->stride;
-      trans->offset = offset;
-      trans->base.usage = usage;
-   }
-   return &trans->base;
+   is->iws->fence_reference(is->iws, ptr, fence);
 }
 
-static void
-i915_tex_transfer_destroy(struct pipe_transfer *trans)
+static int
+i915_fence_signalled(struct pipe_screen *screen,
+                     struct pipe_fence_handle *fence,
+                     unsigned flags)
 {
-   pipe_texture_reference(&trans->texture, NULL);
-   FREE(trans);
+   struct i915_screen *is = i915_screen(screen);
+
+   return is->iws->fence_signalled(is->iws, fence);
 }
 
-static void *
-i915_transfer_map( struct pipe_screen *screen,
-                   struct pipe_transfer *transfer )
+static int
+i915_fence_finish(struct pipe_screen *screen,
+                  struct pipe_fence_handle *fence,
+                  unsigned flags)
 {
-   struct i915_texture *tex = (struct i915_texture *)transfer->texture;
-   char *map;
-   unsigned flags = 0;
+   struct i915_screen *is = i915_screen(screen);
 
-   if (transfer->usage != PIPE_TRANSFER_WRITE)
-      flags |= PIPE_BUFFER_USAGE_CPU_READ;
+   return is->iws->fence_finish(is->iws, fence);
+}
 
-   if (transfer->usage != PIPE_TRANSFER_READ)
-      flags |= PIPE_BUFFER_USAGE_CPU_WRITE;
 
-   map = pipe_buffer_map( screen, tex->buffer, flags );
-   if (map == NULL)
-      return NULL;
+/*
+ * Generic functions
+ */
 
-   if (transfer->texture &&
-       (flags & PIPE_BUFFER_USAGE_CPU_WRITE)) 
-   {
-      /* Do something to notify contexts of a texture change.  
-       */
-      /* i915_screen(screen)->timestamp++; */
-   }
-   
-   return map + i915_transfer(transfer)->offset +
-      transfer->y / transfer->block.height * transfer->stride +
-      transfer->x / transfer->block.width * transfer->block.size;
-}
 
 static void
-i915_transfer_unmap(struct pipe_screen *screen,
-                    struct pipe_transfer *transfer)
+i915_destroy_screen(struct pipe_screen *screen)
 {
-   struct i915_texture *tex = (struct i915_texture *)transfer->texture;
-   pipe_buffer_unmap( screen, tex->buffer );
-}
+   struct i915_screen *is = i915_screen(screen);
 
+   if (is->iws)
+      is->iws->destroy(is->iws);
 
+   FREE(is);
+}
 
 /**
  * Create a new i915_screen object
  */
 struct pipe_screen *
-i915_create_screen(struct pipe_winsys *winsys, uint pci_id)
+i915_create_screen(struct intel_winsys *iws, uint pci_id)
 {
-   struct i915_screen *i915screen = CALLOC_STRUCT(i915_screen);
+   struct i915_screen *is = CALLOC_STRUCT(i915_screen);
 
-   if (!i915screen)
+   if (!is)
       return NULL;
 
    switch (pci_id) {
    case PCI_CHIP_I915_G:
    case PCI_CHIP_I915_GM:
-      i915screen->is_i945 = FALSE;
+      is->is_i945 = FALSE;
       break;
 
    case PCI_CHIP_I945_G:
@@ -315,7 +265,7 @@ i915_create_screen(struct pipe_winsys *winsys, uint pci_id)
    case PCI_CHIP_G33_G:
    case PCI_CHIP_Q33_G:
    case PCI_CHIP_Q35_G:
-      i915screen->is_i945 = TRUE;
+      is->is_i945 = TRUE;
       break;
 
    default:
@@ -324,24 +274,25 @@ i915_create_screen(struct pipe_winsys *winsys, uint pci_id)
       return NULL;
    }
 
-   i915screen->pci_id = pci_id;
+   is->pci_id = pci_id;
+   is->iws = iws;
+
+   is->base.winsys = NULL;
 
-   i915screen->screen.winsys = winsys;
+   is->base.destroy = i915_destroy_screen;
 
-   i915screen->screen.destroy = i915_destroy_screen;
+   is->base.get_name = i915_get_name;
+   is->base.get_vendor = i915_get_vendor;
+   is->base.get_param = i915_get_param;
+   is->base.get_paramf = i915_get_paramf;
+   is->base.is_format_supported = i915_is_format_supported;
 
-   i915screen->screen.get_name = i915_get_name;
-   i915screen->screen.get_vendor = i915_get_vendor;
-   i915screen->screen.get_param = i915_get_param;
-   i915screen->screen.get_paramf = i915_get_paramf;
-   i915screen->screen.is_format_supported = i915_is_format_supported;
-   i915screen->screen.get_tex_transfer = i915_get_tex_transfer;
-   i915screen->screen.tex_transfer_destroy = i915_tex_transfer_destroy;
-   i915screen->screen.transfer_map = i915_transfer_map;
-   i915screen->screen.transfer_unmap = i915_transfer_unmap;
+   is->base.fence_reference = i915_fence_reference;
+   is->base.fence_signalled = i915_fence_signalled;
+   is->base.fence_finish = i915_fence_finish;
 
-   i915_init_screen_texture_functions(&i915screen->screen);
-   u_simple_screen_init(&i915screen->screen);
+   i915_init_screen_texture_functions(is);
+   i915_init_screen_buffer_functions(is);
 
-   return &i915screen->screen;
+   return &is->base;
 }
diff --git a/src/gallium/drivers/i915simple/i915_screen.h b/src/gallium/drivers/i915simple/i915_screen.h
index 5284c32595..5126485caa 100644
--- a/src/gallium/drivers/i915simple/i915_screen.h
+++ b/src/gallium/drivers/i915simple/i915_screen.h
@@ -25,17 +25,14 @@
  * 
  **************************************************************************/
 
-
 #ifndef I915_SCREEN_H
 #define I915_SCREEN_H
 
-
+#include "pipe/p_state.h"
 #include "pipe/p_screen.h"
 
 
-#ifdef __cplusplus
-extern "C" {
-#endif
+struct intel_winsys;
 
 
 /**
@@ -43,13 +40,14 @@ extern "C" {
  */
 struct i915_screen
 {
-   struct pipe_screen screen;
+   struct pipe_screen base; /* generic screen; i915_create_screen() returns &base */
+
+   struct intel_winsys *iws; /* winsys given to i915_create_screen(); destroyed with the screen */
 
    boolean is_i945;
    uint pci_id;
 };
 
-
 /**
  * Subclass of pipe_transfer
  */
@@ -61,7 +59,11 @@ struct i915_transfer
 };
 
 
-/** cast wrappers */
+/*
+ * Cast wrappers
+ */
+
+
 static INLINE struct i915_screen *
 i915_screen(struct pipe_screen *pscreen)
 {
@@ -69,14 +71,10 @@ i915_screen(struct pipe_screen *pscreen)
 }
 
 static INLINE struct i915_transfer *
-i915_transfer( struct pipe_transfer *transfer )
+i915_transfer(struct pipe_transfer *transfer)
 {
    return (struct i915_transfer *)transfer;
 }
 
 
-#ifdef __cplusplus
-}
-#endif
-
 #endif /* I915_SCREEN_H */
diff --git a/src/gallium/drivers/i915simple/i915_state.c b/src/gallium/drivers/i915simple/i915_state.c
index 273e74002a..7d48e6e84d 100644
--- a/src/gallium/drivers/i915simple/i915_state.c
+++ b/src/gallium/drivers/i915simple/i915_state.c
@@ -518,7 +518,7 @@ static void i915_set_constant_buffer(struct pipe_context *pipe,
                                      const struct pipe_constant_buffer *buf)
 {
    struct i915_context *i915 = i915_context(pipe);
-   struct pipe_winsys *ws = pipe->winsys;
+   struct pipe_screen *screen = pipe->screen;
    draw_flush(i915->draw);
 
    assert(shader < PIPE_SHADER_TYPES);
@@ -536,10 +536,10 @@ static void i915_set_constant_buffer(struct pipe_context *pipe,
    if (buf) {
       void *mapped;
       if (buf->buffer && buf->buffer->size &&
-          (mapped = ws->buffer_map(ws, buf->buffer,
+          (mapped = pipe_buffer_map(screen, buf->buffer,
                                     PIPE_BUFFER_USAGE_CPU_READ))) {
          memcpy(i915->current.constants[shader], mapped, buf->buffer->size);
-         ws->buffer_unmap(ws, buf->buffer);
+         pipe_buffer_unmap(screen, buf->buffer);
          i915->current.num_user_constants[shader]
             = buf->buffer->size / (4 * sizeof(float));
       }
@@ -588,9 +588,17 @@ static void i915_set_framebuffer_state(struct pipe_context *pipe,
 				       const struct pipe_framebuffer_state *fb)
 {
    struct i915_context *i915 = i915_context(pipe);
+   int i;
+
    draw_flush(i915->draw);
 
-   i915->framebuffer = *fb; /* struct copy */
+   i915->framebuffer.width = fb->width;
+   i915->framebuffer.height = fb->height;
+   i915->framebuffer.nr_cbufs = fb->nr_cbufs;
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+      pipe_surface_reference(&i915->framebuffer.cbufs[i], fb->cbufs[i]);
+   }
+   pipe_surface_reference(&i915->framebuffer.zsbuf, fb->zsbuf);
 
    i915->dirty |= I915_NEW_FRAMEBUFFER;
 }
@@ -751,38 +759,38 @@ static void i915_set_edgeflags(struct pipe_context *pipe,
 void
 i915_init_state_functions( struct i915_context *i915 )
 {
-   i915->pipe.set_edgeflags = i915_set_edgeflags;
-   i915->pipe.create_blend_state = i915_create_blend_state;
-   i915->pipe.bind_blend_state = i915_bind_blend_state;
-   i915->pipe.delete_blend_state = i915_delete_blend_state;
-
-   i915->pipe.create_sampler_state = i915_create_sampler_state;
-   i915->pipe.bind_sampler_states = i915_bind_sampler_states;
-   i915->pipe.delete_sampler_state = i915_delete_sampler_state;
-
-   i915->pipe.create_depth_stencil_alpha_state = i915_create_depth_stencil_state;
-   i915->pipe.bind_depth_stencil_alpha_state = i915_bind_depth_stencil_state;
-   i915->pipe.delete_depth_stencil_alpha_state = i915_delete_depth_stencil_state;
-
-   i915->pipe.create_rasterizer_state = i915_create_rasterizer_state;
-   i915->pipe.bind_rasterizer_state = i915_bind_rasterizer_state;
-   i915->pipe.delete_rasterizer_state = i915_delete_rasterizer_state;
-   i915->pipe.create_fs_state = i915_create_fs_state;
-   i915->pipe.bind_fs_state = i915_bind_fs_state;
-   i915->pipe.delete_fs_state = i915_delete_fs_state;
-   i915->pipe.create_vs_state = i915_create_vs_state;
-   i915->pipe.bind_vs_state = i915_bind_vs_state;
-   i915->pipe.delete_vs_state = i915_delete_vs_state;
-
-   i915->pipe.set_blend_color = i915_set_blend_color;
-   i915->pipe.set_clip_state = i915_set_clip_state;
-   i915->pipe.set_constant_buffer = i915_set_constant_buffer;
-   i915->pipe.set_framebuffer_state = i915_set_framebuffer_state;
-
-   i915->pipe.set_polygon_stipple = i915_set_polygon_stipple;
-   i915->pipe.set_scissor_state = i915_set_scissor_state;
-   i915->pipe.set_sampler_textures = i915_set_sampler_textures;
-   i915->pipe.set_viewport_state = i915_set_viewport_state;
-   i915->pipe.set_vertex_buffers = i915_set_vertex_buffers;
-   i915->pipe.set_vertex_elements = i915_set_vertex_elements;
+   i915->base.set_edgeflags = i915_set_edgeflags; /* every hook below is installed on i915->base */
+   i915->base.create_blend_state = i915_create_blend_state;
+   i915->base.bind_blend_state = i915_bind_blend_state;
+   i915->base.delete_blend_state = i915_delete_blend_state;
+   /* sampler state objects */
+   i915->base.create_sampler_state = i915_create_sampler_state;
+   i915->base.bind_sampler_states = i915_bind_sampler_states;
+   i915->base.delete_sampler_state = i915_delete_sampler_state;
+   /* depth/stencil/alpha state objects */
+   i915->base.create_depth_stencil_alpha_state = i915_create_depth_stencil_state;
+   i915->base.bind_depth_stencil_alpha_state = i915_bind_depth_stencil_state;
+   i915->base.delete_depth_stencil_alpha_state = i915_delete_depth_stencil_state;
+   /* rasterizer, fragment-shader (fs) and vertex-shader (vs) state objects */
+   i915->base.create_rasterizer_state = i915_create_rasterizer_state;
+   i915->base.bind_rasterizer_state = i915_bind_rasterizer_state;
+   i915->base.delete_rasterizer_state = i915_delete_rasterizer_state;
+   i915->base.create_fs_state = i915_create_fs_state;
+   i915->base.bind_fs_state = i915_bind_fs_state;
+   i915->base.delete_fs_state = i915_delete_fs_state;
+   i915->base.create_vs_state = i915_create_vs_state;
+   i915->base.bind_vs_state = i915_bind_vs_state;
+   i915->base.delete_vs_state = i915_delete_vs_state;
+   /* directly-set (non create/bind/delete) state */
+   i915->base.set_blend_color = i915_set_blend_color;
+   i915->base.set_clip_state = i915_set_clip_state;
+   i915->base.set_constant_buffer = i915_set_constant_buffer;
+   i915->base.set_framebuffer_state = i915_set_framebuffer_state;
+
+   i915->base.set_polygon_stipple = i915_set_polygon_stipple;
+   i915->base.set_scissor_state = i915_set_scissor_state;
+   i915->base.set_sampler_textures = i915_set_sampler_textures;
+   i915->base.set_viewport_state = i915_set_viewport_state;
+   i915->base.set_vertex_buffers = i915_set_vertex_buffers;
+   i915->base.set_vertex_elements = i915_set_vertex_elements;
 }
diff --git a/src/gallium/drivers/i915simple/i915_state_emit.c b/src/gallium/drivers/i915simple/i915_state_emit.c
index 1e1fb968b4..a3d4e3b04e 100644
--- a/src/gallium/drivers/i915simple/i915_state_emit.c
+++ b/src/gallium/drivers/i915simple/i915_state_emit.c
@@ -28,7 +28,6 @@
 
 #include "i915_reg.h"
 #include "i915_context.h"
-#include "i915_winsys.h"
 #include "i915_batch.h"
 #include "i915_reg.h"
 
@@ -107,7 +106,7 @@ i915_emit_hardware_state(struct i915_context *i915 )
                              6 
                            ) * 3/2; /* plus 50% margin */
    const unsigned relocs = ( I915_TEX_UNITS +
-	                     3
+                             3
                            ) * 3/2; /* plus 50% margin */
 
 #if 0
@@ -123,9 +122,9 @@ i915_emit_hardware_state(struct i915_context *i915 )
    if (i915->hardware_dirty & I915_HW_INVARIENT)
    {
       OUT_BATCH(_3DSTATE_AA_CMD |
-		AA_LINE_ECAAR_WIDTH_ENABLE |
-		AA_LINE_ECAAR_WIDTH_1_0 |
-		AA_LINE_REGION_WIDTH_ENABLE | AA_LINE_REGION_WIDTH_1_0);
+                AA_LINE_ECAAR_WIDTH_ENABLE |
+                AA_LINE_ECAAR_WIDTH_1_0 |
+                AA_LINE_REGION_WIDTH_ENABLE | AA_LINE_REGION_WIDTH_1_0);
 
       OUT_BATCH(_3DSTATE_DFLT_DIFFUSE_CMD);
       OUT_BATCH(0);
@@ -137,24 +136,24 @@ i915_emit_hardware_state(struct i915_context *i915 )
       OUT_BATCH(0);
 
       OUT_BATCH(_3DSTATE_COORD_SET_BINDINGS |
-		CSB_TCB(0, 0) |
-		CSB_TCB(1, 1) |
-		CSB_TCB(2, 2) |
-		CSB_TCB(3, 3) |
-		CSB_TCB(4, 4) | 
-		CSB_TCB(5, 5) | 
-		CSB_TCB(6, 6) | 
-		CSB_TCB(7, 7));
+                CSB_TCB(0, 0) |
+                CSB_TCB(1, 1) |
+                CSB_TCB(2, 2) |
+                CSB_TCB(3, 3) |
+                CSB_TCB(4, 4) | 
+                CSB_TCB(5, 5) | 
+                CSB_TCB(6, 6) | 
+                CSB_TCB(7, 7));
 
       OUT_BATCH(_3DSTATE_RASTER_RULES_CMD |
-		ENABLE_POINT_RASTER_RULE |
-		OGL_POINT_RASTER_RULE |
-		ENABLE_LINE_STRIP_PROVOKE_VRTX |
-		ENABLE_TRI_FAN_PROVOKE_VRTX |
-		LINE_STRIP_PROVOKE_VRTX(1) |
-		TRI_FAN_PROVOKE_VRTX(2) | 
-		ENABLE_TEXKILL_3D_4D | 
-		TEXKILL_4D);
+                ENABLE_POINT_RASTER_RULE |
+                OGL_POINT_RASTER_RULE |
+                ENABLE_LINE_STRIP_PROVOKE_VRTX |
+                ENABLE_TRI_FAN_PROVOKE_VRTX |
+                LINE_STRIP_PROVOKE_VRTX(1) |
+                TRI_FAN_PROVOKE_VRTX(2) | 
+                ENABLE_TEXKILL_3D_4D | 
+                TEXKILL_4D);
 
       /* Need to initialize this to zero.
        */
@@ -173,21 +172,21 @@ i915_emit_hardware_state(struct i915_context *i915 )
    if (i915->hardware_dirty & I915_HW_IMMEDIATE)
    {
       OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | 
-		I1_LOAD_S(0) |
-		I1_LOAD_S(1) |
-		I1_LOAD_S(2) |
-		I1_LOAD_S(4) |
-		I1_LOAD_S(5) |
-		I1_LOAD_S(6) | 
-		(5));
+                I1_LOAD_S(0) |
+                I1_LOAD_S(1) |
+                I1_LOAD_S(2) |
+                I1_LOAD_S(4) |
+                I1_LOAD_S(5) |
+                I1_LOAD_S(6) | 
+                (5));
       
       if(i915->vbo)
          OUT_RELOC(i915->vbo,
-                   I915_BUFFER_ACCESS_READ,
+                   INTEL_USAGE_VERTEX,
                    i915->current.immediate[I915_IMMEDIATE_S0]);
       else
-	 /* FIXME: we should not do this */
-	 OUT_BATCH(0);
+         /* FIXME: we should not do this */
+         OUT_BATCH(0);
       OUT_BATCH(i915->current.immediate[I915_IMMEDIATE_S1]);
       OUT_BATCH(i915->current.immediate[I915_IMMEDIATE_S2]);
       OUT_BATCH(i915->current.immediate[I915_IMMEDIATE_S4]);
@@ -200,7 +199,7 @@ i915_emit_hardware_state(struct i915_context *i915 )
    {
       int i;
       for (i = 0; i < I915_MAX_DYNAMIC; i++) {
-	 OUT_BATCH(i915->current.dynamic[i]);
+         OUT_BATCH(i915->current.dynamic[i]);
       }
    }
    
@@ -211,68 +210,68 @@ i915_emit_hardware_state(struct i915_context *i915 )
       struct pipe_surface *depth_surface = i915->framebuffer.zsbuf;
 
       if (cbuf_surface) {
-	 unsigned ctile = BUF_3D_USE_FENCE;
+         unsigned ctile = BUF_3D_USE_FENCE;
          struct i915_texture *tex = (struct i915_texture *)
                                     cbuf_surface->texture;
          assert(tex);
 
-	 if (tex && tex->tiled) {
-	    ctile = BUF_3D_TILED_SURFACE;
-	 }
+         if (tex && tex->sw_tiled) {
+            ctile = BUF_3D_TILED_SURFACE;
+         }
 
-	 OUT_BATCH(_3DSTATE_BUF_INFO_CMD);
+         OUT_BATCH(_3DSTATE_BUF_INFO_CMD);
 
-	 OUT_BATCH(BUF_3D_ID_COLOR_BACK |
-		   BUF_3D_PITCH(tex->stride) |  /* pitch in bytes */
-		   ctile);
+         OUT_BATCH(BUF_3D_ID_COLOR_BACK |
+                   BUF_3D_PITCH(tex->stride) |  /* pitch in bytes */
+                   ctile);
 
-	 OUT_RELOC(tex->buffer,
-		   I915_BUFFER_ACCESS_WRITE,
-		   cbuf_surface->offset);
+         OUT_RELOC(tex->buffer,
+                   INTEL_USAGE_RENDER,
+                   cbuf_surface->offset);
       }
 
       /* What happens if no zbuf??
        */
       if (depth_surface) {
-	 unsigned ztile = BUF_3D_USE_FENCE;
+         unsigned ztile = BUF_3D_USE_FENCE;
          struct i915_texture *tex = (struct i915_texture *)
                                     depth_surface->texture;
          assert(tex);
 
-	 if (tex && tex->tiled) {
-	    ztile = BUF_3D_TILED_SURFACE;
-	 }
+         if (tex && tex->sw_tiled) {
+            ztile = BUF_3D_TILED_SURFACE;
+         }
 
-	 OUT_BATCH(_3DSTATE_BUF_INFO_CMD);
+         OUT_BATCH(_3DSTATE_BUF_INFO_CMD);
 
-	 OUT_BATCH(BUF_3D_ID_DEPTH |
-		   BUF_3D_PITCH(tex->stride) |  /* pitch in bytes */
-		   ztile);
+         OUT_BATCH(BUF_3D_ID_DEPTH |
+                   BUF_3D_PITCH(tex->stride) |  /* pitch in bytes */
+                   ztile);
 
-	 OUT_RELOC(tex->buffer,
-		   I915_BUFFER_ACCESS_WRITE,
-		   depth_surface->offset);
+         OUT_RELOC(tex->buffer,
+                   INTEL_USAGE_RENDER,
+                   depth_surface->offset);
       }
    
       {
-	 unsigned cformat, zformat = 0;
+         unsigned cformat, zformat = 0;
       
-	 if (cbuf_surface)
+         if (cbuf_surface)
             cformat = cbuf_surface->format;
          else
             cformat = PIPE_FORMAT_A8R8G8B8_UNORM; /* arbitrary */
          cformat = translate_format(cformat);
 
-	 if (depth_surface) 
-	    zformat = translate_depth_format( i915->framebuffer.zsbuf->format );
+         if (depth_surface) 
+            zformat = translate_depth_format( i915->framebuffer.zsbuf->format );
 
-	 OUT_BATCH(_3DSTATE_DST_BUF_VARS_CMD);
-	 OUT_BATCH(DSTORG_HORT_BIAS(0x8) | /* .5 */
-		   DSTORG_VERT_BIAS(0x8) | /* .5 */
-		   LOD_PRECLAMP_OGL |
-		   TEX_DEFAULT_COLOR_OGL |
-		   cformat |
-		   zformat );
+         OUT_BATCH(_3DSTATE_DST_BUF_VARS_CMD);
+         OUT_BATCH(DSTORG_HORT_BIAS(0x8) | /* .5 */
+                   DSTORG_VERT_BIAS(0x8) | /* .5 */
+                   LOD_PRECLAMP_OGL |
+                   TEX_DEFAULT_COLOR_OGL |
+                   cformat |
+                   zformat );
       }
    }
 
@@ -290,16 +289,13 @@ i915_emit_hardware_state(struct i915_context *i915 )
             OUT_BATCH(enabled);
             for (unit = 0; unit < I915_TEX_UNITS; unit++) {
                if (enabled & (1 << unit)) {
-                  struct pipe_buffer *buf =
-                     i915->texture[unit]->buffer;
+                  struct intel_buffer *buf = i915->texture[unit]->buffer;
                   uint offset = 0;
                   assert(buf);
 
                   count++;
 
-                  OUT_RELOC(buf,
-                            I915_BUFFER_ACCESS_READ,
-                            offset);
+                  OUT_RELOC(buf, INTEL_USAGE_SAMPLER, offset);
                   OUT_BATCH(i915->current.texbuffer[unit][0]); /* MS3 */
                   OUT_BATCH(i915->current.texbuffer[unit][1]); /* MS4 */
                }
@@ -315,20 +311,20 @@ i915_emit_hardware_state(struct i915_context *i915 )
    if (i915->hardware_dirty & I915_HW_SAMPLER) 
    {
       if (i915->current.sampler_enable_nr) {
-	 int i;
-	 
-	 OUT_BATCH( _3DSTATE_SAMPLER_STATE | 
-		    (3 * i915->current.sampler_enable_nr) );
-
-	 OUT_BATCH( i915->current.sampler_enable_flags );
-
-	 for (i = 0; i < I915_TEX_UNITS; i++) {
-	    if (i915->current.sampler_enable_flags & (1<<i)) {
-	       OUT_BATCH( i915->current.sampler[i][0] );
-	       OUT_BATCH( i915->current.sampler[i][1] );
-	       OUT_BATCH( i915->current.sampler[i][2] );
-	    }
-	 }
+         int i;
+         
+         OUT_BATCH( _3DSTATE_SAMPLER_STATE | 
+                    (3 * i915->current.sampler_enable_nr) );
+
+         OUT_BATCH( i915->current.sampler_enable_flags );
+
+         for (i = 0; i < I915_TEX_UNITS; i++) {
+            if (i915->current.sampler_enable_flags & (1<<i)) {
+               OUT_BATCH( i915->current.sampler[i][0] );
+               OUT_BATCH( i915->current.sampler[i][1] );
+               OUT_BATCH( i915->current.sampler[i][2] );
+            }
+         }
       }
    }
 #endif
diff --git a/src/gallium/drivers/i915simple/i915_state_sampler.c b/src/gallium/drivers/i915simple/i915_state_sampler.c
index 3667ed1afa..c5e9084d12 100644
--- a/src/gallium/drivers/i915simple/i915_state_sampler.c
+++ b/src/gallium/drivers/i915simple/i915_state_sampler.c
@@ -247,7 +247,7 @@ i915_update_texture(struct i915_context *i915,
    assert(format);
    assert(pitch);
 
-   if (tex->tiled) {
+   if (tex->sw_tiled) {
       assert(!((pitch - 1) & pitch));
       tiled = MS3_TILED_SURFACE;
    }
diff --git a/src/gallium/drivers/i915simple/i915_surface.c b/src/gallium/drivers/i915simple/i915_surface.c
index 09b2c499b8..ab8331f3e6 100644
--- a/src/gallium/drivers/i915simple/i915_surface.c
+++ b/src/gallium/drivers/i915simple/i915_surface.c
@@ -89,6 +89,6 @@ i915_surface_fill(struct pipe_context *pipe,
 void
 i915_init_surface_functions(struct i915_context *i915)
 {
-   i915->pipe.surface_copy = i915_surface_copy;
-   i915->pipe.surface_fill = i915_surface_fill;
+   i915->base.surface_copy = i915_surface_copy; /* i915 surface copy/fill hooks on the context */
+   i915->base.surface_fill = i915_surface_fill;
 }
diff --git a/src/gallium/drivers/i915simple/i915_texture.c b/src/gallium/drivers/i915simple/i915_texture.c
index ca8e87af8d..6a6c654271 100644
--- a/src/gallium/drivers/i915simple/i915_texture.c
+++ b/src/gallium/drivers/i915simple/i915_texture.c
@@ -42,12 +42,14 @@
 #include "i915_texture.h"
 #include "i915_debug.h"
 #include "i915_screen.h"
-#include "i915_winsys.h"
+#include "intel_winsys.h"
+
 
 /*
  * Helper function and arrays
  */
 
+
 /**
  * Initial offset for Cube map.
  */
@@ -72,11 +74,6 @@ static const int step_offsets[6][2] = {
    {-1, 1}
 };
 
-static unsigned minify( unsigned d )
-{
-   return MAX2(1, d>>1);
-}
-
 static unsigned
 power_of_two(unsigned x)
 {
@@ -138,7 +135,7 @@ i915_miptree_set_level_info(struct i915_texture *tex,
 
 static void
 i915_miptree_set_image_offset(struct i915_texture *tex,
-			      unsigned level, unsigned img, unsigned x, unsigned y)
+                              unsigned level, unsigned img, unsigned x, unsigned y)
 {
    if (img == 0 && level == 0)
       assert(x == 0 && y == 0);
@@ -155,49 +152,189 @@ i915_miptree_set_image_offset(struct i915_texture *tex,
 
 
 /*
- * Layout functions
+ * i915 layout functions, some used by i945
  */
 
 
 /**
- * Special case to deal with display targets.
+ * Special case to deal with scanout textures.
  */
 static boolean
-i915_displaytarget_layout(struct i915_texture *tex)
+i915_scanout_layout(struct i915_texture *tex)
 {
    struct pipe_texture *pt = &tex->base;
 
    if (pt->last_level > 0 || pt->block.size != 4)
       return 0;
 
-   i915_miptree_set_level_info( tex, 0, 1,
-                                tex->base.width[0],
-                                tex->base.height[0],
-                                1 );
-   i915_miptree_set_image_offset( tex, 0, 0, 0, 0 );
+   i915_miptree_set_level_info(tex, 0, 1,
+                               tex->base.width[0],
+                               tex->base.height[0],
+                               1);
+   i915_miptree_set_image_offset(tex, 0, 0, 0, 0); /* level 0, image 0 at the origin */
 
-   if (tex->base.width[0] >= 128) {
+   if (tex->base.width[0] >= 240) { /* wide surfaces: power_of_two() stride and X tiling */
+      tex->stride = power_of_two(tex->base.nblocksx[0] * pt->block.size);
+      tex->total_nblocksy = round_up(tex->base.nblocksy[0], 8);
+      tex->hw_tiled = INTEL_TILE_X;
+   } else if (tex->base.width[0] == 64 && tex->base.height[0] == 64) { /* same sizing, hw_tiled not set */
       tex->stride = power_of_two(tex->base.nblocksx[0] * pt->block.size);
       tex->total_nblocksy = round_up(tex->base.nblocksy[0], 8);
-#if 0 /* used for tiled display targets */
-      tex->tiled = 1;
-#endif
    } else {
-      tex->stride = round_up(tex->base.nblocksx[0] * pt->block.size, 64);
-      tex->total_nblocksy = tex->base.nblocksy[0];
+      return FALSE; /* other sizes get no special layout; the caller uses the regular one */
    }
 
-   /*
-   printf("%s size: %d,%d,%d offset %d,%d (0x%x)\n", __FUNCTION__,
+   debug_printf("%s size: %d,%d,%d offset %d,%d (0x%x)\n", __FUNCTION__, /* NOTE(review): "offset" shows stride, total_nblocksy */
       tex->base.width[0], tex->base.height[0], pt->block.size,
       tex->stride, tex->total_nblocksy, tex->stride * tex->total_nblocksy);
-   */
 
-   return 1;
+   return TRUE; /* scanout layout applied */
+}
+/** i915 1D/2D mipmap layout: all levels at x = 0, stacked top to bottom in y. */
+static void
+i915_miptree_layout_2d(struct i915_texture *tex)
+{
+   struct pipe_texture *pt = &tex->base;
+   unsigned level;
+   unsigned width = pt->width[0];
+   unsigned height = pt->height[0];
+   unsigned nblocksx = pt->nblocksx[0]; /* refreshed per level but never read */
+   unsigned nblocksy = pt->nblocksy[0];
+   /* Pitch comes from the level-0 row (blocks * block.size) via round_up(..., 4). */
+   tex->stride = round_up(pt->nblocksx[0] * pt->block.size, 4);
+   tex->total_nblocksy = 0; /* running height in block rows; also the next level's y */
+
+   for (level = 0; level <= pt->last_level; level++) {
+      i915_miptree_set_level_info(tex, level, 1, width, height, 1);
+      i915_miptree_set_image_offset(tex, level, 0, 0, tex->total_nblocksy);
+      /* Each level occupies at least 2 block rows (round_up(..., 2) applied on top). */
+      nblocksy = round_up(MAX2(2, nblocksy), 2);
+
+      tex->total_nblocksy += nblocksy;
+
+      width = minify(width);
+      height = minify(height);
+      nblocksx = pf_get_nblocksx(&pt->block, width);
+      nblocksy = pf_get_nblocksy(&pt->block, height);
+   }
+}
+/** i915 3D layout: depth slices are spaced one full mip-stack height apart in y. */
+static void
+i915_miptree_layout_3d(struct i915_texture *tex)
+{
+   struct pipe_texture *pt = &tex->base;
+   unsigned level;
+
+   unsigned width = pt->width[0];
+   unsigned height = pt->height[0];
+   unsigned depth = pt->depth[0];
+   unsigned nblocksx = pt->nblocksx[0]; /* refreshed per level but never read */
+   unsigned nblocksy = pt->nblocksy[0];
+   unsigned stack_nblocksy = 0; /* block rows summed over the levels of one slice */
+
+   /* Pitch comes from the level-0 row (blocks * block.size) via round_up(..., 4);
+    * the height of a single slice is accumulated in the loop below. */
+   tex->stride = round_up(pt->nblocksx[0] * pt->block.size, 4);
+
+   /* XXX: hardware expects/requires 9 levels at minimum, so this loop always
+    * covers levels 0..8 even when pt->last_level is smaller. */
+   for (level = 0; level <= MAX2(8, pt->last_level); level++) {
+      i915_miptree_set_level_info(tex, level, depth, width, height, depth);
+      /* NOTE(review): level can exceed pt->last_level here; confirm set_level_info accepts that. */
+      stack_nblocksy += MAX2(2, nblocksy);
+
+      width = minify(width);
+      height = minify(height);
+      depth = minify(depth);
+      nblocksx = pf_get_nblocksx(&pt->block, width);
+      nblocksy = pf_get_nblocksy(&pt->block, height);
+   }
+
+   /* Fix up the per-slice image offsets: slice i of every real level is placed
+    * at x = 0, y = i * stack_nblocksy (only levels 0..last_level are visited). */
+   depth = pt->depth[0];
+   for (level = 0; level <= pt->last_level; level++) {
+      unsigned i;
+      for (i = 0; i < depth; i++) /* depth shrinks with minify() per level */
+         i915_miptree_set_image_offset(tex, level, i, 0, i * stack_nblocksy);
+
+      depth = minify(depth);
+   }
+
+   /* Multiply slice size by texture depth for total size.  It's
+    * remarkable how wasteful of memory the i915 texture layouts
+    * are.  They are largely fixed in the i945.
+    */
+   tex->total_nblocksy = stack_nblocksy * pt->depth[0];
+}
+/** i915 cube layout: each face starts at initial_offsets[] and walks by step_offsets[]. */
+static void
+i915_miptree_layout_cube(struct i915_texture *tex)
+{
+   struct pipe_texture *pt = &tex->base;
+   unsigned width = pt->width[0], height = pt->height[0];
+   const unsigned nblocks = pt->nblocksx[0]; /* level-0 face width in blocks */
+   unsigned level;
+   unsigned face;
+
+   assert(width == height); /* cubemap images are square */
+
+   /* double pitch for cube layouts */
+   tex->stride = round_up(nblocks * pt->block.size * 2, 4);
+   tex->total_nblocksy = nblocks * 4; /* fixed height: four level-0 face widths */
+   /* NOTE(review): halves with /= 2, not minify(); can reach 0 if last_level > log2(width). */
+   for (level = 0; level <= pt->last_level; level++) {
+      i915_miptree_set_level_info(tex, level, 6, width, height, 1);
+      width /= 2;
+      height /= 2;
+   }
+
+   for (face = 0; face < 6; face++) {
+      unsigned x = initial_offsets[face][0] * nblocks; /* start offsets are in units of nblocks */
+      unsigned y = initial_offsets[face][1] * nblocks;
+      unsigned d = nblocks; /* halved before each step, so steps shrink per level */
+      /* step_offsets[] holds negative ints; adding them to unsigned x/y relies on wrap-around. */
+      for (level = 0; level <= pt->last_level; level++) {
+         i915_miptree_set_image_offset(tex, level, face, x, y);
+         d >>= 1;
+         x += step_offsets[face][0] * d;
+         y += step_offsets[face][1] * d;
+      }
+   }
+}
+/** Pick the i915 layout routine for tex->base.target; FALSE for an unknown target. */
+static boolean
+i915_miptree_layout(struct i915_texture * tex)
+{
+   struct pipe_texture *pt = &tex->base;
+
+   switch (pt->target) {
+   case PIPE_TEXTURE_1D: /* 1D shares the 2D layout */
+   case PIPE_TEXTURE_2D:
+      i915_miptree_layout_2d(tex); /* this path does not call i915_scanout_layout() */
+      break;
+   case PIPE_TEXTURE_3D:
+      i915_miptree_layout_3d(tex);
+      break;
+   case PIPE_TEXTURE_CUBE:
+      i915_miptree_layout_cube(tex);
+      break;
+   default:
+      assert(0); /* unexpected target */
+      return FALSE;
+   }
+
+   return TRUE;
 }
 
+
+/*
+ * i945 layout functions
+ */
+
+
 static void
-i945_miptree_layout_2d( struct i915_texture *tex )
+i945_miptree_layout_2d(struct i915_texture *tex)
 {
    struct pipe_texture *pt = &tex->base;
    const int align_x = 2, align_y = 4;
@@ -209,10 +346,10 @@ i945_miptree_layout_2d( struct i915_texture *tex )
    unsigned nblocksx = pt->nblocksx[0];
    unsigned nblocksy = pt->nblocksy[0];
 
-   /* used for tiled display targets */
-   if (0)
-      if (i915_displaytarget_layout(tex))
-	 return;
+   /* used for scanouts that need special layouts */
+   if (tex->base.tex_usage & PIPE_TEXTURE_USAGE_PRIMARY)
+      if (i915_scanout_layout(tex))
+         return;
 
    tex->stride = round_up(pt->nblocksx[0] * pt->block.size, 4);
 
@@ -223,11 +360,11 @@ i945_miptree_layout_2d( struct i915_texture *tex )
     */
    if (pt->last_level > 0) {
       unsigned mip1_nblocksx 
-	 = align(pf_get_nblocksx(&pt->block, minify(width)), align_x)
+         = align(pf_get_nblocksx(&pt->block, minify(width)), align_x)
          + pf_get_nblocksx(&pt->block, minify(minify(width)));
 
       if (mip1_nblocksx > nblocksx)
-	 tex->stride = mip1_nblocksx * pt->block.size;
+         tex->stride = mip1_nblocksx * pt->block.size;
    }
 
    /* Pitch must be a whole number of dwords
@@ -249,10 +386,10 @@ i945_miptree_layout_2d( struct i915_texture *tex )
       /* Layout_below: step right after second mipmap level.
        */
       if (level == 1) {
-	 x += align(nblocksx, align_x);
+         x += align(nblocksx, align_x);
       }
       else {
-	 y += nblocksy;
+         y += nblocksy;
       }
 
       width  = minify(width);
@@ -263,6 +400,63 @@ i945_miptree_layout_2d( struct i915_texture *tex )
 }
 
 static void
+i945_miptree_layout_3d(struct i915_texture *tex)
+{
+   struct pipe_texture *pt = &tex->base;
+   unsigned width = pt->width[0];
+   unsigned height = pt->height[0];
+   unsigned depth = pt->depth[0];
+   unsigned nblocksx = pt->nblocksx[0]; /* refreshed per level but never read */
+   unsigned nblocksy = pt->nblocksy[0]; /* refreshed per level but never read */
+   unsigned pack_x_pitch, pack_x_nr;
+   unsigned pack_y_pitch;
+   unsigned level;
+   /* i945 3D layout: a level's slices are packed pack_x_nr per row, rows stacked in y. */
+   tex->stride = round_up(pt->nblocksx[0] * pt->block.size, 4);
+   tex->total_nblocksy = 0;
+
+   pack_y_pitch = MAX2(pt->nblocksy[0], 2); /* y distance between rows of slices */
+   pack_x_pitch = tex->stride / pt->block.size; /* x distance between slices in a row */
+   pack_x_nr = 1; /* slices per row; level 0 has one */
+
+   for (level = 0; level <= pt->last_level; level++) {
+      int x = 0;
+      int y = 0;
+      unsigned q, j;
+
+      i915_miptree_set_level_info(tex, level, depth, width, height, depth);
+
+      for (q = 0; q < depth;) { /* q is advanced by the inner loop */
+         for (j = 0; j < pack_x_nr && q < depth; j++, q++) {
+            i915_miptree_set_image_offset(tex, level, q, x, y + tex->total_nblocksy);
+            x += pack_x_pitch;
+         }
+
+         x = 0;
+         y += pack_y_pitch;
+      }
+
+      tex->total_nblocksy += y; /* y is the number of block rows this level used */
+      /* Next level: halve the x pitch and double the slices per row while the pitch is > 4. */
+      if (pack_x_pitch > 4) {
+         pack_x_pitch >>= 1;
+         pack_x_nr <<= 1;
+         assert(pack_x_pitch * pack_x_nr * pt->block.size <= tex->stride);
+      }
+
+      if (pack_y_pitch > 2) { /* row height halves too, but never below 2 */
+         pack_y_pitch >>= 1;
+      }
+
+      width = minify(width);
+      height = minify(height);
+      depth = minify(depth);
+      nblocksx = pf_get_nblocksx(&pt->block, width);
+      nblocksy = pf_get_nblocksy(&pt->block, height);
+   }
+}
+
+static void
 i945_miptree_layout_cube(struct i915_texture *tex)
 {
    struct pipe_texture *pt = &tex->base;
@@ -364,226 +558,44 @@ i945_miptree_layout_cube(struct i915_texture *tex)
 }
 
 static boolean
-i915_miptree_layout(struct i915_texture * tex)
-{
-   struct pipe_texture *pt = &tex->base;
-   unsigned level;
-
-   switch (pt->target) {
-   case PIPE_TEXTURE_CUBE: {
-         const unsigned nblocks = pt->nblocksx[0];
-         unsigned face;
-         unsigned width = pt->width[0], height = pt->height[0];
-
-         assert(width == height); /* cubemap images are square */
-
-         /* double pitch for cube layouts */
-         tex->stride = round_up(nblocks * pt->block.size * 2, 4);
-         tex->total_nblocksy = nblocks * 4;
-
-         for (level = 0; level <= pt->last_level; level++) {
-            i915_miptree_set_level_info(tex, level, 6,
-                                         width, height,
-                                         1);
-            width /= 2;
-            height /= 2;
-         }
-
-         for (face = 0; face < 6; face++) {
-            unsigned x = initial_offsets[face][0] * nblocks;
-            unsigned y = initial_offsets[face][1] * nblocks;
-            unsigned d = nblocks;
-
-            for (level = 0; level <= pt->last_level; level++) {
-               i915_miptree_set_image_offset(tex, level, face, x, y);
-               d >>= 1;
-               x += step_offsets[face][0] * d;
-               y += step_offsets[face][1] * d;
-            }
-         }
-         break;
-      }
-   case PIPE_TEXTURE_3D:{
-         unsigned width = pt->width[0];
-         unsigned height = pt->height[0];
-         unsigned depth = pt->depth[0];
-         unsigned nblocksx = pt->nblocksx[0];
-         unsigned nblocksy = pt->nblocksy[0];
-         unsigned stack_nblocksy = 0;
-
-         /* Calculate the size of a single slice. 
-          */
-         tex->stride = round_up(pt->nblocksx[0] * pt->block.size, 4);
-
-         /* XXX: hardware expects/requires 9 levels at minimum.
-          */
-         for (level = 0; level <= MAX2(8, pt->last_level);
-              level++) {
-            i915_miptree_set_level_info(tex, level, depth,
-                                        width, height, depth);
-
-
-            stack_nblocksy += MAX2(2, nblocksy);
-
-            width = minify(width);
-            height = minify(height);
-            depth = minify(depth);
-            nblocksx = pf_get_nblocksx(&pt->block, width);
-            nblocksy = pf_get_nblocksy(&pt->block, height);
-         }
-
-         /* Fixup depth image_offsets: 
-          */
-         depth = pt->depth[0];
-         for (level = 0; level <= pt->last_level; level++) {
-            unsigned i;
-            for (i = 0; i < depth; i++) 
-               i915_miptree_set_image_offset(tex, level, i,
-                                             0, i * stack_nblocksy);
-
-            depth = minify(depth);
-         }
-
-
-         /* Multiply slice size by texture depth for total size.  It's
-          * remarkable how wasteful of memory the i915 texture layouts
-          * are.  They are largely fixed in the i945.
-          */
-         tex->total_nblocksy = stack_nblocksy * pt->depth[0];
-         break;
-      }
-
-   default:{
-         unsigned width = pt->width[0];
-         unsigned height = pt->height[0];
-         unsigned nblocksx = pt->nblocksx[0];
-         unsigned nblocksy = pt->nblocksy[0];
-
-         tex->stride = round_up(pt->nblocksx[0] * pt->block.size, 4);
-         tex->total_nblocksy = 0;
-
-         for (level = 0; level <= pt->last_level; level++) {
-            i915_miptree_set_level_info(tex, level, 1,
-                                        width, height, 1);
-            i915_miptree_set_image_offset(tex, level, 0,
-                                          0, tex->total_nblocksy);
-
-            nblocksy = round_up(MAX2(2, nblocksy), 2);
-
-	    tex->total_nblocksy += nblocksy;
-
-            width = minify(width);
-            height = minify(height);
-            nblocksx = pf_get_nblocksx(&pt->block, width);
-            nblocksy = pf_get_nblocksy(&pt->block, height);
-         }
-         break;
-      }
-   }
-   /*
-   DBG("%s: %dx%dx%d - sz 0x%x\n", __FUNCTION__,
-       tex->pitch,
-       tex->total_nblocksy, pt->block.size, tex->stride * tex->total_nblocksy);
-   */
-
-   return TRUE;
-}
-
-
-static boolean
 i945_miptree_layout(struct i915_texture * tex)
 {
    struct pipe_texture *pt = &tex->base;
-   unsigned level;
 
    switch (pt->target) {
+   case PIPE_TEXTURE_1D:
+   case PIPE_TEXTURE_2D:
+      i945_miptree_layout_2d(tex);
+      break;
+   case PIPE_TEXTURE_3D:
+      i945_miptree_layout_3d(tex);
+      break;
    case PIPE_TEXTURE_CUBE:
       i945_miptree_layout_cube(tex);
       break;
-   case PIPE_TEXTURE_3D:{
-         unsigned width = pt->width[0];
-         unsigned height = pt->height[0];
-         unsigned depth = pt->depth[0];
-         unsigned nblocksx = pt->nblocksx[0];
-         unsigned nblocksy = pt->nblocksy[0];
-         unsigned pack_x_pitch, pack_x_nr;
-         unsigned pack_y_pitch;
-
-         tex->stride = round_up(pt->nblocksx[0] * pt->block.size, 4);
-         tex->total_nblocksy = 0;
-
-         pack_y_pitch = MAX2(pt->nblocksy[0], 2);
-         pack_x_pitch = tex->stride / pt->block.size;
-         pack_x_nr = 1;
-
-         for (level = 0; level <= pt->last_level; level++) {
-            unsigned nr_images = pt->target == PIPE_TEXTURE_3D ? depth : 6;
-            int x = 0;
-            int y = 0;
-            unsigned q, j;
-
-            i915_miptree_set_level_info(tex, level, nr_images,
-                                        width, height, depth);
-
-            for (q = 0; q < nr_images;) {
-               for (j = 0; j < pack_x_nr && q < nr_images; j++, q++) {
-                  i915_miptree_set_image_offset(tex, level, q, x, y + tex->total_nblocksy);
-                  x += pack_x_pitch;
-               }
-
-               x = 0;
-               y += pack_y_pitch;
-            }
-
-
-            tex->total_nblocksy += y;
-
-            if (pack_x_pitch > 4) {
-               pack_x_pitch >>= 1;
-               pack_x_nr <<= 1;
-               assert(pack_x_pitch * pack_x_nr * pt->block.size <= tex->stride);
-            }
-
-            if (pack_y_pitch > 2) {
-               pack_y_pitch >>= 1;
-            }
-
-            width = minify(width);
-            height = minify(height);
-            depth = minify(depth);
-            nblocksx = pf_get_nblocksx(&pt->block, width);
-            nblocksy = pf_get_nblocksy(&pt->block, height);
-         }
-         break;
-      }
-
-   case PIPE_TEXTURE_1D:
-   case PIPE_TEXTURE_2D:
-//   case PIPE_TEXTURE_RECTANGLE:
-         i945_miptree_layout_2d(tex);
-         break;
    default:
       assert(0);
       return FALSE;
    }
 
-   /*
-   DBG("%s: %dx%dx%d - sz 0x%x\n", __FUNCTION__,
-       tex->pitch,
-       tex->total_nblocksy, pt->block.size, tex->stride * tex->total_nblocksy);
-   */
-
    return TRUE;
 }
 
 
+/*
+ * Screen texture functions
+ */
+
+
 static struct pipe_texture *
 i915_texture_create(struct pipe_screen *screen,
                     const struct pipe_texture *templat)
 {
-   struct i915_screen *i915screen = i915_screen(screen);
+   struct i915_screen *is = i915_screen(screen);
+   struct intel_winsys *iws = is->iws;
    struct i915_texture *tex = CALLOC_STRUCT(i915_texture);
    size_t tex_size;
+   unsigned buf_usage = 0;
 
    if (!tex)
       return NULL;
@@ -595,23 +607,35 @@ i915_texture_create(struct pipe_screen *screen,
    tex->base.nblocksx[0] = pf_get_nblocksx(&tex->base.block, tex->base.width[0]);
    tex->base.nblocksy[0] = pf_get_nblocksy(&tex->base.block, tex->base.height[0]);
    
-   if (i915screen->is_i945) {
+   if (is->is_i945) {
       if (!i945_miptree_layout(tex))
-	 goto fail;
+         goto fail;
    } else {
       if (!i915_miptree_layout(tex))
-	 goto fail;
+         goto fail;
    }
 
    tex_size = tex->stride * tex->total_nblocksy;
 
-   tex->buffer = screen->buffer_create(screen, 64,
-                                    PIPE_BUFFER_USAGE_PIXEL,
-                                    tex_size);
 
+
+   /* for scanouts and cursors, cursors aren't scanouts */
+   if (templat->tex_usage & PIPE_TEXTURE_USAGE_PRIMARY && templat->width[0] != 64)
+      buf_usage = INTEL_NEW_SCANOUT;
+   else
+      buf_usage = INTEL_NEW_TEXTURE;
+
+   tex->buffer = iws->buffer_create(iws, tex_size, 64, buf_usage);
    if (!tex->buffer)
       goto fail;
 
+   /* setup any hw fences */
+   if (tex->hw_tiled) {
+      assert(tex->sw_tiled == INTEL_TILE_NONE);
+      iws->buffer_set_fence_reg(iws, tex->buffer, tex->stride, tex->hw_tiled);
+   }
+
+   
 #if 0
    void *ptr = ws->buffer_map(ws, tex->buffer,
       PIPE_BUFFER_USAGE_CPU_WRITE);
@@ -626,18 +650,56 @@ fail:
    return NULL;
 }
 
+static struct pipe_texture *
+i915_texture_blanket(struct pipe_screen * screen,
+                     const struct pipe_texture *base,
+                     const unsigned *stride,
+                     struct pipe_buffer *buffer)
+{
+#if 0
+   struct i915_texture *tex;
+   assert(screen);
+
+   /* Only supports one type */
+   if (base->target != PIPE_TEXTURE_2D ||
+       base->last_level != 0 ||
+       base->depth[0] != 1) {
+      return NULL;
+   }
+
+   tex = CALLOC_STRUCT(i915_texture);
+   if (!tex)
+      return NULL;
+
+   tex->base = *base;
+   pipe_reference_init(&tex->base.reference, 1);
+   tex->base.screen = screen;
+
+   tex->stride = stride[0];
+
+   i915_miptree_set_level_info(tex, 0, 1, base->width[0], base->height[0], 1);
+   i915_miptree_set_image_offset(tex, 0, 0, 0, 0);
+
+   pipe_buffer_reference(&tex->buffer, buffer);
+
+   return &tex->base;
+#else
+   return NULL;
+#endif
+}
 
 static void
 i915_texture_destroy(struct pipe_texture *pt)
 {
    struct i915_texture *tex = (struct i915_texture *)pt;
+   struct intel_winsys *iws = i915_screen(pt->screen)->iws;
    uint i;
 
    /*
      DBG("%s deleting %p\n", __FUNCTION__, (void *) tex);
    */
 
-   pipe_buffer_reference(&tex->buffer, NULL);
+   iws->buffer_destroy(iws, tex->buffer);
 
    for (i = 0; i < PIPE_MAX_TEXTURE_LEVELS; i++)
       if (tex->image_offset[i])
@@ -646,6 +708,12 @@ i915_texture_destroy(struct pipe_texture *pt)
    FREE(tex);
 }
 
+
+/*
+ * Screen surface functions
+ */
+
+
 static struct pipe_surface *
 i915_get_tex_surface(struct pipe_screen *screen,
                      struct pipe_texture *pt,
@@ -681,11 +749,122 @@ i915_get_tex_surface(struct pipe_screen *screen,
    return ps;
 }
 
-static struct pipe_texture *
-i915_texture_blanket(struct pipe_screen * screen,
-                     const struct pipe_texture *base,
-                     const unsigned *stride,
-                     struct pipe_buffer *buffer)
+static void
+i915_tex_surface_destroy(struct pipe_surface *surf)
+{
+   pipe_texture_reference(&surf->texture, NULL);
+   FREE(surf);
+}
+
+
+/*
+ * Screen transfer functions
+ */
+
+
+/* Create a transfer for the w x h region at (x, y) of one image of texture. */
+static struct pipe_transfer*
+i915_get_tex_transfer(struct pipe_screen *screen, struct pipe_texture *texture,
+                      unsigned face, unsigned level, unsigned zslice,
+                      enum pipe_transfer_usage usage, unsigned x, unsigned y,
+                      unsigned w, unsigned h)
+{
+   struct i915_texture *tex = (struct i915_texture *)texture;
+   struct i915_transfer *trans;
+   unsigned offset;  /* in bytes */
+
+   if (texture->target == PIPE_TEXTURE_CUBE) {
+      offset = tex->image_offset[level][face];    /* cube: indexed by face */
+   }
+   else if (texture->target == PIPE_TEXTURE_3D) {
+      offset = tex->image_offset[level][zslice];  /* 3D: indexed by slice */
+   }
+   else {
+      offset = tex->image_offset[level][0];       /* 1D/2D: single image */
+      assert(face == 0);
+      assert(zslice == 0);
+   }
+
+   trans = CALLOC_STRUCT(i915_transfer);
+   if (!trans)
+      return NULL;  /* do not form &trans->base from a NULL pointer */
+   pipe_texture_reference(&trans->base.texture, texture);
+   trans->base.format = texture->format;  /* was a no-op self-assignment */
+   trans->base.x = x;
+   trans->base.y = y;
+   trans->base.width = w;
+   trans->base.height = h;
+   trans->base.block = texture->block;
+   trans->base.nblocksx = texture->nblocksx[level];
+   trans->base.nblocksy = texture->nblocksy[level];
+   trans->base.stride = tex->stride;
+   trans->offset = offset;
+   trans->base.usage = usage;
+   return &trans->base;
+}
+
+static void *
+i915_transfer_map(struct pipe_screen *screen,
+                  struct pipe_transfer *transfer)
+{
+   struct i915_texture *tex = (struct i915_texture *)transfer->texture;
+   struct intel_winsys *iws = i915_screen(tex->base.screen)->iws;
+   char *map;
+   boolean write = FALSE;
+
+   if (transfer->usage != PIPE_TRANSFER_READ) /* anything but read-only maps for write */
+      write = TRUE;
+
+   map = iws->buffer_map(iws, tex->buffer, write);
+   if (map == NULL)
+      return NULL;
+
+   return map + i915_transfer(transfer)->offset + /* byte offset of the image */
+      transfer->y / transfer->block.height * transfer->stride + /* block row */
+      transfer->x / transfer->block.width * transfer->block.size; /* block column */
+}
+
+static void
+i915_transfer_unmap(struct pipe_screen *screen,
+                    struct pipe_transfer *transfer)
+{
+   struct i915_texture *tex = (struct i915_texture *)transfer->texture;
+   struct intel_winsys *iws = i915_screen(tex->base.screen)->iws;
+   iws->buffer_unmap(iws, tex->buffer);
+}
+
+static void
+i915_tex_transfer_destroy(struct pipe_transfer *trans)
+{
+   pipe_texture_reference(&trans->texture, NULL);
+   FREE(trans);
+}
+
+
+/*
+ * Other texture functions
+ */
+
+
+void
+i915_init_screen_texture_functions(struct i915_screen *is)
+{
+   is->base.texture_create = i915_texture_create;
+   is->base.texture_blanket = i915_texture_blanket;
+   is->base.texture_destroy = i915_texture_destroy;
+   is->base.get_tex_surface = i915_get_tex_surface;
+   is->base.tex_surface_destroy = i915_tex_surface_destroy;
+   is->base.get_tex_transfer = i915_get_tex_transfer;
+   is->base.transfer_map = i915_transfer_map;
+   is->base.transfer_unmap = i915_transfer_unmap;
+   is->base.tex_transfer_destroy = i915_tex_transfer_destroy;
+}
+
+struct pipe_texture *
+i915_texture_blanket_intel(struct pipe_screen *screen,
+                           struct pipe_texture *base,
+                           unsigned stride,
+                           struct intel_buffer *buffer)
 {
    struct i915_texture *tex;
    assert(screen);
@@ -705,52 +884,28 @@ i915_texture_blanket(struct pipe_screen * screen,
    pipe_reference_init(&tex->base.reference, 1);
    tex->base.screen = screen;
 
-   tex->stride = stride[0];
+   tex->stride = stride;
 
    i915_miptree_set_level_info(tex, 0, 1, base->width[0], base->height[0], 1);
    i915_miptree_set_image_offset(tex, 0, 0, 0, 0);
 
-   pipe_buffer_reference(&tex->buffer, buffer);
+   tex->buffer = buffer;
 
    return &tex->base;
 }
 
-void
-i915_init_texture_functions(struct i915_context *i915)
-{
-//   i915->pipe.texture_update = i915_texture_update;
-}
-
-static void
-i915_tex_surface_destroy(struct pipe_surface *surf)
-{
-   pipe_texture_reference(&surf->texture, NULL);
-   FREE(surf);
-}
-
-void
-i915_init_screen_texture_functions(struct pipe_screen *screen)
-{
-   screen->texture_create = i915_texture_create;
-   screen->texture_destroy = i915_texture_destroy;
-   screen->get_tex_surface = i915_get_tex_surface;
-   screen->texture_blanket = i915_texture_blanket;
-   screen->tex_surface_destroy = i915_tex_surface_destroy;
-}
-
-boolean i915_get_texture_buffer( struct pipe_texture *texture,
-                                 struct pipe_buffer **buf,
-                                 unsigned *stride )
+boolean
+i915_get_texture_buffer_intel(struct pipe_texture *texture,
+                              struct intel_buffer **buffer,
+                              unsigned *stride)
 {
    struct i915_texture *tex = (struct i915_texture *)texture;
 
-   if (!tex)
+   if (!texture)
       return FALSE;
 
-   pipe_buffer_reference(buf, tex->buffer);
-
-   if (stride)
-      *stride = tex->stride;
+   *stride = tex->stride;
+   *buffer = tex->buffer;
 
    return TRUE;
 }
diff --git a/src/gallium/drivers/i915simple/i915_texture.h b/src/gallium/drivers/i915simple/i915_texture.h
index 7225016a9f..51a1dd984c 100644
--- a/src/gallium/drivers/i915simple/i915_texture.h
+++ b/src/gallium/drivers/i915simple/i915_texture.h
@@ -28,16 +28,9 @@
 #ifndef I915_TEXTURE_H
 #define I915_TEXTURE_H
 
-struct i915_context;
-struct pipe_screen;
-
+struct i915_screen;
 
 extern void
-i915_init_texture_functions(struct i915_context *i915);
-
-
-extern void
-i915_init_screen_texture_functions(struct pipe_screen *screen);
-
+i915_init_screen_texture_functions(struct i915_screen *is);
 
 #endif /* I915_TEXTURE_H */
diff --git a/src/gallium/drivers/i915simple/i915_winsys.h b/src/gallium/drivers/i915simple/i915_winsys.h
deleted file mode 100644
index ff5b34f193..0000000000
--- a/src/gallium/drivers/i915simple/i915_winsys.h
+++ /dev/null
@@ -1,143 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * \file
- * This is the interface that i915simple requires any window system
- * hosting it to implement.  This is the only include file in i915simple
- * which is public.
- *
- * This isn't currently true as the winsys needs i915_batchbuffer.h
- */
-
-#ifndef I915_WINSYS_H
-#define I915_WINSYS_H
-
-
-#include "pipe/p_defines.h"
-
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-   
-/* Pipe drivers are independent of both GL and the window system.
- * The window system provides a buffer manager and a set of additional
- * hooks for things like command buffer submission, etc.
- *
- * There clearly has to be some agreement between the window system
- * driver and the hardware driver about the format of command buffers,
- * etc.
- */
-
-struct i915_batchbuffer;
-struct pipe_texture;
-struct pipe_buffer;
-struct pipe_fence_handle;
-struct pipe_winsys;
-struct pipe_screen;
-
-
-/**
- * Additional winsys interface for i915simple.
- *
- * It is an over-simple batchbuffer mechanism.  Will want to improve the
- * performance of this, perhaps based on the cmdstream stuff.  It
- * would be pretty impossible to implement swz on top of this
- * interface.
- *
- * Will also need additions/changes to implement static/dynamic
- * indirect state.
- */
-struct i915_winsys {
-
-   void (*destroy)( struct i915_winsys *sws );
-   
-   /**
-    * Get the current batch buffer from the winsys.
-    */
-   struct i915_batchbuffer *(*batch_get)( struct i915_winsys *sws );
-
-   /**
-    * Emit a relocation to a buffer.
-    * 
-    * Used not only when the buffer addresses are not pinned, but also to 
-    * ensure refered buffers will not be destroyed until the current batch 
-    * buffer execution is finished.
-    *
-    * The access flags is a combination of I915_BUFFER_ACCESS_WRITE and 
-    * I915_BUFFER_ACCESS_READ macros.
-    */
-   void (*batch_reloc)( struct i915_winsys *sws,
-			struct pipe_buffer *buf,
-			unsigned access_flags,
-			unsigned delta );
-
-   /**
-    * Flush the batch.
-    */
-   void (*batch_flush)( struct i915_winsys *sws,
-                        struct pipe_fence_handle **fence );
-};
-
-#define I915_BUFFER_ACCESS_WRITE   0x1 
-#define I915_BUFFER_ACCESS_READ    0x2
-
-#define I915_BUFFER_USAGE_LIT_VERTEX  (PIPE_BUFFER_USAGE_CUSTOM << 0)
-
-
-/**
- * Create i915 pipe_screen.
- */
-struct pipe_screen *i915_create_screen( struct pipe_winsys *winsys,
-                                        uint pci_id );
-
-/**
- * Create a i915 pipe_context.
- */
-struct pipe_context *i915_create_context( struct pipe_screen *screen,
-                                          struct pipe_winsys *winsys,
-                                          struct i915_winsys *i915 );
-
-/**
- * Used for the winsys to get the buffer used for a texture
- * and also the stride used for the texture.
- *
- * Buffer is referenced for you so you need to unref after use.
- *
- * This is needed for example kms.
- */
-boolean i915_get_texture_buffer( struct pipe_texture *texture,
-                                 struct pipe_buffer **buf,
-                                 unsigned *stride );
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/src/gallium/drivers/i915simple/intel_batchbuffer.h b/src/gallium/drivers/i915simple/intel_batchbuffer.h
new file mode 100644
index 0000000000..db12dfd2ac
--- /dev/null
+++ b/src/gallium/drivers/i915simple/intel_batchbuffer.h
@@ -0,0 +1,87 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef INTEL_BATCH_H
+#define INTEL_BATCH_H
+
+#include "intel_winsys.h"
+
+static INLINE boolean
+intel_batchbuffer_check(struct intel_batchbuffer *batch,
+                        size_t dwords,
+                        size_t relocs)
+{
+   return dwords * 4 <= batch->size - (batch->ptr - batch->map) && /* bytes left */
+          relocs <= (batch->max_relocs - batch->relocs); /* reloc slots left */
+}
+
+static INLINE size_t
+intel_batchbuffer_space(struct intel_batchbuffer *batch)
+{
+   return batch->size - (batch->ptr - batch->map); /* size minus bytes between map and ptr */
+}
+
+static INLINE void
+intel_batchbuffer_dword(struct intel_batchbuffer *batch,
+                        unsigned dword)
+{
+   if (intel_batchbuffer_space(batch) < 4) /* no room: the dword is silently dropped */
+      return;
+
+   *(unsigned *)batch->ptr = dword; /* store at the current write pointer */
+   batch->ptr += 4;
+}
+
+/* Append size bytes from data to the batch; does nothing if they do not fit. */
+static INLINE void
+intel_batchbuffer_write(struct intel_batchbuffer *batch,
+                        void *data, size_t size)
+{
+   if (intel_batchbuffer_space(batch) < size) /* no room: the write is dropped */
+      return;
+
+   memcpy(batch->ptr, data, size); /* destination is the batch, source is data */
+   batch->ptr += size;
+}
+
+static INLINE int
+intel_batchbuffer_reloc(struct intel_batchbuffer *batch,
+                        struct intel_buffer *buffer,
+                        enum intel_buffer_usage usage,
+                        size_t offset)
+{
+   return batch->iws->batchbuffer_reloc(batch, buffer, usage, offset);
+}
+
+static INLINE void
+intel_batchbuffer_flush(struct intel_batchbuffer *batch,
+                        struct pipe_fence_handle **fence)
+{
+   batch->iws->batchbuffer_flush(batch, fence);
+}
+
+#endif
diff --git a/src/gallium/drivers/i915simple/intel_winsys.h b/src/gallium/drivers/i915simple/intel_winsys.h
new file mode 100644
index 0000000000..42c5e7470e
--- /dev/null
+++ b/src/gallium/drivers/i915simple/intel_winsys.h
@@ -0,0 +1,230 @@
+/**************************************************************************
+ *
+ * Copyright © 2009 Jakob Bornecrantz
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef INTEL_WINSYS_H
+#define INTEL_WINSYS_H
+
+#include "pipe/p_compiler.h"
+
+struct intel_winsys;
+struct intel_buffer;
+struct intel_batchbuffer;
+struct pipe_texture;
+struct pipe_fence_handle;
+
+enum intel_buffer_usage
+{
+   /* use on textures */
+   INTEL_USAGE_RENDER    = 0x01,
+   INTEL_USAGE_SAMPLER   = 0x02,
+   INTEL_USAGE_2D_TARGET = 0x04,
+   INTEL_USAGE_2D_SOURCE = 0x08,
+   /* use on vertex */
+   INTEL_USAGE_VERTEX    = 0x10,
+};
+
+enum intel_buffer_type
+{
+   INTEL_NEW_TEXTURE,
+   INTEL_NEW_SCANOUT, /**< a texture used for scanning out from */
+   INTEL_NEW_VERTEX,
+};
+
+enum intel_buffer_tile
+{
+   INTEL_TILE_NONE,
+   INTEL_TILE_X,
+   INTEL_TILE_Y,
+};
+
+struct intel_batchbuffer {
+
+   struct intel_winsys *iws;
+
+   /**
+    * Values exported to speed up writing to the batchbuffer,
+    * instead of having to go through an accessor function for
+    * each dword written.
+    */
+   /*@{*/
+   uint8_t *map;
+   uint8_t *ptr;
+   size_t size;
+
+   size_t relocs;
+   size_t max_relocs;
+   /*@}*/
+};
+
+struct intel_winsys {
+
+   /**
+    * Batchbuffer functions.
+    */
+   /*@{*/
+   /**
+    * Create a new batchbuffer.
+    */
+   struct intel_batchbuffer *(*batchbuffer_create)(struct intel_winsys *iws);
+
+   /**
+    * Emit a relocation to a buffer.
+    * Target position in batchbuffer is the same as ptr.
+    *
+    * @batch batchbuffer the relocation is emitted into.
+    * @reloc buffer whose address is to be inserted into the target.
+    * @usage how the hardware is going to use the buffer.
+    * @offset added to the reloc buffer's address.
+    * (No target parameter exists: the address is written into @batch at ptr.)
+    */
+   int (*batchbuffer_reloc)(struct intel_batchbuffer *batch,
+                            struct intel_buffer *reloc,
+                            enum intel_buffer_usage usage,
+                            unsigned offset);
+
+   /**
+    * Flush a batchbuffer.
+    */
+   void (*batchbuffer_flush)(struct intel_batchbuffer *batch,
+                             struct pipe_fence_handle **fence);
+
+   /**
+    * Destroy a batchbuffer.
+    */
+   void (*batchbuffer_destroy)(struct intel_batchbuffer *batch);
+   /*@}*/
+
+
+   /**
+    * Buffer functions.
+    */
+   /*@{*/
+   /**
+    * Create a buffer.
+    */
+   struct intel_buffer *(*buffer_create)(struct intel_winsys *iws,
+                                         unsigned size, unsigned alignment,
+                                         enum intel_buffer_type type);
+
+   /**
+    * Fence a buffer with a fence reg.
+    * Not to be confused with pipe_fence_handle.
+    */
+   int (*buffer_set_fence_reg)(struct intel_winsys *iws,
+                               struct intel_buffer *buffer,
+                               unsigned stride,
+                               enum intel_buffer_tile tile);
+
+   /**
+    * Map a buffer.
+    */
+   void *(*buffer_map)(struct intel_winsys *iws,
+                       struct intel_buffer *buffer,
+                       boolean write);
+
+   /**
+    * Unmap a buffer.
+    */
+   void (*buffer_unmap)(struct intel_winsys *iws,
+                        struct intel_buffer *buffer);
+
+   /**
+    * Write to a buffer.
+    *
+    * Arguments follows pwrite(2)
+    */
+   int (*buffer_write)(struct intel_winsys *iws,
+                       struct intel_buffer *dst,
+                       const void *src,
+                       size_t size,
+                       size_t offset);
+
+   void (*buffer_destroy)(struct intel_winsys *iws,
+                          struct intel_buffer *buffer);
+   /*@}*/
+
+
+   /**
+    * Fence functions.
+    */
+   /*@{*/
+   /**
+    * Reference fence and set ptr to fence.
+    */
+   void (*fence_reference)(struct intel_winsys *iws,
+                           struct pipe_fence_handle **ptr,
+                           struct pipe_fence_handle *fence);
+
+   /**
+    * Check if a fence has finished.
+    */
+   int (*fence_signalled)(struct intel_winsys *iws,
+                          struct pipe_fence_handle *fence);
+
+   /**
+    * Wait on a fence to finish.
+    */
+   int (*fence_finish)(struct intel_winsys *iws,
+                       struct pipe_fence_handle *fence);
+   /*@}*/
+
+
+   /**
+    * Destroy the winsys.
+    */
+   void (*destroy)(struct intel_winsys *iws);
+};
+
+
+/**
+ * Create i915 pipe_screen.
+ */
+struct pipe_screen *i915_create_screen(struct intel_winsys *iws, unsigned pci_id);
+
+/**
+ * Create an i915 pipe_context.
+ */
+struct pipe_context *i915_create_context(struct pipe_screen *screen);
+
+/**
+ * Get the intel_winsys buffer backing the texture.
+ *
+ * TODO UGLY
+ */
+boolean i915_get_texture_buffer_intel(struct pipe_texture *texture,
+                                      struct intel_buffer **buffer,
+                                      unsigned *stride);
+
+/**
+ * Wrap an intel_winsys buffer with a texture blanket.
+ *
+ * TODO UGLY
+ */
+struct pipe_texture * i915_texture_blanket_intel(struct pipe_screen *screen,
+                                                 struct pipe_texture *tmplt,
+                                                 unsigned pitch,
+                                                 struct intel_buffer *buffer);
+
+#endif
diff --git a/src/gallium/drivers/i965simple/brw_screen.c b/src/gallium/drivers/i965simple/brw_screen.c
index b22e105f10..4a84c4db23 100644
--- a/src/gallium/drivers/i965simple/brw_screen.c
+++ b/src/gallium/drivers/i965simple/brw_screen.c
@@ -39,7 +39,7 @@
 static const char *
 brw_get_vendor( struct pipe_screen *screen )
 {
-   return "Tungsten Graphics, Inc.";
+   return "VMware, Inc.";
 }
 
 
@@ -85,8 +85,6 @@ brw_get_param(struct pipe_screen *screen, int param)
       return 1;
    case PIPE_CAP_GLSL:
       return 0;
-   case PIPE_CAP_S3TC:
-      return 0;
    case PIPE_CAP_ANISOTROPIC_FILTER:
       return 0;
    case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/i965simple/brw_surface.c b/src/gallium/drivers/i965simple/brw_surface.c
index 511779dbfa..724a69b2ee 100644
--- a/src/gallium/drivers/i965simple/brw_surface.c
+++ b/src/gallium/drivers/i965simple/brw_surface.c
@@ -60,7 +60,7 @@ brw_surface_copy(struct pipe_context *pipe,
                                                        src,
                                                        PIPE_BUFFER_USAGE_CPU_READ );
       
-      pipe_copy_rect(dst_map,
+      util_copy_rect(dst_map,
                      &dst->block,
                      dst->stride,
                      dstx, dsty,
@@ -99,7 +99,7 @@ brw_surface_fill(struct pipe_context *pipe,
                                                  dst,
                                                  PIPE_BUFFER_USAGE_CPU_WRITE );
 
-      pipe_fill_rect(dst_map, &dst->block, dst->stride, dstx, dsty, width, height, value);
+      util_fill_rect(dst_map, &dst->block, dst->stride, dstx, dsty, width, height, value);
 
       pipe->screen->surface_unmap(pipe->screen, dst);
    }
diff --git a/src/gallium/drivers/i965simple/brw_tex_layout.c b/src/gallium/drivers/i965simple/brw_tex_layout.c
index 8aea8c0558..998ffaeac4 100644
--- a/src/gallium/drivers/i965simple/brw_tex_layout.c
+++ b/src/gallium/drivers/i965simple/brw_tex_layout.c
@@ -65,11 +65,6 @@ unsigned intel_compressed_alignment(unsigned internalFormat)
 }
 #endif
 
-static unsigned minify( unsigned d )
-{
-   return MAX2(1, d>>1);
-}
-
 
 static void intel_miptree_set_image_offset(struct brw_texture *tex,
                                            unsigned level,
diff --git a/src/gallium/drivers/i965simple/brw_wm_glsl.c b/src/gallium/drivers/i965simple/brw_wm_glsl.c
index ab6410aa60..db75963932 100644
--- a/src/gallium/drivers/i965simple/brw_wm_glsl.c
+++ b/src/gallium/drivers/i965simple/brw_wm_glsl.c
@@ -947,7 +947,7 @@ static void brw_wm_emit_instruction( struct brw_wm_compile *c,
 #endif
 
       break;
-   case TGSI_OPCODE_LOOP:
+   case TGSI_OPCODE_BGNFOR:
       c->loop_inst[c->loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
       break;
    case TGSI_OPCODE_BRK:
@@ -958,11 +958,11 @@ static void brw_wm_emit_instruction( struct brw_wm_compile *c,
       brw_CONT(p);
       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
       break;
-   case TGSI_OPCODE_ENDLOOP:
+   case TGSI_OPCODE_ENDFOR:
       c->loop_insn--;
       c->inst0 = c->inst1 = brw_WHILE(p, c->loop_inst[c->loop_insn]);
       /* patch all the BREAK instructions from
-	 last BEGINLOOP */
+         last BGNFOR */
       while (c->inst0 > c->loop_inst[c->loop_insn]) {
 	 c->inst0--;
 	 if (c->inst0->header.opcode == BRW_OPCODE_BREAK) {
diff --git a/src/gallium/drivers/identity/id_context.c b/src/gallium/drivers/identity/id_context.c
index a500ec6045..4e700089e3 100644
--- a/src/gallium/drivers/identity/id_context.c
+++ b/src/gallium/drivers/identity/id_context.c
@@ -501,7 +501,7 @@ identity_set_sampler_textures(struct pipe_context *_pipe,
 
    pipe->set_sampler_textures(pipe,
                               num_textures,
-                              _textures);
+                              textures);
 }
 
 static void
diff --git a/src/gallium/drivers/identity/id_drm.c b/src/gallium/drivers/identity/id_drm.c
index 555220f853..14f68ac0d0 100644
--- a/src/gallium/drivers/identity/id_drm.c
+++ b/src/gallium/drivers/identity/id_drm.c
@@ -29,6 +29,7 @@
 
 #include "util/u_memory.h"
 #include "identity/id_drm.h"
+#include "identity/id_screen.h"
 #include "identity/id_public.h"
 #include "identity/id_screen.h"
 #include "identity/id_objects.h"
@@ -60,7 +61,7 @@ identity_drm_create_screen(struct drm_api *_api, int fd,
    screen = api->create_screen(api, fd, arg);
 
    return identity_screen_create(screen);
-};
+}
 
 static struct pipe_context *
 identity_drm_create_context(struct drm_api *_api,
@@ -77,83 +78,61 @@ identity_drm_create_context(struct drm_api *_api,
    pipe = identity_context_create(_screen, pipe);
 
    return pipe;
-};
-
-static boolean
-identity_drm_buffer_from_texture(struct drm_api *_api,
-                                 struct pipe_texture *_texture,
-                                 struct pipe_buffer **_buffer,
-                                 unsigned *stride)
-{
-   struct identity_texture *id_texture = identity_texture(_texture);
-   struct identity_drm_api *id_api = identity_drm_api(_api);
-   struct pipe_texture *texture = id_texture->texture;
-   struct drm_api *api = id_api->api;
-   struct pipe_buffer *buffer = NULL;
-   boolean result;
-
-   result = api->buffer_from_texture(api, texture, &buffer, stride);
-
-   if (result && _buffer)
-      buffer = identity_buffer_create(identity_screen(texture->screen), buffer);
-
-   if (_buffer)
-      *_buffer = buffer;
-   else
-      pipe_buffer_reference(&buffer, NULL);
-
-   return result;
 }
 
-static struct pipe_buffer *
-identity_drm_buffer_from_handle(struct drm_api *_api,
-                                struct pipe_screen *_screen,
-                                const char *name,
-                                unsigned handle)
+static struct pipe_texture *
+identity_drm_texture_from_shared_handle(struct drm_api *_api,
+                                        struct pipe_screen *_screen,
+                                        struct pipe_texture *templ,
+                                        const char *name,
+                                        unsigned stride,
+                                        unsigned handle)
 {
    struct identity_screen *id_screen = identity_screen(_screen);
    struct identity_drm_api *id_api = identity_drm_api(_api);
    struct pipe_screen *screen = id_screen->screen;
    struct drm_api *api = id_api->api;
-   struct pipe_buffer *result;
+   struct pipe_texture *result;
 
-   result = api->buffer_from_handle(api, screen, name, handle);
+   result = api->texture_from_shared_handle(api, screen, templ, name, stride, handle);
 
-   result = identity_buffer_create(identity_screen(_screen), result);
+   result = identity_texture_create(identity_screen(_screen), result);
 
    return result;
 }
 
 static boolean
-identity_drm_handle_from_buffer(struct drm_api *_api,
-                                struct pipe_screen *_screen,
-                                struct pipe_buffer *_buffer,
-                                unsigned *handle)
+identity_drm_shared_handle_from_texture(struct drm_api *_api,
+                                        struct pipe_screen *_screen,
+                                        struct pipe_texture *_texture,
+                                        unsigned *stride,
+                                        unsigned *handle)
 {
    struct identity_screen *id_screen = identity_screen(_screen);
-   struct identity_buffer *id_buffer = identity_buffer(_buffer);
+   struct identity_texture *id_texture = identity_texture(_texture);
    struct identity_drm_api *id_api = identity_drm_api(_api);
    struct pipe_screen *screen = id_screen->screen;
-   struct pipe_buffer *buffer = id_buffer->buffer;
+   struct pipe_texture *texture = id_texture->texture;
    struct drm_api *api = id_api->api;
 
-   return api->handle_from_buffer(api, screen, buffer, handle);
+   return api->shared_handle_from_texture(api, screen, texture, stride, handle);
 }
 
 static boolean
-identity_drm_global_handle_from_buffer(struct drm_api *_api,
+identity_drm_local_handle_from_texture(struct drm_api *_api,
                                        struct pipe_screen *_screen,
-                                       struct pipe_buffer *_buffer,
+                                       struct pipe_texture *_texture,
+                                       unsigned *stride,
                                        unsigned *handle)
 {
    struct identity_screen *id_screen = identity_screen(_screen);
-   struct identity_buffer *id_buffer = identity_buffer(_buffer);
+   struct identity_texture *id_texture = identity_texture(_texture);
    struct identity_drm_api *id_api = identity_drm_api(_api);
    struct pipe_screen *screen = id_screen->screen;
-   struct pipe_buffer *buffer = id_buffer->buffer;
+   struct pipe_texture *texture = id_texture->texture;
    struct drm_api *api = id_api->api;
 
-   return api->global_handle_from_buffer(api, screen, buffer, handle);
+   return api->local_handle_from_texture(api, screen, texture, stride, handle);
 }
 
 static void
@@ -169,19 +148,26 @@ identity_drm_destroy(struct drm_api *_api)
 struct drm_api *
 identity_drm_create(struct drm_api *api)
 {
-   struct identity_drm_api *id_api = CALLOC_STRUCT(identity_drm_api);
+   struct identity_drm_api *id_api;
+
+   if (!api)
+      goto error;
+
+   id_api = CALLOC_STRUCT(identity_drm_api);
 
    if (!id_api)
-      return NULL;
+      goto error;
 
    id_api->base.create_screen = identity_drm_create_screen;
    id_api->base.create_context = identity_drm_create_context;
-   id_api->base.buffer_from_texture = identity_drm_buffer_from_texture;
-   id_api->base.buffer_from_handle = identity_drm_buffer_from_handle;
-   id_api->base.handle_from_buffer = identity_drm_handle_from_buffer;
-   id_api->base.global_handle_from_buffer = identity_drm_global_handle_from_buffer;
+   id_api->base.texture_from_shared_handle = identity_drm_texture_from_shared_handle;
+   id_api->base.shared_handle_from_texture = identity_drm_shared_handle_from_texture;
+   id_api->base.local_handle_from_texture = identity_drm_local_handle_from_texture;
    id_api->base.destroy = identity_drm_destroy;
    id_api->api = api;
 
    return &id_api->base;
+
+error:
+   return api;
 }
diff --git a/src/gallium/drivers/identity/id_screen.c b/src/gallium/drivers/identity/id_screen.c
index 259f1be36e..26439637d0 100644
--- a/src/gallium/drivers/identity/id_screen.c
+++ b/src/gallium/drivers/identity/id_screen.c
@@ -289,6 +289,7 @@ identity_screen_surface_buffer_create(struct pipe_screen *_screen,
                                       unsigned height,
                                       enum pipe_format format,
                                       unsigned usage,
+                                      unsigned tex_usage,
                                       unsigned *stride)
 {
    struct identity_screen *id_screen = identity_screen(_screen);
@@ -300,6 +301,7 @@ identity_screen_surface_buffer_create(struct pipe_screen *_screen,
                                           height,
                                           format,
                                           usage,
+                                          tex_usage,
                                           stride);
 
    if (result)
diff --git a/src/gallium/drivers/llvmpipe/Makefile b/src/gallium/drivers/llvmpipe/Makefile
new file mode 100644
index 0000000000..cd7b6356d2
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/Makefile
@@ -0,0 +1,57 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = llvmpipe
+
+CFLAGS += -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS
+
+C_SOURCES = \
+	lp_bld_alpha.c \
+	lp_bld_arit.c \
+	lp_bld_blend_aos.c \
+	lp_bld_blend_logicop.c \
+	lp_bld_blend_soa.c \
+	lp_bld_const.c \
+	lp_bld_conv.c \
+	lp_bld_debug.c \
+	lp_bld_depth.c \
+	lp_bld_flow.c \
+	lp_bld_format_aos.c \
+	lp_bld_format_soa.c \
+	lp_bld_interp.c \
+	lp_bld_intr.c \
+	lp_bld_logic.c \
+	lp_bld_sample_soa.c \
+	lp_bld_swizzle.c \
+	lp_bld_struct.c \
+	lp_bld_tgsi_soa.c \
+	lp_bld_type.c \
+	lp_buffer.c \
+	lp_clear.c \
+	lp_context.c \
+	lp_draw_arrays.c \
+	lp_flush.c \
+	lp_jit.c \
+	lp_prim_setup.c \
+	lp_prim_vbuf.c \
+	lp_setup.c \
+	lp_query.c \
+	lp_screen.c \
+	lp_state_blend.c \
+	lp_state_clip.c \
+	lp_state_derived.c \
+	lp_state_fs.c \
+	lp_state_rasterizer.c \
+	lp_state_sampler.c \
+	lp_state_surface.c \
+	lp_state_vertex.c \
+	lp_state_vs.c \
+	lp_surface.c \
+	lp_tex_cache.c \
+	lp_tex_sample_c.c \
+	lp_tex_sample_llvm.c \
+	lp_texture.c \
+	lp_tile_cache.c \
+	lp_tile_soa.c
+
+include ../../Makefile.template
diff --git a/src/gallium/drivers/llvmpipe/README b/src/gallium/drivers/llvmpipe/README
new file mode 100644
index 0000000000..89d08834a3
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/README
@@ -0,0 +1,149 @@
+LLVMPIPE -- a fork of softpipe that employs LLVM for code generation.
+
+
+Status
+======
+
+Done so far is:
+
+ - the whole fragment pipeline is code generated in a single function
+ 
+   - input interpolation
+   
+   - depth testing
+ 
+   - texture sampling (not all state/formats are supported) 
+   
+   - fragment shader TGSI translation
+     - same level of support as the TGSI SSE2 exec machine, with the exception
+       we don't fall back to TGSI interpretation when an unsupported opcode is
+       found, but just ignore it
+     - done in SoA layout
+     - input interpolation also code generated
+ 
+   - alpha testing
+ 
+   - blend (including logic ops)
+     - both in SoA and AoS layouts, but only the former used for now
+ 
+ - code is generic
+   - intermediates can be vectors of floats, ubytes, fixed point, etc, and of
+     any width and length
+   - not all operations are implemented for these types yet though
+
+Most mesa/progs/demos/* work. 
+
+To do (probably in this order):
+
+ - code generate stipple and stencil testing
+
+ - translate the remaining bits of texture sampling state
+
+ - translate TGSI control flow instructions, and all other remaining opcodes
+ 
+ - integrate with the draw module for VS code generation
+
+ - code generate the triangle setup and rasterization
+
+
+Requirements
+============
+
+ - Linux
+ 
+ - udis86, http://udis86.sourceforge.net/ . Use my repository, which decodes
+   opcodes not yet supported by upstream.
+ 
+     git clone git://people.freedesktop.org/~jrfonseca/udis86
+     cd udis86
+     ./configure --with-pic
+     make
+     sudo make install
+ 
+ - LLVM 2.5. On Debian based distributions do:
+ 
+     aptitude install llvm-dev
+
+   There is a typo in one of the llvm-dev 2.5 headers, that causes compilation
+   errors in the debug build:
+
+     --- /usr/include/llvm-c/Core.h.orig	2009-08-10 15:38:54.000000000 +0100
+     +++ /usr/include/llvm-c/Core.h	2009-08-10 15:38:25.000000000 +0100
+     @@ -831,7 +831,7 @@
+        template<typename T>
+        inline T **unwrap(LLVMValueRef *Vals, unsigned Length) {
+          #if DEBUG
+     -    for (LLVMValueRef *I = Vals, E = Vals + Length; I != E; ++I)
+     +    for (LLVMValueRef *I = Vals, *E = Vals + Length; I != E; ++I)
+            cast<T>(*I);
+          #endif
+          return reinterpret_cast<T**>(Vals);
+ 
+ - An x86 or amd64 processor with support for sse2, sse3, and sse4.1 SIMD
+   instructions. This is necessary because we emit several SSE intrinsics for
+   convenience. See /proc/cpuinfo to know what your CPU supports.
+ 
+ - scons
+
+
+Building
+========
+
+To build everything invoke scons as:
+
+  scons debug=yes statetrackers=mesa drivers=llvmpipe winsys=xlib dri=false -k
+
+Alternatively, you can build it with GNU make, if you prefer, by invoking it as
+
+  make linux-llvm
+
+but the rest of these instructions assume that scons is used.
+
+
+Using
+=====
+
+Building will create a drop-in alternative for libGL.so. To use it set the
+environment variables:
+
+  export LD_LIBRARY_PATH=$PWD/build/linux-x86_64-debug/lib:$LD_LIBRARY_PATH
+
+or
+
+  export LD_LIBRARY_PATH=$PWD/build/linux-x86-debug/lib:$LD_LIBRARY_PATH
+
+For performance evaluation pass debug=no to scons, and use the corresponding
+lib directory without the "-debug" suffix.
+
+
+Unit testing
+============
+
+Building will also create several unit tests in
+build/linux-???-debug/gallium/drivers/llvmpipe:
+
+ - lp_test_blend: blending
+ - lp_test_conv: SIMD vector conversion
+ - lp_test_format: pixel unpacking/packing
+
+Some of these tests can output results and benchmarks to a tab-separated file
+for later analysis, e.g.:
+
+  build/linux-x86_64-debug/gallium/drivers/llvmpipe/lp_test_blend -o blend.tsv
+
+
+Development Notes
+=================
+
+- When looking at this code for the first time, start in lp_state_fs.c, and
+  then skim through the lp_bld_* functions called in there, and the comments
+  at the top of the lp_bld_*.c files.
+
+- All lp_bld_*.[ch] are isolated from the rest of the driver, and could/may be 
+  put in a stand-alone Gallium state -> LLVM IR translation module.
+
+- We use LLVM-C bindings for now. They are not documented, but follow the C++
+  interfaces very closely, and appear to be complete enough for code
+  generation. See 
+  http://npcontemplation.blogspot.com/2008/06/secret-of-llvm-c-bindings.html
+  for a stand-alone example.
diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript
new file mode 100644
index 0000000000..f4a9a3b22e
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/SConscript
@@ -0,0 +1,84 @@
+Import('*')
+
+env = env.Clone()
+
+env.Tool('llvm')
+if not env.has_key('LLVM_VERSION'):
+    print 'warning: LLVM not found: not building llvmpipe'
+    Return()
+
+env.Tool('udis86')
+
+llvmpipe = env.ConvenienceLibrary(
+	target = 'llvmpipe',
+	source = [
+		'lp_bld_alpha.c',
+		'lp_bld_arit.c',
+		'lp_bld_blend_aos.c',
+		'lp_bld_blend_logicop.c',
+		'lp_bld_blend_soa.c',
+		'lp_bld_const.c',
+		'lp_bld_conv.c',
+		'lp_bld_debug.c',
+		'lp_bld_depth.c',
+		'lp_bld_flow.c',
+		'lp_bld_format_aos.c',
+		'lp_bld_format_soa.c',
+		'lp_bld_interp.c',
+		'lp_bld_intr.c',
+		'lp_bld_sample_soa.c',
+		'lp_bld_struct.c',
+		'lp_bld_logic.c',
+		'lp_bld_swizzle.c',
+		'lp_bld_tgsi_soa.c',		
+		'lp_bld_type.c',
+		'lp_buffer.c',
+		'lp_clear.c',
+		'lp_context.c',
+		'lp_draw_arrays.c',
+		'lp_flush.c',
+		'lp_jit.c',
+		'lp_prim_setup.c',
+		'lp_prim_vbuf.c',
+		'lp_setup.c',
+		'lp_query.c',
+		'lp_screen.c',
+		'lp_state_blend.c',
+		'lp_state_clip.c',
+		'lp_state_derived.c',
+		'lp_state_fs.c',
+		'lp_state_rasterizer.c',
+		'lp_state_sampler.c',
+		'lp_state_surface.c',
+		'lp_state_vertex.c',
+		'lp_state_vs.c',
+		'lp_surface.c',
+		'lp_tex_cache.c',
+		'lp_tex_sample_c.c',
+		'lp_tex_sample_llvm.c',
+		'lp_texture.c',
+		'lp_tile_cache.c',
+		'lp_tile_soa.c',
+	])
+
+
+env = env.Clone()
+
+env.Prepend(LIBS = [llvmpipe] + auxiliaries)
+
+env.Program(
+    target = 'lp_test_format',
+    source = ['lp_test_format.c'],
+)
+
+env.Program(
+    target = 'lp_test_blend',
+    source = ['lp_test_blend.c', 'lp_test_main.c'],
+)
+
+env.Program(
+    target = 'lp_test_conv',
+    source = ['lp_test_conv.c', 'lp_test_main.c'],
+)
+
+Export('llvmpipe')
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_alpha.c b/src/gallium/drivers/llvmpipe/lp_bld_alpha.c
new file mode 100644
index 0000000000..2b4bc5c819
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_alpha.c
@@ -0,0 +1,64 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * Alpha testing to LLVM IR translation.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#include "pipe/p_state.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_flow.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_alpha.h"
+
+/** If state->enabled, compare alpha to ref with state->func and update mask. */
+void
+lp_build_alpha_test(LLVMBuilderRef builder,
+                    const struct pipe_alpha_state *state,
+                    struct lp_type type,
+                    struct lp_build_mask_context *mask,
+                    LLVMValueRef alpha,
+                    LLVMValueRef ref)
+{
+   struct lp_build_context bld;
+
+   lp_build_context_init(&bld, builder, type);
+
+   if(state->enabled) {
+      LLVMValueRef test = lp_build_cmp(&bld, state->func, alpha, ref);
+
+      lp_build_name(test, "alpha_mask");
+
+      lp_build_mask_update(mask, test);
+   }
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_alpha.h b/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
new file mode 100644
index 0000000000..634575670d
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
@@ -0,0 +1,54 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * Alpha testing to LLVM IR translation.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#ifndef LP_BLD_ALPHA_H
+#define LP_BLD_ALPHA_H
+
+
+#include <llvm-c/Core.h>  
+
+struct pipe_alpha_state;
+struct lp_type;
+struct lp_build_mask_context;
+
+/** Emit the alpha test (only when state->enabled) and fold it into mask. */
+void
+lp_build_alpha_test(LLVMBuilderRef builder,
+                    const struct pipe_alpha_state *state,
+                    struct lp_type type,
+                    struct lp_build_mask_context *mask,
+                    LLVMValueRef alpha,
+                    LLVMValueRef ref);
+
+
+#endif /* !LP_BLD_ALPHA_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_arit.c b/src/gallium/drivers/llvmpipe/lp_bld_arit.c
new file mode 100644
index 0000000000..0b115fc9b0
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_arit.c
@@ -0,0 +1,1177 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Helper
+ *
+ * LLVM IR doesn't support all basic arithmetic operations we care about (most
+ * notably min/max and saturated operations), and it is often necessary to
+ * resort to machine-specific intrinsics directly. The functions here hide all
+ * these implementation details from the other modules.
+ *
+ * We also do simple expression simplification here. Reasons are:
+ * - it is very easy given we have all necessary information readily available
+ * - LLVM optimization passes fail to simplify several vector expressions
+ * - We often know value constraints which the optimization passes have no way
+ *   of knowing, such as when source arguments are known to be in [0, 1] range.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include "util/u_memory.h"
+#include "util/u_debug.h"
+#include "util/u_string.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_intr.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_arit.h"
+
+
+/**
+ * Generate min(a, b): on x86, a matching SSE intrinsic for 128-bit vectors;
+ * otherwise cmp(LESS) + select. No special-casing of a or b = 1 or 0.
+ */
+static LLVMValueRef
+lp_build_min_simple(struct lp_build_context *bld,
+                    LLVMValueRef a,
+                    LLVMValueRef b)
+{
+   const struct lp_type type = bld->type;
+   const char *intrinsic = NULL;
+   LLVMValueRef cond;
+
+   /* TODO: optimize the constant case */
+
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   if(type.width * type.length == 128) {
+      if(type.floating) {
+         if(type.width == 32)
+            intrinsic = "llvm.x86.sse.min.ps";
+         if(type.width == 64)
+            intrinsic = "llvm.x86.sse2.min.pd";
+      }
+      else {
+         if(type.width == 8 && !type.sign)
+            intrinsic = "llvm.x86.sse2.pminu.b";
+         if(type.width == 8 && type.sign)
+            intrinsic = "llvm.x86.sse41.pminsb";
+         if(type.width == 16 && !type.sign)
+            intrinsic = "llvm.x86.sse41.pminuw";
+         if(type.width == 16 && type.sign)
+            intrinsic = "llvm.x86.sse2.pmins.w";
+         if(type.width == 32 && !type.sign)
+            intrinsic = "llvm.x86.sse41.pminud";
+         if(type.width == 32 && type.sign)
+            intrinsic = "llvm.x86.sse41.pminsd";
+      }
+   }
+#endif
+
+   if(intrinsic)
+      return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
+
+   cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
+   return lp_build_select(bld, cond, a, b);
+}
+
+
+/**
+ * Generate max(a, b): on x86, a matching SSE intrinsic for 128-bit vectors;
+ * otherwise cmp(GREATER) + select. No special-casing of a or b = 1 or 0.
+ */
+static LLVMValueRef
+lp_build_max_simple(struct lp_build_context *bld,
+                    LLVMValueRef a,
+                    LLVMValueRef b)
+{
+   const struct lp_type type = bld->type;
+   const char *intrinsic = NULL;
+   LLVMValueRef cond;
+
+   /* TODO: optimize the constant case */
+
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   if(type.width * type.length == 128) {
+      if(type.floating) {
+         if(type.width == 32)
+            intrinsic = "llvm.x86.sse.max.ps";
+         if(type.width == 64)
+            intrinsic = "llvm.x86.sse2.max.pd";
+      }
+      else {
+         if(type.width == 8 && !type.sign)
+            intrinsic = "llvm.x86.sse2.pmaxu.b";
+         if(type.width == 8 && type.sign)
+            intrinsic = "llvm.x86.sse41.pmaxsb";
+         if(type.width == 16 && !type.sign)
+            intrinsic = "llvm.x86.sse41.pmaxuw";
+         if(type.width == 16 && type.sign)
+            intrinsic = "llvm.x86.sse2.pmaxs.w";
+         if(type.width == 32 && !type.sign)
+            intrinsic = "llvm.x86.sse41.pmaxud";
+         if(type.width == 32 && type.sign)
+            intrinsic = "llvm.x86.sse41.pmaxsd";
+      }
+   }
+#endif
+
+   if(intrinsic)
+      return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
+
+   cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
+   return lp_build_select(bld, cond, a, b);
+}
+
+
+/**
+ * Generate ~a for unsigned normalized integer types, else bld->one - a.
+ */
+LLVMValueRef
+lp_build_comp(struct lp_build_context *bld,
+              LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+
+   if(a == bld->one)
+      return bld->zero;
+   if(a == bld->zero)
+      return bld->one;
+
+   if(type.norm && !type.floating && !type.fixed && !type.sign) {
+      if(LLVMIsConstant(a))
+         return LLVMConstNot(a);
+      else
+         return LLVMBuildNot(bld->builder, a, "");
+   }
+
+   if(LLVMIsConstant(a))
+      return LLVMConstSub(bld->one, a);
+   else
+      return LLVMBuildSub(bld->builder, bld->one, a, "");
+}
+
+
+/**
+ * Generate a + b; for norm types: one + x = one, float/fixed clamp to <= one.
+ */
+LLVMValueRef
+lp_build_add(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b)
+{
+   const struct lp_type type = bld->type;
+   LLVMValueRef res;
+
+   if(a == bld->zero)
+      return b;
+   if(b == bld->zero)
+      return a;
+   if(a == bld->undef || b == bld->undef)
+      return bld->undef;
+
+   if(bld->type.norm) {
+      const char *intrinsic = NULL;
+
+      if(a == bld->one || b == bld->one)
+        return bld->one;
+
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+      if(type.width * type.length == 128 &&
+         !type.floating && !type.fixed) {
+         if(type.width == 8)
+            intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
+         if(type.width == 16)
+            intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
+      }
+#endif
+   
+      if(intrinsic)
+         return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
+   }
+
+   if(LLVMIsConstant(a) && LLVMIsConstant(b))
+      res = LLVMConstAdd(a, b);
+   else
+      res = LLVMBuildAdd(bld->builder, a, b, "");
+
+   /* clamp to ceiling of 1.0 */
+   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
+      res = lp_build_min_simple(bld, res, bld->one);
+
+   /* XXX clamp to floor of -1 or 0??? */
+
+   return res;
+}
+
+
+/**
+ * Generate a - b; for norm types: x - one = zero, float/fixed clamp to >= zero.
+ */
+LLVMValueRef
+lp_build_sub(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b)
+{
+   const struct lp_type type = bld->type;
+   LLVMValueRef res;
+
+   if(b == bld->zero)
+      return a;
+   if(a == bld->undef || b == bld->undef)
+      return bld->undef;
+   if(a == b)
+      return bld->zero;
+
+   if(bld->type.norm) {
+      const char *intrinsic = NULL;
+
+      if(b == bld->one)
+        return bld->zero;
+
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+      if(type.width * type.length == 128 &&
+         !type.floating && !type.fixed) {
+         if(type.width == 8)
+            intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
+         if(type.width == 16)
+            intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
+      }
+#endif
+   
+      if(intrinsic)
+         return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
+   }
+
+   if(LLVMIsConstant(a) && LLVMIsConstant(b))
+      res = LLVMConstSub(a, b);
+   else
+      res = LLVMBuildSub(bld->builder, a, b, "");
+
+   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
+      res = lp_build_max_simple(bld, res, bld->zero);
+
+   return res;
+}
+
+
+/**
+ * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
+ *
+ * The mask has n entries that alternate between element j of the first
+ * operand and element j of the second operand (index n + j), with j starting
+ * at 0 for lo_hi == 0 and at n/2 for lo_hi == 1.
+ */
+static LLVMValueRef
+lp_build_unpack_shuffle(unsigned n, unsigned lo_hi)
+{
+   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+   unsigned first;
+   unsigned pair;
+
+   assert(n <= LP_MAX_VECTOR_LENGTH);
+   assert(lo_hi < 2);
+
+   /* index of the first source element that gets interleaved */
+   first = lo_hi*n/2;
+
+   for(pair = 0; 2*pair < n; ++pair) {
+      const unsigned src = first + pair;
+      elems[2*pair + 0] = LLVMConstInt(LLVMInt32Type(), 0 + src, 0);
+      elems[2*pair + 1] = LLVMConstInt(LLVMInt32Type(), n + src, 0);
+   }
+
+   return LLVMConstVector(elems, n);
+}
+
+
+/**
+ * Build a constant integer vector with 'n' elements of element type 'type',
+ * every element holding the value 'c'.
+ */
+static LLVMValueRef
+lp_build_const_vec(LLVMTypeRef type, unsigned n, long long c)
+{
+   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+   unsigned lane;
+
+   assert(n <= LP_MAX_VECTOR_LENGTH);
+
+   /* fill order is irrelevant: every lane gets the same constant */
+   for(lane = n; lane-- > 0; )
+      elems[lane] = LLVMConstInt(type, c, 0);
+
+   return LLVMConstVector(elems, n);
+}
+
+
+/**
+ * Normalized 8bit multiplication.
+ *
+ * - alpha plus one
+ *
+ *     makes the following approximation to the division (Sree)
+ *    
+ *       a*b/255 ~= (a*(b + 1)) >> 8
+ *    
+ *     which is the fastest method that satisfies the following OpenGL criteria
+ *    
+ *       0*0 = 0 and 255*255 = 255
+ *
+ * - geometric series
+ *
+ *     takes the geometric series approximation to the division
+ *
+ *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
+ *
+ *     in this case just the first two terms to fit in 16bit arithmetic
+ *
+ *       t/255 ~= (t + (t >> 8)) >> 8
+ *
+ *     note that just by itself it doesn't satisfy the OpenGL criteria, as
+ *     255*255 = 254, so the special case b = 255 must be accounted for or
+ *     roundoff must be used
+ *
+ * - geometric series plus rounding
+ *
+ *     when using a geometric series division instead of truncating the result
+ *     use roundoff in the approximation (Jim Blinn)
+ *
+ *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
+ *
+ *     achieving the exact results
+ *
+ * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995, 
+ *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
+ * @sa Michael Herf, The "double blend trick", May 2000, 
+ *     http://www.stereopsis.com/doubleblend.html
+ */
+/*
+ * Multiply two <8 x i16> vectors and rescale the product by ~1/255.
+ *
+ * The three constant vectors are built on first use and cached in function
+ * statics.  The "alpha plus one" variant is kept under #if 0; the active path
+ * is the geometric series with rounding.
+ */
+static LLVMValueRef
+lp_build_mul_u8n(LLVMBuilderRef builder,
+                 LLVMValueRef a, LLVMValueRef b)
+{
+   static LLVMValueRef one = NULL;     /* 0x01 in every lane */
+   static LLVMValueRef shift = NULL;   /* 0x08 in every lane: the shift count */
+   static LLVMValueRef half = NULL;    /* 0x80 in every lane: the rounding bias */
+   LLVMValueRef prod;
+   LLVMValueRef prod_hi;
+
+   if(one == NULL)
+      one = lp_build_const_vec(LLVMInt16Type(), 8, 0x01);
+   if(shift == NULL)
+      shift = lp_build_const_vec(LLVMInt16Type(), 8, 0x08);
+   if(half == NULL)
+      half = lp_build_const_vec(LLVMInt16Type(), 8, 0x80);
+
+#if 0
+
+   /* a*b/255 ~= (a*(b + 1)) >> 8 */
+   b = LLVMBuildAdd(builder, b, one, "");
+   prod = LLVMBuildMul(builder, a, b, "");
+
+#else
+
+   /* t/255 ~= (t + (t >> 8) + 0x80) >> 8 */
+   prod = LLVMBuildMul(builder, a, b, "");
+   prod_hi = LLVMBuildLShr(builder, prod, shift, "");
+   prod = LLVMBuildAdd(builder, prod, prod_hi, "");
+   prod = LLVMBuildAdd(builder, prod, half, "");
+
+#endif
+
+   /* final >> 8 shared by both variants */
+   return LLVMBuildLShr(builder, prod, shift, "");
+}
+
+
+/**
+ * Generate a * b
+ *
+ * Operands equal to the context's zero/one/undef constants are folded without
+ * emitting code.  On x86, normalized 8bit x 16 integer vectors are widened to
+ * 16bit lanes, multiplied with lp_build_mul_u8n() and packed back; any other
+ * normalized integer layout is not implemented yet and asserts.
+ */
+LLVMValueRef
+lp_build_mul(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b)
+{
+   const struct lp_type type = bld->type;
+
+   /* x * 0 = 0 */
+   if(a == bld->zero || b == bld->zero)
+      return bld->zero;
+   /* 1 * x = x * 1 = x */
+   if(a == bld->one)
+      return b;
+   if(b == bld->one)
+      return a;
+   if(a == bld->undef || b == bld->undef)
+      return bld->undef;
+
+   if(!type.floating && !type.fixed && type.norm) {
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+      if(type.width == 8 && type.length == 16) {
+         LLVMTypeRef i16x8 = LLVMVectorType(LLVMInt16Type(), 8);
+         LLVMTypeRef i8x16 = LLVMVectorType(LLVMInt8Type(), 16);
+         static LLVMValueRef unpack_lo = NULL;
+         static LLVMValueRef unpack_hi = NULL;
+         LLVMValueRef a_lo, a_hi, b_lo, b_hi;
+         LLVMValueRef prod_lo, prod_hi;
+         LLVMValueRef packed;
+
+         /* shuffle masks are constants; build them once */
+         if(unpack_lo == NULL)
+            unpack_lo = lp_build_unpack_shuffle(16, 0);
+         if(unpack_hi == NULL)
+            unpack_hi = lp_build_unpack_shuffle(16, 1);
+
+         /* interleave each operand with zero bytes (PUNPCKLBW, PUNPCKHBW) */
+         a_lo = LLVMBuildShuffleVector(bld->builder, a, bld->zero, unpack_lo, "");
+         b_lo = LLVMBuildShuffleVector(bld->builder, b, bld->zero, unpack_lo, "");
+         a_hi = LLVMBuildShuffleVector(bld->builder, a, bld->zero, unpack_hi, "");
+         b_hi = LLVMBuildShuffleVector(bld->builder, b, bld->zero, unpack_hi, "");
+
+         /* reinterpret the byte pairs as 16bit lanes (NOP) */
+         a_lo = LLVMBuildBitCast(bld->builder, a_lo, i16x8, "");
+         b_lo = LLVMBuildBitCast(bld->builder, b_lo, i16x8, "");
+         a_hi = LLVMBuildBitCast(bld->builder, a_hi, i16x8, "");
+         b_hi = LLVMBuildBitCast(bld->builder, b_hi, i16x8, "");
+
+         /* PMULLW, PSRLW, PADDW */
+         prod_lo = lp_build_mul_u8n(bld->builder, a_lo, b_lo);
+         prod_hi = lp_build_mul_u8n(bld->builder, a_hi, b_hi);
+
+         /* PACKUSWB */
+         packed = lp_build_intrinsic_binary(bld->builder, "llvm.x86.sse2.packuswb.128" , i16x8, prod_lo, prod_hi);
+
+         /* back to the 8bit x 16 view (NOP) */
+         return LLVMBuildBitCast(bld->builder, packed, i8x16, "");
+      }
+#endif
+
+      /* FIXME */
+      assert(0);
+   }
+
+   /* fold at build time when both operands are constants */
+   return (LLVMIsConstant(a) && LLVMIsConstant(b))
+      ? LLVMConstMul(a, b)
+      : LLVMBuildMul(bld->builder, a, b, "");
+}
+
+
+/**
+ * Generate a / b
+ *
+ * Trivial operands are folded first; constant operands are divided at build
+ * time.  On x86, 4 x 32bit vectors are computed as a * rcp(b) instead of a
+ * true division.
+ */
+LLVMValueRef
+lp_build_div(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b)
+{
+   const struct lp_type type = bld->type;
+
+   /* 0 / b = 0 */
+   if(a == bld->zero)
+      return bld->zero;
+   /* 1 / b is a plain reciprocal */
+   if(a == bld->one)
+      return lp_build_rcp(bld, b);
+   /* division by zero and undefined operands yield undef */
+   if(b == bld->zero || a == bld->undef || b == bld->undef)
+      return bld->undef;
+   /* a / 1 = a */
+   if(b == bld->one)
+      return a;
+
+   if(LLVMIsConstant(a) && LLVMIsConstant(b))
+      return LLVMConstFDiv(a, b);
+
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   if(type.width == 32 && type.length == 4) {
+      LLVMValueRef inv_b = lp_build_rcp(bld, b);
+      return lp_build_mul(bld, a, inv_b);
+   }
+#endif
+
+   return LLVMBuildFDiv(bld->builder, a, b, "");
+}
+
+
+/**
+ * Linear interpolation between v0 and v1 with weight x,
+ * evaluated as v0 + x*(v1 - v0).
+ */
+LLVMValueRef
+lp_build_lerp(struct lp_build_context *bld,
+              LLVMValueRef x,
+              LLVMValueRef v0,
+              LLVMValueRef v1)
+{
+   LLVMValueRef delta = lp_build_sub(bld, v1, v0);
+   LLVMValueRef weighted = lp_build_mul(bld, x, delta);
+
+   return lp_build_add(bld, v0, weighted);
+}
+
+
+/**
+ * Bilinear interpolation: interpolate along x within each pair
+ * (v00, v01) and (v10, v11), then along y between the two results.
+ */
+LLVMValueRef
+lp_build_lerp_2d(struct lp_build_context *bld,
+                 LLVMValueRef x,
+                 LLVMValueRef y,
+                 LLVMValueRef v00,
+                 LLVMValueRef v01,
+                 LLVMValueRef v10,
+                 LLVMValueRef v11)
+{
+   LLVMValueRef row0;
+   LLVMValueRef row1;
+
+   row0 = lp_build_lerp(bld, x, v00, v01);
+   row1 = lp_build_lerp(bld, x, v10, v11);
+
+   return lp_build_lerp(bld, y, row0, row1);
+}
+
+
+/**
+ * Generate min(a, b)
+ *
+ * Folds the cases that need no comparison (undefined or identical operands,
+ * and for normalized types the 0 and 1 constants) before deferring to
+ * lp_build_min_simple().
+ */
+LLVMValueRef
+lp_build_min(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b)
+{
+   if(a == bld->undef || b == bld->undef)
+      return bld->undef;
+
+   if(a == b)
+      return a;
+
+   if(bld->type.norm) {
+      /* zero is the smallest normalized value */
+      if(a == bld->zero || b == bld->zero)
+         return bld->zero;
+      /* one is the largest: the other operand wins */
+      if(a == bld->one || b == bld->one)
+         return (a == bld->one) ? b : a;
+   }
+
+   return lp_build_min_simple(bld, a, b);
+}
+
+
+/**
+ * Generate max(a, b)
+ *
+ * Folds the cases that need no comparison (undefined or identical operands,
+ * and for normalized types the 0 and 1 constants) before deferring to
+ * lp_build_max_simple().
+ */
+LLVMValueRef
+lp_build_max(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b)
+{
+   if(a == bld->undef || b == bld->undef)
+      return bld->undef;
+
+   if(a == b)
+      return a;
+
+   if(bld->type.norm) {
+      /* one is the largest normalized value */
+      if(a == bld->one || b == bld->one)
+         return bld->one;
+      /* zero is the smallest: the other operand wins */
+      if(a == bld->zero || b == bld->zero)
+         return (a == bld->zero) ? b : a;
+   }
+
+   return lp_build_max_simple(bld, a, b);
+}
+
+
+/**
+ * Generate abs(a)
+ *
+ * Unsigned types are returned untouched.  Floats have their sign bit cleared
+ * with an integer AND.  Signed integers use the SSSE3 PABS intrinsics for
+ * 128bit vectors on x86, and max(a, -a) otherwise.
+ */
+LLVMValueRef
+lp_build_abs(struct lp_build_context *bld,
+             LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
+
+   if(!type.sign)
+      return a;
+
+   if(type.floating) {
+      /*
+       * Mask out the sign bit: keep every bit below the top one.
+       * The shift count must be width - 1; shifting by the full width keeps
+       * the sign bit set in the mask and is out of range for 64bit elements.
+       */
+      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+      LLVMValueRef mask = lp_build_int_const_scalar(type, ((unsigned long long)1 << (type.width - 1)) - 1);
+      a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      a = LLVMBuildAnd(bld->builder, a, mask, "");
+      a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
+      return a;
+   }
+
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   /* NOTE(review): these intrinsics need SSSE3; no CPU capability check is visible here */
+   if(type.width*type.length == 128) {
+      switch(type.width) {
+      case 8:
+         return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
+      case 16:
+         return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
+      case 32:
+         return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
+      }
+   }
+#endif
+
+   /* generic signed integer fallback: abs(a) = max(a, -a) */
+   return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
+}
+
+
+/**
+ * Generate sgn(a): -1 for negative, 0 for zero and +1 for positive elements.
+ *
+ * The sign of non-zero elements is computed first, then elements that compare
+ * equal to zero are replaced by zero.
+ */
+LLVMValueRef
+lp_build_sgn(struct lp_build_context *bld,
+             LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
+   LLVMValueRef cond;
+   LLVMValueRef res;
+
+   /* Handle non-zero case */
+   if(!type.sign) {
+      /* if not zero then sign must be positive */
+      res = bld->one;
+   }
+   else if(type.floating) {
+      /* Take the sign bit of a and OR it into the bit pattern of 1.0 */
+      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+      LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+      LLVMValueRef sign;
+      LLVMValueRef one;
+      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
+      one = LLVMConstBitCast(bld->one, int_vec_type);
+      res = LLVMBuildOr(bld->builder, sign, one, "");
+      res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
+   }
+   else
+   {
+      /* signed integer/fixed: pick +1 or -1 by comparing against zero */
+      LLVMValueRef minus_one = lp_build_const_scalar(type, -1.0);
+      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
+      res = lp_build_select(bld, cond, bld->one, minus_one);
+   }
+
+   /*
+    * Handle zero: keep the sign computed above for non-zero elements.
+    * Selecting bld->one here would discard it and report +1 for negatives.
+    */
+   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
+   res = lp_build_select(bld, cond, bld->zero, res);
+
+   return res;
+}
+
+
+/**
+ * Rounding modes understood by lp_build_round_sse41().
+ *
+ * The numeric value is passed verbatim as the 32bit immediate operand of the
+ * llvm.x86.sse41.round.ps / round.pd intrinsics, so the values must not be
+ * renumbered.
+ */
+enum lp_build_round_sse41_mode
+{
+   LP_BUILD_ROUND_SSE41_NEAREST = 0,    /* used by lp_build_round() */
+   LP_BUILD_ROUND_SSE41_FLOOR = 1,      /* used by lp_build_floor() */
+   LP_BUILD_ROUND_SSE41_CEIL = 2,       /* used by lp_build_ceil() */
+   LP_BUILD_ROUND_SSE41_TRUNCATE = 3    /* used by lp_build_trunc() */
+};
+
+
+/**
+ * Emit the SSE4.1 packed rounding intrinsic for a 128bit float vector.
+ *
+ * 32bit elements map to round.ps and 64bit elements to round.pd; any other
+ * width asserts and yields undef.  'mode' is forwarded as the immediate.
+ */
+static INLINE LLVMValueRef
+lp_build_round_sse41(struct lp_build_context *bld,
+                     LLVMValueRef a,
+                     enum lp_build_round_sse41_mode mode)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
+   const char *intrinsic;
+   LLVMValueRef imm;
+
+   assert(type.floating);
+   assert(type.width*type.length == 128);
+
+   if(type.width == 32) {
+      intrinsic = "llvm.x86.sse41.round.ps";
+   }
+   else if(type.width == 64) {
+      intrinsic = "llvm.x86.sse41.round.pd";
+   }
+   else {
+      assert(0);
+      return bld->undef;
+   }
+
+   /* rounding mode travels as a 32bit constant operand */
+   imm = LLVMConstInt(LLVMInt32Type(), mode, 0);
+
+   return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a, imm);
+}
+
+
+/**
+ * Round each element to the nearest integer value (result stays float).
+ * Only implemented for x86, through the SSE4.1 rounding intrinsic.
+ */
+LLVMValueRef
+lp_build_round(struct lp_build_context *bld,
+               LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+
+   assert(type.floating);
+
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
+#else
+   /* FIXME */
+   assert(0);
+   return bld->undef;
+#endif
+}
+
+
+/**
+ * Round each element down (result stays float).
+ * Only implemented for x86, through the SSE4.1 rounding intrinsic.
+ */
+LLVMValueRef
+lp_build_floor(struct lp_build_context *bld,
+               LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+
+   assert(type.floating);
+
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
+#else
+   /* FIXME */
+   assert(0);
+   return bld->undef;
+#endif
+}
+
+
+/**
+ * Round each element up (result stays float).
+ * Only implemented for x86, through the SSE4.1 rounding intrinsic.
+ */
+LLVMValueRef
+lp_build_ceil(struct lp_build_context *bld,
+              LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+
+   assert(type.floating);
+
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
+#else
+   /* FIXME */
+   assert(0);
+   return bld->undef;
+#endif
+}
+
+
+/**
+ * Round each element towards zero (result stays float).
+ * Only implemented for x86, through the SSE4.1 rounding intrinsic.
+ */
+LLVMValueRef
+lp_build_trunc(struct lp_build_context *bld,
+               LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+
+   assert(type.floating);
+
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
+#else
+   /* FIXME */
+   assert(0);
+   return bld->undef;
+#endif
+}
+
+
+/**
+ * Convert a float vector to the integer vector type of the same layout,
+ * using whichever rounding is fastest (an FPToSI conversion here, which
+ * typically truncates towards zero).
+ */
+LLVMValueRef
+lp_build_int(struct lp_build_context *bld,
+             LLVMValueRef a)
+{
+   LLVMTypeRef dst_type = lp_build_int_vec_type(bld->type);
+
+   assert(bld->type.floating);
+
+   return LLVMBuildFPToSI(bld->builder, a, dst_type, "");
+}
+
+
+/**
+ * Integer floor: lp_build_floor() followed by lp_build_int().
+ */
+LLVMValueRef
+lp_build_ifloor(struct lp_build_context *bld,
+                LLVMValueRef a)
+{
+   LLVMValueRef floored = lp_build_floor(bld, a);
+
+   return lp_build_int(bld, floored);
+}
+
+
+/**
+ * Generate sqrt(a) through the llvm.sqrt intrinsic whose name is derived
+ * from the vector length and element width of the context's type.
+ */
+LLVMValueRef
+lp_build_sqrt(struct lp_build_context *bld,
+              LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   char name[32];
+
+   /* TODO: optimize the constant case */
+
+   assert(type.floating);
+
+   /* e.g. length 4, width 32 gives "llvm.sqrt.v4f32" */
+   util_snprintf(name, sizeof name, "llvm.sqrt.v%uf%u", type.length, type.width);
+
+   return lp_build_intrinsic_unary(bld->builder, name, lp_build_vec_type(type), a);
+}
+
+
+/**
+ * Generate 1/a
+ *
+ * The reciprocal of zero or undef is undef and the reciprocal of one is one.
+ * Constants are folded at build time.  On x86, 4 x 32bit vectors use the SSE
+ * rcp.ps intrinsic; everything else is a floating point division.
+ */
+LLVMValueRef
+lp_build_rcp(struct lp_build_context *bld,
+             LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+
+   if(a == bld->zero || a == bld->undef)
+      return bld->undef;
+   if(a == bld->one)
+      return bld->one;
+
+   assert(type.floating);
+
+   if(LLVMIsConstant(a))
+      return LLVMConstFDiv(bld->one, a);
+
+   /* XXX: is this really necessary? */
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   if(type.width == 32 && type.length == 4) {
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", vec_type, a);
+   }
+#endif
+
+   return LLVMBuildFDiv(bld->builder, bld->one, a, "");
+}
+
+
+/**
+ * Generate 1/sqrt(a)
+ *
+ * On x86, 4 x 32bit vectors use the SSE rsqrt.ps intrinsic; otherwise the
+ * result is composed from lp_build_sqrt() and lp_build_rcp().
+ */
+LLVMValueRef
+lp_build_rsqrt(struct lp_build_context *bld,
+               LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMValueRef root;
+
+   assert(type.floating);
+
+   /* XXX: is this really necessary? */
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   if(type.width == 32 && type.length == 4) {
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", vec_type, a);
+   }
+#endif
+
+   root = lp_build_sqrt(bld, a);
+   return lp_build_rcp(bld, root);
+}
+
+
+/**
+ * Generate cos(a) through the llvm.cos intrinsic whose name is derived from
+ * the vector length and element width of the context's type.
+ */
+LLVMValueRef
+lp_build_cos(struct lp_build_context *bld,
+              LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   char name[32];
+
+   /* TODO: optimize the constant case */
+
+   assert(type.floating);
+
+   /* e.g. length 4, width 32 gives "llvm.cos.v4f32" */
+   util_snprintf(name, sizeof name, "llvm.cos.v%uf%u", type.length, type.width);
+
+   return lp_build_intrinsic_unary(bld->builder, name, lp_build_vec_type(type), a);
+}
+
+
+/**
+ * Generate sin(a) through the llvm.sin intrinsic whose name is derived from
+ * the vector length and element width of the context's type.
+ */
+LLVMValueRef
+lp_build_sin(struct lp_build_context *bld,
+              LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   char name[32];
+
+   /* TODO: optimize the constant case */
+
+   assert(type.floating);
+
+   /* e.g. length 4, width 32 gives "llvm.sin.v4f32" */
+   util_snprintf(name, sizeof name, "llvm.sin.v%uf%u", type.length, type.width);
+
+   return lp_build_intrinsic_unary(bld->builder, name, lp_build_vec_type(type), a);
+}
+
+
+/**
+ * Generate pow(x, y), computed as exp2(log2(x) * y).
+ *
+ * Constant operands are not folded; a debug message is printed instead.
+ */
+LLVMValueRef
+lp_build_pow(struct lp_build_context *bld,
+             LLVMValueRef x,
+             LLVMValueRef y)
+{
+   /* TODO: optimize the constant case */
+   if(LLVMIsConstant(x) && LLVMIsConstant(y))
+      debug_printf("%s: inefficient/imprecise constant arithmetic\n", __FUNCTION__);
+
+   return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
+}
+
+
+/**
+ * Generate exp(x), computed as exp2(x * log2(e)).
+ */
+LLVMValueRef
+lp_build_exp(struct lp_build_context *bld,
+             LLVMValueRef x)
+{
+   /* log2(e) = 1/log(2) */
+   LLVMValueRef log2e = lp_build_const_scalar(bld->type, 1.4426950408889634);
+
+   /* the scale factor belongs on the exponent, not on the result */
+   return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
+}
+
+
+/**
+ * Generate log(x) (natural logarithm), computed as log(2) * log2(x).
+ */
+LLVMValueRef
+lp_build_log(struct lp_build_context *bld,
+             LLVMValueRef x)
+{
+   /* log(2) */
+   LLVMValueRef log2 = lp_build_const_scalar(bld->type, 0.69314718055994529);
+
+   return lp_build_mul(bld, log2, lp_build_log2(bld, x));
+}
+
+
+#define EXP_POLY_DEGREE 3
+#define LOG_POLY_DEGREE 5
+
+
+/**
+ * Generate polynomial.
+ * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
+ *
+ * Coefficients are given from the constant term upwards and the polynomial
+ * is evaluated with Horner's scheme, starting from the highest coefficient.
+ * Returns undef when num_coeffs is zero.
+ */
+static LLVMValueRef
+lp_build_polynomial(struct lp_build_context *bld,
+                    LLVMValueRef x,
+                    const double *coeffs,
+                    unsigned num_coeffs)
+{
+   const struct lp_type type = bld->type;
+   LLVMValueRef res = NULL;
+   unsigned i;
+
+   /* TODO: optimize the constant case */
+   if(LLVMIsConstant(x))
+      debug_printf("%s: inefficient/imprecise constant arithmetic\n", __FUNCTION__);
+
+   /* walk from the highest coefficient down: res = coeff + x*res */
+   for (i = num_coeffs; i--; ) {
+      LLVMValueRef coeff = lp_build_const_scalar(type, coeffs[i]);
+      if(res)
+         res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
+      else
+         res = coeff;
+   }
+
+   if(res)
+      return res;
+   else
+      return bld->undef;
+}
+
+
+/**
+ * Minimax polynomial fit of 2**x, in range [-0.5, 0.5[
+ *
+ * One coefficient set per supported EXP_POLY_DEGREE; any other degree is a
+ * compile error.  Coefficients are listed from the constant term upwards,
+ * which is the order lp_build_polynomial() consumes them in.
+ */
+const double lp_build_exp2_polynomial[] = {
+#if EXP_POLY_DEGREE == 5
+   9.9999994e-1, 6.9315308e-1, 2.4015361e-1, 5.5826318e-2, 8.9893397e-3, 1.8775767e-3
+#elif EXP_POLY_DEGREE == 4
+   1.0000026, 6.9300383e-1, 2.4144275e-1, 5.2011464e-2, 1.3534167e-2
+#elif EXP_POLY_DEGREE == 3
+   9.9992520e-1, 6.9583356e-1, 2.2606716e-1, 7.8024521e-2
+#elif EXP_POLY_DEGREE == 2
+   1.0017247, 6.5763628e-1, 3.3718944e-1
+#else
+#error
+#endif
+};
+
+
+/**
+ * Approximate 2**x for 32bit float vectors.
+ *
+ * x is clamped, split into an integer part and a fractional part, 2**ipart
+ * is built directly in the float exponent field and 2**fpart comes from the
+ * minimax polynomial.
+ *
+ * Each output pointer may be NULL; only the pieces needed for the requested
+ * outputs are generated.
+ *
+ * @param p_exp2_int_part  receives 2**ipart as a float vector
+ * @param p_frac_part      receives the fractional part x - ipart
+ * @param p_exp2           receives the full approximation of 2**x
+ */
+void
+lp_build_exp2_approx(struct lp_build_context *bld,
+                     LLVMValueRef x,
+                     LLVMValueRef *p_exp2_int_part,
+                     LLVMValueRef *p_frac_part,
+                     LLVMValueRef *p_exp2)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+   LLVMValueRef ipart = NULL;
+   LLVMValueRef fpart = NULL;
+   LLVMValueRef expipart = NULL;
+   LLVMValueRef expfpart = NULL;
+   LLVMValueRef res = NULL;
+
+   if(p_exp2_int_part || p_frac_part || p_exp2) {
+      /* TODO: optimize the constant case */
+      if(LLVMIsConstant(x))
+         debug_printf("%s: inefficient/imprecise constant arithmetic\n", __FUNCTION__);
+
+      assert(type.floating && type.width == 32);
+
+      /* clamp x before the exponent field is built from it */
+      x = lp_build_min(bld, x, lp_build_const_scalar(type,  129.0));
+      x = lp_build_max(bld, x, lp_build_const_scalar(type, -126.99999));
+
+      /* ipart = int(x - 0.5) */
+      ipart = LLVMBuildSub(bld->builder, x, lp_build_const_scalar(type, 0.5f), "");
+      ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
+
+      /* fpart = x - ipart */
+      fpart = LLVMBuildSIToFP(bld->builder, ipart, vec_type, "");
+      fpart = LLVMBuildSub(bld->builder, x, fpart, "");
+   }
+
+   if(p_exp2_int_part || p_exp2) {
+      /* expipart = (float) (1 << ipart): bias by 127, move into bits 23 and up */
+      expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_int_const_scalar(type, 127), "");
+      expipart = LLVMBuildShl(bld->builder, expipart, lp_build_int_const_scalar(type, 23), "");
+      expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
+   }
+
+   if(p_exp2) {
+      expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
+                                     Elements(lp_build_exp2_polynomial));
+
+      /* 2**x = 2**ipart * 2**fpart */
+      res = LLVMBuildMul(bld->builder, expipart, expfpart, "");
+   }
+
+   if(p_exp2_int_part)
+      *p_exp2_int_part = expipart;
+
+   if(p_frac_part)
+      *p_frac_part = fpart;
+
+   if(p_exp2)
+      *p_exp2 = res;
+}
+
+
+/**
+ * Generate 2**x: lp_build_exp2_approx() with only the final value requested.
+ */
+LLVMValueRef
+lp_build_exp2(struct lp_build_context *bld,
+              LLVMValueRef x)
+{
+   LLVMValueRef exp2_x;
+
+   lp_build_exp2_approx(bld, x, NULL, NULL, &exp2_x);
+
+   return exp2_x;
+}
+
+
+/**
+ * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
+ * These coefficients can be generated with
+ * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
+ *
+ * One coefficient set per supported LOG_POLY_DEGREE; any other degree is a
+ * compile error.  Coefficients are listed from the constant term upwards,
+ * which is the order lp_build_polynomial() consumes them in.
+ */
+const double lp_build_log2_polynomial[] = {
+#if LOG_POLY_DEGREE == 6
+   3.11578814719469302614, -3.32419399085241980044, 2.59883907202499966007, -1.23152682416275988241, 0.318212422185251071475, -0.0344359067839062357313
+#elif LOG_POLY_DEGREE == 5
+   2.8882704548164776201, -2.52074962577807006663, 1.48116647521213171641, -0.465725644288844778798, 0.0596515482674574969533
+#elif LOG_POLY_DEGREE == 4
+   2.61761038894603480148, -1.75647175389045657003, 0.688243882994381274313, -0.107254423828329604454
+#elif LOG_POLY_DEGREE == 3
+   2.28330284476918490682, -1.04913055217340124191, 0.204446009836232697516
+#else
+#error
+#endif
+};
+
+
+/**
+ * Approximate log2(x) for 32bit float vectors.
+ *
+ * The float is taken apart into its exponent and mantissa fields:
+ * log2(x) = exponent + log2(1.mantissa), where the second term comes from the
+ * minimax polynomial fit of log2(m)/(m - 1) multiplied by (m - 1).
+ *
+ * Each output pointer may be NULL; only the pieces needed for the requested
+ * outputs are generated.
+ *
+ * @param p_exp         receives the exponent bits of x, masked in place
+ *                      (integer vector).
+ *                      NOTE(review): the comment in the body calls this
+ *                      "(float) exponent(x)" but no conversion is done here;
+ *                      confirm what callers expect.
+ * @param p_floor_log2  receives the unbiased exponent as a float vector
+ * @param p_log2        receives the full approximation of log2(x)
+ *
+ * See http://www.devmaster.net/forums/showthread.php?p=43580
+ */
+void
+lp_build_log2_approx(struct lp_build_context *bld,
+                     LLVMValueRef x,
+                     LLVMValueRef *p_exp,
+                     LLVMValueRef *p_floor_log2,
+                     LLVMValueRef *p_log2)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+
+   LLVMValueRef expmask = lp_build_int_const_scalar(type, 0x7f800000);
+   LLVMValueRef mantmask = lp_build_int_const_scalar(type, 0x007fffff);
+   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
+
+   LLVMValueRef i = NULL;
+   LLVMValueRef exp = NULL;
+   LLVMValueRef mant = NULL;
+   LLVMValueRef logexp = NULL;
+   LLVMValueRef logmant = NULL;
+   LLVMValueRef res = NULL;
+
+   if(p_exp || p_floor_log2 || p_log2) {
+      /* TODO: optimize the constant case */
+      if(LLVMIsConstant(x))
+         debug_printf("%s: inefficient/imprecise constant arithmetic\n", __FUNCTION__);
+
+      assert(type.floating && type.width == 32);
+
+      i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
+
+      /* exp = (float) exponent(x) */
+      exp = LLVMBuildAnd(bld->builder, i, expmask, "");
+   }
+
+   if(p_floor_log2 || p_log2) {
+      /* shift the exponent field down and remove the bias */
+      logexp = LLVMBuildLShr(bld->builder, exp, lp_build_int_const_scalar(type, 23), "");
+      logexp = LLVMBuildSub(bld->builder, logexp, lp_build_int_const_scalar(type, 127), "");
+      logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
+   }
+
+   if(p_log2) {
+      /*
+       * mant = 1.mantissa(x): splice the mantissa bits under the exponent of
+       * 1.0 and reinterpret the bits as float, giving a value in [1, 2[.
+       * This must be a bitcast; an int-to-float conversion would turn the
+       * raw bit pattern into a huge number instead.
+       */
+      mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
+      mant = LLVMBuildOr(bld->builder, mant, one, "");
+      mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
+
+      logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
+                                    Elements(lp_build_log2_polynomial));
+
+      /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
+      logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");
+
+      res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
+   }
+
+   if(p_exp)
+      *p_exp = exp;
+
+   if(p_floor_log2)
+      *p_floor_log2 = logexp;
+
+   if(p_log2)
+      *p_log2 = res;
+}
+
+
+/**
+ * Generate log2(x): lp_build_log2_approx() with only the final value
+ * requested.
+ */
+LLVMValueRef
+lp_build_log2(struct lp_build_context *bld,
+              LLVMValueRef x)
+{
+   LLVMValueRef log2_x;
+
+   lp_build_log2_approx(bld, x, NULL, NULL, &log2_x);
+
+   return log2_x;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_arit.h b/src/gallium/drivers/llvmpipe/lp_bld_arit.h
new file mode 100644
index 0000000000..d68a97c4b8
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_arit.h
@@ -0,0 +1,191 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Helper arithmetic functions.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#ifndef LP_BLD_ARIT_H
+#define LP_BLD_ARIT_H
+
+
+#include <llvm-c/Core.h>  
+
+
+struct lp_type;            /* forward declaration only: no object is defined */
+struct lp_build_context;
+
+
+/**
+ * Complement, i.e., 1 - a.
+ */
+LLVMValueRef
+lp_build_comp(struct lp_build_context *bld,
+              LLVMValueRef a);
+
+LLVMValueRef
+lp_build_add(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b);
+
+LLVMValueRef
+lp_build_sub(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b);
+
+LLVMValueRef
+lp_build_mul(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b);
+
+LLVMValueRef
+lp_build_div(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b);
+
+LLVMValueRef
+lp_build_lerp(struct lp_build_context *bld,
+              LLVMValueRef x,
+              LLVMValueRef v0,
+              LLVMValueRef v1);
+
+/**
+ * Bilinear interpolation.
+ *
+ * Values indices are in v_{yx}.
+ */
+LLVMValueRef
+lp_build_lerp_2d(struct lp_build_context *bld,
+                 LLVMValueRef x,
+                 LLVMValueRef y,
+                 LLVMValueRef v00,
+                 LLVMValueRef v01,
+                 LLVMValueRef v10,
+                 LLVMValueRef v11);
+
+LLVMValueRef
+lp_build_min(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b);
+
+LLVMValueRef
+lp_build_max(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b);
+
+LLVMValueRef
+lp_build_abs(struct lp_build_context *bld,
+             LLVMValueRef a);
+
+LLVMValueRef
+lp_build_sgn(struct lp_build_context *bld,
+             LLVMValueRef a);
+
+LLVMValueRef
+lp_build_round(struct lp_build_context *bld,
+               LLVMValueRef a);
+
+LLVMValueRef
+lp_build_floor(struct lp_build_context *bld,
+               LLVMValueRef a);
+
+LLVMValueRef
+lp_build_ceil(struct lp_build_context *bld,
+              LLVMValueRef a);
+
+LLVMValueRef
+lp_build_trunc(struct lp_build_context *bld,
+               LLVMValueRef a);
+
+LLVMValueRef
+lp_build_int(struct lp_build_context *bld,
+             LLVMValueRef a);
+
+LLVMValueRef
+lp_build_ifloor(struct lp_build_context *bld,
+                LLVMValueRef a);
+
+LLVMValueRef
+lp_build_sqrt(struct lp_build_context *bld,
+              LLVMValueRef a);
+
+LLVMValueRef
+lp_build_rcp(struct lp_build_context *bld,
+             LLVMValueRef a);
+
+LLVMValueRef
+lp_build_rsqrt(struct lp_build_context *bld,
+               LLVMValueRef a);
+
+LLVMValueRef
+lp_build_cos(struct lp_build_context *bld,
+             LLVMValueRef a);
+
+LLVMValueRef
+lp_build_sin(struct lp_build_context *bld,
+             LLVMValueRef a);
+
+LLVMValueRef
+lp_build_pow(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b);
+
+LLVMValueRef
+lp_build_exp(struct lp_build_context *bld,
+             LLVMValueRef a);
+
+LLVMValueRef
+lp_build_log(struct lp_build_context *bld,
+             LLVMValueRef a);
+
+LLVMValueRef
+lp_build_exp2(struct lp_build_context *bld,
+              LLVMValueRef a);
+
+LLVMValueRef
+lp_build_log2(struct lp_build_context *bld,
+              LLVMValueRef a);
+
+void
+lp_build_exp2_approx(struct lp_build_context *bld,
+                     LLVMValueRef x,
+                     LLVMValueRef *p_exp2_int_part,
+                     LLVMValueRef *p_frac_part,
+                     LLVMValueRef *p_exp2);
+
+void
+lp_build_log2_approx(struct lp_build_context *bld,
+                     LLVMValueRef x,
+                     LLVMValueRef *p_exp,
+                     LLVMValueRef *p_floor_log2,
+                     LLVMValueRef *p_log2);
+
+#endif /* !LP_BLD_ARIT_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend.h b/src/gallium/drivers/llvmpipe/lp_bld_blend.h
new file mode 100644
index 0000000000..da272e549f
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend.h
@@ -0,0 +1,107 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef LP_BLD_BLEND_H
+#define LP_BLD_BLEND_H
+
+
+/**
+ * @file
+ * LLVM IR building helpers interfaces.
+ *
+ * We use LLVM-C bindings for now. They are not documented, but follow the C++
+ * interfaces very closely, and appear to be complete enough for code
+ * generation. See
+ * http://npcontemplation.blogspot.com/2008/06/secret-of-llvm-c-bindings.html
+ * for a standalone example.
+ */
+
+#include <llvm-c/Core.h>  
+ 
+#include "pipe/p_format.h"
+
+
+struct pipe_blend_state;
+struct lp_type;
+struct lp_build_context;
+
+
+/**
+ * Whether the blending function is commutative or not.
+ */
+boolean
+lp_build_blend_func_commutative(unsigned func);
+
+
+/**
+ * Whether the blending functions are the reverse of each other.
+ */
+boolean
+lp_build_blend_func_reverse(unsigned rgb_func, unsigned alpha_func);
+
+
+LLVMValueRef
+lp_build_blend_func(struct lp_build_context *bld,
+                    unsigned func,
+                    LLVMValueRef term1,
+                    LLVMValueRef term2);
+
+
+LLVMValueRef
+lp_build_blend_aos(LLVMBuilderRef builder,
+                   const struct pipe_blend_state *blend,
+                   struct lp_type type,
+                   LLVMValueRef src,
+                   LLVMValueRef dst,
+                   LLVMValueRef const_,
+                   unsigned alpha_swizzle);
+
+
+void
+lp_build_blend_soa(LLVMBuilderRef builder,
+                   const struct pipe_blend_state *blend,
+                   struct lp_type type,
+                   LLVMValueRef src[4],
+                   LLVMValueRef dst[4],
+                   LLVMValueRef const_[4],
+                   LLVMValueRef res[4]);
+
+
+/**
+ * Apply a logic op.
+ *
+ * src/dst parameters are packed values. It should work regardless of whether
+ * the inputs are scalars or vectors.
+ */
+LLVMValueRef
+lp_build_logicop(LLVMBuilderRef builder,
+                 unsigned logicop_func,
+                 LLVMValueRef src,
+                 LLVMValueRef dst);
+
+
+#endif /* !LP_BLD_BLEND_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
new file mode 100644
index 0000000000..d14f468ba9
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
@@ -0,0 +1,356 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Blend LLVM IR generation -- AoS layout.
+ *
+ * AoS blending is in general much slower than SoA, but there are some cases
+ * where it might be faster. In particular, if a pixel is rendered only once
+ * then the overhead of tiling and untiling will dominate over the speedup that
+ * SoA gives. So we might want to detect such cases and fall back to AoS in the
+ * future, but for now this function is here for historical/benchmarking
+ * purposes.
+ *
+ * Run lp_blend_test after any change to this file.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include "pipe/p_state.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_swizzle.h"
+#include "lp_bld_blend.h"
+#include "lp_bld_debug.h"
+
+
+/**
+ * We may use the same values several times, so we keep them here to avoid
+ * recomputing them. Also reusing the values allows us to do simplifications
+ * that LLVM optimization passes wouldn't normally be able to do.
+ */
+struct lp_build_blend_aos_context
+{
+   struct lp_build_context base;
+   
+   LLVMValueRef src;       /**< source value given to lp_build_blend_aos */
+   LLVMValueRef dst;       /**< destination value given to lp_build_blend_aos */
+   LLVMValueRef const_;    /**< constant value given to lp_build_blend_aos */
+
+   LLVMValueRef inv_src;   /**< lp_build_comp(src), NULL until first needed */
+   LLVMValueRef inv_dst;   /**< lp_build_comp(dst), NULL until first needed */
+   LLVMValueRef inv_const; /**< lp_build_comp(const_), NULL until first needed */
+   LLVMValueRef saturate;  /**< lp_build_min(src, inv_dst), NULL until first needed */
+
+   LLVMValueRef rgb_src_factor;   /* NOTE(review): these four fields are not referenced in this file */
+   LLVMValueRef alpha_src_factor;
+   LLVMValueRef rgb_dst_factor;
+   LLVMValueRef alpha_dst_factor;
+};
+
+
+static LLVMValueRef
+lp_build_blend_factor_unswizzled(struct lp_build_blend_aos_context *bld,
+                                 unsigned factor,
+                                 boolean alpha)
+{
+   struct lp_build_context *base = &bld->base;
+
+   switch (factor) {
+   case PIPE_BLENDFACTOR_ZERO:
+      return base->zero;
+   case PIPE_BLENDFACTOR_ONE:
+      return base->one;
+   /* Inputs returned as they are; COLOR and ALPHA variants share one value. */
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      return bld->src;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      return bld->dst;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      return bld->const_;
+
+   /* Complements, built on first use and cached in the context. */
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      if(!bld->inv_src)
+         bld->inv_src = lp_build_comp(base, bld->src);
+      return bld->inv_src;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      if(!bld->inv_dst)
+         bld->inv_dst = lp_build_comp(base, bld->dst);
+      return bld->inv_dst;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      if(!bld->inv_const)
+         bld->inv_const = lp_build_comp(base, bld->const_);
+      return bld->inv_const;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      if(alpha)
+         return base->one;
+      if(!bld->inv_dst)
+         bld->inv_dst = lp_build_comp(base, bld->dst);
+      if(!bld->saturate)
+         bld->saturate = lp_build_min(base, bld->src, bld->inv_dst);
+      return bld->saturate;
+
+   /* TODO: the SRC1 factors are not implemented and share the failure path. */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+   default:
+      assert(0);
+      return base->zero;
+   }
+}
+
+
+enum lp_build_blend_swizzle {
+   LP_BUILD_BLEND_SWIZZLE_RGBA = 0,   /**< use the factor's channels as they are */
+   LP_BUILD_BLEND_SWIZZLE_AAAA = 1,   /**< broadcast the alpha channel to every channel */
+};
+
+
+/**
+ * Select the shuffle for a factor: as-is (RGBA) or alpha broadcast (AAAA).
+ */
+static enum lp_build_blend_swizzle
+lp_build_blend_factor_swizzle(unsigned factor)
+{
+   switch (factor) {
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      return LP_BUILD_BLEND_SWIZZLE_AAAA;
+   case PIPE_BLENDFACTOR_ZERO:
+   case PIPE_BLENDFACTOR_ONE:
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+   case PIPE_BLENDFACTOR_DST_COLOR:
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+      return LP_BUILD_BLEND_SWIZZLE_RGBA;
+   default:
+      assert(0);   /* unknown factor */
+      return LP_BUILD_BLEND_SWIZZLE_RGBA;
+   }
+}
+
+
+static LLVMValueRef
+lp_build_blend_swizzle(struct lp_build_blend_aos_context *bld,
+                       LLVMValueRef rgb,
+                       LLVMValueRef alpha,
+                       enum lp_build_blend_swizzle rgb_swizzle,
+                       unsigned alpha_swizzle)
+{
+   const boolean same_value = (rgb == alpha);
+
+   switch (rgb_swizzle) {
+   case LP_BUILD_BLEND_SWIZZLE_RGBA: {
+      boolean mask[4] = {0, 0, 0, 0};
+      if(same_value)
+         return rgb;
+      mask[alpha_swizzle] = 1;   /* flag only the alpha channel */
+      return lp_build_select_aos(&bld->base, alpha, rgb, mask);
+   }
+   case LP_BUILD_BLEND_SWIZZLE_AAAA: {
+      unsigned char shuffle[4];
+      unsigned chan;
+      if(same_value)
+         return lp_build_broadcast_aos(&bld->base, rgb, alpha_swizzle);
+      for(chan = 0; chan < 4; ++chan)
+         shuffle[chan] = alpha_swizzle;
+      shuffle[alpha_swizzle] += 4;   /* presumably: read this one from 'alpha' */
+      return lp_build_swizzle2_aos(&bld->base, rgb, alpha, shuffle);
+   }
+   default:
+      assert(0);
+      return bld->base.undef;
+   }
+}
+
+
+/**
+ * Build one blend term: factor1 multiplied by the factor selected through
+ * rgb_factor and alpha_factor.
+ *
+ * @sa http://www.opengl.org/sdk/docs/man/xhtml/glBlendFuncSeparate.xml
+ */
+static LLVMValueRef
+lp_build_blend_factor(struct lp_build_blend_aos_context *bld,
+                      LLVMValueRef factor1,
+                      unsigned rgb_factor,
+                      unsigned alpha_factor,
+                      unsigned alpha_swizzle)
+{
+   /* Look up the rgb factor first, then the alpha one. */
+   LLVMValueRef rgb_val = lp_build_blend_factor_unswizzled(bld, rgb_factor, FALSE);
+   LLVMValueRef alpha_val = lp_build_blend_factor_unswizzled(bld, alpha_factor, TRUE);
+   LLVMValueRef combined;
+
+   combined = lp_build_blend_swizzle(bld, rgb_val, alpha_val,
+                                     lp_build_blend_factor_swizzle(rgb_factor),
+                                     alpha_swizzle);
+
+   return lp_build_mul(&bld->base, factor1, combined);
+}
+
+
+boolean
+lp_build_blend_func_commutative(unsigned func)
+{
+   switch (func) {
+   case PIPE_BLEND_SUBTRACT:
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      return FALSE;   /* operand order matters */
+   default:
+      assert(0);      /* unknown function: report it, then answer TRUE */
+      /* fall through */
+   case PIPE_BLEND_ADD:
+   case PIPE_BLEND_MIN:
+   case PIPE_BLEND_MAX:
+      return TRUE;
+   }
+}
+
+
+boolean
+lp_build_blend_func_reverse(unsigned rgb_func, unsigned alpha_func)
+{
+   /* True only for the SUBTRACT / REVERSE_SUBTRACT pair, in either order. */
+   if(rgb_func == PIPE_BLEND_SUBTRACT)
+      return alpha_func == PIPE_BLEND_REVERSE_SUBTRACT ? TRUE : FALSE;
+   if(rgb_func == PIPE_BLEND_REVERSE_SUBTRACT)
+      return alpha_func == PIPE_BLEND_SUBTRACT ? TRUE : FALSE;
+
+   return FALSE;
+}
+
+
+/**
+ * Combine two blend terms with a PIPE_BLEND_x function.
+ *
+ * @sa http://www.opengl.org/sdk/docs/man/xhtml/glBlendEquationSeparate.xml
+ */
+LLVMValueRef
+lp_build_blend_func(struct lp_build_context *bld,
+                    unsigned func,
+                    LLVMValueRef term1,
+                    LLVMValueRef term2)
+{
+   if(func == PIPE_BLEND_ADD)
+      return lp_build_add(bld, term1, term2);
+   if(func == PIPE_BLEND_SUBTRACT)
+      return lp_build_sub(bld, term1, term2);
+   if(func == PIPE_BLEND_REVERSE_SUBTRACT)
+      return lp_build_sub(bld, term2, term1);   /* operands swapped */
+   if(func == PIPE_BLEND_MIN)
+      return lp_build_min(bld, term1, term2);
+   if(func == PIPE_BLEND_MAX)
+      return lp_build_max(bld, term1, term2);
+
+   /* Not a known blend function. */
+   assert(0);
+   return bld->zero;
+}
+
+
+LLVMValueRef
+lp_build_blend_aos(LLVMBuilderRef builder,
+                   const struct pipe_blend_state *blend,
+                   struct lp_type type,
+                   LLVMValueRef src,
+                   LLVMValueRef dst,
+                   LLVMValueRef const_,
+                   unsigned alpha_swizzle)
+{
+   struct lp_build_blend_aos_context ctx;
+   LLVMValueRef s_term;
+   LLVMValueRef d_term;
+   LLVMValueRef rgb_res;
+   LLVMValueRef alpha_res;
+
+   /* FIXME: only a full color mask is handled by this path. */
+   assert(blend->colormask == 0xf);
+
+   /* With blending disabled the source passes through untouched. */
+   if(!blend->blend_enable)
+      return src;
+
+   /* Blending is only meaningful on normalized values. */
+   assert(type.norm);
+
+   /* Start from a zeroed context so that every cached value reads as unset. */
+   memset(&ctx, 0, sizeof ctx);
+   lp_build_context_init(&ctx.base, builder, type);
+   ctx.src = src;
+   ctx.dst = dst;
+   ctx.const_ = const_;
+
+   /* TODO: for certain combinations the operations could be reordered to
+    * save some instructions. */
+
+   /* Each term is an input multiplied by its (rgb, alpha) factor pair. */
+   s_term = lp_build_blend_factor(&ctx, src, blend->rgb_src_factor,
+                                  blend->alpha_src_factor, alpha_swizzle);
+   d_term = lp_build_blend_factor(&ctx, dst, blend->rgb_dst_factor,
+                                  blend->alpha_dst_factor, alpha_swizzle);
+
+   lp_build_name(s_term, "src_term");
+   lp_build_name(d_term, "dst_term");
+
+   rgb_res = lp_build_blend_func(&ctx.base, blend->rgb_func, s_term, d_term);
+   /* A single function covers all channels. */
+   if(blend->rgb_func == blend->alpha_func)
+      return rgb_res;
+
+   /* Separate RGB / A functions: combine both results channel-wise. */
+   alpha_res = lp_build_blend_func(&ctx.base, blend->alpha_func, s_term, d_term);
+   return lp_build_blend_swizzle(&ctx, rgb_res, alpha_res,
+                                 LP_BUILD_BLEND_SWIZZLE_RGBA, alpha_swizzle);
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_logicop.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_logicop.c
new file mode 100644
index 0000000000..88321f62a2
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_logicop.c
@@ -0,0 +1,108 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Blend LLVM IR generation -- logic ops.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include "pipe/p_state.h"
+
+#include "lp_bld_blend.h"
+
+
+/**
+ * Emit the bitwise operation selected by logicop_func on src and dst.
+ *
+ * An unrecognized function asserts and yields src.
+ */
+LLVMValueRef
+lp_build_logicop(LLVMBuilderRef builder,
+                 unsigned logicop_func,
+                 LLVMValueRef src,
+                 LLVMValueRef dst)
+{
+   LLVMTypeRef type = LLVMTypeOf(src);
+   LLVMValueRef tmp;
+
+   switch (logicop_func) {
+   /* Constants and plain copies. */
+   case PIPE_LOGICOP_CLEAR:
+      return LLVMConstNull(type);
+   case PIPE_LOGICOP_SET:
+      return LLVMConstAllOnes(type);
+   case PIPE_LOGICOP_COPY:
+      return src;
+   case PIPE_LOGICOP_NOOP:
+      return dst;
+
+   /* A single operation. */
+   case PIPE_LOGICOP_COPY_INVERTED:
+      return LLVMBuildNot(builder, src, "");
+   case PIPE_LOGICOP_INVERT:
+      return LLVMBuildNot(builder, dst, "");
+   case PIPE_LOGICOP_AND:
+      return LLVMBuildAnd(builder, src, dst, "");
+   case PIPE_LOGICOP_OR:
+      return LLVMBuildOr(builder, src, dst, "");
+   case PIPE_LOGICOP_XOR:
+      return LLVMBuildXor(builder, src, dst, "");
+
+   /* A binary operation followed by a complement of its result. */
+   case PIPE_LOGICOP_NOR:
+      tmp = LLVMBuildOr(builder, src, dst, "");
+      return LLVMBuildNot(builder, tmp, "");
+   case PIPE_LOGICOP_NAND:
+      tmp = LLVMBuildAnd(builder, src, dst, "");
+      return LLVMBuildNot(builder, tmp, "");
+   case PIPE_LOGICOP_EQUIV:
+      tmp = LLVMBuildXor(builder, src, dst, "");
+      return LLVMBuildNot(builder, tmp, "");
+
+   /* One operand complemented before the binary operation. */
+   case PIPE_LOGICOP_AND_INVERTED:
+      tmp = LLVMBuildNot(builder, src, "");
+      return LLVMBuildAnd(builder, tmp, dst, "");
+   case PIPE_LOGICOP_AND_REVERSE:
+      tmp = LLVMBuildNot(builder, dst, "");
+      return LLVMBuildAnd(builder, src, tmp, "");
+   case PIPE_LOGICOP_OR_INVERTED:
+      tmp = LLVMBuildNot(builder, src, "");
+      return LLVMBuildOr(builder, tmp, dst, "");
+   case PIPE_LOGICOP_OR_REVERSE:
+      tmp = LLVMBuildNot(builder, dst, "");
+      return LLVMBuildOr(builder, src, tmp, "");
+
+   default:
+      assert(0);
+      return src;
+   }
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c
new file mode 100644
index 0000000000..9511299d55
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c
@@ -0,0 +1,298 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Blend LLVM IR generation -- SoA layout.
+ *
+ * Blending in SoA is much faster than AoS, especially when separate rgb/alpha
+ * factors/functions are used, since no channel masking/shuffling is necessary
+ * and we can achieve the full throughput of the SIMD operations. Furthermore
+ * the fragment shader output is also in SoA, so it fits nicely with the rest of
+ * the fragment pipeline.
+ *
+ * The drawback is that to be displayed the color buffer needs to be in AoS
+ * layout, so we need to tile/untile the color buffer before/after rendering.
+ * A color buffer like
+ *
+ *  R11 G11 B11 A11 R12 G12 B12 A12  R13 G13 B13 A13 R14 G14 B14 A14  ...
+ *  R21 G21 B21 A21 R22 G22 B22 A22  R23 G23 B23 A23 R24 G24 B24 A24  ...
+ *
+ *  R31 G31 B31 A31 R32 G32 B32 A32  R33 G33 B33 A33 R34 G34 B34 A34  ...
+ *  R41 G41 B41 A41 R42 G42 B42 A42  R43 G43 B43 A43 R44 G44 B44 A44  ...
+ *
+ *  ... ... ... ... ... ... ... ...  ... ... ... ... ... ... ... ...  ...
+ *
+ * will actually be stored in memory as
+ *
+ *  R11 R12 R21 R22 R13 R14 R23 R24 ... G11 G12 G21 G22 G13 G14 G23 G24 ... B11 B12 B21 B22 B13 B14 B23 B24 ... A11 A12 A21 A22 A13 A14 A23 A24 ...
+ *  R31 R32 R41 R42 R33 R34 R43 R44 ... G31 G32 G41 G42 G33 G34 G43 G44 ... B31 B32 B41 B42 B33 B34 B43 B44 ... A31 A32 A41 A42 A33 A34 A43 A44 ...
+ *  ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
+ *
+ * NOTE: Run lp_blend_test after any change to this file.
+ *
+ * You can also run lp_blend_test to obtain AoS vs SoA benchmarks. Invoking it
+ * as:
+ *
+ *  lp_blend_test -o blend.tsv
+ *
+ * will generate a tab-separated file with the test results and performance
+ * measurements.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include "pipe/p_state.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_blend.h"
+
+
+/**
+ * We may use the same values several times, so we keep them here to avoid
+ * recomputing them. Also reusing the values allows us to do simplifications
+ * that LLVM optimization passes wouldn't normally be able to do.
+ */
+struct lp_build_blend_soa_context
+{
+   struct lp_build_context base;
+
+   LLVMValueRef src[4];   /**< per-channel source values */
+   LLVMValueRef dst[4];   /**< per-channel destination values */
+   LLVMValueRef con[4];   /**< per-channel constant values */
+
+   LLVMValueRef inv_src[4];   /**< lp_build_comp(src[i]), NULL until first needed */
+   LLVMValueRef inv_dst[4];   /**< lp_build_comp(dst[i]), NULL until first needed */
+   LLVMValueRef inv_con[4];   /**< lp_build_comp(con[i]), NULL until first needed */
+
+   LLVMValueRef src_alpha_saturate;   /**< lp_build_min(src[3], inv_dst[3]), NULL until first needed */
+
+   /**
+    * All factors, indexed [0=src, 1=dst][0=value, 1=its factor][channel],
+    * kept in a table in order to eliminate redundant multiplications later.
+    */
+   LLVMValueRef factor[2][2][4];
+
+   /**
+    * Table with all terms, indexed [0=src, 1=dst][channel].
+    */
+   LLVMValueRef term[2][4];
+};
+
+
+/**
+ * Return the value of a blend factor for channel i.
+ * Complements and the alpha-saturate value are built on first use and
+ * cached in the context.
+ */
+static LLVMValueRef
+lp_build_blend_soa_factor(struct lp_build_blend_soa_context *bld,
+                          unsigned factor, unsigned i)
+{
+   struct lp_build_context *base = &bld->base;
+   unsigned chan = i;
+
+   /* The ALPHA variants read channel 3 whichever channel is being blended. */
+   switch (factor) {
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      chan = 3;
+      break;
+   default:
+      break;
+   }
+
+   switch (factor) {
+   case PIPE_BLENDFACTOR_ZERO:
+      return base->zero;
+   case PIPE_BLENDFACTOR_ONE:
+      return base->one;
+
+   /* Inputs as they are. */
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      return bld->src[chan];
+   case PIPE_BLENDFACTOR_DST_COLOR:
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      return bld->dst[chan];
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      return bld->con[chan];
+
+   /* Complemented inputs. */
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      if(!bld->inv_src[chan])
+         bld->inv_src[chan] = lp_build_comp(base, bld->src[chan]);
+      return bld->inv_src[chan];
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      if(!bld->inv_dst[chan])
+         bld->inv_dst[chan] = lp_build_comp(base, bld->dst[chan]);
+      return bld->inv_dst[chan];
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      if(!bld->inv_con[chan])
+         bld->inv_con[chan] = lp_build_comp(base, bld->con[chan]);
+      return bld->inv_con[chan];
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      if(i == 3)
+         return base->one;
+      if(!bld->inv_dst[3])
+         bld->inv_dst[3] = lp_build_comp(base, bld->dst[3]);
+      if(!bld->src_alpha_saturate)
+         bld->src_alpha_saturate = lp_build_min(base, bld->src[3], bld->inv_dst[3]);
+      return bld->src_alpha_saturate;
+
+   /* TODO: the SRC1 factors are not implemented and share the failure path. */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+   default:
+      assert(0);
+      return base->zero;
+   }
+}
+
+
+/**
+ * Generate blend code in SOA mode.
+ * \param builder  LLVM IR builder the code is emitted with
+ * \param blend    blend state to generate code for
+ * \param type     type of the channel values
+ * \param src  src/fragment color
+ * \param dst  dst/framebuffer color
+ * \param con  constant blend color
+ * \param res  the result/output
+ */
+void
+lp_build_blend_soa(LLVMBuilderRef builder,
+                   const struct pipe_blend_state *blend,
+                   struct lp_type type,
+                   LLVMValueRef src[4],
+                   LLVMValueRef dst[4],
+                   LLVMValueRef con[4],
+                   LLVMValueRef res[4])
+{
+   struct lp_build_blend_soa_context bld;
+   unsigned i, j, k;
+
+   /* Setup build context */
+   memset(&bld, 0, sizeof bld);
+   lp_build_context_init(&bld.base, builder, type);
+   for (i = 0; i < 4; ++i) {
+      bld.src[i] = src[i];
+      bld.dst[i] = dst[i];
+      bld.con[i] = con[i];
+   }
+
+   for (i = 0; i < 4; ++i) {
+      if (blend->colormask & (1 << i)) {
+         if (blend->logicop_enable) {
+            /* NOTE(review): floating types keep dst here -- confirm intended */
+            if(!type.floating)
+               res[i] = lp_build_logicop(builder, blend->logicop_func, src[i], dst[i]);
+            else
+               res[i] = dst[i];
+         }
+         else if (blend->blend_enable) {
+            unsigned src_factor = i < 3 ? blend->rgb_src_factor : blend->alpha_src_factor;
+            unsigned dst_factor = i < 3 ? blend->rgb_dst_factor : blend->alpha_dst_factor;
+            unsigned func = i < 3 ? blend->rgb_func : blend->alpha_func;
+            boolean func_commutative = lp_build_blend_func_commutative(func);
+
+            /* It makes no sense to blend unless values are normalized */
+            assert(type.norm);
+
+            /* Compute src/dst factors. */
+            bld.factor[0][0][i] = src[i];
+            bld.factor[0][1][i] = lp_build_blend_soa_factor(&bld, src_factor, i);
+            bld.factor[1][0][i] = dst[i];
+            bld.factor[1][1][i] = lp_build_blend_soa_factor(&bld, dst_factor, i);
+
+            /* Compute src/dst terms. */
+            for(k = 0; k < 2; ++k) {
+               /* See if this multiplication has been previously computed */
+               for(j = 0; j < i; ++j) {
+                  if((bld.factor[k][0][j] == bld.factor[k][0][i] &&
+                      bld.factor[k][1][j] == bld.factor[k][1][i]) ||
+                     (bld.factor[k][0][j] == bld.factor[k][1][i] &&
+                      bld.factor[k][1][j] == bld.factor[k][0][i]))
+                     break;
+               }
+
+               if(j < i)
+                  bld.term[k][i] = bld.term[k][j];
+               else
+                  bld.term[k][i] = lp_build_mul(&bld.base, bld.factor[k][0][i], bld.factor[k][1][i]);
+            }
+
+            /*
+             * Combine terms. A previous result is reused only when it provably
+             * equals this one: the same function on the same terms (in either
+             * order if commutative), or the reverse function on swapped terms.
+             * Equal terms alone are not enough, since the rgb and alpha
+             * functions may differ.
+             */
+            for(j = 0; j < i; ++j) {
+               unsigned prev_func = j < 3 ? blend->rgb_func : blend->alpha_func;
+               boolean same_func = func == prev_func;
+               boolean func_reverse = lp_build_blend_func_reverse(func, prev_func);
+
+               if((same_func &&
+                   bld.term[0][j] == bld.term[0][i] &&
+                   bld.term[1][j] == bld.term[1][i]) ||
+                  (((same_func && func_commutative) || func_reverse) &&
+                   bld.term[0][j] == bld.term[1][i] &&
+                   bld.term[1][j] == bld.term[0][i]))
+                  break;
+            }
+
+            if(j < i)
+               res[i] = res[j];
+            else
+               res[i] = lp_build_blend_func(&bld.base, func, bld.term[0][i], bld.term[1][i]);
+         }
+         else {
+            res[i] = src[i];
+         }
+      }
+      else {
+         res[i] = dst[i];
+      }
+   }
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_const.c b/src/gallium/drivers/llvmpipe/lp_bld_const.c
new file mode 100644
index 0000000000..c8eaa8c394
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_const.c
@@ -0,0 +1,369 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Helper functions for constant building.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#include <float.h>
+
+#include "util/u_debug.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+
+
+/**
+ * Number of explicitly stored mantissa bits of a floating point type
+ * (for any other type: the number of magnitude bits).
+ */
+unsigned
+lp_mantissa(struct lp_type type)
+{
+   assert(type.floating);
+
+   if(!type.floating)
+      return type.sign ? type.width - 1 : type.width;
+
+   switch(type.width) {
+   case 32:
+      return 23;
+   case 64:
+      /* 52 stored bits: like for floats, the leading one is implicit */
+      return 52;
+   default:
+      assert(0);
+      return 0;
+   }
+}
+
+
+/**
+ * Shift amount corresponding to the unity of the given type.
+ *
+ * This is lp_const_scale() expressed as a left shift.
+ */
+unsigned
+lp_const_shift(struct lp_type type)
+{
+   unsigned shift = 0;
+   if(!type.floating) {
+      if(type.fixed)
+         shift = type.width/2;
+      else if(type.norm)
+         shift = type.sign ? type.width - 1 : type.width;
+   }
+   return shift;
+}
+
+
+/* Amount subtracted from (1 << lp_const_shift(type)) by lp_const_scale(). */
+unsigned
+lp_const_offset(struct lp_type type)
+{
+   /* Only normalized integers (neither float nor fixed) have an offset */
+   if(!type.floating && !type.fixed && type.norm)
+      return 1;
+
+   return 0;
+}
+
+
+/**
+ * Factor relating a value's native LLVM encoding to the number it stands for.
+ *
+ * It is computed as (1 << lp_const_shift(type)) - lp_const_offset(type), which
+ * is 1.0 for all floating types and unnormalized integers.
+ */
+double
+lp_const_scale(struct lp_type type)
+{
+   const unsigned shift = lp_const_shift(type);
+   const unsigned offset = lp_const_offset(type);
+   const unsigned long long int_scale = ((unsigned long long)1 << shift) - offset;
+   const double scale = (double)int_scale;
+
+   /* The scale must survive the round trip through double unchanged */
+   assert((unsigned long long)scale == int_scale);
+
+   return scale;
+}
+
+
+/**
+ * Minimum value representable by the type: 0 for unsigned types, -1 for
+ * signed normalized types, -FLT_MAX or -DBL_MAX for floats, and -(2^bits)
+ * for the remaining signed integer and fixed point types.
+ */
+double
+lp_const_min(struct lp_type type)
+{
+   unsigned bits;
+
+   if(!type.sign)
+      return 0.0;
+
+   if(type.norm)
+      return -1.0;
+
+   if (type.floating) {
+      switch(type.width) {
+      case 32:
+         return -FLT_MAX;
+      case 64:
+         return -DBL_MAX;
+      default:
+         assert(0);
+         return 0.0;
+      }
+   }
+
+   /* FIXME: consider the fractional bits of fixed point types? */
+   bits = (type.fixed ? type.width / 2 : type.width) - 1;
+
+   /* Shift as unsigned: (long long)1 << 63 would overflow (undefined) */
+   return -(double)((unsigned long long)1 << bits);
+}
+
+
+/**
+ * Maximum value representable by the type: 1 for normalized types,
+ * FLT_MAX/DBL_MAX for floats, 2^bits - 1 (sign bit excluded) otherwise.
+ */
+double
+lp_const_max(struct lp_type type)
+{
+   unsigned bits;
+
+   if(type.norm)
+      return 1.0;
+
+   if (type.floating) {
+      switch(type.width) {
+      case 32:
+         return FLT_MAX;
+      case 64:
+         return DBL_MAX;
+      default:
+         assert(0);
+         return 0.0;
+      }
+   }
+
+   bits = type.fixed ? type.width / 2 : type.width;
+   if(type.sign)
+      bits -= 1;
+
+   /* Shifting by the full width of the type is undefined: special-case it */
+   if(bits >= 8 * sizeof(unsigned long long))
+      return (double)~(unsigned long long)0;
+   return (double)(((unsigned long long)1 << bits) - 1);
+}
+
+
+/**
+ * Epsilon of the type: FLT_EPSILON or DBL_EPSILON for floats, and the
+ * reciprocal of lp_const_scale() for everything else.
+ */
+double
+lp_const_eps(struct lp_type type)
+{
+   if (!type.floating)
+      return 1.0/lp_const_scale(type);
+   switch(type.width) {
+   case 32:
+      return FLT_EPSILON;
+   case 64:
+      return DBL_EPSILON;
+   default:
+      assert(0);
+      return 0.0;
+   }
+}
+
+
+LLVMValueRef
+lp_build_undef(struct lp_type type)
+{
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
+   return LLVMGetUndef(vec_type);
+}
+               
+
+/* Null (all zero) constant of the vector type described by type. */
+LLVMValueRef
+lp_build_zero(struct lp_type type)
+{
+   return LLVMConstNull(lp_build_vec_type(type));
+}
+               
+
+/**
+ * Build a constant vector with every element set to the type's encoding of
+ * one: 1.0 for floats, 1 << (width/2) for fixed point, 1 for plain integers
+ * and the largest representable value for normalized integers.
+ */
+LLVMValueRef
+lp_build_one(struct lp_type type)
+{
+   LLVMTypeRef elem_type;
+   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+   unsigned i;
+
+   assert(type.length <= LP_MAX_VECTOR_LENGTH);
+
+   elem_type = lp_build_elem_type(type);
+
+   if(!type.floating && !type.fixed && type.norm && !type.sign) {
+      /* Unsigned normalized: one is simply a vector with all bits set.
+       *
+       * TODO: shifting this with LLVMConstLShr to cover the signed case too
+       * caused "Tried to create a shift operation on a non-integer type!".
+       */
+      return LLVMConstAllOnes(LLVMVectorType(elem_type, type.length));
+   }
+
+   if(type.floating)
+      elems[0] = LLVMConstReal(elem_type, 1.0);
+   else if(type.fixed)
+      elems[0] = LLVMConstInt(elem_type, 1LL << (type.width/2), 0);
+   else if(type.norm)
+      elems[0] = LLVMConstInt(elem_type, (1LL << (type.width - 1)) - 1, 0);
+   else
+      elems[0] = LLVMConstInt(elem_type, 1, 0);
+
+   /* Replicate the first element across the whole vector */
+   for(i = 1; i < type.length; ++i)
+      elems[i] = elems[0];
+
+   return LLVMConstVector(elems, type.length);
+}
+               
+
+/* Build a vector with all elements set to val in the type's native encoding. */
+LLVMValueRef
+lp_build_const_scalar(struct lp_type type,
+                      double val)
+{
+   LLVMTypeRef elem_type = lp_build_elem_type(type);
+   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+   unsigned i;
+
+   assert(type.length <= LP_MAX_VECTOR_LENGTH);
+   if(type.floating)
+      elems[0] = LLVMConstReal(elem_type, val);
+   else {
+      /* Round to nearest; negative doubles can't be converted to unsigned */
+      double v = val*lp_const_scale(type);
+      elems[0] = v < 0.0 ? LLVMConstInt(elem_type, (long long)(v - 0.5), 0)
+                         : LLVMConstInt(elem_type, v + 0.5, 0);
+   }
+
+   for(i = 1; i < type.length; ++i)
+      elems[i] = elems[0];
+
+   return LLVMConstVector(elems, type.length);
+}
+
+
+/* Build an integer vector constant with all elements set to val (unscaled). */
+LLVMValueRef
+lp_build_int_const_scalar(struct lp_type type,
+                          long long val)
+{
+   const LLVMTypeRef int_elem_type = lp_build_int_elem_type(type);
+   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+   unsigned n;
+
+   assert(type.length <= LP_MAX_VECTOR_LENGTH);
+   for(n = 0; n < type.length; ++n)
+      elems[n] = LLVMConstInt(int_elem_type, val, type.sign ? 1 : 0);
+
+   return LLVMConstVector(elems, type.length);
+}
+
+
+/**
+ * Build an AoS constant vector that repeats the r, g, b, a values, with
+ * swizzle (identity when NULL) giving the position of each channel.
+ */
+LLVMValueRef
+lp_build_const_aos(struct lp_type type,
+                   double r, double g, double b, double a,
+                   const unsigned char *swizzle)
+{
+   const unsigned char default_swizzle[4] = {0, 1, 2, 3};
+   double rgba[4];
+   LLVMTypeRef elem_type;
+   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+   double dscale = 1.0;
+   unsigned i;
+
+   assert(type.length % 4 == 0);
+   assert(type.length <= LP_MAX_VECTOR_LENGTH);
+
+   elem_type = lp_build_elem_type(type);
+   rgba[0] = r; rgba[1] = g; rgba[2] = b; rgba[3] = a;
+   if(swizzle == NULL)
+      swizzle = default_swizzle;
+   if(!type.floating)
+      dscale = lp_const_scale(type);
+
+   /* The first group of four channels, in the type's native encoding */
+   for(i = 0; i < 4; ++i)
+      if(type.floating)
+         elems[swizzle[i]] = LLVMConstReal(elem_type, rgba[i]);
+      else
+         elems[swizzle[i]] = LLVMConstInt(elem_type, rgba[i]*dscale + 0.5, 0);
+
+   for(i = 4; i < type.length; ++i)
+      elems[i] = elems[i % 4];
+
+   return LLVMConstVector(elems, type.length);
+}
+
+
+/* Build an AoS mask: channel i of each group of four is all ones iff cond[i]. */
+LLVMValueRef
+lp_build_const_mask_aos(struct lp_type type,
+                        const boolean cond[4])
+{
+   LLVMTypeRef elem_type = LLVMIntType(type.width);
+   LLVMValueRef masks[LP_MAX_VECTOR_LENGTH];
+   unsigned base, chan;
+
+   assert(type.length <= LP_MAX_VECTOR_LENGTH);
+   for(base = 0; base < type.length; base += 4)
+      for(chan = 0; chan < 4; ++chan)
+         masks[base + chan] = LLVMConstInt(elem_type, cond[chan] ? ~0 : 0, 0);
+
+   return LLVMConstVector(masks, type.length);
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_const.h b/src/gallium/drivers/llvmpipe/lp_bld_const.h
new file mode 100644
index 0000000000..ffb302f736
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_const.h
@@ -0,0 +1,108 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Helper functions for constant building.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#ifndef LP_BLD_CONST_H
+#define LP_BLD_CONST_H
+
+
+#include <llvm-c/Core.h>  
+
+#include <pipe/p_compiler.h>
+
+
+struct lp_type; /* forward declaration only; must not define a variable */
+
+/* Number of mantissa bits of a floating point type. */
+unsigned
+lp_mantissa(struct lp_type type);
+
+/* Shift of the unity: lp_const_scale() in terms of shifts. */
+unsigned
+lp_const_shift(struct lp_type type);
+
+/* Offset that lp_const_scale() subtracts from 1 << lp_const_shift(). */
+unsigned
+lp_const_offset(struct lp_type type);
+
+/* Scaling factor between the LLVM native value and its interpretation. */
+double
+lp_const_scale(struct lp_type type);
+
+/* Minimum and maximum values representable by the type. */
+double
+lp_const_min(struct lp_type type);
+
+double
+lp_const_max(struct lp_type type);
+
+/* FLT_EPSILON/DBL_EPSILON for floats, 1/lp_const_scale() otherwise. */
+double
+lp_const_eps(struct lp_type type);
+
+/* Undefined value of the type's vector type. */
+LLVMValueRef
+lp_build_undef(struct lp_type type);
+
+/* Null constant of the type's vector type. */
+LLVMValueRef
+lp_build_zero(struct lp_type type);
+
+/* Vector with all elements set to the type's representation of one. */
+LLVMValueRef
+lp_build_one(struct lp_type type);
+
+/* Vector with all elements set to val (scaled for non-float types). */
+LLVMValueRef
+lp_build_const_scalar(struct lp_type type,
+                      double val);
+
+/* Integer vector with all elements set to val (no scaling applied). */
+LLVMValueRef
+lp_build_int_const_scalar(struct lp_type type,
+                          long long val);
+
+/* Vector repeating r, g, b, a; swizzle (may be NULL) places the channels. */
+LLVMValueRef
+lp_build_const_aos(struct lp_type type, 
+                   double r, double g, double b, double a, 
+                   const unsigned char *swizzle);
+
+/* Mask vector: channel i is all ones when cond[i] is set, zero otherwise. */
+LLVMValueRef
+lp_build_const_mask_aos(struct lp_type type,
+                        const boolean cond[4]);
+
+
+#endif /* !LP_BLD_CONST_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_conv.c b/src/gallium/drivers/llvmpipe/lp_bld_conv.c
new file mode 100644
index 0000000000..186cac70f6
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_conv.c
@@ -0,0 +1,702 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Helper functions for type conversions.
+ *
+ * We want to use the fastest type for a given computation whenever feasible.
+ * The other side of this is that we need to be able to convert between
+ * several types accurately and efficiently.
+ *
+ * Conversion between types of different bit width is quite complex.
+ *
+ * There are a few invariants to keep in mind in type conversions:
+ *
+ * - register width must remain constant:
+ *
+ *     src_type.width * src_type.length == dst_type.width * dst_type.length
+ *
+ * - total number of elements must remain constant:
+ *
+ *     src_type.length * num_srcs == dst_type.length * num_dsts
+ *
+ * It is not always possible to do the conversion both accurately and
+ * efficiently, usually due to lack of adequate machine instructions. In these
+ * cases it is important not to take shortcuts here and sacrifice accuracy, as
+ * these functions can be used anywhere. In the future we might have a
+ * precision parameter which can gauge the accuracy vs efficiency compromise,
+ * but for now if the data conversion between two stages happens to be the
+ * bottleneck, then most likely one should just avoid converting at all and
+ * run both stages with the same type.
+ *
+ * Make sure to run lp_test_conv unit test after any change to this file.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include "util/u_debug.h"
+#include "util/u_math.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_intr.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_conv.h"
+
+
+/**
+ * Special case for converting clamped IEEE-754 floats to unsigned norms.
+ *
+ * The mathematical voodoo below may seem excessive but it is actually
+ * paramount we do it this way for several reasons. First, there is no single
+ * precision FP to unsigned integer conversion Intel SSE instruction. Second,
+ * even if there was, since the FP's mantissa takes only a fraction of the
+ * register bits, the typical scale and cast approach would require double
+ * precision for accurate results, and therefore half the throughput.
+ *
+ * Although the result values can be scaled to an arbitrary bit width specified
+ * by dst_width, the actual result type will have the same width as src_type.
+ */
+LLVMValueRef
+lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
+                                        struct lp_type src_type,
+                                        unsigned dst_width,
+                                        LLVMValueRef src)
+{
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(src_type);
+   LLVMValueRef res;
+   unsigned mantissa;
+   unsigned n;
+   unsigned long long ubound;
+   unsigned long long mask;
+   double scale;
+   double bias;
+
+   assert(src_type.floating);
+
+   mantissa = lp_mantissa(src_type);
+
+   /* We cannot carry more bits than the mantissa */
+   n = MIN2(mantissa, dst_width);
+
+   /* These magic coefficients will make the desired result appear in the
+    * least significant bits of the mantissa.
+    */
+   ubound = ((unsigned long long)1 << n);
+   mask = ubound - 1;
+   scale = (double)mask/ubound;
+   bias = (double)((unsigned long long)1 << (mantissa - n));
+   /* Scale, add the bias, then reinterpret the float bits as integers */
+   res = LLVMBuildMul(builder, src, lp_build_const_scalar(src_type, scale), "");
+   res = LLVMBuildAdd(builder, res, lp_build_const_scalar(src_type, bias), "");
+   res = LLVMBuildBitCast(builder, res, int_vec_type, "");
+
+   if(dst_width > n) {
+      int shift = dst_width - n;
+      res = LLVMBuildShl(builder, res, lp_build_int_const_scalar(src_type, shift), "");
+
+      /* TODO: Fill in the empty lower bits for additional precision? */
+#if 0
+      {
+         LLVMValueRef msb;
+         msb = LLVMBuildLShr(builder, res, lp_build_int_const_scalar(src_type, dst_width - 1), "");
+         msb = LLVMBuildShl(builder, msb, lp_build_int_const_scalar(src_type, shift), "");
+         msb = LLVMBuildSub(builder, msb, lp_build_int_const_scalar(src_type, 1), "");
+         res = LLVMBuildOr(builder, res, msb, "");
+      }
+#elif 0
+      while(shift > 0) {
+         res = LLVMBuildOr(builder, res, LLVMBuildLShr(builder, res, lp_build_int_const_scalar(src_type, n), ""), "");
+         shift -= n;
+         n *= 2;
+      }
+#endif
+   }
+   else
+      res = LLVMBuildAnd(builder, res, lp_build_int_const_scalar(src_type, mask), "");
+
+   return res;
+}
+
+
+/**
+ * Inverse of lp_build_clamped_float_to_unsigned_norm above: converts
+ * src_width-bit unsigned normalized integers to floats of type dst_type.
+ */
+LLVMValueRef
+lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
+                                unsigned src_width,
+                                struct lp_type dst_type,
+                                LLVMValueRef src)
+{
+   LLVMTypeRef vec_type = lp_build_vec_type(dst_type);
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(dst_type);
+   LLVMValueRef bias_;
+   LLVMValueRef res;
+   unsigned mantissa;
+   unsigned n;
+   unsigned long long ubound;
+   unsigned long long mask;
+   double scale;
+   double bias;
+
+   mantissa = lp_mantissa(dst_type);
+   n = MIN2(mantissa, src_width);
+   /* Same coefficients as in the opposite conversion, with scale inverted */
+   ubound = ((unsigned long long)1 << n);
+   mask = ubound - 1;
+   scale = (double)ubound/mask;
+   bias = (double)((unsigned long long)1 << (mantissa - n));
+
+   res = src;
+   /* Sources wider than the mantissa only keep their topmost bits */
+   if(src_width > mantissa) {
+      int shift = src_width - mantissa;
+      res = LLVMBuildLShr(builder, res, lp_build_int_const_scalar(dst_type, shift), "");
+   }
+
+   bias_ = lp_build_const_scalar(dst_type, bias);
+   /* OR the integer into the bit pattern of the bias constant */
+   res = LLVMBuildOr(builder,
+                     res,
+                     LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");
+
+   res = LLVMBuildBitCast(builder, res, vec_type, "");
+   /* Take the bias out again and rescale by ubound/mask */
+   res = LLVMBuildSub(builder, res, bias_, "");
+   res = LLVMBuildMul(builder, res, lp_build_const_scalar(dst_type, scale), "");
+
+   return res;
+}
+
+
+/**
+ * Build the shuffle mask that interleaves the low (lo_hi == 0) or high
+ * (lo_hi == 1) halves of two n-element vectors, like PUNPCKLxx/PUNPCKHxx.
+ */
+static LLVMValueRef
+lp_build_const_unpack_shuffle(unsigned n, unsigned lo_hi)
+{
+   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+   unsigned src, dst;
+
+   assert(n <= LP_MAX_VECTOR_LENGTH);
+   assert(lo_hi < 2);
+
+   /* TODO: cache results in a static table */
+   for(dst = 0, src = lo_hi*n/2; dst < n; dst += 2, ++src) {
+      elems[dst] = LLVMConstInt(LLVMInt32Type(), src, 0);
+      elems[dst + 1] = LLVMConstInt(LLVMInt32Type(), n + src, 0);
+   }
+
+   return LLVMConstVector(elems, n);
+}
+
+
+/**
+ * Build the shuffle mask selecting the even-indexed elements (0, 2, 4, ...),
+ * to match the PACKxx instructions.
+ */
+static LLVMValueRef
+lp_build_const_pack_shuffle(unsigned n)
+{
+   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+   unsigned idx;
+
+   assert(n <= LP_MAX_VECTOR_LENGTH);
+
+   /* TODO: cache results in a static table */
+   for(idx = 0; idx < n; ++idx)
+      elems[idx] = LLVMConstInt(LLVMInt32Type(), 2*idx, 0);
+
+   return LLVMConstVector(elems, n);
+}
+
+
+/**
+ * Expand the bit width.
+ *
+ * Every source element is widened to dst_type.width bits by interleaving it
+ * with zeros, doubling the number of vectors at each step. This will only
+ * change the number of bits the values are represented with, not the values
+ * themselves.
+ */
+static void
+lp_build_expand(LLVMBuilderRef builder,
+               struct lp_type src_type,
+               struct lp_type dst_type,
+               LLVMValueRef src,
+               LLVMValueRef *dst, unsigned num_dsts)
+{
+   unsigned count = 1;
+
+   /* Register width must remain constant */
+   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
+
+   /* We must not lose or gain channels. Only precision */
+   assert(src_type.length == dst_type.length * num_dsts);
+
+   /* Start from the single source vector and double up from there */
+   dst[0] = src;
+
+   while(src_type.width < dst_type.width) {
+      struct lp_type wide_type = src_type;
+      LLVMTypeRef wide_vec_type;
+      LLVMValueRef zero;
+      LLVMValueRef lo_mask;
+      LLVMValueRef hi_mask;
+      unsigned k;
+
+      wide_type.width *= 2;
+      wide_type.length /= 2;
+      wide_vec_type = lp_build_vec_type(wide_type);
+
+      /* Constants shared by all the vectors expanded in this step */
+      zero = lp_build_zero(src_type);
+      lo_mask = lp_build_const_unpack_shuffle(src_type.length, 0);
+      hi_mask = lp_build_const_unpack_shuffle(src_type.length, 1);
+
+      /* Walk backwards so that no dst[k] is overwritten before it is read */
+      for(k = count; k-- > 0; ) {
+         /*  PUNPCKLBW, PUNPCKHBW */
+         LLVMValueRef lo = LLVMBuildShuffleVector(builder, dst[k], zero, lo_mask, "");
+         LLVMValueRef hi = LLVMBuildShuffleVector(builder, dst[k], zero, hi_mask, "");
+
+         dst[2*k + 0] = LLVMBuildBitCast(builder, lo, wide_vec_type, "");
+         dst[2*k + 1] = LLVMBuildBitCast(builder, hi, wide_vec_type, "");
+      }
+
+      src_type = wide_type;
+      count *= 2;
+   }
+
+   assert(count == num_dsts);
+}
+
+
+/**
+ * Non-interleaved pack.
+ *
+ * This will move values as follows (__ marks the discarded bits):
+ *
+ *   lo =   __ l0 __ l1 __ l2 __..  __ ln
+ *   hi =   __ h0 __ h1 __ h2 __..  __ hn
+ *   res =  l0 l1 l2 .. ln h0 h1 h2 .. hn
+ *
+ * TODO: handle saturation consistently.
+ */
+static LLVMValueRef
+lp_build_pack2(LLVMBuilderRef builder,
+               struct lp_type src_type,
+               struct lp_type dst_type,
+               boolean clamped,
+               LLVMValueRef lo,
+               LLVMValueRef hi)
+{
+   LLVMTypeRef src_vec_type = lp_build_vec_type(src_type);
+   LLVMTypeRef dst_vec_type = lp_build_vec_type(dst_type);
+   LLVMValueRef shuffle;
+   LLVMValueRef res;
+
+   /* Register width must remain constant */
+   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
+
+   /* We must not lose or gain channels. Only precision */
+   assert(src_type.length * 2 == dst_type.length);
+   /* Only integer types can be packed here */
+   assert(!src_type.floating);
+   assert(!dst_type.floating);
+
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   if(src_type.width * src_type.length == 128) {
+      /* All X86 non-interleaved pack instructions take signed inputs and
+       * saturate them, so saturate unsigned unclamped sources beforehand. */
+      if(!src_type.sign && !clamped) {
+         struct lp_build_context bld;
+         unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
+         LLVMValueRef dst_max = lp_build_int_const_scalar(src_type, ((unsigned long long)1 << dst_bits) - 1);
+         lp_build_context_init(&bld, builder, src_type);
+         lo = lp_build_min(&bld, lo, dst_max);
+         hi = lp_build_min(&bld, hi, dst_max);
+      }
+
+      switch(src_type.width) {
+      case 32:
+         if(dst_type.sign)
+            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", src_vec_type, lo, hi);
+         else
+            /* PACKUSDW is the only intrinsic with a consistent signature */
+            return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi);
+         break;
+
+      case 16:
+         if(dst_type.sign)
+            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", src_vec_type, lo, hi);
+         else
+            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", src_vec_type, lo, hi);
+         break;
+
+      default:
+         assert(0);
+         return LLVMGetUndef(dst_vec_type);
+         break;
+      }
+
+      res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
+      return res;
+   }
+#endif
+   /* Generic path: view both sources as narrow elements, keep every other one */
+   lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
+   hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");
+
+   shuffle = lp_build_const_pack_shuffle(dst_type.length);
+
+   res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");
+
+   return res;
+}
+
+
+/**
+ * Truncate the bit width.
+ *
+ * Packs the num_srcs source vectors pairwise, halving the element width at
+ * each step, until one vector with dst_type.width wide elements remains.
+ * TODO: Handle saturation consistently.
+ */
+static LLVMValueRef
+lp_build_pack(LLVMBuilderRef builder,
+              struct lp_type src_type,
+              struct lp_type dst_type,
+              boolean clamped,
+              const LLVMValueRef *src, unsigned num_srcs)
+{
+   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
+   unsigned pair;
+
+   /* Register width must remain constant */
+   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
+
+   /* We must not lose or gain channels. Only precision */
+   assert(src_type.length * num_srcs == dst_type.length);
+
+   for(pair = 0; pair < num_srcs; ++pair)
+      tmp[pair] = src[pair];
+
+   while(src_type.width > dst_type.width) {
+      struct lp_type half_type = src_type;
+
+      half_type.width = src_type.width / 2;
+      half_type.length = src_type.length * 2;
+
+      /* Only the final step switches to the destination's signedness */
+      if(half_type.width == dst_type.width)
+         half_type.sign = dst_type.sign;
+
+      for(pair = 0; pair < num_srcs / 2; ++pair)
+         tmp[pair] = lp_build_pack2(builder, src_type, half_type, clamped,
+                                    tmp[2*pair], tmp[2*pair + 1]);
+
+      num_srcs /= 2;
+      src_type = half_type;
+   }
+
+   assert(num_srcs == 1);
+   return tmp[0];
+}
+
+
+/**
+ * Generic type conversion of the vectors in src into the vectors in dst.
+ *
+ * TODO: Take a precision argument, or even better, add a new precision member
+ * to the lp_type union.
+ */
+void
+lp_build_conv(LLVMBuilderRef builder,
+              struct lp_type src_type,
+              struct lp_type dst_type,
+              const LLVMValueRef *src, unsigned num_srcs,
+              LLVMValueRef *dst, unsigned num_dsts)
+{
+   struct lp_type tmp_type;
+   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
+   unsigned num_tmps;
+   unsigned i;
+
+   /* Register width must remain constant */
+   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
+
+   /* We must not lose or gain channels. Only precision */
+   assert(src_type.length * num_srcs == dst_type.length * num_dsts);
+
+   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
+   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
+   /* Work on a private copy; tmp_type tracks the type of the values in tmp */
+   tmp_type = src_type;
+   for(i = 0; i < num_srcs; ++i)
+      tmp[i] = src[i];
+   num_tmps = num_srcs;
+
+   /*
+    * Clamp if necessary
+    */
+
+   if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
+      struct lp_build_context bld;
+      double src_min = lp_const_min(src_type);
+      double dst_min = lp_const_min(dst_type);
+      double src_max = lp_const_max(src_type);
+      double dst_max = lp_const_max(dst_type);
+      LLVMValueRef thres;
+
+      lp_build_context_init(&bld, builder, tmp_type);
+
+      if(src_min < dst_min) {
+         if(dst_min == 0.0)
+            thres = bld.zero;
+         else
+            thres = lp_build_const_scalar(src_type, dst_min);
+         for(i = 0; i < num_tmps; ++i)
+            tmp[i] = lp_build_max(&bld, tmp[i], thres);
+      }
+
+      if(src_max > dst_max) {
+         if(dst_max == 1.0)
+            thres = bld.one;
+         else
+            thres = lp_build_const_scalar(src_type, dst_max);
+         for(i = 0; i < num_tmps; ++i)
+            tmp[i] = lp_build_min(&bld, tmp[i], thres);
+      }
+   }
+
+   /*
+    * Scale to the narrowest range
+    */
+
+   if(dst_type.floating) {
+      /* Nothing to do */
+   }
+   else if(tmp_type.floating) {
+      if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
+         for(i = 0; i < num_tmps; ++i) {
+            tmp[i] = lp_build_clamped_float_to_unsigned_norm(builder,
+                                                             tmp_type,
+                                                             dst_type.width,
+                                                             tmp[i]);
+         }
+         tmp_type.floating = FALSE;
+      }
+      else {
+         double dst_scale = lp_const_scale(dst_type);
+         LLVMTypeRef tmp_vec_type;
+
+         if (dst_scale != 1.0) {
+            LLVMValueRef scale = lp_build_const_scalar(tmp_type, dst_scale);
+            for(i = 0; i < num_tmps; ++i)
+               tmp[i] = LLVMBuildMul(builder, tmp[i], scale, "");
+         }
+
+         /* Use an equally sized integer for intermediate computations */
+         tmp_type.floating = FALSE;
+         tmp_vec_type = lp_build_vec_type(tmp_type);
+         for(i = 0; i < num_tmps; ++i) {
+#if 0
+            if(dst_type.sign)
+               tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
+            else
+               tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
+#else
+           /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
+            tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
+#endif
+         }
+      }
+   }
+   else {
+      unsigned src_shift = lp_const_shift(src_type);
+      unsigned dst_shift = lp_const_shift(dst_type);
+
+      /* FIXME: compensate different offsets too */
+      if(src_shift > dst_shift) {
+         LLVMValueRef shift = lp_build_int_const_scalar(tmp_type, src_shift - dst_shift);
+         for(i = 0; i < num_tmps; ++i)
+            if(src_type.sign)
+               tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, "");
+            else
+               tmp[i] = LLVMBuildLShr(builder, tmp[i], shift, "");
+      }
+   }
+
+   /*
+    * Truncate or expand bit width
+    */
+
+   assert(!tmp_type.floating || tmp_type.width == dst_type.width);
+
+   if(tmp_type.width > dst_type.width) {
+      assert(num_dsts == 1);
+      tmp[0] = lp_build_pack(builder, tmp_type, dst_type, TRUE, tmp, num_tmps);
+      tmp_type.width = dst_type.width;
+      tmp_type.length = dst_type.length;
+      num_tmps = 1;
+   }
+
+   if(tmp_type.width < dst_type.width) {
+      assert(num_tmps == 1);
+      lp_build_expand(builder, tmp_type, dst_type, tmp[0], tmp, num_dsts);
+      tmp_type.width = dst_type.width;
+      tmp_type.length = dst_type.length;
+      num_tmps = num_dsts;
+   }
+
+   assert(tmp_type.width == dst_type.width);
+   assert(tmp_type.length == dst_type.length);
+   assert(num_tmps == num_dsts);
+
+   /*
+    * Scale to the widest range
+    */
+
+   if(src_type.floating) {
+      /* Nothing to do */
+   }
+   else if(!src_type.floating && dst_type.floating) {
+      if(!src_type.fixed && !src_type.sign && src_type.norm) {
+         for(i = 0; i < num_tmps; ++i) {
+            tmp[i] = lp_build_unsigned_norm_to_float(builder,
+                                                     src_type.width,
+                                                     dst_type,
+                                                     tmp[i]);
+         }
+         tmp_type.floating = TRUE;
+      }
+      else {
+         double src_scale = lp_const_scale(src_type);
+         LLVMTypeRef tmp_vec_type;
+
+         /* Use an equally sized signed float for intermediate computations */
+         tmp_type.floating = TRUE;
+         tmp_type.sign = TRUE;
+         tmp_vec_type = lp_build_vec_type(tmp_type);
+         for(i = 0; i < num_tmps; ++i) {
+#if 0
+            if(dst_type.sign)
+               tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
+            else
+               tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
+#else
+            /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
+            tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
+#endif
+          }
+
+          if (src_scale != 1.0) {
+             LLVMValueRef scale = lp_build_const_scalar(tmp_type, 1.0/src_scale);
+             for(i = 0; i < num_tmps; ++i)
+                tmp[i] = LLVMBuildMul(builder, tmp[i], scale, "");
+          }
+      }
+    }
+    else {
+       unsigned src_shift = lp_const_shift(src_type);
+       unsigned dst_shift = lp_const_shift(dst_type);
+
+       /* FIXME: compensate different offsets too */
+       if(src_shift < dst_shift) {
+          LLVMValueRef shift = lp_build_int_const_scalar(tmp_type, dst_shift - src_shift);
+          for(i = 0; i < num_tmps; ++i)
+             tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
+       }
+    }
+
+   for(i = 0; i < num_dsts; ++i)
+      dst[i] = tmp[i];
+}
+
+
+/**
+ * Bit mask conversion.
+ *
+ * Converts integer masks laid out as src_type into the layout of dst_type,
+ * changing only the bit width of the elements.
+ *
+ * The mask values should be 0 or -1, i.e., all bits either set to zero or one.
+ * Any other value will likely cause unpredictable results.
+ *
+ * This is basically a very trimmed down version of lp_build_conv.
+ */
+void
+lp_build_conv_mask(LLVMBuilderRef builder,
+                   struct lp_type src_type,
+                   struct lp_type dst_type,
+                   const LLVMValueRef *src, unsigned num_srcs,
+                   LLVMValueRef *dst, unsigned num_dsts)
+{
+   /* Register width must remain constant */
+   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
+
+   /* We must not lose or gain channels. Only precision */
+   assert(src_type.length * num_srcs == dst_type.length * num_dsts);
+
+   /*
+    * As all values are assumed to be 0 or -1, ignore what the types say and
+    * treat both sides as plain signed integers.
+    */
+
+   src_type.floating = FALSE;
+   dst_type.floating = FALSE;
+   src_type.fixed = FALSE;
+   dst_type.fixed = FALSE;
+   src_type.norm = FALSE;
+   dst_type.norm = FALSE;
+   src_type.sign = TRUE;
+   dst_type.sign = TRUE;
+
+   /*
+    * Truncate or expand bit width
+    */
+
+   if(src_type.width == dst_type.width) {
+      /* Same width: nothing to convert, just hand the masks over */
+      assert(num_srcs == num_dsts);
+      memcpy(dst, src, num_dsts * sizeof *dst);
+   }
+   else if(src_type.width > dst_type.width) {
+      assert(num_dsts == 1);
+      dst[0] = lp_build_pack(builder, src_type, dst_type, TRUE, src, num_srcs);
+   }
+   else {
+      assert(num_srcs == 1);
+      lp_build_expand(builder, src_type, dst_type, src[0], dst, num_dsts);
+   }
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_conv.h b/src/gallium/drivers/llvmpipe/lp_bld_conv.h
new file mode 100644
index 0000000000..ca378804d2
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_conv.h
@@ -0,0 +1,73 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Helper functions for type conversions.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#ifndef LP_BLD_CONV_H
+#define LP_BLD_CONV_H
+
+
+#include <llvm-c/Core.h>  
+
+
+struct lp_type;   /* forward declaration only; no object is defined here */
+
+
+LLVMValueRef
+lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
+                                        struct lp_type src_type,
+                                        unsigned dst_width,
+                                        LLVMValueRef src);
+
+LLVMValueRef
+lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
+                                unsigned src_width,
+                                struct lp_type dst_type,
+                                LLVMValueRef src);
+
+
+void
+lp_build_conv(LLVMBuilderRef builder,
+              struct lp_type src_type,
+              struct lp_type dst_type,
+              const LLVMValueRef *srcs, unsigned num_srcs,
+              LLVMValueRef *dsts, unsigned num_dsts);
+
+void
+lp_build_conv_mask(LLVMBuilderRef builder,
+                   struct lp_type src_type,
+                   struct lp_type dst_type,
+                   const LLVMValueRef *src, unsigned num_srcs,
+                   LLVMValueRef *dst, unsigned num_dsts);
+
+#endif /* !LP_BLD_CONV_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_debug.c b/src/gallium/drivers/llvmpipe/lp_bld_debug.c
new file mode 100644
index 0000000000..59d8f492e6
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_debug.c
@@ -0,0 +1,125 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#ifdef HAVE_UDIS86
+#include <udis86.h>
+#endif
+
+#include "util/u_math.h"
+#include "util/u_debug.h"
+#include "lp_bld_debug.h"
+
+
+/**
+ * Check alignment.
+ *
+ * Returns non-zero when ptr is a multiple of alignment, which must be a
+ * power of two. This has to remain a real out-of-line function: as a macro
+ * or inline function the compiler's own assumptions about the alignment of
+ * global and stack variables would often fold the check away.
+ */
+boolean
+lp_check_alignment(const void *ptr, unsigned alignment)
+{
+   assert(util_is_pot(alignment));
+   return !((uintptr_t)ptr & (alignment - 1));
+}
+
+/** Print a disassembly of the code at func; no-op without HAVE_UDIS86. */
+void
+lp_disassemble(const void* func)
+{
+#ifdef HAVE_UDIS86
+   ud_t ud_obj;
+   uint64_t max_jmp_pc;
+
+   ud_init(&ud_obj);
+
+   ud_set_input_buffer(&ud_obj, (void*)func, 0xffff);
+   /* Highest jump target seen so far; starts at the function's own address */
+   max_jmp_pc = (uint64_t) (uintptr_t) func;
+   ud_set_pc(&ud_obj, max_jmp_pc);
+
+#ifdef PIPE_ARCH_X86
+   ud_set_mode(&ud_obj, 32);
+#endif
+#ifdef PIPE_ARCH_X86_64
+   ud_set_mode(&ud_obj, 64);
+#endif
+
+   ud_set_syntax(&ud_obj, UD_SYN_ATT);
+
+   while (ud_disassemble(&ud_obj)) {
+
+#ifdef PIPE_ARCH_X86
+      debug_printf("%08lx: ", (unsigned long)ud_insn_off(&ud_obj));
+#endif
+#ifdef PIPE_ARCH_X86_64
+      debug_printf("%016llx: ", (unsigned long long)ud_insn_off(&ud_obj));
+#endif
+
+#if 0
+      debug_printf("%-16s ", ud_insn_hex(&ud_obj));
+#endif
+
+      debug_printf("%s\n", ud_insn_asm(&ud_obj));
+      /* Record the furthest target of any jump operand (calls excluded) */
+      if(ud_obj.mnemonic != UD_Icall) {
+         unsigned i;
+         for(i = 0; i < 3; ++i) {
+            const struct ud_operand *op = &ud_obj.operand[i];
+            if (op->type == UD_OP_JIMM){
+               uint64_t pc = ud_obj.pc;
+
+               switch (op->size) {
+               case 8:
+                  pc += op->lval.sbyte;
+                  break;
+               case 16:
+                  pc += op->lval.sword;
+                  break;
+               case 32:
+                  pc += op->lval.sdword;
+                  break;
+               default:
+                  break;
+               }
+               if(pc > max_jmp_pc)
+                  max_jmp_pc = pc;
+            }
+         }
+      }
+      /* Stop at a ret located at or beyond every recorded jump target */
+      if (ud_insn_off(&ud_obj) >= max_jmp_pc && ud_obj.mnemonic == UD_Iret)
+         break;
+   }
+   debug_printf("\n");
+#else
+   (void)func;
+#endif
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_debug.h b/src/gallium/drivers/llvmpipe/lp_bld_debug.h
new file mode 100644
index 0000000000..583e6132b4
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_debug.h
@@ -0,0 +1,64 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#ifndef LP_BLD_DEBUG_H
+#define LP_BLD_DEBUG_H
+
+
+#include <llvm-c/Core.h>
+
+#include "pipe/p_compiler.h"
+#include "util/u_string.h"
+
+
+/** Give val a printf-style formatted name; does nothing unless DEBUG is set. */
+static INLINE void
+lp_build_name(LLVMValueRef val, const char *format, ...)
+{
+#ifndef DEBUG
+   (void)format;
+   (void)val;
+#else
+   char buf[32];
+   va_list args;
+   va_start(args, format);
+   util_vsnprintf(buf, sizeof buf, format, args);
+   va_end(args);
+   LLVMSetValueName(val, buf);
+#endif
+}
+
+boolean
+lp_check_alignment(const void *ptr, unsigned alignment);
+
+
+void
+lp_disassemble(const void* func);
+
+
+#endif /* !LP_BLD_DEBUG_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
new file mode 100644
index 0000000000..21c665c4d4
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -0,0 +1,216 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Depth/stencil testing to LLVM IR translation.
+ *
+ * To be done accurately/efficiently the depth/stencil test must be done with
+ * the same type/format of the depth/stencil buffer, which implies massaging
+ * the incoming depths to fit into place. Using a more straightforward
+ * type/format for depth/stencil values internally and only convert when
+ * flushing would avoid this, but it would most likely result in depth fighting
+ * artifacts.
+ *
+ * We are free to use a different pixel layout though. Since our basic
+ * processing unit is a quad (2x2 pixel block) we store the depth/stencil
+ * values tiled, a quad at a time. That is, a depth buffer containing
+ *
+ *  Z11 Z12 Z13 Z14 ...
+ *  Z21 Z22 Z23 Z24 ...
+ *  Z31 Z32 Z33 Z34 ...
+ *  Z41 Z42 Z43 Z44 ...
+ *  ... ... ... ... ...
+ *
+ * will actually be stored in memory as
+ *
+ *  Z11 Z12 Z21 Z22 Z13 Z14 Z23 Z24 ...
+ *  Z31 Z32 Z41 Z42 Z33 Z34 Z43 Z44 ...
+ *  ... ... ... ... ... ... ... ... ...
+ *
+ * FIXME: Code generate stencil test
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#include "pipe/p_state.h"
+#include "util/u_format.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_flow.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_depth.h"
+
+
+/**
+ * Return a type appropriate for depth/stencil testing: each element is
+ * block.bits wide and the vector holds length / block.bits elements.
+ */
+struct lp_type
+lp_depth_type(const struct util_format_description *format_desc,
+              unsigned length)
+{
+   struct lp_type type;
+   unsigned swizzle;
+
+   assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
+   assert(format_desc->block.width == 1);
+   assert(format_desc->block.height == 1);
+
+   swizzle = format_desc->swizzle[0];
+   assert(swizzle < 4);
+
+   memset(&type, 0, sizeof type);
+   type.width = format_desc->block.bits;
+
+   if(format_desc->channel[swizzle].type == UTIL_FORMAT_TYPE_FLOAT) {
+      type.floating = TRUE;
+      assert(swizzle == 0);   /* a float channel must fill the whole block */
+      assert(format_desc->channel[swizzle].size == format_desc->block.bits);
+   }
+   else if(format_desc->channel[swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED) {
+      assert(format_desc->block.bits <= 32);
+      if(format_desc->channel[swizzle].normalized)
+         type.norm = TRUE;
+   }
+   else
+      assert(0);
+
+   assert(type.width <= length);
+   type.length = length / type.width;
+   return type;
+}
+
+
+/**
+ * Depth test.
+ *
+ * Compares src with the values loaded from dst_ptr using state->func, folds
+ * the result into mask and, if state->writemask is set, stores to dst_ptr.
+ */
+void
+lp_build_depth_test(LLVMBuilderRef builder,
+                    const struct pipe_depth_state *state,
+                    struct lp_type type,
+                    const struct util_format_description *format_desc,
+                    struct lp_build_mask_context *mask,
+                    LLVMValueRef src,
+                    LLVMValueRef dst_ptr)
+{
+   struct lp_build_context bld;
+   unsigned z_swizzle;
+   LLVMValueRef dst;
+   LLVMValueRef z_bitmask = NULL;
+   LLVMValueRef test;
+
+   if(!state->enabled)
+      return;
+
+   assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
+   assert(format_desc->block.width == 1);
+   assert(format_desc->block.height == 1);
+
+   z_swizzle = format_desc->swizzle[0];
+   if(z_swizzle == UTIL_FORMAT_SWIZZLE_NONE)
+      return;
+
+   /* Sanity checking */
+   assert(z_swizzle < 4);
+   assert(format_desc->block.bits == type.width);
+   if(type.floating) {
+      assert(z_swizzle == 0);
+      assert(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_FLOAT);
+      assert(format_desc->channel[z_swizzle].size == format_desc->block.bits);
+   }
+   else {
+      assert(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED);
+      assert(format_desc->channel[z_swizzle].normalized);
+      assert(!type.fixed);
+      assert(!type.sign);
+      assert(type.norm);
+   }
+
+   /* Setup build context */
+   lp_build_context_init(&bld, builder, type);
+
+   dst = LLVMBuildLoad(builder, dst_ptr, "");
+   lp_build_name(dst, "zsbuf");
+
+   /* Align the source depth bits with the destination's, and mask out any
+    * stencil or padding bits from both */
+   if(format_desc->channel[z_swizzle].size == format_desc->block.bits) {
+      assert(z_swizzle == 0);
+      /* nothing to do */
+   }
+   else {
+      unsigned padding_left;
+      unsigned padding_right;
+      unsigned chan;
+
+      assert(format_desc->layout == UTIL_FORMAT_LAYOUT_ARITH);
+      assert(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED);
+      assert(format_desc->channel[z_swizzle].size <= format_desc->block.bits);
+      assert(format_desc->channel[z_swizzle].normalized);
+      /* Bits taken by the channels before Z, then whatever is left above Z */
+      padding_right = 0;
+      for(chan = 0; chan < z_swizzle; ++chan)
+         padding_right += format_desc->channel[chan].size;
+      padding_left = format_desc->block.bits -
+                     (padding_right + format_desc->channel[z_swizzle].size);
+
+      if(padding_left || padding_right) {
+         const unsigned long long mask_left = ((unsigned long long)1 << (format_desc->block.bits - padding_left)) - 1;
+         const unsigned long long mask_right = ((unsigned long long)1 << (padding_right)) - 1;
+         z_bitmask = lp_build_int_const_scalar(type, mask_left ^ mask_right);
+      }
+
+      if(padding_left)
+         src = LLVMBuildLShr(builder, src, lp_build_int_const_scalar(type, padding_left), "");
+      if(padding_right)
+         src = LLVMBuildAnd(builder, src, z_bitmask, "");
+      if(padding_left || padding_right)
+         dst = LLVMBuildAnd(builder, dst, z_bitmask, "");
+   }
+   lp_build_name(dst, "zsbuf.z");
+
+   test = lp_build_cmp(&bld, state->func, src, dst);
+   lp_build_mask_update(mask, test);
+
+   if(state->writemask) {
+      if(z_bitmask)
+         z_bitmask = LLVMBuildAnd(builder, mask->value, z_bitmask, "");
+      else
+         z_bitmask = mask->value;
+
+      dst = lp_build_select(&bld, z_bitmask, src, dst);
+      LLVMBuildStore(builder, dst, dst_ptr);
+   }
+   /* FIXME */
+   assert(!state->occlusion_count);
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.h b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
new file mode 100644
index 0000000000..79d6981bb5
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
@@ -0,0 +1,63 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * Depth/stencil testing to LLVM IR translation.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#ifndef LP_BLD_DEPTH_H
+#define LP_BLD_DEPTH_H
+
+
+#include <llvm-c/Core.h>  
+
+ 
+struct pipe_depth_state;
+struct util_format_description;
+struct lp_type;
+struct lp_build_mask_context;
+
+
+struct lp_type
+lp_depth_type(const struct util_format_description *format_desc,
+              unsigned length);
+
+
+void
+lp_build_depth_test(LLVMBuilderRef builder,
+                    const struct pipe_depth_state *state,
+                    struct lp_type type,
+                    const struct util_format_description *format_desc,
+                    struct lp_build_mask_context *mask,
+                    LLVMValueRef src,
+                    LLVMValueRef dst_ptr);
+
+
+#endif /* !LP_BLD_DEPTH_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_flow.c b/src/gallium/drivers/llvmpipe/lp_bld_flow.c
new file mode 100644
index 0000000000..dcc25fbff8
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_flow.c
@@ -0,0 +1,493 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * LLVM control flow build helpers.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#include "util/u_debug.h"
+#include "util/u_memory.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_flow.h"
+
+
+#define LP_BUILD_FLOW_MAX_VARIABLES 32
+#define LP_BUILD_FLOW_MAX_DEPTH 32
+
+
+/**
+ * Enumeration of all possible flow constructs.
+ */
+enum lp_build_flow_construct_kind {
+   lP_BUILD_FLOW_SCOPE,  /* variable scope; NOTE(review): lowercase 'l' looks like a typo */
+   LP_BUILD_FLOW_SKIP,   /* early exit (skip) construct */
+};
+
+
+/**
+ * Variable declaration scope, opened by lp_build_flow_scope_begin() and
+ * closed by lp_build_flow_scope_end().
+ */
+struct lp_build_flow_scope
+{
+   /** Number of variables declared in this scope */
+   unsigned num_variables;
+};
+
+/**
+ * Early exit. Useful to skip to the end of a function or block when
+ * the execution mask becomes zero or when there is an error condition.
+ */
+struct lp_build_flow_skip
+{
+   /** Block to skip to */
+   LLVMBasicBlockRef block;
+
+   /** Number of variables declared at the beginning */
+   unsigned num_variables;
+   /** One phi node per variable, created in block; NULL if there are none */
+   LLVMValueRef *phi;
+};
+
+
+/**
+ * Union of all possible flow constructs' data
+ */
+union lp_build_flow_construct_data
+{
+   struct lp_build_flow_scope scope;   /* for lP_BUILD_FLOW_SCOPE constructs */
+   struct lp_build_flow_skip skip;     /* for LP_BUILD_FLOW_SKIP constructs */
+};
+
+
+/**
+ * Element of the flow construct stack.
+ */
+struct lp_build_flow_construct
+{
+   enum lp_build_flow_construct_kind kind;   /* which member of data is in use */
+   union lp_build_flow_construct_data data;
+};
+
+
+/**
+ * All necessary data to generate LLVM control flow constructs.
+ *
+ * Besides keeping track of the control flow construct themselves we also
+ * need to keep track of variables in order to generate SSA Phi values.
+ */
+struct lp_build_flow_context
+{
+   LLVMBuilderRef builder;   /* builder all flow instructions are emitted with */
+
+   /**
+    * Control flow stack.
+    */
+   struct lp_build_flow_construct constructs[LP_BUILD_FLOW_MAX_DEPTH];
+   unsigned num_constructs;   /* number of entries currently on the stack */
+
+   /**
+    * Variable stack
+    */
+   LLVMValueRef *variables[LP_BUILD_FLOW_MAX_VARIABLES];   /* where each variable's current value is kept */
+   unsigned num_variables;   /* number of entries currently on the stack */
+};
+
+
+/**
+ * Allocate an empty flow context bound to builder; NULL if out of memory.
+ */
+struct lp_build_flow_context *
+lp_build_flow_create(LLVMBuilderRef builder)
+{
+   struct lp_build_flow_context *ctx = CALLOC_STRUCT(lp_build_flow_context);
+
+   if(ctx != NULL)
+      ctx->builder = builder;
+
+   return ctx;
+}
+
+
+/* Free a flow context; all constructs and variables must be closed by now. */
+void
+lp_build_flow_destroy(struct lp_build_flow_context *flow)
+{
+   assert(!flow->num_constructs && !flow->num_variables);
+   FREE(flow);
+}
+
+
+/* Push a construct of the given kind; NULL when the stack is already full. */
+static union lp_build_flow_construct_data *
+lp_build_flow_push(struct lp_build_flow_context *flow,
+                   enum lp_build_flow_construct_kind kind)
+{
+   const unsigned top = flow->num_constructs;
+   assert(top < LP_BUILD_FLOW_MAX_DEPTH);
+   if(top >= LP_BUILD_FLOW_MAX_DEPTH)
+      return NULL;
+   flow->constructs[top].kind = kind;
+   return &flow->constructs[flow->num_constructs++].data;
+}
+
+/* Data of the topmost construct; NULL if the stack is empty or kind differs. */
+static union lp_build_flow_construct_data *
+lp_build_flow_peek(struct lp_build_flow_context *flow,
+                   enum lp_build_flow_construct_kind kind)
+{
+   struct lp_build_flow_construct *top;
+   assert(flow->num_constructs);
+   if(flow->num_constructs == 0)
+      return NULL;
+   top = &flow->constructs[flow->num_constructs - 1];
+   assert(top->kind == kind);
+   if(top->kind != kind)
+      return NULL;
+   return &top->data;
+}
+
+/* Like peek, but also removes the construct from the stack on success. */
+static union lp_build_flow_construct_data *
+lp_build_flow_pop(struct lp_build_flow_context *flow,
+                  enum lp_build_flow_construct_kind kind)
+{
+   const unsigned depth = flow->num_constructs;
+   assert(depth);
+   if(depth == 0)
+      return NULL;
+   assert(flow->constructs[depth - 1].kind == kind);
+   if(flow->constructs[depth - 1].kind != kind)
+      return NULL;
+   flow->num_constructs = depth - 1;
+   return &flow->constructs[depth - 1].data;
+}
+
+/**
+ * Begin a variable scope.
+ *
+ * Variables declared before the matching scope end belong to this scope.
+ */
+void
+lp_build_flow_scope_begin(struct lp_build_flow_context *flow)
+{
+   union lp_build_flow_construct_data *data;
+
+   data = lp_build_flow_push(flow, lP_BUILD_FLOW_SCOPE);
+   if(!data)
+      return;
+
+   data->scope.num_variables = 0;
+}
+
+
+/**
+ * Declare a variable.
+ *
+ * A variable is a named entity which can have different LLVMValueRef's at
+ * different points of the program. This is relevant for control flow because
+ * when there are multiple branches to a same location we need to replace
+ * the variable's value with a Phi function as explained in
+ * http://en.wikipedia.org/wiki/Static_single_assignment_form .
+ *
+ * We keep track of variables by keeping around a pointer to where their
+ * current value is stored.
+ *
+ * There are a few cautions to observe:
+ *
+ * - Variable's value must not be NULL. If there is no initial value then
+ *   LLVMGetUndef() should be used.
+ *
+ * - Variable's value must be kept up-to-date. If the variable is going to be
+ *   modified by a function then a pointer should be passed so that its value
+ *   is accurate. Failure to do this will cause some of the variables'
+ *   transient values to be lost, leading to wrong results.
+ *
+ * - A program should be written from top to bottom, by always appending
+ *   instructions to the bottom with a single LLVMBuilderRef. Inserting and/or
+ *   modifying existing statements will most likely lead to wrong results.
+ *
+ */
+void
+lp_build_flow_scope_declare(struct lp_build_flow_context *flow,
+                            LLVMValueRef *variable)
+{
+   union lp_build_flow_construct_data *data;
+
+   data = lp_build_flow_peek(flow, lP_BUILD_FLOW_SCOPE);
+   if(!data)
+      return;
+
+   assert(*variable);
+   if(!*variable)
+      return;
+
+   assert(flow->num_variables < LP_BUILD_FLOW_MAX_VARIABLES);
+   if(flow->num_variables >= LP_BUILD_FLOW_MAX_VARIABLES)
+      return;
+
+   flow->variables[flow->num_variables++] = variable;
+   ++data->scope.num_variables;
+}
+
+
+/** End the innermost variable scope, dropping the variables it declared. */
+void
+lp_build_flow_scope_end(struct lp_build_flow_context *flow)
+{
+   union lp_build_flow_construct_data *data;
+
+   data = lp_build_flow_pop(flow, lP_BUILD_FLOW_SCOPE);
+   if(!data)
+      return;
+
+   assert(flow->num_variables >= data->scope.num_variables);
+   if(flow->num_variables < data->scope.num_variables) {
+      flow->num_variables = 0;
+      return;
+   }
+
+   flow->num_variables -= data->scope.num_variables;
+}
+
+/*
+ * Create an unnamed basic block positioned right after the builder's
+ * current block: inserted in front of the following block when there is
+ * one, appended to the end of the function otherwise.
+ */
+static LLVMBasicBlockRef
+lp_build_flow_insert_block(struct lp_build_flow_context *flow)
+{
+   LLVMBasicBlockRef here = LLVMGetInsertBlock(flow->builder);
+   LLVMBasicBlockRef following = LLVMGetNextBasicBlock(here);
+   LLVMValueRef parent;
+
+   /* Something follows: slot the new block in front of it */
+   if(following != NULL)
+      return LLVMInsertBasicBlock(following, "");
+
+   /* The current block is the last one of its function */
+   parent = LLVMGetBasicBlockParent(here);
+   return LLVMAppendBasicBlock(parent, "");
+}
+
+void
+lp_build_flow_skip_begin(struct lp_build_flow_context *flow)
+{
+   union lp_build_flow_construct_data *data;
+   struct lp_build_flow_skip *skip;
+   LLVMBuilderRef builder;
+   unsigned i;
+
+   data = lp_build_flow_push(flow, LP_BUILD_FLOW_SKIP);
+   if(!data)
+      return;
+   skip = &data->skip;
+
+   /* Open a skip construct; breaks and its end all branch to skip->block */
+   skip->block = lp_build_flow_insert_block(flow);
+   skip->num_variables = flow->num_variables;
+   if(!skip->num_variables) {
+      skip->phi = NULL;
+      return;
+   }
+
+   skip->phi = MALLOC(skip->num_variables * sizeof *skip->phi);
+   if(!skip->phi) {
+      skip->num_variables = 0;
+      return;
+   }
+   /* One phi per declared variable, emitted with a temporary builder */
+   builder = LLVMCreateBuilder();
+   LLVMPositionBuilderAtEnd(builder, skip->block);
+   for(i = 0; i < skip->num_variables; ++i)
+      skip->phi[i] = LLVMBuildPhi(builder, LLVMTypeOf(*flow->variables[i]), "");
+   LLVMDisposeBuilder(builder);
+}
+
+/** Branch to the innermost skip construct's target block when cond holds. */
+void
+lp_build_flow_skip_cond_break(struct lp_build_flow_context *flow,
+                              LLVMValueRef cond)
+{
+   union lp_build_flow_construct_data *data;
+   struct lp_build_flow_skip *skip;
+   LLVMBasicBlockRef current_block;
+   LLVMBasicBlockRef new_block;
+   unsigned i;
+
+   data = lp_build_flow_peek(flow, LP_BUILD_FLOW_SKIP);
+   if(!data)
+      return;
+   skip = &data->skip;
+   current_block = LLVMGetInsertBlock(flow->builder);
+   new_block = lp_build_flow_insert_block(flow);
+
+   /* Tell each phi which value its variable has when leaving from here */
+   for(i = 0; i < skip->num_variables; ++i) {
+      assert(*flow->variables[i]);
+      LLVMAddIncoming(skip->phi[i], flow->variables[i], &current_block, 1);
+   }
+   /* Otherwise execution simply carries on in the freshly inserted block */
+   LLVMBuildCondBr(flow->builder, cond, skip->block, new_block);
+   LLVMPositionBuilderAtEnd(flow->builder, new_block);
+}
+
+/** Close the innermost skip construct and continue in its target block. */
+void
+lp_build_flow_skip_end(struct lp_build_flow_context *flow)
+{
+   union lp_build_flow_construct_data *data;
+   struct lp_build_flow_skip *skip;
+   LLVMBasicBlockRef current_block;
+   unsigned i;
+
+   data = lp_build_flow_pop(flow, LP_BUILD_FLOW_SKIP);
+   if(!data)
+      return;
+   skip = &data->skip;
+   current_block = LLVMGetInsertBlock(flow->builder);
+   /* Add the fall-through edge; from now on each variable is its phi */
+   for(i = 0; i < skip->num_variables; ++i) {
+      assert(*flow->variables[i]);
+      LLVMAddIncoming(skip->phi[i], flow->variables[i], &current_block, 1);
+      *flow->variables[i] = skip->phi[i];
+   }
+
+   LLVMBuildBr(flow->builder, skip->block);
+   LLVMPositionBuilderAtEnd(flow->builder, skip->block);
+   FREE(skip->phi);
+}
+
+/* Emit a break out of the mask's skip construct for an all-zero mask. */
+static void
+lp_build_mask_check(struct lp_build_mask_context *mask)
+{
+   LLVMBuilderRef builder = mask->flow->builder;
+   LLVMValueRef as_int;
+   LLVMValueRef all_zero;
+
+   as_int = LLVMBuildBitCast(builder, mask->value, mask->reg_type, "");
+   all_zero = LLVMBuildICmp(builder, LLVMIntEQ, as_int,
+                            LLVMConstNull(mask->reg_type), "");
+
+   lp_build_flow_skip_cond_break(mask->flow, all_zero);
+}
+
+
+/** Initialise mask with value and open the scope/skip constructs it uses. */
+void
+lp_build_mask_begin(struct lp_build_mask_context *mask,
+                    struct lp_build_flow_context *flow,
+                    struct lp_type type,
+                    LLVMValueRef value)
+{
+   const unsigned reg_bits = type.width * type.length;
+   memset(mask, 0, sizeof *mask);
+   mask->value = value;
+   mask->reg_type = LLVMIntType(reg_bits);
+   mask->flow = flow;
+
+   /* The mask value is a flow variable, so skips can merge it with a phi */
+   lp_build_flow_scope_begin(flow);
+   lp_build_flow_scope_declare(flow, &mask->value);
+   lp_build_flow_skip_begin(flow);
+   lp_build_mask_check(mask);
+}
+
+/** AND value into the mask, then emit the all-zero early-exit check. */
+void
+lp_build_mask_update(struct lp_build_mask_context *mask,
+                     LLVMValueRef value)
+{
+   LLVMValueRef live = LLVMBuildAnd(mask->flow->builder, mask->value, value, "");
+   mask->value = live;
+   lp_build_mask_check(mask);
+}
+
+/** Close the mask's skip and scope constructs; returns the final mask. */
+LLVMValueRef
+lp_build_mask_end(struct lp_build_mask_context *mask)
+{
+   struct lp_build_flow_context *flow = mask->flow;
+   lp_build_flow_skip_end(flow);   /* mask->value becomes the merged phi */
+   lp_build_flow_scope_end(flow);
+   return mask->value;
+}
+
+/** Open a counted loop: creates the loop block and its counter phi. */
+void
+lp_build_loop_begin(LLVMBuilderRef builder,
+                    LLVMValueRef start,
+                    struct lp_build_loop_state *state)
+{
+   LLVMBasicBlockRef entry = LLVMGetInsertBlock(builder);
+   LLVMValueRef parent = LLVMGetBasicBlockParent(entry);
+
+   /* New "loop" block at the end of the current function */
+   state->block = LLVMAppendBasicBlock(parent, "loop");
+
+   /* Fall into the loop and keep emitting code there */
+   LLVMBuildBr(builder, state->block);
+   LLVMPositionBuilderAtEnd(builder, state->block);
+
+   /* Counter phi; its first incoming value is start, from the entry block */
+   state->counter = LLVMBuildPhi(builder, LLVMTypeOf(start), "");
+   LLVMAddIncoming(state->counter, &start, &entry, 1);
+}
+
+/**
+ * Close a loop opened with lp_build_loop_begin(): advance the counter by
+ * step (1 when step is NULL) and iterate again until it equals end.
+ */
+void
+lp_build_loop_end(LLVMBuilderRef builder,
+                  LLVMValueRef end,
+                  LLVMValueRef step,
+                  struct lp_build_loop_state *state)
+{
+   LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
+   LLVMValueRef function = LLVMGetBasicBlockParent(block);
+   LLVMValueRef next;
+   LLVMValueRef cond;
+   LLVMBasicBlockRef after_block;
+
+   if (!step)
+      step = LLVMConstInt(LLVMTypeOf(end), 1, 0);
+   next = LLVMBuildAdd(builder, state->counter, step, "");
+   cond = LLVMBuildICmp(builder, LLVMIntNE, next, end, "");
+   after_block = LLVMAppendBasicBlock(function, "");
+
+   /* Loop again while next != end; fall out to after_block once equal */
+   LLVMBuildCondBr(builder, cond, state->block, after_block);
+   LLVMAddIncoming(state->counter, &next, &block, 1);
+   LLVMPositionBuilderAtEnd(builder, after_block);
+}
+
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_flow.h b/src/gallium/drivers/llvmpipe/lp_bld_flow.h
new file mode 100644
index 0000000000..e61999ff06
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_flow.h
@@ -0,0 +1,129 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * LLVM control flow build helpers.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#ifndef LP_BLD_FLOW_H
+#define LP_BLD_FLOW_H
+
+
+#include <llvm-c/Core.h>  
+
+
+struct lp_type;
+
+
+struct lp_build_flow_context;
+
+
+struct lp_build_flow_context *
+lp_build_flow_create(LLVMBuilderRef builder);
+
+void
+lp_build_flow_destroy(struct lp_build_flow_context *flow);
+
+void
+lp_build_flow_scope_begin(struct lp_build_flow_context *flow);
+
+void
+lp_build_flow_scope_declare(struct lp_build_flow_context *flow,
+                            LLVMValueRef *variable);
+
+void
+lp_build_flow_scope_end(struct lp_build_flow_context *flow);
+
+void
+lp_build_flow_skip_begin(struct lp_build_flow_context *flow);
+
+void
+lp_build_flow_skip_cond_break(struct lp_build_flow_context *flow,
+                              LLVMValueRef cond);
+
+void
+lp_build_flow_skip_end(struct lp_build_flow_context *flow);
+
+
+struct lp_build_mask_context
+{
+   struct lp_build_flow_context *flow;   /* flow context whose builder emits the mask code */
+
+   LLVMTypeRef reg_type;                 /* NOTE(review): presumably the mask's vector type -- confirm */
+
+   LLVMValueRef value;                   /* current mask; lp_build_mask_update ANDs into it */
+};
+
+
+void
+lp_build_mask_begin(struct lp_build_mask_context *mask,
+                    struct lp_build_flow_context *flow,
+                    struct lp_type type,
+                    LLVMValueRef value);
+
+/**
+ * Bitwise AND the mask with the given value, if a previous mask was set.
+ */
+void
+lp_build_mask_update(struct lp_build_mask_context *mask,
+                     LLVMValueRef value);
+
+LLVMValueRef
+lp_build_mask_end(struct lp_build_mask_context *mask);
+
+
+/**
+ * LLVM's IR doesn't represent for-loops directly. Furthermore, building
+ * one requires creating code blocks, branches and phi variables, so it
+ * takes a fair amount of code.
+ *
+ * @sa http://www.llvm.org/docs/tutorial/LangImpl5.html#for
+ */
+struct lp_build_loop_state
+{
+  LLVMBasicBlockRef block;   /* the "loop" block appended by lp_build_loop_begin */
+  LLVMValueRef counter;      /* phi node holding the loop counter */
+};
+
+
+void
+lp_build_loop_begin(LLVMBuilderRef builder,
+                    LLVMValueRef start,
+                    struct lp_build_loop_state *state);
+
+
+void
+lp_build_loop_end(LLVMBuilderRef builder,
+                  LLVMValueRef end,
+                  LLVMValueRef step,
+                  struct lp_build_loop_state *state);
+
+
+
+#endif /* !LP_BLD_FLOW_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_format.h b/src/gallium/drivers/llvmpipe/lp_bld_format.h
new file mode 100644
index 0000000000..6d3f692619
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_format.h
@@ -0,0 +1,119 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef LP_BLD_FORMAT_H
+#define LP_BLD_FORMAT_H
+
+
+/**
+ * @file
+ * Pixel format helpers.
+ */
+
+#include <llvm-c/Core.h>
+
+#include "pipe/p_format.h"
+
+struct util_format_description;
+struct lp_type;
+
+
+/**
+ * Unpack a pixel into its RGBA components.
+ *
+ * @param packed integer holding the packed pixel.
+ *
+ * @return RGBA in a 4 floats vector.
+ */
+LLVMValueRef
+lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
+                         enum pipe_format format,
+                         LLVMValueRef packed);
+
+
+/**
+ * Pack a pixel.
+ *
+ * @param rgba 4 float vector with the unpacked components.
+ */
+LLVMValueRef
+lp_build_pack_rgba_aos(LLVMBuilderRef builder,
+                       enum pipe_format format,
+                       LLVMValueRef rgba);
+
+
+/**
+ * Load a pixel into its RGBA components.
+ *
+ * @param ptr value with the pointer to the packed pixel. Pointer type is
+ * irrelevant.
+ *
+ * @return RGBA in a 4 floats vector.
+ */
+LLVMValueRef
+lp_build_load_rgba_aos(LLVMBuilderRef builder,
+                       enum pipe_format format,
+                       LLVMValueRef ptr);
+
+
+/**
+ * Store a pixel.
+ *
+ * @param rgba 4 float vector with the unpacked components.
+ */
+void
+lp_build_store_rgba_aos(LLVMBuilderRef builder,
+                        enum pipe_format format,
+                        LLVMValueRef ptr,
+                        LLVMValueRef rgba);
+
+LLVMValueRef
+lp_build_gather(LLVMBuilderRef builder,
+                unsigned length,
+                unsigned src_width,
+                unsigned dst_width,
+                LLVMValueRef base_ptr,
+                LLVMValueRef offsets);
+
+/* Unpack packed pixels into four per-channel (SoA) values written to rgba[]. */
+
+void
+lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
+                         const struct util_format_description *format_desc,
+                         struct lp_type type,
+                         LLVMValueRef packed,
+                         LLVMValueRef *rgba);
+
+/* Gather packed pixels from base_ptr + offsets, then unpack them into rgba[]. */
+
+void
+lp_build_load_rgba_soa(LLVMBuilderRef builder,
+                       const struct util_format_description *format_desc,
+                       struct lp_type type,
+                       LLVMValueRef base_ptr,
+                       LLVMValueRef offsets,
+                       LLVMValueRef *rgba);
+
+#endif /* !LP_BLD_FORMAT_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_format_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_format_aos.c
new file mode 100644
index 0000000000..b9b5d84bed
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_format_aos.c
@@ -0,0 +1,303 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "util/u_format.h"
+
+#include "lp_bld_format.h"
+
+
+LLVMValueRef
+lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
+                         enum pipe_format format,
+                         LLVMValueRef packed)
+{
+   const struct util_format_description *desc;
+   LLVMValueRef shifted, casted, scaled, masked;
+   LLVMValueRef shifts[4];
+   LLVMValueRef masks[4];
+   LLVMValueRef scales[4];
+   LLVMValueRef swizzles[4];
+   LLVMValueRef aux[4];
+   bool normalized;
+   int empty_channel;
+   unsigned shift;
+   unsigned i;
+
+   desc = util_format_description(format);
+
+   /* FIXME: Support more formats */
+   assert(desc->layout == UTIL_FORMAT_LAYOUT_ARITH);
+   assert(desc->block.width == 1);
+   assert(desc->block.height == 1);
+   assert(desc->block.bits <= 32);
+
+   /* Do the intermediate integer computations with 32bit integers since it
+    * matches floating point size */
+   if (desc->block.bits < 32)
+      packed = LLVMBuildZExt(builder, packed, LLVMInt32Type(), "");
+
+   /* Broadcast the packed value to all four channels */
+   packed = LLVMBuildInsertElement(builder,
+                                   LLVMGetUndef(LLVMVectorType(LLVMInt32Type(), 4)),
+                                   packed,
+                                   LLVMConstNull(LLVMInt32Type()),
+                                   "");
+   packed = LLVMBuildShuffleVector(builder,
+                                   packed,
+                                   LLVMGetUndef(LLVMVectorType(LLVMInt32Type(), 4)),
+                                   LLVMConstNull(LLVMVectorType(LLVMInt32Type(), 4)),
+                                   "");
+
+   /* Initialize vector constants: per-channel shift, mask and scale */
+   normalized = FALSE;
+   empty_channel = -1;
+   shift = 0;
+   for (i = 0; i < 4; ++i) {
+      unsigned bits = desc->channel[i].size;
+
+      if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) {
+         shifts[i] = LLVMGetUndef(LLVMInt32Type());
+         masks[i] = LLVMConstNull(LLVMInt32Type());
+         scales[i] =  LLVMConstNull(LLVMFloatType());
+         empty_channel = i;
+      }
+      else {
+         unsigned mask = (1u << bits) - 1;   /* unsigned: 1 << 31 would overflow int */
+
+         assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED);
+         assert(bits < 32);
+
+         shifts[i] = LLVMConstInt(LLVMInt32Type(), shift, 0);
+         masks[i] = LLVMConstInt(LLVMInt32Type(), mask, 0);
+
+         if (desc->channel[i].normalized) {
+            scales[i] = LLVMConstReal(LLVMFloatType(), 1.0/mask);
+            normalized = TRUE;
+         }
+         else
+            scales[i] =  LLVMConstReal(LLVMFloatType(), 1.0);
+      }
+
+      shift += bits;
+   }
+
+   /* Move every channel down to bit 0, isolate it and convert it to float */
+   shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
+   masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
+   /* UIToFP can't be expressed in SSE2 */
+   casted = LLVMBuildSIToFP(builder, masked, LLVMVectorType(LLVMFloatType(), 4), "");
+
+   if (normalized)
+      scaled = LLVMBuildMul(builder, casted, LLVMConstVector(scales, 4), "");
+   else
+      scaled = casted;
+
+   /* Constants the final shuffle can pick from its second operand */
+   for (i = 0; i < 4; ++i)
+      aux[i] = LLVMGetUndef(LLVMFloatType());
+
+   for (i = 0; i < 4; ++i) {
+      enum util_format_swizzle swizzle = desc->swizzle[i];
+
+      switch (swizzle) {
+      case UTIL_FORMAT_SWIZZLE_X:
+      case UTIL_FORMAT_SWIZZLE_Y:
+      case UTIL_FORMAT_SWIZZLE_Z:
+      case UTIL_FORMAT_SWIZZLE_W:
+         swizzles[i] = LLVMConstInt(LLVMInt32Type(), swizzle, 0);
+         break;
+      case UTIL_FORMAT_SWIZZLE_0:
+         assert(empty_channel >= 0);   /* a void channel's lane is zero: mask and scale are 0 */
+         swizzles[i] = LLVMConstInt(LLVMInt32Type(), empty_channel, 0);
+         break;
+      case UTIL_FORMAT_SWIZZLE_1:
+         swizzles[i] = LLVMConstInt(LLVMInt32Type(), 4, 0);   /* lane 4 is aux[0] */
+         aux[0] = LLVMConstReal(LLVMFloatType(), 1.0);
+         break;
+      case UTIL_FORMAT_SWIZZLE_NONE:
+         /* Shuffle masks are i32 vectors, so the placeholder must be i32 too */
+         swizzles[i] = LLVMGetUndef(LLVMInt32Type());
+         assert(0);
+         break;
+      }
+   }
+
+   return LLVMBuildShuffleVector(builder, scaled, LLVMConstVector(aux, 4), LLVMConstVector(swizzles, 4), "");
+}
+
+
+LLVMValueRef
+lp_build_pack_rgba_aos(LLVMBuilderRef builder,
+                       enum pipe_format format,
+                       LLVMValueRef rgba)
+{
+   const struct util_format_description *desc;
+   LLVMTypeRef type;
+   LLVMValueRef packed = NULL;
+   LLVMValueRef swizzles[4];
+   LLVMValueRef shifted, casted, scaled, unswizzled;
+   LLVMValueRef shifts[4];
+   LLVMValueRef scales[4];
+   bool normalized;
+   unsigned shift;
+   unsigned i, j;
+
+   desc = util_format_description(format);
+
+   assert(desc->layout == UTIL_FORMAT_LAYOUT_ARITH);
+   assert(desc->block.width == 1);
+   assert(desc->block.height == 1);
+   assert(desc->block.bits <= 32);   /* the pixel is assembled in an i32 below */
+
+   type = LLVMIntType(desc->block.bits);
+
+   /* Unswizzle the color components into the source vector. */
+   for (i = 0; i < 4; ++i) {
+      for (j = 0; j < 4; ++j) {
+         if (desc->swizzle[j] == i)
+            break;
+      }
+      if (j < 4)
+         swizzles[i] = LLVMConstInt(LLVMInt32Type(), j, 0);
+      else
+         swizzles[i] = LLVMGetUndef(LLVMInt32Type());
+   }
+
+   unswizzled = LLVMBuildShuffleVector(builder, rgba,
+                                       LLVMGetUndef(LLVMVectorType(LLVMFloatType(), 4)),
+                                       LLVMConstVector(swizzles, 4), "");
+
+   normalized = FALSE;
+   shift = 0;
+   for (i = 0; i < 4; ++i) {
+      unsigned bits = desc->channel[i].size;
+
+      if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) {
+         shifts[i] = LLVMGetUndef(LLVMInt32Type());
+         scales[i] =  LLVMGetUndef(LLVMFloatType());
+      }
+      else {
+         unsigned mask = (1u << bits) - 1;   /* unsigned: 1 << 31 would overflow int */
+
+         assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED);
+         assert(bits < 32);
+
+         shifts[i] = LLVMConstInt(LLVMInt32Type(), shift, 0);
+
+         if (desc->channel[i].normalized) {
+            scales[i] = LLVMConstReal(LLVMFloatType(), mask);
+            normalized = TRUE;
+         }
+         else
+            scales[i] =  LLVMConstReal(LLVMFloatType(), 1.0);
+      }
+
+      shift += bits;
+   }
+
+   if (normalized)
+      scaled = LLVMBuildMul(builder, unswizzled, LLVMConstVector(scales, 4), "");
+   else
+      scaled = unswizzled;
+
+   casted = LLVMBuildFPToSI(builder, scaled, LLVMVectorType(LLVMInt32Type(), 4), "");
+   shifted = LLVMBuildShl(builder, casted, LLVMConstVector(shifts, 4), "");
+
+   /* Bitwise or all components */
+   for (i = 0; i < 4; ++i) {
+      if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) {
+         LLVMValueRef component = LLVMBuildExtractElement(builder, shifted, LLVMConstInt(LLVMInt32Type(), i, 0), "");
+         if (packed)
+            packed = LLVMBuildOr(builder, packed, component, "");
+         else
+            packed = component;
+      }
+   }
+
+   if (!packed)
+      packed = LLVMGetUndef(LLVMInt32Type());
+
+   if (desc->block.bits < 32)
+      packed = LLVMBuildTrunc(builder, packed, type, "");
+
+   return packed;
+}
+
+
+LLVMValueRef
+lp_build_load_rgba_aos(LLVMBuilderRef builder,
+                       enum pipe_format format,
+                       LLVMValueRef ptr)
+{
+   const struct util_format_description *desc = util_format_description(format);
+   LLVMTypeRef pixel_ptr_type;
+   LLVMValueRef typed_ptr;
+   LLVMValueRef pixel;
+
+   /* FIXME: Support more formats */
+   assert(desc->layout == UTIL_FORMAT_LAYOUT_ARITH);
+   assert(desc->block.width == 1);
+   assert(desc->block.height == 1);
+   assert(desc->block.bits <= 32);
+
+   /* View the memory as one integer exactly as wide as a pixel block. */
+   pixel_ptr_type = LLVMPointerType(LLVMIntType(desc->block.bits), 0);
+   typed_ptr = LLVMBuildBitCast(builder, ptr, pixel_ptr_type, "");
+
+   /* Fetch the packed pixel and expand it into RGBA. */
+   pixel = LLVMBuildLoad(builder, typed_ptr, "");
+
+   return lp_build_unpack_rgba_aos(builder, format, pixel);
+}
+
+
+void
+lp_build_store_rgba_aos(LLVMBuilderRef builder,
+                        enum pipe_format format,
+                        LLVMValueRef ptr,
+                        LLVMValueRef rgba)
+{
+   const struct util_format_description *desc = util_format_description(format);
+   LLVMValueRef packed_pixel;
+   LLVMTypeRef dst_ptr_type;
+
+   assert(desc->layout == UTIL_FORMAT_LAYOUT_ARITH);
+   assert(desc->block.width == 1);
+   assert(desc->block.height == 1);
+
+   /* Pointer to an integer exactly as wide as one pixel block. */
+   dst_ptr_type = LLVMPointerType(LLVMIntType(desc->block.bits), 0);
+
+   /* Pack first so its instructions precede the cast and the store. */
+   packed_pixel = lp_build_pack_rgba_aos(builder, format, rgba);
+
+   ptr = LLVMBuildBitCast(builder, ptr, dst_ptr_type, "");
+
+   LLVMBuildStore(builder, packed_pixel, ptr);
+}
+
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_format_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_format_soa.c
new file mode 100644
index 0000000000..b5ff434e1a
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_format_soa.c
@@ -0,0 +1,208 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "util/u_format.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_conv.h"
+#include "lp_bld_format.h"
+
+
+/**
+ * Gather elements from scattered positions in memory into a single vector.
+ *
+ * @param length number of elements to gather (lanes read from offsets)
+ * @param src_width width in bits of each element in memory
+ * @param dst_width width in bits of each result element (source is expanded to fit)
+ * @param base_ptr base pointer, should be a i8 pointer type.
+ * @param offsets vector with the offset of each element
+ */
+LLVMValueRef
+lp_build_gather(LLVMBuilderRef builder,
+                unsigned length,
+                unsigned src_width,
+                unsigned dst_width,
+                LLVMValueRef base_ptr,
+                LLVMValueRef offsets)
+{
+   LLVMTypeRef mem_ptr_type = LLVMPointerType(LLVMIntType(src_width), 0);
+   LLVMTypeRef lane_type = LLVMIntType(dst_width);
+   LLVMValueRef gathered = LLVMGetUndef(LLVMVectorType(lane_type, length));
+   unsigned lane;
+
+   /* Start from an undefined vector and fill it in one lane at a time. */
+   for (lane = 0; lane < length; ++lane) {
+      LLVMValueRef lane_index = LLVMConstInt(LLVMInt32Type(), lane, 0);
+      LLVMValueRef offset;
+      LLVMValueRef addr;
+      LLVMValueRef value;
+
+      /* Address of this lane's element: the base pointer plus its own offset. */
+      offset = LLVMBuildExtractElement(builder, offsets, lane_index, "");
+      addr = LLVMBuildGEP(builder, base_ptr, &offset, 1, "");
+      addr = LLVMBuildBitCast(builder, addr, mem_ptr_type, "");
+      value = LLVMBuildLoad(builder, addr, "");
+
+      /* Resize the loaded integer to the width of a result lane. */
+      assert(src_width <= dst_width);
+      if (src_width > dst_width)
+         value = LLVMBuildTrunc(builder, value, lane_type, "");
+      else if (src_width < dst_width)
+         value = LLVMBuildZExt(builder, value, lane_type, "");
+
+      gathered = LLVMBuildInsertElement(builder, gathered, value, lane_index, "");
+   }
+
+   return gathered;
+}
+
+
+static LLVMValueRef
+lp_build_format_swizzle(struct lp_type type,
+                        const LLVMValueRef *inputs,
+                        enum util_format_swizzle swizzle)
+{
+   switch (swizzle) {
+   case UTIL_FORMAT_SWIZZLE_0:      /* constant zero */
+      return lp_build_zero(type);
+   case UTIL_FORMAT_SWIZZLE_1:      /* constant one */
+      return lp_build_one(type);
+   case UTIL_FORMAT_SWIZZLE_NONE:   /* nothing to select */
+      return lp_build_undef(type);
+   case UTIL_FORMAT_SWIZZLE_X:      /* pass one of the inputs through */
+   case UTIL_FORMAT_SWIZZLE_Y:
+   case UTIL_FORMAT_SWIZZLE_Z:
+   case UTIL_FORMAT_SWIZZLE_W:
+      return inputs[swizzle];
+   default:
+      assert(0);                    /* not a known swizzle */
+      return lp_build_undef(type);
+   }
+}
+
+
+void
+lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
+                         const struct util_format_description *format_desc,
+                         struct lp_type type,
+                         LLVMValueRef packed,
+                         LLVMValueRef *rgba)
+{
+   LLVMValueRef inputs[4];
+   unsigned start;
+   unsigned chan;
+
+   /* FIXME: Support more formats */
+   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_ARITH);
+   assert(format_desc->block.width == 1);
+   assert(format_desc->block.height == 1);
+   assert(format_desc->block.bits <= 32);
+
+   /* Decode the input vector components; start/stop are bit positions */
+   start = 0;
+   for (chan = 0; chan < 4; ++chan) {
+      unsigned width = format_desc->channel[chan].size;
+      unsigned stop = start + width;
+      LLVMValueRef input;
+
+      input = packed;
+
+      switch(format_desc->channel[chan].type) {
+      case UTIL_FORMAT_TYPE_VOID:
+         input = NULL;
+         break;
+
+      case UTIL_FORMAT_TYPE_UNSIGNED:
+         if(type.floating) {
+            if(start)
+               input = LLVMBuildLShr(builder, input, lp_build_int_const_scalar(type, start), "");
+            if(stop < format_desc->block.bits) {
+               unsigned mask = ((unsigned long long)1 << width) - 1;
+               input = LLVMBuildAnd(builder, input, lp_build_int_const_scalar(type, mask), "");
+            }
+            /* input is still an integer vector here; convert it to float */
+            if(format_desc->channel[chan].normalized)
+               input = lp_build_unsigned_norm_to_float(builder, width, type, input);
+            else
+               input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(type), "");
+         }
+         else {
+            /* FIXME */
+            assert(0);
+            input = lp_build_undef(type);
+         }
+         break;
+
+      default:
+         /* unsupported channel type */
+         input = lp_build_undef(type);
+         break;
+      }
+
+      inputs[chan] = input;
+
+      start = stop;
+   }
+
+   if(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
+      enum util_format_swizzle swizzle = format_desc->swizzle[0];
+      LLVMValueRef depth = lp_build_format_swizzle(type, inputs, swizzle);
+      rgba[2] = rgba[1] = rgba[0] = depth;   /* replicate depth into r, g and b */
+      rgba[3] = lp_build_one(type);
+   }
+   else {
+      for (chan = 0; chan < 4; ++chan) {
+         enum util_format_swizzle swizzle = format_desc->swizzle[chan];
+         rgba[chan] = lp_build_format_swizzle(type, inputs, swizzle);
+      }
+   }
+}
+
+
+void
+lp_build_load_rgba_soa(LLVMBuilderRef builder,
+                       const struct util_format_description *format_desc,
+                       struct lp_type type,
+                       LLVMValueRef base_ptr,
+                       LLVMValueRef offsets,
+                       LLVMValueRef *rgba)
+{
+   LLVMValueRef gathered;
+
+   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_ARITH);
+   assert(format_desc->block.width == 1);
+   assert(format_desc->block.height == 1);
+   assert(format_desc->block.bits <= 32);
+
+   /* Fetch one packed pixel per lane, widened to the lane width of 'type'. */
+   gathered = lp_build_gather(builder, type.length, format_desc->block.bits,
+                              type.width, base_ptr, offsets);
+   /* Split the packed pixels into the per-channel outputs in rgba[]. */
+   lp_build_unpack_rgba_soa(builder, format_desc, type, gathered, rgba);
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
new file mode 100644
index 0000000000..338dbca6d1
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
@@ -0,0 +1,377 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 VMware, Inc.
+ * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * @file
+ * Position and shader input interpolation.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#include "pipe/p_shader_tokens.h"
+#include "util/u_debug.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "tgsi/tgsi_parse.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_const.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_swizzle.h"
+#include "lp_bld_interp.h"
+
+
+static void
+attrib_name(LLVMValueRef val, unsigned attrib, unsigned chan, const char *suffix)
+{
+   if (attrib != 0)   /* slot 0 is named "pos"; later slots are "inputN" */
+      lp_build_name(val, "input%u.%c%s", attrib - 1, "xyzw"[chan], suffix);
+   else
+      lp_build_name(val, "pos.%c%s", "xyzw"[chan], suffix);
+}
+
+
+static void
+coeffs_init(struct lp_build_interp_soa_context *bld,
+            LLVMValueRef a0_ptr,
+            LLVMValueRef dadx_ptr,
+            LLVMValueRef dady_ptr)
+{
+   LLVMBuilderRef builder = bld->base.builder;
+   unsigned attrib;
+   unsigned chan;
+
+   for(attrib = 0; attrib < bld->num_attribs; ++attrib) {
+      unsigned mask = bld->mask[attrib];
+      unsigned mode = bld->mode[attrib];
+      for(chan = 0; chan < NUM_CHANNELS; ++chan) {
+         if(mask & (1 << chan)) {
+            LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), attrib*NUM_CHANNELS + chan, 0);
+            LLVMValueRef a0 = NULL;
+            LLVMValueRef dadx = NULL;
+            LLVMValueRef dady = NULL;
+            /* Each coefficient is loaded as a scalar and broadcast to a vector */
+            switch( mode ) {
+            case TGSI_INTERPOLATE_PERSPECTIVE:
+               /* fall-through: same coefficients as the linear case */
+
+            case TGSI_INTERPOLATE_LINEAR:
+               dadx = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dadx_ptr, &index, 1, ""), "");
+               dady = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dady_ptr, &index, 1, ""), "");
+               dadx = lp_build_broadcast_scalar(&bld->base, dadx);
+               dady = lp_build_broadcast_scalar(&bld->base, dady);
+               attrib_name(dadx, attrib, chan, ".dadx");
+               attrib_name(dady, attrib, chan, ".dady");
+               /* fall-through: a0 is needed by every mode */
+
+            case TGSI_INTERPOLATE_CONSTANT:
+               a0 = LLVMBuildLoad(builder, LLVMBuildGEP(builder, a0_ptr, &index, 1, ""), "");
+               a0 = lp_build_broadcast_scalar(&bld->base, a0);
+               attrib_name(a0, attrib, chan, ".a0");
+               break;
+
+            default:
+               assert(0);
+               break;
+            }
+
+            bld->a0  [attrib][chan] = a0;
+            bld->dadx[attrib][chan] = dadx;   /* stays NULL for constant attributes */
+            bld->dady[attrib][chan] = dady;   /* stays NULL for constant attributes */
+         }
+      }
+   }
+}
+
+
+/**
+ * Small vector x scale multiplication optimization.
+ *
+ * TODO: Should be elsewhere.
+ */
+static LLVMValueRef
+coeff_multiply(struct lp_build_interp_soa_context *bld,
+               LLVMValueRef coeff,
+               int step)
+{
+   LLVMValueRef scale;
+
+   if (step == 0)   /* anything times zero */
+      return bld->base.zero;
+   if (step == 1)   /* identity */
+      return coeff;
+   if (step == 2)   /* a single add instead of a multiply */
+      return lp_build_add(&bld->base, coeff, coeff);
+
+   /* General case: multiply by a constant of the builder's type that
+    * holds the step. */
+   scale = lp_build_const_scalar(bld->base.type, (double)step);
+   return lp_build_mul(&bld->base, coeff, scale);
+}
+
+
+/**
+ * Multiply the dadx and dady with the xstep and ystep respectively.
+ */
+static void
+coeffs_update(struct lp_build_interp_soa_context *bld)
+{
+   unsigned attrib, chan;
+
+   for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
+      const unsigned mask = bld->mask[attrib];
+
+      if (bld->mode[attrib] == TGSI_INTERPOLATE_CONSTANT)
+         continue;   /* constant attributes have no dadx/dady to scale */
+
+      for (chan = 0; chan < NUM_CHANNELS; ++chan) {
+         if (!(mask & (1 << chan)))
+            continue;   /* channel not enabled for this attribute */
+         bld->dadx[attrib][chan] = coeff_multiply(bld, bld->dadx[attrib][chan], bld->xstep);
+         bld->dady[attrib][chan] = coeff_multiply(bld, bld->dady[attrib][chan], bld->ystep);
+      }
+   }
+}
+
+
+static void
+attribs_init(struct lp_build_interp_soa_context *bld)
+{
+   const LLVMValueRef pos_x = bld->pos[0];
+   const LLVMValueRef pos_y = bld->pos[1];
+   LLVMValueRef inv_w = NULL;   /* 1/w, built on first use and then shared */
+   unsigned attrib, chan;
+
+   for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
+      const unsigned mask = bld->mask[attrib];
+      const unsigned mode = bld->mode[attrib];
+
+      for (chan = 0; chan < NUM_CHANNELS; ++chan) {
+         LLVMValueRef value;
+
+         if (!(mask & (1 << chan)))
+            continue;
+
+         /* value = a0 + x*dadx + y*dady (just a0 for constant attributes) */
+         value = bld->a0[attrib][chan];
+         if (mode != TGSI_INTERPOLATE_CONSTANT) {
+            LLVMValueRef dx = lp_build_mul(&bld->base, pos_x, bld->dadx[attrib][chan]);
+            LLVMValueRef dy;
+            value = lp_build_add(&bld->base, value, dx);
+            dy = lp_build_mul(&bld->base, pos_y, bld->dady[attrib][chan]);
+            value = lp_build_add(&bld->base, value, dy);
+         }
+
+         /* Keep the value of the attribute before perspective divide for
+          * faster updates */
+         bld->attribs_pre[attrib][chan] = value;
+
+         if (mode == TGSI_INTERPOLATE_PERSPECTIVE) {
+            assert(attrib != 0);
+            if (!inv_w)
+               inv_w = lp_build_rcp(&bld->base, bld->pos[3]);
+            value = lp_build_mul(&bld->base, value, inv_w);
+         }
+
+         attrib_name(value, attrib, chan, "");
+         bld->attribs[attrib][chan] = value;
+      }
+   }
+}
+
+
+static void
+attribs_update(struct lp_build_interp_soa_context *bld)
+{
+   LLVMValueRef inv_w = NULL;   /* 1/w, built on first use and then shared */
+   unsigned attrib;
+   unsigned chan;
+
+   for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
+      const unsigned mask = bld->mask[attrib];
+      const unsigned mode = bld->mode[attrib];
+
+      /* Constant attributes are left untouched by an update. */
+      if (mode == TGSI_INTERPOLATE_CONSTANT)
+         continue;
+
+      for (chan = 0; chan < NUM_CHANNELS; ++chan) {
+         LLVMValueRef value;
+
+         if (!(mask & (1 << chan)))
+            continue;
+
+         /* Advance the pre-divide value by dadx and/or dady, for each axis
+          * whose step is non-zero. */
+         value = bld->attribs_pre[attrib][chan];
+         if (bld->xstep)
+            value = lp_build_add(&bld->base, value, bld->dadx[attrib][chan]);
+         if (bld->ystep)
+            value = lp_build_add(&bld->base, value, bld->dady[attrib][chan]);
+
+         bld->attribs_pre[attrib][chan] = value;
+
+         if (mode == TGSI_INTERPOLATE_PERSPECTIVE) {
+            assert(attrib != 0);
+            if (!inv_w)
+               inv_w = lp_build_rcp(&bld->base, bld->pos[3]);
+            value = lp_build_mul(&bld->base, value, inv_w);
+         }
+
+         attrib_name(value, attrib, chan, "");
+
+         bld->attribs[attrib][chan] = value;
+      }
+   }
+}
+
+
+/**
+ * Set up the initial position: x0 and y0, the integer values of the quad's
+ * upper left coordinates, become channels 0 and 1 of attribute 0.
+ */
+static void
+pos_init(struct lp_build_interp_soa_context *bld,
+         LLVMValueRef x0,
+         LLVMValueRef y0)
+{
+   LLVMValueRef *position = bld->attribs[0];
+
+   lp_build_name(x0, "pos.x");
+   position[0] = x0;
+   lp_build_name(y0, "pos.y");
+   position[1] = y0;
+}
+
+
+/* Advance the x and y position channels by xstep and ystep. */
+static void
+pos_update(struct lp_build_interp_soa_context *bld)
+{
+   struct lp_build_context *base = &bld->base;
+   LLVMValueRef *position = bld->attribs[0];
+   LLVMValueRef x = position[0];
+   LLVMValueRef y = position[1];
+
+   if(bld->xstep)
+      x = lp_build_add(base, x, lp_build_const_scalar(base->type, bld->xstep));
+   if(bld->ystep)
+      y = lp_build_add(base, y, lp_build_const_scalar(base->type, bld->ystep));
+   lp_build_name(x, "pos.x");
+   position[0] = x;
+   lp_build_name(y, "pos.y");
+   position[1] = y;
+}
+
+
+/**
+ * Collect the shader input declarations from tokens and emit the initial
+ * position and attribute values. Input N is stored as attribute 1 + N.
+ */
+void
+lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
+                         const struct tgsi_token *tokens,
+                         LLVMBuilderRef builder,
+                         struct lp_type type,
+                         LLVMValueRef a0_ptr,
+                         LLVMValueRef dadx_ptr,
+                         LLVMValueRef dady_ptr,
+                         LLVMValueRef x0,
+                         LLVMValueRef y0,
+                         int xstep,
+                         int ystep)
+{
+   struct tgsi_parse_context parse;
+   struct tgsi_full_declaration *decl;
+
+   memset(bld, 0, sizeof *bld);
+   lp_build_context_init(&bld->base, builder, type);
+
+   /* For convenience */
+   bld->pos = bld->attribs[0];
+   bld->inputs = (const LLVMValueRef (*)[NUM_CHANNELS]) bld->attribs[1];
+
+   /* Position */
+   bld->num_attribs = 1;
+   bld->mask[0] = TGSI_WRITEMASK_ZW;
+   bld->mode[0] = TGSI_INTERPOLATE_LINEAR;
+
+   /* Inputs */
+   tgsi_parse_init( &parse, tokens );
+   while( !tgsi_parse_end_of_tokens( &parse ) ) {
+      tgsi_parse_token( &parse );
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         decl = &parse.FullToken.FullDeclaration;
+         if( decl->Declaration.File == TGSI_FILE_INPUT ) {
+            unsigned first, last, mask, attrib;
+
+            first = decl->DeclarationRange.First;
+            last = decl->DeclarationRange.Last;
+            mask = decl->Declaration.UsageMask;
+
+            assert( last < PIPE_MAX_SHADER_INPUTS ); /* bound of mask[]/mode[] */
+            for( attrib = first; attrib <= last; ++attrib ) {
+               bld->mask[1 + attrib] = mask;
+               bld->mode[1 + attrib] = decl->Declaration.Interpolate;
+            }
+
+            bld->num_attribs = MAX2(bld->num_attribs, 1 + last + 1);
+         }
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         break;
+
+      default:
+         assert( 0 );
+      }
+   }
+   tgsi_parse_free( &parse );
+
+   coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr);
+   pos_init(bld, x0, y0);
+   attribs_init(bld);
+
+   bld->xstep = xstep;
+   bld->ystep = ystep;
+   /* Keep last: coeffs_update() appears to scale dadx/dady by the steps */
+   coeffs_update(bld);
+}
+
+
+/**
+ * Advance the position, then the inputs, by one (xstep, ystep) step.
+ */
+void
+lp_build_interp_soa_update(struct lp_build_interp_soa_context *bld)
+{
+   pos_update(bld);
+
+   attribs_update(bld);
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.h b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
new file mode 100644
index 0000000000..9c57a10879
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
@@ -0,0 +1,99 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Position and shader input interpolation.
+ *
+ * Special attention is given to the interpolation of side by side quads.
+ * Multiplications are made only for the first quad. Interpolation of
+ * inputs for subsequent quads is done exclusively with additions, and
+ * perspective divide if necessary.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#ifndef LP_BLD_INTERP_H
+#define LP_BLD_INTERP_H
+
+
+#include <llvm-c/Core.h>
+
+#include "tgsi/tgsi_exec.h"
+
+#include "lp_bld_type.h"
+
+
+struct tgsi_token;
+
+/** State for emitting position and shader input interpolation code. */
+struct lp_build_interp_soa_context
+{
+   struct lp_build_context base;
+
+   unsigned num_attribs; /* attribute 0 is the position, 1 + N is input N */
+   unsigned mask[1 + PIPE_MAX_SHADER_INPUTS]; /* enabled channels */
+   unsigned mode[1 + PIPE_MAX_SHADER_INPUTS]; /* TGSI_INTERPOLATE_xxx */
+   /* Interpolation coefficients, per attribute and channel */
+   LLVMValueRef a0  [1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
+   LLVMValueRef dadx[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
+   LLVMValueRef dady[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
+   /* Position increments applied by lp_build_interp_soa_update() */
+   int xstep;
+   int ystep;
+
+   /* Attribute values before perspective divide */
+   LLVMValueRef attribs_pre[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
+   /* Final attribute values (after the perspective divide, if any) */
+   LLVMValueRef attribs[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
+
+   /*
+    * Convenience pointers to attribs[0] and attribs[1]. Callers may use them.
+    */
+   const LLVMValueRef *pos;
+   const LLVMValueRef (*inputs)[NUM_CHANNELS];
+};
+
+/** Set up the position and the shader inputs declared in tokens. */
+void
+lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
+                         const struct tgsi_token *tokens,
+                         LLVMBuilderRef builder,
+                         struct lp_type type,
+                         LLVMValueRef a0_ptr,
+                         LLVMValueRef dadx_ptr,
+                         LLVMValueRef dady_ptr,
+                         LLVMValueRef x0,
+                         LLVMValueRef y0,
+                         int xstep,
+                         int ystep);
+/** Advance the position and inputs by xstep and ystep. */
+void
+lp_build_interp_soa_update(struct lp_build_interp_soa_context *bld);
+
+
+#endif /* LP_BLD_INTERP_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_intr.c b/src/gallium/drivers/llvmpipe/lp_bld_intr.c
new file mode 100644
index 0000000000..9895749d56
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_intr.c
@@ -0,0 +1,192 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Helpers for emitting intrinsic calls.
+ *
+ * LLVM vanilla IR doesn't represent all basic arithmetic operations we care
+ * about, and it is often necessary to resort to target-specific intrinsics
+ * for performance or convenience.
+ *
+ * Ideally we would like to stay away from target specific intrinsics and
+ * move all the instruction selection logic into upstream LLVM where it belongs.
+ *
+ * These functions are also used for calling C functions provided by us from
+ * generated LLVM code.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include "util/u_debug.h"
+
+#include "lp_bld_intr.h"
+
+/** Declare an external function with the C calling convention in module. */
+LLVMValueRef
+lp_declare_intrinsic(LLVMModuleRef module,
+                     const char *name,
+                     LLVMTypeRef ret_type,
+                     LLVMTypeRef *arg_types,
+                     unsigned num_args)
+{
+   LLVMTypeRef function_type;
+   LLVMValueRef function;
+   /* Callers must not declare the same name twice */
+   assert(!LLVMGetNamedFunction(module, name));
+
+   function_type = LLVMFunctionType(ret_type, arg_types, num_args, 0);
+   function = LLVMAddFunction(module, name, function_type);
+
+   LLVMSetFunctionCallConv(function, LLVMCCallConv);
+   LLVMSetLinkage(function, LLVMExternalLinkage);
+
+   assert(LLVMIsDeclaration(function));
+   /* A name starting with "llvm." must be an intrinsic known to LLVM */
+   if(name[0] == 'l' &&
+      name[1] == 'l' &&
+      name[2] == 'v' &&
+      name[3] == 'm' &&
+      name[4] == '.')
+      assert(LLVMGetIntrinsicID(function));
+
+   return function;
+}
+
+
+/* Call the function called name, declaring it in the current module on
+ * first use with a signature derived from ret_type and the args' types. */
+LLVMValueRef
+lp_build_intrinsic(LLVMBuilderRef builder,
+                   const char *name,
+                   LLVMTypeRef ret_type,
+                   LLVMValueRef *args,
+                   unsigned num_args)
+{
+   LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
+   LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(block));
+   LLVMValueRef function = LLVMGetNamedFunction(module, name);
+
+   if(!function) {
+      LLVMTypeRef arg_types[LP_MAX_FUNC_ARGS];
+      unsigned i;
+
+      assert(num_args <= LP_MAX_FUNC_ARGS);
+      for(i = 0; i < num_args; ++i) {
+         assert(args[i]);
+         arg_types[i] = LLVMTypeOf(args[i]);
+      }
+      function = lp_declare_intrinsic(module, name, ret_type, arg_types, num_args);
+   }
+
+   return LLVMBuildCall(builder, function, args, num_args, "");
+}
+
+/* Single-argument convenience wrapper around lp_build_intrinsic(). */
+LLVMValueRef
+lp_build_intrinsic_unary(LLVMBuilderRef builder, const char *name,
+                         LLVMTypeRef ret_type, LLVMValueRef a)
+{
+   LLVMValueRef args[1];
+   args[0] = a;
+   return lp_build_intrinsic(builder, name, ret_type, args, 1);
+}
+
+
+/* Two-argument convenience wrapper around lp_build_intrinsic(). */
+LLVMValueRef
+lp_build_intrinsic_binary(LLVMBuilderRef builder, const char *name,
+                          LLVMTypeRef ret_type,
+                          LLVMValueRef a, LLVMValueRef b)
+{
+   LLVMValueRef operands[2];
+   LLVMValueRef call;
+
+   operands[0] = a;
+   operands[1] = b;
+   call = lp_build_intrinsic(builder, name, ret_type, operands, 2);
+   return call;
+}
+
+
+/* Apply the scalar function called name to every element of the vector
+ * arguments and gather the results into a vector of type ret_type. */
+LLVMValueRef
+lp_build_intrinsic_map(LLVMBuilderRef builder, const char *name,
+                       LLVMTypeRef ret_type,
+                       LLVMValueRef *args, unsigned num_args)
+{
+   LLVMTypeRef elem_type = LLVMGetElementType(ret_type);
+   const unsigned length = LLVMGetVectorSize(ret_type);
+   LLVMValueRef elems[LP_MAX_FUNC_ARGS];
+   LLVMValueRef result;
+   unsigned lane, arg;
+
+   assert(num_args <= LP_MAX_FUNC_ARGS);
+   result = LLVMGetUndef(ret_type);
+
+   for(lane = 0; lane < length; ++lane) {
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), lane, 0);
+      LLVMValueRef scalar;
+      for(arg = 0; arg < num_args; ++arg)
+         elems[arg] = LLVMBuildExtractElement(builder, args[arg], index, "");
+      scalar = lp_build_intrinsic(builder, name, elem_type, elems, num_args);
+      result = LLVMBuildInsertElement(builder, result, scalar, index, "");
+   }
+
+   return result;
+}
+
+/* Single-argument convenience wrapper around lp_build_intrinsic_map(). */
+LLVMValueRef
+lp_build_intrinsic_map_unary(LLVMBuilderRef builder, const char *name,
+                             LLVMTypeRef ret_type, LLVMValueRef a)
+{
+   LLVMValueRef args[1];
+   args[0] = a;
+   return lp_build_intrinsic_map(builder, name, ret_type, args, 1);
+}
+
+
+/* Two-argument convenience wrapper around lp_build_intrinsic_map(). */
+LLVMValueRef
+lp_build_intrinsic_map_binary(LLVMBuilderRef builder, const char *name,
+                              LLVMTypeRef ret_type,
+                              LLVMValueRef a, LLVMValueRef b)
+{
+   LLVMValueRef operands[2];
+   LLVMValueRef mapped;
+
+   operands[0] = a;
+   operands[1] = b;
+   mapped = lp_build_intrinsic_map(builder, name, ret_type, operands, 2);
+   return mapped;
+}
+
+
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_intr.h b/src/gallium/drivers/llvmpipe/lp_bld_intr.h
new file mode 100644
index 0000000000..f813f27074
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_intr.h
@@ -0,0 +1,102 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Helper functions for calling intrinsics.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#ifndef LP_BLD_INTR_H
+#define LP_BLD_INTR_H
+
+
+#include <llvm-c/Core.h>  
+
+
+/**
+ * Max number of arguments in an intrinsic.
+ */
+#define LP_MAX_FUNC_ARGS 32
+
+/** Declare an external function called name in module. */
+LLVMValueRef
+lp_declare_intrinsic(LLVMModuleRef module,
+                     const char *name,
+                     LLVMTypeRef ret_type,
+                     LLVMTypeRef *arg_types,
+                     unsigned num_args);
+/** Call the function called name, declaring it first if necessary. */
+LLVMValueRef
+lp_build_intrinsic(LLVMBuilderRef builder,
+                   const char *name,
+                   LLVMTypeRef ret_type,
+                   LLVMValueRef *args,
+                   unsigned num_args);
+
+/** lp_build_intrinsic() with a single argument. */
+LLVMValueRef
+lp_build_intrinsic_unary(LLVMBuilderRef builder,
+                         const char *name,
+                         LLVMTypeRef ret_type,
+                         LLVMValueRef a);
+
+/** lp_build_intrinsic() with two arguments. */
+LLVMValueRef
+lp_build_intrinsic_binary(LLVMBuilderRef builder,
+                          const char *name,
+                          LLVMTypeRef ret_type,
+                          LLVMValueRef a,
+                          LLVMValueRef b);
+
+/** Call the function once per vector element and gather the results. */
+LLVMValueRef
+lp_build_intrinsic_map(LLVMBuilderRef builder,
+                       const char *name,
+                       LLVMTypeRef ret_type,
+                       LLVMValueRef *args,
+                       unsigned num_args);
+
+/** lp_build_intrinsic_map() with a single argument. */
+LLVMValueRef
+lp_build_intrinsic_map_unary(LLVMBuilderRef builder,
+                             const char *name,
+                             LLVMTypeRef ret_type,
+                             LLVMValueRef a);
+
+/** lp_build_intrinsic_map() with two arguments. */
+LLVMValueRef
+lp_build_intrinsic_map_binary(LLVMBuilderRef builder,
+                              const char *name,
+                              LLVMTypeRef ret_type,
+                              LLVMValueRef a,
+                              LLVMValueRef b);
+
+
+#endif /* !LP_BLD_INTR_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_logic.c b/src/gallium/drivers/llvmpipe/lp_bld_logic.c
new file mode 100644
index 0000000000..6b6f820769
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_logic.c
@@ -0,0 +1,394 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Helper functions for logical operations.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_intr.h"
+#include "lp_bld_logic.h"
+
+
+/**
+ * Build an element-wise comparison of a and b; func is one of PIPE_FUNC_xxx.
+ * The result is an integer vector mask: all ones in the elements where the
+ * comparison holds, all zeros elsewhere.
+ */
+LLVMValueRef
+lp_build_cmp(struct lp_build_context *bld,
+             unsigned func,
+             LLVMValueRef a,
+             LLVMValueRef b)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+   LLVMValueRef zeros = LLVMConstNull(int_vec_type);
+   LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);
+   LLVMValueRef cond;
+   LLVMValueRef res;
+   unsigned i;
+
+   if(func == PIPE_FUNC_NEVER)
+      return zeros;
+   if(func == PIPE_FUNC_ALWAYS)
+      return ones;
+
+   /* TODO: optimize the constant case */
+   /* XXX: It is not clear if we should use the ordered or unordered operators */
+
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   if(type.width * type.length == 128) {
+      if(type.floating) {
+         LLVMValueRef args[3];
+         unsigned cc;
+         boolean swap = FALSE;
+
+         switch(func) {
+         case PIPE_FUNC_EQUAL:
+            cc = 0;
+            break;
+         case PIPE_FUNC_NOTEQUAL:
+            cc = 4;
+            break;
+         case PIPE_FUNC_LESS:
+            cc = 1;
+            break;
+         case PIPE_FUNC_LEQUAL:
+            cc = 2;
+            break;
+         case PIPE_FUNC_GREATER:
+            cc = 1;
+            swap = TRUE;
+            break;
+         case PIPE_FUNC_GEQUAL:
+            cc = 2;
+            swap = TRUE;
+            break;
+         default:
+            assert(0);
+            return bld->undef;
+         }
+
+         if(swap) {
+            args[0] = b;
+            args[1] = a;
+         }
+         else {
+            args[0] = a;
+            args[1] = b;
+         }
+
+         args[2] = LLVMConstInt(LLVMInt8Type(), cc, 0);
+         res = lp_build_intrinsic(bld->builder,
+                                  "llvm.x86.sse.cmp.ps",
+                                  vec_type,
+                                  args, 3);
+         res = LLVMBuildBitCast(bld->builder, res, int_vec_type, "");
+         return res;
+      }
+      else {
+         static const struct {
+            unsigned swap:1;
+            unsigned eq:1;
+            unsigned gt:1;
+            unsigned not:1;
+         } table[] = {
+            {0, 0, 0, 0}, /* PIPE_FUNC_NEVER */
+            {1, 0, 1, 0}, /* PIPE_FUNC_LESS */
+            {0, 1, 0, 0}, /* PIPE_FUNC_EQUAL */
+            {0, 0, 1, 1}, /* PIPE_FUNC_LEQUAL */
+            {0, 0, 1, 0}, /* PIPE_FUNC_GREATER */
+            {0, 1, 0, 1}, /* PIPE_FUNC_NOTEQUAL */
+            {1, 0, 1, 1}, /* PIPE_FUNC_GEQUAL */
+            {0, 0, 0, 1}  /* PIPE_FUNC_ALWAYS */
+         };
+         const char *pcmpeq;
+         const char *pcmpgt;
+         LLVMValueRef args[2];
+
+         switch (type.width) {
+         case 8:
+            pcmpeq = "llvm.x86.sse2.pcmpeq.b";
+            pcmpgt = "llvm.x86.sse2.pcmpgt.b";
+            break;
+         case 16:
+            pcmpeq = "llvm.x86.sse2.pcmpeq.w";
+            pcmpgt = "llvm.x86.sse2.pcmpgt.w";
+            break;
+         case 32:
+            pcmpeq = "llvm.x86.sse2.pcmpeq.d";
+            pcmpgt = "llvm.x86.sse2.pcmpgt.d";
+            break;
+         default:
+            assert(0);
+            return bld->undef;
+         }
+
+         /* PCMPGTB/W/D are signed comparisons with no unsigned variant, so
+          * for unsigned types flip the sign bits to get the unsigned order.
+          */
+         if(table[func].gt && !type.sign) {
+            LLVMValueRef msb = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+            a = LLVMBuildXor(bld->builder, a, msb, "");
+            b = LLVMBuildXor(bld->builder, b, msb, "");
+         }
+
+         if(table[func].swap) {
+            args[0] = b;
+            args[1] = a;
+         }
+         else {
+            args[0] = a;
+            args[1] = b;
+         }
+
+         if(table[func].eq)
+            res = lp_build_intrinsic(bld->builder, pcmpeq, vec_type, args, 2);
+         else if (table[func].gt)
+            res = lp_build_intrinsic(bld->builder, pcmpgt, vec_type, args, 2);
+         else
+            res = LLVMConstNull(vec_type);
+
+         if(table[func].not)
+            res = LLVMBuildNot(bld->builder, res, "");
+
+         return res;
+      }
+   }
+#endif
+
+   if(type.floating) {
+      LLVMRealPredicate op;
+      switch(func) {
+      case PIPE_FUNC_NEVER:
+         op = LLVMRealPredicateFalse;
+         break;
+      case PIPE_FUNC_ALWAYS:
+         op = LLVMRealPredicateTrue;
+         break;
+      case PIPE_FUNC_EQUAL:
+         op = LLVMRealUEQ;
+         break;
+      case PIPE_FUNC_NOTEQUAL:
+         op = LLVMRealUNE;
+         break;
+      case PIPE_FUNC_LESS:
+         op = LLVMRealULT;
+         break;
+      case PIPE_FUNC_LEQUAL:
+         op = LLVMRealULE;
+         break;
+      case PIPE_FUNC_GREATER:
+         op = LLVMRealUGT;
+         break;
+      case PIPE_FUNC_GEQUAL:
+         op = LLVMRealUGE;
+         break;
+      default:
+         assert(0);
+         return bld->undef;
+      }
+
+#if 0
+      /* XXX: Although valid IR, no LLVM target currently support this */
+      cond = LLVMBuildFCmp(bld->builder, op, a, b, "");
+      res = LLVMBuildSelect(bld->builder, cond, ones, zeros, "");
+#else
+      debug_printf("%s: warning: using slow element-wise vector comparison\n",
+                   __FUNCTION__);
+      res = LLVMGetUndef(int_vec_type);
+      for(i = 0; i < type.length; ++i) {
+         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+         cond = LLVMBuildFCmp(bld->builder, op,
+                              LLVMBuildExtractElement(bld->builder, a, index, ""),
+                              LLVMBuildExtractElement(bld->builder, b, index, ""),
+                              "");
+         cond = LLVMBuildSelect(bld->builder, cond,
+                                LLVMConstExtractElement(ones, index),
+                                LLVMConstExtractElement(zeros, index),
+                                "");
+         res = LLVMBuildInsertElement(bld->builder, res, cond, index, "");
+      }
+#endif
+   }
+   else {
+      LLVMIntPredicate op;
+      switch(func) {
+      case PIPE_FUNC_EQUAL:
+         op = LLVMIntEQ;
+         break;
+      case PIPE_FUNC_NOTEQUAL:
+         op = LLVMIntNE;
+         break;
+      case PIPE_FUNC_LESS:
+         op = type.sign ? LLVMIntSLT : LLVMIntULT;
+         break;
+      case PIPE_FUNC_LEQUAL:
+         op = type.sign ? LLVMIntSLE : LLVMIntULE;
+         break;
+      case PIPE_FUNC_GREATER:
+         op = type.sign ? LLVMIntSGT : LLVMIntUGT;
+         break;
+      case PIPE_FUNC_GEQUAL:
+         op = type.sign ? LLVMIntSGE : LLVMIntUGE;
+         break;
+      default:
+         assert(0);
+         return bld->undef;
+      }
+
+#if 0
+      /* XXX: Although valid IR, no LLVM target currently support this */
+      cond = LLVMBuildICmp(bld->builder, op, a, b, "");
+      res = LLVMBuildSelect(bld->builder, cond, ones, zeros, "");
+#else
+      debug_printf("%s: warning: using slow element-wise vector comparison\n",
+                   __FUNCTION__);
+      res = LLVMGetUndef(int_vec_type);
+      for(i = 0; i < type.length; ++i) {
+         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+         cond = LLVMBuildICmp(bld->builder, op,
+                              LLVMBuildExtractElement(bld->builder, a, index, ""),
+                              LLVMBuildExtractElement(bld->builder, b, index, ""),
+                              "");
+         cond = LLVMBuildSelect(bld->builder, cond,
+                                LLVMConstExtractElement(ones, index),
+                                LLVMConstExtractElement(zeros, index),
+                                "");
+         res = LLVMBuildInsertElement(bld->builder, res, cond, index, "");
+      }
+#endif
+   }
+
+   return res;
+}
+
+
+/* Bitwise select: (a & mask) | (b & ~mask). Floating point operands are
+ * bitcast to integers for the logic ops and the result is cast back. */
+LLVMValueRef
+lp_build_select(struct lp_build_context *bld,
+                LLVMValueRef mask,
+                LLVMValueRef a,
+                LLVMValueRef b)
+{
+   LLVMBuilderRef builder = bld->builder;
+   const struct lp_type type = bld->type;
+   LLVMValueRef picked_a, picked_b, not_mask, merged;
+
+   if(a == b)
+      return a;
+
+   if(type.floating) {
+      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+      a = LLVMBuildBitCast(builder, a, int_vec_type, "");
+      b = LLVMBuildBitCast(builder, b, int_vec_type, "");
+   }
+
+   picked_a = LLVMBuildAnd(builder, a, mask, "");
+
+   /* This usually becomes PANDN, though sometimes the NOT is pre-computed
+    * and kept in another constant. Which is best depends on the available
+    * registers, so leave the choice to LLVM.
+    */
+   not_mask = LLVMBuildNot(builder, mask, "");
+   picked_b = LLVMBuildAnd(builder, b, not_mask, "");
+   merged = LLVMBuildOr(builder, picked_a, picked_b, "");
+
+   if(!type.floating)
+      return merged;
+
+   return LLVMBuildBitCast(builder, merged, lp_build_vec_type(type), "");
+}
+
+/**
+ * Take each of the 4 channels from a where cond[channel] is set, else from b.
+ */
+LLVMValueRef
+lp_build_select_aos(struct lp_build_context *bld,
+                    LLVMValueRef a,
+                    LLVMValueRef b,
+                    const boolean cond[4])
+{
+   const struct lp_type type = bld->type;
+   const unsigned n = type.length;
+   unsigned i, j;
+
+   if(a == b)
+      return a;
+   if(cond[0] && cond[1] && cond[2] && cond[3])
+      return a;
+   if(!cond[0] && !cond[1] && !cond[2] && !cond[3])
+      return b;
+   if(a == bld->undef || b == bld->undef)
+      return bld->undef;
+
+   /*
+    * There are three major ways of accomplishing this:
+    * - with a shuffle,
+    * - with a select,
+    * - or with a bit mask.
+    *
+    * Select isn't supported for vector types yet.
+    * The flip between these is empirical and might need to be adjusted.
+    */
+   if (n <= 4) {
+      /* Shuffle. */
+      LLVMTypeRef elem_type = LLVMInt32Type();
+      LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+
+      for(j = 0; j < n; j += 4)
+         for(i = 0; i < 4; ++i)
+            shuffles[j + i] = LLVMConstInt(elem_type, (cond[i] ? 0 : n) + j + i, 0);
+
+      return LLVMBuildShuffleVector(bld->builder, a, b, LLVMConstVector(shuffles, n), "");
+   }
+   else {
+#if 0
+      /* XXX: Unfortunately select of vectors do not work */
+      /* Use a select */
+      LLVMTypeRef elem_type = LLVMInt1Type();
+      LLVMValueRef cond[LP_MAX_VECTOR_LENGTH];
+
+      for(j = 0; j < n; j += 4)
+         for(i = 0; i < 4; ++i)
+            cond[j + i] = LLVMConstInt(elem_type, cond[i] ? 1 : 0, 0);
+
+      return LLVMBuildSelect(bld->builder, LLVMConstVector(cond, n), a, b, "");
+#else
+      LLVMValueRef mask = lp_build_const_mask_aos(type, cond);
+      return lp_build_select(bld, mask, a, b);
+#endif
+   }
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_logic.h b/src/gallium/drivers/llvmpipe/lp_bld_logic.h
new file mode 100644
index 0000000000..a4ee7723b5
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_logic.h
@@ -0,0 +1,72 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Helper functions for logical operations.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#ifndef LP_BLD_LOGIC_H
+#define LP_BLD_LOGIC_H
+
+
+#include <llvm-c/Core.h>  
+
+#include "pipe/p_defines.h" /* For PIPE_FUNC_xxx */
+
+
+struct lp_type;            /* forward declaration only; no object is defined */
+struct lp_build_context;
+
+
+/**
+ * @param func is one of PIPE_FUNC_xxx
+ */
+LLVMValueRef
+lp_build_cmp(struct lp_build_context *bld,
+             unsigned func,
+             LLVMValueRef a,
+             LLVMValueRef b);
+
+/* NOTE(review): presumably yields a where mask is set and b elsewhere -- confirm against the implementation. */
+LLVMValueRef
+lp_build_select(struct lp_build_context *bld,
+                LLVMValueRef mask,
+                LLVMValueRef a,
+                LLVMValueRef b);
+/** Per-channel select for AoS vectors: channel i of each group comes from a if cond[i], else from b. */
+LLVMValueRef
+lp_build_select_aos(struct lp_build_context *bld,
+                    LLVMValueRef a,
+                    LLVMValueRef b,
+                    const boolean cond[4]);
+
+
+#endif /* !LP_BLD_LOGIC_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_sample.h b/src/gallium/drivers/llvmpipe/lp_bld_sample.h
new file mode 100644
index 0000000000..403d0e4836
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_sample.h
@@ -0,0 +1,135 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Texture sampling.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#ifndef LP_BLD_SAMPLE_H
+#define LP_BLD_SAMPLE_H
+
+
+#include <llvm-c/Core.h>
+
+struct pipe_texture;
+struct pipe_sampler_state;
+struct lp_type;
+
+
+/**
+ * Sampler static state.
+ *
+ * These are the bits of state from pipe_texture and pipe_sampler_state that
+ * are embedded in the generated code.
+ */
+struct lp_sampler_static_state
+{
+   /* pipe_texture's state */
+   enum pipe_format format;      /**< selects the format description used when loading texels */
+   unsigned target:2;            /**< compared against PIPE_TEXTURE_xxx */
+   unsigned pot_width:1;         /**< util_is_pot(texture->width[0]) */
+   unsigned pot_height:1;        /**< util_is_pot(texture->height[0]) */
+   unsigned pot_depth:1;         /**< util_is_pot(texture->depth[0]) */
+
+   /* pipe_sampler_state's state */
+   unsigned wrap_s:3;            /**< PIPE_TEX_WRAP_xxx */
+   unsigned wrap_t:3;            /**< PIPE_TEX_WRAP_xxx */
+   unsigned wrap_r:3;            /**< copied from the sampler; presumably PIPE_TEX_WRAP_xxx too */
+   unsigned min_img_filter:2;    /**< PIPE_TEX_FILTER_xxx */
+   unsigned min_mip_filter:2;    /**< recorded, but not yet honoured by lp_build_sample_soa() */
+   unsigned mag_img_filter:2;    /**< recorded, but not yet honoured by lp_build_sample_soa() */
+   unsigned compare_mode:1;      /**< non-zero enables the depth comparison */
+   unsigned compare_func:3;      /**< PIPE_FUNC_xxx; only copied when compare_mode is set */
+   unsigned normalized_coords:1; /**< when set, s and t are scaled by the texture width and height */
+   unsigned prefilter:4;         /**< recorded, but not yet honoured by lp_build_sample_soa() */
+};
+
+
+/**
+ * Sampler dynamic state.
+ *
+ * These are the bits of state from pipe_texture and pipe_sampler_state that
+ * are computed at runtime.
+ *
+ * These are obtained through callbacks, as we don't want to tie the texture
+ * sampling code generation logic to any particular texture layout or pipe
+ * driver.
+ */
+struct lp_sampler_dynamic_state
+{
+   /** Obtain the base texture width. */
+   LLVMValueRef
+   (*width)( struct lp_sampler_dynamic_state *state,
+             LLVMBuilderRef builder,
+             unsigned unit);
+
+   /** Obtain the base texture height. */
+   LLVMValueRef
+   (*height)( struct lp_sampler_dynamic_state *state,
+              LLVMBuilderRef builder,
+              unsigned unit);
+
+   /** Obtain the texture stride; the sampling code uses it as the row (y) stride of texel offsets. */
+   LLVMValueRef
+   (*stride)( struct lp_sampler_dynamic_state *state,
+              LLVMBuilderRef builder,
+              unsigned unit);
+
+   /** Obtain the pointer to the texture data that texel offsets are applied to. */
+   LLVMValueRef
+   (*data_ptr)( struct lp_sampler_dynamic_state *state,
+                LLVMBuilderRef builder,
+                unsigned unit);
+};
+
+
+/**
+ * Derive the sampler static state.
+ */
+void
+lp_sampler_static_state(struct lp_sampler_static_state *state,
+                        const struct pipe_texture *texture,
+                        const struct pipe_sampler_state *sampler);
+
+/** Generate texture sampling code for one unit; the result channels are stored in texel[0..3]. */
+void
+lp_build_sample_soa(LLVMBuilderRef builder,
+                    const struct lp_sampler_static_state *static_state,
+                    struct lp_sampler_dynamic_state *dynamic_state,
+                    struct lp_type fp_type,
+                    unsigned unit,
+                    unsigned num_coords,
+                    const LLVMValueRef *coords,
+                    LLVMValueRef lodbias,
+                    LLVMValueRef *texel);
+
+
+
+#endif /* LP_BLD_SAMPLE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c
new file mode 100644
index 0000000000..8ca1be6f1b
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c
@@ -0,0 +1,416 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Texture sampling.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/u_debug.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "util/u_format.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_swizzle.h"
+#include "lp_bld_format.h"
+#include "lp_bld_sample.h"
+
+
+/** Derive the sampler static state; it is left all-zero when texture or sampler is NULL. */
+void
+lp_sampler_static_state(struct lp_sampler_static_state *state,
+                        const struct pipe_texture *texture,
+                        const struct pipe_sampler_state *sampler)
+{
+   memset(state, 0, sizeof *state);
+
+   if(!texture || !sampler)
+      return;
+
+   /* Bits taken from the texture. */
+   state->format     = texture->format;
+   state->target     = texture->target;
+   state->pot_width  = util_is_pot(texture->width[0]);
+   state->pot_height = util_is_pot(texture->height[0]);
+   state->pot_depth  = util_is_pot(texture->depth[0]);
+
+   /* Bits taken from the sampler. */
+   state->wrap_s            = sampler->wrap_s;
+   state->wrap_t            = sampler->wrap_t;
+   state->wrap_r            = sampler->wrap_r;
+   state->min_img_filter    = sampler->min_img_filter;
+   state->min_mip_filter    = sampler->min_mip_filter;
+   state->mag_img_filter    = sampler->mag_img_filter;
+   state->normalized_coords = sampler->normalized_coords;
+   state->prefilter         = sampler->prefilter;
+   if(sampler->compare_mode) {
+      state->compare_mode = sampler->compare_mode;
+      state->compare_func = sampler->compare_func;
+   }
+}
+
+
+
+/**
+ * Keep all information for sampling code generation in a single place.
+ */
+struct lp_build_sample_context
+{
+   LLVMBuilderRef builder;   /**< builder the sampling code is emitted with */
+
+   const struct lp_sampler_static_state *static_state;   /**< state embedded in the generated code */
+
+   struct lp_sampler_dynamic_state *dynamic_state;   /**< callbacks for the run-time texture state */
+
+   const struct util_format_description *format_desc;   /**< description of static_state->format */
+
+   /** Incoming coordinates type and build context */
+   struct lp_type coord_type;
+   struct lp_build_context coord_bld;
+
+   /** Integer coordinates */
+   struct lp_type int_coord_type;
+   struct lp_build_context int_coord_bld;
+
+   /** Output texels type and build context */
+   struct lp_type texel_type;
+   struct lp_build_context texel_bld;
+};
+
+/** Compute the byte offset of texel (x, y) and hand it to lp_build_load_rgba_soa() to fill texel[]. */
+static void
+lp_build_sample_texel(struct lp_build_sample_context *bld,
+                      LLVMValueRef x,
+                      LLVMValueRef y,
+                      LLVMValueRef y_stride,
+                      LLVMValueRef data_ptr,
+                      LLVMValueRef *texel)
+{
+   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
+   LLVMValueRef x_stride;
+   LLVMValueRef offset;
+
+   x_stride = lp_build_const_scalar(bld->int_coord_type, bld->format_desc->block.bits/8);
+   /* Depth/stencil formats are addressed differently; NOTE(review): looks like 2x2 texel tiles -- confirm. */
+   if(bld->format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
+      LLVMValueRef x_lo, x_hi;
+      LLVMValueRef y_lo, y_hi;
+      LLVMValueRef x_stride_lo, x_stride_hi;
+      LLVMValueRef y_stride_lo, y_stride_hi;
+      LLVMValueRef x_offset_lo, x_offset_hi;
+      LLVMValueRef y_offset_lo, y_offset_hi;
+      LLVMValueRef offset_lo, offset_hi;
+      /* Split each coordinate into bit 0 (lo) and the remaining bits (hi). */
+      x_lo = LLVMBuildAnd(bld->builder, x, int_coord_bld->one, "");
+      y_lo = LLVMBuildAnd(bld->builder, y, int_coord_bld->one, "");
+
+      x_hi = LLVMBuildLShr(bld->builder, x, int_coord_bld->one, "");
+      y_hi = LLVMBuildLShr(bld->builder, y, int_coord_bld->one, "");
+      /* lo part: x steps by one block, y by two blocks. */
+      x_stride_lo = x_stride;
+      y_stride_lo = lp_build_const_scalar(bld->int_coord_type, 2*bld->format_desc->block.bits/8);
+      /* hi part: x steps by four blocks, y by twice the row stride. */
+      x_stride_hi = lp_build_const_scalar(bld->int_coord_type, 4*bld->format_desc->block.bits/8);
+      y_stride_hi = LLVMBuildShl(bld->builder, y_stride, int_coord_bld->one, "");
+
+      x_offset_lo = lp_build_mul(int_coord_bld, x_lo, x_stride_lo);
+      y_offset_lo = lp_build_mul(int_coord_bld, y_lo, y_stride_lo);
+      offset_lo = lp_build_add(int_coord_bld, x_offset_lo, y_offset_lo);
+
+      x_offset_hi = lp_build_mul(int_coord_bld, x_hi, x_stride_hi);
+      y_offset_hi = lp_build_mul(int_coord_bld, y_hi, y_stride_hi);
+      offset_hi = lp_build_add(int_coord_bld, x_offset_hi, y_offset_hi);
+
+      offset = lp_build_add(int_coord_bld, offset_hi, offset_lo);
+   }
+   else {
+      LLVMValueRef x_offset;
+      LLVMValueRef y_offset;
+      /* Linear layout: offset = x * bytes-per-block + y * row stride. */
+      x_offset = lp_build_mul(int_coord_bld, x, x_stride);
+      y_offset = lp_build_mul(int_coord_bld, y, y_stride);
+
+      offset = lp_build_add(int_coord_bld, x_offset, y_offset);
+   }
+
+   lp_build_load_rgba_soa(bld->builder,
+                          bld->format_desc,
+                          bld->texel_type,
+                          data_ptr,
+                          offset,
+                          texel);
+}
+
+
+/**
+ * Apply a texture wrap mode (PIPE_TEX_WRAP_xxx) to integer texel coordinates.
+ */
+static LLVMValueRef
+lp_build_sample_wrap(struct lp_build_sample_context *bld,
+                     LLVMValueRef coord,
+                     LLVMValueRef length,
+                     boolean is_pot,
+                     unsigned wrap_mode)
+{
+   struct lp_build_context *ibld = &bld->int_coord_bld;
+   LLVMValueRef last;
+
+   last = lp_build_sub(ibld, length, ibld->one);
+
+   switch(wrap_mode) {
+   case PIPE_TEX_WRAP_REPEAT:
+      if(is_pot)
+         coord = LLVMBuildAnd(bld->builder, coord, last, "");
+      else
+         /* Signed remainder won't give the right results for negative
+          * dividends but unsigned remainder does.*/
+         coord = LLVMBuildURem(bld->builder, coord, length, "");
+      break;
+
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+      /* FIXME */
+      _debug_printf("warning: failed to translate texture wrap mode %u\n", wrap_mode);
+      /* not implemented yet: fall through and clamp to [0, length - 1] */
+   case PIPE_TEX_WRAP_CLAMP:
+      coord = lp_build_max(ibld, coord, ibld->zero);
+      coord = lp_build_min(ibld, coord, last);
+      break;
+
+   default:
+      assert(0);
+   }
+
+   return coord;
+}
+
+
+/** Nearest filtering: floor s and t, apply the wrap modes, and fetch that single texel. */
+static void
+lp_build_sample_2d_nearest_soa(struct lp_build_sample_context *bld,
+                               LLVMValueRef s,
+                               LLVMValueRef t,
+                               LLVMValueRef width,
+                               LLVMValueRef height,
+                               LLVMValueRef stride,
+                               LLVMValueRef data_ptr,
+                               LLVMValueRef *texel)
+{
+   const struct lp_sampler_static_state *state = bld->static_state;
+   LLVMValueRef x = lp_build_ifloor(&bld->coord_bld, s);
+   LLVMValueRef y = lp_build_ifloor(&bld->coord_bld, t);
+
+   /* Run the integer coordinates through the s/t wrap modes. */
+   x = lp_build_sample_wrap(bld, x, width,  state->pot_width,  state->wrap_s);
+   y = lp_build_sample_wrap(bld, y, height, state->pot_height, state->wrap_t);
+
+   lp_build_sample_texel(bld, x, y, stride, data_ptr, texel);
+}
+
+
+/**
+ * Bilinear filtering: fetch the 2x2 texels around (s - 0.5, t - 0.5) and
+ * blend each channel with lp_build_lerp_2d() using the fractional parts.
+ */
+static void
+lp_build_sample_2d_linear_soa(struct lp_build_sample_context *bld,
+                              LLVMValueRef s,
+                              LLVMValueRef t,
+                              LLVMValueRef width,
+                              LLVMValueRef height,
+                              LLVMValueRef stride,
+                              LLVMValueRef data_ptr,
+                              LLVMValueRef *texel)
+{
+   struct lp_build_context *coord_bld = &bld->coord_bld;
+   struct lp_build_context *int_bld = &bld->int_coord_bld;
+   const struct lp_sampler_static_state *state = bld->static_state;
+   LLVMValueRef half = lp_build_const_scalar(bld->coord_type, 0.5);
+   LLVMValueRef s_floor, t_floor;
+   LLVMValueRef s_frac, t_frac;
+   LLVMValueRef x0, x1, y0, y1;
+   LLVMValueRef quad[2][2][4];
+   unsigned c;
+
+   s = lp_build_sub(coord_bld, s, half);
+   t = lp_build_sub(coord_bld, t, half);
+
+   /* Integer and fractional parts of the shifted coordinates. */
+   s_floor = lp_build_floor(coord_bld, s);
+   t_floor = lp_build_floor(coord_bld, t);
+   s_frac = lp_build_sub(coord_bld, s, s_floor);
+   t_frac = lp_build_sub(coord_bld, t, t_floor);
+
+   /* First corner of the 2x2 footprint. */
+   x0 = lp_build_int(coord_bld, s_floor);
+   y0 = lp_build_int(coord_bld, t_floor);
+   x0 = lp_build_sample_wrap(bld, x0, width,  state->pot_width,  state->wrap_s);
+   y0 = lp_build_sample_wrap(bld, y0, height, state->pot_height, state->wrap_t);
+
+   /* Opposite corner: one texel further, wrapped again. */
+   x1 = lp_build_add(int_bld, x0, int_bld->one);
+   y1 = lp_build_add(int_bld, y0, int_bld->one);
+   x1 = lp_build_sample_wrap(bld, x1, width,  state->pot_width,  state->wrap_s);
+   y1 = lp_build_sample_wrap(bld, y1, height, state->pot_height, state->wrap_t);
+
+   lp_build_sample_texel(bld, x0, y0, stride, data_ptr, quad[0][0]);
+   lp_build_sample_texel(bld, x1, y0, stride, data_ptr, quad[0][1]);
+   lp_build_sample_texel(bld, x0, y1, stride, data_ptr, quad[1][0]);
+   lp_build_sample_texel(bld, x1, y1, stride, data_ptr, quad[1][1]);
+
+   /* TODO: Don't interpolate missing channels */
+   for(c = 0; c < 4; ++c) {
+      texel[c] = lp_build_lerp_2d(&bld->texel_bld, s_frac, t_frac,
+                                  quad[0][0][c], quad[0][1][c],
+                                  quad[1][0][c], quad[1][1][c]);
+   }
+}
+
+
+/**
+ * Optional depth comparison.  When compare_mode is set, p is compared against
+ * each of the four channels with compare_func, the one/zero results are
+ * averaged, and texel[] is overwritten with that value.
+ */
+static void
+lp_build_sample_compare(struct lp_build_sample_context *bld,
+                        LLVMValueRef p,
+                        LLVMValueRef *texel)
+{
+   struct lp_build_context *tbld = &bld->texel_bld;
+   LLVMValueRef sum = NULL;
+   unsigned i;
+
+   if(!bld->static_state->compare_mode)
+      return;
+
+   /* TODO: Compare before swizzling, to avoid redundant computations */
+   for(i = 0; i < 4; ++i) {
+      LLVMValueRef pass = lp_build_cmp(tbld, bld->static_state->compare_func, p, texel[i]);
+      pass = lp_build_select(tbld, pass, tbld->one, tbld->zero);
+      sum = sum ? lp_build_add(tbld, sum, pass) : pass;
+   }
+
+   assert(sum);
+   sum = lp_build_mul(tbld, sum, lp_build_const_scalar(tbld->type, 0.25));
+
+   /* XXX returning result for default GL_DEPTH_TEXTURE_MODE = GL_LUMINANCE */
+   texel[0] = sum;
+   texel[1] = sum;
+   texel[2] = sum;
+   texel[3] = tbld->one;
+}
+
+
+/**
+ * Generate the code that samples texture unit `unit` and stores the four
+ * resulting channels in texel[].
+ *
+ * Only coords[0] and coords[1] address the texture; coords[2] is the value
+ * the optional depth comparison tests against.  num_coords and lodbias are
+ * not used yet.
+ */
+void
+lp_build_sample_soa(LLVMBuilderRef builder,
+                    const struct lp_sampler_static_state *static_state,
+                    struct lp_sampler_dynamic_state *dynamic_state,
+                    struct lp_type type,
+                    unsigned unit,
+                    unsigned num_coords,
+                    const LLVMValueRef *coords,
+                    LLVMValueRef lodbias,
+                    LLVMValueRef *texel)
+{
+   struct lp_build_sample_context bld;
+   LLVMValueRef width, height, stride, data_ptr;
+   LLVMValueRef s = coords[0];
+   LLVMValueRef t = coords[1];
+   LLVMValueRef p = coords[2];
+
+   /* Setup our build context */
+   memset(&bld, 0, sizeof bld);
+   bld.builder = builder;
+   bld.static_state = static_state;
+   bld.dynamic_state = dynamic_state;
+   bld.format_desc = util_format_description(static_state->format);
+   bld.coord_type = type;
+   bld.int_coord_type = lp_int_type(type);
+   bld.texel_type = type;
+   lp_build_context_init(&bld.coord_bld, builder, bld.coord_type);
+   lp_build_context_init(&bld.int_coord_bld, builder, bld.int_coord_type);
+   lp_build_context_init(&bld.texel_bld, builder, bld.texel_type);
+
+   /* Get the dynamic state */
+   width = dynamic_state->width(dynamic_state, builder, unit);
+   height = dynamic_state->height(dynamic_state, builder, unit);
+   stride = dynamic_state->stride(dynamic_state, builder, unit);
+   data_ptr = dynamic_state->data_ptr(dynamic_state, builder, unit);
+
+   /* Replicate the dynamic values across the integer coordinate vector. */
+   width = lp_build_broadcast_scalar(&bld.int_coord_bld, width);
+   height = lp_build_broadcast_scalar(&bld.int_coord_bld, height);
+   stride = lp_build_broadcast_scalar(&bld.int_coord_bld, stride);
+
+   if(static_state->target == PIPE_TEXTURE_1D)
+      t = bld.coord_bld.zero;
+
+   if(static_state->normalized_coords) {
+      LLVMTypeRef vec_type = lp_build_vec_type(bld.coord_type);
+      LLVMValueRef width_f = LLVMBuildSIToFP(builder, width, vec_type, "");
+      LLVMValueRef height_f = LLVMBuildSIToFP(builder, height, vec_type, "");
+      s = lp_build_mul(&bld.coord_bld, s, width_f);
+      t = lp_build_mul(&bld.coord_bld, t, height_f);
+   }
+
+   switch (static_state->min_img_filter) {
+   case PIPE_TEX_FILTER_NEAREST:
+      lp_build_sample_2d_nearest_soa(&bld, s, t, width, height, stride, data_ptr, texel);
+      break;
+   case PIPE_TEX_FILTER_LINEAR:
+   case PIPE_TEX_FILTER_ANISO:
+      lp_build_sample_2d_linear_soa(&bld, s, t, width, height, stride, data_ptr, texel);
+      break;
+   default:
+      assert(0);
+   }
+
+   /* FIXME: respect static_state->min_mip_filter, ->mag_img_filter and ->prefilter */
+
+   lp_build_sample_compare(&bld, p, texel);
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_struct.c b/src/gallium/drivers/llvmpipe/lp_bld_struct.c
new file mode 100644
index 0000000000..3998ac374f
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_struct.c
@@ -0,0 +1,72 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Helper functions for manipulating structures.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include "util/u_debug.h"
+#include "util/u_memory.h"
+
+#include "lp_bld_debug.h"
+#include "lp_bld_struct.h"
+
+
+/** Build a GEP with indices {0, member} on ptr and name the result "<ptr>.<name>_ptr". */
+LLVMValueRef
+lp_build_struct_get_ptr(LLVMBuilderRef builder, LLVMValueRef ptr,
+                        unsigned member, const char *name)
+{
+   LLVMValueRef gep_indices[2];
+   LLVMValueRef result;
+
+   gep_indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   gep_indices[1] = LLVMConstInt(LLVMInt32Type(), member, 0);
+   result = LLVMBuildGEP(builder, ptr, gep_indices, Elements(gep_indices), "");
+   lp_build_name(result, "%s.%s_ptr", LLVMGetValueName(ptr), name);
+   return result;
+}
+
+
+/**
+ * Load a structure member through lp_build_struct_get_ptr() and name the
+ * loaded value "<ptr>.<name>".
+ */
+LLVMValueRef
+lp_build_struct_get(LLVMBuilderRef builder, LLVMValueRef ptr,
+                    unsigned member, const char *name)
+{
+   LLVMValueRef addr = lp_build_struct_get_ptr(builder, ptr, member, name);
+   LLVMValueRef value = LLVMBuildLoad(builder, addr, "");
+   lp_build_name(value, "%s.%s", LLVMGetValueName(ptr), name);
+   return value;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_struct.h b/src/gallium/drivers/llvmpipe/lp_bld_struct.h
new file mode 100644
index 0000000000..740392f561
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_struct.h
@@ -0,0 +1,75 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Helper functions for manipulating structures.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#ifndef LP_BLD_STRUCT_H
+#define LP_BLD_STRUCT_H
+
+
+#include <llvm-c/Core.h>  
+#include <llvm-c/Target.h>
+
+#include "util/u_debug.h"
+#include "util/u_memory.h"
+
+
+/** Assert that LLVM's ABI size of _ltype equals sizeof(_ctype). */
+#define LP_CHECK_STRUCT_SIZE(_ctype, _ltarget, _ltype) \
+      assert(LLVMABISizeOfType(_ltarget, _ltype) == sizeof(_ctype))
+
+/** Assert that LLVM's offset of element _lindex equals offsetof(_ctype, _cmember). */
+#define LP_CHECK_MEMBER_OFFSET(_ctype, _cmember, _ltarget, _ltype, _lindex) \
+      assert(LLVMOffsetOfElement(_ltarget, _ltype, _lindex) == offsetof(_ctype, _cmember))
+
+
+/**
+ * Get value pointer to a structure member.
+ */
+LLVMValueRef
+lp_build_struct_get_ptr(LLVMBuilderRef builder,
+                        LLVMValueRef ptr,
+                        unsigned member,
+                        const char *name);
+
+/**
+ * Get the value of a structure member.
+ */
+LLVMValueRef
+lp_build_struct_get(LLVMBuilderRef builder,
+                    LLVMValueRef ptr,
+                    unsigned member,
+                    const char *name);
+
+
+#endif /* !LP_BLD_STRUCT_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_swizzle.c b/src/gallium/drivers/llvmpipe/lp_bld_swizzle.c
new file mode 100644
index 0000000000..64e81f7b1f
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_swizzle.c
@@ -0,0 +1,239 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Helper functions for swizzling/shuffling.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include "util/u_debug.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_swizzle.h"
+
+
+/**
+ * Build a vector of type vec_type by inserting scalar into every element of
+ * an undef vector.
+ */
+LLVMValueRef
+lp_build_broadcast(LLVMBuilderRef builder,
+                   LLVMTypeRef vec_type,
+                   LLVMValueRef scalar)
+{
+   const unsigned count = LLVMGetVectorSize(vec_type);
+   LLVMValueRef vec = LLVMGetUndef(vec_type);
+   unsigned lane;
+
+   for(lane = 0; lane < count; ++lane)
+      vec = LLVMBuildInsertElement(builder, vec, scalar, LLVMConstInt(LLVMInt32Type(), lane, 0), "");
+
+   return vec;
+}
+
+
+/**
+ * Insert scalar into each of the bld->type.length elements of bld->undef.
+ */
+LLVMValueRef
+lp_build_broadcast_scalar(struct lp_build_context *bld,
+                          LLVMValueRef scalar)
+{
+   const unsigned count = bld->type.length;
+   LLVMValueRef vec = bld->undef;
+   unsigned lane;
+
+   for(lane = 0; lane < count; ++lane)
+      vec = LLVMBuildInsertElement(bld->builder, vec, scalar, LLVMConstInt(LLVMInt32Type(), lane, 0), "");
+
+   return vec;
+}
+
+
+/**
+ * Replicate channel `channel` of every XYZW group of a across that group.
+ */
+LLVMValueRef
+lp_build_broadcast_aos(struct lp_build_context *bld,
+                       LLVMValueRef a,
+                       unsigned channel)
+{
+   const struct lp_type type = bld->type;
+   const unsigned n = type.length;
+   unsigned i, j;
+
+   if(a == bld->undef || a == bld->zero || a == bld->one)
+      return a;
+
+   /* XXX: SSE3 has PSHUFB which should be better than bitmasks, but forcing
+    * using shuffles here actually causes worse results. More investigation is
+    * needed. */
+   if (n <= 4) {
+      /* Shuffle. */
+      LLVMTypeRef elem_type = LLVMInt32Type();
+      LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+
+      for(j = 0; j < n; j += 4)
+         for(i = 0; i < 4; ++i)
+            shuffles[j + i] = LLVMConstInt(elem_type, j + channel, 0);
+
+      return LLVMBuildShuffleVector(bld->builder, a, bld->undef, LLVMConstVector(shuffles, n), "");
+   }
+   else {
+      /*
+       * Bit mask and recursive shifts
+       *
+       *   XYZW XYZW .... XYZW  <= input
+       *   0Y00 0Y00 .... 0Y00
+       *   YY00 YY00 .... YY00
+       *   YYYY YYYY .... YYYY  <= output
+       */
+      struct lp_type type4 = type;
+      /* Must be signed: the sign selects the shift direction (plain char may be unsigned). */
+      const signed char shifts[4][2] = {
+         { 1,  2},
+         {-1,  2},
+         { 1, -2},
+         {-1, -2}
+      };
+      boolean cond[4];
+
+      memset(cond, 0, sizeof cond);
+      cond[channel] = 1;
+
+      a = LLVMBuildAnd(bld->builder, a, lp_build_const_mask_aos(type, cond), "");
+
+      type4.width *= 4;
+      type4.length /= 4;
+
+      a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(type4), "");
+
+      for(i = 0; i < 2; ++i) {
+         LLVMValueRef tmp = NULL;
+         int shift = shifts[channel][i];
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+         shift = -shift;
+#endif
+
+         if(shift > 0)
+            tmp = LLVMBuildLShr(bld->builder, a, lp_build_int_const_scalar(type4, shift*type.width), "");
+         if(shift < 0)
+            tmp = LLVMBuildShl(bld->builder, a, lp_build_int_const_scalar(type4, -shift*type.width), "");
+
+         assert(tmp);
+         if(tmp)
+            a = LLVMBuildOr(bld->builder, a, tmp, "");
+      }
+
+      return LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(type), "");
+   }
+}
+
+
+/**
+ * Swizzle the channels of every XYZW group of a: channel i of the result is
+ * channel swizzle[i] of the same group.
+ */
+LLVMValueRef
+lp_build_swizzle1_aos(struct lp_build_context *bld,
+                      LLVMValueRef a,
+                      const unsigned char swizzle[4])
+{
+   const unsigned length = bld->type.length;
+   LLVMValueRef indices[LP_MAX_VECTOR_LENGTH];
+   unsigned base, chan;
+
+   /* undef, zero and one are handed back untouched. */
+   if(a == bld->undef || a == bld->zero || a == bld->one)
+      return a;
+
+   /* Four identical selectors amount to a broadcast. */
+   if(swizzle[0] == swizzle[1] && swizzle[1] == swizzle[2] && swizzle[2] == swizzle[3])
+      return lp_build_broadcast_aos(bld, a, swizzle[0]);
+
+   /* Otherwise build a shuffle of a with itself. */
+   for(base = 0; base < length; base += 4)
+      for(chan = 0; chan < 4; ++chan)
+         indices[base + chan] = LLVMConstInt(LLVMInt32Type(), base + swizzle[chan], 0);
+
+   return LLVMBuildShuffleVector(bld->builder, a, bld->undef, LLVMConstVector(indices, length), "");
+}
+
+
+/**
+ * Swizzle channels out of two AoS vectors.
+ *
+ * For every XYZW group, channel i of the result is channel swizzle[i] % 4 of
+ * a when swizzle[i] is in 0..3, and of b when it is in 4..7.
+ */
+LLVMValueRef
+lp_build_swizzle2_aos(struct lp_build_context *bld,
+                      LLVMValueRef a,
+                      LLVMValueRef b,
+                      const unsigned char swizzle[4])
+{
+   const unsigned n = bld->type.length;
+   unsigned i, j;
+
+   if(swizzle[0] < 4 && swizzle[1] < 4 && swizzle[2] < 4 && swizzle[3] < 4)
+      return lp_build_swizzle1_aos(bld, a, swizzle);
+
+   if(a == b) {
+      unsigned char swizzle1[4];
+      for(i = 0; i < 4; ++i)
+         swizzle1[i] = swizzle[i] % 4;
+      return lp_build_swizzle1_aos(bld, a, swizzle1);
+   }
+
+   if(swizzle[0] % 4 == 0 && swizzle[1] % 4 == 1 &&
+      swizzle[2] % 4 == 2 && swizzle[3] % 4 == 3) {
+      /* Channels stay in place: per-channel select.  lp_build_select_aos()
+       * returns a where cond[] is true, so flag the channels that come from a. */
+      boolean cond[4];
+      for(i = 0; i < 4; ++i)
+         cond[i] = swizzle[i] < 4;
+      return lp_build_select_aos(bld, a, b, cond);
+   }
+
+   {
+      /* General case: shuffle; indices n and up address b. */
+      LLVMTypeRef elem_type = LLVMInt32Type();
+      LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+
+      for(j = 0; j < n; j += 4)
+         for(i = 0; i < 4; ++i)
+            shuffles[j + i] = LLVMConstInt(elem_type, j + (swizzle[i] % 4) + (swizzle[i] / 4 * n), 0);
+
+      return LLVMBuildShuffleVector(bld->builder, a, b, LLVMConstVector(shuffles, n), "");
+   }
+}
+
+
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_swizzle.h b/src/gallium/drivers/llvmpipe/lp_bld_swizzle.h
new file mode 100644
index 0000000000..1f6da80448
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_swizzle.h
@@ -0,0 +1,91 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Helper functions for swizzling/shuffling.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#ifndef LP_BLD_SWIZZLE_H
+#define LP_BLD_SWIZZLE_H
+
+
+#include <llvm-c/Core.h>  
+
+
+struct lp_type;   /* forward declaration only: this header must not define an object */
+struct lp_build_context;
+
+
+LLVMValueRef
+lp_build_broadcast(LLVMBuilderRef builder,
+                   LLVMTypeRef vec_type,
+                   LLVMValueRef scalar);
+
+
+LLVMValueRef
+lp_build_broadcast_scalar(struct lp_build_context *bld,
+                          LLVMValueRef scalar);
+
+
+/**
+ * Broadcast one channel of a vector composed of arrays of XYZW structures into
+ * all four channels.
+ */
+LLVMValueRef
+lp_build_broadcast_aos(struct lp_build_context *bld,
+                       LLVMValueRef a,
+                       unsigned channel);
+
+
+/**
+ * Swizzle a vector consisting of an array of XYZW structs.
+ *
+ * @param swizzle  each entry is in the [0,4[ range.
+ */
+LLVMValueRef
+lp_build_swizzle1_aos(struct lp_build_context *bld,
+                      LLVMValueRef a,
+                      const unsigned char swizzle[4]);
+
+
+/**
+ * Swizzle two vectors, each consisting of an array of XYZW structs.
+ *
+ * @param swizzle  each entry is in the [0,8[ range. Values in the [4,8[ range refer to b.
+ */
+LLVMValueRef
+lp_build_swizzle2_aos(struct lp_build_context *bld,
+                      LLVMValueRef a,
+                      LLVMValueRef b,
+                      const unsigned char swizzle[4]);
+
+
+#endif /* !LP_BLD_SWIZZLE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_tgsi.h b/src/gallium/drivers/llvmpipe/lp_bld_tgsi.h
new file mode 100644
index 0000000000..eddb7a83fa
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_tgsi.h
@@ -0,0 +1,84 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * TGSI to LLVM IR translation.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#ifndef LP_BLD_TGSI_H
+#define LP_BLD_TGSI_H
+
+#include <llvm-c/Core.h>
+
+
+struct tgsi_token;
+struct lp_type;
+struct lp_build_context;
+struct lp_build_mask_context;
+
+
+/**
+ * Sampler code generation interface.
+ *
+ * Although texture sampling is a requirement for TGSI translation, it is
+ * a very different problem with several different approaches to it. This
+ * structure establishes an interface for texture sampling code generation, so
+ * that we can easily use different texture sampling strategies.
+ */
+struct lp_build_sampler_soa
+{
+   void
+   (*destroy)( struct lp_build_sampler_soa *sampler );   /* NOTE(review): presumably releases the object; no caller visible here -- confirm */
+
+   void
+   (*emit_fetch_texel)( struct lp_build_sampler_soa *sampler,
+                        LLVMBuilderRef builder,
+                        struct lp_type type,
+                        unsigned unit,               /* sampler unit; emit_tex() passes the Index of source register 1 */
+                        unsigned num_coords,         /* count of entries in coords (1 to 3 as called from emit_tex()) */
+                        const LLVMValueRef *coords,
+                        LLVMValueRef lodbias,        /* emit_tex() passes channel 3 of source 0, or the zero constant */
+                        LLVMValueRef *texel);        /* NOTE(review): presumably where the result channels are returned -- confirm */
+};
+
+
+void
+lp_build_tgsi_soa(LLVMBuilderRef builder,
+                  const struct tgsi_token *tokens,
+                  struct lp_type type,
+                  struct lp_build_mask_context *mask,
+                  LLVMValueRef consts_ptr,
+                  const LLVMValueRef *pos,
+                  const LLVMValueRef (*inputs)[4],
+                  LLVMValueRef (*outputs)[4],
+                  struct lp_build_sampler_soa *sampler);
+
+
+#endif /* LP_BLD_TGSI_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
new file mode 100644
index 0000000000..adc81569ed
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
@@ -0,0 +1,1484 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 VMware, Inc.
+ * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * @file
+ * TGSI to LLVM IR translation -- SoA.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ *
+ * Based on tgsi_sse2.c code written by Michal Krol, Keith Whitwell,
+ * Brian Paul, and others.
+ */
+
+#include "pipe/p_config.h"
+#include "pipe/p_shader_tokens.h"
+#include "util/u_debug.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "tgsi/tgsi_info.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "tgsi/tgsi_exec.h"
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_intr.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_swizzle.h"
+#include "lp_bld_flow.h"
+#include "lp_bld_tgsi.h"
+#include "lp_bld_debug.h"
+
+
+#define LP_MAX_TEMPS 256        /* rows in lp_build_tgsi_soa_context::temps */
+#define LP_MAX_IMMEDIATES 256   /* rows in lp_build_tgsi_soa_context::immediates */
+
+/* Loop header running the following statement with CHAN = 0 .. NUM_CHANNELS-1. */
+#define FOR_EACH_CHANNEL( CHAN )\
+   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
+/* Non-zero when bit CHAN is set in the write mask of INST's first destination register. */
+#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
+   ((INST)->FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
+/* Bare `if` on the test above; a following `else` would bind to this hidden `if`. */
+#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
+   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
+/* Loop over all channels, running the following statement only for write-enabled ones. */
+#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
+   FOR_EACH_CHANNEL( CHAN )\
+      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
+/* Channel indices, used as the chan_index argument of emit_fetch() and as dst0[] subscripts. */
+#define CHAN_X 0
+#define CHAN_Y 1
+#define CHAN_Z 2
+#define CHAN_W 3
+/* Lane index given to each pixel of a quad; the swizzle_* tables are built from these. */
+#define QUAD_TOP_LEFT     0
+#define QUAD_TOP_RIGHT    1
+#define QUAD_BOTTOM_LEFT  2
+#define QUAD_BOTTOM_RIGHT 3
+
+
+struct lp_build_tgsi_soa_context   /* translation state shared by the emit_* helpers */
+{
+   struct lp_build_context base;   /* handed to the lp_build_* helpers as &bld->base */
+
+   LLVMValueRef consts_ptr;        /* emit_fetch() loads constants from element Index*4 + channel */
+   const LLVMValueRef *pos;
+   const LLVMValueRef (*inputs)[NUM_CHANNELS];   /* read by emit_fetch() for TGSI_FILE_INPUT */
+   LLVMValueRef (*outputs)[NUM_CHANNELS];        /* written by emit_store() for TGSI_FILE_OUTPUT */
+
+   struct lp_build_sampler_soa *sampler;         /* emit_tex() issues texel fetches through this */
+
+   LLVMValueRef immediates[LP_MAX_IMMEDIATES][NUM_CHANNELS];   /* read by emit_fetch() for TGSI_FILE_IMMEDIATE */
+   LLVMValueRef temps[LP_MAX_TEMPS][NUM_CHANNELS];   /* values stored by emit_store(); a NULL entry fetches as undef */
+
+   struct lp_build_mask_context *mask;   /* updated by emit_kil() via lp_build_mask_update() */
+};
+
+
+/*
+ * Lane selectors for lp_build_swizzle1_aos(): within every group of four
+ * lanes, lane k of the result is read from lane table[k] of the source.
+ * emit_ddx()/emit_ddy() use them to line up the two pixels they subtract.
+ */
+
+/* left-column pixel of each quad row */
+static const unsigned char swizzle_left[4] = {
+   QUAD_TOP_LEFT, QUAD_TOP_LEFT, QUAD_BOTTOM_LEFT, QUAD_BOTTOM_LEFT
+};
+/* right-column pixel of each quad row */
+static const unsigned char swizzle_right[4] = {
+   QUAD_TOP_RIGHT, QUAD_TOP_RIGHT, QUAD_BOTTOM_RIGHT, QUAD_BOTTOM_RIGHT
+};
+
+/* top-row pixel of each quad column */
+static const unsigned char swizzle_top[4] = {
+   QUAD_TOP_LEFT, QUAD_TOP_RIGHT, QUAD_TOP_LEFT, QUAD_TOP_RIGHT
+};
+/* bottom-row pixel of each quad column */
+static const unsigned char swizzle_bottom[4] = {
+   QUAD_BOTTOM_LEFT, QUAD_BOTTOM_RIGHT, QUAD_BOTTOM_LEFT, QUAD_BOTTOM_RIGHT
+};
+
+
+/* Per-quad horizontal difference: right-column pixel minus left-column pixel. */
+static LLVMValueRef
+emit_ddx(struct lp_build_tgsi_soa_context *bld, LLVMValueRef src)
+{
+   LLVMValueRef lhs = lp_build_swizzle1_aos(&bld->base, src, swizzle_left);
+   LLVMValueRef rhs = lp_build_swizzle1_aos(&bld->base, src, swizzle_right);
+   return lp_build_sub(&bld->base, rhs, lhs);
+}
+
+
+/* Per-quad vertical difference: top-row pixel minus bottom-row pixel. */
+static LLVMValueRef
+emit_ddy(struct lp_build_tgsi_soa_context *bld, LLVMValueRef src)
+{
+   LLVMValueRef upper = lp_build_swizzle1_aos(&bld->base, src, swizzle_top);
+   LLVMValueRef lower = lp_build_swizzle1_aos(&bld->base, src, swizzle_bottom);
+   return lp_build_sub(&bld->base, upper, lower);
+}
+
+
+/**
+ * Fetch one channel of a source operand.
+ *
+ * Looks up the extended swizzle of source register `index` for channel
+ * `chan_index`, picks up the value it selects (a register channel or the
+ * 0/1 constant) and then applies the operand's sign mode.
+ *
+ * Unsupported swizzles or register files assert and return undef.  A
+ * temporary with no stored value returns undef, skipping the sign mode.
+ */
+static LLVMValueRef
+emit_fetch(
+   struct lp_build_tgsi_soa_context *bld,
+   const struct tgsi_full_instruction *inst,
+   unsigned index,
+   const unsigned chan_index )
+{
+   const struct tgsi_full_src_register *src = &inst->FullSrcRegisters[index];
+   const unsigned swz = tgsi_util_get_full_src_register_extswizzle( src, chan_index );
+   struct lp_build_context *base = &bld->base;
+   LLVMValueRef res;
+   unsigned sign_mode;
+
+   /* The two constant swizzles need no register access. */
+   if (swz == TGSI_EXTSWIZZLE_ZERO) {
+      res = base->zero;
+   }
+   else if (swz == TGSI_EXTSWIZZLE_ONE) {
+      res = base->one;
+   }
+   else if (swz == TGSI_EXTSWIZZLE_X || swz == TGSI_EXTSWIZZLE_Y ||
+            swz == TGSI_EXTSWIZZLE_Z || swz == TGSI_EXTSWIZZLE_W) {
+      /* A real channel: read it from the operand's register file. */
+      switch (src->SrcRegister.File) {
+      case TGSI_FILE_CONSTANT: {
+         /* Load the scalar at element Index*4 + channel and broadcast it. */
+         LLVMValueRef offset = LLVMConstInt(LLVMInt32Type(), src->SrcRegister.Index*4 + swz, 0);
+         LLVMValueRef ptr = LLVMBuildGEP(base->builder, bld->consts_ptr, &offset, 1, "");
+         res = lp_build_broadcast_scalar(base, LLVMBuildLoad(base->builder, ptr, ""));
+         break;
+      }
+
+      case TGSI_FILE_IMMEDIATE:
+         res = bld->immediates[src->SrcRegister.Index][swz];
+         assert(res);
+         break;
+
+      case TGSI_FILE_INPUT:
+         res = bld->inputs[src->SrcRegister.Index][swz];
+         assert(res);
+         break;
+
+      case TGSI_FILE_TEMPORARY:
+         res = bld->temps[src->SrcRegister.Index][swz];
+         if(!res)
+            return base->undef;
+         break;
+
+      default:
+         assert( 0 );
+         return base->undef;
+      }
+   }
+   else {
+      assert( 0 );
+      return base->undef;
+   }
+
+   sign_mode = tgsi_util_get_full_src_register_sign_mode( src, chan_index );
+
+   /*
+    * TGSI_UTIL_SIGN_CLEAR  ->  |x|
+    * TGSI_UTIL_SIGN_SET    -> -|x|
+    * TGSI_UTIL_SIGN_TOGGLE -> -x
+    * TGSI_UTIL_SIGN_KEEP   ->  x
+    * TODO: Use bitwise OR for the SIGN_SET case on floating point
+    */
+   if (sign_mode == TGSI_UTIL_SIGN_CLEAR || sign_mode == TGSI_UTIL_SIGN_SET)
+      res = lp_build_abs( base, res );
+
+   if (sign_mode == TGSI_UTIL_SIGN_SET || sign_mode == TGSI_UTIL_SIGN_TOGGLE)
+      res = LLVMBuildNeg( base->builder, res, "" );
+
+   return res;
+}
+
+
+/**
+ * Fetch one source channel and, on request, its ddx/ddy quad differences.
+ *
+ * res, ddx and ddy are optional outputs: a NULL pointer skips that result
+ * (and, for the derivatives, the code that would compute it).
+ */
+static void
+emit_fetch_deriv(
+   struct lp_build_tgsi_soa_context *bld,
+   const struct tgsi_full_instruction *inst,
+   unsigned index,
+   const unsigned chan_index,
+   LLVMValueRef *res,
+   LLVMValueRef *ddx,
+   LLVMValueRef *ddy)
+{
+   const LLVMValueRef value = emit_fetch(bld, inst, index, chan_index);
+
+   if(res)
+      *res = value;
+
+   /* TODO: use interpolation coeffs for inputs */
+   if(ddx)
+      *ddx = emit_ddx(bld, value);
+
+   if(ddy)
+      *ddy = emit_ddy(bld, value);
+}
+
+
+/**
+ * Store one channel of an instruction result.
+ *
+ * Clamps value according to the instruction's saturate mode, then records
+ * it in the output or temporary slot named by destination register `index`.
+ * Stores to the address file are not implemented and assert.
+ */
+static void
+emit_store(
+   struct lp_build_tgsi_soa_context *bld,
+   const struct tgsi_full_instruction *inst,
+   unsigned index,
+   unsigned chan_index,
+   LLVMValueRef value)
+{
+   const struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[index];
+   struct lp_build_context *base = &bld->base;
+   LLVMValueRef lower = base->zero;   /* lower clamp bound when saturating */
+
+   switch( inst->Instruction.Saturate ) {
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      lower = lp_build_const_scalar(base->type, -1.0);
+      /* fall through: both modes share the upper bound of one */
+   case TGSI_SAT_ZERO_ONE:
+      value = lp_build_max(base, value, lower);
+      value = lp_build_min(base, value, base->one);
+      break;
+   case TGSI_SAT_NONE:
+      break;
+   default:
+      assert(0);
+   }
+
+   /* Record the (possibly clamped) value in the destination's slot. */
+   switch( dst->DstRegister.File ) {
+   case TGSI_FILE_TEMPORARY:
+      bld->temps[dst->DstRegister.Index][chan_index] = value;
+      break;
+
+   case TGSI_FILE_OUTPUT:
+      bld->outputs[dst->DstRegister.Index][chan_index] = value;
+      break;
+
+   case TGSI_FILE_ADDRESS:   /* FIXME: not implemented */
+   default:
+      assert( 0 );
+      break;
+   }
+}
+
+
+/**
+ * High-level instruction translators.
+ */
+
+
+/**
+ * Emit a texture fetch through the sampler code generator.
+ *
+ * The texture target fixes how many coordinates are fetched from source
+ * register 0; the sampler unit is the index of source register 1.  With
+ * apply_lodbias the LOD bias is channel 3 of source 0 (otherwise zero);
+ * with projected every coordinate is multiplied by the reciprocal of that
+ * same channel.  texel is handed on to emit_fetch_texel() unchanged.
+ */
+static void
+emit_tex( struct lp_build_tgsi_soa_context *bld,
+          const struct tgsi_full_instruction *inst,
+          boolean apply_lodbias,
+          boolean projected,
+          LLVMValueRef *texel)
+{
+   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+   LLVMValueRef coords[3];
+   LLVMValueRef lodbias;
+   LLVMValueRef inv_w = NULL;   /* only set and read when projected */
+   unsigned num_coords;
+   unsigned c;
+
+   switch (inst->InstructionExtTexture.Texture) {
+   case TGSI_TEXTURE_1D:
+      num_coords = 1;
+      break;
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+      num_coords = 2;
+      break;
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
+   case TGSI_TEXTURE_SHADOW1D:
+   case TGSI_TEXTURE_SHADOW2D:
+   case TGSI_TEXTURE_SHADOWRECT:
+      num_coords = 3;
+      break;
+   default:
+      assert(0);
+      return;
+   }
+
+   lodbias = apply_lodbias ? emit_fetch( bld, inst, 0, CHAN_W ) : bld->base.zero;
+
+   if (projected)
+      inv_w = lp_build_rcp(&bld->base, emit_fetch( bld, inst, 0, CHAN_W ));
+
+   for (c = 0; c < num_coords; c++) {
+      LLVMValueRef coord = emit_fetch( bld, inst, 0, c );
+      coords[c] = projected ? lp_build_mul(&bld->base, coord, inv_w) : coord;
+   }
+
+   bld->sampler->emit_fetch_texel(bld->sampler, bld->base.builder, bld->base.type,
+                                  unit, num_coords, coords, lodbias, texel);
+}
+
+
+/**
+ * KIL: test the components of source operand 0 against zero and pass the
+ * AND of the per-component "value >= 0" masks to lp_build_mask_update()
+ * on bld->mask.  Each distinct register channel is tested once.
+ */
+static void
+emit_kil(
+   struct lp_build_tgsi_soa_context *bld,
+   const struct tgsi_full_instruction *inst )
+{
+   const struct tgsi_full_src_register *src = &inst->FullSrcRegisters[0];
+   LLVMValueRef terms[NUM_CHANNELS];
+   LLVMValueRef keep = NULL;
+   unsigned chan_index;
+
+   memset(&terms, 0, sizeof terms);
+
+   /* Pass 1: fetch every distinct register channel the operand refers to. */
+   FOR_EACH_CHANNEL( chan_index ) {
+      const unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, chan_index );
+
+      /* The constants 0.0 and 1.0 are never below zero: nothing to test. */
+      if(swizzle == TGSI_EXTSWIZZLE_ZERO || swizzle == TGSI_EXTSWIZZLE_ONE)
+         continue;
+
+      assert(swizzle < NUM_CHANNELS);
+
+      /* TODO: change the comparison operator instead of setting the sign */
+      if( terms[swizzle] == NULL )
+         terms[swizzle] = emit_fetch(bld, inst, 0, chan_index );
+   }
+
+   /* Pass 2: AND together one "value >= 0" mask per fetched channel. */
+   FOR_EACH_CHANNEL( chan_index ) {
+      LLVMValueRef chan_mask;
+
+      if(terms[chan_index] == NULL)
+         continue;
+
+      chan_mask = lp_build_cmp(&bld->base, PIPE_FUNC_GEQUAL, terms[chan_index], bld->base.zero);
+      keep = keep ? LLVMBuildAnd(bld->base.builder, keep, chan_mask, "") : chan_mask;
+   }
+
+   /* Nothing was fetched (only 0/1 constants): leave the mask untouched. */
+   if(keep)
+      lp_build_mask_update(bld->mask, keep);
+}
+
+
+/** Report whether any source or destination operand of inst addresses the
+ *  temporary register file indirectly. */
+static boolean
+indirect_temp_reference(const struct tgsi_full_instruction *inst)
+{
+   uint n;
+
+   for (n = 0; n < inst->Instruction.NumSrcRegs; n++) {
+      if (inst->FullSrcRegisters[n].SrcRegister.File != TGSI_FILE_TEMPORARY)
+         continue;
+      if (inst->FullSrcRegisters[n].SrcRegister.Indirect)
+         return TRUE;
+   }
+
+   for (n = 0; n < inst->Instruction.NumDstRegs; n++) {
+      if (inst->FullDstRegisters[n].DstRegister.File != TGSI_FILE_TEMPORARY)
+         continue;
+      if (inst->FullDstRegisters[n].DstRegister.Indirect)
+         return TRUE;
+   }
+   return FALSE;
+}
+
+
+static int
+emit_instruction(
+   struct lp_build_tgsi_soa_context *bld,
+   const struct tgsi_full_instruction *inst,
+   const struct tgsi_opcode_info *info)
+{
+   unsigned chan_index;
+   LLVMValueRef src0, src1, src2;
+   LLVMValueRef tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+   LLVMValueRef res;
+   LLVMValueRef dst0[NUM_CHANNELS];
+
+   /* we can't handle indirect addressing into temp register file yet */
+   if (indirect_temp_reference(inst))
+      return FALSE;
+
+   assert(info->num_dst <= 1);
+   if(info->num_dst) {
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = bld->base.undef;
+      }
+   }
+
+   switch (inst->Instruction.Opcode) {
+#if 0
+   case TGSI_OPCODE_ARL:
+      /* FIXME */
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         emit_flr(bld, 0, 0);
+         emit_f2it( bld, 0 );
+         dst0[chan_index] = tmp0;
+      }
+      break;
+#endif
+
+   case TGSI_OPCODE_MOV:
+   case TGSI_OPCODE_SWZ:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = emit_fetch( bld, inst, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LIT:
+      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ) {
+         dst0[CHAN_X] = bld->base.one;
+      }
+      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ) {
+         src0 = emit_fetch( bld, inst, 0, CHAN_X );
+         dst0[CHAN_Y] = lp_build_max( &bld->base, src0, bld->base.zero);
+      }
+      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) ) {
+         /* XMM[1] = SrcReg[0].yyyy */
+         tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );
+         /* XMM[1] = max(XMM[1], 0) */
+         tmp1 = lp_build_max( &bld->base, tmp1, bld->base.zero);
+         /* XMM[2] = SrcReg[0].wwww */
+         tmp2 = emit_fetch( bld, inst, 0, CHAN_W );
+         tmp1 = lp_build_pow( &bld->base, tmp1, tmp2);
+         tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
+         tmp2 = lp_build_cmp(&bld->base, PIPE_FUNC_GREATER, tmp0, bld->base.zero);
+         dst0[CHAN_Z] = lp_build_select(&bld->base, tmp2, tmp1, bld->base.zero);
+      }
+      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) ) {
+         dst0[CHAN_W] = bld->base.one;
+      }
+      break;
+
+   case TGSI_OPCODE_RCP:
+   /* TGSI_OPCODE_RECIP */
+      src0 = emit_fetch( bld, inst, 0, CHAN_X );
+      res = lp_build_rcp(&bld->base, src0);
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = res;
+      }
+      break;
+
+   case TGSI_OPCODE_RSQ:
+   /* TGSI_OPCODE_RECIPSQRT */
+      src0 = emit_fetch( bld, inst, 0, CHAN_X );
+      src0 = lp_build_abs(&bld->base, src0);
+      res = lp_build_rsqrt(&bld->base, src0);
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = res;
+      }
+      break;
+
+   case TGSI_OPCODE_EXP:
+      if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ||
+          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z )) {
+         LLVMValueRef *p_exp2_int_part = NULL;
+         LLVMValueRef *p_frac_part = NULL;
+         LLVMValueRef *p_exp2 = NULL;
+
+         src0 = emit_fetch( bld, inst, 0, CHAN_X );
+
+         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
+            p_exp2_int_part = &tmp0;
+         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ))
+            p_frac_part = &tmp1;
+         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
+            p_exp2 = &tmp2;
+
+         lp_build_exp2_approx(&bld->base, src0, p_exp2_int_part, p_frac_part, p_exp2);
+
+         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
+            dst0[CHAN_X] = tmp0;
+         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ))
+            dst0[CHAN_Y] = tmp1;
+         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
+            dst0[CHAN_Z] = tmp2;
+      }
+      /* dst.w = 1.0 */
+      if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_W )) {
+         dst0[CHAN_W] = bld->base.one;
+      }
+      break;
+
+   case TGSI_OPCODE_LOG:
+      if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ||
+          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z )) {
+         LLVMValueRef *p_floor_log2;
+         LLVMValueRef *p_exp;
+         LLVMValueRef *p_log2;
+
+         src0 = emit_fetch( bld, inst, 0, CHAN_X );
+         src0 = lp_build_abs( &bld->base, src0 );
+
+         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
+            p_floor_log2 = &tmp0;
+         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ))
+            p_exp = &tmp1;
+         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
+            p_log2 = &tmp2;
+
+         lp_build_log2_approx(&bld->base, src0, p_exp, p_floor_log2, p_log2);
+
+         /* dst.x = floor(lg2(abs(src.x))) */
+         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
+            dst0[CHAN_X] = tmp0;
+         /* dst.y = abs(src)/ex2(floor(lg2(abs(src.x)))) */
+         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y )) {
+            dst0[CHAN_Y] = lp_build_div( &bld->base, src0, tmp1);
+         }
+         /* dst.z = lg2(abs(src.x)) */
+         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
+            dst0[CHAN_Z] = tmp2;
+      }
+      /* dst.w = 1.0 */
+      if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_W )) {
+         dst0[CHAN_W] = bld->base.one;
+      }
+      break;
+
+   case TGSI_OPCODE_MUL:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         dst0[chan_index] = lp_build_mul(&bld->base, src0, src1);
+      }
+      break;
+
+   case TGSI_OPCODE_ADD:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         dst0[chan_index] = lp_build_add(&bld->base, src0, src1);
+      }
+      break;
+
+   case TGSI_OPCODE_DP3:
+   /* TGSI_OPCODE_DOT3 */
+      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
+      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );
+      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
+      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );
+      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );
+      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
+      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
+      tmp1 = emit_fetch( bld, inst, 0, CHAN_Z );
+      tmp2 = emit_fetch( bld, inst, 1, CHAN_Z );
+      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
+      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = tmp0;
+      }
+      break;
+
+   case TGSI_OPCODE_DP4:
+   /* TGSI_OPCODE_DOT4 */
+      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
+      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );
+      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
+      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );
+      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );
+      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
+      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
+      tmp1 = emit_fetch( bld, inst, 0, CHAN_Z );
+      tmp2 = emit_fetch( bld, inst, 1, CHAN_Z );
+      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
+      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
+      tmp1 = emit_fetch( bld, inst, 0, CHAN_W );
+      tmp2 = emit_fetch( bld, inst, 1, CHAN_W );
+      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
+      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = tmp0;
+      }
+      break;
+
+   case TGSI_OPCODE_DST:
+      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) {
+         dst0[CHAN_X] = bld->base.one;
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) {
+         tmp0 = emit_fetch( bld, inst, 0, CHAN_Y );
+         tmp1 = emit_fetch( bld, inst, 1, CHAN_Y );
+         dst0[CHAN_Y] = lp_build_mul( &bld->base, tmp0, tmp1);
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) {
+         dst0[CHAN_Z] = emit_fetch( bld, inst, 0, CHAN_Z );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) {
+         dst0[CHAN_W] = emit_fetch( bld, inst, 1, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_MIN:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         dst0[chan_index] = lp_build_min( &bld->base, src0, src1 );
+      }
+      break;
+
+   case TGSI_OPCODE_MAX:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         dst0[chan_index] = lp_build_max( &bld->base, src0, src1 );
+      }
+      break;
+
+   case TGSI_OPCODE_SLT:
+   /* TGSI_OPCODE_SETLT */
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_LESS, src0, src1 );
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
+      }
+      break;
+
+   case TGSI_OPCODE_SGE:
+   /* TGSI_OPCODE_SETGE */
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GEQUAL, src0, src1 );
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
+      }
+      break;
+
+   case TGSI_OPCODE_MAD:
+   /* TGSI_OPCODE_MADD */
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         tmp1 = emit_fetch( bld, inst, 1, chan_index );
+         tmp2 = emit_fetch( bld, inst, 2, chan_index );
+         tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
+         tmp0 = lp_build_add( &bld->base, tmp0, tmp2);
+         dst0[chan_index] = tmp0;
+      }
+      break;
+
+   case TGSI_OPCODE_SUB:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         tmp1 = emit_fetch( bld, inst, 1, chan_index );
+         dst0[chan_index] = lp_build_sub( &bld->base, tmp0, tmp1);
+      }
+      break;
+
+   case TGSI_OPCODE_LRP:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         src2 = emit_fetch( bld, inst, 2, chan_index );
+         tmp0 = lp_build_sub( &bld->base, src1, src2 );
+         tmp0 = lp_build_mul( &bld->base, src0, tmp0 );
+         dst0[chan_index] = lp_build_add( &bld->base, tmp0, src2 );
+      }
+      break;
+
+   case TGSI_OPCODE_CND:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         src2 = emit_fetch( bld, inst, 2, chan_index );
+         tmp1 = lp_build_const_scalar(bld->base.type, 0.5);
+         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GREATER, src2, tmp1);
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, src0, src1 );
+      }
+      break;
+
+   case TGSI_OPCODE_DP2A:
+      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );  /* xmm0 = src[0].x */
+      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );  /* xmm1 = src[1].x */
+      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 * xmm1 */
+      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );  /* xmm1 = src[0].y */
+      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );  /* xmm2 = src[1].y */
+      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);              /* xmm1 = xmm1 * xmm2 */
+      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 + xmm1 */
+      tmp1 = emit_fetch( bld, inst, 2, CHAN_X );  /* xmm1 = src[2].x */
+      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 + xmm1 */
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = tmp0;  /* dest[ch] = xmm0 */
+      }
+      break;
+
+   case TGSI_OPCODE_FRC:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         tmp0 = lp_build_floor(&bld->base, src0);
+         tmp0 = lp_build_sub(&bld->base, tmp0, src0);
+         dst0[chan_index] = tmp0;
+      }
+      break;
+
+   case TGSI_OPCODE_CLAMP:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         src2 = emit_fetch( bld, inst, 2, chan_index );
+         tmp0 = lp_build_max(&bld->base, tmp0, src1);
+         tmp0 = lp_build_min(&bld->base, tmp0, src2);
+         dst0[chan_index] = tmp0;
+      }
+      break;
+
+   case TGSI_OPCODE_FLR:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         dst0[chan_index] = lp_build_floor(&bld->base, tmp0);
+      }
+      break;
+
+   case TGSI_OPCODE_ROUND:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         dst0[chan_index] = lp_build_round(&bld->base, tmp0);
+      }
+      break;
+
+   case TGSI_OPCODE_EX2: {
+      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
+      tmp0 = lp_build_exp2( &bld->base, tmp0);
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = tmp0;
+      }
+      break;
+   }
+
+   case TGSI_OPCODE_LG2:
+      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
+      tmp0 = lp_build_log2( &bld->base, tmp0);
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = tmp0;
+      }
+      break;
+
+   case TGSI_OPCODE_POW:
+      src0 = emit_fetch( bld, inst, 0, CHAN_X );
+      src1 = emit_fetch( bld, inst, 1, CHAN_X );
+      res = lp_build_pow( &bld->base, src0, src1 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = res;
+      }
+      break;
+
+   case TGSI_OPCODE_XPD:
+      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ) {
+         tmp1 = emit_fetch( bld, inst, 1, CHAN_Z );
+         tmp3 = emit_fetch( bld, inst, 0, CHAN_Z );
+      }
+      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) ) {
+         tmp0 = emit_fetch( bld, inst, 0, CHAN_Y );
+         tmp4 = emit_fetch( bld, inst, 1, CHAN_Y );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) {
+         tmp2 = tmp0;
+         tmp2 = lp_build_mul( &bld->base, tmp2, tmp1);
+         tmp5 = tmp3;
+         tmp5 = lp_build_mul( &bld->base, tmp5, tmp4);
+         tmp2 = lp_build_sub( &bld->base, tmp2, tmp5);
+         dst0[CHAN_X] = tmp2;
+      }
+      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ||
+          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) ) {
+         tmp2 = emit_fetch( bld, inst, 1, CHAN_X );
+         tmp5 = emit_fetch( bld, inst, 0, CHAN_X );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) {
+         tmp3 = lp_build_mul( &bld->base, tmp3, tmp2);
+         tmp1 = lp_build_mul( &bld->base, tmp1, tmp5);
+         tmp3 = lp_build_sub( &bld->base, tmp3, tmp1);
+         dst0[CHAN_Y] = tmp3;
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) {
+         tmp5 = lp_build_mul( &bld->base, tmp5, tmp4);
+         tmp0 = lp_build_mul( &bld->base, tmp0, tmp2);
+         tmp5 = lp_build_sub( &bld->base, tmp5, tmp0);
+         dst0[CHAN_Z] = tmp5;
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) {
+         dst0[CHAN_W] = bld->base.one;
+      }
+      break;
+
+   case TGSI_OPCODE_ABS:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         dst0[chan_index] = lp_build_abs( &bld->base, tmp0 );
+      }
+      break;
+
+   case TGSI_OPCODE_RCC:
+      /* deprecated? */
+      assert(0);
+      return 0;
+
+   case TGSI_OPCODE_DPH:
+      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
+      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );
+      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
+      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );
+      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );
+      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
+      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
+      tmp1 = emit_fetch( bld, inst, 0, CHAN_Z );
+      tmp2 = emit_fetch( bld, inst, 1, CHAN_Z );
+      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
+      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
+      tmp1 = emit_fetch( bld, inst, 1, CHAN_W );
+      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = tmp0;
+      }
+      break;
+
+   case TGSI_OPCODE_COS:
+      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
+      tmp0 = lp_build_cos( &bld->base, tmp0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = tmp0;
+      }
+      break;
+
+   case TGSI_OPCODE_DDX:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         emit_fetch_deriv( bld, inst, 0, chan_index, NULL, &dst0[chan_index], NULL);
+      }
+      break;
+
+   case TGSI_OPCODE_DDY:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         emit_fetch_deriv( bld, inst, 0, chan_index, NULL, NULL, &dst0[chan_index]);
+      }
+      break;
+
+   case TGSI_OPCODE_KILP:
+      /* predicated kill */
+      /* FIXME */
+      return 0;
+      break;
+
+   case TGSI_OPCODE_KIL:
+      /* conditional kill */
+      emit_kil( bld, inst );
+      break;
+
+   case TGSI_OPCODE_PK2H:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PK2US:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PK4B:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PK4UB:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_RFL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SEQ:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_EQUAL, src0, src1 );
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
+      }
+      break;
+
+   case TGSI_OPCODE_SFL:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = bld->base.zero;
+      }
+      break;
+
+   case TGSI_OPCODE_SGT:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GREATER, src0, src1 );
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
+      }
+      break;
+
+   case TGSI_OPCODE_SIN:
+      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
+      tmp0 = lp_build_sin( &bld->base, tmp0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = tmp0;
+      }
+      break;
+
+   case TGSI_OPCODE_SLE:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_LEQUAL, src0, src1 );
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
+      }
+      break;
+
+   case TGSI_OPCODE_SNE:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_NOTEQUAL, src0, src1 );
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
+      }
+      break;
+
+   case TGSI_OPCODE_STR:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = bld->base.one;
+      }
+      break;
+
+   case TGSI_OPCODE_TEX:
+      emit_tex( bld, inst, FALSE, FALSE, dst0 );
+      break;
+
+   case TGSI_OPCODE_TXD:
+      /* FIXME */
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP2H:
+      /* deprecated */
+      assert (0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP2US:
+      /* deprecated */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP4B:
+      /* deprecated */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP4UB:
+      /* deprecated */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_X2D:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ARA:
+      /* deprecated */
+      assert(0);
+      return 0;
+      break;
+
+#if 0
+   case TGSI_OPCODE_ARR:
+      /* FIXME */
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         emit_rnd( bld, 0, 0 );
+         emit_f2it( bld, 0 );
+         dst0[chan_index] = tmp0;
+      }
+      break;
+#endif
+
+   case TGSI_OPCODE_BRA:
+      /* deprecated */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CAL:
+      /* FIXME */
+      return 0;
+      break;
+
+   case TGSI_OPCODE_RET:
+      /* FIXME */
+      return 0;
+      break;
+
+   case TGSI_OPCODE_END:
+      break;
+
+   case TGSI_OPCODE_SSG:
+   /* TGSI_OPCODE_SGN */
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         dst0[chan_index] = lp_build_sgn( &bld->base, tmp0 );
+      }
+      break;
+
+   case TGSI_OPCODE_CMP:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         src2 = emit_fetch( bld, inst, 2, chan_index );
+         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_LESS, src0, bld->base.zero );
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, src1, src2);
+      }
+      break;
+
+   case TGSI_OPCODE_SCS:
+      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) {
+         tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
+         dst0[CHAN_X] = lp_build_cos( &bld->base, tmp0 );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) {
+         tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
+         dst0[CHAN_Y] = lp_build_sin( &bld->base, tmp0 );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) {
+         dst0[CHAN_Z] = bld->base.zero;
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) {
+         dst0[CHAN_W] = bld->base.one;
+      }
+      break;
+
+   case TGSI_OPCODE_TXB:
+      emit_tex( bld, inst, TRUE, FALSE, dst0 );
+      break;
+
+   case TGSI_OPCODE_NRM:
+      /* fall-through */
+   case TGSI_OPCODE_NRM4:
+      /* 3 or 4-component normalization */
+      {
+         uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
+
+         if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X) ||
+             IS_DST0_CHANNEL_ENABLED(inst, CHAN_Y) ||
+             IS_DST0_CHANNEL_ENABLED(inst, CHAN_Z) ||
+             (IS_DST0_CHANNEL_ENABLED(inst, CHAN_W) && dims == 4)) {
+
+            /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
+
+            /* xmm4 = src.x */
+            /* xmm0 = src.x * src.x */
+            tmp0 = emit_fetch(bld, inst, 0, CHAN_X);
+            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X)) {
+               tmp4 = tmp0;
+            }
+            tmp0 = lp_build_mul( &bld->base, tmp0, tmp0);
+
+            /* xmm5 = src.y */
+            /* xmm0 = xmm0 + src.y * src.y */
+            tmp1 = emit_fetch(bld, inst, 0, CHAN_Y);
+            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Y)) {
+               tmp5 = tmp1;
+            }
+            tmp1 = lp_build_mul( &bld->base, tmp1, tmp1);
+            tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
+
+            /* xmm6 = src.z */
+            /* xmm0 = xmm0 + src.z * src.z */
+            tmp1 = emit_fetch(bld, inst, 0, CHAN_Z);
+            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Z)) {
+               tmp6 = tmp1;
+            }
+            tmp1 = lp_build_mul( &bld->base, tmp1, tmp1);
+            tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
+
+            if (dims == 4) {
+               /* xmm7 = src.w */
+               /* xmm0 = xmm0 + src.w * src.w */
+               tmp1 = emit_fetch(bld, inst, 0, CHAN_W);
+               if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_W)) {
+                  tmp7 = tmp1;
+               }
+               tmp1 = lp_build_mul( &bld->base, tmp1, tmp1);
+               tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
+            }
+
+            /* xmm1 = 1 / sqrt(xmm0) */
+            tmp1 = lp_build_rsqrt( &bld->base, tmp0);
+
+            /* dst.x = xmm1 * src.x */
+            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X)) {
+               dst0[CHAN_X] = lp_build_mul( &bld->base, tmp4, tmp1);
+            }
+
+            /* dst.y = xmm1 * src.y */
+            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Y)) {
+               dst0[CHAN_Y] = lp_build_mul( &bld->base, tmp5, tmp1);
+            }
+
+            /* dst.z = xmm1 * src.z */
+            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Z)) {
+               dst0[CHAN_Z] = lp_build_mul( &bld->base, tmp6, tmp1);
+            }
+
+            /* dst.w = xmm1 * src.w */
+            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X) && dims == 4) {
+               dst0[CHAN_W] = lp_build_mul( &bld->base, tmp7, tmp1);
+            }
+         }
+
+         /* dst.w = 1.0 */
+         if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_W) && dims == 3) {
+            dst0[CHAN_W] = bld->base.one;
+         }
+      }
+      break;
+
+   case TGSI_OPCODE_DIV:
+      /* deprecated */
+      assert( 0 );
+      return 0;
+      break;
+
+   case TGSI_OPCODE_DP2:
+      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );  /* xmm0 = src[0].x */
+      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );  /* xmm1 = src[1].x */
+      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 * xmm1 */
+      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );  /* xmm1 = src[0].y */
+      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );  /* xmm2 = src[1].y */
+      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);              /* xmm1 = xmm1 * xmm2 */
+      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 + xmm1 */
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = tmp0;  /* dest[ch] = xmm0 */
+      }
+      break;
+
+   case TGSI_OPCODE_TXL:
+      emit_tex( bld, inst, TRUE, FALSE, dst0 );
+      break;
+
+   case TGSI_OPCODE_TXP:
+      emit_tex( bld, inst, FALSE, TRUE, dst0 );
+      break;
+      
+   case TGSI_OPCODE_BRK:
+      /* FIXME */
+      return 0;
+      break;
+
+   case TGSI_OPCODE_IF:
+      /* FIXME */
+      return 0;
+      break;
+
+   case TGSI_OPCODE_BGNFOR:
+      /* deprecated */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_REP:
+      /* deprecated */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ELSE:
+      /* FIXME */
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ENDIF:
+      /* FIXME */
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ENDFOR:
+      /* deprecated */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ENDREP:
+      /* deprecated */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PUSHA:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_POPA:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CEIL:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         dst0[chan_index] = lp_build_ceil(&bld->base, tmp0);
+      }
+      break;
+
+   case TGSI_OPCODE_I2F:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_NOT:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TRUNC:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         dst0[chan_index] = lp_build_trunc(&bld->base, tmp0);
+      }
+      break;
+
+   case TGSI_OPCODE_SHL:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SHR:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_AND:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_OR:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_MOD:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_XOR:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SAD:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TXF:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TXQ:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CONT:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_EMIT:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ENDPRIM:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_NOISE1:
+   case TGSI_OPCODE_NOISE2:
+   case TGSI_OPCODE_NOISE3:
+   case TGSI_OPCODE_NOISE4:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = bld->base.zero;
+      }
+      break;
+
+   case TGSI_OPCODE_NOP:
+      break;
+
+   default:
+      return 0;
+   }
+   
+   if(info->num_dst) {
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         emit_store( bld, inst, 0, chan_index, dst0[chan_index]);
+      }
+   }
+
+   return 1;
+}
+
+/** Emit LLVM IR for a TGSI token stream; untranslatable opcodes and out-of-range immediates only warn. */
+void
+lp_build_tgsi_soa(LLVMBuilderRef builder,
+                  const struct tgsi_token *tokens,
+                  struct lp_type type,
+                  struct lp_build_mask_context *mask,
+                  LLVMValueRef consts_ptr,
+                  const LLVMValueRef *pos,
+                  const LLVMValueRef (*inputs)[NUM_CHANNELS],
+                  LLVMValueRef (*outputs)[NUM_CHANNELS],
+                  struct lp_build_sampler_soa *sampler)
+{
+   struct lp_build_tgsi_soa_context bld;
+   struct tgsi_parse_context parse;
+   uint num_immediates = 0;   /* number of immediates[] slots filled so far */
+   unsigned i;
+
+   /* Setup build context; fields not assigned below stay zeroed by the memset */
+   memset(&bld, 0, sizeof bld);
+   lp_build_context_init(&bld.base, builder, type);
+   bld.mask = mask;
+   bld.pos = pos;
+   bld.inputs = inputs;
+   bld.outputs = outputs;
+   bld.consts_ptr = consts_ptr;
+   bld.sampler = sampler;
+
+   tgsi_parse_init( &parse, tokens );
+   while( !tgsi_parse_end_of_tokens( &parse ) ) {
+      tgsi_parse_token( &parse );
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         /* Inputs already interpolated */
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         {
+            unsigned opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
+            const struct tgsi_opcode_info *info = tgsi_get_opcode_info(opcode);
+            if (!emit_instruction( &bld, &parse.FullToken.FullInstruction, info ))
+               _debug_printf("warning: failed to translate tgsi opcode %s to LLVM\n",
+                             info ? info->mnemonic : "<invalid>");
+         }
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         /* simply copy the immediate values into the next immediates[] slot */
+         {
+            const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
+            assert(size <= 4);
+            assert(num_immediates < LP_MAX_IMMEDIATES);
+            if (size > 4 || num_immediates >= LP_MAX_IMMEDIATES) {   /* asserts may be compiled out */
+               _debug_printf("warning: dropping out-of-range tgsi immediate\n");
+               break;
+            }
+            for( i = 0; i < size; ++i )
+               bld.immediates[num_immediates][i] =
+                  lp_build_const_scalar(type, parse.FullToken.FullImmediate.u[i].Float);
+            for( i = size; i < 4; ++i )   /* components the token does not supply */
+               bld.immediates[num_immediates][i] = bld.base.undef;
+            num_immediates++;
+         }
+         break;
+
+      default:
+         assert( 0 );
+      }
+   }
+   tgsi_parse_free( &parse );
+}
+
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_type.c b/src/gallium/drivers/llvmpipe/lp_bld_type.c
new file mode 100644
index 0000000000..606243d6c5
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_type.c
@@ -0,0 +1,182 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "util/u_debug.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+
+/** Return the LLVM scalar type used for one element of the given lp_type. */
+LLVMTypeRef
+lp_build_elem_type(struct lp_type type)
+{
+   /* Every non-floating type is a plain integer of the element width. */
+   if (!type.floating)
+      return LLVMIntType(type.width);
+
+   /* Floating point: only 32 and 64 bit wide elements are handled. */
+   if (type.width == 32)
+      return LLVMFloatType();
+   if (type.width == 64)
+      return LLVMDoubleType();
+
+   /*
+    * Unsupported float width: this is a caller bug, so trip the assert and,
+    * when asserts are compiled out, hand back a float type.
+    */
+   assert(0);
+   return LLVMFloatType();
+}
+
+/** Return the LLVM vector type made of type.length elements of lp_build_elem_type(type). */
+LLVMTypeRef
+lp_build_vec_type(struct lp_type type)
+{
+   /* scalar element type, repeated type.length times */
+   return LLVMVectorType(lp_build_elem_type(type), type.length);
+}
+
+
+/**
+ * Check that an LLVM scalar type is the one lp_build_elem_type() above
+ * would produce for the given lp_type.
+ *
+ * XXX: It might be simpler, and no less efficient, to just recreate the
+ * type and check for identity.
+ *
+ * Returns TRUE on a match and FALSE otherwise, including for a NULL
+ * elem_type when asserts are compiled out.
+ */
+boolean
+lp_check_elem_type(struct lp_type type, LLVMTypeRef elem_type)
+{
+   LLVMTypeKind kind;
+
+   /* A NULL type is a caller bug; fail softly when asserts are disabled. */
+   assert(elem_type);
+   if (!elem_type)
+      return FALSE;
+
+   kind = LLVMGetTypeKind(elem_type);
+
+   if (!type.floating) {
+      /* Integers must agree in kind first, and then in bit width. */
+      if (kind == LLVMIntegerTypeKind &&
+          LLVMGetIntTypeWidth(elem_type) == type.width) {
+         return TRUE;
+      }
+      return FALSE;
+   }
+
+   /* Floats: only the widths lp_build_elem_type() can create are valid. */
+   if (type.width == 32)
+      return (kind == LLVMFloatTypeKind) ? TRUE : FALSE;
+
+   if (type.width == 64)
+      return (kind == LLVMDoubleTypeKind) ? TRUE : FALSE;
+
+   /* Any other float width is unsupported. */
+   assert(0);
+   return FALSE;
+}
+
+/** Check that vec_type is an LLVM vector of type.length elements matching type. */
+boolean
+lp_check_vec_type(struct lp_type type, LLVMTypeRef vec_type)
+{
+   /* A NULL type is a caller bug; fail softly when asserts are disabled. */
+   assert(vec_type);
+   if (!vec_type)
+      return FALSE;
+
+   /* It has to be a vector with exactly the expected number of elements ... */
+   if (LLVMGetTypeKind(vec_type) != LLVMVectorTypeKind ||
+       LLVMGetVectorSize(vec_type) != type.length) {
+      return FALSE;
+   }
+
+   /*
+    * ... and its element type has to pass the scalar check as well.
+    */
+   return lp_check_elem_type(type, LLVMGetElementType(vec_type));
+}
+
+/** Check that the LLVM type of val is the vector type described by type. */
+boolean
+lp_check_value(struct lp_type type, LLVMValueRef val)
+{
+   /* A NULL value is a caller bug; fail softly when asserts are disabled. */
+   assert(val);
+   if (!val)
+      return FALSE;
+
+   /*
+    * The value passes exactly when its LLVM type passes the vector check.
+    */
+   return lp_check_vec_type(type, LLVMTypeOf(val));
+}
+
+/** Return the LLVM integer type that is type.width bits wide; no other field of type is read. */
+LLVMTypeRef
+lp_build_int_elem_type(struct lp_type type)
+{
+   return LLVMIntType(type.width);
+}
+
+/** Return the LLVM vector type of type.length integers, each type.width bits wide. */
+LLVMTypeRef
+lp_build_int_vec_type(struct lp_type type)
+{
+   /* integer element of the same width, repeated type.length times */
+   return LLVMVectorType(lp_build_int_elem_type(type), type.length);
+}
+
+/** Return an lp_type with the same width and length as type and every flag cleared. */
+struct lp_type
+lp_int_type(struct lp_type type)
+{
+   struct lp_type result;
+
+   memset(&result, 0, sizeof result);   /* clears floating, fixed, sign and norm */
+   result.length = type.length;
+   result.width = type.width;
+   return result;
+}
+
+/** Fill in all five fields of bld: the builder, the type, and the undef/zero/one values for that type. */
+void
+lp_build_context_init(struct lp_build_context *bld,
+                      LLVMBuilderRef builder,
+                      struct lp_type type)
+{
+   bld->builder = builder;
+   bld->type = type;
+   bld->undef = lp_build_undef(type);   /* built once here and kept in bld for later use */
+   bld->zero = lp_build_zero(type);
+   bld->one = lp_build_one(type);
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_type.h b/src/gallium/drivers/llvmpipe/lp_bld_type.h
new file mode 100644
index 0000000000..ee5ca3483c
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_type.h
@@ -0,0 +1,175 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Convenient representation of SIMD types.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#ifndef LP_BLD_TYPE_H
+#define LP_BLD_TYPE_H
+
+
+#include <llvm-c/Core.h>  
+
+#include <pipe/p_compiler.h>
+
+
+/**
+ * Several functions can only cope with vectors of length up to this value.
+ * You may need to increase that value if you want to represent bigger vectors.
+ */
+#define LP_MAX_VECTOR_LENGTH 16
+
+#define LP_MAX_TYPE_WIDTH 64
+
+
+/**
+ * The LLVM type system can't conveniently express all the things we care about
+ * on the types used for intermediate computations, such as signed vs unsigned,
+ * normalized values, or fixed point.
+ */
+struct lp_type {
+   /**
+    * Floating-point. Cannot be used with fixed. Integer types are
+    * represented by leaving this bit zero.
+    */
+   unsigned floating:1;
+
+   /**
+    * Fixed-point. Cannot be used with floating. Integer types are
+    * represented by leaving this bit zero.
+    */
+   unsigned fixed:1;
+
+   /**
+    * Whether it can represent negative values or not.
+    *
+    * If this is not set for floating point, it means that all values are
+    * assumed to be positive.
+    */
+   unsigned sign:1;
+
+   /**
+    * Whether values are normalized to fit the [0, 1] interval, or the
+    * [-1, 1] interval for signed types.
+    *
+    * For integer types it means the representable integer range should be
+    * interpreted as the interval above.
+    *
+    * For floating and fixed point formats it means the values should be
+    * clamped to the interval above.
+    */
+   unsigned norm:1;
+
+   /**
+    * Element width, in bits.
+    *
+    * For fixed point values, the fixed point is assumed to be at half the
+    * width.
+    */
+   unsigned width:14;
+
+   /**
+    * Vector length, in elements.
+    *
+    * width*length should be a power of two greater than or equal to eight.
+    *
+    * @sa LP_MAX_VECTOR_LENGTH
+    */
+   unsigned length:14;
+};
+
+
+/**
+ * We need most of the information here in order to correctly and efficiently
+ * translate an arithmetic operation into LLVM IR. Putting it here avoids the
+ * trouble of passing it as parameters.
+ */
+struct lp_build_context
+{
+   LLVMBuilderRef builder;   /**< LLVM builder handle, stored as given to lp_build_context_init() */
+
+   /**
+    * This not only describes the input/output LLVM types, but also whether
+    * to normalize/clamp the results.
+    */
+   struct lp_type type;
+
+   /** Same as lp_build_undef(type) */
+   LLVMValueRef undef;
+
+   /** Same as lp_build_zero(type) */
+   LLVMValueRef zero;
+
+   /** Same as lp_build_one(type) */
+   LLVMValueRef one;
+};
+
+/** LLVM scalar type for one element: float/double when floating, else an integer of type.width bits. */
+LLVMTypeRef
+lp_build_elem_type(struct lp_type type);
+
+/** LLVM vector type holding type.length elements of lp_build_elem_type(type). */
+LLVMTypeRef
+lp_build_vec_type(struct lp_type type);
+
+/** TRUE when elem_type is the scalar type lp_build_elem_type(type) would produce. */
+boolean
+lp_check_elem_type(struct lp_type type, LLVMTypeRef elem_type);
+
+/** TRUE when vec_type is a vector of type.length elements that pass lp_check_elem_type(). */
+boolean
+lp_check_vec_type(struct lp_type type, LLVMTypeRef vec_type);
+
+/** TRUE when the LLVM type of val passes lp_check_vec_type(). */
+boolean
+lp_check_value(struct lp_type type, LLVMValueRef val);
+
+/** LLVM integer type of type.width bits, whatever kind of type this is. */
+LLVMTypeRef
+lp_build_int_elem_type(struct lp_type type);
+
+/** LLVM vector type holding type.length integers of type.width bits. */
+LLVMTypeRef
+lp_build_int_vec_type(struct lp_type type);
+
+/** Copy of type that keeps width and length but has every flag cleared. */
+struct lp_type
+lp_int_type(struct lp_type type);
+
+/** Initialise bld with the builder, the type and that type's undef/zero/one values. */
+void
+lp_build_context_init(struct lp_build_context *bld,
+                      LLVMBuilderRef builder,
+                      struct lp_type type);
+
+
+#endif /* !LP_BLD_TYPE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_buffer.c b/src/gallium/drivers/llvmpipe/lp_buffer.c
new file mode 100644
index 0000000000..66f1f8e138
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_buffer.c
@@ -0,0 +1,150 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "lp_winsys.h"
+#include "lp_screen.h"
+#include "lp_texture.h"
+#include "lp_buffer.h"
+
+/** Map a buffer by returning its storage pointer; screen and flags are not used. */
+static void *
+llvmpipe_buffer_map(struct pipe_screen *screen,
+                    struct pipe_buffer *buf,
+                    unsigned flags)
+{
+   /* the stored data pointer is handed out as-is */
+   return llvmpipe_buffer(buf)->data;
+}
+
+/** Unmap hook; empty, since llvmpipe_buffer_map() only returns the data pointer and leaves nothing to undo. */
+static void
+llvmpipe_buffer_unmap(struct pipe_screen *screen,
+                      struct pipe_buffer *buf)
+{
+}
+
+/** Free a buffer object; its storage is freed as well unless it wraps user memory. */
+static void
+llvmpipe_buffer_destroy(struct pipe_buffer *buf)
+{
+   struct llvmpipe_buffer *lp_buf = llvmpipe_buffer(buf);
+
+   /* Storage wrapped by a user buffer was supplied by the caller; leave it alone. */
+   if (lp_buf->userBuffer == FALSE)
+      align_free(lp_buf->data);
+   FREE(lp_buf);
+}
+
+/** Create a buffer with its own aligned storage; returns NULL if either allocation fails. */
+static struct pipe_buffer *
+llvmpipe_buffer_create(struct pipe_screen *screen, unsigned alignment,
+                       unsigned usage, unsigned size)
+{
+   struct llvmpipe_buffer *buffer = CALLOC_STRUCT(llvmpipe_buffer);
+   if(!buffer)   /* same check as llvmpipe_user_buffer_create() */
+      return NULL;
+   pipe_reference_init(&buffer->base.reference, 1);
+   buffer->base.screen = screen;
+   buffer->base.alignment = MAX2(alignment, 16);   /* never advertise less than 16 */
+   buffer->base.usage = usage;
+   buffer->base.size = size;
+   buffer->data = align_malloc(size, buffer->base.alignment);   /* honour the recorded alignment */
+   if(buffer->data)
+      return &buffer->base;
+   FREE(buffer);   /* no storage: undo the struct allocation instead of returning a dataless buffer */
+   return NULL;
+}
+
+
+/**
+ * Wrap caller-supplied memory in a buffer object without copying it.
+ * Returns NULL if the buffer object itself cannot be allocated.
+ */
+static struct pipe_buffer *
+llvmpipe_user_buffer_create(struct pipe_screen *screen,
+                            void *ptr,
+                            unsigned bytes)
+{
+   struct llvmpipe_buffer *wrapper = CALLOC_STRUCT(llvmpipe_buffer);
+
+   if(!wrapper)
+      return NULL;
+
+   /* userBuffer tells llvmpipe_buffer_destroy() not to free the storage */
+   wrapper->userBuffer = TRUE;
+   wrapper->data = ptr;
+   pipe_reference_init(&wrapper->base.reference, 1);
+   wrapper->base.screen = screen;
+   wrapper->base.size = bytes;
+   return &wrapper->base;
+}
+
+/** Fence reference hook; currently a stub that leaves *ptr and fence untouched. */
+static void
+llvmpipe_fence_reference(struct pipe_screen *screen,
+                         struct pipe_fence_handle **ptr,
+                         struct pipe_fence_handle *fence)
+{
+}
+
+/** Fence query hook; a stub that ignores its arguments and always returns 0. */
+static int
+llvmpipe_fence_signalled(struct pipe_screen *screen,
+                         struct pipe_fence_handle *fence,
+                         unsigned flag)
+{
+   return 0;
+}
+
+/** Fence wait hook; a stub that ignores its arguments and returns 0 immediately. */
+static int
+llvmpipe_fence_finish(struct pipe_screen *screen,
+                      struct pipe_fence_handle *fence,
+                      unsigned flag)
+{
+   return 0;
+}
+
+/** Install the buffer and fence entry points defined in this file on screen. */
+void
+llvmpipe_init_screen_buffer_funcs(struct pipe_screen *screen)
+{
+   /* buffer entry points */
+   screen->buffer_create = llvmpipe_buffer_create;
+   screen->user_buffer_create = llvmpipe_user_buffer_create;
+   screen->buffer_destroy = llvmpipe_buffer_destroy;
+   screen->buffer_map = llvmpipe_buffer_map;
+   screen->buffer_unmap = llvmpipe_buffer_unmap;
+   /* fence entry points */
+   screen->fence_reference = llvmpipe_fence_reference;
+   screen->fence_signalled = llvmpipe_fence_signalled;
+   screen->fence_finish = llvmpipe_fence_finish;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_buffer.h b/src/gallium/drivers/llvmpipe/lp_buffer.h
new file mode 100644
index 0000000000..d6b8184a0b
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_buffer.h
@@ -0,0 +1,55 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef LP_BUFFER_H
+#define LP_BUFFER_H
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+
+
+struct llvmpipe_buffer
+{
+   struct pipe_buffer base;  /**< base class; must stay first for the llvmpipe_buffer() cast */
+   boolean userBuffer;  /**< Is this a user-space buffer? */
+   void *data;  /**< buffer storage; the caller's pointer when userBuffer is set */
+};
+
+
+/** Cast wrapper: view a pipe_buffer as the llvmpipe_buffer embedding it (unchecked) */
+static INLINE struct llvmpipe_buffer *
+llvmpipe_buffer( struct pipe_buffer *buf )
+{
+   return (struct llvmpipe_buffer *)buf;
+}
+
+
+void
+llvmpipe_init_screen_buffer_funcs(struct pipe_screen *screen);
+
+
+#endif /* LP_BUFFER_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_clear.c b/src/gallium/drivers/llvmpipe/lp_clear.c
new file mode 100644
index 0000000000..bdcff94b9b
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_clear.c
@@ -0,0 +1,81 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * Copyright 2009 VMware, Inc.  All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Author:
+ *    Brian Paul
+ *    Michel Dänzer
+ */
+
+
+#include "pipe/p_defines.h"
+#include "util/u_pack_color.h"
+#include "lp_clear.h"
+#include "lp_context.h"
+#include "lp_surface.h"
+#include "lp_state.h"
+#include "lp_tile_cache.h"
+
+
+/**
+ * Clear the given buffers to the specified values; unbound surfaces are skipped.
+ * No masking, no scissor (clear entire buffer).
+ */
+void
+llvmpipe_clear(struct pipe_context *pipe, unsigned buffers, const float *rgba,
+               double depth, unsigned stencil)
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+   unsigned cv; /* clear value packed for the target surface's format */
+   uint i;
+
+   if (llvmpipe->no_rast)
+      return; /* no_rast set: clears are skipped too */
+
+#if 0
+   llvmpipe_update_derived(llvmpipe); /* not needed?? */
+#endif
+
+   if (buffers & PIPE_CLEAR_COLOR) {
+      for (i = 0; i < llvmpipe->framebuffer.nr_cbufs; i++) {
+         struct pipe_surface *ps = llvmpipe->framebuffer.cbufs[i];
+
+         if (!ps)
+            continue; /* unbound color slot: nothing to pack or clear */
+         util_pack_color(rgba, ps->format, &cv);
+         lp_tile_cache_clear(llvmpipe->cbuf_cache[i], rgba, cv);
+      }
+      llvmpipe->dirty_render_cache = TRUE;
+   }
+
+   /* Depth/stencil is a non-cached surface; skip it when none is bound. */
+   if ((buffers & PIPE_CLEAR_DEPTHSTENCIL) && llvmpipe->framebuffer.zsbuf) {
+      struct pipe_surface *ps = llvmpipe->framebuffer.zsbuf;
+      cv = util_pack_z_stencil(ps->format, depth, stencil);
+      pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, cv);
+   }
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_clear.h b/src/gallium/drivers/llvmpipe/lp_clear.h
new file mode 100644
index 0000000000..6d4ffccdf4
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_clear.h
@@ -0,0 +1,43 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Author:
+ *    Brian Paul
+ */
+
+#ifndef LP_CLEAR_H
+#define LP_CLEAR_H
+
+#include "pipe/p_state.h"
+struct pipe_context;
+
+extern void
+llvmpipe_clear(struct pipe_context *pipe, unsigned buffers, const float *rgba,
+               double depth, unsigned stencil);
+
+
+#endif /* LP_CLEAR_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c
new file mode 100644
index 0000000000..a4b2bd8c2a
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_context.c
@@ -0,0 +1,295 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * Copyright 2008 VMware, Inc.  All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Author:
+ *    Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "draw/draw_context.h"
+#include "pipe/p_defines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "lp_clear.h"
+#include "lp_context.h"
+#include "lp_flush.h"
+#include "lp_prim_setup.h"
+#include "lp_prim_vbuf.h"
+#include "lp_state.h"
+#include "lp_surface.h"
+#include "lp_tile_cache.h"
+#include "lp_tex_cache.h"
+#include "lp_texture.h"
+#include "lp_winsys.h"
+#include "lp_query.h"
+
+
+
+/**
+ * Map any drawing surfaces which aren't already mapped
+ */
+void
+llvmpipe_map_transfers(struct llvmpipe_context *lp)
+{
+   struct pipe_surface *zs = lp->framebuffer.zsbuf;
+   struct pipe_screen *scr = lp->pipe.screen;
+   unsigned cb;
+
+   /* color buffers are mapped through their tile caches */
+   for (cb = 0; cb < lp->framebuffer.nr_cbufs; cb++)
+      lp_tile_cache_map_transfers(lp->cbuf_cache[cb]);
+
+   if (!zs)
+      return;
+
+   /* depth/stencil: create the transfer once, then map it if not yet mapped */
+   if (lp->zsbuf_transfer == NULL)
+      lp->zsbuf_transfer =
+         scr->get_tex_transfer(scr, zs->texture, zs->face, zs->level, zs->zslice,
+                               PIPE_TRANSFER_READ_WRITE, 0, 0, zs->width, zs->height);
+   if (lp->zsbuf_transfer != NULL && lp->zsbuf_map == NULL)
+      lp->zsbuf_map = scr->transfer_map(scr, lp->zsbuf_transfer);
+}
+
+
+/**
+ * Unmap any mapped drawing surfaces
+ */
+void
+llvmpipe_unmap_transfers(struct llvmpipe_context *lp)
+{
+   struct pipe_screen *scr = lp->pipe.screen;
+   uint cb;
+
+   /* color buffers are unmapped through their tile caches */
+   for (cb = 0; cb < lp->framebuffer.nr_cbufs; cb++)
+      lp_tile_cache_unmap_transfers(lp->cbuf_cache[cb]);
+
+   /* Unmap depth/stencil only if it is currently mapped; the transfer
+    * object itself is kept (zsbuf_transfer is not released here).
+    */
+   if (lp->zsbuf_transfer && lp->zsbuf_map) {
+      scr->transfer_unmap(scr, lp->zsbuf_transfer);
+      lp->zsbuf_map = NULL;
+   }
+}
+
+
+static void llvmpipe_destroy( struct pipe_context *pipe )
+{
+   struct llvmpipe_context *lp = llvmpipe_context( pipe );
+   uint n;
+
+   /* draw may still be NULL when called from llvmpipe_create()'s fail path */
+   if (lp->draw != NULL)
+      draw_destroy( lp->draw );
+
+   for (n = 0; n < PIPE_MAX_COLOR_BUFS; n++)
+      lp_destroy_tile_cache(lp->cbuf_cache[n]);
+   for (n = 0; n < PIPE_MAX_SAMPLERS; n++)
+      lp_destroy_tex_tile_cache(lp->tex_cache[n]);
+
+   /* drop the references held on the bound constant buffers */
+   for (n = 0; n < Elements(lp->constants); n++) {
+      if (lp->constants[n].buffer != NULL)
+         pipe_buffer_reference(&lp->constants[n].buffer, NULL);
+   }
+
+   align_free( lp );
+}
+
+static unsigned int
+llvmpipe_is_texture_referenced( struct pipe_context *pipe,
+				struct pipe_texture *texture,
+				unsigned face, unsigned level)
+{
+   struct llvmpipe_context *lp = llvmpipe_context( pipe );
+   const struct pipe_framebuffer_state *fb = &lp->framebuffer;
+   unsigned cb;
+
+   /* with a clean render cache nothing is reported as referenced */
+   if (!lp->dirty_render_cache)
+      return PIPE_UNREFERENCED;
+
+   for (cb = 0; cb < fb->nr_cbufs; cb++) {
+      if (fb->cbufs[cb] != NULL && fb->cbufs[cb]->texture == texture)
+         return PIPE_REFERENCED_FOR_WRITE;
+   }
+   if (fb->zsbuf != NULL && fb->zsbuf->texture == texture)
+      return PIPE_REFERENCED_FOR_WRITE;
+   return PIPE_UNREFERENCED;
+}
+
+static unsigned int
+llvmpipe_is_buffer_referenced( struct pipe_context *pipe,
+			       struct pipe_buffer *buf)
+{
+   return PIPE_UNREFERENCED; /* stub: buffers are never reported as referenced */
+}
+
+struct pipe_context *
+llvmpipe_create( struct pipe_screen *screen )
+{
+   struct llvmpipe_context *llvmpipe;
+   uint i;
+   /* Build an llvmpipe context for screen; returns its pipe_context, or NULL on failure. */
+   llvmpipe = align_malloc(sizeof(struct llvmpipe_context), 16);
+   if (!llvmpipe)
+      return NULL;
+
+   util_init_math();
+
+   memset(llvmpipe, 0, sizeof *llvmpipe); /* all members start 0/NULL; llvmpipe_destroy() tests draw for NULL */
+
+   llvmpipe->pipe.winsys = screen->winsys;
+   llvmpipe->pipe.screen = screen;
+   llvmpipe->pipe.destroy = llvmpipe_destroy;
+
+   /* state setters */
+   llvmpipe->pipe.create_blend_state = llvmpipe_create_blend_state;
+   llvmpipe->pipe.bind_blend_state   = llvmpipe_bind_blend_state;
+   llvmpipe->pipe.delete_blend_state = llvmpipe_delete_blend_state;
+
+   llvmpipe->pipe.create_sampler_state = llvmpipe_create_sampler_state;
+   llvmpipe->pipe.bind_sampler_states  = llvmpipe_bind_sampler_states;
+   llvmpipe->pipe.delete_sampler_state = llvmpipe_delete_sampler_state;
+
+   llvmpipe->pipe.create_depth_stencil_alpha_state = llvmpipe_create_depth_stencil_state;
+   llvmpipe->pipe.bind_depth_stencil_alpha_state   = llvmpipe_bind_depth_stencil_state;
+   llvmpipe->pipe.delete_depth_stencil_alpha_state = llvmpipe_delete_depth_stencil_state;
+
+   llvmpipe->pipe.create_rasterizer_state = llvmpipe_create_rasterizer_state;
+   llvmpipe->pipe.bind_rasterizer_state   = llvmpipe_bind_rasterizer_state;
+   llvmpipe->pipe.delete_rasterizer_state = llvmpipe_delete_rasterizer_state;
+
+   llvmpipe->pipe.create_fs_state = llvmpipe_create_fs_state;
+   llvmpipe->pipe.bind_fs_state   = llvmpipe_bind_fs_state;
+   llvmpipe->pipe.delete_fs_state = llvmpipe_delete_fs_state;
+
+   llvmpipe->pipe.create_vs_state = llvmpipe_create_vs_state;
+   llvmpipe->pipe.bind_vs_state   = llvmpipe_bind_vs_state;
+   llvmpipe->pipe.delete_vs_state = llvmpipe_delete_vs_state;
+
+   llvmpipe->pipe.set_blend_color = llvmpipe_set_blend_color;
+   llvmpipe->pipe.set_clip_state = llvmpipe_set_clip_state;
+   llvmpipe->pipe.set_constant_buffer = llvmpipe_set_constant_buffer;
+   llvmpipe->pipe.set_framebuffer_state = llvmpipe_set_framebuffer_state;
+   llvmpipe->pipe.set_polygon_stipple = llvmpipe_set_polygon_stipple;
+   llvmpipe->pipe.set_scissor_state = llvmpipe_set_scissor_state;
+   llvmpipe->pipe.set_sampler_textures = llvmpipe_set_sampler_textures;
+   llvmpipe->pipe.set_viewport_state = llvmpipe_set_viewport_state;
+
+   llvmpipe->pipe.set_vertex_buffers = llvmpipe_set_vertex_buffers;
+   llvmpipe->pipe.set_vertex_elements = llvmpipe_set_vertex_elements;
+
+   llvmpipe->pipe.draw_arrays = llvmpipe_draw_arrays;
+   llvmpipe->pipe.draw_elements = llvmpipe_draw_elements;
+   llvmpipe->pipe.draw_range_elements = llvmpipe_draw_range_elements;
+   llvmpipe->pipe.set_edgeflags = llvmpipe_set_edgeflags;
+
+
+   llvmpipe->pipe.clear = llvmpipe_clear;
+   llvmpipe->pipe.flush = llvmpipe_flush;
+
+   llvmpipe->pipe.is_texture_referenced = llvmpipe_is_texture_referenced;
+   llvmpipe->pipe.is_buffer_referenced = llvmpipe_is_buffer_referenced;
+
+   llvmpipe_init_query_funcs( llvmpipe );
+   llvmpipe_init_texture_funcs( llvmpipe );
+
+   /*
+    * Alloc caches for accessing drawing surfaces and textures.
+    */
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++)
+      llvmpipe->cbuf_cache[i] = lp_create_tile_cache( screen );
+
+   for (i = 0; i < PIPE_MAX_SAMPLERS; i++)
+      llvmpipe->tex_cache[i] = lp_create_tex_tile_cache( screen );
+
+
+   /* vertex shader samplers */
+   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
+      llvmpipe->tgsi.vert_samplers[i].base.get_samples = lp_get_samples;
+      llvmpipe->tgsi.vert_samplers[i].processor = TGSI_PROCESSOR_VERTEX;
+      llvmpipe->tgsi.vert_samplers[i].cache = llvmpipe->tex_cache[i];
+      llvmpipe->tgsi.vert_samplers_list[i] = &llvmpipe->tgsi.vert_samplers[i];
+   }
+
+   /* fragment shader samplers (share tex_cache[i] with the vertex samplers above) */
+   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
+      llvmpipe->tgsi.frag_samplers[i].base.get_samples = lp_get_samples;
+      llvmpipe->tgsi.frag_samplers[i].processor = TGSI_PROCESSOR_FRAGMENT;
+      llvmpipe->tgsi.frag_samplers[i].cache = llvmpipe->tex_cache[i];
+      llvmpipe->tgsi.frag_samplers_list[i] = &llvmpipe->tgsi.frag_samplers[i];
+   }
+
+   /*
+    * Create drawing context and plug our rendering stage into it.
+    */
+   llvmpipe->draw = draw_create();
+   if (!llvmpipe->draw) 
+      goto fail;
+
+   draw_texture_samplers(llvmpipe->draw,
+                         PIPE_MAX_SAMPLERS,
+                         (struct tgsi_sampler **)
+                            llvmpipe->tgsi.vert_samplers_list);
+
+   llvmpipe->setup = lp_draw_render_stage(llvmpipe);
+   if (!llvmpipe->setup)
+      goto fail;
+
+   if (debug_get_bool_option( "LP_NO_RAST", FALSE ))
+      llvmpipe->no_rast = TRUE; /* e.g. llvmpipe_clear() returns early when this is set */
+
+   if (debug_get_bool_option( "LP_NO_VBUF", FALSE )) {
+      /* Deprecated path -- vbuf is the intended interface to the draw module:
+       */
+      draw_set_rasterize_stage(llvmpipe->draw, llvmpipe->setup);
+   }
+   else {
+      lp_init_vbuf(llvmpipe);
+   }
+
+   /* plug in AA line/point stages */
+   draw_install_aaline_stage(llvmpipe->draw, &llvmpipe->pipe);
+   draw_install_aapoint_stage(llvmpipe->draw, &llvmpipe->pipe);
+
+#if USE_DRAW_STAGE_PSTIPPLE
+   /* Do polygon stipple w/ texture map + frag prog? */
+   draw_install_pstipple_stage(llvmpipe->draw, &llvmpipe->pipe);
+#endif
+
+   lp_init_surface_functions(llvmpipe);
+
+   return &llvmpipe->pipe;
+
+ fail:
+   llvmpipe_destroy(&llvmpipe->pipe); /* destroys draw (if created) and the caches, then frees llvmpipe */
+   return NULL;
+}
+
diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h
new file mode 100644
index 0000000000..8d5a0d4f1f
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -0,0 +1,155 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef LP_CONTEXT_H
+#define LP_CONTEXT_H
+
+#include "pipe/p_context.h"
+
+#include "draw/draw_vertex.h"
+
+#include "lp_tex_sample.h"
+#include "lp_jit.h"
+
+
+struct llvmpipe_vbuf_render;
+struct draw_context;
+struct draw_stage;
+struct llvmpipe_tile_cache;
+struct llvmpipe_tex_tile_cache;
+struct lp_fragment_shader;
+struct lp_vertex_shader;
+struct lp_blend_state;
+
+
+struct llvmpipe_context {
+   struct pipe_context pipe;  /**< base class; must stay first for the llvmpipe_context() cast */
+
+   /** Constant state objects */
+   const struct pipe_blend_state *blend;
+   const struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS];
+   const struct pipe_depth_stencil_alpha_state *depth_stencil;
+   const struct pipe_rasterizer_state *rasterizer;
+   struct lp_fragment_shader *fs;
+   const struct lp_vertex_shader *vs;
+
+   /** Other rendering state */
+   struct pipe_blend_color blend_color[4][16]; /* NOTE(review): meaning of the [4][16] layout is not evident here -- confirm */
+   struct pipe_clip_state clip;
+   struct pipe_constant_buffer constants[PIPE_SHADER_TYPES];
+   struct pipe_framebuffer_state framebuffer;
+   struct pipe_poly_stipple poly_stipple;
+   struct pipe_scissor_state scissor;
+   struct pipe_texture *texture[PIPE_MAX_SAMPLERS];
+   struct pipe_viewport_state viewport;
+   struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
+   struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
+
+   unsigned num_samplers;
+   unsigned num_textures;
+   unsigned num_vertex_elements;
+   unsigned num_vertex_buffers;
+
+   unsigned dirty; /**< Mask of LP_NEW_x flags */
+
+   /* Counter for occlusion queries.  Note this supports overlapping
+    * queries.
+    */
+   uint64_t occlusion_count;
+   unsigned active_query_count;
+
+   /** Mapped vertex buffers */
+   ubyte *mapped_vbuffer[PIPE_MAX_ATTRIBS];
+   
+   /** Mapped constant buffers (set while drawing, reset to NULL afterwards) */
+   void *mapped_constants[PIPE_SHADER_TYPES];
+
+   /** Vertex format */
+   struct vertex_info vertex_info;
+   struct vertex_info vertex_info_vbuf;
+
+   /** Which vertex shader output slot contains point size */
+   int psize_slot;
+
+   /* The reduced version of the primitive supplied by the state
+    * tracker.
+    */
+   unsigned reduced_api_prim;
+
+   /* The reduced primitive after unfilled triangles, wide-line
+    * decomposition, etc, are taken into account.  This is the
+    * primitive actually rasterized.
+    */
+   unsigned reduced_prim;
+
+   /** Derived from scissor and surface bounds: */
+   struct pipe_scissor_state cliprect;
+
+   unsigned line_stipple_counter;
+
+   /** TGSI exec things */
+   struct {
+      struct lp_shader_sampler vert_samplers[PIPE_MAX_SAMPLERS];
+      struct lp_shader_sampler *vert_samplers_list[PIPE_MAX_SAMPLERS]; /* [i] points at vert_samplers[i] */
+      struct lp_shader_sampler frag_samplers[PIPE_MAX_SAMPLERS];
+      struct lp_shader_sampler *frag_samplers_list[PIPE_MAX_SAMPLERS]; /* [i] points at frag_samplers[i] */
+   } tgsi;
+
+   /** The primitive drawing context */
+   struct draw_context *draw;
+   struct draw_stage *setup;
+   struct draw_stage *vbuf;
+   struct llvmpipe_vbuf_render *vbuf_render;
+
+   boolean dirty_render_cache; /**< set by clears/draws; reset by a PIPE_FLUSH_RENDER_CACHE flush */
+   
+   struct llvmpipe_tile_cache *cbuf_cache[PIPE_MAX_COLOR_BUFS]; /**< one tile cache per color buffer slot */
+   
+   /* TODO: we shouldn't be using external interfaces internally like this */
+   struct pipe_transfer *zsbuf_transfer; /**< created lazily by llvmpipe_map_transfers() */
+   uint8_t *zsbuf_map; /**< mapping of zsbuf_transfer; NULL while unmapped */
+
+   unsigned tex_timestamp;
+   struct llvmpipe_tex_tile_cache *tex_cache[PIPE_MAX_SAMPLERS]; /**< shared by vertex and fragment sampler i */
+
+   unsigned no_rast : 1; /**< set from the LP_NO_RAST debug option */
+
+   struct lp_jit_context jit_context; /**< .constants points at the mapped fragment constants during a draw */
+};
+
+
+static INLINE struct llvmpipe_context *
+llvmpipe_context( struct pipe_context *pipe )
+{
+   return (struct llvmpipe_context *)pipe; /* unchecked cast; relies on pipe being the first member */
+}
+
+#endif /* LP_CONTEXT_H */
+
diff --git a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
new file mode 100644
index 0000000000..89772e62d3
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
@@ -0,0 +1,192 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Author:
+ *    Brian Paul
+ *    Keith Whitwell
+ */
+
+
+#include "pipe/p_defines.h"
+#include "pipe/p_context.h"
+#include "pipe/internal/p_winsys_screen.h"
+#include "pipe/p_inlines.h"
+#include "util/u_prim.h"
+
+#include "lp_buffer.h"
+#include "lp_context.h"
+#include "lp_state.h"
+
+#include "draw/draw_context.h"
+
+
+
+static void
+llvmpipe_map_constant_buffers(struct llvmpipe_context *lp)
+{
+   struct pipe_screen *scr = lp->pipe.screen;
+   struct pipe_buffer *vs_buf;
+   uint sh;
+
+   /* map every bound, non-empty constant buffer for CPU reads */
+   for (sh = 0; sh < PIPE_SHADER_TYPES; sh++) {
+      struct pipe_buffer *cbuf = lp->constants[sh].buffer;
+
+      if (cbuf != NULL && cbuf->size != 0)
+         lp->mapped_constants[sh] =
+            scr->buffer_map(scr, cbuf, PIPE_BUFFER_USAGE_CPU_READ);
+   }
+
+   /* fragment constants go to the JIT context, vertex constants to draw */
+   vs_buf = lp->constants[PIPE_SHADER_VERTEX].buffer;
+   lp->jit_context.constants = lp->mapped_constants[PIPE_SHADER_FRAGMENT];
+   draw_set_mapped_constant_buffer(lp->draw,
+                                   lp->mapped_constants[PIPE_SHADER_VERTEX],
+                                   vs_buf ? vs_buf->size : 0);
+}
+
+
+static void
+llvmpipe_unmap_constant_buffers(struct llvmpipe_context *lp)
+{
+   struct pipe_screen *screen = lp->pipe.screen;
+   uint i;
+
+   /* really need to flush all prims since the vert/frag shaders const buffers
+    * are going away now.
+    */
+   draw_flush(lp->draw);
+
+   draw_set_mapped_constant_buffer(lp->draw, NULL, 0);
+   lp->jit_context.constants = NULL;
+
+   /* Same range and condition as llvmpipe_map_constant_buffers(). */
+   for (i = 0; i < PIPE_SHADER_TYPES; i++) {
+      if (lp->constants[i].buffer && lp->constants[i].buffer->size)
+         screen->buffer_unmap(screen, lp->constants[i].buffer);
+      lp->mapped_constants[i] = NULL;
+   }
+}
+
+
+boolean
+llvmpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
+                     unsigned start, unsigned count)
+{
+   return llvmpipe_draw_elements(pipe, NULL, 0, mode, start, count); /* non-indexed: no index buffer, index size 0 */
+}
+
+
+/**
+ * Draw vertex arrays, with optional indexing (indexBuffer may be NULL).
+ * Basically, map the vertex buffers (and drawing surfaces), then hand off
+ * the drawing to the 'draw' module.  Always returns TRUE.
+ */
+boolean
+llvmpipe_draw_range_elements(struct pipe_context *pipe,
+                             struct pipe_buffer *indexBuffer,
+                             unsigned indexSize,
+                             unsigned min_index,
+                             unsigned max_index,
+                             unsigned mode, unsigned start, unsigned count)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+   struct draw_context *draw = lp->draw;
+   unsigned i;
+
+   lp->reduced_api_prim = u_reduced_prim(mode);
+
+   if (lp->dirty)
+      llvmpipe_update_derived( lp );
+
+   llvmpipe_map_transfers(lp);
+   llvmpipe_map_constant_buffers(lp);
+
+   /*
+    * Map vertex buffers (each buffer's data pointer is handed to draw directly)
+    */
+   for (i = 0; i < lp->num_vertex_buffers; i++) {
+      void *buf = llvmpipe_buffer(lp->vertex_buffer[i].buffer)->data;
+      draw_set_mapped_vertex_buffer(draw, i, buf);
+   }
+
+   /* Map index buffer, if present (its data pointer is used directly) */
+   if (indexBuffer) {
+      void *mapped_indexes = llvmpipe_buffer(indexBuffer)->data;
+      draw_set_mapped_element_buffer_range(draw, indexSize,
+                                           min_index,
+                                           max_index,
+                                           mapped_indexes);
+   }
+   else {
+      /* no index/element buffer */
+      draw_set_mapped_element_buffer_range(draw, 0, start,
+                                           start + count - 1, NULL);
+   }
+
+   /* draw! */
+   draw_arrays(draw, mode, start, count);
+
+   /*
+    * unmap vertex/index buffers - will cause draw module to flush
+    */
+   for (i = 0; i < lp->num_vertex_buffers; i++) {
+      draw_set_mapped_vertex_buffer(draw, i, NULL);
+   }
+   if (indexBuffer) {
+      draw_set_mapped_element_buffer(draw, 0, NULL);
+   }
+
+
+   /* Note: leave drawing surfaces mapped; only the constant buffers are unmapped */
+   llvmpipe_unmap_constant_buffers(lp);
+
+   lp->dirty_render_cache = TRUE;
+   
+   return TRUE;
+}
+
+
+boolean
+llvmpipe_draw_elements(struct pipe_context *pipe,
+                       struct pipe_buffer *indexBuffer,
+                       unsigned indexSize,
+                       unsigned mode, unsigned start, unsigned count)
+{
+   return llvmpipe_draw_range_elements( pipe, indexBuffer,
+                                        indexSize,
+                                        0, 0xffffffff, /* min_index, max_index: widest possible range */
+                                        mode, start, count );
+}
+
+
+void
+llvmpipe_set_edgeflags(struct pipe_context *pipe, const unsigned *edgeflags)
+{
+   /* forward the edge flags pointer unchanged to the draw module */
+   draw_set_edgeflags(llvmpipe_context(pipe)->draw, edgeflags);
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_flush.c b/src/gallium/drivers/llvmpipe/lp_flush.c
new file mode 100644
index 0000000000..b5c1c95bb7
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_flush.c
@@ -0,0 +1,98 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Author:
+ *    Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "pipe/p_defines.h"
+#include "draw/draw_context.h"
+#include "lp_flush.h"
+#include "lp_context.h"
+#include "lp_surface.h"
+#include "lp_state.h"
+#include "lp_tile_cache.h"
+#include "lp_tex_cache.h"
+#include "lp_winsys.h"
+
+
+void
+llvmpipe_flush( struct pipe_context *pipe,
+		unsigned flags,
+                struct pipe_fence_handle **fence )
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+   uint i;
+
+   draw_flush(llvmpipe->draw); /* done for every flush, whatever the flags */
+
+   if (flags & PIPE_FLUSH_SWAPBUFFERS) {
+      /* If this is a swapbuffers, just flush color buffers.
+       *
+       * The zbuffer changes are not discarded, but held in the cache
+       * in the hope that a later clear will wipe them out.
+       */
+      for (i = 0; i < llvmpipe->framebuffer.nr_cbufs; i++)
+         if (llvmpipe->cbuf_cache[i])
+            lp_flush_tile_cache(llvmpipe->cbuf_cache[i]);
+
+      /* Need this call for hardware buffers before swapbuffers.
+       *
+       * there should probably be another/different flush-type function
+       * that's called before swapbuffers because we don't always want
+       * to unmap surfaces when flushing.
+       */
+      llvmpipe_unmap_transfers(llvmpipe);
+   }
+   else if (flags & PIPE_FLUSH_RENDER_CACHE) { /* only reached when SWAPBUFFERS is not set */
+      for (i = 0; i < llvmpipe->framebuffer.nr_cbufs; i++)
+         if (llvmpipe->cbuf_cache[i])
+            lp_flush_tile_cache(llvmpipe->cbuf_cache[i]);
+
+      /* FIXME: untile zsbuf! */
+     
+      llvmpipe->dirty_render_cache = FALSE;
+   }
+
+   /* Enable to dump BMPs of the color/depth buffers each frame */
+#if 0
+   if(flags & PIPE_FLUSH_FRAME) {
+      static unsigned frame_no = 1;
+      static char filename[256];
+      util_snprintf(filename, sizeof(filename), "cbuf_%u.bmp", frame_no);
+      debug_dump_surface_bmp(filename, llvmpipe->framebuffer.cbufs[0]);
+      util_snprintf(filename, sizeof(filename), "zsbuf_%u.bmp", frame_no);
+      debug_dump_surface_bmp(filename, llvmpipe->framebuffer.zsbuf);
+      ++frame_no;
+   }
+#endif
+   
+   if (fence)
+      *fence = NULL; /* no fence object is produced by this driver */
+}
+
diff --git a/src/gallium/drivers/llvmpipe/lp_flush.h b/src/gallium/drivers/llvmpipe/lp_flush.h
new file mode 100644
index 0000000000..10b2b52583
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_flush.h
@@ -0,0 +1,37 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef LP_FLUSH_H
+#define LP_FLUSH_H
+
+struct pipe_context;
+struct pipe_fence_handle;
+/** Flushes draw and, per flags, the color tile caches; sets *fence to NULL. */
+void llvmpipe_flush(struct pipe_context *pipe, unsigned flags,
+                    struct pipe_fence_handle **fence);
+
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c
new file mode 100644
index 0000000000..b4a22ff4a9
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_jit.c
@@ -0,0 +1,173 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * C - JIT interfaces
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include <llvm-c/Transforms/Scalar.h>
+
+#include "util/u_memory.h"
+#include "lp_screen.h"
+#include "lp_bld_intr.h"
+#include "lp_jit.h"
+
+
+/**
+ * Register the LLVM counterparts of struct lp_jit_texture and struct
+ * lp_jit_context with the screen's module, and declare the fetch_texel
+ * intrinsic, mapping it to lp_fetch_texel_soa() in the execution engine.
+ */
+static void
+lp_jit_init_globals(struct llvmpipe_screen *screen)
+{
+   LLVMTypeRef texture_type;
+
+   /* struct lp_jit_texture */
+   {
+      LLVMTypeRef elem_types[4];
+
+      elem_types[LP_JIT_TEXTURE_WIDTH]  = LLVMInt32Type();
+      elem_types[LP_JIT_TEXTURE_HEIGHT] = LLVMInt32Type();
+      elem_types[LP_JIT_TEXTURE_STRIDE] = LLVMInt32Type();
+      elem_types[LP_JIT_TEXTURE_DATA]   = LLVMPointerType(LLVMInt8Type(), 0);
+
+      texture_type = LLVMStructType(elem_types, Elements(elem_types), 0);
+
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, width,
+                             screen->target, texture_type,
+                             LP_JIT_TEXTURE_WIDTH);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, height,
+                             screen->target, texture_type,
+                             LP_JIT_TEXTURE_HEIGHT);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, stride,
+                             screen->target, texture_type,
+                             LP_JIT_TEXTURE_STRIDE);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, data,
+                             screen->target, texture_type,
+                             LP_JIT_TEXTURE_DATA);
+      LP_CHECK_STRUCT_SIZE(struct lp_jit_texture,
+                           screen->target, texture_type);
+      LLVMAddTypeName(screen->module, "texture", texture_type);
+   }
+
+   /* struct lp_jit_context */
+   {
+      LLVMTypeRef elem_types[5];
+      LLVMTypeRef context_type;
+
+      elem_types[0] = LLVMPointerType(LLVMFloatType(), 0); /* constants */
+      elem_types[1] = LLVMPointerType(LLVMInt8Type(), 0);  /* samplers */
+      elem_types[2] = LLVMFloatType();                     /* alpha_ref_value */
+      elem_types[3] = LLVMPointerType(LLVMInt8Type(), 0);  /* blend_color */
+      elem_types[LP_JIT_CONTEXT_TEXTURES_INDEX] =          /* textures */
+         LLVMArrayType(texture_type, PIPE_MAX_SAMPLERS);
+
+      context_type = LLVMStructType(elem_types, Elements(elem_types), 0);
+
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, constants,
+                             screen->target, context_type, 0);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, samplers,
+                             screen->target, context_type, 1);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, alpha_ref_value,
+                             screen->target, context_type, 2);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, blend_color,
+                             screen->target, context_type, 3);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, textures,
+                             screen->target, context_type,
+                             LP_JIT_CONTEXT_TEXTURES_INDEX);
+      LP_CHECK_STRUCT_SIZE(struct lp_jit_context,
+                           screen->target, context_type);
+      LLVMAddTypeName(screen->module, "context", context_type);
+      screen->context_ptr_type = LLVMPointerType(context_type, 0);
+   }
+
+   /* fetch_texel */
+   {
+      LLVMTypeRef ret_type;
+      LLVMTypeRef arg_types[3];
+      LLVMValueRef fetch_texel;
+
+      ret_type = LLVMVoidType();
+      arg_types[0] = LLVMPointerType(LLVMInt8Type(), 0);  /* samplers */
+      arg_types[1] = LLVMInt32Type();                     /* unit */
+      arg_types[2] = LLVMPointerType(LLVMVectorType(LLVMFloatType(), 4), 0); /* store */
+      fetch_texel = lp_declare_intrinsic(screen->module, "fetch_texel",
+                                         ret_type, arg_types, Elements(arg_types));
+      LLVMAddGlobalMapping(screen->engine, fetch_texel, lp_fetch_texel_soa);
+   }
+
+#ifdef DEBUG
+   LLVMDumpModule(screen->module);
+#endif
+}
+
+
+/** Dispose of the screen's execution engine and pass manager, if created. */
+void
+lp_jit_screen_cleanup(struct llvmpipe_screen *screen)
+{
+   if (screen->engine != NULL)
+      LLVMDisposeExecutionEngine(screen->engine);
+   if (screen->pass != NULL)
+      LLVMDisposePassManager(screen->pass);
+}
+
+
+/**
+ * Create the screen's LLVM module, JIT engine and function pass manager,
+ * then register the JIT globals.  Aborts if the engine cannot be created.
+ */
+void
+lp_jit_screen_init(struct llvmpipe_screen *screen)
+{
+   LLVMPassManagerRef passes;
+   char *message = NULL;
+
+   screen->module = LLVMModuleCreateWithName("llvmpipe");
+   screen->provider = LLVMCreateModuleProviderForExistingModule(screen->module);
+   if (LLVMCreateJITCompiler(&screen->engine, screen->provider, 1, &message)) {
+      _debug_printf("%s\n", message);
+      LLVMDisposeMessage(message);
+      abort();
+   }
+   screen->target = LLVMGetExecutionEngineTargetData(screen->engine);
+   /* Every pass currently listed in llvm-c/Transforms/Scalar.h; SVN has more. */
+   passes = LLVMCreateFunctionPassManager(screen->provider);
+   LLVMAddTargetData(screen->target, passes);
+   LLVMAddConstantPropagationPass(passes);
+   LLVMAddInstructionCombiningPass(passes);
+   LLVMAddPromoteMemoryToRegisterPass(passes);
+   LLVMAddGVNPass(passes);
+   LLVMAddCFGSimplificationPass(passes);
+   screen->pass = passes;
+   lp_jit_init_globals(screen);
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h
new file mode 100644
index 0000000000..58f716ede2
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_jit.h
@@ -0,0 +1,135 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * C - JIT interfaces
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#ifndef LP_JIT_H
+#define LP_JIT_H
+
+
+#include "lp_bld_struct.h"
+
+#include "pipe/p_state.h"
+
+
+struct tgsi_sampler;
+struct llvmpipe_screen;
+
+
+/** Per-texture data; mirrored by the LLVM "texture" type built in lp_jit.c. */
+struct lp_jit_texture {
+   uint32_t width;    /**< element LP_JIT_TEXTURE_WIDTH */
+   uint32_t height;   /**< element LP_JIT_TEXTURE_HEIGHT */
+   uint32_t stride;   /**< element LP_JIT_TEXTURE_STRIDE */
+   const void *data;  /**< element LP_JIT_TEXTURE_DATA */
+};
+
+
+enum { /* element indices of the LLVM struct type mirroring lp_jit_texture */
+   LP_JIT_TEXTURE_WIDTH = 0,
+   LP_JIT_TEXTURE_HEIGHT,
+   LP_JIT_TEXTURE_STRIDE,
+   LP_JIT_TEXTURE_DATA
+};
+
+
+
+/**
+ * This structure is passed directly to the generated fragment shader.
+ *
+ * It contains the derived state.
+ *
+ * Changes here must be reflected in the lp_jit_context_* macros and in the
+ * lp_jit_init_globals function (lp_jit.c). Avoid changing the ordering.
+ *
+ * Only use types with a clear size and padding here, in particular prefer the
+ * stdint.h types to the basic integer types.
+ */
+struct lp_jit_context
+{
+   const float *constants;          /**< struct element 0 */
+
+   struct tgsi_sampler **samplers;  /**< struct element 1 */
+
+   float alpha_ref_value;           /**< struct element 2 */
+
+   /* FIXME: store (also?) in floats */
+   uint8_t *blend_color;            /**< struct element 3 */
+
+   struct lp_jit_texture textures[PIPE_MAX_SAMPLERS]; /**< element 4 */
+};
+
+
+#define lp_jit_context_constants(_builder, _ptr) \
+   lp_build_struct_get(_builder, _ptr, 0, "constants")
+
+#define lp_jit_context_samplers(_builder, _ptr) \
+   lp_build_struct_get(_builder, _ptr, 1, "samplers")
+
+#define lp_jit_context_alpha_ref_value(_builder, _ptr) \
+   lp_build_struct_get(_builder, _ptr, 2, "alpha_ref_value")
+
+#define lp_jit_context_blend_color(_builder, _ptr) \
+   lp_build_struct_get(_builder, _ptr, 3, "blend_color")
+/* Element index of lp_jit_context::textures; lp_jit.c checks it as well. */
+#define LP_JIT_CONTEXT_TEXTURES_INDEX 4
+/* Goes through lp_build_struct_get_ptr, unlike the accessors above. */
+#define lp_jit_context_textures(_builder, _ptr) \
+   lp_build_struct_get_ptr(_builder, _ptr, LP_JIT_CONTEXT_TEXTURES_INDEX, "textures")
+
+
+/** Presumably the entry point of a JIT-built fragment shader - TODO confirm. */
+typedef void (*lp_jit_frag_func)(struct lp_jit_context *context,
+                                 uint32_t x,
+                                 uint32_t y,
+                                 const void *a0,
+                                 const void *dadx,
+                                 const void *dady,
+                                 uint32_t *mask,
+                                 void *color,
+                                 void *depth);
+
+/** Mapped to the "fetch_texel" intrinsic by lp_jit_init_globals(). */
+void PIPE_CDECL
+lp_fetch_texel_soa( struct tgsi_sampler **samplers,
+                    uint32_t unit,
+                    float *store );
+
+/** Dispose of the screen's execution engine and pass manager, if created. */
+void
+lp_jit_screen_cleanup(struct llvmpipe_screen *screen);
+/** Create the screen's LLVM module, JIT engine and function pass manager. */
+void
+lp_jit_screen_init(struct llvmpipe_screen *screen);
+
+
+#endif /* LP_JIT_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_prim_setup.c b/src/gallium/drivers/llvmpipe/lp_prim_setup.c
new file mode 100644
index 0000000000..b14f8fb99d
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_prim_setup.c
@@ -0,0 +1,190 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * \brief A draw stage that drives our triangle setup routines from
+ * within the draw pipeline.  One of two ways to drive setup, the
+ * other being in lp_prim_vbuf.c.
+ *
+ * \author  Keith Whitwell <keith@tungstengraphics.com>
+ * \author  Brian Paul
+ */
+
+
+#include "lp_context.h"
+#include "lp_setup.h"
+#include "lp_state.h"
+#include "lp_prim_setup.h"
+#include "draw/draw_pipe.h"
+#include "draw/draw_vertex.h"
+#include "util/u_memory.h"
+
+/**
+ * Triangle setup info (derived from draw_stage).
+ * Also used for point and line drawing (taking some liberties).
+ */
+struct setup_stage {
+   struct draw_stage stage; /**< This must be first (base class) */
+
+   struct setup_context *setup; /**< destroyed by render_destroy() */
+};
+
+
+
+/** Downcast a draw_stage to the setup_stage embedding it as first member. */
+static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
+{
+   struct setup_stage *derived = (struct setup_stage *) stage;
+
+   return derived;
+}
+
+
+typedef const float (*cptrf4)[4];  /**< pointer to rows of 4 const floats */
+
+/** Forward a triangle's three vertices to llvmpipe_setup_tri(). */
+static void
+do_tri(struct draw_stage *stage, struct prim_header *prim)
+{
+   struct setup_context *ctx = setup_stage(stage)->setup;
+   cptrf4 v0 = (cptrf4) prim->v[0]->data;
+   cptrf4 v1 = (cptrf4) prim->v[1]->data;
+   cptrf4 v2 = (cptrf4) prim->v[2]->data;
+   llvmpipe_setup_tri(ctx, v0, v1, v2);
+}
+
+/** Forward a line's two vertices to llvmpipe_setup_line(). */
+static void
+do_line(struct draw_stage *stage, struct prim_header *prim)
+{
+   struct setup_context *ctx = setup_stage(stage)->setup;
+   cptrf4 v0 = (cptrf4) prim->v[0]->data;
+   cptrf4 v1 = (cptrf4) prim->v[1]->data;
+   llvmpipe_setup_line(ctx, v0, v1);
+}
+
+/** Forward a point's single vertex to llvmpipe_setup_point(). */
+static void
+do_point(struct draw_stage *stage, struct prim_header *prim)
+{
+   struct setup_context *ctx = setup_stage(stage)->setup;
+   cptrf4 v0 = (cptrf4) prim->v[0]->data;
+   llvmpipe_setup_point(ctx, v0);
+}
+
+
+
+
+/**
+ * Prepare the setup context, then install the direct do_* handlers.
+ */
+static void setup_begin( struct draw_stage *stage )
+{
+   llvmpipe_setup_prepare( setup_stage(stage)->setup );
+   stage->tri = do_tri;
+   stage->line = do_line;
+   stage->point = do_point;
+}
+
+
+static void setup_first_point( struct draw_stage *stage,
+                               struct prim_header *header )
+{
+   setup_begin(stage);              /* swaps in do_point as stage->point */
+   stage->point( stage, header );   /* so this draws the pending point */
+}
+
+static void setup_first_line( struct draw_stage *stage,
+                              struct prim_header *header )
+{
+   setup_begin(stage);             /* swaps in do_line as stage->line */
+   stage->line( stage, header );   /* so this draws the pending line */
+}
+
+
+static void setup_first_tri( struct draw_stage *stage,
+                             struct prim_header *header )
+{
+   setup_begin(stage);            /* swaps in do_tri as stage->tri */
+   stage->tri( stage, header );   /* so this draws the pending triangle */
+}
+
+
+
+/** Re-arm setup_first_* so the next primitive runs setup_begin() again. */
+static void setup_flush( struct draw_stage *stage, unsigned flags )
+{
+   stage->tri = setup_first_tri;
+   stage->line = setup_first_line;
+   stage->point = setup_first_point;
+}
+
+
+static void reset_stipple_counter( struct draw_stage *stage )
+{  /* Intentionally empty: nothing is reset by this stage. */
+}
+
+
+/** Destroy the stage's setup context, then free the stage itself. */
+static void render_destroy( struct draw_stage *stage )
+{
+   llvmpipe_setup_destroy_context( setup_stage(stage)->setup );
+   FREE( stage );
+}
+
+
+/**
+ * Create a new primitive setup/render stage; NULL if allocation fails.
+ */
+struct draw_stage *lp_draw_render_stage( struct llvmpipe_context *llvmpipe )
+{
+   struct setup_stage *sstage = CALLOC_STRUCT(setup_stage);
+   if (!sstage)
+      return NULL;
+   sstage->setup = llvmpipe_setup_create_context(llvmpipe);
+   sstage->stage.draw = llvmpipe->draw;
+   sstage->stage.point = setup_first_point;
+   sstage->stage.line = setup_first_line;
+   sstage->stage.tri = setup_first_tri;
+   sstage->stage.flush = setup_flush;
+   sstage->stage.reset_stipple_counter = reset_stipple_counter;
+   sstage->stage.destroy = render_destroy;
+   return &sstage->stage;
+}
+
+/** Return the setup context held by a stage from lp_draw_render_stage(). */
+struct setup_context *
+lp_draw_setup_context( struct draw_stage *stage )
+{
+   return setup_stage(stage)->setup;
+}
+
+/** Invoke the stage's flush hook with flags 0. */
+void lp_draw_flush( struct draw_stage *stage )
+{
+   stage->flush( stage, 0 );
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_prim_setup.h b/src/gallium/drivers/llvmpipe/lp_prim_setup.h
new file mode 100644
index 0000000000..da6cae6375
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_prim_setup.h
@@ -0,0 +1,85 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef LP_PRIM_SETUP_H
+#define LP_PRIM_SETUP_H
+
+
+/**
+ * vbuf is a special stage to gather the stream of triangles, lines, points
+ * together and reconstruct vertex buffers for hardware upload.
+ *
+ * First attempt, work in progress.
+ * 
+ * TODO:
+ *    - separate out vertex buffer building and primitive emit, ie >1 draw per vb.
+ *    - tell vbuf stage how to build hw vertices directly
+ *    - pass vbuf stage a buffer pointer for direct emit to agp/vram.
+ *
+ * NOTE(review): this text describes lp_draw_vbuf_stage(), declared last below.
+ *
+ * Vertices are just an array of floats, with all the attributes
+ * packed.  We currently assume a layout like:
+ *
+ * attr[0][0..3] - window position
+ * attr[1..n][0..3] - remaining attributes.
+ *
+ * Attributes are assumed to be 4 floats wide but are packed so that
+ * all the enabled attributes run contiguously.
+ */
+
+
+struct draw_context;
+struct draw_stage;
+struct llvmpipe_context;
+struct pipe_context;
+
+/** Draw callback type accepted by lp_draw_vbuf_stage(). */
+typedef void (*vbuf_draw_func)( struct pipe_context *pipe,
+                                unsigned prim,
+                                const ushort *elements,
+                                unsigned nr_elements,
+                                const void *vertex_buffer,
+                                unsigned nr_vertices );
+/** Create the setup/render draw stage implemented in lp_prim_setup.c. */
+extern struct draw_stage *
+lp_draw_render_stage( struct llvmpipe_context *llvmpipe );
+/** Return the setup context of a stage made by lp_draw_render_stage(). */
+extern struct setup_context *
+lp_draw_setup_context( struct draw_stage * );
+/** Invoke the stage's flush hook with flags 0. */
+extern void
+lp_draw_flush( struct draw_stage * );
+
+extern struct draw_stage *
+lp_draw_vbuf_stage( struct draw_context *draw_context,
+                    struct pipe_context *pipe,
+                    vbuf_draw_func draw );
+
+
+#endif /* LP_PRIM_SETUP_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_prim_vbuf.c b/src/gallium/drivers/llvmpipe/lp_prim_vbuf.c
new file mode 100644
index 0000000000..c394dcb61d
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_prim_vbuf.c
@@ -0,0 +1,607 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * Interface between 'draw' module's output and the llvmpipe rasterizer/setup
+ * code.  When the 'draw' module has finished filling a vertex buffer, the
+ * draw_arrays() functions below will be called.  Loop over the vertices and
+ * call the point/line/tri setup functions.
+ *
+ * Authors
+ *  Brian Paul
+ */
+
+
+#include "lp_context.h"
+#include "lp_state.h"
+#include "lp_prim_vbuf.h"
+#include "lp_prim_setup.h"
+#include "lp_setup.h"
+#include "draw/draw_context.h"
+#include "draw/draw_vbuf.h"
+#include "util/u_memory.h"
+#include "util/u_prim.h"
+
+
+#define LP_MAX_VBUF_INDEXES 1024
+#define LP_MAX_VBUF_SIZE    4096
+/** Pointer to rows of 4 const floats; the type get_vert() returns. */
+typedef const float (*cptrf4)[4];
+
+/**
+ * Subclass of vbuf_render.
+ */
+struct llvmpipe_vbuf_render
+{
+   struct vbuf_render base;    /**< first member: llvmpipe_vbuf_render() casts */
+   struct llvmpipe_context *llvmpipe;
+   uint prim;                  /**< set by lp_vbuf_set_primitive() */
+   uint vertex_size;           /**< from the last allocate_vertices call */
+   uint nr_vertices;           /**< from the last allocate_vertices call */
+   uint vertex_buffer_size;    /**< bytes requested for vertex_buffer */
+   void *vertex_buffer;        /**< from align_malloc(size, 16) */
+};
+
+
+/** Cast wrapper: view a vbuf_render as the llvmpipe subclass embedding it. */
+static struct llvmpipe_vbuf_render *
+llvmpipe_vbuf_render(struct vbuf_render *vbr)
+{
+   return (struct llvmpipe_vbuf_render *) vbr;
+}
+
+
+/** Return what llvmpipe_get_vbuf_vertex_info() reports for the context. */
+static const struct vertex_info *
+lp_vbuf_get_vertex_info(struct vbuf_render *vbr)
+{
+   return llvmpipe_get_vbuf_vertex_info(llvmpipe_vbuf_render(vbr)->llvmpipe);
+}
+
+
+/** Ensure the vertex buffer holds vertex_size * nr_vertices bytes.
+ *  Returns FALSE if the buffer could not be allocated. */
+static boolean
+lp_vbuf_allocate_vertices(struct vbuf_render *vbr,
+                          ushort vertex_size, ushort nr_vertices)
+{
+   struct llvmpipe_vbuf_render *cvbr = llvmpipe_vbuf_render(vbr);
+   unsigned size = (unsigned) vertex_size * nr_vertices; /* no int overflow */
+
+   if (cvbr->vertex_buffer_size < size) {
+      align_free(cvbr->vertex_buffer);
+      cvbr->vertex_buffer = align_malloc(size, 16);
+      /* Record 0 on failure so that a later, smaller request retries. */
+      cvbr->vertex_buffer_size = cvbr->vertex_buffer ? size : 0;
+   }
+   cvbr->vertex_size = vertex_size;
+   cvbr->nr_vertices = nr_vertices;
+   return cvbr->vertex_buffer != NULL;
+}
+
+/** Frees nothing; the #if 0 section is a disabled per-vertex debug dump. */
+static void
+lp_vbuf_release_vertices(struct vbuf_render *vbr)
+{
+#if 0
+   {
+      struct llvmpipe_vbuf_render *cvbr = llvmpipe_vbuf_render(vbr);
+      const struct vertex_info *info = 
+         llvmpipe_get_vbuf_vertex_info(cvbr->llvmpipe);
+      const float *vtx = (const float *) cvbr->vertex_buffer;
+      uint i, j;
+      debug_printf("%s (vtx_size = %u,  vtx_used = %u)\n",
+             __FUNCTION__, cvbr->vertex_size, cvbr->nr_vertices);
+      for (i = 0; i < cvbr->nr_vertices; i++) {
+         for (j = 0; j < info->num_attribs; j++) {
+            uint k;
+            switch (info->attrib[j].emit) {
+            case EMIT_4F:  k = 4;   break;
+            case EMIT_3F:  k = 3;   break;
+            case EMIT_2F:  k = 2;   break;
+            case EMIT_1F:  k = 1;   break;
+            default: assert(0);
+            }
+            debug_printf("Vert %u attr %u: ", i, j);
+            while (k-- > 0) {
+               debug_printf("%g ", vtx[0]);
+               vtx++;
+            }
+            debug_printf("\n");
+         }
+      }
+   }
+#endif
+   /* keep the old allocation for next time */
+}
+
+/** Return the current vertex buffer pointer (NULL if none is allocated). */
+static void *
+lp_vbuf_map_vertices(struct vbuf_render *vbr)
+{
+   return llvmpipe_vbuf_render(vbr)->vertex_buffer;
+}
+
+/** Nothing to unmap; only asserts that max_index fits in the buffer. */
+static void
+lp_vbuf_unmap_vertices(struct vbuf_render *vbr,
+                       ushort min_index,
+                       ushort max_index )
+{
+   const struct llvmpipe_vbuf_render *cvbr = llvmpipe_vbuf_render(vbr);
+   assert( (max_index + 1) * cvbr->vertex_size <= cvbr->vertex_buffer_size );
+}
+
+
+/**
+ * Store the primitive type, publish its reduced form on the context and
+ * call llvmpipe_setup_prepare() on the setup context.  Always returns TRUE.
+ */
+static boolean
+lp_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim)
+{
+   struct llvmpipe_vbuf_render *cvbr = llvmpipe_vbuf_render(vbr);
+   struct llvmpipe_context *llvmpipe = cvbr->llvmpipe;
+
+   /* XXX: break this dependency - make setup_context live under llvmpipe,
+    * rename the old "setup" draw stage to something else. */
+   llvmpipe_setup_prepare( lp_draw_setup_context(llvmpipe->setup) );
+   llvmpipe->reduced_prim = u_reduced_prim(prim);
+   cvbr->prim = prim;
+   return TRUE;
+}
+
+
+/** Address of vertex 'index' in a buffer laid out 'stride' bytes apart. */
+static INLINE cptrf4
+get_vert( const void *vertex_buffer, int index, int stride )
+{
+   return (cptrf4) ((const char *) vertex_buffer + index * stride);
+}
+
+
+/**
+ * draw elements / indexed primitives
+ */
+static void
+lp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
+{
+   struct llvmpipe_vbuf_render *cvbr = llvmpipe_vbuf_render(vbr);
+   struct llvmpipe_context *llvmpipe = cvbr->llvmpipe;
+   const unsigned stride = llvmpipe->vertex_info_vbuf.size * sizeof(float);
+   const void *vertex_buffer = cvbr->vertex_buffer;
+   unsigned i;
+
+   /* XXX: break this dependency - make setup_context live under
+    * llvmpipe, rename the old "setup" draw stage to something else.
+    */
+   struct draw_stage *setup = llvmpipe->setup;
+   struct setup_context *setup_ctx = lp_draw_setup_context(setup);
+
+   switch (cvbr->prim) {
+   case PIPE_PRIM_POINTS:
+      for (i = 0; i < nr; i++) {
+         llvmpipe_setup_point( setup_ctx,
+                      get_vert(vertex_buffer, indices[i-0], stride) );
+      }
+      break;
+
+   case PIPE_PRIM_LINES:
+      for (i = 1; i < nr; i += 2) {
+         llvmpipe_setup_line( setup_ctx,
+                     get_vert(vertex_buffer, indices[i-1], stride),
+                     get_vert(vertex_buffer, indices[i-0], stride) );
+      }
+      break;
+
+   case PIPE_PRIM_LINE_STRIP:
+      for (i = 1; i < nr; i ++) {
+         llvmpipe_setup_line( setup_ctx,
+                     get_vert(vertex_buffer, indices[i-1], stride),
+                     get_vert(vertex_buffer, indices[i-0], stride) );
+      }
+      break;
+
+   case PIPE_PRIM_LINE_LOOP:
+      for (i = 1; i < nr; i ++) {
+         llvmpipe_setup_line( setup_ctx,
+                     get_vert(vertex_buffer, indices[i-1], stride),
+                     get_vert(vertex_buffer, indices[i-0], stride) );
+      }
+      if (nr) {
+         llvmpipe_setup_line( setup_ctx,
+                     get_vert(vertex_buffer, indices[nr-1], stride),
+                     get_vert(vertex_buffer, indices[0], stride) );
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLES:
+      if (llvmpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i += 3) {
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, indices[i-1], stride),
+                       get_vert(vertex_buffer, indices[i-0], stride),
+                       get_vert(vertex_buffer, indices[i-2], stride) );
+         }
+      }
+      else {
+         for (i = 2; i < nr; i += 3) {
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, indices[i-2], stride),
+                       get_vert(vertex_buffer, indices[i-1], stride),
+                       get_vert(vertex_buffer, indices[i-0], stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLE_STRIP:
+      if (llvmpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i += 1) {
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, indices[i+(i&1)-1], stride),
+                       get_vert(vertex_buffer, indices[i-(i&1)], stride),
+                       get_vert(vertex_buffer, indices[i-2], stride) );
+         }
+      }
+      else {
+         for (i = 2; i < nr; i += 1) {
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, indices[i+(i&1)-2], stride),
+                       get_vert(vertex_buffer, indices[i-(i&1)-1], stride),
+                       get_vert(vertex_buffer, indices[i-0], stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLE_FAN:
+      if (llvmpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i += 1) {
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, indices[i-0], stride),
+                       get_vert(vertex_buffer, indices[0], stride),
+                       get_vert(vertex_buffer, indices[i-1], stride) );
+         }
+      }
+      else {
+         for (i = 2; i < nr; i += 1) {
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, indices[0], stride),
+                       get_vert(vertex_buffer, indices[i-1], stride),
+                       get_vert(vertex_buffer, indices[i-0], stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_QUADS:
+      if (llvmpipe->rasterizer->flatshade_first) {
+         for (i = 3; i < nr; i += 4) {
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, indices[i-2], stride),
+                       get_vert(vertex_buffer, indices[i-1], stride),
+                       get_vert(vertex_buffer, indices[i-3], stride) );
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, indices[i-1], stride),
+                       get_vert(vertex_buffer, indices[i-0], stride),
+                       get_vert(vertex_buffer, indices[i-3], stride) );
+         }
+      }
+      else {
+         for (i = 3; i < nr; i += 4) {
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, indices[i-3], stride),
+                       get_vert(vertex_buffer, indices[i-2], stride),
+                       get_vert(vertex_buffer, indices[i-0], stride) );
+
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, indices[i-2], stride),
+                       get_vert(vertex_buffer, indices[i-1], stride),
+                       get_vert(vertex_buffer, indices[i-0], stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_QUAD_STRIP:
+      if (llvmpipe->rasterizer->flatshade_first) {
+         for (i = 3; i < nr; i += 2) {
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, indices[i-0], stride),
+                       get_vert(vertex_buffer, indices[i-1], stride),
+                       get_vert(vertex_buffer, indices[i-3], stride));
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, indices[i-2], stride),
+                       get_vert(vertex_buffer, indices[i-0], stride),
+                       get_vert(vertex_buffer, indices[i-3], stride) );
+         }
+      }
+      else {
+         for (i = 3; i < nr; i += 2) {
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, indices[i-3], stride),
+                       get_vert(vertex_buffer, indices[i-2], stride),
+                       get_vert(vertex_buffer, indices[i-0], stride) );
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, indices[i-1], stride),
+                       get_vert(vertex_buffer, indices[i-3], stride),
+                       get_vert(vertex_buffer, indices[i-0], stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_POLYGON:
+      /* Almost same as tri fan but the _first_ vertex specifies the flat
+       * shading color.  Note that the first polygon vertex is passed as
+       * the last triangle vertex here.
+       * flatshade_first state makes no difference.
+       */
+      for (i = 2; i < nr; i += 1) {
+         llvmpipe_setup_tri( setup_ctx,
+                    get_vert(vertex_buffer, indices[i-0], stride),
+                    get_vert(vertex_buffer, indices[i-1], stride),
+                    get_vert(vertex_buffer, indices[0], stride) );
+      }
+      break;
+
+   default:
+      assert(0);
+   }
+
+   /* XXX: why are we calling this???  If we had to call something, it
+    * would be a function in lp_setup.c:
+    */
+   lp_draw_flush( setup );
+}
+
+
+/**
+ * Hit when the draw module is working in pass-through mode: it's up to us to
+ * convert the nr vertices at 'start' into point/line/tri prims (cvbr->prim).
+ */
+static void
+lp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
+{
+   struct llvmpipe_vbuf_render *cvbr = llvmpipe_vbuf_render(vbr);
+   struct llvmpipe_context *llvmpipe = cvbr->llvmpipe;
+   const unsigned stride = llvmpipe->vertex_info_vbuf.size * sizeof(float);
+   const void *vertex_buffer =
+      (void *) get_vert(cvbr->vertex_buffer, start, stride);
+   unsigned i;
+
+   /* XXX: break this dependency - make setup_context live under
+    * llvmpipe, rename the old "setup" draw stage to something else.
+    */
+   struct draw_stage *setup = llvmpipe->setup;
+   struct setup_context *setup_ctx = lp_draw_setup_context(setup);
+
+   switch (cvbr->prim) {
+   case PIPE_PRIM_POINTS:
+      for (i = 0; i < nr; i++) {
+         llvmpipe_setup_point( setup_ctx,
+                      get_vert(vertex_buffer, i-0, stride) );
+      }
+      break;
+
+   case PIPE_PRIM_LINES:
+      for (i = 1; i < nr; i += 2) {
+         llvmpipe_setup_line( setup_ctx,
+                     get_vert(vertex_buffer, i-1, stride),
+                     get_vert(vertex_buffer, i-0, stride) );
+      }
+      break;
+
+   case PIPE_PRIM_LINE_STRIP:
+      for (i = 1; i < nr; i ++) {
+         llvmpipe_setup_line( setup_ctx,
+                     get_vert(vertex_buffer, i-1, stride),
+                     get_vert(vertex_buffer, i-0, stride) );
+      }
+      break;
+
+   case PIPE_PRIM_LINE_LOOP:
+      for (i = 1; i < nr; i ++) {
+         llvmpipe_setup_line( setup_ctx,
+                     get_vert(vertex_buffer, i-1, stride),
+                     get_vert(vertex_buffer, i-0, stride) );
+      }
+      if (nr) {
+         llvmpipe_setup_line( setup_ctx,
+                     get_vert(vertex_buffer, nr-1, stride),
+                     get_vert(vertex_buffer, 0, stride) );
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLES:
+      if (llvmpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i += 3) {
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, i-1, stride),
+                       get_vert(vertex_buffer, i-0, stride),
+                       get_vert(vertex_buffer, i-2, stride) );
+         }
+      }
+      else {
+         for (i = 2; i < nr; i += 3) {
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, i-2, stride),
+                       get_vert(vertex_buffer, i-1, stride),
+                       get_vert(vertex_buffer, i-0, stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLE_STRIP:
+      if (llvmpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i++) {
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, i+(i&1)-1, stride),
+                       get_vert(vertex_buffer, i-(i&1), stride),
+                       get_vert(vertex_buffer, i-2, stride) );
+         }
+      }
+      else {
+         for (i = 2; i < nr; i++) {
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, i+(i&1)-2, stride),
+                       get_vert(vertex_buffer, i-(i&1)-1, stride),
+                       get_vert(vertex_buffer, i-0, stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLE_FAN:
+      if (llvmpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i += 1) {
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, i-0, stride),
+                       get_vert(vertex_buffer, 0, stride),
+                       get_vert(vertex_buffer, i-1, stride) );
+         }
+      }
+      else {
+         for (i = 2; i < nr; i += 1) {
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, 0, stride),
+                       get_vert(vertex_buffer, i-1, stride),
+                       get_vert(vertex_buffer, i-0, stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_QUADS:
+      if (llvmpipe->rasterizer->flatshade_first) {
+         for (i = 3; i < nr; i += 4) {
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, i-2, stride),
+                       get_vert(vertex_buffer, i-1, stride),
+                       get_vert(vertex_buffer, i-3, stride) );
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, i-1, stride),
+                       get_vert(vertex_buffer, i-0, stride),
+                       get_vert(vertex_buffer, i-3, stride) );
+         }
+      }
+      else {
+         for (i = 3; i < nr; i += 4) {
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, i-3, stride),
+                       get_vert(vertex_buffer, i-2, stride),
+                       get_vert(vertex_buffer, i-0, stride) );
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, i-2, stride),
+                       get_vert(vertex_buffer, i-1, stride),
+                       get_vert(vertex_buffer, i-0, stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_QUAD_STRIP:
+      if (llvmpipe->rasterizer->flatshade_first) {
+         for (i = 3; i < nr; i += 2) {
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, i-0, stride),
+                       get_vert(vertex_buffer, i-1, stride),
+                       get_vert(vertex_buffer, i-3, stride) );
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, i-2, stride),
+                       get_vert(vertex_buffer, i-0, stride),
+                       get_vert(vertex_buffer, i-3, stride) );
+         }
+      }
+      else {
+         for (i = 3; i < nr; i += 2) {
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, i-3, stride),
+                       get_vert(vertex_buffer, i-2, stride),
+                       get_vert(vertex_buffer, i-0, stride) );
+            llvmpipe_setup_tri( setup_ctx,
+                       get_vert(vertex_buffer, i-1, stride),
+                       get_vert(vertex_buffer, i-3, stride),
+                       get_vert(vertex_buffer, i-0, stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_POLYGON:
+      /* Almost same as tri fan but the _first_ vertex specifies the flat
+       * shading color.  Note that the first polygon vertex is passed as
+       * the last triangle vertex here.
+       * flatshade_first state makes no difference.
+       */
+      for (i = 2; i < nr; i += 1) {
+         llvmpipe_setup_tri( setup_ctx,
+                    get_vert(vertex_buffer, i-1, stride),
+                    get_vert(vertex_buffer, i-0, stride),
+                    get_vert(vertex_buffer, 0, stride) );
+      }
+      break;
+
+   default:
+      assert(0);
+   }
+}
+
+/** Destroy hook: detach the renderer from its owning context and free it. */
+static void
+lp_vbuf_destroy(struct vbuf_render *vbr)
+{
+   struct llvmpipe_vbuf_render *render = llvmpipe_vbuf_render(vbr);
+   struct llvmpipe_context *owner = render->llvmpipe;
+   owner->vbuf_render = NULL;
+   FREE(render);
+}
+
+
+/**
+ * Initialize the post-transform vertex buffer information for the given
+ * context.  If the renderer cannot be allocated, lp->vbuf_render stays NULL.
+ */
+void
+lp_init_vbuf(struct llvmpipe_context *lp)
+{
+   assert(lp->draw);
+
+   lp->vbuf_render = CALLOC_STRUCT(llvmpipe_vbuf_render);
+   if (!lp->vbuf_render)
+      return;  /* allocation failed: do not dereference a NULL renderer */
+
+   lp->vbuf_render->base.max_indices = LP_MAX_VBUF_INDEXES;
+   lp->vbuf_render->base.max_vertex_buffer_bytes = LP_MAX_VBUF_SIZE;
+
+   lp->vbuf_render->base.get_vertex_info = lp_vbuf_get_vertex_info;
+   lp->vbuf_render->base.allocate_vertices = lp_vbuf_allocate_vertices;
+   lp->vbuf_render->base.map_vertices = lp_vbuf_map_vertices;
+   lp->vbuf_render->base.unmap_vertices = lp_vbuf_unmap_vertices;
+   lp->vbuf_render->base.set_primitive = lp_vbuf_set_primitive;
+   lp->vbuf_render->base.draw = lp_vbuf_draw;
+   lp->vbuf_render->base.draw_arrays = lp_vbuf_draw_arrays;
+   lp->vbuf_render->base.release_vertices = lp_vbuf_release_vertices;
+   lp->vbuf_render->base.destroy = lp_vbuf_destroy;
+   lp->vbuf_render->llvmpipe = lp;
+
+   /* Build the vbuf stage around the renderer and register both with draw. */
+   lp->vbuf = draw_vbuf_stage(lp->draw, &lp->vbuf_render->base);
+   draw_set_rasterize_stage(lp->draw, lp->vbuf);
+   draw_set_render(lp->draw, &lp->vbuf_render->base);
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_prim_vbuf.h b/src/gallium/drivers/llvmpipe/lp_prim_vbuf.h
new file mode 100644
index 0000000000..6c4e6063e6
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_prim_vbuf.h
@@ -0,0 +1,38 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef LP_VBUF_H
+#define LP_VBUF_H
+
+/* lp_init_vbuf() initializes a context's post-transform vertex buffer info. */
+struct llvmpipe_context;
+
+extern void
+lp_init_vbuf(struct llvmpipe_context *llvmpipe);
+
+
+#endif /* LP_VBUF_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_quad.h b/src/gallium/drivers/llvmpipe/lp_quad.h
new file mode 100644
index 0000000000..7eb05de77a
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_quad.h
@@ -0,0 +1,114 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef LP_QUAD_H
+#define LP_QUAD_H
+
+#include "pipe/p_state.h"
+#include "tgsi/tgsi_exec.h"
+
+/* Primitive kinds, as stored in quad_header_input::prim. */
+#define QUAD_PRIM_POINT 1
+#define QUAD_PRIM_LINE  2
+#define QUAD_PRIM_TRI   3
+
+
+/* The rasterizer generates 2x2 quads of fragments and feeds them to the
+ * current fp_machine.  MASK_x is the bit (1 << QUAD_x) for each position.
+ * Remember that Y=0=top with Y increasing down the window.
+ */
+#define QUAD_TOP_LEFT     0
+#define QUAD_TOP_RIGHT    1
+#define QUAD_BOTTOM_LEFT  2
+#define QUAD_BOTTOM_RIGHT 3
+
+#define MASK_TOP_LEFT     (1 << QUAD_TOP_LEFT)
+#define MASK_TOP_RIGHT    (1 << QUAD_TOP_RIGHT)
+#define MASK_BOTTOM_LEFT  (1 << QUAD_BOTTOM_LEFT)
+#define MASK_BOTTOM_RIGHT (1 << QUAD_BOTTOM_RIGHT)
+#define MASK_ALL          0xf
+
+
+/**
+ * Quad stage inputs: window position, coverage, facing and primitive kind.
+ */
+struct quad_header_input
+{
+   int x0, y0;                /**< quad window pos, always even */
+   float coverage[QUAD_SIZE]; /**< fragment coverage for antialiasing */
+   unsigned facing:1;         /**< 0 = front facing, 1 = back facing */
+   unsigned prim:2;           /**< QUAD_PRIM_POINT, QUAD_PRIM_LINE or QUAD_PRIM_TRI */
+};
+
+
+/**
+ * Quad stage inputs/outputs: a 4-bit mask (presumably MASK_x bits; confirm).
+ */
+struct quad_header_inout
+{
+   unsigned mask:4;
+};
+
+
+/**
+ * Quad stage outputs: color only (this struct carries no depth member).
+ */
+struct quad_header_output
+{
+   /** colors in SOA format (rrrr, gggg, bbbb, aaaa) */
+   float ALIGN16_ATTRIB color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS][QUAD_SIZE];
+};
+
+
+/**
+ * Input interpolation coefficients, [1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS].
+ */
+struct quad_interp_coef
+{
+   float ALIGN16_ATTRIB a0[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
+   float ALIGN16_ATTRIB dadx[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
+   float ALIGN16_ATTRIB dady[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
+};
+
+
+/**
+ * Encodes everything we need to know about a 2x2 pixel block.  Uses
+ * "Channel-Serial" or "SoA" layout.
+ */
+struct quad_header {
+   struct quad_header_input input;
+   struct quad_header_inout inout;
+
+   /* Redundant/duplicated: read-only pointer to the interpolation coefficients.
+    */
+   const struct quad_interp_coef *coef;
+};
+
+#endif /* LP_QUAD_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_query.c b/src/gallium/drivers/llvmpipe/lp_query.c
new file mode 100644
index 0000000000..5554285425
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_query.c
@@ -0,0 +1,111 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Author:
+ *    Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "draw/draw_context.h"
+#include "pipe/p_defines.h"
+#include "util/u_memory.h"
+#include "lp_context.h"
+#include "lp_query.h"
+#include "lp_state.h"
+
+struct llvmpipe_query {
+   uint64_t start;   /**< context occlusion_count sampled by begin_query */
+   uint64_t end;     /**< context occlusion_count sampled by end_query */
+};
+
+/** Cast an opaque pipe_query handle back to the driver's llvmpipe_query. */
+static struct llvmpipe_query *llvmpipe_query( struct pipe_query *p )
+{
+   return (struct llvmpipe_query *)p;
+}
+
+/** Allocate a query object; only occlusion counters are expected (asserted). */
+static struct pipe_query *
+llvmpipe_create_query(struct pipe_context *pipe, unsigned type)
+{
+   assert(type == PIPE_QUERY_OCCLUSION_COUNTER);
+   return (struct pipe_query *)CALLOC_STRUCT(llvmpipe_query);
+}
+
+/** Free a query object; the context argument is not used. */
+static void
+llvmpipe_destroy_query(struct pipe_context *pipe, struct pipe_query *q)
+{
+   FREE(q);
+}
+
+/** Start a query: record the current occlusion count and flag LP_NEW_QUERY. */
+static void
+llvmpipe_begin_query(struct pipe_context *pipe, struct pipe_query *q)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+   struct llvmpipe_query *query = llvmpipe_query(q);
+
+   query->start = lp->occlusion_count;
+   lp->active_query_count += 1;
+   lp->dirty |= LP_NEW_QUERY;
+}
+
+/** Stop a query: record the occlusion count reached and flag LP_NEW_QUERY. */
+static void
+llvmpipe_end_query(struct pipe_context *pipe, struct pipe_query *q)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+   struct llvmpipe_query *query = llvmpipe_query(q);
+
+   lp->active_query_count -= 1;
+   query->end = lp->occlusion_count;
+   lp->dirty |= LP_NEW_QUERY;
+}
+
+/** Store end - start in *result; always returns TRUE and ignores 'wait'. */
+static boolean
+llvmpipe_get_query_result(struct pipe_context *pipe,
+                          struct pipe_query *q,
+                          boolean wait,
+                          uint64_t *result)
+{
+   const struct llvmpipe_query *query = llvmpipe_query(q);
+   *result = query->end - query->start;
+   return TRUE;
+}
+
+/** Point the context's pipe query callbacks at the functions in this file. */
+void llvmpipe_init_query_funcs(struct llvmpipe_context *llvmpipe )
+{
+   llvmpipe->pipe.create_query = llvmpipe_create_query;
+   llvmpipe->pipe.destroy_query = llvmpipe_destroy_query;
+   llvmpipe->pipe.begin_query = llvmpipe_begin_query;
+   llvmpipe->pipe.end_query = llvmpipe_end_query;
+   llvmpipe->pipe.get_query_result = llvmpipe_get_query_result;
+}
+
+
diff --git a/src/gallium/drivers/llvmpipe/lp_query.h b/src/gallium/drivers/llvmpipe/lp_query.h
new file mode 100644
index 0000000000..fa9fcd8713
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_query.h
@@ -0,0 +1,39 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Author:
+ *    Keith Whitwell
+ */
+
+#ifndef LP_QUERY_H
+#define LP_QUERY_H
+/* llvmpipe_init_query_funcs() installs the query callbacks on a context. */
+struct llvmpipe_context;
+extern void llvmpipe_init_query_funcs(struct llvmpipe_context * );
+
+
+#endif /* LP_QUERY_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
new file mode 100644
index 0000000000..ff7ef8658a
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -0,0 +1,238 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "util/u_memory.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_screen.h"
+
+#include "lp_texture.h"
+#include "lp_buffer.h"
+#include "lp_winsys.h"
+#include "lp_jit.h"
+#include "lp_screen.h"
+
+/** pipe_screen get_vendor hook: constant vendor string; 'screen' is unused. */
+static const char *
+llvmpipe_get_vendor(struct pipe_screen *screen)
+{
+   return "VMware, Inc.";
+}
+
+/** pipe_screen get_name hook: constant driver name; 'screen' is unused. */
+static const char *
+llvmpipe_get_name(struct pipe_screen *screen)
+{
+   return "llvmpipe";
+}
+
+
+/**
+ * pipe_screen get_param hook: report integer capabilities and limits.
+ * Parameters that are not listed are reported as 0.
+ */
+static int
+llvmpipe_get_param(struct pipe_screen *screen, int param)
+{
+   switch (param) {
+   /* Texture unit limits. */
+   case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+   case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS:
+      return PIPE_MAX_SAMPLERS;
+
+   case PIPE_CAP_MAX_RENDER_TARGETS:
+      return PIPE_MAX_COLOR_BUFS;
+
+   /* Texture size limits, expressed as level counts. */
+   case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+   case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+      return 13; /* max 4Kx4K */
+   case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+      return 8;  /* max 128x128x128 */
+
+   /* Features reported as supported. */
+   case PIPE_CAP_NPOT_TEXTURES:
+   case PIPE_CAP_TWO_SIDED_STENCIL:
+   case PIPE_CAP_GLSL:
+   case PIPE_CAP_POINT_SPRITE:
+   case PIPE_CAP_OCCLUSION_QUERY:
+   case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+   case PIPE_CAP_TEXTURE_MIRROR_REPEAT:
+   case PIPE_CAP_TEXTURE_SHADOW_MAP:
+   case PIPE_CAP_TGSI_CONT_SUPPORTED:
+   case PIPE_CAP_BLEND_EQUATION_SEPARATE:
+      return 1;
+
+   /* Reported as unsupported, together with every unlisted parameter. */
+   case PIPE_CAP_ANISOTROPIC_FILTER:
+   default:
+      return 0;
+   }
+}
+
+/**
+ * pipe_screen get_paramf hook: report floating-point limits.
+ * Parameters that are not listed are reported as 0.
+ */
+static float
+llvmpipe_get_paramf(struct pipe_screen *screen, int param)
+{
+   switch (param) {
+   case PIPE_CAP_MAX_LINE_WIDTH:
+   case PIPE_CAP_MAX_LINE_WIDTH_AA:
+   case PIPE_CAP_MAX_POINT_WIDTH:
+   case PIPE_CAP_MAX_POINT_WIDTH_AA:
+      return 255.0; /* arbitrary */
+   case PIPE_CAP_MAX_TEXTURE_ANISOTROPY:
+      return 16.0; /* not actually significant at this time */
+   case PIPE_CAP_MAX_TEXTURE_LOD_BIAS:
+      return 16.0; /* arbitrary */
+   default:
+      return 0;
+   }
+}
+
+
+/**
+ * Query format support for creating a texture, drawing surface, etc.
+ * \param format  the format to test
+ * \param target  texture target; must be 1D, 2D, 3D or CUBE (asserted)
+ * \param tex_usage  usage flags; with PIPE_TEXTURE_USAGE_DISPLAY_TARGET set
+ *                   the winsys decides
+ * \param geom_flags  not examined
+ */
+static boolean
+llvmpipe_is_format_supported( struct pipe_screen *_screen,
+                              enum pipe_format format,
+                              enum pipe_texture_target target,
+                              unsigned tex_usage,
+                              unsigned geom_flags )
+{
+   struct llvmpipe_screen *lp_screen = llvmpipe_screen(_screen);
+   struct llvmpipe_winsys *ws = lp_screen->winsys;
+
+   assert(target == PIPE_TEXTURE_1D ||
+          target == PIPE_TEXTURE_2D ||
+          target == PIPE_TEXTURE_3D ||
+          target == PIPE_TEXTURE_CUBE);
+
+   switch (format) {
+   case PIPE_FORMAT_Z16_UNORM:
+   case PIPE_FORMAT_S8_UNORM:
+   case PIPE_FORMAT_DXT1_RGB:
+   case PIPE_FORMAT_DXT1_RGBA:
+   case PIPE_FORMAT_DXT3_RGBA:
+   case PIPE_FORMAT_DXT5_RGBA:
+      return FALSE;   /* rejected regardless of usage */
+   default:
+      break;
+   }
+
+   if (tex_usage & PIPE_TEXTURE_USAGE_DISPLAY_TARGET)
+      return ws->is_displaytarget_format_supported(ws, format);
+
+   return TRUE;
+}
+
+/** surface_buffer_create stub: asserts; returns NULL if execution continues. */
+static struct pipe_buffer *
+llvmpipe_surface_buffer_create(struct pipe_screen *screen,
+                               unsigned width, unsigned height,
+                               enum pipe_format format,
+                               unsigned tex_usage,
+                               unsigned usage,
+                               unsigned *stride)
+{
+   /* This function should never be used */
+   assert(0);
+   return NULL;
+}
+
+/** Show the surface's display target through the winsys; no-op without one. */
+static void
+llvmpipe_flush_frontbuffer(struct pipe_screen *_screen,
+                           struct pipe_surface *surface,
+                           void *context_private)
+{
+   struct llvmpipe_winsys *ws = llvmpipe_screen(_screen)->winsys;
+   struct llvmpipe_texture *tex = llvmpipe_texture(surface->texture);
+
+   assert(tex->dt);
+   if (!tex->dt)
+      return;
+   ws->displaytarget_display(ws, tex->dt, context_private);
+}
+
+/** Tear down in order: JIT state, the winsys (if it can), then the screen. */
+static void
+llvmpipe_destroy_screen( struct pipe_screen *_screen )
+{
+   struct llvmpipe_screen *lp_screen = llvmpipe_screen(_screen);
+   struct llvmpipe_winsys *ws = lp_screen->winsys;
+
+   lp_jit_screen_cleanup(lp_screen);
+
+   if (ws->destroy != NULL) {
+      ws->destroy(ws);
+   }
+   FREE(lp_screen);
+}
+
+
+
+/**
+ * Create a new pipe_screen object: the 'base' member of a freshly allocated
+ * llvmpipe_screen that keeps 'winsys'.  Returns NULL if allocation fails.
+ */
+struct pipe_screen *
+llvmpipe_create_screen(struct llvmpipe_winsys *winsys)
+{
+   struct llvmpipe_screen *lp_screen = CALLOC_STRUCT(llvmpipe_screen);
+   struct pipe_screen *base;
+
+   if (lp_screen == NULL)
+      return NULL;
+
+   base = &lp_screen->base;
+   lp_screen->winsys = winsys;
+
+   base->destroy = llvmpipe_destroy_screen;
+   base->get_name = llvmpipe_get_name;
+   base->get_vendor = llvmpipe_get_vendor;
+   base->get_param = llvmpipe_get_param;
+   base->get_paramf = llvmpipe_get_paramf;
+   base->is_format_supported = llvmpipe_is_format_supported;
+   base->surface_buffer_create = llvmpipe_surface_buffer_create;
+   base->flush_frontbuffer = llvmpipe_flush_frontbuffer;
+
+   llvmpipe_init_screen_texture_funcs(base);
+   llvmpipe_init_screen_buffer_funcs(base);
+
+   lp_jit_screen_init(lp_screen);
+
+   return base;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.h b/src/gallium/drivers/llvmpipe/lp_screen.h
new file mode 100644
index 0000000000..4a1b4d6f3e
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_screen.h
@@ -0,0 +1,79 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 VMware, Inc.
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ * @author Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef LP_SCREEN_H
+#define LP_SCREEN_H
+
+#include <llvm-c/Core.h>
+#include <llvm-c/Analysis.h>
+#include <llvm-c/Target.h>
+#include <llvm-c/ExecutionEngine.h>
+
+#include "pipe/p_screen.h"
+#include "pipe/p_defines.h"
+
+
+struct llvmpipe_winsys;
+
+/** pipe_screen subclass; 'base' must stay first for the llvmpipe_screen() cast. */
+struct llvmpipe_screen
+{
+   struct pipe_screen base;
+
+   struct llvmpipe_winsys *winsys;
+
+   LLVMModuleRef module;
+   LLVMExecutionEngineRef engine;
+   LLVMModuleProviderRef provider;
+   LLVMTargetDataRef target;
+   LLVMPassManagerRef pass;
+
+   LLVMTypeRef context_ptr_type;
+
+   /* Increments whenever textures are modified.  Contexts can track
+    * this.
+    */
+   unsigned timestamp;          
+};
+
+
+
+/** Downcast; relies on 'base' being the first member of llvmpipe_screen. */
+static INLINE struct llvmpipe_screen *
+llvmpipe_screen( struct pipe_screen *pipe )
+{
+   return (struct llvmpipe_screen *)pipe;
+}
+
+
+#endif /* LP_SCREEN_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
new file mode 100644
index 0000000000..2d2fc19a65
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -0,0 +1,1483 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * \brief  Primitive rasterization/rendering (points, lines, triangles)
+ *
+ * \author  Keith Whitwell <keith@tungstengraphics.com>
+ * \author  Brian Paul
+ */
+
+#include "lp_context.h"
+#include "lp_prim_setup.h"
+#include "lp_quad.h"
+#include "lp_setup.h"
+#include "lp_state.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
+#include "draw/draw_vertex.h"
+#include "pipe/p_shader_tokens.h"
+#include "pipe/p_thread.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "lp_bld_debug.h"
+#include "lp_tile_cache.h"
+#include "lp_tile_soa.h"
+
+
+#define DEBUG_VERTS 0
+#define DEBUG_FRAGS 0
+
+/**
+ * Per-edge state used while scan converting a triangle (emaj is also reused for lines).
+ */
+struct edge {
+   float dx;		/**< X(v1) - X(v0), used only during setup */
+   float dy;		/**< Y(v1) - Y(v0), used only during setup */
+   float dxdy;		/**< dx/dy, i.e. change in X per scanline */
+   float sx, sy;	/**< first sample point coord */
+   int lines;		/**< number of scanlines covered by this edge */
+};
+
+
+#define MAX_QUADS 16
+
+
+/**
+ * Per-primitive setup state for triangle rasterization.
+ * Also used for line drawing (taking some liberties).
+ */
+struct setup_context {
+   struct llvmpipe_context *llvmpipe;
+
+   /* Vertices are just an array of floats making up each attribute in
+    * turn.  Currently fixed at 4 floats, but should change in time.
+    * Codegen will help cope with this.
+    */
+   const float (*vmax)[4];      /**< vertex with the largest Y (second vertex for lines) */
+   const float (*vmid)[4];      /**< vertex with the middle Y (triangles only) */
+   const float (*vmin)[4];      /**< vertex with the smallest Y (first vertex for lines) */
+   const float (*vprovoke)[4];  /**< vertex supplying constant (flat) attribute values */
+
+   struct edge ebot;            /**< edge vmin -> vmid */
+   struct edge etop;            /**< edge vmid -> vmax */
+   struct edge emaj;            /**< edge vmin -> vmax */
+
+   float oneoverarea;           /**< 1/area (triangles) or 1/squared-length (lines) */
+   int facing;                  /**< set by setup_sort_vertices() from det and front_winding */
+
+   struct quad_header quad[MAX_QUADS];
+   struct quad_header *quad_ptrs[MAX_QUADS];
+   unsigned count;
+
+   struct quad_interp_coef coef; /**< [0] = position (z, w), [1 + n] = fragment shader input n */
+
+   struct {
+      int left[2];   /**< [0] = row0, [1] = row1 */
+      int right[2];  /**< indexed like left[] */
+      int y;         /**< even Y of the two buffered rows */
+   } span;
+
+#if DEBUG_FRAGS
+   uint numFragsEmitted;  /**< per primitive */
+   uint numFragsWritten;  /**< per primitive */
+#endif
+
+   unsigned winding;		/* which winding to cull */
+};
+
+
+
+/**
+ * Run the current fragment shader variant over one block of quads.
+ */
+static void
+shade_quads(struct llvmpipe_context *llvmpipe,
+            struct quad_header *quads[],
+            unsigned nr)
+{
+   struct lp_fragment_shader *shader = llvmpipe->fs;
+   struct quad_header *first = quads[0];
+   const unsigned x = first->input.x0;
+   const unsigned y = first->input.y0;
+   uint8_t *tile = lp_get_cached_tile(llvmpipe->cbuf_cache[0], x, y);
+   uint32_t ALIGN16_ATTRIB mask[4][NUM_CHANNELS];
+   uint8_t *color;
+   void *depth;
+   unsigned chan;
+   unsigned i;
+
+   assert(shader->current);
+   if(!shader->current)
+      return;
+
+   /* The block must be one full tile vector of horizontally adjacent quads */
+   assert(nr * QUAD_SIZE == TILE_VECTOR_HEIGHT * TILE_VECTOR_WIDTH);
+   assert(x % TILE_VECTOR_WIDTH == 0);
+   assert(y % TILE_VECTOR_HEIGHT == 0);
+   for (i = 0; i < nr; ++i) {
+      assert(quads[i]->input.x0 == x + i*2);
+      assert(quads[i]->input.y0 == y);
+   }
+
+   /* expand each quad's coverage bits into all-ones/all-zeros words */
+   for (i = 0; i < 4; ++i) {
+      for (chan = 0; chan < NUM_CHANNELS; ++chan)
+         mask[i][chan] = (quads[i]->inout.mask & (1 << chan)) ? ~0 : 0;
+   }
+
+   /* color buffer */
+   color = &TILE_PIXEL(tile, x & (TILE_SIZE-1), y & (TILE_SIZE-1), 0);
+
+   /* depth buffer (stays NULL when no depth/stencil buffer is mapped) */
+   depth = NULL;
+   if(llvmpipe->zsbuf_map) {
+      assert((x % 2) == 0);
+      assert((y % 2) == 0);
+      depth = llvmpipe->zsbuf_map +
+              y*llvmpipe->zsbuf_transfer->stride +
+              2*x*llvmpipe->zsbuf_transfer->block.size;
+   }
+
+   /* XXX: This will most likely fail on 32bit x86 without -mstackrealign */
+   assert(lp_check_alignment(mask, 16));
+
+   assert(lp_check_alignment(depth, 16));
+   assert(lp_check_alignment(color, 16));
+   assert(lp_check_alignment(llvmpipe->jit_context.blend_color, 16));
+
+   /* run shader */
+   shader->current->jit_function( &llvmpipe->jit_context,
+                                  x, y,
+                                  first->coef->a0,
+                                  first->coef->dadx,
+                                  first->coef->dady,
+                                  &mask[0][0],
+                                  color,
+                                  depth);
+}
+
+
+
+
+/**
+ * Decide whether a triangle is culled, based on the sign of its determinant.
+ * \return TRUE if the triangle must be discarded, FALSE if it is to be drawn.
+ */
+static INLINE boolean
+cull_tri(const struct setup_context *setup, float det)
+{
+   unsigned tri_winding;
+
+   /* degenerate (zero area) triangle: nothing to draw */
+   if (det == 0)
+      return TRUE;
+
+   /* det < 0: Z points toward the camera, i.e. counter-clockwise winding */
+   tri_winding = (det < 0) ? PIPE_WINDING_CCW : PIPE_WINDING_CW;
+
+   /* culled only when this winding is among the ones selected for culling */
+   if (tri_winding & setup->winding)
+      return TRUE;
+   return FALSE;
+}
+
+
+
+/**
+ * Clip a quad against the cliprect, clearing the coverage bits that fall outside it.
+ */
+static INLINE void
+quad_clip( struct setup_context *setup, struct quad_header *quad )
+{
+   const struct pipe_scissor_state *rect = &setup->llvmpipe->cliprect;
+   const int left = (int) rect->minx;
+   const int right = (int) rect->maxx;
+   const int top = (int) rect->miny;
+   const int bottom = (int) rect->maxy;
+
+   if (!(quad->input.x0 < right &&
+         quad->input.y0 < bottom &&
+         quad->input.x0 + 1 >= left &&
+         quad->input.y0 + 1 >= top)) {
+      /* totally clipped */
+      quad->inout.mask = 0x0;
+      return;
+   }
+   if (quad->input.x0 < left)
+      quad->inout.mask &= (MASK_TOP_RIGHT | MASK_BOTTOM_RIGHT);
+   if (quad->input.y0 < top)
+      quad->inout.mask &= (MASK_BOTTOM_RIGHT | MASK_BOTTOM_LEFT);
+   if (right - 1 == quad->input.x0)
+      quad->inout.mask &= (MASK_TOP_LEFT | MASK_BOTTOM_LEFT);
+   if (bottom - 1 == quad->input.y0)
+      quad->inout.mask &= (MASK_TOP_RIGHT | MASK_TOP_LEFT);
+}
+
+
+
+/**
+ * Round an X or Y coordinate down to the origin of the 2x2 quad that
+ * contains it, i.e. clear its lowest bit.
+ */
+static INLINE int block( int x )
+{
+   return x & ~1;
+}
+/** Clear the bits of X selected by (TILE_VECTOR_WIDTH - 1); rounds down when the width is a power of two. */
+static INLINE int block_x( int x )
+{
+   return x - (x & (TILE_VECTOR_WIDTH - 1));
+}
+
+
+/**
+ * Clip a quad and, if any pixel survives, shade it as part of a full block.
+ */
+static INLINE void
+clip_emit_quad( struct setup_context *setup, struct quad_header *quad )
+{
+   quad_clip( setup, quad );
+
+   if (quad->inout.mask) {
+      struct llvmpipe_context *lp = setup->llvmpipe;
+
+#if 1
+      /* XXX: The blender expects 4 quads. This is far from efficient, but
+       * until we codegenerate single-quad variants of the fragment pipeline
+       * we need this hack: pad the block with empty (mask == 0) quads. */
+      const unsigned block_quads = TILE_VECTOR_HEIGHT*TILE_VECTOR_WIDTH/QUAD_SIZE;
+      struct quad_header padded[block_quads];
+      struct quad_header *padded_ptrs[block_quads];
+      int first_x = block_x(quad->input.x0);
+      unsigned n;
+
+      for(n = 0; n < block_quads; ++n) {
+         int x = first_x + 2*n;
+         if(x != quad->input.x0) {
+            memset(&padded[n], 0, sizeof padded[n]);
+            padded[n].input.x0 = x;
+            padded[n].input.y0 = quad->input.y0;
+            padded[n].coef = quad->coef;
+         }
+         else
+            memcpy(&padded[n], quad, sizeof padded[n]);
+         padded_ptrs[n] = &padded[n];
+      }
+
+      shade_quads( lp, padded_ptrs, block_quads );
+#else
+      shade_quads( lp, &quad, 1 );
+#endif
+   }
+}
+
+
+/**
+ * Shade the quads covered by the two buffered span rows, then reset the span.
+ */
+static void flush_spans( struct setup_context *setup )
+{
+   const int step = TILE_VECTOR_WIDTH;   /* pixels advanced per shade_quads() call */
+   const int xleft0 = setup->span.left[0];
+   const int xleft1 = setup->span.left[1];
+   const int xright0 = setup->span.right[0];
+   const int xright1 = setup->span.right[1];
+
+
+   int minleft = block_x(MIN2(xleft0, xleft1));
+   int maxright = MAX2(xright0, xright1);
+   int x;
+
+   for (x = minleft; x < maxright; x += step) {
+      unsigned skip_left0 = CLAMP(xleft0 - x, 0, step);         /* row 0: pixels before the span start */
+      unsigned skip_left1 = CLAMP(xleft1 - x, 0, step);         /* row 1: pixels before the span start */
+      unsigned skip_right0 = CLAMP(x + step - xright0, 0, step); /* row 0: pixels past the span end */
+      unsigned skip_right1 = CLAMP(x + step - xright1, 0, step); /* row 1: pixels past the span end */
+      unsigned lx = x;
+      const unsigned nr_quads = TILE_VECTOR_HEIGHT*TILE_VECTOR_WIDTH/QUAD_SIZE;
+      unsigned q = 0;
+
+      unsigned skipmask_left0 = (1U << skip_left0) - 1U;   /* low skip_left bits set */
+      unsigned skipmask_left1 = (1U << skip_left1) - 1U;
+
+      /* These calculations fail when step == 32 and skip_right == 0
+       * (the shift count would then reach the width of a 32-bit unsigned). */
+      unsigned skipmask_right0 = ~0U << (unsigned)(step - skip_right0);
+      unsigned skipmask_right1 = ~0U << (unsigned)(step - skip_right1);
+
+      unsigned mask0 = ~skipmask_left0 & ~skipmask_right0;  /* bit n set: pixel x+n of row 0 is covered */
+      unsigned mask1 = ~skipmask_left1 & ~skipmask_right1;  /* same for row 1 */
+
+      if (mask0 | mask1) {
+         for(q = 0; q < nr_quads; ++q) {
+            unsigned quadmask = (mask0 & 3) | ((mask1 & 3) << 2);  /* bits 0-1: row 0, bits 2-3: row 1 */
+            setup->quad[q].input.x0 = lx;
+            setup->quad[q].input.y0 = setup->span.y;
+            setup->quad[q].inout.mask = quadmask;
+            setup->quad_ptrs[q] = &setup->quad[q];
+            mask0 >>= 2;
+            mask1 >>= 2;
+            lx += 2;
+         }
+         assert(!(mask0 | mask1));   /* every coverage bit must have been consumed */
+
+         shade_quads(setup->llvmpipe, setup->quad_ptrs, nr_quads );
+      }
+   }
+
+   /* reset both rows to empty (left > right) for the next span */
+   setup->span.y = 0;
+   setup->span.right[0] = 0;
+   setup->span.right[1] = 0;
+   setup->span.left[0] = 1000000;     /* greater than right[0] */
+   setup->span.left[1] = 1000000;     /* greater than right[1] */
+}
+
+
+#if DEBUG_VERTS
+static void print_vertex(const struct setup_context *setup,
+                         const float (*v)[4])
+{
+   int attr;   /* attribute slot being dumped */
+   debug_printf("   Vertex: (%p)\n", v);
+   for (attr = 0; attr < setup->quad[0].nr_attrs; attr++) {
+      const float *a = v[attr];
+      debug_printf("     %d: %f %f %f %f\n",  attr, a[0], a[1], a[2], a[3]);
+      if (util_is_inf_or_nan(a[0])) {
+         debug_printf("   NaN!\n");
+      }
+   }
+}
+#endif
+
+/**
+ * Sort the vertices by window Y into vmin/vmid/vmax, set up the triangle
+ * edge deltas (ebot, emaj, etop), and compute 1/area and the facing flag.
+ * \return FALSE if 1/area is inf or nan (cull the tri), TRUE otherwise
+ */
+static boolean setup_sort_vertices( struct setup_context *setup,
+                                    float det,
+                                    const float (*v0)[4],
+                                    const float (*v1)[4],
+                                    const float (*v2)[4] )
+{
+   setup->vprovoke = v2;   /* the last vertex supplies constant (flat) attribute values */
+
+   /* determine bottom to top order of vertices */
+   {
+      float y0 = v0[0][1];
+      float y1 = v1[0][1];
+      float y2 = v2[0][1];
+      if (y0 <= y1) {
+	 if (y1 <= y2) {
+	    /* y0<=y1<=y2 */
+	    setup->vmin = v0;
+	    setup->vmid = v1;
+	    setup->vmax = v2;
+	 }
+	 else if (y2 <= y0) {
+	    /* y2<=y0<=y1 */
+	    setup->vmin = v2;
+	    setup->vmid = v0;
+	    setup->vmax = v1;
+	 }
+	 else {
+	    /* y0<=y2<=y1 */
+	    setup->vmin = v0;
+	    setup->vmid = v2;
+	    setup->vmax = v1;
+	 }
+      }
+      else {
+	 if (y0 <= y2) {
+	    /* y1<=y0<=y2 */
+	    setup->vmin = v1;
+	    setup->vmid = v0;
+	    setup->vmax = v2;
+	 }
+	 else if (y2 <= y1) {
+	    /* y2<=y1<=y0 */
+	    setup->vmin = v2;
+	    setup->vmid = v1;
+	    setup->vmax = v0;
+	 }
+	 else {
+	    /* y1<=y2<=y0 */
+	    setup->vmin = v1;
+	    setup->vmid = v2;
+	    setup->vmax = v0;
+	 }
+      }
+   }
+
+   setup->ebot.dx = setup->vmid[0][0] - setup->vmin[0][0];   /* ebot: vmin -> vmid */
+   setup->ebot.dy = setup->vmid[0][1] - setup->vmin[0][1];
+   setup->emaj.dx = setup->vmax[0][0] - setup->vmin[0][0];   /* emaj: vmin -> vmax */
+   setup->emaj.dy = setup->vmax[0][1] - setup->vmin[0][1];
+   setup->etop.dx = setup->vmax[0][0] - setup->vmid[0][0];   /* etop: vmid -> vmax */
+   setup->etop.dy = setup->vmax[0][1] - setup->vmid[0][1];
+
+   /*
+    * Compute triangle's area.  Use 1/area to compute partial
+    * derivatives of attributes later.
+    *
+    * The area will be the same as prim->det, but the sign may be
+    * different depending on how the vertices get sorted above.
+    *
+    * To determine whether the primitive is front or back facing we
+    * use the prim->det value because its sign is correct.
+    */
+   {
+      const float area = (setup->emaj.dx * setup->ebot.dy -
+			    setup->ebot.dx * setup->emaj.dy);
+
+      setup->oneoverarea = 1.0f / area;
+
+      /*
+      debug_printf("%s one-over-area %f  area %f  det %f\n",
+                   __FUNCTION__, setup->oneoverarea, area, det );
+      */
+      if (util_is_inf_or_nan(setup->oneoverarea))
+         return FALSE;
+   }
+
+   /* We need to know if this is a front or back-facing triangle for:
+    *  - the GLSL gl_FrontFacing fragment attribute (bool)
+    *  - two-sided stencil test
+    */
+   setup->facing = 
+      ((det > 0.0) ^ 
+       (setup->llvmpipe->rasterizer->front_winding == PIPE_WINDING_CW));
+
+   return TRUE;
+}
+
+
+/**
+ * Compute the plane equation (a0, dadx, dady) of one component of the
+ * triangle's position attribute and store it in coefficient slot 0.
+ * \param vertSlot  vertex attribute slot holding the position
+ * \param i  which component of the slot (0..3)
+ */
+static void tri_pos_coeff( struct setup_context *setup,
+                           uint vertSlot, unsigned i)
+{
+   const float (*vmin)[4] = setup->vmin;
+   const float (*vmid)[4] = setup->vmid;
+   const float (*vmax)[4] = setup->vmax;
+
+   /* attribute deltas along the bottom (vmin->vmid) and major (vmin->vmax) edges */
+   float bot_delta = vmid[vertSlot][i] - vmin[vertSlot][i];
+   float maj_delta = vmax[vertSlot][i] - vmin[vertSlot][i];
+
+   /* numerators of the two partial derivatives */
+   float num_x = setup->ebot.dy * maj_delta - bot_delta * setup->emaj.dy;
+   float num_y = setup->emaj.dx * bot_delta - maj_delta * setup->ebot.dx;
+   /* scale by 1/area to get the screen-space gradients */
+   float slope_x = num_x * setup->oneoverarea;
+   float slope_y = num_y * setup->oneoverarea;
+
+   assert(i <= 3);
+
+   /* position coefficients live in slot 0 */
+   setup->coef.dadx[0][i] = slope_x;
+   setup->coef.dady[0][i] = slope_y;
+
+   /* a0 is the value the attribute would have at fragment (0,0).  Fragments
+    * are sampled at their centers, hence the 0.5 offsets below.
+    *
+    * This formulation is compact, but it behaves poorly for triangles
+    * with very large dadx or dady: a very large number is subtracted
+    * here and added back during interpolation, so a0 ends up losing
+    * many of its fractional bits.  Anchoring a0 at a pixel center close
+    * to vmin instead would avoid that; this has not been done yet.
+    */
+   setup->coef.a0[0][i] = (vmin[vertSlot][i] -
+                           (slope_x * (vmin[0][0] - 0.5f) +
+                            slope_y * (vmin[0][1] - 0.5f)));
+}
+
+
+/**
+ * Set up one position component as a constant: zero slopes and a0 taken
+ * from the provoking vertex.
+ * The result is stored in coefficient slot 0.
+ * \param vertSlot  which vertex attribute slot to read
+ * \param i  which component of the slot (0..3)
+ */
+static void const_pos_coeff( struct setup_context *setup,
+                             uint vertSlot, unsigned i)
+{
+   /* the provoking vertex supplies the value for the whole primitive */
+   setup->coef.a0[0][i] = setup->vprovoke[vertSlot][i];
+
+   /* a constant has no screen-space gradient */
+   setup->coef.dadx[0][i] = 0.0f;
+   setup->coef.dady[0][i] = 0.0f;
+}
+
+
+/**
+ * Set up a constant-valued attribute (GL_FLAT shading): every channel gets
+ * zero slopes and a0 taken from the provoking vertex.
+ * The result is stored in coefficient slot 1 + attrib.
+ * \param attrib  which fragment shader input the coefficients are for
+ * \param vertSlot  which vertex attribute slot to read
+ */
+static void const_coeff( struct setup_context *setup,
+                         unsigned attrib,
+                         uint vertSlot)
+{
+   const unsigned dst = 1 + attrib;
+   const float *flat = setup->vprovoke[vertSlot];
+   unsigned chan;
+
+   for (chan = 0; chan < NUM_CHANNELS; ++chan) {
+      setup->coef.a0[dst][chan] = flat[chan];
+      setup->coef.dadx[dst][chan] = 0.0f;
+      setup->coef.dady[dst][chan] = 0.0f;
+   }
+}
+
+
+/**
+ * Compute the plane equation (a0, dadx, dady) of every channel of a
+ * linearly interpolated triangle attribute.
+ * \param attrib  which fragment shader input the coefficients are for
+ * \param vertSlot  which vertex attribute slot to read
+ */
+static void tri_linear_coeff( struct setup_context *setup,
+                              unsigned attrib,
+                              uint vertSlot)
+{
+   const float (*vmin)[4] = setup->vmin;
+   const float (*vmid)[4] = setup->vmid;
+   const float (*vmax)[4] = setup->vmax;
+   const unsigned dst = 1 + attrib;   /* slot 0 is used for position */
+   unsigned chan;
+
+   for (chan = 0; chan < NUM_CHANNELS; ++chan) {
+      /* attribute deltas along the bottom and major edges */
+      float bot_delta = vmid[vertSlot][chan] - vmin[vertSlot][chan];
+      float maj_delta = vmax[vertSlot][chan] - vmin[vertSlot][chan];
+
+      /* numerators of the two partial derivatives */
+      float num_x = setup->ebot.dy * maj_delta - bot_delta * setup->emaj.dy;
+      float num_y = setup->emaj.dx * bot_delta - maj_delta * setup->ebot.dx;
+      /* scale by 1/area to get the screen-space gradients */
+      float slope_x = num_x * setup->oneoverarea;
+      float slope_y = num_y * setup->oneoverarea;
+
+      assert(chan <= 3);
+
+      setup->coef.dadx[dst][chan] = slope_x;
+      setup->coef.dady[dst][chan] = slope_y;
+
+      /* a0 is the value the attribute would have at fragment (0,0).
+       * Fragments are sampled at their centers, hence the 0.5 offsets.
+       *
+       * This formulation is compact, but it behaves poorly for triangles
+       * with very large dadx or dady: a very large number is subtracted
+       * here and added back during interpolation, so a0 ends up losing
+       * many of its fractional bits.  Anchoring a0 at a pixel center
+       * close to vmin instead would avoid that; not done yet.
+       */
+      setup->coef.a0[dst][chan] = (vmin[vertSlot][chan] -
+                     (slope_x * (vmin[0][0] - 0.5f) +
+                      slope_y * (vmin[0][1] - 0.5f)));
+   }
+}
+
+
+/**
+ * Compute a0, dadx and dady for every channel of a perspective-corrected
+ * triangle attribute.
+ * Each vertex value is first multiplied by v[0][3] of that vertex (1/w);
+ * the plane coefficients (a0, dadx, dady) are derived from those products.
+ * When the value at a fragment is needed later, the interpolated result
+ * is divided by the W interpolated at that fragment.
+ */
+static void tri_persp_coeff( struct setup_context *setup,
+                             unsigned attrib,
+                             uint vertSlot)
+{
+   const float (*vmin)[4] = setup->vmin;
+   const float (*vmid)[4] = setup->vmid;
+   const float (*vmax)[4] = setup->vmax;
+   const unsigned dst = 1 + attrib;
+   unsigned chan;
+
+   for (chan = 0; chan < NUM_CHANNELS; ++chan) {
+      /* premultiply by 1/w  (v[0][3] is always W): */
+      float lo = vmin[vertSlot][chan] * vmin[0][3];
+      float mid = vmid[vertSlot][chan] * vmid[0][3];
+      float hi = vmax[vertSlot][chan] * vmax[0][3];
+      /* deltas along the bottom and major edges */
+      float bot_delta = mid - lo;
+      float maj_delta = hi - lo;
+      /* numerators of the partial derivatives, then scaled by 1/area */
+      float num_x = setup->ebot.dy * maj_delta - bot_delta * setup->emaj.dy;
+      float num_y = setup->emaj.dx * bot_delta - maj_delta * setup->ebot.dx;
+      float slope_x = num_x * setup->oneoverarea;
+      float slope_y = num_y * setup->oneoverarea;
+
+      assert(chan <= 3);
+
+      setup->coef.dadx[dst][chan] = slope_x;
+      setup->coef.dady[dst][chan] = slope_y;
+      /* a0 = value at fragment (0,0), sampled at its center (0.5, 0.5) */
+      setup->coef.a0[dst][chan] = (lo -
+                     (slope_x * (vmin[0][0] - 0.5f) +
+                      slope_y * (vmin[0][1] - 0.5f)));
+   }
+}
+
+
+/**
+ * Special coefficient setup for gl_FragCoord.
+ * X and Y simply track the window position (no Y inversion is done here).
+ * Z and W are copied from the position coefficients in slot 0, which must
+ * have been computed already.
+ */
+static void
+setup_fragcoord_coeff(struct setup_context *setup, uint slot)
+{
+   const uint dst = 1 + slot;
+   uint chan;
+
+   /* X, then Y: zero at the origin, unit slope along their own axis */
+   setup->coef.a0[dst][0] = 0.0f;
+   setup->coef.dadx[dst][0] = 1.0f;
+   setup->coef.dady[dst][0] = 0.0f;
+   setup->coef.a0[dst][1] = 0.0f;
+   setup->coef.dadx[dst][1] = 0.0f;
+   setup->coef.dady[dst][1] = 1.0f;
+   /* Z and W: reuse the position coefficients */
+   for (chan = 2; chan < 4; ++chan) {
+      setup->coef.a0[dst][chan] = setup->coef.a0[0][chan];
+      setup->coef.dadx[dst][chan] = setup->coef.dadx[0][chan];
+      setup->coef.dady[dst][chan] = setup->coef.dady[0][chan];
+   }
+}
+
+
+
+/**
+ * Fill in setup->coef (a0, dadx, dady) for position and every fragment shader input.
+ * Must be called after setup->vmin,vmid,vmax,vprovoke are initialized.
+ */
+static void setup_tri_coefficients( struct setup_context *setup )
+{
+   struct llvmpipe_context *lp = setup->llvmpipe;
+   const struct lp_fragment_shader *shader = lp->fs;
+   const struct vertex_info *vinfo = llvmpipe_get_vertex_info(lp);
+   uint slot;
+
+   /* position: only z and w are set up, both interpolated linearly
+    */
+   tri_pos_coeff(setup, 0, 2);
+   tri_pos_coeff(setup, 0, 3);
+
+   /* one set of coefficients per fragment shader input:
+    */
+   for (slot = 0; slot < shader->info.num_inputs; slot++) {
+      const uint src = vinfo->attrib[slot].src_index;
+
+      switch (vinfo->attrib[slot].interp_mode) {
+      case INTERP_CONSTANT:
+         const_coeff(setup, slot, src);
+         break;
+      case INTERP_LINEAR:
+         tri_linear_coeff(setup, slot, src);
+         break;
+      case INTERP_PERSPECTIVE:
+         tri_persp_coeff(setup, slot, src);
+         break;
+      case INTERP_POS:
+         setup_fragcoord_coeff(setup, slot);
+         break;
+      default:
+         assert(0);
+      }
+      /* the FACE input overrides channel 0 with a constant derived from facing */
+      if (TGSI_SEMANTIC_FACE == shader->info.input_semantic_name[slot]) {
+         setup->coef.a0[1 + slot][0] = 1.0f - setup->facing;
+         setup->coef.dadx[1 + slot][0] = 0.0;
+         setup->coef.dady[1 + slot][0] = 0.0;
+      }
+   }
+}
+
+
+/** For each edge compute sy (first sample row), lines (row count), dxdy (slope) and sx (X on the first row). */
+static void setup_tri_edges( struct setup_context *setup )
+{
+   float min_x = setup->vmin[0][0] + 0.5f;
+   float mid_x = setup->vmid[0][0] + 0.5f;
+
+   float min_y = setup->vmin[0][1] - 0.5f;
+   float mid_y = setup->vmid[0][1] - 0.5f;
+   float max_y = setup->vmax[0][1] - 0.5f;
+
+   setup->ebot.sy = ceilf(min_y);
+   setup->ebot.lines = (int) ceilf(mid_y - setup->ebot.sy);
+   setup->ebot.dxdy = setup->ebot.dx / setup->ebot.dy;
+   setup->ebot.sx = min_x + (setup->ebot.sy - min_y) * setup->ebot.dxdy;
+
+   setup->etop.sy = ceilf(mid_y);
+   setup->etop.lines = (int) ceilf(max_y - setup->etop.sy);
+   setup->etop.dxdy = setup->etop.dx / setup->etop.dy;
+   setup->etop.sx = mid_x + (setup->etop.sy - mid_y) * setup->etop.dxdy;
+
+   setup->emaj.sy = ceilf(min_y);
+   setup->emaj.lines = (int) ceilf(max_y - setup->emaj.sy);
+   setup->emaj.dxdy = setup->emaj.dx / setup->emaj.dy;
+   setup->emaj.sx = min_x + (setup->emaj.sy - min_y) * setup->emaj.dxdy;
+}
+
+
+/**
+ * Render the upper or lower half of a triangle, one scanline at a time.
+ * Scissoring/cliprect is applied here too.
+ */
+static void subtriangle( struct setup_context *setup,
+			 struct edge *eleft,
+			 struct edge *eright,
+			 unsigned lines )
+{
+   const struct pipe_scissor_state *cliprect = &setup->llvmpipe->cliprect;
+   const int minx = (int) cliprect->minx;
+   const int maxx = (int) cliprect->maxx;
+   const int miny = (int) cliprect->miny;
+   const int maxy = (int) cliprect->maxy;
+   int y, start_y, finish_y;
+   int sy = (int)eleft->sy;   /* first scanline of this half */
+
+   assert((int)eleft->sy == (int) eright->sy);
+
+   /* clip top/bottom */
+   start_y = sy;
+   if (start_y < miny)
+      start_y = miny;
+
+   finish_y = sy + lines;
+   if (finish_y > maxy)
+      finish_y = maxy;
+
+   start_y -= sy;    /* make both bounds relative to sy */
+   finish_y -= sy;
+
+   /*
+   debug_printf("%s %d %d\n", __FUNCTION__, start_y, finish_y);
+   */
+
+   for (y = start_y; y < finish_y; y++) {
+
+      /* avoid accumulating adds as floats don't have the precision to
+       * accurately iterate large triangle edges that way.  luckily we
+       * can just multiply these days.
+       *
+       * this is all drowned out by the attribute interpolation anyway.
+       */
+      int left = (int)(eleft->sx + y * eleft->dxdy);
+      int right = (int)(eright->sx + y * eright->dxdy);
+
+      /* clip left/right */
+      if (left < minx)
+         left = minx;
+      if (right > maxx)
+         right = maxx;
+
+      if (left < right) {
+         int _y = sy + y;   /* absolute scanline */
+         if (block(_y) != setup->span.y) {   /* entering a new pair of rows: emit the buffered one */
+            flush_spans(setup);
+            setup->span.y = block(_y);
+         }
+
+         setup->span.left[_y&1] = left;    /* row within the pair is selected by the low bit of Y */
+         setup->span.right[_y&1] = right;
+      }
+   }
+
+
+   /* save the values so that emaj can be restarted:
+    */
+   eleft->sx += lines * eleft->dxdy;
+   eright->sx += lines * eright->dxdy;
+   eleft->sy += lines;
+   eright->sy += lines;
+}
+
+
+/**
+ * Recompute the primitive's determinant (twice its signed area in window
+ * space).  The vbuf_render interface does not hand it to us, so it has
+ * to be derived here from the vertex positions.
+ */
+static float
+calc_det( const float (*v0)[4],
+          const float (*v1)[4],
+          const float (*v2)[4] )
+{
+   /* both edge vectors are taken relative to v2: a = v0 - v2, b = v1 - v2 */
+   const float ax = v0[0][0] - v2[0][0];
+   const float ay = v0[0][1] - v2[0][1];
+   const float bx = v1[0][0] - v2[0][0];
+   const float by = v1[0][1] - v2[0][1];
+
+   /* z component of cross(a, b) */
+   return ax * by - ay * bx;
+}
+
+
+/**
+ * Do setup for triangle rasterization, then render the triangle.
+ */
+void llvmpipe_setup_tri( struct setup_context *setup,
+                const float (*v0)[4],
+                const float (*v1)[4],
+                const float (*v2)[4] )
+{
+   float det;
+
+#if DEBUG_VERTS
+   debug_printf("Setup triangle:\n");
+   print_vertex(setup, v0);
+   print_vertex(setup, v1);
+   print_vertex(setup, v2);
+#endif
+
+   if (setup->llvmpipe->no_rast)   /* rasterization is switched off: draw nothing */
+      return;
+   
+   det = calc_det(v0, v1, v2);
+   /*
+   debug_printf("%s\n", __FUNCTION__ );
+   */
+
+#if DEBUG_FRAGS
+   setup->numFragsEmitted = 0;
+   setup->numFragsWritten = 0;
+#endif
+
+   if (cull_tri( setup, det ))
+      return;
+
+   if (!setup_sort_vertices( setup, det, v0, v1, v2 ))   /* FALSE when 1/area is inf or nan */
+      return;
+   setup_tri_coefficients( setup );
+   setup_tri_edges( setup );
+
+   assert(setup->llvmpipe->reduced_prim == PIPE_PRIM_TRIANGLES);
+
+   setup->span.y = 0;
+   setup->span.right[0] = 0;
+   setup->span.right[1] = 0;
+   /*   setup->span.z_mode = tri_z_mode( setup->ctx ); */
+
+   /*   init_constant_attribs( setup ); */
+
+   if (setup->oneoverarea < 0.0) {
+      /* emaj on left:
+       */
+      subtriangle( setup, &setup->emaj, &setup->ebot, setup->ebot.lines );
+      subtriangle( setup, &setup->emaj, &setup->etop, setup->etop.lines );
+   }
+   else {
+      /* emaj on right:
+       */
+      subtriangle( setup, &setup->ebot, &setup->emaj, setup->ebot.lines );
+      subtriangle( setup, &setup->etop, &setup->emaj, setup->etop.lines );
+   }
+
+   flush_spans( setup );   /* emit the last buffered pair of rows */
+
+#if DEBUG_FRAGS
+   printf("Tri: %u frags emitted, %u written\n",
+          setup->numFragsEmitted,
+          setup->numFragsWritten);
+#endif
+}
+
+
+
+/**
+ * Line version of the position setup: compute a0, dadx and dady of one
+ * position component and store them in coefficient slot 0.
+ */
+static void
+linear_pos_coeff(struct setup_context *setup,
+                 uint vertSlot, uint i)
+{
+   const float delta = setup->vmax[vertSlot][i] - setup->vmin[vertSlot][i];
+   const float slope_x = delta * setup->emaj.dx * setup->oneoverarea;
+   const float slope_y = delta * setup->emaj.dy * setup->oneoverarea;
+   setup->coef.a0[0][i] = (setup->vmin[vertSlot][i] -
+                           (slope_x * (setup->vmin[0][0] - 0.5f) +
+                            slope_y * (setup->vmin[0][1] - 0.5f)));
+   setup->coef.dadx[0][i] = slope_x;
+   setup->coef.dady[0][i] = slope_y;
+}
+
+
+/**
+ * Compute a0, dadx and dady of every channel of a linearly interpolated line attribute.
+ */
+static void
+line_linear_coeff(struct setup_context *setup,
+                  unsigned attrib,
+                  uint vertSlot)
+{
+   const unsigned dst = 1 + attrib;
+   unsigned chan;
+   for (chan = 0; chan < NUM_CHANNELS; ++chan) {
+      const float delta = setup->vmax[vertSlot][chan] - setup->vmin[vertSlot][chan];
+      const float slope_x = delta * setup->emaj.dx * setup->oneoverarea;
+      const float slope_y = delta * setup->emaj.dy * setup->oneoverarea;
+      setup->coef.a0[dst][chan] = (setup->vmin[vertSlot][chan] -
+                     (slope_x * (setup->vmin[0][0] - 0.5f) +
+                      slope_y * (setup->vmin[0][1] - 0.5f)));
+      setup->coef.dadx[dst][chan] = slope_x;
+      setup->coef.dady[dst][chan] = slope_y;
+   }
+}
+
+
+/**
+ * Compute a0, dadx and dady for a perspective-corrected interpolant,
+ * for a line.  Values are premultiplied by v[0][3] (1/w), as for triangles.
+ */
+static void
+line_persp_coeff(struct setup_context *setup,
+                 unsigned attrib,
+                 uint vertSlot)
+{
+   unsigned i;
+   for (i = 0; i < NUM_CHANNELS; ++i) {
+      /* premultiply both end points by 1/w (v[0][3]) */
+      const float a0 = setup->vmin[vertSlot][i] * setup->vmin[0][3];
+      const float a1 = setup->vmax[vertSlot][i] * setup->vmax[0][3];
+      const float da = a1 - a0;
+      const float dadx = da * setup->emaj.dx * setup->oneoverarea;
+      const float dady = da * setup->emaj.dy * setup->oneoverarea;
+      setup->coef.dadx[1 + attrib][i] = dadx;
+      setup->coef.dady[1 + attrib][i] = dady;
+      setup->coef.a0[1 + attrib][i] = (a0 -   /* premultiplied start value, matching the slopes */
+                     (dadx * (setup->vmin[0][0] - 0.5f) +
+                      dady * (setup->vmin[0][1] - 0.5f)));
+   }
+}
+
+
+/**
+ * Compute the setup->coef[] array dadx, dady, a0 values for a line.
+ * Sets setup->vmin, vmax and vprovoke itself; \return FALSE for a zero-length or inf/nan line.
+ */
+static INLINE boolean
+setup_line_coefficients(struct setup_context *setup,
+                        const float (*v0)[4],
+                        const float (*v1)[4])
+{
+   struct llvmpipe_context *llvmpipe = setup->llvmpipe;
+   const struct lp_fragment_shader *lpfs = llvmpipe->fs;
+   const struct vertex_info *vinfo = llvmpipe_get_vertex_info(llvmpipe);
+   uint fragSlot;
+   float area;
+
+   /* use setup->vmin, vmax to point to vertices */
+   if (llvmpipe->rasterizer->flatshade_first)
+      setup->vprovoke = v0;
+   else
+      setup->vprovoke = v1;
+   setup->vmin = v0;
+   setup->vmax = v1;
+
+   setup->emaj.dx = setup->vmax[0][0] - setup->vmin[0][0];
+   setup->emaj.dy = setup->vmax[0][1] - setup->vmin[0][1];
+
+   /* NOTE: this is not really area but something proportional to it */
+   area = setup->emaj.dx * setup->emaj.dx + setup->emaj.dy * setup->emaj.dy;
+   if (area == 0.0f || util_is_inf_or_nan(area))
+      return FALSE;
+   setup->oneoverarea = 1.0f / area;
+
+   /* z and w are done by linear interpolation:
+    */
+   linear_pos_coeff(setup, 0, 2);
+   linear_pos_coeff(setup, 0, 3);
+
+   /* setup interpolation for all the remaining attributes:
+    */
+   for (fragSlot = 0; fragSlot < lpfs->info.num_inputs; fragSlot++) {
+      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
+
+      switch (vinfo->attrib[fragSlot].interp_mode) {
+      case INTERP_CONSTANT:
+         const_coeff(setup, fragSlot, vertSlot);
+         break;
+      case INTERP_LINEAR:
+         line_linear_coeff(setup, fragSlot, vertSlot);
+         break;
+      case INTERP_PERSPECTIVE:
+         line_persp_coeff(setup, fragSlot, vertSlot);
+         break;
+      case INTERP_POS:
+         setup_fragcoord_coeff(setup, fragSlot);
+         break;
+      default:
+         assert(0);
+      }
+      /* NOTE(review): setup->facing is not written in this function -- confirm what value lines should see */
+      if (lpfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
+         setup->coef.a0[1 + fragSlot][0] = 1.0f - setup->facing;
+         setup->coef.dadx[1 + fragSlot][0] = 0.0;
+         setup->coef.dady[1 + fragSlot][0] = 0.0;
+      }
+   }
+   return TRUE;
+}
+
+
+/**
+ * Add pixel (x, y) of a line to the 2x2 quad being accumulated in
+ * setup->quad[0], emitting the previous quad first when the pixel
+ * falls into a different one.
+ */
+static INLINE void
+plot(struct setup_context *setup, int x, int y)
+{
+   const int subX = x & 1;              /* column inside the quad */
+   const int subY = y & 1;              /* row inside the quad */
+   const int originX = x - subX;
+   const int originY = y - subY;
+   const boolean sameQuad = (originX == setup->quad[0].input.x0 &&
+                             originY == setup->quad[0].input.y0);
+
+   if (!sameQuad) {
+      /* emit what was accumulated, unless x0 is still the -1 marker */
+      if (setup->quad[0].input.x0 != -1)
+         clip_emit_quad( setup, &setup->quad[0] );
+
+      setup->quad[0].inout.mask = 0x0;
+      setup->quad[0].input.x0 = originX;
+      setup->quad[0].input.y0 = originY;
+   }
+
+   setup->quad[0].inout.mask |= 1 << (subX + 2 * subY);   /* this pixel's bit */
+}
+
+
+/**
+ * Do setup for line rasterization, then render the line from v0 to v1.
+ * Single-pixel width, no stipple, etc.  We rely on the 'draw' module
+ * to handle stippling and wide lines.  The end pixel is not plotted.
+ */
+void
+llvmpipe_setup_line(struct setup_context *setup,
+           const float (*v0)[4],
+           const float (*v1)[4])
+{
+   int x0 = (int) v0[0][0];   /* position x/y, truncated toward zero */
+   int x1 = (int) v1[0][0];
+   int y0 = (int) v0[0][1];
+   int y1 = (int) v1[0][1];
+   int dx = x1 - x0;
+   int dy = y1 - y0;
+   int xstep, ystep;
+
+#if DEBUG_VERTS
+   debug_printf("Setup line:\n");
+   print_vertex(setup, v0);
+   print_vertex(setup, v1);
+#endif
+
+   if (setup->llvmpipe->no_rast)
+      return;
+
+   if (dx == 0 && dy == 0)   /* both endpoints truncate to the same pixel */
+      return;
+
+   if (!setup_line_coefficients(setup, v0, v1))   /* zero or non-finite length */
+      return;
+
+   /* NOTE(review): these range checks run after the float -> int casts above */
+   assert(v0[0][0] < 1.0e9);
+   assert(v0[0][1] < 1.0e9);
+   assert(v1[0][0] < 1.0e9);
+   assert(v1[0][1] < 1.0e9);
+
+   if (dx < 0) {
+      dx = -dx;   /* make positive */
+      xstep = -1;
+   }
+   else {
+      xstep = 1;
+   }
+
+   if (dy < 0) {
+      dy = -dy;   /* make positive */
+      ystep = -1;
+   }
+   else {
+      ystep = 1;
+   }
+
+   assert(dx >= 0);
+   assert(dy >= 0);
+   assert(setup->llvmpipe->reduced_prim == PIPE_PRIM_LINES);
+
+   setup->quad[0].input.x0 = setup->quad[0].input.y0 = -1;   /* -1: no quad started yet, see plot() */
+   setup->quad[0].inout.mask = 0x0;
+
+   /* XXX temporary: set coverage to 1.0 so the line appears
+    * if AA mode happens to be enabled.
+    */
+   setup->quad[0].input.coverage[0] =
+   setup->quad[0].input.coverage[1] =
+   setup->quad[0].input.coverage[2] =
+   setup->quad[0].input.coverage[3] = 1.0;
+
+   if (dx > dy) {
+      /*** X-major line: x steps every iteration, y only when error >= 0 ***/
+      int i;
+      const int errorInc = dy + dy;
+      int error = errorInc - dx;
+      const int errorDec = error - dx;   /* 2*dy - 2*dx, negative since dx > dy */
+
+      for (i = 0; i < dx; i++) {   /* dx pixels; the column of x1 is never plotted */
+         plot(setup, x0, y0);
+
+         x0 += xstep;
+         if (error < 0) {
+            error += errorInc;
+         }
+         else {
+            error += errorDec;
+            y0 += ystep;
+         }
+      }
+   }
+   else {
+      /*** Y-major line: y steps every iteration, x only when error >= 0 ***/
+      int i;
+      const int errorInc = dx + dx;
+      int error = errorInc - dy;
+      const int errorDec = error - dy;   /* 2*dx - 2*dy, never positive since dx <= dy */
+
+      for (i = 0; i < dy; i++) {   /* dy pixels; the row of y1 is never plotted */
+         plot(setup, x0, y0);
+
+         y0 += ystep;
+         if (error < 0) {
+            error += errorInc;
+         }
+         else {
+            error += errorDec;
+            x0 += xstep;
+         }
+      }
+   }
+
+   /* draw final quad: plot() only emits a quad when the next one starts */
+   if (setup->quad[0].inout.mask) {
+      clip_emit_quad( setup, &setup->quad[0] );
+   }
+}
+
+
+/* Point version of perspective setup: a0 = attrib * vert[0][3], zero slope. */
+static void
+point_persp_coeff(struct setup_context *setup,
+                  const float (*vert)[4],
+                  unsigned attrib, uint vertSlot)
+{
+   const unsigned slot = 1 + attrib;
+   unsigned chan;
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      setup->coef.a0[slot][chan] = vert[vertSlot][chan] * vert[0][3];
+      setup->coef.dadx[slot][chan] = setup->coef.dady[slot][chan] = 0.0F;
+   }
+}
+
+
+/**
+ * Do setup for point rasterization, then render the point.
+ * Round (rasterizer point_smooth set) or square points...
+ * XXX could optimize a lot for 1-pixel points.
+ */
+void
+llvmpipe_setup_point( struct setup_context *setup,
+             const float (*v0)[4] )
+{
+   struct llvmpipe_context *llvmpipe = setup->llvmpipe;
+   const struct lp_fragment_shader *lpfs = llvmpipe->fs;
+   const int sizeAttr = setup->llvmpipe->psize_slot;
+   const float size
+      = sizeAttr > 0 ? v0[sizeAttr][0]   /* per-vertex size read from slot psize_slot */
+      : setup->llvmpipe->rasterizer->point_size;   /* otherwise the rasterizer's size */
+   const float halfSize = 0.5F * size;
+   const boolean round = (boolean) setup->llvmpipe->rasterizer->point_smooth;
+   const float x = v0[0][0];  /* Note: data[0] is always position */
+   const float y = v0[0][1];
+   const struct vertex_info *vinfo = llvmpipe_get_vertex_info(llvmpipe);
+   uint fragSlot;
+
+#if DEBUG_VERTS
+   debug_printf("Setup point:\n");
+   print_vertex(setup, v0);
+#endif
+
+   if (llvmpipe->no_rast)
+      return;
+
+   assert(setup->llvmpipe->reduced_prim == PIPE_PRIM_POINTS);
+
+   /* For points, all interpolants are constant-valued.
+    * However, for point sprites, we'll need to setup texcoords appropriately.
+    * XXX: which coefficients are the texcoords???
+    * We may do point sprites as textured quads...
+    *
+    * KW: We don't know which coefficients are texcoords - ultimately
+    * the choice of what interpolation mode to use for each attribute
+    * should be determined by the fragment program, using
+    * per-attribute declaration statements that include interpolation
+    * mode as a parameter.  So either the fragment program will have
+    * to be adjusted for pointsprite vs normal point behaviour, or
+    * otherwise a special interpolation mode will have to be defined
+    * which matches the required behaviour for point sprites.  But -
+    * the latter is not a feature of normal hardware, and as such
+    * probably should be ruled out on that basis.
+    */
+   setup->vprovoke = v0;
+
+   /* setup Z, W */
+   const_pos_coeff(setup, 0, 2);
+   const_pos_coeff(setup, 0, 3);
+
+   for (fragSlot = 0; fragSlot < lpfs->info.num_inputs; fragSlot++) {
+      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
+
+      switch (vinfo->attrib[fragSlot].interp_mode) {
+      case INTERP_CONSTANT:
+         /* fall-through */
+      case INTERP_LINEAR:
+         const_coeff(setup, fragSlot, vertSlot);   /* linear inputs are treated as constant for a point */
+         break;
+      case INTERP_PERSPECTIVE:
+         point_persp_coeff(setup, setup->vprovoke, fragSlot, vertSlot);
+         break;
+      case INTERP_POS:
+         setup_fragcoord_coeff(setup, fragSlot);
+         break;
+      default:
+         assert(0);
+      }
+
+      if (lpfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
+         setup->coef.a0[1 + fragSlot][0] = 1.0f - setup->facing;
+         setup->coef.dadx[1 + fragSlot][0] = 0.0;
+         setup->coef.dady[1 + fragSlot][0] = 0.0;
+      }
+   }
+
+   /* Emit the quads: single pixel, round (smooth) or square. */
+   if (halfSize <= 0.5 && !round) {
+      /* special case for 1-pixel points */
+      const int ix = ((int) x) & 1;
+      const int iy = ((int) y) & 1;
+      setup->quad[0].input.x0 = (int) x - ix;
+      setup->quad[0].input.y0 = (int) y - iy;
+      setup->quad[0].inout.mask = (1 << ix) << (2 * iy);
+      clip_emit_quad( setup, &setup->quad[0] );
+   }
+   else {
+      if (round) {
+         /* rounded points */
+         const int ixmin = block((int) (x - halfSize));   /* block(): helper not shown here; presumably snaps to a quad origin -- confirm */
+         const int ixmax = block((int) (x + halfSize));
+         const int iymin = block((int) (y - halfSize));
+         const int iymax = block((int) (y + halfSize));
+         const float rmin = halfSize - 0.7071F;  /* 0.7071 = sqrt(2)/2 */
+         const float rmax = halfSize + 0.7071F;
+         const float rmin2 = MAX2(0.0F, rmin * rmin);   /* NOTE(review): a square is never negative, so MAX2 has no effect; clamping rmin before squaring may have been intended */
+         const float rmax2 = rmax * rmax;
+         const float cscale = 1.0F / (rmax2 - rmin2);
+         int ix, iy;
+
+         for (iy = iymin; iy <= iymax; iy += 2) {
+            for (ix = ixmin; ix <= ixmax; ix += 2) {
+               float dx, dy, dist2, cover;
+
+               setup->quad[0].inout.mask = 0x0;
+               /* test the four pixel centers of the quad whose origin is (ix, iy) */
+               dx = (ix + 0.5f) - x;
+               dy = (iy + 0.5f) - y;
+               dist2 = dx * dx + dy * dy;
+               if (dist2 <= rmax2) {
+                  cover = 1.0F - (dist2 - rmin2) * cscale;   /* 1 at rmin2, falling to 0 at rmax2 */
+                  setup->quad[0].input.coverage[QUAD_TOP_LEFT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_TOP_LEFT;
+               }
+
+               dx = (ix + 1.5f) - x;
+               dy = (iy + 0.5f) - y;
+               dist2 = dx * dx + dy * dy;
+               if (dist2 <= rmax2) {
+                  cover = 1.0F - (dist2 - rmin2) * cscale;
+                  setup->quad[0].input.coverage[QUAD_TOP_RIGHT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_TOP_RIGHT;
+               }
+
+               dx = (ix + 0.5f) - x;
+               dy = (iy + 1.5f) - y;
+               dist2 = dx * dx + dy * dy;
+               if (dist2 <= rmax2) {
+                  cover = 1.0F - (dist2 - rmin2) * cscale;
+                  setup->quad[0].input.coverage[QUAD_BOTTOM_LEFT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_BOTTOM_LEFT;
+               }
+
+               dx = (ix + 1.5f) - x;
+               dy = (iy + 1.5f) - y;
+               dist2 = dx * dx + dy * dy;
+               if (dist2 <= rmax2) {
+                  cover = 1.0F - (dist2 - rmin2) * cscale;
+                  setup->quad[0].input.coverage[QUAD_BOTTOM_RIGHT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_BOTTOM_RIGHT;
+               }
+
+               if (setup->quad[0].inout.mask) {   /* quads with no covered pixel are not emitted */
+                  setup->quad[0].input.x0 = ix;
+                  setup->quad[0].input.y0 = iy;
+                  clip_emit_quad( setup, &setup->quad[0] );
+               }
+            }
+         }
+      }
+      else {
+         /* square points */
+         const int xmin = (int) (x + 0.75 - halfSize);
+         const int ymin = (int) (y + 0.25 - halfSize);
+         const int xmax = xmin + (int) size;
+         const int ymax = ymin + (int) size;
+         /* XXX could apply scissor to xmin,ymin,xmax,ymax now */
+         const int ixmin = block(xmin);
+         const int ixmax = block(xmax - 1);
+         const int iymin = block(ymin);
+         const int iymax = block(ymax - 1);
+         int ix, iy;
+
+         /*
+         debug_printf("(%f, %f) -> X:%d..%d Y:%d..%d\n", x, y, xmin, xmax,ymin,ymax);
+         */
+         for (iy = iymin; iy <= iymax; iy += 2) {
+            uint rowMask = 0xf;
+            if (iy < ymin) {
+               /* above the top edge: keep only the bottom row of the quad */
+               rowMask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
+            }
+            if (iy + 1 >= ymax) {
+               /* below the bottom edge: keep only the top row of the quad */
+               rowMask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
+            }
+
+            for (ix = ixmin; ix <= ixmax; ix += 2) {
+               uint mask = rowMask;
+
+               if (ix < xmin) {
+                  /* fragment is past left edge of point, turn off left bits */
+                  mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
+               }
+               if (ix + 1 >= xmax) {
+                  /* past the right edge: turn off right bits */
+                  mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
+               }
+
+               setup->quad[0].inout.mask = mask;
+               setup->quad[0].input.x0 = ix;
+               setup->quad[0].input.y0 = iy;
+               clip_emit_quad( setup, &setup->quad[0] );
+            }
+         }
+      }
+   }
+}
+
+/**
+ * Bring derived state up to date and pick who culls: setup itself when
+ * drawing triangles filled on both faces, otherwise the 'draw' module.
+ */
+void llvmpipe_setup_prepare( struct setup_context *setup )
+{
+   struct llvmpipe_context *lp = setup->llvmpipe;
+   boolean filledTris;
+
+   if (lp->dirty)
+      llvmpipe_update_derived(lp);
+
+   /* evaluated only after the derived state has been refreshed */
+   filledTris = (lp->reduced_api_prim == PIPE_PRIM_TRIANGLES &&
+                 lp->rasterizer->fill_cw == PIPE_POLYGON_MODE_FILL &&
+                 lp->rasterizer->fill_ccw == PIPE_POLYGON_MODE_FILL);
+
+   setup->winding = filledTris ? lp->rasterizer->cull_mode : PIPE_WINDING_NONE;
+}
+
+
+/* Free a context allocated by llvmpipe_setup_create_context() (align_malloc). */
+void llvmpipe_setup_destroy_context( struct setup_context *setup )
+{
+   align_free( setup );
+}
+
+
+/**
+ * Allocate a 16-byte aligned, zero-filled setup context bound to the
+ * given llvmpipe context.  Returns NULL if the allocation fails.
+ */
+struct setup_context *llvmpipe_setup_create_context( struct llvmpipe_context *llvmpipe )
+{
+   struct setup_context *ctx = align_malloc(sizeof *ctx, 16);
+   unsigned q;
+
+   if (ctx == NULL)
+      return NULL;
+
+   memset(ctx, 0, sizeof(struct setup_context));
+   ctx->llvmpipe = llvmpipe;
+
+   /* every quad refers to the context's one coefficient block */
+   for (q = 0; q < MAX_QUADS; ++q)
+      ctx->quad[q].coef = &ctx->coef;
+
+   /* left edges start greater than right[0] / right[1] */
+   ctx->span.left[1] = 1000000;
+   ctx->span.left[0] = 1000000;
+   return ctx;
+}
+
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.h b/src/gallium/drivers/llvmpipe/lp_setup.h
new file mode 100644
index 0000000000..89c43da046
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_setup.h
@@ -0,0 +1,53 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+#ifndef LP_SETUP_H
+#define LP_SETUP_H
+
+struct setup_context;
+struct llvmpipe_context;
+
+void 
+llvmpipe_setup_tri( struct setup_context *setup,
+	   const float (*v0)[4],
+	   const float (*v1)[4],
+	   const float (*v2)[4] );
+/* Rasterize a one-pixel-wide line from v0 to v1; v[0] holds the position. */
+void
+llvmpipe_setup_line(struct setup_context *setup,
+           const float (*v0)[4],
+           const float (*v1)[4]);
+/* Rasterize one point (round or square) located at v0[0]. */
+void
+llvmpipe_setup_point( struct setup_context *setup,
+             const float (*v0)[4] );
+/* Context lifetime: create allocates (NULL on failure) and destroy frees;
+ * prepare refreshes dirty derived state and selects who performs culling. */
+struct setup_context *llvmpipe_setup_create_context( struct llvmpipe_context *llvmpipe );
+void llvmpipe_setup_prepare( struct setup_context *setup );
+void llvmpipe_setup_destroy_context( struct setup_context *setup );
+
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_state.h b/src/gallium/drivers/llvmpipe/lp_state.h
new file mode 100644
index 0000000000..7b26ce61a3
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state.h
@@ -0,0 +1,230 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef LP_STATE_H
+#define LP_STATE_H
+
+#include <llvm-c/Core.h>
+
+#include "pipe/p_state.h"
+#include "tgsi/tgsi_scan.h"
+#include "lp_jit.h"
+#include "lp_bld_sample.h" /* for struct lp_sampler_static_state */
+
+/* Dirty bits: state setters OR these into the llvmpipe context's 'dirty' field. */
+#define LP_NEW_VIEWPORT      0x1
+#define LP_NEW_RASTERIZER    0x2
+#define LP_NEW_FS            0x4
+#define LP_NEW_BLEND         0x8
+#define LP_NEW_CLIP          0x10
+#define LP_NEW_SCISSOR       0x20
+#define LP_NEW_STIPPLE       0x40
+#define LP_NEW_FRAMEBUFFER   0x80
+#define LP_NEW_DEPTH_STENCIL_ALPHA 0x100
+#define LP_NEW_CONSTANTS     0x200
+#define LP_NEW_SAMPLER       0x400
+#define LP_NEW_TEXTURE       0x800
+#define LP_NEW_VERTEX        0x1000
+#define LP_NEW_VS            0x2000
+#define LP_NEW_QUERY         0x4000
+
+
+struct tgsi_sampler;
+struct vertex_info;
+struct pipe_context;
+struct llvmpipe_context;
+
+struct lp_fragment_shader;
+/* Stored by value as the 'key' member of lp_fragment_shader_variant.
+ * Presumably the state a variant is specialized on -- TODO confirm. */
+struct lp_fragment_shader_variant_key
+{
+   enum pipe_format zsbuf_format;
+   struct pipe_depth_state depth;
+   struct pipe_alpha_state alpha;
+   struct pipe_blend_state blend;
+
+   struct lp_sampler_static_state sampler[PIPE_MAX_SAMPLERS];
+};
+
+/* A fragment shader variant; variants are chained through 'next'. */
+struct lp_fragment_shader_variant
+{
+   struct lp_fragment_shader *shader;   /* presumably the owning shader -- TODO confirm */
+   /* key stored by value */
+   struct lp_fragment_shader_variant_key key;
+   /* LLVM value handle; presumably the generated function -- TODO confirm */
+   LLVMValueRef function;
+   /* presumably the callable compiled from 'function' -- TODO confirm */
+   lp_jit_frag_func jit_function;
+   /* next variant in the chain */
+   struct lp_fragment_shader_variant *next;
+};
+
+
+/**
+ * Subclass of pipe_shader_state (though it doesn't really need to be).
+ *
+ * This is starting to look an awful lot like a quad pipeline stage...
+ */
+struct lp_fragment_shader
+{
+   struct pipe_shader_state base;
+   /* scan results; setup reads info.num_inputs and info.input_semantic_name */
+   struct tgsi_shader_info info;
+   /* presumably the head of the chain linked through variant->next -- TODO confirm */
+   struct lp_fragment_shader_variant *variants;
+   /* presumably the variant currently selected for rendering -- TODO confirm */
+   struct lp_fragment_shader_variant *current;
+};
+
+
+/** Subclass of pipe_shader_state */
+struct lp_vertex_shader {
+   struct pipe_shader_state shader;        /* the base state, held by value */
+   struct draw_vertex_shader *draw_data;   /* presumably the draw module's shader object -- TODO confirm */
+};
+
+
+
+void *
+llvmpipe_create_blend_state(struct pipe_context *,
+                            const struct pipe_blend_state *);
+void llvmpipe_bind_blend_state(struct pipe_context *,
+                               void *);
+void llvmpipe_delete_blend_state(struct pipe_context *,
+                                 void *);
+
+void *
+llvmpipe_create_sampler_state(struct pipe_context *,
+                              const struct pipe_sampler_state *);
+void llvmpipe_bind_sampler_states(struct pipe_context *, unsigned, void **);
+void llvmpipe_delete_sampler_state(struct pipe_context *, void *);
+
+void *
+llvmpipe_create_depth_stencil_state(struct pipe_context *,
+                                    const struct pipe_depth_stencil_alpha_state *);
+void llvmpipe_bind_depth_stencil_state(struct pipe_context *, void *);
+void llvmpipe_delete_depth_stencil_state(struct pipe_context *, void *);
+
+void *
+llvmpipe_create_rasterizer_state(struct pipe_context *,
+                                 const struct pipe_rasterizer_state *);
+void llvmpipe_bind_rasterizer_state(struct pipe_context *, void *);
+void llvmpipe_delete_rasterizer_state(struct pipe_context *, void *);
+
+void llvmpipe_set_framebuffer_state( struct pipe_context *,
+			     const struct pipe_framebuffer_state * );
+
+void llvmpipe_set_blend_color( struct pipe_context *pipe,
+                               const struct pipe_blend_color *blend_color );
+
+void llvmpipe_set_clip_state( struct pipe_context *,
+			     const struct pipe_clip_state * );
+
+void llvmpipe_set_constant_buffer(struct pipe_context *,
+                                  uint shader, uint index,
+                                  const struct pipe_constant_buffer *buf);
+
+void *llvmpipe_create_fs_state(struct pipe_context *,
+                               const struct pipe_shader_state *);
+void llvmpipe_bind_fs_state(struct pipe_context *, void *);
+void llvmpipe_delete_fs_state(struct pipe_context *, void *);
+void *llvmpipe_create_vs_state(struct pipe_context *,
+                               const struct pipe_shader_state *);
+void llvmpipe_bind_vs_state(struct pipe_context *, void *);
+void llvmpipe_delete_vs_state(struct pipe_context *, void *);
+
+void llvmpipe_set_polygon_stipple( struct pipe_context *,
+				  const struct pipe_poly_stipple * );
+
+void llvmpipe_set_scissor_state( struct pipe_context *,
+                                 const struct pipe_scissor_state * );
+
+void llvmpipe_set_sampler_textures( struct pipe_context *,
+                                    unsigned num,
+                                    struct pipe_texture ** );
+
+void llvmpipe_set_viewport_state( struct pipe_context *,
+                                  const struct pipe_viewport_state * );
+
+void llvmpipe_set_vertex_elements(struct pipe_context *,
+                                  unsigned count,
+                                  const struct pipe_vertex_element *);
+
+void llvmpipe_set_vertex_buffers(struct pipe_context *,
+                                 unsigned count,
+                                 const struct pipe_vertex_buffer *);
+
+void llvmpipe_update_fs(struct llvmpipe_context *lp);
+
+void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe );
+
+
+boolean llvmpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
+			     unsigned start, unsigned count);
+
+boolean llvmpipe_draw_elements(struct pipe_context *pipe,
+			       struct pipe_buffer *indexBuffer,
+			       unsigned indexSize,
+			       unsigned mode, unsigned start, unsigned count);
+boolean
+llvmpipe_draw_range_elements(struct pipe_context *pipe,
+                             struct pipe_buffer *indexBuffer,
+                             unsigned indexSize,
+                             unsigned min_index,
+                             unsigned max_index,
+                             unsigned mode, unsigned start, unsigned count);
+
+void
+llvmpipe_set_edgeflags(struct pipe_context *pipe, const unsigned *edgeflags);
+
+
+void
+llvmpipe_map_transfers(struct llvmpipe_context *lp);
+
+void
+llvmpipe_unmap_transfers(struct llvmpipe_context *lp);
+
+void
+llvmpipe_map_texture_surfaces(struct llvmpipe_context *lp);
+
+void
+llvmpipe_unmap_texture_surfaces(struct llvmpipe_context *lp);
+
+
+struct vertex_info *
+llvmpipe_get_vertex_info(struct llvmpipe_context *llvmpipe);
+
+struct vertex_info *
+llvmpipe_get_vbuf_vertex_info(struct llvmpipe_context *llvmpipe);
+
+
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_state_blend.c b/src/gallium/drivers/llvmpipe/lp_state_blend.c
new file mode 100644
index 0000000000..3f03bd0057
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_blend.c
@@ -0,0 +1,114 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 VMware, Inc.
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ * @author Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "util/u_debug_dump.h"
+#include "lp_screen.h"
+#include "lp_context.h"
+#include "lp_state.h"
+
+/* Return a mem_dup() copy of *blend; llvmpipe_delete_blend_state() FREEs it. */
+void *
+llvmpipe_create_blend_state(struct pipe_context *pipe,
+                            const struct pipe_blend_state *blend)
+{
+   return mem_dup(blend, sizeof(*blend));
+}
+
+/* Make 'blend' the current blend state and flag it dirty. */
+void llvmpipe_bind_blend_state( struct pipe_context *pipe,
+                                void *blend )
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+
+   lp->dirty |= LP_NEW_BLEND;
+   lp->blend = blend;
+}
+
+void llvmpipe_delete_blend_state(struct pipe_context *pipe,
+                                 void *blend)
+{
+   FREE( blend );   /* releases the copy made by llvmpipe_create_blend_state(); 'pipe' is unused */
+}
+
+
+/* Store the blend color; also expand it for the JIT: 16 ubytes per channel. */
+void llvmpipe_set_blend_color( struct pipe_context *pipe,
+                               const struct pipe_blend_color *blend_color )
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+   unsigned i, j;
+   memcpy(&llvmpipe->blend_color, blend_color, sizeof *blend_color);
+   if(!llvmpipe->jit_context.blend_color)   /* 4 channels * 16 bytes, allocated once */
+      llvmpipe->jit_context.blend_color = align_malloc(4 * 16, 16);
+   if(!llvmpipe->jit_context.blend_color) return;   /* out of memory: no JIT copy */
+   for (i = 0; i < 4; ++i) {
+      uint8_t c = float_to_ubyte(blend_color->color[i]);
+      for (j = 0; j < 16; ++j)   /* channel i owns bytes [16*i, 16*i + 16) */
+         llvmpipe->jit_context.blend_color[i*16 + j] = c;
+   }
+}
+
+
+/** XXX move someday?  Or consolidate all these simple state setters
+ * into one file.
+ */
+
+/* Return a mem_dup() copy of *depth_stencil; the delete function FREEs it. */
+void *
+llvmpipe_create_depth_stencil_state(struct pipe_context *pipe,
+				    const struct pipe_depth_stencil_alpha_state *depth_stencil)
+{
+   return mem_dup(depth_stencil, sizeof(*depth_stencil));
+}
+
+/* Bind a depth/stencil/alpha state (NULL allowed); copies alpha ref for the JIT. */
+void
+llvmpipe_bind_depth_stencil_state(struct pipe_context *pipe,
+                                  void *depth_stencil)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+   const struct pipe_depth_stencil_alpha_state *dsa = depth_stencil;
+
+   lp->depth_stencil = dsa;
+   lp->dirty |= LP_NEW_DEPTH_STENCIL_ALPHA;
+   if (dsa != NULL)   /* with NULL, alpha_ref_value keeps its old value */
+      lp->jit_context.alpha_ref_value = dsa->alpha.ref_value;
+}
+
+void
+llvmpipe_delete_depth_stencil_state(struct pipe_context *pipe, void *depth)
+{
+   FREE( depth );   /* releases the copy made by llvmpipe_create_depth_stencil_state() */
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_state_clip.c b/src/gallium/drivers/llvmpipe/lp_state_clip.c
new file mode 100644
index 0000000000..df68f27acc
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_clip.c
@@ -0,0 +1,79 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+#include "lp_context.h"
+#include "lp_state.h"
+#include "draw/draw_context.h"
+
+
+/* Clip state is not stored here; it is forwarded to the draw module. */
+void llvmpipe_set_clip_state( struct pipe_context *pipe,
+                              const struct pipe_clip_state *clip )
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+
+   draw_set_clip_state(lp->draw, clip);
+}
+
+
+/* Hand the viewport to the draw module, keep a copy, and mark it dirty. */
+void llvmpipe_set_viewport_state( struct pipe_context *pipe,
+                                  const struct pipe_viewport_state *viewport )
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+
+   draw_set_viewport_state(lp->draw, viewport);
+
+   lp->dirty |= LP_NEW_VIEWPORT;
+   lp->viewport = *viewport;   /* by-value struct copy */
+}
+
+
+/* Flush the draw module, then store the new scissor and mark it dirty. */
+void llvmpipe_set_scissor_state( struct pipe_context *pipe,
+                                 const struct pipe_scissor_state *scissor )
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+
+   draw_flush(lp->draw);   /* flushed before the stored state changes */
+   lp->dirty |= LP_NEW_SCISSOR;
+   lp->scissor = *scissor;
+}
+
+
+/* Flush the draw module, then store the new stipple pattern and mark it dirty. */
+void llvmpipe_set_polygon_stipple( struct pipe_context *pipe,
+                                   const struct pipe_poly_stipple *stipple )
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+
+   draw_flush(lp->draw);   /* flushed before the stored state changes */
+   lp->dirty |= LP_NEW_STIPPLE;
+   lp->poly_stipple = *stipple;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
new file mode 100644
index 0000000000..30fb41ea65
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -0,0 +1,277 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "pipe/p_shader_tokens.h"
+#include "draw/draw_context.h"
+#include "draw/draw_vertex.h"
+#include "draw/draw_private.h"
+#include "lp_context.h"
+#include "lp_screen.h"
+#include "lp_tex_cache.h"
+#include "lp_state.h"
+
+
+/**
+ * Flag the cached vertex layout as stale by zeroing its attribute count.
+ * llvmpipe_get_vertex_info() rebuilds the layout when it finds a zero count.
+ */
+static void
+invalidate_vertex_layout(struct llvmpipe_context *llvmpipe)
+{
+   struct vertex_info *layout = &llvmpipe->vertex_info;
+   layout->num_attribs = 0;
+}
+
+
+/**
+ * The vertex info describes how to convert the post-transformed vertices
+ * (simple float[][4]) used by the 'draw' module into vertices for
+ * rasterization.
+ *
+ * This function validates the vertex layout and returns a pointer to a
+ * vertex_info object.
+ */
+struct vertex_info *
+llvmpipe_get_vertex_info(struct llvmpipe_context *llvmpipe)
+{
+   struct vertex_info *vinfo = &llvmpipe->vertex_info;
+
+   if (vinfo->num_attribs == 0) {
+      /* layout was invalidated (or never built): compute vertex layout now */
+      const struct lp_fragment_shader *lpfs = llvmpipe->fs;
+      const enum interp_mode colorInterp
+         = llvmpipe->rasterizer->flatshade ? INTERP_CONSTANT : INTERP_LINEAR;
+      uint i;
+
+      if (llvmpipe->vbuf) {
+         /* if using the post-transform vertex buffer, tell draw_vbuf to
+          * simply emit the whole post-xform vertex as-is:
+          */
+         struct vertex_info *vinfo_vbuf = &llvmpipe->vertex_info_vbuf;
+         const uint num = draw_num_vs_outputs(llvmpipe->draw);
+         /* the loop below uses the enclosing block's 'i' (no shadow copy) */
+
+         /* No longer any need to try and emit draw vertex_header info.
+          */
+         vinfo_vbuf->num_attribs = 0;
+         for (i = 0; i < num; i++) {
+            draw_emit_vertex_attr(vinfo_vbuf, EMIT_4F, INTERP_PERSPECTIVE, i);
+         }
+         draw_compute_vertex_size(vinfo_vbuf);
+      }
+
+      /*
+       * Loop over fragment shader inputs, searching for the matching output
+       * from the vertex shader.
+       */
+      vinfo->num_attribs = 0;
+      for (i = 0; i < lpfs->info.num_inputs; i++) {
+         int src;
+         enum interp_mode interp;
+
+         switch (lpfs->info.input_interpolate[i]) {
+         case TGSI_INTERPOLATE_CONSTANT:
+            interp = INTERP_CONSTANT;
+            break;
+         case TGSI_INTERPOLATE_LINEAR:
+            interp = INTERP_LINEAR;
+            break;
+         case TGSI_INTERPOLATE_PERSPECTIVE:
+            interp = INTERP_PERSPECTIVE;
+            break;
+         default:
+            assert(0);
+            interp = INTERP_LINEAR;
+         }
+
+         switch (lpfs->info.input_semantic_name[i]) {
+         case TGSI_SEMANTIC_POSITION:
+            src = draw_find_vs_output(llvmpipe->draw,
+                                      TGSI_SEMANTIC_POSITION, 0);
+            draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_POS, src);
+            break;
+
+         case TGSI_SEMANTIC_COLOR:
+            /* colors follow the rasterizer's flatshade setting, not 'interp' */
+            src = draw_find_vs_output(llvmpipe->draw, TGSI_SEMANTIC_COLOR,
+                                 lpfs->info.input_semantic_index[i]);
+            draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src);
+            break;
+
+         case TGSI_SEMANTIC_FOG:
+            src = draw_find_vs_output(llvmpipe->draw, TGSI_SEMANTIC_FOG, 0);
+            draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src);
+            break;
+
+         case TGSI_SEMANTIC_GENERIC:
+         case TGSI_SEMANTIC_FACE:
+            /* this includes texcoords and varying vars */
+            src = draw_find_vs_output(llvmpipe->draw, TGSI_SEMANTIC_GENERIC,
+                                      lpfs->info.input_semantic_index[i]);
+            draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src);
+            break;
+
+         default:
+            assert(0);
+         }
+      }
+
+      llvmpipe->psize_slot = draw_find_vs_output(llvmpipe->draw,
+                                                 TGSI_SEMANTIC_PSIZE, 0);
+      if (llvmpipe->psize_slot > 0) {
+         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT,
+                               llvmpipe->psize_slot);
+      }
+
+      draw_compute_vertex_size(vinfo);
+   }
+
+   return vinfo;
+}
+
+
+/**
+ * Vertex layout query for the vbuf module.
+ *
+ * llvmpipe maintains two vertex layouts:
+ *
+ * - vertex_info, built by llvmpipe_get_vertex_info() above, which the
+ *   point/line/tri "setup" code consumes;
+ *
+ * - vertex_info_vbuf (returned here), which only the vbuf module uses (not
+ *   the default path, but used in testing).  It passes the draw module's
+ *   vertex layout through as-is; once the llvmpipe vbuf code begins drawing,
+ *   the normal layout comes into play again.
+ */
+struct vertex_info *
+llvmpipe_get_vbuf_vertex_info(struct llvmpipe_context *llvmpipe)
+{
+   struct vertex_info *vbuf_layout = &llvmpipe->vertex_info_vbuf;
+   llvmpipe_get_vertex_info(llvmpipe);   /* return value not needed here */
+   return vbuf_layout;
+}
+
+
+/**
+ * Derive lp->cliprect from the framebuffer size and, when the rasterizer
+ * enables it, the scissor rectangle.
+ */
+static void
+compute_cliprect(struct llvmpipe_context *lp)
+{
+   /* depends on LP_NEW_FRAMEBUFFER */
+   const uint fb_width = lp->framebuffer.width;
+   const uint fb_height = lp->framebuffer.height;
+
+   /* depends on LP_NEW_RASTERIZER */
+   if (!lp->rasterizer->scissor) {
+      /* scissoring disabled: clip to surface bounds */
+      lp->cliprect.minx = 0;
+      lp->cliprect.miny = 0;
+      lp->cliprect.maxx = fb_width;
+      lp->cliprect.maxy = fb_height;
+      return;
+   }
+
+   /*
+    * depends on LP_NEW_SCISSOR
+    *
+    * scissoring enabled: clip to the scissor rect, itself limited to the
+    * surface bounds
+    */
+   lp->cliprect.minx = MAX2(lp->scissor.minx, 0);
+   lp->cliprect.miny = MAX2(lp->scissor.miny, 0);
+   lp->cliprect.maxx = MIN2(lp->scissor.maxx, fb_width);
+   lp->cliprect.maxy = MIN2(lp->scissor.maxy, fb_height);
+}
+
+
+/* Point the TGSI sampler objects at the currently bound samplers and textures. */
+static void
+update_tgsi_samplers( struct llvmpipe_context *llvmpipe )
+{
+   unsigned i;
+   /* vertex shader samplers */
+   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
+      llvmpipe->tgsi.vert_samplers[i].sampler = llvmpipe->sampler[i];
+      llvmpipe->tgsi.vert_samplers[i].texture = llvmpipe->texture[i];
+      llvmpipe->tgsi.vert_samplers[i].base.get_samples = lp_get_samples;
+   }
+
+   /* fragment shader samplers */
+   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
+      llvmpipe->tgsi.frag_samplers[i].sampler = llvmpipe->sampler[i];
+      llvmpipe->tgsi.frag_samplers[i].texture = llvmpipe->texture[i];
+      llvmpipe->tgsi.frag_samplers[i].base.get_samples = lp_get_samples;
+   }
+
+   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
+      lp_tex_tile_cache_validate_texture( llvmpipe->tex_cache[i] );
+   }
+
+   llvmpipe->jit_context.samplers = (struct tgsi_sampler **)llvmpipe->tgsi.frag_samplers_list;
+}
+
+/* Re-validate derived state for the flags in llvmpipe->dirty, then clear it.
+ * Hopefully this stays simple, else pull in a state-tracker-like mechanism.
+ */
+void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe )
+{
+   struct llvmpipe_screen *lp_screen = llvmpipe_screen(llvmpipe->pipe.screen);
+
+   /* Check for updated textures.
+    */
+   if (llvmpipe->tex_timestamp != lp_screen->timestamp) {
+      llvmpipe->tex_timestamp = lp_screen->timestamp;
+      llvmpipe->dirty |= LP_NEW_TEXTURE;
+   }
+
+   if (llvmpipe->dirty & (LP_NEW_SAMPLER |
+                          LP_NEW_TEXTURE))
+      update_tgsi_samplers( llvmpipe );
+
+   if (llvmpipe->dirty & (LP_NEW_RASTERIZER |
+                          LP_NEW_FS |
+                          LP_NEW_VS))
+      invalidate_vertex_layout( llvmpipe );
+
+   if (llvmpipe->dirty & (LP_NEW_SCISSOR |
+                          LP_NEW_RASTERIZER |
+                          LP_NEW_FRAMEBUFFER))
+      compute_cliprect(llvmpipe);
+   /* the fs variant key also reads the framebuffer's zsbuf/cbufs[0] formats */
+   if (llvmpipe->dirty & (LP_NEW_FS |
+                          LP_NEW_FRAMEBUFFER |
+                          LP_NEW_BLEND |
+                          LP_NEW_DEPTH_STENCIL_ALPHA |
+                          LP_NEW_SAMPLER |
+                          LP_NEW_TEXTURE))
+      llvmpipe_update_fs( llvmpipe );
+
+   llvmpipe->dirty = 0;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
new file mode 100644
index 0000000000..9faed5a0b1
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -0,0 +1,762 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 VMware, Inc.
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * @file
+ * Code generation for the whole fragment pipeline.
+ *
+ * The fragment pipeline consists of the following stages:
+ * - stipple (TBI)
+ * - early depth test
+ * - fragment shader
+ * - alpha test
+ * - depth/stencil test (stencil TBI)
+ * - blending
+ *
+ * This file has only the glue to assemble the fragment pipeline.  The actual
+ * plumbing of converting Gallium state into LLVM IR is done elsewhere, in the
+ * lp_bld_*.[ch] files, in a completely generic and reusable way. Here we
+ * muster the LLVM JIT execution engine to create a function that follows an
+ * established binary interface and that can be called from C directly.
+ *
+ * A big source of complexity here is that we often want to run different
+ * stages with different precisions and data types. For example,
+ * the fragment shader needs typically to be done in floats, but the
+ * depth/stencil test and blending is better done in the type that most closely
+ * matches the depth/stencil and color buffer respectively.
+ *
+ * Since the width of a SIMD vector register stays the same regardless of the
+ * element type, different types imply different number of elements, so we must
+ * code generate more instances of the stages with larger types to be able to
+ * feed/consume the stages with smaller types.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#include "pipe/p_defines.h"
+#include "util/u_memory.h"
+#include "util/u_format.h"
+#include "util/u_debug_dump.h"
+#include "pipe/internal/p_winsys_screen.h"
+#include "pipe/p_shader_tokens.h"
+#include "draw/draw_context.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_scan.h"
+#include "tgsi/tgsi_parse.h"
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_conv.h"
+#include "lp_bld_intr.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_depth.h"
+#include "lp_bld_interp.h"
+#include "lp_bld_tgsi.h"
+#include "lp_bld_alpha.h"
+#include "lp_bld_blend.h"
+#include "lp_bld_swizzle.h"
+#include "lp_bld_flow.h"
+#include "lp_bld_debug.h"
+#include "lp_screen.h"
+#include "lp_context.h"
+#include "lp_state.h"
+#include "lp_quad.h"
+#include "lp_tex_sample.h"
+
+
+static const unsigned char quad_offset_x[4] = {0, 1, 0, 1};   /* per-pixel x offset added to the quad's upper-left x in generate_pos0() */
+static const unsigned char quad_offset_y[4] = {0, 0, 1, 1};   /* per-pixel y offset added to the quad's upper-left y in generate_pos0() */
+
+
+/*
+ * Broadcast the quad's upper-left integer (x, y), add the per-pixel offsets
+ * from quad_offset_x/y, and return the results as float vectors in *x0/*y0.
+ */
+static void
+generate_pos0(LLVMBuilderRef builder,
+              LLVMValueRef x,
+              LLVMValueRef y,
+              LLVMValueRef *x0,
+              LLVMValueRef *y0)
+{
+   LLVMTypeRef int_elem_type = LLVMInt32Type();
+   LLVMTypeRef int_vec_type = LLVMVectorType(int_elem_type, QUAD_SIZE);
+   LLVMTypeRef elem_type = LLVMFloatType();
+   LLVMTypeRef vec_type = LLVMVectorType(elem_type, QUAD_SIZE);
+   LLVMValueRef x_offsets[QUAD_SIZE];
+   LLVMValueRef y_offsets[QUAD_SIZE];
+   unsigned i;
+
+   x = lp_build_broadcast(builder, int_vec_type, x);
+   y = lp_build_broadcast(builder, int_vec_type, y);
+   /* build the constant offset vectors, one element per quad pixel */
+   for(i = 0; i < QUAD_SIZE; ++i) {
+      x_offsets[i] = LLVMConstInt(int_elem_type, quad_offset_x[i], 0);
+      y_offsets[i] = LLVMConstInt(int_elem_type, quad_offset_y[i], 0);
+   }
+
+   x = LLVMBuildAdd(builder, x, LLVMConstVector(x_offsets, QUAD_SIZE), "");
+   y = LLVMBuildAdd(builder, y, LLVMConstVector(y_offsets, QUAD_SIZE), "");
+   /* signed int -> float conversion of the per-pixel coordinates */
+   *x0 = LLVMBuildSIToFP(builder, x, vec_type, "");
+   *y0 = LLVMBuildSIToFP(builder, y, vec_type, "");
+}
+
+
+/**
+ * Generate the depth test (nothing is emitted when key->depth.enabled is 0).
+ */
+static void
+generate_depth(LLVMBuilderRef builder,
+               const struct lp_fragment_shader_variant_key *key,
+               struct lp_type src_type,
+               struct lp_build_mask_context *mask,
+               LLVMValueRef src,
+               LLVMValueRef dst_ptr)
+{
+   const struct util_format_description *format_desc;
+   struct lp_type dst_type;
+
+   if(!key->depth.enabled)
+      return;
+
+   format_desc = util_format_description(key->zsbuf_format);
+   assert(format_desc);
+
+   /* Pick the depth type. */
+   dst_type = lp_depth_type(format_desc, src_type.width*src_type.length);
+
+   /* FIXME: Cope with a depth test type with a different bit width. */
+   assert(dst_type.width == src_type.width);
+   assert(dst_type.length == src_type.length);
+   /* convert the float 'src' to an unsigned normalized value of dst_type.width bits */
+#if 1
+   src = lp_build_clamped_float_to_unsigned_norm(builder,
+                                                 src_type,
+                                                 dst_type.width,
+                                                 src);
+#else
+   lp_build_conv(builder, src_type, dst_type, &src, 1, &src, 1);
+#endif
+   /* hand the converted value, the mask and the depth pointer to the test builder */
+   lp_build_depth_test(builder,
+                       &key->depth,
+                       dst_type,
+                       format_desc,
+                       mask,
+                       src,
+                       dst_ptr);
+}
+
+
+/**
+ * Generate the fragment shader, depth/stencil test, and alpha tests.
+ */
+static void
+generate_fs(struct llvmpipe_context *lp,
+            struct lp_fragment_shader *shader,
+            const struct lp_fragment_shader_variant_key *key,
+            LLVMBuilderRef builder,
+            struct lp_type type,
+            LLVMValueRef context_ptr,
+            unsigned i,
+            const struct lp_build_interp_soa_context *interp,
+            struct lp_build_sampler_soa *sampler,
+            LLVMValueRef *pmask,
+            LLVMValueRef *color,
+            LLVMValueRef depth_ptr)
+{
+   const struct tgsi_token *tokens = shader->base.tokens;
+   LLVMTypeRef elem_type;
+   LLVMTypeRef vec_type;
+   LLVMTypeRef int_vec_type;
+   LLVMValueRef consts_ptr;
+   LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS];
+   LLVMValueRef z = interp->pos[2];
+   struct lp_build_flow_context *flow;
+   struct lp_build_mask_context mask;
+   boolean early_depth_test;
+   unsigned attrib;
+   unsigned chan;
+
+   elem_type = lp_build_elem_type(type);
+   vec_type = lp_build_vec_type(type);
+   int_vec_type = lp_build_int_vec_type(type);
+   /* NOTE(review): elem_type and int_vec_type are assigned but not read below */
+   consts_ptr = lp_jit_context_constants(builder, context_ptr);
+
+   flow = lp_build_flow_create(builder);
+   /* cleared so that channels the shader never writes stay NULL in the loop below */
+   memset(outputs, 0, sizeof outputs);
+
+   lp_build_flow_scope_begin(flow);
+
+   /* Declare the color and z variables */
+   for(chan = 0; chan < NUM_CHANNELS; ++chan) {
+      color[chan] = LLVMGetUndef(vec_type);
+      lp_build_flow_scope_declare(flow, &color[chan]);
+   }
+   lp_build_flow_scope_declare(flow, &z);
+
+   lp_build_mask_begin(&mask, flow, type, *pmask);
+   /* test depth before shading only if no alpha test, kill, or shader-written z */
+   early_depth_test =
+      key->depth.enabled &&
+      !key->alpha.enabled &&
+      !shader->info.uses_kill &&
+      !shader->info.writes_z;
+
+   if(early_depth_test)
+      generate_depth(builder, key,
+                     type, &mask,
+                     z, depth_ptr);
+
+   lp_build_tgsi_soa(builder, tokens, type, &mask,
+                     consts_ptr, interp->pos, interp->inputs,
+                     outputs, sampler);
+   /* route written outputs: color buffer 0 feeds color[], POSITION.z replaces z */
+   for (attrib = 0; attrib < shader->info.num_outputs; ++attrib) {
+      for(chan = 0; chan < NUM_CHANNELS; ++chan) {
+         if(outputs[attrib][chan]) {
+            lp_build_name(outputs[attrib][chan], "output%u.%u.%c", i, attrib, "xyzw"[chan]);
+
+            switch (shader->info.output_semantic_name[attrib]) {
+            case TGSI_SEMANTIC_COLOR:
+               {
+                  unsigned cbuf = shader->info.output_semantic_index[attrib];
+
+                  lp_build_name(outputs[attrib][chan], "color%u.%u.%c", i, attrib, "rgba"[chan]);
+
+                  /* Alpha test */
+                  /* XXX: should the alpha reference value be passed separately? */
+                  if(cbuf == 0 && chan == 3) {
+                     LLVMValueRef alpha = outputs[attrib][chan];
+                     LLVMValueRef alpha_ref_value;
+                     alpha_ref_value = lp_jit_context_alpha_ref_value(builder, context_ptr);
+                     alpha_ref_value = lp_build_broadcast(builder, vec_type, alpha_ref_value);
+                     lp_build_alpha_test(builder, &key->alpha, type,
+                                         &mask, alpha, alpha_ref_value);
+                  }
+
+                  if(cbuf == 0)
+                     color[chan] = outputs[attrib][chan];
+
+                  break;
+               }
+
+            case TGSI_SEMANTIC_POSITION:
+               if(chan == 2)
+                  z = outputs[attrib][chan];
+               break;
+            }
+         }
+      }
+   }
+   /* late depth test, run after the shader when the early one was not possible */
+   if(!early_depth_test)
+      generate_depth(builder, key,
+                     type, &mask,
+                     z, depth_ptr);
+
+   lp_build_mask_end(&mask);
+
+   lp_build_flow_scope_end(flow);
+
+   lp_build_flow_destroy(flow);
+
+   *pmask = mask.value;
+
+}
+
+
+/**
+ * Generate color blending and color output.
+ */
+static void
+generate_blend(const struct pipe_blend_state *blend,
+               LLVMBuilderRef builder,
+               struct lp_type type,
+               LLVMValueRef context_ptr,
+               LLVMValueRef mask,
+               LLVMValueRef *src,
+               LLVMValueRef dst_ptr)
+{
+   struct lp_build_context bld;
+   struct lp_build_flow_context *flow;
+   struct lp_build_mask_context mask_ctx;
+   LLVMTypeRef vec_type;
+   LLVMTypeRef int_vec_type;
+   LLVMValueRef const_ptr;
+   LLVMValueRef con[4];
+   LLVMValueRef dst[4];
+   LLVMValueRef res[4];
+   unsigned chan;
+   /* NOTE(review): int_vec_type is assigned but not read; the select below uses 'mask', not mask_ctx */
+   lp_build_context_init(&bld, builder, type);
+
+   flow = lp_build_flow_create(builder);
+   lp_build_mask_begin(&mask_ctx, flow, type, mask);
+
+   vec_type = lp_build_vec_type(type);
+   int_vec_type = lp_build_int_vec_type(type);
+
+   const_ptr = lp_jit_context_blend_color(builder, context_ptr);
+   const_ptr = LLVMBuildBitCast(builder, const_ptr,
+                                LLVMPointerType(vec_type, 0), "");
+   /* load one blend-constant vector and one destination vector per channel */
+   for(chan = 0; chan < 4; ++chan) {
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), chan, 0);
+      con[chan] = LLVMBuildLoad(builder, LLVMBuildGEP(builder, const_ptr, &index, 1, ""), "");
+
+      dst[chan] = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dst_ptr, &index, 1, ""), "");
+
+      lp_build_name(con[chan], "con.%c", "rgba"[chan]);
+      lp_build_name(dst[chan], "dst.%c", "rgba"[chan]);
+   }
+
+   lp_build_blend_soa(builder, blend, type, src, dst, con, res);
+   /* store only channels enabled in colormask, selecting res or dst by 'mask' */
+   for(chan = 0; chan < 4; ++chan) {
+      if(blend->colormask & (1 << chan)) {
+         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), chan, 0);
+         lp_build_name(res[chan], "res.%c", "rgba"[chan]);
+         res[chan] = lp_build_select(&bld, mask, res[chan], dst[chan]);
+         LLVMBuildStore(builder, res[chan], LLVMBuildGEP(builder, dst_ptr, &index, 1, ""));
+      }
+   }
+
+   lp_build_mask_end(&mask_ctx);
+   lp_build_flow_destroy(flow);
+}
+
+
+/**
+ * Generate the runtime callable function for the whole fragment pipeline.
+ */
+static struct lp_fragment_shader_variant *
+generate_fragment(struct llvmpipe_context *lp,
+                  struct lp_fragment_shader *shader,
+                  const struct lp_fragment_shader_variant_key *key)
+{
+   struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
+   struct lp_fragment_shader_variant *variant;
+   struct lp_type fs_type;
+   struct lp_type blend_type;
+   LLVMTypeRef fs_elem_type;
+   LLVMTypeRef fs_vec_type;
+   LLVMTypeRef fs_int_vec_type;
+   LLVMTypeRef blend_vec_type;
+   LLVMTypeRef blend_int_vec_type;
+   LLVMTypeRef arg_types[9];
+   LLVMTypeRef func_type;
+   LLVMValueRef context_ptr;
+   LLVMValueRef x;
+   LLVMValueRef y;
+   LLVMValueRef a0_ptr;
+   LLVMValueRef dadx_ptr;
+   LLVMValueRef dady_ptr;
+   LLVMValueRef mask_ptr;
+   LLVMValueRef color_ptr;
+   LLVMValueRef depth_ptr;
+   LLVMBasicBlockRef block;
+   LLVMBuilderRef builder;
+   LLVMValueRef x0;
+   LLVMValueRef y0;
+   struct lp_build_sampler_soa *sampler;
+   struct lp_build_interp_soa_context interp;
+   LLVMValueRef fs_mask[LP_MAX_VECTOR_LENGTH];
+   LLVMValueRef fs_out_color[NUM_CHANNELS][LP_MAX_VECTOR_LENGTH];
+   LLVMValueRef blend_mask;
+   LLVMValueRef blend_in_color[NUM_CHANNELS];
+   unsigned num_fs;
+   unsigned i;
+   unsigned chan;
+   /* debug builds: dump the TGSI tokens and the key state this variant is built for */
+#ifdef DEBUG
+   tgsi_dump(shader->base.tokens, 0);
+   if(key->depth.enabled) {
+      debug_printf("depth.func = %s\n", debug_dump_func(key->depth.func, TRUE));
+      debug_printf("depth.writemask = %u\n", key->depth.writemask);
+      debug_printf("depth.occlusion_count = %u\n", key->depth.occlusion_count);
+   }
+   if(key->alpha.enabled) {
+      debug_printf("alpha.func = %s\n", debug_dump_func(key->alpha.func, TRUE));
+      debug_printf("alpha.ref_value = %f\n", key->alpha.ref_value);
+   }
+   if(key->blend.logicop_enable) {
+      debug_printf("blend.logicop_func = %u\n", key->blend.logicop_func);
+   }
+   else if(key->blend.blend_enable) {
+      debug_printf("blend.rgb_func = %s\n",   debug_dump_blend_func  (key->blend.rgb_func, TRUE));
+      debug_printf("rgb_src_factor = %s\n",   debug_dump_blend_factor(key->blend.rgb_src_factor, TRUE));
+      debug_printf("rgb_dst_factor = %s\n",   debug_dump_blend_factor(key->blend.rgb_dst_factor, TRUE));
+      debug_printf("alpha_func = %s\n",       debug_dump_blend_func  (key->blend.alpha_func, TRUE));
+      debug_printf("alpha_src_factor = %s\n", debug_dump_blend_factor(key->blend.alpha_src_factor, TRUE));
+      debug_printf("alpha_dst_factor = %s\n", debug_dump_blend_factor(key->blend.alpha_dst_factor, TRUE));
+   }
+   debug_printf("blend.colormask = 0x%x\n", key->blend.colormask);
+#endif
+
+   variant = CALLOC_STRUCT(lp_fragment_shader_variant);
+   if(!variant)
+      return NULL;
+
+   variant->shader = shader;
+   memcpy(&variant->key, key, sizeof *key);
+
+   /* TODO: actually pick these based on the fs and color buffer
+    * characteristics. */
+
+   memset(&fs_type, 0, sizeof fs_type);
+   fs_type.floating = TRUE; /* floating point values */
+   fs_type.sign = TRUE;     /* values are signed */
+   fs_type.norm = FALSE;    /* values are not limited to [0,1] or [-1,1] */
+   fs_type.width = 32;      /* 32-bit float */
+   fs_type.length = 4;      /* 4 element per vector */
+   num_fs = 4;              /* number of fs instances generated below */
+
+   memset(&blend_type, 0, sizeof blend_type);
+   blend_type.floating = FALSE; /* values are integers */
+   blend_type.sign = FALSE;     /* values are unsigned */
+   blend_type.norm = TRUE;      /* values are in [0,1] or [-1,1] */
+   blend_type.width = 8;        /* 8-bit ubyte values */
+   blend_type.length = 16;      /* 16 elements per vector */
+
+   /* 
+    * Generate the function prototype. Any change here must be reflected in
+    * lp_jit.h's lp_jit_frag_func function pointer type, and vice-versa.
+    */
+
+   fs_elem_type = lp_build_elem_type(fs_type);
+   fs_vec_type = lp_build_vec_type(fs_type);
+   fs_int_vec_type = lp_build_int_vec_type(fs_type);
+
+   blend_vec_type = lp_build_vec_type(blend_type);
+   blend_int_vec_type = lp_build_int_vec_type(blend_type);
+   /* NOTE(review): fs_vec_type and blend_int_vec_type are assigned but not read below */
+   arg_types[0] = screen->context_ptr_type;            /* context */
+   arg_types[1] = LLVMInt32Type();                     /* x */
+   arg_types[2] = LLVMInt32Type();                     /* y */
+   arg_types[3] = LLVMPointerType(fs_elem_type, 0);    /* a0 */
+   arg_types[4] = LLVMPointerType(fs_elem_type, 0);    /* dadx */
+   arg_types[5] = LLVMPointerType(fs_elem_type, 0);    /* dady */
+   arg_types[6] = LLVMPointerType(fs_int_vec_type, 0); /* mask */
+   arg_types[7] = LLVMPointerType(blend_vec_type, 0);  /* color */
+   arg_types[8] = LLVMPointerType(fs_int_vec_type, 0); /* depth */
+
+   func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0);
+
+   variant->function = LLVMAddFunction(screen->module, "shader", func_type);
+   LLVMSetFunctionCallConv(variant->function, LLVMCCallConv);
+   for(i = 0; i < Elements(arg_types); ++i)   /* mark every pointer argument noalias */
+      if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind)
+         LLVMAddAttribute(LLVMGetParam(variant->function, i), LLVMNoAliasAttribute);
+
+   context_ptr  = LLVMGetParam(variant->function, 0);
+   x            = LLVMGetParam(variant->function, 1);
+   y            = LLVMGetParam(variant->function, 2);
+   a0_ptr       = LLVMGetParam(variant->function, 3);
+   dadx_ptr     = LLVMGetParam(variant->function, 4);
+   dady_ptr     = LLVMGetParam(variant->function, 5);
+   mask_ptr     = LLVMGetParam(variant->function, 6);
+   color_ptr    = LLVMGetParam(variant->function, 7);
+   depth_ptr    = LLVMGetParam(variant->function, 8);
+
+   lp_build_name(context_ptr, "context");
+   lp_build_name(x, "x");
+   lp_build_name(y, "y");
+   lp_build_name(a0_ptr, "a0");
+   lp_build_name(dadx_ptr, "dadx");
+   lp_build_name(dady_ptr, "dady");
+   lp_build_name(mask_ptr, "mask");
+   lp_build_name(color_ptr, "color");
+   lp_build_name(depth_ptr, "depth");
+
+   /*
+    * Function body
+    */
+
+   block = LLVMAppendBasicBlock(variant->function, "entry");
+   builder = LLVMCreateBuilder();
+   LLVMPositionBuilderAtEnd(builder, block);
+
+   generate_pos0(builder, x, y, &x0, &y0);
+
+   lp_build_interp_soa_init(&interp, shader->base.tokens, builder, fs_type,
+                            a0_ptr, dadx_ptr, dady_ptr,
+                            x0, y0, 2, 0);
+
+#if 0
+   /* C texture sampling */
+   sampler = lp_c_sampler_soa_create(context_ptr);
+#else
+   /* code generated texture sampling */
+   sampler = lp_llvm_sampler_soa_create(key->sampler, context_ptr);
+#endif
+   /* emit num_fs instances of the fs stage; each has its own mask and depth slot */
+   for(i = 0; i < num_fs; ++i) {
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      LLVMValueRef out_color[NUM_CHANNELS];
+      LLVMValueRef depth_ptr_i;
+
+      if(i != 0)
+         lp_build_interp_soa_update(&interp);
+
+      fs_mask[i] = LLVMBuildLoad(builder, LLVMBuildGEP(builder, mask_ptr, &index, 1, ""), "");
+      depth_ptr_i = LLVMBuildGEP(builder, depth_ptr, &index, 1, "");
+
+      generate_fs(lp, shader, key,
+                  builder,
+                  fs_type,
+                  context_ptr,
+                  i,
+                  &interp,
+                  sampler,
+                  &fs_mask[i],
+                  out_color,
+                  depth_ptr_i);
+
+      for(chan = 0; chan < NUM_CHANNELS; ++chan)
+         fs_out_color[chan][i] = out_color[chan];
+   }
+
+   sampler->destroy(sampler);
+
+   /* 
+    * Convert the fs's output color and mask to fit to the blending type. 
+    */
+
+   for(chan = 0; chan < NUM_CHANNELS; ++chan) {
+      lp_build_conv(builder, fs_type, blend_type,
+                    fs_out_color[chan], num_fs,
+                    &blend_in_color[chan], 1);
+      lp_build_name(blend_in_color[chan], "color.%c", "rgba"[chan]);
+
+   }
+
+   lp_build_conv_mask(builder, fs_type, blend_type,
+                               fs_mask, num_fs,
+                               &blend_mask, 1);
+
+   /*
+    * Blending.
+    */
+
+   generate_blend(&key->blend,
+                  builder,
+                  blend_type,
+                  context_ptr,
+                  blend_mask,
+                  blend_in_color,
+                  color_ptr);
+
+   LLVMBuildRetVoid(builder);
+
+   LLVMDisposeBuilder(builder);
+
+   /*
+    * Translate the LLVM IR into machine code.
+    */
+
+   LLVMRunFunctionPassManager(screen->pass, variant->function);
+
+#ifdef DEBUG
+   LLVMDumpValue(variant->function);
+   debug_printf("\n");
+#endif
+   /* a function that fails verification is dumped and the process aborts */
+   if(LLVMVerifyFunction(variant->function, LLVMPrintMessageAction)) {
+      LLVMDumpValue(variant->function);
+      abort();
+   }
+
+   variant->jit_function = (lp_jit_frag_func)LLVMGetPointerToGlobal(screen->engine, variant->function);
+
+#ifdef DEBUG
+   lp_disassemble(variant->jit_function);
+#endif
+   /* link the new variant at the head of the shader's variant list */
+   variant->next = shader->variants;
+   shader->variants = variant;
+
+   return variant;
+}
+
+
+/* Create the fs CSO; it owns a copy of the tokens.  NULL if an allocation fails. */
+void *
+llvmpipe_create_fs_state(struct pipe_context *pipe,
+                         const struct pipe_shader_state *templ)
+{
+   struct lp_fragment_shader *shader = CALLOC_STRUCT(lp_fragment_shader);
+   if (!shader)
+      return NULL;
+   /* we need to keep a local copy of the tokens */
+   shader->base.tokens = tgsi_dup_tokens(templ->tokens);
+   if (!shader->base.tokens) {
+      FREE(shader);   /* copy failed: do not return a shader without tokens */
+      return NULL;
+   }
+   /* get/save the summary info for this shader */
+   tgsi_scan_shader(templ->tokens, &shader->info);
+   return shader;
+}
+
+
+void
+llvmpipe_bind_fs_state(struct pipe_context *pipe, void *fs)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+
+   /* record the shader and flag LP_NEW_FS so derived state gets re-validated */
+   lp->dirty = lp->dirty | LP_NEW_FS;
+   lp->fs = fs;
+}
+
+
+void
+llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+   struct llvmpipe_screen *screen = llvmpipe_screen(pipe->screen);
+   struct lp_fragment_shader *shader = fs;
+   struct lp_fragment_shader_variant *cur;
+   struct lp_fragment_shader_variant *following;
+
+   /* the shader being deleted must not be the bound one */
+   assert(fs != llvmpipe->fs);
+
+   /* tear down every variant generated for this shader */
+   for (cur = shader->variants; cur; cur = following) {
+      /* read the link first: 'cur' is freed at the end of the iteration */
+      following = cur->next;
+      if (cur->function) {
+         if (cur->jit_function)
+            LLVMFreeMachineCodeForFunction(screen->engine, cur->function);
+         LLVMDeleteFunction(cur->function);
+      }
+      FREE(cur);
+   }
+
+   /* finally the token copy and the shader itself */
+   FREE((void *) shader->base.tokens);
+   FREE(shader);
+}
+
+
+
+void
+llvmpipe_set_constant_buffer(struct pipe_context *pipe,
+                             uint shader, uint index,
+                             const struct pipe_constant_buffer *buf)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+
+   assert(shader < PIPE_SHADER_TYPES);
+   assert(index == 0);
+   /* note: reference counting; without a buf, NULL is passed as the new buffer */
+   if (buf)
+      pipe_buffer_reference(&lp->constants[shader].buffer, buf->buffer);
+   else
+      pipe_buffer_reference(&lp->constants[shader].buffer, NULL);
+   lp->dirty = lp->dirty | LP_NEW_CONSTANTS;
+}
+
+
+/**
+ * We need to generate several variants of the fragment pipeline to match
+ * all the combinations of the contributing state atoms.
+ *
+ * TODO: there is actually no reason to tie this to context state -- the
+ * generated code could be cached globally in the screen.
+ */
+static void
+make_variant_key(struct llvmpipe_context *lp,
+                 struct lp_fragment_shader *shader,
+                 struct lp_fragment_shader_variant_key *key)
+{
+   unsigned i;
+   /* zero everything first: keys are compared with memcmp() in llvmpipe_update_fs() */
+   memset(key, 0, sizeof *key);
+
+   if(lp->framebuffer.zsbuf &&
+      lp->depth_stencil->depth.enabled) {
+      key->zsbuf_format = lp->framebuffer.zsbuf->format;
+      memcpy(&key->depth, &lp->depth_stencil->depth, sizeof key->depth);
+   }
+
+   key->alpha.enabled = lp->depth_stencil->alpha.enabled;
+   if(key->alpha.enabled)
+      key->alpha.func = lp->depth_stencil->alpha.func;
+   /* alpha.ref_value is passed in jit_context */
+
+   if(lp->framebuffer.cbufs[0]) {
+      const struct util_format_description *format_desc;
+      unsigned chan;
+
+      memcpy(&key->blend, lp->blend, sizeof key->blend);
+      /* only RGB/sRGB color buffers are expected here (checked on the colorspace) */
+      format_desc = util_format_description(lp->framebuffer.cbufs[0]->format);
+      assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
+             format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB);
+
+      /* mask out color channels not present in the color buffer */
+      for(chan = 0; chan < 4; ++chan) {
+         enum util_format_swizzle swizzle = format_desc->swizzle[chan];
+         if(swizzle > 4)
+            key->blend.colormask &= ~(1 << chan);
+      }
+   }
+   /* static sampler state is recorded only for the sampler units the shader uses */
+   for(i = 0; i < PIPE_MAX_SAMPLERS; ++i)
+      if(shader->info.file_mask[TGSI_FILE_SAMPLER] & (1 << i))
+         lp_sampler_static_state(&key->sampler[i], lp->texture[i], lp->sampler[i]);
+}
+
+
+/* Select (generating on demand) the fs variant that matches the current state. */
+void 
+llvmpipe_update_fs(struct llvmpipe_context *lp)
+{
+   struct lp_fragment_shader *shader = lp->fs;
+   struct lp_fragment_shader_variant_key key;
+   struct lp_fragment_shader_variant *variant;
+
+   make_variant_key(lp, shader, &key);
+
+   /* reuse an existing variant whose key is byte-for-byte identical */
+   for (variant = shader->variants; variant; variant = variant->next) {
+      if (!memcmp(&variant->key, &key, sizeof key))
+         break;
+   }
+
+   /* none found: build a new one */
+   if (!variant)
+      variant = generate_fragment(lp, shader, &key);
+
+   shader->current = variant;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c b/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
new file mode 100644
index 0000000000..4561c6b845
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
@@ -0,0 +1,62 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "pipe/p_defines.h"
+#include "util/u_memory.h"
+#include "lp_context.h"
+#include "lp_state.h"
+#include "draw/draw_context.h"
+
+
+
+/* Create rasterizer state: the result is a mem_dup() copy of *rast. */
+void *llvmpipe_create_rasterizer_state(struct pipe_context *pipe,
+      const struct pipe_rasterizer_state *rast)
+{
+   return mem_dup(rast, sizeof *rast);
+}
+
+/* Bind rasterizer state: forward it to the draw module, remember it in the
+ * context and flag LP_NEW_RASTERIZER as dirty. */
+void llvmpipe_bind_rasterizer_state(struct pipe_context *pipe, void *setup)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+   struct pipe_rasterizer_state *rast = setup;
+
+   draw_set_rasterizer_state(lp->draw, rast);
+
+   lp->rasterizer = rast;
+   lp->dirty |= LP_NEW_RASTERIZER;
+}
+
+/* Destroy rasterizer state: the object is simply handed to FREE(). */
+void
+llvmpipe_delete_rasterizer_state(struct pipe_context *pipe, void *rasterizer)
+{
+   FREE(rasterizer);
+}
+
diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
new file mode 100644
index 0000000000..c69d90c723
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
@@ -0,0 +1,127 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:
+ *  Brian Paul
+ */
+
+#include "util/u_memory.h"
+
+#include "draw/draw_context.h"
+
+#include "lp_context.h"
+#include "lp_context.h"
+#include "lp_state.h"
+#include "lp_texture.h"
+#include "lp_tex_cache.h"
+#include "draw/draw_context.h"
+
+
+
+/* Create sampler state: the result is a mem_dup() copy of *sampler. */
+void *
+llvmpipe_create_sampler_state(struct pipe_context *pipe,
+                              const struct pipe_sampler_state *sampler)
+{
+   return mem_dup(sampler, sizeof *sampler);
+}
+
+/* Bind `num` sampler states; slots from `num` up to PIPE_MAX_SAMPLERS are
+ * cleared.  Rebinding an identical set returns before flushing the draw. */
+void
+llvmpipe_bind_sampler_states(struct pipe_context *pipe,
+                             unsigned num, void **sampler)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+   unsigned slot;
+
+   assert(num <= PIPE_MAX_SAMPLERS);
+
+   /* same count and same pointers as already bound: nothing to do */
+   if (lp->num_samplers == num &&
+       memcmp(lp->sampler, sampler, num * sizeof(void *)) == 0)
+      return;
+
+   draw_flush(lp->draw);
+
+   /* one pass: copy the supplied entries, NULL out the remainder */
+   for (slot = 0; slot < PIPE_MAX_SAMPLERS; ++slot)
+      lp->sampler[slot] = slot < num ? sampler[slot] : NULL;
+
+   lp->num_samplers = num;
+   lp->dirty |= LP_NEW_SAMPLER;
+}
+
+
+/* Bind `num` sampler textures, unbinding the remaining units, and mirror each
+ * bound texture's width[0]/height[0]/stride[0] into the JIT context. */
+void
+llvmpipe_set_sampler_textures(struct pipe_context *pipe,
+                              unsigned num, struct pipe_texture **texture)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+   uint unit;
+
+   assert(num <= PIPE_MAX_SAMPLERS);
+
+   if (lp->num_textures == num &&
+       memcmp(lp->texture, texture, num * sizeof texture[0]) == 0)
+      return;   /* same pointers already bound */
+
+   draw_flush(lp->draw);
+
+   for (unit = 0; unit < PIPE_MAX_SAMPLERS; unit++) {
+      struct pipe_texture *tex = unit < num ? texture[unit] : NULL;
+      struct llvmpipe_texture *lp_tex;
+      struct lp_jit_texture *jit_tex;
+
+      pipe_texture_reference(&lp->texture[unit], tex);
+      lp_tex_tile_cache_set_texture(lp->tex_cache[unit], tex);
+      if (tex == NULL)
+         continue;
+      lp_tex = llvmpipe_texture(tex);
+      jit_tex = &lp->jit_context.textures[unit];
+      jit_tex->width = tex->width[0];
+      jit_tex->height = tex->height[0];
+      jit_tex->stride = lp_tex->stride[0];
+      if (!lp_tex->dt)
+         jit_tex->data = lp_tex->data;
+   }
+   lp->num_textures = num;
+   lp->dirty |= LP_NEW_TEXTURE;
+}
+
+
+/* Destroy sampler state: the object is simply handed to FREE(). */
+void
+llvmpipe_delete_sampler_state(struct pipe_context *pipe, void *sampler)
+{
+   FREE(sampler);
+}
+
+
+
diff --git a/src/gallium/drivers/llvmpipe/lp_state_surface.c b/src/gallium/drivers/llvmpipe/lp_state_surface.c
new file mode 100644
index 0000000000..177a26b7b1
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_surface.c
@@ -0,0 +1,106 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "lp_context.h"
+#include "lp_state.h"
+#include "lp_surface.h"
+#include "lp_tile_cache.h"
+
+#include "draw/draw_context.h"
+
+
+/**
+ * Install new framebuffer state.
+ *
+ * For every color buffer slot whose surface changes, the slot's tile cache
+ * is flushed and then pointed at the new surface.  When the depth/stencil
+ * surface changes, any zsbuf transfer held by the context is unmapped (if
+ * mapped) and destroyed, and the draw module is given an mrd value chosen
+ * from the new buffer's Z bit count.  Always sets LP_NEW_FRAMEBUFFER.
+ * Surface pointers are stored as-is; this function takes no references.
+ *
+ * XXX this might get moved someday
+ */
+void
+llvmpipe_set_framebuffer_state(struct pipe_context *pipe,
+                               const struct pipe_framebuffer_state *fb)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+   uint buf;
+
+   for (buf = 0; buf < PIPE_MAX_COLOR_BUFS; buf++) {
+      /* slot unchanged: leave its tile cache alone */
+      if (lp->framebuffer.cbufs[buf] == fb->cbufs[buf])
+         continue;
+
+      /* flush while the cache still refers to the old surface */
+      lp_flush_tile_cache(lp->cbuf_cache[buf]);
+
+      lp->framebuffer.cbufs[buf] = fb->cbufs[buf];
+      lp_tile_cache_set_surface(lp->cbuf_cache[buf], fb->cbufs[buf]);
+   }
+
+   lp->framebuffer.nr_cbufs = fb->nr_cbufs;
+
+   /* zbuf changing? */
+   if (lp->framebuffer.zsbuf != fb->zsbuf) {
+
+      /* release the transfer (and its mapping) held by the context */
+      if (lp->zsbuf_transfer) {
+         struct pipe_screen *screen = pipe->screen;
+
+         if (lp->zsbuf_map) {
+            screen->transfer_unmap(screen, lp->zsbuf_transfer);
+            lp->zsbuf_map = NULL;
+         }
+
+         screen->tex_transfer_destroy(lp->zsbuf_transfer);
+         lp->zsbuf_transfer = NULL;
+      }
+
+      lp->framebuffer.zsbuf = fb->zsbuf;
+
+      /* Tell draw module how deep the Z/depth buffer is */
+      if (lp->framebuffer.zsbuf) {
+         const int depth_bits =
+            pf_get_component_bits(lp->framebuffer.zsbuf->format,
+                                  PIPE_FORMAT_COMP_Z);
+         /* more than 16 Z bits selects the smaller value */
+         const double mrd = depth_bits > 16 ? 0.0000001 : 0.00002;
+
+         draw_set_mrd(lp->draw, mrd);
+      }
+   }
+
+   lp->framebuffer.width = fb->width;
+   lp->framebuffer.height = fb->height;
+
+   lp->dirty |= LP_NEW_FRAMEBUFFER;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_state_vertex.c b/src/gallium/drivers/llvmpipe/lp_state_vertex.c
new file mode 100644
index 0000000000..1a17631a4c
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_vertex.c
@@ -0,0 +1,73 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "lp_context.h"
+#include "lp_state.h"
+#include "lp_surface.h"
+
+#include "draw/draw_context.h"
+
+
+/* Copy `count` vertex elements into the context, mark LP_NEW_VERTEX dirty
+ * and pass the same array on to the draw module. */
+void
+llvmpipe_set_vertex_elements(struct pipe_context *pipe,
+                             unsigned count,
+                             const struct pipe_vertex_element *attribs)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+   const size_t nbytes = count * sizeof attribs[0];
+
+   assert(count <= PIPE_MAX_ATTRIBS);
+
+   memcpy(lp->vertex_element, attribs, nbytes);
+   lp->num_vertex_elements = count;
+   lp->dirty |= LP_NEW_VERTEX;
+   draw_set_vertex_elements(lp->draw, count, attribs);
+}
+
+
+/* Copy `count` vertex buffers into the context, mark LP_NEW_VERTEX dirty
+ * and pass the same array on to the draw module. */
+void
+llvmpipe_set_vertex_buffers(struct pipe_context *pipe,
+                            unsigned count,
+                            const struct pipe_vertex_buffer *buffers)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+
+   assert(count <= PIPE_MAX_ATTRIBS);
+
+   memcpy(lp->vertex_buffer, buffers, count * sizeof *buffers);
+   lp->num_vertex_buffers = count;
+   lp->dirty |= LP_NEW_VERTEX;
+   draw_set_vertex_buffers(lp->draw, count, buffers);
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_state_vs.c b/src/gallium/drivers/llvmpipe/lp_state_vs.c
new file mode 100644
index 0000000000..15c3029614
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_vs.c
@@ -0,0 +1,96 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 VMware, Inc.
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "pipe/p_defines.h"
+#include "tgsi/tgsi_parse.h"
+#include "util/u_memory.h"
+#include "draw/draw_context.h"
+
+#include "lp_context.h"
+#include "lp_state.h"
+
+
+/* Create vertex shader state: a private copy of the template's tokens plus
+ * the draw module's shader object.  Returns NULL if any step fails. */
+void *
+llvmpipe_create_vs_state(struct pipe_context *pipe,
+                         const struct pipe_shader_state *templ)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+   struct lp_vertex_shader *vs = CALLOC_STRUCT(lp_vertex_shader);
+
+   if (!vs)
+      return NULL;
+
+   /* copy shader tokens, the ones passed in will go away. */
+   vs->shader.tokens = tgsi_dup_tokens(templ->tokens);
+   if (!vs->shader.tokens)
+      goto fail;
+
+   vs->draw_data = draw_create_vertex_shader(lp->draw, templ);
+   if (!vs->draw_data)
+      goto fail;
+
+   return vs;
+
+fail:
+   /* only reached with vs allocated; members that were never assigned
+    * still hold whatever CALLOC_STRUCT() initialised them to */
+   FREE( (void *)vs->shader.tokens );
+   FREE( vs->draw_data );
+   FREE( vs );
+   return NULL;
+}
+
+
+/* Bind a vertex shader (NULL unbinds): record it in the context, give the
+ * draw module its draw_data (or NULL) and mark LP_NEW_VS dirty. */
+void
+llvmpipe_bind_vs_state(struct pipe_context *pipe, void *vs)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+   const struct lp_vertex_shader *shader = vs;
+
+   lp->vs = shader;
+   draw_bind_vertex_shader(lp->draw, shader ? shader->draw_data : NULL);
+   lp->dirty |= LP_NEW_VS;
+}
+
+
+/* Destroy a vertex shader along with its draw-module object and token copy. */
+void
+llvmpipe_delete_vs_state(struct pipe_context *pipe, void *vs)
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+   struct lp_vertex_shader *state = (struct lp_vertex_shader *)vs;
+
+   draw_delete_vertex_shader(llvmpipe->draw, state->draw_data);
+   FREE( (void *)state->shader.tokens );   /* tgsi_dup_tokens() copy */
+   FREE( state );
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_surface.c b/src/gallium/drivers/llvmpipe/lp_surface.c
new file mode 100644
index 0000000000..6110b0a193
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_surface.c
@@ -0,0 +1,50 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "util/u_rect.h"
+#include "lp_context.h"
+#include "lp_surface.h"
+
+
+/* surface_copy hook: forwards to util_surface_copy() with its second
+ * argument fixed to FALSE. */
+static void
+lp_surface_copy(struct pipe_context *pipe,
+                struct pipe_surface *dest, unsigned destx, unsigned desty,
+                struct pipe_surface *src, unsigned srcx, unsigned srcy,
+                unsigned width, unsigned height)
+{
+   util_surface_copy(pipe, FALSE, dest, destx, desty, src, srcx, srcy,
+                     width, height);
+}
+
+/* Install the context's surface_copy and surface_fill entry points. */
+void lp_init_surface_functions(struct llvmpipe_context *lp)
+{
+   lp->pipe.surface_fill = util_surface_fill;
+   lp->pipe.surface_copy = lp_surface_copy;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_surface.h b/src/gallium/drivers/llvmpipe/lp_surface.h
new file mode 100644
index 0000000000..4d78a53c4f
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_surface.h
@@ -0,0 +1,42 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef LP_SURFACE_H
+#define LP_SURFACE_H
+
+
+struct llvmpipe_context;
+
+
+extern void   /* installs the surface_copy / surface_fill hooks (lp_surface.c) */
+lp_init_surface_functions(struct llvmpipe_context *lp);
+
+
+#endif /* LP_SURFACE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_test.h b/src/gallium/drivers/llvmpipe/lp_test.h
new file mode 100644
index 0000000000..a88e110c66
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_test.h
@@ -0,0 +1,128 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Shared testing code.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#ifndef LP_TEST_H
+#define LP_TEST_H
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <float.h>
+
+#include <llvm-c/Core.h>
+#include <llvm-c/Analysis.h>
+#include <llvm-c/ExecutionEngine.h>
+#include <llvm-c/Target.h>
+#include <llvm-c/BitWriter.h>
+#include <llvm-c/Transforms/Scalar.h>
+
+#include "pipe/p_state.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_debug_dump.h"
+
+#include "lp_bld_type.h"
+
+
+void
+write_tsv_header(FILE *fp);   /* prints the TSV column-name row to fp */
+
+/* NOTE(review): definition not visible here; presumably runs n test cases. */
+boolean
+test_some(unsigned verbose, FILE *fp, unsigned long n);
+
+/* NOTE(review): presumably runs the complete set of test cases -- confirm. */
+boolean
+test_all(unsigned verbose, FILE *fp);
+
+
+/* Read the CPU time stamp counter on x86/x86-64; yields 0 elsewhere. */
+static INLINE uint64_t
+rdtsc(void)
+{
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   uint32_t eax, edx;
+   __asm__ __volatile__ ("rdtsc" : "=a"(eax), "=d"(edx));
+   return ((uint64_t)edx << 32) | (uint64_t)eax;
+#else
+   return 0;
+#endif
+}
+
+float
+random_float(void);   /* NOTE(review): value range not visible here */
+
+/* presumably prints a textual form of `type` to fp -- definition not shown */
+void
+dump_type(FILE *fp, struct lp_type type);
+
+/* presumably element `index` of the `type` vector at src, as a double */
+double
+read_elem(struct lp_type type, const void *src, unsigned index);
+
+/* presumably stores `src` into element `index` of the vector at dst */
+void
+write_elem(struct lp_type type, void *dst, unsigned index, double src);
+
+/* presumably fills element `index` of dst with a random value */
+void
+random_elem(struct lp_type type, void *dst, unsigned index);
+
+/* presumably read_elem() over a whole vector, into dst[] -- TODO confirm */
+void
+read_vec(struct lp_type type, const void *src, double *dst);
+
+/* presumably write_elem() over a whole vector, from src[] -- TODO confirm */
+void
+write_vec(struct lp_type type, void *dst, const double *src);
+
+/* presumably random_elem() over a whole vector -- TODO confirm */
+void
+random_vec(struct lp_type type, void *dst);
+
+/* presumably TRUE when res matches ref within eps -- TODO confirm */
+boolean
+compare_vec_with_eps(struct lp_type type, const void *res, const void *ref, double eps);
+
+/* presumably compare_vec_with_eps() with a type-derived eps -- TODO confirm */
+boolean
+compare_vec(struct lp_type type, const void *res, const void *ref);
+
+/* presumably prints the vector at src to fp -- TODO confirm */
+void
+dump_vec(FILE *fp, struct lp_type type, const void *src);
+
+
+#endif /* !LP_TEST_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_test_blend.c b/src/gallium/drivers/llvmpipe/lp_test_blend.c
new file mode 100644
index 0000000000..94b661dcba
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_test_blend.c
@@ -0,0 +1,881 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Unit tests for blend LLVM IR generation
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ *
+ * Blend computation code derived from code written by
+ * @author Brian Paul <brian@vmware.com>
+ */
+
+
+#include "lp_bld_type.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_blend.h"
+#include "lp_bld_debug.h"
+#include "lp_test.h"
+
+
+enum vector_mode
+{
+   AoS = 0,   /* add_blend_test() emits lp_build_blend_aos() on single vectors */
+   SoA = 1    /* add_blend_test() emits lp_build_blend_soa() on 4 vectors each */
+};
+
+/* Four-pointer signature (src, dst, con, res) used to call the blend test. */
+typedef void (*blend_test_ptr_t)(const void *src, const void *dst, const void *con, void *res);
+
+
+/*
+ * Write the column header line of the TSV report and flush it.  The column
+ * order is the order in which write_tsv_row() prints its fields.
+ * One name per field, separated by tabs, terminated by a newline.
+ */
+void
+write_tsv_header(FILE *fp)
+{
+   /* adjacent literals concatenate into the single header line */
+   static const char columns[] =
+      "result\tcycles_per_channel\tmode\ttype\t"
+      "sep_func\tsep_src_factor\tsep_dst_factor\t"
+      "rgb_func\trgb_src_factor\trgb_dst_factor\t"
+      "alpha_func\talpha_src_factor\talpha_dst_factor\n";
+
+   /* the text has no conversions, so fputs() writes the same bytes */
+   fputs(columns, fp);
+
+   fflush(fp);
+}
+
+
+/* Append one tab-separated result row for a blend test case and flush `fp`. */
+static void
+write_tsv_row(FILE *fp,
+              const struct pipe_blend_state *blend,
+              enum vector_mode mode,
+              struct lp_type type,
+              double cycles,
+              boolean success)
+{
+   const char *tag =
+      type.floating ? "f" : (type.fixed ? "h" : (type.sign ? "s" : "u"));
+   const boolean sep_func = blend->rgb_func != blend->alpha_func;
+   const boolean sep_src = blend->rgb_src_factor != blend->alpha_src_factor;
+   const boolean sep_dst = blend->rgb_dst_factor != blend->alpha_dst_factor;
+
+   fprintf(fp, "%s\t", success ? "pass" : "fail");
+
+   switch (mode) {
+   case AoS:
+      fprintf(fp, "%.1f\t", cycles / type.length);
+      fprintf(fp, "aos\t");
+      break;
+   case SoA:   /* SoA blends 4 vectors (one per channel), hence 4 * length */
+      fprintf(fp, "%.1f\t", cycles / (4 * type.length));
+      fprintf(fp, "soa\t");
+      break;
+   }
+
+   fprintf(fp, "%s%u%sx%u\t",
+           tag, type.width, type.norm ? "n" : "", type.length);
+   fprintf(fp, "%s\t%s\t%s\t",
+           sep_func ? "true" : "false",
+           sep_src ? "true" : "false",
+           sep_dst ? "true" : "false");
+   fprintf(fp, "%s\t%s\t%s\t%s\t%s\t%s\n",
+           debug_dump_blend_func(blend->rgb_func, TRUE),
+           debug_dump_blend_factor(blend->rgb_src_factor, TRUE),
+           debug_dump_blend_factor(blend->rgb_dst_factor, TRUE),
+           debug_dump_blend_func(blend->alpha_func, TRUE),
+           debug_dump_blend_factor(blend->alpha_src_factor, TRUE),
+           debug_dump_blend_factor(blend->alpha_dst_factor, TRUE));
+   fflush(fp);
+}
+
+
+/* Print a one-line description of a blend test case to `fp` and flush:
+ * vector mode, element type and the six blend func/factor settings. */
+static void
+dump_blend_type(FILE *fp,
+                const struct pipe_blend_state *blend,
+                enum vector_mode mode,
+                struct lp_type type)
+{
+   const char *tag =
+      type.floating ? "f" : (type.fixed ? "h" : (type.sign ? "s" : "u"));
+
+   /* any non-zero mode prints as "soa" */
+   fputs(mode ? "soa" : "aos", fp);
+   fprintf(fp, " type=%s%u%sx%u",
+           tag, type.width, type.norm ? "n" : "", type.length);
+
+   fprintf(fp, " %s=%s %s=%s %s=%s %s=%s %s=%s %s=%s",
+           "rgb_func",         debug_dump_blend_func(blend->rgb_func, TRUE),
+           "rgb_src_factor",   debug_dump_blend_factor(blend->rgb_src_factor, TRUE),
+           "rgb_dst_factor",   debug_dump_blend_factor(blend->rgb_dst_factor, TRUE),
+           "alpha_func",       debug_dump_blend_func(blend->alpha_func, TRUE),
+           "alpha_src_factor", debug_dump_blend_factor(blend->alpha_src_factor, TRUE),
+           "alpha_dst_factor", debug_dump_blend_factor(blend->alpha_dst_factor, TRUE));
+   fputs(" ...\n", fp);
+   fflush(fp);
+}
+
+
+/* Add a function named "test" to `module` and return it.  The function takes
+ * four vector pointers (src, dst, const, res), blends as `blend` dictates and
+ * stores the result through res.  AoS uses one vector per pointer; SoA
+ * indexes four consecutive vectors (r, g, b, a) from each pointer. */
+static LLVMValueRef
+add_blend_test(LLVMModuleRef module,
+               const struct pipe_blend_state *blend,
+               enum vector_mode mode,
+               struct lp_type type)
+{
+   LLVMTypeRef vec_type;
+   LLVMTypeRef args[4];
+   LLVMValueRef func;
+   LLVMValueRef src_ptr;
+   LLVMValueRef dst_ptr;
+   LLVMValueRef const_ptr;
+   LLVMValueRef res_ptr;
+   LLVMBasicBlockRef block;
+   LLVMBuilderRef builder;
+
+   vec_type = lp_build_vec_type(type);
+
+   args[3] = args[2] = args[1] = args[0] = LLVMPointerType(vec_type, 0);
+   func = LLVMAddFunction(module, "test", LLVMFunctionType(LLVMVoidType(), args, 4, 0));
+   LLVMSetFunctionCallConv(func, LLVMCCallConv);
+   src_ptr = LLVMGetParam(func, 0);
+   dst_ptr = LLVMGetParam(func, 1);
+   const_ptr = LLVMGetParam(func, 2);
+   res_ptr = LLVMGetParam(func, 3);
+
+   block = LLVMAppendBasicBlock(func, "entry");
+   builder = LLVMCreateBuilder();
+   LLVMPositionBuilderAtEnd(builder, block);
+
+   if (mode == AoS) {
+      LLVMValueRef src;
+      LLVMValueRef dst;
+      LLVMValueRef con;
+      LLVMValueRef res;
+
+      src = LLVMBuildLoad(builder, src_ptr, "src");
+      dst = LLVMBuildLoad(builder, dst_ptr, "dst");
+      con = LLVMBuildLoad(builder, const_ptr, "const");
+
+      res = lp_build_blend_aos(builder, blend, type, src, dst, con, 3);
+      lp_build_name(res, "res");
+      LLVMBuildStore(builder, res, res_ptr);
+   }
+
+   if (mode == SoA) {
+      LLVMValueRef src[4];
+      LLVMValueRef dst[4];
+      LLVMValueRef con[4];
+      LLVMValueRef res[4];
+      unsigned i;
+
+      for(i = 0; i < 4; ++i) {
+         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+         src[i] = LLVMBuildLoad(builder, LLVMBuildGEP(builder, src_ptr, &index, 1, ""), "");
+         dst[i] = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dst_ptr, &index, 1, ""), "");
+         con[i] = LLVMBuildLoad(builder, LLVMBuildGEP(builder, const_ptr, &index, 1, ""), "");
+         lp_build_name(src[i], "src.%c", "rgba"[i]);
+         lp_build_name(con[i], "con.%c", "rgba"[i]);
+         lp_build_name(dst[i], "dst.%c", "rgba"[i]);
+      }
+
+      lp_build_blend_soa(builder, blend, type, src, dst, con, res);
+
+      for(i = 0; i < 4; ++i) {
+         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+         lp_build_name(res[i], "res.%c", "rgba"[i]);
+         LLVMBuildStore(builder, res[i], LLVMBuildGEP(builder, res_ptr, &index, 1, ""));
+      }
+   }
+
+   LLVMBuildRetVoid(builder);
+
+   LLVMDisposeBuilder(builder);
+   return func;
+}
+
+
+/** Add and limit result to ceiling of 1.0 (no lower clamp; R is expanded 3 times) */
+#define ADD_SAT(R, A, B) \
+do { \
+   R = (A) + (B);  if (R > 1.0f) R = 1.0f; \
+} while (0)
+
+/** Subtract and limit result to floor of 0.0 (no upper clamp; R is expanded 3 times) */
+#define SUB_SAT(R, A, B) \
+do { \
+   R = (A) - (B);  if (R < 0.0f) R = 0.0f; \
+} while (0)
+
+
+/**
+ * Reference blend term for one RGBA pixel: term[c] = factor[c] * weight[c], where
+ * the RGB weights are selected by rgb_factor and the alpha weight by alpha_factor.
+ */
+static void
+compute_blend_ref_term(unsigned rgb_factor,
+                       unsigned alpha_factor,
+                       const double *factor,
+                       const double *src,
+                       const double *dst,
+                       const double *con,
+                       double *term)
+{
+   double temp;
+
+   switch (rgb_factor) { /* RGB weights */
+   case PIPE_BLENDFACTOR_ONE:
+      term[0] = factor[0]; /* R */
+      term[1] = factor[1]; /* G */
+      term[2] = factor[2]; /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      term[0] = factor[0] * src[0]; /* R */
+      term[1] = factor[1] * src[1]; /* G */
+      term[2] = factor[2] * src[2]; /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      term[0] = factor[0] * src[3]; /* R */
+      term[1] = factor[1] * src[3]; /* G */
+      term[2] = factor[2] * src[3]; /* B */
+      break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      term[0] = factor[0] * dst[0]; /* R */
+      term[1] = factor[1] * dst[1]; /* G */
+      term[2] = factor[2] * dst[2]; /* B */
+      break;
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      term[0] = factor[0] * dst[3]; /* R */
+      term[1] = factor[1] * dst[3]; /* G */
+      term[2] = factor[2] * dst[3]; /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      temp = MIN2(src[3], 1.0f - dst[3]);
+      term[0] = factor[0] * temp; /* R */
+      term[1] = factor[1] * temp; /* G */
+      term[2] = factor[2] * temp; /* B */
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      term[0] = factor[0] * con[0]; /* R */
+      term[1] = factor[1] * con[1]; /* G */
+      term[2] = factor[2] * con[2]; /* B */
+      break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      term[0] = factor[0] * con[3]; /* R */
+      term[1] = factor[1] * con[3]; /* G */
+      term[2] = factor[2] * con[3]; /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+      assert(0); /* to do */
+      break;
+   case PIPE_BLENDFACTOR_ZERO:
+      term[0] = 0.0f; /* R */
+      term[1] = 0.0f; /* G */
+      term[2] = 0.0f; /* B */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      term[0] = factor[0] * (1.0f - src[0]); /* R */
+      term[1] = factor[1] * (1.0f - src[1]); /* G */
+      term[2] = factor[2] * (1.0f - src[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      term[0] = factor[0] * (1.0f - src[3]); /* R */
+      term[1] = factor[1] * (1.0f - src[3]); /* G */
+      term[2] = factor[2] * (1.0f - src[3]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      term[0] = factor[0] * (1.0f - dst[3]); /* R */
+      term[1] = factor[1] * (1.0f - dst[3]); /* G */
+      term[2] = factor[2] * (1.0f - dst[3]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      term[0] = factor[0] * (1.0f - dst[0]); /* R */
+      term[1] = factor[1] * (1.0f - dst[1]); /* G */
+      term[2] = factor[2] * (1.0f - dst[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      term[0] = factor[0] * (1.0f - con[0]); /* R */
+      term[1] = factor[1] * (1.0f - con[1]); /* G */
+      term[2] = factor[2] * (1.0f - con[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      term[0] = factor[0] * (1.0f - con[3]); /* R */
+      term[1] = factor[1] * (1.0f - con[3]); /* G */
+      term[2] = factor[2] * (1.0f - con[3]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+      assert(0); /* to do */
+      break;
+   default:
+      assert(0);
+   }
+
+   /*
+    * Alpha weight. The SRC1 factors are not handled and reach the default assert.
+    */
+   switch (alpha_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      term[3] = factor[3]; /* A */
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      term[3] = factor[3] * src[3]; /* A */
+      break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      term[3] = factor[3] * dst[3]; /* A */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      term[3] = factor[3]; /* A: the saturate weight is 1 for alpha, so scale 'factor', not always src */
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      term[3] = factor[3] * con[3]; /* A */
+      break;
+   case PIPE_BLENDFACTOR_ZERO:
+      term[3] = 0.0f; /* A */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      term[3] = factor[3] * (1.0f - src[3]); /* A */
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      term[3] = factor[3] * (1.0f - dst[3]); /* A */
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      term[3] = factor[3] * (1.0f - con[3]); /* A */
+      break;
+   default:
+      assert(0);
+   }
+}
+
+
+static void
+compute_blend_ref(const struct pipe_blend_state *blend, /* supplies the factors and funcs read below */
+                  const double *src, /* indexed [0..3]; the channel comments below label these R, G, B, A */
+                  const double *dst, /* same layout as src */
+                  const double *con, /* same layout as src; forwarded to compute_blend_ref_term() */
+                  double *res) /* receives the four combined channels */
+{ /* Reference blend of one pixel: build a src term and a dst term, then combine them per blend func. */
+   double src_term[4];
+   double dst_term[4];
+   /* The colour being scaled is passed as 'factor': src for the source term, dst for the destination term. */
+   compute_blend_ref_term(blend->rgb_src_factor, blend->alpha_src_factor, src, src, dst, con, src_term);
+   compute_blend_ref_term(blend->rgb_dst_factor, blend->alpha_dst_factor, dst, src, dst, con, dst_term);
+
+   /*
+    * Combine RGB terms (ADD is clamped at 1.0, both subtractions at 0.0)
+    */
+   switch (blend->rgb_func) {
+   case PIPE_BLEND_ADD:
+      ADD_SAT(res[0], src_term[0], dst_term[0]); /* R */
+      ADD_SAT(res[1], src_term[1], dst_term[1]); /* G */
+      ADD_SAT(res[2], src_term[2], dst_term[2]); /* B */
+      break;
+   case PIPE_BLEND_SUBTRACT: /* src term minus dst term */
+      SUB_SAT(res[0], src_term[0], dst_term[0]); /* R */
+      SUB_SAT(res[1], src_term[1], dst_term[1]); /* G */
+      SUB_SAT(res[2], src_term[2], dst_term[2]); /* B */
+      break;
+   case PIPE_BLEND_REVERSE_SUBTRACT: /* dst term minus src term */
+      SUB_SAT(res[0], dst_term[0], src_term[0]); /* R */
+      SUB_SAT(res[1], dst_term[1], src_term[1]); /* G */
+      SUB_SAT(res[2], dst_term[2], src_term[2]); /* B */
+      break;
+   case PIPE_BLEND_MIN: /* taken over the factor-scaled terms, not the raw colours */
+      res[0] = MIN2(src_term[0], dst_term[0]); /* R */
+      res[1] = MIN2(src_term[1], dst_term[1]); /* G */
+      res[2] = MIN2(src_term[2], dst_term[2]); /* B */
+      break;
+   case PIPE_BLEND_MAX: /* taken over the factor-scaled terms, not the raw colours */
+      res[0] = MAX2(src_term[0], dst_term[0]); /* R */
+      res[1] = MAX2(src_term[1], dst_term[1]); /* G */
+      res[2] = MAX2(src_term[2], dst_term[2]); /* B */
+      break;
+   default:
+      assert(0);
+   }
+
+   /*
+    * Combine A terms (same operations as above, selected by alpha_func)
+    */
+   switch (blend->alpha_func) {
+   case PIPE_BLEND_ADD:
+      ADD_SAT(res[3], src_term[3], dst_term[3]); /* A */
+      break;
+   case PIPE_BLEND_SUBTRACT:
+      SUB_SAT(res[3], src_term[3], dst_term[3]); /* A */
+      break;
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      SUB_SAT(res[3], dst_term[3], src_term[3]); /* A */
+      break;
+   case PIPE_BLEND_MIN:
+      res[3] = MIN2(src_term[3], dst_term[3]); /* A */
+      break;
+   case PIPE_BLEND_MAX:
+      res[3] = MAX2(src_term[3], dst_term[3]); /* A */
+      break;
+   default:
+      assert(0);
+   }
+}
+
+
+/**
+ * JIT-compile the blend test function for one blend state / vector mode / type
+ * combination, run it on inputs from random_vec(), compare each result with
+ * compute_blend_ref() and time the calls with rdtsc(). A TSV row is written
+ * to fp when fp is non-NULL; a mismatch dumps the module and aborts.
+ */
+static boolean
+test_one(unsigned verbose,
+         FILE *fp,
+         const struct pipe_blend_state *blend,
+         enum vector_mode mode,
+         struct lp_type type)
+{
+   LLVMModuleRef module = NULL;
+   LLVMValueRef func = NULL;
+   LLVMExecutionEngineRef engine = NULL;
+   LLVMModuleProviderRef provider = NULL;
+   LLVMPassManagerRef pass = NULL;
+   char *error = NULL;
+   blend_test_ptr_t blend_test_ptr;
+   boolean success;
+   const unsigned n = 32;
+   int64_t cycles[n];
+   double cycles_avg = 0.0;
+   unsigned i, j;
+
+   if(verbose >= 1)
+      dump_blend_type(stdout, blend, mode, type);
+
+   module = LLVMModuleCreateWithName("test");
+
+   func = add_blend_test(module, blend, mode, type);
+
+   if(LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) {
+      LLVMDumpModule(module);
+      abort();
+   }
+   LLVMDisposeMessage(error);
+
+   provider = LLVMCreateModuleProviderForExistingModule(module);
+   if (LLVMCreateJITCompiler(&engine, provider, 1, &error)) {
+      if(verbose < 1)
+         dump_blend_type(stderr, blend, mode, type);
+      fprintf(stderr, "%s\n", error);
+      LLVMDisposeMessage(error);
+      abort();
+   }
+
+#if 0
+   pass = LLVMCreatePassManager();
+   LLVMAddTargetData(LLVMGetExecutionEngineTargetData(engine), pass);
+   /* These are the passes currently listed in llvm-c/Transforms/Scalar.h,
+    * but there are more on SVN. */
+   LLVMAddConstantPropagationPass(pass);
+   LLVMAddInstructionCombiningPass(pass);
+   LLVMAddPromoteMemoryToRegisterPass(pass);
+   LLVMAddGVNPass(pass);
+   LLVMAddCFGSimplificationPass(pass);
+   LLVMRunPassManager(pass, module);
+#else
+   (void)pass;
+#endif
+
+   if(verbose >= 2)
+      LLVMDumpModule(module);
+
+   blend_test_ptr = (blend_test_ptr_t)LLVMGetPointerToGlobal(engine, func);
+
+   if(verbose >= 2)
+      lp_disassemble(blend_test_ptr);
+
+   success = TRUE;
+   for(i = 0; i < n && success; ++i) {
+      if(mode == AoS) {
+         uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
+         uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
+         uint8_t con[LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
+         uint8_t res[LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
+         uint8_t ref[LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
+         int64_t start_counter = 0, end_counter = 0;
+
+         random_vec(type, src);
+         random_vec(type, dst);
+         random_vec(type, con);
+
+         {
+            double fsrc[LP_MAX_VECTOR_LENGTH];
+            double fdst[LP_MAX_VECTOR_LENGTH];
+            double fcon[LP_MAX_VECTOR_LENGTH];
+            double fref[LP_MAX_VECTOR_LENGTH];
+
+            read_vec(type, src, fsrc);
+            read_vec(type, dst, fdst);
+            read_vec(type, con, fcon);
+
+            for(j = 0; j < type.length; j += 4)
+               compute_blend_ref(blend, fsrc + j, fdst + j, fcon + j, fref + j);
+
+            write_vec(type, ref, fref);
+         }
+
+         start_counter = rdtsc();
+         blend_test_ptr(src, dst, con, res);
+         end_counter = rdtsc();
+
+         cycles[i] = end_counter - start_counter;
+
+         if(!compare_vec(type, res, ref)) {
+            success = FALSE;
+
+            if(verbose < 1)
+               dump_blend_type(stderr, blend, mode, type);
+            fprintf(stderr, "MISMATCH\n");
+
+            fprintf(stderr, "  Src: ");
+            dump_vec(stderr, type, src);
+            fprintf(stderr, "\n");
+
+            fprintf(stderr, "  Dst: ");
+            dump_vec(stderr, type, dst);
+            fprintf(stderr, "\n");
+
+            fprintf(stderr, "  Con: ");
+            dump_vec(stderr, type, con);
+            fprintf(stderr, "\n");
+
+            fprintf(stderr, "  Res: ");
+            dump_vec(stderr, type, res);
+            fprintf(stderr, "\n");
+
+            fprintf(stderr, "  Ref: ");
+            dump_vec(stderr, type, ref);
+            fprintf(stderr, "\n");
+         }
+      }
+
+      if(mode == SoA) {
+         const unsigned stride = type.length*type.width/8;
+         uint8_t src[4*LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
+         uint8_t dst[4*LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
+         uint8_t con[4*LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
+         uint8_t res[4*LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
+         uint8_t ref[4*LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
+         int64_t start_counter = 0, end_counter = 0;
+         boolean mismatch = FALSE;
+
+         for(j = 0; j < 4; ++j) {
+            random_vec(type, src + j*stride);
+            random_vec(type, dst + j*stride);
+            random_vec(type, con + j*stride);
+         }
+
+         {
+            double fsrc[4], fdst[4], fcon[4], fref[4];
+            unsigned k;
+
+            for(k = 0; k < type.length; ++k) {
+               for(j = 0; j < 4; ++j) {
+                  fsrc[j] = read_elem(type, src + j*stride, k);
+                  fdst[j] = read_elem(type, dst + j*stride, k);
+                  fcon[j] = read_elem(type, con + j*stride, k);
+               }
+
+               compute_blend_ref(blend, fsrc, fdst, fcon, fref);
+
+               for(j = 0; j < 4; ++j)
+                  write_elem(type, ref + j*stride, k, fref[j]);
+            }
+         }
+
+         start_counter = rdtsc();
+         blend_test_ptr(src, dst, con, res);
+         end_counter = rdtsc();
+
+         cycles[i] = end_counter - start_counter;
+
+         for (j = 0; j < 4; ++j)
+            if(!compare_vec(type, res + j*stride, ref + j*stride))
+               mismatch = TRUE;
+
+         if (mismatch) {
+            success = FALSE;
+
+            if(verbose < 1)
+               dump_blend_type(stderr, blend, mode, type);
+            fprintf(stderr, "MISMATCH\n");
+            for(j = 0; j < 4; ++j) {
+               char channel = "RGBA"[j];
+               fprintf(stderr, "  Src%c: ", channel);
+               dump_vec(stderr, type, src + j*stride);
+               fprintf(stderr, "\n");
+
+               fprintf(stderr, "  Dst%c: ", channel);
+               dump_vec(stderr, type, dst + j*stride);
+               fprintf(stderr, "\n");
+
+               fprintf(stderr, "  Con%c: ", channel);
+               dump_vec(stderr, type, con + j*stride);
+               fprintf(stderr, "\n");
+
+               fprintf(stderr, "  Res%c: ", channel);
+               dump_vec(stderr, type, res + j*stride);
+               fprintf(stderr, "\n");
+
+               fprintf(stderr, "  Ref%c: ", channel);
+               dump_vec(stderr, type, ref + j*stride);
+               fprintf(stderr, "\n");
+            }
+         }
+      }
+   }
+
+   /*
+    * The cycle counter output is noisy: occasional outliers (IRQs perhaps?)
+    * would bias the average, so samples further than four standard deviations
+    * from the mean are discarded. Only the samples actually taken are used.
+    */
+   {
+      const unsigned num_samples = i; /* the run loop stops early on a mismatch */
+      double sum = 0.0, sum2 = 0.0;
+      double avg, var, std;
+      unsigned m = 0;
+
+      for(i = 0; i < num_samples; ++i) {
+         sum += (double)cycles[i];
+         sum2 += (double)cycles[i]*(double)cycles[i];
+      }
+
+      avg = sum/num_samples;
+      var = (sum2 - num_samples*avg*avg)/num_samples;
+      std = var > 0.0 ? sqrt(var) : 0.0; /* rounding can push var slightly below zero */
+
+      sum = 0.0;
+      for(i = 0; i < num_samples; ++i) {
+         if(fabs(cycles[i] - avg) <= 4.0*std) {
+            sum += (double)cycles[i];
+            ++m;
+         }
+      }
+
+      cycles_avg = m ? sum/m : avg;
+   }
+
+   if(fp)
+      write_tsv_row(fp, blend, mode, type, cycles_avg, success);
+
+   if (!success) {
+      if(verbose < 2)
+         LLVMDumpModule(module);
+      LLVMWriteBitcodeToFile(module, "blend.bc");
+      fprintf(stderr, "blend.bc written\n");
+      fprintf(stderr, "Invoke as \"llc -o - blend.bc\"\n");
+      abort();
+   }
+
+   LLVMFreeMachineCodeForFunction(engine, func);
+
+   LLVMDisposeExecutionEngine(engine);
+   if(pass)
+      LLVMDisposePassManager(pass);
+
+   return success;
+}
+
+
+const unsigned
+blend_factors[] = { /* blend factors that test_all() and test_some() pick from */
+   PIPE_BLENDFACTOR_ZERO,
+   PIPE_BLENDFACTOR_ONE,
+   PIPE_BLENDFACTOR_SRC_COLOR,
+   PIPE_BLENDFACTOR_SRC_ALPHA,
+   PIPE_BLENDFACTOR_DST_COLOR,
+   PIPE_BLENDFACTOR_DST_ALPHA,
+   PIPE_BLENDFACTOR_CONST_COLOR,
+   PIPE_BLENDFACTOR_CONST_ALPHA,
+#if 0 /* disabled: compute_blend_ref_term() only asserts for the SRC1 factors */
+   PIPE_BLENDFACTOR_SRC1_COLOR,
+   PIPE_BLENDFACTOR_SRC1_ALPHA,
+#endif
+   PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE, /* the test drivers skip this one as a destination factor */
+   PIPE_BLENDFACTOR_INV_SRC_COLOR,
+   PIPE_BLENDFACTOR_INV_SRC_ALPHA,
+   PIPE_BLENDFACTOR_INV_DST_COLOR,
+   PIPE_BLENDFACTOR_INV_DST_ALPHA,
+   PIPE_BLENDFACTOR_INV_CONST_COLOR,
+   PIPE_BLENDFACTOR_INV_CONST_ALPHA,
+#if 0 /* disabled for the same reason as the SRC1 factors above */
+   PIPE_BLENDFACTOR_INV_SRC1_COLOR,
+   PIPE_BLENDFACTOR_INV_SRC1_ALPHA,
+#endif
+};
+
+/* Blend functions used for both the RGB and the alpha combine step. */
+const unsigned
+blend_funcs[] = {
+   PIPE_BLEND_ADD,
+   PIPE_BLEND_SUBTRACT,
+   PIPE_BLEND_REVERSE_SUBTRACT,
+   PIPE_BLEND_MIN,
+   PIPE_BLEND_MAX
+};
+
+/* Vector types the blend function is generated and tested for. */
+const struct lp_type blend_types[] = {
+   /* float, fixed,  sign,  norm, width, len */
+   {   TRUE, FALSE, FALSE,  TRUE,    32,   4 }, /* f32 x 4 */
+   {  FALSE, FALSE, FALSE,  TRUE,     8,  16 }, /* u8n x 16 */
+};
+
+/* Element counts of the three tables above. */
+const unsigned num_funcs = sizeof(blend_funcs)/sizeof(blend_funcs[0]);
+const unsigned num_factors = sizeof(blend_factors)/sizeof(blend_factors[0]);
+const unsigned num_types = sizeof(blend_types)/sizeof(blend_types[0]);
+
+
+/**
+ * Exhaustive run: call test_one() for every func / factor / mode / type combination
+ * the loops enumerate (each dst factor only ranges up to its src factor). TRUE if all pass.
+ */
+boolean
+test_all(unsigned verbose, FILE *fp)
+{
+   const unsigned *rgb_func, *alpha_func;
+   const unsigned *rgb_src_factor, *rgb_dst_factor;
+   const unsigned *alpha_src_factor, *alpha_dst_factor;
+   struct pipe_blend_state blend;
+   enum vector_mode mode;
+   const struct lp_type *type;
+   boolean success = TRUE;
+
+   for(rgb_func = blend_funcs; rgb_func < &blend_funcs[num_funcs]; ++rgb_func) {
+      for(alpha_func = blend_funcs; alpha_func < &blend_funcs[num_funcs]; ++alpha_func) {
+         for(rgb_src_factor = blend_factors; rgb_src_factor < &blend_factors[num_factors]; ++rgb_src_factor) {
+            for(rgb_dst_factor = blend_factors; rgb_dst_factor <= rgb_src_factor; ++rgb_dst_factor) {
+               for(alpha_src_factor = blend_factors; alpha_src_factor < &blend_factors[num_factors]; ++alpha_src_factor) {
+                  for(alpha_dst_factor = blend_factors; alpha_dst_factor <= alpha_src_factor; ++alpha_dst_factor) {
+                     for(mode = 0; mode < 2; ++mode) {
+                        for(type = blend_types; type < &blend_types[num_types]; ++type) {
+                           /* SRC_ALPHA_SATURATE is never tested as a destination factor */
+                           if(*rgb_dst_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
+                              *alpha_dst_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE)
+                              continue;
+
+                           memset(&blend, 0, sizeof blend);
+                           blend.blend_enable      = 1;
+                           blend.rgb_func          = *rgb_func;
+                           blend.rgb_src_factor    = *rgb_src_factor;
+                           blend.rgb_dst_factor    = *rgb_dst_factor;
+                           blend.alpha_func        = *alpha_func;
+                           blend.alpha_src_factor  = *alpha_src_factor;
+                           blend.alpha_dst_factor  = *alpha_dst_factor;
+                           blend.colormask         = PIPE_MASK_RGBA;
+
+                           if(!test_one(verbose, fp, &blend, mode, *type))
+                             success = FALSE;
+                        }
+                     }
+                  }
+               }
+            }
+         }
+      }
+   }
+
+   return success;
+}
+
+
+/**
+ * Randomised run: call test_one() n times with funcs, factors, mode and type
+ * drawn via rand() from the tables above. Returns TRUE if every call passed.
+ */
+boolean
+test_some(unsigned verbose, FILE *fp, unsigned long n)
+{
+   const unsigned *rgb_func, *alpha_func;
+   const unsigned *rgb_src_factor, *rgb_dst_factor;
+   const unsigned *alpha_src_factor, *alpha_dst_factor;
+   struct pipe_blend_state blend;
+   enum vector_mode mode;
+   const struct lp_type *type;
+   unsigned long i;
+   boolean success = TRUE;
+
+   for(i = 0; i < n; ++i) {
+      rgb_func = &blend_funcs[rand() % num_funcs];
+      alpha_func = &blend_funcs[rand() % num_funcs];
+      rgb_src_factor = &blend_factors[rand() % num_factors];
+      alpha_src_factor = &blend_factors[rand() % num_factors];
+      /* SRC_ALPHA_SATURATE is never tested as a destination factor: redraw until another comes up */
+      do {
+         rgb_dst_factor = &blend_factors[rand() % num_factors];
+      } while(*rgb_dst_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE);
+
+      do {
+         alpha_dst_factor = &blend_factors[rand() % num_factors];
+      } while(*alpha_dst_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE);
+
+      mode = rand() & 1;
+      type = &blend_types[rand() % num_types];
+
+      memset(&blend, 0, sizeof blend);
+      blend.blend_enable      = 1;
+      blend.rgb_func          = *rgb_func;
+      blend.rgb_src_factor    = *rgb_src_factor;
+      blend.rgb_dst_factor    = *rgb_dst_factor;
+      blend.alpha_func        = *alpha_func;
+      blend.alpha_src_factor  = *alpha_src_factor;
+      blend.alpha_dst_factor  = *alpha_dst_factor;
+      blend.colormask         = PIPE_MASK_RGBA;
+
+      if(!test_one(verbose, fp, &blend, mode, *type))
+        success = FALSE;
+   }
+
+   return success;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_test_conv.c b/src/gallium/drivers/llvmpipe/lp_test_conv.c
new file mode 100644
index 0000000000..9dcf58e5dc
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_test_conv.c
@@ -0,0 +1,427 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Unit tests for type conversion.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_conv.h"
+#include "lp_bld_debug.h"
+#include "lp_test.h"
+
+
+typedef void (*conv_test_ptr_t)(const void *src, void *dst); /* the generated function stores its results through dst */
+
+
+void
+write_tsv_header(FILE *fp)
+{
+   /* Column names, in the order write_tsv_row() emits its fields. */
+   static const char header[] =
+      "result\t" "cycles_per_channel\t" "src_type\t" "dst_type\n";
+
+   fputs(header, fp);
+
+   fflush(fp);
+}
+
+
+static void
+write_tsv_row(FILE *fp,
+              struct lp_type src_type,
+              struct lp_type dst_type,
+              double cycles,
+              boolean success)
+{
+   /* Cycles are reported per channel of the longer of the two vectors. */
+   const double cycles_per_channel = cycles / MAX2(src_type.length, dst_type.length);
+
+   fprintf(fp, "%s\t%.1f\t", success ? "pass" : "fail", cycles_per_channel);
+
+   dump_type(fp, src_type);   /* src_type column */
+   fprintf(fp, "\t");
+   dump_type(fp, dst_type);   /* dst_type column */
+   fprintf(fp, "\n");
+
+   /* Push the row out right away. */
+   fflush(fp);
+}
+
+
+static void
+dump_conv_types(FILE *fp,
+                struct lp_type src_type,
+                struct lp_type dst_type)
+{
+   /* Emits one line: "src_type=<type> dst_type=<type> ...". */
+   fputs("src_type=", fp);
+   dump_type(fp, src_type);
+   fputs(" dst_type=", fp);
+   dump_type(fp, dst_type);
+
+   fputs(" ...\n", fp);
+   fflush(fp);
+}
+
+
+/**
+ * Add function "test" to module: load num_srcs src_type vectors from argument 0,
+ * convert them with lp_build_conv(), store num_dsts dst_type vectors to argument 1.
+ */
+static LLVMValueRef
+add_conv_test(LLVMModuleRef module,
+              struct lp_type src_type, unsigned num_srcs,
+              struct lp_type dst_type, unsigned num_dsts)
+{
+   LLVMTypeRef args[2];
+   LLVMValueRef func, src_ptr, dst_ptr;
+   LLVMBasicBlockRef block;
+   LLVMBuilderRef builder;
+   LLVMValueRef src[LP_MAX_VECTOR_LENGTH], dst[LP_MAX_VECTOR_LENGTH];
+   unsigned i;
+
+   assert(num_srcs <= LP_MAX_VECTOR_LENGTH && num_dsts <= LP_MAX_VECTOR_LENGTH); /* src[]/dst[] capacity */
+
+   args[0] = LLVMPointerType(lp_build_vec_type(src_type), 0);
+   args[1] = LLVMPointerType(lp_build_vec_type(dst_type), 0);
+
+   func = LLVMAddFunction(module, "test", LLVMFunctionType(LLVMVoidType(), args, 2, 0));
+   LLVMSetFunctionCallConv(func, LLVMCCallConv);
+   src_ptr = LLVMGetParam(func, 0);
+   dst_ptr = LLVMGetParam(func, 1);
+
+   block = LLVMAppendBasicBlock(func, "entry");
+   builder = LLVMCreateBuilder();
+   LLVMPositionBuilderAtEnd(builder, block);
+
+   for(i = 0; i < num_srcs; ++i) {
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      src[i] = LLVMBuildLoad(builder, LLVMBuildGEP(builder, src_ptr, &index, 1, ""), "");
+   }
+
+   lp_build_conv(builder, src_type, dst_type, src, num_srcs, dst, num_dsts);
+
+   for(i = 0; i < num_dsts; ++i) {
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      LLVMBuildStore(builder, dst[i], LLVMBuildGEP(builder, dst_ptr, &index, 1, ""));
+   }
+
+   LLVMBuildRetVoid(builder);
+   LLVMDisposeBuilder(builder);
+   return func;
+}
+
+
+/**
+ * JIT-compile a src_type -> dst_type conversion built by add_conv_test(), run
+ * it on inputs from random_vec() and compare against the same values written
+ * with write_vec(), within eps. Writes a TSV row when fp is non-NULL.
+ */
+static boolean
+test_one(unsigned verbose,
+         FILE *fp,
+         struct lp_type src_type,
+         struct lp_type dst_type)
+{
+   LLVMModuleRef module = NULL;
+   LLVMValueRef func = NULL;
+   LLVMExecutionEngineRef engine = NULL;
+   LLVMModuleProviderRef provider = NULL;
+   LLVMPassManagerRef pass = NULL;
+   char *error = NULL;
+   conv_test_ptr_t conv_test_ptr;
+   boolean success;
+   const unsigned n = 32;
+   int64_t cycles[n];
+   double cycles_avg = 0.0;
+   unsigned num_srcs, num_dsts;
+   double eps;
+   unsigned i, j;
+
+   if(verbose >= 1)
+      dump_conv_types(stdout, src_type, dst_type);
+
+   if(src_type.length > dst_type.length) {
+      num_srcs = 1;
+      num_dsts = src_type.length/dst_type.length;
+   }
+   else  {
+      num_dsts = 1;
+      num_srcs = dst_type.length/src_type.length;
+   }
+
+   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
+
+   /* We must not lose or gain channels. Only precision */
+   assert(src_type.length * num_srcs == dst_type.length * num_dsts);
+
+   eps = MAX2(lp_const_eps(src_type), lp_const_eps(dst_type));
+
+   module = LLVMModuleCreateWithName("test");
+
+   func = add_conv_test(module, src_type, num_srcs, dst_type, num_dsts);
+
+   if(LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) {
+      LLVMDumpModule(module);
+      abort();
+   }
+   LLVMDisposeMessage(error);
+
+   provider = LLVMCreateModuleProviderForExistingModule(module);
+   if (LLVMCreateJITCompiler(&engine, provider, 1, &error)) {
+      if(verbose < 1)
+         dump_conv_types(stderr, src_type, dst_type);
+      fprintf(stderr, "%s\n", error);
+      LLVMDisposeMessage(error);
+      abort();
+   }
+
+#if 0
+   pass = LLVMCreatePassManager();
+   LLVMAddTargetData(LLVMGetExecutionEngineTargetData(engine), pass);
+   /* These are the passes currently listed in llvm-c/Transforms/Scalar.h,
+    * but there are more on SVN. */
+   LLVMAddConstantPropagationPass(pass);
+   LLVMAddInstructionCombiningPass(pass);
+   LLVMAddPromoteMemoryToRegisterPass(pass);
+   LLVMAddGVNPass(pass);
+   LLVMAddCFGSimplificationPass(pass);
+   LLVMRunPassManager(pass, module);
+#else
+   (void)pass;
+#endif
+
+   if(verbose >= 2)
+      LLVMDumpModule(module);
+
+   conv_test_ptr = (conv_test_ptr_t)LLVMGetPointerToGlobal(engine, func);
+
+   if(verbose >= 2)
+      lp_disassemble(conv_test_ptr);
+
+   success = TRUE;
+   for(i = 0; i < n && success; ++i) {
+      unsigned src_stride = src_type.length*src_type.width/8;
+      unsigned dst_stride = dst_type.length*dst_type.width/8;
+      uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
+      uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
+      double fref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
+      uint8_t ref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
+      int64_t start_counter = 0, end_counter = 0;
+
+      for(j = 0; j < num_srcs; ++j) {
+         random_vec(src_type, src + j*src_stride);
+         read_vec(src_type, src + j*src_stride, fref + j*src_type.length);
+      }
+
+      for(j = 0; j < num_dsts; ++j) {
+         write_vec(dst_type, ref + j*dst_stride, fref + j*dst_type.length);
+      }
+
+      start_counter = rdtsc();
+      conv_test_ptr(src, dst);
+      end_counter = rdtsc();
+
+      cycles[i] = end_counter - start_counter;
+
+      for(j = 0; j < num_dsts; ++j) {
+         if(!compare_vec_with_eps(dst_type, dst + j*dst_stride, ref + j*dst_stride, eps))
+            success = FALSE;
+      }
+
+      if (!success) {
+         if(verbose < 1)
+            dump_conv_types(stderr, src_type, dst_type);
+         fprintf(stderr, "MISMATCH\n");
+
+         for(j = 0; j < num_srcs; ++j) {
+            fprintf(stderr, "  Src%u: ", j);
+            dump_vec(stderr, src_type, src + j*src_stride);
+            fprintf(stderr, "\n");
+         }
+
+         fprintf(stderr, "  Ref: ");
+         for(j = 0; j < src_type.length*num_srcs; ++j)
+            fprintf(stderr, " %f", fref[j]);
+         fprintf(stderr, "\n");
+
+         for(j = 0; j < num_dsts; ++j) {
+            fprintf(stderr, "  Dst%u: ", j);
+            dump_vec(stderr, dst_type, dst + j*dst_stride);
+            fprintf(stderr, "\n");
+
+            fprintf(stderr, "  Ref%u: ", j);
+            dump_vec(stderr, dst_type, ref + j*dst_stride);
+            fprintf(stderr, "\n");
+         }
+      }
+   }
+
+   /*
+    * The cycle counter output is noisy: occasional outliers (IRQs perhaps?)
+    * would bias the average, so samples further than four standard deviations
+    * from the mean are discarded. Only the samples actually taken are used.
+    */
+   {
+      const unsigned num_samples = i; /* the run loop stops early on a mismatch */
+      double sum = 0.0, sum2 = 0.0;
+      double avg, var, std;
+      unsigned m = 0;
+
+      for(i = 0; i < num_samples; ++i) {
+         sum += (double)cycles[i];
+         sum2 += (double)cycles[i]*(double)cycles[i];
+      }
+
+      avg = sum/num_samples;
+      var = (sum2 - num_samples*avg*avg)/num_samples;
+      std = var > 0.0 ? sqrt(var) : 0.0; /* rounding can push var slightly below zero */
+
+      sum = 0.0;
+      for(i = 0; i < num_samples; ++i) {
+         if(fabs(cycles[i] - avg) <= 4.0*std) {
+            sum += (double)cycles[i];
+            ++m;
+         }
+      }
+
+      cycles_avg = m ? sum/m : avg;
+   }
+
+   if(fp)
+      write_tsv_row(fp, src_type, dst_type, cycles_avg, success);
+
+   if (!success) {
+      static boolean firsttime = TRUE;
+      if(firsttime) {
+         if(verbose < 2)
+            LLVMDumpModule(module);
+         LLVMWriteBitcodeToFile(module, "conv.bc");
+         fprintf(stderr, "conv.bc written\n");
+         fprintf(stderr, "Invoke as \"llc -o - conv.bc\"\n");
+         firsttime = FALSE;
+      }
+   }
+
+   LLVMFreeMachineCodeForFunction(engine, func);
+
+   LLVMDisposeExecutionEngine(engine);
+   if(pass)
+      LLVMDisposePassManager(pass);
+
+   return success;
+}
+
+
+const struct lp_type conv_types[] = { /* types that test_all() and test_some() convert between */
+   /* float, fixed,  sign,  norm, width, len */
+   /* 32-bit float x 4, every sign/norm combination */
+   {   TRUE, FALSE,  TRUE,  TRUE,    32,   4 },
+   {   TRUE, FALSE,  TRUE, FALSE,    32,   4 },
+   {   TRUE, FALSE, FALSE,  TRUE,    32,   4 },
+   {   TRUE, FALSE, FALSE, FALSE,    32,   4 },
+
+   /* TODO: test fixed formats too */
+   /* 16-bit x 8 with float and fixed both FALSE */
+   {  FALSE, FALSE,  TRUE,  TRUE,    16,   8 },
+   {  FALSE, FALSE,  TRUE, FALSE,    16,   8 },
+   {  FALSE, FALSE, FALSE,  TRUE,    16,   8 },
+   {  FALSE, FALSE, FALSE, FALSE,    16,   8 },
+   /* 32-bit x 4 with float and fixed both FALSE */
+   {  FALSE, FALSE,  TRUE,  TRUE,    32,   4 },
+   {  FALSE, FALSE,  TRUE, FALSE,    32,   4 },
+   {  FALSE, FALSE, FALSE,  TRUE,    32,   4 },
+   {  FALSE, FALSE, FALSE, FALSE,    32,   4 },
+   /* NOTE(review): these four rows repeat the 16-bit x 8 rows above -- confirm the repetition is intended */
+   {  FALSE, FALSE,  TRUE,  TRUE,    16,   8 },
+   {  FALSE, FALSE,  TRUE, FALSE,    16,   8 },
+   {  FALSE, FALSE, FALSE,  TRUE,    16,   8 },
+   {  FALSE, FALSE, FALSE, FALSE,    16,   8 },
+   /* 8-bit x 16 with float and fixed both FALSE */
+   {  FALSE, FALSE,  TRUE,  TRUE,     8,  16 },
+   {  FALSE, FALSE,  TRUE, FALSE,     8,  16 },
+   {  FALSE, FALSE, FALSE,  TRUE,     8,  16 },
+   {  FALSE, FALSE, FALSE, FALSE,     8,  16 },
+};
+
+/* Number of entries in conv_types[]. */
+const unsigned num_types = sizeof(conv_types)/sizeof(conv_types[0]);
+
+
+/**
+ * Exhaustive run: call test_one() for every ordered pair of different conv_types[]
+ * entries whose norm flags agree. Returns TRUE when every call passed.
+ */
+boolean
+test_all(unsigned verbose, FILE *fp)
+{
+   const struct lp_type *src_type, *dst_type;
+   boolean success = TRUE;
+
+   for(src_type = conv_types; src_type < &conv_types[num_types]; ++src_type) {
+      for(dst_type = conv_types; dst_type < &conv_types[num_types]; ++dst_type) {
+
+         /* skip converting an entry to itself and pairs that differ in norm */
+         if(src_type == dst_type || src_type->norm != dst_type->norm)
+            continue;
+
+         if(!test_one(verbose, fp, *src_type, *dst_type))
+           success = FALSE;
+      }
+   }
+
+   return success;
+}
+
+
+/** Randomised run: n test_one() calls on rand()-chosen conv_types[] pairs; TRUE if all passed. */
+boolean
+test_some(unsigned verbose, FILE *fp, unsigned long n)
+{
+   const struct lp_type *src_type, *dst_type;
+   unsigned long i;
+   boolean success = TRUE;
+
+   for(i = 0; i < n; ++i) {
+      src_type = &conv_types[rand() % num_types];
+      /* redraw dst until it is a different entry with the same norm flag as src */
+      do {
+         dst_type = &conv_types[rand() % num_types];
+      } while (src_type == dst_type || src_type->norm != dst_type->norm);
+
+      if(!test_one(verbose, fp, *src_type, *dst_type))
+        success = FALSE;
+   }
+
+   return success;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_test_format.c b/src/gallium/drivers/llvmpipe/lp_test_format.c
new file mode 100644
index 0000000000..d8455e5649
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_test_format.c
@@ -0,0 +1,272 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <llvm-c/Core.h>
+#include <llvm-c/Analysis.h>
+#include <llvm-c/ExecutionEngine.h>
+#include <llvm-c/Target.h>
+#include <llvm-c/Transforms/Scalar.h>
+
+#include "util/u_format.h"
+
+#include "lp_bld_flow.h"
+#include "lp_bld_format.h"
+
+
+struct pixel_test_case
+{
+   enum pipe_format format;
+   uint32_t packed;
+   double unpacked[4];
+};
+
+
+struct pixel_test_case test_cases[] =
+{
+   {PIPE_FORMAT_R5G6B5_UNORM,   0x0000, {0.0, 0.0, 0.0, 1.0}},
+   {PIPE_FORMAT_R5G6B5_UNORM,   0x001f, {0.0, 0.0, 1.0, 1.0}},
+   {PIPE_FORMAT_R5G6B5_UNORM,   0x07e0, {0.0, 1.0, 0.0, 1.0}},
+   {PIPE_FORMAT_R5G6B5_UNORM,   0xf800, {1.0, 0.0, 0.0, 1.0}},
+   {PIPE_FORMAT_R5G6B5_UNORM,   0xffff, {1.0, 1.0, 1.0, 1.0}},
+
+   {PIPE_FORMAT_A1R5G5B5_UNORM, 0x0000, {0.0, 0.0, 0.0, 0.0}},
+   {PIPE_FORMAT_A1R5G5B5_UNORM, 0x001f, {0.0, 0.0, 1.0, 0.0}},
+   {PIPE_FORMAT_A1R5G5B5_UNORM, 0x03e0, {0.0, 1.0, 0.0, 0.0}},
+   {PIPE_FORMAT_A1R5G5B5_UNORM, 0x7c00, {1.0, 0.0, 0.0, 0.0}},
+   {PIPE_FORMAT_A1R5G5B5_UNORM, 0x8000, {0.0, 0.0, 0.0, 1.0}},
+   {PIPE_FORMAT_A1R5G5B5_UNORM, 0xffff, {1.0, 1.0, 1.0, 1.0}},
+
+   {PIPE_FORMAT_A8R8G8B8_UNORM, 0x00000000, {0.0, 0.0, 0.0, 0.0}},
+   {PIPE_FORMAT_A8R8G8B8_UNORM, 0x000000ff, {0.0, 0.0, 1.0, 0.0}},
+   {PIPE_FORMAT_A8R8G8B8_UNORM, 0x0000ff00, {0.0, 1.0, 0.0, 0.0}},
+   {PIPE_FORMAT_A8R8G8B8_UNORM, 0x00ff0000, {1.0, 0.0, 0.0, 0.0}},
+   {PIPE_FORMAT_A8R8G8B8_UNORM, 0xff000000, {0.0, 0.0, 0.0, 1.0}},
+   {PIPE_FORMAT_A8R8G8B8_UNORM, 0xffffffff, {1.0, 1.0, 1.0, 1.0}},
+
+#if 0
+   {PIPE_FORMAT_R8G8B8A8_UNORM, 0x00000000, {0.0, 0.0, 0.0, 0.0}},
+   {PIPE_FORMAT_R8G8B8A8_UNORM, 0x000000ff, {0.0, 0.0, 0.0, 1.0}},
+   {PIPE_FORMAT_R8G8B8A8_UNORM, 0x0000ff00, {0.0, 0.0, 1.0, 0.0}},
+   {PIPE_FORMAT_R8G8B8A8_UNORM, 0x00ff0000, {0.0, 1.0, 0.0, 0.0}},
+   {PIPE_FORMAT_R8G8B8A8_UNORM, 0xff000000, {1.0, 0.0, 0.0, 0.0}},
+   {PIPE_FORMAT_R8G8B8A8_UNORM, 0xffffffff, {1.0, 1.0, 1.0, 1.0}},
+#endif
+
+   {PIPE_FORMAT_B8G8R8A8_UNORM, 0x00000000, {0.0, 0.0, 0.0, 0.0}},
+   {PIPE_FORMAT_B8G8R8A8_UNORM, 0x000000ff, {0.0, 0.0, 0.0, 1.0}},
+   {PIPE_FORMAT_B8G8R8A8_UNORM, 0x0000ff00, {1.0, 0.0, 0.0, 0.0}},
+   {PIPE_FORMAT_B8G8R8A8_UNORM, 0x00ff0000, {0.0, 1.0, 0.0, 0.0}},
+   {PIPE_FORMAT_B8G8R8A8_UNORM, 0xff000000, {0.0, 0.0, 1.0, 0.0}},
+   {PIPE_FORMAT_B8G8R8A8_UNORM, 0xffffffff, {1.0, 1.0, 1.0, 1.0}},
+};
+
+
+typedef void (*load_ptr_t)(const void *, float *);
+
+
+static LLVMValueRef
+add_load_rgba_test(LLVMModuleRef module,
+                   enum pipe_format format)
+{
+   LLVMTypeRef args[2];
+   LLVMValueRef func;
+   LLVMValueRef ptr;
+   LLVMValueRef rgba_ptr;
+   LLVMBasicBlockRef block;
+   LLVMBuilderRef builder;
+   LLVMValueRef rgba;
+   struct lp_build_loop_state loop;
+
+   args[0] = LLVMPointerType(LLVMInt8Type(), 0);
+   args[1] = LLVMPointerType(LLVMVectorType(LLVMFloatType(), 4), 0);
+
+   func = LLVMAddFunction(module, "load", LLVMFunctionType(LLVMVoidType(), args, 2, 0));
+   LLVMSetFunctionCallConv(func, LLVMCCallConv);
+   ptr = LLVMGetParam(func, 0);
+   rgba_ptr = LLVMGetParam(func, 1);
+
+   block = LLVMAppendBasicBlock(func, "entry");
+   builder = LLVMCreateBuilder();
+   LLVMPositionBuilderAtEnd(builder, block);
+
+   lp_build_loop_begin(builder, LLVMConstInt(LLVMInt32Type(), 1, 0), &loop);
+
+   rgba = lp_build_load_rgba_aos(builder, format, ptr);
+   LLVMBuildStore(builder, rgba, rgba_ptr);
+
+   lp_build_loop_end(builder, LLVMConstInt(LLVMInt32Type(), 4, 0), NULL, &loop);
+
+   LLVMBuildRetVoid(builder);
+
+   LLVMDisposeBuilder(builder);
+   return func;
+}
+
+
+typedef void (*store_ptr_t)(void *, const float *);
+
+
+static LLVMValueRef
+add_store_rgba_test(LLVMModuleRef module,
+                    enum pipe_format format)
+{
+   LLVMTypeRef args[2];
+   LLVMValueRef func;
+   LLVMValueRef ptr;
+   LLVMValueRef rgba_ptr;
+   LLVMBasicBlockRef block;
+   LLVMBuilderRef builder;
+   LLVMValueRef rgba;
+
+   args[0] = LLVMPointerType(LLVMInt8Type(), 0);
+   args[1] = LLVMPointerType(LLVMVectorType(LLVMFloatType(), 4), 0);
+
+   func = LLVMAddFunction(module, "store", LLVMFunctionType(LLVMVoidType(), args, 2, 0));
+   LLVMSetFunctionCallConv(func, LLVMCCallConv);
+   ptr = LLVMGetParam(func, 0);
+   rgba_ptr = LLVMGetParam(func, 1);
+
+   block = LLVMAppendBasicBlock(func, "entry");
+   builder = LLVMCreateBuilder();
+   LLVMPositionBuilderAtEnd(builder, block);
+
+   rgba = LLVMBuildLoad(builder, rgba_ptr, "");
+
+   lp_build_store_rgba_aos(builder, format, ptr, rgba);
+
+   LLVMBuildRetVoid(builder);
+
+   LLVMDisposeBuilder(builder);
+   return func;
+}
+
+
+static boolean
+test_format(const struct pixel_test_case *test)
+{
+   LLVMModuleRef module = NULL;
+   LLVMValueRef load = NULL;
+   LLVMValueRef store = NULL;
+   LLVMExecutionEngineRef engine = NULL;
+   LLVMModuleProviderRef provider = NULL;
+   LLVMPassManagerRef pass = NULL;
+   char *error = NULL;
+   const struct util_format_description *desc;
+   load_ptr_t load_ptr;
+   store_ptr_t store_ptr;
+   float unpacked[4];
+   unsigned packed;
+   boolean success;
+   unsigned i;
+
+   desc = util_format_description(test->format);
+   fprintf(stderr, "%s\n", desc->name);
+
+   module = LLVMModuleCreateWithName("test");
+
+   load = add_load_rgba_test(module, test->format);
+   store = add_store_rgba_test(module, test->format);
+
+   if(LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) {
+      LLVMDumpModule(module);
+      abort();
+   }
+   LLVMDisposeMessage(error);
+
+   provider = LLVMCreateModuleProviderForExistingModule(module);
+   if (LLVMCreateJITCompiler(&engine, provider, 1, &error)) {
+      fprintf(stderr, "%s\n", error);
+      LLVMDisposeMessage(error);
+      abort();
+   }
+
+#if 0
+   pass = LLVMCreatePassManager();
+   LLVMAddTargetData(LLVMGetExecutionEngineTargetData(engine), pass);
+   /* These are the passes currently listed in llvm-c/Transforms/Scalar.h,
+    * but there are more on SVN. */
+   LLVMAddConstantPropagationPass(pass);
+   LLVMAddInstructionCombiningPass(pass);
+   LLVMAddPromoteMemoryToRegisterPass(pass);
+   LLVMAddGVNPass(pass);
+   LLVMAddCFGSimplificationPass(pass);
+   LLVMRunPassManager(pass, module);
+#else
+   (void)pass;
+#endif
+
+   load_ptr  = (load_ptr_t) LLVMGetPointerToGlobal(engine, load);
+   store_ptr = (store_ptr_t)LLVMGetPointerToGlobal(engine, store);
+
+   memset(unpacked, 0, sizeof unpacked);
+   packed = 0;
+
+   load_ptr(&test->packed, unpacked);
+   store_ptr(&packed, unpacked);
+
+   success = TRUE;
+   if(test->packed != packed)
+      success = FALSE;
+   for(i = 0; i < 4; ++i)
+      if(test->unpacked[i] != unpacked[i])
+         success = FALSE;
+
+   if (!success) {
+      printf("FAILED\n");
+      printf("  Packed: %08x\n", test->packed);
+      printf("          %08x\n", packed);
+      printf("  Unpacked: %f %f %f %f\n", unpacked[0], unpacked[1], unpacked[2], unpacked[3]);
+      printf("            %f %f %f %f\n", test->unpacked[0], test->unpacked[1], test->unpacked[2], test->unpacked[3]);
+      LLVMDumpModule(module);
+   }
+
+   LLVMFreeMachineCodeForFunction(engine, store);
+   LLVMFreeMachineCodeForFunction(engine, load);
+
+   LLVMDisposeExecutionEngine(engine);
+   if(pass)
+      LLVMDisposePassManager(pass);
+
+   return success;
+}
+
+
+int main(int argc, char **argv)
+{
+   unsigned i;
+   int ret = 0;   /* exit status: was uninitialized, so an all-pass run returned garbage */
+
+   for (i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); ++i)
+      if(!test_format(&test_cases[i]))
+        ret = 1;   /* remember the failure but still run the remaining cases */
+
+   return ret;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_test_main.c b/src/gallium/drivers/llvmpipe/lp_test_main.c
new file mode 100644
index 0000000000..4592dc0b2d
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_test_main.c
@@ -0,0 +1,384 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Shared testing code.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include "lp_bld_const.h"
+#include "lp_test.h"
+
+
+void
+dump_type(FILE *fp,
+          struct lp_type type)
+{
+   fprintf(fp, "%s%s%u%sx%u",   /* writes a compact one-token description of type to fp */
+           type.sign ? (type.floating || type.fixed ? "" : "s") : "u",   /* "u" if unsigned; "s" only for signed plain integers */
+           type.floating ? "f" : (type.fixed ? "h" : "i"),   /* f = floating, h = fixed, i = neither */
+           type.width,   /* printed right after the kind letter */
+           type.norm ? "n" : "",   /* "n" suffix when the norm flag is set */
+           type.length);   /* printed after the literal "x" */
+}
+
+
+double
+read_elem(struct lp_type type, const void *src, unsigned index)
+{
+   double scale = lp_const_scale(type);
+   double value;
+   assert(index < type.length);
+   if (type.floating) {
+      switch(type.width) {
+      case 32:
+         value = *((const float *)src + index);
+         break;
+      case 64:
+         value =  *((const double *)src + index);
+         break;
+      default:
+         assert(0);
+         return 0.0;
+      }
+   }
+   else {
+      if(type.sign) {
+         switch(type.width) {
+         case 8:
+            value = *((const int8_t *)src + index);
+            break;
+         case 16:
+            value = *((const int16_t *)src + index);
+            break;
+         case 32:
+            value = *((const int32_t *)src + index);
+            break;
+         case 64:
+            value = *((const int64_t *)src + index);
+            break;
+         default:
+            assert(0);
+            return 0.0;
+         }
+      }
+      else {
+         switch(type.width) {
+         case 8:
+            value = *((const uint8_t *)src + index);
+            break;
+         case 16:
+            value = *((const uint16_t *)src + index);
+            break;
+         case 32:
+            value = *((const uint32_t *)src + index);
+            break;
+         case 64:
+            value = *((const uint64_t *)src + index);
+            break;
+         default:
+            assert(0);
+            return 0.0;
+         }
+      }
+   }
+   return value/scale;
+}
+
+
+void
+write_elem(struct lp_type type, void *dst, unsigned index, double value)
+{
+   assert(index < type.length);
+   if(!type.sign && value < 0.0)
+      value = 0.0;
+   if(type.norm && value < -1.0)
+      value = -1.0;
+   if(type.norm && value > 1.0)
+      value = 1.0;
+   if (type.floating) {
+      switch(type.width) {
+      case 32:
+         *((float *)dst + index) = (float)(value);
+         break;
+      case 64:
+          *((double *)dst + index) = value;
+         break;
+      default:
+         assert(0);
+      }
+   }
+   else {
+      double scale = lp_const_scale(type);
+      value = round(value*scale);
+      if(type.sign) {
+         long long lvalue = (long long)value;
+         lvalue = MIN2(lvalue, ((long long)1 << (type.width - 1)) - 1);
+         switch(type.width) {
+         case 8:
+            *((int8_t *)dst + index) = (int8_t)lvalue;
+            break;
+         case 16:
+            *((int16_t *)dst + index) = (int16_t)lvalue;
+            break;
+         case 32:
+            *((int32_t *)dst + index) = (int32_t)lvalue;
+            break;
+         case 64:
+            *((int64_t *)dst + index) = (int64_t)lvalue;
+            break;
+         default:
+            assert(0);
+         }
+      }
+      else {
+         unsigned long long lvalue = (long long)value;
+         lvalue = MIN2(lvalue, ((unsigned long long)1 << type.width) - 1);
+         switch(type.width) {
+         case 8:
+            *((uint8_t *)dst + index) = (uint8_t)lvalue;
+            break;
+         case 16:
+            *((uint16_t *)dst + index) = (uint16_t)lvalue;
+            break;
+         case 32:
+            *((uint32_t *)dst + index) = (uint32_t)lvalue;
+            break;
+         case 64:
+            *((uint64_t *)dst + index) = (uint64_t)lvalue;
+            break;
+         default:
+            assert(0);
+         }
+      }
+   }
+}
+
+
+void
+random_elem(struct lp_type type, void *dst, unsigned index)
+{
+   double value;
+   assert(index < type.length);
+   value = (double)rand()/(double)RAND_MAX;   /* fraction in [0, 1] */
+   if(!type.norm) {   /* non-normalized types also get a random integer part */
+      unsigned long long mask;   /* limits the integer part according to the kind of type */
+      if (type.floating)
+         mask = ~(unsigned long long)0;
+      else if (type.fixed)
+         mask = ((unsigned long long)1 << (type.width / 2)) - 1;
+      else if (type.sign)
+         mask = ((unsigned long long)1 << (type.width - 1)) - 1;
+      else
+         mask = ((unsigned long long)1 << type.width) - 1;
+      value += (double)(mask & rand());
+   }
+   if(type.sign)   /* was !type.sign: write_elem zeroes negatives for unsigned types, and signed types never got one */
+      if(rand() & 1)
+         value = -value;
+   write_elem(type, dst, index, value);   /* write_elem clamps and converts to the storage format */
+}
+
+
+void
+read_vec(struct lp_type type, const void *src, double *dst)   /* unpack a whole vector into doubles */
+{
+   unsigned i;
+   for (i = 0; i < type.length; ++i)   /* dst must have room for type.length doubles */
+      dst[i] = read_elem(type, src, i);
+}
+
+
+void
+write_vec(struct lp_type type, void *dst, const double *src)   /* pack type.length doubles into dst */
+{
+   unsigned i;
+   for (i = 0; i < type.length; ++i)   /* src must supply type.length doubles */
+      write_elem(type, dst, i, src[i]);
+}
+
+
+float
+random_float(void)   /* returns rand() scaled by RAND_MAX, i.e. a value in [0, 1] */
+{
+    return (float)((double)rand()/(double)RAND_MAX);   /* division done in double, then narrowed */
+}
+
+
+void
+random_vec(struct lp_type type, void *dst)   /* fill every element of dst with a random value */
+{
+   unsigned i;
+   for (i = 0; i < type.length; ++i)   /* one random_elem() call per element */
+      random_elem(type, dst, i);
+}
+
+
+boolean
+compare_vec_with_eps(struct lp_type type, const void *res, const void *ref, double eps)
+{
+   unsigned i;   /* compares res against ref element by element, after read_elem() decoding */
+   for (i = 0; i < type.length; ++i) {
+      double res_elem = read_elem(type, res, i);
+      double ref_elem = read_elem(type, ref, i);
+      double delta = fabs(res_elem - ref_elem);
+      if(delta >= 2.0*eps)   /* tolerance is twice eps; reaching it fails the comparison */
+         return FALSE;
+   }
+
+   return TRUE;   /* every element was within tolerance */
+}
+
+
+boolean
+compare_vec(struct lp_type type, const void *res, const void *ref)   /* compare using the type's own epsilon */
+{
+   double eps = lp_const_eps(type);   /* epsilon supplied by lp_const_eps() for this type */
+   return compare_vec_with_eps(type, res, ref, eps);
+}
+
+
+void
+dump_vec(FILE *fp, struct lp_type type, const void *src)
+{
+   unsigned i;   /* prints the type.length elements of src to fp, separated by spaces */
+   for (i = 0; i < type.length; ++i) {
+      if(i)
+         fprintf(fp, " ");
+      if (type.floating) {
+         double value;
+         switch(type.width) {
+         case 32:
+            value = *((const float *)src + i);
+            break;
+         case 64:
+            value = *((const double *)src + i);
+            break;
+         default:
+            assert(0);
+            value = 0.0;
+         }
+         fprintf(fp, "%f", value);
+      }
+      else {
+         if(type.sign && !type.norm) {   /* signed, non-normalized: print as decimal */
+            long long value;
+            const char *format;
+            switch(type.width) {
+            case 8:
+               value = *((const int8_t *)src + i);
+               format = "%3lli";
+               break;
+            case 16:
+               value = *((const int16_t *)src + i);
+               format = "%5lli";
+               break;
+            case 32:
+               value = *((const int32_t *)src + i);
+               format = "%10lli";
+               break;
+            case 64:
+               value = *((const int64_t *)src + i);
+               format = "%20lli";
+               break;
+            default:
+               assert(0);
+               value = 0;
+               format = "?";
+            }
+            fprintf(fp, format, value);
+         }
+         else {   /* unsigned, or any normalized type: read as unsigned storage */
+            unsigned long long value;   /* every format below needs the ll length modifier */
+            const char *format;
+            switch(type.width) {
+            case 8:
+               value = *((const uint8_t *)src + i);
+               format = type.norm ? "%2llx" : "%4llu";
+               break;
+            case 16:
+               value = *((const uint16_t *)src + i);
+               format = type.norm ? "%4llx" : "%6llx";
+               break;
+            case 32:
+               value = *((const uint32_t *)src + i);
+               format = type.norm ? "%8llx" : "%11llx";
+               break;
+            case 64:
+               value = *((const uint64_t *)src + i);
+               format = type.norm ? "%16llx" : "%21llx";
+               break;
+            default:
+               assert(0);
+               value = 0;
+               format = "?";
+            }
+            fprintf(fp, format, value);
+         }
+      }
+   }
+}
+
+
+int main(int argc, char **argv)
+{
+   unsigned verbose = 0;   /* incremented once per "-v" */
+   FILE *fp = NULL;   /* output file from "-o <file>", if any */
+   unsigned long n = 1000;   /* number of random tests; 0 selects test_all() */
+   unsigned i;
+   boolean success;
+
+   for(i = 1; i < (unsigned)argc; ++i) {
+      if(strcmp(argv[i], "-v") == 0)
+         ++verbose;
+      else if(strcmp(argv[i], "-o") == 0)   /* a trailing "-o" has no file name: do not pass NULL to fopen */
+         fp = (i + 1 < (unsigned)argc) ? fopen(argv[++i], "wt") : NULL;
+      else
+         n = atoi(argv[i]);
+   }
+
+   if(fp) {
+      /* Warm up the caches */
+      test_some(0, NULL, 100);
+
+      write_tsv_header(fp);
+   }
+
+   if(n)
+      success = test_some(verbose, fp, n);
+   else
+      success = test_all(verbose, fp);
+
+   if(fp)
+      fclose(fp);
+
+   return success ? 0 : 1;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_cache.c b/src/gallium/drivers/llvmpipe/lp_tex_cache.c
new file mode 100644
index 0000000000..773e848242
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_tex_cache.c
@@ -0,0 +1,304 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Texture tile caching.
+ *
+ * Author:
+ *    Brian Paul
+ */
+
+#include "pipe/p_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_tile.h"
+#include "util/u_format.h"
+#include "lp_context.h"
+#include "lp_surface.h"
+#include "lp_texture.h"
+#include "lp_tex_cache.h"
+
+
+
+/**
+ * Return the position in the cache for the tile that contains win pos (x,y).
+ * We currently use a direct mapped cache so this is like a hash key.
+ * At some point we should investigate something more sophisticated, like
+ * a LRU replacement policy.
+ */
+#define CACHE_POS(x, y) \
+   (((x) + (y) * 5) % NUM_ENTRIES)
+
+
+
+/**
+ * Is the tile at (x,y) in cleared state?
+ */
+static INLINE uint
+is_clear_flag_set(const uint *bitvec, union tex_tile_address addr)
+{
+   uint bit;   /* unsigned, so testing bit 31 never touches a sign bit */
+   int pos = addr.bits.y * (MAX_TEX_WIDTH / TEX_TILE_SIZE) + addr.bits.x;   /* one flag per tile, row-major */
+   assert(pos / 32 < (MAX_TEX_WIDTH / TEX_TILE_SIZE) * (MAX_TEX_HEIGHT / TEX_TILE_SIZE) / 32);
+   bit = bitvec[pos / 32] & (1u << (pos & 31));   /* 1u: "1 << 31" on a signed int is not representable */
+   return bit;   /* non-zero when the flag is set (the masked bit, not 0/1) */
+}
+   
+
+/**
+ * Mark the tile at (x,y) as not cleared.
+ */
+static INLINE void
+clear_clear_flag(uint *bitvec, union tex_tile_address addr)
+{
+   int pos;   /* flag index: one bit per tile, row-major */
+   pos = addr.bits.y * (MAX_TEX_WIDTH / TEX_TILE_SIZE) + addr.bits.x;
+   assert(pos / 32 < (MAX_TEX_WIDTH / TEX_TILE_SIZE) * (MAX_TEX_HEIGHT / TEX_TILE_SIZE) / 32);
+   bitvec[pos / 32] &= ~(1u << (pos & 31));   /* 1u: "1 << 31" on a signed int is not representable */
+}
+   
+
+struct llvmpipe_tex_tile_cache *
+lp_create_tex_tile_cache( struct pipe_screen *screen )   /* returns NULL if the allocation fails */
+{
+   struct llvmpipe_tex_tile_cache *tc;
+   uint pos;
+
+   tc = CALLOC_STRUCT( llvmpipe_tex_tile_cache );
+   if (tc) {
+      tc->screen = screen;   /* stored as-is; used later for map/unmap calls */
+      for (pos = 0; pos < NUM_ENTRIES; pos++) {
+         tc->entries[pos].addr.bits.invalid = 1;   /* every entry starts out invalid */
+      }
+      tc->last_tile = &tc->entries[0]; /* any tile */
+   }
+   return tc;
+}
+
+
+void
+lp_destroy_tex_tile_cache(struct llvmpipe_tex_tile_cache *tc)
+{
+   struct pipe_screen *screen;
+
+   /* Unmap anything still mapped before destroying the transfers, as the
+    * other teardown paths in this file do.  Clears both *_map pointers. */
+   lp_tex_tile_cache_unmap_transfers(tc);
+
+   if (tc->transfer) {
+      screen = tc->transfer->texture->screen;
+      screen->tex_transfer_destroy(tc->transfer);
+   }
+   if (tc->tex_trans) {
+      screen = tc->tex_trans->texture->screen;
+      screen->tex_transfer_destroy(tc->tex_trans);
+   }
+
+   FREE( tc );   /* tc must not be used after this call */
+}
+
+
+void
+lp_tex_tile_cache_map_transfers(struct llvmpipe_tex_tile_cache *tc)   /* map whichever transfers exist but are unmapped */
+{
+   if (tc->transfer && !tc->transfer_map)   /* skip if absent or already mapped */
+      tc->transfer_map = tc->screen->transfer_map(tc->screen, tc->transfer);
+
+   if (tc->tex_trans && !tc->tex_trans_map)   /* same for the texture transfer */
+      tc->tex_trans_map = tc->screen->transfer_map(tc->screen, tc->tex_trans);
+}
+
+
+void
+lp_tex_tile_cache_unmap_transfers(struct llvmpipe_tex_tile_cache *tc)   /* undo lp_tex_tile_cache_map_transfers() */
+{
+   if (tc->transfer_map) {   /* only unmap what is currently mapped */
+      tc->screen->transfer_unmap(tc->screen, tc->transfer);
+      tc->transfer_map = NULL;   /* NULL marks "not mapped" */
+   }
+
+   if (tc->tex_trans_map) {
+      tc->screen->transfer_unmap(tc->screen, tc->tex_trans);
+      tc->tex_trans_map = NULL;
+   }
+}
+
+void
+lp_tex_tile_cache_validate_texture(struct llvmpipe_tex_tile_cache *tc)   /* drop cached tiles if the texture changed */
+{
+   if (tc->texture) {   /* nothing to do when no texture is bound */
+      struct llvmpipe_texture *lpt = llvmpipe_texture(tc->texture);
+      if (lpt->timestamp != tc->timestamp) {   /* cache was filled at a different timestamp */
+         /* texture was modified, invalidate all cached tiles */
+         uint i;
+         debug_printf("INV %d %d\n", tc->timestamp, lpt->timestamp);
+         for (i = 0; i < NUM_ENTRIES; i++) {
+            tc->entries[i].addr.bits.invalid = 1;
+         }
+
+         tc->timestamp = lpt->timestamp;   /* remember the timestamp the cache is now in sync with */
+      }
+   }
+}
+
+/**
+ * Specify the texture to cache.
+ */
+void
+lp_tex_tile_cache_set_texture(struct llvmpipe_tex_tile_cache *tc,
+                          struct pipe_texture *texture)
+{
+   uint i;
+
+   assert(!tc->transfer);
+
+   if (tc->texture != texture) {   /* rebinding the same texture is a no-op */
+      pipe_texture_reference(&tc->texture, texture);
+
+      if (tc->tex_trans) {   /* the old transfer belongs to the previous texture */
+         struct pipe_screen *screen = tc->tex_trans->texture->screen;
+         
+         if (tc->tex_trans_map) {   /* unmap before destroying */
+            screen->transfer_unmap(screen, tc->tex_trans);
+            tc->tex_trans_map = NULL;
+         }
+
+         screen->tex_transfer_destroy(tc->tex_trans);
+         tc->tex_trans = NULL;
+      }
+
+      /* mark all entries as invalid/empty */
+      /* XXX we should try to avoid this when the teximage hasn't changed */
+      for (i = 0; i < NUM_ENTRIES; i++) {
+         tc->entries[i].addr.bits.invalid = 1;
+      }
+
+      tc->tex_face = -1; /* any invalid value here */
+   }
+}
+
+
+/**
+ * Given the texture face, level, zslice, x and y values, compute
+ * the cache entry position/index where we'd hope to find the
+ * cached texture tile.
+ * This is basically a direct-map cache.
+ * XXX There's probably lots of ways in which we can improve this.
+ */
+static INLINE uint
+tex_cache_pos( union tex_tile_address addr )   /* maps a tile address to an index into entries[] */
+{
+   uint entry = (addr.bits.x + 
+                 addr.bits.y * 9 + 
+                 addr.bits.z * 3 + 
+                 addr.bits.face + 
+                 addr.bits.level * 7);   /* weighted sum of the address fields */
+
+   return entry % NUM_ENTRIES;   /* always in [0, NUM_ENTRIES) */
+}
+
+/**
+ * Similar to lp_get_cached_tile() but for textures.
+ * Tiles are read-only and indexed with more params.
+ */
+const struct llvmpipe_cached_tex_tile *
+lp_find_cached_tex_tile(struct llvmpipe_tex_tile_cache *tc,
+                        union tex_tile_address addr )
+{
+   struct pipe_screen *screen = tc->screen;
+   struct llvmpipe_cached_tex_tile *tile;
+   
+   tile = tc->entries + tex_cache_pos( addr );   /* the single slot this address can occupy */
+
+   if (addr.value != tile->addr.value) {   /* slot holds another tile, or is marked invalid */
+
+      /* cache miss.  Most misses are because we've invalidated the
+       * texture cache previously -- most commonly on binding a new
+       * texture.  Currently we effectively flush the cache on texture
+       * bind.
+       */
+#if 0
+      _debug_printf("miss at %u:  x=%d y=%d z=%d face=%d level=%d\n"
+                    "   tile %u:  x=%d y=%d z=%d face=%d level=%d\n",
+                    pos, x/TEX_TILE_SIZE, y/TEX_TILE_SIZE, z, face, level,
+                    pos, tile->addr.bits.x, tile->addr.bits.y, tile->z, tile->face, tile->level);
+#endif
+
+      /* check if we need to get a new transfer */
+      if (!tc->tex_trans ||
+          tc->tex_face != addr.bits.face ||
+          tc->tex_level != addr.bits.level ||
+          tc->tex_z != addr.bits.z) {
+         /* get new transfer (view into texture) */
+
+         if (tc->tex_trans) {   /* release the previous view first */
+            if (tc->tex_trans_map) {
+               tc->screen->transfer_unmap(tc->screen, tc->tex_trans);
+               tc->tex_trans_map = NULL;
+            }
+
+            screen->tex_transfer_destroy(tc->tex_trans);
+            tc->tex_trans = NULL;
+         }
+
+         tc->tex_trans = 
+            screen->get_tex_transfer(screen, tc->texture, 
+                                     addr.bits.face, 
+                                     addr.bits.level, 
+                                     addr.bits.z, 
+                                     PIPE_TRANSFER_READ, 0, 0,
+                                     tc->texture->width[addr.bits.level],
+                                     tc->texture->height[addr.bits.level]);   /* read-only, whole level */
+
+         tc->tex_trans_map = screen->transfer_map(screen, tc->tex_trans);
+
+         tc->tex_face = addr.bits.face;   /* remember which face/level/z the transfer covers */
+         tc->tex_level = addr.bits.level;
+         tc->tex_z = addr.bits.z;
+      }
+
+      {
+         unsigned x = addr.bits.x * TEX_TILE_SIZE;   /* tile index back to texel coordinates */
+         unsigned y = addr.bits.y * TEX_TILE_SIZE;
+         unsigned w = TEX_TILE_SIZE;
+         unsigned h = TEX_TILE_SIZE;
+
+         if (pipe_clip_tile(x, y, &w, &h, tc->tex_trans)) {   /* a non-zero result is treated as a bug */
+            assert(0);
+         }
+
+         util_format_read_4ub(tc->tex_trans->format,
+                              (uint8_t *)tile->color, sizeof tile->color[0],
+                              tc->tex_trans_map, tc->tex_trans->stride,
+                              x, y, w, h);   /* fills tile->color from the mapped transfer */
+      }
+
+      tile->addr = addr;   /* the slot now holds this tile */
+   }
+
+   tc->last_tile = tile;   /* checked first by lp_get_cached_tex_tile() */
+   return tile;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_cache.h b/src/gallium/drivers/llvmpipe/lp_tex_cache.h
new file mode 100644
index 0000000000..9fa6c36812
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_tex_cache.h
@@ -0,0 +1,151 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef LP_TEX_CACHE_H
+#define LP_TEX_CACHE_H
+
+
+#include "pipe/p_compiler.h"
+
+
+struct llvmpipe_context;
+struct llvmpipe_tex_tile_cache;
+
+
+/**
+ * Cache tile size (width and height). This needs to be a power of two.
+ */
+#define TEX_TILE_SIZE 64
+
+
+/* If we need to support > 4096, just expand this to be a 64 bit
+ * union, or consider tiling in Z as well.
+ */
+union tex_tile_address {
+   struct {
+      unsigned x:6;             /* 4096 / TEX_TILE_SIZE */
+      unsigned y:6;             /* 4096 / TEX_TILE_SIZE */
+      unsigned z:12;            /* 4096 -- z not tiled */
+      unsigned face:3;
+      unsigned level:4;
+      unsigned invalid:1;
+   } bits;
+   unsigned value;
+};
+
+
+struct llvmpipe_cached_tex_tile
+{
+   union tex_tile_address addr;
+   uint8_t color[TEX_TILE_SIZE][TEX_TILE_SIZE][4];
+};
+
+#define NUM_ENTRIES 50
+
+
+/** XXX move these */
+#define MAX_TEX_WIDTH 2048
+#define MAX_TEX_HEIGHT 2048
+
+
+struct llvmpipe_tex_tile_cache
+{
+   struct pipe_screen *screen;
+   struct pipe_surface *surface;  /**< the surface we're caching */
+   struct pipe_transfer *transfer;
+   void *transfer_map;
+
+   struct pipe_texture *texture;  /**< if caching a texture */
+   unsigned timestamp;
+
+   struct llvmpipe_cached_tex_tile entries[NUM_ENTRIES];
+
+   struct pipe_transfer *tex_trans;
+   void *tex_trans_map;
+   int tex_face, tex_level, tex_z;
+
+   struct llvmpipe_cached_tex_tile *last_tile;  /**< most recently retrieved tile */
+};
+
+
+extern struct llvmpipe_tex_tile_cache *
+lp_create_tex_tile_cache( struct pipe_screen *screen );
+
+extern void
+lp_destroy_tex_tile_cache(struct llvmpipe_tex_tile_cache *tc);
+
+extern void
+lp_tex_tile_cache_map_transfers(struct llvmpipe_tex_tile_cache *tc);
+
+extern void
+lp_tex_tile_cache_unmap_transfers(struct llvmpipe_tex_tile_cache *tc);
+
+extern void
+lp_tex_tile_cache_set_texture(struct llvmpipe_tex_tile_cache *tc,
+                          struct pipe_texture *texture);
+
+void
+lp_tex_tile_cache_validate_texture(struct llvmpipe_tex_tile_cache *tc);
+
+extern const struct llvmpipe_cached_tex_tile *
+lp_find_cached_tex_tile(struct llvmpipe_tex_tile_cache *tc,
+                        union tex_tile_address addr );
+
+static INLINE const union tex_tile_address
+tex_tile_address( unsigned x,
+                  unsigned y,
+                  unsigned z,
+                  unsigned face,
+                  unsigned level )
+{
+   union tex_tile_address addr;   /* x and y are divided down to tile indices below */
+
+   addr.value = 0;   /* zero all bits first, so the invalid flag is clear */
+   addr.bits.x = x / TEX_TILE_SIZE;
+   addr.bits.y = y / TEX_TILE_SIZE;
+   addr.bits.z = z;   /* z, face and level are stored unscaled */
+   addr.bits.face = face;
+   addr.bits.level = level;
+      
+   return addr;
+}
+
+/* Quickly retrieve the tile if it matches the last lookup; otherwise
+ * fall back to the full search in lp_find_cached_tex_tile(). */
+static INLINE const struct llvmpipe_cached_tex_tile *
+lp_get_cached_tex_tile(struct llvmpipe_tex_tile_cache *tc,
+                       union tex_tile_address addr )
+{
+   if (tc->last_tile->addr.value == addr.value)   /* whole-address compare */
+      return tc->last_tile;
+
+   return lp_find_cached_tex_tile( tc, addr );
+}
+
+
+#endif /* LP_TEX_CACHE_H */
+
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.h b/src/gallium/drivers/llvmpipe/lp_tex_sample.h
new file mode 100644
index 0000000000..9ad1bde956
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.h
@@ -0,0 +1,101 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef LP_TEX_SAMPLE_H
+#define LP_TEX_SAMPLE_H
+
+
+#include <llvm-c/Core.h>
+
+#include "tgsi/tgsi_exec.h"
+
+
+struct llvmpipe_tex_tile_cache;
+struct lp_sampler_static_state;
+
+
+/**
+ * Subclass of tgsi_sampler
+ */
+struct lp_shader_sampler
+{
+   struct tgsi_sampler base;  /**< base class; lp_shader_sampler() casts a pointer to it to this struct */
+
+   unsigned processor;        /**< compared against TGSI_PROCESSOR_VERTEX in compute_lambda() */
+
+   /* For the lp_get_samples_2d_*_POT functions:
+    */
+   unsigned xpot;             /**< exponent: the _POT samplers use 1 << (xpot - level) */
+   unsigned ypot;             /**< same as xpot, for the second coordinate */
+   unsigned level;            /**< mipmap level the _POT samplers read from */
+
+   const struct pipe_texture *texture;        /**< texture being sampled */
+   const struct pipe_sampler_state *sampler;  /**< sampler state (filters, LOD, border color) */
+
+   struct llvmpipe_tex_tile_cache *cache;     /**< tile cache that texel fetches go through */
+};
+
+
+
+static INLINE struct lp_shader_sampler *
+lp_shader_sampler(const struct tgsi_sampler *sampler)
+{  /* Downcast from the tgsi_sampler base class to the llvmpipe subclass. */
+   return (struct lp_shader_sampler *) sampler;   /* note: the cast also drops 'const' */
+}
+
+
+
+extern void
+lp_get_samples(struct tgsi_sampler *tgsi_sampler,
+               const float s[QUAD_SIZE],
+               const float t[QUAD_SIZE],
+               const float p[QUAD_SIZE],
+               float lodbias,
+               float rgba[NUM_CHANNELS][QUAD_SIZE]);
+
+
+/**
+ * Texture sampling code generator that just calls lp_get_samples C function
+ * for the actual sampling computation.
+ *
+ * @param context_ptr LLVM value with the pointer to the struct lp_jit_context.
+ */
+struct lp_build_sampler_soa *
+lp_c_sampler_soa_create(LLVMValueRef context_ptr);
+
+
+/**
+ * Pure-LLVM texture sampling code generator.
+ *
+ * @param context_ptr LLVM value with the pointer to the struct lp_jit_context.
+ */
+struct lp_build_sampler_soa *
+lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *key,
+                           LLVMValueRef context_ptr);
+
+
+#endif /* LP_TEX_SAMPLE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample_c.c b/src/gallium/drivers/llvmpipe/lp_tex_sample_c.c
new file mode 100644
index 0000000000..a1365a045f
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample_c.c
@@ -0,0 +1,1713 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * Copyright 2008 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Texture sampling
+ *
+ * Authors:
+ *   Brian Paul
+ */
+
+#include "lp_context.h"
+#include "lp_quad.h"
+#include "lp_surface.h"
+#include "lp_texture.h"
+#include "lp_tex_sample.h"
+#include "lp_tex_cache.h"
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_shader_tokens.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+
+
+/*
+ * Note, the FRAC macro has to work perfectly.  Otherwise you'll sometimes
+ * see 1-pixel bands of improperly weighted linear-filtered textures.
+ * The tests/texwrap.c demo is a good test.
+ * FRAC(f) is f - util_ifloor(f), so for f < 0 it is 1 - true_frac(f) rather
+ * than the true fractional part.  The argument is expanded twice.
+ */
+#define FRAC(f)  ((f) - util_ifloor(f))
+
+
+/** Linear interpolation from v0 (a = 0) towards v1 (a = 1). */
+static INLINE float
+lerp(float a, float v0, float v1)
+{
+   const float span = v1 - v0;
+
+   return v0 + a * span;
+}
+
+
+/**
+ * Do 2D/bilinear interpolation of float values.
+ * v00, v10, v01 and v11 are typically four texture samples in a square/box;
+ * a is the horizontal interpolant and b the vertical one.
+ * It's important that this function is inlined when compiled with
+ * optimization!  If we find that's not true on some systems, convert
+ * to a macro.
+ */
+static INLINE float
+lerp_2d(float a, float b,
+        float v00, float v10, float v01, float v11)
+{
+   const float row0 = lerp(a, v00, v10);   /* blend along a: v00 -> v10 */
+   const float row1 = lerp(a, v01, v11);   /* blend along a: v01 -> v11 */
+   return lerp(b, row0, row1);
+}
+
+
+/**
+ * Trilinear variant: two lerp_2d() results (using a, b) blended by c.
+ */
+static INLINE float
+lerp_3d(float a, float b, float c,
+        float v000, float v100, float v010, float v110,
+        float v001, float v101, float v011, float v111)
+{
+   const float slice0 = lerp_2d(a, b, v000, v100, v010, v110);
+   const float slice1 = lerp_2d(a, b, v001, v101, v011, v111);
+   return lerp(c, slice0, slice1);
+}
+
+
+
+/**
+ * Remainder for texture repeat: in [0, B-1] even when A < 0, for any B > 0
+ * (a bare cast to unsigned is only right for power-of-two B).  B is used 3x.
+ */
+#define REMAINDER(A, B) ((unsigned) ((((int) (A) % (int) (B)) + (int) (B)) % (int) (B)))
+
+
+/**
+ * Apply texture coord wrapping mode and return integer texture indexes
+ * for a vector of four texcoords (S or T or P).
+ * \param wrapMode  PIPE_TEX_WRAP_x
+ * \param s  the incoming texcoords
+ * \param size  the texture image size
+ * \param icoord  returns the integer texcoords
+ * No value is returned; an unknown wrapMode asserts and leaves icoord as is.
+ */
+static INLINE void
+nearest_texcoord_4(unsigned wrapMode, const float s[4], unsigned size,
+                   int icoord[4])
+{
+   uint ch;
+   switch (wrapMode) {
+   case PIPE_TEX_WRAP_REPEAT:
+      /* s limited to [0,1) */
+      /* i limited to [0,size-1] */
+      for (ch = 0; ch < 4; ch++) {
+         int i = util_ifloor(s[ch] * size);
+         icoord[ch] = REMAINDER(i, size);
+      }
+      return;
+   case PIPE_TEX_WRAP_CLAMP:
+      /* s limited to [0,1] */
+      /* i limited to [0,size-1] */
+      for (ch = 0; ch < 4; ch++) {
+         if (s[ch] <= 0.0F)
+            icoord[ch] = 0;
+         else if (s[ch] >= 1.0F)
+            icoord[ch] = size - 1;
+         else
+            icoord[ch] = util_ifloor(s[ch] * size);
+      }
+      return;
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      {
+         /* s limited to [min,max] */
+         /* i limited to [0, size-1] */
+         const float min = 1.0F / (2.0F * size);   /* half a texel */
+         const float max = 1.0F - min;
+         for (ch = 0; ch < 4; ch++) {
+            if (s[ch] < min)
+               icoord[ch] = 0;
+            else if (s[ch] > max)
+               icoord[ch] = size - 1;
+            else
+               icoord[ch] = util_ifloor(s[ch] * size);
+         }
+      }
+      return;
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      {
+         /* s limited to [min,max] */
+         /* i limited to [-1, size] */
+         const float min = -1.0F / (2.0F * size);   /* half a texel below 0 */
+         const float max = 1.0F - min;
+         for (ch = 0; ch < 4; ch++) {
+            if (s[ch] <= min)
+               icoord[ch] = -1;
+            else if (s[ch] >= max)
+               icoord[ch] = size;
+            else
+               icoord[ch] = util_ifloor(s[ch] * size);
+         }
+      }
+      return;
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+      {
+         const float min = 1.0F / (2.0F * size);
+         const float max = 1.0F - min;
+         for (ch = 0; ch < 4; ch++) {
+            const int flr = util_ifloor(s[ch]);
+            float u;
+            if (flr & 1)   /* odd repetition: mirror the fractional part */
+               u = 1.0F - (s[ch] - (float) flr);
+            else
+               u = s[ch] - (float) flr;
+            if (u < min)
+               icoord[ch] = 0;
+            else if (u > max)
+               icoord[ch] = size - 1;
+            else
+               icoord[ch] = util_ifloor(u * size);
+         }
+      }
+      return;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+      for (ch = 0; ch < 4; ch++) {
+         /* s limited to [0,1] */
+         /* i limited to [0,size-1] */
+         const float u = fabsf(s[ch]);
+         if (u <= 0.0F)
+            icoord[ch] = 0;
+         else if (u >= 1.0F)
+            icoord[ch] = size - 1;
+         else
+            icoord[ch] = util_ifloor(u * size);
+      }
+      return;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+      {
+         /* s limited to [min,max] */
+         /* i limited to [0, size-1] */
+         const float min = 1.0F / (2.0F * size);
+         const float max = 1.0F - min;
+         for (ch = 0; ch < 4; ch++) {
+            const float u = fabsf(s[ch]);
+            if (u < min)
+               icoord[ch] = 0;
+            else if (u > max)
+               icoord[ch] = size - 1;
+            else
+               icoord[ch] = util_ifloor(u * size);
+         }
+      }
+      return;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+      {
+         /* s limited to [min,max] */
+         /* i limited to [-1, size] (the code below can store -1 or size) */
+         const float min = -1.0F / (2.0F * size);
+         const float max = 1.0F - min;
+         for (ch = 0; ch < 4; ch++) {
+            const float u = fabsf(s[ch]);
+            if (u < min)   /* never true: u >= 0 while min is negative */
+               icoord[ch] = -1;
+            else if (u > max)
+               icoord[ch] = size;
+            else
+               icoord[ch] = util_ifloor(u * size);
+         }
+      }
+      return;
+   default:
+      assert(0);   /* unsupported wrap mode; icoord is not written */
+   }
+}
+
+
+/**
+ * Used to compute texel locations for linear sampling for four texcoords.
+ * \param wrapMode  PIPE_TEX_WRAP_x
+ * \param s  the texcoords
+ * \param size  the texture image size
+ * \param icoord0  returns first texture indexes
+ * \param icoord1  returns second texture indexes (usually icoord0 + 1)
+ * \param w  returns blend factor/weight between texture indexes
+ * Not every mode clamps: some can return indexes outside [0, size-1].
+ */
+static INLINE void
+linear_texcoord_4(unsigned wrapMode, const float s[4], unsigned size,
+                  int icoord0[4], int icoord1[4], float w[4])
+{
+   uint ch;
+
+   switch (wrapMode) {
+   case PIPE_TEX_WRAP_REPEAT:
+      for (ch = 0; ch < 4; ch++) {
+         float u = s[ch] * size - 0.5F;
+         icoord0[ch] = REMAINDER(util_ifloor(u), size);
+         icoord1[ch] = REMAINDER(icoord0[ch] + 1, size);
+         w[ch] = FRAC(u);
+      }
+      break;
+   case PIPE_TEX_WRAP_CLAMP:
+      /* coordinate is clamped, the indexes are not */
+      for (ch = 0; ch < 4; ch++) {
+         float u = CLAMP(s[ch], 0.0F, 1.0F);
+         u = u * size - 0.5f;
+         icoord0[ch] = util_ifloor(u);
+         icoord1[ch] = icoord0[ch] + 1;
+         w[ch] = FRAC(u);
+      }
+      break;
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      for (ch = 0; ch < 4; ch++) {
+         float u = CLAMP(s[ch], 0.0F, 1.0F);
+         u = u * size - 0.5f;
+         icoord0[ch] = util_ifloor(u);
+         icoord1[ch] = icoord0[ch] + 1;
+         if (icoord0[ch] < 0)
+            icoord0[ch] = 0;
+         if (icoord1[ch] >= (int) size)
+            icoord1[ch] = size - 1;
+         w[ch] = FRAC(u);
+      }
+      break;
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      {
+         const float min = -1.0F / (2.0F * size);
+         const float max = 1.0F - min;
+         for (ch = 0; ch < 4; ch++) {
+            float u = CLAMP(s[ch], min, max);
+            u = u * size - 0.5f;
+            icoord0[ch] = util_ifloor(u);
+            icoord1[ch] = icoord0[ch] + 1;
+            w[ch] = FRAC(u);
+         }
+      }
+      break;
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+      for (ch = 0; ch < 4; ch++) {
+         const int flr = util_ifloor(s[ch]);
+         float u;
+         if (flr & 1)   /* odd repetition: mirror the fractional part */
+            u = 1.0F - (s[ch] - (float) flr);
+         else
+            u = s[ch] - (float) flr;
+         u = u * size - 0.5F;
+         icoord0[ch] = util_ifloor(u);
+         icoord1[ch] = icoord0[ch] + 1;
+         if (icoord0[ch] < 0)
+            icoord0[ch] = 0;
+         if (icoord1[ch] >= (int) size)
+            icoord1[ch] = size - 1;
+         w[ch] = FRAC(u);
+      }
+      break;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+      for (ch = 0; ch < 4; ch++) {
+         float u = fabsf(s[ch]);
+         if (u >= 1.0F)
+            u = (float) size;
+         else
+            u *= size;
+         u -= 0.5F;
+         icoord0[ch] = util_ifloor(u);
+         icoord1[ch] = icoord0[ch] + 1;
+         w[ch] = FRAC(u);
+      }
+      break;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+      for (ch = 0; ch < 4; ch++) {
+         float u = fabsf(s[ch]);
+         if (u >= 1.0F)
+            u = (float) size;
+         else
+            u *= size;
+         u -= 0.5F;
+         icoord0[ch] = util_ifloor(u);
+         icoord1[ch] = icoord0[ch] + 1;
+         if (icoord0[ch] < 0)
+            icoord0[ch] = 0;
+         if (icoord1[ch] >= (int) size)
+            icoord1[ch] = size - 1;
+         w[ch] = FRAC(u);
+      }
+      break;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+      {
+         const float min = -1.0F / (2.0F * size);
+         const float max = 1.0F - min;
+         for (ch = 0; ch < 4; ch++) {
+            float u = fabsf(s[ch]);
+            if (u <= min)   /* never true: u >= 0 while min is negative */
+               u = min * size;
+            else if (u >= max)
+               u = max * size;
+            else
+               u *= size;
+            u -= 0.5F;
+            icoord0[ch] = util_ifloor(u);
+            icoord1[ch] = icoord0[ch] + 1;
+            w[ch] = FRAC(u);
+         }
+      }
+      break;
+   default:
+      assert(0);   /* unsupported wrap mode; outputs are not written */
+   }
+}
+
+
+/**
+ * Nearest-texel index selection for RECT textures / unnormalized texcoords.
+ * Only CLAMP, CLAMP_TO_EDGE and CLAMP_TO_BORDER are handled.
+ */
+static INLINE void
+nearest_texcoord_unnorm_4(unsigned wrapMode, const float s[4], unsigned size,
+                          int icoord[4])
+{
+   const int last = (int) size - 1;
+   const float hi = (float) size - 0.5F;
+   uint n;
+   if (wrapMode == PIPE_TEX_WRAP_CLAMP) {
+      /* floor first, then clamp the integer index to [0, size-1] */
+      for (n = 0; n < 4; n++) {
+         const int texel = util_ifloor(s[n]);
+         icoord[n] = CLAMP(texel, 0, last);
+      }
+   }
+   else if (wrapMode == PIPE_TEX_WRAP_CLAMP_TO_EDGE ||
+            wrapMode == PIPE_TEX_WRAP_CLAMP_TO_BORDER) {
+      for (n = 0; n < 4; n++)
+         icoord[n] = util_ifloor( CLAMP(s[n], 0.5F, hi) );
+   }
+   else {
+      assert(0);
+   }
+}
+
+
+/**
+ * Linear-filter index/weight selection for RECT textures / unnormalized
+ * texcoords.  Only CLAMP, CLAMP_TO_EDGE and CLAMP_TO_BORDER are handled.
+ */
+static INLINE void
+linear_texcoord_unnorm_4(unsigned wrapMode, const float s[4], unsigned size,
+                         int icoord0[4], int icoord1[4], float w[4])
+{
+   const int last = (int) size - 1;
+   uint n;
+
+   if (wrapMode == PIPE_TEX_WRAP_CLAMP) {
+      for (n = 0; n < 4; n++) {
+         /* Not exactly what the spec says, but it matches NVIDIA output */
+         const float u = CLAMP(s[n] - 0.5F, 0.0f, (float) size - 1.0f);
+         icoord0[n] = util_ifloor(u);
+         icoord1[n] = icoord0[n] + 1;
+         w[n] = FRAC(u);
+      }
+   }
+   else if (wrapMode == PIPE_TEX_WRAP_CLAMP_TO_EDGE ||
+            wrapMode == PIPE_TEX_WRAP_CLAMP_TO_BORDER) {
+      for (n = 0; n < 4; n++) {
+         float u = CLAMP(s[n], 0.5F, (float) size - 0.5F);
+         u -= 0.5F;
+         icoord0[n] = util_ifloor(u);
+         icoord1[n] = icoord0[n] + 1;
+         if (icoord1[n] > last)
+            icoord1[n] = last;
+         w[n] = FRAC(u);
+      }
+   }
+   else {
+      assert(0);
+   }
+}
+
+
+/**
+ * Pick the cube map face for direction (rx, ry, rz): the face of the
+ * component with the largest magnitude and that component's sign.
+ * \param newS, newT  return (sc/ma + 1)/2 and (tc/ma + 1)/2, see table below
+ * \return  the PIPE_TEX_FACE_x value of the selected face
+ */
+static unsigned
+choose_cube_face(float rx, float ry, float rz, float *newS, float *newT)
+{
+   /*
+      major axis
+      direction     target                             sc     tc    ma
+      ----------    -------------------------------    ---    ---   ---
+       +rx          TEXTURE_CUBE_MAP_POSITIVE_X_EXT    -rz    -ry   rx
+       -rx          TEXTURE_CUBE_MAP_NEGATIVE_X_EXT    +rz    -ry   rx
+       +ry          TEXTURE_CUBE_MAP_POSITIVE_Y_EXT    +rx    +rz   ry
+       -ry          TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT    +rx    -rz   ry
+       +rz          TEXTURE_CUBE_MAP_POSITIVE_Z_EXT    +rx    -ry   rz
+       -rz          TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT    -rx    -ry   rz
+   */
+   const float arx = fabsf(rx), ary = fabsf(ry), arz = fabsf(rz);
+   unsigned face;
+   float sc, tc, ma;
+
+   /* Use >= so a tie for the largest magnitude still selects a tied axis;
+    * with strict > a tie fell through to Z, whose 'ma' may be smaller or 0. */
+   if (arx >= ary && arx >= arz) {
+      ma = arx;
+      tc = -ry;
+      if (rx >= 0.0F) {
+         face = PIPE_TEX_FACE_POS_X;
+         sc = -rz;
+      }
+      else {
+         face = PIPE_TEX_FACE_NEG_X;
+         sc = rz;
+      }
+   }
+   else if (ary >= arx && ary >= arz) {
+      ma = ary;
+      sc = rx;
+      if (ry >= 0.0F) {
+         face = PIPE_TEX_FACE_POS_Y;
+         tc = rz;
+      }
+      else {
+         face = PIPE_TEX_FACE_NEG_Y;
+         tc = -rz;
+      }
+   }
+   else {
+      ma = arz;
+      tc = -ry;
+      if (rz > 0.0F) {
+         face = PIPE_TEX_FACE_POS_Z;
+         sc = rx;
+      }
+      else {
+         face = PIPE_TEX_FACE_NEG_Z;
+         sc = -rx;
+      }
+   }
+   *newS = ( sc / ma + 1.0F ) * 0.5F;   /* map [-1,1] to [0,1] */
+   *newT = ( tc / ma + 1.0F ) * 0.5F;
+   return face;
+}
+
+
+/**
+ * Compute lambda (level of detail) for a quad: the largest per-pixel change
+ * of any texcoord in X or Y, scaled by the level-0 texture size, is fed to
+ * log2; both LOD biases are added and the sum is clamped to the sampler's
+ * [min_lod, max_lod].  Vertex shaders just get 'lodbias' back.
+ */
+static float
+compute_lambda(struct tgsi_sampler *tgsi_sampler,
+               const float s[QUAD_SIZE],
+               const float t[QUAD_SIZE],
+               const float p[QUAD_SIZE],
+               float lodbias)
+{
+   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   const struct pipe_sampler_state *sampler = samp->sampler;
+   float rho, lambda;
+
+   /* no derivatives are computed for vertex shaders */
+   if (samp->processor == TGSI_PROCESSOR_VERTEX)
+      return lodbias;
+
+   assert(sampler->normalized_coords);
+   assert(s);
+
+   /* S is mandatory and seeds rho */
+   {
+      const float dsdx = fabsf(s[QUAD_BOTTOM_RIGHT] - s[QUAD_BOTTOM_LEFT]);
+      const float dsdy = fabsf(s[QUAD_TOP_LEFT]     - s[QUAD_BOTTOM_LEFT]);
+
+      rho = MAX2(dsdx, dsdy) * texture->width[0];
+   }
+
+   /* T and P are optional (may be NULL); each can only raise rho */
+   if (t) {
+      const float dtdx = fabsf(t[QUAD_BOTTOM_RIGHT] - t[QUAD_BOTTOM_LEFT]);
+      const float dtdy = fabsf(t[QUAD_TOP_LEFT]     - t[QUAD_BOTTOM_LEFT]);
+      const float scaled = MAX2(dtdx, dtdy) * texture->height[0];
+
+      rho = MAX2(rho, scaled);
+   }
+
+   if (p) {
+      const float dpdx = fabsf(p[QUAD_BOTTOM_RIGHT] - p[QUAD_BOTTOM_LEFT]);
+      const float dpdy = fabsf(p[QUAD_TOP_LEFT]     - p[QUAD_BOTTOM_LEFT]);
+      const float scaled = MAX2(dpdx, dpdy) * texture->depth[0];
+
+      rho = MAX2(rho, scaled);
+   }
+
+   lambda = util_fast_log2(rho);
+   lambda += lodbias + sampler->lod_bias;
+   lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
+
+   return lambda;
+}
+
+
+/**
+ * Do several things here:
+ * 1. Compute lambda from the texcoords, if needed
+ * 2. Determine if we're minifying or magnifying
+ * 3. If minifying, choose mipmap levels
+ * 4. Return image filter to use within mipmap images
+ * \param level0  Returns first mipmap level to sample from
+ * \param level1  Returns second mipmap level to sample from
+ * \param levelBlend  Blend factor in [0,1]; ONLY written for linear mip minification
+ * \param imgFilter  Returns either the min or mag filter, depending on lambda
+ */
+static void
+choose_mipmap_levels(struct tgsi_sampler *tgsi_sampler,
+                     const float s[QUAD_SIZE],
+                     const float t[QUAD_SIZE],
+                     const float p[QUAD_SIZE],
+                     float lodbias,
+                     unsigned *level0, unsigned *level1, float *levelBlend,
+                     unsigned *imgFilter)
+{
+   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   const struct pipe_sampler_state *sampler = samp->sampler;
+
+   if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) {
+      /* no mipmap selection needed */
+      *level0 = *level1 = CLAMP((int) sampler->min_lod,
+                                0, (int) texture->last_level);
+
+      if (sampler->min_img_filter != sampler->mag_img_filter) {
+         /* non-mipmapped texture, but still need to determine if doing
+          * minification or magnification.
+          */
+         float lambda = compute_lambda(tgsi_sampler, s, t, p, lodbias);
+         if (lambda <= 0.0) {
+            *imgFilter = sampler->mag_img_filter;
+         }
+         else {
+            *imgFilter = sampler->min_img_filter;
+         }
+      }
+      else {
+         *imgFilter = sampler->mag_img_filter;   /* min == mag, so lambda is not needed */
+      }
+   }
+   else {
+      float lambda = compute_lambda(tgsi_sampler, s, t, p, lodbias);
+
+      if (lambda <= 0.0) { /* XXX threshold depends on the filter */
+         /* magnifying */
+         *imgFilter = sampler->mag_img_filter;
+         *level0 = *level1 = 0;
+      }
+      else {
+         /* minifying */
+         *imgFilter = sampler->min_img_filter;
+
+         /* choose mipmap level(s) and compute the blend factor between them */
+         if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
+            /* Nearest mipmap level */
+            const int lvl = (int) (lambda + 0.5);   /* round to nearest (lambda > 0 here) */
+            *level0 =
+            *level1 = CLAMP(lvl, 0, (int) texture->last_level);
+         }
+         else {
+            /* Linear interpolation between mipmap levels */
+            const int lvl = (int) lambda;
+            *level0 = CLAMP(lvl,     0, (int) texture->last_level);
+            *level1 = CLAMP(lvl + 1, 0, (int) texture->last_level);
+            *levelBlend = FRAC(lambda);  /* blending weight between levels */
+         }
+      }
+   }
+}
+
+
+/**
+ * Get pointers to a 2x2 block of texels, using the texture tile cache.
+ * Only the tile containing (x, y) is looked up (with z = 0) and all four
+ * pointers index into it; the caller is expected to ensure that x+1 and
+ * y+1 fall inside the same tile.
+ *
+ * \param face  the cube face in 0..5
+ * \param level  the mipmap level
+ * \param x  the x coord of the first texel within 2D image
+ * \param y  the y coord of the first texel within 2D image
+ * \param out  returns texel pointers for (x,y), (x+1,y), (x,y+1), (x+1,y+1)
+ *
+ * XXX maybe move this into lp_tile_cache.c.
+ */
+static void
+get_texel_quad_2d(const struct tgsi_sampler *tgsi_sampler,
+                  unsigned face, unsigned level, int x, int y,
+                  const uint8_t *out[4])
+{
+   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
+   const union tex_tile_address addr = tex_tile_address(x, y, 0, face, level);
+   const struct llvmpipe_cached_tex_tile *tile
+      = lp_get_cached_tex_tile(samp->cache, addr);
+
+   const int col = x % TEX_TILE_SIZE;   /* position inside the tile */
+   const int row = y % TEX_TILE_SIZE;
+   int k;
+
+   /* bit 0 of k selects the column, bit 1 the row */
+   for (k = 0; k < 4; k++) {
+      out[k] = &tile->color[row + (k >> 1)][col + (k & 1)][0];
+   }
+}
+
+/** Return a pointer to texel (x, y) of slice 0: look up the tile that
+ *  contains it, then index by the texel's position inside that tile. */
+static INLINE const uint8_t *
+get_texel_2d_ptr(const struct tgsi_sampler *tgsi_sampler,
+                 unsigned face, unsigned level, int x, int y)
+{
+   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
+   const union tex_tile_address addr = tex_tile_address(x, y, 0, face, level);
+   const struct llvmpipe_cached_tex_tile *tile
+      = lp_get_cached_tex_tile(samp->cache, addr);
+   const int col = x % TEX_TILE_SIZE;
+   const int row = y % TEX_TILE_SIZE;
+
+   return &tile->color[row][col][0];
+}
+
+
+/**
+ * Get pointers to four texels, doing a separate tile lookup for each one,
+ * so the texels may live in different tiles.
+ */
+static void
+get_texel_quad_2d_mt(const struct tgsi_sampler *tgsi_sampler,
+                     unsigned face, unsigned level,
+                     int x0, int y0,
+                     int x1, int y1,
+                     const uint8_t *out[4])
+{
+   out[0] = get_texel_2d_ptr( tgsi_sampler, face, level, x0, y0 );
+   out[1] = get_texel_2d_ptr( tgsi_sampler, face, level, x1, y0 );
+   out[2] = get_texel_2d_ptr( tgsi_sampler, face, level, x0, y1 );
+   out[3] = get_texel_2d_ptr( tgsi_sampler, face, level, x1, y1 );
+}
+
+/**
+ * Fetch one texel as floats into element j of the rgba quad.  Coordinates
+ * outside the level's width/height/depth produce the sampler's border
+ * color; anything else is read through the texture tile cache.
+ */
+static void
+get_texel(const struct tgsi_sampler *tgsi_sampler,
+                 unsigned face, unsigned level, int x, int y, int z,
+                 float rgba[NUM_CHANNELS][QUAD_SIZE], unsigned j)
+{
+   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   const struct pipe_sampler_state *sampler = samp->sampler;
+   const struct llvmpipe_cached_tex_tile *tile;
+   unsigned tx, ty, c;
+
+   if (x < 0 || x >= (int) texture->width[level] ||
+       y < 0 || y >= (int) texture->height[level] ||
+       z < 0 || z >= (int) texture->depth[level]) {
+      for (c = 0; c < 4; c++)
+         rgba[c][j] = sampler->border_color[c];
+      return;
+   }
+
+   tx = x % TEX_TILE_SIZE;
+   ty = y % TEX_TILE_SIZE;
+   tile = lp_get_cached_tex_tile(samp->cache,
+                                 tex_tile_address(x, y, z, face, level));
+   for (c = 0; c < 4; c++)
+      rgba[c][j] = ubyte_to_float(tile->color[ty][tx][c]);
+   if (0)
+   {
+      debug_printf("Get texel %f %f %f %f from %s\n",
+                   rgba[0][j], rgba[1][j], rgba[2][j], rgba[3][j],
+                   pf_name(texture->format));
+   }
+}
+
+
+/**
+ * Shadow test for one pixel: compare texcoord p[j] (aka R) against the
+ * texel's red channel rgba[0][j] using sampler->compare_func, then
+ * overwrite the pixel with the result (RGB = 0 or 1, A = 1).
+ * \param rgba  quad of (depth) texel values; element j is overwritten
+ * \param p  texture 'P' components for four pixels in quad
+ * \param j  which pixel in the quad to test [0..3]
+ */
+static INLINE void
+shadow_compare(const struct pipe_sampler_state *sampler,
+               float rgba[NUM_CHANNELS][QUAD_SIZE],
+               const float p[QUAD_SIZE],
+               uint j)
+{
+   const float ref = p[j], texel = rgba[0][j];
+   int pass;
+   switch (sampler->compare_func) {
+   case PIPE_FUNC_LESS:
+      pass = ref < texel;
+      break;
+   case PIPE_FUNC_LEQUAL:
+      pass = ref <= texel;
+      break;
+   case PIPE_FUNC_GREATER:
+      pass = ref > texel;
+      break;
+   case PIPE_FUNC_GEQUAL:
+      pass = ref >= texel;
+      break;
+   case PIPE_FUNC_EQUAL:
+      pass = ref == texel;
+      break;
+   case PIPE_FUNC_NOTEQUAL:
+      pass = ref != texel;
+      break;
+   case PIPE_FUNC_ALWAYS:
+      pass = 1;
+      break;
+   case PIPE_FUNC_NEVER:
+      pass = 0;
+      break;
+   default:
+      pass = 0;
+      assert(0);
+      break;
+   }
+   /* XXX returning result for default GL_DEPTH_TEXTURE_MODE = GL_LUMINANCE */
+   rgba[0][j] = rgba[1][j] = rgba[2][j] = (float) pass;
+   rgba[3][j] = 1.0F;
+}
+
+
+/** As shadow_compare(), but for all four pixels at once: every pixel
+ * receives the average of the four pass/fail results (0, 0.25, ... 1).
+ */
+static INLINE void
+shadow_compare4(const struct pipe_sampler_state *sampler,
+                float rgba[NUM_CHANNELS][QUAD_SIZE],
+                const float p[QUAD_SIZE])
+{
+   int j, k0, k1, k2, k3;   /* kN: 1 if pixel N passed the comparison, else 0 */
+   float val;
+
+   /* compare four texcoords vs. four texture samples */
+   switch (sampler->compare_func) {
+   case PIPE_FUNC_LESS:
+      k0 = p[0] < rgba[0][0];
+      k1 = p[1] < rgba[0][1];
+      k2 = p[2] < rgba[0][2];
+      k3 = p[3] < rgba[0][3];
+      break;
+   case PIPE_FUNC_LEQUAL:
+      k0 = p[0] <= rgba[0][0];
+      k1 = p[1] <= rgba[0][1];
+      k2 = p[2] <= rgba[0][2];
+      k3 = p[3] <= rgba[0][3];
+      break;
+   case PIPE_FUNC_GREATER:
+      k0 = p[0] > rgba[0][0];
+      k1 = p[1] > rgba[0][1];
+      k2 = p[2] > rgba[0][2];
+      k3 = p[3] > rgba[0][3];
+      break;
+   case PIPE_FUNC_GEQUAL:
+      k0 = p[0] >= rgba[0][0];
+      k1 = p[1] >= rgba[0][1];
+      k2 = p[2] >= rgba[0][2];
+      k3 = p[3] >= rgba[0][3];
+      break;
+   case PIPE_FUNC_EQUAL:
+      k0 = p[0] == rgba[0][0];
+      k1 = p[1] == rgba[0][1];
+      k2 = p[2] == rgba[0][2];
+      k3 = p[3] == rgba[0][3];
+      break;
+   case PIPE_FUNC_NOTEQUAL:
+      k0 = p[0] != rgba[0][0];
+      k1 = p[1] != rgba[0][1];
+      k2 = p[2] != rgba[0][2];
+      k3 = p[3] != rgba[0][3];
+      break;
+   case PIPE_FUNC_ALWAYS:
+      k0 = k1 = k2 = k3 = 1;
+      break;
+   case PIPE_FUNC_NEVER:
+      k0 = k1 = k2 = k3 = 0;
+      break;
+   default:
+      k0 = k1 = k2 = k3 = 0;   /* keeps the values defined when assert() is compiled out */
+      assert(0);
+      break;
+   }
+
+   /* convert four pass/fail values to an intensity in [0,1] */
+   val = 0.25F * (k0 + k1 + k2 + k3);
+
+   /* XXX returning result for default GL_DEPTH_TEXTURE_MODE = GL_LUMINANCE */
+   for (j = 0; j < 4; j++) {
+      rgba[0][j] = rgba[1][j] = rgba[2][j] = val;   /* same value for every pixel of the quad */
+      rgba[3][j] = 1.0F;
+   }
+}
+
+
+
+/**
+ * Bilinear 2D sampling with REPEAT wrapping, using face 0 and mipmap level
+ * samp->level, whose size is (1 << (xpot - level)) x (1 << (ypot - level)).
+ * Texels are read as 8-bit values from the tile cache.
+ * The 'p' and 'lodbias' arguments are not used.
+ */
+static void
+lp_get_samples_2d_linear_repeat_POT(struct tgsi_sampler *tgsi_sampler,
+                                    const float s[QUAD_SIZE],
+                                    const float t[QUAD_SIZE],
+                                    const float p[QUAD_SIZE],
+                                    float lodbias,
+                                    float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
+   const unsigned level = samp->level;
+   const unsigned width = 1 << (samp->xpot - level);
+   const unsigned height = 1 << (samp->ypot - level);
+   /* i.e. MIN2(TEX_TILE_SIZE, size) - 1 for power-of-two sizes */
+   const unsigned xmax = (width - 1) & (TEX_TILE_SIZE - 1);
+   const unsigned ymax = (height - 1) & (TEX_TILE_SIZE - 1);
+   unsigned j;
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      /* texel-space coords, shifted by half a texel for linear filtering */
+      const float u = s[j] * width - 0.5F;
+      const float v = t[j] * height - 0.5F;
+      const int uflr = util_ifloor(u);
+      const int vflr = util_ifloor(v);
+      const float xw = u - (float)uflr;
+      const float yw = v - (float)vflr;
+      /* wrap (REPEAT) by masking with size - 1 */
+      const int x0 = uflr & (width - 1);
+      const int y0 = vflr & (height - 1);
+      const uint8_t *tx[4];
+      int c;
+
+      if (x0 < xmax && y0 < ymax) {
+         /* fetch all four texels with one tile lookup */
+         get_texel_quad_2d(tgsi_sampler, 0, level, x0, y0, tx);
+      }
+      else {
+         /* quad may wrap or straddle tiles: look each texel up separately */
+         const unsigned x1 = (x0 + 1) & (width - 1);
+         const unsigned y1 = (y0 + 1) & (height - 1);
+         get_texel_quad_2d_mt(tgsi_sampler, 0, level,
+                              x0, y0, x1, y1, tx);
+      }
+
+      /* interpolate R, G, B, A */
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = lerp_2d(xw, yw,
+                              ubyte_to_float(tx[0][c]), ubyte_to_float(tx[1][c]),
+                              ubyte_to_float(tx[2][c]), ubyte_to_float(tx[3][c]));
+      }
+   }
+}
+
+
+/**
+ * Fast path: nearest-texel sampling of a 2D texture with REPEAT wrapping
+ * on both axes, reading only the mipmap level stored in samp->level.
+ *
+ * Wrapping is a mask with (size - 1), so the level's dimensions are
+ * expected to be powers of two.  \p p and \p lodbias are not used.
+ *
+ * \param s, t   texture coordinates, one per quad element
+ * \param rgba   receives the texel color, indexed [channel][element]
+ */
+static void
+lp_get_samples_2d_nearest_repeat_POT(struct tgsi_sampler *tgsi_sampler,
+                                     const float s[QUAD_SIZE],
+                                     const float t[QUAD_SIZE],
+                                     const float p[QUAD_SIZE],
+                                     float lodbias,
+                                     float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
+   const unsigned level = samp->level;
+   const unsigned xpot = 1 << (samp->xpot - level);
+   const unsigned ypot = 1 << (samp->ypot - level);
+   const unsigned xmask = xpot - 1;
+   const unsigned ymask = ypot - 1;
+   unsigned q;
+
+   for (q = 0; q < QUAD_SIZE; q++) {
+      /* Scale the normalized coordinates to texel space. */
+      const float u = s[q] * xpot;
+      const float v = t[q] * ypot;
+
+      /* Floor, then REPEAT-wrap by masking. */
+      const int x0 = util_ifloor(u) & xmask;
+      const int y0 = util_ifloor(v) & ymask;
+
+      const uint8_t *texel = get_texel_2d_ptr(tgsi_sampler, 0, level, x0, y0);
+      int chan;
+
+      for (chan = 0; chan < 4; chan++) {
+         rgba[chan][q] = ubyte_to_float(texel[chan]);
+      }
+   }
+}
+
+
+/**
+ * Fast path: nearest-texel sampling of a 2D texture with CLAMP wrapping
+ * on both axes, reading only the mipmap level stored in samp->level.
+ *
+ * The level size is derived as 1 << (log2 size - level).  \p p and
+ * \p lodbias are not used.
+ *
+ * \param s, t   texture coordinates, one per quad element
+ * \param rgba   receives the texel color, indexed [channel][element]
+ */
+static void
+lp_get_samples_2d_nearest_clamp_POT(struct tgsi_sampler *tgsi_sampler,
+                                     const float s[QUAD_SIZE],
+                                     const float t[QUAD_SIZE],
+                                     const float p[QUAD_SIZE],
+                                     float lodbias,
+                                     float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
+   const unsigned level = samp->level;
+   const unsigned xpot = 1 << (samp->xpot - level);
+   const unsigned ypot = 1 << (samp->ypot - level);
+   const unsigned xlast = xpot - 1;
+   const unsigned ylast = ypot - 1;
+   unsigned q;
+
+   for (q = 0; q < QUAD_SIZE; q++) {
+      /* Scale the normalized coordinates to texel space. */
+      const float u = s[q] * xpot;
+      const float v = t[q] * ypot;
+      const uint8_t *texel;
+      int col, row;
+      int chan;
+
+      /* Clamp the column index to [0, width - 1]. */
+      col = util_ifloor(u);
+      if (col < 0) {
+         col = 0;
+      }
+      else if (col > xlast) {
+         col = xlast;
+      }
+
+      /* Clamp the row index to [0, height - 1]. */
+      row = util_ifloor(v);
+      if (row < 0) {
+         row = 0;
+      }
+      else if (row > ylast) {
+         row = ylast;
+      }
+
+      texel = get_texel_2d_ptr(tgsi_sampler, 0, level, col, row);
+
+      for (chan = 0; chan < 4; chan++) {
+         rgba[chan][q] = ubyte_to_float(texel[chan]);
+      }
+   }
+}
+
+
+/**
+ * Fast path: trilinear sampling (bilinear within a level, linear between
+ * levels) of a 2D texture with REPEAT wrapping.
+ *
+ * The level of detail comes from compute_lambda().  Below level 0 only
+ * the base level is sampled; at or beyond the last level only the last
+ * level is sampled; otherwise the two neighbouring levels are sampled
+ * with lp_get_samples_2d_linear_repeat_POT() and blended by the
+ * fractional part of lambda.
+ *
+ * Side effect: samp->level is overwritten with the level(s) sampled and
+ * is left at the last one used.
+ */
+static void
+lp_get_samples_2d_linear_mip_linear_repeat_POT(struct tgsi_sampler *tgsi_sampler,
+                                               const float s[QUAD_SIZE],
+                                               const float t[QUAD_SIZE],
+                                               const float p[QUAD_SIZE],
+                                               float lodbias,
+                                               float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   const float lambda = compute_lambda(tgsi_sampler, s, t, p, lodbias);
+   const int level0 = (int)lambda;
+   float fine[4][4];    /* samples from level0 */
+   float coarse[4][4];  /* samples from level0 + 1 */
+   float levelBlend;
+   int chan, q;
+
+   /* Magnification: only the base level contributes. */
+   if (lambda < 0.0) {
+      samp->level = 0;
+      lp_get_samples_2d_linear_repeat_POT( tgsi_sampler,
+                                           s, t, p, 0, rgba );
+      return;
+   }
+
+   /* Past the smallest mipmap: only the last level contributes. */
+   if (level0 >= texture->last_level) {
+      samp->level = texture->last_level;
+      lp_get_samples_2d_linear_repeat_POT( tgsi_sampler,
+                                           s, t, p, 0, rgba );
+      return;
+   }
+
+   /* Sample both neighbouring levels and blend between them. */
+   levelBlend = lambda - level0;
+
+   samp->level = level0;
+   lp_get_samples_2d_linear_repeat_POT( tgsi_sampler,
+                                        s, t, p, 0, fine );
+
+   samp->level = level0+1;
+   lp_get_samples_2d_linear_repeat_POT( tgsi_sampler,
+                                        s, t, p, 0, coarse );
+
+   for (q = 0; q < QUAD_SIZE; q++) {
+      for (chan = 0; chan < 4; chan++) {
+         rgba[chan][q] = lerp(levelBlend, fine[chan][q], coarse[chan][q]);
+      }
+   }
+}
+
+/**
+ * Common code for sampling 1D/2D/cube textures.
+ * Could probably extend for 3D...
+ *
+ * Picks the mipmap level(s) and image filter with choose_mipmap_levels(),
+ * converts the normalized coordinates to texel indices according to the
+ * sampler's wrap modes, fetches the texels and, when two mipmap levels
+ * were selected, blends the two results with levelBlend.
+ *
+ * \param s, t     normalized texture coordinates, one per quad element
+ * \param p        reference value handed to the shadow-compare helpers when
+ *                 the sampler's compare_mode is PIPE_TEX_COMPARE_R_TO_TEXTURE
+ * \param lodbias  forwarded to choose_mipmap_levels()
+ * \param rgba     receives the result, indexed [channel][quad element]
+ * \param faces    face index to fetch from, one per quad element
+ *
+ * NOTE(review): lp_get_samples_1d() and lp_get_samples_cube() pass
+ * p == NULL; confirm the shadow-compare paths below cannot be reached for
+ * those targets.
+ */
+static void
+lp_get_samples_2d_common(struct tgsi_sampler *tgsi_sampler,
+                         const float s[QUAD_SIZE],
+                         const float t[QUAD_SIZE],
+                         const float p[QUAD_SIZE],
+                         float lodbias,
+                         float rgba[NUM_CHANNELS][QUAD_SIZE],
+                         const unsigned faces[4])
+{
+   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   const struct pipe_sampler_state *sampler = samp->sampler;
+   unsigned level0, level1, j, imgFilter;
+   int width, height;
+   float levelBlend;
+
+   choose_mipmap_levels(tgsi_sampler, s, t, p, 
+                        lodbias,
+                        &level0, &level1, &levelBlend, &imgFilter);
+
+   /* Unnormalized coordinates are handled by lp_get_samples_rect(). */
+   assert(sampler->normalized_coords);
+
+   width = texture->width[level0];
+   height = texture->height[level0];
+
+   assert(width > 0);
+
+   switch (imgFilter) {
+   case PIPE_TEX_FILTER_NEAREST:
+      {
+         int x[4], y[4];
+         nearest_texcoord_4(sampler->wrap_s, s, width, x);
+         nearest_texcoord_4(sampler->wrap_t, t, height, y);
+
+         for (j = 0; j < QUAD_SIZE; j++) {
+            get_texel(tgsi_sampler, faces[j], level0, x[j], y[j], 0, rgba, j);
+            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
+               shadow_compare(sampler, rgba, p, j);
+            }
+
+            if (level0 != level1) {
+               /* get texels from second mipmap level and blend */
+               float rgba2[4][4];
+               unsigned c;
+               /* Approximate the level1 texel position by halving the
+                * level0 indices (modifies x[]/y[] in place).
+                */
+               x[j] /= 2;
+               y[j] /= 2;
+               get_texel(tgsi_sampler, faces[j], level1, x[j], y[j], 0,
+                         rgba2, j);
+               if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){
+                  shadow_compare(sampler, rgba2, p, j);
+               }
+
+               for (c = 0; c < NUM_CHANNELS; c++) {
+                  rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]);
+               }
+            }
+         }
+      }
+      break;
+   case PIPE_TEX_FILTER_LINEAR:
+   case PIPE_TEX_FILTER_ANISO:
+      /* Anisotropic filtering is treated exactly like bilinear here. */
+      {
+         int x0[4], y0[4], x1[4], y1[4];
+         float xw[4], yw[4]; /* weights */
+
+         linear_texcoord_4(sampler->wrap_s, s, width, x0, x1, xw);
+         linear_texcoord_4(sampler->wrap_t, t, height, y0, y1, yw);
+
+         for (j = 0; j < QUAD_SIZE; j++) {
+            float tx[4][4]; /* texels */
+            int c;
+            /* Fetch the 2x2 footprint into columns 0..3 of tx. */
+            get_texel(tgsi_sampler, faces[j], level0, x0[j], y0[j], 0, tx, 0);
+            get_texel(tgsi_sampler, faces[j], level0, x1[j], y0[j], 0, tx, 1);
+            get_texel(tgsi_sampler, faces[j], level0, x0[j], y1[j], 0, tx, 2);
+            get_texel(tgsi_sampler, faces[j], level0, x1[j], y1[j], 0, tx, 3);
+            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
+               shadow_compare4(sampler, tx, p);
+            }
+
+            /* interpolate R, G, B, A */
+            for (c = 0; c < 4; c++) {
+               rgba[c][j] = lerp_2d(xw[j], yw[j],
+                                    tx[c][0], tx[c][1],
+                                    tx[c][2], tx[c][3]);
+            }
+
+            if (level0 != level1) {
+               /* get texels from second mipmap level and blend */
+               float rgba2[4][4];
+
+               /* XXX: This is incorrect -- will often end up with (x0
+                *  == x1 && y0 == y1), meaning that we fetch the same
+                *  texel four times and linearly interpolate between
+                *  identical values.  The correct approach would be to
+                *  call linear_texcoord again for the second level.
+                */
+               x0[j] /= 2;
+               y0[j] /= 2;
+               x1[j] /= 2;
+               y1[j] /= 2;
+               /* tx is reused for the level1 footprint. */
+               get_texel(tgsi_sampler, faces[j], level1, x0[j], y0[j], 0, tx, 0);
+               get_texel(tgsi_sampler, faces[j], level1, x1[j], y0[j], 0, tx, 1);
+               get_texel(tgsi_sampler, faces[j], level1, x0[j], y1[j], 0, tx, 2);
+               get_texel(tgsi_sampler, faces[j], level1, x1[j], y1[j], 0, tx, 3);
+               if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){
+                  shadow_compare4(sampler, tx, p);
+               }
+
+               /* interpolate R, G, B, A */
+               for (c = 0; c < 4; c++) {
+                  rgba2[c][j] = lerp_2d(xw[j], yw[j],
+                                        tx[c][0], tx[c][1], tx[c][2], tx[c][3]);
+               }
+
+               /* blend mipmap levels */
+               for (c = 0; c < NUM_CHANNELS; c++) {
+                  rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]);
+               }
+            }
+         }
+      }
+      break;
+   default:
+      /* Unknown image filter; rgba is left unwritten. */
+      assert(0);
+   }
+}
+
+
+/**
+ * Sample a 1D texture by treating it as a 2D texture: the caller's
+ * \p t and \p p are ignored, a zero T coordinate and a NULL P are passed
+ * to the common 2D code, and face 0 is used for every quad element.
+ */
+static INLINE void
+lp_get_samples_1d(struct tgsi_sampler *sampler,
+                  const float s[QUAD_SIZE],
+                  const float t[QUAD_SIZE],
+                  const float p[QUAD_SIZE],
+                  float lodbias,
+                  float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   static const float zero_t[4] = {0, 0, 0, 0};
+   static const unsigned face_zero[4] = {0, 0, 0, 0};
+
+   lp_get_samples_2d_common(sampler,
+                            s, zero_t, NULL,
+                            lodbias,
+                            rgba,
+                            face_zero);
+}
+
+
+/**
+ * Sample a 2D texture: forwards all arguments to the common 2D code,
+ * using face 0 for every quad element.
+ */
+static INLINE void
+lp_get_samples_2d(struct tgsi_sampler *sampler,
+                  const float s[QUAD_SIZE],
+                  const float t[QUAD_SIZE],
+                  const float p[QUAD_SIZE],
+                  float lodbias,
+                  float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   static const unsigned face_zero[4] = {0, 0, 0, 0};
+
+   lp_get_samples_2d_common(sampler,
+                            s, t, p,
+                            lodbias,
+                            rgba,
+                            face_zero);
+}
+
+
+/**
+ * Sample a 3D texture.
+ *
+ * Picks the mipmap level(s) and image filter with choose_mipmap_levels(),
+ * converts the normalized (s, t, p) coordinates to texel indices using the
+ * sampler's wrap_s/wrap_t/wrap_r modes, fetches the texels and, when two
+ * mipmap levels were selected, blends them with levelBlend (weight of
+ * level1, as in the 2D code).
+ *
+ * \param s, t, p  normalized texture coordinates, one per quad element
+ * \param lodbias  forwarded to choose_mipmap_levels()
+ * \param rgba     receives the result, indexed [channel][quad element]
+ */
+static INLINE void
+lp_get_samples_3d(struct tgsi_sampler *tgsi_sampler,
+                  const float s[QUAD_SIZE],
+                  const float t[QUAD_SIZE],
+                  const float p[QUAD_SIZE],
+                  float lodbias,
+                  float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   const struct pipe_sampler_state *sampler = samp->sampler;
+   /* get/map pipe_surfaces corresponding to 3D tex slices */
+   unsigned level0, level1, j, imgFilter;
+   int width, height, depth;
+   float levelBlend;
+   const uint face = 0;
+
+   choose_mipmap_levels(tgsi_sampler, s, t, p, 
+                        lodbias,
+                        &level0, &level1, &levelBlend, &imgFilter);
+
+   assert(sampler->normalized_coords);
+
+   width = texture->width[level0];
+   height = texture->height[level0];
+   depth = texture->depth[level0];
+
+   assert(width > 0);
+   assert(height > 0);
+   assert(depth > 0);
+
+   switch (imgFilter) {
+   case PIPE_TEX_FILTER_NEAREST:
+      {
+         int x[4], y[4], z[4];
+         nearest_texcoord_4(sampler->wrap_s, s, width, x);
+         nearest_texcoord_4(sampler->wrap_t, t, height, y);
+         nearest_texcoord_4(sampler->wrap_r, p, depth, z);
+         for (j = 0; j < QUAD_SIZE; j++) {
+            get_texel(tgsi_sampler, face, level0, x[j], y[j], z[j], rgba, j);
+            if (level0 != level1) {
+               /* get texels from second mipmap level and blend */
+               float rgba2[4][4];
+               unsigned c;
+               /* Approximate the level1 texel position by halving the
+                * level0 indices.
+                */
+               x[j] /= 2;
+               y[j] /= 2;
+               z[j] /= 2;
+               get_texel(tgsi_sampler, face, level1, x[j], y[j], z[j], rgba2, j);
+               /* levelBlend weights level1 (rgba2), so level0 (rgba) is
+                * the first endpoint -- same operand order as every other
+                * mipmap blend in this file.
+                */
+               for (c = 0; c < NUM_CHANNELS; c++) {
+                  rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]);
+               }
+            }
+         }
+      }
+      break;
+   case PIPE_TEX_FILTER_LINEAR:
+   case PIPE_TEX_FILTER_ANISO:
+      /* Anisotropic filtering is treated exactly like trilinear here. */
+      {
+         int x0[4], x1[4], y0[4], y1[4], z0[4], z1[4];
+         float xw[4], yw[4], zw[4]; /* interpolation weights */
+         linear_texcoord_4(sampler->wrap_s, s, width,  x0, x1, xw);
+         linear_texcoord_4(sampler->wrap_t, t, height, y0, y1, yw);
+         linear_texcoord_4(sampler->wrap_r, p, depth,  z0, z1, zw);
+
+         for (j = 0; j < QUAD_SIZE; j++) {
+            int c;
+            /* tx0 holds the 2x2 footprint in slice z0, tx1 in slice z1 */
+            float tx0[4][4], tx1[4][4];
+            get_texel(tgsi_sampler, face, level0, x0[j], y0[j], z0[j], tx0, 0);
+            get_texel(tgsi_sampler, face, level0, x1[j], y0[j], z0[j], tx0, 1);
+            get_texel(tgsi_sampler, face, level0, x0[j], y1[j], z0[j], tx0, 2);
+            get_texel(tgsi_sampler, face, level0, x1[j], y1[j], z0[j], tx0, 3);
+            get_texel(tgsi_sampler, face, level0, x0[j], y0[j], z1[j], tx1, 0);
+            get_texel(tgsi_sampler, face, level0, x1[j], y0[j], z1[j], tx1, 1);
+            get_texel(tgsi_sampler, face, level0, x0[j], y1[j], z1[j], tx1, 2);
+            get_texel(tgsi_sampler, face, level0, x1[j], y1[j], z1[j], tx1, 3);
+
+            /* interpolate R, G, B, A */
+            for (c = 0; c < 4; c++) {
+               rgba[c][j] = lerp_3d(xw[j], yw[j], zw[j],
+                                    tx0[c][0], tx0[c][1],
+                                    tx0[c][2], tx0[c][3],
+                                    tx1[c][0], tx1[c][1],
+                                    tx1[c][2], tx1[c][3]);
+            }
+
+            if (level0 != level1) {
+               /* get texels from second mipmap level and blend */
+               float rgba2[4][4];
+               x0[j] /= 2;
+               y0[j] /= 2;
+               z0[j] /= 2;
+               x1[j] /= 2;
+               y1[j] /= 2;
+               z1[j] /= 2;
+               get_texel(tgsi_sampler, face, level1, x0[j], y0[j], z0[j], tx0, 0);
+               get_texel(tgsi_sampler, face, level1, x1[j], y0[j], z0[j], tx0, 1);
+               get_texel(tgsi_sampler, face, level1, x0[j], y1[j], z0[j], tx0, 2);
+               get_texel(tgsi_sampler, face, level1, x1[j], y1[j], z0[j], tx0, 3);
+               get_texel(tgsi_sampler, face, level1, x0[j], y0[j], z1[j], tx1, 0);
+               get_texel(tgsi_sampler, face, level1, x1[j], y0[j], z1[j], tx1, 1);
+               get_texel(tgsi_sampler, face, level1, x0[j], y1[j], z1[j], tx1, 2);
+               get_texel(tgsi_sampler, face, level1, x1[j], y1[j], z1[j], tx1, 3);
+
+               /* interpolate R, G, B, A */
+               for (c = 0; c < 4; c++) {
+                  rgba2[c][j] = lerp_3d(xw[j], yw[j], zw[j],
+                                        tx0[c][0], tx0[c][1],
+                                        tx0[c][2], tx0[c][3],
+                                        tx1[c][0], tx1[c][1],
+                                        tx1[c][2], tx1[c][3]);
+               }
+
+               /* blend mipmap levels */
+               for (c = 0; c < NUM_CHANNELS; c++) {
+                  rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]);
+               }
+            }
+         }
+      }
+      break;
+   default:
+      /* Unknown image filter; rgba is left unwritten. */
+      assert(0);
+   }
+}
+
+
+/**
+ * Sample a cube map: choose_cube_face() maps each (s, t, p) direction to
+ * a face index plus 2D coordinates on that face, which are then handed to
+ * the common 2D code (with a NULL P coordinate).
+ */
+static void
+lp_get_samples_cube(struct tgsi_sampler *sampler,
+                    const float s[QUAD_SIZE],
+                    const float t[QUAD_SIZE],
+                    const float p[QUAD_SIZE],
+                    float lodbias,
+                    float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   float face_s[4], face_t[4];
+   unsigned face_index[QUAD_SIZE];
+   unsigned q;
+
+   for (q = 0; q < QUAD_SIZE; q++) {
+      face_index[q] = choose_cube_face(s[q], t[q], p[q],
+                                       &face_s[q], &face_t[q]);
+   }
+
+   lp_get_samples_2d_common(sampler,
+                            face_s, face_t, NULL,
+                            lodbias,
+                            rgba,
+                            face_index);
+}
+
+
+/**
+ * Sample a rectangle texture, i.e. one addressed with unnormalized
+ * coordinates (lp_get_samples() selects this function when the sampler's
+ * normalized_coords flag is clear).
+ *
+ * Only a single mipmap level is sampled; the function asserts that
+ * choose_mipmap_levels() returned level0 == level1.
+ *
+ * \param s, t     unnormalized texture coordinates, one per quad element
+ * \param p        reference value handed to the shadow-compare helpers when
+ *                 the sampler's compare_mode is PIPE_TEX_COMPARE_R_TO_TEXTURE
+ * \param lodbias  forwarded to choose_mipmap_levels()
+ * \param rgba     receives the result, indexed [channel][quad element]
+ */
+static void
+lp_get_samples_rect(struct tgsi_sampler *tgsi_sampler,
+                    const float s[QUAD_SIZE],
+                    const float t[QUAD_SIZE],
+                    const float p[QUAD_SIZE],
+                    float lodbias,
+                    float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   const struct pipe_sampler_state *sampler = samp->sampler;
+   const uint face = 0;
+   unsigned level0, level1, j, imgFilter;
+   int width, height;
+   float levelBlend;
+
+   /* levelBlend is required by the call but not used afterwards. */
+   choose_mipmap_levels(tgsi_sampler, s, t, p, 
+                        lodbias,
+                        &level0, &level1, &levelBlend, &imgFilter);
+
+   /* texture RECTS cannot be mipmapped */
+   assert(level0 == level1);
+
+   width = texture->width[level0];
+   height = texture->height[level0];
+
+   assert(width > 0);
+
+   switch (imgFilter) {
+   case PIPE_TEX_FILTER_NEAREST:
+      {
+         int x[4], y[4];
+         nearest_texcoord_unnorm_4(sampler->wrap_s, s, width, x);
+         nearest_texcoord_unnorm_4(sampler->wrap_t, t, height, y);
+         for (j = 0; j < QUAD_SIZE; j++) {
+            get_texel(tgsi_sampler, face, level0, x[j], y[j], 0, rgba, j);
+            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
+               shadow_compare(sampler, rgba, p, j);
+            }
+         }
+      }
+      break;
+   case PIPE_TEX_FILTER_LINEAR:
+   case PIPE_TEX_FILTER_ANISO:
+      /* Anisotropic filtering is treated exactly like bilinear here. */
+      {
+         int x0[4], y0[4], x1[4], y1[4];
+         float xw[4], yw[4]; /* weights */
+         linear_texcoord_unnorm_4(sampler->wrap_s, s, width,  x0, x1, xw);
+         linear_texcoord_unnorm_4(sampler->wrap_t, t, height, y0, y1, yw);
+         for (j = 0; j < QUAD_SIZE; j++) {
+            float tx[4][4]; /* texels */
+            int c;
+            /* Fetch the 2x2 footprint into columns 0..3 of tx. */
+            get_texel(tgsi_sampler, face, level0, x0[j], y0[j], 0, tx, 0);
+            get_texel(tgsi_sampler, face, level0, x1[j], y0[j], 0, tx, 1);
+            get_texel(tgsi_sampler, face, level0, x0[j], y1[j], 0, tx, 2);
+            get_texel(tgsi_sampler, face, level0, x1[j], y1[j], 0, tx, 3);
+            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
+               shadow_compare4(sampler, tx, p);
+            }
+            /* interpolate R, G, B, A */
+            for (c = 0; c < 4; c++) {
+               rgba[c][j] = lerp_2d(xw[j], yw[j],
+                                    tx[c][0], tx[c][1], tx[c][2], tx[c][3]);
+            }
+         }
+      }
+      break;
+   default:
+      /* Unknown image filter; rgba is left unwritten. */
+      assert(0);
+   }
+}
+
+
+/**
+ * Error condition handler: used as the sampler function when no real one
+ * could be selected.  Ignores all inputs and returns 1.0 in every channel
+ * of every quad element.
+ */
+static INLINE void
+lp_get_samples_null(struct tgsi_sampler *tgsi_sampler,
+                    const float s[QUAD_SIZE],
+                    const float t[QUAD_SIZE],
+                    const float p[QUAD_SIZE],
+                    float lodbias,
+                    float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   int chan;
+
+   for (chan = 0; chan < 4; chan++) {
+      int q;
+
+      for (q = 0; q < 4; q++) {
+         rgba[chan][q] = 1.0;
+      }
+   }
+}
+
+/**
+ * Called via tgsi_sampler::get_samples() when using a sampler for the
+ * first time.  Determine the actual sampler function, link it in and
+ * call it.
+ *
+ * Selection order:
+ *  - no texture bound: lp_get_samples_null();
+ *  - unnormalized coordinates: lp_get_samples_rect();
+ *  - otherwise the generic function for the texture target;
+ *  - finally, for 2D textures with power-of-two dimensions and a matching
+ *    filter/wrap combination, one of the specialised *_POT functions.
+ *
+ * Side effects: overwrites tgsi_sampler->get_samples, and sets samp->xpot,
+ * samp->ypot and (for the non-mipmapped fast paths) samp->level.
+ */
+void
+lp_get_samples(struct tgsi_sampler *tgsi_sampler,
+               const float s[QUAD_SIZE],
+               const float t[QUAD_SIZE],
+               const float p[QUAD_SIZE],
+               float lodbias,
+               float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   const struct pipe_sampler_state *sampler = samp->sampler;
+   unsigned width0, height0;
+   int pot_dims;
+
+   /* Default to the 'undefined' case:
+    */
+   tgsi_sampler->get_samples = lp_get_samples_null;
+
+   if (!texture) {
+      assert(0);                /* is this legal?? */
+      goto out;
+   }
+
+   if (!sampler->normalized_coords) {
+      assert (texture->target == PIPE_TEXTURE_2D);
+      tgsi_sampler->get_samples = lp_get_samples_rect;
+      goto out;
+   }
+
+   switch (texture->target) {
+   case PIPE_TEXTURE_1D:
+      tgsi_sampler->get_samples = lp_get_samples_1d;
+      break;
+   case PIPE_TEXTURE_2D:
+      tgsi_sampler->get_samples = lp_get_samples_2d;
+      break;
+   case PIPE_TEXTURE_3D:
+      tgsi_sampler->get_samples = lp_get_samples_3d;
+      break;
+   case PIPE_TEXTURE_CUBE:
+      tgsi_sampler->get_samples = lp_get_samples_cube;
+      break;
+   default:
+      /* Unknown target: the null sampler stays installed. */
+      assert(0);
+      break;
+   }
+
+   width0 = texture->width[0];
+   height0 = texture->height[0];
+
+   /* The *_POT samplers reconstruct the level size as 1 << log2(size) and
+    * wrap by masking with (size - 1); both are only valid when the base
+    * dimensions are non-zero powers of two.
+    */
+   pot_dims = width0 != 0 && height0 != 0 &&
+              (width0 & (width0 - 1)) == 0 &&
+              (height0 & (height0 - 1)) == 0;
+
+   /* Do this elsewhere: 
+    */
+   samp->xpot = util_unsigned_logbase2( width0 );
+   samp->ypot = util_unsigned_logbase2( height0 );
+
+   /* Try to hook in a faster sampler.  Ultimately we'll have to
+    * code-generate these.  Luckily most of this looks like it is
+    * orthogonal state within the sampler.
+    */
+   if (texture->target == PIPE_TEXTURE_2D &&
+       pot_dims &&
+       sampler->min_img_filter == sampler->mag_img_filter &&
+       sampler->wrap_s == sampler->wrap_t &&
+       sampler->compare_mode == FALSE &&
+       sampler->normalized_coords) 
+   {
+      if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) {
+         /* No mipmapping: the level is fixed once, from min_lod. */
+         samp->level = CLAMP((int) sampler->min_lod,
+                             0, (int) texture->last_level);
+
+         if (sampler->wrap_s == PIPE_TEX_WRAP_REPEAT) {
+            switch (sampler->min_img_filter) {
+            case PIPE_TEX_FILTER_NEAREST:
+               tgsi_sampler->get_samples = lp_get_samples_2d_nearest_repeat_POT;
+               break;
+            case PIPE_TEX_FILTER_LINEAR:
+               tgsi_sampler->get_samples = lp_get_samples_2d_linear_repeat_POT;
+               break;
+            default:
+               break;
+            }
+         } 
+         else if (sampler->wrap_s == PIPE_TEX_WRAP_CLAMP) {
+            switch (sampler->min_img_filter) {
+            case PIPE_TEX_FILTER_NEAREST:
+               tgsi_sampler->get_samples = lp_get_samples_2d_nearest_clamp_POT;
+               break;
+            default:
+               break;
+            }
+         }
+      }
+      else if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+         if (sampler->wrap_s == PIPE_TEX_WRAP_REPEAT) {
+            switch (sampler->min_img_filter) {
+            case PIPE_TEX_FILTER_LINEAR:
+               tgsi_sampler->get_samples = lp_get_samples_2d_linear_mip_linear_repeat_POT;
+               break;
+            default:
+               break;
+            }
+         } 
+      }
+   }
+   else if (0) {
+      /* Disabled debug aid: prints why the fast path was not taken. */
+      _debug_printf("target %d/%d min_mip %d/%d min_img %d/%d wrap %d/%d compare %d/%d norm %d/%d\n",
+                    texture->target, PIPE_TEXTURE_2D,
+                    sampler->min_mip_filter, PIPE_TEX_MIPFILTER_NONE,
+                    sampler->min_img_filter, sampler->mag_img_filter,
+                    sampler->wrap_s, sampler->wrap_t,
+                    sampler->compare_mode, FALSE,
+                    sampler->normalized_coords, TRUE);
+   }
+
+out:
+   /* Run the function that was just linked in for this first call. */
+   tgsi_sampler->get_samples( tgsi_sampler, s, t, p, lodbias, rgba );
+}
+
+
+/**
+ * Entry point for texture sampling through the C samplers: looks up the
+ * sampler for \p unit, runs its get_samples() function and writes the
+ * result back over the input.
+ *
+ * \param samplers  array of sampler pointers, indexed by unit
+ * \param unit      index of the sampler to use
+ * \param store     in/out buffer: on entry the S, T and P coordinates are
+ *                  read starting at store[0], store[4] and store[8]; on
+ *                  return it holds the sampled rgba values (sizeof rgba
+ *                  bytes are copied back)
+ */
+void PIPE_CDECL
+lp_fetch_texel_soa( struct tgsi_sampler **samplers,
+                    uint32_t unit,
+                    float *store )
+{
+   struct tgsi_sampler *sampler = samplers[unit];
+
+#if 0
+   uint j;
+
+   debug_printf("%s sampler: %p (%p) store: %p\n",
+                __FUNCTION__,
+                sampler, *sampler,
+                store );
+
+   debug_printf("lodbias %f\n", store[12]);
+
+   for (j = 0; j < 4; j++)
+      debug_printf("sample %d texcoord %f %f\n",
+                   j,
+                   store[0+j],
+                   store[4+j]);
+#endif
+
+   {
+      /* Sample into a temporary so the coordinates in store are not
+       * overwritten while get_samples() is still reading them.
+       */
+      float rgba[NUM_CHANNELS][QUAD_SIZE];
+      sampler->get_samples(sampler,
+                           &store[0],
+                           &store[4],
+                           &store[8],
+                           0.0f, /*store[12],  lodbias */
+                           rgba);
+      memcpy(store, rgba, sizeof rgba);
+   }
+
+#if 0
+   for (j = 0; j < 4; j++)
+      debug_printf("sample %d result %f %f %f %f\n",
+                   j,
+                   store[0+j],
+                   store[4+j],
+                   store[8+j],
+                   store[12+j]);
+#endif
+}
+
+
+#include "lp_bld_type.h"
+#include "lp_bld_intr.h"
+#include "lp_bld_tgsi.h"
+
+
+/**
+ * Sampler code generator that emits a call to the "fetch_texel"
+ * intrinsic instead of generating sampling code inline.
+ */
+struct lp_c_sampler_soa
+{
+   /** Interface seen by the TGSI translator; must be the first member
+    *  because the callbacks cast the base pointer back to this struct. */
+   struct lp_build_sampler_soa base;
+
+   /** JIT context pointer supplied at creation time */
+   LLVMValueRef context_ptr;
+
+   /** Samplers array pointer, fetched from the context on first use */
+   LLVMValueRef samplers_ptr;
+
+   /** Coords/texels store */
+   LLVMValueRef store_ptr;
+};
+
+
+/**
+ * Destroy callback: releases the object allocated by
+ * lp_c_sampler_soa_create().
+ */
+static void
+lp_c_sampler_soa_destroy(struct lp_build_sampler_soa *sampler)
+{
+   FREE(sampler);
+}
+
+
+/**
+ * Emit the code for one texture fetch: store the coordinates into a
+ * scratch array, call the "fetch_texel" intrinsic with (samplers, unit,
+ * store), then load the NUM_CHANNELS result vectors back from the same
+ * array.
+ *
+ * \param unit        sampler unit, passed to the intrinsic as a constant
+ * \param num_coords  number of entries of \p coords that are stored
+ * \param coords      coordinate vectors
+ * \param texel       receives NUM_CHANNELS result vectors
+ *
+ * \p type and \p lodbias are not used by this implementation.
+ *
+ * NOTE(review): samplers_ptr and store_ptr are built once, at whatever
+ * position the builder has on the first call, and reused afterwards;
+ * confirm that position dominates all later uses.
+ */
+static void
+lp_c_sampler_soa_emit_fetch_texel(struct lp_build_sampler_soa *_sampler,
+                                  LLVMBuilderRef builder,
+                                  struct lp_type type,
+                                  unsigned unit,
+                                  unsigned num_coords,
+                                  const LLVMValueRef *coords,
+                                  LLVMValueRef lodbias,
+                                  LLVMValueRef *texel)
+{
+   struct lp_c_sampler_soa *sampler = (struct lp_c_sampler_soa *)_sampler;
+   LLVMTypeRef vec_type = LLVMTypeOf(coords[0]);
+   LLVMValueRef args[3];
+   unsigned i;
+
+   /* Lazily fetch the samplers array pointer from the JIT context. */
+   if(!sampler->samplers_ptr)
+      sampler->samplers_ptr = lp_jit_context_samplers(builder, sampler->context_ptr);
+
+   /* Lazily allocate a 4-vector scratch array shared by inputs and outputs. */
+   if(!sampler->store_ptr)
+      sampler->store_ptr = LLVMBuildArrayAlloca(builder,
+                                            vec_type,
+                                            LLVMConstInt(LLVMInt32Type(), 4, 0),
+                                            "texel_store");
+
+   /* Write coordinate i into store[i]; slots >= num_coords are not written. */
+   for (i = 0; i < num_coords; i++) {
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      LLVMValueRef coord_ptr = LLVMBuildGEP(builder, sampler->store_ptr, &index, 1, "");
+      LLVMBuildStore(builder, coords[i], coord_ptr);
+   }
+
+   args[0] = sampler->samplers_ptr;
+   args[1] = LLVMConstInt(LLVMInt32Type(), unit, 0);
+   args[2] = sampler->store_ptr;
+
+   lp_build_intrinsic(builder, "fetch_texel", LLVMVoidType(), args, 3);
+
+   /* Read channel i of the result back from store[i]. */
+   for (i = 0; i < NUM_CHANNELS; ++i) {
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      LLVMValueRef texel_ptr = LLVMBuildGEP(builder, sampler->store_ptr, &index, 1, "");
+      texel[i] = LLVMBuildLoad(builder, texel_ptr, "");
+   }
+}
+
+
+/**
+ * Create a sampler code generator that calls out to the C samplers.
+ *
+ * \param context_ptr  LLVM value for the JIT context pointer
+ * \return the generator's base interface, or NULL if allocation fails;
+ *         release it through its destroy callback
+ */
+struct lp_build_sampler_soa *
+lp_c_sampler_soa_create(LLVMValueRef context_ptr)
+{
+   struct lp_c_sampler_soa *c_sampler = CALLOC_STRUCT(lp_c_sampler_soa);
+
+   if (c_sampler == NULL) {
+      return NULL;
+   }
+
+   /* samplers_ptr and store_ptr stay zeroed until the first fetch. */
+   c_sampler->context_ptr = context_ptr;
+   c_sampler->base.emit_fetch_texel = lp_c_sampler_soa_emit_fetch_texel;
+   c_sampler->base.destroy = lp_c_sampler_soa_destroy;
+
+   return &c_sampler->base;
+}
+
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample_llvm.c b/src/gallium/drivers/llvmpipe/lp_tex_sample_llvm.c
new file mode 100644
index 0000000000..d2a6ae21f5
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample_llvm.c
@@ -0,0 +1,196 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 VMware, Inc.
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Texture sampling code generation
+ *
+ * This file is nothing more than ugly glue between three largely independent
+ * entities:
+ * - TGSI -> LLVM translation (i.e., lp_build_tgsi_soa)
+ * - texture sampling code generation (i.e., lp_build_sample_soa)
+ * - LLVM pipe driver
+ *
+ * All interesting code is in the functions mentioned above. There is really
+ * nothing to see here.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_shader_tokens.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_type.h"
+#include "lp_bld_intr.h"
+#include "lp_bld_sample.h"
+#include "lp_bld_tgsi.h"
+#include "lp_state.h"
+#include "lp_tex_sample.h"
+
+
+/**
+ * This provides the bridge between the sampler state stored in
+ * lp_jit_context and lp_jit_texture and the sampler code generator. It
+ * provides the texture layout information required by the texture sampler
+ * code generator in terms of the state stored in lp_jit_context and
+ * lp_jit_texture at runtime.
+ */
+struct llvmpipe_sampler_dynamic_state
+{
+   /** Callback table seen by the sampler code generator; must be the
+    *  first member because callbacks cast the base pointer back. */
+   struct lp_sampler_dynamic_state base;
+
+   /** Static sampler state array, indexed by sampler unit */
+   const struct lp_sampler_static_state *static_state;
+
+   /** LLVM value for the JIT context pointer */
+   LLVMValueRef context_ptr;
+};
+
+
+/**
+ * This is the bridge between our sampler and the TGSI translator.
+ */
+struct lp_llvm_sampler_soa
+{
+   /** Interface seen by the TGSI translator; must be the first member
+    *  because the callbacks cast the base pointer back to this struct. */
+   struct lp_build_sampler_soa base;
+
+   /** Texture state accessors handed to lp_build_sample_soa() */
+   struct llvmpipe_sampler_dynamic_state dynamic_state;
+};
+
+
+/**
+ * Fetch the specified member of the lp_jit_texture structure.
+ *
+ * Emits a GEP into context->textures[unit] followed by a load.
+ *
+ * @param base          dynamic state; must actually be a
+ *                      llvmpipe_sampler_dynamic_state
+ * @param builder       LLVM builder used to emit the instructions
+ * @param unit          texture unit, must be < PIPE_MAX_SAMPLERS
+ * @param member_index  index of the member within lp_jit_texture
+ * @param member_name   only used to name the resulting LLVM value
+ * @return the loaded member value
+ *
+ * @sa http://llvm.org/docs/GetElementPtr.html
+ */
+static LLVMValueRef
+lp_llvm_texture_member(struct lp_sampler_dynamic_state *base,
+                       LLVMBuilderRef builder,
+                       unsigned unit,
+                       unsigned member_index,
+                       const char *member_name)
+{
+   struct llvmpipe_sampler_dynamic_state *state = (struct llvmpipe_sampler_dynamic_state *)base;
+   LLVMValueRef indices[4];
+   LLVMValueRef ptr;
+   LLVMValueRef res;
+
+   assert(unit < PIPE_MAX_SAMPLERS);
+
+   /* context[0] */
+   indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   /* context[0].textures */
+   indices[1] = LLVMConstInt(LLVMInt32Type(), LP_JIT_CONTEXT_TEXTURES_INDEX, 0);
+   /* context[0].textures[unit] */
+   indices[2] = LLVMConstInt(LLVMInt32Type(), unit, 0);
+   /* context[0].textures[unit].member */
+   indices[3] = LLVMConstInt(LLVMInt32Type(), member_index, 0);
+
+   /* Address of the member, then its value. */
+   ptr = LLVMBuildGEP(builder, state->context_ptr, indices, Elements(indices), "");
+
+   res = LLVMBuildLoad(builder, ptr, "");
+
+   /* Give the value a readable name for IR dumps. */
+   lp_build_name(res, "context.texture%u.%s", unit, member_name);
+
+   return res;
+}
+
+
+/**
+ * Helper macro to instantiate the functions that generate the code to fetch
+ * the members of lp_jit_texture to fulfill the sampler code generator requests.
+ *
+ * This complexity is the price we have to pay to keep the texture sampler code
+ * generator a reusable module without dependencies to llvmpipe internals.
+ *
+ * LP_LLVM_TEXTURE_MEMBER(foo, IDX) defines a static function
+ * lp_llvm_texture_foo(base, builder, unit) that returns
+ * lp_llvm_texture_member(base, builder, unit, IDX, "foo").
+ */
+#define LP_LLVM_TEXTURE_MEMBER(_name, _index) \
+   static LLVMValueRef \
+   lp_llvm_texture_##_name( struct lp_sampler_dynamic_state *base, \
+                            LLVMBuilderRef builder, \
+                            unsigned unit) \
+   { \
+      return lp_llvm_texture_member(base, builder, unit, _index, #_name ); \
+   }
+
+
+/* One accessor per lp_jit_texture member the sampler generator asks for. */
+LP_LLVM_TEXTURE_MEMBER(width,    LP_JIT_TEXTURE_WIDTH)
+LP_LLVM_TEXTURE_MEMBER(height,   LP_JIT_TEXTURE_HEIGHT)
+LP_LLVM_TEXTURE_MEMBER(stride,   LP_JIT_TEXTURE_STRIDE)
+LP_LLVM_TEXTURE_MEMBER(data_ptr, LP_JIT_TEXTURE_DATA)
+
+
+/**
+ * Destroy callback: releases the object allocated by
+ * lp_llvm_sampler_soa_create().
+ */
+static void
+lp_llvm_sampler_soa_destroy(struct lp_build_sampler_soa *sampler)
+{
+   FREE(sampler);
+}
+
+
+/**
+ * Fetch-texel callback: forwards the request to lp_build_sample_soa(),
+ * supplying the static state for \p unit and this sampler's dynamic
+ * state accessors.
+ *
+ * \param unit  sampler unit, must be < PIPE_MAX_SAMPLERS
+ */
+static void
+lp_llvm_sampler_soa_emit_fetch_texel(struct lp_build_sampler_soa *base,
+                                     LLVMBuilderRef builder,
+                                     struct lp_type type,
+                                     unsigned unit,
+                                     unsigned num_coords,
+                                     const LLVMValueRef *coords,
+                                     LLVMValueRef lodbias,
+                                     LLVMValueRef *texel)
+{
+   struct lp_llvm_sampler_soa *llvm_sampler = (struct lp_llvm_sampler_soa *)base;
+   const struct lp_sampler_static_state *unit_state;
+   struct lp_sampler_dynamic_state *dynamic;
+
+   assert(unit < PIPE_MAX_SAMPLERS);
+
+   unit_state = &llvm_sampler->dynamic_state.static_state[unit];
+   dynamic = &llvm_sampler->dynamic_state.base;
+
+   lp_build_sample_soa(builder,
+                       unit_state,
+                       dynamic,
+                       type,
+                       unit,
+                       num_coords,
+                       coords,
+                       lodbias,
+                       texel);
+}
+
+
+/**
+ * Create a sampler code generator that emits the sampling code inline
+ * through lp_build_sample_soa().
+ *
+ * \param static_state  static sampler state array, indexed by unit; only
+ *                      the pointer is stored, so it must outlive the
+ *                      returned object
+ * \param context_ptr   LLVM value for the JIT context pointer
+ * \return the generator's base interface, or NULL if allocation fails;
+ *         release it through its destroy callback
+ */
+struct lp_build_sampler_soa *
+lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state,
+                           LLVMValueRef context_ptr)
+{
+   struct lp_llvm_sampler_soa *llvm_sampler = CALLOC_STRUCT(lp_llvm_sampler_soa);
+   struct llvmpipe_sampler_dynamic_state *dyn;
+
+   if (llvm_sampler == NULL) {
+      return NULL;
+   }
+
+   /* Callbacks used by the TGSI translator. */
+   llvm_sampler->base.destroy = lp_llvm_sampler_soa_destroy;
+   llvm_sampler->base.emit_fetch_texel = lp_llvm_sampler_soa_emit_fetch_texel;
+
+   /* Accessors used by the sampler code generator. */
+   dyn = &llvm_sampler->dynamic_state;
+   dyn->base.width = lp_llvm_texture_width;
+   dyn->base.height = lp_llvm_texture_height;
+   dyn->base.stride = lp_llvm_texture_stride;
+   dyn->base.data_ptr = lp_llvm_texture_data_ptr;
+   dyn->static_state = static_state;
+   dyn->context_ptr = context_ptr;
+
+   return &llvm_sampler->base;
+}
+
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c
new file mode 100644
index 0000000000..724d437833
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -0,0 +1,429 @@
+/**************************************************************************
+ * 
+ * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  *   Michel Dänzer <michel@tungstengraphics.com>
+  */
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+#include "pipe/internal/p_winsys_screen.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "lp_context.h"
+#include "lp_state.h"
+#include "lp_texture.h"
+#include "lp_tex_cache.h"
+#include "lp_screen.h"
+#include "lp_winsys.h"
+
+
+/* Simple, maximally packed layout.
+ */
+
+
+/* Lay out every mipmap level of a non-display texture back to back in one
+ * align_malloc'ed buffer.  Returns non-zero if the allocation succeeded. */
+static boolean
+llvmpipe_texture_layout(struct llvmpipe_screen *screen,
+                        struct llvmpipe_texture * lpt)
+{
+   struct pipe_texture *tex = &lpt->base;
+   unsigned w = tex->width[0];
+   unsigned h = tex->height[0];
+   unsigned d = tex->depth[0];
+   unsigned total = 0;   /* size of the levels laid out so far */
+   unsigned lvl;
+
+   pf_get_block(tex->format, &tex->block);
+
+   for (lvl = 0; lvl <= tex->last_level; lvl++) {
+      /* cube maps take six images per level, everything else 'd' of them */
+      const unsigned layers = (tex->target == PIPE_TEXTURE_CUBE) ? 6 : d;
+
+      tex->width[lvl] = w;
+      tex->height[lvl] = h;
+      tex->depth[lvl] = d;
+      tex->nblocksx[lvl] = pf_get_nblocksx(&tex->block, w);
+      tex->nblocksy[lvl] = pf_get_nblocksy(&tex->block, h);
+
+      /* each row of blocks is padded to a multiple of 16 */
+      lpt->stride[lvl] = align(tex->nblocksx[lvl]*tex->block.size, 16);
+      lpt->level_offset[lvl] = total;
+      total += tex->nblocksy[lvl] * layers * lpt->stride[lvl];
+
+      w = minify(w);
+      h = minify(h);
+      d = minify(d);
+   }
+
+   lpt->data = align_malloc(total, 16);
+
+   return lpt->data != NULL;
+}
+
+static boolean
+llvmpipe_displaytarget_layout(struct llvmpipe_screen *screen,
+                              struct llvmpipe_texture * lpt)
+{
+   struct llvmpipe_winsys *ws = screen->winsys;
+   struct pipe_texture *tex = &lpt->base;
+
+   /* Only level 0 is filled in on this path. */
+   pf_get_block(tex->format, &tex->block);
+   tex->nblocksx[0] = pf_get_nblocksx(&tex->block, tex->width[0]);
+   tex->nblocksy[0] = pf_get_nblocksy(&tex->block, tex->height[0]);
+
+   /* The winsys creates the display target; the level 0 stride slot is
+    * passed by address, and 16 is the fifth argument (presumably alignment). */
+   lpt->dt = ws->displaytarget_create(ws, tex->format,
+                                      tex->width[0], tex->height[0],
+                                      16, &lpt->stride[0]);
+   return lpt->dt != NULL;
+}
+
+
+
+
+
+static struct pipe_texture *
+llvmpipe_texture_create(struct pipe_screen *_screen,
+                        const struct pipe_texture *templat)
+{
+   struct llvmpipe_screen *screen = llvmpipe_screen(_screen);
+   struct llvmpipe_texture *lpt = CALLOC_STRUCT(llvmpipe_texture);
+   const unsigned display_usage = (PIPE_TEXTURE_USAGE_DISPLAY_TARGET |
+                                   PIPE_TEXTURE_USAGE_PRIMARY);
+   boolean ok;
+
+   if (!lpt)
+      return NULL;
+
+   lpt->base = *templat;
+   pipe_reference_init(&lpt->base.reference, 1);
+   lpt->base.screen = &screen->base;
+
+   /* XXX: The xlib state tracker is brain-dead and will request
+    * PIPE_FORMAT_Z16_UNORM no matter how much we tell it we don't support it.
+    */
+   if(lpt->base.format == PIPE_FORMAT_Z16_UNORM)
+      lpt->base.format = PIPE_FORMAT_Z32_UNORM;
+
+   /* Display/primary textures go through the winsys, the rest are malloc'ed. */
+   if (lpt->base.tex_usage & display_usage)
+      ok = llvmpipe_displaytarget_layout(screen, lpt);
+   else
+      ok = llvmpipe_texture_layout(screen, lpt);
+
+   if (ok)
+      return &lpt->base;
+
+   FREE(lpt);
+   return NULL;
+}
+
+
+static struct pipe_texture *
+llvmpipe_texture_blanket(struct pipe_screen * screen,
+                         const struct pipe_texture *base,
+                         const unsigned *stride,
+                         struct pipe_buffer *buffer)
+{
+   /* FIXME: not implemented -- always returns NULL; the old code is kept under #if 0 */
+#if 0
+   struct llvmpipe_texture *lpt;
+   assert(screen);
+
+   /* Only supports one type */
+   if (base->target != PIPE_TEXTURE_2D ||
+       base->last_level != 0 ||
+       base->depth[0] != 1) {
+      return NULL;
+   }
+
+   lpt = CALLOC_STRUCT(llvmpipe_texture);
+   if (!lpt)
+      return NULL;
+
+   lpt->base = *base;
+   pipe_reference_init(&lpt->base.reference, 1);
+   lpt->base.screen = screen;
+   lpt->base.nblocksx[0] = pf_get_nblocksx(&lpt->base.block, lpt->base.width[0]);  
+   lpt->base.nblocksy[0] = pf_get_nblocksy(&lpt->base.block, lpt->base.height[0]);  
+   lpt->stride[0] = stride[0];
+
+   pipe_buffer_reference(&lpt->buffer, buffer);
+
+   return &lpt->base;
+#else
+   return NULL;
+#endif
+}
+
+
+static void
+llvmpipe_texture_destroy(struct pipe_texture *pt)
+{
+   struct llvmpipe_screen *screen = llvmpipe_screen(pt->screen);
+   struct llvmpipe_texture *lpt = llvmpipe_texture(pt);
+
+   /* Free the backing store: lpt->data, or the winsys display target. */
+   if(!lpt->dt)
+      align_free(lpt->data);
+   else {
+      struct llvmpipe_winsys *ws = screen->winsys;
+      ws->displaytarget_destroy(ws, lpt->dt);
+   }
+   FREE(lpt);
+}
+
+
+static struct pipe_surface *
+llvmpipe_get_tex_surface(struct pipe_screen *screen,
+                         struct pipe_texture *pt,
+                         unsigned face, unsigned level, unsigned zslice,
+                         unsigned usage)
+{
+   struct llvmpipe_texture *lpt = llvmpipe_texture(pt);
+   struct pipe_surface *surf;
+
+   assert(level <= pt->last_level);
+
+   surf = CALLOC_STRUCT(pipe_surface);
+   if (!surf)
+      return NULL;
+
+   pipe_reference_init(&surf->reference, 1);
+   pipe_texture_reference(&surf->texture, pt);
+   surf->format = pt->format;
+   surf->width = pt->width[level];
+   surf->height = pt->height[level];
+   surf->face = face;
+   surf->level = level;
+   surf->zslice = zslice;
+   surf->usage = usage;
+
+   /* Because we are llvmpipe, anything that the state tracker
+    * thought was going to be done with the GPU will actually get
+    * done with the CPU.  Let's adjust the flags to take that into
+    * account.
+    */
+   if (surf->usage & PIPE_BUFFER_USAGE_GPU_WRITE) {
+      /* GPU_WRITE means "render" and that can involve reads (blending) */
+      surf->usage |= PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_CPU_READ;
+   }
+   if (surf->usage & PIPE_BUFFER_USAGE_GPU_READ)
+      surf->usage |= PIPE_BUFFER_USAGE_CPU_READ;
+
+   if (surf->usage & (PIPE_BUFFER_USAGE_CPU_WRITE |
+                      PIPE_BUFFER_USAGE_GPU_WRITE)) {
+      /* Mark the surface as dirty.  The tile cache will look for this. */
+      lpt->timestamp++;
+      llvmpipe_screen(screen)->timestamp++;
+   }
+
+   /* Offset of the level, plus the selected cube face or 3D slice. */
+   surf->offset = lpt->level_offset[level];
+   if (pt->target == PIPE_TEXTURE_CUBE)
+      surf->offset += face * pt->nblocksy[level] * lpt->stride[level];
+   else if (pt->target == PIPE_TEXTURE_3D)
+      surf->offset += zslice * pt->nblocksy[level] * lpt->stride[level];
+   else {
+      /* other targets have a single image per level */
+      assert(face == 0);
+      assert(zslice == 0);
+   }
+
+   return surf;
+}
+
+
+static void 
+llvmpipe_tex_surface_destroy(struct pipe_surface *surf)
+{
+   /* Effectively do the texture_update work here - if texture images
+    * needed post-processing to put them into hardware layout, this is
+    * where it would happen.  For llvmpipe, nothing to do.
+    */
+   assert(surf->texture);
+   pipe_texture_reference(&surf->texture, NULL);  /* drop the reference taken by llvmpipe_get_tex_surface() */
+   FREE(surf);
+}
+
+
+static struct pipe_transfer *
+llvmpipe_get_tex_transfer(struct pipe_screen *screen,
+                          struct pipe_texture *texture,
+                          unsigned face, unsigned level, unsigned zslice,
+                          enum pipe_transfer_usage usage,
+                          unsigned x, unsigned y, unsigned w, unsigned h)
+{
+   struct llvmpipe_texture *lptex = llvmpipe_texture(texture);
+   struct llvmpipe_transfer *xfer;
+   struct pipe_transfer *base;
+
+   assert(texture);
+   assert(level <= texture->last_level);
+
+   xfer = CALLOC_STRUCT(llvmpipe_transfer);
+   if (!xfer)
+      return NULL;
+   base = &xfer->base;
+
+   /* Reference released again by llvmpipe_tex_transfer_destroy(). */
+   pipe_texture_reference(&base->texture, texture);
+   base->format = texture->format;
+   base->block = texture->block;
+   base->nblocksx = texture->nblocksx[level];
+   base->nblocksy = texture->nblocksy[level];
+   base->stride = lptex->stride[level];
+   base->usage = usage;
+   base->face = face;
+   base->level = level;
+   base->zslice = zslice;
+   base->x = x;
+   base->y = y;
+   base->width = w;
+   base->height = h;
+
+   /* Offset of the level, plus the selected cube face or 3D slice. */
+   xfer->offset = lptex->level_offset[level];
+   if (texture->target == PIPE_TEXTURE_CUBE)
+      xfer->offset += face * base->nblocksy * base->stride;
+   else if (texture->target == PIPE_TEXTURE_3D)
+      xfer->offset += zslice * base->nblocksy * base->stride;
+   else {
+      assert(face == 0);
+      assert(zslice == 0);
+   }
+   return base;
+}
+
+
+static void 
+llvmpipe_tex_transfer_destroy(struct pipe_transfer *transfer)
+{
+   /* Effectively do the texture_update work here - if texture images
+    * needed post-processing to put them into hardware layout, this is
+    * where it would happen.  For llvmpipe, nothing to do.
+    */
+   assert (transfer->texture);
+   pipe_texture_reference(&transfer->texture, NULL);  /* drop the reference taken by llvmpipe_get_tex_transfer() */
+   FREE(transfer);
+}
+
+
+static void *
+llvmpipe_transfer_map( struct pipe_screen *_screen,
+                       struct pipe_transfer *transfer )
+{
+   struct llvmpipe_screen *screen = llvmpipe_screen(_screen);
+   const boolean writes = transfer->usage != PIPE_TRANSFER_READ;
+   const boolean reads = transfer->usage != PIPE_TRANSFER_WRITE;
+   struct llvmpipe_texture *lpt;
+   ubyte *base;
+
+   assert(transfer->texture);
+   lpt = llvmpipe_texture(transfer->texture);
+
+   if(!lpt->dt)
+      base = lpt->data;
+   else {
+      /* Display targets are mapped through the winsys with CPU access flags. */
+      struct llvmpipe_winsys *ws = screen->winsys;
+      unsigned access = 0;
+
+      if (writes)
+         access |= PIPE_BUFFER_USAGE_CPU_WRITE;
+      if (reads)
+         access |= PIPE_BUFFER_USAGE_CPU_READ;
+
+      base = ws->displaytarget_map(ws, lpt->dt, access);
+      if (base == NULL)
+         return NULL;
+   }
+
+   /* May want to do different things here depending on read/write nature
+    * of the map:
+    */
+   if (writes) {
+      /* Do something to notify sharing contexts of a texture change.
+       * In llvmpipe, that would mean flushing the texture cache.
+       */
+      screen->timestamp++;
+   }
+
+   /* Step from the start of the storage to the image selected by the
+    * transfer, then to block row y and block column x within it.
+    */
+   return base + llvmpipe_transfer(transfer)->offset +
+      transfer->y / transfer->block.height * transfer->stride +
+      transfer->x / transfer->block.width * transfer->block.size;
+}
+
+
+static void
+llvmpipe_transfer_unmap(struct pipe_screen *_screen,
+                       struct pipe_transfer *transfer)
+{
+   struct llvmpipe_screen *screen = llvmpipe_screen(_screen);
+   struct llvmpipe_texture *lpt;
+
+   assert(transfer->texture);
+   lpt = llvmpipe_texture(transfer->texture);
+
+   /* Only display targets were mapped through the winsys. */
+   if(!lpt->dt)
+      return;
+   screen->winsys->displaytarget_unmap(screen->winsys, lpt->dt);
+}
+
+
+void
+llvmpipe_init_texture_funcs(struct llvmpipe_context *lp)
+{  /* empty: the texture entry points are installed on the screen, see below */
+}
+
+
+void
+llvmpipe_init_screen_texture_funcs(struct pipe_screen *screen)
+{
+   screen->get_tex_transfer = llvmpipe_get_tex_transfer;
+   screen->tex_transfer_destroy = llvmpipe_tex_transfer_destroy;
+   screen->transfer_map = llvmpipe_transfer_map;
+   screen->transfer_unmap = llvmpipe_transfer_unmap;
+   /* surfaces onto a single face/level/slice of a texture */
+   screen->get_tex_surface = llvmpipe_get_tex_surface;
+   screen->tex_surface_destroy = llvmpipe_tex_surface_destroy;
+   /* creation and destruction of the textures themselves */
+   screen->texture_create = llvmpipe_texture_create;
+   screen->texture_blanket = llvmpipe_texture_blanket;
+   screen->texture_destroy = llvmpipe_texture_destroy;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.h b/src/gallium/drivers/llvmpipe/lp_texture.h
new file mode 100644
index 0000000000..00a20763e4
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_texture.h
@@ -0,0 +1,90 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef LP_TEXTURE_H
+#define LP_TEXTURE_H
+
+
+#include "pipe/p_state.h"
+
+
+struct pipe_context;
+struct pipe_screen;
+struct llvmpipe_context;
+struct llvmpipe_displaytarget;
+
+struct llvmpipe_texture
+{
+   struct pipe_texture base;  /**< first member: llvmpipe_texture() casts pipe_texture pointers to this type */
+
+   unsigned long level_offset[PIPE_MAX_TEXTURE_LEVELS];  /**< offset of each mipmap level within data */
+   unsigned stride[PIPE_MAX_TEXTURE_LEVELS];  /**< stride of one row of blocks, per mipmap level */
+
+   /**
+    * Display target, for textures with the PIPE_TEXTURE_USAGE_DISPLAY_TARGET
+    * usage.
+    */
+   struct llvmpipe_displaytarget *dt;
+
+   /**
+    * Malloc'ed data for regular textures, or a mapping to dt above.
+    */
+   void *data;
+
+   unsigned timestamp;  /**< incremented when a surface is requested with write usage */
+};
+
+struct llvmpipe_transfer
+{
+   struct pipe_transfer base;  /**< first member: llvmpipe_transfer() casts pipe_transfer pointers to this type */
+
+   unsigned long offset;  /**< offset of the selected level/face/slice within the texture storage */
+};
+
+
+/** cast wrappers; valid because 'base' is the first member of each struct */
+static INLINE struct llvmpipe_texture *
+llvmpipe_texture(struct pipe_texture *pt)
+{
+   return (struct llvmpipe_texture *) pt;
+}
+
+static INLINE struct llvmpipe_transfer *
+llvmpipe_transfer(struct pipe_transfer *pt)
+{
+   return (struct llvmpipe_transfer *) pt;  /* 'base' is the first member of llvmpipe_transfer */
+}
+
+
+extern void
+llvmpipe_init_texture_funcs( struct llvmpipe_context *llvmpipe );
+
+extern void
+llvmpipe_init_screen_texture_funcs(struct pipe_screen *screen);
+
+
+#endif /* LP_TEXTURE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_cache.c b/src/gallium/drivers/llvmpipe/lp_tile_cache.c
new file mode 100644
index 0000000000..2e576e6039
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_tile_cache.c
@@ -0,0 +1,302 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Texture tile caching.
+ *
+ * Author:
+ *    Brian Paul
+ */
+
+#include "pipe/p_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "util/u_tile.h"
+#include "util/u_rect.h"
+#include "lp_context.h"
+#include "lp_surface.h"
+#include "lp_texture.h"
+#include "lp_tile_soa.h"
+#include "lp_tile_cache.h"
+
+
+struct llvmpipe_tile_cache *
+lp_create_tile_cache( struct pipe_screen *screen )
+{
+   struct llvmpipe_tile_cache *cache = CALLOC_STRUCT( llvmpipe_tile_cache );
+
+   /* Tile color buffers are not allocated here; that happens in
+    * lp_tile_cache_set_surface().
+    */
+   if(cache)
+      cache->screen = screen;
+
+   return cache;
+}
+
+
+void
+lp_destroy_tile_cache(struct llvmpipe_tile_cache *tc)
+{
+   unsigned x, y;
+
+   /* Release the color storage of every tile slot that ever got one. */
+   for (y = 0; y < MAX_HEIGHT; y += TILE_SIZE) {
+      for (x = 0; x < MAX_WIDTH; x += TILE_SIZE) {
+         struct llvmpipe_cached_tile *tile = &tc->entries[y/TILE_SIZE][x/TILE_SIZE];
+         if(tile->color)
+            align_free(tile->color);
+      }
+   }
+   if (tc->transfer) {
+      struct pipe_screen *screen = tc->transfer->texture->screen;
+      /* Unmap a still-mapped transfer first, as lp_tile_cache_set_surface() does. */
+      if (tc->transfer_map)
+         screen->transfer_unmap(screen, tc->transfer);
+      screen->tex_transfer_destroy(tc->transfer);
+   }
+   FREE( tc );
+}
+
+
+/**
+ * Specify the surface to cache (NULL releases the current transfer only).
+ */
+void
+lp_tile_cache_set_surface(struct llvmpipe_tile_cache *tc,
+                          struct pipe_surface *ps)
+{
+   if (tc->transfer) {
+      struct pipe_screen *screen = tc->transfer->texture->screen;
+      /* Same surface as before: keep the existing transfer. */
+      if (ps == tc->surface)
+         return;
+      /* Otherwise unmap and destroy the transfer of the previous surface. */
+      if (tc->transfer_map) {
+         screen->transfer_unmap(screen, tc->transfer);
+         tc->transfer_map = NULL;
+      }
+
+      screen->tex_transfer_destroy(tc->transfer);
+      tc->transfer = NULL;
+   }
+
+   tc->surface = ps;
+
+   if (ps) {
+      struct pipe_screen *screen = ps->texture->screen;
+      unsigned x, y;
+      /* Read/write transfer covering the whole surface. */
+      tc->transfer = screen->get_tex_transfer(screen, ps->texture, ps->face,
+                                              ps->level, ps->zslice,
+                                              PIPE_TRANSFER_READ_WRITE,
+                                              0, 0, ps->width, ps->height);
+      /* NOTE(review): ps->width/height are not checked against MAX_WIDTH/MAX_HEIGHT. */
+      for (y = 0; y < ps->height; y += TILE_SIZE) {
+         for (x = 0; x < ps->width; x += TILE_SIZE) {
+            struct llvmpipe_cached_tile *tile = &tc->entries[y/TILE_SIZE][x/TILE_SIZE];
+
+            tile->status = LP_TILE_STATUS_UNDEFINED;
+            /* Allocate color storage once per slot; NOTE(review): result is unchecked. */
+            if(!tile->color)
+               tile->color = align_malloc( TILE_SIZE*TILE_SIZE*NUM_CHANNELS, 16 );
+         }
+      }
+   }
+}
+
+
+/**
+ * Return the surface being cached.
+ */
+struct pipe_surface *
+lp_tile_cache_get_surface(struct llvmpipe_tile_cache *tc)
+{
+   return tc->surface;
+}
+
+
+void
+lp_tile_cache_map_transfers(struct llvmpipe_tile_cache *tc)
+{  /* map the cached transfer, unless there is none or it is already mapped */
+   if (tc->transfer && !tc->transfer_map)
+      tc->transfer_map = tc->screen->transfer_map(tc->screen, tc->transfer);
+}
+
+
+void
+lp_tile_cache_unmap_transfers(struct llvmpipe_tile_cache *tc)
+{  /* undo lp_tile_cache_map_transfers(); does nothing when not mapped */
+   if (tc->transfer_map) {
+      tc->screen->transfer_unmap(tc->screen, tc->transfer);
+      tc->transfer_map = NULL;
+   }
+}
+
+
+/**
+ * Fill every pixel of a cached tile with clear_color.
+ */
+static void
+clear_tile(struct llvmpipe_cached_tile *tile,
+           uint8_t clear_color[4])
+{
+   const int uniform = clear_color[0] == clear_color[1] &&
+                       clear_color[1] == clear_color[2] &&
+                       clear_color[2] == clear_color[3];
+   uint x, y, chan;
+   if (!uniform) {
+      for (y = 0; y < TILE_SIZE; y++)
+         for (x = 0; x < TILE_SIZE; x++)
+            for (chan = 0; chan < 4; ++chan)
+               TILE_PIXEL(tile->color, x, y, chan) = clear_color[chan];
+   }
+   else   /* all four channel bytes match, so one memset does the job */
+      memset(tile->color, clear_color[0], TILE_SIZE * TILE_SIZE * 4);
+}
+
+
+/**
+ * Flush the tile cache: write all dirty tiles back to the transfer.
+ * Tiles flagged as cleared are filled with the packed clear value instead.
+ */
+void
+lp_flush_tile_cache(struct llvmpipe_tile_cache *tc)
+{
+   struct pipe_transfer *pt = tc->transfer;
+   unsigned x, y;
+
+   if(!pt)
+      return;
+
+   /* visit every tile covered by the transfer */
+   for (y = 0; y < pt->height; y += TILE_SIZE) {
+      for (x = 0; x < pt->width; x += TILE_SIZE) {
+         struct llvmpipe_cached_tile *tile = &tc->entries[y/TILE_SIZE][x/TILE_SIZE];
+
+         switch(tile->status) {
+         case LP_TILE_STATUS_UNDEFINED:
+            break;
+
+         case LP_TILE_STATUS_CLEAR: {
+            /**
+             * Actually clear the tiles which were flagged as being in a clear state.
+             */
+
+            struct pipe_screen *screen = pt->texture->screen;
+            unsigned tw = TILE_SIZE;
+            unsigned th = TILE_SIZE;
+            void *dst;
+            /* NOTE(review): both 'continue's below leave the tile in the CLEAR state */
+            if (pipe_clip_tile(x, y, &tw, &th, pt))
+               continue;
+
+            dst = screen->transfer_map(screen, pt);
+            assert(dst);
+            if(!dst)
+               continue;
+
+            util_fill_rect(dst, &pt->block, pt->stride,
+                           x, y, tw,  th,
+                           tc->clear_val);
+
+            screen->transfer_unmap(screen, pt);
+
+            tile->status = LP_TILE_STATUS_UNDEFINED;
+            break;
+         }
+         /* cached pixels are written back to the transfer */
+         case LP_TILE_STATUS_DEFINED:
+            lp_put_tile_rgba_soa(pt, x, y, tile->color);
+            tile->status = LP_TILE_STATUS_UNDEFINED;
+            break;
+         }
+      }
+   }
+}
+
+
+/**
+ * Get a tile's color data from the cache, clearing or fetching it on demand.
+ * \param x, y  position in pixels; selects the tile that contains it
+ */
+void *
+lp_get_cached_tile(struct llvmpipe_tile_cache *tc,
+                   unsigned x, unsigned y )
+{
+   struct llvmpipe_cached_tile *tile = &tc->entries[y/TILE_SIZE][x/TILE_SIZE];
+   struct pipe_transfer *pt = tc->transfer;
+   /* bring the tile's contents up to date according to its status */
+   switch(tile->status) {
+   case LP_TILE_STATUS_CLEAR:
+      /* don't get tile from framebuffer, just clear it */
+      clear_tile(tile, tc->clear_color);
+      tile->status = LP_TILE_STATUS_DEFINED;
+      break;
+
+   case LP_TILE_STATUS_UNDEFINED:
+      /* get new tile data from transfer */
+      lp_get_tile_rgba_soa(pt, x & ~(TILE_SIZE - 1), y & ~(TILE_SIZE - 1), tile->color);
+      tile->status = LP_TILE_STATUS_DEFINED;
+      break;
+
+   case LP_TILE_STATUS_DEFINED:
+      /* nothing to do */
+      break;
+   }
+
+   return tile->color;
+}
+
+
+/**
+ * When a whole surface is being cleared to a value we can avoid
+ * fetching tiles above: save the color and flag each tile as cleared.
+ * \param rgba  clear color as four floats;  \param clearValue  packed value
+ */
+void
+lp_tile_cache_clear(struct llvmpipe_tile_cache *tc, const float *rgba,
+                    uint clearValue)
+{
+   struct pipe_transfer *pt = tc->transfer;
+   unsigned x, y, chan;
+
+   for(chan = 0; chan < 4; ++chan)
+      tc->clear_color[chan] = float_to_ubyte(rgba[chan]);
+   tc->clear_val = clearValue;
+
+   /* Without a transfer there are no tiles to flag (and nothing to deref). */
+   if (!pt)
+      return;
+   /* flag every tile covered by the transfer as cleared */
+   for (y = 0; y < pt->height; y += TILE_SIZE) {
+      for (x = 0; x < pt->width; x += TILE_SIZE) {
+         struct llvmpipe_cached_tile *tile = &tc->entries[y/TILE_SIZE][x/TILE_SIZE];
+         tile->status = LP_TILE_STATUS_CLEAR;
+      }
+   }
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_cache.h b/src/gallium/drivers/llvmpipe/lp_tile_cache.h
new file mode 100644
index 0000000000..6d8ba5ece7
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_tile_cache.h
@@ -0,0 +1,106 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef LP_TILE_CACHE_H
+#define LP_TILE_CACHE_H
+
+
+#include "pipe/p_compiler.h"
+#include "lp_tile_soa.h"
+
+
+enum llvmpipe_tile_status
+{
+   LP_TILE_STATUS_UNDEFINED = 0,  /**< nothing valid cached; fetched from the transfer on first use */
+   LP_TILE_STATUS_CLEAR = 1,      /**< flagged as cleared; pixels not written yet */
+   LP_TILE_STATUS_DEFINED = 2     /**< holds pixel data that a flush writes back */
+};
+
+
+struct llvmpipe_cached_tile
+{
+   enum llvmpipe_tile_status status;  /**< what 'color' currently holds */
+
+   /** color in SOA format; allocated by lp_tile_cache_set_surface() */
+   uint8_t *color;
+};
+
+
+/** XXX move these */
+#define MAX_WIDTH 2048
+#define MAX_HEIGHT 2048
+
+
+struct llvmpipe_tile_cache
+{
+   struct pipe_screen *screen;     /**< screen used to map/unmap the transfer */
+   struct pipe_surface *surface;  /**< the surface we're caching */
+   struct pipe_transfer *transfer; /**< transfer for 'surface', NULL if none */
+   void *transfer_map;             /**< mapping of 'transfer', NULL while unmapped */
+
+   /** one slot per tile; rows first, as it is indexed [y/TILE_SIZE][x/TILE_SIZE] */
+   struct llvmpipe_cached_tile entries[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE];
+   uint8_t clear_color[4];  /**< for color bufs */
+   uint clear_val;        /**< for z+stencil, or packed color clear value */
+
+   struct llvmpipe_cached_tile *last_tile;  /**< most recently retrieved tile */
+};
+
+
+extern struct llvmpipe_tile_cache *
+lp_create_tile_cache( struct pipe_screen *screen );
+
+extern void
+lp_destroy_tile_cache(struct llvmpipe_tile_cache *tc);
+
+extern void
+lp_tile_cache_set_surface(struct llvmpipe_tile_cache *tc,
+                          struct pipe_surface *lps);
+
+extern struct pipe_surface *
+lp_tile_cache_get_surface(struct llvmpipe_tile_cache *tc);
+
+extern void
+lp_tile_cache_map_transfers(struct llvmpipe_tile_cache *tc);
+
+extern void
+lp_tile_cache_unmap_transfers(struct llvmpipe_tile_cache *tc);
+
+extern void
+lp_flush_tile_cache(struct llvmpipe_tile_cache *tc);
+
+extern void
+lp_tile_cache_clear(struct llvmpipe_tile_cache *tc, const float *rgba,
+                    uint clearValue);
+
+extern void *
+lp_get_cached_tile(struct llvmpipe_tile_cache *tc,
+                   unsigned x, unsigned y );
+
+
+#endif /* LP_TILE_CACHE_H */
+
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_soa.c b/src/gallium/drivers/llvmpipe/lp_tile_soa.c
new file mode 100644
index 0000000000..4e4ccb31cc
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_tile_soa.c
@@ -0,0 +1,931 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * RGBA tile get/put functions operating on 8-bit-per-channel SoA tiles.
+ * Usable both by drivers and state trackers.
+ */
+
+
+#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_rect.h"
+#include "util/u_tile.h"
+#include "lp_tile_cache.h"
+#include "lp_tile_soa.h"
+
+/** Pixel offset inside one TILE_VECTOR_WIDTH x TILE_VECTOR_HEIGHT block; indexed [y][x] by TILE_PIXEL(). */
+const unsigned char
+tile_offset[TILE_VECTOR_HEIGHT][TILE_VECTOR_WIDTH] = {
+   {  0,  1,  4,  5,  8,  9, 12, 13},
+   {  2,  3,  6,  7, 10, 11, 14, 15}
+};
+
+
+
+/*** PIPE_FORMAT_A8R8G8B8_UNORM ***/
+/* Unpack w x h packed A8R8G8B8 words from src into SoA tile p (channels 0..3 = R,G,B,A). */
+static void
+a8r8g8b8_get_tile_rgba(const unsigned *src,
+                       unsigned w, unsigned h,
+                       uint8_t *p)
+{
+   unsigned x, y;
+
+   for (y = 0; y < h; y++) {
+      for (x = 0; x < w; x++, src++) {
+         const unsigned argb = src[0];
+         TILE_PIXEL(p, x, y, 3) = (uint8_t) (argb >> 24);
+         TILE_PIXEL(p, x, y, 0) = (uint8_t) (argb >> 16);
+         TILE_PIXEL(p, x, y, 1) = (uint8_t) (argb >>  8);
+         TILE_PIXEL(p, x, y, 2) = (uint8_t) argb;
+      }
+   }
+}
+
+/* Pack the R,G,B,A channels of SoA tile p into w x h A8R8G8B8 words at dst. */
+static void
+a8r8g8b8_put_tile_rgba(unsigned *dst,
+                       unsigned w, unsigned h,
+                       const uint8_t *p)
+{
+   unsigned x, y;
+
+   for (y = 0; y < h; y++) {
+      for (x = 0; x < w; x++, dst++) {
+         const unsigned red   = TILE_PIXEL(p, x, y, 0);
+         const unsigned green = TILE_PIXEL(p, x, y, 1);
+         const unsigned blue  = TILE_PIXEL(p, x, y, 2);
+         const unsigned alpha = TILE_PIXEL(p, x, y, 3);
+
+         dst[0] = blue | (green << 8) | (red << 16) | (alpha << 24);
+      }
+   }
+}
+
+
+/*** PIPE_FORMAT_X8R8G8B8_UNORM ***/
+/* Unpack w x h X8R8G8B8 words from src into SoA tile p; alpha is forced to 0xff. */
+static void
+x8r8g8b8_get_tile_rgba(const unsigned *src,
+                       unsigned w, unsigned h,
+                       uint8_t *p)
+{
+   unsigned x, y;
+
+   for (y = 0; y < h; y++) {
+      for (x = 0; x < w; x++, src++) {
+         const unsigned xrgb = src[0];
+         TILE_PIXEL(p, x, y, 3) = 0xff;
+         TILE_PIXEL(p, x, y, 0) = (uint8_t) (xrgb >> 16);
+         TILE_PIXEL(p, x, y, 1) = (uint8_t) (xrgb >>  8);
+         TILE_PIXEL(p, x, y, 2) = (uint8_t) xrgb;
+      }
+   }
+}
+
+/* Pack the R,G,B channels of SoA tile p into w x h X8R8G8B8 words; the X byte is written as 0xff. */
+static void
+x8r8g8b8_put_tile_rgba(unsigned *dst,
+                       unsigned w, unsigned h,
+                       const uint8_t *p)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      for (j = 0; j < w; j++) {
+         unsigned r, g, b;
+         r = TILE_PIXEL(p, j, i, 0);
+         g = TILE_PIXEL(p, j, i, 1);
+         b = TILE_PIXEL(p, j, i, 2);
+         *dst++ = (0xffu << 24) | (r << 16) | (g << 8) | b;  /* 0xffu: 0xff << 24 overflows a signed int */
+      }
+   }
+}
+
+
+/*** PIPE_FORMAT_B8G8R8A8_UNORM ***/
+/* Unpack w x h packed B8G8R8A8 words from src into SoA tile p (channels 0..3 = R,G,B,A). */
+static void
+b8g8r8a8_get_tile_rgba(const unsigned *src,
+                       unsigned w, unsigned h,
+                       uint8_t *p)
+{
+   unsigned x, y;
+
+   for (y = 0; y < h; y++) {
+      for (x = 0; x < w; x++, src++) {
+         const unsigned bgra = src[0];
+         TILE_PIXEL(p, x, y, 2) = (uint8_t) (bgra >> 24);
+         TILE_PIXEL(p, x, y, 1) = (uint8_t) (bgra >> 16);
+         TILE_PIXEL(p, x, y, 0) = (uint8_t) (bgra >>  8);
+         TILE_PIXEL(p, x, y, 3) = (uint8_t) bgra;
+      }
+   }
+}
+
+/* Pack the R,G,B,A channels of SoA tile p into w x h B8G8R8A8 words at dst. */
+static void
+b8g8r8a8_put_tile_rgba(unsigned *dst,
+                       unsigned w, unsigned h,
+                       const uint8_t *p)
+{
+   unsigned x, y;
+
+   for (y = 0; y < h; y++) {
+      for (x = 0; x < w; x++, dst++) {
+         const unsigned red   = TILE_PIXEL(p, x, y, 0);
+         const unsigned green = TILE_PIXEL(p, x, y, 1);
+         const unsigned blue  = TILE_PIXEL(p, x, y, 2);
+         const unsigned alpha = TILE_PIXEL(p, x, y, 3);
+
+         dst[0] = alpha | (red << 8) | (green << 16) | (blue << 24);
+      }
+   }
+}
+
+
+/*** PIPE_FORMAT_A1R5G5B5_UNORM ***/
+/* Expand w x h A1R5G5B5 texels from src to 8 bits per channel in SoA tile p. */
+static void
+a1r5g5b5_get_tile_rgba(const ushort *src,
+                       unsigned w, unsigned h,
+                       uint8_t *p)
+{
+   unsigned x, y;
+
+   for (y = 0; y < h; y++) {
+      for (x = 0; x < w; x++, src++) {
+         const unsigned texel = src[0];
+         TILE_PIXEL(p, x, y, 3) = (texel >> 15) * 255;
+         TILE_PIXEL(p, x, y, 0) = ((texel >> 10) & 0x1f) * 255 / 31;
+         TILE_PIXEL(p, x, y, 1) = ((texel >>  5) & 0x1f) * 255 / 31;
+         TILE_PIXEL(p, x, y, 2) = (texel & 0x1f) * 255 / 31;
+      }
+   }
+}
+
+/* Pack SoA tile p into w x h A1R5G5B5 texels, keeping the top bits of each 8-bit channel. */
+static void
+a1r5g5b5_put_tile_rgba(ushort *dst,
+                       unsigned w, unsigned h,
+                       const uint8_t *p)
+{
+   unsigned x, y;
+
+   for (y = 0; y < h; y++) {
+      for (x = 0; x < w; x++, dst++) {
+         /* truncate the 8-bit channels: 5 bits for colour, 1 bit for alpha */
+         const unsigned r5 = TILE_PIXEL(p, x, y, 0) >> 3;
+         const unsigned g5 = TILE_PIXEL(p, x, y, 1) >> 3;
+         const unsigned b5 = TILE_PIXEL(p, x, y, 2) >> 3;
+         const unsigned a1 = TILE_PIXEL(p, x, y, 3) >> 7;
+
+         dst[0] = b5 |
+                  (g5 << 5) |
+                  (r5 << 10) |
+                  (a1 << 15);
+      }
+   }
+}
+
+
+/*** PIPE_FORMAT_A4R4G4B4_UNORM ***/
+/* Expand w x h A4R4G4B4 texels from src to 8 bits per channel in SoA tile p. */
+static void
+a4r4g4b4_get_tile_rgba(const ushort *src,
+                       unsigned w, unsigned h,
+                       uint8_t *p)
+{
+   unsigned x, y;
+
+   for (y = 0; y < h; y++) {
+      for (x = 0; x < w; x++, src++) {
+         const unsigned texel = src[0];
+         TILE_PIXEL(p, x, y, 3) = (texel >> 12) * 255 / 15;
+         TILE_PIXEL(p, x, y, 0) = ((texel >> 8) & 0xf) * 255 / 15;
+         TILE_PIXEL(p, x, y, 1) = ((texel >> 4) & 0xf) * 255 / 15;
+         TILE_PIXEL(p, x, y, 2) = (texel & 0xf) * 255 / 15;
+      }
+   }
+}
+
+/* Pack SoA tile p into w x h A4R4G4B4 texels, keeping the top 4 bits of each 8-bit channel. */
+static void
+a4r4g4b4_put_tile_rgba(ushort *dst,
+                       unsigned w, unsigned h,
+                       const uint8_t *p)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      for (j = 0; j < w; j++) {
+         unsigned r, g, b, a;
+         r = TILE_PIXEL(p, j, i, 0);
+         g = TILE_PIXEL(p, j, i, 1);
+         b = TILE_PIXEL(p, j, i, 2);
+         a = TILE_PIXEL(p, j, i, 3);
+         r >>= 4;
+         g >>= 4;
+         b >>= 4;
+         a >>= 4;
+         *dst++ = (a << 12) | (r << 8) | (g << 4) | b;  /* red is bits 8..11, as read by the get path */
+      }
+   }
+}
+
+
+/*** PIPE_FORMAT_R5G6B5_UNORM ***/
+/* Expand w x h R5G6B5 texels from src to 8 bits per channel in SoA tile p; alpha is 255. */
+static void
+r5g6b5_get_tile_rgba(const ushort *src,
+                     unsigned w, unsigned h,
+                     uint8_t *p)
+{
+   unsigned x, y;
+
+   for (y = 0; y < h; y++) {
+      for (x = 0; x < w; x++, src++) {
+         const unsigned texel = src[0];
+         TILE_PIXEL(p, x, y, 3) = 255;
+         TILE_PIXEL(p, x, y, 0) = ((texel >> 11) & 0x1f) * 255 / 31;
+         TILE_PIXEL(p, x, y, 1) = ((texel >>  5) & 0x3f) * 255 / 63;
+         TILE_PIXEL(p, x, y, 2) = (texel & 0x1f) * 255 / 31;
+      }
+   }
+}
+
+/* Pack the R,G,B channels of SoA tile p into w x h R5G6B5 texels; alpha is not stored. */
+static void
+r5g6b5_put_tile_rgba(ushort *dst,
+                     unsigned w, unsigned h,
+                     const uint8_t *p)
+{
+   unsigned x, y;
+
+   for (y = 0; y < h; y++) {
+      for (x = 0; x < w; x++, dst++) {
+         const uint red5   = (uint) TILE_PIXEL(p, x, y, 0) * 31 / 255;
+         const uint green6 = (uint) TILE_PIXEL(p, x, y, 1) * 63 / 255;
+         const uint blue5  = (uint) TILE_PIXEL(p, x, y, 2) * 31 / 255;
+         dst[0] = blue5 | (green6 << 5) | (red5 << 11);
+      }
+   }
+}
+
+
+
+/*** PIPE_FORMAT_Z16_UNORM ***/
+
+/** Replicate each Z16 value, scaled by 1/65535, into all four channels of tile p.
+ * NOTE(review): p is uint8_t, so the [0,1] product truncates on store -- confirm intent.
+ */
+static void
+z16_get_tile_rgba(const ushort *src,
+                  unsigned w, unsigned h,
+                  uint8_t *p)
+{
+   const float scale = 1.0f / 65535.0f;
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      for (j = 0; j < w; j++) {
+         TILE_PIXEL(p, j, i, 0) =
+         TILE_PIXEL(p, j, i, 1) =
+         TILE_PIXEL(p, j, i, 2) =
+         TILE_PIXEL(p, j, i, 3) = *src++ * scale;
+      }
+   }
+}
+
+
+
+
+/*** PIPE_FORMAT_L8_UNORM ***/
+/* Replicate each L8 texel into R, G and B of SoA tile p; alpha is set to 255. */
+static void
+l8_get_tile_rgba(const ubyte *src,
+                 unsigned w, unsigned h, uint8_t *p)
+{
+   unsigned x, y;
+
+   for (y = 0; y < h; y++) {
+      for (x = 0; x < w; x++) {
+         const ubyte lum = *src++;
+         TILE_PIXEL(p, x, y, 0) = lum;
+         TILE_PIXEL(p, x, y, 1) = lum;
+         TILE_PIXEL(p, x, y, 2) = lum;
+         TILE_PIXEL(p, x, y, 3) = 255;
+      }
+   }
+}
+
+/*
+ * Store the red channel of SoA tile p as w x h L8 texels at dst.
+ */
+static void
+l8_put_tile_rgba(ubyte *dst,
+                 unsigned w, unsigned h,
+                 const uint8_t *p)
+{
+   unsigned x, y;
+
+   for (y = 0; y < h; y++) {
+      for (x = 0; x < w; x++, dst++) {
+         dst[0] = (ubyte) TILE_PIXEL(p, x, y, 0);
+      }
+   }
+}
+
+
+
+/*** PIPE_FORMAT_A8_UNORM ***/
+/* Load each A8 texel into the alpha channel of SoA tile p; R, G and B are zeroed. */
+static void
+a8_get_tile_rgba(const ubyte *src,
+                 unsigned w, unsigned h,
+                 uint8_t *p)
+{
+   unsigned x, y, c;
+
+   for (y = 0; y < h; y++) {
+      for (x = 0; x < w; x++) {
+         for (c = 0; c < 3; c++) {
+            TILE_PIXEL(p, x, y, c) = 0;
+         }
+         TILE_PIXEL(p, x, y, 3) = *src++;
+      }
+   }
+}
+
+/*
+ * Store the alpha channel of SoA tile p as w x h A8 texels at dst.
+ */
+static void
+a8_put_tile_rgba(ubyte *dst,
+                 unsigned w, unsigned h,
+                 const uint8_t *p)
+{
+   unsigned x, y;
+
+   for (y = 0; y < h; y++) {
+      for (x = 0; x < w; x++, dst++) {
+         dst[0] = (ubyte) TILE_PIXEL(p, x, y, 3);
+      }
+   }
+}
+
+
+
+/*** PIPE_FORMAT_R16_SNORM ***/
+/* Convert w x h R16_SNORM texels to 8-bit red in SoA tile p; G = B = 0, A = 255. */
+static void
+r16_get_tile_rgba(const short *src,
+                  unsigned w, unsigned h, uint8_t *p)
+{
+   unsigned x, y;
+
+   for (y = 0; y < h; y++) {
+      for (x = 0; x < w; x++) {
+         const int red = *src++ >> 7;
+         TILE_PIXEL(p, x, y, 0) = MAX2(red, 0);   /* negative inputs clamp to 0 */
+         TILE_PIXEL(p, x, y, 1) = 0;
+         TILE_PIXEL(p, x, y, 2) = 0;
+         TILE_PIXEL(p, x, y, 3) = 255;
+      }
+   }
+}
+
+/* Store the red channel of SoA tile p as w x h R16_SNORM texels (8-bit value << 7). */
+static void
+r16_put_tile_rgba(short *dst,
+                  unsigned w, unsigned h,
+                  const uint8_t *p)
+{
+   unsigned x, y;
+
+   for (y = 0; y < h; y++) {
+      for (x = 0; x < w; x++) {
+         *dst++ = (short) (TILE_PIXEL(p, x, y, 0) << 7);
+      }
+   }
+}
+
+
+/*** PIPE_FORMAT_R16G16B16A16_SNORM ***/
+/* Store each R16G16B16A16 component, shifted right by 8, as the 8-bit channel of SoA tile p. */
+static void
+r16g16b16a16_get_tile_rgba(const short *src,
+                           unsigned w, unsigned h,
+                           uint8_t *p)
+{
+   unsigned x, y, c;
+
+   for (y = 0; y < h; y++) {
+      for (x = 0; x < w; x++) {
+         /* c = 0..3 walks R, G, B, A in memory order */
+         for (c = 0; c < 4; c++) {
+            TILE_PIXEL(p, x, y, c) = *src++ >> 8;
+         }
+      }
+   }
+}
+
+/* Store each 8-bit channel of SoA tile p, shifted left by 8, as an R16G16B16A16 component. */
+static void
+r16g16b16a16_put_tile_rgba(short *dst,
+                           unsigned w, unsigned h,
+                           const uint8_t *p)
+{
+   unsigned x, y, c;
+
+   for (y = 0; y < h; y++) {
+      for (x = 0; x < w; x++) {
+         /* NOTE(review): channel values >= 128 shifted by 8 exceed a 16-bit short -- confirm intent */
+         for (c = 0; c < 4; c++) {
+            *dst++ = (short) (TILE_PIXEL(p, x, y, c) << 8);
+         }
+      }
+   }
+}
+
+
+
+/*** PIPE_FORMAT_I8_UNORM ***/
+/* Replicate each I8 texel into all four channels of SoA tile p. */
+static void
+i8_get_tile_rgba(const ubyte *src,
+                 unsigned w, unsigned h,
+                 uint8_t *p)
+{
+   unsigned x, y, c;
+
+   for (y = 0; y < h; y++) {
+      for (x = 0; x < w; x++, src++) {
+         /* intensity feeds R, G, B and A alike */
+         for (c = 0; c < 4; c++) {
+            TILE_PIXEL(p, x, y, c) = src[0];
+         }
+      }
+   }
+}
+
+/*
+ * Store the red channel of SoA tile p as w x h I8 texels at dst.
+ */
+static void
+i8_put_tile_rgba(ubyte *dst,
+                 unsigned w, unsigned h,
+                 const uint8_t *p)
+{
+   unsigned x, y;
+
+   for (y = 0; y < h; y++) {
+      for (x = 0; x < w; x++, dst++) {
+         dst[0] = (ubyte) TILE_PIXEL(p, x, y, 0);
+      }
+   }
+}
+
+
+/*** PIPE_FORMAT_A8L8_UNORM ***/
+/* Unpack w x h A8L8 texels: low byte -> R, G, B and high byte -> A of SoA tile p. */
+static void
+a8l8_get_tile_rgba(const ushort *src,
+                   unsigned w, unsigned h, uint8_t *p)
+{
+   unsigned x, y;
+
+   for (y = 0; y < h; y++) {
+      for (x = 0; x < w; x++, src++) {
+         const uint8_t lum   = (uint8_t) (src[0] & 0xff);
+         const uint8_t alpha = (uint8_t) (src[0] >> 8);
+         TILE_PIXEL(p, x, y, 0) = lum;
+         TILE_PIXEL(p, x, y, 1) = lum;
+         TILE_PIXEL(p, x, y, 2) = lum;
+         TILE_PIXEL(p, x, y, 3) = alpha;
+      }
+   }
+}
+
+/* Pack red (as luminance) and alpha of SoA tile p into w x h A8L8 texels at dst. */
+static void
+a8l8_put_tile_rgba(ushort *dst,
+                   unsigned w, unsigned h,
+                   const uint8_t *p)
+{
+   unsigned x, y;
+
+   for (y = 0; y < h; y++) {
+      for (x = 0; x < w; x++, dst++) {
+         const unsigned lum   = TILE_PIXEL(p, x, y, 0);
+         const unsigned alpha = TILE_PIXEL(p, x, y, 3);
+
+         dst[0] = lum | (alpha << 8);
+      }
+   }
+}
+
+
+
+
+/*** PIPE_FORMAT_Z32_UNORM ***/
+
+/** Replicate each Z32 value, scaled by 1/0xffffffff, into all four channels of tile p.
+ * NOTE(review): p is uint8_t, so the [0,1] product truncates on store -- confirm intent.
+ */
+static void
+z32_get_tile_rgba(const unsigned *src,
+                  unsigned w, unsigned h,
+                  uint8_t *p)
+{
+   const double scale = 1.0 / (double) 0xffffffff;
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      for (j = 0; j < w; j++) {
+         TILE_PIXEL(p, j, i, 0) =
+         TILE_PIXEL(p, j, i, 1) =
+         TILE_PIXEL(p, j, i, 2) =
+         TILE_PIXEL(p, j, i, 3) = (float) (*src++ * scale);
+      }
+   }
+}
+
+
+/*** PIPE_FORMAT_S8Z24_UNORM ***/
+
+/** Replicate the low 24 bits of each texel, scaled by 1/(2^24 - 1), into all four channels.
+ * Stencil part ignored.  NOTE(review): p is uint8_t, so the [0,1] product truncates -- confirm.
+ */
+static void
+s8z24_get_tile_rgba(const unsigned *src,
+                    unsigned w, unsigned h,
+                    uint8_t *p)
+{
+   const double scale = 1.0 / ((1 << 24) - 1);
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      for (j = 0; j < w; j++) {
+         TILE_PIXEL(p, j, i, 0) =
+         TILE_PIXEL(p, j, i, 1) =
+         TILE_PIXEL(p, j, i, 2) =
+         TILE_PIXEL(p, j, i, 3) = (float) (scale * (*src++ & 0xffffff));
+      }
+   }
+}
+
+
+/*** PIPE_FORMAT_Z24S8_UNORM ***/
+
+/** Replicate the high 24 bits of each texel, scaled by 1/(2^24 - 1), into all four channels.
+ * Stencil part ignored.  NOTE(review): p is uint8_t, so the [0,1] product truncates -- confirm.
+ */
+static void
+z24s8_get_tile_rgba(const unsigned *src,
+                    unsigned w, unsigned h,
+                    uint8_t *p)
+{
+   const double scale = 1.0 / ((1 << 24) - 1);
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      for (j = 0; j < w; j++) {
+         TILE_PIXEL(p, j, i, 0) =
+         TILE_PIXEL(p, j, i, 1) =
+         TILE_PIXEL(p, j, i, 2) =
+         TILE_PIXEL(p, j, i, 3) = (float) (scale * (*src++ >> 8));
+      }
+   }
+}
+
+
+/*** PIPE_FORMAT_Z32_FLOAT ***/
+
+/** Copy each float Z value into all four channels of tile p.
+ * NOTE(review): the float is converted to uint8_t on store, so fractions are dropped -- confirm.
+ */
+static void
+z32f_get_tile_rgba(const float *src,
+                   unsigned w, unsigned h,
+                   uint8_t *p)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      for (j = 0; j < w; j++) {
+         TILE_PIXEL(p, j, i, 0) =
+         TILE_PIXEL(p, j, i, 1) =
+         TILE_PIXEL(p, j, i, 2) =
+         TILE_PIXEL(p, j, i, 3) = *src++;
+      }
+   }
+}
+
+
+/*** PIPE_FORMAT_YCBCR / PIPE_FORMAT_YCBCR_REV ***/
+
+/**
+ * Convert YCbCr (or YCrCb, when rev is set) texel pairs to RGBA in SoA tile p.
+ * NOTE(review): r/g/b are floats stored to uint8_t without clamping -- confirm. */
+static void
+ycbcr_get_tile_rgba(const ushort *src,
+                    unsigned w, unsigned h,
+                    uint8_t *p,
+                    boolean rev)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      /* do two texels at a time */
+      for (j = 0; j < (w & ~1); j += 2, src += 2) {
+         const ushort t0 = src[0];
+         const ushort t1 = src[1];
+         const ubyte y0 = (t0 >> 8) & 0xff;  /* luminance */
+         const ubyte y1 = (t1 >> 8) & 0xff;  /* luminance */
+         ubyte cb, cr;
+         float r, g, b;
+
+         if (rev) {
+            cb = t1 & 0xff;         /* chroma U */
+            cr = t0 & 0xff;         /* chroma V */
+         }
+         else {
+            cb = t0 & 0xff;         /* chroma U */
+            cr = t1 & 0xff;         /* chroma V */
+         }
+
+         /* even pixel: y0,cr,cb */
+         r = 1.164f * (y0-16) + 1.596f * (cr-128);
+         g = 1.164f * (y0-16) - 0.813f * (cr-128) - 0.391f * (cb-128);
+         b = 1.164f * (y0-16) + 2.018f * (cb-128);
+         TILE_PIXEL(p, j, i, 0) = r;
+         TILE_PIXEL(p, j, i, 1) = g;
+         TILE_PIXEL(p, j, i, 2) = b;
+         TILE_PIXEL(p, j, i, 3) = 255;
+
+         /* odd pixel: use y1,cr,cb */
+         r = 1.164f * (y1-16) + 1.596f * (cr-128);
+         g = 1.164f * (y1-16) - 0.813f * (cr-128) - 0.391f * (cb-128);
+         b = 1.164f * (y1-16) + 2.018f * (cb-128);
+         TILE_PIXEL(p, j + 1, i, 0) = r;
+         TILE_PIXEL(p, j + 1, i, 1) = g;
+         TILE_PIXEL(p, j + 1, i, 2) = b;
+         TILE_PIXEL(p, j + 1, i, 3) = 255;
+      }
+      /* do the last texel -- NOTE(review): reads src[1] and never advances src; confirm for odd w */
+      if (w & 1) {
+         const ushort t0 = src[0];
+         const ushort t1 = src[1];
+         const ubyte y0 = (t0 >> 8) & 0xff;  /* luminance */
+         ubyte cb, cr;
+         float r, g, b;
+
+         if (rev) {
+            cb = t1 & 0xff;         /* chroma U */
+            cr = t0 & 0xff;         /* chroma V */
+         }
+         else {
+            cb = t0 & 0xff;         /* chroma U */
+            cr = t1 & 0xff;         /* chroma V */
+         }
+
+         /* last pixel of the row: uses y0,cr,cb like the even pixel above */
+         r = 1.164f * (y0-16) + 1.596f * (cr-128);
+         g = 1.164f * (y0-16) - 0.813f * (cr-128) - 0.391f * (cb-128);
+         b = 1.164f * (y0-16) + 2.018f * (cb-128);
+         TILE_PIXEL(p, j, i, 0) = r;
+         TILE_PIXEL(p, j, i, 1) = g;
+         TILE_PIXEL(p, j, i, 2) = b;
+         TILE_PIXEL(p, j, i, 3) = 255;
+      }
+   }
+}
+
+/* Fill SoA tile p with a one-pixel checkerboard (0 or 255 in every channel); src is not read. */
+static void
+fake_get_tile_rgba(const ushort *src,
+                   unsigned w, unsigned h,
+                   uint8_t *p)
+{
+   unsigned x, y, c;
+
+   for (y = 0; y < h; y++) {
+      for (x = 0; x < w; x++) {
+         const uint8_t shade = ((x ^ y) & 1) ? 255 : 0;
+         for (c = 0; c < 4; c++) {
+            TILE_PIXEL(p, x, y, c) = shade;
+         }
+      }
+   }
+}
+
+/* Unpack w x h packed texels of the given format from src into SoA tile p, dispatching per format. */
+static void
+lp_tile_raw_to_rgba_soa(enum pipe_format format,
+                        void *src,
+                        uint w, uint h,
+                        uint8_t *p)
+{
+   switch (format) {
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      a8r8g8b8_get_tile_rgba((unsigned *) src, w, h, p);
+      break;
+   case PIPE_FORMAT_X8R8G8B8_UNORM:
+      x8r8g8b8_get_tile_rgba((unsigned *) src, w, h, p);
+      break;
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      b8g8r8a8_get_tile_rgba((unsigned *) src, w, h, p);
+      break;
+   case PIPE_FORMAT_A1R5G5B5_UNORM:
+      a1r5g5b5_get_tile_rgba((ushort *) src, w, h, p);
+      break;
+   case PIPE_FORMAT_A4R4G4B4_UNORM:
+      a4r4g4b4_get_tile_rgba((ushort *) src, w, h, p);
+      break;
+   case PIPE_FORMAT_R5G6B5_UNORM:
+      r5g6b5_get_tile_rgba((ushort *) src, w, h, p);
+      break;
+   case PIPE_FORMAT_L8_UNORM:
+      l8_get_tile_rgba((ubyte *) src, w, h, p);
+      break;
+   case PIPE_FORMAT_A8_UNORM:
+      a8_get_tile_rgba((ubyte *) src, w, h, p);
+      break;
+   case PIPE_FORMAT_I8_UNORM:
+      i8_get_tile_rgba((ubyte *) src, w, h, p);
+      break;
+   case PIPE_FORMAT_A8L8_UNORM:
+      a8l8_get_tile_rgba((ushort *) src, w, h, p);
+      break;
+   case PIPE_FORMAT_R16_SNORM:
+      r16_get_tile_rgba((short *) src, w, h, p);
+      break;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      r16g16b16a16_get_tile_rgba((short *) src, w, h, p);
+      break;
+   case PIPE_FORMAT_Z16_UNORM:
+      z16_get_tile_rgba((ushort *) src, w, h, p);
+      break;
+   case PIPE_FORMAT_Z32_UNORM:
+      z32_get_tile_rgba((unsigned *) src, w, h, p);
+      break;
+   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
+      s8z24_get_tile_rgba((unsigned *) src, w, h, p);
+      break;
+   case PIPE_FORMAT_Z24S8_UNORM:
+   case PIPE_FORMAT_Z24X8_UNORM:
+      z24s8_get_tile_rgba((unsigned *) src, w, h, p);
+      break;
+   case PIPE_FORMAT_Z32_FLOAT:
+      z32f_get_tile_rgba((float *) src, w, h, p);
+      break;
+   case PIPE_FORMAT_YCBCR:
+      ycbcr_get_tile_rgba((ushort *) src, w, h, p, FALSE);
+      break;
+   case PIPE_FORMAT_YCBCR_REV:
+      ycbcr_get_tile_rgba((ushort *) src, w, h, p, TRUE);
+      break;
+   default:
+      debug_printf("%s: unsupported format %s\n", __FUNCTION__, pf_name(format));
+      fake_get_tile_rgba(src, w, h, p);   /* unsupported: fill with a checkerboard instead */
+   }
+}
+
+/* Fetch the tile at (x, y) of transfer pt and unpack it into SoA tile p. */
+void
+lp_get_tile_rgba_soa(struct pipe_transfer *pt,
+                     uint x, uint y,
+                     uint8_t *p)
+{
+   uint tile_w = TILE_SIZE;
+   uint tile_h = TILE_SIZE;
+   void *raw;
+
+   /* pipe_clip_tile() may adjust the tile size; a non-zero result means skip the tile */
+   if (pipe_clip_tile(x, y, &tile_w, &tile_h, pt))
+      return;
+
+   /* scratch buffer for the packed texels; return quietly if allocation fails */
+   raw = MALLOC(pf_get_nblocks(&pt->block, tile_w, tile_h) * pt->block.size);
+   if (raw) {
+      if (pt->format == PIPE_FORMAT_YCBCR || pt->format == PIPE_FORMAT_YCBCR_REV)
+         assert((x & 1) == 0);
+
+      pipe_get_tile_raw(pt, x, y, tile_w, tile_h, raw, 0);
+      lp_tile_raw_to_rgba_soa(pt->format, raw, tile_w, tile_h, p);
+
+      FREE(raw);
+   }
+}
+
+/* Pack SoA tile p into pt's format and write it at (x, y); formats with no packer are skipped. */
+void
+lp_put_tile_rgba_soa(struct pipe_transfer *pt,
+                     uint x, uint y,
+                     const uint8_t *p)
+{
+   uint w = TILE_SIZE, h = TILE_SIZE;
+   void *packed;
+
+   if (pipe_clip_tile(x, y, &w, &h, pt))
+      return;
+
+   packed = MALLOC(pf_get_nblocks(&pt->block, w, h) * pt->block.size);
+
+   if (!packed)
+      return;
+
+   switch (pt->format) {
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      a8r8g8b8_put_tile_rgba((unsigned *) packed, w, h, p);
+      break;
+   case PIPE_FORMAT_X8R8G8B8_UNORM:
+      x8r8g8b8_put_tile_rgba((unsigned *) packed, w, h, p);
+      break;
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      b8g8r8a8_put_tile_rgba((unsigned *) packed, w, h, p);
+      break;
+   case PIPE_FORMAT_A1R5G5B5_UNORM:
+      a1r5g5b5_put_tile_rgba((ushort *) packed, w, h, p);
+      break;
+   case PIPE_FORMAT_R5G6B5_UNORM:
+      r5g6b5_put_tile_rgba((ushort *) packed, w, h, p);
+      break;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      assert(0);
+      goto done;  /* nothing was packed: never write the uninitialised buffer */
+   case PIPE_FORMAT_A4R4G4B4_UNORM:
+      a4r4g4b4_put_tile_rgba((ushort *) packed, w, h, p);
+      break;
+   case PIPE_FORMAT_L8_UNORM:
+      l8_put_tile_rgba((ubyte *) packed, w, h, p);
+      break;
+   case PIPE_FORMAT_A8_UNORM:
+      a8_put_tile_rgba((ubyte *) packed, w, h, p);
+      break;
+   case PIPE_FORMAT_I8_UNORM:
+      i8_put_tile_rgba((ubyte *) packed, w, h, p);
+      break;
+   case PIPE_FORMAT_A8L8_UNORM:
+      a8l8_put_tile_rgba((ushort *) packed, w, h, p);
+      break;
+   case PIPE_FORMAT_R16_SNORM:
+      r16_put_tile_rgba((short *) packed, w, h, p);
+      break;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      r16g16b16a16_put_tile_rgba((short *) packed, w, h, p);
+      break;
+   case PIPE_FORMAT_Z16_UNORM:
+      /*z16_put_tile_rgba((ushort *) packed, w, h, p);*/
+      goto done;  /* no packer yet: leave the surface untouched */
+   case PIPE_FORMAT_Z32_UNORM:
+      /*z32_put_tile_rgba((unsigned *) packed, w, h, p);*/
+      goto done;
+   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
+      /*s8z24_put_tile_rgba((unsigned *) packed, w, h, p);*/
+      goto done;
+   case PIPE_FORMAT_Z24S8_UNORM:
+   case PIPE_FORMAT_Z24X8_UNORM:
+      /*z24s8_put_tile_rgba((unsigned *) packed, w, h, p);*/
+      goto done;
+   default:
+      debug_printf("%s: unsupported format %s\n", __FUNCTION__, pf_name(pt->format));
+      goto done;
+   }
+   pipe_put_tile_raw(pt, x, y, w, h, packed, 0);
+done:
+   FREE(packed);
+}
+
+
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_soa.h b/src/gallium/drivers/llvmpipe/lp_tile_soa.h
new file mode 100644
index 0000000000..3d8c703b73
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_tile_soa.h
@@ -0,0 +1,81 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef LP_TILE_SOA_H
+#define LP_TILE_SOA_H
+
+#include "pipe/p_compiler.h"
+#include "tgsi/tgsi_exec.h" // for NUM_CHANNELS
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+struct pipe_transfer;
+
+
+/**
+ * Cache tile size (width and height). This needs to be a power of two.
+ */
+#define TILE_SIZE 64
+
+
+#define TILE_VECTOR_HEIGHT 2
+#define TILE_VECTOR_WIDTH 8
+
+extern const unsigned char
+tile_offset[TILE_VECTOR_HEIGHT][TILE_VECTOR_WIDTH];
+
+#define TILE_C_STRIDE (TILE_VECTOR_HEIGHT*TILE_VECTOR_WIDTH)
+#define TILE_X_STRIDE (NUM_CHANNELS*TILE_C_STRIDE)
+#define TILE_Y_STRIDE (TILE_VECTOR_HEIGHT*TILE_SIZE*NUM_CHANNELS)
+/** Lvalue for channel _c of pixel (_x,_y) in SoA tile _p; _x and _y are each evaluated twice. */
+#define TILE_PIXEL(_p, _x, _y, _c) \
+   ((_p)[((_y)/TILE_VECTOR_HEIGHT)*TILE_Y_STRIDE + \
+         ((_x)/TILE_VECTOR_WIDTH)*TILE_X_STRIDE + \
+         (_c)*TILE_C_STRIDE + \
+         tile_offset[(_y) % TILE_VECTOR_HEIGHT][(_x) % TILE_VECTOR_WIDTH]])
+
+
+void
+lp_get_tile_rgba_soa(struct pipe_transfer *pt,
+                     uint x, uint y,
+                     uint8_t *p);
+
+void
+lp_put_tile_rgba_soa(struct pipe_transfer *pt,
+                     uint x, uint y,
+                     const uint8_t *p);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_winsys.h b/src/gallium/drivers/llvmpipe/lp_winsys.h
new file mode 100644
index 0000000000..595481c2cb
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_winsys.h
@@ -0,0 +1,128 @@
+/**************************************************************************
+ * 
+ * Copyright 2007-2009 VMware, Inc.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * @file
+ * llvmpipe public interface.
+ */
+
+
+#ifndef LP_WINSYS_H
+#define LP_WINSYS_H
+
+
+#include "pipe/p_compiler.h" // for boolean
+#include "pipe/p_format.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+struct pipe_screen;
+struct pipe_context;
+
+
+/**
+ * Opaque pointer.
+ */
+struct llvmpipe_displaytarget;
+
+
+/**
+ * This is the interface that llvmpipe expects any window system
+ * hosting it to implement.
+ * 
+ * llvmpipe is for the most part a self sufficient driver. The only thing it
+ * does not know is how to display a surface.
+ */
+struct llvmpipe_winsys
+{
+   void 
+   (*destroy)( struct llvmpipe_winsys *ws );
+
+   boolean
+   (*is_displaytarget_format_supported)( struct llvmpipe_winsys *ws,
+                                         enum pipe_format format );
+   
+   /**
+    * Allocate storage for a render target.
+    * 
+    * Often surfaces which are meant to be blitted to the front screen (i.e.,
+    * display targets) must be allocated with special characteristics, memory 
+    * pools, or obtained directly from the windowing system.
+    *  
+    * This callback is invoked by the pipe_screen when creating a texture marked
+    * with the PIPE_TEXTURE_USAGE_DISPLAY_TARGET flag to get the underlying 
+    * storage.
+    */
+   struct llvmpipe_displaytarget *
+   (*displaytarget_create)( struct llvmpipe_winsys *ws,
+                            enum pipe_format format,
+                            unsigned width, unsigned height,
+                            unsigned alignment,
+                            unsigned *stride );
+
+   void *
+   (*displaytarget_map)( struct llvmpipe_winsys *ws, 
+                         struct llvmpipe_displaytarget *dt,
+                         unsigned flags );
+
+   void
+   (*displaytarget_unmap)( struct llvmpipe_winsys *ws,
+                           struct llvmpipe_displaytarget *dt );
+
+   /**
+    * @sa pipe_screen:flush_frontbuffer.
+    *
+    * This call will likely become asynchronous eventually.
+    */
+   void
+   (*displaytarget_display)( struct llvmpipe_winsys *ws, 
+                             struct llvmpipe_displaytarget *dt,
+                             void *context_private );
+
+   void 
+   (*displaytarget_destroy)( struct llvmpipe_winsys *ws, 
+                             struct llvmpipe_displaytarget *dt );
+};
+
+
+struct pipe_context *
+llvmpipe_create( struct pipe_screen * );
+
+
+struct pipe_screen *
+llvmpipe_create_screen( struct llvmpipe_winsys * );
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* LP_WINSYS_H */
diff --git a/src/gallium/drivers/llvmpipe/sp2lp.sh b/src/gallium/drivers/llvmpipe/sp2lp.sh
new file mode 100755
index 0000000000..c45a81ce3c
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/sp2lp.sh
@@ -0,0 +1,43 @@
+#!/bin/sh
+#
+# Port changes from softpipe to llvmpipe. Invoke as
+#
+#   sp2lp.sh <commit>
+#
+# Note that this will only affect llvmpipe -- you still need to actually
+# cherry-pick/merge the softpipe changes themselves if they affect directories
+# outside src/gallium/drivers/softpipe
+#
+# The rewritten patch uses src/gallium/drivers/llvmpipe/ paths and is applied
+# with "patch -p1", so run this from the top of the source tree.  The sed
+# expressions use GNU regex extensions (\<, \>, \|).
+
+if [ $# -ne 1 ]; then
+	echo "usage: $0 <commit>" >&2
+	exit 1
+fi
+
+git format-patch \
+	--keep-subject \
+	--relative=src/gallium/drivers/softpipe \
+	--src-prefix=a/src/gallium/drivers/llvmpipe/ \
+	--dst-prefix=b/src/gallium/drivers/llvmpipe/ \
+	--stdout "$1^1..$1" \
+| sed \
+	-e 's/\<softpipe\>/llvmpipe/g' \
+	-e 's/\<sp\>/lp/g' \
+	-e 's/\<softpipe_/llvmpipe_/g' \
+	-e 's/\<sp_/lp_/g' \
+	-e 's/\<SP_/LP_/g' \
+	-e 's/\<SOFTPIPE_/LLVMPIPE_/g' \
+	-e 's/\<spt\>/lpt/g' \
+	-e 's/\<sps\>/lps/g' \
+	-e 's/\<spfs\>/lpfs/g' \
+	-e 's/\<sptex\>/lptex/g' \
+	-e 's/\<setup_\(point\|line\|tri\)\>/llvmpipe_&/g' \
+	-e 's/\<llvmpipe_cached_tile\>/llvmpipe_cached_tex_tile/g' \
+	-e 's/_get_cached_tile_tex\>/_get_cached_tex_tile/g' \
+	-e 's/\<TILE_SIZE\>/TEX_TILE_SIZE/g' \
+	-e 's/\<tile_address\>/tex_tile_address/g' \
+	-e 's/\<tile->data\.color\>/tile->color/g' \
+| patch -p1
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
index 832366e646..e4cf91c005 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.c
+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
@@ -4,6 +4,8 @@
 
 #include <util/u_memory.h>
 
+#include <errno.h>
+
 #include "nouveau/nouveau_bo.h"
 #include "nouveau_winsys.h"
 #include "nouveau_screen.h"
@@ -141,12 +143,13 @@ nouveau_screen_bo_map_range(struct pipe_screen *pscreen, struct pipe_buffer *pb,
 			    unsigned offset, unsigned length, unsigned usage)
 {
 	struct nouveau_bo *bo = nouveau_bo(pb);
+	uint32_t flags = nouveau_screen_map_flags(usage);
 	int ret;
 
-	ret = nouveau_bo_map_range(bo, offset, length,
-				   nouveau_screen_map_flags(usage));
+	ret = nouveau_bo_map_range(bo, offset, length, flags);
 	if (ret) {
-		debug_printf("map_range failed: %d\n", ret);
+		if (!(flags & NOUVEAU_BO_NOWAIT) || ret != -EBUSY)
+			debug_printf("map_range failed: %d\n", ret);
 		return NULL;
 	}
 
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.h b/src/gallium/drivers/nouveau/nouveau_screen.h
index 9968b07896..ebfc67ad1c 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.h
+++ b/src/gallium/drivers/nouveau/nouveau_screen.h
@@ -22,4 +22,15 @@ nouveau_bo(struct pipe_buffer *pb)
 int nouveau_screen_init(struct nouveau_screen *, struct nouveau_device *);
 void nouveau_screen_fini(struct nouveau_screen *);
 
+struct nouveau_miptree {
+	struct pipe_texture base; /* must stay first: nouveau_miptree() casts a pipe_texture * to this */
+	struct nouveau_bo *bo; /* buffer object stored by the driver's miptree create/blanket code */
+};
+/* Reinterpret a pipe_texture pointer as the nouveau_miptree that embeds it. */
+static inline struct nouveau_miptree *
+nouveau_miptree(struct pipe_texture *pt)
+{
+	return (struct nouveau_miptree *)pt;
+}
+
 #endif
diff --git a/src/gallium/drivers/nv04/nv04_screen.c b/src/gallium/drivers/nv04/nv04_screen.c
index ff2febb668..170ce3eb7e 100644
--- a/src/gallium/drivers/nv04/nv04_screen.c
+++ b/src/gallium/drivers/nv04/nv04_screen.c
@@ -16,8 +16,6 @@ nv04_screen_get_param(struct pipe_screen *screen, int param)
 		return 0;
 	case PIPE_CAP_GLSL:
 		return 0;
-	case PIPE_CAP_S3TC:
-		return 0;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 		return 0;
 	case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/nv04/nv04_surface_2d.c b/src/gallium/drivers/nv04/nv04_surface_2d.c
index f315cf54f0..f88e138c79 100644
--- a/src/gallium/drivers/nv04/nv04_surface_2d.c
+++ b/src/gallium/drivers/nv04/nv04_surface_2d.c
@@ -15,11 +15,13 @@ nv04_surface_format(enum pipe_format format)
 		return NV04_CONTEXT_SURFACES_2D_FORMAT_Y8;
 	case PIPE_FORMAT_R16_SNORM:
 	case PIPE_FORMAT_R5G6B5_UNORM:
+	case PIPE_FORMAT_Z16_UNORM:
 		return NV04_CONTEXT_SURFACES_2D_FORMAT_R5G6B5;
 	case PIPE_FORMAT_X8R8G8B8_UNORM:
 	case PIPE_FORMAT_A8R8G8B8_UNORM:
 		return NV04_CONTEXT_SURFACES_2D_FORMAT_A8R8G8B8;
 	case PIPE_FORMAT_Z24S8_UNORM:
+	case PIPE_FORMAT_Z24X8_UNORM:
 		return NV04_CONTEXT_SURFACES_2D_FORMAT_Y32;
 	default:
 		return -1;
@@ -33,9 +35,11 @@ nv04_rect_format(enum pipe_format format)
 	case PIPE_FORMAT_A8_UNORM:
 		return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8;
 	case PIPE_FORMAT_R5G6B5_UNORM:
+	case PIPE_FORMAT_Z16_UNORM:
 		return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A16R5G6B5;
 	case PIPE_FORMAT_A8R8G8B8_UNORM:
 	case PIPE_FORMAT_Z24S8_UNORM:
+	case PIPE_FORMAT_Z24X8_UNORM:
 		return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8;
 	default:
 		return -1;
@@ -110,10 +114,10 @@ nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx,
 	unsigned cx;
 	unsigned cy;
 
-	/* POT or GTFO */
-	assert(!(w & (w - 1)) && !(h & (h - 1)));
+#if 0
 	/* That's the way she likes it */
 	assert(src_pitch == ((struct nv04_surface *)dst)->pitch);
+#endif
 
 	BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_DMA_IMAGE, 1);
 	OUT_RELOCo(chan, dst_bo,
@@ -133,7 +137,7 @@ nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx,
 	for (cy = 0; cy < h; cy += sub_h) {
 	  for (cx = 0; cx < w; cx += sub_w) {
 	    BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_OFFSET, 1);
-	    OUT_RELOCl(chan, dst_bo, dst->offset + nv04_swizzle_bits(cx, cy) *
+	    OUT_RELOCl(chan, dst_bo, dst->offset + nv04_swizzle_bits(cx+dx, cy+dy) *
 			     dst->texture->block.size, NOUVEAU_BO_GART |
 			     NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 
@@ -153,8 +157,8 @@ nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx,
 	    OUT_RING  (chan, src_pitch |
 			     NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_CENTER |
 			     NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_POINT_SAMPLE);
-	    OUT_RELOCl(chan, src_bo, src->offset + cy * src_pitch +
-			     cx * src->texture->block.size, NOUVEAU_BO_GART |
+	    OUT_RELOCl(chan, src_bo, src->offset + (cy+sy) * src_pitch +
+			     (cx+sx) * src->texture->block.size, NOUVEAU_BO_GART |
 			     NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
 	    OUT_RING  (chan, 0);
 	  }
@@ -210,6 +214,43 @@ nv04_surface_copy_m2mf(struct nv04_surface_2d *ctx,
 }
 
 static int
+nv04_surface_copy_m2mf_swizzle(struct nv04_surface_2d *ctx,
+			       struct pipe_surface *dst, int dx, int dy,
+			       struct pipe_surface *src, int sx, int sy)
+{
+	struct nouveau_channel *chan = ctx->m2mf->channel;
+	struct nouveau_grobj *m2mf = ctx->m2mf;
+	struct nouveau_bo *src_bo = nouveau_bo(ctx->buf(src));
+	struct nouveau_bo *dst_bo = nouveau_bo(ctx->buf(dst));
+	unsigned src_pitch = ((struct nv04_surface *)src)->pitch;
+	unsigned dst_pitch = ((struct nv04_surface *)dst)->pitch;
+	unsigned dst_offset = dst->offset + nv04_swizzle_bits(dx, dy) *
+	                      dst->texture->block.size;
+	unsigned src_offset = src->offset + sy * src_pitch +
+	                      sx * src->texture->block.size;
+	/* Copies one block via m2mf: src is addressed by pitch at (sx, sy), dst through nv04_swizzle_bits(dx, dy). */
+	BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_BUFFER_IN, 2);
+	OUT_RELOCo(chan, src_bo,
+		   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	OUT_RELOCo(chan, dst_bo,
+		   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	/* in/out offsets and pitches, then (presumably) line length, line count, format, notify -- TODO confirm */
+	BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 8);
+	OUT_RELOCl(chan, src_bo, src_offset,
+		   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+	OUT_RELOCl(chan, dst_bo, dst_offset,
+		   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_WR);
+	OUT_RING  (chan, src_pitch);
+	OUT_RING  (chan, dst_pitch);
+	OUT_RING  (chan, 1 * src->texture->block.size);
+	OUT_RING  (chan, 1);
+	OUT_RING  (chan, 0x0101);
+	OUT_RING  (chan, 0);
+	/* there is no failure path here: the function always returns 0 */
+	return 0;
+}
+
+static int
 nv04_surface_copy_blit(struct nv04_surface_2d *ctx, struct pipe_surface *dst,
 		       int dx, int dy, struct pipe_surface *src, int sx, int sy,
 		       int w, int h)
@@ -258,8 +299,59 @@ nv04_surface_copy(struct nv04_surface_2d *ctx, struct pipe_surface *dst,
 	assert(src->format == dst->format);
 
 	/* Setup transfer to swizzle the texture to vram if needed */
-	if (src_linear && !dst_linear && w > 1 && h > 1) {
-		nv04_surface_copy_swizzle(ctx, dst, dx, dy, src, sx, sy, w, h);
+	if (src_linear && !dst_linear) {
+		int x,y;
+
+		if ((w>1) && (h>1)) {
+			int potWidth = 1<<log2i(w);
+			int potHeight = 1<<log2i(h);
+			int remainWidth = w-potWidth;
+			int remainHeight = h-potHeight;
+			int squareDim = (potWidth>potHeight ? potHeight : potWidth);
+
+			/* top left is always POT, but we can only swizzle squares */
+			for (y=0; y<potHeight; y+=squareDim) {
+				for (x=0; x<potWidth; x+= squareDim) {
+					nv04_surface_copy_swizzle(ctx, dst, dx+x, dy+y,
+					                          src, sx+x, sy+y,
+					                          squareDim, squareDim);
+				}
+			}
+
+			/* top right */
+			if (remainWidth>0) {
+			nv04_surface_copy(ctx, dst, dx+potWidth, dy,
+				                  src, sx+potWidth, sy,
+				                  remainWidth, potHeight);
+			}
+
+			/* bottom left */
+			if (remainHeight>0) {
+				nv04_surface_copy(ctx, dst, dx, dy+potHeight,
+			                  src, sx, sy+potHeight,
+				                  potWidth, remainHeight);
+			}
+
+			/* bottom right */
+			if ((remainWidth>0) && (remainHeight>0)) {
+				nv04_surface_copy(ctx, dst, dx+potWidth, dy+potHeight,
+				                  src, sx+potWidth, sy+potHeight,
+				                  remainWidth, remainHeight);
+			}
+		} else if (w==1) {
+			/* We have a column to copy to a swizzled texture */
+			for (y=0; y<h; y++) {
+				nv04_surface_copy_m2mf_swizzle(ctx, dst, dx, dy+y,
+				                               src, sx, sy+y);
+			}
+		} else if (h==1) {
+			/* We have a row to copy to a swizzled texture */
+			for (x=0; x<w; x++) {
+				nv04_surface_copy_m2mf_swizzle(ctx, dst, dx+x, dy,
+				                               src, sx+x, sy);
+			}
+		}
+
 		return;
 	}
 
diff --git a/src/gallium/drivers/nv10/nv10_screen.c b/src/gallium/drivers/nv10/nv10_screen.c
index 4469b22d91..ee5901e743 100644
--- a/src/gallium/drivers/nv10/nv10_screen.c
+++ b/src/gallium/drivers/nv10/nv10_screen.c
@@ -15,8 +15,6 @@ nv10_screen_get_param(struct pipe_screen *screen, int param)
 		return 0;
 	case PIPE_CAP_GLSL:
 		return 0;
-	case PIPE_CAP_S3TC:
-		return 0;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 		return 1;
 	case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/nv20/nv20_screen.c b/src/gallium/drivers/nv20/nv20_screen.c
index e6924ad71e..4eeacd1afd 100644
--- a/src/gallium/drivers/nv20/nv20_screen.c
+++ b/src/gallium/drivers/nv20/nv20_screen.c
@@ -15,8 +15,6 @@ nv20_screen_get_param(struct pipe_screen *screen, int param)
 		return 0;
 	case PIPE_CAP_GLSL:
 		return 0;
-	case PIPE_CAP_S3TC:
-		return 0;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 		return 1;
 	case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/nv30/nv30_screen.c b/src/gallium/drivers/nv30/nv30_screen.c
index c8b40784b0..41af38450b 100644
--- a/src/gallium/drivers/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nv30/nv30_screen.c
@@ -22,8 +22,6 @@ nv30_screen_get_param(struct pipe_screen *pscreen, int param)
 		return 1;
 	case PIPE_CAP_GLSL:
 		return 0;
-	case PIPE_CAP_S3TC:
-		return 0;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 		return 1;
 	case PIPE_CAP_POINT_SPRITE:
@@ -95,7 +93,7 @@ nv30_screen_surface_format_supported(struct pipe_screen *pscreen,
 		}
 	} else
 	if (tex_usage & PIPE_TEXTURE_USAGE_DEPTH_STENCIL) {
-		switch (tex_usage) {
+		switch (format) {
 		case PIPE_FORMAT_Z24S8_UNORM:
 		case PIPE_FORMAT_Z24X8_UNORM:
 		case PIPE_FORMAT_Z16_UNORM:
diff --git a/src/gallium/drivers/nv40/nv40_screen.c b/src/gallium/drivers/nv40/nv40_screen.c
index 5d2a4216c5..bd13dfddd1 100644
--- a/src/gallium/drivers/nv40/nv40_screen.c
+++ b/src/gallium/drivers/nv40/nv40_screen.c
@@ -21,8 +21,6 @@ nv40_screen_get_param(struct pipe_screen *pscreen, int param)
 		return 1;
 	case PIPE_CAP_GLSL:
 		return 0;
-	case PIPE_CAP_S3TC:
-		return 1;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 		return 1;
 	case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/nv50/nv50_context.c b/src/gallium/drivers/nv50/nv50_context.c
index e02afc4be9..fca078b174 100644
--- a/src/gallium/drivers/nv50/nv50_context.c
+++ b/src/gallium/drivers/nv50/nv50_context.c
@@ -31,15 +31,24 @@ static void
 nv50_flush(struct pipe_context *pipe, unsigned flags,
 	   struct pipe_fence_handle **fence)
 {
-	struct nv50_context *nv50 = (struct nv50_context *)pipe;
-	
-	FIRE_RING(nv50->screen->base.channel);
+	struct nv50_context *nv50 = nv50_context(pipe);
+	struct nouveau_channel *chan = nv50->screen->base.channel;
+	struct nouveau_grobj *eng2d = nv50->screen->eng2d;
+
+	/* We need this in the ddx for reliable composite, not sure what we're
+	 * actually flushing. We generate all our own flushes with flags = 0. */
+	WAIT_RING(chan, 2);
+	BEGIN_RING(chan, eng2d, 0x0110, 1);
+	OUT_RING  (chan, 0);
+
+	if (flags & PIPE_FLUSH_FRAME)
+		FIRE_RING(chan);
 }
 
 static void
 nv50_destroy(struct pipe_context *pipe)
 {
-	struct nv50_context *nv50 = (struct nv50_context *)pipe;
+	struct nv50_context *nv50 = nv50_context(pipe);
 
 	draw_destroy(nv50->draw);
 	FREE(nv50);
@@ -102,6 +111,9 @@ nv50_create(struct pipe_screen *pscreen, unsigned pctx_id)
 	nv50->pipe.is_texture_referenced = nv50_is_texture_referenced;
 	nv50->pipe.is_buffer_referenced = nv50_is_buffer_referenced;
 
+	screen->base.channel->user_private = nv50;
+	screen->base.channel->flush_notify = nv50_state_flush_notify;
+
 	nv50_init_surface_functions(nv50);
 	nv50_init_state_functions(nv50);
 	nv50_init_query_functions(nv50);
@@ -112,5 +124,3 @@ nv50_create(struct pipe_screen *pscreen, unsigned pctx_id)
 
 	return &nv50->pipe;
 }
-
-		
diff --git a/src/gallium/drivers/nv50/nv50_context.h b/src/gallium/drivers/nv50/nv50_context.h
index 9b8cc4d37d..4608854d71 100644
--- a/src/gallium/drivers/nv50/nv50_context.h
+++ b/src/gallium/drivers/nv50/nv50_context.h
@@ -71,12 +71,11 @@ struct nv50_sampler_stateobj {
 struct nv50_miptree_level {
 	int *image_offset;
 	unsigned pitch;
+	unsigned tile_mode;
 };
 
 struct nv50_miptree {
-	struct pipe_texture base;
-
-	struct nouveau_bo *bo;
+	struct nouveau_miptree base;
 
 	struct nv50_miptree_level level[PIPE_MAX_TEXTURE_LEVELS];
 	int image_nr;
@@ -117,8 +116,10 @@ struct nv50_state {
 	unsigned miptree_nr;
 	struct nouveau_stateobj *vertprog;
 	struct nouveau_stateobj *fragprog;
+	struct nouveau_stateobj *programs;
 	struct nouveau_stateobj *vtxfmt;
 	struct nouveau_stateobj *vtxbuf;
+	struct nouveau_stateobj *vtxattr;
 };
 
 struct nv50_context {
@@ -190,10 +191,12 @@ extern void nv50_clear(struct pipe_context *pipe, unsigned buffers,
 /* nv50_program.c */
 extern void nv50_vertprog_validate(struct nv50_context *nv50);
 extern void nv50_fragprog_validate(struct nv50_context *nv50);
+extern void nv50_linkage_validate(struct nv50_context *nv50);
 extern void nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p);
 
 /* nv50_state_validate.c */
 extern boolean nv50_state_validate(struct nv50_context *nv50);
+extern void nv50_state_flush_notify(struct nouveau_channel *chan);
 
 /* nv50_tex.c */
 extern void nv50_tex_validate(struct nv50_context *);
diff --git a/src/gallium/drivers/nv50/nv50_miptree.c b/src/gallium/drivers/nv50/nv50_miptree.c
index 22465e0227..93479a0314 100644
--- a/src/gallium/drivers/nv50/nv50_miptree.c
+++ b/src/gallium/drivers/nv50/nv50_miptree.c
@@ -31,20 +31,25 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp)
 {
 	struct nouveau_device *dev = nouveau_screen(pscreen)->device;
 	struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree);
-	struct pipe_texture *pt = &mt->base;
+	struct pipe_texture *pt = &mt->base.base;
 	unsigned width = tmp->width[0], height = tmp->height[0];
 	unsigned depth = tmp->depth[0];
 	uint32_t tile_mode, tile_flags, tile_h;
 	int ret, i, l;
 
-	mt->base = *tmp;
-	pipe_reference_init(&mt->base.reference, 1);
-	mt->base.screen = pscreen;
+	*pt = *tmp;
+	pipe_reference_init(&pt->reference, 1);
+	pt->screen = pscreen;
 
 	switch (pt->format) {
-	case PIPE_FORMAT_Z24X8_UNORM:
+	case PIPE_FORMAT_Z32_FLOAT:
+		tile_flags = 0x4800;
+		break;
 	case PIPE_FORMAT_Z24S8_UNORM:
-	case PIPE_FORMAT_Z16_UNORM:
+		tile_flags = 0x1800;
+		break;
+	case PIPE_FORMAT_X8Z24_UNORM:
+	case PIPE_FORMAT_S8Z24_UNORM:
 		tile_flags = 0x2800;
 		break;
 	default:
@@ -82,20 +87,27 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp)
 
 		lvl->image_offset = CALLOC(mt->image_nr, sizeof(int));
 		lvl->pitch = align(pt->width[l] * pt->block.size, 64);
+		lvl->tile_mode = tile_mode;
 
 		width = MAX2(1, width >> 1);
 		height = MAX2(1, height >> 1);
 		depth = MAX2(1, depth >> 1);
+
+		if (tile_mode && height <= (tile_h >> 1)) {
+			tile_mode--;
+			tile_h >>= 1;
+		}
 	}
 
 	for (i = 0; i < mt->image_nr; i++) {
 		for (l = 0; l <= pt->last_level; l++) {
 			struct nv50_miptree_level *lvl = &mt->level[l];
 			int size;
+			tile_h = 1 << (lvl->tile_mode + 2);
 
 			size  = align(pt->width[l], 8) * pt->block.size;
 			size  = align(size, 64);
-			size *= align(pt->height[l], tile_h) * pt->block.size;
+			size *= align(pt->height[l], tile_h);
 
 			lvl->image_offset[i] = mt->total_size;
 
@@ -104,13 +116,14 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp)
 	}
 
 	ret = nouveau_bo_new_tile(dev, NOUVEAU_BO_VRAM, 256, mt->total_size,
-				  tile_mode, tile_flags, &mt->bo);
+				  mt->level[0].tile_mode, tile_flags,
+				  &mt->base.bo);
 	if (ret) {
 		FREE(mt);
 		return NULL;
 	}
-			     
-	return &mt->base;
+
+	return pt;
 }
 
 static struct pipe_texture *
@@ -129,15 +142,16 @@ nv50_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt,
 	if (!mt)
 		return NULL;
 
-	mt->base = *pt;
-	pipe_reference_init(&mt->base.reference, 1);
-	mt->base.screen = pscreen;
+	mt->base.base = *pt;
+	pipe_reference_init(&mt->base.base.reference, 1);
+	mt->base.base.screen = pscreen;
 	mt->image_nr = 1;
 	mt->level[0].pitch = *stride;
 	mt->level[0].image_offset = CALLOC(1, sizeof(unsigned));
+	mt->level[0].tile_mode = bo->tile_mode;
 
-	nouveau_bo_ref(bo, &mt->bo);
-	return &mt->base;
+	nouveau_bo_ref(bo, &mt->base.bo);
+	return &mt->base.base;
 }
 
 static void
@@ -145,8 +159,8 @@ nv50_miptree_destroy(struct pipe_texture *pt)
 {
 	struct nv50_miptree *mt = nv50_miptree(pt);
 
-	nouveau_bo_ref(NULL, &mt->bo);
-        FREE(mt);
+	nouveau_bo_ref(NULL, &mt->base.bo);
+	FREE(mt);
 }
 
 static struct pipe_surface *
@@ -189,8 +203,8 @@ nv50_miptree_surface_del(struct pipe_surface *ps)
 {
 	struct nv50_surface *s = nv50_surface(ps);
 
-        pipe_texture_reference(&ps->texture, NULL);
-        FREE(s);
+	pipe_texture_reference(&ps->texture, NULL);
+	FREE(s);
 }
 
 void
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index 4ec9c03305..eb90d5e66f 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -112,6 +112,10 @@ struct nv50_pc {
 	struct nv50_reg *temp_temp[16];
 	unsigned temp_temp_nr;
 
+	/* broadcast and destination replacement regs */
+	struct nv50_reg *r_brdc;
+	struct nv50_reg *r_dst[4];
+
 	unsigned interp_mode[32];
 	/* perspective interpolation registers */
 	struct nv50_reg *iv_p;
@@ -124,6 +128,25 @@ struct nv50_pc {
 	boolean allow32;
 };
 
+static INLINE void
+ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
+{
+	/* neg and acc start out zeroed, rhw starts out as -1 */
+	reg->neg = reg->acc = 0;
+	reg->rhw = -1;
+	reg->hw = hw;
+	reg->index = index;
+	reg->type = type;
+}
+
+static INLINE unsigned
+popcnt4(uint32_t val)
+{
+	/* number of set bits in the low nibble; higher bits are ignored */
+	return ((val >> 0) & 1) + ((val >> 1) & 1) +
+	       ((val >> 2) & 1) + ((val >> 3) & 1);
+}
+
 static void
 alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
 {
@@ -184,11 +207,8 @@ alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
 
 	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
 		if (!pc->r_temp[i]) {
-			r = CALLOC_STRUCT(nv50_reg);
-			r->type = P_TEMP;
-			r->index = -1;
-			r->hw = i;
-			r->rhw = -1;
+			r = MALLOC_STRUCT(nv50_reg);
+			ctor_reg(r, P_TEMP, -1, i);
 			pc->r_temp[i] = r;
 			return r;
 		}
@@ -251,13 +271,11 @@ alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
 
 	if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
 	    pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
-		return alloc_temp4(pc, dst, idx + 1);
+		return alloc_temp4(pc, dst, idx + 4);
 
 	for (i = 0; i < 4; i++) {
-		dst[i] = CALLOC_STRUCT(nv50_reg);
-		dst[i]->type = P_TEMP;
-		dst[i]->index = -1;
-		dst[i]->hw = idx + i;
+		dst[i] = MALLOC_STRUCT(nv50_reg);
+		ctor_reg(dst[i], P_TEMP, -1, idx + i);
 		pc->r_temp[idx + i] = dst[i];
 	}
 
@@ -296,7 +314,7 @@ kill_temp_temp(struct nv50_pc *pc)
 static int
 ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
 {
-	pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * r * sizeof(float)),
+	pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * 4 * sizeof(float)),
 			       (pc->immd_nr + 1) * 4 * sizeof(float));
 	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
 	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
@@ -309,7 +327,7 @@ ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
 static struct nv50_reg *
 alloc_immd(struct nv50_pc *pc, float f)
 {
-	struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
+	struct nv50_reg *r = MALLOC_STRUCT(nv50_reg);
 	unsigned hw;
 
 	for (hw = 0; hw < pc->immd_nr * 4; hw++)
@@ -319,9 +337,7 @@ alloc_immd(struct nv50_pc *pc, float f)
 	if (hw == pc->immd_nr * 4)
 		hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;
 
-	r->type = P_IMMD;
-	r->hw = hw;
-	r->index = -1;
+	ctor_reg(r, P_IMMD, -1, hw);
 	return r;
 }
 
@@ -786,6 +802,9 @@ emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 #define CVTOP_SAT	0x08
 #define CVTOP_ABS	0x10
 
+/* 0x04 == 32 bit */
+/* 0x40 == dst is float */
+/* 0x80 == src is float */
 #define CVT_F32_F32 0xc4
 #define CVT_F32_S32 0x44
 #define CVT_F32_U32 0x64
@@ -795,7 +814,7 @@ emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 
 static void
 emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
-	 int wp, unsigned cop, unsigned fmt)
+	 int wp, unsigned cvn, unsigned fmt)
 {
 	struct nv50_program_exec *e;
 
@@ -804,7 +823,7 @@ emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
 
 	e->inst[0] |= 0xa0000000;
 	e->inst[1] |= 0x00004000;
-	e->inst[1] |= (cop << 16);
+	e->inst[1] |= (cvn << 16);
 	e->inst[1] |= (fmt << 24);
 	set_src_0(pc, src, e);
 
@@ -821,49 +840,80 @@ emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
 	emit(pc, e);
 }
 
+/* nv50 Condition codes:
+ *  0x1 = LT
+ *  0x2 = EQ
+ *  0x3 = LE
+ *  0x4 = GT
+ *  0x5 = NE
+ *  0x6 = GE
+ *  0x7 = set condition code ? (used before bra.lt/le/gt/ge)
+ *  0x8 = unordered bit (allows NaN)
+ */
 static void
-emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
+emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
 	 struct nv50_reg *src0, struct nv50_reg *src1)
 {
+	static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
+
 	struct nv50_program_exec *e = exec(pc);
-	unsigned inv_cop[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
 	struct nv50_reg *rdst;
 
-	assert(c_op <= 7);
+	assert(ccode < 16);
 	if (check_swap_src_0_1(pc, &src0, &src1))
-		c_op = inv_cop[c_op];
+		ccode = cc_swapped[ccode & 7] | (ccode & 8);
 
 	rdst = dst;
-	if (dst->type != P_TEMP)
+	if (dst && dst->type != P_TEMP)
 		dst = alloc_temp(pc, NULL);
 
 	/* set.u32 */
 	set_long(pc, e);
 	e->inst[0] |= 0xb0000000;
-	e->inst[1] |= (3 << 29);
-	e->inst[1] |= (c_op << 14);
-	/*XXX: breaks things, .u32 by default?
-	 *     decuda will disasm as .u16 and use .lo/.hi regs, but this
-	 *     doesn't seem to match what the hw actually does.
-	inst[1] |= 0x04000000; << breaks things.. .u32 by default?
+	e->inst[1] |= 0x60000000 | (ccode << 14);
+
+	/* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but
+	 * that doesn't seem to match what the hw actually does
+	e->inst[1] |= 0x04000000; << breaks things, u32 by default ?
 	 */
-	set_dst(pc, dst, e);
+
+	if (wp >= 0)
+		set_pred_wr(pc, 1, wp, e);
+	if (dst)
+		set_dst(pc, dst, e);
+	else {
+		e->inst[0] |= 0x000001fc;
+		e->inst[1] |= 0x00000008;
+	}
+
 	set_src_0(pc, src0, e);
 	set_src_1(pc, src1, e);
-	emit(pc, e);
 
-	/* cvt.f32.u32 */
-	e = exec(pc);
-	e->inst[0] = 0xa0000001;
-	e->inst[1] = 0x64014780;
-	set_dst(pc, rdst, e);
-	set_src_0(pc, dst, e);
 	emit(pc, e);
 
-	if (dst != rdst)
+	/* cvt.f32.u32/s32 (?) if we didn't only write the predicate */
+	if (rdst)
+		emit_cvt(pc, rdst, dst, -1, CVTOP_ABS | CVTOP_RN, CVT_F32_S32);
+	if (rdst && rdst != dst)
 		free_temp(pc, dst);
 }
 
+static INLINE unsigned
+map_tgsi_setop_cc(unsigned op)
+{
+	/* nv50 condition code for each TGSI set-on-compare opcode */
+	if (op == TGSI_OPCODE_SLT) return 0x1;
+	if (op == TGSI_OPCODE_SEQ) return 0x2;
+	if (op == TGSI_OPCODE_SLE) return 0x3;
+	if (op == TGSI_OPCODE_SGT) return 0x4;
+	if (op == TGSI_OPCODE_SGE) return 0x6;
+	/* NE (0x5) with the unordered bit (0x8) set */
+	if (op == TGSI_OPCODE_SNE) return 0xd;
+
+	assert(0);
+	return 0;
+}
+
 static INLINE void
 emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 {
@@ -890,6 +940,12 @@ emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 	emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
 }
 
+static INLINE void
+emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+	emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32); /* f32 -> f32 cvt of src into dst with the SAT op */
+}
+
 static void
 emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
 	 struct nv50_reg **src)
@@ -1014,6 +1070,7 @@ emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
 		break;
 	}
 
+	/* some cards need t[0]'s hw index to be a multiple of 4 */
 	alloc_temp4(pc, t, 0);
 
 	if (proj) {
@@ -1105,10 +1162,10 @@ convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
 		m = 0xffff7fff;
 		break;
 	case 0x8:
-		/* INTERP */
-		m = ~0x02000000;
-		if (e->inst[0] & 0x02000000)
-			q = 0x00020000;
+		/* INTERP (move centroid, perspective and flat bits) */
+		m = ~0x03000100;
+		q = (e->inst[0] & (3 << 24)) >> (24 - 16);
+		q |= (e->inst[0] & (1 << 8)) << (18 - 8);
 		break;
 	case 0x9:
 		/* RCP */
@@ -1158,6 +1215,70 @@ negate_supported(const struct tgsi_full_instruction *insn, int i)
 	}
 }
 
+/* Return a read mask for source registers deduced from opcode & write mask. */
+static unsigned
+nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
+{
+	unsigned x, mask = insn->FullDstRegisters[0].DstRegister.WriteMask;
+	/* c: index of the source operand (only DST distinguishes operands) */
+	switch (insn->Instruction.Opcode) {
+	case TGSI_OPCODE_COS:
+	case TGSI_OPCODE_SIN:
+		return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
+	case TGSI_OPCODE_DP2: /* scalar per is_scalar_op(): reads x,y whatever the write mask */
+		return 0x3;
+	case TGSI_OPCODE_DP3:
+		return 0x7;
+	case TGSI_OPCODE_DP4:
+	case TGSI_OPCODE_DPH:
+	case TGSI_OPCODE_KIL: /* WriteMask ignored */
+		return 0xf;
+	case TGSI_OPCODE_DST:
+		return mask & (c ? 0xa : 0x6);
+	case TGSI_OPCODE_EX2:
+	case TGSI_OPCODE_LG2:
+	case TGSI_OPCODE_POW:
+	case TGSI_OPCODE_RCP:
+	case TGSI_OPCODE_RSQ:
+	case TGSI_OPCODE_SCS:
+		return 0x1;
+	case TGSI_OPCODE_LIT:
+		return 0xb;
+	case TGSI_OPCODE_TEX:
+	case TGSI_OPCODE_TXP:
+	{
+		const struct tgsi_instruction_ext_texture *tex;
+
+		assert(insn->Instruction.Extended);
+		tex = &insn->InstructionExtTexture;
+		/* xyz to start with; TXP additionally reads w */
+		mask = (insn->Instruction.Opcode == TGSI_OPCODE_TXP) ? 0xf : 0x7;
+		/* 1D keeps only x (and w), 2D only x,y (and w) */
+		switch (tex->Texture) {
+		case TGSI_TEXTURE_1D:
+			mask &= 0x9;
+			break;
+		case TGSI_TEXTURE_2D:
+			mask &= 0xb;
+			break;
+		default:
+			break;
+		}
+	}
+		return mask;
+	case TGSI_OPCODE_XPD:
+		x = 0;
+		if (mask & 1) x |= 0x6;
+		if (mask & 2) x |= 0x5;
+		if (mask & 4) x |= 0x3;
+		return x;
+	default:
+		break;
+	}
+
+	return mask;
+}
+
 static struct nv50_reg *
 tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
 {
@@ -1257,93 +1378,126 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
 	return r;
 }
 
-/* returns TRUE if instruction can overwrite sources before they're read */
+/* return TRUE for ops that produce only a single result */
 static boolean
-direct2dest_op(const struct tgsi_full_instruction *insn)
+is_scalar_op(unsigned op)
 {
-	if (insn->Instruction.Saturate)
-		return FALSE;
-
-	switch (insn->Instruction.Opcode) {
+	switch (op) {
 	case TGSI_OPCODE_COS:
+	case TGSI_OPCODE_DP2:
 	case TGSI_OPCODE_DP3:
 	case TGSI_OPCODE_DP4:
 	case TGSI_OPCODE_DPH:
-	case TGSI_OPCODE_KIL:
-	case TGSI_OPCODE_LIT:
+	case TGSI_OPCODE_EX2:
+	case TGSI_OPCODE_LG2:
 	case TGSI_OPCODE_POW:
 	case TGSI_OPCODE_RCP:
 	case TGSI_OPCODE_RSQ:
-	case TGSI_OPCODE_SCS:
 	case TGSI_OPCODE_SIN:
+		/*
+	case TGSI_OPCODE_KIL:
+	case TGSI_OPCODE_LIT:
+	case TGSI_OPCODE_SCS:
+		*/
+		return TRUE;
+	default:
+		return FALSE;
+	}
+}
+
+/* Returns a bitmask indicating which dst components depend
+ * on source s, component c (reverse of nv50_tgsi_src_mask).
+ */
+static unsigned
+nv50_tgsi_dst_revdep(unsigned op, int s, int c)
+{
+	if (is_scalar_op(op))
+		return 0x1;
+
+	switch (op) {
+	case TGSI_OPCODE_DST:
+		return (1 << c) & (s ? 0xa : 0x6);
+	case TGSI_OPCODE_XPD:
+		switch (c) {
+		case 0: return 0x6;
+		case 1: return 0x5;
+		case 2: return 0x3;
+		case 3: return 0x0;
+		default:
+			assert(0);
+			return 0x0;
+		}
+	case TGSI_OPCODE_LIT:
+	case TGSI_OPCODE_SCS:
 	case TGSI_OPCODE_TEX:
 	case TGSI_OPCODE_TXP:
-		return FALSE;
+		/* these take care of dangerous swizzles themselves */
+		return 0x0;
+	case TGSI_OPCODE_IF:
+	case TGSI_OPCODE_KIL:
+		/* don't call this function for these ops */
+		assert(0);
+		return 0;
 	default:
-		return TRUE;
+		/* linear vector instruction */
+		return (1 << c);
 	}
 }
 
 static boolean
-nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
+nv50_program_tx_insn(struct nv50_pc *pc,
+		     const struct tgsi_full_instruction *inst)
 {
-	const struct tgsi_full_instruction *inst = &tok->FullInstruction;
-	struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;
+	struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp;
 	unsigned mask, sat, unit;
-	boolean assimilate = FALSE;
 	int i, c;
 
 	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
 	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
 
+	memset(src, 0, sizeof(src));
+
 	for (c = 0; c < 4; c++) {
-		if (mask & (1 << c))
+		if ((mask & (1 << c)) && !pc->r_dst[c])
 			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
 		else
-			dst[c] = NULL;
-		rdst[c] = NULL;
-		src[0][c] = NULL;
-		src[1][c] = NULL;
-		src[2][c] = NULL;
+			dst[c] = pc->r_dst[c];
+		rdst[c] = dst[c];
 	}
 
 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
 		const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
+		unsigned src_mask;
+		boolean neg_supp;
+
+		src_mask = nv50_tgsi_src_mask(inst, i);
+		neg_supp = negate_supported(inst, i);
 
 		if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
 			unit = fs->SrcRegister.Index;
 
 		for (c = 0; c < 4; c++)
-			src[i][c] = tgsi_src(pc, c, fs,
-					     negate_supported(inst, i));
+			if (src_mask & (1 << c))
+				src[i][c] = tgsi_src(pc, c, fs, neg_supp);
 	}
 
-	if (sat) {
-		for (c = 0; c < 4; c++) {
-			rdst[c] = dst[c];
-			dst[c] = temp_temp(pc);
-		}
+	brdc = temp = pc->r_brdc;
+	if (brdc && brdc->type != P_TEMP) {
+		temp = temp_temp(pc);
+		if (sat)
+			brdc = temp;
 	} else
-	if (direct2dest_op(inst)) {
+	if (sat) {
 		for (c = 0; c < 4; c++) {
-			if (!dst[c] || dst[c]->type != P_TEMP)
+			if (!(mask & (1 << c)) || dst[c]->type == P_TEMP)
 				continue;
-
-			for (i = c + 1; i < 4; i++) {
-				if (dst[c] == src[0][i] ||
-				    dst[c] == src[1][i] ||
-				    dst[c] == src[2][i])
-					break;
-			}
-			if (i == 4)
-				continue;
-
-			assimilate = TRUE;
 			rdst[c] = dst[c];
-			dst[c] = alloc_temp(pc, NULL);
+			dst[c] = temp_temp(pc);
 		}
 	}
 
+	assert(brdc || !is_scalar_op(inst->Instruction.Opcode));
+
 	switch (inst->Instruction.Opcode) {
 	case TGSI_OPCODE_ABS:
 		for (c = 0; c < 4; c++) {
@@ -1359,74 +1513,56 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 			emit_add(pc, dst[c], src[0][c], src[1][c]);
 		}
 		break;
-	case TGSI_OPCODE_COS:
-		temp = temp_temp(pc);
-		emit_precossin(pc, temp, src[0][0]);
-		emit_flop(pc, 5, temp, temp);
+	case TGSI_OPCODE_CEIL:
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			emit_mov(pc, dst[c], temp);
+			emit_cvt(pc, dst[c], src[0][c], -1,
+				 CVTOP_CEIL, CVT_F32_F32);
+		}
+		break;
+	case TGSI_OPCODE_COS:
+		if (mask & 8) {
+			emit_precossin(pc, temp, src[0][3]);
+			emit_flop(pc, 5, dst[3], temp);
+			if (!(mask &= 7))
+				break;
+			if (temp == dst[3])
+				temp = brdc = temp_temp(pc);
 		}
+		emit_precossin(pc, temp, src[0][0]);
+		emit_flop(pc, 5, brdc, temp);
 		break;
 	case TGSI_OPCODE_DP3:
-		temp = temp_temp(pc);
 		emit_mul(pc, temp, src[0][0], src[1][0]);
 		emit_mad(pc, temp, src[0][1], src[1][1], temp);
-		emit_mad(pc, temp, src[0][2], src[1][2], temp);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_mad(pc, brdc, src[0][2], src[1][2], temp);
 		break;
 	case TGSI_OPCODE_DP4:
-		temp = temp_temp(pc);
 		emit_mul(pc, temp, src[0][0], src[1][0]);
 		emit_mad(pc, temp, src[0][1], src[1][1], temp);
 		emit_mad(pc, temp, src[0][2], src[1][2], temp);
-		emit_mad(pc, temp, src[0][3], src[1][3], temp);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_mad(pc, brdc, src[0][3], src[1][3], temp);
 		break;
 	case TGSI_OPCODE_DPH:
-		temp = temp_temp(pc);
 		emit_mul(pc, temp, src[0][0], src[1][0]);
 		emit_mad(pc, temp, src[0][1], src[1][1], temp);
 		emit_mad(pc, temp, src[0][2], src[1][2], temp);
-		emit_add(pc, temp, src[1][3], temp);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_add(pc, brdc, src[1][3], temp);
 		break;
 	case TGSI_OPCODE_DST:
-	{
-		struct nv50_reg *one = alloc_immd(pc, 1.0);
-		if (mask & (1 << 0))
-			emit_mov(pc, dst[0], one);
 		if (mask & (1 << 1))
 			emit_mul(pc, dst[1], src[0][1], src[1][1]);
 		if (mask & (1 << 2))
 			emit_mov(pc, dst[2], src[0][2]);
 		if (mask & (1 << 3))
 			emit_mov(pc, dst[3], src[1][3]);
-		FREE(one);
-	}
+		if (mask & (1 << 0))
+			emit_mov_immdval(pc, dst[0], 1.0f);
 		break;
 	case TGSI_OPCODE_EX2:
-		temp = temp_temp(pc);
 		emit_preex2(pc, temp, src[0][0]);
-		emit_flop(pc, 6, temp, temp);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_flop(pc, 6, brdc, temp);
 		break;
 	case TGSI_OPCODE_FLR:
 		for (c = 0; c < 4; c++) {
@@ -1449,19 +1585,12 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		emit_kil(pc, src[0][1]);
 		emit_kil(pc, src[0][2]);
 		emit_kil(pc, src[0][3]);
-		pc->p->cfg.fp.regs[2] |= 0x00100000;
 		break;
 	case TGSI_OPCODE_LIT:
 		emit_lit(pc, &dst[0], mask, &src[0][0]);
 		break;
 	case TGSI_OPCODE_LG2:
-		temp = temp_temp(pc);
-		emit_flop(pc, 3, temp, src[0][0]);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_flop(pc, 3, brdc, src[0][0]);
 		break;
 	case TGSI_OPCODE_LRP:
 		temp = temp_temp(pc);
@@ -1494,6 +1623,7 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		}
 		break;
 	case TGSI_OPCODE_MOV:
+	case TGSI_OPCODE_SWZ:
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
@@ -1508,31 +1638,18 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		}
 		break;
 	case TGSI_OPCODE_POW:
-		temp = temp_temp(pc);
-		emit_pow(pc, temp, src[0][0], src[1][0]);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_pow(pc, brdc, src[0][0], src[1][0]);
 		break;
 	case TGSI_OPCODE_RCP:
-		for (c = 3; c >= 0; c--) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_flop(pc, 0, dst[c], src[0][0]);
-		}
+		emit_flop(pc, 0, brdc, src[0][0]);
 		break;
 	case TGSI_OPCODE_RSQ:
-		for (c = 3; c >= 0; c--) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_flop(pc, 2, dst[c], src[0][0]);
-		}
+		emit_flop(pc, 2, brdc, src[0][0]);
 		break;
 	case TGSI_OPCODE_SCS:
 		temp = temp_temp(pc);
-		emit_precossin(pc, temp, src[0][0]);
+		if (mask & 3)
+			emit_precossin(pc, temp, src[0][0]);
 		if (mask & (1 << 0))
 			emit_flop(pc, 5, dst[0], temp);
 		if (mask & (1 << 1))
@@ -1542,28 +1659,29 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		if (mask & (1 << 3))
 			emit_mov_immdval(pc, dst[3], 1.0);
 		break;
-	case TGSI_OPCODE_SGE:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_set(pc, 6, dst[c], src[0][c], src[1][c]);
-		}
-		break;
 	case TGSI_OPCODE_SIN:
-		temp = temp_temp(pc);
-		emit_precossin(pc, temp, src[0][0]);
-		emit_flop(pc, 4, temp, temp);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
+		if (mask & 8) {
+			emit_precossin(pc, temp, src[0][3]);
+			emit_flop(pc, 4, dst[3], temp);
+			if (!(mask &= 7))
+				break;
+			if (temp == dst[3])
+				temp = brdc = temp_temp(pc);
 		}
+		emit_precossin(pc, temp, src[0][0]);
+		emit_flop(pc, 4, brdc, temp);
 		break;
 	case TGSI_OPCODE_SLT:
+	case TGSI_OPCODE_SGE:
+	case TGSI_OPCODE_SEQ:
+	case TGSI_OPCODE_SGT:
+	case TGSI_OPCODE_SLE:
+	case TGSI_OPCODE_SNE:
+		i = map_tgsi_setop_cc(inst->Instruction.Opcode);
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			emit_set(pc, 1, dst[c], src[0][c], src[1][c]);
+			emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]);
 		}
 		break;
 	case TGSI_OPCODE_SUB:
@@ -1581,6 +1699,14 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		emit_tex(pc, dst, mask, src[0], unit,
 			 inst->InstructionExtTexture.Texture, TRUE);
 		break;
+	case TGSI_OPCODE_TRUNC:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_cvt(pc, dst[c], src[0][c], -1,
+				 CVTOP_TRUNC, CVT_F32_F32);
+		}
+		break;
 	case TGSI_OPCODE_XPD:
 		temp = temp_temp(pc);
 		if (mask & (1 << 0)) {
@@ -1605,17 +1731,22 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		return FALSE;
 	}
 
+	if (brdc) {
+		if (sat)
+			emit_sat(pc, brdc, brdc);
+		for (c = 0; c < 4; c++)
+			if ((mask & (1 << c)) && dst[c] != brdc)
+				emit_mov(pc, dst[c], brdc);
+	} else
 	if (sat) {
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			emit_cvt(pc, rdst[c], dst[c], -1, CVTOP_SAT,
-				 CVT_F32_F32);
+			/* in this case we saturate later */
+			if (dst[c]->type == P_TEMP && dst[c]->index < 0)
+				continue;
+			emit_sat(pc, rdst[c], dst[c]);
 		}
-	} else if (assimilate) {
-		for (c = 0; c < 4; c++)
-			if (rdst[c])
-				assimilate_temp(pc, rdst[c], dst[c]);
 	}
 
 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
@@ -1624,9 +1755,6 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 				continue;
 			if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
 				FREE(src[i][c]);
-			else
-			if (src[i][c]->acc == pc->insn_cur)
-				release_hw(pc, src[i][c]);
 		}
 	}
 
@@ -1634,180 +1762,271 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 	return TRUE;
 }
 
-/* Adjust a bitmask that indicates what components of a source are used,
- * we use this in tx_prep so we only load interpolants that are needed.
- */
 static void
-insn_adjust_mask(const struct tgsi_full_instruction *insn, unsigned *mask)
+prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
 {
-	const struct tgsi_instruction_ext_texture *tex;
-
-	switch (insn->Instruction.Opcode) {
-	case TGSI_OPCODE_DP3:
-		*mask = 0x7;
-		break;
-	case TGSI_OPCODE_DP4:
-	case TGSI_OPCODE_DPH:
-		*mask = 0xF;
-		break;
-	case TGSI_OPCODE_LIT:
-		*mask = 0xB;
-		break;
-	case TGSI_OPCODE_RCP:
-	case TGSI_OPCODE_RSQ:
-		*mask = 0x1;
-		break;
-	case TGSI_OPCODE_TEX:
-	case TGSI_OPCODE_TXP:
-		assert(insn->Instruction.Extended);
-		tex = &insn->InstructionExtTexture;
-
-		*mask = 0x7;
-		if (tex->Texture == TGSI_TEXTURE_1D)
-			*mask = 0x1;
-		else
-		if (tex->Texture == TGSI_TEXTURE_2D)
-			*mask = 0x3;
-
-		if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
-			*mask |= 0x8;
-		break;
-	default:
-		break;
-	}
-}
-
-static void
-prep_inspect_insn(struct nv50_pc *pc, const union tgsi_full_token *tok,
-		  unsigned *r_usage[2])
-{
-	const struct tgsi_full_instruction *insn;
+	struct nv50_reg *reg = NULL; /* file whose .acc gets pc->insn_nr */
 	const struct tgsi_full_src_register *src;
 	const struct tgsi_dst_register *dst;
+	unsigned i, c, k, mask; /* k: swizzled src component */
 
-	unsigned i, c, k, n, mask, *acc_p;
-
-	insn = &tok->FullInstruction;
 	dst = &insn->FullDstRegisters[0].DstRegister;
 	mask = dst->WriteMask;
 
-	if (!r_usage[0])
-		r_usage[0] = CALLOC(pc->temp_nr * 4, sizeof(unsigned));
-	if (!r_usage[1])
-		r_usage[1] = CALLOC(pc->attr_nr * 4, sizeof(unsigned));
+        if (dst->File == TGSI_FILE_TEMPORARY)
+                reg = pc->temp;
+        else
+        if (dst->File == TGSI_FILE_OUTPUT)
+                reg = pc->result;
 
-	if (dst->File == TGSI_FILE_TEMPORARY) {
+	if (reg) { /* stamp each written component with insn_nr */
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			r_usage[0][dst->Index * 4 + c] = pc->insn_nr;
+			reg[dst->Index * 4 + c].acc = pc->insn_nr;
 		}
 	}
 
 	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
 		src = &insn->FullSrcRegisters[i];
 
-		switch (src->SrcRegister.File) {
-		case TGSI_FILE_TEMPORARY:
-			acc_p = r_usage[0];
-			break;
-		case TGSI_FILE_INPUT:
-			acc_p = r_usage[1];
-			break;
-		default:
+		if (src->SrcRegister.File == TGSI_FILE_TEMPORARY)
+			reg = pc->temp;
+		else
+		if (src->SrcRegister.File == TGSI_FILE_INPUT)
+			reg = pc->attr;
+		else
 			continue;
-		}
 
-		insn_adjust_mask(insn, &mask);
+		mask = nv50_tgsi_src_mask(insn, i); /* comps. read of src i */
 
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-
 			k = tgsi_util_get_full_src_register_extswizzle(src, c);
-			switch (k) {
-			case TGSI_EXTSWIZZLE_X:
-			case TGSI_EXTSWIZZLE_Y:
-			case TGSI_EXTSWIZZLE_Z:
-			case TGSI_EXTSWIZZLE_W:
-				n = src->SrcRegister.Index * 4 + k;
-				acc_p[n] = pc->insn_nr;
-				break;
-			default:
-				break;
-			}
+			/* swizzle values past W are not tracked */
+			if (k > TGSI_EXTSWIZZLE_W)
+				continue;
+			/* stamp the component that is actually fetched */
+			reg[src->SrcRegister.Index * 4 + k].acc = pc->insn_nr;
 		}
 	}
 }
 
+/* Returns a bitmask indicating which dst components need to be
+ * written to temporaries first to avoid 'corrupting' sources; bit i
+ * refers to the i-th write position in m[], not to component i.
+ *
+ * m[i]   (out) indicate component to write in the i-th position
+ * rdep[c] (in) bitmasks of dst[i] that require dst[c] as source
+ */
+static unsigned
+nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
+{
+	unsigned i, c, x, unsafe = 0; /* only OR-ed into below */
+
+	for (c = 0; c < 4; c++)
+		m[c] = c;
+
+	/* Swap as long as a dst component written earlier is depended on
+	 * by one written later, but the next one isn't depended on by it.
+	 */
+	for (c = 0; c < 3; c++) {
+		if (rdep[m[c + 1]] & (1 << m[c]))
+			continue; /* if next one is depended on by us */
+		for (i = c + 1; i < 4; i++)
+			/* if we are depended on by a later one */
+			if (rdep[m[c]] & (1 << m[i]))
+				break;
+		if (i == 4)
+			continue;
+		/* now, swap */
+		x = m[c];
+		m[c] = m[c + 1];
+		m[c + 1] = x;
+
+		/* restart; the loop's c++ resumes at position 1 */
+		c = 0;
+	}
+
+	/* mark dependencies that could not be resolved by reordering */
+	for (i = 0; i < 3; ++i)
+		for (c = i + 1; c < 4; ++c)
+			if (rdep[m[i]] & (1 << m[c]))
+				unsafe |= (1 << i);
+
+	return unsafe;
+}
+
+/* Select a suitable dst register for broadcasting scalar results,
+ * or return NULL if we have to allocate an extra TEMP. Components set
+ * in mask are never picked for a TEMP dst; a non-TEMP (possibly
+ * write-only) dst is used only if exactly 1 component is written.
+ * An empty write mask yields NULL.
+ */
+static struct nv50_reg *
+tgsi_broadcast_dst(struct nv50_pc *pc,
+		   const struct tgsi_full_dst_register *fd, unsigned mask)
+{
+	if (fd->DstRegister.File == TGSI_FILE_TEMPORARY) {
+		int c = ffs(~mask & fd->DstRegister.WriteMask);
+		if (c)
+			return tgsi_dst(pc, c - 1, fd);
+	} else {
+		int c = ffs(fd->DstRegister.WriteMask) - 1; /* -1 if empty */
+		if (c >= 0 && (1 << c) == fd->DstRegister.WriteMask)
+			return tgsi_dst(pc, c, fd);
+	}
+
+	return NULL;
+}
+
+/* Scan source swizzles and return a bitmask indicating dst regs that
+ * also occur among the src regs, and fill rdep for nv50_revdep_reorder.
+ */
 static unsigned
-load_fp_attrib(struct nv50_pc *pc, int i, unsigned *acc, int *mid,
-	       int *aid, int *p_oid)
+nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
+		       unsigned rdep[4])
 {
-	struct nv50_reg *iv;
-	int oid, c, n;
-	unsigned mask = 0;
+	const struct tgsi_full_dst_register *fd = &insn->FullDstRegisters[0];
+	const struct tgsi_full_src_register *fs;
+	unsigned i, deqs = 0; /* dst components also read as src */
 
-	iv = (pc->interp_mode[i] & INTERP_CENTROID) ? pc->iv_c : pc->iv_p;
+	for (i = 0; i < 4; ++i)
+		rdep[i] = 0;
 
-	for (c = 0, n = i * 4; c < 4; c++, n++) {
-		oid = (*p_oid)++;
-		pc->attr[n].type = P_TEMP;
-		pc->attr[n].index = i;
+	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
+		unsigned chn, mask = nv50_tgsi_src_mask(insn, i);
+		boolean neg_supp = negate_supported(insn, i);
 
-		if (pc->attr[n].acc == acc[n])
+		fs = &insn->FullSrcRegisters[i]; /* only the dst reg matters */
+		if (fs->SrcRegister.File != fd->DstRegister.File ||
+		    fs->SrcRegister.Index != fd->DstRegister.Index)
 			continue;
-		mask |= (1 << c);
 
-		pc->attr[n].acc = acc[n];
-		pc->attr[n].rhw = pc->attr[n].hw = -1;
-		alloc_reg(pc, &pc->attr[n]);
+		for (chn = 0; chn < 4; ++chn) {
+			unsigned s, c;
+
+			if (!(mask & (1 << chn))) /* src is not read */
+				continue;
+			c = tgsi_util_get_full_src_register_extswizzle(fs, chn);
+			s = tgsi_util_get_full_src_register_sign_mode(fs, chn);
 
-		pc->attr[n].rhw = (*aid)++;
-		emit_interp(pc, &pc->attr[n], iv, pc->interp_mode[i]);
+			if (c > TGSI_EXTSWIZZLE_W ||
+			    !(fd->DstRegister.WriteMask & (1 << c)))
+				continue; /* not a written dst component */
+
+			/* no danger if src is copied to TEMP first */
+			if ((s != TGSI_UTIL_SIGN_KEEP) &&
+			    (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp))
+				continue;
 
-		pc->p->cfg.fp.map[(*mid) / 4] |= oid << (8 * ((*mid) % 4));
-		(*mid)++;
-		pc->p->cfg.fp.regs[1] += 0x00010001;
+			rdep[c] |= nv50_tgsi_dst_revdep(
+				insn->Instruction.Opcode, i, chn);
+			deqs |= (1 << c); /* dst[c] is also a source */
+		}
 	}
 
-	return mask;
+	return deqs;
 }
 
 static boolean
-nv50_program_tx_prep(struct nv50_pc *pc)
+nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 {
-	struct tgsi_parse_context p;
-	boolean ret = FALSE;
-	unsigned i, c;
-	unsigned fcol, bcol, fcrd, depr;
+	struct tgsi_full_instruction insn = tok->FullInstruction;
+	const struct tgsi_full_dst_register *fd;
+	unsigned i, deqs, rdep[4], m[4];
+	/* Translate one insn, per component if dst also feeds its srcs. */
+	fd = &tok->FullInstruction.FullDstRegisters[0];
+	deqs = nv50_tgsi_scan_swizzle(&insn, rdep);
+	/* scalar ops: result goes to r_brdc, tx_insn copies it to dst */
+	if (is_scalar_op(insn.Instruction.Opcode)) {
+		pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
+		if (!pc->r_brdc)
+			pc->r_brdc = temp_temp(pc); /* no dst usable */
+		return nv50_program_tx_insn(pc, &insn);
+	}
+	pc->r_brdc = NULL;
+	/* without dst/src overlap the insn can be emitted as is */
+	if (!deqs)
+		return nv50_program_tx_insn(pc, &insn);
 
-	/* count (centroid) perspective interpolations */
-	unsigned centroid_loads = 0;
-	unsigned perspect_loads = 0;
+	deqs = nv50_revdep_reorder(m, rdep); /* now per write position */
 
-	/* track register access for temps and attrs */
-	unsigned *r_usage[2];
-	r_usage[0] = NULL;
-	r_usage[1] = NULL;
+	for (i = 0; i < 4; ++i) {
+		assert(pc->r_dst[m[i]] == NULL);
 
-	depr = fcol = bcol = fcrd = 0xffff;
+		insn.FullDstRegisters[0].DstRegister.WriteMask =
+			fd->DstRegister.WriteMask & (1 << m[i]);
 
-	if (pc->p->type == PIPE_SHADER_FRAGMENT) {
-		pc->p->cfg.fp.regs[0] = 0x01000404;
-		pc->p->cfg.fp.regs[1] = 0x00000400;
+		if (!insn.FullDstRegisters[0].DstRegister.WriteMask)
+			continue; /* component not written */
+		/* unresolved hazard: write into a temp first */
+		if (deqs & (1 << i))
+			pc->r_dst[m[i]] = alloc_temp(pc, NULL);
+
+		if (!nv50_program_tx_insn(pc, &insn))
+			return FALSE;
 	}
 
-	tgsi_parse_init(&p, pc->p->pipe.tokens);
-	while (!tgsi_parse_end_of_tokens(&p)) {
-		const union tgsi_full_token *tok = &p.FullToken;
+	for (i = 0; i < 4; i++) { /* copy temps back, saturating if asked */
+		struct nv50_reg *reg = pc->r_dst[i];
+		if (!reg)
+			continue;
+		pc->r_dst[i] = NULL;
+
+		if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE)
+			emit_sat(pc, tgsi_dst(pc, i, fd), reg);
+		else
+			emit_mov(pc, tgsi_dst(pc, i, fd), reg);
+		free_temp(pc, reg);
+	}
 
-		tgsi_parse_token(&p);
+	return TRUE;
+}
+
+static void
+load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
+{
+	/* iv_c serves centroid modes, iv_p all others */
+	const unsigned imode = pc->interp_mode[reg->index];
+	struct nv50_reg **slot = &pc->iv_p;
+	struct nv50_reg *ivar;
+
+	if (imode & INTERP_CENTROID)
+		slot = &pc->iv_c;
+	ivar = *slot;
+	if (!ivar && (imode & INTERP_PERSPECTIVE)) {
+		/* First perspective load for this slot: set it up once.
+		 * XXX: when loading interpolants dynamically, move these
+		 * to the program head, or make sure it can't be skipped.
+		 */
+		*slot = ivar = alloc_temp(pc, NULL);
+		ivar->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1;
+		emit_interp(pc, ivar, NULL, imode & INTERP_CENTROID);
+		emit_flop(pc, 0, ivar, ivar); /* flop 0: same op as RCP */
+	}
+	emit_interp(pc, reg, ivar, imode);
+}
+
+static boolean
+nv50_program_tx_prep(struct nv50_pc *pc)
+{
+	struct tgsi_parse_context tp;
+	struct nv50_program *p = pc->p;
+	boolean ret = FALSE;
+	unsigned i, c, flat_nr = 0;
+
+	tgsi_parse_init(&tp, pc->p->pipe.tokens);
+	while (!tgsi_parse_end_of_tokens(&tp)) {
+		const union tgsi_full_token *tok = &tp.FullToken;
+
+		tgsi_parse_token(&tp);
 		switch (tok->Token.Type) {
 		case TGSI_TOKEN_TYPE_IMMEDIATE:
 		{
 			const struct tgsi_full_immediate *imm =
-				&p.FullToken.FullImmediate;
+				&tp.FullToken.FullImmediate;
 
 			ctor_immd(pc, imm->u[0].Float,
 				      imm->u[1].Float,
@@ -1818,78 +2037,61 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 		case TGSI_TOKEN_TYPE_DECLARATION:
 		{
 			const struct tgsi_full_declaration *d;
-			unsigned last, first, mode;
+			unsigned si, last, first, mode;
 
-			d = &p.FullToken.FullDeclaration;
+			d = &tp.FullToken.FullDeclaration;
 			first = d->DeclarationRange.First;
 			last = d->DeclarationRange.Last;
 
 			switch (d->Declaration.File) {
 			case TGSI_FILE_TEMPORARY:
-				if (pc->temp_nr < (last + 1))
-					pc->temp_nr = last + 1;
 				break;
 			case TGSI_FILE_OUTPUT:
-				if (pc->result_nr < (last + 1))
-					pc->result_nr = last + 1;
-
-				if (!d->Declaration.Semantic)
+				if (!d->Declaration.Semantic ||
+				    p->type == PIPE_SHADER_FRAGMENT)
 					break;
 
+				si = d->Semantic.SemanticIndex;
 				switch (d->Semantic.SemanticName) {
-				case TGSI_SEMANTIC_POSITION:
-					depr = first;
-					pc->p->cfg.fp.regs[2] |= 0x00000100;
-					pc->p->cfg.fp.regs[3] |= 0x00000011;
+				case TGSI_SEMANTIC_BCOLOR:
+					p->cfg.two_side[si].hw = first;
+					if (p->cfg.io_nr > first)
+						p->cfg.io_nr = first;
+					break;
+				case TGSI_SEMANTIC_PSIZE:
+					p->cfg.psiz = first;
+					if (p->cfg.io_nr > first)
+						p->cfg.io_nr = first;
 					break;
+					/*
+				case TGSI_SEMANTIC_CLIP_DISTANCE:
+					p->cfg.clpd = MIN2(p->cfg.clpd, first);
+					break;
+					*/
 				default:
 					break;
 				}
-
 				break;
 			case TGSI_FILE_INPUT:
 			{
-				if (pc->attr_nr < (last + 1))
-					pc->attr_nr = last + 1;
-
-				if (pc->p->type != PIPE_SHADER_FRAGMENT)
+				if (p->type != PIPE_SHADER_FRAGMENT)
 					break;
 
 				switch (d->Declaration.Interpolate) {
 				case TGSI_INTERPOLATE_CONSTANT:
 					mode = INTERP_FLAT;
+					flat_nr++;
 					break;
 				case TGSI_INTERPOLATE_PERSPECTIVE:
 					mode = INTERP_PERSPECTIVE;
+					p->cfg.regs[1] |= 0x08 << 24;
 					break;
 				default:
 					mode = INTERP_LINEAR;
 					break;
 				}
-
-				if (d->Declaration.Semantic) {
-					switch (d->Semantic.SemanticName) {
-					case TGSI_SEMANTIC_POSITION:
-						fcrd = first;
-						break;
-					case TGSI_SEMANTIC_COLOR:
-						fcol = first;
-						mode = INTERP_PERSPECTIVE;
-						break;
-					case TGSI_SEMANTIC_BCOLOR:
-						bcol = first;
-						mode = INTERP_PERSPECTIVE;
-						break;
-					}
-				}
-
-				if (d->Declaration.Centroid) {
+				if (d->Declaration.Centroid)
 					mode |= INTERP_CENTROID;
-					if (mode & INTERP_PERSPECTIVE)
-						centroid_loads++;
-				} else
-				if (mode & INTERP_PERSPECTIVE)
-					perspect_loads++;
 
 				assert(last < 32);
 				for (i = first; i <= last; i++)
@@ -1897,8 +2099,6 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 			}
 				break;
 			case TGSI_FILE_CONSTANT:
-				if (pc->param_nr < (last + 1))
-					pc->param_nr = last + 1;
 				break;
 			case TGSI_FILE_SAMPLER:
 				break;
@@ -1911,182 +2111,155 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 			break;
 		case TGSI_TOKEN_TYPE_INSTRUCTION:
 			pc->insn_nr++;
-			prep_inspect_insn(pc, tok, r_usage);
+			prep_inspect_insn(pc, &tok->FullInstruction);
 			break;
 		default:
 			break;
 		}
 	}
 
-	if (pc->temp_nr) {
-		pc->temp = CALLOC(pc->temp_nr * 4, sizeof(struct nv50_reg));
-		if (!pc->temp)
-			goto out_err;
+	if (p->type == PIPE_SHADER_VERTEX) {
+		int rid = 0;
 
-		for (i = 0; i < pc->temp_nr; i++) {
-			for (c = 0; c < 4; c++) {
-				pc->temp[i*4+c].type = P_TEMP;
-				pc->temp[i*4+c].hw = -1;
-				pc->temp[i*4+c].rhw = -1;
-				pc->temp[i*4+c].index = i;
-				pc->temp[i*4+c].acc = r_usage[0][i*4+c];
+		for (i = 0; i < pc->attr_nr * 4; ++i) {
+			if (pc->attr[i].acc) {
+				pc->attr[i].hw = rid++;
+				p->cfg.attr[i / 32] |= 1 << (i % 32);
 			}
 		}
-	}
-
-	if (pc->attr_nr) {
-		int oid = 4, mid = 4, aid = 0;
-		/* oid = VP output id
-		 * aid = FP attribute/interpolant id
-		 * mid = VP output mapping field ID
-		 */
 
-		pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg));
-		if (!pc->attr)
-			goto out_err;
+		for (i = 0, rid = 0; i < pc->result_nr; ++i) {
+			p->cfg.io[i].hw = rid;
+			p->cfg.io[i].id_vp = i;
 
-		if (pc->p->type == PIPE_SHADER_FRAGMENT) {
-			/* position should be loaded first */
-			if (fcrd != 0xffff) {
-				unsigned mask;
-				mid = 0;
-				mask = load_fp_attrib(pc, fcrd, r_usage[1],
-						      &mid, &aid, &oid);
-				oid = 0;
-				pc->p->cfg.fp.regs[1] |= (mask << 24);
-				pc->p->cfg.fp.map[0] = 0x04040404 * fcrd;
-			}
-			pc->p->cfg.fp.map[0] += 0x03020100;
-
-			/* should do MAD fcrd.xy, fcrd, SOME_CONST, fcrd */
-
-			if (perspect_loads) {
-				pc->iv_p = alloc_temp(pc, NULL);
-
-				if (!(pc->p->cfg.fp.regs[1] & 0x08000000)) {
-					pc->p->cfg.fp.regs[1] |= 0x08000000;
-					pc->iv_p->rhw = aid++;
-					emit_interp(pc, pc->iv_p, NULL,
-						    INTERP_LINEAR);
-					emit_flop(pc, 0, pc->iv_p, pc->iv_p);
-				} else {
-					pc->iv_p->rhw = aid - 1;
-					emit_flop(pc, 0, pc->iv_p,
-						  &pc->attr[fcrd * 4 + 3]);
-				}
-			}
-
-			if (centroid_loads) {
-				pc->iv_c = alloc_temp(pc, NULL);
-				pc->iv_c->rhw = pc->iv_p ? aid - 1 : aid++;
-				emit_interp(pc, pc->iv_c, NULL,
-					    INTERP_CENTROID);
-				emit_flop(pc, 0, pc->iv_c, pc->iv_c);
-				pc->p->cfg.fp.regs[1] |= 0x08000000;
+			for (c = 0; c < 4; ++c) {
+				int n = i * 4 + c;
+				if (!pc->result[n].acc)
+					continue;
+				pc->result[n].hw = rid++;
+				p->cfg.io[i].mask |= 1 << c;
 			}
+		}
 
-			for (c = 0; c < 4; c++) {
-				/* I don't know what these values do, but
-				 * let's set them like the blob does:
-				 */
-				if (fcol != 0xffff && r_usage[1][fcol * 4 + c])
-					pc->p->cfg.fp.regs[0] += 0x00010000;
-				if (bcol != 0xffff && r_usage[1][bcol * 4 + c])
-					pc->p->cfg.fp.regs[0] += 0x00010000;
-			}
+		for (c = 0; c < 2; ++c)
+			if (p->cfg.two_side[c].hw < 0x40)
+				p->cfg.two_side[c] = p->cfg.io[
+					p->cfg.two_side[c].hw];
 
-			for (i = 0; i < pc->attr_nr; i++)
-				load_fp_attrib(pc, i, r_usage[1],
-					       &mid, &aid, &oid);
+		if (p->cfg.psiz < 0x40)
+			p->cfg.psiz = p->cfg.io[p->cfg.psiz].hw;
+	} else
+	if (p->type == PIPE_SHADER_FRAGMENT) {
+		int rid, aid;
+		unsigned n = 0, m = pc->attr_nr - flat_nr;
 
-			if (pc->iv_p)
-				free_temp(pc, pc->iv_p);
-			if (pc->iv_c)
-				free_temp(pc, pc->iv_c);
+		int base = (TGSI_SEMANTIC_POSITION ==
+			    p->info.input_semantic_name[0]) ? 0 : 1;
 
-			pc->p->cfg.fp.high_map = (mid / 4);
-			pc->p->cfg.fp.high_map += ((mid % 4) ? 1 : 0);
-		} else {
-			/* vertex program */
-			for (i = 0; i < pc->attr_nr * 4; i++) {
-				pc->p->cfg.vp.attr[aid / 32] |=
-					(1 << (aid % 32));
-				pc->attr[i].type = P_ATTR;
-				pc->attr[i].hw = aid++;
-				pc->attr[i].index = i / 4;
+		/* non-flat interpolants have to be mapped to
+		 * the lower hardware IDs, so sort them:
+		 */
+		for (i = 0; i < pc->attr_nr; i++) {
+			if (pc->interp_mode[i] == INTERP_FLAT) {
+				p->cfg.io[m].id_vp = i + base;
+				p->cfg.io[m++].id_fp = i;
+			} else {
+				if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE))
+					p->cfg.io[n].linear = TRUE;
+				p->cfg.io[n].id_vp = i + base;
+				p->cfg.io[n++].id_fp = i;
 			}
 		}
-	}
 
-	if (pc->result_nr) {
-		int rid = 0;
+		if (!base) /* set w-coordinate mask from perspective interp */
+			p->cfg.io[0].mask |= p->cfg.regs[1] >> 24;
 
-		pc->result = CALLOC(pc->result_nr * 4, sizeof(struct nv50_reg));
-		if (!pc->result)
-			goto out_err;
+		aid = popcnt4( /* if fcrd isn't contained in cfg.io */
+			base ? (p->cfg.regs[1] >> 24) : p->cfg.io[0].mask);
 
-		for (i = 0; i < pc->result_nr; i++) {
-			for (c = 0; c < 4; c++) {
-				if (pc->p->type == PIPE_SHADER_FRAGMENT) {
-					pc->result[i*4+c].type = P_TEMP;
-					pc->result[i*4+c].hw = -1;
-					pc->result[i*4+c].rhw = (i == depr) ?
-						-1 : rid++;
-				} else {
-					pc->result[i*4+c].type = P_RESULT;
-					pc->result[i*4+c].hw = rid++;
-				}
-				pc->result[i*4+c].index = i;
-			}
+		for (n = 0; n < pc->attr_nr; ++n) {
+			p->cfg.io[n].hw = rid = aid;
+			i = p->cfg.io[n].id_fp;
 
-			if (pc->p->type == PIPE_SHADER_FRAGMENT &&
-			    depr != 0xffff) {
-				pc->result[depr * 4 + 2].rhw =
-					(pc->result_nr - 1) * 4;
+			for (c = 0; c < 4; ++c) {
+				if (!pc->attr[i * 4 + c].acc)
+					continue;
+				pc->attr[i * 4 + c].rhw = rid++;
+				p->cfg.io[n].mask |= 1 << c;
+
+				load_interpolant(pc, &pc->attr[i * 4 + c]);
 			}
+			aid += popcnt4(p->cfg.io[n].mask);
 		}
-	}
 
-	if (pc->param_nr) {
-		int rid = 0;
+		if (!base)
+			p->cfg.regs[1] |= p->cfg.io[0].mask << 24;
 
-		pc->param = CALLOC(pc->param_nr * 4, sizeof(struct nv50_reg));
-		if (!pc->param)
-			goto out_err;
+		m = popcnt4(p->cfg.regs[1] >> 24);
+
+		/* set count of non-position inputs and of non-flat
+		 * non-position inputs for FP_INTERPOLANT_CTRL
+		 */
+		p->cfg.regs[1] |= aid - m;
+
+		if (flat_nr) {
+			i = p->cfg.io[pc->attr_nr - flat_nr].hw;
+			p->cfg.regs[1] |= (i - m) << 16;
+		} else
+			p->cfg.regs[1] |= p->cfg.regs[1] << 16;
+
+		/* mark color semantic for light-twoside */
+		n = 0x40;
+		for (i = 0; i < pc->attr_nr; i++) {
+			ubyte si, sn;
+
+			sn = p->info.input_semantic_name[p->cfg.io[i].id_fp];
+			si = p->info.input_semantic_index[p->cfg.io[i].id_fp];
+
+			if (sn == TGSI_SEMANTIC_COLOR) {
+				p->cfg.two_side[si] = p->cfg.io[i];
+
+				/* increase colour count */
+				p->cfg.regs[0] += popcnt4(
+					p->cfg.two_side[si].mask) << 16;
 
-		for (i = 0; i < pc->param_nr; i++) {
-			for (c = 0; c < 4; c++) {
-				pc->param[i*4+c].type = P_CONST;
-				pc->param[i*4+c].hw = rid++;
-				pc->param[i*4+c].index = i;
+				n = MIN2(n, p->cfg.io[i].hw - m);
 			}
 		}
+		if (n < 0x40)
+			p->cfg.regs[0] += n;
+
+		/* Initialize FP results:
+		 * FragDepth is always first TGSI and last hw output
+		 */
+		i = p->info.writes_z ? 4 : 0;
+		for (rid = 0; i < pc->result_nr * 4; i++)
+			pc->result[i].rhw = rid++;
+		if (p->info.writes_z)
+			pc->result[2].rhw = rid;
 	}
 
 	if (pc->immd_nr) {
 		int rid = 0;
 
-		pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg));
+		pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg));
 		if (!pc->immd)
 			goto out_err;
 
 		for (i = 0; i < pc->immd_nr; i++) {
-			for (c = 0; c < 4; c++) {
-				pc->immd[i*4+c].type = P_IMMD;
-				pc->immd[i*4+c].hw = rid++;
-				pc->immd[i*4+c].index = i;
-			}
+			for (c = 0; c < 4; c++, rid++)
+				ctor_reg(&pc->immd[rid], P_IMMD, i, rid);
 		}
 	}
 
 	ret = TRUE;
 out_err:
-	if (r_usage[0])
-		FREE(r_usage[0]);
-	if (r_usage[1])
-		FREE(r_usage[1]);
+	if (pc->iv_p)
+		free_temp(pc, pc->iv_p);
+	if (pc->iv_c)
+		free_temp(pc, pc->iv_c);
 
-	tgsi_parse_free(&p);
+	tgsi_parse_free(&tp);
 	return ret;
 }
 
@@ -2108,6 +2281,88 @@ free_nv50_pc(struct nv50_pc *pc)
 }
 
 static boolean
+ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
+{
+	/* Register types of inputs/outputs; an FP switches both to TEMP. */
+	unsigned in_type = P_ATTR, out_type = P_RESULT;
+	int k;
+
+	pc->p = p;
+	/* Each register file holds info.file_max + 1 vec4 registers. */
+	pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1;
+	pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1;
+	pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1;
+	pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1;
+
+	p->cfg.high_temp = 4;
+
+	/* 0x40 = not assigned; tx_prep only remaps values below 0x40. */
+	p->cfg.two_side[0].hw = 0x40;
+	p->cfg.two_side[1].hw = 0x40;
+
+	if (p->type == PIPE_SHADER_VERTEX) {
+		p->cfg.psiz = 0x40;
+		p->cfg.clpd = 0x40;
+		p->cfg.io_nr = pc->result_nr;
+	} else
+	if (p->type == PIPE_SHADER_FRAGMENT) {
+		in_type = out_type = P_TEMP;
+
+		p->cfg.regs[0] = 0x01000004;
+		p->cfg.io_nr = pc->attr_nr;
+		if (p->info.uses_kill)
+			p->cfg.regs[2] |= 0x00100000;
+		if (p->info.writes_z) {
+			p->cfg.regs[2] |= 0x00000100;
+			p->cfg.regs[3] |= 0x00000011;
+		}
+	}
+
+	/* Allocate register files; only constants get a hw index here. */
+	if (pc->temp_nr) {
+		const unsigned nr = pc->temp_nr * 4;
+
+		pc->temp = MALLOC(nr * sizeof(struct nv50_reg));
+		if (!pc->temp)
+			return FALSE;
+		for (k = 0; k < nr; ++k)
+			ctor_reg(&pc->temp[k], P_TEMP, k / 4, -1);
+	}
+
+	if (pc->attr_nr) {
+		const unsigned nr = pc->attr_nr * 4;
+
+		pc->attr = MALLOC(nr * sizeof(struct nv50_reg));
+		if (!pc->attr)
+			return FALSE;
+		for (k = 0; k < nr; ++k)
+			ctor_reg(&pc->attr[k], in_type, k / 4, -1);
+	}
+
+	if (pc->result_nr) {
+		const unsigned nr = pc->result_nr * 4;
+
+		pc->result = MALLOC(nr * sizeof(struct nv50_reg));
+		if (!pc->result)
+			return FALSE;
+		for (k = 0; k < nr; ++k)
+			ctor_reg(&pc->result[k], out_type, k / 4, -1);
+	}
+
+	if (pc->param_nr) {
+		const unsigned nr = pc->param_nr * 4;
+
+		pc->param = MALLOC(nr * sizeof(struct nv50_reg));
+		if (!pc->param)
+			return FALSE;
+		for (k = 0; k < nr; ++k)
+			ctor_reg(&pc->param[k], P_CONST, k / 4, k);
+	}
+
+	return TRUE;
+}
+
+static boolean
 nv50_program_tx(struct nv50_program *p)
 {
 	struct tgsi_parse_context parse;
@@ -2118,8 +2373,10 @@ nv50_program_tx(struct nv50_program *p)
 	pc = CALLOC_STRUCT(nv50_pc);
 	if (!pc)
 		return FALSE;
-	pc->p = p;
-	pc->p->cfg.high_temp = 4;
+
+	ret = ctor_nv50_pc(pc, p);
+	if (ret == FALSE)
+		goto out_cleanup;
 
 	ret = nv50_program_tx_prep(pc);
 	if (ret == FALSE)
@@ -2139,7 +2396,7 @@ nv50_program_tx(struct nv50_program *p)
 		switch (tok->Token.Type) {
 		case TGSI_TOKEN_TYPE_INSTRUCTION:
 			++pc->insn_cur;
-			ret = nv50_program_tx_insn(pc, tok);
+			ret = nv50_tgsi_insn(pc, tok);
 			if (ret == FALSE)
 				goto out_err;
 			break;
@@ -2150,8 +2407,8 @@ nv50_program_tx(struct nv50_program *p)
 
 	if (p->type == PIPE_SHADER_FRAGMENT) {
 		struct nv50_reg out;
+		ctor_reg(&out, P_TEMP, -1, -1);
 
-		out.type = P_TEMP;
 		for (k = 0; k < pc->result_nr * 4; k++) {
 			if (pc->result[k].rhw == -1)
 				continue;
@@ -2221,9 +2478,9 @@ nv50_program_upload_data(struct nv50_context *nv50, float *map,
 	while (count) {
 		unsigned nr = count > 2047 ? 2047 : count;
 
-		BEGIN_RING(chan, tesla, 0x00000f00, 1);
+		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
 		OUT_RING  (chan, (cbuf << 0) | (start << 8));
-		BEGIN_RING(chan, tesla, 0x40000f04, nr);
+		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
 		OUT_RINGp (chan, map, nr);
 
 		map += nr;
@@ -2256,30 +2513,19 @@ nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
 					 p->immd_nr, NV50_CB_PMISC);
 	}
 
-	if (!p->data[1] && p->param_nr) {
-		struct nouveau_resource *heap =
-			nv50->screen->parm_heap[p->type];
-
-		if (nouveau_resource_alloc(heap, p->param_nr, p, &p->data[1])) {
-			while (heap->next && heap->size < p->param_nr) {
-				struct nv50_program *evict = heap->next->priv;
-				nouveau_resource_free(&evict->data[1]);
-			}
-
-			if (nouveau_resource_alloc(heap, p->param_nr, p,
-						   &p->data[1]))
-				assert(0);
-		}
-	}
+	assert(p->param_nr <= 128);
 
 	if (p->param_nr) {
-		unsigned cbuf = NV50_CB_PVP;
+		unsigned cb;
 		float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
 					     PIPE_BUFFER_USAGE_CPU_READ);
-		if (p->type == PIPE_SHADER_FRAGMENT)
-			cbuf = NV50_CB_PFP;
-		nv50_program_upload_data(nv50, map, p->data[1]->start,
-					 p->param_nr, cbuf);
+
+		if (p->type == PIPE_SHADER_VERTEX)
+			cb = NV50_CB_PVP;
+		else
+			cb = NV50_CB_PFP;
+
+		nv50_program_upload_data(nv50, map, 0, p->param_nr, cb);
 		pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
 	}
 }
@@ -2301,32 +2547,30 @@ nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
 		upload = TRUE;
 	}
 
-	if ((p->data[0] && p->data[0]->start != p->data_start[0]) ||
-		(p->data[1] && p->data[1]->start != p->data_start[1])) {
-		for (e = p->exec_head; e; e = e->next) {
-			unsigned ei, ci, bs;
-
-			if (e->param.index < 0)
-				continue;
-			bs = (e->inst[1] >> 22) & 0x07;
-			assert(bs < 2);
-			ei = e->param.shift >> 5;
-			ci = e->param.index + p->data[bs]->start;
+	if (p->data[0] && p->data[0]->start != p->data_start[0])
+		upload = TRUE;
 
-			e->inst[ei] &= ~e->param.mask;
-			e->inst[ei] |= (ci << e->param.shift);
-		}
+	if (!upload)
+		return;
 
-		if (p->data[0])
-			p->data_start[0] = p->data[0]->start;
-		if (p->data[1])
-			p->data_start[1] = p->data[1]->start;
+	for (e = p->exec_head; e; e = e->next) {
+		unsigned ei, ci, bs;
 
-		upload = TRUE;
+		if (e->param.index < 0)
+			continue;
+		bs = (e->inst[1] >> 22) & 0x07;
+		assert(bs < 2);
+		ei = e->param.shift >> 5;
+		ci = e->param.index;
+		if (bs == 0)
+			ci += p->data[bs]->start;
+
+		e->inst[ei] &= ~e->param.mask;
+		e->inst[ei] |= (ci << e->param.shift);
 	}
 
-	if (!upload)
-		return;
+	if (p->data[0])
+		p->data_start[0] = p->data[0]->start;
 
 #ifdef NV50_PROGRAM_DUMP
 	NOUVEAU_ERR("-------\n");
@@ -2345,7 +2589,7 @@ nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
 	}
 
 	so = so_new(4,2);
-	so_method(so, nv50->screen->tesla, 0x1280, 3);
+	so_method(so, nv50->screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
 	so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
 	so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_LOW, 0, 0);
 	so_data  (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));
@@ -2364,9 +2608,9 @@ nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
 			continue;
 		}
 
-		BEGIN_RING(chan, tesla, 0x0f00, 1);
+		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
 		OUT_RING  (chan, (start << 8) | NV50_CB_PUPLOAD);
-		BEGIN_RING(chan, tesla, 0x40000f04, nr);	
+		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
 		OUT_RINGp (chan, up + start, nr);
 
 		start += nr;
@@ -2399,15 +2643,15 @@ nv50_vertprog_validate(struct nv50_context *nv50)
 		      NOUVEAU_BO_HIGH, 0, 0);
 	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
 		      NOUVEAU_BO_LOW, 0, 0);
-	so_method(so, tesla, 0x1650, 2);
-	so_data  (so, p->cfg.vp.attr[0]);
-	so_data  (so, p->cfg.vp.attr[1]);
-	so_method(so, tesla, 0x16b8, 1);
+	so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
+	so_data  (so, p->cfg.attr[0]);
+	so_data  (so, p->cfg.attr[1]);
+	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
 	so_data  (so, p->cfg.high_result);
-	so_method(so, tesla, 0x16ac, 2);
+	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2);
 	so_data  (so, p->cfg.high_result); //8);
 	so_data  (so, p->cfg.high_temp);
-	so_method(so, tesla, 0x140c, 1);
+	so_method(so, tesla, NV50TCL_VP_START_ID, 1);
 	so_data  (so, 0); /* program start offset */
 	so_ref(so, &nv50->state.vertprog);
 	so_ref(NULL, &so);
@@ -2419,7 +2663,6 @@ nv50_fragprog_validate(struct nv50_context *nv50)
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 	struct nv50_program *p = nv50->fragprog;
 	struct nouveau_stateobj *so;
-	unsigned i;
 
 	if (!p->translated) {
 		nv50_program_validate(nv50, p);
@@ -2436,29 +2679,186 @@ nv50_fragprog_validate(struct nv50_context *nv50)
 		      NOUVEAU_BO_HIGH, 0, 0);
 	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
 		      NOUVEAU_BO_LOW, 0, 0);
-	so_method(so, tesla, 0x1904, 4);
-	so_data  (so, p->cfg.fp.regs[0]); /* 0x01000404 / 0x00040404 */
-	so_data  (so, 0x00000004);
-	so_data  (so, 0x00000000);
-	so_data  (so, 0x00000000);
-	so_method(so, tesla, 0x16bc, p->cfg.fp.high_map);
-	for (i = 0; i < p->cfg.fp.high_map; i++)
-		so_data(so, p->cfg.fp.map[i]);
-	so_method(so, tesla, 0x1988, 2);
-	so_data  (so, p->cfg.fp.regs[1]); /* 0x08040404 / 0x0f000401 */
+	so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1);
 	so_data  (so, p->cfg.high_temp);
-	so_method(so, tesla, 0x1298, 1);
+	so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
 	so_data  (so, p->cfg.high_result);
-	so_method(so, tesla, 0x19a8, 1);
-	so_data  (so, p->cfg.fp.regs[2]);
-	so_method(so, tesla, 0x196c, 1);
-	so_data  (so, p->cfg.fp.regs[3]);
-	so_method(so, tesla, 0x1414, 1);
+	so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1);
+	so_data  (so, p->cfg.regs[2]);
+	so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
+	so_data  (so, p->cfg.regs[3]);
+	so_method(so, tesla, NV50TCL_FP_START_ID, 1);
 	so_data  (so, 0); /* program start offset */
 	so_ref(so, &nv50->state.fragprog);
 	so_ref(NULL, &so);
 }
 
+/* Fill pntc[] with the point coord replace map, starting at slot 'base'. */
+static void
+nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
+{
+	struct nv50_program *fp = nv50->fragprog;
+	struct nv50_program *vp = nv50->vertprog;
+	unsigned i, c, m = base;
+
+	/* XXX: This can't work correctly in all cases yet, we either
+	 * have to create TGSI_SEMANTIC_PNTC or sprite_coord_mode has
+	 * to be per FP input instead of per VP output
+	 */
+	memset(pntc, 0, 8 * sizeof(uint32_t));
+
+	for (i = 0; i < fp->cfg.io_nr; i++) {
+		uint8_t j = fp->cfg.io[i].id_vp, k = fp->cfg.io[i].id_fp;
+		unsigned n = popcnt4(fp->cfg.io[i].mask);
+
+		if (fp->info.input_semantic_name[k] != TGSI_SEMANTIC_GENERIC) {
+			m += n;
+			continue;
+		}
+
+		/* j is a VP output index: bounds-check against the VP first */
+		if (j < vp->cfg.io_nr &&
+		    vp->info.output_semantic_name[j] == TGSI_SEMANTIC_GENERIC) {
+			uint8_t si = vp->info.output_semantic_index[j];
+			ubyte mode =
+				nv50->rasterizer->pipe.sprite_coord_mode[si];
+
+			if (mode == PIPE_SPRITE_COORD_NONE) {
+				m += n;
+				continue;
+			}
+		}
+
+		/* this is either PointCoord or replaced by sprite coords */
+		for (c = 0; c < 4; c++) {
+			if (!(fp->cfg.io[i].mask & (1 << c)))
+				continue;
+			pntc[m / 8] |= (c + 1) << ((m % 8) * 4);
+			++m;
+		}
+	}
+}
+
+/* Append one map byte per enabled FP input component; returns the new map id. */
+static int
+nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4],
+	       struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo)
+{
+	int c;
+	uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw;
+	uint8_t *map = (uint8_t *)p_map;
+
+	for (c = 0; c < 4; ++c) {
+		if (mf & 1) {
+			if (fpi->linear == TRUE) /* 1u: bit 31 is reachable */
+				lin[mid / 32] |= 1u << (mid % 32);
+			map[mid++] = (mv & 1) ? oid : ((c == 3) ? 0x41 : 0x40);
+		}
+		oid += mv & 1; /* step to the next component the VP writes */
+		mf >>= 1;
+		mv >>= 1;
+	}
+
+	return mid;
+}
+
+/* Build the VP -> FP linkage state (result map, semantic regs, 'lin' mask,
+ * optional point coord replace map) and store it in nv50->state.programs.
+ * 'm' is a byte index into 'map' throughout. */
+void
+nv50_linkage_validate(struct nv50_context *nv50)
+{
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nv50_program *vp = nv50->vertprog;
+	struct nv50_program *fp = nv50->fragprog;
+	struct nouveau_stateobj *so;
+	struct nv50_sreg4 dummy, *vpo;
+	int i, n, c, m = 0;
+	uint32_t map[16], lin[4], reg[5], pcrd[8];
+
+	memset(map, 0, sizeof(map));
+	memset(lin, 0, sizeof(lin));
+
+	reg[1] = 0x00000004; /* low and high clip distance map ids */
+	reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */
+	reg[3] = 0x00000000; /* point size map id & enable */
+	reg[0] = fp->cfg.regs[0]; /* colour semantic reg */
+	reg[4] = fp->cfg.regs[1]; /* interpolant info */
+
+	memset(&dummy, 0, sizeof(dummy)); /* zeroes .linear; .hw is read later */
+	dummy.mask = 0xf; /* map all components of HPOS */
+	m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]);
+	dummy.mask = 0x0;
+
+	if (vp->cfg.clpd < 0x40) {
+		for (c = 0; c < vp->cfg.clpd_nr; ++c)
+			((uint8_t *)map)[m++] = vp->cfg.clpd + c;
+		reg[1] = (m << 8);
+	}
+
+	reg[0] |= m << 8; /* adjust BFC0 id */
+
+	/* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */
+	if (nv50->rasterizer->pipe.light_twoside) {
+		vpo = &vp->cfg.two_side[0];
+		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[0], &vpo[0]);
+		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[1], &vpo[1]);
+	}
+
+	reg[0] += m - 4; /* adjust FFC0 id */
+	reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */
+
+	i = 0;
+	if (fp->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION)
+		i = 1;
+	for (; i < fp->cfg.io_nr; i++) {
+		ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id_fp];
+		ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id_fp];
+
+		n = fp->cfg.io[i].id_vp;
+		if (n >= vp->cfg.io_nr ||
+		    vp->info.output_semantic_name[n] != sn ||
+		    vp->info.output_semantic_index[n] != si)
+			vpo = &dummy;
+		else
+			vpo = &vp->cfg.io[n];
+
+		m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo);
+	}
+
+	if (nv50->rasterizer->pipe.point_size_per_vertex) {
+		map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8);
+		reg[3] = (m++ << 4) | 1;
+	}
+
+	/* now fill the stateobj */
+	so = so_new(64, 0);
+
+	n = (m + 3) / 4;
+	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
+	so_data  (so, m);
+	so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n);
+	so_datap (so, map, n);
+
+	so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
+	so_datap (so, reg, 4);
+
+	so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1);
+	so_data  (so, reg[4]);
+
+	so_method(so, tesla, 0x1540, 4);
+	so_datap (so, lin, 4);
+
+	if (nv50->rasterizer->pipe.point_sprite) {
+		nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff);
+		so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8);
+		so_datap (so, pcrd, 8);
+	}
+
+	so_ref(so, &nv50->state.programs);
+	so_ref(NULL, &so);
+}
+
 void
 nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
 {
@@ -2474,8 +2874,6 @@ nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
 	nouveau_bo_ref(NULL, &p->bo);
 
 	nouveau_resource_free(&p->data[0]);
-	nouveau_resource_free(&p->data[1]);
 
 	p->translated = 0;
 }
-
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h
index 096e0476aa..d78dee083f 100644
--- a/src/gallium/drivers/nv50/nv50_program.h
+++ b/src/gallium/drivers/nv50/nv50_program.h
@@ -15,6 +15,15 @@ struct nv50_program_exec {
 	} param;
 };
 
+struct nv50_sreg4 {
+	uint8_t hw; /* hw id of the first written component (VP result map value) */
+	uint8_t id_vp; /* on FP inputs: index of the matching VP output */
+	uint8_t id_fp; /* on FP inputs: index into the FP input semantic arrays */
+
+	uint8_t mask; /* component mask, bit c = component c */
+	boolean linear; /* if TRUE, the input's map ids are flagged in 'lin' */
+};
+
 struct nv50_program {
 	struct pipe_shader_state pipe;
 	struct tgsi_shader_info info;
@@ -24,8 +33,8 @@ struct nv50_program {
 	struct nv50_program_exec *exec_head;
 	struct nv50_program_exec *exec_tail;
 	unsigned exec_size;
-	struct nouveau_resource *data[2];
-	unsigned data_start[2];
+	struct nouveau_resource *data[1];
+	unsigned data_start[1];
 
 	struct nouveau_bo *bo;
 
@@ -36,14 +45,20 @@ struct nv50_program {
 	struct {
 		unsigned high_temp;
 		unsigned high_result;
-		struct {
-			unsigned attr[2];
-		} vp;
-		struct {
-			unsigned regs[4];
-			unsigned map[5];
-			unsigned high_map;
-		} fp;
+
+		uint32_t attr[2];
+		uint32_t regs[4];
+
+		/* for VPs, io_nr doesn't count 'private' results (PSIZ etc.) */
+		unsigned io_nr;
+		struct nv50_sreg4 io[PIPE_MAX_SHADER_OUTPUTS];
+
+		/* FP colour inputs, VP/GP back colour outputs */
+		struct nv50_sreg4 two_side[2];
+
+		/* VP only */
+		uint8_t clpd, clpd_nr;
+		uint8_t psiz;
 	} cfg;
 };
 
diff --git a/src/gallium/drivers/nv50/nv50_query.c b/src/gallium/drivers/nv50/nv50_query.c
index 940e04365f..5305c93d59 100644
--- a/src/gallium/drivers/nv50/nv50_query.c
+++ b/src/gallium/drivers/nv50/nv50_query.c
@@ -94,7 +94,7 @@ nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq)
 	struct nv50_query *q = nv50_query(pq);
 
 	WAIT_RING (chan, 5);
-	BEGIN_RING(chan, tesla, 0x1b00, 4);
+	BEGIN_RING(chan, tesla, NV50TCL_QUERY_ADDRESS_HIGH, 4);
 	OUT_RELOCh(chan, q->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 	OUT_RELOCl(chan, q->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 	OUT_RING  (chan, 0x00000000);
@@ -107,13 +107,13 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
 		  boolean wait, uint64_t *result)
 {
 	struct nv50_query *q = nv50_query(pq);
-
-	/*XXX: Want to be able to return FALSE here instead of blocking
-	 *     until the result is available..
-	 */
+	int ret;
 
 	if (!q->ready) {
-		nouveau_bo_map(q->bo, NOUVEAU_BO_RD);
+		ret = nouveau_bo_map(q->bo, NOUVEAU_BO_RD |
+				     (wait ? 0 : NOUVEAU_BO_NOWAIT));
+		if (ret) /* map failed; with NOWAIT presumably "still busy" */
+			return FALSE;
 		q->result = ((uint32_t *)q->bo->map)[1];
 		q->ready = TRUE;
 		nouveau_bo_unmap(q->bo);
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
index ce8f906b15..3b08e1b89f 100644
--- a/src/gallium/drivers/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -44,9 +44,10 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen,
 	} else
 	if (tex_usage & PIPE_TEXTURE_USAGE_DEPTH_STENCIL) {
 		switch (format) {
+		case PIPE_FORMAT_Z32_FLOAT:
 		case PIPE_FORMAT_Z24S8_UNORM:
-		case PIPE_FORMAT_Z24X8_UNORM:
-		case PIPE_FORMAT_Z16_UNORM:
+		case PIPE_FORMAT_X8Z24_UNORM:
+		case PIPE_FORMAT_S8Z24_UNORM:
 			return TRUE;
 		default:
 			break;
@@ -86,12 +87,10 @@ nv50_screen_get_param(struct pipe_screen *pscreen, int param)
 		return 1;
 	case PIPE_CAP_GLSL:
 		return 0;
-	case PIPE_CAP_S3TC:
-		return 1;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 		return 1;
 	case PIPE_CAP_POINT_SPRITE:
-		return 0;
+		return 1;
 	case PIPE_CAP_MAX_RENDER_TARGETS:
 		return 8;
 	case PIPE_CAP_OCCLUSION_QUERY:
@@ -188,7 +187,8 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	nv50_transfer_init_screen_functions(pscreen);
 
 	/* DMA engine object */
-	ret = nouveau_grobj_alloc(chan, 0xbeef5039, 0x5039, &screen->m2mf);
+	ret = nouveau_grobj_alloc(chan, 0xbeef5039,
+		NV50_MEMORY_TO_MEMORY_FORMAT, &screen->m2mf);
 	if (ret) {
 		NOUVEAU_ERR("Error creating M2MF object: %d\n", ret);
 		nv50_screen_destroy(pscreen);
@@ -197,7 +197,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	BIND_RING(chan, screen->m2mf, 1);
 
 	/* 2D object */
-	ret = nouveau_grobj_alloc(chan, 0xbeef502d, 0x502d, &screen->eng2d);
+	ret = nouveau_grobj_alloc(chan, 0xbeef502d, NV50_2D, &screen->eng2d);
 	if (ret) {
 		NOUVEAU_ERR("Error creating 2D object: %d\n", ret);
 		nv50_screen_destroy(pscreen);
@@ -208,14 +208,15 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	/* 3D object */
 	switch (chipset & 0xf0) {
 	case 0x50:
-		tesla_class = 0x5097;
+		tesla_class = NV50TCL;
 		break;
 	case 0x80:
 	case 0x90:
-		tesla_class = 0x8297;
+		/* this stupid name should be corrected. */
+		tesla_class = NV54TCL;
 		break;
 	case 0xa0:
-		tesla_class = 0x8397;
+		tesla_class = NVA0TCL;
 		break;
 	default:
 		NOUVEAU_ERR("Not a known NV50 chipset: NV%02x\n", chipset);
@@ -229,7 +230,8 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 		return NULL;
 	}
 
-	ret = nouveau_grobj_alloc(chan, 0xbeef5097, tesla_class, &screen->tesla);
+	ret = nouveau_grobj_alloc(chan, 0xbeef5097, tesla_class,
+		&screen->tesla);
 	if (ret) {
 		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
 		nv50_screen_destroy(pscreen);
@@ -247,7 +249,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 
 	/* Static M2MF init */
 	so = so_new(32, 0);
-	so_method(so, screen->m2mf, 0x0180, 3);
+	so_method(so, screen->m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_NOTIFY, 3);
 	so_data  (so, screen->sync->handle);
 	so_data  (so, chan->vram->handle);
 	so_data  (so, chan->vram->handle);
@@ -290,9 +292,10 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 
 	so_method(so, screen->tesla, 0x13bc, 1);
 	so_data  (so, 0x54);
+	/* origin is top left (set to 1 for bottom left) */
 	so_method(so, screen->tesla, 0x13ac, 1);
-	so_data  (so, 1);
-	so_method(so, screen->tesla, 0x16b8, 1);
+	so_data  (so, 0);
+	so_method(so, screen->tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
 	so_data  (so, 8);
 
 	/* constant buffers for immediates and VP/FP parameters */
@@ -330,33 +333,33 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	so_data  (so, 0x000BBNP1);
 	*/
 
-	so_method(so, screen->tesla, 0x1280, 3);
+	so_method(so, screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
 	so_reloc (so, screen->constbuf_misc[0], 0, NOUVEAU_BO_VRAM |
 		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
 	so_reloc (so, screen->constbuf_misc[0], 0, NOUVEAU_BO_VRAM |
 		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
 	so_data  (so, (NV50_CB_PMISC << 16) | 0x00000800);
-	so_method(so, screen->tesla, 0x1694, 1);
+	so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1);
 	so_data  (so, 0x00000001 | (NV50_CB_PMISC << 12));
-	so_method(so, screen->tesla, 0x1694, 1);
+	so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1);
 	so_data  (so, 0x00000031 | (NV50_CB_PMISC << 12));
 
-	so_method(so, screen->tesla, 0x1280, 3);
+	so_method(so, screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
 	so_reloc (so, screen->constbuf_parm[0], 0, NOUVEAU_BO_VRAM |
 		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
 	so_reloc (so, screen->constbuf_parm[0], 0, NOUVEAU_BO_VRAM |
 		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
 	so_data  (so, (NV50_CB_PVP << 16) | 0x00000800);
-	so_method(so, screen->tesla, 0x1694, 1);
+	so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1);
 	so_data  (so, 0x00000101 | (NV50_CB_PVP << 12));
 
-	so_method(so, screen->tesla, 0x1280, 3);
+	so_method(so, screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
 	so_reloc (so, screen->constbuf_parm[1], 0, NOUVEAU_BO_VRAM |
 		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
 	so_reloc (so, screen->constbuf_parm[1], 0, NOUVEAU_BO_VRAM |
 		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
 	so_data  (so, (NV50_CB_PFP << 16) | 0x00000800);
-	so_method(so, screen->tesla, 0x1694, 1);
+	so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1);
 	so_data  (so, 0x00000131 | (NV50_CB_PFP << 12));
 
 	/* Texture sampler/image unit setup - we abuse the constant buffer
@@ -370,13 +373,13 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 		return NULL;
 	}
 
-	so_method(so, screen->tesla, 0x1280, 3);
+	so_method(so, screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
 	so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM |
 		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
 	so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM |
 		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
 	so_data  (so, (NV50_CB_TIC << 16) | 0x0800);
-	so_method(so, screen->tesla, 0x1574, 3);
+	so_method(so, screen->tesla, NV50TCL_TIC_ADDRESS_HIGH, 3);
 	so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM |
 		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
 	so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM |
@@ -389,13 +392,13 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 		return NULL;
 	}
 
-	so_method(so, screen->tesla, 0x1280, 3);
+	so_method(so, screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
 	so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM |
 		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
 	so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM |
 		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
 	so_data  (so, (NV50_CB_TSC << 16) | 0x0800);
-	so_method(so, screen->tesla, 0x155c, 3);
+	so_method(so, screen->tesla, NV50TCL_TSC_ADDRESS_HIGH, 3);
 	so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM |
 		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
 	so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM |
@@ -405,7 +408,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 
 	/* Vertex array limits - max them out */
 	for (i = 0; i < 16; i++) {
-		so_method(so, screen->tesla, 0x1080 + (i * 8), 2);
+		so_method(so, screen->tesla, NV50TCL_UNK1080_OFFSET_HIGH(i), 2);
 		so_data  (so, 0x000000ff);
 		so_data  (so, 0xffffffff);
 	}
@@ -417,6 +420,10 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	so_method(so, screen->tesla, 0x1234, 1);
 	so_data  (so, 1);
 
+	/* activate first scissor rectangle */
+	so_method(so, screen->tesla, NV50TCL_SCISSOR_ENABLE, 1);
+	so_data  (so, 1);
+
 	so_emit(chan, so);
 	so_ref (so, &screen->static_init);
 	so_ref (NULL, &so);
diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c
index 116866a8e7..81fa3e34c5 100644
--- a/src/gallium/drivers/nv50/nv50_state.c
+++ b/src/gallium/drivers/nv50/nv50_state.c
@@ -205,11 +205,16 @@ nv50_sampler_state_create(struct pipe_context *pipe,
 	}
 
 	limit = CLAMP(cso->lod_bias, -16.0, 15.0);
-	tsc[1] |= ((int)(limit * 256.0) & 0x1fff) << 11;
+	tsc[1] |= ((int)(limit * 256.0) & 0x1fff) << 12;
 
 	tsc[2] |= ((int)CLAMP(cso->max_lod, 0.0, 15.0) << 20) |
 		  ((int)CLAMP(cso->min_lod, 0.0, 15.0) << 8);
 
+	tsc[4] = fui(cso->border_color[0]);
+	tsc[5] = fui(cso->border_color[1]);
+	tsc[6] = fui(cso->border_color[2]);
+	tsc[7] = fui(cso->border_color[3]);
+
 	sso->normalized = cso->normalized_coords;
 	return (void *)sso;
 }
@@ -268,6 +273,11 @@ nv50_rasterizer_state_create(struct pipe_context *pipe,
 	so_method(so, tesla, NV50TCL_SHADE_MODEL, 1);
 	so_data  (so, cso->flatshade ? NV50TCL_SHADE_MODEL_FLAT :
 				       NV50TCL_SHADE_MODEL_SMOOTH);
+	so_method(so, tesla, 0x1684, 1);
+	so_data  (so, cso->flatshade_first ? 0 : 1);
+
+	so_method(so, tesla, NV50TCL_VERTEX_TWO_SIDE_ENABLE, 1);
+	so_data  (so, cso->light_twoside);
 
 	so_method(so, tesla, NV50TCL_LINE_WIDTH, 1);
 	so_data  (so, fui(cso->line_width));
@@ -287,6 +297,9 @@ nv50_rasterizer_state_create(struct pipe_context *pipe,
 	so_method(so, tesla, NV50TCL_POINT_SIZE, 1);
 	so_data  (so, fui(cso->point_size));
 
+	so_method(so, tesla, NV50TCL_POINT_SPRITE_ENABLE, 1);
+	so_data  (so, cso->point_sprite);
+
 	so_method(so, tesla, NV50TCL_POLYGON_MODE_FRONT, 3);
 	if (cso->front_winding == PIPE_WINDING_CCW) {
 		so_data(so, nvgl_polygon_mode(cso->fill_ccw));
@@ -404,35 +417,35 @@ nv50_depth_stencil_alpha_state_create(struct pipe_context *pipe,
 		so_data  (so, 0);
 	}
 
-	/*XXX: yes, I know they're backwards.. header needs fixing */
+	/* XXX: keep hex values until header is updated (names reversed) */
 	if (cso->stencil[0].enabled) {
-		so_method(so, tesla, NV50TCL_STENCIL_BACK_ENABLE, 5);
+		so_method(so, tesla, 0x1380, 8);
 		so_data  (so, 1);
 		so_data  (so, nvgl_stencil_op(cso->stencil[0].fail_op));
 		so_data  (so, nvgl_stencil_op(cso->stencil[0].zfail_op));
 		so_data  (so, nvgl_stencil_op(cso->stencil[0].zpass_op));
 		so_data  (so, nvgl_comparison_op(cso->stencil[0].func));
-		so_method(so, tesla, NV50TCL_STENCIL_BACK_FUNC_REF, 3);
 		so_data  (so, cso->stencil[0].ref_value);
 		so_data  (so, cso->stencil[0].writemask);
 		so_data  (so, cso->stencil[0].valuemask);
 	} else {
-		so_method(so, tesla, NV50TCL_STENCIL_BACK_ENABLE, 1);
+		so_method(so, tesla, 0x1380, 1);
 		so_data  (so, 0);
 	}
 
 	if (cso->stencil[1].enabled) {
-		so_method(so, tesla, NV50TCL_STENCIL_FRONT_ENABLE, 8);
+		so_method(so, tesla, 0x1594, 5);
 		so_data  (so, 1);
 		so_data  (so, nvgl_stencil_op(cso->stencil[1].fail_op));
 		so_data  (so, nvgl_stencil_op(cso->stencil[1].zfail_op));
 		so_data  (so, nvgl_stencil_op(cso->stencil[1].zpass_op));
 		so_data  (so, nvgl_comparison_op(cso->stencil[1].func));
+		so_method(so, tesla, 0x0f54, 3);
 		so_data  (so, cso->stencil[1].ref_value);
 		so_data  (so, cso->stencil[1].writemask);
 		so_data  (so, cso->stencil[1].valuemask);
 	} else {
-		so_method(so, tesla, NV50TCL_STENCIL_FRONT_ENABLE, 1);
+		so_method(so, tesla, 0x1594, 1);
 		so_data  (so, 0);
 	}
 
diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c
index d313e9de4f..5a3559ed18 100644
--- a/src/gallium/drivers/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nv50/nv50_state_validate.c
@@ -33,7 +33,7 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 
 	for (i = 0; i < fb->nr_cbufs; i++) {
 		struct pipe_texture *pt = fb->cbufs[i]->texture;
-		struct nouveau_bo *bo = nv50_miptree(pt)->bo;
+		struct nouveau_bo *bo = nv50_miptree(pt)->base.bo;
 
 		if (!gw) {
 			w = fb->cbufs[i]->width;
@@ -55,18 +55,19 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 			      NOUVEAU_BO_LOW | NOUVEAU_BO_RDWR, 0, 0);
 		switch (fb->cbufs[i]->format) {
 		case PIPE_FORMAT_A8R8G8B8_UNORM:
-			so_data(so, 0xcf);
+			so_data(so, NV50TCL_RT_FORMAT_A8R8G8B8_UNORM);
 			break;
 		case PIPE_FORMAT_R5G6B5_UNORM:
-			so_data(so, 0xe8);
+			so_data(so, NV50TCL_RT_FORMAT_R5G6B5_UNORM);
 			break;
 		default:
 			NOUVEAU_ERR("AIIII unknown format %s\n",
 				    pf_name(fb->cbufs[i]->format));
-			so_data(so, 0xe6);
+			so_data(so, NV50TCL_RT_FORMAT_X8R8G8B8_UNORM);
 			break;
 		}
-		so_data(so, bo->tile_mode << 4);
+		so_data(so, nv50_miptree(pt)->
+				level[fb->cbufs[i]->level].tile_mode << 4);
 		so_data(so, 0x00000000);
 
 		so_method(so, tesla, 0x1224, 1);
@@ -75,7 +76,7 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 
 	if (fb->zsbuf) {
 		struct pipe_texture *pt = fb->zsbuf->texture;
-		struct nouveau_bo *bo = nv50_miptree(pt)->bo;
+		struct nouveau_bo *bo = nv50_miptree(pt)->base.bo;
 
 		if (!gw) {
 			w = fb->zsbuf->width;
@@ -92,25 +93,31 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 		so_reloc (so, bo, fb->zsbuf->offset, NOUVEAU_BO_VRAM |
 			      NOUVEAU_BO_LOW | NOUVEAU_BO_RDWR, 0, 0);
 		switch (fb->zsbuf->format) {
+		case PIPE_FORMAT_Z32_FLOAT:
+			so_data(so, NV50TCL_ZETA_FORMAT_Z32_FLOAT);
+			break;
 		case PIPE_FORMAT_Z24S8_UNORM:
-		case PIPE_FORMAT_Z24X8_UNORM:
-			so_data(so, 0x16);
+			so_data(so, NV50TCL_ZETA_FORMAT_Z24S8_UNORM);
+			break;
+		case PIPE_FORMAT_X8Z24_UNORM:
+			so_data(so, NV50TCL_ZETA_FORMAT_X8Z24_UNORM);
 			break;
-		case PIPE_FORMAT_Z16_UNORM:
-			so_data(so, 0x15);
+		case PIPE_FORMAT_S8Z24_UNORM:
+			so_data(so, NV50TCL_ZETA_FORMAT_S8Z24_UNORM);
 			break;
 		default:
 			NOUVEAU_ERR("AIIII unknown format %s\n",
 				    pf_name(fb->zsbuf->format));
-			so_data(so, 0x16);
+			so_data(so, NV50TCL_ZETA_FORMAT_S8Z24_UNORM);
 			break;
 		}
-		so_data(so, bo->tile_mode << 4);
+		so_data(so, nv50_miptree(pt)->
+				level[fb->zsbuf->level].tile_mode << 4);
 		so_data(so, 0x00000000);
 
 		so_method(so, tesla, 0x1538, 1);
 		so_data  (so, 1);
-		so_method(so, tesla, 0x1228, 3);
+		so_method(so, tesla, NV50TCL_ZETA_HORIZ, 3);
 		so_data  (so, fb->zsbuf->width);
 		so_data  (so, fb->zsbuf->height);
 		so_data  (so, 0x00010001);
@@ -119,12 +126,18 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 	so_method(so, tesla, NV50TCL_VIEWPORT_HORIZ, 2);
 	so_data  (so, w << 16);
 	so_data  (so, h << 16);
-	so_method(so, tesla, 0x0e04, 2);
+	/* set window lower left corner */
+	so_method(so, tesla, NV50TCL_WINDOW_LEFT, 2);
+	so_data  (so, 0);
+	so_data  (so, 0);
+	/* set screen scissor rectangle */
+	so_method(so, tesla, NV50TCL_SCREEN_SCISSOR_HORIZ, 2);
 	so_data  (so, w << 16);
 	so_data  (so, h << 16);
-	so_method(so, tesla, 0xdf8, 2);
-	so_data  (so, 0);
-	so_data  (so, h);
+
+	/* we set scissors to framebuffer size when they're 'turned off' */
+	nv50->dirty |= NV50_NEW_SCISSOR;
+	so_ref(NULL, &nv50->state.scissor);
 
 	so_ref(so, &nv50->state.fb);
 	so_ref(NULL, &so);
@@ -137,7 +150,32 @@ nv50_state_emit(struct nv50_context *nv50)
 	struct nouveau_channel *chan = screen->base.channel;
 
 	if (nv50->pctx_id != screen->cur_pctx) {
-		nv50->state.dirty |= 0xffffffff;
+		if (nv50->state.fb)
+			nv50->state.dirty |= NV50_NEW_FRAMEBUFFER;
+		if (nv50->state.blend)
+			nv50->state.dirty |= NV50_NEW_BLEND;
+		if (nv50->state.zsa)
+			nv50->state.dirty |= NV50_NEW_ZSA;
+		if (nv50->state.vertprog)
+			nv50->state.dirty |= NV50_NEW_VERTPROG;
+		if (nv50->state.fragprog)
+			nv50->state.dirty |= NV50_NEW_FRAGPROG;
+		if (nv50->state.rast)
+			nv50->state.dirty |= NV50_NEW_RASTERIZER;
+		if (nv50->state.blend_colour)
+			nv50->state.dirty |= NV50_NEW_BLEND_COLOUR;
+		if (nv50->state.stipple)
+			nv50->state.dirty |= NV50_NEW_STIPPLE;
+		if (nv50->state.scissor)
+			nv50->state.dirty |= NV50_NEW_SCISSOR;
+		if (nv50->state.viewport)
+			nv50->state.dirty |= NV50_NEW_VIEWPORT;
+		if (nv50->state.tsc_upload)
+			nv50->state.dirty |= NV50_NEW_SAMPLER;
+		if (nv50->state.tic_upload)
+			nv50->state.dirty |= NV50_NEW_TEXTURE;
+		if (nv50->state.vtxfmt && nv50->state.vtxbuf)
+			nv50->state.dirty |= NV50_NEW_ARRAYS;
 		screen->cur_pctx = nv50->pctx_id;
 	}
 
@@ -151,6 +189,8 @@ nv50_state_emit(struct nv50_context *nv50)
 		so_emit(chan, nv50->state.vertprog);
 	if (nv50->state.dirty & NV50_NEW_FRAGPROG)
 		so_emit(chan, nv50->state.fragprog);
+	if (nv50->state.dirty & (NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG))
+		so_emit(chan, nv50->state.programs);
 	if (nv50->state.dirty & NV50_NEW_RASTERIZER)
 		so_emit(chan, nv50->state.rast);
 	if (nv50->state.dirty & NV50_NEW_BLEND_COLOUR)
@@ -168,8 +208,16 @@ nv50_state_emit(struct nv50_context *nv50)
 	if (nv50->state.dirty & NV50_NEW_ARRAYS) {
 		so_emit(chan, nv50->state.vtxfmt);
 		so_emit(chan, nv50->state.vtxbuf);
+		if (nv50->state.vtxattr)
+			so_emit(chan, nv50->state.vtxattr);
 	}
 	nv50->state.dirty = 0;
+}
+
+void
+nv50_state_flush_notify(struct nouveau_channel *chan)
+{
+	struct nv50_context *nv50 = chan->user_private;
 
 	so_emit_reloc_markers(chan, nv50->state.fb);
 	so_emit_reloc_markers(chan, nv50->state.vertprog);
@@ -200,6 +248,9 @@ nv50_state_validate(struct nv50_context *nv50)
 	if (nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_FRAGPROG_CB))
 		nv50_fragprog_validate(nv50);
 
+	if (nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG))
+		nv50_linkage_validate(nv50);
+
 	if (nv50->dirty & NV50_NEW_RASTERIZER)
 		so_ref(nv50->rasterizer->so, &nv50->state.rast);
 
@@ -233,13 +284,13 @@ nv50_state_validate(struct nv50_context *nv50)
 		nv50->state.scissor_enabled = rast->scissor;
 
 		so = so_new(3, 0);
-		so_method(so, tesla, 0x0ff4, 2);
+		so_method(so, tesla, NV50TCL_SCISSOR_HORIZ, 2);
 		if (nv50->state.scissor_enabled) {
-			so_data(so, ((s->maxx - s->minx) << 16) | s->minx);
-			so_data(so, ((s->maxy - s->miny) << 16) | s->miny);
+			so_data(so, (s->maxx << 16) | s->minx);
+			so_data(so, (s->maxy << 16) | s->miny);
 		} else {
-			so_data(so, (8192 << 16));
-			so_data(so, (8192 << 16));
+			so_data(so, (nv50->framebuffer.width << 16));
+			so_data(so, (nv50->framebuffer.height << 16));
 		}
 		so_ref(so, &nv50->state.scissor);
 		so_ref(NULL, &so);
@@ -263,20 +314,22 @@ scissor_uptodate:
 
 		so = so_new(12, 0);
 		if (!bypass) {
-			so_method(so, tesla, NV50TCL_VIEWPORT_UNK1(0), 3);
+			so_method(so, tesla, NV50TCL_VIEWPORT_TRANSLATE(0), 3);
 			so_data  (so, fui(nv50->viewport.translate[0]));
 			so_data  (so, fui(nv50->viewport.translate[1]));
 			so_data  (so, fui(nv50->viewport.translate[2]));
-			so_method(so, tesla, NV50TCL_VIEWPORT_UNK0(0), 3);
+			so_method(so, tesla, NV50TCL_VIEWPORT_SCALE(0), 3);
 			so_data  (so, fui(nv50->viewport.scale[0]));
-			so_data  (so, fui(-nv50->viewport.scale[1]));
+			so_data  (so, fui(nv50->viewport.scale[1]));
 			so_data  (so, fui(nv50->viewport.scale[2]));
-			so_method(so, tesla, 0x192c, 1);
+
+			so_method(so, tesla, NV50TCL_VIEWPORT_TRANSFORM_EN, 1);
 			so_data  (so, 1);
+			/* no idea what 0f90 does */
 			so_method(so, tesla, 0x0f90, 1);
 			so_data  (so, 0);
 		} else {
-			so_method(so, tesla, 0x192c, 1);
+			so_method(so, tesla, NV50TCL_VIEWPORT_TRANSFORM_EN, 1);
 			so_data  (so, 0);
 			so_method(so, tesla, 0x0f90, 1);
 			so_data  (so, 1);
@@ -292,9 +345,10 @@ viewport_uptodate:
 		int i;
 
 		so = so_new(nv50->sampler_nr * 8 + 3, 0);
-		so_method(so, tesla, 0x0f00, 1);
+		so_method(so, tesla, NV50TCL_CB_ADDR, 1);
 		so_data  (so, NV50_CB_TSC);
-		so_method(so, tesla, 0x40000f04, nv50->sampler_nr * 8);
+		so_method(so, tesla, NV50TCL_CB_DATA(0) | 0x40000000,
+			nv50->sampler_nr * 8);
 		for (i = 0; i < nv50->sampler_nr; i++)
 			so_datap (so, nv50->sampler[i]->tsc, 8);
 		so_ref(so, &nv50->state.tsc_upload);
diff --git a/src/gallium/drivers/nv50/nv50_surface.c b/src/gallium/drivers/nv50/nv50_surface.c
index 3da9d6e728..6bf6f773b0 100644
--- a/src/gallium/drivers/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nv50/nv50_surface.c
@@ -35,13 +35,13 @@ nv50_format(enum pipe_format format)
 {
 	switch (format) {
 	case PIPE_FORMAT_A8R8G8B8_UNORM:
-		return NV50_2D_DST_FORMAT_32BPP;
+		return NV50_2D_DST_FORMAT_A8R8G8B8_UNORM;
 	case PIPE_FORMAT_X8R8G8B8_UNORM:
-		return NV50_2D_DST_FORMAT_24BPP;
+		return NV50_2D_DST_FORMAT_X8R8G8B8_UNORM;
 	case PIPE_FORMAT_R5G6B5_UNORM:
-		return NV50_2D_DST_FORMAT_16BPP;
+		return NV50_2D_DST_FORMAT_R5G6B5_UNORM;
 	case PIPE_FORMAT_A8_UNORM:
-		return NV50_2D_DST_FORMAT_8BPP;
+		return NV50_2D_DST_FORMAT_R8_UNORM;
 	default:
 		return -1;
 	}
@@ -53,20 +53,20 @@ nv50_surface_set(struct nv50_screen *screen, struct pipe_surface *ps, int dst)
 	struct nv50_miptree *mt = nv50_miptree(ps->texture);
 	struct nouveau_channel *chan = screen->eng2d->channel;
 	struct nouveau_grobj *eng2d = screen->eng2d;
-	struct nouveau_bo *bo = nv50_miptree(ps->texture)->bo;
+	struct nouveau_bo *bo = nv50_miptree(ps->texture)->base.bo;
  	int format, mthd = dst ? NV50_2D_DST_FORMAT : NV50_2D_SRC_FORMAT;
  	int flags = NOUVEAU_BO_VRAM | (dst ? NOUVEAU_BO_WR : NOUVEAU_BO_RD);
 
  	format = nv50_format(ps->format);
  	if (format < 0)
  		return 1;
-  
+
  	if (!bo->tile_flags) {
  		BEGIN_RING(chan, eng2d, mthd, 2);
  		OUT_RING  (chan, format);
  		OUT_RING  (chan, 1);
  		BEGIN_RING(chan, eng2d, mthd + 0x14, 5);
- 		OUT_RING  (chan, mt->level[0].pitch);
+		OUT_RING  (chan, mt->level[ps->level].pitch);
  		OUT_RING  (chan, ps->width);
  		OUT_RING  (chan, ps->height);
  		OUT_RELOCh(chan, bo, ps->offset, flags);
@@ -75,7 +75,7 @@ nv50_surface_set(struct nv50_screen *screen, struct pipe_surface *ps, int dst)
  		BEGIN_RING(chan, eng2d, mthd, 5);
  		OUT_RING  (chan, format);
  		OUT_RING  (chan, 0);
- 		OUT_RING  (chan, bo->tile_mode << 4);
+		OUT_RING  (chan, mt->level[ps->level].tile_mode << 4);
  		OUT_RING  (chan, 1);
  		OUT_RING  (chan, 0);
  		BEGIN_RING(chan, eng2d, mthd + 0x18, 4);
@@ -144,7 +144,7 @@ nv50_surface_copy(struct pipe_context *pipe,
 		  struct pipe_surface *src, unsigned srcx, unsigned srcy,
 		  unsigned width, unsigned height)
 {
-	struct nv50_context *nv50 = (struct nv50_context *)pipe;
+	struct nv50_context *nv50 = nv50_context(pipe);
 	struct nv50_screen *screen = nv50->screen;
 
 	assert(src->format == dest->format);
@@ -158,7 +158,7 @@ nv50_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest,
 		  unsigned destx, unsigned desty, unsigned width,
 		  unsigned height, unsigned value)
 {
-	struct nv50_context *nv50 = (struct nv50_context *)pipe;
+	struct nv50_context *nv50 = nv50_context(pipe);
 	struct nv50_screen *screen = nv50->screen;
 	struct nouveau_channel *chan = screen->eng2d->channel;
 	struct nouveau_grobj *eng2d = screen->eng2d;
diff --git a/src/gallium/drivers/nv50/nv50_tex.c b/src/gallium/drivers/nv50/nv50_tex.c
index ff40c2ad81..033cb50c11 100644
--- a/src/gallium/drivers/nv50/nv50_tex.c
+++ b/src/gallium/drivers/nv50/nv50_tex.c
@@ -29,33 +29,33 @@ static int
 nv50_tex_construct(struct nv50_context *nv50, struct nouveau_stateobj *so,
 		   struct nv50_miptree *mt, int unit)
 {
-	switch (mt->base.format) {
+	switch (mt->base.base.format) {
 	case PIPE_FORMAT_A8R8G8B8_UNORM:
 		so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM |
-			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM |
+			    NV50TIC_0_0_MAPR_C2 | NV50TIC_0_0_TYPER_UNORM |
 			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM |
-			    NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM |
+			    NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM |
 			    NV50TIC_0_0_FMT_8_8_8_8);
 		break;
 	case PIPE_FORMAT_A1R5G5B5_UNORM:
 		so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM |
-			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM |
+			    NV50TIC_0_0_MAPR_C2 | NV50TIC_0_0_TYPER_UNORM |
 			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM |
-			    NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM |
+			    NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM |
 			    NV50TIC_0_0_FMT_1_5_5_5);
 		break;
 	case PIPE_FORMAT_A4R4G4B4_UNORM:
 		so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM |
-			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM |
+			    NV50TIC_0_0_MAPR_C2 | NV50TIC_0_0_TYPER_UNORM |
 			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM |
-			    NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM |
+			    NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM |
 			    NV50TIC_0_0_FMT_4_4_4_4);
 		break;
 	case PIPE_FORMAT_R5G6B5_UNORM:
 		so_data(so, NV50TIC_0_0_MAPA_ONE | NV50TIC_0_0_TYPEA_UNORM |
-			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM |
+			    NV50TIC_0_0_MAPR_C2 | NV50TIC_0_0_TYPER_UNORM |
 			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM |
-			    NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM |
+			    NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM |
 			    NV50TIC_0_0_FMT_5_6_5);
 		break;
 	case PIPE_FORMAT_L8_UNORM:
@@ -118,18 +118,18 @@ nv50_tex_construct(struct nv50_context *nv50, struct nouveau_stateobj *so,
 		return 1;
 	}
 
-	so_reloc(so, mt->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW |
+	so_reloc(so, mt->base.bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW |
 		     NOUVEAU_BO_RD, 0, 0);
 	if (nv50->sampler[unit]->normalized)
-		so_data (so, 0xd0005000 | mt->bo->tile_mode << 22);
+		so_data (so, 0xd0005000 | mt->base.bo->tile_mode << 22);
 	else
-		so_data (so, 0x5001d000 | mt->bo->tile_mode << 22);
+		so_data (so, 0x5001d000 | mt->base.bo->tile_mode << 22);
 	so_data (so, 0x00300000);
-	so_data (so, mt->base.width[0]);
-	so_data (so, (mt->base.last_level << 28) |
-		     (mt->base.depth[0] << 16) | mt->base.height[0]);
+	so_data (so, mt->base.base.width[0]);
+	so_data (so, (mt->base.base.last_level << 28) |
+		     (mt->base.base.depth[0] << 16) | mt->base.base.height[0]);
 	so_data (so, 0x03000000);
-	so_data (so, mt->base.last_level << 4);
+	so_data (so, mt->base.base.last_level << 4);
 
 	return 0;
 }
@@ -145,25 +145,28 @@ nv50_tex_validate(struct nv50_context *nv50)
 	push += MAX2(nv50->miptree_nr, nv50->state.miptree_nr) * 2;
 
 	so = so_new(push, nv50->miptree_nr * 2);
-	so_method(so, tesla, 0x0f00, 1);
+	so_method(so, tesla, NV50TCL_CB_ADDR, 1);
 	so_data  (so, NV50_CB_TIC);
 	for (unit = 0; unit < nv50->miptree_nr; unit++) {
 		struct nv50_miptree *mt = nv50->miptree[unit];
 
-		so_method(so, tesla, 0x40000f04, 8);
+		so_method(so, tesla, NV50TCL_CB_DATA(0) | 0x40000000, 8);
 		if (nv50_tex_construct(nv50, so, mt, unit)) {
 			NOUVEAU_ERR("failed tex validate\n");
 			so_ref(NULL, &so);
 			return;
 		}
 
-		so_method(so, tesla, 0x1458, 1);
-		so_data  (so, (unit << 9) | (unit << 1) | 1);
+		so_method(so, tesla, NV50TCL_SET_SAMPLER_TEX, 1);
+		so_data  (so, (unit << NV50TCL_SET_SAMPLER_TEX_TIC_SHIFT) |
+			(unit << NV50TCL_SET_SAMPLER_TEX_SAMPLER_SHIFT) |
+			NV50TCL_SET_SAMPLER_TEX_VALID);
 	}
 
 	for (; unit < nv50->state.miptree_nr; unit++) {
-		so_method(so, tesla, 0x1458, 1);
-		so_data  (so, (unit << 1) | 0);
+		so_method(so, tesla, NV50TCL_SET_SAMPLER_TEX, 1);
+		so_data  (so,
+			(unit << NV50TCL_SET_SAMPLER_TEX_SAMPLER_SHIFT) | 0);
 	}
 
 	so_ref(so, &nv50->state.tic_upload);
diff --git a/src/gallium/drivers/nv50/nv50_texture.h b/src/gallium/drivers/nv50/nv50_texture.h
index aca622c73b..207fb039f7 100644
--- a/src/gallium/drivers/nv50/nv50_texture.h
+++ b/src/gallium/drivers/nv50/nv50_texture.h
@@ -14,13 +14,13 @@
 #define NV50TIC_0_0_MAPA_C2                                       0x20000000
 #define NV50TIC_0_0_MAPA_C3                                       0x28000000
 #define NV50TIC_0_0_MAPA_ONE                                      0x38000000
-#define NV50TIC_0_0_MAPR_MASK                                     0x07000000
-#define NV50TIC_0_0_MAPR_ZERO                                     0x00000000
-#define NV50TIC_0_0_MAPR_C0                                       0x02000000
-#define NV50TIC_0_0_MAPR_C1                                       0x03000000
-#define NV50TIC_0_0_MAPR_C2                                       0x04000000
-#define NV50TIC_0_0_MAPR_C3                                       0x05000000
-#define NV50TIC_0_0_MAPR_ONE                                      0x07000000
+#define NV50TIC_0_0_MAPB_MASK                                     0x07000000
+#define NV50TIC_0_0_MAPB_ZERO                                     0x00000000
+#define NV50TIC_0_0_MAPB_C0                                       0x02000000
+#define NV50TIC_0_0_MAPB_C1                                       0x03000000
+#define NV50TIC_0_0_MAPB_C2                                       0x04000000
+#define NV50TIC_0_0_MAPB_C3                                       0x05000000
+#define NV50TIC_0_0_MAPB_ONE                                      0x07000000
 #define NV50TIC_0_0_MAPG_MASK                                     0x00e00000
 #define NV50TIC_0_0_MAPG_ZERO                                     0x00000000
 #define NV50TIC_0_0_MAPG_C0                                       0x00400000
@@ -28,31 +28,49 @@
 #define NV50TIC_0_0_MAPG_C2                                       0x00800000
 #define NV50TIC_0_0_MAPG_C3                                       0x00a00000
 #define NV50TIC_0_0_MAPG_ONE                                      0x00e00000
-#define NV50TIC_0_0_MAPB_MASK                                     0x001c0000
-#define NV50TIC_0_0_MAPB_ZERO                                     0x00000000
-#define NV50TIC_0_0_MAPB_C0                                       0x00080000
-#define NV50TIC_0_0_MAPB_C1                                       0x000c0000
-#define NV50TIC_0_0_MAPB_C2                                       0x00100000
-#define NV50TIC_0_0_MAPB_C3                                       0x00140000
-#define NV50TIC_0_0_MAPB_ONE                                      0x001c0000
+#define NV50TIC_0_0_MAPR_MASK                                     0x001c0000
+#define NV50TIC_0_0_MAPR_ZERO                                     0x00000000
+#define NV50TIC_0_0_MAPR_C0                                       0x00080000
+#define NV50TIC_0_0_MAPR_C1                                       0x000c0000
+#define NV50TIC_0_0_MAPR_C2                                       0x00100000
+#define NV50TIC_0_0_MAPR_C3                                       0x00140000
+#define NV50TIC_0_0_MAPR_ONE                                      0x001c0000
 #define NV50TIC_0_0_TYPEA_MASK                                    0x00038000
 #define NV50TIC_0_0_TYPEA_UNORM                                   0x00010000
-#define NV50TIC_0_0_TYPER_MASK                                    0x00007000
-#define NV50TIC_0_0_TYPER_UNORM                                   0x00002000
+#define NV50TIC_0_0_TYPEA_SNORM                                   0x00008000
+#define NV50TIC_0_0_TYPEA_FLOAT                                   0x00038000
+#define NV50TIC_0_0_TYPEB_MASK                                    0x00007000
+#define NV50TIC_0_0_TYPEB_UNORM                                   0x00002000
+#define NV50TIC_0_0_TYPEB_SNORM                                   0x00001000
+#define NV50TIC_0_0_TYPEB_FLOAT                                   0x00007000
 #define NV50TIC_0_0_TYPEG_MASK                                    0x00000e00
 #define NV50TIC_0_0_TYPEG_UNORM                                   0x00000400
-#define NV50TIC_0_0_TYPEB_MASK                                    0x000001c0
-#define NV50TIC_0_0_TYPEB_UNORM                                   0x00000080
-#define NV50TIC_0_0_FMT_MASK                                      0x0000003c
+#define NV50TIC_0_0_TYPEG_SNORM                                   0x00000200
+#define NV50TIC_0_0_TYPEG_FLOAT                                   0x00000e00
+#define NV50TIC_0_0_TYPER_MASK                                    0x000001c0
+#define NV50TIC_0_0_TYPER_UNORM                                   0x00000080
+#define NV50TIC_0_0_TYPER_SNORM                                   0x00000040
+#define NV50TIC_0_0_TYPER_FLOAT                                   0x000001c0
+#define NV50TIC_0_0_FMT_MASK                                      0x0000003f
+#define NV50TIC_0_0_FMT_32_32_32_32                               0x00000001
+#define NV50TIC_0_0_FMT_16_16_16_16                               0x00000003
+#define NV50TIC_0_0_FMT_32_32                                     0x00000004
 #define NV50TIC_0_0_FMT_8_8_8_8                                   0x00000008
+#define NV50TIC_0_0_FMT_2_10_10_10                                0x00000009
+#define NV50TIC_0_0_FMT_32                                        0x0000000f
 #define NV50TIC_0_0_FMT_4_4_4_4                                   0x00000012
-#define NV50TIC_0_0_FMT_1_5_5_5                                   0x00000013
+/* #define NV50TIC_0_0_FMT_1_5_5_5                                0x00000013 */
+#define NV50TIC_0_0_FMT_1_5_5_5                                   0x00000014
 #define NV50TIC_0_0_FMT_5_6_5                                     0x00000015
 #define NV50TIC_0_0_FMT_8_8                                       0x00000018
+#define NV50TIC_0_0_FMT_16                                        0x0000001b
 #define NV50TIC_0_0_FMT_8                                         0x0000001d
+#define NV50TIC_0_0_FMT_10_11_11                                  0x00000021
 #define NV50TIC_0_0_FMT_DXT1                                      0x00000024
 #define NV50TIC_0_0_FMT_DXT3                                      0x00000025
 #define NV50TIC_0_0_FMT_DXT5                                      0x00000026
+#define NV50TIC_0_0_FMT_RGTC1                                     0x00000027
+#define NV50TIC_0_0_FMT_RGTC2                                     0x00000028
 
 #define NV50TIC_0_1_OFFSET_LOW_MASK                               0xffffffff
 #define NV50TIC_0_1_OFFSET_LOW_SHIFT                                       0
@@ -102,6 +120,7 @@
 #define NV50TSC_1_0_WRAPR_MIRROR_CLAMP_TO_EDGE                   0x00000140
 #define NV50TSC_1_0_WRAPR_MIRROR_CLAMP_TO_BORDER                 0x00000180
 #define NV50TSC_1_0_WRAPR_MIRROR_CLAMP                           0x000001c0
+#define NV50TSC_1_0_MAX_ANISOTROPY_MASK                          0x00700000
 
 #define NV50TSC_1_1_MAGF_MASK                                    0x00000003
 #define NV50TSC_1_1_MAGF_NEAREST                                 0x00000001
@@ -113,17 +132,19 @@
 #define NV50TSC_1_1_MIPF_NONE                                    0x00000040
 #define NV50TSC_1_1_MIPF_NEAREST                                 0x00000080
 #define NV50TSC_1_1_MIPF_LINEAR                                  0x000000c0
+#define NV50TSC_1_1_LOD_BIAS_MASK                                0x01fff000
 
-#define NV50TSC_1_2_UNKNOWN_MASK                                 0xffffffff
+#define NV50TSC_1_2_MIN_LOD_MASK                                 0x00000f00
+#define NV50TSC_1_2_MAX_LOD_MASK                                 0x00f00000
 
 #define NV50TSC_1_3_UNKNOWN_MASK                                 0xffffffff
 
-#define NV50TSC_1_4_UNKNOWN_MASK                                 0xffffffff
+#define NV50TSC_1_4_BORDER_COLOR_RED_MASK                        0xffffffff
 
-#define NV50TSC_1_5_UNKNOWN_MASK                                 0xffffffff
+#define NV50TSC_1_5_BORDER_COLOR_GREEN_MASK                      0xffffffff
 
-#define NV50TSC_1_6_UNKNOWN_MASK                                 0xffffffff
+#define NV50TSC_1_6_BORDER_COLOR_BLUE_MASK                       0xffffffff
 
-#define NV50TSC_1_7_UNKNOWN_MASK                                 0xffffffff
+#define NV50TSC_1_7_BORDER_COLOR_ALPHA_MASK                      0xffffffff
 
 #endif
diff --git a/src/gallium/drivers/nv50/nv50_transfer.c b/src/gallium/drivers/nv50/nv50_transfer.c
index d0b7f0bef4..bb7731855c 100644
--- a/src/gallium/drivers/nv50/nv50_transfer.c
+++ b/src/gallium/drivers/nv50/nv50_transfer.c
@@ -8,6 +8,7 @@ struct nv50_transfer {
 	struct pipe_transfer base;
 	struct nouveau_bo *bo;
 	unsigned level_offset;
+	unsigned level_tiling;
 	int level_pitch;
 	int level_width;
 	int level_height;
@@ -16,11 +17,14 @@ struct nv50_transfer {
 };
 
 static void
-nv50_transfer_rect_m2mf(struct pipe_screen *pscreen, struct nouveau_bo *src_bo,
-			unsigned src_offset, int src_pitch, int sx, int sy,
-			int sw, int sh, struct nouveau_bo *dst_bo,
-			unsigned dst_offset, int dst_pitch, int dx, int dy,
-			int dw, int dh, int cpp, int width, int height,
+nv50_transfer_rect_m2mf(struct pipe_screen *pscreen,
+			struct nouveau_bo *src_bo, unsigned src_offset,
+			int src_pitch, unsigned src_tile_mode,
+			int sx, int sy, int sw, int sh,
+			struct nouveau_bo *dst_bo, unsigned dst_offset,
+			int dst_pitch, unsigned dst_tile_mode,
+			int dx, int dy, int dw, int dh,
+			int cpp, int width, int height,
 			unsigned src_reloc, unsigned dst_reloc)
 {
 	struct nv50_screen *screen = nv50_screen(pscreen);
@@ -33,15 +37,18 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen, struct nouveau_bo *src_bo,
 	WAIT_RING (chan, 14);
 
 	if (!src_bo->tile_flags) {
-		BEGIN_RING(chan, m2mf, 0x0200, 1);
+		BEGIN_RING(chan, m2mf,
+			NV50_MEMORY_TO_MEMORY_FORMAT_LINEAR_IN, 1);
 		OUT_RING  (chan, 1);
-		BEGIN_RING(chan, m2mf, 0x0314, 1);
+		BEGIN_RING(chan, m2mf,
+			NV50_MEMORY_TO_MEMORY_FORMAT_PITCH_IN, 1);
 		OUT_RING  (chan, src_pitch);
 		src_offset += (sy * src_pitch) + (sx * cpp);
 	} else {
-		BEGIN_RING(chan, m2mf, 0x0200, 6);
+		BEGIN_RING(chan, m2mf,
+			NV50_MEMORY_TO_MEMORY_FORMAT_LINEAR_IN, 6);
 		OUT_RING  (chan, 0);
-		OUT_RING  (chan, src_bo->tile_mode << 4);
+		OUT_RING  (chan, src_tile_mode << 4);
 		OUT_RING  (chan, sw * cpp);
 		OUT_RING  (chan, sh);
 		OUT_RING  (chan, 1);
@@ -49,15 +56,18 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen, struct nouveau_bo *src_bo,
 	}
 
 	if (!dst_bo->tile_flags) {
-		BEGIN_RING(chan, m2mf, 0x021c, 1);
+		BEGIN_RING(chan, m2mf,
+			NV50_MEMORY_TO_MEMORY_FORMAT_LINEAR_OUT, 1);
 		OUT_RING  (chan, 1);
-		BEGIN_RING(chan, m2mf, 0x0318, 1);
+		BEGIN_RING(chan, m2mf,
+			NV50_MEMORY_TO_MEMORY_FORMAT_PITCH_OUT, 1);
 		OUT_RING  (chan, dst_pitch);
 		dst_offset += (dy * dst_pitch) + (dx * cpp);
 	} else {
-		BEGIN_RING(chan, m2mf, 0x021c, 6);
+		BEGIN_RING(chan, m2mf,
+			NV50_MEMORY_TO_MEMORY_FORMAT_LINEAR_OUT, 6);
 		OUT_RING  (chan, 0);
-		OUT_RING  (chan, dst_bo->tile_mode << 4);
+		OUT_RING  (chan, dst_tile_mode << 4);
 		OUT_RING  (chan, dw * cpp);
 		OUT_RING  (chan, dh);
 		OUT_RING  (chan, 1);
@@ -68,25 +78,30 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen, struct nouveau_bo *src_bo,
 		int line_count = height > 2047 ? 2047 : height;
 
 		WAIT_RING (chan, 15);
-		BEGIN_RING(chan, m2mf, 0x0238, 2);
+		BEGIN_RING(chan, m2mf,
+			NV50_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN_HIGH, 2);
 		OUT_RELOCh(chan, src_bo, src_offset, src_reloc);
 		OUT_RELOCh(chan, dst_bo, dst_offset, dst_reloc);
-		BEGIN_RING(chan, m2mf, 0x030c, 2);
+		BEGIN_RING(chan, m2mf,
+			NV50_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 2);
 		OUT_RELOCl(chan, src_bo, src_offset, src_reloc);
 		OUT_RELOCl(chan, dst_bo, dst_offset, dst_reloc);
 		if (src_bo->tile_flags) {
-			BEGIN_RING(chan, m2mf, 0x0218, 1);
-			OUT_RING  (chan, (dy << 16) | sx);
+			BEGIN_RING(chan, m2mf,
+				NV50_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_IN, 1);
+			OUT_RING  (chan, (sy << 16) | (sx * cpp));
 		} else {
 			src_offset += (line_count * src_pitch);
 		}
 		if (dst_bo->tile_flags) {
-			BEGIN_RING(chan, m2mf, 0x0234, 1);
-			OUT_RING  (chan, (sy << 16) | dx);
+			BEGIN_RING(chan, m2mf,
+				NV50_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_OUT, 1);
+			OUT_RING  (chan, (dy << 16) | (dx * cpp));
 		} else {
 			dst_offset += (line_count * dst_pitch);
 		}
-		BEGIN_RING(chan, m2mf, 0x031c, 4);
+		BEGIN_RING(chan, m2mf,
+			NV50_MEMORY_TO_MEMORY_FORMAT_LINE_LENGTH_IN, 4);
 		OUT_RING  (chan, width * cpp);
 		OUT_RING  (chan, line_count);
 		OUT_RING  (chan, 0x00000101);
@@ -133,9 +148,10 @@ nv50_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 	tx->base.usage = usage;
 
 	tx->level_pitch = lvl->pitch;
-	tx->level_width = mt->base.width[level];
-	tx->level_height = mt->base.height[level];
+	tx->level_width = mt->base.base.width[level];
+	tx->level_height = mt->base.base.height[level];
 	tx->level_offset = lvl->image_offset[image];
+	tx->level_tiling = lvl->tile_mode;
 	tx->level_x = x;
 	tx->level_y = y;
 	ret = nouveau_bo_new(dev, NOUVEAU_BO_GART | NOUVEAU_BO_MAP, 0,
@@ -146,10 +162,12 @@ nv50_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 	}
 
 	if (usage != PIPE_TRANSFER_WRITE) {
-		nv50_transfer_rect_m2mf(pscreen, mt->bo, tx->level_offset,
-					tx->level_pitch, x, y, tx->level_width,
-					tx->level_height, tx->bo, 0,
-					tx->base.stride, 0, 0,
+		nv50_transfer_rect_m2mf(pscreen, mt->base.bo, tx->level_offset,
+					tx->level_pitch, tx->level_tiling,
+					x, y,
+					tx->level_width, tx->level_height,
+					tx->bo, 0, tx->base.stride,
+					tx->bo->tile_mode, 0, 0,
 					tx->base.width, tx->base.height,
 					tx->base.block.size, w, h,
 					NOUVEAU_BO_VRAM | NOUVEAU_BO_GART,
@@ -168,12 +186,14 @@ nv50_transfer_del(struct pipe_transfer *ptx)
 	if (ptx->usage != PIPE_TRANSFER_READ) {
 		struct pipe_screen *pscreen = ptx->texture->screen;
 		nv50_transfer_rect_m2mf(pscreen, tx->bo, 0, tx->base.stride,
-					0, 0, tx->base.width, tx->base.height,
-					mt->bo, tx->level_offset,
-					tx->level_pitch, tx->level_x,
-					tx->level_y, tx->level_width,
-					tx->level_height, tx->base.block.size,
+					tx->bo->tile_mode, 0, 0,
 					tx->base.width, tx->base.height,
+					mt->base.bo, tx->level_offset,
+					tx->level_pitch, tx->level_tiling,
+					tx->level_x, tx->level_y,
+					tx->level_width, tx->level_height,
+					tx->base.block.size, tx->base.width,
+					tx->base.height,
 					NOUVEAU_BO_GART, NOUVEAU_BO_VRAM |
 					NOUVEAU_BO_GART);
 	}
diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c
index f81929f238..eeed148c7b 100644
--- a/src/gallium/drivers/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nv50/nv50_vbo.c
@@ -49,6 +49,82 @@ nv50_prim(unsigned mode)
 	return NV50TCL_VERTEX_BEGIN_POINTS;
 }
 
+static INLINE uint32_t
+nv50_vbo_type_to_hw(unsigned type)
+{	/* Map a PIPE_FORMAT_TYPE_* value to its VERTEX_ARRAY_ATTRIB_TYPE bits. */
+	switch (type) {
+	case PIPE_FORMAT_TYPE_FLOAT:
+		return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_FLOAT;
+	case PIPE_FORMAT_TYPE_UNORM:
+		return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UNORM;
+	case PIPE_FORMAT_TYPE_SNORM:
+		return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SNORM;
+	case PIPE_FORMAT_TYPE_USCALED:
+		return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_USCALED;
+	case PIPE_FORMAT_TYPE_SSCALED:
+		return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SSCALED;
+	/* UINT/SINT mappings are currently disabled:
+	case PIPE_FORMAT_TYPE_UINT:
+		return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UINT;
+	case PIPE_FORMAT_TYPE_SINT:
+		return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SINT; */
+	default:
+		return 0; /* treated as "unsupported" by nv50_vbo_vtxelt_to_hw() */
+	}
+}
+
+static INLINE uint32_t
+nv50_vbo_size_to_hw(unsigned size, unsigned nr_c)
+{
+	static const uint32_t hw_values[] = {
+		0, 0, 0, 0,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_8,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_8_8,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_8_8_8,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_8_8_8_8,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_16,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_16_16,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_16_16_16,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_16_16_16_16,
+		0, 0, 0, 0,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_32,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_32_32,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_32_32_32,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_32_32_32_32 };
+	/* Rows of four entries, indexed by (size / 2) + (nr_c - 1); only the
+	 * 8-, 16- and 32-bit sizes have rows.  We'd also have R11G11B10 and
+	 * R10G10B10A2, but there are no entries for those. Returns 0 if unknown. */
+	assert(nr_c > 0 && nr_c <= 4);
+
+	if (size != 8 && size != 16 && size != 32) /* no row for other sizes */
+		return 0;
+	size >>= (3 - 2); /* 8 -> 4, 16 -> 8, 32 -> 16: first index of the row */
+
+	return hw_values[size + (nr_c - 1)];
+}
+
+static INLINE uint32_t
+nv50_vbo_vtxelt_to_hw(struct pipe_vertex_element *ve)
+{
+	uint32_t hw_type, hw_size;
+	enum pipe_format pf = ve->src_format;
+	unsigned size = pf_size_x(pf) << pf_exp2(pf); /* size of the X channel */
+
+	/* Build the VERTEX_ARRAY_ATTRIB word; either helper yields 0 on failure. */
+	hw_type = nv50_vbo_type_to_hw(pf_type(pf));
+	hw_size = nv50_vbo_size_to_hw(size, ve->nr_components);
+	if (!hw_type || !hw_size) {
+		NOUVEAU_ERR("unsupported vbo format: %s\n", pf_name(pf));
+		abort();
+		return 0x24e80000;
+	}
+
+	if (pf_swizzle_x(pf) == 2) /* BGRA */
+		hw_size |= (1u << 31); /* no real swizzle bits :-( ; 1u avoids signed-shift UB */
+
+	return (hw_type | hw_size);
+}
+
 boolean
 nv50_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start,
 		 unsigned count)
@@ -102,7 +178,7 @@ nv50_draw_elements_inline_u08(struct nv50_context *nv50, uint8_t *map,
 
 		BEGIN_RING(chan, tesla, 0x400015f0, nr >> 1);
 		for (i = 0; i < nr; i += 2)
-			OUT_RING  (chan, (map[1] << 16) | map[0]);
+			OUT_RING  (chan, (map[i + 1] << 16) | map[i]);
 
 		count -= nr;
 		map += nr;
@@ -131,7 +207,7 @@ nv50_draw_elements_inline_u16(struct nv50_context *nv50, uint16_t *map,
 
 		BEGIN_RING(chan, tesla, 0x400015f0, nr >> 1);
 		for (i = 0; i < nr; i += 2)
-			OUT_RING  (chan, (map[1] << 16) | map[0]);
+			OUT_RING  (chan, (map[i + 1] << 16) | map[i]);
 
 		count -= nr;
 		map += nr;
@@ -139,7 +215,7 @@ nv50_draw_elements_inline_u16(struct nv50_context *nv50, uint16_t *map,
 }
 
 static INLINE void
-nv50_draw_elements_inline_u32(struct nv50_context *nv50, uint8_t *map,
+nv50_draw_elements_inline_u32(struct nv50_context *nv50, uint32_t *map,
 			      unsigned start, unsigned count)
 {
 	struct nouveau_channel *chan = nv50->screen->tesla->channel;
@@ -201,49 +277,101 @@ nv50_draw_elements(struct pipe_context *pipe,
 	return TRUE;
 }
 
+static INLINE boolean
+nv50_vbo_static_attrib(struct nv50_context *nv50, unsigned attrib,
+		       struct nouveau_stateobj **pso,
+		       struct pipe_vertex_element *ve,
+		       struct pipe_vertex_buffer *vb)
+
+{
+	struct nouveau_stateobj *so;
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nouveau_bo *bo = nouveau_bo(vb->buffer);
+	float *v;
+	int ret;
+	enum pipe_format pf = ve->src_format;
+	/* Emit one element as VTX_ATTR_*F immediates; 32-bit floats only. */
+	if ((pf_type(pf) != PIPE_FORMAT_TYPE_FLOAT) ||
+	    (pf_size_x(pf) << pf_exp2(pf)) != 32)
+		return FALSE;
+	/* Read the values directly from the mapped vertex buffer. */
+	ret = nouveau_bo_map(bo, NOUVEAU_BO_RD);
+	if (ret)
+		return FALSE;
+	v = (float *)(bo->map + (vb->buffer_offset + ve->src_offset));
+	/* Create the shared stateobj on first use: up to 5 words per element. */
+	so = *pso;
+	if (!so)
+		*pso = so = so_new(nv50->vtxelt_nr * 5, 0);
+
+	switch (ve->nr_components) {
+	case 4:
+		so_method(so, tesla, NV50TCL_VTX_ATTR_4F_X(attrib), 4);
+		so_data  (so, fui(v[0]));
+		so_data  (so, fui(v[1]));
+		so_data  (so, fui(v[2]));
+		so_data  (so, fui(v[3]));
+		break;
+	case 3:
+		so_method(so, tesla, NV50TCL_VTX_ATTR_3F_X(attrib), 3);
+		so_data  (so, fui(v[0]));
+		so_data  (so, fui(v[1]));
+		so_data  (so, fui(v[2]));
+		break;
+	case 2:
+		so_method(so, tesla, NV50TCL_VTX_ATTR_2F_X(attrib), 2);
+		so_data  (so, fui(v[0]));
+		so_data  (so, fui(v[1]));
+		break;
+	case 1:
+		so_method(so, tesla, NV50TCL_VTX_ATTR_1F(attrib), 1);
+		so_data  (so, fui(v[0]));
+		break;
+	default: /* unexpected component count: nothing emitted, caller falls back */
+		nouveau_bo_unmap(bo);
+		return FALSE;
+	}
+
+	nouveau_bo_unmap(bo);
+	return TRUE;
+}
+
 void
 nv50_vbo_validate(struct nv50_context *nv50)
 {
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
-	struct nouveau_stateobj *vtxbuf, *vtxfmt;
-	int i;
+	struct nouveau_stateobj *vtxbuf, *vtxfmt, *vtxattr;
+	unsigned i;
+
+	/* don't validate if Gallium took away our buffers */
+	if (nv50->vtxbuf_nr == 0)
+		return;
 
-	vtxbuf = so_new(nv50->vtxelt_nr * 4, nv50->vtxelt_nr * 2);
+	vtxattr = NULL;
+	vtxbuf = so_new(nv50->vtxelt_nr * 7, nv50->vtxelt_nr * 4);
 	vtxfmt = so_new(nv50->vtxelt_nr + 1, 0);
-	so_method(vtxfmt, tesla, 0x1ac0, nv50->vtxelt_nr);
+	so_method(vtxfmt, tesla, NV50TCL_VERTEX_ARRAY_ATTRIB(0),
+		nv50->vtxelt_nr);
 
 	for (i = 0; i < nv50->vtxelt_nr; i++) {
 		struct pipe_vertex_element *ve = &nv50->vtxelt[i];
 		struct pipe_vertex_buffer *vb =
 			&nv50->vtxbuf[ve->vertex_buffer_index];
 		struct nouveau_bo *bo = nouveau_bo(vb->buffer);
+		uint32_t hw = nv50_vbo_vtxelt_to_hw(ve);
 
-		switch (ve->src_format) {
-		case PIPE_FORMAT_R32G32B32A32_FLOAT:
-			so_data(vtxfmt, 0x7e080000 | i);
-			break;
-		case PIPE_FORMAT_R32G32B32_FLOAT:
-			so_data(vtxfmt, 0x7e100000 | i);
-			break;
-		case PIPE_FORMAT_R32G32_FLOAT:
-			so_data(vtxfmt, 0x7e200000 | i);
-			break;
-		case PIPE_FORMAT_R32_FLOAT:
-			so_data(vtxfmt, 0x7e900000 | i);
-			break;
-		case PIPE_FORMAT_R8G8B8A8_UNORM:
-			so_data(vtxfmt, 0x24500000 | i);
-			break;
-		default:
-		{
-			NOUVEAU_ERR("invalid vbo format %s\n",
-				    pf_name(ve->src_format));
-			assert(0);
-			return;
-		}
+		if (!vb->stride &&
+		    nv50_vbo_static_attrib(nv50, i, &vtxattr, ve, vb)) {
+			so_data(vtxfmt, hw | (1 << 4));
+
+			so_method(vtxbuf, tesla,
+				  NV50TCL_VERTEX_ARRAY_FORMAT(i), 1);
+			so_data  (vtxbuf, 0);
+			continue;
 		}
+		so_data(vtxfmt, hw | i);
 
-		so_method(vtxbuf, tesla, 0x900 + (i * 16), 3);
+		so_method(vtxbuf, tesla, NV50TCL_VERTEX_ARRAY_FORMAT(i), 3);
 		so_data  (vtxbuf, 0x20000000 | vb->stride);
 		so_reloc (vtxbuf, bo, vb->buffer_offset +
 			  ve->src_offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
@@ -251,11 +379,22 @@ nv50_vbo_validate(struct nv50_context *nv50)
 		so_reloc (vtxbuf, bo, vb->buffer_offset +
 			  ve->src_offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
 			  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
+
+		/* vertex array limits */
+		so_method(vtxbuf, tesla, 0x1080 + (i * 8), 2);
+		so_reloc (vtxbuf, bo, vb->buffer->size - 1,
+			  NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
+			  NOUVEAU_BO_HIGH, 0, 0);
+		so_reloc (vtxbuf, bo, vb->buffer->size - 1,
+			  NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
+			  NOUVEAU_BO_LOW, 0, 0);
 	}
 
 	so_ref (vtxfmt, &nv50->state.vtxfmt);
 	so_ref (vtxbuf, &nv50->state.vtxbuf);
+	so_ref (vtxattr, &nv50->state.vtxattr);
 	so_ref (NULL, &vtxbuf);
 	so_ref (NULL, &vtxfmt);
+	so_ref (NULL, &vtxattr);
 }
 
diff --git a/src/gallium/drivers/r300/Makefile b/src/gallium/drivers/r300/Makefile
index faceec9842..93c2152edc 100644
--- a/src/gallium/drivers/r300/Makefile
+++ b/src/gallium/drivers/r300/Makefile
@@ -21,6 +21,22 @@ C_SOURCES = \
 	r300_state_invariant.c \
 	r300_vs.c \
 	r300_surface.c \
-	r300_texture.c
+	r300_texture.c \
+	r300_tgsi_to_rc.c
+
+LIBRARY_INCLUDES = \
+	-I$(TOP)/src/mesa/drivers/dri/r300/compiler \
+	-I$(TOP)/src/mesa \
+	-I$(TOP)/include
+
+COMPILER_ARCHIVE = $(TOP)/src/mesa/drivers/dri/r300/compiler/libr300compiler.a
+
+EXTRA_OBJECTS = \
+	$(COMPILER_ARCHIVE)
 
 include ../../Makefile.template
+
+.PHONY : $(COMPILER_ARCHIVE)
+
+$(COMPILER_ARCHIVE):
+	cd $(TOP)/src/mesa/drivers/dri/r300/compiler; make
diff --git a/src/gallium/drivers/r300/r300_chipset.c b/src/gallium/drivers/r300/r300_chipset.c
index 00fae8d26f..d138866d33 100644
--- a/src/gallium/drivers/r300/r300_chipset.c
+++ b/src/gallium/drivers/r300/r300_chipset.c
@@ -30,9 +30,11 @@
 void r300_parse_chipset(struct r300_capabilities* caps)
 {
     /* Reasonable defaults */
+    caps->num_vert_fpus = 4;
     caps->has_tcl = getenv("RADEON_NO_TCL") ? FALSE : TRUE;
     caps->is_r500 = FALSE;
-    caps->num_vert_fpus = 4;
+    caps->high_second_pipe = FALSE;
+
 
     /* Note: These are not ordered by PCI ID. I leave that task to GCC,
      * which will perform the ordering while collating jump tables. Instead,
@@ -40,6 +42,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
     switch (caps->pci_id) {
         case 0x4144:
             caps->family = CHIP_FAMILY_R300;
+            caps->high_second_pipe = TRUE;
             break;
 
         case 0x4145:
@@ -50,6 +53,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
         case 0x4E46:
         case 0x4E47:
             caps->family = CHIP_FAMILY_R300;
+            caps->high_second_pipe = TRUE;
             break;
 
         case 0x4150:
@@ -66,6 +70,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
         case 0x4E54:
         case 0x4E56:
             caps->family = CHIP_FAMILY_RV350;
+            caps->high_second_pipe = TRUE;
             break;
 
         case 0x4148:
@@ -76,10 +81,12 @@ void r300_parse_chipset(struct r300_capabilities* caps)
         case 0x4E49:
         case 0x4E4B:
             caps->family = CHIP_FAMILY_R350;
+            caps->high_second_pipe = TRUE;
             break;
 
         case 0x4E4A:
             caps->family = CHIP_FAMILY_R360;
+            caps->high_second_pipe = TRUE;
             break;
 
         case 0x5460:
@@ -91,6 +98,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
         case 0x5B64:
         case 0x5B65:
             caps->family = CHIP_FAMILY_RV370;
+            caps->high_second_pipe = TRUE;
             break;
 
         case 0x3150:
@@ -99,6 +107,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
         case 0x3E50:
         case 0x3E54:
             caps->family = CHIP_FAMILY_RV380;
+            caps->high_second_pipe = TRUE;
             break;
 
         case 0x4A48:
diff --git a/src/gallium/drivers/r300/r300_chipset.h b/src/gallium/drivers/r300/r300_chipset.h
index 5b2e1f0568..322d4a57e4 100644
--- a/src/gallium/drivers/r300/r300_chipset.h
+++ b/src/gallium/drivers/r300/r300_chipset.h
@@ -34,6 +34,8 @@ struct r300_capabilities {
     int family;
     /* The number of vertex floating-point units */
     int num_vert_fpus;
+    /* The number of fragment pipes */
+    int num_frag_pipes;
     /* Whether or not TCL is physically present */
     boolean has_tcl;
     /* Whether or not this is an RV515 or newer; R500s have many differences
@@ -42,6 +44,8 @@ struct r300_capabilities {
      * - Blend color is split across two registers
      * - Universal Shader (US) block used for fragment shaders */
     boolean is_r500;
+    /* Whether or not the second pixel pipe is accessed with the high bit */
+    boolean high_second_pipe;
 };
 
 /* Enumerations for legibility and telling which card we're running on. */
diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
index 233a32b53c..9cc455135d 100644
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -22,6 +22,9 @@
 
 #include "r300_context.h"
 
+#include "r300_flush.h"
+#include "r300_state_invariant.h"
+
 static boolean r300_draw_range_elements(struct pipe_context* pipe,
                                         struct pipe_buffer* indexBuffer,
                                         unsigned indexSize,
@@ -52,7 +55,7 @@ static boolean r300_draw_range_elements(struct pipe_context* pipe,
 
     draw_set_mapped_constant_buffer(r300->draw,
             r300->shader_constants[PIPE_SHADER_VERTEX].constants,
-            r300->shader_constants[PIPE_SHADER_VERTEX].user_count *
+            r300->shader_constants[PIPE_SHADER_VERTEX].count *
                 (sizeof(float) * 4));
 
     draw_arrays(r300->draw, mode, start, count);
@@ -88,9 +91,21 @@ static boolean r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
 
 static void r300_destroy_context(struct pipe_context* context) {
     struct r300_context* r300 = r300_context(context);
+    struct r300_query* query, * temp;
 
     draw_destroy(r300->draw);
 
+    /* Free the OQ BO. */
+    context->screen->buffer_destroy(r300->oqbo);
+
+    /* If there are any queries pending or not destroyed, remove them now. */
+    if (r300->query_list) {
+        foreach_s(query, temp, r300->query_list) {
+            remove_from_list(query);
+            FREE(query);
+        }
+    }
+
     FREE(r300->blend_color_state);
     FREE(r300->rs_block);
     FREE(r300->scissor_state);
@@ -134,6 +149,8 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
     r300->context.winsys = (struct pipe_winsys*)r300_winsys;
     r300->context.screen = r300_screen(screen);
 
+    r300_init_debug(r300);
+
     r300->context.destroy = r300_destroy_context;
 
     r300->context.clear = r300_clear;
@@ -145,6 +162,11 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
     r300->context.is_texture_referenced = r300_is_texture_referenced;
     r300->context.is_buffer_referenced = r300_is_buffer_referenced;
 
+    r300->blend_color_state = CALLOC_STRUCT(r300_blend_color_state);
+    r300->rs_block = CALLOC_STRUCT(r300_rs_block);
+    r300->scissor_state = CALLOC_STRUCT(r300_scissor_state);
+    r300->viewport_state = CALLOC_STRUCT(r300_viewport_state);
+
     /* Create a Draw. This is used for vert collation and SW TCL. */
     r300->draw = draw_create();
     /* Enable our renderer. */
@@ -155,10 +177,9 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
      * transform in hardware, always. */
     draw_set_viewport_state(r300->draw, &r300_viewport_identity);
 
-    r300->blend_color_state = CALLOC_STRUCT(r300_blend_color_state);
-    r300->rs_block = CALLOC_STRUCT(r300_rs_block);
-    r300->scissor_state = CALLOC_STRUCT(r300_scissor_state);
-    r300->viewport_state = CALLOC_STRUCT(r300_viewport_state);
+    /* Open up the OQ BO. */
+    r300->oqbo = screen->buffer_create(screen, 4096,
+            PIPE_BUFFER_USAGE_VERTEX, 4096);
 
     r300_init_flush_functions(r300);
 
diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h
index d891fd6265..52b1c9a6b2 100644
--- a/src/gallium/drivers/r300/r300_context.h
+++ b/src/gallium/drivers/r300/r300_context.h
@@ -25,15 +25,22 @@
 
 #include "draw/draw_context.h"
 #include "draw/draw_vertex.h"
+
 #include "pipe/p_context.h"
+
 #include "tgsi/tgsi_scan.h"
+
 #include "util/u_memory.h"
+#include "util/u_simple_list.h"
 
 #include "r300_clear.h"
 #include "r300_query.h"
 #include "r300_screen.h"
 #include "r300_winsys.h"
 
+struct r300_fragment_shader;
+struct r300_vertex_shader;
+
 struct r300_blend_state {
     uint32_t blend_control;       /* R300_RB3D_CBLEND: 0x4e04 */
     uint32_t alpha_blend_control; /* R300_RB3D_ABLEND: 0x4e08 */
@@ -143,69 +150,31 @@ struct r300_constant_buffer {
     /* Buffer of constants */
     /* XXX first number should be raised */
     float constants[32][4];
-    /* Number of user-defined constants */
-    unsigned user_count;
     /* Total number of constants */
     unsigned count;
 };
 
-struct r300_fragment_shader {
-    /* Parent class */
-    struct pipe_shader_state state;
-    struct tgsi_shader_info info;
-
-    /* Has this shader been translated yet? */
-    boolean translated;
-
-    /* Pixel stack size */
-    int stack_size;
-
-    /* Are there immediates in this shader?
-     * If not, we can heavily optimize recompilation. */
-    boolean uses_imms;
-};
-
-struct r3xx_fragment_shader {
-    /* Parent class */
-    struct r300_fragment_shader shader;
-
-    /* Number of ALU instructions */
-    int alu_instruction_count;
-
-    /* Number of texture instructions */
-    int tex_instruction_count;
-
-    /* Number of texture indirections */
-    int indirections;
-
-    /* Indirection node offsets */
-    int alu_offset[4];
-
-    /* Machine instructions */
-    struct {
-        uint32_t alu_rgb_inst;
-        uint32_t alu_rgb_addr;
-        uint32_t alu_alpha_inst;
-        uint32_t alu_alpha_addr;
-    } instructions[64]; /* XXX magic num */
-};
-
-struct r5xx_fragment_shader {
-    /* Parent class */
-    struct r300_fragment_shader shader;
-
-    /* Number of used instructions */
-    int instruction_count;
-
-    /* Machine instructions */
-    struct {
-        uint32_t inst0;
-        uint32_t inst1;
-        uint32_t inst2;
-        uint32_t inst3;
-        uint32_t inst4;
-        uint32_t inst5;
-    } instructions[256]; /*< XXX magic number */
+/* Query object.
+ *
+ * This is not a subclass of pipe_query because pipe_query is never
+ * actually fully defined. So, rather than have it as a member, and do
+ * subclass-style casting, we treat pipe_query as an opaque type, and just
+ * trust that our state tracker does not ever mess up query objects.
+ */
+struct r300_query {
+    /* The kind of query. Currently only OQ is supported. */
+    unsigned type;
+    /* Whether this query is currently active. Only active queries will
+     * get emitted into the command stream, and only active queries get
+     * tallied. */
+    boolean active;
+    /* The current count of this query. Required to be at least 32 bits. */
+    unsigned int count;
+    /* The offset of this query into the query buffer, in bytes. */
+    unsigned offset;
+    /* Linked list members. */
+    struct r300_query* prev;
+    struct r300_query* next;
 };
 
 struct r300_texture {
@@ -215,8 +184,15 @@ struct r300_texture {
     /* Offsets into the buffer. */
     unsigned offset[PIPE_MAX_TEXTURE_LEVELS];
 
-    /* Stride (pitch?) of this texture in bytes */
-    unsigned stride;
+    /**
+     * If non-zero, override the natural texture layout with
+     * a custom stride (in bytes).
+     *
+     * \note Mipmapping fails for textures with a non-natural layout!
+     *
+     * \sa r300_texture_get_stride
+     */
+    unsigned stride_override;
 
     /* Total size of this texture, in bytes. */
     unsigned size;
@@ -242,37 +218,7 @@ struct r300_vertex_format {
     int fs_tab[16];
 };
 
-struct r300_vertex_shader {
-    /* Parent class */
-    struct pipe_shader_state state;
-    struct tgsi_shader_info info;
-
-    /* Fallback shader, because Draw has issues */
-    struct draw_vertex_shader* draw;
-
-    /* Has this shader been translated yet? */
-    boolean translated;
-
-    /* Are there immediates in this shader?
-     * If not, we can heavily optimize recompilation. */
-    boolean uses_imms;
-
-    /* Number of used instructions */
-    int instruction_count;
-
-    /* Machine instructions */
-    struct {
-        uint32_t inst0;
-        uint32_t inst1;
-        uint32_t inst2;
-        uint32_t inst3;
-    } instructions[128]; /*< XXX magic number */
-};
-
-static struct pipe_viewport_state r300_viewport_identity = {
-    .scale = {1.0, 1.0, 1.0, 1.0},
-    .translate = {0.0, 0.0, 0.0, 0.0},
-};
+extern struct pipe_viewport_state r300_viewport_identity;
 
 struct r300_context {
     /* Parent class */
@@ -288,6 +234,11 @@ struct r300_context {
     /* Offset into the VBO. */
     size_t vbo_offset;
 
+    /* Occlusion query buffer. */
+    struct pipe_buffer* oqbo;
+    /* Query list. */
+    struct r300_query* query_list;
+
     /* Various CSO state objects. */
     /* Blend state. */
     struct r300_blend_state* blend_state;
@@ -328,6 +279,9 @@ struct r300_context {
     uint32_t dirty_state;
     /* Flag indicating whether or not the HW is dirty. */
     uint32_t dirty_hw;
+
+    /** Combination of DBG_xxx flags */
+    unsigned debug;
 };
 
 /* Convenience cast wrapper. */
@@ -341,4 +295,40 @@ struct draw_stage* r300_draw_stage(struct r300_context* r300);
 void r300_init_state_functions(struct r300_context* r300);
 void r300_init_surface_functions(struct r300_context* r300);
 
+/* Debug functionality. */
+
+/**
+ * Debug flags to disable/enable certain groups of debugging outputs.
+ *
+ * \note These may be rather coarse, and the grouping may be impractical.
+ * If you find, while debugging the driver, that a different grouping
+ * of these flags would be beneficial, just feel free to change them
+ * but make sure to update the documentation in r300_debug.c to reflect
+ * those changes.
+ */
+/*@{*/
+#define DBG_HELP    0x0000001
+#define DBG_FP      0x0000002
+#define DBG_VP      0x0000004
+#define DBG_CS      0x0000008
+#define DBG_DRAW    0x0000010
+/*@}*/
+/** Return true if any of the DBG_xxx bits in flags is set in ctx->debug. */
+static INLINE boolean DBG_ON(struct r300_context * ctx, unsigned flags)
+{
+    return (ctx->debug & flags) ? true : false;
+}
+/** printf-style debug output through debug_vprintf(), emitted only when DBG_ON(ctx, flags). */
+static INLINE void DBG(struct r300_context * ctx, unsigned flags, const char * fmt, ...)
+{
+    if (DBG_ON(ctx, flags)) {
+        va_list va;
+        va_start(va, fmt);
+        debug_vprintf(fmt, va);
+        va_end(va);
+    }
+}
+
+void r300_init_debug(struct r300_context * ctx);
+
 #endif /* R300_CONTEXT_H */
diff --git a/src/gallium/drivers/r300/r300_cs.h b/src/gallium/drivers/r300/r300_cs.h
index 71b142c0db..0a7e470363 100644
--- a/src/gallium/drivers/r300/r300_cs.h
+++ b/src/gallium/drivers/r300/r300_cs.h
@@ -49,7 +49,8 @@
     (RADEON_CP_PACKET0 | ((count) << 16) | ((register) >> 2))
 
 #define CS_LOCALS(context) \
-    struct r300_winsys* cs_winsys = context->winsys; \
+    struct r300_context* const cs_context_copy = (context); \
+    struct r300_winsys* cs_winsys = cs_context_copy->winsys; \
     int cs_count = 0;
 
 #define CHECK_CS(size) \
@@ -58,7 +59,7 @@
 #define BEGIN_CS(size) do { \
     CHECK_CS(size); \
     if (VERY_VERBOSE_CS) { \
-        debug_printf("r300: BEGIN_CS, count %d, in %s (%s:%d)\n", \
+        DBG(cs_context_copy, DBG_CS, "r300: BEGIN_CS, count %d, in %s (%s:%d)\n", \
                 size, __FUNCTION__, __FILE__, __LINE__); \
     } \
     cs_winsys->begin_cs(cs_winsys, (size), \
@@ -78,7 +79,7 @@
 
 #define OUT_CS_REG(register, value) do { \
     if (VERY_VERBOSE_REGISTERS) \
-        debug_printf("r300: writing 0x%08X to register 0x%04X\n", \
+        DBG(cs_context_copy, DBG_CS, "r300: writing 0x%08X to register 0x%04X\n", \
             value, register); \
     assert(register); \
     OUT_CS(CP_PACKET0(register, 0)); \
@@ -89,14 +90,14 @@
  * not the actual packet0 count! */
 #define OUT_CS_REG_SEQ(register, count) do { \
     if (VERY_VERBOSE_REGISTERS) \
-        debug_printf("r300: writing register sequence of %d to 0x%04X\n", \
+        DBG(cs_context_copy, DBG_CS, "r300: writing register sequence of %d to 0x%04X\n", \
             count, register); \
     assert(register); \
     OUT_CS(CP_PACKET0(register, ((count) - 1))); \
 } while (0)
 
 #define OUT_CS_RELOC(bo, offset, rd, wd, flags) do { \
-    debug_printf("r300: writing relocation for buffer %p, offset %d, " \
+    DBG(cs_context_copy, DBG_CS, "r300: writing relocation for buffer %p, offset %d, " \
             "domains (%d, %d, %d)\n", \
         bo, offset, rd, wd, flags); \
     assert(bo); \
@@ -107,7 +108,7 @@
 
 #define END_CS do { \
     if (VERY_VERBOSE_CS) { \
-        debug_printf("r300: END_CS in %s (%s:%d)\n", __FUNCTION__, \
+        DBG(cs_context_copy, DBG_CS, "r300: END_CS in %s (%s:%d)\n", __FUNCTION__, \
                 __FILE__, __LINE__); \
     } \
     if (cs_count != 0) \
@@ -117,7 +118,7 @@
 
 #define FLUSH_CS do { \
     if (VERY_VERBOSE_CS) { \
-        debug_printf("r300: FLUSH_CS in %s (%s:%d)\n\n", __FUNCTION__, \
+        DBG(cs_context_copy, DBG_CS, "r300: FLUSH_CS in %s (%s:%d)\n\n", __FUNCTION__, \
                 __FILE__, __LINE__); \
     } \
     cs_winsys->flush_cs(cs_winsys); \
@@ -127,7 +128,7 @@
 
 #define OUT_CS_ONE_REG(register, count) do { \
     if (VERY_VERBOSE_REGISTERS) \
-        debug_printf("r300: writing data sequence of %d to 0x%04X\n", \
+        DBG(cs_context_copy, DBG_CS, "r300: writing data sequence of %d to 0x%04X\n", \
             count, register); \
     assert(register); \
     OUT_CS(CP_PACKET0(register, ((count) - 1)) | RADEON_ONE_REG_WR); \
@@ -141,7 +142,7 @@
 } while (0)
 
 #define OUT_CS_INDEX_RELOC(bo, offset, count, rd, wd, flags) do { \
-    debug_printf("r300: writing relocation for index buffer %p," \
+    DBG(cs_context_copy, DBG_CS, "r300: writing relocation for index buffer %p," \
             "offset %d\n", bo, offset); \
     assert(bo); \
     OUT_CS(offset); \
diff --git a/src/gallium/drivers/r300/r300_debug.c b/src/gallium/drivers/r300/r300_debug.c
index c83e8526cf..15308dda1d 100644
--- a/src/gallium/drivers/r300/r300_debug.c
+++ b/src/gallium/drivers/r300/r300_debug.c
@@ -1,5 +1,5 @@
 /*
- * Copyright 2009 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * Copyright 2009 Nicolai Haehnle <nhaehnle@gmail.com>
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -20,179 +20,69 @@
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
 
-#include "r300_debug.h"
+#include "r300_context.h"
 
-void r3xx_dump_fs(struct r3xx_fragment_shader* fs)
-{
-    int i;
+#include <ctype.h>
 
-    for (i = 0; i < fs->alu_instruction_count; i++) {
-    }
-}
 
-void r5xx_fs_dump(struct r5xx_fragment_shader* fs)
-{
-    int i;
-    uint32_t inst;
+struct debug_option {
+    const char * name;
+    unsigned flag;
+    const char * description;
+};
 
-    for (i = 0; i < fs->instruction_count; i++) {
-        inst = fs->instructions[i].inst0;
-        debug_printf("%d:  0: CMN_INST   0x%08x:", i, inst);
-        switch (inst & 0x3) {
-            case R500_INST_TYPE_ALU:
-                debug_printf("ALU ");
-                break;
-            case R500_INST_TYPE_OUT:
-                debug_printf("OUT ");
-                break;
-            case R500_INST_TYPE_FC:
-                debug_printf("FC  ");
-                break;
-            case R500_INST_TYPE_TEX:
-                debug_printf("TEX ");
-                break;
-        }
-        debug_printf("%s %s %s %s ",
-                inst & R500_INST_TEX_SEM_WAIT ? "TEX_WAIT" : "",
-                inst & R500_INST_LAST ? "LAST" : "",
-                inst & R500_INST_NOP ? "NOP" : "",
-                inst & R500_INST_ALU_WAIT ? "ALU_WAIT" : "");
-        debug_printf("wmask: %s omask: %s\n",
-                r5xx_fs_mask[(inst >> 11) & 0xf],
-                r5xx_fs_mask[(inst >> 15) & 0xf]);
-        switch (inst & 0x3) {
-            case R500_INST_TYPE_ALU:
-            case R500_INST_TYPE_OUT:
-                inst = fs->instructions[i].inst1;
-                debug_printf("    1: RGB_ADDR   0x%08x:", inst);
-                debug_printf("Addr0: %d%c, Addr1: %d%c, "
-                        "Addr2: %d%c, srcp:%d\n",
-                        inst & 0xff, (inst & (1 << 8)) ? 'c' : 't',
-                        (inst >> 10) & 0xff, (inst & (1 << 18)) ? 'c' : 't',
-                        (inst >> 20) & 0xff, (inst & (1 << 28)) ? 'c' : 't',
-                        (inst >> 30));
+static struct debug_option debug_options[] = {
+    { "help", DBG_HELP, "Helpful meta-information about the driver" },
+    { "fp", DBG_FP, "Fragment program handling" },
+    { "vp", DBG_VP, "Vertex program handling" },
+    { "cs", DBG_CS, "Command submissions" },
+    { "draw", DBG_DRAW, "Draw and emit" },
 
-                inst = fs->instructions[i].inst2;
-                debug_printf("    2: ALPHA_ADDR 0x%08x:", inst);
-                debug_printf("Addr0: %d%c, Addr1: %d%c, "
-                        "Addr2: %d%c, srcp:%d\n",
-                        inst & 0xff, (inst & (1 << 8)) ? 'c' : 't',
-                        (inst >> 10) & 0xff, (inst & (1 << 18)) ? 'c' : 't',
-                        (inst >> 20) & 0xff, (inst & (1 << 28)) ? 'c' : 't',
-                        (inst >> 30));
+    { "all", ~0, "Convenience option that enables all debug flags" },
 
-                inst = fs->instructions[i].inst3;
-                debug_printf("    3: RGB_INST   0x%08x:", inst);
-                debug_printf("rgb_A_src:%d %s/%s/%s %d "
-                        "rgb_B_src:%d %s/%s/%s %d\n",
-                        inst & 0x3, r5xx_fs_swiz[(inst >> 2) & 0x7],
-                        r5xx_fs_swiz[(inst >> 5) & 0x7],
-                        r5xx_fs_swiz[(inst >> 8) & 0x7],
-                        (inst >> 11) & 0x3, (inst >> 13) & 0x3,
-                        r5xx_fs_swiz[(inst >> 15) & 0x7],
-                        r5xx_fs_swiz[(inst >> 18) & 0x7],
-                        r5xx_fs_swiz[(inst >> 21) & 0x7],
-                        (inst >> 24) & 0x3);
+    /* must be last */
+    { 0, 0, 0 }
+};
 
-                inst = fs->instructions[i].inst4;
-                debug_printf("    4: ALPHA_INST 0x%08x:", inst);
-                debug_printf("%s dest:%d%s alp_A_src:%d %s %d "
-                        "alp_B_src:%d %s %d w:%d\n",
-                        r5xx_fs_op_alpha[inst & 0xf], (inst >> 4) & 0x7f,
-                        inst & (1<<11) ? "(rel)":"", (inst >> 12) & 0x3,
-                        r5xx_fs_swiz[(inst >> 14) & 0x7], (inst >> 17) & 0x3,
-                        (inst >> 19) & 0x3, r5xx_fs_swiz[(inst >> 21) & 0x7],
-                        (inst >> 24) & 0x3, (inst >> 31) & 0x1);
+void r300_init_debug(struct r300_context * ctx)
+{
+    const char * options = debug_get_option("RADEON_DEBUG", 0);
+    boolean printhint = false;
 
-                inst = fs->instructions[i].inst5;
-                debug_printf("    5: RGBA_INST  0x%08x:", inst);
-                debug_printf("%s dest:%d%s rgb_C_src:%d %s/%s/%s %d "
-                        "alp_C_src:%d %s %d\n",
-                        r5xx_fs_op_rgb[inst & 0xf], (inst >> 4) & 0x7f,
-                        inst & (1 << 11) ? "(rel)":"", (inst >> 12) & 0x3,
-                        r5xx_fs_swiz[(inst >> 14) & 0x7],
-                        r5xx_fs_swiz[(inst >> 17) & 0x7],
-                        r5xx_fs_swiz[(inst >> 20) & 0x7],
-                        (inst >> 23) & 0x3, (inst >> 25) & 0x3,
-                        r5xx_fs_swiz[(inst >> 27) & 0x7], (inst >> 30) & 0x3);
-                break;
-            case R500_INST_TYPE_FC:
-                /* XXX don't even bother yet */
-                break;
-            case R500_INST_TYPE_TEX:
-                inst = fs->instructions[i].inst1;
-                debug_printf("    1: TEX_INST   0x%08x: id: %d "
-                        "op:%s, %s, %s %s\n",
-                        inst, (inst >> 16) & 0xf,
-                        r5xx_fs_tex[(inst >> 22) & 0x7],
-                        (inst & (1 << 25)) ? "ACQ" : "",
-                        (inst & (1 << 26)) ? "IGNUNC" : "",
-                        (inst & (1 << 27)) ? "UNSCALED" : "SCALED");
+    if (options) {
+        while(*options) {
+            if (*options == ' ' || *options == ',') {
+                options++;
+                continue;
+            }
 
-                inst = fs->instructions[i].inst2;
-                debug_printf("    2: TEX_ADDR   0x%08x: "
-                        "src: %d%s %s/%s/%s/%s dst: %d%s %s/%s/%s/%s\n",
-                        inst, inst & 0x7f, inst & (1 << 7) ? "(rel)" : "",
-                        r5xx_fs_swiz[(inst >> 8) & 0x3],
-                        r5xx_fs_swiz[(inst >> 10) & 0x3],
-                        r5xx_fs_swiz[(inst >> 12) & 0x3],
-                        r5xx_fs_swiz[(inst >> 14) & 0x3],
-                        (inst >> 16) & 0x7f, inst & (1 << 23) ? "(rel)" : "",
-                        r5xx_fs_swiz[(inst >> 24) & 0x3],
-                        r5xx_fs_swiz[(inst >> 26) & 0x3],
-                        r5xx_fs_swiz[(inst >> 28) & 0x3],
-                        r5xx_fs_swiz[(inst >> 30) & 0x3]);
-                
-                inst = fs->instructions[i].inst3;
-                debug_printf("    3: TEX_DXDY   0x%08x\n", inst);
-                break;
-        }
-    }
-}
+            size_t length = strcspn(options, " ,");
+            struct debug_option * opt;
 
-static void r300_vs_op_dump(uint32_t op)
-{
-    debug_printf(" dst: %d%s op: ",
-            (op >> 13) & 0x7f, r300_vs_dst_debug[(op >> 8) & 0x7]);
-    if (op & 0x80) {
-        if (op & 0x1) {
-            debug_printf("PVS_MACRO_OP_2CLK_M2X_ADD\n");
-        } else {
-            debug_printf("   PVS_MACRO_OP_2CLK_MADD\n");
-        }
-    } else if (op & 0x40) {
-        debug_printf("%s\n", r300_vs_me_ops[op & 0x1f]);
-    } else {
-        debug_printf("%s\n", r300_vs_ve_ops[op & 0x1f]);
-    }
-}
+            for(opt = debug_options; opt->name; ++opt) { /* whole token must match, not just a prefix */
+                if (!strncmp(options, opt->name, length) && opt->name[length] == '\0') {
+                    ctx->debug |= opt->flag;
+                    break;
+                }
+            }
 
-void r300_vs_src_dump(uint32_t src)
-{
-    debug_printf(" reg: %d%s swiz: %s%s/%s%s/%s%s/%s%s\n",
-            (src >> 5) & 0x7f, r300_vs_src_debug[src & 0x3],
-            src & (1 << 25) ? "-" : " ",
-            r300_vs_swiz_debug[(src >> 13) & 0x7],
-            src & (1 << 26) ? "-" : " ",
-            r300_vs_swiz_debug[(src >> 16) & 0x7],
-            src & (1 << 27) ? "-" : " ",
-            r300_vs_swiz_debug[(src >> 19) & 0x7],
-            src & (1 << 28) ? "-" : " ",
-            r300_vs_swiz_debug[(src >> 22) & 0x7]);
-}
+            if (!opt->name) {
+                debug_printf("Unknown debug option: %s\n", options);
+                printhint = true;
+            }
 
-void r300_vs_dump(struct r300_vertex_shader* vs)
-{
-    int i;
+            options += length;
+        }
 
-    for (i = 0; i < vs->instruction_count; i++) {
-        debug_printf("%d: op: 0x%08x", i, vs->instructions[i].inst0);
-        r300_vs_op_dump(vs->instructions[i].inst0);
-        debug_printf(" src0: 0x%08x", vs->instructions[i].inst1);
-        r300_vs_src_dump(vs->instructions[i].inst1);
-        debug_printf(" src1: 0x%08x", vs->instructions[i].inst2);
-        r300_vs_src_dump(vs->instructions[i].inst2);
-        debug_printf(" src2: 0x%08x", vs->instructions[i].inst3);
-        r300_vs_src_dump(vs->instructions[i].inst3);
+        if (!ctx->debug)
+            printhint = true;
+    }
+
+    if (printhint || ctx->debug & DBG_HELP) {
+        debug_printf("You can enable debug output by setting the RADEON_DEBUG environment variable\n"
+                     "to a comma-separated list of debug options. Available options are:\n");
+        for(struct debug_option * opt = debug_options; opt->name; ++opt) {
+            debug_printf("    %s: %s\n", opt->name, opt->description);
+        }
     }
 }
diff --git a/src/gallium/drivers/r300/r300_debug.h b/src/gallium/drivers/r300/r300_debug.h
deleted file mode 100644
index 6b58c1e250..0000000000
--- a/src/gallium/drivers/r300/r300_debug.h
+++ /dev/null
@@ -1,211 +0,0 @@
-/*
- * Copyright 2009 Corbin Simpson <MostAwesomeDude@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE. */
-
-#ifndef R300_DEBUG_H
-#define R300_DEBUG_H
-
-#include "r300_reg.h"
-#include "r300_fs.h"
-#include "r300_vs.h"
-
-static char* r5xx_fs_swiz[] = {
-    " R",
-    " G",
-    " B",
-    " A",
-    " 0",
-    ".5",
-    " 1",
-    " U",
-};
-
-static char* r5xx_fs_op_rgb[] = {
-    "MAD",
-    "DP3",
-    "DP4",
-    "D2A",
-    "MIN",
-    "MAX",
-    "---",
-    "CND",
-    "CMP",
-    "FRC",
-    "SOP",
-    "MDH",
-    "MDV",
-};
-
-static char* r5xx_fs_op_alpha[] = {
-    "MAD",
-    " DP",
-    "MIN",
-    "MAX",
-    "---",
-    "CND",
-    "CMP",
-    "FRC",
-    "EX2",
-    "LN2",
-    "RCP",
-    "RSQ",
-    "SIN",
-    "COS",
-    "MDH",
-    "MDV",
-};
-
-static char* r5xx_fs_mask[] = {
-    "NONE",
-    "R   ",
-    " G  ",
-    "RG  ",
-    "  B ",
-    "R B ",
-    " GB ",
-    "RGB ",
-    "   A",
-    "R  A",
-    " G A",
-    "RG A",
-    "  BA",
-    "R BA",
-    " GBA",
-    "RGBA",
-};
-
-static char* r5xx_fs_tex[] = {
-    "    NOP",
-    "     LD",
-    "TEXKILL",
-    "   PROJ",
-    "LODBIAS",
-    "    LOD",
-    "   DXDY",
-};
-
-static char* r300_vs_ve_ops[] = {
-    /* R300 vector ops */
-    "                 VE_NO_OP",
-    "           VE_DOT_PRODUCT",
-    "              VE_MULTIPLY",
-    "                   VE_ADD",
-    "          VE_MULTIPLY_ADD",
-    "       VE_DISTANCE_FACTOR",
-    "              VE_FRACTION",
-    "               VE_MAXIMUM",
-    "               VE_MINIMUM",
-    "VE_SET_GREATER_THAN_EQUAL",
-    "         VE_SET_LESS_THAN",
-    "        VE_MULTIPLYX2_ADD",
-    "        VE_MULTIPLY_CLAMP",
-    "            VE_FLT2FIX_DX",
-    "        VE_FLT2FIX_DX_RND",
-    /* R500 vector ops */
-    "      VE_PRED_SET_EQ_PUSH",
-    "      VE_PRED_SET_GT_PUSH",
-    "     VE_PRED_SET_GTE_PUSH",
-    "     VE_PRED_SET_NEQ_PUSH",
-    "         VE_COND_WRITE_EQ",
-    "         VE_COND_WRITE_GT",
-    "        VE_COND_WRITE_GTE",
-    "        VE_COND_WRITE_NEQ",
-    "      VE_SET_GREATER_THAN",
-    "             VE_SET_EQUAL",
-    "         VE_SET_NOT_EQUAL",
-    "               (reserved)",
-    "               (reserved)",
-    "               (reserved)",
-};
-
-static char* r300_vs_me_ops[] = {
-    /* R300 math ops */
-    "                 ME_NO_OP",
-    "          ME_EXP_BASE2_DX",
-    "          ME_LOG_BASE2_DX",
-    "          ME_EXP_BASEE_FF",
-    "        ME_LIGHT_COEFF_DX",
-    "         ME_POWER_FUNC_FF",
-    "              ME_RECIP_DX",
-    "              ME_RECIP_FF",
-    "         ME_RECIP_SQRT_DX",
-    "         ME_RECIP_SQRT_FF",
-    "              ME_MULTIPLY",
-    "     ME_EXP_BASE2_FULL_DX",
-    "     ME_LOG_BASE2_FULL_DX",
-    " ME_POWER_FUNC_FF_CLAMP_B",
-    "ME_POWER_FUNC_FF_CLAMP_B1",
-    "ME_POWER_FUNC_FF_CLAMP_01",
-    "                   ME_SIN",
-    "                   ME_COS",
-    /* R500 math ops */
-    "        ME_LOG_BASE2_IEEE",
-    "            ME_RECIP_IEEE",
-    "       ME_RECIP_SQRT_IEEE",
-    "           ME_PRED_SET_EQ",
-    "           ME_PRED_SET_GT",
-    "          ME_PRED_SET_GTE",
-    "          ME_PRED_SET_NEQ",
-    "          ME_PRED_SET_CLR",
-    "          ME_PRED_SET_INV",
-    "          ME_PRED_SET_POP",
-    "      ME_PRED_SET_RESTORE",
-    "               (reserved)",
-    "               (reserved)",
-    "               (reserved)",
-};
-
-/* XXX refactor to avoid clashing symbols */
-static char* r300_vs_src_debug[] = {
-    "t",
-    "i",
-    "c",
-    "a",
-};
-
-static char* r300_vs_dst_debug[] = {
-    "t",
-    "a0",
-    "o",
-    "ox",
-    "a",
-    "i",
-    "u",
-    "u",
-};
-
-static char* r300_vs_swiz_debug[] = {
-    "X",
-    "Y",
-    "Z",
-    "W",
-    "0",
-    "1",
-    "U",
-    "U",
-};
-
-void r5xx_fs_dump(struct r5xx_fragment_shader* fs);
-void r3xx_dump_fs(struct r3xx_fragment_shader* fs);
-
-void r300_vs_dump(struct r300_vertex_shader* vs);
-
-#endif /* R300_DEBUG_H */
diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
index 7ba56cdc1d..a1b36ba2ed 100644
--- a/src/gallium/drivers/r300/r300_emit.c
+++ b/src/gallium/drivers/r300/r300_emit.c
@@ -24,6 +24,10 @@
 
 #include "r300_emit.h"
 
+#include "r300_fs.h"
+#include "r300_state_derived.h"
+#include "r300_vs.h"
+
 void r300_emit_blend_state(struct r300_context* r300,
                            struct r300_blend_state* blend)
 {
@@ -109,73 +113,158 @@ void r300_emit_dsa_state(struct r300_context* r300,
     END_CS;
 }
 
-void r300_emit_fragment_shader(struct r300_context* r300,
-                               struct r3xx_fragment_shader* fs)
+static const float * get_shader_constant(
+    struct r300_context * r300,
+    struct rc_constant * constant,
+    struct r300_constant_buffer * externals)
 {
+    static const float zero[4] = { 0.0, 0.0, 0.0, 0.0 };
+    switch(constant->Type) {
+        case RC_CONSTANT_EXTERNAL:
+            return externals->constants[constant->u.External];
+
+        case RC_CONSTANT_IMMEDIATE:
+            return constant->u.Immediate;
+
+        default:
+            debug_printf("r300: Implementation error: Unhandled constant type %i\n",
+                constant->Type);
+            return zero;
+    }
+}
+
+/* Convert a normal single-precision float into the 24-bit 7.16 format
+ * (sign in bit 23, 7-bit exponent in bits 22:16, 16-bit mantissa) used by
+ * the R300 fragment shader. Zero maps to 0; the exponent is not clamped. */
+static uint32_t pack_float24(float f)
+{
+    union {
+        float fl;
+        uint32_t u;
+    } u;
+    float mantissa;
+    int exponent;
+    uint32_t float24 = 0;
+
+    if (f == 0.0)
+        return 0;
+
+    u.fl = f;
+
+    mantissa = frexpf(f, &exponent);
+
+    /* Negative input: set the sign bit (bit 23), continue with magnitude */
+    if (mantissa < 0) {
+        float24 |= (1 << 23);
+        mantissa = mantissa * -1.0;
+    }
+    /* Exponent: bias 63, minus 1 since frexpf() scales to [0.5, 1) */
+    exponent += 62;
+    float24 |= (exponent << 16);
+    /* Keep the top 16 of the 23 IEEE mantissa bits (drop the 7 LSBs) */
+    float24 |= (u.u & 0x7FFFFF) >> 7;
+
+    return float24;
+}
+
+void r300_emit_fragment_program_code(struct r300_context* r300,
+                                     struct rX00_fragment_program_code* generic_code,
+                                     struct r300_constant_buffer* externals)
+{
+    struct r300_fragment_program_code * code = &generic_code->code.r300;
+    struct rc_constant_list * constants = &generic_code->constants;
     int i;
     CS_LOCALS(r300);
 
-    BEGIN_CS(22);
-
-    OUT_CS_REG(R300_US_CONFIG, fs->indirections);
-    OUT_CS_REG(R300_US_PIXSIZE, fs->shader.stack_size);
-    /* XXX figure out exactly how big the sizes are on this reg */
-    OUT_CS_REG(R300_US_CODE_OFFSET, 0x40);
-    /* XXX figure these ones out a bit better kthnx */
-    OUT_CS_REG(R300_US_CODE_ADDR_0, 0x0);
-    OUT_CS_REG(R300_US_CODE_ADDR_1, 0x0);
-    OUT_CS_REG(R300_US_CODE_ADDR_2, 0x0);
-    OUT_CS_REG(R300_US_CODE_ADDR_3, 0x40 | R300_RGBA_OUT);
-
-    for (i = 0; i < fs->alu_instruction_count; i++) {
-        OUT_CS_REG(R300_US_ALU_RGB_INST_0 + (4 * i),
-            fs->instructions[i].alu_rgb_inst);
-        OUT_CS_REG(R300_US_ALU_RGB_ADDR_0 + (4 * i),
-            fs->instructions[i].alu_rgb_addr);
-        OUT_CS_REG(R300_US_ALU_ALPHA_INST_0 + (4 * i),
-            fs->instructions[i].alu_alpha_inst);
-        OUT_CS_REG(R300_US_ALU_ALPHA_ADDR_0 + (4 * i),
-            fs->instructions[i].alu_alpha_addr);
+    BEGIN_CS(15 +
+             code->alu.length * 4 +
+             (code->tex.length ? (1 + code->tex.length) : 0) +
+             (constants->Count ? (1 + constants->Count * 4) : 0));
+
+    OUT_CS_REG(R300_US_CONFIG, code->config);
+    OUT_CS_REG(R300_US_PIXSIZE, code->pixsize);
+    OUT_CS_REG(R300_US_CODE_OFFSET, code->code_offset);
+
+    OUT_CS_REG_SEQ(R300_US_CODE_ADDR_0, 4);
+    for(i = 0; i < 4; ++i)
+        OUT_CS(code->code_addr[i]);
+
+    OUT_CS_REG_SEQ(R300_US_ALU_RGB_INST_0, code->alu.length);
+    for (i = 0; i < code->alu.length; i++)
+        OUT_CS(code->alu.inst[i].rgb_inst);
+
+    OUT_CS_REG_SEQ(R300_US_ALU_RGB_ADDR_0, code->alu.length);
+    for (i = 0; i < code->alu.length; i++)
+        OUT_CS(code->alu.inst[i].rgb_addr);
+
+    OUT_CS_REG_SEQ(R300_US_ALU_ALPHA_INST_0, code->alu.length);
+    for (i = 0; i < code->alu.length; i++)
+        OUT_CS(code->alu.inst[i].alpha_inst);
+
+    OUT_CS_REG_SEQ(R300_US_ALU_ALPHA_ADDR_0, code->alu.length);
+    for (i = 0; i < code->alu.length; i++)
+        OUT_CS(code->alu.inst[i].alpha_addr);
+
+    if (code->tex.length) {
+        OUT_CS_REG_SEQ(R300_US_TEX_INST_0, code->tex.length);
+        for(i = 0; i < code->tex.length; ++i)
+            OUT_CS(code->tex.inst[i]);
+    }
+
+    if (constants->Count) {
+        OUT_CS_ONE_REG(R300_PFS_PARAM_0_X, constants->Count * 4);
+        for(i = 0; i < constants->Count; ++i) {
+            const float * data = get_shader_constant(r300, &constants->Constants[i], externals);
+            OUT_CS(pack_float24(data[0]));
+            OUT_CS(pack_float24(data[1]));
+            OUT_CS(pack_float24(data[2]));
+            OUT_CS(pack_float24(data[3]));
+        }
     }
 
     END_CS;
 }
 
-void r500_emit_fragment_shader(struct r300_context* r300,
-                               struct r5xx_fragment_shader* fs)
+void r500_emit_fragment_program_code(struct r300_context* r300,
+                                     struct rX00_fragment_program_code* generic_code,
+                                     struct r300_constant_buffer* externals)
 {
+    struct r500_fragment_program_code * code = &generic_code->code.r500;
+    struct rc_constant_list * constants = &generic_code->constants;
     int i;
-    struct r300_constant_buffer* constants =
-        &r300->shader_constants[PIPE_SHADER_FRAGMENT];
     CS_LOCALS(r300);
 
-    BEGIN_CS(9 + (fs->instruction_count * 6) + (constants->count ? 3 : 0) +
-            (constants->count * 4));
-    OUT_CS_REG(R500_US_CONFIG, R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO);
-    OUT_CS_REG(R500_US_PIXSIZE, fs->shader.stack_size);
-    OUT_CS_REG(R500_US_CODE_ADDR, R500_US_CODE_START_ADDR(0) |
-            R500_US_CODE_END_ADDR(fs->instruction_count));
+    BEGIN_CS(13 +
+             ((code->inst_end + 1) * 6) +
+             (constants->Count ? (3 + (constants->Count * 4)) : 0));
+    OUT_CS_REG(R500_US_CONFIG, 0);
+    OUT_CS_REG(R500_US_PIXSIZE, code->max_temp_idx);
+    OUT_CS_REG(R500_US_CODE_RANGE,
+               R500_US_CODE_RANGE_ADDR(0) | R500_US_CODE_RANGE_SIZE(code->inst_end));
+    OUT_CS_REG(R500_US_CODE_OFFSET, 0);
+    OUT_CS_REG(R500_US_CODE_ADDR,
+               R500_US_CODE_START_ADDR(0) | R500_US_CODE_END_ADDR(code->inst_end));
 
     OUT_CS_REG(R500_GA_US_VECTOR_INDEX, R500_GA_US_VECTOR_INDEX_TYPE_INSTR);
-    OUT_CS_ONE_REG(R500_GA_US_VECTOR_DATA, fs->instruction_count * 6);
-    for (i = 0; i < fs->instruction_count; i++) {
-        OUT_CS(fs->instructions[i].inst0);
-        OUT_CS(fs->instructions[i].inst1);
-        OUT_CS(fs->instructions[i].inst2);
-        OUT_CS(fs->instructions[i].inst3);
-        OUT_CS(fs->instructions[i].inst4);
-        OUT_CS(fs->instructions[i].inst5);
-    }
-
-    if (constants->count) {
-        OUT_CS_REG(R500_GA_US_VECTOR_INDEX,
-                R500_GA_US_VECTOR_INDEX_TYPE_CONST);
-        OUT_CS_ONE_REG(R500_GA_US_VECTOR_DATA, constants->count * 4);
-        for (i = 0; i < constants->count; i++) {
-            OUT_CS_32F(constants->constants[i][0]);
-            OUT_CS_32F(constants->constants[i][1]);
-            OUT_CS_32F(constants->constants[i][2]);
-            OUT_CS_32F(constants->constants[i][3]);
+    OUT_CS_ONE_REG(R500_GA_US_VECTOR_DATA, (code->inst_end + 1) * 6);
+    for (i = 0; i <= code->inst_end; i++) {
+        OUT_CS(code->inst[i].inst0);
+        OUT_CS(code->inst[i].inst1);
+        OUT_CS(code->inst[i].inst2);
+        OUT_CS(code->inst[i].inst3);
+        OUT_CS(code->inst[i].inst4);
+        OUT_CS(code->inst[i].inst5);
+    }
+
+    if (constants->Count) {
+        OUT_CS_REG(R500_GA_US_VECTOR_INDEX, R500_GA_US_VECTOR_INDEX_TYPE_CONST);
+        OUT_CS_ONE_REG(R500_GA_US_VECTOR_DATA, constants->Count * 4);
+        for (i = 0; i < constants->Count; i++) {
+            const float * data = get_shader_constant(r300, &constants->Constants[i], externals);
+            OUT_CS_32F(data[0]);
+            OUT_CS_32F(data[1]);
+            OUT_CS_32F(data[2]);
+            OUT_CS_32F(data[3]);
         }
     }
 
@@ -190,17 +279,19 @@ void r300_emit_fb_state(struct r300_context* r300,
     int i;
     CS_LOCALS(r300);
 
-    BEGIN_CS((8 * fb->nr_cbufs) + (fb->zsbuf ? 8 : 0) + 4);
+    BEGIN_CS((10 * fb->nr_cbufs) + (fb->zsbuf ? 10 : 0) + 4);
     for (i = 0; i < fb->nr_cbufs; i++) {
         tex = (struct r300_texture*)fb->cbufs[i]->texture;
         assert(tex && tex->buffer && "cbuf is marked, but NULL!");
-        pixpitch = tex->stride / tex->tex.block.size;
+        pixpitch = r300_texture_get_stride(tex, 0) / tex->tex.block.size;
 
         OUT_CS_REG_SEQ(R300_RB3D_COLOROFFSET0 + (4 * i), 1);
         OUT_CS_RELOC(tex->buffer, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
 
-        OUT_CS_REG(R300_RB3D_COLORPITCH0 + (4 * i), pixpitch |
-            r300_translate_colorformat(tex->tex.format));
+        OUT_CS_REG_SEQ(R300_RB3D_COLORPITCH0 + (4 * i), 1);
+        OUT_CS_RELOC(tex->buffer, pixpitch |
+                     r300_translate_colorformat(tex->tex.format), 0,
+                     RADEON_GEM_DOMAIN_VRAM, 0);
 
         OUT_CS_REG(R300_US_OUT_FMT_0 + (4 * i),
             r300_translate_out_fmt(fb->cbufs[i]->format));
@@ -209,14 +300,15 @@ void r300_emit_fb_state(struct r300_context* r300,
     if (fb->zsbuf) {
         tex = (struct r300_texture*)fb->zsbuf->texture;
         assert(tex && tex->buffer && "zsbuf is marked, but NULL!");
-        pixpitch = tex->stride / tex->tex.block.size;
+        pixpitch = r300_texture_get_stride(tex, 0) / tex->tex.block.size;
 
         OUT_CS_REG_SEQ(R300_ZB_DEPTHOFFSET, 1);
         OUT_CS_RELOC(tex->buffer, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
 
         OUT_CS_REG(R300_ZB_FORMAT, r300_translate_zsformat(tex->tex.format));
 
-        OUT_CS_REG(R300_ZB_DEPTHPITCH, pixpitch);
+        OUT_CS_REG_SEQ(R300_ZB_DEPTHPITCH, 1);
+        OUT_CS_RELOC(tex->buffer, pixpitch, 0, RADEON_GEM_DOMAIN_VRAM, 0);
     }
 
     OUT_CS_REG(R300_RB3D_DSTCACHE_CTLSTAT,
@@ -228,6 +320,79 @@ void r300_emit_fb_state(struct r300_context* r300,
     END_CS;
 }
 
+void r300_emit_query_begin(struct r300_context* r300,
+                           struct r300_query* query)
+{
+    CS_LOCALS(r300);
+
+    /* XXX This will almost certainly not return good results
+     * for overlapping queries. */
+    BEGIN_CS(2);
+    OUT_CS_REG(R300_ZB_ZPASS_DATA, 0);
+    END_CS;
+}
+
+/* Emit the end of an occlusion query. For every fragment pipe, limit
+ * register writes to that pipe and program ZB_ZPASS_ADDR with that pipe's
+ * dword in the OQ buffer; finally restore SU_REG_DEST to all pipes. */
+void r300_emit_query_end(struct r300_context* r300,
+                         struct r300_query* query)
+{
+    struct r300_capabilities* caps = r300_screen(r300->context.screen)->caps;
+    CS_LOCALS(r300);
+
+    if (!r300->winsys->add_buffer(r300->winsys, r300->oqbo,
+                0, RADEON_GEM_DOMAIN_GTT)) {
+        debug_printf("r300: There wasn't room for the OQ buffer!? Oh noes!\n");
+    }
+
+    assert(caps->num_frag_pipes);
+    BEGIN_CS(6 * caps->num_frag_pipes + 2);
+    /* The cases below fall through on purpose: entering at N pipes emits
+     * pipe N-1 first and then every lower pipe down to pipe 0. Each pipe
+     * gets SU_REG_DEST set to its own enable bit, followed by the
+     * relocation for ZPASS_ADDR at a 4-byte offset per pipe. RV380 and
+     * older are special; they have only two pipes, and the second pipe's
+     * enable is on bit 3, not bit 1, so there's a chipset cap for that. */
+    switch (caps->num_frag_pipes) {
+        case 4:
+            /* pipe 3 only; falls through to the lower pipes */
+            OUT_CS_REG(R300_SU_REG_DEST, 1 << 3);
+            OUT_CS_REG_SEQ(R300_ZB_ZPASS_ADDR, 1);
+            OUT_CS_RELOC(r300->oqbo, query->offset + (sizeof(uint32_t) * 3),
+                    0, RADEON_GEM_DOMAIN_GTT, 0);
+        case 3:
+            /* pipe 2 only; falls through to the lower pipes */
+            OUT_CS_REG(R300_SU_REG_DEST, 1 << 2);
+            OUT_CS_REG_SEQ(R300_ZB_ZPASS_ADDR, 1);
+            OUT_CS_RELOC(r300->oqbo, query->offset + (sizeof(uint32_t) * 2),
+                    0, RADEON_GEM_DOMAIN_GTT, 0);
+        case 2:
+            /* pipe 1 only; falls through to pipe 0 */
+            /* As mentioned above, accommodate RV380 and older. */
+            OUT_CS_REG(R300_SU_REG_DEST,
+                    1 << (caps->high_second_pipe ? 3 : 1));
+            OUT_CS_REG_SEQ(R300_ZB_ZPASS_ADDR, 1);
+            OUT_CS_RELOC(r300->oqbo, query->offset + (sizeof(uint32_t) * 1),
+                    0, RADEON_GEM_DOMAIN_GTT, 0);
+        case 1:
+            /* pipe 0 only */
+            OUT_CS_REG(R300_SU_REG_DEST, 1 << 0);
+            OUT_CS_REG_SEQ(R300_ZB_ZPASS_ADDR, 1);
+            OUT_CS_RELOC(r300->oqbo, query->offset + (sizeof(uint32_t) * 0),
+                    0, RADEON_GEM_DOMAIN_GTT, 0);
+            break; /* valid pipe counts must not reach the error path */
+        default:
+            debug_printf("r300: Implementation error: Chipset reports %d"
+                    " pixel pipes!\n", caps->num_frag_pipes);
+            assert(0);
+    }
+
+    /* And, finally, reset it to normal... */
+    OUT_CS_REG(R300_SU_REG_DEST, 0xF);
+    END_CS;
+}
+
 void r300_emit_rs_state(struct r300_context* r300, struct r300_rs_state* rs)
 {
     CS_LOCALS(r300);
@@ -326,7 +491,7 @@ void r300_emit_vertex_buffer(struct r300_context* r300)
 {
     CS_LOCALS(r300);
 
-    debug_printf("r300: Preparing vertex buffer %p for render, "
+    DBG(r300, DBG_DRAW, "r300: Preparing vertex buffer %p for render, "
             "vertex size %d\n", r300->vbo,
             r300->vertex_info.vinfo.size);
     /* Set the pointer to our vertex buffer. The emitted values are this:
@@ -380,13 +545,13 @@ void r300_emit_vertex_format_state(struct r300_context* r300)
     END_CS;
 }
 
-void r300_emit_vertex_shader(struct r300_context* r300,
-                             struct r300_vertex_shader* vs)
+void r300_emit_vertex_program_code(struct r300_context* r300,
+                                   struct r300_vertex_program_code* code,
+                                   struct r300_constant_buffer* constants)
 {
     int i;
     struct r300_screen* r300screen = r300_screen(r300->context.screen);
-    struct r300_constant_buffer* constants =
-        &r300->shader_constants[PIPE_SHADER_VERTEX];
+    unsigned instruction_count = code->length / 4;
     CS_LOCALS(r300);
 
     if (!r300screen->caps->has_tcl) {
@@ -395,10 +560,10 @@ void r300_emit_vertex_shader(struct r300_context* r300,
         return;
     }
 
-    if (constants->count) {
-        BEGIN_CS(14 + (vs->instruction_count * 4) + (constants->count * 4));
+    if (code->constants.Count) {
+        BEGIN_CS(14 + code->length + (code->constants.Count * 4));
     } else {
-        BEGIN_CS(11 + (vs->instruction_count * 4));
+        BEGIN_CS(11 + code->length);
     }
 
     /* R300_VAP_PVS_CODE_CNTL_0
@@ -408,30 +573,27 @@ void r300_emit_vertex_shader(struct r300_context* r300,
      * XXX these could be optimized to select better values... */
     OUT_CS_REG_SEQ(R300_VAP_PVS_CODE_CNTL_0, 3);
     OUT_CS(R300_PVS_FIRST_INST(0) |
-            R300_PVS_XYZW_VALID_INST(vs->instruction_count - 1) |
-            R300_PVS_LAST_INST(vs->instruction_count - 1));
-    OUT_CS(R300_PVS_MAX_CONST_ADDR(constants->count - 1));
-    OUT_CS(vs->instruction_count - 1);
+            R300_PVS_XYZW_VALID_INST(instruction_count - 1) |
+            R300_PVS_LAST_INST(instruction_count - 1));
+    OUT_CS(R300_PVS_MAX_CONST_ADDR(code->constants.Count - 1));
+    OUT_CS(instruction_count - 1);
 
     OUT_CS_REG(R300_VAP_PVS_VECTOR_INDX_REG, 0);
-    OUT_CS_ONE_REG(R300_VAP_PVS_UPLOAD_DATA, vs->instruction_count * 4);
-    for (i = 0; i < vs->instruction_count; i++) {
-        OUT_CS(vs->instructions[i].inst0);
-        OUT_CS(vs->instructions[i].inst1);
-        OUT_CS(vs->instructions[i].inst2);
-        OUT_CS(vs->instructions[i].inst3);
-    }
+    OUT_CS_ONE_REG(R300_VAP_PVS_UPLOAD_DATA, code->length);
+    for (i = 0; i < code->length; i++)
+        OUT_CS(code->body.d[i]);
 
-    if (constants->count) {
+    if (code->constants.Count) {
         OUT_CS_REG(R300_VAP_PVS_VECTOR_INDX_REG,
                 (r300screen->caps->is_r500 ?
                  R500_PVS_CONST_START : R300_PVS_CONST_START));
-        OUT_CS_ONE_REG(R300_VAP_PVS_UPLOAD_DATA, constants->count * 4);
-        for (i = 0; i < constants->count; i++) {
-            OUT_CS_32F(constants->constants[i][0]);
-            OUT_CS_32F(constants->constants[i][1]);
-            OUT_CS_32F(constants->constants[i][2]);
-            OUT_CS_32F(constants->constants[i][3]);
+        OUT_CS_ONE_REG(R300_VAP_PVS_UPLOAD_DATA, code->constants.Count * 4);
+        for (i = 0; i < code->constants.Count; i++) {
+            const float * data = get_shader_constant(r300, &code->constants.Constants[i], constants);
+            OUT_CS_32F(data[0]);
+            OUT_CS_32F(data[1]);
+            OUT_CS_32F(data[2]);
+            OUT_CS_32F(data[3]);
         }
     }
 
@@ -443,6 +605,12 @@ void r300_emit_vertex_shader(struct r300_context* r300,
     END_CS;
 }
 
+void r300_emit_vertex_shader(struct r300_context* r300,
+                             struct r300_vertex_shader* vs)
+{
+    r300_emit_vertex_program_code(r300, &vs->code, &r300->shader_constants[PIPE_SHADER_VERTEX]);
+}
+
 void r300_emit_viewport_state(struct r300_context* r300,
                               struct r300_viewport_state* viewport)
 {
@@ -521,6 +689,12 @@ validate:
             goto validate;
         }
     }
+    /* ...occlusion query buffer... */
+    if (!r300->winsys->add_buffer(r300->winsys, r300->oqbo,
+                0, RADEON_GEM_DOMAIN_GTT)) {
+        r300->context.flush(&r300->context, 0, NULL);
+        goto validate;
+    }
     /* ...and vertex buffer. */
     if (r300->vbo) {
         if (!r300->winsys->add_buffer(r300->winsys, r300->vbo,
@@ -531,10 +705,11 @@ validate:
     } else {
         debug_printf("No VBO while emitting dirty state!\n");
     }
-    if (r300->winsys->validate(r300->winsys)) {
+    if (!r300->winsys->validate(r300->winsys)) {
         r300->context.flush(&r300->context, 0, NULL);
         if (invalid) {
             /* Well, hell. */
+            debug_printf("r300: Stuck in validation loop, gonna quit now.");
             exit(1);
         }
         invalid = TRUE;
@@ -563,11 +738,9 @@ validate:
 
     if (r300->dirty_state & R300_NEW_FRAGMENT_SHADER) {
         if (r300screen->caps->is_r500) {
-            r500_emit_fragment_shader(r300,
-                (struct r5xx_fragment_shader*)r300->fs);
+            r500_emit_fragment_program_code(r300, &r300->fs->code, &r300->shader_constants[PIPE_SHADER_FRAGMENT]);
         } else {
-            r300_emit_fragment_shader(r300,
-                (struct r3xx_fragment_shader*)r300->fs);
+            r300_emit_fragment_program_code(r300, &r300->fs->code, &r300->shader_constants[PIPE_SHADER_FRAGMENT]);
         }
         r300->dirty_state &= ~R300_NEW_FRAGMENT_SHADER;
     }
diff --git a/src/gallium/drivers/r300/r300_emit.h b/src/gallium/drivers/r300/r300_emit.h
index fda26f3948..c4002b8e5d 100644
--- a/src/gallium/drivers/r300/r300_emit.h
+++ b/src/gallium/drivers/r300/r300_emit.h
@@ -30,6 +30,9 @@
 #include "r300_screen.h"
 #include "r300_state_inlines.h"
 
+struct rX00_fragment_program_code;
+struct r300_vertex_program_code;
+
 void r300_emit_blend_state(struct r300_context* r300,
                            struct r300_blend_state* blend);
 
@@ -42,15 +45,22 @@ void r300_emit_clip_state(struct r300_context* r300,
 void r300_emit_dsa_state(struct r300_context* r300,
                          struct r300_dsa_state* dsa);
 
-void r300_emit_fragment_shader(struct r300_context* r300,
-                               struct r3xx_fragment_shader* fs);
+void r300_emit_fragment_program_code(struct r300_context* r300,
+                                     struct rX00_fragment_program_code* generic_code,
+                                     struct r300_constant_buffer* externals);
 
-void r500_emit_fragment_shader(struct r300_context* r300,
-                               struct r5xx_fragment_shader* fs);
+void r500_emit_fragment_program_code(struct r300_context* r300,
+                                     struct rX00_fragment_program_code* generic_code,
+                                     struct r300_constant_buffer* externals);
 
 void r300_emit_fb_state(struct r300_context* r300,
                         struct pipe_framebuffer_state* fb);
 
+void r300_emit_query_begin(struct r300_context* r300,
+                           struct r300_query* query);
+void r300_emit_query_end(struct r300_context* r300,
+                         struct r300_query* query);
+
 void r300_emit_rs_state(struct r300_context* r300, struct r300_rs_state* rs);
 
 void r300_emit_rs_block_state(struct r300_context* r300,
@@ -68,6 +78,10 @@ void r300_emit_vertex_buffer(struct r300_context* r300);
 
 void r300_emit_vertex_format_state(struct r300_context* r300);
 
+void r300_emit_vertex_program_code(struct r300_context* r300,
+                                   struct r300_vertex_program_code* code,
+                                   struct r300_constant_buffer* constants);
+
 void r300_emit_vertex_shader(struct r300_context* r300,
                              struct r300_vertex_shader* vs);
 
diff --git a/src/gallium/drivers/r300/r300_fs.c b/src/gallium/drivers/r300/r300_fs.c
index 8672e211bc..a0e848a59a 100644
--- a/src/gallium/drivers/r300/r300_fs.c
+++ b/src/gallium/drivers/r300/r300_fs.c
@@ -23,86 +23,115 @@
 
 #include "r300_fs.h"
 
-void r300_translate_fragment_shader(struct r300_context* r300,
-                                    struct r300_fragment_shader* fs)
+#include "r300_tgsi_to_rc.h"
+
+#include "radeon_compiler.h"
+
+static void find_output_registers(struct r300_fragment_program_compiler * compiler,
+                                  struct r300_fragment_shader * fs)
 {
-    struct tgsi_parse_context parser;
-    int i;
-    boolean is_r500 = r300_screen(r300->context.screen)->caps->is_r500;
-    struct r300_constant_buffer* consts =
-        &r300->shader_constants[PIPE_SHADER_FRAGMENT];
+    unsigned i;
 
-    struct r300_fs_asm* assembler = CALLOC_STRUCT(r300_fs_asm);
-    if (assembler == NULL) {
-        return;
-    }
-    /* Setup starting offset for immediates. */
-    assembler->imm_offset = consts->user_count;
-    /* Enable depth writes, if needed. */
-    assembler->writes_depth = fs->info.writes_z;
-
-    /* Make sure we start at the beginning of the shader. */
-    if (is_r500) {
-        ((struct r5xx_fragment_shader*)fs)->instruction_count = 0;
-    }
+    /* Mark the outputs as not present initially */
+    compiler->OutputColor = fs->info.num_outputs;
+    compiler->OutputDepth = fs->info.num_outputs;
 
-    tgsi_parse_init(&parser, fs->state.tokens);
+    /* Now see where they really are. */
+    for(i = 0; i < fs->info.num_outputs; ++i) {
+        switch(fs->info.output_semantic_name[i]) {
+            case TGSI_SEMANTIC_COLOR:
+                compiler->OutputColor = i;
+                break;
+            case TGSI_SEMANTIC_POSITION:
+                compiler->OutputDepth = i;
+                break;
+        }
+    }
+}
 
-    while (!tgsi_parse_end_of_tokens(&parser)) {
-        tgsi_parse_token(&parser);
+static void allocate_hardware_inputs(
+    struct r300_fragment_program_compiler * c,
+    void (*allocate)(void * data, unsigned input, unsigned hwreg),
+    void * mydata)
+{
+    struct tgsi_shader_info* info = &((struct r300_fragment_shader*)c->UserData)->info;
+    int total_colors = 0;
+    int colors = 0;
+    int total_generic = 0;
+    int generic = 0;
+    int i;
 
-        /* This is seriously the lamest way to create fragment programs ever.
-         * I blame TGSI. */
-        switch (parser.FullToken.Token.Type) {
-            case TGSI_TOKEN_TYPE_DECLARATION:
-                /* Allocated registers sitting at the beginning
-                 * of the program. */
-                r300_fs_declare(assembler, &parser.FullToken.FullDeclaration);
+    for (i = 0; i < info->num_inputs; i++) {
+        switch (info->input_semantic_name[i]) {
+            case TGSI_SEMANTIC_COLOR:
+                total_colors++;
                 break;
-            case TGSI_TOKEN_TYPE_IMMEDIATE:
-                debug_printf("r300: Emitting immediate to constant buffer, "
-                        "position %d\n",
-                        assembler->imm_offset + assembler->imm_count);
-                /* I am not amused by the length of these. */
-                for (i = 0; i < 4; i++) {
-                    consts->constants[assembler->imm_offset +
-                        assembler->imm_count][i] =
-                        parser.FullToken.FullImmediate.u[i].Float;
-                }
-                assembler->imm_count++;
+            case TGSI_SEMANTIC_FOG:
+            case TGSI_SEMANTIC_GENERIC:
+                total_generic++;
                 break;
-            case TGSI_TOKEN_TYPE_INSTRUCTION:
-                if (is_r500) {
-                    r5xx_fs_instruction((struct r5xx_fragment_shader*)fs,
-                            assembler, &parser.FullToken.FullInstruction);
-                } else {
-                    r3xx_fs_instruction((struct r3xx_fragment_shader*)fs,
-                            assembler, &parser.FullToken.FullInstruction);
-                }
+        }
+    }
+
+    for(i = 0; i < info->num_inputs; i++) {
+        switch (info->input_semantic_name[i]) {
+            case TGSI_SEMANTIC_COLOR:
+                allocate(mydata, i, colors);
+                colors++;
+                break;
+            case TGSI_SEMANTIC_FOG:
+            case TGSI_SEMANTIC_GENERIC:
+                allocate(mydata, i, total_colors + generic);
+                generic++;
                 break;
         }
     }
+}
+
+void r300_translate_fragment_shader(struct r300_context* r300,
+                                    struct r300_fragment_shader* fs)
+{
+    struct r300_fragment_program_compiler compiler;
+    struct tgsi_to_rc ttr;
+
+    memset(&compiler, 0, sizeof(compiler));
+    rc_init(&compiler.Base);
+    compiler.Base.Debug = DBG_ON(r300, DBG_FP);
+
+    compiler.code = &fs->code;
+    compiler.is_r500 = r300_screen(r300->context.screen)->caps->is_r500;
+    compiler.AllocateHwInputs = &allocate_hardware_inputs;
+    compiler.UserData = fs;
+
+    /* TODO: Program compilation depends on texture compare modes,
+     * which are sampler state. Therefore, programs need to be recompiled
+     * depending on this state as in the classic Mesa driver.
+     *
+     * This is not yet handled correctly.
+     */
 
-    debug_printf("r300: fs: %d texs and %d colors, first free reg is %d\n",
-            assembler->tex_count, assembler->color_count,
-            assembler->tex_count + assembler->color_count);
-
-    consts->count = consts->user_count + assembler->imm_count;
-    fs->uses_imms = assembler->imm_count;
-    debug_printf("r300: fs: %d total constants, "
-            "%d from user and %d from immediates\n", consts->count,
-            consts->user_count, assembler->imm_count);
-    r3xx_fs_finalize(fs, assembler);
-    if (is_r500) {
-        r5xx_fs_finalize((struct r5xx_fragment_shader*)fs, assembler);
+    find_output_registers(&compiler, fs);
+
+    if (compiler.Base.Debug) {
+        debug_printf("r300: Initial fragment program\n");
+        tgsi_dump(fs->state.tokens, 0);
     }
 
-    tgsi_dump(fs->state.tokens, 0);
-    /* XXX finish r300 dumper too */
-    if (is_r500) {
-        r5xx_fs_dump((struct r5xx_fragment_shader*)fs);
+    /* Translate TGSI to our internal representation */
+    ttr.compiler = &compiler.Base;
+    ttr.info = &fs->info;
+
+    r300_tgsi_to_rc(&ttr, fs->state.tokens);
+
+    /* Invoke the compiler */
+    r3xx_compile_fragment_program(&compiler);
+    if (compiler.Base.Error) {
+        /* Todo: Fail gracefully */
+        fprintf(stderr, "r300 FP: Compiler error\n");
+        abort();
     }
 
-    tgsi_parse_free(&parser);
-    FREE(assembler);
+    /* And, finally... */
+    rc_destroy(&compiler.Base);
+    fs->translated = TRUE;
 }
diff --git a/src/gallium/drivers/r300/r300_fs.h b/src/gallium/drivers/r300/r300_fs.h
index 18deb7a05e..9fab789402 100644
--- a/src/gallium/drivers/r300/r300_fs.h
+++ b/src/gallium/drivers/r300/r300_fs.h
@@ -30,6 +30,21 @@
 #include "r3xx_fs.h"
 #include "r5xx_fs.h"
 
+#include "radeon_code.h"
+
+struct r300_fragment_shader {
+    /* Parent class */
+    struct pipe_shader_state state;
+    struct tgsi_shader_info info;
+
+    /* Has this shader been translated yet? */
+    boolean translated;
+
+    /* Compiled code */
+    struct rX00_fragment_program_code code;
+};
+
+
 void r300_translate_fragment_shader(struct r300_context* r300,
                                     struct r300_fragment_shader* fs);
 
diff --git a/src/gallium/drivers/r300/r300_fs_inlines.h b/src/gallium/drivers/r300/r300_fs_inlines.h
deleted file mode 100644
index be4be9465e..0000000000
--- a/src/gallium/drivers/r300/r300_fs_inlines.h
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
- *                Joakim Sindholt <opensource@zhasha.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE. */
-
-#ifndef R300_FS_INLINES_H
-#define R300_FS_INLINES_H
-
-#include "tgsi/tgsi_parse.h"
-
-#include "r300_context.h"
-#include "r300_debug.h"
-#include "r300_reg.h"
-#include "r300_screen.h"
-#include "r300_shader_inlines.h"
-
-/* Temporary struct used to hold assembly state while putting together
- * fragment programs. */
-struct r300_fs_asm {
-    /* Pipe context. */
-    struct r300_context* r300;
-    /* Number of colors. */
-    unsigned color_count;
-    /* Number of texcoords. */
-    unsigned tex_count;
-    /* Offset for temporary registers. Inputs and temporaries have no
-     * distinguishing markings, so inputs start at 0 and the first usable
-     * temporary register is after all inputs. */
-    unsigned temp_offset;
-    /* Number of requested temporary registers. */
-    unsigned temp_count;
-    /* Offset for immediate constants. Neither R300 nor R500 can do four
-     * inline constants per source, so instead we copy immediates into the
-     * constant buffer. */
-    unsigned imm_offset;
-    /* Number of immediate constants. */
-    unsigned imm_count;
-    /* Are depth writes enabled? */
-    boolean writes_depth;
-    /* Depth write offset. This is the TGSI output that corresponds to
-     * depth writes. */
-    unsigned depth_output;
-};
-
-static INLINE void r300_fs_declare(struct r300_fs_asm* assembler,
-                            struct tgsi_full_declaration* decl)
-{
-    switch (decl->Declaration.File) {
-        case TGSI_FILE_INPUT:
-            switch (decl->Semantic.SemanticName) {
-                case TGSI_SEMANTIC_COLOR:
-                    assembler->color_count++;
-                    break;
-                case TGSI_SEMANTIC_FOG:
-                case TGSI_SEMANTIC_GENERIC:
-                    assembler->tex_count++;
-                    break;
-                default:
-                    debug_printf("r300: fs: Bad semantic declaration %d\n",
-                        decl->Semantic.SemanticName);
-                    break;
-            }
-            break;
-        case TGSI_FILE_OUTPUT:
-            /* Depth write. Mark the position of the output so we can
-             * identify it later. */
-            if (decl->Semantic.SemanticName == TGSI_SEMANTIC_POSITION) {
-                assembler->depth_output = decl->DeclarationRange.First;
-            }
-            break;
-        case TGSI_FILE_CONSTANT:
-            break;
-        case TGSI_FILE_TEMPORARY:
-            assembler->temp_count++;
-            break;
-        default:
-            debug_printf("r300: fs: Bad file %d\n", decl->Declaration.File);
-            break;
-    }
-
-    assembler->temp_offset = assembler->color_count + assembler->tex_count;
-}
-
-static INLINE unsigned r300_fs_src(struct r300_fs_asm* assembler,
-                                   struct tgsi_src_register* src)
-{
-    switch (src->File) {
-        case TGSI_FILE_NULL:
-            return 0;
-        case TGSI_FILE_INPUT:
-            /* XXX may be wrong */
-            return src->Index;
-            break;
-        case TGSI_FILE_TEMPORARY:
-            return src->Index + assembler->temp_offset;
-            break;
-        case TGSI_FILE_IMMEDIATE:
-            return (src->Index + assembler->imm_offset) | (1 << 8);
-            break;
-        case TGSI_FILE_CONSTANT:
-            /* XXX magic */
-            return src->Index | (1 << 8);
-            break;
-        default:
-            debug_printf("r300: fs: Unimplemented src %d\n", src->File);
-            break;
-    }
-    return 0;
-}
-
-static INLINE unsigned r300_fs_dst(struct r300_fs_asm* assembler,
-                                   struct tgsi_dst_register* dst)
-{
-    switch (dst->File) {
-        case TGSI_FILE_NULL:
-            /* This happens during KIL instructions. */
-            return 0;
-            break;
-        case TGSI_FILE_OUTPUT:
-            return 0;
-            break;
-        case TGSI_FILE_TEMPORARY:
-            return dst->Index + assembler->temp_offset;
-            break;
-        default:
-            debug_printf("r300: fs: Unimplemented dst %d\n", dst->File);
-            break;
-    }
-    return 0;
-}
-
-static INLINE boolean r300_fs_is_depr(struct r300_fs_asm* assembler,
-                                      struct tgsi_dst_register* dst)
-{
-    return (assembler->writes_depth &&
-            (dst->File == TGSI_FILE_OUTPUT) &&
-            (dst->Index == assembler->depth_output));
-}
-
-#endif /* R300_FS_INLINES_H */
diff --git a/src/gallium/drivers/r300/r300_query.c b/src/gallium/drivers/r300/r300_query.c
index 8fc61c2dec..2880d34877 100644
--- a/src/gallium/drivers/r300/r300_query.c
+++ b/src/gallium/drivers/r300/r300_query.c
@@ -22,17 +22,35 @@
 
 #include "r300_query.h"
 
+#include "r300_emit.h"
+
 static struct pipe_query* r300_create_query(struct pipe_context* pipe,
                                             unsigned query_type)
 {
-    struct r300_query* q = CALLOC_STRUCT(r300_query);
+    struct r300_context* r300 = r300_context(pipe);
+    struct r300_screen* r300screen = r300_screen(r300->context.screen);
+    unsigned query_size = r300screen->caps->num_frag_pipes * 4;
+    struct r300_query* q, * qptr;
+
+    q = CALLOC_STRUCT(r300_query);
 
     q->type = query_type;
     assert(q->type == PIPE_QUERY_OCCLUSION_COUNTER);
 
-    /* XXX this is to force winsys to give us a GTT buffer */
-    q->buf = pipe->screen->buffer_create(pipe->screen, 64,
-            PIPE_BUFFER_USAGE_VERTEX, 64);
+    q->active = FALSE;
+
+    if (!r300->query_list) {
+        r300->query_list = q;
+    } else if (!is_empty_list(r300->query_list)) {
+        qptr = last_elem(r300->query_list);
+        q->offset = qptr->offset + query_size;
+        insert_at_tail(r300->query_list, q);
+    }
+
+    /* XXX */
+    if (q->offset >= 4096) {
+        q->offset = 0;
+    }
 
     return (struct pipe_query*)q;
 }
@@ -40,6 +58,9 @@ static struct pipe_query* r300_create_query(struct pipe_context* pipe,
 static void r300_destroy_query(struct pipe_context* pipe,
                                struct pipe_query* query)
 {
+    struct r300_query* q = (struct r300_query*)query;
+
+    remove_from_list(q);
     FREE(query);
 }
 
@@ -49,15 +70,15 @@ static void r300_begin_query(struct pipe_context* pipe,
     uint32_t* map;
     struct r300_context* r300 = r300_context(pipe);
     struct r300_query* q = (struct r300_query*)query;
-    CS_LOCALS(r300);
 
-    map = pipe_buffer_map(pipe->screen, q->buf, PIPE_BUFFER_USAGE_CPU_WRITE);
+    map = pipe->screen->buffer_map(pipe->screen, r300->oqbo,
+            PIPE_BUFFER_USAGE_CPU_WRITE);
+    map += q->offset / 4;
     *map = ~0;
-    pipe_buffer_unmap(pipe->screen, q->buf);
+    pipe->screen->buffer_unmap(pipe->screen, r300->oqbo);
 
-    BEGIN_CS(2);
-    OUT_CS_REG(R300_ZB_ZPASS_DATA, 0);
-    END_CS;
+    r300_emit_dirty_state(r300);
+    r300_emit_query_begin(r300, q);
 }
 
 static void r300_end_query(struct pipe_context* pipe,
@@ -65,12 +86,9 @@ static void r300_end_query(struct pipe_context* pipe,
 {
     struct r300_context* r300 = r300_context(pipe);
     struct r300_query* q = (struct r300_query*)query;
-    CS_LOCALS(r300);
 
-    BEGIN_CS(4);
-    OUT_CS_REG_SEQ(R300_ZB_ZPASS_ADDR, 1);
-    OUT_CS_RELOC(q->buf, 0, 0, RADEON_GEM_DOMAIN_GTT, 0);
-    END_CS;
+    r300_emit_dirty_state(r300);
+    r300_emit_query_end(r300, q);
 }
 
 static boolean r300_get_query_result(struct pipe_context* pipe,
@@ -78,22 +96,38 @@ static boolean r300_get_query_result(struct pipe_context* pipe,
                                      boolean wait,
                                      uint64_t* result)
 {
+    struct r300_context* r300 = r300_context(pipe);
+    struct r300_screen* r300screen = r300_screen(r300->context.screen);
     struct r300_query* q = (struct r300_query*)query;
+    unsigned flags = PIPE_BUFFER_USAGE_CPU_READ;
     uint32_t* map;
-    uint32_t temp;
+    uint32_t temp = 0;
+    unsigned i;
 
     if (wait) {
-        /* Well, we're expected to just sit here and spin, so let's go ahead
-         * and flush so we can be sure that the card's spinning... */
-        /* XXX double-check these params */
         pipe->flush(pipe, 0, NULL);
+    } else {
+        flags |= PIPE_BUFFER_USAGE_DONTBLOCK;
     }
 
-    map = pipe_buffer_map(pipe->screen, q->buf, PIPE_BUFFER_USAGE_CPU_READ);
-    temp = *map;
-    pipe_buffer_unmap(pipe->screen, q->buf);
+    map = pipe->screen->buffer_map(pipe->screen, r300->oqbo, flags);
+    map += q->offset / 4;
+    for (i = 0; i < r300screen->caps->num_frag_pipes; i++) {
+        if (*map == ~0) {
+            /* Looks like our results aren't ready yet. */
+            if (wait) {
+                debug_printf("r300: Despite waiting, OQ results haven't"
+                        " come in yet.\n");
+            }
+            temp = ~0;
+            break;
+        }
+        temp += *map;
+        map++;
+    }
+    pipe->screen->buffer_unmap(pipe->screen, r300->oqbo);
 
-    if (temp < 0) {
+    if (temp == ~0) {
         /* Our results haven't been written yet... */
         return FALSE;
     }
diff --git a/src/gallium/drivers/r300/r300_query.h b/src/gallium/drivers/r300/r300_query.h
index 6a7646087a..4f50e8f844 100644
--- a/src/gallium/drivers/r300/r300_query.h
+++ b/src/gallium/drivers/r300/r300_query.h
@@ -29,13 +29,6 @@
 
 struct r300_context;
 
-struct r300_query {
-    /* The kind of query. Currently only OQ is supported. */
-    unsigned type;
-    /* Buffer object where we want our results to reside. */
-    struct pipe_buffer* buf;
-};
-
 static INLINE struct r300_query* r300_query(struct pipe_query* q)
 {
     return (struct r300_query*)q;
diff --git a/src/gallium/drivers/r300/r300_reg.h b/src/gallium/drivers/r300/r300_reg.h
index 6825d99870..03cd219cde 100644
--- a/src/gallium/drivers/r300/r300_reg.h
+++ b/src/gallium/drivers/r300/r300_reg.h
@@ -3312,6 +3312,10 @@ enum {
 
 #define R200_3D_DRAW_IMMD_2      0xC0003500
 
+/* XXX Oh look, stuff not brought over from docs yet */
+
+#define R300_SU_REG_DEST                    0x42C8
+
 #endif /* _R300_REG_H */
 
 /* *INDENT-ON* */
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index cd458d019a..737396d8d9 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -26,15 +26,17 @@
 
 #include "r300_cs.h"
 #include "r300_context.h"
+#include "r300_emit.h"
 #include "r300_reg.h"
 #include "r300_state_derived.h"
 
 /* r300_render: Vertex and index buffer primitive emission. */
+#define R300_MAX_VBO_SIZE  (1024 * 1024)
 
 struct r300_render {
     /* Parent class */
     struct vbuf_render base;
-    
+
     /* Pipe context */
     struct r300_context* r300;
 
@@ -45,7 +47,10 @@ struct r300_render {
 
     /* VBO */
     struct pipe_buffer* vbo;
-    size_t vbo_alloc_size;
+    size_t vbo_size;
+    size_t vbo_offset;
+    size_t vbo_max_used;
+    void * vbo_ptr;
 };
 
 static INLINE struct r300_render*
@@ -74,19 +79,19 @@ static boolean r300_render_allocate_vertices(struct vbuf_render* render,
     struct pipe_screen* screen = r300->context.screen;
     size_t size = (size_t)vertex_size * (size_t)count;
 
-    if (r300render->vbo && (size > r300render->vbo_alloc_size)) {
+    if (size + r300render->vbo_offset > r300render->vbo_size) {
         pipe_buffer_reference(&r300render->vbo, NULL);
-    }
-    
-    if (!r300render->vbo) {
         r300render->vbo = pipe_buffer_create(screen,
                                              64,
                                              PIPE_BUFFER_USAGE_VERTEX,
-                                             size);
+                                             R300_MAX_VBO_SIZE);
+        r300render->vbo_offset = 0;
+        r300render->vbo_size = R300_MAX_VBO_SIZE;
     }
 
-    r300render->vbo_alloc_size = MAX2(size, r300render->vbo_alloc_size);
     r300render->vertex_size = vertex_size;
+    r300->vbo = r300render->vbo;
+    r300->vbo_offset = r300render->vbo_offset;
 
     return (r300render->vbo) ? TRUE : FALSE;
 }
@@ -96,8 +100,10 @@ static void* r300_render_map_vertices(struct vbuf_render* render)
     struct r300_render* r300render = r300_render(render);
     struct pipe_screen* screen = r300render->r300->context.screen;
 
-    return (unsigned char*)pipe_buffer_map(screen, r300render->vbo,
-                                           PIPE_BUFFER_USAGE_CPU_WRITE);
+    r300render->vbo_ptr = pipe_buffer_map(screen, r300render->vbo,
+                                          PIPE_BUFFER_USAGE_CPU_WRITE);
+
+    return ((unsigned char*)r300render->vbo_ptr + r300render->vbo_offset);
 }
 
 static void r300_render_unmap_vertices(struct vbuf_render* render,
@@ -106,15 +112,24 @@ static void r300_render_unmap_vertices(struct vbuf_render* render,
 {
     struct r300_render* r300render = r300_render(render);
     struct pipe_screen* screen = r300render->r300->context.screen;
+    CS_LOCALS(r300render->r300);
+    BEGIN_CS(2);
+    OUT_CS_REG(R300_VAP_VF_MAX_VTX_INDX, max);
+    END_CS;
 
+    r300render->vbo_max_used = MAX2(r300render->vbo_max_used, 
+                                    r300render->vertex_size * (max + 1));
     pipe_buffer_unmap(screen, r300render->vbo);
 }
 
 static void r300_render_release_vertices(struct vbuf_render* render)
 {
     struct r300_render* r300render = r300_render(render);
+    struct r300_context* r300 = r300render->r300;
 
-    pipe_buffer_reference(&r300render->vbo, NULL);
+    r300render->vbo_offset += r300render->vbo_max_used;
+    r300render->vbo_max_used = 0;
+    r300->vbo = NULL;
 }
 
 static boolean r300_render_set_primitive(struct vbuf_render* render,
@@ -162,14 +177,12 @@ static boolean r300_render_set_primitive(struct vbuf_render* render,
     return TRUE;
 }
 
-static void prepare_render(struct r300_render* render, unsigned count)
+static void r300_prepare_render(struct r300_render* render, unsigned count)
 {
     struct r300_context* r300 = render->r300;
 
     CS_LOCALS(r300);
 
-    r300->vbo = render->vbo;
-
     r300_emit_dirty_state(r300);
 }
 
@@ -182,9 +195,9 @@ static void r300_render_draw_arrays(struct vbuf_render* render,
 
     CS_LOCALS(r300);
 
-    prepare_render(r300render, count);
+    r300_prepare_render(r300render, count);
 
-    debug_printf("r300: Doing vbuf render, count %d\n", count);
+    DBG(r300, DBG_DRAW, "r300: Doing vbuf render, count %d\n", count);
 
     BEGIN_CS(2);
     OUT_CS_PKT3(R300_PACKET3_3D_DRAW_VBUF_2, 0);
@@ -207,7 +220,7 @@ static void r300_render_draw(struct vbuf_render* render,
 
     CS_LOCALS(r300);
 
-    prepare_render(r300render, count);
+    r300_prepare_render(r300render, count);
 
     /* Send our indices into an index buffer. */
     index_buffer = pipe_buffer_create(screen, 64, PIPE_BUFFER_USAGE_VERTEX,
@@ -216,23 +229,6 @@ static void r300_render_draw(struct vbuf_render* render,
         return;
     }
 
-/*
-    index_map = pipe_buffer_map(screen, index_buffer,
-                                PIPE_BUFFER_USAGE_CPU_WRITE);
-    memcpy(index_map, indices, count);
-    pipe_buffer_unmap(screen, index_buffer);
-
-    debug_printf("r300: Doing indexbuf render, count %d\n", count);
-
-    BEGIN_CS(8);
-    OUT_CS_PKT3(R300_PACKET3_3D_DRAW_INDX_2, 0);
-    OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (count << 16) |
-           r300render->hwprim);
-    OUT_CS_PKT3(R300_PACKET3_INDX_BUFFER, 2);
-    OUT_CS(R300_INDX_BUFFER_ONE_REG_WR | (R300_VAP_PORT_IDX0 >> 2));
-    OUT_CS_INDEX_RELOC(index_buffer, 0, count, RADEON_GEM_DOMAIN_GTT, 0, 0);
-    END_CS; */
-
     BEGIN_CS(2 + (count+1)/2);
     OUT_CS_PKT3(R300_PACKET3_3D_DRAW_INDX_2, (count+1)/2);
     OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (count << 16) |
@@ -271,6 +267,10 @@ static struct vbuf_render* r300_render_create(struct r300_context* r300)
     r300render->base.release_vertices = r300_render_release_vertices;
     r300render->base.destroy = r300_render_destroy;
 
+    r300render->vbo = NULL;
+    r300render->vbo_size = 0;
+    r300render->vbo_offset = 0;
+
     return &r300render->base;
 }
 
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index da1d5ffe2f..3b5b1bbd37 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -93,8 +93,6 @@ static int r300_get_param(struct pipe_screen* pscreen, int param)
             } else {
                 return 0;
             }
-        case PIPE_CAP_S3TC:
-            return 1;
         case PIPE_CAP_ANISOTROPIC_FILTER:
             return 1;
         case PIPE_CAP_POINT_SPRITE:
@@ -147,6 +145,8 @@ static int r300_get_param(struct pipe_screen* pscreen, int param)
         case PIPE_CAP_TGSI_CONT_SUPPORTED:
             /* XXX */
             return 0;
+        case PIPE_CAP_BLEND_EQUATION_SEPARATE:
+            return 1;
         default:
             debug_printf("r300: Implementation error: Bad param %d\n",
                 param);
@@ -320,13 +320,15 @@ r300_get_tex_transfer(struct pipe_screen *screen,
     trans = CALLOC_STRUCT(r300_transfer);
     if (trans) {
         pipe_texture_reference(&trans->transfer.texture, texture);
-        trans->transfer.format = trans->transfer.format;
+        trans->transfer.format = texture->format;
+        trans->transfer.x = x;
+        trans->transfer.y = y;
         trans->transfer.width = w;
         trans->transfer.height = h;
         trans->transfer.block = texture->block;
         trans->transfer.nblocksx = texture->nblocksx[level];
         trans->transfer.nblocksy = texture->nblocksy[level];
-        trans->transfer.stride = tex->stride;
+        trans->transfer.stride = r300_texture_get_stride(tex, level);
         trans->transfer.usage = usage;
         trans->offset = offset;
     }
@@ -353,7 +355,7 @@ static void* r300_transfer_map(struct pipe_screen* screen,
     if (transfer->usage != PIPE_TRANSFER_READ) {
         flags |= PIPE_BUFFER_USAGE_CPU_WRITE;
     }
-    
+
     map = pipe_buffer_map(screen, tex->buffer, flags);
 
     if (!map) {
@@ -389,6 +391,7 @@ struct pipe_screen* r300_create_screen(struct r300_winsys* r300_winsys)
         return NULL;
 
     caps->pci_id = r300_winsys->pci_id;
+    caps->num_frag_pipes = r300_winsys->gb_pipes;
 
     r300_parse_chipset(caps);
 
diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c
index 68da0aa4cb..88cb9af6fb 100644
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -20,10 +20,11 @@
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
 
+#include "util/u_debug.h"
 #include "util/u_math.h"
 #include "util/u_pack_color.h"
 
-#include "util/u_debug.h"
+#include "tgsi/tgsi_parse.h"
 
 #include "pipe/p_config.h"
 #include "pipe/internal/p_winsys_screen.h"
@@ -32,6 +33,7 @@
 #include "r300_reg.h"
 #include "r300_state_inlines.h"
 #include "r300_fs.h"
+#include "r300_vs.h"
 
 /* r300_state: Functions used to intialize state context by translating
  * Gallium state objects into semi-native r300 state objects. */
@@ -137,7 +139,6 @@ static void
                              const struct pipe_constant_buffer* buffer)
 {
     struct r300_context* r300 = r300_context(pipe);
-    int i = r300->shader_constants[shader].user_count;
 
     /* This entire chunk of code seems ever-so-slightly baked.
      * It's as if I've got pipe_buffer* matryoshkas... */
@@ -148,26 +149,13 @@ static void
             map, buffer->buffer->size);
         pipe->winsys->buffer_unmap(pipe->winsys, buffer->buffer);
 
-        r300->shader_constants[shader].user_count =
+        r300->shader_constants[shader].count =
             buffer->buffer->size / (sizeof(float) * 4);
     } else {
-        r300->shader_constants[shader].user_count = 0;
+        r300->shader_constants[shader].count = 0;
     }
 
     r300->dirty_state |= R300_NEW_CONSTANTS;
-
-    /* If the number of constants have changed, invalidate the shader. */
-    if (r300->shader_constants[shader].user_count != i) {
-        if (shader == PIPE_SHADER_FRAGMENT && r300->fs &&
-                r300->fs->uses_imms) {
-            r300->fs->translated = FALSE;
-            r300_translate_fragment_shader(r300, r300->fs);
-        } else if (shader == PIPE_SHADER_VERTEX && r300->vs &&
-                r300->vs->uses_imms) {
-            r300->vs->translated = FALSE;
-            r300_translate_vertex_shader(r300, r300->vs);
-        }
-    }
 }
 
 /* Create a new depth, stencil, and alpha state based on the CSO dsa state.
@@ -237,7 +225,8 @@ static void*
         dsa->alpha_reference = CLAMP(state->alpha.ref_value * 1023.0f,
                                      0, 1023);
     } else {
-        dsa->z_buffer_top = R300_ZTOP_ENABLE;
+        /* XXX need to fix this to be dynamically set
+        dsa->z_buffer_top = R300_ZTOP_ENABLE; */
     }
 
     return (void*)dsa;
@@ -284,14 +273,9 @@ static void
 static void* r300_create_fs_state(struct pipe_context* pipe,
                                   const struct pipe_shader_state* shader)
 {
-    struct r300_context* r300 = r300_context(pipe);
     struct r300_fragment_shader* fs = NULL;
 
-    if (r300_screen(r300->context.screen)->caps->is_r500) {
-        fs = (struct r300_fragment_shader*)CALLOC_STRUCT(r5xx_fragment_shader);
-    } else {
-        fs = (struct r300_fragment_shader*)CALLOC_STRUCT(r3xx_fragment_shader);
-    }
+    fs = (struct r300_fragment_shader*)CALLOC_STRUCT(r300_fragment_shader);
 
     /* Copy state directly into shader. */
     fs->state = *shader;
@@ -315,7 +299,6 @@ static void r300_bind_fs_state(struct pipe_context* pipe, void* shader)
         r300_translate_fragment_shader(r300, fs);
     }
 
-    fs->translated = TRUE;
     r300->fs = fs;
 
     r300->dirty_state |= R300_NEW_FRAGMENT_SHADER;
@@ -325,6 +308,7 @@ static void r300_bind_fs_state(struct pipe_context* pipe, void* shader)
 static void r300_delete_fs_state(struct pipe_context* pipe, void* shader)
 {
     struct r300_fragment_shader* fs = (struct r300_fragment_shader*)shader;
+    rc_constants_destroy(&fs->code.constants);
     FREE(fs->state.tokens);
     FREE(shader);
 }
@@ -446,6 +430,9 @@ static void r300_bind_rs_state(struct pipe_context* pipe, void* state)
 
     r300->rs_state = rs;
     r300->dirty_state |= R300_NEW_RASTERIZER;
+    r300->dirty_state |= R300_NEW_RS_BLOCK;
+    r300->dirty_state |= R300_NEW_SCISSOR;
+    r300->dirty_state |= R300_NEW_VIEWPORT;
 }
 
 /* Free rasterizer state. */
@@ -555,16 +542,16 @@ static void r300_set_scissor_state(struct pipe_context* pipe,
             (state->minx << R300_SCISSORS_X_SHIFT) |
             (state->miny << R300_SCISSORS_Y_SHIFT);
         r300->scissor_state->scissor_bottom_right =
-            (state->maxx << R300_SCISSORS_X_SHIFT) |
-            (state->maxy << R300_SCISSORS_Y_SHIFT);
+            ((state->maxx - 1) << R300_SCISSORS_X_SHIFT) |
+            ((state->maxy - 1) << R300_SCISSORS_Y_SHIFT);
     } else {
         /* Offset of 1440 in non-R500 chipsets. */
         r300->scissor_state->scissor_top_left =
             ((state->minx + 1440) << R300_SCISSORS_X_SHIFT) |
             ((state->miny + 1440) << R300_SCISSORS_Y_SHIFT);
         r300->scissor_state->scissor_bottom_right =
-            ((state->maxx + 1440) << R300_SCISSORS_X_SHIFT) |
-            ((state->maxy + 1440) << R300_SCISSORS_Y_SHIFT);
+            (((state->maxx - 1) + 1440) << R300_SCISSORS_X_SHIFT) |
+            (((state->maxy - 1) + 1440) << R300_SCISSORS_Y_SHIFT);
     }
 
     r300->dirty_state |= R300_NEW_SCISSOR;
@@ -688,6 +675,7 @@ static void r300_delete_vs_state(struct pipe_context* pipe, void* shader)
     if (r300_screen(pipe->screen)->caps->has_tcl) {
         struct r300_vertex_shader* vs = (struct r300_vertex_shader*)shader;
 
+        rc_constants_destroy(&vs->code.constants);
         draw_delete_vertex_shader(r300->draw, vs->draw);
         FREE(vs->state.tokens);
         FREE(shader);
diff --git a/src/gallium/drivers/r300/r300_state_derived.c b/src/gallium/drivers/r300/r300_state_derived.c
index 2477b30822..5f6b225d34 100644
--- a/src/gallium/drivers/r300/r300_state_derived.c
+++ b/src/gallium/drivers/r300/r300_state_derived.c
@@ -22,6 +22,10 @@
 
 #include "r300_state_derived.h"
 
+#include "r300_fs.h"
+#include "r300_state_inlines.h"
+#include "r300_vs.h"
+
 /* r300_state_derived: Various bits of state which are dependent upon
  * currently bound CSO data. */
 
@@ -46,59 +50,63 @@ static void r300_vs_tab_routes(struct r300_context* r300,
 
     assert(info->num_inputs <= 16);
 
-    if (r300screen->caps->has_tcl) {
-        /* Just copy vert attribs over as-is. */
+    if (!r300screen->caps->has_tcl || !r300->rs_state->enable_vte)
+    {
         for (i = 0; i < info->num_inputs; i++) {
-            tab[i] = i;
-        }
-        for (i = 0; i < info->num_outputs; i++) {
-            switch (info->output_semantic_name[i]) {
+            switch (info->input_semantic_name[i]) {
                 case TGSI_SEMANTIC_POSITION:
                     pos = TRUE;
+                    tab[i] = 0;
                     break;
                 case TGSI_SEMANTIC_COLOR:
+                    tab[i] = 2 + cols;
                     cols++;
                     break;
                 case TGSI_SEMANTIC_PSIZE:
                     psize = TRUE;
+                    tab[i] = 15;
                     break;
                 case TGSI_SEMANTIC_FOG:
                     fog = TRUE;
                     /* Fall through */
                 case TGSI_SEMANTIC_GENERIC:
+                    tab[i] = 6 + texs;
                     texs++;
                     break;
                 default:
-                    debug_printf("r300: Unknown vertex output %d\n",
-                        info->output_semantic_name[i]);
+                    debug_printf("r300: Unknown vertex input %d\n",
+                        info->input_semantic_name[i]);
                     break;
             }
         }
-    } else {
+    }
+    else
+    {
+        /* Just copy vert attribs over as-is. */
         for (i = 0; i < info->num_inputs; i++) {
-            switch (info->input_semantic_name[i]) {
+            tab[i] = i;
+        }
+
+        for (i = 0; i < info->num_outputs; i++) {
+            switch (info->output_semantic_name[i]) {
                 case TGSI_SEMANTIC_POSITION:
                     pos = TRUE;
-                    tab[i] = 0;
                     break;
                 case TGSI_SEMANTIC_COLOR:
-                    tab[i] = 2 + cols;
                     cols++;
                     break;
                 case TGSI_SEMANTIC_PSIZE:
                     psize = TRUE;
-                    tab[i] = 15;
                     break;
                 case TGSI_SEMANTIC_FOG:
                     fog = TRUE;
                     /* Fall through */
                 case TGSI_SEMANTIC_GENERIC:
-                    tab[i] = 6 + texs;
                     texs++;
                     break;
                 default:
-                    debug_printf("r300: Unknown vertex input %d\n",
-                        info->input_semantic_name[i]);
+                    debug_printf("r300: Unknown vertex output %d\n",
+                        info->output_semantic_name[i]);
                     break;
             }
         }
@@ -188,13 +196,13 @@ static void r300_vertex_psc(struct r300_context* r300,
      * and not on attrib information. */
     if (r300screen->caps->has_tcl) {
         attrib_count = r300->vs->info.num_inputs;
-        debug_printf("r300: routing %d attribs in psc for vs\n",
+        DBG(r300, DBG_DRAW, "r300: routing %d attribs in psc for vs\n",
                 attrib_count);
     } else {
         attrib_count = vinfo->num_attribs;
-        debug_printf("r300: attrib count: %d\n", attrib_count);
+        DBG(r300, DBG_DRAW, "r300: attrib count: %d\n", attrib_count);
         for (i = 0; i < attrib_count; i++) {
-            debug_printf("r300: attrib: offset %d, interp %d, size %d,"
+            DBG(r300, DBG_DRAW, "r300: attrib: offset %d, interp %d, size %d,"
                    " tab %d\n", vinfo->attrib[i].src_index,
                    vinfo->attrib[i].interp_mode, vinfo->attrib[i].emit,
                    tab[i]);
@@ -292,18 +300,18 @@ static void r300_update_fs_tab(struct r300_context* r300)
     }
 
     /* Now that we know where everything is... */
-    debug_printf("r300: fp input count: %d\n", info->num_inputs);
+    DBG(r300, DBG_DRAW, "r300: fp input count: %d\n", info->num_inputs);
     for (i = 0; i < info->num_inputs; i++) {
         switch (tab[i]) {
             case INTERP_LINEAR:
-                debug_printf("r300: attrib: "
+                DBG(r300, DBG_DRAW, "r300: attrib: "
                         "stack offset %d, color,    tab %d\n",
                         i, cols_emitted);
                 tab[i] = cols_emitted;
                 cols_emitted++;
                 break;
             case INTERP_PERSPECTIVE:
-                debug_printf("r300: attrib: "
+                DBG(r300, DBG_DRAW, "r300: attrib: "
                         "stack offset %d, texcoord, tab %d\n",
                         i, cols + texs);
                 tab[i] = cols + texs;
diff --git a/src/gallium/drivers/r300/r300_state_derived.h b/src/gallium/drivers/r300/r300_state_derived.h
index 63ae8eb8d0..71a4a47b00 100644
--- a/src/gallium/drivers/r300/r300_state_derived.h
+++ b/src/gallium/drivers/r300/r300_state_derived.h
@@ -23,11 +23,7 @@
 #ifndef R300_STATE_DERIVED_H
 #define R300_STATE_DERIVED_H
 
-#include "draw/draw_vertex.h"
-
-#include "r300_context.h"
-#include "r300_reg.h"
-#include "r300_state_inlines.h"
+struct r300_context;
 
 void r300_update_derived_state(struct r300_context* r300);
 
diff --git a/src/gallium/drivers/r300/r300_state_inlines.h b/src/gallium/drivers/r300/r300_state_inlines.h
index 22c8e199ae..91b93fc367 100644
--- a/src/gallium/drivers/r300/r300_state_inlines.h
+++ b/src/gallium/drivers/r300/r300_state_inlines.h
@@ -353,6 +353,25 @@ static INLINE uint32_t r300_translate_out_fmt(enum pipe_format format)
 
 /* Non-CSO state. (For now.) */
 
+static INLINE uint32_t r300_translate_gb_pipes(int pipe_count)
+{
+    switch (pipe_count) {
+        case 1:
+            return R300_GB_TILE_PIPE_COUNT_RV300;
+            break;
+        case 2:
+            return R300_GB_TILE_PIPE_COUNT_R300;
+            break;
+        case 3:
+            return R300_GB_TILE_PIPE_COUNT_R420_3P;
+            break;
+        case 4:
+            return R300_GB_TILE_PIPE_COUNT_R420;
+            break;
+    }
+    return 0;
+}
+
 static INLINE uint32_t translate_vertex_data_type(int type) {
     switch (type) {
         case EMIT_1F:
diff --git a/src/gallium/drivers/r300/r300_state_invariant.c b/src/gallium/drivers/r300/r300_state_invariant.c
index 430129d5bd..3865730d63 100644
--- a/src/gallium/drivers/r300/r300_state_invariant.c
+++ b/src/gallium/drivers/r300/r300_state_invariant.c
@@ -23,6 +23,12 @@
 
 #include "r300_state_invariant.h"
 
+/* Identity viewport: unit scale and zero translation on all four components. */
+struct pipe_viewport_state r300_viewport_identity = {
+    .scale = {1.0, 1.0, 1.0, 1.0},
+    .translate = {0.0, 0.0, 0.0, 0.0},
+};
+
 /* Calculate and emit invariant state. This is data that the 3D engine
  * will probably want at the beginning of every CS, but it's not currently
  * handled by any CSO setup, and in addition it doesn't really change much.
@@ -34,11 +40,13 @@ void r300_emit_invariant_state(struct r300_context* r300)
     struct r300_capabilities* caps = r300_screen(r300->context.screen)->caps;
     CS_LOCALS(r300);
 
-    BEGIN_CS(22 + (caps->has_tcl ? 2: 0));
+    BEGIN_CS(24 + (caps->has_tcl ? 2: 0));
 
     /*** Graphics Backend (GB) ***/
     /* Various GB enables */
-    OUT_CS_REG(R300_GB_ENABLE, 0x0);
+    OUT_CS_REG(R300_GB_ENABLE, R300_GB_POINT_STUFF_ENABLE |
+                               R300_GB_LINE_STUFF_ENABLE  |
+                               R300_GB_TRIANGLE_STUFF_ENABLE);
     /* Subpixel multisampling for AA
      * These are commented out because glisse's CS checker doesn't like them.
      * I presume these will be re-enabled later.
@@ -56,6 +64,7 @@ void r300_emit_invariant_state(struct r300_context* r300)
     OUT_CS_REG(R300_FG_FOG_COLOR_G, 0x0);
     OUT_CS_REG(R300_FG_FOG_COLOR_B, 0x0);
     OUT_CS_REG(R300_FG_DEPTH_SRC, 0x0);
+    OUT_CS_REG(R300_US_W_FMT, 0x0);
 
     /*** VAP ***/
     /* Max and min vertex index clamp. */
@@ -72,7 +81,7 @@ void r300_emit_invariant_state(struct r300_context* r300)
     END_CS;
 
     /* XXX unsorted stuff from surface_fill */
-    BEGIN_CS(71 + (caps->has_tcl ? 5 : 0) + (caps->is_r500 ? 4 : 0));
+    BEGIN_CS(64 + (caps->has_tcl ? 5 : 0) + (caps->is_r500 ? 4 : 0));
     /* Flush PVS. */
     OUT_CS_REG(R300_VAP_PVS_STATE_FLUSH_REG, 0x0);
 
@@ -132,11 +141,5 @@ void r300_emit_invariant_state(struct r300_context* r300)
     /* XXX */
     OUT_CS_REG(R300_SC_CLIP_RULE, 0xaaaa);
 
-    OUT_CS_REG_SEQ(R300_US_OUT_FMT_0, 4);
-    OUT_CS(R300_C0_SEL_B | R300_C1_SEL_G | R300_C2_SEL_R | R300_C3_SEL_A);
-    OUT_CS(R300_US_OUT_FMT_UNUSED);
-    OUT_CS(R300_US_OUT_FMT_UNUSED);
-    OUT_CS(R300_US_OUT_FMT_UNUSED);
-    OUT_CS_REG(R300_US_W_FMT, R300_W_FMT_W0);
     END_CS;
 }
diff --git a/src/gallium/drivers/r300/r300_surface.c b/src/gallium/drivers/r300/r300_surface.c
index fdabe4d9cf..cc6288cb51 100644
--- a/src/gallium/drivers/r300/r300_surface.c
+++ b/src/gallium/drivers/r300/r300_surface.c
@@ -29,7 +29,7 @@ static void r300_surface_setup(struct r300_context* r300,
                                unsigned w, unsigned h)
 {
     struct r300_capabilities* caps = r300_screen(r300->context.screen)->caps;
-    unsigned pixpitch = dest->stride / dest->tex.block.size;
+    unsigned pixpitch = r300_texture_get_stride(dest, 0) / dest->tex.block.size;
     CS_LOCALS(r300);
 
     r300_emit_blend_state(r300, &blend_clear_state);
@@ -37,7 +37,7 @@ static void r300_surface_setup(struct r300_context* r300,
     r300_emit_dsa_state(r300, &dsa_clear_state);
     r300_emit_rs_state(r300, &rs_clear_state);
 
-    BEGIN_CS(24);
+    BEGIN_CS(26);
 
     /* Viewport setup */
     OUT_CS_REG_SEQ(R300_SE_VPORT_XSCALE, 6);
@@ -58,13 +58,13 @@ static void r300_surface_setup(struct r300_context* r300,
     OUT_CS_REG_SEQ(R300_SC_SCISSORS_TL, 2);
     if (caps->is_r500) {
         OUT_CS((x << R300_SCISSORS_X_SHIFT) | (y << R300_SCISSORS_Y_SHIFT));
-        OUT_CS((w << R300_SCISSORS_X_SHIFT) | (h << R300_SCISSORS_Y_SHIFT));
+        OUT_CS(((w - 1) << R300_SCISSORS_X_SHIFT) | ((h - 1) << R300_SCISSORS_Y_SHIFT));
     } else {
         /* Non-R500 chipsets have an offset of 1440 in their scissors. */
         OUT_CS(((x + 1440) << R300_SCISSORS_X_SHIFT) |
                 ((y + 1440) << R300_SCISSORS_Y_SHIFT));
-        OUT_CS(((w + 1440) << R300_SCISSORS_X_SHIFT) |
-                ((h + 1440) << R300_SCISSORS_Y_SHIFT));
+        OUT_CS((((w - 1) + 1440) << R300_SCISSORS_X_SHIFT) |
+                (((h - 1) + 1440) << R300_SCISSORS_Y_SHIFT));
     }
 
     /* Flush colorbuffer and blend caches. */
@@ -78,8 +78,10 @@ static void r300_surface_setup(struct r300_context* r300,
     /* Setup colorbuffer. */
     OUT_CS_REG_SEQ(R300_RB3D_COLOROFFSET0, 1);
     OUT_CS_RELOC(dest->buffer, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
-    OUT_CS_REG(R300_RB3D_COLORPITCH0, pixpitch |
-        r300_translate_colorformat(dest->tex.format));
+    OUT_CS_REG_SEQ(R300_RB3D_COLORPITCH0, 1);
+    OUT_CS_RELOC(dest->buffer, pixpitch |
+                 r300_translate_colorformat(dest->tex.format), 0,
+                 RADEON_GEM_DOMAIN_VRAM, 0);
     OUT_CS_REG(RB3D_COLOR_CHANNEL_MASK, 0xf);
 
     END_CS;
@@ -98,7 +100,7 @@ static void r300_surface_fill(struct pipe_context* pipe,
     struct r300_context* r300 = r300_context(pipe);
     struct r300_capabilities* caps = r300_screen(pipe->screen)->caps;
     struct r300_texture* tex = (struct r300_texture*)dest->texture;
-    unsigned pixpitch = tex->stride / tex->tex.block.size;
+    unsigned pixpitch = r300_texture_get_stride(tex, 0) / tex->tex.block.size;
     boolean invalid = FALSE;
     CS_LOCALS(r300);
 
@@ -125,9 +127,10 @@ validate:
         r300->context.flush(&r300->context, 0, NULL);
         goto validate;
     }
-    if (r300->winsys->validate(r300->winsys)) {
+    if (!r300->winsys->validate(r300->winsys)) {
         r300->context.flush(&r300->context, 0, NULL);
         if (invalid) {
+            debug_printf("r300: Stuck in validation loop, gonna fallback.");
             goto fallback;
         }
         invalid = TRUE;
@@ -138,10 +141,14 @@ validate:
 
     /* Vertex shader setup */
     if (caps->has_tcl) {
-        r300_emit_vertex_shader(r300, &r300_passthrough_vertex_shader);
+        r300_emit_vertex_program_code(r300, &r300_passthrough_vertex_shader, 0);
     } else {
         BEGIN_CS(4);
-        OUT_CS_REG(R300_VAP_CNTL_STATUS, R300_VAP_TCL_BYPASS);
+        OUT_CS_REG(R300_VAP_CNTL_STATUS,
+#ifdef PIPE_ARCH_BIG_ENDIAN
+                   R300_VC_32BIT_SWAP |
+#endif
+                   R300_VAP_TCL_BYPASS);
         OUT_CS_REG(R300_VAP_CNTL, R300_PVS_NUM_SLOTS(5) |
                 R300_PVS_NUM_CNTLRS(5) |
                 R300_PVS_NUM_FPUS(caps->num_vert_fpus) |
@@ -151,10 +158,10 @@ validate:
 
     /* Fragment shader setup */
     if (caps->is_r500) {
-        r500_emit_fragment_shader(r300, &r5xx_passthrough_fragment_shader);
+        r500_emit_fragment_program_code(r300, &r5xx_passthrough_fragment_shader, 0);
         r300_emit_rs_block_state(r300, &r5xx_rs_block_clear_state);
     } else {
-        r300_emit_fragment_shader(r300, &r3xx_passthrough_fragment_shader);
+        r300_emit_fragment_program_code(r300, &r3xx_passthrough_fragment_shader, 0);
         r300_emit_rs_block_state(r300, &r3xx_rs_block_clear_state);
     }
 
@@ -226,7 +233,7 @@ static void r300_surface_copy(struct pipe_context* pipe,
     struct r300_capabilities* caps = r300_screen(pipe->screen)->caps;
     struct r300_texture* srctex = (struct r300_texture*)src->texture;
     struct r300_texture* desttex = (struct r300_texture*)dest->texture;
-    unsigned pixpitch = srctex->stride / srctex->tex.block.size;
+    unsigned pixpitch = r300_texture_get_stride(srctex, 0) / srctex->tex.block.size;
     boolean invalid = FALSE;
     float fsrcx = srcx, fsrcy = srcy, fdestx = destx, fdesty = desty;
     CS_LOCALS(r300);
@@ -256,9 +263,10 @@ validate:
         r300->context.flush(&r300->context, 0, NULL);
         goto validate;
     }
-    if (r300->winsys->validate(r300->winsys)) {
+    if (!r300->winsys->validate(r300->winsys)) {
         r300->context.flush(&r300->context, 0, NULL);
         if (invalid) {
+            debug_printf("r300: Stuck in validation loop, gonna fallback.");
             goto fallback;
         }
         invalid = TRUE;
@@ -275,10 +283,14 @@ validate:
 
     /* Vertex shader setup */
     if (caps->has_tcl) {
-        r300_emit_vertex_shader(r300, &r300_passthrough_vertex_shader);
+        r300_emit_vertex_program_code(r300, &r300_passthrough_vertex_shader, 0);
     } else {
         BEGIN_CS(4);
-        OUT_CS_REG(R300_VAP_CNTL_STATUS, R300_VAP_TCL_BYPASS);
+        OUT_CS_REG(R300_VAP_CNTL_STATUS,
+#ifdef PIPE_ARCH_BIG_ENDIAN
+                   R300_VC_32BIT_SWAP |
+#endif
+                   R300_VAP_TCL_BYPASS);
         OUT_CS_REG(R300_VAP_CNTL, R300_PVS_NUM_SLOTS(5) |
                 R300_PVS_NUM_CNTLRS(5) |
                 R300_PVS_NUM_FPUS(caps->num_vert_fpus) |
@@ -288,10 +300,10 @@ validate:
 
     /* Fragment shader setup */
     if (caps->is_r500) {
-        r500_emit_fragment_shader(r300, &r5xx_texture_fragment_shader);
+        r500_emit_fragment_program_code(r300, &r5xx_texture_fragment_shader, 0);
         r300_emit_rs_block_state(r300, &r5xx_rs_block_copy_state);
     } else {
-        r300_emit_fragment_shader(r300, &r3xx_texture_fragment_shader);
+        r300_emit_fragment_program_code(r300, &r3xx_texture_fragment_shader, 0);
         r300_emit_rs_block_state(r300, &r3xx_rs_block_copy_state);
     }
 
diff --git a/src/gallium/drivers/r300/r300_surface.h b/src/gallium/drivers/r300/r300_surface.h
index d01f0b143f..f9e98b2ec9 100644
--- a/src/gallium/drivers/r300/r300_surface.h
+++ b/src/gallium/drivers/r300/r300_surface.h
@@ -73,9 +73,9 @@ static struct r300_rs_state rs_clear_state = {
 };
 
 static struct r300_rs_block r3xx_rs_block_clear_state = {
-    .ip[0] = R500_RS_SEL_S(R300_RS_SEL_K0) |
-        R500_RS_SEL_T(R300_RS_SEL_K0) |
-        R500_RS_SEL_R(R300_RS_SEL_K0) |
+    .ip[0] = R500_RS_SEL_S(R300_RS_SEL_C0) |
+        R500_RS_SEL_T(R300_RS_SEL_C0) |
+        R500_RS_SEL_R(R300_RS_SEL_C0) |
         R500_RS_SEL_Q(R300_RS_SEL_K1),
     .inst[0] = R300_RS_INST_COL_CN_WRITE,
     .count = R300_IT_COUNT(0) | R300_IC_COUNT(1) | R300_HIRES_EN,
diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c
index 11c7858d42..7c041d17f7 100644
--- a/src/gallium/drivers/r300/r300_texture.c
+++ b/src/gallium/drivers/r300/r300_texture.c
@@ -22,17 +22,9 @@
 
 #include "r300_texture.h"
 
-/* XXX maths need to go to util */
-
-static int minify(int i)
-{
-    return MAX2(1, i >> 1);
-}
-
 static void r300_setup_texture_state(struct r300_texture* tex,
                                      unsigned width,
                                      unsigned height,
-                                     unsigned pitch,
                                      unsigned levels)
 {
     struct r300_texture_state* state = &tex->state;
@@ -45,7 +37,7 @@ static void r300_setup_texture_state(struct r300_texture* tex,
     /* XXX */
     state->format1 = r300_translate_texformat(tex->tex.format);
 
-    state->format2 = pitch - 1;
+    state->format2 = r300_texture_get_stride(tex, 0);
 
     /* Assume (somewhat foolishly) that oversized textures will
      * not be permitted by the state tracker. */
@@ -55,12 +47,32 @@ static void r300_setup_texture_state(struct r300_texture* tex,
     if (height > 2048) {
         state->format2 |= R500_TXHEIGHT_BIT11;
     }
+
+    debug_printf("r300: Set texture state (%dx%d, %d levels)\n",
+            width, height, levels);
+}
+
+/** Return the stride, in bytes, of the texture image at the given level,
+ * passed through align(..., 32). A non-zero stride_override is returned as-is
+ * without checking level; an out-of-range level is logged and yields 0.
+ */
+unsigned r300_texture_get_stride(struct r300_texture* tex, unsigned level)
+{
+    if (tex->stride_override)
+        return tex->stride_override;
+
+    if (level > tex->tex.last_level) {
+        debug_printf("%s: level (%u) > last_level (%u)\n", __FUNCTION__, level, tex->tex.last_level);
+        return 0;
+    }
+
+    return align(pf_get_stride(&tex->tex.block, tex->tex.width[level]), 32);
 }
 
 static void r300_setup_miptree(struct r300_texture* tex)
 {
     struct pipe_texture* base = &tex->tex;
-    int stride, size, offset;
+    int stride, size;
     int i;
 
     for (i = 0; i <= base->last_level; i++) {
@@ -71,23 +83,23 @@ static void r300_setup_miptree(struct r300_texture* tex)
         }
 
         base->nblocksx[i] = pf_get_nblocksx(&base->block, base->width[i]);
-        base->nblocksy[i] = pf_get_nblocksy(&base->block, base->width[i]);
+        base->nblocksy[i] = pf_get_nblocksy(&base->block, base->height[i]);
 
         /* Radeons enjoy things in multiples of 64.
          *
          * XXX
          * POT, uncompressed, unmippmapped textures can be aligned to 32,
          * instead of 64. */
-        stride = align(base->nblocksx[i] * base->block.size, 64);
+        stride = r300_texture_get_stride(tex, i);
         size = stride * base->nblocksy[i] * base->depth[i];
 
-        tex->offset[i] = align(tex->size, 64);
+        tex->offset[i] = align(tex->size, 32);
         tex->size = tex->offset[i] + size;
 
-        /* Save stride of first level to the texture. */
-        if (i == 0) {
-            tex->stride = stride;
-        }
+        debug_printf("r300: Texture miptree: Level %d "
+                "(%dx%dx%d px, pitch %d bytes)\n",
+                i, base->width[i], base->height[i], base->depth[i],
+                stride);
     }
 }
 
@@ -109,9 +121,9 @@ static struct pipe_texture*
     r300_setup_miptree(tex);
 
     r300_setup_texture_state(tex, template->width[0], template->height[0],
-            template->width[0], template->last_level);
+                             template->last_level);
 
-    tex->buffer = screen->buffer_create(screen, 64,
+    tex->buffer = screen->buffer_create(screen, 1024,
                                         PIPE_BUFFER_USAGE_PIXEL,
                                         tex->size);
 
@@ -189,11 +201,10 @@ static struct pipe_texture*
     pipe_reference_init(&tex->tex.reference, 1);
     tex->tex.screen = screen;
 
-    tex->stride = *stride;
+    tex->stride_override = *stride;
 
     /* XXX */
-    r300_setup_texture_state(tex, tex->tex.width[0], tex->tex.height[0],
-            tex->stride, 0);
+    r300_setup_texture_state(tex, tex->tex.width[0], tex->tex.height[0], 0);
 
     pipe_buffer_reference(&tex->buffer, buffer);
 
@@ -221,7 +232,7 @@ boolean r300_get_texture_buffer(struct pipe_texture* texture,
     pipe_buffer_reference(buffer, tex->buffer);
 
     if (stride) {
-        *stride = tex->stride;
+        *stride = r300_texture_get_stride(tex, 0);
     }
 
     return TRUE;
diff --git a/src/gallium/drivers/r300/r300_texture.h b/src/gallium/drivers/r300/r300_texture.h
index 3b56f0307c..3109af5bac 100644
--- a/src/gallium/drivers/r300/r300_texture.h
+++ b/src/gallium/drivers/r300/r300_texture.h
@@ -30,8 +30,12 @@
 #include "r300_context.h"
 #include "r300_reg.h"
 
+struct r300_texture;
+
 void r300_init_screen_texture_functions(struct pipe_screen* screen);
 
+unsigned r300_texture_get_stride(struct r300_texture* tex, unsigned level);
+
 /* Note the signature of R300_EASY_TX_FORMAT(A, R, G, B, FORMAT)... */
 static INLINE uint32_t r300_translate_texformat(enum pipe_format format)
 {
diff --git a/src/gallium/drivers/r300/r300_tgsi_to_rc.c b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
new file mode 100644
index 0000000000..0913ca1bd5
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
@@ -0,0 +1,335 @@
+/*
+ * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "r300_tgsi_to_rc.h"
+
+#include "radeon_compiler.h"
+#include "radeon_program.h"
+
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_scan.h"
+#include "tgsi/tgsi_util.h"
+
+
+/* Map a TGSI opcode to the corresponding OPCODE_* value.
+ * Opcodes without a mapping are reported on stderr and abort(). */
+static unsigned translate_opcode(unsigned opcode)
+{
+    switch(opcode) {
+        case TGSI_OPCODE_ARL: return OPCODE_ARL;
+        case TGSI_OPCODE_MOV: return OPCODE_MOV;
+        case TGSI_OPCODE_LIT: return OPCODE_LIT;
+        case TGSI_OPCODE_RCP: return OPCODE_RCP;
+        case TGSI_OPCODE_RSQ: return OPCODE_RSQ;
+        case TGSI_OPCODE_EXP: return OPCODE_EXP;
+        case TGSI_OPCODE_LOG: return OPCODE_LOG;
+        case TGSI_OPCODE_MUL: return OPCODE_MUL;
+        case TGSI_OPCODE_ADD: return OPCODE_ADD;
+        case TGSI_OPCODE_DP3: return OPCODE_DP3;
+        case TGSI_OPCODE_DP4: return OPCODE_DP4;
+        case TGSI_OPCODE_DST: return OPCODE_DST;
+        case TGSI_OPCODE_MIN: return OPCODE_MIN;
+        case TGSI_OPCODE_MAX: return OPCODE_MAX;
+        case TGSI_OPCODE_SLT: return OPCODE_SLT;
+        case TGSI_OPCODE_SGE: return OPCODE_SGE;
+        case TGSI_OPCODE_MAD: return OPCODE_MAD;
+        case TGSI_OPCODE_SUB: return OPCODE_SUB;
+        case TGSI_OPCODE_LRP: return OPCODE_LRP;
+     /* case TGSI_OPCODE_CND: return OPCODE_CND; */
+        case TGSI_OPCODE_DP2A: return OPCODE_DP2A;
+                                        /* gap */
+        case TGSI_OPCODE_FRC: return OPCODE_FRC;
+     /* case TGSI_OPCODE_CLAMP: return OPCODE_CLAMP; */
+        case TGSI_OPCODE_FLR: return OPCODE_FLR;
+     /* case TGSI_OPCODE_ROUND: return OPCODE_ROUND; */
+        case TGSI_OPCODE_EX2: return OPCODE_EX2;
+        case TGSI_OPCODE_LG2: return OPCODE_LG2;
+        case TGSI_OPCODE_POW: return OPCODE_POW;
+        case TGSI_OPCODE_XPD: return OPCODE_XPD;
+                                        /* gap */
+        case TGSI_OPCODE_ABS: return OPCODE_ABS;
+        case TGSI_OPCODE_RCC: return OPCODE_RCC;
+        case TGSI_OPCODE_DPH: return OPCODE_DPH;
+        case TGSI_OPCODE_COS: return OPCODE_COS;
+        case TGSI_OPCODE_DDX: return OPCODE_DDX;
+        case TGSI_OPCODE_DDY: return OPCODE_DDY;
+     /* case TGSI_OPCODE_KILP: return OPCODE_KILP; */
+        case TGSI_OPCODE_PK2H: return OPCODE_PK2H;
+        case TGSI_OPCODE_PK2US: return OPCODE_PK2US;
+        case TGSI_OPCODE_PK4B: return OPCODE_PK4B;
+        case TGSI_OPCODE_PK4UB: return OPCODE_PK4UB;
+        case TGSI_OPCODE_RFL: return OPCODE_RFL;
+        case TGSI_OPCODE_SEQ: return OPCODE_SEQ;
+        case TGSI_OPCODE_SFL: return OPCODE_SFL;
+        case TGSI_OPCODE_SGT: return OPCODE_SGT;
+        case TGSI_OPCODE_SIN: return OPCODE_SIN;
+        case TGSI_OPCODE_SLE: return OPCODE_SLE;
+        case TGSI_OPCODE_SNE: return OPCODE_SNE;
+        case TGSI_OPCODE_STR: return OPCODE_STR;
+        case TGSI_OPCODE_TEX: return OPCODE_TEX;
+        case TGSI_OPCODE_TXD: return OPCODE_TXD;
+        case TGSI_OPCODE_TXP: return OPCODE_TXP;
+        case TGSI_OPCODE_UP2H: return OPCODE_UP2H;
+        case TGSI_OPCODE_UP2US: return OPCODE_UP2US;
+        case TGSI_OPCODE_UP4B: return OPCODE_UP4B;
+        case TGSI_OPCODE_UP4UB: return OPCODE_UP4UB;
+        case TGSI_OPCODE_X2D: return OPCODE_X2D;
+        case TGSI_OPCODE_ARA: return OPCODE_ARA;
+        case TGSI_OPCODE_ARR: return OPCODE_ARR;
+        case TGSI_OPCODE_BRA: return OPCODE_BRA;
+        case TGSI_OPCODE_CAL: return OPCODE_CAL;
+        case TGSI_OPCODE_RET: return OPCODE_RET;
+        case TGSI_OPCODE_SSG: return OPCODE_SSG;
+        case TGSI_OPCODE_CMP: return OPCODE_CMP;
+        case TGSI_OPCODE_SCS: return OPCODE_SCS;
+        case TGSI_OPCODE_TXB: return OPCODE_TXB;
+     /* case TGSI_OPCODE_NRM: return OPCODE_NRM; */
+     /* case TGSI_OPCODE_DIV: return OPCODE_DIV; */
+        case TGSI_OPCODE_DP2: return OPCODE_DP2;
+        case TGSI_OPCODE_TXL: return OPCODE_TXL;
+        case TGSI_OPCODE_BRK: return OPCODE_BRK;
+        case TGSI_OPCODE_IF: return OPCODE_IF;
+     /* case TGSI_OPCODE_LOOP: return OPCODE_LOOP; */
+     /* case TGSI_OPCODE_REP: return OPCODE_REP; */
+        case TGSI_OPCODE_ELSE: return OPCODE_ELSE;
+        case TGSI_OPCODE_ENDIF: return OPCODE_ENDIF;
+        case TGSI_OPCODE_ENDLOOP: return OPCODE_ENDLOOP;
+     /* case TGSI_OPCODE_ENDREP: return OPCODE_ENDREP; */
+        case TGSI_OPCODE_PUSHA: return OPCODE_PUSHA;
+        case TGSI_OPCODE_POPA: return OPCODE_POPA;
+     /* case TGSI_OPCODE_CEIL: return OPCODE_CEIL; */
+     /* case TGSI_OPCODE_I2F: return OPCODE_I2F; */
+        case TGSI_OPCODE_NOT: return OPCODE_NOT;
+        case TGSI_OPCODE_TRUNC: return OPCODE_TRUNC;
+     /* case TGSI_OPCODE_SHL: return OPCODE_SHL; */
+     /* case TGSI_OPCODE_SHR: return OPCODE_SHR; */
+        case TGSI_OPCODE_AND: return OPCODE_AND;
+        case TGSI_OPCODE_OR: return OPCODE_OR;
+     /* case TGSI_OPCODE_MOD: return OPCODE_MOD; */
+        case TGSI_OPCODE_XOR: return OPCODE_XOR;
+     /* case TGSI_OPCODE_SAD: return OPCODE_SAD; */
+     /* case TGSI_OPCODE_TXF: return OPCODE_TXF; */
+     /* case TGSI_OPCODE_TXQ: return OPCODE_TXQ; */
+        case TGSI_OPCODE_CONT: return OPCODE_CONT;
+     /* case TGSI_OPCODE_EMIT: return OPCODE_EMIT; */
+     /* case TGSI_OPCODE_ENDPRIM: return OPCODE_ENDPRIM; */
+        case TGSI_OPCODE_BGNLOOP: return OPCODE_BGNLOOP;
+        case TGSI_OPCODE_BGNSUB: return OPCODE_BGNSUB;
+        case TGSI_OPCODE_ENDSUB: return OPCODE_ENDSUB;
+        case TGSI_OPCODE_NOISE1: return OPCODE_NOISE1;
+        case TGSI_OPCODE_NOISE2: return OPCODE_NOISE2;
+        case TGSI_OPCODE_NOISE3: return OPCODE_NOISE3;
+        case TGSI_OPCODE_NOISE4: return OPCODE_NOISE4;
+        case TGSI_OPCODE_NOP: return OPCODE_NOP;
+                                        /* gap */
+        case TGSI_OPCODE_NRM4: return OPCODE_NRM4;
+     /* case TGSI_OPCODE_CALLNZ: return OPCODE_CALLNZ; */
+     /* case TGSI_OPCODE_IFC: return OPCODE_IFC; */
+     /* case TGSI_OPCODE_BREAKC: return OPCODE_BREAKC; */
+        case TGSI_OPCODE_KIL: return OPCODE_KIL;
+        case TGSI_OPCODE_END: return OPCODE_END;
+        case TGSI_OPCODE_SWZ: return OPCODE_SWZ;
+    }
+    fprintf(stderr, "Unknown opcode: %u\n", opcode);
+    abort();
+}
+
+/* Map a TGSI saturate mode to SATURATE_*; unknown modes abort(). */
+static unsigned translate_saturate(unsigned saturate)
+{
+    switch(saturate) {
+        case TGSI_SAT_NONE: return SATURATE_OFF;
+        case TGSI_SAT_ZERO_ONE: return SATURATE_ZERO_ONE;
+    }
+    fprintf(stderr, "Unknown saturate mode: %u\n", saturate);
+    abort();
+}
+
+/* Map a TGSI register file to PROGRAM_*; immediates share PROGRAM_CONSTANT. */
+static unsigned translate_register_file(unsigned file)
+{
+    switch(file) {
+        case TGSI_FILE_CONSTANT: return PROGRAM_CONSTANT;
+        case TGSI_FILE_IMMEDIATE: return PROGRAM_CONSTANT;
+        case TGSI_FILE_INPUT: return PROGRAM_INPUT;
+        case TGSI_FILE_OUTPUT: return PROGRAM_OUTPUT;
+        case TGSI_FILE_TEMPORARY: return PROGRAM_TEMPORARY;
+        case TGSI_FILE_ADDRESS: return PROGRAM_ADDRESS;
+    }
+    fprintf(stderr, "Unhandled register file: %u\n", file);
+    abort();
+}
+
+/* Rebase a TGSI register index for the compiler: immediates are shifted
+ * by ttr->immediate_offset, every other file keeps its index. */
+static int translate_register_index(struct tgsi_to_rc * ttr,
+                                    unsigned file, int index)
+{
+    int base = 0;
+    if (file == TGSI_FILE_IMMEDIATE)
+        base = ttr->immediate_offset;
+    return base + index;
+}
+
+/* Fill a compiler destination register from a TGSI destination register. */
+static void transform_dstreg(struct tgsi_to_rc * ttr, struct prog_dst_register * dst,
+                             struct tgsi_full_dst_register * src)
+{
+    const unsigned file = src->DstRegister.File;
+    dst->File = translate_register_file(file);
+    dst->Index = translate_register_index(ttr, file, src->DstRegister.Index);
+    dst->RelAddr = src->DstRegister.Indirect;
+    dst->WriteMask = src->DstRegister.WriteMask;
+}
+
+/* Fill a compiler source register (file, index, swizzle, abs, negate)
+ * from a TGSI source register. */
+static void transform_srcreg(struct tgsi_to_rc * ttr, struct prog_src_register * dst,
+                             struct tgsi_full_src_register * src)
+{
+    const unsigned file = src->SrcRegister.File;
+    unsigned swizzle = 0, negate, chan;
+
+    dst->File = translate_register_file(file);
+    dst->Index = translate_register_index(ttr, file, src->SrcRegister.Index);
+    dst->RelAddr = src->SrcRegister.Indirect;
+    for (chan = 0; chan < 4; ++chan) /* 3 swizzle bits per channel, X lowest */
+        swizzle |= tgsi_util_get_full_src_register_extswizzle(src, chan) << (3 * chan);
+    dst->Swizzle = swizzle;
+    dst->Abs = src->SrcRegisterExtMod.Absolute;
+    negate = src->SrcRegisterExtSwz.NegateX | (src->SrcRegisterExtSwz.NegateY << 1) |
+             (src->SrcRegisterExtSwz.NegateZ << 2) | (src->SrcRegisterExtSwz.NegateW << 3);
+    /* The register-wide negate flips every per-channel negate bit. */
+    dst->Negate = src->SrcRegister.Negate ? (negate ^ NEGATE_XYZW) : negate;
+}
+
+/* Record the texture target of a TGSI texture descriptor on the instruction.
+ * Shadow targets also set TexShadow before falling into the plain target.
+ * Targets not listed leave the instruction untouched. */
+static void transform_texture(struct rc_instruction * dst, struct tgsi_instruction_ext_texture src)
+{
+    switch(src.Texture) {
+        case TGSI_TEXTURE_SHADOW1D:
+            dst->I.TexShadow = 1;
+            /* fall through */
+        case TGSI_TEXTURE_1D:
+            dst->I.TexSrcTarget = TEXTURE_1D_INDEX;
+            break;
+        case TGSI_TEXTURE_SHADOW2D:
+            dst->I.TexShadow = 1;
+            /* fall through */
+        case TGSI_TEXTURE_2D:
+            dst->I.TexSrcTarget = TEXTURE_2D_INDEX;
+            break;
+        case TGSI_TEXTURE_SHADOWRECT:
+            dst->I.TexShadow = 1;
+            /* fall through */
+        case TGSI_TEXTURE_RECT:
+            dst->I.TexSrcTarget = TEXTURE_RECT_INDEX;
+            break;
+        case TGSI_TEXTURE_3D:
+            dst->I.TexSrcTarget = TEXTURE_3D_INDEX;
+            break;
+        case TGSI_TEXTURE_CUBE:
+            dst->I.TexSrcTarget = TEXTURE_CUBE_INDEX;
+            break;
+    }
+}
+
+/* Append the compiler form of one TGSI instruction; END emits nothing. */
+static void transform_instruction(struct tgsi_to_rc * ttr, struct tgsi_full_instruction * src)
+{
+    struct rc_instruction * dst;
+    int i;
+
+    if (src->Instruction.Opcode == TGSI_OPCODE_END)
+        return;
+
+    dst = rc_insert_new_instruction(ttr->compiler, ttr->compiler->Program.Instructions.Prev);
+    dst->I.Opcode = translate_opcode(src->Instruction.Opcode);
+    dst->I.SaturateMode = translate_saturate(src->Instruction.Saturate);
+    if (src->Instruction.NumDstRegs)
+        transform_dstreg(ttr, &dst->I.DstReg, &src->FullDstRegisters[0]);
+    /* Sampler operands select the texture unit instead of filling a source slot. */
+    for(i = 0; i < src->Instruction.NumSrcRegs; ++i) {
+        if (src->FullSrcRegisters[i].SrcRegister.File == TGSI_FILE_SAMPLER)
+            dst->I.TexSrcUnit = src->FullSrcRegisters[i].SrcRegister.Index;
+        else
+            transform_srcreg(ttr, &dst->I.SrcReg[i], &src->FullSrcRegisters[i]);
+    }
+    /* Texturing. */
+    transform_texture(dst, src->InstructionExtTexture);
+}
+/* Append a TGSI immediate (four floats) to the program's constant list. */
+static void handle_immediate(struct tgsi_to_rc * ttr, struct tgsi_full_immediate * imm)
+{
+    struct rc_constant constant;
+    int i;
+    memset(&constant, 0, sizeof(constant)); /* as for the external placeholders */
+    constant.Type = RC_CONSTANT_IMMEDIATE;
+    constant.Size = 4;
+    for(i = 0; i < 4; ++i)
+        constant.u.Immediate[i] = imm->u[i].Float;
+    rc_constants_add(&ttr->compiler->Program.Constants, &constant);
+}
+
+/* Translate a TGSI token stream into ttr->compiler's program: external
+ * constant placeholders first, then immediates and instructions in order. */
+void r300_tgsi_to_rc(struct tgsi_to_rc * ttr, const struct tgsi_token * tokens)
+{
+    struct tgsi_parse_context parse;
+    struct rc_constant placeholder;
+    int idx = 0;
+
+    /* Allocate constants placeholders.
+     *
+     * Note: What if declared constants are not contiguous? */
+    while (idx <= ttr->info->file_max[TGSI_FILE_CONSTANT]) {
+        memset(&placeholder, 0, sizeof(placeholder));
+        placeholder.Type = RC_CONSTANT_EXTERNAL;
+        placeholder.Size = 4;
+        placeholder.u.External = idx++;
+        rc_constants_add(&ttr->compiler->Program.Constants, &placeholder);
+    }
+
+    /* Immediates are appended after the placeholders. */
+    ttr->immediate_offset = ttr->compiler->Program.Constants.Count;
+
+    tgsi_parse_init(&parse, tokens);
+    while (!tgsi_parse_end_of_tokens(&parse)) {
+        unsigned type;
+
+        tgsi_parse_token(&parse);
+        type = parse.FullToken.Token.Type;
+
+        if (type == TGSI_TOKEN_TYPE_IMMEDIATE)
+            handle_immediate(ttr, &parse.FullToken.FullImmediate);
+        else if (type == TGSI_TOKEN_TYPE_INSTRUCTION)
+            transform_instruction(ttr, &parse.FullToken.FullInstruction);
+        /* Declarations and any other token types are skipped. */
+    }
+
+    tgsi_parse_free(&parse);
+
+    rc_calculate_inputs_outputs(ttr->compiler);
+}
+
diff --git a/src/gallium/drivers/r300/r300_tgsi_to_rc.h b/src/gallium/drivers/r300/r300_tgsi_to_rc.h
new file mode 100644
index 0000000000..93e90ec6d2
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_tgsi_to_rc.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#ifndef R300_TGSI_TO_RC_H
+#define R300_TGSI_TO_RC_H
+
+struct radeon_compiler;
+
+struct tgsi_full_declaration;
+struct tgsi_shader_info;
+struct tgsi_token;
+
+struct tgsi_to_rc {
+    struct radeon_compiler * compiler; /* receives constants and instructions */
+    const struct tgsi_shader_info * info; /* scan info; file_max[] is read */
+    /* Index of the first immediate in the compiler's constant list. */
+    int immediate_offset;
+};
+
+void r300_tgsi_to_rc(struct tgsi_to_rc * ttr, const struct tgsi_token * tokens);
+
+#endif /* R300_TGSI_TO_RC_H */
diff --git a/src/gallium/drivers/r300/r300_vs.c b/src/gallium/drivers/r300/r300_vs.c
index a664a316e8..12a6e37be6 100644
--- a/src/gallium/drivers/r300/r300_vs.c
+++ b/src/gallium/drivers/r300/r300_vs.c
@@ -22,390 +22,213 @@
 
 #include "r300_vs.h"
 
-static void r300_vs_declare(struct r300_vs_asm* assembler,
-                            struct tgsi_full_declaration* decl)
-{
-    switch (decl->Declaration.File) {
-        case TGSI_FILE_INPUT:
-            break;
-        case TGSI_FILE_OUTPUT:
-            switch (decl->Semantic.SemanticName) {
-                case TGSI_SEMANTIC_POSITION:
-                    assembler->tab[decl->DeclarationRange.First] = 0;
-                    break;
-                case TGSI_SEMANTIC_COLOR:
-                    assembler->tab[decl->DeclarationRange.First] =
-                        (assembler->point_size ? 1 : 0) +
-                        assembler->out_colors;
-                    break;
-                case TGSI_SEMANTIC_FOG:
-                case TGSI_SEMANTIC_GENERIC:
-                    /* XXX multiple? */
-                    assembler->tab[decl->DeclarationRange.First] =
-                        (assembler->point_size ? 1 : 0) +
-                        assembler->out_colors +
-                        assembler->out_texcoords;
-                    break;
-                case TGSI_SEMANTIC_PSIZE:
-                    assembler->tab[decl->DeclarationRange.First] = 1;
-                    break;
-                default:
-                    debug_printf("r300: vs: Bad semantic declaration %d\n",
-                        decl->Semantic.SemanticName);
-                    break;
-            }
-            break;
-        case TGSI_FILE_CONSTANT:
-            break;
-        case TGSI_FILE_TEMPORARY:
-            assembler->temp_count++;
-            break;
-        default:
-            debug_printf("r300: vs: Bad file %d\n", decl->Declaration.File);
-            break;
-    }
-}
+#include "r300_context.h"
+#include "r300_tgsi_to_rc.h"
 
-static INLINE unsigned r300_vs_src_type(struct r300_vs_asm* assembler,
-                                        struct tgsi_src_register* src)
-{
-    switch (src->File) {
-        case TGSI_FILE_NULL:
-        case TGSI_FILE_INPUT:
-            /* Probably a zero or one swizzle */
-            return R300_PVS_SRC_REG_INPUT;
-        case TGSI_FILE_TEMPORARY:
-            return R300_PVS_SRC_REG_TEMPORARY;
-        case TGSI_FILE_CONSTANT:
-        case TGSI_FILE_IMMEDIATE:
-            return R300_PVS_SRC_REG_CONSTANT;
-        default:
-            debug_printf("r300: vs: Unimplemented src type %d\n", src->File);
-            break;
-    }
-    return 0;
-}
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_parse.h"
 
-static INLINE unsigned r300_vs_src(struct r300_vs_asm* assembler,
-                                   struct tgsi_src_register* src)
-{
-    switch (src->File) {
-        case TGSI_FILE_NULL:
-        case TGSI_FILE_INPUT:
-        case TGSI_FILE_TEMPORARY:
-        case TGSI_FILE_CONSTANT:
-            return src->Index;
-        case TGSI_FILE_IMMEDIATE:
-            return src->Index + assembler->imm_offset;
-        default:
-            debug_printf("r300: vs: Unimplemented src type %d\n", src->File);
-            break;
-    }
-    return 0;
-}
+#include "radeon_compiler.h"
 
-static INLINE unsigned r300_vs_dst_type(struct r300_vs_asm* assembler,
-                                        struct tgsi_dst_register* dst)
-{
-    switch (dst->File) {
-        case TGSI_FILE_TEMPORARY:
-            return R300_PVS_DST_REG_TEMPORARY;
-        case TGSI_FILE_OUTPUT:
-            return R300_PVS_DST_REG_OUT;
-        default:
-            debug_printf("r300: vs: Unimplemented dst type %d\n", dst->File);
-            break;
-    }
-    return 0;
-}
 
-static INLINE unsigned r300_vs_dst(struct r300_vs_asm* assembler,
-                                   struct tgsi_dst_register* dst)
+static void set_vertex_inputs_outputs(struct r300_vertex_program_compiler * c)
 {
-    switch (dst->File) {
-        case TGSI_FILE_TEMPORARY:
-            return dst->Index;
-        case TGSI_FILE_OUTPUT:
-            return assembler->tab[dst->Index];
-        default:
-            debug_printf("r300: vs: Unimplemented dst %d\n", dst->File);
-            break;
-    }
-    return 0;
-}
+    struct r300_vertex_shader * vs = c->UserData;
+    struct tgsi_shader_info* info = &vs->info;
+    boolean pointsize = false;
+    int out_colors = 0;
+    int colors = 0;
+    int out_generic = 0;
+    int generic = 0;
+    int i;
 
-static uint32_t r300_vs_op(unsigned op)
-{
-    switch (op) {
-        case TGSI_OPCODE_DP3:
-        case TGSI_OPCODE_DP4:
-            return R300_VE_DOT_PRODUCT;
-        case TGSI_OPCODE_MUL:
-            return R300_VE_MULTIPLY;
-        case TGSI_OPCODE_ADD:
-        case TGSI_OPCODE_MOV:
-        case TGSI_OPCODE_SUB:
-        case TGSI_OPCODE_SWZ:
-            return R300_VE_ADD;
-        case TGSI_OPCODE_MAX:
-            return R300_VE_MAXIMUM;
-        case TGSI_OPCODE_SLT:
-            return R300_VE_SET_LESS_THAN;
-        case TGSI_OPCODE_RSQ:
-            return R300_PVS_DST_MATH_INST | R300_ME_RECIP_DX;
-        case TGSI_OPCODE_MAD:
-            return R300_PVS_DST_MACRO_INST | R300_PVS_MACRO_OP_2CLK_MADD;
-        default:
-            break;
-    }
-    return 0;
-}
+    /* Fill in the input mapping */
+    for (i = 0; i < info->num_inputs; i++)
+        c->code->inputs[i] = i;
 
-static uint32_t r300_vs_swiz(struct tgsi_full_src_register* reg)
-{
-    if (reg->SrcRegister.Extended) {
-        return (reg->SrcRegister.Negate ? (0xf << 12) : 0) |
-            reg->SrcRegisterExtSwz.ExtSwizzleX |
-            (reg->SrcRegisterExtSwz.ExtSwizzleY << 3) |
-            (reg->SrcRegisterExtSwz.ExtSwizzleZ << 6) |
-            (reg->SrcRegisterExtSwz.ExtSwizzleW << 9);
-    } else {
-        return (reg->SrcRegister.Negate ? (0xf << 12) : 0) |
-            reg->SrcRegister.SwizzleX |
-            (reg->SrcRegister.SwizzleY << 3) |
-            (reg->SrcRegister.SwizzleZ << 6) |
-            (reg->SrcRegister.SwizzleW << 9);
+    /* Fill in the output mapping. First pass: count outputs per semantic. */
+    for (i = 0; i < info->num_outputs; i++) {
+        switch (info->output_semantic_name[i]) {
+            case TGSI_SEMANTIC_PSIZE:
+                pointsize = true; /* shifts the color and generic slots up by one */
+                break;
+            case TGSI_SEMANTIC_COLOR:
+                out_colors++; /* generic outputs are placed after all colors */
+                break;
+            case TGSI_SEMANTIC_FOG:
+            case TGSI_SEMANTIC_GENERIC:
+                out_generic++; /* NOTE(review): counted but never read in this function */
+                break;
+        } /* other semantics need no counting here */
     }
-}
 
-/* XXX icky icky icky icky */
-static uint32_t r300_vs_scalar_swiz(struct tgsi_full_src_register* reg)
-{
-    if (reg->SrcRegister.Extended) {
-        return (reg->SrcRegister.Negate ? (0xf << 12) : 0) |
-            reg->SrcRegisterExtSwz.ExtSwizzleX |
-            (reg->SrcRegisterExtSwz.ExtSwizzleX << 3) |
-            (reg->SrcRegisterExtSwz.ExtSwizzleX << 6) |
-            (reg->SrcRegisterExtSwz.ExtSwizzleX << 9);
-    } else {
-        return (reg->SrcRegister.Negate ? (0xf << 12) : 0) |
-            reg->SrcRegister.SwizzleX |
-            (reg->SrcRegister.SwizzleX << 3) |
-            (reg->SrcRegister.SwizzleX << 6) |
-            (reg->SrcRegister.SwizzleX << 9);
-    }
-}
+    struct tgsi_parse_context parser;
 
-/* XXX scalar stupidity */
-static void r300_vs_emit_inst(struct r300_vertex_shader* vs,
-                              struct r300_vs_asm* assembler,
-                              struct tgsi_full_src_register* src,
-                              struct tgsi_full_dst_register* dst,
-                              unsigned op,
-                              unsigned count,
-                              boolean is_scalar)
-{
-    int i = vs->instruction_count;
-    vs->instructions[i].inst0 = R300_PVS_DST_OPCODE(r300_vs_op(op)) |
-        R300_PVS_DST_REG_TYPE(r300_vs_dst_type(assembler, &dst->DstRegister)) |
-        R300_PVS_DST_OFFSET(r300_vs_dst(assembler, &dst->DstRegister)) |
-        R300_PVS_DST_WE(dst->DstRegister.WriteMask);
-    switch (count) {
-        case 3:
-            vs->instructions[i].inst3 =
-                R300_PVS_SRC_REG_TYPE(r300_vs_src_type(assembler,
-                            &src[2].SrcRegister)) |
-                R300_PVS_SRC_OFFSET(r300_vs_src(assembler,
-                            &src[2].SrcRegister)) |
-                R300_PVS_SRC_SWIZZLE(r300_vs_swiz(&src[2]));
-            /* Fall through */
-        case 2:
-            vs->instructions[i].inst2 =
-                R300_PVS_SRC_REG_TYPE(r300_vs_src_type(assembler,
-                            &src[1].SrcRegister)) |
-                R300_PVS_SRC_OFFSET(r300_vs_src(assembler,
-                            &src[1].SrcRegister)) |
-                R300_PVS_SRC_SWIZZLE(r300_vs_swiz(&src[1]));
-            /* Fall through */
-        case 1:
-            vs->instructions[i].inst1 =
-                R300_PVS_SRC_REG_TYPE(r300_vs_src_type(assembler,
-                            &src[0].SrcRegister)) |
-                R300_PVS_SRC_OFFSET(r300_vs_src(assembler,
-                            &src[0].SrcRegister)) |
-                /* XXX the icky, it burns */
-                R300_PVS_SRC_SWIZZLE(is_scalar ? r300_vs_scalar_swiz(&src[0])
-                        : r300_vs_swiz(&src[0]));
-            break;
-    }
-    vs->instruction_count++;
-}
+    tgsi_parse_init(&parser, vs->state.tokens);
 
-static void r300_vs_instruction(struct r300_vertex_shader* vs,
-                                struct r300_vs_asm* assembler,
-                                struct tgsi_full_instruction* inst)
-{
-    switch (inst->Instruction.Opcode) {
-        case TGSI_OPCODE_RSQ:
-            r300_vs_emit_inst(vs, assembler, inst->FullSrcRegisters,
-                    &inst->FullDstRegisters[0], inst->Instruction.Opcode,
-                    1, TRUE);
-            break;
-        case TGSI_OPCODE_SUB:
-            inst->FullSrcRegisters[1].SrcRegister.Negate =
-                !inst->FullSrcRegisters[1].SrcRegister.Negate;
-            /* Fall through */
-        case TGSI_OPCODE_ADD:
-        case TGSI_OPCODE_MUL:
-        case TGSI_OPCODE_MAX:
-        case TGSI_OPCODE_SLT:
-            r300_vs_emit_inst(vs, assembler, inst->FullSrcRegisters,
-                    &inst->FullDstRegisters[0], inst->Instruction.Opcode,
-                    2, FALSE);
-            break;
-        case TGSI_OPCODE_DP3:
-            /* Set alpha swizzle to zero for src0 and src1 */
-            if (!inst->FullSrcRegisters[0].SrcRegister.Extended) {
-                inst->FullSrcRegisters[0].SrcRegister.Extended = TRUE;
-                inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtSwizzleX =
-                    inst->FullSrcRegisters[0].SrcRegister.SwizzleX;
-                inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtSwizzleY =
-                    inst->FullSrcRegisters[0].SrcRegister.SwizzleY;
-                inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtSwizzleZ =
-                    inst->FullSrcRegisters[0].SrcRegister.SwizzleZ;
-            }
-            inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtSwizzleW =
-                TGSI_EXTSWIZZLE_ZERO;
-            if (!inst->FullSrcRegisters[1].SrcRegister.Extended) {
-                inst->FullSrcRegisters[1].SrcRegister.Extended = TRUE;
-                inst->FullSrcRegisters[1].SrcRegisterExtSwz.ExtSwizzleX =
-                    inst->FullSrcRegisters[1].SrcRegister.SwizzleX;
-                inst->FullSrcRegisters[1].SrcRegisterExtSwz.ExtSwizzleY =
-                    inst->FullSrcRegisters[1].SrcRegister.SwizzleY;
-                inst->FullSrcRegisters[1].SrcRegisterExtSwz.ExtSwizzleZ =
-                    inst->FullSrcRegisters[1].SrcRegister.SwizzleZ;
-            }
-            inst->FullSrcRegisters[1].SrcRegisterExtSwz.ExtSwizzleW =
-                TGSI_EXTSWIZZLE_ZERO;
-            /* Fall through */
-        case TGSI_OPCODE_DP4:
-            r300_vs_emit_inst(vs, assembler, inst->FullSrcRegisters,
-                    &inst->FullDstRegisters[0], inst->Instruction.Opcode,
-                    2, FALSE);
-            break;
-        case TGSI_OPCODE_MOV:
-        case TGSI_OPCODE_SWZ:
-            inst->FullSrcRegisters[1] = r300_constant_zero;
-            r300_vs_emit_inst(vs, assembler, inst->FullSrcRegisters,
-                    &inst->FullDstRegisters[0], inst->Instruction.Opcode,
-                    2, FALSE);
-            break;
-        case TGSI_OPCODE_MAD:
-            r300_vs_emit_inst(vs, assembler, inst->FullSrcRegisters,
-                    &inst->FullDstRegisters[0], inst->Instruction.Opcode,
-                    3, FALSE);
-            break;
-        case TGSI_OPCODE_END:
-            break;
-        default:
-            debug_printf("r300: vs: Bad opcode %d\n",
-                    inst->Instruction.Opcode);
-            break;
-    }
-}
+    while (!tgsi_parse_end_of_tokens(&parser)) {
+        tgsi_parse_token(&parser);
 
-static void r300_vs_init(struct r300_vertex_shader* vs,
-                         struct r300_vs_asm* assembler)
-{
-    struct tgsi_shader_info* info = &vs->info;
-    int i;
+        if (parser.FullToken.Token.Type != TGSI_TOKEN_TYPE_DECLARATION)
+            continue;
 
-    for (i = 0; i < info->num_outputs; i++) {
-        switch (info->output_semantic_name[i]) {
+        struct tgsi_full_declaration * decl = &parser.FullToken.FullDeclaration;
+
+        if (decl->Declaration.File != TGSI_FILE_OUTPUT)
+            continue;
+
+        switch (decl->Semantic.SemanticName) {
+            case TGSI_SEMANTIC_POSITION:
+                c->code->outputs[decl->DeclarationRange.First] = 0;
+                break;
             case TGSI_SEMANTIC_PSIZE:
-                assembler->point_size = TRUE;
+                c->code->outputs[decl->DeclarationRange.First] = 1;
                 break;
             case TGSI_SEMANTIC_COLOR:
-                assembler->out_colors++;
+                c->code->outputs[decl->DeclarationRange.First] = 1 +
+                    (pointsize ? 1 : 0) +
+                    colors++;
                 break;
             case TGSI_SEMANTIC_FOG:
             case TGSI_SEMANTIC_GENERIC:
-                assembler->out_texcoords++;
+                c->code->outputs[decl->DeclarationRange.First] = 1 +
+                    (pointsize ? 1 : 0) +
+                    out_colors +
+                    generic++;
+                break;
+            default:
+                debug_printf("r300: vs: Bad semantic declaration %d\n",
+                    decl->Semantic.SemanticName);
                 break;
         }
     }
 
-    vs->instruction_count = 0;
+    tgsi_parse_free(&parser);
 }
 
+
 void r300_translate_vertex_shader(struct r300_context* r300,
                                   struct r300_vertex_shader* vs)
 {
-    struct tgsi_parse_context parser;
-    int i;
-    struct r300_constant_buffer* consts =
-        &r300->shader_constants[PIPE_SHADER_VERTEX];
+    struct r300_vertex_program_compiler compiler;
+    struct tgsi_to_rc ttr;
 
-    struct r300_vs_asm* assembler = CALLOC_STRUCT(r300_vs_asm);
-    if (assembler == NULL) {
-        return;
-    }
+    /* Setup the compiler */
+    rc_init(&compiler.Base);
 
-    /* Init assembler. */
-    r300_vs_init(vs, assembler);
+    compiler.Base.Debug = DBG_ON(r300, DBG_VP);
+    compiler.code = &vs->code;
+    compiler.UserData = vs;
 
-    /* Setup starting offset for immediates. */
-    assembler->imm_offset = consts->user_count;
+    if (compiler.Base.Debug) {
+        debug_printf("r300: Initial vertex program\n");
+        tgsi_dump(vs->state.tokens, 0);
+    }
 
-    tgsi_parse_init(&parser, vs->state.tokens);
+    /* Translate TGSI to our internal representation */
+    ttr.compiler = &compiler.Base;
+    ttr.info = &vs->info;
 
-    while (!tgsi_parse_end_of_tokens(&parser)) {
-        tgsi_parse_token(&parser);
+    r300_tgsi_to_rc(&ttr, vs->state.tokens);
 
-        /* This is seriously the lamest way to create fragment programs ever.
-         * I blame TGSI. */
-        switch (parser.FullToken.Token.Type) {
-            case TGSI_TOKEN_TYPE_DECLARATION:
-                /* Allocated registers sitting at the beginning
-                 * of the program. */
-                r300_vs_declare(assembler, &parser.FullToken.FullDeclaration);
-                break;
-            case TGSI_TOKEN_TYPE_IMMEDIATE:
-                debug_printf("r300: Emitting immediate to constant buffer, "
-                        "position %d\n",
-                        assembler->imm_offset + assembler->imm_count);
-                /* I am not amused by the length of these. */
-                for (i = 0; i < 4; i++) {
-                    consts->constants[assembler->imm_offset +
-                        assembler->imm_count][i] =
-                        parser.FullToken.FullImmediate.u[i].Float;
-                }
-                assembler->imm_count++;
-                break;
-            case TGSI_TOKEN_TYPE_INSTRUCTION:
-                r300_vs_instruction(vs, assembler,
-                        &parser.FullToken.FullInstruction);
-                break;
-        }
-    }
+    compiler.RequiredOutputs = ~(~0u << vs->info.num_outputs); /* low num_outputs bits set; unsigned operand avoids UB from left-shifting a negative int */
+    compiler.SetHwInputOutput = &set_vertex_inputs_outputs;
 
-    debug_printf("r300: vs: %d texs and %d colors, first free reg is %d\n",
-            assembler->tex_count, assembler->color_count,
-            assembler->tex_count + assembler->color_count);
+    /* Invoke the compiler */
+    r3xx_compile_vertex_program(&compiler);
+    if (compiler.Base.Error) {
+        /* Todo: Fail gracefully */
+        fprintf(stderr, "r300 VP: Compiler error\n");
+        abort();
+    }
 
-    consts->count = consts->user_count + assembler->imm_count;
-    vs->uses_imms = assembler->imm_count;
-    debug_printf("r300: vs: %d total constants, "
-            "%d from user and %d from immediates\n", consts->count,
-            consts->user_count, assembler->imm_count);
+    /* And, finally... */
+    rc_destroy(&compiler.Base);
+    vs->translated = TRUE;
+}
 
-    debug_printf("r300: vs: tab: %d %d %d %d\n", assembler->tab[0],
-            assembler->tab[1], assembler->tab[2], assembler->tab[3]);
 
-    tgsi_dump(vs->state.tokens, 0);
-    /* XXX finish r300 vertex shader dumper */
-    r300_vs_dump(vs);
+/* XXX move these definitions into r300_reg.h */
+#define R300_PVS_DST_OPCODE(x)   ((x) << 0)
+#   define R300_VE_DOT_PRODUCT            1
+#   define R300_VE_MULTIPLY               2
+#   define R300_VE_ADD                    3
+#   define R300_VE_MAXIMUM                7
+#   define R300_VE_SET_LESS_THAN          10
+#define R300_PVS_DST_MATH_INST     (1 << 6)
+#   define R300_ME_RECIP_DX               6
+#define R300_PVS_DST_MACRO_INST    (1 << 7)
+#   define R300_PVS_MACRO_OP_2CLK_MADD    0
+#define R300_PVS_DST_REG_TYPE(x) ((x) << 8)
+#   define R300_PVS_DST_REG_TEMPORARY     0
+#   define R300_PVS_DST_REG_A0            1
+#   define R300_PVS_DST_REG_OUT           2
+#   define R300_PVS_DST_REG_OUT_REPL_X    3
+#   define R300_PVS_DST_REG_ALT_TEMPORARY 4
+#   define R300_PVS_DST_REG_INPUT         5
+#define R300_PVS_DST_OFFSET(x)   ((x) << 13)
+#define R300_PVS_DST_WE(x)       ((x) << 20)
+#define R300_PVS_DST_WE_XYZW     (0xf << 20)
+
+#define R300_PVS_SRC_REG_TYPE(x) ((x) << 0)
+#   define R300_PVS_SRC_REG_TEMPORARY     0
+#   define R300_PVS_SRC_REG_INPUT         1
+#   define R300_PVS_SRC_REG_CONSTANT      2
+#   define R300_PVS_SRC_REG_ALT_TEMPORARY 3
+#define R300_PVS_SRC_OFFSET(x)   ((x) << 5)
+#define R300_PVS_SRC_SWIZZLE(x)  ((x) << 13)
+#   define R300_PVS_SRC_SELECT_X          0
+#   define R300_PVS_SRC_SELECT_Y          1
+#   define R300_PVS_SRC_SELECT_Z          2
+#   define R300_PVS_SRC_SELECT_W          3
+#   define R300_PVS_SRC_SELECT_FORCE_0    4
+#   define R300_PVS_SRC_SELECT_FORCE_1    5
+#   define R300_PVS_SRC_SWIZZLE_XYZW \
+    ((R300_PVS_SRC_SELECT_X | (R300_PVS_SRC_SELECT_Y << 3) | \
+     (R300_PVS_SRC_SELECT_Z << 6) | (R300_PVS_SRC_SELECT_W << 9)) << 13)
+#   define R300_PVS_SRC_SWIZZLE_ZERO \
+    ((R300_PVS_SRC_SELECT_FORCE_0 | (R300_PVS_SRC_SELECT_FORCE_0 << 3) | \
+     (R300_PVS_SRC_SELECT_FORCE_0 << 6) | \
+      (R300_PVS_SRC_SELECT_FORCE_0 << 9)) << 13)
+#   define R300_PVS_SRC_SWIZZLE_ONE \
+    ((R300_PVS_SRC_SELECT_FORCE_1 | (R300_PVS_SRC_SELECT_FORCE_1 << 3) | \
+     (R300_PVS_SRC_SELECT_FORCE_1 << 6) | \
+      (R300_PVS_SRC_SELECT_FORCE_1 << 9)) << 13)
+#define R300_PVS_MODIFIER_X        (1 << 25)
+#define R300_PVS_MODIFIER_Y        (1 << 26)
+#define R300_PVS_MODIFIER_Z        (1 << 27)
+#define R300_PVS_MODIFIER_W        (1 << 28)
+#define R300_PVS_NEGATE_XYZW \
+    (R300_PVS_MODIFIER_X | R300_PVS_MODIFIER_Y | \
+     R300_PVS_MODIFIER_Z | R300_PVS_MODIFIER_W)
+
+struct r300_vertex_program_code r300_passthrough_vertex_shader = {
+    .length = 8, /* two instructions, four dwords each (body.d[0..7]) */
+
+    /* MOV out[0], in[0]: encoded with the VE_ADD opcode; d[2] selects forced zero on all channels */
+    .body.d[0] = R300_PVS_DST_OPCODE(R300_VE_ADD) |
+        R300_PVS_DST_REG_TYPE(R300_PVS_DST_REG_OUT) |
+        R300_PVS_DST_OFFSET(0) | R300_PVS_DST_WE_XYZW,
+    .body.d[1] = R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
+        R300_PVS_SRC_OFFSET(0) | R300_PVS_SRC_SWIZZLE_XYZW,
+    .body.d[2] = R300_PVS_SRC_SWIZZLE_ZERO,
+    .body.d[3] = 0x0,
+
+    /* MOV out[1], in[1]: same encoding as above with register offset 1 */
+    .body.d[4] = R300_PVS_DST_OPCODE(R300_VE_ADD) |
+        R300_PVS_DST_REG_TYPE(R300_PVS_DST_REG_OUT) |
+        R300_PVS_DST_OFFSET(1) | R300_PVS_DST_WE_XYZW,
+    .body.d[5] = R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
+        R300_PVS_SRC_OFFSET(1) | R300_PVS_SRC_SWIZZLE_XYZW,
+    .body.d[6] = R300_PVS_SRC_SWIZZLE_ZERO,
+    .body.d[7] = 0x0,
+
+    .inputs[0] = 0, /* identity mapping for both inputs ... */
+    .inputs[1] = 1,
+    .outputs[0] = 0, /* ... and for both outputs */
+    .outputs[1] = 1,
+
+    .InputsRead = 3, /* NOTE(review): presumably a bitmask (bits 0 and 1) - confirm in radeon_code.h */
+    .OutputsWritten = 3 /* NOTE(review): presumably a bitmask as well - confirm */
+};
 
-    tgsi_parse_free(&parser);
-    FREE(assembler);
-}
diff --git a/src/gallium/drivers/r300/r300_vs.h b/src/gallium/drivers/r300/r300_vs.h
index 165d717812..2a4ce315e3 100644
--- a/src/gallium/drivers/r300/r300_vs.h
+++ b/src/gallium/drivers/r300/r300_vs.h
@@ -23,134 +23,31 @@
 #ifndef R300_VS_H
 #define R300_VS_H
 
-#include "tgsi/tgsi_parse.h"
-#include "tgsi/tgsi_dump.h"
+#include "pipe/p_state.h"
+#include "tgsi/tgsi_scan.h"
 
-#include "r300_context.h"
-#include "r300_debug.h"
-#include "r300_reg.h"
-#include "r300_screen.h"
-#include "r300_shader_inlines.h"
+#include "radeon_code.h"
 
-/* XXX get these to r300_reg */
-#define R300_PVS_DST_OPCODE(x)   ((x) << 0)
-#   define R300_VE_DOT_PRODUCT            1
-#   define R300_VE_MULTIPLY               2
-#   define R300_VE_ADD                    3
-#   define R300_VE_MAXIMUM                7
-#   define R300_VE_SET_LESS_THAN          10
-#define R300_PVS_DST_MATH_INST     (1 << 6)
-#   define R300_ME_RECIP_DX               6
-#define R300_PVS_DST_MACRO_INST    (1 << 7)
-#   define R300_PVS_MACRO_OP_2CLK_MADD    0
-#define R300_PVS_DST_REG_TYPE(x) ((x) << 8)
-#   define R300_PVS_DST_REG_TEMPORARY     0
-#   define R300_PVS_DST_REG_A0            1
-#   define R300_PVS_DST_REG_OUT           2
-#   define R300_PVS_DST_REG_OUT_REPL_X    3
-#   define R300_PVS_DST_REG_ALT_TEMPORARY 4
-#   define R300_PVS_DST_REG_INPUT         5
-#define R300_PVS_DST_OFFSET(x)   ((x) << 13)
-#define R300_PVS_DST_WE(x)       ((x) << 20)
-#define R300_PVS_DST_WE_XYZW     (0xf << 20)
+struct r300_context;
 
-#define R300_PVS_SRC_REG_TYPE(x) ((x) << 0)
-#   define R300_PVS_SRC_REG_TEMPORARY     0
-#   define R300_PVS_SRC_REG_INPUT         1
-#   define R300_PVS_SRC_REG_CONSTANT      2
-#   define R300_PVS_SRC_REG_ALT_TEMPORARY 3
-#define R300_PVS_SRC_OFFSET(x)   ((x) << 5)
-#define R300_PVS_SRC_SWIZZLE(x)  ((x) << 13)
-#   define R300_PVS_SRC_SELECT_X          0
-#   define R300_PVS_SRC_SELECT_Y          1
-#   define R300_PVS_SRC_SELECT_Z          2
-#   define R300_PVS_SRC_SELECT_W          3
-#   define R300_PVS_SRC_SELECT_FORCE_0    4
-#   define R300_PVS_SRC_SELECT_FORCE_1    5
-#   define R300_PVS_SRC_SWIZZLE_XYZW \
-    ((R300_PVS_SRC_SELECT_X | (R300_PVS_SRC_SELECT_Y << 3) | \
-     (R300_PVS_SRC_SELECT_Z << 6) | (R300_PVS_SRC_SELECT_W << 9)) << 13)
-#   define R300_PVS_SRC_SWIZZLE_ZERO \
-    ((R300_PVS_SRC_SELECT_FORCE_0 | (R300_PVS_SRC_SELECT_FORCE_0 << 3) | \
-     (R300_PVS_SRC_SELECT_FORCE_0 << 6) | \
-      (R300_PVS_SRC_SELECT_FORCE_0 << 9)) << 13)
-#   define R300_PVS_SRC_SWIZZLE_ONE \
-    ((R300_PVS_SRC_SELECT_FORCE_1 | (R300_PVS_SRC_SELECT_FORCE_1 << 3) | \
-     (R300_PVS_SRC_SELECT_FORCE_1 << 6) | \
-      (R300_PVS_SRC_SELECT_FORCE_1 << 9)) << 13)
-#define R300_PVS_MODIFIER_X        (1 << 25)
-#define R300_PVS_MODIFIER_Y        (1 << 26)
-#define R300_PVS_MODIFIER_Z        (1 << 27)
-#define R300_PVS_MODIFIER_W        (1 << 28)
-#define R300_PVS_NEGATE_XYZW \
-    (R300_PVS_MODIFIER_X | R300_PVS_MODIFIER_Y | \
-     R300_PVS_MODIFIER_Z | R300_PVS_MODIFIER_W)
+struct r300_vertex_shader {
+    /* Parent class */
+    struct pipe_shader_state state;
+    struct tgsi_shader_info info;
 
-/* Temporary struct used to hold assembly state while putting together
- * fragment programs. */
-struct r300_vs_asm {
-    /* Pipe context. */
-    struct r300_context* r300;
-    /* Number of colors. */
-    unsigned color_count;
-    /* Number of texcoords. */
-    unsigned tex_count;
-    /* Number of requested temporary registers. */
-    unsigned temp_count;
-    /* Offset for immediate constants. Neither R300 nor R500 can do four
-     * inline constants per source, so instead we copy immediates into the
-     * constant buffer. */
-    unsigned imm_offset;
-    /* Number of immediate constants. */
-    unsigned imm_count;
-    /* Number of colors to write. */
-    unsigned out_colors;
-    /* Number of texcoords to write. */
-    unsigned out_texcoords;
-    /* Whether to emit point size. */
-    boolean point_size;
-    /* Tab of declared outputs to OVM outputs. */
-    unsigned tab[16];
-};
+    /* Fallback shader, because Draw has issues */
+    struct draw_vertex_shader* draw;
 
-static struct r300_vertex_shader r300_passthrough_vertex_shader = {
-        /* XXX translate these back into normal instructions */
-    .instruction_count = 2,
-    .instructions[0].inst0 = R300_PVS_DST_OPCODE(R300_VE_ADD) |
-        R300_PVS_DST_REG_TYPE(R300_PVS_DST_REG_OUT) |
-        R300_PVS_DST_OFFSET(0) | R300_PVS_DST_WE_XYZW,
-    .instructions[0].inst1 = R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
-        R300_PVS_SRC_OFFSET(0) | R300_PVS_SRC_SWIZZLE_XYZW,
-    .instructions[0].inst2 = R300_PVS_SRC_SWIZZLE_ZERO,
-    .instructions[0].inst3 = 0x0,
-    .instructions[1].inst0 = R300_PVS_DST_OPCODE(R300_VE_ADD) |
-        R300_PVS_DST_REG_TYPE(R300_PVS_DST_REG_OUT) |
-        R300_PVS_DST_OFFSET(1) | R300_PVS_DST_WE_XYZW,
-    .instructions[1].inst1 = R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
-        R300_PVS_SRC_OFFSET(1) | R300_PVS_SRC_SWIZZLE_XYZW,
-    .instructions[1].inst2 = R300_PVS_SRC_SWIZZLE_ZERO,
-    .instructions[1].inst3 = 0x0,
-};
+    /* Has this shader been translated yet? */
+    boolean translated;
 
-static struct r300_vertex_shader r300_texture_vertex_shader = {
-        /* XXX translate these back into normal instructions */
-    .instruction_count = 2,
-    .instructions[0].inst0 = R300_PVS_DST_OPCODE(R300_VE_ADD) |
-        R300_PVS_DST_REG_TYPE(R300_PVS_DST_REG_OUT) |
-        R300_PVS_DST_OFFSET(0) | R300_PVS_DST_WE_XYZW,
-    .instructions[0].inst1 = R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
-        R300_PVS_SRC_OFFSET(0) | R300_PVS_SRC_SWIZZLE_XYZW,
-    .instructions[0].inst2 = R300_PVS_SRC_SWIZZLE_ZERO,
-    .instructions[0].inst3 = 0x0,
-    .instructions[1].inst0 = R300_PVS_DST_OPCODE(R300_VE_ADD) |
-        R300_PVS_DST_REG_TYPE(R300_PVS_DST_REG_OUT) |
-        R300_PVS_DST_OFFSET(1) | R300_PVS_DST_WE_XYZW,
-    .instructions[1].inst1 = R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
-        R300_PVS_SRC_OFFSET(1) | R300_PVS_SRC_SWIZZLE_XYZW,
-    .instructions[1].inst2 = R300_PVS_SRC_SWIZZLE_ZERO,
-    .instructions[1].inst3 = 0x0,
+    /* Machine code (if translated) */
+    struct r300_vertex_program_code code;
 };
 
+
+extern struct r300_vertex_program_code r300_passthrough_vertex_shader;
+
 void r300_translate_vertex_shader(struct r300_context* r300,
                                   struct r300_vertex_shader* vs);
 
diff --git a/src/gallium/drivers/r300/r300_winsys.h b/src/gallium/drivers/r300/r300_winsys.h
index d2893c3b9d..f18ad75a47 100644
--- a/src/gallium/drivers/r300/r300_winsys.h
+++ b/src/gallium/drivers/r300/r300_winsys.h
@@ -45,6 +45,9 @@ struct r300_winsys {
     /* PCI ID */
     uint32_t pci_id;
 
+    /* GB pipe count */
+    uint32_t gb_pipes;
+
     /* GART size. */
     uint32_t gart_size;
 
diff --git a/src/gallium/drivers/r300/r3xx_fs.c b/src/gallium/drivers/r300/r3xx_fs.c
index 6e05d76977..c1c1194d58 100644
--- a/src/gallium/drivers/r300/r3xx_fs.c
+++ b/src/gallium/drivers/r300/r3xx_fs.c
@@ -23,74 +23,52 @@
 
 #include "r3xx_fs.h"
 
-static INLINE uint32_t r3xx_rgb_op(unsigned op)
-{
-    switch (op) {
-        case TGSI_OPCODE_MOV:
-            return R300_ALU_OUTC_CMP;
-        default:
-            return 0;
-    }
-}
+#include "r300_reg.h"
 
-static INLINE uint32_t r3xx_alpha_op(unsigned op)
-{
-    switch (op) {
-        case TGSI_OPCODE_MOV:
-            return R300_ALU_OUTA_CMP;
-        default:
-            return 0;
-    }
-}
+struct rX00_fragment_program_code r3xx_passthrough_fragment_shader = {
+    .code.r300.alu.length = 1,
+    .code.r300.tex.length = 0,
 
-static INLINE void r3xx_emit_maths(struct r3xx_fragment_shader* fs,
-                                   struct r300_fs_asm* assembler,
-                                   struct tgsi_full_src_register* src,
-                                   struct tgsi_full_dst_register* dst,
-                                   unsigned op,
-                                   unsigned count)
-{
-    int i = fs->alu_instruction_count;
+    .code.r300.config = 0,
+    .code.r300.pixsize = 0,
+    .code.r300.code_offset = 0,
+    .code.r300.code_addr[3] = R300_RGBA_OUT,
 
-    fs->instructions[i].alu_rgb_inst = R300_RGB_SWIZA(R300_ALU_ARGC_SRC0C_XYZ) |
+    .code.r300.alu.inst[0].rgb_inst = R300_RGB_SWIZA(R300_ALU_ARGC_SRC0C_XYZ) |
         R300_RGB_SWIZB(R300_ALU_ARGC_SRC0C_XYZ) |
         R300_RGB_SWIZC(R300_ALU_ARGC_ZERO) |
-        r3xx_rgb_op(op);
-    fs->instructions[i].alu_rgb_addr = R300_RGB_ADDR0(0) | R300_RGB_ADDR1(0) |
-        R300_RGB_ADDR2(0) | R300_ALU_DSTC_OUTPUT_XYZ;
-    fs->instructions[i].alu_alpha_inst = R300_ALPHA_SWIZA(R300_ALU_ARGA_SRC0A) |
+        R300_ALU_OUTC_CMP,
+    .code.r300.alu.inst[0].rgb_addr = R300_RGB_ADDR0(0) | R300_RGB_ADDR1(0) |
+        R300_RGB_ADDR2(0) | R300_ALU_DSTC_OUTPUT_XYZ,
+    .code.r300.alu.inst[0].alpha_inst = R300_ALPHA_SWIZA(R300_ALU_ARGA_SRC0A) |
         R300_ALPHA_SWIZB(R300_ALU_ARGA_SRC0A) |
         R300_ALPHA_SWIZC(R300_ALU_ARGA_ZERO) |
-        r3xx_alpha_op(op);
-    fs->instructions[i].alu_alpha_addr = R300_ALPHA_ADDR0(0) |
-        R300_ALPHA_ADDR1(0) | R300_ALPHA_ADDR2(0) | R300_ALU_DSTA_OUTPUT;
+        R300_ALU_OUTA_CMP,
+    .code.r300.alu.inst[0].alpha_addr = R300_ALPHA_ADDR0(0) |
+        R300_ALPHA_ADDR1(0) | R300_ALPHA_ADDR2(0) | R300_ALU_DSTA_OUTPUT,
+};
 
-    fs->alu_instruction_count++;
-}
+struct rX00_fragment_program_code r3xx_texture_fragment_shader = {
+    .code.r300.alu.length = 1,
+    .code.r300.tex.length = 1,
 
-void r3xx_fs_finalize(struct r300_fragment_shader* fs,
-                      struct r300_fs_asm* assembler)
-{
-    fs->stack_size = assembler->temp_count + assembler->temp_offset + 1;
-}
+    .code.r300.config = R300_PFS_CNTL_FIRST_NODE_HAS_TEX,
+    .code.r300.pixsize = 0,
+    .code.r300.code_offset = 0,
+    .code.r300.code_addr[3] = R300_RGBA_OUT,
 
-void r3xx_fs_instruction(struct r3xx_fragment_shader* fs,
-                         struct r300_fs_asm* assembler,
-                         struct tgsi_full_instruction* inst)
-{
-    switch (inst->Instruction.Opcode) {
-        case TGSI_OPCODE_MOV:
-            /* src0 -> src1 and src2 forced to zero */
-            inst->FullSrcRegisters[1] = inst->FullSrcRegisters[0];
-            inst->FullSrcRegisters[2] = r300_constant_zero;
-            r3xx_emit_maths(fs, assembler, inst->FullSrcRegisters,
-                    &inst->FullDstRegisters[0], inst->Instruction.Opcode, 3);
-            break;
-        case TGSI_OPCODE_END:
-            break;
-        default:
-            debug_printf("r300: fs: Bad opcode %d\n",
-                    inst->Instruction.Opcode);
-            break;
-    }
-}
+    .code.r300.tex.inst[0] = R300_TEX_OP_LD << R300_TEX_INST_SHIFT,
+
+    .code.r300.alu.inst[0].rgb_inst = R300_RGB_SWIZA(R300_ALU_ARGC_SRC0C_XYZ) |
+        R300_RGB_SWIZB(R300_ALU_ARGC_SRC0C_XYZ) |
+        R300_RGB_SWIZC(R300_ALU_ARGC_ZERO) |
+        R300_ALU_OUTC_CMP,
+    .code.r300.alu.inst[0].rgb_addr = R300_RGB_ADDR0(0) | R300_RGB_ADDR1(0) |
+        R300_RGB_ADDR2(0) | R300_ALU_DSTC_OUTPUT_XYZ,
+    .code.r300.alu.inst[0].alpha_inst = R300_ALPHA_SWIZA(R300_ALU_ARGA_SRC0A) |
+        R300_ALPHA_SWIZB(R300_ALU_ARGA_SRC0A) |
+        R300_ALPHA_SWIZC(R300_ALU_ARGA_ZERO) |
+        R300_ALU_OUTA_CMP,
+    .code.r300.alu.inst[0].alpha_addr = R300_ALPHA_ADDR0(0) |
+        R300_ALPHA_ADDR1(0) | R300_ALPHA_ADDR2(0) | R300_ALU_DSTA_OUTPUT,
+};
diff --git a/src/gallium/drivers/r300/r3xx_fs.h b/src/gallium/drivers/r300/r3xx_fs.h
index 3da39ec252..51cd245724 100644
--- a/src/gallium/drivers/r300/r3xx_fs.h
+++ b/src/gallium/drivers/r300/r3xx_fs.h
@@ -24,53 +24,9 @@
 #ifndef R3XX_FS_H
 #define R3XX_FS_H
 
-#include "r300_fs_inlines.h"
+#include "radeon_code.h"
 
-static struct r3xx_fragment_shader r3xx_passthrough_fragment_shader = {
-    .alu_instruction_count = 1,
-    .tex_instruction_count = 0,
-    .indirections = 0,
-    .shader.stack_size = 1,
-
-    .instructions[0].alu_rgb_inst = R300_RGB_SWIZA(R300_ALU_ARGC_SRC0C_XYZ) |
-        R300_RGB_SWIZB(R300_ALU_ARGC_SRC0C_XYZ) |
-        R300_RGB_SWIZC(R300_ALU_ARGC_ZERO) |
-        R300_ALU_OUTC_CMP,
-    .instructions[0].alu_rgb_addr = R300_RGB_ADDR0(0) | R300_RGB_ADDR1(0) |
-        R300_RGB_ADDR2(0) | R300_ALU_DSTC_OUTPUT_XYZ,
-    .instructions[0].alu_alpha_inst = R300_ALPHA_SWIZA(R300_ALU_ARGA_SRC0A) |
-        R300_ALPHA_SWIZB(R300_ALU_ARGA_SRC0A) |
-        R300_ALPHA_SWIZC(R300_ALU_ARGA_ZERO) |
-        R300_ALU_OUTA_CMP,
-    .instructions[0].alu_alpha_addr = R300_ALPHA_ADDR0(0) |
-        R300_ALPHA_ADDR1(0) | R300_ALPHA_ADDR2(0) | R300_ALU_DSTA_OUTPUT,
-};
-
-static struct r3xx_fragment_shader r3xx_texture_fragment_shader = {
-    .alu_instruction_count = 1,
-    .tex_instruction_count = 0,
-    .indirections = 0,
-    .shader.stack_size = 1,
-
-    .instructions[0].alu_rgb_inst = R300_RGB_SWIZA(R300_ALU_ARGC_SRC0C_XYZ) |
-        R300_RGB_SWIZB(R300_ALU_ARGC_SRC0C_XYZ) |
-        R300_RGB_SWIZC(R300_ALU_ARGC_ZERO) |
-        R300_ALU_OUTC_CMP,
-    .instructions[0].alu_rgb_addr = R300_RGB_ADDR0(0) | R300_RGB_ADDR1(0) |
-        R300_RGB_ADDR2(0) | R300_ALU_DSTC_OUTPUT_XYZ,
-    .instructions[0].alu_alpha_inst = R300_ALPHA_SWIZA(R300_ALU_ARGA_SRC0A) |
-        R300_ALPHA_SWIZB(R300_ALU_ARGA_SRC0A) |
-        R300_ALPHA_SWIZC(R300_ALU_ARGA_ZERO) |
-        R300_ALU_OUTA_CMP,
-    .instructions[0].alu_alpha_addr = R300_ALPHA_ADDR0(0) |
-        R300_ALPHA_ADDR1(0) | R300_ALPHA_ADDR2(0) | R300_ALU_DSTA_OUTPUT,
-};
-
-void r3xx_fs_finalize(struct r300_fragment_shader* fs,
-                      struct r300_fs_asm* assembler);
-
-void r3xx_fs_instruction(struct r3xx_fragment_shader* fs,
-                         struct r300_fs_asm* assembler,
-                         struct tgsi_full_instruction* inst);
+extern struct rX00_fragment_program_code r3xx_passthrough_fragment_shader; /* declaration only */
+extern struct rX00_fragment_program_code r3xx_texture_fragment_shader; /* declaration only */
 
 #endif /* R3XX_FS_H */
diff --git a/src/gallium/drivers/r300/r5xx_fs.c b/src/gallium/drivers/r300/r5xx_fs.c
index 99d826278c..f072deab0d 100644
--- a/src/gallium/drivers/r300/r5xx_fs.c
+++ b/src/gallium/drivers/r300/r5xx_fs.c
@@ -23,445 +23,103 @@
 
 #include "r5xx_fs.h"
 
-static INLINE unsigned r5xx_fix_swiz(unsigned s)
-{
-    /* For historical reasons, the swizzle values x, y, z, w, and 0 are
-     * equivalent to the actual machine code, but 1 is not. Thus, we just
-     * adjust it a bit... */
-    if (s == TGSI_EXTSWIZZLE_ONE) {
-        return R500_SWIZZLE_ONE;
-    } else {
-        return s;
-    }
-}
-
-static uint32_t r5xx_rgba_swiz(struct tgsi_full_src_register* reg)
-{
-    if (reg->SrcRegister.Extended) {
-        return r5xx_fix_swiz(reg->SrcRegisterExtSwz.ExtSwizzleX) |
-            (r5xx_fix_swiz(reg->SrcRegisterExtSwz.ExtSwizzleY) << 3) |
-            (r5xx_fix_swiz(reg->SrcRegisterExtSwz.ExtSwizzleZ) << 6) |
-            (r5xx_fix_swiz(reg->SrcRegisterExtSwz.ExtSwizzleW) << 9);
-    } else {
-        return reg->SrcRegister.SwizzleX |
-            (reg->SrcRegister.SwizzleY << 3) |
-            (reg->SrcRegister.SwizzleZ << 6) |
-            (reg->SrcRegister.SwizzleW << 9);
-    }
-}
-
-static uint32_t r5xx_strq_swiz(struct tgsi_full_src_register* reg)
-{
-    return reg->SrcRegister.SwizzleX |
-        (reg->SrcRegister.SwizzleY << 2) |
-        (reg->SrcRegister.SwizzleZ << 4) |
-        (reg->SrcRegister.SwizzleW << 6);
-}
-
-static INLINE uint32_t r5xx_rgb_swiz(struct tgsi_full_src_register* reg)
-{
-    /* Only the first 9 bits... */
-    return (r5xx_rgba_swiz(reg) & 0x1ff) |
-        (reg->SrcRegister.Negate ? (1 << 9) : 0) |
-        (reg->SrcRegisterExtMod.Absolute ? (1 << 10) : 0);
-}
-
-static INLINE uint32_t r5xx_alpha_swiz(struct tgsi_full_src_register* reg)
-{
-    /* Only the last 3 bits... */
-    return (r5xx_rgba_swiz(reg) >> 9) |
-        (reg->SrcRegister.Negate ? (1 << 9) : 0) |
-        (reg->SrcRegisterExtMod.Absolute ? (1 << 10) : 0);
-}
-
-static INLINE uint32_t r5xx_rgba_op(unsigned op)
-{
-    switch (op) {
-        case TGSI_OPCODE_COS:
-        case TGSI_OPCODE_EX2:
-        case TGSI_OPCODE_LG2:
-        case TGSI_OPCODE_RCP:
-        case TGSI_OPCODE_RSQ:
-        case TGSI_OPCODE_SIN:
-            return R500_ALU_RGBA_OP_SOP;
-        case TGSI_OPCODE_DDX:
-            return R500_ALU_RGBA_OP_MDH;
-        case TGSI_OPCODE_DDY:
-            return R500_ALU_RGBA_OP_MDV;
-        case TGSI_OPCODE_FRC:
-            return R500_ALU_RGBA_OP_FRC;
-        case TGSI_OPCODE_DP3:
-            return R500_ALU_RGBA_OP_DP3;
-        case TGSI_OPCODE_DP4:
-        case TGSI_OPCODE_DPH:
-            return R500_ALU_RGBA_OP_DP4;
-        case TGSI_OPCODE_ABS:
-        case TGSI_OPCODE_CMP:
-        case TGSI_OPCODE_MOV:
-        case TGSI_OPCODE_SWZ:
-            return R500_ALU_RGBA_OP_CMP;
-        case TGSI_OPCODE_ADD:
-        case TGSI_OPCODE_MAD:
-        case TGSI_OPCODE_MUL:
-        case TGSI_OPCODE_SUB:
-            return R500_ALU_RGBA_OP_MAD;
-        default:
-            return 0;
-    }
-}
-
-static INLINE uint32_t r5xx_alpha_op(unsigned op)
-{
-    switch (op) {
-        case TGSI_OPCODE_COS:
-            return R500_ALPHA_OP_COS;
-        case TGSI_OPCODE_EX2:
-            return R500_ALPHA_OP_EX2;
-        case TGSI_OPCODE_LG2:
-            return R500_ALPHA_OP_LN2;
-        case TGSI_OPCODE_RCP:
-            return R500_ALPHA_OP_RCP;
-        case TGSI_OPCODE_RSQ:
-            return R500_ALPHA_OP_RSQ;
-        case TGSI_OPCODE_FRC:
-            return R500_ALPHA_OP_FRC;
-        case TGSI_OPCODE_SIN:
-            return R500_ALPHA_OP_SIN;
-        case TGSI_OPCODE_DDX:
-            return R500_ALPHA_OP_MDH;
-        case TGSI_OPCODE_DDY:
-            return R500_ALPHA_OP_MDV;
-        case TGSI_OPCODE_DP3:
-        case TGSI_OPCODE_DP4:
-        case TGSI_OPCODE_DPH:
-            return R500_ALPHA_OP_DP;
-        case TGSI_OPCODE_ABS:
-        case TGSI_OPCODE_CMP:
-        case TGSI_OPCODE_MOV:
-        case TGSI_OPCODE_SWZ:
-            return R500_ALPHA_OP_CMP;
-        case TGSI_OPCODE_ADD:
-        case TGSI_OPCODE_MAD:
-        case TGSI_OPCODE_MUL:
-        case TGSI_OPCODE_SUB:
-            return R500_ALPHA_OP_MAD;
-        default:
-            return 0;
-    }
-}
-
-static INLINE uint32_t r5xx_tex_op(unsigned op)
-{
-    switch (op) {
-        case TGSI_OPCODE_KIL:
-            return R500_TEX_INST_TEXKILL;
-        case TGSI_OPCODE_TEX:
-            return R500_TEX_INST_LD;
-        case TGSI_OPCODE_TXB:
-            return R500_TEX_INST_LODBIAS;
-        case TGSI_OPCODE_TXP:
-            return R500_TEX_INST_PROJ;
-        default:
-            return 0;
-    }
-}
-
-/* Setup an ALU operation. */
-static INLINE void r5xx_emit_maths(struct r5xx_fragment_shader* fs,
-                                   struct r300_fs_asm* assembler,
-                                   struct tgsi_full_src_register* src,
-                                   struct tgsi_full_dst_register* dst,
-                                   unsigned op,
-                                   unsigned count)
-{
-    int i = fs->instruction_count;
-
-    if (dst->DstRegister.File == TGSI_FILE_OUTPUT) {
-        fs->instructions[i].inst0 = R500_INST_TYPE_OUT;
-        if (r300_fs_is_depr(assembler, dst)) {
-            fs->instructions[i].inst4 = R500_W_OMASK;
-        } else {
-            fs->instructions[i].inst0 |=
-                R500_ALU_OMASK(dst->DstRegister.WriteMask);
-        }
-    } else {
-        fs->instructions[i].inst0 = R500_INST_TYPE_ALU |
-            R500_ALU_WMASK(dst->DstRegister.WriteMask);
-    }
-
-    fs->instructions[i].inst0 |= R500_INST_TEX_SEM_WAIT;
-
-    fs->instructions[i].inst4 |=
-        R500_ALPHA_ADDRD(r300_fs_dst(assembler, &dst->DstRegister));
-    fs->instructions[i].inst5 =
-        R500_ALU_RGBA_ADDRD(r300_fs_dst(assembler, &dst->DstRegister));
-
-    switch (count) {
-        case 3:
-            fs->instructions[i].inst1 =
-                R500_RGB_ADDR2(r300_fs_src(assembler, &src[2].SrcRegister));
-            fs->instructions[i].inst2 =
-                R500_ALPHA_ADDR2(r300_fs_src(assembler, &src[2].SrcRegister));
-            fs->instructions[i].inst5 |=
-                R500_ALU_RGBA_SEL_C_SRC2 |
-                R500_SWIZ_RGBA_C(r5xx_rgb_swiz(&src[2])) |
-                R500_ALU_RGBA_ALPHA_SEL_C_SRC2 |
-                R500_SWIZ_ALPHA_C(r5xx_alpha_swiz(&src[2]));
-        case 2:
-            fs->instructions[i].inst1 |=
-                R500_RGB_ADDR1(r300_fs_src(assembler, &src[1].SrcRegister));
-            fs->instructions[i].inst2 |=
-                R500_ALPHA_ADDR1(r300_fs_src(assembler, &src[1].SrcRegister));
-            fs->instructions[i].inst3 =
-                R500_ALU_RGB_SEL_B_SRC1 |
-                R500_SWIZ_RGB_B(r5xx_rgb_swiz(&src[1]));
-            fs->instructions[i].inst4 |=
-                R500_ALPHA_SEL_B_SRC1 |
-                R500_SWIZ_ALPHA_B(r5xx_alpha_swiz(&src[1]));
-        case 1:
-        case 0:
-        default:
-            fs->instructions[i].inst1 |=
-                R500_RGB_ADDR0(r300_fs_src(assembler, &src[0].SrcRegister));
-            fs->instructions[i].inst2 |=
-                R500_ALPHA_ADDR0(r300_fs_src(assembler, &src[0].SrcRegister));
-            fs->instructions[i].inst3 |=
-                R500_ALU_RGB_SEL_A_SRC0 |
-                R500_SWIZ_RGB_A(r5xx_rgb_swiz(&src[0]));
-            fs->instructions[i].inst4 |=
-                R500_ALPHA_SEL_A_SRC0 |
-                R500_SWIZ_ALPHA_A(r5xx_alpha_swiz(&src[0]));
-            break;
-    }
-
-    fs->instructions[i].inst4 |= r5xx_alpha_op(op);
-    fs->instructions[i].inst5 |= r5xx_rgba_op(op);
-
-    fs->instruction_count++;
-}
-
-static INLINE void r5xx_emit_tex(struct r5xx_fragment_shader* fs,
-                                 struct r300_fs_asm* assembler,
-                                 struct tgsi_full_src_register* src,
-                                 struct tgsi_full_dst_register* dst,
-                                 uint32_t op)
-{
-    int i = fs->instruction_count;
-
-    fs->instructions[i].inst0 = R500_INST_TYPE_TEX |
-        R500_TEX_WMASK(dst->DstRegister.WriteMask) |
-        R500_INST_TEX_SEM_WAIT;
-    fs->instructions[i].inst1 = R500_TEX_ID(0) |
-        R500_TEX_SEM_ACQUIRE | //R500_TEX_IGNORE_UNCOVERED |
-        r5xx_tex_op(op);
-    fs->instructions[i].inst2 =
-        R500_TEX_SRC_ADDR(r300_fs_src(assembler, &src->SrcRegister)) |
-        R500_SWIZ_TEX_STRQ(r5xx_strq_swiz(src)) |
-        R500_TEX_DST_ADDR(r300_fs_dst(assembler, &dst->DstRegister)) |
+#include "r300_reg.h"
+
+/* XXX this all should find its way back to r300_reg */
+/* Swizzle tools: channel selector values, packed 3 bits per channel below */
+#define R500_SWIZZLE_ZERO 4
+#define R500_SWIZZLE_HALF 5
+#define R500_SWIZZLE_ONE 6
+#define R500_SWIZ_RGB_ZERO ((4 << 0) | (4 << 3) | (4 << 6))
+#define R500_SWIZ_RGB_ONE ((6 << 0) | (6 << 3) | (6 << 6))
+#define R500_SWIZ_RGB_RGB ((0 << 0) | (1 << 3) | (2 << 6))
+#define R500_SWIZ_MOD_NEG 1
+#define R500_SWIZ_MOD_ABS 2
+#define R500_SWIZ_MOD_NEG_ABS 3
+/* Swizzles for inst2: shift a packed swizzle into its field */
+#define R500_SWIZ_TEX_STRQ(x) ((x) << 8)
+#define R500_SWIZ_TEX_RGBA(x) ((x) << 24)
+/* Swizzles for inst3 */
+#define R500_SWIZ_RGB_A(x) ((x) << 2)
+#define R500_SWIZ_RGB_B(x) ((x) << 15)
+/* Swizzles for inst4 */
+#define R500_SWIZ_ALPHA_A(x) ((x) << 14)
+#define R500_SWIZ_ALPHA_B(x) ((x) << 21)
+/* Swizzles for inst5 */
+#define R500_SWIZ_RGBA_C(x) ((x) << 14)
+#define R500_SWIZ_ALPHA_C(x) ((x) << 27)
+/* Writemasks */
+#define R500_TEX_WMASK(x) ((x) << 11)
+#define R500_ALU_WMASK(x) ((x) << 11)
+#define R500_ALU_OMASK(x) ((x) << 15)
+#define R500_W_OMASK (1u << 31) /* unsigned: 1 << 31 overflows a 32-bit int (UB) */
+/* Pre-built R500 program: a single OUT-type CMP instruction reading address 0. */
+struct rX00_fragment_program_code r5xx_passthrough_fragment_shader = {
+    .code.r500.max_temp_idx = 0,
+    .code.r500.inst_end = 0, /* presumably the index of the last instruction -- confirm */
+
+    .code.r500.inst[0].inst0 = R500_INST_TYPE_OUT |
+        R500_INST_TEX_SEM_WAIT | R500_INST_LAST |
+        R500_INST_RGB_OMASK_RGB | R500_INST_ALPHA_OMASK |
+        R500_INST_RGB_CLAMP | R500_INST_ALPHA_CLAMP,
+    .code.r500.inst[0].inst1 =
+        R500_RGB_ADDR0(0) | R500_RGB_ADDR1(0) | R500_RGB_ADDR1_CONST |
+        R500_RGB_ADDR2(0) | R500_RGB_ADDR2_CONST,
+    .code.r500.inst[0].inst2 =
+        R500_ALPHA_ADDR0(0) | R500_ALPHA_ADDR1(0) | R500_ALPHA_ADDR1_CONST |
+        R500_ALPHA_ADDR2(0) | R500_ALPHA_ADDR2_CONST,
+    .code.r500.inst[0].inst3 =
+        R500_ALU_RGB_SEL_A_SRC0 | R500_ALU_RGB_R_SWIZ_A_R |
+        R500_ALU_RGB_G_SWIZ_A_G | R500_ALU_RGB_B_SWIZ_A_B |
+        R500_ALU_RGB_SEL_B_SRC0 | R500_ALU_RGB_R_SWIZ_B_R |
+        R500_ALU_RGB_B_SWIZ_B_G | R500_ALU_RGB_G_SWIZ_B_B, /* NOTE(review): G/B crossed vs. source A -- confirm intended */
+    .code.r500.inst[0].inst4 =
+        R500_ALPHA_OP_CMP | R500_ALPHA_SWIZ_A_A | R500_ALPHA_SWIZ_B_A,
+    .code.r500.inst[0].inst5 =
+        R500_ALU_RGBA_OP_CMP | R500_ALU_RGBA_R_SWIZ_0 |
+        R500_ALU_RGBA_G_SWIZ_0 | R500_ALU_RGBA_B_SWIZ_0 |
+        R500_ALU_RGBA_A_SWIZ_0, /* third (C) operand swizzled to constant 0 on all channels */
+};
+
+struct rX00_fragment_program_code r5xx_texture_fragment_shader = {
+    .code.r500.max_temp_idx = 0,
+    .code.r500.inst_end = 1,
+
+    .code.r500.inst[0].inst0 = R500_INST_TYPE_TEX |
+        R500_INST_TEX_SEM_WAIT |
+        R500_INST_RGB_WMASK_RGB | R500_INST_ALPHA_WMASK |
+        R500_INST_RGB_CLAMP | R500_INST_ALPHA_CLAMP,
+    .code.r500.inst[0].inst1 = R500_TEX_ID(0) | R500_TEX_INST_LD |
+        R500_TEX_SEM_ACQUIRE | R500_TEX_IGNORE_UNCOVERED,
+    .code.r500.inst[0].inst2 = R500_TEX_SRC_ADDR(0) |
+        R500_TEX_SRC_S_SWIZ_R | R500_TEX_SRC_T_SWIZ_G |
+        R500_TEX_SRC_R_SWIZ_B | R500_TEX_SRC_Q_SWIZ_A |
+        R500_TEX_DST_ADDR(0) |
         R500_TEX_DST_R_SWIZ_R | R500_TEX_DST_G_SWIZ_G |
-        R500_TEX_DST_B_SWIZ_B | R500_TEX_DST_A_SWIZ_A;
-
-    if (dst->DstRegister.File == TGSI_FILE_OUTPUT) {
-        fs->instructions[i].inst2 |=
-            R500_TEX_DST_ADDR(assembler->temp_count +
-                    assembler->temp_offset);
-
-        fs->instruction_count++;
-
-        /* Setup and emit a MOV. */
-        src[0].SrcRegister.Index = assembler->temp_count;
-        src[0].SrcRegister.File = TGSI_FILE_TEMPORARY;
-
-        src[1] = src[0];
-        src[2] = r300_constant_zero;
-        r5xx_emit_maths(fs, assembler, src, dst, TGSI_OPCODE_MOV, 3);
-    } else {
-        fs->instruction_count++;
-    }
-}
-
-void r5xx_fs_finalize(struct r5xx_fragment_shader* fs,
-                      struct r300_fs_asm* assembler)
-{
-    /* XXX should this just go with OPCODE_END? */
-    fs->instructions[fs->instruction_count - 1].inst0 |=
-        R500_INST_LAST;
-}
-
-void r5xx_fs_instruction(struct r5xx_fragment_shader* fs,
-                         struct r300_fs_asm* assembler,
-                         struct tgsi_full_instruction* inst)
-{
-    /* Switch between opcodes. When possible, prefer using the official
-     * AMD/ATI names for opcodes, please, as it facilitates using the
-     * documentation. */
-    switch (inst->Instruction.Opcode) {
-        /* XXX trig needs extra prep */
-        case TGSI_OPCODE_COS:
-        case TGSI_OPCODE_SIN:
-        /* The simple scalar ops. */
-        case TGSI_OPCODE_EX2:
-        case TGSI_OPCODE_LG2:
-        case TGSI_OPCODE_RCP:
-        case TGSI_OPCODE_RSQ:
-            /* Copy red swizzle to alpha for src0 */
-            inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtSwizzleW =
-                inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtSwizzleX;
-            inst->FullSrcRegisters[0].SrcRegister.SwizzleW =
-                inst->FullSrcRegisters[0].SrcRegister.SwizzleX;
-            /* Fall through */
-        case TGSI_OPCODE_DDX:
-        case TGSI_OPCODE_DDY:
-        case TGSI_OPCODE_FRC:
-            r5xx_emit_maths(fs, assembler, inst->FullSrcRegisters,
-                    &inst->FullDstRegisters[0], inst->Instruction.Opcode, 1);
-            break;
-
-        /* The dot products. */
-        case TGSI_OPCODE_DPH:
-            /* Set alpha swizzle to one for src0 */
-            if (!inst->FullSrcRegisters[0].SrcRegister.Extended) {
-                inst->FullSrcRegisters[0].SrcRegister.Extended = TRUE;
-                inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtSwizzleX =
-                    inst->FullSrcRegisters[0].SrcRegister.SwizzleX;
-                inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtSwizzleY =
-                    inst->FullSrcRegisters[0].SrcRegister.SwizzleY;
-                inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtSwizzleZ =
-                    inst->FullSrcRegisters[0].SrcRegister.SwizzleZ;
-            }
-            inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtSwizzleW =
-                TGSI_EXTSWIZZLE_ONE;
-            /* Fall through */
-        case TGSI_OPCODE_DP3:
-        case TGSI_OPCODE_DP4:
-            r5xx_emit_maths(fs, assembler, inst->FullSrcRegisters,
-                    &inst->FullDstRegisters[0], inst->Instruction.Opcode, 2);
-            break;
-
-        /* Simple three-source operations. */
-        case TGSI_OPCODE_CMP:
-            /* Swap src0 and src2 */
-            inst->FullSrcRegisters[3] = inst->FullSrcRegisters[2];
-            inst->FullSrcRegisters[2] = inst->FullSrcRegisters[0];
-            inst->FullSrcRegisters[0] = inst->FullSrcRegisters[3];
-            r5xx_emit_maths(fs, assembler, inst->FullSrcRegisters,
-                    &inst->FullDstRegisters[0], inst->Instruction.Opcode, 3);
-            break;
-
-        /* The MAD variants. */
-        case TGSI_OPCODE_SUB:
-            /* Just like ADD, but flip the negation on src1 first */
-            inst->FullSrcRegisters[1].SrcRegister.Negate =
-                !inst->FullSrcRegisters[1].SrcRegister.Negate;
-            /* Fall through */
-        case TGSI_OPCODE_ADD:
-            /* Force src0 to one, move all registers over */
-            inst->FullSrcRegisters[2] = inst->FullSrcRegisters[1];
-            inst->FullSrcRegisters[1] = inst->FullSrcRegisters[0];
-            inst->FullSrcRegisters[0] = r300_constant_one;
-            r5xx_emit_maths(fs, assembler, inst->FullSrcRegisters,
-                    &inst->FullDstRegisters[0], inst->Instruction.Opcode, 3);
-            break;
-        case TGSI_OPCODE_MUL:
-            /* Force our src2 to zero */
-            inst->FullSrcRegisters[2] = r300_constant_zero;
-            r5xx_emit_maths(fs, assembler, inst->FullSrcRegisters,
-                    &inst->FullDstRegisters[0], inst->Instruction.Opcode, 3);
-            break;
-        case TGSI_OPCODE_MAD:
-            r5xx_emit_maths(fs, assembler, inst->FullSrcRegisters,
-                    &inst->FullDstRegisters[0], inst->Instruction.Opcode, 3);
-            break;
-
-        /* The MOV variants. */
-        case TGSI_OPCODE_ABS:
-            /* Set absolute value modifiers. */
-            inst->FullSrcRegisters[0].SrcRegisterExtMod.Absolute = TRUE;
-            /* Fall through */
-        case TGSI_OPCODE_MOV:
-        case TGSI_OPCODE_SWZ:
-            /* src0 -> src1 and src2 forced to zero */
-            inst->FullSrcRegisters[1] = inst->FullSrcRegisters[0];
-            inst->FullSrcRegisters[2] = r300_constant_zero;
-            r5xx_emit_maths(fs, assembler, inst->FullSrcRegisters,
-                    &inst->FullDstRegisters[0], inst->Instruction.Opcode, 3);
-            break;
-
-        /* The compound and hybrid insts. */
-        case TGSI_OPCODE_LRP:
-            /* LRP DST A, B, C -> MAD TMP -A, C, C; MAD DST A, B, TMP */
-            inst->FullSrcRegisters[3] = inst->FullSrcRegisters[1];
-            inst->FullSrcRegisters[1] = inst->FullSrcRegisters[2];
-            inst->FullSrcRegisters[0].SrcRegister.Negate =
-                !(inst->FullSrcRegisters[0].SrcRegister.Negate);
-            inst->FullDstRegisters[1] = inst->FullDstRegisters[0];
-            inst->FullDstRegisters[0].DstRegister.Index =
-                assembler->temp_count;
-            inst->FullDstRegisters[0].DstRegister.File = TGSI_FILE_TEMPORARY;
-            r5xx_emit_maths(fs, assembler, inst->FullSrcRegisters,
-                    &inst->FullDstRegisters[0], TGSI_OPCODE_MAD, 3);
-            inst->FullSrcRegisters[2].SrcRegister.Index =
-                assembler->temp_count;
-            inst->FullSrcRegisters[2].SrcRegister.File = TGSI_FILE_TEMPORARY;
-            inst->FullSrcRegisters[2].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
-            inst->FullSrcRegisters[2].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
-            inst->FullSrcRegisters[2].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Z;
-            inst->FullSrcRegisters[2].SrcRegister.SwizzleW = TGSI_SWIZZLE_W;
-            inst->FullSrcRegisters[1] = inst->FullSrcRegisters[3];
-            inst->FullSrcRegisters[0].SrcRegister.Negate =
-                !(inst->FullSrcRegisters[0].SrcRegister.Negate);
-            inst->FullDstRegisters[0] = inst->FullDstRegisters[1];
-            r5xx_emit_maths(fs, assembler, inst->FullSrcRegisters,
-                    &inst->FullDstRegisters[0], TGSI_OPCODE_MAD, 3);
-            break;
-        case TGSI_OPCODE_POW:
-            /* POW DST A, B -> LG2 TMP A; MUL TMP TMP, B; EX2 DST TMP */
-            inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtSwizzleW =
-                inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtSwizzleX;
-            inst->FullSrcRegisters[0].SrcRegister.SwizzleW =
-                inst->FullSrcRegisters[0].SrcRegister.SwizzleX;
-            inst->FullDstRegisters[1] = inst->FullDstRegisters[0];
-            inst->FullDstRegisters[0].DstRegister.Index =
-                assembler->temp_count;
-            inst->FullDstRegisters[0].DstRegister.File = TGSI_FILE_TEMPORARY;
-            r5xx_emit_maths(fs, assembler, inst->FullSrcRegisters,
-                    &inst->FullDstRegisters[0], TGSI_OPCODE_LG2, 1);
-            inst->FullSrcRegisters[0].SrcRegister.Index =
-                assembler->temp_count;
-            inst->FullSrcRegisters[0].SrcRegister.File = TGSI_FILE_TEMPORARY;
-            inst->FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
-            inst->FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
-            inst->FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Z;
-            inst->FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_W;
-            inst->FullSrcRegisters[2] = r300_constant_zero;
-            r5xx_emit_maths(fs, assembler, inst->FullSrcRegisters,
-                    &inst->FullDstRegisters[0], TGSI_OPCODE_MUL, 3);
-            inst->FullDstRegisters[0] = inst->FullDstRegisters[1];
-            r5xx_emit_maths(fs, assembler, inst->FullSrcRegisters,
-                    &inst->FullDstRegisters[0], TGSI_OPCODE_EX2, 1);
-            break;
-
-        /* The texture instruction set. */
-        case TGSI_OPCODE_KIL:
-        case TGSI_OPCODE_TEX:
-        case TGSI_OPCODE_TXB:
-        case TGSI_OPCODE_TXP:
-            r5xx_emit_tex(fs, assembler, &inst->FullSrcRegisters[0],
-                    &inst->FullDstRegisters[0], inst->Instruction.Opcode);
-            break;
-
-        /* This is the end. My only friend, the end. */
-        case TGSI_OPCODE_END:
-            break;
-        default:
-            debug_printf("r300: fs: Bad opcode %d\n",
-                    inst->Instruction.Opcode);
-            break;
-    }
-
-    /* Clamp, if saturation flags are set. */
-    if (inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE) {
-        fs->instructions[fs->instruction_count - 1].inst0 |=
-            R500_INST_RGB_CLAMP | R500_INST_ALPHA_CLAMP;
-    }
-}
+        R500_TEX_DST_B_SWIZ_B | R500_TEX_DST_A_SWIZ_A,
+    .code.r500.inst[0].inst3 = 0x0,
+    .code.r500.inst[0].inst4 = 0x0,
+    .code.r500.inst[0].inst5 = 0x0,
+
+    .code.r500.inst[1].inst0 = R500_INST_TYPE_OUT |
+        R500_INST_TEX_SEM_WAIT | R500_INST_LAST |
+        R500_INST_RGB_OMASK_RGB | R500_INST_ALPHA_OMASK |
+        R500_INST_RGB_CLAMP | R500_INST_ALPHA_CLAMP,
+    .code.r500.inst[1].inst1 =
+        R500_RGB_ADDR0(0) | R500_RGB_ADDR1(0) | R500_RGB_ADDR1_CONST |
+        R500_RGB_ADDR2(0) | R500_RGB_ADDR2_CONST,
+    .code.r500.inst[1].inst2 =
+        R500_ALPHA_ADDR0(0) | R500_ALPHA_ADDR1(0) | R500_ALPHA_ADDR1_CONST |
+        R500_ALPHA_ADDR2(0) | R500_ALPHA_ADDR2_CONST,
+    .code.r500.inst[1].inst3 =
+        R500_ALU_RGB_SEL_A_SRC0 | R500_ALU_RGB_R_SWIZ_A_R |
+        R500_ALU_RGB_G_SWIZ_A_G | R500_ALU_RGB_B_SWIZ_A_B |
+        R500_ALU_RGB_SEL_B_SRC0 | R500_ALU_RGB_R_SWIZ_B_R |
+        R500_ALU_RGB_B_SWIZ_B_G | R500_ALU_RGB_G_SWIZ_B_B,
+    .code.r500.inst[1].inst4 =
+        R500_ALPHA_OP_CMP | R500_ALPHA_SWIZ_A_A | R500_ALPHA_SWIZ_B_A,
+    .code.r500.inst[1].inst5 =
+        R500_ALU_RGBA_OP_CMP | R500_ALU_RGBA_R_SWIZ_0 |
+        R500_ALU_RGBA_G_SWIZ_0 | R500_ALU_RGBA_B_SWIZ_0 |
+        R500_ALU_RGBA_A_SWIZ_0,
+};
diff --git a/src/gallium/drivers/r300/r5xx_fs.h b/src/gallium/drivers/r300/r5xx_fs.h
index 629e587be4..a4addde32b 100644
--- a/src/gallium/drivers/r300/r5xx_fs.h
+++ b/src/gallium/drivers/r300/r5xx_fs.h
@@ -24,109 +24,9 @@
 #ifndef R5XX_FS_H
 #define R5XX_FS_H
 
-#include "r300_fs_inlines.h"
+#include "radeon_code.h"
 
-/* XXX this all should find its way back to r300_reg */
-/* Swizzle tools */
-#define R500_SWIZZLE_ZERO 4
-#define R500_SWIZZLE_HALF 5
-#define R500_SWIZZLE_ONE 6
-#define R500_SWIZ_RGB_ZERO ((4 << 0) | (4 << 3) | (4 << 6))
-#define R500_SWIZ_RGB_ONE ((6 << 0) | (6 << 3) | (6 << 6))
-#define R500_SWIZ_RGB_RGB ((0 << 0) | (1 << 3) | (2 << 6))
-#define R500_SWIZ_MOD_NEG 1
-#define R500_SWIZ_MOD_ABS 2
-#define R500_SWIZ_MOD_NEG_ABS 3
-/* Swizzles for inst2 */
-#define R500_SWIZ_TEX_STRQ(x) ((x) << 8)
-#define R500_SWIZ_TEX_RGBA(x) ((x) << 24)
-/* Swizzles for inst3 */
-#define R500_SWIZ_RGB_A(x) ((x) << 2)
-#define R500_SWIZ_RGB_B(x) ((x) << 15)
-/* Swizzles for inst4 */
-#define R500_SWIZ_ALPHA_A(x) ((x) << 14)
-#define R500_SWIZ_ALPHA_B(x) ((x) << 21)
-/* Swizzle for inst5 */
-#define R500_SWIZ_RGBA_C(x) ((x) << 14)
-#define R500_SWIZ_ALPHA_C(x) ((x) << 27)
-/* Writemasks */
-#define R500_TEX_WMASK(x) ((x) << 11)
-#define R500_ALU_WMASK(x) ((x) << 11)
-#define R500_ALU_OMASK(x) ((x) << 15)
-#define R500_W_OMASK (1 << 31)
-
-static struct r5xx_fragment_shader r5xx_passthrough_fragment_shader = {
-    .shader.stack_size = 0,
-    .instruction_count = 1,
-    .instructions[0].inst0 = R500_INST_TYPE_OUT |
-        R500_INST_TEX_SEM_WAIT | R500_INST_LAST |
-        R500_INST_RGB_OMASK_RGB | R500_INST_ALPHA_OMASK |
-        R500_INST_RGB_CLAMP | R500_INST_ALPHA_CLAMP,
-    .instructions[0].inst1 =
-        R500_RGB_ADDR0(0) | R500_RGB_ADDR1(0) | R500_RGB_ADDR1_CONST |
-        R500_RGB_ADDR2(0) | R500_RGB_ADDR2_CONST,
-    .instructions[0].inst2 =
-        R500_ALPHA_ADDR0(0) | R500_ALPHA_ADDR1(0) | R500_ALPHA_ADDR1_CONST |
-        R500_ALPHA_ADDR2(0) | R500_ALPHA_ADDR2_CONST,
-    .instructions[0].inst3 =
-        R500_ALU_RGB_SEL_A_SRC0 | R500_ALU_RGB_R_SWIZ_A_R |
-        R500_ALU_RGB_G_SWIZ_A_G | R500_ALU_RGB_B_SWIZ_A_B |
-        R500_ALU_RGB_SEL_B_SRC0 | R500_ALU_RGB_R_SWIZ_B_R |
-        R500_ALU_RGB_B_SWIZ_B_G | R500_ALU_RGB_G_SWIZ_B_B,
-    .instructions[0].inst4 =
-        R500_ALPHA_OP_CMP | R500_ALPHA_SWIZ_A_A | R500_ALPHA_SWIZ_B_A,
-    .instructions[0].inst5 =
-        R500_ALU_RGBA_OP_CMP | R500_ALU_RGBA_R_SWIZ_0 |
-        R500_ALU_RGBA_G_SWIZ_0 | R500_ALU_RGBA_B_SWIZ_0 |
-        R500_ALU_RGBA_A_SWIZ_0,
-};
-
-static struct r5xx_fragment_shader r5xx_texture_fragment_shader = {
-    .shader.stack_size = 1,
-    .instruction_count = 2,
-    .instructions[0].inst0 = R500_INST_TYPE_TEX |
-        R500_INST_TEX_SEM_WAIT |
-        R500_INST_RGB_WMASK_RGB | R500_INST_ALPHA_WMASK |
-        R500_INST_RGB_CLAMP | R500_INST_ALPHA_CLAMP,
-    .instructions[0].inst1 = R500_TEX_ID(0) | R500_TEX_INST_LD |
-        R500_TEX_SEM_ACQUIRE | R500_TEX_IGNORE_UNCOVERED,
-    .instructions[0].inst2 = R500_TEX_SRC_ADDR(0) |
-        R500_TEX_SRC_S_SWIZ_R | R500_TEX_SRC_T_SWIZ_G |
-        R500_TEX_SRC_R_SWIZ_B | R500_TEX_SRC_Q_SWIZ_A |
-        R500_TEX_DST_ADDR(0) |
-        R500_TEX_DST_R_SWIZ_R | R500_TEX_DST_G_SWIZ_G |
-        R500_TEX_DST_B_SWIZ_B | R500_TEX_DST_A_SWIZ_A,
-    .instructions[0].inst3 = 0x0,
-    .instructions[0].inst4 = 0x0,
-    .instructions[0].inst5 = 0x0,
-    .instructions[1].inst0 = R500_INST_TYPE_OUT |
-        R500_INST_TEX_SEM_WAIT | R500_INST_LAST |
-        R500_INST_RGB_OMASK_RGB | R500_INST_ALPHA_OMASK |
-        R500_INST_RGB_CLAMP | R500_INST_ALPHA_CLAMP,
-    .instructions[1].inst1 =
-        R500_RGB_ADDR0(0) | R500_RGB_ADDR1(0) | R500_RGB_ADDR1_CONST |
-        R500_RGB_ADDR2(0) | R500_RGB_ADDR2_CONST,
-    .instructions[1].inst2 =
-        R500_ALPHA_ADDR0(0) | R500_ALPHA_ADDR1(0) | R500_ALPHA_ADDR1_CONST |
-        R500_ALPHA_ADDR2(0) | R500_ALPHA_ADDR2_CONST,
-    .instructions[1].inst3 =
-        R500_ALU_RGB_SEL_A_SRC0 | R500_ALU_RGB_R_SWIZ_A_R |
-        R500_ALU_RGB_G_SWIZ_A_G | R500_ALU_RGB_B_SWIZ_A_B |
-        R500_ALU_RGB_SEL_B_SRC0 | R500_ALU_RGB_R_SWIZ_B_R |
-        R500_ALU_RGB_B_SWIZ_B_G | R500_ALU_RGB_G_SWIZ_B_B,
-    .instructions[1].inst4 =
-        R500_ALPHA_OP_CMP | R500_ALPHA_SWIZ_A_A | R500_ALPHA_SWIZ_B_A,
-    .instructions[1].inst5 =
-        R500_ALU_RGBA_OP_CMP | R500_ALU_RGBA_R_SWIZ_0 |
-        R500_ALU_RGBA_G_SWIZ_0 | R500_ALU_RGBA_B_SWIZ_0 |
-        R500_ALU_RGBA_A_SWIZ_0,
-};
-
-void r5xx_fs_finalize(struct r5xx_fragment_shader* fs,
-                      struct r300_fs_asm* assembler);
-
-void r5xx_fs_instruction(struct r5xx_fragment_shader* fs,
-                         struct r300_fs_asm* assembler,
-                         struct tgsi_full_instruction* inst);
+extern struct rX00_fragment_program_code r5xx_passthrough_fragment_shader; /* defined in r5xx_fs.c */
+extern struct rX00_fragment_program_code r5xx_texture_fragment_shader; /* defined in r5xx_fs.c */
 
 #endif /* R5XX_FS_H */
diff --git a/src/gallium/drivers/softpipe/sp_clear.c b/src/gallium/drivers/softpipe/sp_clear.c
index fa59277438..8fac8e6e05 100644
--- a/src/gallium/drivers/softpipe/sp_clear.c
+++ b/src/gallium/drivers/softpipe/sp_clear.c
@@ -36,8 +36,6 @@
 #include "util/u_pack_color.h"
 #include "sp_clear.h"
 #include "sp_context.h"
-#include "sp_surface.h"
-#include "sp_state.h"
 #include "sp_tile_cache.h"
 
 
@@ -85,5 +83,7 @@ softpipe_clear(struct pipe_context *pipe, unsigned buffers, const float *rgba,
       /* non-cached surface */
       pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, cv);
 #endif
-      }
+   }
+
+   softpipe->dirty_render_cache = TRUE;
 }
diff --git a/src/gallium/drivers/softpipe/sp_clear.h b/src/gallium/drivers/softpipe/sp_clear.h
index 2e450672f5..9be3b86fe9 100644
--- a/src/gallium/drivers/softpipe/sp_clear.h
+++ b/src/gallium/drivers/softpipe/sp_clear.h
@@ -32,7 +32,6 @@
 #ifndef SP_CLEAR_H
 #define SP_CLEAR_H
 
-#include "pipe/p_state.h"
 struct pipe_context;
 
 extern void
diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c
index c4b8b33c6a..e1e31ab047 100644
--- a/src/gallium/drivers/softpipe/sp_context.c
+++ b/src/gallium/drivers/softpipe/sp_context.c
@@ -81,7 +81,8 @@ softpipe_unmap_transfers(struct softpipe_context *sp)
 }
 
 
-static void softpipe_destroy( struct pipe_context *pipe )
+static void
+softpipe_destroy( struct pipe_context *pipe )
 {
    struct softpipe_context *softpipe = softpipe_context( pipe );
    uint i;
@@ -109,6 +110,15 @@ static void softpipe_destroy( struct pipe_context *pipe )
    FREE( softpipe );
 }
 
+
+/**
+ * if (the texture is being used as a framebuffer surface)
+ *    return PIPE_REFERENCED_FOR_WRITE
+ * else if (the texture is a bound texture source)
+ *    return PIPE_REFERENCED_FOR_READ  XXX not done yet
+ * else
+ *    return PIPE_UNREFERENCED
+ */
 static unsigned int
 softpipe_is_texture_referenced( struct pipe_context *pipe,
 				struct pipe_texture *texture,
@@ -117,15 +127,17 @@ softpipe_is_texture_referenced( struct pipe_context *pipe,
    struct softpipe_context *softpipe = softpipe_context( pipe );
    unsigned i;
 
-   if(softpipe->dirty_render_cache) {
+   if (softpipe->dirty_render_cache) {
       for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++) {
-         if(softpipe->framebuffer.cbufs[i] && 
-            softpipe->framebuffer.cbufs[i]->texture == texture)
+         if (softpipe->framebuffer.cbufs[i] && 
+             softpipe->framebuffer.cbufs[i]->texture == texture) {
             return PIPE_REFERENCED_FOR_WRITE;
+         }
       }
-      if(softpipe->framebuffer.zsbuf && 
-         softpipe->framebuffer.zsbuf->texture == texture)
+      if (softpipe->framebuffer.zsbuf && 
+          softpipe->framebuffer.zsbuf->texture == texture) {
          return PIPE_REFERENCED_FOR_WRITE;
+      }
    }
    
    /* FIXME: we also need to do the same for the texture cache */
@@ -133,6 +145,7 @@ softpipe_is_texture_referenced( struct pipe_context *pipe,
    return PIPE_UNREFERENCED;
 }
 
+
 static unsigned int
 softpipe_is_buffer_referenced( struct pipe_context *pipe,
 			       struct pipe_buffer *buf)
@@ -140,6 +153,7 @@ softpipe_is_buffer_referenced( struct pipe_context *pipe,
    return PIPE_UNREFERENCED;
 }
 
+
 struct pipe_context *
 softpipe_create( struct pipe_screen *screen )
 {
@@ -210,7 +224,6 @@ softpipe_create( struct pipe_screen *screen )
    softpipe->pipe.is_buffer_referenced = softpipe_is_buffer_referenced;
 
    softpipe_init_query_funcs( softpipe );
-   softpipe_init_texture_funcs( softpipe );
 
    /*
     * Alloc caches for accessing drawing surfaces and textures.
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index 6178c4ac7e..6cf45cded2 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -40,7 +40,7 @@
 static const char *
 softpipe_get_vendor(struct pipe_screen *screen)
 {
-   return "Tungsten Graphics, Inc.";
+   return "VMware, Inc.";
 }
 
 
@@ -65,8 +65,6 @@ softpipe_get_param(struct pipe_screen *screen, int param)
       return 1;
    case PIPE_CAP_GLSL:
       return 1;
-   case PIPE_CAP_S3TC:
-      return 0;
    case PIPE_CAP_ANISOTROPIC_FILTER:
       return 0;
    case PIPE_CAP_POINT_SPRITE:
@@ -141,6 +139,7 @@ softpipe_is_format_supported( struct pipe_screen *screen,
    case PIPE_FORMAT_DXT1_RGBA:
    case PIPE_FORMAT_DXT3_RGBA:
    case PIPE_FORMAT_DXT5_RGBA:
+   case PIPE_FORMAT_Z32_FLOAT:
       return FALSE;
    default:
       return TRUE;
diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c
index 856c9ce176..1faeca1c2a 100644
--- a/src/gallium/drivers/softpipe/sp_state_derived.c
+++ b/src/gallium/drivers/softpipe/sp_state_derived.c
@@ -89,6 +89,23 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
       vinfo->num_attribs = 0;
       for (i = 0; i < spfs->info.num_inputs; i++) {
          int src;
+         enum interp_mode interp;
+
+         switch (spfs->info.input_interpolate[i]) {
+         case TGSI_INTERPOLATE_CONSTANT:
+            interp = INTERP_CONSTANT;
+            break;
+         case TGSI_INTERPOLATE_LINEAR:
+            interp = INTERP_LINEAR;
+            break;
+         case TGSI_INTERPOLATE_PERSPECTIVE:
+            interp = INTERP_PERSPECTIVE;
+            break;
+         default:
+            assert(0);
+            interp = INTERP_LINEAR;
+         }
+
          switch (spfs->info.input_semantic_name[i]) {
          case TGSI_SEMANTIC_POSITION:
             src = draw_find_vs_output(softpipe->draw,
@@ -104,7 +121,7 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
 
          case TGSI_SEMANTIC_FOG:
             src = draw_find_vs_output(softpipe->draw, TGSI_SEMANTIC_FOG, 0);
-            draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
+            draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src);
             break;
 
          case TGSI_SEMANTIC_GENERIC:
@@ -112,7 +129,7 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
             /* this includes texcoords and varying vars */
             src = draw_find_vs_output(softpipe->draw, TGSI_SEMANTIC_GENERIC,
                                       spfs->info.input_semantic_index[i]);
-            draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
+            draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src);
             break;
 
          default:
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c
index ba9b91a378..21031c11b8 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -1526,7 +1526,7 @@ sample_cube(struct tgsi_sampler *tgsi_sampler,
       unsigned face;
       float sc, tc, ma;
 
-      if (arx > ary && arx > arz) {
+      if (arx >= ary && arx >= arz) {
          if (rx >= 0.0F) {
             face = PIPE_TEX_FACE_POS_X;
             sc = -rz;
@@ -1540,7 +1540,7 @@ sample_cube(struct tgsi_sampler *tgsi_sampler,
             ma = arx;
          }
       }
-      else if (ary > arx && ary > arz) {
+      else if (ary >= arx && ary >= arz) {
          if (ry >= 0.0F) {
             face = PIPE_TEX_FACE_POS_Y;
             sc = rx;
diff --git a/src/gallium/drivers/softpipe/sp_texture.c b/src/gallium/drivers/softpipe/sp_texture.c
index a3a54dada4..49b51afda7 100644
--- a/src/gallium/drivers/softpipe/sp_texture.c
+++ b/src/gallium/drivers/softpipe/sp_texture.c
@@ -30,31 +30,21 @@
   *   Michel Dänzer <michel@tungstengraphics.com>
   */
 
-#include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/internal/p_winsys_screen.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 
 #include "sp_context.h"
 #include "sp_state.h"
 #include "sp_texture.h"
-#include "sp_tile_cache.h"
 #include "sp_screen.h"
 #include "sp_winsys.h"
 
 
-/* Simple, maximally packed layout.
- */
-
-static unsigned minify( unsigned d )
-{
-   return MAX2(1, d>>1);
-}
-
-
-/* Conventional allocation path for non-display textures:
+/**
+ * Conventional allocation path for non-display textures:
+ * Use a simple, maximally packed layout.
  */
 static boolean
 softpipe_texture_layout(struct pipe_screen *screen,
@@ -94,12 +84,17 @@ softpipe_texture_layout(struct pipe_screen *screen,
    return spt->buffer != NULL;
 }
 
+
+/**
+ * Texture layout for display-target/primary textures (simple color buffers).
+ */
 static boolean
 softpipe_displaytarget_layout(struct pipe_screen *screen,
                               struct softpipe_texture * spt)
 {
    unsigned usage = (PIPE_BUFFER_USAGE_CPU_READ_WRITE |
                      PIPE_BUFFER_USAGE_GPU_READ_WRITE);
+   unsigned tex_usage = spt->base.tex_usage;
 
    spt->base.nblocksx[0] = pf_get_nblocksx(&spt->base.block, spt->base.width[0]);  
    spt->base.nblocksy[0] = pf_get_nblocksy(&spt->base.block, spt->base.height[0]);  
@@ -109,15 +104,13 @@ softpipe_displaytarget_layout(struct pipe_screen *screen,
                                                 spt->base.height[0],
                                                 spt->base.format,
                                                 usage,
+                                                tex_usage,
                                                 &spt->stride[0]);
 
    return spt->buffer != NULL;
 }
 
 
-
-
-
 static struct pipe_texture *
 softpipe_texture_create(struct pipe_screen *screen,
                         const struct pipe_texture *template)
@@ -134,7 +127,8 @@ softpipe_texture_create(struct pipe_screen *screen,
                util_is_power_of_two(template->height[0]) &&
                util_is_power_of_two(template->depth[0]));
 
-   if (spt->base.tex_usage & PIPE_TEXTURE_USAGE_DISPLAY_TARGET) {
+   if (spt->base.tex_usage & (PIPE_TEXTURE_USAGE_DISPLAY_TARGET |
+                              PIPE_TEXTURE_USAGE_PRIMARY)) {
       if (!softpipe_displaytarget_layout(screen, spt))
          goto fail;
    }
@@ -355,14 +349,13 @@ softpipe_transfer_map( struct pipe_screen *screen,
    /* May want to different things here depending on read/write nature
     * of the map:
     */
-   if (transfer->texture && transfer->usage != PIPE_TRANSFER_READ) 
-   {
+   if (transfer->texture && transfer->usage != PIPE_TRANSFER_READ) {
       /* Do something to notify sharing contexts of a texture change.
        * In softpipe, that would mean flushing the texture cache.
        */
       softpipe_screen(screen)->timestamp++;
    }
-   
+
    xfer_map = map + softpipe_transfer(transfer)->offset +
       transfer->y / transfer->block.height * transfer->stride +
       transfer->x / transfer->block.width * transfer->block.size;
@@ -373,7 +366,7 @@ softpipe_transfer_map( struct pipe_screen *screen,
 
 static void
 softpipe_transfer_unmap(struct pipe_screen *screen,
-                       struct pipe_transfer *transfer)
+                        struct pipe_transfer *transfer)
 {
    struct softpipe_texture *spt;
 
@@ -381,12 +374,11 @@ softpipe_transfer_unmap(struct pipe_screen *screen,
    spt = softpipe_texture(transfer->texture);
 
    pipe_buffer_unmap( screen, spt->buffer );
-}
-
 
-void
-softpipe_init_texture_funcs(struct softpipe_context *sp)
-{
+   if (transfer->usage != PIPE_TRANSFER_READ) {
+      /* Mark the texture as dirty to expire the tile caches. */
+      spt->timestamp++;
+   }
 }
 
 
@@ -412,7 +404,7 @@ softpipe_get_texture_buffer( struct pipe_texture *texture,
                              struct pipe_buffer **buf,
                              unsigned *stride )
 {
-   struct softpipe_texture *tex = (struct softpipe_texture *)texture;
+   struct softpipe_texture *tex = (struct softpipe_texture *) texture;
 
    if (!tex)
       return FALSE;
diff --git a/src/gallium/drivers/softpipe/sp_texture.h b/src/gallium/drivers/softpipe/sp_texture.h
index 4dd0c1239e..2537ab6a40 100644
--- a/src/gallium/drivers/softpipe/sp_texture.h
+++ b/src/gallium/drivers/softpipe/sp_texture.h
@@ -78,9 +78,6 @@ softpipe_transfer(struct pipe_transfer *pt)
 
 
 extern void
-softpipe_init_texture_funcs( struct softpipe_context *softpipe );
-
-extern void
 softpipe_init_screen_texture_funcs(struct pipe_screen *screen);
 
 
diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c
index 4ab718f233..ae0af4d055 100644
--- a/src/gallium/drivers/trace/tr_context.c
+++ b/src/gallium/drivers/trace/tr_context.c
@@ -1277,8 +1277,10 @@ trace_context_create(struct pipe_screen *_screen,
    tr_ctx->base.set_sampler_textures = trace_context_set_sampler_textures;
    tr_ctx->base.set_vertex_buffers = trace_context_set_vertex_buffers;
    tr_ctx->base.set_vertex_elements = trace_context_set_vertex_elements;
-   tr_ctx->base.surface_copy = trace_context_surface_copy;
-   tr_ctx->base.surface_fill = trace_context_surface_fill;
+   if (pipe->surface_copy)
+      tr_ctx->base.surface_copy = trace_context_surface_copy;
+   if (pipe->surface_fill)
+      tr_ctx->base.surface_fill = trace_context_surface_fill;
    tr_ctx->base.clear = trace_context_clear;
    tr_ctx->base.flush = trace_context_flush;
    tr_ctx->base.is_texture_referenced = trace_is_texture_referenced;
diff --git a/src/gallium/drivers/trace/tr_drm.c b/src/gallium/drivers/trace/tr_drm.c
index 98ac75e3fa..781ca5d3bc 100644
--- a/src/gallium/drivers/trace/tr_drm.c
+++ b/src/gallium/drivers/trace/tr_drm.c
@@ -49,7 +49,7 @@ trace_drm_api(struct drm_api *_api)
 
 static struct pipe_screen *
 trace_drm_create_screen(struct drm_api *_api, int fd,
-                           struct drm_create_screen_arg *arg)
+                        struct drm_create_screen_arg *arg)
 {
    struct trace_drm_api *tr_api = trace_drm_api(_api);
    struct drm_api *api = tr_api->api;
@@ -63,11 +63,11 @@ trace_drm_create_screen(struct drm_api *_api, int fd,
    screen = api->create_screen(api, fd, arg);
 
    return trace_screen_create(screen);
-};
+}
 
 static struct pipe_context *
 trace_drm_create_context(struct drm_api *_api,
-                            struct pipe_screen *_screen)
+                         struct pipe_screen *_screen)
 {
    struct trace_screen *tr_screen = trace_screen(_screen);
    struct trace_drm_api *tr_api = trace_drm_api(_api);
@@ -82,91 +82,67 @@ trace_drm_create_context(struct drm_api *_api,
    pipe = trace_context_create(_screen, pipe);
 
    return pipe;
-};
-
-static boolean
-trace_drm_buffer_from_texture(struct drm_api *_api,
-                                 struct pipe_texture *_texture,
-                                 struct pipe_buffer **_buffer,
-                                 unsigned *stride)
-{
-   struct trace_texture *tr_texture = trace_texture(_texture);
-   struct trace_drm_api *tr_api = trace_drm_api(_api);
-   struct pipe_texture *texture = tr_texture->texture;
-   struct drm_api *api = tr_api->api;
-   struct pipe_buffer *buffer = NULL;
-   boolean result;
-
-   /* TODO trace call */
-
-   result = api->buffer_from_texture(api, texture, &buffer, stride);
-
-   if (result && _buffer)
-      buffer = trace_buffer_create(trace_screen(texture->screen), buffer);
-
-   if (_buffer)
-      *_buffer = buffer;
-   else
-      pipe_buffer_reference(&buffer, NULL);
-
-   return result;
 }
 
-static struct pipe_buffer *
-trace_drm_buffer_from_handle(struct drm_api *_api,
-                                struct pipe_screen *_screen,
-                                const char *name,
-                                unsigned handle)
+static struct pipe_texture *
+trace_drm_texture_from_shared_handle(struct drm_api *_api,
+                                     struct pipe_screen *_screen,
+                                     struct pipe_texture *templ,
+                                     const char *name,
+                                     unsigned stride,
+                                     unsigned handle)
 {
    struct trace_screen *tr_screen = trace_screen(_screen);
    struct trace_drm_api *tr_api = trace_drm_api(_api);
    struct pipe_screen *screen = tr_screen->screen;
    struct drm_api *api = tr_api->api;
-   struct pipe_buffer *result;
+   struct pipe_texture *result;
 
    /* TODO trace call */
 
-   result = api->buffer_from_handle(api, screen, name, handle);
+   result = api->texture_from_shared_handle(api, screen, templ, name, stride, handle);
 
-   result = trace_buffer_create(trace_screen(_screen), result);
+   result = trace_texture_create(trace_screen(_screen), result);
 
    return result;
 }
 
 static boolean
-trace_drm_handle_from_buffer(struct drm_api *_api,
-                                struct pipe_screen *_screen,
-                                struct pipe_buffer *_buffer,
-                                unsigned *handle)
+trace_drm_shared_handle_from_texture(struct drm_api *_api,
+                                     struct pipe_screen *_screen,
+                                     struct pipe_texture *_texture,
+                                     unsigned *stride,
+                                     unsigned *handle)
 {
    struct trace_screen *tr_screen = trace_screen(_screen);
-   struct trace_buffer *tr_buffer = trace_buffer(_buffer);
+   struct trace_texture *tr_texture = trace_texture(_texture);
    struct trace_drm_api *tr_api = trace_drm_api(_api);
    struct pipe_screen *screen = tr_screen->screen;
-   struct pipe_buffer *buffer = tr_buffer->buffer;
+   struct pipe_texture *texture = tr_texture->texture;
    struct drm_api *api = tr_api->api;
 
    /* TODO trace call */
 
-   return api->handle_from_buffer(api, screen, buffer, handle);
+   return api->shared_handle_from_texture(api, screen, texture, stride, handle);
 }
 
 static boolean
-trace_drm_global_handle_from_buffer(struct drm_api *_api,
-                                       struct pipe_screen *_screen,
-                                       struct pipe_buffer *_buffer,
-                                       unsigned *handle)
+trace_drm_local_handle_from_texture(struct drm_api *_api,
+                                    struct pipe_screen *_screen,
+                                    struct pipe_texture *_texture,
+                                    unsigned *stride,
+                                    unsigned *handle)
 {
    struct trace_screen *tr_screen = trace_screen(_screen);
-   struct trace_buffer *tr_buffer = trace_buffer(_buffer);
+   struct trace_texture *tr_texture = trace_texture(_texture);
    struct trace_drm_api *tr_api = trace_drm_api(_api);
    struct pipe_screen *screen = tr_screen->screen;
-   struct pipe_buffer *buffer = tr_buffer->buffer;
+   struct pipe_texture *texture = tr_texture->texture;
    struct drm_api *api = tr_api->api;
 
    /* TODO trace call */
 
-   return api->global_handle_from_buffer(api, screen, buffer, handle);
+   return api->local_handle_from_texture(api, screen, texture, stride, handle);
 }
 
 static void
@@ -197,10 +173,9 @@ trace_drm_create(struct drm_api *api)
 
    tr_api->base.create_screen = trace_drm_create_screen;
    tr_api->base.create_context = trace_drm_create_context;
-   tr_api->base.buffer_from_texture = trace_drm_buffer_from_texture;
-   tr_api->base.buffer_from_handle = trace_drm_buffer_from_handle;
-   tr_api->base.handle_from_buffer = trace_drm_handle_from_buffer;
-   tr_api->base.global_handle_from_buffer = trace_drm_global_handle_from_buffer;
+   tr_api->base.texture_from_shared_handle = trace_drm_texture_from_shared_handle;
+   tr_api->base.shared_handle_from_texture = trace_drm_shared_handle_from_texture;
+   tr_api->base.local_handle_from_texture = trace_drm_local_handle_from_texture;
    tr_api->base.destroy = trace_drm_destroy;
    tr_api->api = api;
 
diff --git a/src/gallium/drivers/trace/tr_dump.c b/src/gallium/drivers/trace/tr_dump.c
index 643587ab42..7e2ccbcfdc 100644
--- a/src/gallium/drivers/trace/tr_dump.c
+++ b/src/gallium/drivers/trace/tr_dump.c
@@ -351,7 +351,7 @@ void trace_dump_call_begin_locked(const char *klass, const char *method)
    trace_dump_indent(1);
    trace_dump_writes("<call no=\'");
    trace_dump_writef("%lu", call_no);
-   trace_dump_writes("\' class =\'");
+   trace_dump_writes("\' class=\'");
    trace_dump_escape(klass);
    trace_dump_writes("\' method=\'");
    trace_dump_escape(method);
diff --git a/src/gallium/drivers/trace/tr_screen.c b/src/gallium/drivers/trace/tr_screen.c
index 5b1e26a52d..26f1c04594 100644
--- a/src/gallium/drivers/trace/tr_screen.c
+++ b/src/gallium/drivers/trace/tr_screen.c
@@ -462,6 +462,7 @@ trace_screen_surface_buffer_create(struct pipe_screen *_screen,
                                    unsigned width, unsigned height,
                                    enum pipe_format format,
                                    unsigned usage,
+                                   unsigned tex_usage,
                                    unsigned *pstride)
 {
    struct trace_screen *tr_scr = trace_screen(_screen);
@@ -476,11 +477,13 @@ trace_screen_surface_buffer_create(struct pipe_screen *_screen,
    trace_dump_arg(uint, height);
    trace_dump_arg(format, format);
    trace_dump_arg(uint, usage);
+   trace_dump_arg(uint, tex_usage);
 
    result = screen->surface_buffer_create(screen,
                                           width, height,
                                           format,
                                           usage,
+                                          tex_usage,
                                           pstride);
 
    stride = *pstride;