66 files changed, 18740 insertions, 0 deletions
diff --git a/src/gallium/drivers/i965simple/Makefile b/src/gallium/drivers/i965simple/Makefile
new file mode 100644
index 0000000000..19182afa75
--- /dev/null
+++ b/src/gallium/drivers/i965simple/Makefile
@@ -0,0 +1,52 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+# NOTE(review): LIBNAME/C_SOURCES are presumably read by the template included at the bottom - confirm.
+LIBNAME = i965simple
+# C translation units of the i965simple driver.
+C_SOURCES = \
+	brw_blit.c \
+	brw_flush.c \
+	brw_screen.c \
+	brw_surface.c \
+	brw_cc.c \
+	brw_clip.c \
+	brw_clip_line.c \
+	brw_clip_point.c \
+	brw_clip_state.c \
+	brw_clip_tri.c \
+	brw_clip_util.c \
+	brw_context.c \
+	brw_curbe.c \
+	brw_draw.c \
+	brw_draw_upload.c \
+	brw_eu.c \
+	brw_eu_debug.c \
+	brw_eu_emit.c \
+	brw_eu_util.c \
+	brw_gs.c \
+	brw_gs_emit.c \
+	brw_gs_state.c \
+	brw_misc_state.c \
+	brw_sf.c \
+	brw_sf_emit.c \
+	brw_sf_state.c \
+	brw_state.c \
+	brw_state_batch.c \
+	brw_state_cache.c \
+	brw_state_pool.c \
+	brw_state_upload.c \
+	brw_tex_layout.c \
+	brw_urb.c \
+	brw_util.c \
+	brw_vs.c \
+	brw_vs_emit.c \
+	brw_vs_state.c \
+	brw_wm.c \
+	brw_wm_iz.c \
+	brw_wm_decl.c \
+	brw_wm_glsl.c \
+	brw_wm_sampler_state.c \
+	brw_wm_state.c \
+	brw_wm_surface_state.c
+# Shared build rules, two directories up from this one.
+include ../../Makefile.template
diff --git a/src/gallium/drivers/i965simple/SConscript b/src/gallium/drivers/i965simple/SConscript
new file mode 100644
index 0000000000..43fc2a4005
--- /dev/null
+++ b/src/gallium/drivers/i965simple/SConscript
@@ -0,0 +1,54 @@
+Import('*')
+# Work on a clone so settings made here stay local to this SConscript.
+env = env.Clone()
+# Bundle the driver's C sources into the 'i965simple' convenience library.
+i965simple = env.ConvenienceLibrary(
+	target = 'i965simple',
+	source = [
+		'brw_blit.c',
+		'brw_cc.c',
+		'brw_clip.c',
+		'brw_clip_line.c',
+		'brw_clip_point.c',
+		'brw_clip_state.c',
+		'brw_clip_tri.c',
+		'brw_clip_util.c',
+		'brw_context.c',
+		'brw_curbe.c',
+		'brw_draw.c',
+		'brw_draw_upload.c',
+		'brw_eu.c',
+		'brw_eu_debug.c',
+		'brw_eu_emit.c',
+		'brw_eu_util.c',
+		'brw_flush.c',
+		'brw_gs.c',
+		'brw_gs_emit.c',
+		'brw_gs_state.c',
+		'brw_misc_state.c',
+		'brw_screen.c',
+		'brw_sf.c',
+		'brw_sf_emit.c',
+		'brw_sf_state.c',
+		'brw_state.c',
+		'brw_state_batch.c',
+		'brw_state_cache.c',
+		'brw_state_pool.c',
+		'brw_state_upload.c',
+		'brw_surface.c',
+		'brw_tex_layout.c',
+		'brw_urb.c',
+		'brw_util.c',
+		'brw_vs.c',
+		'brw_vs_emit.c',
+		'brw_vs_state.c',
+		'brw_wm.c',
+		'brw_wm_decl.c',
+		'brw_wm_glsl.c',
+		'brw_wm_iz.c',
+		'brw_wm_sampler_state.c',
+		'brw_wm_state.c',
+		'brw_wm_surface_state.c',
+	])
+# Publish the library node under the name 'i965simple' for other SConscripts.
+Export('i965simple')
diff --git a/src/gallium/drivers/i965simple/brw_batch.h b/src/gallium/drivers/i965simple/brw_batch.h
new file mode 100644
index 0000000000..5f5932a488
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_batch.h
@@ -0,0 +1,59 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef BRW_BATCH_H
+#define BRW_BATCH_H
+
+#include "brw_winsys.h"
+
+/*
+ * Batch emission helpers.
+ *
+ * Except for BRW_BATCH_STRUCT, which takes the context explicitly, every
+ * macro below expects a variable named `brw` to be in scope where it is
+ * used and forwards its arguments to the matching batch_* callback of
+ * brw->winsys.
+ */
+
+/* Expands to nothing; callers place it among their local declarations. */
+#define BATCH_LOCALS
+
+#define INTEL_BATCH_NO_CLIPRECTS 0x1
+#define INTEL_BATCH_CLIPRECTS    0x2
+
+/* Forwarded as the `dwords` and `relocs` arguments of batch_start(). */
+#define BEGIN_BATCH( dwords, relocs ) \
+   brw->winsys->batch_start(brw->winsys, (dwords), (relocs))
+
+/* Append one dword to the batch. */
+#define OUT_BATCH( dword ) \
+   brw->winsys->batch_dword(brw->winsys, (dword))
+
+/* Append a relocation for `buf` with the given access flags and delta. */
+#define OUT_RELOC( buf, flags, delta ) \
+   brw->winsys->batch_reloc(brw->winsys, (buf), (flags), (delta))
+
+/* Close the packet opened with BEGIN_BATCH. */
+#define ADVANCE_BATCH() \
+   brw->winsys->batch_end( brw->winsys )
+
+/* XXX: this is bogus - need proper handling for out-of-memory in batchbuffer.
+ *
+ * Flushes the batch and then marks every hardware state bit dirty.
+ */
+#define FLUSH_BATCH(fence) do {				\
+   brw->winsys->batch_flush(brw->winsys, (fence));	\
+   brw->hardware_dirty = ~0;				\
+} while (0)
+
+/* Hand the structure pointed to by `s` (sizeof(*s) bytes) to
+ * brw_batchbuffer_data().  The context argument is parenthesized so that
+ * any pointer expression can be passed, not just a plain identifier.
+ */
+#define BRW_BATCH_STRUCT(brw, s) brw_batchbuffer_data( (brw)->winsys, (s), sizeof(*(s)))
+
+#endif
diff --git a/src/gallium/drivers/i965simple/brw_blit.c b/src/gallium/drivers/i965simple/brw_blit.c
new file mode 100644
index 0000000000..4d11f8d2ab
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_blit.c
@@ -0,0 +1,218 @@
+/**************************************************************************
+ *
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include <stdio.h>
+#include <errno.h>
+
+#include "brw_batch.h"
+#include "brw_blit.h"
+#include "brw_context.h"
+#include "brw_reg.h"
+
+#include "pipe/p_context.h"
+#include "pipe/internal/p_winsys_screen.h"
+
+#define FILE_DEBUG_FLAG DEBUG_BLIT
+
+/**
+ * Emit an XY_COLOR_BLT that fills a rectangle of dst_buffer with a
+ * constant color.
+ *
+ * \param brw         context; the command goes to the batch of brw->winsys
+ * \param cpp         bytes per pixel.  1-3 share one BR13/CMD encoding, 4
+ *                    additionally sets BR13 bit 25 and the alpha/RGB write
+ *                    bits; any other value makes the call a silent no-op
+ * \param dst_pitch   destination pitch in pixels (multiplied by cpp below)
+ * \param dst_buffer  destination buffer, emitted as a write relocation
+ * \param dst_offset  delta passed along with that relocation
+ * \param dst_tiled   when set, XY_DST_TILED is added and the byte pitch is
+ *                    divided by 4
+ * \param x,y         first corner of the rectangle
+ * \param w,h         rectangle size; the second corner is (x+w, y+h)
+ * \param color       fill value, emitted as the last dword
+ */
+void brw_fill_blit(struct brw_context *brw,
+                   unsigned cpp,
+                   short dst_pitch,
+                   struct pipe_buffer *dst_buffer,
+                   unsigned dst_offset,
+                   boolean dst_tiled,
+                   short x, short y,
+                   short w, short h,
+                   unsigned color)
+{
+   unsigned BR13, CMD;
+   BATCH_LOCALS;
+
+   dst_pitch *= cpp;
+
+   switch(cpp) {
+   case 1:
+   case 2:
+   case 3:
+      BR13 = (0xF0 << 16) | (1<<24);
+      CMD = XY_COLOR_BLT_CMD;
+      break;
+   case 4:
+      BR13 = (0xF0 << 16) | (1<<24) | (1<<25);
+      CMD = XY_COLOR_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
+      break;
+   default:
+      return;
+   }
+
+   if (dst_tiled) {
+      CMD |= XY_DST_TILED;
+      dst_pitch /= 4;
+   }
+
+   /* Six dwords, one of them a relocation.  The second argument is
+    * BEGIN_BATCH's `relocs` parameter, not a cliprect flag.
+    */
+   BEGIN_BATCH(6, 1);
+   OUT_BATCH( CMD );
+   /* dst_pitch is a signed short: keep only its low 16 bits so that a
+    * negative (or wrapped) value cannot sign-extend over the BR13 bits.
+    */
+   OUT_BATCH( (dst_pitch & 0xffff) | BR13 );
+   OUT_BATCH( (y << 16) | x );
+   OUT_BATCH( ((y+h) << 16) | (x+w) );
+   OUT_RELOC( dst_buffer, BRW_BUFFER_ACCESS_WRITE, dst_offset );
+   OUT_BATCH( color );
+   ADVANCE_BATCH();
+}
+
+/* Convert a PIPE_LOGICOP_* value into the 8-bit raster-op code that the
+ * blit commands carry in BR13 bits 16-23.  Values that are not a known
+ * logic op yield 0.
+ */
+static unsigned translate_raster_op(unsigned logicop)
+{
+   unsigned rop = 0;
+
+   switch (logicop) {
+   case PIPE_LOGICOP_CLEAR:         rop = 0x00; break;
+   case PIPE_LOGICOP_NOR:           rop = 0x11; break;
+   case PIPE_LOGICOP_AND_INVERTED:  rop = 0x22; break;
+   case PIPE_LOGICOP_COPY_INVERTED: rop = 0x33; break;
+   case PIPE_LOGICOP_AND_REVERSE:   rop = 0x44; break;
+   case PIPE_LOGICOP_INVERT:        rop = 0x55; break;
+   case PIPE_LOGICOP_XOR:           rop = 0x66; break;
+   case PIPE_LOGICOP_NAND:          rop = 0x77; break;
+   case PIPE_LOGICOP_AND:           rop = 0x88; break;
+   case PIPE_LOGICOP_EQUIV:         rop = 0x99; break;
+   case PIPE_LOGICOP_NOOP:          rop = 0xAA; break;
+   case PIPE_LOGICOP_OR_INVERTED:   rop = 0xBB; break;
+   case PIPE_LOGICOP_COPY:          rop = 0xCC; break;
+   case PIPE_LOGICOP_OR_REVERSE:    rop = 0xDD; break;
+   case PIPE_LOGICOP_OR:            rop = 0xEE; break;
+   case PIPE_LOGICOP_SET:           rop = 0xFF; break;
+   default:
+      break;
+   }
+
+   return rop;
+}
+
+
+/**
+ * Copy BitBlt: emit an XY_SRC_COPY_BLT that copies a w x h rectangle from
+ * src_buffer to dst_buffer, combining the two with logic_op.
+ *
+ * \param brw         context; the command goes to the batch of brw->winsys
+ * \param do_flip     not used by this function
+ * \param cpp         bytes per pixel (1-4); any other value makes the
+ *                    call a silent no-op
+ * \param src_pitch   source pitch in pixels (multiplied by cpp below)
+ * \param src_buffer  source buffer, emitted as a read relocation
+ * \param src_offset  delta passed along with the source relocation
+ * \param src_tiled   adds XY_SRC_TILED and divides the source pitch by 4
+ * \param dst_pitch   destination pitch in pixels (multiplied by cpp below)
+ * \param dst_buffer  destination buffer, emitted as a write relocation
+ * \param dst_offset  delta passed along with the destination relocation
+ * \param dst_tiled   adds XY_DST_TILED and divides the dest pitch by 4
+ * \param src_x,src_y first corner of the source rectangle
+ * \param dst_x,dst_y first corner of the destination rectangle
+ * \param w,h         rectangle size; nothing is emitted when dst_x+w or
+ *                    dst_y+h ends up below dst_x / dst_y
+ * \param logic_op    PIPE_LOGICOP_* value, translated to a raster op
+ */
+void brw_copy_blit(struct brw_context *brw,
+                   unsigned do_flip,
+                   unsigned cpp,
+                   short src_pitch,
+                   struct pipe_buffer *src_buffer,
+                   unsigned  src_offset,
+                   boolean src_tiled,
+                   short dst_pitch,
+                   struct pipe_buffer *dst_buffer,
+                   unsigned  dst_offset,
+                   boolean dst_tiled,
+                   short src_x, short src_y,
+                   short dst_x, short dst_y,
+                   short w, short h,
+                   unsigned logic_op)
+{
+   unsigned CMD, BR13;
+   int dst_y2 = dst_y + h;
+   int dst_x2 = dst_x + w;
+   BATCH_LOCALS;
+
+
+   /* Buffers are pointers, so print them with %p rather than %d. */
+   DBG("%s src:buf(%p)/%d %d,%d dst:buf(%p)/%d %d,%d sz:%dx%d op:%u\n",
+       __FUNCTION__,
+       (void *)src_buffer, src_pitch, src_x, src_y,
+       (void *)dst_buffer, dst_pitch, dst_x, dst_y,
+       w,h,logic_op);
+
+   /* logic_op is unsigned, so this single comparison also rejects values
+    * below PIPE_LOGICOP_CLEAR (the subtraction wraps around).
+    */
+   assert( logic_op - PIPE_LOGICOP_CLEAR < 0x10 );
+
+   src_pitch *= cpp;
+   dst_pitch *= cpp;
+
+   switch(cpp) {
+   case 1:
+   case 2:
+   case 3:
+      BR13 = (translate_raster_op(logic_op) << 16) | (1<<24);
+      CMD = XY_SRC_COPY_BLT_CMD;
+      break;
+   case 4:
+      BR13 = (translate_raster_op(logic_op) << 16) | (1<<24) |
+	  (1<<25);
+      CMD = XY_SRC_COPY_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
+      break;
+   default:
+      return;
+   }
+
+   if (src_tiled) {
+      CMD |= XY_SRC_TILED;
+      src_pitch /= 4;
+   }
+
+   if (dst_tiled) {
+      CMD |= XY_DST_TILED;
+      dst_pitch /= 4;
+   }
+
+   if (dst_y2 < dst_y ||
+       dst_x2 < dst_x) {
+      return;
+   }
+
+   dst_pitch &= 0xffff;
+   src_pitch &= 0xffff;
+
+   /* Initial y values don't seem to work with negative pitches.  If
+    * we adjust the offsets manually (below), it seems to work fine.
+    *
+    * On the other hand, if we always adjust, the hardware doesn't
+    * know which blit directions to use, so overlapping copypixels get
+    * the wrong result.
+    *
+    * Both paths emit the same 8-dword packet containing two relocations
+    * (destination and source), hence BEGIN_BATCH(8, 2).
+    */
+   if (dst_pitch > 0 && src_pitch > 0) {
+      BEGIN_BATCH(8, 2);
+      OUT_BATCH( CMD );
+      OUT_BATCH( dst_pitch | BR13 );
+      OUT_BATCH( (dst_y << 16) | dst_x );
+      OUT_BATCH( (dst_y2 << 16) | dst_x2 );
+      OUT_RELOC( dst_buffer, BRW_BUFFER_ACCESS_WRITE,
+		 dst_offset );
+      OUT_BATCH( (src_y << 16) | src_x );
+      OUT_BATCH( src_pitch );
+      OUT_RELOC( src_buffer, BRW_BUFFER_ACCESS_READ,
+		 src_offset );
+      ADVANCE_BATCH();
+   }
+   else {
+      BEGIN_BATCH(8, 2);
+      OUT_BATCH( CMD );
+      OUT_BATCH( (dst_pitch & 0xffff) | BR13 );
+      OUT_BATCH( (0 << 16) | dst_x );
+      OUT_BATCH( (h << 16) | dst_x2 );
+      OUT_RELOC( dst_buffer, BRW_BUFFER_ACCESS_WRITE,
+		 dst_offset + dst_y * dst_pitch );
+      /* Source coordinate dword: y is 0 here because src_y is folded
+       * into the relocation delta below, as done for the destination.
+       * Leaving this dword out would shift the pitch and the source
+       * address into the wrong slots of the packet.
+       */
+      OUT_BATCH( (0 << 16) | src_x );
+      OUT_BATCH( (src_pitch & 0xffff) );
+      OUT_RELOC( src_buffer, BRW_BUFFER_ACCESS_READ,
+		 src_offset + src_y * src_pitch );
+      ADVANCE_BATCH();
+   }
+}
+
+
+
diff --git a/src/gallium/drivers/i965simple/brw_blit.h b/src/gallium/drivers/i965simple/brw_blit.h
new file mode 100644
index 0000000000..111c5d91d3
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_blit.h
@@ -0,0 +1,33 @@
+#ifndef BRW_BLIT_H
+#define BRW_BLIT_H
+
+#include "pipe/p_compiler.h"
+
+struct pipe_buffer;
+struct brw_context;
+
+/* Fill the rectangle (x, y)-(x+w, y+h) of dst_buffer with `color`.
+ * dst_pitch is given in pixels; cpp is the number of bytes per pixel
+ * (1-4, other values are ignored).
+ */
+void brw_fill_blit(struct brw_context *brw,
+                   unsigned cpp,
+                   short dst_pitch,
+                   struct pipe_buffer *dst_buffer,
+                   unsigned dst_offset,
+                   boolean dst_tiled,
+                   short x, short y,
+                   short w, short h,
+                   unsigned color);
+
+/* Copy a w x h rectangle from src_buffer at (src_x, src_y) to dst_buffer
+ * at (dst_x, dst_y), combining source and destination with the
+ * PIPE_LOGICOP_* value in logic_op.  Pitches are given in pixels; cpp is
+ * the number of bytes per pixel (1-4, other values are ignored).
+ * do_flip is currently not used by the implementation.
+ */
+void brw_copy_blit(struct brw_context *brw,
+                   unsigned do_flip,
+                   unsigned cpp,
+                   short src_pitch,
+                   struct pipe_buffer *src_buffer,
+                   unsigned  src_offset,
+                   boolean src_tiled,
+                   short dst_pitch,
+                   struct pipe_buffer *dst_buffer,
+                   unsigned  dst_offset,
+                   boolean dst_tiled,
+                   short src_x, short src_y,
+                   short dst_x, short dst_y,
+                   short w, short h,
+                   unsigned logic_op);
+#endif
diff --git a/src/gallium/drivers/i965simple/brw_cc.c b/src/gallium/drivers/i965simple/brw_cc.c
new file mode 100644
index 0000000000..3668123e2e
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_cc.c
@@ -0,0 +1,269 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+#include "brw_util.h"
+
+
+/* Map a PIPE_FUNC_* comparison onto the matching BRW_COMPAREFUNCTION_*
+ * code.  An unrecognised value is reported through debug_printf() and
+ * treated as BRW_COMPAREFUNCTION_ALWAYS.
+ */
+static int brw_translate_compare_func(int func)
+{
+   int hw_func;
+
+   switch (func) {
+   case PIPE_FUNC_NEVER:    hw_func = BRW_COMPAREFUNCTION_NEVER;    break;
+   case PIPE_FUNC_LESS:     hw_func = BRW_COMPAREFUNCTION_LESS;     break;
+   case PIPE_FUNC_LEQUAL:   hw_func = BRW_COMPAREFUNCTION_LEQUAL;   break;
+   case PIPE_FUNC_GREATER:  hw_func = BRW_COMPAREFUNCTION_GREATER;  break;
+   case PIPE_FUNC_GEQUAL:   hw_func = BRW_COMPAREFUNCTION_GEQUAL;   break;
+   case PIPE_FUNC_NOTEQUAL: hw_func = BRW_COMPAREFUNCTION_NOTEQUAL; break;
+   case PIPE_FUNC_EQUAL:    hw_func = BRW_COMPAREFUNCTION_EQUAL;    break;
+   case PIPE_FUNC_ALWAYS:   hw_func = BRW_COMPAREFUNCTION_ALWAYS;   break;
+   default:
+      debug_printf("Unknown value in %s: %x\n", __FUNCTION__, func);
+      hw_func = BRW_COMPAREFUNCTION_ALWAYS;
+      break;
+   }
+
+   return hw_func;
+}
+
+/* Map a PIPE_STENCIL_OP_* value onto a BRW_STENCILOP_* code.
+ *
+ * Note the naming difference: the plain pipe INCR/DECR ops select the
+ * hardware *SAT variants, while the pipe *_WRAP ops select the hardware
+ * INCR/DECR codes.  Unknown values fall back to BRW_STENCILOP_ZERO.
+ */
+static int brw_translate_stencil_op(int op)
+{
+   int hw_op;
+
+   switch (op) {
+   case PIPE_STENCIL_OP_KEEP:      hw_op = BRW_STENCILOP_KEEP;    break;
+   case PIPE_STENCIL_OP_REPLACE:   hw_op = BRW_STENCILOP_REPLACE; break;
+   case PIPE_STENCIL_OP_INCR:      hw_op = BRW_STENCILOP_INCRSAT; break;
+   case PIPE_STENCIL_OP_DECR:      hw_op = BRW_STENCILOP_DECRSAT; break;
+   case PIPE_STENCIL_OP_INCR_WRAP: hw_op = BRW_STENCILOP_INCR;    break;
+   case PIPE_STENCIL_OP_DECR_WRAP: hw_op = BRW_STENCILOP_DECR;    break;
+   case PIPE_STENCIL_OP_INVERT:    hw_op = BRW_STENCILOP_INVERT;  break;
+   case PIPE_STENCIL_OP_ZERO:
+   default:
+      hw_op = BRW_STENCILOP_ZERO;
+      break;
+   }
+
+   return hw_op;
+}
+
+
+/* Map a PIPE_LOGICOP_* value onto the BRW_LOGICOPFUNCTION_* code of the
+ * same name.  Unknown values fall back to BRW_LOGICOPFUNCTION_SET.
+ */
+static int brw_translate_logic_op(int opcode)
+{
+   int hw_op;
+
+   switch (opcode) {
+   case PIPE_LOGICOP_CLEAR:         hw_op = BRW_LOGICOPFUNCTION_CLEAR;         break;
+   case PIPE_LOGICOP_NOR:           hw_op = BRW_LOGICOPFUNCTION_NOR;           break;
+   case PIPE_LOGICOP_AND_INVERTED:  hw_op = BRW_LOGICOPFUNCTION_AND_INVERTED;  break;
+   case PIPE_LOGICOP_COPY_INVERTED: hw_op = BRW_LOGICOPFUNCTION_COPY_INVERTED; break;
+   case PIPE_LOGICOP_AND_REVERSE:   hw_op = BRW_LOGICOPFUNCTION_AND_REVERSE;   break;
+   case PIPE_LOGICOP_INVERT:        hw_op = BRW_LOGICOPFUNCTION_INVERT;        break;
+   case PIPE_LOGICOP_XOR:           hw_op = BRW_LOGICOPFUNCTION_XOR;           break;
+   case PIPE_LOGICOP_NAND:          hw_op = BRW_LOGICOPFUNCTION_NAND;          break;
+   case PIPE_LOGICOP_AND:           hw_op = BRW_LOGICOPFUNCTION_AND;           break;
+   case PIPE_LOGICOP_EQUIV:         hw_op = BRW_LOGICOPFUNCTION_EQUIV;         break;
+   case PIPE_LOGICOP_NOOP:          hw_op = BRW_LOGICOPFUNCTION_NOOP;          break;
+   case PIPE_LOGICOP_OR_INVERTED:   hw_op = BRW_LOGICOPFUNCTION_OR_INVERTED;   break;
+   case PIPE_LOGICOP_COPY:          hw_op = BRW_LOGICOPFUNCTION_COPY;          break;
+   case PIPE_LOGICOP_OR_REVERSE:    hw_op = BRW_LOGICOPFUNCTION_OR_REVERSE;    break;
+   case PIPE_LOGICOP_OR:            hw_op = BRW_LOGICOPFUNCTION_OR;            break;
+   case PIPE_LOGICOP_SET:
+   default:
+      hw_op = BRW_LOGICOPFUNCTION_SET;
+      break;
+   }
+
+   return hw_op;
+}
+
+
+/* Upload the color-calculator viewport: a zeroed brw_cc_viewport whose
+ * depth range is fixed at [0, 1].  The offset handed back by the state
+ * cache is kept in brw->cc.vp_gs_offset.
+ */
+static void upload_cc_vp( struct brw_context *brw )
+{
+   struct brw_cc_viewport viewport;
+
+   /* Clear the whole structure first so nothing is left uninitialised. */
+   memset(&viewport, 0, sizeof(viewport));
+
+   viewport.max_depth = 1.0;
+   viewport.min_depth = 0.0;
+
+   brw->cc.vp_gs_offset = brw_cache_data( &brw->cache[BRW_CC_VP], &viewport );
+}
+
+const struct brw_tracked_state brw_cc_vp = {   /* state atom for the CC viewport */
+   .dirty = {
+      .brw = BRW_NEW_SCENE,   /* the only brw flag this atom lists */
+      .cache = 0              /* no cache flags listed */
+   },
+   .update = upload_cc_vp     /* callback that uploads the viewport */
+};
+
+/* Build the color-calculator (CC) unit state from the current
+ * depth/stencil, blend and alpha-test attributes, and keep the offset
+ * returned by the state cache in brw->cc.state_gs_offset.
+ */
+static void upload_cc_unit( struct brw_context *brw )
+{
+   struct brw_cc_unit_state cc;
+
+   memset(&cc, 0, sizeof(cc));
+
+   /* BRW_NEW_DEPTH_STENCIL
+    *
+    * stencil[0] fills the plain stencil_* fields.  stencil[1] fills the
+    * bf_* fields (presumably "back face" - confirm) and is only looked at
+    * when stencil[0] is enabled.
+    */
+   if (brw->attribs.DepthStencil->stencil[0].enabled) {
+      cc.cc0.stencil_enable = brw->attribs.DepthStencil->stencil[0].enabled;
+      cc.cc0.stencil_func = brw_translate_compare_func(brw->attribs.DepthStencil->stencil[0].func);
+      cc.cc0.stencil_fail_op = brw_translate_stencil_op(brw->attribs.DepthStencil->stencil[0].fail_op);
+      cc.cc0.stencil_pass_depth_fail_op = brw_translate_stencil_op(
+         brw->attribs.DepthStencil->stencil[0].zfail_op);
+      cc.cc0.stencil_pass_depth_pass_op = brw_translate_stencil_op(
+         brw->attribs.DepthStencil->stencil[0].zpass_op);
+      cc.cc1.stencil_ref = brw->attribs.DepthStencil->stencil[0].ref_value;
+      cc.cc1.stencil_write_mask = brw->attribs.DepthStencil->stencil[0].writemask;
+      cc.cc1.stencil_test_mask = brw->attribs.DepthStencil->stencil[0].valuemask;
+
+      if (brw->attribs.DepthStencil->stencil[1].enabled) {
+	 cc.cc0.bf_stencil_enable = brw->attribs.DepthStencil->stencil[1].enabled;
+	 cc.cc0.bf_stencil_func = brw_translate_compare_func(
+            brw->attribs.DepthStencil->stencil[1].func);
+	 cc.cc0.bf_stencil_fail_op = brw_translate_stencil_op(
+            brw->attribs.DepthStencil->stencil[1].fail_op);
+	 cc.cc0.bf_stencil_pass_depth_fail_op = brw_translate_stencil_op(
+            brw->attribs.DepthStencil->stencil[1].zfail_op);
+	 cc.cc0.bf_stencil_pass_depth_pass_op = brw_translate_stencil_op(
+            brw->attribs.DepthStencil->stencil[1].zpass_op);
+	 cc.cc1.bf_stencil_ref = brw->attribs.DepthStencil->stencil[1].ref_value;
+	 cc.cc2.bf_stencil_write_mask = brw->attribs.DepthStencil->stencil[1].writemask;
+	 cc.cc2.bf_stencil_test_mask = brw->attribs.DepthStencil->stencil[1].valuemask;
+      }
+
+      /* Not really sure about this:
+       *
+       * Note that stencil[1].writemask is consulted here even when
+       * stencil[1] itself is not enabled.
+       */
+      if (brw->attribs.DepthStencil->stencil[0].writemask ||
+	  brw->attribs.DepthStencil->stencil[1].writemask)
+	 cc.cc0.stencil_write_enable = 1;
+   }
+
+   /* BRW_NEW_BLEND
+    *
+    * Logic op wins over blending: the blend fields are only programmed
+    * when logicop is disabled.  For MIN/MAX equations both factors are
+    * forced to ONE.  ia_blend_enable is set only when the alpha
+    * factors or equation differ from the RGB ones.
+    */
+   if (brw->attribs.Blend->logicop_enable) {
+      cc.cc2.logicop_enable = 1;
+      cc.cc5.logicop_func = brw_translate_logic_op( brw->attribs.Blend->logicop_func );
+   }
+   else if (brw->attribs.Blend->blend_enable) {
+      int eqRGB = brw->attribs.Blend->rgb_func;
+      int eqA = brw->attribs.Blend->alpha_func;
+      int srcRGB = brw->attribs.Blend->rgb_src_factor;
+      int dstRGB = brw->attribs.Blend->rgb_dst_factor;
+      int srcA = brw->attribs.Blend->alpha_src_factor;
+      int dstA = brw->attribs.Blend->alpha_dst_factor;
+
+      if (eqRGB == PIPE_BLEND_MIN || eqRGB == PIPE_BLEND_MAX) {
+	 srcRGB = dstRGB = PIPE_BLENDFACTOR_ONE;
+      }
+
+      if (eqA == PIPE_BLEND_MIN || eqA == PIPE_BLEND_MAX) {
+	 srcA = dstA = PIPE_BLENDFACTOR_ONE;
+      }
+
+      cc.cc6.dest_blend_factor = brw_translate_blend_factor(dstRGB);
+      cc.cc6.src_blend_factor = brw_translate_blend_factor(srcRGB);
+      cc.cc6.blend_function = brw_translate_blend_equation( eqRGB );
+
+      cc.cc5.ia_dest_blend_factor = brw_translate_blend_factor(dstA);
+      cc.cc5.ia_src_blend_factor = brw_translate_blend_factor(srcA);
+      cc.cc5.ia_blend_function = brw_translate_blend_equation( eqA );
+
+      cc.cc3.blend_enable = 1;
+      cc.cc3.ia_blend_enable = (srcA != srcRGB ||
+				dstA != dstRGB ||
+				eqA != eqRGB);
+   }
+   
+   /* BRW_NEW_ALPHA_TEST
+    *
+    * The alpha-test state is read from the DepthStencil attribute.  The
+    * float reference is converted to an unsigned byte to go with the
+    * UNORM8 test format selected below.
+    */
+   if (brw->attribs.DepthStencil->alpha.enabled) {
+      cc.cc3.alpha_test = 1;
+      cc.cc3.alpha_test_func = 
+	 brw_translate_compare_func(brw->attribs.DepthStencil->alpha.func);
+
+      cc.cc7.alpha_ref.ub[0] = float_to_ubyte(brw->attribs.DepthStencil->alpha.ref_value);
+
+      cc.cc3.alpha_test_format = BRW_ALPHATEST_FORMAT_UNORM8;
+   }
+
+   if (brw->attribs.Blend->dither) {
+      cc.cc5.dither_enable = 1;
+      cc.cc6.y_dither_offset = 0;
+      cc.cc6.x_dither_offset = 0;
+   }
+
+   if (brw->attribs.DepthStencil->depth.enabled) {
+      cc.cc2.depth_test = brw->attribs.DepthStencil->depth.enabled;
+      cc.cc2.depth_test_function = brw_translate_compare_func(brw->attribs.DepthStencil->depth.func);
+      cc.cc2.depth_write_enable = brw->attribs.DepthStencil->depth.writemask;
+   }
+
+   /* CACHE_NEW_CC_VP
+    *
+    * The viewport offset stored by upload_cc_vp() is programmed shifted
+    * right by 5, i.e. in units of 32 bytes.
+    */
+   cc.cc4.cc_viewport_state_offset =  brw->cc.vp_gs_offset >> 5;
+
+   if (BRW_DEBUG & DEBUG_STATS)
+      cc.cc5.statistics_enable = 1;
+
+   brw->cc.state_gs_offset = brw_cache_data( &brw->cache[BRW_CC_UNIT], &cc );
+}
+
+const struct brw_tracked_state brw_cc_unit = {   /* state atom for the CC unit */
+   .dirty = {
+      .brw = BRW_NEW_DEPTH_STENCIL | BRW_NEW_BLEND | BRW_NEW_ALPHA_TEST,   /* attributes read by upload_cc_unit */
+      .cache = CACHE_NEW_CC_VP   /* upload_cc_unit reads brw->cc.vp_gs_offset */
+   },
+   .update = upload_cc_unit
+};
+
diff --git a/src/gallium/drivers/i965simple/brw_clip.c b/src/gallium/drivers/i965simple/brw_clip.c
new file mode 100644
index 0000000000..268124cc53
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_clip.c
@@ -0,0 +1,206 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "brw_defines.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_util.h"
+#include "brw_state.h"
+#include "brw_clip.h"
+
+#define FRONT_UNFILLED_BIT  0x1
+#define BACK_UNFILLED_BIT   0x2
+
+
+/* Compile the clip program described by `key` and upload it to the
+ * BRW_CLIP_PROG cache.
+ *
+ * The vertex layout is derived from key->attrs (one ATTR_SIZE slot per
+ * set bit, starting REG_SIZE bytes into the vertex).  The emitter is
+ * chosen by key->primitive; any primitive other than triangles, lines
+ * or points trips an assert and nothing is uploaded.  On success the
+ * offset returned by brw_upload_cache() is stored in
+ * brw->clip.prog_gs_offset.
+ */
+static void compile_clip_prog( struct brw_context *brw,
+			     struct brw_clip_prog_key *key )
+{
+   struct brw_clip_compile c;
+   const unsigned *program;
+   unsigned program_size;
+   unsigned delta;
+   unsigned i;
+
+   memset(&c, 0, sizeof(c));
+
+   /* Begin the compilation:
+    */
+   brw_init_compile(&c.func);
+
+   c.func.single_program_flow = 1;
+
+   c.key = *key;
+
+
+   /* Need to locate the two positions present in vertex + header.
+    * These are currently hardcoded:
+    */
+   c.header_position_offset = ATTR_SIZE;
+
+   /* Assign consecutive byte offsets to the attributes present in the
+    * key.  The mask is built from an unsigned 1 so that testing bit 31
+    * is well defined (1<<31 on a signed int is not).
+    */
+   for (i = 0, delta = REG_SIZE; i < PIPE_MAX_SHADER_OUTPUTS; i++) {
+      if (c.key.attrs & (1u << i)) {
+	 c.offset[i] = delta;
+	 delta += ATTR_SIZE;
+      }
+   }
+
+   c.nr_attrs = brw_count_bits(c.key.attrs);
+   c.nr_regs = (c.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */
+   c.nr_bytes = c.nr_regs * REG_SIZE;
+
+   c.prog_data.clip_mode = c.key.clip_mode; /* XXX */
+
+   /* For some reason the thread is spawned with only 4 channels
+    * unmasked.
+    */
+   brw_set_mask_control(&c.func, BRW_MASK_DISABLE);
+
+
+   /* Would ideally have the option of producing a program which could
+    * do all three:
+    */
+   switch (key->primitive) {
+   case PIPE_PRIM_TRIANGLES:
+#if 0
+      if (key->do_unfilled)
+	 brw_emit_unfilled_clip( &c );
+      else
+#endif
+	 brw_emit_tri_clip( &c );
+      break;
+   case PIPE_PRIM_LINES:
+      brw_emit_line_clip( &c );
+      break;
+   case PIPE_PRIM_POINTS:
+      brw_emit_point_clip( &c );
+      break;
+   default:
+      assert(0);
+      return;
+   }
+
+
+
+   /* get the program
+    */
+   program = brw_get_program(&c.func, &program_size);
+
+   /* Upload
+    */
+   brw->clip.prog_gs_offset = brw_upload_cache( &brw->cache[BRW_CLIP_PROG],
+						&c.key,
+						sizeof(c.key),
+						program,
+						program_size,
+						&c.prog_data,
+						&brw->clip.prog_data );
+}
+
+
+/* Look `key` up in the clip-program cache.  brw_search_cache() is handed
+ * brw->clip.prog_data and brw->clip.prog_gs_offset as its output slots;
+ * its result is returned unchanged.
+ */
+static boolean search_cache( struct brw_context *brw,
+			       struct brw_clip_prog_key *key )
+{
+   boolean found;
+
+   found = brw_search_cache(&brw->cache[BRW_CLIP_PROG],
+                            key, sizeof(*key),
+                            &brw->clip.prog_data,
+                            &brw->clip.prog_gs_offset);
+   return found;
+}
+
+
+
+
+/* Select the clip program for the current state: build a key from the
+ * reduced primitive, the VS outputs, the flat-shade flag and the number
+ * of user clip planes, then compile a program only when the cache holds
+ * none for that key.
+ */
+static void upload_clip_prog(struct brw_context *brw)
+{
+   struct brw_clip_prog_key key;
+
+   /* Start from an all-zero key so fields not set below (and the pad
+    * bits) have a known value.
+    */
+   memset(&key, 0, sizeof(key));
+
+   /* BRW_NEW_REDUCED_PRIMITIVE */
+   key.primitive = brw->reduced_primitive;
+
+   /* CACHE_NEW_VS_PROG */
+   key.attrs = brw->vs.prog_data->outputs_written;
+
+   /* BRW_NEW_RASTERIZER */
+   key.do_flat_shading = brw->attribs.Raster->flatshade;
+
+   /* BRW_NEW_CLIP */
+   key.nr_userclip = brw->attribs.Clip.nr; /* XXX */
+
+#if 0
+   key.clip_mode = BRW_CLIPMODE_NORMAL;
+
+   if (key.primitive == PIPE_PRIM_TRIANGLES) {
+      if (brw->attribs.Raster->cull_mode == PIPE_WINDING_BOTH)
+	 key.clip_mode = BRW_CLIPMODE_REJECT_ALL;
+      else {
+         if (brw->attribs.Raster->fill_cw != PIPE_POLYGON_MODE_FILL ||
+             brw->attribs.Raster->fill_ccw != PIPE_POLYGON_MODE_FILL)
+            key.do_unfilled = 1;
+
+	 /* Most cases the fixed function units will handle.  Cases where
+	  * one or more polygon faces are unfilled will require help:
+	  */
+	 if (key.do_unfilled) {
+	    key.clip_mode = BRW_CLIPMODE_CLIP_NON_REJECTED;
+
+	    if (brw->attribs.Raster->offset_cw ||
+                brw->attribs.Raster->offset_ccw) {
+	       key.offset_units = brw->attribs.Raster->offset_units;
+	       key.offset_factor = brw->attribs.Raster->offset_scale;
+	    }
+            key.fill_ccw = brw->attribs.Raster->fill_ccw;
+            key.fill_cw = brw->attribs.Raster->fill_cw;
+            key.offset_ccw = brw->attribs.Raster->offset_ccw;
+            key.offset_cw = brw->attribs.Raster->offset_cw;
+            if (brw->attribs.Raster->light_twoside &&
+                key.fill_cw != CLIP_CULL)
+               key.copy_bfc_cw = 1;
+	 }
+      }
+   }
+#else
+   /* The mode selection above is compiled out; every key currently uses
+    * BRW_CLIPMODE_ACCEPT_ALL.
+    */
+   key.clip_mode = BRW_CLIPMODE_ACCEPT_ALL;
+#endif
+
+   /* Nothing to compile when the cache already has this key. */
+   if (search_cache(brw, &key))
+      return;
+
+   compile_clip_prog( brw, &key );
+}
+
+const struct brw_tracked_state brw_clip_prog = {   /* state atom for the clip program */
+   .dirty = {
+      .brw   = (BRW_NEW_RASTERIZER |   /* Raster->flatshade */
+		BRW_NEW_CLIP |          /* Clip.nr */
+		BRW_NEW_REDUCED_PRIMITIVE),   /* brw->reduced_primitive */
+      .cache = CACHE_NEW_VS_PROG   /* vs.prog_data->outputs_written */
+   },
+   .update = upload_clip_prog
+};
diff --git a/src/gallium/drivers/i965simple/brw_clip.h b/src/gallium/drivers/i965simple/brw_clip.h
new file mode 100644
index 0000000000..d70fc094ff
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_clip.h
@@ -0,0 +1,170 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#ifndef BRW_CLIP_H
+#define BRW_CLIP_H
+
+
+#include "brw_context.h"
+#include "brw_eu.h"
+
+#define MAX_VERTS (3+6+6)	/* length of brw_clip_compile.reg.vertex[] */
+
+/* Note that if unfilled primitives are being emitted, we have to fix
+ * up polygon offset and flatshading at this point:
+ *
+ * Key that identifies one compiled clip program in the BRW_CLIP_PROG
+ * cache.  The whole structure (sizeof) is used as the cache key, which
+ * is why upload_clip_prog() zeroes it before filling it in and why the
+ * bitfields are padded out to full 32-bit words.
+ */
+struct brw_clip_prog_key {
+   unsigned attrs:32;		/* bitmask taken from vs.prog_data->outputs_written */
+   unsigned primitive:4;	/* reduced primitive, a PIPE_PRIM_* value */
+   unsigned nr_userclip:3;	/* taken from attribs.Clip.nr */
+   unsigned do_flat_shading:1;	/* taken from Raster->flatshade */
+   unsigned do_unfilled:1;	/* only set by code that is currently #if 0'd */
+   unsigned fill_cw:2;		/* includes cull information */
+   unsigned fill_ccw:2;		/* includes cull information */
+   unsigned offset_cw:1;
+   unsigned offset_ccw:1;
+   unsigned pad0:17;		/* 15 bits used above + 17 = 32 */
+
+   unsigned copy_bfc_cw:1;
+   unsigned copy_bfc_ccw:1;
+   unsigned clip_mode:3;	/* a BRW_CLIPMODE_* value */
+   unsigned pad1:27;		/* 5 bits used above + 27 = 32 */
+   
+   float offset_factor;		/* Raster->offset_scale in the #if 0'd code */
+   float offset_units;		/* Raster->offset_units in the #if 0'd code */
+};
+
+
+#define CLIP_LINE   0	/* CLIP_*: values compared against key.fill_cw */
+#define CLIP_POINT  1
+#define CLIP_FILL   2
+#define CLIP_CULL   3
+
+
+#define PRIM_MASK  (0x1f)	/* NOTE(review): users not visible here - confirm what field this masks */
+
+struct brw_clip_compile {	/* working state for compiling one clip program */
+   struct brw_compile func;	/* instruction stream being built */
+   struct brw_clip_prog_key key;	/* copy of the key being compiled */
+   struct brw_clip_prog_data prog_data;	/* uploaded alongside the program */
+   
+   struct {	/* registers handed out by the *_alloc_regs() helpers */
+      struct brw_reg R0;
+      struct brw_reg vertex[MAX_VERTS];
+
+      struct brw_reg t;
+      struct brw_reg t0, t1;
+      struct brw_reg dp0, dp1;
+
+      struct brw_reg dpPrev;
+      struct brw_reg dp;
+      struct brw_reg loopcount;
+      struct brw_reg nr_verts;
+      struct brw_reg planemask;
+
+      struct brw_reg inlist;
+      struct brw_reg outlist;
+      struct brw_reg freelist;
+
+      struct brw_reg dir;
+      struct brw_reg tmp0, tmp1;
+      struct brw_reg offset;
+      
+      struct brw_reg fixed_planes;
+      struct brw_reg plane_equation;
+   } reg;
+
+   /* 3 different ways of expressing vertex size:
+    *
+    * compile_clip_prog() sets nr_attrs to the number of bits set in
+    * key.attrs, nr_regs to (nr_attrs + 1) / 2 + 1 and nr_bytes to
+    * nr_regs * REG_SIZE.
+    */
+   unsigned nr_attrs;
+   unsigned nr_regs;
+   unsigned nr_bytes;
+
+   unsigned first_tmp;
+   unsigned last_tmp;
+
+   boolean need_direction;
+
+   unsigned last_mrf;
+
+   unsigned header_position_offset;	/* set to ATTR_SIZE by compile_clip_prog() */
+   unsigned offset[PIPE_MAX_ATTRIBS];	/* byte offset of each attribute present in key.attrs;
+					 * NOTE(review): filled by a loop bounded by
+					 * PIPE_MAX_SHADER_OUTPUTS - confirm that never
+					 * exceeds PIPE_MAX_ATTRIBS */
+};
+
+#define ATTR_SIZE  (4*4)	/* bytes per attribute slot; presumably 4 components of 4 bytes - confirm */
+
+/* Points are only culled, so no need for a clip routine, however it
+ * works out easier to have a dummy one.
+ *
+ * Program emitters: compile_clip_prog() picks the tri, line or point
+ * emitter according to key->primitive.  The call to the unfilled
+ * variant is currently compiled out there.
+ */
+void brw_emit_unfilled_clip( struct brw_clip_compile *c );
+void brw_emit_tri_clip( struct brw_clip_compile *c );
+void brw_emit_line_clip( struct brw_clip_compile *c );
+void brw_emit_point_clip( struct brw_clip_compile *c );
+
+/* brw_clip_tri.c, for use by the unfilled clip routine:
+ */
+void brw_clip_tri_init_vertices( struct brw_clip_compile *c );
+void brw_clip_tri_flat_shade( struct brw_clip_compile *c );
+void brw_clip_tri( struct brw_clip_compile *c );
+void brw_clip_tri_emit_polygon( struct brw_clip_compile *c );
+void brw_clip_tri_alloc_regs( struct brw_clip_compile *c, 
+			      unsigned nr_verts );
+
+
+/* Utils:
+ *
+ * NOTE(review): the definitions are not in this header; presumably they
+ * live in brw_clip_util.c - confirm.
+ */
+
+void brw_clip_interp_vertex( struct brw_clip_compile *c,
+			     struct brw_indirect dest_ptr,
+			     struct brw_indirect v0_ptr, /* from */
+			     struct brw_indirect v1_ptr, /* to */
+			     struct brw_reg t0,
+			     boolean force_edgeflag );
+
+void brw_clip_init_planes( struct brw_clip_compile *c );
+
+void brw_clip_emit_vue(struct brw_clip_compile *c, 
+		       struct brw_indirect vert,
+		       boolean allocate,
+		       boolean eot,
+		       unsigned header);
+
+void brw_clip_kill_thread(struct brw_clip_compile *c);
+
+struct brw_reg brw_clip_plane_stride( struct brw_clip_compile *c );
+struct brw_reg brw_clip_plane0_address( struct brw_clip_compile *c );
+
+void brw_clip_copy_colors( struct brw_clip_compile *c,
+			   unsigned to, unsigned from );
+
+void brw_clip_init_clipmask( struct brw_clip_compile *c );
+
+#endif
diff --git a/src/gallium/drivers/i965simple/brw_clip_line.c b/src/gallium/drivers/i965simple/brw_clip_line.c
new file mode 100644
index 0000000000..75d9e5fcda
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_clip_line.c
@@ -0,0 +1,245 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "brw_defines.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_util.h"
+#include "brw_clip.h"
+
+
+
+/* Lay out the fixed GRF assignment for the line clip program and record
+ * the resulting curb/urb read lengths and GRF total in c->prog_data.
+ */
+static void brw_clip_line_alloc_regs( struct brw_clip_compile *c )
+{
+   unsigned grf = 0;
+   unsigned v;
+
+   /* Register usage is static, precompute here.  GRF 0 is referenced
+    * as c->reg.R0, typed as unsigned dwords.
+    */
+   c->reg.R0 = retype(brw_vec8_grf(grf, 0), BRW_REGISTER_TYPE_UD);
+   grf++;
+
+   if (c->key.nr_userclip) {
+      /* Six fixed planes plus the user planes, two planes per register
+       * (rounded up), are read from the constant buffer.
+       */
+      const unsigned plane_regs = (6 + c->key.nr_userclip + 1) / 2;
+
+      c->reg.fixed_planes = brw_vec4_grf(grf, 0);
+      grf += plane_regs;
+
+      c->prog_data.curb_read_length = plane_regs;
+   }
+   else {
+      c->prog_data.curb_read_length = 0;
+   }
+
+   /* Two payload vertices plus room for two generated ones, each
+    * occupying c->nr_regs registers.
+    */
+   for (v = 0; v < 4; v++) {
+      c->reg.vertex[v] = brw_vec4_grf(grf, 0);
+      grf += c->nr_regs;
+   }
+
+   /* One register shared by the scalars (elements 0..3) and the
+    * current plane equation (elements 4..7).
+    */
+   c->reg.t              = brw_vec1_grf(grf, 0);
+   c->reg.t0             = brw_vec1_grf(grf, 1);
+   c->reg.t1             = brw_vec1_grf(grf, 2);
+   c->reg.planemask      = retype(brw_vec1_grf(grf, 3), BRW_REGISTER_TYPE_UD);
+   c->reg.plane_equation = brw_vec4_grf(grf, 4);
+   grf++;
+
+   /* fixme - dp4 will clobber r.1,2,3 */
+   c->reg.dp0 = brw_vec1_grf(grf, 0);
+   c->reg.dp1 = brw_vec1_grf(grf, 4);
+   grf++;
+
+   /* Without user clip planes the fixed planes get a register of their
+    * own here instead of living in the constant-buffer payload.
+    */
+   if (!c->key.nr_userclip) {
+      c->reg.fixed_planes = brw_vec8_grf(grf, 0);
+      grf++;
+   }
+
+   /* Temporaries start right after the static allocation. */
+   c->last_tmp = grf;
+   c->first_tmp = grf;
+
+   c->prog_data.total_grf = grf;
+   c->prog_data.urb_read_length = c->nr_regs; /* ? */
+}
+
+
+
+/* Line clipping, more or less following the following algorithm:
+ *
+ *  for (p=0;p<MAX_PLANES;p++) {
+ *     if (clipmask & (1 << p)) {
+ *        float dp0 = DOTPROD( vtx0, plane[p] );
+ *        float dp1 = DOTPROD( vtx1, plane[p] );
+ *
+ *        if (IS_NEGATIVE(dp1)) {
+ *           float t = dp1 / (dp1 - dp0);
+ *           if (t > t1) t1 = t;
+ *        } else {
+ *           float t = dp0 / (dp0 - dp1);
+ *           if (t > t0) t0 = t;
+ *        }
+ *
+ *        if (t0 + t1 >= 1.0)
+ *           return;
+ *     }
+ *  }
+ *
+ *  interp( ctx, newvtx0, vtx0, vtx1, t0 );
+ *  interp( ctx, newvtx1, vtx1, vtx0, t1 );
+ *
+ * Unlike the pseudo-code, the emitted program tests t0 + t1 only once,
+ * after the plane loop has finished.
+ *
+ * NOTE(review): the two DP4 instructions that compute dp0/dp1 (and set
+ * the condition consumed by the first inner IF) are currently compiled
+ * out under "#if 0", with a build-time #warning in their place.  As
+ * written, no instruction emitted directly by this function writes
+ * c->reg.dp0 or c->reg.dp1 before they are read.
+ */
+static void clip_and_emit_line( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   /* Indirect pointers 0..4: the two payload vertices, the two
+    * generated vertices and the current clip plane.
+    */
+   struct brw_indirect vtx0     = brw_indirect(0, 0);
+   struct brw_indirect vtx1      = brw_indirect(1, 0);
+   struct brw_indirect newvtx0   = brw_indirect(2, 0);
+   struct brw_indirect newvtx1   = brw_indirect(3, 0);
+   struct brw_indirect plane_ptr = brw_indirect(4, 0);
+   struct brw_instruction *plane_loop;
+   struct brw_instruction *plane_active;
+   struct brw_instruction *is_negative;
+   struct brw_instruction *is_neg2;
+   struct brw_instruction *not_culled;
+   struct brw_reg v1_null_ud = retype(vec1(brw_null_reg()), BRW_REGISTER_TYPE_UD);
+
+   brw_MOV(p, get_addr_reg(vtx0),      brw_address(c->reg.vertex[0]));
+   brw_MOV(p, get_addr_reg(vtx1),      brw_address(c->reg.vertex[1]));
+   brw_MOV(p, get_addr_reg(newvtx0),   brw_address(c->reg.vertex[2]));
+   brw_MOV(p, get_addr_reg(newvtx1),   brw_address(c->reg.vertex[3]));
+   brw_MOV(p, get_addr_reg(plane_ptr), brw_clip_plane0_address(c));
+
+   /* Note: init t0, t1 together.  brw_clip_line_alloc_regs() places
+    * them in adjacent elements of one register, so a 2-wide MOV
+    * covers both.
+    */
+   brw_MOV(p, vec2(c->reg.t0), brw_imm_f(0));
+
+   brw_clip_init_planes(c);
+   brw_clip_init_clipmask(c);
+
+   /* -ve rhw workaround: test bit 20 of R0.2 and OR the six fixed
+    * planes (0x3f) into planemask.  NOTE(review): presumably the OR is
+    * predicated on the flag produced by the AND -- confirm against the
+    * brw_eu emitter.
+    */
+   brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+   brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2),
+	   brw_imm_ud(1<<20));
+   brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(0x3f));
+   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+   plane_loop = brw_DO(p, BRW_EXECUTE_1);
+   {
+      /* if (planemask & 1)
+       */
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+      brw_AND(p, v1_null_ud, c->reg.planemask, brw_imm_ud(1));
+
+      plane_active = brw_IF(p, BRW_EXECUTE_1);
+      {
+	 /* Fetch the current plane equation; a different dereference
+	  * (4f vs. 4b) is used depending on whether user clip planes
+	  * are present.
+	  */
+	 if (c->key.nr_userclip)
+	    brw_MOV(p, c->reg.plane_equation, deref_4f(plane_ptr, 0));
+	 else
+	    brw_MOV(p, c->reg.plane_equation, deref_4b(plane_ptr, 0));
+
+#if 0
+	 /* dp = DP4(vtx->position, plane)
+	  */
+	 brw_DP4(p, vec4(c->reg.dp0), deref_4f(vtx0, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation);
+
+	 /* if (IS_NEGATIVE(dp1))
+	  */
+	 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
+	 brw_DP4(p, vec4(c->reg.dp1), deref_4f(vtx1, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation);
+#else
+	 /* Dot products disabled: see the NOTE(review) in the header
+	  * comment of this function.
+	  */
+         #warning "disabled"
+#endif
+	 is_negative = brw_IF(p, BRW_EXECUTE_1);
+	 {
+	    /* t = dp1 / (dp1 - dp0); if (t > t1) t1 = t;
+	     */
+	    brw_ADD(p, c->reg.t, c->reg.dp1, negate(c->reg.dp0));
+	    brw_math_invert(p, c->reg.t, c->reg.t);
+	    brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp1);
+
+	    brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t1 );
+	    brw_MOV(p, c->reg.t1, c->reg.t);
+	    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+	 }
+	 is_negative = brw_ELSE(p, is_negative);
+	 {
+	    /* Coming back in.  We know that both cannot be negative
+	     * because the line would have been culled in that case.
+	     */
+
+	    /* If both are positive, do nothing */
+             brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.dp0, brw_imm_f(0.0));
+             is_neg2 = brw_IF(p, BRW_EXECUTE_1);
+             {
+		/* t = dp0 / (dp0 - dp1); if (t > t0) t0 = t;
+		 */
+		brw_ADD(p, c->reg.t, c->reg.dp0, negate(c->reg.dp1));
+		brw_math_invert(p, c->reg.t, c->reg.t);
+		brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp0);
+
+		brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t0 );
+		brw_MOV(p, c->reg.t0, c->reg.t);
+		brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+	     }
+	     brw_ENDIF(p, is_neg2);
+	 }
+	 brw_ENDIF(p, is_negative);
+      }
+      brw_ENDIF(p, plane_active);
+
+      /* plane_ptr++;
+       */
+      brw_ADD(p, get_addr_reg(plane_ptr), get_addr_reg(plane_ptr), brw_clip_plane_stride(c));
+
+      /* while (planemask>>=1) != 0
+       */
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+      brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(1));
+   }
+   brw_WHILE(p, plane_loop);
+
+   /* if (t0 + t1 < 1.0) some of the line survives: build the two
+    * clipped end points and emit them as a two-vertex line strip.
+    */
+   brw_ADD(p, c->reg.t, c->reg.t0, c->reg.t1);
+   brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.t, brw_imm_f(1.0));
+   not_culled = brw_IF(p, BRW_EXECUTE_1);
+   {
+      brw_clip_interp_vertex(c, newvtx0, vtx0, vtx1, c->reg.t0, FALSE);
+      brw_clip_interp_vertex(c, newvtx1, vtx1, vtx0, c->reg.t1, FALSE);
+
+      brw_clip_emit_vue(c, newvtx0, 1, 0, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_START);
+      brw_clip_emit_vue(c, newvtx1, 0, 1, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_END);
+   }
+   brw_ENDIF(p, not_culled);
+   /* Emitted on both the culled and the not-culled path. */
+   brw_clip_kill_thread(c);
+}
+
+
+
+/* Build the complete line clip program: lay out the registers, apply
+ * flat shading to the payload if the key asks for it, then emit the
+ * clip-and-emit sequence.
+ */
+void brw_emit_line_clip( struct brw_clip_compile *c )
+{
+   brw_clip_line_alloc_regs(c);
+
+   if (c->key.do_flat_shading) {
+      /* to = vertex 0, from = vertex 1 */
+      brw_clip_copy_colors(c, 0, 1);
+   }
+
+   clip_and_emit_line(c);
+}
diff --git a/src/gallium/drivers/i965simple/brw_clip_point.c b/src/gallium/drivers/i965simple/brw_clip_point.c
new file mode 100644
index 0000000000..6fce7210d1
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_clip_point.c
@@ -0,0 +1,47 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "brw_defines.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_util.h"
+#include "brw_clip.h"
+
+
+/* Point clip program.  No clipping code is generated: the program only
+ * sets up the register layout and terminates the thread.
+ */
+void brw_emit_point_clip( struct brw_clip_compile *c )
+{
+   /* Reuse the triangle register allocator with zero vertex slots,
+    * then send an empty message to kill the thread:
+    */
+   brw_clip_tri_alloc_regs(c, 0);
+   brw_clip_kill_thread(c);
+}
diff --git a/src/gallium/drivers/i965simple/brw_clip_state.c b/src/gallium/drivers/i965simple/brw_clip_state.c
new file mode 100644
index 0000000000..8e78dd51be
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_clip_state.c
@@ -0,0 +1,93 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+
+/* Build a brw_clip_unit_state from the current clip program, CURBE and
+ * URB settings, pass it to the BRW_CLIP_UNIT cache and store the value
+ * the cache returns in brw->clip.state_gs_offset.
+ */
+static void upload_clip_unit( struct brw_context *brw )
+{
+   struct brw_clip_unit_state unit;
+
+   /* Start from all-zero so every field not mentioned below is 0. */
+   memset(&unit, 0, sizeof(unit));
+
+   /* thread0 (CACHE_NEW_CLIP_PROG): GRF count is expressed in blocks
+    * of 16 registers, minus one.
+    */
+   unit.thread0.kernel_start_pointer = brw->clip.prog_gs_offset >> 6;
+   unit.thread0.grf_reg_count =
+      align(brw->clip.prog_data->total_grf, 16) / 16 - 1;
+
+   /* thread1 (constant) */
+   unit.thread1.single_program_flow = 1;
+   unit.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
+
+   /* thread3: read lengths come from the clip program
+    * (CACHE_NEW_CLIP_PROG), the constant read offset from
+    * BRW_NEW_CURBE_OFFSETS, the rest is constant.
+    */
+   unit.thread3.dispatch_grf_start_reg = 1;
+   unit.thread3.urb_entry_read_offset = 0;
+   unit.thread3.urb_entry_read_length = brw->clip.prog_data->urb_read_length;
+   unit.thread3.const_urb_entry_read_offset = brw->curbe.clip_start * 2;
+   unit.thread3.const_urb_entry_read_length = brw->clip.prog_data->curb_read_length;
+
+   /* thread4 (BRW_NEW_URB_FENCE) */
+   unit.thread4.max_threads = 1; /* 2 threads */
+   unit.thread4.urb_entry_allocation_size = brw->urb.vsize - 1;
+   unit.thread4.nr_urb_entries = brw->urb.nr_clip_entries;
+
+   if (BRW_DEBUG & DEBUG_STATS) {
+      unit.thread4.stats_enable = 1;
+   }
+
+   /* clip5: mode from the clip program, remainder constant */
+   unit.clip5.clip_mode = brw->clip.prog_data->clip_mode;
+   unit.clip5.api_mode = BRW_CLIP_API_OGL;
+   unit.clip5.vertex_position_space = BRW_CLIP_NDCSPACE;
+   unit.clip5.viewport_xy_clip_enable = 1;
+   unit.clip5.viewport_z_clip_enable = 1;
+   unit.clip5.guard_band_enable = 0;
+   unit.clip5.userclip_must_clip = 1;
+   unit.clip5.userclip_enable_flags = 0x7f;
+
+   /* clip6 and the viewport extents (constant) */
+   unit.clip6.clipper_viewport_state_ptr = 0;
+   unit.viewport_xmin = -1;
+   unit.viewport_xmax = 1;
+   unit.viewport_ymin = -1;
+   unit.viewport_ymax = 1;
+
+   brw->clip.state_gs_offset = brw_cache_data( &brw->cache[BRW_CLIP_UNIT], &unit );
+}
+
+
+/* Tracked-state entry for the clip unit.  upload_clip_unit() is the
+ * update hook; the dirty bits match what that function reads: CURBE
+ * offsets, the URB fence and the cached clip program.
+ */
+const struct brw_tracked_state brw_clip_unit = {
+   .dirty = {
+      .brw = (BRW_NEW_CURBE_OFFSETS | BRW_NEW_URB_FENCE),
+      .cache = CACHE_NEW_CLIP_PROG,
+   },
+   .update = upload_clip_unit,
+};
diff --git a/src/gallium/drivers/i965simple/brw_clip_tri.c b/src/gallium/drivers/i965simple/brw_clip_tri.c
new file mode 100644
index 0000000000..c5da7b825e
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_clip_tri.c
@@ -0,0 +1,566 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "brw_defines.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_util.h"
+#include "brw_clip.h"
+
+/* Hand out the next temporary GRF as a vec4 register and advance the
+ * allocation cursor.  prog_data.total_grf is raised when the cursor
+ * moves past it.
+ */
+static struct brw_reg get_tmp( struct brw_clip_compile *c )
+{
+   struct brw_reg reg = brw_vec4_grf(c->last_tmp, 0);
+
+   c->last_tmp++;
+
+   if (c->prog_data.total_grf < c->last_tmp) {
+      c->prog_data.total_grf = c->last_tmp;
+   }
+
+   return reg;
+}
+
+/* Give back every temporary handed out by get_tmp() by rewinding the
+ * allocation cursor to first_tmp.  prog_data.total_grf is not lowered.
+ */
+static void release_tmps( struct brw_clip_compile *c )
+{
+   c->last_tmp = c->first_tmp;
+}
+
+
+/* Lay out the fixed GRF assignment for the triangle clip program and
+ * record the resulting curb/urb read lengths and GRF total in
+ * c->prog_data.  Also called by the point program with nr_verts == 0.
+ *
+ * c:        clip compile context.  c->reg, c->first_tmp, c->last_tmp
+ *           and c->prog_data are written; when c->nr_attrs is odd, MOV
+ *           instructions are emitted into c->func.
+ * nr_verts: number of vertex slots to reserve, c->nr_regs GRFs each.
+ */
+void brw_clip_tri_alloc_regs( struct brw_clip_compile *c,
+			      unsigned nr_verts )
+{
+   unsigned i = 0,j;
+
+   /* Register usage is static, precompute here:
+    */
+   c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++;
+
+   if (c->key.nr_userclip) {
+      /* Six fixed planes plus the user planes, two per register
+       * (rounded up), are read from the constant buffer.
+       */
+      c->reg.fixed_planes = brw_vec4_grf(i, 0);
+      i += (6 + c->key.nr_userclip + 1) / 2;
+
+      c->prog_data.curb_read_length = (6 + c->key.nr_userclip + 1) / 2;
+   }
+   else
+      c->prog_data.curb_read_length = 0;
+
+
+   /* Payload vertices plus space for more generated vertices:
+    */
+   for (j = 0; j < nr_verts; j++) {
+      c->reg.vertex[j] = brw_vec4_grf(i, 0);
+      i += c->nr_regs;
+   }
+
+   /* With an odd attribute count, write 0.0 at byte offset
+    * nr_attrs*16 + 32 of each payload vertex (presumably the unused
+    * slot that pads the vertex to a whole register -- confirm).
+    *
+    * Only touch vertex slots that were actually assigned above: the
+    * point program calls this with nr_verts == 0, and emitting MOVs
+    * against c->reg.vertex[0..2] in that case would write through
+    * registers this call never set up.
+    */
+   if (c->nr_attrs & 1) {
+      unsigned delta = c->nr_attrs*16 + 32;
+
+      for (j = 0; j < 3 && j < nr_verts; j++) {
+	 brw_MOV(&c->func, byte_offset(c->reg.vertex[j], delta), brw_imm_f(0));
+      }
+   }
+
+   /* One register shared by the scalars (elements 0..3) and the
+    * current plane equation (elements 4..7).
+    */
+   c->reg.t          = brw_vec1_grf(i, 0);
+   c->reg.loopcount  = retype(brw_vec1_grf(i, 1), BRW_REGISTER_TYPE_UD);
+   c->reg.nr_verts   = retype(brw_vec1_grf(i, 2), BRW_REGISTER_TYPE_UD);
+   c->reg.planemask  = retype(brw_vec1_grf(i, 3), BRW_REGISTER_TYPE_UD);
+   c->reg.plane_equation = brw_vec4_grf(i, 4);
+   i++;
+
+   c->reg.dpPrev     = brw_vec1_grf(i, 0); /* fixme - dp4 will clobber r.1,2,3 */
+   c->reg.dp         = brw_vec1_grf(i, 4);
+   i++;
+
+   /* The three vertex lists each get a full register of 16-bit
+    * entries.
+    */
+   c->reg.inlist     = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0);
+   i++;
+
+   c->reg.outlist    = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0);
+   i++;
+
+   c->reg.freelist   = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0);
+   i++;
+
+   /* Without user clip planes the fixed planes get a register of their
+    * own here instead of living in the constant-buffer payload.
+    */
+   if (!c->key.nr_userclip) {
+      c->reg.fixed_planes = brw_vec8_grf(i, 0);
+      i++;
+   }
+
+   /* Extra working registers, only allocated for unfilled mode. */
+   if (c->key.do_unfilled) {
+      c->reg.dir     = brw_vec4_grf(i, 0);
+      c->reg.offset  = brw_vec4_grf(i, 4);
+      i++;
+      c->reg.tmp0    = brw_vec4_grf(i, 0);
+      c->reg.tmp1    = brw_vec4_grf(i, 4);
+      i++;
+   }
+
+   /* Temporaries start right after the static allocation. */
+   c->first_tmp = i;
+   c->last_tmp = i;
+
+   c->prog_data.urb_read_length = c->nr_regs; /* ? */
+   c->prog_data.total_grf = i;
+}
+
+
+
+/* Emit the code that seeds c->reg.inlist with the addresses of the
+ * three payload vertices, clears the outlist register and sets
+ * nr_verts to 3.  For _3DPRIM_TRISTRIP_REVERSE the first two vertices
+ * are swapped; when c->need_direction is set, c->reg.dir is loaded
+ * with -1 for the reversed case and +1 otherwise.
+ */
+void brw_clip_tri_init_vertices( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *rev_branch;
+   /* loopcount is not in use yet, so borrow it as scratch for the
+    * primitive type.
+    */
+   struct brw_reg prim = c->reg.loopcount;
+
+   /* Initial list of indices for incoming vertexes:
+    */
+   brw_AND(p, prim, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK));
+   brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ,
+	   prim, brw_imm_ud(_3DPRIM_TRISTRIP_REVERSE));
+
+   /* XXX: Is there an easier way to do this?  Need to reverse every
+    * second tristrip element:  Can ignore sometimes?
+    */
+   rev_branch = brw_IF(p, BRW_EXECUTE_1);
+   {
+      /* Reversed strip element: inlist = { v1, v0, ... } */
+      brw_MOV(p, get_element(c->reg.inlist, 0), brw_address(c->reg.vertex[1]));
+      brw_MOV(p, get_element(c->reg.inlist, 1), brw_address(c->reg.vertex[0]));
+      if (c->need_direction) {
+	 brw_MOV(p, c->reg.dir, brw_imm_f(-1));
+      }
+   }
+   rev_branch = brw_ELSE(p, rev_branch);
+   {
+      /* Everything else: inlist = { v0, v1, ... } */
+      brw_MOV(p, get_element(c->reg.inlist, 0), brw_address(c->reg.vertex[0]));
+      brw_MOV(p, get_element(c->reg.inlist, 1), brw_address(c->reg.vertex[1]));
+      if (c->need_direction) {
+	 brw_MOV(p, c->reg.dir, brw_imm_f(1));
+      }
+   }
+   brw_ENDIF(p, rev_branch);
+
+   /* The third vertex is the same in both cases. */
+   brw_MOV(p, get_element(c->reg.inlist, 2), brw_address(c->reg.vertex[2]));
+
+   /* Zero the whole outlist register and start with three vertices. */
+   brw_MOV(p, brw_vec8_grf(c->reg.outlist.nr, 0), brw_imm_f(0));
+   brw_MOV(p, c->reg.nr_verts, brw_imm_ud(3));
+}
+
+
+
+/* Emit the flat-shading color copies for the three payload vertices.
+ * For _3DPRIM_POLYGON the colors of vertex 0 are copied to vertices 1
+ * and 2; for every other primitive type the colors of vertex 2 are
+ * copied to vertices 0 and 1.
+ */
+void brw_clip_tri_flat_shade( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   /* loopcount is borrowed as scratch for the primitive type. */
+   struct brw_reg prim = c->reg.loopcount;
+   struct brw_instruction *poly_branch;
+
+   brw_AND(p, prim, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK));
+   brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ,
+	   prim, brw_imm_ud(_3DPRIM_POLYGON));
+
+   poly_branch = brw_IF(p, BRW_EXECUTE_1);
+   {
+      /* Polygon: vertex 0 supplies the colors. */
+      brw_clip_copy_colors(c, 1, 0);
+      brw_clip_copy_colors(c, 2, 0);
+   }
+   poly_branch = brw_ELSE(p, poly_branch);
+   {
+      /* Otherwise: vertex 2 supplies the colors. */
+      brw_clip_copy_colors(c, 0, 2);
+      brw_clip_copy_colors(c, 1, 2);
+   }
+   brw_ENDIF(p, poly_branch);
+}
+
+
+
+/* Use mesa's clipping algorithms, translated to GEN4 assembly.
+ *
+ * NOTE: the whole body is currently compiled out (#if 0), so calling
+ * this function emits no instructions; the #else branch only raises a
+ * build-time #warning.
+ *
+ * The disabled code loops over the planes selected by
+ * c->reg.planemask.  For each active plane it walks the vertices in
+ * c->reg.inlist, writes the surviving and newly interpolated vertices
+ * to c->reg.outlist, then copies outlist back over inlist for the next
+ * plane.  The plane loop stops once fewer than three vertices remain
+ * or planemask has been shifted down to zero.
+ */
+void brw_clip_tri( struct brw_clip_compile *c )
+{
+#if 0
+   struct brw_compile *p = &c->func;
+   struct brw_indirect vtx = brw_indirect(0, 0);
+   struct brw_indirect vtxPrev = brw_indirect(1, 0);
+   struct brw_indirect vtxOut = brw_indirect(2, 0);
+   struct brw_indirect plane_ptr = brw_indirect(3, 0);
+   struct brw_indirect inlist_ptr = brw_indirect(4, 0);
+   struct brw_indirect outlist_ptr = brw_indirect(5, 0);
+   struct brw_indirect freelist_ptr = brw_indirect(6, 0);
+   struct brw_instruction *plane_loop;
+   struct brw_instruction *plane_active;
+   struct brw_instruction *vertex_loop;
+   struct brw_instruction *next_test;
+   struct brw_instruction *prev_test;
+
+   brw_MOV(p, get_addr_reg(vtxPrev),     brw_address(c->reg.vertex[2]) );
+   brw_MOV(p, get_addr_reg(plane_ptr),   brw_clip_plane0_address(c));
+   brw_MOV(p, get_addr_reg(inlist_ptr),  brw_address(c->reg.inlist));
+   brw_MOV(p, get_addr_reg(outlist_ptr), brw_address(c->reg.outlist));
+
+   /* Free vertex slots start right after the three payload vertices. */
+   brw_MOV(p, get_addr_reg(freelist_ptr), brw_address(c->reg.vertex[3]) );
+
+   plane_loop = brw_DO(p, BRW_EXECUTE_1);
+   {
+      /* if (planemask & 1)
+       */
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+      brw_AND(p, vec1(brw_null_reg()), c->reg.planemask, brw_imm_ud(1));
+
+      plane_active = brw_IF(p, BRW_EXECUTE_1);
+      {
+	 /* vtxOut = freelist_ptr++
+	  */
+	 brw_MOV(p, get_addr_reg(vtxOut),       get_addr_reg(freelist_ptr) );
+	 brw_ADD(p, get_addr_reg(freelist_ptr), get_addr_reg(freelist_ptr), brw_imm_uw(c->nr_regs * REG_SIZE));
+
+	 if (c->key.nr_userclip)
+	    brw_MOV(p, c->reg.plane_equation, deref_4f(plane_ptr, 0));
+	 else
+	    brw_MOV(p, c->reg.plane_equation, deref_4b(plane_ptr, 0));
+
+	 /* loopcount = nr_verts; nr_verts = 0;
+	  */
+	 brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
+	 brw_MOV(p, c->reg.nr_verts, brw_imm_ud(0));
+
+	 vertex_loop = brw_DO(p, BRW_EXECUTE_1);
+	 {
+	    /* vtx = *input_ptr;
+	     */
+	    brw_MOV(p, get_addr_reg(vtx), deref_1uw(inlist_ptr, 0));
+
+	    /* IS_NEGATIVE(prev) */
+	    brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
+	    brw_DP4(p, vec4(c->reg.dpPrev), deref_4f(vtxPrev, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation);
+	    prev_test = brw_IF(p, BRW_EXECUTE_1);
+	    {
+	       /* IS_POSITIVE(next)
+		*/
+	       brw_set_conditionalmod(p, BRW_CONDITIONAL_GE);
+	       brw_DP4(p, vec4(c->reg.dp), deref_4f(vtx, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation);
+	       next_test = brw_IF(p, BRW_EXECUTE_1);
+	       {
+
+		  /* Coming back in.
+		   */
+		  brw_ADD(p, c->reg.t, c->reg.dpPrev, negate(c->reg.dp));
+		  brw_math_invert(p, c->reg.t, c->reg.t);
+		  brw_MUL(p, c->reg.t, c->reg.t, c->reg.dpPrev);
+
+		  /* If (vtxOut == 0) vtxOut = vtxPrev
+		   */
+		  brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, get_addr_reg(vtxOut), brw_imm_uw(0) );
+		  brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(vtxPrev) );
+		  brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+		  brw_clip_interp_vertex(c, vtxOut, vtxPrev, vtx, c->reg.t, FALSE);
+
+		  /* *outlist_ptr++ = vtxOut;
+		   * nr_verts++;
+		   * vtxOut = 0;
+		   */
+		  brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxOut));
+		  brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short)));
+		  brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1));
+		  brw_MOV(p, get_addr_reg(vtxOut), brw_imm_uw(0) );
+	       }
+	       brw_ENDIF(p, next_test);
+
+	    }
+	    prev_test = brw_ELSE(p, prev_test);
+	    {
+	       /* *outlist_ptr++ = vtxPrev;
+		* nr_verts++;
+		*/
+	       brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxPrev));
+	       brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short)));
+	       brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1));
+
+	       /* IS_NEGATIVE(next)
+		*/
+	       brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
+	       brw_DP4(p, vec4(c->reg.dp), deref_4f(vtx, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation);
+	       next_test = brw_IF(p, BRW_EXECUTE_1);
+	       {
+		  /* Going out of bounds.  Avoid division by zero as we
+		   * know dp != dpPrev from DIFFERENT_SIGNS, above.
+		   */
+		  brw_ADD(p, c->reg.t, c->reg.dp, negate(c->reg.dpPrev));
+		  brw_math_invert(p, c->reg.t, c->reg.t);
+		  brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp);
+
+		  /* If (vtxOut == 0) vtxOut = vtx
+		   */
+		  brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, get_addr_reg(vtxOut), brw_imm_uw(0) );
+		  brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(vtx) );
+		  brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+		  brw_clip_interp_vertex(c, vtxOut, vtx, vtxPrev, c->reg.t, TRUE);
+
+		  /* *outlist_ptr++ = vtxOut;
+		   * nr_verts++;
+		   * vtxOut = 0;
+		   */
+		  brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxOut));
+		  brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short)));
+		  brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1));
+		  brw_MOV(p, get_addr_reg(vtxOut), brw_imm_uw(0) );
+	       }
+	       brw_ENDIF(p, next_test);
+	    }
+	    brw_ENDIF(p, prev_test);
+
+	    /* vtxPrev = vtx;
+	     * inlist_ptr++;
+	     */
+	    brw_MOV(p, get_addr_reg(vtxPrev), get_addr_reg(vtx));
+	    brw_ADD(p, get_addr_reg(inlist_ptr), get_addr_reg(inlist_ptr), brw_imm_uw(sizeof(short)));
+
+	    /* while (--loopcount != 0)
+	     */
+	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+	    brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
+	 }
+	 brw_WHILE(p, vertex_loop);
+
+	 /* vtxPrev = *(outlist_ptr-1)  OR: outlist[nr_verts-1]
+	  * inlist = outlist
+	  * inlist_ptr = &inlist[0]
+	  * outlist_ptr = &outlist[0]
+	  */
+	 brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_w(-2));
+	 brw_MOV(p, get_addr_reg(vtxPrev), deref_1uw(outlist_ptr, 0));
+	 brw_MOV(p, brw_vec8_grf(c->reg.inlist.nr, 0), brw_vec8_grf(c->reg.outlist.nr, 0));
+	 brw_MOV(p, get_addr_reg(inlist_ptr), brw_address(c->reg.inlist));
+	 brw_MOV(p, get_addr_reg(outlist_ptr), brw_address(c->reg.outlist));
+      }
+      brw_ENDIF(p, plane_active);
+
+      /* plane_ptr++;
+       */
+      brw_ADD(p, get_addr_reg(plane_ptr), get_addr_reg(plane_ptr), brw_clip_plane_stride(c));
+
+      /* nr_verts >= 3
+       */
+      brw_CMP(p,
+	      vec1(brw_null_reg()),
+	      BRW_CONDITIONAL_GE,
+	      c->reg.nr_verts,
+	      brw_imm_ud(3));
+
+      /* && (planemask>>=1) != 0
+       */
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+      brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(1));
+   }
+   brw_WHILE(p, plane_loop);
+#else
+   /* Clipper body disabled: nothing is emitted. */
+         #warning "disabled"
+#endif
+}
+
+
+
+/* Emit the code that sends the vertices listed in c->reg.inlist out as
+ * a single triangle fan.  Nothing is sent unless nr_verts - 2 > 0.
+ * The first vertex carries R02_PRIM_START, the last one R02_PRIM_END
+ * (and the end-of-thread flag), the ones in between neither.
+ */
+void brw_clip_tri_emit_polygon(struct brw_clip_compile *c)
+{
+   struct brw_compile *p = &c->func;
+   struct brw_indirect vert = brw_indirect(0, 0);      /* current vertex */
+   struct brw_indirect list_ptr = brw_indirect(1, 0);  /* cursor into inlist */
+   struct brw_instruction *have_tris;
+   struct brw_instruction *emit_loop;
+   const unsigned fan_first = ((_3DPRIM_TRIFAN << 2) | R02_PRIM_START);
+   const unsigned fan_mid = (_3DPRIM_TRIFAN << 2);
+   const unsigned fan_last = ((_3DPRIM_TRIFAN << 2) | R02_PRIM_END);
+
+   /* for (loopcount = nr_verts-2; loopcount > 0; loopcount--)
+    */
+   brw_set_conditionalmod(p, BRW_CONDITIONAL_G);
+   brw_ADD(p, c->reg.loopcount, c->reg.nr_verts, brw_imm_d(-2));
+
+   have_tris = brw_IF(p, BRW_EXECUTE_1);
+   {
+      /* vert = inlist[0]; emit it as the start of the fan. */
+      brw_MOV(p, get_addr_reg(list_ptr), brw_address(c->reg.inlist));
+      brw_MOV(p, get_addr_reg(vert), deref_1uw(list_ptr, 0));
+
+      brw_clip_emit_vue(c, vert, 1, 0, fan_first);
+
+      /* Step to the next 16-bit list entry. */
+      brw_ADD(p, get_addr_reg(list_ptr), get_addr_reg(list_ptr), brw_imm_uw(2));
+      brw_MOV(p, get_addr_reg(vert), deref_1uw(list_ptr, 0));
+
+      emit_loop = brw_DO(p, BRW_EXECUTE_1);
+      {
+	 brw_clip_emit_vue(c, vert, 1, 0, fan_mid);
+
+	 brw_ADD(p, get_addr_reg(list_ptr), get_addr_reg(list_ptr), brw_imm_uw(2));
+	 brw_MOV(p, get_addr_reg(vert), deref_1uw(list_ptr, 0));
+
+	 /* while (--loopcount != 0) */
+	 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+	 brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
+      }
+      brw_WHILE(p, emit_loop);
+
+      /* Final vertex: no allocate, end of thread. */
+      brw_clip_emit_vue(c, vert, 0, 1, fan_last);
+   }
+   brw_ENDIF(p, have_tris);
+}
+
+/* Unconditional triangle clip: calls brw_clip_init_planes() and then
+ * brw_clip_tri() on the same compile context.
+ */
+static void do_clip_tri( struct brw_clip_compile *c )
+{
+   brw_clip_init_planes(c);
+
+   brw_clip_tri(c);
+}
+
+
+/* Emit the triangle clipper inside a run-time test, so that the
+ * generated program only executes it when planemask is non-zero.
+ */
+static void maybe_do_clip_tri( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *clip_branch;
+
+   /* if (planemask != 0) */
+   brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ,
+	   c->reg.planemask, brw_imm_ud(0));
+
+   clip_branch = brw_IF(p, BRW_EXECUTE_1);
+   do_clip_tri(c);
+   brw_ENDIF(p, clip_branch);
+}
+
+/* Clip test used by brw_emit_tri_clip() for the "-ve rhw workaround"
+ * path.
+ *
+ * NOTE: the whole body is currently compiled out (#if 0), so calling
+ * this function emits no instructions; the #else branch only raises a
+ * build-time #warning.
+ *
+ * The disabled code loads the positions of the three payload vertices,
+ * compares components 0..2 of each against component 3 (once negated
+ * with LE, once directly with L), XORs the per-vertex results and ORs
+ * one bit into c->reg.planemask for every component on which the three
+ * vertices do not all agree: bits 5, 3, 1 for the first set of
+ * comparisons and bits 4, 2, 0 for the second.
+ */
+static void brw_clip_test( struct brw_clip_compile *c )
+{
+#if 0
+    struct brw_reg t = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
+    struct brw_reg t1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
+    struct brw_reg t2 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
+    struct brw_reg t3 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
+
+    struct brw_reg v0 = get_tmp(c);
+    struct brw_reg v1 = get_tmp(c);
+    struct brw_reg v2 = get_tmp(c);
+
+    struct brw_indirect vt0 = brw_indirect(0, 0);
+    struct brw_indirect vt1 = brw_indirect(1, 0);
+    struct brw_indirect vt2 = brw_indirect(2, 0);
+
+    struct brw_compile *p = &c->func;
+
+    /* Copy the three vertex positions into temporaries. */
+    brw_MOV(p, get_addr_reg(vt0), brw_address(c->reg.vertex[0]));
+    brw_MOV(p, get_addr_reg(vt1), brw_address(c->reg.vertex[1]));
+    brw_MOV(p, get_addr_reg(vt2), brw_address(c->reg.vertex[2]));
+    brw_MOV(p, v0, deref_4f(vt0, c->offset[VERT_RESULT_HPOS]));
+    brw_MOV(p, v1, deref_4f(vt1, c->offset[VERT_RESULT_HPOS]));
+    brw_MOV(p, v2, deref_4f(vt2, c->offset[VERT_RESULT_HPOS]));
+
+    /* test nearz, xmin, ymin plane */
+    brw_CMP(p, t1, BRW_CONDITIONAL_LE, negate(v0), get_element(v0, 3));
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    brw_CMP(p, t2, BRW_CONDITIONAL_LE, negate(v1), get_element(v1, 3));
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    brw_CMP(p, t3, BRW_CONDITIONAL_LE, negate(v2), get_element(v2, 3));
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    /* t = (t1 ^ t2) | (t2 ^ t3): non-zero where the vertices differ. */
+    brw_XOR(p, t, t1, t2);
+    brw_XOR(p, t1, t2, t3);
+    brw_OR(p, t, t, t1);
+
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+	    get_element(t, 0), brw_imm_ud(0));
+    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<5)));
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+	    get_element(t, 1), brw_imm_ud(0));
+    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<3)));
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+	    get_element(t, 2), brw_imm_ud(0));
+    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<1)));
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+    /* test farz, xmax, ymax plane */
+    brw_CMP(p, t1, BRW_CONDITIONAL_L, v0, get_element(v0, 3));
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    brw_CMP(p, t2, BRW_CONDITIONAL_L, v1, get_element(v1, 3));
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    brw_CMP(p, t3, BRW_CONDITIONAL_L, v2, get_element(v2, 3));
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+    brw_XOR(p, t, t1, t2);
+    brw_XOR(p, t1, t2, t3);
+    brw_OR(p, t, t, t1);
+
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+	    get_element(t, 0), brw_imm_ud(0));
+    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<4)));
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+	    get_element(t, 1), brw_imm_ud(0));
+    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<2)));
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+	    get_element(t, 2), brw_imm_ud(0));
+    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<0)));
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+    /* Hand the seven temporaries back. */
+    release_tmps(c);
+#else
+    /* Clip test disabled: nothing is emitted. */
+         #warning "disabled"
+#endif
+}
+
+
+/* Build the complete triangle clip program: register layout, vertex
+ * list and clip mask setup, the optional -ve rhw clip test, optional
+ * flat shading, the clipper itself, the trifan emit and finally the
+ * thread-terminating message.
+ */
+void brw_emit_tri_clip( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *rhw_branch;
+
+   /* Three payload vertices plus one slot per user plane and six more. */
+   brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6);
+   brw_clip_tri_init_vertices(c);
+   brw_clip_init_clipmask(c);
+
+   /* if -ve rhw workaround bit (bit 20 of R0.2) is set, do cliptest */
+   brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+   brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2),
+	   brw_imm_ud(1<<20));
+   rhw_branch = brw_IF(p, BRW_EXECUTE_1);
+   brw_clip_test(c);
+   brw_ENDIF(p, rhw_branch);
+
+   /* Can't push into do_clip_tri because with polygon (or quad)
+    * flatshading, need to apply the flatshade here because we don't
+    * respect the PV when converting to trifan for emit:
+    */
+   if (c->key.do_flat_shading) {
+      brw_clip_tri_flat_shade(c);
+   }
+
+   /* Normal mode always runs the clipper; every other mode runs it
+    * only when the run-time planemask is non-zero.
+    */
+   if (c->key.clip_mode != BRW_CLIPMODE_NORMAL) {
+      maybe_do_clip_tri(c);
+   }
+   else {
+      do_clip_tri(c);
+   }
+
+   brw_clip_tri_emit_polygon(c);
+
+   /* Send an empty message to kill the thread:
+    */
+   brw_clip_kill_thread(c);
+}
diff --git a/src/gallium/drivers/i965simple/brw_clip_unfilled.c b/src/gallium/drivers/i965simple/brw_clip_unfilled.c
new file mode 100644
index 0000000000..b774a76dd6
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_clip_unfilled.c
@@ -0,0 +1,477 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "brw_defines.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_util.h"
+#include "brw_clip.h"
+
+
+/* Emit code scaling c->reg.dir by the cross product of two edges.
+ * This is performed against the original triangles, so no indirection
+ * required:
+BZZZT!
+ */
+static void compute_tri_direction( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg e = c->reg.tmp0;
+   struct brw_reg f = c->reg.tmp1;
+   struct brw_reg v0 = byte_offset(c->reg.vertex[0], c->offset[VERT_RESULT_HPOS]); 
+   struct brw_reg v1 = byte_offset(c->reg.vertex[1], c->offset[VERT_RESULT_HPOS]); 
+   struct brw_reg v2 = byte_offset(c->reg.vertex[2], c->offset[VERT_RESULT_HPOS]); 
+
+
+   /* Calculate the vectors of two edges of the triangle:
+    */
+   brw_ADD(p, e, v0, negate(v2)); 
+   brw_ADD(p, f, v1, negate(v2)); 
+
+   /* Take their crossproduct:
+    */
+   brw_set_access_mode(p, BRW_ALIGN_16);
+   brw_MUL(p, vec4(brw_null_reg()), brw_swizzle(e, 1,2,0,3),  brw_swizzle(f,2,0,1,3));
+   brw_MAC(p, vec4(e),  negate(brw_swizzle(e, 2,0,1,3)), brw_swizzle(f,1,2,0,3));
+   brw_set_access_mode(p, BRW_ALIGN_1);
+   /* Fold the cross product (now in e) into the direction register: */
+   brw_MUL(p, c->reg.dir, c->reg.dir, vec4(e));
+}
+
+/* Emit code that terminates the thread when the triangle has the
+ * winding whose fill mode is CLIP_CULL.  The sign of dir.z serves
+ * as the winding test.
+ */
+static void cull_direction( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *cull_if;
+   unsigned cond;
+
+   /* Both windings being culled is handled before we get here: */
+   assert (!(c->key.fill_ccw == CLIP_CULL &&
+	     c->key.fill_cw == CLIP_CULL));
+
+   /* dir.z >= 0 picks out ccw triangles, dir.z < 0 the cw ones: */
+   cond = (c->key.fill_ccw == CLIP_CULL) ? BRW_CONDITIONAL_GE
+					  : BRW_CONDITIONAL_L;
+
+   brw_CMP(p, vec1(brw_null_reg()), cond,
+	   get_element(c->reg.dir, 2), brw_imm_f(0));
+
+   cull_if = brw_IF(p, BRW_EXECUTE_1);
+   {
+      brw_clip_kill_thread(c);
+   }
+   brw_ENDIF(p, cull_if);
+}
+
+
+/* Copy back-face colors over the front colors for the selected winding. */
+static void copy_bfc( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *ccw;
+   unsigned conditional;
+
+   /* Do we have any colors to copy? 
+    */
+   if (!(c->offset[VERT_RESULT_COL0] && c->offset[VERT_RESULT_BFC0]) &&
+       !(c->offset[VERT_RESULT_COL1] && c->offset[VERT_RESULT_BFC1]))
+      return;
+
+   /* In some weird degenerate cases we can end up testing the
+    * direction twice, once for culling and once for bfc copying.  Oh
+    * well, that's what you get for setting weird GL state.
+    */
+   if (c->key.copy_bfc_ccw)
+      conditional = BRW_CONDITIONAL_GE;
+   else
+      conditional = BRW_CONDITIONAL_L;
+
+   brw_CMP(p,
+	   vec1(brw_null_reg()),
+	   conditional,
+	   get_element(c->reg.dir, 2),
+	   brw_imm_f(0));
+   
+   ccw = brw_IF(p, BRW_EXECUTE_1);
+   {
+      unsigned i;
+
+      for (i = 0; i < 3; i++) {
+	 if (c->offset[VERT_RESULT_COL0] && c->offset[VERT_RESULT_BFC0])
+	    brw_MOV(p, 
+		    byte_offset(c->reg.vertex[i], c->offset[VERT_RESULT_COL0]),
+		    byte_offset(c->reg.vertex[i], c->offset[VERT_RESULT_BFC0]));
+
+	 if (c->offset[VERT_RESULT_COL1] && c->offset[VERT_RESULT_BFC1])
+	    brw_MOV(p, 
+		    byte_offset(c->reg.vertex[i], c->offset[VERT_RESULT_COL1]),
+		    byte_offset(c->reg.vertex[i], c->offset[VERT_RESULT_BFC1]));
+      }
+   }
+   brw_ENDIF(p, ccw);
+}
+
+
+
+/* Emit the polygon offset computation; the result ends up in c->reg.offset.x. */
+/*
+  float iz	= 1.0 / dir.z;
+  float ac	= dir.x * iz;
+  float bc	= dir.y * iz;
+  offset = ctx->Polygon.OffsetUnits * DEPTH_SCALE;
+  offset += MAX2( abs(ac), abs(bc) ) * ctx->Polygon.OffsetFactor;
+  offset *= MRD;
+*/
+static void compute_offset( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg off = c->reg.offset;
+   struct brw_reg dir = c->reg.dir;
+   /* off.z = 1 / dir.z;  off.xy = dir.xy * off.z */
+   brw_math_invert(p, get_element(off, 2), get_element(dir, 2));
+   brw_MUL(p, vec2(off), dir, get_element(off, 2));
+
+   brw_CMP(p, 
+	   vec1(brw_null_reg()), 
+	   BRW_CONDITIONAL_GE,
+	   brw_abs(get_element(off, 0)), 
+	   brw_abs(get_element(off, 1)));
+   /* off.x = MAX2(|off.x|, |off.y|); SEL presumably keys off the compare above: */
+   brw_SEL(p, vec1(off), brw_abs(get_element(off, 0)), brw_abs(get_element(off, 1)));
+   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+   /* off.x = off.x * offset_factor + offset_units; no MRD multiply is emitted: */
+   brw_MUL(p, vec1(off), off, brw_imm_f(c->key.offset_factor));
+   brw_ADD(p, vec1(off), off, brw_imm_f(c->key.offset_units));
+}
+
+/* For _3DPRIM_POLYGON input, emit tests of R0.2 bits 8/9 that can zero edge flags. */
+static void merge_edgeflags( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *is_poly;
+   struct brw_reg tmp0 = get_element_ud(c->reg.tmp0, 0);
+
+   brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK)); 
+   brw_CMP(p, 
+	   vec1(brw_null_reg()), 
+	   BRW_CONDITIONAL_EQ, 
+	   tmp0,
+	   brw_imm_ud(_3DPRIM_POLYGON));
+
+   /* Get away with using reg.vertex because we know that this is not
+    * a _3DPRIM_TRISTRIP_REVERSE:
+    */
+   is_poly = brw_IF(p, BRW_EXECUTE_1);
+   {   
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_EQ);
+      brw_AND(p, vec1(brw_null_reg()), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<8));
+      brw_MOV(p, byte_offset(c->reg.vertex[0], c->offset[VERT_RESULT_EDGE]), brw_imm_f(0));
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+      /* Same again for bit 9 of R0.2 and the edge flag of vertex 2: */
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_EQ);
+      brw_AND(p, vec1(brw_null_reg()), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<9));
+      brw_MOV(p, byte_offset(c->reg.vertex[2], c->offset[VERT_RESULT_EDGE]), brw_imm_f(0));
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+   }
+   brw_ENDIF(p, is_poly);
+}
+
+
+/* Emit an ADD of c->reg.offset.x onto the z component of vert's HPOS. */
+static void apply_one_offset( struct brw_clip_compile *c,
+			  struct brw_indirect vert )
+{
+   struct brw_reg hpos_z =
+      get_element(deref_4f(vert, c->offset[VERT_RESULT_HPOS]), 2);
+
+   /* z += offset, in place: */
+   brw_ADD(&c->func, hpos_z, hpos_z, vec1(c->reg.offset));
+}
+
+
+
+/***********************************************************************
+ * Output clipped polygon as an unfilled primitive:
+ */
+static void emit_lines(struct brw_clip_compile *c,
+		       boolean do_offset)
+{
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *loop;
+   struct brw_instruction *draw_edge;
+   struct brw_indirect v0 = brw_indirect(0, 0);
+   struct brw_indirect v1 = brw_indirect(1, 0);
+   struct brw_indirect v0ptr = brw_indirect(2, 0);
+   struct brw_indirect v1ptr = brw_indirect(3, 0);
+
+   /* Emit each polygon edge whose edge flag is set as a two-vertex
+    * line strip.  Need a separate loop for offset: */
+   if (do_offset) {
+      brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
+      brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist));
+
+      loop = brw_DO(p, BRW_EXECUTE_1);
+      {
+	 brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0));
+	 brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2));
+	    
+	 apply_one_offset(c, v0);
+	    
+	 brw_set_conditionalmod(p, BRW_CONDITIONAL_G);
+	 brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
+      }
+      brw_WHILE(p, loop);
+   }
+
+   /* v1ptr = &inlist[nr_verts]
+    * *v1ptr = v0     (so the last edge wraps around to the first vertex)
+    */
+   brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
+   brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist));
+   brw_ADD(p, get_addr_reg(v1ptr), get_addr_reg(v0ptr), retype(c->reg.nr_verts, BRW_REGISTER_TYPE_UW));
+   brw_ADD(p, get_addr_reg(v1ptr), get_addr_reg(v1ptr), retype(c->reg.nr_verts, BRW_REGISTER_TYPE_UW));
+   brw_MOV(p, deref_1uw(v1ptr, 0), deref_1uw(v0ptr, 0));
+
+   loop = brw_DO(p, BRW_EXECUTE_1);
+   {
+      brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0));
+      brw_MOV(p, get_addr_reg(v1), deref_1uw(v0ptr, 2));
+      brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2));
+
+      /* draw edge if edgeflag != 0 */
+      brw_CMP(p, 
+	      vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, 
+	      deref_1f(v0, c->offset[VERT_RESULT_EDGE]),
+	      brw_imm_f(0));
+      draw_edge = brw_IF(p, BRW_EXECUTE_1);
+      {
+	 brw_clip_emit_vue(c, v0, 1, 0, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_START);
+	 brw_clip_emit_vue(c, v1, 1, 0, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_END);
+      }
+      brw_ENDIF(p, draw_edge);
+
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+      brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
+   }
+   brw_WHILE(p, loop);
+}
+
+
+/* Emit each inlist vertex whose edge flag is set as a one-vertex point primitive. */
+static void emit_points(struct brw_clip_compile *c,
+			boolean do_offset )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *loop;
+   struct brw_instruction *draw_point;
+
+   struct brw_indirect v0 = brw_indirect(0, 0);
+   struct brw_indirect v0ptr = brw_indirect(2, 0);
+
+   brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
+   brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist));
+
+   loop = brw_DO(p, BRW_EXECUTE_1);
+   {
+      brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0));
+      brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2));
+
+      /* draw if edgeflag != 0 
+       */
+      brw_CMP(p, 
+	      vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, 
+	      deref_1f(v0, c->offset[VERT_RESULT_EDGE]),
+	      brw_imm_f(0));
+      draw_point = brw_IF(p, BRW_EXECUTE_1);
+      {
+	 if (do_offset)
+	    apply_one_offset(c, v0);
+
+	 brw_clip_emit_vue(c, v0, 1, 0, (_3DPRIM_POINTLIST << 2) | R02_PRIM_START | R02_PRIM_END);
+      }
+      brw_ENDIF(p, draw_point);
+
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+      brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
+   }
+   brw_WHILE(p, loop);
+}
+
+
+
+
+/* Dispatch on the fill mode to the matching emitter.  CLIP_CULL, or any
+ * value that is not a known mode, must never get here and asserts. */
+static void emit_primitives( struct brw_clip_compile *c,
+			     unsigned mode, 
+			     boolean do_offset )
+{
+   switch (mode) {
+   case CLIP_FILL:
+      brw_clip_tri_emit_polygon(c);
+      break;
+
+   case CLIP_LINE:
+      emit_lines(c, do_offset);
+      break;
+
+   case CLIP_POINT:
+      emit_points(c, do_offset);
+      break;
+
+   case CLIP_CULL:
+   default:
+      assert(0);
+      break;
+   }
+} 
+
+
+/* Emit the primitives for the fill mode(s) of the winding(s) that are not culled. */
+static void emit_unfilled_primitives( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *ccw;
+
+   /* Direction culling has already been done.
+    */
+   if (c->key.fill_ccw != c->key.fill_cw &&
+       c->key.fill_ccw != CLIP_CULL &&
+       c->key.fill_cw != CLIP_CULL)
+   {
+      brw_CMP(p,
+	      vec1(brw_null_reg()),
+	      BRW_CONDITIONAL_GE,
+	      get_element(c->reg.dir, 2),
+	      brw_imm_f(0));
+      /* dir.z >= 0 takes the ccw branch, otherwise the cw branch: */
+      ccw = brw_IF(p, BRW_EXECUTE_1);
+      {
+	 emit_primitives(c, c->key.fill_ccw, c->key.offset_ccw);
+      }
+      ccw = brw_ELSE(p, ccw);
+      {
+	 emit_primitives(c, c->key.fill_cw, c->key.offset_cw);
+      }
+      brw_ENDIF(p, ccw);
+   }
+   else if (c->key.fill_cw != CLIP_CULL) {
+      emit_primitives(c, c->key.fill_cw, c->key.offset_cw);
+   }
+   else if (c->key.fill_ccw != CLIP_CULL) { 
+      emit_primitives(c, c->key.fill_ccw, c->key.offset_ccw);
+   }
+}
+
+
+
+/* Emit code killing the thread when fewer than 3 vertices remain. */
+static void check_nr_verts( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *too_few;
+
+   brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L,
+	   c->reg.nr_verts, brw_imm_d(3));
+
+   too_few = brw_IF(p, BRW_EXECUTE_1);
+   brw_clip_kill_thread(c);
+   brw_ENDIF(p, too_few);
+}
+
+/* Emit the whole clip program for the unfilled-polygon case into c->func. */
+void brw_emit_unfilled_clip( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *do_clip;
+   
+   /* The facing is needed for offset, culling, bfc copy or per-winding modes: */
+   c->need_direction = ((c->key.offset_ccw || c->key.offset_cw) ||
+			(c->key.fill_ccw != c->key.fill_cw) ||
+			c->key.fill_ccw == CLIP_CULL ||
+			c->key.fill_cw == CLIP_CULL ||
+			c->key.copy_bfc_cw ||
+			c->key.copy_bfc_ccw);
+
+   brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6);
+   brw_clip_tri_init_vertices(c);
+
+   assert(c->offset[VERT_RESULT_EDGE]);
+
+   if (c->key.fill_ccw == CLIP_CULL &&
+       c->key.fill_cw == CLIP_CULL) {
+      brw_clip_kill_thread(c);
+      return;
+   }
+
+   merge_edgeflags(c);
+
+   /* Need to use the inlist indirection here: 
+    */
+   if (c->need_direction) 
+      compute_tri_direction(c);
+   
+   if (c->key.fill_ccw == CLIP_CULL ||
+       c->key.fill_cw == CLIP_CULL)
+      cull_direction(c);
+
+   if (c->key.offset_ccw ||
+       c->key.offset_cw)
+      compute_offset(c);
+
+   if (c->key.copy_bfc_ccw ||
+       c->key.copy_bfc_cw)
+      copy_bfc(c);
+
+   /* Need to do this whether we clip or not:
+    */
+   if (c->key.do_flat_shading)
+      brw_clip_tri_flat_shade(c);
+   
+   brw_clip_init_clipmask(c);
+   brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, c->reg.planemask, brw_imm_ud(0));
+   do_clip = brw_IF(p, BRW_EXECUTE_1);
+   {
+      brw_clip_init_planes(c);
+      brw_clip_tri(c);
+      check_nr_verts(c);
+   }
+   brw_ENDIF(p, do_clip);
+   
+   emit_unfilled_primitives(c);
+   brw_clip_kill_thread(c);
+}
+
+
+
diff --git a/src/gallium/drivers/i965simple/brw_clip_util.c b/src/gallium/drivers/i965simple/brw_clip_util.c
new file mode 100644
index 0000000000..6d58ceafff
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_clip_util.c
@@ -0,0 +1,351 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#include "brw_defines.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_util.h"
+#include "brw_clip.h"
+
+
+
+
+/* Hand out the next temporary GRF as a vec4, growing total_grf if needed. */
+static struct brw_reg get_tmp( struct brw_clip_compile *c )
+{
+   struct brw_reg reg = brw_vec4_grf(c->last_tmp, 0);
+
+   c->last_tmp++;
+   if (c->prog_data.total_grf < c->last_tmp)
+      c->prog_data.total_grf = c->last_tmp;
+   return reg;
+}
+/* Give back tmp, but only if it is the most recently allocated temporary. */
+static void release_tmp( struct brw_clip_compile *c, struct brw_reg tmp )
+{
+   if (c->last_tmp-1 == tmp.nr)
+      c->last_tmp -= 1;
+}
+
+/* Pack x (lowest byte), y, z, w (highest byte) into one UD immediate; no masking. */
+static struct brw_reg make_plane_ud(unsigned x, unsigned y, unsigned z, unsigned w)
+{
+   return brw_imm_ud(x | (y<<8) | (z<<16) | (w<<24));
+}
+
+/* Without user clip planes, load the six packed fixed planes (w is always 1). */
+void brw_clip_init_planes( struct brw_clip_compile *c )
+{
+   static const unsigned xyz[6][3] = {
+      { 0, 0, 0xff }, { 0, 0, 1 }, { 0, 0xff, 0 },
+      { 0, 1, 0 }, { 0xff, 0, 0 }, { 1, 0, 0 } };
+   unsigned i;
+
+   if (c->key.nr_userclip)
+      return;
+   for (i = 0; i < 6; i++)
+      brw_MOV(&c->func, get_element_ud(c->reg.fixed_planes, i),
+	      make_plane_ud(xyz[i][0], xyz[i][1], xyz[i][2], 1));
+}
+
+
+
+#define W 3
+
+/* Emit code dividing pos.xyz by pos.w in place, leaving 1/w in pos.w.
+ * Running it a second time therefore restores the original values.
+ */
+static void brw_clip_project_position(struct brw_clip_compile *c, struct brw_reg pos )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg pos_w = get_element(pos, W);
+
+   /* pos.w = 1 / pos.w */
+   brw_math_invert(p, pos_w, pos_w);
+
+   /* pos.xyz *= pos.w (emitted in align16 mode) */
+   brw_set_access_mode(p, BRW_ALIGN_16);
+   brw_MUL(p, brw_writemask(pos, TGSI_WRITEMASK_XYZ), pos, brw_swizzle1(pos, W));
+   brw_set_access_mode(p, BRW_ALIGN_1);
+}
+
+/* Currently a no-op: the body is compiled out (#if 0) and only raises a #warning. */
+static void brw_clip_project_vertex( struct brw_clip_compile *c,
+				     struct brw_indirect vert_addr )
+{
+#if 0
+   struct brw_compile *p = &c->func;
+   struct brw_reg tmp = get_tmp(c);
+
+   /* Fixup position.  Extract from the original vertex and re-project
+    * to screen space:
+    */
+   brw_MOV(p, tmp, deref_4f(vert_addr, c->offset[VERT_RESULT_HPOS]));
+   brw_clip_project_position(c, tmp);
+   brw_MOV(p, deref_4f(vert_addr, c->header_position_offset), tmp);
+
+   release_tmp(c, tmp);
+#else
+         #warning "disabled"
+#endif
+}
+
+
+
+/* Interpolate between two vertices and put the result into a0.0.
+ * Increment a0.0 accordingly.
+ * NOTE: the body is compiled out (#if 0), so this currently emits nothing.
+ */
+void brw_clip_interp_vertex( struct brw_clip_compile *c,
+			     struct brw_indirect dest_ptr,
+			     struct brw_indirect v0_ptr, /* from */
+			     struct brw_indirect v1_ptr, /* to */
+			     struct brw_reg t0,
+			     boolean force_edgeflag)
+{
+#if 0
+   struct brw_compile *p = &c->func;
+   struct brw_reg tmp = get_tmp(c);
+   unsigned i;
+
+   /* Just copy the vertex header:
+    */
+   brw_copy_indirect_to_indirect(p, dest_ptr, v0_ptr, 1);
+
+   /* Iterate over each attribute (could be done in pairs?)
+    */
+   for (i = 0; i < c->nr_attrs; i++) {
+      unsigned delta = i*16 + 32;
+
+      if (delta == c->offset[VERT_RESULT_EDGE]) {
+	 if (force_edgeflag)
+	    brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(1));
+	 else
+	    brw_MOV(p, deref_4f(dest_ptr, delta), deref_4f(v0_ptr, delta));
+      }
+      else {
+	 /* Interpolate:
+	  *
+	  *        New = attr0 + t*attr1 - t*attr0
+	  */
+	 brw_MUL(p,
+		 vec4(brw_null_reg()),
+		 deref_4f(v1_ptr, delta),
+		 t0);
+
+	 brw_MAC(p,
+		 tmp,
+		 negate(deref_4f(v0_ptr, delta)),
+		 t0);
+
+	 brw_ADD(p,
+		 deref_4f(dest_ptr, delta),
+		 deref_4f(v0_ptr, delta),
+		 tmp);
+      }
+   }
+
+   if (i & 1) {
+      unsigned delta = i*16 + 32;
+      brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(0));
+   }
+
+   release_tmp(c, tmp);
+
+   /* Recreate the projected (NDC) coordinate in the new vertex
+    * header:
+    */
+   brw_clip_project_vertex(c, dest_ptr );
+#else
+         #warning "disabled"
+#endif
+}
+
+
+
+
+#define MAX_MRF 16
+/* Emit the copy of one vertex into message registers plus the URB write sending it. */
+void brw_clip_emit_vue(struct brw_clip_compile *c,
+		       struct brw_indirect vert,
+		       boolean allocate,
+		       boolean eot,
+		       unsigned header)
+{
+   struct brw_compile *p = &c->func;
+   unsigned start = c->last_mrf;
+
+   assert(!(allocate && eot));
+
+   /* Cycle through mrf regs - probably futile as we have to wait for
+    * the allocation response anyway.  Also, the order this function
+    * is invoked doesn't correspond to the order the instructions will
+    * be executed, so it won't have any effect in many cases.
+    */
+#if 0
+   if (start + c->nr_regs + 1 >= MAX_MRF)
+      start = 0;
+
+   c->last_mrf = start + c->nr_regs + 1;
+#endif
+
+   /* Copy the vertex from vertn into m1..mN+1:
+    */
+   brw_copy_from_indirect(p, brw_message_reg(start+1), vert, c->nr_regs);
+
+   /* Overwrite PrimType and PrimStart in the message header, for
+    * each vertex in turn:
+    */
+   brw_MOV(p, get_element_ud(c->reg.R0, 2), brw_imm_ud(header));
+
+
+   /* Send each vertex as a separate write to the urb.  This
+    * is different to the concept in brw_sf_emit.c, where
+    * subsequent writes are used to build up a single urb
+    * entry.  Each of these writes instantiates a separate
+    * urb entry - (I think... what about 'allocate'?)
+    */
+   brw_urb_WRITE(p,
+		 allocate ? c->reg.R0 : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
+		 start,
+		 c->reg.R0,
+		 allocate,
+		 1,		/* used */
+		 c->nr_regs + 1, /* msg length */
+		 allocate ? 1 : 0, /* response_length */
+		 eot,		/* eot */
+		 1,		/* writes_complete */
+		 0,		/* urb offset */
+		 BRW_URB_SWIZZLE_NONE);
+}
+
+
+/* Emit the zero-length, end-of-thread URB write that terminates the clip thread. */
+void brw_clip_kill_thread(struct brw_clip_compile *c)
+{
+   struct brw_compile *p = &c->func;
+
+   /* Send an empty message to kill the thread and release any
+    * allocated urb entry:
+    */
+   brw_urb_WRITE(p,
+		 retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
+		 0,		/* presumably the msg reg nr, cf. 'start' in brw_clip_emit_vue */
+		 c->reg.R0,
+		 0,		/* allocate */
+		 0,		/* used */
+		 0, 		/* msg len */
+		 0, 		/* response len */
+		 1, 		/* eot */
+		 1,		/* writes complete */
+		 0,		/* urb offset */
+		 BRW_URB_SWIZZLE_NONE);
+}
+
+
+
+/* Return brw_address() of c->reg.fixed_planes, i.e. of the first plane. */
+struct brw_reg brw_clip_plane0_address( struct brw_clip_compile *c )
+{
+   return brw_address(c->reg.fixed_planes);
+}
+
+/* Return the stride between consecutive clip planes as a UW immediate:
+ * 16 when user clip planes are present, 4 for the packed fixed planes
+ * written by brw_clip_init_planes (presumably a byte count).
+ */
+struct brw_reg brw_clip_plane_stride( struct brw_clip_compile *c )
+{
+   const unsigned stride = c->key.nr_userclip ? 16 : 4;
+
+   return brw_imm_uw(stride);
+}
+
+/* If flatshading, distribute color from provoking vertex prior to
+ * clipping.
+ * NOTE: the body is compiled out (#if 0), so this currently emits nothing.
+ */
+void brw_clip_copy_colors( struct brw_clip_compile *c,
+			   unsigned to, unsigned from )
+{
+#if 0
+   struct brw_compile *p = &c->func;
+
+   if (c->offset[VERT_RESULT_COL0])
+      brw_MOV(p,
+	      byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_COL0]),
+	      byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_COL0]));
+
+   if (c->offset[VERT_RESULT_COL1])
+      brw_MOV(p,
+	      byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_COL1]),
+	      byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_COL1]));
+
+   if (c->offset[VERT_RESULT_BFC0])
+      brw_MOV(p,
+	      byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_BFC0]),
+	      byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_BFC0]));
+
+   if (c->offset[VERT_RESULT_BFC1])
+      brw_MOV(p,
+	      byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_BFC1]),
+	      byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_BFC1]));
+#else
+         #warning "disabled"
+#endif
+}
+
+
+/* Emit code building c->reg.planemask from the outcode bits held in R0.2. */
+void brw_clip_init_clipmask( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg incoming = get_element_ud(c->reg.R0, 2);
+
+   /* Shift so that lowest outcode bit is rightmost:
+    */
+   brw_SHR(p, c->reg.planemask, incoming, brw_imm_ud(26));
+
+   if (c->key.nr_userclip) {
+      struct brw_reg tmp = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UD);
+
+      /* Rearrange userclip outcodes so that they come directly after
+       * the fixed plane bits.
+       */
+      brw_AND(p, tmp, incoming, brw_imm_ud(0x3f<<14));
+      brw_SHR(p, tmp, tmp, brw_imm_ud(8));
+      brw_OR(p, c->reg.planemask, c->reg.planemask, tmp);
+
+      release_tmp(c, tmp);
+   }
+}
+
diff --git a/src/gallium/drivers/i965simple/brw_context.c b/src/gallium/drivers/i965simple/brw_context.c
new file mode 100644
index 0000000000..9b33285bc7
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_context.c
@@ -0,0 +1,139 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "brw_draw.h"
+#include "brw_vs.h"
+#include "brw_tex_layout.h"
+#include "brw_winsys.h"
+
+#include "pipe/internal/p_winsys_screen.h"
+#include "pipe/p_context.h"
+#include "util/u_memory.h"
+#include "pipe/p_screen.h"
+
+
+#ifndef BRW_DEBUG
+int BRW_DEBUG = (0);
+#endif
+
+/* pipe_context destroy hook: let the winsys tear down (if it can), then free brw. */
+static void brw_destroy(struct pipe_context *pipe)
+{
+   struct brw_context *brw = brw_context(pipe);
+   struct brw_winsys *ws = brw->winsys;
+
+   if (ws->destroy)
+      ws->destroy(ws);
+   FREE(brw);
+}
+
+/* pipe_context clear hook: fill the whole surface with clearValue. */
+static void brw_clear(struct pipe_context *pipe, struct pipe_surface *ps,
+                      unsigned clearValue)
+{
+   /* FIXME: corny... */
+
+   /* Clear rectangle: the full surface, starting at the origin. */
+   const int x = 0;
+   const int y = 0;
+   const int w = ps->width;
+   const int h = ps->height;
+
+   pipe->surface_fill(pipe, ps, x, y, w, h, clearValue);
+}
+/* Conservatively report every texture as referenced for both read and write. */
+static unsigned int
+brw_is_texture_referenced( struct pipe_context *pipe,
+			   struct pipe_texture *texture,
+			   unsigned face, unsigned level)
+{
+   /* FIXME: Optimize -- the arguments are not inspected at all yet. */
+   const unsigned int flags = PIPE_REFERENCED_FOR_READ |
+                              PIPE_REFERENCED_FOR_WRITE;
+
+   return flags;
+}
+/* Conservatively report every buffer as referenced for both read and write. */
+static unsigned int
+brw_is_buffer_referenced( struct pipe_context *pipe,
+			  struct pipe_buffer *buf)
+{
+   /* FIXME: Optimize -- the arguments are not inspected at all yet. */
+   const unsigned int flags = PIPE_REFERENCED_FOR_READ |
+                              PIPE_REFERENCED_FOR_WRITE;
+
+   return flags;
+}
+/* Allocate and set up a brw_context; returns its pipe_context, or NULL if allocation fails. */
+struct pipe_context *brw_create(struct pipe_screen *screen,
+                                struct brw_winsys *brw_winsys,
+                                unsigned pci_id)
+{
+   struct brw_context *brw;
+
+   debug_printf("%s: creating brw_context with pci id 0x%x\n",
+                __FUNCTION__, pci_id);
+
+   brw = CALLOC_STRUCT(brw_context);
+   if (brw == NULL)
+      return NULL;
+
+   brw->winsys = brw_winsys;
+   brw->pipe.winsys = screen->winsys;
+   brw->pipe.screen = screen;
+
+   brw->pipe.destroy = brw_destroy;
+   brw->pipe.clear = brw_clear;
+
+   brw->pipe.is_texture_referenced = brw_is_texture_referenced;
+   brw->pipe.is_buffer_referenced = brw_is_buffer_referenced;
+
+   brw_init_surface_functions(brw);
+   brw_init_texture_functions(brw);
+   brw_init_state_functions(brw);
+   brw_init_flush_functions(brw);
+   brw_init_draw_functions( brw );
+
+
+   brw_init_state( brw );
+   /* NOTE(review): pci_id is stored only after the init calls above -- confirm none reads it. */
+   brw->pci_id = pci_id;
+   brw->dirty = ~0;
+   brw->hardware_dirty = ~0;
+   /* Set every byte of wm.bind to 0xff: */
+   memset(&brw->wm.bind, ~0, sizeof(brw->wm.bind));
+
+   return &brw->pipe;
+}
+
diff --git a/src/gallium/drivers/i965simple/brw_context.h b/src/gallium/drivers/i965simple/brw_context.h
new file mode 100644
index 0000000000..3079485180
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_context.h
@@ -0,0 +1,684 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#ifndef BRWCONTEXT_INC
+#define BRWCONTEXT_INC
+
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "tgsi/tgsi_scan.h"
+
+#include "brw_structs.h"
+#include "brw_winsys.h"
+
+
+/* Glossary:
+ *
+ * URB - unified return buffer.  A mid-sized buffer which is
+ * partitioned between the fixed function units and used for passing
+ * values (vertices, primitives, constants) between them.
+ *
+ * CURBE - constant URB entry.  An urb region (entry) used to hold
+ * constant values which the fixed function units can be instructed to
+ * preload into the GRF when spawning a thread.
+ *
+ * VUE - vertex URB entry.  An urb entry holding a vertex and usually
+ * a vertex header.  The header contains control information and
+ * things like primitive type, Begin/end flags and clip codes.
+ *
+ * PUE - primitive URB entry.  An urb entry produced by the setup (SF)
+ * unit holding rasterization and interpolation parameters.
+ *
+ * GRF - general register file.  One of several register files
+ * addressable by programmed threads.  The inputs (r0, payload, curbe,
+ * urb) of the thread are preloaded to this area before the thread is
+ * spawned.  The registers are individually 8 dwords wide and suitable
+ * for general usage.  Registers holding thread input values are not
+ * special and may be overwritten.
+ *
+ * MRF - message register file.  Threads communicate (and terminate)
+ * by sending messages.  Message parameters are placed in contiguous
+ * MRF registers.  All program output is via these messages.  URB
+ * entries are populated by sending a message to the shared URB
+ * function containing the new data, together with a control word,
+ * often an unmodified copy of R0.
+ *
+ * R0 - GRF register 0.  Typically holds control information used when
+ * sending messages to other threads.
+ *
+ * EU or GEN4 EU: The name of the programmable subsystem of the
+ * i965 hardware.  Threads are executed by the EU, the registers
+ * described above are part of the EU architecture.
+ *
+ * Fixed function units:
+ *
+ * CS - Command streamer.  Notional first unit, little software
+ * interaction.  Holds the URB entries used for constant data, ie the
+ * CURBEs.
+ *
+ * VF/VS - Vertex Fetch / Vertex Shader.  The fixed function part of
+ * this unit is responsible for pulling vertices out of vertex buffers
+ * in vram and injecting them into the processing pipe as VUEs.  If
+ * enabled, it first passes them to a VS thread which is a good place
+ * for the driver to implement any active vertex shader.
+ *
+ * GS - Geometry Shader.  This corresponds to a new DX10 concept.  If
+ * enabled, incoming strips etc are passed to GS threads in individual
+ * line/triangle/point units.  The GS thread may perform arbitrary
+ * computation and emit whatever primitives with whatever vertices it
+ * chooses.  This makes GS an excellent place to implement GL's
+ * unfilled polygon modes, though of course it is capable of much
+ * more.  Additionally, GS is used to translate away primitives not
+ * handled by later units, including Quads and Lineloops.
+ *
+ * CLIP - Clipper.  Mesa's clipping algorithms are imported to run on
+ * this unit.  The fixed function part performs cliptesting against
+ * the 6 fixed clipplanes and makes decisions on whether or not the
+ * incoming primitive needs to be passed to a thread for clipping.
+ * User clip planes are handled via cooperation with the VS thread.
+ *
+ * SF - Strips Fans or Setup: Triangles are prepared for
+ * rasterization.  Interpolation coefficients are calculated.
+ * Flatshading and two-side lighting usually performed here.
+ *
+ * WM - Windower.  Interpolation of vertex attributes performed here.
+ * Fragment shader implemented here.  SIMD aspects of EU taken full
+ * advantage of, as pixels are processed in blocks of 16.
+ *
+ * CC - Color Calculator.  No EU threads associated with this unit.
+ * Handles blending and (presumably) depth and stencil testing.
+ */
+
+#define BRW_MAX_CURBE                    (32*16)
+
+struct brw_context;
+struct brw_winsys;
+
+
+/* Raised when we receive new state across the pipe interface:
+ */
+#define BRW_NEW_VIEWPORT                0x1
+#define BRW_NEW_RASTERIZER              0x2
+#define BRW_NEW_FS                      0x4
+#define BRW_NEW_BLEND                   0x8
+#define BRW_NEW_CLIP                    0x10
+#define BRW_NEW_SCISSOR                 0x20
+#define BRW_NEW_STIPPLE                 0x40
+#define BRW_NEW_FRAMEBUFFER             0x80
+#define BRW_NEW_ALPHA_TEST              0x100
+#define BRW_NEW_DEPTH_STENCIL           0x200
+#define BRW_NEW_SAMPLER                 0x400
+#define BRW_NEW_TEXTURE                 0x800
+#define BRW_NEW_CONSTANTS               0x1000
+#define BRW_NEW_VBO                     0x2000
+#define BRW_NEW_VS                      0x4000
+
+/* Raised for other internal events:
+ */
+#define BRW_NEW_URB_FENCE               0x10000
+#define BRW_NEW_PSP                     0x20000
+#define BRW_NEW_CURBE_OFFSETS           0x40000
+#define BRW_NEW_REDUCED_PRIMITIVE       0x80000
+#define BRW_NEW_PRIMITIVE               0x100000
+#define BRW_NEW_SCENE                 0x200000
+#define BRW_NEW_SF_LINKAGE              0x400000
+
+extern int BRW_DEBUG;
+
+#define DEBUG_TEXTURE	0x1
+#define DEBUG_STATE	0x2
+#define DEBUG_IOCTL	0x4
+#define DEBUG_PRIMS	0x8
+#define DEBUG_VERTS	0x10
+#define DEBUG_FALLBACKS	0x20
+#define DEBUG_VERBOSE	0x40
+#define DEBUG_DRI       0x80
+#define DEBUG_DMA       0x100
+#define DEBUG_SANITY    0x200
+#define DEBUG_SYNC      0x400
+#define DEBUG_SLEEP     0x800
+#define DEBUG_PIXEL     0x1000
+#define DEBUG_STATS     0x2000
+#define DEBUG_TILE      0x4000
+#define DEBUG_SINGLE_THREAD   0x8000
+#define DEBUG_WM        0x10000
+#define DEBUG_URB       0x20000
+#define DEBUG_VS        0x40000
+#define DEBUG_BATCH	0x80000
+#define DEBUG_BUFMGR	0x100000
+#define DEBUG_BLIT	0x200000
+#define DEBUG_REGION	0x400000
+#define DEBUG_MIPTREE	0x800000
+
+#define DBG(...) do {						\
+   if (BRW_DEBUG & FILE_DEBUG_FLAG)				\
+      debug_printf(__VA_ARGS__);				\
+} while(0)
+
+#define PRINT(...) do {						\
+   debug_printf(__VA_ARGS__);			                \
+} while(0)
+
+struct brw_state_flags {
+   unsigned cache;
+   unsigned brw;
+};
+
+
+struct brw_vertex_program {
+   struct pipe_shader_state program;
+   struct tgsi_shader_info info;
+   int id;
+};
+
+
+struct brw_fragment_program {
+   struct pipe_shader_state program;
+   struct tgsi_shader_info info;
+   
+   boolean UsesDepth; /* XXX add this to tgsi_shader_info? */
+   int id;
+};
+
+
+struct pipe_setup_linkage {
+   struct {
+      unsigned vp_output:5;
+      unsigned interp_mode:4;
+      unsigned bf_vp_output:5;
+   } fp_input[PIPE_MAX_SHADER_INPUTS];
+
+   unsigned fp_input_count:5;
+   unsigned max_vp_output:5;
+};
+   
+
+
+struct brw_texture {
+   struct pipe_texture base;
+
+   /* Derived from the above:
+    */
+   unsigned stride;
+   unsigned depth_pitch;          /* per-image on i945? */
+   unsigned total_nblocksy;
+
+   unsigned nr_images[PIPE_MAX_TEXTURE_LEVELS];
+
+   /* Explicitly store the offset of each image for each cube face or
+    * depth value.  Pretty much have to accept that hardware formats
+    * are going to be so diverse that there is no unified way to
+    * compute the offsets of depth/cube images within a mipmap level,
+    * so have to store them as a lookup table:
+    */
+   unsigned *image_offset[PIPE_MAX_TEXTURE_LEVELS];   /**< array [depth] of offsets */
+
+   /* Includes image offset tables:
+    */
+   unsigned level_offset[PIPE_MAX_TEXTURE_LEVELS];
+
+   /* The data is held here:
+    */
+   struct pipe_buffer *buffer;
+};
+
+/* Data about a particular attempt to compile a program.  Note that
+ * there can be many of these, each in a different GL state
+ * corresponding to a different brw_wm_prog_key struct, with different
+ * compiled programs.
+ *
+ * max_const is what calculate_curbe_offsets() passes through align()
+ * to size the WM share of the CURBE.  upload_constant_buffer() copies
+ * the first nr_internal_consts entries of internal_const into that
+ * share and asserts that max_const equals nr_internal_consts.
+ */
+
+struct brw_wm_prog_data {
+   unsigned curb_read_length;
+   unsigned urb_read_length;
+
+   unsigned first_curbe_grf;
+   unsigned total_grf;
+   unsigned total_scratch;
+
+   /* Internally generated constants for the CURBE.  These are loaded
+    * ahead of the data from the constant buffer.
+    */
+   const float internal_const[8];
+   unsigned nr_internal_consts;
+   unsigned max_const;
+
+   boolean error;
+};
+
+struct brw_sf_prog_data {
+   unsigned urb_read_length;
+   unsigned total_grf;
+
+   /* Each vertex may have upto 12 attributes, 4 components each,
+    * except WPOS which requires only 2.  (11*4 + 2) == 44 ==> 11
+    * rows.
+    *
+    * Actually we use 4 for each, so call it 12 rows.
+    */
+   unsigned urb_entry_size;
+};
+
+struct brw_clip_prog_data {
+   unsigned curb_read_length;	/* user planes? */
+   unsigned clip_mode;
+   unsigned urb_read_length;
+   unsigned total_grf;
+};
+
+struct brw_gs_prog_data {
+   unsigned urb_read_length;
+   unsigned total_grf;
+};
+
+struct brw_vs_prog_data {
+   unsigned curb_read_length;
+   unsigned urb_read_length;
+   unsigned total_grf;
+   unsigned outputs_written;
+
+   unsigned inputs_read;
+
+   unsigned max_const;
+
+   float    imm_buf[PIPE_MAX_CONSTANT][4];
+   unsigned num_imm;
+   unsigned num_consts;
+
+   /* Used for calculating urb partitions:
+    */
+   unsigned urb_entry_size;
+};
+
+/* Texture units plus one; parenthesized so it expands safely anywhere. */
+#define BRW_MAX_TEX_UNIT 8
+#define BRW_WM_MAX_SURF (BRW_MAX_TEX_UNIT + 1)
+
+/* Create a fixed sized struct for caching binding tables:
+ */
+struct brw_surface_binding_table {
+   unsigned surf_ss_offset[BRW_WM_MAX_SURF];
+};
+
+
+struct brw_cache;
+
+struct brw_mem_pool {
+   struct pipe_buffer *buffer;
+
+   unsigned size;
+   unsigned offset;		/* offset of first free byte */
+
+   struct brw_context *brw;
+};
+
+struct brw_cache_item {
+   unsigned hash;
+   unsigned key_size;		/* for variable-sized keys */
+   const void *key;
+
+   unsigned offset;		/* offset within pool's buffer */
+   unsigned data_size;
+
+   struct brw_cache_item *next;
+};
+
+
+
+struct brw_cache {
+   unsigned id;
+
+   const char *name;
+
+   struct brw_context *brw;
+   struct brw_mem_pool *pool;
+
+   struct brw_cache_item **items;
+   unsigned size, n_items;
+
+   unsigned key_size;		/* for fixed-size keys */
+   unsigned aux_size;
+
+   unsigned last_addr;			/* offset of active item */
+};
+
+
+
+
+/* Considered adding a member to this struct to document which flags
+ * an update might raise so that ordering of the state atoms can be
+ * checked or derived at runtime.  Dropped the idea in favor of having
+ * a debug mode where the state is monitored for flags which are
+ * raised that have already been tested against.
+ */
+struct brw_tracked_state {
+   struct brw_state_flags dirty;
+   void (*update)( struct brw_context *brw );
+};
+
+
+/* Flags for brw->state.cache.
+ */
+#define CACHE_NEW_CC_VP                  (1<<BRW_CC_VP)
+#define CACHE_NEW_CC_UNIT                (1<<BRW_CC_UNIT)
+#define CACHE_NEW_WM_PROG                (1<<BRW_WM_PROG)
+#define CACHE_NEW_SAMPLER_DEFAULT_COLOR  (1<<BRW_SAMPLER_DEFAULT_COLOR)
+#define CACHE_NEW_SAMPLER                (1<<BRW_SAMPLER)
+#define CACHE_NEW_WM_UNIT                (1<<BRW_WM_UNIT)
+#define CACHE_NEW_SF_PROG                (1<<BRW_SF_PROG)
+#define CACHE_NEW_SF_VP                  (1<<BRW_SF_VP)
+#define CACHE_NEW_SF_UNIT                (1<<BRW_SF_UNIT)
+#define CACHE_NEW_VS_UNIT                (1<<BRW_VS_UNIT)
+#define CACHE_NEW_VS_PROG                (1<<BRW_VS_PROG)
+#define CACHE_NEW_GS_UNIT                (1<<BRW_GS_UNIT)
+#define CACHE_NEW_GS_PROG                (1<<BRW_GS_PROG)
+#define CACHE_NEW_CLIP_VP                (1<<BRW_CLIP_VP)
+#define CACHE_NEW_CLIP_UNIT              (1<<BRW_CLIP_UNIT)
+#define CACHE_NEW_CLIP_PROG              (1<<BRW_CLIP_PROG)
+#define CACHE_NEW_SURFACE                (1<<BRW_SS_SURFACE)
+#define CACHE_NEW_SURF_BIND              (1<<BRW_SS_SURF_BIND)
+
+
+
+
+enum brw_mempool_id {
+   BRW_GS_POOL,
+   BRW_SS_POOL,
+   BRW_MAX_POOL
+};
+
+
+struct brw_cached_batch_item {
+   struct header *header;
+   unsigned sz;
+   struct brw_cached_batch_item *next;
+};
+
+
+
+/* Protect against a future where PIPE_MAX_ATTRIBS > 32.  Wouldn't life
+ * be easier if C allowed arrays of packed elements?
+ */
+#define ATTRIB_BIT_DWORDS  ((PIPE_MAX_ATTRIBS+31)/32)
+
+
+
+
+struct brw_vertex_info {
+   unsigned varying;  /* varying:1[PIPE_MAX_ATTRIBS] */
+   unsigned sizes[ATTRIB_BIT_DWORDS * 2]; /* sizes:2[PIPE_MAX_ATTRIBS] */
+};
+
+
+
+
+
+struct brw_context
+{
+   struct pipe_context pipe;
+   struct brw_winsys *winsys;
+
+   unsigned primitive;
+   unsigned reduced_primitive;
+
+   boolean emit_state_always;
+
+   struct {
+      struct brw_state_flags dirty;
+   } state;
+
+
+   struct {
+      const struct pipe_blend_state         *Blend;
+      const struct pipe_depth_stencil_alpha_state *DepthStencil;
+      const struct pipe_poly_stipple        *PolygonStipple;
+      const struct pipe_rasterizer_state    *Raster;
+      const struct pipe_sampler_state       *Samplers[PIPE_MAX_SAMPLERS];
+      const struct brw_vertex_program       *VertexProgram;
+      const struct brw_fragment_program     *FragmentProgram;
+
+      struct pipe_clip_state          Clip;
+      struct pipe_blend_color         BlendColor;
+      struct pipe_scissor_state       Scissor;
+      struct pipe_viewport_state      Viewport;
+      struct pipe_framebuffer_state   FrameBuffer;
+
+      const struct pipe_constant_buffer *Constants[2];
+      const struct brw_texture          *Texture[PIPE_MAX_SAMPLERS];
+   } attribs;
+
+   unsigned num_samplers;
+   unsigned num_textures;
+
+   struct brw_mem_pool pool[BRW_MAX_POOL];
+   struct brw_cache cache[BRW_MAX_CACHE];
+   struct brw_cached_batch_item *cached_batch_items;
+
+   struct {
+
+      /* Arrays with buffer objects to copy non-bufferobj arrays into
+       * for upload:
+       */
+      const struct pipe_vertex_buffer *vbo_array[PIPE_MAX_ATTRIBS];
+
+      struct brw_vertex_element_state inputs[PIPE_MAX_ATTRIBS];
+
+#define BRW_NR_UPLOAD_BUFS 17
+#define BRW_UPLOAD_INIT_SIZE (128*1024)
+
+      /* Summary of size and varying of active arrays, so we can check
+       * for changes to this state:
+       */
+      struct brw_vertex_info info;
+   } vb;
+
+
+   unsigned hardware_dirty;
+   unsigned dirty;
+   unsigned pci_id;
+   /* BRW_NEW_URB_FENCE:
+    */
+   struct {
+      unsigned vsize;		/* vertex size plus header in urb registers */
+      unsigned csize;		/* constant buffer size in urb registers */
+      unsigned sfsize;		/* setup data size in urb registers */
+
+      boolean constrained;
+
+      unsigned nr_vs_entries;
+      unsigned nr_gs_entries;
+      unsigned nr_clip_entries;
+      unsigned nr_sf_entries;
+      unsigned nr_cs_entries;
+
+/*       unsigned vs_size; */
+/*       unsigned gs_size; */
+/*       unsigned clip_size; */
+/*       unsigned sf_size; */
+/*       unsigned cs_size; */
+
+      unsigned vs_start;
+      unsigned gs_start;
+      unsigned clip_start;
+      unsigned sf_start;
+      unsigned cs_start;
+   } urb;
+
+
+   /* BRW_NEW_CURBE_OFFSETS:
+    */
+   struct {
+      unsigned wm_start;
+      unsigned wm_size;
+      unsigned clip_start;
+      unsigned clip_size;
+      unsigned vs_start;
+      unsigned vs_size;
+      unsigned total_size;
+
+      unsigned gs_offset;
+
+      float *last_buf;
+      unsigned last_bufsz;
+   } curbe;
+
+   struct {
+      struct brw_vs_prog_data *prog_data;
+
+      unsigned prog_gs_offset;
+      unsigned state_gs_offset;
+   } vs;
+
+   struct {
+      struct brw_gs_prog_data *prog_data;
+
+      boolean prog_active;
+      unsigned prog_gs_offset;
+      unsigned state_gs_offset;
+   } gs;
+
+   struct {
+      struct brw_clip_prog_data *prog_data;
+
+      unsigned prog_gs_offset;
+      unsigned vp_gs_offset;
+      unsigned state_gs_offset;
+   } clip;
+
+
+   struct {
+      struct brw_sf_prog_data *prog_data;
+
+      struct pipe_setup_linkage linkage;
+
+      unsigned prog_gs_offset;
+      unsigned vp_gs_offset;
+      unsigned state_gs_offset;
+   } sf;
+
+   struct {
+      struct brw_wm_prog_data *prog_data;
+
+//      struct brw_wm_compiler *compile_data;
+
+
+      /**
+       * Array of sampler state uploaded at sampler_gs_offset of BRW_SAMPLER
+       * cache
+       */
+      struct brw_sampler_state sampler[BRW_MAX_TEX_UNIT];
+
+      unsigned render_surf;
+      unsigned nr_surfaces;
+
+      unsigned max_threads;
+      struct pipe_buffer *scratch_buffer;
+      unsigned scratch_buffer_size;
+
+      unsigned sampler_count;
+      unsigned sampler_gs_offset;
+
+      struct brw_surface_binding_table bind;
+      unsigned bind_ss_offset;
+
+      unsigned prog_gs_offset;
+      unsigned state_gs_offset;
+   } wm;
+
+
+   struct {
+      unsigned vp_gs_offset;
+      unsigned state_gs_offset;
+   } cc;
+
+
+   /* Used to give every program string a unique id
+    */
+   unsigned program_id;
+};
+
+/* Pack four channel values into one word: r in the top byte, a in the low. */
+#define BRW_PACKCOLOR8888(r,g,b,a) \
+   (((unsigned)(r) << 24) | ((unsigned)(g) << 16) | ((unsigned)(b) << 8) | (unsigned)(a))
+
+/*======================================================================
+ * brw_vtbl.c
+ */
+void brw_do_flush( struct brw_context *brw,
+		   unsigned flags );
+
+
+/*======================================================================
+ * brw_state.c
+ */
+void brw_validate_state(struct brw_context *brw);
+void brw_init_state(struct brw_context *brw);
+void brw_destroy_state(struct brw_context *brw);
+
+
+/*======================================================================
+ * brw_tex.c
+ */
+void brwUpdateTextureState( struct brw_context *brw );
+
+
+/* brw_urb.c
+ */
+void brw_upload_urb_fence(struct brw_context *brw);
+
+void brw_upload_constant_buffer_state(struct brw_context *brw);
+
+void brw_init_surface_functions(struct brw_context *brw);
+void brw_init_state_functions(struct brw_context *brw);
+void brw_init_flush_functions(struct brw_context *brw);
+void brw_init_string_functions(struct brw_context *brw);
+
+/*======================================================================
+ * Inline conversion functions.  These are better-typed than the
+ * macros used previously:
+ */
+static inline struct brw_context *brw_context( struct pipe_context *ctx )
+{
+   /* Valid because pipe is the first member of struct brw_context. */
+   return (struct brw_context *)ctx;
+}
+
+#endif
+
diff --git a/src/gallium/drivers/i965simple/brw_curbe.c b/src/gallium/drivers/i965simple/brw_curbe.c
new file mode 100644
index 0000000000..904cde8e30
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_curbe.c
@@ -0,0 +1,369 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "brw_state.h"
+#include "brw_batch.h"
+#include "brw_util.h"
+#include "brw_wm.h"
+#include "pipe/p_state.h"
+#include "pipe/internal/p_winsys_screen.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#define FILE_DEBUG_FLAG DEBUG_FALLBACKS
+
+/* Partition the CURBE between the various users of constant values.
+ *
+ * Each user's size is its program's max_const passed through
+ * align(..., 16).  A new layout is only written - and
+ * BRW_NEW_CURBE_OFFSETS raised - when a region has outgrown its slot,
+ * the clip size changed, or the total fell below a quarter of the old.
+ *
+ * Writes brw->curbe.{wm,clip,vs}_{start,size} and brw->curbe.total_size;
+ * the three regions are packed back to back starting at 0.
+ */
+static void calculate_curbe_offsets( struct brw_context *brw )
+{
+   /* CACHE_NEW_WM_PROG */
+   const unsigned wm_regs = align(brw->wm.prog_data->max_const, 16);
+
+   /* BRW_NEW_VS */
+   const unsigned vs_regs = align(brw->vs.prog_data->max_const, 16);
+
+   /* BRW_NEW_CLIP ?  Nothing is reserved for clip planes.  The disabled
+    * code this replaces computed align(nr_planes * 4, 16) with
+    * nr_planes = 6 + the number of enabled user clip planes.
+    */
+   const unsigned clip_regs = 0;
+
+   const unsigned wanted = wm_regs + clip_regs + vs_regs;
+   boolean too_small, clip_differs, oversized;
+
+   /* Exceeding the limit below can happen - what to do?  Probably
+    * rather than falling back, the best thing to do is emit programs
+    * which code the constants as immediate values.  Could do this
+    * either as a static cap on WM and VS, or adaptively.
+    *
+    * Unfortunately, this is currently dependent on the results of the
+    * program generation process (in the case of wm), so this would
+    * introduce the need to re-generate programs in the event of a
+    * curbe allocation failure.
+    *
+    * Max size is 32 - just large enough to hold the 128 parameters
+    * allowed by the fragment and vertex program api's.  It's not
+    * clear what happens when both VP and FP want to use 128
+    * parameters, though.
+    */
+   assert(wanted <= 32);
+
+   /* Lazy resize: keep the current layout unless one of these holds.
+    */
+   too_small = (wm_regs > brw->curbe.wm_size ||
+                vs_regs > brw->curbe.vs_size);
+   clip_differs = (clip_regs != brw->curbe.clip_size);
+   oversized = (wanted < brw->curbe.total_size / 4 &&
+                brw->curbe.total_size > 16);
+
+   if (!too_small && !clip_differs && !oversized)
+      return;
+
+   /* Calculate a new layout, packed in the order WM, CLIP, VS:
+    */
+   brw->curbe.wm_start = 0;
+   brw->curbe.wm_size = wm_regs;
+
+   brw->curbe.clip_start = wm_regs;
+   brw->curbe.clip_size = clip_regs;
+
+   brw->curbe.vs_start = wm_regs + clip_regs;
+   brw->curbe.vs_size = vs_regs;
+
+   brw->curbe.total_size = wanted;
+
+#if 0
+   DBG("curbe wm %d+%d clip %d+%d vs %d+%d\n",
+       brw->curbe.wm_start, brw->curbe.wm_size,
+       brw->curbe.clip_start, brw->curbe.clip_size,
+       brw->curbe.vs_start, brw->curbe.vs_size );
+#endif
+
+   brw->state.dirty.brw |= BRW_NEW_CURBE_OFFSETS;
+}
+
+/* State atom whose update hook is calculate_curbe_offsets. */
+const struct brw_tracked_state brw_curbe_offsets = {
+   .dirty = {
+      .brw  = (BRW_NEW_CLIP |
+	       BRW_NEW_VS),
+      .cache = CACHE_NEW_WM_PROG
+   },
+   .update = calculate_curbe_offsets
+};
+
+
+
+/* Emit the state packet defining the number of curbes within CS's urb
+ * allocation.  Multiple urb entries -> multiple curbes.  These will be
+ * used by fixed-function hardware in a double-buffering scheme to avoid
+ * a pipeline stall each time the contents of the curbe is changed.
+ */
+void brw_upload_constant_buffer_state(struct brw_context *brw)
+{
+   /* BRW_NEW_URB_FENCE */
+   const unsigned nr_entries = brw->urb.nr_cs_entries;
+   const unsigned entry_size = brw->urb.csize;
+   struct brw_constant_buffer_state packet;
+
+   assert(nr_entries);
+
+   /* It appears that this is the state packet for the CS unit, ie. the
+    * urb entries detailed here are housed in the CS range from the
+    * URB_FENCE command. */
+   memset(&packet, 0, sizeof(packet));
+   packet.header.opcode = CMD_CONST_BUFFER_STATE;
+   packet.header.length = sizeof(packet)/4 - 2;
+   packet.bits0.nr_urb_entries = nr_entries;
+   packet.bits0.urb_entry_size = entry_size - 1;
+   BRW_CACHED_BATCH_STRUCT(brw, &packet);
+}
+
+/* The six fixed clip planes, four floats each, sent ahead of user planes. */
+static const float fixed_plane[6][4] = {
+   { 0,    0,   -1, 1 },
+   { 0,    0,    1, 1 },
+   { 0,   -1,    0, 1 },
+   { 0,    1,    0, 1 },
+   {-1,    0,    0, 1 },
+   { 1,    0,    0, 1 }
+};
+
+/* Upload a new set of constants and emit the CONST_BUFFER packet.  Too
+ * much variability to go into the cache mechanism, so the staging
+ * buffer is compared against the previously uploaded one instead and
+ * only copied into the GS pool when it differs.
+ *
+ * NOTE(review): region offsets are start * 16 but the staging buffer
+ * holds only total_size floats - confirm the intended units.
+ */
+static void upload_constant_buffer(struct brw_context *brw)
+{
+   struct brw_mem_pool *pool = &brw->pool[BRW_GS_POOL];
+   unsigned sz = brw->curbe.total_size;
+   unsigned bufsz = sz * sizeof(float);
+   boolean unchanged;
+   float *buf;
+   unsigned i;
+
+   if (sz == 0) {
+      struct brw_constant_buffer cb;
+
+      /* Clear the packet so no uninitialized bits reach the batch: */
+      memset(&cb, 0, sizeof(cb));
+      cb.header.opcode = CMD_CONST_BUFFER;
+      cb.header.length = sizeof(cb)/4 - 2;
+      cb.header.valid = 0;
+      cb.bits0.buffer_length = 0;
+      cb.bits0.buffer_address = 0;
+      BRW_BATCH_STRUCT(brw, &cb);
+
+      /* free(NULL) is a no-op, so no guard is needed: */
+      free(brw->curbe.last_buf);
+      brw->curbe.last_buf = NULL;
+      brw->curbe.last_bufsz = 0;
+      return;
+   }
+
+   /* Zero-filled, so anything not written below is uploaded as 0: */
+   buf = (float *)calloc(sz, sizeof(float));
+   if (buf == NULL)
+      return;
+
+   if (brw->curbe.wm_size) {
+      unsigned offset = brw->curbe.wm_start * 16;
+
+      /* The fragment constant buffer is not copied here; only the
+       * internally generated constants are, and the assert below
+       * checks that the program expects nothing else:
+       */
+      for (i = 0; i < brw->wm.prog_data->nr_internal_consts; i++)
+         buf[offset + i] = brw->wm.prog_data->internal_const[i];
+
+      assert(brw->wm.prog_data->max_const ==
+             brw->wm.prog_data->nr_internal_consts);
+   }
+
+   /* The clipplanes are actually delivered to both CLIP and VS units.
+    * VS uses them to calculate the outcode bitmasks.
+    */
+   if (brw->curbe.clip_size) {
+      unsigned offset = brw->curbe.clip_start * 16;
+      unsigned j;
+
+      /* If any planes are going this way, send them all this way: */
+      for (i = 0; i < 6; i++) {
+         buf[offset + i * 4 + 0] = fixed_plane[i][0];
+         buf[offset + i * 4 + 1] = fixed_plane[i][1];
+         buf[offset + i * 4 + 2] = fixed_plane[i][2];
+         buf[offset + i * 4 + 3] = fixed_plane[i][3];
+      }
+
+      /* Clip planes: BRW_NEW_CLIP.  i keeps counting so the user
+       * planes land directly after the six fixed ones:
+       */
+      for (j = 0; j < brw->attribs.Clip.nr; j++) {
+         buf[offset + i * 4 + 0] = brw->attribs.Clip.ucp[j][0];
+         buf[offset + i * 4 + 1] = brw->attribs.Clip.ucp[j][1];
+         buf[offset + i * 4 + 2] = brw->attribs.Clip.ucp[j][2];
+         buf[offset + i * 4 + 3] = brw->attribs.Clip.ucp[j][3];
+         i++;
+      }
+   }
+
+   if (brw->curbe.vs_size) {
+      unsigned offset = brw->curbe.vs_start * 16;
+      const struct pipe_constant_buffer *cbuffer = brw->attribs.Constants[0];
+      struct pipe_winsys *ws = brw->pipe.winsys;
+
+      /* FIXME: buffer size is num_consts + num_immediates */
+      if (brw->vs.prog_data->num_consts) {
+         /* map the vertex constant buffer and copy to curbe: */
+         void *data = ws->buffer_map(ws, cbuffer->buffer, 0);
+         /* FIXME: this is wrong. the cbuffer->buffer->size currently
+          * represents size of consts + immediates. so if we'll
+          * have both we'll copy over the end of the buffer
+          * with the subsequent memcpy */
+         memcpy(&buf[offset], data, cbuffer->buffer->size);
+         ws->buffer_unmap(ws, cbuffer->buffer);
+         /* offset indexes floats while buffer->size counts bytes: */
+         offset += cbuffer->buffer->size / sizeof(float);
+      }
+      /*immediates*/
+      if (brw->vs.prog_data->num_imm) {
+         memcpy(&buf[offset], brw->vs.prog_data->imm_buf,
+                brw->vs.prog_data->num_imm * 4 * sizeof(float));
+      }
+   }
+
+   /* Compare only when the sizes match, so memcmp never reads past the
+    * end of the previous buffer:
+    */
+   unchanged = (brw->curbe.last_buf &&
+                bufsz == brw->curbe.last_bufsz &&
+                memcmp(buf, brw->curbe.last_buf, bufsz) == 0);
+
+   if (1) {
+      for (i = 0; i < sz; i+=4)
+         debug_printf("curbe %d.%d: %f %f %f %f\n", i/8, i&4,
+                      buf[i+0], buf[i+1], buf[i+2], buf[i+3]);
+
+      debug_printf("last_buf %p buf %p sz %d/%d cmp %d\n",
+                   brw->curbe.last_buf, buf,
+                   bufsz, brw->curbe.last_bufsz,
+                   unchanged ? 0 : -1);
+   }
+
+   if (unchanged) {
+      /* Drop the new copy, but still emit the packet below. */
+      free(buf);
+   }
+   else {
+      /* The context keeps buf as the new reference copy: */
+      free(brw->curbe.last_buf);
+      brw->curbe.last_buf = buf;
+      brw->curbe.last_bufsz = bufsz;
+
+      if (!brw_pool_alloc(pool, bufsz, 1 << 6,
+                          &brw->curbe.gs_offset)) {
+         debug_printf("out of GS memory for curbe\n");
+         assert(0);
+         return;
+      }
+
+      /* Copy data to the buffer: */
+      brw->winsys->buffer_subdata_typed(brw->winsys, pool->buffer,
+                                        brw->curbe.gs_offset, bufsz,
+                                        buf, BRW_CONSTANT_BUFFER );
+   }
+
+   /* TODO: only emit the constant_buffer packet when necessary, ie:
+      - contents have changed
+      - offset has changed
+      - hw requirements due to other packets emitted.
+   */
+   {
+      struct brw_constant_buffer cb;
+
+      memset(&cb, 0, sizeof(cb));
+      cb.header.opcode = CMD_CONST_BUFFER;
+      cb.header.length = sizeof(cb)/4 - 2;
+      cb.header.valid = 1;
+      cb.bits0.buffer_length = sz - 1;
+      cb.bits0.buffer_address = brw->curbe.gs_offset >> 6;
+
+      /* Because this provokes an action (ie copy the constants into the
+       * URB), it shouldn't be shortcircuited if identical to the
+       * previous time - because eg. the urb destination may have
+       * changed, or the urb contents different to last time.
+       *
+       * Note that the data referred to is actually copied internally,
+       * not just used in place according to passed pointer.
+       *
+       * It appears that the CS unit takes care of using each available
+       * URB entry (Const URB Entry == CURBE) in turn, and issuing
+       * flushes as necessary when doublebuffering of CURBEs isn't
+       * possible.
+       */
+      BRW_BATCH_STRUCT(brw, &cb);
+   }
+}
+
+/* This tracked state is unique in that the state it monitors varies
+ * dynamically depending on the parameters tracked by the fragment and
+ * vertex programs.  This is the template used as a starting point,
+ * each context will maintain a copy of this internally and update as
+ * required.
+ */
+const struct brw_tracked_state brw_constant_buffer = {
+   .dirty = {
+      .brw  = (BRW_NEW_CLIP | /* attribs.Clip user planes are read above */
+	       BRW_NEW_CONSTANTS | /* attribs.Constants[0] is read above */
+	       BRW_NEW_URB_FENCE | /* Implicit - hardware requires this, not used above */
+	       BRW_NEW_PSP | /* Implicit - hardware requires this, not used above */
+	       BRW_NEW_CURBE_OFFSETS),
+      .cache = (CACHE_NEW_WM_PROG)
+   },
+   .update = upload_constant_buffer
+};
+
diff --git a/src/gallium/drivers/i965simple/brw_defines.h b/src/gallium/drivers/i965simple/brw_defines.h
new file mode 100644
index 0000000000..715d2d2d01
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_defines.h
@@ -0,0 +1,870 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#ifndef BRW_DEFINES_H
+#define BRW_DEFINES_H
+
+/* MI commands and the field values used with them:
+ */
+#define MI_NOOP                              0x00
+#define MI_USER_INTERRUPT                    0x02
+#define MI_WAIT_FOR_EVENT                    0x03
+#define MI_FLUSH                             0x04
+#define MI_REPORT_HEAD                       0x07
+#define MI_ARB_ON_OFF                        0x08
+#define MI_BATCH_BUFFER_END                  0x0A
+#define MI_OVERLAY_FLIP                      0x11
+#define MI_LOAD_SCAN_LINES_INCL              0x12
+#define MI_LOAD_SCAN_LINES_EXCL              0x13
+#define MI_DISPLAY_BUFFER_INFO               0x14
+#define MI_SET_CONTEXT                       0x18
+#define MI_STORE_DATA_IMM                    0x20
+#define MI_STORE_DATA_INDEX                  0x21
+#define MI_LOAD_REGISTER_IMM                 0x22
+#define MI_STORE_REGISTER_MEM                0x24
+#define MI_BATCH_BUFFER_START                0x31
+
+#define MI_SYNCHRONOUS_FLIP                  0x0
+#define MI_ASYNCHRONOUS_FLIP                 0x1
+
+#define MI_BUFFER_SECURE                     0x0
+#define MI_BUFFER_NONSECURE                  0x1
+
+#define MI_ARBITRATE_AT_CHAIN_POINTS         0x0
+#define MI_ARBITRATE_BETWEEN_INSTS           0x1
+#define MI_NO_ARBITRATION                    0x3
+
+#define MI_CONDITION_CODE_WAIT_DISABLED      0x0
+#define MI_CONDITION_CODE_WAIT_0             0x1
+#define MI_CONDITION_CODE_WAIT_1             0x2
+#define MI_CONDITION_CODE_WAIT_2             0x3
+#define MI_CONDITION_CODE_WAIT_3             0x4
+#define MI_CONDITION_CODE_WAIT_4             0x5
+
+#define MI_DISPLAY_PIPE_A                    0x0
+#define MI_DISPLAY_PIPE_B                    0x1
+
+#define MI_DISPLAY_PLANE_A                   0x0
+#define MI_DISPLAY_PLANE_B                   0x1
+#define MI_DISPLAY_PLANE_C                   0x2
+
+#define MI_STANDARD_FLIP                                 0x0
+#define MI_ENQUEUE_FLIP_PERFORM_BASE_FRAME_NUMBER_LOAD   0x1
+#define MI_ENQUEUE_FLIP_TARGET_FRAME_NUMBER_RELATIVE     0x2
+#define MI_ENQUEUE_FLIP_ABSOLUTE_TARGET_FRAME_NUMBER     0x3
+
+#define MI_PHYSICAL_ADDRESS                  0x0
+#define MI_VIRTUAL_ADDRESS                   0x1
+
+#define MI_BUFFER_MEMORY_MAIN                0x0
+#define MI_BUFFER_MEMORY_GTT                 0x2
+#define MI_BUFFER_MEMORY_PER_PROCESS_GTT     0x3
+
+#define MI_FLIP_CONTINUE                     0x0
+#define MI_FLIP_ON                           0x1
+#define MI_FLIP_OFF                          0x2
+
+#define MI_UNTRUSTED_REGISTER_SPACE          0x0
+#define MI_TRUSTED_REGISTER_SPACE            0x1
+
+/* 3D state:
+ */
+#define _3DOP_3DSTATE_PIPELINED       0x0
+#define _3DOP_3DSTATE_NONPIPELINED    0x1
+#define _3DOP_3DCONTROL               0x2
+#define _3DOP_3DPRIMITIVE             0x3
+
+#define _3DSTATE_PIPELINED_POINTERS       0x00
+#define _3DSTATE_BINDING_TABLE_POINTERS   0x01
+#define _3DSTATE_VERTEX_BUFFERS           0x08
+#define _3DSTATE_VERTEX_ELEMENTS          0x09
+#define _3DSTATE_INDEX_BUFFER             0x0A
+#define _3DSTATE_VF_STATISTICS            0x0B
+#define _3DSTATE_DRAWING_RECTANGLE            0x00
+#define _3DSTATE_CONSTANT_COLOR               0x01
+#define _3DSTATE_SAMPLER_PALETTE_LOAD         0x02
+#define _3DSTATE_CHROMA_KEY                   0x04
+#define _3DSTATE_DEPTH_BUFFER                 0x05
+#define _3DSTATE_POLY_STIPPLE_OFFSET          0x06
+#define _3DSTATE_POLY_STIPPLE_PATTERN         0x07
+#define _3DSTATE_LINE_STIPPLE                 0x08
+#define _3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP    0x09
+#define _3DCONTROL    0x00
+#define _3DPRIMITIVE  0x00
+
+#define PIPE_CONTROL_NOWRITE          0x00
+#define PIPE_CONTROL_WRITEIMMEDIATE   0x01
+#define PIPE_CONTROL_WRITEDEPTH       0x02
+#define PIPE_CONTROL_WRITETIMESTAMP   0x03
+
+#define PIPE_CONTROL_GTTWRITE_PROCESS_LOCAL 0x00
+#define PIPE_CONTROL_GTTWRITE_GLOBAL        0x01
+
+#define _3DPRIM_POINTLIST         0x01
+#define _3DPRIM_LINELIST          0x02
+#define _3DPRIM_LINESTRIP         0x03
+#define _3DPRIM_TRILIST           0x04
+#define _3DPRIM_TRISTRIP          0x05
+#define _3DPRIM_TRIFAN            0x06
+#define _3DPRIM_QUADLIST          0x07
+#define _3DPRIM_QUADSTRIP         0x08
+#define _3DPRIM_LINELIST_ADJ      0x09
+#define _3DPRIM_LINESTRIP_ADJ     0x0A
+#define _3DPRIM_TRILIST_ADJ       0x0B
+#define _3DPRIM_TRISTRIP_ADJ      0x0C
+#define _3DPRIM_TRISTRIP_REVERSE  0x0D
+#define _3DPRIM_POLYGON           0x0E
+#define _3DPRIM_RECTLIST          0x0F
+#define _3DPRIM_LINELOOP          0x10
+#define _3DPRIM_POINTLIST_BF      0x11
+#define _3DPRIM_LINESTRIP_CONT    0x12
+#define _3DPRIM_LINESTRIP_BF      0x13
+#define _3DPRIM_LINESTRIP_CONT_BF 0x14
+#define _3DPRIM_TRIFAN_NOSTIPPLE  0x15
+
+#define _3DPRIM_VERTEXBUFFER_ACCESS_SEQUENTIAL 0
+#define _3DPRIM_VERTEXBUFFER_ACCESS_RANDOM     1
+
+#define BRW_ANISORATIO_2     0
+#define BRW_ANISORATIO_4     1
+#define BRW_ANISORATIO_6     2
+#define BRW_ANISORATIO_8     3
+#define BRW_ANISORATIO_10    4
+#define BRW_ANISORATIO_12    5
+#define BRW_ANISORATIO_14    6
+#define BRW_ANISORATIO_16    7
+
+#define BRW_BLENDFACTOR_ONE                 0x1
+#define BRW_BLENDFACTOR_SRC_COLOR           0x2
+#define BRW_BLENDFACTOR_SRC_ALPHA           0x3
+#define BRW_BLENDFACTOR_DST_ALPHA           0x4
+#define BRW_BLENDFACTOR_DST_COLOR           0x5
+#define BRW_BLENDFACTOR_SRC_ALPHA_SATURATE  0x6
+#define BRW_BLENDFACTOR_CONST_COLOR         0x7
+#define BRW_BLENDFACTOR_CONST_ALPHA         0x8
+#define BRW_BLENDFACTOR_SRC1_COLOR          0x9
+#define BRW_BLENDFACTOR_SRC1_ALPHA          0x0A
+#define BRW_BLENDFACTOR_ZERO                0x11
+#define BRW_BLENDFACTOR_INV_SRC_COLOR       0x12
+#define BRW_BLENDFACTOR_INV_SRC_ALPHA       0x13
+#define BRW_BLENDFACTOR_INV_DST_ALPHA       0x14
+#define BRW_BLENDFACTOR_INV_DST_COLOR       0x15
+#define BRW_BLENDFACTOR_INV_CONST_COLOR     0x17
+#define BRW_BLENDFACTOR_INV_CONST_ALPHA     0x18
+#define BRW_BLENDFACTOR_INV_SRC1_COLOR      0x19
+#define BRW_BLENDFACTOR_INV_SRC1_ALPHA      0x1A
+
+#define BRW_BLENDFUNCTION_ADD               0
+#define BRW_BLENDFUNCTION_SUBTRACT          1
+#define BRW_BLENDFUNCTION_REVERSE_SUBTRACT  2
+#define BRW_BLENDFUNCTION_MIN               3
+#define BRW_BLENDFUNCTION_MAX               4
+
+#define BRW_ALPHATEST_FORMAT_UNORM8         0
+#define BRW_ALPHATEST_FORMAT_FLOAT32        1
+
+#define BRW_CHROMAKEY_KILL_ON_ANY_MATCH  0
+#define BRW_CHROMAKEY_REPLACE_BLACK      1
+
+#define BRW_CLIP_API_OGL     0
+#define BRW_CLIP_API_DX      1
+
+#define BRW_CLIPMODE_NORMAL              0
+#define BRW_CLIPMODE_CLIP_ALL            1
+#define BRW_CLIPMODE_CLIP_NON_REJECTED   2
+#define BRW_CLIPMODE_REJECT_ALL          3
+#define BRW_CLIPMODE_ACCEPT_ALL          4
+
+#define BRW_CLIP_NDCSPACE     0
+#define BRW_CLIP_SCREENSPACE  1
+
+#define BRW_COMPAREFUNCTION_ALWAYS       0
+#define BRW_COMPAREFUNCTION_NEVER        1
+#define BRW_COMPAREFUNCTION_LESS         2
+#define BRW_COMPAREFUNCTION_EQUAL        3
+#define BRW_COMPAREFUNCTION_LEQUAL       4
+#define BRW_COMPAREFUNCTION_GREATER      5
+#define BRW_COMPAREFUNCTION_NOTEQUAL     6
+#define BRW_COMPAREFUNCTION_GEQUAL       7
+
+#define BRW_COVERAGE_PIXELS_HALF     0
+#define BRW_COVERAGE_PIXELS_1        1
+#define BRW_COVERAGE_PIXELS_2        2
+#define BRW_COVERAGE_PIXELS_4        3
+
+#define BRW_CULLMODE_BOTH        0
+#define BRW_CULLMODE_NONE        1
+#define BRW_CULLMODE_FRONT       2
+#define BRW_CULLMODE_BACK        3
+
+#define BRW_DEFAULTCOLOR_R8G8B8A8_UNORM      0
+#define BRW_DEFAULTCOLOR_R32G32B32A32_FLOAT  1
+
+#define BRW_DEPTHFORMAT_D32_FLOAT_S8X24_UINT     0
+#define BRW_DEPTHFORMAT_D32_FLOAT                1
+#define BRW_DEPTHFORMAT_D24_UNORM_S8_UINT        2
+#define BRW_DEPTHFORMAT_D16_UNORM                5
+
+#define BRW_FLOATING_POINT_IEEE_754        0
+#define BRW_FLOATING_POINT_NON_IEEE_754    1
+
+#define BRW_FRONTWINDING_CW      0
+#define BRW_FRONTWINDING_CCW     1
+
+#define BRW_SPRITE_POINT_ENABLE  16
+
+#define BRW_INDEX_BYTE     0
+#define BRW_INDEX_WORD     1
+#define BRW_INDEX_DWORD    2
+
+#define BRW_LOGICOPFUNCTION_CLEAR            0
+#define BRW_LOGICOPFUNCTION_NOR              1
+#define BRW_LOGICOPFUNCTION_AND_INVERTED     2
+#define BRW_LOGICOPFUNCTION_COPY_INVERTED    3
+#define BRW_LOGICOPFUNCTION_AND_REVERSE      4
+#define BRW_LOGICOPFUNCTION_INVERT           5
+#define BRW_LOGICOPFUNCTION_XOR              6
+#define BRW_LOGICOPFUNCTION_NAND             7
+#define BRW_LOGICOPFUNCTION_AND              8
+#define BRW_LOGICOPFUNCTION_EQUIV            9
+#define BRW_LOGICOPFUNCTION_NOOP             10
+#define BRW_LOGICOPFUNCTION_OR_INVERTED      11
+#define BRW_LOGICOPFUNCTION_COPY             12
+#define BRW_LOGICOPFUNCTION_OR_REVERSE       13
+#define BRW_LOGICOPFUNCTION_OR               14
+#define BRW_LOGICOPFUNCTION_SET              15
+
+#define BRW_MAPFILTER_NEAREST        0x0
+#define BRW_MAPFILTER_LINEAR         0x1
+#define BRW_MAPFILTER_ANISOTROPIC    0x2
+
+#define BRW_MIPFILTER_NONE        0
+#define BRW_MIPFILTER_NEAREST     1
+#define BRW_MIPFILTER_LINEAR      3
+
+#define BRW_POLYGON_FRONT_FACING     0
+#define BRW_POLYGON_BACK_FACING      1
+
+#define BRW_PREFILTER_ALWAYS     0x0
+#define BRW_PREFILTER_NEVER      0x1
+#define BRW_PREFILTER_LESS       0x2
+#define BRW_PREFILTER_EQUAL      0x3
+#define BRW_PREFILTER_LEQUAL     0x4
+#define BRW_PREFILTER_GREATER    0x5
+#define BRW_PREFILTER_NOTEQUAL   0x6
+#define BRW_PREFILTER_GEQUAL     0x7
+
+#define BRW_PROVOKING_VERTEX_0    0
+#define BRW_PROVOKING_VERTEX_1    1
+#define BRW_PROVOKING_VERTEX_2    2
+
+#define BRW_RASTRULE_UPPER_LEFT  0
+#define BRW_RASTRULE_UPPER_RIGHT 1
+/* BRW_RASTRULE_LOWER_LEFT and BRW_RASTRULE_LOWER_RIGHT (below) are
+ * listed as "Reserved, but not seen as useful" in Intel documentation
+ * (page 212, "Point Rasterization Rule", section 7.4 "SF Pipeline
+ * State Summary", of document "Intel® 965 Express Chipset Family and
+ * Intel® G35 Express Chipset Graphics Controller Programmer's
+ * Reference Manual, Volume 2: 3D/Media", Revision 1.0b as of January
+ * 2008, available at
+ *     http://intellinuxgraphics.org/documentation.html
+ * at the time of this writing).
+ *
+ * These values appear to be supported on at least some
+ * i965-family devices, and BRW_RASTRULE_LOWER_RIGHT
+ * is useful when using OpenGL to render to an FBO
+ * (which has the pixel coordinate Y orientation inverted
+ * with respect to the normal OpenGL pixel coordinate system).
+ */
+#define BRW_RASTRULE_LOWER_LEFT  2
+#define BRW_RASTRULE_LOWER_RIGHT 3
+
+#define BRW_RENDERTARGET_CLAMPRANGE_UNORM    0
+#define BRW_RENDERTARGET_CLAMPRANGE_SNORM    1
+#define BRW_RENDERTARGET_CLAMPRANGE_FORMAT   2
+
+#define BRW_STENCILOP_KEEP               0
+#define BRW_STENCILOP_ZERO               1
+#define BRW_STENCILOP_REPLACE            2
+#define BRW_STENCILOP_INCRSAT            3
+#define BRW_STENCILOP_DECRSAT            4
+#define BRW_STENCILOP_INCR               5
+#define BRW_STENCILOP_DECR               6
+#define BRW_STENCILOP_INVERT             7
+
+#define BRW_SURFACE_MIPMAPLAYOUT_BELOW   0
+#define BRW_SURFACE_MIPMAPLAYOUT_RIGHT   1
+
+#define BRW_SURFACEFORMAT_R32G32B32A32_FLOAT             0x000
+#define BRW_SURFACEFORMAT_R32G32B32A32_SINT              0x001
+#define BRW_SURFACEFORMAT_R32G32B32A32_UINT              0x002
+#define BRW_SURFACEFORMAT_R32G32B32A32_UNORM             0x003
+#define BRW_SURFACEFORMAT_R32G32B32A32_SNORM             0x004
+#define BRW_SURFACEFORMAT_R64G64_FLOAT                   0x005
+#define BRW_SURFACEFORMAT_R32G32B32X32_FLOAT             0x006
+#define BRW_SURFACEFORMAT_R32G32B32A32_SSCALED           0x007
+#define BRW_SURFACEFORMAT_R32G32B32A32_USCALED           0x008
+#define BRW_SURFACEFORMAT_R32G32B32_FLOAT                0x040
+#define BRW_SURFACEFORMAT_R32G32B32_SINT                 0x041
+#define BRW_SURFACEFORMAT_R32G32B32_UINT                 0x042
+#define BRW_SURFACEFORMAT_R32G32B32_UNORM                0x043
+#define BRW_SURFACEFORMAT_R32G32B32_SNORM                0x044
+#define BRW_SURFACEFORMAT_R32G32B32_SSCALED              0x045
+#define BRW_SURFACEFORMAT_R32G32B32_USCALED              0x046
+#define BRW_SURFACEFORMAT_R16G16B16A16_UNORM             0x080
+#define BRW_SURFACEFORMAT_R16G16B16A16_SNORM             0x081
+#define BRW_SURFACEFORMAT_R16G16B16A16_SINT              0x082
+#define BRW_SURFACEFORMAT_R16G16B16A16_UINT              0x083
+#define BRW_SURFACEFORMAT_R16G16B16A16_FLOAT             0x084
+#define BRW_SURFACEFORMAT_R32G32_FLOAT                   0x085
+#define BRW_SURFACEFORMAT_R32G32_SINT                    0x086
+#define BRW_SURFACEFORMAT_R32G32_UINT                    0x087
+#define BRW_SURFACEFORMAT_R32_FLOAT_X8X24_TYPELESS       0x088
+#define BRW_SURFACEFORMAT_X32_TYPELESS_G8X24_UINT        0x089
+#define BRW_SURFACEFORMAT_L32A32_FLOAT                   0x08A
+#define BRW_SURFACEFORMAT_R32G32_UNORM                   0x08B
+#define BRW_SURFACEFORMAT_R32G32_SNORM                   0x08C
+#define BRW_SURFACEFORMAT_R64_FLOAT                      0x08D
+#define BRW_SURFACEFORMAT_R16G16B16X16_UNORM             0x08E
+#define BRW_SURFACEFORMAT_R16G16B16X16_FLOAT             0x08F
+#define BRW_SURFACEFORMAT_A32X32_FLOAT                   0x090
+#define BRW_SURFACEFORMAT_L32X32_FLOAT                   0x091
+#define BRW_SURFACEFORMAT_I32X32_FLOAT                   0x092
+#define BRW_SURFACEFORMAT_R16G16B16A16_SSCALED           0x093
+#define BRW_SURFACEFORMAT_R16G16B16A16_USCALED           0x094
+#define BRW_SURFACEFORMAT_R32G32_SSCALED                 0x095
+#define BRW_SURFACEFORMAT_R32G32_USCALED                 0x096
+#define BRW_SURFACEFORMAT_B8G8R8A8_UNORM                 0x0C0
+#define BRW_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB            0x0C1
+#define BRW_SURFACEFORMAT_R10G10B10A2_UNORM              0x0C2
+#define BRW_SURFACEFORMAT_R10G10B10A2_UNORM_SRGB         0x0C3
+#define BRW_SURFACEFORMAT_R10G10B10A2_UINT               0x0C4
+#define BRW_SURFACEFORMAT_R10G10B10_SNORM_A2_UNORM       0x0C5
+#define BRW_SURFACEFORMAT_R8G8B8A8_UNORM                 0x0C7
+#define BRW_SURFACEFORMAT_R8G8B8A8_UNORM_SRGB            0x0C8
+#define BRW_SURFACEFORMAT_R8G8B8A8_SNORM                 0x0C9
+#define BRW_SURFACEFORMAT_R8G8B8A8_SINT                  0x0CA
+#define BRW_SURFACEFORMAT_R8G8B8A8_UINT                  0x0CB
+#define BRW_SURFACEFORMAT_R16G16_UNORM                   0x0CC
+#define BRW_SURFACEFORMAT_R16G16_SNORM                   0x0CD
+#define BRW_SURFACEFORMAT_R16G16_SINT                    0x0CE
+#define BRW_SURFACEFORMAT_R16G16_UINT                    0x0CF
+#define BRW_SURFACEFORMAT_R16G16_FLOAT                   0x0D0
+#define BRW_SURFACEFORMAT_B10G10R10A2_UNORM              0x0D1
+#define BRW_SURFACEFORMAT_B10G10R10A2_UNORM_SRGB         0x0D2
+#define BRW_SURFACEFORMAT_R11G11B10_FLOAT                0x0D3
+#define BRW_SURFACEFORMAT_R32_SINT                       0x0D6
+#define BRW_SURFACEFORMAT_R32_UINT                       0x0D7
+#define BRW_SURFACEFORMAT_R32_FLOAT                      0x0D8
+#define BRW_SURFACEFORMAT_R24_UNORM_X8_TYPELESS          0x0D9
+#define BRW_SURFACEFORMAT_X24_TYPELESS_G8_UINT           0x0DA
+#define BRW_SURFACEFORMAT_L16A16_UNORM                   0x0DF
+#define BRW_SURFACEFORMAT_I24X8_UNORM                    0x0E0
+#define BRW_SURFACEFORMAT_L24X8_UNORM                    0x0E1
+#define BRW_SURFACEFORMAT_A24X8_UNORM                    0x0E2
+#define BRW_SURFACEFORMAT_I32_FLOAT                      0x0E3
+#define BRW_SURFACEFORMAT_L32_FLOAT                      0x0E4
+#define BRW_SURFACEFORMAT_A32_FLOAT                      0x0E5
+#define BRW_SURFACEFORMAT_B8G8R8X8_UNORM                 0x0E9
+#define BRW_SURFACEFORMAT_B8G8R8X8_UNORM_SRGB            0x0EA
+#define BRW_SURFACEFORMAT_R8G8B8X8_UNORM                 0x0EB
+#define BRW_SURFACEFORMAT_R8G8B8X8_UNORM_SRGB            0x0EC
+#define BRW_SURFACEFORMAT_R9G9B9E5_SHAREDEXP             0x0ED
+#define BRW_SURFACEFORMAT_B10G10R10X2_UNORM              0x0EE
+#define BRW_SURFACEFORMAT_L16A16_FLOAT                   0x0F0
+#define BRW_SURFACEFORMAT_R32_UNORM                      0x0F1
+#define BRW_SURFACEFORMAT_R32_SNORM                      0x0F2
+#define BRW_SURFACEFORMAT_R10G10B10X2_USCALED            0x0F3
+#define BRW_SURFACEFORMAT_R8G8B8A8_SSCALED               0x0F4
+#define BRW_SURFACEFORMAT_R8G8B8A8_USCALED               0x0F5
+#define BRW_SURFACEFORMAT_R16G16_SSCALED                 0x0F6
+#define BRW_SURFACEFORMAT_R16G16_USCALED                 0x0F7
+#define BRW_SURFACEFORMAT_R32_SSCALED                    0x0F8
+#define BRW_SURFACEFORMAT_R32_USCALED                    0x0F9
+#define BRW_SURFACEFORMAT_B5G6R5_UNORM                   0x100
+#define BRW_SURFACEFORMAT_B5G6R5_UNORM_SRGB              0x101
+#define BRW_SURFACEFORMAT_B5G5R5A1_UNORM                 0x102
+#define BRW_SURFACEFORMAT_B5G5R5A1_UNORM_SRGB            0x103
+#define BRW_SURFACEFORMAT_B4G4R4A4_UNORM                 0x104
+#define BRW_SURFACEFORMAT_B4G4R4A4_UNORM_SRGB            0x105
+#define BRW_SURFACEFORMAT_R8G8_UNORM                     0x106
+#define BRW_SURFACEFORMAT_R8G8_SNORM                     0x107
+#define BRW_SURFACEFORMAT_R8G8_SINT                      0x108
+#define BRW_SURFACEFORMAT_R8G8_UINT                      0x109
+#define BRW_SURFACEFORMAT_R16_UNORM                      0x10A
+#define BRW_SURFACEFORMAT_R16_SNORM                      0x10B
+#define BRW_SURFACEFORMAT_R16_SINT                       0x10C
+#define BRW_SURFACEFORMAT_R16_UINT                       0x10D
+#define BRW_SURFACEFORMAT_R16_FLOAT                      0x10E
+#define BRW_SURFACEFORMAT_I16_UNORM                      0x111
+#define BRW_SURFACEFORMAT_L16_UNORM                      0x112
+#define BRW_SURFACEFORMAT_A16_UNORM                      0x113
+#define BRW_SURFACEFORMAT_L8A8_UNORM                     0x114
+#define BRW_SURFACEFORMAT_I16_FLOAT                      0x115
+#define BRW_SURFACEFORMAT_L16_FLOAT                      0x116
+#define BRW_SURFACEFORMAT_A16_FLOAT                      0x117
+#define BRW_SURFACEFORMAT_R5G5_SNORM_B6_UNORM            0x119
+#define BRW_SURFACEFORMAT_B5G5R5X1_UNORM                 0x11A
+#define BRW_SURFACEFORMAT_B5G5R5X1_UNORM_SRGB            0x11B
+#define BRW_SURFACEFORMAT_R8G8_SSCALED                   0x11C
+#define BRW_SURFACEFORMAT_R8G8_USCALED                   0x11D
+#define BRW_SURFACEFORMAT_R16_SSCALED                    0x11E
+#define BRW_SURFACEFORMAT_R16_USCALED                    0x11F
+#define BRW_SURFACEFORMAT_R8_UNORM                       0x140
+#define BRW_SURFACEFORMAT_R8_SNORM                       0x141
+#define BRW_SURFACEFORMAT_R8_SINT                        0x142
+#define BRW_SURFACEFORMAT_R8_UINT                        0x143
+#define BRW_SURFACEFORMAT_A8_UNORM                       0x144
+#define BRW_SURFACEFORMAT_I8_UNORM                       0x145
+#define BRW_SURFACEFORMAT_L8_UNORM                       0x146
+#define BRW_SURFACEFORMAT_P4A4_UNORM                     0x147
+#define BRW_SURFACEFORMAT_A4P4_UNORM                     0x148
+#define BRW_SURFACEFORMAT_R8_SSCALED                     0x149
+#define BRW_SURFACEFORMAT_R8_USCALED                     0x14A
+#define BRW_SURFACEFORMAT_R1_UINT                        0x181
+#define BRW_SURFACEFORMAT_YCRCB_NORMAL                   0x182
+#define BRW_SURFACEFORMAT_YCRCB_SWAPUVY                  0x183
+#define BRW_SURFACEFORMAT_BC1_UNORM                      0x186
+#define BRW_SURFACEFORMAT_BC2_UNORM                      0x187
+#define BRW_SURFACEFORMAT_BC3_UNORM                      0x188
+#define BRW_SURFACEFORMAT_BC4_UNORM                      0x189
+#define BRW_SURFACEFORMAT_BC5_UNORM                      0x18A
+#define BRW_SURFACEFORMAT_BC1_UNORM_SRGB                 0x18B
+#define BRW_SURFACEFORMAT_BC2_UNORM_SRGB                 0x18C
+#define BRW_SURFACEFORMAT_BC3_UNORM_SRGB                 0x18D
+#define BRW_SURFACEFORMAT_MONO8                          0x18E
+#define BRW_SURFACEFORMAT_YCRCB_SWAPUV                   0x18F
+#define BRW_SURFACEFORMAT_YCRCB_SWAPY                    0x190
+#define BRW_SURFACEFORMAT_DXT1_RGB                       0x191
+#define BRW_SURFACEFORMAT_FXT1                           0x192
+#define BRW_SURFACEFORMAT_R8G8B8_UNORM                   0x193
+#define BRW_SURFACEFORMAT_R8G8B8_SNORM                   0x194
+#define BRW_SURFACEFORMAT_R8G8B8_SSCALED                 0x195
+#define BRW_SURFACEFORMAT_R8G8B8_USCALED                 0x196
+#define BRW_SURFACEFORMAT_R64G64B64A64_FLOAT             0x197
+#define BRW_SURFACEFORMAT_R64G64B64_FLOAT                0x198
+#define BRW_SURFACEFORMAT_BC4_SNORM                      0x199
+#define BRW_SURFACEFORMAT_BC5_SNORM                      0x19A
+#define BRW_SURFACEFORMAT_R16G16B16_UNORM                0x19C
+#define BRW_SURFACEFORMAT_R16G16B16_SNORM                0x19D
+#define BRW_SURFACEFORMAT_R16G16B16_SSCALED              0x19E
+#define BRW_SURFACEFORMAT_R16G16B16_USCALED              0x19F
+
+#define BRW_SURFACERETURNFORMAT_FLOAT32  0
+#define BRW_SURFACERETURNFORMAT_S1       1
+
+#define BRW_SURFACE_1D      0
+#define BRW_SURFACE_2D      1
+#define BRW_SURFACE_3D      2
+#define BRW_SURFACE_CUBE    3
+#define BRW_SURFACE_BUFFER  4
+#define BRW_SURFACE_NULL    7
+
+#define BRW_TEXCOORDMODE_WRAP            0
+#define BRW_TEXCOORDMODE_MIRROR          1
+#define BRW_TEXCOORDMODE_CLAMP           2
+#define BRW_TEXCOORDMODE_CUBE            3
+#define BRW_TEXCOORDMODE_CLAMP_BORDER    4
+#define BRW_TEXCOORDMODE_MIRROR_ONCE     5
+
+#define BRW_THREAD_PRIORITY_NORMAL   0
+#define BRW_THREAD_PRIORITY_HIGH     1
+
+#define BRW_TILEWALK_XMAJOR                 0
+#define BRW_TILEWALK_YMAJOR                 1
+
+#define BRW_VERTEX_SUBPIXEL_PRECISION_8BITS  0
+#define BRW_VERTEX_SUBPIXEL_PRECISION_4BITS  1
+
+#define BRW_VERTEXBUFFER_ACCESS_VERTEXDATA     0
+#define BRW_VERTEXBUFFER_ACCESS_INSTANCEDATA   1
+
+#define BRW_VFCOMPONENT_NOSTORE      0
+#define BRW_VFCOMPONENT_STORE_SRC    1
+#define BRW_VFCOMPONENT_STORE_0      2
+#define BRW_VFCOMPONENT_STORE_1_FLT  3
+#define BRW_VFCOMPONENT_STORE_1_INT  4
+#define BRW_VFCOMPONENT_STORE_VID    5
+#define BRW_VFCOMPONENT_STORE_IID    6
+#define BRW_VFCOMPONENT_STORE_PID    7
+
+
+
+/* Execution Unit (EU) defines
+ */
+
+#define BRW_ALIGN_1   0
+#define BRW_ALIGN_16  1
+
+#define BRW_ADDRESS_DIRECT                        0
+#define BRW_ADDRESS_REGISTER_INDIRECT_REGISTER    1
+
+#define BRW_CHANNEL_X     0
+#define BRW_CHANNEL_Y     1
+#define BRW_CHANNEL_Z     2
+#define BRW_CHANNEL_W     3
+
+#define BRW_COMPRESSION_NONE          0
+#define BRW_COMPRESSION_2NDHALF       1
+#define BRW_COMPRESSION_COMPRESSED    2
+
+#define BRW_CONDITIONAL_NONE  0
+#define BRW_CONDITIONAL_Z     1
+#define BRW_CONDITIONAL_NZ    2
+#define BRW_CONDITIONAL_EQ    1	/* Z */
+#define BRW_CONDITIONAL_NEQ   2	/* NZ */
+#define BRW_CONDITIONAL_G     3
+#define BRW_CONDITIONAL_GE    4
+#define BRW_CONDITIONAL_L     5
+#define BRW_CONDITIONAL_LE    6
+#define BRW_CONDITIONAL_C     7
+#define BRW_CONDITIONAL_O     8
+
+#define BRW_DEBUG_NONE        0
+#define BRW_DEBUG_BREAKPOINT  1
+
+#define BRW_DEPENDENCY_NORMAL         0
+#define BRW_DEPENDENCY_NOTCLEARED     1
+#define BRW_DEPENDENCY_NOTCHECKED     2
+#define BRW_DEPENDENCY_DISABLE        3
+
+#define BRW_EXECUTE_1     0
+#define BRW_EXECUTE_2     1
+#define BRW_EXECUTE_4     2
+#define BRW_EXECUTE_8     3
+#define BRW_EXECUTE_16    4
+#define BRW_EXECUTE_32    5
+
+#define BRW_HORIZONTAL_STRIDE_0   0
+#define BRW_HORIZONTAL_STRIDE_1   1
+#define BRW_HORIZONTAL_STRIDE_2   2
+#define BRW_HORIZONTAL_STRIDE_4   3
+
+#define BRW_INSTRUCTION_NORMAL    0
+#define BRW_INSTRUCTION_SATURATE  1
+
+#define BRW_MASK_ENABLE   0
+#define BRW_MASK_DISABLE  1
+
+#define BRW_OPCODE_MOV        1
+#define BRW_OPCODE_SEL        2
+#define BRW_OPCODE_NOT        4
+#define BRW_OPCODE_AND        5
+#define BRW_OPCODE_OR         6
+#define BRW_OPCODE_XOR        7
+#define BRW_OPCODE_SHR        8
+#define BRW_OPCODE_SHL        9
+#define BRW_OPCODE_RSR        10
+#define BRW_OPCODE_RSL        11
+#define BRW_OPCODE_ASR        12
+#define BRW_OPCODE_CMP        16
+#define BRW_OPCODE_JMPI       32
+#define BRW_OPCODE_IF         34
+#define BRW_OPCODE_IFF        35
+#define BRW_OPCODE_ELSE       36
+#define BRW_OPCODE_ENDIF      37
+#define BRW_OPCODE_DO         38
+#define BRW_OPCODE_WHILE      39
+#define BRW_OPCODE_BREAK      40
+#define BRW_OPCODE_CONTINUE   41
+#define BRW_OPCODE_HALT       42
+#define BRW_OPCODE_MSAVE      44
+#define BRW_OPCODE_MRESTORE   45
+#define BRW_OPCODE_PUSH       46
+#define BRW_OPCODE_POP        47
+#define BRW_OPCODE_WAIT       48
+#define BRW_OPCODE_SEND       49
+#define BRW_OPCODE_ADD        64
+#define BRW_OPCODE_MUL        65
+#define BRW_OPCODE_AVG        66
+#define BRW_OPCODE_FRC        67
+#define BRW_OPCODE_RNDU       68
+#define BRW_OPCODE_RNDD       69
+#define BRW_OPCODE_RNDE       70
+#define BRW_OPCODE_RNDZ       71
+#define BRW_OPCODE_MAC        72
+#define BRW_OPCODE_MACH       73
+#define BRW_OPCODE_LZD        74
+#define BRW_OPCODE_SAD2       80
+#define BRW_OPCODE_SADA2      81
+#define BRW_OPCODE_DP4        84
+#define BRW_OPCODE_DPH        85
+#define BRW_OPCODE_DP3        86
+#define BRW_OPCODE_DP2        87
+#define BRW_OPCODE_DPA2       88
+#define BRW_OPCODE_LINE       89
+#define BRW_OPCODE_NOP        126
+
+#define BRW_PREDICATE_NONE             0
+#define BRW_PREDICATE_NORMAL           1
+#define BRW_PREDICATE_ALIGN1_ANYV             2
+#define BRW_PREDICATE_ALIGN1_ALLV             3
+#define BRW_PREDICATE_ALIGN1_ANY2H            4
+#define BRW_PREDICATE_ALIGN1_ALL2H            5
+#define BRW_PREDICATE_ALIGN1_ANY4H            6
+#define BRW_PREDICATE_ALIGN1_ALL4H            7
+#define BRW_PREDICATE_ALIGN1_ANY8H            8
+#define BRW_PREDICATE_ALIGN1_ALL8H            9
+#define BRW_PREDICATE_ALIGN1_ANY16H           10
+#define BRW_PREDICATE_ALIGN1_ALL16H           11
+#define BRW_PREDICATE_ALIGN16_REPLICATE_X     2
+#define BRW_PREDICATE_ALIGN16_REPLICATE_Y     3
+#define BRW_PREDICATE_ALIGN16_REPLICATE_Z     4
+#define BRW_PREDICATE_ALIGN16_REPLICATE_W     5
+#define BRW_PREDICATE_ALIGN16_ANY4H           6
+#define BRW_PREDICATE_ALIGN16_ALL4H           7
+
+#define BRW_ARCHITECTURE_REGISTER_FILE    0
+#define BRW_GENERAL_REGISTER_FILE         1
+#define BRW_MESSAGE_REGISTER_FILE         2
+#define BRW_IMMEDIATE_VALUE               3
+
+#define BRW_REGISTER_TYPE_UD  0
+#define BRW_REGISTER_TYPE_D   1
+#define BRW_REGISTER_TYPE_UW  2
+#define BRW_REGISTER_TYPE_W   3
+#define BRW_REGISTER_TYPE_UB  4
+#define BRW_REGISTER_TYPE_B   5
+#define BRW_REGISTER_TYPE_VF  5	/* packed float vector, immediates only? */
+#define BRW_REGISTER_TYPE_HF  6
+#define BRW_REGISTER_TYPE_V   6	/* packed int vector, immediates only, uword dest only */
+#define BRW_REGISTER_TYPE_F   7
+
+#define BRW_ARF_NULL                  0x00
+#define BRW_ARF_ADDRESS               0x10
+#define BRW_ARF_ACCUMULATOR           0x20
+#define BRW_ARF_FLAG                  0x30
+#define BRW_ARF_MASK                  0x40
+#define BRW_ARF_MASK_STACK            0x50
+#define BRW_ARF_MASK_STACK_DEPTH      0x60
+#define BRW_ARF_STATE                 0x70
+#define BRW_ARF_CONTROL               0x80
+#define BRW_ARF_NOTIFICATION_COUNT    0x90
+#define BRW_ARF_IP                    0xA0
+
+#define BRW_AMASK   0
+#define BRW_IMASK   1
+#define BRW_LMASK   2
+#define BRW_CMASK   3
+
+
+
+#define BRW_THREAD_NORMAL     0
+#define BRW_THREAD_ATOMIC     1
+#define BRW_THREAD_SWITCH     2
+
+#define BRW_VERTICAL_STRIDE_0                 0
+#define BRW_VERTICAL_STRIDE_1                 1
+#define BRW_VERTICAL_STRIDE_2                 2
+#define BRW_VERTICAL_STRIDE_4                 3
+#define BRW_VERTICAL_STRIDE_8                 4
+#define BRW_VERTICAL_STRIDE_16                5
+#define BRW_VERTICAL_STRIDE_32                6
+#define BRW_VERTICAL_STRIDE_64                7
+#define BRW_VERTICAL_STRIDE_128               8
+#define BRW_VERTICAL_STRIDE_256               9
+#define BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL   0xF
+
+#define BRW_WIDTH_1       0
+#define BRW_WIDTH_2       1
+#define BRW_WIDTH_4       2
+#define BRW_WIDTH_8       3
+#define BRW_WIDTH_16      4
+
+#define BRW_STATELESS_BUFFER_BOUNDARY_1K      0
+#define BRW_STATELESS_BUFFER_BOUNDARY_2K      1
+#define BRW_STATELESS_BUFFER_BOUNDARY_4K      2
+#define BRW_STATELESS_BUFFER_BOUNDARY_8K      3
+#define BRW_STATELESS_BUFFER_BOUNDARY_16K     4
+#define BRW_STATELESS_BUFFER_BOUNDARY_32K     5
+#define BRW_STATELESS_BUFFER_BOUNDARY_64K     6
+#define BRW_STATELESS_BUFFER_BOUNDARY_128K    7
+#define BRW_STATELESS_BUFFER_BOUNDARY_256K    8
+#define BRW_STATELESS_BUFFER_BOUNDARY_512K    9
+#define BRW_STATELESS_BUFFER_BOUNDARY_1M      10
+#define BRW_STATELESS_BUFFER_BOUNDARY_2M      11
+
+#define BRW_POLYGON_FACING_FRONT      0
+#define BRW_POLYGON_FACING_BACK       1
+
+#define BRW_MESSAGE_TARGET_NULL               0
+#define BRW_MESSAGE_TARGET_MATH               1
+#define BRW_MESSAGE_TARGET_SAMPLER            2
+#define BRW_MESSAGE_TARGET_GATEWAY            3
+#define BRW_MESSAGE_TARGET_DATAPORT_READ      4
+#define BRW_MESSAGE_TARGET_DATAPORT_WRITE     5
+#define BRW_MESSAGE_TARGET_URB                6
+#define BRW_MESSAGE_TARGET_THREAD_SPAWNER     7
+
+#define BRW_SAMPLER_RETURN_FORMAT_FLOAT32     0
+#define BRW_SAMPLER_RETURN_FORMAT_UINT32      2
+#define BRW_SAMPLER_RETURN_FORMAT_SINT32      3
+
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE              0
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE             0
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS        0
+#define BRW_SAMPLER_MESSAGE_SIMD8_KILLPIX             1
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD        1
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD         1
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS  2
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS    2
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE    0
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE     2
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO           2
+#define BRW_SAMPLER_MESSAGE_SIMD8_RESINFO             2
+#define BRW_SAMPLER_MESSAGE_SIMD16_RESINFO            2
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_LD                3
+#define BRW_SAMPLER_MESSAGE_SIMD8_LD                  3
+#define BRW_SAMPLER_MESSAGE_SIMD16_LD                 3
+
+#define BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW   0
+#define BRW_DATAPORT_OWORD_BLOCK_1_OWORDHIGH  1
+#define BRW_DATAPORT_OWORD_BLOCK_2_OWORDS     2
+#define BRW_DATAPORT_OWORD_BLOCK_4_OWORDS     3
+#define BRW_DATAPORT_OWORD_BLOCK_8_OWORDS     4
+
+#define BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD     0
+#define BRW_DATAPORT_OWORD_DUAL_BLOCK_4OWORDS    2
+
+#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS   2
+#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS  3
+
+#define BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ          0
+#define BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ     1
+#define BRW_DATAPORT_READ_MESSAGE_DWORD_BLOCK_READ          2
+#define BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ      3
+
+#define BRW_DATAPORT_READ_TARGET_DATA_CACHE      0
+#define BRW_DATAPORT_READ_TARGET_RENDER_CACHE    1
+#define BRW_DATAPORT_READ_TARGET_SAMPLER_CACHE   2
+
+#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE                0
+#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED     1
+#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01         2
+#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23         3
+#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01       4
+
+#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE                0
+#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE           1
+#define BRW_DATAPORT_WRITE_MESSAGE_DWORD_BLOCK_WRITE                2
+#define BRW_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE            3
+#define BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE              4
+#define BRW_DATAPORT_WRITE_MESSAGE_STREAMED_VERTEX_BUFFER_WRITE     5
+#define BRW_DATAPORT_WRITE_MESSAGE_FLUSH_RENDER_CACHE               7
+
+#define BRW_MATH_FUNCTION_INV                              1
+#define BRW_MATH_FUNCTION_LOG                              2
+#define BRW_MATH_FUNCTION_EXP                              3
+#define BRW_MATH_FUNCTION_SQRT                             4
+#define BRW_MATH_FUNCTION_RSQ                              5
+#define BRW_MATH_FUNCTION_SIN                              6 /* was 7 */
+#define BRW_MATH_FUNCTION_COS                              7 /* was 8 */
+#define BRW_MATH_FUNCTION_SINCOS                           8 /* was 6 */
+#define BRW_MATH_FUNCTION_TAN                              9
+#define BRW_MATH_FUNCTION_POW                              10
+#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER   11
+#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT                 12
+#define BRW_MATH_FUNCTION_INT_DIV_REMAINDER                13
+
+#define BRW_MATH_INTEGER_UNSIGNED     0
+#define BRW_MATH_INTEGER_SIGNED       1
+
+#define BRW_MATH_PRECISION_FULL        0
+#define BRW_MATH_PRECISION_PARTIAL     1
+
+#define BRW_MATH_SATURATE_NONE         0
+#define BRW_MATH_SATURATE_SATURATE     1
+
+#define BRW_MATH_DATA_VECTOR  0
+#define BRW_MATH_DATA_SCALAR  1
+
+#define BRW_URB_OPCODE_WRITE  0
+
+#define BRW_URB_SWIZZLE_NONE          0
+#define BRW_URB_SWIZZLE_INTERLEAVE    1
+#define BRW_URB_SWIZZLE_TRANSPOSE     2
+
+#define BRW_SCRATCH_SPACE_SIZE_1K     0
+#define BRW_SCRATCH_SPACE_SIZE_2K     1
+#define BRW_SCRATCH_SPACE_SIZE_4K     2
+#define BRW_SCRATCH_SPACE_SIZE_8K     3
+#define BRW_SCRATCH_SPACE_SIZE_16K    4
+#define BRW_SCRATCH_SPACE_SIZE_32K    5
+#define BRW_SCRATCH_SPACE_SIZE_64K    6
+#define BRW_SCRATCH_SPACE_SIZE_128K   7
+#define BRW_SCRATCH_SPACE_SIZE_256K   8
+#define BRW_SCRATCH_SPACE_SIZE_512K   9
+#define BRW_SCRATCH_SPACE_SIZE_1M     10
+#define BRW_SCRATCH_SPACE_SIZE_2M     11
+
+
+
+/* Command opcodes; the draw code stores these in a packet's header opcode field. */
+#define CMD_URB_FENCE                 0x6000
+#define CMD_CONST_BUFFER_STATE        0x6001
+#define CMD_CONST_BUFFER              0x6002
+
+#define CMD_STATE_BASE_ADDRESS        0x6101
+#define CMD_STATE_INSN_POINTER        0x6102
+#define CMD_PIPELINE_SELECT           0x6104
+/* In the 0x78xx group, VERTEX_BUFFER, VERTEX_ELEMENT and INDEX_BUFFER are emitted by brw_draw_upload.c. */
+#define CMD_PIPELINED_STATE_POINTERS  0x7800
+#define CMD_BINDING_TABLE_PTRS        0x7801
+#define CMD_VERTEX_BUFFER             0x7808
+#define CMD_VERTEX_ELEMENT            0x7809
+#define CMD_INDEX_BUFFER              0x780a
+#define CMD_VF_STATISTICS             0x780b
+
+#define CMD_DRAW_RECT                 0x7900
+#define CMD_BLEND_CONSTANT_COLOR      0x7901
+#define CMD_CHROMA_KEY                0x7904
+#define CMD_DEPTH_BUFFER              0x7905
+#define CMD_POLY_STIPPLE_OFFSET       0x7906
+#define CMD_POLY_STIPPLE_PATTERN      0x7907
+#define CMD_LINE_STIPPLE_PATTERN      0x7908
+#define CMD_GLOBAL_DEPTH_OFFSET_CLAMP 0x7909
+
+#define CMD_PIPE_CONTROL              0x7a00
+/* Draw command; emitted by brw_emit_prim() in brw_draw.c. */
+#define CMD_3D_PRIM                   0x7b00
+
+#define CMD_MI_FLUSH                  0x0200
+
+
+/* Various values from the R0 vertex header:
+ */
+#define R02_PRIM_END    0x1
+#define R02_PRIM_START  0x2
+
+
+
+#endif
diff --git a/src/gallium/drivers/i965simple/brw_draw.c b/src/gallium/drivers/i965simple/brw_draw.c
new file mode 100644
index 0000000000..49d80cb41c
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_draw.c
@@ -0,0 +1,226 @@
+/**************************************************************************
+ *
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include <stdlib.h>
+
+#include "brw_batch.h"
+#include "brw_draw.h"
+#include "brw_defines.h"
+#include "brw_context.h"
+#include "brw_state.h"
+
+#include "pipe/p_context.h"
+#include "pipe/internal/p_winsys_screen.h"
+#include "util/u_prim.h"
+
+/* Hardware topology code for each gallium primitive type.  The table is
+ * indexed directly by the PIPE_PRIM_* value held in brw->primitive (see
+ * brw_emit_prim), so entries must stay in PIPE_PRIM_* numeric order --
+ * NOTE(review): that ordering is defined outside this file; confirm.
+ */
+static const unsigned hw_prim[PIPE_PRIM_POLYGON+1] = {
+   _3DPRIM_POINTLIST,
+   _3DPRIM_LINELIST, _3DPRIM_LINELOOP, _3DPRIM_LINESTRIP,
+   _3DPRIM_TRILIST, _3DPRIM_TRISTRIP, _3DPRIM_TRIFAN,
+   _3DPRIM_QUADLIST, _3DPRIM_QUADSTRIP,
+   _3DPRIM_POLYGON
+};
+
+
+/* Record the primitive type for the next draw.  When it differs from
+ * brw->primitive, BRW_NEW_PRIMITIVE (and BRW_NEW_REDUCED_PRIMITIVE if the
+ * reduced primitive also changed) is flagged and state is re-validated.
+ * Not the nicest; ideally all programs would cope with every primitive.
+ */
+static void brw_set_prim(struct brw_context *brw, int prim)
+{
+   if (BRW_DEBUG & DEBUG_PRIMS)   /* same gating as brw_emit_prim */
+      PRINT("PRIM: %d\n", prim);
+
+   /* A filled, smooth-shaded quad strip can be drawn as a triangle strip,
+    * avoiding the GS program.  Flat shading must be excluded: quad and
+    * triangle strips take the flat color from different vertices. */
+   if (prim == PIPE_PRIM_QUAD_STRIP &&
+       !brw->attribs.Raster->flatshade &&
+       brw->attribs.Raster->fill_cw == PIPE_POLYGON_MODE_FILL &&
+       brw->attribs.Raster->fill_ccw == PIPE_POLYGON_MODE_FILL)
+      prim = PIPE_PRIM_TRIANGLE_STRIP;
+
+   if (prim != brw->primitive) {
+      brw->primitive = prim;
+      brw->state.dirty.brw |= BRW_NEW_PRIMITIVE;
+
+      if (u_reduced_prim(prim) != brw->reduced_primitive) {
+         brw->reduced_primitive = u_reduced_prim(prim);
+         brw->state.dirty.brw |= BRW_NEW_REDUCED_PRIMITIVE;
+      }
+      brw_validate_state(brw);
+   }
+}
+
+/* Usable vertex count for 'prim': QUADS are cut to a multiple of 4, QUAD_STRIP
+ * to an even count (0 when fewer than 4); every other type is left unchanged. */
+static unsigned trim(int prim, unsigned length)
+{
+   switch (prim) {
+   case PIPE_PRIM_QUAD_STRIP: return (length > 3) ? length - (length % 2) : 0;
+   case PIPE_PRIM_QUADS:      return length - (length % 4);
+   default:                   return length;
+   }
+}
+
+
+
+/* Build a 3D primitive packet for 'count' vertices starting at 'start' and
+ * append it to the batch.  Returns TRUE without emitting anything when the
+ * trimmed vertex count is zero; otherwise returns the result of
+ * brw_batchbuffer_data().
+ */
+static boolean brw_emit_prim( struct brw_context *brw, boolean indexed,
+                              unsigned start, unsigned count )
+{
+   struct brw_3d_primitive packet;
+
+   if (BRW_DEBUG & DEBUG_PRIMS)
+      PRINT("PRIM: %d %d %d\n",  brw->primitive, start, count);
+
+   packet.header.opcode = CMD_3D_PRIM;
+   packet.header.length = sizeof(packet)/4 - 2;
+   packet.header.pad = 0;
+   packet.header.topology = hw_prim[brw->primitive];
+   packet.header.indexed = indexed;
+   packet.verts_per_instance = trim(brw->primitive, count);
+   packet.start_vert_location = start;
+   packet.instance_count = 1;
+   packet.start_instance_location = 0;
+   packet.base_vert_location = 0;
+
+   /* Nothing drawable is left after trimming; not an error. */
+   if (packet.verts_per_instance == 0)
+      return TRUE;
+
+   return brw_batchbuffer_data( brw->winsys, &packet, sizeof(packet) );
+}
+
+
+/* One draw attempt: select the primitive, upload index and vertex state,
+ * re-validate if anything is dirty, then emit the primitive.  A NULL
+ * index_buffer means a non-indexed draw.  Returns FALSE as soon as a step
+ * fails -- per the original note, when out of video memory for texture or
+ * vbo upload, or on fallback conditions.
+ */
+static boolean brw_try_draw_elements( struct pipe_context *pipe,
+                                      struct pipe_buffer *index_buffer,
+                                      unsigned index_size, unsigned mode,
+                                      unsigned start, unsigned count )
+{
+   struct brw_context *brw = brw_context(pipe);
+   const boolean indexed = (index_buffer != NULL);
+
+   /* The primitive must be set ahead of validate_state. */
+   brw_set_prim(brw, mode);
+
+   /* Index data is only uploaded for indexed draws. */
+   if (indexed) {
+      if (!brw_upload_indices( brw, index_buffer, index_size, start, count ))
+         return FALSE;
+   }
+
+   /* Vertex buffers are emitted before vertex elements. */
+   if (!brw_upload_vertex_buffers(brw))
+      return FALSE;
+
+   if (!brw_upload_vertex_elements( brw ))
+      return FALSE;
+
+   /* XXX:  Need to separate validate and upload of state. */
+   if (brw->state.dirty.brw)
+      brw_validate_state( brw );
+
+   if (!brw_emit_prim(brw, indexed, start, count))
+      return FALSE;
+
+   return TRUE;
+}
+
+
+
+/* pipe->draw_elements hook.  Makes up to two identical attempts via
+ * brw_try_draw_elements(); nothing is flushed between them yet (see the
+ * "flush ?" note).  Asserts and returns FALSE if both attempts fail.
+ */
+static boolean brw_draw_elements( struct pipe_context *pipe,
+                                  struct pipe_buffer *indexBuffer,
+                                  unsigned indexSize,
+                                  unsigned mode,
+                                  unsigned start,
+                                  unsigned count )
+{
+   unsigned attempt;
+
+   for (attempt = 0; attempt < 2; attempt++) {
+      if (brw_try_draw_elements( pipe, indexBuffer, indexSize,
+                                 mode, start, count ))
+         return TRUE;
+
+      /* flush ? */
+   }
+
+   /* Both attempts failed. */
+   assert(0);
+   return FALSE;
+}
+
+
+
+/* pipe->draw_arrays hook: a non-indexed draw.  This is exactly
+ * brw_draw_elements() with no index buffer, so delegate to it rather than
+ * keeping a second copy of the try / retry / assert sequence.
+ *
+ * Returns TRUE on success, FALSE if the draw could not be emitted.
+ */
+static boolean brw_draw_arrays( struct pipe_context *pipe,
+                                unsigned mode,
+                                unsigned start,
+                                unsigned count )
+{
+   /* NULL index buffer with index size 0 selects the non-indexed path in
+    * brw_try_draw_elements().
+    */
+   return brw_draw_elements( pipe, NULL, 0, mode, start, count );
+}
+
+
+/* Hook this file's draw_arrays / draw_elements implementations into brw->pipe. */
+void brw_init_draw_functions( struct brw_context *brw )
+{
+   brw->pipe.draw_arrays = brw_draw_arrays;
+   brw->pipe.draw_elements = brw_draw_elements;
+}
+
+
diff --git a/src/gallium/drivers/i965simple/brw_draw.h b/src/gallium/drivers/i965simple/brw_draw.h
new file mode 100644
index 0000000000..62fe0d5d0e
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_draw.h
@@ -0,0 +1,55 @@
+ /**************************************************************************
+ * 
+ * Copyright 2005 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef BRW_DRAW_H
+#define BRW_DRAW_H
+
+#include "pipe/p_context.h"
+
+struct brw_context;
+
+
+/* brw_draw.c: install the draw_arrays / draw_elements hooks on brw->pipe. */
+void brw_init_draw_functions( struct brw_context *brw );
+
+/* NOTE(review): no definition of this is visible in brw_draw_upload.c -- confirm it is still needed. */
+boolean brw_upload_vertices( struct brw_context *brw,
+			       unsigned min_index,
+			       unsigned max_index );
+/* Emit the index buffer packet; ib_size is the per-index size (1, 2 or 4). */
+boolean brw_upload_indices(struct brw_context *brw,
+                           const struct pipe_buffer *index_buffer,
+                           int ib_size, int start, int count);
+/* Emit the vertex buffer / vertex element packets from the brw->vb state. */
+boolean brw_upload_vertex_buffers( struct brw_context *brw );
+boolean brw_upload_vertex_elements( struct brw_context *brw );
+/* Map a PIPE_FORMAT_* id to the same-named BRW_SURFACEFORMAT_* code. */
+unsigned brw_translate_surface_format( unsigned id );
+
+
+
+#endif
diff --git a/src/gallium/drivers/i965simple/brw_draw_upload.c b/src/gallium/drivers/i965simple/brw_draw_upload.c
new file mode 100644
index 0000000000..2d9ca3f2ea
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_draw_upload.c
@@ -0,0 +1,300 @@
+/**************************************************************************
+ *
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include <stdlib.h>
+
+#include "brw_batch.h"
+#include "brw_draw.h"
+#include "brw_defines.h"
+#include "brw_context.h"
+#include "brw_state.h"
+
+/* Staging copy of the CMD_VERTEX_BUFFER packet filled in by brw_upload_vertex_buffers(). */
+struct brw_array_state {
+   union header_union header;
+   /* Per buffer, emitted in order: vb0.dword, reloc(buffer, offset), max_index, instance_data_step_rate. */
+   struct {
+      union {
+	 struct {
+	    unsigned pitch:11;
+	    unsigned pad:15;
+	    unsigned access_type:1;
+	    unsigned vb_index:5;
+	 } bits;
+	 unsigned dword;
+      } vb0;
+      /* buffer/offset are not written as raw dwords; they are emitted through OUT_RELOC. */
+      struct pipe_buffer *buffer;
+      unsigned offset;
+
+      unsigned max_index;
+      unsigned instance_data_step_rate;
+
+   } vb[BRW_VBP_MAX];
+};
+
+/* Translate a PIPE_FORMAT_* id into the same-named BRW_SURFACEFORMAT_* code. */
+/* Formats that are not listed below hit assert(0) and return 0. */
+unsigned brw_translate_surface_format( unsigned id )
+{
+   switch (id) {
+   case PIPE_FORMAT_R64_FLOAT:
+      return BRW_SURFACEFORMAT_R64_FLOAT;
+   case PIPE_FORMAT_R64G64_FLOAT:
+      return BRW_SURFACEFORMAT_R64G64_FLOAT;
+   case PIPE_FORMAT_R64G64B64_FLOAT:
+      return BRW_SURFACEFORMAT_R64G64B64_FLOAT;
+   case PIPE_FORMAT_R64G64B64A64_FLOAT:
+      return BRW_SURFACEFORMAT_R64G64B64A64_FLOAT;
+
+   case PIPE_FORMAT_R32_FLOAT:
+      return BRW_SURFACEFORMAT_R32_FLOAT;
+   case PIPE_FORMAT_R32G32_FLOAT:
+      return BRW_SURFACEFORMAT_R32G32_FLOAT;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      return BRW_SURFACEFORMAT_R32G32B32_FLOAT;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      return BRW_SURFACEFORMAT_R32G32B32A32_FLOAT;
+
+   case PIPE_FORMAT_R32_UNORM:
+      return BRW_SURFACEFORMAT_R32_UNORM;
+   case PIPE_FORMAT_R32G32_UNORM:
+      return BRW_SURFACEFORMAT_R32G32_UNORM;
+   case PIPE_FORMAT_R32G32B32_UNORM:
+      return BRW_SURFACEFORMAT_R32G32B32_UNORM;
+   case PIPE_FORMAT_R32G32B32A32_UNORM:
+      return BRW_SURFACEFORMAT_R32G32B32A32_UNORM;
+
+   case PIPE_FORMAT_R32_USCALED:
+      return BRW_SURFACEFORMAT_R32_USCALED;
+   case PIPE_FORMAT_R32G32_USCALED:
+      return BRW_SURFACEFORMAT_R32G32_USCALED;
+   case PIPE_FORMAT_R32G32B32_USCALED:
+      return BRW_SURFACEFORMAT_R32G32B32_USCALED;
+   case PIPE_FORMAT_R32G32B32A32_USCALED:
+      return BRW_SURFACEFORMAT_R32G32B32A32_USCALED;
+
+   case PIPE_FORMAT_R32_SNORM:
+      return BRW_SURFACEFORMAT_R32_SNORM;
+   case PIPE_FORMAT_R32G32_SNORM:
+      return BRW_SURFACEFORMAT_R32G32_SNORM;
+   case PIPE_FORMAT_R32G32B32_SNORM:
+      return BRW_SURFACEFORMAT_R32G32B32_SNORM;
+   case PIPE_FORMAT_R32G32B32A32_SNORM:
+      return BRW_SURFACEFORMAT_R32G32B32A32_SNORM;
+
+   case PIPE_FORMAT_R32_SSCALED:
+      return BRW_SURFACEFORMAT_R32_SSCALED;
+   case PIPE_FORMAT_R32G32_SSCALED:
+      return BRW_SURFACEFORMAT_R32G32_SSCALED;
+   case PIPE_FORMAT_R32G32B32_SSCALED:
+      return BRW_SURFACEFORMAT_R32G32B32_SSCALED;
+   case PIPE_FORMAT_R32G32B32A32_SSCALED:
+      return BRW_SURFACEFORMAT_R32G32B32A32_SSCALED;
+
+   case PIPE_FORMAT_R16_UNORM:
+      return BRW_SURFACEFORMAT_R16_UNORM;
+   case PIPE_FORMAT_R16G16_UNORM:
+      return BRW_SURFACEFORMAT_R16G16_UNORM;
+   case PIPE_FORMAT_R16G16B16_UNORM:
+      return BRW_SURFACEFORMAT_R16G16B16_UNORM;
+   case PIPE_FORMAT_R16G16B16A16_UNORM:
+      return BRW_SURFACEFORMAT_R16G16B16A16_UNORM;
+
+   case PIPE_FORMAT_R16_USCALED:
+      return BRW_SURFACEFORMAT_R16_USCALED;
+   case PIPE_FORMAT_R16G16_USCALED:
+      return BRW_SURFACEFORMAT_R16G16_USCALED;
+   case PIPE_FORMAT_R16G16B16_USCALED:
+      return BRW_SURFACEFORMAT_R16G16B16_USCALED;
+   case PIPE_FORMAT_R16G16B16A16_USCALED:
+      return BRW_SURFACEFORMAT_R16G16B16A16_USCALED;
+
+   case PIPE_FORMAT_R16_SNORM:
+      return BRW_SURFACEFORMAT_R16_SNORM;
+   case PIPE_FORMAT_R16G16_SNORM:
+      return BRW_SURFACEFORMAT_R16G16_SNORM;
+   case PIPE_FORMAT_R16G16B16_SNORM:
+      return BRW_SURFACEFORMAT_R16G16B16_SNORM;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      return BRW_SURFACEFORMAT_R16G16B16A16_SNORM;
+
+   case PIPE_FORMAT_R16_SSCALED:
+      return BRW_SURFACEFORMAT_R16_SSCALED;
+   case PIPE_FORMAT_R16G16_SSCALED:
+      return BRW_SURFACEFORMAT_R16G16_SSCALED;
+   case PIPE_FORMAT_R16G16B16_SSCALED:
+      return BRW_SURFACEFORMAT_R16G16B16_SSCALED;
+   case PIPE_FORMAT_R16G16B16A16_SSCALED:
+      return BRW_SURFACEFORMAT_R16G16B16A16_SSCALED;
+
+   case PIPE_FORMAT_R8_UNORM:
+      return BRW_SURFACEFORMAT_R8_UNORM;
+   case PIPE_FORMAT_R8G8_UNORM:
+      return BRW_SURFACEFORMAT_R8G8_UNORM;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      return BRW_SURFACEFORMAT_R8G8B8_UNORM;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      return BRW_SURFACEFORMAT_R8G8B8A8_UNORM;
+
+   case PIPE_FORMAT_R8_USCALED:
+      return BRW_SURFACEFORMAT_R8_USCALED;
+   case PIPE_FORMAT_R8G8_USCALED:
+      return BRW_SURFACEFORMAT_R8G8_USCALED;
+   case PIPE_FORMAT_R8G8B8_USCALED:
+      return BRW_SURFACEFORMAT_R8G8B8_USCALED;
+   case PIPE_FORMAT_R8G8B8A8_USCALED:
+      return BRW_SURFACEFORMAT_R8G8B8A8_USCALED;
+
+   case PIPE_FORMAT_R8_SNORM:
+      return BRW_SURFACEFORMAT_R8_SNORM;
+   case PIPE_FORMAT_R8G8_SNORM:
+      return BRW_SURFACEFORMAT_R8G8_SNORM;
+   case PIPE_FORMAT_R8G8B8_SNORM:
+      return BRW_SURFACEFORMAT_R8G8B8_SNORM;
+   case PIPE_FORMAT_R8G8B8A8_SNORM:
+      return BRW_SURFACEFORMAT_R8G8B8A8_SNORM;
+
+   case PIPE_FORMAT_R8_SSCALED:
+      return BRW_SURFACEFORMAT_R8_SSCALED;
+   case PIPE_FORMAT_R8G8_SSCALED:
+      return BRW_SURFACEFORMAT_R8G8_SSCALED;
+   case PIPE_FORMAT_R8G8B8_SSCALED:
+      return BRW_SURFACEFORMAT_R8G8B8_SSCALED;
+   case PIPE_FORMAT_R8G8B8A8_SSCALED:
+      return BRW_SURFACEFORMAT_R8G8B8A8_SSCALED;
+
+   default:
+      assert(0);
+      return 0;
+   }
+}
+
+/* Index size (1, 2 or 4) -> BRW_INDEX_* format; anything else asserts. */
+static unsigned get_index_type(int type)
+{
+   if (type == 1) return BRW_INDEX_BYTE;
+   if (type == 2) return BRW_INDEX_WORD;
+   if (type == 4) return BRW_INDEX_DWORD;
+   assert(0);
+   return 0;
+}
+
+
+/* Emit a CMD_VERTEX_BUFFER packet for the leading non-NULL entries of
+ * brw->vb.vbo_array.  Each buffer adds four dwords: vb0, a relocation to the
+ * buffer at its offset, max_index and instance_data_step_rate (zero from the
+ * memset).  Always returns TRUE. */
+boolean brw_upload_vertex_buffers( struct brw_context *brw )
+{
+   struct brw_array_state vbp;
+   unsigned nr_enabled, i;
+
+   memset(&vbp, 0, sizeof(vbp));
+
+   /* Bounded by BRW_VBP_MAX as well as BRW_VEP_MAX: vbp.vb[] has
+    * BRW_VBP_MAX entries and must never be written past its end. */
+   for (i = 0; i < BRW_VEP_MAX && i < BRW_VBP_MAX; i++) {
+      if (brw->vb.vbo_array[i] == NULL)
+         break;
+      vbp.vb[i].vb0.bits.pitch = brw->vb.vbo_array[i]->stride;
+      vbp.vb[i].vb0.bits.pad = 0;
+      vbp.vb[i].vb0.bits.access_type = BRW_VERTEXBUFFER_ACCESS_VERTEXDATA;
+      vbp.vb[i].vb0.bits.vb_index = i;
+      vbp.vb[i].offset = brw->vb.vbo_array[i]->buffer_offset;
+      vbp.vb[i].buffer = brw->vb.vbo_array[i]->buffer;
+      vbp.vb[i].max_index = brw->vb.vbo_array[i]->max_index;
+   }
+   nr_enabled = i;   /* from the loop exit, so a full array no longer counts as 0 */
+
+   /* With no buffers the length field below would wrap; emit nothing. */
+   if (nr_enabled == 0)
+      return TRUE;
+
+   vbp.header.bits.length = (1 + nr_enabled * 4) - 2;
+   vbp.header.bits.opcode = CMD_VERTEX_BUFFER;
+
+   BEGIN_BATCH(vbp.header.bits.length+2, 0);
+   OUT_BATCH( vbp.header.dword );
+   for (i = 0; i < nr_enabled; i++) {
+      OUT_BATCH( vbp.vb[i].vb0.dword );
+      OUT_RELOC( vbp.vb[i].buffer, PIPE_BUFFER_USAGE_GPU_READ, vbp.vb[i].offset );
+      OUT_BATCH( vbp.vb[i].max_index );
+      OUT_BATCH( vbp.vb[i].instance_data_step_rate );
+   }
+   ADVANCE_BATCH();
+   return TRUE;
+}
+
+
+
+/* Emit a CMD_VERTEX_ELEMENT packet holding brw->vb.inputs[] for each vertex
+ * program input.  Returns FALSE, emitting nothing, if they exceed vep.ve[]. */
+boolean brw_upload_vertex_elements( struct brw_context *brw )
+{
+   struct brw_vertex_element_packet vep;
+   const unsigned nr_enabled = brw->attribs.VertexProgram->info.num_inputs;
+   unsigned i;
+
+   if (nr_enabled > sizeof(vep.ve) / sizeof(vep.ve[0]))
+      return FALSE;
+   memset(&vep, 0, sizeof(vep));
+   for (i = 0; i < nr_enabled; i++)
+      vep.ve[i] = brw->vb.inputs[i];
+
+   vep.header.length = (1 + nr_enabled * sizeof(vep.ve[0])/4) - 2;
+   vep.header.opcode = CMD_VERTEX_ELEMENT;
+   brw_cached_batch_struct(brw, &vep, 4 + nr_enabled * sizeof(vep.ve[0]));
+   return TRUE;
+}
+
+/* Emit the index buffer packet: a header with the index format for ib_size,
+ * relocations into index_buffer at offsets 'start' and 'start + count', and
+ * a trailing zero dword.  Always returns TRUE.  NOTE(review): start/count
+ * are not scaled by ib_size here -- confirm the intended offset units.
+ */
+boolean brw_upload_indices( struct brw_context *brw,
+                            const struct pipe_buffer *index_buffer,
+                            int ib_size, int start, int count)
+{
+   struct brw_indexbuffer ib;
+   const int ib_end = start + count;
+
+   memset(&ib, 0, sizeof(ib));
+   ib.header.bits.opcode = CMD_INDEX_BUFFER;
+   ib.header.bits.length = sizeof(ib)/4 - 2;
+   ib.header.bits.index_format = get_index_type(ib_size);
+   ib.header.bits.cut_index_enable = 0;
+
+   BEGIN_BATCH(4, 0);
+   OUT_BATCH( ib.header.dword );
+   OUT_RELOC( index_buffer, PIPE_BUFFER_USAGE_GPU_READ, start );
+   OUT_RELOC( index_buffer, PIPE_BUFFER_USAGE_GPU_READ, ib_end );
+   OUT_BATCH( 0 );
+   ADVANCE_BATCH();
+   return TRUE;
+}
diff --git a/src/gallium/drivers/i965simple/brw_eu.c b/src/gallium/drivers/i965simple/brw_eu.c
new file mode 100644
index 0000000000..e2002d1821
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_eu.c
@@ -0,0 +1,130 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+  
+
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "brw_eu.h"
+
+
+
+/* 0xff disables predication; any other value is moved into the flag register
+ * (unless cached in p->flag_value) and BRW_PREDICATE_NORMAL is selected.
+ * Open question from the original: is 0xffff needed when execution_size is 16? */
+void brw_set_predicate_control_flag_value( struct brw_compile *p, unsigned value )
+{
+   p->current->header.predicate_control = BRW_PREDICATE_NONE;
+   if (value == 0xff)
+      return;
+
+   /* Cleared above first: the push copies that state (presumably for the MOV). */
+   if (p->flag_value != value) {
+      brw_push_insn_state(p);
+      brw_MOV(p, brw_flag_reg(), brw_imm_uw(value));
+      p->flag_value = value;
+      brw_pop_insn_state(p);
+   }
+   p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
+}
+/* Store 'pc' as the predicate control of the current instruction state. */
+void brw_set_predicate_control( struct brw_compile *p, unsigned pc )
+{
+   p->current->header.predicate_control = pc;
+}
+/* Store 'conditional' in the destreg__conditonalmod field of the current state. */
+void brw_set_conditionalmod( struct brw_compile *p, unsigned conditional )
+{
+   p->current->header.destreg__conditonalmod = conditional;
+}
+/* Store 'access_mode' in the current instruction state. */
+void brw_set_access_mode( struct brw_compile *p, unsigned access_mode )
+{
+   p->current->header.access_mode = access_mode;
+}
+/* Store 'compression_control' in the current state (brw_init_compile passes BRW_COMPRESSION_NONE). */
+void brw_set_compression_control( struct brw_compile *p, boolean compression_control )
+{
+   p->current->header.compression_control = compression_control;
+}
+/* Store 'value' as the mask control of the current state (brw_init_compile passes BRW_MASK_ENABLE). */
+void brw_set_mask_control( struct brw_compile *p, unsigned value )
+{
+   p->current->header.mask_control = value;
+}
+/* Store 'value' as the saturate flag of the current instruction state. */
+void brw_set_saturate( struct brw_compile *p, unsigned value )
+{
+   p->current->header.saturate = value;
+}
+/* Push: copy the current instruction state into the next stack slot and make it current. */
+void brw_push_insn_state( struct brw_compile *p )
+{
+   assert(p->current != &p->stack[BRW_EU_MAX_INSN_STACK-1]);
+   p->current++;
+   memcpy(p->current, p->current - 1, sizeof(struct brw_instruction));
+}
+/* Pop: step back to the previous stack slot; asserts the stack is not already at its base. */
+void brw_pop_insn_state( struct brw_compile *p )
+{
+   assert(p->current != p->stack);
+   p->current--;
+}
+
+
+/* Reset 'p' for a new program: zero instructions, state stack rewound to a
+ * zeroed first entry, then the default instruction state applied. */
+void brw_init_compile( struct brw_compile *p )
+{
+   memset(p->stack, 0, sizeof(p->stack[0]));
+   p->current = p->stack;
+   p->nr_insn = 0;
+
+   /* Defaults: mask enabled (purpose unclear to the original author), no
+    * saturate, no compression, predication off (0xff). */
+   brw_set_mask_control(p, BRW_MASK_ENABLE);
+   brw_set_saturate(p, 0);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_set_predicate_control_flag_value(p, 0xff);
+}
+
+/* Finish the program: append 8 NOPs, store nr_insn * sizeof(struct
+ * brw_instruction) in *sz and return p->store viewed as unsigned words. */
+const unsigned *brw_get_program( struct brw_compile *p, unsigned *sz )
+{
+   unsigned pad = 8;
+
+   while (pad--)
+      brw_NOP(p);
+
+   *sz = p->nr_insn * sizeof(struct brw_instruction);
+   return (const unsigned *)p->store;
+}
+
diff --git a/src/gallium/drivers/i965simple/brw_eu.h b/src/gallium/drivers/i965simple/brw_eu.h
new file mode 100644
index 0000000000..23151ae9ed
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_eu.h
@@ -0,0 +1,888 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#ifndef BRW_EU_H
+#define BRW_EU_H
+
+#include "brw_structs.h"
+#include "brw_defines.h"
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_shader_tokens.h"
+
+#define BRW_SWIZZLE4(a,b,c,d) (((a)<<0) | ((b)<<2) | ((c)<<4) | ((d)<<6))	/* pack four 2-bit selectors, a in the lowest bits */
+#define BRW_GET_SWZ(swz, idx) (((swz) >> ((idx)*2)) & 0x3)	/* extract the 2-bit selector at position idx of swz */
+
+#define BRW_SWIZZLE_NOOP      BRW_SWIZZLE4(0,1,2,3)	/* same value as BRW_SWIZZLE_XYZW */
+#define BRW_SWIZZLE_XYZW      BRW_SWIZZLE4(0,1,2,3)
+#define BRW_SWIZZLE_XXXX      BRW_SWIZZLE4(0,0,0,0)
+#define BRW_SWIZZLE_XYXY      BRW_SWIZZLE4(0,1,0,1)
+
+
+#define REG_SIZE (8*4)	/* 32; byte_offset() and brw_address() use it as the number of bytes per register */
+
+
+/* These aren't hardware structs, just something useful for us to pass around:
+ *
+ * Align1 operation has a lot of control over input ranges.  Used in
+ * WM programs to implement shaders decomposed into "channel serial"
+ * or "structure of array" form:
+ *
+ * The bitfields below add up to 32 bits; they are followed by dw1,
+ * which holds either the align16 / indirect-addressing bits or an
+ * immediate value (see the brw_imm_*() helpers).
+ */
+struct brw_reg
+{
+   unsigned type:4;		/* a BRW_REGISTER_TYPE_x value */
+   unsigned file:2;		/* register file, or BRW_IMMEDIATE_VALUE */
+   unsigned nr:8;		/* register number */
+   unsigned subnr:5;		/* :1 in align16 */
+   unsigned negate:1;		/* source only */
+   unsigned abs:1;		/* source only */
+   unsigned vstride:4;		/* source only */
+   unsigned width:3;		/* src only, align1 only */
+   unsigned hstride:2;   		/* src only, align1 only */
+   unsigned address_mode:1;	/* relative addressing, hopefully! */
+   unsigned pad0:1;
+
+   union {
+      struct {
+	 unsigned swizzle:8;		/* src only, align16 only */
+	 unsigned writemask:4;		/* dest only, align16 only */
+	 int  indirect_offset:10;	/* relative addressing offset */
+	 unsigned pad1:10;		/* two dwords total */
+      } bits;
+
+      float f;		/* immediate set by brw_imm_f() */
+      int   d;		/* immediate set by brw_imm_d() / brw_imm_w() */
+      unsigned ud;	/* immediate set by brw_imm_ud() / brw_imm_uw() / brw_imm_v() / brw_imm_vf() */
+   } dw1;
+};
+
+
+struct brw_indirect {		/* handle consumed by get_addr_reg() and the deref_*() helpers */
+   unsigned addr_subnr:4;	/* sub-register passed to brw_address_reg() by get_addr_reg() */
+   int addr_offset:10;		/* signed offset; deref_*() add their own offset to it */
+   unsigned pad:18;		/* zeroed by brw_indirect() */
+};
+
+
+#define BRW_EU_MAX_INSN_STACK 5	/* number of slots in brw_compile.stack */
+#define BRW_EU_MAX_INSN 1200	/* number of slots in brw_compile.store */
+
+struct brw_compile {
+   struct brw_instruction store[BRW_EU_MAX_INSN];	/* program storage returned by brw_get_program() */
+   unsigned nr_insn;	/* brw_get_program() reports nr_insn instructions; current_insn() indexes store[] with it */
+
+   /* Allow clients to push/pop instruction state:
+    */
+   struct brw_instruction stack[BRW_EU_MAX_INSN_STACK];
+   struct brw_instruction *current;	/* points into stack[]; moved by brw_push/pop_insn_state() */
+
+   unsigned flag_value;
+   boolean single_program_flow;
+};
+
+
+
+/* Size in bytes of one element of the given BRW_REGISTER_TYPE_x, or 0
+ * for a type that is not listed here.
+ */
+static __inline int type_sz( unsigned type )
+{
+   if (type == BRW_REGISTER_TYPE_UD ||
+       type == BRW_REGISTER_TYPE_D ||
+       type == BRW_REGISTER_TYPE_F)
+      return 4;
+
+   if (type == BRW_REGISTER_TYPE_HF ||
+       type == BRW_REGISTER_TYPE_UW ||
+       type == BRW_REGISTER_TYPE_W)
+      return 2;
+
+   if (type == BRW_REGISTER_TYPE_UB ||
+       type == BRW_REGISTER_TYPE_B)
+      return 1;
+
+   return 0;
+}
+
+/* General constructor for a struct brw_reg.
+ *
+ * subnr is given in elements of 'type' and stored scaled by
+ * type_sz(type).  The result uses direct addressing, has no negate or
+ * abs modifier and a zero indirect offset; both pad fields are
+ * cleared.
+ */
+static __inline struct brw_reg brw_reg( unsigned file,
+                                        unsigned nr,
+                                        unsigned subnr,
+                                        unsigned type,
+                                        unsigned vstride,
+                                        unsigned width,
+                                        unsigned hstride,
+                                        unsigned swizzle,
+                                        unsigned writemask)
+{
+   struct brw_reg r;
+
+   /* Location: */
+   r.file = file;
+   r.nr = nr;
+   r.type = type;
+   r.subnr = subnr * type_sz(type);
+   r.address_mode = BRW_ADDRESS_DIRECT;
+
+   /* Region description: */
+   r.vstride = vstride;
+   r.width = width;
+   r.hstride = hstride;
+
+   /* No source modifiers to begin with: */
+   r.negate = 0;
+   r.abs = 0;
+   r.pad0 = 0;
+
+   /* Could do better: If the reg is r5.3<0;1,0>, we probably want to
+    * set swizzle and writemask to W, as the lower bits of subnr will
+    * be lost when converted to align16.  This is probably too much to
+    * keep track of as you'd want it adjusted by suboffset(), etc.
+    * Perhaps fix up when converting to align16?
+    */
+   r.dw1.bits.swizzle = swizzle;
+   r.dw1.bits.writemask = writemask;
+   r.dw1.bits.indirect_offset = 0;
+   r.dw1.bits.pad1 = 0;
+
+   return r;
+}
+
+/* Float register built with BRW_VERTICAL_STRIDE_16, BRW_WIDTH_16 and
+ * BRW_HORIZONTAL_STRIDE_1, XYZW swizzle and XYZW writemask.
+ */
+static __inline struct brw_reg brw_vec16_reg( unsigned file,
+                                              unsigned nr,
+                                              unsigned subnr )
+{
+   struct brw_reg region = brw_reg(file, nr, subnr,
+                                   BRW_REGISTER_TYPE_F,
+                                   BRW_VERTICAL_STRIDE_16,
+                                   BRW_WIDTH_16,
+                                   BRW_HORIZONTAL_STRIDE_1,
+                                   BRW_SWIZZLE_XYZW,
+                                   TGSI_WRITEMASK_XYZW);
+   return region;
+}
+
+/* Float register built with BRW_VERTICAL_STRIDE_8, BRW_WIDTH_8 and
+ * BRW_HORIZONTAL_STRIDE_1, XYZW swizzle and XYZW writemask.
+ */
+static __inline struct brw_reg brw_vec8_reg( unsigned file,
+                                             unsigned nr,
+                                             unsigned subnr )
+{
+   struct brw_reg region = brw_reg(file, nr, subnr,
+                                   BRW_REGISTER_TYPE_F,
+                                   BRW_VERTICAL_STRIDE_8,
+                                   BRW_WIDTH_8,
+                                   BRW_HORIZONTAL_STRIDE_1,
+                                   BRW_SWIZZLE_XYZW,
+                                   TGSI_WRITEMASK_XYZW);
+   return region;
+}
+
+
+/* Float register built with BRW_VERTICAL_STRIDE_4, BRW_WIDTH_4 and
+ * BRW_HORIZONTAL_STRIDE_1, XYZW swizzle and XYZW writemask.
+ */
+static __inline struct brw_reg brw_vec4_reg( unsigned file,
+                                              unsigned nr,
+                                              unsigned subnr )
+{
+   struct brw_reg region = brw_reg(file, nr, subnr,
+                                   BRW_REGISTER_TYPE_F,
+                                   BRW_VERTICAL_STRIDE_4,
+                                   BRW_WIDTH_4,
+                                   BRW_HORIZONTAL_STRIDE_1,
+                                   BRW_SWIZZLE_XYZW,
+                                   TGSI_WRITEMASK_XYZW);
+   return region;
+}
+
+
+/* Float register built with BRW_VERTICAL_STRIDE_2, BRW_WIDTH_2 and
+ * BRW_HORIZONTAL_STRIDE_1, XYXY swizzle and XY writemask.
+ */
+static __inline struct brw_reg brw_vec2_reg( unsigned file,
+                                              unsigned nr,
+                                              unsigned subnr )
+{
+   struct brw_reg region = brw_reg(file, nr, subnr,
+                                   BRW_REGISTER_TYPE_F,
+                                   BRW_VERTICAL_STRIDE_2,
+                                   BRW_WIDTH_2,
+                                   BRW_HORIZONTAL_STRIDE_1,
+                                   BRW_SWIZZLE_XYXY,
+                                   TGSI_WRITEMASK_XY);
+   return region;
+}
+
+/* Float register built with BRW_VERTICAL_STRIDE_0, BRW_WIDTH_1 and
+ * BRW_HORIZONTAL_STRIDE_0, XXXX swizzle and X writemask.
+ */
+static __inline struct brw_reg brw_vec1_reg( unsigned file,
+                                             unsigned nr,
+                                             unsigned subnr )
+{
+   struct brw_reg scalar = brw_reg(file, nr, subnr,
+                                   BRW_REGISTER_TYPE_F,
+                                   BRW_VERTICAL_STRIDE_0,
+                                   BRW_WIDTH_1,
+                                   BRW_HORIZONTAL_STRIDE_0,
+                                   BRW_SWIZZLE_XXXX,
+                                   TGSI_WRITEMASK_X);
+   return scalar;
+}
+
+
+/* Return a copy of reg whose type field is replaced by 'type'.  No
+ * other field is touched; in particular subnr is not rescaled.
+ */
+static __inline struct brw_reg retype( struct brw_reg reg,
+                                       unsigned type )
+{
+   struct brw_reg out = reg;
+
+   out.type = type;
+   return out;
+}
+
+/* Return a copy of reg with subnr advanced by 'delta' elements of the
+ * register's current type, i.e. by delta * type_sz(reg.type).
+ */
+static __inline struct brw_reg suboffset( struct brw_reg reg,
+                                          unsigned delta )
+{
+   const unsigned step = delta * type_sz(reg.type);
+
+   reg.subnr = reg.subnr + step;
+   return reg;
+}
+
+
+/* Return a copy of reg with the register number advanced by 'delta'
+ * whole registers.
+ */
+static __inline struct brw_reg offset( struct brw_reg reg,
+                                       unsigned delta )
+{
+   struct brw_reg out = reg;
+
+   out.nr = reg.nr + delta;
+   return out;
+}
+
+
+/* Return a copy of reg moved forward by 'bytes' bytes.  The position
+ * is flattened to nr * REG_SIZE + subnr, advanced, and then split
+ * back into a register number and a sub-register offset.
+ */
+static __inline struct brw_reg byte_offset( struct brw_reg reg,
+                                            unsigned bytes )
+{
+   const unsigned start = reg.nr * REG_SIZE + reg.subnr;
+   const unsigned target = start + bytes;
+
+   reg.subnr = target % REG_SIZE;
+   reg.nr = target / REG_SIZE;
+   return reg;
+}
+
+
+/* brw_vec16_reg() retyped to UW; subnr counts UW elements. */
+static __inline struct brw_reg brw_uw16_reg( unsigned file,
+                                             unsigned nr,
+                                             unsigned subnr )
+{
+   struct brw_reg reg = brw_vec16_reg(file, nr, 0);
+
+   reg = retype(reg, BRW_REGISTER_TYPE_UW);
+   return suboffset(reg, subnr);
+}
+
+/* brw_vec8_reg() retyped to UW; subnr counts UW elements. */
+static __inline struct brw_reg brw_uw8_reg( unsigned file,
+                                            unsigned nr,
+                                            unsigned subnr )
+{
+   struct brw_reg reg = brw_vec8_reg(file, nr, 0);
+
+   reg = retype(reg, BRW_REGISTER_TYPE_UW);
+   return suboffset(reg, subnr);
+}
+
+/* brw_vec1_reg() retyped to UW; subnr counts UW elements. */
+static __inline struct brw_reg brw_uw1_reg( unsigned file,
+                                            unsigned nr,
+                                            unsigned subnr )
+{
+   struct brw_reg reg = brw_vec1_reg(file, nr, 0);
+
+   reg = retype(reg, BRW_REGISTER_TYPE_UW);
+   return suboffset(reg, subnr);
+}
+
+/* Base for the brw_imm_*() helpers: a register in the
+ * BRW_IMMEDIATE_VALUE file of the given type, with dw1 left zeroed
+ * (swizzle and writemask both 0) for the caller to fill in.
+ */
+static __inline struct brw_reg brw_imm_reg( unsigned type )
+{
+   struct brw_reg imm = brw_reg(BRW_IMMEDIATE_VALUE,
+                                0,
+                                0,
+                                type,
+                                BRW_VERTICAL_STRIDE_0,
+                                BRW_WIDTH_1,
+                                BRW_HORIZONTAL_STRIDE_0,
+                                0,
+                                0);
+   return imm;
+}
+
+/* Float immediate: type F, value stored in dw1.f. */
+static __inline struct brw_reg brw_imm_f( float f )
+{
+   struct brw_reg reg = brw_imm_reg(BRW_REGISTER_TYPE_F);
+
+   reg.dw1.f = f;
+   return reg;
+}
+
+/* Signed dword immediate: type D, value stored in dw1.d. */
+static __inline struct brw_reg brw_imm_d( int d )
+{
+   struct brw_reg reg = brw_imm_reg(BRW_REGISTER_TYPE_D);
+
+   reg.dw1.d = d;
+   return reg;
+}
+
+/* Unsigned dword immediate: type UD, value stored in dw1.ud. */
+static __inline struct brw_reg brw_imm_ud( unsigned ud )
+{
+   struct brw_reg reg = brw_imm_reg(BRW_REGISTER_TYPE_UD);
+
+   reg.dw1.ud = ud;
+   return reg;
+}
+
+/* Unsigned word immediate: type UW, value widened into dw1.ud. */
+static __inline struct brw_reg brw_imm_uw( ushort uw )
+{
+   struct brw_reg reg = brw_imm_reg(BRW_REGISTER_TYPE_UW);
+
+   reg.dw1.ud = uw;
+   return reg;
+}
+
+/* Signed word immediate: type W, value widened into dw1.d. */
+static __inline struct brw_reg brw_imm_w( short w )
+{
+   struct brw_reg reg = brw_imm_reg(BRW_REGISTER_TYPE_W);
+
+   reg.dw1.d = w;
+   return reg;
+}
+
+/* brw_imm_b and brw_imm_ub aren't supported by hardware - the type
+ * numbers alias with _V and _VF below:
+ */
+
+/* Vector of eight signed half-byte values, packed by the caller into
+ * 'v'.  The immediate is given BRW_WIDTH_8 with a horizontal stride
+ * of 1 and a vertical stride of 0.
+ */
+static __inline struct brw_reg brw_imm_v( unsigned v )
+{
+   struct brw_reg reg = brw_imm_reg(BRW_REGISTER_TYPE_V);
+
+   reg.dw1.ud = v;
+   reg.hstride = BRW_HORIZONTAL_STRIDE_1;
+   reg.width = BRW_WIDTH_8;
+   reg.vstride = BRW_VERTICAL_STRIDE_0;
+   return reg;
+}
+
+/* Vector of four 8-bit float values, already packed by the caller
+ * into 'v'.  The immediate is given BRW_WIDTH_4 with a horizontal
+ * stride of 1 and a vertical stride of 0.
+ */
+static __inline struct brw_reg brw_imm_vf( unsigned v )
+{
+   struct brw_reg reg = brw_imm_reg(BRW_REGISTER_TYPE_VF);
+
+   reg.dw1.ud = v;
+   reg.hstride = BRW_HORIZONTAL_STRIDE_1;
+   reg.width = BRW_WIDTH_4;
+   reg.vstride = BRW_VERTICAL_STRIDE_0;
+   return reg;
+}
+
+#define VF_ZERO 0x0
+#define VF_ONE  0x30
+#define VF_NEG  (1<<7)
+
+/* Same as brw_imm_vf(), but packs the four 8-bit values here: v0
+ * lands in bits 0-7, v1 in 8-15, v2 in 16-23 and v3 in 24-31.  The
+ * arguments are not masked, so each is expected to fit in 8 bits.
+ */
+static __inline struct brw_reg brw_imm_vf4( unsigned v0,
+                                            unsigned v1,
+                                            unsigned v2,
+                                            unsigned v3)
+{
+   struct brw_reg reg = brw_imm_reg(BRW_REGISTER_TYPE_VF);
+   unsigned packed = v0 << 0;
+
+   packed |= v1 << 8;
+   packed |= v2 << 16;
+   packed |= v3 << 24;
+
+   reg.vstride = BRW_VERTICAL_STRIDE_0;
+   reg.width = BRW_WIDTH_4;
+   reg.hstride = BRW_HORIZONTAL_STRIDE_1;
+   reg.dw1.ud = packed;
+   return reg;
+}
+
+
+/* UW immediate holding the byte position of reg, computed as
+ * nr * REG_SIZE + subnr.
+ */
+static __inline struct brw_reg brw_address( struct brw_reg reg )
+{
+   const unsigned byte_pos = reg.nr * REG_SIZE + reg.subnr;
+
+   return brw_imm_uw(byte_pos);
+}
+
+
+/* brw_vec1_reg() in the general register file. */
+static __inline struct brw_reg brw_vec1_grf( unsigned nr,
+                                             unsigned subnr )
+{
+   const unsigned file = BRW_GENERAL_REGISTER_FILE;
+
+   return brw_vec1_reg(file, nr, subnr);
+}
+
+/* brw_vec8_reg() in the general register file. */
+static __inline struct brw_reg brw_vec8_grf( unsigned nr,
+                                             unsigned subnr )
+{
+   const unsigned file = BRW_GENERAL_REGISTER_FILE;
+
+   return brw_vec8_reg(file, nr, subnr);
+}
+
+/* brw_vec4_reg() in the general register file. */
+static __inline struct brw_reg brw_vec4_grf( unsigned nr,
+                                             unsigned subnr )
+{
+   const unsigned file = BRW_GENERAL_REGISTER_FILE;
+
+   return brw_vec4_reg(file, nr, subnr);
+}
+
+
+/* brw_vec2_reg() in the general register file. */
+static __inline struct brw_reg brw_vec2_grf( unsigned nr,
+                                             unsigned subnr )
+{
+   const unsigned file = BRW_GENERAL_REGISTER_FILE;
+
+   return brw_vec2_reg(file, nr, subnr);
+}
+
+/* brw_uw8_reg() in the general register file. */
+static __inline struct brw_reg brw_uw8_grf( unsigned nr,
+                                            unsigned subnr )
+{
+   const unsigned file = BRW_GENERAL_REGISTER_FILE;
+
+   return brw_uw8_reg(file, nr, subnr);
+}
+
+/* vec8 float register naming BRW_ARF_NULL in the architecture
+ * register file.
+ */
+static __inline struct brw_reg brw_null_reg( void )
+{
+   const unsigned file = BRW_ARCHITECTURE_REGISTER_FILE;
+
+   return brw_vec8_reg(file, BRW_ARF_NULL, 0);
+}
+
+/* Single UW element 'subnr' of BRW_ARF_ADDRESS in the architecture
+ * register file.
+ */
+static __inline struct brw_reg brw_address_reg( unsigned subnr )
+{
+   const unsigned file = BRW_ARCHITECTURE_REGISTER_FILE;
+
+   return brw_uw1_reg(file, BRW_ARF_ADDRESS, subnr);
+}
+
+/* UD register naming BRW_ARF_IP in the architecture register file.
+ *
+ * If/else instructions break in align16 mode if writemask & swizzle
+ * aren't xyzw.  This goes against the convention for other scalar
+ * regs, which is why XYZW is used for both below.
+ */
+static __inline struct brw_reg brw_ip_reg( void )
+{
+   struct brw_reg ip = brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+                               BRW_ARF_IP,
+                               0,
+                               BRW_REGISTER_TYPE_UD,
+                               BRW_VERTICAL_STRIDE_4, /* ? */
+                               BRW_WIDTH_1,
+                               BRW_HORIZONTAL_STRIDE_0,
+                               BRW_SWIZZLE_XYZW, /* NOTE! */
+                               TGSI_WRITEMASK_XYZW); /* NOTE! */
+   return ip;
+}
+
+/* vec8 float register naming BRW_ARF_ACCUMULATOR in the architecture
+ * register file.
+ */
+static __inline struct brw_reg brw_acc_reg( void )
+{
+   const unsigned file = BRW_ARCHITECTURE_REGISTER_FILE;
+
+   return brw_vec8_reg(file, BRW_ARF_ACCUMULATOR, 0);
+}
+
+
+/* Single UW element 0 of BRW_ARF_FLAG in the architecture register
+ * file.
+ */
+static __inline struct brw_reg brw_flag_reg( void )
+{
+   const unsigned file = BRW_ARCHITECTURE_REGISTER_FILE;
+
+   return brw_uw1_reg(file, BRW_ARF_FLAG, 0);
+}
+
+
+/* Single UW element 'subnr' of BRW_ARF_MASK in the architecture
+ * register file.
+ */
+static __inline struct brw_reg brw_mask_reg( unsigned subnr )
+{
+   const unsigned file = BRW_ARCHITECTURE_REGISTER_FILE;
+
+   return brw_uw1_reg(file, BRW_ARF_MASK, subnr);
+}
+
+/* vec8 float register number 'nr' in the message register file. */
+static __inline struct brw_reg brw_message_reg( unsigned nr )
+{
+   const unsigned file = BRW_MESSAGE_REGISTER_FILE;
+
+   return brw_vec8_reg(file, nr, 0);
+}
+
+
+
+
+/* Map a count of 0, 1, 2, 4, 8, 16 or 32 to the code 0..6 that
+ * stride() stores in a register's region fields.  Any other input
+ * yields 0.
+ *
+ * This is almost always called with a numeric constant argument, so
+ * the mapping is kept as a plain switch that is easy to evaluate at
+ * compile time.
+ */
+static __inline unsigned cvt( unsigned val )
+{
+   unsigned encoded;
+
+   switch (val) {
+   case 1:  encoded = 1; break;
+   case 2:  encoded = 2; break;
+   case 4:  encoded = 3; break;
+   case 8:  encoded = 4; break;
+   case 16: encoded = 5; break;
+   case 32: encoded = 6; break;
+   default: encoded = 0; break;
+   }
+
+   return encoded;
+}
+
+/* Return a copy of reg with its region replaced.  vstride, width and
+ * hstride are given as plain counts and translated with cvt(); the
+ * width field stores cvt(width) - 1.
+ */
+static __inline struct brw_reg stride( struct brw_reg reg,
+                                       unsigned vstride,
+                                       unsigned width,
+                                       unsigned hstride )
+{
+   const unsigned vstride_code = cvt(vstride);
+   const unsigned width_code = cvt(width) - 1;
+   const unsigned hstride_code = cvt(hstride);
+
+   reg.vstride = vstride_code;
+   reg.width = width_code;
+   reg.hstride = hstride_code;
+   return reg;
+}
+
+/* Return reg with its region set to <16;16,1>. */
+static __inline struct brw_reg vec16( struct brw_reg reg )
+{
+   const unsigned channels = 16;
+
+   return stride(reg, channels, channels, 1);
+}
+
+/* Return reg with its region set to <8;8,1>. */
+static __inline struct brw_reg vec8( struct brw_reg reg )
+{
+   const unsigned channels = 8;
+
+   return stride(reg, channels, channels, 1);
+}
+
+/* Return reg with its region set to <4;4,1>. */
+static __inline struct brw_reg vec4( struct brw_reg reg )
+{
+   const unsigned channels = 4;
+
+   return stride(reg, channels, channels, 1);
+}
+
+/* Return reg with its region set to <2;2,1>. */
+static __inline struct brw_reg vec2( struct brw_reg reg )
+{
+   const unsigned channels = 2;
+
+   return stride(reg, channels, channels, 1);
+}
+
+/* Return reg with its region set to the scalar form <0;1,0>. */
+static __inline struct brw_reg vec1( struct brw_reg reg )
+{
+   struct brw_reg scalar = stride(reg, 0, 1, 0);
+
+   return scalar;
+}
+
+/* Scalar region addressing element 'elt' of reg, counted in elements
+ * of reg's current type.
+ */
+static __inline struct brw_reg get_element( struct brw_reg reg, unsigned elt )
+{
+   struct brw_reg shifted = suboffset(reg, elt);
+
+   return vec1(shifted);
+}
+
+/* Like get_element(), but reg is first retyped to UD so that 'elt'
+ * counts UD-sized elements.
+ */
+static __inline struct brw_reg get_element_ud( struct brw_reg reg, unsigned elt )
+{
+   struct brw_reg as_ud = retype(reg, BRW_REGISTER_TYPE_UD);
+
+   as_ud = suboffset(as_ud, elt);
+   return vec1(as_ud);
+}
+
+
+/* Compose a swizzle on top of the one reg already carries: output
+ * channel i takes whatever the existing swizzle selects at position
+ * x, y, z or w respectively.
+ */
+static __inline struct brw_reg brw_swizzle( struct brw_reg reg,
+                                            unsigned x,
+                                            unsigned y,
+                                            unsigned z,
+                                            unsigned w)
+{
+   const unsigned prev = reg.dw1.bits.swizzle;
+   const unsigned sel_x = BRW_GET_SWZ(prev, x);
+   const unsigned sel_y = BRW_GET_SWZ(prev, y);
+   const unsigned sel_z = BRW_GET_SWZ(prev, z);
+   const unsigned sel_w = BRW_GET_SWZ(prev, w);
+
+   reg.dw1.bits.swizzle = BRW_SWIZZLE4(sel_x, sel_y, sel_z, sel_w);
+   return reg;
+}
+
+
+/* Replicate the existing swizzle's selection at position x into all
+ * four channels.
+ */
+static __inline struct brw_reg brw_swizzle1( struct brw_reg reg,
+                                             unsigned x )
+{
+   struct brw_reg splat = brw_swizzle(reg, x, x, x, x);
+
+   return splat;
+}
+
+/* Narrow reg's writemask: the result keeps only channels enabled in
+ * both the existing writemask and 'mask'.
+ */
+static __inline struct brw_reg brw_writemask( struct brw_reg reg,
+                                              unsigned mask )
+{
+   const unsigned existing = reg.dw1.bits.writemask;
+
+   reg.dw1.bits.writemask = existing & mask;
+   return reg;
+}
+
+/* Replace reg's writemask with 'mask', discarding the previous one. */
+static __inline struct brw_reg brw_set_writemask( struct brw_reg reg,
+                                                  unsigned mask )
+{
+   struct brw_reg out = reg;
+
+   out.dw1.bits.writemask = mask;
+   return out;
+}
+
+/* Toggle reg's negate flag, so applying this twice restores the
+ * original value.
+ */
+static __inline struct brw_reg negate( struct brw_reg reg )
+{
+   reg.negate = !reg.negate;
+   return reg;
+}
+
+/* Return reg with the absolute-value source modifier applied.
+ *
+ * A negate flag already present on reg is cleared: the hardware
+ * applies negate after abs, so leaving it set would make
+ * brw_abs(negate(r)) read as -|r| instead of |r|.  Callers that want
+ * -|r| can still write negate(brw_abs(r)).
+ */
+static __inline struct brw_reg brw_abs( struct brw_reg reg )
+{
+   reg.abs = 1;
+   reg.negate = 0;
+   return reg;
+}
+
+/***********************************************************************
+ */
+/* vec4 float GRF region addressed indirectly: subnr is stored
+ * unscaled, address_mode is BRW_ADDRESS_REGISTER_INDIRECT_REGISTER
+ * and 'offset' goes into the indirect_offset field.
+ */
+static __inline struct brw_reg brw_vec4_indirect( unsigned subnr,
+                                                  int offset )
+{
+   struct brw_reg ind = brw_vec4_grf(0, 0);
+
+   ind.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER;
+   ind.subnr = subnr;
+   ind.dw1.bits.indirect_offset = offset;
+   return ind;
+}
+
+/* Scalar float GRF region addressed indirectly: subnr is stored
+ * unscaled, address_mode is BRW_ADDRESS_REGISTER_INDIRECT_REGISTER
+ * and 'offset' goes into the indirect_offset field.
+ */
+static __inline struct brw_reg brw_vec1_indirect( unsigned subnr,
+                                                  int offset )
+{
+   struct brw_reg ind = brw_vec1_grf(0, 0);
+
+   ind.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER;
+   ind.subnr = subnr;
+   ind.dw1.bits.indirect_offset = offset;
+   return ind;
+}
+
+/* vec4 float indirect region through ptr, at ptr's own offset plus
+ * 'offset'.
+ */
+static __inline struct brw_reg deref_4f(struct brw_indirect ptr, int offset)
+{
+   const int total = ptr.addr_offset + offset;
+
+   return brw_vec4_indirect(ptr.addr_subnr, total);
+}
+
+/* Scalar float indirect region through ptr, at ptr's own offset plus
+ * 'offset'.
+ */
+static __inline struct brw_reg deref_1f(struct brw_indirect ptr, int offset)
+{
+   const int total = ptr.addr_offset + offset;
+
+   return brw_vec1_indirect(ptr.addr_subnr, total);
+}
+
+/* deref_4f() retyped to BRW_REGISTER_TYPE_B. */
+static __inline struct brw_reg deref_4b(struct brw_indirect ptr, int offset)
+{
+   struct brw_reg region = deref_4f(ptr, offset);
+
+   return retype(region, BRW_REGISTER_TYPE_B);
+}
+
+/* deref_1f() retyped to BRW_REGISTER_TYPE_UW. */
+static __inline struct brw_reg deref_1uw(struct brw_indirect ptr, int offset)
+{
+   struct brw_reg scalar = deref_1f(ptr, offset);
+
+   return retype(scalar, BRW_REGISTER_TYPE_UW);
+}
+
+/* deref_1f() retyped to BRW_REGISTER_TYPE_UD. */
+static __inline struct brw_reg deref_1ud(struct brw_indirect ptr, int offset)
+{
+   struct brw_reg scalar = deref_1f(ptr, offset);
+
+   return retype(scalar, BRW_REGISTER_TYPE_UD);
+}
+
+/* The address sub-register that ptr refers to (ptr's offset is not
+ * involved).
+ */
+static __inline struct brw_reg get_addr_reg(struct brw_indirect ptr)
+{
+   const unsigned subnr = ptr.addr_subnr;
+
+   return brw_address_reg(subnr);
+}
+
+/* Return a copy of ptr whose addr_offset is advanced by 'offset'. */
+static __inline struct brw_indirect brw_indirect_offset( struct brw_indirect ptr, int offset )
+{
+   ptr.addr_offset = ptr.addr_offset + offset;
+   return ptr;
+}
+
+/* Build a brw_indirect from an address sub-register number and an
+ * initial offset; the pad bits are zeroed.
+ */
+static __inline struct brw_indirect brw_indirect( unsigned addr_subnr, int offset )
+{
+   struct brw_indirect handle;
+
+   handle.pad = 0;
+   handle.addr_offset = offset;
+   handle.addr_subnr = addr_subnr;
+   return handle;
+}
+
+/* Address of slot p->nr_insn in p->store. */
+static __inline struct brw_instruction *current_insn( struct brw_compile *p)
+{
+   struct brw_instruction *slot = p->store + p->nr_insn;
+
+   return slot;
+}
+
+void brw_pop_insn_state( struct brw_compile *p );	/* step p->current back one stack slot */
+void brw_push_insn_state( struct brw_compile *p );	/* copy the current state into the next stack slot and select it */
+void brw_set_mask_control( struct brw_compile *p, unsigned value );	/* the brw_set_*() functions defined in brw_eu.c write the like-named header field of p->current */
+void brw_set_saturate( struct brw_compile *p, unsigned value );
+void brw_set_access_mode( struct brw_compile *p, unsigned access_mode );
+void brw_set_compression_control( struct brw_compile *p, boolean control );
+void brw_set_predicate_control_flag_value( struct brw_compile *p, unsigned value );
+void brw_set_predicate_control( struct brw_compile *p, unsigned pc );
+void brw_set_conditionalmod( struct brw_compile *p, unsigned conditional );
+
+void brw_init_compile( struct brw_compile *p );	/* reset p to an empty program with default state */
+const unsigned *brw_get_program( struct brw_compile *p, unsigned *sz );	/* appends 8 NOPs; *sz receives the size in bytes */
+
+
+struct brw_instruction *brw_alu1( struct brw_compile *p,
+				  unsigned opcode,
+				  struct brw_reg dest,
+				  struct brw_reg src );
+
+struct brw_instruction *brw_alu2(struct brw_compile *p,
+				 unsigned opcode,
+				 struct brw_reg dest,
+				 struct brw_reg src0,
+				 struct brw_reg src1 );
+
+/* Helpers for regular instructions:
+ *
+ * ALU1(OP) and ALU2(OP) each expand to the prototype of a function
+ * named brw_<OP> that takes the compile context, a destination
+ * register and one (ALU1) or two (ALU2) source registers, and returns
+ * a struct brw_instruction pointer.  Both macros are #undef'd again
+ * once the list below has been expanded.
+ */
+#define ALU1(OP)					\
+struct brw_instruction *brw_##OP(struct brw_compile *p,	\
+	      struct brw_reg dest,			\
+	      struct brw_reg src0);
+
+#define ALU2(OP)					\
+struct brw_instruction *brw_##OP(struct brw_compile *p,	\
+	      struct brw_reg dest,			\
+	      struct brw_reg src0,			\
+	      struct brw_reg src1);
+
+ALU1(MOV)
+ALU2(SEL)
+ALU1(NOT)
+ALU2(AND)
+ALU2(OR)
+ALU2(XOR)
+ALU2(SHR)
+ALU2(SHL)
+ALU2(RSR)
+ALU2(RSL)
+ALU2(ASR)
+ALU2(JMPI)
+ALU2(ADD)
+ALU2(MUL)
+ALU1(FRC)
+ALU1(RNDD)
+ALU2(MAC)
+ALU2(MACH)
+ALU1(LZD)
+ALU2(DP4)
+ALU2(DPH)
+ALU2(DP3)
+ALU2(DP2)
+ALU2(LINE)
+
+#undef ALU1
+#undef ALU2
+
+
+
+/* Helpers for SEND instruction:
+ */
+void brw_urb_WRITE(struct brw_compile *p,
+		   struct brw_reg dest,
+		   unsigned msg_reg_nr,
+		   struct brw_reg src0,
+		   boolean allocate,
+		   boolean used,
+		   unsigned msg_length,
+		   unsigned response_length,
+		   boolean eot,
+		   boolean writes_complete,
+		   unsigned offset,
+		   unsigned swizzle);
+
+void brw_fb_WRITE(struct brw_compile *p,
+		   struct brw_reg dest,
+		   unsigned msg_reg_nr,
+		   struct brw_reg src0,
+		   unsigned binding_table_index,
+		   unsigned msg_length,
+		   unsigned response_length,
+		   boolean eot);
+
+void brw_SAMPLE(struct brw_compile *p,
+		struct brw_reg dest,
+		unsigned msg_reg_nr,
+		struct brw_reg src0,
+		unsigned binding_table_index,
+		unsigned sampler,
+		unsigned writemask,
+		unsigned msg_type,
+		unsigned response_length,
+		unsigned msg_length,
+		boolean eot);
+
+void brw_math_16( struct brw_compile *p,
+		  struct brw_reg dest,
+		  unsigned function,
+		  unsigned saturate,
+		  unsigned msg_reg_nr,
+		  struct brw_reg src,
+		  unsigned precision );
+
+void brw_math( struct brw_compile *p,
+	       struct brw_reg dest,
+	       unsigned function,
+	       unsigned saturate,
+	       unsigned msg_reg_nr,
+	       struct brw_reg src,
+	       unsigned data_type,
+	       unsigned precision );
+
+void brw_dp_READ_16( struct brw_compile *p,
+		     struct brw_reg dest,
+		     unsigned msg_reg_nr,
+		     unsigned scratch_offset );
+
+void brw_dp_WRITE_16( struct brw_compile *p,
+		      struct brw_reg src,
+		      unsigned msg_reg_nr,
+		      unsigned scratch_offset );
+
+/* If/else/endif.  Works by manipulating the execution flags on each
+ * channel.
+ */
+struct brw_instruction *brw_IF(struct brw_compile *p,
+			       unsigned execute_size);
+
+struct brw_instruction *brw_ELSE(struct brw_compile *p,
+				 struct brw_instruction *if_insn);
+
+void brw_ENDIF(struct brw_compile *p,
+	       struct brw_instruction *if_or_else_insn);
+
+
+/* DO/WHILE loops:
+ */
+struct brw_instruction *brw_DO(struct brw_compile *p,
+			       unsigned execute_size);
+
+struct brw_instruction *brw_WHILE(struct brw_compile *p,
+	       struct brw_instruction *patch_insn);
+
+struct brw_instruction *brw_BREAK(struct brw_compile *p);
+struct brw_instruction *brw_CONT(struct brw_compile *p);
+/* Forward jumps:
+ */
+void brw_land_fwd_jump(struct brw_compile *p,
+		       struct brw_instruction *jmp_insn);
+
+
+
+void brw_NOP(struct brw_compile *p);
+
+/* Special case: there is never a destination, execution size will be
+ * taken from src0:
+ */
+void brw_CMP(struct brw_compile *p,
+	     struct brw_reg dest,
+	     unsigned conditional,
+	     struct brw_reg src0,
+	     struct brw_reg src1);
+
+void brw_print_reg( struct brw_reg reg );
+
+
+/***********************************************************************
+ * brw_eu_util.c:
+ */
+
+void brw_copy_indirect_to_indirect(struct brw_compile *p,
+				   struct brw_indirect dst_ptr,
+				   struct brw_indirect src_ptr,
+				   unsigned count);
+
+void brw_copy_from_indirect(struct brw_compile *p,
+			    struct brw_reg dst,
+			    struct brw_indirect ptr,
+			    unsigned count);
+
+void brw_copy4(struct brw_compile *p,
+	       struct brw_reg dst,
+	       struct brw_reg src,
+	       unsigned count);
+
+void brw_copy8(struct brw_compile *p,
+	       struct brw_reg dst,
+	       struct brw_reg src,
+	       unsigned count);
+
+void brw_math_invert( struct brw_compile *p,
+		      struct brw_reg dst,
+		      struct brw_reg src);
+
+void brw_set_src1( struct brw_instruction *insn,
+                          struct brw_reg reg );
+#endif
diff --git a/src/gallium/drivers/i965simple/brw_eu_debug.c b/src/gallium/drivers/i965simple/brw_eu_debug.c
new file mode 100644
index 0000000000..4adfb0c02f
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_eu_debug.c
@@ -0,0 +1,90 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+    
+
+#include "util/u_debug.h"
+
+#include "brw_eu.h"
+
+/* Print a short description of hwreg through debug_printf().
+ *
+ * Any abs / negate modifier is printed first.  Two common float GRF
+ * shapes get a compact name: an even-numbered register with subnr 0
+ * and an <8;8,1> region prints as "vecN", a <0;1,0> scalar prints as
+ * "sclN.M".  Everything else is printed in full as
+ * file, nr.subnr, <vstride;width,hstride> and type.
+ */
+void brw_print_reg( struct brw_reg hwreg )
+{
+   static const char *file[] = {
+      "arf",
+      "grf",
+      "msg",
+      "imm"
+   };
+
+   static const char *type[] = {
+      "ud",
+      "d",
+      "uw",
+      "w",
+      "ub",
+      "vf",
+      "hf",
+      "f"
+   };
+
+   const unsigned nr_types = sizeof(type) / sizeof(type[0]);
+   const int elem_sz = type_sz(hwreg.type);
+
+   debug_printf("%s%s", 
+		hwreg.abs ? "abs/" : "",
+		hwreg.negate ? "-" : "");
+     
+   if (hwreg.file == BRW_GENERAL_REGISTER_FILE &&
+       hwreg.nr % 2 == 0 &&
+       hwreg.subnr == 0 &&
+       hwreg.vstride == BRW_VERTICAL_STRIDE_8 &&
+       hwreg.width == BRW_WIDTH_8 &&
+       hwreg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
+       hwreg.type == BRW_REGISTER_TYPE_F) {
+      debug_printf("vec%d", hwreg.nr);
+   }
+   else if (hwreg.file == BRW_GENERAL_REGISTER_FILE &&
+	    hwreg.vstride == BRW_VERTICAL_STRIDE_0 &&
+	    hwreg.width == BRW_WIDTH_1 &&
+	    hwreg.hstride == BRW_HORIZONTAL_STRIDE_0 &&
+	    hwreg.type == BRW_REGISTER_TYPE_F) {      
+      debug_printf("scl%d.%d", hwreg.nr, hwreg.subnr / 4);
+   }
+   else {
+      /* hwreg.type is a 4-bit field but only eight names are tabulated,
+       * and type_sz() returns 0 for a type it does not know.  Fall back
+       * to "?" and to the raw byte subnr rather than reading past the
+       * table or dividing by zero.
+       */
+      debug_printf("%s%d.%d<%d;%d,%d>:%s", 
+		   file[hwreg.file],
+		   hwreg.nr,
+		   elem_sz ? hwreg.subnr / elem_sz : hwreg.subnr,
+		   hwreg.vstride ? (1<<(hwreg.vstride-1)) : 0,
+		   1<<hwreg.width,
+		   hwreg.hstride ? (1<<(hwreg.hstride-1)) : 0,		
+		   hwreg.type < nr_types ? type[hwreg.type] : "?");
+   }
+}
+
+
+
diff --git a/src/gallium/drivers/i965simple/brw_eu_emit.c b/src/gallium/drivers/i965simple/brw_eu_emit.c
new file mode 100644
index 0000000000..400a80b6fb
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_eu_emit.c
@@ -0,0 +1,1080 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "brw_eu.h"
+
+
+
+
+/***********************************************************************
+ * Internal helper for constructing instructions
+ */
+
+/* Exec size = operand width (compatible encodings); compressed 8-wide => 16. */
+static void guess_execution_size( struct brw_instruction *insn,
+                                  struct brw_reg reg )
+{
+   const int compressed =
+      (insn->header.compression_control == BRW_COMPRESSION_COMPRESSED);
+   insn->header.execution_size =
+      (compressed && reg.width == BRW_WIDTH_8) ? BRW_EXECUTE_16 : reg.width;
+}
+
+
+/* Encode the destination operand into insn->bits1 and derive the exec size. */
+static void brw_set_dest( struct brw_instruction *insn,
+                          struct brw_reg dest )
+{
+   const int align1 = (insn->header.access_mode == BRW_ALIGN_1);
+
+   /* Fields shared by every addressing/alignment combination: */
+   insn->bits1.da1.dest_reg_file = dest.file;
+   insn->bits1.da1.dest_reg_type = dest.type;
+   insn->bits1.da1.dest_address_mode = dest.address_mode;
+
+   if (dest.address_mode != BRW_ADDRESS_DIRECT) {
+      insn->bits1.ia1.dest_subreg_nr = dest.subnr;
+
+      /* The indirect offset field differs in size between align1 and align16: */
+      if (align1) {
+         insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
+         insn->bits1.ia1.dest_horiz_stride = BRW_HORIZONTAL_STRIDE_1;
+      }
+      else {
+         insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
+      }
+   }
+   else if (align1) {
+      insn->bits1.da1.dest_reg_nr = dest.nr;
+      insn->bits1.da1.dest_subreg_nr = dest.subnr;
+      insn->bits1.da1.dest_horiz_stride = BRW_HORIZONTAL_STRIDE_1;
+   }
+   else {
+      /* subnr is divided by 16 for the align16 encoding. */
+      insn->bits1.da1.dest_reg_nr = dest.nr;
+      insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
+      insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
+   }
+
+   /* Execution size follows dest.width and insn->compression_control: */
+   guess_execution_size(insn, dest);
+}
+
+/* Encode source operand 0: bits1/bits2 for a register, bits3 for an immediate. */
+static void brw_set_src0( struct brw_instruction *insn,
+                          struct brw_reg reg )
+{
+   const int align1 = (insn->header.access_mode == BRW_ALIGN_1);
+
+   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
+
+   insn->bits1.da1.src0_reg_file = reg.file;
+   insn->bits1.da1.src0_reg_type = reg.type;
+   insn->bits2.da1.src0_abs = reg.abs;
+   insn->bits2.da1.src0_negate = reg.negate;
+   insn->bits2.da1.src0_address_mode = reg.address_mode;
+
+   if (reg.file == BRW_IMMEDIATE_VALUE) {
+      insn->bits3.ud = reg.dw1.ud;
+
+      /* Required to set some fields in src1 as well: */
+      insn->bits1.da1.src1_reg_file = 0; /* arf */
+      insn->bits1.da1.src1_reg_type = reg.type;
+      return;
+   }
+
+   if (reg.address_mode == BRW_ADDRESS_DIRECT) {
+      if (align1) {
+         insn->bits2.da1.src0_subreg_nr = reg.subnr;
+         insn->bits2.da1.src0_reg_nr = reg.nr;
+      }
+      else {
+         /* subnr is divided by 16 for the align16 encoding. */
+         insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
+         insn->bits2.da16.src0_reg_nr = reg.nr;
+      }
+   }
+   else {
+      insn->bits2.ia1.src0_subreg_nr = reg.subnr;
+      /* As for the destination, the indirect offset has its own field in
+       * each access mode; it must not be written over the subregister.
+       */
+      if (align1)
+         insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
+      else
+         insn->bits2.ia16.src0_indirect_offset = reg.dw1.bits.indirect_offset;
+   }
+
+   if (align1) {
+      if (reg.width == BRW_WIDTH_1 &&
+          insn->header.execution_size == BRW_EXECUTE_1) {
+         /* Width-1 operand, exec size 1: force zero strides and width one. */
+         insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
+         insn->bits2.da1.src0_width = BRW_WIDTH_1;
+         insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
+      }
+      else {
+         insn->bits2.da1.src0_horiz_stride = reg.hstride;
+         insn->bits2.da1.src0_width = reg.width;
+         insn->bits2.da1.src0_vert_stride = reg.vstride;
+      }
+   }
+   else {
+      insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
+      insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
+      insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
+      insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
+
+      /* This is an oddity of the fact we're using the same
+       * descriptions for registers in align_16 as align_1: */
+      insn->bits2.da16.src0_vert_stride =
+         (reg.vstride == BRW_VERTICAL_STRIDE_8) ? BRW_VERTICAL_STRIDE_4 : reg.vstride;
+   }
+}
+
+
+/* Encode source operand 1 into insn->bits1/bits3.  Asserts that src0 is
+ * not an immediate and that a register operand is directly addressed.
+ */
+void brw_set_src1( struct brw_instruction *insn,
+                   struct brw_reg reg )
+{
+   const int align1 = (insn->header.access_mode == BRW_ALIGN_1);
+
+   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
+
+   insn->bits1.da1.src1_reg_file = reg.file;
+   insn->bits1.da1.src1_reg_type = reg.type;
+   insn->bits3.da1.src1_abs = reg.abs;
+   insn->bits3.da1.src1_negate = reg.negate;
+
+   /* Only src1 can be immediate in two-argument instructions. */
+   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
+
+   if (reg.file == BRW_IMMEDIATE_VALUE) {
+      /* Immediate value is stored through bits3.ud. */
+      insn->bits3.ud = reg.dw1.ud;
+      return;
+   }
+
+   /* This is a hardware restriction, which may or may not be lifted
+    * in the future:
+    */
+   assert (reg.address_mode == BRW_ADDRESS_DIRECT);
+
+   if (align1) {
+      insn->bits3.da1.src1_subreg_nr = reg.subnr;
+      insn->bits3.da1.src1_reg_nr = reg.nr;
+
+      if (reg.width == BRW_WIDTH_1 &&
+          insn->header.execution_size == BRW_EXECUTE_1) {
+         /* Width-1 operand, exec size 1: force zero strides and width one. */
+         insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
+         insn->bits3.da1.src1_width = BRW_WIDTH_1;
+         insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
+      }
+      else {
+         insn->bits3.da1.src1_horiz_stride = reg.hstride;
+         insn->bits3.da1.src1_width = reg.width;
+         insn->bits3.da1.src1_vert_stride = reg.vstride;
+      }
+   }
+   else {
+      /* subnr is divided by 16 for the align16 encoding. */
+      insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
+      insn->bits3.da16.src1_reg_nr = reg.nr;
+
+      insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
+      insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
+      insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
+      insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
+
+      /* This is an oddity of the fact we're using the same
+       * descriptions for registers in align_16 as align_1: */
+      insn->bits3.da16.src1_vert_stride =
+         (reg.vstride == BRW_VERTICAL_STRIDE_8) ? BRW_VERTICAL_STRIDE_4 : reg.vstride;
+   }
+}
+
+
+
+/* Fill in the SEND descriptor (bits3) for a message to the math unit. */
+static void brw_set_math_message( struct brw_instruction *insn,
+                                  unsigned msg_length, unsigned response_length,
+                                  unsigned function, unsigned integer_type,
+                                  boolean low_precision, boolean saturate,
+                                  unsigned dataType )
+{
+   brw_set_src1(insn, brw_imm_d(0));
+
+   /* Fields every message layout in this file carries: */
+   insn->bits3.math.msg_target = BRW_MESSAGE_TARGET_MATH;
+   insn->bits3.math.msg_length = msg_length;
+   insn->bits3.math.response_length = response_length;
+   insn->bits3.math.end_of_thread = 0;
+   /* Math specific fields: */
+   insn->bits3.math.function = function;
+   insn->bits3.math.int_type = integer_type;
+   insn->bits3.math.precision = low_precision;
+   insn->bits3.math.saturate = saturate;
+   insn->bits3.math.data_type = dataType;
+}
+
+/* Fill in the SEND descriptor (bits3) for a URB message.  The opcode
+ * field is always written as 0. */
+static void brw_set_urb_message( struct brw_instruction *insn,
+                                 boolean allocate, boolean used,
+                                 unsigned msg_length, unsigned response_length,
+                                 boolean end_of_thread, boolean complete,
+                                 unsigned offset, unsigned swizzle_control )
+{
+   brw_set_src1(insn, brw_imm_d(0));
+
+   /* Fields every message layout in this file carries: */
+   insn->bits3.urb.msg_target = BRW_MESSAGE_TARGET_URB;
+   insn->bits3.urb.msg_length = msg_length;
+   insn->bits3.urb.response_length = response_length;
+   insn->bits3.urb.end_of_thread = end_of_thread;
+   /* URB specific fields: */
+   insn->bits3.urb.opcode = 0;	/* ? */
+   insn->bits3.urb.offset = offset;
+   insn->bits3.urb.swizzle_control = swizzle_control;
+   insn->bits3.urb.allocate = allocate;
+   insn->bits3.urb.used = used;	/* ? */
+   insn->bits3.urb.complete = complete;
+}
+
+/* Fill in the SEND descriptor (bits3) for a data port write message. */
+static void brw_set_dp_write_message( struct brw_instruction *insn,
+                                      unsigned binding_table_index,
+                                      unsigned msg_control, unsigned msg_type,
+                                      unsigned msg_length,
+                                      unsigned pixel_scoreboard_clear,
+                                      unsigned response_length,
+                                      unsigned end_of_thread )
+{
+   brw_set_src1(insn, brw_imm_d(0));
+
+   insn->bits3.dp_write.binding_table_index = binding_table_index;
+   insn->bits3.dp_write.msg_control = msg_control;
+   insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear;
+   insn->bits3.dp_write.msg_type = msg_type;
+   insn->bits3.dp_write.send_commit_msg = 0;
+   insn->bits3.dp_write.response_length = response_length;
+   insn->bits3.dp_write.msg_length = msg_length;
+   insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
+   insn->bits3.dp_write.end_of_thread = end_of_thread; /* own view, not urb's */
+}
+
+/* Fill in the SEND descriptor (bits3) for a data port read message. */
+static void brw_set_dp_read_message( struct brw_instruction *insn,
+                                     unsigned binding_table_index,
+                                     unsigned msg_control, unsigned msg_type,
+                                     unsigned target_cache,
+                                     unsigned msg_length,
+                                     unsigned response_length,
+                                     unsigned end_of_thread )
+{
+   brw_set_src1(insn, brw_imm_d(0));
+
+   insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ;
+   insn->bits3.dp_read.msg_length = msg_length;
+   insn->bits3.dp_read.response_length = response_length;
+   insn->bits3.dp_read.end_of_thread = end_of_thread;
+   insn->bits3.dp_read.binding_table_index = binding_table_index;
+   insn->bits3.dp_read.msg_control = msg_control;
+   insn->bits3.dp_read.msg_type = msg_type;
+   insn->bits3.dp_read.target_cache = target_cache;
+}
+
+/* Fill in the SEND descriptor (bits3) for a sampler message (FLOAT32 return). */
+static void brw_set_sampler_message( struct brw_instruction *insn,
+                                     unsigned binding_table_index,
+                                     unsigned sampler, unsigned msg_type,
+                                     unsigned response_length,
+                                     unsigned msg_length,
+                                     boolean eot)
+{
+   brw_set_src1(insn, brw_imm_d(0));
+
+   insn->bits3.sampler.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
+   insn->bits3.sampler.msg_length = msg_length;
+   insn->bits3.sampler.response_length = response_length;
+   insn->bits3.sampler.end_of_thread = eot;
+   insn->bits3.sampler.binding_table_index = binding_table_index;
+   insn->bits3.sampler.sampler = sampler;
+   insn->bits3.sampler.msg_type = msg_type;
+   insn->bits3.sampler.return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
+}
+
+
+
+/* Claim the next slot in p->store, seeded from p->current, and set its opcode. */
+static struct brw_instruction *next_insn( struct brw_compile *p,
+                                          unsigned opcode )
+{
+   struct brw_instruction *slot;
+
+   assert(p->nr_insn + 1 < BRW_EU_MAX_INSN);
+
+   slot = &p->store[p->nr_insn++];
+   memcpy(slot, p->current, sizeof(*slot));
+   slot->header.opcode = opcode;
+
+   /* One-shot flag: once copied into an instruction it is cleared from the
+    * defaults, which are switched to normal predication. */
+   if (p->current->header.destreg__conditonalmod) {
+      p->current->header.destreg__conditonalmod = 0;
+      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
+   }
+
+   return slot;
+}
+
+/* Emit a one-source instruction with the given opcode, dest and src. */
+struct brw_instruction *brw_alu1( struct brw_compile *p,
+                                  unsigned opcode,
+                                  struct brw_reg dest,
+                                  struct brw_reg src )
+{
+   struct brw_instruction *inst = next_insn(p, opcode);
+   brw_set_dest(inst, dest);
+   brw_set_src0(inst, src);
+   return inst;
+}
+
+/* Emit a two-source instruction with the given opcode, dest, src0 and src1. */
+struct brw_instruction *brw_alu2(struct brw_compile *p,
+                                 unsigned opcode,
+                                 struct brw_reg dest,
+                                 struct brw_reg src0, struct brw_reg src1 )
+{
+   struct brw_instruction *inst = next_insn(p, opcode);
+   brw_set_dest(inst, dest);
+   brw_set_src0(inst, src0);
+   brw_set_src1(inst, src1);
+   return inst;
+}
+
+
+/***********************************************************************
+ * Convenience routines: ALU1/ALU2 stamp out a public brw_<OP>() wrapper
+ * around brw_alu1()/brw_alu2() for a given opcode name. */
+#define ALU1(OP)					\
+struct brw_instruction *brw_##OP(struct brw_compile *p,			\
+	      struct brw_reg dest,			\
+	      struct brw_reg src0)   			\
+{							\
+   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
+}
+
+#define ALU2(OP)					\
+struct brw_instruction *brw_##OP(struct brw_compile *p,			\
+	      struct brw_reg dest,			\
+	      struct brw_reg src0,			\
+	      struct brw_reg src1)   			\
+{							\
+   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
+}
+
+/* Public emitters: each line below defines brw_<OP>() via ALU1 or ALU2. */
+ALU1(MOV)
+ALU2(SEL)
+ALU1(NOT)
+ALU2(AND)
+ALU2(OR)
+ALU2(XOR)
+ALU2(SHR)
+ALU2(SHL)
+ALU2(RSR)
+ALU2(RSL)
+ALU2(ASR)
+ALU2(ADD)
+ALU2(MUL)
+ALU1(FRC)
+ALU1(RNDD)
+ALU2(MAC)
+ALU2(MACH)
+ALU1(LZD)
+ALU2(DP4)
+ALU2(DPH)
+ALU2(DP3)
+ALU2(DP2)
+ALU2(LINE)
+
+/* Emit a NOP; dest and src0 are brw_vec4_grf(0,0) as UD, src1 a zero immediate. */
+void brw_NOP(struct brw_compile *p)
+{
+   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
+   const struct brw_reg r0_ud = retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD);
+
+   brw_set_dest(insn, r0_ud);
+   brw_set_src0(insn, r0_ud);
+   brw_set_src1(insn, brw_imm_ud(0x0));
+}
+
+
+
+
+
+/***********************************************************************
+ * Comparisons, if/else/endif
+ */
+
+/* Emit a JMPI, then reset the default predicate control to NONE. */
+struct brw_instruction *brw_JMPI(struct brw_compile *p,
+                                 struct brw_reg dest,
+                                 struct brw_reg src0,
+                                 struct brw_reg src1)
+{
+   struct brw_instruction *jmp = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
+
+   p->current->header.predicate_control = BRW_PREDICATE_NONE;
+   return jmp;
+}
+
+/* EU takes the value from the flag register and pushes it onto some
+ * sort of a stack (presumably merging with any flag value already on
+ * the stack).  Within an if block, the flags at the top of the stack
+ * control execution on each channel of the unit, eg. on each of the
+ * 16 pixel values in our wm programs.
+ *
+ * When the matching 'else' instruction is reached (presumably by
+ * countdown of the instruction count patched in by our ELSE/ENDIF
+ * functions), the relevant flags are inverted.
+ *
+ * When the matching 'endif' instruction is reached, the flags are
+ * popped off.  If the stack is now empty, normal execution resumes.
+ *
+ * No attempt is made to deal with stack overflow (14 elements?).
+ */
+struct brw_instruction *brw_IF(struct brw_compile *p, unsigned execute_size)
+{
+   struct brw_instruction *insn;
+
+   /* In single program flow mode the IF is emitted as an ADD on the ip
+    * register with the predicate inverted, and must be scalar.  The
+    * immediate is left zero here and patched by brw_ELSE()/brw_ENDIF().
+    */
+   assert(!p->single_program_flow || execute_size == BRW_EXECUTE_1);
+
+   insn = next_insn(p, p->single_program_flow ? BRW_OPCODE_ADD : BRW_OPCODE_IF);
+   if (p->single_program_flow)
+      insn->header.predicate_inverse = 1;
+
+   /* Override the defaults for this instruction: */
+   brw_set_dest(insn, brw_ip_reg());
+   brw_set_src0(insn, brw_ip_reg());
+   brw_set_src1(insn, brw_imm_d(0x0));
+
+   insn->header.execution_size = execute_size;
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
+   insn->header.mask_control = BRW_MASK_ENABLE;
+
+   /* Instructions emitted after the IF start out unpredicated. */
+   p->current->header.predicate_control = BRW_PREDICATE_NONE;
+   return insn;
+}
+
+/* Emit the ELSE of an if/else/endif and patch the matching IF to reach it. */
+struct brw_instruction *brw_ELSE(struct brw_compile *p,
+                                 struct brw_instruction *if_insn)
+{
+   const unsigned opcode =
+      p->single_program_flow ? BRW_OPCODE_ADD : BRW_OPCODE_ELSE;
+   struct brw_instruction *insn = next_insn(p, opcode);
+
+   brw_set_dest(insn, brw_ip_reg());
+   brw_set_src0(insn, brw_ip_reg());
+   brw_set_src1(insn, brw_imm_d(0x0));
+
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.execution_size = if_insn->header.execution_size;
+   insn->header.mask_control = BRW_MASK_ENABLE;
+
+   /* Patch the if instruction to point at this instruction.
+    */
+   if (!p->single_program_flow) {
+      assert(if_insn->header.opcode == BRW_OPCODE_IF);
+
+      /* Distance from the IF to this ELSE, counted in instructions. */
+      if_insn->bits3.if_else.jump_count = insn - if_insn;
+      if_insn->bits3.if_else.pop_count = 1;
+      if_insn->bits3.if_else.pad0 = 0;
+      return insn;
+   }
+
+   /* Single program flow: the "IF" is an ADD on ip, so its immediate
+    * becomes the distance to the instruction after this one, times 16.
+    */
+   assert(if_insn->header.opcode == BRW_OPCODE_ADD);
+   if_insn->bits3.ud = (insn - if_insn + 1) * 16;
+
+   return insn;
+}
+
+/* Close an if/else/endif: patch the pending IF or ELSE (patch_insn). */
+void brw_ENDIF(struct brw_compile *p,
+               struct brw_instruction *patch_insn)
+{
+   struct brw_instruction *insn;
+
+   if (p->single_program_flow) {
+      /* In single program flow mode, there's no need to execute an ENDIF,
+       * since we don't need to do any stack operations, and if we're executing
+       * currently, we want to just continue executing. */
+      struct brw_instruction *next = &p->store[p->nr_insn];
+      assert(patch_insn->header.opcode == BRW_OPCODE_ADD);
+      patch_insn->bits3.ud = (next - patch_insn) * 16;
+      return;
+   }
+
+   insn = next_insn(p, BRW_OPCODE_ENDIF);
+
+   brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
+   brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
+   brw_set_src1(insn, brw_imm_d(0x0));
+
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.execution_size = patch_insn->header.execution_size;
+   insn->header.mask_control = BRW_MASK_ENABLE;
+
+   assert(patch_insn->bits3.if_else.jump_count == 0);
+   /* Patch the if or else instructions to point at this or the next
+    * instruction respectively. */
+   switch (patch_insn->header.opcode) {
+   case BRW_OPCODE_IF:
+      /* Automagically turn it into an IFF: */
+      patch_insn->header.opcode = BRW_OPCODE_IFF;
+      patch_insn->bits3.if_else.jump_count = insn - patch_insn + 1;
+      patch_insn->bits3.if_else.pop_count = 0;
+      patch_insn->bits3.if_else.pad0 = 0;
+      break;
+   case BRW_OPCODE_ELSE:
+      patch_insn->bits3.if_else.jump_count = insn - patch_insn + 1;
+      patch_insn->bits3.if_else.pop_count = 1;
+      patch_insn->bits3.if_else.pad0 = 0;
+      break;
+   default:
+      assert(0);
+   }
+
+   /* Also pop item off the stack in the endif instruction: */
+   insn->bits3.if_else.jump_count = 0;
+   insn->bits3.if_else.pop_count = 1;
+   insn->bits3.if_else.pad0 = 0;
+}
+
+/* Emit a BREAK: exec size 8, uncompressed, mask control set to DISABLE. */
+struct brw_instruction *brw_BREAK(struct brw_compile *p)
+{
+   struct brw_instruction *brk = next_insn(p, BRW_OPCODE_BREAK);
+   brw_set_dest(brk, brw_ip_reg());
+   brw_set_src0(brk, brw_ip_reg());
+   brw_set_src1(brk, brw_imm_d(0x0));
+   brk->header.execution_size = BRW_EXECUTE_8;
+   brk->header.compression_control = BRW_COMPRESSION_NONE;
+   brk->header.mask_control = BRW_MASK_DISABLE;
+   brk->bits3.if_else.pad0 = 0;
+   return brk;
+}
+
+/* Emit a CONTINUE: exec size 8, uncompressed, mask control set to DISABLE. */
+struct brw_instruction *brw_CONT(struct brw_compile *p)
+{
+   struct brw_instruction *cont = next_insn(p, BRW_OPCODE_CONTINUE);
+   brw_set_dest(cont, brw_ip_reg());
+   brw_set_src0(cont, brw_ip_reg());
+   brw_set_src1(cont, brw_imm_d(0x0));
+   cont->header.execution_size = BRW_EXECUTE_8;
+   cont->header.compression_control = BRW_COMPRESSION_NONE;
+   cont->header.mask_control = BRW_MASK_DISABLE;
+   cont->bits3.if_else.pad0 = 0;
+   return cont;
+}
+
+/* DO/WHILE loop:
+ */
+struct brw_instruction *brw_DO(struct brw_compile *p, unsigned execute_size)
+{
+   struct brw_instruction *insn;
+
+   /* Single program flow emits no DO: the loop head is just the next
+    * instruction slot, which is what gets returned. */
+   if (p->single_program_flow)
+      return &p->store[p->nr_insn];
+
+   insn = next_insn(p, BRW_OPCODE_DO);
+
+   /* Override the defaults for this instruction: */
+   brw_set_dest(insn, brw_null_reg());
+   brw_set_src0(insn, brw_null_reg());
+   brw_set_src1(insn, brw_null_reg());
+
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.execution_size = execute_size;
+   insn->header.predicate_control = BRW_PREDICATE_NONE;
+   insn->header.mask_control = BRW_MASK_DISABLE;
+   return insn;
+}
+
+
+/* Close a loop opened by brw_DO(): emit the jump back to do_insn. */
+struct brw_instruction *brw_WHILE(struct brw_compile *p,
+                                  struct brw_instruction *do_insn)
+{
+   const unsigned opcode =
+      p->single_program_flow ? BRW_OPCODE_ADD : BRW_OPCODE_WHILE;
+   struct brw_instruction *insn = next_insn(p, opcode);
+
+   brw_set_dest(insn, brw_ip_reg());
+   brw_set_src0(insn, brw_ip_reg());
+   brw_set_src1(insn, brw_imm_d(0x0));
+
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+
+   if (!p->single_program_flow) {
+      insn->header.execution_size = do_insn->header.execution_size;
+
+      /* jump_count is the distance to the DO, counted in instructions. */
+      assert(do_insn->header.opcode == BRW_OPCODE_DO);
+      insn->bits3.if_else.jump_count = do_insn - insn;
+      insn->bits3.if_else.pop_count = 0;
+      insn->bits3.if_else.pad0 = 0;
+   }
+   else {
+      /* Scalar ADD on ip; the instruction distance is scaled by 16. */
+      insn->header.execution_size = BRW_EXECUTE_1;
+      insn->bits3.d = (do_insn - insn) * 16;
+   }
+
+   insn->header.mask_control = BRW_MASK_DISABLE;
+
+   /* Instructions emitted after the loop start out unpredicated. */
+   p->current->header.predicate_control = BRW_PREDICATE_NONE;
+
+   return insn;
+}
+
+
+/* FORWARD JUMPS: aim an already emitted JMPI at the next instruction that
+ * will be emitted. */
+void brw_land_fwd_jump(struct brw_compile *p,
+                       struct brw_instruction *jmp_insn)
+{
+   struct brw_instruction *landing = &p->store[p->nr_insn];
+
+   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
+   /* Check only (==, not =): the distance is stored through bits3.ud. */
+   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
+   jmp_insn->bits3.ud = (landing - jmp_insn) - 1;
+}
+
+
+
+/* To integrate with the above, it makes sense that the comparison
+ * instruction should populate the flag register.  It might be simpler
+ * just to use the flag reg for most WM tasks?
+ */
+void brw_CMP(struct brw_compile *p,
+             struct brw_reg dest,
+             unsigned conditional,
+             struct brw_reg src0,
+             struct brw_reg src1)
+{
+   struct brw_instruction *cmp = next_insn(p, BRW_OPCODE_CMP);
+   /* Does the compare target register 0 of the architecture file? */
+   const int arf0_dest = (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
+                          dest.nr == 0);
+
+   /* The condition to evaluate goes in the conditional-modifier field. */
+   cmp->header.destreg__conditonalmod = conditional;
+   brw_set_dest(cmp, dest);
+   brw_set_src0(cmp, src0);
+   brw_set_src1(cmp, src1);
+
+   /* Make it so that future instructions will use the computed flag
+    * value until brw_set_predicate_control_flag_value() is called
+    * again.
+    */
+   if (arf0_dest) {
+      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
+      p->flag_value = 0xff;
+   }
+}
+
+
+
+/***********************************************************************
+ * Helpers for the various SEND message types:
+ */
+
+/* Emit a SEND to the math unit applying 'function' to src, with the result
+ * going to dest.  msg_reg_nr is stored in the destreg/conditional-modifier
+ * header field.  POW uses a message length of 2, SINCOS a response of 2.
+ */
+void brw_math( struct brw_compile *p,
+               struct brw_reg dest,
+               unsigned function,
+               unsigned saturate,
+               unsigned msg_reg_nr,
+               struct brw_reg src,
+               unsigned data_type,
+               unsigned precision )
+{
+   struct brw_instruction *send = next_insn(p, BRW_OPCODE_SEND);
+   const unsigned msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
+   const unsigned response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;
+
+   /* Example code doesn't set predicate_control for send
+    * instructions.
+    */
+   send->header.predicate_control = 0;
+   send->header.destreg__conditonalmod = msg_reg_nr;
+
+   brw_set_dest(send, dest);
+   brw_set_src0(send, src);
+   brw_set_math_message(send,
+                        msg_length, response_length,
+                        function,
+                        BRW_MATH_INTEGER_UNSIGNED,
+                        precision, saturate, data_type);
+}
+
+/* Use 2 send instructions to apply a math function to 16 elements.
+ * The first SEND uses msg_reg_nr and dest, the second msg_reg_nr+1 and
+ * offset(dest,1); the second is also tagged BRW_COMPRESSION_2NDHALF.
+ * Both are given the same src operand.
+ */
+void brw_math_16( struct brw_compile *p,
+                  struct brw_reg dest,
+                  unsigned function,
+                  unsigned saturate,
+                  unsigned msg_reg_nr,
+                  struct brw_reg src,
+                  unsigned precision )
+{
+   const unsigned msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
+   const unsigned response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;
+   unsigned half;
+
+   /* Both SENDs are emitted under a saved instruction state that is
+    * restored at the end: flag value 0xff, compression off.
+    */
+   brw_push_insn_state(p);
+   brw_set_predicate_control_flag_value(p, 0xff);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+
+   for (half = 0; half < 2; half++) {
+      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+      struct brw_reg half_dest = (half == 1) ? offset(dest,1) : dest;
+
+      /* Only the second instruction overrides the compression control;
+       * the first keeps what it inherited from the state set above.
+       */
+      if (half == 1) {
+         insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
+      }
+
+      insn->header.destreg__conditonalmod = msg_reg_nr + half;
+
+      brw_set_dest(insn, half_dest);
+      brw_set_src0(insn, src);
+
+      brw_set_math_message(insn,
+                           msg_length, response_length,
+                           function,
+                           BRW_MATH_INTEGER_UNSIGNED,
+                           precision,
+                           saturate,
+                           BRW_MATH_DATA_VECTOR);
+   }
+
+   brw_pop_insn_state(p);
+}
+
+
+
+
+/* Emit an oword block write SEND (binding table index 255) for 'src'.
+ * scratch_offset is first moved into brw_vec1_grf(0, 2), retyped to D.
+ */
+void brw_dp_WRITE_16( struct brw_compile *p,
+                      struct brw_reg src,
+                      unsigned msg_reg_nr,
+                      unsigned scratch_offset )
+{
+   const unsigned msg_length = 3;
+   const struct brw_reg dest = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
+   struct brw_instruction *insn;
+
+   /* Stage the offset with mask control disabled and compression off;
+    * the surrounding instruction state is saved and restored. */
+   brw_push_insn_state(p);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_MOV(p,
+           retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_D),
+           brw_imm_d(scratch_offset));
+   brw_pop_insn_state(p);
+
+   /* The write itself: */
+   insn = next_insn(p, BRW_OPCODE_SEND);
+   insn->header.predicate_control = 0; /* XXX */
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.destreg__conditonalmod = msg_reg_nr;
+
+   brw_set_dest(insn, dest);
+   brw_set_src0(insn, src);
+
+   brw_set_dp_write_message(insn,
+                            255, /* bti */
+                            BRW_DATAPORT_OWORD_BLOCK_4_OWORDS, /* msg_control */
+                            BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, /* msg_type */
+                            msg_length,
+                            0, /* pixel scoreboard */
+                            0, /* response_length */
+                            0); /* eot */
+}
+
+
+/* Emit an oword block read SEND (binding table index 255) into 'dest'.
+ * scratch_offset is first moved into brw_vec1_grf(0, 2), retyped to D.
+ */
+void brw_dp_READ_16( struct brw_compile *p,
+                     struct brw_reg dest,
+                     unsigned msg_reg_nr,
+                     unsigned scratch_offset )
+{
+   struct brw_instruction *insn;
+
+   /* Stage the offset with compression off and mask control disabled. */
+   brw_push_insn_state(p);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   brw_MOV(p,
+           retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_D),
+           brw_imm_d(scratch_offset));
+   brw_pop_insn_state(p);
+
+   /* The read itself: */
+   insn = next_insn(p, BRW_OPCODE_SEND);
+   insn->header.predicate_control = 0; /* XXX */
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.destreg__conditonalmod = msg_reg_nr;
+
+   brw_set_dest(insn, dest);	/* UW? */
+   brw_set_src0(insn, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));
+
+   brw_set_dp_read_message(insn,
+                           255, /* bti */
+                           3,  /* msg_control */
+                           BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
+                           1, /* target cache */
+                           1, /* msg_length */
+                           2, /* response_length */
+                           0); /* eot */
+}
+
+
+/* Emit a render target write SEND (SIMD16 single source, scoreboard clear). */
+void brw_fb_WRITE(struct brw_compile *p,
+                  struct brw_reg dest,
+                  unsigned msg_reg_nr,
+                  struct brw_reg src0,
+                  unsigned binding_table_index,
+                  unsigned msg_length, unsigned response_length,
+                  boolean eot)
+{
+   struct brw_instruction *send = next_insn(p, BRW_OPCODE_SEND);
+
+   send->header.destreg__conditonalmod = msg_reg_nr;
+   send->header.compression_control = BRW_COMPRESSION_NONE;
+   send->header.predicate_control = 0; /* XXX */
+
+   brw_set_dest(send, dest);
+   brw_set_src0(send, src0);
+   brw_set_dp_write_message(send,
+                            binding_table_index,
+                            BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE, /* msg_control */
+                            BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE, /* msg_type */
+                            msg_length,
+                            1, /* pixel scoreboard */
+                            response_length,
+                            eot);
+}
+
+
+/* Emit a sampler SEND, with a workaround for partial writemasks (see below). */
+void brw_SAMPLE(struct brw_compile *p,
+                struct brw_reg dest,
+                unsigned msg_reg_nr,
+                struct brw_reg src0,
+                unsigned binding_table_index,
+                unsigned sampler,
+                unsigned writemask,
+                unsigned msg_type,
+                unsigned response_length,
+                unsigned msg_length,
+                boolean eot)
+{
+   boolean need_stall = 0;
+
+   if(writemask == 0) {
+      /* No channel is wanted: emit nothing at all. */
+      return;
+   }
+
+   /* Hardware doesn't do destination dependency checking on send
+    * instructions properly.  Add a workaround which generates the
+    * dependency by other means.  In practice it seems like this bug
+    * only crops up for texture samples, and only where registers are
+    * written by the send and then written again later without being
+    * read in between.  Luckily for us, we already track that
+    * information and use it to modify the writemask for the
+    * instruction, so that is a guide for whether a workaround is
+    * needed.
+    */
+   if (writemask != TGSI_WRITEMASK_XYZW) {
+      unsigned dst_offset = 0;
+      unsigned i, newmask = 0, len = 0;
+      /* Skip leading disabled channels, then gather the enabled run after them. */
+      for (i = 0; i < 4; i++) {
+         if (writemask & (1<<i))
+            break;
+         dst_offset += 2;
+      }
+      for (; i < 4; i++) {
+         if (!(writemask & (1<<i)))
+            break;
+         newmask |= 1<<i;
+         len++;
+      }
+
+      if (newmask != writemask) {
+         need_stall = 1;
+         /* Enabled channels are not one contiguous run: stall after the send. */
+      }
+      else {
+         struct brw_reg m1 = brw_message_reg(msg_reg_nr);
+         /* m1 = copy of brw_vec8_grf(0,0) with the inverted mask << 12 in dword 2. */
+         newmask = ~newmask & TGSI_WRITEMASK_XYZW;
+
+         brw_push_insn_state(p);
+
+         brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+         brw_set_mask_control(p, BRW_MASK_DISABLE);
+
+         brw_MOV(p, m1, brw_vec8_grf(0,0));
+         brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
+
+         brw_pop_insn_state(p);
+
+         src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
+         dest = offset(dest, dst_offset);
+         response_length = len * 2;
+      }
+   }
+
+   {
+      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+
+      insn->header.predicate_control = 0; /* XXX */
+      insn->header.compression_control = BRW_COMPRESSION_NONE;
+      insn->header.destreg__conditonalmod = msg_reg_nr;
+
+      brw_set_dest(insn, dest);
+      brw_set_src0(insn, src0);
+      brw_set_sampler_message(insn,
+                              binding_table_index,
+                              sampler,
+                              msg_type,
+                              response_length,
+                              msg_length,
+                              eot);
+   }
+
+   if (need_stall)
+   {
+      struct brw_reg reg = vec8(offset(dest, response_length-1));
+
+      /*  mov (8) r9.0<1>:f    r9.0<8;8,1>:f    { Align1 }
+       */
+      brw_push_insn_state(p);
+      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+      brw_MOV(p, reg, reg);
+      brw_pop_insn_state(p);
+   }
+
+}
+
+/* Emit a SEND to the URB; the descriptor is built by brw_set_urb_message().
+ * All these variables are pretty confusing - we might be better off
+ * using bitmasks and macros for this, in the old style.  Or perhaps
+ * just having the caller instantiate the fields in dword3 itself. */
+void brw_urb_WRITE(struct brw_compile *p,
+                   struct brw_reg dest,
+                   unsigned msg_reg_nr,
+                   struct brw_reg src0,
+                   boolean allocate,
+                   boolean used,
+                   unsigned msg_length,
+                   unsigned response_length,
+                   boolean eot,
+                   boolean writes_complete,
+                   unsigned offset,
+                   unsigned swizzle)
+{
+   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+
+   assert(msg_length < 16);
+
+   brw_set_dest(insn, dest);
+   brw_set_src0(insn, src0);
+   /* src1 is set up as a zero immediate by brw_set_urb_message() below. */
+
+   insn->header.destreg__conditonalmod = msg_reg_nr;
+
+   brw_set_urb_message(insn,
+                       allocate,
+                       used,
+                       msg_length,
+                       response_length,
+                       eot,
+                       writes_complete,
+                       offset,
+                       swizzle);
+}
+
diff --git a/src/gallium/drivers/i965simple/brw_eu_util.c b/src/gallium/drivers/i965simple/brw_eu_util.c
new file mode 100644
index 0000000000..3a65b141f0
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_eu_util.c
@@ -0,0 +1,126 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+      
+
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "brw_eu.h"
+
+
+/* Emit a brw_math() instruction applying BRW_MATH_FUNCTION_INV to src and
+ * writing the result to dst, with no saturation, full precision and
+ * vector data.
+ */
+void brw_math_invert( struct brw_compile *p,
+                      struct brw_reg dst,
+                      struct brw_reg src)
+{
+   brw_math(p, dst,
+            BRW_MATH_FUNCTION_INV, BRW_MATH_SATURATE_NONE,
+            0, src,
+            BRW_MATH_PRECISION_FULL, BRW_MATH_DATA_VECTOR);
+}
+
+
+
+/* Copy `count` consecutive 32-byte chunks from src to dst.  Both regions
+ * are viewed as vec4, so each chunk takes two MOVs: one at byte offset
+ * 0 of the chunk and one at byte offset 16.
+ */
+void brw_copy4(struct brw_compile *p,
+	       struct brw_reg dst,
+	       struct brw_reg src,
+	       unsigned count)
+{
+   const struct brw_reg to = vec4(dst);
+   const struct brw_reg from = vec4(src);
+   unsigned chunk;
+
+   for (chunk = 0; chunk < count; chunk++) {
+      const unsigned lo = chunk * 32;
+      const unsigned hi = lo + 16;
+
+      brw_MOV(p, byte_offset(to, lo), byte_offset(from, lo));
+      brw_MOV(p, byte_offset(to, hi), byte_offset(from, hi));
+   }
+}
+
+
+/* Copy `count` consecutive 32-byte chunks from src to dst.  Both regions
+ * are viewed as vec8, so a single MOV moves each chunk.
+ */
+void brw_copy8(struct brw_compile *p,
+	       struct brw_reg dst,
+	       struct brw_reg src,
+	       unsigned count)
+{
+   const struct brw_reg to = vec8(dst);
+   const struct brw_reg from = vec8(src);
+   unsigned chunk;
+
+   for (chunk = 0; chunk < count; chunk++) {
+      const unsigned where = chunk * 32;
+
+      brw_MOV(p, byte_offset(to, where), byte_offset(from, where));
+   }
+}
+
+
+/* Copy `count` consecutive 32-byte chunks between two indirectly
+ * addressed regions.  Each chunk is moved with two MOVs through
+ * deref_4f(), at byte offsets 0 and 16 of the chunk.
+ */
+void brw_copy_indirect_to_indirect(struct brw_compile *p,
+				   struct brw_indirect dst_ptr,
+				   struct brw_indirect src_ptr,
+				   unsigned count)
+{
+   unsigned chunk;
+
+   for (chunk = 0; chunk < count; chunk++) {
+      const unsigned lo = chunk * 32;
+      const unsigned hi = lo + 16;
+
+      brw_MOV(p, deref_4f(dst_ptr, lo), deref_4f(src_ptr, lo));
+      brw_MOV(p, deref_4f(dst_ptr, hi), deref_4f(src_ptr, hi));
+   }
+}
+
+
+/* Copy `count` consecutive 32-byte chunks from an indirectly addressed
+ * source into dst (viewed as vec4).  Each chunk is moved with two MOVs,
+ * at byte offsets 0 and 16 of the chunk.
+ */
+void brw_copy_from_indirect(struct brw_compile *p,
+			    struct brw_reg dst,
+			    struct brw_indirect ptr,
+			    unsigned count)
+{
+   const struct brw_reg to = vec4(dst);
+   unsigned chunk;
+
+   for (chunk = 0; chunk < count; chunk++) {
+      const unsigned lo = chunk * 32;
+      const unsigned hi = lo + 16;
+
+      brw_MOV(p, byte_offset(to, lo), deref_4f(ptr, lo));
+      brw_MOV(p, byte_offset(to, hi), deref_4f(ptr, hi));
+   }
+}
+
+
+
+
diff --git a/src/gallium/drivers/i965simple/brw_flush.c b/src/gallium/drivers/i965simple/brw_flush.c
new file mode 100644
index 0000000000..e6001c30d9
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_flush.c
@@ -0,0 +1,73 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Author:
+ *    Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "pipe/p_defines.h"
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "brw_batch.h"
+
+
+/* pipe_context::flush hook.
+ *
+ * When the caller asks for a render-cache and/or texture-cache flush, an
+ * MI_FLUSH command is added to the batch first.  The batch is then
+ * flushed unconditionally, handing `fence` to FLUSH_BATCH().
+ */
+static void brw_flush( struct pipe_context *pipe,
+                       unsigned flags,
+                       struct pipe_fence_handle **fence )
+{
+   struct brw_context *brw = brw_context(pipe);
+   const unsigned want_render  = flags & PIPE_FLUSH_RENDER_CACHE;
+   const unsigned want_texture = flags & PIPE_FLUSH_TEXTURE_CACHE;
+
+   /* Only emit MI_FLUSH when a hardware cache flush was requested. */
+   if (want_render || want_texture) {
+      struct brw_mi_flush cmd;
+      unsigned mi_flags = 0;
+
+      /* The render cache is only flushed when asked for. */
+      if (!want_render)
+         mi_flags |= BRW_INHIBIT_FLUSH_RENDER_CACHE;
+
+      if (want_texture)
+         mi_flags |= BRW_FLUSH_READ_CACHE;
+
+      memset(&cmd, 0, sizeof(cmd));
+      cmd.opcode = CMD_MI_FLUSH;
+      cmd.flags = mi_flags;
+
+      BRW_BATCH_STRUCT(brw, &cmd);
+   }
+
+   /* With or without flags, push pending commands to the hardware. */
+   FLUSH_BATCH( fence );
+}
+
+
+
+void brw_init_flush_functions( struct brw_context *brw ) /* install brw_flush() as this context's pipe.flush hook */
+{
+   brw->pipe.flush = brw_flush;
+}
diff --git a/src/gallium/drivers/i965simple/brw_gs.c b/src/gallium/drivers/i965simple/brw_gs.c
new file mode 100644
index 0000000000..de60868ccc
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_gs.c
@@ -0,0 +1,196 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "brw_defines.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_util.h"
+#include "brw_state.h"
+#include "brw_gs.h"
+
+
+
+/* Build the GS program selected by `key` and upload it to the
+ * BRW_GS_PROG cache, recording the resulting offset and program data in
+ * brw->gs.
+ *
+ * Returns without uploading anything when the key's primitive has no
+ * generator: points, lines and triangles only get one when
+ * key->hint_gs_always is set, and unknown primitives never do.
+ */
+static void compile_gs_prog( struct brw_context *brw,
+			     struct brw_gs_prog_key *key )
+{
+   struct brw_gs_compile gsc;
+   const unsigned *prog;
+   unsigned prog_size;
+
+   memset(&gsc, 0, sizeof(gsc));
+   gsc.key = *key;
+
+   /* Vertex size, expressed three ways.  Two attributes share a register
+    * and one more register is added on top (presumably the vertex
+    * header - are vertices packed, or reg-aligned?).
+    */
+   gsc.nr_attrs = brw_count_bits(gsc.key.attrs);
+   gsc.nr_regs  = (gsc.nr_attrs + 1) / 2 + 1;
+   gsc.nr_bytes = gsc.nr_regs * REG_SIZE;
+
+   /* Begin the compilation:
+    */
+   brw_init_compile(&gsc.func);
+   gsc.func.single_program_flow = 1;
+
+   /* For some reason the thread is spawned with only 4 channels
+    * unmasked.
+    */
+   brw_set_mask_control(&gsc.func, BRW_MASK_DISABLE);
+
+   /* Note that primitives which don't require a GS program have
+    * already been weeded out by this stage:
+    */
+   switch (key->primitive) {
+   case PIPE_PRIM_QUADS:
+      brw_gs_quads( &gsc );
+      break;
+
+   case PIPE_PRIM_QUAD_STRIP:
+      brw_gs_quad_strip( &gsc );
+      break;
+
+   case PIPE_PRIM_LINE_LOOP:
+      brw_gs_lines( &gsc );
+      break;
+
+   case PIPE_PRIM_LINES:
+      if (!key->hint_gs_always)
+         return;
+      brw_gs_lines( &gsc );
+      break;
+
+   case PIPE_PRIM_TRIANGLES:
+      if (!key->hint_gs_always)
+         return;
+      brw_gs_tris( &gsc );
+      break;
+
+   case PIPE_PRIM_POINTS:
+      if (!key->hint_gs_always)
+         return;
+      brw_gs_points( &gsc );
+      break;
+
+   default:
+      return;
+   }
+
+   /* Fetch the assembled program and upload it together with its key and
+    * program data.
+    */
+   prog = brw_get_program(&gsc.func, &prog_size);
+
+   brw->gs.prog_gs_offset = brw_upload_cache( &brw->cache[BRW_GS_PROG],
+                                              &gsc.key,
+                                              sizeof(gsc.key),
+                                              prog,
+                                              prog_size,
+                                              &gsc.prog_data,
+                                              &brw->gs.prog_data );
+}
+
+
+/* Look `key` up in the BRW_GS_PROG cache.  The lookup is given
+ * brw->gs.prog_data and brw->gs.prog_gs_offset as output locations and
+ * its result is returned unchanged.
+ */
+static boolean search_cache( struct brw_context *brw,
+			       struct brw_gs_prog_key *key )
+{
+   boolean hit;
+
+   hit = brw_search_cache(&brw->cache[BRW_GS_PROG],
+                          key, sizeof(*key),
+                          &brw->gs.prog_data,
+                          &brw->gs.prog_gs_offset);
+   return hit;
+}
+
+
+static const int gs_prim[PIPE_PRIM_POLYGON+1] = { /* indexed by brw->primitive in populate_key(); NOTE(review): slots presumably follow the PIPE_PRIM_* numbering - confirm */
+   PIPE_PRIM_POINTS, /* slot 0 */
+   PIPE_PRIM_LINES, /* slot 1 */
+   PIPE_PRIM_LINE_LOOP, /* slot 2 */
+   PIPE_PRIM_LINES, /* slot 3 */
+   PIPE_PRIM_TRIANGLES, /* slot 4 */
+   PIPE_PRIM_TRIANGLES, /* slot 5 */
+   PIPE_PRIM_TRIANGLES, /* slot 6 */
+   PIPE_PRIM_QUADS, /* slot 7 */
+   PIPE_PRIM_QUAD_STRIP, /* slot 8 */
+   PIPE_PRIM_TRIANGLES /* slot 9 */
+};
+
+/* Fill in the GS program cache key from current state.
+ *
+ * The key is zeroed first so that its padding bits are well defined.
+ * A GS program is requested for quads, quad strips and line loops, or
+ * whenever hint_gs_always is set (it is currently always cleared here).
+ */
+static void populate_key( struct brw_context *brw,
+			  struct brw_gs_prog_key *key )
+{
+   unsigned prim_needs_gs;
+
+   memset(key, 0, sizeof(*key));
+
+   /* CACHE_NEW_VS_PROG */
+   key->attrs = brw->vs.prog_data->outputs_written;
+
+   /* BRW_NEW_PRIMITIVE */
+   key->primitive = gs_prim[brw->primitive];
+
+   key->hint_gs_always = 0;	/* debug code? */
+
+   switch (brw->primitive) {
+   case PIPE_PRIM_QUADS:
+   case PIPE_PRIM_QUAD_STRIP:
+   case PIPE_PRIM_LINE_LOOP:
+      prim_needs_gs = 1;
+      break;
+   default:
+      prim_needs_gs = 0;
+      break;
+   }
+
+   key->need_gs_prog = (key->hint_gs_always || prim_needs_gs);
+}
+
+/* Decide whether the current state needs a GS program and, if it does,
+ * make sure one is available: search the program cache and compile on a
+ * miss.  A change in whether the GS is active raises CACHE_NEW_GS_PROG.
+ */
+static void upload_gs_prog( struct brw_context *brw )
+{
+   struct brw_gs_prog_key key;
+
+   populate_key(brw, &key);
+
+   /* Track GS on/off transitions. */
+   if (brw->gs.prog_active != key.need_gs_prog) {
+      brw->gs.prog_active = key.need_gs_prog;
+      brw->state.dirty.cache |= CACHE_NEW_GS_PROG;
+   }
+
+   if (!brw->gs.prog_active)
+      return;
+
+   if (search_cache(brw, &key))
+      return;
+
+   compile_gs_prog( brw, &key );
+}
+
+
+const struct brw_tracked_state brw_gs_prog = { /* tracked-state entry whose update hook is upload_gs_prog() */
+   .dirty = {
+      .brw   = BRW_NEW_PRIMITIVE, /* populate_key() reads brw->primitive */
+      .cache = CACHE_NEW_VS_PROG /* populate_key() reads brw->vs.prog_data->outputs_written */
+   },
+   .update = upload_gs_prog
+};
diff --git a/src/gallium/drivers/i965simple/brw_gs.h b/src/gallium/drivers/i965simple/brw_gs.h
new file mode 100644
index 0000000000..f09141c6aa
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_gs.h
@@ -0,0 +1,75 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+ 
+
+#ifndef BRW_GS_H
+#define BRW_GS_H
+
+
+#include "brw_context.h"
+#include "brw_eu.h"
+
+#define MAX_GS_VERTS (4)	     
+
+struct brw_gs_prog_key { /* cache key for GS programs; populate_key() zeroes it before filling it in */
+   unsigned attrs:32; /* copied from the VS program data's outputs_written */
+   unsigned primitive:4; /* looked up in gs_prim[] from brw->primitive */
+   unsigned hint_gs_always:1; /* lets points/lines/triangles get a GS program; populate_key() sets it to 0 */
+   unsigned need_gs_prog:1; /* set for quads, quad strips, line loops, or when hint_gs_always is set */
+   unsigned pad:26; /* unused bits, left at zero by the memset in populate_key() */
+};
+
+struct brw_gs_compile { /* working state for building one GS program; see compile_gs_prog() */
+   struct brw_compile func; /* handed to brw_init_compile() and brw_get_program() */
+   struct brw_gs_prog_key key; /* copy of the key being compiled */
+   struct brw_gs_prog_data prog_data; /* urb_read_length and total_grf are set by brw_gs_alloc_regs() */
+   
+   struct {
+      struct brw_reg R0; /* GRF 0 viewed as a UD vec8; also used as the URB message header */
+      struct brw_reg vertex[MAX_GS_VERTS]; /* first GRF of each payload vertex */
+   } reg;
+
+   /* 3 different ways of expressing vertex size:
+    */
+   unsigned nr_attrs; /* number of bits set in key.attrs */
+   unsigned nr_regs; /* (nr_attrs + 1) / 2 + 1 */
+   unsigned nr_bytes; /* nr_regs * REG_SIZE */
+};
+
+#define ATTR_SIZE  (4*4)
+
+void brw_gs_quads( struct brw_gs_compile *c ); /* 4 vertices emitted as one polygon, in order 3,0,1,2 */
+void brw_gs_quad_strip( struct brw_gs_compile *c ); /* 4 vertices emitted as one polygon, in order 2,3,0,1 */
+void brw_gs_tris( struct brw_gs_compile *c ); /* 3 vertices emitted as a trilist primitive */
+void brw_gs_lines( struct brw_gs_compile *c ); /* 2 vertices emitted as a linestrip primitive */
+void brw_gs_points( struct brw_gs_compile *c ); /* 1 vertex emitted as a pointlist primitive */
+
+#endif
diff --git a/src/gallium/drivers/i965simple/brw_gs_emit.c b/src/gallium/drivers/i965simple/brw_gs_emit.c
new file mode 100644
index 0000000000..c3cc90b10f
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_gs_emit.c
@@ -0,0 +1,148 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "brw_defines.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_util.h"
+#include "brw_gs.h"
+
+/* Lay out the GRF registers used by a GS program handling `nr_verts`
+ * vertices.  Register usage is static: GRF 0 is R0 and each vertex then
+ * takes c->nr_regs consecutive registers.  The URB read length and the
+ * total GRF count are recorded in c->prog_data.
+ */
+static void brw_gs_alloc_regs( struct brw_gs_compile *c,
+			       unsigned nr_verts )
+{
+   unsigned v;
+
+   c->reg.R0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD);
+
+   /* Payload vertices plus space for more generated vertices:
+    */
+   for (v = 0; v < nr_verts; v++)
+      c->reg.vertex[v] = brw_vec4_grf(1 + v * c->nr_regs, 0);
+
+   c->prog_data.urb_read_length = c->nr_regs;
+   c->prog_data.total_grf = 1 + nr_verts * c->nr_regs;
+}
+
+
+/* Emit the instructions that write one vertex to the URB.
+ *
+ * c      - compile state (register layout and vertex size)
+ * vert   - first register of the vertex to send
+ * last   - non-zero for the final vertex: no new URB entry is allocated,
+ *          the result goes to the null register and the message is
+ *          marked end-of-thread
+ * header - value written to dword 2 of R0 before the send
+ */
+static void brw_gs_emit_vue(struct brw_gs_compile *c,
+			    struct brw_reg vert,
+			    boolean last,
+			    unsigned header)
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg urb_dest;
+   boolean allocate;
+   boolean eot;
+   unsigned response_length;
+
+   if (last) {
+      allocate = 0;
+      eot = 1;
+      response_length = 0;
+      urb_dest = retype(brw_null_reg(), BRW_REGISTER_TYPE_UD);
+   }
+   else {
+      allocate = 1;
+      eot = 0;
+      response_length = 1;
+      urb_dest = c->reg.R0;
+   }
+
+   /* Overwrite PrimType and PrimStart in the message header, for
+    * each vertex in turn:
+    */
+   brw_MOV(p, get_element_ud(c->reg.R0, 2), brw_imm_ud(header));
+
+   /* Copy the vertex into the message registers, starting at m1:
+    */
+   brw_copy8(p, brw_message_reg(1), vert, c->nr_regs);
+
+   /* Send each vertex as a separate write to the urb.  This is
+    * different to the concept in brw_sf_emit.c, where subsequent
+    * writes are used to build up a single urb entry.  Each of these
+    * writes instantiates a separate urb entry, and a new one must be
+    * allocated each time.
+    */
+   brw_urb_WRITE(p,
+                 urb_dest,
+                 0,                  /* msg_reg_nr */
+                 c->reg.R0,
+                 allocate,
+                 1,                  /* used */
+                 c->nr_regs + 1,     /* msg length */
+                 response_length,
+                 eot,
+                 1,                  /* writes_complete */
+                 0,                  /* urb offset */
+                 BRW_URB_SWIZZLE_NONE);
+}
+
+
+
+/* Generate the GS program for quads: the four payload vertices are sent
+ * out as a single polygon.
+ */
+void brw_gs_quads( struct brw_gs_compile *c )
+{
+   const unsigned poly = (_3DPRIM_POLYGON << 2);
+
+   brw_gs_alloc_regs(c, 4);
+
+   /* Use polygons for correct edgeflag behaviour. Note that vertex 3
+    * is the PV for quads, but vertex 0 for polygons:
+    */
+   brw_gs_emit_vue(c, c->reg.vertex[3], 0, poly | R02_PRIM_START);
+   brw_gs_emit_vue(c, c->reg.vertex[0], 0, poly);
+   brw_gs_emit_vue(c, c->reg.vertex[1], 0, poly);
+   brw_gs_emit_vue(c, c->reg.vertex[2], 1, poly | R02_PRIM_END);
+}
+
+/* Generate the GS program for quad strips: the four payload vertices are
+ * sent out as a single polygon, in the order 2, 3, 0, 1.
+ */
+void brw_gs_quad_strip( struct brw_gs_compile *c )
+{
+   const unsigned poly = (_3DPRIM_POLYGON << 2);
+
+   brw_gs_alloc_regs(c, 4);
+
+   brw_gs_emit_vue(c, c->reg.vertex[2], 0, poly | R02_PRIM_START);
+   brw_gs_emit_vue(c, c->reg.vertex[3], 0, poly);
+   brw_gs_emit_vue(c, c->reg.vertex[0], 0, poly);
+   brw_gs_emit_vue(c, c->reg.vertex[1], 1, poly | R02_PRIM_END);
+}
+
+/* Generate the GS program for triangles: the three payload vertices are
+ * re-emitted in order as a trilist primitive.
+ */
+void brw_gs_tris( struct brw_gs_compile *c )
+{
+   const unsigned tri = (_3DPRIM_TRILIST << 2);
+
+   brw_gs_alloc_regs(c, 3);
+
+   brw_gs_emit_vue(c, c->reg.vertex[0], 0, tri | R02_PRIM_START);
+   brw_gs_emit_vue(c, c->reg.vertex[1], 0, tri);
+   brw_gs_emit_vue(c, c->reg.vertex[2], 1, tri | R02_PRIM_END);
+}
+
+/* Generate the GS program for lines: the two payload vertices are
+ * re-emitted in order as a linestrip primitive.
+ */
+void brw_gs_lines( struct brw_gs_compile *c )
+{
+   const unsigned strip = (_3DPRIM_LINESTRIP << 2);
+
+   brw_gs_alloc_regs(c, 2);
+
+   brw_gs_emit_vue(c, c->reg.vertex[0], 0, strip | R02_PRIM_START);
+   brw_gs_emit_vue(c, c->reg.vertex[1], 1, strip | R02_PRIM_END);
+}
+
+/* Generate the GS program for points: the single payload vertex is
+ * re-emitted as a pointlist primitive that both starts and ends with it.
+ */
+void brw_gs_points( struct brw_gs_compile *c )
+{
+   const unsigned point = (_3DPRIM_POINTLIST << 2);
+
+   brw_gs_alloc_regs(c, 1);
+
+   brw_gs_emit_vue(c, c->reg.vertex[0], 1,
+                   point | R02_PRIM_START | R02_PRIM_END);
+}
+
+
+
+
+
+
+
+
diff --git a/src/gallium/drivers/i965simple/brw_gs_state.c b/src/gallium/drivers/i965simple/brw_gs_state.c
new file mode 100644
index 0000000000..5b8016b2e9
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_gs_state.c
@@ -0,0 +1,90 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+
+
+/* Build the GS unit state from the current GS program and URB layout and
+ * store it through the BRW_GS_UNIT cache, recording the resulting offset
+ * in brw->gs.state_gs_offset.
+ */
+static void upload_gs_unit( struct brw_context *brw )
+{
+   struct brw_gs_unit_state unit;
+
+   memset(&unit, 0, sizeof(unit));
+
+   /* CONSTANT */
+   unit.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
+   unit.thread1.single_program_flow = 1;
+   unit.thread3.dispatch_grf_start_reg = 1;
+   unit.thread3.const_urb_entry_read_offset = 0;
+   unit.thread3.const_urb_entry_read_length = 0;
+   unit.thread3.urb_entry_read_offset = 0;
+
+   /* CACHE_NEW_GS_PROG */
+   if (!brw->gs.prog_active) {
+      /* No program: only the URB read length is non-zero. */
+      unit.thread0.grf_reg_count = 0;
+      unit.thread0.kernel_start_pointer = 0;
+      unit.thread3.urb_entry_read_length = 1;
+   }
+   else {
+      /* Register count is programmed in blocks of 16, minus one. */
+      unit.thread0.grf_reg_count =
+         align(brw->gs.prog_data->total_grf, 16) / 16 - 1;
+      unit.thread0.kernel_start_pointer = brw->gs.prog_gs_offset >> 6;
+      unit.thread3.urb_entry_read_length = brw->gs.prog_data->urb_read_length;
+   }
+
+   /* BRW_NEW_URB_FENCE */
+   unit.thread4.nr_urb_entries = brw->urb.nr_gs_entries;
+   unit.thread4.urb_entry_allocation_size = brw->urb.vsize - 1;
+
+   unit.thread4.max_threads = 0; /* Hardware requirement */
+
+   if (BRW_DEBUG & DEBUG_STATS)
+      unit.thread4.stats_enable = 1;
+
+   brw->gs.state_gs_offset = brw_cache_data( &brw->cache[BRW_GS_UNIT], &unit );
+}
+
+
+const struct brw_tracked_state brw_gs_unit = { /* tracked-state entry whose update hook is upload_gs_unit() */
+   .dirty = {
+      .brw   = (BRW_NEW_CURBE_OFFSETS | /* NOTE(review): upload_gs_unit() reads no CURBE state directly - confirm this bit is needed */
+		BRW_NEW_URB_FENCE), /* upload_gs_unit() reads brw->urb.nr_gs_entries and brw->urb.vsize */
+      .cache = CACHE_NEW_GS_PROG /* upload_gs_unit() reads brw->gs.prog_active, prog_data and prog_gs_offset */
+   },
+   .update = upload_gs_unit
+};
diff --git a/src/gallium/drivers/i965simple/brw_misc_state.c b/src/gallium/drivers/i965simple/brw_misc_state.c
new file mode 100644
index 0000000000..99ff4403a5
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_misc_state.c
@@ -0,0 +1,488 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "brw_batch.h"
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+
+
+
+
+
+/***********************************************************************
+ * Blend color
+ */
+
+/* Emit a CMD_BLEND_CONSTANT_COLOR packet carrying the four components
+ * of brw->attribs.BlendColor.color, through the cached-batch path.
+ */
+static void upload_blend_constant_color(struct brw_context *brw)
+{
+   struct brw_blend_constant_color packet;
+   unsigned chan;
+
+   memset(&packet, 0, sizeof(packet));
+
+   packet.header.opcode = CMD_BLEND_CONSTANT_COLOR;
+   packet.header.length = sizeof(packet)/4-2;
+
+   for (chan = 0; chan < 4; chan++)
+      packet.blend_constant_color[chan] = brw->attribs.BlendColor.color[chan];
+
+   BRW_CACHED_BATCH_STRUCT(brw, &packet);
+}
+
+
+const struct brw_tracked_state brw_blend_constant_color = { /* tracked-state entry whose update hook is upload_blend_constant_color() */
+   .dirty = {
+      .brw = BRW_NEW_BLEND, /* the upload reads brw->attribs.BlendColor */
+      .cache = 0 /* no cache dirty bits */
+   },
+   .update = upload_blend_constant_color
+};
+
+
+/***********************************************************************
+ * Drawing rectangle 
+ */
+/* Emit a CMD_DRAW_RECT packet covering the first colour buffer: origin
+ * and minimum corner at (0,0), maximum corner taken from the buffer's
+ * width and height.
+ *
+ * NOTE(review): xmax/ymax are set to width/height rather than
+ * width-1/height-1 - confirm whether the hardware treats them as
+ * inclusive.  cbufs[0] is dereferenced without a NULL check.
+ */
+static void upload_drawing_rect(struct brw_context *brw)
+{
+   struct brw_drawrect rect;
+
+   memset(&rect, 0, sizeof(rect));
+
+   rect.header.opcode = CMD_DRAW_RECT;
+   rect.header.length = sizeof(rect)/4 - 2;
+
+   rect.xorg = 0;
+   rect.yorg = 0;
+
+   rect.xmin = 0;
+   rect.ymin = 0;
+   rect.xmax = brw->attribs.FrameBuffer.cbufs[0]->width;
+   rect.ymax = brw->attribs.FrameBuffer.cbufs[0]->height;
+
+   /* Can't use BRW_CACHED_BATCH_STRUCT because this is also emitted
+    * uncached in brw_draw.c:
+    */
+   BRW_BATCH_STRUCT(brw, &rect);
+}
+
+const struct brw_tracked_state brw_drawing_rect = { /* tracked-state entry whose update hook is upload_drawing_rect() */
+   .dirty = {
+      .brw = BRW_NEW_SCENE, /* the upload reads brw->attribs.FrameBuffer.cbufs[0] */
+      .cache = 0 /* no cache dirty bits */
+   },
+   .update = upload_drawing_rect
+};
+
+/**
+ * Upload the binding table pointers, which point to each stage's array of
+ * surface state pointers.
+ *
+ * The binding table pointers are relative to the surface state base address,
+ * which is the BRW_SS_POOL cache buffer.
+ *
+ * Only the WM stage gets a non-zero pointer (brw->wm.bind_ss_offset); the
+ * VS, GS, CLIP and SF entries are written as 0.
+ */
+static void upload_binding_table_pointers(struct brw_context *brw)
+{
+   struct brw_binding_table_pointers tables;
+
+   memset(&tables, 0, sizeof(tables));
+
+   tables.header.opcode = CMD_BINDING_TABLE_PTRS;
+   tables.header.length = sizeof(tables)/4 - 2;
+
+   /* Only the WM stage has a binding table here. */
+   tables.wm = brw->wm.bind_ss_offset;
+
+   tables.vs = 0;
+   tables.gs = 0;
+   tables.clp = 0;
+   tables.sf = 0;
+
+   BRW_CACHED_BATCH_STRUCT(brw, &tables);
+}
+
+const struct brw_tracked_state brw_binding_table_pointers = { /* tracked-state entry whose update hook is upload_binding_table_pointers() */
+   .dirty = {
+      .brw = 0, /* no brw dirty bits */
+      .cache = CACHE_NEW_SURF_BIND /* the upload reads brw->wm.bind_ss_offset */
+   },
+   .update = upload_binding_table_pointers,
+};
+
+
+/**
+ * Upload pointers to the per-stage state.
+ *
+ * The state pointers in this packet are all relative to the general state
+ * base address set by CMD_STATE_BASE_ADDRESS, which is the BRW_GS_POOL buffer.
+ *
+ * Each offset is stored shifted right by 5.  The GS pointer is only
+ * filled in and enabled while a GS program is active; the CLIP pointer is
+ * compiled out with if (0).  BRW_NEW_PSP is raised when
+ * BRW_CACHED_BATCH_STRUCT() returns non-zero.
+ */
+static void upload_pipelined_state_pointers(struct brw_context *brw )
+{
+   struct brw_pipelined_state_pointers ptrs;
+
+   memset(&ptrs, 0, sizeof(ptrs));
+
+   ptrs.header.opcode = CMD_PIPELINED_STATE_POINTERS;
+   ptrs.header.length = sizeof(ptrs)/4 - 2;
+
+   /* GS gets turned on and off regularly.  Need to re-emit URB fence
+    * after this occurs.
+    */
+   if (brw->gs.prog_active) {
+      ptrs.gs.offset = brw->gs.state_gs_offset >> 5;
+      ptrs.gs.enable = 1;
+   }
+
+   /* Clip unit pointer: currently disabled. */
+   if (0) {
+      ptrs.clp.offset = brw->clip.state_gs_offset >> 5;
+      ptrs.clp.enable = 1;
+   }
+
+   /* Stages that are always programmed. */
+   ptrs.vs.offset = brw->vs.state_gs_offset >> 5;
+   ptrs.sf.offset = brw->sf.state_gs_offset >> 5;
+   ptrs.wm.offset = brw->wm.state_gs_offset >> 5;
+   ptrs.cc.offset = brw->cc.state_gs_offset >> 5;
+
+   if (BRW_CACHED_BATCH_STRUCT(brw, &ptrs)) {
+      brw->state.dirty.brw |= BRW_NEW_PSP;
+   }
+}
+
+const struct brw_tracked_state brw_pipelined_state_pointers = { /* tracked-state entry whose update hook is upload_pipelined_state_pointers() */
+   .dirty = {
+      .brw = 0, /* no brw dirty bits */
+      .cache = (CACHE_NEW_VS_UNIT | /* the upload reads brw->vs.state_gs_offset */
+		CACHE_NEW_GS_UNIT | /* ... brw->gs.state_gs_offset */
+		CACHE_NEW_GS_PROG | /* ... brw->gs.prog_active */
+		CACHE_NEW_CLIP_UNIT | /* ... brw->clip.state_gs_offset (inside the if (0) block) */
+		CACHE_NEW_SF_UNIT | /* ... brw->sf.state_gs_offset */
+		CACHE_NEW_WM_UNIT | /* ... brw->wm.state_gs_offset */
+		CACHE_NEW_CC_UNIT) /* ... brw->cc.state_gs_offset */
+   },
+   .update = upload_pipelined_state_pointers
+};
+
+static void upload_psp_urb_cbs(struct brw_context *brw ) /* runs the three uploads below back to back, in this fixed order */
+{
+   upload_pipelined_state_pointers(brw); /* may raise BRW_NEW_PSP in brw->state.dirty.brw */
+   brw_upload_urb_fence(brw); /* defined outside this file */
+   brw_upload_constant_buffer_state(brw); /* defined outside this file */
+}
+
+
+const struct brw_tracked_state brw_psp_urb_cbs = { /* tracked-state entry whose update hook is upload_psp_urb_cbs() */
+   .dirty = {
+      .brw = BRW_NEW_URB_FENCE, /* the update calls brw_upload_urb_fence() */
+      .cache = (CACHE_NEW_VS_UNIT | /* same cache bits as brw_pipelined_state_pointers */
+		CACHE_NEW_GS_UNIT |
+		CACHE_NEW_GS_PROG |
+		CACHE_NEW_CLIP_UNIT |
+		CACHE_NEW_SF_UNIT |
+		CACHE_NEW_WM_UNIT |
+		CACHE_NEW_CC_UNIT)
+   },
+   .update = upload_psp_urb_cbs
+};
+
+/**
+ * Upload the depthbuffer offset and format.
+ *
+ * We have to do this per state validation as we need to emit the relocation
+ * in the batch buffer.
+ *
+ * One 5-dword CMD_DEPTH_BUFFER packet is emitted: a null-surface packet
+ * when no depth/stencil surface is bound, otherwise a 2D surface packet
+ * with a relocation to the texture's buffer.
+ *
+ * The depth format is resolved before the packet is opened, so an
+ * unsupported block size returns without leaving a truncated packet
+ * (BEGIN_BATCH without ADVANCE_BATCH) in the batch.
+ */
+static void upload_depthbuffer(struct brw_context *brw)
+{
+   struct pipe_surface *depth_surface = brw->attribs.FrameBuffer.zsbuf;
+   unsigned int format = 0;   /* only used when a surface is bound */
+
+   if (depth_surface != NULL) {
+      assert(depth_surface->block.width == 1);
+      assert(depth_surface->block.height == 1);
+
+      /* Pick the hardware depth format from the bytes per pixel. */
+      switch (depth_surface->block.size) {
+      case 2:
+	 format = BRW_DEPTHFORMAT_D16_UNORM;
+	 break;
+      case 4:
+	 if (depth_surface->format == PIPE_FORMAT_Z32_FLOAT)
+	    format = BRW_DEPTHFORMAT_D32_FLOAT;
+	 else
+	    format = BRW_DEPTHFORMAT_D24_UNORM_S8_UINT;
+	 break;
+      default:
+	 /* Nothing has been written to the batch yet, so bailing out
+	  * here is safe.
+	  */
+	 assert(0);
+	 return;
+      }
+   }
+
+   BEGIN_BATCH(5, INTEL_BATCH_NO_CLIPRECTS);
+   OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (5 - 2));
+   if (depth_surface == NULL) {
+      OUT_BATCH((BRW_DEPTHFORMAT_D32_FLOAT << 18) |
+		(BRW_SURFACE_NULL << 29));
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+   } else {
+      struct brw_texture *tex = (struct brw_texture *)depth_surface->texture;
+
+      /* The tiled bit (<< 27) is not programmed: the original
+       * depth_surface->region->tiled term is disabled.
+       */
+      OUT_BATCH((depth_surface->stride - 1) |
+		(format << 18) |
+		(BRW_TILEWALK_YMAJOR << 26) |
+		(BRW_SURFACE_2D << 29));
+      OUT_RELOC(tex->buffer,
+		PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE, 0);
+      OUT_BATCH((BRW_SURFACE_MIPMAPLAYOUT_BELOW << 1) |
+		((depth_surface->stride/depth_surface->block.size - 1) << 6) |
+		((depth_surface->height - 1) << 19));
+      OUT_BATCH(0);
+   }
+   ADVANCE_BATCH();
+}
+
+const struct brw_tracked_state brw_depthbuffer = { /* tracked-state entry whose update hook is upload_depthbuffer() */
+   .dirty = {
+      .brw = BRW_NEW_SCENE, /* the upload reads brw->attribs.FrameBuffer.zsbuf */
+      .cache = 0 /* no cache dirty bits */
+   },
+   .update = upload_depthbuffer,
+};
+
+
+
+
+/***********************************************************************
+ * Polygon stipple packet
+ */
+
+/* Emit a CMD_POLY_STIPPLE_PATTERN packet through the cached-batch path.
+ * The 32 pattern rows are copied in reverse order; when no stipple
+ * state has been set the pattern is left all zero.
+ */
+static void upload_polygon_stipple(struct brw_context *brw)
+{
+   struct brw_polygon_stipple packet;
+   unsigned row;
+
+   memset(&packet, 0, sizeof(packet));
+
+   packet.header.opcode = CMD_POLY_STIPPLE_PATTERN;
+   packet.header.length = sizeof(packet)/4-2;
+
+   /* XXX: state tracker should send *all* state down initially!
+    */
+   if (brw->attribs.PolygonStipple) {
+      /* Source row `row` lands in destination row 31 - row (inverted). */
+      for (row = 0; row < 32; row++)
+         packet.stipple[31 - row] = brw->attribs.PolygonStipple->stipple[row];
+   }
+
+   BRW_CACHED_BATCH_STRUCT(brw, &packet);
+}
+
+const struct brw_tracked_state brw_polygon_stipple = { /* tracked-state entry whose update hook is upload_polygon_stipple() */
+   .dirty = {
+      .brw = BRW_NEW_STIPPLE, /* the upload reads brw->attribs.PolygonStipple */
+      .cache = 0 /* no cache dirty bits */
+   },
+   .update = upload_polygon_stipple
+};
+
+
+/***********************************************************************
+ * Line stipple packet
+ */
+
+/* Emit a CMD_LINE_STIPPLE_PATTERN packet through the cached-batch path.
+ *
+ * The pattern and repeat count come from the rasterizer state.  The
+ * inverse repeat count is 1/factor scaled by 1<<13 and truncated to an
+ * integer.
+ *
+ * A factor of 0 is treated as 1: otherwise 1.0/0 yields infinity, whose
+ * conversion to int is undefined, and a repeat count of 0 would be
+ * programmed.
+ *
+ * NOTE(review): if the state tracker stores (factor - 1) in
+ * line_stipple_factor, both values below need a +1 instead - confirm
+ * against the rasterizer state definition.
+ */
+static void upload_line_stipple(struct brw_context *brw)
+{
+   struct brw_line_stipple bls;
+   unsigned factor;
+   float tmp;
+   int tmpi;
+
+   memset(&bls, 0, sizeof(bls));
+   bls.header.opcode = CMD_LINE_STIPPLE_PATTERN;
+   bls.header.length = sizeof(bls)/4 - 2;
+
+   /* Guard against a zero factor before dividing by it. */
+   factor = brw->attribs.Raster->line_stipple_factor;
+   if (factor == 0)
+      factor = 1;
+
+   bls.bits0.pattern = brw->attribs.Raster->line_stipple_pattern;
+   bls.bits1.repeat_count = factor;
+
+   tmp = 1.0 / (float) factor;
+   tmpi = tmp * (1<<13);
+
+   bls.bits1.inverse_repeat_count = tmpi;
+
+   BRW_CACHED_BATCH_STRUCT(brw, &bls);
+}
+
+const struct brw_tracked_state brw_line_stipple = { /* tracked-state entry whose update hook is upload_line_stipple() */
+   .dirty = {
+      .brw = BRW_NEW_STIPPLE, /* the upload reads the line stipple fields of brw->attribs.Raster */
+      .cache = 0 /* no cache dirty bits */
+   },
+   .update = upload_line_stipple
+};
+
+
+/***********************************************************************
+ * Misc constant state packets
+ */
+
+/* PIPE_CONTROL upload hook.
+ *
+ * Currently disabled: the function returns before building anything, so no
+ * packet is emitted.  The code after the return is kept as the intended
+ * packet layout.
+ */
+static void upload_pipe_control(struct brw_context *brw)
+{
+   struct brw_pipe_control packet;
+
+   /* Disabled -- nothing below this line executes. */
+   return;
+
+   memset(&packet, 0, sizeof(packet));
+
+   packet.header.length = sizeof(packet)/4 - 2;
+   packet.header.opcode = CMD_PIPE_CONTROL;
+   packet.header.instruction_state_cache_flush_enable = 1;
+   packet.header.post_sync_operation = PIPE_CONTROL_NOWRITE;
+
+   packet.bits1.dest_addr_type = PIPE_CONTROL_GTTWRITE_GLOBAL;
+
+   BRW_BATCH_STRUCT(brw, &packet);
+}
+
+/* Tracked-state entry for the pipe control packet: upload_pipe_control is the
+ * update hook, with BRW_NEW_SCENE in the brw dirty mask and no cache dirty
+ * bits.
+ */
+const struct brw_tracked_state brw_pipe_control = {
+   .dirty = {
+      .brw = BRW_NEW_SCENE,
+      .cache = 0
+   },
+   .update = upload_pipe_control
+};
+
+
+/***********************************************************************
+ * Misc invariant state packets
+ */
+
+/* Emit the packets that are built purely from constants (plus the
+ * DEBUG_STATS debug flag).  In emission order: MI flush, pipeline select,
+ * global depth offset clamp, system instruction pointer, VF statistics and
+ * polygon stipple offset.
+ */
+static void upload_invarient_state( struct brw_context *brw )
+{
+   /* MI_FLUSH with the state-cache and read-cache flags. */
+   {
+      struct brw_mi_flush mi_flush;
+
+      memset(&mi_flush, 0, sizeof(mi_flush));
+      mi_flush.flags = BRW_FLUSH_STATE_CACHE | BRW_FLUSH_READ_CACHE;
+      mi_flush.opcode = CMD_MI_FLUSH;
+      BRW_BATCH_STRUCT(brw, &mi_flush);
+   }
+
+   /* 0x61040000  Pipeline Select, with PipelineSelect = 0. */
+   {
+      struct brw_pipeline_select select;
+
+      memset(&select, 0, sizeof(select));
+      select.header.pipeline_select = 0;
+      select.header.opcode = CMD_PIPELINE_SELECT;
+      BRW_BATCH_STRUCT(brw, &select);
+   }
+
+   /* Global depth offset clamp of 0.0, i.e. clamping disabled. */
+   {
+      struct brw_global_depth_offset_clamp clamp;
+
+      memset(&clamp, 0, sizeof(clamp));
+      clamp.depth_offset_clamp = 0.0;
+      clamp.header.length = sizeof(clamp)/4 - 2;
+      clamp.header.opcode = CMD_GLOBAL_DEPTH_OFFSET_CLAMP;
+      BRW_BATCH_STRUCT(brw, &clamp);
+   }
+
+   /* 0x61020000  State Instruction Pointer, all fields zero. */
+   {
+      struct brw_system_instruction_pointer insn_ptr;
+
+      memset(&insn_ptr, 0, sizeof(insn_ptr));
+      insn_ptr.bits0.system_instruction_pointer = 0;
+      insn_ptr.bits0.pad = 0;
+      insn_ptr.header.length = 0;
+      insn_ptr.header.opcode = CMD_STATE_INSN_POINTER;
+      BRW_BATCH_STRUCT(brw, &insn_ptr);
+   }
+
+   /* VF statistics, enabled only when the DEBUG_STATS debug bit is set. */
+   {
+      struct brw_vf_statistics stats;
+
+      memset(&stats, 0, sizeof(stats));
+      stats.opcode = CMD_VF_STATISTICS;
+      stats.statistics_enable = (BRW_DEBUG & DEBUG_STATS) ? 1 : 0;
+      BRW_BATCH_STRUCT(brw, &stats);
+   }
+
+   /* Polygon stipple offset of (0, 0). */
+   {
+      struct brw_polygon_stipple_offset offset;
+
+      memset(&offset, 0, sizeof(offset));
+      offset.bits0.y_offset = 0;
+      offset.bits0.x_offset = 0;
+      offset.header.length = sizeof(offset)/4-2;
+      offset.header.opcode = CMD_POLY_STIPPLE_OFFSET;
+      BRW_BATCH_STRUCT(brw, &offset);
+   }
+}
+
+/* Tracked-state entry for the invariant packets: upload_invarient_state is
+ * the update hook, with BRW_NEW_SCENE in the brw dirty mask and no cache
+ * dirty bits.
+ */
+const struct brw_tracked_state brw_invarient_state = {
+   .dirty = {
+      .brw = BRW_NEW_SCENE,
+      .cache = 0
+   },
+   .update = upload_invarient_state
+};
+
+/**
+ * Define the base addresses which some state is referenced from.
+ *
+ * This allows us to avoid having to emit relocations in many places for
+ * cached state, and instead emit pointers inside of large, mostly-static
+ * state pools.  This comes at the expense of memory, and more expensive cache
+ * misses.
+ */
+static void upload_state_base_address( struct brw_context *brw )
+{
+   /* Output the structure (brw_state_base_address) directly to the
+    * batchbuffer, so we can emit relocations inline.
+    *
+    * Six dwords are emitted, matching BEGIN_BATCH(6, ...): the command
+    * header (whose length field is 6 - 2), two relocated pool addresses
+    * (general-state and surface-state pools) and three literal dwords.
+    *
+    * NOTE(review): the literal 1 used as the relocation delta and as the
+    * value of the last three dwords presumably sets a per-field
+    * "modify enable" bit -- confirm against the hardware documentation.
+    */
+   BEGIN_BATCH(6, INTEL_BATCH_NO_CLIPRECTS);
+   OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (6 - 2));
+   OUT_RELOC(brw->pool[BRW_GS_POOL].buffer,
+	     PIPE_BUFFER_USAGE_GPU_READ,
+	     1); /* General state base address */
+   OUT_RELOC(brw->pool[BRW_SS_POOL].buffer,
+	     PIPE_BUFFER_USAGE_GPU_READ,
+	     1); /* Surface state base address */
+   OUT_BATCH(1); /* Indirect object base address */
+   OUT_BATCH(1); /* General state upper bound */
+   OUT_BATCH(1); /* Indirect object upper bound */
+   ADVANCE_BATCH();
+}
+
+
+/* Tracked-state entry for the state base address packet:
+ * upload_state_base_address is the update hook, with BRW_NEW_SCENE in the brw
+ * dirty mask and no cache dirty bits.
+ */
+const struct brw_tracked_state brw_state_base_address = {
+   .dirty = {
+      .brw = BRW_NEW_SCENE,
+      .cache = 0
+   },
+   .update = upload_state_base_address
+};
diff --git a/src/gallium/drivers/i965simple/brw_reg.h b/src/gallium/drivers/i965simple/brw_reg.h
new file mode 100644
index 0000000000..9e885c3b3b
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_reg.h
@@ -0,0 +1,76 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Command opcodes and bit definitions for the MI, 2D (blitter) and legacy 3D
+ * primitive command streams.
+ *
+ * The include guard keeps the header safe to pull in more than once, in line
+ * with the other headers in this directory.
+ */
+#ifndef BRW_REG_H
+#define BRW_REG_H
+
+/* Command type, held in bits 31:29 of the command dword. */
+#define CMD_MI				(0x0 << 29)
+#define CMD_2D				(0x2 << 29)
+#define CMD_3D				(0x3 << 29)
+
+#define MI_BATCH_BUFFER_END		(CMD_MI | (0xA << 23))
+
+/* Stalls command execution waiting for the given events to have occurred. */
+#define MI_WAIT_FOR_EVENT               (CMD_MI | (0x3 << 23))
+#define MI_WAIT_FOR_PLANE_B_FLIP        (1<<6)
+#define MI_WAIT_FOR_PLANE_A_FLIP        (1<<2)
+
+/* Primitive dispatch on 830-945 */
+#define _3DPRIMITIVE			(CMD_3D | (0x1f << 24))
+#define PRIM_INDIRECT            (1<<23)
+#define PRIM_INLINE              (0<<23)
+#define PRIM_INDIRECT_SEQUENTIAL (0<<17)
+#define PRIM_INDIRECT_ELTS       (1<<17)
+
+/* Primitive type field of _3DPRIMITIVE (bits 22:18, see PRIM3D_MASK). */
+#define PRIM3D_TRILIST		(0x0<<18)
+#define PRIM3D_TRISTRIP 	(0x1<<18)
+#define PRIM3D_TRISTRIP_RVRSE	(0x2<<18)
+#define PRIM3D_TRIFAN		(0x3<<18)
+#define PRIM3D_POLY		(0x4<<18)
+#define PRIM3D_LINELIST 	(0x5<<18)
+#define PRIM3D_LINESTRIP	(0x6<<18)
+#define PRIM3D_RECTLIST 	(0x7<<18)
+#define PRIM3D_POINTLIST	(0x8<<18)
+#define PRIM3D_DIB		(0x9<<18)
+#define PRIM3D_MASK		(0x1f<<18)
+
+/* Blitter commands; the low bits carry a small constant per command. */
+#define XY_SETUP_BLT_CMD		(CMD_2D | (0x01 << 22) | 6)
+
+#define XY_COLOR_BLT_CMD		(CMD_2D | (0x50 << 22) | 4)
+
+#define XY_SRC_COPY_BLT_CMD             (CMD_2D | (0x53 << 22) | 6)
+
+/* BR00 */
+#define XY_BLT_WRITE_ALPHA	(1 << 21)
+#define XY_BLT_WRITE_RGB	(1 << 20)
+#define XY_SRC_TILED		(1 << 15)
+#define XY_DST_TILED		(1 << 11)
+
+/* BR13 */
+#define BR13_565		(0x1 << 24)
+#define BR13_8888		(0x3 << 24)
+
+#define FENCE_LINEAR 0
+#define FENCE_XMAJOR 1
+#define FENCE_YMAJOR 2
+
+#endif /* BRW_REG_H */
diff --git a/src/gallium/drivers/i965simple/brw_screen.c b/src/gallium/drivers/i965simple/brw_screen.c
new file mode 100644
index 0000000000..b22e105f10
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_screen.c
@@ -0,0 +1,246 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "util/u_memory.h"
+#include "pipe/internal/p_winsys_screen.h"
+#include "util/u_string.h"
+#include "util/u_simple_screen.h"
+
+#include "brw_context.h"
+#include "brw_screen.h"
+#include "brw_tex_layout.h"
+
+
+/* pipe_screen::get_vendor hook (installed by brw_create_screen).  Returns a
+ * constant vendor string; the screen argument is not used.
+ */
+static const char *
+brw_get_vendor( struct pipe_screen *screen )
+{
+   return "Tungsten Graphics, Inc.";
+}
+
+
+/* pipe_screen::get_name hook.  Formats "i965 (chipset: <name>)" from the
+ * screen's PCI id into a function-local static buffer and returns that
+ * buffer, so each call overwrites the previous result.
+ */
+static const char *
+brw_get_name( struct pipe_screen *screen )
+{
+   static char name[128];
+   const uint pci_id = brw_screen(screen)->pci_id;
+   const char *chip = "unknown";
+
+   if (pci_id == PCI_CHIP_I965_Q)
+      chip = "Intel(R) 965Q";
+   else if (pci_id == PCI_CHIP_I965_G || pci_id == PCI_CHIP_I965_G_1)
+      chip = "Intel(R) 965G";
+   else if (pci_id == PCI_CHIP_I965_GM)
+      chip = "Intel(R) 965GM";
+   else if (pci_id == PCI_CHIP_I965_GME)
+      chip = "Intel(R) 965GME/GLE";
+
+   util_snprintf(name, sizeof(name), "i965 (chipset: %s)", chip);
+   return name;
+}
+
+
+/* pipe_screen::get_param hook: integer capabilities and limits.  Any cap not
+ * listed reports 0; the screen argument is not used.
+ */
+static int
+brw_get_param(struct pipe_screen *screen, int param)
+{
+   switch (param) {
+   /* Limits */
+   case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+      return 8;
+   case PIPE_CAP_MAX_RENDER_TARGETS:
+      return 1;
+   case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+   case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+      return 11; /* max 1024x1024 */
+   case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+      return 8;  /* max 128x128x128 */
+
+   /* Features reported as supported */
+   case PIPE_CAP_NPOT_TEXTURES:
+   case PIPE_CAP_TWO_SIDED_STENCIL:
+   case PIPE_CAP_TEXTURE_SHADOW_MAP:
+      return 1;
+
+   /* Features reported as unsupported, and everything unknown */
+   case PIPE_CAP_GLSL:
+   case PIPE_CAP_S3TC:
+   case PIPE_CAP_ANISOTROPIC_FILTER:
+   case PIPE_CAP_POINT_SPRITE:
+   case PIPE_CAP_OCCLUSION_QUERY:
+   default:
+      return 0;
+   }
+}
+
+
+/* pipe_screen::get_paramf hook: floating-point limits.  Any cap not listed
+ * reports 0; the screen argument is not used.
+ */
+static float
+brw_get_paramf(struct pipe_screen *screen, int param)
+{
+   float limit = 0;
+
+   switch (param) {
+   case PIPE_CAP_MAX_LINE_WIDTH:
+   case PIPE_CAP_MAX_LINE_WIDTH_AA:
+      limit = 7.5;
+      break;
+   case PIPE_CAP_MAX_POINT_WIDTH:
+   case PIPE_CAP_MAX_POINT_WIDTH_AA:
+      limit = 255.0;
+      break;
+   case PIPE_CAP_MAX_TEXTURE_ANISOTROPY:
+      limit = 4.0;
+      break;
+   case PIPE_CAP_MAX_TEXTURE_LOD_BIAS:
+      limit = 16.0;
+      break;
+   default:
+      break;
+   }
+
+   return limit;
+}
+
+
+/* pipe_screen::is_format_supported hook.
+ *
+ * Returns TRUE for the three formats listed below and FALSE for everything
+ * else.  The decision depends on the format only: target, tex_usage and
+ * geom_flags are accepted for interface compatibility but not consulted.
+ *
+ * A former "#if 0" table-based variant, marked as broken by its author and
+ * written against a different signature, has been dropped.
+ */
+static boolean
+brw_is_format_supported( struct pipe_screen *screen,
+                         enum pipe_format format,
+                         enum pipe_texture_target target,
+                         unsigned tex_usage,
+                         unsigned geom_flags )
+{
+   switch (format) {
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+   case PIPE_FORMAT_R5G6B5_UNORM:
+   case PIPE_FORMAT_S8Z24_UNORM:
+      return TRUE;
+   default:
+      return FALSE;
+   }
+}
+
+
+/* pipe_screen::destroy hook.  Calls the winsys destroy callback when one is
+ * set, then frees the screen object.
+ */
+static void
+brw_destroy_screen( struct pipe_screen *screen )
+{
+   struct pipe_winsys *ws = screen->winsys;
+
+   if (ws->destroy != NULL) {
+      ws->destroy(ws);
+   }
+
+   FREE(screen);
+}
+
+
+/**
+ * Allocate a brw_screen, record the winsys and PCI id, and install the
+ * screen callbacks.
+ *
+ * \return the embedded pipe_screen, or NULL if the allocation fails.
+ */
+struct pipe_screen *
+brw_create_screen(struct pipe_winsys *winsys, uint pci_id)
+{
+   struct brw_screen *brwscreen = CALLOC_STRUCT(brw_screen);
+   struct pipe_screen *screen;
+
+   if (brwscreen == NULL)
+      return NULL;
+
+   brwscreen->pci_id = pci_id;
+
+   screen = &brwscreen->screen;
+   screen->winsys = winsys;
+
+   /* Callbacks implemented in this file. */
+   screen->destroy = brw_destroy_screen;
+   screen->get_name = brw_get_name;
+   screen->get_vendor = brw_get_vendor;
+   screen->get_param = brw_get_param;
+   screen->get_paramf = brw_get_paramf;
+   screen->is_format_supported = brw_is_format_supported;
+
+   /* Remaining setup is delegated to these two initializers. */
+   brw_init_screen_texture_funcs(screen);
+   u_simple_screen_init(screen);
+
+   return screen;
+}
diff --git a/src/gallium/drivers/i965simple/brw_screen.h b/src/gallium/drivers/i965simple/brw_screen.h
new file mode 100644
index 0000000000..d3c70387e6
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_screen.h
@@ -0,0 +1,68 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef BRW_SCREEN_H
+#define BRW_SCREEN_H
+
+
+#include "pipe/p_screen.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/**
+ * Subclass of pipe_screen
+ *
+ * The base object is the first member, which is what allows the
+ * brw_screen() cast wrapper to convert a pipe_screen pointer back.
+ */
+struct brw_screen
+{
+   struct pipe_screen screen;   /**< base class; must stay first */
+
+   uint pci_id;                 /**< PCI device id passed to brw_create_screen() */
+};
+
+
+/**
+ * cast wrapper
+ *
+ * Plain pointer cast with no type check; only meaningful for screens that
+ * are really embedded in a struct brw_screen.
+ */
+static INLINE struct brw_screen *
+brw_screen(struct pipe_screen *pscreen)
+{
+   return (struct brw_screen *) pscreen;
+}
+
+
+/**
+ * Create a brw_screen for the given winsys and PCI device id.
+ * Returns the embedded pipe_screen, or NULL if allocation fails.
+ */
+extern struct pipe_screen *
+brw_create_screen(struct pipe_winsys *winsys, uint pci_id);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* BRW_SCREEN_H */
diff --git a/src/gallium/drivers/i965simple/brw_sf.c b/src/gallium/drivers/i965simple/brw_sf.c
new file mode 100644
index 0000000000..b82a2e143b
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_sf.c
@@ -0,0 +1,351 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#include "brw_defines.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_util.h"
+#include "brw_sf.h"
+#include "brw_state.h"
+#include "tgsi/tgsi_parse.h"
+
+
+/* Generate the SF program for the given key and upload it to the
+ * BRW_SF_PROG cache.  The resulting offset is stored in
+ * brw->sf.prog_gs_offset and &brw->sf.prog_data is passed to the cache as
+ * its output argument.
+ *
+ * Only triangles, lines and points are handled; any other primitive value
+ * (including SF_UNFILLED_TRIS) asserts and returns without uploading.
+ */
+static void compile_sf_prog( struct brw_context *brw,
+                             struct brw_sf_prog_key *key )
+{
+   struct brw_sf_compile c;
+   const unsigned *insns;
+   unsigned insns_size;
+
+   /* Begin the compilation with a zeroed compile state. */
+   memset(&c, 0, sizeof(c));
+   brw_init_compile(&c.func);
+   c.key = *key;
+
+   /* Register counts are the attribute counts halved, rounding up. */
+   c.nr_attrs = c.key.vp_output_count;
+   c.nr_setup_attrs = c.key.fp_input_count + 1; /* +1 for position */
+   c.nr_attr_regs = (c.nr_attrs + 1) / 2;
+   c.nr_setup_regs = (c.nr_setup_attrs + 1) / 2;
+
+   c.prog_data.urb_read_length = c.nr_attr_regs;
+   c.prog_data.urb_entry_size = c.nr_setup_regs * 2;
+
+   /* Emit the setup code for the requested primitive. */
+   if (key->primitive == SF_TRIANGLES) {
+      c.nr_verts = 3;
+      brw_emit_tri_setup( &c );
+   }
+   else if (key->primitive == SF_LINES) {
+      c.nr_verts = 2;
+      brw_emit_line_setup( &c );
+   }
+   else if (key->primitive == SF_POINTS) {
+      c.nr_verts = 1;
+      brw_emit_point_setup( &c );
+   }
+   else {
+      /* SF_UNFILLED_TRIS and anything else is not supported. */
+      assert(0);
+      return;
+   }
+
+   /* Fetch the generated instructions and upload them under the key. */
+   insns = brw_get_program(&c.func, &insns_size);
+
+   brw->sf.prog_gs_offset = brw_upload_cache( &brw->cache[BRW_SF_PROG],
+                                              &c.key,
+                                              sizeof(c.key),
+                                              insns,
+                                              insns_size,
+                                              &c.prog_data,
+                                              &brw->sf.prog_data );
+}
+
+
+/* Look the key up in the BRW_SF_PROG cache.  Returns whatever
+ * brw_search_cache() reports; that call is given &brw->sf.prog_data and
+ * &brw->sf.prog_gs_offset as its output arguments.
+ */
+static boolean search_cache( struct brw_context *brw,
+                             struct brw_sf_prog_key *key )
+{
+   boolean found;
+
+   found = brw_search_cache(&brw->cache[BRW_SF_PROG],
+                            key, sizeof(*key),
+                            &brw->sf.prog_data,
+                            &brw->sf.prog_gs_offset);
+   return found;
+}
+
+
+/* Calculate interpolants for triangle and line rasterization.
+ *
+ * Builds the SF program key from the current vertex/fragment program state
+ * and the reduced primitive, then compiles a program for that key unless
+ * search_cache() already finds one.
+ *
+ * The key is zeroed with memset before being filled in because it is looked
+ * up by its raw bytes (key, sizeof(*key)).
+ *
+ * TODO: unfilled triangles (SF_UNFILLED_TRIS), point sprites, flat shading
+ * and two-sided colour are not part of the key yet.
+ */
+static void upload_sf_prog( struct brw_context *brw )
+{
+   const struct brw_fragment_program *fs = brw->attribs.FragmentProgram;
+   struct brw_sf_prog_key key;
+   struct tgsi_parse_context parse;
+   int i, done = 0;
+
+   memset(&key, 0, sizeof(key));
+
+   /* Populate the key, noting state dependencies:
+    */
+   /* CACHE_NEW_VS_PROG */
+   key.vp_output_count = brw->vs.prog_data->outputs_written;
+
+   /* BRW_NEW_FS */
+   key.fp_input_count = fs->info.file_max[TGSI_FILE_INPUT] + 1;
+
+   /* BRW_NEW_REDUCED_PRIMITIVE */
+   switch (brw->reduced_primitive) {
+   case PIPE_PRIM_TRIANGLES:
+      key.primitive = SF_TRIANGLES;
+      break;
+   case PIPE_PRIM_LINES:
+      key.primitive = SF_LINES;
+      break;
+   case PIPE_PRIM_POINTS:
+      key.primitive = SF_POINTS;
+      break;
+   default:
+      /* Any other value leaves key.primitive at 0 (SF_POINTS) from the
+       * memset above, exactly as before.
+       */
+      break;
+   }
+
+   /* Scan fp inputs to figure out what interpolation modes are
+    * required for each incoming vp output.  There is an assumption
+    * that the state tracker makes sure there is a 1:1 linkage between
+    * these sets of attributes (XXX: position??)
+    *
+    * The scan stops at the first token that is not a declaration.
+    */
+   tgsi_parse_init( &parse, fs->program.tokens );
+   while( !done &&
+          !tgsi_parse_end_of_tokens( &parse ) )
+   {
+      tgsi_parse_token( &parse );
+
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_INPUT)
+         {
+            int first = parse.FullToken.FullDeclaration.DeclarationRange.First;
+            int last = parse.FullToken.FullDeclaration.DeclarationRange.Last;
+            int interp_mode = parse.FullToken.FullDeclaration.Declaration.Interpolate;
+            unsigned range_mask = 0;
+
+            debug_printf("fs input %d..%d interp mode %d\n", first, last, interp_mode);
+
+            /* One bit per input in [first, last].  The shift is unsigned and
+             * bounded to the 32-bit width of the masks, so it is defined for
+             * every index.
+             */
+            for (i = first; i <= last && i < 32; i++)
+               range_mask |= 1u << i;
+
+            switch (interp_mode) {
+            case TGSI_INTERPOLATE_CONSTANT:
+               key.const_mask |= range_mask;
+               break;
+            case TGSI_INTERPOLATE_LINEAR:
+               key.linear_mask |= range_mask;
+               break;
+            case TGSI_INTERPOLATE_PERSPECTIVE:
+               key.persp_mask |= range_mask;
+               break;
+            default:
+               break;
+            }
+
+            /* Also need stuff for flat shading, twosided color.
+             */
+         }
+         break;
+      default:
+         done = 1;
+         break;
+      }
+   }
+   /* Release whatever the parser allocated for the last token it read. */
+   tgsi_parse_free( &parse );
+
+   /* Hack: Adjust for position.  Optimize away when not required (ie
+    * for perspective interpolation).
+    *
+    * Every mask moves up one bit and bit 0 is set in linear_mask only.
+    */
+   key.persp_mask <<= 1;
+   key.linear_mask <<= 1;
+   key.linear_mask |= 1;
+   key.const_mask <<= 1;
+
+   debug_printf("key.persp_mask: %x\n", key.persp_mask);
+   debug_printf("key.linear_mask: %x\n", key.linear_mask);
+   debug_printf("key.const_mask: %x\n", key.const_mask);
+
+   if (!search_cache(brw, &key))
+      compile_sf_prog( brw, &key );
+}
+
+
+/* Tracked-state entry for the SF program: upload_sf_prog is the update hook.
+ * The brw dirty mask combines the rasterizer, reduced-primitive, VS and FS
+ * flags; no cache dirty bits are set.
+ */
+const struct brw_tracked_state brw_sf_prog = {
+   .dirty = {
+      .brw   = (BRW_NEW_RASTERIZER |
+		BRW_NEW_REDUCED_PRIMITIVE |
+		BRW_NEW_VS |
+		BRW_NEW_FS),
+      .cache = 0,
+   },
+   .update = upload_sf_prog
+};
+
+
+
+#if 0
+/* NOTE(review): everything down to the matching #endif is compiled out.
+ * It is an unfinished sketch.  Points to revisit before enabling it:
+ *   - fp_semantic[] is read in the matching loop but never filled in, and
+ *     state.fp_input_count is only ever set to 0;
+ *   - the "vp outputs" scan tests TGSI_FILE_INPUT -- presumably the output
+ *     file was intended; confirm;
+ *   - the range loop uses "i < last" whereas upload_sf_prog treats the
+ *     declaration range as inclusive ("i <= last");
+ *   - tgsi_parse_init() has no matching tgsi_parse_free().
+ */
+/* Build a struct like the one we'd like the state tracker to pass to
+ * us.
+ */
+static void update_sf_linkage( struct brw_context *brw )
+{
+   const struct brw_vertex_program *vs = brw->attribs.VertexProgram;
+   const struct brw_fragment_program *fs = brw->attribs.FragmentProgram;
+   struct pipe_setup_linkage state;
+   struct tgsi_parse_context parse;
+
+   int i, j;
+   int nr_vp_outputs = 0;
+   int done = 0;
+
+   struct { 
+      unsigned semantic:8;
+      unsigned semantic_index:16;
+   } fp_semantic[32], vp_semantic[32];
+
+   memset(&state, 0, sizeof(state));
+
+   state.fp_input_count = 0;
+
+
+
+   
+
+
+   assert(state.fp_input_count == fs->program.num_inputs);
+
+      
+   /* Then scan vp outputs
+    */
+   done = 0;
+   tgsi_parse_init( &parse, vs->program.tokens );
+   while( !done &&
+	  !tgsi_parse_end_of_tokens( &parse ) ) 
+   {
+      tgsi_parse_token( &parse );
+
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+	 if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_INPUT) 
+	 {
+	    int first = parse.FullToken.FullDeclaration.DeclarationRange.First;
+	    int last = parse.FullToken.FullDeclaration.DeclarationRange.Last;
+
+	    for (i = first; i < last; i++) {
+	       vp_semantic[i].semantic = 
+		  parse.FullToken.FullDeclaration.Semantic.SemanticName;
+	       vp_semantic[i].semantic_index = 
+		  parse.FullToken.FullDeclaration.Semantic.SemanticIndex;
+	    }
+	    
+	    assert(last > nr_vp_outputs);
+	    nr_vp_outputs = last;
+	 }
+	 break;
+      default:
+	 done = 1;
+	 break;
+      }
+   }
+
+
+   /* Now match based on semantic information.
+    */
+   for (i = 0; i< state.fp_input_count; i++) {
+      for (j = 0; j < nr_vp_outputs; j++) {
+	 if (fp_semantic[i].semantic == vp_semantic[j].semantic &&
+	     fp_semantic[i].semantic_index == vp_semantic[j].semantic_index) {
+	    state.fp_input[i].vp_output = j;
+	 }
+      }
+      if (fp_semantic[i].semantic == TGSI_SEMANTIC_COLOR) {
+	 for (j = 0; j < nr_vp_outputs; j++) {
+	    if (TGSI_SEMANTIC_BCOLOR == vp_semantic[j].semantic &&
+		fp_semantic[i].semantic_index == vp_semantic[j].semantic_index) {
+	       state.fp_input[i].bf_vp_output = j;
+	    }
+	 }
+      }
+   }
+
+   if (memcmp(&brw->sf.linkage, &state, sizeof(state)) != 0) {
+      brw->sf.linkage = state;
+      brw->state.dirty.brw |= BRW_NEW_SF_LINKAGE;
+   }
+}
+
+
+const struct brw_tracked_state brw_sf_linkage = {
+   .dirty = {
+      .brw   = (BRW_NEW_VS |
+		BRW_NEW_FS),
+      .cache = 0,
+   },
+   .update = update_sf_linkage
+};
+
+
+#endif
diff --git a/src/gallium/drivers/i965simple/brw_sf.h b/src/gallium/drivers/i965simple/brw_sf.h
new file mode 100644
index 0000000000..b7ada47560
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_sf.h
@@ -0,0 +1,122 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#ifndef BRW_SF_H
+#define BRW_SF_H
+
+#include "brw_context.h"
+#include "brw_eu.h"
+
+
+/* Values for brw_sf_prog_key::primitive (a 2-bit field).  SF_POINTS is 0,
+ * so a zero-initialised key selects points.
+ */
+#define SF_POINTS    0
+#define SF_LINES     1
+#define SF_TRIANGLES 2
+#define SF_UNFILLED_TRIS   3
+
+
+
+/* Key identifying one SF program variant.
+ *
+ * The key is handed to the program cache as raw bytes (pointer + sizeof), so
+ * it should be zeroed with memset before the fields are filled in.
+ */
+struct brw_sf_prog_key {
+   /* Both counts are 5-bit fields: values above 31 are truncated. */
+   unsigned vp_output_count:5;
+   unsigned fp_input_count:5;
+
+   unsigned primitive:2;         /* one of the SF_* values */
+   unsigned do_twoside_color:1;
+   unsigned do_flat_shading:1;
+   unsigned frontface_ccw:1;
+   unsigned do_point_sprite:1;
+
+   /* Interpolation masks;
+    */
+   unsigned linear_mask;
+   unsigned persp_mask;
+   unsigned const_mask;
+
+
+//   int SpriteOrigin;
+};
+
+/* Per-attribute point sprite setting.  Within this header it is referenced
+ * only by the "#if 0" section of struct brw_sf_compile.
+ */
+struct brw_sf_point_tex {
+	boolean CoordReplace;
+};
+
+/* Working state for compiling one SF program: the generic compile state, a
+ * copy of the key, the program data produced, the registers used by the
+ * emitters and the attribute/register counts.
+ */
+struct brw_sf_compile {
+   struct brw_compile func;
+   struct brw_sf_prog_key key;
+   struct brw_sf_prog_data prog_data;
+
+   struct brw_reg pv;
+   struct brw_reg det;
+   struct brw_reg dx0;
+   struct brw_reg dx2;
+   struct brw_reg dy0;
+   struct brw_reg dy2;
+
+   /* z and 1/w passed in separately:
+    */
+   struct brw_reg z[3];
+   struct brw_reg inv_w[3];
+
+   /* The vertices:
+    */
+   struct brw_reg vert[3];
+
+    /* Temporaries, allocated after last vertex reg.
+    */
+   struct brw_reg inv_det;
+   struct brw_reg a1_sub_a0;
+   struct brw_reg a2_sub_a0;
+   struct brw_reg tmp;
+
+   struct brw_reg m1Cx;
+   struct brw_reg m2Cy;
+   struct brw_reg m3C0;
+
+   /* Vertex count of the primitive and attribute/register counts. */
+   unsigned nr_verts;
+   unsigned nr_attrs;
+   unsigned nr_attr_regs;
+   unsigned nr_setup_attrs;
+   unsigned nr_setup_regs;
+#if 0
+   ubyte attr_to_idx[VERT_RESULT_MAX];
+   ubyte idx_to_attr[VERT_RESULT_MAX];
+   struct brw_sf_point_tex point_attrs[VERT_RESULT_MAX];
+#endif
+};
+
+
+/* SF program emitters, one per primitive kind.  Each takes the compile state
+ * to emit into.
+ */
+void brw_emit_tri_setup( struct brw_sf_compile *c );
+void brw_emit_line_setup( struct brw_sf_compile *c );
+void brw_emit_point_setup( struct brw_sf_compile *c );
+void brw_emit_point_sprite_setup( struct brw_sf_compile *c );
+void brw_emit_anyprim_setup( struct brw_sf_compile *c );
+
+#endif
diff --git a/src/gallium/drivers/i965simple/brw_sf_emit.c b/src/gallium/drivers/i965simple/brw_sf_emit.c
new file mode 100644
index 0000000000..78d6fa5e9e
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_sf_emit.c
@@ -0,0 +1,382 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#include "brw_defines.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_util.h"
+#include "brw_sf.h"
+
+
+
+/***********************************************************************
+ * Triangle setup.
+ */
+
+
+static void alloc_regs( struct brw_sf_compile *c )
+{
+   unsigned next_grf, v;
+
+   /* Assign a fixed location to every register the setup programs
+    * use.  r1.1 .. r1.6 hold values computed by the fixed function unit:
+    */
+   c->pv  = retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_UD);
+   c->det = brw_vec1_grf(1, 2);
+   c->dx0 = brw_vec1_grf(1, 3);
+   c->dx2 = brw_vec1_grf(1, 4);
+   c->dy0 = brw_vec1_grf(1, 5);
+   c->dy2 = brw_vec1_grf(1, 6);
+
+   /* r2: z and 1/w are passed in separately, interleaved per vertex
+    * (z0, 1/w0, z1, 1/w1, z2, 1/w2):
+    */
+   for (v = 0; v < 3; v++) {
+      c->z[v]     = brw_vec1_grf(2, 2 * v);
+      c->inv_w[v] = brw_vec1_grf(2, 2 * v + 1);
+   }
+
+   /* r3 onwards: the vertices, nr_attr_regs registers apiece:
+    */
+   next_grf = 3;
+   for (v = 0; v < c->nr_verts; v++) {
+      c->vert[v] = brw_vec8_grf(next_grf, 0);
+      next_grf += c->nr_attr_regs;
+   }
+
+   /* Temporaries, one register each, allocated after the last
+    * vertex register:
+    */
+   c->inv_det   = brw_vec1_grf(next_grf, 0);
+   c->a1_sub_a0 = brw_vec8_grf(next_grf + 1, 0);
+   c->a2_sub_a0 = brw_vec8_grf(next_grf + 2, 0);
+   c->tmp       = brw_vec8_grf(next_grf + 3, 0);
+   next_grf += 4;
+
+   /* Note grf allocation: */
+   c->prog_data.total_grf = next_grf;
+
+   /* Outputs of this program - interpolation coefficients for
+    * rasterization, built in message registers m1 .. m3:
+    */
+   c->m1Cx = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 1, 0);
+   c->m2Cy = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 2, 0);
+   c->m3C0 = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 3, 0);
+}
+
+
+static void copy_z_inv_w( struct brw_sf_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   unsigned vtx;
+
+   brw_push_insn_state(p);
+
+   /* One 2-wide MOV per vertex copies z and 1/w together: */
+   for (vtx = 0; vtx < c->nr_verts; vtx++) {
+      struct brw_reg dst = vec2(suboffset(c->vert[vtx], 2));
+      brw_MOV(p, dst, vec2(c->z[vtx]));
+   }
+   brw_pop_insn_state(p);
+}
+
+
+static void invert_det( struct brw_sf_compile *c)
+{
+   struct brw_compile *p = &c->func;
+
+   /* inv_det = INV(det): scalar data, full precision, no saturate. */
+   brw_math(p, c->inv_det,
+	    BRW_MATH_FUNCTION_INV,
+	    BRW_MATH_SATURATE_NONE,
+	    0,
+	    c->det,
+	    BRW_MATH_DATA_SCALAR, BRW_MATH_PRECISION_FULL);
+}
+
+#define NON_PERPECTIVE_ATTRS  (FRAG_BIT_WPOS | \
+                               FRAG_BIT_COL0 | \
+			       FRAG_BIT_COL1) /* NOTE(review): never referenced in this file; name misspells "PERSPECTIVE" */
+
+static boolean calculate_masks( struct brw_sf_compile *c,
+				  unsigned reg,
+				  ushort *pc,
+				  ushort *pc_persp,
+				  ushort *pc_linear)
+{
+   /* Setup register 'reg' carries attributes 2*reg and 2*reg+1; the
+    * first owns mask bits 0xf, the second 0xf0.
+    */
+   const unsigned lo_attr = reg * 2;
+   const unsigned hi_attr = lo_attr + 1;
+   const unsigned persp = c->key.persp_mask;
+   const unsigned linear = c->key.linear_mask;
+
+   debug_printf("persp_mask: %x\n", persp);
+   debug_printf("linear_mask: %x\n", linear);
+
+   /* The first attribute of the pair is always enabled in *pc:
+    */
+   *pc = 0xf;
+   *pc_persp = (persp & (1 << lo_attr)) ? 0xf : 0;
+   *pc_linear = (linear & (1 << lo_attr)) ? 0xf : 0;
+
+   /* Maybe only process one attribute on the final round:
+    */
+   if (hi_attr < c->nr_setup_attrs) {
+      *pc |= 0xf0;
+
+      if (persp & (1 << hi_attr))
+	 *pc_persp |= 0xf0;
+
+      if (linear & (1 << hi_attr))
+	 *pc_linear |= 0xf0;
+   }
+
+   debug_printf("pc: %x\n", *pc);
+   debug_printf("pc_persp: %x\n", *pc_persp);
+   debug_printf("pc_linear: %x\n", *pc_linear);
+
+   /* TRUE when 'reg' is the last setup register: */
+   return reg == c->nr_setup_regs - 1;
+}
+
+
+
+void brw_emit_tri_setup( struct brw_sf_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   unsigned i;
+
+   debug_printf("%s START ==============\n", __FUNCTION__);
+   /* Triangle setup: three vertices; 1/det is needed for the gradients. */
+   c->nr_verts = 3;
+   alloc_regs(c);
+   invert_det(c);
+   copy_z_inv_w(c);
+
+   /* Emit one block of instructions plus one URB write per setup register: */
+   for (i = 0; i < c->nr_setup_regs; i++)
+   {
+      /* Pair of incoming attributes:
+       */
+      struct brw_reg a0 = offset(c->vert[0], i);
+      struct brw_reg a1 = offset(c->vert[1], i);
+      struct brw_reg a2 = offset(c->vert[2], i);
+      ushort pc = 0, pc_persp = 0, pc_linear = 0;
+      boolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
+      /* Attributes flagged in persp_mask are scaled in place by their vertex's 1/w: */
+      if (pc_persp)
+      {
+	 brw_set_predicate_control_flag_value(p, pc_persp);
+	 brw_MUL(p, a0, a0, c->inv_w[0]);
+	 brw_MUL(p, a1, a1, c->inv_w[1]);
+	 brw_MUL(p, a2, a2, c->inv_w[2]);
+      }
+
+
+      /* Calculate coefficients for interpolated values:
+       */
+      if (pc_linear)
+      {
+	 brw_set_predicate_control_flag_value(p, pc_linear);
+
+	 brw_ADD(p, c->a1_sub_a0, a1, negate(a0));
+	 brw_ADD(p, c->a2_sub_a0, a2, negate(a0));
+
+	 /* calculate dA/dx = (a1_sub_a0 * dy2 - a2_sub_a0 * dy0) * inv_det
+	  * (assumes MUL-to-null primes the accumulator MAC adds to -- confirm) */
+	 brw_MUL(p, brw_null_reg(), c->a1_sub_a0, c->dy2);
+	 brw_MAC(p, c->tmp, c->a2_sub_a0, negate(c->dy0));
+	 brw_MUL(p, c->m1Cx, c->tmp, c->inv_det);
+
+	 /* calculate dA/dy = (a2_sub_a0 * dx0 - a1_sub_a0 * dx2) * inv_det
+	  */
+	 brw_MUL(p, brw_null_reg(), c->a2_sub_a0, c->dx0);
+	 brw_MAC(p, c->tmp, c->a1_sub_a0, negate(c->dx2));
+	 brw_MUL(p, c->m2Cy, c->tmp, c->inv_det);
+      }
+
+      {
+	 brw_set_predicate_control_flag_value(p, pc);
+	 /* start point for interpolation
+	  */
+	 brw_MOV(p, c->m3C0, a0);
+
+	 /* Copy m0..m3 to URB.  m0 is implicitly copied from r0 in
+	  * the send instruction:
+	  */
+	 brw_urb_WRITE(p,
+		       brw_null_reg(),
+		       0,
+		       brw_vec8_grf(0, 0), /* r0, will be copied to m0 */
+		       0, 	/* allocate */
+		       1,	/* used */
+		       4, 	/* msg len */
+		       0,	/* response len */
+		       last,	/* eot */
+		       last, 	/* writes complete */
+		       i*4,	/* offset */
+		       BRW_URB_SWIZZLE_TRANSPOSE); /* XXX: Swizzle control "SF to windower" */
+      }
+   }
+
+   debug_printf("%s DONE ==============\n", __FUNCTION__);
+
+}
+
+
+
+void brw_emit_line_setup( struct brw_sf_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   unsigned i;
+
+   /* Line setup: same structure as the triangle path, with two vertices. */
+   c->nr_verts = 2;
+   alloc_regs(c);
+   invert_det(c);
+   copy_z_inv_w(c);
+
+   for (i = 0; i < c->nr_setup_regs; i++)
+   {
+      /* Pair of incoming attributes:
+       */
+      struct brw_reg a0 = offset(c->vert[0], i);
+      struct brw_reg a1 = offset(c->vert[1], i);
+      ushort pc, pc_persp, pc_linear;
+      boolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
+
+      if (pc_persp)
+      {
+	 brw_set_predicate_control_flag_value(p, pc_persp);
+	 brw_MUL(p, a0, a0, c->inv_w[0]);
+	 brw_MUL(p, a1, a1, c->inv_w[1]);
+      }
+
+      /* Calculate coefficients for position, color:
+       */
+      if (pc_linear) {
+	 brw_set_predicate_control_flag_value(p, pc_linear);
+
+	 brw_ADD(p, c->a1_sub_a0, a1, negate(a0));
+	 /* m1Cx = (a1 - a0) * dx0 * inv_det */
+ 	 brw_MUL(p, c->tmp, c->a1_sub_a0, c->dx0);
+	 brw_MUL(p, c->m1Cx, c->tmp, c->inv_det);
+	 /* m2Cy = (a1 - a0) * dy0 * inv_det */
+	 brw_MUL(p, c->tmp, c->a1_sub_a0, c->dy0);
+	 brw_MUL(p, c->m2Cy, c->tmp, c->inv_det);
+      }
+
+      {
+	 brw_set_predicate_control_flag_value(p, pc);
+
+	 /* start point for interpolation
+	  */
+	 brw_MOV(p, c->m3C0, a0);
+
+	 /* Copy m0..m3 to URB; eot and "writes complete" are both
+	  * set only for the last setup register. */
+	 brw_urb_WRITE(p,
+		       brw_null_reg(),
+		       0,
+		       brw_vec8_grf(0, 0),
+		       0, 	/* allocate */
+		       1, 	/* used */
+		       4, 	/* msg len */
+		       0,	/* response len */
+		       last, 	/* eot */
+		       last, 	/* writes complete */
+		       i*4,	/* urb destination offset */
+		       BRW_URB_SWIZZLE_TRANSPOSE);
+      }
+   }
+}
+
+
+/* Points setup - several simplifications as all attributes are
+ * constant across the face of the point (point sprites excluded!)
+ */
+void brw_emit_point_setup( struct brw_sf_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   unsigned i;
+
+   c->nr_verts = 1;
+   alloc_regs(c);
+   copy_z_inv_w(c);
+   /* Both gradient registers are zeroed once, before the loop; invert_det() is not called here. */
+   brw_MOV(p, c->m1Cx, brw_imm_ud(0)); /* dA/dx coefficients = 0 */
+   brw_MOV(p, c->m2Cy, brw_imm_ud(0)); /* dA/dy coefficients = 0 */
+
+   for (i = 0; i < c->nr_setup_regs; i++)
+   {
+      struct brw_reg a0 = offset(c->vert[0], i);
+      ushort pc, pc_persp, pc_linear;
+      boolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
+
+      if (pc_persp)
+      {
+	 /* This seems odd as the values are all constant, but the
+	  * fragment shader will be expecting it:
+	  */
+	 brw_set_predicate_control_flag_value(p, pc_persp);
+	 brw_MUL(p, a0, a0, c->inv_w[0]);
+      }
+
+      /* pc_linear is filled in by calculate_masks() but not used in this function. */
+      /* The delta values are always zero, just send the starting
+       * coordinate.  Again, this is to fit in with the interpolation
+       * code in the fragment shader.
+       */
+      {
+	 brw_set_predicate_control_flag_value(p, pc);
+
+	 brw_MOV(p, c->m3C0, a0); /* constant value */
+
+	 /* Copy m0..m3 to URB.
+	  */
+	 brw_urb_WRITE(p,
+		       brw_null_reg(),
+		       0,
+		       brw_vec8_grf(0, 0),
+		       0, 	/* allocate */
+		       1,	/* used */
+		       4, 	/* msg len */
+		       0,	/* response len */
+		       last, 	/* eot */
+		       last, 	/* writes complete */
+		       i*4,	/* urb destination offset */
+		       BRW_URB_SWIZZLE_TRANSPOSE);
+      }
+   }
+}
diff --git a/src/gallium/drivers/i965simple/brw_sf_state.c b/src/gallium/drivers/i965simple/brw_sf_state.c
new file mode 100644
index 0000000000..2a5de61c21
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_sf_state.c
@@ -0,0 +1,181 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+
+static void upload_sf_vp(struct brw_context *brw)
+{
+   /* BRW_NEW_VIEWPORT */
+   const float *vp_scale = brw->attribs.Viewport.scale;
+   const float *vp_trans = brw->attribs.Viewport.translate;
+   struct brw_sf_viewport sfv;
+
+   /* Start from all-zero; sfv is handed whole to brw_cache_data(). */
+   memset(&sfv, 0, sizeof(sfv));
+
+   /* Scale fills the m00/m11/m22 entries, translation m30/m31/m32;
+    * everything else stays zero from the memset: */
+   sfv.viewport.m00 = vp_scale[0];
+   sfv.viewport.m30 = vp_trans[0];
+   sfv.viewport.m11 = vp_scale[1];
+   sfv.viewport.m31 = vp_trans[1];
+   sfv.viewport.m22 = vp_scale[2];
+   sfv.viewport.m32 = vp_trans[2];
+
+   /* BRW_NEW_SCISSOR: the max edges are stored minus one. */
+   sfv.scissor.xmin = brw->attribs.Scissor.minx;
+   sfv.scissor.ymin = brw->attribs.Scissor.miny;
+   sfv.scissor.xmax = brw->attribs.Scissor.maxx - 1;
+   sfv.scissor.ymax = brw->attribs.Scissor.maxy - 1;
+
+   /* Remember the offset brw_cache_data() returns for this viewport: */
+   brw->sf.vp_gs_offset = brw_cache_data( &brw->cache[BRW_SF_VP], &sfv );
+}
+
+const struct brw_tracked_state brw_sf_vp = {
+   .dirty = {
+      .brw   = (BRW_NEW_SCISSOR |
+		BRW_NEW_VIEWPORT),	/* the two state groups upload_sf_vp() reads */
+      .cache = 0
+   },
+   .update = upload_sf_vp
+};
+
+static void upload_sf_unit( struct brw_context *brw )
+{
+   /* Rebuild the complete SF unit state and hand it to the state cache. */
+   struct brw_sf_unit_state sf;
+   memset(&sf, 0, sizeof(sf));
+
+   /* CACHE_NEW_SF_PROG */
+   sf.thread0.grf_reg_count = align(brw->sf.prog_data->total_grf, 16) / 16 - 1;
+   sf.thread0.kernel_start_pointer = brw->sf.prog_gs_offset >> 6;
+   sf.thread3.urb_entry_read_length = brw->sf.prog_data->urb_read_length;
+
+   sf.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
+   sf.thread3.dispatch_grf_start_reg = 3;	/* alloc_regs() also places the first vertex in r3 */
+   sf.thread3.urb_entry_read_offset = 1;
+
+   /* BRW_NEW_URB_FENCE */
+   sf.thread4.nr_urb_entries = brw->urb.nr_sf_entries;
+   sf.thread4.urb_entry_allocation_size = brw->urb.sfsize - 1;
+   sf.thread4.max_threads = MIN2(12, brw->urb.nr_sf_entries / 2) - 1;
+
+   if (BRW_DEBUG & DEBUG_SINGLE_THREAD)
+      sf.thread4.max_threads = 0;
+   if (BRW_DEBUG & DEBUG_STATS)
+      sf.thread4.stats_enable = 1;
+
+   /* CACHE_NEW_SF_VP */
+   sf.sf5.sf_viewport_state_offset = brw->sf.vp_gs_offset >> 5;
+   sf.sf5.viewport_transform = 1;
+
+   /* BRW_NEW_RASTERIZER */
+   if (brw->attribs.Raster->scissor)
+      sf.sf6.scissor = 1;
+
+#if 0
+   if (brw->attribs.Polygon->FrontFace == GL_CCW)
+      sf.sf5.front_winding = BRW_FRONTWINDING_CCW;
+   else
+      sf.sf5.front_winding = BRW_FRONTWINDING_CW;
+
+
+   if (brw->attribs.Polygon->CullFlag) {
+      switch (brw->attribs.Polygon->CullFaceMode) {
+      case GL_FRONT:
+	 sf.sf6.cull_mode = BRW_CULLMODE_FRONT;
+	 break;
+      case GL_BACK:
+	 sf.sf6.cull_mode = BRW_CULLMODE_BACK;
+	 break;
+      case GL_FRONT_AND_BACK:
+	 sf.sf6.cull_mode = BRW_CULLMODE_BOTH;
+	 break;
+      default:
+	 assert(0);
+	 break;
+      }
+   }
+   else
+      sf.sf6.cull_mode = BRW_CULLMODE_NONE;
+#else
+   sf.sf5.front_winding = BRW_FRONTWINDING_CCW;
+   sf.sf6.cull_mode = BRW_CULLMODE_NONE;
+#endif
+
+   sf.sf6.line_width = CLAMP(brw->attribs.Raster->line_width, 1.0, 5.0) * (1<<1);
+
+   sf.sf6.line_endcap_aa_region_width = 1;
+   if (brw->attribs.Raster->line_smooth)
+      sf.sf6.aa_enable = 1;
+   else if (sf.sf6.line_width <= 0x2)
+      sf.sf6.line_width = 0;
+
+   sf.sf6.point_rast_rule = 1;	/* opengl conventions */
+   /* The point size comes from the rasterizer's point_size, not line_width: */
+   sf.sf7.sprite_point = brw->attribs.Raster->point_sprite;
+   sf.sf7.point_size = CLAMP(brw->attribs.Raster->point_size, 1.0, 255.0) * (1<<3);
+   sf.sf7.use_point_size_state = !brw->attribs.Raster->point_size_per_vertex;
+
+   /* might be BRW_NEW_PRIMITIVE if we have to adjust pv for polygons:
+    */
+   sf.sf7.trifan_pv = 2;
+   sf.sf7.linestrip_pv = 1;
+   sf.sf7.tristrip_pv = 2;
+   sf.sf7.line_last_pixel_enable = 0;
+
+   /* Set bias for OpenGL rasterization rules:
+    */
+   sf.sf6.dest_org_vbias = 0x8;
+   sf.sf6.dest_org_hbias = 0x8;
+
+   brw->sf.state_gs_offset = brw_cache_data( &brw->cache[BRW_SF_UNIT], &sf );
+}
+
+
+const struct brw_tracked_state brw_sf_unit = {
+   .dirty = {
+      .brw   = (BRW_NEW_RASTERIZER |
+		BRW_NEW_URB_FENCE),
+      .cache = (CACHE_NEW_SF_VP |
+		CACHE_NEW_SF_PROG)	/* upload_sf_unit() reads sf.vp_gs_offset, sf.prog_data and sf.prog_gs_offset */
+   },
+   .update = upload_sf_unit
+};
+
+
diff --git a/src/gallium/drivers/i965simple/brw_shader_info.c b/src/gallium/drivers/i965simple/brw_shader_info.c
new file mode 100644
index 0000000000..86d877d7ef
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_shader_info.c
@@ -0,0 +1,48 @@
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "util/u_memory.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+
+
+/**
+ * Record, for each TGSI register file, the number of registers the
+ * shader's leading declarations cover (highest declared index + 1).
+ * info->nr_regs is only ever raised here, never reset.
+ * XXX this is obsolete now and no longer compiled.
+ */
+void brw_shader_info(const struct tgsi_token *tokens,
+		     struct brw_shader_info *info )
+{
+   struct tgsi_parse_context parse;
+
+   tgsi_parse_init( &parse, tokens );
+
+   while( !tgsi_parse_end_of_tokens( &parse ) )
+   {
+      const struct tgsi_full_declaration *decl;
+      unsigned file, count;
+
+      tgsi_parse_token( &parse );
+
+      /* Only the run of declarations at the head of the token stream
+       * is examined; the first immediate, instruction or unknown
+       * token ends the scan.
+       */
+      if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_DECLARATION)
+	 break;
+
+      decl = &parse.FullToken.FullDeclaration;
+      file = decl->Declaration.File;
+      count = decl->DeclarationRange.Last + 1;
+
+      /* An assert on nr_regs[file] <= Last used to sit here; it was
+       * disabled with the note "Broken by crazy wpos init".
+       */
+      if (info->nr_regs[file] < count)
+	 info->nr_regs[file] = count;
+   }
+
+   tgsi_parse_free (&parse);
+}
diff --git a/src/gallium/drivers/i965simple/brw_state.c b/src/gallium/drivers/i965simple/brw_state.c
new file mode 100644
index 0000000000..b47f5373f3
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_state.c
@@ -0,0 +1,469 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/* Authors:  Zack Rusin <zack@tungstengraphics.com>
+ *           Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "pipe/internal/p_winsys_screen.h"
+#include "util/u_memory.h"
+#include "pipe/p_inlines.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_parse.h"
+
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "brw_state.h"
+#include "brw_draw.h"
+
+
+#define DUP( TYPE, VAL )  /* heap-copy *VAL; yields NULL if malloc fails */ \
+do {                                            \
+   struct TYPE *x = malloc(sizeof(*x));         \
+   if (x) memcpy(x, VAL, sizeof(*x) );          \
+   return x;  /* returns from the ENCLOSING function */ \
+} while (0)
+
+/************************************************************************
+ * Blend 
+ */
+static void *
+brw_create_blend_state(struct pipe_context *pipe,
+                        const struct pipe_blend_state *blend)
+{   
+   DUP( pipe_blend_state, blend );   /* the macro returns a malloc'ed copy of *blend */
+}
+
+static void brw_bind_blend_state(struct pipe_context *pipe,
+                                 void *blend)
+{
+   /* Remember the bound object (no copy) and flag blend state dirty. */
+   struct brw_context *ctx = brw_context(pipe);
+   ctx->state.dirty.brw |= BRW_NEW_BLEND;
+   ctx->attribs.Blend = (struct pipe_blend_state *) blend;
+}
+
+
+static void brw_delete_blend_state(struct pipe_context *pipe, void *blend)
+{
+   free(blend);   /* pairs with the malloc() in DUP; 'pipe' is unused */
+}
+
+static void brw_set_blend_color( struct pipe_context *pipe,
+			     const struct pipe_blend_color *blend_color )
+{
+   struct brw_context *ctx = brw_context(pipe);
+
+   /* Keep a private copy of the colour, then flag blend state dirty. */
+   ctx->attribs.BlendColor = *blend_color;
+   ctx->state.dirty.brw |= BRW_NEW_BLEND;
+}
+
+/************************************************************************
+ * Sampler 
+ */
+
+static void *
+brw_create_sampler_state(struct pipe_context *pipe,
+                          const struct pipe_sampler_state *sampler)
+{
+   DUP( pipe_sampler_state, sampler );   /* the macro returns a malloc'ed copy of *sampler */
+}
+
+static void brw_bind_sampler_states(struct pipe_context *pipe,
+                                    unsigned num, void **sampler)
+{
+   struct brw_context *ctx = brw_context(pipe);
+
+   assert(num <= PIPE_MAX_SAMPLERS);
+
+   /* Skip everything when exactly this list is already bound: */
+   if (num != ctx->num_samplers ||
+       memcmp(ctx->attribs.Samplers, sampler, num * sizeof(void *)) != 0) {
+
+      /* Take the new pointers and clear the unused tail slots: */
+      memcpy(ctx->attribs.Samplers, sampler, num * sizeof(void *));
+      memset(&ctx->attribs.Samplers[num], 0,
+             (PIPE_MAX_SAMPLERS - num) * sizeof(void *));
+
+      ctx->num_samplers = num;
+      ctx->state.dirty.brw |= BRW_NEW_SAMPLER;
+   }
+}
+
+static void brw_delete_sampler_state(struct pipe_context *pipe,
+                                      void *sampler)
+{
+   free(sampler);   /* pairs with the malloc() in DUP; 'pipe' is unused */
+}
+
+
+/************************************************************************
+ * Depth stencil 
+ */
+
+static void *
+brw_create_depth_stencil_state(struct pipe_context *pipe,
+                           const struct pipe_depth_stencil_alpha_state *depth_stencil)
+{
+   DUP( pipe_depth_stencil_alpha_state, depth_stencil );   /* the macro returns a malloc'ed copy */
+}
+
+static void brw_bind_depth_stencil_state(struct pipe_context *pipe,
+                                         void *depth_stencil)
+{
+   const struct pipe_depth_stencil_alpha_state *cso = depth_stencil;
+   struct brw_context *ctx = brw_context(pipe);
+   /* Point at the bound object (no copy) and flag the state dirty. */
+   ctx->attribs.DepthStencil = cso;
+   ctx->state.dirty.brw |= BRW_NEW_DEPTH_STENCIL;
+}
+
+static void brw_delete_depth_stencil_state(struct pipe_context *pipe,
+                                           void *depth_stencil)
+{
+   free(depth_stencil);   /* pairs with the malloc() in DUP; 'pipe' is unused */
+}
+
+/************************************************************************
+ * Scissor
+ */
+static void brw_set_scissor_state( struct pipe_context *pipe,
+                                 const struct pipe_scissor_state *scissor )
+{
+   /* Snapshot the rectangle; upload_sf_vp() reads it on BRW_NEW_SCISSOR. */
+   struct brw_context *ctx = brw_context(pipe);
+   ctx->state.dirty.brw |= BRW_NEW_SCISSOR;
+   memcpy( &ctx->attribs.Scissor, scissor, sizeof(*scissor) );
+}
+
+
+/************************************************************************
+ * Stipple
+ */
+
+static void brw_set_polygon_stipple( struct pipe_context *pipe,
+                                   const struct pipe_poly_stipple *stipple )
+{	/* empty stub: the stipple pattern is ignored */
+}
+
+
+/************************************************************************
+ * Fragment shader
+ */
+
+static void * brw_create_fs_state(struct pipe_context *pipe,
+                                   const struct pipe_shader_state *shader)
+{
+   struct brw_fragment_program *brw_fp = CALLOC_STRUCT(brw_fragment_program);
+
+   /* Out of memory: report failure instead of dereferencing NULL. */
+   if (!brw_fp)
+      return NULL;
+
+   /* Keep a private copy of the tokens; brw_delete_fs_state() frees it. */
+   brw_fp->program.tokens = tgsi_dup_tokens(shader->tokens);
+   /* Give the program the context's next program_id. */
+   brw_fp->id = brw_context(pipe)->program_id++;
+
+   tgsi_scan_shader(shader->tokens, &brw_fp->info);
+
+   tgsi_dump(shader->tokens, 0);
+
+   return (void *)brw_fp;
+}
+
+static void brw_bind_fs_state(struct pipe_context *pipe, void *shader)
+{
+   /* Make 'shader' the current fragment program and flag it dirty. */
+   struct brw_context *ctx = brw_context(pipe);
+   ctx->state.dirty.brw |= BRW_NEW_FS;
+   ctx->attribs.FragmentProgram = (struct brw_fragment_program *) shader;
+}
+
+static void brw_delete_fs_state(struct pipe_context *pipe, void *shader)
+{
+   /* Release the token copy made at create time, then the program. */
+   struct brw_fragment_program *fp = shader;
+   FREE((void *) fp->program.tokens);
+   FREE(fp);
+}
+
+
+/************************************************************************
+ * Vertex shader and other TNL state 
+ */
+
+static void *brw_create_vs_state(struct pipe_context *pipe,
+                                 const struct pipe_shader_state *shader)
+{
+   struct brw_vertex_program *brw_vp = CALLOC_STRUCT(brw_vertex_program);
+
+   /* Out of memory: report failure instead of dereferencing NULL. */
+   if (!brw_vp)
+      return NULL;
+
+   /* Keep a private copy of the tokens; brw_delete_vs_state() frees it. */
+   brw_vp->program.tokens = tgsi_dup_tokens(shader->tokens);
+   brw_vp->id = brw_context(pipe)->program_id++;
+
+   tgsi_scan_shader(shader->tokens, &brw_vp->info);
+   tgsi_dump(shader->tokens, 0);
+
+   return (void *)brw_vp;
+}
+
+static void brw_bind_vs_state(struct pipe_context *pipe, void *vs)
+{
+   struct brw_context *ctx = brw_context(pipe);
+
+   /* Make 'vs' the current vertex program and flag it dirty. */
+   ctx->state.dirty.brw |= BRW_NEW_VS;
+   ctx->attribs.VertexProgram = (struct brw_vertex_program *) vs;
+   debug_printf("YYYYYYYYYYYYY BINDING VERTEX SHADER\n");
+}
+
+static void brw_delete_vs_state(struct pipe_context *pipe, void *shader)
+{
+   /* Release the token copy made at create time, then the program. */
+   struct brw_vertex_program *vp = shader;
+   FREE((void *) vp->program.tokens);
+   FREE(vp);
+}
+
+
+static void brw_set_clip_state( struct pipe_context *pipe,
+                                const struct pipe_clip_state *clip )
+{
+   struct brw_context *brw = brw_context(pipe);
+   /* NOTE(review): unlike the other setters here, no dirty bit is raised -- confirm intended. */
+   brw->attribs.Clip = *clip;
+}
+
+
+static void brw_set_viewport_state( struct pipe_context *pipe,
+				     const struct pipe_viewport_state *viewport )
+{
+   struct brw_context *ctx = brw_context(pipe);
+
+   /* Store the viewport by value and flag BRW_NEW_VIEWPORT.  The
+    * pass-through to the draw module is disabled.
+    */
+   ctx->state.dirty.brw |= BRW_NEW_VIEWPORT;
+   ctx->attribs.Viewport = *viewport;
+}
+
+
+static void brw_set_vertex_buffers(struct pipe_context *pipe,
+				   unsigned count,
+				   const struct pipe_vertex_buffer *buffers)
+{
+   struct brw_context *brw = brw_context(pipe);	/* NOTE(review): count is not range-checked and no dirty bit is set -- confirm */
+   memcpy(brw->vb.vbo_array, buffers, count * sizeof(buffers[0]));
+}
+
+static void brw_set_vertex_elements(struct pipe_context *pipe,
+                                    unsigned count,
+                                    const struct pipe_vertex_element *elements)
+{
+   /* flush ? */
+   struct brw_context *ctx = brw_context(pipe);
+   uint idx;
+
+   assert(count <= PIPE_MAX_ATTRIBS);
+
+   for (idx = 0; idx < count; idx++) {
+      const struct pipe_vertex_element *src = &elements[idx];
+      const unsigned nr = src->nr_components;
+      struct brw_vertex_element_state el;
+
+      memset(&el, 0, sizeof(el));
+
+      el.ve0.src_offset = src->src_offset;
+      el.ve0.src_format = brw_translate_surface_format(src->src_format);
+      el.ve0.valid = 1;
+      el.ve0.vertex_buffer_index = src->vertex_buffer_index;
+
+      /* Destination slots are four units apart: */
+      el.ve1.dst_offset   = idx * 4;
+
+      /* Default is STORE_SRC.  With 1..3 source components, component 3
+       * gets STORE_1_FLT and the missing components 1/2 get STORE_0:
+       */
+      el.ve1.vfcomponent0 = BRW_VFCOMPONENT_STORE_SRC;
+      el.ve1.vfcomponent1 = (nr == 1) ? BRW_VFCOMPONENT_STORE_0 : BRW_VFCOMPONENT_STORE_SRC;
+      el.ve1.vfcomponent2 = (nr == 1 || nr == 2) ? BRW_VFCOMPONENT_STORE_0 : BRW_VFCOMPONENT_STORE_SRC;
+      el.ve1.vfcomponent3 = (nr >= 1 && nr <= 3) ? BRW_VFCOMPONENT_STORE_1_FLT : BRW_VFCOMPONENT_STORE_SRC;
+
+      ctx->vb.inputs[idx] = el;
+   }
+}
+
+
+
+/************************************************************************
+ * Constant buffers
+ */
+
+static void brw_set_constant_buffer(struct pipe_context *pipe,
+                                     uint shader, uint index,
+                                     const struct pipe_constant_buffer *buf)
+{
+   struct brw_context *ctx = brw_context(pipe);
+
+   /* Only index 0 may carry a buffer; the pointer itself is stored, not a copy. */
+   assert(buf == 0 || index == 0);
+   ctx->attribs.Constants[shader] = buf;
+   ctx->state.dirty.brw |= BRW_NEW_CONSTANTS;
+}
+
+
+/************************************************************************
+ * Texture surfaces
+ */
+
+
+static void brw_set_sampler_textures(struct pipe_context *pipe,
+                                     unsigned num,
+                                     struct pipe_texture **texture)
+{
+   struct brw_context *ctx = brw_context(pipe);
+   uint slot;
+
+   assert(num <= PIPE_MAX_SAMPLERS);
+
+   /* Nothing to do when exactly this list is already bound: */
+   if (num == ctx->num_textures &&
+       memcmp(ctx->attribs.Texture, texture,
+              num * sizeof(struct pipe_texture *)) == 0)
+      return;
+
+   /* Slots below 'num' take the incoming textures; slots from 'num'
+    * up to the old count are re-pointed at NULL: */
+   for (slot = 0; slot < num; slot++)
+      pipe_texture_reference((struct pipe_texture **) &ctx->attribs.Texture[slot],
+                             texture[slot]);
+   for (; slot < ctx->num_textures; slot++)
+      pipe_texture_reference((struct pipe_texture **) &ctx->attribs.Texture[slot],
+                             NULL);
+
+   ctx->num_textures = num;
+   ctx->state.dirty.brw |= BRW_NEW_TEXTURE;
+}
+
+
+/************************************************************************
+ * Render targets, etc
+ */
+
+static void brw_set_framebuffer_state(struct pipe_context *pipe,
+				       const struct pipe_framebuffer_state *fb)
+{
+   struct brw_context *ctx = brw_context(pipe);
+
+   /* Store the framebuffer description by value and flag it dirty. */
+   ctx->state.dirty.brw |= BRW_NEW_FRAMEBUFFER;
+   ctx->attribs.FrameBuffer = *fb;
+}
+
+
+
+/************************************************************************
+ * Rasterizer state
+ */
+
+static void *
+brw_create_rasterizer_state(struct pipe_context *pipe,
+                             const struct pipe_rasterizer_state *rasterizer)
+{
+   DUP(pipe_rasterizer_state, rasterizer);   /* the macro returns a malloc'ed copy of *rasterizer */
+}
+
+static void brw_bind_rasterizer_state( struct pipe_context *pipe,
+                                        void *setup )
+{
+   struct brw_context *ctx = brw_context(pipe);
+
+   /* Point at the bound rasterizer object (no copy); upload_sf_unit()
+    * dereferences it when BRW_NEW_RASTERIZER is flagged.
+    */
+   ctx->attribs.Raster = (struct pipe_rasterizer_state *) setup;
+   ctx->state.dirty.brw |= BRW_NEW_RASTERIZER;
+
+   /* The pass-through to the draw module is disabled. */
+}
+
+static void brw_delete_rasterizer_state(struct pipe_context *pipe,
+                                         void *setup)
+{
+   free(setup);   /* pairs with the malloc() in DUP; 'pipe' is unused */
+}
+
+
+
+void
+brw_init_state_functions( struct brw_context *brw )
+{
+   /* State objects: create / bind / delete triples. */
+   brw->pipe.create_blend_state = brw_create_blend_state;
+   brw->pipe.bind_blend_state = brw_bind_blend_state;
+   brw->pipe.delete_blend_state = brw_delete_blend_state;
+
+   brw->pipe.create_sampler_state = brw_create_sampler_state;
+   brw->pipe.bind_sampler_states = brw_bind_sampler_states;
+   brw->pipe.delete_sampler_state = brw_delete_sampler_state;
+
+   brw->pipe.create_depth_stencil_alpha_state = brw_create_depth_stencil_state;
+   brw->pipe.bind_depth_stencil_alpha_state = brw_bind_depth_stencil_state;
+   brw->pipe.delete_depth_stencil_alpha_state = brw_delete_depth_stencil_state;
+
+   brw->pipe.create_rasterizer_state = brw_create_rasterizer_state;
+   brw->pipe.bind_rasterizer_state = brw_bind_rasterizer_state;
+   brw->pipe.delete_rasterizer_state = brw_delete_rasterizer_state;
+
+   /* Shaders. */
+   brw->pipe.create_fs_state = brw_create_fs_state;
+   brw->pipe.bind_fs_state = brw_bind_fs_state;
+   brw->pipe.delete_fs_state = brw_delete_fs_state;
+   brw->pipe.create_vs_state = brw_create_vs_state;
+   brw->pipe.bind_vs_state = brw_bind_vs_state;
+   brw->pipe.delete_vs_state = brw_delete_vs_state;
+
+   /* Directly-set state; the feedback hooks are not installed. */
+   brw->pipe.set_blend_color = brw_set_blend_color;
+   brw->pipe.set_clip_state = brw_set_clip_state;
+   brw->pipe.set_constant_buffer = brw_set_constant_buffer;
+   brw->pipe.set_framebuffer_state = brw_set_framebuffer_state;
+   brw->pipe.set_polygon_stipple = brw_set_polygon_stipple;
+   brw->pipe.set_scissor_state = brw_set_scissor_state;
+   brw->pipe.set_sampler_textures = brw_set_sampler_textures;
+   brw->pipe.set_viewport_state = brw_set_viewport_state;
+   brw->pipe.set_vertex_buffers = brw_set_vertex_buffers;
+   brw->pipe.set_vertex_elements = brw_set_vertex_elements;
+}
diff --git a/src/gallium/drivers/i965simple/brw_state.h b/src/gallium/drivers/i965simple/brw_state.h
new file mode 100644
index 0000000000..de0a6371b8
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_state.h
@@ -0,0 +1,151 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+    
+
+#ifndef BRW_STATE_H
+#define BRW_STATE_H
+
+#include "brw_context.h"
+#include "brw_winsys.h"
+
+/* extern: declare only, so including this header defines no objects. */
+extern const struct brw_tracked_state brw_blend_constant_color;
+extern const struct brw_tracked_state brw_cc_unit;
+extern const struct brw_tracked_state brw_cc_vp;
+extern const struct brw_tracked_state brw_clip_prog;
+extern const struct brw_tracked_state brw_clip_unit;
+extern const struct brw_tracked_state brw_constant_buffer_state;
+extern const struct brw_tracked_state brw_constant_buffer;
+extern const struct brw_tracked_state brw_curbe_offsets;
+extern const struct brw_tracked_state brw_invarient_state;
+extern const struct brw_tracked_state brw_gs_prog;
+extern const struct brw_tracked_state brw_gs_unit;
+extern const struct brw_tracked_state brw_drawing_rect;
+extern const struct brw_tracked_state brw_line_stipple;
+extern const struct brw_tracked_state brw_pipelined_state_pointers;
+extern const struct brw_tracked_state brw_binding_table_pointers;
+extern const struct brw_tracked_state brw_depthbuffer;
+extern const struct brw_tracked_state brw_polygon_stipple_offset;
+extern const struct brw_tracked_state brw_polygon_stipple;
+extern const struct brw_tracked_state brw_program_parameters;
+extern const struct brw_tracked_state brw_recalculate_urb_fence;
+extern const struct brw_tracked_state brw_sf_prog;
+extern const struct brw_tracked_state brw_sf_unit;
+extern const struct brw_tracked_state brw_sf_vp;
+extern const struct brw_tracked_state brw_state_base_address;
+extern const struct brw_tracked_state brw_urb_fence;
+extern const struct brw_tracked_state brw_vertex_state;
+extern const struct brw_tracked_state brw_vs_prog;
+extern const struct brw_tracked_state brw_vs_unit;
+extern const struct brw_tracked_state brw_wm_prog;
+extern const struct brw_tracked_state brw_wm_samplers;
+extern const struct brw_tracked_state brw_wm_surfaces;
+extern const struct brw_tracked_state brw_wm_unit;
+
+extern const struct brw_tracked_state brw_psp_urb_cbs;
+
+extern const struct brw_tracked_state brw_active_vertprog;
+extern const struct brw_tracked_state brw_tnl_vertprog;
+extern const struct brw_tracked_state brw_pipe_control;
+
+extern const struct brw_tracked_state brw_clear_surface_cache;
+extern const struct brw_tracked_state brw_clear_batch_cache;
+
+/***********************************************************************
+ * brw_state_cache.c
+ */
+/* Cache 'data' (cache->key_size bytes, used as its own key); returns the
+ * offset of the uploaded copy within the cache's pool buffer. */
+unsigned brw_cache_data(struct brw_cache *cache, const void *data );
+
+/* As brw_cache_data(), but with an explicit size in bytes. */
+unsigned brw_cache_data_sz(struct brw_cache *cache,
+                           const void *data, unsigned data_sz);
+
+/* Upload 'data' to the pool and file it under 'key'; 'aux' (cache->aux_size
+ * bytes) is stored after the key and handed back through aux_return. */
+unsigned brw_upload_cache( struct brw_cache *cache,
+                           const void *key, unsigned key_sz,
+                           const void *data, unsigned data_sz,
+                           const void *aux, void *aux_return );
+
+/* Look up 'key'.  Returns TRUE on a hit, after storing the pool offset in
+ * *offset_return and, if aux_return is non-NULL, the aux pointer in it. */
+boolean brw_search_cache( struct brw_cache *cache,
+                          const void *key, unsigned key_size,
+                          void *aux_return, unsigned *offset_return);
+
+void brw_init_caches( struct brw_context *brw );
+void brw_destroy_caches( struct brw_context *brw );
+
+static inline struct pipe_buffer *
+brw_cache_buffer(struct brw_context *brw, enum brw_cache_id id)
+{
+   return brw->cache[id].pool->buffer;   /* buffer of the pool this cache uses */
+}
+
+/***********************************************************************
+ * brw_state_batch.c
+ */
+/* Emit *s via brw_cached_batch_struct(), taking the size from its type. */
+#define BRW_CACHED_BATCH_STRUCT(brw, s) brw_cached_batch_struct( brw, (s), sizeof(*(s)) )
+
+/* Returns TRUE if 'data' was written to the batch, FALSE if it was elided. */
+boolean brw_cached_batch_struct( struct brw_context *brw, const void *data, unsigned sz );
+
+void brw_destroy_batch_cache( struct brw_context *brw );
+
+
+/***********************************************************************
+ * brw_state_pool.c
+ */
+void brw_init_pools( struct brw_context *brw );
+void brw_destroy_pools( struct brw_context *brw );
+
+/* Carve 'size' bytes out of 'pool' at the given alignment; the offset of
+ * the allocation is stored in *offset_return. */
+boolean brw_pool_alloc( struct brw_mem_pool *pool,
+                        unsigned size, unsigned alignment,
+                        unsigned *offset_return);
+
+void brw_pool_fence( struct brw_context *brw,
+                     struct brw_mem_pool *pool, unsigned fence );
+
+/* Raise BRW_NEW_SCENE once 'pool' is more than three-quarters full. */
+void brw_pool_check_wrap( struct brw_context *brw, struct brw_mem_pool *pool );
+
+/* Defined in brw_state_cache.c, brw_state_pool.c and brw_state_batch.c: */
+void brw_clear_all_caches( struct brw_context *brw );
+void brw_invalidate_pools( struct brw_context *brw );
+void brw_clear_batch_cache_flush( struct brw_context *brw );
+
+
+#endif
diff --git a/src/gallium/drivers/i965simple/brw_state_batch.c b/src/gallium/drivers/i965simple/brw_state_batch.c
new file mode 100644
index 0000000000..43a1c89fc4
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_state_batch.c
@@ -0,0 +1,113 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "brw_state.h"
+#include "brw_winsys.h"
+
+#include "util/u_memory.h"
+
+/* Emit a command packet to the batch unless the previous packet with the
+ * same opcode was identical.  Returns TRUE if emitted, FALSE if elided.
+ * If the cache cannot allocate, the packet is emitted without being cached.
+ */
+boolean brw_cached_batch_struct( struct brw_context *brw,
+                                 const void *data,
+                                 unsigned sz )
+{
+   struct brw_cached_batch_item *item = brw->cached_batch_items;
+   const struct header *newheader = (const struct header *)data;
+
+   if (brw->emit_state_always)
+      goto emit;
+
+   while (item && item->header->opcode != newheader->opcode)
+      item = item->next;
+
+   if (item && item->sz == sz && memcmp(item->header, newheader, sz) == 0)
+      return FALSE;
+
+   if (!item || item->sz != sz) {
+      struct header *copy = MALLOC(sz);
+      if (copy && !item && (item = CALLOC_STRUCT(brw_cached_batch_item))) {
+         item->next = brw->cached_batch_items;
+         brw->cached_batch_items = item;
+      }
+      if (!copy || !item) {
+         if (item) item->sz = 0;   /* stale copy must never match again */
+         if (copy) FREE(copy);
+         goto emit;
+      }
+      if (item->header) FREE(item->header);
+      item->header = copy;
+      item->sz = sz;
+   }
+   memcpy(item->header, newheader, sz);
+
+emit:
+   brw_batchbuffer_data(brw->winsys, data, sz);
+   return TRUE;
+}
+
+/* Free all remembered packets, then clear the state caches and pools. */
+static void clear_batch_cache( struct brw_context *brw )
+{
+   struct brw_cached_batch_item *item = brw->cached_batch_items;
+
+   while (item) {
+      struct brw_cached_batch_item *next = item->next;
+      /* Allocated with MALLOC/CALLOC_STRUCT, so release with FREE. */
+      FREE(item->header);
+      FREE(item);
+      item = next;
+   }
+
+   brw->cached_batch_items = NULL;
+
+   brw_clear_all_caches(brw);
+   brw_invalidate_pools(brw);
+}
+
+/* Drop the batch cache, then mark all brw and cache state dirty. */
+void brw_clear_batch_cache_flush( struct brw_context *brw )
+{
+   clear_batch_cache(brw);
+   /* Disabled: brw_do_flush(brw, BRW_FLUSH_STATE_CACHE|BRW_FLUSH_READ_CACHE); */
+
+   brw->state.dirty.cache |= ~0;
+   brw->state.dirty.brw |= ~0;
+}
+
+
+/* Free the cached batch items; this also clears the state caches and pools. */
+void brw_destroy_batch_cache( struct brw_context *brw )
+{
+   clear_batch_cache(brw);
+}
diff --git a/src/gallium/drivers/i965simple/brw_state_cache.c b/src/gallium/drivers/i965simple/brw_state_cache.c
new file mode 100644
index 0000000000..094248fa69
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_state_cache.c
@@ -0,0 +1,443 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#include "brw_state.h"
+
+#include "brw_wm.h"
+#include "brw_vs.h"
+#include "brw_clip.h"
+#include "brw_sf.h"
+#include "brw_gs.h"
+
+#include "util/u_memory.h"
+
+
+
+/***********************************************************************
+ * Check cache for uploaded version of struct, else upload new one.
+ * Fail when memory is exhausted.
+ *
+ * XXX: FIXME: Currently search is so slow it would be quicker to
+ * regenerate the data every time...
+ */
+
+/* XOR together the dwords of a key; key_size must be a multiple of 4. */
+static unsigned hash_key( const void *key, unsigned key_size )
+{
+   const unsigned *word = (const unsigned *)key;
+   const unsigned *end = word + key_size / 4;
+   unsigned hash = 0;
+
+   assert(key_size % 4 == 0);
+
+   /* I'm sure this can be improved on: */
+   while (word != end)
+      hash ^= *word++;
+   return hash;
+}
+
+/* Return the item in the bucket for 'hash' whose key matches byte-for-byte,
+ * or NULL if the key is not cached. */
+static struct brw_cache_item *
+search_cache( struct brw_cache *cache, unsigned hash,
+              const void *key, unsigned key_size)
+{
+   struct brw_cache_item *item = cache->items[hash % cache->size];
+
+   while (item &&
+          (item->hash != hash ||
+           item->key_size != key_size ||
+           memcmp(item->key, key, key_size) != 0))
+      item = item->next;
+
+   return item;
+}
+
+
+/* Grow the table to 3x the buckets; on allocation failure keep the old one. */
+static void rehash( struct brw_cache *cache )
+{
+   unsigned size = cache->size * 3;
+   struct brw_cache_item **items = CALLOC(size, sizeof(*items));
+   struct brw_cache_item *c, *next;
+   unsigned i;
+
+   if (!items)
+      return;   /* out of memory: chains get longer, lookups still work */
+
+   for (i = 0; i < cache->size; i++)
+      for (c = cache->items[i]; c; c = next) {
+         next = c->next;
+         c->next = items[c->hash % size];
+         items[c->hash % size] = c;
+      }
+
+   FREE(cache->items);
+   cache->items = items;
+   cache->size = size;
+}
+
+/* Look up 'key' in the cache.  On a hit, store the item's pool offset in
+ * *offset_return and, if aux_return is non-NULL, a pointer to the aux data
+ * stored after the key in *aux_return.  The cache's dirty bit is raised on
+ * a miss or when the offset differs from cache->last_addr.
+ */
+boolean brw_search_cache( struct brw_cache *cache,
+                          const void *key, unsigned key_size,
+                          void *aux_return, unsigned *offset_return)
+{
+   struct brw_cache_item *hit =
+      search_cache(cache, hash_key(key, key_size), key, key_size);
+   unsigned addr = hit ? hit->offset : 0;
+
+   if (hit) {
+      if (aux_return)
+         *(void **)aux_return = (void *)((char *)hit->key + hit->key_size);
+      *offset_return = addr;
+   }
+
+   if (!hit || addr != cache->last_addr) {
+      cache->brw->state.dirty.cache |= 1<<cache->id;
+      cache->last_addr = addr;
+   }
+
+   return hit != NULL;
+}
+
+/* Allocate pool space for 'data', upload it, and file a new item under
+ * 'key'.  The key and cache->aux_size bytes of 'aux' are copied into one
+ * block, aux after the key; aux_return, if non-NULL, receives a pointer to
+ * that aux copy.  Returns the data's pool offset.  Running out of memory or
+ * pool space is fatal. */
+unsigned brw_upload_cache( struct brw_cache *cache,
+                           const void *key, unsigned key_size,
+                           const void *data, unsigned data_size,
+                           const void *aux, void *aux_return )
+{
+   unsigned offset;
+   unsigned hash = hash_key(key, key_size);
+   struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
+   char *tmp = MALLOC(key_size + cache->aux_size);  /* char * so offsets are in bytes */
+
+   /* A NULL from a zero-sized request is not a failure. */
+   if (!item || (!tmp && key_size + cache->aux_size != 0)) {
+      debug_printf("brw_upload_cache: out of memory\n");
+      exit(1);
+   }
+
+   if (!brw_pool_alloc(cache->pool, data_size, 1 << 6, &offset)) {
+      /* Should not be possible: */
+      debug_printf("brw_pool_alloc failed\n");
+      exit(1);
+   }
+
+   memcpy(tmp, key, key_size);
+
+   if (cache->aux_size)
+      memcpy(tmp + key_size, aux, cache->aux_size);
+
+   item->key = tmp;
+   item->hash = hash;
+   item->key_size = key_size;
+   item->offset = offset;
+   item->data_size = data_size;
+
+   if (++cache->n_items > cache->size * 1.5)
+      rehash(cache);
+
+   hash %= cache->size;
+   item->next = cache->items[hash];
+   cache->items[hash] = item;
+
+   if (aux_return) {
+      assert(cache->aux_size);
+      *(void **)aux_return = (void *)(tmp + key_size);
+   }
+
+   if (BRW_DEBUG & DEBUG_STATE)
+      debug_printf("upload %s: %d bytes to pool buffer %p offset %x\n",
+                   cache->name, data_size, (void*)cache->pool->buffer, offset);
+
+   /* Copy data to the buffer: */
+   cache->brw->winsys->buffer_subdata_typed(cache->brw->winsys,
+                                            cache->pool->buffer,
+                                            offset, data_size, data, cache->id);
+
+   cache->brw->state.dirty.cache |= 1<<cache->id;
+   cache->last_addr = offset;
+
+   return offset;
+}
+
+/* Return the pool offset of a cached copy of 'data', uploading it first if
+ * it is not already cached.  The data acts as its own key.
+ *
+ * This doesn't really work with aux data.  Use search/upload instead.
+ */
+unsigned brw_cache_data_sz(struct brw_cache *cache,
+                           const void *data,
+                           unsigned data_size)
+{
+   unsigned addr;
+
+   if (brw_search_cache(cache, data, data_size, NULL, &addr))
+      return addr;
+
+   /* Miss: key and payload are the same bytes; no aux in or out. */
+   return brw_upload_cache(cache, data, data_size, data, data_size, NULL, NULL);
+}
+
+/* Cache 'data' using the cache's fixed key size as its length. */
+unsigned brw_cache_data(struct brw_cache *cache, const void *data)
+{
+   return brw_cache_data_sz(cache, data, cache->key_size);
+}
+
+enum pool_type {
+   DW_SURFACE_STATE,   /* brw_init_cache() maps this to brw->pool[BRW_SS_POOL] */
+   DW_GENERAL_STATE    /* brw_init_cache() maps this to brw->pool[BRW_GS_POOL] */
+};
+
+/* Set up cache 'id' as an empty 7-bucket hash table whose data will be
+ * allocated from the general-state or surface-state pool.
+ */
+static void brw_init_cache( struct brw_context *brw, const char *name,
+                            unsigned id, unsigned key_size, unsigned aux_size,
+                            enum pool_type pool_type)
+{
+   struct brw_cache *cache = &brw->cache[id];
+
+   cache->brw = brw;
+   cache->id = id;
+   cache->name = name;
+   cache->key_size = key_size;
+   cache->aux_size = aux_size;
+
+   /* The table is an array of bucket-head pointers, so each element is
+    * sized as a pointer, not as a whole item. */
+   cache->size = 7;
+   cache->n_items = 0;
+   cache->items = CALLOC(cache->size, sizeof(*cache->items));
+
+   switch (pool_type) {
+   case DW_GENERAL_STATE: cache->pool = &brw->pool[BRW_GS_POOL]; break;
+   case DW_SURFACE_STATE: cache->pool = &brw->pool[BRW_SS_POOL]; break;
+   default: assert(0); break;
+   }
+}
+
+void brw_init_caches( struct brw_context *brw )
+{
+   /* Arguments: name, cache id, key size, aux size, pool to allocate from. */
+   brw_init_cache(brw,
+		  "CC_VP",
+		  BRW_CC_VP,
+		  sizeof(struct brw_cc_viewport),
+		  0,
+		  DW_GENERAL_STATE);
+
+   brw_init_cache(brw,
+		  "CC_UNIT",
+		  BRW_CC_UNIT,
+		  sizeof(struct brw_cc_unit_state),
+		  0,
+		  DW_GENERAL_STATE);
+
+   brw_init_cache(brw,
+		  "WM_PROG",
+		  BRW_WM_PROG,
+		  sizeof(struct brw_wm_prog_key),
+		  sizeof(struct brw_wm_prog_data),
+		  DW_GENERAL_STATE);
+
+   brw_init_cache(brw,
+		  "SAMPLER_DEFAULT_COLOR",
+		  BRW_SAMPLER_DEFAULT_COLOR,
+		  sizeof(struct brw_sampler_default_color),
+		  0,
+		  DW_GENERAL_STATE);
+
+   brw_init_cache(brw,
+		  "SAMPLER",
+		  BRW_SAMPLER,
+		  0,		/* variable key/data size */
+		  0,
+		  DW_GENERAL_STATE);
+
+   brw_init_cache(brw,
+		  "WM_UNIT",
+		  BRW_WM_UNIT,
+		  sizeof(struct brw_wm_unit_state),
+		  0,
+		  DW_GENERAL_STATE);
+
+   brw_init_cache(brw,
+		  "SF_PROG",
+		  BRW_SF_PROG,
+		  sizeof(struct brw_sf_prog_key),
+		  sizeof(struct brw_sf_prog_data),
+		  DW_GENERAL_STATE);
+
+   brw_init_cache(brw,
+		  "SF_VP",
+		  BRW_SF_VP,
+		  sizeof(struct brw_sf_viewport),
+		  0,
+		  DW_GENERAL_STATE);
+
+   brw_init_cache(brw,
+		  "SF_UNIT",
+		  BRW_SF_UNIT,
+		  sizeof(struct brw_sf_unit_state),
+		  0,
+		  DW_GENERAL_STATE);
+
+   brw_init_cache(brw,
+		  "VS_UNIT",
+		  BRW_VS_UNIT,
+		  sizeof(struct brw_vs_unit_state),
+		  0,
+		  DW_GENERAL_STATE);
+
+   brw_init_cache(brw,
+		  "VS_PROG",
+		  BRW_VS_PROG,
+		  sizeof(struct brw_vs_prog_key),
+		  sizeof(struct brw_vs_prog_data),
+		  DW_GENERAL_STATE);
+
+   brw_init_cache(brw,
+		  "CLIP_UNIT",
+		  BRW_CLIP_UNIT,
+		  sizeof(struct brw_clip_unit_state),
+		  0,
+		  DW_GENERAL_STATE);
+
+   brw_init_cache(brw,
+		  "CLIP_PROG",
+		  BRW_CLIP_PROG,
+		  sizeof(struct brw_clip_prog_key),
+		  sizeof(struct brw_clip_prog_data),
+		  DW_GENERAL_STATE);
+
+   brw_init_cache(brw,
+		  "GS_UNIT",
+		  BRW_GS_UNIT,
+		  sizeof(struct brw_gs_unit_state),
+		  0,
+		  DW_GENERAL_STATE);
+
+   brw_init_cache(brw,
+		  "GS_PROG",
+		  BRW_GS_PROG,
+		  sizeof(struct brw_gs_prog_key),
+		  sizeof(struct brw_gs_prog_data),
+		  DW_GENERAL_STATE);
+
+   brw_init_cache(brw,
+		  "SS_SURFACE",
+		  BRW_SS_SURFACE,
+		  sizeof(struct brw_surface_state),
+		  0,
+		  DW_SURFACE_STATE);
+
+   brw_init_cache(brw,
+		  "SS_SURF_BIND",
+		  BRW_SS_SURF_BIND,
+		  sizeof(struct brw_surface_binding_table),
+		  0,
+		  DW_SURFACE_STATE);
+}
+
+
+/* When we lose hardware context, need to invalidate the surface cache
+ * as these structs must be explicitly re-uploaded.  They are subject
+ * to fixup by the memory manager as they contain absolute agp
+ * offsets, so we need to ensure there is a fresh version of the
+ * struct available to receive the fixup.
+ *
+ * XXX: Need to ensure that there aren't two versions of a surface or
+ * bufferobj with different backing data active in the same buffer at
+ * once?  Otherwise the cache could confuse them.  Maybe better not to
+ * cache at all?
+ *
+ * --> Isn't this the same as saying need to ensure batch is flushed
+ *         before new data is uploaded to an existing buffer?  We
+ *         already try to make sure of that.
+ */
+static void clear_cache( struct brw_cache *cache )
+{
+   struct brw_cache_item *c, *next;
+   unsigned i;
+
+   /* Free every item (MALLOC'd key, CALLOC_STRUCT'd item) with the matching FREE. */
+   for (i = 0; i < cache->size; i++) {
+      for (c = cache->items[i]; c; c = next) {
+         next = c->next;
+         FREE((void *)c->key);
+         FREE(c);
+      }
+      cache->items[i] = NULL;
+   }
+   cache->n_items = 0;
+}
+
+/* Empty every state cache, free brw->curbe.last_buf, mark all state dirty. */
+void brw_clear_all_caches( struct brw_context *brw )
+{
+   int i = 0;
+
+   if (BRW_DEBUG & DEBUG_STATE)
+      debug_printf("%s\n", __FUNCTION__);
+
+   while (i < BRW_MAX_CACHE)
+      clear_cache(&brw->cache[i++]);
+
+   if (brw->curbe.last_buf) {
+      FREE(brw->curbe.last_buf);
+      brw->curbe.last_buf = NULL;
+   }
+   brw->state.dirty.brw |= ~0;
+   brw->state.dirty.cache |= ~0;
+}
+
+/* Free every cached item and each cache's bucket array. */
+void brw_destroy_caches( struct brw_context *brw )
+{
+   unsigned i;
+   for (i = 0; i < BRW_MAX_CACHE; i++) {
+      clear_cache(&brw->cache[i]);
+      FREE(brw->cache[i].items);
+      brw->cache[i].items = NULL;
+      brw->cache[i].size = 0;   /* so that later clear_cache() calls skip it */
+   }
+}
diff --git a/src/gallium/drivers/i965simple/brw_state_pool.c b/src/gallium/drivers/i965simple/brw_state_pool.c
new file mode 100644
index 0000000000..e91263cb1f
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_state_pool.c
@@ -0,0 +1,138 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+/** @file brw_state_pool.c
+ * Implements the state pool allocator.
+ *
+ * For the 965, we create two state pools for state cache entries.  Objects
+ * will be allocated into the pools depending on which state base address
+ * their pointer is relative to in other 965 state.
+ *
+ * The state pools are relatively simple: As objects are allocated, increment
+ * the offset to allocate space.  When the pool is "full" (rather, close to
+ * full), we reset the pool and reset the state cache entries that point into
+ * the pool.
+ */
+
+#include "pipe/internal/p_winsys_screen.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "pipe/p_inlines.h"
+#include "brw_context.h"
+#include "brw_state.h"
+
+/* Bump-allocate from the pool: pad the current offset to 'alignment',
+ * store the result in *offset_return and advance past align(size, 4)
+ * bytes.  Running out of space is fatal, so only TRUE is ever returned.
+ */
+boolean brw_pool_alloc( struct brw_mem_pool *pool, unsigned size,
+                        unsigned alignment, unsigned *offset_return)
+{
+   unsigned start = align(pool->offset, alignment);
+
+   size = align(size, 4);
+
+   if (start + size >= pool->size) {
+      debug_printf("%s failed\n", __FUNCTION__);
+      assert(0);
+      exit(1);   /* a fatal error must not exit with a success status */
+   }
+
+   *offset_return = start;
+   pool->offset = start + size;
+   return TRUE;
+}
+
+/* Rewind the pool to empty and clear all state caches. */
+static void brw_invalidate_pool( struct brw_mem_pool *pool )
+{
+   if (BRW_DEBUG & DEBUG_STATE)
+      debug_printf("\n\n\n %s \n\n\n", __FUNCTION__);
+
+   /* Old offsets are about to be reused, so no cached item may survive. */
+   pool->offset = 0;
+   brw_clear_all_caches(pool->brw);
+}
+
+
+/* Set up an empty pool of the given size and create its backing buffer. */
+static void brw_init_pool( struct brw_context *brw,
+                           unsigned pool_id,
+                           unsigned size )
+{
+   struct brw_mem_pool *pool = &brw->pool[pool_id];
+
+   pool->brw = brw;
+   pool->size = size;
+   pool->offset = 0;   /* allocation starts at the beginning of the buffer */
+
+   pool->buffer = pipe_buffer_create(brw->pipe.screen, 4096,
+                                     0 /*  DRM_BO_FLAG_MEM_TT */, size);
+}
+
+/* Release the pool's reference to its backing buffer by replacing it
+ * with a NULL reference.
+ */
+static void brw_destroy_pool( struct brw_context *brw, unsigned pool_id )
+{
+   struct brw_mem_pool *pool = &brw->pool[pool_id];
+
+   pipe_buffer_reference( pool->brw->pipe.screen, &pool->buffer, NULL );
+}
+
+
+/* Once the pool is more than three-quarters full, raise BRW_NEW_SCENE;
+ * brw_validate_state() responds by clearing the caches and pools.
+ */
+void brw_pool_check_wrap( struct brw_context *brw, struct brw_mem_pool *pool )
+{
+   if (pool->offset > (pool->size * 3) / 4)
+      brw->state.dirty.brw |= BRW_NEW_SCENE;
+}
+
+void brw_init_pools( struct brw_context *brw )
+{
+   brw_init_pool(brw, BRW_GS_POOL, 0x80000);   /* 0x80000 = 512 * 1024 */
+   brw_init_pool(brw, BRW_SS_POOL, 0x80000);   /* same size for both pools */
+}
+
+void brw_destroy_pools( struct brw_context *brw )
+{
+   brw_destroy_pool(brw, BRW_GS_POOL);   /* pool used by DW_GENERAL_STATE caches */
+   brw_destroy_pool(brw, BRW_SS_POOL);   /* pool used by DW_SURFACE_STATE caches */
+}
+
+/* Rewind both pools; each rewind also clears every state cache. */
+void brw_invalidate_pools( struct brw_context *brw )
+{
+   brw_invalidate_pool(&brw->pool[BRW_GS_POOL]);
+   brw_invalidate_pool(&brw->pool[BRW_SS_POOL]);
+}
diff --git a/src/gallium/drivers/i965simple/brw_state_upload.c b/src/gallium/drivers/i965simple/brw_state_upload.c
new file mode 100644
index 0000000000..bac9161b5f
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_state_upload.c
@@ -0,0 +1,202 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#include "brw_context.h"
+#include "brw_state.h"
+
+#include "util/u_memory.h"
+
+/* The state atoms, in the order brw_validate_state() visits them.  The
+ * order matters: the BRW_DEBUG path of brw_validate_state() asserts that
+ * running an atom never changes a dirty flag already examined earlier in
+ * the list.  NOTE(review): this comment used to say the list could not be
+ * used directly because brw_constant_buffer's .dirty varies -- confirm.
+ */
+const struct brw_tracked_state *atoms[] =
+{
+   &brw_vs_prog,
+   &brw_gs_prog,
+   &brw_clip_prog,
+   &brw_sf_prog,
+   &brw_wm_prog,
+
+   /* Once all the programs are done, we know how large urb entry
+    * sizes need to be and can decide if we need to change the urb
+    * layout.
+    */
+   &brw_curbe_offsets,
+   &brw_recalculate_urb_fence,
+
+
+   &brw_cc_vp,
+   &brw_cc_unit,
+
+   &brw_wm_surfaces,		/* must do before samplers */
+   &brw_wm_samplers,
+
+   &brw_wm_unit,
+   &brw_sf_vp,
+   &brw_sf_unit,
+   &brw_vs_unit,		/* always required, enabled or not */
+   &brw_clip_unit,
+   &brw_gs_unit,
+
+   /* Command packets:
+    */
+   &brw_invarient_state,
+   &brw_state_base_address,
+   &brw_pipe_control,
+
+   &brw_binding_table_pointers,
+   &brw_blend_constant_color,
+
+   &brw_drawing_rect,
+   &brw_depthbuffer,
+
+   &brw_polygon_stipple,
+   &brw_line_stipple,
+
+   &brw_psp_urb_cbs,
+
+   &brw_constant_buffer
+};
+
+/* Create the pools and caches, then mark all brw state dirty. */
+void brw_init_state( struct brw_context *brw )
+{
+   brw_init_pools(brw);
+   brw_init_caches(brw);
+
+   brw->emit_state_always = 0;
+   brw->state.dirty.brw = ~0;
+}
+
+/* Tear down in this order: state caches, batch cache, then the pools. */
+void brw_destroy_state( struct brw_context *brw )
+{
+   brw_destroy_caches(brw);
+   brw_destroy_batch_cache(brw);
+   brw_destroy_pools(brw);
+}
+
+/***********************************************************************
+ */
+
+/* TRUE if the two flag sets have any brw or cache bit in common. */
+static boolean check_state( const struct brw_state_flags *a,
+                            const struct brw_state_flags *b )
+{
+   return (a->brw & b->brw) != 0 || (a->cache & b->cache) != 0;
+}
+
+/* OR the flags in 'b' into 'a'. */
+static void accumulate_state( struct brw_state_flags *a,
+                              const struct brw_state_flags *b )
+{
+   a->cache |= b->cache;
+   a->brw |= b->brw;
+}
+
+/* Store in 'result' the flags that differ between 'a' and 'b'. */
+static void xor_states( struct brw_state_flags *result,
+                        const struct brw_state_flags *a,
+                        const struct brw_state_flags *b )
+{
+   result->cache = a->cache ^ b->cache;
+   result->brw = a->brw ^ b->brw;
+}
+
+/***********************************************************************
+ * Emit all state:
+ *
+ * Walk the atom list in order and run the update hook of every atom whose
+ * dirty flags intersect the context's dirty flags, then clear the
+ * context's flags.  Returns immediately when nothing is dirty.
+ */
+void brw_validate_state( struct brw_context *brw )
+{
+   struct brw_state_flags *state = &brw->state.dirty;
+   struct brw_state_flags examined, prev;
+   const int sanity_checks = BRW_DEBUG ? 1 : 0;
+   unsigned i;
+
+   if (brw->emit_state_always)
+      state->brw |= ~0;
+
+   if (state->brw == 0 && state->cache == 0)
+      return;
+
+   /* BRW_NEW_SCENE is raised by brw_pool_check_wrap() when a pool is
+    * nearly full: drop the batch cache, state caches and pools, which also
+    * marks all state dirty.
+    */
+   if (state->brw & BRW_NEW_SCENE)
+      brw_clear_batch_cache_flush(brw);
+
+   /* Bookkeeping for the debug checks below: 'examined' accumulates the
+    * flags of every atom visited so far, 'prev' is the context's dirty
+    * state before the most recent update.
+    */
+   memset(&examined, 0, sizeof(examined));
+   prev = *state;
+
+   for (i = 0; i < Elements(atoms); i++) {
+      const struct brw_tracked_state *atom = atoms[i];
+
+      assert(atom->dirty.brw ||
+             atom->dirty.cache);
+      assert(atom->update);
+
+      if (check_state(state, &atom->dirty))
+         atom->update( brw );
+
+      if (sanity_checks) {
+         /* Debug version which enforces various sanity checks on the
+          * state flags which are generated and checked to help ensure
+          * state atoms are ordered correctly in the list.
+          *
+          * generated = (prev ^ state)
+          * if (examined & generated)
+          *     fail;
+          */
+         struct brw_state_flags generated;
+
+         accumulate_state(&examined, &atom->dirty);
+         xor_states(&generated, &prev, state);
+         assert(!check_state(&examined, &generated));
+         prev = *state;
+      }
+   }
+
+   /* Everything has been brought up to date. */
+   memset(state, 0, sizeof(*state));
+}
diff --git a/src/gallium/drivers/i965simple/brw_structs.h b/src/gallium/drivers/i965simple/brw_structs.h
new file mode 100644
index 0000000000..bbb087e95d
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_structs.h
@@ -0,0 +1,1348 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#ifndef BRW_STRUCTS_H
+#define BRW_STRUCTS_H
+
+#include "pipe/p_compiler.h"
+
+/* Command packets:
+ */
+struct header   /* Generic packet header: 16-bit length plus 16-bit opcode (32 bits). */
+{
+   unsigned length:16;
+   unsigned opcode:16;
+};
+
+
+union header_union   /* struct header overlaid with a raw unsigned, so it can be set field-wise or as one word. */
+{
+   struct header bits;
+   unsigned dword;
+};
+
+struct brw_3d_control   /* Packet of four words: header, dest, dword2, dword3. */
+{
+   struct
+   {
+      unsigned length:8;
+      unsigned notify_enable:1;
+      unsigned pad:3;
+      unsigned wc_flush_enable:1;
+      unsigned depth_stall_enable:1;
+      unsigned operation:2;
+      unsigned opcode:16;
+   } header;   /* 8+1+3+1+1+2+16 = 32 bits */
+
+   struct
+   {
+      unsigned pad:2;
+      unsigned dest_addr_type:1;
+      unsigned dest_addr:29;
+   } dest;   /* 2+1+29 = 32 bits */
+
+   unsigned dword2;
+   unsigned dword3;
+};
+
+
+struct brw_3d_primitive   /* Packet: bitfield header followed by five unsigned parameters. */
+{
+   struct
+   {
+      unsigned length:8;
+      unsigned pad:2;
+      unsigned topology:5;
+      unsigned indexed:1;
+      unsigned opcode:16;
+   } header;   /* 8+2+5+1+16 = 32 bits */
+
+   unsigned verts_per_instance;
+   unsigned start_vert_location;
+   unsigned instance_count;
+   unsigned start_instance_location;
+   unsigned base_vert_location;
+};
+
+/* These seem to be passed around as function args, so it works out
+ * better to keep them as #defines:
+ */
+#define BRW_FLUSH_READ_CACHE           0x1   /* bit 0 */
+#define BRW_FLUSH_STATE_CACHE          0x2   /* bit 1 */
+#define BRW_INHIBIT_FLUSH_RENDER_CACHE 0x4   /* bit 2 */
+#define BRW_FLUSH_SNAPSHOT_COUNTERS    0x8   /* bit 3 */
+
+struct brw_mi_flush   /* Single-word packet: 4+12+16 = 32 bits. */
+{
+   unsigned flags:4;   /* 4 bits wide; presumably an OR of the BRW_*FLUSH* values (0x1..0x8) -- confirm at callers */
+   unsigned pad:12;
+   unsigned opcode:16;
+};
+
+struct brw_vf_statistics   /* Single-word packet: 1+15+16 = 32 bits. */
+{
+   unsigned statistics_enable:1;
+   unsigned pad:15;
+   unsigned opcode:16;
+};
+
+
+
+struct brw_binding_table_pointers   /* Header plus one unsigned per unit: vs, gs, clp, sf, wm. */
+{
+   struct header header;
+   unsigned vs;
+   unsigned gs;
+   unsigned clp;
+   unsigned sf;
+   unsigned wm;
+};
+
+
+struct brw_blend_constant_color   /* Header plus four floats. */
+{
+   struct header header;
+   float blend_constant_color[4];   /* component order is not stated in this file */
+};
+
+
+struct brw_depthbuffer   /* Five words; header, dword1, dword3 and dword4 are unions (bitfields or raw word). */
+{
+   union header_union header;
+
+   union {
+      struct {
+	 unsigned pitch:18;
+	 unsigned format:3;
+	 unsigned pad:4;
+	 unsigned depth_offset_disable:1;
+	 unsigned tile_walk:1;
+	 unsigned tiled_surface:1;
+	 unsigned pad2:1;
+	 unsigned surface_type:3;
+      } bits;   /* 18+3+4+1+1+1+1+3 = 32 bits */
+      unsigned dword;
+   } dword1;
+
+   unsigned dword2_base_addr;   /* plain word, no bitfield view */
+
+   union {
+      struct {
+	 unsigned pad:1;
+	 unsigned mipmap_layout:1;
+	 unsigned lod:4;
+	 unsigned width:13;
+	 unsigned height:13;
+      } bits;   /* 1+1+4+13+13 = 32 bits */
+      unsigned dword;
+   } dword3;
+
+   union {
+      struct {
+	 unsigned pad:12;
+	 unsigned min_array_element:9;
+	 unsigned depth:11;
+      } bits;   /* 12+9+11 = 32 bits */
+      unsigned dword;
+   } dword4;
+};
+
+struct brw_drawrect   /* Header plus three pairs of 16-bit x/y fields: min, max, origin. */
+{
+   struct header header;
+   unsigned xmin:16;
+   unsigned ymin:16;
+   unsigned xmax:16;
+   unsigned ymax:16;
+   unsigned xorg:16;
+   unsigned yorg:16;
+};
+
+
+
+
+struct brw_global_depth_offset_clamp   /* Header plus a single float. */
+{
+   struct header header;
+   float depth_offset_clamp;
+};
+
+struct brw_indexbuffer   /* Header (bitfields or raw word) plus buffer_start and buffer_end. */
+{
+   union {
+      struct
+      {
+	 unsigned length:8;
+	 unsigned index_format:2;
+	 unsigned cut_index_enable:1;
+	 unsigned pad:5;
+	 unsigned opcode:16;
+      } bits;   /* 8+2+1+5+16 = 32 bits */
+      unsigned dword;
+
+   } header;
+
+   unsigned buffer_start;   /* units/addressing not stated here -- confirm at the emit site */
+   unsigned buffer_end;
+};
+
+
+struct brw_line_stipple   /* Header plus two words: bits0 (pattern) and bits1 (repeat counts). */
+{
+   struct header header;
+
+   struct
+   {
+      unsigned pattern:16;
+      unsigned pad:16;
+   } bits0;   /* 16+16 = 32 bits */
+
+   struct
+   {
+      unsigned repeat_count:9;
+      unsigned pad:7;
+      unsigned inverse_repeat_count:16;
+   } bits1;   /* 9+7+16 = 32 bits */
+};
+
+
+struct brw_pipelined_state_pointers   /* Header plus one word per unit: vs, gs, clp, sf, wm, cc. */
+{
+   struct header header;
+
+   struct {
+      unsigned pad:5;
+      unsigned offset:27;   /* 27-bit offset after 5 non-offset bits; same shape in every member below */
+   } vs;
+
+   struct
+   {
+      unsigned enable:1;   /* gs and clp carve an enable bit out of the 5 leading bits */
+      unsigned pad:4;
+      unsigned offset:27;
+   } gs;
+
+   struct
+   {
+      unsigned enable:1;
+      unsigned pad:4;
+      unsigned offset:27;
+   } clp;
+
+   struct
+   {
+      unsigned pad:5;
+      unsigned offset:27;
+   } sf;
+
+   struct
+   {
+      unsigned pad:5;
+      unsigned offset:27;
+   } wm;
+
+   struct
+   {
+      unsigned pad:5;
+      unsigned offset:27; /* KW: check me! */
+   } cc;
+};
+
+
+struct brw_polygon_stipple_offset   /* Header plus one word holding 5-bit x and y offsets. */
+{
+   struct header header;
+
+   struct {
+      unsigned y_offset:5;
+      unsigned pad:3;
+      unsigned x_offset:5;
+      unsigned pad0:19;
+   } bits0;   /* 5+3+5+19 = 32 bits */
+};
+
+
+
+struct brw_polygon_stipple   /* Header plus 32 unsigned words of pattern data. */
+{
+   struct header header;
+   unsigned stipple[32];   /* presumably one 32-pixel row per word -- TODO confirm */
+};
+
+
+
+struct brw_pipeline_select   /* Single-word packet consisting only of a header. */
+{
+   struct
+   {
+      unsigned pipeline_select:1;
+      unsigned pad:15;
+      unsigned opcode:16;
+   } header;   /* 1+15+16 = 32 bits */
+};
+
+
+struct brw_pipe_control   /* Four words: header, bits1, data0, data1 (same overall shape as struct brw_3d_control). */
+{
+   struct
+   {
+      unsigned length:8;
+      unsigned notify_enable:1;
+      unsigned pad:2;
+      unsigned instruction_state_cache_flush_enable:1;
+      unsigned write_cache_flush_enable:1;
+      unsigned depth_stall_enable:1;
+      unsigned post_sync_operation:2;
+
+      unsigned opcode:16;
+   } header;   /* 8+1+2+1+1+1+2+16 = 32 bits */
+
+   struct
+   {
+      unsigned pad:2;
+      unsigned dest_addr_type:1;
+      unsigned dest_addr:29;
+   } bits1;   /* 2+1+29 = 32 bits */
+
+   unsigned data0;
+   unsigned data1;
+};
+
+
+struct brw_urb_fence   /* Header with per-unit realloc bits, then two words of three 10-bit fence values each. */
+{
+   struct
+   {
+      unsigned length:8;
+      unsigned vs_realloc:1;
+      unsigned gs_realloc:1;
+      unsigned clp_realloc:1;
+      unsigned sf_realloc:1;
+      unsigned vfe_realloc:1;
+      unsigned cs_realloc:1;
+      unsigned pad:2;
+      unsigned opcode:16;
+   } header;   /* 8+6+2+16 = 32 bits */
+
+   struct
+   {
+      unsigned vs_fence:10;
+      unsigned gs_fence:10;
+      unsigned clp_fence:10;
+      unsigned pad:2;
+   } bits0;   /* 10+10+10+2 = 32 bits */
+
+   struct
+   {
+      unsigned sf_fence:10;
+      unsigned vf_fence:10;
+      unsigned cs_fence:10;
+      unsigned pad:2;
+   } bits1;   /* 10+10+10+2 = 32 bits */
+};
+
+struct brw_constant_buffer_state /* previously brw_command_streamer */
+{
+   struct header header;
+
+   struct
+   {
+      unsigned nr_urb_entries:3;
+      unsigned pad:1;
+      unsigned urb_entry_size:5;
+      unsigned pad0:23;
+   } bits0;   /* 3+1+5+23 = 32 bits */
+};
+
+struct brw_constant_buffer   /* Two words: header (with a valid bit) and bits0 (length + address). */
+{
+   struct
+   {
+      unsigned length:8;
+      unsigned valid:1;
+      unsigned pad:7;
+      unsigned opcode:16;
+   } header;   /* 8+1+7+16 = 32 bits */
+
+   struct
+   {
+      unsigned buffer_length:6;
+      unsigned buffer_address:26;
+   } bits0;   /* 6+26 = 32 bits */
+};
+
+struct brw_state_base_address   /* Header plus five words, each starting with its own modify_enable bit. */
+{
+   struct header header;
+
+   struct
+   {
+      unsigned modify_enable:1;
+      unsigned pad:4;
+      unsigned general_state_address:27;
+   } bits0;   /* 1+4+27 = 32 bits */
+
+   struct
+   {
+      unsigned modify_enable:1;
+      unsigned pad:4;
+      unsigned surface_state_address:27;
+   } bits1;   /* 1+4+27 = 32 bits */
+
+   struct
+   {
+      unsigned modify_enable:1;
+      unsigned pad:4;
+      unsigned indirect_object_state_address:27;
+   } bits2;   /* 1+4+27 = 32 bits */
+
+   struct
+   {
+      unsigned modify_enable:1;
+      unsigned pad:11;
+      unsigned general_state_upper_bound:20;
+   } bits3;   /* 1+11+20 = 32 bits */
+
+   struct
+   {
+      unsigned modify_enable:1;
+      unsigned pad:11;
+      unsigned indirect_object_state_upper_bound:20;
+   } bits4;   /* 1+11+20 = 32 bits */
+};
+
+struct brw_state_prefetch   /* Header plus one word: prefetch count and pointer. */
+{
+   struct header header;
+
+   struct
+   {
+      unsigned prefetch_count:3;
+      unsigned pad:3;
+      unsigned prefetch_pointer:26;
+   } bits0;   /* 3+3+26 = 32 bits */
+};
+
+struct brw_system_instruction_pointer   /* Header plus one word holding a 28-bit pointer. */
+{
+   struct header header;
+
+   struct
+   {
+      unsigned pad:4;
+      unsigned system_instruction_pointer:28;
+   } bits0;   /* 4+28 = 32 bits */
+};
+
+
+
+
+/* State structs for the various fixed function units:
+ */
+
+
+struct thread0   /* First word of the *_unit_state structs in this file: 1+3+2+26 = 32 bits. */
+{
+   unsigned pad0:1;
+   unsigned grf_reg_count:3;
+   unsigned pad1:2;
+   unsigned kernel_start_pointer:26;
+};
+
+struct thread1   /* Second word of the sf/gs/vs/wm unit-state structs (clip declares its own): 32 bits. */
+{
+   unsigned ext_halt_exception_enable:1;
+   unsigned sw_exception_enable:1;
+   unsigned mask_stack_exception_enable:1;
+   unsigned timeout_exception_enable:1;
+   unsigned illegal_op_exception_enable:1;
+   unsigned pad0:3;
+   unsigned depth_coef_urb_read_offset:6;	/* WM only */
+   unsigned pad1:2;
+   unsigned floating_point_mode:1;
+   unsigned thread_priority:1;
+   unsigned binding_table_entry_count:8;
+   unsigned pad3:5;
+   unsigned single_program_flow:1;
+};
+
+struct thread2   /* Third word of the *_unit_state structs: scratch-space fields, 4+6+22 = 32 bits. */
+{
+   unsigned per_thread_scratch_space:4;
+   unsigned pad0:6;
+   unsigned scratch_space_base_pointer:22;
+};
+
+
+struct thread3   /* Fourth word of the *_unit_state structs: 4+6+1+6+1+6+1+6+1 = 32 bits. */
+{
+   unsigned dispatch_grf_start_reg:4;
+   unsigned urb_entry_read_offset:6;
+   unsigned pad0:1;
+   unsigned urb_entry_read_length:6;
+   unsigned pad1:1;
+   unsigned const_urb_entry_read_offset:6;
+   unsigned pad2:1;
+   unsigned const_urb_entry_read_length:6;
+   unsigned pad3:1;
+};
+
+
+
+struct brw_clip_unit_state   /* Seven bitfield words (thread0-4, clip5, clip6) followed by four floats. */
+{
+   struct thread0 thread0;
+   struct   /* unit-specific layout, used here instead of the shared struct thread1 */
+   {
+      unsigned pad0:7;
+      unsigned sw_exception_enable:1;
+      unsigned pad1:3;
+      unsigned mask_stack_exception_enable:1;
+      unsigned pad2:1;
+      unsigned illegal_op_exception_enable:1;
+      unsigned pad3:2;
+      unsigned floating_point_mode:1;
+      unsigned thread_priority:1;
+      unsigned binding_table_entry_count:8;
+      unsigned pad4:5;
+      unsigned single_program_flow:1;
+   } thread1;   /* 32 bits */
+
+   struct thread2 thread2;
+   struct thread3 thread3;
+
+   struct
+   {
+      unsigned pad0:9;
+      unsigned gs_output_stats:1; /* not always */
+      unsigned stats_enable:1;
+      unsigned nr_urb_entries:7;
+      unsigned pad1:1;
+      unsigned urb_entry_allocation_size:5;
+      unsigned pad2:1;
+      unsigned max_threads:1; 	/* may be less */
+      unsigned pad3:6;
+   } thread4;   /* 9+1+1+7+1+5+1+1+6 = 32 bits */
+
+   struct
+   {
+      unsigned pad0:13;
+      unsigned clip_mode:3;
+      unsigned userclip_enable_flags:8;
+      unsigned userclip_must_clip:1;
+      unsigned pad1:1;
+      unsigned guard_band_enable:1;
+      unsigned viewport_z_clip_enable:1;
+      unsigned viewport_xy_clip_enable:1;
+      unsigned vertex_position_space:1;
+      unsigned api_mode:1;
+      unsigned pad2:1;
+   } clip5;   /* 13+3+8+8x1 = 32 bits */
+
+   struct
+   {
+      unsigned pad0:5;
+      unsigned clipper_viewport_state_ptr:27;
+   } clip6;   /* 5+27 = 32 bits */
+
+
+   float viewport_xmin;
+   float viewport_xmax;
+   float viewport_ymin;
+   float viewport_ymax;
+};
+
+
+
+struct brw_cc_unit_state   /* Eight words, cc0..cc7; fields prefixed bf_ sit beside un-prefixed twins. */
+{
+   struct
+   {
+      unsigned pad0:3;
+      unsigned bf_stencil_pass_depth_pass_op:3;
+      unsigned bf_stencil_pass_depth_fail_op:3;
+      unsigned bf_stencil_fail_op:3;
+      unsigned bf_stencil_func:3;
+      unsigned bf_stencil_enable:1;
+      unsigned pad1:2;
+      unsigned stencil_write_enable:1;
+      unsigned stencil_pass_depth_pass_op:3;
+      unsigned stencil_pass_depth_fail_op:3;
+      unsigned stencil_fail_op:3;
+      unsigned stencil_func:3;
+      unsigned stencil_enable:1;
+   } cc0;   /* 32 bits */
+
+
+   struct
+   {
+      unsigned bf_stencil_ref:8;
+      unsigned stencil_write_mask:8;
+      unsigned stencil_test_mask:8;
+      unsigned stencil_ref:8;
+   } cc1;   /* 4x8 = 32 bits */
+
+
+   struct
+   {
+      unsigned logicop_enable:1;
+      unsigned pad0:10;
+      unsigned depth_write_enable:1;
+      unsigned depth_test_function:3;
+      unsigned depth_test:1;
+      unsigned bf_stencil_write_mask:8;
+      unsigned bf_stencil_test_mask:8;
+   } cc2;   /* 1+10+1+3+1+8+8 = 32 bits */
+
+
+   struct
+   {
+      unsigned pad0:8;
+      unsigned alpha_test_func:3;
+      unsigned alpha_test:1;
+      unsigned blend_enable:1;
+      unsigned ia_blend_enable:1;
+      unsigned pad1:1;
+      unsigned alpha_test_format:1;
+      unsigned pad2:16;
+   } cc3;   /* 8+3+1+1+1+1+1+16 = 32 bits */
+
+   struct
+   {
+      unsigned pad0:5;
+      unsigned cc_viewport_state_offset:27;
+   } cc4;   /* 5+27 = 32 bits */
+
+   struct
+   {
+      unsigned pad0:2;
+      unsigned ia_dest_blend_factor:5;
+      unsigned ia_src_blend_factor:5;
+      unsigned ia_blend_function:3;
+      unsigned statistics_enable:1;
+      unsigned logicop_func:4;
+      unsigned pad1:11;
+      unsigned dither_enable:1;
+   } cc5;   /* 2+5+5+3+1+4+11+1 = 32 bits */
+
+   struct
+   {
+      unsigned clamp_post_alpha_blend:1;
+      unsigned clamp_pre_alpha_blend:1;
+      unsigned clamp_range:2;
+      unsigned pad0:11;
+      unsigned y_dither_offset:2;
+      unsigned x_dither_offset:2;
+      unsigned dest_blend_factor:5;
+      unsigned src_blend_factor:5;
+      unsigned blend_function:3;
+   } cc6;   /* 1+1+2+11+2+2+5+5+3 = 32 bits */
+
+   struct {
+      union {
+	 float f;
+	 ubyte ub[4];
+      } alpha_ref;   /* one value, viewable as a float or as four bytes */
+   } cc7;
+};
+
+
+
+struct brw_sf_unit_state   /* Eight words: shared thread0-3, then thread4, sf5, sf6, sf7. */
+{
+   struct thread0 thread0;
+   struct thread1 thread1;
+   struct thread2 thread2;
+   struct thread3 thread3;
+
+   struct
+   {
+      unsigned pad0:10;
+      unsigned stats_enable:1;
+      unsigned nr_urb_entries:7;
+      unsigned pad1:1;
+      unsigned urb_entry_allocation_size:5;
+      unsigned pad2:1;
+      unsigned max_threads:6;
+      unsigned pad3:1;
+   } thread4;   /* 10+1+7+1+5+1+6+1 = 32 bits */
+
+   struct
+   {
+      unsigned front_winding:1;
+      unsigned viewport_transform:1;
+      unsigned pad0:3;
+      unsigned sf_viewport_state_offset:27;
+   } sf5;   /* 1+1+3+27 = 32 bits */
+
+   struct
+   {
+      unsigned pad0:9;
+      unsigned dest_org_vbias:4;
+      unsigned dest_org_hbias:4;
+      unsigned scissor:1;
+      unsigned disable_2x2_trifilter:1;
+      unsigned disable_zero_pix_trifilter:1;
+      unsigned point_rast_rule:2;
+      unsigned line_endcap_aa_region_width:2;
+      unsigned line_width:4;
+      unsigned fast_scissor_disable:1;
+      unsigned cull_mode:2;
+      unsigned aa_enable:1;
+   } sf6;   /* 32 bits */
+
+   struct
+   {
+      unsigned point_size:11;
+      unsigned use_point_size_state:1;
+      unsigned subpixel_precision:1;
+      unsigned sprite_point:1;
+      unsigned pad0:11;
+      unsigned trifan_pv:2;
+      unsigned linestrip_pv:2;
+      unsigned tristrip_pv:2;
+      unsigned line_last_pixel_enable:1;
+   } sf7;   /* 11+1+1+1+11+2+2+2+1 = 32 bits */
+
+};
+
+
+struct brw_gs_unit_state   /* Seven words: shared thread0-3, then thread4, gs5, gs6. */
+{
+   struct thread0 thread0;
+   struct thread1 thread1;
+   struct thread2 thread2;
+   struct thread3 thread3;
+
+   struct
+   {
+      unsigned pad0:10;
+      unsigned stats_enable:1;
+      unsigned nr_urb_entries:7;
+      unsigned pad1:1;
+      unsigned urb_entry_allocation_size:5;
+      unsigned pad2:1;
+      unsigned max_threads:1;   /* only 1 bit wide here (sf uses 6, vs uses 4) */
+      unsigned pad3:6;
+   } thread4;   /* 10+1+7+1+5+1+1+6 = 32 bits */
+
+   struct
+   {
+      unsigned sampler_count:3;
+      unsigned pad0:2;
+      unsigned sampler_state_pointer:27;
+   } gs5;   /* 3+2+27 = 32 bits */
+
+
+   struct
+   {
+      unsigned max_vp_index:4;
+      unsigned pad0:26;
+      unsigned reorder_enable:1;
+      unsigned pad1:1;
+   } gs6;   /* 4+26+1+1 = 32 bits */
+};
+
+
+struct brw_vs_unit_state   /* Seven words: shared thread0-3, then thread4, vs5, vs6. */
+{
+   struct thread0 thread0;
+   struct thread1 thread1;
+   struct thread2 thread2;
+   struct thread3 thread3;
+
+   struct
+   {
+      unsigned pad0:10;
+      unsigned stats_enable:1;
+      unsigned nr_urb_entries:7;
+      unsigned pad1:1;
+      unsigned urb_entry_allocation_size:5;
+      unsigned pad2:1;
+      unsigned max_threads:4;
+      unsigned pad3:3;
+   } thread4;   /* 10+1+7+1+5+1+4+3 = 32 bits */
+
+   struct
+   {
+      unsigned sampler_count:3;
+      unsigned pad0:2;
+      unsigned sampler_state_pointer:27;
+   } vs5;   /* 3+2+27 = 32 bits */
+
+   struct
+   {
+      unsigned vs_enable:1;
+      unsigned vert_cache_disable:1;
+      unsigned pad0:30;
+   } vs6;   /* 1+1+30 = 32 bits */
+};
+
+
+struct brw_wm_unit_state   /* Shared thread0-3, then wm4, wm5 and two floats; no thread4 (max_threads is in wm5). */
+{
+   struct thread0 thread0;
+   struct thread1 thread1;
+   struct thread2 thread2;
+   struct thread3 thread3;
+
+   struct {
+      unsigned stats_enable:1;
+      unsigned pad0:1;
+      unsigned sampler_count:3;
+      unsigned sampler_state_pointer:27;
+   } wm4;   /* 1+1+3+27 = 32 bits */
+
+   struct
+   {
+      unsigned enable_8_pix:1;
+      unsigned enable_16_pix:1;
+      unsigned enable_32_pix:1;
+      unsigned pad0:7;
+      unsigned legacy_global_depth_bias:1;
+      unsigned line_stipple:1;
+      unsigned depth_offset:1;
+      unsigned polygon_stipple:1;
+      unsigned line_aa_region_width:2;
+      unsigned line_endcap_aa_region_width:2;
+      unsigned early_depth_test:1;
+      unsigned thread_dispatch_enable:1;
+      unsigned program_uses_depth:1;
+      unsigned program_computes_depth:1;
+      unsigned program_uses_killpixel:1;
+      unsigned legacy_line_rast: 1;
+      unsigned pad1:1;
+      unsigned max_threads:6;
+      unsigned pad2:1;
+   } wm5;   /* 32 bits */
+
+   float global_depth_offset_constant;
+   float global_depth_offset_scale;
+};
+
+struct brw_sampler_default_color {   /* Four floats; component order is not stated in this file. */
+   float color[4];
+};
+
+struct brw_sampler_state   /* Four bitfield words: ss0..ss3. */
+{
+
+   struct
+   {
+      unsigned shadow_function:3;
+      unsigned lod_bias:11;
+      unsigned min_filter:3;
+      unsigned mag_filter:3;
+      unsigned mip_filter:2;
+      unsigned base_level:5;
+      unsigned pad:1;
+      unsigned lod_preclamp:1;
+      unsigned default_color_mode:1;
+      unsigned pad0:1;
+      unsigned disable:1;
+   } ss0;   /* 3+11+3+3+2+5+1+1+1+1+1 = 32 bits */
+
+   struct
+   {
+      unsigned r_wrap_mode:3;
+      unsigned t_wrap_mode:3;
+      unsigned s_wrap_mode:3;
+      unsigned pad:3;
+      unsigned max_lod:10;
+      unsigned min_lod:10;
+   } ss1;   /* 3+3+3+3+10+10 = 32 bits */
+
+
+   struct
+   {
+      unsigned pad:5;
+      unsigned default_color_pointer:27;
+   } ss2;   /* 5+27 = 32 bits */
+
+   struct
+   {
+      unsigned pad:19;
+      unsigned max_aniso:3;
+      unsigned chroma_key_mode:1;
+      unsigned chroma_key_index:2;
+      unsigned chroma_key_enable:1;
+      unsigned monochrome_filter_width:3;
+      unsigned monochrome_filter_height:3;
+   } ss3;   /* 19+3+1+2+1+3+3 = 32 bits */
+};
+
+
+struct brw_clipper_viewport   /* Four floats: x range then y range. */
+{
+   float xmin;
+   float xmax;
+   float ymin;
+   float ymax;
+};
+
+struct brw_cc_viewport   /* Two floats: depth range minimum and maximum. */
+{
+   float min_depth;
+   float max_depth;
+};
+
+struct brw_sf_viewport   /* Six viewport floats followed by a scissor rectangle of four shorts. */
+{
+   struct {
+      float m00;   /* names look like matrix elements; presumably scale (m00,m11,m22) and */
+      float m11;   /* translate (m30,m31,m32) terms of the viewport transform -- TODO confirm */
+      float m22;
+      float m30;
+      float m31;
+      float m32;
+   } viewport;
+
+   struct {
+      short xmin;
+      short ymin;
+      short xmax;
+      short ymax;
+   } scissor;
+};
+
+/* Documented in the subsystem/shared-functions/sampler chapter...
+ */
+struct brw_surface_state   /* Five words: ss0..ss4. */
+{
+   struct {
+      unsigned cube_pos_z:1;
+      unsigned cube_neg_z:1;
+      unsigned cube_pos_y:1;
+      unsigned cube_neg_y:1;
+      unsigned cube_pos_x:1;
+      unsigned cube_neg_x:1;
+      unsigned pad:4;
+      unsigned mipmap_layout_mode:1;
+      unsigned vert_line_stride_ofs:1;
+      unsigned vert_line_stride:1;
+      unsigned color_blend:1;
+      unsigned writedisable_blue:1;
+      unsigned writedisable_green:1;
+      unsigned writedisable_red:1;
+      unsigned writedisable_alpha:1;
+      unsigned surface_format:9;
+      unsigned data_return_format:1;
+      unsigned pad0:1;
+      unsigned surface_type:3;
+   } ss0;   /* 6+4+8+9+1+1+3 = 32 bits */
+
+   struct {
+      unsigned base_addr;   /* full-width word, not a bitfield */
+   } ss1;
+
+   struct {
+      unsigned pad:2;
+      unsigned mip_count:4;
+      unsigned width:13;
+      unsigned height:13;
+   } ss2;   /* 2+4+13+13 = 32 bits */
+
+   struct {
+      unsigned tile_walk:1;
+      unsigned tiled_surface:1;
+      unsigned pad:1;
+      unsigned pitch:18;
+      unsigned depth:11;
+   } ss3;   /* 1+1+1+18+11 = 32 bits */
+
+   struct {
+      unsigned pad:19;
+      unsigned min_array_elt:9;
+      unsigned min_lod:4;
+   } ss4;   /* 19+9+4 = 32 bits */
+};
+
+
+
+struct brw_vertex_buffer_state   /* vb0 bitfield word plus three unsigned words (last one kept by "#if 1"). */
+{
+   struct {
+      unsigned pitch:11;
+      unsigned pad:15;
+      unsigned access_type:1;
+      unsigned vb_index:5;
+   } vb0;   /* 11+15+1+5 = 32 bits */
+
+   unsigned start_addr;
+   unsigned max_index;
+#if 1
+   unsigned instance_data_step_rate; /* not included for sequential/random vertices? */
+#endif
+};
+
+#define BRW_VBP_MAX 17   /* number of entries in brw_vb_array_state.vb[] */
+
+struct brw_vb_array_state {   /* Header followed by an array of BRW_VBP_MAX vertex-buffer entries. */
+   struct header header;
+   struct brw_vertex_buffer_state vb[BRW_VBP_MAX];
+};
+
+
+struct brw_vertex_element_state   /* Two bitfield words: ve0 (source) and ve1 (destination + components). */
+{
+   struct
+   {
+      unsigned src_offset:11;
+      unsigned pad:5;
+      unsigned src_format:9;
+      unsigned pad0:1;
+      unsigned valid:1;
+      unsigned vertex_buffer_index:5;
+   } ve0;   /* 11+5+9+1+1+5 = 32 bits */
+
+   struct
+   {
+      unsigned dst_offset:8;
+      unsigned pad:8;
+      unsigned vfcomponent3:4;   /* components are declared 3 down to 0 */
+      unsigned vfcomponent2:4;
+      unsigned vfcomponent1:4;
+      unsigned vfcomponent0:4;
+   } ve1;   /* 8+8+4x4 = 32 bits */
+};
+
+#define BRW_VEP_MAX 18   /* number of entries in brw_vertex_element_packet.ve[] */
+
+struct brw_vertex_element_packet {   /* Header followed by an array of BRW_VEP_MAX vertex elements. */
+   struct header header;
+   struct brw_vertex_element_state ve[BRW_VEP_MAX]; /* note: less than _TNL_ATTRIB_MAX */
+};
+
+
+struct brw_urb_immediate {   /* One 32-bit word; also embedded as the "urb" member of brw_instruction.bits3. */
+   unsigned opcode:4;
+   unsigned offset:6;
+   unsigned swizzle_control:2;
+   unsigned pad:1;
+   unsigned allocate:1;
+   unsigned used:1;
+   unsigned complete:1;
+   unsigned response_length:4;   /* the last five fields repeat in the other bits3 message variants */
+   unsigned msg_length:4;
+   unsigned msg_target:4;
+   unsigned pad1:3;
+   unsigned end_of_thread:1;
+};
+
+/* Instruction format for the execution units:
+ */
+
+struct brw_instruction   /* Four 32-bit words: header, then unions bits1, bits2, bits3 with alternative views. */
+{
+   struct
+   {
+      unsigned opcode:7;
+      unsigned pad:1;
+      unsigned access_mode:1;
+      unsigned mask_control:1;
+      unsigned dependency_control:2;
+      unsigned compression_control:2;
+      unsigned thread_control:2;
+      unsigned predicate_control:4;
+      unsigned predicate_inverse:1;
+      unsigned execution_size:3;
+      unsigned destreg__conditonalmod:4; /* destreg - send, conditionalmod - others */
+      unsigned pad0:2;
+      unsigned debug_control:1;
+      unsigned saturate:1;
+   } header;   /* 32 bits */
+
+   union {   /* destination word; da/ia presumably mean direct/indirect addressing -- TODO confirm */
+      struct
+      {
+	 unsigned dest_reg_file:2;
+	 unsigned dest_reg_type:3;
+	 unsigned src0_reg_file:2;
+	 unsigned src0_reg_type:3;
+	 unsigned src1_reg_file:2;
+	 unsigned src1_reg_type:3;
+	 unsigned pad:1;
+	 unsigned dest_subreg_nr:5;
+	 unsigned dest_reg_nr:8;
+	 unsigned dest_horiz_stride:2;
+	 unsigned dest_address_mode:1;
+      } da1;
+
+      struct
+      {
+	 unsigned dest_reg_file:2;
+	 unsigned dest_reg_type:3;
+	 unsigned src0_reg_file:2;
+	 unsigned src0_reg_type:3;
+	 unsigned pad:6;
+	 int dest_indirect_offset:10;	/* offset against the deref'd address reg */
+	 unsigned dest_subreg_nr:3; /* subnr for the address reg a0.x */
+	 unsigned dest_horiz_stride:2;
+	 unsigned dest_address_mode:1;
+      } ia1;
+
+      struct
+      {
+	 unsigned dest_reg_file:2;
+	 unsigned dest_reg_type:3;
+	 unsigned src0_reg_file:2;
+	 unsigned src0_reg_type:3;
+	 unsigned src1_reg_file:2;
+	 unsigned src1_reg_type:3;
+	 unsigned pad0:1;
+	 unsigned dest_writemask:4;
+	 unsigned dest_subreg_nr:1;
+	 unsigned dest_reg_nr:8;
+	 unsigned pad1:2;
+	 unsigned dest_address_mode:1;
+      } da16;
+
+      struct
+      {
+	 unsigned dest_reg_file:2;
+	 unsigned dest_reg_type:3;
+	 unsigned src0_reg_file:2;
+	 unsigned src0_reg_type:3;
+	 unsigned pad0:6;
+	 unsigned dest_writemask:4;
+	 int dest_indirect_offset:6;
+	 unsigned dest_subreg_nr:3;
+	 unsigned pad1:2;
+	 unsigned dest_address_mode:1;
+      } ia16;
+   } bits1;
+
+
+   union {   /* source-0 word; every variant also carries flag_reg_nr */
+      struct
+      {
+	 unsigned src0_subreg_nr:5;
+	 unsigned src0_reg_nr:8;
+	 unsigned src0_abs:1;
+	 unsigned src0_negate:1;
+	 unsigned src0_address_mode:1;
+	 unsigned src0_horiz_stride:2;
+	 unsigned src0_width:3;
+	 unsigned src0_vert_stride:4;
+	 unsigned flag_reg_nr:1;
+	 unsigned pad:6;
+      } da1;
+
+      struct
+      {
+	 int src0_indirect_offset:10;
+	 unsigned src0_subreg_nr:3;
+	 unsigned src0_abs:1;
+	 unsigned src0_negate:1;
+	 unsigned src0_address_mode:1;
+	 unsigned src0_horiz_stride:2;
+	 unsigned src0_width:3;
+	 unsigned src0_vert_stride:4;
+	 unsigned flag_reg_nr:1;
+	 unsigned pad:6;
+      } ia1;
+
+      struct
+      {
+	 unsigned src0_swz_x:2;
+	 unsigned src0_swz_y:2;
+	 unsigned src0_subreg_nr:1;
+	 unsigned src0_reg_nr:8;
+	 unsigned src0_abs:1;
+	 unsigned src0_negate:1;
+	 unsigned src0_address_mode:1;
+	 unsigned src0_swz_z:2;
+	 unsigned src0_swz_w:2;
+	 unsigned pad0:1;
+	 unsigned src0_vert_stride:4;
+	 unsigned flag_reg_nr:1;
+	 unsigned pad1:6;
+      } da16;
+
+      struct
+      {
+	 unsigned src0_swz_x:2;
+	 unsigned src0_swz_y:2;
+	 int src0_indirect_offset:6;
+	 unsigned src0_subreg_nr:3;
+	 unsigned src0_abs:1;
+	 unsigned src0_negate:1;
+	 unsigned src0_address_mode:1;
+	 unsigned src0_swz_z:2;
+	 unsigned src0_swz_w:2;
+	 unsigned pad0:1;
+	 unsigned src0_vert_stride:4;
+	 unsigned flag_reg_nr:1;
+	 unsigned pad1:6;
+      } ia16;
+
+   } bits2;
+
+   union   /* source-1 word, or a jump/message descriptor, or a raw signed/unsigned value */
+   {
+      struct
+      {
+	 unsigned src1_subreg_nr:5;
+	 unsigned src1_reg_nr:8;
+	 unsigned src1_abs:1;
+	 unsigned src1_negate:1;
+	 unsigned pad:1;
+	 unsigned src1_horiz_stride:2;
+	 unsigned src1_width:3;
+	 unsigned src1_vert_stride:4;
+	 unsigned pad0:7;
+      } da1;
+
+      struct
+      {
+	 unsigned src1_swz_x:2;
+	 unsigned src1_swz_y:2;
+	 unsigned src1_subreg_nr:1;
+	 unsigned src1_reg_nr:8;
+	 unsigned src1_abs:1;
+	 unsigned src1_negate:1;
+	 unsigned pad0:1;
+	 unsigned src1_swz_z:2;
+	 unsigned src1_swz_w:2;
+	 unsigned pad1:1;
+	 unsigned src1_vert_stride:4;
+	 unsigned pad2:7;
+      } da16;
+
+      struct
+      {
+	 int  src1_indirect_offset:10;
+	 unsigned src1_subreg_nr:3;
+	 unsigned src1_abs:1;
+	 unsigned src1_negate:1;
+	 unsigned pad0:1;
+	 unsigned src1_horiz_stride:2;
+	 unsigned src1_width:3;
+	 unsigned src1_vert_stride:4;
+	 unsigned flag_reg_nr:1;
+	 unsigned pad1:6;
+      } ia1;
+
+      struct
+      {
+	 unsigned src1_swz_x:2;
+	 unsigned src1_swz_y:2;
+	 int  src1_indirect_offset:6;
+	 unsigned src1_subreg_nr:3;
+	 unsigned src1_abs:1;
+	 unsigned src1_negate:1;
+	 unsigned pad0:1;
+	 unsigned src1_swz_z:2;
+	 unsigned src1_swz_w:2;
+	 unsigned pad1:1;
+	 unsigned src1_vert_stride:4;
+	 unsigned flag_reg_nr:1;
+	 unsigned pad2:6;
+      } ia16;
+
+
+      struct
+      {
+	 int  jump_count:16;	/* note: signed */
+	 unsigned  pop_count:4;
+	 unsigned  pad0:12;
+      } if_else;
+
+      struct {   /* message variants below all end with response_length..end_of_thread */
+	 unsigned function:4;
+	 unsigned int_type:1;
+	 unsigned precision:1;
+	 unsigned saturate:1;
+	 unsigned data_type:1;
+	 unsigned pad0:8;
+	 unsigned response_length:4;
+	 unsigned msg_length:4;
+	 unsigned msg_target:4;
+	 unsigned pad1:3;
+	 unsigned end_of_thread:1;
+      } math;
+
+      struct {
+	 unsigned binding_table_index:8;
+	 unsigned sampler:4;
+	 unsigned return_format:2;
+	 unsigned msg_type:2;
+	 unsigned response_length:4;
+	 unsigned msg_length:4;
+	 unsigned msg_target:4;
+	 unsigned pad1:3;
+	 unsigned end_of_thread:1;
+      } sampler;
+
+      struct brw_urb_immediate urb;
+
+      struct {
+	 unsigned binding_table_index:8;
+	 unsigned msg_control:4;
+	 unsigned msg_type:2;
+	 unsigned target_cache:2;
+	 unsigned response_length:4;
+	 unsigned msg_length:4;
+	 unsigned msg_target:4;
+	 unsigned pad1:3;
+	 unsigned end_of_thread:1;
+      } dp_read;
+
+      struct {
+	 unsigned binding_table_index:8;
+	 unsigned msg_control:3;
+	 unsigned pixel_scoreboard_clear:1;
+	 unsigned msg_type:3;
+	 unsigned send_commit_msg:1;
+	 unsigned response_length:4;
+	 unsigned msg_length:4;
+	 unsigned msg_target:4;
+	 unsigned pad1:3;
+	 unsigned end_of_thread:1;
+      } dp_write;
+
+      struct {   /* only the fields common to every message variant; low 16 bits opaque */
+	 unsigned pad:16;
+	 unsigned response_length:4;
+	 unsigned msg_length:4;
+	 unsigned msg_target:4;
+	 unsigned pad1:3;
+	 unsigned end_of_thread:1;
+      } generic;
+
+      int d;
+      unsigned ud;
+   } bits3;
+};
+
+
+#endif
diff --git a/src/gallium/drivers/i965simple/brw_surface.c b/src/gallium/drivers/i965simple/brw_surface.c
new file mode 100644
index 0000000000..724a69b2ee
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_surface.c
@@ -0,0 +1,126 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "brw_blit.h"
+#include "brw_context.h"
+#include "brw_state.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+#include "pipe/internal/p_winsys_screen.h"
+#include "util/u_tile.h"
+#include "util/u_rect.h"
+
+
+
+/* Copy a width x height rect from src (srcx, srcy) to dst (dstx, dsty).
+ * Assumes all values are within bounds -- no checking at this level -
+ * do it higher up if required.  dst and src must differ (asserted). */
+static void
+brw_surface_copy(struct pipe_context *pipe,
+                 struct pipe_surface *dst,
+                 unsigned dstx, unsigned dsty,
+                 struct pipe_surface *src,
+                 unsigned srcx, unsigned srcy, unsigned width, unsigned height)
+{
+   assert( dst != src );
+   assert( dst->block.size == src->block.size );
+   assert( dst->block.width == src->block.width );   /* width vs. width, not height */
+   assert( dst->block.height == src->block.height );
+
+   if (0) {   /* disabled CPU fallback: map both surfaces and copy in software */
+      void *dst_map = pipe->screen->surface_map( pipe->screen,
+                                                 dst,
+                                                 PIPE_BUFFER_USAGE_CPU_WRITE );
+      const void *src_map = pipe->screen->surface_map( pipe->screen,
+                                                       src,
+                                                       PIPE_BUFFER_USAGE_CPU_READ );
+      util_copy_rect(dst_map,
+                     &dst->block,
+                     dst->stride,
+                     dstx, dsty,
+                     width, height,
+                     src_map,
+                     src->stride,
+                     srcx, srcy);
+
+      pipe->screen->surface_unmap(pipe->screen, src);
+      pipe->screen->surface_unmap(pipe->screen, dst);
+   }
+   else {
+      struct brw_texture *dst_tex = (struct brw_texture *)dst->texture;
+      struct brw_texture *src_tex = (struct brw_texture *)src->texture;
+      assert(dst->block.width == 1);
+      assert(dst->block.height == 1);
+      /* Pitch = stride / block size.  Divide first and narrow to short
+       * afterwards, so a large byte stride is not truncated beforehand. */
+      brw_copy_blit(brw_context(pipe),
+                    FALSE,
+                    dst->block.size,
+                    (short) (src->stride/src->block.size), src_tex->buffer, src->offset, FALSE,
+                    (short) (dst->stride/dst->block.size), dst_tex->buffer, dst->offset, FALSE,
+                    (short) srcx, (short) srcy, (short) dstx, (short) dsty,
+                    (short) width, (short) height, PIPE_LOGICOP_COPY);
+   }
+}
+
+
+/* Fill a width x height rect of dst at (dstx, dsty) with value via brw_fill_blit(). */
+static void
+brw_surface_fill(struct pipe_context *pipe,
+                 struct pipe_surface *dst,
+                 unsigned dstx, unsigned dsty,
+                 unsigned width, unsigned height, unsigned value)
+{
+   if (0) {   /* disabled CPU fallback: map the surface and fill in software */
+      void *dst_map = pipe->screen->surface_map( pipe->screen,
+                                                 dst,
+                                                 PIPE_BUFFER_USAGE_CPU_WRITE );
+      util_fill_rect(dst_map, &dst->block, dst->stride, dstx, dsty, width, height, value);
+      pipe->screen->surface_unmap(pipe->screen, dst);
+   }
+   else {
+      struct brw_texture *tex = (struct brw_texture *)dst->texture;
+      assert(dst->block.width == 1);
+      assert(dst->block.height == 1);
+      /* Divide before narrowing to short so a large stride is not truncated first. */
+      brw_fill_blit(brw_context(pipe),
+                    dst->block.size,
+                    (short) (dst->stride/dst->block.size),
+                    tex->buffer, dst->offset, FALSE,
+                    (short) dstx, (short) dsty,
+                    (short) width, (short) height,
+                    value);
+   }
+}
+
+
+/* Install this file's surface_copy / surface_fill implementations in brw->pipe. */
+void brw_init_surface_functions(struct brw_context *brw)
+{
+   brw->pipe.surface_fill = brw_surface_fill;
+   brw->pipe.surface_copy = brw_surface_copy;
+}
diff --git a/src/gallium/drivers/i965simple/brw_tex_layout.c b/src/gallium/drivers/i965simple/brw_tex_layout.c
new file mode 100644
index 0000000000..998ffaeac4
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_tex_layout.c
@@ -0,0 +1,380 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+/* Code to lay out images in a mipmap tree for i965.
+ */
+
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+#include "pipe/internal/p_winsys_screen.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "brw_context.h"
+#include "brw_tex_layout.h"
+
+
+#define FILE_DEBUG_FLAG DEBUG_TEXTURE
+
+#if 0 /* compiled out; the body still references GL_COMPRESSED_* enums */
+unsigned intel_compressed_alignment(unsigned internalFormat)
+{
+    unsigned alignment = 4;
+    /* Default alignment is 4; only the two FXT1 formats below use 8. */
+    switch (internalFormat) {
+    case GL_COMPRESSED_RGB_FXT1_3DFX:
+    case GL_COMPRESSED_RGBA_FXT1_3DFX:
+        alignment = 8;
+        break;
+
+    default:
+        break;
+    }
+
+    return alignment;
+}
+#endif
+
+
+/* Record where image `img` (a cube face or 3D slice) of mip `level` sits,
+ * as a byte offset from the start of that level, given block coords (x, y). */
+static void intel_miptree_set_image_offset(struct brw_texture *tex,
+                                           unsigned level, unsigned img,
+                                           unsigned x, unsigned y)
+{
+   const unsigned bytes_per_block = tex->base.block.size;
+   /* The first image of the base level must sit at the origin. */
+   assert((img != 0 || level != 0) || (x == 0 && y == 0));
+   assert(img < tex->nr_images[level]);
+   tex->image_offset[level][img] = x * bytes_per_block + y * tex->stride;
+}
+
+/* Record the dimensions and placement of one mipmap level and (re)allocate
+ * its per-image offset table.  (x, y) is the level's origin in blocks; the
+ * table gets nr_images slots, of which only slot 0 is explicitly written here.
+ */
+static void intel_miptree_set_level_info(struct brw_texture *tex,
+                                         unsigned level,
+                                         unsigned nr_images,
+                                         unsigned x, unsigned y,
+                                         unsigned w, unsigned h, unsigned d)
+{
+   struct pipe_texture *base = &tex->base;
+   unsigned *offsets;
+
+   assert(level < PIPE_MAX_TEXTURE_LEVELS);
+
+   /* Size of this level in texels and in format blocks. */
+   base->width[level] = w;
+   base->height[level] = h;
+   base->depth[level] = d;
+   base->nblocksx[level] = pf_get_nblocksx(&base->block, w);
+   base->nblocksy[level] = pf_get_nblocksy(&base->block, h);
+
+   /* Byte offset of the level's origin, from its block coordinates. */
+   tex->level_offset[level] = x * base->block.size + y * tex->stride;
+   tex->nr_images[level] = nr_images;
+
+   /* Drop any table left over from an earlier layout of this level. */
+   if (tex->image_offset[level] != NULL) {
+      FREE(tex->image_offset[level]);
+      tex->image_offset[level] = NULL;
+   }
+
+   assert(nr_images);
+   assert(!tex->image_offset[level]);
+   offsets = (unsigned *) MALLOC(nr_images * sizeof(unsigned));
+   tex->image_offset[level] = offsets;
+   offsets[0] = 0;
+}
+/* Lay out all mip levels of a texture in one 2D area: level 0 at the origin, level 1 below it, levels 2 and up in a column to the right of level 1. */
+static void i945_miptree_layout_2d(struct brw_texture *tex)
+{
+   struct pipe_texture *pt = &tex->base;
+   const int align_x = 2, align_y = 4;   /* placement alignment, in blocks */
+   unsigned level;
+   unsigned x = 0;   /* origin of the current level, in blocks */
+   unsigned y = 0;
+   unsigned width = pt->width[0];
+   unsigned height = pt->height[0];
+   unsigned nblocksx = pt->nblocksx[0];
+   unsigned nblocksy = pt->nblocksy[0];
+
+   tex->stride = align(pt->nblocksx[0] * pt->block.size, 4);
+
+   /* May need to widen the pitch to accommodate the row that holds
+    * level 1 with level 2 placed to its right.  This occurs when the
+    * alignment constraints of mipmap placement push the right edge of
+    * that row out past the width of level 0.
+    */
+   if (pt->last_level > 0) {
+      unsigned mip1_nblocksx 
+	 = align(pf_get_nblocksx(&pt->block, minify(width)), align_x)
+         + pf_get_nblocksx(&pt->block, minify(minify(width)));
+
+      if (mip1_nblocksx > nblocksx)
+	 tex->stride = mip1_nblocksx * pt->block.size;
+   }
+
+   /* Round the pitch up with align(..., 64), i.e. to 64-byte units rather
+    * than single dwords. */
+   tex->stride = align(tex->stride, 64);
+   tex->total_nblocksy = 0;
+
+   for (level = 0; level <= pt->last_level; level++) {
+      intel_miptree_set_level_info(tex, level, 1, x, y, width,
+				   height, 1);
+
+      nblocksy = align(nblocksy, align_y);
+
+      /* Because the images are packed better, the final offset
+       * might not be the maximal one:
+       */
+      tex->total_nblocksy = MAX2(tex->total_nblocksy, y + nblocksy);
+
+      /* Layout_below: step right after second mipmap level (level 1);
+       * every other level steps down by its aligned height. */
+      if (level == 1) {
+	 x += align(nblocksx, align_x);
+      }
+      else {
+	 y += nblocksy;
+      }
+      /* Dimensions of the next, smaller level. */
+      width  = minify(width);
+      height = minify(height);
+      nblocksx = pf_get_nblocksx(&pt->block, width);
+      nblocksy = pf_get_nblocksy(&pt->block, height);
+   }
+}
+/* Compute tex->stride, tex->total_nblocksy and all level/image offsets for a texture; cube and 3D targets are packed here, everything else goes to i945_miptree_layout_2d().  Always returns TRUE. */
+static boolean brw_miptree_layout(struct brw_texture *tex)
+{
+   struct pipe_texture *pt = &tex->base;
+   /* XXX: these vary depending on image format:
+    */
+/*    int align_w = 4; */
+
+   switch (pt->target) {
+   case PIPE_TEXTURE_CUBE:
+   case PIPE_TEXTURE_3D: {
+      unsigned width  = pt->width[0];
+      unsigned height = pt->height[0];
+      unsigned depth = pt->depth[0];
+      unsigned nblocksx = pt->nblocksx[0];   /* NOTE(review): nblocksx/nblocksy are updated in the loop but never read in this branch */
+      unsigned nblocksy = pt->nblocksy[0];
+      unsigned pack_x_pitch, pack_x_nr;   /* horizontal step between images, and images per row */
+      unsigned pack_y_pitch;              /* vertical step between rows of images */
+      unsigned level;
+      unsigned align_h = 2;
+      unsigned align_w = 4;
+
+      tex->total_nblocksy = 0;
+      /* NOTE(review): unlike the 2D path, stride is derived from the block count without multiplying by pt->block.size, yet it is divided by block.size just below and used as bytes elsewhere - confirm. */
+      tex->stride = align(pt->nblocksx[0], 4);
+      pack_y_pitch = align(pt->nblocksy[0], align_h);
+
+      pack_x_pitch = tex->stride / pt->block.size;
+      pack_x_nr = 1;
+
+      for (level = 0; level <= pt->last_level; level++) {
+	 unsigned nr_images = pt->target == PIPE_TEXTURE_3D ? depth : 6;
+	 int x = 0;
+	 int y = 0;
+	 uint q, j;
+	 /* The level starts below everything laid out so far. */
+	 intel_miptree_set_level_info(tex, level, nr_images,
+				      0, tex->total_nblocksy,
+				      width, height, depth);
+	 /* Pack the images left to right, pack_x_nr per row, rows pack_y_pitch apart. */
+	 for (q = 0; q < nr_images;) {
+	    for (j = 0; j < pack_x_nr && q < nr_images; j++, q++) {
+	       intel_miptree_set_image_offset(tex, level, q, x, y);
+	       x += pack_x_pitch;
+	    }
+
+	    x = 0;
+	    y += pack_y_pitch;
+	 }
+
+	 /* Account for this level's rows, then move on to the next level's dimensions. */
+	 tex->total_nblocksy += y;
+	 width  = minify(width);
+	 height = minify(height);
+	 depth  = minify(depth);
+         nblocksx = pf_get_nblocksx(&pt->block, width);
+         nblocksy = pf_get_nblocksy(&pt->block, height);
+         /* Shrink the packing pitches for the next level. */
+         if (pf_is_compressed(pt->format)) {
+            pack_y_pitch = (height + 3) / 4;
+
+            if (pack_x_pitch > align(width, align_w)) {
+               pack_x_pitch = align(width, align_w);
+               pack_x_nr <<= 1;
+            }
+         } else {
+            if (pack_x_pitch > 4) {
+               pack_x_pitch >>= 1;
+               pack_x_nr <<= 1;
+               assert(pack_x_pitch * pack_x_nr * pt->block.size <= tex->stride);
+            }
+
+            if (pack_y_pitch > 2) {
+               pack_y_pitch >>= 1;
+               pack_y_pitch = align(pack_y_pitch, align_h);
+            }
+         }
+
+      }
+      break;
+   }
+
+   default:
+      i945_miptree_layout_2d(tex);
+      break;
+   }
+#if 0
+   PRINT("%s: %dx%dx%d - sz 0x%x\n", __FUNCTION__,
+       pt->pitch,
+       pt->total_nblocksy,
+       pt->block.size,
+       pt->stride * pt->total_nblocksy );
+#endif
+
+   return TRUE;
+}
+
+
+/* Create a texture from templat: lay out its mip tree and allocate the backing buffer; NULL on failure. */
+static struct pipe_texture *
+brw_texture_create_screen(struct pipe_screen *screen,
+                          const struct pipe_texture *templat)
+{
+   struct brw_texture *tex = CALLOC_STRUCT(brw_texture);
+   unsigned i;
+
+   if (!tex)
+      return NULL;   /* never form &tex->base from a NULL tex */
+   tex->base = *templat;
+   pipe_reference_init(&tex->base.reference, 1);
+   tex->base.nblocksx[0] = pf_get_nblocksx(&tex->base.block, tex->base.width[0]);
+   tex->base.nblocksy[0] = pf_get_nblocksy(&tex->base.block, tex->base.height[0]);
+   if (brw_miptree_layout(tex))
+      tex->buffer = screen->buffer_create(screen, 64, PIPE_BUFFER_USAGE_PIXEL,
+                                          tex->stride * tex->total_nblocksy);
+   if (tex->buffer)
+      return &tex->base;
+
+   /* Failure: also release the per-level offset tables the layout allocated. */
+   for (i = 0; i < PIPE_MAX_TEXTURE_LEVELS; i++)
+      if (tex->image_offset[i])
+         FREE(tex->image_offset[i]);
+   FREE(tex);
+   return NULL;
+}
+
+
+/* Destroy a texture: drop the buffer reference, then release the per-level
+ * image offset tables and the texture itself with FREE(), the counterpart of
+ * the MALLOC()/CALLOC_STRUCT() calls that allocated them. */
+static void
+brw_texture_destroy_screen(struct pipe_texture *pt)
+{
+   struct brw_texture *tex = (struct brw_texture *)pt;
+   uint i;
+
+   pipe_buffer_reference(&tex->buffer, NULL);
+
+   for (i = 0; i < PIPE_MAX_TEXTURE_LEVELS; i++) {
+      if (tex->image_offset[i])
+         FREE(tex->image_offset[i]);
+   }
+
+   FREE(tex);
+}
+
+
+/* Create a pipe_surface for one image of a texture: mip `level` plus, for
+ * cube maps / 3D textures, the given face / depth slice.  The surface holds
+ * a reference on the texture.  Returns NULL if the allocation fails. */
+static struct pipe_surface *
+brw_get_tex_surface_screen(struct pipe_screen *screen,
+                           struct pipe_texture *pt,
+                           unsigned face, unsigned level, unsigned zslice)
+{
+   struct brw_texture *tex = (struct brw_texture *)pt;
+   unsigned byte_offset = tex->level_offset[level];
+   struct pipe_surface *surf;
+
+   /* Cube faces and 3D slices sit at per-image offsets within the level. */
+   if (pt->target == PIPE_TEXTURE_CUBE)
+      byte_offset += tex->image_offset[level][face];
+   else if (pt->target == PIPE_TEXTURE_3D)
+      byte_offset += tex->image_offset[level][zslice];
+   else {
+      assert(face == 0);
+      assert(zslice == 0);
+   }
+
+   surf = CALLOC_STRUCT(pipe_surface);
+   if (!surf)
+      return NULL;
+   pipe_reference_init(&surf->reference, 1);
+   pipe_texture_reference(&surf->texture, pt);
+   surf->format = pt->format;
+   surf->block = pt->block;
+   surf->width = pt->width[level];
+   surf->height = pt->height[level];
+   surf->nblocksx = pt->nblocksx[level];
+   surf->nblocksy = pt->nblocksy[level];
+   surf->stride = tex->stride;
+   surf->offset = byte_offset;
+   return surf;
+}
+
+
+void
+brw_init_texture_functions(struct brw_context *brw)
+{
+   /* Installs nothing: the only assignment, "brw->pipe.texture_update = brw_texture_update", is disabled. */
+}
+
+
+/* Plug the texture create / destroy / get-surface callbacks into the screen. */
+void brw_init_screen_texture_funcs(struct pipe_screen *screen)
+{
+   screen->get_tex_surface = brw_get_tex_surface_screen;
+   screen->texture_destroy = brw_texture_destroy_screen;
+   screen->texture_create  = brw_texture_create_screen;
+}
+
diff --git a/src/gallium/drivers/i965simple/brw_tex_layout.h b/src/gallium/drivers/i965simple/brw_tex_layout.h
new file mode 100644
index 0000000000..a6b6ba8146
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_tex_layout.h
@@ -0,0 +1,44 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+
+
+#ifndef BRW_TEX_LAYOUT_H
+#define BRW_TEX_LAYOUT_H
+
+
+struct brw_context;
+struct pipe_screen;
+
+/* Per-context texture hook setup (the implementation in brw_tex_layout.c installs nothing). */
+extern void
+brw_init_texture_functions(struct brw_context *brw);
+/* Install texture_create, texture_destroy and get_tex_surface on the screen. */
+extern void
+brw_init_screen_texture_funcs(struct pipe_screen *screen);
+
+
+#endif
diff --git a/src/gallium/drivers/i965simple/brw_urb.c b/src/gallium/drivers/i965simple/brw_urb.c
new file mode 100644
index 0000000000..101a4367b9
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_urb.c
@@ -0,0 +1,186 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#include "brw_context.h"
+//#include "brw_state.h"
+#include "brw_batch.h"
+#include "brw_defines.h"
+
+#define VS 0
+#define GS 1
+#define CLP 2
+#define SF 3
+#define CS 4
+/* The defines above are the row indices into limits[] below. */
+/* XXX: Are the min_entry_size numbers useful?
+ * XXX: Verify min_nr_entries, esp for VS.
+ * XXX: Verify SF min_entry_size.
+ */
+static const struct {
+   unsigned min_nr_entries;         /* used when the preferred layout does not fit */
+   unsigned preferred_nr_entries;   /* tried first by recalculate_urb_fence() */
+   unsigned min_entry_size;         /* lower clamp applied to the CS, VS and SF entry sizes */
+   unsigned max_entry_size;         /* not read by the code in this file */
+} limits[CS+1] = {
+   { 8, 32, 1, 5 },			/* vs */
+   { 4, 8,  1, 5 },			/* gs */
+   { 6, 8,  1, 5 },			/* clp */
+   { 1, 8,  1, 12 },		        /* sf */
+   { 1, 4,  1, 32 }			/* cs */
+};
+
+
+/* Place the VS, GS, CLIP, SF and CS regions back to back from 0 and report whether they end within 256. */
+static boolean check_urb_layout( struct brw_context *brw )
+{
+   brw->urb.vs_start = 0;
+   brw->urb.gs_start = brw->urb.vsize * brw->urb.nr_vs_entries;
+   brw->urb.clip_start = brw->urb.vsize * brw->urb.nr_gs_entries + brw->urb.gs_start;
+   brw->urb.sf_start = brw->urb.vsize * brw->urb.nr_clip_entries + brw->urb.clip_start;
+   brw->urb.cs_start = brw->urb.sfsize * brw->urb.nr_sf_entries + brw->urb.sf_start;
+   return (brw->urb.csize * brw->urb.nr_cs_entries + brw->urb.cs_start) <= 256;
+}
+
+/* Recompute the URB partitioning from the current CURBE, VS and SF entry
+ * sizes.  A new layout is chosen when any size has grown, or - while the URB
+ * is constrained - when any size has shrunk; BRW_NEW_URB_FENCE is then set. */
+static void recalculate_urb_fence( struct brw_context *brw )
+{
+   unsigned csize = brw->curbe.total_size;
+   unsigned vsize = brw->vs.prog_data->urb_entry_size;
+   unsigned sfsize = brw->sf.prog_data->urb_entry_size;
+
+   if (csize < limits[CS].min_entry_size)
+      csize = limits[CS].min_entry_size;
+
+   if (vsize < limits[VS].min_entry_size)
+      vsize = limits[VS].min_entry_size;
+
+   if (sfsize < limits[SF].min_entry_size)
+      sfsize = limits[SF].min_entry_size;
+
+   /* The constrained clause must compare the stored sizes against the newly
+    * requested ones; comparing a field with itself can never be true. */
+   if (brw->urb.vsize < vsize ||
+       brw->urb.sfsize < sfsize ||
+       brw->urb.csize < csize ||
+       (brw->urb.constrained && (brw->urb.vsize > vsize ||
+                                 brw->urb.sfsize > sfsize ||
+                                 brw->urb.csize > csize))) {
+      brw->urb.csize = csize;
+      brw->urb.sfsize = sfsize;
+      brw->urb.vsize = vsize;
+
+      brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries;
+      brw->urb.nr_gs_entries = limits[GS].preferred_nr_entries;
+      brw->urb.nr_clip_entries = limits[CLP].preferred_nr_entries;
+      brw->urb.nr_sf_entries = limits[SF].preferred_nr_entries;
+      brw->urb.nr_cs_entries = limits[CS].preferred_nr_entries;
+
+      if (!check_urb_layout(brw)) {
+         brw->urb.nr_vs_entries = limits[VS].min_nr_entries;
+         brw->urb.nr_gs_entries = limits[GS].min_nr_entries;
+         brw->urb.nr_clip_entries = limits[CLP].min_nr_entries;
+         brw->urb.nr_sf_entries = limits[SF].min_nr_entries;
+         brw->urb.nr_cs_entries = limits[CS].min_nr_entries;
+
+         brw->urb.constrained = 1;
+
+         if (!check_urb_layout(brw)) {
+            /* This is impossible, given the maximal sizes of urb
+             * entries and the values for minimum nr of entries
+             * provided above.
+             */
+            debug_printf("couldn't calculate URB layout!\n");
+            exit(1);
+         }
+
+         if (BRW_DEBUG & (DEBUG_URB|DEBUG_FALLBACKS))
+            debug_printf("URB CONSTRAINED\n");
+      }
+      else
+         brw->urb.constrained = 0;
+
+      if (BRW_DEBUG & DEBUG_URB)
+         debug_printf("URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
+                      brw->urb.vs_start,
+                      brw->urb.gs_start,
+                      brw->urb.clip_start,
+                      brw->urb.sf_start,
+                      brw->urb.cs_start,
+                      256);
+
+      brw->state.dirty.brw |= BRW_NEW_URB_FENCE;
+   }
+}
+
+
+const struct brw_tracked_state brw_recalculate_urb_fence = {
+   .dirty = {
+      .brw = BRW_NEW_CURBE_OFFSETS,   /* NOTE(review): presumably raised when curbe.total_size (the CS entry size) changes - confirm */
+      .cache = (CACHE_NEW_VS_PROG |   /* NOTE(review): presumably raised when vs/sf prog_data (urb_entry_size) changes - confirm */
+		CACHE_NEW_SF_PROG)
+   },
+   .update = recalculate_urb_fence
+};
+
+
+
+
+
+/* Emit a URB_FENCE command into the batch.  Each unit's fence is set to the
+ * start of the region that follows it; the last (CS) fence is 256, the
+ * total number of URB register pairs. */
+void brw_upload_urb_fence(struct brw_context *brw)
+{
+   struct brw_urb_fence uf;
+
+   memset(&uf, 0, sizeof(uf));
+
+   /* Fence values, in layout order VS, GS, CLIP, SF, CS. */
+   uf.bits0.vs_fence  = brw->urb.gs_start;
+   uf.bits0.gs_fence  = brw->urb.clip_start;
+   uf.bits0.clp_fence = brw->urb.sf_start;
+   uf.bits1.sf_fence  = brw->urb.cs_start;
+   uf.bits1.cs_fence  = 256;
+
+   /* Header, with every *_realloc bit set. */
+   uf.header.opcode = CMD_URB_FENCE;
+   uf.header.length = sizeof(uf)/4-2;
+   uf.header.cs_realloc = 1;
+   uf.header.vfe_realloc = 1;
+   uf.header.sf_realloc = 1;
+   uf.header.clp_realloc = 1;
+   uf.header.gs_realloc = 1;
+   uf.header.vs_realloc = 1;
+   BRW_BATCH_STRUCT(brw, &uf);
+}
diff --git a/src/gallium/drivers/i965simple/brw_util.c b/src/gallium/drivers/i965simple/brw_util.c
new file mode 100644
index 0000000000..42391d7c8c
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_util.c
@@ -0,0 +1,104 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#include "brw_util.h"
+#include "brw_defines.h"
+
+#include "pipe/p_defines.h"
+
+/* Return the number of set bits in val. */
+unsigned brw_count_bits( unsigned val )
+{
+   unsigned count = 0;
+   for (; val != 0; val >>= 1)
+      count += val & 1u;
+   return count;
+}
+
+
+/* Translate a PIPE_BLEND_* equation into the matching BRW_BLENDFUNCTION_*
+ * value.  Unknown modes assert and fall back to BRW_BLENDFUNCTION_ADD.
+ */
+unsigned brw_translate_blend_equation( int mode )
+{
+   unsigned hw_func = BRW_BLENDFUNCTION_ADD;
+   switch (mode) {
+   case PIPE_BLEND_ADD:              hw_func = BRW_BLENDFUNCTION_ADD; break;
+   case PIPE_BLEND_SUBTRACT:         hw_func = BRW_BLENDFUNCTION_SUBTRACT; break;
+   case PIPE_BLEND_REVERSE_SUBTRACT: hw_func = BRW_BLENDFUNCTION_REVERSE_SUBTRACT; break;
+   case PIPE_BLEND_MIN:              hw_func = BRW_BLENDFUNCTION_MIN; break;
+   case PIPE_BLEND_MAX:              hw_func = BRW_BLENDFUNCTION_MAX; break;
+   default:
+      assert(0);
+      break;
+   }
+   return hw_func;
+}
+
+/* Translate a PIPE_BLENDFACTOR_* value into the equally named
+ * BRW_BLENDFACTOR_* hardware value.  Unknown factors assert and fall back
+ * to BRW_BLENDFACTOR_ZERO.
+ */
+unsigned brw_translate_blend_factor( int factor )
+{
+   unsigned hw_factor = BRW_BLENDFACTOR_ZERO;
+
+   switch(factor) {
+   /* Fixed factors. */
+   case PIPE_BLENDFACTOR_ZERO:               hw_factor = BRW_BLENDFACTOR_ZERO; break;
+   case PIPE_BLENDFACTOR_ONE:                hw_factor = BRW_BLENDFACTOR_ONE; break;
+
+   /* Source factors. */
+   case PIPE_BLENDFACTOR_SRC_COLOR:          hw_factor = BRW_BLENDFACTOR_SRC_COLOR; break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:      hw_factor = BRW_BLENDFACTOR_INV_SRC_COLOR; break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:          hw_factor = BRW_BLENDFACTOR_SRC_ALPHA; break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:      hw_factor = BRW_BLENDFACTOR_INV_SRC_ALPHA; break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: hw_factor = BRW_BLENDFACTOR_SRC_ALPHA_SATURATE; break;
+
+   /* Destination factors. */
+   case PIPE_BLENDFACTOR_DST_COLOR:          hw_factor = BRW_BLENDFACTOR_DST_COLOR; break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:      hw_factor = BRW_BLENDFACTOR_INV_DST_COLOR; break;
+   case PIPE_BLENDFACTOR_DST_ALPHA:          hw_factor = BRW_BLENDFACTOR_DST_ALPHA; break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:      hw_factor = BRW_BLENDFACTOR_INV_DST_ALPHA; break;
+
+   /* Constant factors. */
+   case PIPE_BLENDFACTOR_CONST_COLOR:        hw_factor = BRW_BLENDFACTOR_CONST_COLOR; break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:    hw_factor = BRW_BLENDFACTOR_INV_CONST_COLOR; break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:        hw_factor = BRW_BLENDFACTOR_CONST_ALPHA; break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:    hw_factor = BRW_BLENDFACTOR_INV_CONST_ALPHA; break;
+
+   default:
+      assert(0);
+      break;
+   }
+   return hw_factor;
+}
diff --git a/src/gallium/drivers/i965simple/brw_util.h b/src/gallium/drivers/i965simple/brw_util.h
new file mode 100644
index 0000000000..d60e5934db
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_util.h
@@ -0,0 +1,43 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+          
+
+#ifndef BRW_UTIL_H
+#define BRW_UTIL_H
+
+#include "pipe/p_state.h"
+
+extern unsigned brw_count_bits( unsigned val );               /* number of set bits in val */
+extern unsigned brw_translate_blend_factor( int factor );     /* PIPE_BLENDFACTOR_* -> BRW_BLENDFACTOR_* */
+extern unsigned brw_translate_blend_equation( int mode );     /* PIPE_BLEND_* -> BRW_BLENDFUNCTION_* */
+
+
+#endif
diff --git a/src/gallium/drivers/i965simple/brw_vs.c b/src/gallium/drivers/i965simple/brw_vs.c
new file mode 100644
index 0000000000..92327e896d
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_vs.c
@@ -0,0 +1,120 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#include "brw_context.h"
+#include "brw_vs.h"
+#include "brw_util.h"
+#include "brw_state.h"
+
+/* Compile vertex program vp for the given key and hand the result to the VS program cache. */
+static void do_vs_prog( struct brw_context *brw,
+			const struct brw_vertex_program *vp,
+			struct brw_vs_prog_key *key )
+{
+   unsigned program_size;
+   const unsigned *program;
+   struct brw_vs_compile c;
+   /* Zero the whole compile state, then take a byte-for-byte copy of the key. */
+   memset(&c, 0, sizeof(c));
+   memcpy(&c.key, key, sizeof(*key));
+
+   brw_init_compile(&c.func);
+   c.vp = vp;
+   /* NOTE(review): these fields receive counts here, while the disabled code below ORs bit flags into them - confirm which is intended. */
+   c.prog_data.outputs_written = vp->info.num_outputs;
+   c.prog_data.inputs_read = vp->info.num_inputs;
+
+#if 0
+   if (c.key.copy_edgeflag) {
+      c.prog_data.outputs_written |= 1<<VERT_RESULT_EDGE;
+      c.prog_data.inputs_read |= 1<<VERT_ATTRIB_EDGEFLAG;
+   }
+#endif
+
+   /* Emit GEN4 code.
+    */
+   brw_vs_emit(&c);
+
+   /* get the program
+    */
+   program = brw_get_program(&c.func, &program_size);
+
+   /* Pass key, code and prog_data to the VS program cache; the call's
+    * return value is stored in brw->vs.prog_gs_offset. */
+   brw->vs.prog_gs_offset = brw_upload_cache( &brw->cache[BRW_VS_PROG],
+					      &c.key,
+					      sizeof(c.key),
+					      program,
+					      program_size,
+					      &c.prog_data,
+					      &brw->vs.prog_data);
+}
+
+
+/* Tracked-state update for the vertex program: build the program key from
+ * the current state and look it up in the VS program cache, calling
+ * do_vs_prog() to compile and upload when the lookup fails.
+ */
+static void brw_upload_vs_prog( struct brw_context *brw )
+{
+   const struct brw_vertex_program *vp = brw->attribs.VertexProgram;
+   struct brw_vs_prog_key key;
+
+   assert(vp);
+
+   /* The key is zeroed first and then filled from the program id, the
+    * Clip.nr state and whether either polygon fill mode is something
+    * other than FILL. */
+   memset(&key, 0, sizeof(key));
+   key.program_string_id = vp->id;
+   key.nr_userclip = brw->attribs.Clip.nr;
+   key.copy_edgeflag = !(brw->attribs.Raster->fill_cw == PIPE_POLYGON_MODE_FILL &&
+                         brw->attribs.Raster->fill_ccw == PIPE_POLYGON_MODE_FILL);
+
+   /* Only compile when the early cache lookup does not find the key. */
+   if (!brw_search_cache(&brw->cache[BRW_VS_PROG],
+                         &key, sizeof(key),
+                         &brw->vs.prog_data,
+                         &brw->vs.prog_gs_offset))
+      do_vs_prog(brw, vp, &key);
+}
+
+
+/* Tracked-state entry for the vertex program: pairs the BRW_NEW_VS dirty
+ * flag with brw_upload_vs_prog() above. */
+const struct brw_tracked_state brw_vs_prog = {
+   .dirty = {
+      .brw   = BRW_NEW_VS,
+      .cache = 0
+   },
+   .update = brw_upload_vs_prog
+};
diff --git a/src/gallium/drivers/i965simple/brw_vs.h b/src/gallium/drivers/i965simple/brw_vs.h
new file mode 100644
index 0000000000..070f9dfcae
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_vs.h
@@ -0,0 +1,82 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#ifndef BRW_VS_H
+#define BRW_VS_H
+
+
+#include "brw_context.h"
+#include "brw_eu.h"
+
+
+struct brw_vs_prog_key {
+   unsigned program_string_id;   /* filled from the vertex program's id in brw_vs.c */
+   unsigned nr_userclip:4;       /* filled from brw->attribs.Clip.nr */
+   unsigned copy_edgeflag:1;     /* set when either polygon fill mode is not FILL */
+   unsigned know_w_is_one:1;     /* not written in brw_vs.c, so it keeps the 0 from the key's memset */
+   unsigned pad:26;              /* explicit padding: 4 + 1 + 1 + 26 = 32 bits */
+};
+
+
+struct brw_vs_compile {
+   struct brw_compile func;              /* set up with brw_init_compile(), read back with brw_get_program() */
+   struct brw_vs_prog_key key;           /* copy of the key the program is compiled for */
+   struct brw_vs_prog_data prog_data;    /* passed to brw_upload_cache() together with the code */
+
+   const struct brw_vertex_program *vp;  /* program being compiled */
+
+   unsigned nr_inputs;
+
+   unsigned first_output;
+   unsigned nr_outputs;
+
+   unsigned first_tmp;
+   unsigned last_tmp;
+
+   struct brw_reg r0;
+   struct brw_reg r1;
+   struct brw_reg regs[12][128];         /* NOTE(review): presumably indexed [register file][index] - confirm in brw_vs_emit.c */
+   struct brw_reg tmp;
+   struct brw_reg stack;
+
+   struct {
+       boolean used_in_src;
+       struct brw_reg reg;
+   } output_regs[128];
+
+   struct brw_reg userplane[6];
+
+};
+
+void brw_vs_emit( struct brw_vs_compile *c );   /* called by do_vs_prog() to emit the GEN4 code for c->vp */
+
+#endif
diff --git a/src/gallium/drivers/i965simple/brw_vs_emit.c b/src/gallium/drivers/i965simple/brw_vs_emit.c
new file mode 100644
index 0000000000..3ee82d95b3
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_vs_emit.c
@@ -0,0 +1,1330 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "brw_context.h"
+#include "brw_vs.h"
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+
+struct brw_prog_info {
+   unsigned num_temps;        /* TGSI_FILE_TEMPORARY registers declared */
+   unsigned num_addrs;        /* TGSI_FILE_ADDRESS registers declared */
+   unsigned num_consts;       /* TGSI_FILE_CONSTANT registers declared */
+
+   unsigned writes_psize;     /* set when a PSIZE output is declared */
+
+   unsigned pos_idx;          /* output index declared with TGSI_SEMANTIC_POSITION */
+   unsigned result_edge_idx;  /* output index written when key.copy_edgeflag is set */
+   unsigned edge_flag_idx;    /* input index read when key.copy_edgeflag is set */
+   unsigned psize_idx;        /* output index declared with TGSI_SEMANTIC_PSIZE */
+};
+
+/* Do things as simply as possible.  Allocate and populate all regs
+ * ahead of time; GRFs are numbered sequentially in the order below.
+ */
+static void brw_vs_alloc_regs( struct brw_vs_compile *c,
+                               struct brw_prog_info *info )
+{
+   unsigned i, reg = 0, mrf;
+   unsigned nr_params;
+
+   /* r0 -- reserved as usual
+    */
+   c->r0 = brw_vec8_grf(reg, 0); reg++;
+
+   /* User clip planes from curbe, two vec4 planes per GRF from reg+3 on:
+    */
+   if (c->key.nr_userclip) {
+      for (i = 0; i < c->key.nr_userclip; i++) {
+	 c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
+      }
+
+      /* Deal with curbe alignment:
+       */
+      reg += ((6+c->key.nr_userclip+3)/4)*2;
+   }
+
+   /* Vertex program parameters from curbe, two vec4s per GRF:
+    */
+   nr_params = c->prog_data.max_const;
+   for (i = 0; i < nr_params; i++) {
+      c->regs[TGSI_FILE_CONSTANT][i] = stride(brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
+   }
+   reg += (nr_params+1)/2;
+   c->prog_data.curb_read_length = reg - 1;	/* GRFs allocated so far, not counting r0 */
+
+
+
+   /* Allocate input regs:
+    */
+   c->nr_inputs = c->vp->info.num_inputs;
+   for (i = 0; i < c->nr_inputs; i++) {
+	 c->regs[TGSI_FILE_INPUT][i] = brw_vec8_grf(reg, 0);
+	 reg++;
+   }
+
+
+   /* Allocate outputs: TODO: could organize the non-position outputs
+    * to go straight into message regs.
+    */
+   c->nr_outputs = 0;
+   c->first_output = reg;
+   mrf = 4;			/* m1..m3 are written by emit_vertex_write() */
+   for (i = 0; i < c->vp->info.num_outputs; i++) {
+      c->nr_outputs++;
+#if 0
+      if (i == VERT_RESULT_HPOS) {
+         c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
+         reg++;
+      }
+      else if (i == VERT_RESULT_PSIZ) {
+         c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
+         reg++;
+         mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
+      }
+      else {
+         c->regs[TGSI_FILE_OUTPUT][i] = brw_message_reg(mrf);
+         mrf++;
+      }
+#else
+      /* Treat pos differently for now: it stays in a GRF, all other outputs go to message regs. */
+      if (i == info->pos_idx) {
+         c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
+         reg++;
+      } else {
+         c->regs[TGSI_FILE_OUTPUT][i] = brw_message_reg(mrf);
+         mrf++;
+      }
+#endif
+   }
+
+   /* Allocate program temporaries:
+    */
+   for (i = 0; i < info->num_temps; i++) {
+      c->regs[TGSI_FILE_TEMPORARY][i] = brw_vec8_grf(reg, 0);
+      reg++;
+   }
+
+   /* Address reg(s).  Don't try to use the internal address reg until
+    * deref time.
+    */
+   for (i = 0; i < info->num_addrs; i++) {
+      c->regs[TGSI_FILE_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
+                                               reg,
+                                               0,
+                                               BRW_REGISTER_TYPE_D,
+                                               BRW_VERTICAL_STRIDE_8,
+                                               BRW_WIDTH_8,
+                                               BRW_HORIZONTAL_STRIDE_1,
+                                               BRW_SWIZZLE_XXXX,
+                                               TGSI_WRITEMASK_X);
+      reg++;
+   }
+
+   for (i = 0; i < 128; i++) {	/* one extra GRF per output flagged used_in_src */
+      if (c->output_regs[i].used_in_src) {
+         c->output_regs[i].reg = brw_vec8_grf(reg, 0);
+         reg++;
+      }
+   }
+
+   c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
+   reg += 2;			/* two GRFs are set aside for the stack */
+
+
+   /* Some opcodes need an internal temporary:
+    */
+   c->first_tmp = reg;
+   c->last_tmp = reg;		/* for allocation purposes */
+
+   /* Each input reg holds data from two vertices.  The
+    * urb_read_length is the number of registers read from *each*
+    * vertex urb, so is half the amount:
+    */
+   c->prog_data.urb_read_length = (c->nr_inputs+1)/2;
+
+   c->prog_data.urb_entry_size = (c->nr_outputs+2+3)/4;
+   c->prog_data.total_grf = reg;
+}
+
+
+/* Hand out the next scratch GRF, raising total_grf if the new top exceeds it. */
+static struct brw_reg get_tmp( struct brw_vs_compile *c )
+{
+   struct brw_reg scratch = brw_vec8_grf(c->last_tmp, 0);
+   c->last_tmp++;
+   if (c->prog_data.total_grf < c->last_tmp)
+      c->prog_data.total_grf = c->last_tmp;
+   return scratch;
+}
+
+static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
+{  /* Stack discipline: only the most recently allocated temporary is handed back. */
+   if (tmp.nr == c->last_tmp-1)
+      c->last_tmp--;	/* any other register is silently left allocated */
+}
+
+static void release_tmps( struct brw_vs_compile *c )
+{  /* Drop every temporary handed out by get_tmp() in one go. */
+   c->last_tmp = c->first_tmp;
+}
+
+/* Run func(c, dst, arg0), staging through a scratch register when dst and arg0 are the same register. */
+static void unalias1( struct brw_vs_compile *c,
+		      struct brw_reg dst,
+		      struct brw_reg arg0,
+		      void (*func)( struct brw_vs_compile *,
+				    struct brw_reg,
+				    struct brw_reg ))
+{
+   struct brw_reg scratch;
+
+   if (dst.file != arg0.file || dst.nr != arg0.nr) {
+      func(c, dst, arg0);	/* no overlap: write dst directly */
+      return;
+   }
+   scratch = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
+   func(c, scratch, arg0);	/* overlap: compute into the scratch, then copy */
+   brw_MOV(&c->func, dst, scratch);
+}
+
+static void unalias2( struct brw_vs_compile *c,
+		      struct brw_reg dst,
+		      struct brw_reg arg0,
+		      struct brw_reg arg1,
+		      void (*func)( struct brw_vs_compile *,
+				    struct brw_reg,
+				    struct brw_reg,
+				    struct brw_reg ))
+{
+   /* Two-source variant: stage through a scratch if dst is either source register. */
+   struct brw_reg scratch;
+   if (!(dst.file == arg0.file && dst.nr == arg0.nr) &&
+       !(dst.file == arg1.file && dst.nr == arg1.nr)) {
+      func(c, dst, arg0, arg1);
+      return;
+   }
+   scratch = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
+   func(c, scratch, arg0, arg1);
+   brw_MOV(&c->func, dst, scratch);
+}
+
+/* Set-on-compare: dst = (arg0 <cond> arg1) ? 1.0 : 0.0.  The CMP is emitted
+ * before any write to dst, so the sources are read first even if dst aliases them.
+ */
+static void emit_sop( struct brw_compile *p, struct brw_reg dst,
+                      struct brw_reg arg0, struct brw_reg arg1, unsigned cond)
+{
+   brw_push_insn_state(p);
+   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
+   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+   brw_MOV(p, dst, brw_imm_f(0.0f));	/* unpredicated default: test failed */
+   brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+   brw_MOV(p, dst, brw_imm_f(1.0f));	/* predicated: overwrite where cond held */
+   brw_pop_insn_state(p);
+}
+
+static void emit_seq( struct brw_compile *p,
+                      struct brw_reg dst,
+                      struct brw_reg arg0,
+                      struct brw_reg arg1 )
+{  /* SEQ: emit_sop() with the "equal" condition. */
+   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
+}
+
+static void emit_sne( struct brw_compile *p,
+                      struct brw_reg dst,
+                      struct brw_reg arg0,
+                      struct brw_reg arg1 )
+{  /* SNE: emit_sop() with the "not equal" condition. */
+   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
+}
+static void emit_slt( struct brw_compile *p,
+		      struct brw_reg dst,
+		      struct brw_reg arg0,
+		      struct brw_reg arg1 )
+{  /* SLT: emit_sop() with the "less than" condition. */
+   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L);
+}
+
+static void emit_sle( struct brw_compile *p,
+		      struct brw_reg dst,
+		      struct brw_reg arg0,
+		      struct brw_reg arg1 )
+{  /* SLE: emit_sop() with the "less than or equal" condition. */
+   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE);
+}
+
+static void emit_sgt( struct brw_compile *p,
+		      struct brw_reg dst,
+		      struct brw_reg arg0,
+		      struct brw_reg arg1 )
+{  /* SGT: emit_sop() with the "greater than" condition. */
+   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G);
+}
+
+static void emit_sge( struct brw_compile *p,
+		      struct brw_reg dst,
+		      struct brw_reg arg0,
+		      struct brw_reg arg1 )
+{  /* SGE: emit_sop() with the "greater than or equal" condition. */
+  emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE);
+}
+
+static void emit_max( struct brw_compile *p,
+		      struct brw_reg dst,
+		      struct brw_reg arg0,
+		      struct brw_reg arg1 )
+{  /* MAX: compare arg0 < arg1, then select between the operands. */
+   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
+   brw_SEL(p, dst, arg1, arg0);	/* presumably arg1 where arg0 < arg1, else arg0 -- TODO confirm SEL operand order */
+   brw_set_predicate_control(p, BRW_PREDICATE_NONE);	/* later instructions are unpredicated again */
+}
+
+static void emit_min( struct brw_compile *p,
+		      struct brw_reg dst,
+		      struct brw_reg arg0,
+		      struct brw_reg arg1 )
+{  /* MIN: same compare as emit_max(), with the SEL operands swapped. */
+   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
+   brw_SEL(p, dst, arg0, arg1);	/* presumably arg0 where arg0 < arg1, else arg1 -- TODO confirm SEL operand order */
+   brw_set_predicate_control(p, BRW_PREDICATE_NONE);	/* later instructions are unpredicated again */
+}
+
+
+static void emit_math1( struct brw_vs_compile *c,
+			unsigned function,
+			struct brw_reg dst,
+			struct brw_reg arg0,
+			unsigned precision)
+{
+   /* One-operand math-unit operation.  The result is written straight
+    * to dst only when dst is a GRF with a full xyzw writemask; any
+    * other destination is staged through a scratch GRF and copied.
+    * Rationale kept from the original author: SEND behaves oddly on the
+    * simulator and GEN4 has documented dependency-control problems on
+    * SEND results, so writemasked math results are not trusted, whether
+    * or not that turns out to be a simulator bug.
+    */
+   struct brw_compile *p = &c->func;
+   const boolean direct = (dst.dw1.bits.writemask == 0xf &&
+			   dst.file == BRW_GENERAL_REGISTER_FILE);
+   struct brw_reg result = direct ? dst : get_tmp(c);
+
+   brw_math(p,
+	    result,
+	    function,
+	    BRW_MATH_SATURATE_NONE,
+	    2,
+	    arg0,
+	    BRW_MATH_DATA_SCALAR,
+	    precision);
+
+   if (direct)
+      return;
+
+   /* Staged: the MOV applies dst's writemask, then the scratch is freed. */
+   brw_MOV(p, dst, result);
+   release_tmp(c, result);
+}
+
+static void emit_math2( struct brw_vs_compile *c,
+			unsigned function,
+			struct brw_reg dst,
+			struct brw_reg arg0,
+			struct brw_reg arg1,
+			unsigned precision)
+{
+   /* Two-operand math-unit operation.  As in emit_math1(), the result
+    * only goes straight to dst when dst is a GRF with a full writemask.
+    */
+   struct brw_compile *p = &c->func;
+   const boolean direct = (dst.dw1.bits.writemask == 0xf &&
+			   dst.file == BRW_GENERAL_REGISTER_FILE);
+   struct brw_reg result = direct ? dst : get_tmp(c);
+
+   brw_MOV(p, brw_message_reg(3), arg1);	/* second operand goes in message register 3 */
+
+   brw_math(p,
+	    result,
+	    function,
+	    BRW_MATH_SATURATE_NONE,
+	    2,
+	    arg0,
+	    BRW_MATH_DATA_SCALAR,
+	    precision);
+
+   if (direct)
+      return;
+   brw_MOV(p, dst, result);
+   release_tmp(c, result);
+}
+
+
+/* EXP: x = 2^floor(arg0.x), y = arg0.x - floor(arg0.x), z = mathbox EXP of arg0.x, w = 1. */
+static void emit_exp_noalias( struct brw_vs_compile *c,
+			      struct brw_reg dst,
+			      struct brw_reg arg0 )
+{
+   struct brw_compile *p = &c->func;
+
+   /* Each channel is only computed when it is enabled in dst's writemask. */
+   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_X) {
+      struct brw_reg tmp = get_tmp(c);
+      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
+
+      /* tmp_d = floor(arg0.x) */
+      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
+
+      /* result[0] = 2.0 ^ tmp */
+
+      /* Adjust exponent for floating point:
+       * exp += 127
+       */
+      brw_ADD(p, brw_writemask(tmp_d, TGSI_WRITEMASK_X), tmp_d, brw_imm_d(127));
+
+      /* Install exponent and sign: shifting left by 23 moves the biased
+       * exponent into the float's exponent field.  Excess drops off the edge:
+       */
+      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), TGSI_WRITEMASK_X),
+	      tmp_d, brw_imm_d(23));
+
+      release_tmp(c, tmp);
+   }
+
+   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_Y) {
+      /* result[1] = arg0.x - floor(arg0.x) */
+      brw_FRC(p, brw_writemask(dst, TGSI_WRITEMASK_Y), brw_swizzle1(arg0, 0));
+   }
+
+   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_Z) {
+      /* As with the LOG instruction, we might be better off just
+       * doing a taylor expansion here, seeing as we have to do all
+       * the prep work.
+       *
+       * If mathbox partial precision is too low, consider also:
+       * result[2] = result[0] * EXP(result[1])
+       */
+      emit_math1(c,
+		 BRW_MATH_FUNCTION_EXP,
+		 brw_writemask(dst, TGSI_WRITEMASK_Z),
+		 brw_swizzle1(arg0, 0),
+		 BRW_MATH_PRECISION_PARTIAL);
+   }
+
+   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_W) {
+      /* result[3] = 1.0; */
+      brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_W), brw_imm_f(1));
+   }
+}
+
+/* LOG: x = exponent of |arg0.x|, y = mantissa of |arg0.x| rebased to exponent 0, z = x + mathbox LOG(y), w = 1. */
+static void emit_log_noalias( struct brw_vs_compile *c,
+			      struct brw_reg dst,
+			      struct brw_reg arg0 )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg tmp = dst;
+   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
+   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
+   boolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
+			 dst.file != BRW_GENERAL_REGISTER_FILE);
+
+   if (need_tmp) {
+      tmp = get_tmp(c);
+      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
+   }
+
+   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mant
+    * according to spec:
+    *
+    * These almost look like they could be joined up, but not really
+    * practical:
+    *
+    * result[0].f = ((x.i & ((1U<<31)-1)) >> 23) - 127
+    * result[1].i = (x.i & ((1<<23)-1)) + (127<<23)
+    */
+   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_XZ) {
+      brw_AND(p,
+	      brw_writemask(tmp_ud, TGSI_WRITEMASK_X),
+	      brw_swizzle1(arg0_ud, 0),
+	      brw_imm_ud((1U<<31)-1));
+
+      brw_SHR(p,
+	      brw_writemask(tmp_ud, TGSI_WRITEMASK_X),
+	      tmp_ud,
+	      brw_imm_ud(23));
+
+      brw_ADD(p,
+	      brw_writemask(tmp, TGSI_WRITEMASK_X),
+	      retype(tmp_ud, BRW_REGISTER_TYPE_D),	/* does it matter? */
+	      brw_imm_d(-127));
+   }
+
+   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_YZ) {
+      brw_AND(p,
+	      brw_writemask(tmp_ud, TGSI_WRITEMASK_Y),
+	      brw_swizzle1(arg0_ud, 0),
+	      brw_imm_ud((1<<23)-1));
+
+      brw_OR(p,
+	     brw_writemask(tmp_ud, TGSI_WRITEMASK_Y),
+	     tmp_ud,
+	     brw_imm_ud(127<<23));
+   }
+
+   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_Z) {
+      /* result[2] = result[0] + LOG2(result[1]); */
+
+      /* Why bother?  The above is just a hint how to do this with a
+       * taylor series.  Maybe we *should* use a taylor series as by
+       * the time all the above has been done it's almost certainly
+       * quicker than calling the mathbox, even with low precision.
+       *
+       * Options are:
+       *    - result[0] + mathbox.LOG2(result[1])
+       *    - mathbox.LOG2(arg0.x)
+       *    - result[0] + inline_taylor_approx(result[1])
+       */
+      emit_math1(c,
+		 BRW_MATH_FUNCTION_LOG,
+		 brw_writemask(tmp, TGSI_WRITEMASK_Z),
+		 brw_swizzle1(tmp, 1),
+		 BRW_MATH_PRECISION_FULL);
+
+      brw_ADD(p,
+	      brw_writemask(tmp, TGSI_WRITEMASK_Z),
+	      brw_swizzle1(tmp, 2),
+	      brw_swizzle1(tmp, 0));
+   }
+
+   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_W) {
+      /* result[3] = 1.0; */
+      brw_MOV(p, brw_writemask(tmp, TGSI_WRITEMASK_W), brw_imm_f(1));
+   }
+
+   if (need_tmp) {
+      brw_MOV(p, dst, tmp);	/* the copy applies dst's writemask */
+      release_tmp(c, tmp);
+   }
+}
+
+
+
+
+/* DST: x = 1, y = arg0.y * arg1.y, z = arg0.z, w = arg1.w, per dst's writemask.
+ * Callers need to unalias - consider swizzles:   r0 = DST r0.xxxx r1 */
+static void emit_dst_noalias( struct brw_vs_compile *c,
+			      struct brw_reg dst,
+			      struct brw_reg arg0,
+			      struct brw_reg arg1)
+{
+   struct brw_compile *p = &c->func;
+   const unsigned mask = dst.dw1.bits.writemask;
+
+   /* One instruction per enabled channel; there must be a better way to do this: */
+   if (mask & TGSI_WRITEMASK_X)
+      brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_X), brw_imm_f(1.0));
+   if (mask & TGSI_WRITEMASK_Y)
+      brw_MUL(p, brw_writemask(dst, TGSI_WRITEMASK_Y), arg0, arg1);
+   if (mask & TGSI_WRITEMASK_Z)
+      brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_Z), arg0);
+   if (mask & TGSI_WRITEMASK_W)
+      brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_W), arg1);
+}
+
+static void emit_xpd( struct brw_compile *p,
+		      struct brw_reg dst,
+		      struct brw_reg t,
+		      struct brw_reg u)
+{  /* XPD (cross product) in two instructions; the first product is presumably carried in the accumulator -- TODO confirm. */
+   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));	/* t.yzxw * u.zxyw, no GRF destination */
+   brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));	/* multiply-accumulate with -t.zxyw * u.yzxw */
+}
+
+
+/* LIT: x = w = 1 and y = z = 0 by default; where arg0.x > 0, y = arg0.x and z = pow(arg0.y, arg0.w). */
+static void emit_lit_noalias( struct brw_vs_compile *c,
+			      struct brw_reg dst,
+			      struct brw_reg arg0 )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *if_insn;
+   struct brw_reg tmp = dst;
+   boolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
+
+   if (need_tmp)
+      tmp = get_tmp(c);
+
+   brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_YZ), brw_imm_f(0));
+   brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_XW), brw_imm_f(1));
+
+   /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
+    * to get all channels active inside the IF.  In the clipping code
+    * we run with NoMask, so it's not an option and we can use
+    * BRW_EXECUTE_1 for all comparisons.
+    */
+   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
+   if_insn = brw_IF(p, BRW_EXECUTE_8);
+   {
+      brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_Y), brw_swizzle1(arg0,0));
+      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
+      brw_MOV(p, brw_writemask(tmp, TGSI_WRITEMASK_Z),  brw_swizzle1(arg0,1));	/* predicated on arg0.y > 0 */
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+      emit_math2(c,
+		 BRW_MATH_FUNCTION_POW,
+		 brw_writemask(dst, TGSI_WRITEMASK_Z),
+		 brw_swizzle1(tmp, 2),
+		 brw_swizzle1(arg0, 3),
+		 BRW_MATH_PRECISION_PARTIAL);
+   }
+   brw_ENDIF(p, if_insn);
+   if (need_tmp)
+      release_tmp(c, tmp);	/* hand the scratch GRF back, as the other emitters do */
+}
+
+
+
+
+
+/* Map a (TGSI file, index) pair to its pre-allocated register.
+ * TODO: relative addressing! */
+static struct brw_reg get_reg( struct brw_vs_compile *c,
+			       unsigned file,
+			       unsigned index )
+{
+   switch (file) {
+   case TGSI_FILE_NULL:			/* undef values */
+      return brw_null_reg();
+   case TGSI_FILE_ADDRESS:
+      assert(index == 0);
+      return c->regs[file][index];
+   case TGSI_FILE_CONSTANT:		/* constants sit after the immediates */
+      index += c->prog_data.num_imm;
+      break;
+   case TGSI_FILE_IMMEDIATE:		/* immediates are read from the constant table */
+      file = TGSI_FILE_CONSTANT;
+      break;
+   case TGSI_FILE_TEMPORARY:
+   case TGSI_FILE_INPUT:
+   case TGSI_FILE_OUTPUT:
+      break;
+   default:
+      assert(0);
+      return brw_null_reg();
+   }
+
+   assert(c->regs[file][index].nr != 0);
+   return c->regs[file][index];
+}
+
+
+/* Indirect fetch: read two 4-dword values through address register 0 into a scratch GRF and return it as a vec8. */
+static struct brw_reg deref( struct brw_vs_compile *c,
+			     struct brw_reg arg,
+			     int offset)
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg tmp = vec4(get_tmp(c));	/* not released here: it is the return value */
+   struct brw_reg vp_address = retype(vec1(get_reg(c, TGSI_FILE_ADDRESS, 0)), BRW_REGISTER_TYPE_UW);
+   unsigned byte_offset = arg.nr * 32 + arg.subnr + offset * 16;	/* 32 bytes per register, 16 per offset step */
+   struct brw_reg indirect = brw_vec4_indirect(0,0);
+
+   {
+      brw_push_insn_state(p);
+      brw_set_access_mode(p, BRW_ALIGN_1);
+
+      /* This is pretty clunky - load the address register twice and
+       * fetch each 4-dword value in turn.  There must be a way to do
+       * this in a single pass, but I couldn't get it to work.
+       */
+      brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
+      brw_MOV(p, tmp, indirect);
+
+      brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
+      brw_MOV(p, suboffset(tmp, 4), indirect);
+
+      brw_pop_insn_state(p);
+   }
+
+   return vec8(tmp);
+}
+
+/* ARL: dst = floor(arg0) * 16.  The rounding step is staged through a
+ * scratch GRF whenever dst itself is not a GRF. */
+static void emit_arl( struct brw_vs_compile *c,
+		      struct brw_reg dst,
+		      struct brw_reg arg0 )
+{
+   struct brw_compile *p = &c->func;
+   const boolean dst_is_grf = (dst.file == BRW_GENERAL_REGISTER_FILE);
+   struct brw_reg floored = dst_is_grf ? dst : get_tmp(c);
+
+   /* Round down first, then scale by 16 -- the same factor deref()
+    * applies to its offset when forming a byte address. */
+   brw_RNDD(p, floored, arg0);
+   brw_MUL(p, dst, floored, brw_imm_d(16));
+
+   if (!dst_is_grf)
+      release_tmp(c, floored);
+}
+
+
+/* Translate a TGSI source operand into a brw_reg: look up the register,
+ * then apply the operand's swizzle and negate flag.  Will return mangled
+ * results for the SWZ op; emit_swz() ignores this result and recalculates
+ * taking extended swizzles into account. */
+static struct brw_reg get_arg( struct brw_vs_compile *c,
+			       struct tgsi_src_register *src )
+{
+   struct brw_reg reg;
+
+   if (src->File == TGSI_FILE_NULL)
+      return brw_null_reg();
+
+#if 0
+   if (src->RelAddr)
+      reg = deref(c, c->regs[PROGRAM_STATE_VAR][0], src->Index);
+   else
+#endif
+      reg = get_reg(c, src->File, src->Index);	/* always taken: the relative-addressing path is compiled out */
+
+   /* Convert 3-bit swizzle to 2-bit.
+    */
+   reg.dw1.bits.swizzle = BRW_SWIZZLE4(src->SwizzleX,
+				       src->SwizzleY,
+				       src->SwizzleZ,
+				       src->SwizzleW);
+
+   /* Note this is ok for non-swizzle instructions:
+    */
+   reg.negate = src->Negate ? 1 : 0;
+
+   return reg;
+}
+
+/* Translate a TGSI destination operand: look up the register and install the operand's writemask. */
+static struct brw_reg get_dst( struct brw_vs_compile *c,
+			       const struct tgsi_dst_register *dst )
+{
+   struct brw_reg reg = get_reg(c, dst->File, dst->Index);
+
+   reg.dw1.bits.writemask = dst->WriteMask;
+
+   return reg;
+}
+
+
+/* SWZ: write src to dst channel by channel, honouring extended swizzles
+ * (constant 0 and 1 terms) and the source's negate flag. */
+static void emit_swz( struct brw_vs_compile *c,
+		      struct brw_reg dst,
+		      struct tgsi_src_register src )
+{
+   struct brw_compile *p = &c->func;
+   unsigned zeros_mask = 0;
+   unsigned ones_mask = 0;
+   unsigned src_mask = 0;
+   ubyte src_swz[4] = { 0, 0, 0, 0 };	/* defined even for channels that are not copied from src */
+   boolean need_tmp = (src.Negate &&
+			 dst.file != BRW_GENERAL_REGISTER_FILE);
+   struct brw_reg tmp = dst;
+   unsigned i;
+
+   if (need_tmp)
+      tmp = get_tmp(c);
+
+   for (i = 0; i < 4; i++) {
+      if (dst.dw1.bits.writemask & (1<<i)) {
+	 ubyte s = 0;
+         switch(i) {	/* pick the swizzle selector belonging to channel i */
+         case 0:
+            s = src.SwizzleX;
+            break;
+         case 1:
+            s = src.SwizzleY;
+            break;
+         case 2:
+            s = src.SwizzleZ;
+            break;
+         case 3:
+            s = src.SwizzleW;
+            break;
+         }
+	 switch (s) {
+	 case TGSI_SWIZZLE_X:
+	 case TGSI_SWIZZLE_Y:
+	 case TGSI_SWIZZLE_Z:
+	 case TGSI_SWIZZLE_W:
+	    src_mask |= 1<<i;
+	    src_swz[i] = s;
+	    break;
+	 case TGSI_EXTSWIZZLE_ZERO:
+	    zeros_mask |= 1<<i;
+	    break;
+	 case TGSI_EXTSWIZZLE_ONE:
+	    ones_mask |= 1<<i;
+	    break;
+	 }
+      }
+   }
+
+   /* Do src first, in case dst aliases src:
+    */
+   if (src_mask) {
+      struct brw_reg arg0;
+
+#if 0
+      if (src.RelAddr)
+	 arg0 = deref(c, c->regs[PROGRAM_STATE_VAR][0], src.Index);
+      else
+#endif
+	 arg0 = get_reg(c, src.File, src.Index);
+
+      arg0 = brw_swizzle(arg0,
+			 src_swz[0], src_swz[1],
+			 src_swz[2], src_swz[3]);
+
+      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
+   }
+
+   if (zeros_mask)
+      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
+
+   if (ones_mask)
+      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
+   /* NOTE(review): src.Negate doubles as the writemask below -- confirm it is a per-channel mask. */
+   if (src.Negate)
+      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));
+
+   if (need_tmp) {
+      brw_MOV(p, dst, tmp);
+      release_tmp(c, tmp);
+   }
+}
+
+
+
+/* Post-vertex-program processing.  Build the vertex header (point size and
+ * clip flags) and the NDC position, then send the results to the URB. */
+static void emit_vertex_write( struct brw_vs_compile *c, struct brw_prog_info *info)
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg m0 = brw_message_reg(0);
+   struct brw_reg pos = c->regs[TGSI_FILE_OUTPUT][info->pos_idx];
+   struct brw_reg ndc;
+
+   if (c->key.copy_edgeflag) {
+      brw_MOV(p,
+	      get_reg(c, TGSI_FILE_OUTPUT, info->result_edge_idx),
+	      get_reg(c, TGSI_FILE_INPUT, info->edge_flag_idx));
+   }
+
+
+   /* Build ndc coords unless the key says w is known to be one, in
+    * which case pos is used as-is: */
+   if (!c->key.know_w_is_one) {
+      ndc = get_tmp(c);
+      emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
+      brw_MUL(p, brw_writemask(ndc, TGSI_WRITEMASK_XYZ), pos, ndc);
+   }
+   else {
+      ndc = pos;
+   }
+
+   /* This includes the workaround for -ve rhw, so is no longer an
+    * optional step:
+    */
+   if (info->writes_psize ||
+       c->key.nr_userclip ||
+       !c->key.know_w_is_one)
+   {
+      struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
+      unsigned i;
+
+      brw_MOV(p, header1, brw_imm_ud(0));
+
+      brw_set_access_mode(p, BRW_ALIGN_16);
+
+      if (info->writes_psize) {
+	 struct brw_reg psiz = c->regs[TGSI_FILE_OUTPUT][info->psize_idx];
+	 brw_MUL(p, brw_writemask(header1, TGSI_WRITEMASK_W),
+                 brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
+	 brw_AND(p, brw_writemask(header1, TGSI_WRITEMASK_W), header1,
+                 brw_imm_ud(0x7ff<<8));
+      }
+
+      /* One flag bit per user clip plane, OR-ed into header1.w: */
+      for (i = 0; i < c->key.nr_userclip; i++) {
+	 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
+	 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
+	 brw_OR(p, brw_writemask(header1, TGSI_WRITEMASK_W), header1, brw_imm_ud(1<<i));
+	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+      }
+
+
+      /* i965 clipping workaround:
+       * 1) Test for -ve rhw
+       * 2) If set,
+       *      set ndc = (0,0,0,0)
+       *      set ucp[6] = 1
+       *
+       * Later, clipping will detect ucp[6] and ensure the primitive is
+       * clipped against all fixed planes.
+       */
+      if (!c->key.know_w_is_one) {
+	 brw_CMP(p,
+		 vec8(brw_null_reg()),
+		 BRW_CONDITIONAL_L,
+		 brw_swizzle1(ndc, 3),
+		 brw_imm_f(0));
+
+	 brw_OR(p, brw_writemask(header1, TGSI_WRITEMASK_W), header1, brw_imm_ud(1<<6));
+	 brw_MOV(p, ndc, brw_imm_f(0));
+	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+      }
+
+      brw_set_access_mode(p, BRW_ALIGN_1);	/* why? */
+      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
+      brw_set_access_mode(p, BRW_ALIGN_16);
+
+      release_tmp(c, header1);
+   }
+   else {
+      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));	/* no flags: header is all zero */
+   }
+
+
+   /* Emit the (interleaved) headers for the two vertices - an 8-reg
+    * of zeros followed by two sets of NDC coordinates:
+    */
+   brw_set_access_mode(p, BRW_ALIGN_1);
+   brw_MOV(p, offset(m0, 2), ndc);
+   brw_MOV(p, offset(m0, 3), pos);
+
+
+   brw_urb_WRITE(p,
+		 brw_null_reg(), /* dest */
+		 0,		/* starting mrf reg nr */
+		 c->r0,		/* src */
+		 0,		/* allocate */
+		 1,		/* used */
+		 c->nr_outputs + 3, /* msg len (presumably m0..m3 plus the non-position outputs from m4 on) */
+		 0,		/* response len */
+		 1, 		/* eot */
+		 1, 		/* writes complete */
+		 0, 		/* urb destination offset */
+		 BRW_URB_SWIZZLE_INTERLEAVE);
+
+}
+
+static void
+post_vs_emit( struct brw_vs_compile *c, struct brw_instruction *end_inst )
+{  /* Walks every token of the program; the branch-fixup body is compiled out, so nothing is patched and end_inst is unused. */
+   struct tgsi_parse_context parse;
+   const struct tgsi_token *tokens = c->vp->program.tokens;
+   tgsi_parse_init(&parse, tokens);
+   while (!tgsi_parse_end_of_tokens(&parse)) {
+      tgsi_parse_token(&parse);
+      if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {	/* currently an empty branch */
+#if 0
+         struct brw_instruction *brw_inst1, *brw_inst2;
+         const struct tgsi_full_instruction *inst1, *inst2;
+         int offset;
+         inst1 = &parse.FullToken.FullInstruction;
+         brw_inst1 = inst1->Data;
+         switch (inst1->Opcode) {
+	 case TGSI_OPCODE_CAL:
+	 case TGSI_OPCODE_BRA:
+	    target_insn = inst1->BranchTarget;
+	    inst2 = &c->vp->program.Base.Instructions[target_insn];
+	    brw_inst2 = inst2->Data;
+	    offset = brw_inst2 - brw_inst1;
+	    brw_set_src1(brw_inst1, brw_imm_d(offset*16));
+	    break;
+	 case TGSI_OPCODE_END:
+	    offset = end_inst - brw_inst1;
+	    brw_set_src1(brw_inst1, brw_imm_d(offset*16));
+	    break;
+	 default:
+	    break;
+         }
+#endif
+      }
+   }
+   tgsi_parse_free(&parse);
+}
+
+static void process_declaration(const struct tgsi_full_declaration *decl,
+                                struct brw_prog_info *info)
+{
+   /* Fold one TGSI declaration into the per-program summary:
+    *   - CONSTANT / TEMPORARY / ADDRESS: add the declared range to the
+    *     matching counter;
+    *   - OUTPUT with a semantic: remember the index of the POSITION and
+    *     PSIZE outputs (and flag that PSIZE is written);
+    *   - everything else is ignored.
+    */
+   const int first = decl->DeclarationRange.First;
+   const int last = decl->DeclarationRange.Last;
+   const int count = last - first + 1;
+
+   switch (decl->Declaration.File) {
+   case TGSI_FILE_CONSTANT:
+      info->num_consts += count;
+      break;
+
+   case TGSI_FILE_TEMPORARY:
+      info->num_temps += count;
+      break;
+
+   case TGSI_FILE_ADDRESS:
+      info->num_addrs += count;
+      break;
+
+   case TGSI_FILE_OUTPUT:
+      assert(last == first);	/* for now */
+      if (!decl->Declaration.Semantic)
+         break;
+
+      switch (decl->Semantic.SemanticName) {
+      case TGSI_SEMANTIC_POSITION:
+         info->pos_idx = first;
+         break;
+
+      case TGSI_SEMANTIC_PSIZE:
+         info->writes_psize = TRUE;
+         info->psize_idx = first;
+         break;
+
+      case TGSI_SEMANTIC_COLOR:
+      case TGSI_SEMANTIC_BCOLOR:
+      case TGSI_SEMANTIC_FOG:
+      case TGSI_SEMANTIC_GENERIC:
+         break;			/* no bookkeeping needed */
+      }
+      break;
+
+   case TGSI_FILE_INPUT:
+   case TGSI_FILE_SAMPLER:
+   case TGSI_FILE_IMMEDIATE:
+   case TGSI_FILE_NULL:
+      break;			/* nothing to record */
+   }
+}
+
+static void process_instruction(struct brw_vs_compile *c,
+                                struct tgsi_full_instruction *inst,
+                                struct brw_prog_info *info)
+{
+   struct brw_reg args[3], dst;
+   struct brw_compile *p = &c->func;
+   /*struct brw_indirect stack_index = brw_indirect(0, 0);*/
+   unsigned i;
+   unsigned index;
+   unsigned file;
+   /*FIXME: might not be the only one*/
+   const struct tgsi_dst_register *dst_reg = &inst->FullDstRegisters[0].DstRegister;
+   /*
+   struct brw_instruction *if_inst[MAX_IFSN];
+   unsigned insn, if_insn = 0;
+   */
+
+   for (i = 0; i < 3; i++) {
+      struct tgsi_full_src_register *src = &inst->FullSrcRegisters[i];
+      index = src->SrcRegister.Index;
+      file = src->SrcRegister.File;
+      if (file == TGSI_FILE_OUTPUT && c->output_regs[index].used_in_src)
+         args[i] = c->output_regs[index].reg;
+      else
+         args[i] = get_arg(c, &src->SrcRegister);
+   }
+
+   /* Get dest regs.  Note that it is possible for a reg to be both
+    * dst and arg, given the static allocation of registers.  So
+    * care needs to be taken emitting multi-operation instructions.
+    */
+   index = dst_reg->Index;
+   file = dst_reg->File;
+   if (file == TGSI_FILE_OUTPUT && c->output_regs[index].used_in_src)
+      dst = c->output_regs[index].reg;
+   else
+      dst = get_dst(c, dst_reg);
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_ABS:
+      brw_MOV(p, dst, brw_abs(args[0]));
+      break;
+   case TGSI_OPCODE_ADD:
+      brw_ADD(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_DP3:
+      brw_DP3(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_DP4:
+      brw_DP4(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_DPH:
+      brw_DPH(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_DST:
+      unalias2(c, dst, args[0], args[1], emit_dst_noalias);
+      break;
+   case TGSI_OPCODE_EXP:
+      unalias1(c, dst, args[0], emit_exp_noalias);
+      break;
+   case TGSI_OPCODE_EX2:
+      emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
+      break;
+   case TGSI_OPCODE_ARL:
+      emit_arl(c, dst, args[0]);
+      break;
+   case TGSI_OPCODE_FLR:
+      brw_RNDD(p, dst, args[0]);
+      break;
+   case TGSI_OPCODE_FRC:
+      brw_FRC(p, dst, args[0]);
+      break;
+   case TGSI_OPCODE_LOG:
+      unalias1(c, dst, args[0], emit_log_noalias);
+      break;
+   case TGSI_OPCODE_LG2:
+      emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
+      break;
+   case TGSI_OPCODE_LIT:
+      unalias1(c, dst, args[0], emit_lit_noalias);
+      break;
+   case TGSI_OPCODE_MAD:
+      brw_MOV(p, brw_acc_reg(), args[2]);
+      brw_MAC(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_MAX:
+      emit_max(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_MIN:
+      emit_min(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_MOV:
+   case TGSI_OPCODE_SWZ:
+#if 0
+      /* The args[0] value can't be used here as it won't have
+       * correctly encoded the full swizzle:
+       */
+      emit_swz(c, dst, inst->SrcReg[0] );
+#endif
+      brw_MOV(p, dst, args[0]);
+      break;
+   case TGSI_OPCODE_MUL:
+      brw_MUL(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_POW:
+      emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
+      break;
+   case TGSI_OPCODE_RCP:
+      emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
+      break;
+   case TGSI_OPCODE_RSQ:
+      emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
+      break;
+
+   case TGSI_OPCODE_SEQ:
+      emit_seq(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SNE:
+      emit_sne(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SGE:
+      emit_sge(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SGT:
+      emit_sgt(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SLT:
+      emit_slt(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SLE:
+      emit_sle(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SUB:
+      brw_ADD(p, dst, args[0], negate(args[1]));
+      break;
+   case TGSI_OPCODE_XPD:
+      emit_xpd(p, dst, args[0], args[1]);
+      break;
+#if 0
+   case TGSI_OPCODE_IF:
+      assert(if_insn < MAX_IFSN);
+      if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
+      break;
+   case TGSI_OPCODE_ELSE:
+      if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
+      break;
+   case TGSI_OPCODE_ENDIF:
+      assert(if_insn > 0);
+      brw_ENDIF(p, if_inst[--if_insn]);
+      break;
+   case TGSI_OPCODE_BRA:
+      brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+      brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
+      brw_set_predicate_control_flag_value(p, 0xff);
+      break;
+   case TGSI_OPCODE_CAL:
+      brw_set_access_mode(p, BRW_ALIGN_1);
+      brw_ADD(p, deref_1uw(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
+      brw_set_access_mode(p, BRW_ALIGN_16);
+      brw_ADD(p, get_addr_reg(stack_index),
+              get_addr_reg(stack_index), brw_imm_d(4));
+      inst->Data = &p->store[p->nr_insn];
+      brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
+      break;
+#endif
+   case TGSI_OPCODE_RET:
+#if 0
+      brw_ADD(p, get_addr_reg(stack_index),
+              get_addr_reg(stack_index), brw_imm_d(-4));
+      brw_set_access_mode(p, BRW_ALIGN_1);
+      brw_MOV(p, brw_ip_reg(), deref_1uw(stack_index, 0));
+      brw_set_access_mode(p, BRW_ALIGN_16);
+#else
+      /*brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));*/
+#endif
+      break;
+   case TGSI_OPCODE_END:
+      brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
+      break;
+   case TGSI_OPCODE_BGNSUB:
+   case TGSI_OPCODE_ENDSUB:
+      break;
+   default:
+      debug_printf("Unsupport opcode %d in vertex shader\n", inst->Instruction.Opcode);
+      break;
+   }
+
+   if (dst_reg->File == TGSI_FILE_OUTPUT
+       && dst_reg->Index != info->pos_idx
+       && c->output_regs[dst_reg->Index].used_in_src)
+      brw_MOV(p, get_dst(c, dst_reg), dst);
+
+   release_tmps(c);
+}
+
+/* Emit the vertex program instructions: translate the TGSI tokens of
+ * c->vp into brw instructions in c->func, using two passes over them. */
+void brw_vs_emit(struct brw_vs_compile *c)
+{
+#define MAX_IFSN 32
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *end_inst;
+   struct tgsi_parse_context parse;
+   struct brw_indirect stack_index = brw_indirect(0, 0);
+   const struct tgsi_token *tokens = c->vp->program.tokens;
+   struct brw_prog_info prog_info;
+   unsigned allocated_registers = 0;
+   memset(&prog_info, 0, sizeof(struct brw_prog_info));
+
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_set_access_mode(p, BRW_ALIGN_16);
+   /* Pass 1: flag every output register that some instruction reads. */
+   tgsi_parse_init(&parse, tokens);
+   /* Message registers can't be read, so copy the output into GRF register
+      if they are used in source registers */
+   while (!tgsi_parse_end_of_tokens(&parse)) {
+      tgsi_parse_token(&parse);
+      unsigned i;
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_INSTRUCTION: {
+         const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
+         for (i = 0; i < 3; ++i) {
+            const struct tgsi_src_register *src = &inst->FullSrcRegisters[i].SrcRegister;
+            unsigned index = src->Index;
+            unsigned file = src->File;
+            if (file == TGSI_FILE_OUTPUT)
+               c->output_regs[index].used_in_src = TRUE;
+         }
+      }
+         break;
+      default:
+         /* nothing */
+         break;
+      }
+   }
+   tgsi_parse_free(&parse);
+   /* Pass 2: collect declarations and immediates, emit each instruction. */
+   tgsi_parse_init(&parse, tokens);
+
+   while (!tgsi_parse_end_of_tokens(&parse)) {
+      tgsi_parse_token(&parse);
+
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_DECLARATION: {
+         struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration;
+         process_declaration(decl, &prog_info);
+      }
+         break;
+      case TGSI_TOKEN_TYPE_IMMEDIATE: {
+         struct tgsi_full_immediate *imm = &parse.FullToken.FullImmediate;
+         assert(imm->Immediate.NrTokens == 4 + 1);
+         c->prog_data.imm_buf[c->prog_data.num_imm][0] = imm->u[0].Float;
+         c->prog_data.imm_buf[c->prog_data.num_imm][1] = imm->u[1].Float;
+         c->prog_data.imm_buf[c->prog_data.num_imm][2] = imm->u[2].Float;
+         c->prog_data.imm_buf[c->prog_data.num_imm][3] = imm->u[3].Float;
+         c->prog_data.num_imm++;
+      }
+         break;
+      case TGSI_TOKEN_TYPE_INSTRUCTION: {
+         struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
+         if (!allocated_registers) {
+            /* first instruction (declarations finished).
+             * now that we know what vars are being used allocate
+             * registers for them.*/
+            c->prog_data.num_consts = prog_info.num_consts;
+            c->prog_data.max_const = prog_info.num_consts + c->prog_data.num_imm;
+            brw_vs_alloc_regs(c, &prog_info);
+            /* Load c->stack's address into the stack index reg (in ALIGN_1). */
+	    brw_set_access_mode(p, BRW_ALIGN_1);
+            brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
+	    brw_set_access_mode(p, BRW_ALIGN_16);
+            allocated_registers = 1;
+         }
+         process_instruction(c, inst, &prog_info);
+      }
+         break;
+      }
+   }
+   /* end_inst: first slot after the body, i.e. where the epilogue starts. */
+   end_inst = &p->store[p->nr_insn];
+   emit_vertex_write(c, &prog_info);
+   post_vs_emit(c, end_inst);
+   tgsi_parse_free(&parse);
+
+}
diff --git a/src/gallium/drivers/i965simple/brw_vs_state.c b/src/gallium/drivers/i965simple/brw_vs_state.c
new file mode 100644
index 0000000000..1eaff87892
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_vs_state.c
@@ -0,0 +1,103 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+static void upload_vs_unit( struct brw_context *brw )
+{
+   struct brw_vs_unit_state vs;
+   /* Build the VS unit state from scratch and hand it to the state cache. */
+   memset(&vs, 0, sizeof(vs));
+
+   /* CACHE_NEW_VS_PROG */
+   vs.thread0.kernel_start_pointer = brw->vs.prog_gs_offset >> 6;
+   vs.thread0.grf_reg_count = align(brw->vs.prog_data->total_grf, 16) / 16 - 1;
+   vs.thread3.urb_entry_read_length = brw->vs.prog_data->urb_read_length;
+   vs.thread3.const_urb_entry_read_length = brw->vs.prog_data->curb_read_length;
+   vs.thread3.dispatch_grf_start_reg = 1;
+   /* (offset >> 6: 64-byte units; grf count: blocks of 16, minus one.) */
+
+   /* BRW_NEW_URB_FENCE  */
+   vs.thread4.nr_urb_entries = brw->urb.nr_vs_entries;
+   vs.thread4.urb_entry_allocation_size = brw->urb.vsize - 1;
+   vs.thread4.max_threads = MIN2(
+      MAX2(0, (brw->urb.nr_vs_entries - 6) / 2 - 1),
+      15);
+   /* NOTE(review): if nr_vs_entries is unsigned and < 8, the expression
+    * above wraps and is clamped to 15 rather than 0 -- confirm its type.
+    */
+   if (BRW_DEBUG & DEBUG_SINGLE_THREAD)
+      vs.thread4.max_threads = 0;
+
+   /* BRW_NEW_CURBE_OFFSETS, _NEW_TRANSFORM */
+   if (0 /*brw->attribs.Clip->ClipPlanesEnabled*/) {
+      /* Note that we read in the userclip planes as well, hence
+       * clip_start:
+       */
+      vs.thread3.const_urb_entry_read_offset = brw->curbe.clip_start * 2;
+   }
+   else {
+      vs.thread3.const_urb_entry_read_offset = brw->curbe.vs_start * 2;
+   }
+
+   vs.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
+   vs.thread3.urb_entry_read_offset = 0;
+
+   /* No samplers for ARB_vp programs:
+    */
+   vs.vs5.sampler_count = 0;
+
+   if (BRW_DEBUG & DEBUG_STATS)
+      vs.thread4.stats_enable = 1;
+
+   /* Vertex program always enabled:
+    */
+   vs.vs6.vs_enable = 1;
+
+   brw->vs.state_gs_offset = brw_cache_data( &brw->cache[BRW_VS_UNIT], &vs );
+}
+
+
+const struct brw_tracked_state brw_vs_unit = {
+   .dirty = {   /* NOTE(review): the clip-plane path in upload_vs_unit is if (0) */
+      .brw   = (BRW_NEW_CLIP |
+		BRW_NEW_CURBE_OFFSETS |
+		BRW_NEW_URB_FENCE),
+      .cache = CACHE_NEW_VS_PROG
+   },
+   .update = upload_vs_unit
+};
diff --git a/src/gallium/drivers/i965simple/brw_winsys.h b/src/gallium/drivers/i965simple/brw_winsys.h
new file mode 100644
index 0000000000..ec1e400418
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_winsys.h
@@ -0,0 +1,209 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * \file
+ * This is the interface that i965simple requires any window system
+ * hosting it to implement.  This is the only include file in i965simple
+ * which is public.
+ *
+ */
+
+#ifndef BRW_WINSYS_H
+#define BRW_WINSYS_H
+
+
+#include "pipe/p_defines.h"
+
+
+/* Pipe drivers are (meant to be!) independent of both GL and the
+ * window system.  The window system provides a buffer manager and a
+ * set of additional hooks for things like command buffer submission,
+ * etc.
+ *
+ * There clearly has to be some agreement between the window system
+ * driver and the hardware driver about the format of command buffers,
+ * etc.
+ */
+
+struct pipe_buffer;
+struct pipe_fence_handle;
+struct pipe_winsys;
+struct pipe_screen;
+
+
+/* The pipe driver currently understands the following chipsets:
+ */
+#define PCI_CHIP_I965_G			0x29A2
+#define PCI_CHIP_I965_Q			0x2992
+#define PCI_CHIP_I965_G_1		0x2982
+#define PCI_CHIP_I965_GM                0x2A02
+#define PCI_CHIP_I965_GME               0x2A12
+
+
+/* These are the names of all the state caches managed by the driver.
+ * 
+ * When data is uploaded to a buffer with buffer_subdata, we use the
+ * special version of that function below so that information about
+ * what type of data this is can be passed to the winsys backend.
+ * That in turn allows the correct flags to be set in the aub file
+ * dump to allow human-readable file dumps later on.
+ */
+
+enum brw_cache_id {
+   BRW_CC_VP,
+   BRW_CC_UNIT,
+   BRW_WM_PROG,
+   BRW_SAMPLER_DEFAULT_COLOR,
+   BRW_SAMPLER,
+   BRW_WM_UNIT,
+   BRW_SF_PROG,
+   BRW_SF_VP,
+   BRW_SF_UNIT,
+   BRW_VS_UNIT,
+   BRW_VS_PROG,
+   BRW_GS_UNIT,
+   BRW_GS_PROG,
+   BRW_CLIP_VP,
+   BRW_CLIP_UNIT,
+   BRW_CLIP_PROG,
+   BRW_SS_SURFACE,
+   BRW_SS_SURF_BIND,
+   /* Equals the number of ids above; also aliased as BRW_CONSTANT_BUFFER. */
+   BRW_MAX_CACHE
+};
+
+#define BRW_CONSTANT_BUFFER BRW_MAX_CACHE
+
+/**
+ * Additional winsys interface for i965simple.
+ *
+ * It is an over-simple batchbuffer mechanism.  Will want to improve the
+ * performance of this, perhaps based on the cmdstream stuff.  It
+ * would be pretty impossible to implement swz on top of this
+ * interface.
+ *
+ * Will also need additions/changes to implement static/dynamic
+ * indirect state.
+ */
+struct brw_winsys {
+
+   void (*destroy)(struct brw_winsys *);
+   
+   /**
+    * Reserve space on batch buffer.
+    *
+    * Returns a null pointer if there is insufficient space in the batch buffer
+    * to hold the requested number of dwords and relocations.
+    *
+    * The number of dwords should also include the number of relocations.
+    */
+   unsigned *(*batch_start)(struct brw_winsys *sws,
+                            unsigned dwords,
+                            unsigned relocs);
+
+   void (*batch_dword)(struct brw_winsys *sws,
+                       unsigned dword);
+
+   /**
+    * Emit a relocation to a buffer.
+    *
+    * Used not only when the buffer addresses are not pinned, but also to
+    * ensure referred buffers will not be destroyed until the current batch
+    * buffer execution is finished.
+    *
+    * The access flags are presumably a combination of the
+    * BRW_BUFFER_ACCESS_WRITE and BRW_BUFFER_ACCESS_READ macros below.
+    */
+   void (*batch_reloc)(struct brw_winsys *sws,
+                       struct pipe_buffer *buf,
+                       unsigned access_flags,
+                       unsigned delta);
+
+
+   /* Called by brw_batchbuffer_data() below once all dwords are emitted:
+    */
+   void (*batch_end)( struct brw_winsys *sws );
+
+   /**
+    * Flush the batch buffer.
+    *
+    * Fence argument must point to NULL or to a previous fence, and the caller
+    * must call fence_reference when done with the fence.
+    */
+   void (*batch_flush)(struct brw_winsys *sws,
+                       struct pipe_fence_handle **fence);
+
+
+   /* A version of buffer_subdata that includes information for the
+    * simulator:
+    */
+   void (*buffer_subdata_typed)(struct brw_winsys *sws, 
+				struct pipe_buffer *buf,
+				unsigned long offset, 
+				unsigned long size, 
+				const void *data,
+				unsigned data_type);
+   
+
+   /* A cheat so we don't have to think about relocations in a couple
+    * of places yet:
+    */
+   unsigned (*get_buffer_offset)( struct brw_winsys *sws,
+				  struct pipe_buffer *buf,
+				  unsigned flags );
+
+};
+
+#define BRW_BUFFER_ACCESS_WRITE   0x1
+#define BRW_BUFFER_ACCESS_READ    0x2
+
+#define BRW_BUFFER_USAGE_LIT_VERTEX  (PIPE_BUFFER_USAGE_CUSTOM << 0)
+
+
+struct pipe_context *brw_create(struct pipe_screen *,
+                                struct brw_winsys *,
+                                unsigned pci_id);
+
+/* Copy bytes/sizeof(unsigned) dwords of data into the batch.  Returns FALSE,
+ * emitting nothing, if batch_start() returns NULL (space not available). */
+static inline boolean brw_batchbuffer_data(struct brw_winsys *winsys,
+                                           const void *data,
+                                           unsigned bytes)
+{
+   const unsigned *udata = (const unsigned *)data;
+   unsigned size = bytes / sizeof(unsigned);
+   unsigned i;
+
+   if (!winsys->batch_start(winsys, size, 0))
+      return FALSE;
+   for (i = 0; i < size; ++i)
+      winsys->batch_dword(winsys, udata[i]);
+   winsys->batch_end(winsys);
+   return TRUE;
+}
+#endif
diff --git a/src/gallium/drivers/i965simple/brw_wm.c b/src/gallium/drivers/i965simple/brw_wm.c
new file mode 100644
index 0000000000..10161f2d2f
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_wm.c
@@ -0,0 +1,209 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#include "brw_context.h"
+#include "brw_util.h"
+#include "brw_wm.h"
+#include "brw_eu.h"
+#include "brw_state.h"
+#include "util/u_memory.h"
+
+
+
+static void do_wm_prog( struct brw_context *brw,
+			struct brw_fragment_program *fp,
+			struct brw_wm_prog_key *key)
+{
+   /* Compile fp for *key and upload the result to the BRW_WM_PROG cache. */
+   struct brw_wm_compile *c = CALLOC_STRUCT(brw_wm_compile);
+   const unsigned *program;
+   unsigned program_size;
+
+   /* CALLOC_STRUCT may fail; don't dereference a NULL compile struct. */
+   if (!c)
+      return;
+
+   c->key = *key;
+   c->fp = fp;
+   /* Null regs: emit_pixel_xy()/emit_delta_xy() test for these via is_null(). */
+   c->delta_xy[0] = brw_null_reg();
+   c->delta_xy[1] = brw_null_reg();
+   c->pixel_xy[0] = brw_null_reg();
+   c->pixel_xy[1] = brw_null_reg();
+   c->pixel_w = brw_null_reg();
+
+   debug_printf("XXXXXXXX FP\n");
+   brw_wm_glsl_emit(c);
+
+   program = brw_get_program(&c->func, &program_size);
+
+   /* Upload the program keyed by c->key and keep the returned offset. */
+   brw->wm.prog_gs_offset = brw_upload_cache( &brw->cache[BRW_WM_PROG],
+					      &c->key,
+					      sizeof(c->key),
+					      program,
+					      program_size,
+					      &c->prog_data,
+					      &brw->wm.prog_data );
+
+   FREE(c);
+}
+
+
+
+static void brw_wm_populate_key( struct brw_context *brw,
+				 struct brw_wm_prog_key *key )
+{
+   /* BRW_NEW_FRAGMENT_PROGRAM */
+   struct brw_fragment_program *fp =
+      (struct brw_fragment_program *)brw->attribs.FragmentProgram;
+   unsigned lookup = 0;
+   unsigned line_aa;
+   /* Fill *key from the bound fragment program and depth/stencil/raster state. */
+   memset(key, 0, sizeof(*key));
+
+   /* Build the index for table lookup
+    */
+   /* BRW_NEW_DEPTH_STENCIL */
+   if (fp->info.uses_kill ||
+       brw->attribs.DepthStencil->alpha.enabled)
+      lookup |= IZ_PS_KILL_ALPHATEST_BIT;
+
+   if (fp->info.writes_z)
+      lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
+
+   if (brw->attribs.DepthStencil->depth.enabled)
+      lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
+
+   if (brw->attribs.DepthStencil->depth.enabled &&
+       brw->attribs.DepthStencil->depth.writemask) /* ?? */
+      lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
+
+   if (brw->attribs.DepthStencil->stencil[0].enabled) {
+      lookup |= IZ_STENCIL_TEST_ENABLE_BIT;
+
+      if (brw->attribs.DepthStencil->stencil[0].writemask ||
+	  brw->attribs.DepthStencil->stencil[1].writemask)
+	 lookup |= IZ_STENCIL_WRITE_ENABLE_BIT;
+   }
+
+   /* XXX: when should this be disabled?
+    */
+   if (1)
+      lookup |= IZ_EARLY_DEPTH_TEST_BIT;
+
+   /* Classify line antialiasing as never / sometimes / always. */
+   line_aa = AA_NEVER;
+
+   /* _NEW_LINE, _NEW_POLYGON, BRW_NEW_REDUCED_PRIMITIVE */
+   if (brw->attribs.Raster->line_smooth) {
+      if (brw->reduced_primitive == PIPE_PRIM_LINES) {
+	 line_aa = AA_ALWAYS;
+      }
+      else if (brw->reduced_primitive == PIPE_PRIM_TRIANGLES) {
+	 if (brw->attribs.Raster->fill_ccw == PIPE_POLYGON_MODE_LINE) {
+	    line_aa = AA_SOMETIMES;
+
+	    if (brw->attribs.Raster->fill_cw == PIPE_POLYGON_MODE_LINE ||
+		(brw->attribs.Raster->cull_mode == PIPE_WINDING_CW))
+	       line_aa = AA_ALWAYS;
+	 }
+	 else if (brw->attribs.Raster->fill_cw == PIPE_POLYGON_MODE_LINE) {
+	    line_aa = AA_SOMETIMES;
+
+	    if (brw->attribs.Raster->cull_mode == PIPE_WINDING_CCW)
+	       line_aa = AA_ALWAYS;
+	 }
+      }
+   }
+
+   brw_wm_lookup_iz(line_aa,
+		    lookup,
+		    key);
+
+   /* NOTE: the disabled block below uses 'i' and 't', undeclared here. */
+#if 0
+   /* BRW_NEW_SAMPLER 
+    *
+    * Not doing any of this at the moment:
+    */
+   for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
+      const struct pipe_sampler_state *unit = brw->attribs.Samplers[i];
+
+      if (unit) {
+
+	 if (unit->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
+	    key->shadowtex_mask |= 1<<i;
+	 }
+	 if (t->Image[0][t->BaseLevel]->InternalFormat == GL_YCBCR_MESA)
+	    key->yuvtex_mask |= 1<<i;
+      }
+   }
+#endif
+
+
+   /* Extra info:
+    */
+   key->program_string_id = fp->id;
+
+}
+
+
+static void brw_upload_wm_prog( struct brw_context *brw )
+{
+   struct brw_fragment_program *fp =
+      (struct brw_fragment_program *)brw->attribs.FragmentProgram;
+   struct brw_wm_prog_key key;
+
+   brw_wm_populate_key(brw, &key);
+
+   /* Only compile on a cache miss.  On a hit brw_search_cache() was
+    * handed brw->wm.prog_data and prog_gs_offset to fill in (presumably
+    * it does so -- confirm in brw_state_cache.c).
+    */
+   if (!brw_search_cache(&brw->cache[BRW_WM_PROG],
+			 &key, sizeof(key),
+			 &brw->wm.prog_data,
+			 &brw->wm.prog_gs_offset))
+      do_wm_prog(brw, fp, &key);
+}
+
+
+const struct brw_tracked_state brw_wm_prog = {
+   .dirty = {   /* NOTE(review): key also reads depth/stencil + raster state */
+      .brw   = (BRW_NEW_FS |
+		BRW_NEW_REDUCED_PRIMITIVE),
+      .cache = 0
+   },
+   .update = brw_upload_wm_prog
+};
+
diff --git a/src/gallium/drivers/i965simple/brw_wm.h b/src/gallium/drivers/i965simple/brw_wm.h
new file mode 100644
index 0000000000..b29c4393f0
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_wm.h
@@ -0,0 +1,142 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+              
+
+#ifndef BRW_WM_H
+#define BRW_WM_H
+
+
+#include "brw_context.h"
+#include "brw_eu.h"
+
+/* A big lookup table is used to figure out which and how many
+ * additional regs will be inserted before the main payload in the WM
+ * program execution.  These mainly relate to depth and stencil
+ * processing and the early-depth-test optimization.
+ */
+#define IZ_PS_KILL_ALPHATEST_BIT    0x1
+#define IZ_PS_COMPUTES_DEPTH_BIT    0x2
+#define IZ_DEPTH_WRITE_ENABLE_BIT   0x4
+#define IZ_DEPTH_TEST_ENABLE_BIT    0x8
+#define IZ_STENCIL_WRITE_ENABLE_BIT 0x10
+#define IZ_STENCIL_TEST_ENABLE_BIT  0x20
+#define IZ_EARLY_DEPTH_TEST_BIT     0x40
+#define IZ_BIT_MAX                  0x80
+
+#define AA_NEVER     0
+#define AA_SOMETIMES 1
+#define AA_ALWAYS    2
+
+struct brw_wm_prog_key {   /* key used with the BRW_WM_PROG cache */
+   unsigned source_depth_reg:3;
+   unsigned aa_dest_stencil_reg:3;
+   unsigned dest_depth_reg:3;
+   unsigned nr_depth_regs:3;
+   unsigned shadowtex_mask:8;
+   unsigned computes_depth:1;	/* could be derived from program string */
+   unsigned source_depth_to_render_target:1;
+   unsigned runtime_check_aads_emit:1;
+   /* One bit per texture unit (see the "1<<i" in brw_wm.c's #if 0 code): */
+   unsigned yuvtex_mask:8;
+   /* Set from fp->id by brw_wm_populate_key(): */
+   unsigned program_string_id;
+};
+
+
+
+
+
+#define PROGRAM_INTERNAL_PARAM
+#define MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS 1024 /* 72 for GL_ARB_f_p */
+#define BRW_WM_MAX_INSN  (MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS*3 + PIPE_MAX_ATTRIBS + 3)
+#define BRW_WM_MAX_GRF   128		/* hardware limit */
+#define BRW_WM_MAX_VREG  (BRW_WM_MAX_INSN * 4)
+#define BRW_WM_MAX_REF   (BRW_WM_MAX_INSN * 12)
+#define BRW_WM_MAX_PARAM 256
+#define BRW_WM_MAX_CONST 256
+#define BRW_WM_MAX_KILLS MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS
+
+#define PAYLOAD_DEPTH     (PIPE_MAX_ATTRIBS)
+
+#define MAX_IFSN 32
+#define MAX_LOOP_DEPTH 32
+
+struct brw_wm_compile {   /* per-compile state; allocated in do_wm_prog() */
+   struct brw_compile func;
+   struct brw_wm_prog_key key;
+   struct brw_wm_prog_data prog_data; /* result */
+
+   struct brw_fragment_program *fp;
+
+   unsigned grf_limit;
+   unsigned max_wm_grf;
+
+   /* Set to brw_null_reg() by do_wm_prog(); xy regs are filled on first use. */
+   struct brw_reg pixel_xy[2];
+   struct brw_reg delta_xy[2];
+   struct brw_reg pixel_w;
+
+
+   struct brw_reg wm_regs[8][32][4];
+
+   struct brw_reg payload_depth[4];
+   struct brw_reg payload_coef[16];
+
+   struct brw_reg emit_mask_reg;
+
+   struct brw_instruction *if_inst[MAX_IFSN];
+   int if_insn;
+
+   struct brw_instruction *loop_inst[MAX_LOOP_DEPTH];
+   int loop_insn;
+
+   struct brw_instruction *inst0;
+   struct brw_instruction *inst1;
+
+   struct brw_reg stack;
+   struct brw_indirect stack_index;
+   /* alloc_tmp() raises reg_index to at least the temp GRF it hands out. */
+   unsigned reg_index;
+   /* alloc_tmp() returns GRF tmp_start + tmp_index; release_tmps() zeroes it. */
+   unsigned tmp_start;
+   unsigned tmp_index;
+};
+
+
+
+void brw_wm_lookup_iz( unsigned line_aa,
+		       unsigned lookup,
+		       struct brw_wm_prog_key *key );
+
+void brw_wm_glsl_emit(struct brw_wm_compile *c);
+void brw_wm_emit_decls(struct brw_wm_compile *c);
+
+#endif
diff --git a/src/gallium/drivers/i965simple/brw_wm_decl.c b/src/gallium/drivers/i965simple/brw_wm_decl.c
new file mode 100644
index 0000000000..d50e66f613
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_wm_decl.c
@@ -0,0 +1,392 @@
+
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_wm.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+/* Hand out the next scratch GRF above tmp_start and grow reg_index to cover it. */
+static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
+{
+   const unsigned grf = c->tmp_start + ++c->tmp_index;
+   c->reg_index = MAX2(c->reg_index, grf);
+   return brw_vec8_grf(grf, 0);
+}
+/* Forget all temporaries: the next alloc_tmp() starts again at tmp_start + 1. */
+static void release_tmps(struct brw_wm_compile *c)
+{
+   c->tmp_index = 0;
+}
+
+/* Non-zero when 'reg' is the null register of the architecture register file. */
+static int is_null( struct brw_reg reg )
+{
+   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
+      return 0;
+   return reg.nr == BRW_ARF_NULL;
+}
+/* Compute per-pixel X and Y into c->pixel_xy[0..1]; emits code only the first time it is called. */
+static void emit_pixel_xy( struct brw_wm_compile *c )
+{
+   if (is_null(c->pixel_xy[0])) {
+      /* pixel_xy[0] is still the null register, so nothing has been emitted yet. */
+      struct brw_compile *p = &c->func;
+      struct brw_reg r1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
+
+      c->pixel_xy[0] = vec8(retype(alloc_tmp(c), BRW_REGISTER_TYPE_UW));
+      c->pixel_xy[1] = vec8(retype(alloc_tmp(c), BRW_REGISTER_TYPE_UW));
+
+      /* Calculate pixel centers by adding 1 or 0 to each of the
+       * micro-tile coordinates passed in r1.
+       */
+      brw_ADD(p,
+	      c->pixel_xy[0],
+	      stride(suboffset(r1_uw, 4), 2, 4, 0),
+	      brw_imm_v(0x10101010));
+
+      brw_ADD(p,
+	      c->pixel_xy[1],
+	      stride(suboffset(r1_uw, 5), 2, 4, 0),
+	      brw_imm_v(0x11001100));
+   }
+}
+
+
+
+
+
+/* Compute c->delta_xy[0..1] (pixel position minus the origin held in r1) on first use. */
+static void emit_delta_xy( struct brw_wm_compile *c )
+{
+   if (is_null(c->delta_xy[0])) {
+      struct brw_compile *p = &c->func;
+      struct brw_reg r1 = brw_vec1_grf(1, 0);
+
+      emit_pixel_xy(c); /* makes sure c->pixel_xy[] has been computed */
+
+      c->delta_xy[0] = alloc_tmp(c);
+      c->delta_xy[1] = alloc_tmp(c);
+
+      /* Calc delta X,Y by subtracting origin in r1 from the pixel
+       * centers.
+       */
+      brw_ADD(p,
+	      c->delta_xy[0],
+	      retype(c->pixel_xy[0], BRW_REGISTER_TYPE_UW),
+	      negate(r1));
+
+      brw_ADD(p,
+	      c->delta_xy[1],
+	      retype(c->pixel_xy[1], BRW_REGISTER_TYPE_UW),
+	      negate(suboffset(r1,1)));
+   }
+}
+
+
+/* Compiled out: it reads c->coef_wpos, a field struct brw_wm_compile in brw_wm.h does not declare. */
+#if 0
+static void emit_pixel_w( struct brw_wm_compile *c )
+{
+   if (is_null(c->pixel_w)) {
+      struct brw_compile *p = &c->func;
+
+      struct brw_reg interp_wpos = c->coef_wpos;
+      
+      c->pixel_w = alloc_tmp(c);
+
+      emit_delta_xy(c);
+
+      /* Calc 1/w - just linterp wpos[3] optimized by putting the
+       * result straight into a message reg.
+       */
+      struct brw_reg interp3 = brw_vec1_grf(interp_wpos.nr+1, 4);
+      brw_LINE(p, brw_null_reg(), interp3, c->delta_xy[0]);
+      brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), c->delta_xy[1]);
+
+      /* Calc w */
+      brw_math_16( p, 
+		   c->pixel_w,
+		   BRW_MATH_FUNCTION_INV,
+		   BRW_MATH_SATURATE_NONE,
+		   2, 
+		   brw_null_reg(),
+		   BRW_MATH_PRECISION_FULL);
+   }
+}
+#endif
+
+/* Constant interpolation of input 'idx': one MOV per channel enabled in 'mask'. */
+static void emit_cinterp(struct brw_wm_compile *c,
+			 int idx,
+			 int mask )
+{
+   struct brw_compile *p = &c->func;
+   const struct brw_reg coef = c->payload_coef[idx];
+   int chan;
+
+   for (chan = 0; chan < 4; chan++) {
+      /* x,y coefficients sit in GRF coef.nr, z,w in coef.nr+1, at sub-registers 0 and 4. */
+      struct brw_reg plane = brw_vec1_grf(coef.nr + chan / 2, (chan & 1) * 4);
+
+      if (!(mask & (1 << chan)))
+	 continue;
+
+      /* Source is suboffset 3 of this channel's coefficient group. */
+      brw_MOV(p,
+	      c->wm_regs[TGSI_FILE_INPUT][idx][chan],
+	      suboffset(plane, 3));
+   }
+}
+/* Linear interpolation of input 'idx': one LINE + MAC pair per channel enabled in 'mask'. */
+static void emit_linterp(struct brw_wm_compile *c,
+			 int idx,
+			 int mask )
+{
+   struct brw_compile *p = &c->func;
+   const struct brw_reg coef = c->payload_coef[idx];
+   int chan;
+
+   /* Make sure c->delta_xy[] has been computed before it is consumed below. */
+   emit_delta_xy(c);
+
+   for (chan = 0; chan < 4; chan++) {
+      /* x,y coefficients sit in GRF coef.nr, z,w in coef.nr+1, at sub-registers 0 and 4. */
+      struct brw_reg plane = brw_vec1_grf(coef.nr + chan / 2, (chan & 1) * 4);
+
+      if (!(mask & (1 << chan)))
+	 continue;
+
+      /* LINE targets the null register; the MAC that follows writes the input register. */
+      brw_LINE(p, brw_null_reg(), plane, c->delta_xy[0]);
+      brw_MAC(p, c->wm_regs[TGSI_FILE_INPUT][idx][chan],
+	      suboffset(plane, 1), c->delta_xy[1]);
+   }
+}
+/* Compiled out: calls get_delta_xy/get_pixel_w/allocate_reg, which this file does not define. */
+#if 0
+static void emit_pinterp(struct brw_wm_compile *c,
+			 int idx,
+			 int mask )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg interp[4];
+   struct brw_reg coef = c->payload_coef[idx];
+   int i;
+
+   get_delta_xy(c);
+   get_pixel_w(c);
+
+   interp[0] = brw_vec1_grf(coef.nr, 0);
+   interp[1] = brw_vec1_grf(coef.nr, 4);
+   interp[2] = brw_vec1_grf(coef.nr+1, 0);
+   interp[3] = brw_vec1_grf(coef.nr+1, 4);
+
+   for(i = 0; i < 4; i++ ) {
+      if (mask & (1<<i)) {
+	 struct brw_reg dst = allocate_reg(c, TGSI_FILE_INPUT, idx, i);
+	 brw_LINE(p, brw_null_reg(), interp[i], c->delta_xy[0]);
+	 brw_MAC(p, dst, suboffset(interp[i],1), c->delta_xy[1]);
+	 brw_MUL(p, dst, dst, c->pixel_w);
+      }
+   }
+}
+#endif
+
+
+/* Compiled out: takes no parameters yet uses 'c' and 'idx', and calls helpers not defined here. */
+#if 0
+static void emit_wpos( )
+{ 
+   struct prog_dst_register dst = dst_reg(PROGRAM_INPUT, idx);
+   struct tgsi_full_src_register interp = src_reg(PROGRAM_PAYLOAD, idx);
+   struct tgsi_full_src_register deltas = get_delta_xy(c);
+   struct tgsi_full_src_register arg2;
+   unsigned opcode;
+
+   opcode = WM_LINTERP;
+   arg2 = src_undef();
+
+   /* Have to treat wpos.xy specially:
+    */
+   emit_op(c,
+	   WM_WPOSXY,
+	   dst_mask(dst, WRITEMASK_XY),
+	   0, 0, 0,
+	   get_pixel_xy(c),
+	   src_undef(),
+	   src_undef());
+      
+   dst = dst_mask(dst, WRITEMASK_ZW);
+
+   /* PROGRAM_INPUT.attr.xyzw = INTERP payload.interp[attr].x, deltas.xyw
+    */
+   emit_op(c,
+	   WM_LINTERP,
+	   dst,
+	   0, 0, 0,
+	   interp,
+	   deltas,
+	   arg2);
+}
+#endif
+
+
+
+
+/* Perform a totally static register allocation for the fragment program.
+ *
+ * GRFs are handed out in this order, advancing c->reg_index as we go:
+ *  -- depth payload registers (two GRFs per c->key.nr_depth_regs)
+ *  -- CURBE constants, then two GRFs skipped for position coefficients
+ *  -- plane coefficients for interpolation (two GRFs per input)
+ *  -- emit mask and stack registers, then one GRF per input/output channel
+ *
+ * On return c->tmp_start marks where alloc_tmp() temporaries begin.
+ */
+static void prealloc_reg(struct brw_wm_compile *c)
+{
+   int i, j;
+   int nr_curbe_regs = 0;
+
+   /* R0, then some depth related regs:
+    */
+   for (i = 0; i < c->key.nr_depth_regs; i++) {
+      c->payload_depth[i] =  brw_vec8_grf(i*2, 0);
+      c->reg_index += 2;
+   }
+
+
+   /* Then a copy of our part of the CURBE entry:
+    */
+   {
+      int nr_constants = c->fp->info.file_max[TGSI_FILE_CONSTANT] + 1;
+      int index = 0;
+
+      /* XXX number of constants, or highest numbered constant? */
+      assert(nr_constants == c->fp->info.file_count[TGSI_FILE_CONSTANT]);
+
+      c->prog_data.max_const = 4*nr_constants;
+      for (i = 0; i < nr_constants; i++) {
+	 for (j = 0; j < 4; j++, index++) 
+	    c->wm_regs[TGSI_FILE_CONSTANT][i][j] = brw_vec1_grf(c->reg_index + index/8,
+								index%8); /* eight scalars per GRF */
+      }
+
+      nr_curbe_regs = 2*((4*nr_constants+15)/16); /* rounded up to whole pairs of GRFs */
+      c->reg_index += nr_curbe_regs;
+   }
+
+   /* Adjust for parameter coefficients for position, which are
+    * currently always provided.
+    */
+//   c->position_coef[i] = brw_vec8_grf(c->reg_index, 0);
+   c->reg_index += 2;
+
+   /* Next we receive the plane coefficients for parameter
+    * interpolation.  file_max is the highest index, hence the +1
+    * (this must agree with the assert further down):
+    */
+   assert(c->fp->info.file_max[TGSI_FILE_INPUT] + 1 == c->fp->info.num_inputs);
+   for (i = 0; i < c->fp->info.file_max[TGSI_FILE_INPUT] + 1; i++) {
+      c->payload_coef[i] = brw_vec8_grf(c->reg_index, 0);
+      c->reg_index += 2;
+   }
+
+   c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
+   c->prog_data.urb_read_length = (c->fp->info.num_inputs + 1) * 2;
+   c->prog_data.curb_read_length = nr_curbe_regs;
+
+   /* That's the end of the payload, now we can start allocating registers.
+    */
+   c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
+   c->reg_index++;
+
+   c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
+   c->reg_index += 2;
+
+   /* Now allocate room for the interpolated inputs and staging
+    * registers for the outputs:
+    */
+   /* XXX do we want to loop over the _number_ of inputs/outputs or loop
+    * to the highest input/output index that's used?
+    *  Probably the same, actually.
+    */
+   assert(c->fp->info.file_max[TGSI_FILE_INPUT] + 1 == c->fp->info.num_inputs);
+   assert(c->fp->info.file_max[TGSI_FILE_OUTPUT] + 1 == c->fp->info.num_outputs);
+   for (i = 0; i < c->fp->info.file_max[TGSI_FILE_INPUT] + 1; i++) 
+      for (j = 0; j < 4; j++)
+	 c->wm_regs[TGSI_FILE_INPUT][i][j] = brw_vec8_grf( c->reg_index++, 0 );
+
+   for (i = 0; i < c->fp->info.file_max[TGSI_FILE_OUTPUT] + 1; i++) 
+      for (j = 0; j < 4; j++)
+	 c->wm_regs[TGSI_FILE_OUTPUT][i][j] = brw_vec8_grf( c->reg_index++, 0 );
+
+   /* Beyond this we should only need registers for internal temporaries:
+    */
+   c->tmp_start = c->reg_index;
+}
+
+
+
+
+
+/* Preamble of the fragment shader: statically allocate registers and
+ * interpolate every declared input up front.  A more sophisticated
+ * compiler would do the interpolation on demand instead.
+ */
+void brw_wm_emit_decls(struct brw_wm_compile *c)
+{
+   struct tgsi_parse_context parse;
+
+   prealloc_reg(c);
+
+   tgsi_parse_init( &parse, c->fp->program.tokens );
+
+   /* Only the leading run of declaration tokens is examined.
+    */
+   while (!tgsi_parse_end_of_tokens( &parse )) {
+      const struct tgsi_full_declaration *decl;
+      unsigned first, last;
+      unsigned mask;
+      unsigned idx;
+
+      tgsi_parse_token( &parse );
+
+      /* An immediate, an instruction or any other token type ends
+       * the scan.
+       */
+      if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_DECLARATION)
+	 break;
+
+      decl = &parse.FullToken.FullDeclaration;
+
+      /* Declarations of anything but inputs need no code here.
+       */
+      if (decl->Declaration.File != TGSI_FILE_INPUT)
+	 continue;
+
+      first = decl->DeclarationRange.First;
+      last = decl->DeclarationRange.Last;
+      mask = decl->Declaration.UsageMask; /* ? */
+
+      for (idx = first; idx <= last; idx++) {
+	 const unsigned mode = decl->Declaration.Interpolate;
+
+	 if (mode == TGSI_INTERPOLATE_CONSTANT) {
+	    emit_cinterp(c, idx, mask);
+	 }
+	 else if (mode == TGSI_INTERPOLATE_LINEAR ||
+		  mode == TGSI_INTERPOLATE_PERSPECTIVE) {
+	    /* Perspective currently shares the linear path; the
+	     * emit_pinterp() variant in this file is compiled out.
+	     */
+	    emit_linterp(c, idx, mask);
+	 }
+      }
+   }
+
+   tgsi_parse_free (&parse);
+
+   /* Reset the temporary counter used while emitting the preamble. */
+   release_tmps(c);
+}
diff --git a/src/gallium/drivers/i965simple/brw_wm_glsl.c b/src/gallium/drivers/i965simple/brw_wm_glsl.c
new file mode 100644
index 0000000000..db75963932
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_wm_glsl.c
@@ -0,0 +1,1076 @@
+
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_wm.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+
+
+/* Return the lowest channel set in destination 0's write mask, or 4 when the mask is empty. */
+static int get_scalar_dst_index(struct tgsi_full_instruction *inst)
+{
+   const unsigned writemask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   int chan = 0;
+
+   while (chan < 4 && !(writemask & (1 << chan)))
+      chan++;
+   return chan;
+}
+/* Allocate the next temporary GRF (tmp_start + tmp_index) and record it in reg_index. */
+static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
+{
+   c->tmp_index++;
+   c->reg_index = MAX2(c->reg_index, c->tmp_start + c->tmp_index); /* absolute GRF nr, as in brw_wm_decl.c */
+   return brw_vec8_grf(c->tmp_start + c->tmp_index, 0);
+}
+/* Forget all temporaries: the next alloc_tmp() starts again at tmp_start + 1. */
+static void release_tmps(struct brw_wm_compile *c)
+{
+   c->tmp_index = 0;
+}
+
+/* Map a TGSI (file, index, component) triple to its preassigned hardware register. */
+static struct brw_reg
+get_reg(struct brw_wm_compile *c, int file, int index, int component )
+{
+   switch (file) {
+   case TGSI_FILE_CONSTANT:
+   case TGSI_FILE_INPUT:
+   case TGSI_FILE_OUTPUT:
+   case TGSI_FILE_TEMPORARY:
+   case TGSI_FILE_ADDRESS:
+      /* Files backed by the c->wm_regs table.
+       */
+      return c->wm_regs[file][index][component];
+
+   case TGSI_FILE_NULL:
+      break;
+
+   case TGSI_FILE_SAMPLER:
+      /* Should never get here.
+       */
+   case TGSI_FILE_IMMEDIATE:
+      /* These need a different path.
+       */
+   default:
+      /* Unsupported or unknown file: trap in debug builds, and
+       * fall back to the null register otherwise.
+       */
+      assert(0);
+      break;
+   }
+
+   return brw_null_reg();
+}
+
+/* Hardware register backing one channel of the instruction's first destination. */
+static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
+				  struct tgsi_full_instruction *inst, 
+				  int component)
+{
+   const struct tgsi_dst_register *dst = &inst->FullDstRegisters[0].DstRegister;
+
+   /* File and Index select the entry; 'component' picks the channel. */
+   return get_reg(c, dst->File, dst->Index, component);
+}
+/* Fetch the swizzle selector stored for channel (index & 3) of a source register. */
+static int get_swz( struct tgsi_src_register src, int index )
+{
+   const int chan = index & 3;
+
+   if (chan == 0) return src.SwizzleX;
+   if (chan == 1) return src.SwizzleY;
+   if (chan == 2) return src.SwizzleZ;
+
+   return src.SwizzleW;   /* chan == 3 */
+}
+/* Fetch the extended swizzle selector stored for channel (index & 3). */
+static int get_ext_swz( struct tgsi_src_register_ext_swz src, int index )
+{
+   const int chan = index & 3;
+
+   if (chan == 0) return src.ExtSwizzleX;
+   if (chan == 1) return src.ExtSwizzleY;
+   if (chan == 2) return src.ExtSwizzleZ;
+
+   return src.ExtSwizzleW;   /* chan == 3 */
+}
+/* Resolve channel 'index' of a TGSI source operand (swizzles, negates, abs) to a register or immediate. */
+static struct brw_reg get_src_reg(struct brw_wm_compile *c,
+				  struct tgsi_full_src_register *src, 
+				  int index)
+{
+   struct brw_reg reg;
+   int component = index;
+   int neg = 0;
+   int abs = 0;
+
+   if (src->SrcRegister.Negate)
+      neg = 1;
+
+   component = get_swz(src->SrcRegister, component);
+
+   /* Yes, there are multiple negates:
+    */
+   switch (component & 3) {
+   case 0: neg ^= src->SrcRegisterExtSwz.NegateX; break;
+   case 1: neg ^= src->SrcRegisterExtSwz.NegateY; break;
+   case 2: neg ^= src->SrcRegisterExtSwz.NegateZ; break;
+   case 3: neg ^= src->SrcRegisterExtSwz.NegateW; break;
+   }
+
+   /* And multiple swizzles, fun isn't it:
+    */
+   component = get_ext_swz(src->SrcRegisterExtSwz, component);
+
+   /* Not handling indirect lookups yet:
+    */
+   assert(src->SrcRegister.Indirect == 0);
+
+   /* Don't know what dimension means:
+    */
+   assert(src->SrcRegister.Dimension == 0);
+
+   /* Will never handle any of this stuff: 
+    */
+   assert(src->SrcRegisterExtMod.Complement == 0);
+   assert(src->SrcRegisterExtMod.Bias == 0);
+   assert(src->SrcRegisterExtMod.Scale2X == 0);
+
+   if (src->SrcRegisterExtMod.Absolute)
+      abs = 1;
+
+   /* Another negate!  This is a post-absolute negate, which we
+    * can't do.  Need to clean the crap out of tgsi somehow.
+    */
+   assert(src->SrcRegisterExtMod.Negate == 0);
+
+   switch( component ) {
+   case TGSI_EXTSWIZZLE_X:
+   case TGSI_EXTSWIZZLE_Y:
+   case TGSI_EXTSWIZZLE_Z:
+   case TGSI_EXTSWIZZLE_W:
+      reg = get_reg(c, 
+		    src->SrcRegister.File, 
+		    src->SrcRegister.Index, 
+		    component );
+
+      if (neg) 
+	 reg = negate(reg);
+   
+      if (abs)
+	 reg = brw_abs(reg);
+
+      break;
+
+      /* XXX: this won't really work in the general case, but we know
+       * that the extended swizzle is only allowed in the SWZ
+       * instruction (right??), in which case using an immediate
+       * directly will work.
+       */
+   case TGSI_EXTSWIZZLE_ZERO:
+      reg = brw_imm_f(0);
+      break;
+
+   case TGSI_EXTSWIZZLE_ONE:
+      if (neg && !abs)
+	 reg = brw_imm_f(-1.0);
+      else
+	 reg = brw_imm_f(1.0);
+      break;
+
+   default:
+      assert(0);
+      reg = brw_null_reg(); /* keep the result defined when asserts are compiled out, as get_reg() does */
+      break;
+   }
+
+   return reg;
+}
+/* ABS: one MOV of brw_abs(src0) per channel in the write mask, under the saturate setting. */
+static void emit_abs( struct brw_wm_compile *c,
+		      struct tgsi_full_instruction *inst)
+{
+   struct brw_compile *p = &c->func;
+   const unsigned writemask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   struct brw_reg dst, src;
+   int chan;
+
+   brw_set_saturate(p, inst->Instruction.Saturate != TGSI_SAT_NONE);
+   for (chan = 0; chan < 4; chan++) {
+      if (!(writemask & (1 << chan)))
+	 continue;
+      dst = get_dst_reg(c, inst, chan);
+      src = get_src_reg(c, &inst->FullSrcRegisters[0], chan);
+      brw_MOV(p, dst, brw_abs(src)); /* NOTE */
+   }
+   brw_set_saturate(p, 0);
+}
+
+/* XPD: per enabled channel, MUL(null, -src0[i2], src1[i1]) then MAC(dst, src0[i1], src1[i2]). */
+static void emit_xpd(struct brw_wm_compile *c,
+		     struct tgsi_full_instruction *inst)
+{
+   int i;
+   struct brw_compile *p = &c->func;
+   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   for (i = 0; i < 4; i++) { /* NOTE(review): i == 3 (W) yields the same i1/i2 as i == 0 -- confirm intended */
+      unsigned i2 = (i+2)%3;
+      unsigned i1 = (i+1)%3;
+      if (mask & (1<<i)) {
+	 struct brw_reg src0, src1, dst;
+	 dst = get_dst_reg(c, inst, i);
+	 src0 = negate(get_src_reg(c, &inst->FullSrcRegisters[0], i2));
+	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i1);
+	 brw_MUL(p, brw_null_reg(), src0, src1);
+	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i1);
+	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i2);
+	 brw_set_saturate(p, inst->Instruction.Saturate != TGSI_SAT_NONE); /* saturate only the MAC that writes dst */
+	 brw_MAC(p, dst, src0, src1);
+	 brw_set_saturate(p, 0);
+      }
+   }
+   brw_set_saturate(p, 0);
+}
+/* DP3: a MUL and two MACs over channels 0..2; only the final MAC writes dst (first enabled channel). */
+static void emit_dp3(struct brw_wm_compile *c,
+		     struct tgsi_full_instruction *inst)
+{
+   struct brw_reg src0[3], src1[3], dst;
+   int i;
+   struct brw_compile *p = &c->func;
+   for (i = 0; i < 3; i++) {
+      src0[i] = get_src_reg(c, &inst->FullSrcRegisters[0], i);
+      src1[i] = get_src_reg(c, &inst->FullSrcRegisters[1], i);
+   }
+
+   dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
+   brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
+   brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
+   brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0); /* applies to the last MAC only */
+   brw_MAC(p, dst, src0[2], src1[2]);
+   brw_set_saturate(p, 0);
+}
+/* DP4: a MUL and three MACs over channels 0..3; only the final MAC writes dst (first enabled channel). */
+static void emit_dp4(struct brw_wm_compile *c,
+		     struct tgsi_full_instruction *inst)
+{
+   struct brw_reg src0[4], src1[4], dst;
+   int i;
+   struct brw_compile *p = &c->func;
+   for (i = 0; i < 4; i++) {
+      src0[i] = get_src_reg(c, &inst->FullSrcRegisters[0], i);
+      src1[i] = get_src_reg(c, &inst->FullSrcRegisters[1], i);
+   }
+   dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
+   brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
+   brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
+   brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
+   brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0); /* applies to the last MAC only */
+   brw_MAC(p, dst, src0[3], src1[3]);
+   brw_set_saturate(p, 0);
+}
+/* DPH: src0.xyz . src1.xyz + src1.w, written to the first enabled destination channel. */
+static void emit_dph(struct brw_wm_compile *c,
+		     struct tgsi_full_instruction *inst)
+{
+   struct brw_reg src0[4], src1[4], dst;
+   int i;
+   struct brw_compile *p = &c->func;
+   for (i = 0; i < 4; i++) {
+      src0[i] = get_src_reg(c, &inst->FullSrcRegisters[0], i);
+      src1[i] = get_src_reg(c, &inst->FullSrcRegisters[1], i);
+   }
+   dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
+   brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
+   brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
+   brw_MAC(p, dst, src0[2], src1[2]); /* dst receives the three-component sum */
+   brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
+   brw_ADD(p, dst, dst, src1[3]); /* was src0.w + src1.w, which discarded the sum above */
+   brw_set_saturate(p, 0);
+}
+/* One-operand math: stage channel 0 of src0 in message register 2, then issue brw_math(). */
+static void emit_math1(struct brw_wm_compile *c,
+		       struct tgsi_full_instruction *inst, unsigned func)
+{
+   struct brw_compile *p = &c->func;
+   const unsigned msg_reg_nr = 2;
+   struct brw_reg arg, result;
+   unsigned saturate;
+
+   arg = get_src_reg(c, &inst->FullSrcRegisters[0], 0);
+   result = get_dst_reg(c, inst, get_scalar_dst_index(inst));
+
+   if (inst->Instruction.Saturate != TGSI_SAT_NONE)
+      saturate = BRW_MATH_SATURATE_SATURATE;
+   else
+      saturate = BRW_MATH_SATURATE_NONE;
+
+   brw_MOV(p, brw_message_reg(msg_reg_nr), arg);
+   brw_math(p, result, func, saturate, msg_reg_nr, brw_null_reg(),
+	    BRW_MATH_DATA_VECTOR, BRW_MATH_PRECISION_FULL);
+}
+
+/* Emit the two-source ALU 'opcode' once per channel enabled in the write mask. */
+static void emit_alu2(struct brw_wm_compile *c,		      
+		      struct tgsi_full_instruction *inst,
+		      unsigned opcode)
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg dst, src0, src1;
+   const unsigned writemask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   int chan;
+   brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
+   for (chan = 0; chan < 4; chan++) {
+      if (!(writemask & (1 << chan)))
+	 continue;
+      dst = get_dst_reg(c, inst, chan);
+      src0 = get_src_reg(c, &inst->FullSrcRegisters[0], chan);
+      src1 = get_src_reg(c, &inst->FullSrcRegisters[1], chan);
+      brw_alu2(p, opcode, dst, src0, src1);
+   }
+   brw_set_saturate(p, 0); /* back to no saturation for later instructions */
+}
+
+/* Emit the one-source ALU 'opcode' once per channel enabled in the write mask. */
+static void emit_alu1(struct brw_wm_compile *c,
+		      struct tgsi_full_instruction *inst,
+		      unsigned opcode)
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg dst, src0;
+   const unsigned writemask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   int chan;
+   brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
+   for (chan = 0; chan < 4; chan++) {
+      if (!(writemask & (1 << chan)))
+	 continue;
+      dst = get_dst_reg(c, inst, chan);
+      src0 = get_src_reg(c, &inst->FullSrcRegisters[0], chan);
+      brw_alu1(p, opcode, dst, src0);
+   }
+   if (inst->Instruction.Saturate != TGSI_SAT_NONE) /* only reset what was set above */
+      brw_set_saturate(p, 0);
+}
+
+/* MAX: per enabled channel, MOV dst<-src0, CMP (src0 L src1), then a predicated MOV dst<-src1. */
+static void emit_max(struct brw_wm_compile *c,
+		     struct tgsi_full_instruction *inst)
+{
+   struct brw_compile *p = &c->func;
+   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   struct brw_reg src0, src1, dst;
+   int i;
+   brw_push_insn_state(p);
+   for (i = 0; i < 4; i++) {
+      if (mask & (1<<i)) {
+	 dst = get_dst_reg(c, inst, i);
+	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i);
+	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i);
+	 brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
+	 brw_MOV(p, dst, src0);
+	 brw_set_saturate(p, 0); /* the CMP below is emitted without saturation */
+
+	 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src0, src1);
+	 brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
+	 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+	 brw_MOV(p, dst, src1);
+	 brw_set_saturate(p, 0);
+	 brw_set_predicate_control_flag_value(p, 0xff);
+      }
+   }
+   brw_pop_insn_state(p);
+}
+/* MIN: per enabled channel, MOV dst<-src0, CMP (src1 L src0), then a predicated MOV dst<-src1. */
+static void emit_min(struct brw_wm_compile *c,
+		     struct tgsi_full_instruction *inst)
+{
+   struct brw_compile *p = &c->func;
+   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   struct brw_reg src0, src1, dst;
+   int i;
+   brw_push_insn_state(p);
+   for (i = 0; i < 4; i++) {
+      if (mask & (1<<i)) {
+	 dst = get_dst_reg(c, inst, i);
+	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i);
+	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i);
+	 brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
+	 brw_MOV(p, dst, src0);
+	 brw_set_saturate(p, 0); /* the CMP below is emitted without saturation */
+
+	 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
+	 brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
+	 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+	 brw_MOV(p, dst, src1);
+	 brw_set_saturate(p, 0);
+	 brw_set_predicate_control_flag_value(p, 0xff);
+      }
+   }
+   brw_pop_insn_state(p);
+}
+/* POW: stage channel 0 of src0 and src1 in message registers 2 and 3, then issue brw_math(). */
+static void emit_pow(struct brw_wm_compile *c,
+		     struct tgsi_full_instruction *inst)
+{
+   struct brw_compile *p = &c->func;
+   const unsigned msg_reg_nr = 2;
+   struct brw_reg base, exponent, result;
+   unsigned saturate;
+
+   result = get_dst_reg(c, inst, get_scalar_dst_index(inst));
+   base = get_src_reg(c, &inst->FullSrcRegisters[0], 0);
+   exponent = get_src_reg(c, &inst->FullSrcRegisters[1], 0);
+
+   if (inst->Instruction.Saturate != TGSI_SAT_NONE)
+      saturate = BRW_MATH_SATURATE_SATURATE;
+   else
+      saturate = BRW_MATH_SATURATE_NONE;
+
+   /* Operands go in consecutive message registers starting at msg_reg_nr. */
+   brw_MOV(p, brw_message_reg(msg_reg_nr), base);
+   brw_MOV(p, brw_message_reg(msg_reg_nr + 1), exponent);
+   brw_math(p, result, BRW_MATH_FUNCTION_POW, saturate, msg_reg_nr, brw_null_reg(),
+	    BRW_MATH_DATA_VECTOR, BRW_MATH_PRECISION_FULL);
+}
+/* LRP: per enabled channel, ADD dst = 1 - src0; MUL(null, dst, src2); MAC(dst, src0, src1). */
+static void emit_lrp(struct brw_wm_compile *c,
+		     struct tgsi_full_instruction *inst)
+{
+   struct brw_compile *p = &c->func;
+   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
+   int i;
+   for (i = 0; i < 4; i++) {
+      if (mask & (1<<i)) {
+	 dst = get_dst_reg(c, inst, i);
+	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i);
+
+	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i);
+
+	 if (src1.nr == dst.nr) { /* copy aside: the ADD below overwrites dst first */
+	    tmp1 = alloc_tmp(c);
+	    brw_MOV(p, tmp1, src1);
+	 } else
+	    tmp1 = src1;
+
+	 src2 = get_src_reg(c, &inst->FullSrcRegisters[2], i);
+	 if (src2.nr == dst.nr) { /* same precaution for src2 */
+	    tmp2 = alloc_tmp(c);
+	    brw_MOV(p, tmp2, src2);
+	 } else
+	    tmp2 = src2;
+
+	 brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
+	 brw_MUL(p, brw_null_reg(), dst, tmp2);
+	 brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
+	 brw_MAC(p, dst, src0, tmp1);
+	 brw_set_saturate(p, 0);
+      }
+      release_tmps(c); /* runs every iteration, whether or not the channel was enabled */
+   }
+}
+/* KIL: with masking disabled, emit NOT(mask reg 1) into emit_mask_reg and AND it into r0.0 (UW). */
+static void emit_kil(struct brw_wm_compile *c)
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
+   brw_push_insn_state(p);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
+   brw_AND(p, depth, c->emit_mask_reg, depth);
+   brw_pop_insn_state(p);
+}
+/* MAD: per enabled channel, MUL dst = src0 * src1 followed by ADD dst = dst + src2. */
+static void emit_mad(struct brw_wm_compile *c,
+		     struct tgsi_full_instruction *inst)
+{
+   struct brw_compile *p = &c->func;
+   const unsigned writemask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   struct brw_reg dst, src0, src1, src2;
+   int chan;
+
+   for (chan = 0; chan < 4; chan++) {
+      if (!(writemask & (1 << chan)))
+	 continue;
+      dst = get_dst_reg(c, inst, chan);
+      src0 = get_src_reg(c, &inst->FullSrcRegisters[0], chan);
+      src1 = get_src_reg(c, &inst->FullSrcRegisters[1], chan);
+      src2 = get_src_reg(c, &inst->FullSrcRegisters[2], chan);
+      brw_MUL(p, dst, src0, src1);
+
+      brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0); /* only the ADD saturates */
+      brw_ADD(p, dst, dst, src2);
+      brw_set_saturate(p, 0);
+   }
+}
+/* Set-on-compare: per enabled channel, CMP src0 'cond' src1, MOV 0.0 to dst, then predicated MOV 1.0. */
+static void emit_sop(struct brw_wm_compile *c,
+		     struct tgsi_full_instruction *inst, unsigned cond)
+{
+   struct brw_compile *p = &c->func;
+   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   struct brw_reg dst, src0, src1;
+   int i;
+
+   brw_push_insn_state(p);
+   for (i = 0; i < 4; i++) {
+      if (mask & (1<<i)) {
+	 dst = get_dst_reg(c, inst, i);
+	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i);
+	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i);
+	 brw_CMP(p, brw_null_reg(), cond, src0, src1);
+	 brw_set_predicate_control(p, BRW_PREDICATE_NONE); /* the 0.0 store is unpredicated */
+	 brw_MOV(p, dst, brw_imm_f(0.0));
+	 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL); /* the 1.0 store is predicated */
+	 brw_MOV(p, dst, brw_imm_f(1.0));
+      }
+   }
+   brw_pop_insn_state(p);
+}
+
+/* DDX: per enabled channel, MOV dst from a scalar in GRF src0.nr / src0.nr+1, then MUL by 'w'. */
+static void emit_ddx(struct brw_wm_compile *c,
+		     struct tgsi_full_instruction *inst)
+{
+   struct brw_compile *p = &c->func;
+   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   struct brw_reg interp[4];
+   struct brw_reg dst;
+   struct brw_reg src0, w;
+   unsigned nr, i;
+   src0 = get_src_reg(c, &inst->FullSrcRegisters[0], 0);
+   w = get_src_reg(c, &inst->FullSrcRegisters[1], 3); /* NOTE(review): needs a second source operand -- confirm */
+   nr = src0.nr;
+   interp[0] = brw_vec1_grf(nr, 0);
+   interp[1] = brw_vec1_grf(nr, 4);
+   interp[2] = brw_vec1_grf(nr+1, 0);
+   interp[3] = brw_vec1_grf(nr+1, 4);
+   brw_set_saturate(p, inst->Instruction.Saturate != TGSI_SAT_NONE);
+   for(i = 0; i < 4; i++ ) {
+      if (mask & (1<<i)) {
+	 dst = get_dst_reg(c, inst, i);
+	 brw_MOV(p, dst, interp[i]);
+	 brw_MUL(p, dst, dst, w);
+      }
+   }
+   brw_set_saturate(p, 0);
+}
+/* DDY: like emit_ddx() but the MOV source is suboffset 1 of each scalar. */
+static void emit_ddy(struct brw_wm_compile *c,
+		     struct tgsi_full_instruction *inst)
+{
+   struct brw_compile *p = &c->func;
+   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   struct brw_reg interp[4];
+   struct brw_reg dst;
+   struct brw_reg src0, w;
+   unsigned nr, i;
+
+   src0 = get_src_reg(c, &inst->FullSrcRegisters[0], 0);
+   nr = src0.nr;
+   w = get_src_reg(c, &inst->FullSrcRegisters[1], 3); /* NOTE(review): needs a second source operand -- confirm */
+   interp[0] = brw_vec1_grf(nr, 0);
+   interp[1] = brw_vec1_grf(nr, 4);
+   interp[2] = brw_vec1_grf(nr+1, 0);
+   interp[3] = brw_vec1_grf(nr+1, 4);
+   brw_set_saturate(p, inst->Instruction.Saturate != TGSI_SAT_NONE);
+   for(i = 0; i < 4; i++ ) {
+      if (mask & (1<<i)) {
+	 dst = get_dst_reg(c, inst, i);
+	 brw_MOV(p, dst, suboffset(interp[i], 1));
+	 brw_MUL(p, dst, dst, w);
+      }
+   }
+   brw_set_saturate(p, 0);
+}
+
+/* TODO
+   BIAS on SIMD8 not working yet; the whole body is compiled out below,
+   so this function currently emits nothing. */
+static void emit_txb(struct brw_wm_compile *c,
+		     struct tgsi_full_instruction *inst)
+{
+#if 0
+   struct brw_compile *p = &c->func;
+   struct brw_reg payload_reg = c->payload_depth[0];
+   struct brw_reg dst[4], src[4];
+   unsigned i;
+   for (i = 0; i < 4; i++)
+      dst[i] = get_dst_reg(c, inst, i);
+   for (i = 0; i < 4; i++)
+      src[i] = get_src_reg(c, &inst->FullSrcRegisters[0], i);
+
+#if 0
+   switch (inst->TexSrcTarget) {
+   case TEXTURE_1D_INDEX:
+      brw_MOV(p, brw_message_reg(2), src[0]);
+      brw_MOV(p, brw_message_reg(3), brw_imm_f(0));
+      brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
+      break;
+   case TEXTURE_2D_INDEX:
+   case TEXTURE_RECT_INDEX:
+      brw_MOV(p, brw_message_reg(2), src[0]);
+      brw_MOV(p, brw_message_reg(3), src[1]);
+      brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
+      break;
+   default:
+      brw_MOV(p, brw_message_reg(2), src[0]);
+      brw_MOV(p, brw_message_reg(3), src[1]);
+      brw_MOV(p, brw_message_reg(4), src[2]);
+      break;
+   }
+#else
+   brw_MOV(p, brw_message_reg(2), src[0]);
+   brw_MOV(p, brw_message_reg(3), src[1]);
+   brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
+#endif
+
+   brw_MOV(p, brw_message_reg(5), src[3]);
+   brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
+   brw_SAMPLE(p,
+	      retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
+	      1,
+	      retype(payload_reg, BRW_REGISTER_TYPE_UW),
+	      inst->TexSrcUnit + 1, /* surface */
+	      inst->TexSrcUnit,     /* sampler */
+	      inst->FullDstRegisters[0].DstRegister.WriteMask,
+	      BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS,
+	      4,
+	      4,
+	      0);
+#endif
+}
+/* TEX: the whole body is compiled out (#if 0), so this function currently emits nothing. */
+static void emit_tex(struct brw_wm_compile *c,
+		     struct tgsi_full_instruction *inst)
+{
+#if 0
+   struct brw_compile *p = &c->func;
+   struct brw_reg payload_reg = c->payload_depth[0];
+   struct brw_reg dst[4], src[4];
+   unsigned msg_len;
+   unsigned i, nr;
+   unsigned emit;
+   boolean shadow = (c->key.shadowtex_mask & (1<<inst->TexSrcUnit)) ? 1 : 0;
+
+   for (i = 0; i < 4; i++)
+      dst[i] = get_dst_reg(c, inst, i);
+   for (i = 0; i < 4; i++)
+      src[i] = get_src_reg(c, &inst->FullSrcRegisters[0], i);
+
+#if 0
+   switch (inst->TexSrcTarget) {
+   case TEXTURE_1D_INDEX:
+      emit = WRITEMASK_X;
+      nr = 1;
+      break;
+   case TEXTURE_2D_INDEX:
+   case TEXTURE_RECT_INDEX:
+      emit = WRITEMASK_XY;
+      nr = 2;
+      break;
+   default:
+      emit = WRITEMASK_XYZ;
+      nr = 3;
+      break;
+   }
+#else
+   emit = WRITEMASK_XY;
+   nr = 2;
+#endif
+
+   msg_len = 1;
+
+   for (i = 0; i < nr; i++) {
+      static const unsigned swz[4] = {0,1,2,2};
+      if (emit & (1<<i))
+	 brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
+      else
+	 brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
+      msg_len += 1;
+   }
+
+   if (shadow) {
+      brw_MOV(p, brw_message_reg(5), brw_imm_f(0));
+      brw_MOV(p, brw_message_reg(6), src[2]);
+   }
+
+   brw_SAMPLE(p,
+	      retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
+	      1,
+	      retype(payload_reg, BRW_REGISTER_TYPE_UW),
+	      inst->TexSrcUnit + 1, /* surface */
+	      inst->TexSrcUnit,     /* sampler */
+	      inst->FullDstRegisters[0].DstRegister.WriteMask,
+	      BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE,
+	      4,
+	      shadow ? 6 : 4,
+	      0);
+
+   if (shadow)
+      brw_MOV(p, dst[3], brw_imm_f(1.0));
+#endif
+}
+
+
+
+
+
+
+
+/* Emit the framebuffer write: copy output 0 and r1 into message registers, then brw_fb_WRITE. 'inst' is unused. */
+static void emit_fb_write(struct brw_wm_compile *c,
+			  struct tgsi_full_instruction *inst)
+{
+   struct brw_compile *p = &c->func;
+   int nr = 2;
+   int channel;
+   int base_reg = 0;
+
+   // src0 = output color
+   // src1 = payload_depth[0]
+   // src2 = output depth
+   // dst = ???
+
+
+
+   /* Reserve a space for AA - may not be needed:
+    */
+   if (c->key.aa_dest_stencil_reg)
+      nr += 1;
+
+   {
+      brw_push_insn_state(p);
+      for (channel = 0; channel < 4; channel++) {
+	 struct brw_reg src0 = c->wm_regs[TGSI_FILE_OUTPUT][0][channel];
+
+	 /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
+	 /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
+	 brw_MOV(p, brw_message_reg(nr + channel), src0);
+      }
+      /* skip over the regs populated above: */
+      nr += 8; /* NOTE(review): the loop above fills only four message registers -- confirm the 8 */
+      brw_pop_insn_state(p);
+   }
+    
+
+   /* Pass through control information:
+    */
+   /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
+   {
+      brw_push_insn_state(p);
+      brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
+      brw_MOV(p,
+	      brw_message_reg(base_reg + 1),
+	      brw_vec8_grf(1, 0));
+      brw_pop_insn_state(p);
+   }
+
+   /* Send framebuffer write message: */
+   brw_fb_WRITE(p,
+		retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
+		base_reg,
+		retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
+		0,              /* render surface always 0 */
+		nr,
+		0,
+		1);
+
+}
+
+/* Emit brw code for one TGSI instruction.  IF/ELSE/ENDIF and
+ * BGNFOR/ENDFOR are paired through the if_inst and loop_inst stacks in
+ * \p c; an opcode with no case below is only reported via debug_printf().
+ */
+static void brw_wm_emit_instruction( struct brw_wm_compile *c,
+				     struct tgsi_full_instruction *inst )
+{
+   struct brw_compile *p = &c->func;
+
+#if 0
+   if (inst->CondUpdate)
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+   else
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
+#else
+   brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
+#endif
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_ABS:
+      emit_abs(c, inst);
+      break;
+   case TGSI_OPCODE_ADD:
+      emit_alu2(c, inst, BRW_OPCODE_ADD);
+      break;
+   case TGSI_OPCODE_SUB:
+      assert(0);
+      /* not implemented: emit_alu2(c, inst, BRW_OPCODE_SUB); */
+      break;
+   case TGSI_OPCODE_FRC:
+      emit_alu1(c, inst, BRW_OPCODE_FRC);
+      break;
+   case TGSI_OPCODE_FLR:
+      assert(0);
+      /* not implemented: emit_alu1(c, inst, BRW_OPCODE_FLR); */
+      break;
+   case TGSI_OPCODE_LRP:
+      emit_lrp(c, inst);
+      break;
+   case TGSI_OPCODE_INT:
+      emit_alu1(c, inst, BRW_OPCODE_RNDD);
+      break;
+   case TGSI_OPCODE_MOV:
+      emit_alu1(c, inst, BRW_OPCODE_MOV);
+      break;
+   case TGSI_OPCODE_DP3:
+      emit_dp3(c, inst);
+      break;
+   case TGSI_OPCODE_DP4:
+      emit_dp4(c, inst);
+      break;
+   case TGSI_OPCODE_XPD:
+      emit_xpd(c, inst);
+      break;
+   case TGSI_OPCODE_DPH:
+      emit_dph(c, inst);
+      break;
+   case TGSI_OPCODE_RCP:
+      emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
+      break;
+   case TGSI_OPCODE_RSQ:
+      emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
+      break;
+   case TGSI_OPCODE_SIN:
+      emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
+      break;
+   case TGSI_OPCODE_COS:
+      emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
+      break;
+   case TGSI_OPCODE_EX2:
+      emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
+      break;
+   case TGSI_OPCODE_LG2:
+      emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
+      break;
+   case TGSI_OPCODE_MAX:
+      emit_max(c, inst);
+      break;
+   case TGSI_OPCODE_MIN:
+      emit_min(c, inst);
+      break;
+   case TGSI_OPCODE_DDX:
+      emit_ddx(c, inst);
+      break;
+   case TGSI_OPCODE_DDY:
+      emit_ddy(c, inst);
+      break;
+   case TGSI_OPCODE_SLT:
+      emit_sop(c, inst, BRW_CONDITIONAL_L);
+      break;
+   case TGSI_OPCODE_SLE:
+      emit_sop(c, inst, BRW_CONDITIONAL_LE);
+      break;
+   case TGSI_OPCODE_SGT:
+      emit_sop(c, inst, BRW_CONDITIONAL_G);
+      break;
+   case TGSI_OPCODE_SGE:
+      emit_sop(c, inst, BRW_CONDITIONAL_GE);
+      break;
+   case TGSI_OPCODE_SEQ:
+      emit_sop(c, inst, BRW_CONDITIONAL_EQ);
+      break;
+   case TGSI_OPCODE_SNE:
+      emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
+      break;
+   case TGSI_OPCODE_MUL:
+      emit_alu2(c, inst, BRW_OPCODE_MUL);
+      break;
+   case TGSI_OPCODE_POW:
+      emit_pow(c, inst);
+      break;
+   case TGSI_OPCODE_MAD:
+      emit_mad(c, inst);
+      break;
+   case TGSI_OPCODE_TEX:
+      emit_tex(c, inst);
+      break;
+   case TGSI_OPCODE_TXB:
+      emit_txb(c, inst);
+      break;
+   case TGSI_OPCODE_TEXKILL:
+      emit_kil(c);
+      break;
+   case TGSI_OPCODE_IF:
+      assert(c->if_insn < MAX_IFSN);
+      c->if_inst[c->if_insn++] = brw_IF(p, BRW_EXECUTE_8);
+      break;
+   case TGSI_OPCODE_ELSE:
+      assert(c->if_insn > 0);   /* ELSE needs an open IF to patch */
+      c->if_inst[c->if_insn-1] = brw_ELSE(p, c->if_inst[c->if_insn-1]);
+      break;
+   case TGSI_OPCODE_ENDIF:
+      assert(c->if_insn > 0);
+      brw_ENDIF(p, c->if_inst[--c->if_insn]);
+      break;
+   case TGSI_OPCODE_BGNSUB:
+   case TGSI_OPCODE_ENDSUB:
+      break;
+   case TGSI_OPCODE_CAL:
+      brw_push_insn_state(p);
+      brw_set_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_access_mode(p, BRW_ALIGN_1);
+      brw_ADD(p, deref_1ud(c->stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
+      brw_set_access_mode(p, BRW_ALIGN_16);
+      brw_ADD(p,
+	      get_addr_reg(c->stack_index),
+	      get_addr_reg(c->stack_index), brw_imm_d(4));
+      /* disabled: orig_inst = inst->Data; */
+      /* disabled: orig_inst->Data = &p->store[p->nr_insn]; */
+      assert(0);
+      brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
+      brw_pop_insn_state(p);
+      break;
+   case TGSI_OPCODE_RET:
+#if 0
+      brw_push_insn_state(p);
+      brw_set_mask_control(p, BRW_MASK_DISABLE);
+      brw_ADD(p,
+	      get_addr_reg(c->stack_index),
+	      get_addr_reg(c->stack_index), brw_imm_d(-4));
+      brw_set_access_mode(p, BRW_ALIGN_1);
+      brw_MOV(p, brw_ip_reg(), deref_1ud(c->stack_index, 0));
+      brw_set_access_mode(p, BRW_ALIGN_16);
+      brw_pop_insn_state(p);
+#else
+      emit_fb_write(c, inst);
+#endif
+      break;
+   case TGSI_OPCODE_BGNFOR:
+      c->loop_inst[c->loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
+      break;
+   case TGSI_OPCODE_BRK:
+      brw_BREAK(p);
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+      break;
+   case TGSI_OPCODE_CONT:
+      brw_CONT(p);
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+      break;
+   case TGSI_OPCODE_ENDFOR:
+      assert(c->loop_insn > 0);   /* ENDFOR needs an open BGNFOR */
+      c->loop_insn--;
+      c->inst0 = c->inst1 = brw_WHILE(p, c->loop_inst[c->loop_insn]);
+      /* patch all the BREAK instructions from
+         last BGNFOR */
+      while (c->inst0 > c->loop_inst[c->loop_insn]) {
+	 c->inst0--;
+	 if (c->inst0->header.opcode == BRW_OPCODE_BREAK) {
+	    c->inst0->bits3.if_else.jump_count = c->inst1 - c->inst0 + 1;
+	    c->inst0->bits3.if_else.pop_count = 0;
+	 } else if (c->inst0->header.opcode == BRW_OPCODE_CONTINUE) {
+	    c->inst0->bits3.if_else.jump_count = c->inst1 - c->inst0;
+	    c->inst0->bits3.if_else.pop_count = 0;
+	 }
+      }
+      break;
+   case TGSI_OPCODE_END:
+      emit_fb_write(c, inst);
+      break;
+
+   default:
+      debug_printf("unsupported IR in fragment shader %d\n",
+		   inst->Instruction.Opcode);
+   }
+#if 0
+   if (inst->CondUpdate)
+      brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+   else
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+#endif
+}
+
+/* Translate the TGSI token stream of c->fp into brw instructions in
+ * c->func, then store c->reg_index as c->prog_data.total_grf. */
+void brw_wm_glsl_emit(struct brw_wm_compile *c)
+{
+   struct tgsi_parse_context parse;
+   struct brw_compile *p = &c->func;
+
+   brw_init_compile(&c->func);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+
+   c->reg_index = 0;
+   c->if_insn = 0;
+   c->loop_insn = 0;
+   c->stack_index = brw_indirect(0,0);
+
+   /* Do static register allocation and parameter interpolation:
+    */
+   brw_wm_emit_decls( c );
+
+   /* Emit the actual program.  All done with very direct translation,
+    * hopefully we can improve on this shortly...
+    */
+   brw_MOV(p, get_addr_reg(c->stack_index), brw_address(c->stack));
+
+   tgsi_parse_init( &parse, c->fp->program.tokens );
+
+   while( !tgsi_parse_end_of_tokens( &parse ) ) 
+   {
+      tgsi_parse_token( &parse );
+
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+	 /* already done by brw_wm_emit_decls() above */
+	 break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         /* immediates are not handled yet */
+	 assert(0);
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         brw_wm_emit_instruction(c, &parse.FullToken.FullInstruction);
+         break;
+
+      default:
+         assert( 0 );
+      }
+   }
+
+   tgsi_parse_free (&parse);
+   
+   /* Fix up call targets:
+    */
+#if 0 /* NOTE(review): reads program.Base.Instructions, not TGSI tokens - looks like a leftover, confirm */
+   {
+      unsigned nr_insns = c->fp->program.Base.NumInstructions;
+      unsigned insn, target_insn;
+      struct tgsi_full_instruction *inst1, *inst2;
+      struct brw_instruction *brw_inst1, *brw_inst2;
+      int offset;
+      for (insn = 0; insn < nr_insns; insn++) {
+	 inst1 = &c->fp->program.Base.Instructions[insn];
+	 brw_inst1 = inst1->Data;
+	 switch (inst1->Opcode) {
+	 case TGSI_OPCODE_CAL:
+	    target_insn = inst1->BranchTarget;
+	    inst2 = &c->fp->program.Base.Instructions[target_insn];
+	    brw_inst2 = inst2->Data;
+	    offset = brw_inst2 - brw_inst1;
+	    brw_set_src1(brw_inst1, brw_imm_d(offset*16));
+	    break;
+	 default:
+	    break;
+	 }
+      }
+   }
+#endif
+
+   c->prog_data.total_grf = c->reg_index;
+   c->prog_data.total_scratch = 0;
+}
diff --git a/src/gallium/drivers/i965simple/brw_wm_iz.c b/src/gallium/drivers/i965simple/brw_wm_iz.c
new file mode 100644
index 0000000000..6c5f25bf39
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_wm_iz.c
@@ -0,0 +1,214 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "brw_wm.h"
+
+
+#undef P			/* promoted depth */
+#undef C			/* computed */
+#undef N			/* non-promoted? */
+
+#define P 0
+#define C 1
+#define N 2
+/* One row per IZ_* lookup value; brw_wm_lookup_iz() below reads every field except mode. */
+const struct {
+   unsigned mode:2;		/* P, C or N as defined above */
+   unsigned sd_present:1;	/* allocate source_depth_reg */
+   unsigned sd_to_rt:1;		/* set source_depth_to_render_target */
+   unsigned dd_present:1;	/* allocate dest_depth_reg */
+   unsigned ds_present:1;	/* allocate aa_dest_stencil_reg */
+} wm_iz_table[IZ_BIT_MAX] =
+{
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { C, 0, 1, 0, 0 }, 
+ { C, 0, 1, 0, 0 }, 
+ { C, 1, 1, 0, 0 }, 
+ { C, 1, 1, 0, 0 }, 
+ { C, 0, 1, 0, 0 }, 
+ { C, 0, 1, 0, 0 }, 
+ { C, 1, 1, 1, 0 }, 
+ { C, 1, 1, 1, 0 }, 
+ { C, 0, 1, 1, 0 }, 
+ { C, 0, 1, 1, 0 }, 
+ { C, 1, 1, 1, 0 }, 
+ { C, 1, 1, 1, 0 }, 
+ { C, 0, 1, 1, 0 }, 
+ { C, 0, 1, 1, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { C, 0, 1, 0, 0 }, 
+ { C, 0, 1, 0, 0 }, 
+ { C, 1, 1, 0, 0 }, 
+ { C, 1, 1, 0, 0 }, 
+ { C, 0, 1, 0, 0 }, 
+ { C, 0, 1, 0, 0 }, 
+ { C, 1, 1, 1, 0 }, 
+ { C, 1, 1, 1, 0 }, 
+ { C, 0, 1, 1, 0 }, 
+ { C, 0, 1, 1, 0 }, 
+ { C, 1, 1, 1, 0 }, 
+ { C, 1, 1, 1, 0 }, 
+ { C, 0, 1, 1, 0 }, 
+ { C, 0, 1, 1, 0 }, 
+ { C, 0, 0, 0, 1 }, 
+ { C, 0, 0, 0, 1 }, 
+ { C, 0, 1, 0, 1 }, 
+ { C, 0, 1, 0, 1 }, 
+ { C, 1, 1, 0, 1 }, 
+ { C, 1, 1, 0, 1 }, 
+ { C, 0, 1, 0, 1 }, 
+ { C, 0, 1, 0, 1 }, 
+ { C, 1, 1, 1, 1 }, 
+ { C, 1, 1, 1, 1 }, 
+ { C, 0, 1, 1, 1 }, 
+ { C, 0, 1, 1, 1 }, 
+ { C, 1, 1, 1, 1 }, 
+ { C, 1, 1, 1, 1 }, 
+ { C, 0, 1, 1, 1 }, 
+ { C, 0, 1, 1, 1 }, 
+ { C, 0, 0, 0, 1 }, 
+ { C, 0, 0, 0, 1 }, 
+ { C, 0, 1, 0, 1 }, 
+ { C, 0, 1, 0, 1 }, 
+ { C, 1, 1, 0, 1 }, 
+ { C, 1, 1, 0, 1 }, 
+ { C, 0, 1, 0, 1 }, 
+ { C, 0, 1, 0, 1 }, 
+ { C, 1, 1, 1, 1 }, 
+ { C, 1, 1, 1, 1 }, 
+ { C, 0, 1, 1, 1 }, 
+ { C, 0, 1, 1, 1 }, 
+ { C, 1, 1, 1, 1 }, 
+ { C, 1, 1, 1, 1 }, 
+ { C, 0, 1, 1, 1 }, 
+ { C, 0, 1, 1, 1 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { N, 1, 1, 0, 0 }, 
+ { N, 0, 1, 0, 0 }, 
+ { N, 0, 1, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { C, 0, 1, 1, 0 }, 
+ { C, 0, 1, 1, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { N, 1, 1, 0, 0 }, 
+ { C, 0, 1, 1, 0 }, 
+ { C, 0, 1, 1, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { N, 1, 1, 0, 0 }, 
+ { N, 0, 1, 0, 0 }, 
+ { N, 0, 1, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { C, 0, 1, 1, 0 }, 
+ { C, 0, 1, 1, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { N, 1, 1, 0, 0 }, 
+ { C, 0, 1, 1, 0 }, 
+ { C, 0, 1, 1, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { N, 1, 1, 0, 1 }, 
+ { N, 0, 1, 0, 1 }, 
+ { N, 0, 1, 0, 1 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { C, 0, 1, 1, 1 }, 
+ { C, 0, 1, 1, 1 }, 
+ { P, 0, 0, 0, 0 }, 
+ { N, 1, 1, 0, 1 }, 
+ { C, 0, 1, 1, 1 }, 
+ { C, 0, 1, 1, 1 }, 
+ { P, 0, 0, 0, 0 }, 
+ { C, 0, 0, 0, 1 }, 
+ { P, 0, 0, 0, 0 }, 
+ { C, 0, 1, 0, 1 }, 
+ { P, 0, 0, 0, 0 }, 
+ { C, 1, 1, 0, 1 }, 
+ { C, 0, 1, 0, 1 }, 
+ { C, 0, 1, 0, 1 }, 
+ { P, 0, 0, 0, 0 }, 
+ { C, 1, 1, 1, 1 }, 
+ { C, 0, 1, 1, 1 }, 
+ { C, 0, 1, 1, 1 }, 
+ { P, 0, 0, 0, 0 }, 
+ { C, 1, 1, 1, 1 }, 
+ { C, 0, 1, 1, 1 }, 
+ { C, 0, 1, 1, 1 } 
+};
+
+/* Derive the depth/stencil register layout in \p key from row \p lookup
+ * of wm_iz_table and the line AA mode \p line_aa.  Register numbers are
+ * handed out upwards from 2: source depth, AA/dest stencil, dest depth.
+ */
+void brw_wm_lookup_iz( unsigned line_aa,
+		       unsigned lookup,
+		       struct brw_wm_prog_key *key )
+{
+   unsigned next_reg = 2;
+
+   assert (lookup < IZ_BIT_MAX);
+
+   if (lookup & IZ_PS_COMPUTES_DEPTH_BIT)
+      key->computes_depth = 1;
+   if (wm_iz_table[lookup].sd_to_rt)
+      key->source_depth_to_render_target = 1;
+
+   if (wm_iz_table[lookup].sd_present) {
+      key->source_depth_reg = next_reg;
+      next_reg += 2;
+   }
+   if (line_aa != AA_NEVER || wm_iz_table[lookup].ds_present) {
+      key->aa_dest_stencil_reg = next_reg++;
+      key->runtime_check_aads_emit = (line_aa == AA_SOMETIMES &&
+				      !wm_iz_table[lookup].ds_present);
+   }
+   if (wm_iz_table[lookup].dd_present) {
+      key->dest_depth_reg = next_reg;
+      next_reg += 2;
+   }
+
+   key->nr_depth_regs = (next_reg + 1) / 2;
+}
+
diff --git a/src/gallium/drivers/i965simple/brw_wm_sampler_state.c b/src/gallium/drivers/i965simple/brw_wm_sampler_state.c
new file mode 100644
index 0000000000..52b2909a65
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_wm_sampler_state.c
@@ -0,0 +1,275 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+
+#define COMPAREFUNC_ALWAYS		0
+#define COMPAREFUNC_NEVER		0x1
+#define COMPAREFUNC_LESS		0x2
+#define COMPAREFUNC_EQUAL		0x3
+#define COMPAREFUNC_LEQUAL		0x4
+#define COMPAREFUNC_GREATER		0x5
+#define COMPAREFUNC_NOTEQUAL		0x6
+#define COMPAREFUNC_GEQUAL		0x7
+
+/* Samplers aren't strictly wm state from the hardware's perspective,
+ * but that is the only situation in which we use them in this driver.
+ */
+
+/* Translate a PIPE_FUNC_* compare function for the sampler.  Note that
+ * every function maps to the complement of its operand-swapped form
+ * (LESS -> LEQUAL, EQUAL -> NOTEQUAL, NEVER -> ALWAYS, ...).
+ * NOTE(review): presumably matches how the hardware evaluates the shadow
+ * test - confirm before "fixing".  Unknown values log and give NEVER.
+ */
+static int intel_translate_shadow_compare_func(unsigned func)
+{
+   int hw_func = COMPAREFUNC_NEVER;
+   switch (func) {
+   case PIPE_FUNC_NEVER:    hw_func = COMPAREFUNC_ALWAYS;   break;
+   case PIPE_FUNC_LESS:     hw_func = COMPAREFUNC_LEQUAL;   break;
+   case PIPE_FUNC_LEQUAL:   hw_func = COMPAREFUNC_LESS;     break;
+   case PIPE_FUNC_GREATER:  hw_func = COMPAREFUNC_GEQUAL;   break;
+   case PIPE_FUNC_GEQUAL:   hw_func = COMPAREFUNC_GREATER;  break;
+   case PIPE_FUNC_NOTEQUAL: hw_func = COMPAREFUNC_EQUAL;    break;
+   case PIPE_FUNC_EQUAL:    hw_func = COMPAREFUNC_NOTEQUAL; break;
+   case PIPE_FUNC_ALWAYS:   hw_func = COMPAREFUNC_NEVER;    break;
+   default:
+      debug_printf("Unknown value in %s: %x\n", __FUNCTION__, func);
+      break;
+   }
+   return hw_func;
+}
+
+/* The brw (and related graphics cores) do not support GL_CLAMP.  The
+ * Intel drivers for "other operating systems" implement GL_CLAMP as
+ * GL_CLAMP_TO_EDGE, so the same is done here.
+ */
+static unsigned translate_wrap_mode( int wrap )
+{
+   /* PIPE_TEX_WRAP_REPEAT and any value not listed below wrap. */
+   unsigned mode = BRW_TEXCOORDMODE_WRAP;
+
+   /* Plain CLAMP is given the same mode as CLAMP_TO_EDGE (conform likes
+    * it this way). */
+   if (wrap == PIPE_TEX_WRAP_CLAMP ||
+       wrap == PIPE_TEX_WRAP_CLAMP_TO_EDGE)
+      mode = BRW_TEXCOORDMODE_CLAMP;
+   else if (wrap == PIPE_TEX_WRAP_CLAMP_TO_BORDER)
+      mode = BRW_TEXCOORDMODE_CLAMP_BORDER;
+   else if (wrap == PIPE_TEX_WRAP_MIRROR_REPEAT)
+      mode = BRW_TEXCOORDMODE_MIRROR;
+
+   return mode;
+}
+
+/* Unsigned fixed point: value scaled by 2^frac_bits, negatives give 0. */
+static unsigned U_FIXED(float value, unsigned frac_bits)
+{
+   const float scaled = value * (float)(1 << frac_bits);
+   return scaled < 0.0f ? 0.0f : scaled;
+}
+
+/* Signed fixed point: value scaled by 2^frac_bits, truncated to int. */
+static int S_FIXED(float value, unsigned frac_bits)
+{
+   return (int)(value * (float)(1 << frac_bits));
+}
+
+static unsigned upload_default_color( struct brw_context *brw,
+                                      const float *color )
+{
+   struct brw_sampler_default_color sdc;
+   /* Stage the caller's colour in the hardware struct ... */
+   COPY_4V(sdc.color, color);
+   /* ... and hand it to the state cache; the caller treats the result as an offset. */
+   return brw_cache_data( &brw->cache[BRW_SAMPLER_DEFAULT_COLOR], &sdc );
+}
+
+
+/* Fill \p sampler with the hardware sampler state for \p pipe_sampler.
+ * \p sdc_gs_offset is the value returned when the default (border) colour
+ * was uploaded; it is stored in ss2 shifted right by 5.
+ */
+static void brw_update_sampler_state( const struct pipe_sampler_state *pipe_sampler,
+				      unsigned sdc_gs_offset,
+				      struct brw_sampler_state *sampler)
+{
+   memset(sampler, 0, sizeof(*sampler));
+
+   /* Minification filter: keyed on min_img_filter.  min_mip_filter uses the
+    * separate PIPE_TEX_MIPFILTER_* enum and is handled by the next switch. */
+   switch (pipe_sampler->min_img_filter) {
+   case PIPE_TEX_FILTER_NEAREST:
+      sampler->ss0.min_filter = BRW_MAPFILTER_NEAREST;
+      break;
+   case PIPE_TEX_FILTER_LINEAR:
+      sampler->ss0.min_filter = BRW_MAPFILTER_LINEAR;
+      break;
+   case PIPE_TEX_FILTER_ANISO:
+      sampler->ss0.min_filter = BRW_MAPFILTER_ANISOTROPIC;
+      break;
+   default:
+      break;
+   }
+
+   /* Filter between mip levels: */
+   switch (pipe_sampler->min_mip_filter) {
+   case PIPE_TEX_MIPFILTER_NEAREST:
+      sampler->ss0.mip_filter = BRW_MIPFILTER_NEAREST;
+      break;
+   case PIPE_TEX_MIPFILTER_LINEAR:
+      sampler->ss0.mip_filter = BRW_MIPFILTER_LINEAR;
+      break;
+   case PIPE_TEX_MIPFILTER_NONE:
+      sampler->ss0.mip_filter = BRW_MIPFILTER_NONE;
+      break;
+   default:
+      break;
+   }
+
+   /* Magnification filter (an aniso request is given linear): */
+   switch (pipe_sampler->mag_img_filter) {
+   case PIPE_TEX_FILTER_NEAREST:
+      sampler->ss0.mag_filter = BRW_MAPFILTER_NEAREST;
+      break;
+   case PIPE_TEX_FILTER_LINEAR:
+   case PIPE_TEX_FILTER_ANISO:
+      sampler->ss0.mag_filter = BRW_MAPFILTER_LINEAR;
+      break;
+   default:
+      break;
+   }
+
+   /* Set Anisotropy.  MIN2 clamps the ratio to BRW_ANISORATIO_16; MAX2 would
+    * instead force every value up to at least that setting. */
+   if (pipe_sampler->max_anisotropy > 2.0) {
+      sampler->ss3.max_aniso = MIN2((pipe_sampler->max_anisotropy - 2) / 2,
+                                    BRW_ANISORATIO_16);
+   }
+
+   sampler->ss1.s_wrap_mode = translate_wrap_mode(pipe_sampler->wrap_s);
+   sampler->ss1.r_wrap_mode = translate_wrap_mode(pipe_sampler->wrap_r);
+   sampler->ss1.t_wrap_mode = translate_wrap_mode(pipe_sampler->wrap_t);
+
+   /* Fulsim complains if I don't do this.  Hardware doesn't mind: */
+#if 0
+   if (texObj->Target == GL_TEXTURE_CUBE_MAP_ARB) {
+      sampler->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CUBE;
+      sampler->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CUBE;
+      sampler->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CUBE;
+   }
+#endif
+
+   /* Set shadow function: */
+   if (pipe_sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
+      /* Shadowing is "enabled" by emitting a particular sampler
+       * message (sample_c).  So need to recompile WM program when
+       * shadow comparison is enabled on each/any texture unit.
+       */
+      sampler->ss0.shadow_function = intel_translate_shadow_compare_func(pipe_sampler->compare_func);
+   }
+
+   /* Set LOD bias: */
+   sampler->ss0.lod_bias = S_FIXED(CLAMP(pipe_sampler->lod_bias, -16, 15), 6);
+   sampler->ss0.lod_preclamp = 1; /* OpenGL mode */
+   sampler->ss0.default_color_mode = 0; /* OpenGL/DX10 mode */
+
+   /* Set BaseMipLevel, MaxLOD, MinLOD:
+    *
+    * XXX: I don't think that using firstLevel, lastLevel works,
+    * because we always setup the surface state as if firstLevel ==
+    * level zero.  Probably have to subtract firstLevel from each of
+    * these:
+    */
+   sampler->ss0.base_level = U_FIXED(0, 1);
+   sampler->ss1.max_lod = U_FIXED(MIN2(MAX2(pipe_sampler->max_lod, 0), 13), 6);
+   sampler->ss1.min_lod = U_FIXED(MIN2(MAX2(pipe_sampler->min_lod, 0), 13), 6);
+
+   sampler->ss2.default_color_pointer = sdc_gs_offset >> 5;
+}
+
+
+
+/* All samplers must be uploaded in a single contiguous array, which
+ * complicates various things.  However, this is still too confusing -
+ * FIXME: simplify all the different new texture state flags.
+ */
+static void upload_wm_samplers(struct brw_context *brw)
+{
+   unsigned nr_active = 0;
+   unsigned i;
+
+   /* BRW_NEW_SAMPLER */
+   for (i = 0; i < brw->num_textures && i < brw->num_samplers; i++) {
+      const struct pipe_sampler_state *state;
+      unsigned sdc_offset;
+
+      /* A unit counts as enabled only when a texture is bound to it. */
+      if (!brw->attribs.Texture[i])
+         continue;
+
+      state = brw->attribs.Samplers[i];
+      sdc_offset = upload_default_color(brw, state->border_color);
+      brw_update_sampler_state(state, sdc_offset, &brw->wm.sampler[i]);
+      /* The count runs up to and including the highest enabled unit. */
+      nr_active = i + 1;
+   }
+
+   if (nr_active != brw->wm.sampler_count) {
+      brw->wm.sampler_count = nr_active;
+      brw->state.dirty.cache |= CACHE_NEW_SAMPLER;
+   }
+
+   /* The offset stays 0 unless there is at least one sampler to upload. */
+   brw->wm.sampler_gs_offset = 0;
+   if (brw->wm.sampler_count)
+      brw->wm.sampler_gs_offset =
+	 brw_cache_data_sz(&brw->cache[BRW_SAMPLER],
+			   brw->wm.sampler,
+			   sizeof(struct brw_sampler_state) * brw->wm.sampler_count);
+}
+
+const struct brw_tracked_state brw_wm_samplers = {
+   .dirty = {	/* NOTE(review): presumably the flags that trigger .update - confirm in the state upload code */
+      .brw = BRW_NEW_SAMPLER,
+      .cache = 0
+   },
+   .update = upload_wm_samplers	/* rebuilds brw->wm.sampler[] and its cached copy */
+};
+
diff --git a/src/gallium/drivers/i965simple/brw_wm_state.c b/src/gallium/drivers/i965simple/brw_wm_state.c
new file mode 100644
index 0000000000..37a9bf919c
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_wm_state.c
@@ -0,0 +1,195 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+#include "brw_wm.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+/***********************************************************************
+ * WM unit - fragment programs and rasterization
+ */
+static void upload_wm_unit(struct brw_context *brw )
+{
+   struct brw_wm_unit_state wm;
+   unsigned max_threads;
+   unsigned per_thread;
+
+   if (BRW_DEBUG & DEBUG_SINGLE_THREAD)
+      max_threads = 0;
+   else
+      max_threads = 31;
+
+   /* Build the unit state from scratch; fields not assigned below stay zero. */
+   memset(&wm, 0, sizeof(wm));
+
+   /* CACHE_NEW_WM_PROG */
+   wm.thread0.grf_reg_count = align(brw->wm.prog_data->total_grf, 16) / 16 - 1;
+   wm.thread0.kernel_start_pointer = brw->wm.prog_gs_offset >> 6;
+   wm.thread3.dispatch_grf_start_reg = brw->wm.prog_data->first_curbe_grf;
+   wm.thread3.urb_entry_read_length = brw->wm.prog_data->urb_read_length;
+   wm.thread3.const_urb_entry_read_length = brw->wm.prog_data->curb_read_length;
+
+   wm.wm5.max_threads = max_threads; /* 0 or 31; the disabled scratch code below sizes for max_threads + 1 */
+
+   per_thread = align(brw->wm.prog_data->total_scratch, 1024);
+   assert(per_thread <= 12 * 1024);
+
+#if 0
+   if (brw->wm.prog_data->total_scratch) {
+      unsigned total = per_thread * (max_threads + 1);
+
+      /* Scratch space -- just have to make sure there is sufficient
+       * allocated for the active program and current number of threads.
+       */
+      brw->wm.scratch_buffer_size = total;
+      if (brw->wm.scratch_buffer &&
+	  brw->wm.scratch_buffer_size > brw->wm.scratch_buffer->size) {
+	 dri_bo_unreference(brw->wm.scratch_buffer);
+	 brw->wm.scratch_buffer = NULL;
+      }
+      if (!brw->wm.scratch_buffer) {
+	 brw->wm.scratch_buffer = dri_bo_alloc(intel->intelScreen->bufmgr,
+					       "wm scratch",
+					       brw->wm.scratch_buffer_size,
+					       4096, DRM_BO_FLAG_MEM_TT);
+      }
+   }
+   /* XXX: Scratch buffers are not implemented correctly.
+    *
+    * The scratch offset to be programmed into wm is relative to the general
+    * state base address.  However, using dri_bo_alloc/dri_bo_emit_reloc (or
+    * the previous bmGenBuffers scheme), we get an offset relative to the
+    * start of framebuffer.  Even before then, it was broken in other ways,
+    * so just fail for now if we hit that path.
+    */
+   assert(brw->wm.prog_data->total_scratch == 0);
+#endif
+
+   /* CACHE_NEW_SURFACE */
+   wm.thread1.binding_table_entry_count = brw->wm.nr_surfaces;
+
+   /* BRW_NEW_CURBE_OFFSETS */
+   wm.thread3.const_urb_entry_read_offset = brw->curbe.wm_start * 2;
+
+   wm.thread3.urb_entry_read_offset = 0;
+   wm.thread1.depth_coef_urb_read_offset = 1;
+   wm.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
+
+   /* CACHE_NEW_SAMPLER.  NOTE(review): (count + 1) / 4 is not a round-up to groups of four - confirm the encoding */
+   wm.wm4.sampler_count = (brw->wm.sampler_count + 1) / 4;
+   wm.wm4.sampler_state_pointer = brw->wm.sampler_gs_offset >> 5;
+
+   /* BRW_NEW_FS (the flag listed in brw_wm_unit for the fragment program) */
+   {
+      const struct brw_fragment_program *fp = brw->attribs.FragmentProgram;
+
+      if (fp->UsesDepth)
+	 wm.wm5.program_uses_depth = 1; /* as far as we can tell */
+
+      if (fp->info.writes_z)
+	 wm.wm5.program_computes_depth = 1;
+
+      /* BRW_NEW_ALPHA_TEST */
+      if (fp->info.uses_kill ||
+	  brw->attribs.DepthStencil->alpha.enabled)
+	 wm.wm5.program_uses_killpixel = 1;
+
+      wm.wm5.enable_8_pix = 1;
+   }
+
+   wm.wm5.thread_dispatch_enable = 1;	/* AKA: color_write */
+   wm.wm5.legacy_line_rast = 0;
+   wm.wm5.legacy_global_depth_bias = 0;
+   wm.wm5.early_depth_test = 1;	        /* never need to disable */
+   wm.wm5.line_aa_region_width = 0;
+   wm.wm5.line_endcap_aa_region_width = 1;
+
+   /* BRW_NEW_RASTERIZER */
+   if (brw->attribs.Raster->poly_stipple_enable)
+      wm.wm5.polygon_stipple = 1;
+
+#if 0
+   if (brw->attribs.Polygon->OffsetFill) {
+      wm.wm5.depth_offset = 1;
+      /* Something weird going on with legacy_global_depth_bias,
+       * offset_constant, scaling and MRD.  This value passes glean
+       * but gives some odd results elsewhere (eg. the
+       * quad-offset-units test).
+       */
+      wm.global_depth_offset_constant = brw->attribs.Polygon->OffsetUnits * 2;
+
+      /* This is the only value that passes glean:
+       */
+      wm.global_depth_offset_scale = brw->attribs.Polygon->OffsetFactor;
+   }
+#endif
+
+   if (brw->attribs.Raster->line_stipple_enable) {
+      wm.wm5.line_stipple = 1;
+   }
+
+   if (BRW_DEBUG & DEBUG_STATS)
+      wm.wm4.stats_enable = 1;
+
+   brw->wm.state_gs_offset = brw_cache_data( &brw->cache[BRW_WM_UNIT], &wm );
+
+   if (brw->wm.prog_data->total_scratch) {
+      /*
+      dri_emit_reloc(brw->cache[BRW_WM_UNIT].pool->buffer,
+		     DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ | DRM_BO_FLAG_WRITE,
+		     (per_thread / 1024) - 1,
+		     brw->wm.state_gs_offset +
+		     ((char *)&wm.thread2 - (char *)&wm),
+		     brw->wm.scratch_buffer);
+      */
+   } else { /* NOTE(review): wm is a local already passed to brw_cache_data() above and not read again, so this store looks dead */
+      wm.thread2.scratch_space_base_pointer = 0;
+   }
+}
+
+const struct brw_tracked_state brw_wm_unit = {
+   .dirty = {	/* the same flag names appear as section comments inside upload_wm_unit() */
+      .brw = (BRW_NEW_RASTERIZER |
+	      BRW_NEW_ALPHA_TEST |
+	      BRW_NEW_FS |
+	      BRW_NEW_CURBE_OFFSETS),
+
+      .cache = (CACHE_NEW_SURFACE |
+		CACHE_NEW_WM_PROG |
+		CACHE_NEW_SAMPLER)
+   },
+   .update = upload_wm_unit	/* builds and caches the brw_wm_unit_state */
+};
+
diff --git a/src/gallium/drivers/i965simple/brw_wm_surface_state.c b/src/gallium/drivers/i965simple/brw_wm_surface_state.c
new file mode 100644
index 0000000000..b5b9e0e702
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_wm_surface_state.c
@@ -0,0 +1,305 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+
+/* Map a gallium texture target onto the BRW_SURFACE_* surface type.
+ * A target without a case trips assert(0) and yields 0.
+ */
+static unsigned translate_tex_target( enum pipe_texture_target target )
+{
+   unsigned surface_type = 0;
+
+   switch (target) {
+   case PIPE_TEXTURE_1D:   surface_type = BRW_SURFACE_1D;   break;
+   case PIPE_TEXTURE_2D:   surface_type = BRW_SURFACE_2D;   break;
+   case PIPE_TEXTURE_3D:   surface_type = BRW_SURFACE_3D;   break;
+   case PIPE_TEXTURE_CUBE: surface_type = BRW_SURFACE_CUBE; break;
+
+   default:
+      assert(0);
+      break;
+   }
+
+   return surface_type;
+}
+
+/* Translate a gallium pipe_format into the BRW_SURFACEFORMAT_* code used
+ * for texture surfaces; formats without a case assert and yield 0. */
+static unsigned translate_tex_format( enum pipe_format pipe_format )
+{
+   unsigned hw_format = 0;
+
+   switch( pipe_format ) {
+   case PIPE_FORMAT_L8_UNORM:
+      hw_format = BRW_SURFACEFORMAT_L8_UNORM;
+      break;
+   case PIPE_FORMAT_I8_UNORM:
+      hw_format = BRW_SURFACEFORMAT_I8_UNORM;
+      break;
+   case PIPE_FORMAT_A8_UNORM:
+      hw_format = BRW_SURFACEFORMAT_A8_UNORM;
+      break;
+   case PIPE_FORMAT_A8L8_UNORM:
+      hw_format = BRW_SURFACEFORMAT_L8A8_UNORM;
+      break;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      assert(0);		/* not supported for sampling */
+      hw_format = BRW_SURFACEFORMAT_R8G8B8_UNORM;
+      break;
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      hw_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+      break;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      hw_format = BRW_SURFACEFORMAT_R8G8B8A8_UNORM;
+      break;
+   case PIPE_FORMAT_R5G6B5_UNORM:
+      hw_format = BRW_SURFACEFORMAT_B5G6R5_UNORM;
+      break;
+   case PIPE_FORMAT_A1R5G5B5_UNORM:
+      hw_format = BRW_SURFACEFORMAT_B5G5R5A1_UNORM;
+      break;
+   case PIPE_FORMAT_A4R4G4B4_UNORM:
+      hw_format = BRW_SURFACEFORMAT_B4G4R4A4_UNORM;
+      break;
+   case PIPE_FORMAT_YCBCR_REV:
+      hw_format = BRW_SURFACEFORMAT_YCRCB_NORMAL;
+      break;
+   case PIPE_FORMAT_YCBCR:
+      hw_format = BRW_SURFACEFORMAT_YCRCB_SWAPUVY;
+      break;
+#if 0
+   case PIPE_FORMAT_RGB_FXT1:
+   case PIPE_FORMAT_RGBA_FXT1:
+      return BRW_SURFACEFORMAT_FXT1;
+#endif
+   case PIPE_FORMAT_Z16_UNORM:
+      hw_format = BRW_SURFACEFORMAT_I16_UNORM;
+      break;
+#if 0
+   case PIPE_FORMAT_RGB_DXT1:
+       return BRW_SURFACEFORMAT_DXT1_RGB;
+   case PIPE_FORMAT_RGBA_DXT1:
+       return BRW_SURFACEFORMAT_BC1_UNORM;
+   case PIPE_FORMAT_RGBA_DXT3:
+       return BRW_SURFACEFORMAT_BC2_UNORM;
+   case PIPE_FORMAT_RGBA_DXT5:
+       return BRW_SURFACEFORMAT_BC3_UNORM;
+   case PIPE_FORMAT_SRGBA8:
+      return BRW_SURFACEFORMAT_R8G8B8A8_UNORM_SRGB;
+   case PIPE_FORMAT_SRGB_DXT1:
+      return BRW_SURFACEFORMAT_BC1_UNORM_SRGB;
+#endif
+   default:
+      assert(0);		/* hw_format stays 0 */
+   }
+   return hw_format;
+}
+
+static unsigned brw_buffer_offset(struct brw_context *brw,
+                                  struct pipe_buffer *buffer)
+{
+   /* Delegates to the winsys get_buffer_offset hook; the last argument is
+    * always 0 here (NOTE(review): its meaning is defined by the winsys). */
+   return brw->winsys->get_buffer_offset( brw->winsys, buffer, 0 );
+}
+
+/* Build the surface state for the texture bound to `unit` and record its
+ * brw_cache_data() offset in brw->wm.bind.surf_ss_offset[unit + 1]. */
+static
+void brw_update_texture_surface( struct brw_context *brw,
+				 unsigned unit )
+{
+   const struct brw_texture *tex = brw->attribs.Texture[unit];
+   const unsigned is_cube = (tex->base.target == PIPE_TEXTURE_CUBE) ? 1 : 0;
+   struct brw_surface_state ss;
+
+   memset(&ss, 0, sizeof(ss));
+
+   /* ss0: type, format, mip layout; all six faces enabled for cube maps. */
+   ss.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW;
+   ss.ss0.surface_type = translate_tex_target(tex->base.target);
+   ss.ss0.surface_format = translate_tex_format(tex->base.format);
+   ss.ss0.cube_pos_x = is_cube;
+   ss.ss0.cube_pos_y = is_cube;
+   ss.ss0.cube_pos_z = is_cube;
+   ss.ss0.cube_neg_x = is_cube;
+   ss.ss0.cube_neg_y = is_cube;
+   ss.ss0.cube_neg_z = is_cube;
+
+   /* Left disabled; noted as ok for all textures with channel width 8bit
+    * or less:  ss.ss0.data_return_format = BRW_SURFACERETURNFORMAT_S1; */
+
+   /* ss1: buffer offset as returned by brw_buffer_offset(). */
+   ss.ss1.base_addr = brw_buffer_offset( brw, tex->buffer );
+
+   /* ss2/ss3: width, height, depth and pitch are stored minus one. */
+   ss.ss2.mip_count = tex->base.last_level;
+   ss.ss2.width = tex->base.width[0] - 1;
+   ss.ss2.height = tex->base.height[0] - 1;
+   ss.ss3.depth = tex->base.depth[0] - 1;
+   ss.ss3.pitch = tex->stride - 1;
+   ss.ss3.tile_walk = BRW_TILEWALK_XMAJOR;
+   ss.ss3.tiled_surface = 0; /* always zero */
+
+   ss.ss4.min_lod = 0;
+
+   brw->wm.bind.surf_ss_offset[unit + 1] =
+      brw_cache_data( &brw->cache[BRW_SS_SURFACE], &ss );
+}
+
+
+
+#define OFFSET(TYPE, FIELD) ( (unsigned)&(((TYPE *)0)->FIELD) ) /* byte offset of FIELD within TYPE */
+
+
+/* Build the WM surface states: slot 0 is colour buffer 0 (or a null
+ * surface when it is missing), slot i + 1 is texture i.  The offsets in
+ * brw->wm.bind are then cached to produce brw->wm.bind_ss_offset.
+ */
+static void upload_wm_surfaces(struct brw_context *brw )
+{
+   unsigned i;
+
+   {
+      struct brw_surface_state surf;
+
+      /* BRW_NEW_FRAMEBUFFER */
+      struct pipe_surface *pipe_surface = brw->attribs.FrameBuffer.cbufs[0];/*fixme*/
+
+      memset(&surf, 0, sizeof(surf));
+
+      if (pipe_surface != NULL) {
+         /* pipe_surface may be NULL, so its backing texture must not be
+          * read before this check; fetch it only inside the branch. */
+         struct brw_texture *tex = (struct brw_texture *)pipe_surface->texture;
+
+         /* A block size of 4 selects B8G8R8A8, anything else B5G6R5. */
+         if (pipe_surface->block.size == 4)
+            surf.ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+         else
+            surf.ss0.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM;
+
+         surf.ss0.surface_type = BRW_SURFACE_2D;
+         surf.ss1.base_addr = brw_buffer_offset( brw, tex->buffer );
+
+         /* Width, height and pitch are stored minus one. */
+         surf.ss2.width = pipe_surface->width - 1;
+         surf.ss2.height = pipe_surface->height - 1;
+         surf.ss3.tile_walk = BRW_TILEWALK_XMAJOR;
+         surf.ss3.tiled_surface = 0;
+         surf.ss3.pitch = pipe_surface->stride - 1;
+      } else {
+         surf.ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+         surf.ss0.surface_type = BRW_SURFACE_NULL;
+      }
+
+      /* BRW_NEW_BLEND */
+      surf.ss0.color_blend = (!brw->attribs.Blend->logicop_enable &&
+                              brw->attribs.Blend->blend_enable);
+
+      /* A channel is write-disabled when its colormask bit is clear. */
+      surf.ss0.writedisable_red =   !(brw->attribs.Blend->colormask & PIPE_MASK_R);
+      surf.ss0.writedisable_green = !(brw->attribs.Blend->colormask & PIPE_MASK_G);
+      surf.ss0.writedisable_blue =  !(brw->attribs.Blend->colormask & PIPE_MASK_B);
+      surf.ss0.writedisable_alpha = !(brw->attribs.Blend->colormask & PIPE_MASK_A);
+
+      brw->wm.bind.surf_ss_offset[0] = brw_cache_data( &brw->cache[BRW_SS_SURFACE], &surf );
+      brw->wm.nr_surfaces = 1;
+   }
+
+   /* BRW_NEW_TEXTURE: a unit gets a surface only when its texture pointer
+    * is set and its reference count is non-zero; otherwise its slot is 0. */
+   for (i = 0; i < brw->num_textures && i < brw->num_samplers; i++) {
+      const struct brw_texture *texUnit = brw->attribs.Texture[i];
+
+      if (texUnit &&
+          texUnit->base.reference.count/*(texUnit->reference.count > 0) == really used */) {
+         brw_update_texture_surface(brw, i);
+         brw->wm.nr_surfaces = i+2;
+      }
+      else {
+         brw->wm.bind.surf_ss_offset[i+1] = 0;
+      }
+   }
+
+   brw->wm.bind_ss_offset = brw_cache_data( &brw->cache[BRW_SS_SURF_BIND],
+                                            &brw->wm.bind );
+}
+
+
+/* KW: Will find a different way to achieve this, see for example the
+ * state caches with relocs in the i915 swz driver.
+ */
+#if 0
+static void emit_reloc_wm_surfaces(struct brw_context *brw)
+{
+   int unit;
+
+   if (brw->state.draw_region != NULL) {
+      /* Emit framebuffer relocation against ss1 of surface slot 0 */
+      dri_emit_reloc(brw_cache_buffer(brw, BRW_SS_SURFACE),
+		     DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ | DRM_BO_FLAG_WRITE,
+		     0,
+		     brw->wm.bind.surf_ss_offset[0] +
+		     offsetof(struct brw_surface_state, ss1),
+		     brw->state.draw_region->buffer);
+   }
+
+   /* Emit relocations for texture buffers: unit N patches ss1 of slot N + 1 */
+   for (unit = 0; unit < BRW_MAX_TEX_UNIT; unit++) {
+      struct gl_texture_unit *texUnit = &brw->attribs.Texture->Unit[unit];
+      struct gl_texture_object *tObj = texUnit->_Current;
+      struct intel_texture_object *intelObj = intel_texture_object(tObj);
+
+      if (texUnit->_ReallyEnabled && intelObj->mt != NULL) {
+	 dri_emit_reloc(brw_cache_buffer(brw, BRW_SS_SURFACE),
+			DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
+			0,
+			brw->wm.bind.surf_ss_offset[unit + 1] +
+			offsetof(struct brw_surface_state, ss1),
+			intelObj->mt->region->buffer);
+      }
+   }
+}
+#endif
+
+/* Tracked-state entry for the WM surfaces: .update points at
+ * upload_wm_surfaces, .dirty.brw names the three BRW_NEW_* flags below. */
+const struct brw_tracked_state brw_wm_surfaces = {
+   .update = upload_wm_surfaces,
+   .dirty = {
+      .cache = 0,
+      .brw = (BRW_NEW_TEXTURE | BRW_NEW_BLEND | BRW_NEW_FRAMEBUFFER)
+   }
+};