40 files changed, 907 insertions, 2599 deletions
diff --git a/src/gallium/drivers/i965/brw_bo.c b/src/gallium/drivers/i965/brw_bo.c
new file mode 100644
index 0000000000..e7a4dac666
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_bo.c
@@ -0,0 +1,12 @@
+
+
+void brw_buffer_subdata()
+{
+      if (intel->intelScreen->kernel_exec_fencing) {
+	 drm_intel_gem_bo_map_gtt(bo);
+	 memcpy((char *)bo->virtual + offset, index_buffer->ptr, ib_size);
+	 drm_intel_gem_bo_unmap_gtt(bo);
+      } else {
+	 dri_bo_subdata(bo, offset, ib_size, index_buffer->ptr);
+      }
+}
diff --git a/src/gallium/drivers/i965/brw_cc.c b/src/gallium/drivers/i965/brw_cc.c
index 1088a7a607..9ab5638137 100644
--- a/src/gallium/drivers/i965/brw_cc.c
+++ b/src/gallium/drivers/i965/brw_cc.c
@@ -62,84 +62,21 @@ const struct brw_tracked_state brw_cc_vp = {
 };
 
 struct brw_cc_unit_key {
-   GLboolean stencil, stencil_two_side, color_blend, alpha_enabled;
-
-   GLenum stencil_func[2], stencil_fail_op[2];
-   GLenum stencil_pass_depth_fail_op[2], stencil_pass_depth_pass_op[2];
-   GLubyte stencil_ref[2], stencil_write_mask[2], stencil_test_mask[2];
-   GLenum logic_op;
-
-   GLenum blend_eq_rgb, blend_eq_a;
-   GLenum blend_src_rgb, blend_src_a;
-   GLenum blend_dst_rgb, blend_dst_a;
-
-   GLenum alpha_func;
-   GLclampf alpha_ref;
-
-   GLboolean dither;
-
-   GLboolean depth_test, depth_write;
-   GLenum depth_func;
+   struct pipe_depth_stencil_alpha_state dsa;
+   struct pipe_blend_state blend; /* no color mask */
 };
 
 static void
 cc_unit_populate_key(struct brw_context *brw, struct brw_cc_unit_key *key)
 {
-   GLcontext *ctx = &brw->intel.ctx;
-   const unsigned back = ctx->Stencil._BackFace;
-
    memset(key, 0, sizeof(*key));
+   
+   key->dsa = brw->curr.dsa.base;
+   key->blend = brw->curr.blend.base;
 
-   key->stencil = ctx->Stencil._Enabled;
-   key->stencil_two_side = ctx->Stencil._TestTwoSide;
-
-   if (key->stencil) {
-      key->stencil_func[0] = ctx->Stencil.Function[0];
-      key->stencil_fail_op[0] = ctx->Stencil.FailFunc[0];
-      key->stencil_pass_depth_fail_op[0] = ctx->Stencil.ZFailFunc[0];
-      key->stencil_pass_depth_pass_op[0] = ctx->Stencil.ZPassFunc[0];
-      key->stencil_ref[0] = ctx->Stencil.Ref[0];
-      key->stencil_write_mask[0] = ctx->Stencil.WriteMask[0];
-      key->stencil_test_mask[0] = ctx->Stencil.ValueMask[0];
-   }
-   if (key->stencil_two_side) {
-      key->stencil_func[1] = ctx->Stencil.Function[back];
-      key->stencil_fail_op[1] = ctx->Stencil.FailFunc[back];
-      key->stencil_pass_depth_fail_op[1] = ctx->Stencil.ZFailFunc[back];
-      key->stencil_pass_depth_pass_op[1] = ctx->Stencil.ZPassFunc[back];
-      key->stencil_ref[1] = ctx->Stencil.Ref[back];
-      key->stencil_write_mask[1] = ctx->Stencil.WriteMask[back];
-      key->stencil_test_mask[1] = ctx->Stencil.ValueMask[back];
-   }
-
-   if (ctx->Color._LogicOpEnabled)
-      key->logic_op = ctx->Color.LogicOp;
-   else
-      key->logic_op = GL_COPY;
-
-   key->color_blend = ctx->Color.BlendEnabled;
-   if (key->color_blend) {
-      key->blend_eq_rgb = ctx->Color.BlendEquationRGB;
-      key->blend_eq_a = ctx->Color.BlendEquationA;
-      key->blend_src_rgb = ctx->Color.BlendSrcRGB;
-      key->blend_dst_rgb = ctx->Color.BlendDstRGB;
-      key->blend_src_a = ctx->Color.BlendSrcA;
-      key->blend_dst_a = ctx->Color.BlendDstA;
-   }
-
-   key->alpha_enabled = ctx->Color.AlphaEnabled;
-   if (key->alpha_enabled) {
-      key->alpha_func = ctx->Color.AlphaFunc;
-      key->alpha_ref = ctx->Color.AlphaRef;
-   }
-
-   key->dither = ctx->Color.DitherFlag;
-
-   key->depth_test = ctx->Depth.Test;
-   if (key->depth_test) {
-      key->depth_func = ctx->Depth.Func;
-      key->depth_write = ctx->Depth.Mask;
-   }
+   /* Clear non-respected values:
+    */
+   key->blend.colormask = 0xf;
 }
 
 /**
@@ -153,103 +90,16 @@ cc_unit_create_from_key(struct brw_context *brw, struct brw_cc_unit_key *key)
 
    memset(&cc, 0, sizeof(cc));
 
-   /* _NEW_STENCIL */
-   if (key->stencil) {
-      cc.cc0.stencil_enable = 1;
-      cc.cc0.stencil_func =
-	 intel_translate_compare_func(key->stencil_func[0]);
-      cc.cc0.stencil_fail_op =
-	 intel_translate_stencil_op(key->stencil_fail_op[0]);
-      cc.cc0.stencil_pass_depth_fail_op =
-	 intel_translate_stencil_op(key->stencil_pass_depth_fail_op[0]);
-      cc.cc0.stencil_pass_depth_pass_op =
-	 intel_translate_stencil_op(key->stencil_pass_depth_pass_op[0]);
-      cc.cc1.stencil_ref = key->stencil_ref[0];
-      cc.cc1.stencil_write_mask = key->stencil_write_mask[0];
-      cc.cc1.stencil_test_mask = key->stencil_test_mask[0];
-
-      if (key->stencil_two_side) {
-	 cc.cc0.bf_stencil_enable = 1;
-	 cc.cc0.bf_stencil_func =
-	    intel_translate_compare_func(key->stencil_func[1]);
-	 cc.cc0.bf_stencil_fail_op =
-	    intel_translate_stencil_op(key->stencil_fail_op[1]);
-	 cc.cc0.bf_stencil_pass_depth_fail_op =
-	    intel_translate_stencil_op(key->stencil_pass_depth_fail_op[1]);
-	 cc.cc0.bf_stencil_pass_depth_pass_op =
-	    intel_translate_stencil_op(key->stencil_pass_depth_pass_op[1]);
-	 cc.cc1.bf_stencil_ref = key->stencil_ref[1];
-	 cc.cc2.bf_stencil_write_mask = key->stencil_write_mask[1];
-	 cc.cc2.bf_stencil_test_mask = key->stencil_test_mask[1];
-      }
-
-      /* Not really sure about this:
-       */
-      if (key->stencil_write_mask[0] ||
-	  (key->stencil_two_side && key->stencil_write_mask[1]))
-	 cc.cc0.stencil_write_enable = 1;
-   }
-
-   /* _NEW_COLOR */
-   if (key->logic_op != GL_COPY) {
-      cc.cc2.logicop_enable = 1;
-      cc.cc5.logicop_func = intel_translate_logic_op(key->logic_op);
-   } else if (key->color_blend) {
-      GLenum eqRGB = key->blend_eq_rgb;
-      GLenum eqA = key->blend_eq_a;
-      GLenum srcRGB = key->blend_src_rgb;
-      GLenum dstRGB = key->blend_dst_rgb;
-      GLenum srcA = key->blend_src_a;
-      GLenum dstA = key->blend_dst_a;
-
-      if (eqRGB == GL_MIN || eqRGB == GL_MAX) {
-	 srcRGB = dstRGB = GL_ONE;
-      }
-
-      if (eqA == GL_MIN || eqA == GL_MAX) {
-	 srcA = dstA = GL_ONE;
-      }
-
-      cc.cc6.dest_blend_factor = brw_translate_blend_factor(dstRGB);
-      cc.cc6.src_blend_factor = brw_translate_blend_factor(srcRGB);
-      cc.cc6.blend_function = brw_translate_blend_equation(eqRGB);
-
-      cc.cc5.ia_dest_blend_factor = brw_translate_blend_factor(dstA);
-      cc.cc5.ia_src_blend_factor = brw_translate_blend_factor(srcA);
-      cc.cc5.ia_blend_function = brw_translate_blend_equation(eqA);
-
-      cc.cc3.blend_enable = 1;
-      cc.cc3.ia_blend_enable = (srcA != srcRGB ||
-				dstA != dstRGB ||
-				eqA != eqRGB);
-   }
-
-   if (key->alpha_enabled) {
-      cc.cc3.alpha_test = 1;
-      cc.cc3.alpha_test_func = intel_translate_compare_func(key->alpha_func);
-      cc.cc3.alpha_test_format = BRW_ALPHATEST_FORMAT_UNORM8;
-
-      UNCLAMPED_FLOAT_TO_UBYTE(cc.cc7.alpha_ref.ub[0], key->alpha_ref);
-   }
-
-   if (key->dither) {
-      cc.cc5.dither_enable = 1;
-      cc.cc6.y_dither_offset = 0;
-      cc.cc6.x_dither_offset = 0;
-   }
-
-   /* _NEW_DEPTH */
-   if (key->depth_test) {
-      cc.cc2.depth_test = 1;
-      cc.cc2.depth_test_function = intel_translate_compare_func(key->depth_func);
-      cc.cc2.depth_write_enable = key->depth_write;
-   }
+   cc.cc0 = brw->dsa.cc0;
+   cc.cc1 = brw->dsa.cc1;
+   cc.cc2 = brw->dsa.cc2;
+   cc.cc3 = brw->dsa.cc3 | brw->blend.cc3;
 
    /* CACHE_NEW_CC_VP */
    cc.cc4.cc_viewport_state_offset = brw->cc.vp_bo->offset >> 5; /* reloc */
 
-   if (INTEL_DEBUG & DEBUG_STATS)
-      cc.cc5.statistics_enable = 1;
+   cc.cc5 = brw->blend.cc5 | brw->debug.cc5;
+
 
    bo = brw_upload_cache(&brw->cache, BRW_CC_UNIT,
 			 key, sizeof(*key),
@@ -286,7 +136,7 @@ static void prepare_cc_unit( struct brw_context *brw )
 
 const struct brw_tracked_state brw_cc_unit = {
    .dirty = {
-      .mesa = _NEW_STENCIL | _NEW_COLOR | _NEW_DEPTH,
+      .mesa = PIPE_NEW_DEPTH_STENCIL_ALPHA | PIPE_NEW_BLEND,
       .brw = 0,
       .cache = CACHE_NEW_CC_VP
    },
diff --git a/src/gallium/drivers/i965/brw_clip.c b/src/gallium/drivers/i965/brw_clip.c
index 20a927cf38..df1b3718d0 100644
--- a/src/gallium/drivers/i965/brw_clip.c
+++ b/src/gallium/drivers/i965/brw_clip.c
@@ -29,9 +29,9 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
+#include "pipe/p_state.h"
+
+#include "util/u_math.h"
 
 #include "intel_batchbuffer.h"
 
@@ -83,7 +83,7 @@ static void compile_clip_prog( struct brw_context *brw,
 	 delta += ATTR_SIZE;
       }
 
-   c.nr_attrs = brw_count_bits(c.key.attrs);
+   c.nr_attrs = util_count_bits(c.key.attrs);
    
    if (BRW_IS_IGDNG(brw))
        c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
@@ -104,16 +104,16 @@ static void compile_clip_prog( struct brw_context *brw,
     * do all three:
     */
    switch (key->primitive) {
-   case GL_TRIANGLES: 
+   case PIPE_PRIM_TRIANGLES: 
       if (key->do_unfilled)
 	 brw_emit_unfilled_clip( &c );
       else
 	 brw_emit_tri_clip( &c );
       break;
-   case GL_LINES:
+   case PIPE_PRIM_LINES:
       brw_emit_line_clip( &c );
       break;
-   case GL_POINTS:
+   case PIPE_PRIM_POINTS:
       brw_emit_point_clip( &c );
       break;
    default:
@@ -143,7 +143,6 @@ static void compile_clip_prog( struct brw_context *brw,
  */
 static void upload_clip_prog(struct brw_context *brw)
 {
-   GLcontext *ctx = &brw->intel.ctx;
    struct brw_clip_prog_key key;
 
    memset(&key, 0, sizeof(key));
@@ -151,101 +150,51 @@ static void upload_clip_prog(struct brw_context *brw)
    /* Populate the key:
     */
    /* BRW_NEW_REDUCED_PRIMITIVE */
-   key.primitive = brw->intel.reduced_primitive;
+   key.primitive = brw->reduced_primitive;
    /* CACHE_NEW_VS_PROG */
    key.attrs = brw->vs.prog_data->outputs_written;
-   /* _NEW_LIGHT */
-   key.do_flat_shading = (ctx->Light.ShadeModel == GL_FLAT);
-   /* _NEW_TRANSFORM */
-   key.nr_userclip = brw_count_bits(ctx->Transform.ClipPlanesEnabled);
+   /* PIPE_NEW_RAST */
+   key.do_flat_shading = brw->rast.base.flatshade;
+   /* PIPE_NEW_UCP */
+   key.nr_userclip = brw->nr_ucp;
 
    if (BRW_IS_IGDNG(brw))
        key.clip_mode = BRW_CLIPMODE_KERNEL_CLIP;
    else
        key.clip_mode = BRW_CLIPMODE_NORMAL;
 
-   /* _NEW_POLYGON */
-   if (key.primitive == GL_TRIANGLES) {
-      if (ctx->Polygon.CullFlag &&
-	  ctx->Polygon.CullFaceMode == GL_FRONT_AND_BACK)
+   /* PIPE_NEW_RAST */
+   if (key.primitive == PIPE_PRIM_TRIANGLES) {
+      if (brw->rast->cull_mode = PIPE_WINDING_BOTH)
 	 key.clip_mode = BRW_CLIPMODE_REJECT_ALL;
       else {
-	 GLuint fill_front = CLIP_CULL;
-	 GLuint fill_back = CLIP_CULL;
-	 GLuint offset_front = 0;
-	 GLuint offset_back = 0;
-
-	 if (!ctx->Polygon.CullFlag ||
-	     ctx->Polygon.CullFaceMode != GL_FRONT) {
-	    switch (ctx->Polygon.FrontMode) {
-	    case GL_FILL: 
-	       fill_front = CLIP_FILL; 
-	       offset_front = 0;
-	       break;
-	    case GL_LINE:
-	       fill_front = CLIP_LINE;
-	       offset_front = ctx->Polygon.OffsetLine;
-	       break;
-	    case GL_POINT:
-	       fill_front = CLIP_POINT;
-	       offset_front = ctx->Polygon.OffsetPoint;
-	       break;
-	    }
+	 key.fill_ccw = CLIP_CULL;
+	 key.fill_cw = CLIP_CULL;
+
+	 if (!(brw->rast->cull_mode & PIPE_WINDING_CCW)) {
+	    key.fill_ccw = translate_fill(brw->rast.fill_ccw);
 	 }
 
-	 if (!ctx->Polygon.CullFlag ||
-	     ctx->Polygon.CullFaceMode != GL_BACK) {
-	    switch (ctx->Polygon.BackMode) {
-	    case GL_FILL: 
-	       fill_back = CLIP_FILL; 
-	       offset_back = 0;
-	       break;
-	    case GL_LINE:
-	       fill_back = CLIP_LINE;
-	       offset_back = ctx->Polygon.OffsetLine;
-	       break;
-	    case GL_POINT:
-	       fill_back = CLIP_POINT;
-	       offset_back = ctx->Polygon.OffsetPoint;
-	       break;
-	    }
+	 if (!(brw->rast->cull_mode & PIPE_WINDING_CW)) {
+	    key.fill_cw = translate_fill(brw->rast.fill_cw);
 	 }
 
-	 if (ctx->Polygon.BackMode != GL_FILL ||
-	     ctx->Polygon.FrontMode != GL_FILL) {
+	 if (key.fill_cw != CLIP_FILL ||
+	     key.fill_ccw != CLIP_FILL) {
 	    key.do_unfilled = 1;
-
-	    /* Most cases the fixed function units will handle.  Cases where
-	     * one or more polygon faces are unfilled will require help:
-	     */
 	    key.clip_mode = BRW_CLIPMODE_CLIP_NON_REJECTED;
+	 }
+
+	 key.offset_ccw = brw->rast.offset_ccw;
+	 key.offset_cw = brw->rast.offset_cw;
+
+	 if (brw->rast.light_twoside &&
+	     key.fill_cw != CLIP_CULL) 
+	    key.copy_bfc_cw = 1;
 
-	    if (offset_back || offset_front) {
-	       /* _NEW_POLYGON, _NEW_BUFFERS */
-	       key.offset_units = ctx->Polygon.OffsetUnits * brw->intel.polygon_offset_scale;
-	       key.offset_factor = ctx->Polygon.OffsetFactor * ctx->DrawBuffer->_MRD;
-	    }
-
-	    switch (ctx->Polygon.FrontFace) {
-	    case GL_CCW:
-	       key.fill_ccw = fill_front;
-	       key.fill_cw = fill_back;
-	       key.offset_ccw = offset_front;
-	       key.offset_cw = offset_back;
-	       if (ctx->Light.Model.TwoSide &&
-		   key.fill_cw != CLIP_CULL) 
-		  key.copy_bfc_cw = 1;
-	       break;
-	    case GL_CW:
-	       key.fill_cw = fill_front;
-	       key.fill_ccw = fill_back;
-	       key.offset_cw = offset_front;
-	       key.offset_ccw = offset_back;
-	       if (ctx->Light.Model.TwoSide &&
-		   key.fill_ccw != CLIP_CULL) 
-		  key.copy_bfc_ccw = 1;
-	       break;
-	    }
+	 if (brw->rast.light_twoside &&
+	     key.fill_ccw != CLIP_CULL) 
+	    key.copy_bfc_ccw = 1;
 	 }
       }
    }
@@ -262,10 +211,8 @@ static void upload_clip_prog(struct brw_context *brw)
 
 const struct brw_tracked_state brw_clip_prog = {
    .dirty = {
-      .mesa  = (_NEW_LIGHT | 
-		_NEW_TRANSFORM |
-		_NEW_POLYGON | 
-		_NEW_BUFFERS),
+      .mesa  = (PIPE_NEW_RAST | 
+		PIPE_NEW_UCP),
       .brw   = (BRW_NEW_REDUCED_PRIMITIVE),
       .cache = CACHE_NEW_VS_PROG
    },
diff --git a/src/gallium/drivers/i965/brw_clip.h b/src/gallium/drivers/i965/brw_clip.h
index 957df441ab..d80ec819b9 100644
--- a/src/gallium/drivers/i965/brw_clip.h
+++ b/src/gallium/drivers/i965/brw_clip.h
@@ -43,6 +43,7 @@
  */
 struct brw_clip_prog_key {
    GLuint attrs:32;		
+
    GLuint primitive:4;
    GLuint nr_userclip:3;
    GLuint do_flat_shading:1;
@@ -51,12 +52,10 @@ struct brw_clip_prog_key {
    GLuint fill_ccw:2;		/* includes cull information */
    GLuint offset_cw:1;
    GLuint offset_ccw:1;
-   GLuint pad0:17;
-
    GLuint copy_bfc_cw:1;
    GLuint copy_bfc_ccw:1;
    GLuint clip_mode:3;
-   GLuint pad1:27;
+   GLuint pad1:12;
    
    GLfloat offset_factor;
    GLfloat offset_units;
diff --git a/src/gallium/drivers/i965/brw_clip_line.c b/src/gallium/drivers/i965/brw_clip_line.c
index 048ca620fa..6b4da25644 100644
--- a/src/gallium/drivers/i965/brw_clip_line.c
+++ b/src/gallium/drivers/i965/brw_clip_line.c
@@ -29,13 +29,6 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "shader/program.h"
-
-#include "intel_batchbuffer.h"
-
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_eu.h"
diff --git a/src/gallium/drivers/i965/brw_clip_point.c b/src/gallium/drivers/i965/brw_clip_point.c
index 8458f61c5a..b2cf7b2011 100644
--- a/src/gallium/drivers/i965/brw_clip_point.c
+++ b/src/gallium/drivers/i965/brw_clip_point.c
@@ -29,13 +29,6 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "shader/program.h"
-
-#include "intel_batchbuffer.h"
-
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_eu.h"
diff --git a/src/gallium/drivers/i965/brw_clip_state.c b/src/gallium/drivers/i965/brw_clip_state.c
index 234b3744bf..72e27205e2 100644
--- a/src/gallium/drivers/i965/brw_clip_state.c
+++ b/src/gallium/drivers/i965/brw_clip_state.c
@@ -32,7 +32,6 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "main/macros.h"
 
 struct brw_clip_unit_key {
    unsigned int total_grf;
@@ -66,8 +65,8 @@ clip_unit_populate_key(struct brw_context *brw, struct brw_clip_unit_key *key)
    key->nr_urb_entries = brw->urb.nr_clip_entries;
    key->urb_size = brw->urb.vsize;
 
-   /* _NEW_TRANSOFORM */
-   key->depth_clamp = ctx->Transform.DepthClamp;
+   /*  */
+   key->depth_clamp = 0; // XXX: add this to gallium: ctx->Transform.DepthClamp;
 }
 
 static dri_bo *
@@ -175,7 +174,7 @@ static void upload_clip_unit( struct brw_context *brw )
 
 const struct brw_tracked_state brw_clip_unit = {
    .dirty = {
-      .mesa  = _NEW_TRANSFORM,
+      .mesa  = 0,
       .brw   = (BRW_NEW_CURBE_OFFSETS |
 		BRW_NEW_URB_FENCE),
       .cache = CACHE_NEW_CLIP_PROG
diff --git a/src/gallium/drivers/i965/brw_clip_tri.c b/src/gallium/drivers/i965/brw_clip_tri.c
index 0efd77225e..d8feca6a87 100644
--- a/src/gallium/drivers/i965/brw_clip_tri.c
+++ b/src/gallium/drivers/i965/brw_clip_tri.c
@@ -29,13 +29,6 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "shader/program.h"
-
-#include "intel_batchbuffer.h"
-
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_eu.h"
diff --git a/src/gallium/drivers/i965/brw_clip_unfilled.c b/src/gallium/drivers/i965/brw_clip_unfilled.c
index ad1bfa435f..4baff55806 100644
--- a/src/gallium/drivers/i965/brw_clip_unfilled.c
+++ b/src/gallium/drivers/i965/brw_clip_unfilled.c
@@ -29,11 +29,6 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "shader/program.h"
-
 #include "intel_batchbuffer.h"
 
 #include "brw_defines.h"
diff --git a/src/gallium/drivers/i965/brw_clip_util.c b/src/gallium/drivers/i965/brw_clip_util.c
index 5a73abdfee..7a6c46ce07 100644
--- a/src/gallium/drivers/i965/brw_clip_util.c
+++ b/src/gallium/drivers/i965/brw_clip_util.c
@@ -30,13 +30,6 @@
   */
 
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "shader/program.h"
-
-#include "intel_batchbuffer.h"
-
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_eu.h"
diff --git a/src/gallium/drivers/i965/brw_context.c b/src/gallium/drivers/i965/brw_context.c
index c300c33adc..bf0ec89e13 100644
--- a/src/gallium/drivers/i965/brw_context.c
+++ b/src/gallium/drivers/i965/brw_context.c
@@ -52,122 +52,77 @@
 #include "utils.h"
 
 
-/***************************************
- * Mesa's Driver Functions
- ***************************************/
-
-static void brwUseProgram(GLcontext *ctx, GLuint program)
-{
-   _mesa_use_program(ctx, program);
-}
-
-static void brwInitProgFuncs( struct dd_function_table *functions )
-{
-   functions->UseProgram = brwUseProgram;
-}
-static void brwInitDriverFunctions( struct dd_function_table *functions )
-{
-   intelInitDriverFunctions( functions );
-
-   brwInitFragProgFuncs( functions );
-   brwInitProgFuncs( functions );
-   brw_init_queryobj_functions(functions);
-
-   functions->Viewport = intel_viewport;
-}
 
 GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
 			    __DRIcontextPrivate *driContextPriv,
 			    void *sharedContextPrivate)
 {
-   struct dd_function_table functions;
    struct brw_context *brw = (struct brw_context *) CALLOC_STRUCT(brw_context);
-   struct intel_context *intel = &brw->intel;
-   GLcontext *ctx = &intel->ctx;
 
    if (!brw) {
-      _mesa_printf("%s: failed to alloc context\n", __FUNCTION__);
-      return GL_FALSE;
-   }
-
-   brwInitVtbl( brw );
-   brwInitDriverFunctions( &functions );
-
-   if (!intelInitContext( intel, mesaVis, driContextPriv,
-			  sharedContextPrivate, &functions )) {
-      _mesa_printf("%s: failed to init intel context\n", __FUNCTION__);
-      FREE(brw);
+      debug_printf("%s: failed to alloc context\n", __FUNCTION__);
       return GL_FALSE;
    }
 
-   /* Initialize swrast, tnl driver tables: */
-   intelInitSpanFuncs(ctx);
-
-   TNL_CONTEXT(ctx)->Driver.RunPipeline = _tnl_run_pipeline;
-
-   ctx->Const.MaxTextureImageUnits = BRW_MAX_TEX_UNIT;
-   ctx->Const.MaxTextureCoordUnits = 8; /* Mesa limit */
-   ctx->Const.MaxTextureUnits = MIN2(ctx->Const.MaxTextureCoordUnits,
-                                     ctx->Const.MaxTextureImageUnits);
-   ctx->Const.MaxVertexTextureImageUnits = 0; /* no vertex shader textures */
-
-   /* Mesa limits textures to 4kx4k; it would be nice to fix that someday
-    */
-   ctx->Const.MaxTextureLevels = 13;
-   ctx->Const.Max3DTextureLevels = 9;
-   ctx->Const.MaxCubeTextureLevels = 12;
-   ctx->Const.MaxTextureRectSize = (1<<12);
-   
-   ctx->Const.MaxTextureMaxAnisotropy = 16.0;
-
-   /* if conformance mode is set, swrast can handle any size AA point */
-   ctx->Const.MaxPointSizeAA = 255.0;
-
    /* We want the GLSL compiler to emit code that uses condition codes */
    ctx->Shader.EmitCondCodes = GL_TRUE;
    ctx->Shader.EmitNVTempInitialization = GL_TRUE;
 
-   ctx->Const.VertexProgram.MaxNativeInstructions = (16 * 1024);
-   ctx->Const.VertexProgram.MaxAluInstructions = 0;
-   ctx->Const.VertexProgram.MaxTexInstructions = 0;
-   ctx->Const.VertexProgram.MaxTexIndirections = 0;
-   ctx->Const.VertexProgram.MaxNativeAluInstructions = 0;
-   ctx->Const.VertexProgram.MaxNativeTexInstructions = 0;
-   ctx->Const.VertexProgram.MaxNativeTexIndirections = 0;
-   ctx->Const.VertexProgram.MaxNativeAttribs = 16;
-   ctx->Const.VertexProgram.MaxNativeTemps = 256;
-   ctx->Const.VertexProgram.MaxNativeAddressRegs = 1;
-   ctx->Const.VertexProgram.MaxNativeParameters = 1024;
-   ctx->Const.VertexProgram.MaxEnvParams =
-      MIN2(ctx->Const.VertexProgram.MaxNativeParameters,
-	   ctx->Const.VertexProgram.MaxEnvParams);
-
-   ctx->Const.FragmentProgram.MaxNativeInstructions = (16 * 1024);
-   ctx->Const.FragmentProgram.MaxNativeAluInstructions = (16 * 1024);
-   ctx->Const.FragmentProgram.MaxNativeTexInstructions = (16 * 1024);
-   ctx->Const.FragmentProgram.MaxNativeTexIndirections = (16 * 1024);
-   ctx->Const.FragmentProgram.MaxNativeAttribs = 12;
-   ctx->Const.FragmentProgram.MaxNativeTemps = 256;
-   ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0;
-   ctx->Const.FragmentProgram.MaxNativeParameters = 1024;
-   ctx->Const.FragmentProgram.MaxEnvParams =
-      MIN2(ctx->Const.FragmentProgram.MaxNativeParameters,
-	   ctx->Const.FragmentProgram.MaxEnvParams);
 
+   brw_init_query( brw );
    brw_init_state( brw );
+   brw_draw_init( brw );
 
    brw->state.dirty.mesa = ~0;
    brw->state.dirty.brw = ~0;
 
    brw->emit_state_always = 0;
 
-   ctx->VertexProgram._MaintainTnlProgram = GL_TRUE;
-   ctx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE;
-
    make_empty_list(&brw->query.active_head);
 
-   brw_draw_init( brw );
 
    return GL_TRUE;
 }
 
+/**
+ * called from intelDestroyContext()
+ */
+static void brw_destroy_context( struct intel_context *intel )
+{
+   struct brw_context *brw = brw_context(&intel->ctx);
+   int i;
+
+   brw_destroy_state(brw);
+   brw_draw_destroy( brw );
+
+   _mesa_free(brw->wm.compile_data);
+
+   for (i = 0; i < brw->state.nr_color_regions; i++)
+      intel_region_release(&brw->state.color_regions[i]);
+   brw->state.nr_color_regions = 0;
+   intel_region_release(&brw->state.depth_region);
+
+   dri_bo_unreference(brw->curbe.curbe_bo);
+   dri_bo_unreference(brw->vs.prog_bo);
+   dri_bo_unreference(brw->vs.state_bo);
+   dri_bo_unreference(brw->vs.bind_bo);
+   dri_bo_unreference(brw->gs.prog_bo);
+   dri_bo_unreference(brw->gs.state_bo);
+   dri_bo_unreference(brw->clip.prog_bo);
+   dri_bo_unreference(brw->clip.state_bo);
+   dri_bo_unreference(brw->clip.vp_bo);
+   dri_bo_unreference(brw->sf.prog_bo);
+   dri_bo_unreference(brw->sf.state_bo);
+   dri_bo_unreference(brw->sf.vp_bo);
+   for (i = 0; i < BRW_MAX_TEX_UNIT; i++)
+      dri_bo_unreference(brw->wm.sdc_bo[i]);
+   dri_bo_unreference(brw->wm.bind_bo);
+   for (i = 0; i < BRW_WM_MAX_SURF; i++)
+      dri_bo_unreference(brw->wm.surf_bo[i]);
+   dri_bo_unreference(brw->wm.sampler_bo);
+   dri_bo_unreference(brw->wm.prog_bo);
+   dri_bo_unreference(brw->wm.state_bo);
+   dri_bo_unreference(brw->cc.prog_bo);
+   dri_bo_unreference(brw->cc.state_bo);
+   dri_bo_unreference(brw->cc.vp_bo);
+}
diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h
index fa3e32c7ff..009e28b227 100644
--- a/src/gallium/drivers/i965/brw_context.h
+++ b/src/gallium/drivers/i965/brw_context.h
@@ -115,7 +115,6 @@
  * Handles blending and (presumably) depth and stencil testing.
  */
 
-#define BRW_FALLBACK_TEXTURE		 0x1
 #define BRW_MAX_CURBE                    (32*16)
 
 struct brw_context;
@@ -450,11 +449,9 @@ struct brw_query_object {
  */
 struct brw_context 
 {
-   struct intel_context intel;  /**< base class, must be first field */
    GLuint primitive;
 
    GLboolean emit_state_always;
-   GLboolean tmp_fallback;
    GLboolean no_batch_wrap;
 
    struct {
@@ -692,7 +689,7 @@ GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
 /*======================================================================
  * brw_queryobj.c
  */
-void brw_init_queryobj_functions(struct dd_function_table *functions);
+void brw_init_query(struct brw_context *brw);
 void brw_prepare_query_begin(struct brw_context *brw);
 void brw_emit_query_begin(struct brw_context *brw);
 void brw_emit_query_end(struct brw_context *brw);
@@ -730,7 +727,7 @@ int brw_disasm (FILE *file, struct brw_instruction *inst);
  * macros used previously:
  */
 static INLINE struct brw_context *
-brw_context( GLcontext *ctx )
+brw_context( struct pipe_context *ctx )
 {
    return (struct brw_context *)ctx;
 }
diff --git a/src/gallium/drivers/i965/brw_curbe.c b/src/gallium/drivers/i965/brw_curbe.c
index 4be6c77aa1..3e32c4983d 100644
--- a/src/gallium/drivers/i965/brw_curbe.c
+++ b/src/gallium/drivers/i965/brw_curbe.c
@@ -30,14 +30,6 @@
   */
 
 
-
-#include "main/glheader.h"
-#include "main/context.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
-#include "shader/prog_statevars.h"
 #include "intel_batchbuffer.h"
 #include "intel_regions.h"
 #include "brw_context.h"
@@ -64,31 +56,17 @@ static void calculate_curbe_offsets( struct brw_context *brw )
    GLuint nr_clip_regs = 0;
    GLuint total_regs;
 
-   /* _NEW_TRANSFORM */
-   if (ctx->Transform.ClipPlanesEnabled) {
-      GLuint nr_planes = 6 + brw_count_bits(ctx->Transform.ClipPlanesEnabled);
+   /* PIPE_NEW_UCP */
+   if (brw->nr_ucp) {
+      GLuint nr_planes = 6 + brw->nr_ucp;
       nr_clip_regs = (nr_planes * 4 + 15) / 16;
    }
 
 
    total_regs = nr_fp_regs + nr_vp_regs + nr_clip_regs;
 
-   /* This can happen - what to do?  Probably rather than falling
-    * back, the best thing to do is emit programs which code the
-    * constants as immediate values.  Could do this either as a static
-    * cap on WM and VS, or adaptively.
-    *
-    * Unfortunately, this is currently dependent on the results of the
-    * program generation process (in the case of wm), so this would
-    * introduce the need to re-generate programs in the event of a
-    * curbe allocation failure.
-    */
-   /* Max size is 32 - just large enough to
-    * hold the 128 parameters allowed by
-    * the fragment and vertex program
-    * api's.  It's not clear what happens
-    * when both VP and FP want to use 128
-    * parameters, though. 
+   /* When this is > 32, want to use a true constant buffer to hold
+    * the extra constants.
     */
    assert(total_regs <= 32);
 
@@ -113,8 +91,8 @@ static void calculate_curbe_offsets( struct brw_context *brw )
       brw->curbe.vs_size = nr_vp_regs; reg += nr_vp_regs;
       brw->curbe.total_size = reg;
 
-      if (0)
-	 _mesa_printf("curbe wm %d+%d clip %d+%d vs %d+%d\n",
+      if (BRW_DEBUG & DEBUG_CURBE)
+	 debug_printf("curbe wm %d+%d clip %d+%d vs %d+%d\n",
 		      brw->curbe.wm_start,
 		      brw->curbe.wm_size,
 		      brw->curbe.clip_start,
@@ -129,7 +107,7 @@ static void calculate_curbe_offsets( struct brw_context *brw )
 
 const struct brw_tracked_state brw_curbe_offsets = {
    .dirty = {
-      .mesa = _NEW_TRANSFORM,
+      .mesa = PIPE_NEW_UCP,
       .brw  = BRW_NEW_VERTEX_PROGRAM,
       .cache = CACHE_NEW_WM_PROG
    },
@@ -204,11 +182,13 @@ static void prepare_constant_buffer(struct brw_context *brw)
    if (brw->curbe.wm_size) {
       GLuint offset = brw->curbe.wm_start * 16;
 
-      _mesa_load_state_parameters(ctx, fp->program.Base.Parameters); 
+      /* map fs constant buffer */
 
       /* copy float constants */
       for (i = 0; i < brw->wm.prog_data->nr_params; i++) 
 	 buf[offset + i] = *brw->wm.prog_data->param[i];
+
+      /* unmap fs constant buffer */
    }
 
 
@@ -228,18 +208,15 @@ static void prepare_constant_buffer(struct brw_context *brw)
 	 buf[offset + i * 4 + 3] = fixed_plane[i][3];
       }
 
-      /* Clip planes: _NEW_TRANSFORM plus _NEW_PROJECTION to get to
-       * clip-space:
+      /* Clip planes:
        */
-      assert(MAX_CLIP_PLANES == 6);
-      for (j = 0; j < MAX_CLIP_PLANES; j++) {
-	 if (ctx->Transform.ClipPlanesEnabled & (1<<j)) {
-	    buf[offset + i * 4 + 0] = ctx->Transform._ClipUserPlane[j][0];
-	    buf[offset + i * 4 + 1] = ctx->Transform._ClipUserPlane[j][1];
-	    buf[offset + i * 4 + 2] = ctx->Transform._ClipUserPlane[j][2];
-	    buf[offset + i * 4 + 3] = ctx->Transform._ClipUserPlane[j][3];
-	    i++;
-	 }
+      assert(brw->nr_ucp <= 6);
+      for (j = 0; j < brw->nr_ucp; j++) {
+	 buf[offset + i * 4 + 0] = brw->ucp[j][0];
+	 buf[offset + i * 4 + 1] = brw->ucp[j][1];
+	 buf[offset + i * 4 + 2] = brw->ucp[j][2];
+	 buf[offset + i * 4 + 3] = brw->ucp[j][3];
+	 i++;
       }
    }
 
@@ -248,13 +225,7 @@ static void prepare_constant_buffer(struct brw_context *brw)
       GLuint offset = brw->curbe.vs_start * 16;
       GLuint nr = brw->vs.prog_data->nr_params / 4;
 
-      if (brw->vertex_program->IsNVProgram)
-	 _mesa_load_tracked_matrices(ctx);
-
-      /* Updates the ParamaterValues[i] pointers for all parameters of the
-       * basic type of PROGRAM_STATE_VAR.
-       */
-      _mesa_load_state_parameters(ctx, vp->program.Base.Parameters); 
+      /* map vs constant buffer */
 
       /* XXX just use a memcpy here */
       for (i = 0; i < nr; i++) {
@@ -264,14 +235,16 @@ static void prepare_constant_buffer(struct brw_context *brw)
 	 buf[offset + i * 4 + 2] = value[2];
 	 buf[offset + i * 4 + 3] = value[3];
       }
+
+      /* unmap vs constant buffer */
    }
 
    if (0) {
       for (i = 0; i < sz*16; i+=4) 
-	 _mesa_printf("curbe %d.%d: %f %f %f %f\n", i/8, i&4,
+	 debug_printf("curbe %d.%d: %f %f %f %f\n", i/8, i&4,
 		      buf[i+0], buf[i+1], buf[i+2], buf[i+3]);
 
-      _mesa_printf("last_buf %p buf %p sz %d/%d cmp %d\n",
+      debug_printf("last_buf %p buf %p sz %d/%d cmp %d\n",
 		   brw->curbe.last_buf, buf,
 		   bufsz, brw->curbe.last_bufsz,
 		   brw->curbe.last_buf ? memcmp(buf, brw->curbe.last_buf, bufsz) : -1);
@@ -282,12 +255,12 @@ static void prepare_constant_buffer(struct brw_context *brw)
        bufsz == brw->curbe.last_bufsz &&
        memcmp(buf, brw->curbe.last_buf, bufsz) == 0) {
       /* constants have not changed */
-      _mesa_free(buf);
+      FREE(buf);
    } 
    else {
       /* constants have changed */
       if (brw->curbe.last_buf)
-	 _mesa_free(brw->curbe.last_buf);
+	 FREE(brw->curbe.last_buf);
 
       brw->curbe.last_buf = buf;
       brw->curbe.last_bufsz = bufsz;
@@ -353,15 +326,11 @@ static void emit_constant_buffer(struct brw_context *brw)
    ADVANCE_BATCH();
 }
 
-/* This tracked state is unique in that the state it monitors varies
- * dynamically depending on the parameters tracked by the fragment and
- * vertex programs.  This is the template used as a starting point,
- * each context will maintain a copy of this internally and update as
- * required.
- */
 const struct brw_tracked_state brw_constant_buffer = {
    .dirty = {
-      .mesa = _NEW_PROGRAM_CONSTANTS,
+      .mesa = (PIPE_NEW_FS_CONSTANTS |
+	       PIPE_NEW_VS_CONSTANTS |
+	       PIPE_NEW_UCP),
       .brw  = (BRW_NEW_FRAGMENT_PROGRAM |
 	       BRW_NEW_VERTEX_PROGRAM |
 	       BRW_NEW_URB_FENCE | /* Implicit - hardware requires this, not used above */
diff --git a/src/gallium/drivers/i965/brw_defines.h b/src/gallium/drivers/i965/brw_defines.h
index 78d457ad2b..282c5b18f4 100644
--- a/src/gallium/drivers/i965/brw_defines.h
+++ b/src/gallium/drivers/i965/brw_defines.h
@@ -840,8 +840,8 @@
 
 #include "intel_chipset.h"
 
-#define BRW_IS_G4X(brw)         (IS_G4X((brw)->intel.intelScreen->deviceID))
-#define BRW_IS_IGDNG(brw)         (IS_IGDNG((brw)->intel.intelScreen->deviceID))
+#define BRW_IS_G4X(brw)         (IS_G4X((brw)->brw_screen->deviceID))
+#define BRW_IS_IGDNG(brw)         (IS_IGDNG((brw)->brw_screen->deviceID))
 #define BRW_IS_965(brw)         (!(BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)))
 #define CMD_PIPELINE_SELECT(brw)        ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_PIPELINE_SELECT_GM45 : CMD_PIPELINE_SELECT_965)
 #define CMD_VF_STATISTICS(brw)          ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_VF_STATISTICS_GM45 : CMD_VF_STATISTICS_965)
diff --git a/src/gallium/drivers/i965/brw_disasm.c b/src/gallium/drivers/i965/brw_disasm.c
index 9fef230507..a84c581c03 100644
--- a/src/gallium/drivers/i965/brw_disasm.c
+++ b/src/gallium/drivers/i965/brw_disasm.c
@@ -27,8 +27,6 @@
 #include <unistd.h>
 #include <stdarg.h>
 
-#include "main/mtypes.h"
-
 #include "brw_context.h"
 #include "brw_defines.h"
 
diff --git a/src/gallium/drivers/i965/brw_draw.c b/src/gallium/drivers/i965/brw_draw.c
index 44bb7bd588..8cd117c24f 100644
--- a/src/gallium/drivers/i965/brw_draw.c
+++ b/src/gallium/drivers/i965/brw_draw.c
@@ -39,14 +39,13 @@
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_state.h"
-#include "brw_fallback.h"
 
 #include "intel_batchbuffer.h"
 #include "intel_buffer_objects.h"
 
 #define FILE_DEBUG_FLAG DEBUG_BATCH
 
-static GLuint prim_to_hw_prim[GL_POLYGON+1] = {
+static uint32_t prim_to_hw_prim[PIPE_PRIM_POLYGON+1] = {
    _3DPRIM_POINTLIST,
    _3DPRIM_LINELIST,
    _3DPRIM_LINELOOP,
@@ -60,19 +59,6 @@ static GLuint prim_to_hw_prim[GL_POLYGON+1] = {
 };
 
 
-static const GLenum reduced_prim[GL_POLYGON+1] = {  
-   GL_POINTS,
-   GL_LINES,
-   GL_LINES,
-   GL_LINES,
-   GL_TRIANGLES,
-   GL_TRIANGLES,
-   GL_TRIANGLES,
-   GL_TRIANGLES,
-   GL_TRIANGLES,
-   GL_TRIANGLES
-};
-
 
 /* When the primitive changes, set a state bit and re-validate.  Not
  * the nicest and would rather deal with this by having all the
@@ -196,102 +182,6 @@ static void brw_merge_inputs( struct brw_context *brw,
       brw->state.dirty.brw |= BRW_NEW_INPUT_DIMENSIONS;
 }
 
-/* XXX: could split the primitive list to fallback only on the
- * non-conformant primitives.
- */
-static GLboolean check_fallbacks( struct brw_context *brw,
-				  const struct _mesa_prim *prim,
-				  GLuint nr_prims )
-{
-   GLcontext *ctx = &brw->intel.ctx;
-   GLuint i;
-
-   /* If we don't require strict OpenGL conformance, never 
-    * use fallbacks.  If we're forcing fallbacks, always
-    * use fallfacks.
-    */
-   if (brw->intel.conformance_mode == 0)
-      return GL_FALSE;
-
-   if (brw->intel.conformance_mode == 2)
-      return GL_TRUE;
-
-   if (ctx->Polygon.SmoothFlag) {
-      for (i = 0; i < nr_prims; i++)
-	 if (reduced_prim[prim[i].mode] == GL_TRIANGLES) 
-	    return GL_TRUE;
-   }
-
-   /* BRW hardware will do AA lines, but they are non-conformant it
-    * seems.  TBD whether we keep this fallback:
-    */
-   if (ctx->Line.SmoothFlag) {
-      for (i = 0; i < nr_prims; i++)
-	 if (reduced_prim[prim[i].mode] == GL_LINES) 
-	    return GL_TRUE;
-   }
-
-   /* Stipple -- these fallbacks could be resolved with a little
-    * bit of work?
-    */
-   if (ctx->Line.StippleFlag) {
-      for (i = 0; i < nr_prims; i++) {
-	 /* GS doesn't get enough information to know when to reset
-	  * the stipple counter?!?
-	  */
-	 if (prim[i].mode == GL_LINE_LOOP || prim[i].mode == GL_LINE_STRIP) 
-	    return GL_TRUE;
-	    
-	 if (prim[i].mode == GL_POLYGON &&
-	     (ctx->Polygon.FrontMode == GL_LINE ||
-	      ctx->Polygon.BackMode == GL_LINE))
-	    return GL_TRUE;
-      }
-   }
-
-   if (ctx->Point.SmoothFlag) {
-      for (i = 0; i < nr_prims; i++)
-	 if (prim[i].mode == GL_POINTS) 
-	    return GL_TRUE;
-   }
-
-   /* BRW hardware doesn't handle GL_CLAMP texturing correctly;
-    * brw_wm_sampler_state:translate_wrap_mode() treats GL_CLAMP
-    * as GL_CLAMP_TO_EDGE instead.  If we're using GL_CLAMP, and
-    * we want strict conformance, force the fallback.
-    * Right now, we only do this for 2D textures.
-    */
-   {
-      int u;
-      for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
-         struct gl_texture_unit *texUnit = &ctx->Texture.Unit[u];
-         if (texUnit->Enabled) {
-            if (texUnit->Enabled & TEXTURE_1D_BIT) {
-               if (texUnit->CurrentTex[TEXTURE_1D_INDEX]->WrapS == GL_CLAMP) {
-                   return GL_TRUE;
-               }
-            }
-            if (texUnit->Enabled & TEXTURE_2D_BIT) {
-               if (texUnit->CurrentTex[TEXTURE_2D_INDEX]->WrapS == GL_CLAMP ||
-                   texUnit->CurrentTex[TEXTURE_2D_INDEX]->WrapT == GL_CLAMP) {
-                   return GL_TRUE;
-               }
-            }
-            if (texUnit->Enabled & TEXTURE_3D_BIT) {
-               if (texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapS == GL_CLAMP ||
-                   texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapT == GL_CLAMP ||
-                   texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapR == GL_CLAMP) {
-                   return GL_TRUE;
-               }
-            }
-         }
-      }
-   }
-      
-   /* Nothing stopping us from the fast path now */
-   return GL_FALSE;
-}
-
 /* May fail if out of video memory for texture or vbo upload, or on
  * fallback conditions.
  */
@@ -308,23 +198,12 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
    GLboolean retval = GL_FALSE;
    GLboolean warn = GL_FALSE;
    GLboolean first_time = GL_TRUE;
+   uint32_t hw_prim;
    GLuint i;
 
    if (ctx->NewState)
       _mesa_update_state( ctx );
 
-   /* We have to validate the textures *before* checking for fallbacks;
-    * otherwise, the software fallback won't be able to rely on the
-    * texture state, the firstLevel and lastLevel fields won't be
-    * set in the intel texture object (they'll both be 0), and the 
-    * software fallback will segfault if it attempts to access any
-    * texture level other than level 0.
-    */
-   brw_validate_textures( brw );
-
-   if (check_fallbacks(brw, prim, nr_prims))
-      return GL_FALSE;
-
    /* Bind all inputs, derive varying and size information:
     */
    brw_merge_inputs( brw, arrays );
@@ -336,90 +215,30 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
    brw->vb.max_index = max_index;
    brw->state.dirty.brw |= BRW_NEW_VERTICES;
 
-   /* Have to validate state quite late.  Will rebuild tnl_program,
-    * which depends on varying information.  
-    * 
-    * Note this is where brw->vs->prog_data.inputs_read is calculated,
-    * so can't access it earlier.
-    */
-
-   LOCK_HARDWARE(intel);
-
-   if (!intel->constant_cliprect && intel->driDrawable->numClipRects == 0) {
-      UNLOCK_HARDWARE(intel);
-      return GL_TRUE;
-   }
-
-   for (i = 0; i < nr_prims; i++) {
-      uint32_t hw_prim;
-
-      /* Flush the batch if it's approaching full, so that we don't wrap while
-       * we've got validated state that needs to be in the same batch as the
-       * primitives.  This fraction is just a guess (minimal full state plus
-       * a primitive is around 512 bytes), and would be better if we had
-       * an upper bound of how much we might emit in a single
-       * brw_try_draw_prims().
-       */
-      intel_batchbuffer_require_space(intel->batch, intel->batch->size / 4,
-				      LOOP_CLIPRECTS);
-
-      hw_prim = brw_set_prim(brw, prim[i].mode);
-
-      if (first_time || (brw->state.dirty.brw & BRW_NEW_PRIMITIVE)) {
-	 first_time = GL_FALSE;
-
-	 brw_validate_state(brw);
-
-	 /* Various fallback checks:  */
-	 if (brw->intel.Fallback)
-	    goto out;
-
-	 /* Check that we can fit our state in with our existing batchbuffer, or
-	  * flush otherwise.
-	  */
-	 if (dri_bufmgr_check_aperture_space(brw->state.validated_bos,
-					     brw->state.validated_bo_count)) {
-	    static GLboolean warned;
-	    intel_batchbuffer_flush(intel->batch);
-
-	    /* Validate the state after we flushed the batch (which would have
-	     * changed the set of dirty state).  If we still fail to
-	     * check_aperture, warn of what's happening, but attempt to continue
-	     * on since it may succeed anyway, and the user would probably rather
-	     * see a failure and a warning than a fallback.
-	     */
-	    brw_validate_state(brw);
-	    if (!warned &&
-		dri_bufmgr_check_aperture_space(brw->state.validated_bos,
-						brw->state.validated_bo_count)) {
-	       warn = GL_TRUE;
-	       warned = GL_TRUE;
-	    }
-	 }
-
-	 brw_upload_state(brw);
-      }
+   hw_prim = brw_set_prim(brw, prim[i].mode);
 
-      brw_emit_prim(brw, &prim[i], hw_prim);
+   brw_validate_state(brw);
 
-      retval = GL_TRUE;
-   }
+   /* Check that we can fit our state in with our existing batchbuffer, or
+    * flush otherwise.
+    */
+   ret = dri_bufmgr_check_aperture_space(brw->state.validated_bos,
+					 brw->state.validated_bo_count);
+   if (ret)
+      return ret;
+
+   ret = brw_upload_state(brw);
+   if (ret)
+      return ret;
+   
+   ret = brw_emit_prim(brw, &prim[i], hw_prim);
+   if (ret)
+      return ret;
 
    if (intel->always_flush_batch)
       intel_batchbuffer_flush(intel->batch);
- out:
-   UNLOCK_HARDWARE(intel);
-
-   brw_state_cache_check_size(brw);
-
-   if (warn)
-      fprintf(stderr, "i965: Single primitive emit potentially exceeded "
-	      "available aperture space\n");
 
-   if (!retval)
-      DBG("%s failed\n", __FUNCTION__);
-
-   return retval;
+   return 0;
 }
 
 void brw_draw_prims( GLcontext *ctx,
@@ -431,37 +250,26 @@ void brw_draw_prims( GLcontext *ctx,
 		     GLuint min_index,
 		     GLuint max_index )
 {
-   GLboolean retval;
+   enum pipe_error ret;
 
    if (!vbo_all_varyings_in_vbos(arrays)) {
       if (!index_bounds_valid)
 	 vbo_get_minmax_index(ctx, prim, ib, &min_index, &max_index);
-
-      /* Decide if we want to rebase.  If so we end up recursing once
-       * only into this function.
-       */
-      if (min_index != 0) {
-	 vbo_rebase_prims(ctx, arrays,
-			  prim, nr_prims,
-			  ib, min_index, max_index,
-			  brw_draw_prims );
-	 return;
-      }
    }
 
    /* Make a first attempt at drawing:
     */
-   retval = brw_try_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
+   ret = brw_try_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
 
    /* Otherwise, we really are out of memory.  Pass the drawing
     * command to the software tnl module and which will in turn call
     * swrast to do the drawing.
     */
-   if (!retval) {
-       _swsetup_Wakeup(ctx);
-      _tnl_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
+   if (ret != 0) {
+      intel_batchbuffer_flush(intel->batch);
+      ret = brw_try_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
+      assert(ret == 0);
    }
-
 }
 
 void brw_draw_init( struct brw_context *brw )
diff --git a/src/gallium/drivers/i965/brw_draw_upload.c b/src/gallium/drivers/i965/brw_draw_upload.c
index a3ff6c58d8..ad3ef6b7dd 100644
--- a/src/gallium/drivers/i965/brw_draw_upload.c
+++ b/src/gallium/drivers/i965/brw_draw_upload.c
@@ -25,13 +25,9 @@
  * 
  **************************************************************************/
 
+#include "pipe/p_context.h"
 
-#include "main/glheader.h"
-#include "main/bufferobj.h"
-#include "main/context.h"
-#include "main/state.h"
-#include "main/api_validate.h"
-#include "main/enums.h"
+#include "util/u_upload_mgr.h"
 
 #include "brw_draw.h"
 #include "brw_defines.h"
@@ -43,303 +39,157 @@
 #include "intel_buffer_objects.h"
 #include "intel_tex.h"
 
-static GLuint double_types[5] = {
-   0,
-   BRW_SURFACEFORMAT_R64_FLOAT,
-   BRW_SURFACEFORMAT_R64G64_FLOAT,
-   BRW_SURFACEFORMAT_R64G64B64_FLOAT,
-   BRW_SURFACEFORMAT_R64G64B64A64_FLOAT
-};
-
-static GLuint float_types[5] = {
-   0,
-   BRW_SURFACEFORMAT_R32_FLOAT,
-   BRW_SURFACEFORMAT_R32G32_FLOAT,
-   BRW_SURFACEFORMAT_R32G32B32_FLOAT,
-   BRW_SURFACEFORMAT_R32G32B32A32_FLOAT
-};
-
-static GLuint uint_types_norm[5] = {
-   0,
-   BRW_SURFACEFORMAT_R32_UNORM,
-   BRW_SURFACEFORMAT_R32G32_UNORM,
-   BRW_SURFACEFORMAT_R32G32B32_UNORM,
-   BRW_SURFACEFORMAT_R32G32B32A32_UNORM
-};
-
-static GLuint uint_types_scale[5] = {
-   0,
-   BRW_SURFACEFORMAT_R32_USCALED,
-   BRW_SURFACEFORMAT_R32G32_USCALED,
-   BRW_SURFACEFORMAT_R32G32B32_USCALED,
-   BRW_SURFACEFORMAT_R32G32B32A32_USCALED
-};
-
-static GLuint int_types_norm[5] = {
-   0,
-   BRW_SURFACEFORMAT_R32_SNORM,
-   BRW_SURFACEFORMAT_R32G32_SNORM,
-   BRW_SURFACEFORMAT_R32G32B32_SNORM,
-   BRW_SURFACEFORMAT_R32G32B32A32_SNORM
-};
-
-static GLuint int_types_scale[5] = {
-   0,
-   BRW_SURFACEFORMAT_R32_SSCALED,
-   BRW_SURFACEFORMAT_R32G32_SSCALED,
-   BRW_SURFACEFORMAT_R32G32B32_SSCALED,
-   BRW_SURFACEFORMAT_R32G32B32A32_SSCALED
-};
-
-static GLuint ushort_types_norm[5] = {
-   0,
-   BRW_SURFACEFORMAT_R16_UNORM,
-   BRW_SURFACEFORMAT_R16G16_UNORM,
-   BRW_SURFACEFORMAT_R16G16B16_UNORM,
-   BRW_SURFACEFORMAT_R16G16B16A16_UNORM
-};
-
-static GLuint ushort_types_scale[5] = {
-   0,
-   BRW_SURFACEFORMAT_R16_USCALED,
-   BRW_SURFACEFORMAT_R16G16_USCALED,
-   BRW_SURFACEFORMAT_R16G16B16_USCALED,
-   BRW_SURFACEFORMAT_R16G16B16A16_USCALED
-};
-
-static GLuint short_types_norm[5] = {
-   0,
-   BRW_SURFACEFORMAT_R16_SNORM,
-   BRW_SURFACEFORMAT_R16G16_SNORM,
-   BRW_SURFACEFORMAT_R16G16B16_SNORM,
-   BRW_SURFACEFORMAT_R16G16B16A16_SNORM
-};
-
-static GLuint short_types_scale[5] = {
-   0,
-   BRW_SURFACEFORMAT_R16_SSCALED,
-   BRW_SURFACEFORMAT_R16G16_SSCALED,
-   BRW_SURFACEFORMAT_R16G16B16_SSCALED,
-   BRW_SURFACEFORMAT_R16G16B16A16_SSCALED
-};
 
-static GLuint ubyte_types_norm[5] = {
-   0,
-   BRW_SURFACEFORMAT_R8_UNORM,
-   BRW_SURFACEFORMAT_R8G8_UNORM,
-   BRW_SURFACEFORMAT_R8G8B8_UNORM,
-   BRW_SURFACEFORMAT_R8G8B8A8_UNORM
-};
 
-static GLuint ubyte_types_scale[5] = {
-   0,
-   BRW_SURFACEFORMAT_R8_USCALED,
-   BRW_SURFACEFORMAT_R8G8_USCALED,
-   BRW_SURFACEFORMAT_R8G8B8_USCALED,
-   BRW_SURFACEFORMAT_R8G8B8A8_USCALED
-};
-
-static GLuint byte_types_norm[5] = {
-   0,
-   BRW_SURFACEFORMAT_R8_SNORM,
-   BRW_SURFACEFORMAT_R8G8_SNORM,
-   BRW_SURFACEFORMAT_R8G8B8_SNORM,
-   BRW_SURFACEFORMAT_R8G8B8A8_SNORM
-};
 
-static GLuint byte_types_scale[5] = {
-   0,
-   BRW_SURFACEFORMAT_R8_SSCALED,
-   BRW_SURFACEFORMAT_R8G8_SSCALED,
-   BRW_SURFACEFORMAT_R8G8B8_SSCALED,
-   BRW_SURFACEFORMAT_R8G8B8A8_SSCALED
-};
-
-
-/**
- * Given vertex array type/size/format/normalized info, return
- * the appopriate hardware surface type.
- * Format will be GL_RGBA or possibly GL_BGRA for GLubyte[4] color arrays.
- */
-static GLuint get_surface_type( GLenum type, GLuint size,
-                                GLenum format, GLboolean normalized )
+unsigned brw_translate_surface_format( unsigned id )
 {
-   if (INTEL_DEBUG & DEBUG_VERTS)
-      _mesa_printf("type %s size %d normalized %d\n", 
-		   _mesa_lookup_enum_by_nr(type), size, normalized);
-
-   if (normalized) {
-      switch (type) {
-      case GL_DOUBLE: return double_types[size];
-      case GL_FLOAT: return float_types[size];
-      case GL_INT: return int_types_norm[size];
-      case GL_SHORT: return short_types_norm[size];
-      case GL_BYTE: return byte_types_norm[size];
-      case GL_UNSIGNED_INT: return uint_types_norm[size];
-      case GL_UNSIGNED_SHORT: return ushort_types_norm[size];
-      case GL_UNSIGNED_BYTE:
-         if (format == GL_BGRA) {
-            /* See GL_EXT_vertex_array_bgra */
-            assert(size == 4);
-            return BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
-         }
-         else {
-            return ubyte_types_norm[size];
-         }
-      default: assert(0); return 0;
-      }      
-   }
-   else {
-      assert(format == GL_RGBA); /* sanity check */
-      switch (type) {
-      case GL_DOUBLE: return double_types[size];
-      case GL_FLOAT: return float_types[size];
-      case GL_INT: return int_types_scale[size];
-      case GL_SHORT: return short_types_scale[size];
-      case GL_BYTE: return byte_types_scale[size];
-      case GL_UNSIGNED_INT: return uint_types_scale[size];
-      case GL_UNSIGNED_SHORT: return ushort_types_scale[size];
-      case GL_UNSIGNED_BYTE: return ubyte_types_scale[size];
-      default: assert(0); return 0;
-      }      
+   switch (id) {
+   case PIPE_FORMAT_R64_FLOAT:
+      return BRW_SURFACEFORMAT_R64_FLOAT;
+   case PIPE_FORMAT_R64G64_FLOAT:
+      return BRW_SURFACEFORMAT_R64G64_FLOAT;
+   case PIPE_FORMAT_R64G64B64_FLOAT:
+      return BRW_SURFACEFORMAT_R64G64B64_FLOAT;
+   case PIPE_FORMAT_R64G64B64A64_FLOAT:
+      return BRW_SURFACEFORMAT_R64G64B64A64_FLOAT;
+
+   case PIPE_FORMAT_R32_FLOAT:
+      return BRW_SURFACEFORMAT_R32_FLOAT;
+   case PIPE_FORMAT_R32G32_FLOAT:
+      return BRW_SURFACEFORMAT_R32G32_FLOAT;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      return BRW_SURFACEFORMAT_R32G32B32_FLOAT;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      return BRW_SURFACEFORMAT_R32G32B32A32_FLOAT;
+
+   case PIPE_FORMAT_R32_UNORM:
+      return BRW_SURFACEFORMAT_R32_UNORM;
+   case PIPE_FORMAT_R32G32_UNORM:
+      return BRW_SURFACEFORMAT_R32G32_UNORM;
+   case PIPE_FORMAT_R32G32B32_UNORM:
+      return BRW_SURFACEFORMAT_R32G32B32_UNORM;
+   case PIPE_FORMAT_R32G32B32A32_UNORM:
+      return BRW_SURFACEFORMAT_R32G32B32A32_UNORM;
+
+   case PIPE_FORMAT_R32_USCALED:
+      return BRW_SURFACEFORMAT_R32_USCALED;
+   case PIPE_FORMAT_R32G32_USCALED:
+      return BRW_SURFACEFORMAT_R32G32_USCALED;
+   case PIPE_FORMAT_R32G32B32_USCALED:
+      return BRW_SURFACEFORMAT_R32G32B32_USCALED;
+   case PIPE_FORMAT_R32G32B32A32_USCALED:
+      return BRW_SURFACEFORMAT_R32G32B32A32_USCALED;
+
+   case PIPE_FORMAT_R32_SNORM:
+      return BRW_SURFACEFORMAT_R32_SNORM;
+   case PIPE_FORMAT_R32G32_SNORM:
+      return BRW_SURFACEFORMAT_R32G32_SNORM;
+   case PIPE_FORMAT_R32G32B32_SNORM:
+      return BRW_SURFACEFORMAT_R32G32B32_SNORM;
+   case PIPE_FORMAT_R32G32B32A32_SNORM:
+      return BRW_SURFACEFORMAT_R32G32B32A32_SNORM;
+
+   case PIPE_FORMAT_R32_SSCALED:
+      return BRW_SURFACEFORMAT_R32_SSCALED;
+   case PIPE_FORMAT_R32G32_SSCALED:
+      return BRW_SURFACEFORMAT_R32G32_SSCALED;
+   case PIPE_FORMAT_R32G32B32_SSCALED:
+      return BRW_SURFACEFORMAT_R32G32B32_SSCALED;
+   case PIPE_FORMAT_R32G32B32A32_SSCALED:
+      return BRW_SURFACEFORMAT_R32G32B32A32_SSCALED;
+
+   case PIPE_FORMAT_R16_UNORM:
+      return BRW_SURFACEFORMAT_R16_UNORM;
+   case PIPE_FORMAT_R16G16_UNORM:
+      return BRW_SURFACEFORMAT_R16G16_UNORM;
+   case PIPE_FORMAT_R16G16B16_UNORM:
+      return BRW_SURFACEFORMAT_R16G16B16_UNORM;
+   case PIPE_FORMAT_R16G16B16A16_UNORM:
+      return BRW_SURFACEFORMAT_R16G16B16A16_UNORM;
+
+   case PIPE_FORMAT_R16_USCALED:
+      return BRW_SURFACEFORMAT_R16_USCALED;
+   case PIPE_FORMAT_R16G16_USCALED:
+      return BRW_SURFACEFORMAT_R16G16_USCALED;
+   case PIPE_FORMAT_R16G16B16_USCALED:
+      return BRW_SURFACEFORMAT_R16G16B16_USCALED;
+   case PIPE_FORMAT_R16G16B16A16_USCALED:
+      return BRW_SURFACEFORMAT_R16G16B16A16_USCALED;
+
+   case PIPE_FORMAT_R16_SNORM:
+      return BRW_SURFACEFORMAT_R16_SNORM;
+   case PIPE_FORMAT_R16G16_SNORM:
+      return BRW_SURFACEFORMAT_R16G16_SNORM;
+   case PIPE_FORMAT_R16G16B16_SNORM:
+      return BRW_SURFACEFORMAT_R16G16B16_SNORM;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      return BRW_SURFACEFORMAT_R16G16B16A16_SNORM;
+
+   case PIPE_FORMAT_R16_SSCALED:
+      return BRW_SURFACEFORMAT_R16_SSCALED;
+   case PIPE_FORMAT_R16G16_SSCALED:
+      return BRW_SURFACEFORMAT_R16G16_SSCALED;
+   case PIPE_FORMAT_R16G16B16_SSCALED:
+      return BRW_SURFACEFORMAT_R16G16B16_SSCALED;
+   case PIPE_FORMAT_R16G16B16A16_SSCALED:
+      return BRW_SURFACEFORMAT_R16G16B16A16_SSCALED;
+
+   case PIPE_FORMAT_R8_UNORM:
+      return BRW_SURFACEFORMAT_R8_UNORM;
+   case PIPE_FORMAT_R8G8_UNORM:
+      return BRW_SURFACEFORMAT_R8G8_UNORM;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      return BRW_SURFACEFORMAT_R8G8B8_UNORM;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      return BRW_SURFACEFORMAT_R8G8B8A8_UNORM;
+
+   case PIPE_FORMAT_R8_USCALED:
+      return BRW_SURFACEFORMAT_R8_USCALED;
+   case PIPE_FORMAT_R8G8_USCALED:
+      return BRW_SURFACEFORMAT_R8G8_USCALED;
+   case PIPE_FORMAT_R8G8B8_USCALED:
+      return BRW_SURFACEFORMAT_R8G8B8_USCALED;
+   case PIPE_FORMAT_R8G8B8A8_USCALED:
+      return BRW_SURFACEFORMAT_R8G8B8A8_USCALED;
+
+   case PIPE_FORMAT_R8_SNORM:
+      return BRW_SURFACEFORMAT_R8_SNORM;
+   case PIPE_FORMAT_R8G8_SNORM:
+      return BRW_SURFACEFORMAT_R8G8_SNORM;
+   case PIPE_FORMAT_R8G8B8_SNORM:
+      return BRW_SURFACEFORMAT_R8G8B8_SNORM;
+   case PIPE_FORMAT_R8G8B8A8_SNORM:
+      return BRW_SURFACEFORMAT_R8G8B8A8_SNORM;
+
+   case PIPE_FORMAT_R8_SSCALED:
+      return BRW_SURFACEFORMAT_R8_SSCALED;
+   case PIPE_FORMAT_R8G8_SSCALED:
+      return BRW_SURFACEFORMAT_R8G8_SSCALED;
+   case PIPE_FORMAT_R8G8B8_SSCALED:
+      return BRW_SURFACEFORMAT_R8G8B8_SSCALED;
+   case PIPE_FORMAT_R8G8B8A8_SSCALED:
+      return BRW_SURFACEFORMAT_R8G8B8A8_SSCALED;
+
+   default:
+      assert(0);
+      return 0;
    }
 }
 
-
-static GLuint get_size( GLenum type )
-{
-   switch (type) {
-   case GL_DOUBLE: return sizeof(GLdouble);
-   case GL_FLOAT: return sizeof(GLfloat);
-   case GL_INT: return sizeof(GLint);
-   case GL_SHORT: return sizeof(GLshort);
-   case GL_BYTE: return sizeof(GLbyte);
-   case GL_UNSIGNED_INT: return sizeof(GLuint);
-   case GL_UNSIGNED_SHORT: return sizeof(GLushort);
-   case GL_UNSIGNED_BYTE: return sizeof(GLubyte);
-   default: return 0;
-   }      
-}
-
-static GLuint get_index_type(GLenum type) 
+static unsigned get_index_type(int type)
 {
    switch (type) {
-   case GL_UNSIGNED_BYTE:  return BRW_INDEX_BYTE;
-   case GL_UNSIGNED_SHORT: return BRW_INDEX_WORD;
-   case GL_UNSIGNED_INT:   return BRW_INDEX_DWORD;
+   case 1: return BRW_INDEX_BYTE;
+   case 2: return BRW_INDEX_WORD;
+   case 4: return BRW_INDEX_DWORD;
    default: assert(0); return 0;
    }
 }
 
-static void wrap_buffers( struct brw_context *brw,
-			  GLuint size )
-{
-   if (size < BRW_UPLOAD_INIT_SIZE)
-      size = BRW_UPLOAD_INIT_SIZE;
-
-   brw->vb.upload.offset = 0;
-
-   if (brw->vb.upload.bo != NULL)
-      dri_bo_unreference(brw->vb.upload.bo);
-   brw->vb.upload.bo = dri_bo_alloc(brw->intel.bufmgr, "temporary VBO",
-				    size, 1);
-
-   /* Set the internal VBO\ to no-backing-store.  We only use them as a
-    * temporary within a brw_try_draw_prims while the lock is held.
-    */
-   /* DON'T DO THIS AS IF WE HAVE TO RE-ORG MEMORY WE NEED SOMEWHERE WITH
-      FAKE TO PUSH THIS STUFF */
-//   if (!brw->intel.ttm)
-//      dri_bo_fake_disable_backing_store(brw->vb.upload.bo, NULL, NULL);
-}
-
-static void get_space( struct brw_context *brw,
-		       GLuint size,
-		       dri_bo **bo_return,
-		       GLuint *offset_return )
-{
-   size = ALIGN(size, 64);
-
-   if (brw->vb.upload.bo == NULL ||
-       brw->vb.upload.offset + size > brw->vb.upload.bo->size) {
-      wrap_buffers(brw, size);
-   }
-
-   assert(*bo_return == NULL);
-   dri_bo_reference(brw->vb.upload.bo);
-   *bo_return = brw->vb.upload.bo;
-   *offset_return = brw->vb.upload.offset;
-   brw->vb.upload.offset += size;
-}
-
-static void
-copy_array_to_vbo_array( struct brw_context *brw,
-			 struct brw_vertex_element *element,
-			 GLuint dst_stride)
-{
-   struct intel_context *intel = &brw->intel;
-   GLuint size = element->count * dst_stride;
-
-   get_space(brw, size, &element->bo, &element->offset);
 
-   if (element->glarray->StrideB == 0) {
-      assert(element->count == 1);
-      element->stride = 0;
-   } else {
-      element->stride = dst_stride;
-   }
-
-   if (dst_stride == element->glarray->StrideB) {
-      if (intel->intelScreen->kernel_exec_fencing) {
-	 drm_intel_gem_bo_map_gtt(element->bo);
-	 memcpy((char *)element->bo->virtual + element->offset,
-		element->glarray->Ptr, size);
-	 drm_intel_gem_bo_unmap_gtt(element->bo);
-      } else {
-	 dri_bo_subdata(element->bo,
-			element->offset,
-			size,
-			element->glarray->Ptr);
-      }
-   } else {
-      char *dest;
-      const unsigned char *src = element->glarray->Ptr;
-      int i;
-
-      if (intel->intelScreen->kernel_exec_fencing) {
-	 drm_intel_gem_bo_map_gtt(element->bo);
-	 dest = element->bo->virtual;
-	 dest += element->offset;
-
-	 for (i = 0; i < element->count; i++) {
-	    memcpy(dest, src, dst_stride);
-	    src += element->glarray->StrideB;
-	    dest += dst_stride;
-	 }
-
-	 drm_intel_gem_bo_unmap_gtt(element->bo);
-      } else {
-	 void *data;
-
-	 data = _mesa_malloc(dst_stride * element->count);
-	 dest = data;
-	 for (i = 0; i < element->count; i++) {
-	    memcpy(dest, src, dst_stride);
-	    src += element->glarray->StrideB;
-	    dest += dst_stride;
-	 }
-
-	 dri_bo_subdata(element->bo,
-			element->offset,
-			size,
-			data);
-
-	 _mesa_free(data);
-      }
-   }
-}
 
-static void brw_prepare_vertices(struct brw_context *brw)
+static boolean brw_prepare_vertices(struct brw_context *brw)
 {
    GLcontext *ctx = &brw->intel.ctx;
    struct intel_context *intel = intel_context(ctx);
@@ -358,123 +208,38 @@ static void brw_prepare_vertices(struct brw_context *brw)
    if (0)
       _mesa_printf("%s %d..%d\n", __FUNCTION__, min_index, max_index);
 
-   /* Accumulate the list of enabled arrays. */
-   brw->vb.nr_enabled = 0;
-   while (vs_inputs) {
-      GLuint i = _mesa_ffsll(vs_inputs) - 1;
-      struct brw_vertex_element *input = &brw->vb.inputs[i];
 
-      vs_inputs &= ~(1 << i);
-      brw->vb.enabled[brw->vb.nr_enabled++] = input;
-   }
-
-   /* XXX: In the rare cases where this happens we fallback all
-    * the way to software rasterization, although a tnl fallback
-    * would be sufficient.  I don't know of *any* real world
-    * cases with > 17 vertex attributes enabled, so it probably
-    * isn't an issue at this point.
-    */
-   if (brw->vb.nr_enabled >= BRW_VEP_MAX) {
-      intel->Fallback = 1;
-      return;
-   }
 
    for (i = 0; i < brw->vb.nr_enabled; i++) {
       struct brw_vertex_element *input = brw->vb.enabled[i];
 
       input->element_size = get_size(input->glarray->Type) * input->glarray->Size;
 
-      if (_mesa_is_bufferobj(input->glarray->BufferObj)) {
-	 struct intel_buffer_object *intel_buffer =
-	    intel_buffer_object(input->glarray->BufferObj);
-
-	 /* Named buffer object: Just reference its contents directly. */
-	 dri_bo_unreference(input->bo);
-	 input->bo = intel_bufferobj_buffer(intel, intel_buffer,
-					    INTEL_READ);
-	 dri_bo_reference(input->bo);
-	 input->offset = (unsigned long)input->glarray->Ptr;
-	 input->stride = input->glarray->StrideB;
-	 input->count = input->glarray->_MaxElement;
-
-	 /* This is a common place to reach if the user mistakenly supplies
-	  * a pointer in place of a VBO offset.  If we just let it go through,
-	  * we may end up dereferencing a pointer beyond the bounds of the
-	  * GTT.  We would hope that the VBO's max_index would save us, but
-	  * Mesa appears to hand us min/max values not clipped to the
-	  * array object's _MaxElement, and _MaxElement frequently appears
-	  * to be wrong anyway.
-	  *
-	  * The VBO spec allows application termination in this case, and it's
-	  * probably a service to the poor programmer to do so rather than
-	  * trying to just not render.
-	  */
-	 assert(input->offset < input->bo->size);
-      } else {
-	 input->count = input->glarray->StrideB ? max_index + 1 - min_index : 1;
-	 if (input->bo != NULL) {
-	    /* Already-uploaded vertex data is present from a previous
-	     * prepare_vertices, but we had to re-validate state due to
-	     * check_aperture failing and a new batch being produced.
-	     */
-	    continue;
-	 }
-
-	 /* Queue the buffer object up to be uploaded in the next pass,
-	  * when we've decided if we're doing interleaved or not.
-	  */
-	 if (input->attrib == VERT_ATTRIB_POS) {
-	    /* Position array not properly enabled:
-	     */
-            if (input->glarray->StrideB == 0) {
-               intel->Fallback = 1;
-               return;
-            }
-
-	    interleave = input->glarray->StrideB;
-	    ptr = input->glarray->Ptr;
-	 }
-	 else if (interleave != input->glarray->StrideB ||
-		  (const unsigned char *)input->glarray->Ptr - ptr < 0 ||
-		  (const unsigned char *)input->glarray->Ptr - ptr > interleave)
-	 {
-	    interleave = 0;
-	 }
-
-	 upload[nr_uploads++] = input;
-	 
-	 /* We rebase drawing to start at element zero only when
-	  * varyings are not in vbos, which means we can end up
-	  * uploading non-varying arrays (stride != 0) when min_index
-	  * is zero.  This doesn't matter as the amount to upload is
-	  * the same for these arrays whether the draw call is rebased
-	  * or not - we just have to upload the one element.
-	  */
-	 assert(min_index == 0 || input->glarray->StrideB == 0);
-      }
-   }
-
-   /* Handle any arrays to be uploaded. */
-   if (nr_uploads > 1 && interleave && interleave <= 256) {
-      /* All uploads are interleaved, so upload the arrays together as
-       * interleaved.  First, upload the contents and set up upload[0].
-       */
-      copy_array_to_vbo_array(brw, upload[0], interleave);
-
-      for (i = 1; i < nr_uploads; i++) {
-	 /* Then, just point upload[i] at upload[0]'s buffer. */
-	 upload[i]->stride = interleave;
-	 upload[i]->offset = upload[0]->offset +
-	    ((const unsigned char *)upload[i]->glarray->Ptr - ptr);
-	 upload[i]->bo = upload[0]->bo;
-	 dri_bo_reference(upload[i]->bo);
+      if (brw_is_user_buffer(vb)) {
+	 u_upload_buffer( brw->upload, 
+			  min_index * vb->stride,
+			  (max_index + 1 - min_index) * vb->stride,
+			  &offset,
+			  &buffer );
       }
-   }
-   else {
-      /* Upload non-interleaved arrays */
-      for (i = 0; i < nr_uploads; i++) {
-          copy_array_to_vbo_array(brw, upload[i], upload[i]->element_size);
+      else
+      {
+	 offset = 0;
+	 buffer = vb->buffer;
+	 count = stride == 0 ? 1 : max_index + 1 - min_index;
       }
+
+      /* Named buffer object: Just reference its contents directly. */
+      dri_bo_unreference(input->bo);
+      input->bo = intel_bufferobj_buffer(intel, intel_buffer,
+					 INTEL_READ);
+      dri_bo_reference(input->bo);
+
+      input->offset = (unsigned long)offset;
+      input->stride = vb->stride;
+      input->count = count;
+
+      assert(input->offset < input->bo->size);
    }
 
    brw_prepare_query_begin(brw);
@@ -632,13 +397,8 @@ static void brw_prepare_indices(struct brw_context *brw)
 
       /* Straight upload
        */
-      if (intel->intelScreen->kernel_exec_fencing) {
-	 drm_intel_gem_bo_map_gtt(bo);
-	 memcpy((char *)bo->virtual + offset, index_buffer->ptr, ib_size);
-	 drm_intel_gem_bo_unmap_gtt(bo);
-      } else {
-	 dri_bo_subdata(bo, offset, ib_size, index_buffer->ptr);
-      }
+      brw_bo_subdata(bo, offset, ib_size, index_buffer->ptr);
+
    } else {
       offset = (GLuint) (unsigned long) index_buffer->ptr;
       brw->ib.start_vertex_offset = 0;
diff --git a/src/gallium/drivers/i965/brw_gs.c b/src/gallium/drivers/i965/brw_gs.c
index 48c2b9a41c..5ec0c585fe 100644
--- a/src/gallium/drivers/i965/brw_gs.c
+++ b/src/gallium/drivers/i965/brw_gs.c
@@ -58,7 +58,7 @@ static void compile_gs_prog( struct brw_context *brw,
    /* Need to locate the two positions present in vertex + header.
     * These are currently hardcoded:
     */
-   c.nr_attrs = brw_count_bits(c.key.attrs);
+   c.nr_attrs = util_count_bits(c.key.attrs);
 
    if (BRW_IS_IGDNG(brw))
        c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
diff --git a/src/gallium/drivers/i965/brw_pipe_blend.c b/src/gallium/drivers/i965/brw_pipe_blend.c
new file mode 100644
index 0000000000..b351794dce
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_blend.c
@@ -0,0 +1,41 @@
+
+   /* _NEW_COLOR */
+   if (key->logic_op != GL_COPY) {
+      cc.cc2.logicop_enable = 1;
+      cc.cc5.logicop_func = intel_translate_logic_op(key->logic_op);
+   } else if (key->color_blend) {
+      GLenum eqRGB = key->blend_eq_rgb;
+      GLenum eqA = key->blend_eq_a;
+      GLenum srcRGB = key->blend_src_rgb;
+      GLenum dstRGB = key->blend_dst_rgb;
+      GLenum srcA = key->blend_src_a;
+      GLenum dstA = key->blend_dst_a;
+
+      if (eqRGB == GL_MIN || eqRGB == GL_MAX) {
+	 srcRGB = dstRGB = GL_ONE;
+      }
+
+      if (eqA == GL_MIN || eqA == GL_MAX) {
+	 srcA = dstA = GL_ONE;
+      }
+
+      cc.cc6.dest_blend_factor = brw_translate_blend_factor(dstRGB);
+      cc.cc6.src_blend_factor = brw_translate_blend_factor(srcRGB);
+      cc.cc6.blend_function = brw_translate_blend_equation(eqRGB);
+
+      cc.cc5.ia_dest_blend_factor = brw_translate_blend_factor(dstA);
+      cc.cc5.ia_src_blend_factor = brw_translate_blend_factor(srcA);
+      cc.cc5.ia_blend_function = brw_translate_blend_equation(eqA);
+
+      cc.cc3.blend_enable = 1;
+      cc.cc3.ia_blend_enable = (srcA != srcRGB ||
+				dstA != dstRGB ||
+				eqA != eqRGB);
+   }
+
+   if (key->dither) {
+      cc.cc5.dither_enable = 1;
+      cc.cc6.y_dither_offset = 0;
+      cc.cc6.x_dither_offset = 0;
+   }
+
diff --git a/src/gallium/drivers/i965/brw_pipe_debug.c b/src/gallium/drivers/i965/brw_pipe_debug.c
new file mode 100644
index 0000000000..34d6d4028a
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_debug.c
@@ -0,0 +1,2 @@
+   if (INTEL_DEBUG & DEBUG_STATS)
+      cc.cc5.statistics_enable = 1;
diff --git a/src/gallium/drivers/i965/brw_pipe_depth.c b/src/gallium/drivers/i965/brw_pipe_depth.c
new file mode 100644
index 0000000000..da29bc8bcb
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_depth.c
@@ -0,0 +1,52 @@
+   /* _NEW_STENCIL */
+   if (key->dsa.stencil[0].enable) {
+      cc.cc0.stencil_enable = 1;
+      cc.cc0.stencil_func =
+	 intel_translate_compare_func(key->stencil_func[0]);
+      cc.cc0.stencil_fail_op =
+	 intel_translate_stencil_op(key->stencil_fail_op[0]);
+      cc.cc0.stencil_pass_depth_fail_op =
+	 intel_translate_stencil_op(key->stencil_pass_depth_fail_op[0]);
+      cc.cc0.stencil_pass_depth_pass_op =
+	 intel_translate_stencil_op(key->stencil_pass_depth_pass_op[0]);
+      cc.cc1.stencil_ref = key->stencil_ref[0];
+      cc.cc1.stencil_write_mask = key->stencil_write_mask[0];
+      cc.cc1.stencil_test_mask = key->stencil_test_mask[0];
+
+      if (key->stencil_two_side) {
+	 cc.cc0.bf_stencil_enable = 1;
+	 cc.cc0.bf_stencil_func =
+	    intel_translate_compare_func(key->stencil_func[1]);
+	 cc.cc0.bf_stencil_fail_op =
+	    intel_translate_stencil_op(key->stencil_fail_op[1]);
+	 cc.cc0.bf_stencil_pass_depth_fail_op =
+	    intel_translate_stencil_op(key->stencil_pass_depth_fail_op[1]);
+	 cc.cc0.bf_stencil_pass_depth_pass_op =
+	    intel_translate_stencil_op(key->stencil_pass_depth_pass_op[1]);
+	 cc.cc1.bf_stencil_ref = key->stencil_ref[1];
+	 cc.cc2.bf_stencil_write_mask = key->stencil_write_mask[1];
+	 cc.cc2.bf_stencil_test_mask = key->stencil_test_mask[1];
+      }
+
+      /* Not really sure about this:
+       */
+      if (key->stencil_write_mask[0] ||
+	  (key->stencil_two_side && key->stencil_write_mask[1]))
+	 cc.cc0.stencil_write_enable = 1;
+   }
+
+
+   if (key->alpha_enabled) {
+      cc.cc3.alpha_test = 1;
+      cc.cc3.alpha_test_func = intel_translate_compare_func(key->alpha_func);
+      cc.cc3.alpha_test_format = BRW_ALPHATEST_FORMAT_UNORM8;
+
+      UNCLAMPED_FLOAT_TO_UBYTE(cc.cc7.alpha_ref.ub[0], key->alpha_ref);
+   }
+
+   /* _NEW_DEPTH */
+   if (key->depth_test) {
+      cc.cc2.depth_test = 1;
+      cc.cc2.depth_test_function = intel_translate_compare_func(key->depth_func);
+      cc.cc2.depth_write_enable = key->depth_write;
+   }
diff --git a/src/gallium/drivers/i965/brw_pipe_fb.c b/src/gallium/drivers/i965/brw_pipe_fb.c
new file mode 100644
index 0000000000..d4ae332f46
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_fb.c
@@ -0,0 +1,25 @@
+
+/**
+ * called from intelDrawBuffer()
+ */
+static void brw_set_draw_region( struct intel_context *intel, 
+                                 struct intel_region *color_regions[],
+                                 struct intel_region *depth_region,
+                                 GLuint num_color_regions)
+{
+   struct brw_context *brw = brw_context(&intel->ctx);
+   GLuint i;
+
+   /* release old color/depth regions */
+   if (brw->state.depth_region != depth_region)
+      brw->state.dirty.brw |= BRW_NEW_DEPTH_BUFFER;
+   for (i = 0; i < brw->state.nr_color_regions; i++)
+       intel_region_release(&brw->state.color_regions[i]);
+   intel_region_release(&brw->state.depth_region);
+
+   /* reference new color/depth regions */
+   for (i = 0; i < num_color_regions; i++)
+       intel_region_reference(&brw->state.color_regions[i], color_regions[i]);
+   intel_region_reference(&brw->state.depth_region, depth_region);
+   brw->state.nr_color_regions = num_color_regions;
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_flush.c b/src/gallium/drivers/i965/brw_pipe_flush.c
new file mode 100644
index 0000000000..008f623151
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_flush.c
@@ -0,0 +1,64 @@
+
+/**
+ * called from intel_batchbuffer_flush and children before sending a
+ * batchbuffer off.
+ */
+static void brw_finish_batch(struct intel_context *intel)
+{
+   struct brw_context *brw = brw_context(&intel->ctx);
+   brw_emit_query_end(brw);
+}
+
+
+/**
+ * called from intelFlushBatchLocked
+ */
+static void brw_new_batch( struct intel_context *intel )
+{
+   struct brw_context *brw = brw_context(&intel->ctx);
+
+   /* Check that we didn't just wrap our batchbuffer at a bad time. */
+   assert(!brw->no_batch_wrap);
+
+   brw->curbe.need_new_bo = GL_TRUE;
+
+   /* Mark all context state as needing to be re-emitted.
+    * This is probably not as severe as on 915, since almost all of our state
+    * is just in referenced buffers.
+    */
+   brw->state.dirty.brw |= BRW_NEW_CONTEXT;
+
+   brw->state.dirty.mesa |= ~0;
+   brw->state.dirty.brw |= ~0;
+   brw->state.dirty.cache |= ~0;
+
+   /* Move to the end of the current upload buffer so that we'll force choosing
+    * a new buffer next time.
+    */
+   if (brw->vb.upload.bo != NULL) {
+      dri_bo_unreference(brw->vb.upload.bo);
+      brw->vb.upload.bo = NULL;
+      brw->vb.upload.offset = 0;
+   }
+}
+
+
+static void brw_note_fence( struct intel_context *intel, GLuint fence )
+{
+   brw_context(&intel->ctx)->state.dirty.brw |= BRW_NEW_FENCE;
+}
+
+/* called from intelWaitForIdle() and intelFlush()
+ *
+ * For now, just flush everything.  Could be smarter later.
+ */
+static GLuint brw_flush_cmd( void )
+{
+   struct brw_mi_flush flush;
+   flush.opcode = CMD_MI_FLUSH;
+   flush.pad = 0;
+   flush.flags = BRW_FLUSH_STATE_CACHE;
+   return *(GLuint *)&flush;
+}
+
+
diff --git a/src/gallium/drivers/i965/brw_screen_surface.c b/src/gallium/drivers/i965/brw_screen_surface.c
new file mode 100644
index 0000000000..d199d0b81a
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_screen_surface.c
@@ -0,0 +1,27 @@
+   /* _NEW_BUFFERS */
+   if (IS_965(intel->intelScreen->deviceID) &&
+       !IS_G4X(intel->intelScreen->deviceID)) {
+      for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
+	 struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
+	 struct intel_renderbuffer *irb = intel_renderbuffer(rb);
+
+	 /* The original gen4 hardware couldn't set up WM surfaces pointing
+	  * at an offset within a tile, which can happen when rendering to
+	  * anything but the base level of a texture or the +X face/0 depth.
+	  * This was fixed with the 4 Series hardware.
+	  *
+	  * For these original chips, you would have to make the depth and
+	  * color destination surfaces include information on the texture
+	  * type, LOD, face, and various limits to use them as a destination.
+	  * I would have done this, but there's also a nasty requirement that
+	  * the depth and the color surfaces all be of the same LOD, which
+	  * may be a worse requirement than this alignment.  (Also, we may
+	  * want to just demote the texture to untiled, instead).
+	  */
+	 if (irb->region && 
+	     irb->region->tiling != I915_TILING_NONE &&
+	     (irb->region->draw_offset & 4095)) {
+	    DBG("FALLBACK: non-tile-aligned destination for tiled FBO\n");
+	    return GL_TRUE;
+	 }
+      }
diff --git a/src/gallium/drivers/i965/brw_sf.c b/src/gallium/drivers/i965/brw_sf.c
index e1c2c7777b..90513245ee 100644
--- a/src/gallium/drivers/i965/brw_sf.c
+++ b/src/gallium/drivers/i965/brw_sf.c
@@ -59,9 +59,9 @@ static void compile_sf_prog( struct brw_context *brw,
    brw_init_compile(brw, &c.func);
 
    c.key = *key;
-   c.nr_attrs = brw_count_bits(c.key.attrs);
+   c.nr_attrs = util_count_bits(c.key.attrs);
    c.nr_attr_regs = (c.nr_attrs+1)/2;
-   c.nr_setup_attrs = brw_count_bits(c.key.attrs & DO_SETUP_BITS);
+   c.nr_setup_attrs = util_count_bits(c.key.attrs & DO_SETUP_BITS);
    c.nr_setup_regs = (c.nr_setup_attrs+1)/2;
 
    c.prog_data.urb_read_length = c.nr_attr_regs;
diff --git a/src/gallium/drivers/i965/brw_sf_emit.c b/src/gallium/drivers/i965/brw_sf_emit.c
index ca8f97f9f9..4cc427a935 100644
--- a/src/gallium/drivers/i965/brw_sf_emit.c
+++ b/src/gallium/drivers/i965/brw_sf_emit.c
@@ -150,7 +150,7 @@ static void do_flatshade_triangle( struct brw_sf_compile *c )
 {
    struct brw_compile *p = &c->func;
    struct brw_reg ip = brw_ip_reg();
-   GLuint nr = brw_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
+   GLuint nr = util_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
    GLuint jmpi = 1;
 
    if (!nr)
@@ -188,7 +188,7 @@ static void do_flatshade_line( struct brw_sf_compile *c )
 {
    struct brw_compile *p = &c->func;
    struct brw_reg ip = brw_ip_reg();
-   GLuint nr = brw_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
+   GLuint nr = util_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
    GLuint jmpi = 1;
 
    if (!nr)
diff --git a/src/gallium/drivers/i965/brw_state_upload.c b/src/gallium/drivers/i965/brw_state_upload.c
index b817b741e7..6801084616 100644
--- a/src/gallium/drivers/i965/brw_state_upload.c
+++ b/src/gallium/drivers/i965/brw_state_upload.c
@@ -270,7 +270,7 @@ brw_print_dirty_count(struct dirty_bit_map *bit_map, int32_t bits)
 /***********************************************************************
  * Emit all state:
  */
-void brw_validate_state( struct brw_context *brw )
+enum pipe_error brw_validate_state( struct brw_context *brw )
 {
    GLcontext *ctx = &brw->intel.ctx;
    struct intel_context *intel = &brw->intel;
@@ -278,10 +278,6 @@ void brw_validate_state( struct brw_context *brw )
    GLuint i;
 
    brw_clear_validated_bos(brw);
-
-   state->mesa |= brw->intel.NewGLState;
-   brw->intel.NewGLState = 0;
-
    brw_add_validated_bo(brw, intel->batch->buf);
 
    if (brw->emit_state_always) {
@@ -290,36 +286,23 @@ void brw_validate_state( struct brw_context *brw )
       state->cache |= ~0;
    }
 
-   if (brw->fragment_program != ctx->FragmentProgram._Current) {
-      brw->fragment_program = ctx->FragmentProgram._Current;
-      brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM;
-   }
-
-   if (brw->vertex_program != ctx->VertexProgram._Current) {
-      brw->vertex_program = ctx->VertexProgram._Current;
-      brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM;
-   }
-
    if (state->mesa == 0 &&
        state->cache == 0 &&
        state->brw == 0)
-      return;
+      return 0;
 
    if (brw->state.dirty.brw & BRW_NEW_CONTEXT)
       brw_clear_batch_cache(brw);
 
-   brw->intel.Fallback = 0;
-
    /* do prepare stage for all atoms */
    for (i = 0; i < Elements(atoms); i++) {
       const struct brw_tracked_state *atom = atoms[i];
 
-      if (brw->intel.Fallback)
-         break;
-
       if (check_state(state, &atom->dirty)) {
          if (atom->prepare) {
-            atom->prepare(brw);
+            ret = atom->prepare(brw);
+	    if (ret)
+	       return ret;
         }
       }
    }
@@ -329,17 +312,18 @@ void brw_validate_state( struct brw_context *brw )
     * If this fails, we can experience GPU lock-ups.
     */
    {
-      const struct brw_fragment_program *fp;
-      fp = brw_fragment_program_const(brw->fragment_program);
+      const struct brw_fragment_program *fp = brw->fragment_program;
       if (fp) {
-         assert((fp->tex_units_used & ctx->Texture._EnabledUnits)
-                == fp->tex_units_used);
+         assert(fp->info.max_sampler <= brw->nr_samplers &&
+		fp->info.max_texture <= brw->nr_textures);
       }
    }
+
+   return 0;
 }
 
 
-void brw_upload_state(struct brw_context *brw)
+enum pipe_error brw_upload_state(struct brw_context *brw)
 {
    struct brw_state_flags *state = &brw->state.dirty;
    int i;
@@ -356,7 +340,7 @@ void brw_upload_state(struct brw_context *brw)
       _mesa_memset(&examined, 0, sizeof(examined));
       prev = *state;
 
-      for (i = 0; i < Elements(atoms); i++) {	 
+      for (i = 0; i < Elements(atoms); i++) {
 	 const struct brw_tracked_state *atom = atoms[i];
 	 struct brw_state_flags generated;
 
@@ -364,12 +348,11 @@ void brw_upload_state(struct brw_context *brw)
 		atom->dirty.brw ||
 		atom->dirty.cache);
 
-	 if (brw->intel.Fallback)
-	    break;
-
 	 if (check_state(state, &atom->dirty)) {
 	    if (atom->emit) {
-	       atom->emit( brw );
+	       ret = atom->emit( brw );
+	       if (ret)
+		  return ret;
 	    }
 	 }
 
@@ -388,12 +371,11 @@ void brw_upload_state(struct brw_context *brw)
       for (i = 0; i < Elements(atoms); i++) {	 
 	 const struct brw_tracked_state *atom = atoms[i];
 
-	 if (brw->intel.Fallback)
-	    break;
-
 	 if (check_state(state, &atom->dirty)) {
 	    if (atom->emit) {
-	       atom->emit( brw );
+	       ret = atom->emit( brw );
+	       if (ret)
+		  return ret;
 	    }
 	 }
       }
@@ -407,10 +389,11 @@ void brw_upload_state(struct brw_context *brw)
 	 brw_print_dirty_count(mesa_bits, state->mesa);
 	 brw_print_dirty_count(brw_bits, state->brw);
 	 brw_print_dirty_count(cache_bits, state->cache);
-	 fprintf(stderr, "\n");
+	 debug_printf("\n");
       }
    }
-
-   if (!brw->intel.Fallback)
-      memset(state, 0, sizeof(*state));
+   
+   /* Clear dirty flags:
+    */
+   memset(state, 0, sizeof(*state));
 }
diff --git a/src/gallium/drivers/i965/brw_swtnl.c b/src/gallium/drivers/i965/brw_swtnl.c
new file mode 100644
index 0000000000..6684f442d5
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_swtnl.c
@@ -0,0 +1,114 @@
+
+/* XXX: could split the primitive list to fallback only on the
+ * non-conformant primitives.
+ */
+static GLboolean check_fallbacks( struct brw_context *brw,
+				  const struct _mesa_prim *prim,
+				  GLuint nr_prims )
+{
+   GLcontext *ctx = &brw->intel.ctx;
+   GLuint i;
+
+   /* If we don't require strict OpenGL conformance, never 
+    * use fallbacks.  If we're forcing fallbacks, always
+    * use fallfacks.
+    */
+   if (brw->intel.conformance_mode == 0)
+      return GL_FALSE;
+
+   if (brw->intel.conformance_mode == 2)
+      return GL_TRUE;
+
+   if (ctx->Polygon.SmoothFlag) {
+      for (i = 0; i < nr_prims; i++)
+	 if (reduced_prim[prim[i].mode] == GL_TRIANGLES) 
+	    return GL_TRUE;
+   }
+
+   /* BRW hardware will do AA lines, but they are non-conformant it
+    * seems.  TBD whether we keep this fallback:
+    */
+   if (ctx->Line.SmoothFlag) {
+      for (i = 0; i < nr_prims; i++)
+	 if (reduced_prim[prim[i].mode] == GL_LINES) 
+	    return GL_TRUE;
+   }
+
+   /* Stipple -- these fallbacks could be resolved with a little
+    * bit of work?
+    */
+   if (ctx->Line.StippleFlag) {
+      for (i = 0; i < nr_prims; i++) {
+	 /* GS doesn't get enough information to know when to reset
+	  * the stipple counter?!?
+	  */
+	 if (prim[i].mode == GL_LINE_LOOP || prim[i].mode == GL_LINE_STRIP) 
+	    return GL_TRUE;
+	    
+	 if (prim[i].mode == GL_POLYGON &&
+	     (ctx->Polygon.FrontMode == GL_LINE ||
+	      ctx->Polygon.BackMode == GL_LINE))
+	    return GL_TRUE;
+      }
+   }
+
+   if (ctx->Point.SmoothFlag) {
+      for (i = 0; i < nr_prims; i++)
+	 if (prim[i].mode == GL_POINTS) 
+	    return GL_TRUE;
+   }
+
+   /* BRW hardware doesn't handle GL_CLAMP texturing correctly;
+    * brw_wm_sampler_state:translate_wrap_mode() treats GL_CLAMP
+    * as GL_CLAMP_TO_EDGE instead.  If we're using GL_CLAMP, and
+    * we want strict conformance, force the fallback.
+    * Right now, we only do this for 2D textures.
+    */
+   {
+      int u;
+      for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
+         struct gl_texture_unit *texUnit = &ctx->Texture.Unit[u];
+         if (texUnit->Enabled) {
+            if (texUnit->Enabled & TEXTURE_1D_BIT) {
+               if (texUnit->CurrentTex[TEXTURE_1D_INDEX]->WrapS == GL_CLAMP) {
+                   return GL_TRUE;
+               }
+            }
+            if (texUnit->Enabled & TEXTURE_2D_BIT) {
+               if (texUnit->CurrentTex[TEXTURE_2D_INDEX]->WrapS == GL_CLAMP ||
+                   texUnit->CurrentTex[TEXTURE_2D_INDEX]->WrapT == GL_CLAMP) {
+                   return GL_TRUE;
+               }
+            }
+            if (texUnit->Enabled & TEXTURE_3D_BIT) {
+               if (texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapS == GL_CLAMP ||
+                   texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapT == GL_CLAMP ||
+                   texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapR == GL_CLAMP) {
+                   return GL_TRUE;
+               }
+            }
+         }
+      }
+   }
+
+   /* Exceeding hw limits on number of VS inputs?
+    */
+   if (brw->nr_ve == 0 ||
+       brw->nr_ve >= BRW_VEP_MAX) {
+      return TRUE;
+   }
+
+   /* Position array with zero stride?
+    */
+   if (brw->vs[brw->ve[0]]->stride == 0)
+      return TRUE;
+
+
+      
+   /* Nothing stopping us from the fast path now */
+   return GL_FALSE;
+}
+
+
+
+
diff --git a/src/gallium/drivers/i965/brw_types.h b/src/gallium/drivers/i965/brw_types.h
new file mode 100644
index 0000000000..32b62848da
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_types.h
@@ -0,0 +1,11 @@
+#ifndef BRW_TYPES_H
+#define BRW_TYPES_H
+
+typedef GLuint uint32_t;
+typedef GLubyte uint8_t;
+typedef GLushort uint16_t;
+/* no GLenum, translate all away */
+
+typedef GLboolean uint8_t;
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_util.c b/src/gallium/drivers/i965/brw_util.c
index ce21aa4869..17f671a8fa 100644
--- a/src/gallium/drivers/i965/brw_util.c
+++ b/src/gallium/drivers/i965/brw_util.c
@@ -35,14 +35,6 @@
 #include "brw_util.h"
 #include "brw_defines.h"
 
-GLuint brw_count_bits( GLuint val )
-{
-   GLuint i;
-   for (i = 0; val ; val >>= 1)
-      if (val & 1)
-	 i++;
-   return i;
-}
 
 
 GLuint brw_translate_blend_equation( GLenum mode )
diff --git a/src/gallium/drivers/i965/brw_vs.c b/src/gallium/drivers/i965/brw_vs.c
index f0c79efbd9..53a5560105 100644
--- a/src/gallium/drivers/i965/brw_vs.c
+++ b/src/gallium/drivers/i965/brw_vs.c
@@ -61,9 +61,7 @@ static void do_vs_prog( struct brw_context *brw,
    }
 
    if (0)
-      _mesa_print_program(&c.vp->program.Base);
-
-
+      tgsi_dump(&c.vp->tokens, 0);
 
    /* Emit GEN4 code.
     */
@@ -96,9 +94,9 @@ static void brw_upload_vs_prog(struct brw_context *brw)
     * the inputs it asks for, whether they are varying or not.
     */
    key.program_string_id = vp->id;
-   key.nr_userclip = brw_count_bits(ctx->Transform.ClipPlanesEnabled);
-   key.copy_edgeflag = (ctx->Polygon.FrontMode != GL_FILL ||
-			ctx->Polygon.BackMode != GL_FILL);
+   key.nr_userclip = brw->nr_userclip;
+   key.copy_edgeflag = (brw->rast->fill_ccw != PIPE_POLYGON_MODE_FILL ||
+			brw->rast->fill_cw != PIPE_POLYGON_MODE_FILL);
 
    /* Make an early check for the key.
     */
@@ -116,7 +114,7 @@ static void brw_upload_vs_prog(struct brw_context *brw)
  */
 const struct brw_tracked_state brw_vs_prog = {
    .dirty = {
-      .mesa  = _NEW_TRANSFORM | _NEW_POLYGON,
+      .mesa  = PIPE_NEW_UCP | PIPE_NEW_RAST,
       .brw   = BRW_NEW_VERTEX_PROGRAM,
       .cache = 0
    },
diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c
index 1638ef8111..7f20c4baca 100644
--- a/src/gallium/drivers/i965/brw_vs_emit.c
+++ b/src/gallium/drivers/i965/brw_vs_emit.c
@@ -33,7 +33,7 @@
 #include "main/macros.h"
 #include "shader/program.h"
 #include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
+#include "pipe/p_shader_tokens.h"
 #include "brw_context.h"
 #include "brw_vs.h"
 
@@ -129,6 +129,7 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 	 reg++;
       }
    }
+
    /* If there are no inputs, we'll still be reading one attribute's worth
     * because it's required -- see urb_read_length setting.
     */
@@ -226,6 +227,7 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
     * vertex urb, so is half the amount:
     */
    c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
+
    /* Setting this field to 0 leads to undefined behavior according to the
     * the VS_STATE docs.  Our VUEs will always have at least one attribute
     * sitting in them, even if it's padding.
@@ -960,9 +962,6 @@ static void emit_arl( struct brw_vs_compile *c,
 
 /**
  * Return the brw reg for the given instruction's src argument.
- * Will return mangled results for SWZ op.  The emit_swz() function
- * ignores this result and recalculates taking extended swizzles into
- * account.
  */
 static struct brw_reg get_arg( struct brw_vs_compile *c,
                                const struct prog_instruction *inst,
@@ -1024,74 +1023,6 @@ static struct brw_reg get_dst( struct brw_vs_compile *c,
 }
 
 
-static void emit_swz( struct brw_vs_compile *c, 
-		      struct brw_reg dst,
-                      const struct prog_instruction *inst)
-{
-   const GLuint argIndex = 0;
-   const struct prog_src_register src = inst->SrcReg[argIndex];
-   struct brw_compile *p = &c->func;
-   GLuint zeros_mask = 0;
-   GLuint ones_mask = 0;
-   GLuint src_mask = 0;
-   GLubyte src_swz[4];
-   GLboolean need_tmp = (src.Negate &&
-			 dst.file != BRW_GENERAL_REGISTER_FILE);
-   struct brw_reg tmp = dst;
-   GLuint i;
-
-   if (need_tmp)
-      tmp = get_tmp(c);
-
-   for (i = 0; i < 4; i++) {
-      if (dst.dw1.bits.writemask & (1<<i)) {
-	 GLubyte s = GET_SWZ(src.Swizzle, i);
-	 switch (s) {
-	 case SWIZZLE_X:
-	 case SWIZZLE_Y:
-	 case SWIZZLE_Z:
-	 case SWIZZLE_W:
-	    src_mask |= 1<<i;
-	    src_swz[i] = s;
-	    break;
-	 case SWIZZLE_ZERO:
-	    zeros_mask |= 1<<i;
-	    break;
-	 case SWIZZLE_ONE:
-	    ones_mask |= 1<<i;
-	    break;
-	 }
-      }
-   }
-   
-   /* Do src first, in case dst aliases src:
-    */
-   if (src_mask) {
-      struct brw_reg arg0;
-
-      arg0 = get_src_reg(c, inst, argIndex);
-
-      arg0 = brw_swizzle(arg0, 
-			 src_swz[0], src_swz[1], 
-			 src_swz[2], src_swz[3]);
-
-      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
-   } 
-   
-   if (zeros_mask) 
-      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
-
-   if (ones_mask) 
-      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
-
-   if (src.Negate)
-      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));
-   
-   if (need_tmp) {
-      brw_MOV(p, dst, tmp);
-      release_tmp(c, tmp);
-   }
-}
 
 
 /**
@@ -1332,20 +1263,6 @@ void brw_vs_emit(struct brw_vs_compile *c )
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_set_access_mode(p, BRW_ALIGN_16);
    
-   /* Message registers can't be read, so copy the output into GRF register
-      if they are used in source registers */
-   for (insn = 0; insn < nr_insns; insn++) {
-       GLuint i;
-       struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
-       for (i = 0; i < 3; i++) {
-	   struct prog_src_register *src = &inst->SrcReg[i];
-	   GLuint index = src->Index;
-	   GLuint file = src->File;	
-	   if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
-	       c->output_regs[index].used_in_src = GL_TRUE;
-       }
-   }
-
    /* Static register allocation
     */
    brw_vs_alloc_regs(c);
@@ -1362,18 +1279,14 @@ void brw_vs_emit(struct brw_vs_compile *c )
       _mesa_print_instruction(inst);
 #endif
 
-      /* Get argument regs.  SWZ is special and does this itself.
+      /* Get argument regs.
        */
-      if (inst->Opcode != OPCODE_SWZ)
-	  for (i = 0; i < 3; i++) {
-	      const struct prog_src_register *src = &inst->SrcReg[i];
-	      index = src->Index;
-	      file = src->File;	
-	      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
-		  args[i] = c->output_regs[index].reg;
-	      else
-                  args[i] = get_arg(c, inst, i);
-	  }
+      for (i = 0; i < 3; i++) {
+	 const struct prog_src_register *src = &inst->SrcReg[i];
+	 index = src->Index;
+	 file = src->File;	
+	 args[i] = get_arg(c, inst, i);
+      }
 
       /* Get dest regs.  Note that it is possible for a reg to be both
        * dst and arg, given the static allocation of registers.  So
@@ -1381,10 +1294,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
        */ 
       index = inst->DstReg.Index;
       file = inst->DstReg.File;
-      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
-	  dst = c->output_regs[index].reg;
-      else
-	  dst = get_dst(c, inst->DstReg);
+      dst = get_dst(c, inst->DstReg);
 
       if (inst->SaturateMode != SATURATE_OFF) {
 	 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
@@ -1392,151 +1302,144 @@ void brw_vs_emit(struct brw_vs_compile *c )
       }
 
       switch (inst->Opcode) {
-      case OPCODE_ABS:
+      case TGSI_OPCODE_ABS:
 	 brw_MOV(p, dst, brw_abs(args[0]));
 	 break;
-      case OPCODE_ADD:
+      case TGSI_OPCODE_ADD:
 	 brw_ADD(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_COS:
+      case TGSI_OPCODE_COS:
 	 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
 	 break;
-      case OPCODE_DP3:
+      case TGSI_OPCODE_DP3:
 	 brw_DP3(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_DP4:
+      case TGSI_OPCODE_DP4:
 	 brw_DP4(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_DPH:
+      case TGSI_OPCODE_DPH:
 	 brw_DPH(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_NRM3:
+      case TGSI_OPCODE_NRM3:
 	 emit_nrm(c, dst, args[0], 3);
 	 break;
-      case OPCODE_NRM4:
+      case TGSI_OPCODE_NRM4:
 	 emit_nrm(c, dst, args[0], 4);
 	 break;
-      case OPCODE_DST:
+      case TGSI_OPCODE_DST:
 	 unalias2(c, dst, args[0], args[1], emit_dst_noalias); 
 	 break;
-      case OPCODE_EXP:
+      case TGSI_OPCODE_EXP:
 	 unalias1(c, dst, args[0], emit_exp_noalias);
 	 break;
-      case OPCODE_EX2:
+      case TGSI_OPCODE_EX2:
 	 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
 	 break;
-      case OPCODE_ARL:
+      case TGSI_OPCODE_ARL:
 	 emit_arl(c, dst, args[0]);
 	 break;
-      case OPCODE_FLR:
+      case TGSI_OPCODE_FLR:
 	 brw_RNDD(p, dst, args[0]);
 	 break;
-      case OPCODE_FRC:
+      case TGSI_OPCODE_FRC:
 	 brw_FRC(p, dst, args[0]);
 	 break;
-      case OPCODE_LOG:
+      case TGSI_OPCODE_LOG:
 	 unalias1(c, dst, args[0], emit_log_noalias);
 	 break;
-      case OPCODE_LG2:
+      case TGSI_OPCODE_LG2:
 	 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
 	 break;
-      case OPCODE_LIT:
+      case TGSI_OPCODE_LIT:
 	 unalias1(c, dst, args[0], emit_lit_noalias);
 	 break;
-      case OPCODE_LRP:
+      case TGSI_OPCODE_LRP:
 	 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
 	 break;
-      case OPCODE_MAD:
+      case TGSI_OPCODE_MAD:
 	 brw_MOV(p, brw_acc_reg(), args[2]);
 	 brw_MAC(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_MAX:
+      case TGSI_OPCODE_MAX:
 	 emit_max(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_MIN:
+      case TGSI_OPCODE_MIN:
 	 emit_min(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_MOV:
+      case TGSI_OPCODE_MOV:
 	 brw_MOV(p, dst, args[0]);
 	 break;
-      case OPCODE_MUL:
+      case TGSI_OPCODE_MUL:
 	 brw_MUL(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_POW:
+      case TGSI_OPCODE_POW:
 	 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL); 
 	 break;
-      case OPCODE_RCP:
+      case TGSI_OPCODE_RCP:
 	 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
 	 break;
-      case OPCODE_RSQ:
+      case TGSI_OPCODE_RSQ:
 	 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
 	 break;
-
-      case OPCODE_SEQ:
+      case TGSI_OPCODE_SEQ:
          emit_seq(p, dst, args[0], args[1]);
          break;
-      case OPCODE_SIN:
+      case TGSI_OPCODE_SIN:
 	 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
 	 break;
-      case OPCODE_SNE:
+      case TGSI_OPCODE_SNE:
          emit_sne(p, dst, args[0], args[1]);
          break;
-      case OPCODE_SGE:
+      case TGSI_OPCODE_SGE:
 	 emit_sge(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_SGT:
+      case TGSI_OPCODE_SGT:
          emit_sgt(p, dst, args[0], args[1]);
          break;
-      case OPCODE_SLT:
+      case TGSI_OPCODE_SLT:
 	 emit_slt(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_SLE:
+      case TGSI_OPCODE_SLE:
          emit_sle(p, dst, args[0], args[1]);
          break;
-      case OPCODE_SUB:
+      case TGSI_OPCODE_SUB:
 	 brw_ADD(p, dst, args[0], negate(args[1]));
 	 break;
-      case OPCODE_SWZ:
-	 /* The args[0] value can't be used here as it won't have
-	  * correctly encoded the full swizzle:
-	  */
-	 emit_swz(c, dst, inst);
-	 break;
-      case OPCODE_TRUNC:
+      case TGSI_OPCODE_TRUNC:
          /* round toward zero */
 	 brw_RNDZ(p, dst, args[0]);
 	 break;
-      case OPCODE_XPD:
+      case TGSI_OPCODE_XPD:
 	 emit_xpd(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_IF:
+      case TGSI_OPCODE_IF:
 	 assert(if_depth < MAX_IF_DEPTH);
 	 if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
 	 /* Note that brw_IF smashes the predicate_control field. */
 	 if_inst[if_depth]->header.predicate_control = get_predicate(inst);
 	 if_depth++;
 	 break;
-      case OPCODE_ELSE:
+      case TGSI_OPCODE_ELSE:
 	 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
 	 break;
-      case OPCODE_ENDIF:
+      case TGSI_OPCODE_ENDIF:
          assert(if_depth > 0);
 	 brw_ENDIF(p, if_inst[--if_depth]);
 	 break;			
-      case OPCODE_BGNLOOP:
+      case TGSI_OPCODE_BGNLOOP:
          loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
          break;
-      case OPCODE_BRK:
+      case TGSI_OPCODE_BRK:
 	 brw_set_predicate_control(p, get_predicate(inst));
          brw_BREAK(p);
 	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
          break;
-      case OPCODE_CONT:
+      case TGSI_OPCODE_CONT:
 	 brw_set_predicate_control(p, get_predicate(inst));
          brw_CONT(p);
          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
          break;
-      case OPCODE_ENDLOOP: 
+      case TGSI_OPCODE_ENDLOOP: 
          {
             struct brw_instruction *inst0, *inst1;
 	    GLuint br = 1;
@@ -1550,23 +1453,23 @@ void brw_vs_emit(struct brw_vs_compile *c )
             /* patch all the BREAK/CONT instructions from last BEGINLOOP */
             while (inst0 > loop_inst[loop_depth]) {
                inst0--;
-               if (inst0->header.opcode == BRW_OPCODE_BREAK) {
+               if (inst0->header.opcode == BRW_TGSI_OPCODE_BREAK) {
                   inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
                   inst0->bits3.if_else.pop_count = 0;
                }
-               else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
+               else if (inst0->header.opcode == BRW_TGSI_OPCODE_CONTINUE) {
                   inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
                   inst0->bits3.if_else.pop_count = 0;
                }
             }
          }
          break;
-      case OPCODE_BRA:
+      case TGSI_OPCODE_BRA:
 	 brw_set_predicate_control(p, get_predicate(inst));
          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
 	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
          break;
-      case OPCODE_CAL:
+      case TGSI_OPCODE_CAL:
 	 brw_set_access_mode(p, BRW_ALIGN_1);
 	 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
 	 brw_set_access_mode(p, BRW_ALIGN_16);
@@ -1575,27 +1478,27 @@ void brw_vs_emit(struct brw_vs_compile *c )
          brw_save_call(p, inst->Comment, p->nr_insn);
 	 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
          break;
-      case OPCODE_RET:
+      case TGSI_OPCODE_RET:
 	 brw_ADD(p, get_addr_reg(stack_index),
 			 get_addr_reg(stack_index), brw_imm_d(-4));
 	 brw_set_access_mode(p, BRW_ALIGN_1);
          brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
 	 brw_set_access_mode(p, BRW_ALIGN_16);
 	 break;
-      case OPCODE_END:	
+      case TGSI_OPCODE_END:	
          end_offset = p->nr_insn;
          /* this instruction will get patched later to jump past subroutine
           * code, etc.
           */
          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
          break;
-      case OPCODE_PRINT:
+      case TGSI_OPCODE_PRINT:
          /* no-op */
          break;
-      case OPCODE_BGNSUB:
+      case TGSI_OPCODE_BGNSUB:
          brw_save_label(p, inst->Comment, p->nr_insn);
          break;
-      case OPCODE_ENDSUB:
+      case TGSI_OPCODE_ENDSUB:
          /* no-op */
          break;
       default:
@@ -1618,33 +1521,6 @@ void brw_vs_emit(struct brw_vs_compile *c )
 	 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
       }
 
-      if ((inst->DstReg.File == PROGRAM_OUTPUT)
-          && (inst->DstReg.Index != VERT_RESULT_HPOS)
-          && c->output_regs[inst->DstReg.Index].used_in_src) {
-         brw_MOV(p, get_dst(c, inst->DstReg), dst);
-      }
-
-      /* Result color clamping.
-       *
-       * When destination register is an output register and
-       * it's primary/secondary front/back color, we have to clamp
-       * the result to [0,1]. This is done by enabling the
-       * saturation bit for the last instruction.
-       *
-       * We don't use brw_set_saturate() as it modifies
-       * p->current->header.saturate, which affects all the subsequent
-       * instructions. Instead, we directly modify the header
-       * of the last (already stored) instruction.
-       */
-      if (inst->DstReg.File == PROGRAM_OUTPUT) {
-         if ((inst->DstReg.Index == VERT_RESULT_COL0)
-             || (inst->DstReg.Index == VERT_RESULT_COL1)
-             || (inst->DstReg.Index == VERT_RESULT_BFC0)
-             || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
-            p->store[p->nr_insn-1].header.saturate = 1;
-         }
-      }
-
       release_tmps(c);
    }
 
diff --git a/src/gallium/drivers/i965/brw_wm.c b/src/gallium/drivers/i965/brw_wm.c
index 2292de94c4..20d31880b4 100644
--- a/src/gallium/drivers/i965/brw_wm.c
+++ b/src/gallium/drivers/i965/brw_wm.c
@@ -269,61 +269,46 @@ static void brw_wm_populate_key( struct brw_context *brw,
 		    uses_depth,
 		    key);
 
+   /* Revisit this, figure out if it's really useful, and either push
+    * it into the state tracker so that everyone benefits (use to
+    * create fs varients with TEX rather than TXP), or discard.
+    */
+   key->proj_attrib_mask = ~0; /*brw->wm.input_size_masks[4-1];*/
 
-   /* BRW_NEW_WM_INPUT_DIMENSIONS */
-   key->proj_attrib_mask = brw->wm.input_size_masks[4-1];
-
-   /* _NEW_LIGHT */
-   key->flat_shade = (ctx->Light.ShadeModel == GL_FLAT);
+   /* PIPE_NEW_RAST */
+   key->flat_shade = brw->rast.flat_shade;
 
-   /* _NEW_HINT */
-   key->linear_color = (ctx->Hint.PerspectiveCorrection == GL_FASTEST);
+   /* This can be determined by looking at the INTERP mode each input decl.
+    */
+   key->linear_color = 0;
 
    /* _NEW_TEXTURE */
    for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
-      const struct gl_texture_unit *unit = &ctx->Texture.Unit[i];
-
-      if (unit->_ReallyEnabled) {
-         const struct gl_texture_object *t = unit->_Current;
-         const struct gl_texture_image *img = t->Image[0][t->BaseLevel];
+      if (i < brw->nr_textures) {
+	 const struct gl_texture_unit *unit = &ctx->Texture.Unit[i];
+	 const struct gl_texture_object *t = unit->_Current;
+	 const struct gl_texture_image *img = t->Image[0][t->BaseLevel];
+	 
 	 if (img->InternalFormat == GL_YCBCR_MESA) {
 	    key->yuvtex_mask |= 1 << i;
 	    if (img->TexFormat->MesaFormat == MESA_FORMAT_YCBCR)
-		key->yuvtex_swap_mask |= 1 << i;
+	       key->yuvtex_swap_mask |= 1 << i;
 	 }
 
-         key->tex_swizzles[i] = t->_Swizzle;
+	 key->tex_swizzles[i] = t->_Swizzle;
+	 
+	 if (0)
+	    key->shadowtex_mask |= 1<<i;
       }
       else {
          key->tex_swizzles[i] = SWIZZLE_NOOP;
       }
    }
 
-   /* Shadow */
-   key->shadowtex_mask = fp->program.Base.ShadowSamplers;
 
-   /* _NEW_BUFFERS */
-   /*
-    * Include the draw buffer origin and height so that we can calculate
-    * fragment position values relative to the bottom left of the drawable,
-    * from the incoming screen origin relative position we get as part of our
-    * payload.
-    *
-    * We could avoid recompiling by including this as a constant referenced by
-    * our program, but if we were to do that it would also be nice to handle
-    * getting that constant updated at batchbuffer submit time (when we
-    * hold the lock and know where the buffer really is) rather than at emit
-    * time when we don't hold the lock and are just guessing.  We could also
-    * just avoid using this as key data if the program doesn't use
-    * fragment.position.
-    *
-    * This pretty much becomes moot with DRI2 and redirected buffers anyway,
-    * as our origins will always be zero then.
-    */
+   /* _NEW_FRAMEBUFFER */
    if (brw->intel.driDrawable != NULL) {
-      key->origin_x = brw->intel.driDrawable->x;
-      key->origin_y = brw->intel.driDrawable->y;
-      key->drawable_height = brw->intel.driDrawable->h;
+      key->drawable_height = brw->fb.cbufs[0].height;
    }
 
    /* CACHE_NEW_VS_PROG */
diff --git a/src/gallium/drivers/i965/brw_wm.h b/src/gallium/drivers/i965/brw_wm.h
index 872b1f3ecf..756a680150 100644
--- a/src/gallium/drivers/i965/brw_wm.h
+++ b/src/gallium/drivers/i965/brw_wm.h
@@ -76,7 +76,6 @@ struct brw_wm_prog_key {
    GLuint tex_swizzles[BRW_MAX_TEX_UNIT];
 
    GLuint program_string_id:32;
-   GLuint origin_x, origin_y;
    GLuint drawable_height;
    GLuint vp_outputs_written;
 };
diff --git a/src/gallium/drivers/i965/brw_wm_emit.c b/src/gallium/drivers/i965/brw_wm_emit.c
index bf80a2942a..9c47c46a3d 100644
--- a/src/gallium/drivers/i965/brw_wm_emit.c
+++ b/src/gallium/drivers/i965/brw_wm_emit.c
@@ -125,23 +125,21 @@ static void emit_wpos_xy(struct brw_wm_compile *c,
 {
    struct brw_compile *p = &c->func;
 
-   /* Calculate the pixel offset from window bottom left into destination
-    * X and Y channels.
-    */
    if (mask & WRITEMASK_X) {
-      /* X' = X - origin */
-      brw_ADD(p,
+      /* X' = X */
+      brw_MOV(p,
 	      dst[0],
-	      retype(arg0[0], BRW_REGISTER_TYPE_W),
-	      brw_imm_d(0 - c->key.origin_x));
+	      retype(arg0[0], BRW_REGISTER_TYPE_W));
    }
 
+   /* XXX: is this needed any more, or is this a NOOP?
+    */
    if (mask & WRITEMASK_Y) {
-      /* Y' = height - (Y - origin_y) = height + origin_y - Y */
+      /* Y' = height - 1 - Y */
       brw_ADD(p,
 	      dst[1],
 	      negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
-	      brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
+	      brw_imm_d(c->key.drawable_height - 1));
    }
 }
 
@@ -1376,7 +1374,6 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 break;
 
       case OPCODE_MOV:
-      case OPCODE_SWZ:
 	 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
 	 break;
 
diff --git a/src/gallium/drivers/i965/brw_wm_fp.c b/src/gallium/drivers/i965/brw_wm_fp.c
index 4e3edfbbff..5f47d86f71 100644
--- a/src/gallium/drivers/i965/brw_wm_fp.c
+++ b/src/gallium/drivers/i965/brw_wm_fp.c
@@ -30,25 +30,12 @@
   */
                
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
+#include "pipe/p_shader_constants.h"
+
 #include "brw_context.h"
 #include "brw_wm.h"
 #include "brw_util.h"
 
-#include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
-#include "shader/prog_statevars.h"
-
-
-/** An invalid texture target */
-#define TEX_TARGET_NONE NUM_TEXTURE_TARGETS
-
-/** An invalid texture unit */
-#define TEX_UNIT_NONE BRW_MAX_TEX_UNIT
-
-#define FIRST_INTERNAL_TEMP MAX_NV_FRAGMENT_PROGRAM_TEMPS
 
 #define X    0
 #define Y    1
@@ -68,11 +55,6 @@ static const char *wm_opcode_strings[] = {
    "FRONTFACING",
 };
 
-#if 0
-static const char *wm_file_strings[] = {   
-   "PAYLOAD"
-};
-#endif
 
 
 /***********************************************************************
@@ -165,13 +147,13 @@ static struct prog_dst_register get_temp( struct brw_wm_compile *c )
    }
 
    c->fp_temp |= 1<<(bit-1);
-   return dst_reg(PROGRAM_TEMPORARY, FIRST_INTERNAL_TEMP+(bit-1));
+   return dst_reg(PROGRAM_TEMPORARY, c->first_internal_temp+(bit-1));
 }
 
 
 static void release_temp( struct brw_wm_compile *c, struct prog_dst_register temp )
 {
-   c->fp_temp &= ~(1 << (temp.Index - FIRST_INTERNAL_TEMP));
+   c->fp_temp &= ~(1 << (temp.Index - c->first_internal_temp));
 }
 
 
@@ -192,58 +174,29 @@ static struct prog_instruction *emit_insn(struct brw_wm_compile *c,
    return inst;
 }
 
-static struct prog_instruction * emit_tex_op(struct brw_wm_compile *c,
-				       GLuint op,
-				       struct prog_dst_register dest,
-				       GLuint saturate,
-				       GLuint tex_src_unit,
-				       GLuint tex_src_target,
-				       GLuint tex_shadow,
-				       struct prog_src_register src0,
-				       struct prog_src_register src1,
-				       struct prog_src_register src2 )
+static struct prog_instruction * emit_op(struct brw_wm_compile *c,
+					 GLuint op,
+					 struct prog_dst_register dest,
+					 GLuint saturate,
+					 struct prog_src_register src0,
+					 struct prog_src_register src1,
+					 struct prog_src_register src2 )
 {
    struct prog_instruction *inst = get_fp_inst(c);
       
-   assert(tex_src_unit < BRW_MAX_TEX_UNIT ||
-          tex_src_unit == TEX_UNIT_NONE);
-   assert(tex_src_target < NUM_TEXTURE_TARGETS ||
-          tex_src_target == TEX_TARGET_NONE);
-
-   /* update mask of which texture units are referenced by this program */
-   if (tex_src_unit != TEX_UNIT_NONE)
-      c->fp->tex_units_used |= (1 << tex_src_unit);
-
    memset(inst, 0, sizeof(*inst));
 
    inst->Opcode = op;
    inst->DstReg = dest;
    inst->SaturateMode = saturate;   
-   inst->TexSrcUnit = tex_src_unit;
-   inst->TexSrcTarget = tex_src_target;
-   inst->TexShadow = tex_shadow;
    inst->SrcReg[0] = src0;
    inst->SrcReg[1] = src1;
    inst->SrcReg[2] = src2;
    return inst;
 }
-   
-
-static struct prog_instruction * emit_op(struct brw_wm_compile *c,
-				       GLuint op,
-				       struct prog_dst_register dest,
-				       GLuint saturate,
-				       struct prog_src_register src0,
-				       struct prog_src_register src1,
-				       struct prog_src_register src2 )
-{
-   return emit_tex_op(c, op, dest, saturate,
-                      TEX_UNIT_NONE, TEX_TARGET_NONE, 0,  /* unit, tgt, shadow */
-                      src0, src1, src2);
-}
 
 
-/* Many Mesa opcodes produce the same value across all the result channels.
+/* Many opcodes produce the same value across all the result channels.
  * We'd rather not have to support that splatting in the opcode implementations,
  * and brw_wm_pass*.c wants to optimize them out by shuffling references around
  * anyway.  We can easily get both by emitting the opcode to one channel, and
@@ -267,7 +220,7 @@ static struct prog_instruction *emit_scalar_insn(struct brw_wm_compile *c,
    other_channel_mask = inst0->DstReg.WriteMask & ~(1 << dst_chan);
    if (other_channel_mask != 0) {
       inst = emit_op(c,
-		     OPCODE_MOV,
+		     TGSI_OPCODE_MOV,
 		     dst_mask(inst0->DstReg, other_channel_mask),
 		     0,
 		     src_swizzle1(src_reg_from_dst(inst0->DstReg), dst_chan),
@@ -356,7 +309,9 @@ static struct prog_src_register get_pixel_w( struct brw_wm_compile *c )
 }
 
 static void emit_interp( struct brw_wm_compile *c,
-			 GLuint idx )
+			 GLuint semantic,
+			 GLuint semantic_index,
+			 GLuint interp_mode )
 {
    struct prog_dst_register dst = dst_reg(PROGRAM_INPUT, idx);
    struct prog_src_register interp = src_reg(PROGRAM_PAYLOAD, idx);
@@ -366,7 +321,7 @@ static void emit_interp( struct brw_wm_compile *c,
     * multiplied by 1/W in the SF program, and LINTERP on those
     * which have not:
     */
-   switch (idx) {
+   switch (semantic) {
    case FRAG_ATTRIB_WPOS:
       /* Have to treat wpos.xy specially:
        */
@@ -390,8 +345,8 @@ static void emit_interp( struct brw_wm_compile *c,
 	      deltas,
 	      src_undef());
       break;
-   case FRAG_ATTRIB_COL0:
-   case FRAG_ATTRIB_COL1:
+
+   case TGSI_SEMANTIC_COLOR:
       if (c->key.flat_shade) {
 	 emit_op(c,
 		 WM_CINTERP,
@@ -402,25 +357,13 @@ static void emit_interp( struct brw_wm_compile *c,
 		 src_undef());
       }
       else {
-         if (c->key.linear_color) {
-            emit_op(c,
-                    WM_LINTERP,
-                    dst,
-                    0,
-                    interp,
-                    deltas,
-                    src_undef());
-         }
-         else {
-            /* perspective-corrected color interpolation */
-            emit_op(c,
-                    WM_PINTERP,
-                    dst,
-                    0,
-                    interp,
-                    deltas,
-                    get_pixel_w(c));
-         }
+	 emit_op(c,
+		 translate_interp_mode(interp_mode),
+		 dst,
+		 0,
+		 interp,
+		 deltas,
+		 src_undef());
       }
       break;
    case FRAG_ATTRIB_FOGC:
@@ -434,7 +377,7 @@ static void emit_interp( struct brw_wm_compile *c,
 	      get_pixel_w(c));
 
       emit_op(c,
-	      OPCODE_MOV,
+	      TGSI_OPCODE_MOV,
 	      dst_mask(dst, WRITEMASK_YZW),
 	      0,
 	      src_swizzle(interp,
@@ -468,7 +411,7 @@ static void emit_interp( struct brw_wm_compile *c,
 	      get_pixel_w(c));
 
       emit_op(c,
-	      OPCODE_MOV,
+	      TGSI_OPCODE_MOV,
 	      dst_mask(dst, WRITEMASK_ZW),
 	      0,
 	      src_swizzle(interp,
@@ -482,7 +425,7 @@ static void emit_interp( struct brw_wm_compile *c,
 
    default:
       emit_op(c,
-	      WM_PINTERP,
+	      translate_interp_mode(interp_mode),
 	      dst,
 	      0,
 	      interp,
@@ -490,8 +433,6 @@ static void emit_interp( struct brw_wm_compile *c,
 	      get_pixel_w(c));
       break;
    }
-
-   c->fp_interp_emitted |= 1<<idx;
 }
 
 /***********************************************************************
@@ -581,7 +522,7 @@ static void precalc_dst( struct brw_wm_compile *c,
       /* dst.y = mul src0.y, src1.y
        */
       emit_op(c,
-	      OPCODE_MUL,
+	      TGSI_OPCODE_MUL,
 	      dst_mask(dst, WRITEMASK_Y),
 	      inst->SaturateMode,
 	      src0,
@@ -596,7 +537,7 @@ static void precalc_dst( struct brw_wm_compile *c,
       /* dst.xz = swz src0.1zzz
        */
       swz = emit_op(c,
-		    OPCODE_SWZ,
+		    TGSI_OPCODE_MOV,
 		    dst_mask(dst, WRITEMASK_XZ),
 		    inst->SaturateMode,
 		    src_swizzle(src0, SWIZZLE_ONE, z, z, z),
@@ -609,7 +550,7 @@ static void precalc_dst( struct brw_wm_compile *c,
       /* dst.w = mov src1.w
        */
       emit_op(c,
-	      OPCODE_MOV,
+	      TGSI_OPCODE_MOV,
 	      dst_mask(dst, WRITEMASK_W),
 	      inst->SaturateMode,
 	      src1,
@@ -631,7 +572,7 @@ static void precalc_lit( struct brw_wm_compile *c,
       /* dst.xw = swz src0.1111
        */
       swz = emit_op(c,
-		    OPCODE_SWZ,
+		    TGSI_OPCODE_MOV,
 		    dst_mask(dst, WRITEMASK_XW),
 		    0,
 		    src_swizzle1(src0, SWIZZLE_ONE),
@@ -643,7 +584,7 @@ static void precalc_lit( struct brw_wm_compile *c,
 
    if (dst.WriteMask & WRITEMASK_YZ) {
       emit_op(c,
-	      OPCODE_LIT,
+	      TGSI_OPCODE_LIT,
 	      dst_mask(dst, WRITEMASK_YZ),
 	      inst->SaturateMode,
 	      src0,
@@ -681,7 +622,7 @@ static void precalc_tex( struct brw_wm_compile *c,
        coord = src_reg_from_dst(tmpcoord);
 
        /* tmpcoord = src0 (i.e.: coord = src0) */
-       out = emit_op(c, OPCODE_MOV,
+       out = emit_op(c, TGSI_OPCODE_MOV,
                      tmpcoord,
                      0,
                      src0,
@@ -691,7 +632,7 @@ static void precalc_tex( struct brw_wm_compile *c,
        out->SrcReg[0].Abs = 1;
 
        /* tmp0 = MAX(coord.X, coord.Y) */
-       emit_op(c, OPCODE_MAX,
+       emit_op(c, TGSI_OPCODE_MAX,
                tmp0,
                0,
                src_swizzle1(coord, X),
@@ -699,7 +640,7 @@ static void precalc_tex( struct brw_wm_compile *c,
                src_undef());
 
        /* tmp1 = MAX(tmp0, coord.Z) */
-       emit_op(c, OPCODE_MAX,
+       emit_op(c, TGSI_OPCODE_MAX,
                tmp1,
                0,
                tmp0src,
@@ -707,7 +648,7 @@ static void precalc_tex( struct brw_wm_compile *c,
                src_undef());
 
        /* tmp0 = 1 / tmp1 */
-       emit_op(c, OPCODE_RCP,
+       emit_op(c, TGSI_OPCODE_RCP,
                dst_mask(tmp0, WRITEMASK_X),
                0,
                tmp1src,
@@ -715,7 +656,7 @@ static void precalc_tex( struct brw_wm_compile *c,
                src_undef());
 
        /* tmpCoord = src0 * tmp0 */
-       emit_op(c, OPCODE_MUL,
+       emit_op(c, TGSI_OPCODE_MUL,
                tmpcoord,
                0,
                src0,
@@ -738,7 +679,7 @@ static void precalc_tex( struct brw_wm_compile *c,
       /* coord.xy   = MUL inst->SrcReg[0], { 1/width, 1/height }
        */
       emit_op(c,
-	      OPCODE_MUL,
+	      TGSI_OPCODE_MUL,
 	      tmpcoord,
 	      0,
 	      inst->SrcReg[0],
@@ -785,7 +726,7 @@ static void precalc_tex( struct brw_wm_compile *c,
       /* tmp     = TEX ...
        */
       emit_tex_op(c, 
-                  OPCODE_TEX,
+                  TGSI_OPCODE_TEX,
                   tmp,
                   inst->SaturateMode,
                   unit,
@@ -798,7 +739,7 @@ static void precalc_tex( struct brw_wm_compile *c,
       /* tmp.xyz =  ADD TMP, C0
        */
       emit_op(c,
-	      OPCODE_ADD,
+	      TGSI_OPCODE_ADD,
 	      dst_mask(tmp, WRITEMASK_XYZ),
 	      0,
 	      tmpsrc,
@@ -809,7 +750,7 @@ static void precalc_tex( struct brw_wm_compile *c,
        */
 
       emit_op(c,
-	      OPCODE_MUL,
+	      TGSI_OPCODE_MUL,
 	      dst_mask(tmp, WRITEMASK_Y),
 	      0,
 	      tmpsrc,
@@ -824,7 +765,7 @@ static void precalc_tex( struct brw_wm_compile *c,
        */
 
       emit_op(c,
-	      OPCODE_MAD,
+	      TGSI_OPCODE_MAD,
 	      dst_mask(dst, WRITEMASK_XYZ),
 	      0,
 	      swap_uv?src_swizzle(tmpsrc, Z,Z,X,X):src_swizzle(tmpsrc, X,X,Z,Z),
@@ -834,7 +775,7 @@ static void precalc_tex( struct brw_wm_compile *c,
       /*  RGB.y   = MAD YUV.z, C1.w, RGB.y
        */
       emit_op(c,
-	      OPCODE_MAD,
+	      TGSI_OPCODE_MAD,
 	      dst_mask(dst, WRITEMASK_Y),
 	      0,
 	      src_swizzle1(tmpsrc, Z),
@@ -846,7 +787,7 @@ static void precalc_tex( struct brw_wm_compile *c,
    else {
       /* ordinary RGBA tex instruction */
       emit_tex_op(c, 
-                  OPCODE_TEX,
+                  TGSI_OPCODE_TEX,
                   inst->DstReg,
                   inst->SaturateMode,
                   unit,
@@ -861,7 +802,7 @@ static void precalc_tex( struct brw_wm_compile *c,
    if (c->key.tex_swizzles[unit] != SWIZZLE_NOOP) {
       /* swizzle the result of the TEX instruction */
       struct prog_src_register tmpsrc = src_reg_from_dst(inst->DstReg);
-      emit_op(c, OPCODE_SWZ,
+      emit_op(c, TGSI_OPCODE_MOV,
               inst->DstReg,
               SATURATE_OFF, /* saturate already done above */
               src_swizzle4(tmpsrc, c->key.tex_swizzles[unit]),
@@ -884,7 +825,7 @@ static GLboolean projtex( struct brw_wm_compile *c,
    const struct prog_src_register src = inst->SrcReg[0];
    GLboolean retVal;
 
-   assert(inst->Opcode == OPCODE_TXP);
+   assert(inst->Opcode == TGSI_OPCODE_TXP);
 
    /* Only try to detect the simplest cases.  Could detect (later)
     * cases where we are trying to emit code like RCP {1.0}, MUL x,
@@ -921,7 +862,7 @@ static void precalc_txp( struct brw_wm_compile *c,
       /* tmp0.w = RCP inst.arg[0][3]
        */
       emit_op(c,
-	      OPCODE_RCP,
+	      TGSI_OPCODE_RCP,
 	      dst_mask(tmp, WRITEMASK_W),
 	      0,
 	      src_swizzle1(src0, GET_SWZ(src0.Swizzle, W)),
@@ -931,7 +872,7 @@ static void precalc_txp( struct brw_wm_compile *c,
       /* tmp0.xyz =  MUL inst.arg[0], tmp0.wwww
        */
       emit_op(c,
-	      OPCODE_MUL,
+	      TGSI_OPCODE_MUL,
 	      dst_mask(tmp, WRITEMASK_XYZ),
 	      0,
 	      src0,
@@ -1015,6 +956,7 @@ static void validate_src_regs( struct brw_wm_compile *c,
 	 GLuint idx = inst->SrcReg[i].Index;
 	 if (!(c->fp_interp_emitted & (1<<idx))) {
 	    emit_interp(c, idx);
+	    c->fp_interp_emitted |= 1<<idx;
 	 }
       }
    }
@@ -1094,71 +1036,64 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
        */
 
       switch (inst->Opcode) {
-      case OPCODE_SWZ: 
+      case TGSI_OPCODE_ABS:
 	 out = emit_insn(c, inst);
-	 out->Opcode = OPCODE_MOV;
-	 break;
-	 
-      case OPCODE_ABS:
-	 out = emit_insn(c, inst);
-	 out->Opcode = OPCODE_MOV;
+	 out->Opcode = TGSI_OPCODE_MOV;
 	 out->SrcReg[0].Negate = NEGATE_NONE;
 	 out->SrcReg[0].Abs = 1;
 	 break;
 
-      case OPCODE_SUB: 
+      case TGSI_OPCODE_SUB: 
 	 out = emit_insn(c, inst);
-	 out->Opcode = OPCODE_ADD;
+	 out->Opcode = TGSI_OPCODE_ADD;
 	 out->SrcReg[1].Negate ^= NEGATE_XYZW;
 	 break;
 
-      case OPCODE_SCS: 
+      case TGSI_OPCODE_SCS: 
 	 out = emit_insn(c, inst);
 	 /* This should probably be done in the parser. 
 	  */
 	 out->DstReg.WriteMask &= WRITEMASK_XY;
 	 break;
 	 
-      case OPCODE_DST:
+      case TGSI_OPCODE_DST:
 	 precalc_dst(c, inst);
 	 break;
 
-      case OPCODE_LIT:
+      case TGSI_OPCODE_LIT:
 	 precalc_lit(c, inst);
 	 break;
 
-      case OPCODE_TEX:
+      case TGSI_OPCODE_TEX:
 	 precalc_tex(c, inst);
 	 break;
 
-      case OPCODE_TXP:
+      case TGSI_OPCODE_TXP:
 	 precalc_txp(c, inst);
 	 break;
 
-      case OPCODE_TXB:
+      case TGSI_OPCODE_TXB:
 	 out = emit_insn(c, inst);
 	 out->TexSrcUnit = fp->program.Base.SamplerUnits[inst->TexSrcUnit];
          assert(out->TexSrcUnit < BRW_MAX_TEX_UNIT);
 	 break;
 
-      case OPCODE_XPD: 
+      case TGSI_OPCODE_XPD: 
 	 out = emit_insn(c, inst);
 	 /* This should probably be done in the parser. 
 	  */
 	 out->DstReg.WriteMask &= WRITEMASK_XYZ;
 	 break;
 
-      case OPCODE_KIL: 
+      case TGSI_OPCODE_KIL: 
 	 out = emit_insn(c, inst);
 	 /* This should probably be done in the parser. 
 	  */
 	 out->DstReg.WriteMask = 0;
 	 break;
-      case OPCODE_END:
+      case TGSI_OPCODE_END:
 	 emit_fb_write(c);
 	 break;
-      case OPCODE_PRINT:
-	 break;
       default:
 	 if (brw_wm_is_scalar_result(inst->Opcode))
 	    emit_scalar_insn(c, inst);
diff --git a/src/gallium/drivers/i965/brw_wm_glsl.c b/src/gallium/drivers/i965/brw_wm_glsl.c
index c9fe1dd8ad..d836e2fb34 100644
--- a/src/gallium/drivers/i965/brw_wm_glsl.c
+++ b/src/gallium/drivers/i965/brw_wm_glsl.c
@@ -6,9 +6,6 @@
 #include "brw_eu.h"
 #include "brw_wm.h"
 
-enum _subroutine {
-    SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
-};
 
 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
                                   const struct prog_instruction *inst,
@@ -32,10 +29,6 @@ GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
 	    case OPCODE_CAL:
 	    case OPCODE_BRK:
 	    case OPCODE_RET:
-	    case OPCODE_NOISE1:
-	    case OPCODE_NOISE2:
-	    case OPCODE_NOISE3:
-	    case OPCODE_NOISE4:
 	    case OPCODE_BGNLOOP:
 		return GL_TRUE; 
 	    default:
@@ -1495,1036 +1488,7 @@ static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
 		   0, 16, 2 );
 }
 
-/* One-, two- and three-dimensional Perlin noise, similar to the description
-   in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
-static void noise1_sub( struct brw_wm_compile *c ) {
 
-    struct brw_compile *p = &c->func;
-    struct brw_reg param,
-	x0, x1, /* gradients at each end */       
-	t, tmp[ 2 ], /* float temporaries */
-	itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
-    int i;
-    int mark = mark_tmps( c );
-
-    x0 = alloc_tmp( c );
-    x1 = alloc_tmp( c );
-    t = alloc_tmp( c );
-    tmp[ 0 ] = alloc_tmp( c );
-    tmp[ 1 ] = alloc_tmp( c );
-    itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
-    itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
-    itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
-    itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
-    itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
-    
-    param = lookup_tmp( c, mark - 2 );
-
-    brw_set_access_mode( p, BRW_ALIGN_1 );
-
-    brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
-
-    /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
-       be hashed.  Also compute the remainder (offset within the unit
-       length), interleaved to reduce register dependency penalties. */
-    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
-    brw_FRC( p, param, param );
-    brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
-    brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
-    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
-
-    /* We're now ready to perform the hashing.  The two hashes are
-       interleaved for performance.  The hash function used is
-       designed to rapidly achieve avalanche and require only 32x16
-       bit multiplication, and 16-bit swizzles (which we get for
-       free).  We can't use immediate operands in the multiplies,
-       because immediates are permitted only in src1 and the 16-bit
-       factor is permitted only in src0. */
-    for( i = 0; i < 2; i++ )
-	brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
-    for( i = 0; i < 2; i++ )
-       brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
-		high_words( itmp[ i ] ) );
-    for( i = 0; i < 2; i++ )
-	brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
-    for( i = 0; i < 2; i++ )
-       brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
-		high_words( itmp[ i ] ) );
-    for( i = 0; i < 2; i++ )
-	brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
-    for( i = 0; i < 2; i++ )
-       brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
-		high_words( itmp[ i ] ) );
-
-    /* Now we want to initialise the two gradients based on the
-       hashes.  Format conversion from signed integer to float leaves
-       everything scaled too high by a factor of pow( 2, 31 ), but
-       we correct for that right at the end. */
-    brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
-    brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
-    brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
-
-    brw_MUL( p, x0, x0, param );
-    brw_MUL( p, x1, x1, t );
-    
-    /* We interpolate between the gradients using the polynomial
-       6t^5 - 15t^4 + 10t^3 (Perlin). */
-    brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
-    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
-    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
-    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
-    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
-    brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
-					   pipeline */
-    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
-    brw_MUL( p, param, tmp[ 0 ], param );
-    brw_MUL( p, x1, x1, param );
-    brw_ADD( p, x0, x0, x1 );    
-    /* scale by pow( 2, -30 ), to compensate for the format conversion
-       above and an extra factor of 2 so that a single gradient covers
-       the [-1,1] range */
-    brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
-
-    release_tmps( c, mark );
-}
-
-static void emit_noise1( struct brw_wm_compile *c,
-			 const struct prog_instruction *inst )
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src, param, dst;
-    GLuint mask = inst->DstReg.WriteMask;
-    int i;
-    int mark = mark_tmps( c );
-
-    assert( mark == 0 );
-    
-    src = get_src_reg( c, inst, 0, 0 );
-
-    param = alloc_tmp( c );
-
-    brw_MOV( p, param, src );
-
-    invoke_subroutine( c, SUB_NOISE1, noise1_sub );
-    
-    /* Fill in the result: */
-    brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    brw_MOV( p, dst, param );
-	}
-    }
-    if( inst->SaturateMode == SATURATE_ZERO_ONE )
-	brw_set_saturate( p, 0 );
-    
-    release_tmps( c, mark );
-}
-    
-static void noise2_sub( struct brw_wm_compile *c ) {
-
-    struct brw_compile *p = &c->func;
-    struct brw_reg param0, param1,
-	x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */       
-	t, tmp[ 4 ], /* float temporaries */
-	itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
-    int i;
-    int mark = mark_tmps( c );
-
-    x0y0 = alloc_tmp( c );
-    x0y1 = alloc_tmp( c );
-    x1y0 = alloc_tmp( c );
-    x1y1 = alloc_tmp( c );
-    t = alloc_tmp( c );
-    for( i = 0; i < 4; i++ ) {
-	tmp[ i ] = alloc_tmp( c );
-	itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
-    }
-    itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
-    itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
-    itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
-    
-    param0 = lookup_tmp( c, mark - 3 );
-    param1 = lookup_tmp( c, mark - 2 );
-
-    brw_set_access_mode( p, BRW_ALIGN_1 );
-    
-    /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
-       be hashed.  Also compute the remainders (offsets within the unit
-       square), interleaved to reduce register dependency penalties. */
-    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
-    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
-    brw_FRC( p, param0, param0 );
-    brw_FRC( p, param1, param1 );
-    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
-    brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
-	     low_words( itmp[ 1 ] ) );
-    brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
-    brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
-    brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
-    brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
-    brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
-
-    /* We're now ready to perform the hashing.  The four hashes are
-       interleaved for performance.  The hash function used is
-       designed to rapidly achieve avalanche and require only 32x16
-       bit multiplication, and 16-bit swizzles (which we get for
-       free).  We can't use immediate operands in the multiplies,
-       because immediates are permitted only in src1 and the 16-bit
-       factor is permitted only in src0. */
-    for( i = 0; i < 4; i++ )
-	brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
-    for( i = 0; i < 4; i++ )
-	brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
-		 high_words( itmp[ i ] ) );
-    for( i = 0; i < 4; i++ )
-	brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
-    for( i = 0; i < 4; i++ )
-	brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
-		 high_words( itmp[ i ] ) );
-    for( i = 0; i < 4; i++ )
-	brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
-    for( i = 0; i < 4; i++ )
-	brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
-		 high_words( itmp[ i ] ) );
-
-    /* Now we want to initialise the four gradients based on the
-       hashes.  Format conversion from signed integer to float leaves
-       everything scaled too high by a factor of pow( 2, 15 ), but
-       we correct for that right at the end. */
-    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
-    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
-    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
-    brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
-    brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
-    
-    brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
-    brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
-    brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
-    brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
-    
-    brw_MUL( p, x1y0, x1y0, t );
-    brw_MUL( p, x1y1, x1y1, t );
-    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
-    brw_MUL( p, x0y0, x0y0, param0 );
-    brw_MUL( p, x0y1, x0y1, param0 );
-
-    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
-    brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
-    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
-    brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
-
-    brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
-    brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
-    
-    /* We interpolate between the gradients using the polynomial
-       6t^5 - 15t^4 + 10t^3 (Perlin). */
-    brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
-    brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
-    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
-    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
-    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
-    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
-    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
-						 pipeline */
-    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
-    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
-    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
-    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
-    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
-						 pipeline */
-    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
-    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
-    brw_MUL( p, param0, tmp[ 0 ], param0 );
-    brw_MUL( p, param1, tmp[ 1 ], param1 );
-    
-    /* Here we interpolate in the y dimension... */
-    brw_MUL( p, x0y1, x0y1, param1 );
-    brw_MUL( p, x1y1, x1y1, param1 );
-    brw_ADD( p, x0y0, x0y0, x0y1 );
-    brw_ADD( p, x1y0, x1y0, x1y1 );
-
-    /* And now in x.  There are horrible register dependencies here,
-       but we have nothing else to do. */
-    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
-    brw_MUL( p, x1y0, x1y0, param0 );
-    brw_ADD( p, x0y0, x0y0, x1y0 );
-    
-    /* scale by pow( 2, -15 ), as described above */
-    brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
-
-    release_tmps( c, mark );
-}
-
-static void emit_noise2( struct brw_wm_compile *c,
-			 const struct prog_instruction *inst )
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, src1, param0, param1, dst;
-    GLuint mask = inst->DstReg.WriteMask;
-    int i;
-    int mark = mark_tmps( c );
-
-    assert( mark == 0 );
-    
-    src0 = get_src_reg( c, inst, 0, 0 );
-    src1 = get_src_reg( c, inst, 0, 1 );
-
-    param0 = alloc_tmp( c );
-    param1 = alloc_tmp( c );
-
-    brw_MOV( p, param0, src0 );
-    brw_MOV( p, param1, src1 );
-
-    invoke_subroutine( c, SUB_NOISE2, noise2_sub );
-    
-    /* Fill in the result: */
-    brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    brw_MOV( p, dst, param0 );
-	}
-    }
-    if( inst->SaturateMode == SATURATE_ZERO_ONE )
-	brw_set_saturate( p, 0 );
-    
-    release_tmps( c, mark );
-}
-
-/**
- * The three-dimensional case is much like the one- and two- versions above,
- * but since the number of corners is rapidly growing we now pack 16 16-bit
- * hashes into each register to extract more parallelism from the EUs.
- */
-static void noise3_sub( struct brw_wm_compile *c ) {
-
-    struct brw_compile *p = &c->func;
-    struct brw_reg param0, param1, param2,
-	x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
-	xi, yi, zi, /* interpolation coefficients */
-	t, tmp[ 8 ], /* float temporaries */
-	itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
-	wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
-    int i;
-    int mark = mark_tmps( c );
-
-    x0y0 = alloc_tmp( c );
-    x0y1 = alloc_tmp( c );
-    x1y0 = alloc_tmp( c );
-    x1y1 = alloc_tmp( c );
-    xi = alloc_tmp( c );
-    yi = alloc_tmp( c );
-    zi = alloc_tmp( c );
-    t = alloc_tmp( c );
-    for( i = 0; i < 8; i++ ) {
-	tmp[ i ] = alloc_tmp( c );
-	itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
-	wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
-    }
-    
-    param0 = lookup_tmp( c, mark - 4 );
-    param1 = lookup_tmp( c, mark - 3 );
-    param2 = lookup_tmp( c, mark - 2 );
-
-    brw_set_access_mode( p, BRW_ALIGN_1 );
-    
-    /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
-       be hashed.  Also compute the remainders (offsets within the unit
-       cube), interleaved to reduce register dependency penalties. */
-    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
-    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
-    brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
-    brw_FRC( p, param0, param0 );
-    brw_FRC( p, param1, param1 );
-    brw_FRC( p, param2, param2 );
-    /* Since we now have only 16 bits of precision in the hash, we must
-       be more careful about thorough mixing to maintain entropy as we
-       squash the input vector into a small scalar. */
-    brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
-    brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
-    brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
-	     brw_imm_uw( 0x9B93 ) );
-    brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
-	     brw_imm_uw( 0xBC8F ) );
-
-    /* Temporarily disable the execution mask while we work with ExecSize=16
-       channels (the mask is set for ExecSize=8 and is probably incorrect).
-       Although this might cause execution of unwanted channels, the code
-       writes only to temporary registers and has no side effects, so
-       disabling the mask is harmless. */
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
-    brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
-    brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
-
-    /* We're now ready to perform the hashing.  The eight hashes are
-       interleaved for performance.  The hash function used is
-       designed to rapidly achieve avalanche and require only 16x16
-       bit multiplication, and 8-bit swizzles (which we get for
-       free). */
-    for( i = 0; i < 4; i++ )
-	brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
-    for( i = 0; i < 4; i++ )
-	brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
-		 odd_bytes( wtmp[ i ] ) );
-    for( i = 0; i < 4; i++ )
-	brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
-    for( i = 0; i < 4; i++ )
-	brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
-		 odd_bytes( wtmp[ i ] ) );
-    brw_pop_insn_state( p );
-
-    /* Now we want to initialise the four rear gradients based on the
-       hashes.  Format conversion from signed integer to float leaves
-       everything scaled too high by a factor of pow( 2, 15 ), but
-       we correct for that right at the end. */
-    /* x component */
-    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
-    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
-    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
-    brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
-    brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
-
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
-    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
-    brw_pop_insn_state( p );
-    
-    brw_MUL( p, x1y0, x1y0, t );
-    brw_MUL( p, x1y1, x1y1, t );
-    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
-    brw_MUL( p, x0y0, x0y0, param0 );
-    brw_MUL( p, x0y1, x0y1, param0 );
-
-    /* y component */
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
-    
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
-    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
-    brw_pop_insn_state( p );
-
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
-    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
-    
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    
-    /* z component */
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
-
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
-    
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-    
-    /* We interpolate between the gradients using the polynomial
-       6t^5 - 15t^4 + 10t^3 (Perlin). */
-    brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
-    brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
-    brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
-    brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
-    brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
-    brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
-    brw_MUL( p, xi, xi, param0 );
-    brw_MUL( p, yi, yi, param1 );
-    brw_MUL( p, zi, zi, param2 );
-    brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
-    brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
-    brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
-    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
-    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
-    brw_MUL( p, xi, xi, param0 );
-    brw_MUL( p, yi, yi, param1 );
-    brw_MUL( p, zi, zi, param2 );
-    brw_MUL( p, xi, xi, param0 );
-    brw_MUL( p, yi, yi, param1 );
-    brw_MUL( p, zi, zi, param2 );
-    brw_MUL( p, xi, xi, param0 );
-    brw_MUL( p, yi, yi, param1 );
-    brw_MUL( p, zi, zi, param2 );
-    
-    /* Here we interpolate in the y dimension... */
-    brw_MUL( p, x0y1, x0y1, yi );
-    brw_MUL( p, x1y1, x1y1, yi );
-    brw_ADD( p, x0y0, x0y0, x0y1 );
-    brw_ADD( p, x1y0, x1y0, x1y1 );
-
-    /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
-    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
-    brw_MUL( p, x1y0, x1y0, xi );
-    brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
-
-    /* Now do the same thing for the front four gradients... */
-    /* x component */
-    brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
-    brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
-    brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
-    brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
-
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
-    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
-    brw_pop_insn_state( p );
-
-    brw_MUL( p, x1y0, x1y0, t );
-    brw_MUL( p, x1y1, x1y1, t );
-    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
-    brw_MUL( p, x0y0, x0y0, param0 );
-    brw_MUL( p, x0y1, x0y1, param0 );
-
-    /* y component */
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
-    
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
-    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
-    brw_pop_insn_state( p );
-
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
-    brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
-    
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    
-    /* z component */
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
-
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
-    
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-    
-    /* The interpolation coefficients are still around from last time, so
-       again interpolate in the y dimension... */
-    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
-    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
-    brw_MUL( p, x0y1, x0y1, yi );
-    brw_MUL( p, x1y1, x1y1, yi );
-    brw_ADD( p, x0y0, x0y0, x0y1 );
-    brw_ADD( p, x1y0, x1y0, x1y1 );
-
-    /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
-       time put the front face in tmp[ 1 ] and we're nearly there... */
-    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
-    brw_MUL( p, x1y0, x1y0, xi );
-    brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
-
-    /* The final interpolation, in the z dimension: */
-    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );    
-    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
-    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
-    
-    /* scale by pow( 2, -15 ), as described above */
-    brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
-
-    release_tmps( c, mark );
-}
-
-static void emit_noise3( struct brw_wm_compile *c,
-			 const struct prog_instruction *inst )
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, src1, src2, param0, param1, param2, dst;
-    GLuint mask = inst->DstReg.WriteMask;
-    int i;
-    int mark = mark_tmps( c );
-
-    assert( mark == 0 );
-    
-    src0 = get_src_reg( c, inst, 0, 0 );
-    src1 = get_src_reg( c, inst, 0, 1 );
-    src2 = get_src_reg( c, inst, 0, 2 );
-
-    param0 = alloc_tmp( c );
-    param1 = alloc_tmp( c );
-    param2 = alloc_tmp( c );
-
-    brw_MOV( p, param0, src0 );
-    brw_MOV( p, param1, src1 );
-    brw_MOV( p, param2, src2 );
-
-    invoke_subroutine( c, SUB_NOISE3, noise3_sub );
-    
-    /* Fill in the result: */
-    brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    brw_MOV( p, dst, param0 );
-	}
-    }
-    if( inst->SaturateMode == SATURATE_ZERO_ONE )
-	brw_set_saturate( p, 0 );
-    
-    release_tmps( c, mark );
-}
-    
-/**
- * For the four-dimensional case, the little micro-optimisation benefits
- * we obtain by unrolling all the loops aren't worth the massive bloat it
- * now causes.  Instead, we loop twice around performing a similar operation
- * to noise3, once for the w=0 cube and once for the w=1, with a bit more
- * code to glue it all together.
- */
-static void noise4_sub( struct brw_wm_compile *c )
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg param[ 4 ],
-	x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
-	w0, /* noise for the w=0 cube */
-	floors[ 2 ], /* integer coordinates of base corner of hypercube */
-	interp[ 4 ], /* interpolation coefficients */
-	t, tmp[ 8 ], /* float temporaries */
-	itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
-	wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
-    int i, j;
-    int mark = mark_tmps( c );
-    GLuint loop, origin;
-    
-    x0y0 = alloc_tmp( c );
-    x0y1 = alloc_tmp( c );
-    x1y0 = alloc_tmp( c );
-    x1y1 = alloc_tmp( c );
-    t = alloc_tmp( c );
-    w0 = alloc_tmp( c );    
-    floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
-    floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
-
-    for( i = 0; i < 4; i++ ) {
-	param[ i ] = lookup_tmp( c, mark - 5 + i );
-	interp[ i ] = alloc_tmp( c );
-    }
-    
-    for( i = 0; i < 8; i++ ) {
-	tmp[ i ] = alloc_tmp( c );
-	itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
-	wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
-    }
-
-    brw_set_access_mode( p, BRW_ALIGN_1 );
-
-    /* We only want 16 bits of precision from the integral part of each
-       co-ordinate, but unfortunately the RNDD semantics would saturate
-       at 16 bits if we performed the operation directly to a 16-bit
-       destination.  Therefore, we round to 32-bit temporaries where
-       appropriate, and then store only the lower 16 bits. */
-    brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
-    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
-    brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
-    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
-    brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
-    brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
-
-    /* Modify the flag register here, because the side effect is useful
-       later (see below).  We know for certain that all flags will be
-       cleared, since the FRC instruction cannot possibly generate
-       negative results.  Even for exceptional inputs (infinities, denormals,
-       NaNs), the architecture guarantees that the L conditional is false. */
-    brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
-    brw_FRC( p, param[ 0 ], param[ 0 ] );
-    brw_set_predicate_control( p, BRW_PREDICATE_NONE );
-    for( i = 1; i < 4; i++ )	
-	brw_FRC( p, param[ i ], param[ i ] );
-    
-    /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
-       of all. */
-    for( i = 0; i < 4; i++ )
-	brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
-    for( i = 0; i < 4; i++ )
-	brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
-    for( i = 0; i < 4; i++ )
-	brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
-    for( i = 0; i < 4; i++ )
-	brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
-    for( j = 0; j < 3; j++ )
-	for( i = 0; i < 4; i++ )
-	    brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
-
-    /* Mark the current address, as it will be a jump destination.  The
-       following code will be executed twice: first, with the flag
-       register clear indicating the w=0 case, and second with flags
-       set for w=1. */
-    loop = p->nr_insn;
-    
-    /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
-       be hashed.  Since we have only 16 bits of precision in the hash, we
-       must be careful about thorough mixing to maintain entropy as we
-       squash the input vector into a small scalar. */
-    brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
-	     brw_imm_uw( 0xBC8F ) );
-    brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
-	     brw_imm_uw( 0xD0BD ) );
-    brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
-	     brw_imm_uw( 0x9B93 ) );
-    brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
-	     brw_imm_uw( 0xA359 ) );
-    brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
-	     brw_imm_uw( 0xBC8F ) );
-
-    /* Temporarily disable the execution mask while we work with ExecSize=16
-       channels (the mask is set for ExecSize=8 and is probably incorrect).
-       Although this might cause execution of unwanted channels, the code
-       writes only to temporary registers and has no side effects, so
-       disabling the mask is harmless. */
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
-    brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
-    brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
-
-    /* We're now ready to perform the hashing.  The eight hashes are
-       interleaved for performance.  The hash function used is
-       designed to rapidly achieve avalanche and require only 16x16
-       bit multiplication, and 8-bit swizzles (which we get for
-       free). */
-    for( i = 0; i < 4; i++ )
-	brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
-    for( i = 0; i < 4; i++ )
-	brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
-		 odd_bytes( wtmp[ i ] ) );
-    for( i = 0; i < 4; i++ )
-	brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
-    for( i = 0; i < 4; i++ )
-	brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
-		 odd_bytes( wtmp[ i ] ) );
-    brw_pop_insn_state( p );
-
-    /* Now we want to initialise the four rear gradients based on the
-       hashes.  Format conversion from signed integer to float leaves
-       everything scaled too high by a factor of pow( 2, 15 ), but
-       we correct for that right at the end. */
-    /* x component */
-    brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
-    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
-    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
-    brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
-    brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
-
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
-    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
-    brw_pop_insn_state( p );
-    
-    brw_MUL( p, x1y0, x1y0, t );
-    brw_MUL( p, x1y1, x1y1, t );
-    brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
-    brw_MUL( p, x0y0, x0y0, param[ 0 ] );
-    brw_MUL( p, x0y1, x0y1, param[ 0 ] );
-
-    /* y component */
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
-    
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
-    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
-    brw_pop_insn_state( p );
-
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );    
-    /* prepare t for the w component (used below): w the first time through
-       the loop; w - 1 the second time) */
-    brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
-    brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
-    p->current->header.predicate_inverse = 1;
-    brw_MOV( p, t, param[ 3 ] );
-    p->current->header.predicate_inverse = 0;
-    brw_set_predicate_control( p, BRW_PREDICATE_NONE );
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
-    
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    
-    /* z component */
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
-
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
-    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
-    brw_pop_insn_state( p );
-
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
-    
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-
-    /* w component */
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
-
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
-    brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
-    
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-
-    /* Here we interpolate in the y dimension... */
-    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
-    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
-    brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
-    brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
-    brw_ADD( p, x0y0, x0y0, x0y1 );
-    brw_ADD( p, x1y0, x1y0, x1y1 );
-
-    /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
-    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
-    brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
-    brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
-
-    /* Now do the same thing for the front four gradients... */
-    /* x component */
-    brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
-    brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
-    brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
-    brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
-
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
-    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
-    brw_pop_insn_state( p );
-
-    brw_MUL( p, x1y0, x1y0, t );
-    brw_MUL( p, x1y1, x1y1, t );
-    brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
-    brw_MUL( p, x0y0, x0y0, param[ 0 ] );
-    brw_MUL( p, x0y1, x0y1, param[ 0 ] );
-
-    /* y component */
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
-    
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
-    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
-    brw_pop_insn_state( p );
-
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
-    brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
-    
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    
-    /* z component */
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
-
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
-    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
-    brw_pop_insn_state( p );
-
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
-    /* prepare t for the w component (used below): w the first time through
-       the loop; w - 1 the second time) */
-    brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
-    brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
-    p->current->header.predicate_inverse = 1;
-    brw_MOV( p, t, param[ 3 ] );
-    p->current->header.predicate_inverse = 0;
-    brw_set_predicate_control( p, BRW_PREDICATE_NONE );
-    
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-
-    /* w component */
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
-
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
-    
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-
-    /* Interpolate in the y dimension: */
-    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
-    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
-    brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
-    brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
-    brw_ADD( p, x0y0, x0y0, x0y1 );
-    brw_ADD( p, x1y0, x1y0, x1y1 );
-
-    /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
-       time put the front face in tmp[ 1 ] and we're nearly there... */
-    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
-    brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
-    brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
-
-    /* Another interpolation, in the z dimension: */
-    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );    
-    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
-    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
-
-    /* Exit the loop if we've computed both cubes... */
-    origin = p->nr_insn;
-    brw_push_insn_state( p );
-    brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
-    brw_pop_insn_state( p );
-
-    /* Save the result for the w=0 case, and increment the w coordinate: */
-    brw_MOV( p, w0, tmp[ 0 ] );
-    brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
-	     brw_imm_uw( 1 ) );
-
-    /* Loop around for the other cube.  Explicitly set the flag register
-       (unfortunately we must spend an extra instruction to do this: we
-       can't rely on a side effect of the previous MOV or ADD because
-       conditional modifiers which are normally true might be false in
-       exceptional circumstances, e.g. given a NaN input; the add to
-       brw_ip_reg() is not suitable because the IP is not an 8-vector). */
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
-    brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
-	     brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
-    brw_pop_insn_state( p );
-
-    /* Patch the previous conditional branch now that we know the
-       destination address. */
-    brw_set_src1( p->store + origin,
-		  brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
-
-    /* The very last interpolation. */
-    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );    
-    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
-    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
-
-    /* scale by pow( 2, -15 ), as described above */
-    brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
-
-    release_tmps( c, mark );
-}
-
-static void emit_noise4( struct brw_wm_compile *c,
-			 const struct prog_instruction *inst )
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
-    GLuint mask = inst->DstReg.WriteMask;
-    int i;
-    int mark = mark_tmps( c );
-
-    assert( mark == 0 );
-    
-    src0 = get_src_reg( c, inst, 0, 0 );
-    src1 = get_src_reg( c, inst, 0, 1 );
-    src2 = get_src_reg( c, inst, 0, 2 );
-    src3 = get_src_reg( c, inst, 0, 3 );
-
-    param0 = alloc_tmp( c );
-    param1 = alloc_tmp( c );
-    param2 = alloc_tmp( c );
-    param3 = alloc_tmp( c );
-
-    brw_MOV( p, param0, src0 );
-    brw_MOV( p, param1, src1 );
-    brw_MOV( p, param2, src2 );
-    brw_MOV( p, param3, src3 );
-
-    invoke_subroutine( c, SUB_NOISE4, noise4_sub );
-    
-    /* Fill in the result: */
-    brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    brw_MOV( p, dst, param0 );
-	}
-    }
-    if( inst->SaturateMode == SATURATE_ZERO_ONE )
-	brw_set_saturate( p, 0 );
-    
-    release_tmps( c, mark );
-}
     
 static void emit_wpos_xy(struct brw_wm_compile *c,
                          const struct prog_instruction *inst)
@@ -2543,19 +1507,18 @@ static void emit_wpos_xy(struct brw_wm_compile *c,
      * X and Y channels.
      */
     if (mask & WRITEMASK_X) {
-	/* X' = X - origin_x */
-	brw_ADD(p,
+	/* X' = X */
+	brw_MOV(p,
 		dst[0],
-		retype(src0[0], BRW_REGISTER_TYPE_W),
-		brw_imm_d(0 - c->key.origin_x));
+		retype(src0[0], BRW_REGISTER_TYPE_W));
     }
 
     if (mask & WRITEMASK_Y) {
-	/* Y' = height - (Y - origin_y) = height + origin_y - Y */
+	/* Y' = height - 1 - Y */
 	brw_ADD(p,
 		dst[1],
 		negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
-		brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
+		brw_imm_d(c->key.drawable_height - 1));
     }
 }
 
@@ -2827,7 +1790,6 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 		emit_trunc(c, inst);
 		break;
 	    case OPCODE_MOV:
-	    case OPCODE_SWZ:
 		emit_mov(c, inst);
 		break;
 	    case OPCODE_DP3:
@@ -2903,18 +1865,6 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 	    case OPCODE_MAD:
 		emit_mad(c, inst);
 		break;
-	    case OPCODE_NOISE1:
-		emit_noise1(c, inst);
-		break;
-	    case OPCODE_NOISE2:
-		emit_noise2(c, inst);
-		break;
-	    case OPCODE_NOISE3:
-		emit_noise3(c, inst);
-		break;
-	    case OPCODE_NOISE4:
-		emit_noise4(c, inst);
-		break;
 	    case OPCODE_TEX:
 		emit_tex(c, inst);
 		break;
diff --git a/src/gallium/drivers/i965/brw_wm_pass0.c b/src/gallium/drivers/i965/brw_wm_pass0.c
index 6279258339..0c411b57f5 100644
--- a/src/gallium/drivers/i965/brw_wm_pass0.c
+++ b/src/gallium/drivers/i965/brw_wm_pass0.c
@@ -422,7 +422,6 @@ void brw_wm_pass0( struct brw_wm_compile *c )
        */      
       switch (inst->Opcode) {
       case OPCODE_MOV: 
-      case OPCODE_SWZ: 
 	 if (!inst->SaturateMode) {
 	    pass0_precalc_mov(c, inst);
 	 }
diff --git a/src/gallium/drivers/i965/brw_wm_pass1.c b/src/gallium/drivers/i965/brw_wm_pass1.c
index b449394029..d940ec09a9 100644
--- a/src/gallium/drivers/i965/brw_wm_pass1.c
+++ b/src/gallium/drivers/i965/brw_wm_pass1.c
@@ -120,7 +120,7 @@ void brw_wm_pass1( struct brw_wm_compile *c )
       GLuint writemask;
       GLuint read0, read1, read2;
 
-      if (inst->opcode == OPCODE_KIL) {
+      if (inst->opcode == TGSI_OPCODE_KIL) {
 	 track_arg(c, inst, 0, WRITEMASK_XYZW); /* All args contribute to final */
 	 continue;
       }
@@ -154,76 +154,75 @@ void brw_wm_pass1( struct brw_wm_compile *c )
       /* Mark all inputs which contribute to the marked outputs:
        */
       switch (inst->opcode) {
-      case OPCODE_ABS:
-      case OPCODE_FLR:
-      case OPCODE_FRC:
-      case OPCODE_MOV:
-      case OPCODE_SWZ:
-      case OPCODE_TRUNC:
+      case TGSI_OPCODE_ABS:
+      case TGSI_OPCODE_FLR:
+      case TGSI_OPCODE_FRC:
+      case TGSI_OPCODE_MOV:
+      case TGSI_OPCODE_TRUNC:
 	 read0 = writemask;
 	 break;
 
-      case OPCODE_SUB:
-      case OPCODE_SLT:
-      case OPCODE_SLE:
-      case OPCODE_SGE:
-      case OPCODE_SGT:
-      case OPCODE_SEQ:
-      case OPCODE_SNE:
-      case OPCODE_ADD:
-      case OPCODE_MAX:
-      case OPCODE_MIN:
-      case OPCODE_MUL:
+      case TGSI_OPCODE_SUB:
+      case TGSI_OPCODE_SLT:
+      case TGSI_OPCODE_SLE:
+      case TGSI_OPCODE_SGE:
+      case TGSI_OPCODE_SGT:
+      case TGSI_OPCODE_SEQ:
+      case TGSI_OPCODE_SNE:
+      case TGSI_OPCODE_ADD:
+      case TGSI_OPCODE_MAX:
+      case TGSI_OPCODE_MIN:
+      case TGSI_OPCODE_MUL:
 	 read0 = writemask;
 	 read1 = writemask;
 	 break;
 
-      case OPCODE_DDX:
-      case OPCODE_DDY:
+      case TGSI_OPCODE_DDX:
+      case TGSI_OPCODE_DDY:
 	 read0 = writemask;
 	 break;
 
-      case OPCODE_MAD:	
-      case OPCODE_CMP:
-      case OPCODE_LRP:
+      case TGSI_OPCODE_MAD:	
+      case TGSI_OPCODE_CMP:
+      case TGSI_OPCODE_LRP:
 	 read0 = writemask;
 	 read1 = writemask;	
 	 read2 = writemask;	
 	 break;
 
-      case OPCODE_XPD: 
+      case TGSI_OPCODE_XPD: 
 	 if (writemask & WRITEMASK_X) read0 |= WRITEMASK_YZ;	 
 	 if (writemask & WRITEMASK_Y) read0 |= WRITEMASK_XZ;	 
 	 if (writemask & WRITEMASK_Z) read0 |= WRITEMASK_XY;
 	 read1 = read0;
 	 break;
 
-      case OPCODE_COS:
-      case OPCODE_EX2:
-      case OPCODE_LG2:
-      case OPCODE_RCP:
-      case OPCODE_RSQ:
-      case OPCODE_SIN:
-      case OPCODE_SCS:
+      case TGSI_OPCODE_COS:
+      case TGSI_OPCODE_EX2:
+      case TGSI_OPCODE_LG2:
+      case TGSI_OPCODE_RCP:
+      case TGSI_OPCODE_RSQ:
+      case TGSI_OPCODE_SIN:
+      case TGSI_OPCODE_SCS:
       case WM_CINTERP:
       case WM_PIXELXY:
 	 read0 = WRITEMASK_X;
 	 break;
 
-      case OPCODE_POW:
+      case TGSI_OPCODE_POW:
 	 read0 = WRITEMASK_X;
 	 read1 = WRITEMASK_X;
 	 break;
 
-      case OPCODE_TEX:
-      case OPCODE_TXP:
+      case TGSI_OPCODE_TEX:
+      case TGSI_OPCODE_TXP:
 	 read0 = get_texcoord_mask(inst->tex_idx);
 
          if (inst->tex_shadow)
 	    read0 |= WRITEMASK_Z;
 	 break;
 
-      case OPCODE_TXB:
+      case TGSI_OPCODE_TXB:
 	 /* Shadow ignored for txb.
 	  */
 	 read0 = get_texcoord_mask(inst->tex_idx) | WRITEMASK_W;
@@ -254,28 +253,28 @@ void brw_wm_pass1( struct brw_wm_compile *c )
 	 read2 = WRITEMASK_W; /* pixel w */
 	 break;
 
-      case OPCODE_DP3:	
+      case TGSI_OPCODE_DP3:	
 	 read0 = WRITEMASK_XYZ;
 	 read1 = WRITEMASK_XYZ;
 	 break;
 
-      case OPCODE_DPH:
+      case TGSI_OPCODE_DPH:
 	 read0 = WRITEMASK_XYZ;
 	 read1 = WRITEMASK_XYZW;
 	 break;
 
-      case OPCODE_DP4:
+      case TGSI_OPCODE_DP4:
 	 read0 = WRITEMASK_XYZW;
 	 read1 = WRITEMASK_XYZW;
 	 break;
 
-      case OPCODE_LIT: 
+      case TGSI_OPCODE_LIT: 
 	 read0 = WRITEMASK_XYW;
 	 break;
 
-      case OPCODE_DST:
+      case TGSI_OPCODE_DST:
       case WM_FRONTFACING:
-      case OPCODE_KIL_NV:
+      case TGSI_OPCODE_KIL_NV:
       default:
 	 break;
       }
diff --git a/src/gallium/drivers/i965/intel_chipset.h b/src/gallium/drivers/i965/intel_chipset.h
index 3dc8653a73..3c38f1676c 100644
--- a/src/gallium/drivers/i965/intel_chipset.h
+++ b/src/gallium/drivers/i965/intel_chipset.h
@@ -66,7 +66,6 @@
 #define PCI_CHIP_Q45_G                  0x2E12
 #define PCI_CHIP_G45_G                  0x2E22
 #define PCI_CHIP_G41_G                  0x2E32
-#define PCI_CHIP_B43_G                  0x2E42
 
 #define PCI_CHIP_ILD_G                  0x0042
 #define PCI_CHIP_ILM_G                  0x0046
@@ -84,8 +83,7 @@
 #define IS_G45(devid)           (devid == PCI_CHIP_IGD_E_G || \
                                  devid == PCI_CHIP_Q45_G || \
                                  devid == PCI_CHIP_G45_G || \
-                                 devid == PCI_CHIP_G41_G || \
-                                 devid == PCI_CHIP_B43_G)
+                                 devid == PCI_CHIP_G41_G)
 #define IS_GM45(devid)          (devid == PCI_CHIP_GM45_GM)
 #define IS_G4X(devid)		(IS_G45(devid) || IS_GM45(devid))