From 2f5f7c07732577f60666e3cee69c75c9b035c145 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Fri, 23 Oct 2009 16:55:02 +0100
Subject: i965g: re-starting from the dri driver

---
 src/gallium/drivers/i965/brw_vs.c | 124 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 124 insertions(+)
 create mode 100644 src/gallium/drivers/i965/brw_vs.c

(limited to 'src/gallium/drivers/i965/brw_vs.c')

diff --git a/src/gallium/drivers/i965/brw_vs.c b/src/gallium/drivers/i965/brw_vs.c
new file mode 100644
index 0000000000..f0c79efbd9
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_vs.c
@@ -0,0 +1,124 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+           
+
+#include "brw_context.h"
+#include "brw_vs.h"
+#include "brw_util.h"
+#include "brw_state.h"
+#include "shader/prog_print.h"
+
+/* Compile vertex program vp for the given key, then upload the code and
+ * its prog_data to the program cache, leaving the bo in brw->vs.prog_bo. */
+static void do_vs_prog( struct brw_context *brw, 
+			struct brw_vertex_program *vp,
+			struct brw_vs_prog_key *key )
+{
+   GLuint program_size;
+   const GLuint *program;
+   struct brw_vs_compile c;
+
+   memset(&c, 0, sizeof(c));
+   memcpy(&c.key, key, sizeof(*key));
+
+   brw_init_compile(brw, &c.func);
+   c.vp = vp;
+
+   c.prog_data.outputs_written = vp->program.Base.OutputsWritten;
+   c.prog_data.inputs_read = vp->program.Base.InputsRead;
+   /* copy_edgeflag adds the edge flag to both the inputs and the outputs. */
+   if (c.key.copy_edgeflag) {
+      c.prog_data.outputs_written |= 1<<VERT_RESULT_EDGE;
+      c.prog_data.inputs_read |= 1<<VERT_ATTRIB_EDGEFLAG;
+   }
+   /* Debug aid, compiled out: print the program being compiled. */
+   if (0)
+      _mesa_print_program(&c.vp->program.Base);
+
+
+
+   /* Emit GEN4 code.
+    */
+   brw_vs_emit(&c);
+
+   /* Fetch the emitted code; brw_get_program also fills in program_size.
+    */
+   program = brw_get_program(&c.func, &program_size);
+   /* Drop our reference to the previous bo before storing the new one. */
+   dri_bo_unreference(brw->vs.prog_bo);
+   brw->vs.prog_bo = brw_upload_cache( &brw->cache, BRW_VS_PROG,
+				       &c.key, sizeof(c.key),
+				       NULL, 0,
+				       program, program_size,
+				       &c.prog_data,
+				       &brw->vs.prog_data );
+}
+/* Build the VS program key from current state and bind the matching
+ * program: reuse a cached one if the key hits, otherwise compile it. */
+static void brw_upload_vs_prog(struct brw_context *brw)
+{
+   GLcontext *ctx = &brw->intel.ctx;
+   struct brw_vs_prog_key key;
+   struct brw_vertex_program *vp = 
+      (struct brw_vertex_program *)brw->vertex_program;
+   /* Zero the whole key first; it is handed to the cache by address+size. */
+   memset(&key, 0, sizeof(key));
+
+   /* Just upload the program verbatim for now.  Always send it all
+    * the inputs it asks for, whether they are varying or not.
+    */
+   key.program_string_id = vp->id;
+   key.nr_userclip = brw_count_bits(ctx->Transform.ClipPlanesEnabled);
+   key.copy_edgeflag = (ctx->Polygon.FrontMode != GL_FILL ||
+			ctx->Polygon.BackMode != GL_FILL);
+
+   /* Look the key up in the cache first; only a miss (NULL result)
+    * falls through to do_vs_prog() to compile and upload the program. */
+   dri_bo_unreference(brw->vs.prog_bo);
+   brw->vs.prog_bo = brw_search_cache(&brw->cache, BRW_VS_PROG,
+				      &key, sizeof(key),
+				      NULL, 0,
+				      &brw->vs.prog_data);
+   if (brw->vs.prog_bo == NULL)
+      do_vs_prog(brw, vp, &key);
+}
+
+
+/* Tracked-state entry for the VS program; .prepare is brw_upload_vs_prog.
+ * NOTE(review): presumably re-run when any .dirty bit below is flagged. */
+const struct brw_tracked_state brw_vs_prog = {
+   .dirty = {
+      .mesa  = _NEW_TRANSFORM | _NEW_POLYGON,
+      .brw   = BRW_NEW_VERTEX_PROGRAM,
+      .cache = 0
+   },
+   .prepare = brw_upload_vs_prog
+};
-- 
cgit v1.2.3


From 57a920cb1a0b6051068e730747b3fb475de88aca Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Fri, 23 Oct 2009 17:01:32 +0100
Subject: i965g: wip

---
 src/gallium/drivers/i965/brw_bo.c             |   12 +
 src/gallium/drivers/i965/brw_cc.c             |  180 +----
 src/gallium/drivers/i965/brw_clip.c           |  127 +--
 src/gallium/drivers/i965/brw_clip.h           |    5 +-
 src/gallium/drivers/i965/brw_clip_line.c      |    7 -
 src/gallium/drivers/i965/brw_clip_point.c     |    7 -
 src/gallium/drivers/i965/brw_clip_state.c     |    7 +-
 src/gallium/drivers/i965/brw_clip_tri.c       |    7 -
 src/gallium/drivers/i965/brw_clip_unfilled.c  |    5 -
 src/gallium/drivers/i965/brw_clip_util.c      |    7 -
 src/gallium/drivers/i965/brw_context.c        |  135 ++--
 src/gallium/drivers/i965/brw_context.h        |    7 +-
 src/gallium/drivers/i965/brw_curbe.c          |   89 +--
 src/gallium/drivers/i965/brw_defines.h        |    4 +-
 src/gallium/drivers/i965/brw_disasm.c         |    2 -
 src/gallium/drivers/i965/brw_draw.c           |  244 +-----
 src/gallium/drivers/i965/brw_draw_upload.c    |  566 ++++---------
 src/gallium/drivers/i965/brw_gs.c             |    2 +-
 src/gallium/drivers/i965/brw_pipe_blend.c     |   41 +
 src/gallium/drivers/i965/brw_pipe_debug.c     |    2 +
 src/gallium/drivers/i965/brw_pipe_depth.c     |   52 ++
 src/gallium/drivers/i965/brw_pipe_fb.c        |   25 +
 src/gallium/drivers/i965/brw_pipe_flush.c     |   64 ++
 src/gallium/drivers/i965/brw_screen_surface.c |   27 +
 src/gallium/drivers/i965/brw_sf.c             |    4 +-
 src/gallium/drivers/i965/brw_sf_emit.c        |    4 +-
 src/gallium/drivers/i965/brw_state_upload.c   |   63 +-
 src/gallium/drivers/i965/brw_swtnl.c          |  114 +++
 src/gallium/drivers/i965/brw_types.h          |   11 +
 src/gallium/drivers/i965/brw_util.c           |    8 -
 src/gallium/drivers/i965/brw_vs.c             |   12 +-
 src/gallium/drivers/i965/brw_vs_emit.c        |  250 ++----
 src/gallium/drivers/i965/brw_wm.c             |   59 +-
 src/gallium/drivers/i965/brw_wm.h             |    1 -
 src/gallium/drivers/i965/brw_wm_emit.c        |   17 +-
 src/gallium/drivers/i965/brw_wm_fp.c          |  193 ++---
 src/gallium/drivers/i965/brw_wm_glsl.c        | 1060 +------------------------
 src/gallium/drivers/i965/brw_wm_pass0.c       |    1 -
 src/gallium/drivers/i965/brw_wm_pass1.c       |   81 +-
 src/gallium/drivers/i965/intel_chipset.h      |    4 +-
 40 files changed, 907 insertions(+), 2599 deletions(-)
 create mode 100644 src/gallium/drivers/i965/brw_bo.c
 create mode 100644 src/gallium/drivers/i965/brw_pipe_blend.c
 create mode 100644 src/gallium/drivers/i965/brw_pipe_debug.c
 create mode 100644 src/gallium/drivers/i965/brw_pipe_depth.c
 create mode 100644 src/gallium/drivers/i965/brw_pipe_fb.c
 create mode 100644 src/gallium/drivers/i965/brw_pipe_flush.c
 create mode 100644 src/gallium/drivers/i965/brw_screen_surface.c
 create mode 100644 src/gallium/drivers/i965/brw_swtnl.c
 create mode 100644 src/gallium/drivers/i965/brw_types.h

(limited to 'src/gallium/drivers/i965/brw_vs.c')

diff --git a/src/gallium/drivers/i965/brw_bo.c b/src/gallium/drivers/i965/brw_bo.c
new file mode 100644
index 0000000000..e7a4dac666
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_bo.c
@@ -0,0 +1,12 @@
+/* WIP stub: intel, bo, offset, index_buffer and ib_size are not declared
+ * here.  Copies ib_size bytes into bo at offset via a GTT map or subdata. */
+void brw_buffer_subdata()
+{
+      if (intel->intelScreen->kernel_exec_fencing) {
+	 drm_intel_gem_bo_map_gtt(bo);
+	 memcpy((char *)bo->virtual + offset, index_buffer->ptr, ib_size);
+	 drm_intel_gem_bo_unmap_gtt(bo);
+      } else {
+	 dri_bo_subdata(bo, offset, ib_size, index_buffer->ptr);
+      }
+}
diff --git a/src/gallium/drivers/i965/brw_cc.c b/src/gallium/drivers/i965/brw_cc.c
index 1088a7a607..9ab5638137 100644
--- a/src/gallium/drivers/i965/brw_cc.c
+++ b/src/gallium/drivers/i965/brw_cc.c
@@ -62,84 +62,21 @@ const struct brw_tracked_state brw_cc_vp = {
 };
 
 struct brw_cc_unit_key {
-   GLboolean stencil, stencil_two_side, color_blend, alpha_enabled;
-
-   GLenum stencil_func[2], stencil_fail_op[2];
-   GLenum stencil_pass_depth_fail_op[2], stencil_pass_depth_pass_op[2];
-   GLubyte stencil_ref[2], stencil_write_mask[2], stencil_test_mask[2];
-   GLenum logic_op;
-
-   GLenum blend_eq_rgb, blend_eq_a;
-   GLenum blend_src_rgb, blend_src_a;
-   GLenum blend_dst_rgb, blend_dst_a;
-
-   GLenum alpha_func;
-   GLclampf alpha_ref;
-
-   GLboolean dither;
-
-   GLboolean depth_test, depth_write;
-   GLenum depth_func;
+   struct pipe_depth_stencil_alpha_state dsa;
+   struct pipe_blend_state blend; /* no color mask */
 };
 
 static void
 cc_unit_populate_key(struct brw_context *brw, struct brw_cc_unit_key *key)
 {
-   GLcontext *ctx = &brw->intel.ctx;
-   const unsigned back = ctx->Stencil._BackFace;
-
    memset(key, 0, sizeof(*key));
+   
+   key->dsa = brw->curr.dsa.base;
+   key->blend = brw->curr.blend.base;
 
-   key->stencil = ctx->Stencil._Enabled;
-   key->stencil_two_side = ctx->Stencil._TestTwoSide;
-
-   if (key->stencil) {
-      key->stencil_func[0] = ctx->Stencil.Function[0];
-      key->stencil_fail_op[0] = ctx->Stencil.FailFunc[0];
-      key->stencil_pass_depth_fail_op[0] = ctx->Stencil.ZFailFunc[0];
-      key->stencil_pass_depth_pass_op[0] = ctx->Stencil.ZPassFunc[0];
-      key->stencil_ref[0] = ctx->Stencil.Ref[0];
-      key->stencil_write_mask[0] = ctx->Stencil.WriteMask[0];
-      key->stencil_test_mask[0] = ctx->Stencil.ValueMask[0];
-   }
-   if (key->stencil_two_side) {
-      key->stencil_func[1] = ctx->Stencil.Function[back];
-      key->stencil_fail_op[1] = ctx->Stencil.FailFunc[back];
-      key->stencil_pass_depth_fail_op[1] = ctx->Stencil.ZFailFunc[back];
-      key->stencil_pass_depth_pass_op[1] = ctx->Stencil.ZPassFunc[back];
-      key->stencil_ref[1] = ctx->Stencil.Ref[back];
-      key->stencil_write_mask[1] = ctx->Stencil.WriteMask[back];
-      key->stencil_test_mask[1] = ctx->Stencil.ValueMask[back];
-   }
-
-   if (ctx->Color._LogicOpEnabled)
-      key->logic_op = ctx->Color.LogicOp;
-   else
-      key->logic_op = GL_COPY;
-
-   key->color_blend = ctx->Color.BlendEnabled;
-   if (key->color_blend) {
-      key->blend_eq_rgb = ctx->Color.BlendEquationRGB;
-      key->blend_eq_a = ctx->Color.BlendEquationA;
-      key->blend_src_rgb = ctx->Color.BlendSrcRGB;
-      key->blend_dst_rgb = ctx->Color.BlendDstRGB;
-      key->blend_src_a = ctx->Color.BlendSrcA;
-      key->blend_dst_a = ctx->Color.BlendDstA;
-   }
-
-   key->alpha_enabled = ctx->Color.AlphaEnabled;
-   if (key->alpha_enabled) {
-      key->alpha_func = ctx->Color.AlphaFunc;
-      key->alpha_ref = ctx->Color.AlphaRef;
-   }
-
-   key->dither = ctx->Color.DitherFlag;
-
-   key->depth_test = ctx->Depth.Test;
-   if (key->depth_test) {
-      key->depth_func = ctx->Depth.Func;
-      key->depth_write = ctx->Depth.Mask;
-   }
+   /* Clear non-respected values:
+    */
+   key->blend.colormask = 0xf;
 }
 
 /**
@@ -153,103 +90,16 @@ cc_unit_create_from_key(struct brw_context *brw, struct brw_cc_unit_key *key)
 
    memset(&cc, 0, sizeof(cc));
 
-   /* _NEW_STENCIL */
-   if (key->stencil) {
-      cc.cc0.stencil_enable = 1;
-      cc.cc0.stencil_func =
-	 intel_translate_compare_func(key->stencil_func[0]);
-      cc.cc0.stencil_fail_op =
-	 intel_translate_stencil_op(key->stencil_fail_op[0]);
-      cc.cc0.stencil_pass_depth_fail_op =
-	 intel_translate_stencil_op(key->stencil_pass_depth_fail_op[0]);
-      cc.cc0.stencil_pass_depth_pass_op =
-	 intel_translate_stencil_op(key->stencil_pass_depth_pass_op[0]);
-      cc.cc1.stencil_ref = key->stencil_ref[0];
-      cc.cc1.stencil_write_mask = key->stencil_write_mask[0];
-      cc.cc1.stencil_test_mask = key->stencil_test_mask[0];
-
-      if (key->stencil_two_side) {
-	 cc.cc0.bf_stencil_enable = 1;
-	 cc.cc0.bf_stencil_func =
-	    intel_translate_compare_func(key->stencil_func[1]);
-	 cc.cc0.bf_stencil_fail_op =
-	    intel_translate_stencil_op(key->stencil_fail_op[1]);
-	 cc.cc0.bf_stencil_pass_depth_fail_op =
-	    intel_translate_stencil_op(key->stencil_pass_depth_fail_op[1]);
-	 cc.cc0.bf_stencil_pass_depth_pass_op =
-	    intel_translate_stencil_op(key->stencil_pass_depth_pass_op[1]);
-	 cc.cc1.bf_stencil_ref = key->stencil_ref[1];
-	 cc.cc2.bf_stencil_write_mask = key->stencil_write_mask[1];
-	 cc.cc2.bf_stencil_test_mask = key->stencil_test_mask[1];
-      }
-
-      /* Not really sure about this:
-       */
-      if (key->stencil_write_mask[0] ||
-	  (key->stencil_two_side && key->stencil_write_mask[1]))
-	 cc.cc0.stencil_write_enable = 1;
-   }
-
-   /* _NEW_COLOR */
-   if (key->logic_op != GL_COPY) {
-      cc.cc2.logicop_enable = 1;
-      cc.cc5.logicop_func = intel_translate_logic_op(key->logic_op);
-   } else if (key->color_blend) {
-      GLenum eqRGB = key->blend_eq_rgb;
-      GLenum eqA = key->blend_eq_a;
-      GLenum srcRGB = key->blend_src_rgb;
-      GLenum dstRGB = key->blend_dst_rgb;
-      GLenum srcA = key->blend_src_a;
-      GLenum dstA = key->blend_dst_a;
-
-      if (eqRGB == GL_MIN || eqRGB == GL_MAX) {
-	 srcRGB = dstRGB = GL_ONE;
-      }
-
-      if (eqA == GL_MIN || eqA == GL_MAX) {
-	 srcA = dstA = GL_ONE;
-      }
-
-      cc.cc6.dest_blend_factor = brw_translate_blend_factor(dstRGB);
-      cc.cc6.src_blend_factor = brw_translate_blend_factor(srcRGB);
-      cc.cc6.blend_function = brw_translate_blend_equation(eqRGB);
-
-      cc.cc5.ia_dest_blend_factor = brw_translate_blend_factor(dstA);
-      cc.cc5.ia_src_blend_factor = brw_translate_blend_factor(srcA);
-      cc.cc5.ia_blend_function = brw_translate_blend_equation(eqA);
-
-      cc.cc3.blend_enable = 1;
-      cc.cc3.ia_blend_enable = (srcA != srcRGB ||
-				dstA != dstRGB ||
-				eqA != eqRGB);
-   }
-
-   if (key->alpha_enabled) {
-      cc.cc3.alpha_test = 1;
-      cc.cc3.alpha_test_func = intel_translate_compare_func(key->alpha_func);
-      cc.cc3.alpha_test_format = BRW_ALPHATEST_FORMAT_UNORM8;
-
-      UNCLAMPED_FLOAT_TO_UBYTE(cc.cc7.alpha_ref.ub[0], key->alpha_ref);
-   }
-
-   if (key->dither) {
-      cc.cc5.dither_enable = 1;
-      cc.cc6.y_dither_offset = 0;
-      cc.cc6.x_dither_offset = 0;
-   }
-
-   /* _NEW_DEPTH */
-   if (key->depth_test) {
-      cc.cc2.depth_test = 1;
-      cc.cc2.depth_test_function = intel_translate_compare_func(key->depth_func);
-      cc.cc2.depth_write_enable = key->depth_write;
-   }
+   cc.cc0 = brw->dsa.cc0;
+   cc.cc1 = brw->dsa.cc1;
+   cc.cc2 = brw->dsa.cc2;
+   cc.cc3 = brw->dsa.cc3 | brw->blend.cc3;
 
    /* CACHE_NEW_CC_VP */
    cc.cc4.cc_viewport_state_offset = brw->cc.vp_bo->offset >> 5; /* reloc */
 
-   if (INTEL_DEBUG & DEBUG_STATS)
-      cc.cc5.statistics_enable = 1;
+   cc.cc5 = brw->blend.cc5 | brw->debug.cc5;
+
 
    bo = brw_upload_cache(&brw->cache, BRW_CC_UNIT,
 			 key, sizeof(*key),
@@ -286,7 +136,7 @@ static void prepare_cc_unit( struct brw_context *brw )
 
 const struct brw_tracked_state brw_cc_unit = {
    .dirty = {
-      .mesa = _NEW_STENCIL | _NEW_COLOR | _NEW_DEPTH,
+      .mesa = PIPE_NEW_DEPTH_STENCIL_ALPHA | PIPE_NEW_BLEND,
       .brw = 0,
       .cache = CACHE_NEW_CC_VP
    },
diff --git a/src/gallium/drivers/i965/brw_clip.c b/src/gallium/drivers/i965/brw_clip.c
index 20a927cf38..df1b3718d0 100644
--- a/src/gallium/drivers/i965/brw_clip.c
+++ b/src/gallium/drivers/i965/brw_clip.c
@@ -29,9 +29,9 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
+#include "pipe/p_state.h"
+
+#include "util/u_math.h"
 
 #include "intel_batchbuffer.h"
 
@@ -83,7 +83,7 @@ static void compile_clip_prog( struct brw_context *brw,
 	 delta += ATTR_SIZE;
       }
 
-   c.nr_attrs = brw_count_bits(c.key.attrs);
+   c.nr_attrs = util_count_bits(c.key.attrs);
    
    if (BRW_IS_IGDNG(brw))
        c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
@@ -104,16 +104,16 @@ static void compile_clip_prog( struct brw_context *brw,
     * do all three:
     */
    switch (key->primitive) {
-   case GL_TRIANGLES: 
+   case PIPE_PRIM_TRIANGLES: 
       if (key->do_unfilled)
 	 brw_emit_unfilled_clip( &c );
       else
 	 brw_emit_tri_clip( &c );
       break;
-   case GL_LINES:
+   case PIPE_PRIM_LINES:
       brw_emit_line_clip( &c );
       break;
-   case GL_POINTS:
+   case PIPE_PRIM_POINTS:
       brw_emit_point_clip( &c );
       break;
    default:
@@ -143,7 +143,6 @@ static void compile_clip_prog( struct brw_context *brw,
  */
 static void upload_clip_prog(struct brw_context *brw)
 {
-   GLcontext *ctx = &brw->intel.ctx;
    struct brw_clip_prog_key key;
 
    memset(&key, 0, sizeof(key));
@@ -151,101 +150,51 @@ static void upload_clip_prog(struct brw_context *brw)
    /* Populate the key:
     */
    /* BRW_NEW_REDUCED_PRIMITIVE */
-   key.primitive = brw->intel.reduced_primitive;
+   key.primitive = brw->reduced_primitive;
    /* CACHE_NEW_VS_PROG */
    key.attrs = brw->vs.prog_data->outputs_written;
-   /* _NEW_LIGHT */
-   key.do_flat_shading = (ctx->Light.ShadeModel == GL_FLAT);
-   /* _NEW_TRANSFORM */
-   key.nr_userclip = brw_count_bits(ctx->Transform.ClipPlanesEnabled);
+   /* PIPE_NEW_RAST */
+   key.do_flat_shading = brw->rast.base.flatshade;
+   /* PIPE_NEW_UCP */
+   key.nr_userclip = brw->nr_ucp;
 
    if (BRW_IS_IGDNG(brw))
        key.clip_mode = BRW_CLIPMODE_KERNEL_CLIP;
    else
        key.clip_mode = BRW_CLIPMODE_NORMAL;
 
-   /* _NEW_POLYGON */
-   if (key.primitive == GL_TRIANGLES) {
-      if (ctx->Polygon.CullFlag &&
-	  ctx->Polygon.CullFaceMode == GL_FRONT_AND_BACK)
+   /* PIPE_NEW_RAST: culling both windings rejects every triangle */
+   if (key.primitive == PIPE_PRIM_TRIANGLES) {
+      if (brw->rast->cull_mode == PIPE_WINDING_BOTH)
 	 key.clip_mode = BRW_CLIPMODE_REJECT_ALL;
       else {
-	 GLuint fill_front = CLIP_CULL;
-	 GLuint fill_back = CLIP_CULL;
-	 GLuint offset_front = 0;
-	 GLuint offset_back = 0;
-
-	 if (!ctx->Polygon.CullFlag ||
-	     ctx->Polygon.CullFaceMode != GL_FRONT) {
-	    switch (ctx->Polygon.FrontMode) {
-	    case GL_FILL: 
-	       fill_front = CLIP_FILL; 
-	       offset_front = 0;
-	       break;
-	    case GL_LINE:
-	       fill_front = CLIP_LINE;
-	       offset_front = ctx->Polygon.OffsetLine;
-	       break;
-	    case GL_POINT:
-	       fill_front = CLIP_POINT;
-	       offset_front = ctx->Polygon.OffsetPoint;
-	       break;
-	    }
+	 key.fill_ccw = CLIP_CULL;
+	 key.fill_cw = CLIP_CULL;
+
+	 if (!(brw->rast->cull_mode & PIPE_WINDING_CCW)) {
+	    key.fill_ccw = translate_fill(brw->rast.fill_ccw);
 	 }
 
-	 if (!ctx->Polygon.CullFlag ||
-	     ctx->Polygon.CullFaceMode != GL_BACK) {
-	    switch (ctx->Polygon.BackMode) {
-	    case GL_FILL: 
-	       fill_back = CLIP_FILL; 
-	       offset_back = 0;
-	       break;
-	    case GL_LINE:
-	       fill_back = CLIP_LINE;
-	       offset_back = ctx->Polygon.OffsetLine;
-	       break;
-	    case GL_POINT:
-	       fill_back = CLIP_POINT;
-	       offset_back = ctx->Polygon.OffsetPoint;
-	       break;
-	    }
+	 if (!(brw->rast->cull_mode & PIPE_WINDING_CW)) {
+	    key.fill_cw = translate_fill(brw->rast.fill_cw);
 	 }
 
-	 if (ctx->Polygon.BackMode != GL_FILL ||
-	     ctx->Polygon.FrontMode != GL_FILL) {
+	 if (key.fill_cw != CLIP_FILL ||
+	     key.fill_ccw != CLIP_FILL) {
 	    key.do_unfilled = 1;
-
-	    /* Most cases the fixed function units will handle.  Cases where
-	     * one or more polygon faces are unfilled will require help:
-	     */
 	    key.clip_mode = BRW_CLIPMODE_CLIP_NON_REJECTED;
+	 }
+
+	 key.offset_ccw = brw->rast.offset_ccw;
+	 key.offset_cw = brw->rast.offset_cw;
+
+	 if (brw->rast.light_twoside &&
+	     key.fill_cw != CLIP_CULL) 
+	    key.copy_bfc_cw = 1;
 
-	    if (offset_back || offset_front) {
-	       /* _NEW_POLYGON, _NEW_BUFFERS */
-	       key.offset_units = ctx->Polygon.OffsetUnits * brw->intel.polygon_offset_scale;
-	       key.offset_factor = ctx->Polygon.OffsetFactor * ctx->DrawBuffer->_MRD;
-	    }
-
-	    switch (ctx->Polygon.FrontFace) {
-	    case GL_CCW:
-	       key.fill_ccw = fill_front;
-	       key.fill_cw = fill_back;
-	       key.offset_ccw = offset_front;
-	       key.offset_cw = offset_back;
-	       if (ctx->Light.Model.TwoSide &&
-		   key.fill_cw != CLIP_CULL) 
-		  key.copy_bfc_cw = 1;
-	       break;
-	    case GL_CW:
-	       key.fill_cw = fill_front;
-	       key.fill_ccw = fill_back;
-	       key.offset_cw = offset_front;
-	       key.offset_ccw = offset_back;
-	       if (ctx->Light.Model.TwoSide &&
-		   key.fill_ccw != CLIP_CULL) 
-		  key.copy_bfc_ccw = 1;
-	       break;
-	    }
+	 if (brw->rast.light_twoside &&
+	     key.fill_ccw != CLIP_CULL) 
+	    key.copy_bfc_ccw = 1;
 	 }
       }
    }
@@ -262,10 +211,8 @@ static void upload_clip_prog(struct brw_context *brw)
 
 const struct brw_tracked_state brw_clip_prog = {
    .dirty = {
-      .mesa  = (_NEW_LIGHT | 
-		_NEW_TRANSFORM |
-		_NEW_POLYGON | 
-		_NEW_BUFFERS),
+      .mesa  = (PIPE_NEW_RAST | 
+		PIPE_NEW_UCP),
       .brw   = (BRW_NEW_REDUCED_PRIMITIVE),
       .cache = CACHE_NEW_VS_PROG
    },
diff --git a/src/gallium/drivers/i965/brw_clip.h b/src/gallium/drivers/i965/brw_clip.h
index 957df441ab..d80ec819b9 100644
--- a/src/gallium/drivers/i965/brw_clip.h
+++ b/src/gallium/drivers/i965/brw_clip.h
@@ -43,6 +43,7 @@
  */
 struct brw_clip_prog_key {
    GLuint attrs:32;		
+
    GLuint primitive:4;
    GLuint nr_userclip:3;
    GLuint do_flat_shading:1;
@@ -51,12 +52,10 @@ struct brw_clip_prog_key {
    GLuint fill_ccw:2;		/* includes cull information */
    GLuint offset_cw:1;
    GLuint offset_ccw:1;
-   GLuint pad0:17;
-
    GLuint copy_bfc_cw:1;
    GLuint copy_bfc_ccw:1;
    GLuint clip_mode:3;
-   GLuint pad1:27;
+   GLuint pad1:12;
    
    GLfloat offset_factor;
    GLfloat offset_units;
diff --git a/src/gallium/drivers/i965/brw_clip_line.c b/src/gallium/drivers/i965/brw_clip_line.c
index 048ca620fa..6b4da25644 100644
--- a/src/gallium/drivers/i965/brw_clip_line.c
+++ b/src/gallium/drivers/i965/brw_clip_line.c
@@ -29,13 +29,6 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "shader/program.h"
-
-#include "intel_batchbuffer.h"
-
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_eu.h"
diff --git a/src/gallium/drivers/i965/brw_clip_point.c b/src/gallium/drivers/i965/brw_clip_point.c
index 8458f61c5a..b2cf7b2011 100644
--- a/src/gallium/drivers/i965/brw_clip_point.c
+++ b/src/gallium/drivers/i965/brw_clip_point.c
@@ -29,13 +29,6 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "shader/program.h"
-
-#include "intel_batchbuffer.h"
-
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_eu.h"
diff --git a/src/gallium/drivers/i965/brw_clip_state.c b/src/gallium/drivers/i965/brw_clip_state.c
index 234b3744bf..72e27205e2 100644
--- a/src/gallium/drivers/i965/brw_clip_state.c
+++ b/src/gallium/drivers/i965/brw_clip_state.c
@@ -32,7 +32,6 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "main/macros.h"
 
 struct brw_clip_unit_key {
    unsigned int total_grf;
@@ -66,8 +65,8 @@ clip_unit_populate_key(struct brw_context *brw, struct brw_clip_unit_key *key)
    key->nr_urb_entries = brw->urb.nr_clip_entries;
    key->urb_size = brw->urb.vsize;
 
-   /* _NEW_TRANSOFORM */
-   key->depth_clamp = ctx->Transform.DepthClamp;
+   /*  */
+   key->depth_clamp = 0; // XXX: add this to gallium: ctx->Transform.DepthClamp;
 }
 
 static dri_bo *
@@ -175,7 +174,7 @@ static void upload_clip_unit( struct brw_context *brw )
 
 const struct brw_tracked_state brw_clip_unit = {
    .dirty = {
-      .mesa  = _NEW_TRANSFORM,
+      .mesa  = 0,
       .brw   = (BRW_NEW_CURBE_OFFSETS |
 		BRW_NEW_URB_FENCE),
       .cache = CACHE_NEW_CLIP_PROG
diff --git a/src/gallium/drivers/i965/brw_clip_tri.c b/src/gallium/drivers/i965/brw_clip_tri.c
index 0efd77225e..d8feca6a87 100644
--- a/src/gallium/drivers/i965/brw_clip_tri.c
+++ b/src/gallium/drivers/i965/brw_clip_tri.c
@@ -29,13 +29,6 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "shader/program.h"
-
-#include "intel_batchbuffer.h"
-
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_eu.h"
diff --git a/src/gallium/drivers/i965/brw_clip_unfilled.c b/src/gallium/drivers/i965/brw_clip_unfilled.c
index ad1bfa435f..4baff55806 100644
--- a/src/gallium/drivers/i965/brw_clip_unfilled.c
+++ b/src/gallium/drivers/i965/brw_clip_unfilled.c
@@ -29,11 +29,6 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "shader/program.h"
-
 #include "intel_batchbuffer.h"
 
 #include "brw_defines.h"
diff --git a/src/gallium/drivers/i965/brw_clip_util.c b/src/gallium/drivers/i965/brw_clip_util.c
index 5a73abdfee..7a6c46ce07 100644
--- a/src/gallium/drivers/i965/brw_clip_util.c
+++ b/src/gallium/drivers/i965/brw_clip_util.c
@@ -30,13 +30,6 @@
   */
 
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "shader/program.h"
-
-#include "intel_batchbuffer.h"
-
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_eu.h"
diff --git a/src/gallium/drivers/i965/brw_context.c b/src/gallium/drivers/i965/brw_context.c
index c300c33adc..bf0ec89e13 100644
--- a/src/gallium/drivers/i965/brw_context.c
+++ b/src/gallium/drivers/i965/brw_context.c
@@ -52,122 +52,77 @@
 #include "utils.h"
 
 
-/***************************************
- * Mesa's Driver Functions
- ***************************************/
-
-static void brwUseProgram(GLcontext *ctx, GLuint program)
-{
-   _mesa_use_program(ctx, program);
-}
-
-static void brwInitProgFuncs( struct dd_function_table *functions )
-{
-   functions->UseProgram = brwUseProgram;
-}
-static void brwInitDriverFunctions( struct dd_function_table *functions )
-{
-   intelInitDriverFunctions( functions );
-
-   brwInitFragProgFuncs( functions );
-   brwInitProgFuncs( functions );
-   brw_init_queryobj_functions(functions);
-
-   functions->Viewport = intel_viewport;
-}
 
 GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
 			    __DRIcontextPrivate *driContextPriv,
 			    void *sharedContextPrivate)
 {
-   struct dd_function_table functions;
    struct brw_context *brw = (struct brw_context *) CALLOC_STRUCT(brw_context);
-   struct intel_context *intel = &brw->intel;
-   GLcontext *ctx = &intel->ctx;
 
    if (!brw) {
-      _mesa_printf("%s: failed to alloc context\n", __FUNCTION__);
-      return GL_FALSE;
-   }
-
-   brwInitVtbl( brw );
-   brwInitDriverFunctions( &functions );
-
-   if (!intelInitContext( intel, mesaVis, driContextPriv,
-			  sharedContextPrivate, &functions )) {
-      _mesa_printf("%s: failed to init intel context\n", __FUNCTION__);
-      FREE(brw);
+      debug_printf("%s: failed to alloc context\n", __FUNCTION__);
       return GL_FALSE;
    }
 
-   /* Initialize swrast, tnl driver tables: */
-   intelInitSpanFuncs(ctx);
-
-   TNL_CONTEXT(ctx)->Driver.RunPipeline = _tnl_run_pipeline;
-
-   ctx->Const.MaxTextureImageUnits = BRW_MAX_TEX_UNIT;
-   ctx->Const.MaxTextureCoordUnits = 8; /* Mesa limit */
-   ctx->Const.MaxTextureUnits = MIN2(ctx->Const.MaxTextureCoordUnits,
-                                     ctx->Const.MaxTextureImageUnits);
-   ctx->Const.MaxVertexTextureImageUnits = 0; /* no vertex shader textures */
-
-   /* Mesa limits textures to 4kx4k; it would be nice to fix that someday
-    */
-   ctx->Const.MaxTextureLevels = 13;
-   ctx->Const.Max3DTextureLevels = 9;
-   ctx->Const.MaxCubeTextureLevels = 12;
-   ctx->Const.MaxTextureRectSize = (1<<12);
-   
-   ctx->Const.MaxTextureMaxAnisotropy = 16.0;
-
-   /* if conformance mode is set, swrast can handle any size AA point */
-   ctx->Const.MaxPointSizeAA = 255.0;
-
    /* We want the GLSL compiler to emit code that uses condition codes */
    ctx->Shader.EmitCondCodes = GL_TRUE;
    ctx->Shader.EmitNVTempInitialization = GL_TRUE;
 
-   ctx->Const.VertexProgram.MaxNativeInstructions = (16 * 1024);
-   ctx->Const.VertexProgram.MaxAluInstructions = 0;
-   ctx->Const.VertexProgram.MaxTexInstructions = 0;
-   ctx->Const.VertexProgram.MaxTexIndirections = 0;
-   ctx->Const.VertexProgram.MaxNativeAluInstructions = 0;
-   ctx->Const.VertexProgram.MaxNativeTexInstructions = 0;
-   ctx->Const.VertexProgram.MaxNativeTexIndirections = 0;
-   ctx->Const.VertexProgram.MaxNativeAttribs = 16;
-   ctx->Const.VertexProgram.MaxNativeTemps = 256;
-   ctx->Const.VertexProgram.MaxNativeAddressRegs = 1;
-   ctx->Const.VertexProgram.MaxNativeParameters = 1024;
-   ctx->Const.VertexProgram.MaxEnvParams =
-      MIN2(ctx->Const.VertexProgram.MaxNativeParameters,
-	   ctx->Const.VertexProgram.MaxEnvParams);
-
-   ctx->Const.FragmentProgram.MaxNativeInstructions = (16 * 1024);
-   ctx->Const.FragmentProgram.MaxNativeAluInstructions = (16 * 1024);
-   ctx->Const.FragmentProgram.MaxNativeTexInstructions = (16 * 1024);
-   ctx->Const.FragmentProgram.MaxNativeTexIndirections = (16 * 1024);
-   ctx->Const.FragmentProgram.MaxNativeAttribs = 12;
-   ctx->Const.FragmentProgram.MaxNativeTemps = 256;
-   ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0;
-   ctx->Const.FragmentProgram.MaxNativeParameters = 1024;
-   ctx->Const.FragmentProgram.MaxEnvParams =
-      MIN2(ctx->Const.FragmentProgram.MaxNativeParameters,
-	   ctx->Const.FragmentProgram.MaxEnvParams);
 
+   brw_init_query( brw );
    brw_init_state( brw );
+   brw_draw_init( brw );
 
    brw->state.dirty.mesa = ~0;
    brw->state.dirty.brw = ~0;
 
    brw->emit_state_always = 0;
 
-   ctx->VertexProgram._MaintainTnlProgram = GL_TRUE;
-   ctx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE;
-
    make_empty_list(&brw->query.active_head);
 
-   brw_draw_init( brw );
 
    return GL_TRUE;
 }
 
+/**
+ * Called from intelDestroyContext(): tears down brw state/draw data and
+ * drops the region and buffer-object references held by the context. */
+static void brw_destroy_context( struct intel_context *intel )
+{
+   struct brw_context *brw = brw_context(&intel->ctx);
+   int i;
+   /* NOTE(review): brw_context() is retyped to pipe_context in this patch. */
+   brw_destroy_state(brw);
+   brw_draw_destroy( brw );
+
+   _mesa_free(brw->wm.compile_data);
+   /* Release the color regions, then the depth region. */
+   for (i = 0; i < brw->state.nr_color_regions; i++)
+      intel_region_release(&brw->state.color_regions[i]);
+   brw->state.nr_color_regions = 0;
+   intel_region_release(&brw->state.depth_region);
+   /* Unreference every bo the context still points at. */
+   dri_bo_unreference(brw->curbe.curbe_bo);
+   dri_bo_unreference(brw->vs.prog_bo);
+   dri_bo_unreference(brw->vs.state_bo);
+   dri_bo_unreference(brw->vs.bind_bo);
+   dri_bo_unreference(brw->gs.prog_bo);
+   dri_bo_unreference(brw->gs.state_bo);
+   dri_bo_unreference(brw->clip.prog_bo);
+   dri_bo_unreference(brw->clip.state_bo);
+   dri_bo_unreference(brw->clip.vp_bo);
+   dri_bo_unreference(brw->sf.prog_bo);
+   dri_bo_unreference(brw->sf.state_bo);
+   dri_bo_unreference(brw->sf.vp_bo);
+   for (i = 0; i < BRW_MAX_TEX_UNIT; i++)
+      dri_bo_unreference(brw->wm.sdc_bo[i]);
+   dri_bo_unreference(brw->wm.bind_bo);
+   for (i = 0; i < BRW_WM_MAX_SURF; i++)
+      dri_bo_unreference(brw->wm.surf_bo[i]);
+   dri_bo_unreference(brw->wm.sampler_bo);
+   dri_bo_unreference(brw->wm.prog_bo);
+   dri_bo_unreference(brw->wm.state_bo);
+   dri_bo_unreference(brw->cc.prog_bo);
+   dri_bo_unreference(brw->cc.state_bo);
+   dri_bo_unreference(brw->cc.vp_bo);
+}
diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h
index fa3e32c7ff..009e28b227 100644
--- a/src/gallium/drivers/i965/brw_context.h
+++ b/src/gallium/drivers/i965/brw_context.h
@@ -115,7 +115,6 @@
  * Handles blending and (presumably) depth and stencil testing.
  */
 
-#define BRW_FALLBACK_TEXTURE		 0x1
 #define BRW_MAX_CURBE                    (32*16)
 
 struct brw_context;
@@ -450,11 +449,9 @@ struct brw_query_object {
  */
 struct brw_context 
 {
-   struct intel_context intel;  /**< base class, must be first field */
    GLuint primitive;
 
    GLboolean emit_state_always;
-   GLboolean tmp_fallback;
    GLboolean no_batch_wrap;
 
    struct {
@@ -692,7 +689,7 @@ GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
 /*======================================================================
  * brw_queryobj.c
  */
-void brw_init_queryobj_functions(struct dd_function_table *functions);
+void brw_init_query(struct brw_context *brw);
 void brw_prepare_query_begin(struct brw_context *brw);
 void brw_emit_query_begin(struct brw_context *brw);
 void brw_emit_query_end(struct brw_context *brw);
@@ -730,7 +727,7 @@ int brw_disasm (FILE *file, struct brw_instruction *inst);
  * macros used previously:
  */
 static INLINE struct brw_context *
-brw_context( GLcontext *ctx )
+brw_context( struct pipe_context *ctx )
 {
    return (struct brw_context *)ctx;
 }
diff --git a/src/gallium/drivers/i965/brw_curbe.c b/src/gallium/drivers/i965/brw_curbe.c
index 4be6c77aa1..3e32c4983d 100644
--- a/src/gallium/drivers/i965/brw_curbe.c
+++ b/src/gallium/drivers/i965/brw_curbe.c
@@ -30,14 +30,6 @@
   */
 
 
-
-#include "main/glheader.h"
-#include "main/context.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
-#include "shader/prog_statevars.h"
 #include "intel_batchbuffer.h"
 #include "intel_regions.h"
 #include "brw_context.h"
@@ -64,31 +56,17 @@ static void calculate_curbe_offsets( struct brw_context *brw )
    GLuint nr_clip_regs = 0;
    GLuint total_regs;
 
-   /* _NEW_TRANSFORM */
-   if (ctx->Transform.ClipPlanesEnabled) {
-      GLuint nr_planes = 6 + brw_count_bits(ctx->Transform.ClipPlanesEnabled);
+   /* PIPE_NEW_UCP */
+   if (brw->nr_ucp) {
+      GLuint nr_planes = 6 + brw->nr_ucp;
       nr_clip_regs = (nr_planes * 4 + 15) / 16;
    }
 
 
    total_regs = nr_fp_regs + nr_vp_regs + nr_clip_regs;
 
-   /* This can happen - what to do?  Probably rather than falling
-    * back, the best thing to do is emit programs which code the
-    * constants as immediate values.  Could do this either as a static
-    * cap on WM and VS, or adaptively.
-    *
-    * Unfortunately, this is currently dependent on the results of the
-    * program generation process (in the case of wm), so this would
-    * introduce the need to re-generate programs in the event of a
-    * curbe allocation failure.
-    */
-   /* Max size is 32 - just large enough to
-    * hold the 128 parameters allowed by
-    * the fragment and vertex program
-    * api's.  It's not clear what happens
-    * when both VP and FP want to use 128
-    * parameters, though. 
+   /* When total_regs exceeds 32, we want to use a true constant buffer
+    * to hold the extra constants.
     */
    assert(total_regs <= 32);
 
@@ -113,8 +91,8 @@ static void calculate_curbe_offsets( struct brw_context *brw )
       brw->curbe.vs_size = nr_vp_regs; reg += nr_vp_regs;
       brw->curbe.total_size = reg;
 
-      if (0)
-	 _mesa_printf("curbe wm %d+%d clip %d+%d vs %d+%d\n",
+      if (BRW_DEBUG & DEBUG_CURBE)
+	 debug_printf("curbe wm %d+%d clip %d+%d vs %d+%d\n",
 		      brw->curbe.wm_start,
 		      brw->curbe.wm_size,
 		      brw->curbe.clip_start,
@@ -129,7 +107,7 @@ static void calculate_curbe_offsets( struct brw_context *brw )
 
 const struct brw_tracked_state brw_curbe_offsets = {
    .dirty = {
-      .mesa = _NEW_TRANSFORM,
+      .mesa = PIPE_NEW_UCP,
       .brw  = BRW_NEW_VERTEX_PROGRAM,
       .cache = CACHE_NEW_WM_PROG
    },
@@ -204,11 +182,13 @@ static void prepare_constant_buffer(struct brw_context *brw)
    if (brw->curbe.wm_size) {
       GLuint offset = brw->curbe.wm_start * 16;
 
-      _mesa_load_state_parameters(ctx, fp->program.Base.Parameters); 
+      /* TODO: map fs constant buffer */
 
       /* copy float constants */
       for (i = 0; i < brw->wm.prog_data->nr_params; i++) 
 	 buf[offset + i] = *brw->wm.prog_data->param[i];
+
+      /* TODO: unmap fs constant buffer */
    }
 
 
@@ -228,18 +208,15 @@ static void prepare_constant_buffer(struct brw_context *brw)
 	 buf[offset + i * 4 + 3] = fixed_plane[i][3];
       }
 
-      /* Clip planes: _NEW_TRANSFORM plus _NEW_PROJECTION to get to
-       * clip-space:
+      /* Clip planes:
        */
-      assert(MAX_CLIP_PLANES == 6);
-      for (j = 0; j < MAX_CLIP_PLANES; j++) {
-	 if (ctx->Transform.ClipPlanesEnabled & (1<<j)) {
-	    buf[offset + i * 4 + 0] = ctx->Transform._ClipUserPlane[j][0];
-	    buf[offset + i * 4 + 1] = ctx->Transform._ClipUserPlane[j][1];
-	    buf[offset + i * 4 + 2] = ctx->Transform._ClipUserPlane[j][2];
-	    buf[offset + i * 4 + 3] = ctx->Transform._ClipUserPlane[j][3];
-	    i++;
-	 }
+      assert(brw->nr_ucp <= 6);
+      for (j = 0; j < brw->nr_ucp; j++) {
+	 buf[offset + i * 4 + 0] = brw->ucp[j][0];
+	 buf[offset + i * 4 + 1] = brw->ucp[j][1];
+	 buf[offset + i * 4 + 2] = brw->ucp[j][2];
+	 buf[offset + i * 4 + 3] = brw->ucp[j][3];
+	 i++;
       }
    }
 
@@ -248,13 +225,7 @@ static void prepare_constant_buffer(struct brw_context *brw)
       GLuint offset = brw->curbe.vs_start * 16;
       GLuint nr = brw->vs.prog_data->nr_params / 4;
 
-      if (brw->vertex_program->IsNVProgram)
-	 _mesa_load_tracked_matrices(ctx);
-
-      /* Updates the ParamaterValues[i] pointers for all parameters of the
-       * basic type of PROGRAM_STATE_VAR.
-       */
-      _mesa_load_state_parameters(ctx, vp->program.Base.Parameters); 
+      /* TODO: map vs constant buffer */
 
       /* XXX just use a memcpy here */
       for (i = 0; i < nr; i++) {
@@ -264,14 +235,16 @@ static void prepare_constant_buffer(struct brw_context *brw)
 	 buf[offset + i * 4 + 2] = value[2];
 	 buf[offset + i * 4 + 3] = value[3];
       }
+
+      /* TODO: unmap vs constant buffer */
    }
 
    if (0) {
       for (i = 0; i < sz*16; i+=4) 
-	 _mesa_printf("curbe %d.%d: %f %f %f %f\n", i/8, i&4,
+	 debug_printf("curbe %d.%d: %f %f %f %f\n", i/8, i&4,
 		      buf[i+0], buf[i+1], buf[i+2], buf[i+3]);
 
-      _mesa_printf("last_buf %p buf %p sz %d/%d cmp %d\n",
+      debug_printf("last_buf %p buf %p sz %d/%d cmp %d\n",
 		   brw->curbe.last_buf, buf,
 		   bufsz, brw->curbe.last_bufsz,
 		   brw->curbe.last_buf ? memcmp(buf, brw->curbe.last_buf, bufsz) : -1);
@@ -282,12 +255,12 @@ static void prepare_constant_buffer(struct brw_context *brw)
        bufsz == brw->curbe.last_bufsz &&
        memcmp(buf, brw->curbe.last_buf, bufsz) == 0) {
       /* constants have not changed */
-      _mesa_free(buf);
+      FREE(buf);
    } 
    else {
       /* constants have changed */
       if (brw->curbe.last_buf)
-	 _mesa_free(brw->curbe.last_buf);
+	 FREE(brw->curbe.last_buf);
 
       brw->curbe.last_buf = buf;
       brw->curbe.last_bufsz = bufsz;
@@ -353,15 +326,11 @@ static void emit_constant_buffer(struct brw_context *brw)
    ADVANCE_BATCH();
 }
 
-/* This tracked state is unique in that the state it monitors varies
- * dynamically depending on the parameters tracked by the fragment and
- * vertex programs.  This is the template used as a starting point,
- * each context will maintain a copy of this internally and update as
- * required.
- */
 const struct brw_tracked_state brw_constant_buffer = {
    .dirty = {
-      .mesa = _NEW_PROGRAM_CONSTANTS,
+      .mesa = (PIPE_NEW_FS_CONSTANTS |
+	       PIPE_NEW_VS_CONSTANTS |
+	       PIPE_NEW_UCP),
       .brw  = (BRW_NEW_FRAGMENT_PROGRAM |
 	       BRW_NEW_VERTEX_PROGRAM |
 	       BRW_NEW_URB_FENCE | /* Implicit - hardware requires this, not used above */
diff --git a/src/gallium/drivers/i965/brw_defines.h b/src/gallium/drivers/i965/brw_defines.h
index 78d457ad2b..282c5b18f4 100644
--- a/src/gallium/drivers/i965/brw_defines.h
+++ b/src/gallium/drivers/i965/brw_defines.h
@@ -840,8 +840,8 @@
 
 #include "intel_chipset.h"
 
-#define BRW_IS_G4X(brw)         (IS_G4X((brw)->intel.intelScreen->deviceID))
-#define BRW_IS_IGDNG(brw)         (IS_IGDNG((brw)->intel.intelScreen->deviceID))
+#define BRW_IS_G4X(brw)         (IS_G4X((brw)->brw_screen->deviceID))
+#define BRW_IS_IGDNG(brw)         (IS_IGDNG((brw)->brw_screen->deviceID))
 #define BRW_IS_965(brw)         (!(BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)))
 #define CMD_PIPELINE_SELECT(brw)        ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_PIPELINE_SELECT_GM45 : CMD_PIPELINE_SELECT_965)
 #define CMD_VF_STATISTICS(brw)          ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_VF_STATISTICS_GM45 : CMD_VF_STATISTICS_965)
diff --git a/src/gallium/drivers/i965/brw_disasm.c b/src/gallium/drivers/i965/brw_disasm.c
index 9fef230507..a84c581c03 100644
--- a/src/gallium/drivers/i965/brw_disasm.c
+++ b/src/gallium/drivers/i965/brw_disasm.c
@@ -27,8 +27,6 @@
 #include <unistd.h>
 #include <stdarg.h>
 
-#include "main/mtypes.h"
-
 #include "brw_context.h"
 #include "brw_defines.h"
 
diff --git a/src/gallium/drivers/i965/brw_draw.c b/src/gallium/drivers/i965/brw_draw.c
index 44bb7bd588..8cd117c24f 100644
--- a/src/gallium/drivers/i965/brw_draw.c
+++ b/src/gallium/drivers/i965/brw_draw.c
@@ -39,14 +39,13 @@
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_state.h"
-#include "brw_fallback.h"
 
 #include "intel_batchbuffer.h"
 #include "intel_buffer_objects.h"
 
 #define FILE_DEBUG_FLAG DEBUG_BATCH
 
-static GLuint prim_to_hw_prim[GL_POLYGON+1] = {
+static uint32_t prim_to_hw_prim[PIPE_PRIM_POLYGON+1] = {
    _3DPRIM_POINTLIST,
    _3DPRIM_LINELIST,
    _3DPRIM_LINELOOP,
@@ -60,19 +59,6 @@ static GLuint prim_to_hw_prim[GL_POLYGON+1] = {
 };
 
 
-static const GLenum reduced_prim[GL_POLYGON+1] = {  
-   GL_POINTS,
-   GL_LINES,
-   GL_LINES,
-   GL_LINES,
-   GL_TRIANGLES,
-   GL_TRIANGLES,
-   GL_TRIANGLES,
-   GL_TRIANGLES,
-   GL_TRIANGLES,
-   GL_TRIANGLES
-};
-
 
 /* When the primitive changes, set a state bit and re-validate.  Not
  * the nicest and would rather deal with this by having all the
@@ -196,102 +182,6 @@ static void brw_merge_inputs( struct brw_context *brw,
       brw->state.dirty.brw |= BRW_NEW_INPUT_DIMENSIONS;
 }
 
-/* XXX: could split the primitive list to fallback only on the
- * non-conformant primitives.
- */
-static GLboolean check_fallbacks( struct brw_context *brw,
-				  const struct _mesa_prim *prim,
-				  GLuint nr_prims )
-{
-   GLcontext *ctx = &brw->intel.ctx;
-   GLuint i;
-
-   /* If we don't require strict OpenGL conformance, never 
-    * use fallbacks.  If we're forcing fallbacks, always
-    * use fallfacks.
-    */
-   if (brw->intel.conformance_mode == 0)
-      return GL_FALSE;
-
-   if (brw->intel.conformance_mode == 2)
-      return GL_TRUE;
-
-   if (ctx->Polygon.SmoothFlag) {
-      for (i = 0; i < nr_prims; i++)
-	 if (reduced_prim[prim[i].mode] == GL_TRIANGLES) 
-	    return GL_TRUE;
-   }
-
-   /* BRW hardware will do AA lines, but they are non-conformant it
-    * seems.  TBD whether we keep this fallback:
-    */
-   if (ctx->Line.SmoothFlag) {
-      for (i = 0; i < nr_prims; i++)
-	 if (reduced_prim[prim[i].mode] == GL_LINES) 
-	    return GL_TRUE;
-   }
-
-   /* Stipple -- these fallbacks could be resolved with a little
-    * bit of work?
-    */
-   if (ctx->Line.StippleFlag) {
-      for (i = 0; i < nr_prims; i++) {
-	 /* GS doesn't get enough information to know when to reset
-	  * the stipple counter?!?
-	  */
-	 if (prim[i].mode == GL_LINE_LOOP || prim[i].mode == GL_LINE_STRIP) 
-	    return GL_TRUE;
-	    
-	 if (prim[i].mode == GL_POLYGON &&
-	     (ctx->Polygon.FrontMode == GL_LINE ||
-	      ctx->Polygon.BackMode == GL_LINE))
-	    return GL_TRUE;
-      }
-   }
-
-   if (ctx->Point.SmoothFlag) {
-      for (i = 0; i < nr_prims; i++)
-	 if (prim[i].mode == GL_POINTS) 
-	    return GL_TRUE;
-   }
-
-   /* BRW hardware doesn't handle GL_CLAMP texturing correctly;
-    * brw_wm_sampler_state:translate_wrap_mode() treats GL_CLAMP
-    * as GL_CLAMP_TO_EDGE instead.  If we're using GL_CLAMP, and
-    * we want strict conformance, force the fallback.
-    * Right now, we only do this for 2D textures.
-    */
-   {
-      int u;
-      for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
-         struct gl_texture_unit *texUnit = &ctx->Texture.Unit[u];
-         if (texUnit->Enabled) {
-            if (texUnit->Enabled & TEXTURE_1D_BIT) {
-               if (texUnit->CurrentTex[TEXTURE_1D_INDEX]->WrapS == GL_CLAMP) {
-                   return GL_TRUE;
-               }
-            }
-            if (texUnit->Enabled & TEXTURE_2D_BIT) {
-               if (texUnit->CurrentTex[TEXTURE_2D_INDEX]->WrapS == GL_CLAMP ||
-                   texUnit->CurrentTex[TEXTURE_2D_INDEX]->WrapT == GL_CLAMP) {
-                   return GL_TRUE;
-               }
-            }
-            if (texUnit->Enabled & TEXTURE_3D_BIT) {
-               if (texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapS == GL_CLAMP ||
-                   texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapT == GL_CLAMP ||
-                   texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapR == GL_CLAMP) {
-                   return GL_TRUE;
-               }
-            }
-         }
-      }
-   }
-      
-   /* Nothing stopping us from the fast path now */
-   return GL_FALSE;
-}
-
 /* May fail if out of video memory for texture or vbo upload, or on
  * fallback conditions.
  */
@@ -308,23 +198,12 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
    GLboolean retval = GL_FALSE;
    GLboolean warn = GL_FALSE;
    GLboolean first_time = GL_TRUE;
+   uint32_t hw_prim;
    GLuint i;
 
    if (ctx->NewState)
       _mesa_update_state( ctx );
 
-   /* We have to validate the textures *before* checking for fallbacks;
-    * otherwise, the software fallback won't be able to rely on the
-    * texture state, the firstLevel and lastLevel fields won't be
-    * set in the intel texture object (they'll both be 0), and the 
-    * software fallback will segfault if it attempts to access any
-    * texture level other than level 0.
-    */
-   brw_validate_textures( brw );
-
-   if (check_fallbacks(brw, prim, nr_prims))
-      return GL_FALSE;
-
    /* Bind all inputs, derive varying and size information:
     */
    brw_merge_inputs( brw, arrays );
@@ -336,90 +215,30 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
    brw->vb.max_index = max_index;
    brw->state.dirty.brw |= BRW_NEW_VERTICES;
 
-   /* Have to validate state quite late.  Will rebuild tnl_program,
-    * which depends on varying information.  
-    * 
-    * Note this is where brw->vs->prog_data.inputs_read is calculated,
-    * so can't access it earlier.
-    */
-
-   LOCK_HARDWARE(intel);
-
-   if (!intel->constant_cliprect && intel->driDrawable->numClipRects == 0) {
-      UNLOCK_HARDWARE(intel);
-      return GL_TRUE;
-   }
-
-   for (i = 0; i < nr_prims; i++) {
-      uint32_t hw_prim;
-
-      /* Flush the batch if it's approaching full, so that we don't wrap while
-       * we've got validated state that needs to be in the same batch as the
-       * primitives.  This fraction is just a guess (minimal full state plus
-       * a primitive is around 512 bytes), and would be better if we had
-       * an upper bound of how much we might emit in a single
-       * brw_try_draw_prims().
-       */
-      intel_batchbuffer_require_space(intel->batch, intel->batch->size / 4,
-				      LOOP_CLIPRECTS);
-
-      hw_prim = brw_set_prim(brw, prim[i].mode);
-
-      if (first_time || (brw->state.dirty.brw & BRW_NEW_PRIMITIVE)) {
-	 first_time = GL_FALSE;
-
-	 brw_validate_state(brw);
-
-	 /* Various fallback checks:  */
-	 if (brw->intel.Fallback)
-	    goto out;
-
-	 /* Check that we can fit our state in with our existing batchbuffer, or
-	  * flush otherwise.
-	  */
-	 if (dri_bufmgr_check_aperture_space(brw->state.validated_bos,
-					     brw->state.validated_bo_count)) {
-	    static GLboolean warned;
-	    intel_batchbuffer_flush(intel->batch);
-
-	    /* Validate the state after we flushed the batch (which would have
-	     * changed the set of dirty state).  If we still fail to
-	     * check_aperture, warn of what's happening, but attempt to continue
-	     * on since it may succeed anyway, and the user would probably rather
-	     * see a failure and a warning than a fallback.
-	     */
-	    brw_validate_state(brw);
-	    if (!warned &&
-		dri_bufmgr_check_aperture_space(brw->state.validated_bos,
-						brw->state.validated_bo_count)) {
-	       warn = GL_TRUE;
-	       warned = GL_TRUE;
-	    }
-	 }
-
-	 brw_upload_state(brw);
-      }
+   hw_prim = brw_set_prim(brw, prim[i].mode);
 
-      brw_emit_prim(brw, &prim[i], hw_prim);
+   brw_validate_state(brw);
 
-      retval = GL_TRUE;
-   }
+   /* Check that we can fit our state in with our existing batchbuffer; if
+    * not, return the error so that the caller can flush and retry.
+    */
+   ret = dri_bufmgr_check_aperture_space(brw->state.validated_bos,
+					 brw->state.validated_bo_count);
+   if (ret)
+      return ret;
+
+   ret = brw_upload_state(brw);
+   if (ret)
+      return ret;
+   
+   ret = brw_emit_prim(brw, &prim[i], hw_prim);
+   if (ret)
+      return ret;
 
    if (intel->always_flush_batch)
       intel_batchbuffer_flush(intel->batch);
- out:
-   UNLOCK_HARDWARE(intel);
-
-   brw_state_cache_check_size(brw);
-
-   if (warn)
-      fprintf(stderr, "i965: Single primitive emit potentially exceeded "
-	      "available aperture space\n");
 
-   if (!retval)
-      DBG("%s failed\n", __FUNCTION__);
-
-   return retval;
+   return 0;
 }
 
 void brw_draw_prims( GLcontext *ctx,
@@ -431,37 +250,26 @@ void brw_draw_prims( GLcontext *ctx,
 		     GLuint min_index,
 		     GLuint max_index )
 {
-   GLboolean retval;
+   enum pipe_error ret;
 
    if (!vbo_all_varyings_in_vbos(arrays)) {
       if (!index_bounds_valid)
 	 vbo_get_minmax_index(ctx, prim, ib, &min_index, &max_index);
-
-      /* Decide if we want to rebase.  If so we end up recursing once
-       * only into this function.
-       */
-      if (min_index != 0) {
-	 vbo_rebase_prims(ctx, arrays,
-			  prim, nr_prims,
-			  ib, min_index, max_index,
-			  brw_draw_prims );
-	 return;
-      }
    }
 
    /* Make a first attempt at drawing:
     */
-   retval = brw_try_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
+   ret = brw_try_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
 
    /* Otherwise, we really are out of memory.  Pass the drawing
     * command to the software tnl module and which will in turn call
     * swrast to do the drawing.
     */
-   if (!retval) {
-       _swsetup_Wakeup(ctx);
-      _tnl_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
+   if (ret != 0) {
+      intel_batchbuffer_flush(intel->batch);
+      ret = brw_try_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
+      assert(ret == 0);
    }
-
 }
 
 void brw_draw_init( struct brw_context *brw )
diff --git a/src/gallium/drivers/i965/brw_draw_upload.c b/src/gallium/drivers/i965/brw_draw_upload.c
index a3ff6c58d8..ad3ef6b7dd 100644
--- a/src/gallium/drivers/i965/brw_draw_upload.c
+++ b/src/gallium/drivers/i965/brw_draw_upload.c
@@ -25,13 +25,9 @@
  * 
  **************************************************************************/
 
+#include "pipe/p_context.h"
 
-#include "main/glheader.h"
-#include "main/bufferobj.h"
-#include "main/context.h"
-#include "main/state.h"
-#include "main/api_validate.h"
-#include "main/enums.h"
+#include "util/u_upload_mgr.h"
 
 #include "brw_draw.h"
 #include "brw_defines.h"
@@ -43,303 +39,157 @@
 #include "intel_buffer_objects.h"
 #include "intel_tex.h"
 
-static GLuint double_types[5] = {
-   0,
-   BRW_SURFACEFORMAT_R64_FLOAT,
-   BRW_SURFACEFORMAT_R64G64_FLOAT,
-   BRW_SURFACEFORMAT_R64G64B64_FLOAT,
-   BRW_SURFACEFORMAT_R64G64B64A64_FLOAT
-};
-
-static GLuint float_types[5] = {
-   0,
-   BRW_SURFACEFORMAT_R32_FLOAT,
-   BRW_SURFACEFORMAT_R32G32_FLOAT,
-   BRW_SURFACEFORMAT_R32G32B32_FLOAT,
-   BRW_SURFACEFORMAT_R32G32B32A32_FLOAT
-};
-
-static GLuint uint_types_norm[5] = {
-   0,
-   BRW_SURFACEFORMAT_R32_UNORM,
-   BRW_SURFACEFORMAT_R32G32_UNORM,
-   BRW_SURFACEFORMAT_R32G32B32_UNORM,
-   BRW_SURFACEFORMAT_R32G32B32A32_UNORM
-};
-
-static GLuint uint_types_scale[5] = {
-   0,
-   BRW_SURFACEFORMAT_R32_USCALED,
-   BRW_SURFACEFORMAT_R32G32_USCALED,
-   BRW_SURFACEFORMAT_R32G32B32_USCALED,
-   BRW_SURFACEFORMAT_R32G32B32A32_USCALED
-};
-
-static GLuint int_types_norm[5] = {
-   0,
-   BRW_SURFACEFORMAT_R32_SNORM,
-   BRW_SURFACEFORMAT_R32G32_SNORM,
-   BRW_SURFACEFORMAT_R32G32B32_SNORM,
-   BRW_SURFACEFORMAT_R32G32B32A32_SNORM
-};
-
-static GLuint int_types_scale[5] = {
-   0,
-   BRW_SURFACEFORMAT_R32_SSCALED,
-   BRW_SURFACEFORMAT_R32G32_SSCALED,
-   BRW_SURFACEFORMAT_R32G32B32_SSCALED,
-   BRW_SURFACEFORMAT_R32G32B32A32_SSCALED
-};
-
-static GLuint ushort_types_norm[5] = {
-   0,
-   BRW_SURFACEFORMAT_R16_UNORM,
-   BRW_SURFACEFORMAT_R16G16_UNORM,
-   BRW_SURFACEFORMAT_R16G16B16_UNORM,
-   BRW_SURFACEFORMAT_R16G16B16A16_UNORM
-};
-
-static GLuint ushort_types_scale[5] = {
-   0,
-   BRW_SURFACEFORMAT_R16_USCALED,
-   BRW_SURFACEFORMAT_R16G16_USCALED,
-   BRW_SURFACEFORMAT_R16G16B16_USCALED,
-   BRW_SURFACEFORMAT_R16G16B16A16_USCALED
-};
-
-static GLuint short_types_norm[5] = {
-   0,
-   BRW_SURFACEFORMAT_R16_SNORM,
-   BRW_SURFACEFORMAT_R16G16_SNORM,
-   BRW_SURFACEFORMAT_R16G16B16_SNORM,
-   BRW_SURFACEFORMAT_R16G16B16A16_SNORM
-};
-
-static GLuint short_types_scale[5] = {
-   0,
-   BRW_SURFACEFORMAT_R16_SSCALED,
-   BRW_SURFACEFORMAT_R16G16_SSCALED,
-   BRW_SURFACEFORMAT_R16G16B16_SSCALED,
-   BRW_SURFACEFORMAT_R16G16B16A16_SSCALED
-};
 
-static GLuint ubyte_types_norm[5] = {
-   0,
-   BRW_SURFACEFORMAT_R8_UNORM,
-   BRW_SURFACEFORMAT_R8G8_UNORM,
-   BRW_SURFACEFORMAT_R8G8B8_UNORM,
-   BRW_SURFACEFORMAT_R8G8B8A8_UNORM
-};
 
-static GLuint ubyte_types_scale[5] = {
-   0,
-   BRW_SURFACEFORMAT_R8_USCALED,
-   BRW_SURFACEFORMAT_R8G8_USCALED,
-   BRW_SURFACEFORMAT_R8G8B8_USCALED,
-   BRW_SURFACEFORMAT_R8G8B8A8_USCALED
-};
-
-static GLuint byte_types_norm[5] = {
-   0,
-   BRW_SURFACEFORMAT_R8_SNORM,
-   BRW_SURFACEFORMAT_R8G8_SNORM,
-   BRW_SURFACEFORMAT_R8G8B8_SNORM,
-   BRW_SURFACEFORMAT_R8G8B8A8_SNORM
-};
 
-static GLuint byte_types_scale[5] = {
-   0,
-   BRW_SURFACEFORMAT_R8_SSCALED,
-   BRW_SURFACEFORMAT_R8G8_SSCALED,
-   BRW_SURFACEFORMAT_R8G8B8_SSCALED,
-   BRW_SURFACEFORMAT_R8G8B8A8_SSCALED
-};
-
-
-/**
- * Given vertex array type/size/format/normalized info, return
- * the appopriate hardware surface type.
- * Format will be GL_RGBA or possibly GL_BGRA for GLubyte[4] color arrays.
- */
-static GLuint get_surface_type( GLenum type, GLuint size,
-                                GLenum format, GLboolean normalized )
+unsigned brw_translate_surface_format( unsigned id )
 {
-   if (INTEL_DEBUG & DEBUG_VERTS)
-      _mesa_printf("type %s size %d normalized %d\n", 
-		   _mesa_lookup_enum_by_nr(type), size, normalized);
-
-   if (normalized) {
-      switch (type) {
-      case GL_DOUBLE: return double_types[size];
-      case GL_FLOAT: return float_types[size];
-      case GL_INT: return int_types_norm[size];
-      case GL_SHORT: return short_types_norm[size];
-      case GL_BYTE: return byte_types_norm[size];
-      case GL_UNSIGNED_INT: return uint_types_norm[size];
-      case GL_UNSIGNED_SHORT: return ushort_types_norm[size];
-      case GL_UNSIGNED_BYTE:
-         if (format == GL_BGRA) {
-            /* See GL_EXT_vertex_array_bgra */
-            assert(size == 4);
-            return BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
-         }
-         else {
-            return ubyte_types_norm[size];
-         }
-      default: assert(0); return 0;
-      }      
-   }
-   else {
-      assert(format == GL_RGBA); /* sanity check */
-      switch (type) {
-      case GL_DOUBLE: return double_types[size];
-      case GL_FLOAT: return float_types[size];
-      case GL_INT: return int_types_scale[size];
-      case GL_SHORT: return short_types_scale[size];
-      case GL_BYTE: return byte_types_scale[size];
-      case GL_UNSIGNED_INT: return uint_types_scale[size];
-      case GL_UNSIGNED_SHORT: return ushort_types_scale[size];
-      case GL_UNSIGNED_BYTE: return ubyte_types_scale[size];
-      default: assert(0); return 0;
-      }      
+   switch (id) {
+   case PIPE_FORMAT_R64_FLOAT:
+      return BRW_SURFACEFORMAT_R64_FLOAT;
+   case PIPE_FORMAT_R64G64_FLOAT:
+      return BRW_SURFACEFORMAT_R64G64_FLOAT;
+   case PIPE_FORMAT_R64G64B64_FLOAT:
+      return BRW_SURFACEFORMAT_R64G64B64_FLOAT;
+   case PIPE_FORMAT_R64G64B64A64_FLOAT:
+      return BRW_SURFACEFORMAT_R64G64B64A64_FLOAT;
+
+   case PIPE_FORMAT_R32_FLOAT:
+      return BRW_SURFACEFORMAT_R32_FLOAT;
+   case PIPE_FORMAT_R32G32_FLOAT:
+      return BRW_SURFACEFORMAT_R32G32_FLOAT;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      return BRW_SURFACEFORMAT_R32G32B32_FLOAT;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      return BRW_SURFACEFORMAT_R32G32B32A32_FLOAT;
+
+   case PIPE_FORMAT_R32_UNORM:
+      return BRW_SURFACEFORMAT_R32_UNORM;
+   case PIPE_FORMAT_R32G32_UNORM:
+      return BRW_SURFACEFORMAT_R32G32_UNORM;
+   case PIPE_FORMAT_R32G32B32_UNORM:
+      return BRW_SURFACEFORMAT_R32G32B32_UNORM;
+   case PIPE_FORMAT_R32G32B32A32_UNORM:
+      return BRW_SURFACEFORMAT_R32G32B32A32_UNORM;
+
+   case PIPE_FORMAT_R32_USCALED:
+      return BRW_SURFACEFORMAT_R32_USCALED;
+   case PIPE_FORMAT_R32G32_USCALED:
+      return BRW_SURFACEFORMAT_R32G32_USCALED;
+   case PIPE_FORMAT_R32G32B32_USCALED:
+      return BRW_SURFACEFORMAT_R32G32B32_USCALED;
+   case PIPE_FORMAT_R32G32B32A32_USCALED:
+      return BRW_SURFACEFORMAT_R32G32B32A32_USCALED;
+
+   case PIPE_FORMAT_R32_SNORM:
+      return BRW_SURFACEFORMAT_R32_SNORM;
+   case PIPE_FORMAT_R32G32_SNORM:
+      return BRW_SURFACEFORMAT_R32G32_SNORM;
+   case PIPE_FORMAT_R32G32B32_SNORM:
+      return BRW_SURFACEFORMAT_R32G32B32_SNORM;
+   case PIPE_FORMAT_R32G32B32A32_SNORM:
+      return BRW_SURFACEFORMAT_R32G32B32A32_SNORM;
+
+   case PIPE_FORMAT_R32_SSCALED:
+      return BRW_SURFACEFORMAT_R32_SSCALED;
+   case PIPE_FORMAT_R32G32_SSCALED:
+      return BRW_SURFACEFORMAT_R32G32_SSCALED;
+   case PIPE_FORMAT_R32G32B32_SSCALED:
+      return BRW_SURFACEFORMAT_R32G32B32_SSCALED;
+   case PIPE_FORMAT_R32G32B32A32_SSCALED:
+      return BRW_SURFACEFORMAT_R32G32B32A32_SSCALED;
+
+   case PIPE_FORMAT_R16_UNORM:
+      return BRW_SURFACEFORMAT_R16_UNORM;
+   case PIPE_FORMAT_R16G16_UNORM:
+      return BRW_SURFACEFORMAT_R16G16_UNORM;
+   case PIPE_FORMAT_R16G16B16_UNORM:
+      return BRW_SURFACEFORMAT_R16G16B16_UNORM;
+   case PIPE_FORMAT_R16G16B16A16_UNORM:
+      return BRW_SURFACEFORMAT_R16G16B16A16_UNORM;
+
+   case PIPE_FORMAT_R16_USCALED:
+      return BRW_SURFACEFORMAT_R16_USCALED;
+   case PIPE_FORMAT_R16G16_USCALED:
+      return BRW_SURFACEFORMAT_R16G16_USCALED;
+   case PIPE_FORMAT_R16G16B16_USCALED:
+      return BRW_SURFACEFORMAT_R16G16B16_USCALED;
+   case PIPE_FORMAT_R16G16B16A16_USCALED:
+      return BRW_SURFACEFORMAT_R16G16B16A16_USCALED;
+
+   case PIPE_FORMAT_R16_SNORM:
+      return BRW_SURFACEFORMAT_R16_SNORM;
+   case PIPE_FORMAT_R16G16_SNORM:
+      return BRW_SURFACEFORMAT_R16G16_SNORM;
+   case PIPE_FORMAT_R16G16B16_SNORM:
+      return BRW_SURFACEFORMAT_R16G16B16_SNORM;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      return BRW_SURFACEFORMAT_R16G16B16A16_SNORM;
+
+   case PIPE_FORMAT_R16_SSCALED:
+      return BRW_SURFACEFORMAT_R16_SSCALED;
+   case PIPE_FORMAT_R16G16_SSCALED:
+      return BRW_SURFACEFORMAT_R16G16_SSCALED;
+   case PIPE_FORMAT_R16G16B16_SSCALED:
+      return BRW_SURFACEFORMAT_R16G16B16_SSCALED;
+   case PIPE_FORMAT_R16G16B16A16_SSCALED:
+      return BRW_SURFACEFORMAT_R16G16B16A16_SSCALED;
+
+   case PIPE_FORMAT_R8_UNORM:
+      return BRW_SURFACEFORMAT_R8_UNORM;
+   case PIPE_FORMAT_R8G8_UNORM:
+      return BRW_SURFACEFORMAT_R8G8_UNORM;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      return BRW_SURFACEFORMAT_R8G8B8_UNORM;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      return BRW_SURFACEFORMAT_R8G8B8A8_UNORM;
+
+   case PIPE_FORMAT_R8_USCALED:
+      return BRW_SURFACEFORMAT_R8_USCALED;
+   case PIPE_FORMAT_R8G8_USCALED:
+      return BRW_SURFACEFORMAT_R8G8_USCALED;
+   case PIPE_FORMAT_R8G8B8_USCALED:
+      return BRW_SURFACEFORMAT_R8G8B8_USCALED;
+   case PIPE_FORMAT_R8G8B8A8_USCALED:
+      return BRW_SURFACEFORMAT_R8G8B8A8_USCALED;
+
+   case PIPE_FORMAT_R8_SNORM:
+      return BRW_SURFACEFORMAT_R8_SNORM;
+   case PIPE_FORMAT_R8G8_SNORM:
+      return BRW_SURFACEFORMAT_R8G8_SNORM;
+   case PIPE_FORMAT_R8G8B8_SNORM:
+      return BRW_SURFACEFORMAT_R8G8B8_SNORM;
+   case PIPE_FORMAT_R8G8B8A8_SNORM:
+      return BRW_SURFACEFORMAT_R8G8B8A8_SNORM;
+
+   case PIPE_FORMAT_R8_SSCALED:
+      return BRW_SURFACEFORMAT_R8_SSCALED;
+   case PIPE_FORMAT_R8G8_SSCALED:
+      return BRW_SURFACEFORMAT_R8G8_SSCALED;
+   case PIPE_FORMAT_R8G8B8_SSCALED:
+      return BRW_SURFACEFORMAT_R8G8B8_SSCALED;
+   case PIPE_FORMAT_R8G8B8A8_SSCALED:
+      return BRW_SURFACEFORMAT_R8G8B8A8_SSCALED;
+
+   default:
+      assert(0);
+      return 0;
    }
 }
 
-
-static GLuint get_size( GLenum type )
-{
-   switch (type) {
-   case GL_DOUBLE: return sizeof(GLdouble);
-   case GL_FLOAT: return sizeof(GLfloat);
-   case GL_INT: return sizeof(GLint);
-   case GL_SHORT: return sizeof(GLshort);
-   case GL_BYTE: return sizeof(GLbyte);
-   case GL_UNSIGNED_INT: return sizeof(GLuint);
-   case GL_UNSIGNED_SHORT: return sizeof(GLushort);
-   case GL_UNSIGNED_BYTE: return sizeof(GLubyte);
-   default: return 0;
-   }      
-}
-
-static GLuint get_index_type(GLenum type) 
+static unsigned get_index_type(int type)
 {
    switch (type) {
-   case GL_UNSIGNED_BYTE:  return BRW_INDEX_BYTE;
-   case GL_UNSIGNED_SHORT: return BRW_INDEX_WORD;
-   case GL_UNSIGNED_INT:   return BRW_INDEX_DWORD;
+   case 1: return BRW_INDEX_BYTE;
+   case 2: return BRW_INDEX_WORD;
+   case 4: return BRW_INDEX_DWORD;
    default: assert(0); return 0;
    }
 }
 
-static void wrap_buffers( struct brw_context *brw,
-			  GLuint size )
-{
-   if (size < BRW_UPLOAD_INIT_SIZE)
-      size = BRW_UPLOAD_INIT_SIZE;
-
-   brw->vb.upload.offset = 0;
-
-   if (brw->vb.upload.bo != NULL)
-      dri_bo_unreference(brw->vb.upload.bo);
-   brw->vb.upload.bo = dri_bo_alloc(brw->intel.bufmgr, "temporary VBO",
-				    size, 1);
-
-   /* Set the internal VBO\ to no-backing-store.  We only use them as a
-    * temporary within a brw_try_draw_prims while the lock is held.
-    */
-   /* DON'T DO THIS AS IF WE HAVE TO RE-ORG MEMORY WE NEED SOMEWHERE WITH
-      FAKE TO PUSH THIS STUFF */
-//   if (!brw->intel.ttm)
-//      dri_bo_fake_disable_backing_store(brw->vb.upload.bo, NULL, NULL);
-}
-
-static void get_space( struct brw_context *brw,
-		       GLuint size,
-		       dri_bo **bo_return,
-		       GLuint *offset_return )
-{
-   size = ALIGN(size, 64);
-
-   if (brw->vb.upload.bo == NULL ||
-       brw->vb.upload.offset + size > brw->vb.upload.bo->size) {
-      wrap_buffers(brw, size);
-   }
-
-   assert(*bo_return == NULL);
-   dri_bo_reference(brw->vb.upload.bo);
-   *bo_return = brw->vb.upload.bo;
-   *offset_return = brw->vb.upload.offset;
-   brw->vb.upload.offset += size;
-}
-
-static void
-copy_array_to_vbo_array( struct brw_context *brw,
-			 struct brw_vertex_element *element,
-			 GLuint dst_stride)
-{
-   struct intel_context *intel = &brw->intel;
-   GLuint size = element->count * dst_stride;
-
-   get_space(brw, size, &element->bo, &element->offset);
 
-   if (element->glarray->StrideB == 0) {
-      assert(element->count == 1);
-      element->stride = 0;
-   } else {
-      element->stride = dst_stride;
-   }
-
-   if (dst_stride == element->glarray->StrideB) {
-      if (intel->intelScreen->kernel_exec_fencing) {
-	 drm_intel_gem_bo_map_gtt(element->bo);
-	 memcpy((char *)element->bo->virtual + element->offset,
-		element->glarray->Ptr, size);
-	 drm_intel_gem_bo_unmap_gtt(element->bo);
-      } else {
-	 dri_bo_subdata(element->bo,
-			element->offset,
-			size,
-			element->glarray->Ptr);
-      }
-   } else {
-      char *dest;
-      const unsigned char *src = element->glarray->Ptr;
-      int i;
-
-      if (intel->intelScreen->kernel_exec_fencing) {
-	 drm_intel_gem_bo_map_gtt(element->bo);
-	 dest = element->bo->virtual;
-	 dest += element->offset;
-
-	 for (i = 0; i < element->count; i++) {
-	    memcpy(dest, src, dst_stride);
-	    src += element->glarray->StrideB;
-	    dest += dst_stride;
-	 }
-
-	 drm_intel_gem_bo_unmap_gtt(element->bo);
-      } else {
-	 void *data;
-
-	 data = _mesa_malloc(dst_stride * element->count);
-	 dest = data;
-	 for (i = 0; i < element->count; i++) {
-	    memcpy(dest, src, dst_stride);
-	    src += element->glarray->StrideB;
-	    dest += dst_stride;
-	 }
-
-	 dri_bo_subdata(element->bo,
-			element->offset,
-			size,
-			data);
-
-	 _mesa_free(data);
-      }
-   }
-}
 
-static void brw_prepare_vertices(struct brw_context *brw)
+static boolean brw_prepare_vertices(struct brw_context *brw)
 {
    GLcontext *ctx = &brw->intel.ctx;
    struct intel_context *intel = intel_context(ctx);
@@ -358,123 +208,38 @@ static void brw_prepare_vertices(struct brw_context *brw)
    if (0)
       _mesa_printf("%s %d..%d\n", __FUNCTION__, min_index, max_index);
 
-   /* Accumulate the list of enabled arrays. */
-   brw->vb.nr_enabled = 0;
-   while (vs_inputs) {
-      GLuint i = _mesa_ffsll(vs_inputs) - 1;
-      struct brw_vertex_element *input = &brw->vb.inputs[i];
 
-      vs_inputs &= ~(1 << i);
-      brw->vb.enabled[brw->vb.nr_enabled++] = input;
-   }
-
-   /* XXX: In the rare cases where this happens we fallback all
-    * the way to software rasterization, although a tnl fallback
-    * would be sufficient.  I don't know of *any* real world
-    * cases with > 17 vertex attributes enabled, so it probably
-    * isn't an issue at this point.
-    */
-   if (brw->vb.nr_enabled >= BRW_VEP_MAX) {
-      intel->Fallback = 1;
-      return;
-   }
 
    for (i = 0; i < brw->vb.nr_enabled; i++) {
       struct brw_vertex_element *input = brw->vb.enabled[i];
 
       input->element_size = get_size(input->glarray->Type) * input->glarray->Size;
 
-      if (_mesa_is_bufferobj(input->glarray->BufferObj)) {
-	 struct intel_buffer_object *intel_buffer =
-	    intel_buffer_object(input->glarray->BufferObj);
-
-	 /* Named buffer object: Just reference its contents directly. */
-	 dri_bo_unreference(input->bo);
-	 input->bo = intel_bufferobj_buffer(intel, intel_buffer,
-					    INTEL_READ);
-	 dri_bo_reference(input->bo);
-	 input->offset = (unsigned long)input->glarray->Ptr;
-	 input->stride = input->glarray->StrideB;
-	 input->count = input->glarray->_MaxElement;
-
-	 /* This is a common place to reach if the user mistakenly supplies
-	  * a pointer in place of a VBO offset.  If we just let it go through,
-	  * we may end up dereferencing a pointer beyond the bounds of the
-	  * GTT.  We would hope that the VBO's max_index would save us, but
-	  * Mesa appears to hand us min/max values not clipped to the
-	  * array object's _MaxElement, and _MaxElement frequently appears
-	  * to be wrong anyway.
-	  *
-	  * The VBO spec allows application termination in this case, and it's
-	  * probably a service to the poor programmer to do so rather than
-	  * trying to just not render.
-	  */
-	 assert(input->offset < input->bo->size);
-      } else {
-	 input->count = input->glarray->StrideB ? max_index + 1 - min_index : 1;
-	 if (input->bo != NULL) {
-	    /* Already-uploaded vertex data is present from a previous
-	     * prepare_vertices, but we had to re-validate state due to
-	     * check_aperture failing and a new batch being produced.
-	     */
-	    continue;
-	 }
-
-	 /* Queue the buffer object up to be uploaded in the next pass,
-	  * when we've decided if we're doing interleaved or not.
-	  */
-	 if (input->attrib == VERT_ATTRIB_POS) {
-	    /* Position array not properly enabled:
-	     */
-            if (input->glarray->StrideB == 0) {
-               intel->Fallback = 1;
-               return;
-            }
-
-	    interleave = input->glarray->StrideB;
-	    ptr = input->glarray->Ptr;
-	 }
-	 else if (interleave != input->glarray->StrideB ||
-		  (const unsigned char *)input->glarray->Ptr - ptr < 0 ||
-		  (const unsigned char *)input->glarray->Ptr - ptr > interleave)
-	 {
-	    interleave = 0;
-	 }
-
-	 upload[nr_uploads++] = input;
-	 
-	 /* We rebase drawing to start at element zero only when
-	  * varyings are not in vbos, which means we can end up
-	  * uploading non-varying arrays (stride != 0) when min_index
-	  * is zero.  This doesn't matter as the amount to upload is
-	  * the same for these arrays whether the draw call is rebased
-	  * or not - we just have to upload the one element.
-	  */
-	 assert(min_index == 0 || input->glarray->StrideB == 0);
-      }
-   }
-
-   /* Handle any arrays to be uploaded. */
-   if (nr_uploads > 1 && interleave && interleave <= 256) {
-      /* All uploads are interleaved, so upload the arrays together as
-       * interleaved.  First, upload the contents and set up upload[0].
-       */
-      copy_array_to_vbo_array(brw, upload[0], interleave);
-
-      for (i = 1; i < nr_uploads; i++) {
-	 /* Then, just point upload[i] at upload[0]'s buffer. */
-	 upload[i]->stride = interleave;
-	 upload[i]->offset = upload[0]->offset +
-	    ((const unsigned char *)upload[i]->glarray->Ptr - ptr);
-	 upload[i]->bo = upload[0]->bo;
-	 dri_bo_reference(upload[i]->bo);
+      if (brw_is_user_buffer(vb)) {
+	 u_upload_buffer( brw->upload, 
+			  min_index * vb->stride,
+			  (max_index + 1 - min_index) * vb->stride,
+			  &offset,
+			  &buffer );
       }
-   }
-   else {
-      /* Upload non-interleaved arrays */
-      for (i = 0; i < nr_uploads; i++) {
-          copy_array_to_vbo_array(brw, upload[i], upload[i]->element_size);
+      else
+      {
+	 offset = 0;
+	 buffer = vb->buffer;
+	 count = stride == 0 ? 1 : max_index + 1 - min_index;
       }
+
+      /* Named buffer object: Just reference its contents directly. */
+      dri_bo_unreference(input->bo);
+      input->bo = intel_bufferobj_buffer(intel, intel_buffer,
+					 INTEL_READ);
+      dri_bo_reference(input->bo);
+
+      input->offset = (unsigned long)offset;
+      input->stride = vb->stride;
+      input->count = count;
+
+      assert(input->offset < input->bo->size);
    }
 
    brw_prepare_query_begin(brw);
@@ -632,13 +397,8 @@ static void brw_prepare_indices(struct brw_context *brw)
 
       /* Straight upload
        */
-      if (intel->intelScreen->kernel_exec_fencing) {
-	 drm_intel_gem_bo_map_gtt(bo);
-	 memcpy((char *)bo->virtual + offset, index_buffer->ptr, ib_size);
-	 drm_intel_gem_bo_unmap_gtt(bo);
-      } else {
-	 dri_bo_subdata(bo, offset, ib_size, index_buffer->ptr);
-      }
+      brw_bo_subdata(bo, offset, ib_size, index_buffer->ptr);
+
    } else {
       offset = (GLuint) (unsigned long) index_buffer->ptr;
       brw->ib.start_vertex_offset = 0;
diff --git a/src/gallium/drivers/i965/brw_gs.c b/src/gallium/drivers/i965/brw_gs.c
index 48c2b9a41c..5ec0c585fe 100644
--- a/src/gallium/drivers/i965/brw_gs.c
+++ b/src/gallium/drivers/i965/brw_gs.c
@@ -58,7 +58,7 @@ static void compile_gs_prog( struct brw_context *brw,
    /* Need to locate the two positions present in vertex + header.
     * These are currently hardcoded:
     */
-   c.nr_attrs = brw_count_bits(c.key.attrs);
+   c.nr_attrs = util_count_bits(c.key.attrs);
 
    if (BRW_IS_IGDNG(brw))
        c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
diff --git a/src/gallium/drivers/i965/brw_pipe_blend.c b/src/gallium/drivers/i965/brw_pipe_blend.c
new file mode 100644
index 0000000000..b351794dce
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_blend.c
@@ -0,0 +1,41 @@
+
+   /* _NEW_COLOR */
+   if (key->logic_op != GL_COPY) {
+      cc.cc2.logicop_enable = 1;
+      cc.cc5.logicop_func = intel_translate_logic_op(key->logic_op);
+   } else if (key->color_blend) {
+      GLenum eqRGB = key->blend_eq_rgb;
+      GLenum eqA = key->blend_eq_a;
+      GLenum srcRGB = key->blend_src_rgb;
+      GLenum dstRGB = key->blend_dst_rgb;
+      GLenum srcA = key->blend_src_a;
+      GLenum dstA = key->blend_dst_a;
+
+      if (eqRGB == GL_MIN || eqRGB == GL_MAX) {
+	 srcRGB = dstRGB = GL_ONE;
+      }
+
+      if (eqA == GL_MIN || eqA == GL_MAX) {
+	 srcA = dstA = GL_ONE;
+      }
+
+      cc.cc6.dest_blend_factor = brw_translate_blend_factor(dstRGB);
+      cc.cc6.src_blend_factor = brw_translate_blend_factor(srcRGB);
+      cc.cc6.blend_function = brw_translate_blend_equation(eqRGB);
+
+      cc.cc5.ia_dest_blend_factor = brw_translate_blend_factor(dstA);
+      cc.cc5.ia_src_blend_factor = brw_translate_blend_factor(srcA);
+      cc.cc5.ia_blend_function = brw_translate_blend_equation(eqA);
+
+      cc.cc3.blend_enable = 1;
+      cc.cc3.ia_blend_enable = (srcA != srcRGB ||
+				dstA != dstRGB ||
+				eqA != eqRGB);
+   }
+
+   if (key->dither) {
+      cc.cc5.dither_enable = 1;
+      cc.cc6.y_dither_offset = 0;
+      cc.cc6.x_dither_offset = 0;
+   }
+
diff --git a/src/gallium/drivers/i965/brw_pipe_debug.c b/src/gallium/drivers/i965/brw_pipe_debug.c
new file mode 100644
index 0000000000..34d6d4028a
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_debug.c
@@ -0,0 +1,2 @@
+   if (INTEL_DEBUG & DEBUG_STATS)
+      cc.cc5.statistics_enable = 1;
diff --git a/src/gallium/drivers/i965/brw_pipe_depth.c b/src/gallium/drivers/i965/brw_pipe_depth.c
new file mode 100644
index 0000000000..da29bc8bcb
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_depth.c
@@ -0,0 +1,52 @@
+   /* _NEW_STENCIL */
+   if (key->dsa.stencil[0].enable) {
+      cc.cc0.stencil_enable = 1;
+      cc.cc0.stencil_func =
+	 intel_translate_compare_func(key->stencil_func[0]);
+      cc.cc0.stencil_fail_op =
+	 intel_translate_stencil_op(key->stencil_fail_op[0]);
+      cc.cc0.stencil_pass_depth_fail_op =
+	 intel_translate_stencil_op(key->stencil_pass_depth_fail_op[0]);
+      cc.cc0.stencil_pass_depth_pass_op =
+	 intel_translate_stencil_op(key->stencil_pass_depth_pass_op[0]);
+      cc.cc1.stencil_ref = key->stencil_ref[0];
+      cc.cc1.stencil_write_mask = key->stencil_write_mask[0];
+      cc.cc1.stencil_test_mask = key->stencil_test_mask[0];
+
+      if (key->stencil_two_side) {
+	 cc.cc0.bf_stencil_enable = 1;
+	 cc.cc0.bf_stencil_func =
+	    intel_translate_compare_func(key->stencil_func[1]);
+	 cc.cc0.bf_stencil_fail_op =
+	    intel_translate_stencil_op(key->stencil_fail_op[1]);
+	 cc.cc0.bf_stencil_pass_depth_fail_op =
+	    intel_translate_stencil_op(key->stencil_pass_depth_fail_op[1]);
+	 cc.cc0.bf_stencil_pass_depth_pass_op =
+	    intel_translate_stencil_op(key->stencil_pass_depth_pass_op[1]);
+	 cc.cc1.bf_stencil_ref = key->stencil_ref[1];
+	 cc.cc2.bf_stencil_write_mask = key->stencil_write_mask[1];
+	 cc.cc2.bf_stencil_test_mask = key->stencil_test_mask[1];
+      }
+
+      /* Not really sure about this:
+       */
+      if (key->stencil_write_mask[0] ||
+	  (key->stencil_two_side && key->stencil_write_mask[1]))
+	 cc.cc0.stencil_write_enable = 1;
+   }
+
+
+   if (key->alpha_enabled) {
+      cc.cc3.alpha_test = 1;
+      cc.cc3.alpha_test_func = intel_translate_compare_func(key->alpha_func);
+      cc.cc3.alpha_test_format = BRW_ALPHATEST_FORMAT_UNORM8;
+
+      UNCLAMPED_FLOAT_TO_UBYTE(cc.cc7.alpha_ref.ub[0], key->alpha_ref);
+   }
+
+   /* _NEW_DEPTH */
+   if (key->depth_test) {
+      cc.cc2.depth_test = 1;
+      cc.cc2.depth_test_function = intel_translate_compare_func(key->depth_func);
+      cc.cc2.depth_write_enable = key->depth_write;
+   }
diff --git a/src/gallium/drivers/i965/brw_pipe_fb.c b/src/gallium/drivers/i965/brw_pipe_fb.c
new file mode 100644
index 0000000000..d4ae332f46
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_fb.c
@@ -0,0 +1,25 @@
+
+/**
+ * called from intelDrawBuffer()
+ */
+static void brw_set_draw_region( struct intel_context *intel, 
+                                 struct intel_region *color_regions[],
+                                 struct intel_region *depth_region,
+                                 GLuint num_color_regions)
+{
+   struct brw_context *brw = brw_context(&intel->ctx);
+   GLuint i;
+
+   /* release old color/depth regions */
+   if (brw->state.depth_region != depth_region)
+      brw->state.dirty.brw |= BRW_NEW_DEPTH_BUFFER;
+   for (i = 0; i < brw->state.nr_color_regions; i++)
+       intel_region_release(&brw->state.color_regions[i]);
+   intel_region_release(&brw->state.depth_region);
+
+   /* reference new color/depth regions */
+   for (i = 0; i < num_color_regions; i++)
+       intel_region_reference(&brw->state.color_regions[i], color_regions[i]);
+   intel_region_reference(&brw->state.depth_region, depth_region);
+   brw->state.nr_color_regions = num_color_regions;
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_flush.c b/src/gallium/drivers/i965/brw_pipe_flush.c
new file mode 100644
index 0000000000..008f623151
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_flush.c
@@ -0,0 +1,64 @@
+
+/**
+ * called from intel_batchbuffer_flush and children before sending a
+ * batchbuffer off.
+ */
+static void brw_finish_batch(struct intel_context *intel)
+{
+   struct brw_context *brw = brw_context(&intel->ctx);
+   brw_emit_query_end(brw);
+}
+
+
+/**
+ * called from intelFlushBatchLocked
+ */
+static void brw_new_batch( struct intel_context *intel )
+{
+   struct brw_context *brw = brw_context(&intel->ctx);
+
+   /* Check that we didn't just wrap our batchbuffer at a bad time. */
+   assert(!brw->no_batch_wrap);
+
+   brw->curbe.need_new_bo = GL_TRUE;
+
+   /* Mark all context state as needing to be re-emitted.
+    * This is probably not as severe as on 915, since almost all of our state
+    * is just in referenced buffers.
+    */
+   brw->state.dirty.brw |= BRW_NEW_CONTEXT;
+
+   brw->state.dirty.mesa |= ~0;
+   brw->state.dirty.brw |= ~0;
+   brw->state.dirty.cache |= ~0;
+
+   /* Move to the end of the current upload buffer so that we'll force choosing
+    * a new buffer next time.
+    */
+   if (brw->vb.upload.bo != NULL) {
+      dri_bo_unreference(brw->vb.upload.bo);
+      brw->vb.upload.bo = NULL;
+      brw->vb.upload.offset = 0;
+   }
+}
+
+
+static void brw_note_fence( struct intel_context *intel, GLuint fence )
+{
+   brw_context(&intel->ctx)->state.dirty.brw |= BRW_NEW_FENCE;
+}
+
+/* called from intelWaitForIdle() and intelFlush()
+ *
+ * For now, just flush everything.  Could be smarter later.
+ */
+static GLuint brw_flush_cmd( void )
+{
+   struct brw_mi_flush flush;
+   flush.opcode = CMD_MI_FLUSH;
+   flush.pad = 0;
+   flush.flags = BRW_FLUSH_STATE_CACHE;
+   return *(GLuint *)&flush;
+}
+
+
diff --git a/src/gallium/drivers/i965/brw_screen_surface.c b/src/gallium/drivers/i965/brw_screen_surface.c
new file mode 100644
index 0000000000..d199d0b81a
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_screen_surface.c
@@ -0,0 +1,27 @@
+   /* _NEW_BUFFERS */
+   if (IS_965(intel->intelScreen->deviceID) &&
+       !IS_G4X(intel->intelScreen->deviceID)) {
+      for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
+	 struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
+	 struct intel_renderbuffer *irb = intel_renderbuffer(rb);
+
+	 /* The original gen4 hardware couldn't set up WM surfaces pointing
+	  * at an offset within a tile, which can happen when rendering to
+	  * anything but the base level of a texture or the +X face/0 depth.
+	  * This was fixed with the 4 Series hardware.
+	  *
+	  * For these original chips, you would have to make the depth and
+	  * color destination surfaces include information on the texture
+	  * type, LOD, face, and various limits to use them as a destination.
+	  * I would have done this, but there's also a nasty requirement that
+	  * the depth and the color surfaces all be of the same LOD, which
+	  * may be a worse requirement than this alignment.  (Also, we may
+	  * want to just demote the texture to untiled, instead).
+	  */
+	 if (irb->region && 
+	     irb->region->tiling != I915_TILING_NONE &&
+	     (irb->region->draw_offset & 4095)) {
+	    DBG("FALLBACK: non-tile-aligned destination for tiled FBO\n");
+	    return GL_TRUE;
+	 }
+      }
diff --git a/src/gallium/drivers/i965/brw_sf.c b/src/gallium/drivers/i965/brw_sf.c
index e1c2c7777b..90513245ee 100644
--- a/src/gallium/drivers/i965/brw_sf.c
+++ b/src/gallium/drivers/i965/brw_sf.c
@@ -59,9 +59,9 @@ static void compile_sf_prog( struct brw_context *brw,
    brw_init_compile(brw, &c.func);
 
    c.key = *key;
-   c.nr_attrs = brw_count_bits(c.key.attrs);
+   c.nr_attrs = util_count_bits(c.key.attrs);
    c.nr_attr_regs = (c.nr_attrs+1)/2;
-   c.nr_setup_attrs = brw_count_bits(c.key.attrs & DO_SETUP_BITS);
+   c.nr_setup_attrs = util_count_bits(c.key.attrs & DO_SETUP_BITS);
    c.nr_setup_regs = (c.nr_setup_attrs+1)/2;
 
    c.prog_data.urb_read_length = c.nr_attr_regs;
diff --git a/src/gallium/drivers/i965/brw_sf_emit.c b/src/gallium/drivers/i965/brw_sf_emit.c
index ca8f97f9f9..4cc427a935 100644
--- a/src/gallium/drivers/i965/brw_sf_emit.c
+++ b/src/gallium/drivers/i965/brw_sf_emit.c
@@ -150,7 +150,7 @@ static void do_flatshade_triangle( struct brw_sf_compile *c )
 {
    struct brw_compile *p = &c->func;
    struct brw_reg ip = brw_ip_reg();
-   GLuint nr = brw_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
+   GLuint nr = util_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
    GLuint jmpi = 1;
 
    if (!nr)
@@ -188,7 +188,7 @@ static void do_flatshade_line( struct brw_sf_compile *c )
 {
    struct brw_compile *p = &c->func;
    struct brw_reg ip = brw_ip_reg();
-   GLuint nr = brw_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
+   GLuint nr = util_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
    GLuint jmpi = 1;
 
    if (!nr)
diff --git a/src/gallium/drivers/i965/brw_state_upload.c b/src/gallium/drivers/i965/brw_state_upload.c
index b817b741e7..6801084616 100644
--- a/src/gallium/drivers/i965/brw_state_upload.c
+++ b/src/gallium/drivers/i965/brw_state_upload.c
@@ -270,7 +270,7 @@ brw_print_dirty_count(struct dirty_bit_map *bit_map, int32_t bits)
 /***********************************************************************
  * Emit all state:
  */
-void brw_validate_state( struct brw_context *brw )
+enum pipe_error brw_validate_state( struct brw_context *brw )
 {
    GLcontext *ctx = &brw->intel.ctx;
    struct intel_context *intel = &brw->intel;
@@ -278,10 +278,6 @@ void brw_validate_state( struct brw_context *brw )
    GLuint i;
 
    brw_clear_validated_bos(brw);
-
-   state->mesa |= brw->intel.NewGLState;
-   brw->intel.NewGLState = 0;
-
    brw_add_validated_bo(brw, intel->batch->buf);
 
    if (brw->emit_state_always) {
@@ -290,36 +286,23 @@ void brw_validate_state( struct brw_context *brw )
       state->cache |= ~0;
    }
 
-   if (brw->fragment_program != ctx->FragmentProgram._Current) {
-      brw->fragment_program = ctx->FragmentProgram._Current;
-      brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM;
-   }
-
-   if (brw->vertex_program != ctx->VertexProgram._Current) {
-      brw->vertex_program = ctx->VertexProgram._Current;
-      brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM;
-   }
-
    if (state->mesa == 0 &&
        state->cache == 0 &&
        state->brw == 0)
-      return;
+      return 0;
 
    if (brw->state.dirty.brw & BRW_NEW_CONTEXT)
       brw_clear_batch_cache(brw);
 
-   brw->intel.Fallback = 0;
-
    /* do prepare stage for all atoms */
    for (i = 0; i < Elements(atoms); i++) {
       const struct brw_tracked_state *atom = atoms[i];
 
-      if (brw->intel.Fallback)
-         break;
-
       if (check_state(state, &atom->dirty)) {
          if (atom->prepare) {
-            atom->prepare(brw);
+            ret = atom->prepare(brw);
+	    if (ret)
+	       return ret;
         }
       }
    }
@@ -329,17 +312,18 @@ void brw_validate_state( struct brw_context *brw )
     * If this fails, we can experience GPU lock-ups.
     */
    {
-      const struct brw_fragment_program *fp;
-      fp = brw_fragment_program_const(brw->fragment_program);
+      const struct brw_fragment_program *fp = brw->fragment_program;
       if (fp) {
-         assert((fp->tex_units_used & ctx->Texture._EnabledUnits)
-                == fp->tex_units_used);
+         assert(fp->info.max_sampler <= brw->nr_samplers &&
+		fp->info.max_texture <= brw->nr_textures);
       }
    }
+
+   return 0;
 }
 
 
-void brw_upload_state(struct brw_context *brw)
+enum pipe_error brw_upload_state(struct brw_context *brw)
 {
    struct brw_state_flags *state = &brw->state.dirty;
    int i;
@@ -356,7 +340,7 @@ void brw_upload_state(struct brw_context *brw)
       _mesa_memset(&examined, 0, sizeof(examined));
       prev = *state;
 
-      for (i = 0; i < Elements(atoms); i++) {	 
+      for (i = 0; i < Elements(atoms); i++) {
 	 const struct brw_tracked_state *atom = atoms[i];
 	 struct brw_state_flags generated;
 
@@ -364,12 +348,11 @@ void brw_upload_state(struct brw_context *brw)
 		atom->dirty.brw ||
 		atom->dirty.cache);
 
-	 if (brw->intel.Fallback)
-	    break;
-
 	 if (check_state(state, &atom->dirty)) {
 	    if (atom->emit) {
-	       atom->emit( brw );
+	       ret = atom->emit( brw );
+	       if (ret)
+		  return ret;
 	    }
 	 }
 
@@ -388,12 +371,11 @@ void brw_upload_state(struct brw_context *brw)
       for (i = 0; i < Elements(atoms); i++) {	 
 	 const struct brw_tracked_state *atom = atoms[i];
 
-	 if (brw->intel.Fallback)
-	    break;
-
 	 if (check_state(state, &atom->dirty)) {
 	    if (atom->emit) {
-	       atom->emit( brw );
+	       ret = atom->emit( brw );
+	       if (ret)
+		  return ret;
 	    }
 	 }
       }
@@ -407,10 +389,11 @@ void brw_upload_state(struct brw_context *brw)
 	 brw_print_dirty_count(mesa_bits, state->mesa);
 	 brw_print_dirty_count(brw_bits, state->brw);
 	 brw_print_dirty_count(cache_bits, state->cache);
-	 fprintf(stderr, "\n");
+	 debug_printf("\n");
       }
    }
-
-   if (!brw->intel.Fallback)
-      memset(state, 0, sizeof(*state));
+   
+   /* Clear dirty flags:
+    */
+   memset(state, 0, sizeof(*state));
 }
diff --git a/src/gallium/drivers/i965/brw_swtnl.c b/src/gallium/drivers/i965/brw_swtnl.c
new file mode 100644
index 0000000000..6684f442d5
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_swtnl.c
@@ -0,0 +1,114 @@
+
+/* XXX: could split the primitive list to fallback only on the
+ * non-conformant primitives.
+ */
+static GLboolean check_fallbacks( struct brw_context *brw,
+				  const struct _mesa_prim *prim,
+				  GLuint nr_prims )
+{
+   GLcontext *ctx = &brw->intel.ctx;
+   GLuint i;
+
+   /* If we don't require strict OpenGL conformance, never 
+    * use fallbacks.  If we're forcing fallbacks, always
+    * use fallbacks.
+    */
+   if (brw->intel.conformance_mode == 0)
+      return GL_FALSE;
+
+   if (brw->intel.conformance_mode == 2)
+      return GL_TRUE;
+
+   if (ctx->Polygon.SmoothFlag) {
+      for (i = 0; i < nr_prims; i++)
+	 if (reduced_prim[prim[i].mode] == GL_TRIANGLES) 
+	    return GL_TRUE;
+   }
+
+   /* BRW hardware will do AA lines, but they are non-conformant it
+    * seems.  TBD whether we keep this fallback:
+    */
+   if (ctx->Line.SmoothFlag) {
+      for (i = 0; i < nr_prims; i++)
+	 if (reduced_prim[prim[i].mode] == GL_LINES) 
+	    return GL_TRUE;
+   }
+
+   /* Stipple -- these fallbacks could be resolved with a little
+    * bit of work?
+    */
+   if (ctx->Line.StippleFlag) {
+      for (i = 0; i < nr_prims; i++) {
+	 /* GS doesn't get enough information to know when to reset
+	  * the stipple counter?!?
+	  */
+	 if (prim[i].mode == GL_LINE_LOOP || prim[i].mode == GL_LINE_STRIP) 
+	    return GL_TRUE;
+	    
+	 if (prim[i].mode == GL_POLYGON &&
+	     (ctx->Polygon.FrontMode == GL_LINE ||
+	      ctx->Polygon.BackMode == GL_LINE))
+	    return GL_TRUE;
+      }
+   }
+
+   if (ctx->Point.SmoothFlag) {
+      for (i = 0; i < nr_prims; i++)
+	 if (prim[i].mode == GL_POINTS) 
+	    return GL_TRUE;
+   }
+
+   /* BRW hardware doesn't handle GL_CLAMP texturing correctly;
+    * brw_wm_sampler_state:translate_wrap_mode() treats GL_CLAMP
+    * as GL_CLAMP_TO_EDGE instead.  If we're using GL_CLAMP, and
+    * we want strict conformance, force the fallback.
+    * Right now, we check the 1D, 2D and 3D texture targets.
+    */
+   {
+      int u;
+      for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
+         struct gl_texture_unit *texUnit = &ctx->Texture.Unit[u];
+         if (texUnit->Enabled) {
+            if (texUnit->Enabled & TEXTURE_1D_BIT) {
+               if (texUnit->CurrentTex[TEXTURE_1D_INDEX]->WrapS == GL_CLAMP) {
+                   return GL_TRUE;
+               }
+            }
+            if (texUnit->Enabled & TEXTURE_2D_BIT) {
+               if (texUnit->CurrentTex[TEXTURE_2D_INDEX]->WrapS == GL_CLAMP ||
+                   texUnit->CurrentTex[TEXTURE_2D_INDEX]->WrapT == GL_CLAMP) {
+                   return GL_TRUE;
+               }
+            }
+            if (texUnit->Enabled & TEXTURE_3D_BIT) {
+               if (texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapS == GL_CLAMP ||
+                   texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapT == GL_CLAMP ||
+                   texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapR == GL_CLAMP) {
+                   return GL_TRUE;
+               }
+            }
+         }
+      }
+   }
+
+   /* Exceeding hw limits on number of VS inputs?
+    */
+   if (brw->nr_ve == 0 ||
+       brw->nr_ve >= BRW_VEP_MAX) {
+      return TRUE;
+   }
+
+   /* Position array with zero stride?
+    */
+   if (brw->vs[brw->ve[0]]->stride == 0)
+      return TRUE;
+
+
+      
+   /* Nothing stopping us from the fast path now */
+   return GL_FALSE;
+}
+
+
+
+
diff --git a/src/gallium/drivers/i965/brw_types.h b/src/gallium/drivers/i965/brw_types.h
new file mode 100644
index 0000000000..32b62848da
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_types.h
@@ -0,0 +1,11 @@
+#ifndef BRW_TYPES_H
+#define BRW_TYPES_H
+
+typedef GLuint uint32_t;
+typedef GLubyte uint8_t;
+typedef GLushort uint16_t;
+/* no GLenum, translate all away */
+
+typedef GLboolean uint8_t;
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_util.c b/src/gallium/drivers/i965/brw_util.c
index ce21aa4869..17f671a8fa 100644
--- a/src/gallium/drivers/i965/brw_util.c
+++ b/src/gallium/drivers/i965/brw_util.c
@@ -35,14 +35,6 @@
 #include "brw_util.h"
 #include "brw_defines.h"
 
-GLuint brw_count_bits( GLuint val )
-{
-   GLuint i;
-   for (i = 0; val ; val >>= 1)
-      if (val & 1)
-	 i++;
-   return i;
-}
 
 
 GLuint brw_translate_blend_equation( GLenum mode )
diff --git a/src/gallium/drivers/i965/brw_vs.c b/src/gallium/drivers/i965/brw_vs.c
index f0c79efbd9..53a5560105 100644
--- a/src/gallium/drivers/i965/brw_vs.c
+++ b/src/gallium/drivers/i965/brw_vs.c
@@ -61,9 +61,7 @@ static void do_vs_prog( struct brw_context *brw,
    }
 
    if (0)
-      _mesa_print_program(&c.vp->program.Base);
-
-
+      tgsi_dump(&c.vp->tokens, 0);
 
    /* Emit GEN4 code.
     */
@@ -96,9 +94,9 @@ static void brw_upload_vs_prog(struct brw_context *brw)
     * the inputs it asks for, whether they are varying or not.
     */
    key.program_string_id = vp->id;
-   key.nr_userclip = brw_count_bits(ctx->Transform.ClipPlanesEnabled);
-   key.copy_edgeflag = (ctx->Polygon.FrontMode != GL_FILL ||
-			ctx->Polygon.BackMode != GL_FILL);
+   key.nr_userclip = brw->nr_userclip;
+   key.copy_edgeflag = (brw->rast->fill_ccw != PIPE_POLYGON_MODE_FILL ||
+			brw->rast->fill_cw != PIPE_POLYGON_MODE_FILL);
 
    /* Make an early check for the key.
     */
@@ -116,7 +114,7 @@ static void brw_upload_vs_prog(struct brw_context *brw)
  */
 const struct brw_tracked_state brw_vs_prog = {
    .dirty = {
-      .mesa  = _NEW_TRANSFORM | _NEW_POLYGON,
+      .mesa  = PIPE_NEW_UCP | PIPE_NEW_RAST,
       .brw   = BRW_NEW_VERTEX_PROGRAM,
       .cache = 0
    },
diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c
index 1638ef8111..7f20c4baca 100644
--- a/src/gallium/drivers/i965/brw_vs_emit.c
+++ b/src/gallium/drivers/i965/brw_vs_emit.c
@@ -33,7 +33,7 @@
 #include "main/macros.h"
 #include "shader/program.h"
 #include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
+#include "pipe/p_shader_tokens.h"
 #include "brw_context.h"
 #include "brw_vs.h"
 
@@ -129,6 +129,7 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 	 reg++;
       }
    }
+
    /* If there are no inputs, we'll still be reading one attribute's worth
     * because it's required -- see urb_read_length setting.
     */
@@ -226,6 +227,7 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
     * vertex urb, so is half the amount:
     */
    c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
+
    /* Setting this field to 0 leads to undefined behavior according to the
     * the VS_STATE docs.  Our VUEs will always have at least one attribute
     * sitting in them, even if it's padding.
@@ -960,9 +962,6 @@ static void emit_arl( struct brw_vs_compile *c,
 
 /**
  * Return the brw reg for the given instruction's src argument.
- * Will return mangled results for SWZ op.  The emit_swz() function
- * ignores this result and recalculates taking extended swizzles into
- * account.
  */
 static struct brw_reg get_arg( struct brw_vs_compile *c,
                                const struct prog_instruction *inst,
@@ -1024,74 +1023,6 @@ static struct brw_reg get_dst( struct brw_vs_compile *c,
 }
 
 
-static void emit_swz( struct brw_vs_compile *c, 
-		      struct brw_reg dst,
-                      const struct prog_instruction *inst)
-{
-   const GLuint argIndex = 0;
-   const struct prog_src_register src = inst->SrcReg[argIndex];
-   struct brw_compile *p = &c->func;
-   GLuint zeros_mask = 0;
-   GLuint ones_mask = 0;
-   GLuint src_mask = 0;
-   GLubyte src_swz[4];
-   GLboolean need_tmp = (src.Negate &&
-			 dst.file != BRW_GENERAL_REGISTER_FILE);
-   struct brw_reg tmp = dst;
-   GLuint i;
-
-   if (need_tmp)
-      tmp = get_tmp(c);
-
-   for (i = 0; i < 4; i++) {
-      if (dst.dw1.bits.writemask & (1<<i)) {
-	 GLubyte s = GET_SWZ(src.Swizzle, i);
-	 switch (s) {
-	 case SWIZZLE_X:
-	 case SWIZZLE_Y:
-	 case SWIZZLE_Z:
-	 case SWIZZLE_W:
-	    src_mask |= 1<<i;
-	    src_swz[i] = s;
-	    break;
-	 case SWIZZLE_ZERO:
-	    zeros_mask |= 1<<i;
-	    break;
-	 case SWIZZLE_ONE:
-	    ones_mask |= 1<<i;
-	    break;
-	 }
-      }
-   }
-   
-   /* Do src first, in case dst aliases src:
-    */
-   if (src_mask) {
-      struct brw_reg arg0;
-
-      arg0 = get_src_reg(c, inst, argIndex);
-
-      arg0 = brw_swizzle(arg0, 
-			 src_swz[0], src_swz[1], 
-			 src_swz[2], src_swz[3]);
-
-      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
-   } 
-   
-   if (zeros_mask) 
-      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
-
-   if (ones_mask) 
-      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
-
-   if (src.Negate)
-      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));
-   
-   if (need_tmp) {
-      brw_MOV(p, dst, tmp);
-      release_tmp(c, tmp);
-   }
-}
 
 
 /**
@@ -1332,20 +1263,6 @@ void brw_vs_emit(struct brw_vs_compile *c )
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_set_access_mode(p, BRW_ALIGN_16);
    
-   /* Message registers can't be read, so copy the output into GRF register
-      if they are used in source registers */
-   for (insn = 0; insn < nr_insns; insn++) {
-       GLuint i;
-       struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
-       for (i = 0; i < 3; i++) {
-	   struct prog_src_register *src = &inst->SrcReg[i];
-	   GLuint index = src->Index;
-	   GLuint file = src->File;	
-	   if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
-	       c->output_regs[index].used_in_src = GL_TRUE;
-       }
-   }
-
    /* Static register allocation
     */
    brw_vs_alloc_regs(c);
@@ -1362,18 +1279,14 @@ void brw_vs_emit(struct brw_vs_compile *c )
       _mesa_print_instruction(inst);
 #endif
 
-      /* Get argument regs.  SWZ is special and does this itself.
+      /* Get argument regs.
        */
-      if (inst->Opcode != OPCODE_SWZ)
-	  for (i = 0; i < 3; i++) {
-	      const struct prog_src_register *src = &inst->SrcReg[i];
-	      index = src->Index;
-	      file = src->File;	
-	      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
-		  args[i] = c->output_regs[index].reg;
-	      else
-                  args[i] = get_arg(c, inst, i);
-	  }
+      for (i = 0; i < 3; i++) {
+	 const struct prog_src_register *src = &inst->SrcReg[i];
+	 index = src->Index;
+	 file = src->File;	
+	 args[i] = get_arg(c, inst, i);
+      }
 
       /* Get dest regs.  Note that it is possible for a reg to be both
        * dst and arg, given the static allocation of registers.  So
@@ -1381,10 +1294,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
        */ 
       index = inst->DstReg.Index;
       file = inst->DstReg.File;
-      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
-	  dst = c->output_regs[index].reg;
-      else
-	  dst = get_dst(c, inst->DstReg);
+      dst = get_dst(c, inst->DstReg);
 
       if (inst->SaturateMode != SATURATE_OFF) {
 	 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
@@ -1392,151 +1302,144 @@ void brw_vs_emit(struct brw_vs_compile *c )
       }
 
       switch (inst->Opcode) {
-      case OPCODE_ABS:
+      case TGSI_OPCODE_ABS:
 	 brw_MOV(p, dst, brw_abs(args[0]));
 	 break;
-      case OPCODE_ADD:
+      case TGSI_OPCODE_ADD:
 	 brw_ADD(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_COS:
+      case TGSI_OPCODE_COS:
 	 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
 	 break;
-      case OPCODE_DP3:
+      case TGSI_OPCODE_DP3:
 	 brw_DP3(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_DP4:
+      case TGSI_OPCODE_DP4:
 	 brw_DP4(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_DPH:
+      case TGSI_OPCODE_DPH:
 	 brw_DPH(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_NRM3:
+      case TGSI_OPCODE_NRM3:
 	 emit_nrm(c, dst, args[0], 3);
 	 break;
-      case OPCODE_NRM4:
+      case TGSI_OPCODE_NRM4:
 	 emit_nrm(c, dst, args[0], 4);
 	 break;
-      case OPCODE_DST:
+      case TGSI_OPCODE_DST:
 	 unalias2(c, dst, args[0], args[1], emit_dst_noalias); 
 	 break;
-      case OPCODE_EXP:
+      case TGSI_OPCODE_EXP:
 	 unalias1(c, dst, args[0], emit_exp_noalias);
 	 break;
-      case OPCODE_EX2:
+      case TGSI_OPCODE_EX2:
 	 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
 	 break;
-      case OPCODE_ARL:
+      case TGSI_OPCODE_ARL:
 	 emit_arl(c, dst, args[0]);
 	 break;
-      case OPCODE_FLR:
+      case TGSI_OPCODE_FLR:
 	 brw_RNDD(p, dst, args[0]);
 	 break;
-      case OPCODE_FRC:
+      case TGSI_OPCODE_FRC:
 	 brw_FRC(p, dst, args[0]);
 	 break;
-      case OPCODE_LOG:
+      case TGSI_OPCODE_LOG:
 	 unalias1(c, dst, args[0], emit_log_noalias);
 	 break;
-      case OPCODE_LG2:
+      case TGSI_OPCODE_LG2:
 	 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
 	 break;
-      case OPCODE_LIT:
+      case TGSI_OPCODE_LIT:
 	 unalias1(c, dst, args[0], emit_lit_noalias);
 	 break;
-      case OPCODE_LRP:
+      case TGSI_OPCODE_LRP:
 	 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
 	 break;
-      case OPCODE_MAD:
+      case TGSI_OPCODE_MAD:
 	 brw_MOV(p, brw_acc_reg(), args[2]);
 	 brw_MAC(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_MAX:
+      case TGSI_OPCODE_MAX:
 	 emit_max(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_MIN:
+      case TGSI_OPCODE_MIN:
 	 emit_min(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_MOV:
+      case TGSI_OPCODE_MOV:
 	 brw_MOV(p, dst, args[0]);
 	 break;
-      case OPCODE_MUL:
+      case TGSI_OPCODE_MUL:
 	 brw_MUL(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_POW:
+      case TGSI_OPCODE_POW:
 	 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL); 
 	 break;
-      case OPCODE_RCP:
+      case TGSI_OPCODE_RCP:
 	 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
 	 break;
-      case OPCODE_RSQ:
+      case TGSI_OPCODE_RSQ:
 	 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
 	 break;
-
-      case OPCODE_SEQ:
+      case TGSI_OPCODE_SEQ:
          emit_seq(p, dst, args[0], args[1]);
          break;
-      case OPCODE_SIN:
+      case TGSI_OPCODE_SIN:
 	 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
 	 break;
-      case OPCODE_SNE:
+      case TGSI_OPCODE_SNE:
          emit_sne(p, dst, args[0], args[1]);
          break;
-      case OPCODE_SGE:
+      case TGSI_OPCODE_SGE:
 	 emit_sge(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_SGT:
+      case TGSI_OPCODE_SGT:
          emit_sgt(p, dst, args[0], args[1]);
          break;
-      case OPCODE_SLT:
+      case TGSI_OPCODE_SLT:
 	 emit_slt(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_SLE:
+      case TGSI_OPCODE_SLE:
          emit_sle(p, dst, args[0], args[1]);
          break;
-      case OPCODE_SUB:
+      case TGSI_OPCODE_SUB:
 	 brw_ADD(p, dst, args[0], negate(args[1]));
 	 break;
-      case OPCODE_SWZ:
-	 /* The args[0] value can't be used here as it won't have
-	  * correctly encoded the full swizzle:
-	  */
-	 emit_swz(c, dst, inst);
-	 break;
-      case OPCODE_TRUNC:
+      case TGSI_OPCODE_TRUNC:
          /* round toward zero */
 	 brw_RNDZ(p, dst, args[0]);
 	 break;
-      case OPCODE_XPD:
+      case TGSI_OPCODE_XPD:
 	 emit_xpd(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_IF:
+      case TGSI_OPCODE_IF:
 	 assert(if_depth < MAX_IF_DEPTH);
 	 if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
 	 /* Note that brw_IF smashes the predicate_control field. */
 	 if_inst[if_depth]->header.predicate_control = get_predicate(inst);
 	 if_depth++;
 	 break;
-      case OPCODE_ELSE:
+      case TGSI_OPCODE_ELSE:
 	 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
 	 break;
-      case OPCODE_ENDIF:
+      case TGSI_OPCODE_ENDIF:
          assert(if_depth > 0);
 	 brw_ENDIF(p, if_inst[--if_depth]);
 	 break;			
-      case OPCODE_BGNLOOP:
+      case TGSI_OPCODE_BGNLOOP:
          loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
          break;
-      case OPCODE_BRK:
+      case TGSI_OPCODE_BRK:
 	 brw_set_predicate_control(p, get_predicate(inst));
          brw_BREAK(p);
 	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
          break;
-      case OPCODE_CONT:
+      case TGSI_OPCODE_CONT:
 	 brw_set_predicate_control(p, get_predicate(inst));
          brw_CONT(p);
          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
          break;
-      case OPCODE_ENDLOOP: 
+      case TGSI_OPCODE_ENDLOOP: 
          {
             struct brw_instruction *inst0, *inst1;
 	    GLuint br = 1;
@@ -1550,23 +1453,23 @@ void brw_vs_emit(struct brw_vs_compile *c )
             /* patch all the BREAK/CONT instructions from last BEGINLOOP */
             while (inst0 > loop_inst[loop_depth]) {
                inst0--;
-               if (inst0->header.opcode == BRW_OPCODE_BREAK) {
+               if (inst0->header.opcode == BRW_TGSI_OPCODE_BREAK) {
                   inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
                   inst0->bits3.if_else.pop_count = 0;
                }
-               else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
+               else if (inst0->header.opcode == BRW_TGSI_OPCODE_CONTINUE) {
                   inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
                   inst0->bits3.if_else.pop_count = 0;
                }
             }
          }
          break;
-      case OPCODE_BRA:
+      case TGSI_OPCODE_BRA:
 	 brw_set_predicate_control(p, get_predicate(inst));
          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
 	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
          break;
-      case OPCODE_CAL:
+      case TGSI_OPCODE_CAL:
 	 brw_set_access_mode(p, BRW_ALIGN_1);
 	 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
 	 brw_set_access_mode(p, BRW_ALIGN_16);
@@ -1575,27 +1478,27 @@ void brw_vs_emit(struct brw_vs_compile *c )
          brw_save_call(p, inst->Comment, p->nr_insn);
 	 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
          break;
-      case OPCODE_RET:
+      case TGSI_OPCODE_RET:
 	 brw_ADD(p, get_addr_reg(stack_index),
 			 get_addr_reg(stack_index), brw_imm_d(-4));
 	 brw_set_access_mode(p, BRW_ALIGN_1);
          brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
 	 brw_set_access_mode(p, BRW_ALIGN_16);
 	 break;
-      case OPCODE_END:	
+      case TGSI_OPCODE_END:	
          end_offset = p->nr_insn;
          /* this instruction will get patched later to jump past subroutine
           * code, etc.
           */
          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
          break;
-      case OPCODE_PRINT:
+      case TGSI_OPCODE_PRINT:
          /* no-op */
          break;
-      case OPCODE_BGNSUB:
+      case TGSI_OPCODE_BGNSUB:
          brw_save_label(p, inst->Comment, p->nr_insn);
          break;
-      case OPCODE_ENDSUB:
+      case TGSI_OPCODE_ENDSUB:
          /* no-op */
          break;
       default:
@@ -1618,33 +1521,6 @@ void brw_vs_emit(struct brw_vs_compile *c )
 	 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
       }
 
-      if ((inst->DstReg.File == PROGRAM_OUTPUT)
-          && (inst->DstReg.Index != VERT_RESULT_HPOS)
-          && c->output_regs[inst->DstReg.Index].used_in_src) {
-         brw_MOV(p, get_dst(c, inst->DstReg), dst);
-      }
-
-      /* Result color clamping.
-       *
-       * When destination register is an output register and
-       * it's primary/secondary front/back color, we have to clamp
-       * the result to [0,1]. This is done by enabling the
-       * saturation bit for the last instruction.
-       *
-       * We don't use brw_set_saturate() as it modifies
-       * p->current->header.saturate, which affects all the subsequent
-       * instructions. Instead, we directly modify the header
-       * of the last (already stored) instruction.
-       */
-      if (inst->DstReg.File == PROGRAM_OUTPUT) {
-         if ((inst->DstReg.Index == VERT_RESULT_COL0)
-             || (inst->DstReg.Index == VERT_RESULT_COL1)
-             || (inst->DstReg.Index == VERT_RESULT_BFC0)
-             || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
-            p->store[p->nr_insn-1].header.saturate = 1;
-         }
-      }
-
       release_tmps(c);
    }
 
diff --git a/src/gallium/drivers/i965/brw_wm.c b/src/gallium/drivers/i965/brw_wm.c
index 2292de94c4..20d31880b4 100644
--- a/src/gallium/drivers/i965/brw_wm.c
+++ b/src/gallium/drivers/i965/brw_wm.c
@@ -269,61 +269,46 @@ static void brw_wm_populate_key( struct brw_context *brw,
 		    uses_depth,
 		    key);
 
+   /* Revisit this, figure out if it's really useful, and either push
+    * it into the state tracker so that everyone benefits (use to
+    * create fs variants with TEX rather than TXP), or discard.
+    */
+   key->proj_attrib_mask = ~0; /*brw->wm.input_size_masks[4-1];*/
 
-   /* BRW_NEW_WM_INPUT_DIMENSIONS */
-   key->proj_attrib_mask = brw->wm.input_size_masks[4-1];
-
-   /* _NEW_LIGHT */
-   key->flat_shade = (ctx->Light.ShadeModel == GL_FLAT);
+   /* PIPE_NEW_RAST */
+   key->flat_shade = brw->rast.flat_shade;
 
-   /* _NEW_HINT */
-   key->linear_color = (ctx->Hint.PerspectiveCorrection == GL_FASTEST);
+   /* This can be determined by looking at the INTERP mode of each input decl.
+    */
+   key->linear_color = 0;
 
    /* _NEW_TEXTURE */
    for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
-      const struct gl_texture_unit *unit = &ctx->Texture.Unit[i];
-
-      if (unit->_ReallyEnabled) {
-         const struct gl_texture_object *t = unit->_Current;
-         const struct gl_texture_image *img = t->Image[0][t->BaseLevel];
+      if (i < brw->nr_textures) {
+	 const struct gl_texture_unit *unit = &ctx->Texture.Unit[i];
+	 const struct gl_texture_object *t = unit->_Current;
+	 const struct gl_texture_image *img = t->Image[0][t->BaseLevel];
+	 
 	 if (img->InternalFormat == GL_YCBCR_MESA) {
 	    key->yuvtex_mask |= 1 << i;
 	    if (img->TexFormat->MesaFormat == MESA_FORMAT_YCBCR)
-		key->yuvtex_swap_mask |= 1 << i;
+	       key->yuvtex_swap_mask |= 1 << i;
 	 }
 
-         key->tex_swizzles[i] = t->_Swizzle;
+	 key->tex_swizzles[i] = t->_Swizzle;
+	 
+	 if (0)
+	    key->shadowtex_mask |= 1<<i;
       }
       else {
          key->tex_swizzles[i] = SWIZZLE_NOOP;
       }
    }
 
-   /* Shadow */
-   key->shadowtex_mask = fp->program.Base.ShadowSamplers;
 
-   /* _NEW_BUFFERS */
-   /*
-    * Include the draw buffer origin and height so that we can calculate
-    * fragment position values relative to the bottom left of the drawable,
-    * from the incoming screen origin relative position we get as part of our
-    * payload.
-    *
-    * We could avoid recompiling by including this as a constant referenced by
-    * our program, but if we were to do that it would also be nice to handle
-    * getting that constant updated at batchbuffer submit time (when we
-    * hold the lock and know where the buffer really is) rather than at emit
-    * time when we don't hold the lock and are just guessing.  We could also
-    * just avoid using this as key data if the program doesn't use
-    * fragment.position.
-    *
-    * This pretty much becomes moot with DRI2 and redirected buffers anyway,
-    * as our origins will always be zero then.
-    */
+   /* _NEW_FRAMEBUFFER */
    if (brw->intel.driDrawable != NULL) {
-      key->origin_x = brw->intel.driDrawable->x;
-      key->origin_y = brw->intel.driDrawable->y;
-      key->drawable_height = brw->intel.driDrawable->h;
+      key->drawable_height = brw->fb.cbufs[0].height;
    }
 
    /* CACHE_NEW_VS_PROG */
diff --git a/src/gallium/drivers/i965/brw_wm.h b/src/gallium/drivers/i965/brw_wm.h
index 872b1f3ecf..756a680150 100644
--- a/src/gallium/drivers/i965/brw_wm.h
+++ b/src/gallium/drivers/i965/brw_wm.h
@@ -76,7 +76,6 @@ struct brw_wm_prog_key {
    GLuint tex_swizzles[BRW_MAX_TEX_UNIT];
 
    GLuint program_string_id:32;
-   GLuint origin_x, origin_y;
    GLuint drawable_height;
    GLuint vp_outputs_written;
 };
diff --git a/src/gallium/drivers/i965/brw_wm_emit.c b/src/gallium/drivers/i965/brw_wm_emit.c
index bf80a2942a..9c47c46a3d 100644
--- a/src/gallium/drivers/i965/brw_wm_emit.c
+++ b/src/gallium/drivers/i965/brw_wm_emit.c
@@ -125,23 +125,21 @@ static void emit_wpos_xy(struct brw_wm_compile *c,
 {
    struct brw_compile *p = &c->func;
 
-   /* Calculate the pixel offset from window bottom left into destination
-    * X and Y channels.
-    */
    if (mask & WRITEMASK_X) {
-      /* X' = X - origin */
-      brw_ADD(p,
+      /* X' = X */
+      brw_MOV(p,
 	      dst[0],
-	      retype(arg0[0], BRW_REGISTER_TYPE_W),
-	      brw_imm_d(0 - c->key.origin_x));
+	      retype(arg0[0], BRW_REGISTER_TYPE_W));
    }
 
+   /* XXX: is this needed any more, or is this a NOOP?
+    */
    if (mask & WRITEMASK_Y) {
-      /* Y' = height - (Y - origin_y) = height + origin_y - Y */
+      /* Y' = height - 1 - Y */
       brw_ADD(p,
 	      dst[1],
 	      negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
-	      brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
+	      brw_imm_d(c->key.drawable_height - 1));
    }
 }
 
@@ -1376,7 +1374,6 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 break;
 
       case OPCODE_MOV:
-      case OPCODE_SWZ:
 	 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
 	 break;
 
diff --git a/src/gallium/drivers/i965/brw_wm_fp.c b/src/gallium/drivers/i965/brw_wm_fp.c
index 4e3edfbbff..5f47d86f71 100644
--- a/src/gallium/drivers/i965/brw_wm_fp.c
+++ b/src/gallium/drivers/i965/brw_wm_fp.c
@@ -30,25 +30,12 @@
   */
                
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
+#include "pipe/p_shader_constants.h"
+
 #include "brw_context.h"
 #include "brw_wm.h"
 #include "brw_util.h"
 
-#include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
-#include "shader/prog_statevars.h"
-
-
-/** An invalid texture target */
-#define TEX_TARGET_NONE NUM_TEXTURE_TARGETS
-
-/** An invalid texture unit */
-#define TEX_UNIT_NONE BRW_MAX_TEX_UNIT
-
-#define FIRST_INTERNAL_TEMP MAX_NV_FRAGMENT_PROGRAM_TEMPS
 
 #define X    0
 #define Y    1
@@ -68,11 +55,6 @@ static const char *wm_opcode_strings[] = {
    "FRONTFACING",
 };
 
-#if 0
-static const char *wm_file_strings[] = {   
-   "PAYLOAD"
-};
-#endif
 
 
 /***********************************************************************
@@ -165,13 +147,13 @@ static struct prog_dst_register get_temp( struct brw_wm_compile *c )
    }
 
    c->fp_temp |= 1<<(bit-1);
-   return dst_reg(PROGRAM_TEMPORARY, FIRST_INTERNAL_TEMP+(bit-1));
+   return dst_reg(PROGRAM_TEMPORARY, c->first_internal_temp+(bit-1));
 }
 
 
 static void release_temp( struct brw_wm_compile *c, struct prog_dst_register temp )
 {
-   c->fp_temp &= ~(1 << (temp.Index - FIRST_INTERNAL_TEMP));
+   c->fp_temp &= ~(1 << (temp.Index - c->first_internal_temp));
 }
 
 
@@ -192,58 +174,29 @@ static struct prog_instruction *emit_insn(struct brw_wm_compile *c,
    return inst;
 }
 
-static struct prog_instruction * emit_tex_op(struct brw_wm_compile *c,
-				       GLuint op,
-				       struct prog_dst_register dest,
-				       GLuint saturate,
-				       GLuint tex_src_unit,
-				       GLuint tex_src_target,
-				       GLuint tex_shadow,
-				       struct prog_src_register src0,
-				       struct prog_src_register src1,
-				       struct prog_src_register src2 )
+static struct prog_instruction * emit_op(struct brw_wm_compile *c,
+					 GLuint op,
+					 struct prog_dst_register dest,
+					 GLuint saturate,
+					 struct prog_src_register src0,
+					 struct prog_src_register src1,
+					 struct prog_src_register src2 )
 {
    struct prog_instruction *inst = get_fp_inst(c);
       
-   assert(tex_src_unit < BRW_MAX_TEX_UNIT ||
-          tex_src_unit == TEX_UNIT_NONE);
-   assert(tex_src_target < NUM_TEXTURE_TARGETS ||
-          tex_src_target == TEX_TARGET_NONE);
-
-   /* update mask of which texture units are referenced by this program */
-   if (tex_src_unit != TEX_UNIT_NONE)
-      c->fp->tex_units_used |= (1 << tex_src_unit);
-
    memset(inst, 0, sizeof(*inst));
 
    inst->Opcode = op;
    inst->DstReg = dest;
    inst->SaturateMode = saturate;   
-   inst->TexSrcUnit = tex_src_unit;
-   inst->TexSrcTarget = tex_src_target;
-   inst->TexShadow = tex_shadow;
    inst->SrcReg[0] = src0;
    inst->SrcReg[1] = src1;
    inst->SrcReg[2] = src2;
    return inst;
 }
-   
-
-static struct prog_instruction * emit_op(struct brw_wm_compile *c,
-				       GLuint op,
-				       struct prog_dst_register dest,
-				       GLuint saturate,
-				       struct prog_src_register src0,
-				       struct prog_src_register src1,
-				       struct prog_src_register src2 )
-{
-   return emit_tex_op(c, op, dest, saturate,
-                      TEX_UNIT_NONE, TEX_TARGET_NONE, 0,  /* unit, tgt, shadow */
-                      src0, src1, src2);
-}
 
 
-/* Many Mesa opcodes produce the same value across all the result channels.
+/* Many opcodes produce the same value across all the result channels.
  * We'd rather not have to support that splatting in the opcode implementations,
  * and brw_wm_pass*.c wants to optimize them out by shuffling references around
  * anyway.  We can easily get both by emitting the opcode to one channel, and
@@ -267,7 +220,7 @@ static struct prog_instruction *emit_scalar_insn(struct brw_wm_compile *c,
    other_channel_mask = inst0->DstReg.WriteMask & ~(1 << dst_chan);
    if (other_channel_mask != 0) {
       inst = emit_op(c,
-		     OPCODE_MOV,
+		     TGSI_OPCODE_MOV,
 		     dst_mask(inst0->DstReg, other_channel_mask),
 		     0,
 		     src_swizzle1(src_reg_from_dst(inst0->DstReg), dst_chan),
@@ -356,7 +309,9 @@ static struct prog_src_register get_pixel_w( struct brw_wm_compile *c )
 }
 
 static void emit_interp( struct brw_wm_compile *c,
-			 GLuint idx )
+			 GLuint semantic,
+			 GLuint semantic_index,
+			 GLuint interp_mode )
 {
    struct prog_dst_register dst = dst_reg(PROGRAM_INPUT, idx);
    struct prog_src_register interp = src_reg(PROGRAM_PAYLOAD, idx);
@@ -366,7 +321,7 @@ static void emit_interp( struct brw_wm_compile *c,
     * multiplied by 1/W in the SF program, and LINTERP on those
     * which have not:
     */
-   switch (idx) {
+   switch (semantic) {
    case FRAG_ATTRIB_WPOS:
       /* Have to treat wpos.xy specially:
        */
@@ -390,8 +345,8 @@ static void emit_interp( struct brw_wm_compile *c,
 	      deltas,
 	      src_undef());
       break;
-   case FRAG_ATTRIB_COL0:
-   case FRAG_ATTRIB_COL1:
+
+   case TGSI_SEMANTIC_COLOR:
       if (c->key.flat_shade) {
 	 emit_op(c,
 		 WM_CINTERP,
@@ -402,25 +357,13 @@ static void emit_interp( struct brw_wm_compile *c,
 		 src_undef());
       }
       else {
-         if (c->key.linear_color) {
-            emit_op(c,
-                    WM_LINTERP,
-                    dst,
-                    0,
-                    interp,
-                    deltas,
-                    src_undef());
-         }
-         else {
-            /* perspective-corrected color interpolation */
-            emit_op(c,
-                    WM_PINTERP,
-                    dst,
-                    0,
-                    interp,
-                    deltas,
-                    get_pixel_w(c));
-         }
+	 emit_op(c,
+		 translate_interp_mode(interp_mode),
+		 dst,
+		 0,
+		 interp,
+		 deltas,
+		 src_undef());
       }
       break;
    case FRAG_ATTRIB_FOGC:
@@ -434,7 +377,7 @@ static void emit_interp( struct brw_wm_compile *c,
 	      get_pixel_w(c));
 
       emit_op(c,
-	      OPCODE_MOV,
+	      TGSI_OPCODE_MOV,
 	      dst_mask(dst, WRITEMASK_YZW),
 	      0,
 	      src_swizzle(interp,
@@ -468,7 +411,7 @@ static void emit_interp( struct brw_wm_compile *c,
 	      get_pixel_w(c));
 
       emit_op(c,
-	      OPCODE_MOV,
+	      TGSI_OPCODE_MOV,
 	      dst_mask(dst, WRITEMASK_ZW),
 	      0,
 	      src_swizzle(interp,
@@ -482,7 +425,7 @@ static void emit_interp( struct brw_wm_compile *c,
 
    default:
       emit_op(c,
-	      WM_PINTERP,
+	      translate_interp_mode(interp_mode),
 	      dst,
 	      0,
 	      interp,
@@ -490,8 +433,6 @@ static void emit_interp( struct brw_wm_compile *c,
 	      get_pixel_w(c));
       break;
    }
-
-   c->fp_interp_emitted |= 1<<idx;
 }
 
 /***********************************************************************
@@ -581,7 +522,7 @@ static void precalc_dst( struct brw_wm_compile *c,
       /* dst.y = mul src0.y, src1.y
        */
       emit_op(c,
-	      OPCODE_MUL,
+	      TGSI_OPCODE_MUL,
 	      dst_mask(dst, WRITEMASK_Y),
 	      inst->SaturateMode,
 	      src0,
@@ -596,7 +537,7 @@ static void precalc_dst( struct brw_wm_compile *c,
       /* dst.xz = swz src0.1zzz
        */
       swz = emit_op(c,
-		    OPCODE_SWZ,
+		    TGSI_OPCODE_MOV,
 		    dst_mask(dst, WRITEMASK_XZ),
 		    inst->SaturateMode,
 		    src_swizzle(src0, SWIZZLE_ONE, z, z, z),
@@ -609,7 +550,7 @@ static void precalc_dst( struct brw_wm_compile *c,
       /* dst.w = mov src1.w
        */
       emit_op(c,
-	      OPCODE_MOV,
+	      TGSI_OPCODE_MOV,
 	      dst_mask(dst, WRITEMASK_W),
 	      inst->SaturateMode,
 	      src1,
@@ -631,7 +572,7 @@ static void precalc_lit( struct brw_wm_compile *c,
       /* dst.xw = swz src0.1111
        */
       swz = emit_op(c,
-		    OPCODE_SWZ,
+		    TGSI_OPCODE_MOV,
 		    dst_mask(dst, WRITEMASK_XW),
 		    0,
 		    src_swizzle1(src0, SWIZZLE_ONE),
@@ -643,7 +584,7 @@ static void precalc_lit( struct brw_wm_compile *c,
 
    if (dst.WriteMask & WRITEMASK_YZ) {
       emit_op(c,
-	      OPCODE_LIT,
+	      TGSI_OPCODE_LIT,
 	      dst_mask(dst, WRITEMASK_YZ),
 	      inst->SaturateMode,
 	      src0,
@@ -681,7 +622,7 @@ static void precalc_tex( struct brw_wm_compile *c,
        coord = src_reg_from_dst(tmpcoord);
 
        /* tmpcoord = src0 (i.e.: coord = src0) */
-       out = emit_op(c, OPCODE_MOV,
+       out = emit_op(c, TGSI_OPCODE_MOV,
                      tmpcoord,
                      0,
                      src0,
@@ -691,7 +632,7 @@ static void precalc_tex( struct brw_wm_compile *c,
        out->SrcReg[0].Abs = 1;
 
        /* tmp0 = MAX(coord.X, coord.Y) */
-       emit_op(c, OPCODE_MAX,
+       emit_op(c, TGSI_OPCODE_MAX,
                tmp0,
                0,
                src_swizzle1(coord, X),
@@ -699,7 +640,7 @@ static void precalc_tex( struct brw_wm_compile *c,
                src_undef());
 
        /* tmp1 = MAX(tmp0, coord.Z) */
-       emit_op(c, OPCODE_MAX,
+       emit_op(c, TGSI_OPCODE_MAX,
                tmp1,
                0,
                tmp0src,
@@ -707,7 +648,7 @@ static void precalc_tex( struct brw_wm_compile *c,
                src_undef());
 
        /* tmp0 = 1 / tmp1 */
-       emit_op(c, OPCODE_RCP,
+       emit_op(c, TGSI_OPCODE_RCP,
                dst_mask(tmp0, WRITEMASK_X),
                0,
                tmp1src,
@@ -715,7 +656,7 @@ static void precalc_tex( struct brw_wm_compile *c,
                src_undef());
 
        /* tmpCoord = src0 * tmp0 */
-       emit_op(c, OPCODE_MUL,
+       emit_op(c, TGSI_OPCODE_MUL,
                tmpcoord,
                0,
                src0,
@@ -738,7 +679,7 @@ static void precalc_tex( struct brw_wm_compile *c,
       /* coord.xy   = MUL inst->SrcReg[0], { 1/width, 1/height }
        */
       emit_op(c,
-	      OPCODE_MUL,
+	      TGSI_OPCODE_MUL,
 	      tmpcoord,
 	      0,
 	      inst->SrcReg[0],
@@ -785,7 +726,7 @@ static void precalc_tex( struct brw_wm_compile *c,
       /* tmp     = TEX ...
        */
       emit_tex_op(c, 
-                  OPCODE_TEX,
+                  TGSI_OPCODE_TEX,
                   tmp,
                   inst->SaturateMode,
                   unit,
@@ -798,7 +739,7 @@ static void precalc_tex( struct brw_wm_compile *c,
       /* tmp.xyz =  ADD TMP, C0
        */
       emit_op(c,
-	      OPCODE_ADD,
+	      TGSI_OPCODE_ADD,
 	      dst_mask(tmp, WRITEMASK_XYZ),
 	      0,
 	      tmpsrc,
@@ -809,7 +750,7 @@ static void precalc_tex( struct brw_wm_compile *c,
        */
 
       emit_op(c,
-	      OPCODE_MUL,
+	      TGSI_OPCODE_MUL,
 	      dst_mask(tmp, WRITEMASK_Y),
 	      0,
 	      tmpsrc,
@@ -824,7 +765,7 @@ static void precalc_tex( struct brw_wm_compile *c,
        */
 
       emit_op(c,
-	      OPCODE_MAD,
+	      TGSI_OPCODE_MAD,
 	      dst_mask(dst, WRITEMASK_XYZ),
 	      0,
 	      swap_uv?src_swizzle(tmpsrc, Z,Z,X,X):src_swizzle(tmpsrc, X,X,Z,Z),
@@ -834,7 +775,7 @@ static void precalc_tex( struct brw_wm_compile *c,
       /*  RGB.y   = MAD YUV.z, C1.w, RGB.y
        */
       emit_op(c,
-	      OPCODE_MAD,
+	      TGSI_OPCODE_MAD,
 	      dst_mask(dst, WRITEMASK_Y),
 	      0,
 	      src_swizzle1(tmpsrc, Z),
@@ -846,7 +787,7 @@ static void precalc_tex( struct brw_wm_compile *c,
    else {
       /* ordinary RGBA tex instruction */
       emit_tex_op(c, 
-                  OPCODE_TEX,
+                  TGSI_OPCODE_TEX,
                   inst->DstReg,
                   inst->SaturateMode,
                   unit,
@@ -861,7 +802,7 @@ static void precalc_tex( struct brw_wm_compile *c,
    if (c->key.tex_swizzles[unit] != SWIZZLE_NOOP) {
       /* swizzle the result of the TEX instruction */
       struct prog_src_register tmpsrc = src_reg_from_dst(inst->DstReg);
-      emit_op(c, OPCODE_SWZ,
+      emit_op(c, TGSI_OPCODE_MOV,
               inst->DstReg,
               SATURATE_OFF, /* saturate already done above */
               src_swizzle4(tmpsrc, c->key.tex_swizzles[unit]),
@@ -884,7 +825,7 @@ static GLboolean projtex( struct brw_wm_compile *c,
    const struct prog_src_register src = inst->SrcReg[0];
    GLboolean retVal;
 
-   assert(inst->Opcode == OPCODE_TXP);
+   assert(inst->Opcode == TGSI_OPCODE_TXP);
 
    /* Only try to detect the simplest cases.  Could detect (later)
     * cases where we are trying to emit code like RCP {1.0}, MUL x,
@@ -921,7 +862,7 @@ static void precalc_txp( struct brw_wm_compile *c,
       /* tmp0.w = RCP inst.arg[0][3]
        */
       emit_op(c,
-	      OPCODE_RCP,
+	      TGSI_OPCODE_RCP,
 	      dst_mask(tmp, WRITEMASK_W),
 	      0,
 	      src_swizzle1(src0, GET_SWZ(src0.Swizzle, W)),
@@ -931,7 +872,7 @@ static void precalc_txp( struct brw_wm_compile *c,
       /* tmp0.xyz =  MUL inst.arg[0], tmp0.wwww
        */
       emit_op(c,
-	      OPCODE_MUL,
+	      TGSI_OPCODE_MUL,
 	      dst_mask(tmp, WRITEMASK_XYZ),
 	      0,
 	      src0,
@@ -1015,6 +956,7 @@ static void validate_src_regs( struct brw_wm_compile *c,
 	 GLuint idx = inst->SrcReg[i].Index;
 	 if (!(c->fp_interp_emitted & (1<<idx))) {
 	    emit_interp(c, idx);
+	    c->fp_interp_emitted |= 1<<idx;
 	 }
       }
    }
@@ -1094,71 +1036,64 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
        */
 
       switch (inst->Opcode) {
-      case OPCODE_SWZ: 
+      case TGSI_OPCODE_ABS:
 	 out = emit_insn(c, inst);
-	 out->Opcode = OPCODE_MOV;
-	 break;
-	 
-      case OPCODE_ABS:
-	 out = emit_insn(c, inst);
-	 out->Opcode = OPCODE_MOV;
+	 out->Opcode = TGSI_OPCODE_MOV;
 	 out->SrcReg[0].Negate = NEGATE_NONE;
 	 out->SrcReg[0].Abs = 1;
 	 break;
 
-      case OPCODE_SUB: 
+      case TGSI_OPCODE_SUB: 
 	 out = emit_insn(c, inst);
-	 out->Opcode = OPCODE_ADD;
+	 out->Opcode = TGSI_OPCODE_ADD;
 	 out->SrcReg[1].Negate ^= NEGATE_XYZW;
 	 break;
 
-      case OPCODE_SCS: 
+      case TGSI_OPCODE_SCS: 
 	 out = emit_insn(c, inst);
 	 /* This should probably be done in the parser. 
 	  */
 	 out->DstReg.WriteMask &= WRITEMASK_XY;
 	 break;
 	 
-      case OPCODE_DST:
+      case TGSI_OPCODE_DST:
 	 precalc_dst(c, inst);
 	 break;
 
-      case OPCODE_LIT:
+      case TGSI_OPCODE_LIT:
 	 precalc_lit(c, inst);
 	 break;
 
-      case OPCODE_TEX:
+      case TGSI_OPCODE_TEX:
 	 precalc_tex(c, inst);
 	 break;
 
-      case OPCODE_TXP:
+      case TGSI_OPCODE_TXP:
 	 precalc_txp(c, inst);
 	 break;
 
-      case OPCODE_TXB:
+      case TGSI_OPCODE_TXB:
 	 out = emit_insn(c, inst);
 	 out->TexSrcUnit = fp->program.Base.SamplerUnits[inst->TexSrcUnit];
          assert(out->TexSrcUnit < BRW_MAX_TEX_UNIT);
 	 break;
 
-      case OPCODE_XPD: 
+      case TGSI_OPCODE_XPD: 
 	 out = emit_insn(c, inst);
 	 /* This should probably be done in the parser. 
 	  */
 	 out->DstReg.WriteMask &= WRITEMASK_XYZ;
 	 break;
 
-      case OPCODE_KIL: 
+      case TGSI_OPCODE_KIL: 
 	 out = emit_insn(c, inst);
 	 /* This should probably be done in the parser. 
 	  */
 	 out->DstReg.WriteMask = 0;
 	 break;
-      case OPCODE_END:
+      case TGSI_OPCODE_END:
 	 emit_fb_write(c);
 	 break;
-      case OPCODE_PRINT:
-	 break;
       default:
 	 if (brw_wm_is_scalar_result(inst->Opcode))
 	    emit_scalar_insn(c, inst);
diff --git a/src/gallium/drivers/i965/brw_wm_glsl.c b/src/gallium/drivers/i965/brw_wm_glsl.c
index c9fe1dd8ad..d836e2fb34 100644
--- a/src/gallium/drivers/i965/brw_wm_glsl.c
+++ b/src/gallium/drivers/i965/brw_wm_glsl.c
@@ -6,9 +6,6 @@
 #include "brw_eu.h"
 #include "brw_wm.h"
 
-enum _subroutine {
-    SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
-};
 
 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
                                   const struct prog_instruction *inst,
@@ -32,10 +29,6 @@ GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
 	    case OPCODE_CAL:
 	    case OPCODE_BRK:
 	    case OPCODE_RET:
-	    case OPCODE_NOISE1:
-	    case OPCODE_NOISE2:
-	    case OPCODE_NOISE3:
-	    case OPCODE_NOISE4:
 	    case OPCODE_BGNLOOP:
 		return GL_TRUE; 
 	    default:
@@ -1495,1036 +1488,7 @@ static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
 		   0, 16, 2 );
 }
 
-/* One-, two- and three-dimensional Perlin noise, similar to the description
-   in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
-static void noise1_sub( struct brw_wm_compile *c ) {
 
-    struct brw_compile *p = &c->func;
-    struct brw_reg param,
-	x0, x1, /* gradients at each end */       
-	t, tmp[ 2 ], /* float temporaries */
-	itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
-    int i;
-    int mark = mark_tmps( c );
-
-    x0 = alloc_tmp( c );
-    x1 = alloc_tmp( c );
-    t = alloc_tmp( c );
-    tmp[ 0 ] = alloc_tmp( c );
-    tmp[ 1 ] = alloc_tmp( c );
-    itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
-    itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
-    itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
-    itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
-    itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
-    
-    param = lookup_tmp( c, mark - 2 );
-
-    brw_set_access_mode( p, BRW_ALIGN_1 );
-
-    brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
-
-    /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
-       be hashed.  Also compute the remainder (offset within the unit
-       length), interleaved to reduce register dependency penalties. */
-    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
-    brw_FRC( p, param, param );
-    brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
-    brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
-    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
-
-    /* We're now ready to perform the hashing.  The two hashes are
-       interleaved for performance.  The hash function used is
-       designed to rapidly achieve avalanche and require only 32x16
-       bit multiplication, and 16-bit swizzles (which we get for
-       free).  We can't use immediate operands in the multiplies,
-       because immediates are permitted only in src1 and the 16-bit
-       factor is permitted only in src0. */
-    for( i = 0; i < 2; i++ )
-	brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
-    for( i = 0; i < 2; i++ )
-       brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
-		high_words( itmp[ i ] ) );
-    for( i = 0; i < 2; i++ )
-	brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
-    for( i = 0; i < 2; i++ )
-       brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
-		high_words( itmp[ i ] ) );
-    for( i = 0; i < 2; i++ )
-	brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
-    for( i = 0; i < 2; i++ )
-       brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
-		high_words( itmp[ i ] ) );
-
-    /* Now we want to initialise the two gradients based on the
-       hashes.  Format conversion from signed integer to float leaves
-       everything scaled too high by a factor of pow( 2, 31 ), but
-       we correct for that right at the end. */
-    brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
-    brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
-    brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
-
-    brw_MUL( p, x0, x0, param );
-    brw_MUL( p, x1, x1, t );
-    
-    /* We interpolate between the gradients using the polynomial
-       6t^5 - 15t^4 + 10t^3 (Perlin). */
-    brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
-    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
-    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
-    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
-    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
-    brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
-					   pipeline */
-    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
-    brw_MUL( p, param, tmp[ 0 ], param );
-    brw_MUL( p, x1, x1, param );
-    brw_ADD( p, x0, x0, x1 );    
-    /* scale by pow( 2, -30 ), to compensate for the format conversion
-       above and an extra factor of 2 so that a single gradient covers
-       the [-1,1] range */
-    brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
-
-    release_tmps( c, mark );
-}
-
-static void emit_noise1( struct brw_wm_compile *c,
-			 const struct prog_instruction *inst )
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src, param, dst;
-    GLuint mask = inst->DstReg.WriteMask;
-    int i;
-    int mark = mark_tmps( c );
-
-    assert( mark == 0 );
-    
-    src = get_src_reg( c, inst, 0, 0 );
-
-    param = alloc_tmp( c );
-
-    brw_MOV( p, param, src );
-
-    invoke_subroutine( c, SUB_NOISE1, noise1_sub );
-    
-    /* Fill in the result: */
-    brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    brw_MOV( p, dst, param );
-	}
-    }
-    if( inst->SaturateMode == SATURATE_ZERO_ONE )
-	brw_set_saturate( p, 0 );
-    
-    release_tmps( c, mark );
-}
-    
-static void noise2_sub( struct brw_wm_compile *c ) {
-
-    struct brw_compile *p = &c->func;
-    struct brw_reg param0, param1,
-	x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */       
-	t, tmp[ 4 ], /* float temporaries */
-	itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
-    int i;
-    int mark = mark_tmps( c );
-
-    x0y0 = alloc_tmp( c );
-    x0y1 = alloc_tmp( c );
-    x1y0 = alloc_tmp( c );
-    x1y1 = alloc_tmp( c );
-    t = alloc_tmp( c );
-    for( i = 0; i < 4; i++ ) {
-	tmp[ i ] = alloc_tmp( c );
-	itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
-    }
-    itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
-    itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
-    itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
-    
-    param0 = lookup_tmp( c, mark - 3 );
-    param1 = lookup_tmp( c, mark - 2 );
-
-    brw_set_access_mode( p, BRW_ALIGN_1 );
-    
-    /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
-       be hashed.  Also compute the remainders (offsets within the unit
-       square), interleaved to reduce register dependency penalties. */
-    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
-    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
-    brw_FRC( p, param0, param0 );
-    brw_FRC( p, param1, param1 );
-    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
-    brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
-	     low_words( itmp[ 1 ] ) );
-    brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
-    brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
-    brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
-    brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
-    brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
-
-    /* We're now ready to perform the hashing.  The four hashes are
-       interleaved for performance.  The hash function used is
-       designed to rapidly achieve avalanche and require only 32x16
-       bit multiplication, and 16-bit swizzles (which we get for
-       free).  We can't use immediate operands in the multiplies,
-       because immediates are permitted only in src1 and the 16-bit
-       factor is permitted only in src0. */
-    for( i = 0; i < 4; i++ )
-	brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
-    for( i = 0; i < 4; i++ )
-	brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
-		 high_words( itmp[ i ] ) );
-    for( i = 0; i < 4; i++ )
-	brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
-    for( i = 0; i < 4; i++ )
-	brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
-		 high_words( itmp[ i ] ) );
-    for( i = 0; i < 4; i++ )
-	brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
-    for( i = 0; i < 4; i++ )
-	brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
-		 high_words( itmp[ i ] ) );
-
-    /* Now we want to initialise the four gradients based on the
-       hashes.  Format conversion from signed integer to float leaves
-       everything scaled too high by a factor of pow( 2, 15 ), but
-       we correct for that right at the end. */
-    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
-    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
-    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
-    brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
-    brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
-    
-    brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
-    brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
-    brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
-    brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
-    
-    brw_MUL( p, x1y0, x1y0, t );
-    brw_MUL( p, x1y1, x1y1, t );
-    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
-    brw_MUL( p, x0y0, x0y0, param0 );
-    brw_MUL( p, x0y1, x0y1, param0 );
-
-    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
-    brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
-    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
-    brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
-
-    brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
-    brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
-    
-    /* We interpolate between the gradients using the polynomial
-       6t^5 - 15t^4 + 10t^3 (Perlin). */
-    brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
-    brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
-    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
-    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
-    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
-    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
-    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
-						 pipeline */
-    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
-    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
-    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
-    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
-    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
-						 pipeline */
-    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
-    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
-    brw_MUL( p, param0, tmp[ 0 ], param0 );
-    brw_MUL( p, param1, tmp[ 1 ], param1 );
-    
-    /* Here we interpolate in the y dimension... */
-    brw_MUL( p, x0y1, x0y1, param1 );
-    brw_MUL( p, x1y1, x1y1, param1 );
-    brw_ADD( p, x0y0, x0y0, x0y1 );
-    brw_ADD( p, x1y0, x1y0, x1y1 );
-
-    /* And now in x.  There are horrible register dependencies here,
-       but we have nothing else to do. */
-    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
-    brw_MUL( p, x1y0, x1y0, param0 );
-    brw_ADD( p, x0y0, x0y0, x1y0 );
-    
-    /* scale by pow( 2, -15 ), as described above */
-    brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
-
-    release_tmps( c, mark );
-}
-
-static void emit_noise2( struct brw_wm_compile *c,
-			 const struct prog_instruction *inst )
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, src1, param0, param1, dst;
-    GLuint mask = inst->DstReg.WriteMask;
-    int i;
-    int mark = mark_tmps( c );
-
-    assert( mark == 0 );
-    
-    src0 = get_src_reg( c, inst, 0, 0 );
-    src1 = get_src_reg( c, inst, 0, 1 );
-
-    param0 = alloc_tmp( c );
-    param1 = alloc_tmp( c );
-
-    brw_MOV( p, param0, src0 );
-    brw_MOV( p, param1, src1 );
-
-    invoke_subroutine( c, SUB_NOISE2, noise2_sub );
-    
-    /* Fill in the result: */
-    brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    brw_MOV( p, dst, param0 );
-	}
-    }
-    if( inst->SaturateMode == SATURATE_ZERO_ONE )
-	brw_set_saturate( p, 0 );
-    
-    release_tmps( c, mark );
-}
-
-/**
- * The three-dimensional case is much like the one- and two- versions above,
- * but since the number of corners is rapidly growing we now pack 16 16-bit
- * hashes into each register to extract more parallelism from the EUs.
- */
-static void noise3_sub( struct brw_wm_compile *c ) {
-
-    struct brw_compile *p = &c->func;
-    struct brw_reg param0, param1, param2,
-	x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
-	xi, yi, zi, /* interpolation coefficients */
-	t, tmp[ 8 ], /* float temporaries */
-	itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
-	wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
-    int i;
-    int mark = mark_tmps( c );
-
-    x0y0 = alloc_tmp( c );
-    x0y1 = alloc_tmp( c );
-    x1y0 = alloc_tmp( c );
-    x1y1 = alloc_tmp( c );
-    xi = alloc_tmp( c );
-    yi = alloc_tmp( c );
-    zi = alloc_tmp( c );
-    t = alloc_tmp( c );
-    for( i = 0; i < 8; i++ ) {
-	tmp[ i ] = alloc_tmp( c );
-	itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
-	wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
-    }
-    
-    param0 = lookup_tmp( c, mark - 4 );
-    param1 = lookup_tmp( c, mark - 3 );
-    param2 = lookup_tmp( c, mark - 2 );
-
-    brw_set_access_mode( p, BRW_ALIGN_1 );
-    
-    /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
-       be hashed.  Also compute the remainders (offsets within the unit
-       cube), interleaved to reduce register dependency penalties. */
-    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
-    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
-    brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
-    brw_FRC( p, param0, param0 );
-    brw_FRC( p, param1, param1 );
-    brw_FRC( p, param2, param2 );
-    /* Since we now have only 16 bits of precision in the hash, we must
-       be more careful about thorough mixing to maintain entropy as we
-       squash the input vector into a small scalar. */
-    brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
-    brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
-    brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
-	     brw_imm_uw( 0x9B93 ) );
-    brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
-	     brw_imm_uw( 0xBC8F ) );
-
-    /* Temporarily disable the execution mask while we work with ExecSize=16
-       channels (the mask is set for ExecSize=8 and is probably incorrect).
-       Although this might cause execution of unwanted channels, the code
-       writes only to temporary registers and has no side effects, so
-       disabling the mask is harmless. */
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
-    brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
-    brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
-
-    /* We're now ready to perform the hashing.  The eight hashes are
-       interleaved for performance.  The hash function used is
-       designed to rapidly achieve avalanche and require only 16x16
-       bit multiplication, and 8-bit swizzles (which we get for
-       free). */
-    for( i = 0; i < 4; i++ )
-	brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
-    for( i = 0; i < 4; i++ )
-	brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
-		 odd_bytes( wtmp[ i ] ) );
-    for( i = 0; i < 4; i++ )
-	brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
-    for( i = 0; i < 4; i++ )
-	brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
-		 odd_bytes( wtmp[ i ] ) );
-    brw_pop_insn_state( p );
-
-    /* Now we want to initialise the four rear gradients based on the
-       hashes.  Format conversion from signed integer to float leaves
-       everything scaled too high by a factor of pow( 2, 15 ), but
-       we correct for that right at the end. */
-    /* x component */
-    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
-    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
-    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
-    brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
-    brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
-
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
-    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
-    brw_pop_insn_state( p );
-    
-    brw_MUL( p, x1y0, x1y0, t );
-    brw_MUL( p, x1y1, x1y1, t );
-    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
-    brw_MUL( p, x0y0, x0y0, param0 );
-    brw_MUL( p, x0y1, x0y1, param0 );
-
-    /* y component */
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
-    
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
-    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
-    brw_pop_insn_state( p );
-
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
-    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
-    
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    
-    /* z component */
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
-
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
-    
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-    
-    /* We interpolate between the gradients using the polynomial
-       6t^5 - 15t^4 + 10t^3 (Perlin). */
-    brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
-    brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
-    brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
-    brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
-    brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
-    brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
-    brw_MUL( p, xi, xi, param0 );
-    brw_MUL( p, yi, yi, param1 );
-    brw_MUL( p, zi, zi, param2 );
-    brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
-    brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
-    brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
-    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
-    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
-    brw_MUL( p, xi, xi, param0 );
-    brw_MUL( p, yi, yi, param1 );
-    brw_MUL( p, zi, zi, param2 );
-    brw_MUL( p, xi, xi, param0 );
-    brw_MUL( p, yi, yi, param1 );
-    brw_MUL( p, zi, zi, param2 );
-    brw_MUL( p, xi, xi, param0 );
-    brw_MUL( p, yi, yi, param1 );
-    brw_MUL( p, zi, zi, param2 );
-    
-    /* Here we interpolate in the y dimension... */
-    brw_MUL( p, x0y1, x0y1, yi );
-    brw_MUL( p, x1y1, x1y1, yi );
-    brw_ADD( p, x0y0, x0y0, x0y1 );
-    brw_ADD( p, x1y0, x1y0, x1y1 );
-
-    /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
-    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
-    brw_MUL( p, x1y0, x1y0, xi );
-    brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
-
-    /* Now do the same thing for the front four gradients... */
-    /* x component */
-    brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
-    brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
-    brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
-    brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
-
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
-    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
-    brw_pop_insn_state( p );
-
-    brw_MUL( p, x1y0, x1y0, t );
-    brw_MUL( p, x1y1, x1y1, t );
-    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
-    brw_MUL( p, x0y0, x0y0, param0 );
-    brw_MUL( p, x0y1, x0y1, param0 );
-
-    /* y component */
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
-    
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
-    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
-    brw_pop_insn_state( p );
-
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
-    brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
-    
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    
-    /* z component */
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
-
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
-    
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-    
-    /* The interpolation coefficients are still around from last time, so
-       again interpolate in the y dimension... */
-    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
-    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
-    brw_MUL( p, x0y1, x0y1, yi );
-    brw_MUL( p, x1y1, x1y1, yi );
-    brw_ADD( p, x0y0, x0y0, x0y1 );
-    brw_ADD( p, x1y0, x1y0, x1y1 );
-
-    /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
-       time put the front face in tmp[ 1 ] and we're nearly there... */
-    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
-    brw_MUL( p, x1y0, x1y0, xi );
-    brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
-
-    /* The final interpolation, in the z dimension: */
-    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );    
-    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
-    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
-    
-    /* scale by pow( 2, -15 ), as described above */
-    brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
-
-    release_tmps( c, mark );
-}
-
-static void emit_noise3( struct brw_wm_compile *c,
-			 const struct prog_instruction *inst )
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, src1, src2, param0, param1, param2, dst;
-    GLuint mask = inst->DstReg.WriteMask;
-    int i;
-    int mark = mark_tmps( c );
-
-    assert( mark == 0 );
-    
-    src0 = get_src_reg( c, inst, 0, 0 );
-    src1 = get_src_reg( c, inst, 0, 1 );
-    src2 = get_src_reg( c, inst, 0, 2 );
-
-    param0 = alloc_tmp( c );
-    param1 = alloc_tmp( c );
-    param2 = alloc_tmp( c );
-
-    brw_MOV( p, param0, src0 );
-    brw_MOV( p, param1, src1 );
-    brw_MOV( p, param2, src2 );
-
-    invoke_subroutine( c, SUB_NOISE3, noise3_sub );
-    
-    /* Fill in the result: */
-    brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    brw_MOV( p, dst, param0 );
-	}
-    }
-    if( inst->SaturateMode == SATURATE_ZERO_ONE )
-	brw_set_saturate( p, 0 );
-    
-    release_tmps( c, mark );
-}
-    
-/**
- * For the four-dimensional case, the little micro-optimisation benefits
- * we obtain by unrolling all the loops aren't worth the massive bloat it
- * now causes.  Instead, we loop twice around performing a similar operation
- * to noise3, once for the w=0 cube and once for the w=1, with a bit more
- * code to glue it all together.
- */
-static void noise4_sub( struct brw_wm_compile *c )
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg param[ 4 ],
-	x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
-	w0, /* noise for the w=0 cube */
-	floors[ 2 ], /* integer coordinates of base corner of hypercube */
-	interp[ 4 ], /* interpolation coefficients */
-	t, tmp[ 8 ], /* float temporaries */
-	itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
-	wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
-    int i, j;
-    int mark = mark_tmps( c );
-    GLuint loop, origin;
-    
-    x0y0 = alloc_tmp( c );
-    x0y1 = alloc_tmp( c );
-    x1y0 = alloc_tmp( c );
-    x1y1 = alloc_tmp( c );
-    t = alloc_tmp( c );
-    w0 = alloc_tmp( c );    
-    floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
-    floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
-
-    for( i = 0; i < 4; i++ ) {
-	param[ i ] = lookup_tmp( c, mark - 5 + i );
-	interp[ i ] = alloc_tmp( c );
-    }
-    
-    for( i = 0; i < 8; i++ ) {
-	tmp[ i ] = alloc_tmp( c );
-	itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
-	wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
-    }
-
-    brw_set_access_mode( p, BRW_ALIGN_1 );
-
-    /* We only want 16 bits of precision from the integral part of each
-       co-ordinate, but unfortunately the RNDD semantics would saturate
-       at 16 bits if we performed the operation directly to a 16-bit
-       destination.  Therefore, we round to 32-bit temporaries where
-       appropriate, and then store only the lower 16 bits. */
-    brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
-    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
-    brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
-    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
-    brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
-    brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
-
-    /* Modify the flag register here, because the side effect is useful
-       later (see below).  We know for certain that all flags will be
-       cleared, since the FRC instruction cannot possibly generate
-       negative results.  Even for exceptional inputs (infinities, denormals,
-       NaNs), the architecture guarantees that the L conditional is false. */
-    brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
-    brw_FRC( p, param[ 0 ], param[ 0 ] );
-    brw_set_predicate_control( p, BRW_PREDICATE_NONE );
-    for( i = 1; i < 4; i++ )	
-	brw_FRC( p, param[ i ], param[ i ] );
-    
-    /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
-       of all. */
-    for( i = 0; i < 4; i++ )
-	brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
-    for( i = 0; i < 4; i++ )
-	brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
-    for( i = 0; i < 4; i++ )
-	brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
-    for( i = 0; i < 4; i++ )
-	brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
-    for( j = 0; j < 3; j++ )
-	for( i = 0; i < 4; i++ )
-	    brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
-
-    /* Mark the current address, as it will be a jump destination.  The
-       following code will be executed twice: first, with the flag
-       register clear indicating the w=0 case, and second with flags
-       set for w=1. */
-    loop = p->nr_insn;
-    
-    /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
-       be hashed.  Since we have only 16 bits of precision in the hash, we
-       must be careful about thorough mixing to maintain entropy as we
-       squash the input vector into a small scalar. */
-    brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
-	     brw_imm_uw( 0xBC8F ) );
-    brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
-	     brw_imm_uw( 0xD0BD ) );
-    brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
-	     brw_imm_uw( 0x9B93 ) );
-    brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
-	     brw_imm_uw( 0xA359 ) );
-    brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
-	     brw_imm_uw( 0xBC8F ) );
-
-    /* Temporarily disable the execution mask while we work with ExecSize=16
-       channels (the mask is set for ExecSize=8 and is probably incorrect).
-       Although this might cause execution of unwanted channels, the code
-       writes only to temporary registers and has no side effects, so
-       disabling the mask is harmless. */
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
-    brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
-    brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
-
-    /* We're now ready to perform the hashing.  The eight hashes are
-       interleaved for performance.  The hash function used is
-       designed to rapidly achieve avalanche and require only 16x16
-       bit multiplication, and 8-bit swizzles (which we get for
-       free). */
-    for( i = 0; i < 4; i++ )
-	brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
-    for( i = 0; i < 4; i++ )
-	brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
-		 odd_bytes( wtmp[ i ] ) );
-    for( i = 0; i < 4; i++ )
-	brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
-    for( i = 0; i < 4; i++ )
-	brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
-		 odd_bytes( wtmp[ i ] ) );
-    brw_pop_insn_state( p );
-
-    /* Now we want to initialise the four rear gradients based on the
-       hashes.  Format conversion from signed integer to float leaves
-       everything scaled too high by a factor of pow( 2, 15 ), but
-       we correct for that right at the end. */
-    /* x component */
-    brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
-    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
-    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
-    brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
-    brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
-
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
-    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
-    brw_pop_insn_state( p );
-    
-    brw_MUL( p, x1y0, x1y0, t );
-    brw_MUL( p, x1y1, x1y1, t );
-    brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
-    brw_MUL( p, x0y0, x0y0, param[ 0 ] );
-    brw_MUL( p, x0y1, x0y1, param[ 0 ] );
-
-    /* y component */
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
-    
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
-    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
-    brw_pop_insn_state( p );
-
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );    
-    /* prepare t for the w component (used below): w the first time through
-       the loop; w - 1 the second time) */
-    brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
-    brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
-    p->current->header.predicate_inverse = 1;
-    brw_MOV( p, t, param[ 3 ] );
-    p->current->header.predicate_inverse = 0;
-    brw_set_predicate_control( p, BRW_PREDICATE_NONE );
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
-    
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    
-    /* z component */
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
-
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
-    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
-    brw_pop_insn_state( p );
-
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
-    
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-
-    /* w component */
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
-
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
-    brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
-    
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-
-    /* Here we interpolate in the y dimension... */
-    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
-    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
-    brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
-    brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
-    brw_ADD( p, x0y0, x0y0, x0y1 );
-    brw_ADD( p, x1y0, x1y0, x1y1 );
-
-    /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
-    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
-    brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
-    brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
-
-    /* Now do the same thing for the front four gradients... */
-    /* x component */
-    brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
-    brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
-    brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
-    brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
-
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
-    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
-    brw_pop_insn_state( p );
-
-    brw_MUL( p, x1y0, x1y0, t );
-    brw_MUL( p, x1y1, x1y1, t );
-    brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
-    brw_MUL( p, x0y0, x0y0, param[ 0 ] );
-    brw_MUL( p, x0y1, x0y1, param[ 0 ] );
-
-    /* y component */
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
-    
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
-    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
-    brw_pop_insn_state( p );
-
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
-    brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
-    
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    
-    /* z component */
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
-
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
-    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
-    brw_pop_insn_state( p );
-
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
-    /* prepare t for the w component (used below): w the first time through
-       the loop; w - 1 the second time) */
-    brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
-    brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
-    p->current->header.predicate_inverse = 1;
-    brw_MOV( p, t, param[ 3 ] );
-    p->current->header.predicate_inverse = 0;
-    brw_set_predicate_control( p, BRW_PREDICATE_NONE );
-    
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-
-    /* w component */
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
-
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
-    
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-
-    /* Interpolate in the y dimension: */
-    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
-    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
-    brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
-    brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
-    brw_ADD( p, x0y0, x0y0, x0y1 );
-    brw_ADD( p, x1y0, x1y0, x1y1 );
-
-    /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
-       time put the front face in tmp[ 1 ] and we're nearly there... */
-    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
-    brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
-    brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
-
-    /* Another interpolation, in the z dimension: */
-    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );    
-    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
-    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
-
-    /* Exit the loop if we've computed both cubes... */
-    origin = p->nr_insn;
-    brw_push_insn_state( p );
-    brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
-    brw_pop_insn_state( p );
-
-    /* Save the result for the w=0 case, and increment the w coordinate: */
-    brw_MOV( p, w0, tmp[ 0 ] );
-    brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
-	     brw_imm_uw( 1 ) );
-
-    /* Loop around for the other cube.  Explicitly set the flag register
-       (unfortunately we must spend an extra instruction to do this: we
-       can't rely on a side effect of the previous MOV or ADD because
-       conditional modifiers which are normally true might be false in
-       exceptional circumstances, e.g. given a NaN input; the add to
-       brw_ip_reg() is not suitable because the IP is not an 8-vector). */
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
-    brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
-	     brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
-    brw_pop_insn_state( p );
-
-    /* Patch the previous conditional branch now that we know the
-       destination address. */
-    brw_set_src1( p->store + origin,
-		  brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
-
-    /* The very last interpolation. */
-    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );    
-    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
-    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
-
-    /* scale by pow( 2, -15 ), as described above */
-    brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
-
-    release_tmps( c, mark );
-}
-
-static void emit_noise4( struct brw_wm_compile *c,
-			 const struct prog_instruction *inst )
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
-    GLuint mask = inst->DstReg.WriteMask;
-    int i;
-    int mark = mark_tmps( c );
-
-    assert( mark == 0 );
-    
-    src0 = get_src_reg( c, inst, 0, 0 );
-    src1 = get_src_reg( c, inst, 0, 1 );
-    src2 = get_src_reg( c, inst, 0, 2 );
-    src3 = get_src_reg( c, inst, 0, 3 );
-
-    param0 = alloc_tmp( c );
-    param1 = alloc_tmp( c );
-    param2 = alloc_tmp( c );
-    param3 = alloc_tmp( c );
-
-    brw_MOV( p, param0, src0 );
-    brw_MOV( p, param1, src1 );
-    brw_MOV( p, param2, src2 );
-    brw_MOV( p, param3, src3 );
-
-    invoke_subroutine( c, SUB_NOISE4, noise4_sub );
-    
-    /* Fill in the result: */
-    brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    brw_MOV( p, dst, param0 );
-	}
-    }
-    if( inst->SaturateMode == SATURATE_ZERO_ONE )
-	brw_set_saturate( p, 0 );
-    
-    release_tmps( c, mark );
-}
     
 static void emit_wpos_xy(struct brw_wm_compile *c,
                          const struct prog_instruction *inst)
@@ -2543,19 +1507,18 @@ static void emit_wpos_xy(struct brw_wm_compile *c,
      * X and Y channels.
      */
     if (mask & WRITEMASK_X) {
-	/* X' = X - origin_x */
-	brw_ADD(p,
+	/* X' = X */
+	brw_MOV(p,
 		dst[0],
-		retype(src0[0], BRW_REGISTER_TYPE_W),
-		brw_imm_d(0 - c->key.origin_x));
+		retype(src0[0], BRW_REGISTER_TYPE_W));
     }
 
     if (mask & WRITEMASK_Y) {
-	/* Y' = height - (Y - origin_y) = height + origin_y - Y */
+	/* Y' = height - 1 - Y */
 	brw_ADD(p,
 		dst[1],
 		negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
-		brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
+		brw_imm_d(c->key.drawable_height - 1));
     }
 }
 
@@ -2827,7 +1790,6 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 		emit_trunc(c, inst);
 		break;
 	    case OPCODE_MOV:
-	    case OPCODE_SWZ:
 		emit_mov(c, inst);
 		break;
 	    case OPCODE_DP3:
@@ -2903,18 +1865,6 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 	    case OPCODE_MAD:
 		emit_mad(c, inst);
 		break;
-	    case OPCODE_NOISE1:
-		emit_noise1(c, inst);
-		break;
-	    case OPCODE_NOISE2:
-		emit_noise2(c, inst);
-		break;
-	    case OPCODE_NOISE3:
-		emit_noise3(c, inst);
-		break;
-	    case OPCODE_NOISE4:
-		emit_noise4(c, inst);
-		break;
 	    case OPCODE_TEX:
 		emit_tex(c, inst);
 		break;
diff --git a/src/gallium/drivers/i965/brw_wm_pass0.c b/src/gallium/drivers/i965/brw_wm_pass0.c
index 6279258339..0c411b57f5 100644
--- a/src/gallium/drivers/i965/brw_wm_pass0.c
+++ b/src/gallium/drivers/i965/brw_wm_pass0.c
@@ -422,7 +422,6 @@ void brw_wm_pass0( struct brw_wm_compile *c )
        */      
       switch (inst->Opcode) {
       case OPCODE_MOV: 
-      case OPCODE_SWZ: 
 	 if (!inst->SaturateMode) {
 	    pass0_precalc_mov(c, inst);
 	 }
diff --git a/src/gallium/drivers/i965/brw_wm_pass1.c b/src/gallium/drivers/i965/brw_wm_pass1.c
index b449394029..d940ec09a9 100644
--- a/src/gallium/drivers/i965/brw_wm_pass1.c
+++ b/src/gallium/drivers/i965/brw_wm_pass1.c
@@ -120,7 +120,7 @@ void brw_wm_pass1( struct brw_wm_compile *c )
       GLuint writemask;
       GLuint read0, read1, read2;
 
-      if (inst->opcode == OPCODE_KIL) {
+      if (inst->opcode == TGSI_OPCODE_KIL) {
 	 track_arg(c, inst, 0, WRITEMASK_XYZW); /* All args contribute to final */
 	 continue;
       }
@@ -154,76 +154,75 @@ void brw_wm_pass1( struct brw_wm_compile *c )
       /* Mark all inputs which contribute to the marked outputs:
        */
       switch (inst->opcode) {
-      case OPCODE_ABS:
-      case OPCODE_FLR:
-      case OPCODE_FRC:
-      case OPCODE_MOV:
-      case OPCODE_SWZ:
-      case OPCODE_TRUNC:
+      case TGSI_OPCODE_ABS:
+      case TGSI_OPCODE_FLR:
+      case TGSI_OPCODE_FRC:
+      case TGSI_OPCODE_MOV:
+      case TGSI_OPCODE_TRUNC:
 	 read0 = writemask;
 	 break;
 
-      case OPCODE_SUB:
-      case OPCODE_SLT:
-      case OPCODE_SLE:
-      case OPCODE_SGE:
-      case OPCODE_SGT:
-      case OPCODE_SEQ:
-      case OPCODE_SNE:
-      case OPCODE_ADD:
-      case OPCODE_MAX:
-      case OPCODE_MIN:
-      case OPCODE_MUL:
+      case TGSI_OPCODE_SUB:
+      case TGSI_OPCODE_SLT:
+      case TGSI_OPCODE_SLE:
+      case TGSI_OPCODE_SGE:
+      case TGSI_OPCODE_SGT:
+      case TGSI_OPCODE_SEQ:
+      case TGSI_OPCODE_SNE:
+      case TGSI_OPCODE_ADD:
+      case TGSI_OPCODE_MAX:
+      case TGSI_OPCODE_MIN:
+      case TGSI_OPCODE_MUL:
 	 read0 = writemask;
 	 read1 = writemask;
 	 break;
 
-      case OPCODE_DDX:
-      case OPCODE_DDY:
+      case TGSI_OPCODE_DDX:
+      case TGSI_OPCODE_DDY:
 	 read0 = writemask;
 	 break;
 
-      case OPCODE_MAD:	
-      case OPCODE_CMP:
-      case OPCODE_LRP:
+      case TGSI_OPCODE_MAD:	
+      case TGSI_OPCODE_CMP:
+      case TGSI_OPCODE_LRP:
 	 read0 = writemask;
 	 read1 = writemask;	
 	 read2 = writemask;	
 	 break;
 
-      case OPCODE_XPD: 
+      case TGSI_OPCODE_XPD: 
 	 if (writemask & WRITEMASK_X) read0 |= WRITEMASK_YZ;	 
 	 if (writemask & WRITEMASK_Y) read0 |= WRITEMASK_XZ;	 
 	 if (writemask & WRITEMASK_Z) read0 |= WRITEMASK_XY;
 	 read1 = read0;
 	 break;
 
-      case OPCODE_COS:
-      case OPCODE_EX2:
-      case OPCODE_LG2:
-      case OPCODE_RCP:
-      case OPCODE_RSQ:
-      case OPCODE_SIN:
-      case OPCODE_SCS:
+      case TGSI_OPCODE_COS:
+      case TGSI_OPCODE_EX2:
+      case TGSI_OPCODE_LG2:
+      case TGSI_OPCODE_RCP:
+      case TGSI_OPCODE_RSQ:
+      case TGSI_OPCODE_SIN:
+      case TGSI_OPCODE_SCS:
       case WM_CINTERP:
       case WM_PIXELXY:
 	 read0 = WRITEMASK_X;
 	 break;
 
-      case OPCODE_POW:
+      case TGSI_OPCODE_POW:
 	 read0 = WRITEMASK_X;
 	 read1 = WRITEMASK_X;
 	 break;
 
-      case OPCODE_TEX:
-      case OPCODE_TXP:
+      case TGSI_OPCODE_TEX:
+      case TGSI_OPCODE_TXP:
 	 read0 = get_texcoord_mask(inst->tex_idx);
 
          if (inst->tex_shadow)
 	    read0 |= WRITEMASK_Z;
 	 break;
 
-      case OPCODE_TXB:
+      case TGSI_OPCODE_TXB:
 	 /* Shadow ignored for txb.
 	  */
 	 read0 = get_texcoord_mask(inst->tex_idx) | WRITEMASK_W;
@@ -254,28 +253,28 @@ void brw_wm_pass1( struct brw_wm_compile *c )
 	 read2 = WRITEMASK_W; /* pixel w */
 	 break;
 
-      case OPCODE_DP3:	
+      case TGSI_OPCODE_DP3:	
 	 read0 = WRITEMASK_XYZ;
 	 read1 = WRITEMASK_XYZ;
 	 break;
 
-      case OPCODE_DPH:
+      case TGSI_OPCODE_DPH:
 	 read0 = WRITEMASK_XYZ;
 	 read1 = WRITEMASK_XYZW;
 	 break;
 
-      case OPCODE_DP4:
+      case TGSI_OPCODE_DP4:
 	 read0 = WRITEMASK_XYZW;
 	 read1 = WRITEMASK_XYZW;
 	 break;
 
-      case OPCODE_LIT: 
+      case TGSI_OPCODE_LIT: 
 	 read0 = WRITEMASK_XYW;
 	 break;
 
-      case OPCODE_DST:
+      case TGSI_OPCODE_DST:
       case WM_FRONTFACING:
-      case OPCODE_KIL_NV:
+      case TGSI_OPCODE_KIL_NV:
       default:
 	 break;
       }
diff --git a/src/gallium/drivers/i965/intel_chipset.h b/src/gallium/drivers/i965/intel_chipset.h
index 3dc8653a73..3c38f1676c 100644
--- a/src/gallium/drivers/i965/intel_chipset.h
+++ b/src/gallium/drivers/i965/intel_chipset.h
@@ -66,7 +66,6 @@
 #define PCI_CHIP_Q45_G                  0x2E12
 #define PCI_CHIP_G45_G                  0x2E22
 #define PCI_CHIP_G41_G                  0x2E32
-#define PCI_CHIP_B43_G                  0x2E42
 
 #define PCI_CHIP_ILD_G                  0x0042
 #define PCI_CHIP_ILM_G                  0x0046
@@ -84,8 +83,7 @@
 #define IS_G45(devid)           (devid == PCI_CHIP_IGD_E_G || \
                                  devid == PCI_CHIP_Q45_G || \
                                  devid == PCI_CHIP_G45_G || \
-                                 devid == PCI_CHIP_G41_G || \
-                                 devid == PCI_CHIP_B43_G)
+                                 devid == PCI_CHIP_G41_G)
 #define IS_GM45(devid)          (devid == PCI_CHIP_GM45_GM)
 #define IS_G4X(devid)		(IS_G45(devid) || IS_GM45(devid))
 
-- 
cgit v1.2.3


From 22906f730141a233341f3ec124bbb9dd2e8904e2 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Fri, 23 Oct 2009 23:27:43 +0100
Subject: i965g: wip on removing GL stuff, trying to get a few files compiling

---
 src/gallium/drivers/i965/Makefile               |  73 ++-----
 src/gallium/drivers/i965/brw_cc.c               |  18 +-
 src/gallium/drivers/i965/brw_clip.c             |   4 +-
 src/gallium/drivers/i965/brw_clip_state.c       |   7 +-
 src/gallium/drivers/i965/brw_context.c          |  64 +++---
 src/gallium/drivers/i965/brw_context.h          | 199 ++++++++-----------
 src/gallium/drivers/i965/brw_curbe.c            |   5 +-
 src/gallium/drivers/i965/brw_draw.c             |  25 +--
 src/gallium/drivers/i965/brw_draw.h             |   7 +-
 src/gallium/drivers/i965/brw_draw_upload.c      |  39 ++--
 src/gallium/drivers/i965/brw_eu_debug.c         |   2 -
 src/gallium/drivers/i965/brw_gs.c               |   8 +-
 src/gallium/drivers/i965/brw_gs_emit.c          |   5 -
 src/gallium/drivers/i965/brw_gs_state.c         |   7 +-
 src/gallium/drivers/i965/brw_misc_state.c       |  22 +-
 src/gallium/drivers/i965/brw_pipe_depth.c       |  18 +-
 src/gallium/drivers/i965/brw_pipe_fb.c          |   4 +-
 src/gallium/drivers/i965/brw_pipe_flush.c       |   8 +-
 src/gallium/drivers/i965/brw_pipe_query.c       | 246 +++++++++++++++++++++++
 src/gallium/drivers/i965/brw_program.c          | 166 ----------------
 src/gallium/drivers/i965/brw_queryobj.c         | 254 ------------------------
 src/gallium/drivers/i965/brw_sf.c               |  12 +-
 src/gallium/drivers/i965/brw_sf.h               |  12 +-
 src/gallium/drivers/i965/brw_sf_emit.c          |  26 +--
 src/gallium/drivers/i965/brw_sf_state.c         |  16 +-
 src/gallium/drivers/i965/brw_state.h            |  30 +--
 src/gallium/drivers/i965/brw_state_batch.c      |   1 -
 src/gallium/drivers/i965/brw_state_cache.c      |  59 +++---
 src/gallium/drivers/i965/brw_state_dump.c       |  12 +-
 src/gallium/drivers/i965/brw_state_upload.c     |   4 +-
 src/gallium/drivers/i965/brw_structs.h          |   1 +
 src/gallium/drivers/i965/brw_swtnl.c            |   1 -
 src/gallium/drivers/i965/brw_tex.c              |   7 -
 src/gallium/drivers/i965/brw_tex_layout.c       |  12 +-
 src/gallium/drivers/i965/brw_types.h            |  15 +-
 src/gallium/drivers/i965/brw_util.c             |   2 -
 src/gallium/drivers/i965/brw_util.h             |   2 +-
 src/gallium/drivers/i965/brw_vs.c               |   5 +-
 src/gallium/drivers/i965/brw_vs_emit.c          |   3 -
 src/gallium/drivers/i965/brw_vs_state.c         |   9 +-
 src/gallium/drivers/i965/brw_vs_surface_state.c |  20 +-
 src/gallium/drivers/i965/brw_wm.c               |   6 +-
 src/gallium/drivers/i965/brw_wm_emit.c          |   1 -
 src/gallium/drivers/i965/brw_wm_glsl.c          |   4 -
 src/gallium/drivers/i965/brw_wm_iz.c            |   1 -
 src/gallium/drivers/i965/brw_wm_sampler_state.c |  15 +-
 src/gallium/drivers/i965/brw_wm_state.c         |  19 +-
 src/gallium/drivers/i965/brw_wm_surface_state.c | 181 ++++++++---------
 src/gallium/drivers/i965/intel_batchbuffer.h    |   7 +-
 src/gallium/drivers/i965/intel_tex_format.c     | 197 ------------------
 src/gallium/drivers/i965/intel_tex_layout.c     |   7 +-
 51 files changed, 634 insertions(+), 1234 deletions(-)
 create mode 100644 src/gallium/drivers/i965/brw_pipe_query.c
 delete mode 100644 src/gallium/drivers/i965/brw_program.c
 delete mode 100644 src/gallium/drivers/i965/brw_queryobj.c

(limited to 'src/gallium/drivers/i965/brw_vs.c')

diff --git a/src/gallium/drivers/i965/Makefile b/src/gallium/drivers/i965/Makefile
index 7a55333e89..480d2efbc5 100644
--- a/src/gallium/drivers/i965/Makefile
+++ b/src/gallium/drivers/i965/Makefile
@@ -1,38 +1,9 @@
-
-TOP = ../../../../..
+TOP = ../../../..
 include $(TOP)/configs/current
 
-LIBNAME = i965_dri.so
+LIBNAME = i965
 
-DRIVER_SOURCES = \
-	intel_batchbuffer.c \
-	intel_blit.c \
-	intel_buffer_objects.c \
-	intel_buffers.c \
-	intel_clear.c \
-	intel_context.c \
-	intel_decode.c \
-	intel_extensions.c \
-	intel_fbo.c \
-	intel_mipmap_tree.c \
-	intel_regions.c \
-	intel_screen.c \
-	intel_span.c \
-	intel_pixel.c \
-	intel_pixel_bitmap.c \
-	intel_pixel_copy.c \
-	intel_pixel_draw.c \
-	intel_pixel_read.c \
-	intel_state.c \
-	intel_swapbuffers.c \
-	intel_syncobj.c \
-	intel_tex.c \
-	intel_tex_copy.c \
-	intel_tex_format.c \
-	intel_tex_image.c \
-	intel_tex_layout.c \
-	intel_tex_subimage.c \
-	intel_tex_validate.c \
+C_SOURCES = \
 	brw_cc.c \
 	brw_clip.c \
 	brw_clip_line.c \
@@ -50,13 +21,18 @@ DRIVER_SOURCES = \
 	brw_eu_debug.c \
 	brw_eu_emit.c \
 	brw_eu_util.c \
-	brw_fallback.c \
 	brw_gs.c \
 	brw_gs_emit.c \
 	brw_gs_state.c \
 	brw_misc_state.c \
-	brw_program.c \
-	brw_queryobj.c \
+	brw_pipe_blend.c \
+	brw_pipe_debug.c \
+	brw_pipe_depth.c \
+	brw_pipe_fb.c \
+	brw_pipe_flush.c \
+	brw_pipe_query.c \
+	brw_pipe_shader.c \
+	brw_screen_surface.c \
 	brw_sf.c \
 	brw_sf_emit.c \
 	brw_sf_state.c \
@@ -64,41 +40,30 @@ DRIVER_SOURCES = \
 	brw_state_cache.c \
 	brw_state_dump.c \
 	brw_state_upload.c \
+	brw_swtnl.c \
 	brw_tex.c \
 	brw_tex_layout.c \
 	brw_urb.c \
 	brw_util.c \
 	brw_vs.c \
-	brw_vs_constval.c \
 	brw_vs_emit.c \
 	brw_vs_state.c \
 	brw_vs_surface_state.c \
-	brw_vtbl.c \
 	brw_wm.c \
 	brw_wm_debug.c \
 	brw_wm_emit.c \
 	brw_wm_fp.c \
-	brw_wm_iz.c \
 	brw_wm_glsl.c \
+	brw_wm_iz.c \
 	brw_wm_pass0.c \
 	brw_wm_pass1.c \
 	brw_wm_pass2.c \
 	brw_wm_sampler_state.c \
 	brw_wm_state.c \
-	brw_wm_surface_state.c 
-
-C_SOURCES = \
-	$(COMMON_SOURCES) \
-	$(MINIGLX_SOURCES) \
-	$(DRIVER_SOURCES)
-
-ASM_SOURCES = 
-
-DRIVER_DEFINES = -I../intel -I../intel/server
-
-DRI_LIB_DEPS += -ldrm_intel
-
-include ../Makefile.template
+	brw_wm_surface_state.c \
+	brw_bo.c \
+	intel_batchbuffer.c \
+	intel_tex_format.c \
+	intel_tex_layout.c 
 
-intel_decode.o: ../intel/intel_decode.c
-intel_tex_layout.o: ../intel/intel_tex_layout.c
+include ../../Makefile.template
diff --git a/src/gallium/drivers/i965/brw_cc.c b/src/gallium/drivers/i965/brw_cc.c
index 9ab5638137..af432b1f52 100644
--- a/src/gallium/drivers/i965/brw_cc.c
+++ b/src/gallium/drivers/i965/brw_cc.c
@@ -33,13 +33,9 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "brw_util.h"
-#include "main/macros.h"
-#include "main/enums.h"
 
 static void prepare_cc_vp( struct brw_context *brw )
 {
-   GLcontext *ctx = &brw->intel.ctx;
    struct brw_cc_viewport ccv;
 
    memset(&ccv, 0, sizeof(ccv));
@@ -48,13 +44,13 @@ static void prepare_cc_vp( struct brw_context *brw )
    ccv.min_depth = ctx->Viewport.Near;
    ccv.max_depth = ctx->Viewport.Far;
 
-   dri_bo_unreference(brw->cc.vp_bo);
+   brw->sws->bo_unreference(brw->cc.vp_bo);
    brw->cc.vp_bo = brw_cache_data( &brw->cache, BRW_CC_VP, &ccv, NULL, 0 );
 }
 
 const struct brw_tracked_state brw_cc_vp = {
    .dirty = {
-      .mesa = _NEW_VIEWPORT,
+      .mesa = PIPE_NEW_VIEWPORT,
       .brw = BRW_NEW_CONTEXT,
       .cache = 0
    },
@@ -71,8 +67,8 @@ cc_unit_populate_key(struct brw_context *brw, struct brw_cc_unit_key *key)
 {
    memset(key, 0, sizeof(*key));
    
-   key->dsa = brw->curr.dsa.base;
-   key->blend = brw->curr.blend.base;
+   key->dsa = brw->dsa;
+   key->blend = brw->blend;
 
    /* Clear non-respected values:
     */
@@ -82,11 +78,11 @@ cc_unit_populate_key(struct brw_context *brw, struct brw_cc_unit_key *key)
 /**
  * Creates the state cache entry for the given CC unit key.
  */
-static dri_bo *
+static struct brw_winsys_buffer *
 cc_unit_create_from_key(struct brw_context *brw, struct brw_cc_unit_key *key)
 {
    struct brw_cc_unit_state cc;
-   dri_bo *bo;
+   struct brw_winsys_buffer *bo;
 
    memset(&cc, 0, sizeof(cc));
 
@@ -124,7 +120,7 @@ static void prepare_cc_unit( struct brw_context *brw )
 
    cc_unit_populate_key(brw, &key);
 
-   dri_bo_unreference(brw->cc.state_bo);
+   brw->sws->bo_unreference(brw->cc.state_bo);
    brw->cc.state_bo = brw_search_cache(&brw->cache, BRW_CC_UNIT,
 				       &key, sizeof(key),
 				       &brw->cc.vp_bo, 1,
diff --git a/src/gallium/drivers/i965/brw_clip.c b/src/gallium/drivers/i965/brw_clip.c
index df1b3718d0..d82ebeb9a9 100644
--- a/src/gallium/drivers/i965/brw_clip.c
+++ b/src/gallium/drivers/i965/brw_clip.c
@@ -129,7 +129,7 @@ static void compile_clip_prog( struct brw_context *brw,
 
    /* Upload
     */
-   dri_bo_unreference(brw->clip.prog_bo);
+   brw->sws->bo_unreference(brw->clip.prog_bo);
    brw->clip.prog_bo = brw_upload_cache( &brw->cache,
 					 BRW_CLIP_PROG,
 					 &c.key, sizeof(c.key),
@@ -199,7 +199,7 @@ static void upload_clip_prog(struct brw_context *brw)
       }
    }
 
-   dri_bo_unreference(brw->clip.prog_bo);
+   brw->sws->bo_unreference(brw->clip.prog_bo);
    brw->clip.prog_bo = brw_search_cache(&brw->cache, BRW_CLIP_PROG,
 					&key, sizeof(key),
 					NULL, 0,
diff --git a/src/gallium/drivers/i965/brw_clip_state.c b/src/gallium/drivers/i965/brw_clip_state.c
index 72e27205e2..0ea7ce5734 100644
--- a/src/gallium/drivers/i965/brw_clip_state.c
+++ b/src/gallium/drivers/i965/brw_clip_state.c
@@ -49,7 +49,6 @@ struct brw_clip_unit_key {
 static void
 clip_unit_populate_key(struct brw_context *brw, struct brw_clip_unit_key *key)
 {
-   GLcontext *ctx = &brw->intel.ctx;
    memset(key, 0, sizeof(*key));
 
    /* CACHE_NEW_CLIP_PROG */
@@ -69,12 +68,12 @@ clip_unit_populate_key(struct brw_context *brw, struct brw_clip_unit_key *key)
    key->depth_clamp = 0; // XXX: add this to gallium: ctx->Transform.DepthClamp;
 }
 
-static dri_bo *
+static struct brw_winsys_buffer *
 clip_unit_create_from_key(struct brw_context *brw,
 			  struct brw_clip_unit_key *key)
 {
    struct brw_clip_unit_state clip;
-   dri_bo *bo;
+   struct brw_winsys_buffer *bo;
 
    memset(&clip, 0, sizeof(clip));
 
@@ -162,7 +161,7 @@ static void upload_clip_unit( struct brw_context *brw )
 
    clip_unit_populate_key(brw, &key);
 
-   dri_bo_unreference(brw->clip.state_bo);
+   brw->sws->bo_unreference(brw->clip.state_bo);
    brw->clip.state_bo = brw_search_cache(&brw->cache, BRW_CLIP_UNIT,
 					 &key, sizeof(key),
 					 &brw->clip.prog_bo, 1,
diff --git a/src/gallium/drivers/i965/brw_context.c b/src/gallium/drivers/i965/brw_context.c
index bf0ec89e13..063ada5772 100644
--- a/src/gallium/drivers/i965/brw_context.c
+++ b/src/gallium/drivers/i965/brw_context.c
@@ -30,32 +30,21 @@
   */
 
 
-#include "main/imports.h"
-#include "main/api_noop.h"
-#include "main/macros.h"
-#include "main/vtxfmt.h"
-#include "main/simple_list.h"
-#include "shader/shader_api.h"
+#include "pipe/p_context.h"
 
 #include "brw_context.h"
 #include "brw_defines.h"
 #include "brw_draw.h"
 #include "brw_state.h"
 #include "brw_vs.h"
-#include "intel_tex.h"
-#include "intel_blit.h"
+#include "brw_screen_tex.h"
 #include "intel_batchbuffer.h"
-#include "intel_pixel.h"
-#include "intel_span.h"
-#include "tnl/t_pipeline.h"
 
-#include "utils.h"
 
 
-GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
-			    __DRIcontextPrivate *driContextPriv,
-			    void *sharedContextPrivate)
+struct pipe_context *brw_create_context( struct pipe_screen *screen,
+					 void *priv )
 {
    struct brw_context *brw = (struct brw_context *) CALLOC_STRUCT(brw_context);
 
@@ -87,9 +76,8 @@ GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
 /**
  * called from intelDestroyContext()
  */
-static void brw_destroy_context( struct intel_context *intel )
+static void brw_destroy_context( struct brw_context *brw )
 {
-   struct brw_context *brw = brw_context(&intel->ctx);
    int i;
 
    brw_destroy_state(brw);
@@ -102,27 +90,27 @@ static void brw_destroy_context( struct intel_context *intel )
    brw->state.nr_color_regions = 0;
    intel_region_release(&brw->state.depth_region);
 
-   dri_bo_unreference(brw->curbe.curbe_bo);
-   dri_bo_unreference(brw->vs.prog_bo);
-   dri_bo_unreference(brw->vs.state_bo);
-   dri_bo_unreference(brw->vs.bind_bo);
-   dri_bo_unreference(brw->gs.prog_bo);
-   dri_bo_unreference(brw->gs.state_bo);
-   dri_bo_unreference(brw->clip.prog_bo);
-   dri_bo_unreference(brw->clip.state_bo);
-   dri_bo_unreference(brw->clip.vp_bo);
-   dri_bo_unreference(brw->sf.prog_bo);
-   dri_bo_unreference(brw->sf.state_bo);
-   dri_bo_unreference(brw->sf.vp_bo);
+   brw->sws->bo_unreference(brw->curbe.curbe_bo);
+   brw->sws->bo_unreference(brw->vs.prog_bo);
+   brw->sws->bo_unreference(brw->vs.state_bo);
+   brw->sws->bo_unreference(brw->vs.bind_bo);
+   brw->sws->bo_unreference(brw->gs.prog_bo);
+   brw->sws->bo_unreference(brw->gs.state_bo);
+   brw->sws->bo_unreference(brw->clip.prog_bo);
+   brw->sws->bo_unreference(brw->clip.state_bo);
+   brw->sws->bo_unreference(brw->clip.vp_bo);
+   brw->sws->bo_unreference(brw->sf.prog_bo);
+   brw->sws->bo_unreference(brw->sf.state_bo);
+   brw->sws->bo_unreference(brw->sf.vp_bo);
    for (i = 0; i < BRW_MAX_TEX_UNIT; i++)
-      dri_bo_unreference(brw->wm.sdc_bo[i]);
-   dri_bo_unreference(brw->wm.bind_bo);
+      brw->sws->bo_unreference(brw->wm.sdc_bo[i]);
+   brw->sws->bo_unreference(brw->wm.bind_bo);
    for (i = 0; i < BRW_WM_MAX_SURF; i++)
-      dri_bo_unreference(brw->wm.surf_bo[i]);
-   dri_bo_unreference(brw->wm.sampler_bo);
-   dri_bo_unreference(brw->wm.prog_bo);
-   dri_bo_unreference(brw->wm.state_bo);
-   dri_bo_unreference(brw->cc.prog_bo);
-   dri_bo_unreference(brw->cc.state_bo);
-   dri_bo_unreference(brw->cc.vp_bo);
+      brw->sws->bo_unreference(brw->wm.surf_bo[i]);
+   brw->sws->bo_unreference(brw->wm.sampler_bo);
+   brw->sws->bo_unreference(brw->wm.prog_bo);
+   brw->sws->bo_unreference(brw->wm.state_bo);
+   brw->sws->bo_unreference(brw->cc.prog_bo);
+   brw->sws->bo_unreference(brw->cc.state_bo);
+   brw->sws->bo_unreference(brw->cc.vp_bo);
 }
diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h
index 009e28b227..0fcb75a440 100644
--- a/src/gallium/drivers/i965/brw_context.h
+++ b/src/gallium/drivers/i965/brw_context.h
@@ -33,9 +33,9 @@
 #ifndef BRWCONTEXT_INC
 #define BRWCONTEXT_INC
 
-#include "intel_context.h"
 #include "brw_structs.h"
-#include "main/imports.h"
+#include "brw_winsys.h"
+#include "pipe/p_state.h"
 
 
 /* Glossary:
@@ -119,6 +119,19 @@
 
 struct brw_context;
 
+/* Gallium-side dirty flags (used in brw_tracked_state .dirty.mesa); one distinct bit each. */
+#define PIPE_NEW_DEPTH_STENCIL_ALPHA    0x1
+#define PIPE_NEW_RAST                   0x2
+#define PIPE_NEW_BLEND                  0x4
+#define PIPE_NEW_VIEWPORT               0x8
+#define PIPE_NEW_FRAMEBUFFER            0x10
+#define PIPE_NEW_VERTEX_BUFFER          0x20
+#define PIPE_NEW_VERTEX_ELEMENT         0x40
+#define PIPE_NEW_FRAGMENT_SHADER        0x80
+#define PIPE_NEW_VERTEX_SHADER          0x100
+#define PIPE_NEW_FRAGMENT_CONSTS        0x200
+#define PIPE_NEW_VERTEX_CONSTS          0x400
+
 #define BRW_NEW_URB_FENCE               0x1
 #define BRW_NEW_FRAGMENT_PROGRAM        0x2
 #define BRW_NEW_VERTEX_PROGRAM          0x4
@@ -156,26 +169,23 @@ struct brw_state_flags {
 };
 
 
-/** Subclass of Mesa vertex program */
 struct brw_vertex_program {
-   struct gl_vertex_program program;
+   const struct tgsi_token *tokens;
    GLuint id;
-   dri_bo *const_buffer;    /** Program constant buffer/surface */
+   struct brw_winsys_buffer *const_buffer;    /** Program constant buffer/surface */
    GLboolean use_const_buffer;
 };
 
 
 /** Subclass of Mesa fragment program */
 struct brw_fragment_program {
-   struct gl_fragment_program program;
+   const struct tgsi_token *tokens;
+
    GLuint id;  /**< serial no. to identify frag progs, never re-used */
-   GLboolean isGLSL;  /**< really, any IF/LOOP/CONT/BREAK instructions */
+   GLboolean isGLSL;  /**< any IF/LOOP/CONT/BREAK instructions */
 
-   dri_bo *const_buffer;    /** Program constant buffer/surface */
+   struct brw_winsys_buffer *const_buffer;    /** Program constant buffer/surface */
    GLboolean use_const_buffer;
-
-   /** for debugging, which texture units are referenced */
-   GLbitfield tex_units_used;
 };
 
 
@@ -244,7 +254,7 @@ struct brw_vs_prog_data {
 /* Size == 0 if output either not written, or always [0,0,0,1]
  */
 struct brw_vs_ouput_sizes {
-   GLubyte output_size[VERT_RESULT_MAX];
+   GLubyte output_size[PIPE_MAX_SHADER_OUTPUTS];
 };
 
 
@@ -312,10 +322,10 @@ struct brw_cache_item {
    GLuint hash;
    GLuint key_size;		/* for variable-sized keys */
    const void *key;
-   dri_bo **reloc_bufs;
+   struct brw_winsys_buffer **reloc_bufs;
    GLuint nr_reloc_bufs;
 
-   dri_bo *bo;
+   struct brw_winsys_buffer *bo;
    GLuint data_size;
 
    struct brw_cache_item *next;
@@ -336,7 +346,7 @@ struct brw_cache {
    /* Record of the last BOs chosen for each cache_id.  Used to set
     * brw->state.dirty.cache when a new cache item is chosen.
     */
-   dri_bo *last_bo[BRW_MAX_CACHE];
+   struct brw_winsys_buffer *last_bo[BRW_MAX_CACHE];
 };
 
 
@@ -384,56 +394,22 @@ struct brw_cached_batch_item {
 /* Protect against a future where VERT_ATTRIB_MAX > 32.  Wouldn't life
  * be easier if C allowed arrays of packed elements?
  */
-#define ATTRIB_BIT_DWORDS  ((VERT_ATTRIB_MAX+31)/32)
-
-struct brw_vertex_element {
-   const struct gl_client_array *glarray;
-
-   /** The corresponding Mesa vertex attribute */
-   gl_vert_attrib attrib;
-   /** Size of a complete element */
-   GLuint element_size;
-   /** Number of uploaded elements for this input. */
-   GLuint count;
-   /** Byte stride between elements in the uploaded array */
-   GLuint stride;
-   /** Offset of the first element within the buffer object */
-   unsigned int offset;
-   /** Buffer object containing the uploaded vertex data */
-   dri_bo *bo;
-};
-
-
-
-struct brw_vertex_info {
-   GLuint sizes[ATTRIB_BIT_DWORDS * 2]; /* sizes:2[VERT_ATTRIB_MAX] */
-};
+#define VS_INPUT_BITMASK_DWORDS  ((PIPE_MAX_SHADER_INPUTS+31)/32)
 
 
-/* Cache for TNL programs.
- */
-struct brw_tnl_cache_item {
-   GLuint hash;
-   void *key;
-   void *data;
-   struct brw_tnl_cache_item *next;
+struct brw_vertex_info {
+   GLuint sizes[VS_INPUT_BITMASK_DWORDS * 2]; /* sizes:2[PIPE_MAX_SHADER_INPUTS] */
 };
 
-struct brw_tnl_cache {
-   struct brw_tnl_cache_item **items;
-   GLuint size, n_items;
-};
 
 struct brw_query_object {
-   struct gl_query_object Base;
-
    /** Doubly linked list of active query objects in the context. */
    struct brw_query_object *prev, *next;
 
    /** Last query BO associated with this query. */
-   dri_bo *bo;
+   struct brw_winsys_buffer *bo;
    /** First index in bo with query data for this object. */
    int first_index;
    /** Last index in bo with query data for this object. */
@@ -445,22 +421,29 @@ struct brw_query_object {
 
 
 /**
- * brw_context is derived from intel_context.
+ * brw_context is derived from pipe_context
  */
 struct brw_context 
 {
+   struct pipe_context *pipe;
+   struct pipe_screen *screen;
+   
+   struct brw_winsys_screen *sws;
+
    GLuint primitive;
 
    GLboolean emit_state_always;
    GLboolean no_batch_wrap;
 
+   /* Active vertex program: 
+    */
+   const struct gl_vertex_program *vertex_program;
+   const struct gl_fragment_program *fragment_program;
+   struct pipe_framebuffer_state fb;
+
    struct {
       struct brw_state_flags dirty;
 
-      GLuint nr_color_regions;
-      struct intel_region *color_regions[MAX_DRAW_BUFFERS];
-      struct intel_region *depth_region;
-
       /**
        * List of buffers accumulated in brw_validate_state to receive
        * dri_bo_check_aperture treatment before exec, so we can know if we
@@ -471,7 +454,7 @@ struct brw_context
        * consisting of the vertex buffers, pipelined state pointers,
        * the CURBE, the depth buffer, and a query BO.
        */
-      dri_bo *validated_bos[VERT_ATTRIB_MAX + 16];
+      struct brw_winsys_buffer *validated_bos[PIPE_MAX_SHADER_INPUTS + 16];
       int validated_bo_count;
    } state;
 
@@ -480,18 +463,14 @@ struct brw_context
    struct brw_cached_batch_item *cached_batch_items;
 
    struct {
-      struct brw_vertex_element inputs[VERT_ATTRIB_MAX];
+      struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
+      struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
+      unsigned num_vertex_element;
+      unsigned num_vertex_buffer;
 
-      struct brw_vertex_element *enabled[VERT_ATTRIB_MAX];
-      GLuint nr_enabled;
-
-#define BRW_NR_UPLOAD_BUFS 17
-#define BRW_UPLOAD_INIT_SIZE (128*1024)
-
-      struct {
-	 dri_bo *bo;
-	 GLuint offset;
-      } upload;
+      struct u_upload_mgr *upload_vertex;
+      struct u_upload_mgr *upload_index;
+      
 
       /* Summary of size and varying of active arrays, so we can check
        * for changes to this state:
@@ -509,7 +488,7 @@ struct brw_context
       const struct _mesa_index_buffer *ib;
 
       /* Updates to these fields are signaled by BRW_NEW_INDEX_BUFFER. */
-      dri_bo *bo;
+      struct brw_winsys_buffer *bo;
       unsigned int offset;
       unsigned int size;
       /* Offset to index buffer index to use in CMD_3D_PRIM so that we can
@@ -519,16 +498,6 @@ struct brw_context
       unsigned int start_vertex_offset;
    } ib;
 
-   /* Active vertex program: 
-    */
-   const struct gl_vertex_program *vertex_program;
-   const struct gl_fragment_program *fragment_program;
-
-
-   /* For populating the gtt:
-    */
-   GLuint next_free_page;
-
 
    /* BRW_NEW_URB_ALLOCATIONS:
     */
@@ -545,12 +514,6 @@ struct brw_context
       GLuint nr_sf_entries;
       GLuint nr_cs_entries;
 
-/*       GLuint vs_size; */
-/*       GLuint gs_size; */
-/*       GLuint clip_size; */
-/*       GLuint sf_size; */
-/*       GLuint cs_size; */
-
       GLuint vs_start;
       GLuint gs_start;
       GLuint clip_start;
@@ -570,7 +533,7 @@ struct brw_context
       GLuint vs_size;
       GLuint total_size;
 
-      dri_bo *curbe_bo;
+      struct brw_winsys_buffer *curbe_bo;
       /** Offset within curbe_bo of space for current curbe entry */
       GLuint curbe_offset;
       /** Offset within curbe_bo of space for next curbe entry */
@@ -588,12 +551,12 @@ struct brw_context
    struct {
       struct brw_vs_prog_data *prog_data;
 
-      dri_bo *prog_bo;
-      dri_bo *state_bo;
+      struct brw_winsys_buffer *prog_bo;
+      struct brw_winsys_buffer *state_bo;
 
       /** Binding table of pointers to surf_bo entries */
-      dri_bo *bind_bo;
-      dri_bo *surf_bo[BRW_VS_MAX_SURF];
+      struct brw_winsys_buffer *bind_bo;
+      struct brw_winsys_buffer *surf_bo[BRW_VS_MAX_SURF];
       GLuint nr_surfaces;      
    } vs;
 
@@ -601,25 +564,25 @@ struct brw_context
       struct brw_gs_prog_data *prog_data;
 
       GLboolean prog_active;
-      dri_bo *prog_bo;
-      dri_bo *state_bo;
+      struct brw_winsys_buffer *prog_bo;
+      struct brw_winsys_buffer *state_bo;
    } gs;
 
    struct {
       struct brw_clip_prog_data *prog_data;
 
-      dri_bo *prog_bo;
-      dri_bo *state_bo;
-      dri_bo *vp_bo;
+      struct brw_winsys_buffer *prog_bo;
+      struct brw_winsys_buffer *state_bo;
+      struct brw_winsys_buffer *vp_bo;
    } clip;
 
 
    struct {
       struct brw_sf_prog_data *prog_data;
 
-      dri_bo *prog_bo;
-      dri_bo *state_bo;
-      dri_bo *vp_bo;
+      struct brw_winsys_buffer *prog_bo;
+      struct brw_winsys_buffer *state_bo;
+      struct brw_winsys_buffer *vp_bo;
    } sf;
 
    struct {
@@ -629,38 +592,38 @@ struct brw_context
       /** Input sizes, calculated from active vertex program.
        * One bit per fragment program input attribute.
        */
-      GLbitfield input_size_masks[4];
+      //GLbitfield input_size_masks[4];
 
       /** Array of surface default colors (texture border color) */
-      dri_bo *sdc_bo[BRW_MAX_TEX_UNIT];
+      struct brw_winsys_buffer *sdc_bo[BRW_MAX_TEX_UNIT];
 
       GLuint render_surf;
       GLuint nr_surfaces;      
 
       GLuint max_threads;
-      dri_bo *scratch_bo;
+      struct brw_winsys_buffer *scratch_bo;
 
       GLuint sampler_count;
-      dri_bo *sampler_bo;
+      struct brw_winsys_buffer *sampler_bo;
 
       /** Binding table of pointers to surf_bo entries */
-      dri_bo *bind_bo;
-      dri_bo *surf_bo[BRW_WM_MAX_SURF];
+      struct brw_winsys_buffer *bind_bo;
+      struct brw_winsys_buffer *surf_bo[PIPE_MAX_COLOR_BUFS];
 
-      dri_bo *prog_bo;
-      dri_bo *state_bo;
+      struct brw_winsys_buffer *prog_bo;
+      struct brw_winsys_buffer *state_bo;
    } wm;
 
 
    struct {
-      dri_bo *prog_bo;
-      dri_bo *state_bo;
-      dri_bo *vp_bo;
+      struct brw_winsys_buffer *prog_bo;
+      struct brw_winsys_buffer *state_bo;
+      struct brw_winsys_buffer *vp_bo;
    } cc;
 
    struct {
       struct brw_query_object active_head;
-      dri_bo *bo;
+      struct brw_winsys_buffer *bo;
       int index;
       GLboolean active;
    } query;
@@ -679,12 +642,6 @@ struct brw_context
  */
 void brwInitVtbl( struct brw_context *brw );
 
-/*======================================================================
- * brw_context.c
- */
-GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
-			    __DRIcontextPrivate *driContextPriv,
-			    void *sharedContextPrivate);
 
 /*======================================================================
  * brw_queryobj.c
@@ -697,7 +654,7 @@ void brw_emit_query_end(struct brw_context *brw);
 /*======================================================================
  * brw_state_dump.c
  */
-void brw_debug_batch(struct intel_context *intel);
+void brw_debug_batch(struct brw_context *brw);
 
 /*======================================================================
  * brw_tex.c
@@ -706,9 +663,9 @@ void brw_validate_textures( struct brw_context *brw );
 
 
 /*======================================================================
- * brw_program.c
+ * brw_pipe_shader.c
  */
-void brwInitFragProgFuncs( struct dd_function_table *functions );
+void brw_init_shader_funcs( struct brw_context *brw );
 
 
 /* brw_urb.c
diff --git a/src/gallium/drivers/i965/brw_curbe.c b/src/gallium/drivers/i965/brw_curbe.c
index 3e32c4983d..33ea9a00f7 100644
--- a/src/gallium/drivers/i965/brw_curbe.c
+++ b/src/gallium/drivers/i965/brw_curbe.c
@@ -47,7 +47,6 @@
  */
 static void calculate_curbe_offsets( struct brw_context *brw )
 {
-   GLcontext *ctx = &brw->intel.ctx;
    /* CACHE_NEW_WM_PROG */
    const GLuint nr_fp_regs = (brw->wm.prog_data->nr_params + 15) / 16;
    
@@ -157,7 +156,6 @@ static GLfloat fixed_plane[6][4] = {
  */
 static void prepare_constant_buffer(struct brw_context *brw)
 {
-   GLcontext *ctx = &brw->intel.ctx;
    const struct brw_vertex_program *vp =
       brw_vertex_program_const(brw->vertex_program);
    const struct brw_fragment_program *fp =
@@ -269,7 +267,7 @@ static void prepare_constant_buffer(struct brw_context *brw)
 	  (brw->curbe.need_new_bo ||
 	   brw->curbe.curbe_next_offset + bufsz > brw->curbe.curbe_bo->size))
       {
-	 dri_bo_unreference(brw->curbe.curbe_bo);
+	 brw->sws->bo_unreference(brw->curbe.curbe_bo);
 	 brw->curbe.curbe_bo = NULL;
       }
 
@@ -310,7 +308,6 @@ static void prepare_constant_buffer(struct brw_context *brw)
 
 static void emit_constant_buffer(struct brw_context *brw)
 {
-   struct intel_context *intel = &brw->intel;
    GLuint sz = brw->curbe.total_size;
 
    BEGIN_BATCH(2, IGNORE_CLIPRECTS);
diff --git a/src/gallium/drivers/i965/brw_draw.c b/src/gallium/drivers/i965/brw_draw.c
index 8cd117c24f..856999f3ef 100644
--- a/src/gallium/drivers/i965/brw_draw.c
+++ b/src/gallium/drivers/i965/brw_draw.c
@@ -26,15 +26,6 @@
  **************************************************************************/
 
 
-#include "main/glheader.h"
-#include "main/context.h"
-#include "main/state.h"
-#include "main/enums.h"
-#include "tnl/tnl.h"
-#include "vbo/vbo_context.h"
-#include "swrast/swrast.h"
-#include "swrast_setup/swrast_setup.h"
-
 #include "brw_draw.h"
 #include "brw_defines.h"
 #include "brw_context.h"
@@ -67,7 +58,6 @@ static uint32_t prim_to_hw_prim[PIPE_PRIM_POLYGON+1] = {
  */
 static GLuint brw_set_prim(struct brw_context *brw, GLenum prim)
 {
-   GLcontext *ctx = &brw->intel.ctx;
 
    if (INTEL_DEBUG & DEBUG_PRIMS)
       _mesa_printf("PRIM: %s\n", _mesa_lookup_enum_by_nr(prim));
@@ -110,7 +100,6 @@ static void brw_emit_prim(struct brw_context *brw,
 			  uint32_t hw_prim)
 {
    struct brw_3d_primitive prim_packet;
-   struct intel_context *intel = &brw->intel;
 
    if (INTEL_DEBUG & DEBUG_PRIMS)
       _mesa_printf("PRIM: %s %d %d\n", _mesa_lookup_enum_by_nr(prim->mode), 
@@ -163,7 +152,7 @@ static void brw_merge_inputs( struct brw_context *brw,
    GLuint i;
 
    for (i = 0; i < VERT_ATTRIB_MAX; i++)
-      dri_bo_unreference(brw->vb.inputs[i].bo);
+      brw->sws->bo_unreference(brw->vb.inputs[i].bo);
 
    memset(&brw->vb.inputs, 0, sizeof(brw->vb.inputs));
    memset(&brw->vb.info, 0, sizeof(brw->vb.info));
@@ -185,7 +174,7 @@ static void brw_merge_inputs( struct brw_context *brw,
 /* May fail if out of video memory for texture or vbo upload, or on
  * fallback conditions.
  */
-static GLboolean brw_try_draw_prims( GLcontext *ctx,
+static GLboolean brw_try_draw_prims( struct brw_context *brw,
 				     const struct gl_client_array *arrays[],
 				     const struct _mesa_prim *prim,
 				     GLuint nr_prims,
@@ -193,7 +182,6 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
 				     GLuint min_index,
 				     GLuint max_index )
 {
-   struct intel_context *intel = intel_context(ctx);
    struct brw_context *brw = brw_context(ctx);
    GLboolean retval = GL_FALSE;
    GLboolean warn = GL_FALSE;
@@ -241,7 +229,7 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
    return 0;
 }
 
-void brw_draw_prims( GLcontext *ctx,
+void brw_draw_prims( struct brw_context *brw,
 		     const struct gl_client_array *arrays[],
 		     const struct _mesa_prim *prim,
 		     GLuint nr_prims,
@@ -274,7 +262,6 @@ void brw_draw_prims( GLcontext *ctx,
 
 void brw_draw_init( struct brw_context *brw )
 {
-   GLcontext *ctx = &brw->intel.ctx;
    struct vbo_context *vbo = vbo_context(ctx);
 
    /* Register our drawing function: 
@@ -287,15 +274,15 @@ void brw_draw_destroy( struct brw_context *brw )
    int i;
 
    if (brw->vb.upload.bo != NULL) {
-      dri_bo_unreference(brw->vb.upload.bo);
+      brw->sws->bo_unreference(brw->vb.upload.bo);
       brw->vb.upload.bo = NULL;
    }
 
    for (i = 0; i < VERT_ATTRIB_MAX; i++) {
-      dri_bo_unreference(brw->vb.inputs[i].bo);
+      brw->sws->bo_unreference(brw->vb.inputs[i].bo);
       brw->vb.inputs[i].bo = NULL;
    }
 
-   dri_bo_unreference(brw->ib.bo);
+   brw->sws->bo_unreference(brw->ib.bo);
    brw->ib.bo = NULL;
 }
diff --git a/src/gallium/drivers/i965/brw_draw.h b/src/gallium/drivers/i965/brw_draw.h
index 2a14db217f..dc7ca8731d 100644
--- a/src/gallium/drivers/i965/brw_draw.h
+++ b/src/gallium/drivers/i965/brw_draw.h
@@ -28,13 +28,12 @@
 #ifndef BRW_DRAW_H
 #define BRW_DRAW_H
 
-#include "main/mtypes.h"		/* for GLcontext... */
-#include "vbo/vbo.h"
+#include "brw_types.h"
 
 struct brw_context;
 
 
-void brw_draw_prims( GLcontext *ctx,
+void brw_draw_prims( struct brw_context *brw,
 		     const struct gl_client_array *arrays[],
 		     const struct _mesa_prim *prims,
 		     GLuint nr_prims,
@@ -48,7 +47,7 @@ void brw_draw_destroy( struct brw_context *brw );
 
 /* brw_draw_current.c
  */
-void brw_init_current_values(GLcontext *ctx,
+void brw_init_current_values(struct brw_context *brw,
 			     struct gl_client_array *arrays);
 
 #endif
diff --git a/src/gallium/drivers/i965/brw_draw_upload.c b/src/gallium/drivers/i965/brw_draw_upload.c
index ad3ef6b7dd..dce015d79f 100644
--- a/src/gallium/drivers/i965/brw_draw_upload.c
+++ b/src/gallium/drivers/i965/brw_draw_upload.c
@@ -191,8 +191,6 @@ static unsigned get_index_type(int type)
 
 static boolean brw_prepare_vertices(struct brw_context *brw)
 {
-   GLcontext *ctx = &brw->intel.ctx;
-   struct intel_context *intel = intel_context(ctx);
    GLbitfield vs_inputs = brw->vs.prog_data->inputs_read; 
    GLuint i;
    const unsigned char *ptr = NULL;
@@ -210,15 +208,17 @@ static boolean brw_prepare_vertices(struct brw_context *brw)
 
 
-   for (i = 0; i < brw->vb.nr_enabled; i++) {
-      struct brw_vertex_element *input = brw->vb.enabled[i];
+   for (i = 0; i < brw->vb.num_vertex_buffer; i++) {
+      struct brw_vertex_buffer *vb = brw->vb.vertex_buffer[i];
+      unsigned size = (vb->stride == 0 ? 
+		       vb->size :
+		       vb->stride * (max_index + 1 - min_index));
 
-      input->element_size = get_size(input->glarray->Type) * input->glarray->Size;
 
       if (brw_is_user_buffer(vb)) {
-	 u_upload_buffer( brw->upload, 
+	 u_upload_buffer( brw->upload_vertex, 
 			  min_index * vb->stride,
-			  (max_index + 1 - min_index) * vb->stride,
+			  size,
 			  &offset,
 			  &buffer );
       }
@@ -226,20 +226,20 @@ static boolean brw_prepare_vertices(struct brw_context *brw)
       {
 	 offset = 0;
 	 buffer = vb->buffer;
-	 count = stride == 0 ? 1 : max_index + 1 - min_index;
       }
-
-      /* Named buffer object: Just reference its contents directly. */
-      dri_bo_unreference(input->bo);
-      input->bo = intel_bufferobj_buffer(intel, intel_buffer,
-					 INTEL_READ);
-      dri_bo_reference(input->bo);
-
+      
+      /* Set up post-upload info about this vertex buffer:
+       */
       input->offset = (unsigned long)offset;
       input->stride = vb->stride;
       input->count = count;
+      brw->sws->bo_unreference(input->bo);
+      input->bo = intel_bufferobj_buffer(intel, intel_buffer,
+					 INTEL_READ);
+      brw->sws->bo_reference(input->bo);
 
       assert(input->offset < input->bo->size);
+      assert(input->offset + size <= input->bo->size);
    }
 
    brw_prepare_query_begin(brw);
@@ -253,8 +253,6 @@ static boolean brw_prepare_vertices(struct brw_context *brw)
 
 static void brw_emit_vertices(struct brw_context *brw)
 {
-   GLcontext *ctx = &brw->intel.ctx;
-   struct intel_context *intel = intel_context(ctx);
    GLuint i;
 
    brw_emit_query_begin(brw);
@@ -370,11 +368,9 @@ const struct brw_tracked_state brw_vertices = {
 
 static void brw_prepare_indices(struct brw_context *brw)
 {
-   GLcontext *ctx = &brw->intel.ctx;
-   struct intel_context *intel = &brw->intel;
    const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
    GLuint ib_size;
-   dri_bo *bo = NULL;
+   struct brw_winsys_buffer *bo = NULL;
    struct gl_buffer_object *bufferobj;
    GLuint offset;
    GLuint ib_type_size;
@@ -421,7 +417,7 @@ static void brw_prepare_indices(struct brw_context *brw)
        } else {
 	  bo = intel_bufferobj_buffer(intel, intel_buffer_object(bufferobj),
 				      INTEL_READ);
-	  dri_bo_reference(bo);
+	  brw->sws->bo_reference(bo);
 
 	  /* Use CMD_3D_PRIM's start_vertex_offset to avoid re-uploading
 	   * the index buffer state when we're just moving the start index
@@ -461,7 +457,6 @@ const struct brw_tracked_state brw_indices = {
 
 static void brw_emit_index_buffer(struct brw_context *brw)
 {
-   struct intel_context *intel = &brw->intel;
    const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
 
    if (index_buffer == NULL)
diff --git a/src/gallium/drivers/i965/brw_eu_debug.c b/src/gallium/drivers/i965/brw_eu_debug.c
index 29f3f6d02f..ad7ec36e86 100644
--- a/src/gallium/drivers/i965/brw_eu_debug.c
+++ b/src/gallium/drivers/i965/brw_eu_debug.c
@@ -30,8 +30,6 @@
   */
     
 
-#include "main/mtypes.h"
-#include "main/imports.h"
 #include "brw_eu.h"
 
 void brw_print_reg( struct brw_reg hwreg )
diff --git a/src/gallium/drivers/i965/brw_gs.c b/src/gallium/drivers/i965/brw_gs.c
index 5ec0c585fe..58930e7964 100644
--- a/src/gallium/drivers/i965/brw_gs.c
+++ b/src/gallium/drivers/i965/brw_gs.c
@@ -29,10 +29,6 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
       
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
-
 #include "intel_batchbuffer.h"
 
 #include "brw_defines.h"
@@ -124,7 +120,7 @@ static void compile_gs_prog( struct brw_context *brw,
 
    /* Upload
     */
-   dri_bo_unreference(brw->gs.prog_bo);
+   brw->sws->bo_unreference(brw->gs.prog_bo);
    brw->gs.prog_bo = brw_upload_cache( &brw->cache, BRW_GS_PROG,
 				       &c.key, sizeof(c.key),
 				       NULL, 0,
@@ -180,7 +176,7 @@ static void prepare_gs_prog(struct brw_context *brw)
    }
 
    if (brw->gs.prog_active) {
-      dri_bo_unreference(brw->gs.prog_bo);
+      brw->sws->bo_unreference(brw->gs.prog_bo);
       brw->gs.prog_bo = brw_search_cache(&brw->cache, BRW_GS_PROG,
 					 &key, sizeof(key),
 					 NULL, 0,
diff --git a/src/gallium/drivers/i965/brw_gs_emit.c b/src/gallium/drivers/i965/brw_gs_emit.c
index a9b2aa2eac..9ec206d7e8 100644
--- a/src/gallium/drivers/i965/brw_gs_emit.c
+++ b/src/gallium/drivers/i965/brw_gs_emit.c
@@ -30,11 +30,6 @@
   */
  
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
-
-#include "shader/program.h"
 #include "intel_batchbuffer.h"
 
 #include "brw_defines.h"
diff --git a/src/gallium/drivers/i965/brw_gs_state.c b/src/gallium/drivers/i965/brw_gs_state.c
index ed9d2ffe60..6d03d72d96 100644
--- a/src/gallium/drivers/i965/brw_gs_state.c
+++ b/src/gallium/drivers/i965/brw_gs_state.c
@@ -34,7 +34,6 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "main/macros.h"
 
 struct brw_gs_unit_key {
    unsigned int total_grf;
@@ -69,11 +68,11 @@ gs_unit_populate_key(struct brw_context *brw, struct brw_gs_unit_key *key)
    key->urb_size = brw->urb.vsize;
 }
 
-static dri_bo *
+static struct brw_winsys_buffer *
 gs_unit_create_from_key(struct brw_context *brw, struct brw_gs_unit_key *key)
 {
    struct brw_gs_unit_state gs;
-   dri_bo *bo;
+   struct brw_winsys_buffer *bo;
 
    memset(&gs, 0, sizeof(gs));
 
@@ -128,7 +127,7 @@ static void prepare_gs_unit(struct brw_context *brw)
 
    gs_unit_populate_key(brw, &key);
 
-   dri_bo_unreference(brw->gs.state_bo);
+   brw->sws->bo_unreference(brw->gs.state_bo);
    brw->gs.state_bo = brw_search_cache(&brw->cache, BRW_GS_UNIT,
 				       &key, sizeof(key),
 				       &brw->gs.prog_bo, 1,
diff --git a/src/gallium/drivers/i965/brw_misc_state.c b/src/gallium/drivers/i965/brw_misc_state.c
index ea71857548..d33bf40a01 100644
--- a/src/gallium/drivers/i965/brw_misc_state.c
+++ b/src/gallium/drivers/i965/brw_misc_state.c
@@ -48,7 +48,6 @@
 
 static void upload_blend_constant_color(struct brw_context *brw)
 {
-   GLcontext *ctx = &brw->intel.ctx;
    struct brw_blend_constant_color bcc;
 
    memset(&bcc, 0, sizeof(bcc));      
@@ -75,17 +74,11 @@ const struct brw_tracked_state brw_blend_constant_color = {
 /* Constant single cliprect for framebuffer object or DRI2 drawing */
 static void upload_drawing_rect(struct brw_context *brw)
 {
-   struct intel_context *intel = &brw->intel;
-   GLcontext *ctx = &intel->ctx;
-
-   if (!intel->constant_cliprect)
-      return;
-
    BEGIN_BATCH(4, NO_LOOP_CLIPRECTS);
    OUT_BATCH(_3DSTATE_DRAWRECT_INFO_I965);
-   OUT_BATCH(0); /* xmin, ymin */
-   OUT_BATCH(((ctx->DrawBuffer->Width - 1) & 0xffff) |
-	    ((ctx->DrawBuffer->Height - 1) << 16));
+   OUT_BATCH(0); /* xmin, ymin */
+   OUT_BATCH(((brw->fb.width - 1) & 0xffff) |
+	    ((brw->fb.height - 1) << 16)); /* (width-1) in low 16 bits, (height-1) in high 16 bits */
    OUT_BATCH(0);
    ADVANCE_BATCH();
 }
@@ -114,8 +107,6 @@ static void prepare_binding_table_pointers(struct brw_context *brw)
  */
 static void upload_binding_table_pointers(struct brw_context *brw)
 {
-   struct intel_context *intel = &brw->intel;
-
    BEGIN_BATCH(6, IGNORE_CLIPRECTS);
    OUT_BATCH(CMD_BINDING_TABLE_PTRS << 16 | (6 - 2));
    if (brw->vs.bind_bo != NULL)
@@ -148,8 +139,6 @@ const struct brw_tracked_state brw_binding_table_pointers = {
  */
 static void upload_pipelined_state_pointers(struct brw_context *brw )
 {
-   struct intel_context *intel = &brw->intel;
-
    BEGIN_BATCH(7, IGNORE_CLIPRECTS);
    OUT_BATCH(CMD_PIPELINED_STATE_POINTERS << 16 | (7 - 2));
    OUT_RELOC(brw->vs.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
@@ -210,7 +199,6 @@ static void prepare_depthbuffer(struct brw_context *brw)
 
 static void emit_depthbuffer(struct brw_context *brw)
 {
-   struct intel_context *intel = &brw->intel;
    struct intel_region *region = brw->state.depth_region;
    unsigned int len = (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? 6 : 5;
 
@@ -287,7 +275,6 @@ const struct brw_tracked_state brw_depthbuffer = {
 
 static void upload_polygon_stipple(struct brw_context *brw)
 {
-   GLcontext *ctx = &brw->intel.ctx;
    struct brw_polygon_stipple bps;
    GLuint i;
 
@@ -401,7 +388,6 @@ const struct brw_tracked_state brw_aa_line_parameters = {
 
 static void upload_line_stipple(struct brw_context *brw)
 {
-   GLcontext *ctx = &brw->intel.ctx;
    struct brw_line_stipple bls;
    GLfloat tmp;
    GLint tmpi;
@@ -507,8 +493,6 @@ const struct brw_tracked_state brw_invarient_state = {
  */
 static void upload_state_base_address( struct brw_context *brw )
 {
-   struct intel_context *intel = &brw->intel;
-
    /* Output the structure (brw_state_base_address) directly to the
     * batchbuffer, so we can emit relocations inline.
     */
diff --git a/src/gallium/drivers/i965/brw_pipe_depth.c b/src/gallium/drivers/i965/brw_pipe_depth.c
index da29bc8bcb..29f135d37a 100644
--- a/src/gallium/drivers/i965/brw_pipe_depth.c
+++ b/src/gallium/drivers/i965/brw_pipe_depth.c
@@ -1,5 +1,9 @@
-   /* _NEW_STENCIL */
-   if (key->dsa.stencil[0].enable) {
+
+static void *
+brw_create_depth_stencil( struct pipe_context *pipe,
+			  const struct pipe_depth_stencil_alpha_state *tmpl )
+{
+   if (tmpl->stencil[0].enable) {
       cc.cc0.stencil_enable = 1;
       cc.cc0.stencil_func =
 	 intel_translate_compare_func(key->stencil_func[0]);
@@ -13,7 +17,7 @@
       cc.cc1.stencil_write_mask = key->stencil_write_mask[0];
       cc.cc1.stencil_test_mask = key->stencil_test_mask[0];
 
-      if (key->stencil_two_side) {
+      if (tmpl->stencil[1].enable) {
 	 cc.cc0.bf_stencil_enable = 1;
 	 cc.cc0.bf_stencil_func =
 	    intel_translate_compare_func(key->stencil_func[1]);
@@ -30,9 +34,8 @@
 
       /* Not really sure about this:
        */
-      if (key->stencil_write_mask[0] ||
-	  (key->stencil_two_side && key->stencil_write_mask[1]))
-	 cc.cc0.stencil_write_enable = 1;
+      cc.cc0.stencil_write_enable = (cc.cc1.stencil_write_mask ||
+				     cc.cc2.bf_stencil_write_mask);
    }
 
 
@@ -50,3 +53,6 @@
       cc.cc2.depth_test_function = intel_translate_compare_func(key->depth_func);
       cc.cc2.depth_write_enable = key->depth_write;
    }
+
+
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_fb.c b/src/gallium/drivers/i965/brw_pipe_fb.c
index d4ae332f46..dbf97a0544 100644
--- a/src/gallium/drivers/i965/brw_pipe_fb.c
+++ b/src/gallium/drivers/i965/brw_pipe_fb.c
@@ -2,12 +2,12 @@
 /**
  * called from intelDrawBuffer()
  */
-static void brw_set_draw_region( struct intel_context *intel, 
+static void brw_set_draw_region( struct pipe_context *pipe, 
                                  struct intel_region *color_regions[],
                                  struct intel_region *depth_region,
                                  GLuint num_color_regions)
 {
-   struct brw_context *brw = brw_context(&intel->ctx);
+   struct brw_context *brw = brw_context(pipe);
    GLuint i;
 
    /* release old color/depth regions */
diff --git a/src/gallium/drivers/i965/brw_pipe_flush.c b/src/gallium/drivers/i965/brw_pipe_flush.c
index 008f623151..d5b7bd3b83 100644
--- a/src/gallium/drivers/i965/brw_pipe_flush.c
+++ b/src/gallium/drivers/i965/brw_pipe_flush.c
@@ -13,10 +13,8 @@ static void brw_finish_batch(struct intel_context *intel)
 /**
  * called from intelFlushBatchLocked
  */
-static void brw_new_batch( struct intel_context *intel )
+static void brw_new_batch( struct brw_context *brw )
 {
-   struct brw_context *brw = brw_context(&intel->ctx);
-
    /* Check that we didn't just wrap our batchbuffer at a bad time. */
    assert(!brw->no_batch_wrap);
 
@@ -36,14 +34,14 @@ static void brw_new_batch( struct intel_context *intel )
     * a new buffer next time.
     */
    if (brw->vb.upload.bo != NULL) {
-      dri_bo_unreference(brw->vb.upload.bo);
+      brw->sws->bo_unreference(brw->vb.upload.bo);
       brw->vb.upload.bo = NULL;
       brw->vb.upload.offset = 0;
    }
 }
 
 
-static void brw_note_fence( struct intel_context *intel, GLuint fence )
+static void brw_note_fence( struct brw_context *brw, GLuint fence )
 {
    brw_context(&intel->ctx)->state.dirty.brw |= BRW_NEW_FENCE;
 }
diff --git a/src/gallium/drivers/i965/brw_pipe_query.c b/src/gallium/drivers/i965/brw_pipe_query.c
new file mode 100644
index 0000000000..0b9ba0c0ed
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_query.c
@@ -0,0 +1,246 @@
+/*
+ * Copyright © 2008 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *
+ */
+
+/** @file support for ARB_query_object
+ *
+ * ARB_query_object is implemented by using the PIPE_CONTROL command to stall
+ * execution on the completion of previous depth tests, and write the
+ * current PS_DEPTH_COUNT to a buffer object.
+ *
+ * We use before and after counts when drawing during a query so that
+ * we don't pick up other clients' query data in ours.  To reduce overhead,
+ * a single BO is used to record the query data for all active queries at
+ * once.  This also gives us a simple bound on how much batchbuffer space is
+ * required for handling queries, so that we can be sure that we won't
+ * have to emit a batchbuffer without getting the ending PS_DEPTH_COUNT.
+ */
+#include "util/u_simple_list.h"
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "intel_batchbuffer.h"
+#include "intel_reg.h"
+
+/** Waits on the query object's BO and totals the results for this query */
+static void
+brw_queryobj_get_results(struct brw_query_object *query)
+{
+   int i;
+   uint64_t *results;
+   /* No BO attached: nothing to total, so a repeated call is a no-op. */
+   if (query->bo == NULL)
+      return;
+
+   /* Map and count the pixels from the current query BO */
+   dri_bo_map(query->bo, GL_FALSE);
+   results = query->bo->virtual;
+   for (i = query->first_index; i <= query->last_index; i++) {
+      query->Base.Result += results[i * 2 + 1] - results[i * 2];   /* end count - begin count of slot i */
+   }
+   dri_bo_unmap(query->bo);
+   /* NOTE(review): no `brw` parameter or local is visible in this function -- confirm it resolves. */
+   brw->sws->bo_unreference(query->bo);
+   query->bo = NULL;
+}
+
+static struct pipe_query *
+brw_query_create(struct pipe_context *pipe, unsigned type )
+{
+   struct brw_query_object *query;
+   /* Only occlusion counters get an object.  Switch on `type`: `query` is still unset here. */
+   switch (type) {
+   case PIPE_QUERY_OCCLUSION_COUNTER:
+      query = CALLOC_STRUCT( brw_query_object );
+      if (query == NULL)
+	 return NULL;   /* allocation failed */
+      return &query->Base;
+      
+   default:
+      return NULL;   /* unsupported query type */
+   }
+}
+
+static void
+brw_query_destroy(struct pipe_context *pipe, struct pipe_query *q)
+{
+   struct brw_context *brw = brw_context(pipe);   /* supplies brw->sws used below */
+   struct brw_query_object *query = (struct brw_query_object *)q;
+   brw->sws->bo_unreference(query->bo);   /* drop this query's reference on its results BO */
+   FREE(query);
+}
+
+static void
+brw_begin_query(struct pipe_context *pipe, struct pipe_query *q)
+{
+   struct brw_context *brw = brw_context(pipe);
+   struct brw_query_object *query = (struct brw_query_object *)q;
+
+   /* Reset our driver's tracking of query state. */
+   brw->sws->bo_unreference(query->bo);
+   query->bo = NULL;
+   query->first_index = -1;   /* both indices are overwritten by brw_emit_query_begin() */
+   query->last_index = -1;
+
+   insert_at_head(&brw->query.active_head, query);   /* list walked by brw_emit_query_begin() */
+   brw->stats_wm++;   /* matched by the decrement in brw_end_query() */
+   brw->dirty.mesa |= PIPE_NEW_QUERY;
+}
+
+static void
+brw_end_query(struct pipe_context *pipe, struct pipe_query *q)
+{
+   struct brw_context *brw = brw_context(pipe);
+   struct brw_query_object *query = (struct brw_query_object *)q;
+   /* End query q and take it off the active list.
+    * Flush the batchbuffer in case it has writes to our query BO.
+    * Have later queries write to a new query BO so that further rendering
+    * doesn't delay the collection of our results.
+    */
+   if (query->bo) {   /* set by brw_emit_query_begin(), cleared by brw_begin_query() */
+      brw_emit_query_end(brw);   /* returns early unless brw->query.active is set */
+      intel_batchbuffer_flush(brw->batch);
+
+      brw->sws->bo_unreference(brw->query.bo);
+      brw->query.bo = NULL;   /* brw_prepare_query_begin() allocates a fresh BO when this is NULL */
+   }
+
+   remove_from_list(query);
+   brw->stats_wm--;   /* undoes the increment in brw_begin_query() */
+   brw->dirty.mesa |= PIPE_NEW_QUERY;
+}
+
+static void brw_wait_query(struct pipe_context *pipe, struct pipe_query *q)
+{
+   struct brw_query_object *query = (struct brw_query_object *)q;
+   /* Total the results without a busy check (contrast brw_check_query), then mark the query ready. */
+   brw_queryobj_get_results(query);
+   query->Base.Ready = GL_TRUE;
+}
+
+static void brw_check_query(struct pipe_context *pipe, struct pipe_query *q)
+{
+   struct brw_query_object *query = (struct brw_query_object *)q;
+   /* Collect results only when no BO is attached or the BO is no longer busy; else Ready is left as-is. */
+   if (query->bo == NULL || !drm_intel_bo_busy(query->bo)) {
+      brw_queryobj_get_results(query);
+      query->Base.Ready = GL_TRUE;
+   }
+}
+
+/** Called to set up the query BO and account for its aperture space */
+void
+brw_prepare_query_begin(struct brw_context *brw)
+{
+   /* Skip if we're not doing any queries. */
+   if (is_empty_list(&brw->query.active_head))
+      return;
+
+   /* Get a new query BO if we're going to need it. */
+   if (brw->query.bo == NULL ||
+       brw->query.index * 2 + 1 >= 4096 / sizeof(uint64_t)) {   /* end slot of the next pair would fall outside the 4096-byte BO */
+      brw->sws->bo_unreference(brw->query.bo);
+      brw->query.bo = NULL;
+
+      brw->query.bo = dri_bo_alloc(brw->bufmgr, "query", 4096, 1);
+      brw->query.index = 0;   /* slot numbering restarts with the new BO */
+   }
+
+   brw_add_validated_bo(brw, brw->query.bo);
+}
+
+/** Called just before primitive drawing to get a beginning PS_DEPTH_COUNT. */
+void
+brw_emit_query_begin(struct brw_context *brw)
+{
+   struct brw_query_object *query;
+
+   /* Skip if we're not doing any queries, or we've emitted the start. */
+   if (brw->query.active || is_empty_list(&brw->query.active_head))
+      return;
+
+   BEGIN_BATCH(4, IGNORE_CLIPRECTS);
+   OUT_BATCH(_3DSTATE_PIPE_CONTROL |
+	     PIPE_CONTROL_DEPTH_STALL |
+	     PIPE_CONTROL_WRITE_DEPTH_COUNT);
+   /* This object could be mapped cacheable, but we don't have an exposed
+    * mechanism to support that.  Since it's going uncached, tell GEM that
+    * we're writing to it.  The usual clflush should be all that's required
+    * to pick up the results.
+    */
+   OUT_RELOC(brw->query.bo,
+	     I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+	     PIPE_CONTROL_GLOBAL_GTT_WRITE |
+	     ((brw->query.index * 2) * sizeof(uint64_t)));   /* even slot of the pair: begin count */
+   OUT_BATCH(0);
+   OUT_BATCH(0);
+   ADVANCE_BATCH();
+   /* Point every active query at the current BO and extend its slot range to the current index. */
+   foreach(query, &brw->query.active_head) {
+      if (query->bo != brw->query.bo) {
+	 if (query->bo != NULL)   /* moving off an older BO: total its results first */
+	    brw_queryobj_get_results(query);
+	 brw->sws->bo_reference(brw->query.bo);
+	 query->bo = brw->query.bo;
+	 query->first_index = brw->query.index;
+      }
+      query->last_index = brw->query.index;
+   }
+   brw->query.active = GL_TRUE;   /* cleared again by brw_emit_query_end() */
+}
+
+/** Called at batchbuffer flush to get an ending PS_DEPTH_COUNT */
+void
+brw_emit_query_end(struct brw_context *brw)
+{
+   if (!brw->query.active)   /* no begin count outstanding, nothing to close */
+      return;
+
+   BEGIN_BATCH(4, IGNORE_CLIPRECTS);
+   OUT_BATCH(_3DSTATE_PIPE_CONTROL |
+	     PIPE_CONTROL_DEPTH_STALL |
+	     PIPE_CONTROL_WRITE_DEPTH_COUNT);
+   OUT_RELOC(brw->query.bo,
+	     I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+	     PIPE_CONTROL_GLOBAL_GTT_WRITE |
+	     ((brw->query.index * 2 + 1) * sizeof(uint64_t)));   /* odd slot of the pair: end count */
+   OUT_BATCH(0);
+   OUT_BATCH(0);
+   ADVANCE_BATCH();
+
+   brw->query.active = GL_FALSE;
+   brw->query.index++;   /* the next begin/end pair uses the following slot */
+}
+
+void brw_init_queryobj_functions(struct dd_function_table *functions)
+{  /* Stores this file's query callbacks into the given function table. */
+   functions->NewQueryObject = brw_new_query_object;   /* NOTE(review): not defined in this file (cf. brw_query_create) -- confirm */
+   functions->DeleteQuery = brw_delete_query;   /* NOTE(review): not defined in this file (cf. brw_query_destroy) -- confirm */
+   functions->BeginQuery = brw_begin_query;
+   functions->EndQuery = brw_end_query;
+   functions->CheckQuery = brw_check_query;
+   functions->WaitQuery = brw_wait_query;
+}
diff --git a/src/gallium/drivers/i965/brw_program.c b/src/gallium/drivers/i965/brw_program.c
deleted file mode 100644
index bac69187c1..0000000000
--- a/src/gallium/drivers/i965/brw_program.c
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
- 
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- 
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
- 
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- 
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-  
-#include "main/imports.h"
-#include "main/enums.h"
-#include "shader/prog_parameter.h"
-#include "shader/program.h"
-#include "shader/programopt.h"
-#include "tnl/tnl.h"
-
-#include "brw_context.h"
-#include "brw_util.h"
-#include "brw_wm.h"
-
-static void brwBindProgram( GLcontext *ctx,
-			    GLenum target, 
-			    struct gl_program *prog )
-{
-   struct brw_context *brw = brw_context(ctx);
-
-   switch (target) {
-   case GL_VERTEX_PROGRAM_ARB: 
-      brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM;
-      break;
-   case GL_FRAGMENT_PROGRAM_ARB:
-      brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM;
-      break;
-   }
-}
-
-static struct gl_program *brwNewProgram( GLcontext *ctx,
-				      GLenum target, 
-				      GLuint id )
-{
-   struct brw_context *brw = brw_context(ctx);
-
-   switch (target) {
-   case GL_VERTEX_PROGRAM_ARB: {
-      struct brw_vertex_program *prog = CALLOC_STRUCT(brw_vertex_program);
-      if (prog) {
-	 prog->id = brw->program_id++;
-
-	 return _mesa_init_vertex_program( ctx, &prog->program,
-					     target, id );
-      }
-      else
-	 return NULL;
-   }
-
-   case GL_FRAGMENT_PROGRAM_ARB: {
-      struct brw_fragment_program *prog = CALLOC_STRUCT(brw_fragment_program);
-      if (prog) {
-	 prog->id = brw->program_id++;
-
-	 return _mesa_init_fragment_program( ctx, &prog->program,
-					     target, id );
-      }
-      else
-	 return NULL;
-   }
-
-   default:
-      return _mesa_new_program(ctx, target, id);
-   }
-}
-
-static void brwDeleteProgram( GLcontext *ctx,
-			      struct gl_program *prog )
-{
-   if (prog->Target == GL_FRAGMENT_PROGRAM_ARB) {
-      struct gl_fragment_program *fprog = (struct gl_fragment_program *) prog;
-      struct brw_fragment_program *brw_fprog = brw_fragment_program(fprog);
-      dri_bo_unreference(brw_fprog->const_buffer);
-   }
-
-   _mesa_delete_program( ctx, prog );
-}
-
-
-static GLboolean brwIsProgramNative( GLcontext *ctx,
-				     GLenum target, 
-				     struct gl_program *prog )
-{
-   return GL_TRUE;
-}
-
-static void brwProgramStringNotify( GLcontext *ctx,
-				    GLenum target,
-				    struct gl_program *prog )
-{
-   struct brw_context *brw = brw_context(ctx);
-
-   if (target == GL_FRAGMENT_PROGRAM_ARB) {
-      struct gl_fragment_program *fprog = (struct gl_fragment_program *) prog;
-      struct brw_fragment_program *newFP = brw_fragment_program(fprog);
-      const struct brw_fragment_program *curFP =
-         brw_fragment_program_const(brw->fragment_program);
-
-      if (fprog->FogOption) {
-         _mesa_append_fog_code(ctx, fprog);
-         fprog->FogOption = GL_NONE;
-      }
-
-      if (newFP == curFP)
-	 brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM;
-      newFP->id = brw->program_id++;      
-      newFP->isGLSL = brw_wm_is_glsl(fprog);
-   }
-   else if (target == GL_VERTEX_PROGRAM_ARB) {
-      struct gl_vertex_program *vprog = (struct gl_vertex_program *) prog;
-      struct brw_vertex_program *newVP = brw_vertex_program(vprog);
-      const struct brw_vertex_program *curVP =
-         brw_vertex_program_const(brw->vertex_program);
-
-      if (newVP == curVP)
-	 brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM;
-      if (newVP->program.IsPositionInvariant) {
-	 _mesa_insert_mvp_code(ctx, &newVP->program);
-      }
-      newVP->id = brw->program_id++;      
-
-      /* Also tell tnl about it:
-       */
-      _tnl_program_string(ctx, target, prog);
-   }
-}
-
-void brwInitFragProgFuncs( struct dd_function_table *functions )
-{
-   assert(functions->ProgramStringNotify == _tnl_program_string); 
-
-   functions->BindProgram = brwBindProgram;
-   functions->NewProgram = brwNewProgram;
-   functions->DeleteProgram = brwDeleteProgram;
-   functions->IsProgramNative = brwIsProgramNative;
-   functions->ProgramStringNotify = brwProgramStringNotify;
-}
-
diff --git a/src/gallium/drivers/i965/brw_queryobj.c b/src/gallium/drivers/i965/brw_queryobj.c
deleted file mode 100644
index a195bc32b0..0000000000
--- a/src/gallium/drivers/i965/brw_queryobj.c
+++ /dev/null
@@ -1,254 +0,0 @@
-/*
- * Copyright © 2008 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Authors:
- *    Eric Anholt <eric@anholt.net>
- *
- */
-
-/** @file support for ARB_query_object
- *
- * ARB_query_object is implemented by using the PIPE_CONTROL command to stall
- * execution on the completion of previous depth tests, and write the
- * current PS_DEPTH_COUNT to a buffer object.
- *
- * We use before and after counts when drawing during a query so that
- * we don't pick up other clients' query data in ours.  To reduce overhead,
- * a single BO is used to record the query data for all active queries at
- * once.  This also gives us a simple bound on how much batchbuffer space is
- * required for handling queries, so that we can be sure that we won't
- * have to emit a batchbuffer without getting the ending PS_DEPTH_COUNT.
- */
-#include "main/simple_list.h"
-#include "main/imports.h"
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "intel_batchbuffer.h"
-#include "intel_reg.h"
-
-/** Waits on the query object's BO and totals the results for this query */
-static void
-brw_queryobj_get_results(struct brw_query_object *query)
-{
-   int i;
-   uint64_t *results;
-
-   if (query->bo == NULL)
-      return;
-
-   /* Map and count the pixels from the current query BO */
-   dri_bo_map(query->bo, GL_FALSE);
-   results = query->bo->virtual;
-   for (i = query->first_index; i <= query->last_index; i++) {
-      query->Base.Result += results[i * 2 + 1] - results[i * 2];
-   }
-   dri_bo_unmap(query->bo);
-
-   dri_bo_unreference(query->bo);
-   query->bo = NULL;
-}
-
-static struct gl_query_object *
-brw_new_query_object(GLcontext *ctx, GLuint id)
-{
-   struct brw_query_object *query;
-
-   query = _mesa_calloc(sizeof(struct brw_query_object));
-
-   query->Base.Id = id;
-   query->Base.Result = 0;
-   query->Base.Active = GL_FALSE;
-   query->Base.Ready = GL_TRUE;
-
-   return &query->Base;
-}
-
-static void
-brw_delete_query(GLcontext *ctx, struct gl_query_object *q)
-{
-   struct brw_query_object *query = (struct brw_query_object *)q;
-
-   dri_bo_unreference(query->bo);
-   _mesa_free(query);
-}
-
-static void
-brw_begin_query(GLcontext *ctx, struct gl_query_object *q)
-{
-   struct brw_context *brw = brw_context(ctx);
-   struct intel_context *intel = intel_context(ctx);
-   struct brw_query_object *query = (struct brw_query_object *)q;
-
-   /* Reset our driver's tracking of query state. */
-   dri_bo_unreference(query->bo);
-   query->bo = NULL;
-   query->first_index = -1;
-   query->last_index = -1;
-
-   insert_at_head(&brw->query.active_head, query);
-   intel->stats_wm++;
-}
-
-/**
- * Begin the ARB_occlusion_query query on a query object.
- */
-static void
-brw_end_query(GLcontext *ctx, struct gl_query_object *q)
-{
-   struct brw_context *brw = brw_context(ctx);
-   struct intel_context *intel = intel_context(ctx);
-   struct brw_query_object *query = (struct brw_query_object *)q;
-
-   /* Flush the batchbuffer in case it has writes to our query BO.
-    * Have later queries write to a new query BO so that further rendering
-    * doesn't delay the collection of our results.
-    */
-   if (query->bo) {
-      brw_emit_query_end(brw);
-      intel_batchbuffer_flush(intel->batch);
-
-      dri_bo_unreference(brw->query.bo);
-      brw->query.bo = NULL;
-   }
-
-   remove_from_list(query);
-
-   intel->stats_wm--;
-}
-
-static void brw_wait_query(GLcontext *ctx, struct gl_query_object *q)
-{
-   struct brw_query_object *query = (struct brw_query_object *)q;
-
-   brw_queryobj_get_results(query);
-   query->Base.Ready = GL_TRUE;
-}
-
-static void brw_check_query(GLcontext *ctx, struct gl_query_object *q)
-{
-   struct brw_query_object *query = (struct brw_query_object *)q;
-
-   if (query->bo == NULL || !drm_intel_bo_busy(query->bo)) {
-      brw_queryobj_get_results(query);
-      query->Base.Ready = GL_TRUE;
-   }
-}
-
-/** Called to set up the query BO and account for its aperture space */
-void
-brw_prepare_query_begin(struct brw_context *brw)
-{
-   struct intel_context *intel = &brw->intel;
-
-   /* Skip if we're not doing any queries. */
-   if (is_empty_list(&brw->query.active_head))
-      return;
-
-   /* Get a new query BO if we're going to need it. */
-   if (brw->query.bo == NULL ||
-       brw->query.index * 2 + 1 >= 4096 / sizeof(uint64_t)) {
-      dri_bo_unreference(brw->query.bo);
-      brw->query.bo = NULL;
-
-      brw->query.bo = dri_bo_alloc(intel->bufmgr, "query", 4096, 1);
-      brw->query.index = 0;
-   }
-
-   brw_add_validated_bo(brw, brw->query.bo);
-}
-
-/** Called just before primitive drawing to get a beginning PS_DEPTH_COUNT. */
-void
-brw_emit_query_begin(struct brw_context *brw)
-{
-   struct intel_context *intel = &brw->intel;
-   struct brw_query_object *query;
-
-   /* Skip if we're not doing any queries, or we've emitted the start. */
-   if (brw->query.active || is_empty_list(&brw->query.active_head))
-      return;
-
-   BEGIN_BATCH(4, IGNORE_CLIPRECTS);
-   OUT_BATCH(_3DSTATE_PIPE_CONTROL |
-	     PIPE_CONTROL_DEPTH_STALL |
-	     PIPE_CONTROL_WRITE_DEPTH_COUNT);
-   /* This object could be mapped cacheable, but we don't have an exposed
-    * mechanism to support that.  Since it's going uncached, tell GEM that
-    * we're writing to it.  The usual clflush should be all that's required
-    * to pick up the results.
-    */
-   OUT_RELOC(brw->query.bo,
-	     I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
-	     PIPE_CONTROL_GLOBAL_GTT_WRITE |
-	     ((brw->query.index * 2) * sizeof(uint64_t)));
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   ADVANCE_BATCH();
-
-   foreach(query, &brw->query.active_head) {
-      if (query->bo != brw->query.bo) {
-	 if (query->bo != NULL)
-	    brw_queryobj_get_results(query);
-	 dri_bo_reference(brw->query.bo);
-	 query->bo = brw->query.bo;
-	 query->first_index = brw->query.index;
-      }
-      query->last_index = brw->query.index;
-   }
-   brw->query.active = GL_TRUE;
-}
-
-/** Called at batchbuffer flush to get an ending PS_DEPTH_COUNT */
-void
-brw_emit_query_end(struct brw_context *brw)
-{
-   struct intel_context *intel = &brw->intel;
-
-   if (!brw->query.active)
-      return;
-
-   BEGIN_BATCH(4, IGNORE_CLIPRECTS);
-   OUT_BATCH(_3DSTATE_PIPE_CONTROL |
-	     PIPE_CONTROL_DEPTH_STALL |
-	     PIPE_CONTROL_WRITE_DEPTH_COUNT);
-   OUT_RELOC(brw->query.bo,
-	     I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
-	     PIPE_CONTROL_GLOBAL_GTT_WRITE |
-	     ((brw->query.index * 2 + 1) * sizeof(uint64_t)));
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   ADVANCE_BATCH();
-
-   brw->query.active = GL_FALSE;
-   brw->query.index++;
-}
-
-void brw_init_queryobj_functions(struct dd_function_table *functions)
-{
-   functions->NewQueryObject = brw_new_query_object;
-   functions->DeleteQuery = brw_delete_query;
-   functions->BeginQuery = brw_begin_query;
-   functions->EndQuery = brw_end_query;
-   functions->CheckQuery = brw_check_query;
-   functions->WaitQuery = brw_wait_query;
-}
diff --git a/src/gallium/drivers/i965/brw_sf.c b/src/gallium/drivers/i965/brw_sf.c
index 90513245ee..0115f77c08 100644
--- a/src/gallium/drivers/i965/brw_sf.c
+++ b/src/gallium/drivers/i965/brw_sf.c
@@ -30,10 +30,6 @@
   */
   
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
-
 #include "intel_batchbuffer.h"
 
 #include "brw_defines.h"
@@ -46,7 +42,6 @@
 static void compile_sf_prog( struct brw_context *brw,
 			     struct brw_sf_prog_key *key )
 {
-   GLcontext *ctx = &brw->intel.ctx;
    struct brw_sf_compile c;
    const GLuint *program;
    GLuint program_size;
@@ -116,7 +111,7 @@ static void compile_sf_prog( struct brw_context *brw,
 
    /* Upload
     */
-   dri_bo_unreference(brw->sf.prog_bo);
+   brw->sws->bo_unreference(brw->sf.prog_bo);
    brw->sf.prog_bo = brw_upload_cache( &brw->cache, BRW_SF_PROG,
 				       &c.key, sizeof(c.key),
 				       NULL, 0,
@@ -129,7 +124,6 @@ static void compile_sf_prog( struct brw_context *brw,
  */
 static void upload_sf_prog(struct brw_context *brw)
 {
-   GLcontext *ctx = &brw->intel.ctx;
    struct brw_sf_prog_key key;
 
    memset(&key, 0, sizeof(key));
@@ -167,7 +161,7 @@ static void upload_sf_prog(struct brw_context *brw)
    key.do_twoside_color = (ctx->Light.Enabled && ctx->Light.Model.TwoSide);
 
    /* _NEW_HINT */
-   key.linear_color = (ctx->Hint.PerspectiveCorrection == GL_FASTEST);
+   key.linear_color = 0;
 
    /* _NEW_POLYGON */
    if (key.do_twoside_color) {
@@ -179,7 +173,7 @@ static void upload_sf_prog(struct brw_context *brw)
       key.frontface_ccw = (ctx->Polygon.FrontFace == GL_CCW) ^ (ctx->DrawBuffer->Name != 0);
    }
 
-   dri_bo_unreference(brw->sf.prog_bo);
+   brw->sws->bo_unreference(brw->sf.prog_bo);
    brw->sf.prog_bo = brw_search_cache(&brw->cache, BRW_SF_PROG,
 				      &key, sizeof(key),
 				      NULL, 0,
diff --git a/src/gallium/drivers/i965/brw_sf.h b/src/gallium/drivers/i965/brw_sf.h
index 6426b6df9f..26c2e8891a 100644
--- a/src/gallium/drivers/i965/brw_sf.h
+++ b/src/gallium/drivers/i965/brw_sf.h
@@ -45,19 +45,23 @@
 #define SF_UNFILLED_TRIS   3
 
 struct brw_sf_prog_key {
-   GLuint attrs:32;
+
+   /* Bitmask of linear and perspective interpolated inputs, 0..nr
+    */
+   GLuint persp_attrs:32;
+   GLuint linear_attrs:32;
+
    GLuint primitive:2;
    GLuint do_twoside_color:1;
    GLuint do_flat_shading:1;
    GLuint frontface_ccw:1;
    GLuint do_point_sprite:1;
-   GLuint linear_color:1;  /**< linear interp vs. perspective interp */
+   GLuint sprite_origin_lower_left:1;
    GLuint pad:25;
-   GLenum SpriteOrigin;
 };
 
 struct brw_sf_point_tex {
-	GLboolean CoordReplace;	
+   GLboolean CoordReplace;	
 };
 
 struct brw_sf_compile {
diff --git a/src/gallium/drivers/i965/brw_sf_emit.c b/src/gallium/drivers/i965/brw_sf_emit.c
index 4cc427a935..c98d7ec13a 100644
--- a/src/gallium/drivers/i965/brw_sf_emit.c
+++ b/src/gallium/drivers/i965/brw_sf_emit.c
@@ -30,10 +30,6 @@
   */
    
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
-
 #include "intel_batchbuffer.h"
 
 #include "brw_defines.h"
@@ -305,6 +301,10 @@ static void invert_det( struct brw_sf_compile *c)
 }
 
 
+/* Two attributes packed into a wide register.  Figure out if either
+ * or both of them need linear/perspective interpolation.  Constant
+ * regs are left as-is.
+ */
 static GLboolean calculate_masks( struct brw_sf_compile *c,
 				  GLuint reg,
 				  GLushort *pc,
@@ -312,20 +312,8 @@ static GLboolean calculate_masks( struct brw_sf_compile *c,
 				  GLushort *pc_linear)
 {
    GLboolean is_last_attr = (reg == c->nr_setup_regs - 1);
-   GLuint persp_mask;
-   GLuint linear_mask;
-
-   if (c->key.do_flat_shading || c->key.linear_color)
-      persp_mask = c->key.attrs & ~(FRAG_BIT_WPOS |
-                                    FRAG_BIT_COL0 |
-                                    FRAG_BIT_COL1);
-   else
-      persp_mask = c->key.attrs & ~(FRAG_BIT_WPOS);
-
-   if (c->key.do_flat_shading)
-      linear_mask = c->key.attrs & ~(FRAG_BIT_COL0|FRAG_BIT_COL1);
-   else
-      linear_mask = c->key.attrs;
+   GLuint persp_mask = c->key.persp_attrs;
+   GLuint linear_mask = c->key.linear_attrs;
 
    *pc_persp = 0;
    *pc_linear = 0;
@@ -570,7 +558,7 @@ void brw_emit_point_sprite_setup( struct brw_sf_compile *c, GLboolean allocate)
       {
 	 brw_set_predicate_control_flag_value(p, pc); 
 	 if (tex->CoordReplace) {
-	     if (c->key.SpriteOrigin == GL_LOWER_LEFT) {
+	     if (c->key.sprite_origin_lower_left) {
 		 brw_MUL(p, c->m3C0, c->inv_w[0], brw_imm_f(1.0));
 		 brw_MOV(p, vec1(suboffset(c->m3C0, 0)), brw_imm_f(0.0));
 	     }
diff --git a/src/gallium/drivers/i965/brw_sf_state.c b/src/gallium/drivers/i965/brw_sf_state.c
index bc0f076073..5e1229d22f 100644
--- a/src/gallium/drivers/i965/brw_sf_state.c
+++ b/src/gallium/drivers/i965/brw_sf_state.c
@@ -34,12 +34,9 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "main/macros.h"
-#include "intel_fbo.h"
 
 static void upload_sf_vp(struct brw_context *brw)
 {
-   GLcontext *ctx = &brw->intel.ctx;
    const GLfloat depth_scale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
    struct brw_sf_viewport sfv;
    GLfloat y_scale, y_bias;
@@ -92,7 +89,7 @@ static void upload_sf_vp(struct brw_context *brw)
       sfv.scissor.ymax = ctx->DrawBuffer->Height - ctx->DrawBuffer->_Ymin - 1;
    }
 
-   dri_bo_unreference(brw->sf.vp_bo);
+   brw->sws->bo_unreference(brw->sf.vp_bo);
    brw->sf.vp_bo = brw_cache_data( &brw->cache, BRW_SF_VP, &sfv, NULL, 0 );
 }
 
@@ -126,7 +123,6 @@ struct brw_sf_unit_key {
 static void
 sf_unit_populate_key(struct brw_context *brw, struct brw_sf_unit_key *key)
 {
-   GLcontext *ctx = &brw->intel.ctx;
    memset(key, 0, sizeof(*key));
 
    /* CACHE_NEW_SF_PROG */
@@ -159,12 +155,12 @@ sf_unit_populate_key(struct brw_context *brw, struct brw_sf_unit_key *key)
    key->render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0;
 }
 
-static dri_bo *
+static struct brw_winsys_buffer *
 sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
-			dri_bo **reloc_bufs)
+			struct brw_winsys_buffer **reloc_bufs)
 {
    struct brw_sf_unit_state sf;
-   dri_bo *bo;
+   struct brw_winsys_buffer *bo;
    int chipset_max_threads;
    memset(&sf, 0, sizeof(sf));
 
@@ -332,14 +328,14 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
 static void upload_sf_unit( struct brw_context *brw )
 {
    struct brw_sf_unit_key key;
-   dri_bo *reloc_bufs[2];
+   struct brw_winsys_buffer *reloc_bufs[2];
 
    sf_unit_populate_key(brw, &key);
 
    reloc_bufs[0] = brw->sf.prog_bo;
    reloc_bufs[1] = brw->sf.vp_bo;
 
-   dri_bo_unreference(brw->sf.state_bo);
+   brw->sws->bo_unreference(brw->sf.state_bo);
    brw->sf.state_bo = brw_search_cache(&brw->cache, BRW_SF_UNIT,
 				       &key, sizeof(key),
 				       reloc_bufs, 2,
diff --git a/src/gallium/drivers/i965/brw_state.h b/src/gallium/drivers/i965/brw_state.h
index d639656b9d..a007d542d0 100644
--- a/src/gallium/drivers/i965/brw_state.h
+++ b/src/gallium/drivers/i965/brw_state.h
@@ -36,12 +36,12 @@
 #include "brw_context.h"
 
 static inline void
-brw_add_validated_bo(struct brw_context *brw, dri_bo *bo)
+brw_add_validated_bo(struct brw_context *brw, struct brw_winsys_buffer *bo)
 {
    assert(brw->state.validated_bo_count < ARRAY_SIZE(brw->state.validated_bos));
 
    if (bo != NULL) {
-      dri_bo_reference(bo);
+      brw->sws->bo_reference(bo);
       brw->state.validated_bos[brw->state.validated_bo_count++] = bo;
    }
 };
@@ -95,9 +95,9 @@ const struct brw_tracked_state brw_index_buffer;
  * Use same key for WM and VS surfaces.
  */
 struct brw_surface_key {
-   GLenum target, depthmode;
-   dri_bo *bo;
-   GLint format, internal_format;
+   unsigned target;
+   struct brw_winsys_buffer *bo;
+   GLint format;
    GLint first_level, last_level;
    GLint width, height, depth;
    GLint pitch, cpp;
@@ -116,42 +116,42 @@ void brw_destroy_state(struct brw_context *brw);
 /***********************************************************************
  * brw_state_cache.c
  */
-dri_bo *brw_cache_data(struct brw_cache *cache,
+struct brw_winsys_buffer *brw_cache_data(struct brw_cache *cache,
 		       enum brw_cache_id cache_id,
 		       const void *data,
-		       dri_bo **reloc_bufs,
+		       struct brw_winsys_buffer **reloc_bufs,
 		       GLuint nr_reloc_bufs);
 
-dri_bo *brw_cache_data_sz(struct brw_cache *cache,
+struct brw_winsys_buffer *brw_cache_data_sz(struct brw_cache *cache,
 			  enum brw_cache_id cache_id,
 			  const void *data,
 			  GLuint data_size,
-			  dri_bo **reloc_bufs,
+			  struct brw_winsys_buffer **reloc_bufs,
 			  GLuint nr_reloc_bufs);
 
-dri_bo *brw_upload_cache( struct brw_cache *cache,
+struct brw_winsys_buffer *brw_upload_cache( struct brw_cache *cache,
 			  enum brw_cache_id cache_id,
 			  const void *key,
 			  GLuint key_sz,
-			  dri_bo **reloc_bufs,
+			  struct brw_winsys_buffer **reloc_bufs,
 			  GLuint nr_reloc_bufs,
 			  const void *data,
 			  GLuint data_sz,
 			  const void *aux,
 			  void *aux_return );
 
-dri_bo *brw_search_cache( struct brw_cache *cache,
+struct brw_winsys_buffer *brw_search_cache( struct brw_cache *cache,
 			  enum brw_cache_id cache_id,
 			  const void *key,
 			  GLuint key_size,
-			  dri_bo **reloc_bufs,
+			  struct brw_winsys_buffer **reloc_bufs,
 			  GLuint nr_reloc_bufs,
 			  void *aux_return);
 void brw_state_cache_check_size( struct brw_context *brw );
 
 void brw_init_caches( struct brw_context *brw );
 void brw_destroy_caches( struct brw_context *brw );
-void brw_state_cache_bo_delete(struct brw_cache *cache, dri_bo *bo);
+void brw_state_cache_bo_delete(struct brw_cache *cache, struct brw_winsys_buffer *bo);
 
 /***********************************************************************
  * brw_state_batch.c
@@ -166,7 +166,7 @@ void brw_destroy_batch_cache( struct brw_context *brw );
 void brw_clear_batch_cache( struct brw_context *brw );
 
 /* brw_wm_surface_state.c */
-dri_bo *
+struct brw_winsys_buffer *
 brw_create_constant_surface( struct brw_context *brw,
                              struct brw_surface_key *key );
 
diff --git a/src/gallium/drivers/i965/brw_state_batch.c b/src/gallium/drivers/i965/brw_state_batch.c
index 7821898cf9..9568794625 100644
--- a/src/gallium/drivers/i965/brw_state_batch.c
+++ b/src/gallium/drivers/i965/brw_state_batch.c
@@ -33,7 +33,6 @@
 
 #include "brw_state.h"
 #include "intel_batchbuffer.h"
-#include "main/imports.h"
 
 
diff --git a/src/gallium/drivers/i965/brw_state_cache.c b/src/gallium/drivers/i965/brw_state_cache.c
index c262e1db8b..91d0f80297 100644
--- a/src/gallium/drivers/i965/brw_state_cache.c
+++ b/src/gallium/drivers/i965/brw_state_cache.c
@@ -56,7 +56,6 @@
  * incorrect program is run for the other instance.
  */
 
-#include "main/imports.h"
 #include "brw_state.h"
 #include "intel_batchbuffer.h"
 
@@ -72,7 +71,7 @@
 
 static GLuint
 hash_key(const void *key, GLuint key_size,
-         dri_bo **reloc_bufs, GLuint nr_reloc_bufs)
+         struct brw_winsys_buffer **reloc_bufs, GLuint nr_reloc_bufs)
 {
    GLuint *ikey = (GLuint *)key;
    GLuint hash = 0, i;
@@ -88,7 +87,7 @@ hash_key(const void *key, GLuint key_size,
 
    /* Include the BO pointers as key data as well */
    ikey = (GLuint *)reloc_bufs;
-   key_size = nr_reloc_bufs * sizeof(dri_bo *);
+   key_size = nr_reloc_bufs * sizeof(struct brw_winsys_buffer *);
    for (i = 0; i < key_size/4; i++) {
       hash ^= ikey[i];
       hash = (hash << 5) | (hash >> 27);
@@ -103,14 +102,14 @@ hash_key(const void *key, GLuint key_size,
  */
 static void
 update_cache_last(struct brw_cache *cache, enum brw_cache_id cache_id,
-		  dri_bo *bo)
+		  struct brw_winsys_buffer *bo)
 {
    if (bo == cache->last_bo[cache_id])
       return; /* no change */
 
-   dri_bo_unreference(cache->last_bo[cache_id]);
+   cache->brw->sws->bo_unreference(cache->last_bo[cache_id]); /* no local 'brw' here: go via cache->brw */
    cache->last_bo[cache_id] = bo;
-   dri_bo_reference(cache->last_bo[cache_id]);
+   cache->brw->sws->bo_reference(cache->last_bo[cache_id]);
    cache->brw->state.dirty.cache |= 1 << cache_id;
 }
 
@@ -118,7 +117,7 @@ update_cache_last(struct brw_cache *cache, enum brw_cache_id cache_id,
 static struct brw_cache_item *
 search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
 	     GLuint hash, const void *key, GLuint key_size,
-	     dri_bo **reloc_bufs, GLuint nr_reloc_bufs)
+	     struct brw_winsys_buffer **reloc_bufs, GLuint nr_reloc_bufs)
 {
    struct brw_cache_item *c;
 
@@ -139,7 +138,7 @@ search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
 	  memcmp(c->key, key, key_size) == 0 &&
 	  c->nr_reloc_bufs == nr_reloc_bufs &&
 	  memcmp(c->reloc_bufs, reloc_bufs,
-		 nr_reloc_bufs * sizeof(dri_bo *)) == 0)
+		 nr_reloc_bufs * sizeof(struct brw_winsys_buffer *)) == 0)
 	 return c;
    }
 
@@ -173,12 +172,12 @@ rehash(struct brw_cache *cache)
 /**
  * Returns the buffer object matching cache_id and key, or NULL.
  */
-dri_bo *
+struct brw_winsys_buffer *
 brw_search_cache(struct brw_cache *cache,
                  enum brw_cache_id cache_id,
                  const void *key,
                  GLuint key_size,
-                 dri_bo **reloc_bufs, GLuint nr_reloc_bufs,
+                 struct brw_winsys_buffer **reloc_bufs, GLuint nr_reloc_bufs,
                  void *aux_return)
 {
    struct brw_cache_item *item;
@@ -195,17 +194,17 @@ brw_search_cache(struct brw_cache *cache,
 
    update_cache_last(cache, cache_id, item->bo);
 
-   dri_bo_reference(item->bo);
+   brw->sws->bo_reference(item->bo);
    return item->bo;
 }
 
 
-dri_bo *
+struct brw_winsys_buffer *
 brw_upload_cache( struct brw_cache *cache,
 		  enum brw_cache_id cache_id,
 		  const void *key,
 		  GLuint key_size,
-		  dri_bo **reloc_bufs,
+		  struct brw_winsys_buffer **reloc_bufs,
 		  GLuint nr_reloc_bufs,
 		  const void *data,
 		  GLuint data_size,
@@ -214,10 +213,10 @@ brw_upload_cache( struct brw_cache *cache,
 {
    struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
    GLuint hash = hash_key(key, key_size, reloc_bufs, nr_reloc_bufs);
-   GLuint relocs_size = nr_reloc_bufs * sizeof(dri_bo *);
+   GLuint relocs_size = nr_reloc_bufs * sizeof(struct brw_winsys_buffer *);
    GLuint aux_size = cache->aux_size[cache_id];
    void *tmp;
-   dri_bo *bo;
+   struct brw_winsys_buffer *bo;
    int i;
 
    /* Create the buffer object to contain the data */
@@ -233,7 +232,7 @@ brw_upload_cache( struct brw_cache *cache,
    memcpy(tmp + key_size + aux_size, reloc_bufs, relocs_size);
    for (i = 0; i < nr_reloc_bufs; i++) {
       if (reloc_bufs[i] != NULL)
-	 dri_bo_reference(reloc_bufs[i]);
+	 brw->sws->bo_reference(reloc_bufs[i]);
    }
 
    item->cache_id = cache_id;
@@ -244,7 +243,7 @@ brw_upload_cache( struct brw_cache *cache,
    item->nr_reloc_bufs = nr_reloc_bufs;
 
    item->bo = bo;
-   dri_bo_reference(bo);
+   brw->sws->bo_reference(bo);
    item->data_size = data_size;
 
    if (cache->n_items > cache->size * 1.5)
@@ -277,15 +276,15 @@ brw_upload_cache( struct brw_cache *cache,
 /**
  * This doesn't really work with aux data.  Use search/upload instead
  */
-dri_bo *
+struct brw_winsys_buffer *
 brw_cache_data_sz(struct brw_cache *cache,
 		  enum brw_cache_id cache_id,
 		  const void *data,
 		  GLuint data_size,
-		  dri_bo **reloc_bufs,
+		  struct brw_winsys_buffer **reloc_bufs,
 		  GLuint nr_reloc_bufs)
 {
-   dri_bo *bo;
+   struct brw_winsys_buffer *bo;
    struct brw_cache_item *item;
    GLuint hash = hash_key(data, data_size, reloc_bufs, nr_reloc_bufs);
 
@@ -293,7 +292,7 @@ brw_cache_data_sz(struct brw_cache *cache,
 		       reloc_bufs, nr_reloc_bufs);
    if (item) {
       update_cache_last(cache, cache_id, item->bo);
-      dri_bo_reference(item->bo);
+      brw->sws->bo_reference(item->bo);
       return item->bo;
    }
 
@@ -314,11 +313,11 @@ brw_cache_data_sz(struct brw_cache *cache,
  * better to use, as the potentially changing offsets in the data-used-as-key
  * will result in excessive cache misses.
  */
-dri_bo *
+struct brw_winsys_buffer *
 brw_cache_data(struct brw_cache *cache,
 	       enum brw_cache_id cache_id,
 	       const void *data,
-	       dri_bo **reloc_bufs,
+	       struct brw_winsys_buffer **reloc_bufs,
 	       GLuint nr_reloc_bufs)
 {
    return brw_cache_data_sz(cache, cache_id, data, cache->key_size[cache_id],
@@ -497,8 +496,8 @@ brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
 
 	 next = c->next;
 	 for (j = 0; j < c->nr_reloc_bufs; j++)
-	    dri_bo_unreference(c->reloc_bufs[j]);
-	 dri_bo_unreference(c->bo);
+	    brw->sws->bo_unreference(c->reloc_bufs[j]);
+	 brw->sws->bo_unreference(c->bo);
 	 free((void *)c->key);
 	 free(c);
       }
@@ -523,7 +522,7 @@ brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
  * at the cost of walking the entire hash table.
  */
 void
-brw_state_cache_bo_delete(struct brw_cache *cache, dri_bo *bo)
+brw_state_cache_bo_delete(struct brw_cache *cache, struct brw_winsys_buffer *bo)
 {
    struct brw_cache_item **prev;
    GLuint i;
@@ -535,14 +534,14 @@ brw_state_cache_bo_delete(struct brw_cache *cache, dri_bo *bo)
       for (prev = &cache->items[i]; *prev;) {
 	 struct brw_cache_item *c = *prev;
 
-	 if (drm_intel_bo_references(c->bo, bo)) {
+	 if (cache->sws->bo_references(c->bo, bo)) {
 	    int j;
 
 	    *prev = c->next;
 
 	    for (j = 0; j < c->nr_reloc_bufs; j++)
-	       dri_bo_unreference(c->reloc_bufs[j]);
-	    dri_bo_unreference(c->bo);
+	       brw->sws->bo_unreference(c->reloc_bufs[j]);
+	    brw->sws->bo_unreference(c->bo);
 	    free((void *)c->key);
 	    free(c);
 	    cache->n_items--;
@@ -580,7 +579,7 @@ brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
 
    brw_clear_cache(brw, cache);
    for (i = 0; i < BRW_MAX_CACHE; i++) {
-      dri_bo_unreference(cache->last_bo[i]);
+      brw->sws->bo_unreference(cache->last_bo[i]);
       free(cache->name[i]);
    }
    free(cache->items);
diff --git a/src/gallium/drivers/i965/brw_state_dump.c b/src/gallium/drivers/i965/brw_state_dump.c
index e94fa7d2b4..1bc83fb9c1 100644
--- a/src/gallium/drivers/i965/brw_state_dump.c
+++ b/src/gallium/drivers/i965/brw_state_dump.c
@@ -25,8 +25,6 @@
  *
  */
 
-#include "main/mtypes.h"
-
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
@@ -55,7 +53,7 @@ state_out(const char *name, void *data, uint32_t hw_offset, int index,
 
 /** Generic, undecoded state buffer debug printout */
 static void
-state_struct_out(const char *name, dri_bo *buffer, unsigned int state_size)
+state_struct_out(const char *name, struct brw_winsys_buffer *buffer, unsigned int state_size)
 {
    int i;
 
@@ -102,7 +100,7 @@ static void dump_wm_surface_state(struct brw_context *brw)
    int i;
 
    for (i = 0; i < brw->wm.nr_surfaces; i++) {
-      dri_bo *surf_bo = brw->wm.surf_bo[i];
+      struct brw_winsys_buffer *surf_bo = brw->wm.surf_bo[i];
       unsigned int surfoff;
       struct brw_surface_state *surf;
       char name[20];
@@ -162,7 +160,7 @@ static void dump_sf_viewport_state(struct brw_context *brw)
    dri_bo_unmap(brw->sf.vp_bo);
 }
 
-static void brw_debug_prog(const char *name, dri_bo *prog)
+static void brw_debug_prog(const char *name, struct brw_winsys_buffer *prog)
 {
    unsigned int i;
    uint32_t *data;
@@ -202,10 +200,8 @@ static void brw_debug_prog(const char *name, dri_bo *prog)
  * The buffer offsets printed rely on the buffer containing the last offset
  * it was validated at.
  */
-void brw_debug_batch(struct intel_context *intel)
+void brw_debug_batch(struct brw_context *brw)
 {
-   struct brw_context *brw = brw_context(&intel->ctx);
-
    state_struct_out("WM bind", brw->wm.bind_bo, 4 * brw->wm.nr_surfaces);
    dump_wm_surface_state(brw);
 
diff --git a/src/gallium/drivers/i965/brw_state_upload.c b/src/gallium/drivers/i965/brw_state_upload.c
index 6801084616..b68b6cb21a 100644
--- a/src/gallium/drivers/i965/brw_state_upload.c
+++ b/src/gallium/drivers/i965/brw_state_upload.c
@@ -149,7 +149,7 @@ brw_clear_validated_bos(struct brw_context *brw)
 
    /* Clear the last round of validated bos */
    for (i = 0; i < brw->state.validated_bo_count; i++) {
-      dri_bo_unreference(brw->state.validated_bos[i]);
+      brw->sws->bo_unreference(brw->state.validated_bos[i]);
       brw->state.validated_bos[i] = NULL;
    }
    brw->state.validated_bo_count = 0;
@@ -272,8 +272,6 @@ brw_print_dirty_count(struct dirty_bit_map *bit_map, int32_t bits)
  */
 enum pipe_error brw_validate_state( struct brw_context *brw )
 {
-   GLcontext *ctx = &brw->intel.ctx;
-   struct intel_context *intel = &brw->intel;
    struct brw_state_flags *state = &brw->state.dirty;
    GLuint i;
 
diff --git a/src/gallium/drivers/i965/brw_structs.h b/src/gallium/drivers/i965/brw_structs.h
index 66d4127271..27d264c3de 100644
--- a/src/gallium/drivers/i965/brw_structs.h
+++ b/src/gallium/drivers/i965/brw_structs.h
@@ -33,6 +33,7 @@
 #ifndef BRW_STRUCTS_H
 #define BRW_STRUCTS_H
 
+#include "brw_types.h"
 
 /** Number of general purpose registers (VS, WM, etc) */
 #define BRW_MAX_GRF 128
diff --git a/src/gallium/drivers/i965/brw_swtnl.c b/src/gallium/drivers/i965/brw_swtnl.c
index 6684f442d5..83f138f67a 100644
--- a/src/gallium/drivers/i965/brw_swtnl.c
+++ b/src/gallium/drivers/i965/brw_swtnl.c
@@ -6,7 +6,6 @@ static GLboolean check_fallbacks( struct brw_context *brw,
 				  const struct _mesa_prim *prim,
 				  GLuint nr_prims )
 {
-   GLcontext *ctx = &brw->intel.ctx;
    GLuint i;
 
    /* If we don't require strict OpenGL conformance, never 
diff --git a/src/gallium/drivers/i965/brw_tex.c b/src/gallium/drivers/i965/brw_tex.c
index e911b105b2..c33c19ee51 100644
--- a/src/gallium/drivers/i965/brw_tex.c
+++ b/src/gallium/drivers/i965/brw_tex.c
@@ -30,11 +30,6 @@
   */
         
 
-#include "main/glheader.h"
-#include "main/mtypes.h"
-#include "main/teximage.h"
-
-#include "intel_context.h"
 #include "intel_regions.h"
 #include "intel_tex.h"
 #include "brw_context.h"
@@ -45,8 +40,6 @@
  */
 void brw_validate_textures( struct brw_context *brw )
 {
-   GLcontext *ctx = &brw->intel.ctx;
-   struct intel_context *intel = &brw->intel;
    int i;
 
    for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
diff --git a/src/gallium/drivers/i965/brw_tex_layout.c b/src/gallium/drivers/i965/brw_tex_layout.c
index 5986cbffad..75cdc18912 100644
--- a/src/gallium/drivers/i965/brw_tex_layout.c
+++ b/src/gallium/drivers/i965/brw_tex_layout.c
@@ -34,13 +34,11 @@
 
 #include "intel_mipmap_tree.h"
 #include "intel_tex_layout.h"
-#include "intel_context.h"
-#include "main/macros.h"
 #include "intel_chipset.h"
 
 #define FILE_DEBUG_FLAG DEBUG_MIPTREE
 
-GLboolean brw_miptree_layout(struct intel_context *intel,
+GLboolean brw_miptree_layout(struct brw_context *brw,
 			     struct intel_mipmap_tree *mt,
 			     uint32_t tiling)
 {
@@ -67,7 +65,7 @@ GLboolean brw_miptree_layout(struct intel_context *intel,
               mt->pitch = ALIGN(mt->width0, align_w);
           }
 
-          if (mt->first_level != mt->last_level) {
+          if (mt->last_level != 0) {
               GLuint mip1_width;
 
               if (mt->compressed) {
@@ -93,7 +91,7 @@ GLboolean brw_miptree_layout(struct intel_context *intel,
               mt->total_height = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) * 6;
           }
 
-          for (level = mt->first_level; level <= mt->last_level; level++) {
+          for (level = 0; level <= mt->last_level; level++) {
               GLuint img_height;
               GLuint nr_images = 6;
               GLuint q = 0;
@@ -109,7 +107,7 @@ GLboolean brw_miptree_layout(struct intel_context *intel,
               else
                   img_height = ALIGN(height, align_h);
 
-              if (level == mt->first_level + 1) {
+              if (level == 1) {
                   x += ALIGN(width, align_w);
               }
               else {
@@ -147,7 +145,7 @@ GLboolean brw_miptree_layout(struct intel_context *intel,
       pack_x_pitch = width;
       pack_x_nr = 1;
 
-      for (level = mt->first_level ; level <= mt->last_level ; level++) {
+      for (level = 0 ; level <= mt->last_level ; level++) {
 	 GLuint nr_images = mt->target == GL_TEXTURE_3D ? depth : 6;
 	 GLint x = 0;
 	 GLint y = 0;
diff --git a/src/gallium/drivers/i965/brw_types.h b/src/gallium/drivers/i965/brw_types.h
index 32b62848da..87dae13d94 100644
--- a/src/gallium/drivers/i965/brw_types.h
+++ b/src/gallium/drivers/i965/brw_types.h
@@ -1,11 +1,18 @@
 #ifndef BRW_TYPES_H
 #define BRW_TYPES_H
 
-typedef GLuint uint32_t;
-typedef GLubyte uint8_t;
-typedef GLushort uint16_t;
+#include "pipe/p_compiler.h"
+
+typedef uint32_t GLuint;
+typedef uint8_t GLubyte;
+typedef uint16_t GLushort;
+typedef int32_t GLint;
+typedef int8_t GLbyte;
+typedef int16_t GLshort;
+typedef float GLfloat;
+
 /* no GLenum, translate all away */
 
-typedef GLboolean uint8_t;
+typedef uint8_t GLboolean;
 
 #endif
diff --git a/src/gallium/drivers/i965/brw_util.c b/src/gallium/drivers/i965/brw_util.c
index 17f671a8fa..c5244e58ab 100644
--- a/src/gallium/drivers/i965/brw_util.c
+++ b/src/gallium/drivers/i965/brw_util.c
@@ -30,8 +30,6 @@
   */
          
 
-#include "main/mtypes.h"
-#include "shader/prog_parameter.h"
 #include "brw_util.h"
 #include "brw_defines.h"
 
diff --git a/src/gallium/drivers/i965/brw_util.h b/src/gallium/drivers/i965/brw_util.h
index 33e7cd87e4..37c3acbc11 100644
--- a/src/gallium/drivers/i965/brw_util.h
+++ b/src/gallium/drivers/i965/brw_util.h
@@ -33,7 +33,7 @@
 #ifndef BRW_UTIL_H
 #define BRW_UTIL_H
 
-#include "main/mtypes.h"
+#include "brw_types.h"
 
 extern GLuint brw_count_bits( GLuint val );
 extern GLuint brw_parameter_list_state_flags(struct gl_program_parameter_list *paramList);
diff --git a/src/gallium/drivers/i965/brw_vs.c b/src/gallium/drivers/i965/brw_vs.c
index 53a5560105..97e523c3ee 100644
--- a/src/gallium/drivers/i965/brw_vs.c
+++ b/src/gallium/drivers/i965/brw_vs.c
@@ -71,7 +71,7 @@ static void do_vs_prog( struct brw_context *brw,
     */
    program = brw_get_program(&c.func, &program_size);
 
-   dri_bo_unreference(brw->vs.prog_bo);
+   brw->sws->bo_unreference(brw->vs.prog_bo);
    brw->vs.prog_bo = brw_upload_cache( &brw->cache, BRW_VS_PROG,
 				       &c.key, sizeof(c.key),
 				       NULL, 0,
@@ -83,7 +83,6 @@ static void do_vs_prog( struct brw_context *brw,
 
 static void brw_upload_vs_prog(struct brw_context *brw)
 {
-   GLcontext *ctx = &brw->intel.ctx;
    struct brw_vs_prog_key key;
    struct brw_vertex_program *vp = 
       (struct brw_vertex_program *)brw->vertex_program;
@@ -100,7 +99,7 @@ static void brw_upload_vs_prog(struct brw_context *brw)
 
    /* Make an early check for the key.
     */
-   dri_bo_unreference(brw->vs.prog_bo);
+   brw->sws->bo_unreference(brw->vs.prog_bo);
    brw->vs.prog_bo = brw_search_cache(&brw->cache, BRW_VS_PROG,
 				      &key, sizeof(key),
 				      NULL, 0,
diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c
index 7f20c4baca..6adb743017 100644
--- a/src/gallium/drivers/i965/brw_vs_emit.c
+++ b/src/gallium/drivers/i965/brw_vs_emit.c
@@ -30,9 +30,6 @@
   */
             
 
-#include "main/macros.h"
-#include "shader/program.h"
-#include "shader/prog_parameter.h"
 #include "pipe/p_shader_tokens.h"
 #include "brw_context.h"
 #include "brw_vs.h"
diff --git a/src/gallium/drivers/i965/brw_vs_state.c b/src/gallium/drivers/i965/brw_vs_state.c
index d790ab6555..1717223e49 100644
--- a/src/gallium/drivers/i965/brw_vs_state.c
+++ b/src/gallium/drivers/i965/brw_vs_state.c
@@ -34,7 +34,6 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "main/macros.h"
 
 struct brw_vs_unit_key {
    unsigned int total_grf;
@@ -51,8 +50,6 @@ struct brw_vs_unit_key {
 static void
 vs_unit_populate_key(struct brw_context *brw, struct brw_vs_unit_key *key)
 {
-   GLcontext *ctx = &brw->intel.ctx;
-
    memset(key, 0, sizeof(*key));
 
    /* CACHE_NEW_VS_PROG */
@@ -79,11 +76,11 @@ vs_unit_populate_key(struct brw_context *brw, struct brw_vs_unit_key *key)
    }
 }
 
-static dri_bo *
+static struct brw_winsys_buffer *
 vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
 {
    struct brw_vs_unit_state vs;
-   dri_bo *bo;
+   struct brw_winsys_buffer *bo;
    int chipset_max_threads;
 
    memset(&vs, 0, sizeof(vs));
@@ -163,7 +160,7 @@ static void prepare_vs_unit(struct brw_context *brw)
 
    vs_unit_populate_key(brw, &key);
 
-   dri_bo_unreference(brw->vs.state_bo);
+   brw->sws->bo_unreference(brw->vs.state_bo);
    brw->vs.state_bo = brw_search_cache(&brw->cache, BRW_VS_UNIT,
 				       &key, sizeof(key),
 				       &brw->vs.prog_bo, 1,
diff --git a/src/gallium/drivers/i965/brw_vs_surface_state.c b/src/gallium/drivers/i965/brw_vs_surface_state.c
index 89f47522a1..6446e8e761 100644
--- a/src/gallium/drivers/i965/brw_vs_surface_state.c
+++ b/src/gallium/drivers/i965/brw_vs_surface_state.c
@@ -29,11 +29,6 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "main/mtypes.h"
-#include "main/texformat.h"
-#include "main/texstore.h"
-#include "shader/prog_parameter.h"
-
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
@@ -47,7 +42,6 @@
 static drm_intel_bo *
 brw_vs_update_constant_buffer(struct brw_context *brw)
 {
-   struct intel_context *intel = &brw->intel;
    struct brw_vertex_program *vp =
       (struct brw_vertex_program *) brw->vertex_program;
    const struct gl_program_parameter_list *params = vp->program.Base.Parameters;
@@ -73,7 +67,7 @@ brw_vs_update_constant_buffer(struct brw_context *brw)
  * Sets brw->vs.surf_bo[surf] and brw->vp->const_buffer.
  */
 static void
-brw_update_vs_constant_surface( GLcontext *ctx,
+brw_update_vs_constant_surface( struct brw_context *brw,
                                 GLuint surf)
 {
    struct brw_context *brw = brw_context(ctx);
@@ -87,7 +81,7 @@ brw_update_vs_constant_surface( GLcontext *ctx,
    /* If we're in this state update atom, we need to update VS constants, so
     * free the old buffer and create a new one for the new contents.
     */
-   dri_bo_unreference(vp->const_buffer);
+   brw->sws->bo_unreference(vp->const_buffer);
    vp->const_buffer = brw_vs_update_constant_buffer(brw);
 
    /* If there's no constant buffer, then no surface BO is needed to point at
@@ -101,8 +95,7 @@ brw_update_vs_constant_surface( GLcontext *ctx,
 
    memset(&key, 0, sizeof(key));
 
-   key.format = MESA_FORMAT_RGBA_FLOAT32;
-   key.internal_format = GL_RGBA;
+   key.format = PIPE_FORMAT_R32G32B32A32_FLOAT;
    key.bo = vp->const_buffer;
    key.depthmode = GL_NONE;
    key.pitch = params->NumParameters;
@@ -132,10 +125,10 @@ brw_update_vs_constant_surface( GLcontext *ctx,
 /**
  * Constructs the binding table for the VS surface state.
  */
-static dri_bo *
+static struct brw_winsys_buffer *
 brw_vs_get_binding_table(struct brw_context *brw)
 {
-   dri_bo *bind_bo;
+   struct brw_winsys_buffer *bind_bo;
 
    bind_bo = brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
 			      NULL, 0,
@@ -186,7 +179,6 @@ brw_vs_get_binding_table(struct brw_context *brw)
  */
 static void prepare_vs_surfaces(struct brw_context *brw )
 {
-   GLcontext *ctx = &brw->intel.ctx;
    int i;
    int nr_surfaces = 0;
 
@@ -208,7 +200,7 @@ static void prepare_vs_surfaces(struct brw_context *brw )
     * just slightly increases our working set size.
     */
    if (brw->vs.nr_surfaces != 0) {
-      dri_bo_unreference(brw->vs.bind_bo);
+      brw->sws->bo_unreference(brw->vs.bind_bo);
       brw->vs.bind_bo = brw_vs_get_binding_table(brw);
    }
 }
diff --git a/src/gallium/drivers/i965/brw_wm.c b/src/gallium/drivers/i965/brw_wm.c
index 20d31880b4..32b8900bac 100644
--- a/src/gallium/drivers/i965/brw_wm.c
+++ b/src/gallium/drivers/i965/brw_wm.c
@@ -29,7 +29,6 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
              
-#include "main/texformat.h"
 #include "brw_context.h"
 #include "brw_util.h"
 #include "brw_wm.h"
@@ -186,7 +185,7 @@ static void do_wm_prog( struct brw_context *brw,
     */
    program = brw_get_program(&c->func, &program_size);
 
-   dri_bo_unreference(brw->wm.prog_bo);
+   brw->sws->bo_unreference(brw->wm.prog_bo);
    brw->wm.prog_bo = brw_upload_cache( &brw->cache, BRW_WM_PROG,
 				       &c->key, sizeof(c->key),
 				       NULL, 0,
@@ -200,7 +199,6 @@ static void do_wm_prog( struct brw_context *brw,
 static void brw_wm_populate_key( struct brw_context *brw,
 				 struct brw_wm_prog_key *key )
 {
-   GLcontext *ctx = &brw->intel.ctx;
    /* BRW_NEW_FRAGMENT_PROGRAM */
    const struct brw_fragment_program *fp = 
       (struct brw_fragment_program *)brw->fragment_program;
@@ -329,7 +327,7 @@ static void brw_prepare_wm_prog(struct brw_context *brw)
 
    /* Make an early check for the key.
     */
-   dri_bo_unreference(brw->wm.prog_bo);
+   brw->sws->bo_unreference(brw->wm.prog_bo);
    brw->wm.prog_bo = brw_search_cache(&brw->cache, BRW_WM_PROG,
 				      &key, sizeof(key),
 				      NULL, 0,
diff --git a/src/gallium/drivers/i965/brw_wm_emit.c b/src/gallium/drivers/i965/brw_wm_emit.c
index 9c47c46a3d..fec33f74eb 100644
--- a/src/gallium/drivers/i965/brw_wm_emit.c
+++ b/src/gallium/drivers/i965/brw_wm_emit.c
@@ -30,7 +30,6 @@
   */
                
 
-#include "main/macros.h"
 #include "brw_context.h"
 #include "brw_wm.h"
 
diff --git a/src/gallium/drivers/i965/brw_wm_glsl.c b/src/gallium/drivers/i965/brw_wm_glsl.c
index d836e2fb34..c4f0711793 100644
--- a/src/gallium/drivers/i965/brw_wm_glsl.c
+++ b/src/gallium/drivers/i965/brw_wm_glsl.c
@@ -1,7 +1,3 @@
-#include "main/macros.h"
-#include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
-#include "shader/prog_optimize.h"
 #include "brw_context.h"
 #include "brw_eu.h"
 #include "brw_wm.h"
diff --git a/src/gallium/drivers/i965/brw_wm_iz.c b/src/gallium/drivers/i965/brw_wm_iz.c
index 5e399ac62a..6f1e9fcc3c 100644
--- a/src/gallium/drivers/i965/brw_wm_iz.c
+++ b/src/gallium/drivers/i965/brw_wm_iz.c
@@ -30,7 +30,6 @@
   */
                 
 
-#include "main/mtypes.h"
 #include "brw_wm.h"
 
 
diff --git a/src/gallium/drivers/i965/brw_wm_sampler_state.c b/src/gallium/drivers/i965/brw_wm_sampler_state.c
index dff466587a..a8993f9312 100644
--- a/src/gallium/drivers/i965/brw_wm_sampler_state.c
+++ b/src/gallium/drivers/i965/brw_wm_sampler_state.c
@@ -34,9 +34,6 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 
-#include "main/macros.h"
-
-
 
 /* Samplers aren't strictly wm state from the hardware's perspective,
  * but that is the only situation in which we use them in this driver.
@@ -79,7 +76,7 @@ static GLint S_FIXED(GLfloat value, GLuint frac_bits)
 }
 
 
-static dri_bo *upload_default_color( struct brw_context *brw,
+static struct brw_winsys_buffer *upload_default_color( struct brw_context *brw,
 				     const GLfloat *color )
 {
    struct brw_sampler_default_color sdc;
@@ -102,7 +99,7 @@ struct wm_sampler_key {
       float max_aniso;
       GLenum minfilter, magfilter;
       GLenum comparemode, comparefunc;
-      dri_bo *sdc_bo;
+      struct brw_winsys_buffer *sdc_bo;
 
       /** If target is cubemap, take context setting.
        */
@@ -115,7 +112,7 @@ struct wm_sampler_key {
  * entry.
  */
 static void brw_update_sampler_state(struct wm_sampler_entry *key,
-				     dri_bo *sdc_bo,
+				     struct brw_winsys_buffer *sdc_bo,
 				     struct brw_sampler_state *sampler)
 {
    _mesa_memset(sampler, 0, sizeof(*sampler));
@@ -240,7 +237,6 @@ static void
 brw_wm_sampler_populate_key(struct brw_context *brw,
 			    struct wm_sampler_key *key)
 {
-   GLcontext *ctx = &brw->intel.ctx;
    int unit;
 
    memset(key, 0, sizeof(*key));
@@ -272,7 +268,7 @@ brw_wm_sampler_populate_key(struct brw_context *brw,
 	 entry->comparemode = texObj->CompareMode;
          entry->comparefunc = texObj->CompareFunc;
 
-	 dri_bo_unreference(brw->wm.sdc_bo[unit]);
+	 brw->sws->bo_unreference(brw->wm.sdc_bo[unit]);
 	 if (firstImage->_BaseFormat == GL_DEPTH_COMPONENT) {
 	    float bordercolor[4] = {
 	       texObj->BorderColor[0],
@@ -300,7 +296,6 @@ brw_wm_sampler_populate_key(struct brw_context *brw,
  */
 static void upload_wm_samplers( struct brw_context *brw )
 {
-   GLcontext *ctx = &brw->intel.ctx;
    struct wm_sampler_key key;
    int i;
 
@@ -311,7 +306,7 @@ static void upload_wm_samplers( struct brw_context *brw )
       brw->state.dirty.cache |= CACHE_NEW_SAMPLER;
    }
 
-   dri_bo_unreference(brw->wm.sampler_bo);
+   brw->sws->bo_unreference(brw->wm.sampler_bo);
    brw->wm.sampler_bo = NULL;
    if (brw->wm.sampler_count == 0)
       return;
diff --git a/src/gallium/drivers/i965/brw_wm_state.c b/src/gallium/drivers/i965/brw_wm_state.c
index 361f91292b..958c00d3e0 100644
--- a/src/gallium/drivers/i965/brw_wm_state.c
+++ b/src/gallium/drivers/i965/brw_wm_state.c
@@ -60,10 +60,8 @@ struct brw_wm_unit_key {
 static void
 wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
 {
-   GLcontext *ctx = &brw->intel.ctx;
    const struct gl_fragment_program *fp = brw->fragment_program;
    const struct brw_fragment_program *bfp = (struct brw_fragment_program *) fp;
-   struct intel_context *intel = &brw->intel;
 
    memset(key, 0, sizeof(*key));
 
@@ -121,7 +119,7 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
    /* temporary sanity check assertion */
    ASSERT(bfp->isGLSL == brw_wm_is_glsl(fp));
 
-   /* _NEW_DEPTH */
+   /* _NEW_QUERY */
    key->stats_wm = intel->stats_wm;
 
    /* _NEW_LINE */
@@ -136,12 +134,12 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
 /**
  * Setup wm hardware state.  See page 225 of Volume 2
  */
-static dri_bo *
+static struct brw_winsys_buffer *
 wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
-			dri_bo **reloc_bufs)
+			struct brw_winsys_buffer **reloc_bufs)
 {
    struct brw_wm_unit_state wm;
-   dri_bo *bo;
+   struct brw_winsys_buffer *bo;
 
    memset(&wm, 0, sizeof(wm));
 
@@ -257,9 +255,8 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
 
 static void upload_wm_unit( struct brw_context *brw )
 {
-   struct intel_context *intel = &brw->intel;
    struct brw_wm_unit_key key;
-   dri_bo *reloc_bufs[3];
+   struct brw_winsys_buffer *reloc_bufs[3];
    wm_unit_populate_key(brw, &key);
 
    /* Allocate the necessary scratch space if we haven't already.  Don't
@@ -271,7 +268,7 @@ static void upload_wm_unit( struct brw_context *brw )
       GLuint total = key.total_scratch * key.max_threads;
 
       if (brw->wm.scratch_bo && total > brw->wm.scratch_bo->size) {
-	 dri_bo_unreference(brw->wm.scratch_bo);
+	 brw->sws->bo_unreference(brw->wm.scratch_bo);
 	 brw->wm.scratch_bo = NULL;
       }
       if (brw->wm.scratch_bo == NULL) {
@@ -286,7 +283,7 @@ static void upload_wm_unit( struct brw_context *brw )
    reloc_bufs[1] = brw->wm.scratch_bo;
    reloc_bufs[2] = brw->wm.sampler_bo;
 
-   dri_bo_unreference(brw->wm.state_bo);
+   brw->sws->bo_unreference(brw->wm.state_bo);
    brw->wm.state_bo = brw_search_cache(&brw->cache, BRW_WM_UNIT,
 				       &key, sizeof(key),
 				       reloc_bufs, 3,
@@ -302,7 +299,7 @@ const struct brw_tracked_state brw_wm_unit = {
 	       _NEW_POLYGONSTIPPLE | 
 	       _NEW_LINE | 
 	       _NEW_COLOR |
-	       _NEW_DEPTH),
+	       _NEW_QUERY),
 
       .brw = (BRW_NEW_FRAGMENT_PROGRAM | 
 	      BRW_NEW_CURBE_OFFSETS |
diff --git a/src/gallium/drivers/i965/brw_wm_surface_state.c b/src/gallium/drivers/i965/brw_wm_surface_state.c
index f7cc5153a8..86dcb74b5b 100644
--- a/src/gallium/drivers/i965/brw_wm_surface_state.c
+++ b/src/gallium/drivers/i965/brw_wm_surface_state.c
@@ -30,11 +30,6 @@
   */
                    
 
-#include "main/mtypes.h"
-#include "main/texformat.h"
-#include "main/texstore.h"
-#include "shader/prog_parameter.h"
-
 #include "intel_mipmap_tree.h"
 #include "intel_batchbuffer.h"
 #include "intel_tex.h"
@@ -70,90 +65,87 @@ static GLuint translate_tex_target( GLenum target )
 }
 
 
-static GLuint translate_tex_format( GLuint mesa_format, GLenum internal_format,
+static GLuint translate_tex_format( GLuint mesa_format, 
 				    GLenum depth_mode )
 {
-   switch( mesa_format ) {
-   case MESA_FORMAT_L8:
+   switch( pipe_format ) {
+   case PIPE_FORMAT_L8_UNORM:
       return BRW_SURFACEFORMAT_L8_UNORM;
 
-   case MESA_FORMAT_I8:
+   case PIPE_FORMAT_I8_UNORM:
       return BRW_SURFACEFORMAT_I8_UNORM;
 
-   case MESA_FORMAT_A8:
+   case PIPE_FORMAT_A8_UNORM:
       return BRW_SURFACEFORMAT_A8_UNORM; 
 
-   case MESA_FORMAT_AL88:
+   case PIPE_FORMAT_A8L8_UNORM:
       return BRW_SURFACEFORMAT_L8A8_UNORM;
 
-   case MESA_FORMAT_RGB888:
-      assert(0);		/* not supported for sampling */
-      return BRW_SURFACEFORMAT_R8G8B8_UNORM;      
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      return BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
 
-   case MESA_FORMAT_ARGB8888:
-      if (internal_format == GL_RGB)
-	 return BRW_SURFACEFORMAT_B8G8R8X8_UNORM;
-      else
-	 return BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+   case PIPE_FORMAT_R8G8B8X8_UNORM:
+      return BRW_SURFACEFORMAT_R8G8B8X8_UNORM;
 
-   case MESA_FORMAT_RGBA8888_REV:
-      if (internal_format == GL_RGB)
-	 return BRW_SURFACEFORMAT_R8G8B8X8_UNORM;
-      else
-	 return BRW_SURFACEFORMAT_R8G8B8A8_UNORM;
+   case PIPE_FORMAT_:
+      return BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
 
-   case MESA_FORMAT_RGB565:
+   case PIPE_FORMAT_RGB565:
       return BRW_SURFACEFORMAT_B5G6R5_UNORM;
 
-   case MESA_FORMAT_ARGB1555:
+   case PIPE_FORMAT_ARGB1555:
       return BRW_SURFACEFORMAT_B5G5R5A1_UNORM;
 
-   case MESA_FORMAT_ARGB4444:
+   case PIPE_FORMAT_ARGB4444:
       return BRW_SURFACEFORMAT_B4G4R4A4_UNORM;
 
-   case MESA_FORMAT_YCBCR_REV:
+
+   case PIPE_FORMAT_L16_UNORM:
+      return BRW_SURFACEFORMAT_L16_UNORM;
+
+   case PIPE_FORMAT_I16_UNORM:
+      return BRW_SURFACEFORMAT_I16_UNORM;
+
+   case PIPE_FORMAT_A16_UNORM:
+      return BRW_SURFACEFORMAT_A16_UNORM; 
+
+   case PIPE_FORMAT_YCBCR_REV:
       return BRW_SURFACEFORMAT_YCRCB_NORMAL;
 
-   case MESA_FORMAT_YCBCR:
+   case PIPE_FORMAT_YCBCR:
       return BRW_SURFACEFORMAT_YCRCB_SWAPUVY;
 
-   case MESA_FORMAT_RGB_FXT1:
-   case MESA_FORMAT_RGBA_FXT1:
+   case PIPE_FORMAT_RGB_FXT1:
+   case PIPE_FORMAT_RGBA_FXT1:
       return BRW_SURFACEFORMAT_FXT1;
 
-   case MESA_FORMAT_Z16:
-      if (depth_mode == GL_INTENSITY) 
-	  return BRW_SURFACEFORMAT_I16_UNORM;
-      else if (depth_mode == GL_ALPHA)
-	  return BRW_SURFACEFORMAT_A16_UNORM;
-      else
-	  return BRW_SURFACEFORMAT_L16_UNORM;
-
-   case MESA_FORMAT_RGB_DXT1:
+   case PIPE_FORMAT_RGB_DXT1:
        return BRW_SURFACEFORMAT_DXT1_RGB;
 
-   case MESA_FORMAT_RGBA_DXT1:
+   case PIPE_FORMAT_RGBA_DXT1:
        return BRW_SURFACEFORMAT_BC1_UNORM;
        
-   case MESA_FORMAT_RGBA_DXT3:
+   case PIPE_FORMAT_RGBA_DXT3:
        return BRW_SURFACEFORMAT_BC2_UNORM;
        
-   case MESA_FORMAT_RGBA_DXT5:
+   case PIPE_FORMAT_RGBA_DXT5:
        return BRW_SURFACEFORMAT_BC3_UNORM;
 
-   case MESA_FORMAT_SARGB8:
+   case PIPE_FORMAT_R8G8B8A8_SRGB:
       return BRW_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB;
 
-   case MESA_FORMAT_SLA8:
+   case PIPE_FORMAT_A8L8_SRGB:
       return BRW_SURFACEFORMAT_L8A8_UNORM_SRGB;
 
-   case MESA_FORMAT_SL8:
+   case PIPE_FORMAT_L8_SRGB:
       return BRW_SURFACEFORMAT_L8_UNORM_SRGB;
 
-   case MESA_FORMAT_SRGB_DXT1:
+   case PIPE_FORMAT_SRGB_DXT1:
       return BRW_SURFACEFORMAT_BC1_UNORM_SRGB;
 
-   case MESA_FORMAT_S8_Z24:
+   case PIPE_FORMAT_S8_Z24:
       /* XXX: these different surface formats don't seem to
        * make any difference for shadow sampler/compares.
        */
@@ -164,10 +156,10 @@ static GLuint translate_tex_format( GLuint mesa_format, GLenum internal_format,
       else
          return BRW_SURFACEFORMAT_L24X8_UNORM;
 
-   case MESA_FORMAT_DUDV8:
+   case PIPE_FORMAT_DUDV8:
       return BRW_SURFACEFORMAT_R8G8_SNORM;
 
-   case MESA_FORMAT_SIGNED_RGBA8888_REV:
+   case PIPE_FORMAT_SIGNED_RGBA8888_REV:
       return BRW_SURFACEFORMAT_R8G8B8A8_SNORM;
 
    default:
@@ -195,12 +187,12 @@ brw_set_surface_tiling(struct brw_surface_state *surf, uint32_t tiling)
    }
 }
 
-static dri_bo *
+static struct brw_winsys_buffer *
 brw_create_texture_surface( struct brw_context *brw,
 			    struct brw_surface_key *key )
 {
    struct brw_surface_state surf;
-   dri_bo *bo;
+   struct brw_winsys_buffer *bo;
 
    memset(&surf, 0, sizeof(surf));
 
@@ -234,7 +226,7 @@ brw_create_texture_surface( struct brw_context *brw,
    else
       surf.ss1.base_addr = key->offset;
 
-   surf.ss2.mip_count = key->last_level - key->first_level;
+   surf.ss2.mip_count = key->last_level;
    surf.ss2.width = key->width - 1;
    surf.ss2.height = key->height - 1;
    brw_set_surface_tiling(&surf, key->tiling);
@@ -270,41 +262,30 @@ brw_create_texture_surface( struct brw_context *brw,
 }
 
 static void
-brw_update_texture_surface( GLcontext *ctx, GLuint unit )
+brw_update_texture_surface( struct brw_context *brw, GLuint unit )
 {
-   struct brw_context *brw = brw_context(ctx);
-   struct gl_texture_object *tObj = ctx->Texture.Unit[unit]._Current;
-   struct intel_texture_object *intelObj = intel_texture_object(tObj);
-   struct gl_texture_image *firstImage = tObj->Image[0][intelObj->firstLevel];
+   struct pipe_texture *tex = brw->texture[unit];
    struct brw_surface_key key;
    const GLuint surf = SURF_INDEX_TEXTURE(unit);
 
    memset(&key, 0, sizeof(key));
 
-   if (intelObj->imageOverride) {
-      key.pitch = intelObj->pitchOverride / intelObj->mt->cpp;
-      key.depth = intelObj->depthOverride;
-      key.bo = NULL;
-      key.offset = intelObj->textureOffset;
-   } else {
-      key.format = firstImage->TexFormat->MesaFormat;
-      key.internal_format = firstImage->InternalFormat;
-      key.pitch = intelObj->mt->pitch;
-      key.depth = firstImage->Depth;
-      key.bo = intelObj->mt->region->buffer;
-      key.offset = 0;
-   }
-
-   key.target = tObj->Target;
-   key.depthmode = tObj->DepthMode;
-   key.first_level = intelObj->firstLevel;
-   key.last_level = intelObj->lastLevel;
-   key.width = firstImage->Width;
-   key.height = firstImage->Height;
-   key.cpp = intelObj->mt->cpp;
-   key.tiling = intelObj->mt->region->tiling;
-
-   dri_bo_unreference(brw->wm.surf_bo[surf]);
+   key.format = tex->base.format;
+   key.pitch = tex->pitch;
+   key.depth = tex->base.depth[0];
+   key.bo = tex->buffer;
+   key.offset = 0;
+
+   key.target = tObj->target;	/* translated to BRW enum */
+   /* key.depthmode = tObj->DepthMode; */ /* XXX: add this to gallium? or the state tracker? */
+   key.first_level = 0;
+   key.last_level = tex->base.last_level;
+   key.width = tex->base.depth[0];
+   key.height = tex->base.height[0];
+   key.cpp = tex->cpp;
+   key.tiling = tex->tiling;
+
+   brw->sws->bo_unreference(brw->wm.surf_bo[surf]);
    brw->wm.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
                                             BRW_SS_SURFACE,
                                             &key, sizeof(key),
@@ -321,13 +302,13 @@ brw_update_texture_surface( GLcontext *ctx, GLuint unit )
  * Create the constant buffer surface.  Vertex/fragment shader constants will be
  * read from this buffer with Data Port Read instructions/messages.
  */
-dri_bo *
+struct brw_winsys_buffer *
 brw_create_constant_surface( struct brw_context *brw,
                              struct brw_surface_key *key )
 {
    const GLint w = key->width - 1;
    struct brw_surface_state surf;
-   dri_bo *bo;
+   struct brw_winsys_buffer *bo;
 
    memset(&surf, 0, sizeof(surf));
 
@@ -374,7 +355,6 @@ brw_create_constant_surface( struct brw_context *brw,
 static drm_intel_bo *
 brw_wm_update_constant_buffer(struct brw_context *brw)
 {
-   struct intel_context *intel = &brw->intel;
    struct brw_fragment_program *fp =
       (struct brw_fragment_program *) brw->fragment_program;
    const struct gl_program_parameter_list *params = fp->program.Base.Parameters;
@@ -399,7 +379,7 @@ brw_wm_update_constant_buffer(struct brw_context *brw)
  * The constant buffer will be (re)allocated here if needed.
  */
 static void
-brw_update_wm_constant_surface( GLcontext *ctx,
+brw_update_wm_constant_surface( struct brw_context *brw,
                                 GLuint surf)
 {
    struct brw_context *brw = brw_context(ctx);
@@ -412,7 +392,7 @@ brw_update_wm_constant_surface( GLcontext *ctx,
    /* If we're in this state update atom, we need to update WM constants, so
     * free the old buffer and create a new one for the new contents.
     */
-   dri_bo_unreference(fp->const_buffer);
+   brw->sws->bo_unreference(fp->const_buffer);
    fp->const_buffer = brw_wm_update_constant_buffer(brw);
 
    /* If there's no constant buffer, then no surface BO is needed to point at
@@ -426,7 +406,7 @@ brw_update_wm_constant_surface( GLcontext *ctx,
 
    memset(&key, 0, sizeof(key));
 
-   key.format = MESA_FORMAT_RGBA_FLOAT32;
+   key.format = PIPE_FORMAT_RGBA_FLOAT32;
    key.internal_format = GL_RGBA;
    key.bo = fp->const_buffer;
    key.depthmode = GL_NONE;
@@ -442,7 +422,7 @@ brw_update_wm_constant_surface( GLcontext *ctx,
           key.width, key.height, key.depth, key.cpp, key.pitch);
    */
 
-   dri_bo_unreference(brw->wm.surf_bo[surf]);
+   brw->sws->bo_unreference(brw->wm.surf_bo[surf]);
    brw->wm.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
                                             BRW_SS_SURFACE,
                                             &key, sizeof(key),
@@ -464,7 +444,6 @@ brw_update_wm_constant_surface( GLcontext *ctx,
  */
 static void prepare_wm_constant_surface(struct brw_context *brw )
 {
-   GLcontext *ctx = &brw->intel.ctx;
    struct brw_fragment_program *fp =
       (struct brw_fragment_program *) brw->fragment_program;
    GLuint surf = SURF_INDEX_FRAG_CONST_BUFFER;
@@ -507,8 +486,7 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
 				struct gl_renderbuffer *rb,
 				unsigned int unit)
 {
-   GLcontext *ctx = &brw->intel.ctx;
-   dri_bo *region_bo = NULL;
+   struct brw_winsys_buffer *region_bo = NULL;
    struct intel_renderbuffer *irb = intel_renderbuffer(rb);
    struct intel_region *region = irb ? irb->region : NULL;
    struct {
@@ -528,16 +506,16 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
 
       key.surface_type = BRW_SURFACE_2D;
       switch (irb->texformat->MesaFormat) {
-      case MESA_FORMAT_ARGB8888:
+      case PIPE_FORMAT_ARGB8888:
 	 key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
 	 break;
-      case MESA_FORMAT_RGB565:
+      case PIPE_FORMAT_RGB565:
 	 key.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM;
 	 break;
-      case MESA_FORMAT_ARGB1555:
+      case PIPE_FORMAT_ARGB1555:
 	 key.surface_format = BRW_SURFACEFORMAT_B5G5R5A1_UNORM;
 	 break;
-      case MESA_FORMAT_ARGB4444:
+      case PIPE_FORMAT_ARGB4444:
 	 key.surface_format = BRW_SURFACEFORMAT_B4G4R4A4_UNORM;
 	 break;
       default:
@@ -569,7 +547,7 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
    key.color_blend = (!ctx->Color._LogicOpEnabled &&
 		      ctx->Color.BlendEnabled);
 
-   dri_bo_unreference(brw->wm.surf_bo[unit]);
+   brw->sws->bo_unreference(brw->wm.surf_bo[unit]);
    brw->wm.surf_bo[unit] = brw_search_cache(&brw->surface_cache,
 					    BRW_SS_SURFACE,
 					    &key, sizeof(key),
@@ -646,10 +624,10 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
  * Constructs the binding table for the WM surface state, which maps unit
  * numbers to surface state objects.
  */
-static dri_bo *
+static struct brw_winsys_buffer *
 brw_wm_get_binding_table(struct brw_context *brw)
 {
-   dri_bo *bind_bo;
+   struct brw_winsys_buffer *bind_bo;
 
    assert(brw->wm.nr_surfaces <= BRW_WM_MAX_SURF);
 
@@ -692,7 +670,6 @@ brw_wm_get_binding_table(struct brw_context *brw)
 
 static void prepare_wm_surfaces(struct brw_context *brw )
 {
-   GLcontext *ctx = &brw->intel.ctx;
    GLuint i;
    int old_nr_surfaces;
 
@@ -724,12 +701,12 @@ static void prepare_wm_surfaces(struct brw_context *brw )
 	 brw_update_texture_surface(ctx, i);
 	 brw->wm.nr_surfaces = surf + 1;
       } else {
-         dri_bo_unreference(brw->wm.surf_bo[surf]);
+         brw->sws->bo_unreference(brw->wm.surf_bo[surf]);
          brw->wm.surf_bo[surf] = NULL;
       }
    }
 
-   dri_bo_unreference(brw->wm.bind_bo);
+   brw->sws->bo_unreference(brw->wm.bind_bo);
    brw->wm.bind_bo = brw_wm_get_binding_table(brw);
 
    if (brw->wm.nr_surfaces != old_nr_surfaces)
diff --git a/src/gallium/drivers/i965/intel_batchbuffer.h b/src/gallium/drivers/i965/intel_batchbuffer.h
index a595d2e0c5..be04656aec 100644
--- a/src/gallium/drivers/i965/intel_batchbuffer.h
+++ b/src/gallium/drivers/i965/intel_batchbuffer.h
@@ -1,9 +1,6 @@
 #ifndef INTEL_BATCHBUFFER_H
 #define INTEL_BATCHBUFFER_H
 
-#include "main/mtypes.h"
-
-#include "intel_context.h"
 #include "intel_bufmgr.h"
 #include "intel_reg.h"
 
@@ -44,7 +41,7 @@ struct intel_batchbuffer
 {
    struct intel_context *intel;
 
-   dri_bo *buf;
+   struct brw_winsys_buffer *buf;
 
    GLubyte *buffer;
 
@@ -89,7 +86,7 @@ void intel_batchbuffer_release_space(struct intel_batchbuffer *batch,
                                      GLuint bytes);
 
 GLboolean intel_batchbuffer_emit_reloc(struct intel_batchbuffer *batch,
-                                       dri_bo *buffer,
+                                       struct brw_winsys_buffer *buffer,
 				       uint32_t read_domains,
 				       uint32_t write_domain,
 				       uint32_t offset);
diff --git a/src/gallium/drivers/i965/intel_tex_format.c b/src/gallium/drivers/i965/intel_tex_format.c
index 3322a71130..c62ecdadf0 100644
--- a/src/gallium/drivers/i965/intel_tex_format.c
+++ b/src/gallium/drivers/i965/intel_tex_format.c
@@ -1,206 +1,9 @@
 #include "intel_context.h"
 #include "intel_tex.h"
 #include "intel_chipset.h"
-#include "main/texformat.h"
-#include "main/enums.h"
 
 
-/**
- * Choose hardware texture format given the user's glTexImage parameters.
- *
- * It works out that this function is fine for all the supported
- * hardware.  However, there is still a need to map the formats onto
- * hardware descriptors.
- *
- * Note that the i915 can actually support many more formats than
- * these if we take the step of simply swizzling the colors
- * immediately after sampling...
- */
-const struct gl_texture_format *
-intelChooseTextureFormat(GLcontext * ctx, GLint internalFormat,
-                         GLenum format, GLenum type)
-{
-   struct intel_context *intel = intel_context(ctx);
-   const GLboolean do32bpt = (intel->ctx.Visual.rgbBits >= 24);
-
-#if 0
-   printf("%s intFmt=0x%x format=0x%x type=0x%x\n",
-          __FUNCTION__, internalFormat, format, type);
-#endif
-
-   switch (internalFormat) {
-   case 4:
-   case GL_RGBA:
-   case GL_COMPRESSED_RGBA:
-      if (format == GL_BGRA) {
-         if (type == GL_UNSIGNED_BYTE || type == GL_UNSIGNED_INT_8_8_8_8_REV) {
-            return &_mesa_texformat_argb8888;
-         }
-         else if (type == GL_UNSIGNED_SHORT_4_4_4_4_REV) {
-            return &_mesa_texformat_argb4444;
-         }
-         else if (type == GL_UNSIGNED_SHORT_1_5_5_5_REV) {
-            return &_mesa_texformat_argb1555;
-         }
-      }
-      return do32bpt ? &_mesa_texformat_argb8888 : &_mesa_texformat_argb4444;
-
-   case 3:
-   case GL_RGB:
-   case GL_COMPRESSED_RGB:
-      if (format == GL_RGB && type == GL_UNSIGNED_SHORT_5_6_5) {
-         return &_mesa_texformat_rgb565;
-      }
-      return do32bpt ? &_mesa_texformat_argb8888 : &_mesa_texformat_rgb565;
-
-   case GL_RGBA8:
-   case GL_RGB10_A2:
-   case GL_RGBA12:
-   case GL_RGBA16:
-      return do32bpt ? &_mesa_texformat_argb8888 : &_mesa_texformat_argb4444;
-
-   case GL_RGBA4:
-   case GL_RGBA2:
-      return &_mesa_texformat_argb4444;
-
-   case GL_RGB5_A1:
-      return &_mesa_texformat_argb1555;
-
-   case GL_RGB8:
-   case GL_RGB10:
-   case GL_RGB12:
-   case GL_RGB16:
-      return &_mesa_texformat_argb8888;
-
-   case GL_RGB5:
-   case GL_RGB4:
-   case GL_R3_G3_B2:
-      return &_mesa_texformat_rgb565;
-
-   case GL_ALPHA:
-   case GL_ALPHA4:
-   case GL_ALPHA8:
-   case GL_ALPHA12:
-   case GL_ALPHA16:
-   case GL_COMPRESSED_ALPHA:
-      return &_mesa_texformat_a8;
-
-   case 1:
-   case GL_LUMINANCE:
-   case GL_LUMINANCE4:
-   case GL_LUMINANCE8:
-   case GL_LUMINANCE12:
-   case GL_LUMINANCE16:
-   case GL_COMPRESSED_LUMINANCE:
-      return &_mesa_texformat_l8;
-
-   case 2:
-   case GL_LUMINANCE_ALPHA:
-   case GL_LUMINANCE4_ALPHA4:
-   case GL_LUMINANCE6_ALPHA2:
-   case GL_LUMINANCE8_ALPHA8:
-   case GL_LUMINANCE12_ALPHA4:
-   case GL_LUMINANCE12_ALPHA12:
-   case GL_LUMINANCE16_ALPHA16:
-   case GL_COMPRESSED_LUMINANCE_ALPHA:
-      return &_mesa_texformat_al88;
-
-   case GL_INTENSITY:
-   case GL_INTENSITY4:
-   case GL_INTENSITY8:
-   case GL_INTENSITY12:
-   case GL_INTENSITY16:
-   case GL_COMPRESSED_INTENSITY:
-      return &_mesa_texformat_i8;
 
-   case GL_YCBCR_MESA:
-      if (type == GL_UNSIGNED_SHORT_8_8_MESA || type == GL_UNSIGNED_BYTE)
-         return &_mesa_texformat_ycbcr;
-      else
-         return &_mesa_texformat_ycbcr_rev;
-
-   case GL_COMPRESSED_RGB_FXT1_3DFX:
-      return &_mesa_texformat_rgb_fxt1;
-   case GL_COMPRESSED_RGBA_FXT1_3DFX:
-      return &_mesa_texformat_rgba_fxt1;
-
-   case GL_RGB_S3TC:
-   case GL_RGB4_S3TC:
-   case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
-      return &_mesa_texformat_rgb_dxt1;
-
-   case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
-      return &_mesa_texformat_rgba_dxt1;
-
-   case GL_RGBA_S3TC:
-   case GL_RGBA4_S3TC:
-   case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
-      return &_mesa_texformat_rgba_dxt3;
-
-   case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
-      return &_mesa_texformat_rgba_dxt5;
-
-   case GL_DEPTH_COMPONENT:
-   case GL_DEPTH_COMPONENT16:
-   case GL_DEPTH_COMPONENT24:
-   case GL_DEPTH_COMPONENT32:
-#if 0
-      return &_mesa_texformat_z16;
-#else
-      /* fall-through.
-       * 16bpp depth texture can't be paired with a stencil buffer so
-       * always used combined depth/stencil format.
-       */
-#endif
-   case GL_DEPTH_STENCIL_EXT:
-   case GL_DEPTH24_STENCIL8_EXT:
-      return &_mesa_texformat_s8_z24;
-
-#ifndef I915
-   case GL_SRGB_EXT:
-   case GL_SRGB8_EXT:
-   case GL_SRGB_ALPHA_EXT:
-   case GL_SRGB8_ALPHA8_EXT:
-   case GL_COMPRESSED_SRGB_EXT:
-   case GL_COMPRESSED_SRGB_ALPHA_EXT:
-   case GL_COMPRESSED_SLUMINANCE_EXT:
-   case GL_COMPRESSED_SLUMINANCE_ALPHA_EXT:
-      return &_mesa_texformat_sargb8;
-   case GL_SLUMINANCE_EXT:
-   case GL_SLUMINANCE8_EXT:
-      if (IS_G4X(intel->intelScreen->deviceID))
-         return &_mesa_texformat_sl8;
-      else
-         return &_mesa_texformat_sargb8;
-   case GL_SLUMINANCE_ALPHA_EXT:
-   case GL_SLUMINANCE8_ALPHA8_EXT:
-      if (IS_G4X(intel->intelScreen->deviceID))
-         return &_mesa_texformat_sla8;
-      else
-         return &_mesa_texformat_sargb8;
-   case GL_COMPRESSED_SRGB_S3TC_DXT1_EXT:
-   case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT:
-   case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT:
-   case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT:
-      return &_mesa_texformat_srgb_dxt1;
-
-   /* i915 could also do this */
-   case GL_DUDV_ATI:
-   case GL_DU8DV8_ATI:
-      return &_mesa_texformat_dudv8;
-   case GL_RGBA_SNORM:
-   case GL_RGBA8_SNORM:
-      return &_mesa_texformat_signed_rgba8888_rev;
-#endif
-
-   default:
-      fprintf(stderr, "unexpected texture format %s in %s\n",
-              _mesa_lookup_enum_by_nr(internalFormat), __FUNCTION__);
-      return NULL;
-   }
-
-   return NULL;                 /* never get here */
-}
 
 int intel_compressed_num_bytes(GLuint mesaFormat)
 {
diff --git a/src/gallium/drivers/i965/intel_tex_layout.c b/src/gallium/drivers/i965/intel_tex_layout.c
index 7d69ea4484..1cdab49e5e 100644
--- a/src/gallium/drivers/i965/intel_tex_layout.c
+++ b/src/gallium/drivers/i965/intel_tex_layout.c
@@ -33,7 +33,6 @@
 #include "intel_mipmap_tree.h"
 #include "intel_tex_layout.h"
 #include "intel_context.h"
-#include "main/macros.h"
 
 void intel_get_texture_alignment_unit(GLenum internalFormat, GLuint *w, GLuint *h)
 {
@@ -86,7 +85,7 @@ void i945_miptree_layout_2d( struct intel_context *intel,
     * constraints of mipmap placement push the right edge of the
     * 2nd mipmap out past the width of its parent.
     */
-   if (mt->first_level != mt->last_level) {
+   if (mt->last_level) {
        GLuint mip1_width;
 
        if (mt->compressed) {
@@ -108,7 +107,7 @@ void i945_miptree_layout_2d( struct intel_context *intel,
    mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->pitch);
    mt->total_height = 0;
 
-   for ( level = mt->first_level ; level <= mt->last_level ; level++ ) {
+   for ( level = 0 ; level <= mt->last_level ; level++ ) {
       GLuint img_height;
 
       intel_miptree_set_level_info(mt, level, 1, x, y, width, 
@@ -127,7 +126,7 @@ void i945_miptree_layout_2d( struct intel_context *intel,
 
       /* Layout_below: step right after second mipmap.
        */
-      if (level == mt->first_level + 1) {
+      if (level == 1) {
 	 x += ALIGN(width, align_w);
       }
       else {
-- 
cgit v1.2.3


From 074606a806df755ecbb84e0a1182c66fd0b2a8dd Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Sat, 24 Oct 2009 13:18:34 +0100
Subject: i965g: more files compiling

---
 src/gallium/drivers/i965/brw_batchbuffer.h      | 124 ++++++++++++
 src/gallium/drivers/i965/brw_cc.c               |  16 +-
 src/gallium/drivers/i965/brw_clip.c             |  80 +++-----
 src/gallium/drivers/i965/brw_clip.h             |   7 +-
 src/gallium/drivers/i965/brw_clip_unfilled.c    |   2 +-
 src/gallium/drivers/i965/brw_clip_util.c        |   2 +-
 src/gallium/drivers/i965/brw_context.c          |   2 +-
 src/gallium/drivers/i965/brw_context.h          |  89 ++++-----
 src/gallium/drivers/i965/brw_curbe.c            |  10 +-
 src/gallium/drivers/i965/brw_defines.h          |   4 +-
 src/gallium/drivers/i965/brw_draw.c             |  12 +-
 src/gallium/drivers/i965/brw_draw_upload.c      |   2 +-
 src/gallium/drivers/i965/brw_eu.h               |  32 +++-
 src/gallium/drivers/i965/brw_eu_emit.c          |   4 +-
 src/gallium/drivers/i965/brw_gs.c               |   2 +-
 src/gallium/drivers/i965/brw_gs_emit.c          |   2 +-
 src/gallium/drivers/i965/brw_misc_state.c       |   2 +-
 src/gallium/drivers/i965/brw_pipe_flush.c       |   2 +-
 src/gallium/drivers/i965/brw_pipe_query.c       |   4 +-
 src/gallium/drivers/i965/brw_pipe_rast.c        |  46 +++++
 src/gallium/drivers/i965/brw_pipe_rast.h        |  14 ++
 src/gallium/drivers/i965/brw_pipe_shader.c      | 159 ++++++++++++++++
 src/gallium/drivers/i965/brw_reg.h              |  79 ++++++++
 src/gallium/drivers/i965/brw_screen.h           |  78 ++++++++
 src/gallium/drivers/i965/brw_screen_surface.c   |   4 +-
 src/gallium/drivers/i965/brw_sf.c               |   2 +-
 src/gallium/drivers/i965/brw_sf.h               |   1 -
 src/gallium/drivers/i965/brw_sf_emit.c          |   2 +-
 src/gallium/drivers/i965/brw_state.h            |   2 +-
 src/gallium/drivers/i965/brw_state_batch.c      |   6 +-
 src/gallium/drivers/i965/brw_state_cache.c      |   2 +-
 src/gallium/drivers/i965/brw_state_upload.c     |   2 +-
 src/gallium/drivers/i965/brw_tex_layout.c       |   2 +-
 src/gallium/drivers/i965/brw_urb.c              |   2 +-
 src/gallium/drivers/i965/brw_util.h             |   5 +-
 src/gallium/drivers/i965/brw_vs.c               |   3 +-
 src/gallium/drivers/i965/brw_vs.h               |   1 -
 src/gallium/drivers/i965/brw_vs_emit.c          |  82 ++++----
 src/gallium/drivers/i965/brw_winsys.h           | 243 ++++++++++++++++++++++++
 src/gallium/drivers/i965/brw_wm.h               |   1 -
 src/gallium/drivers/i965/brw_wm_debug.c         |   2 +-
 src/gallium/drivers/i965/brw_wm_emit.c          |  84 ++++----
 src/gallium/drivers/i965/brw_wm_fp.c            |  60 +++---
 src/gallium/drivers/i965/brw_wm_pass0.c         |   1 -
 src/gallium/drivers/i965/brw_wm_pass1.c         |  68 +++----
 src/gallium/drivers/i965/brw_wm_surface_state.c |   2 +-
 src/gallium/drivers/i965/intel_batchbuffer.h    | 168 ----------------
 47 files changed, 1027 insertions(+), 492 deletions(-)
 create mode 100644 src/gallium/drivers/i965/brw_batchbuffer.h
 create mode 100644 src/gallium/drivers/i965/brw_pipe_rast.c
 create mode 100644 src/gallium/drivers/i965/brw_pipe_rast.h
 create mode 100644 src/gallium/drivers/i965/brw_pipe_shader.c
 create mode 100644 src/gallium/drivers/i965/brw_reg.h
 create mode 100644 src/gallium/drivers/i965/brw_screen.h
 create mode 100644 src/gallium/drivers/i965/brw_winsys.h
 delete mode 100644 src/gallium/drivers/i965/intel_batchbuffer.h

(limited to 'src/gallium/drivers/i965/brw_vs.c')

diff --git a/src/gallium/drivers/i965/brw_batchbuffer.h b/src/gallium/drivers/i965/brw_batchbuffer.h
new file mode 100644
index 0000000000..76b3c1bf69
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_batchbuffer.h
@@ -0,0 +1,124 @@
+#ifndef BRW_BATCHBUFFER_H
+#define BRW_BATCHBUFFER_H
+
+#include "brw_types.h"
+#include "brw_winsys.h"
+#include "brw_reg.h"
+
+#define BATCH_SZ 16384
+#define BATCH_RESERVED 16
+
+/* Cliprect modes taken by the batch entry points below.  All ignored:
+ * brw_batchbuffer_require_space() discards the mode it is passed. */
+enum cliprect_mode {
+   IGNORE_CLIPRECTS,
+   LOOP_CLIPRECTS,
+   NO_LOOP_CLIPRECTS,
+   REFERENCES_CLIPRECTS
+};
+
+void brw_batchbuffer_free(struct brw_batchbuffer *batch);
+
+void _brw_batchbuffer_flush(struct brw_batchbuffer *batch,
+			      const char *file, int line);
+
+#define brw_batchbuffer_flush(batch) \
+	_brw_batchbuffer_flush(batch, __FILE__, __LINE__)
+
+void brw_batchbuffer_reset(struct brw_batchbuffer *batch);
+
+
+/* Unlike bmBufferData, this currently requires the buffer be mapped.
+ * Consider it a convenience function wrapping multiple
+ * intel_buffer_dword() calls.
+ */
+void brw_batchbuffer_data(struct brw_batchbuffer *batch,
+                            const void *data, GLuint bytes,
+			    enum cliprect_mode cliprect_mode);
+
+void brw_batchbuffer_release_space(struct brw_batchbuffer *batch,
+                                     GLuint bytes);
+
+GLboolean brw_batchbuffer_emit_reloc(struct brw_batchbuffer *batch,
+                                       struct brw_winsys_buffer *buffer,
+				       uint32_t read_domains,
+				       uint32_t write_domain,
+				       uint32_t offset);
+
+/* Inline functions - might actually be better off with these
+ * non-inlined.  Certainly better off switching all command packets to
+ * be passed as structs rather than dwords, but that's a little bit of
+ * work...
+ */
+/* Space remaining in the batch once BATCH_RESERVED has been set aside. */
+static INLINE GLint brw_batchbuffer_space(struct brw_batchbuffer *batch)
+{
+   return batch->size - BATCH_RESERVED - (batch->ptr - batch->map);
+}
+
+
+static INLINE void
+brw_batchbuffer_emit_dword(struct brw_batchbuffer *batch, GLuint dword)
+{
+   assert(batch->map);                         /* map pointer must be set */
+   assert(brw_batchbuffer_space(batch) >= 4);  /* room for one more dword */
+   *(GLuint *) (batch->ptr) = dword;           /* store at the write pointer */
+   batch->ptr += 4;                            /* step past the stored dword */
+}
+
+/* Check that sz more bytes fit in the batch: TRUE if so, else assert + FALSE. */
+static INLINE boolean
+brw_batchbuffer_require_space(struct brw_batchbuffer *batch,
+                                GLuint sz,
+				enum cliprect_mode cliprect_mode)
+{
+   /* Every command is executed once regardless of cliprect mode. */
+   (void)cliprect_mode;
+   assert(sz < batch->size - 8);
+   if (brw_batchbuffer_space(batch) < sz) {
+      assert(0);
+      return FALSE;
+   }
+
+   return TRUE;   /* success path previously fell off the end of the function */
+}
+
+/* Here are the crusty old macros, to be removed:
+ */
+#define BATCH_LOCALS
+
+#define BEGIN_BATCH(n, cliprect_mode) do {	/* open an n-dword packet */	\
+   brw_batchbuffer_require_space(intel->batch, (n)*4, cliprect_mode); \
+   assert(intel->batch->emit.start_ptr == NULL); /* no packet open yet */ \
+   intel->batch->emit.total = (n) * 4;	/* size ADVANCE_BATCH will expect */ \
+   intel->batch->emit.start_ptr = intel->batch->ptr;			\
+} while (0)
+
+#define OUT_BATCH(d) brw_batchbuffer_emit_dword(intel->batch, d)
+
+#define OUT_RELOC(buf, read_domains, write_domain, delta) do {		\
+   assert((unsigned) (delta) < (buf)->size); /* delta must lie inside buf */ \
+   brw_batchbuffer_emit_reloc(intel->batch, buf,			\
+				read_domains, write_domain, delta);	\
+} while (0)
+
+#define ADVANCE_BATCH() do {	/* close the packet opened by BEGIN_BATCH */ \
+   unsigned int _n = intel->batch->ptr - intel->batch->emit.start_ptr;	\
+   assert(intel->batch->emit.start_ptr != NULL);			\
+   if (_n != intel->batch->emit.total) { /* emitted size != promised size */ \
+      fprintf(stderr, "ADVANCE_BATCH: %d of %d dwords emitted\n",	\
+	      _n, intel->batch->emit.total); /* NOTE(review): look like byte counts */ \
+      abort();								\
+   }									\
+   intel->batch->emit.start_ptr = NULL;	/* mark no packet as open */	\
+} while(0)
+
+
+static INLINE void
+brw_batchbuffer_emit_mi_flush(struct brw_batchbuffer *batch)
+{
+   brw_batchbuffer_require_space(batch, 4, IGNORE_CLIPRECTS); /* result unused */
+   brw_batchbuffer_emit_dword(batch, MI_FLUSH);   /* the single MI_FLUSH dword */
+}
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_cc.c b/src/gallium/drivers/i965/brw_cc.c
index bf2743ebbe..c8e7851d75 100644
--- a/src/gallium/drivers/i965/brw_cc.c
+++ b/src/gallium/drivers/i965/brw_cc.c
@@ -65,7 +65,7 @@ static void prepare_cc_vp( struct brw_context *brw )
    memset(&ccv, 0, sizeof(ccv));
 
    /* PIPE_NEW_VIEWPORT */
-   calc_sane_viewport( &brw->vp, &svp );
+   calc_sane_viewport( &brw->curr.vp, &svp );
 
    ccv.min_depth = svp.near;
    ccv.max_depth = svp.far;
@@ -109,13 +109,13 @@ static void
 cc_unit_populate_key(const struct brw_context *brw,
 		     struct brw_cc_unit_key *key)
 {
-   key->cc0 = brw->dsa->cc0;
-   key->cc1 = brw->dsa->cc1;
-   key->cc2 = brw->dsa->cc2;
-   key->cc3 = combine_cc3( brw->dsa->cc3, brw->blend->cc3 );
-   key->cc5 = brw->blend->cc5;
-   key->cc6 = brw->blend->cc6;
-   key->cc7 = brw->blend->cc7;
+   key->cc0 = brw->curr.dsa->cc0;
+   key->cc1 = brw->curr.dsa->cc1;
+   key->cc2 = brw->curr.dsa->cc2;
+   key->cc3 = combine_cc3( brw->curr.dsa->cc3, brw->curr.blend->cc3 );
+   key->cc5 = brw->curr.blend->cc5;
+   key->cc6 = brw->curr.blend->cc6;
+   key->cc7 = brw->curr.blend->cc7;
 }
 
 /**
diff --git a/src/gallium/drivers/i965/brw_clip.c b/src/gallium/drivers/i965/brw_clip.c
index d82ebeb9a9..591e904705 100644
--- a/src/gallium/drivers/i965/brw_clip.c
+++ b/src/gallium/drivers/i965/brw_clip.c
@@ -33,13 +33,14 @@
 
 #include "util/u_math.h"
 
-#include "intel_batchbuffer.h"
-
+#include "brw_screen.h"
+#include "brw_batchbuffer.h"
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_eu.h"
 #include "brw_util.h"
 #include "brw_state.h"
+#include "brw_pipe_rast.h"
 #include "brw_clip.h"
 
 
@@ -77,13 +78,16 @@ static void compile_clip_prog( struct brw_context *brw,
    else
        delta = REG_SIZE;
 
-   for (i = 0; i < VERT_RESULT_MAX; i++)
-      if (c.key.attrs & (1<<i)) {
-	 c.offset[i] = delta;
-	 delta += ATTR_SIZE;
-      }
+   /* XXX: c.offset is now pretty redundant:
+    */
+   for (i = 0; i < c.key.nr_attrs; i++) {
+      c.offset[i] = delta;
+      delta += ATTR_SIZE;
+   }
 
-   c.nr_attrs = util_count_bits(c.key.attrs);
+   /* XXX: c.nr_attrs is very redundant:
+    */
+   c.nr_attrs = c.key.nr_attrs;
    
    if (BRW_IS_IGDNG(brw))
        c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
@@ -145,59 +149,21 @@ static void upload_clip_prog(struct brw_context *brw)
 {
    struct brw_clip_prog_key key;
 
-   memset(&key, 0, sizeof(key));
-
-   /* Populate the key:
+   /* Populate the key, starting from the almost-complete version from
+    * the rast state. 
     */
+
+   /* PIPE_NEW_RAST */
+   memcpy(&key, &brw->curr.rast->clip_key, sizeof key);
+
    /* BRW_NEW_REDUCED_PRIMITIVE */
    key.primitive = brw->reduced_primitive;
-   /* CACHE_NEW_VS_PROG */
-   key.attrs = brw->vs.prog_data->outputs_written;
-   /* PIPE_NEW_RAST */
-   key.do_flat_shading = brw->rast.base.flatshade;
-   /* PIPE_NEW_UCP */
-   key.nr_userclip = brw->nr_ucp;
 
-   if (BRW_IS_IGDNG(brw))
-       key.clip_mode = BRW_CLIPMODE_KERNEL_CLIP;
-   else
-       key.clip_mode = BRW_CLIPMODE_NORMAL;
+   /* PIPE_NEW_VS */
+   key.nr_attrs = brw->curr.vs->info.file_max[TGSI_FILE_OUTPUT] + 1;
 
-   /* PIPE_NEW_RAST */
-   if (key.primitive == PIPE_PRIM_TRIANGLES) {
-      if (brw->rast->cull_mode = PIPE_WINDING_BOTH)
-	 key.clip_mode = BRW_CLIPMODE_REJECT_ALL;
-      else {
-	 key.fill_ccw = CLIP_CULL;
-	 key.fill_cw = CLIP_CULL;
-
-	 if (!(brw->rast->cull_mode & PIPE_WINDING_CCW)) {
-	    key.fill_ccw = translate_fill(brw->rast.fill_ccw);
-	 }
-
-	 if (!(brw->rast->cull_mode & PIPE_WINDING_CW)) {
-	    key.fill_cw = translate_fill(brw->rast.fill_cw);
-	 }
-
-	 if (key.fill_cw != CLIP_FILL ||
-	     key.fill_ccw != CLIP_FILL) {
-	    key.do_unfilled = 1;
-	    key.clip_mode = BRW_CLIPMODE_CLIP_NON_REJECTED;
-	 }
-
-	 key.offset_ccw = brw->rast.offset_ccw;
-	 key.offset_cw = brw->rast.offset_cw;
-
-	 if (brw->rast.light_twoside &&
-	     key.fill_cw != CLIP_CULL) 
-	    key.copy_bfc_cw = 1;
-
-	 if (brw->rast.light_twoside &&
-	     key.fill_ccw != CLIP_CULL) 
-	    key.copy_bfc_ccw = 1;
-	 }
-      }
-   }
+   /* PIPE_NEW_CLIP */
+   key.nr_userclip = brw->curr.ucp.nr;
 
    brw->sws->bo_unreference(brw->clip.prog_bo);
    brw->clip.prog_bo = brw_search_cache(&brw->cache, BRW_CLIP_PROG,
@@ -212,7 +178,7 @@ static void upload_clip_prog(struct brw_context *brw)
 const struct brw_tracked_state brw_clip_prog = {
    .dirty = {
       .mesa  = (PIPE_NEW_RAST | 
-		PIPE_NEW_UCP),
+		PIPE_NEW_CLIP),
       .brw   = (BRW_NEW_REDUCED_PRIMITIVE),
       .cache = CACHE_NEW_VS_PROG
    },
diff --git a/src/gallium/drivers/i965/brw_clip.h b/src/gallium/drivers/i965/brw_clip.h
index d80ec819b9..cfe51bf292 100644
--- a/src/gallium/drivers/i965/brw_clip.h
+++ b/src/gallium/drivers/i965/brw_clip.h
@@ -42,8 +42,7 @@
  * up polygon offset and flatshading at this point:
  */
 struct brw_clip_prog_key {
-   GLuint attrs:32;		
-
+   GLuint nr_attrs:5;
    GLuint primitive:4;
    GLuint nr_userclip:3;
    GLuint do_flat_shading:1;
@@ -55,7 +54,7 @@ struct brw_clip_prog_key {
    GLuint copy_bfc_cw:1;
    GLuint copy_bfc_ccw:1;
    GLuint clip_mode:3;
-   GLuint pad1:12;
+   GLuint pad1:7;
    
    GLfloat offset_factor;
    GLfloat offset_units;
@@ -117,7 +116,7 @@ struct brw_clip_compile {
    GLuint last_mrf;
 
    GLuint header_position_offset;
-   GLuint offset[VERT_ATTRIB_MAX];
+   GLuint offset[PIPE_MAX_SHADER_OUTPUTS];
    GLboolean need_ff_sync;
 };
 
diff --git a/src/gallium/drivers/i965/brw_clip_unfilled.c b/src/gallium/drivers/i965/brw_clip_unfilled.c
index 4baff55806..8501599aef 100644
--- a/src/gallium/drivers/i965/brw_clip_unfilled.c
+++ b/src/gallium/drivers/i965/brw_clip_unfilled.c
@@ -29,7 +29,7 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 
 #include "brw_defines.h"
 #include "brw_context.h"
diff --git a/src/gallium/drivers/i965/brw_clip_util.c b/src/gallium/drivers/i965/brw_clip_util.c
index 7a6c46ce07..60bfd3538e 100644
--- a/src/gallium/drivers/i965/brw_clip_util.c
+++ b/src/gallium/drivers/i965/brw_clip_util.c
@@ -93,7 +93,7 @@ void brw_clip_project_position(struct brw_clip_compile *c, struct brw_reg pos )
    /* value.xyz *= value.rhw
     */
    brw_set_access_mode(p, BRW_ALIGN_16);
-   brw_MUL(p, brw_writemask(pos, WRITEMASK_XYZ), pos, brw_swizzle1(pos, W));
+   brw_MUL(p, brw_writemask(pos, BRW_WRITEMASK_XYZ), pos, brw_swizzle1(pos, W));
    brw_set_access_mode(p, BRW_ALIGN_1);
 }
 
diff --git a/src/gallium/drivers/i965/brw_context.c b/src/gallium/drivers/i965/brw_context.c
index 063ada5772..07a5420d6e 100644
--- a/src/gallium/drivers/i965/brw_context.c
+++ b/src/gallium/drivers/i965/brw_context.c
@@ -38,7 +38,7 @@
 #include "brw_state.h"
 #include "brw_vs.h"
 #include "brw_screen_tex.h"
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 
 
diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h
index 6699d3bdb6..3a2fece45c 100644
--- a/src/gallium/drivers/i965/brw_context.h
+++ b/src/gallium/drivers/i965/brw_context.h
@@ -36,6 +36,8 @@
 #include "brw_structs.h"
 #include "brw_winsys.h"
 #include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "tgsi/tgsi_scan.h"
 
 
 /* Glossary:
@@ -143,6 +145,27 @@ struct brw_blend_state {
 };
 
 
+struct brw_rasterizer_state;
+
+
+struct brw_vertex_shader {
+   const struct tgsi_token *tokens;           /**< TGSI tokens of the shader */
+   struct tgsi_shader_info info;              /**< NOTE(review): presumably tgsi_scan output - confirm */
+
+   struct brw_winsys_buffer *const_buffer;    /**< Program constant buffer/surface */
+   GLboolean use_const_buffer;
+};
+
+
+struct brw_fragment_shader {
+   const struct tgsi_token *tokens;           /**< TGSI tokens of the shader */
+   struct tgsi_shader_info info;              /**< NOTE(review): presumably tgsi_scan output - confirm */
+
+   GLboolean isGLSL;                          /**< any IF/LOOP/CONT/BREAK instructions */
+
+   struct brw_winsys_buffer *const_buffer;    /**< Program constant buffer/surface */
+   GLboolean use_const_buffer;
+};
 
 
@@ -157,6 +180,7 @@ struct brw_blend_state {
 #define PIPE_NEW_VERTEX_SHADER          0x2
 #define PIPE_NEW_FRAGMENT_CONSTS        0x2
 #define PIPE_NEW_VERTEX_CONSTS          0x2
+#define PIPE_NEW_CLIP                   0x2
 
 
 #define BRW_NEW_URB_FENCE               0x1
@@ -196,25 +220,6 @@ struct brw_state_flags {
 };
 
 
-struct brw_vertex_program {
-   const struct tgsi_token *tokens;
-   GLuint id;
-   struct brw_winsys_buffer *const_buffer;    /** Program constant buffer/surface */
-   GLboolean use_const_buffer;
-};
-
-
-/** Subclass of Mesa fragment program */
-struct brw_fragment_program {
-   const struct tgsi_token *tokens;
-
-   GLuint id;  /**< serial no. to identify frag progs, never re-used */
-   GLboolean isGLSL;  /**< any IF/LOOP/CONT/BREAK instructions */
-
-   struct brw_winsys_buffer *const_buffer;    /** Program constant buffer/surface */
-   GLboolean use_const_buffer;
-};
-
 
 /* Data about a particular attempt to compile a program.  Note that
  * there can be many of these, each in a different GL state
@@ -452,24 +457,29 @@ struct brw_query_object {
  */
 struct brw_context 
 {
-   struct pipe_context *pipe;
-   struct pipe_screen *screen;
-   
+   struct pipe_context pipe;
+
+   struct brw_screen *brw_screen;   
    struct brw_winsys_screen *sws;
 
    GLuint primitive;
+   GLuint reduced_primitive;
 
    GLboolean emit_state_always;
    GLboolean no_batch_wrap;
 
    /* Active vertex program: 
     */
-   const struct gl_vertex_program *vertex_program;
-   const struct gl_fragment_program *fragment_program;
-   struct pipe_framebuffer_state fb;
-   struct brw_depth_stencil_alpha_state *dsa;
-   struct brw_blend_state *blend;
-   struct pipe_viewport_state vp;
+   struct {
+      const struct brw_vertex_shader *vs;
+      const struct brw_fragment_shader *fs;
+      const struct brw_blend_state *blend;
+      const struct brw_rasterizer_state *rast;
+      const struct brw_depth_stencil_alpha_state *dsa;
+      struct pipe_framebuffer_state fb;
+      struct pipe_viewport_state vp;
+      struct pipe_clip_state ucp;
+   } curr;
 
    struct {
       struct brw_state_flags dirty;
@@ -719,29 +729,6 @@ brw_context( struct pipe_context *ctx )
    return (struct brw_context *)ctx;
 }
 
-static INLINE struct brw_vertex_program *
-brw_vertex_program(struct gl_vertex_program *p)
-{
-   return (struct brw_vertex_program *) p;
-}
-
-static INLINE const struct brw_vertex_program *
-brw_vertex_program_const(const struct gl_vertex_program *p)
-{
-   return (const struct brw_vertex_program *) p;
-}
-
-static INLINE struct brw_fragment_program *
-brw_fragment_program(struct gl_fragment_program *p)
-{
-   return (struct brw_fragment_program *) p;
-}
-
-static INLINE const struct brw_fragment_program *
-brw_fragment_program_const(const struct gl_fragment_program *p)
-{
-   return (const struct brw_fragment_program *) p;
-}
 
 
diff --git a/src/gallium/drivers/i965/brw_curbe.c b/src/gallium/drivers/i965/brw_curbe.c
index 33ea9a00f7..f2524d75e2 100644
--- a/src/gallium/drivers/i965/brw_curbe.c
+++ b/src/gallium/drivers/i965/brw_curbe.c
@@ -30,7 +30,7 @@
   */
 
 
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 #include "intel_regions.h"
 #include "brw_context.h"
 #include "brw_defines.h"
@@ -55,8 +55,8 @@ static void calculate_curbe_offsets( struct brw_context *brw )
    GLuint nr_clip_regs = 0;
    GLuint total_regs;
 
-   /* PIPE_NEW_UCP */
-   if (brw->nr_ucp) {
+   /* PIPE_NEW_CLIP */
+   if (brw->curr.ucp.nr) {
       GLuint nr_planes = 6 + brw->nr_ucp;
       nr_clip_regs = (nr_planes * 4 + 15) / 16;
    }
@@ -106,7 +106,7 @@ static void calculate_curbe_offsets( struct brw_context *brw )
 
 const struct brw_tracked_state brw_curbe_offsets = {
    .dirty = {
-      .mesa = PIPE_NEW_UCP,
+      .mesa = PIPE_NEW_CLIP,
       .brw  = BRW_NEW_VERTEX_PROGRAM,
       .cache = CACHE_NEW_WM_PROG
    },
@@ -327,7 +327,7 @@ const struct brw_tracked_state brw_constant_buffer = {
    .dirty = {
       .mesa = (PIPE_NEW_FS_CONSTANTS |
 	       PIPE_NEW_VS_CONSTANTS |
-	       PIPE_NEW_UCP),
+	       PIPE_NEW_CLIP),
       .brw  = (BRW_NEW_FRAGMENT_PROGRAM |
 	       BRW_NEW_VERTEX_PROGRAM |
 	       BRW_NEW_URB_FENCE | /* Implicit - hardware requires this, not used above */
diff --git a/src/gallium/drivers/i965/brw_defines.h b/src/gallium/drivers/i965/brw_defines.h
index 282c5b18f4..1dc64ddc8f 100644
--- a/src/gallium/drivers/i965/brw_defines.h
+++ b/src/gallium/drivers/i965/brw_defines.h
@@ -840,8 +840,8 @@
 
 #include "intel_chipset.h"
 
-#define BRW_IS_G4X(brw)         (IS_G4X((brw)->brw_screen->deviceID))
-#define BRW_IS_IGDNG(brw)         (IS_IGDNG((brw)->brw_screen->deviceID))
+#define BRW_IS_G4X(brw)         (IS_G4X((brw)->brw_screen->pci_id))
+#define BRW_IS_IGDNG(brw)         (IS_IGDNG((brw)->brw_screen->pci_id))
 #define BRW_IS_965(brw)         (!(BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)))
 #define CMD_PIPELINE_SELECT(brw)        ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_PIPELINE_SELECT_GM45 : CMD_PIPELINE_SELECT_965)
 #define CMD_VF_STATISTICS(brw)          ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_VF_STATISTICS_GM45 : CMD_VF_STATISTICS_965)
diff --git a/src/gallium/drivers/i965/brw_draw.c b/src/gallium/drivers/i965/brw_draw.c
index 856999f3ef..741537309a 100644
--- a/src/gallium/drivers/i965/brw_draw.c
+++ b/src/gallium/drivers/i965/brw_draw.c
@@ -31,7 +31,7 @@
 #include "brw_context.h"
 #include "brw_state.h"
 
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 #include "intel_buffer_objects.h"
 
 #define FILE_DEBUG_FLAG DEBUG_BATCH
@@ -133,7 +133,7 @@ static void brw_emit_prim(struct brw_context *brw,
       ADVANCE_BATCH();
    }
    if (prim_packet.verts_per_instance) {
-      intel_batchbuffer_data( brw->intel.batch, &prim_packet,
+      brw_batchbuffer_data( brw->intel.batch, &prim_packet,
 			      sizeof(prim_packet), LOOP_CLIPRECTS);
    }
    if (intel->always_flush_cache) {
@@ -224,7 +224,7 @@ static GLboolean brw_try_draw_prims( struct brw_context *brw,
       return ret;
 
    if (intel->always_flush_batch)
-      intel_batchbuffer_flush(intel->batch);
+      brw_batchbuffer_flush(intel->batch);
 
    return 0;
 }
@@ -249,12 +249,10 @@ void brw_draw_prims( struct brw_context *brw,
     */
    ret = brw_try_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
 
-   /* Otherwise, we really are out of memory.  Pass the drawing
-    * command to the software tnl module and which will in turn call
-    * swrast to do the drawing.
+   /* Otherwise, flush and retry:
     */
    if (ret != 0) {
-      intel_batchbuffer_flush(intel->batch);
+      brw_batchbuffer_flush(intel->batch);
       ret = brw_try_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
       assert(ret == 0);
    }
diff --git a/src/gallium/drivers/i965/brw_draw_upload.c b/src/gallium/drivers/i965/brw_draw_upload.c
index dce015d79f..1ab65d60c4 100644
--- a/src/gallium/drivers/i965/brw_draw_upload.c
+++ b/src/gallium/drivers/i965/brw_draw_upload.c
@@ -35,7 +35,7 @@
 #include "brw_state.h"
 #include "brw_fallback.h"
 
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 #include "intel_buffer_objects.h"
 #include "intel_tex.h"
 
diff --git a/src/gallium/drivers/i965/brw_eu.h b/src/gallium/drivers/i965/brw_eu.h
index 30603bdd0e..46d52a473b 100644
--- a/src/gallium/drivers/i965/brw_eu.h
+++ b/src/gallium/drivers/i965/brw_eu.h
@@ -35,7 +35,6 @@
 
 #include "brw_structs.h"
 #include "brw_defines.h"
-#include "shader/prog_instruction.h"
 
 #define BRW_SWIZZLE4(a,b,c,d) (((a)<<0) | ((b)<<2) | ((c)<<4) | ((d)<<6))
 #define BRW_GET_SWZ(swz, idx) (((swz) >> ((idx)*2)) & 0x3)
@@ -45,6 +44,23 @@
 #define BRW_SWIZZLE_XXXX      BRW_SWIZZLE4(0,0,0,0)
 #define BRW_SWIZZLE_XYXY      BRW_SWIZZLE4(0,1,0,1)
 
+#define BRW_WRITEMASK_NONE     0x00
+#define BRW_WRITEMASK_X        0x01
+#define BRW_WRITEMASK_Y        0x02
+#define BRW_WRITEMASK_XY       0x03
+#define BRW_WRITEMASK_Z        0x04
+#define BRW_WRITEMASK_XZ       0x05
+#define BRW_WRITEMASK_YZ       0x06
+#define BRW_WRITEMASK_XYZ      0x07
+#define BRW_WRITEMASK_W        0x08
+#define BRW_WRITEMASK_XW       0x09
+#define BRW_WRITEMASK_YW       0x0A
+#define BRW_WRITEMASK_XYW      0x0B
+#define BRW_WRITEMASK_ZW       0x0C
+#define BRW_WRITEMASK_XZW      0x0D
+#define BRW_WRITEMASK_YZW      0x0E
+#define BRW_WRITEMASK_XYZW     0x0F
+
 
 #define REG_SIZE (8*4)
 
@@ -157,7 +173,7 @@ static INLINE int type_sz( GLuint type )
  * \param width  one of BRW_WIDTH_x
  * \param hstride  one of BRW_HORIZONTAL_STRIDE_x
  * \param swizzle  one of BRW_SWIZZLE_x
- * \param writemask  WRITEMASK_X/Y/Z/W bitfield
+ * \param writemask  BRW_WRITEMASK_X/Y/Z/W bitfield
  */
 static INLINE struct brw_reg brw_reg( GLuint file,
                                       GLuint nr,
@@ -215,7 +231,7 @@ static INLINE struct brw_reg brw_vec16_reg( GLuint file,
 		  BRW_WIDTH_16,
 		  BRW_HORIZONTAL_STRIDE_1,
 		  BRW_SWIZZLE_XYZW,
-		  WRITEMASK_XYZW);
+		  BRW_WRITEMASK_XYZW);
 }
 
 /** Construct float[8] register */
@@ -231,7 +247,7 @@ static INLINE struct brw_reg brw_vec8_reg( GLuint file,
 		  BRW_WIDTH_8,
 		  BRW_HORIZONTAL_STRIDE_1,
 		  BRW_SWIZZLE_XYZW,
-		  WRITEMASK_XYZW);
+		  BRW_WRITEMASK_XYZW);
 }
 
 /** Construct float[4] register */
@@ -247,7 +263,7 @@ static INLINE struct brw_reg brw_vec4_reg( GLuint file,
 		  BRW_WIDTH_4,
 		  BRW_HORIZONTAL_STRIDE_1,
 		  BRW_SWIZZLE_XYZW,
-		  WRITEMASK_XYZW);
+		  BRW_WRITEMASK_XYZW);
 }
 
 /** Construct float[2] register */
@@ -263,7 +279,7 @@ static INLINE struct brw_reg brw_vec2_reg( GLuint file,
 		  BRW_WIDTH_2,
 		  BRW_HORIZONTAL_STRIDE_1,
 		  BRW_SWIZZLE_XYXY,
-		  WRITEMASK_XY);
+		  BRW_WRITEMASK_XY);
 }
 
 /** Construct float[1] register */
@@ -279,7 +295,7 @@ static INLINE struct brw_reg brw_vec1_reg( GLuint file,
 		  BRW_WIDTH_1,
 		  BRW_HORIZONTAL_STRIDE_0,
 		  BRW_SWIZZLE_XXXX,
-		  WRITEMASK_X);
+		  BRW_WRITEMASK_X);
 }
 
 
@@ -510,7 +526,7 @@ static INLINE struct brw_reg brw_ip_reg( void )
 		  BRW_WIDTH_1,
 		  BRW_HORIZONTAL_STRIDE_0,
 		  BRW_SWIZZLE_XYZW, /* NOTE! */
-		  WRITEMASK_XYZW); /* NOTE! */
+		  BRW_WRITEMASK_XYZW); /* NOTE! */
 }
 
 static INLINE struct brw_reg brw_acc_reg( void )
diff --git a/src/gallium/drivers/i965/brw_eu_emit.c b/src/gallium/drivers/i965/brw_eu_emit.c
index 241cdc33f8..f6b8843e01 100644
--- a/src/gallium/drivers/i965/brw_eu_emit.c
+++ b/src/gallium/drivers/i965/brw_eu_emit.c
@@ -1276,7 +1276,7 @@ void brw_SAMPLE(struct brw_compile *p,
     * instruction, so that is a guide for whether a workaround is
     * needed.
     */
-   if (writemask != WRITEMASK_XYZW) {
+   if (writemask != BRW_WRITEMASK_XYZW) {
       GLuint dst_offset = 0;
       GLuint i, newmask = 0, len = 0;
 
@@ -1299,7 +1299,7 @@ void brw_SAMPLE(struct brw_compile *p,
       else {
 	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
 	 
-	 newmask = ~newmask & WRITEMASK_XYZW;
+	 newmask = ~newmask & BRW_WRITEMASK_XYZW;
 
 	 brw_push_insn_state(p);
 
diff --git a/src/gallium/drivers/i965/brw_gs.c b/src/gallium/drivers/i965/brw_gs.c
index 58930e7964..692ce46679 100644
--- a/src/gallium/drivers/i965/brw_gs.c
+++ b/src/gallium/drivers/i965/brw_gs.c
@@ -29,7 +29,7 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
       
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 
 #include "brw_defines.h"
 #include "brw_context.h"
diff --git a/src/gallium/drivers/i965/brw_gs_emit.c b/src/gallium/drivers/i965/brw_gs_emit.c
index 9ec206d7e8..fd8e2acced 100644
--- a/src/gallium/drivers/i965/brw_gs_emit.c
+++ b/src/gallium/drivers/i965/brw_gs_emit.c
@@ -30,7 +30,7 @@
   */
  
 
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 
 #include "brw_defines.h"
 #include "brw_context.h"
diff --git a/src/gallium/drivers/i965/brw_misc_state.c b/src/gallium/drivers/i965/brw_misc_state.c
index d33bf40a01..eb39be8545 100644
--- a/src/gallium/drivers/i965/brw_misc_state.c
+++ b/src/gallium/drivers/i965/brw_misc_state.c
@@ -31,7 +31,7 @@
  
 
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 #include "intel_regions.h"
 
 #include "brw_context.h"
diff --git a/src/gallium/drivers/i965/brw_pipe_flush.c b/src/gallium/drivers/i965/brw_pipe_flush.c
index d5b7bd3b83..e85a1a9c1b 100644
--- a/src/gallium/drivers/i965/brw_pipe_flush.c
+++ b/src/gallium/drivers/i965/brw_pipe_flush.c
@@ -1,6 +1,6 @@
 
 /**
- * called from intel_batchbuffer_flush and children before sending a
+ * called from brw_batchbuffer_flush and children before sending a
  * batchbuffer off.
  */
 static void brw_finish_batch(struct intel_context *intel)
diff --git a/src/gallium/drivers/i965/brw_pipe_query.c b/src/gallium/drivers/i965/brw_pipe_query.c
index 0b9ba0c0ed..55242ac6ad 100644
--- a/src/gallium/drivers/i965/brw_pipe_query.c
+++ b/src/gallium/drivers/i965/brw_pipe_query.c
@@ -42,7 +42,7 @@
 
 #include "brw_context.h"
 #include "brw_state.h"
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 #include "intel_reg.h"
 
 /** Waits on the query object's BO and totals the results for this query */
@@ -122,7 +122,7 @@ brw_end_query(struct pipe_context *pipe, struct pipe_query *q)
     */
    if (query->bo) {
       brw_emit_query_end(brw);
-      intel_batchbuffer_flush(brw->batch);
+      brw_batchbuffer_flush(brw->batch);
 
       brw->sws->bo_unreference(brw->query.bo);
       brw->query.bo = NULL;
diff --git a/src/gallium/drivers/i965/brw_pipe_rast.c b/src/gallium/drivers/i965/brw_pipe_rast.c
new file mode 100644
index 0000000000..ff64dbd48d
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_rast.c
@@ -0,0 +1,46 @@
+
+/* Fill the rasterizer-derived fields of the clip key: clip mode, flat
+ * shading, per-winding fill mode, polygon offset and bfc copy flags.
+ * NOTE(review): `brw` and `key` are not declared here yet - WIP port.
+ */
+static void
+calculate_clip_key_rast()
+{
+   if (BRW_IS_IGDNG(brw))
+       key.clip_mode = BRW_CLIPMODE_KERNEL_CLIP;
+   else
+       key.clip_mode = BRW_CLIPMODE_NORMAL;
+
+   key.do_flat_shading = brw->rast->templ.flatshade;
+
+   if (key.primitive == PIPE_PRIM_TRIANGLES) {
+      /* Must be a comparison: '=' would overwrite cull_mode. */
+      if (brw->rast->templ.cull_mode == PIPE_WINDING_BOTH)
+	 key.clip_mode = BRW_CLIPMODE_REJECT_ALL;
+      else {
+	 /* Cull both windings, then re-enable whatever is not culled. */
+	 key.fill_ccw = CLIP_CULL;
+	 key.fill_cw = CLIP_CULL;
+
+	 if (!(brw->rast->templ.cull_mode & PIPE_WINDING_CCW))
+	    key.fill_ccw = translate_fill(brw->rast->templ.fill_ccw);
+
+	 if (!(brw->rast->templ.cull_mode & PIPE_WINDING_CW))
+	    key.fill_cw = translate_fill(brw->rast->templ.fill_cw);
+
+	 if (key.fill_cw != CLIP_FILL || key.fill_ccw != CLIP_FILL) {
+	    key.do_unfilled = 1;
+	    key.clip_mode = BRW_CLIPMODE_CLIP_NON_REJECTED;
+	 }
+
+	 key.offset_ccw = brw->rast->templ.offset_ccw;
+	 key.offset_cw = brw->rast->templ.offset_cw;
+
+	 if (brw->rast->templ.light_twoside && key.fill_cw != CLIP_CULL)
+	    key.copy_bfc_cw = 1;
+
+	 if (brw->rast->templ.light_twoside && key.fill_ccw != CLIP_CULL)
+	    key.copy_bfc_ccw = 1;
+      }
+   }
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_rast.h b/src/gallium/drivers/i965/brw_pipe_rast.h
new file mode 100644
index 0000000000..6ceaa1fb09
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_rast.h
@@ -0,0 +1,14 @@
+#ifndef BRW_PIPE_RAST_H
+#define BRW_PIPE_RAST_H
+
+#include "brw_clip.h"
+
+struct brw_rasterizer_state {
+   struct pipe_rasterizer_state templ; /* for draw module */
+
+   /* Precalculated hardware state: a clip program key
+    * (struct brw_clip_prog_key, declared in brw_clip.h). */
+   struct brw_clip_prog_key clip_key;
+};
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_pipe_shader.c b/src/gallium/drivers/i965/brw_pipe_shader.c
new file mode 100644
index 0000000000..fbb772d18c
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_shader.c
@@ -0,0 +1,159 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+  
+#include "brw_context.h"
+#include "brw_util.h"
+#include "brw_wm.h"
+
+/* Flag vertex/fragment program state dirty for the given program target. */
+static void brwBindProgram( struct brw_context *brw,
+			    GLenum target, 
+			    struct gl_program *prog )
+{
+   /* 'brw' is already the parameter; other targets are ignored. */
+   switch (target) {
+   case GL_VERTEX_PROGRAM_ARB: 
+      brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM;
+      break;
+   case GL_FRAGMENT_PROGRAM_ARB:
+      brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM;
+      break;
+   }
+}
+
+/* Allocate a driver program object for 'target' and give it a fresh id. */
+static struct gl_program *brwNewProgram( struct brw_context *brw,
+				      GLenum target, 
+				      GLuint id )
+{
+   /* NOTE(review): 'ctx' is not declared here; Mesa calls still need porting. */
+   switch (target) {
+   case GL_VERTEX_PROGRAM_ARB: {
+      struct brw_vertex_program *prog = CALLOC_STRUCT(brw_vertex_program);
+      if (prog) {
+	 prog->id = brw->program_id++;
+
+	 return _mesa_init_vertex_program( ctx, &prog->program,
+					     target, id );
+      }
+      else
+	 return NULL;
+   }
+
+   case GL_FRAGMENT_PROGRAM_ARB: {
+      struct brw_fragment_program *prog = CALLOC_STRUCT(brw_fragment_program);
+      if (prog) {
+	 prog->id = brw->program_id++;
+
+	 return _mesa_init_fragment_program( ctx, &prog->program,
+					     target, id );
+      }
+      else
+	 return NULL;
+   }
+
+   default:
+      return _mesa_new_program(ctx, target, id);
+   }
+}
+
+static void brwDeleteProgram( struct brw_context *brw,
+			      struct gl_program *prog )
+{
+   if (prog->Target == GL_FRAGMENT_PROGRAM_ARB) {   /* fragment programs also drop their const buffer */
+      struct gl_fragment_program *fprog = (struct gl_fragment_program *) prog;
+      struct brw_fragment_program *brw_fprog = brw_fragment_program(fprog);
+      brw->sws->bo_unreference(brw_fprog->const_buffer);
+   }
+   /* Then hand the program to Mesa.  NOTE(review): 'ctx' is undeclared here. */
+   _mesa_delete_program( ctx, prog );
+}
+
+
+static GLboolean brwIsProgramNative( struct brw_context *brw,
+				     GLenum target, 
+				     struct gl_program *prog )
+{
+   return GL_TRUE;   /* unconditionally report the program as native */
+}
+
+/* React to new program text: flag state dirty if bound, assign a new id. */
+static void brwProgramStringNotify( struct brw_context *brw,
+				    GLenum target,
+				    struct gl_program *prog )
+{
+   /* NOTE(review): 'ctx' is not declared here; Mesa calls still need porting. */
+   if (target == GL_FRAGMENT_PROGRAM_ARB) {
+      struct gl_fragment_program *fprog = (struct gl_fragment_program *) prog;
+      struct brw_fragment_program *newFP = brw_fragment_program(fprog);
+      const struct brw_fragment_program *curFP =
+         brw_fragment_program_const(brw->fragment_program);
+
+      if (fprog->FogOption) {
+         _mesa_append_fog_code(ctx, fprog);
+         fprog->FogOption = GL_NONE;
+      }
+
+      if (newFP == curFP)
+	 brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM;
+      newFP->id = brw->program_id++;      
+      newFP->isGLSL = brw_wm_is_glsl(fprog);
+   }
+   else if (target == GL_VERTEX_PROGRAM_ARB) {
+      struct gl_vertex_program *vprog = (struct gl_vertex_program *) prog;
+      struct brw_vertex_program *newVP = brw_vertex_program(vprog);
+      const struct brw_vertex_program *curVP =
+         brw_vertex_program_const(brw->vertex_program);
+
+      if (newVP == curVP)
+	 brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM;
+      if (newVP->program.IsPositionInvariant) {
+	 _mesa_insert_mvp_code(ctx, &newVP->program);
+      }
+      newVP->id = brw->program_id++;      
+
+      /* Also tell tnl about it:
+       */
+      _tnl_program_string(ctx, target, prog);
+   }
+}
+
+void brwInitFragProgFuncs( struct dd_function_table *functions )
+{
+   assert(functions->ProgramStringNotify == _tnl_program_string); 
+   /* Install this file's program hooks into the function table. */
+   functions->BindProgram = brwBindProgram;
+   functions->NewProgram = brwNewProgram;
+   functions->DeleteProgram = brwDeleteProgram;
+   functions->IsProgramNative = brwIsProgramNative;
+   functions->ProgramStringNotify = brwProgramStringNotify;
+}
+
diff --git a/src/gallium/drivers/i965/brw_reg.h b/src/gallium/drivers/i965/brw_reg.h
new file mode 100644
index 0000000000..a640104d71
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_reg.h
@@ -0,0 +1,79 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef BRW_REG_H
+#define BRW_REG_H
+
+#define CMD_MI				(0x0 << 29)
+#define CMD_2D				(0x2 << 29)
+#define CMD_3D				(0x3 << 29)
+
+#define MI_NOOP				(CMD_MI | 0)
+#define MI_BATCH_BUFFER_END		(CMD_MI | 0xA << 23)
+#define MI_FLUSH			(CMD_MI | (4 << 23))
+
+#define _3DSTATE_DRAWRECT_INFO_I965	(CMD_3D | (3 << 27) | (1 << 24) | 0x2)
+
+/** @{
+ *
+ * PIPE_CONTROL operation, a combination MI_FLUSH and register write with
+ * additional flushing control.
+ */
+#define _3DSTATE_PIPE_CONTROL		(CMD_3D | (3 << 27) | (2 << 24) | 2)
+#define PIPE_CONTROL_NO_WRITE		(0 << 14)
+#define PIPE_CONTROL_WRITE_IMMEDIATE	(1 << 14)
+#define PIPE_CONTROL_WRITE_DEPTH_COUNT	(2 << 14)
+#define PIPE_CONTROL_WRITE_TIMESTAMP	(3 << 14)
+#define PIPE_CONTROL_DEPTH_STALL	(1 << 13)
+#define PIPE_CONTROL_WRITE_FLUSH	(1 << 12)
+#define PIPE_CONTROL_INSTRUCTION_FLUSH	(1 << 11)
+#define PIPE_CONTROL_INTERRUPT_ENABLE	(1 << 8)
+#define PIPE_CONTROL_PPGTT_WRITE	(0 << 2)
+#define PIPE_CONTROL_GLOBAL_GTT_WRITE	(1 << 2)
+
+/** @} */
+
+#define XY_SETUP_BLT_CMD		(CMD_2D | (0x01 << 22) | 6)
+#define XY_COLOR_BLT_CMD		(CMD_2D | (0x50 << 22) | 4)
+#define XY_SRC_COPY_BLT_CMD             (CMD_2D | (0x53 << 22) | 6)
+
+/* BR00 */
+#define XY_BLT_WRITE_ALPHA	(1 << 21)
+#define XY_BLT_WRITE_RGB	(1 << 20)
+#define XY_SRC_TILED		(1 << 15)
+#define XY_DST_TILED		(1 << 11)
+
+/* BR13 */
+#define BR13_565		(0x1 << 24)
+#define BR13_8888		(0x3 << 24)
+
+#define FENCE_LINEAR 0
+#define FENCE_XMAJOR 1
+#define FENCE_YMAJOR 2
+
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_screen.h b/src/gallium/drivers/i965/brw_screen.h
new file mode 100644
index 0000000000..716b55c52b
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_screen.h
@@ -0,0 +1,78 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef BRW_SCREEN_H
+#define BRW_SCREEN_H
+
+#include "pipe/p_state.h"
+#include "pipe/p_screen.h"
+
+
+struct brw_winsys_screen;
+
+
+/**
+ * Subclass of pipe_screen
+ */
+struct brw_screen
+{
+   struct pipe_screen base;   /* first member: brw_screen() casts pipe_screen* to brw_screen* */
+
+   struct brw_winsys_screen *sws;
+
+   boolean is_i945;           /* NOTE(review): looks like an i915 leftover - confirm it is needed */
+   uint pci_id;               /* PCI device id; tested by BRW_IS_G4X()/BRW_IS_IGDNG() */
+};
+
+/**
+ * Subclass of pipe_transfer
+ */
+struct brw_transfer
+{
+   struct pipe_transfer base;   /* first member: brw_transfer() casts pipe_transfer* to brw_transfer* */
+
+   unsigned offset;             /* NOTE(review): presumably a byte offset into the resource - confirm */
+};
+
+
+/*
+ * Cast wrappers
+ */
+/* Downcast: struct brw_screen has its pipe_screen 'base' as first member. */
+static INLINE struct brw_screen *brw_screen(struct pipe_screen *pscreen)
+{
+   return (struct brw_screen *)pscreen;
+}
+
+/* Downcast: struct brw_transfer has its pipe_transfer 'base' as first member. */
+static INLINE struct brw_transfer *brw_transfer(struct pipe_transfer *transfer)
+{
+   return (struct brw_transfer *) transfer;
+}
+
+
+#endif /* BRW_SCREEN_H */
diff --git a/src/gallium/drivers/i965/brw_screen_surface.c b/src/gallium/drivers/i965/brw_screen_surface.c
index d199d0b81a..544be6a089 100644
--- a/src/gallium/drivers/i965/brw_screen_surface.c
+++ b/src/gallium/drivers/i965/brw_screen_surface.c
@@ -1,6 +1,6 @@
    /* _NEW_BUFFERS */
-   if (IS_965(intel->intelScreen->deviceID) &&
-       !IS_G4X(intel->intelScreen->deviceID)) {
+   if (IS_965(brw->brw_screen->pci_id) &&
+       !IS_G4X(brw->brw_screen->pci_id)) {
       for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
 	 struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
 	 struct intel_renderbuffer *irb = intel_renderbuffer(rb);
diff --git a/src/gallium/drivers/i965/brw_sf.c b/src/gallium/drivers/i965/brw_sf.c
index 0115f77c08..54202cbd12 100644
--- a/src/gallium/drivers/i965/brw_sf.c
+++ b/src/gallium/drivers/i965/brw_sf.c
@@ -30,7 +30,7 @@
   */
   
 
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 
 #include "brw_defines.h"
 #include "brw_context.h"
diff --git a/src/gallium/drivers/i965/brw_sf.h b/src/gallium/drivers/i965/brw_sf.h
index 26c2e8891a..c99116b8b1 100644
--- a/src/gallium/drivers/i965/brw_sf.h
+++ b/src/gallium/drivers/i965/brw_sf.h
@@ -34,7 +34,6 @@
 #define BRW_SF_H
 
 
-#include "shader/program.h"
 #include "brw_context.h"
 #include "brw_eu.h"
 
diff --git a/src/gallium/drivers/i965/brw_sf_emit.c b/src/gallium/drivers/i965/brw_sf_emit.c
index c98d7ec13a..4acb2b7d72 100644
--- a/src/gallium/drivers/i965/brw_sf_emit.c
+++ b/src/gallium/drivers/i965/brw_sf_emit.c
@@ -30,7 +30,7 @@
   */
    
 
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 
 #include "brw_defines.h"
 #include "brw_context.h"
diff --git a/src/gallium/drivers/i965/brw_state.h b/src/gallium/drivers/i965/brw_state.h
index b716097bfc..02657eaba7 100644
--- a/src/gallium/drivers/i965/brw_state.h
+++ b/src/gallium/drivers/i965/brw_state.h
@@ -157,7 +157,7 @@ void brw_state_cache_bo_delete(struct brw_cache *cache, struct brw_winsys_buffer
 /***********************************************************************
  * brw_state_batch.c
  */
-#define BRW_BATCH_STRUCT(brw, s) intel_batchbuffer_data( brw->intel.batch, (s), sizeof(*(s)), IGNORE_CLIPRECTS)
+#define BRW_BATCH_STRUCT(brw, s) brw_batchbuffer_data( brw->intel.batch, (s), sizeof(*(s)), IGNORE_CLIPRECTS)
 #define BRW_CACHED_BATCH_STRUCT(brw, s) brw_cached_batch_struct( brw, (s), sizeof(*(s)) )
 
 GLboolean brw_cached_batch_struct( struct brw_context *brw,
diff --git a/src/gallium/drivers/i965/brw_state_batch.c b/src/gallium/drivers/i965/brw_state_batch.c
index 9568794625..b285837070 100644
--- a/src/gallium/drivers/i965/brw_state_batch.c
+++ b/src/gallium/drivers/i965/brw_state_batch.c
@@ -32,7 +32,7 @@
 
 
 #include "brw_state.h"
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 
 
@@ -47,7 +47,7 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
    struct header *newheader = (struct header *)data;
 
    if (brw->emit_state_always) {
-      intel_batchbuffer_data(brw->intel.batch, data, sz, IGNORE_CLIPRECTS);
+      brw_batchbuffer_data(brw->intel.batch, data, sz, IGNORE_CLIPRECTS);
       return GL_TRUE;
    }
 
@@ -74,7 +74,7 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
 
  emit:
    memcpy(item->header, newheader, sz);
-   intel_batchbuffer_data(brw->intel.batch, data, sz, IGNORE_CLIPRECTS);
+   brw_batchbuffer_data(brw->intel.batch, data, sz, IGNORE_CLIPRECTS);
    return GL_TRUE;
 }
 
diff --git a/src/gallium/drivers/i965/brw_state_cache.c b/src/gallium/drivers/i965/brw_state_cache.c
index 91d0f80297..1b5f27cc16 100644
--- a/src/gallium/drivers/i965/brw_state_cache.c
+++ b/src/gallium/drivers/i965/brw_state_cache.c
@@ -57,7 +57,7 @@
  */
 
 #include "brw_state.h"
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 
 /* XXX: Fixme - have to include these to get the sizes of the prog_key
  * structs:
diff --git a/src/gallium/drivers/i965/brw_state_upload.c b/src/gallium/drivers/i965/brw_state_upload.c
index b68b6cb21a..842380e38f 100644
--- a/src/gallium/drivers/i965/brw_state_upload.c
+++ b/src/gallium/drivers/i965/brw_state_upload.c
@@ -33,7 +33,7 @@
 
 #include "brw_context.h"
 #include "brw_state.h"
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 
 /* This is used to initialize brw->state.atoms[].  We could use this
  * list directly except for a single atom, brw_constant_buffer, which
diff --git a/src/gallium/drivers/i965/brw_tex_layout.c b/src/gallium/drivers/i965/brw_tex_layout.c
index 75cdc18912..813cd31f49 100644
--- a/src/gallium/drivers/i965/brw_tex_layout.c
+++ b/src/gallium/drivers/i965/brw_tex_layout.c
@@ -47,7 +47,7 @@ GLboolean brw_miptree_layout(struct brw_context *brw,
 
    switch (mt->target) {
    case GL_TEXTURE_CUBE_MAP:
-      if (IS_IGDNG(intel->intelScreen->deviceID)) {
+      if (IS_IGDNG(brw->brw_screen->pci_id)) {
           GLuint align_h = 2, align_w = 4;
           GLuint level;
           GLuint x = 0;
diff --git a/src/gallium/drivers/i965/brw_urb.c b/src/gallium/drivers/i965/brw_urb.c
index 8c6f4355a6..18d79c5ebb 100644
--- a/src/gallium/drivers/i965/brw_urb.c
+++ b/src/gallium/drivers/i965/brw_urb.c
@@ -31,7 +31,7 @@
         
 
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
diff --git a/src/gallium/drivers/i965/brw_util.h b/src/gallium/drivers/i965/brw_util.h
index 37c3acbc11..b5f9a36e7b 100644
--- a/src/gallium/drivers/i965/brw_util.h
+++ b/src/gallium/drivers/i965/brw_util.h
@@ -36,9 +36,8 @@
 #include "brw_types.h"
 
 extern GLuint brw_count_bits( GLuint val );
-extern GLuint brw_parameter_list_state_flags(struct gl_program_parameter_list *paramList);
-extern GLuint brw_translate_blend_factor( GLenum factor );
-extern GLuint brw_translate_blend_equation( GLenum mode );
+extern GLuint brw_translate_blend_factor( unsigned factor );
+extern GLuint brw_translate_blend_equation( unsigned mode );
 
 
diff --git a/src/gallium/drivers/i965/brw_vs.c b/src/gallium/drivers/i965/brw_vs.c
index 97e523c3ee..dcd687ac34 100644
--- a/src/gallium/drivers/i965/brw_vs.c
+++ b/src/gallium/drivers/i965/brw_vs.c
@@ -34,7 +34,6 @@
 #include "brw_vs.h"
 #include "brw_util.h"
 #include "brw_state.h"
-#include "shader/prog_print.h"
 
 
@@ -113,7 +112,7 @@ static void brw_upload_vs_prog(struct brw_context *brw)
  */
 const struct brw_tracked_state brw_vs_prog = {
    .dirty = {
-      .mesa  = PIPE_NEW_UCP | PIPE_NEW_RAST,
+      .mesa  = PIPE_NEW_CLIP | PIPE_NEW_RAST,
       .brw   = BRW_NEW_VERTEX_PROGRAM,
       .cache = 0
    },
diff --git a/src/gallium/drivers/i965/brw_vs.h b/src/gallium/drivers/i965/brw_vs.h
index 4a591365c9..54f7d7d7c4 100644
--- a/src/gallium/drivers/i965/brw_vs.h
+++ b/src/gallium/drivers/i965/brw_vs.h
@@ -36,7 +36,6 @@
 
 #include "brw_context.h"
 #include "brw_eu.h"
-#include "shader/program.h"
 
 
 struct brw_vs_prog_key {
diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c
index 6adb743017..e946944295 100644
--- a/src/gallium/drivers/i965/brw_vs_emit.c
+++ b/src/gallium/drivers/i965/brw_vs_emit.c
@@ -192,7 +192,7 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 					     BRW_WIDTH_8,
 					     BRW_HORIZONTAL_STRIDE_1,
 					     BRW_SWIZZLE_XXXX,
-					     WRITEMASK_X);
+					     BRW_WRITEMASK_X);
       reg++;
    }
 
@@ -487,7 +487,7 @@ static void emit_exp_noalias( struct brw_vs_compile *c,
    struct brw_compile *p = &c->func;
    
 
-   if (dst.dw1.bits.writemask & WRITEMASK_X) {
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_X) {
       struct brw_reg tmp = get_tmp(c);
       struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
 
@@ -499,23 +499,23 @@ static void emit_exp_noalias( struct brw_vs_compile *c,
       /* Adjust exponent for floating point: 
        * exp += 127 
        */
-      brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
+      brw_ADD(p, brw_writemask(tmp_d, BRW_WRITEMASK_X), tmp_d, brw_imm_d(127));
 
       /* Install exponent and sign.  
        * Excess drops off the edge: 
        */
-      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X), 
+      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), BRW_WRITEMASK_X), 
 	      tmp_d, brw_imm_d(23));
 
       release_tmp(c, tmp);
    }
 
-   if (dst.dw1.bits.writemask & WRITEMASK_Y) {
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Y) {
       /* result[1] = arg0.x - floor(arg0.x) */
-      brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
+      brw_FRC(p, brw_writemask(dst, BRW_WRITEMASK_Y), brw_swizzle1(arg0, 0));
    }
    
-   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z) {
       /* As with the LOG instruction, we might be better off just
        * doing a taylor expansion here, seeing as we have to do all
        * the prep work.
@@ -525,14 +525,14 @@ static void emit_exp_noalias( struct brw_vs_compile *c,
        */
       emit_math1(c, 
 		 BRW_MATH_FUNCTION_EXP, 
-		 brw_writemask(dst, WRITEMASK_Z),
+		 brw_writemask(dst, BRW_WRITEMASK_Z),
 		 brw_swizzle1(arg0, 0), 
 		 BRW_MATH_PRECISION_FULL);
    }  
 
-   if (dst.dw1.bits.writemask & WRITEMASK_W) {
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_W) {
       /* result[3] = 1.0; */
-      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
+      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_W), brw_imm_f(1));
    }
 }
 
@@ -562,36 +562,36 @@ static void emit_log_noalias( struct brw_vs_compile *c,
     * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
     * result[1].i = (x.i & ((1<<23)-1)        + (127<<23)
     */
-   if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_XZ) {
       brw_AND(p, 
-	      brw_writemask(tmp_ud, WRITEMASK_X),
+	      brw_writemask(tmp_ud, BRW_WRITEMASK_X),
 	      brw_swizzle1(arg0_ud, 0),
 	      brw_imm_ud((1U<<31)-1));
 
       brw_SHR(p, 
-	      brw_writemask(tmp_ud, WRITEMASK_X), 
+	      brw_writemask(tmp_ud, BRW_WRITEMASK_X), 
 	      tmp_ud,
 	      brw_imm_ud(23));
 
       brw_ADD(p, 
-	      brw_writemask(tmp, WRITEMASK_X), 
+	      brw_writemask(tmp, BRW_WRITEMASK_X), 
 	      retype(tmp_ud, BRW_REGISTER_TYPE_D),	/* does it matter? */
 	      brw_imm_d(-127));
    }
 
-   if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_YZ) {
       brw_AND(p, 
-	      brw_writemask(tmp_ud, WRITEMASK_Y),
+	      brw_writemask(tmp_ud, BRW_WRITEMASK_Y),
 	      brw_swizzle1(arg0_ud, 0),
 	      brw_imm_ud((1<<23)-1));
 
       brw_OR(p, 
-	     brw_writemask(tmp_ud, WRITEMASK_Y), 
+	     brw_writemask(tmp_ud, BRW_WRITEMASK_Y), 
 	     tmp_ud,
 	     brw_imm_ud(127<<23));
    }
    
-   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z) {
       /* result[2] = result[0] + LOG2(result[1]); */
 
       /* Why bother?  The above is just a hint how to do this with a
@@ -606,19 +606,19 @@ static void emit_log_noalias( struct brw_vs_compile *c,
        */
       emit_math1(c, 
 		 BRW_MATH_FUNCTION_LOG, 
-		 brw_writemask(tmp, WRITEMASK_Z), 
+		 brw_writemask(tmp, BRW_WRITEMASK_Z), 
 		 brw_swizzle1(tmp, 1), 
 		 BRW_MATH_PRECISION_FULL);
       
       brw_ADD(p, 
-	      brw_writemask(tmp, WRITEMASK_Z), 
+	      brw_writemask(tmp, BRW_WRITEMASK_Z), 
 	      brw_swizzle1(tmp, 2), 
 	      brw_swizzle1(tmp, 0));
    }  
 
-   if (dst.dw1.bits.writemask & WRITEMASK_W) {
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_W) {
       /* result[3] = 1.0; */
-      brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
+      brw_MOV(p, brw_writemask(tmp, BRW_WRITEMASK_W), brw_imm_f(1));
    }
 
    if (need_tmp) {
@@ -639,14 +639,14 @@ static void emit_dst_noalias( struct brw_vs_compile *c,
 
    /* There must be a better way to do this: 
     */
-   if (dst.dw1.bits.writemask & WRITEMASK_X)
-      brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
-   if (dst.dw1.bits.writemask & WRITEMASK_Y)
-      brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
-   if (dst.dw1.bits.writemask & WRITEMASK_Z)
-      brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
-   if (dst.dw1.bits.writemask & WRITEMASK_W)
-      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_X)
+      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_X), brw_imm_f(1.0));
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Y)
+      brw_MUL(p, brw_writemask(dst, BRW_WRITEMASK_Y), arg0, arg1);
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z)
+      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_Z), arg0);
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_W)
+      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_W), arg1);
 }
 
 
@@ -672,8 +672,8 @@ static void emit_lit_noalias( struct brw_vs_compile *c,
    if (need_tmp) 
       tmp = get_tmp(c);
    
-   brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0)); 
-   brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1)); 
+   brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_YZ), brw_imm_f(0)); 
+   brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_XW), brw_imm_f(1)); 
 
    /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
     * to get all channels active inside the IF.  In the clipping code
@@ -683,15 +683,15 @@ static void emit_lit_noalias( struct brw_vs_compile *c,
    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
    if_insn = brw_IF(p, BRW_EXECUTE_8);
    {
-      brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
+      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_Y), brw_swizzle1(arg0,0));
 
       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
-      brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
+      brw_MOV(p, brw_writemask(tmp, BRW_WRITEMASK_Z),  brw_swizzle1(arg0,1));
       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 
       emit_math2(c, 
 		 BRW_MATH_FUNCTION_POW, 
-		 brw_writemask(dst, WRITEMASK_Z),
+		 brw_writemask(dst, BRW_WRITEMASK_Z),
 		 brw_swizzle1(tmp, 2),
 		 brw_swizzle1(arg0, 3),
 		 BRW_MATH_PRECISION_PARTIAL);      
@@ -1045,7 +1045,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
    /* ndc = 1.0 / pos.w */
    emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
    /* ndc.xyz = pos * ndc */
-   brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
+   brw_MUL(p, brw_writemask(ndc, BRW_WRITEMASK_XYZ), pos, ndc);
 
    /* Update the header for point size, user clipping flags, and -ve rhw
     * workaround.
@@ -1062,14 +1062,14 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 
       if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) {
 	 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
-	 brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
-	 brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
+	 brw_MUL(p, brw_writemask(header1, BRW_WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
+	 brw_AND(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
       }
 
       for (i = 0; i < c->key.nr_userclip; i++) {
 	 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
 	 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
-	 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
+	 brw_OR(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(1<<i));
 	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
       }
 
@@ -1089,7 +1089,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 		 brw_swizzle1(ndc, 3),
 		 brw_imm_f(0));
    
-	 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
+	 brw_OR(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(1<<6));
 	 brw_MOV(p, ndc, brw_imm_f(0));
 	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
       }
@@ -1139,7 +1139,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 		 eot, 		/* writes complete */
 		 0, 		/* urb destination offset */
 		 BRW_URB_SWIZZLE_INTERLEAVE);
-
+
    if (c->first_overflow_output > 0) {
       /* Not all of the vertex outputs/results fit into the MRF.
        * Move the overflowed attributes from the GRF to the MRF and
diff --git a/src/gallium/drivers/i965/brw_winsys.h b/src/gallium/drivers/i965/brw_winsys.h
new file mode 100644
index 0000000000..2142db5a4d
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_winsys.h
@@ -0,0 +1,243 @@
+/**************************************************************************
+ *
+ * Copyright © 2009 Jakob Bornecrantz
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef BRW_WINSYS_H
+#define BRW_WINSYS_H
+
+#include "pipe/p_compiler.h"
+
+struct brw_winsys;
+struct pipe_fence_handle;
+
+/* This currently just wraps dri_bo:
+ */
+struct brw_winsys_buffer {
+   struct brw_winsys_screen *sws;
+   void *bo;
+   unsigned offset;
+};
+
+enum brw_buffer_usage {
+   I915_GEM_DOMAIN_RENDER,
+   I915_GEM_DOMAIN_SAMPLER,
+   I915_GEM_DOMAIN_VERTEX,
+   I915_GEM_DOMAIN_INSTRUCTION,
+
+
+   /* XXX: migrate from domains to explicit usage cases, eg below:
+    */
+
+   /* use on textures */
+   BRW_USAGE_RENDER    = 0x01,
+   BRW_USAGE_SAMPLER   = 0x02,
+   BRW_USAGE_2D_TARGET = 0x04,
+   BRW_USAGE_2D_SOURCE = 0x08,
+   /* use on vertex */
+   BRW_USAGE_VERTEX    = 0x10,
+};
+
+enum brw_buffer_type
+{
+   BRW_BUFFER_TYPE_TEXTURE,
+   BRW_BUFFER_TYPE_SCANOUT, /**< a texture used for scanning out from */
+   BRW_BUFFER_TYPE_VERTEX,
+};
+
+
+/* AKA winsys context:
+ */
+struct brw_batchbuffer {
+
+   struct brw_winsys *iws;
+   struct brw_winsys_buffer *buf;
+
+   /**
+    * Values exported to speed up writing the batchbuffer,
+    * instead of having to go through an accessor function for
+    * each dword written.
+    */
+   /*@{*/
+   uint8_t *map;
+   uint8_t *ptr;
+   size_t size;
+
+   size_t relocs;
+   size_t max_relocs;
+   /*@}*/
+};
+
+struct brw_winsys_screen {
+
+   /**
+    * Batchbuffer functions.
+    */
+   /*@{*/
+   /**
+    * Create a new batchbuffer.
+    */
+   struct brw_batchbuffer *(*batchbuffer_create)(struct brw_winsys_screen *iws);
+
+   /**
+    * Emit a relocation to a buffer.
+    * Target position in batchbuffer is the same as ptr.
+    */
+   int (*batchbuffer_reloc)(struct brw_batchbuffer *batch,
+			    unsigned offset,
+                            struct brw_winsys_buffer *reloc,
+			    unsigned pre_add,
+                            enum brw_buffer_usage usage);
+
+   /**
+    * Flush a batchbuffer.
+    */
+   void (*batchbuffer_flush)(struct brw_batchbuffer *batch,
+                             struct pipe_fence_handle **fence);
+
+   /**
+    * Destroy a batchbuffer.
+    */
+   void (*batchbuffer_destroy)(struct brw_batchbuffer *batch);
+   /*@}*/
+
+
+   /**
+    * Buffer functions.
+    */
+   /*@{*/
+   /**
+    * Create a buffer.
+    */
+   struct brw_winsys_buffer *(*buffer_create)(struct brw_winsys *iws,
+					      unsigned size, 
+					      unsigned alignment,
+					      enum brw_buffer_type type);
+
+
+   /* Reference and unreference buffers:
+    */
+   void (*bo_reference)( struct brw_winsys_buffer *buffer );
+   void (*bo_unreference)( struct brw_winsys_buffer *buffer );
+   void (*bo_emit_reloc)( struct brw_winsys_buffer *buffer,
+			  unsigned domain,
+			  unsigned a,
+			  unsigned b,
+			  unsigned offset,
+			  struct brw_winsys_buffer *b2);
+
+   /**
+    * Map a buffer.
+    */
+   void *(*buffer_map)(struct brw_winsys *iws,
+                       struct brw_winsys_buffer *buffer,
+                       boolean write);
+
+   /**
+    * Unmap a buffer.
+    */
+   void (*buffer_unmap)(struct brw_winsys *iws,
+                        struct brw_winsys_buffer *buffer);
+
+   /**
+    * Write to a buffer.
+    *
+    * Arguments follow pipe_buffer_write.
+    */
+   int (*buffer_write)(struct brw_winsys *iws,
+                       struct brw_winsys_buffer *dst,
+                       size_t offset,
+                       size_t size,
+                       const void *data);
+
+   void (*buffer_destroy)(struct brw_winsys *iws,
+                          struct brw_winsys_buffer *buffer);
+   /*@}*/
+
+
+   /**
+    * Fence functions.
+    */
+   /*@{*/
+   /**
+    * Reference fence and set ptr to fence.
+    */
+   void (*fence_reference)(struct brw_winsys *iws,
+                           struct pipe_fence_handle **ptr,
+                           struct pipe_fence_handle *fence);
+
+   /**
+    * Check if a fence has finished.
+    */
+   int (*fence_signalled)(struct brw_winsys *iws,
+                          struct pipe_fence_handle *fence);
+
+   /**
+    * Wait on a fence to finish.
+    */
+   int (*fence_finish)(struct brw_winsys *iws,
+                       struct pipe_fence_handle *fence);
+   /*@}*/
+
+
+   /**
+    * Destroy the winsys.
+    */
+   void (*destroy)(struct brw_winsys *iws);
+};
+
+
+/**
+ * Create i915 pipe_screen.
+ */
+struct pipe_screen *i915_create_screen(struct brw_winsys *iws, unsigned pci_id);
+
+/**
+ * Create a i915 pipe_context.
+ */
+struct pipe_context *i915_create_context(struct pipe_screen *screen);
+
+/**
+ * Get the brw_winsys buffer backing the texture.
+ *
+ * TODO UGLY
+ */
+struct pipe_texture;
+boolean i915_get_texture_buffer_brw(struct pipe_texture *texture,
+				    struct brw_winsys_buffer **buffer,
+				    unsigned *stride);
+
+/**
+ * Wrap a brw_winsys buffer with a texture blanket.
+ *
+ * TODO UGLY
+ */
+struct pipe_texture * i915_texture_blanket_brw(struct pipe_screen *screen,
+                                                 struct pipe_texture *tmplt,
+                                                 unsigned pitch,
+                                                 struct brw_winsys_buffer *buffer);
+
+
+
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_wm.h b/src/gallium/drivers/i965/brw_wm.h
index 756a680150..18775830f9 100644
--- a/src/gallium/drivers/i965/brw_wm.h
+++ b/src/gallium/drivers/i965/brw_wm.h
@@ -34,7 +34,6 @@
 #define BRW_WM_H
 
 
-#include "shader/prog_instruction.h"
 #include "brw_context.h"
 #include "brw_eu.h"
 
diff --git a/src/gallium/drivers/i965/brw_wm_debug.c b/src/gallium/drivers/i965/brw_wm_debug.c
index 220821087c..c6659646f2 100644
--- a/src/gallium/drivers/i965/brw_wm_debug.c
+++ b/src/gallium/drivers/i965/brw_wm_debug.c
@@ -98,7 +98,7 @@ void brw_wm_print_insn( struct brw_wm_compile *c,
    }
    _mesa_printf("]");
 
-   if (inst->writemask != WRITEMASK_XYZW)
+   if (inst->writemask != BRW_WRITEMASK_XYZW)
       _mesa_printf(".%s%s%s%s", 
 		   GET_BIT(inst->writemask, 0) ? "x" : "",
 		   GET_BIT(inst->writemask, 1) ? "y" : "",
diff --git a/src/gallium/drivers/i965/brw_wm_emit.c b/src/gallium/drivers/i965/brw_wm_emit.c
index fec33f74eb..7df9b79d7a 100644
--- a/src/gallium/drivers/i965/brw_wm_emit.c
+++ b/src/gallium/drivers/i965/brw_wm_emit.c
@@ -72,14 +72,14 @@ static void emit_pixel_xy(struct brw_compile *p,
    /* Calculate pixel centers by adding 1 or 0 to each of the
     * micro-tile coordinates passed in r1.
     */
-   if (mask & WRITEMASK_X) {
+   if (mask & BRW_WRITEMASK_X) {
       brw_ADD(p,
 	      vec16(retype(dst[0], BRW_REGISTER_TYPE_UW)),
 	      stride(suboffset(r1_uw, 4), 2, 4, 0),
 	      brw_imm_v(0x10101010));
    }
 
-   if (mask & WRITEMASK_Y) {
+   if (mask & BRW_WRITEMASK_Y) {
       brw_ADD(p,
 	      vec16(retype(dst[1], BRW_REGISTER_TYPE_UW)),
 	      stride(suboffset(r1_uw,5), 2, 4, 0),
@@ -101,14 +101,14 @@ static void emit_delta_xy(struct brw_compile *p,
    /* Calc delta X,Y by subtracting origin in r1 from the pixel
     * centers.
     */
-   if (mask & WRITEMASK_X) {
+   if (mask & BRW_WRITEMASK_X) {
       brw_ADD(p,
 	      dst[0],
 	      retype(arg0[0], BRW_REGISTER_TYPE_UW),
 	      negate(r1));
    }
 
-   if (mask & WRITEMASK_Y) {
+   if (mask & BRW_WRITEMASK_Y) {
       brw_ADD(p,
 	      dst[1],
 	      retype(arg0[1], BRW_REGISTER_TYPE_UW),
@@ -124,7 +124,7 @@ static void emit_wpos_xy(struct brw_wm_compile *c,
 {
    struct brw_compile *p = &c->func;
 
-   if (mask & WRITEMASK_X) {
+   if (mask & BRW_WRITEMASK_X) {
       /* X' = X */
       brw_MOV(p,
 	      dst[0],
@@ -133,7 +133,7 @@ static void emit_wpos_xy(struct brw_wm_compile *c,
 
    /* XXX: is this needed any more, or is this a NOOP?
     */
-   if (mask & WRITEMASK_Y) {
+   if (mask & BRW_WRITEMASK_Y) {
       /* Y' = height - 1 - Y */
       brw_ADD(p,
 	      dst[1],
@@ -152,7 +152,7 @@ static void emit_pixel_w( struct brw_compile *p,
    /* Don't need this if all you are doing is interpolating color, for
     * instance.
     */
-   if (mask & WRITEMASK_W) {      
+   if (mask & BRW_WRITEMASK_W) {      
       struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
 
       /* Calc 1/w - just linterp wpos[3] optimized by putting the
@@ -255,7 +255,7 @@ static void emit_frontfacing( struct brw_compile *p,
    struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
    GLuint i;
 
-   if (!(mask & WRITEMASK_XYZW))
+   if (!(mask & BRW_WRITEMASK_XYZW))
       return;
 
    for (i = 0; i < 4; i++) {
@@ -321,26 +321,26 @@ void emit_ddxy(struct brw_compile *p,
 			   BRW_VERTICAL_STRIDE_2,
 			   BRW_WIDTH_2,
 			   BRW_HORIZONTAL_STRIDE_0,
-			   BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+			   BRW_SWIZZLE_XYZW, BRW_WRITEMASK_XYZW);
 	    src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 			   BRW_REGISTER_TYPE_F,
 			   BRW_VERTICAL_STRIDE_2,
 			   BRW_WIDTH_2,
 			   BRW_HORIZONTAL_STRIDE_0,
-			   BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+			   BRW_SWIZZLE_XYZW, BRW_WRITEMASK_XYZW);
 	 } else {
 	    src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 			   BRW_REGISTER_TYPE_F,
 			   BRW_VERTICAL_STRIDE_4,
 			   BRW_WIDTH_4,
 			   BRW_HORIZONTAL_STRIDE_0,
-			   BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+			   BRW_SWIZZLE_XYZW, BRW_WRITEMASK_XYZW);
 	    src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
 			   BRW_REGISTER_TYPE_F,
 			   BRW_VERTICAL_STRIDE_4,
 			   BRW_WIDTH_4,
 			   BRW_HORIZONTAL_STRIDE_0,
-			   BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+			   BRW_SWIZZLE_XYZW, BRW_WRITEMASK_XYZW);
 	 }
 	 brw_ADD(p, dst[i], src0, negate(src1));
       }
@@ -611,12 +611,12 @@ static void emit_dp3( struct brw_compile *p,
 		      const struct brw_reg *arg0,
 		      const struct brw_reg *arg1 )
 {
-   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+   int dst_chan = _mesa_ffs(mask & BRW_WRITEMASK_XYZW) - 1;
 
-   if (!(mask & WRITEMASK_XYZW))
+   if (!(mask & BRW_WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert(is_power_of_two(mask & WRITEMASK_XYZW));
+   assert(is_power_of_two(mask & BRW_WRITEMASK_XYZW));
 
    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
@@ -633,12 +633,12 @@ static void emit_dp4( struct brw_compile *p,
 		      const struct brw_reg *arg0,
 		      const struct brw_reg *arg1 )
 {
-   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+   int dst_chan = _mesa_ffs(mask & BRW_WRITEMASK_XYZW) - 1;
 
-   if (!(mask & WRITEMASK_XYZW))
+   if (!(mask & BRW_WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert(is_power_of_two(mask & WRITEMASK_XYZW));
+   assert(is_power_of_two(mask & BRW_WRITEMASK_XYZW));
 
    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
@@ -656,12 +656,12 @@ static void emit_dph( struct brw_compile *p,
 		      const struct brw_reg *arg0,
 		      const struct brw_reg *arg1 )
 {
-   const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+   const int dst_chan = _mesa_ffs(mask & BRW_WRITEMASK_XYZW) - 1;
 
-   if (!(mask & WRITEMASK_XYZW))
+   if (!(mask & BRW_WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert(is_power_of_two(mask & WRITEMASK_XYZW));
+   assert(is_power_of_two(mask & BRW_WRITEMASK_XYZW));
 
    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
@@ -681,7 +681,7 @@ static void emit_xpd( struct brw_compile *p,
 {
    GLuint i;
 
-   assert(!(mask & WRITEMASK_W) == WRITEMASK_X);
+   assert(!(mask & BRW_WRITEMASK_W) == BRW_WRITEMASK_X);
    
    for (i = 0 ; i < 3; i++) {
       if (mask & (1<<i)) {
@@ -704,12 +704,12 @@ static void emit_math1( struct brw_compile *p,
 			GLuint mask,
 			const struct brw_reg *arg0 )
 {
-   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+   int dst_chan = _mesa_ffs(mask & BRW_WRITEMASK_XYZW) - 1;
 
-   if (!(mask & WRITEMASK_XYZW))
+   if (!(mask & BRW_WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert(is_power_of_two(mask & WRITEMASK_XYZW));
+   assert(is_power_of_two(mask & BRW_WRITEMASK_XYZW));
 
    brw_MOV(p, brw_message_reg(2), arg0[0]);
 
@@ -732,12 +732,12 @@ static void emit_math2( struct brw_compile *p,
 			const struct brw_reg *arg0,
 			const struct brw_reg *arg1)
 {
-   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+   int dst_chan = _mesa_ffs(mask & BRW_WRITEMASK_XYZW) - 1;
 
-   if (!(mask & WRITEMASK_XYZW))
+   if (!(mask & BRW_WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert(is_power_of_two(mask & WRITEMASK_XYZW));
+   assert(is_power_of_two(mask & BRW_WRITEMASK_XYZW));
 
    brw_push_insn_state(p);
 
@@ -795,17 +795,17 @@ static void emit_tex( struct brw_wm_compile *c,
     */
    switch (inst->tex_idx) {
    case TEXTURE_1D_INDEX:
-      emit = WRITEMASK_X;
+      emit = BRW_WRITEMASK_X;
       nr = 1;
       break;
    case TEXTURE_2D_INDEX:
    case TEXTURE_RECT_INDEX:
-      emit = WRITEMASK_XY;
+      emit = BRW_WRITEMASK_XY;
       nr = 2;
       break;
    case TEXTURE_3D_INDEX:
    case TEXTURE_CUBE_INDEX:
-      emit = WRITEMASK_XYZ;
+      emit = BRW_WRITEMASK_XYZ;
       nr = 3;
       break;
    default:
@@ -815,7 +815,7 @@ static void emit_tex( struct brw_wm_compile *c,
 
    if (inst->tex_shadow) {
       nr = 4;
-      emit |= WRITEMASK_W;
+      emit |= BRW_WRITEMASK_W;
    }
 
    msgLength = 1;
@@ -922,18 +922,18 @@ static void emit_lit( struct brw_compile *p,
 		      GLuint mask,
 		      const struct brw_reg *arg0 )
 {
-   assert((mask & WRITEMASK_XW) == 0);
+   assert((mask & BRW_WRITEMASK_XW) == 0);
 
-   if (mask & WRITEMASK_Y) {
+   if (mask & BRW_WRITEMASK_Y) {
       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
       brw_MOV(p, dst[1], arg0[0]);
       brw_set_saturate(p, 0);
    }
 
-   if (mask & WRITEMASK_Z) {
+   if (mask & BRW_WRITEMASK_Z) {
       emit_math2(p, BRW_MATH_FUNCTION_POW,
 		 &dst[2],
-		 WRITEMASK_X | (mask & SATURATE),
+		 BRW_WRITEMASK_X | (mask & SATURATE),
 		 &arg0[1],
 		 &arg0[3]);
    }
@@ -944,10 +944,10 @@ static void emit_lit( struct brw_compile *p,
     */
    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
    {
-      if (mask & WRITEMASK_Y) 
+      if (mask & BRW_WRITEMASK_Y) 
 	 brw_MOV(p, dst[1], brw_imm_f(0));
 
-      if (mask & WRITEMASK_Z) 
+      if (mask & BRW_WRITEMASK_Z) 
 	 brw_MOV(p, dst[2], brw_imm_f(0)); 
    }
    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
@@ -1414,10 +1414,10 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 /* There is an scs math function, but it would need some
 	  * fixup for 16-element execution.
 	  */
-	 if (dst_flags & WRITEMASK_X)
-	    emit_math1(p, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
-	 if (dst_flags & WRITEMASK_Y)
-	    emit_math1(p, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
+	 if (dst_flags & BRW_WRITEMASK_X)
+	    emit_math1(p, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|BRW_WRITEMASK_X, args[0]);
+	 if (dst_flags & BRW_WRITEMASK_Y)
+	    emit_math1(p, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|BRW_WRITEMASK_X, args[0]);
 	 break;
 
       case OPCODE_POW:
diff --git a/src/gallium/drivers/i965/brw_wm_fp.c b/src/gallium/drivers/i965/brw_wm_fp.c
index 5f47d86f71..be240031c7 100644
--- a/src/gallium/drivers/i965/brw_wm_fp.c
+++ b/src/gallium/drivers/i965/brw_wm_fp.c
@@ -115,7 +115,7 @@ static struct prog_dst_register dst_reg(GLuint file, GLuint idx)
    struct prog_dst_register reg;
    reg.File = file;
    reg.Index = idx;
-   reg.WriteMask = WRITEMASK_XYZW;
+   reg.WriteMask = BRW_WRITEMASK_XYZW;
    reg.RelAddr = 0;
    reg.CondMask = COND_TR;
    reg.CondSwizzle = 0;
@@ -249,7 +249,7 @@ static struct prog_src_register get_pixel_xy( struct brw_wm_compile *c )
        */
       emit_op(c,
 	      WM_PIXELXY,
-	      dst_mask(pixel_xy, WRITEMASK_XY),
+	      dst_mask(pixel_xy, BRW_WRITEMASK_XY),
 	      0,
 	      payload_r0_depth,
 	      src_undef(),
@@ -272,7 +272,7 @@ static struct prog_src_register get_delta_xy( struct brw_wm_compile *c )
        */
       emit_op(c,
 	      WM_DELTAXY,
-	      dst_mask(delta_xy, WRITEMASK_XY),
+	      dst_mask(delta_xy, BRW_WRITEMASK_XY),
 	      0,
 	      pixel_xy, 
 	      payload_r0_depth,
@@ -295,7 +295,7 @@ static struct prog_src_register get_pixel_w( struct brw_wm_compile *c )
        */
       emit_op(c,
 	      WM_PIXELW,
-	      dst_mask(pixel_w, WRITEMASK_W),
+	      dst_mask(pixel_w, BRW_WRITEMASK_W),
 	      0,
 	      interp_wpos,
 	      deltas, 
@@ -327,13 +327,13 @@ static void emit_interp( struct brw_wm_compile *c,
        */
       emit_op(c,
 	      WM_WPOSXY,
-	      dst_mask(dst, WRITEMASK_XY),
+	      dst_mask(dst, BRW_WRITEMASK_XY),
 	      0,
 	      get_pixel_xy(c),
 	      src_undef(),
 	      src_undef());
       
-      dst = dst_mask(dst, WRITEMASK_ZW);
+      dst = dst_mask(dst, BRW_WRITEMASK_ZW);
 
       /* PROGRAM_INPUT.attr.xyzw = INTERP payload.interp[attr].x, deltas.xyw
        */
@@ -370,7 +370,7 @@ static void emit_interp( struct brw_wm_compile *c,
       /* Interpolate the fog coordinate */
       emit_op(c,
 	      WM_PINTERP,
-	      dst_mask(dst, WRITEMASK_X),
+	      dst_mask(dst, BRW_WRITEMASK_X),
 	      0,
 	      interp,
 	      deltas,
@@ -378,7 +378,7 @@ static void emit_interp( struct brw_wm_compile *c,
 
       emit_op(c,
 	      TGSI_OPCODE_MOV,
-	      dst_mask(dst, WRITEMASK_YZW),
+	      dst_mask(dst, BRW_WRITEMASK_YZW),
 	      0,
 	      src_swizzle(interp,
 			  SWIZZLE_ZERO,
@@ -393,7 +393,7 @@ static void emit_interp( struct brw_wm_compile *c,
       /* XXX review/test this case */
       emit_op(c,
               WM_FRONTFACING,
-              dst_mask(dst, WRITEMASK_X),
+              dst_mask(dst, BRW_WRITEMASK_X),
               0,
               src_undef(),
               src_undef(),
@@ -404,7 +404,7 @@ static void emit_interp( struct brw_wm_compile *c,
       /* XXX review/test this case */
       emit_op(c,
 	      WM_PINTERP,
-	      dst_mask(dst, WRITEMASK_XY),
+	      dst_mask(dst, BRW_WRITEMASK_XY),
 	      0,
 	      interp,
 	      deltas,
@@ -412,7 +412,7 @@ static void emit_interp( struct brw_wm_compile *c,
 
       emit_op(c,
 	      TGSI_OPCODE_MOV,
-	      dst_mask(dst, WRITEMASK_ZW),
+	      dst_mask(dst, BRW_WRITEMASK_ZW),
 	      0,
 	      src_swizzle(interp,
 			  SWIZZLE_ZERO,
@@ -518,19 +518,19 @@ static void precalc_dst( struct brw_wm_compile *c,
    struct prog_src_register src1 = inst->SrcReg[1];
    struct prog_dst_register dst = inst->DstReg;
    
-   if (dst.WriteMask & WRITEMASK_Y) {      
+   if (dst.WriteMask & BRW_WRITEMASK_Y) {      
       /* dst.y = mul src0.y, src1.y
        */
       emit_op(c,
 	      TGSI_OPCODE_MUL,
-	      dst_mask(dst, WRITEMASK_Y),
+	      dst_mask(dst, BRW_WRITEMASK_Y),
 	      inst->SaturateMode,
 	      src0,
 	      src1,
 	      src_undef());
    }
 
-   if (dst.WriteMask & WRITEMASK_XZ) {
+   if (dst.WriteMask & BRW_WRITEMASK_XZ) {
       struct prog_instruction *swz;
       GLuint z = GET_SWZ(src0.Swizzle, Z);
 
@@ -538,7 +538,7 @@ static void precalc_dst( struct brw_wm_compile *c,
        */
       swz = emit_op(c,
 		    TGSI_OPCODE_MOV,
-		    dst_mask(dst, WRITEMASK_XZ),
+		    dst_mask(dst, BRW_WRITEMASK_XZ),
 		    inst->SaturateMode,
 		    src_swizzle(src0, SWIZZLE_ONE, z, z, z),
 		    src_undef(),
@@ -546,12 +546,12 @@ static void precalc_dst( struct brw_wm_compile *c,
       /* Avoid letting negation flag of src0 affect our 1 constant. */
       swz->SrcReg[0].Negate &= ~NEGATE_X;
    }
-   if (dst.WriteMask & WRITEMASK_W) {
+   if (dst.WriteMask & BRW_WRITEMASK_W) {
       /* dst.w = mov src1.w
        */
       emit_op(c,
 	      TGSI_OPCODE_MOV,
-	      dst_mask(dst, WRITEMASK_W),
+	      dst_mask(dst, BRW_WRITEMASK_W),
 	      inst->SaturateMode,
 	      src1,
 	      src_undef(),
@@ -566,14 +566,14 @@ static void precalc_lit( struct brw_wm_compile *c,
    struct prog_src_register src0 = inst->SrcReg[0];
    struct prog_dst_register dst = inst->DstReg;
    
-   if (dst.WriteMask & WRITEMASK_XW) {
+   if (dst.WriteMask & BRW_WRITEMASK_XW) {
       struct prog_instruction *swz;
 
       /* dst.xw = swz src0.1111
        */
       swz = emit_op(c,
 		    TGSI_OPCODE_MOV,
-		    dst_mask(dst, WRITEMASK_XW),
+		    dst_mask(dst, BRW_WRITEMASK_XW),
 		    0,
 		    src_swizzle1(src0, SWIZZLE_ONE),
 		    src_undef(),
@@ -582,10 +582,10 @@ static void precalc_lit( struct brw_wm_compile *c,
       swz->SrcReg[0].Negate = NEGATE_NONE;
    }
 
-   if (dst.WriteMask & WRITEMASK_YZ) {
+   if (dst.WriteMask & BRW_WRITEMASK_YZ) {
       emit_op(c,
 	      TGSI_OPCODE_LIT,
-	      dst_mask(dst, WRITEMASK_YZ),
+	      dst_mask(dst, BRW_WRITEMASK_YZ),
 	      inst->SaturateMode,
 	      src0,
 	      src_undef(),
@@ -649,7 +649,7 @@ static void precalc_tex( struct brw_wm_compile *c,
 
        /* tmp0 = 1 / tmp1 */
        emit_op(c, TGSI_OPCODE_RCP,
-               dst_mask(tmp0, WRITEMASK_X),
+               dst_mask(tmp0, BRW_WRITEMASK_X),
                0,
                tmp1src,
                src_undef(),
@@ -740,7 +740,7 @@ static void precalc_tex( struct brw_wm_compile *c,
        */
       emit_op(c,
 	      TGSI_OPCODE_ADD,
-	      dst_mask(tmp, WRITEMASK_XYZ),
+	      dst_mask(tmp, BRW_WRITEMASK_XYZ),
 	      0,
 	      tmpsrc,
 	      C0,
@@ -751,7 +751,7 @@ static void precalc_tex( struct brw_wm_compile *c,
 
       emit_op(c,
 	      TGSI_OPCODE_MUL,
-	      dst_mask(tmp, WRITEMASK_Y),
+	      dst_mask(tmp, BRW_WRITEMASK_Y),
 	      0,
 	      tmpsrc,
 	      src_swizzle1(C0, W),
@@ -766,7 +766,7 @@ static void precalc_tex( struct brw_wm_compile *c,
 
       emit_op(c,
 	      TGSI_OPCODE_MAD,
-	      dst_mask(dst, WRITEMASK_XYZ),
+	      dst_mask(dst, BRW_WRITEMASK_XYZ),
 	      0,
 	      swap_uv?src_swizzle(tmpsrc, Z,Z,X,X):src_swizzle(tmpsrc, X,X,Z,Z),
 	      C1,
@@ -776,7 +776,7 @@ static void precalc_tex( struct brw_wm_compile *c,
        */
       emit_op(c,
 	      TGSI_OPCODE_MAD,
-	      dst_mask(dst, WRITEMASK_Y),
+	      dst_mask(dst, BRW_WRITEMASK_Y),
 	      0,
 	      src_swizzle1(tmpsrc, Z),
 	      src_swizzle1(C1, W),
@@ -863,7 +863,7 @@ static void precalc_txp( struct brw_wm_compile *c,
        */
       emit_op(c,
 	      TGSI_OPCODE_RCP,
-	      dst_mask(tmp, WRITEMASK_W),
+	      dst_mask(tmp, BRW_WRITEMASK_W),
 	      0,
 	      src_swizzle1(src0, GET_SWZ(src0.Swizzle, W)),
 	      src_undef(),
@@ -873,7 +873,7 @@ static void precalc_txp( struct brw_wm_compile *c,
        */
       emit_op(c,
 	      TGSI_OPCODE_MUL,
-	      dst_mask(tmp, WRITEMASK_XYZ),
+	      dst_mask(tmp, BRW_WRITEMASK_XYZ),
 	      0,
 	      src0,
 	      src_swizzle1(src_reg_from_dst(tmp), W),
@@ -1053,7 +1053,7 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
 	 out = emit_insn(c, inst);
 	 /* This should probably be done in the parser. 
 	  */
-	 out->DstReg.WriteMask &= WRITEMASK_XY;
+	 out->DstReg.WriteMask &= BRW_WRITEMASK_XY;
 	 break;
 	 
       case TGSI_OPCODE_DST:
@@ -1082,7 +1082,7 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
 	 out = emit_insn(c, inst);
 	 /* This should probably be done in the parser. 
 	  */
-	 out->DstReg.WriteMask &= WRITEMASK_XYZ;
+	 out->DstReg.WriteMask &= BRW_WRITEMASK_XYZ;
 	 break;
 
       case TGSI_OPCODE_KIL: 
diff --git a/src/gallium/drivers/i965/brw_wm_pass0.c b/src/gallium/drivers/i965/brw_wm_pass0.c
index 0c411b57f5..de5f5fe821 100644
--- a/src/gallium/drivers/i965/brw_wm_pass0.c
+++ b/src/gallium/drivers/i965/brw_wm_pass0.c
@@ -32,7 +32,6 @@
 
 #include "brw_context.h"
 #include "brw_wm.h"
-#include "shader/prog_parameter.h"
 
 
diff --git a/src/gallium/drivers/i965/brw_wm_pass1.c b/src/gallium/drivers/i965/brw_wm_pass1.c
index d940ec09a9..f2ae3a958f 100644
--- a/src/gallium/drivers/i965/brw_wm_pass1.c
+++ b/src/gallium/drivers/i965/brw_wm_pass1.c
@@ -91,15 +91,15 @@ static GLuint get_texcoord_mask( GLuint tex_idx )
 {
    switch (tex_idx) {
    case TEXTURE_1D_INDEX:
-      return WRITEMASK_X;
+      return BRW_WRITEMASK_X;
    case TEXTURE_2D_INDEX:
-      return WRITEMASK_XY;
+      return BRW_WRITEMASK_XY;
    case TEXTURE_3D_INDEX:
-      return WRITEMASK_XYZ;
+      return BRW_WRITEMASK_XYZ;
    case TEXTURE_CUBE_INDEX:
-      return WRITEMASK_XYZ;
+      return BRW_WRITEMASK_XYZ;
    case TEXTURE_RECT_INDEX:
-      return WRITEMASK_XY;
+      return BRW_WRITEMASK_XY;
    default: return 0;
    }
 }
@@ -121,16 +121,16 @@ void brw_wm_pass1( struct brw_wm_compile *c )
       GLuint read0, read1, read2;
 
       if (inst->opcode == TGSI_OPCODE_KIL) {
-	 track_arg(c, inst, 0, WRITEMASK_XYZW); /* All args contribute to final */
+	 track_arg(c, inst, 0, BRW_WRITEMASK_XYZW); /* All args contribute to final */
 	 continue;
       }
 
       if (inst->opcode == WM_FB_WRITE) {
-	 track_arg(c, inst, 0, WRITEMASK_XYZW); 
-	 track_arg(c, inst, 1, WRITEMASK_XYZW); 
+	 track_arg(c, inst, 0, BRW_WRITEMASK_XYZW); 
+	 track_arg(c, inst, 1, BRW_WRITEMASK_XYZW); 
 	 if (c->key.source_depth_to_render_target &&
 	     c->key.computes_depth)
-	    track_arg(c, inst, 2, WRITEMASK_Z); 
+	    track_arg(c, inst, 2, BRW_WRITEMASK_Z); 
 	 else
 	    track_arg(c, inst, 2, 0); 
 	 continue;
@@ -191,9 +191,9 @@ void brw_wm_pass1( struct brw_wm_compile *c )
 	 break;
 
       case TGSI_OPCODE_XPD: 
-	 if (writemask & WRITEMASK_X) read0 |= WRITEMASK_YZ;	 
-	 if (writemask & WRITEMASK_Y) read0 |= WRITEMASK_XZ;	 
-	 if (writemask & WRITEMASK_Z) read0 |= WRITEMASK_XY;
+	 if (writemask & BRW_WRITEMASK_X) read0 |= BRW_WRITEMASK_YZ;	 
+	 if (writemask & BRW_WRITEMASK_Y) read0 |= BRW_WRITEMASK_XZ;	 
+	 if (writemask & BRW_WRITEMASK_Z) read0 |= BRW_WRITEMASK_XY;
 	 read1 = read0;
 	 break;
 
@@ -206,12 +206,12 @@ void brw_wm_pass1( struct brw_wm_compile *c )
       case TGSI_OPCODE_SCS:
       case WM_CINTERP:
       case WM_PIXELXY:
-	 read0 = WRITEMASK_X;
+	 read0 = BRW_WRITEMASK_X;
 	 break;
 
       case TGSI_OPCODE_POW:
-	 read0 = WRITEMASK_X;
-	 read1 = WRITEMASK_X;
+	 read0 = BRW_WRITEMASK_X;
+	 read1 = BRW_WRITEMASK_X;
 	 break;
 
       case TGSI_OPCODE_TEX:
@@ -219,57 +219,57 @@ void brw_wm_pass1( struct brw_wm_compile *c )
 	 read0 = get_texcoord_mask(inst->tex_idx);
 
          if (inst->tex_shadow)
-	    read0 |= WRITEMASK_Z;
+	    read0 |= BRW_WRITEMASK_Z;
 	 break;
 
       case TGSI_OPCODE_TXB:
 	 /* Shadow ignored for txb.
 	  */
-	 read0 = get_texcoord_mask(inst->tex_idx) | WRITEMASK_W;
+	 read0 = get_texcoord_mask(inst->tex_idx) | BRW_WRITEMASK_W;
 	 break;
 
       case WM_WPOSXY:
-	 read0 = writemask & WRITEMASK_XY;
+	 read0 = writemask & BRW_WRITEMASK_XY;
 	 break;
 
       case WM_DELTAXY:
-	 read0 = writemask & WRITEMASK_XY;
-	 read1 = WRITEMASK_X;
+	 read0 = writemask & BRW_WRITEMASK_XY;
+	 read1 = BRW_WRITEMASK_X;
 	 break;
 
       case WM_PIXELW:
-	 read0 = WRITEMASK_X;
-	 read1 = WRITEMASK_XY;
+	 read0 = BRW_WRITEMASK_X;
+	 read1 = BRW_WRITEMASK_XY;
 	 break;
 
       case WM_LINTERP:
-	 read0 = WRITEMASK_X;
-	 read1 = WRITEMASK_XY;
+	 read0 = BRW_WRITEMASK_X;
+	 read1 = BRW_WRITEMASK_XY;
 	 break;
 
       case WM_PINTERP:
-	 read0 = WRITEMASK_X; /* interpolant */
-	 read1 = WRITEMASK_XY; /* deltas */
-	 read2 = WRITEMASK_W; /* pixel w */
+	 read0 = BRW_WRITEMASK_X; /* interpolant */
+	 read1 = BRW_WRITEMASK_XY; /* deltas */
+	 read2 = BRW_WRITEMASK_W; /* pixel w */
 	 break;
 
       case TGSI_OPCODE_DP3:	
-	 read0 = WRITEMASK_XYZ;
-	 read1 = WRITEMASK_XYZ;
+	 read0 = BRW_WRITEMASK_XYZ;
+	 read1 = BRW_WRITEMASK_XYZ;
 	 break;
 
       case TGSI_OPCODE_DPH:
-	 read0 = WRITEMASK_XYZ;
-	 read1 = WRITEMASK_XYZW;
+	 read0 = BRW_WRITEMASK_XYZ;
+	 read1 = BRW_WRITEMASK_XYZW;
 	 break;
 
       case TGSI_OPCODE_DP4:
-	 read0 = WRITEMASK_XYZW;
-	 read1 = WRITEMASK_XYZW;
+	 read0 = BRW_WRITEMASK_XYZW;
+	 read1 = BRW_WRITEMASK_XYZW;
 	 break;
 
       case TGSI_OPCODE_LIT: 
-	 read0 = WRITEMASK_XYW;
+	 read0 = BRW_WRITEMASK_XYW;
 	 break;
 
       case TGSI_OPCODE_DST:
diff --git a/src/gallium/drivers/i965/brw_wm_surface_state.c b/src/gallium/drivers/i965/brw_wm_surface_state.c
index 86dcb74b5b..5045c9b4a6 100644
--- a/src/gallium/drivers/i965/brw_wm_surface_state.c
+++ b/src/gallium/drivers/i965/brw_wm_surface_state.c
@@ -31,7 +31,7 @@
                    
 
 #include "intel_mipmap_tree.h"
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 #include "intel_tex.h"
 #include "intel_fbo.h"
 
diff --git a/src/gallium/drivers/i965/intel_batchbuffer.h b/src/gallium/drivers/i965/intel_batchbuffer.h
deleted file mode 100644
index be04656aec..0000000000
--- a/src/gallium/drivers/i965/intel_batchbuffer.h
+++ /dev/null
@@ -1,168 +0,0 @@
-#ifndef INTEL_BATCHBUFFER_H
-#define INTEL_BATCHBUFFER_H
-
-#include "intel_bufmgr.h"
-#include "intel_reg.h"
-
-#define BATCH_SZ 16384
-#define BATCH_RESERVED 16
-
-enum cliprect_mode {
-   /**
-    * Batchbuffer contents may be looped over per cliprect, but do not
-    * require it.
-    */
-   IGNORE_CLIPRECTS,
-   /**
-    * Batchbuffer contents require looping over per cliprect at batch submit
-    * time.
-    *
-    * This will be upgraded to NO_LOOP_CLIPRECTS when there's a single
-    * constant cliprect, as in DRI2 or FBO rendering.
-    */
-   LOOP_CLIPRECTS,
-   /**
-    * Batchbuffer contents contain drawing that should not be executed multiple
-    * times.
-    */
-   NO_LOOP_CLIPRECTS,
-   /**
-    * Batchbuffer contents contain drawing that already handles cliprects, such
-    * as 2D drawing to front/back/depth that doesn't respect DRAWING_RECTANGLE.
-    *
-    * Equivalent behavior to NO_LOOP_CLIPRECTS, but may not persist in batch
-    * outside of LOCK/UNLOCK.  This is upgraded to just NO_LOOP_CLIPRECTS when
-    * there's a constant cliprect, as in DRI2 or FBO rendering.
-    */
-   REFERENCES_CLIPRECTS
-};
-
-struct intel_batchbuffer
-{
-   struct intel_context *intel;
-
-   struct brw_winsys_buffer *buf;
-
-   GLubyte *buffer;
-
-   GLubyte *map;
-   GLubyte *ptr;
-
-   GLuint size;
-
-   /** Tracking of BEGIN_BATCH()/OUT_BATCH()/ADVANCE_BATCH() debugging */
-   struct {
-      GLuint total;
-      GLubyte *start_ptr;
-   } emit;
-
-   GLuint dirty_state;
-};
-
-struct intel_batchbuffer *intel_batchbuffer_alloc(struct intel_context
-                                                  *intel);
-
-void intel_batchbuffer_free(struct intel_batchbuffer *batch);
-
-
-void _intel_batchbuffer_flush(struct intel_batchbuffer *batch,
-			      const char *file, int line);
-
-#define intel_batchbuffer_flush(batch) \
-	_intel_batchbuffer_flush(batch, __FILE__, __LINE__)
-
-void intel_batchbuffer_reset(struct intel_batchbuffer *batch);
-
-
-/* Unlike bmBufferData, this currently requires the buffer be mapped.
- * Consider it a convenience function wrapping multple
- * intel_buffer_dword() calls.
- */
-void intel_batchbuffer_data(struct intel_batchbuffer *batch,
-                            const void *data, GLuint bytes,
-			    enum cliprect_mode cliprect_mode);
-
-void intel_batchbuffer_release_space(struct intel_batchbuffer *batch,
-                                     GLuint bytes);
-
-GLboolean intel_batchbuffer_emit_reloc(struct intel_batchbuffer *batch,
-                                       struct brw_winsys_buffer *buffer,
-				       uint32_t read_domains,
-				       uint32_t write_domain,
-				       uint32_t offset);
-
-/* Inline functions - might actually be better off with these
- * non-inlined.  Certainly better off switching all command packets to
- * be passed as structs rather than dwords, but that's a little bit of
- * work...
- */
-static INLINE GLint
-intel_batchbuffer_space(struct intel_batchbuffer *batch)
-{
-   return (batch->size - BATCH_RESERVED) - (batch->ptr - batch->map);
-}
-
-
-static INLINE void
-intel_batchbuffer_emit_dword(struct intel_batchbuffer *batch, GLuint dword)
-{
-   assert(batch->map);
-   assert(intel_batchbuffer_space(batch) >= 4);
-   *(GLuint *) (batch->ptr) = dword;
-   batch->ptr += 4;
-}
-
-static INLINE void
-intel_batchbuffer_require_space(struct intel_batchbuffer *batch,
-                                GLuint sz,
-				enum cliprect_mode cliprect_mode)
-{
-   assert(sz < batch->size - 8);
-   if (intel_batchbuffer_space(batch) < sz)
-      intel_batchbuffer_flush(batch);
-
-   /* All commands should be executed once regardless of cliprect
-    * mode.
-    */
-   (void)cliprect_mode;
-}
-
-/* Here are the crusty old macros, to be removed:
- */
-#define BATCH_LOCALS
-
-#define BEGIN_BATCH(n, cliprect_mode) do {				\
-   intel_batchbuffer_require_space(intel->batch, (n)*4, cliprect_mode); \
-   assert(intel->batch->emit.start_ptr == NULL);			\
-   intel->batch->emit.total = (n) * 4;					\
-   intel->batch->emit.start_ptr = intel->batch->ptr;			\
-} while (0)
-
-#define OUT_BATCH(d) intel_batchbuffer_emit_dword(intel->batch, d)
-
-#define OUT_RELOC(buf, read_domains, write_domain, delta) do {		\
-   assert((unsigned) (delta) < buf->size);				\
-   intel_batchbuffer_emit_reloc(intel->batch, buf,			\
-				read_domains, write_domain, delta);	\
-} while (0)
-
-#define ADVANCE_BATCH() do {						\
-   unsigned int _n = intel->batch->ptr - intel->batch->emit.start_ptr;	\
-   assert(intel->batch->emit.start_ptr != NULL);			\
-   if (_n != intel->batch->emit.total) {				\
-      fprintf(stderr, "ADVANCE_BATCH: %d of %d dwords emitted\n",	\
-	      _n, intel->batch->emit.total);				\
-      abort();								\
-   }									\
-   intel->batch->emit.start_ptr = NULL;					\
-} while(0)
-
-
-static INLINE void
-intel_batchbuffer_emit_mi_flush(struct intel_batchbuffer *batch)
-{
-   intel_batchbuffer_require_space(batch, 4, IGNORE_CLIPRECTS);
-   intel_batchbuffer_emit_dword(batch, MI_FLUSH);
-}
-
-#endif
-- 
cgit v1.2.3


From 562ca4eae257dd3b268e7f13487c8cd91f618eae Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Sun, 25 Oct 2009 01:20:56 +0100
Subject: i965g: more compiling wip

---
 src/gallium/drivers/i965/brw_context.h    |  15 +-
 src/gallium/drivers/i965/brw_curbe.c      |   3 +-
 src/gallium/drivers/i965/brw_gs.c         |  48 +++----
 src/gallium/drivers/i965/brw_gs.h         |   4 +-
 src/gallium/drivers/i965/brw_gs_state.c   |  21 +--
 src/gallium/drivers/i965/brw_misc_state.c | 222 ++++++++++++++----------------
 src/gallium/drivers/i965/brw_pipe_blend.c |  19 +++
 src/gallium/drivers/i965/brw_pipe_rast.c  |  20 +++
 src/gallium/drivers/i965/brw_screen.h     |   7 +
 src/gallium/drivers/i965/brw_sf.c         |   2 +-
 src/gallium/drivers/i965/brw_state.h      |   4 +-
 src/gallium/drivers/i965/brw_urb.c        |   3 +-
 src/gallium/drivers/i965/brw_vs.c         |   4 +-
 src/gallium/drivers/i965/brw_vs_emit.c    |  67 +++++----
 src/gallium/drivers/i965/brw_wm.c         |   2 +-
 src/gallium/drivers/i965/brw_wm.h         |   2 +-
 16 files changed, 243 insertions(+), 200 deletions(-)

(limited to 'src/gallium/drivers/i965/brw_vs.c')

diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h
index 7ead641811..2e17e150bb 100644
--- a/src/gallium/drivers/i965/brw_context.h
+++ b/src/gallium/drivers/i965/brw_context.h
@@ -184,6 +184,8 @@ struct brw_fragment_shader {
 #define PIPE_NEW_CLIP                   0x2
 #define PIPE_NEW_INDEX_BUFFER           0x2
 #define PIPE_NEW_INDEX_RANGE            0x2
+#define PIPE_NEW_BLEND_COLOR            0x2
+#define PIPE_NEW_POLYGON_STIPPLE        0x2
 
 
 #define BRW_NEW_URB_FENCE               0x1
@@ -202,7 +204,9 @@ struct brw_fragment_shader {
 #define BRW_NEW_VERTICES		0x8000
 /**
  * Used for any batch entry with a relocated pointer that will be used
- * by any 3D rendering.
+ * by any 3D rendering.  Need to re-emit these fresh in each
+ * batchbuffer as the referenced buffers may be relocated in the
+ * meantime.
  */
 #define BRW_NEW_BATCH			0x10000
 /** brw->depth_region updated */
@@ -271,7 +275,7 @@ struct brw_vs_prog_data {
    GLuint curb_read_length;
    GLuint urb_read_length;
    GLuint total_grf;
-   GLuint outputs_written;
+   GLuint nr_outputs_written;
    GLuint nr_params;       /**< number of float params/constants */
 
    GLuint inputs_read;
@@ -487,6 +491,9 @@ struct brw_context
       struct pipe_buffer *vertex_constants;
       struct pipe_buffer *fragment_constants;
 
+      struct brw_blend_constant_color bcc;
+      struct brw_polygon_stipple bps;
+
       /**
        * Index buffer for this draw_prims call.
        *
@@ -726,11 +733,11 @@ void brw_init_shader_funcs( struct brw_context *brw );
 
 /* brw_urb.c
  */
-void brw_upload_urb_fence(struct brw_context *brw);
+int brw_upload_urb_fence(struct brw_context *brw);
 
 /* brw_curbe.c
  */
-void brw_upload_cs_urb_state(struct brw_context *brw);
+int brw_upload_cs_urb_state(struct brw_context *brw);
 
 /* brw_disasm.c */
 int brw_disasm (FILE *file, struct brw_instruction *inst);
diff --git a/src/gallium/drivers/i965/brw_curbe.c b/src/gallium/drivers/i965/brw_curbe.c
index 278ffa4ca2..3dd08f6eeb 100644
--- a/src/gallium/drivers/i965/brw_curbe.c
+++ b/src/gallium/drivers/i965/brw_curbe.c
@@ -126,7 +126,7 @@ const struct brw_tracked_state brw_curbe_offsets = {
  * fixed-function hardware in a double-buffering scheme to avoid a
  * pipeline stall each time the contents of the curbe is changed.
  */
-void brw_upload_cs_urb_state(struct brw_context *brw)
+int brw_upload_cs_urb_state(struct brw_context *brw)
 {
    struct brw_cs_urb_state cs_urb;
    memset(&cs_urb, 0, sizeof(cs_urb));
@@ -144,6 +144,7 @@ void brw_upload_cs_urb_state(struct brw_context *brw)
 
    assert(brw->urb.nr_cs_entries);
    BRW_CACHED_BATCH_STRUCT(brw, &cs_urb);
+   return 0;
 }
 
 static GLfloat fixed_plane[6][4] = {
diff --git a/src/gallium/drivers/i965/brw_gs.c b/src/gallium/drivers/i965/brw_gs.c
index 692ce46679..3ecaa74e4f 100644
--- a/src/gallium/drivers/i965/brw_gs.c
+++ b/src/gallium/drivers/i965/brw_gs.c
@@ -54,7 +54,7 @@ static void compile_gs_prog( struct brw_context *brw,
    /* Need to locate the two positions present in vertex + header.
     * These are currently hardcoded:
     */
-   c.nr_attrs = util_count_bits(c.key.attrs);
+   c.nr_attrs = c.key.nr_attrs;
 
    if (BRW_IS_IGDNG(brw))
        c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
@@ -80,30 +80,30 @@ static void compile_gs_prog( struct brw_context *brw,
     * already been weeded out by this stage:
     */
    switch (key->primitive) {
-   case GL_QUADS:
+   case PIPE_PRIM_QUADS:
       brw_gs_quads( &c ); 
       break;
-   case GL_QUAD_STRIP:
+   case PIPE_PRIM_QUAD_STRIP:
       brw_gs_quad_strip( &c );
       break;
-   case GL_LINE_LOOP:
+   case PIPE_PRIM_LINE_LOOP:
       brw_gs_lines( &c );
       break;
-   case GL_LINES:
+   case PIPE_PRIM_LINES:
       if (key->hint_gs_always)
 	 brw_gs_lines( &c );
       else {
 	 return;
       }
       break;
-   case GL_TRIANGLES:
+   case PIPE_PRIM_TRIANGLES:
       if (key->hint_gs_always)
 	 brw_gs_tris( &c );
       else {
 	 return;
       }
       break;
-   case GL_POINTS:
+   case PIPE_PRIM_POINTS:
       if (key->hint_gs_always)
 	 brw_gs_points( &c );
       else {
@@ -129,17 +129,17 @@ static void compile_gs_prog( struct brw_context *brw,
 				       &brw->gs.prog_data );
 }
 
-static const GLenum gs_prim[GL_POLYGON+1] = {  
-   GL_POINTS,
-   GL_LINES,
-   GL_LINE_LOOP,
-   GL_LINES,
-   GL_TRIANGLES,
-   GL_TRIANGLES,
-   GL_TRIANGLES,
-   GL_QUADS,
-   GL_QUAD_STRIP,
-   GL_TRIANGLES
+static const unsigned gs_prim[PIPE_PRIM_MAX] = {  
+   PIPE_PRIM_POINTS,
+   PIPE_PRIM_LINES,
+   PIPE_PRIM_LINE_LOOP,
+   PIPE_PRIM_LINES,
+   PIPE_PRIM_TRIANGLES,
+   PIPE_PRIM_TRIANGLES,
+   PIPE_PRIM_TRIANGLES,
+   PIPE_PRIM_QUADS,
+   PIPE_PRIM_QUAD_STRIP,
+   PIPE_PRIM_TRIANGLES
 };
 
 static void populate_key( struct brw_context *brw,
@@ -148,7 +148,7 @@ static void populate_key( struct brw_context *brw,
    memset(key, 0, sizeof(*key));
 
    /* CACHE_NEW_VS_PROG */
-   key->attrs = brw->vs.prog_data->outputs_written;
+   key->nr_attrs = brw->vs.prog_data->nr_outputs_written;
 
    /* BRW_NEW_PRIMITIVE */
    key->primitive = gs_prim[brw->primitive];
@@ -156,14 +156,14 @@ static void populate_key( struct brw_context *brw,
    key->hint_gs_always = 0;	/* debug code? */
 
    key->need_gs_prog = (key->hint_gs_always ||
-			brw->primitive == GL_QUADS ||
-			brw->primitive == GL_QUAD_STRIP ||
-			brw->primitive == GL_LINE_LOOP);
+			brw->primitive == PIPE_PRIM_QUADS ||
+			brw->primitive == PIPE_PRIM_QUAD_STRIP ||
+			brw->primitive == PIPE_PRIM_LINE_LOOP);
 }
 
 /* Calculate interpolants for triangle and line rasterization.
  */
-static void prepare_gs_prog(struct brw_context *brw)
+static int prepare_gs_prog(struct brw_context *brw)
 {
    struct brw_gs_prog_key key;
    /* Populate the key:
@@ -184,6 +184,8 @@ static void prepare_gs_prog(struct brw_context *brw)
       if (brw->gs.prog_bo == NULL)
 	 compile_gs_prog( brw, &key );
    }
+
+   return 0;
 }
 
 
diff --git a/src/gallium/drivers/i965/brw_gs.h b/src/gallium/drivers/i965/brw_gs.h
index bbb991ea2e..6e616dcb87 100644
--- a/src/gallium/drivers/i965/brw_gs.h
+++ b/src/gallium/drivers/i965/brw_gs.h
@@ -40,11 +40,11 @@
 #define MAX_GS_VERTS (4)	     
 
 struct brw_gs_prog_key {
-   GLuint attrs:32;
+   GLuint nr_attrs:8;
    GLuint primitive:4;
    GLuint hint_gs_always:1;
    GLuint need_gs_prog:1;
-   GLuint pad:26;
+   GLuint pad:18;
 };
 
 struct brw_gs_compile {
diff --git a/src/gallium/drivers/i965/brw_gs_state.c b/src/gallium/drivers/i965/brw_gs_state.c
index 6d03d72d96..15a66c9741 100644
--- a/src/gallium/drivers/i965/brw_gs_state.c
+++ b/src/gallium/drivers/i965/brw_gs_state.c
@@ -29,11 +29,12 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
  
-
+#include "util/u_math.h"
 
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
+#include "brw_debug.h"
 
 struct brw_gs_unit_key {
    unsigned int total_grf;
@@ -76,7 +77,7 @@ gs_unit_create_from_key(struct brw_context *brw, struct brw_gs_unit_key *key)
 
    memset(&gs, 0, sizeof(gs));
 
-   gs.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1;
+   gs.thread0.grf_reg_count = align(key->total_grf, 16) / 16 - 1;
    if (key->prog_active) /* reloc */
       gs.thread0.kernel_start_pointer = brw->gs.prog_bo->offset >> 6;
 
@@ -100,7 +101,7 @@ gs_unit_create_from_key(struct brw_context *brw, struct brw_gs_unit_key *key)
    if (BRW_IS_IGDNG(brw))
       gs.thread4.rendering_enable = 1;
 
-   if (INTEL_DEBUG & DEBUG_STATS)
+   if (BRW_DEBUG & DEBUG_STATS)
       gs.thread4.stats_enable = 1;
 
    bo = brw_upload_cache(&brw->cache, BRW_GS_UNIT,
@@ -111,17 +112,17 @@ gs_unit_create_from_key(struct brw_context *brw, struct brw_gs_unit_key *key)
 
    if (key->prog_active) {
       /* Emit GS program relocation */
-      dri_bo_emit_reloc(bo,
-			I915_GEM_DOMAIN_INSTRUCTION, 0,
-			gs.thread0.grf_reg_count << 1,
-			offsetof(struct brw_gs_unit_state, thread0),
-			brw->gs.prog_bo);
+      brw->sws->bo_emit_reloc(bo,
+			      I915_GEM_DOMAIN_INSTRUCTION, 0,
+			      gs.thread0.grf_reg_count << 1,
+			      offsetof(struct brw_gs_unit_state, thread0),
+			      brw->gs.prog_bo);
    }
 
    return bo;
 }
 
-static void prepare_gs_unit(struct brw_context *brw)
+static int prepare_gs_unit(struct brw_context *brw)
 {
    struct brw_gs_unit_key key;
 
@@ -135,6 +136,8 @@ static void prepare_gs_unit(struct brw_context *brw)
    if (brw->gs.state_bo == NULL) {
       brw->gs.state_bo = gs_unit_create_from_key(brw, &key);
    }
+
+   return 0;
 }
 
 const struct brw_tracked_state brw_gs_unit = {
diff --git a/src/gallium/drivers/i965/brw_misc_state.c b/src/gallium/drivers/i965/brw_misc_state.c
index 98fec85c1d..ccebe08b4f 100644
--- a/src/gallium/drivers/i965/brw_misc_state.c
+++ b/src/gallium/drivers/i965/brw_misc_state.c
@@ -31,10 +31,12 @@
  
 
+#include "brw_debug.h"
 #include "brw_batchbuffer.h"
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
+#include "brw_screen.h"
 
 
@@ -44,25 +46,16 @@
  * Blend color
  */
 
-static void upload_blend_constant_color(struct brw_context *brw)
+static int upload_blend_constant_color(struct brw_context *brw)
 {
-   struct brw_blend_constant_color bcc;
-
-   memset(&bcc, 0, sizeof(bcc));      
-   bcc.header.opcode = CMD_BLEND_CONSTANT_COLOR;
-   bcc.header.length = sizeof(bcc)/4-2;
-   bcc.blend_constant_color[0] = ctx->Color.BlendColor[0];
-   bcc.blend_constant_color[1] = ctx->Color.BlendColor[1];
-   bcc.blend_constant_color[2] = ctx->Color.BlendColor[2];
-   bcc.blend_constant_color[3] = ctx->Color.BlendColor[3];
-
-   BRW_CACHED_BATCH_STRUCT(brw, &bcc);
+   BRW_CACHED_BATCH_STRUCT(brw, &brw->curr.bcc);
+   return 0;
 }
 
 
 const struct brw_tracked_state brw_blend_constant_color = {
    .dirty = {
-      .mesa = _NEW_COLOR,
+      .mesa = PIPE_NEW_BLEND_COLOR,
       .brw = 0,
       .cache = 0
    },
@@ -70,30 +63,32 @@ const struct brw_tracked_state brw_blend_constant_color = {
 };
 
 /* Constant single cliprect for framebuffer object or DRI2 drawing */
-static void upload_drawing_rect(struct brw_context *brw)
+static int upload_drawing_rect(struct brw_context *brw)
 {
    BEGIN_BATCH(4, NO_LOOP_CLIPRECTS);
    OUT_BATCH(_3DSTATE_DRAWRECT_INFO_I965);
    OUT_BATCH(0);
-   OUT_BATCH(((brw->fb.width - 1) & 0xffff) |
-	    ((brw->fb.height - 1) << 16));
+   OUT_BATCH(((brw->curr.fb.width - 1) & 0xffff) |
+	    ((brw->curr.fb.height - 1) << 16));
    OUT_BATCH(0);
    ADVANCE_BATCH();
+   return 0;
 }
 
 const struct brw_tracked_state brw_drawing_rect = {
    .dirty = {
-      .mesa = _NEW_BUFFERS,
+      .mesa = PIPE_NEW_FRAMEBUFFER,
       .brw = 0,
       .cache = 0
    },
    .emit = upload_drawing_rect
 };
 
-static void prepare_binding_table_pointers(struct brw_context *brw)
+static int prepare_binding_table_pointers(struct brw_context *brw)
 {
    brw_add_validated_bo(brw, brw->vs.bind_bo);
    brw_add_validated_bo(brw, brw->wm.bind_bo);
+   return 0;
 }
 
 /**
@@ -103,7 +98,7 @@ static void prepare_binding_table_pointers(struct brw_context *brw)
  * The binding table pointers are relative to the surface state base address,
  * which is 0.
  */
-static void upload_binding_table_pointers(struct brw_context *brw)
+static int upload_binding_table_pointers(struct brw_context *brw)
 {
    BEGIN_BATCH(6, IGNORE_CLIPRECTS);
    OUT_BATCH(CMD_BINDING_TABLE_PTRS << 16 | (6 - 2));
@@ -116,6 +111,7 @@ static void upload_binding_table_pointers(struct brw_context *brw)
    OUT_BATCH(0); /* sf */
    OUT_RELOC(brw->wm.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* wm/ps */
    ADVANCE_BATCH();
+   return 0;
 }
 
 const struct brw_tracked_state brw_binding_table_pointers = {
@@ -135,7 +131,7 @@ const struct brw_tracked_state brw_binding_table_pointers = {
  * The state pointers in this packet are all relative to the general state
  * base address set by CMD_STATE_BASE_ADDRESS, which is 0.
  */
-static void upload_pipelined_state_pointers(struct brw_context *brw )
+static int upload_pipelined_state_pointers(struct brw_context *brw )
 {
    BEGIN_BATCH(7, IGNORE_CLIPRECTS);
    OUT_BATCH(CMD_PIPELINED_STATE_POINTERS << 16 | (7 - 2));
@@ -151,10 +147,11 @@ static void upload_pipelined_state_pointers(struct brw_context *brw )
    ADVANCE_BATCH();
 
    brw->state.dirty.brw |= BRW_NEW_PSP;
+   return 0;
 }
 
 
-static void prepare_psp_urb_cbs(struct brw_context *brw)
+static int prepare_psp_urb_cbs(struct brw_context *brw)
 {
    brw_add_validated_bo(brw, brw->vs.state_bo);
    brw_add_validated_bo(brw, brw->gs.state_bo);
@@ -162,13 +159,26 @@ static void prepare_psp_urb_cbs(struct brw_context *brw)
    brw_add_validated_bo(brw, brw->sf.state_bo);
    brw_add_validated_bo(brw, brw->wm.state_bo);
    brw_add_validated_bo(brw, brw->cc.state_bo);
+   return 0;
 }
 
-static void upload_psp_urb_cbs(struct brw_context *brw )
+static int upload_psp_urb_cbs(struct brw_context *brw )
 {
-   upload_pipelined_state_pointers(brw);
-   brw_upload_urb_fence(brw);
-   brw_upload_cs_urb_state(brw);
+   int ret;
+   
+   ret = upload_pipelined_state_pointers(brw);
+   if (ret)
+      return ret;
+
+   ret = brw_upload_urb_fence(brw);
+   if (ret)
+      return ret;
+
+   ret = brw_upload_cs_urb_state(brw);
+   if (ret)
+      return ret;
+
+   return 0;
 }
 
 const struct brw_tracked_state brw_psp_urb_cbs = {
@@ -187,20 +197,22 @@ const struct brw_tracked_state brw_psp_urb_cbs = {
    .emit = upload_psp_urb_cbs,
 };
 
-static void prepare_depthbuffer(struct brw_context *brw)
+static int prepare_depthbuffer(struct brw_context *brw)
 {
-   struct intel_region *region = brw->state.depth_region;
+   struct pipe_surface *zsbuf = brw->curr.fb.zsbuf;
 
-   if (region != NULL)
-      brw_add_validated_bo(brw, region->buffer);
+   if (zsbuf)
+      brw_add_validated_bo(brw, brw_surface_bo(zsbuf));
+
+   return 0;
 }
 
-static void emit_depthbuffer(struct brw_context *brw)
+static int emit_depthbuffer(struct brw_context *brw)
 {
-   struct intel_region *region = brw->state.depth_region;
+   struct pipe_surface *surface = brw->curr.fb.zsbuf;
    unsigned int len = (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? 6 : 5;
 
-   if (region == NULL) {
+   if (surface == NULL) {
       BEGIN_BATCH(len, IGNORE_CLIPRECTS);
       OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2));
       OUT_BATCH((BRW_DEPTHFORMAT_D32_FLOAT << 18) |
@@ -214,38 +226,45 @@ static void emit_depthbuffer(struct brw_context *brw)
 
       ADVANCE_BATCH();
    } else {
+      struct brw_winsys_buffer *bo;
       unsigned int format;
+      unsigned int pitch;
+      unsigned int cpp;
 
-      switch (region->cpp) {
-      case 2:
+      switch (surface->format) {
+      case PIPE_FORMAT_Z16_UNORM:
 	 format = BRW_DEPTHFORMAT_D16_UNORM;
+	 cpp = 2;
+	 break;
+      case PIPE_FORMAT_Z24S8_UNORM:
+	 format = BRW_DEPTHFORMAT_D24_UNORM_S8_UINT;
+	 cpp = 4;
 	 break;
-      case 4:
-	 if (intel->depth_buffer_is_float)
-	    format = BRW_DEPTHFORMAT_D32_FLOAT;
-	 else
-	    format = BRW_DEPTHFORMAT_D24_UNORM_S8_UINT;
+      case PIPE_FORMAT_Z32_FLOAT:
+	 format = BRW_DEPTHFORMAT_D32_FLOAT;
+	 cpp = 4;
 	 break;
       default:
 	 assert(0);
-	 return;
+	 return PIPE_ERROR_BAD_INPUT;
       }
 
-      assert(region->tiling != I915_TILING_X);
+      bo = brw_surface_bo(surface);
+      pitch = brw_surface_pitch(surface);
 
       BEGIN_BATCH(len, IGNORE_CLIPRECTS);
       OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2));
-      OUT_BATCH(((region->pitch * region->cpp) - 1) |
+      OUT_BATCH(((pitch * cpp) - 1) |
 		(format << 18) |
 		(BRW_TILEWALK_YMAJOR << 26) |
-		((region->tiling != I915_TILING_NONE) << 27) |
+		((surface->layout != PIPE_SURFACE_LAYOUT_LINEAR) << 27) |
 		(BRW_SURFACE_2D << 29));
-      OUT_RELOC(region->buffer,
+      OUT_RELOC(bo,
 		I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-		0);
+		surface->offset);
       OUT_BATCH((BRW_SURFACE_MIPMAPLAYOUT_BELOW << 1) |
-		((region->pitch - 1) << 6) |
-		((region->height - 1) << 19));
+		((pitch - 1) << 6) |
+		((surface->height - 1) << 19));
       OUT_BATCH(0);
 
       if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))
@@ -253,6 +272,8 @@ static void emit_depthbuffer(struct brw_context *brw)
 
       ADVANCE_BATCH();
    }
+
+   return 0;
 }
 
 const struct brw_tracked_state brw_depthbuffer = {
@@ -271,37 +292,15 @@ const struct brw_tracked_state brw_depthbuffer = {
  * Polygon stipple packet
  */
 
-static void upload_polygon_stipple(struct brw_context *brw)
+static int upload_polygon_stipple(struct brw_context *brw)
 {
-   struct brw_polygon_stipple bps;
-   GLuint i;
-
-   memset(&bps, 0, sizeof(bps));
-   bps.header.opcode = CMD_POLY_STIPPLE_PATTERN;
-   bps.header.length = sizeof(bps)/4-2;
-
-   /* Polygon stipple is provided in OpenGL order, i.e. bottom
-    * row first.  If we're rendering to a window (i.e. the
-    * default frame buffer object, 0), then we need to invert
-    * it to match our pixel layout.  But if we're rendering
-    * to a FBO (i.e. any named frame buffer object), we *don't*
-    * need to invert - we already match the layout.
-    */
-   if (ctx->DrawBuffer->Name == 0) {
-      for (i = 0; i < 32; i++)
-         bps.stipple[i] = ctx->PolygonStipple[31 - i]; /* invert */
-   }
-   else {
-      for (i = 0; i < 32; i++)
-         bps.stipple[i] = ctx->PolygonStipple[i]; /* don't invert */
-   }
-
-   BRW_CACHED_BATCH_STRUCT(brw, &bps);
+   BRW_CACHED_BATCH_STRUCT(brw, &brw->curr.bps);
+   return 0;
 }
 
 const struct brw_tracked_state brw_polygon_stipple = {
    .dirty = {
-      .mesa = _NEW_POLYGONSTIPPLE,
+      .mesa = PIPE_NEW_POLYGON_STIPPLE,
       .brw = 0,
       .cache = 0
    },
@@ -313,37 +312,26 @@ const struct brw_tracked_state brw_polygon_stipple = {
  * Polygon stipple offset packet
  */
 
-static void upload_polygon_stipple_offset(struct brw_context *brw)
+static int upload_polygon_stipple_offset(struct brw_context *brw)
 {
    struct brw_polygon_stipple_offset bpso;
 
+   /* This is invariant state in gallium:
+    */
    memset(&bpso, 0, sizeof(bpso));
    bpso.header.opcode = CMD_POLY_STIPPLE_OFFSET;
    bpso.header.length = sizeof(bpso)/4-2;
-
-   /* Never need to offset stipple coordinates.
-    *
-    * XXX: is it ever necessary to invert Y values?
-    */
-   if (0) {
-      int x = 0, y = 0, h = 0;
-      bpso.bits0.x_offset = (32 - (x & 31)) & 31;
-      bpso.bits0.y_offset = (32 - ((y + h) & 31)) & 31;
-   }
-   else {
-      bpso.bits0.y_offset = 0;
-      bpso.bits0.x_offset = 0;
-   }
+   bpso.bits0.y_offset = 0;
+   bpso.bits0.x_offset = 0;
 
    BRW_CACHED_BATCH_STRUCT(brw, &bpso);
+   return 0;
 }
 
-#define _NEW_WINDOW_POS 0x40000000
-
 const struct brw_tracked_state brw_polygon_stipple_offset = {
    .dirty = {
-      .mesa = _NEW_WINDOW_POS,
-      .brw = 0,
+      .mesa = 0,
+      .brw = BRW_NEW_CONTEXT,
       .cache = 0
    },
    .emit = upload_polygon_stipple_offset
@@ -352,12 +340,12 @@ const struct brw_tracked_state brw_polygon_stipple_offset = {
 /**********************************************************************
  * AA Line parameters
  */
-static void upload_aa_line_parameters(struct brw_context *brw)
+static int upload_aa_line_parameters(struct brw_context *brw)
 {
    struct brw_aa_line_parameters balp;
    
    if (BRW_IS_965(brw))
-      return;
+      return 0;
 
    /* use legacy aa line coverage computation */
    memset(&balp, 0, sizeof(balp));
@@ -365,6 +353,7 @@ static void upload_aa_line_parameters(struct brw_context *brw)
    balp.header.length = sizeof(balp) / 4 - 2;
    
    BRW_CACHED_BATCH_STRUCT(brw, &balp);
+   return 0;
 }
 
 const struct brw_tracked_state brw_aa_line_parameters = {
@@ -380,31 +369,16 @@ const struct brw_tracked_state brw_aa_line_parameters = {
  * Line stipple packet
  */
 
-static void upload_line_stipple(struct brw_context *brw)
+static int upload_line_stipple(struct brw_context *brw)
 {
-   struct brw_line_stipple bls;
-   GLfloat tmp;
-   GLint tmpi;
-
-   memset(&bls, 0, sizeof(bls));
-   bls.header.opcode = CMD_LINE_STIPPLE_PATTERN;
-   bls.header.length = sizeof(bls)/4 - 2;
-
-   bls.bits0.pattern = ctx->Line.StipplePattern;
-   bls.bits1.repeat_count = ctx->Line.StippleFactor;
-
-   tmp = 1.0 / (GLfloat) ctx->Line.StippleFactor;
-   tmpi = tmp * (1<<13);
-
-
-   bls.bits1.inverse_repeat_count = tmpi;
-
-   BRW_CACHED_BATCH_STRUCT(brw, &bls);
+   struct brw_line_stipple *bls = NULL; //brw->curr.rast->bls;
+   BRW_CACHED_BATCH_STRUCT(brw, bls);
+   return 0;
 }
 
 const struct brw_tracked_state brw_line_stipple = {
    .dirty = {
-      .mesa = _NEW_LINE,
+      .mesa = PIPE_NEW_RAST,
       .brw = 0,
       .cache = 0
    },
@@ -416,7 +390,7 @@ const struct brw_tracked_state brw_line_stipple = {
  * Misc invarient state packets
  */
 
-static void upload_invarient_state( struct brw_context *brw )
+static int upload_invarient_state( struct brw_context *brw )
 {
    {
       /* 0x61040000  Pipeline Select */
@@ -424,7 +398,10 @@ static void upload_invarient_state( struct brw_context *brw )
       struct brw_pipeline_select ps;
 
       memset(&ps, 0, sizeof(ps));
-      ps.header.opcode = CMD_PIPELINE_SELECT(brw);
+      if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))
+	 ps.header.opcode = CMD_PIPELINE_SELECT_GM45;
+      else
+	 ps.header.opcode = CMD_PIPELINE_SELECT_965;
       ps.header.pipeline_select = 0;
       BRW_BATCH_STRUCT(brw, &ps);
    }
@@ -460,12 +437,18 @@ static void upload_invarient_state( struct brw_context *brw )
       struct brw_vf_statistics vfs;
       memset(&vfs, 0, sizeof(vfs));
 
-      vfs.opcode = CMD_VF_STATISTICS(brw);
-      if (INTEL_DEBUG & DEBUG_STATS)
+      if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) 
+	 vfs.opcode = CMD_VF_STATISTICS_GM45;
+      else 
+	 vfs.opcode = CMD_VF_STATISTICS_965;
+
+      if (BRW_DEBUG & DEBUG_STATS)
 	 vfs.statistics_enable = 1; 
 
       BRW_BATCH_STRUCT(brw, &vfs);
    }
+   
+   return 0;
 }
 
 const struct brw_tracked_state brw_invarient_state = {
@@ -485,7 +468,7 @@ const struct brw_tracked_state brw_invarient_state = {
  * state pools.  This comes at the expense of memory, and more expensive cache
  * misses.
  */
-static void upload_state_base_address( struct brw_context *brw )
+static int upload_state_base_address( struct brw_context *brw )
 {
    /* Output the structure (brw_state_base_address) directly to the
     * batchbuffer, so we can emit relocations inline.
@@ -511,6 +494,7 @@ static void upload_state_base_address( struct brw_context *brw )
        OUT_BATCH(1); /* Indirect object upper bound */
        ADVANCE_BATCH();
    }
+   return 0;
 }
 
 const struct brw_tracked_state brw_state_base_address = {
diff --git a/src/gallium/drivers/i965/brw_pipe_blend.c b/src/gallium/drivers/i965/brw_pipe_blend.c
index 17895d2782..54d09d9e45 100644
--- a/src/gallium/drivers/i965/brw_pipe_blend.c
+++ b/src/gallium/drivers/i965/brw_pipe_blend.c
@@ -43,3 +43,22 @@
    if (INTEL_DEBUG & DEBUG_STATS)
       cc.cc5.statistics_enable = 1;
 }
+/* Rebuild the CMD_BLEND_CONSTANT_COLOR packet in brw->curr.blend_color.bcc: */
+/* zero it, set header opcode/length, copy blend_color[0..3] into it, and    */
+/* then flag PIPE_NEW_BLEND_COLOR in brw->state.dirty.pipe.                  */
+static void brw_set_blend_color(struct pipe_context *pipe,
+				const float *blend_color)
+{
+   struct brw_context *brw = brw_context(pipe);
+   struct brw_blend_constant_color *bcc = &brw->curr.blend_color.bcc;
+
+   memset(bcc, 0, sizeof(*bcc));      
+   bcc->header.opcode = CMD_BLEND_CONSTANT_COLOR;
+   bcc->header.length = sizeof(*bcc)/4-2;
+   bcc->blend_constant_color[0] = blend_color[0];
+   bcc->blend_constant_color[1] = blend_color[1];
+   bcc->blend_constant_color[2] = blend_color[2];
+   bcc->blend_constant_color[3] = blend_color[3];
+
+   brw->state.dirty.pipe |= PIPE_NEW_BLEND_COLOR;
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_rast.c b/src/gallium/drivers/i965/brw_pipe_rast.c
index ff64dbd48d..86822d478a 100644
--- a/src/gallium/drivers/i965/brw_pipe_rast.c
+++ b/src/gallium/drivers/i965/brw_pipe_rast.c
@@ -44,3 +44,23 @@ calculate_clip_key_rast()
       }
    }
 }
+/* Fill bls with a CMD_LINE_STIPPLE_PATTERN packet built from brw->curr.rast. */
+/* NOTE(review): bls and brw are not declared in this hunk -- confirm scope.  */
+static void
+calculate_line_stipple_rast()
+{
+   GLfloat tmp;
+   GLint tmpi;
+
+   memset(&bls, 0, sizeof(bls));
+   bls.header.opcode = CMD_LINE_STIPPLE_PATTERN;
+   bls.header.length = sizeof(bls)/4 - 2;
+   bls.bits0.pattern = brw->curr.rast.line_stipple_pattern;
+   bls.bits1.repeat_count = brw->curr.rast.line_stipple_factor + 1;
+   /* inverse_repeat_count = (1.0 / repeat_count) * (1<<13), stored via GLint */
+   tmp = 1.0 / (GLfloat) bls.bits1.repeat_count;
+   tmpi = tmp * (1<<13);
+
+   bls.bits1.inverse_repeat_count = tmpi;
+
+}
diff --git a/src/gallium/drivers/i965/brw_screen.h b/src/gallium/drivers/i965/brw_screen.h
index b0be0e1f8a..eafd8ddf77 100644
--- a/src/gallium/drivers/i965/brw_screen.h
+++ b/src/gallium/drivers/i965/brw_screen.h
@@ -95,4 +95,11 @@ brw_buffer_is_user_buffer( const struct pipe_buffer *buf )
    return ((const struct brw_buffer *)buf)->is_user_buffer;
 }
 
+struct brw_winsys_buffer *
+brw_surface_bo( struct pipe_surface *surface );
+
+unsigned
+brw_surface_pitch( const struct pipe_surface *surface );
+
+
 #endif /* BRW_SCREEN_H */
diff --git a/src/gallium/drivers/i965/brw_sf.c b/src/gallium/drivers/i965/brw_sf.c
index e2db2e76e6..1b73b3fd51 100644
--- a/src/gallium/drivers/i965/brw_sf.c
+++ b/src/gallium/drivers/i965/brw_sf.c
@@ -131,7 +131,7 @@ static void upload_sf_prog(struct brw_context *brw)
    /* Populate the key, noting state dependencies:
     */
    /* CACHE_NEW_VS_PROG */
-   key.attrs = brw->vs.prog_data->outputs_written; 
+   key.attrs = brw->vs.prog_data->nr_outputs_written; 
 
    /* BRW_NEW_REDUCED_PRIMITIVE */
    switch (brw->reduced_primitive) {
diff --git a/src/gallium/drivers/i965/brw_state.h b/src/gallium/drivers/i965/brw_state.h
index 9bf34c3fe4..663fc839df 100644
--- a/src/gallium/drivers/i965/brw_state.h
+++ b/src/gallium/drivers/i965/brw_state.h
@@ -33,9 +33,11 @@
 #ifndef BRW_STATE_H
 #define BRW_STATE_H
 
-#include "brw_context.h"
+#include "pipe/p_error.h"
 #include "util/u_memory.h"
 
+#include "brw_context.h"
+
 static inline void
 brw_add_validated_bo(struct brw_context *brw, struct brw_winsys_buffer *bo)
 {
diff --git a/src/gallium/drivers/i965/brw_urb.c b/src/gallium/drivers/i965/brw_urb.c
index 18d79c5ebb..a2277519ad 100644
--- a/src/gallium/drivers/i965/brw_urb.c
+++ b/src/gallium/drivers/i965/brw_urb.c
@@ -221,7 +221,7 @@ const struct brw_tracked_state brw_recalculate_urb_fence = {
 
 
-void brw_upload_urb_fence(struct brw_context *brw)
+int brw_upload_urb_fence(struct brw_context *brw)
 {
    struct brw_urb_fence uf;
    memset(&uf, 0, sizeof(uf));
@@ -247,4 +247,5 @@ void brw_upload_urb_fence(struct brw_context *brw)
    uf.bits1.cs_fence  = URB_SIZES(brw);
 
    BRW_BATCH_STRUCT(brw, &uf);
+   return 0;
 }
diff --git a/src/gallium/drivers/i965/brw_vs.c b/src/gallium/drivers/i965/brw_vs.c
index dcd687ac34..010ac115d3 100644
--- a/src/gallium/drivers/i965/brw_vs.c
+++ b/src/gallium/drivers/i965/brw_vs.c
@@ -51,11 +51,11 @@ static void do_vs_prog( struct brw_context *brw,
    brw_init_compile(brw, &c.func);
    c.vp = vp;
 
-   c.prog_data.outputs_written = vp->program.Base.OutputsWritten;
+   c.prog_data.nr_outputs_written = vp->program.Base.OutputsWritten;
    c.prog_data.inputs_read = vp->program.Base.InputsRead;
 
    if (c.key.copy_edgeflag) {
-      c.prog_data.outputs_written |= 1<<VERT_RESULT_EDGE;
+      c.prog_data.nr_outputs_written |= 1<<VERT_RESULT_EDGE;
       c.prog_data.inputs_read |= 1<<VERT_ATTRIB_EDGEFLAG;
    }
 
diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c
index e946944295..086f54799e 100644
--- a/src/gallium/drivers/i965/brw_vs_emit.c
+++ b/src/gallium/drivers/i965/brw_vs_emit.c
@@ -140,35 +140,33 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
    c->first_overflow_output = 0;
 
    if (BRW_IS_IGDNG(c->func.brw))
-       mrf = 8;
+      mrf = 8;
    else
-       mrf = 4;
+      mrf = 4;
 
-   for (i = 0; i < VERT_RESULT_MAX; i++) {
-      if (c->prog_data.outputs_written & (1 << i)) {
-	 c->nr_outputs++;
-         assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
-	 if (i == VERT_RESULT_HPOS) {
-	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
-	    reg++;
+   for (i = 0; i < c->prog_data.nr_outputs_written; i++) {
+      c->nr_outputs++;
+      assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
+      if (i == VERT_RESULT_HPOS) {
+	 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
+	 reg++;
+      }
+      else if (i == VERT_RESULT_PSIZ) {
+	 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
+	 reg++;
+	 mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
+      }
+      else {
+	 if (mrf < 16) {
+	    c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
+	    mrf++;
 	 }
-	 else if (i == VERT_RESULT_PSIZ) {
+	 else {
+	    /* too many vertex results to fit in MRF, use GRF for overflow */
+	    if (!c->first_overflow_output)
+	       c->first_overflow_output = i;
 	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 	    reg++;
-	    mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
-	 }
-	 else {
-            if (mrf < 16) {
-               c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
-               mrf++;
-            }
-            else {
-               /* too many vertex results to fit in MRF, use GRF for overflow */
-               if (!c->first_overflow_output)
-                  c->first_overflow_output = i;
-               c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
-               reg++;
-            }
 	 }
       }
    }     
@@ -238,9 +236,9 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
    attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);
 
    if (BRW_IS_IGDNG(c->func.brw))
-       c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
+      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
    else
-       c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;
+      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;
 
    c->prog_data.total_grf = reg;
 
@@ -1050,8 +1048,9 @@ static void emit_vertex_write( struct brw_vs_compile *c)
    /* Update the header for point size, user clipping flags, and -ve rhw
     * workaround.
     */
-   if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
-       c->key.nr_userclip || BRW_IS_965(p->brw))
+   if (c->prog_data.writes_psiz ||
+       c->key.nr_userclip || 
+       BRW_IS_965(p->brw))
    {
       struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
       GLuint i;
@@ -1060,7 +1059,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 
       brw_set_access_mode(p, BRW_ALIGN_16);	
 
-      if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) {
+      if (c->prog_data.writes_psiz) {
 	 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
 	 brw_MUL(p, brw_writemask(header1, BRW_WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
 	 brw_AND(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
@@ -1149,12 +1148,10 @@ static void emit_vertex_write( struct brw_vs_compile *c)
        * at mrf[4] atm...
        */
       GLuint i, mrf = 0;
-      for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
-         if (c->prog_data.outputs_written & (1 << i)) {
-            /* move from GRF to MRF */
-            brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]);
-            mrf++;
-         }
+      for (i = c->first_overflow_output; i < c->prog_data.nr_outputs_written; i++) {
+	 /* move from GRF to MRF */
+	 brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]);
+	 mrf++;
       }
 
       brw_urb_WRITE(p,
diff --git a/src/gallium/drivers/i965/brw_wm.c b/src/gallium/drivers/i965/brw_wm.c
index 4948ea0dff..764708f7df 100644
--- a/src/gallium/drivers/i965/brw_wm.c
+++ b/src/gallium/drivers/i965/brw_wm.c
@@ -310,7 +310,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
    }
 
    /* CACHE_NEW_VS_PROG */
-   key->vp_outputs_written = brw->vs.prog_data->outputs_written; /* bitmask */
+   key->vp_nr_outputs_written = brw->vs.prog_data->nr_outputs_written;
 
    /* The unique fragment program ID */
    key->program_string_id = fp->id;
diff --git a/src/gallium/drivers/i965/brw_wm.h b/src/gallium/drivers/i965/brw_wm.h
index e06de95a8a..bf241f5fa4 100644
--- a/src/gallium/drivers/i965/brw_wm.h
+++ b/src/gallium/drivers/i965/brw_wm.h
@@ -76,7 +76,7 @@ struct brw_wm_prog_key {
 
    GLuint program_string_id:32;
    GLuint drawable_height;
-   GLuint vp_outputs_written;
+   GLuint vp_nr_outputs_written;
 };
 
 
-- 
cgit v1.2.3


From 5a304995e09d8dbfd40a2dfab32eacb7e85798e3 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Mon, 26 Oct 2009 01:11:36 +0000
Subject: i965g: still working on compilation

---
 src/gallium/drivers/i965/brw_context.h          |  15 +-
 src/gallium/drivers/i965/brw_gs.c               |   2 +-
 src/gallium/drivers/i965/brw_swtnl.c            | 144 ++---
 src/gallium/drivers/i965/brw_urb.c              |   5 +-
 src/gallium/drivers/i965/brw_vs.c               |  31 +-
 src/gallium/drivers/i965/brw_vs.h               |  14 +-
 src/gallium/drivers/i965/brw_vs_emit.c          | 733 ++++++++++++------------
 src/gallium/drivers/i965/brw_wm_glsl.c          |   4 +-
 src/gallium/drivers/i965/brw_wm_surface_state.c |   7 +-
 9 files changed, 485 insertions(+), 470 deletions(-)

(limited to 'src/gallium/drivers/i965/brw_vs.c')

diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h
index 10c1cf6f33..8aaf895d20 100644
--- a/src/gallium/drivers/i965/brw_context.h
+++ b/src/gallium/drivers/i965/brw_context.h
@@ -154,6 +154,7 @@ struct brw_vertex_shader {
    const struct tgsi_token *tokens;
    struct tgsi_shader_info info;
 
+   unsigned id;
    struct brw_winsys_buffer *const_buffer;    /** Program constant buffer/surface */
    GLboolean use_const_buffer;
 };
@@ -165,6 +166,7 @@ struct brw_fragment_shader {
 
    GLboolean isGLSL;
 
+   unsigned id;
    struct brw_winsys_buffer *const_buffer;    /** Program constant buffer/surface */
    GLboolean use_const_buffer;
 };
@@ -280,10 +282,13 @@ struct brw_vs_prog_data {
    GLuint curb_read_length;
    GLuint urb_read_length;
    GLuint total_grf;
-   GLuint nr_outputs_written;
-   GLuint nr_params;       /**< number of float params/constants */
 
-   GLuint inputs_read;
+   GLuint nr_outputs;
+   GLuint nr_inputs;
+
+   GLuint nr_params;       /**< number of TGSI_FILE_CONSTANT's */
+
+   GLboolean copy_edgeflag;
 
    /* Used for calculating urb partitions:
     */
@@ -475,8 +480,8 @@ struct brw_context
    /* Active state from the state tracker: 
     */
    struct {
-      const struct brw_vertex_shader *vertex_shader;
-      const struct brw_fragment_shader *fragment_shader;
+      struct brw_vertex_shader *vertex_shader;
+      struct brw_fragment_shader *fragment_shader;
       const struct brw_blend_state *blend;
       const struct brw_rasterizer_state *rast;
       const struct brw_depth_stencil_state *zstencil;
diff --git a/src/gallium/drivers/i965/brw_gs.c b/src/gallium/drivers/i965/brw_gs.c
index 3ecaa74e4f..693d8bfdf8 100644
--- a/src/gallium/drivers/i965/brw_gs.c
+++ b/src/gallium/drivers/i965/brw_gs.c
@@ -148,7 +148,7 @@ static void populate_key( struct brw_context *brw,
    memset(key, 0, sizeof(*key));
 
    /* CACHE_NEW_VS_PROG */
-   key->nr_attrs = brw->vs.prog_data->nr_outputs_written;
+   key->nr_attrs = brw->vs.prog_data->nr_outputs;
 
    /* BRW_NEW_PRIMITIVE */
    key->primitive = gs_prim[brw->primitive];
diff --git a/src/gallium/drivers/i965/brw_swtnl.c b/src/gallium/drivers/i965/brw_swtnl.c
index d2df8af9f4..464013e7c4 100644
--- a/src/gallium/drivers/i965/brw_swtnl.c
+++ b/src/gallium/drivers/i965/brw_swtnl.c
@@ -1,111 +1,93 @@
 
-/* XXX: could split the primitive list to fallback only on the
- * non-conformant primitives.
- */
-static GLboolean check_fallbacks( struct brw_context *brw,
-				  const struct _mesa_prim *prim,
-				  GLuint nr_prims )
+#include "brw_context.h"
+#include "brw_pipe_rast.h"
+
+
+static GLboolean need_swtnl( struct brw_context *brw )
 {
-   GLuint i;
+   const struct pipe_rasterizer_state *rast = &brw->curr.rast->templ;
 
    /* If we don't require strict OpenGL conformance, never 
     * use fallbacks.  If we're forcing fallbacks, always
     * use fallfacks.
     */
    if (brw->flags.no_swtnl)
-      return GL_FALSE;
+      return FALSE;
 
    if (brw->flags.force_swtnl)
-      return GL_TRUE;
+      return TRUE;
 
-   if (brw->curr.rast->tmpl.smooth_polys) {
-      for (i = 0; i < nr_prims; i++)
-	 if (reduced_prim[prim[i].mode] == GL_TRIANGLES) 
-	    return GL_TRUE;
+   /* Exceeding hw limits on number of VS inputs?
+    */
+   if (brw->curr.num_vertex_elements == 0 ||
+       brw->curr.num_vertex_elements >= BRW_VEP_MAX) {
+      return TRUE;
    }
 
-   /* BRW hardware will do AA lines, but they are non-conformant it
-    * seems.  TBD whether we keep this fallback:
+   /* Position array with zero stride?
+    *
+    * XXX: position isn't always at zero...
+    * XXX: eliminate zero-stride arrays
     */
-   if (ctx->Line.SmoothFlag) {
-      for (i = 0; i < nr_prims; i++)
-	 if (reduced_prim[prim[i].mode] == GL_LINES) 
-	    return GL_TRUE;
+   {
+      int ve0_vb = brw->curr.vertex_element[0].vertex_buffer_index;
+      
+      if (brw->curr.vertex_buffer[ve0_vb].stride == 0)
+	 return TRUE;
    }
 
-   /* Stipple -- these fallbacks could be resolved with a little
-    * bit of work?
+   /* XXX: short-circuit
     */
-   if (ctx->Line.StippleFlag) {
-      for (i = 0; i < nr_prims; i++) {
-	 /* GS doesn't get enough information to know when to reset
-	  * the stipple counter?!?
-	  */
-	 if (prim[i].mode == GL_LINE_LOOP || prim[i].mode == GL_LINE_STRIP) 
-	    return GL_TRUE;
-	    
-	 if (prim[i].mode == GL_POLYGON &&
-	     (ctx->Polygon.FrontMode == GL_LINE ||
-	      ctx->Polygon.BackMode == GL_LINE))
-	    return GL_TRUE;
-      }
-   }
+   return FALSE;
 
-   if (ctx->Point.SmoothFlag) {
-      for (i = 0; i < nr_prims; i++)
-	 if (prim[i].mode == GL_POINTS) 
-	    return GL_TRUE;
-   }
+   if (brw->reduced_primitive == PIPE_PRIM_TRIANGLES) {
+      if (rast->poly_smooth)
+	 return TRUE;
 
-   /* BRW hardware doesn't handle GL_CLAMP texturing correctly;
-    * brw_wm_sampler_state:translate_wrap_mode() treats GL_CLAMP
-    * as GL_CLAMP_TO_EDGE instead.  If we're using GL_CLAMP, and
-    * we want strict conformance, force the fallback.
-    * Right now, we only do this for 2D textures.
-    */
+   }
+   
+   if (brw->reduced_primitive == PIPE_PRIM_LINES ||
+       (brw->reduced_primitive == PIPE_PRIM_TRIANGLES &&
+	(rast->fill_cw == PIPE_POLYGON_MODE_LINE ||
+	 rast->fill_ccw == PIPE_POLYGON_MODE_LINE)))
    {
-      int u;
-      for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
-         struct gl_texture_unit *texUnit = &ctx->Texture.Unit[u];
-         if (texUnit->Enabled) {
-            if (texUnit->Enabled & TEXTURE_1D_BIT) {
-               if (texUnit->CurrentTex[TEXTURE_1D_INDEX]->WrapS == GL_CLAMP) {
-                   return GL_TRUE;
-               }
-            }
-            if (texUnit->Enabled & TEXTURE_2D_BIT) {
-               if (texUnit->CurrentTex[TEXTURE_2D_INDEX]->WrapS == GL_CLAMP ||
-                   texUnit->CurrentTex[TEXTURE_2D_INDEX]->WrapT == GL_CLAMP) {
-                   return GL_TRUE;
-               }
-            }
-            if (texUnit->Enabled & TEXTURE_3D_BIT) {
-               if (texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapS == GL_CLAMP ||
-                   texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapT == GL_CLAMP ||
-                   texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapR == GL_CLAMP) {
-                   return GL_TRUE;
-               }
-            }
-         }
-      }
+      /* BRW hardware will do AA lines, but they are non-conformant it
+       * seems.  TBD whether we keep this fallback:
+       */
+      if (rast->line_smooth)
+	 return TRUE;
+
+      /* XXX: was a fallback in mesa (gs doesn't get enough
+       * information to know when to reset stipple counter), but there
+       * must be a way around it.
+       */
+      if (rast->line_stipple_enable &&
+	  (brw->reduced_primitive == PIPE_PRIM_TRIANGLES ||
+	   brw->primitive == PIPE_PRIM_LINE_LOOP || 
+	   brw->primitive == PIPE_PRIM_LINE_STRIP))
+	 return TRUE;
    }
 
-   /* Exceeding hw limits on number of VS inputs?
-    */
-   if (brw->nr_ve == 0 ||
-       brw->nr_ve >= BRW_VEP_MAX) {
-      return TRUE;
+   
+   if (brw->reduced_primitive == PIPE_PRIM_POINTS ||
+       (brw->reduced_primitive == PIPE_PRIM_TRIANGLES &&
+	(rast->fill_cw == PIPE_POLYGON_MODE_POINT ||
+	 rast->fill_ccw == PIPE_POLYGON_MODE_POINT)))
+   {
+      if (rast->point_smooth)
+	 return TRUE;
    }
 
-   /* Position array with zero stride?
+   /* BRW hardware doesn't handle CLAMP texturing correctly;
+    * brw_wm_sampler_state:translate_wrap_mode() treats CLAMP
+    * as CLAMP_TO_EDGE instead.  If we're using CLAMP, and
+    * we want strict conformance, force the fallback.
+    *
+    * XXX: need a workaround for this.
     */
-   if (brw->vs[brw->ve[0]]->stride == 0)
-      return TRUE;
-
-
       
    /* Nothing stopping us from the fast path now */
-   return GL_FALSE;
+   return FALSE;
 }
 
 
diff --git a/src/gallium/drivers/i965/brw_urb.c b/src/gallium/drivers/i965/brw_urb.c
index ff2466528d..57fd8f20b2 100644
--- a/src/gallium/drivers/i965/brw_urb.c
+++ b/src/gallium/drivers/i965/brw_urb.c
@@ -35,6 +35,7 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
+#include "brw_debug.h"
 
 #define VS 0
 #define GS 1
@@ -111,7 +112,7 @@ static GLboolean check_urb_layout( struct brw_context *brw )
 /* Most minimal update, forces re-emit of URB fence packet after GS
  * unit turned on/off.
  */
-static void recalculate_urb_fence( struct brw_context *brw )
+static int recalculate_urb_fence( struct brw_context *brw )
 {
    GLuint csize = brw->curbe.total_size;
    GLuint vsize = brw->vs.prog_data->urb_entry_size;
@@ -204,6 +205,8 @@ done:
       
       brw->state.dirty.brw |= BRW_NEW_URB_FENCE;
    }
+
+   return 0;
 }
 
 
diff --git a/src/gallium/drivers/i965/brw_vs.c b/src/gallium/drivers/i965/brw_vs.c
index 010ac115d3..3965ca6c54 100644
--- a/src/gallium/drivers/i965/brw_vs.c
+++ b/src/gallium/drivers/i965/brw_vs.c
@@ -28,17 +28,19 @@
   * Authors:
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
-           
+
+#include "tgsi/tgsi_dump.h"           
 
 #include "brw_context.h"
 #include "brw_vs.h"
 #include "brw_util.h"
 #include "brw_state.h"
+#include "brw_pipe_rast.h"
 
 
 static void do_vs_prog( struct brw_context *brw, 
-			struct brw_vertex_program *vp,
+			struct brw_vertex_shader *vp,
 			struct brw_vs_prog_key *key )
 {
    GLuint program_size;
@@ -51,16 +53,12 @@ static void do_vs_prog( struct brw_context *brw,
    brw_init_compile(brw, &c.func);
    c.vp = vp;
 
-   c.prog_data.nr_outputs_written = vp->program.Base.OutputsWritten;
-   c.prog_data.inputs_read = vp->program.Base.InputsRead;
-
-   if (c.key.copy_edgeflag) {
-      c.prog_data.nr_outputs_written |= 1<<VERT_RESULT_EDGE;
-      c.prog_data.inputs_read |= 1<<VERT_ATTRIB_EDGEFLAG;
-   }
+   c.prog_data.nr_outputs = vp->info.num_outputs;
+   c.prog_data.nr_inputs = vp->info.num_inputs;
+   c.prog_data.copy_edgeflag = c.key.copy_edgeflag;
 
    if (0)
-      tgsi_dump(&c.vp->tokens, 0);
+      tgsi_dump(c.vp->tokens, 0);
 
    /* Emit GEN4 code.
     */
@@ -80,11 +78,10 @@ static void do_vs_prog( struct brw_context *brw,
 }
 
 
-static void brw_upload_vs_prog(struct brw_context *brw)
+static int brw_upload_vs_prog(struct brw_context *brw)
 {
    struct brw_vs_prog_key key;
-   struct brw_vertex_program *vp = 
-      (struct brw_vertex_program *)brw->vertex_program;
+   struct brw_vertex_shader *vp = brw->curr.vertex_shader;
 
    memset(&key, 0, sizeof(key));
 
@@ -92,9 +89,9 @@ static void brw_upload_vs_prog(struct brw_context *brw)
     * the inputs it asks for, whether they are varying or not.
     */
    key.program_string_id = vp->id;
-   key.nr_userclip = brw->nr_userclip;
-   key.copy_edgeflag = (brw->rast->fill_ccw != PIPE_POLYGON_MODE_FILL ||
-			brw->rast->fill_cw != PIPE_POLYGON_MODE_FILL);
+   key.nr_userclip = brw->curr.ucp.nr;
+   key.copy_edgeflag = (brw->curr.rast->templ.fill_ccw != PIPE_POLYGON_MODE_FILL ||
+			brw->curr.rast->templ.fill_cw != PIPE_POLYGON_MODE_FILL);
 
    /* Make an early check for the key.
     */
@@ -105,6 +102,8 @@ static void brw_upload_vs_prog(struct brw_context *brw)
 				      &brw->vs.prog_data);
    if (brw->vs.prog_bo == NULL)
       do_vs_prog(brw, vp, &key);
+
+   return 0;
 }
 
 
diff --git a/src/gallium/drivers/i965/brw_vs.h b/src/gallium/drivers/i965/brw_vs.h
index e33fa2f0aa..58119567dc 100644
--- a/src/gallium/drivers/i965/brw_vs.h
+++ b/src/gallium/drivers/i965/brw_vs.h
@@ -46,17 +46,22 @@ struct brw_vs_prog_key {
 };
 
 
+
+#define MAX_IF_DEPTH 32
+#define MAX_LOOP_DEPTH 32
+
 struct brw_vs_compile {
    struct brw_compile func;
    struct brw_vs_prog_key key;
    struct brw_vs_prog_data prog_data;
 
-   struct brw_vertex_program *vp;
+   struct brw_vertex_shader *vp;
 
    GLuint nr_inputs;
+   GLuint nr_outputs;
+   GLboolean copy_edgeflag;
 
    GLuint first_output;
-   GLuint nr_outputs;
    GLuint first_overflow_output; /**< VERT_ATTRIB_x */
 
    GLuint first_tmp;
@@ -80,8 +85,13 @@ struct brw_vs_compile {
       GLint index;
       struct brw_reg reg;
    } current_const[3];
+
+   struct brw_instruction *if_inst[MAX_IF_DEPTH];
+   struct brw_instruction *loop_inst[MAX_LOOP_DEPTH];
+
 };
 
+
 void brw_vs_emit( struct brw_vs_compile *c );
 
 #endif
diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c
index 04132a167b..4daa98b29e 100644
--- a/src/gallium/drivers/i965/brw_vs_emit.c
+++ b/src/gallium/drivers/i965/brw_vs_emit.c
@@ -28,11 +28,25 @@
   * Authors:
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
-            
 
 #include "pipe/p_shader_tokens.h"
+            
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "tgsi/tgsi_ureg.h"
+
 #include "brw_context.h"
 #include "brw_vs.h"
+#include "brw_debug.h"
+
+
+struct ureg_instruction {
+   unsigned opcode:8;
+   unsigned tex_target:3;
+   struct ureg_dst dst;
+   struct ureg_src src[3];
+};
 
 
 static struct brw_reg get_tmp( struct brw_vs_compile *c )
@@ -72,8 +86,8 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
     * works if everything fits in the GRF.
     * XXX this heuristic/check may need some fine tuning...
     */
-   if (c->vp->program.Base.Parameters->NumParameters +
-       c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
+   if (c->vp->info.file_max[TGSI_FILE_CONSTANT] +
+       c->vp->info.file_max[TGSI_FILE_TEMPORARY] + 21 > BRW_MAX_GRF)
       c->vp->use_const_buffer = GL_TRUE;
    else
       c->vp->use_const_buffer = GL_FALSE;
@@ -106,25 +120,21 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
    }
    else {
       /* use a section of the GRF for constants */
-      GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
+      GLuint nr_params = c->vp->info.file_max[TGSI_FILE_CONSTANT] + 1;
       for (i = 0; i < nr_params; i++) {
-         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
+         c->regs[TGSI_FILE_CONSTANT][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
       }
       reg += (nr_params + 1) / 2;
       c->prog_data.curb_read_length = reg - 1;
-
       c->prog_data.nr_params = nr_params * 4;
    }
 
    /* Allocate input regs:  
     */
-   c->nr_inputs = 0;
-   for (i = 0; i < VERT_ATTRIB_MAX; i++) {
-      if (c->prog_data.inputs_read & (1 << i)) {
-	 c->nr_inputs++;
-	 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
-	 reg++;
-      }
+   c->nr_inputs = c->vp->info.num_inputs;
+   for (i = 0; i < c->nr_inputs; i++) {
+      c->regs[TGSI_FILE_INPUT][i] = brw_vec8_grf(reg, 0);
+      reg++;
    }
 
    /* If there are no inputs, we'll still be reading one attribute's worth
@@ -144,45 +154,51 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
    else
       mrf = 4;
 
-   for (i = 0; i < c->prog_data.nr_outputs_written; i++) {
-      c->nr_outputs++;
-      assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
-      if (i == VERT_RESULT_HPOS) {
-	 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
+   /* XXX: need to access vertex output semantics here:
+    */
+   c->nr_outputs = c->prog_data.nr_outputs;
+   for (i = 0; i < c->prog_data.nr_outputs; i++) {
+      assert(i < Elements(c->regs[TGSI_FILE_OUTPUT]));
+
+      /* XXX: Hardwire position to zero:
+       */
+      if (i == 0) {
+	 c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
 	 reg++;
       }
-      else if (i == VERT_RESULT_PSIZ) {
-	 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
+      /* XXX: disable psiz:
+       */
+      else if (0) {
+	 c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
 	 reg++;
 	 mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
       }
+      else if (mrf < 16) {
+	 c->regs[TGSI_FILE_OUTPUT][i] = brw_message_reg(mrf);
+	 mrf++;
+      }
       else {
-	 if (mrf < 16) {
-	    c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
-	    mrf++;
-	 }
-	 else {
-	    /* too many vertex results to fit in MRF, use GRF for overflow */
-	    if (!c->first_overflow_output)
-	       c->first_overflow_output = i;
-	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
-	    reg++;
-	 }
+	 /* too many vertex results to fit in MRF, use GRF for overflow */
+	 if (!c->first_overflow_output)
+	    c->first_overflow_output = i;
+	 c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
+	 reg++;
       }
    }     
 
    /* Allocate program temporaries:
     */
-   for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
-      c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
+   
+   for (i = 0; i < c->vp->info.file_max[TGSI_FILE_TEMPORARY]+1; i++) {
+      c->regs[TGSI_FILE_TEMPORARY][i] = brw_vec8_grf(reg, 0);
       reg++;
    }
 
    /* Address reg(s).  Don't try to use the internal address reg until
     * deref time.
     */
-   for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
-      c->regs[PROGRAM_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
+   for (i = 0; i < c->vp->info.file_max[TGSI_FILE_ADDRESS]+1; i++) {
+      c->regs[TGSI_FILE_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
 					     reg,
 					     0,
 					     BRW_REGISTER_TYPE_D,
@@ -243,8 +259,10 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
    c->prog_data.total_grf = reg;
 
    if (BRW_DEBUG & DEBUG_VS) {
-      debug_printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
-      debug_printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
+      debug_printf("%s NumAddrRegs %d\n", __FUNCTION__, 
+		   c->vp->info.file_max[TGSI_FILE_ADDRESS]+1);
+      debug_printf("%s NumTemps %d\n", __FUNCTION__,
+		   c->vp->info.file_max[TGSI_FILE_TEMPORARY]+1);
       debug_printf("%s reg = %d\n", __FUNCTION__, reg);
    }
 }
@@ -740,25 +758,25 @@ static void emit_nrm( struct brw_vs_compile *c,
 
 static struct brw_reg
 get_constant(struct brw_vs_compile *c,
-             const struct prog_instruction *inst,
+             const struct ureg_instruction *inst,
              GLuint argIndex)
 {
-   const struct prog_src_register *src = &inst->SrcReg[argIndex];
+   const struct ureg_src src = inst->src[argIndex];
    struct brw_compile *p = &c->func;
    struct brw_reg const_reg;
    struct brw_reg const2_reg;
-   const GLboolean relAddr = src->RelAddr;
+   const GLboolean relAddr = src.Indirect;
 
    assert(argIndex < 3);
 
-   if (c->current_const[argIndex].index != src->Index || relAddr) {
-      struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
+   if (c->current_const[argIndex].index != src.Index || relAddr) {
+      struct brw_reg addrReg = c->regs[TGSI_FILE_ADDRESS][0];
 
-      c->current_const[argIndex].index = src->Index;
+      c->current_const[argIndex].index = src.Index;
 
 #if 0
       printf("  fetch const[%d] for arg %d into reg %d\n",
-             src->Index, argIndex, c->current_const[argIndex].reg.nr);
+             src.Index, argIndex, c->current_const[argIndex].reg.nr);
 #endif
       /* need to fetch the constant now */
       brw_dp_READ_4_vs(p,
@@ -766,7 +784,7 @@ get_constant(struct brw_vs_compile *c,
                        0,                             /* oword */
                        relAddr,                       /* relative indexing? */
                        addrReg,                       /* address register */
-                       16 * src->Index,               /* byte offset */
+                       16 * src.Index,               /* byte offset */
                        SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
                        );
 
@@ -783,7 +801,7 @@ get_constant(struct brw_vs_compile *c,
                           1,                       /* oword */
                           relAddr,                 /* relative indexing? */
                           addrReg,                 /* address register */
-                          16 * src->Index,         /* byte offset */
+                          16 * src.Index,         /* byte offset */
                           SURF_INDEX_VERT_CONST_BUFFER
                           );
       }
@@ -813,30 +831,24 @@ get_constant(struct brw_vs_compile *c,
 /* TODO: relative addressing!
  */
 static struct brw_reg get_reg( struct brw_vs_compile *c,
-			       gl_register_file file,
+			       enum tgsi_file_type file,
 			       GLuint index )
 {
    switch (file) {
-   case PROGRAM_TEMPORARY:
-   case PROGRAM_INPUT:
-   case PROGRAM_OUTPUT:
+   case TGSI_FILE_TEMPORARY:
+   case TGSI_FILE_INPUT:
+   case TGSI_FILE_OUTPUT:
+   case TGSI_FILE_CONSTANT:
       assert(c->regs[file][index].nr != 0);
       return c->regs[file][index];
-   case PROGRAM_STATE_VAR:
-   case PROGRAM_CONSTANT:
-   case PROGRAM_UNIFORM:
-      assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
-      return c->regs[PROGRAM_STATE_VAR][index];
-   case PROGRAM_ADDRESS:
+
+   case TGSI_FILE_ADDRESS:
       assert(index == 0);
       return c->regs[file][index];
 
-   case PROGRAM_UNDEFINED:			/* undef values */
+   case TGSI_FILE_NULL:			/* undef values */
       return brw_null_reg();
 
-   case PROGRAM_LOCAL_PARAM: 
-   case PROGRAM_ENV_PARAM: 
-   case PROGRAM_WRITE_ONLY:
    default:
       assert(0);
       return brw_null_reg();
@@ -853,7 +865,7 @@ static struct brw_reg deref( struct brw_vs_compile *c,
 {
    struct brw_compile *p = &c->func;
    struct brw_reg tmp = vec4(get_tmp(c));
-   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
+   struct brw_reg addr_reg = c->regs[TGSI_FILE_ADDRESS][0];
    struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
    GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
    struct brw_reg indirect = brw_vec4_indirect(0,0);
@@ -886,17 +898,17 @@ static struct brw_reg deref( struct brw_vs_compile *c,
  */
 static struct brw_reg
 get_src_reg( struct brw_vs_compile *c,
-             const struct prog_instruction *inst,
+             const struct ureg_instruction *inst,
              GLuint argIndex )
 {
-   const GLuint file = inst->SrcReg[argIndex].File;
-   const GLint index = inst->SrcReg[argIndex].Index;
-   const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;
+   const GLuint file = inst->src[argIndex].File;
+   const GLint index = inst->src[argIndex].Index;
+   const GLboolean relAddr = inst->src[argIndex].Indirect;
 
    switch (file) {
-   case PROGRAM_TEMPORARY:
-   case PROGRAM_INPUT:
-   case PROGRAM_OUTPUT:
+   case TGSI_FILE_TEMPORARY:
+   case TGSI_FILE_INPUT:
+   case TGSI_FILE_OUTPUT:
       if (relAddr) {
          return deref(c, c->regs[file][0], index);
       }
@@ -905,30 +917,25 @@ get_src_reg( struct brw_vs_compile *c,
          return c->regs[file][index];
       }
 
-   case PROGRAM_STATE_VAR:
-   case PROGRAM_CONSTANT:
-   case PROGRAM_UNIFORM:
-   case PROGRAM_ENV_PARAM:
+   case TGSI_FILE_CONSTANT:
       if (c->vp->use_const_buffer) {
          return get_constant(c, inst, argIndex);
       }
       else if (relAddr) {
-         return deref(c, c->regs[PROGRAM_STATE_VAR][0], index);
+         return deref(c, c->regs[TGSI_FILE_CONSTANT][0], index);
       }
       else {
-         assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
-         return c->regs[PROGRAM_STATE_VAR][index];
+         assert(c->regs[TGSI_FILE_CONSTANT][index].nr != 0);
+         return c->regs[TGSI_FILE_CONSTANT][index];
       }
-   case PROGRAM_ADDRESS:
+   case TGSI_FILE_ADDRESS:
       assert(index == 0);
       return c->regs[file][index];
 
-   case PROGRAM_UNDEFINED:
+   case TGSI_FILE_NULL:
       /* this is a normal case since we loop over all three src args */
       return brw_null_reg();
 
-   case PROGRAM_LOCAL_PARAM: 
-   case PROGRAM_WRITE_ONLY:
    default:
       assert(0);
       return brw_null_reg();
@@ -959,27 +966,27 @@ static void emit_arl( struct brw_vs_compile *c,
  * Return the brw reg for the given instruction's src argument.
  */
 static struct brw_reg get_arg( struct brw_vs_compile *c,
-                               const struct prog_instruction *inst,
+                               const struct ureg_instruction *inst,
                                GLuint argIndex )
 {
-   const struct prog_src_register *src = &inst->SrcReg[argIndex];
+   const struct ureg_src src = inst->src[argIndex];
    struct brw_reg reg;
 
-   if (src->File == PROGRAM_UNDEFINED)
+   if (src.File == TGSI_FILE_NULL)
       return brw_null_reg();
 
    reg = get_src_reg(c, inst, argIndex);
 
    /* Convert 3-bit swizzle to 2-bit.  
     */
-   reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
-				       GET_SWZ(src->Swizzle, 1),
-				       GET_SWZ(src->Swizzle, 2),
-				       GET_SWZ(src->Swizzle, 3));
+   reg.dw1.bits.swizzle = BRW_SWIZZLE4(src.SwizzleX,
+				       src.SwizzleY,
+				       src.SwizzleZ,
+				       src.SwizzleW);
 
    /* Note this is ok for non-swizzle instructions: 
     */
-   reg.negate = src->Negate ? 1 : 0;   
+   reg.negate = src.Negate ? 1 : 0;   
 
    return reg;
 }
@@ -989,21 +996,21 @@ static struct brw_reg get_arg( struct brw_vs_compile *c,
  * Get brw register for the given program dest register.
  */
 static struct brw_reg get_dst( struct brw_vs_compile *c,
-			       struct prog_dst_register dst )
+			       struct ureg_dst dst )
 {
    struct brw_reg reg;
 
    switch (dst.File) {
-   case PROGRAM_TEMPORARY:
-   case PROGRAM_OUTPUT:
+   case TGSI_FILE_TEMPORARY:
+   case TGSI_FILE_OUTPUT:
       assert(c->regs[dst.File][dst.Index].nr != 0);
       reg = c->regs[dst.File][dst.Index];
       break;
-   case PROGRAM_ADDRESS:
+   case TGSI_FILE_ADDRESS:
       assert(dst.Index == 0);
       reg = c->regs[dst.File][dst.Index];
       break;
-   case PROGRAM_UNDEFINED:
+   case TGSI_FILE_NULL:
       /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
       reg = brw_null_reg();
       break;
@@ -1027,15 +1034,16 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 {
    struct brw_compile *p = &c->func;
    struct brw_reg m0 = brw_message_reg(0);
-   struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
+   struct brw_reg pos = c->regs[TGSI_FILE_OUTPUT][VERT_RESULT_HPOS];
    struct brw_reg ndc;
    int eot;
    GLuint len_vertext_header = 2;
 
    if (c->key.copy_edgeflag) {
+      assert(0);
       brw_MOV(p, 
-	      get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
-	      get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
+	      get_reg(c, TGSI_FILE_OUTPUT, 0),
+	      get_reg(c, TGSI_FILE_INPUT, 0));
    }
 
    /* Build ndc coords */
@@ -1060,7 +1068,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
       brw_set_access_mode(p, BRW_ALIGN_16);	
 
       if (c->prog_data.writes_psiz) {
-	 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
+	 struct brw_reg psiz = c->regs[TGSI_FILE_OUTPUT][VERT_RESULT_PSIZ];
 	 brw_MUL(p, brw_writemask(header1, BRW_WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
 	 brw_AND(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
       }
@@ -1138,7 +1146,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 		 eot, 		/* writes complete */
 		 0, 		/* urb destination offset */
 		 BRW_URB_SWIZZLE_INTERLEAVE);
-!
+
    if (c->first_overflow_output > 0) {
       /* Not all of the vertex outputs/results fit into the MRF.
        * Move the overflowed attributes from the GRF to the MRF and
@@ -1148,9 +1156,9 @@ static void emit_vertex_write( struct brw_vs_compile *c)
        * at mrf[4] atm...
        */
       GLuint i, mrf = 0;
-      for (i = c->first_overflow_output; i < c->prog_data.nr_outputs_written; i++) {
+      for (i = c->first_overflow_output; i < c->prog_data.nr_outputs; i++) {
 	 /* move from GRF to MRF */
-	 brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]);
+	 brw_MOV(p, brw_message_reg(4+mrf), c->regs[TGSI_FILE_OUTPUT][i]);
 	 mrf++;
       }
 
@@ -1195,9 +1203,9 @@ post_vs_emit( struct brw_vs_compile *c,
 }
 
 static uint32_t
-get_predicate(const struct prog_instruction *inst)
+get_predicate(const struct ureg_instruction *inst)
 {
-   if (inst->DstReg.CondMask == COND_TR)
+   if (inst->dst.CondMask == COND_TR)
       return BRW_PREDICATE_NONE;
 
    /* All of GLSL only produces predicates for COND_NE and one channel per
@@ -1213,9 +1221,9 @@ get_predicate(const struct prog_instruction *inst)
     * predicate on that.  We can probably support this, but it won't
     * necessarily be easy.
     */
-   assert(inst->DstReg.CondMask == COND_NE);
+/*   assert(inst->dst.CondMask == COND_NE); */
 
-   switch (inst->DstReg.CondSwizzle) {
+   switch (inst->dst.CondSwizzle) {
    case SWIZZLE_XXXX:
       return BRW_PREDICATE_ALIGN16_REPLICATE_X;
    case SWIZZLE_YYYY:
@@ -1225,26 +1233,281 @@ get_predicate(const struct prog_instruction *inst)
    case SWIZZLE_WWWW:
       return BRW_PREDICATE_ALIGN16_REPLICATE_W;
    default:
-      _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
-		    inst->DstReg.CondMask);
+      debug_printf("Unexpected predicate: 0x%08x\n",
+		    inst->dst.CondMask);
       return BRW_PREDICATE_NORMAL;
    }
 }
 
+static void emit_insn(struct brw_vs_compile *c,
+		      const struct tgsi_full_instruction *insn)
+{
+   struct brw_reg args[3], dst;
+   GLuint i;
+   /* Emit native code for one instruction: fetch operands, dispatch on opcode, apply CondUpdate. */
+#if 0
+   printf("%d: ", insn);
+   _mesa_print_instruction(inst);
+#endif
+   /* NOTE(review): body reads 'inst', 'p', 'index', 'file', 'if_depth', ... -- none declared here; param is 'insn'. */
+   /* Get argument regs.
+    */
+   for (i = 0; i < 3; i++) {
+      const struct ureg_src src = inst->src[i];
+      index = src.Index;
+      file = src.File;	
+      args[i] = get_arg(c, inst, i);
+   }
+
+   /* Get dest regs.  Note that it is possible for a reg to be both
+    * dst and arg, given the static allocation of registers.  So
+    * care needs to be taken emitting multi-operation instructions.
+    */ 
+   index = inst->dst.Index;
+   file = inst->dst.File;
+   dst = get_dst(c, inst->dst);
+
+   if (inst->SaturateMode != SATURATE_OFF) {
+      debug_printf("Unsupported saturate %d in vertex shader",
+		   inst->SaturateMode);
+   }
+
+   switch (inst->Opcode) {
+   case TGSI_OPCODE_ABS:
+      brw_MOV(p, dst, brw_abs(args[0]));
+      break;
+   case TGSI_OPCODE_ADD:
+      brw_ADD(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_COS:
+      emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
+      break;
+   case TGSI_OPCODE_DP3:
+      brw_DP3(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_DP4:
+      brw_DP4(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_DPH:
+      brw_DPH(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_NRM3:
+      emit_nrm(c, dst, args[0], 3);
+      break;
+   case TGSI_OPCODE_NRM4:
+      emit_nrm(c, dst, args[0], 4);
+      break;
+   case TGSI_OPCODE_DST:
+      unalias2(c, dst, args[0], args[1], emit_dst_noalias); 
+      break;
+   case TGSI_OPCODE_EXP:
+      unalias1(c, dst, args[0], emit_exp_noalias);
+      break;
+   case TGSI_OPCODE_EX2:
+      emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
+      break;
+   case TGSI_OPCODE_ARL:
+      emit_arl(c, dst, args[0]);
+      break;
+   case TGSI_OPCODE_FLR:
+      brw_RNDD(p, dst, args[0]);
+      break;
+   case TGSI_OPCODE_FRC:
+      brw_FRC(p, dst, args[0]);
+      break;
+   case TGSI_OPCODE_LOG:
+      unalias1(c, dst, args[0], emit_log_noalias);
+      break;
+   case TGSI_OPCODE_LG2:
+      emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
+      break;
+   case TGSI_OPCODE_LIT:
+      unalias1(c, dst, args[0], emit_lit_noalias);
+      break;
+   case TGSI_OPCODE_LRP:
+      unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
+      break;
+   case TGSI_OPCODE_MAD:
+      brw_MOV(p, brw_acc_reg(), args[2]);
+      brw_MAC(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_MAX:
+      emit_max(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_MIN:
+      emit_min(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_MOV:
+      brw_MOV(p, dst, args[0]);
+      break;
+   case TGSI_OPCODE_MUL:
+      brw_MUL(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_POW:
+      emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL); 
+      break;
+   case TGSI_OPCODE_RCP:
+      emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
+      break;
+   case TGSI_OPCODE_RSQ:
+      emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
+      break;
+   case TGSI_OPCODE_SEQ:
+      emit_seq(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SIN:
+      emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
+      break;
+   case TGSI_OPCODE_SNE:
+      emit_sne(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SGE:
+      emit_sge(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SGT:
+      emit_sgt(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SLT:
+      emit_slt(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SLE:
+      emit_sle(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SUB:
+      brw_ADD(p, dst, args[0], negate(args[1]));
+      break;
+   case TGSI_OPCODE_TRUNC:
+      /* round toward zero */
+      brw_RNDZ(p, dst, args[0]);
+      break;
+   case TGSI_OPCODE_XPD:
+      emit_xpd(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_IF:
+      assert(if_depth < MAX_IF_DEPTH);
+      if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
+      /* Note that brw_IF smashes the predicate_control field. */
+      if_inst[if_depth]->header.predicate_control = get_predicate(inst);
+      if_depth++;
+      break;
+   case TGSI_OPCODE_ELSE:
+      if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
+      break;
+   case TGSI_OPCODE_ENDIF:
+      assert(if_depth > 0);
+      brw_ENDIF(p, if_inst[--if_depth]);
+      break;			
+   case TGSI_OPCODE_BGNLOOP:
+      loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
+      break;
+   case TGSI_OPCODE_BRK:
+      brw_set_predicate_control(p, get_predicate(inst));
+      brw_BREAK(p);
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+      break;
+   case TGSI_OPCODE_CONT:
+      brw_set_predicate_control(p, get_predicate(inst));
+      brw_CONT(p);
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+      break;
+   case TGSI_OPCODE_ENDLOOP: 
+   {
+      struct brw_instruction *inst0, *inst1;
+      GLuint br = 1;
+
+      loop_depth--;
+
+      if (BRW_IS_IGDNG(brw))
+	 br = 2;   /* jump counts below are scaled by 2 on IGDNG */
+
+      inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
+      /* Walk back from the WHILE to the matching DO, patching jump counts of BREAK/CONT in between */
+      while (inst0 > loop_inst[loop_depth]) {
+	 inst0--;
+	 if (inst0->header.opcode == TGSI_OPCODE_BRK) {   /* NOTE(review): hw opcode compared with a TGSI opcode -- confirm */
+	    inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
+	    inst0->bits3.if_else.pop_count = 0;
+	 }
+	 else if (inst0->header.opcode == TGSI_OPCODE_CONT) {   /* NOTE(review): same concern as the BRK test above */
+	    inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
+	    inst0->bits3.if_else.pop_count = 0;
+	 }
+      }
+   }
+   break;
+   case TGSI_OPCODE_BRA:
+      brw_set_predicate_control(p, get_predicate(inst));
+      brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+      break;
+   case TGSI_OPCODE_CAL:
+      brw_set_access_mode(p, BRW_ALIGN_1);
+      brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
+      brw_set_access_mode(p, BRW_ALIGN_16);
+      brw_ADD(p, get_addr_reg(stack_index),
+	      get_addr_reg(stack_index), brw_imm_d(4));
+      brw_save_call(p, inst->Comment, p->nr_insn);
+      brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
+      break;
+   case TGSI_OPCODE_RET:
+      brw_ADD(p, get_addr_reg(stack_index),
+	      get_addr_reg(stack_index), brw_imm_d(-4));
+      brw_set_access_mode(p, BRW_ALIGN_1);
+      brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
+      brw_set_access_mode(p, BRW_ALIGN_16);
+      break;
+   case TGSI_OPCODE_END:	
+      end_offset = p->nr_insn;
+      /* this instruction will get patched later to jump past subroutine
+       * code, etc.
+       */
+      brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
+      break;
+   case TGSI_OPCODE_PRINT:
+      /* no-op */
+      break;
+   case TGSI_OPCODE_BGNSUB:
+      brw_save_label(p, inst->Comment, p->nr_insn);
+      break;
+   case TGSI_OPCODE_ENDSUB:
+      /* no-op */
+      break;
+   default:
+      debug_printf("Unsupported opcode %i (%s) in vertex shader",
+		   inst->Opcode, inst->Opcode < MAX_OPCODE ?
+		   _mesa_opcode_string(inst->Opcode) :
+		   "unknown");
+   }
+
+   /* Set the predication update on the last instruction of the native
+    * instruction sequence.
+    *
+    * This would be problematic if it was set on a math instruction,
+    * but that shouldn't be the case with the current GLSL compiler.
+    */
+   if (inst->CondUpdate) {
+      struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
+
+      assert(hw_insn->header.destreg__conditionalmod == 0);
+      hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
+   }
+
+   release_tmps(c);
+}
+
+
 /* Emit the vertex program instructions here.
  */
 void brw_vs_emit(struct brw_vs_compile *c )
 {
-#define MAX_IF_DEPTH 32
-#define MAX_LOOP_DEPTH 32
    struct brw_compile *p = &c->func;
    struct brw_context *brw = p->brw;
-   const GLuint nr_insns = c->vp->program.Base.NumInstructions;
    GLuint insn, if_depth = 0, loop_depth = 0;
    GLuint end_offset = 0;
    struct brw_instruction *end_inst, *last_inst;
-   struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
    const struct brw_indirect stack_index = brw_indirect(0, 0);   
+   struct tgsi_parse_context parse;
+   struct tgsi_full_declaration *decl;
    GLuint index;
    GLuint file;
 
@@ -1264,258 +1527,8 @@ void brw_vs_emit(struct brw_vs_compile *c )
 
    for (insn = 0; insn < nr_insns; insn++) {
 
-      const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
-      struct brw_reg args[3], dst;
-      GLuint i;
+      const struct ureg_instruction *inst = &c->vp->program.Base.Instructions[insn];
       
-#if 0
-      printf("%d: ", insn);
-      _mesa_print_instruction(inst);
-#endif
-
-      /* Get argument regs.
-       */
-      for (i = 0; i < 3; i++) {
-	 const struct prog_src_register *src = &inst->SrcReg[i];
-	 index = src->Index;
-	 file = src->File;	
-	 args[i] = get_arg(c, inst, i);
-      }
-
-      /* Get dest regs.  Note that it is possible for a reg to be both
-       * dst and arg, given the static allocation of registers.  So
-       * care needs to be taken emitting multi-operation instructions.
-       */ 
-      index = inst->DstReg.Index;
-      file = inst->DstReg.File;
-      dst = get_dst(c, inst->DstReg);
-
-      if (inst->SaturateMode != SATURATE_OFF) {
-	 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
-                       inst->SaturateMode);
-      }
-
-      switch (inst->Opcode) {
-      case TGSI_OPCODE_ABS:
-	 brw_MOV(p, dst, brw_abs(args[0]));
-	 break;
-      case TGSI_OPCODE_ADD:
-	 brw_ADD(p, dst, args[0], args[1]);
-	 break;
-      case TGSI_OPCODE_COS:
-	 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
-	 break;
-      case TGSI_OPCODE_DP3:
-	 brw_DP3(p, dst, args[0], args[1]);
-	 break;
-      case TGSI_OPCODE_DP4:
-	 brw_DP4(p, dst, args[0], args[1]);
-	 break;
-      case TGSI_OPCODE_DPH:
-	 brw_DPH(p, dst, args[0], args[1]);
-	 break;
-      case TGSI_OPCODE_NRM3:
-	 emit_nrm(c, dst, args[0], 3);
-	 break;
-      case TGSI_OPCODE_NRM4:
-	 emit_nrm(c, dst, args[0], 4);
-	 break;
-      case TGSI_OPCODE_DST:
-	 unalias2(c, dst, args[0], args[1], emit_dst_noalias); 
-	 break;
-      case TGSI_OPCODE_EXP:
-	 unalias1(c, dst, args[0], emit_exp_noalias);
-	 break;
-      case TGSI_OPCODE_EX2:
-	 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
-	 break;
-      case TGSI_OPCODE_ARL:
-	 emit_arl(c, dst, args[0]);
-	 break;
-      case TGSI_OPCODE_FLR:
-	 brw_RNDD(p, dst, args[0]);
-	 break;
-      case TGSI_OPCODE_FRC:
-	 brw_FRC(p, dst, args[0]);
-	 break;
-      case TGSI_OPCODE_LOG:
-	 unalias1(c, dst, args[0], emit_log_noalias);
-	 break;
-      case TGSI_OPCODE_LG2:
-	 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
-	 break;
-      case TGSI_OPCODE_LIT:
-	 unalias1(c, dst, args[0], emit_lit_noalias);
-	 break;
-      case TGSI_OPCODE_LRP:
-	 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
-	 break;
-      case TGSI_OPCODE_MAD:
-	 brw_MOV(p, brw_acc_reg(), args[2]);
-	 brw_MAC(p, dst, args[0], args[1]);
-	 break;
-      case TGSI_OPCODE_MAX:
-	 emit_max(p, dst, args[0], args[1]);
-	 break;
-      case TGSI_OPCODE_MIN:
-	 emit_min(p, dst, args[0], args[1]);
-	 break;
-      case TGSI_OPCODE_MOV:
-	 brw_MOV(p, dst, args[0]);
-	 break;
-      case TGSI_OPCODE_MUL:
-	 brw_MUL(p, dst, args[0], args[1]);
-	 break;
-      case TGSI_OPCODE_POW:
-	 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL); 
-	 break;
-      case TGSI_OPCODE_RCP:
-	 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
-	 break;
-      case TGSI_OPCODE_RSQ:
-	 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
-	 break;
-      case TGSI_OPCODE_SEQ:
-         emit_seq(p, dst, args[0], args[1]);
-         break;
-      case TGSI_OPCODE_SIN:
-	 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
-	 break;
-      case TGSI_OPCODE_SNE:
-         emit_sne(p, dst, args[0], args[1]);
-         break;
-      case TGSI_OPCODE_SGE:
-	 emit_sge(p, dst, args[0], args[1]);
-	 break;
-      case TGSI_OPCODE_SGT:
-         emit_sgt(p, dst, args[0], args[1]);
-         break;
-      case TGSI_OPCODE_SLT:
-	 emit_slt(p, dst, args[0], args[1]);
-	 break;
-      case TGSI_OPCODE_SLE:
-         emit_sle(p, dst, args[0], args[1]);
-         break;
-      case TGSI_OPCODE_SUB:
-	 brw_ADD(p, dst, args[0], negate(args[1]));
-	 break;
-      case TGSI_OPCODE_TRUNC:
-         /* round toward zero */
-	 brw_RNDZ(p, dst, args[0]);
-	 break;
-      case TGSI_OPCODE_XPD:
-	 emit_xpd(p, dst, args[0], args[1]);
-	 break;
-      case TGSI_OPCODE_IF:
-	 assert(if_depth < MAX_IF_DEPTH);
-	 if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
-	 /* Note that brw_IF smashes the predicate_control field. */
-	 if_inst[if_depth]->header.predicate_control = get_predicate(inst);
-	 if_depth++;
-	 break;
-      case TGSI_OPCODE_ELSE:
-	 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
-	 break;
-      case TGSI_OPCODE_ENDIF:
-         assert(if_depth > 0);
-	 brw_ENDIF(p, if_inst[--if_depth]);
-	 break;			
-      case TGSI_OPCODE_BGNLOOP:
-         loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
-         break;
-      case TGSI_OPCODE_BRK:
-	 brw_set_predicate_control(p, get_predicate(inst));
-         brw_BREAK(p);
-	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-         break;
-      case TGSI_OPCODE_CONT:
-	 brw_set_predicate_control(p, get_predicate(inst));
-         brw_CONT(p);
-         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-         break;
-      case TGSI_OPCODE_ENDLOOP: 
-         {
-            struct brw_instruction *inst0, *inst1;
-	    GLuint br = 1;
-
-            loop_depth--;
-
-	    if (BRW_IS_IGDNG(brw))
-	       br = 2;
-
-            inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
-            /* patch all the BREAK/CONT instructions from last BEGINLOOP */
-            while (inst0 > loop_inst[loop_depth]) {
-               inst0--;
-               if (inst0->header.opcode == BRW_TGSI_OPCODE_BREAK) {
-                  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
-                  inst0->bits3.if_else.pop_count = 0;
-               }
-               else if (inst0->header.opcode == BRW_TGSI_OPCODE_CONTINUE) {
-                  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
-                  inst0->bits3.if_else.pop_count = 0;
-               }
-            }
-         }
-         break;
-      case TGSI_OPCODE_BRA:
-	 brw_set_predicate_control(p, get_predicate(inst));
-         brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
-	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-         break;
-      case TGSI_OPCODE_CAL:
-	 brw_set_access_mode(p, BRW_ALIGN_1);
-	 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
-	 brw_set_access_mode(p, BRW_ALIGN_16);
-	 brw_ADD(p, get_addr_reg(stack_index),
-			 get_addr_reg(stack_index), brw_imm_d(4));
-         brw_save_call(p, inst->Comment, p->nr_insn);
-	 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
-         break;
-      case TGSI_OPCODE_RET:
-	 brw_ADD(p, get_addr_reg(stack_index),
-			 get_addr_reg(stack_index), brw_imm_d(-4));
-	 brw_set_access_mode(p, BRW_ALIGN_1);
-         brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
-	 brw_set_access_mode(p, BRW_ALIGN_16);
-	 break;
-      case TGSI_OPCODE_END:	
-         end_offset = p->nr_insn;
-         /* this instruction will get patched later to jump past subroutine
-          * code, etc.
-          */
-         brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
-         break;
-      case TGSI_OPCODE_PRINT:
-         /* no-op */
-         break;
-      case TGSI_OPCODE_BGNSUB:
-         brw_save_label(p, inst->Comment, p->nr_insn);
-         break;
-      case TGSI_OPCODE_ENDSUB:
-         /* no-op */
-         break;
-      default:
-	 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
-                       inst->Opcode, inst->Opcode < MAX_OPCODE ?
-				    _mesa_opcode_string(inst->Opcode) :
-				    "unknown");
-      }
-
-      /* Set the predication update on the last instruction of the native
-       * instruction sequence.
-       *
-       * This would be problematic if it was set on a math instruction,
-       * but that shouldn't be the case with the current GLSL compiler.
-       */
-      if (inst->CondUpdate) {
-	 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
-
-	 assert(hw_insn->header.destreg__conditionalmod == 0);
-	 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
-      }
-
-      release_tmps(c);
    }
 
    end_inst = &p->store[end_offset];
diff --git a/src/gallium/drivers/i965/brw_wm_glsl.c b/src/gallium/drivers/i965/brw_wm_glsl.c
index 3118e615f9..23f7ba16fd 100644
--- a/src/gallium/drivers/i965/brw_wm_glsl.c
+++ b/src/gallium/drivers/i965/brw_wm_glsl.c
@@ -167,8 +167,8 @@ get_reg(struct brw_wm_compile *c, int file, int index, int component,
 	case PROGRAM_PAYLOAD:
 	    break;
 	default:
-	    _mesa_problem(NULL, "Unexpected file in get_reg()");
-	    return brw_null_reg();
+	   debug_printf("Unexpected file in get_reg()");
+	   return brw_null_reg();
     }
 
     assert(index < 256);
diff --git a/src/gallium/drivers/i965/brw_wm_surface_state.c b/src/gallium/drivers/i965/brw_wm_surface_state.c
index e1ed6438dc..7157feb6f3 100644
--- a/src/gallium/drivers/i965/brw_wm_surface_state.c
+++ b/src/gallium/drivers/i965/brw_wm_surface_state.c
@@ -516,8 +516,11 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
 	 key.surface_format = BRW_SURFACEFORMAT_B4G4R4A4_UNORM;
 	 break;
       default:
-	 _mesa_problem(ctx, "Bad renderbuffer format: %d\n",
-		       irb->texformat->MesaFormat);
+	 debug_printf("Bad renderbuffer format: %d\n",
+		      irb->texformat->MesaFormat);
+	 assert(0);
+	 key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+	 return;
       }
       key.tiling = region->tiling;
       if (brw->intel.intelScreen->driScrnPriv->dri2.enabled) {
-- 
cgit v1.2.3


From 81b8589f064204d9ddcd7d1f9d43d2dcf5676235 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Wed, 28 Oct 2009 21:24:03 +0000
Subject: i965g: still working on compilation

---
 src/gallium/drivers/i965/brw_vs.c               |   2 +-
 src/gallium/drivers/i965/brw_vs.h               |   3 +
 src/gallium/drivers/i965/brw_vs_emit.c          | 199 +++++++++++++++++-------
 src/gallium/drivers/i965/brw_vs_state.c         |  25 +--
 src/gallium/drivers/i965/brw_vs_surface_state.c |  33 +++-
 src/gallium/drivers/i965/brw_wm.c               |  36 +++--
 6 files changed, 208 insertions(+), 90 deletions(-)

(limited to 'src/gallium/drivers/i965/brw_vs.c')

diff --git a/src/gallium/drivers/i965/brw_vs.c b/src/gallium/drivers/i965/brw_vs.c
index 3965ca6c54..26a28114d9 100644
--- a/src/gallium/drivers/i965/brw_vs.c
+++ b/src/gallium/drivers/i965/brw_vs.c
@@ -57,7 +57,7 @@ static void do_vs_prog( struct brw_context *brw,
    c.prog_data.nr_inputs = vp->info.num_inputs;
    c.prog_data.copy_edgeflag = c.key.copy_edgeflag;
 
-   if (0)
+   if (1)
       tgsi_dump(c.vp->tokens, 0);
 
    /* Emit GEN4 code.
diff --git a/src/gallium/drivers/i965/brw_vs.h b/src/gallium/drivers/i965/brw_vs.h
index 2a2dbb3457..b4e450d89b 100644
--- a/src/gallium/drivers/i965/brw_vs.h
+++ b/src/gallium/drivers/i965/brw_vs.h
@@ -60,6 +60,9 @@ struct brw_vs_compile {
 
    GLuint nr_inputs;
    GLuint nr_outputs;
+   GLuint nr_immediates;
+   GLfloat immediate[128][4];
+
    GLboolean copy_edgeflag;
 
    GLuint first_output;
diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c
index 5366ab8514..6809bccdec 100644
--- a/src/gallium/drivers/i965/brw_vs_emit.c
+++ b/src/gallium/drivers/i965/brw_vs_emit.c
@@ -34,8 +34,7 @@
 #include "util/u_memory.h"
 #include "util/u_math.h"
 
-#include "tgsi/tgsi_ureg.h"
-#include "tgsi/tgsi_ureg_parse.h"
+#include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_dump.h"
 #include "tgsi/tgsi_info.h"
 
@@ -67,6 +66,7 @@ static void release_tmps( struct brw_vs_compile *c )
 }
 
 
+
 /**
  * Preallocate GRF register before code emit.
  * Do things as simply as possible.  Allocate and populate all regs
@@ -83,10 +83,17 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
     * XXX this heuristic/check may need some fine tuning...
     */
    if (c->vp->info.file_max[TGSI_FILE_CONSTANT] +
+       c->vp->info.file_max[TGSI_FILE_IMMEDIATE] +
        c->vp->info.file_max[TGSI_FILE_TEMPORARY] + 21 > BRW_MAX_GRF)
       c->vp->use_const_buffer = GL_TRUE;
-   else
+   else {
+      /* XXX: immediates can go elsewhere if necessary:
+       */
+      assert(c->vp->info.file_max[TGSI_FILE_IMMEDIATE] +
+	     c->vp->info.file_max[TGSI_FILE_TEMPORARY] + 21 > BRW_MAX_GRF);
+
       c->vp->use_const_buffer = GL_FALSE;
+   }
 
    /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
 
@@ -139,6 +146,29 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
    if (c->nr_inputs == 0)
       reg++;
 
+   /* Allocate a GRF and load immediate values by hand with 4 MOVs!!!
+    *
+    * XXX: Try to encode float immediates as brw immediates
+    * XXX: Put immediates into the CURBE.
+    * XXX: Make sure ureg sets minimal immediate size and respect it
+    * here.
+    */
+   for (i = 0; i < c->nr_immediates; i++) {
+      struct brw_reg r;
+      int j;
+
+      r = brw_vec8_grf(reg, 0);
+
+      for (j = 0; j < 4; j++) {
+	 brw_MOV(&c->func, 
+		 brw_writemask(r, (1<<j)), 
+		 brw_imm_f(c->immediate[i][j]));
+      }
+
+      reg++;
+   }
+
+
    /* Allocate outputs.  The non-position outputs go straight into message regs.
     */
    c->nr_outputs = 0;
@@ -754,21 +784,20 @@ static void emit_nrm( struct brw_vs_compile *c,
 
 static struct brw_reg
 get_constant(struct brw_vs_compile *c,
-             const struct ureg_instruction *inst,
-             GLuint argIndex)
+	     GLuint argIndex,
+	     GLuint index,
+	     GLboolean relAddr)
 {
-   const struct ureg_src src = inst->src[argIndex];
    struct brw_compile *p = &c->func;
    struct brw_reg const_reg;
    struct brw_reg const2_reg;
-   const GLboolean relAddr = src.Indirect;
 
    assert(argIndex < 3);
 
-   if (c->current_const[argIndex].index != src.Index || relAddr) {
+   if (c->current_const[argIndex].index != index || relAddr) {
       struct brw_reg addrReg = c->regs[TGSI_FILE_ADDRESS][0];
 
-      c->current_const[argIndex].index = src.Index;
+      c->current_const[argIndex].index = index;
 
 #if 0
       printf("  fetch const[%d] for arg %d into reg %d\n",
@@ -780,7 +809,7 @@ get_constant(struct brw_vs_compile *c,
                        0,                             /* oword */
                        relAddr,                       /* relative indexing? */
                        addrReg,                       /* address register */
-                       16 * src.Index,               /* byte offset */
+                       16 * index,               /* byte offset */
                        SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
                        );
 
@@ -797,7 +826,7 @@ get_constant(struct brw_vs_compile *c,
                           1,                       /* oword */
                           relAddr,                 /* relative indexing? */
                           addrReg,                 /* address register */
-                          16 * src.Index,         /* byte offset */
+                          16 * index,         /* byte offset */
                           SURF_INDEX_VERT_CONST_BUFFER
                           );
       }
@@ -894,12 +923,11 @@ static struct brw_reg deref( struct brw_vs_compile *c,
  */
 static struct brw_reg
 get_src_reg( struct brw_vs_compile *c,
-             const struct ureg_instruction *inst,
-             GLuint argIndex )
+	     GLuint argIndex,
+	     GLuint file,
+	     GLint index,
+	     GLboolean relAddr )
 {
-   const GLuint file = inst->src[argIndex].File;
-   const GLint index = inst->src[argIndex].Index;
-   const GLboolean relAddr = inst->src[argIndex].Indirect;
 
    switch (file) {
    case TGSI_FILE_TEMPORARY:
@@ -913,9 +941,12 @@ get_src_reg( struct brw_vs_compile *c,
          return c->regs[file][index];
       }
 
+   case TGSI_FILE_IMMEDIATE:
+      return c->regs[file][index];
+
    case TGSI_FILE_CONSTANT:
       if (c->vp->use_const_buffer) {
-         return get_constant(c, inst, argIndex);
+         return get_constant(c, argIndex, index, relAddr);
       }
       else if (relAddr) {
          return deref(c, c->regs[TGSI_FILE_CONSTANT][0], index);
@@ -962,27 +993,32 @@ static void emit_arl( struct brw_vs_compile *c,
  * Return the brw reg for the given instruction's src argument.
  */
 static struct brw_reg get_arg( struct brw_vs_compile *c,
-                               const struct ureg_instruction *inst,
+                               const struct tgsi_full_src_register *src,
                                GLuint argIndex )
 {
-   const struct ureg_src src = inst->src[argIndex];
    struct brw_reg reg;
 
-   if (src.File == TGSI_FILE_NULL)
+   if (src->SrcRegister.File == TGSI_FILE_NULL)
       return brw_null_reg();
 
-   reg = get_src_reg(c, inst, argIndex);
+   reg = get_src_reg(c, argIndex,
+		     src->SrcRegister.File,
+		     src->SrcRegister.Index,
+		     src->SrcRegister.Indirect);
 
    /* Convert 3-bit swizzle to 2-bit.  
     */
-   reg.dw1.bits.swizzle = BRW_SWIZZLE4(src.SwizzleX,
-				       src.SwizzleY,
-				       src.SwizzleZ,
-				       src.SwizzleW);
+   reg.dw1.bits.swizzle = BRW_SWIZZLE4(src->SrcRegister.SwizzleX,
+				       src->SrcRegister.SwizzleY,
+				       src->SrcRegister.SwizzleZ,
+				       src->SrcRegister.SwizzleW);
 
    /* Note this is ok for non-swizzle instructions: 
     */
-   reg.negate = src.Negate ? 1 : 0;   
+   reg.negate = src->SrcRegister.Negate ? 1 : 0;   
+
+   /* XXX: abs, absneg
+    */
 
    return reg;
 }
@@ -992,19 +1028,21 @@ static struct brw_reg get_arg( struct brw_vs_compile *c,
  * Get brw register for the given program dest register.
  */
 static struct brw_reg get_dst( struct brw_vs_compile *c,
-			       struct ureg_dst dst )
+			       unsigned file,
+			       unsigned index,
+			       unsigned writemask )
 {
    struct brw_reg reg;
 
-   switch (dst.File) {
+   switch (file) {
    case TGSI_FILE_TEMPORARY:
    case TGSI_FILE_OUTPUT:
-      assert(c->regs[dst.File][dst.Index].nr != 0);
-      reg = c->regs[dst.File][dst.Index];
+      assert(c->regs[file][index].nr != 0);
+      reg = c->regs[file][index];
       break;
    case TGSI_FILE_ADDRESS:
-      assert(dst.Index == 0);
-      reg = c->regs[dst.File][dst.Index];
+      assert(index == 0);
+      reg = c->regs[file][index];
       break;
    case TGSI_FILE_NULL:
       /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
@@ -1015,7 +1053,7 @@ static struct brw_reg get_dst( struct brw_vs_compile *c,
       reg = brw_null_reg();
    }
 
-   reg.dw1.bits.writemask = dst.WriteMask;
+   reg.dw1.bits.writemask = writemask;
 
    return reg;
 }
@@ -1199,7 +1237,7 @@ post_vs_emit( struct brw_vs_compile *c,
 }
 
 static uint32_t
-get_predicate(const struct ureg_instruction *inst)
+get_predicate(const struct tgsi_full_instruction *inst)
 {
    /* XXX: disabling for now
     */
@@ -1242,8 +1280,10 @@ get_predicate(const struct ureg_instruction *inst)
 }
 
 static void emit_insn(struct brw_vs_compile *c,
-		      const struct ureg_instruction *inst)
+		      const struct tgsi_full_instruction *inst)
 {
+   unsigned opcode = inst->Instruction.Opcode;
+   unsigned label = inst->InstructionExtLabel.Label;
    struct brw_compile *p = &c->func;
    struct brw_reg args[3], dst;
    GLuint i;
@@ -1256,20 +1296,25 @@ static void emit_insn(struct brw_vs_compile *c,
    /* Get argument regs.
     */
    for (i = 0; i < 3; i++) {
-      args[i] = get_arg(c, inst, i);
+      args[i] = get_arg(c, &inst->FullSrcRegisters[i], i);
    }
 
    /* Get dest regs.  Note that it is possible for a reg to be both
     * dst and arg, given the static allocation of registers.  So
     * care needs to be taken emitting multi-operation instructions.
     */ 
-   dst = get_dst(c, inst->dst);
+   dst = get_dst(c, 
+		 inst->FullDstRegisters[0].DstRegister.File,
+		 inst->FullDstRegisters[0].DstRegister.Index,
+		 inst->FullDstRegisters[0].DstRegister.WriteMask);
 
-   if (inst->dst.Saturate) {
+   /* XXX: saturate
+    */
+   if (inst->Instruction.Saturate != TGSI_SAT_NONE) {
       debug_printf("Unsupported saturate in vertex shader");
    }
 
-   switch (inst->opcode) {
+   switch (opcode) {
    case TGSI_OPCODE_ABS:
       brw_MOV(p, dst, brw_abs(args[0]));
       break;
@@ -1443,7 +1488,7 @@ static void emit_insn(struct brw_vs_compile *c,
       brw_set_access_mode(p, BRW_ALIGN_16);
       brw_ADD(p, get_addr_reg(c->stack_index),
 	      get_addr_reg(c->stack_index), brw_imm_d(4));
-      brw_save_call(p, inst->label, p->nr_insn);
+      brw_save_call(p, label, p->nr_insn);
       brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
       break;
    case TGSI_OPCODE_RET:
@@ -1468,8 +1513,8 @@ static void emit_insn(struct brw_vs_compile *c,
       break;
    default:
       debug_printf("Unsupported opcode %i (%s) in vertex shader",
-		   inst->opcode, 
-		   tgsi_get_opcode_name(inst->opcode));
+		   opcode, 
+		   tgsi_get_opcode_name(opcode));
    }
 
    /* Set the predication update on the last instruction of the native
@@ -1498,11 +1543,12 @@ static void emit_insn(struct brw_vs_compile *c,
 void brw_vs_emit(struct brw_vs_compile *c)
 {
    struct brw_compile *p = &c->func;
+   const struct tgsi_token *tokens = c->vp->tokens;
    struct brw_instruction *end_inst, *last_inst;
-   struct ureg_parse_context parse;
-   struct ureg_declaration *decl;
-   struct ureg_declaration *imm;
-   struct ureg_declaration *insn;
+   struct tgsi_parse_context parse;
+   struct tgsi_full_instruction *inst;
+   boolean done = FALSE;
+   int i;
 
    if (BRW_DEBUG & DEBUG_VS)
       tgsi_dump(c->vp->tokens, 0); 
@@ -1512,21 +1558,66 @@ void brw_vs_emit(struct brw_vs_compile *c)
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_set_access_mode(p, BRW_ALIGN_16);
    
+   /* Inputs */
+   tgsi_parse_init( &parse, tokens );
+   while( !tgsi_parse_end_of_tokens( &parse ) ) {
+      tgsi_parse_token( &parse );
+
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+	 /* Nothing to do -- using info from tgsi_scan().
+	  */
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE: {
+	 static const float id[4] = {0,0,0,1};
+	 const float *imm = &parse.FullToken.FullImmediate.u[i].Float;
+	 unsigned size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
+
+	 for (i = 0; i < size; i++)
+	    c->immediate[c->nr_immediates][i] = imm[i];
+
+	 for ( ; i < 4; i++)
+	    c->immediate[c->nr_immediates][i] = id[i];
+
+	 c->nr_immediates++;
+	 break;
+      }
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+	 done = 1;
+	 break;
+      }
+   }
+
    /* Static register allocation
     */
    brw_vs_alloc_regs(c);
    brw_MOV(p, get_addr_reg(c->stack_index), brw_address(c->stack));
 
-   while (ureg_next_decl(&parse, &decl)) {
-   }
-
-   while (ureg_next_immediate(&parse, &imm)) {
-   }
-
-   while (ureg_next_instruction(&parse, &insn)) {
+   /* Instructions
+    */
+   tgsi_parse_init( &parse, tokens );
+   while( !tgsi_parse_end_of_tokens( &parse ) ) {
+      tgsi_parse_token( &parse );
+
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+	 break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         inst = &parse.FullToken.FullInstruction;
+	 emit_insn( c, inst );
+         break;
+
+      default:
+         assert( 0 );
+      }
    }
+   tgsi_parse_free( &parse );
 
-   end_inst = &p->store[end_offset];
+   end_inst = &p->store[c->end_offset];
    last_inst = &p->store[p->nr_insn];
 
    /* The END instruction will be patched to jump to this code */
diff --git a/src/gallium/drivers/i965/brw_vs_state.c b/src/gallium/drivers/i965/brw_vs_state.c
index 05a91f2de4..549696f7ae 100644
--- a/src/gallium/drivers/i965/brw_vs_state.c
+++ b/src/gallium/drivers/i965/brw_vs_state.c
@@ -29,8 +29,10 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
             
+#include "util/u_math.h"
 
 
+#include "brw_debug.h"
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
@@ -64,8 +66,8 @@ vs_unit_populate_key(struct brw_context *brw, struct brw_vs_unit_key *key)
    /* BRW_NEW_NR_VS_SURFACES */
    key->nr_surfaces = brw->vs.nr_surfaces;
 
-   /* BRW_NEW_CURBE_OFFSETS, _NEW_TRANSFORM */
-   if (ctx->Transform.ClipPlanesEnabled) {
+   /* PIPE_NEW_CLIP */
+   if (brw->curr.ucp.nr) {
       /* Note that we read in the userclip planes as well, hence
        * clip_start:
        */
@@ -86,7 +88,7 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
    memset(&vs, 0, sizeof(vs));
 
    vs.thread0.kernel_start_pointer = brw->vs.prog_bo->offset >> 6; /* reloc */
-   vs.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1;
+   vs.thread0.grf_reg_count = align(key->total_grf, 16) / 16 - 1;
    vs.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
    /* Choosing multiple program flow means that we may get 2-vertex threads,
     * which will have the channel mask for dwords 4-7 enabled in the thread,
@@ -119,6 +121,7 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
       chipset_max_threads = 32;
    else
       chipset_max_threads = 16;
+
    vs.thread4.max_threads = CLAMP(key->nr_urb_entries / 2,
 				  1, chipset_max_threads) - 1;
 
@@ -145,16 +148,16 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
 			 NULL, NULL);
 
    /* Emit VS program relocation */
-   dri_bo_emit_reloc(bo,
-		     I915_GEM_DOMAIN_INSTRUCTION, 0,
-		     vs.thread0.grf_reg_count << 1,
-		     offsetof(struct brw_vs_unit_state, thread0),
-		     brw->vs.prog_bo);
+   brw->sws->bo_emit_reloc(bo,
+			   I915_GEM_DOMAIN_INSTRUCTION, 0,
+			   vs.thread0.grf_reg_count << 1,
+			   offsetof(struct brw_vs_unit_state, thread0),
+			   brw->vs.prog_bo);
 
    return bo;
 }
 
-static void prepare_vs_unit(struct brw_context *brw)
+static int prepare_vs_unit(struct brw_context *brw)
 {
    struct brw_vs_unit_key key;
 
@@ -168,11 +171,13 @@ static void prepare_vs_unit(struct brw_context *brw)
    if (brw->vs.state_bo == NULL) {
       brw->vs.state_bo = vs_unit_create_from_key(brw, &key);
    }
+
+   return 0;
 }
 
 const struct brw_tracked_state brw_vs_unit = {
    .dirty = {
-      .mesa  = _NEW_TRANSFORM,
+      .mesa  = (PIPE_NEW_CLIP),
       .brw   = (BRW_NEW_CURBE_OFFSETS |
                 BRW_NEW_NR_VS_SURFACES |
 		BRW_NEW_URB_FENCE),
diff --git a/src/gallium/drivers/i965/brw_vs_surface_state.c b/src/gallium/drivers/i965/brw_vs_surface_state.c
index 319e29bfcb..9a9d47a8a3 100644
--- a/src/gallium/drivers/i965/brw_vs_surface_state.c
+++ b/src/gallium/drivers/i965/brw_vs_surface_state.c
@@ -32,6 +32,11 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
+#include "brw_winsys.h"
+
+/* XXX: disabled true constant buffer functionality
+ */
+
 
 /* Creates a new VS constant buffer reflecting the current VS program's
  * constants, if needed by the VS program.
@@ -39,9 +44,12 @@
  * Otherwise, constants go through the CURBEs using the brw_constant_buffer
  * state atom.
  */
-static drm_intel_bo *
+#if 0
+static struct brw_winsys_buffer *
 brw_vs_update_constant_buffer(struct brw_context *brw)
 {
+   /* XXX: true constant buffers
+    */
    struct brw_vertex_program *vp =
       (struct brw_vertex_program *) brw->vertex_program;
    const struct gl_program_parameter_list *params = vp->program.Base.Parameters;
@@ -61,21 +69,20 @@ brw_vs_update_constant_buffer(struct brw_context *brw)
 
    return const_buffer;
 }
+#endif
 
 /**
  * Update the surface state for a VS constant buffer.
  *
  * Sets brw->vs.surf_bo[surf] and brw->vp->const_buffer.
  */
+#if 0
 static void
 brw_update_vs_constant_surface( struct brw_context *brw,
                                 GLuint surf)
 {
-   struct brw_context *brw = brw_context(ctx);
    struct brw_surface_key key;
-   struct brw_vertex_program *vp =
-      (struct brw_vertex_program *) brw->vertex_program;
-   const struct gl_program_parameter_list *params = vp->program.Base.Parameters;
+   struct pipe_buffer *cb = brw->curr.vs_constants;
 
    assert(surf == 0);
 
@@ -121,6 +128,7 @@ brw_update_vs_constant_surface( struct brw_context *brw,
       brw->vs.surf_bo[surf] = brw_create_constant_surface(brw, &key);
    }
 }
+#endif
 
 
 /**
@@ -129,6 +137,7 @@ brw_update_vs_constant_surface( struct brw_context *brw,
 static struct brw_winsys_buffer *
 brw_vs_get_binding_table(struct brw_context *brw)
 {
+#if 0
    struct brw_winsys_buffer *bind_bo;
 
    bind_bo = brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
@@ -169,6 +178,9 @@ brw_vs_get_binding_table(struct brw_context *brw)
    }
 
    return bind_bo;
+#else
+   return NULL;
+#endif
 }
 
 /**
@@ -178,8 +190,9 @@ brw_vs_get_binding_table(struct brw_context *brw)
  * to be updated, and produces BRW_NEW_NR_VS_SURFACES for the VS unit and
  * CACHE_NEW_SURF_BIND for the binding table upload.
  */
-static void prepare_vs_surfaces(struct brw_context *brw )
+static int prepare_vs_surfaces(struct brw_context *brw )
 {
+#if 0
    int i;
    int nr_surfaces = 0;
 
@@ -195,6 +208,7 @@ static void prepare_vs_surfaces(struct brw_context *brw )
       brw->state.dirty.brw |= BRW_NEW_NR_VS_SURFACES;
       brw->vs.nr_surfaces = nr_surfaces;
    }
+#endif
 
    /* Note that we don't end up updating the bind_bo if we don't have a
     * surface to be pointing at.  This should be relatively harmless, as it
@@ -204,12 +218,15 @@ static void prepare_vs_surfaces(struct brw_context *brw )
       brw->sws->bo_unreference(brw->vs.bind_bo);
       brw->vs.bind_bo = brw_vs_get_binding_table(brw);
    }
+
+   return 0;
 }
 
 const struct brw_tracked_state brw_vs_surfaces = {
    .dirty = {
-      .mesa = (_NEW_PROGRAM_CONSTANTS),
-      .brw = (BRW_NEW_VERTEX_PROGRAM),
+      .mesa = (PIPE_NEW_VERTEX_CONSTANTS |
+	       PIPE_NEW_VERTEX_SHADER),
+      .brw = 0,
       .cache = 0
    },
    .prepare = prepare_vs_surfaces,
diff --git a/src/gallium/drivers/i965/brw_wm.c b/src/gallium/drivers/i965/brw_wm.c
index 3d889699f8..f0dabfcfd0 100644
--- a/src/gallium/drivers/i965/brw_wm.c
+++ b/src/gallium/drivers/i965/brw_wm.c
@@ -28,11 +28,14 @@
   * Authors:
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
-             
+
+#include "tgsi/tgsi_info.h"
+
 #include "brw_context.h"
 #include "brw_util.h"
 #include "brw_wm.h"
 #include "brw_state.h"
+#include "brw_debug.h"
 
 
 /** Return number of src args for given instruction */
@@ -54,7 +57,7 @@ GLuint brw_wm_nr_args( GLuint opcode )
       return 3;
    default:
       assert(opcode < MAX_OPCODE);
-      return _mesa_num_inst_src_regs(opcode);
+      return tgsi_get_opcode_info(opcode)->num_src;
    }
 }
 
@@ -62,17 +65,17 @@ GLuint brw_wm_nr_args( GLuint opcode )
 GLuint brw_wm_is_scalar_result( GLuint opcode )
 {
    switch (opcode) {
-   case OPCODE_COS:
-   case OPCODE_EX2:
-   case OPCODE_LG2:
-   case OPCODE_POW:
-   case OPCODE_RCP:
-   case OPCODE_RSQ:
-   case OPCODE_SIN:
-   case OPCODE_DP3:
-   case OPCODE_DP4:
-   case OPCODE_DPH:
-   case OPCODE_DST:
+   case TGSI_OPCODE_COS:
+   case TGSI_OPCODE_EX2:
+   case TGSI_OPCODE_LG2:
+   case TGSI_OPCODE_POW:
+   case TGSI_OPCODE_RCP:
+   case TGSI_OPCODE_RSQ:
+   case TGSI_OPCODE_SIN:
+   case TGSI_OPCODE_DP3:
+   case TGSI_OPCODE_DP4:
+   case TGSI_OPCODE_DPH:
+   case TGSI_OPCODE_DST:
       return 1;
       
    default:
@@ -134,7 +137,7 @@ brw_wm_non_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
  * we'll use one of two code generators.
  */
 static void do_wm_prog( struct brw_context *brw,
-			struct brw_fragment_program *fp, 
+			struct brw_fragment_shader *fp, 
 			struct brw_wm_prog_key *key)
 {
    struct brw_wm_compile *c;
@@ -163,7 +166,7 @@ static void do_wm_prog( struct brw_context *brw,
    brw_init_compile(brw, &c->func);
 
    /* temporary sanity check assertion */
-   ASSERT(fp->isGLSL == brw_wm_is_glsl(&c->fp->program));
+   assert(fp->isGLSL == brw_wm_is_glsl(&c->fp->program));
 
    /*
     * Shader which use GLSL features such as flow control are handled
@@ -200,8 +203,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
 				 struct brw_wm_prog_key *key )
 {
    /* BRW_NEW_FRAGMENT_PROGRAM */
-   const struct brw_fragment_program *fp = 
-      (struct brw_fragment_program *)brw->fragment_program;
+   const struct brw_fragment_program *fp = brw->curr.fragment_shader;
    GLboolean uses_depth = (fp->program.Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
    GLuint lookup = 0;
    GLuint line_aa;
-- 
cgit v1.2.3


From c796aed5ddad011d66e631c4cafdbf779e73f213 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Thu, 5 Nov 2009 13:57:05 +0000
Subject: i965g: add lots of error checks and early returns

Any allocation that may fail should be checked, and the error
propagated upwards.  At the highest level we will flush batch and retry.

This is an alternate strategy to what the original DRI driver did of
attempting to flush batch from the lowest levels (e.g. inside
BEGIN_BATCH).  The trouble with that strategy was that flushes could
occur at unexpected times, and additionally there was a need for a
weird notification mechanism to propagate the 'lost context' state
back up to higher levels.

Propagating the errors directly gives us a lot of flexibility in how to
deal with these states, at the expense of a lot more checking in the
code.

Will add some sanity checks later to make sure that out-of-memory
conditions are properly escalated and not lost halfway up the stack.
---
 src/gallium/drivers/i965/brw_batchbuffer.c        |  19 +-
 src/gallium/drivers/i965/brw_batchbuffer.h        |   3 +-
 src/gallium/drivers/i965/brw_cc.c                 |  73 ++++---
 src/gallium/drivers/i965/brw_clip.c               |  60 ++++--
 src/gallium/drivers/i965/brw_clip_state.c         |  60 +++---
 src/gallium/drivers/i965/brw_context.c            |  46 +++--
 src/gallium/drivers/i965/brw_context.h            |   2 +-
 src/gallium/drivers/i965/brw_curbe.c              |  18 +-
 src/gallium/drivers/i965/brw_draw.c               |   3 +-
 src/gallium/drivers/i965/brw_draw_upload.c        |  18 +-
 src/gallium/drivers/i965/brw_eu.c                 |  13 +-
 src/gallium/drivers/i965/brw_eu.h                 |   8 +-
 src/gallium/drivers/i965/brw_gs.c                 |  69 ++++---
 src/gallium/drivers/i965/brw_gs_state.c           |  48 +++--
 src/gallium/drivers/i965/brw_pipe_query.c         |  31 +--
 src/gallium/drivers/i965/brw_pipe_shader.c        |   3 +-
 src/gallium/drivers/i965/brw_pipe_vertex.c        |   2 +-
 src/gallium/drivers/i965/brw_screen_buffers.c     |  16 +-
 src/gallium/drivers/i965/brw_screen_surface.c     |   7 +-
 src/gallium/drivers/i965/brw_screen_texture.c     |  17 +-
 src/gallium/drivers/i965/brw_sf.c                 |  52 +++--
 src/gallium/drivers/i965/brw_sf_state.c           |  86 ++++----
 src/gallium/drivers/i965/brw_state.h              |  71 +++----
 src/gallium/drivers/i965/brw_state_cache.c        | 115 +++++------
 src/gallium/drivers/i965/brw_state_upload.c       |   3 +-
 src/gallium/drivers/i965/brw_vs.c                 |  56 +++---
 src/gallium/drivers/i965/brw_vs_state.c           |  58 +++---
 src/gallium/drivers/i965/brw_vs_surface_state.c   |  97 ++++++----
 src/gallium/drivers/i965/brw_winsys.h             |  56 ++++--
 src/gallium/drivers/i965/brw_wm.c                 |  78 ++++----
 src/gallium/drivers/i965/brw_wm_constant_buffer.c |  87 +++++----
 src/gallium/drivers/i965/brw_wm_sampler_state.c   |  98 ++++++----
 src/gallium/drivers/i965/brw_wm_state.c           | 103 ++++++----
 src/gallium/drivers/i965/brw_wm_surface_state.c   | 226 ++++++++++++----------
 src/gallium/winsys/drm/i965/xlib/xlib_i965.c      |  46 ++---
 35 files changed, 1003 insertions(+), 745 deletions(-)

(limited to 'src/gallium/drivers/i965/brw_vs.c')

diff --git a/src/gallium/drivers/i965/brw_batchbuffer.c b/src/gallium/drivers/i965/brw_batchbuffer.c
index ca612e5ed0..e5f73bd6a3 100644
--- a/src/gallium/drivers/i965/brw_batchbuffer.c
+++ b/src/gallium/drivers/i965/brw_batchbuffer.c
@@ -38,17 +38,17 @@
 #define USE_MALLOC_BUFFER 1
 #define ALWAYS_EMIT_MI_FLUSH 1
 
-void
+enum pipe_error
 brw_batchbuffer_reset(struct brw_batchbuffer *batch)
 {
-   if (batch->buf) {
-      batch->sws->bo_unreference(batch->buf);
-      batch->buf = NULL;
-   }
+   enum pipe_error ret;
 
-   batch->buf = batch->sws->bo_alloc(batch->sws,
-				     BRW_BUFFER_TYPE_BATCH,
-				     BRW_BATCH_SIZE, 4096);
+   ret = batch->sws->bo_alloc( batch->sws,
+                               BRW_BUFFER_TYPE_BATCH,
+                               BRW_BATCH_SIZE, 4096,
+                               &batch->buf );
+   if (ret)
+      return ret;
 
    if (batch->malloc_buffer)
       batch->map = batch->malloc_buffer;
@@ -59,6 +59,7 @@ brw_batchbuffer_reset(struct brw_batchbuffer *batch)
 
    batch->size = BRW_BATCH_SIZE;
    batch->ptr = batch->map;
+   return PIPE_OK;
 }
 
 struct brw_batchbuffer *
@@ -91,7 +92,7 @@ brw_batchbuffer_free(struct brw_batchbuffer *batch)
       batch->map = NULL;
    }
 
-   batch->sws->bo_unreference(batch->buf);
+   bo_reference(&batch->buf, NULL);
    FREE(batch);
 }
 
diff --git a/src/gallium/drivers/i965/brw_batchbuffer.h b/src/gallium/drivers/i965/brw_batchbuffer.h
index 1f04826aea..288a9d2755 100644
--- a/src/gallium/drivers/i965/brw_batchbuffer.h
+++ b/src/gallium/drivers/i965/brw_batchbuffer.h
@@ -65,7 +65,8 @@ void _brw_batchbuffer_flush(struct brw_batchbuffer *batch,
 			      const char *file, int line);
 
 
-void brw_batchbuffer_reset(struct brw_batchbuffer *batch);
+enum pipe_error
+brw_batchbuffer_reset(struct brw_batchbuffer *batch);
 
 
 /* Unlike bmBufferData, this currently requires the buffer be mapped.
diff --git a/src/gallium/drivers/i965/brw_cc.c b/src/gallium/drivers/i965/brw_cc.c
index 20967f0191..8e25fe8585 100644
--- a/src/gallium/drivers/i965/brw_cc.c
+++ b/src/gallium/drivers/i965/brw_cc.c
@@ -57,10 +57,11 @@ static void calc_sane_viewport( const struct pipe_viewport_state *vp,
    svp->far = 1;
 }
 
-static int prepare_cc_vp( struct brw_context *brw )
+static enum pipe_error prepare_cc_vp( struct brw_context *brw )
 {
    struct brw_cc_viewport ccv;
    struct sane_viewport svp;
+   enum pipe_error ret;
 
    memset(&ccv, 0, sizeof(ccv));
 
@@ -70,10 +71,12 @@ static int prepare_cc_vp( struct brw_context *brw )
    ccv.min_depth = svp.near;
    ccv.max_depth = svp.far;
 
-   brw->sws->bo_unreference(brw->cc.vp_bo);
-   brw->cc.vp_bo = brw_cache_data( &brw->cache, BRW_CC_VP, &ccv, NULL, 0 );
-
-   return 0;
+   ret = brw_cache_data( &brw->cache, BRW_CC_VP, &ccv, NULL, 0,
+                         &brw->cc.vp_bo );
+   if (ret)
+      return ret;
+                
+   return PIPE_OK;
 }
 
 const struct brw_tracked_state brw_cc_vp = {
@@ -123,11 +126,13 @@ cc_unit_populate_key(const struct brw_context *brw,
 /**
  * Creates the state cache entry for the given CC unit key.
  */
-static struct brw_winsys_buffer *
-cc_unit_create_from_key(struct brw_context *brw, struct brw_cc_unit_key *key)
+static enum pipe_error
+cc_unit_create_from_key(struct brw_context *brw, 
+                        struct brw_cc_unit_key *key,
+                        struct brw_winsys_buffer **bo_out)
 {
    struct brw_cc_unit_state cc;
-   struct brw_winsys_buffer *bo;
+   enum pipe_error ret;
 
    memset(&cc, 0, sizeof(cc));
 
@@ -143,38 +148,48 @@ cc_unit_create_from_key(struct brw_context *brw, struct brw_cc_unit_key *key)
    cc.cc6 = key->cc6;
    cc.cc7 = key->cc7;
 
-   bo = brw_upload_cache(&brw->cache, BRW_CC_UNIT,
-			 key, sizeof(*key),
-			 &brw->cc.vp_bo, 1,
-			 &cc, sizeof(cc),
-			 NULL, NULL);
+   ret = brw_upload_cache(&brw->cache, BRW_CC_UNIT,
+                          key, sizeof(*key),
+                          &brw->cc.vp_bo, 1,
+                          &cc, sizeof(cc),
+                          NULL, NULL,
+                          bo_out);
+   if (ret)
+      return ret;
 
-   /* Emit CC viewport relocation */
-   brw->sws->bo_emit_reloc(bo,
-			   BRW_USAGE_STATE,
-			   0,
-			   offsetof(struct brw_cc_unit_state, cc4),
-			   brw->cc.vp_bo);
 
-   return bo;
+   /* Emit CC viewport relocation */
+   ret = brw->sws->bo_emit_reloc(*bo_out,
+                                 BRW_USAGE_STATE,
+                                 0,
+                                 offsetof(struct brw_cc_unit_state, cc4),
+                                 brw->cc.vp_bo);
+   if (ret)
+      return ret;
+
+   return PIPE_OK;
 }
 
 static int prepare_cc_unit( struct brw_context *brw )
 {
    struct brw_cc_unit_key key;
+   enum pipe_error ret;
 
    cc_unit_populate_key(brw, &key);
 
-   brw->sws->bo_unreference(brw->cc.state_bo);
-   brw->cc.state_bo = brw_search_cache(&brw->cache, BRW_CC_UNIT,
-				       &key, sizeof(key),
-				       &brw->cc.vp_bo, 1,
-				       NULL);
-
-   if (brw->cc.state_bo == NULL)
-      brw->cc.state_bo = cc_unit_create_from_key(brw, &key);
+   if (brw_search_cache(&brw->cache, BRW_CC_UNIT,
+                        &key, sizeof(key),
+                        &brw->cc.vp_bo, 1,
+                        NULL,
+                        &brw->cc.state_bo))
+      return PIPE_OK;
+
+   ret = cc_unit_create_from_key(brw, &key, 
+                                 &brw->cc.state_bo);
+   if (ret)
+      return ret;
    
-   return 0;
+   return PIPE_OK;
 }
 
 const struct brw_tracked_state brw_cc_unit = {
diff --git a/src/gallium/drivers/i965/brw_clip.c b/src/gallium/drivers/i965/brw_clip.c
index 1a52fa771b..35e1d2fdbd 100644
--- a/src/gallium/drivers/i965/brw_clip.c
+++ b/src/gallium/drivers/i965/brw_clip.c
@@ -48,9 +48,12 @@
 #define BACK_UNFILLED_BIT   0x2
 
 
-static void compile_clip_prog( struct brw_context *brw,
-			     struct brw_clip_prog_key *key )
+static enum pipe_error
+compile_clip_prog( struct brw_context *brw,
+                   struct brw_clip_prog_key *key,
+                   struct brw_winsys_buffer **bo_out )
 {
+   enum pipe_error ret;
    struct brw_clip_compile c;
    const GLuint *program;
    GLuint program_size;
@@ -123,31 +126,39 @@ static void compile_clip_prog( struct brw_context *brw,
       break;
    default:
       assert(0);
-      return;
+      return PIPE_ERROR_BAD_INPUT;
    }
 
 	 
    /* get the program
     */
-   program = brw_get_program(&c.func, &program_size);
+   ret = brw_get_program(&c.func, &program, &program_size);
+   if (ret)
+      return ret;
 
    /* Upload
     */
-   brw->sws->bo_unreference(brw->clip.prog_bo);
-   brw->clip.prog_bo = brw_upload_cache( &brw->cache,
-					 BRW_CLIP_PROG,
-					 &c.key, sizeof(c.key),
-					 NULL, 0,
-					 program, program_size,
-					 &c.prog_data,
-					 &brw->clip.prog_data );
+   ret = brw_upload_cache( &brw->cache,
+                           BRW_CLIP_PROG,
+                           &c.key, sizeof(c.key),
+                           NULL, 0,
+                           program, program_size,
+                           &c.prog_data,
+                           &brw->clip.prog_data,
+                           bo_out );
+   if (ret)
+      return ret;
+
+   return PIPE_OK;
 }
 
 /* Calculate interpolants for triangle and line rasterization.
  */
-static int upload_clip_prog(struct brw_context *brw)
+static enum pipe_error
+upload_clip_prog(struct brw_context *brw)
 {
+   enum pipe_error ret;
    struct brw_clip_prog_key key;
 
    /* Populate the key, starting from the almost-complete version from
@@ -166,15 +177,22 @@ static int upload_clip_prog(struct brw_context *brw)
    /* PIPE_NEW_CLIP */
    key.nr_userclip = brw->curr.ucp.nr;
 
-   brw->sws->bo_unreference(brw->clip.prog_bo);
-   brw->clip.prog_bo = brw_search_cache(&brw->cache, BRW_CLIP_PROG,
-					&key, sizeof(key),
-					NULL, 0,
-					&brw->clip.prog_data);
-   if (brw->clip.prog_bo == NULL)
-      compile_clip_prog( brw, &key );
+   /* Already cached?
+    */
+   if (brw_search_cache(&brw->cache, BRW_CLIP_PROG,
+                        &key, sizeof(key),
+                        NULL, 0,
+                        &brw->clip.prog_data,
+                        &brw->clip.prog_bo))
+      return PIPE_OK;
+
+   /* Compile new program:
+    */
+   ret = compile_clip_prog( brw, &key, &brw->clip.prog_bo );
+   if (ret)
+      return ret;
 
-   return 0;
+   return PIPE_OK;
 }
 
 
diff --git a/src/gallium/drivers/i965/brw_clip_state.c b/src/gallium/drivers/i965/brw_clip_state.c
index 6f8309fea9..d4e3c43c61 100644
--- a/src/gallium/drivers/i965/brw_clip_state.c
+++ b/src/gallium/drivers/i965/brw_clip_state.c
@@ -72,12 +72,13 @@ clip_unit_populate_key(struct brw_context *brw, struct brw_clip_unit_key *key)
    key->depth_clamp = 0; // XXX: add this to gallium: ctx->Transform.DepthClamp;
 }
 
-static struct brw_winsys_buffer *
+static enum pipe_error
 clip_unit_create_from_key(struct brw_context *brw,
-			  struct brw_clip_unit_key *key)
+                          struct brw_clip_unit_key *key,
+                          struct brw_winsys_buffer **bo_out)
 {
    struct brw_clip_unit_state clip;
-   struct brw_winsys_buffer *bo;
+   enum pipe_error ret;
 
    memset(&clip, 0, sizeof(clip));
 
@@ -141,39 +142,50 @@ clip_unit_create_from_key(struct brw_context *brw,
    clip.viewport_ymin = -1;
    clip.viewport_ymax = 1;
 
-   bo = brw_upload_cache(&brw->cache, BRW_CLIP_UNIT,
-			 key, sizeof(*key),
-			 &brw->clip.prog_bo, 1,
-			 &clip, sizeof(clip),
-			 NULL, NULL);
+   ret = brw_upload_cache(&brw->cache, BRW_CLIP_UNIT,
+                          key, sizeof(*key),
+                          &brw->clip.prog_bo, 1,
+                          &clip, sizeof(clip),
+                          NULL, NULL,
+                          bo_out);
+   if (ret)
+      return ret;
 
    /* Emit clip program relocation */
    assert(brw->clip.prog_bo);
-   brw->sws->bo_emit_reloc(bo,
-			   BRW_USAGE_STATE,
-			   clip.thread0.grf_reg_count << 1,
-			   offsetof(struct brw_clip_unit_state, thread0),
-			   brw->clip.prog_bo);
-
-   return bo;
+   ret = brw->sws->bo_emit_reloc(*bo_out,
+                                 BRW_USAGE_STATE,
+                                 clip.thread0.grf_reg_count << 1,
+                                 offsetof(struct brw_clip_unit_state, thread0),
+                                 brw->clip.prog_bo);
+   if (ret)
+      return ret;
+
+   return PIPE_OK;
 }
 
 static int upload_clip_unit( struct brw_context *brw )
 {
    struct brw_clip_unit_key key;
+   enum pipe_error ret;
 
    clip_unit_populate_key(brw, &key);
 
-   brw->sws->bo_unreference(brw->clip.state_bo);
-   brw->clip.state_bo = brw_search_cache(&brw->cache, BRW_CLIP_UNIT,
-					 &key, sizeof(key),
-					 &brw->clip.prog_bo, 1,
-					 NULL);
-   if (brw->clip.state_bo == NULL) {
-      brw->clip.state_bo = clip_unit_create_from_key(brw, &key);
-   }
+   if (brw_search_cache(&brw->cache, BRW_CLIP_UNIT,
+                        &key, sizeof(key),
+                        &brw->clip.prog_bo, 1,
+                        NULL,
+                        &brw->clip.state_bo))
+      return PIPE_OK;
+      
+   /* Create new:
+    */
+   ret = clip_unit_create_from_key(brw, &key, 
+                                   &brw->clip.state_bo);
+   if (ret)
+      return ret;
    
-   return 0;
+   return PIPE_OK;
 }
 
 const struct brw_tracked_state brw_clip_unit = {
diff --git a/src/gallium/drivers/i965/brw_context.c b/src/gallium/drivers/i965/brw_context.c
index aaf7d1834e..2cee7a7a3c 100644
--- a/src/gallium/drivers/i965/brw_context.c
+++ b/src/gallium/drivers/i965/brw_context.c
@@ -72,29 +72,33 @@ static void brw_destroy_context( struct pipe_context *pipe )
    brw->curr.fb.nr_cbufs = 0;
    pipe_surface_reference(&brw->curr.fb.zsbuf, NULL);
 
-   brw->sws->bo_unreference(brw->curbe.curbe_bo);
-   brw->sws->bo_unreference(brw->vs.prog_bo);
-   brw->sws->bo_unreference(brw->vs.state_bo);
-   brw->sws->bo_unreference(brw->vs.bind_bo);
-   brw->sws->bo_unreference(brw->gs.prog_bo);
-   brw->sws->bo_unreference(brw->gs.state_bo);
-   brw->sws->bo_unreference(brw->clip.prog_bo);
-   brw->sws->bo_unreference(brw->clip.state_bo);
-   brw->sws->bo_unreference(brw->clip.vp_bo);
-   brw->sws->bo_unreference(brw->sf.prog_bo);
-   brw->sws->bo_unreference(brw->sf.state_bo);
-   brw->sws->bo_unreference(brw->sf.vp_bo);
+   bo_reference(&brw->curbe.curbe_bo, NULL);
+   bo_reference(&brw->vs.prog_bo, NULL);
+   bo_reference(&brw->vs.state_bo, NULL);
+   bo_reference(&brw->vs.bind_bo, NULL);
+   bo_reference(&brw->gs.prog_bo, NULL);
+   bo_reference(&brw->gs.state_bo, NULL);
+   bo_reference(&brw->clip.prog_bo, NULL);
+   bo_reference(&brw->clip.state_bo, NULL);
+   bo_reference(&brw->clip.vp_bo, NULL);
+   bo_reference(&brw->sf.prog_bo, NULL);
+   bo_reference(&brw->sf.state_bo, NULL);
+   bo_reference(&brw->sf.vp_bo, NULL);
+
    for (i = 0; i < BRW_MAX_TEX_UNIT; i++)
-      brw->sws->bo_unreference(brw->wm.sdc_bo[i]);
-   brw->sws->bo_unreference(brw->wm.bind_bo);
+      bo_reference(&brw->wm.sdc_bo[i], NULL);
+
+   bo_reference(&brw->wm.bind_bo, NULL);
+
    for (i = 0; i < BRW_WM_MAX_SURF; i++)
-      brw->sws->bo_unreference(brw->wm.surf_bo[i]);
-   brw->sws->bo_unreference(brw->wm.sampler_bo);
-   brw->sws->bo_unreference(brw->wm.prog_bo);
-   brw->sws->bo_unreference(brw->wm.state_bo);
-   brw->sws->bo_unreference(brw->cc.prog_bo);
-   brw->sws->bo_unreference(brw->cc.state_bo);
-   brw->sws->bo_unreference(brw->cc.vp_bo);
+      bo_reference(&brw->wm.surf_bo[i], NULL);
+
+   bo_reference(&brw->wm.sampler_bo, NULL);
+   bo_reference(&brw->wm.prog_bo, NULL);
+   bo_reference(&brw->wm.state_bo, NULL);
+   bo_reference(&brw->cc.prog_bo, NULL);
+   bo_reference(&brw->cc.state_bo, NULL);
+   bo_reference(&brw->cc.vp_bo, NULL);
 }
 
 
diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h
index 09d34615c7..580251d2f1 100644
--- a/src/gallium/drivers/i965/brw_context.h
+++ b/src/gallium/drivers/i965/brw_context.h
@@ -744,7 +744,7 @@ struct brw_context
  * brw_queryobj.c
  */
 void brw_init_query(struct brw_context *brw);
-void brw_prepare_query_begin(struct brw_context *brw);
+enum pipe_error brw_prepare_query_begin(struct brw_context *brw);
 void brw_emit_query_begin(struct brw_context *brw);
 void brw_emit_query_end(struct brw_context *brw);
 
diff --git a/src/gallium/drivers/i965/brw_curbe.c b/src/gallium/drivers/i965/brw_curbe.c
index 1e2e232204..ca7774a7cc 100644
--- a/src/gallium/drivers/i965/brw_curbe.c
+++ b/src/gallium/drivers/i965/brw_curbe.c
@@ -160,10 +160,11 @@ static GLfloat fixed_plane[6][4] = {
  * cache mechanism, but maybe would benefit from a comparison against
  * the current uploaded set of constants.
  */
-static int prepare_curbe_buffer(struct brw_context *brw)
+static enum pipe_error prepare_curbe_buffer(struct brw_context *brw)
 {
    const GLuint sz = brw->curbe.total_size;
    const GLuint bufsz = sz * 16 * sizeof(GLfloat);
+   enum pipe_error ret;
    GLfloat *buf;
    GLuint i;
 
@@ -267,17 +268,20 @@ static int prepare_curbe_buffer(struct brw_context *brw)
 	  (brw->curbe.need_new_bo ||
 	   brw->curbe.curbe_next_offset + bufsz > brw->curbe.curbe_bo->size))
       {
-	 brw->sws->bo_unreference(brw->curbe.curbe_bo);
-	 brw->curbe.curbe_bo = NULL;
+	 bo_reference(&brw->curbe.curbe_bo, NULL);
       }
 
       if (brw->curbe.curbe_bo == NULL) {
 	 /* Allocate a single page for CURBE entries for this batchbuffer.
 	  * They're generally around 64b.
 	  */
-	 brw->curbe.curbe_bo = brw->sws->bo_alloc(brw->sws, 
-						  BRW_BUFFER_TYPE_CURBE,
-						  4096, 1 << 6);
+	 ret = brw->sws->bo_alloc(brw->sws, 
+                                  BRW_BUFFER_TYPE_CURBE,
+                                  4096, 1 << 6,
+                                  &brw->curbe.curbe_bo);
+         if (ret)
+            return ret;
+
 	 brw->curbe.curbe_next_offset = 0;
       }
 
@@ -313,7 +317,7 @@ static int prepare_curbe_buffer(struct brw_context *brw)
    return 0;
 }
 
-static int emit_curbe_buffer(struct brw_context *brw)
+static enum pipe_error emit_curbe_buffer(struct brw_context *brw)
 {
    GLuint sz = brw->curbe.total_size;
 
diff --git a/src/gallium/drivers/i965/brw_draw.c b/src/gallium/drivers/i965/brw_draw.c
index 6d6b1c7c5c..88cb31ad54 100644
--- a/src/gallium/drivers/i965/brw_draw.c
+++ b/src/gallium/drivers/i965/brw_draw.c
@@ -280,6 +280,5 @@ void brw_draw_cleanup( struct brw_context *brw )
    u_upload_destroy( brw->vb.upload_vertex );
    u_upload_destroy( brw->vb.upload_index );
 
-   brw->sws->bo_unreference(brw->ib.bo);
-   brw->ib.bo = NULL;
+   bo_reference(&brw->ib.bo, NULL);
 }
diff --git a/src/gallium/drivers/i965/brw_draw_upload.c b/src/gallium/drivers/i965/brw_draw_upload.c
index 4fa7d549eb..188605a0c1 100644
--- a/src/gallium/drivers/i965/brw_draw_upload.c
+++ b/src/gallium/drivers/i965/brw_draw_upload.c
@@ -251,9 +251,8 @@ static int brw_prepare_vertices(struct brw_context *brw)
       brw->vb.vb[i].vertex_count = (vb->stride == 0 ?
 				    1 :
 				    (bo->size - offset) / vb->stride);
-      brw->sws->bo_unreference(brw->vb.vb[i].bo);
-      brw->vb.vb[i].bo = bo;
-      brw->sws->bo_reference(brw->vb.vb[i].bo);
+
+      bo_reference( &brw->vb.vb[i].bo,  bo );
 
       /* Don't need to retain this reference.  We have a reference on
        * the underlying winsys buffer:
@@ -417,6 +416,7 @@ const struct brw_tracked_state brw_vertices = {
 static int brw_prepare_indices(struct brw_context *brw)
 {
    struct pipe_buffer *index_buffer = brw->curr.index_buffer;
+   struct pipe_buffer *upload_buf = NULL;
    struct brw_winsys_buffer *bo = NULL;
    GLuint offset;
    GLuint index_size;
@@ -438,7 +438,6 @@ static int brw_prepare_indices(struct brw_context *brw)
    /* Turn userbuffer into a proper hardware buffer?
     */
    if (brw_buffer_is_user_buffer(index_buffer)) {
-      struct pipe_buffer *upload_buf;
 
       ret = u_upload_buffer( brw->vb.upload_index,
 			     0,
@@ -450,8 +449,6 @@ static int brw_prepare_indices(struct brw_context *brw)
 	 return ret;
 
       bo = brw_buffer(upload_buf)->bo;
-      brw->sws->bo_reference(bo);
-      pipe_buffer_reference( &upload_buf, NULL );
 
       /* XXX: annotate the userbuffer with the upload information so
        * that successive calls don't get re-uploaded.
@@ -459,8 +456,6 @@ static int brw_prepare_indices(struct brw_context *brw)
    }
    else {
       bo = brw_buffer(index_buffer)->bo;
-      brw->sws->bo_reference(bo);
-      
       ib_size = bo->size;
       offset = 0;
    }
@@ -486,15 +481,12 @@ static int brw_prepare_indices(struct brw_context *brw)
    if (brw->ib.bo != bo ||
        brw->ib.size != ib_size)
    {
-      brw->sws->bo_unreference(brw->ib.bo);
-      brw->ib.bo = bo;
+      bo_reference(&brw->ib.bo, bo);
       brw->ib.size = ib_size;
       brw->state.dirty.brw |= BRW_NEW_INDEX_BUFFER;
    }
-   else {
-      brw->sws->bo_unreference(bo);
-   }
 
+   pipe_buffer_reference( &upload_buf, NULL );
    brw_add_validated_bo(brw, brw->ib.bo);
    return 0;
 }
diff --git a/src/gallium/drivers/i965/brw_eu.c b/src/gallium/drivers/i965/brw_eu.c
index de43b14512..a8fcb5f97e 100644
--- a/src/gallium/drivers/i965/brw_eu.c
+++ b/src/gallium/drivers/i965/brw_eu.c
@@ -118,16 +118,23 @@ void brw_init_compile( struct brw_context *brw, struct brw_compile *p )
 }
 
 
-const GLuint *brw_get_program( struct brw_compile *p,
-			       GLuint *sz )
+enum pipe_error brw_get_program( struct brw_compile *p,
+                                 const GLuint **data,
+                                 GLuint *sz )
 {
    GLuint i;
 
    for (i = 0; i < 8; i++)
       brw_NOP(p);
 
+   /* Is the generated program malformed for some reason?
+    */
+   if (p->error)
+      return PIPE_ERROR_BAD_INPUT;
+
    *sz = p->nr_insn * sizeof(struct brw_instruction);
-   return (const GLuint *)p->store;
+   *data = (const GLuint *)p->store;
+   return PIPE_OK;
 }
 
 
diff --git a/src/gallium/drivers/i965/brw_eu.h b/src/gallium/drivers/i965/brw_eu.h
index 7bddc3859c..565f4ef1c5 100644
--- a/src/gallium/drivers/i965/brw_eu.h
+++ b/src/gallium/drivers/i965/brw_eu.h
@@ -34,6 +34,7 @@
 #define BRW_EU_H
 
 #include "util/u_debug.h"
+#include "pipe/p_error.h"
 
 #include "brw_structs.h"
 #include "brw_defines.h"
@@ -132,6 +133,8 @@ struct brw_compile {
 
    struct brw_eu_label *first_label;  /**< linked list of labels */
    struct brw_eu_call *first_call;    /**< linked list of CALs */
+
+   boolean error;
 };
 
 
@@ -772,7 +775,10 @@ void brw_set_predicate_control( struct brw_compile *p, GLuint pc );
 void brw_set_conditionalmod( struct brw_compile *p, GLuint conditional );
 
 void brw_init_compile( struct brw_context *, struct brw_compile *p );
-const GLuint *brw_get_program( struct brw_compile *p, GLuint *sz );
+
+enum pipe_error brw_get_program( struct brw_compile *p, 
+                                 const GLuint **program,
+                                 GLuint *sz );
 
 
 /* Helpers for regular instructions:
diff --git a/src/gallium/drivers/i965/brw_gs.c b/src/gallium/drivers/i965/brw_gs.c
index 693d8bfdf8..ce77be24f6 100644
--- a/src/gallium/drivers/i965/brw_gs.c
+++ b/src/gallium/drivers/i965/brw_gs.c
@@ -40,10 +40,12 @@
 
 
-static void compile_gs_prog( struct brw_context *brw,
-			     struct brw_gs_prog_key *key )
+static enum pipe_error compile_gs_prog( struct brw_context *brw,
+                                        struct brw_gs_prog_key *key,
+                                        struct brw_winsys_buffer **bo_out )
 {
    struct brw_gs_compile c;
+   enum pipe_error ret;
    const GLuint *program;
    GLuint program_size;
 
@@ -57,9 +59,9 @@ static void compile_gs_prog( struct brw_context *brw,
    c.nr_attrs = c.key.nr_attrs;
 
    if (BRW_IS_IGDNG(brw))
-       c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
+      c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
    else
-       c.nr_regs = (c.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */
+      c.nr_regs = (c.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */
 
    c.nr_bytes = c.nr_regs * REG_SIZE;
 
@@ -93,40 +95,47 @@ static void compile_gs_prog( struct brw_context *brw,
       if (key->hint_gs_always)
 	 brw_gs_lines( &c );
       else {
-	 return;
+	 return PIPE_OK;
       }
       break;
    case PIPE_PRIM_TRIANGLES:
       if (key->hint_gs_always)
 	 brw_gs_tris( &c );
       else {
-	 return;
+	 return PIPE_OK;
       }
       break;
    case PIPE_PRIM_POINTS:
       if (key->hint_gs_always)
 	 brw_gs_points( &c );
       else {
-	 return;
+	 return PIPE_OK;
       }
-      break;      
+      break;
    default:
-      return;
+      assert(0);
+      return PIPE_ERROR_BAD_INPUT;
    }
 
    /* get the program
     */
-   program = brw_get_program(&c.func, &program_size);
+   ret = brw_get_program(&c.func, &program, &program_size);
+   if (ret)
+      return ret;
 
    /* Upload
     */
-   brw->sws->bo_unreference(brw->gs.prog_bo);
-   brw->gs.prog_bo = brw_upload_cache( &brw->cache, BRW_GS_PROG,
-				       &c.key, sizeof(c.key),
-				       NULL, 0,
-				       program, program_size,
-				       &c.prog_data,
-				       &brw->gs.prog_data );
+   ret = brw_upload_cache( &brw->cache, BRW_GS_PROG,
+                           &c.key, sizeof(c.key),
+                           NULL, 0,
+                           program, program_size,
+                           &c.prog_data,
+                           &brw->gs.prog_data,
+                           bo_out );
+   if (ret)
+      return ret;
+
+   return PIPE_OK;
 }
 
 static const unsigned gs_prim[PIPE_PRIM_MAX] = {  
@@ -166,6 +175,8 @@ static void populate_key( struct brw_context *brw,
 static int prepare_gs_prog(struct brw_context *brw)
 {
    struct brw_gs_prog_key key;
+   enum pipe_error ret;
+
    /* Populate the key:
     */
    populate_key(brw, &key);
@@ -175,17 +186,21 @@ static int prepare_gs_prog(struct brw_context *brw)
       brw->gs.prog_active = key.need_gs_prog;
    }
 
-   if (brw->gs.prog_active) {
-      brw->sws->bo_unreference(brw->gs.prog_bo);
-      brw->gs.prog_bo = brw_search_cache(&brw->cache, BRW_GS_PROG,
-					 &key, sizeof(key),
-					 NULL, 0,
-					 &brw->gs.prog_data);
-      if (brw->gs.prog_bo == NULL)
-	 compile_gs_prog( brw, &key );
-   }
+   if (!brw->gs.prog_active)
+      return PIPE_OK;
+
+   if (brw_search_cache(&brw->cache, BRW_GS_PROG,
+                        &key, sizeof(key),
+                        NULL, 0,
+                        &brw->gs.prog_data,
+                        &brw->gs.prog_bo))
+      return PIPE_OK;
+
+   ret = compile_gs_prog( brw, &key, &brw->gs.prog_bo );
+   if (ret)
+      return ret;
 
-   return 0;
+   return PIPE_OK;
 }
 
 
diff --git a/src/gallium/drivers/i965/brw_gs_state.c b/src/gallium/drivers/i965/brw_gs_state.c
index f27f886a65..18a66da538 100644
--- a/src/gallium/drivers/i965/brw_gs_state.c
+++ b/src/gallium/drivers/i965/brw_gs_state.c
@@ -69,11 +69,13 @@ gs_unit_populate_key(struct brw_context *brw, struct brw_gs_unit_key *key)
    key->urb_size = brw->urb.vsize;
 }
 
-static struct brw_winsys_buffer *
-gs_unit_create_from_key(struct brw_context *brw, struct brw_gs_unit_key *key)
+static enum pipe_error
+gs_unit_create_from_key(struct brw_context *brw, 
+                        struct brw_gs_unit_key *key,
+                        struct brw_winsys_buffer **bo_out)
 {
    struct brw_gs_unit_state gs;
-   struct brw_winsys_buffer *bo;
+   enum pipe_error ret;
 
    memset(&gs, 0, sizeof(gs));
 
@@ -104,40 +106,49 @@ gs_unit_create_from_key(struct brw_context *brw, struct brw_gs_unit_key *key)
    if (BRW_DEBUG & DEBUG_STATS)
       gs.thread4.stats_enable = 1;
 
-   bo = brw_upload_cache(&brw->cache, BRW_GS_UNIT,
-			 key, sizeof(*key),
-			 &brw->gs.prog_bo, 1,
-			 &gs, sizeof(gs),
-			 NULL, NULL);
+   ret = brw_upload_cache(&brw->cache, BRW_GS_UNIT,
+                          key, sizeof(*key),
+                          &brw->gs.prog_bo, 1,
+                          &gs, sizeof(gs),
+                          NULL, NULL,
+                          bo_out);
+   if (ret)
+      return ret;
 
    if (key->prog_active) {
       /* Emit GS program relocation */
-      brw->sws->bo_emit_reloc(bo,
+      ret = brw->sws->bo_emit_reloc(*bo_out,
 			      BRW_USAGE_STATE,
 			      gs.thread0.grf_reg_count << 1,
 			      offsetof(struct brw_gs_unit_state, thread0),
 			      brw->gs.prog_bo);
+      /* Report a failed relocation to the caller instead of dropping it. */
+      if (ret)
+         return ret;
    }
 
-   return bo;
+   return PIPE_OK;
 }
 
-static int prepare_gs_unit(struct brw_context *brw)
+static enum pipe_error prepare_gs_unit(struct brw_context *brw)
 {
    struct brw_gs_unit_key key;
+   enum pipe_error ret;
 
    gs_unit_populate_key(brw, &key);
 
-   brw->sws->bo_unreference(brw->gs.state_bo);
-   brw->gs.state_bo = brw_search_cache(&brw->cache, BRW_GS_UNIT,
-				       &key, sizeof(key),
-				       &brw->gs.prog_bo, 1,
-				       NULL);
-   if (brw->gs.state_bo == NULL) {
-      brw->gs.state_bo = gs_unit_create_from_key(brw, &key);
-   }
+   if (brw_search_cache(&brw->cache, BRW_GS_UNIT,
+                        &key, sizeof(key),
+                        &brw->gs.prog_bo, 1,
+                        NULL,
+                        &brw->gs.state_bo))
+      return PIPE_OK;
+
+   ret = gs_unit_create_from_key(brw, &key, &brw->gs.state_bo);
+   if (ret)
+      return ret;
 
-   return 0;
+   return PIPE_OK;
 }
 
 const struct brw_tracked_state brw_gs_unit = {
diff --git a/src/gallium/drivers/i965/brw_pipe_query.c b/src/gallium/drivers/i965/brw_pipe_query.c
index 3370ebd262..6a01173787 100644
--- a/src/gallium/drivers/i965/brw_pipe_query.c
+++ b/src/gallium/drivers/i965/brw_pipe_query.c
@@ -72,8 +72,7 @@ brw_query_get_result(struct pipe_context *pipe,
       }
 
       brw->sws->bo_unmap(query->bo);
-      brw->sws->bo_unreference(query->bo);
-      query->bo = NULL;
+      bo_reference(&query->bo, NULL);
    }
 
    *result = query->result;
@@ -100,10 +99,9 @@ brw_query_create(struct pipe_context *pipe, unsigned type )
 static void
 brw_query_destroy(struct pipe_context *pipe, struct pipe_query *q)
 {
-   struct brw_context *brw = brw_context(pipe);
    struct brw_query_object *query = (struct brw_query_object *)q;
 
-   brw->sws->bo_unreference(query->bo);
+   bo_reference(&query->bo, NULL);
    FREE(query);
 }
 
@@ -114,9 +112,8 @@ brw_query_begin(struct pipe_context *pipe, struct pipe_query *q)
    struct brw_query_object *query = (struct brw_query_object *)q;
 
    /* Reset our driver's tracking of query state. */
-   brw->sws->bo_unreference(query->bo);
+   bo_reference(&query->bo, NULL);
    query->result = 0;
-   query->bo = NULL;
    query->first_index = -1;
    query->last_index = -1;
 
@@ -139,8 +136,7 @@ brw_query_end(struct pipe_context *pipe, struct pipe_query *q)
       brw_emit_query_end(brw);
       brw_context_flush( brw );
 
-      brw->sws->bo_unreference(brw->query.bo);
-      brw->query.bo = NULL;
+      bo_reference(&brw->query.bo, NULL);
    }
 
    remove_from_list(query);
@@ -153,24 +149,30 @@ brw_query_end(struct pipe_context *pipe, struct pipe_query *q)
  */
 
 /** Called to set up the query BO and account for its aperture space */
-void
+enum pipe_error
 brw_prepare_query_begin(struct brw_context *brw)
 {
+   enum pipe_error ret;
+
    /* Skip if we're not doing any queries. */
    if (is_empty_list(&brw->query.active_head))
-      return;
+      return PIPE_OK;
 
    /* Get a new query BO if we're going to need it. */
    if (brw->query.bo == NULL ||
        brw->query.index * 2 + 1 >= 4096 / sizeof(uint64_t)) {
-      brw->sws->bo_unreference(brw->query.bo);
-      brw->query.bo = NULL;
 
-      brw->query.bo = brw->sws->bo_alloc(brw->sws, BRW_BUFFER_TYPE_QUERY, 4096, 1);
+      ret = brw->sws->bo_alloc(brw->sws, BRW_BUFFER_TYPE_QUERY, 4096, 1,
+                               &brw->query.bo);
+      if (ret)
+         return ret;
+
       brw->query.index = 0;
    }
 
    brw_add_validated_bo(brw, brw->query.bo);
+
+   return PIPE_OK;
 }
 
 /** Called just before primitive drawing to get a beginning PS_DEPTH_COUNT. */
@@ -213,8 +215,7 @@ brw_emit_query_begin(struct brw_context *brw)
 				  FALSE,
 				  &tmp );
 
-	 brw->sws->bo_reference(brw->query.bo);
-	 query->bo = brw->query.bo;
+	 bo_reference( &query->bo, brw->query.bo );
 	 query->first_index = brw->query.index;
       }
       query->last_index = brw->query.index;
diff --git a/src/gallium/drivers/i965/brw_pipe_shader.c b/src/gallium/drivers/i965/brw_pipe_shader.c
index 2833f2bce0..662c43c3e5 100644
--- a/src/gallium/drivers/i965/brw_pipe_shader.c
+++ b/src/gallium/drivers/i965/brw_pipe_shader.c
@@ -146,10 +146,9 @@ fail:
 
 static void brw_delete_fs_state( struct pipe_context *pipe, void *prog )
 {
-   struct brw_context *brw = brw_context(pipe);
    struct brw_fragment_shader *fs = (struct brw_fragment_shader *)prog;
 
-   brw->sws->bo_unreference(fs->const_buffer);
+   bo_reference(&fs->const_buffer, NULL);
    FREE( (void *)fs->tokens );
    FREE( fs );
 }
diff --git a/src/gallium/drivers/i965/brw_pipe_vertex.c b/src/gallium/drivers/i965/brw_pipe_vertex.c
index 97e9a23688..73bba5b088 100644
--- a/src/gallium/drivers/i965/brw_pipe_vertex.c
+++ b/src/gallium/drivers/i965/brw_pipe_vertex.c
@@ -56,7 +56,7 @@ brw_pipe_vertex_cleanup( struct brw_context *brw )
     */
 #if 0
    for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
-      brw->sws->bo_unreference(brw->vb.inputs[i].bo);
+      bo_reference(&brw->vb.inputs[i].bo, NULL);
       brw->vb.inputs[i].bo = NULL;
    }
 #endif
diff --git a/src/gallium/drivers/i965/brw_screen_buffers.c b/src/gallium/drivers/i965/brw_screen_buffers.c
index ba54740225..7ae386ffb3 100644
--- a/src/gallium/drivers/i965/brw_screen_buffers.c
+++ b/src/gallium/drivers/i965/brw_screen_buffers.c
@@ -43,15 +43,11 @@ brw_buffer_unmap( struct pipe_screen *screen,
 static void
 brw_buffer_destroy( struct pipe_buffer *buffer )
 {
-   struct brw_screen *bscreen = brw_screen( buffer->screen );
-   struct brw_winsys_screen *sws = bscreen->sws;
    struct brw_buffer *buf = brw_buffer( buffer );
 
    assert(!p_atomic_read(&buffer->reference.count));
 
-   if (buf->bo)
-      sws->bo_unreference(buf->bo);
-   
+   bo_reference(&buf->bo, NULL);
    FREE(buf);
 }
 
@@ -66,6 +62,7 @@ brw_buffer_create(struct pipe_screen *screen,
    struct brw_winsys_screen *sws = bscreen->sws;
    struct brw_buffer *buf;
    unsigned buffer_type;
+   enum pipe_error ret;
    
    buf = CALLOC_STRUCT(brw_buffer);
    if (!buf)
@@ -101,10 +98,14 @@ brw_buffer_create(struct pipe_screen *screen,
       break;
    }
    
-   buf->bo = sws->bo_alloc( sws,
-                            buffer_type,
-                            size,
-                            alignment );
+   ret = sws->bo_alloc( sws, buffer_type,
+                        size, alignment,
+                        &buf->bo );
+   if (ret != PIPE_OK) {
+      /* Allocation failed: release the wrapper struct before bailing out. */
+      FREE(buf);
+      return NULL;
+   }
       
    return &buf->base; 
 }
diff --git a/src/gallium/drivers/i965/brw_screen_surface.c b/src/gallium/drivers/i965/brw_screen_surface.c
index 1c408e9f2e..21a7382873 100644
--- a/src/gallium/drivers/i965/brw_screen_surface.c
+++ b/src/gallium/drivers/i965/brw_screen_surface.c
@@ -150,9 +150,7 @@ static struct brw_surface *create_in_place_view( struct brw_screen *brw_screen,
    surface->pitch = tex->pitch;
    surface->tiling = tex->tiling;
 
-   surface->bo = tex->bo;
-   brw_screen->sws->bo_reference(surface->bo);
-
+   bo_reference( &surface->bo, tex->bo );
    pipe_texture_reference( &surface->base.texture, &tex->base );
 
    surface->ss.ss0.surface_format = tex->ss.ss0.surface_format;
@@ -244,11 +242,10 @@ static struct pipe_surface *brw_get_tex_surface(struct pipe_screen *screen,
 static void brw_tex_surface_destroy( struct pipe_surface *surf )
 {
    struct brw_surface *surface = brw_surface(surf);
-   struct brw_screen *screen = brw_screen(surf->texture->screen);
 
    /* Unreference texture, shared buffer:
     */
-   screen->sws->bo_unreference(surface->bo);
+   bo_reference(&surface->bo, NULL);
    pipe_texture_reference( &surface->base.texture, NULL );
 
 
diff --git a/src/gallium/drivers/i965/brw_screen_texture.c b/src/gallium/drivers/i965/brw_screen_texture.c
index ba6dc7dfde..355abf0b89 100644
--- a/src/gallium/drivers/i965/brw_screen_texture.c
+++ b/src/gallium/drivers/i965/brw_screen_texture.c
@@ -187,6 +187,7 @@ static struct pipe_texture *brw_texture_create( struct pipe_screen *screen,
    struct brw_screen *bscreen = brw_screen(screen);
    struct brw_texture *tex;
    enum brw_buffer_type buffer_type;
+   enum pipe_error ret;
    
    tex = CALLOC_STRUCT(brw_texture);
    if (tex == NULL)
@@ -235,10 +236,13 @@ static struct pipe_texture *brw_texture_create( struct pipe_screen *screen,
       buffer_type = BRW_BUFFER_TYPE_TEXTURE;
    }
 
-   tex->bo = bscreen->sws->bo_alloc( bscreen->sws,
-                                     buffer_type,
-                                     tex->pitch * tex->total_height * tex->cpp,
-                                     64 );
+   ret = bscreen->sws->bo_alloc( bscreen->sws,
+                                 buffer_type,
+                                 tex->pitch * tex->total_height * tex->cpp,
+                                 64,
+                                 &tex->bo );
+   if (ret)
+      goto fail;
 
    tex->ss.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW;
    tex->ss.ss0.surface_type = translate_tex_target(tex->base.target);
@@ -289,7 +293,7 @@ static struct pipe_texture *brw_texture_create( struct pipe_screen *screen,
    return &tex->base;
 
 fail:
-   bscreen->sws->bo_unreference(tex->bo);
+   bo_reference(&tex->bo, NULL);
    FREE(tex);
    return NULL;
 }
@@ -306,7 +310,8 @@ static struct pipe_texture *brw_texture_blanket(struct pipe_screen *screen,
 
 static void brw_texture_destroy(struct pipe_texture *pt)
 {
-   //bscreen->sws->bo_unreference(tex->bo);
+   struct brw_texture *tex = brw_texture(pt);
+   bo_reference(&tex->bo, NULL);
    FREE(pt);
 }
 
diff --git a/src/gallium/drivers/i965/brw_sf.c b/src/gallium/drivers/i965/brw_sf.c
index 013d839e37..24d1015bbd 100644
--- a/src/gallium/drivers/i965/brw_sf.c
+++ b/src/gallium/drivers/i965/brw_sf.c
@@ -40,9 +40,11 @@
 #include "brw_sf.h"
 #include "brw_state.h"
 
-static void compile_sf_prog( struct brw_context *brw,
-			     struct brw_sf_prog_key *key )
+static enum pipe_error compile_sf_prog( struct brw_context *brw,
+                                        struct brw_sf_prog_key *key,
+                                        struct brw_winsys_buffer **bo_out )
 {
+   enum pipe_error ret;
    struct brw_sf_compile c;
    const GLuint *program;
    GLuint program_size;
@@ -87,28 +89,35 @@ static void compile_sf_prog( struct brw_context *brw,
       break;
    default:
       assert(0);
-      return;
+      return PIPE_ERROR_BAD_INPUT;
    }
 
    /* get the program
     */
-   program = brw_get_program(&c.func, &program_size);
+   ret = brw_get_program(&c.func, &program, &program_size);
+   if (ret)
+      return ret;
 
    /* Upload
     */
-   brw->sws->bo_unreference(brw->sf.prog_bo);
-   brw->sf.prog_bo = brw_upload_cache( &brw->cache, BRW_SF_PROG,
-				       &c.key, sizeof(c.key),
-				       NULL, 0,
-				       program, program_size,
-				       &c.prog_data,
-				       &brw->sf.prog_data );
+   ret = brw_upload_cache( &brw->cache, BRW_SF_PROG,
+                           &c.key, sizeof(c.key),
+                           NULL, 0,
+                           program, program_size,
+                           &c.prog_data,
+                           &brw->sf.prog_data,
+                           bo_out);
+   if (ret)
+      return ret;
+
+   return PIPE_OK;
 }
 
 /* Calculate interpolants for triangle and line rasterization.
  */
-static int upload_sf_prog(struct brw_context *brw)
+static enum pipe_error upload_sf_prog(struct brw_context *brw)
 {
+   enum pipe_error ret;
    struct brw_sf_prog_key key;
 
    memset(&key, 0, sizeof(key));
@@ -161,15 +170,18 @@ static int upload_sf_prog(struct brw_context *brw)
 			   PIPE_WINDING_CCW);
    }
 
-   brw->sws->bo_unreference(brw->sf.prog_bo);
-   brw->sf.prog_bo = brw_search_cache(&brw->cache, BRW_SF_PROG,
-				      &key, sizeof(key),
-				      NULL, 0,
-				      &brw->sf.prog_data);
-   if (brw->sf.prog_bo == NULL)
-      compile_sf_prog( brw, &key );
+   if (brw_search_cache(&brw->cache, BRW_SF_PROG,
+                        &key, sizeof(key),
+                        NULL, 0,
+                        &brw->sf.prog_data,
+                        &brw->sf.prog_bo))
+      return PIPE_OK;
 
-   return 0;
+   ret = compile_sf_prog( brw, &key, &brw->sf.prog_bo );
+   if (ret)
+      return ret;
+
+   return PIPE_OK;
 }
 
 
diff --git a/src/gallium/drivers/i965/brw_sf_state.c b/src/gallium/drivers/i965/brw_sf_state.c
index 31343ff245..f030f26c19 100644
--- a/src/gallium/drivers/i965/brw_sf_state.c
+++ b/src/gallium/drivers/i965/brw_sf_state.c
@@ -39,11 +39,12 @@
 #include "brw_debug.h"
 #include "brw_pipe_rast.h"
 
-static int upload_sf_vp(struct brw_context *brw)
+static enum pipe_error upload_sf_vp(struct brw_context *brw)
 {
    const struct pipe_viewport_state *vp = &brw->curr.vp;
    const struct pipe_scissor_state *scissor = &brw->curr.scissor;
    struct brw_sf_viewport sfv;
+   enum pipe_error ret;
 
    memset(&sfv, 0, sizeof(sfv));
 
@@ -61,10 +62,12 @@ static int upload_sf_vp(struct brw_context *brw)
    sfv.scissor.ymin = scissor->miny;
    sfv.scissor.ymax = scissor->maxy; /* -1 ?? */
 
-   brw->sws->bo_unreference(brw->sf.vp_bo);
-   brw->sf.vp_bo = brw_cache_data( &brw->cache, BRW_SF_VP, &sfv, NULL, 0 );
+   ret = brw_cache_data( &brw->cache, BRW_SF_VP, &sfv, NULL, 0,
+                         &brw->sf.vp_bo );
+   if (ret)
+      return ret;
 
-   return 0;
+   return PIPE_OK;
 }
 
 const struct brw_tracked_state brw_sf_vp = {
@@ -128,12 +131,13 @@ sf_unit_populate_key(struct brw_context *brw, struct brw_sf_unit_key *key)
 			   rast->point_size_max);
 }
 
-static struct brw_winsys_buffer *
+static enum pipe_error
 sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
-			struct brw_winsys_buffer **reloc_bufs)
+			struct brw_winsys_buffer **reloc_bufs,
+                        struct brw_winsys_buffer **bo_out)
 {
    struct brw_sf_unit_state sf;
-   struct brw_winsys_buffer *bo;
+   enum pipe_error ret;
    int chipset_max_threads;
    memset(&sf, 0, sizeof(sf));
 
@@ -273,51 +277,65 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
       sf.sf6.dest_org_hbias = 0x0;
    }
 
-   bo = brw_upload_cache(&brw->cache, BRW_SF_UNIT,
-			 key, sizeof(*key),
-			 reloc_bufs, 2,
-			 &sf, sizeof(sf),
-			 NULL, NULL);
+   ret = brw_upload_cache(&brw->cache, BRW_SF_UNIT,
+                          key, sizeof(*key),
+                          reloc_bufs, 2,
+                          &sf, sizeof(sf),
+                          NULL, NULL,
+                          bo_out);
+   if (ret)
+      return ret;
 
    /* STATE_PREFETCH command description describes this state as being
     * something loaded through the GPE (L2 ISC), so it's INSTRUCTION domain.
     */
    /* Emit SF program relocation */
-   brw->sws->bo_emit_reloc(bo,
-			   BRW_USAGE_STATE,
-			   sf.thread0.grf_reg_count << 1,
-			   offsetof(struct brw_sf_unit_state, thread0),
-			   brw->sf.prog_bo);
+   ret = brw->sws->bo_emit_reloc(*bo_out,
+                                 BRW_USAGE_STATE,
+                                 sf.thread0.grf_reg_count << 1,
+                                 offsetof(struct brw_sf_unit_state, thread0),
+                                 brw->sf.prog_bo);
+   if (ret)
+      return ret;
 
-   /* Emit SF viewport relocation */
-   brw->sws->bo_emit_reloc(bo,
-			   BRW_USAGE_STATE,
-			   sf.sf5.front_winding | (sf.sf5.viewport_transform << 1),
-			   offsetof(struct brw_sf_unit_state, sf5),
-			   brw->sf.vp_bo);
 
-   return bo;
+   /* Emit SF viewport relocation */
+   ret = brw->sws->bo_emit_reloc(*bo_out,
+                                 BRW_USAGE_STATE,
+                                 sf.sf5.front_winding | (sf.sf5.viewport_transform << 1),
+                                 offsetof(struct brw_sf_unit_state, sf5),
+                                 brw->sf.vp_bo);
+   if (ret)
+      return ret;
+   
+   return PIPE_OK;
 }
 
-static int upload_sf_unit( struct brw_context *brw )
+static enum pipe_error upload_sf_unit( struct brw_context *brw )
 {
    struct brw_sf_unit_key key;
    struct brw_winsys_buffer *reloc_bufs[2];
+   enum pipe_error ret;
 
    sf_unit_populate_key(brw, &key);
 
    reloc_bufs[0] = brw->sf.prog_bo;
    reloc_bufs[1] = brw->sf.vp_bo;
 
-   brw->sws->bo_unreference(brw->sf.state_bo);
-   brw->sf.state_bo = brw_search_cache(&brw->cache, BRW_SF_UNIT,
-				       &key, sizeof(key),
-				       reloc_bufs, 2,
-				       NULL);
-   if (brw->sf.state_bo == NULL) {
-      brw->sf.state_bo = sf_unit_create_from_key(brw, &key, reloc_bufs);
-   }
-   return 0;
+   if (brw_search_cache(&brw->cache, BRW_SF_UNIT,
+                        &key, sizeof(key),
+                        reloc_bufs, 2,
+                        NULL,
+                        &brw->sf.state_bo))
+      return PIPE_OK;
+
+
+   ret = sf_unit_create_from_key(brw, &key, reloc_bufs,
+                                 &brw->sf.state_bo);
+   if (ret)
+      return ret;
+
+   return PIPE_OK;
 }
 
 const struct brw_tracked_state brw_sf_unit = {
diff --git a/src/gallium/drivers/i965/brw_state.h b/src/gallium/drivers/i965/brw_state.h
index 94d2cb6f82..e219a1d870 100644
--- a/src/gallium/drivers/i965/brw_state.h
+++ b/src/gallium/drivers/i965/brw_state.h
@@ -44,8 +44,8 @@ brw_add_validated_bo(struct brw_context *brw, struct brw_winsys_buffer *bo)
    assert(brw->state.validated_bo_count < Elements(brw->state.validated_bos));
 
    if (bo != NULL) {
-      brw->sws->bo_reference(bo);
-      brw->state.validated_bos[brw->state.validated_bo_count++] = bo;
+      bo_reference( &brw->state.validated_bos[brw->state.validated_bo_count++],
+                    bo );
    }
 }
 
@@ -106,37 +106,42 @@ void brw_destroy_state(struct brw_context *brw);
 /***********************************************************************
  * brw_state_cache.c
  */
-struct brw_winsys_buffer *brw_cache_data(struct brw_cache *cache,
-		       enum brw_cache_id cache_id,
-		       const void *data,
-		       struct brw_winsys_buffer **reloc_bufs,
-		       GLuint nr_reloc_bufs);
-
-struct brw_winsys_buffer *brw_cache_data_sz(struct brw_cache *cache,
-			  enum brw_cache_id cache_id,
-			  const void *data,
-			  GLuint data_size,
-			  struct brw_winsys_buffer **reloc_bufs,
-			  GLuint nr_reloc_bufs);
-
-struct brw_winsys_buffer *brw_upload_cache( struct brw_cache *cache,
-			  enum brw_cache_id cache_id,
-			  const void *key,
-			  GLuint key_sz,
-			  struct brw_winsys_buffer **reloc_bufs,
-			  GLuint nr_reloc_bufs,
-			  const void *data,
-			  GLuint data_sz,
-			  const void *aux,
-			  void *aux_return );
-
-struct brw_winsys_buffer *brw_search_cache( struct brw_cache *cache,
-			  enum brw_cache_id cache_id,
-			  const void *key,
-			  GLuint key_size,
-			  struct brw_winsys_buffer **reloc_bufs,
-			  GLuint nr_reloc_bufs,
-			  void *aux_return);
+enum pipe_error brw_cache_data(struct brw_cache *cache,
+                               enum brw_cache_id cache_id,
+                               const void *data,
+                               struct brw_winsys_buffer **reloc_bufs,
+                               GLuint nr_reloc_bufs,
+                               struct brw_winsys_buffer **bo_out );
+
+enum pipe_error brw_cache_data_sz(struct brw_cache *cache,
+                                  enum brw_cache_id cache_id,
+                                  const void *data,
+                                  GLuint data_size,
+                                  struct brw_winsys_buffer **reloc_bufs,
+                                  GLuint nr_reloc_bufs,
+                                  struct brw_winsys_buffer **bo_out);
+
+enum pipe_error brw_upload_cache( struct brw_cache *cache,
+                                  enum brw_cache_id cache_id,
+                                  const void *key,
+                                  GLuint key_sz,
+                                  struct brw_winsys_buffer **reloc_bufs,
+                                  GLuint nr_reloc_bufs,
+                                  const void *data,
+                                  GLuint data_sz,
+                                  const void *aux,
+                                  void *aux_return ,
+                                  struct brw_winsys_buffer **bo_out);
+
+boolean brw_search_cache( struct brw_cache *cache,
+                          enum brw_cache_id cache_id,
+                          const void *key,
+                          GLuint key_size,
+                          struct brw_winsys_buffer **reloc_bufs,
+                          GLuint nr_reloc_bufs,
+                          void *aux_return,
+                          struct brw_winsys_buffer **bo_out);
+
 void brw_state_cache_check_size( struct brw_context *brw );
 
 void brw_init_caches( struct brw_context *brw );
diff --git a/src/gallium/drivers/i965/brw_state_cache.c b/src/gallium/drivers/i965/brw_state_cache.c
index cbd1f02d77..f8369d31ec 100644
--- a/src/gallium/drivers/i965/brw_state_cache.c
+++ b/src/gallium/drivers/i965/brw_state_cache.c
@@ -109,9 +109,8 @@ update_cache_last(struct brw_cache *cache, enum brw_cache_id cache_id,
    if (bo == cache->last_bo[cache_id])
       return; /* no change */
 
-   cache->sws->bo_unreference(cache->last_bo[cache_id]);
-   cache->last_bo[cache_id] = bo;
-   cache->sws->bo_reference(cache->last_bo[cache_id]);
+   bo_reference( &cache->last_bo[cache_id],  bo );
+
    cache->brw->state.dirty.cache |= 1 << cache_id;
 }
 
@@ -174,14 +173,15 @@ rehash(struct brw_cache *cache)
 /**
  * Returns the buffer object matching cache_id and key, or NULL.
  */
-struct brw_winsys_buffer *
+boolean
 brw_search_cache(struct brw_cache *cache,
                  enum brw_cache_id cache_id,
                  const void *key,
                  GLuint key_size,
                  struct brw_winsys_buffer **reloc_bufs, 
 		 GLuint nr_reloc_bufs,
-                 void *aux_return)
+                 void *aux_return,
+                 struct brw_winsys_buffer **bo_out)
 {
    struct brw_cache_item *item;
    GLuint hash = hash_key(key, key_size, reloc_bufs, nr_reloc_bufs);
@@ -189,20 +189,20 @@ brw_search_cache(struct brw_cache *cache,
    item = search_cache(cache, cache_id, hash, key, key_size,
 		       reloc_bufs, nr_reloc_bufs);
 
-   if (item == NULL)
-      return NULL;
-
-   if (aux_return)
-      *(void **)aux_return = (void *)((char *)item->key + item->key_size);
-
-   update_cache_last(cache, cache_id, item->bo);
-
-   cache->sws->bo_reference(item->bo);
-   return item->bo;
+   if (item) {
+      if (aux_return)
+         *(void **)aux_return = (void *)((char *)item->key + item->key_size);
+      
+      update_cache_last(cache, cache_id, item->bo);
+      bo_reference(bo_out, item->bo);
+      return TRUE;
+   }
+   
+   return FALSE;      
 }
 
 
-struct brw_winsys_buffer *
+enum pipe_error
 brw_upload_cache( struct brw_cache *cache,
 		  enum brw_cache_id cache_id,
 		  const void *key,
@@ -212,14 +212,15 @@ brw_upload_cache( struct brw_cache *cache,
 		  const void *data,
 		  GLuint data_size,
 		  const void *aux,
-		  void *aux_return )
+		  void *aux_return,
+                  struct brw_winsys_buffer **bo_out)
 {
    struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
    GLuint hash = hash_key(key, key_size, reloc_bufs, nr_reloc_bufs);
    GLuint relocs_size = nr_reloc_bufs * sizeof(struct brw_winsys_buffer *);
    GLuint aux_size = cache->aux_size[cache_id];
+   enum pipe_error ret;
    void *tmp;
-   struct brw_winsys_buffer *bo;
    int i;
 
    /* Create the buffer object to contain the data.  For now, use a
@@ -227,9 +228,12 @@ brw_upload_cache( struct brw_cache *cache,
     * may want to take advantage of hardware distinctions between
     * these various entities.
     */
-   bo = cache->sws->bo_alloc(cache->sws,
-                             cache->buffer_type,
-			     data_size, 1 << 6);
+   ret = cache->sws->bo_alloc(cache->sws, cache->buffer_type,
+                              data_size, 1 << 6, bo_out);
+   if (ret) {
+      FREE(item);   /* allocated above; don't leak it when bo_alloc fails */
+      return ret;
+   }
 
 
    /* Set up the memory containing the key, aux_data, and reloc_bufs */
@@ -240,7 +244,7 @@ brw_upload_cache( struct brw_cache *cache,
    memcpy((char *)tmp + key_size + aux_size, reloc_bufs, relocs_size);
    for (i = 0; i < nr_reloc_bufs; i++) {
       if (reloc_bufs[i] != NULL)
-	 cache->sws->bo_reference(reloc_bufs[i]);
+         p_atomic_inc(&reloc_bufs[i]->reference.count);
    }
 
    item->cache_id = cache_id;
@@ -249,9 +253,7 @@ brw_upload_cache( struct brw_cache *cache,
    item->key_size = key_size;
    item->reloc_bufs = (struct brw_winsys_buffer **)((char *)tmp + key_size + aux_size);
    item->nr_reloc_bufs = nr_reloc_bufs;
-
-   item->bo = bo;
-   cache->sws->bo_reference(bo);
+   bo_reference( &item->bo, *bo_out );
    item->data_size = data_size;
 
    if (cache->n_items > cache->size * 1.5)
@@ -273,28 +275,28 @@ brw_upload_cache( struct brw_cache *cache,
 		   data_size, cache_id);
 
    /* Copy data to the buffer */
-   cache->sws->bo_subdata(bo, 
+   cache->sws->bo_subdata(item->bo, 
                           cache_id,
                           0, data_size, data);
 
-   update_cache_last(cache, cache_id, bo);
+   update_cache_last(cache, cache_id, item->bo);
 
-   return bo;
+   return PIPE_OK;
 }
 
 
 /**
  * This doesn't really work with aux data.  Use search/upload instead
  */
-struct brw_winsys_buffer *
+enum pipe_error
 brw_cache_data_sz(struct brw_cache *cache,
 		  enum brw_cache_id cache_id,
 		  const void *data,
 		  GLuint data_size,
 		  struct brw_winsys_buffer **reloc_bufs,
-		  GLuint nr_reloc_bufs)
+		  GLuint nr_reloc_bufs,
+                  struct brw_winsys_buffer **bo_out)
 {
-   struct brw_winsys_buffer *bo;
    struct brw_cache_item *item;
    GLuint hash = hash_key(data, data_size, reloc_bufs, nr_reloc_bufs);
 
@@ -302,17 +304,17 @@ brw_cache_data_sz(struct brw_cache *cache,
 		       reloc_bufs, nr_reloc_bufs);
    if (item) {
       update_cache_last(cache, cache_id, item->bo);
-      cache->sws->bo_reference(item->bo);
-      return item->bo;
-   }
 
-   bo = brw_upload_cache(cache, cache_id,
-			 data, data_size,
-			 reloc_bufs, nr_reloc_bufs,
-			 data, data_size,
-			 NULL, NULL);
+      bo_reference(bo_out, item->bo);
+      return PIPE_OK;
+   }
 
-   return bo;
+   return brw_upload_cache(cache, cache_id,
+                           data, data_size,
+                           reloc_bufs, nr_reloc_bufs,
+                           data, data_size,
+                           NULL, NULL,
+                           bo_out);
 }
 
 
@@ -323,15 +325,16 @@ brw_cache_data_sz(struct brw_cache *cache,
  * better to use, as the potentially changing offsets in the data-used-as-key
  * will result in excessive cache misses.
  */
-struct brw_winsys_buffer *
+enum pipe_error
 brw_cache_data(struct brw_cache *cache,
 	       enum brw_cache_id cache_id,
 	       const void *data,
 	       struct brw_winsys_buffer **reloc_bufs,
-	       GLuint nr_reloc_bufs)
+	       GLuint nr_reloc_bufs,
+               struct brw_winsys_buffer **bo_out)
 {
    return brw_cache_data_sz(cache, cache_id, data, cache->key_size[cache_id],
-			    reloc_bufs, nr_reloc_bufs);
+			    reloc_bufs, nr_reloc_bufs, bo_out);
 }
 
 
@@ -506,11 +509,13 @@ brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
 	 int j;
 
 	 next = c->next;
+
 	 for (j = 0; j < c->nr_reloc_bufs; j++)
-	    brw->sws->bo_unreference(c->reloc_bufs[j]);
-	 brw->sws->bo_unreference(c->bo);
-	 free((void *)c->key);
-	 free(c);
+	    bo_reference(&c->reloc_bufs[j], NULL);
+
+	 bo_reference(&c->bo, NULL);
+	 FREE((void *)c->key);
+	 FREE(c);
       }
       cache->items[i] = NULL;
    }
@@ -551,10 +556,12 @@ brw_state_cache_bo_delete(struct brw_cache *cache, struct brw_winsys_buffer *bo)
 	    *prev = c->next;
 
 	    for (j = 0; j < c->nr_reloc_bufs; j++)
-	       cache->sws->bo_unreference(c->reloc_bufs[j]);
-	    cache->sws->bo_unreference(c->bo);
-	    free((void *)c->key);
-	    free(c);
+	       bo_reference(&c->reloc_bufs[j], NULL);
+
+	    bo_reference(&c->bo, NULL);
+
+	    FREE((void *)c->key);
+	    FREE(c);
 	    cache->n_items--;
 	 } else {
 	    prev = &c->next;
@@ -590,10 +597,10 @@ brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
 
    brw_clear_cache(brw, cache);
    for (i = 0; i < BRW_MAX_CACHE; i++) {
-      brw->sws->bo_unreference(cache->last_bo[i]);
-      free(cache->name[i]);
+      bo_reference(&cache->last_bo[i], NULL);
+      FREE(cache->name[i]);
    }
-   free(cache->items);
+   FREE(cache->items);
    cache->items = NULL;
    cache->size = 0;
 }
diff --git a/src/gallium/drivers/i965/brw_state_upload.c b/src/gallium/drivers/i965/brw_state_upload.c
index a71af4d2b9..fdcdd59129 100644
--- a/src/gallium/drivers/i965/brw_state_upload.c
+++ b/src/gallium/drivers/i965/brw_state_upload.c
@@ -140,8 +140,7 @@ brw_clear_validated_bos(struct brw_context *brw)
 
    /* Clear the last round of validated bos */
    for (i = 0; i < brw->state.validated_bo_count; i++) {
-      brw->sws->bo_unreference(brw->state.validated_bos[i]);
-      brw->state.validated_bos[i] = NULL;
+      bo_reference(&brw->state.validated_bos[i], NULL);
    }
    brw->state.validated_bo_count = 0;
 }
diff --git a/src/gallium/drivers/i965/brw_vs.c b/src/gallium/drivers/i965/brw_vs.c
index 26a28114d9..966940ceac 100644
--- a/src/gallium/drivers/i965/brw_vs.c
+++ b/src/gallium/drivers/i965/brw_vs.c
@@ -39,10 +39,12 @@
 
 
-static void do_vs_prog( struct brw_context *brw, 
-			struct brw_vertex_shader *vp,
-			struct brw_vs_prog_key *key )
+static enum pipe_error do_vs_prog( struct brw_context *brw, 
+                                   struct brw_vertex_shader *vp,
+                                   struct brw_vs_prog_key *key,
+                                   struct brw_winsys_buffer **bo_out)
 {
+   enum pipe_error ret;
    GLuint program_size;
    const GLuint *program;
    struct brw_vs_compile c;
@@ -66,22 +68,29 @@ static void do_vs_prog( struct brw_context *brw,
 
    /* get the program
     */
-   program = brw_get_program(&c.func, &program_size);
-
-   brw->sws->bo_unreference(brw->vs.prog_bo);
-   brw->vs.prog_bo = brw_upload_cache( &brw->cache, BRW_VS_PROG,
-				       &c.key, sizeof(c.key),
-				       NULL, 0,
-				       program, program_size,
-				       &c.prog_data,
-				       &brw->vs.prog_data );
+   ret = brw_get_program(&c.func, &program, &program_size);
+   if (ret)
+      return ret;
+
+   ret = brw_upload_cache( &brw->cache, BRW_VS_PROG,
+                           &c.key, sizeof(c.key),
+                           NULL, 0,
+                           program, program_size,
+                           &c.prog_data,
+                           &brw->vs.prog_data,
+                           bo_out);
+   if (ret)
+      return ret;
+
+   return PIPE_OK;
 }
 
 
-static int brw_upload_vs_prog(struct brw_context *brw)
+static enum pipe_error brw_upload_vs_prog(struct brw_context *brw)
 {
    struct brw_vs_prog_key key;
    struct brw_vertex_shader *vp = brw->curr.vertex_shader;
+   enum pipe_error ret;
 
    memset(&key, 0, sizeof(key));
 
@@ -95,15 +104,18 @@ static int brw_upload_vs_prog(struct brw_context *brw)
 
    /* Make an early check for the key.
     */
-   brw->sws->bo_unreference(brw->vs.prog_bo);
-   brw->vs.prog_bo = brw_search_cache(&brw->cache, BRW_VS_PROG,
-				      &key, sizeof(key),
-				      NULL, 0,
-				      &brw->vs.prog_data);
-   if (brw->vs.prog_bo == NULL)
-      do_vs_prog(brw, vp, &key);
-
-   return 0;
+   if (brw_search_cache(&brw->cache, BRW_VS_PROG,
+                        &key, sizeof(key),
+                        NULL, 0,
+                        &brw->vs.prog_data,
+                        &brw->vs.prog_bo))
+      return PIPE_OK;
+
+   ret = do_vs_prog(brw, vp, &key, &brw->vs.prog_bo);
+   if (ret)
+      return ret;
+
+   return PIPE_OK;
 }
 
 
diff --git a/src/gallium/drivers/i965/brw_vs_state.c b/src/gallium/drivers/i965/brw_vs_state.c
index 26d5d005fa..22a4d7f01b 100644
--- a/src/gallium/drivers/i965/brw_vs_state.c
+++ b/src/gallium/drivers/i965/brw_vs_state.c
@@ -78,11 +78,13 @@ vs_unit_populate_key(struct brw_context *brw, struct brw_vs_unit_key *key)
    }
 }
 
-static struct brw_winsys_buffer *
-vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
+static enum pipe_error
+vs_unit_create_from_key(struct brw_context *brw, 
+                        struct brw_vs_unit_key *key,
+                        struct brw_winsys_buffer **bo_out)
 {
+   enum pipe_error ret;
    struct brw_vs_unit_state vs;
-   struct brw_winsys_buffer *bo;
    int chipset_max_threads;
 
    memset(&vs, 0, sizeof(vs));
@@ -141,38 +143,46 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
     */
    vs.vs6.vs_enable = 1;
 
-   bo = brw_upload_cache(&brw->cache, BRW_VS_UNIT,
-			 key, sizeof(*key),
-			 &brw->vs.prog_bo, 1,
-			 &vs, sizeof(vs),
-			 NULL, NULL);
+   ret = brw_upload_cache(&brw->cache, BRW_VS_UNIT,
+                          key, sizeof(*key),
+                          &brw->vs.prog_bo, 1,
+                          &vs, sizeof(vs),
+                          NULL, NULL,
+                          bo_out);
+   if (ret)
+      return ret;
 
    /* Emit VS program relocation */
-   brw->sws->bo_emit_reloc(bo,
-			   BRW_USAGE_STATE,
-			   vs.thread0.grf_reg_count << 1,
-			   offsetof(struct brw_vs_unit_state, thread0),
-			   brw->vs.prog_bo);
-
-   return bo;
+   ret = brw->sws->bo_emit_reloc(*bo_out,
+                                 BRW_USAGE_STATE,
+                                 vs.thread0.grf_reg_count << 1,
+                                 offsetof(struct brw_vs_unit_state, thread0),
+                                 brw->vs.prog_bo);
+   if (ret)
+      return ret;
+
+   return PIPE_OK;
 }
 
 static int prepare_vs_unit(struct brw_context *brw)
 {
    struct brw_vs_unit_key key;
+   enum pipe_error ret;
 
    vs_unit_populate_key(brw, &key);
 
-   brw->sws->bo_unreference(brw->vs.state_bo);
-   brw->vs.state_bo = brw_search_cache(&brw->cache, BRW_VS_UNIT,
-				       &key, sizeof(key),
-				       &brw->vs.prog_bo, 1,
-				       NULL);
-   if (brw->vs.state_bo == NULL) {
-      brw->vs.state_bo = vs_unit_create_from_key(brw, &key);
-   }
+   if (brw_search_cache(&brw->cache, BRW_VS_UNIT,
+                        &key, sizeof(key),
+                        &brw->vs.prog_bo, 1,
+                        NULL,
+                        &brw->vs.state_bo))
+      return PIPE_OK;
+
+   ret = vs_unit_create_from_key(brw, &key, &brw->vs.state_bo);
+   if (ret)
+      return ret;
 
-   return 0;
+   return PIPE_OK;
 }
 
 const struct brw_tracked_state brw_vs_unit = {
diff --git a/src/gallium/drivers/i965/brw_vs_surface_state.c b/src/gallium/drivers/i965/brw_vs_surface_state.c
index 32fb9b2a8b..b12df0ec03 100644
--- a/src/gallium/drivers/i965/brw_vs_surface_state.c
+++ b/src/gallium/drivers/i965/brw_vs_surface_state.c
@@ -83,22 +83,23 @@ brw_update_vs_constant_surface( struct brw_context *brw,
 {
    struct brw_surface_key key;
    struct pipe_buffer *cb = brw->curr.vs_constants;
+   enum pipe_error ret;
 
    assert(surf == 0);
 
    /* If we're in this state update atom, we need to update VS constants, so
     * free the old buffer and create a new one for the new contents.
     */
-   brw->sws->bo_unreference(vp->const_buffer);
-   vp->const_buffer = brw_vs_update_constant_buffer(brw);
+   ret = brw_vs_update_constant_buffer(brw, &vp->const_buffer);
+   if (ret)
+      return ret;
 
    /* If there's no constant buffer, then no surface BO is needed to point at
     * it.
     */
-   if (vp->const_buffer == 0) {
-      drm_intel_bo_unreference(brw->vs.surf_bo[surf]);
-      brw->vs.surf_bo[surf] = NULL;
-      return;
+   if (vp->const_buffer == NULL) {
+      bo_reference(&brw->vs.surf_bo[surf], NULL);  /* takes the slot's address */
+      return PIPE_OK;
    }
 
    memset(&key, 0, sizeof(key));
@@ -118,15 +119,20 @@ brw_update_vs_constant_surface( struct brw_context *brw,
           key.width, key.height, key.depth, key.cpp, key.pitch);
    */
 
-   drm_intel_bo_unreference(brw->vs.surf_bo[surf]);
-   brw->vs.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
-                                            BRW_SS_SURFACE,
-                                            &key, sizeof(key),
-                                            &key.bo, key.bo ? 1 : 0,
-                                            NULL);
-   if (brw->vs.surf_bo[surf] == NULL) {
-      brw->vs.surf_bo[surf] = brw_create_constant_surface(brw, &key);
-   }
+   if (brw_search_cache(&brw->surface_cache,
+                        BRW_SS_SURFACE,
+                        &key, sizeof(key),
+                        &key.bo, key.bo ? 1 : 0,
+                        NULL,
+                        &brw->vs.surf_bo[surf]))
+      return PIPE_OK;
+
+   ret = brw_create_constant_surface(brw, &key,   /* cache miss: build it */
+                                     &brw->vs.surf_bo[surf]);
+   if (ret)
+      return ret;
+   
+   return PIPE_OK;
 }
 #endif
 
@@ -134,18 +140,20 @@ brw_update_vs_constant_surface( struct brw_context *brw,
 /**
  * Constructs the binding table for the VS surface state.
  */
-static struct brw_winsys_buffer *
-brw_vs_get_binding_table(struct brw_context *brw)
+static enum pipe_error
+brw_vs_get_binding_table(struct brw_context *brw,
+                         struct brw_winsys_buffer **bo_out)
 {
 #if 0
-   struct brw_winsys_buffer *bind_bo;
-
-   bind_bo = brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
-			      NULL, 0,
-			      brw->vs.surf_bo, BRW_VS_MAX_SURF,
-			      NULL);
-
-   if (bind_bo == NULL) {
+   if (brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
+                        NULL, 0,
+                        brw->vs.surf_bo, BRW_VS_MAX_SURF,
+                        NULL,
+                        bo_out))
+   {
+      return PIPE_OK;
+   }
+   else {
       GLuint data_size = BRW_VS_MAX_SURF * sizeof(GLuint);
       uint32_t *data = malloc(data_size);
       int i;
@@ -156,11 +164,14 @@ brw_vs_get_binding_table(struct brw_context *brw)
          else
             data[i] = 0;
 
-      bind_bo = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND,
-				  NULL, 0,
-				  brw->vs.surf_bo, BRW_VS_MAX_SURF,
-				  data, data_size,
-				  NULL, NULL);
+      ret = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND,
+                              NULL, 0,
+                              brw->vs.surf_bo, BRW_VS_MAX_SURF,
+                              data, data_size,
+                              NULL, NULL,
+                              bo_out);
+      if (ret)
+         return ret;
 
       /* Emit binding table relocations to surface state */
       for (i = 0; i < BRW_VS_MAX_SURF; i++) {
@@ -168,18 +179,19 @@ brw_vs_get_binding_table(struct brw_context *brw)
 	    /* The presumed offsets were set in the data values for
 	     * brw_upload_cache.
 	     */
-	    drm_intel_bo_emit_reloc(bind_bo, i * 4,
-				    brw->vs.surf_bo[i], 0,
-				    BRW_USAGE_STATE);
+	    ret = brw->sws->bo_emit_reloc(*bo_out, BRW_USAGE_STATE,
+                                          0, i * 4, /* delta, offset in table */
+                                          brw->vs.surf_bo[i]);
+            if (ret)
+               return ret;
 	 }
       }
 
-      free(data);
+      FREE(data);
+      return PIPE_OK;
    }
-
-   return bind_bo;
 #else
-   return NULL;
+   return PIPE_OK;
 #endif
 }
 
@@ -190,8 +202,10 @@ brw_vs_get_binding_table(struct brw_context *brw)
  * to be updated, and produces BRW_NEW_NR_VS_SURFACES for the VS unit and
  * CACHE_NEW_SURF_BIND for the binding table upload.
  */
-static int prepare_vs_surfaces(struct brw_context *brw )
+static enum pipe_error prepare_vs_surfaces(struct brw_context *brw )
 {
+   enum pipe_error ret;
+
 #if 0
    int i;
    int nr_surfaces = 0;
@@ -215,11 +229,12 @@ static int prepare_vs_surfaces(struct brw_context *brw )
     * just slightly increases our working set size.
     */
    if (brw->vs.nr_surfaces != 0) {
-      brw->sws->bo_unreference(brw->vs.bind_bo);
-      brw->vs.bind_bo = brw_vs_get_binding_table(brw);
+      ret = brw_vs_get_binding_table(brw, &brw->vs.bind_bo);
+      if (ret)
+         return ret;
    }
 
-   return 0;
+   return PIPE_OK;
 }
 
 const struct brw_tracked_state brw_vs_surfaces = {
diff --git a/src/gallium/drivers/i965/brw_winsys.h b/src/gallium/drivers/i965/brw_winsys.h
index d941fbcebe..f61c541ad1 100644
--- a/src/gallium/drivers/i965/brw_winsys.h
+++ b/src/gallium/drivers/i965/brw_winsys.h
@@ -28,6 +28,7 @@
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_error.h"
+#include "pipe/p_refcnt.h"
 
 struct brw_winsys;
 struct pipe_fence_handle;
@@ -36,10 +37,13 @@ struct pipe_fence_handle;
  */
 #define BRW_BATCH_SIZE (32*1024)
 
+struct brw_winsys_screen;
 
 /* Need a tiny bit of information inside the abstract buffer struct:
  */
 struct brw_winsys_buffer {
+   struct pipe_reference reference;
+   struct brw_winsys_screen *sws;
    unsigned *offset;
    unsigned size;
 };
@@ -105,6 +109,10 @@ enum brw_buffer_data_type {
    BRW_DATA_MAX
 };
 
+
+
+
+
 struct brw_winsys_screen {
 
 
@@ -116,33 +124,33 @@ struct brw_winsys_screen {
    /**
     * Create a buffer.
     */
-   struct brw_winsys_buffer *(*bo_alloc)( struct brw_winsys_screen *sws,
-					  enum brw_buffer_type type,
-					  unsigned size,
-					  unsigned alignment );
+   enum pipe_error (*bo_alloc)( struct brw_winsys_screen *sws,
+                                enum brw_buffer_type type,
+                                unsigned size,
+                                unsigned alignment,
+                                struct brw_winsys_buffer **bo_out );
 
-   /* Reference and unreference buffers:
+   /* Destroy a buffer when our refcount goes to zero:
     */
-   void (*bo_reference)( struct brw_winsys_buffer *buffer );
-   void (*bo_unreference)( struct brw_winsys_buffer *buffer );
+   void (*bo_destroy)( struct brw_winsys_buffer *buffer );
 
    /* delta -- added to b2->offset, and written into buffer
     * offset -- location above value is written to within buffer
     */
-   int (*bo_emit_reloc)( struct brw_winsys_buffer *buffer,
-			 enum brw_buffer_usage usage,
-			 unsigned delta,
-			 unsigned offset,
-			 struct brw_winsys_buffer *b2);
+   enum pipe_error (*bo_emit_reloc)( struct brw_winsys_buffer *buffer,
+                                     enum brw_buffer_usage usage,
+                                     unsigned delta,
+                                     unsigned offset,
+                                     struct brw_winsys_buffer *b2);
 
-   int (*bo_exec)( struct brw_winsys_buffer *buffer,
-		   unsigned bytes_used );
+   enum pipe_error (*bo_exec)( struct brw_winsys_buffer *buffer,
+                               unsigned bytes_used );
 
-   int (*bo_subdata)(struct brw_winsys_buffer *buffer,
-                     enum brw_buffer_data_type data_type,
-                     size_t offset,
-                     size_t size,
-                     const void *data);
+   enum pipe_error (*bo_subdata)(struct brw_winsys_buffer *buffer,
+                                 enum brw_buffer_data_type data_type,
+                                 size_t offset,
+                                 size_t size,
+                                 const void *data);
 
    boolean (*bo_is_busy)(struct brw_winsys_buffer *buffer);
    boolean (*bo_references)(struct brw_winsys_buffer *a,
@@ -175,6 +183,16 @@ struct brw_winsys_screen {
 };
 
 
+static INLINE void
+bo_reference(struct brw_winsys_buffer **ptr, struct brw_winsys_buffer *buf)
+{
+   struct brw_winsys_buffer *old_buf = *ptr;
+
+   if (pipe_reference((struct pipe_reference **)ptr, &buf->reference))
+      old_buf->sws->bo_destroy(old_buf);
+}
+
+
 /**
  * Create brw pipe_screen.
  */
diff --git a/src/gallium/drivers/i965/brw_wm.c b/src/gallium/drivers/i965/brw_wm.c
index 815ae8c51a..93f90bf329 100644
--- a/src/gallium/drivers/i965/brw_wm.c
+++ b/src/gallium/drivers/i965/brw_wm.c
@@ -137,30 +137,26 @@ brw_wm_linear_shader_emit(struct brw_context *brw, struct brw_wm_compile *c)
  * Depending on the instructions used (i.e. flow control instructions)
  * we'll use one of two code generators.
  */
-static int do_wm_prog( struct brw_context *brw,
-			struct brw_fragment_shader *fp, 
-			struct brw_wm_prog_key *key)
+static enum pipe_error do_wm_prog( struct brw_context *brw,
+                                   struct brw_fragment_shader *fp, 
+                                   struct brw_wm_prog_key *key,
+                                   struct brw_winsys_buffer **bo_out)
 {
+   enum pipe_error ret;
    struct brw_wm_compile *c;
    const GLuint *program;
    GLuint program_size;
 
-   c = brw->wm.compile_data;
-   if (c == NULL) {
-      brw->wm.compile_data = calloc(1, sizeof(*brw->wm.compile_data));
-      c = brw->wm.compile_data;
-      if (c == NULL) {
-         /* Ouch - big out of memory problem.  Can't continue
-          * without triggering a segfault, no way to signal,
-          * so just return.
-          */
+   if (brw->wm.compile_data == NULL) {
+      brw->wm.compile_data = MALLOC(sizeof(*brw->wm.compile_data));
+      if (!brw->wm.compile_data) 
          return PIPE_ERROR_OUT_OF_MEMORY;
-      }
-   } else {
-      memset(c, 0, sizeof(*brw->wm.compile_data));
    }
-   memcpy(&c->key, key, sizeof(*key));
 
+   c = brw->wm.compile_data;
+   memset(c, 0, sizeof *c);
+
+   c->key = *key;
    c->fp = fp;
    c->env_param = NULL; /*brw->intel.ctx.FragmentProgram.Parameters;*/
 
@@ -190,17 +186,21 @@ static int do_wm_prog( struct brw_context *brw,
 
    /* get the program
     */
-   program = brw_get_program(&c->func, &program_size);
-
-   brw->sws->bo_unreference(brw->wm.prog_bo);
-   brw->wm.prog_bo = brw_upload_cache( &brw->cache, BRW_WM_PROG,
-				       &c->key, sizeof(c->key),
-				       NULL, 0,
-				       program, program_size,
-				       &c->prog_data,
-				       &brw->wm.prog_data );
-
-   return 0;
+   ret = brw_get_program(&c->func, &program, &program_size);
+   if (ret)
+      return ret;
+
+   ret = brw_upload_cache( &brw->cache, BRW_WM_PROG,
+                           &c->key, sizeof(c->key),
+                           NULL, 0,
+                           program, program_size,
+                           &c->prog_data,
+                           &brw->wm.prog_data,
+                           bo_out );
+   if (ret)
+      return ret;
+
+   return PIPE_OK;
 }
 
 
@@ -267,24 +267,28 @@ static void brw_wm_populate_key( struct brw_context *brw,
 }
 
 
-static int brw_prepare_wm_prog(struct brw_context *brw)
+static enum pipe_error brw_prepare_wm_prog(struct brw_context *brw)
 {
    struct brw_wm_prog_key key;
    struct brw_fragment_shader *fs = brw->curr.fragment_shader;
+   enum pipe_error ret;
      
    brw_wm_populate_key(brw, &key);
 
    /* Make an early check for the key.
     */
-   brw->sws->bo_unreference(brw->wm.prog_bo);
-   brw->wm.prog_bo = brw_search_cache(&brw->cache, BRW_WM_PROG,
-				      &key, sizeof(key),
-				      NULL, 0,
-				      &brw->wm.prog_data);
-   if (brw->wm.prog_bo == NULL)
-      return do_wm_prog(brw, fs, &key);
-
-   return 0;
+   if (brw_search_cache(&brw->cache, BRW_WM_PROG,
+                        &key, sizeof(key),
+                        NULL, 0,
+                        &brw->wm.prog_data,
+                        &brw->wm.prog_bo))
+      return PIPE_OK;
+
+   ret = do_wm_prog(brw, fs, &key, &brw->wm.prog_bo);
+   if (ret)
+      return ret;
+
+   return PIPE_OK;
 }
 
 
diff --git a/src/gallium/drivers/i965/brw_wm_constant_buffer.c b/src/gallium/drivers/i965/brw_wm_constant_buffer.c
index 50ecef29a4..14568265dd 100644
--- a/src/gallium/drivers/i965/brw_wm_constant_buffer.c
+++ b/src/gallium/drivers/i965/brw_wm_constant_buffer.c
@@ -6,12 +6,14 @@
  * Create the constant buffer surface.  Vertex/fragment shader constants will be
  * read from this buffer with Data Port Read instructions/messages.
  */
-struct brw_winsys_buffer *
+enum pipe_error
 brw_create_constant_surface( struct brw_context *brw,
-                             struct brw_surface_key *key )
+                             struct brw_surface_key *key,
+                             struct brw_winsys_buffer **bo_out )
 {
    const GLint w = key->width - 1;
    struct brw_winsys_buffer *bo;
+   enum pipe_error ret;
 
    memset(&surf, 0, sizeof(surf));
 
@@ -28,22 +30,27 @@ brw_create_constant_surface( struct brw_context *brw,
    surf.ss3.pitch = (key->pitch * key->cpp) - 1; /* ignored?? */
    brw_set_surface_tiling(&surf, key->tiling); /* tiling now allowed */
  
-   bo = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE,
-			 key, sizeof(*key),
-			 &key->bo, key->bo ? 1 : 0,
-			 &surf, sizeof(surf),
-			 NULL, NULL);
+   ret = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE,
+                          key, sizeof(*key),
+                          &key->bo, key->bo ? 1 : 0,
+                          &surf, sizeof(surf),
+                          NULL, NULL,
+                          bo_out);
+   if (ret)
+      return ret;
 
    if (key->bo) {
       /* Emit relocation to surface contents */
-      brw->sws->bo_emit_reloc(bo,
-			      BRW_USAGE_SAMPLER,
-			      0,
-			      offsetof(struct brw_surface_state, ss1),
-			      key->bo);
+      ret = brw->sws->bo_emit_reloc(*bo_out,
+                                    BRW_USAGE_SAMPLER,
+                                    0,
+                                    offsetof(struct brw_surface_state, ss1),
+                                    key->bo);
+      if (ret)
+         return ret;
    }
 
-   return bo;
+   return PIPE_OK;
 }
 
 
@@ -52,7 +59,7 @@ brw_create_constant_surface( struct brw_context *brw,
  * Update the surface state for a WM constant buffer.
  * The constant buffer will be (re)allocated here if needed.
  */
-static void
+static enum pipe_error
 brw_update_wm_constant_surface( struct brw_context *brw,
                                 GLuint surf)
 {
@@ -60,20 +67,21 @@ brw_update_wm_constant_surface( struct brw_context *brw,
    struct brw_fragment_shader *fp = brw->curr.fragment_shader;
    struct pipe_buffer *cbuf = brw->curr.fragment_constants;
    int pitch = cbuf->size / (4 * sizeof(float));
+   enum pipe_error ret;
 
    /* If we're in this state update atom, we need to update WM constants, so
     * free the old buffer and create a new one for the new contents.
     */
-   brw->sws->bo_unreference(fp->const_buffer);
-   fp->const_buffer = brw_wm_update_constant_buffer(brw);
+   ret = brw_wm_update_constant_buffer(brw, &fp->const_buffer);
+   if (ret)
+      return ret;
 
    /* If there's no constant buffer, then no surface BO is needed to point at
     * it.
     */
    if (cbuf == NULL) {
-      drm_intel_bo_unreference(brw->wm.surf_bo[surf]);
-      brw->wm.surf_bo[surf] = NULL;
-      return;
+      bo_reference(&brw->wm.surf_bo[surf], NULL);
+      return PIPE_OK;
    }
 
    memset(&key, 0, sizeof(key));
@@ -97,16 +105,20 @@ brw_update_wm_constant_surface( struct brw_context *brw,
           key.width, key.height, key.depth, key.cpp, key.pitch);
    */
 
-   brw->sws->bo_unreference(brw->wm.surf_bo[surf]);
-   brw->wm.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
-                                            BRW_SS_SURFACE,
-                                            &key, sizeof(key),
-                                            &key.bo, 1,
-                                            NULL);
-   if (brw->wm.surf_bo[surf] == NULL) {
-      brw->wm.surf_bo[surf] = brw_create_constant_surface(brw, &key);
-   }
+   if (brw_search_cache(&brw->surface_cache,
+                        BRW_SS_SURFACE,
+                        &key, sizeof(key),
+                        &key.bo, 1,
+                        NULL,
+                        &brw->wm.surf_bo[surf]))
+      return PIPE_OK;
+
+   ret = brw_create_constant_surface(brw, &key, &brw->wm.surf_bo[surf]);
+   if (ret)
+      return ret;
+
    brw->state.dirty.brw |= BRW_NEW_WM_SURFACES;
+   return PIPE_OK;
 }
 
 /**
@@ -117,28 +129,34 @@ brw_update_wm_constant_surface( struct brw_context *brw,
  * BRW_NEW_WM_SURFACES to get picked up by brw_prepare_wm_surfaces for
  * inclusion in the binding table.
  */
-static void prepare_wm_constant_surface(struct brw_context *brw )
+static enum pipe_error prepare_wm_constant_surface(struct brw_context *brw )
 {
    struct brw_fragment_program *fp =
       (struct brw_fragment_program *) brw->fragment_program;
    GLuint surf = SURF_INDEX_FRAG_CONST_BUFFER;
+   enum pipe_error ret;
 
-   drm_intel_bo_unreference(fp->const_buffer);
-   fp->const_buffer = brw_wm_update_constant_buffer(brw);
+   ret = brw_wm_update_constant_buffer(brw,
+                                       &fp->const_buffer);
+   if (ret)
+      return ret;
 
    /* If there's no constant buffer, then no surface BO is needed to point at
     * it.
     */
    if (fp->const_buffer == 0) {
       if (brw->wm.surf_bo[surf] != NULL) {
-	 drm_intel_bo_unreference(brw->wm.surf_bo[surf]);
-	 brw->wm.surf_bo[surf] = NULL;
+	 bo_reference(&brw->wm.surf_bo[surf], NULL);
 	 brw->state.dirty.brw |= BRW_NEW_WM_SURFACES;
       }
-      return;
+      return PIPE_OK;
    }
 
-   brw_update_wm_constant_surface(ctx, surf);
+   ret = brw_update_wm_constant_surface(brw, surf);
+   if (ret)
+      return ret;
+
+   return PIPE_OK;
 }
 
 const struct brw_tracked_state brw_wm_constant_surface = {
diff --git a/src/gallium/drivers/i965/brw_wm_sampler_state.c b/src/gallium/drivers/i965/brw_wm_sampler_state.c
index 2fddb4ad89..2861aa979f 100644
--- a/src/gallium/drivers/i965/brw_wm_sampler_state.c
+++ b/src/gallium/drivers/i965/brw_wm_sampler_state.c
@@ -43,16 +43,22 @@
 
 
-static struct brw_winsys_buffer *
+static enum pipe_error
 upload_default_color( struct brw_context *brw,
-		      const GLfloat *color )
+		      const GLfloat *color,
+                      struct brw_winsys_buffer **bo_out )
 {
    struct brw_sampler_default_color sdc;
+   enum pipe_error ret;
 
    COPY_4V(sdc.color, color); 
    
-   return brw_cache_data( &brw->cache, BRW_SAMPLER_DEFAULT_COLOR, &sdc,
-			  NULL, 0 );
+   ret = brw_cache_data( &brw->cache, BRW_SAMPLER_DEFAULT_COLOR, &sdc,
+                         NULL, 0, bo_out );
+   if (ret)
+      return ret;
+
+   return PIPE_OK;
 }
 
 
@@ -111,9 +117,10 @@ brw_wm_sampler_populate_key(struct brw_context *brw,
 }
 
 
-static void
+static enum pipe_error
 brw_wm_sampler_update_default_colors(struct brw_context *brw)
 {
+   enum pipe_error ret;
    int nr = MIN2(brw->curr.num_textures,
 		 brw->curr.num_samplers);
    int i;
@@ -121,8 +128,7 @@ brw_wm_sampler_update_default_colors(struct brw_context *brw)
    for (i = 0; i < nr; i++) {
       const struct brw_texture *tex = brw_texture(brw->curr.texture[i]);
       const struct brw_sampler *sampler = brw->curr.sampler[i];
-
-      brw->sws->bo_unreference(brw->wm.sdc_bo[i]);
+      const float *bc;
 
       if (pf_is_depth_or_stencil(tex->base.format)) {
 	 float bordercolor[4] = {
@@ -131,15 +137,25 @@ brw_wm_sampler_update_default_colors(struct brw_context *brw)
 	    sampler->border_color[0],
 	    sampler->border_color[0]
 	 };
-	 /* GL specs that border color for depth textures is taken from the
-	  * R channel, while the hardware uses A.  Spam R into all the
-	  * channels for safety.
-	  */
-	 brw->wm.sdc_bo[i] = upload_default_color(brw, bordercolor);
-      } else {
-	 brw->wm.sdc_bo[i] = upload_default_color(brw, sampler->border_color);
+         
+         bc = bordercolor;
+      }
+      else {
+         bc = sampler->border_color;
       }
+
+      /* GL specs that border color for depth textures is taken from the
+       * R channel, while the hardware uses A.  Spam R into all the
+       * channels for safety.
+       */
+      ret = upload_default_color(brw, 
+                                 bc,
+                                 &brw->wm.sdc_bo[i]);
+      if (ret) 
+         return ret;
    }
+
+   return PIPE_OK;
 }
 
 
@@ -149,6 +165,7 @@ brw_wm_sampler_update_default_colors(struct brw_context *brw)
 static int upload_wm_samplers( struct brw_context *brw )
 {
    struct wm_sampler_key key;
+   enum pipe_error ret;
    int i;
 
    brw_wm_sampler_update_default_colors(brw);
@@ -159,35 +176,40 @@ static int upload_wm_samplers( struct brw_context *brw )
       brw->state.dirty.cache |= CACHE_NEW_SAMPLER;
    }
 
-   brw->sws->bo_unreference(brw->wm.sampler_bo);
-   brw->wm.sampler_bo = NULL;
-   if (brw->wm.sampler_count == 0)
-      return 0;
+   if (brw->wm.sampler_count == 0) {
+      bo_reference(&brw->wm.sampler_bo, NULL);
+      return PIPE_OK;
+   }
 
-   brw->wm.sampler_bo = brw_search_cache(&brw->cache, BRW_SAMPLER,
-					 &key, sizeof(key),
-					 brw->wm.sdc_bo, key.sampler_count,
-					 NULL);
+   if (brw_search_cache(&brw->cache, BRW_SAMPLER,
+                        &key, sizeof(key),
+                        brw->wm.sdc_bo, key.sampler_count,
+                        NULL,
+                        &brw->wm.sampler_bo))
+      return PIPE_OK;
 
    /* If we didnt find it in the cache, compute the state and put it in the
     * cache.
     */
-   if (brw->wm.sampler_bo == NULL) {
-      brw->wm.sampler_bo = brw_upload_cache(&brw->cache, BRW_SAMPLER,
-					    &key, sizeof(key),
-					    brw->wm.sdc_bo, key.sampler_count,
-					    &key.sampler, sizeof(key.sampler),
-					    NULL, NULL);
-
-      /* Emit SDC relocations */
-      for (i = 0; i < key.sampler_count; i++) {
-	 brw->sws->bo_emit_reloc(brw->wm.sampler_bo,
-				 BRW_USAGE_SAMPLER,
-				 0,
-				 i * sizeof(struct brw_sampler_state) +
-				 offsetof(struct brw_sampler_state, ss2),
-				 brw->wm.sdc_bo[i]);
-      }
+   ret = brw_upload_cache(&brw->cache, BRW_SAMPLER,
+                          &key, sizeof(key),
+                          brw->wm.sdc_bo, key.sampler_count,
+                          &key.sampler, sizeof(key.sampler),
+                          NULL, NULL,
+                          &brw->wm.sampler_bo);
+   if (ret)
+      return ret;
+
+   /* Emit SDC relocations */
+   for (i = 0; i < key.sampler_count; i++) {
+      ret = brw->sws->bo_emit_reloc(brw->wm.sampler_bo,
+                                    BRW_USAGE_SAMPLER,
+                                    0,
+                                    i * sizeof(struct brw_sampler_state) +
+                                    offsetof(struct brw_sampler_state, ss2),
+                                    brw->wm.sdc_bo[i]);
+      if (ret)
+         return ret;
    }
 
    return 0;
diff --git a/src/gallium/drivers/i965/brw_wm_state.c b/src/gallium/drivers/i965/brw_wm_state.c
index ccbb647bcd..86dc10540d 100644
--- a/src/gallium/drivers/i965/brw_wm_state.c
+++ b/src/gallium/drivers/i965/brw_wm_state.c
@@ -138,12 +138,13 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
 /**
  * Setup wm hardware state.  See page 225 of Volume 2
  */
-static struct brw_winsys_buffer *
+static enum pipe_error
 wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
-			struct brw_winsys_buffer **reloc_bufs)
+			struct brw_winsys_buffer **reloc_bufs,
+                        struct brw_winsys_buffer **bo_out)
 {
    struct brw_wm_unit_state wm;
-   struct brw_winsys_buffer *bo;
+   enum pipe_error ret;
 
    memset(&wm, 0, sizeof(wm));
 
@@ -222,45 +223,56 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
    if (BRW_DEBUG & DEBUG_STATS || key->stats_wm)
       wm.wm4.stats_enable = 1;
 
-   bo = brw_upload_cache(&brw->cache, BRW_WM_UNIT,
-			 key, sizeof(*key),
-			 reloc_bufs, 3,
-			 &wm, sizeof(wm),
-			 NULL, NULL);
+   ret = brw_upload_cache(&brw->cache, BRW_WM_UNIT,
+                          key, sizeof(*key),
+                          reloc_bufs, 3,
+                          &wm, sizeof(wm),
+                          NULL, NULL,
+                          bo_out);
+   if (ret)
+      return ret;
 
    /* Emit WM program relocation */
-   brw->sws->bo_emit_reloc(bo,
-			   BRW_USAGE_STATE,
-			   wm.thread0.grf_reg_count << 1,
-			   offsetof(struct brw_wm_unit_state, thread0),
-			   brw->wm.prog_bo);
+   ret = brw->sws->bo_emit_reloc(*bo_out,
+                                 BRW_USAGE_STATE,
+                                 wm.thread0.grf_reg_count << 1,
+                                 offsetof(struct brw_wm_unit_state, thread0),
+                                 brw->wm.prog_bo);
+   if (ret)
+      return ret;
 
    /* Emit scratch space relocation */
    if (key->total_scratch != 0) {
-      brw->sws->bo_emit_reloc(bo,
-			      BRW_USAGE_SCRATCH,
-			      wm.thread2.per_thread_scratch_space,
-			      offsetof(struct brw_wm_unit_state, thread2),
-			      brw->wm.scratch_bo);
+      ret = brw->sws->bo_emit_reloc(*bo_out,
+                                    BRW_USAGE_SCRATCH,
+                                    wm.thread2.per_thread_scratch_space,
+                                    offsetof(struct brw_wm_unit_state, thread2),
+                                    brw->wm.scratch_bo);
+      if (ret)
+         return ret;
    }
 
    /* Emit sampler state relocation */
    if (key->sampler_count != 0) {
-      brw->sws->bo_emit_reloc(bo,
-			      BRW_USAGE_STATE,
-			      wm.wm4.stats_enable | (wm.wm4.sampler_count << 2),
-			      offsetof(struct brw_wm_unit_state, wm4),
-			      brw->wm.sampler_bo);
+      ret = brw->sws->bo_emit_reloc(*bo_out,
+                                    BRW_USAGE_STATE,
+                                    wm.wm4.stats_enable | (wm.wm4.sampler_count << 2),
+                                    offsetof(struct brw_wm_unit_state, wm4),
+                                    brw->wm.sampler_bo);
+      if (ret)
+         return ret;
    }
 
-   return bo;
+   return PIPE_OK;
 }
 
 
-static int upload_wm_unit( struct brw_context *brw )
+static enum pipe_error upload_wm_unit( struct brw_context *brw )
 {
    struct brw_wm_unit_key key;
    struct brw_winsys_buffer *reloc_bufs[3];
+   enum pipe_error ret;
+
    wm_unit_populate_key(brw, &key);
 
    /* Allocate the necessary scratch space if we haven't already.  Don't
@@ -271,15 +283,19 @@ static int upload_wm_unit( struct brw_context *brw )
    if (key.total_scratch) {
       GLuint total = key.total_scratch * key.max_threads;
 
-      if (brw->wm.scratch_bo && total > brw->wm.scratch_bo->size) {
-	 brw->sws->bo_unreference(brw->wm.scratch_bo);
-	 brw->wm.scratch_bo = NULL;
-      }
+      /* Do we need a new buffer:
+       */
+      if (brw->wm.scratch_bo && total > brw->wm.scratch_bo->size) 
+	 bo_reference(&brw->wm.scratch_bo, NULL);
+
       if (brw->wm.scratch_bo == NULL) {
-	 brw->wm.scratch_bo = brw->sws->bo_alloc(brw->sws,
-						 BRW_BUFFER_TYPE_SHADER_SCRATCH,
-						 total,
-						 4096);
+	 ret = brw->sws->bo_alloc(brw->sws,
+                                  BRW_BUFFER_TYPE_SHADER_SCRATCH,
+                                  total,
+                                  4096,
+                                  &brw->wm.scratch_bo);
+         if (ret)
+            return ret;
       }
    }
 
@@ -287,16 +303,19 @@ static int upload_wm_unit( struct brw_context *brw )
    reloc_bufs[1] = brw->wm.scratch_bo;
    reloc_bufs[2] = brw->wm.sampler_bo;
 
-   brw->sws->bo_unreference(brw->wm.state_bo);
-   brw->wm.state_bo = brw_search_cache(&brw->cache, BRW_WM_UNIT,
-				       &key, sizeof(key),
-				       reloc_bufs, 3,
-				       NULL);
-   if (brw->wm.state_bo == NULL) {
-      brw->wm.state_bo = wm_unit_create_from_key(brw, &key, reloc_bufs);
-   }
+   if (brw_search_cache(&brw->cache, BRW_WM_UNIT,
+                        &key, sizeof(key),
+                        reloc_bufs, 3,
+                        NULL,
+                        &brw->wm.state_bo))
+      return PIPE_OK;
+
+   ret = wm_unit_create_from_key(brw, &key, reloc_bufs,
+                                 &brw->wm.state_bo);
+   if (ret)
+      return ret;
 
-   return 0;
+   return PIPE_OK;
 }
 
 const struct brw_tracked_state brw_wm_unit = {
diff --git a/src/gallium/drivers/i965/brw_wm_surface_state.c b/src/gallium/drivers/i965/brw_wm_surface_state.c
index b055dde20c..e5d0329967 100644
--- a/src/gallium/drivers/i965/brw_wm_surface_state.c
+++ b/src/gallium/drivers/i965/brw_wm_surface_state.c
@@ -40,31 +40,40 @@
 
 
-static void
+static enum pipe_error
 brw_update_texture_surface( struct brw_context *brw,
 			    struct brw_texture *tex,
-			    GLuint surf )
+                            struct brw_winsys_buffer **bo_out)
 {
-   brw->wm.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
-                                            BRW_SS_SURFACE,
-                                            &tex->ss, sizeof tex->ss,
-                                            &tex->bo, 1,
-                                            NULL);
-
-   if (brw->wm.surf_bo[surf] == NULL) {
-      brw->wm.surf_bo[surf] = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE,
-					       &tex->ss, sizeof tex->ss,
-					       &tex->bo, 1,
-					       &tex->ss, sizeof tex->ss,
-					       NULL, NULL);
+   enum pipe_error ret;
+
+   if (brw_search_cache(&brw->surface_cache,
+                        BRW_SS_SURFACE,
+                        &tex->ss, sizeof tex->ss,
+                        &tex->bo, 1,
+                        NULL,
+                        bo_out))
+      return PIPE_OK;
+
+   ret = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE,
+                          &tex->ss, sizeof tex->ss,
+                          &tex->bo, 1,
+                          &tex->ss, sizeof tex->ss,
+                          NULL, NULL,
+                          bo_out);
+   if (ret)
+      return ret;
       
-      /* Emit relocation to surface contents */
-      brw->sws->bo_emit_reloc(brw->wm.surf_bo[surf],
-			      BRW_USAGE_SAMPLER,
-			      0,
-			      offsetof(struct brw_surface_state, ss1),
-			      tex->bo);
-   }
+   /* Emit relocation to surface contents */
+   ret = brw->sws->bo_emit_reloc(*bo_out,
+                                 BRW_USAGE_SAMPLER,
+                                 0,
+                                 offsetof(struct brw_surface_state, ss1),
+                                 tex->bo);
+   if (ret)
+      return ret;
+
+   return PIPE_OK;
 }
 
 
@@ -79,13 +88,14 @@ brw_update_texture_surface( struct brw_context *brw,
  * While it is only used for the front/back buffer currently, it should be
  * usable for further buffers when doing ARB_draw_buffer support.
  */
-static void
-brw_update_renderbuffer_surface(struct brw_context *brw,
-				struct brw_surface *surface,
-				unsigned int unit)
+static enum pipe_error
+brw_update_render_surface(struct brw_context *brw,
+                          struct brw_surface *surface,
+                          struct brw_winsys_buffer **bo_out)
 {
    struct brw_surf_ss0 blend_ss0 = brw->curr.blend->ss0;
    struct brw_surface_state ss;
+   enum pipe_error ret;
 
    /* Surfaces are potentially shared between contexts, so can't
     * scribble the in-place ss0 value in the surface.
@@ -98,30 +108,35 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
    ss.ss0.writedisable_red   = blend_ss0.writedisable_red;
    ss.ss0.writedisable_alpha = blend_ss0.writedisable_alpha;
 
-   brw->sws->bo_unreference(brw->wm.surf_bo[unit]);
-   brw->wm.surf_bo[unit] = brw_search_cache(&brw->surface_cache,
-					    BRW_SS_SURFACE,
-					    &ss, sizeof(ss),
-					    &surface->bo, 1,
-					    NULL);
-
-   if (brw->wm.surf_bo[unit] == NULL) {
-
-      brw->wm.surf_bo[unit] = brw_upload_cache(&brw->surface_cache,
-                                               BRW_SS_SURFACE,
-                                               &ss, sizeof ss,
-					       &surface->bo, 1,
-					       &ss, sizeof ss,
-					       NULL, NULL);
+   if (brw_search_cache(&brw->surface_cache,
+                        BRW_SS_SURFACE,
+                        &ss, sizeof(ss),
+                        &surface->bo, 1,
+                        NULL,
+                        bo_out))
+      return PIPE_OK;
+       
+   ret = brw_upload_cache(&brw->surface_cache,
+                          BRW_SS_SURFACE,
+                          &ss, sizeof ss,
+                          &surface->bo, 1,
+                          &ss, sizeof ss,
+                          NULL, NULL,
+                          bo_out);
+   if (ret)
+      return ret;
 
       /* XXX: we will only be rendering to this surface:
        */
-      brw->sws->bo_emit_reloc(brw->wm.surf_bo[unit],
-			      BRW_USAGE_RENDER_TARGET,
-			      ss.ss1.base_addr - surface->bo->offset[0], /* XXX */
-			      offsetof(struct brw_surface_state, ss1),
-			      surface->bo);
-   }
+   ret = brw->sws->bo_emit_reloc(*bo_out,
+                                 BRW_USAGE_RENDER_TARGET,
+                                 ss.ss1.base_addr - surface->bo->offset[0], /* XXX */
+                                 offsetof(struct brw_surface_state, ss1),
+                                 surface->bo);
+   if (ret)
+      return ret;
+
+   return PIPE_OK;
 }
 
 
@@ -129,60 +144,60 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
  * Constructs the binding table for the WM surface state, which maps unit
  * numbers to surface state objects.
  */
-static struct brw_winsys_buffer *
-brw_wm_get_binding_table(struct brw_context *brw)
+static enum pipe_error
+brw_wm_get_binding_table(struct brw_context *brw,
+                         struct brw_winsys_buffer **bo_out )
 {
-   struct brw_winsys_buffer *bind_bo;
+   enum pipe_error ret;
+   uint32_t data[BRW_WM_MAX_SURF];
+   GLuint data_size = brw->wm.nr_surfaces * sizeof data[0];
+   int i;
 
    assert(brw->wm.nr_surfaces <= BRW_WM_MAX_SURF);
+   assert(brw->wm.nr_surfaces > 0);
 
    /* Note there is no key for this search beyond the values in the
     * relocation array:
     */
-   bind_bo = brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
-			      NULL, 0,
-			      brw->wm.surf_bo, brw->wm.nr_surfaces,
-			      NULL);
-
-   if (bind_bo == NULL) {
-      uint32_t data[BRW_WM_MAX_SURF];
-      GLuint data_size = brw->wm.nr_surfaces * sizeof data[0];
-      int i;
-
-      for (i = 0; i < brw->wm.nr_surfaces; i++)
-	 data[i] = brw->wm.surf_bo[i]->offset[0];
-
-      bind_bo = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND,
-				  NULL, 0,
-				  brw->wm.surf_bo, brw->wm.nr_surfaces,
-				  data, data_size,
-				  NULL, NULL);
-
-      /* Emit binding table relocations to surface state */
-      for (i = 0; i < brw->wm.nr_surfaces; i++) {
-	 brw->sws->bo_emit_reloc(bind_bo,
-				 BRW_USAGE_STATE,
-				 0,
-				 i * sizeof(GLuint),
-				 brw->wm.surf_bo[i]);
-      }
+   if (brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
+                        NULL, 0,
+                        brw->wm.surf_bo,
+                        brw->wm.nr_surfaces,
+                        NULL,
+                        bo_out))
+      return PIPE_OK;
+
+   for (i = 0; i < brw->wm.nr_surfaces; i++)
+      data[i] = brw->wm.surf_bo[i]->offset[0];
+
+   ret = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND,
+                           NULL, 0,
+                           brw->wm.surf_bo, brw->wm.nr_surfaces,
+                           data, data_size,
+                           NULL, NULL,
+                           bo_out);
+   if (ret)
+      return ret;
+
+   /* Emit binding table relocations to surface state */
+   for (i = 0; i < brw->wm.nr_surfaces; i++) {
+      ret = brw->sws->bo_emit_reloc(*bo_out,
+                                    BRW_USAGE_STATE,
+                                    0,
+                                    i * sizeof(GLuint),
+                                    brw->wm.surf_bo[i]);
+      if (ret)
+         return ret;
    }
 
-   return bind_bo;
+   return PIPE_OK;
 }
 
-static int prepare_wm_surfaces(struct brw_context *brw )
+static enum pipe_error prepare_wm_surfaces(struct brw_context *brw )
 {
-   GLuint i;
+   enum pipe_error ret;
    int nr_surfaces = 0;
-
-   /* Unreference old buffers
-    */
-   for (i = 0; i < brw->wm.nr_surfaces; i++) {
-      brw->sws->bo_unreference(brw->wm.surf_bo[i]);
-      brw->wm.surf_bo[i] = NULL;
-   }
-
+   GLuint i;
 
    /* PIPE_NEW_COLOR_BUFFERS | PIPE_NEW_BLEND
     *
@@ -192,38 +207,51 @@ static int prepare_wm_surfaces(struct brw_context *brw )
     * XXX: no color buffer case
     */
    for (i = 0; i < brw->curr.fb.nr_cbufs; i++) {
-      brw_update_renderbuffer_surface(brw, 
-				      brw_surface(brw->curr.fb.cbufs[i]), 
-				      nr_surfaces++);
+      ret = brw_update_render_surface(brw, 
+                                      brw_surface(brw->curr.fb.cbufs[i]), 
+                                      &brw->wm.surf_bo[nr_surfaces++]);
+      if (ret)
+         return ret;
    }
 
    /* PIPE_NEW_TEXTURE 
     */
    for (i = 0; i < brw->curr.num_textures; i++) {
-      brw_update_texture_surface(brw, 
-				 brw_texture(brw->curr.texture[i]),
-				 nr_surfaces++);
+      ret = brw_update_texture_surface(brw, 
+                                       brw_texture(brw->curr.texture[i]),
+                                       &brw->wm.surf_bo[nr_surfaces++]);
+      if (ret)
+         return ret;
    }
 
    /* PIPE_NEW_FRAGMENT_CONSTANTS
     */
 #if 0
    if (brw->curr.fragment_constants) {
-      brw_update_fragment_constant_surface(brw, 
-					   brw->curr.fragment_constants, 
-					   nr_surfaces++);
+      ret = brw_update_fragment_constant_surface(brw, 
+                                                 brw->curr.fragment_constants, 
+                                                 &brw->wm.surf_bo[nr_surfaces++]);
+      if (ret)
+         return ret;
    }
 #endif
 
    if (brw->wm.nr_surfaces != nr_surfaces) {
+
+      /* Unreference any left-over old buffers
+       */
+      for (i = nr_surfaces; i < brw->wm.nr_surfaces; i++)
+         bo_reference(&brw->wm.surf_bo[i], NULL);
+
       brw->wm.nr_surfaces = nr_surfaces;
       brw->state.dirty.brw |= BRW_NEW_NR_WM_SURFACES;
    }
 
-   brw->sws->bo_unreference(brw->wm.bind_bo);
-   brw->wm.bind_bo = brw_wm_get_binding_table(brw);
+   ret = brw_wm_get_binding_table(brw, &brw->wm.bind_bo);
+   if (ret)
+      return ret;
 
-   return 0;
+   return PIPE_OK;
 }
 
 const struct brw_tracked_state brw_wm_surfaces = {
diff --git a/src/gallium/winsys/drm/i965/xlib/xlib_i965.c b/src/gallium/winsys/drm/i965/xlib/xlib_i965.c
index b1edca818a..fc465d7c14 100644
--- a/src/gallium/winsys/drm/i965/xlib/xlib_i965.c
+++ b/src/gallium/winsys/drm/i965/xlib/xlib_i965.c
@@ -134,11 +134,12 @@ const char *data_types[BRW_DATA_MAX] =
 };
 
 
-static struct brw_winsys_buffer *
+static enum pipe_error
 xlib_brw_bo_alloc( struct brw_winsys_screen *sws,
-		      enum brw_buffer_type type,
-		      unsigned size,
-		      unsigned alignment )
+                   enum brw_buffer_type type,
+                   unsigned size,
+                   unsigned alignment,
+                   struct brw_winsys_buffer **bo_out )
 {
    struct xlib_brw_winsys *xbw = xlib_brw_winsys(sws);
    struct xlib_brw_buffer *buf;
@@ -148,12 +149,13 @@ xlib_brw_bo_alloc( struct brw_winsys_screen *sws,
 
    buf = CALLOC_STRUCT(xlib_brw_buffer);
    if (!buf)
-      return NULL;
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   pipe_reference_init(&buf->base.reference, 1);
 
    buf->offset = align(xbw->offset, alignment);
    buf->type = type;
    buf->virtual = MALLOC(size);
-   buf->cheesy_refcount = 1;
    buf->base.offset = &buf->offset; /* hmm, cheesy */
    buf->base.size = size;
 
@@ -161,36 +163,25 @@ xlib_brw_bo_alloc( struct brw_winsys_screen *sws,
    if (xbw->offset > MAX_VRAM)
       goto err;
 
-   return &buf->base;
+   /* XXX: possibly rentrant call to bo_destroy:
+    */
+   bo_reference(bo_out, &buf->base);
+   return PIPE_OK;
 
 err:
    assert(0);
+   FREE(buf->virtual);
    FREE(buf);
-   return NULL;
-}
-
-static void 
-xlib_brw_bo_reference( struct brw_winsys_buffer *buffer )
-{
-   struct xlib_brw_buffer *buf = xlib_brw_buffer(buffer);
-
-   buf->cheesy_refcount++;
+   return PIPE_ERROR_OUT_OF_MEMORY;
 }
 
 static void 
-xlib_brw_bo_unreference( struct brw_winsys_buffer *buffer )
+xlib_brw_bo_destroy( struct brw_winsys_buffer *buffer )
 {
    struct xlib_brw_buffer *buf = xlib_brw_buffer(buffer);
 
-   /* As a special favor in this call only, buffer is allowed to be
-    * NULL:
-    */
-   if (buffer == NULL)
-      return;
-
-   if (--buf->cheesy_refcount == 0) {
-      FREE(buffer);
-   }
+   FREE(buf->virtual);
+   FREE(buf);
 }
 
 static int 
@@ -378,8 +369,7 @@ xlib_create_brw_winsys_screen( void )
 
    ws->base.destroy              = xlib_brw_winsys_destroy;
    ws->base.bo_alloc             = xlib_brw_bo_alloc;
-   ws->base.bo_reference         = xlib_brw_bo_reference;
-   ws->base.bo_unreference       = xlib_brw_bo_unreference;
+   ws->base.bo_destroy           = xlib_brw_bo_destroy;
    ws->base.bo_emit_reloc        = xlib_brw_bo_emit_reloc;
    ws->base.bo_exec              = xlib_brw_bo_exec;
    ws->base.bo_subdata           = xlib_brw_bo_subdata;
-- 
cgit v1.2.3


From 2f54d02d205468a840b35a3554f2ad8ffc31ec9c Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Tue, 10 Nov 2009 18:07:11 -0800
Subject: i965g: consult fs inputs when laying out vs output regs

Vertex shader now emits just the FS inputs, in the positions and order
expected by the fragment shader.

This means potentially regenerating the vertex shader to match
different fragment shaders' input layouts.
---
 src/gallium/drivers/i965/brw_context.h     |  13 ++++
 src/gallium/drivers/i965/brw_pipe_shader.c |   6 ++
 src/gallium/drivers/i965/brw_vs.c          |  14 ++--
 src/gallium/drivers/i965/brw_vs.h          |   7 +-
 src/gallium/drivers/i965/brw_vs_emit.c     | 116 ++++++++++++++++++++---------
 5 files changed, 113 insertions(+), 43 deletions(-)

(limited to 'src/gallium/drivers/i965/brw_vs.c')

diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h
index 4a975ecd7e..31f3cf3685 100644
--- a/src/gallium/drivers/i965/brw_context.h
+++ b/src/gallium/drivers/i965/brw_context.h
@@ -161,11 +161,24 @@ struct brw_vertex_shader {
    GLboolean use_const_buffer;
 };
 
+struct brw_fs_signature {
+   GLuint nr_inputs;
+   struct {
+      GLuint semantic:5;
+      GLuint semantic_index:27;
+   } input[PIPE_MAX_SHADER_INPUTS];
+};
+
+#define brw_fs_signature_size(s) (offsetof(struct brw_fs_signature, input) + \
+                                  ((s)->nr_inputs * sizeof (s)->input[0])) 
+
 
 struct brw_fragment_shader {
    const struct tgsi_token *tokens;
    struct tgsi_shader_info info;
 
+   struct brw_fs_signature signature;
+
    unsigned iz_lookup;
    //unsigned wm_lookup;
    
diff --git a/src/gallium/drivers/i965/brw_pipe_shader.c b/src/gallium/drivers/i965/brw_pipe_shader.c
index 44f9ad6f9c..7febf9e0c2 100644
--- a/src/gallium/drivers/i965/brw_pipe_shader.c
+++ b/src/gallium/drivers/i965/brw_pipe_shader.c
@@ -96,6 +96,12 @@ static void *brw_create_fs_state( struct pipe_context *pipe,
 
    tgsi_scan_shader(fs->tokens, &fs->info);
 
+   fs->signature.nr_inputs = fs->info.num_inputs;
+   for (i = 0; i < fs->info.num_inputs; i++) {
+      fs->signature.input[i].semantic = fs->info.input_semantic_name[i];
+      fs->signature.input[i].semantic_index = fs->info.input_semantic_index[i];
+   }
+
    for (i = 0; i < fs->info.num_inputs; i++)
       if (fs->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION)
 	 fs->uses_depth = 1;
diff --git a/src/gallium/drivers/i965/brw_vs.c b/src/gallium/drivers/i965/brw_vs.c
index 966940ceac..05a62ed974 100644
--- a/src/gallium/drivers/i965/brw_vs.c
+++ b/src/gallium/drivers/i965/brw_vs.c
@@ -90,22 +90,24 @@ static enum pipe_error brw_upload_vs_prog(struct brw_context *brw)
 {
    struct brw_vs_prog_key key;
    struct brw_vertex_shader *vp = brw->curr.vertex_shader;
+   struct brw_fragment_shader *fs = brw->curr.fragment_shader;
    enum pipe_error ret;
 
    memset(&key, 0, sizeof(key));
 
-   /* Just upload the program verbatim for now.  Always send it all
-    * the inputs it asks for, whether they are varying or not.
-    */
    key.program_string_id = vp->id;
    key.nr_userclip = brw->curr.ucp.nr;
    key.copy_edgeflag = (brw->curr.rast->templ.fill_ccw != PIPE_POLYGON_MODE_FILL ||
 			brw->curr.rast->templ.fill_cw != PIPE_POLYGON_MODE_FILL);
 
+   memcpy(&key.fs_signature, &fs->signature,
+          brw_fs_signature_size(&fs->signature));
+
+
    /* Make an early check for the key.
     */
    if (brw_search_cache(&brw->cache, BRW_VS_PROG,
-                        &key, sizeof(key),
+                        &key, brw_vs_prog_key_size(&key),
                         NULL, 0,
                         &brw->vs.prog_data,
                         &brw->vs.prog_bo))
@@ -123,7 +125,9 @@ static enum pipe_error brw_upload_vs_prog(struct brw_context *brw)
  */
 const struct brw_tracked_state brw_vs_prog = {
    .dirty = {
-      .mesa  = PIPE_NEW_CLIP | PIPE_NEW_RAST,
+      .mesa  = (PIPE_NEW_CLIP | 
+                PIPE_NEW_RAST |
+                PIPE_NEW_FRAGMENT_SHADER),
       .brw   = BRW_NEW_VERTEX_PROGRAM,
       .cache = 0
    },
diff --git a/src/gallium/drivers/i965/brw_vs.h b/src/gallium/drivers/i965/brw_vs.h
index b4e450d89b..3d1598d02b 100644
--- a/src/gallium/drivers/i965/brw_vs.h
+++ b/src/gallium/drivers/i965/brw_vs.h
@@ -43,8 +43,11 @@ struct brw_vs_prog_key {
    GLuint nr_userclip:4;
    GLuint copy_edgeflag:1;
    GLuint pad:26;
+   struct brw_fs_signature fs_signature;
 };
 
+#define brw_vs_prog_key_size(s) (offsetof(struct brw_vs_prog_key, fs_signature) + \
+                                 brw_fs_signature_size(&(s)->fs_signature))
 
 
 #define MAX_IF_DEPTH 32
@@ -65,8 +68,8 @@ struct brw_vs_compile {
 
    GLboolean copy_edgeflag;
 
-   GLuint first_output;
-   GLuint first_overflow_output; /**< VERT_ATTRIB_x */
+   GLuint overflow_grf_start;
+   GLuint overflow_count;
 
    GLuint first_tmp;
    GLuint last_tmp;
diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c
index 26f0ec5a11..933c9c4d63 100644
--- a/src/gallium/drivers/i965/brw_vs_emit.c
+++ b/src/gallium/drivers/i965/brw_vs_emit.c
@@ -66,6 +66,38 @@ static void release_tmps( struct brw_vs_compile *c )
 }
 
 
+static boolean is_position_output( struct brw_vs_compile *c,
+                                   unsigned vs_output )
+{
+   struct brw_vertex_shader *vs = c->vp;
+   unsigned semantic = vs->info.output_semantic_name[vs_output];
+   unsigned index = vs->info.output_semantic_index[vs_output];
+
+   return (semantic == TGSI_SEMANTIC_POSITION &&
+           index == 0);
+}
+
+
+static boolean find_output_slot( struct brw_vs_compile *c,
+                                  unsigned vs_output,
+                                  unsigned *fs_input_slot )
+{
+   struct brw_vertex_shader *vs = c->vp;
+   unsigned semantic = vs->info.output_semantic_name[vs_output];
+   unsigned index = vs->info.output_semantic_index[vs_output];
+   unsigned i;
+
+   for (i = 0; i < c->key.fs_signature.nr_inputs; i++) {
+      if (c->key.fs_signature.input[i].semantic == semantic &&
+          c->key.fs_signature.input[i].semantic_index == index) {
+         *fs_input_slot = i;
+         return TRUE;
+      }
+   }
+
+   return FALSE;
+}
+
 
 /**
  * Preallocate GRF register before code emit.
@@ -172,42 +204,50 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
    /* Allocate outputs.  The non-position outputs go straight into message regs.
     */
    c->nr_outputs = c->prog_data.nr_outputs;
-   c->first_output = reg;
-   c->first_overflow_output = 0;
 
    if (c->chipset.is_igdng)
       mrf = 8;
    else
       mrf = 4;
 
+   
+   if (c->key.fs_signature.nr_inputs > BRW_MAX_MRF) {
+      c->overflow_grf_start = reg;
+      c->overflow_count = c->key.fs_signature.nr_inputs - BRW_MAX_MRF;
+      reg += c->overflow_count;
+   }
+
    /* XXX: need to access vertex output semantics here:
     */
    for (i = 0; i < c->prog_data.nr_outputs; i++) {
-      assert(i < Elements(c->regs[TGSI_FILE_OUTPUT]));
+      unsigned slot;
 
-      /* XXX: Hardwire position to zero:
-       */
-      if (i == 0) {
-	 c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
-	 reg++;
-      }
-      /* XXX: disable psiz:
+      /* XXX: Put output position in slot zero always.  Clipper, etc,
+       * need access to this reg.
        */
-      else if (0) {
-	 c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
+      if (is_position_output(c, i)) {
+	 c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0); /* copy to mrf 0 */
 	 reg++;
-	 mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
       }
-      else if (mrf < 16) {
-	 c->regs[TGSI_FILE_OUTPUT][i] = brw_message_reg(mrf);
-	 mrf++;
+      else if (find_output_slot(c, i, &slot)) {
+         
+         if (0 /* is_psize_output(c, i) */ ) {
+            /* c->psize_out.grf = reg; */
+            /* c->psize_out.mrf = i; */
+         }
+         
+         /* The first (16-4) outputs can go straight into the message regs.
+          */
+         if (slot + mrf < BRW_MAX_MRF) {
+            c->regs[TGSI_FILE_OUTPUT][i] = brw_message_reg(slot + mrf);
+         }
+         else {
+            int grf = c->overflow_grf_start + slot - BRW_MAX_MRF;
+            c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(grf, 0);
+         }
       }
       else {
-	 /* too many vertex results to fit in MRF, use GRF for overflow */
-	 if (!c->first_overflow_output)
-	    c->first_overflow_output = i;
-	 c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
-	 reg++;
+         c->regs[TGSI_FILE_OUTPUT][i] = brw_null_reg();
       }
    }     
 
@@ -1072,6 +1112,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
    struct brw_reg pos = c->regs[TGSI_FILE_OUTPUT][VERT_RESULT_HPOS];
    struct brw_reg ndc;
    int eot;
+   int i;
    GLuint len_vertext_header = 2;
 
    if (c->key.copy_edgeflag) {
@@ -1167,7 +1208,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
        len_vertext_header = 2;
    }
 
-   eot = (c->first_overflow_output == 0);
+   eot = (c->overflow_count == 0);
 
    brw_urb_WRITE(p, 
 		 brw_null_reg(), /* dest */
@@ -1182,19 +1223,22 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 		 0, 		/* urb destination offset */
 		 BRW_URB_SWIZZLE_INTERLEAVE);
 
-   if (c->first_overflow_output > 0) {
-      /* Not all of the vertex outputs/results fit into the MRF.
-       * Move the overflowed attributes from the GRF to the MRF and
-       * issue another brw_urb_WRITE().
-       */
+   /* Not all of the vertex outputs/results fit into the MRF.
+    * Move the overflowed attributes from the GRF to the MRF and
+    * issue another brw_urb_WRITE().
+    */
+   for (i = 0; i < c->overflow_count; i += BRW_MAX_MRF) {
+      unsigned nr = MIN2(c->overflow_count - i, BRW_MAX_MRF);
+      GLuint j;
+
+      eot = (i + nr >= c->overflow_count);
+
       /* XXX I'm not 100% sure about which MRF regs to use here.  Starting
        * at mrf[4] atm...
        */
-      GLuint i, mrf = 0;
-      for (i = c->first_overflow_output; i < c->prog_data.nr_outputs; i++) {
-	 /* move from GRF to MRF */
-	 brw_MOV(p, brw_message_reg(4+mrf), c->regs[TGSI_FILE_OUTPUT][i]);
-	 mrf++;
+      for (j = 0; j < nr; j++) {
+	 brw_MOV(p, brw_message_reg(4+j), 
+                 brw_vec8_grf(c->overflow_grf_start + i + j, 0));
       }
 
       brw_urb_WRITE(p,
@@ -1203,11 +1247,11 @@ static void emit_vertex_write( struct brw_vs_compile *c)
                     c->r0,          /* src */
                     0,              /* allocate */
                     1,              /* used */
-                    mrf+1,          /* msg len */
+                    nr+1,          /* msg len */
                     0,              /* response len */
-                    1,              /* eot */
-                    1,              /* writes complete */
-                    BRW_MAX_MRF-1,  /* urb destination offset */
+                    eot,            /* eot */
+                    eot,            /* writes complete */
+                    i-1,            /* urb destination offset */
                     BRW_URB_SWIZZLE_INTERLEAVE);
    }
 }
-- 
cgit v1.2.3


From 1877e6cd2d76143ef8a9c516122afe614ae3b4a4 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Tue, 17 Nov 2009 14:46:23 -0800
Subject: i965g: handle special vs outputs specially

Where vs output semantic tags indicate an output is significant for
fixed function processing (such as clipping, unfilled modes, etc),
retain information about that output so that we can get to it easily
later on.

Fix up the unfilled processing, but hard-wire edgeflag to one for now.

With this change, trivial/tri-unfilled works.
---
 src/gallium/drivers/i965/brw_clip.c          | 45 ++++++++++++++++++++--------
 src/gallium/drivers/i965/brw_clip.h          | 15 +++++++---
 src/gallium/drivers/i965/brw_clip_line.c     |  5 ++--
 src/gallium/drivers/i965/brw_clip_tri.c      | 12 ++++----
 src/gallium/drivers/i965/brw_clip_unfilled.c |  9 +++---
 src/gallium/drivers/i965/brw_clip_util.c     |  2 +-
 src/gallium/drivers/i965/brw_context.h       | 19 +++++++++---
 src/gallium/drivers/i965/brw_pipe_shader.c   | 38 ++++++++++++++++++++---
 src/gallium/drivers/i965/brw_vs.c            | 13 +++++++-
 src/gallium/drivers/i965/brw_vs_emit.c       | 42 ++++++++++++++++----------
 src/gallium/drivers/i965/brw_wm_pass2.c      |  2 +-
 11 files changed, 147 insertions(+), 55 deletions(-)

(limited to 'src/gallium/drivers/i965/brw_vs.c')

diff --git a/src/gallium/drivers/i965/brw_clip.c b/src/gallium/drivers/i965/brw_clip.c
index 35e1d2fdbd..4ec7b823e8 100644
--- a/src/gallium/drivers/i965/brw_clip.c
+++ b/src/gallium/drivers/i965/brw_clip.c
@@ -58,7 +58,6 @@ compile_clip_prog( struct brw_context *brw,
    const GLuint *program;
    GLuint program_size;
    GLuint delta;
-   GLuint i;
 
    memset(&c, 0, sizeof(c));
    
@@ -82,16 +81,26 @@ compile_clip_prog( struct brw_context *brw,
    else
        delta = REG_SIZE;
 
-   /* XXX: c.offset is now pretty redundant:
-    */
-   for (i = 0; i < c.key.nr_attrs; i++) {
-      c.offset[i] = delta;
-      delta += ATTR_SIZE;
-   }
-
    /* XXX: c.nr_attrs is very redundant:
     */
    c.nr_attrs = c.key.nr_attrs;
+
+   c.offset_hpos = delta + c.key.output_hpos * ATTR_SIZE;
+
+   if (c.key.output_color0)
+      c.offset_color0 = delta + c.key.output_color0 * ATTR_SIZE;
+
+   if (c.key.output_color1)
+      c.offset_color1 = delta + c.key.output_color1 * ATTR_SIZE;
+
+   if (c.key.output_bfc0)
+      c.offset_bfc0 = delta + c.key.output_bfc0 * ATTR_SIZE;
+
+   if (c.key.output_bfc1)
+      c.offset_bfc1 = delta + c.key.output_bfc1 * ATTR_SIZE;
+
+   if (c.key.output_edgeflag)
+      c.offset_edgeflag = delta + c.key.output_edgeflag * ATTR_SIZE;
    
    if (BRW_IS_IGDNG(brw))
        c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
@@ -158,21 +167,33 @@ compile_clip_prog( struct brw_context *brw,
 static enum pipe_error
 upload_clip_prog(struct brw_context *brw)
 {
-   enum pipe_error ret;
+   const struct brw_vertex_shader *vs = brw->curr.vertex_shader;
    struct brw_clip_prog_key key;
+   enum pipe_error ret;
 
    /* Populate the key, starting from the almost-complete version from
     * the rast state. 
     */
 
    /* PIPE_NEW_RAST */
-   memcpy(&key, &brw->curr.rast->clip_key, sizeof key);
-
+   key = brw->curr.rast->clip_key;
+   
    /* BRW_NEW_REDUCED_PRIMITIVE */
    key.primitive = brw->reduced_primitive;
 
+   /* XXX: if edgeflag is moved to a proper TGSI vs output, can remove
+    * dependency on CACHE_NEW_VS_PROG
+    */
+   /* CACHE_NEW_VS_PROG */
+   key.nr_attrs        = brw->vs.prog_data->nr_outputs;
+   key.output_edgeflag = brw->vs.prog_data->output_edgeflag;
+
    /* PIPE_NEW_VS */
-   key.nr_attrs = brw->curr.vertex_shader->info.file_max[TGSI_FILE_OUTPUT] + 1;
+   key.output_hpos     = vs->output_hpos;
+   key.output_color0   = vs->output_color0;
+   key.output_color1   = vs->output_color1;
+   key.output_bfc0     = vs->output_bfc0;
+   key.output_bfc1     = vs->output_bfc1;
 
    /* PIPE_NEW_CLIP */
    key.nr_userclip = brw->curr.ucp.nr;
diff --git a/src/gallium/drivers/i965/brw_clip.h b/src/gallium/drivers/i965/brw_clip.h
index 9bec9643d7..8729efa47b 100644
--- a/src/gallium/drivers/i965/brw_clip.h
+++ b/src/gallium/drivers/i965/brw_clip.h
@@ -42,7 +42,7 @@
  * up polygon offset and flatshading at this point:
  */
 struct brw_clip_prog_key {
-   GLuint nr_attrs:5;
+   GLuint nr_attrs:6;
    GLuint primitive:4;
    GLuint nr_userclip:3;
    GLuint do_flat_shading:1;
@@ -54,7 +54,14 @@ struct brw_clip_prog_key {
    GLuint copy_bfc_cw:1;
    GLuint copy_bfc_ccw:1;
    GLuint clip_mode:3;
-   GLuint pad1:7;
+   GLuint output_hpos:6;        /* not always zero? */
+
+   GLuint output_color0:6;
+   GLuint output_color1:6;
+   GLuint output_bfc0:6;
+   GLuint output_bfc1:6;
+   GLuint output_edgeflag:6;
+   GLuint pad1:2;
    
    GLfloat offset_factor;
    GLfloat offset_units;
@@ -123,7 +130,6 @@ struct brw_clip_compile {
    GLuint last_mrf;
 
    GLuint header_position_offset;
-   GLuint offset[PIPE_MAX_SHADER_OUTPUTS];
    GLboolean need_ff_sync;
 
    GLuint nr_color_attrs;
@@ -131,7 +137,8 @@ struct brw_clip_compile {
    GLuint offset_color1;
    GLuint offset_bfc0;
    GLuint offset_bfc1;
-   
+
+   GLuint offset_hpos;
    GLuint offset_edgeflag;
 };
 
diff --git a/src/gallium/drivers/i965/brw_clip_line.c b/src/gallium/drivers/i965/brw_clip_line.c
index a4790bda95..54282d975e 100644
--- a/src/gallium/drivers/i965/brw_clip_line.c
+++ b/src/gallium/drivers/i965/brw_clip_line.c
@@ -132,7 +132,6 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
    struct brw_instruction *is_neg2 = NULL;
    struct brw_instruction *not_culled;
    struct brw_reg v1_null_ud = retype(vec1(brw_null_reg()), BRW_REGISTER_TYPE_UD);
-   const int hpos = 0;		/* XXX: position not always first element */
 
    brw_MOV(p, get_addr_reg(vtx0),      brw_address(c->reg.vertex[0]));
    brw_MOV(p, get_addr_reg(vtx1),      brw_address(c->reg.vertex[1]));
@@ -173,12 +172,12 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
 
 	 /* dp = DP4(vtx->position, plane) 
 	  */
-	 brw_DP4(p, vec4(c->reg.dp0), deref_4f(vtx0, c->offset[hpos]), c->reg.plane_equation);
+	 brw_DP4(p, vec4(c->reg.dp0), deref_4f(vtx0, c->offset_hpos), c->reg.plane_equation);
 
 	 /* if (IS_NEGATIVE(dp1)) 
 	  */
 	 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
-	 brw_DP4(p, vec4(c->reg.dp1), deref_4f(vtx1, c->offset[hpos]), c->reg.plane_equation);
+	 brw_DP4(p, vec4(c->reg.dp1), deref_4f(vtx1, c->offset_hpos), c->reg.plane_equation);
 	 is_negative = brw_IF(p, BRW_EXECUTE_1);
 	 {
              /*
diff --git a/src/gallium/drivers/i965/brw_clip_tri.c b/src/gallium/drivers/i965/brw_clip_tri.c
index 5486f4fa89..fa00f6044f 100644
--- a/src/gallium/drivers/i965/brw_clip_tri.c
+++ b/src/gallium/drivers/i965/brw_clip_tri.c
@@ -249,13 +249,13 @@ void brw_clip_tri( struct brw_clip_compile *c )
 
 	    /* IS_NEGATIVE(prev) */
 	    brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
-	    brw_DP4(p, vec4(c->reg.dpPrev), deref_4f(vtxPrev, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation);
+	    brw_DP4(p, vec4(c->reg.dpPrev), deref_4f(vtxPrev, c->offset_hpos), c->reg.plane_equation);
 	    prev_test = brw_IF(p, BRW_EXECUTE_1);
 	    {
 	       /* IS_POSITIVE(next)
 		*/
 	       brw_set_conditionalmod(p, BRW_CONDITIONAL_GE);
-	       brw_DP4(p, vec4(c->reg.dp), deref_4f(vtx, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation);
+	       brw_DP4(p, vec4(c->reg.dp), deref_4f(vtx, c->offset_hpos), c->reg.plane_equation);
 	       next_test = brw_IF(p, BRW_EXECUTE_1);
 	       {
 
@@ -297,7 +297,7 @@ void brw_clip_tri( struct brw_clip_compile *c )
 	       /* IS_NEGATIVE(next)
 		*/
 	       brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
-	       brw_DP4(p, vec4(c->reg.dp), deref_4f(vtx, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation);
+	       brw_DP4(p, vec4(c->reg.dp), deref_4f(vtx, c->offset_hpos), c->reg.plane_equation);
 	       next_test = brw_IF(p, BRW_EXECUTE_1);
 	       {
 		  /* Going out of bounds.  Avoid division by zero as we
@@ -462,9 +462,9 @@ static void brw_clip_test( struct brw_clip_compile *c )
     brw_MOV(p, get_addr_reg(vt0), brw_address(c->reg.vertex[0]));
     brw_MOV(p, get_addr_reg(vt1), brw_address(c->reg.vertex[1]));
     brw_MOV(p, get_addr_reg(vt2), brw_address(c->reg.vertex[2]));
-    brw_MOV(p, v0, deref_4f(vt0, c->offset[VERT_RESULT_HPOS]));
-    brw_MOV(p, v1, deref_4f(vt1, c->offset[VERT_RESULT_HPOS]));
-    brw_MOV(p, v2, deref_4f(vt2, c->offset[VERT_RESULT_HPOS]));
+    brw_MOV(p, v0, deref_4f(vt0, c->offset_hpos));
+    brw_MOV(p, v1, deref_4f(vt1, c->offset_hpos));
+    brw_MOV(p, v2, deref_4f(vt2, c->offset_hpos));
     brw_AND(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(~0x3f));
 
     /* test nearz, xmin, ymin plane */
diff --git a/src/gallium/drivers/i965/brw_clip_unfilled.c b/src/gallium/drivers/i965/brw_clip_unfilled.c
index 0fab3a5f1a..aec835b8ce 100644
--- a/src/gallium/drivers/i965/brw_clip_unfilled.c
+++ b/src/gallium/drivers/i965/brw_clip_unfilled.c
@@ -45,9 +45,9 @@ static void compute_tri_direction( struct brw_clip_compile *c )
    struct brw_compile *p = &c->func;
    struct brw_reg e = c->reg.tmp0;
    struct brw_reg f = c->reg.tmp1;
-   struct brw_reg v0 = byte_offset(c->reg.vertex[0], c->offset[VERT_RESULT_HPOS]); 
-   struct brw_reg v1 = byte_offset(c->reg.vertex[1], c->offset[VERT_RESULT_HPOS]); 
-   struct brw_reg v2 = byte_offset(c->reg.vertex[2], c->offset[VERT_RESULT_HPOS]); 
+   struct brw_reg v0 = byte_offset(c->reg.vertex[0], c->offset_hpos); 
+   struct brw_reg v1 = byte_offset(c->reg.vertex[1], c->offset_hpos); 
+   struct brw_reg v2 = byte_offset(c->reg.vertex[2], c->offset_hpos); 
 
 
    struct brw_reg v0n = get_tmp(c);
@@ -123,7 +123,8 @@ static void copy_bfc( struct brw_clip_compile *c )
 
    /* Do we have any colors to copy? 
     */
-   if (c->nr_color_attrs == 0)
+   if ((c->offset_color0 == 0 || c->offset_bfc0 == 0) &&
+       (c->offset_color1 == 0 || c->offset_bfc1 == 0))
       return;
 
    /* In some wierd degnerate cases we can end up testing the
diff --git a/src/gallium/drivers/i965/brw_clip_util.c b/src/gallium/drivers/i965/brw_clip_util.c
index 018511e699..872042c9a9 100644
--- a/src/gallium/drivers/i965/brw_clip_util.c
+++ b/src/gallium/drivers/i965/brw_clip_util.c
@@ -106,7 +106,7 @@ static void brw_clip_project_vertex( struct brw_clip_compile *c,
    /* Fixup position.  Extract from the original vertex and re-project
     * to screen space:
     */
-   brw_MOV(p, tmp, deref_4f(vert_addr, c->offset[VERT_RESULT_HPOS]));
+   brw_MOV(p, tmp, deref_4f(vert_addr, c->offset_hpos));
    brw_clip_project_position(c, tmp);
    brw_MOV(p, deref_4f(vert_addr, c->header_position_offset), tmp);
 	 
diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h
index 31f3cf3685..31e04b6e14 100644
--- a/src/gallium/drivers/i965/brw_context.h
+++ b/src/gallium/drivers/i965/brw_context.h
@@ -152,13 +152,23 @@ struct brw_rasterizer_state;
 
 struct brw_vertex_shader {
    const struct tgsi_token *tokens;
+   struct brw_winsys_buffer *const_buffer;    /** Program constant buffer/surface */
+
    struct tgsi_shader_info info;
 
-   unsigned  has_flow_control:1;
+   GLuint has_flow_control:1;
+   GLuint use_const_buffer:1;
+
+   /* Offsets of special vertex shader outputs required for clipping.
+    */
+   GLuint output_hpos:6;        /* not always zero? */
+   GLuint output_color0:6;
+   GLuint output_color1:6;
+   GLuint output_bfc0:6;
+   GLuint output_bfc1:6;
+   GLuint output_edgeflag:6;
 
    unsigned id;
-   struct brw_winsys_buffer *const_buffer;    /** Program constant buffer/surface */
-   GLboolean use_const_buffer;
 };
 
 struct brw_fs_signature {
@@ -317,7 +327,8 @@ struct brw_vs_prog_data {
 
    GLuint nr_params;       /**< number of TGSI_FILE_CONSTANT's */
 
-   GLboolean copy_edgeflag;
+   GLuint output_edgeflag;
+
    GLboolean writes_psiz;
 
    /* Used for calculating urb partitions:
diff --git a/src/gallium/drivers/i965/brw_pipe_shader.c b/src/gallium/drivers/i965/brw_pipe_shader.c
index 7febf9e0c2..02bc8fa130 100644
--- a/src/gallium/drivers/i965/brw_pipe_shader.c
+++ b/src/gallium/drivers/i965/brw_pipe_shader.c
@@ -124,21 +124,51 @@ static void *brw_create_vs_state( struct pipe_context *pipe,
 				  const struct pipe_shader_state *shader )
 {
    struct brw_context *brw = brw_context(pipe);
+   struct brw_vertex_shader *vs;
+   unsigned i;
 
-   struct brw_vertex_shader *vs = CALLOC_STRUCT(brw_vertex_shader);
+   vs = CALLOC_STRUCT(brw_vertex_shader);
    if (vs == NULL)
       return NULL;
 
    /* Duplicate tokens, scan shader
     */
-   vs->id = brw->program_id++;
-   vs->has_flow_control = has_flow_control(&vs->info);
-
    vs->tokens = tgsi_dup_tokens(shader->tokens);
    if (vs->tokens == NULL)
       goto fail;
 
    tgsi_scan_shader(vs->tokens, &vs->info);
+
+   vs->id = brw->program_id++;
+   vs->has_flow_control = has_flow_control(&vs->info);
+
+   for (i = 0; i < vs->info.num_outputs; i++) {
+      int index = vs->info.output_semantic_index[i];
+      switch (vs->info.output_semantic_name[i]) {
+      case TGSI_SEMANTIC_POSITION:
+         vs->output_hpos = i;
+         break;
+      case TGSI_SEMANTIC_COLOR:
+         if (index == 0)
+            vs->output_color0 = i;
+         else
+            vs->output_color1 = i;
+         break;
+      case TGSI_SEMANTIC_BCOLOR:
+         if (index == 0)
+            vs->output_bfc0 = i;
+         else
+            vs->output_bfc1 = i;
+         break;
+#if 0
+      case TGSI_SEMANTIC_EDGEFLAG:
+         vs->output_edgeflag = i;
+         break;
+#endif
+      }
+   }
+
+
    
    /* Done:
     */
diff --git a/src/gallium/drivers/i965/brw_vs.c b/src/gallium/drivers/i965/brw_vs.c
index 05a62ed974..2668392919 100644
--- a/src/gallium/drivers/i965/brw_vs.c
+++ b/src/gallium/drivers/i965/brw_vs.c
@@ -57,7 +57,18 @@ static enum pipe_error do_vs_prog( struct brw_context *brw,
 
    c.prog_data.nr_outputs = vp->info.num_outputs;
    c.prog_data.nr_inputs = vp->info.num_inputs;
-   c.prog_data.copy_edgeflag = c.key.copy_edgeflag;
+
+   /* XXX: we want edgeflag handling to be integrated to the vertex
+    * shader, but are currently faking the edgeflag output:
+    */
+   if (c.key.copy_edgeflag) {
+      c.prog_data.output_edgeflag = c.prog_data.nr_outputs;
+      c.prog_data.nr_outputs++;
+   }
+   else {
+      c.prog_data.output_edgeflag = ~0;
+   }
+
 
    if (1)
       tgsi_dump(c.vp->tokens, 0);
diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c
index 933c9c4d63..bcaeaca62d 100644
--- a/src/gallium/drivers/i965/brw_vs_emit.c
+++ b/src/gallium/drivers/i965/brw_vs_emit.c
@@ -70,11 +70,17 @@ static boolean is_position_output( struct brw_vs_compile *c,
                                    unsigned vs_output )
 {
    struct brw_vertex_shader *vs = c->vp;
-   unsigned semantic = vs->info.output_semantic_name[vs_output];
-   unsigned index = vs->info.output_semantic_index[vs_output];
 
-   return (semantic == TGSI_SEMANTIC_POSITION &&
-           index == 0);
+   if (vs_output == c->prog_data.output_edgeflag) {
+      return FALSE;
+   }
+   else {
+      unsigned semantic = vs->info.output_semantic_name[vs_output];
+      unsigned index = vs->info.output_semantic_index[vs_output];
+      
+      return (semantic == TGSI_SEMANTIC_POSITION &&
+              index == 0);
+   }
 }
 
 
@@ -83,15 +89,22 @@ static boolean find_output_slot( struct brw_vs_compile *c,
                                   unsigned *fs_input_slot )
 {
    struct brw_vertex_shader *vs = c->vp;
-   unsigned semantic = vs->info.output_semantic_name[vs_output];
-   unsigned index = vs->info.output_semantic_index[vs_output];
-   unsigned i;
 
-   for (i = 0; i < c->key.fs_signature.nr_inputs; i++) {
-      if (c->key.fs_signature.input[i].semantic == semantic &&
+   if (vs_output == c->prog_data.output_edgeflag) {
+      *fs_input_slot = c->key.fs_signature.nr_inputs;
+      return TRUE;
+   }
+   else {
+      unsigned semantic = vs->info.output_semantic_name[vs_output];
+      unsigned index = vs->info.output_semantic_index[vs_output];
+      unsigned i;
+
+      for (i = 0; i < c->key.fs_signature.nr_inputs; i++) {
+         if (c->key.fs_signature.input[i].semantic == semantic &&
           c->key.fs_signature.input[i].semantic_index == index) {
-         *fs_input_slot = i;
-         return TRUE;
+            *fs_input_slot = i;
+            return TRUE;
+         }
       }
    }
 
@@ -219,7 +232,7 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 
    /* XXX: need to access vertex output semantics here:
     */
-   for (i = 0; i < c->prog_data.nr_outputs; i++) {
+   for (i = 0; i < c->nr_outputs; i++) {
       unsigned slot;
 
       /* XXX: Put output position in slot zero always.  Clipper, etc,
@@ -1116,10 +1129,9 @@ static void emit_vertex_write( struct brw_vs_compile *c)
    GLuint len_vertext_header = 2;
 
    if (c->key.copy_edgeflag) {
-      assert(0);
       brw_MOV(p, 
-	      get_reg(c, TGSI_FILE_OUTPUT, 0),
-	      get_reg(c, TGSI_FILE_INPUT, 0));
+              get_reg(c, TGSI_FILE_OUTPUT, c->prog_data.output_edgeflag),
+              brw_imm_f(1));
    }
 
    /* Build ndc coords */
diff --git a/src/gallium/drivers/i965/brw_wm_pass2.c b/src/gallium/drivers/i965/brw_wm_pass2.c
index 2a879863ab..56f39d036b 100644
--- a/src/gallium/drivers/i965/brw_wm_pass2.c
+++ b/src/gallium/drivers/i965/brw_wm_pass2.c
@@ -93,7 +93,7 @@ static void init_registers( struct brw_wm_compile *c )
    assert(c->key.vp_nr_outputs >= 1);
 
    c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
-   c->prog_data.urb_read_length = c->key.vp_nr_outputs * 2;
+   c->prog_data.urb_read_length = (c->key.nr_inputs + 1) * 2;
    c->prog_data.curb_read_length = c->nr_creg * 2;
 
    /* Note this allocation:
-- 
cgit v1.2.3


From 47cef2bb8f5979ae690e89943f83060999a29a55 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Thu, 19 Nov 2009 18:55:18 -0800
Subject: i965g: add new state flag tracking fs signature changes

---
 src/gallium/drivers/i965/brw_context.h     |  1 +
 src/gallium/drivers/i965/brw_pipe_shader.c | 13 ++++++++++++-
 src/gallium/drivers/i965/brw_sf.c          |  9 ++++-----
 src/gallium/drivers/i965/brw_vs.c          |  7 +++----
 4 files changed, 20 insertions(+), 10 deletions(-)

(limited to 'src/gallium/drivers/i965/brw_vs.c')

diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h
index 31e04b6e14..65859be0ec 100644
--- a/src/gallium/drivers/i965/brw_context.h
+++ b/src/gallium/drivers/i965/brw_context.h
@@ -233,6 +233,7 @@ struct brw_sampler {
 #define PIPE_NEW_SCISSOR                0x100000
 #define PIPE_NEW_BOUND_TEXTURES         0x200000
 #define PIPE_NEW_NR_CBUFS               0x400000
+#define PIPE_NEW_FRAGMENT_SIGNATURE     0x800000
 
 
diff --git a/src/gallium/drivers/i965/brw_pipe_shader.c b/src/gallium/drivers/i965/brw_pipe_shader.c
index 02bc8fa130..c755fa6889 100644
--- a/src/gallium/drivers/i965/brw_pipe_shader.c
+++ b/src/gallium/drivers/i965/brw_pipe_shader.c
@@ -58,9 +58,20 @@ static GLboolean has_flow_control(const struct tgsi_shader_info *info)
 
 static void brw_bind_fs_state( struct pipe_context *pipe, void *prog )
 {
+   struct brw_fragment_shader *fs = (struct brw_fragment_shader *)prog;
    struct brw_context *brw = brw_context(pipe);
+   
+   if (brw->curr.fragment_shader == fs)
+      return;
+
+   if (brw->curr.fragment_shader == NULL ||
+       fs == NULL ||
+       memcmp(&brw->curr.fragment_shader->signature, &fs->signature,
+              brw_fs_signature_size(&fs->signature)) != 0) {
+      brw->state.dirty.mesa |= PIPE_NEW_FRAGMENT_SIGNATURE;
+   }
 
-   brw->curr.fragment_shader = (struct brw_fragment_shader *)prog;
+   brw->curr.fragment_shader = fs;
    brw->state.dirty.mesa |= PIPE_NEW_FRAGMENT_SHADER;
 }
 
diff --git a/src/gallium/drivers/i965/brw_sf.c b/src/gallium/drivers/i965/brw_sf.c
index 6f4502da97..aa2ab5098c 100644
--- a/src/gallium/drivers/i965/brw_sf.c
+++ b/src/gallium/drivers/i965/brw_sf.c
@@ -125,11 +125,10 @@ static enum pipe_error upload_sf_prog(struct brw_context *brw)
    /* Populate the key, noting state dependencies:
     */
 
-   /* XXX: Add one to turn the max value into a count, then add
-    * another one to account for the position input.
+   /* XXX: Add one to account for the position input.
     */
-   /* PIPE_NEW_FRAGMENT_SHADER */
-   key.nr_attrs = brw->curr.fragment_shader->info.file_max[TGSI_FILE_INPUT] + 2;
+   /* PIPE_NEW_FRAGMENT_SIGNATURE */
+   key.nr_attrs = brw->curr.fragment_shader->signature.nr_inputs + 1;
 
 
    /* XXX: this is probably where the mapping between vertex shader
@@ -194,7 +193,7 @@ static enum pipe_error upload_sf_prog(struct brw_context *brw)
 
 const struct brw_tracked_state brw_sf_prog = {
    .dirty = {
-      .mesa  = (PIPE_NEW_RAST | PIPE_NEW_VERTEX_SHADER),
+      .mesa  = (PIPE_NEW_RAST | PIPE_NEW_FRAGMENT_SIGNATURE),
       .brw   = (BRW_NEW_REDUCED_PRIMITIVE),
       .cache = 0
    },
diff --git a/src/gallium/drivers/i965/brw_vs.c b/src/gallium/drivers/i965/brw_vs.c
index 2668392919..25b51eb41e 100644
--- a/src/gallium/drivers/i965/brw_vs.c
+++ b/src/gallium/drivers/i965/brw_vs.c
@@ -101,7 +101,7 @@ static enum pipe_error brw_upload_vs_prog(struct brw_context *brw)
 {
    struct brw_vs_prog_key key;
    struct brw_vertex_shader *vp = brw->curr.vertex_shader;
-   struct brw_fragment_shader *fs = brw->curr.fragment_shader;
+   struct brw_fs_signature *sig = &brw->curr.fragment_shader->signature;
    enum pipe_error ret;
 
    memset(&key, 0, sizeof(key));
@@ -111,8 +111,7 @@ static enum pipe_error brw_upload_vs_prog(struct brw_context *brw)
    key.copy_edgeflag = (brw->curr.rast->templ.fill_ccw != PIPE_POLYGON_MODE_FILL ||
 			brw->curr.rast->templ.fill_cw != PIPE_POLYGON_MODE_FILL);
 
-   memcpy(&key.fs_signature, &fs->signature,
-          brw_fs_signature_size(&fs->signature));
+   memcpy(&key.fs_signature, sig, brw_fs_signature_size(sig));
 
 
    /* Make an early check for the key.
@@ -138,7 +137,7 @@ const struct brw_tracked_state brw_vs_prog = {
    .dirty = {
       .mesa  = (PIPE_NEW_CLIP | 
                 PIPE_NEW_RAST |
-                PIPE_NEW_FRAGMENT_SHADER),
+                PIPE_NEW_FRAGMENT_SIGNATURE),
       .brw   = BRW_NEW_VERTEX_PROGRAM,
       .cache = 0
    },
-- 
cgit v1.2.3


From 968a7dfb292f1eefa9ada8096bb023c051518c32 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Mon, 23 Nov 2009 01:47:57 +0000
Subject: i965g: use correct key size for vs upload

---
 src/gallium/drivers/i965/brw_vs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium/drivers/i965/brw_vs.c')

diff --git a/src/gallium/drivers/i965/brw_vs.c b/src/gallium/drivers/i965/brw_vs.c
index 25b51eb41e..14a1c3bcf1 100644
--- a/src/gallium/drivers/i965/brw_vs.c
+++ b/src/gallium/drivers/i965/brw_vs.c
@@ -84,7 +84,7 @@ static enum pipe_error do_vs_prog( struct brw_context *brw,
       return ret;
 
    ret = brw_upload_cache( &brw->cache, BRW_VS_PROG,
-                           &c.key, sizeof(c.key),
+                           &c.key, brw_vs_prog_key_size(&c.key),
                            NULL, 0,
                            program, program_size,
                            &c.prog_data,
-- 
cgit v1.2.3


From d186079520234a776c3fa88c81da935d65981fec Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Tue, 22 Dec 2009 21:26:51 +0100
Subject: i965g: fix for edgeflag changes (untested)

---
 src/gallium/drivers/i965/brw_pipe_shader.c |  3 ---
 src/gallium/drivers/i965/brw_pipe_vertex.c |  7 -------
 src/gallium/drivers/i965/brw_vs.c          | 14 --------------
 src/gallium/drivers/i965/brw_vs.h          |  3 ---
 src/gallium/drivers/i965/brw_vs_emit.c     |  6 ------
 5 files changed, 33 deletions(-)

(limited to 'src/gallium/drivers/i965/brw_vs.c')

diff --git a/src/gallium/drivers/i965/brw_pipe_shader.c b/src/gallium/drivers/i965/brw_pipe_shader.c
index 31a715ab65..20f20571f6 100644
--- a/src/gallium/drivers/i965/brw_pipe_shader.c
+++ b/src/gallium/drivers/i965/brw_pipe_shader.c
@@ -215,15 +215,12 @@ static void *brw_create_vs_state( struct pipe_context *pipe,
          else
             vs->output_bfc1 = i;
          break;
-#if 0
       case TGSI_SEMANTIC_EDGEFLAG:
          vs->output_edgeflag = i;
          break;
-#endif
       }
    }
 
-
    
    /* Done:
     */
diff --git a/src/gallium/drivers/i965/brw_pipe_vertex.c b/src/gallium/drivers/i965/brw_pipe_vertex.c
index 3d87a2853f..e3c48e3149 100644
--- a/src/gallium/drivers/i965/brw_pipe_vertex.c
+++ b/src/gallium/drivers/i965/brw_pipe_vertex.c
@@ -44,19 +44,12 @@ static void brw_set_vertex_buffers(struct pipe_context *pipe,
    brw->state.dirty.mesa |= PIPE_NEW_VERTEX_BUFFER;
 }
 
-static void brw_set_edgeflags( struct pipe_context *pipe,
-			       const unsigned *bitfield )
-{
-   /* XXX */
-}
-
 
 void 
 brw_pipe_vertex_init( struct brw_context *brw )
 {
    brw->base.set_vertex_buffers = brw_set_vertex_buffers;
    brw->base.set_vertex_elements = brw_set_vertex_elements;
-   brw->base.set_edgeflags = brw_set_edgeflags;
 }
 
 
diff --git a/src/gallium/drivers/i965/brw_vs.c b/src/gallium/drivers/i965/brw_vs.c
index 14a1c3bcf1..e3ea5a3a13 100644
--- a/src/gallium/drivers/i965/brw_vs.c
+++ b/src/gallium/drivers/i965/brw_vs.c
@@ -58,18 +58,6 @@ static enum pipe_error do_vs_prog( struct brw_context *brw,
    c.prog_data.nr_outputs = vp->info.num_outputs;
    c.prog_data.nr_inputs = vp->info.num_inputs;
 
-   /* XXX: we want edgeflag handling to be integrated to the vertex
-    * shader, but are currently faking the edgeflag output:
-    */
-   if (c.key.copy_edgeflag) {
-      c.prog_data.output_edgeflag = c.prog_data.nr_outputs;
-      c.prog_data.nr_outputs++;
-   }
-   else {
-      c.prog_data.output_edgeflag = ~0;
-   }
-
-
    if (1)
       tgsi_dump(c.vp->tokens, 0);
 
@@ -108,8 +96,6 @@ static enum pipe_error brw_upload_vs_prog(struct brw_context *brw)
 
    key.program_string_id = vp->id;
    key.nr_userclip = brw->curr.ucp.nr;
-   key.copy_edgeflag = (brw->curr.rast->templ.fill_ccw != PIPE_POLYGON_MODE_FILL ||
-			brw->curr.rast->templ.fill_cw != PIPE_POLYGON_MODE_FILL);
 
    memcpy(&key.fs_signature, sig, brw_fs_signature_size(sig));
 
diff --git a/src/gallium/drivers/i965/brw_vs.h b/src/gallium/drivers/i965/brw_vs.h
index 3d1598d02b..944d88c84c 100644
--- a/src/gallium/drivers/i965/brw_vs.h
+++ b/src/gallium/drivers/i965/brw_vs.h
@@ -41,7 +41,6 @@
 struct brw_vs_prog_key {
    GLuint program_string_id;
    GLuint nr_userclip:4;
-   GLuint copy_edgeflag:1;
    GLuint pad:26;
    struct brw_fs_signature fs_signature;
 };
@@ -66,8 +65,6 @@ struct brw_vs_compile {
    GLuint nr_immediates;
    GLfloat immediate[128][4];
 
-   GLboolean copy_edgeflag;
-
    GLuint overflow_grf_start;
    GLuint overflow_count;
 
diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c
index 1d0fff0d9e..714def5046 100644
--- a/src/gallium/drivers/i965/brw_vs_emit.c
+++ b/src/gallium/drivers/i965/brw_vs_emit.c
@@ -1141,12 +1141,6 @@ static void emit_vertex_write( struct brw_vs_compile *c)
    int i;
    GLuint len_vertext_header = 2;
 
-   if (c->key.copy_edgeflag) {
-      brw_MOV(p, 
-              get_reg(c, TGSI_FILE_OUTPUT, c->prog_data.output_edgeflag),
-              brw_imm_f(1));
-   }
-
    /* Build ndc coords */
    ndc = get_tmp(c);
    /* ndc = 1.0 / pos.w */
-- 
cgit v1.2.3