52 files changed, 1267 insertions, 1770 deletions
diff --git a/src/mesa/drivers/dri/i965/Makefile b/src/mesa/drivers/dri/i965/Makefile
index 7a55333e89..7758a792fd 100644
--- a/src/mesa/drivers/dri/i965/Makefile
+++ b/src/mesa/drivers/dri/i965/Makefile
@@ -24,7 +24,6 @@ DRIVER_SOURCES = \
 	intel_pixel_draw.c \
 	intel_pixel_read.c \
 	intel_state.c \
-	intel_swapbuffers.c \
 	intel_syncobj.c \
 	intel_tex.c \
 	intel_tex_copy.c \
@@ -96,7 +95,8 @@ ASM_SOURCES =
 
 DRIVER_DEFINES = -I../intel -I../intel/server
 
-DRI_LIB_DEPS += -ldrm_intel
+INCLUDES += $(INTEL_CFLAGS)
+DRI_LIB_DEPS += $(INTEL_LIBS)
 
 include ../Makefile.template
 
diff --git a/src/mesa/drivers/dri/i965/brw_cc.c b/src/mesa/drivers/dri/i965/brw_cc.c
index 5cca605c3f..bac1c3a49c 100644
--- a/src/mesa/drivers/dri/i965/brw_cc.c
+++ b/src/mesa/drivers/dri/i965/brw_cc.c
@@ -34,6 +34,7 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 #include "brw_util.h"
+#include "intel_fbo.h"
 #include "main/macros.h"
 #include "main/enums.h"
 
@@ -55,7 +56,8 @@ static void prepare_cc_vp( struct brw_context *brw )
    }
 
    dri_bo_unreference(brw->cc.vp_bo);
-   brw->cc.vp_bo = brw_cache_data( &brw->cache, BRW_CC_VP, &ccv, NULL, 0 );
+   brw->cc.vp_bo = brw_cache_data(&brw->cache, BRW_CC_VP, &ccv, sizeof(ccv),
+				  NULL, 0);
 }
 
 const struct brw_tracked_state brw_cc_vp = {
@@ -88,6 +90,28 @@ struct brw_cc_unit_key {
    GLenum depth_func;
 };
 
+/**
+ * Modify blend function to force destination alpha to 1.0
+ *
+ * If \c function specifies a blend function that uses destination alpha,
+ * replace it with a function that hard-wires destination alpha to 1.0.  This
+ * is used when rendering to xRGB targets.
+ */
+static GLenum
+fix_xRGB_alpha(GLenum function)
+{
+   switch (function) {
+   case GL_DST_ALPHA:
+      return GL_ONE;
+
+   case GL_ONE_MINUS_DST_ALPHA:
+   case GL_SRC_ALPHA_SATURATE:
+      return GL_ZERO;
+   }
+
+   return function;
+}
+
 static void
 cc_unit_populate_key(struct brw_context *brw, struct brw_cc_unit_key *key)
 {
@@ -131,6 +155,17 @@ cc_unit_populate_key(struct brw_context *brw, struct brw_cc_unit_key *key)
       key->blend_dst_rgb = ctx->Color.BlendDstRGB;
       key->blend_src_a = ctx->Color.BlendSrcA;
       key->blend_dst_a = ctx->Color.BlendDstA;
+
+      /* If the renderbuffer is XRGB, we have to frob the blend function to
+       * force the destination alpha to 1.0.  This means replacing GL_DST_ALPHA
+       * with GL_ONE and GL_ONE_MINUS_DST_ALPHA with GL_ZERO.
+       */
+      if (ctx->DrawBuffer->Visual.alphaBits == 0) {
+	 key->blend_src_rgb = fix_xRGB_alpha(key->blend_src_rgb);
+	 key->blend_src_a   = fix_xRGB_alpha(key->blend_src_a);
+	 key->blend_dst_rgb = fix_xRGB_alpha(key->blend_dst_rgb);
+	 key->blend_dst_a   = fix_xRGB_alpha(key->blend_dst_a);
+      }
    }
 
    key->alpha_enabled = ctx->Color.AlphaEnabled;
diff --git a/src/mesa/drivers/dri/i965/brw_clip.c b/src/mesa/drivers/dri/i965/brw_clip.c
index 20a927cf38..af1d975de9 100644
--- a/src/mesa/drivers/dri/i965/brw_clip.c
+++ b/src/mesa/drivers/dri/i965/brw_clip.c
@@ -50,6 +50,7 @@
 static void compile_clip_prog( struct brw_context *brw,
 			     struct brw_clip_prog_key *key )
 {
+   struct intel_context *intel = &brw->intel;
    struct brw_clip_compile c;
    const GLuint *program;
    GLuint program_size;
@@ -65,27 +66,26 @@ static void compile_clip_prog( struct brw_context *brw,
    c.func.single_program_flow = 1;
 
    c.key = *key;
-   c.need_ff_sync = BRW_IS_IGDNG(brw);
 
    /* Need to locate the two positions present in vertex + header.
     * These are currently hardcoded:
     */
    c.header_position_offset = ATTR_SIZE;
 
-   if (BRW_IS_IGDNG(brw))
+   if (intel->is_ironlake)
        delta = 3 * REG_SIZE;
    else
        delta = REG_SIZE;
 
    for (i = 0; i < VERT_RESULT_MAX; i++)
-      if (c.key.attrs & (1<<i)) {
+      if (c.key.attrs & BITFIELD64_BIT(i)) {
 	 c.offset[i] = delta;
 	 delta += ATTR_SIZE;
       }
 
    c.nr_attrs = brw_count_bits(c.key.attrs);
    
-   if (BRW_IS_IGDNG(brw))
+   if (intel->is_ironlake)
        c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
    else
        c.nr_regs = (c.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */
@@ -143,7 +143,8 @@ static void compile_clip_prog( struct brw_context *brw,
  */
 static void upload_clip_prog(struct brw_context *brw)
 {
-   GLcontext *ctx = &brw->intel.ctx;
+   struct intel_context *intel = &brw->intel;
+   GLcontext *ctx = &intel->ctx;
    struct brw_clip_prog_key key;
 
    memset(&key, 0, sizeof(key));
@@ -156,10 +157,11 @@ static void upload_clip_prog(struct brw_context *brw)
    key.attrs = brw->vs.prog_data->outputs_written;
    /* _NEW_LIGHT */
    key.do_flat_shading = (ctx->Light.ShadeModel == GL_FLAT);
+   key.pv_first = (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION);
    /* _NEW_TRANSFORM */
    key.nr_userclip = brw_count_bits(ctx->Transform.ClipPlanesEnabled);
 
-   if (BRW_IS_IGDNG(brw))
+   if (intel->is_ironlake)
        key.clip_mode = BRW_CLIPMODE_KERNEL_CLIP;
    else
        key.clip_mode = BRW_CLIPMODE_NORMAL;
diff --git a/src/mesa/drivers/dri/i965/brw_clip.h b/src/mesa/drivers/dri/i965/brw_clip.h
index 957df441ab..d71bac7f61 100644
--- a/src/mesa/drivers/dri/i965/brw_clip.h
+++ b/src/mesa/drivers/dri/i965/brw_clip.h
@@ -42,22 +42,21 @@
  * up polygon offset and flatshading at this point:
  */
 struct brw_clip_prog_key {
-   GLuint attrs:32;		
+   GLbitfield64 attrs;
    GLuint primitive:4;
    GLuint nr_userclip:3;
    GLuint do_flat_shading:1;
+   GLuint pv_first:1;
    GLuint do_unfilled:1;
    GLuint fill_cw:2;		/* includes cull information */
    GLuint fill_ccw:2;		/* includes cull information */
    GLuint offset_cw:1;
    GLuint offset_ccw:1;
-   GLuint pad0:17;
-
    GLuint copy_bfc_cw:1;
    GLuint copy_bfc_ccw:1;
    GLuint clip_mode:3;
-   GLuint pad1:27;
-   
+   GLuint pad0:11;
+
    GLfloat offset_factor;
    GLfloat offset_units;
 };
@@ -119,7 +118,6 @@ struct brw_clip_compile {
 
    GLuint header_position_offset;
    GLuint offset[VERT_ATTRIB_MAX];
-   GLboolean need_ff_sync;
 };
 
 #define ATTR_SIZE  (4*4)
diff --git a/src/mesa/drivers/dri/i965/brw_clip_line.c b/src/mesa/drivers/dri/i965/brw_clip_line.c
index 048ca620fa..afc0b11049 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_line.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_line.c
@@ -46,6 +46,7 @@
 
 static void brw_clip_line_alloc_regs( struct brw_clip_compile *c )
 {
+   struct intel_context *intel = &c->func.brw->intel;
    GLuint i = 0,j;
 
    /* Register usage is static, precompute here:
@@ -85,7 +86,7 @@ static void brw_clip_line_alloc_regs( struct brw_clip_compile *c )
       i++;
    }
 
-   if (c->need_ff_sync) {
+   if (intel->needs_ff_sync) {
       c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD);
       i++;
    }
@@ -126,6 +127,7 @@ static void brw_clip_line_alloc_regs( struct brw_clip_compile *c )
 static void clip_and_emit_line( struct brw_clip_compile *c )
 {
    struct brw_compile *p = &c->func;
+   struct brw_context *brw = p->brw;
    struct brw_indirect vtx0     = brw_indirect(0, 0);
    struct brw_indirect vtx1      = brw_indirect(1, 0);
    struct brw_indirect newvtx0   = brw_indirect(2, 0);
@@ -152,7 +154,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
    brw_clip_init_clipmask(c);
 
    /* -ve rhw workaround */
-   if (BRW_IS_965(p->brw)) {
+   if (brw->has_negative_rhw_bug) {
       brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
       brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2),
               brw_imm_ud(1<<20));
@@ -189,7 +191,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
               * Both can be negative on GM965/G965 due to RHW workaround
               * if so, this object should be rejected.
               */
-             if (BRW_IS_965(p->brw)) {
+             if (brw->has_negative_rhw_bug) {
                  brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_LE, c->reg.dp0, brw_imm_f(0.0));
                  is_neg2 = brw_IF(p, BRW_EXECUTE_1);
                  {
@@ -214,7 +216,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
 
              /* If both are positive, do nothing */
              /* Only on GM965/G965 */
-             if (BRW_IS_965(p->brw)) {
+             if (brw->has_negative_rhw_bug) {
                  brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.dp0, brw_imm_f(0.0));
                  is_neg2 = brw_IF(p, BRW_EXECUTE_1);
              }
@@ -229,7 +231,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
                  brw_set_predicate_control(p, BRW_PREDICATE_NONE);
              }
 
-             if (BRW_IS_965(p->brw)) {
+             if (brw->has_negative_rhw_bug) {
                  brw_ENDIF(p, is_neg2);
              }
          }
@@ -269,8 +271,12 @@ void brw_emit_line_clip( struct brw_clip_compile *c )
    brw_clip_line_alloc_regs(c);
    brw_clip_init_ff_sync(c);
 
-   if (c->key.do_flat_shading)
-      brw_clip_copy_colors(c, 0, 1);
+   if (c->key.do_flat_shading) {
+      if (c->key.pv_first)
+         brw_clip_copy_colors(c, 1, 0);
+      else
+         brw_clip_copy_colors(c, 0, 1);
+   }
                 
    clip_and_emit_line(c);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_clip_state.c b/src/mesa/drivers/dri/i965/brw_clip_state.c
index 234b3744bf..c8f24a94e4 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_state.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_state.c
@@ -74,6 +74,7 @@ static dri_bo *
 clip_unit_create_from_key(struct brw_context *brw,
 			  struct brw_clip_unit_key *key)
 {
+   struct intel_context *intel = &brw->intel;
    struct brw_clip_unit_state clip;
    dri_bo *bo;
 
@@ -105,7 +106,7 @@ clip_unit_create_from_key(struct brw_context *brw,
       /* Although up to 16 concurrent Clip threads are allowed on IGDNG, 
        * only 2 threads can output VUEs at a time.
        */
-      if (BRW_IS_IGDNG(brw))
+      if (intel->is_ironlake)
          clip.thread4.max_threads = 16 - 1;        
       else
          clip.thread4.max_threads = 2 - 1;
@@ -130,7 +131,7 @@ clip_unit_create_from_key(struct brw_context *brw,
    clip.clip5.api_mode = BRW_CLIP_API_OGL;
    clip.clip5.clip_mode = key->clip_mode;
 
-   if (BRW_IS_G4X(brw))
+   if (intel->is_g4x)
       clip.clip5.negative_w_clip_test = 1;
 
    clip.clip6.clipper_viewport_state_ptr = 0;
diff --git a/src/mesa/drivers/dri/i965/brw_clip_tri.c b/src/mesa/drivers/dri/i965/brw_clip_tri.c
index 0efd77225e..cfbb8f2686 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_tri.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_tri.c
@@ -51,6 +51,7 @@ static void release_tmps( struct brw_clip_compile *c )
 void brw_clip_tri_alloc_regs( struct brw_clip_compile *c, 
 			      GLuint nr_verts )
 {
+   struct intel_context *intel = &c->func.brw->intel;
    GLuint i = 0,j;
 
    /* Register usage is static, precompute here:
@@ -78,7 +79,7 @@ void brw_clip_tri_alloc_regs( struct brw_clip_compile *c,
       for (j = 0; j < 3; j++) {
 	 GLuint delta = c->nr_attrs*16 + 32;
 
-         if (BRW_IS_IGDNG(c->func.brw))
+         if (intel->is_ironlake)
              delta = c->nr_attrs * 16 + 32 * 3;
 
 	 brw_MOV(&c->func, byte_offset(c->reg.vertex[j], delta), brw_imm_f(0));
@@ -119,7 +120,7 @@ void brw_clip_tri_alloc_regs( struct brw_clip_compile *c,
       i++;
    }
 
-   if (c->need_ff_sync) {
+   if (intel->needs_ff_sync) {
       c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD);
       i++;
    }
@@ -188,14 +189,20 @@ void brw_clip_tri_flat_shade( struct brw_clip_compile *c )
 	   brw_imm_ud(_3DPRIM_POLYGON));
 
    is_poly = brw_IF(p, BRW_EXECUTE_1);
-   {   
+   {
       brw_clip_copy_colors(c, 1, 0);
       brw_clip_copy_colors(c, 2, 0);
    }
    is_poly = brw_ELSE(p, is_poly);
    {
-      brw_clip_copy_colors(c, 0, 2);
-      brw_clip_copy_colors(c, 1, 2);
+      if (c->key.pv_first) {
+         brw_clip_copy_colors(c, 1, 0);
+         brw_clip_copy_colors(c, 2, 0);
+      }
+      else {
+         brw_clip_copy_colors(c, 0, 2);
+         brw_clip_copy_colors(c, 1, 2);
+      }
    }
    brw_ENDIF(p, is_poly);
 }
@@ -565,6 +572,7 @@ void brw_emit_tri_clip( struct brw_clip_compile *c )
 {
    struct brw_instruction *neg_rhw;
    struct brw_compile *p = &c->func;
+   struct brw_context *brw = p->brw;
    brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6);
    brw_clip_tri_init_vertices(c);
    brw_clip_init_clipmask(c);
@@ -572,7 +580,7 @@ void brw_emit_tri_clip( struct brw_clip_compile *c )
 
    /* if -ve rhw workaround bit is set, 
       do cliptest */
-   if (BRW_IS_965(p->brw)) {
+   if (brw->has_negative_rhw_bug) {
       brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
       brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2), 
               brw_imm_ud(1<<20));
diff --git a/src/mesa/drivers/dri/i965/brw_clip_util.c b/src/mesa/drivers/dri/i965/brw_clip_util.c
index 5a73abdfee..86fed59fa4 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_util.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_util.c
@@ -135,6 +135,7 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
 			     GLboolean force_edgeflag)
 {
    struct brw_compile *p = &c->func;
+   struct intel_context *intel = &p->brw->intel;
    struct brw_reg tmp = get_tmp(c);
    GLuint i;
 
@@ -142,7 +143,7 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
     */
    /*
     * After CLIP stage, only first 256 bits of the VUE are read
-    * back on IGDNG, so needn't change it
+    * back on Ironlake, so needn't change it
     */
    brw_copy_indirect_to_indirect(p, dest_ptr, v0_ptr, 1);
       
@@ -151,7 +152,7 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
    for (i = 0; i < c->nr_attrs; i++) {
       GLuint delta = i*16 + 32;
 
-      if (BRW_IS_IGDNG(p->brw))
+      if (intel->is_ironlake)
           delta = i * 16 + 32 * 3;
 
       if (delta == c->offset[VERT_RESULT_EDGE]) {
@@ -185,7 +186,7 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
    if (i & 1) {
       GLuint delta = i*16 + 32;
 
-      if (BRW_IS_IGDNG(p->brw))
+      if (intel->is_ironlake)
           delta = i * 16 + 32 * 3;
 
       brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(0));
@@ -359,7 +360,9 @@ void brw_clip_init_clipmask( struct brw_clip_compile *c )
 
 void brw_clip_ff_sync(struct brw_clip_compile *c)
 {
-    if (c->need_ff_sync) {
+    struct intel_context *intel = &c->func.brw->intel;
+
+    if (intel->needs_ff_sync) {
         struct brw_compile *p = &c->func;
         struct brw_instruction *need_ff_sync;
 
@@ -388,7 +391,9 @@ void brw_clip_ff_sync(struct brw_clip_compile *c)
 
 void brw_clip_init_ff_sync(struct brw_clip_compile *c)
 {
-    if (c->need_ff_sync) {
+    struct intel_context *intel = &c->func.brw->intel;
+
+    if (intel->needs_ff_sync) {
 	struct brw_compile *p = &c->func;
         
         brw_MOV(p, c->reg.ff_sync, brw_imm_ud(0));
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 48685c087b..7bb15956b5 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -77,7 +77,7 @@ static void brwInitDriverFunctions( struct dd_function_table *functions )
 }
 
 GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
-			    __DRIcontextPrivate *driContextPriv,
+			    __DRIcontext *driContextPriv,
 			    void *sharedContextPrivate)
 {
    struct dd_function_table functions;
@@ -111,6 +111,9 @@ GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
    ctx->Const.MaxTextureUnits = MIN2(ctx->Const.MaxTextureCoordUnits,
                                      ctx->Const.MaxTextureImageUnits);
    ctx->Const.MaxVertexTextureImageUnits = 0; /* no vertex shader textures */
+   ctx->Const.MaxCombinedTextureImageUnits =
+      ctx->Const.MaxVertexTextureImageUnits +
+      ctx->Const.MaxTextureImageUnits;
 
    /* Mesa limits textures to 4kx4k; it would be nice to fix that someday
     */
@@ -155,6 +158,38 @@ GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
       MIN2(ctx->Const.FragmentProgram.MaxNativeParameters,
 	   ctx->Const.FragmentProgram.MaxEnvParams);
 
+   if (intel->is_ironlake || intel->is_g4x) {
+      brw->CMD_VF_STATISTICS = CMD_VF_STATISTICS_GM45;
+      brw->CMD_PIPELINE_SELECT = CMD_PIPELINE_SELECT_GM45;
+      brw->has_surface_tile_offset = GL_TRUE;
+      brw->has_compr4 = GL_TRUE;
+      brw->has_aa_line_parameters = GL_TRUE;
+  } else {
+      brw->CMD_VF_STATISTICS = CMD_VF_STATISTICS_965;
+      brw->CMD_PIPELINE_SELECT = CMD_PIPELINE_SELECT_965;
+   }
+
+   /* WM maximum threads is number of EUs times number of threads per EU. */
+   if (intel->is_ironlake) {
+      brw->urb.size = 1024;
+      brw->vs_max_threads = 72;
+      brw->wm_max_threads = 12 * 6;
+   } else if (intel->is_g4x) {
+      brw->urb.size = 384;
+      brw->vs_max_threads = 32;
+      brw->wm_max_threads = 10 * 5;
+   } else {
+      brw->urb.size = 256;
+      brw->vs_max_threads = 16;
+      brw->wm_max_threads = 8 * 4;
+      brw->has_negative_rhw_bug = GL_TRUE;
+   }
+
+   if (INTEL_DEBUG & DEBUG_SINGLE_THREAD) {
+      brw->vs_max_threads = 1;
+      brw->wm_max_threads = 1;
+   }
+
    brw_init_state( brw );
 
    brw->state.dirty.mesa = ~0;
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 59f9475b5a..0dd3087143 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -172,8 +172,8 @@ struct brw_fragment_program {
    GLuint id;  /**< serial no. to identify frag progs, never re-used */
    GLboolean isGLSL;  /**< really, any IF/LOOP/CONT/BREAK instructions */
 
-   dri_bo *const_buffer;    /** Program constant buffer/surface */
    GLboolean use_const_buffer;
+   dri_bo *const_buffer;    /** Program constant buffer/surface */
 
    /** for debugging, which texture units are referenced */
    GLbitfield tex_units_used;
@@ -231,7 +231,7 @@ struct brw_vs_prog_data {
    GLuint curb_read_length;
    GLuint urb_read_length;
    GLuint total_grf;
-   GLuint outputs_written;
+   GLbitfield64 outputs_written;
    GLuint nr_params;       /**< number of float params/constants */
 
    GLuint inputs_read;
@@ -320,7 +320,6 @@ struct brw_cache_item {
    GLuint nr_reloc_bufs;
 
    dri_bo *bo;
-   GLuint data_size;
 
    struct brw_cache_item *next;
 };   
@@ -333,7 +332,6 @@ struct brw_cache {
    struct brw_cache_item **items;
    GLuint size, n_items;
 
-   GLuint key_size[BRW_MAX_CACHE];		/* for fixed-size keys */
    GLuint aux_size[BRW_MAX_CACHE];
    char *name[BRW_MAX_CACHE];
 
@@ -413,23 +411,6 @@ struct brw_vertex_info {
    GLuint sizes[ATTRIB_BIT_DWORDS * 2]; /* sizes:2[VERT_ATTRIB_MAX] */
 };
 
-
-
-
-/* Cache for TNL programs.
- */
-struct brw_tnl_cache_item {
-   GLuint hash;
-   void *key;
-   void *data;
-   struct brw_tnl_cache_item *next;
-};
-
-struct brw_tnl_cache {
-   struct brw_tnl_cache_item **items;
-   GLuint size, n_items;
-};
-
 struct brw_query_object {
    struct gl_query_object Base;
 
@@ -457,8 +438,11 @@ struct brw_context
    GLuint primitive;
 
    GLboolean emit_state_always;
-   GLboolean no_batch_wrap;
-
+   GLboolean has_surface_tile_offset;
+   GLboolean has_compr4;
+   GLboolean has_negative_rhw_bug;
+   GLboolean has_aa_line_parameters;
+;
    struct {
       struct brw_state_flags dirty;
 
@@ -534,6 +518,12 @@ struct brw_context
     */
    GLuint next_free_page;
 
+   /* hw-dependent 3DSTATE_VF_STATISTICS opcode */
+   uint32_t CMD_VF_STATISTICS;
+   /* hw-dependent 3DSTATE_PIPELINE_SELECT opcode */
+   uint32_t CMD_PIPELINE_SELECT;
+   int vs_max_threads;
+   int wm_max_threads;
 
    /* BRW_NEW_URB_ALLOCATIONS:
     */
@@ -561,6 +551,7 @@ struct brw_context
       GLuint clip_start;
       GLuint sf_start;
       GLuint cs_start;
+      GLuint size; /* Hardware URB size, in KB. */
    } urb;
 
    
@@ -688,7 +679,7 @@ void brwInitVtbl( struct brw_context *brw );
  * brw_context.c
  */
 GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
-			    __DRIcontextPrivate *driContextPriv,
+			    __DRIcontext *driContextPriv,
 			    void *sharedContextPrivate);
 
 /*======================================================================
@@ -761,9 +752,5 @@ brw_fragment_program_const(const struct gl_fragment_program *p)
    return (const struct brw_fragment_program *) p;
 }
 
-
-
-#define DO_SETUP_BITS ((1<<(FRAG_ATTRIB_MAX)) - 1)
-
 #endif
 
diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c
index 4be6c77aa1..190310afbb 100644
--- a/src/mesa/drivers/dri/i965/brw_curbe.c
+++ b/src/mesa/drivers/dri/i965/brw_curbe.c
@@ -130,7 +130,7 @@ static void calculate_curbe_offsets( struct brw_context *brw )
 const struct brw_tracked_state brw_curbe_offsets = {
    .dirty = {
       .mesa = _NEW_TRANSFORM,
-      .brw  = BRW_NEW_VERTEX_PROGRAM,
+      .brw  = BRW_NEW_VERTEX_PROGRAM | BRW_NEW_CONTEXT,
       .cache = CACHE_NEW_WM_PROG
    },
    .prepare = calculate_curbe_offsets
@@ -340,7 +340,7 @@ static void emit_constant_buffer(struct brw_context *brw)
    struct intel_context *intel = &brw->intel;
    GLuint sz = brw->curbe.total_size;
 
-   BEGIN_BATCH(2, IGNORE_CLIPRECTS);
+   BEGIN_BATCH(2);
    if (sz == 0) {
       OUT_BATCH((CMD_CONST_BUFFER << 16) | (2 - 2));
       OUT_BATCH(0);
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 78d457ad2b..ea0d7e05d4 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -673,18 +673,10 @@
 #define BRW_SAMPLER_MESSAGE_SIMD8_LD                  3
 #define BRW_SAMPLER_MESSAGE_SIMD16_LD                 3
 
-#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG            0
-#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_IGDNG          0
-#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_IGDNG           0
-#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG       1
-#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_BIAS_IGDNG     1
-#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS_IGDNG      1
-#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_IGDNG        2
-#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_IGDNG      2
-#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD_IGDNG       2
-#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG    3
-#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE_IGDNG  3
-#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE_IGDNG   3
+#define BRW_SAMPLER_MESSAGE_SAMPLE_IGDNG            0
+#define BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_IGDNG       1
+#define BRW_SAMPLER_MESSAGE_SAMPLE_LOD_IGDNG        2
+#define BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_IGDNG    3
 
 /* for IGDNG only */
 #define BRW_SAMPLER_SIMD_MODE_SIMD4X2                   0
@@ -840,12 +832,4 @@
 
 #include "intel_chipset.h"
 
-#define BRW_IS_G4X(brw)         (IS_G4X((brw)->intel.intelScreen->deviceID))
-#define BRW_IS_IGDNG(brw)         (IS_IGDNG((brw)->intel.intelScreen->deviceID))
-#define BRW_IS_965(brw)         (!(BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)))
-#define CMD_PIPELINE_SELECT(brw)        ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_PIPELINE_SELECT_GM45 : CMD_PIPELINE_SELECT_965)
-#define CMD_VF_STATISTICS(brw)          ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_VF_STATISTICS_GM45 : CMD_VF_STATISTICS_965)
-#define URB_SIZES(brw)                  (BRW_IS_IGDNG(brw) ? 1024 : \
-                                         (BRW_IS_G4X(brw) ? 384 : 256))  /* 512 bit units */
-
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_disasm.c b/src/mesa/drivers/dri/i965/brw_disasm.c
index 9fef230507..a8f6b993ac 100644
--- a/src/mesa/drivers/dri/i965/brw_disasm.c
+++ b/src/mesa/drivers/dri/i965/brw_disasm.c
@@ -239,7 +239,7 @@ char *imm_encoding[8] = {
     [2] = "UW",
     [3] = "W",
     [5] = "VF",
-    [5] = "V",
+    [6] = "V",
     [7] = "F"
 };
 
@@ -365,6 +365,7 @@ static int format (FILE *f, char *format, ...)
     va_start (args, format);
 
     vsnprintf (buf, sizeof (buf) - 1, format, args);
+    va_end (args);
     string (f, buf);
     return 0;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index 44bb7bd588..df281b27d5 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -145,7 +145,7 @@ static void brw_emit_prim(struct brw_context *brw,
    prim_packet.base_vert_location = prim->basevertex;
 
    /* Can't wrap here, since we rely on the validated state. */
-   brw->no_batch_wrap = GL_TRUE;
+   intel->no_batch_wrap = GL_TRUE;
 
    /* If we're set to always flush, do it before and after the primitive emit.
     * We want to catch both missed flushes that hurt instruction/state cache
@@ -153,21 +153,17 @@ static void brw_emit_prim(struct brw_context *brw,
     * the besides the draw code.
     */
    if (intel->always_flush_cache) {
-      BEGIN_BATCH(1, IGNORE_CLIPRECTS);
-      OUT_BATCH(intel->vtbl.flush_cmd());
-      ADVANCE_BATCH();
+      intel_batchbuffer_emit_mi_flush(intel->batch);
    }
    if (prim_packet.verts_per_instance) {
       intel_batchbuffer_data( brw->intel.batch, &prim_packet,
-			      sizeof(prim_packet), LOOP_CLIPRECTS);
+			      sizeof(prim_packet));
    }
    if (intel->always_flush_cache) {
-      BEGIN_BATCH(1, IGNORE_CLIPRECTS);
-      OUT_BATCH(intel->vtbl.flush_cmd());
-      ADVANCE_BATCH();
+      intel_batchbuffer_emit_mi_flush(intel->batch);
    }
 
-   brw->no_batch_wrap = GL_FALSE;
+   intel->no_batch_wrap = GL_FALSE;
 }
 
 static void brw_merge_inputs( struct brw_context *brw,
@@ -343,13 +339,6 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
     * so can't access it earlier.
     */
 
-   LOCK_HARDWARE(intel);
-
-   if (!intel->constant_cliprect && intel->driDrawable->numClipRects == 0) {
-      UNLOCK_HARDWARE(intel);
-      return GL_TRUE;
-   }
-
    for (i = 0; i < nr_prims; i++) {
       uint32_t hw_prim;
 
@@ -360,8 +349,7 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
        * an upper bound of how much we might emit in a single
        * brw_try_draw_prims().
        */
-      intel_batchbuffer_require_space(intel->batch, intel->batch->size / 4,
-				      LOOP_CLIPRECTS);
+      intel_batchbuffer_require_space(intel->batch, intel->batch->size / 4);
 
       hw_prim = brw_set_prim(brw, prim[i].mode);
 
@@ -408,7 +396,6 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
    if (intel->always_flush_batch)
       intel_batchbuffer_flush(intel->batch);
  out:
-   UNLOCK_HARDWARE(intel);
 
    brw_state_cache_check_size(brw);
 
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index 0fefbd9d81..c773b71507 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -243,14 +243,6 @@ static void wrap_buffers( struct brw_context *brw,
       dri_bo_unreference(brw->vb.upload.bo);
    brw->vb.upload.bo = dri_bo_alloc(brw->intel.bufmgr, "temporary VBO",
 				    size, 1);
-
-   /* Set the internal VBO\ to no-backing-store.  We only use them as a
-    * temporary within a brw_try_draw_prims while the lock is held.
-    */
-   /* DON'T DO THIS AS IF WE HAVE TO RE-ORG MEMORY WE NEED SOMEWHERE WITH
-      FAKE TO PUSH THIS STUFF */
-//   if (!brw->intel.ttm)
-//      dri_bo_fake_disable_backing_store(brw->vb.upload.bo, NULL, NULL);
 }
 
 static void get_space( struct brw_context *brw,
@@ -502,7 +494,7 @@ static void brw_emit_vertices(struct brw_context *brw)
     * a VE loads from them.
     */
    if (brw->vb.nr_enabled == 0) {
-      BEGIN_BATCH(3, IGNORE_CLIPRECTS);
+      BEGIN_BATCH(3);
       OUT_BATCH((CMD_VERTEX_ELEMENT << 16) | 1);
       OUT_BATCH((0 << BRW_VE0_INDEX_SHIFT) |
 		BRW_VE0_VALID |
@@ -522,7 +514,7 @@ static void brw_emit_vertices(struct brw_context *brw)
     * are interleaved or from the same VBO.  TBD if this makes a
     * performance difference.
     */
-   BEGIN_BATCH(1 + brw->vb.nr_enabled * 4, IGNORE_CLIPRECTS);
+   BEGIN_BATCH(1 + brw->vb.nr_enabled * 4);
    OUT_BATCH((CMD_VERTEX_BUFFER << 16) |
 	     ((1 + brw->vb.nr_enabled * 4) - 2));
 
@@ -535,24 +527,17 @@ static void brw_emit_vertices(struct brw_context *brw)
       OUT_RELOC(input->bo,
 		I915_GEM_DOMAIN_VERTEX, 0,
 		input->offset);
-      if (BRW_IS_IGDNG(brw)) {
-          if (input->stride) {
-              OUT_RELOC(input->bo,
-                        I915_GEM_DOMAIN_VERTEX, 0,
-                        input->offset + input->stride * input->count - 1);
-          } else {
-              assert(input->count == 1);
-              OUT_RELOC(input->bo,
-                        I915_GEM_DOMAIN_VERTEX, 0,
-                        input->offset + input->element_size - 1);
-          }
+      if (intel->is_ironlake) {
+	 OUT_RELOC(input->bo,
+		   I915_GEM_DOMAIN_VERTEX, 0,
+		   input->bo->size - 1);
       } else
           OUT_BATCH(input->stride ? input->count : 0);
       OUT_BATCH(0); /* Instance data step rate */
    }
    ADVANCE_BATCH();
 
-   BEGIN_BATCH(1 + brw->vb.nr_enabled * 2, IGNORE_CLIPRECTS);
+   BEGIN_BATCH(1 + brw->vb.nr_enabled * 2);
    OUT_BATCH((CMD_VERTEX_ELEMENT << 16) | ((1 + brw->vb.nr_enabled * 2) - 2));
    for (i = 0; i < brw->vb.nr_enabled; i++) {
       struct brw_vertex_element *input = brw->vb.enabled[i];
@@ -578,7 +563,7 @@ static void brw_emit_vertices(struct brw_context *brw)
 		(format << BRW_VE0_FORMAT_SHIFT) |
 		(0 << BRW_VE0_SRC_OFFSET_SHIFT));
 
-      if (BRW_IS_IGDNG(brw))
+      if (intel->is_ironlake)
           OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) |
                     (comp1 << BRW_VE1_COMPONENT_1_SHIFT) |
                     (comp2 << BRW_VE1_COMPONENT_2_SHIFT) |
@@ -719,7 +704,7 @@ static void brw_emit_index_buffer(struct brw_context *brw)
       ib.header.bits.index_format = get_index_type(index_buffer->type);
       ib.header.bits.cut_index_enable = 0;
 
-      BEGIN_BATCH(4, IGNORE_CLIPRECTS);
+      BEGIN_BATCH(4);
       OUT_BATCH( ib.header.dword );
       OUT_RELOC(brw->ib.bo,
 		I915_GEM_DOMAIN_VERTEX, 0,
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index 30603bdd0e..39eb88d7c2 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -170,11 +170,11 @@ static INLINE struct brw_reg brw_reg( GLuint file,
                                       GLuint writemask )
 {
    struct brw_reg reg;
-   if (type == BRW_GENERAL_REGISTER_FILE)
+   if (file == BRW_GENERAL_REGISTER_FILE)
       assert(nr < BRW_MAX_GRF);
-   else if (type == BRW_MESSAGE_REGISTER_FILE)
-      assert(nr < BRW_MAX_MRF);
-   else if (type == BRW_ARCHITECTURE_REGISTER_FILE)
+   else if (file == BRW_MESSAGE_REGISTER_FILE)
+      assert((nr & ~(1 << 7)) < BRW_MAX_MRF);
+   else if (file == BRW_ARCHITECTURE_REGISTER_FILE)
       assert(nr <= BRW_ARF_IP);
 
    reg.type = type;
@@ -538,7 +538,7 @@ static INLINE struct brw_reg brw_mask_reg( GLuint subnr )
 
 static INLINE struct brw_reg brw_message_reg( GLuint nr )
 {
-   assert(nr < BRW_MAX_MRF);
+   assert((nr & ~(1 << 7)) < BRW_MAX_MRF);
    return brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE,
 		       nr,
 		       0);
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 241cdc33f8..8d6ac00839 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -55,7 +55,8 @@ static void guess_execution_size( struct brw_instruction *insn,
 static void brw_set_dest( struct brw_instruction *insn,
 			  struct brw_reg dest )
 {
-   if (dest.type != BRW_ARCHITECTURE_REGISTER_FILE)
+   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
+       dest.file != BRW_MESSAGE_REGISTER_FILE)
       assert(dest.nr < 128);
 
    insn->bits1.da1.dest_reg_file = dest.file;
@@ -198,7 +199,7 @@ void brw_set_src1( struct brw_instruction *insn,
        * in the future:
        */
       assert (reg.address_mode == BRW_ADDRESS_DIRECT);
-      //assert (reg.file == BRW_GENERAL_REGISTER_FILE);
+      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
 
       if (insn->header.access_mode == BRW_ALIGN_1) {
 	 insn->bits3.da1.src1_subreg_nr = reg.subnr;
@@ -251,9 +252,10 @@ static void brw_set_math_message( struct brw_context *brw,
 				  GLboolean saturate,
 				  GLuint dataType )
 {
+   struct intel_context *intel = &brw->intel;
    brw_set_src1(insn, brw_imm_d(0));
 
-   if (BRW_IS_IGDNG(brw)) {
+   if (intel->is_ironlake) {
        insn->bits3.math_igdng.function = function;
        insn->bits3.math_igdng.int_type = integer_type;
        insn->bits3.math_igdng.precision = low_precision;
@@ -318,9 +320,10 @@ static void brw_set_urb_message( struct brw_context *brw,
 				 GLuint offset,
 				 GLuint swizzle_control )
 {
+    struct intel_context *intel = &brw->intel;
     brw_set_src1(insn, brw_imm_d(0));
 
-    if (BRW_IS_IGDNG(brw)) {
+    if (intel->is_ironlake) {
         insn->bits3.urb_igdng.opcode = 0;	/* ? */
         insn->bits3.urb_igdng.offset = offset;
         insn->bits3.urb_igdng.swizzle_control = swizzle_control;
@@ -357,9 +360,10 @@ static void brw_set_dp_write_message( struct brw_context *brw,
 				      GLuint response_length,
 				      GLuint end_of_thread )
 {
+   struct intel_context *intel = &brw->intel;
    brw_set_src1(insn, brw_imm_d(0));
 
-   if (BRW_IS_IGDNG(brw)) {
+   if (intel->is_ironlake) {
        insn->bits3.dp_write_igdng.binding_table_index = binding_table_index;
        insn->bits3.dp_write_igdng.msg_control = msg_control;
        insn->bits3.dp_write_igdng.pixel_scoreboard_clear = pixel_scoreboard_clear;
@@ -394,9 +398,10 @@ static void brw_set_dp_read_message( struct brw_context *brw,
 				      GLuint response_length,
 				      GLuint end_of_thread )
 {
+   struct intel_context *intel = &brw->intel;
    brw_set_src1(insn, brw_imm_d(0));
 
-   if (BRW_IS_IGDNG(brw)) {
+   if (intel->is_ironlake) {
        insn->bits3.dp_read_igdng.binding_table_index = binding_table_index;
        insn->bits3.dp_read_igdng.msg_control = msg_control;
        insn->bits3.dp_read_igdng.msg_type = msg_type;
@@ -432,10 +437,11 @@ static void brw_set_sampler_message(struct brw_context *brw,
                                     GLuint header_present,
                                     GLuint simd_mode)
 {
+   struct intel_context *intel = &brw->intel;
    assert(eot == 0);
    brw_set_src1(insn, brw_imm_d(0));
 
-   if (BRW_IS_IGDNG(brw)) {
+   if (intel->is_ironlake) {
       insn->bits3.sampler_igdng.binding_table_index = binding_table_index;
       insn->bits3.sampler_igdng.sampler = sampler;
       insn->bits3.sampler_igdng.msg_type = msg_type;
@@ -446,7 +452,7 @@ static void brw_set_sampler_message(struct brw_context *brw,
       insn->bits3.sampler_igdng.end_of_thread = eot;
       insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_SAMPLER;
       insn->bits2.send_igdng.end_of_thread = eot;
-   } else if (BRW_IS_G4X(brw)) {
+   } else if (intel->is_g4x) {
       insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
       insn->bits3.sampler_g4x.sampler = sampler;
       insn->bits3.sampler_g4x.msg_type = msg_type;
@@ -647,10 +653,11 @@ struct brw_instruction *brw_IF(struct brw_compile *p, GLuint execute_size)
 struct brw_instruction *brw_ELSE(struct brw_compile *p, 
 				 struct brw_instruction *if_insn)
 {
+   struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *insn;
    GLuint br = 1;
 
-   if (BRW_IS_IGDNG(p->brw))
+   if (intel->is_ironlake)
       br = 2;
 
    if (p->single_program_flow) {
@@ -689,9 +696,10 @@ struct brw_instruction *brw_ELSE(struct brw_compile *p,
 void brw_ENDIF(struct brw_compile *p, 
 	       struct brw_instruction *patch_insn)
 {
+   struct intel_context *intel = &p->brw->intel;
    GLuint br = 1;
 
-   if (BRW_IS_IGDNG(p->brw))
+   if (intel->is_ironlake)
       br = 2; 
  
    if (p->single_program_flow) {
@@ -802,10 +810,11 @@ struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
 struct brw_instruction *brw_WHILE(struct brw_compile *p, 
                                   struct brw_instruction *do_insn)
 {
+   struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *insn;
    GLuint br = 1;
 
-   if (BRW_IS_IGDNG(p->brw))
+   if (intel->is_ironlake)
       br = 2;
 
    if (p->single_program_flow)
@@ -845,14 +854,15 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p,
 void brw_land_fwd_jump(struct brw_compile *p, 
 		       struct brw_instruction *jmp_insn)
 {
+   struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *landing = &p->store[p->nr_insn];
    GLuint jmpi = 1;
 
-   if (BRW_IS_IGDNG(p->brw))
+   if (intel->is_ironlake)
        jmpi = 2;
 
    assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
-   assert(jmp_insn->bits1.da1.src1_reg_file = BRW_IMMEDIATE_VALUE);
+   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
 
    jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_fallback.c b/src/mesa/drivers/dri/i965/brw_fallback.c
index 562a17844b..fe5c1ae279 100644
--- a/src/mesa/drivers/dri/i965/brw_fallback.c
+++ b/src/mesa/drivers/dri/i965/brw_fallback.c
@@ -47,7 +47,6 @@
 
 static GLboolean do_check_fallback(struct brw_context *brw)
 {
-   struct intel_context *intel = &brw->intel;
    GLcontext *ctx = &brw->intel.ctx;
    GLuint i;
 
@@ -86,8 +85,7 @@ static GLboolean do_check_fallback(struct brw_context *brw)
    }
 
    /* _NEW_BUFFERS */
-   if (IS_965(intel->intelScreen->deviceID) &&
-       !IS_G4X(intel->intelScreen->deviceID)) {
+   if (!brw->has_surface_tile_offset) {
       for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
 	 struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
 	 struct intel_renderbuffer *irb = intel_renderbuffer(rb);
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index 48c2b9a41c..1bc3eccf49 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -47,6 +47,7 @@
 static void compile_gs_prog( struct brw_context *brw,
 			     struct brw_gs_prog_key *key )
 {
+   struct intel_context *intel = &brw->intel;
    struct brw_gs_compile c;
    const GLuint *program;
    GLuint program_size;
@@ -54,13 +55,12 @@ static void compile_gs_prog( struct brw_context *brw,
    memset(&c, 0, sizeof(c));
    
    c.key = *key;
-   c.need_ff_sync = BRW_IS_IGDNG(brw);
    /* Need to locate the two positions present in vertex + header.
     * These are currently hardcoded:
     */
    c.nr_attrs = brw_count_bits(c.key.attrs);
 
-   if (BRW_IS_IGDNG(brw))
+   if (intel->is_ironlake)
        c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
    else
        c.nr_regs = (c.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */
@@ -85,10 +85,10 @@ static void compile_gs_prog( struct brw_context *brw,
     */
    switch (key->primitive) {
    case GL_QUADS:
-      brw_gs_quads( &c ); 
+      brw_gs_quads( &c, key );
       break;
    case GL_QUAD_STRIP:
-      brw_gs_quad_strip( &c );
+      brw_gs_quad_strip( &c, key );
       break;
    case GL_LINE_LOOP:
       brw_gs_lines( &c );
@@ -149,6 +149,7 @@ static const GLenum gs_prim[GL_POLYGON+1] = {
 static void populate_key( struct brw_context *brw,
 			  struct brw_gs_prog_key *key )
 {
+   GLcontext *ctx = &brw->intel.ctx;
    memset(key, 0, sizeof(*key));
 
    /* CACHE_NEW_VS_PROG */
@@ -158,6 +159,9 @@ static void populate_key( struct brw_context *brw,
    key->primitive = gs_prim[brw->primitive];
 
    key->hint_gs_always = 0;	/* debug code? */
+   
+   /* _NEW_LIGHT */
+   key->pv_first = (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION);
 
    key->need_gs_prog = (key->hint_gs_always ||
 			brw->primitive == GL_QUADS ||
@@ -193,7 +197,7 @@ static void prepare_gs_prog(struct brw_context *brw)
 
 const struct brw_tracked_state brw_gs_prog = {
    .dirty = {
-      .mesa  = 0,
+      .mesa  = _NEW_LIGHT,
       .brw   = BRW_NEW_PRIMITIVE,
       .cache = CACHE_NEW_VS_PROG
    },
diff --git a/src/mesa/drivers/dri/i965/brw_gs.h b/src/mesa/drivers/dri/i965/brw_gs.h
index bbb991ea2e..813b8d447a 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.h
+++ b/src/mesa/drivers/dri/i965/brw_gs.h
@@ -40,11 +40,12 @@
 #define MAX_GS_VERTS (4)	     
 
 struct brw_gs_prog_key {
-   GLuint attrs:32;
+   GLbitfield64 attrs;
    GLuint primitive:4;
    GLuint hint_gs_always:1;
+   GLuint pv_first:1;
    GLuint need_gs_prog:1;
-   GLuint pad:26;
+   GLuint pad:25;
 };
 
 struct brw_gs_compile {
@@ -62,13 +63,12 @@ struct brw_gs_compile {
    GLuint nr_attrs;
    GLuint nr_regs;
    GLuint nr_bytes;
-   GLboolean need_ff_sync;
 };
 
 #define ATTR_SIZE  (4*4)
 
-void brw_gs_quads( struct brw_gs_compile *c );
-void brw_gs_quad_strip( struct brw_gs_compile *c );
+void brw_gs_quads( struct brw_gs_compile *c, struct brw_gs_prog_key *key );
+void brw_gs_quad_strip( struct brw_gs_compile *c, struct brw_gs_prog_key *key );
 void brw_gs_tris( struct brw_gs_compile *c );
 void brw_gs_lines( struct brw_gs_compile *c );
 void brw_gs_points( struct brw_gs_compile *c );
diff --git a/src/mesa/drivers/dri/i965/brw_gs_emit.c b/src/mesa/drivers/dri/i965/brw_gs_emit.c
index a9b2aa2eac..a81b972ef4 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_emit.c
@@ -120,38 +120,60 @@ static void brw_gs_ff_sync(struct brw_gs_compile *c, int num_prim)
 }
 
 
-void brw_gs_quads( struct brw_gs_compile *c )
+void brw_gs_quads( struct brw_gs_compile *c, struct brw_gs_prog_key *key )
 {
+   struct intel_context *intel = &c->func.brw->intel;
+
    brw_gs_alloc_regs(c, 4);
    
    /* Use polygons for correct edgeflag behaviour. Note that vertex 3
     * is the PV for quads, but vertex 0 for polygons:
     */
-   if (c->need_ff_sync)
-	   brw_gs_ff_sync(c, 1);    
-   brw_gs_emit_vue(c, c->reg.vertex[3], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
-   brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2));
-   brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_POLYGON << 2)); 
-   brw_gs_emit_vue(c, c->reg.vertex[2], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END));
+   if (intel->needs_ff_sync)
+	   brw_gs_ff_sync(c, 1);
+   if (key->pv_first) {
+      brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
+      brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_POLYGON << 2));
+      brw_gs_emit_vue(c, c->reg.vertex[2], 0, (_3DPRIM_POLYGON << 2));
+      brw_gs_emit_vue(c, c->reg.vertex[3], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END));
+   }
+   else {
+      brw_gs_emit_vue(c, c->reg.vertex[3], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
+      brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2));
+      brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_POLYGON << 2));
+      brw_gs_emit_vue(c, c->reg.vertex[2], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END));
+   }
 }
 
-void brw_gs_quad_strip( struct brw_gs_compile *c )
+void brw_gs_quad_strip( struct brw_gs_compile *c, struct brw_gs_prog_key *key )
 {
+   struct intel_context *intel = &c->func.brw->intel;
+
    brw_gs_alloc_regs(c, 4);
    
-   if (c->need_ff_sync)
+   if (intel->needs_ff_sync)
 	   brw_gs_ff_sync(c, 1);      
-   brw_gs_emit_vue(c, c->reg.vertex[2], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
-   brw_gs_emit_vue(c, c->reg.vertex[3], 0, (_3DPRIM_POLYGON << 2));
-   brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2)); 
-   brw_gs_emit_vue(c, c->reg.vertex[1], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END));
+   if (key->pv_first) {
+      brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
+      brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_POLYGON << 2));
+      brw_gs_emit_vue(c, c->reg.vertex[2], 0, (_3DPRIM_POLYGON << 2));
+      brw_gs_emit_vue(c, c->reg.vertex[3], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END));
+   }
+   else {
+      brw_gs_emit_vue(c, c->reg.vertex[2], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
+      brw_gs_emit_vue(c, c->reg.vertex[3], 0, (_3DPRIM_POLYGON << 2));
+      brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2));
+      brw_gs_emit_vue(c, c->reg.vertex[1], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END));
+   }
 }
 
 void brw_gs_tris( struct brw_gs_compile *c )
 {
+   struct intel_context *intel = &c->func.brw->intel;
+
    brw_gs_alloc_regs(c, 3);
 
-   if (c->need_ff_sync)
+   if (intel->needs_ff_sync)
 	   brw_gs_ff_sync(c, 1);      
    brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_TRILIST << 2) | R02_PRIM_START));
    brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_TRILIST << 2));
@@ -160,9 +182,11 @@ void brw_gs_tris( struct brw_gs_compile *c )
 
 void brw_gs_lines( struct brw_gs_compile *c )
 {
+   struct intel_context *intel = &c->func.brw->intel;
+
    brw_gs_alloc_regs(c, 2);
 
-   if (c->need_ff_sync)
+   if (intel->needs_ff_sync)
 	   brw_gs_ff_sync(c, 1);      
    brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_LINESTRIP << 2) | R02_PRIM_START));
    brw_gs_emit_vue(c, c->reg.vertex[1], 1, ((_3DPRIM_LINESTRIP << 2) | R02_PRIM_END));
@@ -170,9 +194,11 @@ void brw_gs_lines( struct brw_gs_compile *c )
 
 void brw_gs_points( struct brw_gs_compile *c )
 {
+   struct intel_context *intel = &c->func.brw->intel;
+
    brw_gs_alloc_regs(c, 1);
 
-   if (c->need_ff_sync)
+   if (intel->needs_ff_sync)
 	   brw_gs_ff_sync(c, 1);      
    brw_gs_emit_vue(c, c->reg.vertex[0], 1, ((_3DPRIM_POINTLIST << 2) | R02_PRIM_START | R02_PRIM_END));
 }
diff --git a/src/mesa/drivers/dri/i965/brw_gs_state.c b/src/mesa/drivers/dri/i965/brw_gs_state.c
index ed9d2ffe60..1af5790a67 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_state.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_state.c
@@ -72,6 +72,7 @@ gs_unit_populate_key(struct brw_context *brw, struct brw_gs_unit_key *key)
 static dri_bo *
 gs_unit_create_from_key(struct brw_context *brw, struct brw_gs_unit_key *key)
 {
+   struct intel_context *intel = &brw->intel;
    struct brw_gs_unit_state gs;
    dri_bo *bo;
 
@@ -98,7 +99,7 @@ gs_unit_create_from_key(struct brw_context *brw, struct brw_gs_unit_key *key)
    else
       gs.thread4.max_threads = 0;
 
-   if (BRW_IS_IGDNG(brw))
+   if (intel->is_ironlake)
       gs.thread4.rendering_enable = 1;
 
    if (INTEL_DEBUG & DEBUG_STATS)
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index ea71857548..7b70f787b7 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -66,7 +66,7 @@ static void upload_blend_constant_color(struct brw_context *brw)
 const struct brw_tracked_state brw_blend_constant_color = {
    .dirty = {
       .mesa = _NEW_COLOR,
-      .brw = 0,
+      .brw = BRW_NEW_CONTEXT,
       .cache = 0
    },
    .emit = upload_blend_constant_color
@@ -78,10 +78,7 @@ static void upload_drawing_rect(struct brw_context *brw)
    struct intel_context *intel = &brw->intel;
    GLcontext *ctx = &intel->ctx;
 
-   if (!intel->constant_cliprect)
-      return;
-
-   BEGIN_BATCH(4, NO_LOOP_CLIPRECTS);
+   BEGIN_BATCH(4);
    OUT_BATCH(_3DSTATE_DRAWRECT_INFO_I965);
    OUT_BATCH(0); /* xmin, ymin */
    OUT_BATCH(((ctx->DrawBuffer->Width - 1) & 0xffff) |
@@ -93,7 +90,7 @@ static void upload_drawing_rect(struct brw_context *brw)
 const struct brw_tracked_state brw_drawing_rect = {
    .dirty = {
       .mesa = _NEW_BUFFERS,
-      .brw = 0,
+      .brw = BRW_NEW_CONTEXT,
       .cache = 0
    },
    .emit = upload_drawing_rect
@@ -116,7 +113,7 @@ static void upload_binding_table_pointers(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
 
-   BEGIN_BATCH(6, IGNORE_CLIPRECTS);
+   BEGIN_BATCH(6);
    OUT_BATCH(CMD_BINDING_TABLE_PTRS << 16 | (6 - 2));
    if (brw->vs.bind_bo != NULL)
       OUT_RELOC(brw->vs.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* vs */
@@ -150,7 +147,7 @@ static void upload_pipelined_state_pointers(struct brw_context *brw )
 {
    struct intel_context *intel = &brw->intel;
 
-   BEGIN_BATCH(7, IGNORE_CLIPRECTS);
+   BEGIN_BATCH(7);
    OUT_BATCH(CMD_PIPELINED_STATE_POINTERS << 16 | (7 - 2));
    OUT_RELOC(brw->vs.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
    if (brw->gs.prog_active)
@@ -212,10 +209,10 @@ static void emit_depthbuffer(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
    struct intel_region *region = brw->state.depth_region;
-   unsigned int len = (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? 6 : 5;
+   unsigned int len = (intel->is_g4x || intel->is_ironlake) ? 6 : 5;
 
    if (region == NULL) {
-      BEGIN_BATCH(len, IGNORE_CLIPRECTS);
+      BEGIN_BATCH(len);
       OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2));
       OUT_BATCH((BRW_DEPTHFORMAT_D32_FLOAT << 18) |
 		(BRW_SURFACE_NULL << 29));
@@ -223,7 +220,7 @@ static void emit_depthbuffer(struct brw_context *brw)
       OUT_BATCH(0);
       OUT_BATCH(0);
 
-      if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))
+      if (intel->is_g4x || intel->is_ironlake)
          OUT_BATCH(0);
 
       ADVANCE_BATCH();
@@ -247,7 +244,7 @@ static void emit_depthbuffer(struct brw_context *brw)
 
       assert(region->tiling != I915_TILING_X);
 
-      BEGIN_BATCH(len, IGNORE_CLIPRECTS);
+      BEGIN_BATCH(len);
       OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2));
       OUT_BATCH(((region->pitch * region->cpp) - 1) |
 		(format << 18) |
@@ -262,7 +259,7 @@ static void emit_depthbuffer(struct brw_context *brw)
 		((region->height - 1) << 19));
       OUT_BATCH(0);
 
-      if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))
+      if (intel->is_g4x || intel->is_ironlake)
          OUT_BATCH(0);
 
       ADVANCE_BATCH();
@@ -317,7 +314,7 @@ static void upload_polygon_stipple(struct brw_context *brw)
 const struct brw_tracked_state brw_polygon_stipple = {
    .dirty = {
       .mesa = _NEW_POLYGONSTIPPLE,
-      .brw = 0,
+      .brw = BRW_NEW_CONTEXT,
       .cache = 0
    },
    .emit = upload_polygon_stipple
@@ -330,7 +327,7 @@ const struct brw_tracked_state brw_polygon_stipple = {
 
 static void upload_polygon_stipple_offset(struct brw_context *brw)
 {
-   __DRIdrawablePrivate *dPriv = brw->intel.driDrawable;
+   __DRIdrawable *dPriv = brw->intel.driDrawable;
    struct brw_polygon_stipple_offset bpso;
 
    memset(&bpso, 0, sizeof(bpso));
@@ -362,7 +359,7 @@ static void upload_polygon_stipple_offset(struct brw_context *brw)
 const struct brw_tracked_state brw_polygon_stipple_offset = {
    .dirty = {
       .mesa = _NEW_WINDOW_POS,
-      .brw = 0,
+      .brw = BRW_NEW_CONTEXT,
       .cache = 0
    },
    .emit = upload_polygon_stipple_offset
@@ -374,8 +371,8 @@ const struct brw_tracked_state brw_polygon_stipple_offset = {
 static void upload_aa_line_parameters(struct brw_context *brw)
 {
    struct brw_aa_line_parameters balp;
-   
-   if (BRW_IS_965(brw))
+
+   if (!brw->has_aa_line_parameters)
       return;
 
    /* use legacy aa line coverage computation */
@@ -425,7 +422,7 @@ static void upload_line_stipple(struct brw_context *brw)
 const struct brw_tracked_state brw_line_stipple = {
    .dirty = {
       .mesa = _NEW_LINE,
-      .brw = 0,
+      .brw = BRW_NEW_CONTEXT,
       .cache = 0
    },
    .emit = upload_line_stipple
@@ -444,7 +441,7 @@ static void upload_invarient_state( struct brw_context *brw )
       struct brw_pipeline_select ps;
 
       memset(&ps, 0, sizeof(ps));
-      ps.header.opcode = CMD_PIPELINE_SELECT(brw);
+      ps.header.opcode = brw->CMD_PIPELINE_SELECT;
       ps.header.pipeline_select = 0;
       BRW_BATCH_STRUCT(brw, &ps);
    }
@@ -480,7 +477,7 @@ static void upload_invarient_state( struct brw_context *brw )
       struct brw_vf_statistics vfs;
       memset(&vfs, 0, sizeof(vfs));
 
-      vfs.opcode = CMD_VF_STATISTICS(brw);
+      vfs.opcode = brw->CMD_VF_STATISTICS;
       if (INTEL_DEBUG & DEBUG_STATS)
 	 vfs.statistics_enable = 1; 
 
@@ -512,8 +509,8 @@ static void upload_state_base_address( struct brw_context *brw )
    /* Output the structure (brw_state_base_address) directly to the
     * batchbuffer, so we can emit relocations inline.
     */
-   if (BRW_IS_IGDNG(brw)) {
-       BEGIN_BATCH(8, IGNORE_CLIPRECTS);
+   if (intel->is_ironlake) {
+       BEGIN_BATCH(8);
        OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (8 - 2));
        OUT_BATCH(1); /* General state base address */
        OUT_BATCH(1); /* Surface state base address */
@@ -524,7 +521,7 @@ static void upload_state_base_address( struct brw_context *brw )
        OUT_BATCH(1); /* Instruction access upper bound */
        ADVANCE_BATCH();
    } else {
-       BEGIN_BATCH(6, IGNORE_CLIPRECTS);
+       BEGIN_BATCH(6);
        OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (6 - 2));
        OUT_BATCH(1); /* General state base address */
        OUT_BATCH(1); /* Surface state base address */
diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c
index a195bc32b0..5399a74244 100644
--- a/src/mesa/drivers/dri/i965/brw_queryobj.c
+++ b/src/mesa/drivers/dri/i965/brw_queryobj.c
@@ -188,7 +188,7 @@ brw_emit_query_begin(struct brw_context *brw)
    if (brw->query.active || is_empty_list(&brw->query.active_head))
       return;
 
-   BEGIN_BATCH(4, IGNORE_CLIPRECTS);
+   BEGIN_BATCH(4);
    OUT_BATCH(_3DSTATE_PIPE_CONTROL |
 	     PIPE_CONTROL_DEPTH_STALL |
 	     PIPE_CONTROL_WRITE_DEPTH_COUNT);
@@ -227,7 +227,7 @@ brw_emit_query_end(struct brw_context *brw)
    if (!brw->query.active)
       return;
 
-   BEGIN_BATCH(4, IGNORE_CLIPRECTS);
+   BEGIN_BATCH(4);
    OUT_BATCH(_3DSTATE_PIPE_CONTROL |
 	     PIPE_CONTROL_DEPTH_STALL |
 	     PIPE_CONTROL_WRITE_DEPTH_COUNT);
diff --git a/src/mesa/drivers/dri/i965/brw_sf.c b/src/mesa/drivers/dri/i965/brw_sf.c
index f350cbd74e..968890f7fb 100644
--- a/src/mesa/drivers/dri/i965/brw_sf.c
+++ b/src/mesa/drivers/dri/i965/brw_sf.c
@@ -61,7 +61,7 @@ static void compile_sf_prog( struct brw_context *brw,
    c.key = *key;
    c.nr_attrs = brw_count_bits(c.key.attrs);
    c.nr_attr_regs = (c.nr_attrs+1)/2;
-   c.nr_setup_attrs = brw_count_bits(c.key.attrs & DO_SETUP_BITS);
+   c.nr_setup_attrs = brw_count_bits(c.key.attrs);
    c.nr_setup_regs = (c.nr_setup_attrs+1)/2;
 
    c.prog_data.urb_read_length = c.nr_attr_regs;
@@ -70,7 +70,7 @@ static void compile_sf_prog( struct brw_context *brw,
    /* Construct map from attribute number to position in the vertex.
     */
    for (i = idx = 0; i < VERT_RESULT_MAX; i++) 
-      if (c.key.attrs & (1<<i)) {
+      if (c.key.attrs & BITFIELD64_BIT(i)) {
 	 c.attr_to_idx[i] = idx;
 	 c.idx_to_attr[idx] = i;
 	 if (i >= VERT_RESULT_TEX0 && i <= VERT_RESULT_TEX7) {
@@ -147,7 +147,7 @@ static void upload_sf_prog(struct brw_context *brw)
        * edgeflag testing here, it is already done in the clip
        * program.
        */
-      if (key.attrs & (1<<VERT_RESULT_EDGE))
+      if (key.attrs & BITFIELD64_BIT(VERT_RESULT_EDGE))
 	 key.primitive = SF_UNFILLED_TRIS;
       else
 	 key.primitive = SF_TRIANGLES;
diff --git a/src/mesa/drivers/dri/i965/brw_sf.h b/src/mesa/drivers/dri/i965/brw_sf.h
index e835229a1d..0ba731fac9 100644
--- a/src/mesa/drivers/dri/i965/brw_sf.h
+++ b/src/mesa/drivers/dri/i965/brw_sf.h
@@ -45,7 +45,7 @@
 #define SF_UNFILLED_TRIS   3
 
 struct brw_sf_prog_key {
-   GLuint attrs:32;
+   GLbitfield64 attrs;
    GLuint primitive:2;
    GLuint do_twoside_color:1;
    GLuint do_flat_shading:1;
diff --git a/src/mesa/drivers/dri/i965/brw_sf_emit.c b/src/mesa/drivers/dri/i965/brw_sf_emit.c
index 561fcd501b..bb08055e3b 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_emit.c
@@ -56,7 +56,7 @@ static struct brw_reg get_vert_attr(struct brw_sf_compile *c,
 static GLboolean have_attr(struct brw_sf_compile *c,
 			   GLuint attr)
 {
-   return (c->key.attrs & (1<<attr)) ? 1 : 0;
+   return (c->key.attrs & BITFIELD64_BIT(attr)) ? 1 : 0;
 }
 
 /*********************************************************************** 
@@ -122,8 +122,8 @@ static void do_twoside_color( struct brw_sf_compile *c )
  * Flat shading
  */
 
-#define VERT_RESULT_COLOR_BITS ((1<<VERT_RESULT_COL0) | \
-                                 (1<<VERT_RESULT_COL1))
+#define VERT_RESULT_COLOR_BITS (BITFIELD64_BIT(VERT_RESULT_COL0) | \
+				BITFIELD64_BIT(VERT_RESULT_COL1))
 
 static void copy_colors( struct brw_sf_compile *c,
 		     struct brw_reg dst,
@@ -149,6 +149,7 @@ static void copy_colors( struct brw_sf_compile *c,
 static void do_flatshade_triangle( struct brw_sf_compile *c )
 {
    struct brw_compile *p = &c->func;
+   struct intel_context *intel = &p->brw->intel;
    struct brw_reg ip = brw_ip_reg();
    GLuint nr = brw_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
    GLuint jmpi = 1;
@@ -161,7 +162,7 @@ static void do_flatshade_triangle( struct brw_sf_compile *c )
    if (c->key.primitive == SF_UNFILLED_TRIS)
       return;
 
-   if (BRW_IS_IGDNG(p->brw))
+   if (intel->is_ironlake)
        jmpi = 2;
 
    brw_push_insn_state(p);
@@ -187,6 +188,7 @@ static void do_flatshade_triangle( struct brw_sf_compile *c )
 static void do_flatshade_line( struct brw_sf_compile *c )
 {
    struct brw_compile *p = &c->func;
+   struct intel_context *intel = &p->brw->intel;
    struct brw_reg ip = brw_ip_reg();
    GLuint nr = brw_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
    GLuint jmpi = 1;
@@ -199,7 +201,7 @@ static void do_flatshade_line( struct brw_sf_compile *c )
    if (c->key.primitive == SF_UNFILLED_TRIS)
       return;
 
-   if (BRW_IS_IGDNG(p->brw))
+   if (intel->is_ironlake)
        jmpi = 2;
 
    brw_push_insn_state(p);
@@ -312,8 +314,8 @@ static GLboolean calculate_masks( struct brw_sf_compile *c,
 				  GLushort *pc_linear)
 {
    GLboolean is_last_attr = (reg == c->nr_setup_regs - 1);
-   GLuint persp_mask;
-   GLuint linear_mask;
+   GLbitfield64 persp_mask;
+   GLbitfield64 linear_mask;
 
    if (c->key.do_flat_shading || c->key.linear_color)
       persp_mask = c->key.attrs & ~(FRAG_BIT_WPOS |
@@ -331,10 +333,10 @@ static GLboolean calculate_masks( struct brw_sf_compile *c,
    *pc_linear = 0;
    *pc = 0xf;
       
-   if (persp_mask & (1 << c->idx_to_attr[reg*2])) 
+   if (persp_mask & BITFIELD64_BIT(c->idx_to_attr[reg*2]))
       *pc_persp = 0xf;
 
-   if (linear_mask & (1 << c->idx_to_attr[reg*2])) 
+   if (linear_mask & BITFIELD64_BIT(c->idx_to_attr[reg*2]))
       *pc_linear = 0xf;
 
    /* Maybe only processs one attribute on the final round:
@@ -342,10 +344,10 @@ static GLboolean calculate_masks( struct brw_sf_compile *c,
    if (reg*2+1 < c->nr_setup_attrs) {
       *pc |= 0xf0;
 
-      if (persp_mask & (1 << c->idx_to_attr[reg*2+1])) 
+      if (persp_mask & BITFIELD64_BIT(c->idx_to_attr[reg*2+1]))
 	 *pc_persp |= 0xf0;
 
-      if (linear_mask & (1 << c->idx_to_attr[reg*2+1])) 
+      if (linear_mask & BITFIELD64_BIT(c->idx_to_attr[reg*2+1]))
 	 *pc_linear |= 0xf0;
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c
index bc0f076073..09223b7cfb 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_state.c
@@ -93,7 +93,8 @@ static void upload_sf_vp(struct brw_context *brw)
    }
 
    dri_bo_unreference(brw->sf.vp_bo);
-   brw->sf.vp_bo = brw_cache_data( &brw->cache, BRW_SF_VP, &sfv, NULL, 0 );
+   brw->sf.vp_bo = brw_cache_data(&brw->cache, BRW_SF_VP, &sfv, sizeof(sfv),
+				  NULL, 0);
 }
 
 const struct brw_tracked_state brw_sf_vp = {
@@ -113,7 +114,8 @@ struct brw_sf_unit_key {
 
    unsigned int nr_urb_entries, urb_size, sfsize;
 
-   GLenum front_face, cull_face, provoking_vertex;
+   GLenum front_face, cull_face;
+   unsigned pv_first:1;
    unsigned scissor:1;
    unsigned line_smooth:1;
    unsigned point_sprite:1;
@@ -154,7 +156,7 @@ sf_unit_populate_key(struct brw_context *brw, struct brw_sf_unit_key *key)
    key->point_attenuated = ctx->Point._Attenuated;
 
    /* _NEW_LIGHT */
-   key->provoking_vertex = ctx->Light.ProvokingVertex;
+   key->pv_first = (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION);
 
    key->render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0;
 }
@@ -163,6 +165,7 @@ static dri_bo *
 sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
 			dri_bo **reloc_bufs)
 {
+   struct intel_context *intel = &brw->intel;
    struct brw_sf_unit_state sf;
    dri_bo *bo;
    int chipset_max_threads;
@@ -175,7 +178,7 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
 
    sf.thread3.dispatch_grf_start_reg = 3;
 
-   if (BRW_IS_IGDNG(brw))
+   if (intel->is_ironlake)
        sf.thread3.urb_entry_read_offset = 3;
    else
        sf.thread3.urb_entry_read_offset = 1;
@@ -185,10 +188,10 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
    sf.thread4.nr_urb_entries = key->nr_urb_entries;
    sf.thread4.urb_entry_allocation_size = key->sfsize - 1;
 
-   /* Each SF thread produces 1 PUE, and there can be up to 24(Pre-IGDNG) or 
-    * 48(IGDNG) threads 
+   /* Each SF thread produces 1 PUE, and there can be up to 24 (Pre-Ironlake) or
+    * 48 (Ironlake) threads.
     */
-   if (BRW_IS_IGDNG(brw))
+   if (intel->is_ironlake)
       chipset_max_threads = 48;
    else
       chipset_max_threads = 24;
@@ -287,7 +290,7 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
 
    /* might be BRW_NEW_PRIMITIVE if we have to adjust pv for polygons:
     */
-   if (key->provoking_vertex == GL_LAST_VERTEX_CONVENTION) {
+   if (!key->pv_first) {
       sf.sf7.trifan_pv = 2;
       sf.sf7.linestrip_pv = 1;
       sf.sf7.tristrip_pv = 2;
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index d639656b9d..9c9d145c4b 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -35,7 +35,7 @@
 
 #include "brw_context.h"
 
-static inline void
+static INLINE void
 brw_add_validated_bo(struct brw_context *brw, dri_bo *bo)
 {
    assert(brw->state.validated_bo_count < ARRAY_SIZE(brw->state.validated_bos));
@@ -112,6 +112,7 @@ void brw_validate_state(struct brw_context *brw);
 void brw_upload_state(struct brw_context *brw);
 void brw_init_state(struct brw_context *brw);
 void brw_destroy_state(struct brw_context *brw);
+void brw_clear_validated_bos(struct brw_context *brw);
 
 /***********************************************************************
  * brw_state_cache.c
@@ -119,16 +120,10 @@ void brw_destroy_state(struct brw_context *brw);
 dri_bo *brw_cache_data(struct brw_cache *cache,
 		       enum brw_cache_id cache_id,
 		       const void *data,
+		       GLuint size,
 		       dri_bo **reloc_bufs,
 		       GLuint nr_reloc_bufs);
 
-dri_bo *brw_cache_data_sz(struct brw_cache *cache,
-			  enum brw_cache_id cache_id,
-			  const void *data,
-			  GLuint data_size,
-			  dri_bo **reloc_bufs,
-			  GLuint nr_reloc_bufs);
-
 dri_bo *brw_upload_cache( struct brw_cache *cache,
 			  enum brw_cache_id cache_id,
 			  const void *key,
@@ -156,7 +151,7 @@ void brw_state_cache_bo_delete(struct brw_cache *cache, dri_bo *bo);
 /***********************************************************************
  * brw_state_batch.c
  */
-#define BRW_BATCH_STRUCT(brw, s) intel_batchbuffer_data( brw->intel.batch, (s), sizeof(*(s)), IGNORE_CLIPRECTS)
+#define BRW_BATCH_STRUCT(brw, s) intel_batchbuffer_data( brw->intel.batch, (s), sizeof(*(s)))
 #define BRW_CACHED_BATCH_STRUCT(brw, s) brw_cached_batch_struct( brw, (s), sizeof(*(s)) )
 
 GLboolean brw_cached_batch_struct( struct brw_context *brw,
diff --git a/src/mesa/drivers/dri/i965/brw_state_batch.c b/src/mesa/drivers/dri/i965/brw_state_batch.c
index 7821898cf9..ed8120d617 100644
--- a/src/mesa/drivers/dri/i965/brw_state_batch.c
+++ b/src/mesa/drivers/dri/i965/brw_state_batch.c
@@ -48,7 +48,7 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
    struct header *newheader = (struct header *)data;
 
    if (brw->emit_state_always) {
-      intel_batchbuffer_data(brw->intel.batch, data, sz, IGNORE_CLIPRECTS);
+      intel_batchbuffer_data(brw->intel.batch, data, sz);
       return GL_TRUE;
    }
 
@@ -75,7 +75,7 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
 
  emit:
    memcpy(item->header, newheader, sz);
-   intel_batchbuffer_data(brw->intel.batch, data, sz, IGNORE_CLIPRECTS);
+   intel_batchbuffer_data(brw->intel.batch, data, sz);
    return GL_TRUE;
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c
index c262e1db8b..e4c9ba7d87 100644
--- a/src/mesa/drivers/dri/i965/brw_state_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_state_cache.c
@@ -245,7 +245,6 @@ brw_upload_cache( struct brw_cache *cache,
 
    item->bo = bo;
    dri_bo_reference(bo);
-   item->data_size = data_size;
 
    if (cache->n_items > cache->size * 1.5)
       rehash(cache);
@@ -275,15 +274,22 @@ brw_upload_cache( struct brw_cache *cache,
 
 
 /**
- * This doesn't really work with aux data.  Use search/upload instead
+ * Wrapper around brw_cache_data_sz using the cache_id's canonical key size.
+ *
+ * If nr_reloc_bufs is nonzero, brw_search_cache()/brw_upload_cache() would be
+ * better to use, as the potentially changing offsets in the data-used-as-key
+ * will result in excessive cache misses.
+ *
+ * If aux data is involved, use search/upload instead.
+
  */
 dri_bo *
-brw_cache_data_sz(struct brw_cache *cache,
-		  enum brw_cache_id cache_id,
-		  const void *data,
-		  GLuint data_size,
-		  dri_bo **reloc_bufs,
-		  GLuint nr_reloc_bufs)
+brw_cache_data(struct brw_cache *cache,
+	       enum brw_cache_id cache_id,
+	       const void *data,
+	       GLuint data_size,
+	       dri_bo **reloc_bufs,
+	       GLuint nr_reloc_bufs)
 {
    dri_bo *bo;
    struct brw_cache_item *item;
@@ -306,25 +312,6 @@ brw_cache_data_sz(struct brw_cache *cache,
    return bo;
 }
 
-
-/**
- * Wrapper around brw_cache_data_sz using the cache_id's canonical key size.
- *
- * If nr_reloc_bufs is nonzero, brw_search_cache()/brw_upload_cache() would be
- * better to use, as the potentially changing offsets in the data-used-as-key
- * will result in excessive cache misses.
- */
-dri_bo *
-brw_cache_data(struct brw_cache *cache,
-	       enum brw_cache_id cache_id,
-	       const void *data,
-	       dri_bo **reloc_bufs,
-	       GLuint nr_reloc_bufs)
-{
-   return brw_cache_data_sz(cache, cache_id, data, cache->key_size[cache_id],
-			    reloc_bufs, nr_reloc_bufs);
-}
-
 enum pool_type {
    DW_SURFACE_STATE,
    DW_GENERAL_STATE
@@ -335,11 +322,9 @@ static void
 brw_init_cache_id(struct brw_cache *cache,
                   const char *name,
                   enum brw_cache_id id,
-                  GLuint key_size,
                   GLuint aux_size)
 {
    cache->name[id] = strdup(name);
-   cache->key_size[id] = key_size;
    cache->aux_size[id] = aux_size;
 }
 
@@ -359,91 +344,76 @@ brw_init_non_surface_cache(struct brw_context *brw)
    brw_init_cache_id(cache,
 		     "CC_VP",
 		     BRW_CC_VP,
-		     sizeof(struct brw_cc_viewport),
 		     0);
 
    brw_init_cache_id(cache,
 		     "CC_UNIT",
 		     BRW_CC_UNIT,
-		     sizeof(struct brw_cc_unit_state),
 		     0);
 
    brw_init_cache_id(cache,
 		     "WM_PROG",
 		     BRW_WM_PROG,
-		     sizeof(struct brw_wm_prog_key),
 		     sizeof(struct brw_wm_prog_data));
 
    brw_init_cache_id(cache,
 		     "SAMPLER_DEFAULT_COLOR",
 		     BRW_SAMPLER_DEFAULT_COLOR,
-		     sizeof(struct brw_sampler_default_color),
 		     0);
 
    brw_init_cache_id(cache,
 		     "SAMPLER",
 		     BRW_SAMPLER,
-		     0,		/* variable key/data size */
 		     0);
 
    brw_init_cache_id(cache,
 		     "WM_UNIT",
 		     BRW_WM_UNIT,
-		     sizeof(struct brw_wm_unit_state),
 		     0);
 
    brw_init_cache_id(cache,
 		     "SF_PROG",
 		     BRW_SF_PROG,
-		     sizeof(struct brw_sf_prog_key),
 		     sizeof(struct brw_sf_prog_data));
 
    brw_init_cache_id(cache,
 		     "SF_VP",
 		     BRW_SF_VP,
-		     sizeof(struct brw_sf_viewport),
 		     0);
 
    brw_init_cache_id(cache,
 		     "SF_UNIT",
 		     BRW_SF_UNIT,
-		     sizeof(struct brw_sf_unit_state),
 		     0);
 
    brw_init_cache_id(cache,
 		     "VS_UNIT",
 		     BRW_VS_UNIT,
-		     sizeof(struct brw_vs_unit_state),
 		     0);
 
    brw_init_cache_id(cache,
 		     "VS_PROG",
 		     BRW_VS_PROG,
-		     sizeof(struct brw_vs_prog_key),
 		     sizeof(struct brw_vs_prog_data));
 
    brw_init_cache_id(cache,
 		     "CLIP_UNIT",
 		     BRW_CLIP_UNIT,
-		     sizeof(struct brw_clip_unit_state),
 		     0);
 
    brw_init_cache_id(cache,
 		     "CLIP_PROG",
 		     BRW_CLIP_PROG,
-		     sizeof(struct brw_clip_prog_key),
 		     sizeof(struct brw_clip_prog_data));
 
    brw_init_cache_id(cache,
 		     "GS_UNIT",
 		     BRW_GS_UNIT,
-		     sizeof(struct brw_gs_unit_state),
 		     0);
 
    brw_init_cache_id(cache,
 		     "GS_PROG",
 		     BRW_GS_PROG,
-		     sizeof(struct brw_gs_prog_key),
 		     sizeof(struct brw_gs_prog_data));
 }
 
@@ -463,13 +433,11 @@ brw_init_surface_cache(struct brw_context *brw)
    brw_init_cache_id(cache,
 		     "SS_SURFACE",
 		     BRW_SS_SURFACE,
-		     sizeof(struct brw_surface_state),
 		     0);
 
    brw_init_cache_id(cache,
 		     "SS_SURF_BIND",
 		     BRW_SS_SURF_BIND,
-		     0,
 		     0);
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index f4283bda1b..af8dfb4c15 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -143,7 +143,7 @@ static void xor_states( struct brw_state_flags *result,
    result->cache = a->cache ^ b->cache;
 }
 
-static void
+void
 brw_clear_validated_bos(struct brw_context *brw)
 {
    int i;
diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index e59e52ed86..64a9535282 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -49,7 +49,7 @@ GLboolean brw_miptree_layout(struct intel_context *intel,
 
    switch (mt->target) {
    case GL_TEXTURE_CUBE_MAP:
-      if (IS_IGDNG(intel->intelScreen->deviceID)) {
+      if (intel->is_ironlake) {
           GLuint align_h = 2, align_w = 4;
           GLuint level;
           GLuint x = 0;
diff --git a/src/mesa/drivers/dri/i965/brw_urb.c b/src/mesa/drivers/dri/i965/brw_urb.c
index 8c6f4355a6..f2cdb203b8 100644
--- a/src/mesa/drivers/dri/i965/brw_urb.c
+++ b/src/mesa/drivers/dri/i965/brw_urb.c
@@ -105,7 +105,8 @@ static GLboolean check_urb_layout( struct brw_context *brw )
    brw->urb.sf_start = brw->urb.clip_start + brw->urb.nr_clip_entries * brw->urb.vsize;
    brw->urb.cs_start = brw->urb.sf_start + brw->urb.nr_sf_entries * brw->urb.sfsize;
 
-   return brw->urb.cs_start + brw->urb.nr_cs_entries * brw->urb.csize <= URB_SIZES(brw);
+   return brw->urb.cs_start + brw->urb.nr_cs_entries *
+      brw->urb.csize <= brw->urb.size;
 }
 
 /* Most minimal update, forces re-emit of URB fence packet after GS
@@ -113,6 +114,7 @@ static GLboolean check_urb_layout( struct brw_context *brw )
  */
 static void recalculate_urb_fence( struct brw_context *brw )
 {
+   struct intel_context *intel = &brw->intel;
    GLuint csize = brw->curbe.total_size;
    GLuint vsize = brw->vs.prog_data->urb_entry_size;
    GLuint sfsize = brw->sf.prog_data->urb_entry_size;
@@ -146,7 +148,7 @@ static void recalculate_urb_fence( struct brw_context *brw )
 
       brw->urb.constrained = 0;
 
-      if (BRW_IS_IGDNG(brw)) {
+      if (intel->is_ironlake) {
          brw->urb.nr_vs_entries = 128;
          brw->urb.nr_sf_entries = 48;
          if (check_urb_layout(brw)) {
@@ -156,7 +158,7 @@ static void recalculate_urb_fence( struct brw_context *brw )
             brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries;
             brw->urb.nr_sf_entries = limits[SF].preferred_nr_entries;
          }
-      } else if (BRW_IS_G4X(brw)) {
+      } else if (intel->is_g4x) {
 	 brw->urb.nr_vs_entries = 64;
 	 if (check_urb_layout(brw)) {
 	    goto done;
@@ -200,7 +202,7 @@ done:
 		      brw->urb.clip_start,
 		      brw->urb.sf_start,
 		      brw->urb.cs_start, 
-		      URB_SIZES(brw));
+		      brw->urb.size);
       
       brw->state.dirty.brw |= BRW_NEW_URB_FENCE;
    }
@@ -244,7 +246,7 @@ void brw_upload_urb_fence(struct brw_context *brw)
    uf.bits0.gs_fence  = brw->urb.clip_start; 
    uf.bits0.clp_fence = brw->urb.sf_start; 
    uf.bits1.sf_fence  = brw->urb.cs_start; 
-   uf.bits1.cs_fence  = URB_SIZES(brw);
+   uf.bits1.cs_fence  = brw->urb.size;
 
    BRW_BATCH_STRUCT(brw, &uf);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_util.c b/src/mesa/drivers/dri/i965/brw_util.c
index ce21aa4869..bba9249d1b 100644
--- a/src/mesa/drivers/dri/i965/brw_util.c
+++ b/src/mesa/drivers/dri/i965/brw_util.c
@@ -35,7 +35,7 @@
 #include "brw_util.h"
 #include "brw_defines.h"
 
-GLuint brw_count_bits( GLuint val )
+GLuint brw_count_bits(uint64_t val)
 {
    GLuint i;
    for (i = 0; val ; val >>= 1)
diff --git a/src/mesa/drivers/dri/i965/brw_util.h b/src/mesa/drivers/dri/i965/brw_util.h
index 33e7cd87e4..04f3175d3e 100644
--- a/src/mesa/drivers/dri/i965/brw_util.h
+++ b/src/mesa/drivers/dri/i965/brw_util.h
@@ -35,7 +35,7 @@
 
 #include "main/mtypes.h"
 
-extern GLuint brw_count_bits( GLuint val );
+extern GLuint brw_count_bits(uint64_t val);
 extern GLuint brw_parameter_list_state_flags(struct gl_program_parameter_list *paramList);
 extern GLuint brw_translate_blend_factor( GLenum factor );
 extern GLuint brw_translate_blend_equation( GLenum mode );
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index f0c79efbd9..fd055e225e 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -56,7 +56,7 @@ static void do_vs_prog( struct brw_context *brw,
    c.prog_data.inputs_read = vp->program.Base.InputsRead;
 
    if (c.key.copy_edgeflag) {
-      c.prog_data.outputs_written |= 1<<VERT_RESULT_EDGE;
+      c.prog_data.outputs_written |= BITFIELD64_BIT(VERT_RESULT_EDGE);
       c.prog_data.inputs_read |= 1<<VERT_ATTRIB_EDGEFLAG;
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c
index 1638ef8111..1b84dd505f 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c
@@ -67,6 +67,7 @@ static void release_tmps( struct brw_vs_compile *c )
  */
 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 {
+   struct intel_context *intel = &c->func.brw->intel;
    GLuint i, reg = 0, mrf;
    int attributes_in_vue;
 
@@ -141,13 +142,13 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
    c->first_output = reg;
    c->first_overflow_output = 0;
 
-   if (BRW_IS_IGDNG(c->func.brw))
+   if (intel->is_ironlake)
        mrf = 8;
    else
        mrf = 4;
 
    for (i = 0; i < VERT_RESULT_MAX; i++) {
-      if (c->prog_data.outputs_written & (1 << i)) {
+      if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
 	 c->nr_outputs++;
          assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
 	 if (i == VERT_RESULT_HPOS) {
@@ -238,7 +239,7 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
     */
    attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);
 
-   if (BRW_IS_IGDNG(c->func.brw))
+   if (intel->is_ironlake)
        c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
    else
        c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;
@@ -331,63 +332,76 @@ static void unalias3( struct brw_vs_compile *c,
    }
 }
 
-static void emit_sop( struct brw_compile *p,
+static void emit_sop( struct brw_vs_compile *c,
                       struct brw_reg dst,
                       struct brw_reg arg0,
                       struct brw_reg arg1, 
 		      GLuint cond)
 {
+   struct brw_compile *p = &c->func;
+
    brw_MOV(p, dst, brw_imm_f(0.0f));
    brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
    brw_MOV(p, dst, brw_imm_f(1.0f));
    brw_set_predicate_control_flag_value(p, 0xff);
 }
 
-static void emit_seq( struct brw_compile *p,
+static void emit_seq( struct brw_vs_compile *c,
                       struct brw_reg dst,
                       struct brw_reg arg0,
                       struct brw_reg arg1 )
 {
-   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
+   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
 }
 
-static void emit_sne( struct brw_compile *p,
+static void emit_sne( struct brw_vs_compile *c,
                       struct brw_reg dst,
                       struct brw_reg arg0,
                       struct brw_reg arg1 )
 {
-   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
+   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
 }
-static void emit_slt( struct brw_compile *p, 
+static void emit_slt( struct brw_vs_compile *c,
 		      struct brw_reg dst,
 		      struct brw_reg arg0,
 		      struct brw_reg arg1 )
 {
-   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L);
+   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
 }
 
-static void emit_sle( struct brw_compile *p, 
+static void emit_sle( struct brw_vs_compile *c,
 		      struct brw_reg dst,
 		      struct brw_reg arg0,
 		      struct brw_reg arg1 )
 {
-   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE);
+   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
 }
 
-static void emit_sgt( struct brw_compile *p, 
+static void emit_sgt( struct brw_vs_compile *c,
 		      struct brw_reg dst,
 		      struct brw_reg arg0,
 		      struct brw_reg arg1 )
 {
-   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G);
+   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
 }
 
-static void emit_sge( struct brw_compile *p, 
+static void emit_sge( struct brw_vs_compile *c,
 		      struct brw_reg dst,
 		      struct brw_reg arg0,
 		      struct brw_reg arg1 )
 {
-  emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE);
+  emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
+}
+
+static void emit_cmp( struct brw_compile *p,
+		      struct brw_reg dst,
+		      struct brw_reg arg0,
+		      struct brw_reg arg1,
+		      struct brw_reg arg2 )
+{
+   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
+   brw_SEL(p, dst, arg1, arg2);
+   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 }
 
 static void emit_max( struct brw_compile *p, 
@@ -912,6 +926,7 @@ get_src_reg( struct brw_vs_compile *c,
    case PROGRAM_CONSTANT:
    case PROGRAM_UNIFORM:
    case PROGRAM_ENV_PARAM:
+   case PROGRAM_LOCAL_PARAM:
       if (c->vp->use_const_buffer) {
          return get_constant(c, inst, argIndex);
       }
@@ -930,7 +945,6 @@ get_src_reg( struct brw_vs_compile *c,
       /* this is a normal case since we loop over all three src args */
       return brw_null_reg();
 
-   case PROGRAM_LOCAL_PARAM: 
    case PROGRAM_WRITE_ONLY:
    default:
       assert(0);
@@ -1100,6 +1114,8 @@ static void emit_swz( struct brw_vs_compile *c,
 static void emit_vertex_write( struct brw_vs_compile *c)
 {
    struct brw_compile *p = &c->func;
+   struct brw_context *brw = p->brw;
+   struct intel_context *intel = &brw->intel;
    struct brw_reg m0 = brw_message_reg(0);
    struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
    struct brw_reg ndc;
@@ -1122,8 +1138,8 @@ static void emit_vertex_write( struct brw_vs_compile *c)
    /* Update the header for point size, user clipping flags, and -ve rhw
     * workaround.
     */
-   if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
-       c->key.nr_userclip || BRW_IS_965(p->brw))
+   if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
+       c->key.nr_userclip || brw->has_negative_rhw_bug)
    {
       struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
       GLuint i;
@@ -1132,7 +1148,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 
       brw_set_access_mode(p, BRW_ALIGN_16);	
 
-      if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) {
+      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
 	 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
 	 brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
 	 brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
@@ -1154,7 +1170,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
        * Later, clipping will detect ucp[6] and ensure the primitive is
        * clipped against all fixed planes.
        */
-      if (BRW_IS_965(p->brw)) {
+      if (brw->has_negative_rhw_bug) {
 	 brw_CMP(p,
 		 vec8(brw_null_reg()),
 		 BRW_CONDITIONAL_L,
@@ -1182,8 +1198,8 @@ static void emit_vertex_write( struct brw_vs_compile *c)
    brw_set_access_mode(p, BRW_ALIGN_1);
    brw_MOV(p, offset(m0, 2), ndc);
 
-   if (BRW_IS_IGDNG(p->brw)) {
-       /* There are 20 DWs (D0-D19) in VUE vertex header on IGDNG */
+   if (intel->is_ironlake) {
+       /* There are 20 DWs (D0-D19) in VUE vertex header on Ironlake */
        brw_MOV(p, offset(m0, 3), pos); /* a portion of vertex header */
        /* m4, m5 contain the distances from vertex to the user clip planeXXX. 
         * Seems it is useless for us.
@@ -1222,7 +1238,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
        */
       GLuint i, mrf = 0;
       for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
-         if (c->prog_data.outputs_written & (1 << i)) {
+         if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
             /* move from GRF to MRF */
             brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]);
             mrf++;
@@ -1269,6 +1285,38 @@ post_vs_emit( struct brw_vs_compile *c,
    }
 }
 
+static GLboolean
+accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
+{
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];
+
+   if (p->nr_insn == 0)
+      return GL_FALSE;
+
+   if (val.address_mode != BRW_ADDRESS_DIRECT)
+      return GL_FALSE;
+
+   switch (prev_insn->header.opcode) {
+   case BRW_OPCODE_MOV:
+   case BRW_OPCODE_MAC:
+   case BRW_OPCODE_MUL:
+      if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
+	  prev_insn->header.execution_size == val.width &&
+	  prev_insn->bits1.da1.dest_reg_file == val.file &&
+	  prev_insn->bits1.da1.dest_reg_type == val.type &&
+	  prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
+	  prev_insn->bits1.da1.dest_reg_nr == val.nr &&
+	  prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
+	  prev_insn->bits1.da16.dest_writemask == 0xf)
+	 return GL_TRUE;
+      else
+	 return GL_FALSE;
+   default:
+      return GL_FALSE;
+   }
+}
+
 static uint32_t
 get_predicate(const struct prog_instruction *inst)
 {
@@ -1314,6 +1362,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
 #define MAX_LOOP_DEPTH 32
    struct brw_compile *p = &c->func;
    struct brw_context *brw = p->brw;
+   struct intel_context *intel = &brw->intel;
    const GLuint nr_insns = c->vp->program.Base.NumInstructions;
    GLuint insn, if_depth = 0, loop_depth = 0;
    GLuint end_offset = 0;
@@ -1447,9 +1496,13 @@ void brw_vs_emit(struct brw_vs_compile *c )
 	 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
 	 break;
       case OPCODE_MAD:
-	 brw_MOV(p, brw_acc_reg(), args[2]);
+	 if (!accumulator_contains(c, args[2]))
+	    brw_MOV(p, brw_acc_reg(), args[2]);
 	 brw_MAC(p, dst, args[0], args[1]);
 	 break;
+      case OPCODE_CMP:
+	 emit_cmp(p, dst, args[0], args[1], args[2]);
+	 break;
       case OPCODE_MAX:
 	 emit_max(p, dst, args[0], args[1]);
 	 break;
@@ -1473,25 +1526,25 @@ void brw_vs_emit(struct brw_vs_compile *c )
 	 break;
 
       case OPCODE_SEQ:
-         emit_seq(p, dst, args[0], args[1]);
+         unalias2(c, dst, args[0], args[1], emit_seq);
          break;
       case OPCODE_SIN:
 	 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
 	 break;
       case OPCODE_SNE:
-         emit_sne(p, dst, args[0], args[1]);
+         unalias2(c, dst, args[0], args[1], emit_sne);
          break;
       case OPCODE_SGE:
-	 emit_sge(p, dst, args[0], args[1]);
+         unalias2(c, dst, args[0], args[1], emit_sge);
 	 break;
       case OPCODE_SGT:
-         emit_sgt(p, dst, args[0], args[1]);
+         unalias2(c, dst, args[0], args[1], emit_sgt);
          break;
       case OPCODE_SLT:
-	 emit_slt(p, dst, args[0], args[1]);
+         unalias2(c, dst, args[0], args[1], emit_slt);
 	 break;
       case OPCODE_SLE:
-         emit_sle(p, dst, args[0], args[1]);
+         unalias2(c, dst, args[0], args[1], emit_sle);
          break;
       case OPCODE_SUB:
 	 brw_ADD(p, dst, args[0], negate(args[1]));
@@ -1543,7 +1596,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
 
             loop_depth--;
 
-	    if (BRW_IS_IGDNG(brw))
+	    if (intel->is_ironlake)
 	       br = 2;
 
             inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
diff --git a/src/mesa/drivers/dri/i965/brw_vs_state.c b/src/mesa/drivers/dri/i965/brw_vs_state.c
index d790ab6555..345ffa7ee1 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_state.c
@@ -82,9 +82,9 @@ vs_unit_populate_key(struct brw_context *brw, struct brw_vs_unit_key *key)
 static dri_bo *
 vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
 {
+   struct intel_context *intel = &brw->intel;
    struct brw_vs_unit_state vs;
    dri_bo *bo;
-   int chipset_max_threads;
 
    memset(&vs, 0, sizeof(vs));
 
@@ -98,7 +98,7 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
     */
    vs.thread1.single_program_flow = 0;
 
-   if (BRW_IS_IGDNG(brw))
+   if (intel->is_ironlake)
       vs.thread1.binding_table_entry_count = 0; /* hardware requirement */
    else
       vs.thread1.binding_table_entry_count = key->nr_surfaces;
@@ -109,24 +109,44 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
    vs.thread3.urb_entry_read_offset = 0;
    vs.thread3.const_urb_entry_read_offset = key->curbe_offset * 2;
 
-   if (BRW_IS_IGDNG(brw))
-       vs.thread4.nr_urb_entries = key->nr_urb_entries >> 2;
-   else
-       vs.thread4.nr_urb_entries = key->nr_urb_entries;
+   if (intel->is_ironlake) {
+      switch (key->nr_urb_entries) {
+      case 8:
+      case 12:
+      case 16:
+      case 32:
+      case 64:
+      case 96:
+      case 128:
+      case 168:
+      case 192:
+      case 224:
+      case 256:
+	 vs.thread4.nr_urb_entries = key->nr_urb_entries >> 2;
+	 break;
+      default:
+	 assert(0);
+      }
+   } else {
+      switch (key->nr_urb_entries) {
+      case 8:
+      case 12:
+      case 16:
+      case 32:
+	 break;
+      case 64:
+	 assert(intel->is_g4x);
+	 break;
+      default:
+	 assert(0);
+      }
+      vs.thread4.nr_urb_entries = key->nr_urb_entries;
+   }
 
    vs.thread4.urb_entry_allocation_size = key->urb_size - 1;
 
-   if (BRW_IS_IGDNG(brw))
-      chipset_max_threads = 72;
-   else if (BRW_IS_G4X(brw))
-      chipset_max_threads = 32;
-   else
-      chipset_max_threads = 16;
    vs.thread4.max_threads = CLAMP(key->nr_urb_entries / 2,
-				  1, chipset_max_threads) - 1;
-
-   if (INTEL_DEBUG & DEBUG_SINGLE_THREAD)
-      vs.thread4.max_threads = 0;
+				  1, brw->vs_max_threads) - 1;
 
    /* No samplers for ARB_vp programs:
     */
diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
index 4fa3269bed..3bc9840a97 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
@@ -52,6 +52,7 @@ brw_vs_update_constant_buffer(struct brw_context *brw)
    const struct gl_program_parameter_list *params = vp->program.Base.Parameters;
    const int size = params->NumParameters * 4 * sizeof(GLfloat);
    drm_intel_bo *const_buffer;
+   int i;
 
    /* BRW_NEW_VERTEX_PROGRAM */
    if (!vp->use_const_buffer)
@@ -61,7 +62,19 @@ brw_vs_update_constant_buffer(struct brw_context *brw)
 				     size, 64);
 
    /* _NEW_PROGRAM_CONSTANTS */
-   dri_bo_subdata(const_buffer, 0, size, params->ParameterValues);
+
+   /* Updates the ParamaterValues[i] pointers for all parameters of the
+    * basic type of PROGRAM_STATE_VAR.
+    */
+   _mesa_load_state_parameters(&brw->intel.ctx, vp->program.Base.Parameters);
+
+   intel_bo_map_gtt_preferred(intel, const_buffer, GL_TRUE);
+   for (i = 0; i < params->NumParameters; i++) {
+      memcpy(const_buffer->virtual + i * 4 * sizeof(float),
+	     params->ParameterValues[i],
+	     4 * sizeof(float));
+   }
+   intel_bo_unmap_gtt_preferred(intel, const_buffer);
 
    return const_buffer;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_vtbl.c b/src/mesa/drivers/dri/i965/brw_vtbl.c
index 124fde25fe..72749b3859 100644
--- a/src/mesa/drivers/dri/i965/brw_vtbl.c
+++ b/src/mesa/drivers/dri/i965/brw_vtbl.c
@@ -46,7 +46,7 @@
 #include "brw_state.h"
 #include "brw_fallback.h"
 #include "brw_vs.h"
-
+#include "brw_wm.h"
 
 static void
 dri_bo_release(dri_bo **bo)
@@ -66,8 +66,14 @@ static void brw_destroy_context( struct intel_context *intel )
 
    brw_destroy_state(brw);
    brw_draw_destroy( brw );
-
-   _mesa_free(brw->wm.compile_data);
+   brw_clear_validated_bos(brw);
+   if (brw->wm.compile_data) {
+      _mesa_free(brw->wm.compile_data->instruction);
+      _mesa_free(brw->wm.compile_data->vreg);
+      _mesa_free(brw->wm.compile_data->refs);
+      _mesa_free(brw->wm.compile_data->prog_instructions);
+      _mesa_free(brw->wm.compile_data);
+   }
 
    for (i = 0; i < brw->state.nr_color_regions; i++)
       intel_region_release(&brw->state.color_regions[i]);
@@ -144,9 +150,6 @@ static void brw_new_batch( struct intel_context *intel )
 {
    struct brw_context *brw = brw_context(&intel->ctx);
 
-   /* Check that we didn't just wrap our batchbuffer at a bad time. */
-   assert(!brw->no_batch_wrap);
-
    brw->curbe.need_new_bo = GL_TRUE;
 
    /* Mark all context state as needing to be re-emitted.
@@ -175,20 +178,6 @@ static void brw_note_fence( struct intel_context *intel, GLuint fence )
    brw_context(&intel->ctx)->state.dirty.brw |= BRW_NEW_FENCE;
 }
 
-/* called from intelWaitForIdle() and intelFlush()
- *
- * For now, just flush everything.  Could be smarter later.
- */
-static GLuint brw_flush_cmd( void )
-{
-   struct brw_mi_flush flush;
-   flush.opcode = CMD_MI_FLUSH;
-   flush.pad = 0;
-   flush.flags = BRW_FLUSH_STATE_CACHE;
-   return *(GLuint *)&flush;
-}
-
-
 static void brw_invalidate_state( struct intel_context *intel, GLuint new_state )
 {
    /* nothing */
@@ -209,6 +198,5 @@ void brwInitVtbl( struct brw_context *brw )
    brw->intel.vtbl.finish_batch = brw_finish_batch;
    brw->intel.vtbl.destroy = brw_destroy_context;
    brw->intel.vtbl.set_draw_region = brw_set_draw_region;
-   brw->intel.vtbl.flush_cmd = brw_flush_cmd;
    brw->intel.vtbl.debug_batch = brw_debug_batch;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index 964ee104c2..6895f64410 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -152,8 +152,21 @@ static void do_wm_prog( struct brw_context *brw,
           */
          return;
       }
+      c->instruction = _mesa_calloc(BRW_WM_MAX_INSN * sizeof(*c->instruction));
+      c->prog_instructions = _mesa_calloc(BRW_WM_MAX_INSN *
+					  sizeof(*c->prog_instructions));
+      c->vreg = _mesa_calloc(BRW_WM_MAX_VREG * sizeof(*c->vreg));
+      c->refs = _mesa_calloc(BRW_WM_MAX_REF * sizeof(*c->refs));
    } else {
+      void *instruction = c->instruction;
+      void *prog_instructions = c->prog_instructions;
+      void *vreg = c->vreg;
+      void *refs = c->refs;
       memset(c, 0, sizeof(*brw->wm.compile_data));
+      c->instruction = instruction;
+      c->prog_instructions = prog_instructions;
+      c->vreg = vreg;
+      c->refs = refs;
    }
    memcpy(&c->key, key, sizeof(*key));
 
@@ -217,7 +230,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
        ctx->Color.AlphaEnabled)
       lookup |= IZ_PS_KILL_ALPHATEST_BIT;
 
-   if (fp->program.Base.OutputsWritten & (1<<FRAG_RESULT_DEPTH))
+   if (fp->program.Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
       lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
 
    /* _NEW_DEPTH */
@@ -333,7 +346,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
    key->nr_color_regions = brw->state.nr_color_regions;
 
    /* CACHE_NEW_VS_PROG */
-   key->vp_outputs_written = brw->vs.prog_data->outputs_written & DO_SETUP_BITS;
+   key->vp_outputs_written = brw->vs.prog_data->outputs_written;
 
    /* The unique fragment program ID */
    key->program_string_id = fp->id;
diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h
index 47aa4da306..b9b987ea70 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.h
+++ b/src/mesa/drivers/dri/i965/brw_wm.h
@@ -76,10 +76,10 @@ struct brw_wm_prog_key {
 
    GLushort tex_swizzles[BRW_MAX_TEX_UNIT];
 
-   GLuint program_string_id:32;
    GLushort origin_x, origin_y;
    GLushort drawable_height;
-   GLuint vp_outputs_written;
+   GLbitfield64 vp_outputs_written;
+   GLuint program_string_id:32;
 };
 
 
@@ -162,6 +162,8 @@ struct brw_wm_instruction {
 #define BRW_WM_MAX_CONST 256
 #define BRW_WM_MAX_SUBROUTINE 16
 
+/* used in masks next to WRITEMASK_*. */
+#define SATURATE (1<<5)
 
 
 /* New opcodes to track internal operations required for WM unit.
@@ -200,7 +202,7 @@ struct brw_wm_compile {
     * simplifying and adding instructions for interpolation and
     * framebuffer writes.
     */
-   struct prog_instruction prog_instructions[BRW_WM_MAX_INSN];
+   struct prog_instruction *prog_instructions;
    GLuint nr_fp_insns;
    GLuint fp_temp;
    GLuint fp_interp_emitted;
@@ -211,7 +213,7 @@ struct brw_wm_compile {
    struct prog_src_register pixel_w;
 
 
-   struct brw_wm_value vreg[BRW_WM_MAX_VREG];
+   struct brw_wm_value *vreg;
    GLuint nr_vreg;
 
    struct brw_wm_value creg[BRW_WM_MAX_PARAM];
@@ -228,10 +230,10 @@ struct brw_wm_compile {
    struct brw_wm_ref undef_ref;
    struct brw_wm_value undef_value;
 
-   struct brw_wm_ref refs[BRW_WM_MAX_REF];
+   struct brw_wm_ref *refs;
    GLuint nr_refs;
 
-   struct brw_wm_instruction instruction[BRW_WM_MAX_INSN];
+   struct brw_wm_instruction *instruction;
    GLuint nr_insns;
 
    struct brw_wm_constref constref[BRW_WM_MAX_CONST];
@@ -306,10 +308,141 @@ void brw_wm_lookup_iz( GLuint line_aa,
 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp);
 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c);
 
+/* brw_wm_emit.c */
+void emit_alu1(struct brw_compile *p,
+	       struct brw_instruction *(*func)(struct brw_compile *,
+					       struct brw_reg,
+					       struct brw_reg),
+	       const struct brw_reg *dst,
+	       GLuint mask,
+	       const struct brw_reg *arg0);
+void emit_alu2(struct brw_compile *p,
+	       struct brw_instruction *(*func)(struct brw_compile *,
+					       struct brw_reg,
+					       struct brw_reg,
+					       struct brw_reg),
+	       const struct brw_reg *dst,
+	       GLuint mask,
+	       const struct brw_reg *arg0,
+	       const struct brw_reg *arg1);
+void emit_cinterp(struct brw_compile *p,
+		  const struct brw_reg *dst,
+		  GLuint mask,
+		  const struct brw_reg *arg0);
 void emit_ddxy(struct brw_compile *p,
 	       const struct brw_reg *dst,
 	       GLuint mask,
 	       GLboolean is_ddx,
 	       const struct brw_reg *arg0);
+void emit_delta_xy(struct brw_compile *p,
+		   const struct brw_reg *dst,
+		   GLuint mask,
+		   const struct brw_reg *arg0);
+void emit_dp3(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1);
+void emit_dp4(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1);
+void emit_dph(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1);
+void emit_fb_write(struct brw_wm_compile *c,
+		   struct brw_reg *arg0,
+		   struct brw_reg *arg1,
+		   struct brw_reg *arg2,
+		   GLuint target,
+		   GLuint eot);
+void emit_frontfacing(struct brw_compile *p,
+		      const struct brw_reg *dst,
+		      GLuint mask);
+void emit_linterp(struct brw_compile *p,
+		  const struct brw_reg *dst,
+		  GLuint mask,
+		  const struct brw_reg *arg0,
+		  const struct brw_reg *deltas);
+void emit_lrp(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1,
+	      const struct brw_reg *arg2);
+void emit_mad(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1,
+	      const struct brw_reg *arg2);
+void emit_math1(struct brw_wm_compile *c,
+		GLuint function,
+		const struct brw_reg *dst,
+		GLuint mask,
+		const struct brw_reg *arg0);
+void emit_math2(struct brw_wm_compile *c,
+		GLuint function,
+		const struct brw_reg *dst,
+		GLuint mask,
+		const struct brw_reg *arg0,
+		const struct brw_reg *arg1);
+void emit_min(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1);
+void emit_max(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1);
+void emit_pinterp(struct brw_compile *p,
+		  const struct brw_reg *dst,
+		  GLuint mask,
+		  const struct brw_reg *arg0,
+		  const struct brw_reg *deltas,
+		  const struct brw_reg *w);
+void emit_pixel_xy(struct brw_wm_compile *c,
+		   const struct brw_reg *dst,
+		   GLuint mask);
+void emit_pixel_w(struct brw_wm_compile *c,
+		  const struct brw_reg *dst,
+		  GLuint mask,
+		  const struct brw_reg *arg0,
+		  const struct brw_reg *deltas);
+void emit_sop(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      GLuint cond,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1);
+void emit_tex(struct brw_wm_compile *c,
+	      struct brw_reg *dst,
+	      GLuint dst_flags,
+	      struct brw_reg *arg,
+	      struct brw_reg depth_payload,
+	      GLuint tex_idx,
+	      GLuint sampler,
+	      GLboolean shadow);
+void emit_txb(struct brw_wm_compile *c,
+	      struct brw_reg *dst,
+	      GLuint dst_flags,
+	      struct brw_reg *arg,
+	      struct brw_reg depth_payload,
+	      GLuint tex_idx,
+	      GLuint sampler);
+void emit_wpos_xy(struct brw_wm_compile *c,
+		  const struct brw_reg *dst,
+		  GLuint mask,
+		  const struct brw_reg *arg0);
+void emit_xpd(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1);
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c
index bf80a2942a..f316e0cda4 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c
@@ -44,6 +44,7 @@ static INLINE struct brw_reg sechalf( struct brw_reg reg )
    return reg;
 }
 
+
 /* Payload R0:
  *
  * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
@@ -60,42 +61,50 @@ static INLINE struct brw_reg sechalf( struct brw_reg reg )
  * R1.8 -- ?
  */
 
-
-static void emit_pixel_xy(struct brw_compile *p,
-			  const struct brw_reg *dst,
-			  GLuint mask)
+void emit_pixel_xy(struct brw_wm_compile *c,
+		   const struct brw_reg *dst,
+		   GLuint mask)
 {
+   struct brw_compile *p = &c->func;
    struct brw_reg r1 = brw_vec1_grf(1, 0);
    struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
+   struct brw_reg dst0_uw, dst1_uw;
 
+   brw_push_insn_state(p);
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 
+   if (c->dispatch_width == 16) {
+      dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
+      dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
+   } else {
+      dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
+      dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
+   }
+
    /* Calculate pixel centers by adding 1 or 0 to each of the
     * micro-tile coordinates passed in r1.
     */
    if (mask & WRITEMASK_X) {
       brw_ADD(p,
-	      vec16(retype(dst[0], BRW_REGISTER_TYPE_UW)),
+	      dst0_uw,
 	      stride(suboffset(r1_uw, 4), 2, 4, 0),
 	      brw_imm_v(0x10101010));
    }
 
    if (mask & WRITEMASK_Y) {
       brw_ADD(p,
-	      vec16(retype(dst[1], BRW_REGISTER_TYPE_UW)),
+	      dst1_uw,
 	      stride(suboffset(r1_uw,5), 2, 4, 0),
 	      brw_imm_v(0x11001100));
    }
-
-   brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+   brw_pop_insn_state(p);
 }
 
 
-
-static void emit_delta_xy(struct brw_compile *p,
-			  const struct brw_reg *dst,
-			  GLuint mask,
-			  const struct brw_reg *arg0)
+void emit_delta_xy(struct brw_compile *p,
+		   const struct brw_reg *dst,
+		   GLuint mask,
+		   const struct brw_reg *arg0)
 {
    struct brw_reg r1 = brw_vec1_grf(1, 0);
 
@@ -118,10 +127,10 @@ static void emit_delta_xy(struct brw_compile *p,
    }
 }
 
-static void emit_wpos_xy(struct brw_wm_compile *c,
-			 const struct brw_reg *dst,
-			 GLuint mask,
-			 const struct brw_reg *arg0)
+void emit_wpos_xy(struct brw_wm_compile *c,
+		  const struct brw_reg *dst,
+		  GLuint mask,
+		  const struct brw_reg *arg0)
 {
    struct brw_compile *p = &c->func;
 
@@ -146,12 +155,14 @@ static void emit_wpos_xy(struct brw_wm_compile *c,
 }
 
 
-static void emit_pixel_w( struct brw_compile *p,
-			  const struct brw_reg *dst,
-			  GLuint mask,
-			  const struct brw_reg *arg0,
-			  const struct brw_reg *deltas)
+void emit_pixel_w(struct brw_wm_compile *c,
+		  const struct brw_reg *dst,
+		  GLuint mask,
+		  const struct brw_reg *arg0,
+		  const struct brw_reg *deltas)
 {
+   struct brw_compile *p = &c->func;
+
    /* Don't need this if all you are doing is interpolating color, for
     * instance.
     */
@@ -165,21 +176,29 @@ static void emit_pixel_w( struct brw_compile *p,
       brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), deltas[1]);
 
       /* Calc w */
-      brw_math_16( p, dst[3],
-		   BRW_MATH_FUNCTION_INV,
-		   BRW_MATH_SATURATE_NONE,
-		   2, brw_null_reg(),
-		   BRW_MATH_PRECISION_FULL);
+      if (c->dispatch_width == 16) {
+	 brw_math_16(p, dst[3],
+		     BRW_MATH_FUNCTION_INV,
+		     BRW_MATH_SATURATE_NONE,
+		     2, brw_null_reg(),
+		     BRW_MATH_PRECISION_FULL);
+      } else {
+	 brw_math(p, dst[3],
+		  BRW_MATH_FUNCTION_INV,
+		  BRW_MATH_SATURATE_NONE,
+		  2, brw_null_reg(),
+		  BRW_MATH_DATA_VECTOR,
+		  BRW_MATH_PRECISION_FULL);
+      }
    }
 }
 
 
-
-static void emit_linterp( struct brw_compile *p, 
-			 const struct brw_reg *dst,
-			 GLuint mask,
-			 const struct brw_reg *arg0,
-			 const struct brw_reg *deltas )
+void emit_linterp(struct brw_compile *p,
+		  const struct brw_reg *dst,
+		  GLuint mask,
+		  const struct brw_reg *arg0,
+		  const struct brw_reg *deltas)
 {
    struct brw_reg interp[4];
    GLuint nr = arg0[0].nr;
@@ -199,12 +218,12 @@ static void emit_linterp( struct brw_compile *p,
 }
 
 
-static void emit_pinterp( struct brw_compile *p, 
-			  const struct brw_reg *dst,
-			  GLuint mask,
-			  const struct brw_reg *arg0,
-			  const struct brw_reg *deltas,
-			  const struct brw_reg *w)
+void emit_pinterp(struct brw_compile *p,
+		  const struct brw_reg *dst,
+		  GLuint mask,
+		  const struct brw_reg *arg0,
+		  const struct brw_reg *deltas,
+		  const struct brw_reg *w)
 {
    struct brw_reg interp[4];
    GLuint nr = arg0[0].nr;
@@ -229,10 +248,10 @@ static void emit_pinterp( struct brw_compile *p,
 }
 
 
-static void emit_cinterp( struct brw_compile *p, 
-			 const struct brw_reg *dst,
-			 GLuint mask,
-			 const struct brw_reg *arg0 )
+void emit_cinterp(struct brw_compile *p,
+		  const struct brw_reg *dst,
+		  GLuint mask,
+		  const struct brw_reg *arg0)
 {
    struct brw_reg interp[4];
    GLuint nr = arg0[0].nr;
@@ -251,9 +270,9 @@ static void emit_cinterp( struct brw_compile *p,
 }
 
 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
-static void emit_frontfacing( struct brw_compile *p,
-			      const struct brw_reg *dst,
-			      GLuint mask )
+void emit_frontfacing(struct brw_compile *p,
+		      const struct brw_reg *dst,
+		      GLuint mask)
 {
    struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
    GLuint i;
@@ -352,13 +371,13 @@ void emit_ddxy(struct brw_compile *p,
       brw_set_saturate(p, 0);
 }
 
-static void emit_alu1( struct brw_compile *p, 
-		       struct brw_instruction *(*func)(struct brw_compile *, 
-						       struct brw_reg, 
-						       struct brw_reg),
-		       const struct brw_reg *dst,
-		       GLuint mask,
-		       const struct brw_reg *arg0 )
+void emit_alu1(struct brw_compile *p,
+	       struct brw_instruction *(*func)(struct brw_compile *,
+					       struct brw_reg,
+					       struct brw_reg),
+	       const struct brw_reg *dst,
+	       GLuint mask,
+	       const struct brw_reg *arg0)
 {
    GLuint i;
 
@@ -376,15 +395,15 @@ static void emit_alu1( struct brw_compile *p,
 }
 
 
-static void emit_alu2( struct brw_compile *p, 
-		       struct brw_instruction *(*func)(struct brw_compile *, 
-						       struct brw_reg, 
-						       struct brw_reg, 
-						       struct brw_reg),
-		       const struct brw_reg *dst,
-		       GLuint mask,
-		       const struct brw_reg *arg0,
-		       const struct brw_reg *arg1 )
+void emit_alu2(struct brw_compile *p,
+	       struct brw_instruction *(*func)(struct brw_compile *,
+					       struct brw_reg,
+					       struct brw_reg,
+					       struct brw_reg),
+	       const struct brw_reg *dst,
+	       GLuint mask,
+	       const struct brw_reg *arg0,
+	       const struct brw_reg *arg1)
 {
    GLuint i;
 
@@ -402,12 +421,12 @@ static void emit_alu2( struct brw_compile *p,
 }
 
 
-static void emit_mad( struct brw_compile *p, 
-		      const struct brw_reg *dst,
-		      GLuint mask,
-		      const struct brw_reg *arg0,
-		      const struct brw_reg *arg1,
-		      const struct brw_reg *arg2 )
+void emit_mad(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1,
+	      const struct brw_reg *arg2)
 {
    GLuint i;
 
@@ -422,26 +441,12 @@ static void emit_mad( struct brw_compile *p,
    }
 }
 
-static void emit_trunc( struct brw_compile *p,
-		      const struct brw_reg *dst,
-		      GLuint mask,
-		      const struct brw_reg *arg0)
-{
-   GLuint i;
-
-   for (i = 0; i < 4; i++) {
-      if (mask & (1<<i)) {
-	 brw_RNDZ(p, dst[i], arg0[i]);
-      }
-   }
-}
-
-static void emit_lrp( struct brw_compile *p, 
-		      const struct brw_reg *dst,
-		      GLuint mask,
-		      const struct brw_reg *arg0,
-		      const struct brw_reg *arg1,
-		      const struct brw_reg *arg2 )
+void emit_lrp(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1,
+	      const struct brw_reg *arg2)
 {
    GLuint i;
 
@@ -461,21 +466,24 @@ static void emit_lrp( struct brw_compile *p,
    }
 }
 
-static void emit_sop( struct brw_compile *p, 
-		      const struct brw_reg *dst,
-		      GLuint mask,
-		      GLuint cond,
-		      const struct brw_reg *arg0,
-		      const struct brw_reg *arg1 )
+void emit_sop(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      GLuint cond,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1)
 {
    GLuint i;
 
    for (i = 0; i < 4; i++) {
       if (mask & (1<<i)) {	
-	 brw_MOV(p, dst[i], brw_imm_f(0));
+	 brw_push_insn_state(p);
 	 brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
+	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+	 brw_MOV(p, dst[i], brw_imm_f(0));
+	 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 	 brw_MOV(p, dst[i], brw_imm_f(1.0));
-	 brw_set_predicate_control_flag_value(p, 0xff);
+	 brw_pop_insn_state(p);
       }
    }
 }
@@ -559,11 +567,11 @@ static void emit_cmp( struct brw_compile *p,
    }
 }
 
-static void emit_max( struct brw_compile *p, 
-		      const struct brw_reg *dst,
-		      GLuint mask,
-		      const struct brw_reg *arg0,
-		      const struct brw_reg *arg1 )
+void emit_max(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1)
 {
    GLuint i;
 
@@ -583,11 +591,11 @@ static void emit_max( struct brw_compile *p,
    }
 }
 
-static void emit_min( struct brw_compile *p, 
-		      const struct brw_reg *dst,
-		      GLuint mask,
-		      const struct brw_reg *arg0,
-		      const struct brw_reg *arg1 )
+void emit_min(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1)
 {
    GLuint i;
 
@@ -608,11 +616,11 @@ static void emit_min( struct brw_compile *p,
 }
 
 
-static void emit_dp3( struct brw_compile *p, 
-		      const struct brw_reg *dst,
-		      GLuint mask,
-		      const struct brw_reg *arg0,
-		      const struct brw_reg *arg1 )
+void emit_dp3(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1)
 {
    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 
@@ -630,11 +638,11 @@ static void emit_dp3( struct brw_compile *p,
 }
 
 
-static void emit_dp4( struct brw_compile *p, 
-		      const struct brw_reg *dst,
-		      GLuint mask,
-		      const struct brw_reg *arg0,
-		      const struct brw_reg *arg1 )
+void emit_dp4(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1)
 {
    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 
@@ -653,11 +661,11 @@ static void emit_dp4( struct brw_compile *p,
 }
 
 
-static void emit_dph( struct brw_compile *p, 
-		      const struct brw_reg *dst,
-		      GLuint mask,
-		      const struct brw_reg *arg0,
-		      const struct brw_reg *arg1 )
+void emit_dph(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1)
 {
    const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 
@@ -676,15 +684,15 @@ static void emit_dph( struct brw_compile *p,
 }
 
 
-static void emit_xpd( struct brw_compile *p, 
-		      const struct brw_reg *dst,
-		      GLuint mask,
-		      const struct brw_reg *arg0,
-		      const struct brw_reg *arg1 )
+void emit_xpd(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1)
 {
    GLuint i;
 
-   assert(!(mask & WRITEMASK_W) == WRITEMASK_X);
+   assert((mask & WRITEMASK_W) != WRITEMASK_W);
    
    for (i = 0 ; i < 3; i++) {
       if (mask & (1<<i)) {
@@ -701,41 +709,68 @@ static void emit_xpd( struct brw_compile *p,
 }
 
 
-static void emit_math1( struct brw_compile *p, 
-			GLuint function,
-			const struct brw_reg *dst,
-			GLuint mask,
-			const struct brw_reg *arg0 )
+void emit_math1(struct brw_wm_compile *c,
+		GLuint function,
+		const struct brw_reg *dst,
+		GLuint mask,
+		const struct brw_reg *arg0)
 {
+   struct brw_compile *p = &c->func;
    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+   GLuint saturate = ((mask & SATURATE) ?
+		      BRW_MATH_SATURATE_SATURATE :
+		      BRW_MATH_SATURATE_NONE);
 
    if (!(mask & WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 
+   /* If compressed, this will write message reg 2,3 from arg0.x's 16
+    * channels.
+    */
    brw_MOV(p, brw_message_reg(2), arg0[0]);
 
    /* Send two messages to perform all 16 operations:
     */
-   brw_math_16(p, 
-	       dst[dst_chan],
+   brw_push_insn_state(p);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_math(p,
+	    dst[dst_chan],
+	    function,
+	    saturate,
+	    2,
+	    brw_null_reg(),
+	    BRW_MATH_DATA_VECTOR,
+	    BRW_MATH_PRECISION_FULL);
+
+   if (c->dispatch_width == 16) {
+      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+      brw_math(p,
+	       offset(dst[dst_chan],1),
 	       function,
-	       (mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
-	       2,
+	       saturate,
+	       3,
 	       brw_null_reg(),
+	       BRW_MATH_DATA_VECTOR,
 	       BRW_MATH_PRECISION_FULL);
+   }
+   brw_pop_insn_state(p);
 }
 
 
-static void emit_math2( struct brw_compile *p, 
-			GLuint function,
-			const struct brw_reg *dst,
-			GLuint mask,
-			const struct brw_reg *arg0,
-			const struct brw_reg *arg1)
+void emit_math2(struct brw_wm_compile *c,
+		GLuint function,
+		const struct brw_reg *dst,
+		GLuint mask,
+		const struct brw_reg *arg0,
+		const struct brw_reg *arg1)
 {
+   struct brw_compile *p = &c->func;
    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+   GLuint saturate = ((mask & SATURATE) ?
+		      BRW_MATH_SATURATE_SATURATE :
+		      BRW_MATH_SATURATE_NONE);
 
    if (!(mask & WRITEMASK_XYZW))
       return; /* Do not emit dead code */
@@ -746,173 +781,233 @@ static void emit_math2( struct brw_compile *p,
 
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_MOV(p, brw_message_reg(2), arg0[0]);
-   brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
-   brw_MOV(p, brw_message_reg(4), sechalf(arg0[0]));
+   if (c->dispatch_width == 16) {
+      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+      brw_MOV(p, brw_message_reg(4), sechalf(arg0[0]));
+   }
 
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_MOV(p, brw_message_reg(3), arg1[0]);
-   brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
-   brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
+   if (c->dispatch_width == 16) {
+      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+      brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
+   }
 
-   
-   /* Send two messages to perform all 16 operations:
-    */
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_math(p, 
 	    dst[dst_chan],
 	    function,
-	    (mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
+	    saturate,
 	    2,
 	    brw_null_reg(),
 	    BRW_MATH_DATA_VECTOR,
 	    BRW_MATH_PRECISION_FULL);
 
-   brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
-   brw_math(p, 
-	    offset(dst[dst_chan],1),
-	    function,
-	    (mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
-	    4,
-	    brw_null_reg(),
-	    BRW_MATH_DATA_VECTOR,
-	    BRW_MATH_PRECISION_FULL);
-   
+   /* Send two messages to perform all 16 operations:
+    */
+   if (c->dispatch_width == 16) {
+      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+      brw_math(p,
+	       offset(dst[dst_chan],1),
+	       function,
+	       saturate,
+	       4,
+	       brw_null_reg(),
+	       BRW_MATH_DATA_VECTOR,
+	       BRW_MATH_PRECISION_FULL);
+   }
    brw_pop_insn_state(p);
 }
-		     
 
 
-static void emit_tex( struct brw_wm_compile *c,
-		      const struct brw_wm_instruction *inst,
-		      struct brw_reg *dst,
-		      GLuint dst_flags,
-		      struct brw_reg *arg )
+void emit_tex(struct brw_wm_compile *c,
+	      struct brw_reg *dst,
+	      GLuint dst_flags,
+	      struct brw_reg *arg,
+	      struct brw_reg depth_payload,
+	      GLuint tex_idx,
+	      GLuint sampler,
+	      GLboolean shadow)
 {
    struct brw_compile *p = &c->func;
-   GLuint msgLength, responseLength;
-   GLuint i, nr;
+   struct intel_context *intel = &p->brw->intel;
+   struct brw_reg dst_retyped;
+   GLuint cur_mrf = 2, response_length;
+   GLuint i, nr_texcoords;
    GLuint emit;
    GLuint msg_type;
+   GLuint mrf_per_channel;
+   GLuint simd_mode;
+
+   if (c->dispatch_width == 16) {
+      mrf_per_channel = 2;
+      response_length = 8;
+      dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
+      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+   } else {
+      mrf_per_channel = 1;
+      response_length = 4;
+      dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
+      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
+   }
 
    /* How many input regs are there?
     */
-   switch (inst->tex_idx) {
+   switch (tex_idx) {
    case TEXTURE_1D_INDEX:
       emit = WRITEMASK_X;
-      nr = 1;
+      nr_texcoords = 1;
       break;
    case TEXTURE_2D_INDEX:
    case TEXTURE_RECT_INDEX:
       emit = WRITEMASK_XY;
-      nr = 2;
+      nr_texcoords = 2;
       break;
    case TEXTURE_3D_INDEX:
    case TEXTURE_CUBE_INDEX:
       emit = WRITEMASK_XYZ;
-      nr = 3;
+      nr_texcoords = 3;
       break;
    default:
       /* unexpected target */
       abort();
    }
 
-   if (inst->tex_shadow) {
-      nr = 4;
-      emit |= WRITEMASK_W;
-   }
+   /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
+   if (!intel->is_ironlake && c->dispatch_width == 8)
+      nr_texcoords = 3;
 
-   msgLength = 1;
+   /* For shadow comparisons, we have to supply u,v,r. */
+   if (shadow)
+      nr_texcoords = 3;
 
-   for (i = 0; i < nr; i++) {
-      static const GLuint swz[4] = {0,1,2,2};
-      if (emit & (1<<i)) 
-	 brw_MOV(p, brw_message_reg(msgLength+1), arg[swz[i]]);
+   /* Emit the texcoords. */
+   for (i = 0; i < nr_texcoords; i++) {
+      if (emit & (1<<i))
+	 brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
       else
-	 brw_MOV(p, brw_message_reg(msgLength+1), brw_imm_f(0));
-      msgLength += 2;
+	 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
+      cur_mrf += mrf_per_channel;
    }
 
-   responseLength = 8;		/* always */
+   /* Fill in the shadow comparison reference value. */
+   if (shadow) {
+      if (intel->is_ironlake) {
+	 /* Fill in the cube map array index value. */
+	 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
+	 cur_mrf += mrf_per_channel;
+      } else if (c->dispatch_width == 8) {
+	 /* Fill in the LOD bias value. */
+	 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
+	 cur_mrf += mrf_per_channel;
+      }
+      brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
+      cur_mrf += mrf_per_channel;
+   }
 
-   if (BRW_IS_IGDNG(p->brw)) {
-       if (inst->tex_shadow)
-           msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE_IGDNG;
-       else
-           msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_IGDNG;
+   if (intel->is_ironlake) {
+      if (shadow)
+	 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_IGDNG;
+      else
+	 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_IGDNG;
    } else {
-       if (inst->tex_shadow)
-           msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
-       else
-           msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
+      /* Note that G45 and older determines shadow compare and dispatch width
+       * from message length for most messages.
+       */
+      if (c->dispatch_width == 16 && shadow)
+	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
+      else
+	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
    }
 
-   brw_SAMPLE(p, 
-	      retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW),
+   brw_SAMPLE(p,
+	      dst_retyped,
 	      1,
-	      retype(c->payload.depth[0].hw_reg, BRW_REGISTER_TYPE_UW),
-              SURF_INDEX_TEXTURE(inst->tex_unit),
-	      inst->tex_unit,	  /* sampler */
-	      inst->writemask,
-	      msg_type, 
-	      responseLength,
-	      msgLength,
-	      0,	
+	      retype(depth_payload, BRW_REGISTER_TYPE_UW),
+              SURF_INDEX_TEXTURE(sampler),
+	      sampler,
+	      dst_flags & WRITEMASK_XYZW,
+	      msg_type,
+	      response_length,
+	      cur_mrf - 1,
+	      0,
 	      1,
-	      BRW_SAMPLER_SIMD_MODE_SIMD16);	
+	      simd_mode);
 }
 
 
-static void emit_txb( struct brw_wm_compile *c,
-		      const struct brw_wm_instruction *inst,
-		      struct brw_reg *dst,
-		      GLuint dst_flags,
-		      struct brw_reg *arg )
+void emit_txb(struct brw_wm_compile *c,
+	      struct brw_reg *dst,
+	      GLuint dst_flags,
+	      struct brw_reg *arg,
+	      struct brw_reg depth_payload,
+	      GLuint tex_idx,
+	      GLuint sampler)
 {
    struct brw_compile *p = &c->func;
+   struct intel_context *intel = &p->brw->intel;
    GLuint msgLength;
    GLuint msg_type;
-   /* Shadow ignored for txb.
+   GLuint mrf_per_channel;
+   GLuint response_length;
+   struct brw_reg dst_retyped;
+
+   /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
+    * samples, so we'll use the 16-wide instruction, leave the second halves
+    * undefined, and trust the execution mask to keep the undefined pixels
+    * from mattering.
     */
-   switch (inst->tex_idx) {
+   if (c->dispatch_width == 16 || !intel->is_ironlake) {
+      if (intel->is_ironlake)
+	 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_IGDNG;
+      else
+	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
+      mrf_per_channel = 2;
+      dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
+      response_length = 8;
+   } else {
+      msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_IGDNG;
+      mrf_per_channel = 1;
+      dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
+      response_length = 4;
+   }
+
+   /* Shadow ignored for txb. */
+   switch (tex_idx) {
    case TEXTURE_1D_INDEX:
-      brw_MOV(p, brw_message_reg(2), arg[0]);
-      brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
-      brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
+      brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
+      brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
+      brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
       break;
    case TEXTURE_2D_INDEX:
    case TEXTURE_RECT_INDEX:
-      brw_MOV(p, brw_message_reg(2), arg[0]);
-      brw_MOV(p, brw_message_reg(4), arg[1]);
-      brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
+      brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
+      brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
+      brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
       break;
    case TEXTURE_3D_INDEX:
    case TEXTURE_CUBE_INDEX:
-      brw_MOV(p, brw_message_reg(2), arg[0]);
-      brw_MOV(p, brw_message_reg(4), arg[1]);
-      brw_MOV(p, brw_message_reg(6), arg[2]);
+      brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
+      brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
+      brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
       break;
    default:
       /* unexpected target */
       abort();
    }
 
-   brw_MOV(p, brw_message_reg(8), arg[3]);
-   msgLength = 9;
-
-   if (BRW_IS_IGDNG(p->brw))
-       msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS_IGDNG;
-   else
-       msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
+   brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
+   msgLength = 2 + 4 * mrf_per_channel - 1;
 
    brw_SAMPLE(p, 
-	      retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW),
+	      dst_retyped,
 	      1,
-	      retype(c->payload.depth[0].hw_reg, BRW_REGISTER_TYPE_UW),
-              SURF_INDEX_TEXTURE(inst->tex_unit),
-	      inst->tex_unit,	  /* sampler */
-	      inst->writemask,
+	      retype(depth_payload, BRW_REGISTER_TYPE_UW),
+              SURF_INDEX_TEXTURE(sampler),
+	      sampler,
+	      dst_flags & WRITEMASK_XYZW,
 	      msg_type,
-	      8,		/* responseLength */
+	      response_length,
 	      msgLength,
 	      0,	
 	      1,
@@ -920,11 +1015,13 @@ static void emit_txb( struct brw_wm_compile *c,
 }
 
 
-static void emit_lit( struct brw_compile *p, 
-		      const struct brw_reg *dst,
-		      GLuint mask,
-		      const struct brw_reg *arg0 )
+static void emit_lit(struct brw_wm_compile *c,
+		     const struct brw_reg *dst,
+		     GLuint mask,
+		     const struct brw_reg *arg0)
 {
+   struct brw_compile *p = &c->func;
+
    assert((mask & WRITEMASK_XW) == 0);
 
    if (mask & WRITEMASK_Y) {
@@ -934,7 +1031,7 @@ static void emit_lit( struct brw_compile *p,
    }
 
    if (mask & WRITEMASK_Z) {
-      emit_math2(p, BRW_MATH_FUNCTION_POW,
+      emit_math2(c, BRW_MATH_FUNCTION_POW,
 		 &dst[2],
 		 WRITEMASK_X | (mask & SATURATE),
 		 &arg0[1],
@@ -989,7 +1086,7 @@ static void emit_kil_nv( struct brw_wm_compile *c )
 
    brw_push_insn_state(p);
    brw_set_mask_control(p, BRW_MASK_DISABLE);
-   brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
+   brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
    brw_AND(p, r0uw, c->emit_mask_reg, r0uw);
    brw_pop_insn_state(p);
 }
@@ -1001,7 +1098,13 @@ static void fire_fb_write( struct brw_wm_compile *c,
 			   GLuint eot )
 {
    struct brw_compile *p = &c->func;
-   
+   struct brw_reg dst;
+
+   if (c->dispatch_width == 16)
+      dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
+   else
+      dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
+
    /* Pass through control information:
     */
 /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
@@ -1018,7 +1121,7 @@ static void fire_fb_write( struct brw_wm_compile *c,
    /* Send framebuffer write message: */
 /*  send (16) null.0<1>:uw m0               r0.0<8;8,1>:uw   0x85a04000:ud    { Align1 EOT } */
    brw_fb_WRITE(p,
-		retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW),
+		dst,
 		base_reg,
 		retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
 		target,		
@@ -1050,14 +1153,15 @@ static void emit_aa( struct brw_wm_compile *c,
  * \param arg1  the pass-through depth value
  * \param arg2  the shader-computed depth value
  */
-static void emit_fb_write( struct brw_wm_compile *c,
-			   struct brw_reg *arg0,
-			   struct brw_reg *arg1,
-			   struct brw_reg *arg2,
-			   GLuint target,
-			   GLuint eot)
+void emit_fb_write(struct brw_wm_compile *c,
+		   struct brw_reg *arg0,
+		   struct brw_reg *arg1,
+		   struct brw_reg *arg2,
+		   GLuint target,
+		   GLuint eot)
 {
    struct brw_compile *p = &c->func;
+   struct brw_context *brw = p->brw;
    GLuint nr = 2;
    GLuint channel;
 
@@ -1069,30 +1173,37 @@ static void emit_fb_write( struct brw_wm_compile *c,
    /* I don't really understand how this achieves the color interleave
     * (ie RGBARGBA) in the result:  [Do the saturation here]
     */
-   {
-      brw_push_insn_state(p);
-      
-      for (channel = 0; channel < 4; channel++) {
+   brw_push_insn_state(p);
+
+   for (channel = 0; channel < 4; channel++) {
+      if (c->dispatch_width == 16 && brw->has_compr4) {
+	 /* By setting the high bit of the MRF register number, we indicate
+	  * that we want COMPR4 mode - instead of doing the usual destination
+	  * + 1 for the second half we get destination + 4.
+	  */
+	 brw_MOV(p,
+		 brw_message_reg(nr + channel + (1 << 7)),
+		 arg0[channel]);
+      } else {
 	 /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
 	 /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
-
 	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 	 brw_MOV(p,
 		 brw_message_reg(nr + channel),
 		 arg0[channel]);
-       
-	 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
-	 brw_MOV(p,
-		 brw_message_reg(nr + channel + 4),
-		 sechalf(arg0[channel]));
-      }
 
-      /* skip over the regs populated above:
-       */
-      nr += 8;
-   
-      brw_pop_insn_state(p);
+	 if (c->dispatch_width == 16) {
+	    brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+	    brw_MOV(p,
+		    brw_message_reg(nr + channel + 4),
+		    sechalf(arg0[channel]));
+	 }
+      }
    }
+   /* skip over the regs populated above:
+    */
+   nr += 8;
+   brw_pop_insn_state(p);
 
    if (c->key.source_depth_to_render_target)
    {
@@ -1142,7 +1253,7 @@ static void emit_fb_write( struct brw_wm_compile *c,
 	      get_element_ud(brw_vec8_grf(1,0), 6), 
 	      brw_imm_ud(1<<26)); 
 
-      jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
+      jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
       {
 	 emit_aa(c, arg1, 2);
 	 fire_fb_write(c, 0, nr, target, eot);
@@ -1156,7 +1267,6 @@ static void emit_fb_write( struct brw_wm_compile *c,
    }
 }
 
-
 /**
  * Move a GPR to scratch memory. 
  */
@@ -1294,7 +1404,7 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 /* Generated instructions for calculating triangle interpolants:
 	  */
       case WM_PIXELXY:
-	 emit_pixel_xy(p, dst, dst_flags);
+	 emit_pixel_xy(c, dst, dst_flags);
 	 break;
 
       case WM_DELTAXY:
@@ -1306,7 +1416,7 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 break;
 
       case WM_PIXELW:
-	 emit_pixel_w(p, dst, dst_flags, args[0], args[1]);
+	 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
 	 break;
 
       case WM_LINTERP:
@@ -1364,7 +1474,7 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 break;
 
       case OPCODE_TRUNC:
-	 emit_trunc(p, dst, dst_flags, args[0]);
+	 emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
 	 break;
 
       case OPCODE_LRP:
@@ -1391,27 +1501,27 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 /* Higher math functions:
 	  */
       case OPCODE_RCP:
-	 emit_math1(p, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
+	 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
 	 break;
 
       case OPCODE_RSQ:
-	 emit_math1(p, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
+	 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
 	 break;
 
       case OPCODE_SIN:
-	 emit_math1(p, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
+	 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
 	 break;
 
       case OPCODE_COS:
-	 emit_math1(p, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
+	 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
 	 break;
 
       case OPCODE_EX2:
-	 emit_math1(p, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
+	 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
 	 break;
 
       case OPCODE_LG2:
-	 emit_math1(p, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
+	 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
 	 break;
 
       case OPCODE_SCS:
@@ -1419,13 +1529,13 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	  * fixup for 16-element execution.
 	  */
 	 if (dst_flags & WRITEMASK_X)
-	    emit_math1(p, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
+	    emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
 	 if (dst_flags & WRITEMASK_Y)
-	    emit_math1(p, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
+	    emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
 	 break;
 
       case OPCODE_POW:
-	 emit_math2(p, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
+	 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
 	 break;
 
 	 /* Comparisons:
@@ -1463,17 +1573,20 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	break;
 
       case OPCODE_LIT:
-	 emit_lit(p, dst, dst_flags, args[0]);
+	 emit_lit(c, dst, dst_flags, args[0]);
 	 break;
 
 	 /* Texturing operations:
 	  */
       case OPCODE_TEX:
-	 emit_tex(c, inst, dst, dst_flags, args[0]);
+	 emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
+		  inst->tex_idx, inst->tex_unit,
+		  inst->tex_shadow);
 	 break;
 
       case OPCODE_TXB:
-	 emit_txb(c, inst, dst, dst_flags, args[0]);
+	 emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
+		  inst->tex_idx, inst->tex_unit);
 	 break;
 
       case OPCODE_KIL:
diff --git a/src/mesa/drivers/dri/i965/brw_wm_fp.c b/src/mesa/drivers/dri/i965/brw_wm_fp.c
index 0e86d75dea..3737faf26f 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_fp.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_fp.c
@@ -138,7 +138,6 @@ static struct prog_dst_register dst_reg(GLuint file, GLuint idx)
    reg.CondMask = COND_TR;
    reg.CondSwizzle = 0;
    reg.CondSrc = 0;
-   reg.pad = 0;
    return reg;
 }
 
@@ -182,6 +181,8 @@ static void release_temp( struct brw_wm_compile *c, struct prog_dst_register tem
 static struct prog_instruction *get_fp_inst(struct brw_wm_compile *c)
 {
    assert(c->nr_fp_insns < BRW_WM_MAX_INSN);
+   memset(&c->prog_instructions[c->nr_fp_insns], 0,
+	  sizeof(*c->prog_instructions));
    return &c->prog_instructions[c->nr_fp_insns++];
 }
 
@@ -448,7 +449,6 @@ static void emit_interp( struct brw_wm_compile *c,
       break;
 
    case FRAG_ATTRIB_FACE:
-      /* XXX review/test this case */
       emit_op(c,
               WM_FRONTFACING,
               dst_mask(dst, WRITEMASK_X),
@@ -957,7 +957,7 @@ static void precalc_txp( struct brw_wm_compile *c,
 
 
 
-static void emit_fb_write( struct brw_wm_compile *c )
+static void emit_render_target_writes( struct brw_wm_compile *c )
 {
    struct prog_src_register payload_r0_depth = src_reg(PROGRAM_PAYLOAD, PAYLOAD_DEPTH);
    struct prog_src_register outdepth = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DEPTH);
@@ -985,7 +985,7 @@ static void emit_fb_write( struct brw_wm_compile *c )
    }
    else {
       /* if gl_FragData[0] is written, use it, else use gl_FragColor */
-      if (c->fp->program.Base.OutputsWritten & (1 << FRAG_RESULT_DATA0))
+      if (c->fp->program.Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DATA0))
          outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DATA0);
       else 
          outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLOR);
@@ -1154,7 +1154,7 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
 	 out->DstReg.WriteMask = 0;
 	 break;
       case OPCODE_END:
-	 emit_fb_write(c);
+	 emit_render_target_writes(c);
 	 break;
       case OPCODE_PRINT:
 	 break;
diff --git a/src/mesa/drivers/dri/i965/brw_wm_glsl.c b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
index 28d6d4eba5..fde83eea62 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_glsl.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
@@ -371,7 +371,7 @@ static void prealloc_reg(struct brw_wm_compile *c)
 	  for (j = 0; j < 4; j++)
 	     set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
        }
-       if (c->key.vp_outputs_written & (1 << i)) {
+       if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
 	  reg_index += 2;
        }
     }
@@ -550,42 +550,6 @@ static struct brw_reg get_src_reg(struct brw_wm_compile *c,
     }
 }
 
-
-/**
- * Same as \sa get_src_reg() but if the register is a literal, emit
- * a brw_reg encoding the literal.
- * Note that a brw instruction only allows one src operand to be a literal.
- * For instructions with more than one operand, only the second can be a
- * literal.  This means that we treat some literals as constants/uniforms
- * (which why PROGRAM_CONSTANT is checked in fetch_constants()).
- * 
- */
-static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c, 
-                                      const struct prog_instruction *inst,
-                                      GLuint srcRegIndex, GLuint channel)
-{
-    const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
-    if (src->File == PROGRAM_CONSTANT) {
-       /* a literal */
-       const int component = GET_SWZ(src->Swizzle, channel);
-       const GLfloat *param =
-          c->fp->program.Base.Parameters->ParameterValues[src->Index];
-       GLfloat value = param[component];
-       if (src->Negate & (1 << channel))
-          value = -value;
-       if (src->Abs)
-          value = FABSF(value);
-#if 0
-       printf("  form immed value %f for chan %d\n", value, channel);
-#endif
-       return brw_imm_f(value);
-    }
-    else {
-       return get_src_reg(c, inst, srcRegIndex, channel);
-    }
-}
-
-
 /**
  * Subroutines are minimal support for resusable instruction sequences.
  * They are implemented as simply as possible to minimise overhead: there
@@ -650,542 +614,110 @@ static void invoke_subroutine( struct brw_wm_compile *c,
     }
 }
 
-static void emit_trunc( struct brw_wm_compile *c,
-                        const struct prog_instruction *inst)
-{
-    int i;
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
-    for (i = 0; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    struct brw_reg src, dst;
-	    dst = get_dst_reg(c, inst, i);
-	    src = get_src_reg(c, inst, 0, i);
-	    brw_RNDZ(p, dst, src);
-	}
-    }
-    brw_set_saturate(p, 0);
-}
-
-static void emit_mov( struct brw_wm_compile *c,
-                      const struct prog_instruction *inst)
-{
-    int i;
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
-    for (i = 0; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    struct brw_reg src, dst;
-	    dst = get_dst_reg(c, inst, i);
-            /* XXX some moves from immediate value don't work reliably!!! */
-            /*src = get_src_reg_imm(c, inst, 0, i);*/
-            src = get_src_reg(c, inst, 0, i);
-	    brw_MOV(p, dst, src);
-	}
-    }
-    brw_set_saturate(p, 0);
-}
-
-static void emit_pixel_xy(struct brw_wm_compile *c,
-                          const struct prog_instruction *inst)
-{
-    struct brw_reg r1 = brw_vec1_grf(1, 0);
-    struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
-
-    struct brw_reg dst0, dst1;
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-
-    dst0 = get_dst_reg(c, inst, 0);
-    dst1 = get_dst_reg(c, inst, 1);
-    /* Calculate pixel centers by adding 1 or 0 to each of the
-     * micro-tile coordinates passed in r1.
-     */
-    if (mask & WRITEMASK_X) {
-	brw_ADD(p,
-		vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
-		stride(suboffset(r1_uw, 4), 2, 4, 0),
-		brw_imm_v(0x10101010));
-    }
-
-    if (mask & WRITEMASK_Y) {
-	brw_ADD(p,
-		vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
-		stride(suboffset(r1_uw, 5), 2, 4, 0),
-		brw_imm_v(0x11001100));
-    }
-}
-
-static void emit_delta_xy(struct brw_wm_compile *c,
-                          const struct prog_instruction *inst)
-{
-    struct brw_reg r1 = brw_vec1_grf(1, 0);
-    struct brw_reg dst0, dst1, src0, src1;
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-
-    dst0 = get_dst_reg(c, inst, 0);
-    dst1 = get_dst_reg(c, inst, 1);
-    src0 = get_src_reg(c, inst, 0, 0);
-    src1 = get_src_reg(c, inst, 0, 1);
-    /* Calc delta X,Y by subtracting origin in r1 from the pixel
-     * centers.
-     */
-    if (mask & WRITEMASK_X) {
-	brw_ADD(p,
-		dst0,
-		retype(src0, BRW_REGISTER_TYPE_UW),
-		negate(r1));
-    }
-
-    if (mask & WRITEMASK_Y) {
-	brw_ADD(p,
-		dst1,
-		retype(src1, BRW_REGISTER_TYPE_UW),
-		negate(suboffset(r1,1)));
-
-    }
-}
-
-static void fire_fb_write( struct brw_wm_compile *c,
-                           GLuint base_reg,
-                           GLuint nr,
-                           GLuint target,
-                           GLuint eot)
-{
-    struct brw_compile *p = &c->func;
-    /* Pass through control information:
-     */
-    /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
-    {
-	brw_push_insn_state(p);
-	brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
-	brw_MOV(p,
-		brw_message_reg(base_reg + 1),
-		brw_vec8_grf(1, 0));
-	brw_pop_insn_state(p);
-    }
-    /* Send framebuffer write message: */
-    brw_fb_WRITE(p,
-	    retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
-	    base_reg,
-	    retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
-	    target,              
-	    nr,
-	    0,
-	    eot);
-}
-
-static void emit_fb_write(struct brw_wm_compile *c,
-                          const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    int nr = 2;
-    int channel;
-    GLuint target, eot;
-    struct brw_reg src0;
-
-    /* Reserve a space for AA - may not be needed:
-     */
-    if (c->key.aa_dest_stencil_reg)
-	nr += 1;
-
-    brw_push_insn_state(p);
-    for (channel = 0; channel < 4; channel++) {
-        src0 = get_src_reg(c,  inst, 0, channel);
-        /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
-        /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
-        brw_MOV(p, brw_message_reg(nr + channel), src0);
-    }
-    /* skip over the regs populated above: */
-    nr += 8;
-    brw_pop_insn_state(p);
-
-    if (c->key.source_depth_to_render_target) {
-       if (c->key.computes_depth) {
-          src0 = get_src_reg(c, inst, 2, 2);
-          brw_MOV(p, brw_message_reg(nr), src0);
-       }
-       else {
-          src0 = get_src_reg(c, inst, 1, 1);
-          brw_MOV(p, brw_message_reg(nr), src0);
-       }
-
-       nr += 2;
-    }
-
-    if (c->key.dest_depth_reg) {
-        const GLuint comp = c->key.dest_depth_reg / 2;
-        const GLuint off = c->key.dest_depth_reg % 2;
-
-        if (off != 0) {
-            /* XXX this code needs review/testing */
-            struct brw_reg arg1_0 = get_src_reg(c, inst, 1, comp);
-            struct brw_reg arg1_1 = get_src_reg(c, inst, 1, comp+1);
-
-            brw_push_insn_state(p);
-            brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-
-            brw_MOV(p, brw_message_reg(nr), offset(arg1_0, 1));
-            /* 2nd half? */
-            brw_MOV(p, brw_message_reg(nr+1), arg1_1);
-            brw_pop_insn_state(p);
-        }
-        else
-        {
-            struct brw_reg src =  get_src_reg(c, inst, 1, 1);
-            brw_MOV(p, brw_message_reg(nr), src);
-        }
-        nr += 2;
-   }
-
-    target = INST_AUX_GET_TARGET(inst->Aux);
-    eot = inst->Aux & INST_AUX_EOT;
-    fire_fb_write(c, 0, nr, target, eot);
-}
-
-static void emit_pixel_w( struct brw_wm_compile *c,
-                          const struct prog_instruction *inst)
+/* Workaround for using brw_wm_emit.c's emit functions, which expect
+ * destination regs to be uniquely written.  Moves arguments out to
+ * temporaries as necessary for instructions which use their destination as
+ * a temporary.
+ */
+static void
+unalias3(struct brw_wm_compile *c,
+	 void (*func)(struct brw_compile *c,
+		      const struct brw_reg *dst,
+		      GLuint mask,
+		      const struct brw_reg *arg0,
+		      const struct brw_reg *arg1,
+		      const struct brw_reg *arg2),
+	 const struct brw_reg *dst,
+	 GLuint mask,
+	 const struct brw_reg *arg0,
+	 const struct brw_reg *arg1,
+	 const struct brw_reg *arg2)
 {
     struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    if (mask & WRITEMASK_W) {
-	struct brw_reg dst, src0, delta0, delta1;
-	struct brw_reg interp3;
-
-	dst = get_dst_reg(c, inst, 3);
-	src0 = get_src_reg(c, inst, 0, 0);
-	delta0 = get_src_reg(c, inst, 1, 0);
-	delta1 = get_src_reg(c, inst, 1, 1);
-
-	interp3 = brw_vec1_grf(src0.nr+1, 4);
-	/* Calc 1/w - just linterp wpos[3] optimized by putting the
-	 * result straight into a message reg.
-	 */
-	brw_LINE(p, brw_null_reg(), interp3, delta0);
-	brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
-
-	/* Calc w */
-	brw_math_16( p, dst,
-		BRW_MATH_FUNCTION_INV,
-		BRW_MATH_SATURATE_NONE,
-		2, brw_null_reg(),
-		BRW_MATH_PRECISION_FULL);
-    }
-}
+    struct brw_reg tmp_arg0[4], tmp_arg1[4], tmp_arg2[4];
+    int i, j;
+    int mark = mark_tmps(c);
 
-static void emit_linterp(struct brw_wm_compile *c,
-                         const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    struct brw_reg interp[4];
-    struct brw_reg dst, delta0, delta1;
-    struct brw_reg src0;
-    GLuint nr, i;
-
-    src0 = get_src_reg(c, inst, 0, 0);
-    delta0 = get_src_reg(c, inst, 1, 0);
-    delta1 = get_src_reg(c, inst, 1, 1);
-    nr = src0.nr;
-
-    interp[0] = brw_vec1_grf(nr, 0);
-    interp[1] = brw_vec1_grf(nr, 4);
-    interp[2] = brw_vec1_grf(nr+1, 0);
-    interp[3] = brw_vec1_grf(nr+1, 4);
-
-    for(i = 0; i < 4; i++ ) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    brw_LINE(p, brw_null_reg(), interp[i], delta0);
-	    brw_MAC(p, dst, suboffset(interp[i],1), delta1);
-	}
+    for (j = 0; j < 4; j++) {
+	tmp_arg0[j] = arg0[j];
+	tmp_arg1[j] = arg1[j];
+	tmp_arg2[j] = arg2[j];
     }
-}
-
-static void emit_cinterp(struct brw_wm_compile *c,
-                         const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-
-    struct brw_reg interp[4];
-    struct brw_reg dst, src0;
-    GLuint nr, i;
 
-    src0 = get_src_reg(c, inst, 0, 0);
-    nr = src0.nr;
-
-    interp[0] = brw_vec1_grf(nr, 0);
-    interp[1] = brw_vec1_grf(nr, 4);
-    interp[2] = brw_vec1_grf(nr+1, 0);
-    interp[3] = brw_vec1_grf(nr+1, 4);
-
-    for(i = 0; i < 4; i++ ) {
+    for (i = 0; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    brw_MOV(p, dst, suboffset(interp[i],3));
+	    for (j = 0; j < 4; j++) {
+		if (arg0[j].file == dst[i].file &&
+		    dst[i].nr == arg0[j].nr) {
+		    tmp_arg0[j] = alloc_tmp(c);
+		    brw_MOV(p, tmp_arg0[j], arg0[j]);
+		}
+		if (arg1[j].file == dst[i].file &&
+		    dst[i].nr == arg1[j].nr) {
+		    tmp_arg1[j] = alloc_tmp(c);
+		    brw_MOV(p, tmp_arg1[j], arg1[j]);
+		}
+		if (arg2[j].file == dst[i].file &&
+		    dst[i].nr == arg2[j].nr) {
+		    tmp_arg2[j] = alloc_tmp(c);
+		    brw_MOV(p, tmp_arg2[j], arg2[j]);
+		}
+	    }
 	}
     }
-}
-
-static void emit_pinterp(struct brw_wm_compile *c,
-                         const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
 
-    struct brw_reg interp[4];
-    struct brw_reg dst, delta0, delta1;
-    struct brw_reg src0, w;
-    GLuint nr, i;
+    func(p, dst, mask, tmp_arg0, tmp_arg1, tmp_arg2);
 
-    src0 = get_src_reg(c, inst, 0, 0);
-    delta0 = get_src_reg(c, inst, 1, 0);
-    delta1 = get_src_reg(c, inst, 1, 1);
-    w = get_src_reg(c, inst, 2, 3);
-    nr = src0.nr;
-
-    interp[0] = brw_vec1_grf(nr, 0);
-    interp[1] = brw_vec1_grf(nr, 4);
-    interp[2] = brw_vec1_grf(nr+1, 0);
-    interp[3] = brw_vec1_grf(nr+1, 4);
-
-    for(i = 0; i < 4; i++ ) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    brw_LINE(p, brw_null_reg(), interp[i], delta0);
-	    brw_MAC(p, dst, suboffset(interp[i],1), 
-		    delta1);
-	    brw_MUL(p, dst, dst, w);
-	}
-    }
+    release_tmps(c, mark);
 }
 
-/* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
-static void emit_frontfacing(struct brw_wm_compile *c,
-			     const struct prog_instruction *inst)
+/* Workaround for using brw_wm_emit.c's emit functions, which expect
+ * destination regs to be uniquely written.  Moves arguments out to
+ * temporaries as necessary for instructions which use their destination as
+ * a temporary.
+ */
+static void
+unalias2(struct brw_wm_compile *c,
+	 void (*func)(struct brw_compile *c,
+		      const struct brw_reg *dst,
+		      GLuint mask,
+		      const struct brw_reg *arg0,
+		      const struct brw_reg *arg1),
+	 const struct brw_reg *dst,
+	 GLuint mask,
+	 const struct brw_reg *arg0,
+	 const struct brw_reg *arg1)
 {
     struct brw_compile *p = &c->func;
-    struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
-    struct brw_reg dst;
-    GLuint mask = inst->DstReg.WriteMask;
-    int i;
-
-    for (i = 0; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    brw_MOV(p, dst, brw_imm_f(0.0));
-	}
-    }
+    struct brw_reg tmp_arg0[4], tmp_arg1[4];
+    int i, j;
+    int mark = mark_tmps(c);
 
-    /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
-     * us front face
-     */
-    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
-    for (i = 0; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    brw_MOV(p, dst, brw_imm_f(1.0));
-	}
+    for (j = 0; j < 4; j++) {
+	tmp_arg0[j] = arg0[j];
+	tmp_arg1[j] = arg1[j];
     }
-    brw_set_predicate_control_flag_value(p, 0xff);
-}
 
-static void emit_xpd(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    int i;
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
     for (i = 0; i < 4; i++) {
-	GLuint i2 = (i+2)%3;
-	GLuint i1 = (i+1)%3;
 	if (mask & (1<<i)) {
-	    struct brw_reg src0, src1, dst;
-	    dst = get_dst_reg(c, inst, i);
-	    src0 = negate(get_src_reg(c, inst, 0, i2));
-	    src1 = get_src_reg_imm(c, inst, 1, i1);
-	    brw_MUL(p, brw_null_reg(), src0, src1);
-	    src0 = get_src_reg(c, inst, 0, i1);
-	    src1 = get_src_reg_imm(c, inst, 1, i2);
-	    brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
-	    brw_MAC(p, dst, src0, src1);
-	    brw_set_saturate(p, 0);
+	    for (j = 0; j < 4; j++) {
+		if (arg0[j].file == dst[i].file &&
+		    dst[i].nr == arg0[j].nr) {
+		    tmp_arg0[j] = alloc_tmp(c);
+		    brw_MOV(p, tmp_arg0[j], arg0[j]);
+		}
+		if (arg1[j].file == dst[i].file &&
+		    dst[i].nr == arg1[j].nr) {
+		    tmp_arg1[j] = alloc_tmp(c);
+		    brw_MOV(p, tmp_arg1[j], arg1[j]);
+		}
+	    }
 	}
     }
-    brw_set_saturate(p, 0);
-}
 
-static void emit_dp3(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_reg src0[3], src1[3], dst;
-    int i;
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
-
-    if (!(mask & WRITEMASK_XYZW))
-	return;
-
-    assert(is_power_of_two(mask & WRITEMASK_XYZW));
-
-    for (i = 0; i < 3; i++) {
-	src0[i] = get_src_reg(c, inst, 0, i);
-	src1[i] = get_src_reg_imm(c, inst, 1, i);
-    }
-
-    dst = get_dst_reg(c, inst, dst_chan);
-    brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
-    brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
-    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    brw_MAC(p, dst, src0[2], src1[2]);
-    brw_set_saturate(p, 0);
-}
-
-static void emit_dp4(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_reg src0[4], src1[4], dst;
-    int i;
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
-
-    if (!(mask & WRITEMASK_XYZW))
-	return;
-
-    assert(is_power_of_two(mask & WRITEMASK_XYZW));
-
-    for (i = 0; i < 4; i++) {
-	src0[i] = get_src_reg(c, inst, 0, i);
-	src1[i] = get_src_reg_imm(c, inst, 1, i);
-    }
-    dst = get_dst_reg(c, inst, dst_chan);
-    brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
-    brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
-    brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
-    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    brw_MAC(p, dst, src0[3], src1[3]);
-    brw_set_saturate(p, 0);
-}
-
-static void emit_dph(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_reg src0[4], src1[4], dst;
-    int i;
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
-
-    if (!(mask & WRITEMASK_XYZW))
-	return;
-
-    assert(is_power_of_two(mask & WRITEMASK_XYZW));
-
-    for (i = 0; i < 4; i++) {
-	src0[i] = get_src_reg(c, inst, 0, i);
-	src1[i] = get_src_reg_imm(c, inst, 1, i);
-    }
-    dst = get_dst_reg(c, inst, dst_chan);
-    brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
-    brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
-    brw_MAC(p, dst, src0[2], src1[2]);
-    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    brw_ADD(p, dst, dst, src1[3]);
-    brw_set_saturate(p, 0);
-}
-
-/**
- * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
- * Note that the result of the function is smeared across the dest
- * register's X, Y, Z and W channels (subject to writemasking of course).
- */
-static void emit_math1(struct brw_wm_compile *c,
-                       const struct prog_instruction *inst, GLuint func)
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, dst;
-    GLuint mask = inst->DstReg.WriteMask;
-    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
-
-    if (!(mask & WRITEMASK_XYZW))
-	return;
-
-    assert(is_power_of_two(mask & WRITEMASK_XYZW));
-
-    /* Get first component of source register */
-    dst = get_dst_reg(c, inst, dst_chan);
-    src0 = get_src_reg(c, inst, 0, 0);
-
-    brw_MOV(p, brw_message_reg(2), src0);
-    brw_math(p,
-             dst,
-             func,
-             (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
-             2,
-             brw_null_reg(),
-             BRW_MATH_DATA_VECTOR,
-             BRW_MATH_PRECISION_FULL);
-}
-
-static void emit_rcp(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
-}
-
-static void emit_rsq(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
-}
-
-static void emit_sin(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
-}
+    func(p, dst, mask, tmp_arg0, tmp_arg1);
 
-static void emit_cos(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
-}
-
-static void emit_ex2(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
-}
-
-static void emit_lg2(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
-}
-
-static void emit_add(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, src1, dst;
-    GLuint mask = inst->DstReg.WriteMask;
-    int i;
-    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    src0 = get_src_reg(c, inst, 0, i);
-	    src1 = get_src_reg_imm(c, inst, 1, i);
-	    brw_ADD(p, dst, src0, src1);
-	}
-    }
-    brw_set_saturate(p, 0);
+    release_tmps(c, mark);
 }
 
 static void emit_arl(struct brw_wm_compile *c,
@@ -1201,180 +733,6 @@ static void emit_arl(struct brw_wm_compile *c,
     brw_set_saturate(p, 0);
 }
 
-
-static void emit_mul(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, src1, dst;
-    GLuint mask = inst->DstReg.WriteMask;
-    int i;
-    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    src0 = get_src_reg(c, inst, 0, i);
-	    src1 = get_src_reg_imm(c, inst, 1, i);
-	    brw_MUL(p, dst, src0, src1);
-	}
-    }
-    brw_set_saturate(p, 0);
-}
-
-static void emit_frc(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, dst;
-    GLuint mask = inst->DstReg.WriteMask;
-    int i;
-    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    src0 = get_src_reg_imm(c, inst, 0, i);
-	    brw_FRC(p, dst, src0);
-	}
-    }
-    if (inst->SaturateMode != SATURATE_OFF)
-	brw_set_saturate(p, 0);
-}
-
-static void emit_flr(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, dst;
-    GLuint mask = inst->DstReg.WriteMask;
-    int i;
-    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    src0 = get_src_reg_imm(c, inst, 0, i);
-	    brw_RNDD(p, dst, src0);
-	}
-    }
-    brw_set_saturate(p, 0);
-}
-
-
-static void emit_min_max(struct brw_wm_compile *c,
-                         const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    const GLuint mask = inst->DstReg.WriteMask;
-    const int mark = mark_tmps(c);
-    int i;
-    brw_push_insn_state(p);
-    for (i = 0; i < 4; i++) {
-	if (mask & (1<<i)) {
-            struct brw_reg real_dst = get_dst_reg(c, inst, i);
-	    struct brw_reg src0 = get_src_reg(c, inst, 0, i);
-	    struct brw_reg src1 = get_src_reg(c, inst, 1, i);
-            struct brw_reg dst;
-            /* if dst==src0 or dst==src1 we need to use a temp reg */
-            GLboolean use_temp = brw_same_reg(dst, src0) ||
-                                 brw_same_reg(dst, src1);
-            if (use_temp)
-               dst = alloc_tmp(c);
-            else
-               dst = real_dst;
-
-            /*
-            printf("  Min/max: dst %d  src0 %d  src1 %d\n",
-                   dst.nr, src0.nr, src1.nr);
-            */
-	    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-	    brw_MOV(p, dst, src0);
-	    brw_set_saturate(p, 0);
-
-            if (inst->Opcode == OPCODE_MIN)
-               brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
-            else
-               brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0);
-
-	    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-	    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
-	    brw_MOV(p, dst, src1);
-	    brw_set_saturate(p, 0);
-	    brw_set_predicate_control_flag_value(p, 0xff);
-            if (use_temp)
-               brw_MOV(p, real_dst, dst);
-	}
-    }
-    brw_pop_insn_state(p);
-    release_tmps(c, mark);
-}
-
-static void emit_pow(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg dst, src0, src1;
-    GLuint mask = inst->DstReg.WriteMask;
-    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
-
-    if (!(mask & WRITEMASK_XYZW))
-	return;
-
-    assert(is_power_of_two(mask & WRITEMASK_XYZW));
-
-    dst = get_dst_reg(c, inst, dst_chan);
-    src0 = get_src_reg_imm(c, inst, 0, 0);
-    src1 = get_src_reg_imm(c, inst, 1, 0);
-
-    brw_MOV(p, brw_message_reg(2), src0);
-    brw_MOV(p, brw_message_reg(3), src1);
-
-    brw_math(p,
-	    dst,
-	    BRW_MATH_FUNCTION_POW,
-	    (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
-	    2,
-	    brw_null_reg(),
-	    BRW_MATH_DATA_VECTOR,
-	    BRW_MATH_PRECISION_FULL);
-}
-
-static void emit_lrp(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
-    int i;
-    int mark = mark_tmps(c);
-    for (i = 0; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    src0 = get_src_reg(c, inst, 0, i);
-
-	    src1 = get_src_reg_imm(c, inst, 1, i);
-
-	    if (src1.nr == dst.nr) {
-		tmp1 = alloc_tmp(c);
-		brw_MOV(p, tmp1, src1);
-	    } else
-		tmp1 = src1;
-
-	    src2 = get_src_reg(c, inst, 2, i);
-	    if (src2.nr == dst.nr) {
-		tmp2 = alloc_tmp(c);
-		brw_MOV(p, tmp2, src2);
-	    } else
-		tmp2 = src2;
-
-	    brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
-	    brw_MUL(p, brw_null_reg(), dst, tmp2);
-	    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-	    brw_MAC(p, dst, src0, tmp1);
-	    brw_set_saturate(p, 0);
-	}
-	release_tmps(c, mark);
-    }
-}
-
 /**
  * For GLSL shaders, this KIL will be unconditional.
  * It may be contained inside an IF/ENDIF structure of course.
@@ -1385,94 +743,11 @@ static void emit_kil(struct brw_wm_compile *c)
     struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
     brw_push_insn_state(p);
     brw_set_mask_control(p, BRW_MASK_DISABLE);
-    brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
+    brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
     brw_AND(p, depth, c->emit_mask_reg, depth);
     brw_pop_insn_state(p);
 }
 
-static void emit_mad(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    struct brw_reg dst, src0, src1, src2;
-    int i;
-
-    for (i = 0; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    src0 = get_src_reg(c, inst, 0, i);
-	    src1 = get_src_reg_imm(c, inst, 1, i);
-	    src2 = get_src_reg_imm(c, inst, 2, i);
-	    brw_MUL(p, dst, src0, src1);
-
-	    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-	    brw_ADD(p, dst, dst, src2);
-	    brw_set_saturate(p, 0);
-	}
-    }
-}
-
-static void emit_sop(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst, GLuint cond)
-{
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    struct brw_reg dst, src0, src1;
-    int i;
-
-    for (i = 0; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    src0 = get_src_reg(c, inst, 0, i);
-	    src1 = get_src_reg_imm(c, inst, 1, i);
-	    brw_push_insn_state(p);
-	    brw_CMP(p, brw_null_reg(), cond, src0, src1);
-	    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-	    brw_MOV(p, dst, brw_imm_f(0.0));
-	    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
-	    brw_MOV(p, dst, brw_imm_f(1.0));
-	    brw_pop_insn_state(p);
-	}
-    }
-}
-
-static void emit_slt(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    emit_sop(c, inst, BRW_CONDITIONAL_L);
-}
-
-static void emit_sle(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    emit_sop(c, inst, BRW_CONDITIONAL_LE);
-}
-
-static void emit_sgt(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    emit_sop(c, inst, BRW_CONDITIONAL_G);
-}
-
-static void emit_sge(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    emit_sop(c, inst, BRW_CONDITIONAL_GE);
-}
-
-static void emit_seq(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    emit_sop(c, inst, BRW_CONDITIONAL_EQ);
-}
-
-static void emit_sne(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
-}
-
 static INLINE struct brw_reg high_words( struct brw_reg reg )
 {
     return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
@@ -2525,196 +1800,6 @@ static void emit_noise4( struct brw_wm_compile *c,
     
     release_tmps( c, mark );
 }
-    
-static void emit_wpos_xy(struct brw_wm_compile *c,
-                         const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    struct brw_reg src0[2], dst[2];
-
-    dst[0] = get_dst_reg(c, inst, 0);
-    dst[1] = get_dst_reg(c, inst, 1);
-
-    src0[0] = get_src_reg(c, inst, 0, 0);
-    src0[1] = get_src_reg(c, inst, 0, 1);
-
-    /* Calculate the pixel offset from window bottom left into destination
-     * X and Y channels.
-     */
-    if (mask & WRITEMASK_X) {
-	/* X' = X - origin_x */
-	brw_ADD(p,
-		dst[0],
-		retype(src0[0], BRW_REGISTER_TYPE_W),
-		brw_imm_d(0 - c->key.origin_x));
-    }
-
-    if (mask & WRITEMASK_Y) {
-	/* Y' = height - (Y - origin_y) = height + origin_y - Y */
-	brw_ADD(p,
-		dst[1],
-		negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
-		brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
-    }
-}
-
-/* TODO
-   BIAS on SIMD8 not working yet...
- */	
-static void emit_txb(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg dst[4], src[4], payload_reg;
-    /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
-    const GLuint unit = inst->TexSrcUnit;
-    GLuint i;
-    GLuint msg_type;
-
-    assert(unit < BRW_MAX_TEX_UNIT);
-
-    payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
-
-    for (i = 0; i < 4; i++) 
-	dst[i] = get_dst_reg(c, inst, i);
-    for (i = 0; i < 4; i++)
-	src[i] = get_src_reg(c, inst, 0, i);
-
-    switch (inst->TexSrcTarget) {
-	case TEXTURE_1D_INDEX:
-	    brw_MOV(p, brw_message_reg(2), src[0]);         /* s coord */
-	    brw_MOV(p, brw_message_reg(3), brw_imm_f(0));   /* t coord */
-	    brw_MOV(p, brw_message_reg(4), brw_imm_f(0));   /* r coord */
-	    break;
-	case TEXTURE_2D_INDEX:
-	case TEXTURE_RECT_INDEX:
-	    brw_MOV(p, brw_message_reg(2), src[0]);
-	    brw_MOV(p, brw_message_reg(3), src[1]);
-	    brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
-	    break;
-	case TEXTURE_3D_INDEX:
-	case TEXTURE_CUBE_INDEX:
-	    brw_MOV(p, brw_message_reg(2), src[0]);
-	    brw_MOV(p, brw_message_reg(3), src[1]);
-	    brw_MOV(p, brw_message_reg(4), src[2]);
-	    break;
-	default:
-            /* invalid target */
-            abort();
-    }
-    brw_MOV(p, brw_message_reg(5), src[3]);          /* bias */
-    brw_MOV(p, brw_message_reg(6), brw_imm_f(0));    /* ref (unused?) */
-
-    if (BRW_IS_IGDNG(p->brw)) {
-        msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG;
-    } else {
-        /* Does it work well on SIMD8? */
-        msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
-    }
-
-    brw_SAMPLE(p,
-               retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),  /* dest */
-               1,                                           /* msg_reg_nr */
-               retype(payload_reg, BRW_REGISTER_TYPE_UW),   /* src0 */
-               SURF_INDEX_TEXTURE(unit),
-               unit,                                        /* sampler */
-               inst->DstReg.WriteMask,                      /* writemask */
-               msg_type,                                    /* msg_type */
-               4,                                           /* response_length */
-               4,                                           /* msg_length */
-               0,                                           /* eot */
-               1,
-               BRW_SAMPLER_SIMD_MODE_SIMD8);	
-}
-
-
-static void emit_tex(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg dst[4], src[4], payload_reg;
-    /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
-    const GLuint unit = inst->TexSrcUnit;
-    GLuint msg_len;
-    GLuint i, nr;
-    GLuint emit;
-    GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
-    GLuint msg_type;
-
-    assert(unit < BRW_MAX_TEX_UNIT);
-
-    payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
-
-    for (i = 0; i < 4; i++) 
-	dst[i] = get_dst_reg(c, inst, i);
-    for (i = 0; i < 4; i++)
-	src[i] = get_src_reg(c, inst, 0, i);
-
-    switch (inst->TexSrcTarget) {
-	case TEXTURE_1D_INDEX:
-	    emit = WRITEMASK_X;
-	    nr = 1;
-	    break;
-	case TEXTURE_2D_INDEX:
-	case TEXTURE_RECT_INDEX:
-	    emit = WRITEMASK_XY;
-	    nr = 2;
-	    break;
-	case TEXTURE_3D_INDEX:
-	case TEXTURE_CUBE_INDEX:
-	    emit = WRITEMASK_XYZ;
-	    nr = 3;
-	    break;
-	default:
-           /* invalid target */
-           abort();
-    }
-    msg_len = 1;
-
-    /* move/load S, T, R coords */
-    for (i = 0; i < nr; i++) {
-	static const GLuint swz[4] = {0,1,2,2};
-	if (emit & (1<<i))
-	    brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
-	else
-	    brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
-	msg_len += 1;
-    }
-
-    if (shadow) {
-       brw_MOV(p, brw_message_reg(5), brw_imm_f(0));  /* lod / bias */
-       brw_MOV(p, brw_message_reg(6), src[2]);        /* ref value / R coord */
-    }
-
-    if (BRW_IS_IGDNG(p->brw)) {
-        if (shadow)
-            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG;
-        else
-            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG;
-    } else {
-        /* Does it work for shadow on SIMD8 ? */
-        msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
-    }
-    
-    brw_SAMPLE(p,
-               retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
-               1,                                          /* msg_reg_nr */
-               retype(payload_reg, BRW_REGISTER_TYPE_UW),  /* src0 */
-               SURF_INDEX_TEXTURE(unit),
-               unit,                                       /* sampler */
-               inst->DstReg.WriteMask,                     /* writemask */
-               msg_type,                                   /* msg_type */
-               4,                                          /* response_length */
-               shadow ? 6 : 4,                             /* msg_length */
-               0,                                          /* eot */
-               1,
-               BRW_SAMPLER_SIMD_MODE_SIMD8);	
-
-    if (shadow)
-	brw_MOV(p, dst[3], brw_imm_f(1.0));
-}
-
 
 /**
  * Resolve subroutine calls after code emit is done.
@@ -2741,6 +1826,7 @@ get_argument_regs(struct brw_wm_compile *c,
 
 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 {
+   struct intel_context *intel = &brw->intel;
 #define MAX_IF_DEPTH 32
 #define MAX_LOOP_DEPTH 32
     struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
@@ -2771,137 +1857,153 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
         if (c->fp->use_const_buffer)
            fetch_constants(c, inst);
 
-	if (inst->CondUpdate)
-	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
-	else
-	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
+	if (inst->Opcode != OPCODE_ARL) {
+	   for (j = 0; j < 4; j++) {
+	      if (inst->DstReg.WriteMask & (1 << j))
+		 dst[j] = get_dst_reg(c, inst, j);
+	      else
+		 dst[j] = brw_null_reg();
+	   }
+	}
+	for (j = 0; j < brw_wm_nr_args(inst->Opcode); j++)
+	    get_argument_regs(c, inst, j, args[j], WRITEMASK_XYZW);
 
 	dst_flags = inst->DstReg.WriteMask;
 	if (inst->SaturateMode == SATURATE_ZERO_ONE)
 	    dst_flags |= SATURATE;
 
+	if (inst->CondUpdate)
+	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+	else
+	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
+
 	switch (inst->Opcode) {
 	    case WM_PIXELXY:
-		emit_pixel_xy(c, inst);
+		emit_pixel_xy(c, dst, dst_flags);
 		break;
 	    case WM_DELTAXY: 
-		emit_delta_xy(c, inst);
+		emit_delta_xy(p, dst, dst_flags, args[0]);
 		break;
 	    case WM_PIXELW:
-		emit_pixel_w(c, inst);
+		emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
 		break;	
 	    case WM_LINTERP:
-		emit_linterp(c, inst);
+		emit_linterp(p, dst, dst_flags, args[0], args[1]);
 		break;
 	    case WM_PINTERP:
-		emit_pinterp(c, inst);
+		emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
 		break;
 	    case WM_CINTERP:
-		emit_cinterp(c, inst);
+		emit_cinterp(p, dst, dst_flags, args[0]);
 		break;
 	    case WM_WPOSXY:
-		emit_wpos_xy(c, inst);
+		emit_wpos_xy(c, dst, dst_flags, args[0]);
 		break;
 	    case WM_FB_WRITE:
-		emit_fb_write(c, inst);
+		emit_fb_write(c, args[0], args[1], args[2],
+			      INST_AUX_GET_TARGET(inst->Aux),
+			      inst->Aux & INST_AUX_EOT);
 		break;
 	    case WM_FRONTFACING:
-		emit_frontfacing(c, inst);
+		emit_frontfacing(p, dst, dst_flags);
 		break;
 	    case OPCODE_ADD:
-		emit_add(c, inst);
+		emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
 		break;
 	    case OPCODE_ARL:
 		emit_arl(c, inst);
 		break;
 	    case OPCODE_FRC:
-		emit_frc(c, inst);
+		emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
 		break;
 	    case OPCODE_FLR:
-		emit_flr(c, inst);
+		emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
 		break;
 	    case OPCODE_LRP:
-		emit_lrp(c, inst);
+		unalias3(c, emit_lrp,
+			 dst, dst_flags, args[0], args[1], args[2]);
 		break;
 	    case OPCODE_TRUNC:
-		emit_trunc(c, inst);
+		emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
 		break;
 	    case OPCODE_MOV:
 	    case OPCODE_SWZ:
-		emit_mov(c, inst);
+		emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
 		break;
 	    case OPCODE_DP3:
-		emit_dp3(c, inst);
+		emit_dp3(p, dst, dst_flags, args[0], args[1]);
 		break;
 	    case OPCODE_DP4:
-		emit_dp4(c, inst);
+		emit_dp4(p, dst, dst_flags, args[0], args[1]);
 		break;
 	    case OPCODE_XPD:
-		emit_xpd(c, inst);
+		emit_xpd(p, dst, dst_flags, args[0], args[1]);
 		break;
 	    case OPCODE_DPH:
-		emit_dph(c, inst);
+		emit_dph(p, dst, dst_flags, args[0], args[1]);
 		break;
 	    case OPCODE_RCP:
-		emit_rcp(c, inst);
+		emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
 		break;
 	    case OPCODE_RSQ:
-		emit_rsq(c, inst);
+		emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
 		break;
 	    case OPCODE_SIN:
-		emit_sin(c, inst);
+		emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
 		break;
 	    case OPCODE_COS:
-		emit_cos(c, inst);
+		emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
 		break;
 	    case OPCODE_EX2:
-		emit_ex2(c, inst);
+		emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
 		break;
 	    case OPCODE_LG2:
-		emit_lg2(c, inst);
+		emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
 		break;
 	    case OPCODE_MIN:	
+		unalias2(c, emit_min, dst, dst_flags, args[0], args[1]);
+		break;
 	    case OPCODE_MAX:	
-		emit_min_max(c, inst);
+		unalias2(c, emit_max, dst, dst_flags, args[0], args[1]);
 		break;
 	    case OPCODE_DDX:
 	    case OPCODE_DDY:
-		for (j = 0; j < 4; j++) {
-		    if (inst->DstReg.WriteMask & (1 << j))
-			dst[j] = get_dst_reg(c, inst, j);
-		    else
-			dst[j] = brw_null_reg();
-		}
-		get_argument_regs(c, inst, 0, args[0], WRITEMASK_XYZW);
 		emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
 			  args[0]);
                 break;
 	    case OPCODE_SLT:
-		emit_slt(c, inst);
+		emit_sop(p, dst, dst_flags,
+			 BRW_CONDITIONAL_L, args[0], args[1]);
 		break;
 	    case OPCODE_SLE:
-		emit_sle(c, inst);
+		emit_sop(p, dst, dst_flags,
+			 BRW_CONDITIONAL_LE, args[0], args[1]);
 		break;
 	    case OPCODE_SGT:
-		emit_sgt(c, inst);
+		emit_sop(p, dst, dst_flags,
+			 BRW_CONDITIONAL_G, args[0], args[1]);
 		break;
 	    case OPCODE_SGE:
-		emit_sge(c, inst);
+		emit_sop(p, dst, dst_flags,
+			 BRW_CONDITIONAL_GE, args[0], args[1]);
 		break;
 	    case OPCODE_SEQ:
-		emit_seq(c, inst);
+		emit_sop(p, dst, dst_flags,
+			 BRW_CONDITIONAL_EQ, args[0], args[1]);
 		break;
 	    case OPCODE_SNE:
-		emit_sne(c, inst);
+		emit_sop(p, dst, dst_flags,
+			 BRW_CONDITIONAL_NEQ, args[0], args[1]);
 		break;
 	    case OPCODE_MUL:
-		emit_mul(c, inst);
+		emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
 		break;
 	    case OPCODE_POW:
-		emit_pow(c, inst);
+		emit_math2(c, BRW_MATH_FUNCTION_POW,
+			   dst, dst_flags, args[0], args[1]);
 		break;
 	    case OPCODE_MAD:
-		emit_mad(c, inst);
+		emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
 		break;
 	    case OPCODE_NOISE1:
 		emit_noise1(c, inst);
@@ -2916,10 +2018,19 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 		emit_noise4(c, inst);
 		break;
 	    case OPCODE_TEX:
-		emit_tex(c, inst);
+		emit_tex(c, dst, dst_flags, args[0],
+			 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
+				 0, 1, 0, 0),
+			 inst->TexSrcTarget,
+			 inst->TexSrcUnit,
+			 (c->key.shadowtex_mask & (1 << inst->TexSrcUnit)) != 0);
 		break;
 	    case OPCODE_TXB:
-		emit_txb(c, inst);
+		emit_txb(c, dst, dst_flags, args[0],
+			 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
+				 0, 1, 0, 0),
+			 inst->TexSrcTarget,
+			 c->fp->program.Base.SamplerUnits[inst->TexSrcUnit]);
 		break;
 	    case OPCODE_KIL_NV:
 		emit_kil(c);
@@ -2929,6 +2040,7 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 		if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
 		break;
 	    case OPCODE_ELSE:
+		assert(if_depth > 0);
 		if_inst[if_depth-1]  = brw_ELSE(p, if_inst[if_depth-1]);
 		break;
 	    case OPCODE_ENDIF:
@@ -2982,9 +2094,10 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
                   struct brw_instruction *inst0, *inst1;
                   GLuint br = 1;
 
-                  if (BRW_IS_IGDNG(brw))
+                  if (intel->is_ironlake)
                      br = 2;
- 
+
+		  assert(loop_depth > 0);
                   loop_depth--;
                   inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
                   /* patch all the BREAK/CONT instructions from last BGNLOOP */
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass0.c b/src/mesa/drivers/dri/i965/brw_wm_pass0.c
index 602b1351ef..ff4c082d5e 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass0.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass0.c
@@ -42,12 +42,14 @@
 static struct brw_wm_ref *get_ref( struct brw_wm_compile *c )
 {
    assert(c->nr_refs < BRW_WM_MAX_REF);
+   memset(&c->refs[c->nr_refs], 0, sizeof(*c->refs));
    return &c->refs[c->nr_refs++];
 }
 
 static struct brw_wm_value *get_value( struct brw_wm_compile *c)
 {
    assert(c->nr_refs < BRW_WM_MAX_VREG);
+   memset(&c->vreg[c->nr_vreg], 0, sizeof(*c->vreg));
    return &c->vreg[c->nr_vreg++];
 }
 
@@ -55,6 +57,7 @@ static struct brw_wm_value *get_value( struct brw_wm_compile *c)
 static struct brw_wm_instruction *get_instruction( struct brw_wm_compile *c )
 {
    assert(c->nr_insns < BRW_WM_MAX_INSN);
+   memset(&c->instruction[c->nr_insns], 0, sizeof(*c->instruction));
    return &c->instruction[c->nr_insns++];
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass2.c b/src/mesa/drivers/dri/i965/brw_wm_pass2.c
index 6faea018fb..31303febf0 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass2.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass2.c
@@ -82,8 +82,8 @@ static void init_registers( struct brw_wm_compile *c )
    for (j = 0; j < c->nr_creg; j++) 
       prealloc_reg(c, &c->creg[j], i++);
 
-   for (j = 0; j < FRAG_ATTRIB_MAX; j++) {
-      if (c->key.vp_outputs_written & (1<<j)) {
+   for (j = 0; j < VERT_RESULT_MAX; j++) {
+      if (c->key.vp_outputs_written & BITFIELD64_BIT(j)) {
 	 int fp_index;
 
 	 if (j >= VERT_RESULT_VAR0)
diff --git a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
index 0acb027431..ad267a4e6a 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
@@ -73,8 +73,8 @@ static dri_bo *upload_default_color( struct brw_context *brw,
 
    COPY_4V(sdc.color, color); 
    
-   return brw_cache_data( &brw->cache, BRW_SAMPLER_DEFAULT_COLOR, &sdc,
-			  NULL, 0 );
+   return brw_cache_data(&brw->cache, BRW_SAMPLER_DEFAULT_COLOR,
+			 &sdc, sizeof(sdc), NULL, 0);
 }
 
 
@@ -262,10 +262,10 @@ brw_wm_sampler_populate_key(struct brw_context *brw,
 	 dri_bo_unreference(brw->wm.sdc_bo[unit]);
 	 if (firstImage->_BaseFormat == GL_DEPTH_COMPONENT) {
 	    float bordercolor[4] = {
-	       texObj->BorderColor[0],
-	       texObj->BorderColor[0],
-	       texObj->BorderColor[0],
-	       texObj->BorderColor[0]
+	       texObj->BorderColor.f[0],
+	       texObj->BorderColor.f[0],
+	       texObj->BorderColor.f[0],
+	       texObj->BorderColor.f[0]
 	    };
 	    /* GL specs that border color for depth textures is taken from the
 	     * R channel, while the hardware uses A.  Spam R into all the
@@ -274,7 +274,7 @@ brw_wm_sampler_populate_key(struct brw_context *brw,
 	    brw->wm.sdc_bo[unit] = upload_default_color(brw, bordercolor);
 	 } else {
 	    brw->wm.sdc_bo[unit] = upload_default_color(brw,
-							texObj->BorderColor);
+							texObj->BorderColor.f);
 	 }
 	 key->sampler_count = unit + 1;
       }
diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c
index 361f91292b..d3373ea79e 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_state.c
@@ -49,8 +49,6 @@ struct brw_wm_unit_key {
    unsigned int curbe_offset;
    unsigned int urb_size;
 
-   unsigned int max_threads;
-
    unsigned int nr_surfaces, sampler_count;
    GLboolean uses_depth, computes_depth, uses_kill, is_glsl;
    GLboolean polygon_stipple, stats_wm, line_stipple, offset_enable;
@@ -67,18 +65,6 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
 
    memset(key, 0, sizeof(*key));
 
-   if (INTEL_DEBUG & DEBUG_SINGLE_THREAD)
-      key->max_threads = 1;
-   else {
-      /* WM maximum threads is number of EUs times number of threads per EU. */
-      if (BRW_IS_IGDNG(brw))
-         key->max_threads = 12 * 6;
-      else if (BRW_IS_G4X(brw))
-	 key->max_threads = 10 * 5;
-      else
-	 key->max_threads = 8 * 4;
-   }
-
    /* CACHE_NEW_WM_PROG */
    key->total_grf = brw->wm.prog_data->total_grf;
    key->urb_entry_read_length = brw->wm.prog_data->urb_read_length;
@@ -106,7 +92,7 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
 
    /* as far as we can tell */
    key->computes_depth =
-      (fp->Base.OutputsWritten & (1 << FRAG_RESULT_DEPTH)) != 0;
+      (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) != 0;
    /* BRW_NEW_DEPTH_BUFFER
     * Override for NULL depthbuffer case, required by the Pixel Shader Computed
     * Depth field.
@@ -140,6 +126,7 @@ static dri_bo *
 wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
 			dri_bo **reloc_bufs)
 {
+   struct intel_context *intel = &brw->intel;
    struct brw_wm_unit_state wm;
    dri_bo *bo;
 
@@ -150,7 +137,7 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
    wm.thread1.depth_coef_urb_read_offset = 1;
    wm.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
 
-   if (BRW_IS_IGDNG(brw))
+   if (intel->is_ironlake)
       wm.thread1.binding_table_entry_count = 0; /* hardware requirement */
    else
       wm.thread1.binding_table_entry_count = key->nr_surfaces;
@@ -170,7 +157,7 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
    wm.thread3.const_urb_entry_read_length = key->curb_entry_read_length;
    wm.thread3.const_urb_entry_read_offset = key->curbe_offset * 2;
 
-   if (BRW_IS_IGDNG(brw)) 
+   if (intel->is_ironlake)
       wm.wm4.sampler_count = 0; /* hardware requirement */
    else
       wm.wm4.sampler_count = (key->sampler_count + 1) / 4;
@@ -191,7 +178,7 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
    else
       wm.wm5.enable_16_pix = 1;
 
-   wm.wm5.max_threads = key->max_threads - 1;
+   wm.wm5.max_threads = brw->wm_max_threads - 1;
    wm.wm5.thread_dispatch_enable = 1;	/* AKA: color_write */
    wm.wm5.legacy_line_rast = 0;
    wm.wm5.legacy_global_depth_bias = 0;
@@ -268,7 +255,7 @@ static void upload_wm_unit( struct brw_context *brw )
     */
    assert(key.total_scratch <= 12 * 1024);
    if (key.total_scratch) {
-      GLuint total = key.total_scratch * key.max_threads;
+      GLuint total = key.total_scratch * brw->wm_max_threads;
 
       if (brw->wm.scratch_bo && total > brw->wm.scratch_bo->size) {
 	 dri_bo_unreference(brw->wm.scratch_bo);
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 0bf735c0f2..f26cfabb7d 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -86,25 +86,22 @@ static GLuint translate_tex_format( gl_format mesa_format,
    case MESA_FORMAT_AL88:
       return BRW_SURFACEFORMAT_L8A8_UNORM;
 
+   case MESA_FORMAT_AL1616:
+      return BRW_SURFACEFORMAT_L16A16_UNORM;
+
    case MESA_FORMAT_RGB888:
       assert(0);		/* not supported for sampling */
       return BRW_SURFACEFORMAT_R8G8B8_UNORM;      
 
    case MESA_FORMAT_ARGB8888:
-      if (internal_format == GL_RGB)
-	 return BRW_SURFACEFORMAT_B8G8R8X8_UNORM;
-      else
-	 return BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+      return BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
 
    case MESA_FORMAT_XRGB8888:
       return BRW_SURFACEFORMAT_B8G8R8X8_UNORM;
 
    case MESA_FORMAT_RGBA8888_REV:
       _mesa_problem(NULL, "unexpected format in i965:translate_tex_format()");
-      if (internal_format == GL_RGB)
-	 return BRW_SURFACEFORMAT_R8G8B8X8_UNORM;
-      else
-	 return BRW_SURFACEFORMAT_R8G8B8A8_UNORM;
+      return BRW_SURFACEFORMAT_R8G8B8A8_UNORM;
 
    case MESA_FORMAT_RGB565:
       return BRW_SURFACEFORMAT_B5G6R5_UNORM;
@@ -358,7 +355,10 @@ brw_create_constant_surface( struct brw_context *brw,
 			 NULL, NULL);
 
    if (key->bo) {
-      /* Emit relocation to surface contents */
+      /* Emit relocation to surface contents.  Section 5.1.1 of the gen4
+       * bspec ("Data Cache") says that the data cache does not exist as
+       * a separate cache and is just the sampler cache.
+       */
       dri_bo_emit_reloc(bo,
 			I915_GEM_DOMAIN_SAMPLER, 0,
 			0,
@@ -511,7 +511,8 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
 				struct gl_renderbuffer *rb,
 				unsigned int unit)
 {
-   GLcontext *ctx = &brw->intel.ctx;
+   struct intel_context *intel = &brw->intel;;
+   GLcontext *ctx = &intel->ctx;
    dri_bo *region_bo = NULL;
    struct intel_renderbuffer *irb = intel_renderbuffer(rb);
    struct intel_region *region = irb ? irb->region : NULL;
@@ -522,7 +523,8 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
       GLubyte color_mask[4];
       GLboolean color_blend;
       uint32_t tiling;
-      uint32_t draw_offset;
+      uint32_t draw_x;
+      uint32_t draw_y;
    } key;
 
    memset(&key, 0, sizeof(key));
@@ -531,12 +533,16 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
       region_bo = region->buffer;
 
       key.surface_type = BRW_SURFACE_2D;
-      switch (irb->texformat) {
+      switch (irb->Base.Format) {
+      /* XRGB and ARGB are treated the same here because the chips in this
+       * family cannot render to XRGB targets.  This means that we have to
+       * mask writes to alpha (ala glColorMask) and reconfigure the alpha
+       * blending hardware to use GL_ONE (or GL_ZERO) for cases where
+       * GL_DST_ALPHA (or GL_ONE_MINUS_DST_ALPHA) is used.
+       */
       case MESA_FORMAT_ARGB8888:
-	 key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
-	 break;
       case MESA_FORMAT_XRGB8888:
-	 key.surface_format = BRW_SURFACEFORMAT_B8G8R8X8_UNORM;
+	 key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
 	 break;
       case MESA_FORMAT_RGB565:
 	 key.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM;
@@ -548,7 +554,7 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
 	 key.surface_format = BRW_SURFACEFORMAT_B4G4R4A4_UNORM;
 	 break;
       default:
-	 _mesa_problem(ctx, "Bad renderbuffer format: %d\n", irb->texformat);
+	 _mesa_problem(ctx, "Bad renderbuffer format: %d\n", irb->Base.Format);
       }
       key.tiling = region->tiling;
       if (brw->intel.intelScreen->driScrnPriv->dri2.enabled) {
@@ -560,7 +566,8 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
       }
       key.pitch = region->pitch;
       key.cpp = region->cpp;
-      key.draw_offset = region->draw_offset; /* cur 3d or cube face offset */
+      key.draw_x = region->draw_x;
+      key.draw_y = region->draw_y;
    } else {
       key.surface_type = BRW_SURFACE_NULL;
       key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
@@ -568,11 +575,19 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
       key.width = 1;
       key.height = 1;
       key.cpp = 4;
-      key.draw_offset = 0;
+      key.draw_x = 0;
+      key.draw_y = 0;
    }
    /* _NEW_COLOR */
-   memcpy(key.color_mask, ctx->Color.ColorMask,
+   memcpy(key.color_mask, ctx->Color.ColorMask[0],
 	  sizeof(key.color_mask));
+
+   /* As mentioned above, disable writes to the alpha component when the
+    * renderbuffer is XRGB.
+    */
+   if (ctx->DrawBuffer->Visual.alphaBits == 0)
+     key.color_mask[3] = GL_FALSE;
+
    key.color_blend = (!ctx->Color._LogicOpEnabled &&
 		      ctx->Color.BlendEnabled);
 
@@ -591,25 +606,32 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
       surf.ss0.surface_format = key.surface_format;
       surf.ss0.surface_type = key.surface_type;
       if (key.tiling == I915_TILING_NONE) {
-	 surf.ss1.base_addr = key.draw_offset;
+	 surf.ss1.base_addr = (key.draw_x + key.draw_y * key.pitch) * key.cpp;
       } else {
-	 uint32_t tile_offset = key.draw_offset % 4096;
-
-	 surf.ss1.base_addr = key.draw_offset - tile_offset;
-
-	 assert(BRW_IS_G4X(brw) || tile_offset == 0);
-	 if (BRW_IS_G4X(brw)) {
-	    if (key.tiling == I915_TILING_X) {
-	       /* Note that the low bits of these fields are missing, so
-		* there's the possibility of getting in trouble.
-		*/
-	       surf.ss5.x_offset = (tile_offset % 512) / key.cpp / 4;
-	       surf.ss5.y_offset = tile_offset / 512 / 2;
-	    } else {
-	       surf.ss5.x_offset = (tile_offset % 128) / key.cpp / 4;
-	       surf.ss5.y_offset = tile_offset / 128 / 2;
-	    }
+	 uint32_t tile_base, tile_x, tile_y;
+	 uint32_t pitch = key.pitch * key.cpp;
+
+	 if (key.tiling == I915_TILING_X) {
+	    tile_x = key.draw_x % (512 / key.cpp);
+	    tile_y = key.draw_y % 8;
+	    tile_base = ((key.draw_y / 8) * (8 * pitch));
+	    tile_base += (key.draw_x - tile_x) / (512 / key.cpp) * 4096;
+	 } else {
+	    /* Y */
+	    tile_x = key.draw_x % (128 / key.cpp);
+	    tile_y = key.draw_y % 32;
+	    tile_base = ((key.draw_y / 32) * (32 * pitch));
+	    tile_base += (key.draw_x - tile_x) / (128 / key.cpp) * 4096;
 	 }
+	 assert(intel->is_g4x || (tile_x == 0 && tile_y == 0));
+	 assert(tile_x % 4 == 0);
+	 assert(tile_y % 2 == 0);
+	 /* Note that the low bits of these fields are missing, so
+	  * there's the possibility of getting in trouble.
+	  */
+	 surf.ss1.base_addr = tile_base;
+	 surf.ss5.x_offset = tile_x / 4;
+	 surf.ss5.y_offset = tile_y / 2;
       }
       if (region_bo != NULL)
 	 surf.ss1.base_addr += region_bo->offset; /* reloc */
diff --git a/src/mesa/drivers/dri/i965/intel_swapbuffers.c b/src/mesa/drivers/dri/i965/intel_swapbuffers.c
deleted file mode 120000
index 148d5215aa..0000000000
--- a/src/mesa/drivers/dri/i965/intel_swapbuffers.c
+++ /dev/null
@@ -1 +0,0 @@
-../intel/intel_swapbuffers.c
-\ No newline at end of file