22 files changed, 954 insertions, 135 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_clip_line.c b/src/mesa/drivers/dri/i965/brw_clip_line.c
index c87e5b9a12..c45d48dff8 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_line.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_line.c
@@ -148,7 +148,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
    brw_clip_init_clipmask(c);
 
    /* -ve rhw workaround */
-   if (!(BRW_IS_GM45(p->brw) || BRW_IS_G4X(p->brw))) {
+   if (!BRW_IS_G4X(p->brw)) {
       brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
       brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2),
               brw_imm_ud(1<<20));
diff --git a/src/mesa/drivers/dri/i965/brw_clip_state.c b/src/mesa/drivers/dri/i965/brw_clip_state.c
index 82d1e87357..740c7cbd10 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_state.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_state.c
@@ -102,7 +102,7 @@ clip_unit_create_from_key(struct brw_context *brw,
    clip.clip5.api_mode = BRW_CLIP_API_OGL;
    clip.clip5.clip_mode = key->clip_mode;
 
-   if (BRW_IS_GM45(brw) || BRW_IS_G4X(brw))
+   if (BRW_IS_G4X(brw))
       clip.clip5.negative_w_clip_test = 1;
 
    clip.clip6.clipper_viewport_state_ptr = 0;
diff --git a/src/mesa/drivers/dri/i965/brw_clip_tri.c b/src/mesa/drivers/dri/i965/brw_clip_tri.c
index 8459b59b46..1dbba37fe7 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_tri.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_tri.c
@@ -526,7 +526,7 @@ void brw_emit_tri_clip( struct brw_clip_compile *c )
 
    /* if -ve rhw workaround bit is set, 
       do cliptest */
-   if (!(BRW_IS_GM45(p->brw) || BRW_IS_G4X(p->brw))) {
+   if (!BRW_IS_G4X(p->brw)) {
       brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
       brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2), 
               brw_imm_ud(1<<20));
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 474158b484..e2bc08a6cb 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -39,6 +39,7 @@
 #include "brw_context.h"
 #include "brw_defines.h"
 #include "brw_draw.h"
+#include "brw_state.h"
 #include "brw_vs.h"
 #include "intel_tex.h"
 #include "intel_blit.h"
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 1c6a0dede0..e3904be977 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -433,7 +433,6 @@ struct brw_context
    GLuint primitive;
 
    GLboolean emit_state_always;
-   GLboolean wrap;
    GLboolean tmp_fallback;
    GLboolean no_batch_wrap;
 
@@ -445,6 +444,19 @@ struct brw_context
       GLuint nr_draw_regions;
       struct intel_region *draw_regions[MAX_DRAW_BUFFERS];
       struct intel_region *depth_region;
+
+      /**
+       * List of buffers accumulated in brw_validate_state to receive
+       * dri_bo_check_aperture treatment before exec, so we can know if we
+       * should flush the batch and try again before emitting primitives.
+       *
+       * This can be a fixed number as we only have a limited number of
+       * objects referenced from the batchbuffer in a primitive emit,
+       * consisting of the vertex buffers, pipelined state pointers,
+       * the CURBE, the depth buffer, and a query BO.
+       */
+      dri_bo *validated_bos[VERT_ATTRIB_MAX + 16];
+      int validated_bo_count;
    } state;
 
    struct brw_state_pointers attribs;
@@ -680,14 +692,6 @@ void brw_emit_query_begin(struct brw_context *brw);
 void brw_emit_query_end(struct brw_context *brw);
 
 /*======================================================================
- * brw_state.c
- */
-void brw_validate_state( struct brw_context *brw );
-void brw_init_state( struct brw_context *brw );
-void brw_destroy_state( struct brw_context *brw );
-
-
-/*======================================================================
  * brw_state_dump.c
  */
 void brw_debug_batch(struct intel_context *intel);
diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c
index 7cddd3a7de..c7bac7b0c5 100644
--- a/src/mesa/drivers/dri/i965/brw_curbe.c
+++ b/src/mesa/drivers/dri/i965/brw_curbe.c
@@ -307,6 +307,7 @@ static void prepare_constant_buffer(struct brw_context *brw)
       dri_bo_subdata(brw->curbe.curbe_bo, brw->curbe.curbe_offset, bufsz, buf);
    }
 
+   brw_add_validated_bo(brw, brw->curbe.curbe_bo);
 
    /* Because this provokes an action (ie copy the constants into the
     * URB), it shouldn't be shortcircuited if identical to the
@@ -328,13 +329,6 @@ static void emit_constant_buffer(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
    GLuint sz = brw->curbe.total_size;
-   dri_bo *aper_array[] = {
-      brw->intel.batch->buf,
-      brw->curbe.curbe_bo,
-   };
-
-   if (dri_bufmgr_check_aperture_space(aper_array, ARRAY_SIZE(aper_array)))
-      intel_batchbuffer_flush(intel->batch);
 
    BEGIN_BATCH(2, IGNORE_CLIPRECTS);
    if (sz == 0) {
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 0593e8d5f5..39c32255f8 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -798,10 +798,9 @@
 
 #include "intel_chipset.h"
 
-#define BRW_IS_GM45(brw)        (IS_GM45_GM((brw)->intel.intelScreen->deviceID))
 #define BRW_IS_G4X(brw)         (IS_G4X((brw)->intel.intelScreen->deviceID))
-#define CMD_PIPELINE_SELECT(brw)        ((BRW_IS_GM45(brw) || BRW_IS_G4X(brw)) ? CMD_PIPELINE_SELECT_GM45 : CMD_PIPELINE_SELECT_965)
-#define CMD_VF_STATISTICS(brw)          ((BRW_IS_GM45(brw) || BRW_IS_G4X(brw)) ? CMD_VF_STATISTICS_GM45 : CMD_VF_STATISTICS_965)
-#define URB_SIZES(brw)                  ((BRW_IS_GM45(brw) || BRW_IS_G4X(brw)) ? 384 : 256)  /* 512 bit unit */
+#define CMD_PIPELINE_SELECT(brw)        (BRW_IS_G4X(brw) ? CMD_PIPELINE_SELECT_GM45 : CMD_PIPELINE_SELECT_965)
+#define CMD_VF_STATISTICS(brw)          (BRW_IS_G4X(brw) ? CMD_VF_STATISTICS_GM45 : CMD_VF_STATISTICS_965)
+#define URB_SIZES(brw)                  (BRW_IS_G4X(brw) ? 384 : 256)  /* 512 bit units */
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index 6c71b4abcf..d87b8f8a84 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -256,6 +256,7 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
    struct intel_context *intel = intel_context(ctx);
    struct brw_context *brw = brw_context(ctx);
    GLboolean retval = GL_FALSE;
+   GLboolean warn = GL_FALSE;
    GLuint i;
 
    if (ctx->NewState)
@@ -282,30 +283,25 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
 
    LOCK_HARDWARE(intel);
 
-   if (brw->intel.numClipRects == 0) {
+   if (!intel->constant_cliprect && intel->driDrawable->numClipRects == 0) {
       UNLOCK_HARDWARE(intel);
       return GL_TRUE;
    }
 
+   /* Flush the batch if it's approaching full, so that we don't wrap while
+    * we've got validated state that needs to be in the same batch as the
+    * primitives.  This fraction is just a guess (minimal full state plus
+    * a primitive is around 512 bytes), and would be better if we had
+    * an upper bound of how much we might emit in a single
+    * brw_try_draw_prims().
+    */
+   intel_batchbuffer_require_space(intel->batch, intel->batch->size / 4,
+				   LOOP_CLIPRECTS);
    {
-      /* Flush the batch if it's approaching full, so that we don't wrap while
-       * we've got validated state that needs to be in the same batch as the
-       * primitives.  This fraction is just a guess (minimal full state plus
-       * a primitive is around 512 bytes), and would be better if we had
-       * an upper bound of how much we might emit in a single
-       * brw_try_draw_prims().
-       */
-      if (intel->batch->ptr - intel->batch->map > intel->batch->size * 3 / 4
-	/* brw_emit_prim may change the cliprect_mode to LOOP_CLIPRECTS */
-	  || intel->batch->cliprect_mode != LOOP_CLIPRECTS)
-	      intel_batchbuffer_flush(intel->batch);
-
       /* Set the first primitive early, ahead of validate_state:
        */
       brw_set_prim(brw, prim[0].mode);
 
-      /* XXX:  Need to separate validate and upload of state.  
-       */
       brw_validate_state( brw );
 
       /* Various fallback checks:
@@ -316,6 +312,31 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
       if (check_fallbacks( brw, prim, nr_prims ))
 	 goto out;
 
+      /* Check that we can fit our state in with our existing batchbuffer, or
+       * flush otherwise.
+       */
+      if (dri_bufmgr_check_aperture_space(brw->state.validated_bos,
+					  brw->state.validated_bo_count)) {
+	 static GLboolean warned;
+	 intel_batchbuffer_flush(intel->batch);
+
+	 /* Validate the state after we flushed the batch (which would have
+	  * changed the set of dirty state).  If we still fail to
+	  * check_aperture, warn of what's happening, but attempt to continue
+	  * on since it may succeed anyway, and the user would probably rather
+	  * see a failure and a warning than a fallback.
+	  */
+	 brw_validate_state(brw);
+	 if (!warned &&
+	     dri_bufmgr_check_aperture_space(brw->state.validated_bos,
+					     brw->state.validated_bo_count)) {
+	    warn = GL_TRUE;
+	    warned = GL_TRUE;
+	 }
+      }
+
+      brw_upload_state(brw);
+
       for (i = 0; i < nr_prims; i++) {
 	 brw_emit_prim(brw, &prim[i]);
       }
@@ -326,6 +347,10 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
  out:
    UNLOCK_HARDWARE(intel);
 
+   if (warn)
+      fprintf(stderr, "i965: Single primitive emit potentially exceeded "
+	      "available aperture space\n");
+
    if (!retval)
       DBG("%s failed\n", __FUNCTION__);
 
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index 7b88b5eaa1..4080c5e322 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -250,10 +250,10 @@ static void get_space( struct brw_context *brw,
       wrap_buffers(brw, size);
    }
 
+   assert(*bo_return == NULL);
    dri_bo_reference(brw->vb.upload.bo);
    *bo_return = brw->vb.upload.bo;
    *offset_return = brw->vb.upload.offset;
-
    brw->vb.upload.offset += size;
 }
 
@@ -359,6 +359,14 @@ static void brw_prepare_vertices(struct brw_context *brw)
 	 input->offset = (unsigned long)input->glarray->Ptr;
 	 input->stride = input->glarray->StrideB;
       } else {
+	 if (input->bo != NULL) {
+	    /* Already-uploaded vertex data is present from a previous
+	     * prepare_vertices, but we had to re-validate state due to
+	     * check_aperture failing and a new batch being produced.
+	     */
+	    continue;
+	 }
+
 	 /* Queue the buffer object up to be uploaded in the next pass,
 	  * when we've decided if we're doing interleaved or not.
 	  */
@@ -417,6 +425,12 @@ static void brw_prepare_vertices(struct brw_context *brw)
    }
 
    brw_prepare_query_begin(brw);
+
+   for (i = 0; i < nr_enabled; i++) {
+      struct brw_vertex_element *input = enabled[i];
+
+      brw_add_validated_bo(brw, input->bo);
+   }
 }
 
 static void brw_emit_vertices(struct brw_context *brw)
@@ -512,7 +526,7 @@ static void brw_prepare_indices(struct brw_context *brw)
    struct intel_context *intel = &brw->intel;
    const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
    GLuint ib_size;
-   dri_bo *bo;
+   dri_bo *bo = NULL;
    struct gl_buffer_object *bufferobj;
    GLuint offset;
 
@@ -561,6 +575,8 @@ static void brw_prepare_indices(struct brw_context *brw)
    dri_bo_unreference(brw->ib.bo);
    brw->ib.bo = bo;
    brw->ib.offset = offset;
+
+   brw_add_validated_bo(brw, brw->ib.bo);
 }
 
 static void brw_emit_indices(struct brw_context *brw)
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index 207b8b7ca3..49b422ee2f 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -65,7 +65,7 @@ struct brw_reg
    GLuint abs:1;		/* source only */
    GLuint vstride:4;		/* source only */
    GLuint width:3;		/* src only, align1 only */
-   GLuint hstride:2;   		/* src only, align1 only */
+   GLuint hstride:2;   		/* align1 only */
    GLuint address_mode:1;	/* relative addressing, hopefully! */
    GLuint pad0:1;
 
@@ -432,6 +432,12 @@ static INLINE struct brw_reg brw_uw8_grf( GLuint nr,
    return brw_uw8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
 }
 
+static INLINE struct brw_reg brw_uw16_grf( GLuint nr,
+					    GLuint subnr )
+{
+   return brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
+}
+
 static INLINE struct brw_reg brw_null_reg( void )
 {
    return brw_vec8_reg(BRW_ARCHITECTURE_REGISTER_FILE, 
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 0bfbec9d14..ce4cf46cfa 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -64,7 +64,9 @@ static void brw_set_dest( struct brw_instruction *insn,
 
       if (insn->header.access_mode == BRW_ALIGN_1) {
 	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
-	 insn->bits1.da1.dest_horiz_stride = BRW_HORIZONTAL_STRIDE_1;
+	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
+	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
+	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
       }
       else {
 	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
@@ -78,7 +80,9 @@ static void brw_set_dest( struct brw_instruction *insn,
        */
       if (insn->header.access_mode == BRW_ALIGN_1) {
 	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
-	 insn->bits1.ia1.dest_horiz_stride = BRW_HORIZONTAL_STRIDE_1;
+	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
+	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
+	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
       }
       else {
 	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
@@ -329,14 +333,14 @@ static void brw_set_sampler_message(struct brw_context *brw,
 {
    brw_set_src1(insn, brw_imm_d(0));
 
-   if (BRW_IS_GM45(brw) || BRW_IS_G4X(brw)) {
-      insn->bits3.sampler_gm45_g4x.binding_table_index = binding_table_index;
-      insn->bits3.sampler_gm45_g4x.sampler = sampler;
-      insn->bits3.sampler_gm45_g4x.msg_type = msg_type;
-      insn->bits3.sampler_gm45_g4x.response_length = response_length;
-      insn->bits3.sampler_gm45_g4x.msg_length = msg_length;
-      insn->bits3.sampler_gm45_g4x.end_of_thread = eot;
-      insn->bits3.sampler_gm45_g4x.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
+   if (BRW_IS_G4X(brw)) {
+      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
+      insn->bits3.sampler_g4x.sampler = sampler;
+      insn->bits3.sampler_g4x.msg_type = msg_type;
+      insn->bits3.sampler_g4x.response_length = response_length;
+      insn->bits3.sampler_g4x.msg_length = msg_length;
+      insn->bits3.sampler_g4x.end_of_thread = eot;
+      insn->bits3.sampler_g4x.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
    } else {
       insn->bits3.sampler.binding_table_index = binding_table_index;
       insn->bits3.sampler.sampler = sampler;
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 487c638ce2..627705fa9b 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -71,6 +71,38 @@ const struct brw_tracked_state brw_blend_constant_color = {
    .emit = upload_blend_constant_color
 };
 
+/* Constant single cliprect for framebuffer object or DRI2 drawing */
+static void upload_drawing_rect(struct brw_context *brw)
+{
+   struct intel_context *intel = &brw->intel;
+   GLcontext *ctx = &intel->ctx;
+
+   if (!intel->constant_cliprect)
+      return;
+
+   BEGIN_BATCH(4, NO_LOOP_CLIPRECTS);
+   OUT_BATCH(_3DSTATE_DRAWRECT_INFO_I965);
+   OUT_BATCH(0); /* xmin, ymin */
+   OUT_BATCH(((ctx->DrawBuffer->Width - 1) & 0xffff) |
+	    ((ctx->DrawBuffer->Height - 1) << 16));
+   OUT_BATCH(0);
+   ADVANCE_BATCH();
+}
+
+const struct brw_tracked_state brw_drawing_rect = {
+   .dirty = {
+      .mesa = _NEW_BUFFERS,
+      .brw = 0,
+      .cache = 0
+   },
+   .emit = upload_drawing_rect
+};
+
+static void prepare_binding_table_pointers(struct brw_context *brw)
+{
+   brw_add_validated_bo(brw, brw->wm.bind_bo);
+}
+
 /**
  * Upload the binding table pointers, which point each stage's array of surface
  * state pointers.
@@ -81,13 +113,6 @@ const struct brw_tracked_state brw_blend_constant_color = {
 static void upload_binding_table_pointers(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
-   dri_bo *aper_array[] = {
-      intel->batch->buf,
-      brw->wm.bind_bo,
-   };
-
-   if (dri_bufmgr_check_aperture_space(aper_array, ARRAY_SIZE(aper_array)))
-      intel_batchbuffer_flush(intel->batch);
 
    BEGIN_BATCH(6, IGNORE_CLIPRECTS);
    OUT_BATCH(CMD_BINDING_TABLE_PTRS << 16 | (6 - 2));
@@ -107,6 +132,7 @@ const struct brw_tracked_state brw_binding_table_pointers = {
       .brw = BRW_NEW_BATCH,
       .cache = CACHE_NEW_SURF_BIND,
    },
+   .prepare = prepare_binding_table_pointers,
    .emit = upload_binding_table_pointers,
 };
 
@@ -140,21 +166,18 @@ static void upload_pipelined_state_pointers(struct brw_context *brw )
    brw->state.dirty.brw |= BRW_NEW_PSP;
 }
 
-static void upload_psp_urb_cbs(struct brw_context *brw )
+
+static void prepare_psp_urb_cbs(struct brw_context *brw)
 {
-   struct intel_context *intel = &brw->intel;
-   dri_bo *aper_array[] = {
-      intel->batch->buf,
-      brw->vs.state_bo,
-      brw->gs.state_bo,
-      brw->clip.state_bo,
-      brw->wm.state_bo,
-      brw->cc.state_bo,
-   };
-
-   if (dri_bufmgr_check_aperture_space(aper_array, ARRAY_SIZE(aper_array)))
-      intel_batchbuffer_flush(intel->batch);
+   brw_add_validated_bo(brw, brw->vs.state_bo);
+   brw_add_validated_bo(brw, brw->gs.state_bo);
+   brw_add_validated_bo(brw, brw->clip.state_bo);
+   brw_add_validated_bo(brw, brw->wm.state_bo);
+   brw_add_validated_bo(brw, brw->cc.state_bo);
+}
 
+static void upload_psp_urb_cbs(struct brw_context *brw )
+{
    upload_pipelined_state_pointers(brw);
    brw_upload_urb_fence(brw);
    brw_upload_constant_buffer_state(brw);
@@ -172,14 +195,23 @@ const struct brw_tracked_state brw_psp_urb_cbs = {
 		CACHE_NEW_WM_UNIT | 
 		CACHE_NEW_CC_UNIT)
    },
+   .prepare = prepare_psp_urb_cbs,
    .emit = upload_psp_urb_cbs,
 };
 
+static void prepare_depthbuffer(struct brw_context *brw)
+{
+   struct intel_region *region = brw->state.depth_region;
+
+   if (region != NULL)
+      brw_add_validated_bo(brw, region->buffer);
+}
+
 static void emit_depthbuffer(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
    struct intel_region *region = brw->state.depth_region;
-   unsigned int len = (BRW_IS_GM45(brw) || BRW_IS_G4X(brw)) ? sizeof(struct brw_depthbuffer_gm45_g4x) / 4 : sizeof(struct brw_depthbuffer) / 4;
+   unsigned int len = BRW_IS_G4X(brw) ? 6 : 5;
 
    if (region == NULL) {
       BEGIN_BATCH(len, IGNORE_CLIPRECTS);
@@ -190,16 +222,12 @@ static void emit_depthbuffer(struct brw_context *brw)
       OUT_BATCH(0);
       OUT_BATCH(0);
 
-      if (BRW_IS_GM45(brw) || BRW_IS_G4X(brw))
+      if (BRW_IS_G4X(brw))
          OUT_BATCH(0);
 
       ADVANCE_BATCH();
    } else {
       unsigned int format;
-      dri_bo *aper_array[] = {
-	 intel->batch->buf,
-	 region->buffer
-      };
 
       switch (region->cpp) {
       case 2:
@@ -216,9 +244,6 @@ static void emit_depthbuffer(struct brw_context *brw)
 	 return;
       }
 
-      if (dri_bufmgr_check_aperture_space(aper_array, ARRAY_SIZE(aper_array)))
-	 intel_batchbuffer_flush(intel->batch);
-
       BEGIN_BATCH(len, IGNORE_CLIPRECTS);
       OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2));
       OUT_BATCH(((region->pitch * region->cpp) - 1) |
@@ -234,7 +259,7 @@ static void emit_depthbuffer(struct brw_context *brw)
 		((region->height - 1) << 19));
       OUT_BATCH(0);
 
-      if (BRW_IS_GM45(brw) || BRW_IS_G4X(brw))
+      if (BRW_IS_G4X(brw))
          OUT_BATCH(0);
 
       ADVANCE_BATCH();
@@ -247,6 +272,7 @@ const struct brw_tracked_state brw_depthbuffer = {
       .brw = BRW_NEW_DEPTH_BUFFER | BRW_NEW_BATCH,
       .cache = 0,
    },
+   .prepare = prepare_depthbuffer,
    .emit = emit_depthbuffer,
 };
 
@@ -318,7 +344,7 @@ static void upload_aa_line_parameters(struct brw_context *brw)
 {
    struct brw_aa_line_parameters balp;
    
-   if (!(BRW_IS_GM45(brw) || BRW_IS_G4X(brw)))
+   if (!BRW_IS_G4X(brw))
       return;
 
    /* use legacy aa line coverage computation */
diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c
index a1a1353dee..cb9169e2ee 100644
--- a/src/mesa/drivers/dri/i965/brw_queryobj.c
+++ b/src/mesa/drivers/dri/i965/brw_queryobj.c
@@ -42,6 +42,7 @@
 #include "main/imports.h"
 
 #include "brw_context.h"
+#include "brw_state.h"
 #include "intel_batchbuffer.h"
 #include "intel_reg.h"
 
@@ -163,10 +164,6 @@ void
 brw_prepare_query_begin(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
-   dri_bo *aper_array[] = {
-      intel->batch->buf,
-      brw->query.bo,
-   };
 
    /* Skip if we're not doing any queries. */
    if (is_empty_list(&brw->query.active_head))
@@ -182,8 +179,7 @@ brw_prepare_query_begin(struct brw_context *brw)
       brw->query.index = 0;
    }
 
-   if (dri_bufmgr_check_aperture_space(aper_array, ARRAY_SIZE(aper_array)))
-      intel_batchbuffer_flush(intel->batch);
+   brw_add_validated_bo(brw, brw->query.bo);
 }
 
 /** Called just before primitive drawing to get a beginning PS_DEPTH_COUNT. */
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 4c04036ef0..bb22c03eeb 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -35,6 +35,16 @@
 
 #include "brw_context.h"
 
+static inline void
+brw_add_validated_bo(struct brw_context *brw, dri_bo *bo)
+{
+   assert(brw->state.validated_bo_count < ARRAY_SIZE(brw->state.validated_bos));
+
+   if (bo != NULL) {
+      dri_bo_reference(bo);
+      brw->state.validated_bos[brw->state.validated_bo_count++] = bo;
+   }
+};
 
 const struct brw_tracked_state brw_blend_constant_color;
 const struct brw_tracked_state brw_cc_unit;
@@ -79,10 +89,19 @@ const struct brw_tracked_state brw_pipe_control;
 const struct brw_tracked_state brw_clear_surface_cache;
 const struct brw_tracked_state brw_clear_batch_cache;
 
+const struct brw_tracked_state brw_drawing_rect;
 const struct brw_tracked_state brw_indices;
 const struct brw_tracked_state brw_vertices;
 
 /***********************************************************************
+ * brw_state.c
+ */
+void brw_validate_state(struct brw_context *brw);
+void brw_upload_state(struct brw_context *brw);
+void brw_init_state(struct brw_context *brw);
+void brw_destroy_state(struct brw_context *brw);
+
+/***********************************************************************
  * brw_state_cache.c
  */
 dri_bo *brw_cache_data(struct brw_cache *cache,
diff --git a/src/mesa/drivers/dri/i965/brw_state_batch.c b/src/mesa/drivers/dri/i965/brw_state_batch.c
index 94ef924868..dc87859f3f 100644
--- a/src/mesa/drivers/dri/i965/brw_state_batch.c
+++ b/src/mesa/drivers/dri/i965/brw_state_batch.c
@@ -97,8 +97,6 @@ void brw_clear_batch_cache_flush( struct brw_context *brw )
 {
    clear_batch_cache(brw);
 
-   brw->wrap = 0;
-   
 /*    brw_do_flush(brw, BRW_FLUSH_STATE_CACHE|BRW_FLUSH_READ_CACHE); */
    
    brw->state.dirty.mesa |= ~0;
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index b6a52843a8..7a642bd2a8 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -45,7 +45,6 @@ const struct brw_tracked_state *atoms[] =
 {
    &brw_check_fallback,
 
-   &brw_active_vertprog,
    &brw_wm_input_sizes,
    &brw_vs_prog,
    &brw_gs_prog, 
@@ -99,6 +98,7 @@ const struct brw_tracked_state *atoms[] =
    &brw_psp_urb_cbs,
 #endif
 
+   &brw_drawing_rect,
    &brw_indices,
    &brw_vertices,
 
@@ -168,6 +168,18 @@ static void xor_states( struct brw_state_flags *result,
    result->cache = a->cache ^ b->cache;
 }
 
+static void
+brw_clear_validated_bos(struct brw_context *brw)
+{
+   int i;
+
+   /* Clear the last round of validated bos */
+   for (i = 0; i < brw->state.validated_bo_count; i++) {
+      dri_bo_unreference(brw->state.validated_bos[i]);
+      brw->state.validated_bos[i] = NULL;
+   }
+   brw->state.validated_bo_count = 0;
+}
 
 /***********************************************************************
  * Emit all state:
@@ -176,14 +188,14 @@ void brw_validate_state( struct brw_context *brw )
 {
    struct intel_context *intel = &brw->intel;
    struct brw_state_flags *state = &brw->state.dirty;
-   GLuint i, count, pass = 0;
-   dri_bo *last_batch_bo = NULL;
+   GLuint i;
+
+   brw_clear_validated_bos(brw);
 
    state->mesa |= brw->intel.NewGLState;
    brw->intel.NewGLState = 0;
 
-   if (brw->wrap)
-      state->brw |= BRW_NEW_CONTEXT;
+   brw_add_validated_bo(brw, intel->batch->buf);
 
    if (brw->emit_state_always) {
       state->mesa |= ~0;
@@ -199,6 +211,10 @@ void brw_validate_state( struct brw_context *brw )
       brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM;
    }
 
+   if (brw->vertex_program != brw->attribs.VertexProgram->_Current) {
+      brw->vertex_program = brw->attribs.VertexProgram->_Current;
+      brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM;
+   }
 
    if (state->mesa == 0 &&
        state->cache == 0 &&
@@ -210,8 +226,6 @@ void brw_validate_state( struct brw_context *brw )
 
    brw->intel.Fallback = 0;
 
-   count = 0;
-
    /* do prepare stage for all atoms */
    for (i = 0; i < Elements(atoms); i++) {
       const struct brw_tracked_state *atom = brw->state.atoms[i];
@@ -225,19 +239,15 @@ void brw_validate_state( struct brw_context *brw )
         }
       }
    }
+}
 
-   if (brw->intel.Fallback)
-      return;
 
-   /* We're about to try to set up a coherent state in the batchbuffer for
-    * the emission of primitives.  If we exceed the aperture size in any of the
-    * emit() calls, we need to go back to square 1 and try setting up again.
-    */
-got_flushed:
-   dri_bo_unreference(last_batch_bo);
-   last_batch_bo = intel->batch->buf;
-   dri_bo_reference(last_batch_bo);
-   assert(pass++ <= 2);
+void brw_upload_state(struct brw_context *brw)
+{
+   struct brw_state_flags *state = &brw->state.dirty;
+   int i;
+
+   brw_clear_validated_bos(brw);
 
    if (INTEL_DEBUG) {
       /* Debug version which enforces various sanity checks on the
@@ -262,8 +272,6 @@ got_flushed:
 	 if (check_state(state, &atom->dirty)) {
 	    if (atom->emit) {
 	       atom->emit( brw );
-	       if (intel->batch->buf != last_batch_bo)
-		  goto got_flushed;
 	    }
 	 }
 
@@ -288,15 +296,11 @@ got_flushed:
 	 if (check_state(state, &atom->dirty)) {
 	    if (atom->emit) {
 	       atom->emit( brw );
-	       if (intel->batch->buf != last_batch_bo)
-		  goto got_flushed;
 	    }
 	 }
       }
    }
 
-   dri_bo_unreference(last_batch_bo);
-
    if (!brw->intel.Fallback)
       memset(state, 0, sizeof(*state));
 }
diff --git a/src/mesa/drivers/dri/i965/brw_structs.h b/src/mesa/drivers/dri/i965/brw_structs.h
index ec865c925a..4e577d0f6a 100644
--- a/src/mesa/drivers/dri/i965/brw_structs.h
+++ b/src/mesa/drivers/dri/i965/brw_structs.h
@@ -175,7 +175,7 @@ struct brw_depthbuffer
    } dword4;
 };
 
-struct brw_depthbuffer_gm45_g4x
+struct brw_depthbuffer_g4x
 {
    union header_union header;
    
@@ -1405,7 +1405,7 @@ struct brw_instruction
          GLuint msg_target:4;
          GLuint pad1:3;
          GLuint end_of_thread:1;
-      } sampler_gm45_g4x; 
+      } sampler_g4x;
 
       struct brw_urb_immediate urb;
 
diff --git a/src/mesa/drivers/dri/i965/brw_urb.c b/src/mesa/drivers/dri/i965/brw_urb.c
index 1116ade0a4..1a004176de 100644
--- a/src/mesa/drivers/dri/i965/brw_urb.c
+++ b/src/mesa/drivers/dri/i965/brw_urb.c
@@ -92,9 +92,9 @@ static void recalculate_urb_fence( struct brw_context *brw )
    if (brw->urb.vsize < vsize ||
        brw->urb.sfsize < sfsize ||
        brw->urb.csize < csize ||
-       (brw->urb.constrained && (brw->urb.vsize > brw->urb.vsize ||
-				 brw->urb.sfsize > brw->urb.sfsize ||
-				 brw->urb.csize > brw->urb.csize))) {
+       (brw->urb.constrained && (brw->urb.vsize > vsize ||
+				 brw->urb.sfsize > sfsize ||
+				 brw->urb.csize > csize))) {
       
 
       brw->urb.csize = csize;
@@ -114,6 +114,10 @@ static void recalculate_urb_fence( struct brw_context *brw )
 	 brw->urb.nr_sf_entries = limits[SF].min_nr_entries;	
 	 brw->urb.nr_cs_entries = limits[CS].min_nr_entries;	
 
+	 /* Mark us as operating with constrained nr_entries, so that next
+	  * time we recalculate we'll resize the fences in the hope of
+	  * escaping constrained mode and getting back to normal performance.
+	  */
 	 brw->urb.constrained = 1;
 	 
 	 if (!check_urb_layout(brw)) {
diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c
index 9de05408ba..25b4ee85cb 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c
@@ -818,8 +818,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
    }
 
 
-   /* Build ndc coords?   TODO: Shortcircuit when w is known to be one.
-    */
+   /* Build ndc coords */
    if (!c->key.know_w_is_one) {
       ndc = get_tmp(c);
       emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
@@ -829,12 +828,12 @@ static void emit_vertex_write( struct brw_vs_compile *c)
       ndc = pos;
    }
 
-   /* This includes the workaround for -ve rhw, so is no longer an
-    * optional step:
+   /* Update the header for point size, user clipping flags, and -ve rhw
+    * workaround.
     */
    if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
        c->key.nr_userclip ||
-       !c->key.know_w_is_one)
+       (!BRW_IS_G4X(p->brw) && !c->key.know_w_is_one))
    {
       struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
       GLuint i;
@@ -867,7 +866,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
        * Later, clipping will detect ucp[6] and ensure the primitive is
        * clipped against all fixed planes.
        */
-      if (!(BRW_IS_GM45(p->brw) || BRW_IS_G4X(p->brw)) && !c->key.know_w_is_one) {
+      if (!BRW_IS_G4X(p->brw) && !c->key.know_w_is_one) {
 	 brw_CMP(p,
 		 vec8(brw_null_reg()),
 		 BRW_CONDITIONAL_L,
diff --git a/src/mesa/drivers/dri/i965/brw_vtbl.c b/src/mesa/drivers/dri/i965/brw_vtbl.c
index a64e437860..2d4c81274e 100644
--- a/src/mesa/drivers/dri/i965/brw_vtbl.c
+++ b/src/mesa/drivers/dri/i965/brw_vtbl.c
@@ -62,7 +62,6 @@ dri_bo_release(dri_bo **bo)
  */
 static void brw_destroy_context( struct intel_context *intel )
 {
-   GLcontext *ctx = &intel->ctx;
    struct brw_context *brw = brw_context(&intel->ctx);
    int i;
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h
index 297617ee2d..896390c17b 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.h
+++ b/src/mesa/drivers/dri/i965/brw_wm.h
@@ -157,6 +157,7 @@ struct brw_wm_instruction {
 #define BRW_WM_MAX_PARAM 256
 #define BRW_WM_MAX_CONST 256
 #define BRW_WM_MAX_KILLS MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS
+#define BRW_WM_MAX_SUBROUTINE 16
 
 
 
@@ -246,7 +247,10 @@ struct brw_wm_compile {
    struct brw_reg stack;
    struct brw_reg emit_mask_reg;
    GLuint reg_index;
+   GLuint tmp_regs[BRW_WM_MAX_GRF];
    GLuint tmp_index;
+   GLuint tmp_max;
+   GLuint subroutines[BRW_WM_MAX_SUBROUTINE];
 };
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_glsl.c b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
index 4d5e11f4b6..cb728190f5 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_glsl.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
@@ -4,6 +4,10 @@
 #include "brw_eu.h"
 #include "brw_wm.h"
 
+enum _subroutine {
+    SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
+};
+
 /* Only guess, need a flag in gl_fragment_program later */
 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
 {
@@ -12,13 +16,17 @@ GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
 	struct prog_instruction *inst = &fp->Base.Instructions[i];
 	switch (inst->Opcode) {
 	    case OPCODE_IF:
-	    case OPCODE_INT:
+	    case OPCODE_TRUNC:
 	    case OPCODE_ENDIF:
 	    case OPCODE_CAL:
 	    case OPCODE_BRK:
 	    case OPCODE_RET:
 	    case OPCODE_DDX:
 	    case OPCODE_DDY:
+	    case OPCODE_NOISE1:
+	    case OPCODE_NOISE2:
+	    case OPCODE_NOISE3:
+	    case OPCODE_NOISE4:
 	    case OPCODE_BGNLOOP:
 		return GL_TRUE; 
 	    default:
@@ -47,13 +55,26 @@ static int get_scalar_dst_index(struct prog_instruction *inst)
 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
 {
     struct brw_reg reg;
-    reg = brw_vec8_grf(c->tmp_index--, 0);
+    if(c->tmp_index == c->tmp_max)
+	c->tmp_regs[ c->tmp_max++ ] = c->reg_index++;
+    
+    reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
     return reg;
 }
 
-static void release_tmps(struct brw_wm_compile *c)
+static int mark_tmps(struct brw_wm_compile *c)
+{
+    return c->tmp_index;
+}
+
+static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
+{
+    return brw_vec8_grf( c->tmp_regs[ index ], 0 );
+}
+
+static void release_tmps(struct brw_wm_compile *c, int mark)
 {
-    c->tmp_index = 127;
+    c->tmp_index = mark;
 }
 
 static struct brw_reg 
@@ -155,6 +176,68 @@ static struct brw_reg get_src_reg(struct brw_wm_compile *c,
 	    src->NegateBase, src->Abs);
 }
 
+/* Subroutines are minimal support for resusable instruction sequences.
+   They are implemented as simply as possible to minimise overhead: there
+   is no explicit support for communication between the caller and callee
+   other than saving the return address in a temporary register, nor is
+   there any automatic local storage.  This implies that great care is
+   required before attempting reentrancy or any kind of nested
+   subroutine invocations. */
+static void invoke_subroutine( struct brw_wm_compile *c,
+			       enum _subroutine subroutine,
+			       void (*emit)( struct brw_wm_compile * ) )
+{
+    struct brw_compile *p = &c->func;
+
+    assert( subroutine < BRW_WM_MAX_SUBROUTINE );
+    
+    if( c->subroutines[ subroutine ] ) {
+	/* subroutine previously emitted: reuse existing instructions */
+
+	int mark = mark_tmps( c );
+	struct brw_reg return_address = retype( alloc_tmp( c ),
+						BRW_REGISTER_TYPE_UD );
+	int here = p->nr_insn;
+	
+	brw_push_insn_state(p);
+	brw_set_mask_control(p, BRW_MASK_DISABLE);
+	brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
+
+	brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
+		 brw_imm_d( ( c->subroutines[ subroutine ] -
+			      here - 1 ) << 4 ) );
+	brw_pop_insn_state(p);
+
+	release_tmps( c, mark );
+    } else {
+	/* previously unused subroutine: emit, and mark for later reuse */
+	
+	int mark = mark_tmps( c );
+	struct brw_reg return_address = retype( alloc_tmp( c ),
+						BRW_REGISTER_TYPE_UD );
+	struct brw_instruction *calc;
+	int base = p->nr_insn;
+	
+	brw_push_insn_state(p);
+	brw_set_mask_control(p, BRW_MASK_DISABLE);
+	calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
+	brw_pop_insn_state(p);
+	
+	c->subroutines[ subroutine ] = p->nr_insn;
+
+	emit( c );
+	
+	brw_push_insn_state(p);
+	brw_set_mask_control(p, BRW_MASK_DISABLE);
+	brw_MOV( p, brw_ip_reg(), return_address );
+	brw_pop_insn_state(p);
+
+	brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
+	
+	release_tmps( c, mark );
+    }
+}
+
 static void emit_abs( struct brw_wm_compile *c,
 		struct prog_instruction *inst)
 {
@@ -172,7 +255,7 @@ static void emit_abs( struct brw_wm_compile *c,
     brw_set_saturate(p, 0);
 }
 
-static void emit_int( struct brw_wm_compile *c,
+static void emit_trunc( struct brw_wm_compile *c,
 		struct prog_instruction *inst)
 {
     int i;
@@ -778,6 +861,7 @@ static void emit_lrp(struct brw_wm_compile *c,
     GLuint mask = inst->DstReg.WriteMask;
     struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
     int i;
+    int mark = mark_tmps(c);
     for (i = 0; i < 4; i++) {
 	if (mask & (1<<i)) {
 	    dst = get_dst_reg(c, inst, i, 1);
@@ -804,7 +888,7 @@ static void emit_lrp(struct brw_wm_compile *c,
 	    brw_MAC(p, dst, src0, tmp1);
 	    brw_set_saturate(p, 0);
 	}
-	release_tmps(c);
+	release_tmps(c, mark);
     }
 }
 
@@ -957,6 +1041,633 @@ static void emit_ddy(struct brw_wm_compile *c,
     brw_set_saturate(p, 0);
 }
 
+static __inline struct brw_reg high_words( struct brw_reg reg )
+{
+    return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
+		   0, 8, 2 );
+}
+
+static __inline struct brw_reg low_words( struct brw_reg reg )
+{
+    return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
+}
+
+static __inline struct brw_reg even_bytes( struct brw_reg reg )
+{
+    return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
+}
+
+static __inline struct brw_reg odd_bytes( struct brw_reg reg )
+{
+    return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
+		   0, 16, 2 );
+}
+
+/* One-, two- and three-dimensional Perlin noise, similar to the description
+   in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
+static void noise1_sub( struct brw_wm_compile *c ) {
+
+    struct brw_compile *p = &c->func;
+    struct brw_reg param,
+	x0, x1, /* gradients at each end */       
+	t, tmp[ 2 ], /* float temporaries */
+	itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
+    int i;
+    int mark = mark_tmps( c );
+
+    x0 = alloc_tmp( c );
+    x1 = alloc_tmp( c );
+    t = alloc_tmp( c );
+    tmp[ 0 ] = alloc_tmp( c );
+    tmp[ 1 ] = alloc_tmp( c );
+    itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
+    itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
+    itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
+    itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
+    itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
+    
+    param = lookup_tmp( c, mark - 2 );
+
+    brw_set_access_mode( p, BRW_ALIGN_1 );
+
+    brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
+
+    /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
+       be hashed.  Also compute the remainder (offset within the unit
+       length), interleaved to reduce register dependency penalties. */
+    brw_RNDD( p, itmp[ 0 ], param );
+    brw_FRC( p, param, param );
+    brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
+    brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
+    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
+
+    /* We're now ready to perform the hashing.  The two hashes are
+       interleaved for performance.  The hash function used is
+       designed to rapidly achieve avalanche and require only 32x16
+       bit multiplication, and 16-bit swizzles (which we get for
+       free).  We can't use immediate operands in the multiplies,
+       because immediates are permitted only in src1 and the 16-bit
+       factor is permitted only in src0. */
+    for( i = 0; i < 2; i++ )
+	brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
+    for( i = 0; i < 2; i++ )
+       brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
+		high_words( itmp[ i ] ) );
+    for( i = 0; i < 2; i++ )
+	brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
+    for( i = 0; i < 2; i++ )
+       brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
+		high_words( itmp[ i ] ) );
+    for( i = 0; i < 2; i++ )
+	brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
+    for( i = 0; i < 2; i++ )
+       brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
+		high_words( itmp[ i ] ) );
+
+    /* Now we want to initialise the two gradients based on the
+       hashes.  Format conversion from signed integer to float leaves
+       everything scaled too high by a factor of pow( 2, 31 ), but
+       we correct for that right at the end. */
+    brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
+    brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
+    brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
+
+    brw_MUL( p, x0, x0, param );
+    brw_MUL( p, x1, x1, t );
+    
+    /* We interpolate between the gradients using the polynomial
+       6t^5 - 15t^4 + 10t^3 (Perlin). */
+    brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
+    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
+    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
+    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
+    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
+    brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
+					   pipeline */
+    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
+    brw_MUL( p, param, tmp[ 0 ], param );
+    brw_MUL( p, x1, x1, param );
+    brw_ADD( p, x0, x0, x1 );    
+    /* scale by pow( 2, -30 ), to compensate for the format conversion
+       above and an extra factor of 2 so that a single gradient covers
+       the [-1,1] range */
+    brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
+
+    release_tmps( c, mark );
+}
+
+static void emit_noise1( struct brw_wm_compile *c,
+			 struct prog_instruction *inst )
+{
+    struct brw_compile *p = &c->func;
+    struct brw_reg src, param, dst;
+    GLuint mask = inst->DstReg.WriteMask;
+    int i;
+    int mark = mark_tmps( c );
+
+    assert( mark == 0 );
+    
+    src = get_src_reg( c, inst->SrcReg, 0, 1 );
+
+    param = alloc_tmp( c );
+
+    brw_MOV( p, param, src );
+
+    invoke_subroutine( c, SUB_NOISE1, noise1_sub );
+    
+    /* Fill in the result: */
+    brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
+    for (i = 0 ; i < 4; i++) {
+	if (mask & (1<<i)) {
+	    dst = get_dst_reg(c, inst, i, 1);
+	    brw_MOV( p, dst, param );
+	}
+    }
+    if( inst->SaturateMode == SATURATE_ZERO_ONE )
+	brw_set_saturate( p, 0 );
+    
+    release_tmps( c, mark );
+}
+    
+static void noise2_sub( struct brw_wm_compile *c ) {
+
+    struct brw_compile *p = &c->func;
+    struct brw_reg param0, param1,
+	x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */       
+	t, tmp[ 4 ], /* float temporaries */
+	itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
+    int i;
+    int mark = mark_tmps( c );
+
+    x0y0 = alloc_tmp( c );
+    x0y1 = alloc_tmp( c );
+    x1y0 = alloc_tmp( c );
+    x1y1 = alloc_tmp( c );
+    t = alloc_tmp( c );
+    for( i = 0; i < 4; i++ ) {
+	tmp[ i ] = alloc_tmp( c );
+	itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
+    }
+    itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
+    itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
+    itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
+    
+    param0 = lookup_tmp( c, mark - 3 );
+    param1 = lookup_tmp( c, mark - 2 );
+
+    brw_set_access_mode( p, BRW_ALIGN_1 );
+    
+    /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
+       be hashed.  Also compute the remainders (offsets within the unit
+       square), interleaved to reduce register dependency penalties. */
+    brw_RNDD( p, itmp[ 0 ], param0 );
+    brw_RNDD( p, itmp[ 1 ], param1 );
+    brw_FRC( p, param0, param0 );
+    brw_FRC( p, param1, param1 );
+    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
+    brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
+	     low_words( itmp[ 1 ] ) );
+    brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
+    brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
+    brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
+    brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
+    brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
+
+    /* We're now ready to perform the hashing.  The four hashes are
+       interleaved for performance.  The hash function used is
+       designed to rapidly achieve avalanche and require only 32x16
+       bit multiplication, and 16-bit swizzles (which we get for
+       free).  We can't use immediate operands in the multiplies,
+       because immediates are permitted only in src1 and the 16-bit
+       factor is permitted only in src0. */
+    for( i = 0; i < 4; i++ )
+	brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
+    for( i = 0; i < 4; i++ )
+	brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
+		 high_words( itmp[ i ] ) );
+    for( i = 0; i < 4; i++ )
+	brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
+    for( i = 0; i < 4; i++ )
+	brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
+		 high_words( itmp[ i ] ) );
+    for( i = 0; i < 4; i++ )
+	brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
+    for( i = 0; i < 4; i++ )
+	brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
+		 high_words( itmp[ i ] ) );
+
+    /* Now we want to initialise the four gradients based on the
+       hashes.  Format conversion from signed integer to float leaves
+       everything scaled too high by a factor of pow( 2, 15 ), but
+       we correct for that right at the end. */
+    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
+    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
+    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
+    brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
+    brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
+    
+    brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
+    brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
+    brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
+    brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
+    
+    brw_MUL( p, x1y0, x1y0, t );
+    brw_MUL( p, x1y1, x1y1, t );
+    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
+    brw_MUL( p, x0y0, x0y0, param0 );
+    brw_MUL( p, x0y1, x0y1, param0 );
+
+    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
+    brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
+    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
+    brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
+
+    brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
+    brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
+    brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
+    brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
+    
+    /* We interpolate between the gradients using the polynomial
+       6t^5 - 15t^4 + 10t^3 (Perlin). */
+    brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
+    brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
+    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
+    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
+    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
+    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
+    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
+						 pipeline */
+    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
+    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
+    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
+    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
+    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
+						 pipeline */
+    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
+    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
+    brw_MUL( p, param0, tmp[ 0 ], param0 );
+    brw_MUL( p, param1, tmp[ 1 ], param1 );
+    
+    /* Here we interpolate in the y dimension... */
+    brw_MUL( p, x0y1, x0y1, param1 );
+    brw_MUL( p, x1y1, x1y1, param1 );
+    brw_ADD( p, x0y0, x0y0, x0y1 );
+    brw_ADD( p, x1y0, x1y0, x1y1 );
+
+    /* And now in x.  There are horrible register dependencies here,
+       but we have nothing else to do. */
+    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
+    brw_MUL( p, x1y0, x1y0, param0 );
+    brw_ADD( p, x0y0, x0y0, x1y0 );
+    
+    /* scale by pow( 2, -15 ), as described above */
+    brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
+
+    release_tmps( c, mark );
+}
+
+static void emit_noise2( struct brw_wm_compile *c,
+			 struct prog_instruction *inst )
+{
+    struct brw_compile *p = &c->func;
+    struct brw_reg src0, src1, param0, param1, dst;
+    GLuint mask = inst->DstReg.WriteMask;
+    int i;
+    int mark = mark_tmps( c );
+
+    assert( mark == 0 );
+    
+    src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
+    src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
+
+    param0 = alloc_tmp( c );
+    param1 = alloc_tmp( c );
+
+    brw_MOV( p, param0, src0 );
+    brw_MOV( p, param1, src1 );
+
+    invoke_subroutine( c, SUB_NOISE2, noise2_sub );
+    
+    /* Fill in the result: */
+    brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
+    for (i = 0 ; i < 4; i++) {
+	if (mask & (1<<i)) {
+	    dst = get_dst_reg(c, inst, i, 1);
+	    brw_MOV( p, dst, param0 );
+	}
+    }
+    if( inst->SaturateMode == SATURATE_ZERO_ONE )
+	brw_set_saturate( p, 0 );
+    
+    release_tmps( c, mark );
+}
+
+/* The three-dimensional case is much like the one- and two- versions above,
+   but since the number of corners is rapidly growing we now pack 16 16-bit
+   hashes into each register to extract more parallelism from the EUs. */
+static void noise3_sub( struct brw_wm_compile *c ) {
+
+    struct brw_compile *p = &c->func;
+    struct brw_reg param0, param1, param2,
+	x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
+	xi, yi, zi, /* interpolation coefficients */
+	t, tmp[ 8 ], /* float temporaries */
+	itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
+	wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
+    int i;
+    int mark = mark_tmps( c );
+
+    x0y0 = alloc_tmp( c );
+    x0y1 = alloc_tmp( c );
+    x1y0 = alloc_tmp( c );
+    x1y1 = alloc_tmp( c );
+    xi = alloc_tmp( c );
+    yi = alloc_tmp( c );
+    zi = alloc_tmp( c );
+    t = alloc_tmp( c );
+    for( i = 0; i < 8; i++ ) {
+	tmp[ i ] = alloc_tmp( c );
+	itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
+	wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
+    }
+    
+    param0 = lookup_tmp( c, mark - 4 );
+    param1 = lookup_tmp( c, mark - 3 );
+    param2 = lookup_tmp( c, mark - 2 );
+
+    brw_set_access_mode( p, BRW_ALIGN_1 );
+    
+    /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
+       be hashed.  Also compute the remainders (offsets within the unit
+       cube), interleaved to reduce register dependency penalties. */
+    brw_RNDD( p, itmp[ 0 ], param0 );
+    brw_RNDD( p, itmp[ 1 ], param1 );
+    brw_RNDD( p, itmp[ 2 ], param2 );
+    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBC8F ) ); /* constant used later */
+    brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0xD0BD ) ); /* constant used later */
+    brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0x9B93 ) ); /* constant used later */
+    brw_FRC( p, param0, param0 );
+    brw_FRC( p, param1, param1 );
+    brw_FRC( p, param2, param2 );
+    /* Since we now have only 16 bits of precision in the hash, we must
+       be more careful about thorough mixing to maintain entropy as we
+       squash the input vector into a small scalar. */
+    brw_MUL( p, brw_acc_reg(), itmp[ 4 ], itmp[ 0 ] );
+    brw_MAC( p, brw_acc_reg(), itmp[ 5 ], itmp[ 1 ] );
+    brw_MAC( p, itmp[ 0 ], itmp[ 6 ], itmp[ 2 ] );
+    brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
+	     brw_imm_uw( 0xBC8F ) );
+
+    /* Temporarily disable the execution mask while we work with ExecSize=16
+       channels (the mask is set for ExecSize=8 and is probably incorrect).
+       Although this might cause execution of unwanted channels, the code
+       writes only to temporary registers and has no side effects, so
+       disabling the mask is harmless. */
+    brw_push_insn_state( p );
+    brw_set_mask_control( p, BRW_MASK_DISABLE );
+    brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
+    brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
+    brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
+
+    /* We're now ready to perform the hashing.  The eight hashes are
+       interleaved for performance.  The hash function used is
+       designed to rapidly achieve avalanche and require only 16x16
+       bit multiplication, and 8-bit swizzles (which we get for
+       free). */
+    for( i = 0; i < 4; i++ )
+	brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
+    for( i = 0; i < 4; i++ )
+	brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
+		 odd_bytes( wtmp[ i ] ) );
+    for( i = 0; i < 4; i++ )
+	brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
+    for( i = 0; i < 4; i++ )
+	brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
+		 odd_bytes( wtmp[ i ] ) );
+    brw_pop_insn_state( p );
+
+    /* Now we want to initialise the four rear gradients based on the
+       hashes.  Format conversion from signed integer to float leaves
+       everything scaled too high by a factor of pow( 2, 15 ), but
+       we correct for that right at the end. */
+    /* x component */
+    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
+    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
+    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
+    brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
+    brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
+
+    brw_push_insn_state( p );
+    brw_set_mask_control( p, BRW_MASK_DISABLE );
+    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
+    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
+    brw_pop_insn_state( p );
+    
+    brw_MUL( p, x1y0, x1y0, t );
+    brw_MUL( p, x1y1, x1y1, t );
+    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
+    brw_MUL( p, x0y0, x0y0, param0 );
+    brw_MUL( p, x0y1, x0y1, param0 );
+
+    /* y component */
+    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
+    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
+    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
+    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
+    
+    brw_push_insn_state( p );
+    brw_set_mask_control( p, BRW_MASK_DISABLE );
+    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
+    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
+    brw_pop_insn_state( p );
+
+    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
+    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
+    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
+    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
+    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
+    
+    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
+    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
+    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
+    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
+    
+    /* z component */
+    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
+    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
+    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
+    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
+
+    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
+    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
+    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
+    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
+    
+    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
+    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
+    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
+    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
+    
+    /* We interpolate between the gradients using the polynomial
+       6t^5 - 15t^4 + 10t^3 (Perlin). */
+    brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
+    brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
+    brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
+    brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
+    brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
+    brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
+    brw_MUL( p, xi, xi, param0 );
+    brw_MUL( p, yi, yi, param1 );
+    brw_MUL( p, zi, zi, param2 );
+    brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
+    brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
+    brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
+    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
+    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
+    brw_MUL( p, xi, xi, param0 );
+    brw_MUL( p, yi, yi, param1 );
+    brw_MUL( p, zi, zi, param2 );
+    brw_MUL( p, xi, xi, param0 );
+    brw_MUL( p, yi, yi, param1 );
+    brw_MUL( p, zi, zi, param2 );
+    brw_MUL( p, xi, xi, param0 );
+    brw_MUL( p, yi, yi, param1 );
+    brw_MUL( p, zi, zi, param2 );
+    
+    /* Here we interpolate in the y dimension... */
+    brw_MUL( p, x0y1, x0y1, yi );
+    brw_MUL( p, x1y1, x1y1, yi );
+    brw_ADD( p, x0y0, x0y0, x0y1 );
+    brw_ADD( p, x1y0, x1y0, x1y1 );
+
+    /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
+    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
+    brw_MUL( p, x1y0, x1y0, xi );
+    brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
+
+    /* Now do the same thing for the front four gradients... */
+    /* x component */
+    brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
+    brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
+    brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
+    brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
+
+    brw_push_insn_state( p );
+    brw_set_mask_control( p, BRW_MASK_DISABLE );
+    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
+    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
+    brw_pop_insn_state( p );
+
+    brw_MUL( p, x1y0, x1y0, t );
+    brw_MUL( p, x1y1, x1y1, t );
+    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
+    brw_MUL( p, x0y0, x0y0, param0 );
+    brw_MUL( p, x0y1, x0y1, param0 );
+
+    /* y component */
+    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
+    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
+    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
+    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
+    
+    brw_push_insn_state( p );
+    brw_set_mask_control( p, BRW_MASK_DISABLE );
+    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
+    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
+    brw_pop_insn_state( p );
+
+    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
+    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
+    brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
+    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
+    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
+    
+    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
+    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
+    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
+    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
+    
+    /* z component */
+    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
+    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
+    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
+    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
+
+    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
+    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
+    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
+    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
+    
+    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
+    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
+    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
+    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
+    
+    /* The interpolation coefficients are still around from last time, so
+       again interpolate in the y dimension... */
+    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
+    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
+    brw_MUL( p, x0y1, x0y1, yi );
+    brw_MUL( p, x1y1, x1y1, yi );
+    brw_ADD( p, x0y0, x0y0, x0y1 );
+    brw_ADD( p, x1y0, x1y0, x1y1 );
+
+    /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
+       time put the front face in tmp[ 1 ] and we're nearly there... */
+    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
+    brw_MUL( p, x1y0, x1y0, xi );
+    brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
+
+    /* The final interpolation, in the z dimension: */
+    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );    
+    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
+    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
+    
+    /* scale by pow( 2, -15 ), as described above */
+    brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
+
+    release_tmps( c, mark );
+}
+
+static void emit_noise3( struct brw_wm_compile *c,
+			 struct prog_instruction *inst )
+{
+    struct brw_compile *p = &c->func;
+    struct brw_reg src0, src1, src2, param0, param1, param2, dst;
+    GLuint mask = inst->DstReg.WriteMask;
+    int i;
+    int mark = mark_tmps( c );
+
+    assert( mark == 0 );
+    
+    src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
+    src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
+    src2 = get_src_reg( c, inst->SrcReg, 2, 1 );
+
+    param0 = alloc_tmp( c );
+    param1 = alloc_tmp( c );
+    param2 = alloc_tmp( c );
+
+    brw_MOV( p, param0, src0 );
+    brw_MOV( p, param1, src1 );
+    brw_MOV( p, param2, src2 );
+
+    invoke_subroutine( c, SUB_NOISE3, noise3_sub );
+    
+    /* Fill in the result: */
+    brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
+    for (i = 0 ; i < 4; i++) {
+	if (mask & (1<<i)) {
+	    dst = get_dst_reg(c, inst, i, 1);
+	    brw_MOV( p, dst, param0 );
+	}
+    }
+    if( inst->SaturateMode == SATURATE_ZERO_ONE )
+	brw_set_saturate( p, 0 );
+    
+    release_tmps( c, mark );
+}
+    
 static void emit_wpos_xy(struct brw_wm_compile *c,
                 struct prog_instruction *inst)
 {
@@ -1201,8 +1912,8 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 	    case OPCODE_LRP:
 		emit_lrp(c, inst);
 		break;
-	    case OPCODE_INT:
-		emit_int(c, inst);
+	    case OPCODE_TRUNC:
+		emit_trunc(c, inst);
 		break;
 	    case OPCODE_MOV:
 		emit_mov(c, inst);
@@ -1276,6 +1987,17 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 	    case OPCODE_MAD:
 		emit_mad(c, inst);
 		break;
+	    case OPCODE_NOISE1:
+		emit_noise1(c, inst);
+		break;
+	    case OPCODE_NOISE2:
+		emit_noise2(c, inst);
+		break;
+	    case OPCODE_NOISE3:
+		emit_noise3(c, inst);
+		break;
+	    /* case OPCODE_NOISE4: */
+		/* not yet implemented */
 	    case OPCODE_TEX:
 		emit_tex(c, inst);
 		break;
@@ -1368,7 +2090,6 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
 {
     brw_wm_pass_fp(c);
-    c->tmp_index = 127;
     brw_wm_emit_glsl(brw, c);
     c->prog_data.total_grf = c->reg_index;
     c->prog_data.total_scratch = 0;