33 files changed, 1040 insertions, 491 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_cc.c b/src/mesa/drivers/dri/i965/brw_cc.c
index 82370162f5..c724218cf5 100644
--- a/src/mesa/drivers/dri/i965/brw_cc.c
+++ b/src/mesa/drivers/dri/i965/brw_cc.c
@@ -88,7 +88,7 @@ cc_unit_populate_key(struct brw_context *brw, struct brw_cc_unit_key *key)
 
    memset(key, 0, sizeof(*key));
 
-   key->stencil = ctx->Stencil.Enabled;
+   key->stencil = ctx->Stencil._Enabled;
    key->stencil_two_side = ctx->Stencil._TestTwoSide;
 
    if (key->stencil) {
diff --git a/src/mesa/drivers/dri/i965/brw_clip_line.c b/src/mesa/drivers/dri/i965/brw_clip_line.c
index c45d48dff8..d830e49e50 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_line.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_line.c
@@ -181,34 +181,54 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
 	 brw_DP4(p, vec4(c->reg.dp1), deref_4f(vtx1, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation);
 	 is_negative = brw_IF(p, BRW_EXECUTE_1);
 	 {
-	    brw_ADD(p, c->reg.t, c->reg.dp1, negate(c->reg.dp0));
-	    brw_math_invert(p, c->reg.t, c->reg.t);
-	    brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp1);
-
-	    brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t1 );
-	    brw_MOV(p, c->reg.t1, c->reg.t);
-	    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+             /*
+              * Both can be negative on GM965/G965 due to RHW workaround
+              * if so, this object should be rejected.
+              */
+             if (!BRW_IS_G4X(p->brw)) {
+                 brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_LE, c->reg.dp0, brw_imm_f(0.0));
+                 is_neg2 = brw_IF(p, BRW_EXECUTE_1);
+                 {
+                     brw_clip_kill_thread(c);
+                 }
+                 brw_ENDIF(p, is_neg2);
+             }
+
+             brw_ADD(p, c->reg.t, c->reg.dp1, negate(c->reg.dp0));
+             brw_math_invert(p, c->reg.t, c->reg.t);
+             brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp1);
+
+             brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t1 );
+             brw_MOV(p, c->reg.t1, c->reg.t);
+             brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 	 } 
 	 is_negative = brw_ELSE(p, is_negative);
 	 {
-	    /* Coming back in.  We know that both cannot be negative
-	     * because the line would have been culled in that case.
-	     */
+             /* Coming back in.  We know that both cannot be negative
+              * because the line would have been culled in that case.
+              */
+
+             /* If both are positive, do nothing */
+             /* Only on GM965/G965 */
+             if (!BRW_IS_G4X(p->brw)) {
+                 brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.dp0, brw_imm_f(0.0));
+                 is_neg2 = brw_IF(p, BRW_EXECUTE_1);
+             }
 
-	    /* If both are positive, do nothing */
-             brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.dp0, brw_imm_f(0.0));
-             is_neg2 = brw_IF(p, BRW_EXECUTE_1);
              {
-		brw_ADD(p, c->reg.t, c->reg.dp0, negate(c->reg.dp1));
-		brw_math_invert(p, c->reg.t, c->reg.t);
-		brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp0);
-
-		brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t0 );
-		brw_MOV(p, c->reg.t0, c->reg.t);
-		brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-	     }
-	     brw_ENDIF(p, is_neg2);
-	 }
+                 brw_ADD(p, c->reg.t, c->reg.dp0, negate(c->reg.dp1));
+                 brw_math_invert(p, c->reg.t, c->reg.t);
+                 brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp0);
+
+                 brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t0 );
+                 brw_MOV(p, c->reg.t0, c->reg.t);
+                 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+             }
+
+             if (!BRW_IS_G4X(p->brw)) {
+                 brw_ENDIF(p, is_neg2);
+             }
+         }
 	 brw_ENDIF(p, is_negative);	 
       }
       brw_ENDIF(p, plane_active);
diff --git a/src/mesa/drivers/dri/i965/brw_clip_tri.c b/src/mesa/drivers/dri/i965/brw_clip_tri.c
index 1dbba37fe7..7fd37bd05f 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_tri.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_tri.c
@@ -455,6 +455,8 @@ static void brw_clip_test( struct brw_clip_compile *c )
     struct brw_indirect vt2 = brw_indirect(2, 0);
 
     struct brw_compile *p = &c->func;
+    struct brw_instruction *is_outside;
+    struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */
 
     brw_MOV(p, get_addr_reg(vt0), brw_address(c->reg.vertex[0]));
     brw_MOV(p, get_addr_reg(vt1), brw_address(c->reg.vertex[1]));
@@ -462,53 +464,87 @@ static void brw_clip_test( struct brw_clip_compile *c )
     brw_MOV(p, v0, deref_4f(vt0, c->offset[VERT_RESULT_HPOS]));
     brw_MOV(p, v1, deref_4f(vt1, c->offset[VERT_RESULT_HPOS]));
     brw_MOV(p, v2, deref_4f(vt2, c->offset[VERT_RESULT_HPOS]));
+    brw_AND(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(~0x3f));
 
     /* test nearz, xmin, ymin plane */
-    brw_CMP(p, t1, BRW_CONDITIONAL_LE, negate(v0), get_element(v0, 3)); 
+    /* clip.xyz < -clip.w */
+    brw_CMP(p, t1, BRW_CONDITIONAL_L, v0, negate(get_element(v0, 3))); 
     brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    brw_CMP(p, t2, BRW_CONDITIONAL_LE, negate(v1), get_element(v1, 3)); 
+    brw_CMP(p, t2, BRW_CONDITIONAL_L, v1, negate(get_element(v1, 3))); 
     brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    brw_CMP(p, t3, BRW_CONDITIONAL_LE, negate(v2), get_element(v2, 3)); 
+    brw_CMP(p, t3, BRW_CONDITIONAL_L, v2, negate(get_element(v2, 3))); 
     brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+    /* All vertices are outside of a plane, rejected */
+    brw_AND(p, t, t1, t2);
+    brw_AND(p, t, t, t3);
+    brw_OR(p, tmp0, get_element(t, 0), get_element(t, 1));
+    brw_OR(p, tmp0, tmp0, get_element(t, 2));
+    brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+    brw_AND(p, brw_null_reg(), tmp0, brw_imm_ud(0x1));
+    is_outside = brw_IF(p, BRW_EXECUTE_1);
+    {
+        brw_clip_kill_thread(c);
+    }
+    brw_ENDIF(p, is_outside);
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+    /* some vertices are inside a plane, some are outside,need to clip */
     brw_XOR(p, t, t1, t2);
     brw_XOR(p, t1, t2, t3);
     brw_OR(p, t, t, t1);
-
-    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, 
-	    get_element(t, 0), brw_imm_ud(0));
+    brw_AND(p, t, t, brw_imm_ud(0x1));
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+            get_element(t, 0), brw_imm_ud(0));
     brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<5)));
     brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, 
-	    get_element(t, 1), brw_imm_ud(0));
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+            get_element(t, 1), brw_imm_ud(0));
     brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<3)));
     brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, 
-	    get_element(t, 2), brw_imm_ud(0));
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+            get_element(t, 2), brw_imm_ud(0));
     brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<1)));
     brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 
     /* test farz, xmax, ymax plane */
-    brw_CMP(p, t1, BRW_CONDITIONAL_L, v0, get_element(v0, 3)); 
+    /* clip.xyz > clip.w */
+    brw_CMP(p, t1, BRW_CONDITIONAL_G, v0, get_element(v0, 3)); 
     brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    brw_CMP(p, t2, BRW_CONDITIONAL_L, v1, get_element(v1, 3)); 
+    brw_CMP(p, t2, BRW_CONDITIONAL_G, v1, get_element(v1, 3)); 
     brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    brw_CMP(p, t3, BRW_CONDITIONAL_L, v2, get_element(v2, 3)); 
+    brw_CMP(p, t3, BRW_CONDITIONAL_G, v2, get_element(v2, 3)); 
     brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 
+    /* All vertices are outside of a plane, rejected */
+    brw_AND(p, t, t1, t2);
+    brw_AND(p, t, t, t3);
+    brw_OR(p, tmp0, get_element(t, 0), get_element(t, 1));
+    brw_OR(p, tmp0, tmp0, get_element(t, 2));
+    brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+    brw_AND(p, brw_null_reg(), tmp0, brw_imm_ud(0x1));
+    is_outside = brw_IF(p, BRW_EXECUTE_1);
+    {
+        brw_clip_kill_thread(c);
+    }
+    brw_ENDIF(p, is_outside);
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+    /* some vertices are inside a plane, some are outside,need to clip */
     brw_XOR(p, t, t1, t2);
     brw_XOR(p, t1, t2, t3);
     brw_OR(p, t, t, t1);
-
-    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, 
-	    get_element(t, 0), brw_imm_ud(0));
+    brw_AND(p, t, t, brw_imm_ud(0x1));
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+            get_element(t, 0), brw_imm_ud(0));
     brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<4)));
     brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, 
-	    get_element(t, 1), brw_imm_ud(0));
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+            get_element(t, 1), brw_imm_ud(0));
     brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<2)));
     brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, 
-	    get_element(t, 2), brw_imm_ud(0));
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+            get_element(t, 2), brw_imm_ud(0));
     brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<0)));
     brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index eaac6224f6..d96ff29310 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -121,6 +121,9 @@ GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
    /* if conformance mode is set, swrast can handle any size AA point */
    ctx->Const.MaxPointSizeAA = 255.0;
 
+   /* We want the GLSL compiler to emit code that uses condition codes */
+   ctx->Shader.EmitCondCodes = GL_TRUE;
+
 /*    ctx->Const.MaxNativeVertexProgramTemps = 32; */
 
    brw_init_state( brw );
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index df90c2027f..48ed4325be 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -46,7 +46,7 @@
  *
  * CURBE - constant URB entry.  An urb region (entry) used to hold
  * constant values which the fixed function units can be instructed to
- * preload into the GRF when spawining a thread.
+ * preload into the GRF when spawning a thread.
  *
  * VUE - vertex URB entry.  An urb entry holding a vertex and usually
  * a vertex header.  The header contains control information and
@@ -63,7 +63,7 @@
  * special and may be overwritten.
  *
  * MRF - message register file.  Threads communicate (and terminate)
- * by sending messages.  Message parameters are placed in contigous
+ * by sending messages.  Message parameters are placed in contiguous
  * MRF registers.  All program output is via these messages.  URB
  * entries are populated by sending a message to the shared URB
  * function containing the new data, together with a control word,
@@ -154,21 +154,22 @@ struct brw_state_flags {
    GLuint cache;
 };
 
+
+/** Subclass of Mesa vertex program */
 struct brw_vertex_program {
    struct gl_vertex_program program;
    GLuint id;
 };
 
 
-
+/** Subclass of Mesa fragment program */
 struct brw_fragment_program {
    struct gl_fragment_program program;
-   GLuint id;
+   GLuint id;  /**< serial no. to identify frag progs, never re-used */
+   GLboolean isGLSL;  /**< really, any IF/LOOP/CONT/BREAK instructions */
 };
 
 
-
-
 /* Data about a particular attempt to compile a program.  Note that
  * there can be many of these, each in a different GL state
  * corresponding to a different brw_wm_prog_key struct, with different
@@ -418,8 +419,8 @@ struct brw_context
       struct brw_tracked_state **atoms;
       GLuint nr_atoms;
 
-      GLuint nr_draw_regions;
-      struct intel_region *draw_regions[MAX_DRAW_BUFFERS];
+      GLuint nr_color_regions;
+      struct intel_region *color_regions[MAX_DRAW_BUFFERS];
       struct intel_region *depth_region;
 
       /**
@@ -627,8 +628,6 @@ struct brw_context
  * brw_vtbl.c
  */
 void brwInitVtbl( struct brw_context *brw );
-void brw_do_flush( struct brw_context *brw, 
-		   GLuint flags );
 
 /*======================================================================
  * brw_context.c
@@ -670,7 +669,9 @@ void brwInitFragProgFuncs( struct dd_function_table *functions );
  */
 void brw_upload_urb_fence(struct brw_context *brw);
 
-void brw_upload_constant_buffer_state(struct brw_context *brw);
+/* brw_curbe.c
+ */
+void brw_upload_cs_urb_state(struct brw_context *brw);
 
 
 /*======================================================================
@@ -683,6 +684,32 @@ brw_context( GLcontext *ctx )
    return (struct brw_context *)ctx;
 }
 
+static INLINE struct brw_vertex_program *
+brw_vertex_program(struct gl_vertex_program *p)
+{
+   return (struct brw_vertex_program *) p;
+}
+
+static INLINE const struct brw_vertex_program *
+brw_vertex_program_const(const struct gl_vertex_program *p)
+{
+   return (const struct brw_vertex_program *) p;
+}
+
+static INLINE struct brw_fragment_program *
+brw_fragment_program(struct gl_fragment_program *p)
+{
+   return (struct brw_fragment_program *) p;
+}
+
+static INLINE const struct brw_fragment_program *
+brw_fragment_program_const(const struct gl_fragment_program *p)
+{
+   return (const struct brw_fragment_program *) p;
+}
+
+
+
 #define DO_SETUP_BITS ((1<<(FRAG_ATTRIB_MAX)) - 1)
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c
index 4eaaa5f871..545dedd34b 100644
--- a/src/mesa/drivers/dri/i965/brw_curbe.c
+++ b/src/mesa/drivers/dri/i965/brw_curbe.c
@@ -53,7 +53,7 @@ static void calculate_curbe_offsets( struct brw_context *brw )
    GLuint nr_fp_regs = (brw->wm.prog_data->nr_params + 15) / 16;
    
    /* BRW_NEW_VERTEX_PROGRAM */
-   struct brw_vertex_program *vp = (struct brw_vertex_program *)brw->vertex_program;
+   const struct brw_vertex_program *vp = brw_vertex_program_const(brw->vertex_program);
    GLuint nr_vp_regs = (vp->program.Base.Parameters->NumParameters * 4 + 15) / 16;
    GLuint nr_clip_regs = 0;
    GLuint total_regs;
@@ -138,24 +138,24 @@ const struct brw_tracked_state brw_curbe_offsets = {
  * fixed-function hardware in a double-buffering scheme to avoid a
  * pipeline stall each time the contents of the curbe is changed.
  */
-void brw_upload_constant_buffer_state(struct brw_context *brw)
+void brw_upload_cs_urb_state(struct brw_context *brw)
 {
-   struct brw_constant_buffer_state cbs; 
-   memset(&cbs, 0, sizeof(cbs));
+   struct brw_cs_urb_state cs_urb;
+   memset(&cs_urb, 0, sizeof(cs_urb));
 
    /* It appears that this is the state packet for the CS unit, ie. the
     * urb entries detailed here are housed in the CS range from the
     * URB_FENCE command.
     */
-   cbs.header.opcode = CMD_CONST_BUFFER_STATE;
-   cbs.header.length = sizeof(cbs)/4 - 2;
+   cs_urb.header.opcode = CMD_CS_URB_STATE;
+   cs_urb.header.length = sizeof(cs_urb)/4 - 2;
 
    /* BRW_NEW_URB_FENCE */
-   cbs.bits0.nr_urb_entries = brw->urb.nr_cs_entries;
-   cbs.bits0.urb_entry_size = brw->urb.csize - 1;
+   cs_urb.bits0.nr_urb_entries = brw->urb.nr_cs_entries;
+   cs_urb.bits0.urb_entry_size = brw->urb.csize - 1;
 
    assert(brw->urb.nr_cs_entries);
-   BRW_CACHED_BATCH_STRUCT(brw, &cbs);
+   BRW_CACHED_BATCH_STRUCT(brw, &cs_urb);
 }
 
 static GLfloat fixed_plane[6][4] = {
@@ -174,10 +174,12 @@ static GLfloat fixed_plane[6][4] = {
 static void prepare_constant_buffer(struct brw_context *brw)
 {
    GLcontext *ctx = &brw->intel.ctx;
-   struct brw_vertex_program *vp = (struct brw_vertex_program *)brw->vertex_program;
-   struct brw_fragment_program *fp = (struct brw_fragment_program *)brw->fragment_program;
-   GLuint sz = brw->curbe.total_size;
-   GLuint bufsz = sz * 16 * sizeof(GLfloat);
+   const struct brw_vertex_program *vp =
+      brw_vertex_program_const(brw->vertex_program);
+   const struct brw_fragment_program *fp =
+      brw_fragment_program_const(brw->fragment_program);
+   const GLuint sz = brw->curbe.total_size;
+   const GLuint bufsz = sz * 16 * sizeof(GLfloat);
    GLfloat *buf;
    GLuint i;
 
@@ -189,27 +191,25 @@ static void prepare_constant_buffer(struct brw_context *brw)
    brw->curbe.tracked_state.dirty.mesa |= fp->program.Base.Parameters->StateFlags;
 
    if (sz == 0) {
-
       if (brw->curbe.last_buf) {
 	 free(brw->curbe.last_buf);
 	 brw->curbe.last_buf = NULL;
 	 brw->curbe.last_bufsz  = 0;
       }
-
       return;
    }
 
-   buf = (GLfloat *)malloc(bufsz);
-
-   memset(buf, 0, bufsz);
+   buf = (GLfloat *) _mesa_calloc(bufsz);
 
+   /* fragment shader constants */
    if (brw->curbe.wm_size) {
       GLuint offset = brw->curbe.wm_start * 16;
 
       _mesa_load_state_parameters(ctx, fp->program.Base.Parameters); 
 
+      /* copy float constants */
       for (i = 0; i < brw->wm.prog_data->nr_params; i++) 
-	 buf[offset + i] = brw->wm.prog_data->param[i][0];
+	 buf[offset + i] = *brw->wm.prog_data->param[i];
    }
 
 
@@ -244,7 +244,7 @@ static void prepare_constant_buffer(struct brw_context *brw)
       }
    }
 
-
+   /* vertex shader constants */
    if (brw->curbe.vs_size) {
       GLuint offset = brw->curbe.vs_start * 16;
       GLuint nr = vp->program.Base.Parameters->NumParameters;
@@ -252,10 +252,11 @@ static void prepare_constant_buffer(struct brw_context *brw)
       _mesa_load_state_parameters(ctx, vp->program.Base.Parameters); 
 
       for (i = 0; i < nr; i++) {
-	 buf[offset + i * 4 + 0] = vp->program.Base.Parameters->ParameterValues[i][0];
-	 buf[offset + i * 4 + 1] = vp->program.Base.Parameters->ParameterValues[i][1];
-	 buf[offset + i * 4 + 2] = vp->program.Base.Parameters->ParameterValues[i][2];
-	 buf[offset + i * 4 + 3] = vp->program.Base.Parameters->ParameterValues[i][3];
+         const GLfloat *value = vp->program.Base.Parameters->ParameterValues[i];
+	 buf[offset + i * 4 + 0] = value[0];
+	 buf[offset + i * 4 + 1] = value[1];
+	 buf[offset + i * 4 + 2] = value[2];
+	 buf[offset + i * 4 + 3] = value[3];
       }
    }
 
@@ -274,11 +275,14 @@ static void prepare_constant_buffer(struct brw_context *brw)
        brw->curbe.last_buf &&
        bufsz == brw->curbe.last_bufsz &&
        memcmp(buf, brw->curbe.last_buf, bufsz) == 0) {
-      free(buf);
+      /* constants have not changed */
+      _mesa_free(buf);
    } 
    else {
+      /* constants have changed */
       if (brw->curbe.last_buf)
-	 free(brw->curbe.last_buf);
+	 _mesa_free(brw->curbe.last_buf);
+
       brw->curbe.last_buf = buf;
       brw->curbe.last_bufsz = bufsz;
 
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 39c32255f8..590b064c7e 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -734,7 +734,7 @@
 
 
 #define CMD_URB_FENCE                 0x6000
-#define CMD_CONST_BUFFER_STATE        0x6001
+#define CMD_CS_URB_STATE              0x6001
 #define CMD_CONST_BUFFER              0x6002
 
 #define CMD_STATE_BASE_ADDRESS        0x6101
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index 99fd587e9f..5342622a73 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -127,6 +127,7 @@ static void brw_emit_prim(struct brw_context *brw,
 			  uint32_t hw_prim)
 {
    struct brw_3d_primitive prim_packet;
+   struct intel_context *intel = &brw->intel;
 
    if (INTEL_DEBUG & DEBUG_PRIMS)
       _mesa_printf("PRIM: %s %d %d\n", _mesa_lookup_enum_by_nr(prim->mode), 
@@ -146,10 +147,27 @@ static void brw_emit_prim(struct brw_context *brw,
 
    /* Can't wrap here, since we rely on the validated state. */
    brw->no_batch_wrap = GL_TRUE;
+
+   /* If we're set to always flush, do it before and after the primitive emit.
+    * We want to catch both missed flushes that hurt instruction/state cache
+    * and missed flushes of the render cache as it heads to other parts of
+    * the besides the draw code.
+    */
+   if (intel->always_flush_cache) {
+      BEGIN_BATCH(1, IGNORE_CLIPRECTS);
+      OUT_BATCH(intel->vtbl.flush_cmd());
+      ADVANCE_BATCH();
+   }
    if (prim_packet.verts_per_instance) {
       intel_batchbuffer_data( brw->intel.batch, &prim_packet,
 			      sizeof(prim_packet), LOOP_CLIPRECTS);
    }
+   if (intel->always_flush_cache) {
+      BEGIN_BATCH(1, IGNORE_CLIPRECTS);
+      OUT_BATCH(intel->vtbl.flush_cmd());
+      ADVANCE_BATCH();
+   }
+
    brw->no_batch_wrap = GL_FALSE;
 }
 
@@ -194,9 +212,16 @@ static GLboolean check_fallbacks( struct brw_context *brw,
    GLcontext *ctx = &brw->intel.ctx;
    GLuint i;
 
-   if (!brw->intel.strict_conformance)
+   /* If we don't require strict OpenGL conformance, never 
+    * use fallbacks.  If we're forcing fallbacks, always
+    * use fallfacks.
+    */
+   if (brw->intel.conformance_mode == 0)
       return GL_FALSE;
 
+   if (brw->intel.conformance_mode == 2)
+      return GL_TRUE;
+
    if (ctx->Polygon.SmoothFlag) {
       for (i = 0; i < nr_prims; i++)
 	 if (reduced_prim[prim[i].mode] == GL_TRIANGLES) 
@@ -220,7 +245,7 @@ static GLboolean check_fallbacks( struct brw_context *brw,
 	 /* GS doesn't get enough information to know when to reset
 	  * the stipple counter?!?
 	  */
-	 if (prim[i].mode == GL_LINE_LOOP) 
+	 if (prim[i].mode == GL_LINE_LOOP || prim[i].mode == GL_LINE_STRIP) 
 	    return GL_TRUE;
 	    
 	 if (prim[i].mode == GL_POLYGON &&
@@ -230,13 +255,46 @@ static GLboolean check_fallbacks( struct brw_context *brw,
       }
    }
 
-
    if (ctx->Point.SmoothFlag) {
       for (i = 0; i < nr_prims; i++)
 	 if (prim[i].mode == GL_POINTS) 
 	    return GL_TRUE;
    }
+
+   /* BRW hardware doesn't handle GL_CLAMP texturing correctly;
+    * brw_wm_sampler_state:translate_wrap_mode() treats GL_CLAMP
+    * as GL_CLAMP_TO_EDGE instead.  If we're using GL_CLAMP, and
+    * we want strict conformance, force the fallback.
+    * Right now, we only do this for 2D textures.
+    */
+   {
+      int u;
+      for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
+         struct gl_texture_unit *texUnit = &ctx->Texture.Unit[u];
+         if (texUnit->Enabled) {
+            if (texUnit->Enabled & TEXTURE_1D_BIT) {
+               if (texUnit->CurrentTex[TEXTURE_1D_INDEX]->WrapS == GL_CLAMP) {
+                   return GL_TRUE;
+               }
+            }
+            if (texUnit->Enabled & TEXTURE_2D_BIT) {
+               if (texUnit->CurrentTex[TEXTURE_2D_INDEX]->WrapS == GL_CLAMP ||
+                   texUnit->CurrentTex[TEXTURE_2D_INDEX]->WrapT == GL_CLAMP) {
+                   return GL_TRUE;
+               }
+            }
+            if (texUnit->Enabled & TEXTURE_3D_BIT) {
+               if (texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapS == GL_CLAMP ||
+                   texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapT == GL_CLAMP ||
+                   texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapR == GL_CLAMP) {
+                   return GL_TRUE;
+               }
+            }
+         }
+      }
+   }
       
+   /* Nothing stopping us from the fast path now */
    return GL_FALSE;
 }
 
@@ -261,11 +319,18 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
    if (ctx->NewState)
       _mesa_update_state( ctx );
 
+   /* We have to validate the textures *before* checking for fallbacks;
+    * otherwise, the software fallback won't be able to rely on the
+    * texture state, the firstLevel and lastLevel fields won't be
+    * set in the intel texture object (they'll both be 0), and the 
+    * software fallback will segfault if it attempts to access any
+    * texture level other than level 0.
+    */
+   brw_validate_textures( brw );
+
    if (check_fallbacks(brw, prim, nr_prims))
       return GL_FALSE;
 
-   brw_validate_textures( brw );
-
    /* Bind all inputs, derive varying and size information:
     */
    brw_merge_inputs( brw, arrays );
@@ -346,6 +411,8 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
       retval = GL_TRUE;
    }
 
+   if (intel->always_flush_batch)
+      intel_batchbuffer_flush(intel->batch);
  out:
    UNLOCK_HARDWARE(intel);
 
diff --git a/src/mesa/drivers/dri/i965/brw_eu.c b/src/mesa/drivers/dri/i965/brw_eu.c
index b3ae4eef33..c53efba599 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.c
+++ b/src/mesa/drivers/dri/i965/brw_eu.c
@@ -129,3 +129,126 @@ const GLuint *brw_get_program( struct brw_compile *p,
    return (const GLuint *)p->store;
 }
 
+
+
+/**
+ * Subroutine calls require special attention.
+ * Mesa instructions may be expanded into multiple hardware instructions
+ * so the prog_instruction::BranchTarget field can't be used as an index
+ * into the hardware instructions.
+ *
+ * The BranchTarget field isn't needed, however.  Mesa's GLSL compiler
+ * emits CAL and BGNSUB instructions with labels that can be used to map
+ * subroutine calls to actual subroutine code blocks.
+ *
+ * The structures and function here implement patching of CAL instructions
+ * so they jump to the right subroutine code...
+ */
+
+
+/**
+ * For each OPCODE_BGNSUB we create one of these.
+ */
+struct brw_glsl_label
+{
+   const char *name; /**< the label string */
+   GLuint position;  /**< the position of the brw instruction for this label */
+   struct brw_glsl_label *next;  /**< next in linked list */
+};
+
+
+/**
+ * For each OPCODE_CAL we create one of these.
+ */
+struct brw_glsl_call
+{
+   GLuint call_inst_pos;  /**< location of the CAL instruction */
+   const char *sub_name;  /**< name of subroutine to call */
+   struct brw_glsl_call *next;  /**< next in linked list */
+};
+
+
+/**
+ * Called for each OPCODE_BGNSUB.
+ */
+void
+brw_save_label(struct brw_compile *c, const char *name, GLuint position)
+{
+   struct brw_glsl_label *label = CALLOC_STRUCT(brw_glsl_label);
+   label->name = name;
+   label->position = position;
+   label->next = c->first_label;
+   c->first_label = label;
+}
+
+
+/**
+ * Called for each OPCODE_CAL.
+ */
+void
+brw_save_call(struct brw_compile *c, const char *name, GLuint call_pos)
+{
+   struct brw_glsl_call *call = CALLOC_STRUCT(brw_glsl_call);
+   call->call_inst_pos = call_pos;
+   call->sub_name = name;
+   call->next = c->first_call;
+   c->first_call = call;
+}
+
+
+/**
+ * Lookup a label, return label's position/offset.
+ */
+static GLuint
+brw_lookup_label(struct brw_compile *c, const char *name)
+{
+   const struct brw_glsl_label *label;
+   for (label = c->first_label; label; label = label->next) {
+      if (strcmp(name, label->name) == 0) {
+         return label->position;
+      }
+   }
+   abort();  /* should never happen */
+   return ~0;
+}
+
+
+/**
+ * When we're done generating code, this function is called to resolve
+ * subroutine calls.
+ */
+void
+brw_resolve_cals(struct brw_compile *c)
+{
+    const struct brw_glsl_call *call;
+
+    for (call = c->first_call; call; call = call->next) {
+        const GLuint sub_loc = brw_lookup_label(c, call->sub_name);
+	struct brw_instruction *brw_call_inst = &c->store[call->call_inst_pos];
+	struct brw_instruction *brw_sub_inst = &c->store[sub_loc];
+	GLint offset = brw_sub_inst - brw_call_inst;
+
+	/* patch brw_inst1 to point to brw_inst2 */
+	brw_set_src1(brw_call_inst, brw_imm_d(offset * 16));
+    }
+
+    /* free linked list of calls */
+    {
+        struct brw_glsl_call *call, *next;
+        for (call = c->first_call; call; call = next) {
+	    next = call->next;
+	    _mesa_free(call);
+	}
+	c->first_call = NULL;
+    }
+
+    /* free linked list of labels */
+    {
+        struct brw_glsl_label *label, *next;
+	for (label = c->first_label; label; label = next) {
+	    next = label->next;
+	    _mesa_free(label);
+	}
+	c->first_label = NULL;
+    }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index 9e2b39af9b..eb99c21711 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -91,8 +91,13 @@ struct brw_indirect {
 };
 
 
+struct brw_glsl_label;
+struct brw_glsl_call;
+
+
+
 #define BRW_EU_MAX_INSN_STACK 5
-#define BRW_EU_MAX_INSN 1200
+#define BRW_EU_MAX_INSN 4000
 
 struct brw_compile {
    struct brw_instruction store[BRW_EU_MAX_INSN];
@@ -106,9 +111,22 @@ struct brw_compile {
    GLuint flag_value;
    GLboolean single_program_flow;
    struct brw_context *brw;
+
+   struct brw_glsl_label *first_label;  /**< linked list of labels */
+   struct brw_glsl_call *first_call;    /**< linked list of CALs */
 };
 
 
+void
+brw_save_label(struct brw_compile *c, const char *name, GLuint position);
+
+void
+brw_save_call(struct brw_compile *c, const char *name, GLuint call_pos);
+
+void
+brw_resolve_cals(struct brw_compile *c);
+
+
 
 static INLINE int type_sz( GLuint type )
 {
@@ -152,6 +170,13 @@ static INLINE struct brw_reg brw_reg( GLuint file,
                                       GLuint writemask )
 {
    struct brw_reg reg;
+   if (type == BRW_GENERAL_REGISTER_FILE)
+      assert(nr < 128);
+   else if (type == BRW_MESSAGE_REGISTER_FILE)
+      assert(nr < 9);
+   else if (type == BRW_ARCHITECTURE_REGISTER_FILE)
+      assert(nr <= BRW_ARF_IP);
+
    reg.type = type;
    reg.file = file;
    reg.nr = nr;
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 4e099b5945..6dce1ca48e 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -55,6 +55,9 @@ static void guess_execution_size( struct brw_instruction *insn,
 static void brw_set_dest( struct brw_instruction *insn,
 			  struct brw_reg dest )
 {
+   if (dest.type != BRW_ARCHITECTURE_REGISTER_FILE)
+      assert(dest.nr < 128);
+
    insn->bits1.da1.dest_reg_file = dest.file;
    insn->bits1.da1.dest_reg_type = dest.type;
    insn->bits1.da1.dest_address_mode = dest.address_mode;
@@ -96,10 +99,13 @@ static void brw_set_dest( struct brw_instruction *insn,
 }
 
 static void brw_set_src0( struct brw_instruction *insn,
-		      struct brw_reg reg )
+                          struct brw_reg reg )
 {
    assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
 
+   if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
+      assert(reg.nr < 128);
+
    insn->bits1.da1.src0_reg_file = reg.file;
    insn->bits1.da1.src0_reg_type = reg.type;
    insn->bits2.da1.src0_abs = reg.abs;
@@ -169,10 +175,12 @@ static void brw_set_src0( struct brw_instruction *insn,
 
 
 void brw_set_src1( struct brw_instruction *insn,
-			  struct brw_reg reg )
+                   struct brw_reg reg )
 {
    assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
 
+   assert(reg.nr < 128);
+
    insn->bits1.da1.src1_reg_file = reg.file;
    insn->bits1.da1.src1_reg_type = reg.type;
    insn->bits3.da1.src1_abs = reg.abs;
@@ -323,13 +331,13 @@ static void brw_set_dp_read_message( struct brw_instruction *insn,
 }
 
 static void brw_set_sampler_message(struct brw_context *brw,
-                 struct brw_instruction *insn,
-				     GLuint binding_table_index,
-				     GLuint sampler,
-				     GLuint msg_type,
-				     GLuint response_length,
-				     GLuint msg_length,
-				     GLboolean eot)
+                                    struct brw_instruction *insn,
+                                    GLuint binding_table_index,
+                                    GLuint sampler,
+                                    GLuint msg_type,
+                                    GLuint response_length,
+                                    GLuint msg_length,
+                                    GLboolean eot)
 {
    brw_set_src1(insn, brw_imm_d(0));
 
@@ -407,7 +415,7 @@ static struct brw_instruction *brw_alu2(struct brw_compile *p,
  * Convenience routines.
  */
 #define ALU1(OP)					\
-struct brw_instruction *brw_##OP(struct brw_compile *p,			\
+struct brw_instruction *brw_##OP(struct brw_compile *p,	\
 	      struct brw_reg dest,			\
 	      struct brw_reg src0)   			\
 {							\
@@ -415,7 +423,7 @@ struct brw_instruction *brw_##OP(struct brw_compile *p,			\
 }
 
 #define ALU2(OP)					\
-struct brw_instruction *brw_##OP(struct brw_compile *p,			\
+struct brw_instruction *brw_##OP(struct brw_compile *p,	\
 	      struct brw_reg dest,			\
 	      struct brw_reg src0,			\
 	      struct brw_reg src1)   			\
@@ -469,9 +477,9 @@ void brw_NOP(struct brw_compile *p)
  */
 
 struct brw_instruction *brw_JMPI(struct brw_compile *p, 
-	      struct brw_reg dest,
-	      struct brw_reg src0,
-	      struct brw_reg src1)
+                                 struct brw_reg dest,
+                                 struct brw_reg src0,
+                                 struct brw_reg src1)
 {
    struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
 
@@ -674,7 +682,7 @@ struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
 
 
 struct brw_instruction *brw_WHILE(struct brw_compile *p, 
-	       struct brw_instruction *do_insn)
+                                  struct brw_instruction *do_insn)
 {
    struct brw_instruction *insn;
 
@@ -931,13 +939,13 @@ void brw_dp_READ_16( struct brw_compile *p,
 
 
 void brw_fb_WRITE(struct brw_compile *p,
-		   struct brw_reg dest,
-		   GLuint msg_reg_nr,
-		   struct brw_reg src0,
-		   GLuint binding_table_index,
-		   GLuint msg_length,
-		   GLuint response_length,
-		   GLboolean eot)
+                  struct brw_reg dest,
+                  GLuint msg_reg_nr,
+                  struct brw_reg src0,
+                  GLuint binding_table_index,
+                  GLuint msg_length,
+                  GLuint response_length,
+                  GLboolean eot)
 {
    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
    
@@ -973,8 +981,8 @@ void brw_SAMPLE(struct brw_compile *p,
 {
    GLboolean need_stall = 0;
    
-   if(writemask == 0) {
-/*       _mesa_printf("%s: zero writemask??\n", __FUNCTION__); */
+   if (writemask == 0) {
+      /*_mesa_printf("%s: zero writemask??\n", __FUNCTION__); */
       return;
    }
    
@@ -1006,7 +1014,7 @@ void brw_SAMPLE(struct brw_compile *p,
 
       if (newmask != writemask) {
 	 need_stall = 1;
-/* 	 _mesa_printf("need stall %x %x\n", newmask , writemask); */
+         /* _mesa_printf("need stall %x %x\n", newmask , writemask); */
       }
       else {
 	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
@@ -1047,8 +1055,7 @@ void brw_SAMPLE(struct brw_compile *p,
 			      eot);
    }
 
-   if (need_stall)
-   {
+   if (need_stall) {
       struct brw_reg reg = vec8(offset(dest, response_length-1));
 
       /*  mov (8) r9.0<1>:f    r9.0<8;8,1>:f    { Align1 }
diff --git a/src/mesa/drivers/dri/i965/brw_fallback.c b/src/mesa/drivers/dri/i965/brw_fallback.c
index e63098fdd4..299357409c 100644
--- a/src/mesa/drivers/dri/i965/brw_fallback.c
+++ b/src/mesa/drivers/dri/i965/brw_fallback.c
@@ -75,8 +75,8 @@ static GLboolean do_check_fallback(struct brw_context *brw)
    
    /* _NEW_STENCIL 
     */
-   if (ctx->Stencil.Enabled && 
-       !brw->intel.hw_stencil) {
+   if (ctx->Stencil._Enabled &&
+       (ctx->DrawBuffer->Name == 0 && !brw->intel.hw_stencil)) {
       DBG("FALLBACK: stencil\n");
       return GL_TRUE;
    }
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 9dcdad7b4e..5c94a49f60 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -178,7 +178,7 @@ static void upload_psp_urb_cbs(struct brw_context *brw )
 {
    upload_pipelined_state_pointers(brw);
    brw_upload_urb_fence(brw);
-   brw_upload_constant_buffer_state(brw);
+   brw_upload_cs_urb_state(brw);
 }
 
 const struct brw_tracked_state brw_psp_urb_cbs = {
@@ -290,8 +290,21 @@ static void upload_polygon_stipple(struct brw_context *brw)
    bps.header.opcode = CMD_POLY_STIPPLE_PATTERN;
    bps.header.length = sizeof(bps)/4-2;
 
-   for (i = 0; i < 32; i++)
-      bps.stipple[i] = ctx->PolygonStipple[31 - i]; /* invert */
+   /* Polygon stipple is provided in OpenGL order, i.e. bottom
+    * row first.  If we're rendering to a window (i.e. the
+    * default frame buffer object, 0), then we need to invert
+    * it to match our pixel layout.  But if we're rendering
+    * to a FBO (i.e. any named frame buffer object), we *don't*
+    * need to invert - we already match the layout.
+    */
+   if (ctx->DrawBuffer->Name == 0) {
+      for (i = 0; i < 32; i++)
+         bps.stipple[i] = ctx->PolygonStipple[31 - i]; /* invert */
+   }
+   else {
+      for (i = 0; i < 32; i++)
+         bps.stipple[i] = ctx->PolygonStipple[i]; /* don't invert */
+   }
 
    BRW_CACHED_BATCH_STRUCT(brw, &bps);
 }
@@ -319,8 +332,22 @@ static void upload_polygon_stipple_offset(struct brw_context *brw)
    bpso.header.opcode = CMD_POLY_STIPPLE_OFFSET;
    bpso.header.length = sizeof(bpso)/4-2;
 
-   bpso.bits0.x_offset = (32 - (dPriv->x & 31)) & 31;
-   bpso.bits0.y_offset = (32 - ((dPriv->y + dPriv->h) & 31)) & 31;
+   /* If we're drawing to a system window (ctx->DrawBuffer->Name == 0),
+    * we have to invert the Y axis in order to match the OpenGL
+    * pixel coordinate system, and our offset must be matched
+    * to the window position.  If we're drawing to a FBO
+    * (ctx->DrawBuffer->Name != 0), then our native pixel coordinate
+    * system works just fine, and there's no window system to
+    * worry about.
+    */
+   if (brw->intel.ctx.DrawBuffer->Name == 0) {
+      bpso.bits0.x_offset = (32 - (dPriv->x & 31)) & 31;
+      bpso.bits0.y_offset = (32 - ((dPriv->y + dPriv->h) & 31)) & 31;
+   }
+   else {
+      bpso.bits0.y_offset = 0;
+      bpso.bits0.x_offset = 0;
+   }
 
    BRW_CACHED_BATCH_STRUCT(brw, &bpso);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index 0c86911044..d90bd82038 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -38,6 +38,7 @@
 
 #include "brw_context.h"
 #include "brw_util.h"
+#include "brw_wm.h"
 
 static void brwBindProgram( GLcontext *ctx,
 			    GLenum target, 
@@ -94,7 +95,6 @@ static struct gl_program *brwNewProgram( GLcontext *ctx,
 static void brwDeleteProgram( GLcontext *ctx,
 			      struct gl_program *prog )
 {
-   
    _mesa_delete_program( ctx, prog );
 }
 
@@ -110,30 +110,35 @@ static void brwProgramStringNotify( GLcontext *ctx,
 				    GLenum target,
 				    struct gl_program *prog )
 {
+   struct brw_context *brw = brw_context(ctx);
    if (target == GL_FRAGMENT_PROGRAM_ARB) {
       struct gl_fragment_program *fprog = (struct gl_fragment_program *) prog;
-      struct brw_context *brw = brw_context(ctx);
-      struct brw_fragment_program *p = (struct brw_fragment_program *)prog;
-      struct brw_fragment_program *fp = (struct brw_fragment_program *)brw->fragment_program;
+      struct brw_fragment_program *newFP = brw_fragment_program(fprog);
+      const struct brw_fragment_program *curFP =
+         brw_fragment_program_const(brw->fragment_program);
+
       if (fprog->FogOption) {
          _mesa_append_fog_code(ctx, fprog);
          fprog->FogOption = GL_NONE;
       }
 
-      if (p == fp)
+      if (newFP == curFP)
 	 brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM;
-      p->id = brw->program_id++;      
+      newFP->id = brw->program_id++;      
+      newFP->isGLSL = brw_wm_is_glsl(fprog);
    }
    else if (target == GL_VERTEX_PROGRAM_ARB) {
-      struct brw_context *brw = brw_context(ctx);
-      struct brw_vertex_program *p = (struct brw_vertex_program *)prog;
-      struct brw_vertex_program *vp = (struct brw_vertex_program *)brw->vertex_program;
-      if (p == vp)
+      struct gl_vertex_program *vprog = (struct gl_vertex_program *) prog;
+      struct brw_vertex_program *newVP = brw_vertex_program(vprog);
+      const struct brw_vertex_program *curVP =
+         brw_vertex_program_const(brw->vertex_program);
+
+      if (newVP == curVP)
 	 brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM;
-      if (p->program.IsPositionInvariant) {
-	 _mesa_insert_mvp_code(ctx, &p->program);
+      if (newVP->program.IsPositionInvariant) {
+	 _mesa_insert_mvp_code(ctx, &newVP->program);
       }
-      p->id = brw->program_id++;      
+      newVP->id = brw->program_id++;      
 
       /* Also tell tnl about it:
        */
diff --git a/src/mesa/drivers/dri/i965/brw_sf.c b/src/mesa/drivers/dri/i965/brw_sf.c
index 8c1711538a..c3c85978f4 100644
--- a/src/mesa/drivers/dri/i965/brw_sf.c
+++ b/src/mesa/drivers/dri/i965/brw_sf.c
@@ -167,8 +167,14 @@ static void upload_sf_prog(struct brw_context *brw)
    key.do_twoside_color = (ctx->Light.Enabled && ctx->Light.Model.TwoSide);
 
    /* _NEW_POLYGON */
-   if (key.do_twoside_color)
-      key.frontface_ccw = (ctx->Polygon.FrontFace == GL_CCW);
+   if (key.do_twoside_color) {
+      /* If we're rendering to a FBO, we have to invert the polygon
+       * face orientation, just as we invert the viewport in
+       * sf_unit_create_from_key().  ctx->DrawBuffer->Name will be
+       * nonzero if we're rendering to such an FBO.
+       */
+      key.frontface_ccw = (ctx->Polygon.FrontFace == GL_CCW) ^ (ctx->DrawBuffer->Name != 0);
+   }
 
    dri_bo_unreference(brw->sf.prog_bo);
    brw->sf.prog_bo = brw_search_cache(&brw->cache, BRW_SF_PROG,
diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c
index e96d5354b3..93a9686f71 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_state.c
@@ -44,6 +44,7 @@ static void upload_sf_vp(struct brw_context *brw)
    struct brw_sf_viewport sfv;
    GLfloat y_scale, y_bias;
    const GLboolean render_to_fbo = (ctx->DrawBuffer->Name != 0);
+   const GLfloat *v = ctx->Viewport._WindowMap.m;
 
    memset(&sfv, 0, sizeof(sfv));
 
@@ -58,8 +59,6 @@ static void upload_sf_vp(struct brw_context *brw)
 
    /* _NEW_VIEWPORT */
 
-   const GLfloat *v = ctx->Viewport._WindowMap.m;
-
    sfv.viewport.m00 = v[MAT_SX];
    sfv.viewport.m11 = v[MAT_SY] * y_scale;
    sfv.viewport.m22 = v[MAT_SZ] * depth_scale;
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index df839c5b30..81b0a45998 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -52,7 +52,6 @@ const struct brw_tracked_state brw_cc_vp;
 const struct brw_tracked_state brw_check_fallback;
 const struct brw_tracked_state brw_clip_prog;
 const struct brw_tracked_state brw_clip_unit;
-const struct brw_tracked_state brw_constant_buffer_state;
 const struct brw_tracked_state brw_constant_buffer;
 const struct brw_tracked_state brw_curbe_offsets;
 const struct brw_tracked_state brw_invarient_state;
diff --git a/src/mesa/drivers/dri/i965/brw_state_batch.c b/src/mesa/drivers/dri/i965/brw_state_batch.c
index dc87859f3f..811940edc0 100644
--- a/src/mesa/drivers/dri/i965/brw_state_batch.c
+++ b/src/mesa/drivers/dri/i965/brw_state_batch.c
@@ -97,8 +97,6 @@ void brw_clear_batch_cache_flush( struct brw_context *brw )
 {
    clear_batch_cache(brw);
 
-/*    brw_do_flush(brw, BRW_FLUSH_STATE_CACHE|BRW_FLUSH_READ_CACHE); */
-   
    brw->state.dirty.mesa |= ~0;
    brw->state.dirty.brw |= ~0;
    brw->state.dirty.cache |= ~0;
diff --git a/src/mesa/drivers/dri/i965/brw_state_dump.c b/src/mesa/drivers/dri/i965/brw_state_dump.c
index b28c57c2bc..5d332d010c 100644
--- a/src/mesa/drivers/dri/i965/brw_state_dump.c
+++ b/src/mesa/drivers/dri/i965/brw_state_dump.c
@@ -162,6 +162,14 @@ static void brw_debug_prog(const char *name, dri_bo *prog)
       fprintf(stderr, "%8s: 0x%08x: 0x%08x 0x%08x 0x%08x 0x%08x\n",
 	      name, (unsigned int)prog->offset + i * 4 * 4,
 	      data[i * 4], data[i * 4 + 1], data[i * 4 + 2], data[i * 4 + 3]);
+      /* Stop at the end of the program.  It'd be nice to keep track of the actual
+       * intended program size instead of guessing like this.
+       */
+      if (data[i * 4 + 0] == 0 &&
+	  data[i * 4 + 1] == 0 &&
+	  data[i * 4 + 2] == 0 &&
+	  data[i * 4 + 3] == 0)
+	 break;
    }
 
    dri_bo_unmap(prog);
diff --git a/src/mesa/drivers/dri/i965/brw_structs.h b/src/mesa/drivers/dri/i965/brw_structs.h
index d97ff27f0a..89e2981203 100644
--- a/src/mesa/drivers/dri/i965/brw_structs.h
+++ b/src/mesa/drivers/dri/i965/brw_structs.h
@@ -439,7 +439,7 @@ struct brw_urb_fence
    } bits1;
 };
 
-struct brw_constant_buffer_state /* previously brw_command_streamer */
+struct brw_cs_urb_state
 {
    struct header header;
 
diff --git a/src/mesa/drivers/dri/i965/brw_vs_constval.c b/src/mesa/drivers/dri/i965/brw_vs_constval.c
index 9977677fd7..d29eb17f8c 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_constval.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_constval.c
@@ -170,8 +170,8 @@ static void calc_wm_input_sizes( struct brw_context *brw )
 {
    GLcontext *ctx = &brw->intel.ctx;
    /* BRW_NEW_VERTEX_PROGRAM */
-   struct brw_vertex_program *vp = 
-      (struct brw_vertex_program *)brw->vertex_program;
+   const struct brw_vertex_program *vp =
+      brw_vertex_program_const(brw->vertex_program);
    /* BRW_NEW_INPUT_DIMENSIONS */
    struct tracker t;
    GLuint insn;
diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c
index 24b7dc30fe..3807dff991 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c
@@ -156,6 +156,12 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 
    c->prog_data.urb_entry_size = (c->nr_outputs+2+3)/4;
    c->prog_data.total_grf = reg;
+
+   if (INTEL_DEBUG & DEBUG_VS) {
+      _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
+      _mesa_printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
+      _mesa_printf("%s reg = %d\n", __FUNCTION__, reg);
+   }
 }
 
 
@@ -658,7 +664,7 @@ static void emit_nrm( struct brw_vs_compile *c,
 /* TODO: relative addressing!
  */
 static struct brw_reg get_reg( struct brw_vs_compile *c,
-			       GLuint file,
+			       gl_register_file file,
 			       GLuint index )
 {
 
@@ -954,36 +960,27 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 }
 
 
+/**
+ * Called after code generation to resolve subroutine calls and the
+ * END instruction.
+ * \param end_inst  points to brw code for END instruction
+ * \param last_inst  points to last instruction emitted before vertex write
+ */
 static void 
-post_vs_emit( struct brw_vs_compile *c, struct brw_instruction *end_inst )
+post_vs_emit( struct brw_vs_compile *c,
+              struct brw_instruction *end_inst,
+              struct brw_instruction *last_inst )
 {
-   GLuint nr_insns = c->vp->program.Base.NumInstructions;
-   GLuint insn, target_insn;
-   struct prog_instruction *inst1, *inst2;
-   struct brw_instruction *brw_inst1, *brw_inst2;
-   int offset;
-   for (insn = 0; insn < nr_insns; insn++) {
-       inst1 = &c->vp->program.Base.Instructions[insn];
-       brw_inst1 = inst1->Data;
-       switch (inst1->Opcode) {
-	   case OPCODE_CAL:
-	   case OPCODE_BRA:
-	       target_insn = inst1->BranchTarget;
-	       inst2 = &c->vp->program.Base.Instructions[target_insn];
-	       brw_inst2 = inst2->Data;
-	       offset = brw_inst2 - brw_inst1;
-	       brw_set_src1(brw_inst1, brw_imm_d(offset*16));
-	       break;
-	   case OPCODE_END:
-	       offset = end_inst - brw_inst1;
-	       brw_set_src1(brw_inst1, brw_imm_d(offset*16));
-	       break;
-	   default:
-	       break;
-       }
-   }
+   GLint offset;
+
+   brw_resolve_cals(&c->func);
+
+   /* patch up the END code to jump past subroutines, etc */
+   offset = last_inst - end_inst;
+   brw_set_src1(end_inst, brw_imm_d(offset * 16));
 }
 
+
 /* Emit the fragment program instructions here.
  */
 void brw_vs_emit(struct brw_vs_compile *c )
@@ -992,7 +989,8 @@ void brw_vs_emit(struct brw_vs_compile *c )
    struct brw_compile *p = &c->func;
    GLuint nr_insns = c->vp->program.Base.NumInstructions;
    GLuint insn, if_insn = 0;
-   struct brw_instruction *end_inst;
+   GLuint end_offset = 0;
+   struct brw_instruction *end_inst, *last_inst;
    struct brw_instruction *if_inst[MAX_IFSN];
    struct brw_indirect stack_index = brw_indirect(0, 0);   
 
@@ -1035,7 +1033,6 @@ void brw_vs_emit(struct brw_vs_compile *c )
       
       /* Get argument regs.  SWZ is special and does this itself.
        */
-      inst->Data = &p->store[p->nr_insn];
       if (inst->Opcode != OPCODE_SWZ)
 	  for (i = 0; i < 3; i++) {
 	      struct prog_src_register *src = &inst->SrcReg[i];
@@ -1203,7 +1200,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
 	 brw_set_access_mode(p, BRW_ALIGN_16);
 	 brw_ADD(p, get_addr_reg(stack_index),
 			 get_addr_reg(stack_index), brw_imm_d(4));
-	 inst->Data = &p->store[p->nr_insn];
+         brw_save_call(p, inst->Comment, p->nr_insn);
 	 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
          break;
       case OPCODE_RET:
@@ -1212,14 +1209,23 @@ void brw_vs_emit(struct brw_vs_compile *c )
 	 brw_set_access_mode(p, BRW_ALIGN_1);
          brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
 	 brw_set_access_mode(p, BRW_ALIGN_16);
+	 break;
       case OPCODE_END:	
+         end_offset = p->nr_insn;
+         /* this instruction will get patched later to jump past subroutine
+          * code, etc.
+          */
          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
          break;
       case OPCODE_PRINT:
+         /* no-op */
+         break;
       case OPCODE_BGNSUB:
+         brw_save_label(p, inst->Comment, p->nr_insn);
+         break;
       case OPCODE_ENDSUB:
-         /* no-op instructions */
-	 break;
+         /* no-op */
+         break;
       default:
 	 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
                        inst->Opcode, inst->Opcode < MAX_OPCODE ?
@@ -1257,9 +1263,11 @@ void brw_vs_emit(struct brw_vs_compile *c )
       release_tmps(c);
    }
 
-   end_inst = &p->store[p->nr_insn];
+   end_inst = &p->store[end_offset];
+   last_inst = &p->store[p->nr_insn];
+
+   /* The END instruction will be patched to jump to this code */
    emit_vertex_write(c);
-   post_vs_emit(c, end_inst);
-   for (insn = 0; insn < nr_insns; insn++)
-       c->vp->program.Base.Instructions[insn].Data = NULL;
+
+   post_vs_emit(c, end_inst, last_inst);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_vtbl.c b/src/mesa/drivers/dri/i965/brw_vtbl.c
index b501a59ccd..960bbb311e 100644
--- a/src/mesa/drivers/dri/i965/brw_vtbl.c
+++ b/src/mesa/drivers/dri/i965/brw_vtbl.c
@@ -67,11 +67,13 @@ static void brw_destroy_context( struct intel_context *intel )
    brw_destroy_state(brw);
    brw_draw_destroy( brw );
 
+   _mesa_free(brw->wm.compile_data);
+
    brw_FrameBufferTexDestroy( brw );
 
-   for (i = 0; i < brw->state.nr_draw_regions; i++)
-      intel_region_release(&brw->state.draw_regions[i]);
-   brw->state.nr_draw_regions = 0;
+   for (i = 0; i < brw->state.nr_color_regions; i++)
+      intel_region_release(&brw->state.color_regions[i]);
+   brw->state.nr_color_regions = 0;
    intel_region_release(&brw->state.depth_region);
 
    dri_bo_release(&brw->curbe.curbe_bo);
@@ -90,6 +92,7 @@ static void brw_destroy_context( struct intel_context *intel )
    dri_bo_release(&brw->wm.bind_bo);
    for (i = 0; i < BRW_WM_MAX_SURF; i++)
       dri_bo_release(&brw->wm.surf_bo[i]);
+   dri_bo_release(&brw->wm.sampler_bo);
    dri_bo_release(&brw->wm.prog_bo);
    dri_bo_release(&brw->wm.state_bo);
    dri_bo_release(&brw->cc.prog_bo);
@@ -102,25 +105,25 @@ static void brw_destroy_context( struct intel_context *intel )
  * called from intelDrawBuffer()
  */
 static void brw_set_draw_region( struct intel_context *intel, 
-                                 struct intel_region *draw_regions[],
+                                 struct intel_region *color_regions[],
                                  struct intel_region *depth_region,
-                                 GLuint num_regions)
+                                 GLuint num_color_regions)
 {
    struct brw_context *brw = brw_context(&intel->ctx);
-   int i;
+   GLuint i;
 
    /* release old color/depth regions */
    if (brw->state.depth_region != depth_region)
       brw->state.dirty.brw |= BRW_NEW_DEPTH_BUFFER;
-   for (i = 0; i < brw->state.nr_draw_regions; i++)
-       intel_region_release(&brw->state.draw_regions[i]);
+   for (i = 0; i < brw->state.nr_color_regions; i++)
+       intel_region_release(&brw->state.color_regions[i]);
    intel_region_release(&brw->state.depth_region);
 
    /* reference new color/depth regions */
-   for (i = 0; i < num_regions; i++)
-       intel_region_reference(&brw->state.draw_regions[i], draw_regions[i]);
+   for (i = 0; i < num_color_regions; i++)
+       intel_region_reference(&brw->state.color_regions[i], color_regions[i]);
    intel_region_reference(&brw->state.depth_region, depth_region);
-   brw->state.nr_draw_regions = num_regions;
+   brw->state.nr_color_regions = num_color_regions;
 }
 
 
@@ -181,23 +184,6 @@ static void brw_note_unlock( struct intel_context *intel )
 }
 
 
-void brw_do_flush( struct brw_context *brw, GLuint flags )
-{
-   struct brw_mi_flush flush;
-   memset(&flush, 0, sizeof(flush));      
-   flush.opcode = CMD_MI_FLUSH;
-   flush.flags = flags;
-   BRW_BATCH_STRUCT(brw, &flush);
-}
-
-
-static void brw_emit_flush( struct intel_context *intel, GLuint unused )
-{
-   brw_do_flush(brw_context(&intel->ctx),
-		BRW_FLUSH_STATE_CACHE|BRW_FLUSH_READ_CACHE);
-}
-
-
 /* called from intelWaitForIdle() and intelFlush()
  *
  * For now, just flush everything.  Could be smarter later.
@@ -234,6 +220,5 @@ void brwInitVtbl( struct brw_context *brw )
    brw->intel.vtbl.destroy = brw_destroy_context;
    brw->intel.vtbl.set_draw_region = brw_set_draw_region;
    brw->intel.vtbl.flush_cmd = brw_flush_cmd;
-   brw->intel.vtbl.emit_flush = brw_emit_flush;
    brw->intel.vtbl.debug_batch = brw_debug_batch;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index ea708a0681..1645ca0b70 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -80,6 +80,53 @@ GLuint brw_wm_is_scalar_result( GLuint opcode )
 }
 
 
+/**
+ * Do GPU code generation for non-GLSL shader.  non-GLSL shaders have
+ * no flow control instructions so we can more readily do SSA-style
+ * optimizations.
+ */
+static void
+brw_wm_non_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
+{
+   /* Augment fragment program.  Add instructions for pre- and
+    * post-fragment-program tasks such as interpolation and fogging.
+    */
+   brw_wm_pass_fp(c);
+
+   /* Translate to intermediate representation.  Build register usage
+    * chains.
+    */
+   brw_wm_pass0(c);
+
+   /* Dead code removal.
+    */
+   brw_wm_pass1(c);
+
+   /* Register allocation.
+    */
+   c->grf_limit = BRW_WM_MAX_GRF / 2;
+
+   brw_wm_pass2(c);
+
+   c->prog_data.total_grf = c->max_wm_grf;
+   if (c->last_scratch) {
+      c->prog_data.total_scratch = c->last_scratch + 0x40;
+   }
+   else {
+      c->prog_data.total_scratch = 0;
+   }
+
+   /* Emit GEN4 code.
+    */
+   brw_wm_emit(c);
+}
+
+
+/**
+ * All Mesa program -> GPU code generation goes through this function.
+ * Depending on the instructions used (i.e. flow control instructions)
+ * we'll use one of two code generators.
+ */
 static void do_wm_prog( struct brw_context *brw,
 			struct brw_fragment_program *fp, 
 			struct brw_wm_prog_key *key)
@@ -90,52 +137,32 @@ static void do_wm_prog( struct brw_context *brw,
 
    c = brw->wm.compile_data;
    if (c == NULL) {
-     brw->wm.compile_data = calloc(1, sizeof(*brw->wm.compile_data));
-     c = brw->wm.compile_data;
+      brw->wm.compile_data = calloc(1, sizeof(*brw->wm.compile_data));
+      c = brw->wm.compile_data;
    } else {
-     memset(c, 0, sizeof(*brw->wm.compile_data));
+      memset(c, 0, sizeof(*brw->wm.compile_data));
    }
    memcpy(&c->key, key, sizeof(*key));
 
    c->fp = fp;
    c->env_param = brw->intel.ctx.FragmentProgram.Parameters;
 
-    brw_init_compile(brw, &c->func);
-   if (brw_wm_is_glsl(&c->fp->program)) {
-       brw_wm_glsl_emit(brw, c);
-   } else {
-       /* Augment fragment program.  Add instructions for pre- and
-	* post-fragment-program tasks such as interpolation and fogging.
-	*/
-       brw_wm_pass_fp(c);
-
-       /* Translate to intermediate representation.  Build register usage
-	* chains.
-	*/
-       brw_wm_pass0(c);
-
-       /* Dead code removal.
-	*/
-       brw_wm_pass1(c);
-
-       /* Register allocation.
-	*/
-       c->grf_limit = BRW_WM_MAX_GRF/2;
-
-       brw_wm_pass2(c);
-
-       c->prog_data.total_grf = c->max_wm_grf;
-       if (c->last_scratch) {
-	   c->prog_data.total_scratch =
-	       c->last_scratch + 0x40;
-       } else {
-	   c->prog_data.total_scratch = 0;
-       }
-
-       /* Emit GEN4 code.
-	*/
-       brw_wm_emit(c);
+   brw_init_compile(brw, &c->func);
+
+   /* temporary sanity check assertion */
+   ASSERT(fp->isGLSL == brw_wm_is_glsl(&c->fp->program));
+
+   /*
+    * Shader which use GLSL features such as flow control are handled
+    * differently from "simple" shaders.
+    */
+   if (fp->isGLSL) {
+      brw_wm_glsl_emit(brw, c);
    }
+   else {
+      brw_wm_non_glsl_emit(brw, c);
+   }
+
    if (INTEL_DEBUG & DEBUG_WM)
       fprintf(stderr, "\n");
 
@@ -159,7 +186,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
 {
    GLcontext *ctx = &brw->intel.ctx;
    /* BRW_NEW_FRAGMENT_PROGRAM */
-   struct brw_fragment_program *fp = 
+   const struct brw_fragment_program *fp = 
       (struct brw_fragment_program *)brw->fragment_program;
    GLuint lookup = 0;
    GLuint line_aa;
@@ -174,7 +201,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
        ctx->Color.AlphaEnabled)
       lookup |= IZ_PS_KILL_ALPHATEST_BIT;
 
-   if (fp->program.Base.OutputsWritten & (1<<FRAG_RESULT_DEPR))
+   if (fp->program.Base.OutputsWritten & (1<<FRAG_RESULT_DEPTH))
       lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
 
    /* _NEW_DEPTH */
@@ -186,7 +213,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
       lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
 
    /* _NEW_STENCIL */
-   if (ctx->Stencil.Enabled) {
+   if (ctx->Stencil._Enabled) {
       lookup |= IZ_STENCIL_TEST_ENABLE_BIT;
 
       if (ctx->Stencil.WriteMask[0] ||
@@ -278,10 +305,8 @@ static void brw_wm_populate_key( struct brw_context *brw,
       key->drawable_height = brw->intel.driDrawable->h;
    }
 
-   /* Extra info:
-    */
+   /* The unique fragment program ID */
    key->program_string_id = fp->id;
-
 }
 
 
@@ -305,8 +330,6 @@ static void brw_prepare_wm_prog(struct brw_context *brw)
 }
 
 
-/* See brw_wm.c:
- */
 const struct brw_tracked_state brw_wm_prog = {
    .dirty = {
       .mesa  = (_NEW_COLOR |
diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h
index 0f46a25b1a..7f0e5702f2 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.h
+++ b/src/mesa/drivers/dri/i965/brw_wm.h
@@ -143,13 +143,12 @@ struct brw_wm_instruction {
    GLuint writemask:4;
    GLuint tex_unit:4;   /* texture unit for TEX, TXD, TXP instructions */
    GLuint tex_idx:3;    /* TEXTURE_1D,2D,3D,CUBE,RECT_INDEX source target */
+   GLuint tex_shadow:1; /* do shadow comparison? */
    GLuint eot:1;    	/* End of thread indicator for FB_WRITE*/
    GLuint target:10;    /* target binding table index for FB_WRITE*/
 };
 
 
-#define PROGRAM_INTERNAL_PARAM 
-
 #define BRW_WM_MAX_INSN  (MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS*3 + FRAG_ATTRIB_MAX + 3)
 #define BRW_WM_MAX_GRF   128		/* hardware limit */
 #define BRW_WM_MAX_VREG  (BRW_WM_MAX_INSN * 4)
@@ -240,13 +239,15 @@ struct brw_wm_compile {
    GLuint max_wm_grf;
    GLuint last_scratch;
 
+   /** Mapping from Mesa registers to hardware registers */
    struct {
 	GLboolean inited;
 	struct brw_reg reg;
    } wm_regs[PROGRAM_PAYLOAD+1][256][4];
+
    struct brw_reg stack;
    struct brw_reg emit_mask_reg;
-   GLuint reg_index;
+   GLuint reg_index;  /**< Index of next free GRF register */
    GLuint tmp_regs[BRW_WM_MAX_GRF];
    GLuint tmp_index;
    GLuint tmp_max;
@@ -281,4 +282,6 @@ void brw_wm_lookup_iz( GLuint line_aa,
 
 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp);
 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c);
+
+
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c
index b5050a3e40..f2dca9caa6 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c
@@ -671,7 +671,6 @@ static void emit_tex( struct brw_wm_compile *c,
 {
    struct brw_compile *p = &c->func;
    GLuint msgLength, responseLength;
-   GLboolean shadow = (c->key.shadowtex_mask & (1<<inst->tex_unit)) ? 1 : 0;
    GLuint i, nr;
    GLuint emit;
 
@@ -693,7 +692,7 @@ static void emit_tex( struct brw_wm_compile *c,
       break;
    }
 
-   if (shadow) {
+   if (inst->tex_shadow) {
       nr = 4;
       emit |= WRITEMASK_W;
    }
@@ -718,7 +717,7 @@ static void emit_tex( struct brw_wm_compile *c,
 	      inst->tex_unit + MAX_DRAW_BUFFERS, /* surface */
 	      inst->tex_unit,	  /* sampler */
 	      inst->writemask,
-	      (shadow ? 
+	      (inst->tex_shadow ? 
 	       BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE : 
 	       BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE),
 	      responseLength,
@@ -886,6 +885,9 @@ static void emit_aa( struct brw_wm_compile *c,
 
 /* Post-fragment-program processing.  Send the results to the
  * framebuffer.
+ * \param arg0  the fragment color
+ * \param arg1  the pass-through depth value
+ * \param arg2  the shader-computed depth value
  */
 static void emit_fb_write( struct brw_wm_compile *c,
 			   struct brw_reg *arg0,
diff --git a/src/mesa/drivers/dri/i965/brw_wm_fp.c b/src/mesa/drivers/dri/i965/brw_wm_fp.c
index ea3f3fc678..533be3858e 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_fp.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_fp.c
@@ -129,7 +129,7 @@ static struct prog_dst_register dst_reg(GLuint file, GLuint idx)
    reg.Index = idx;
    reg.WriteMask = WRITEMASK_XYZW;
    reg.RelAddr = 0;
-   reg.CondMask = 0;
+   reg.CondMask = COND_TR;
    reg.CondSwizzle = 0;
    reg.CondSrc = 0;
    reg.pad = 0;
@@ -183,16 +183,16 @@ static struct prog_instruction *emit_insn(struct brw_wm_compile *c,
 {
    struct prog_instruction *inst = get_fp_inst(c);
    *inst = *inst0;
-   inst->Data = (void *)inst0;
    return inst;
 }
 
-static struct prog_instruction * emit_op(struct brw_wm_compile *c,
+static struct prog_instruction * emit_tex_op(struct brw_wm_compile *c,
 				       GLuint op,
 				       struct prog_dst_register dest,
 				       GLuint saturate,
 				       GLuint tex_src_unit,
 				       GLuint tex_src_target,
+				       GLuint tex_shadow,
 				       struct prog_src_register src0,
 				       struct prog_src_register src1,
 				       struct prog_src_register src2 )
@@ -206,6 +206,7 @@ static struct prog_instruction * emit_op(struct brw_wm_compile *c,
    inst->SaturateMode = saturate;   
    inst->TexSrcUnit = tex_src_unit;
    inst->TexSrcTarget = tex_src_target;
+   inst->TexShadow = tex_shadow;
    inst->SrcReg[0] = src0;
    inst->SrcReg[1] = src1;
    inst->SrcReg[2] = src2;
@@ -213,6 +214,20 @@ static struct prog_instruction * emit_op(struct brw_wm_compile *c,
 }
    
 
+static struct prog_instruction * emit_op(struct brw_wm_compile *c,
+				       GLuint op,
+				       struct prog_dst_register dest,
+				       GLuint saturate,
+				       struct prog_src_register src0,
+				       struct prog_src_register src1,
+				       struct prog_src_register src2 )
+{
+   return emit_tex_op(c, op, dest, saturate,
+                      0, 0, 0,  /* tex unit, target, shadow */
+                      src0, src1, src2);
+}
+   
+
 
 
 /***********************************************************************
@@ -234,7 +249,7 @@ static struct prog_src_register get_pixel_xy( struct brw_wm_compile *c )
       emit_op(c,
 	      WM_PIXELXY,
 	      dst_mask(pixel_xy, WRITEMASK_XY),
-	      0, 0, 0,
+	      0,
 	      payload_r0_depth,
 	      src_undef(),
 	      src_undef());
@@ -257,7 +272,7 @@ static struct prog_src_register get_delta_xy( struct brw_wm_compile *c )
       emit_op(c,
 	      WM_DELTAXY,
 	      dst_mask(delta_xy, WRITEMASK_XY),
-	      0, 0, 0,
+	      0,
 	      pixel_xy, 
 	      payload_r0_depth,
 	      src_undef());
@@ -274,14 +289,13 @@ static struct prog_src_register get_pixel_w( struct brw_wm_compile *c )
       struct prog_dst_register pixel_w = get_temp(c);
       struct prog_src_register deltas = get_delta_xy(c);
       struct prog_src_register interp_wpos = src_reg(PROGRAM_PAYLOAD, FRAG_ATTRIB_WPOS);
-      
-      
+
       /* deltas.xyw = DELTAS2 deltas.xy, payload.interp_wpos.x
        */
       emit_op(c,
 	      WM_PIXELW,
 	      dst_mask(pixel_w, WRITEMASK_W),
-	      0, 0, 0,
+	      0,
 	      interp_wpos,
 	      deltas, 
 	      src_undef());
@@ -316,7 +330,7 @@ static void emit_interp( struct brw_wm_compile *c,
       emit_op(c,
 	      WM_WPOSXY,
 	      dst_mask(dst, WRITEMASK_XY),
-	      0, 0, 0,
+	      0,
 	      get_pixel_xy(c),
 	      src_undef(),
 	      src_undef());
@@ -328,7 +342,7 @@ static void emit_interp( struct brw_wm_compile *c,
       emit_op(c,
 	      WM_LINTERP,
 	      dst,
-	      0, 0, 0,
+	      0,
 	      interp,
 	      deltas,
 	      arg2);
@@ -339,7 +353,7 @@ static void emit_interp( struct brw_wm_compile *c,
 	 emit_op(c,
 		 WM_CINTERP,
 		 dst,
-		 0, 0, 0,
+		 0,
 		 interp,
 		 src_undef(),
 		 src_undef());
@@ -348,7 +362,7 @@ static void emit_interp( struct brw_wm_compile *c,
 	 emit_op(c,
 		 WM_LINTERP,
 		 dst,
-		 0, 0, 0,
+		 0,
 		 interp,
 		 deltas,
 		 src_undef());
@@ -358,7 +372,7 @@ static void emit_interp( struct brw_wm_compile *c,
       emit_op(c,
 	      WM_PINTERP,
 	      dst,
-	      0, 0, 0,
+	      0,
 	      interp,
 	      deltas,
 	      get_pixel_w(c));
@@ -378,7 +392,7 @@ static void emit_ddx( struct brw_wm_compile *c,
     emit_op(c,
             OPCODE_DDX,
             inst->DstReg,
-            0, 0, 0,
+            0,
             interp,
             get_pixel_w(c),
             src_undef());
@@ -394,7 +408,7 @@ static void emit_ddy( struct brw_wm_compile *c,
     emit_op(c,
             OPCODE_DDY,
             inst->DstReg,
-            0, 0, 0,
+            0,
             interp,
             get_pixel_w(c),
             src_undef());
@@ -489,13 +503,12 @@ static void precalc_dst( struct brw_wm_compile *c,
       emit_op(c,
 	      OPCODE_MUL,
 	      dst_mask(dst, WRITEMASK_Y),
-	      inst->SaturateMode, 0, 0,
+	      inst->SaturateMode,
 	      src0,
 	      src1,
 	      src_undef());
    }
 
-
    if (dst.WriteMask & WRITEMASK_XZ) {
       struct prog_instruction *swz;
       GLuint z = GET_SWZ(src0.Swizzle, Z);
@@ -505,7 +518,7 @@ static void precalc_dst( struct brw_wm_compile *c,
       swz = emit_op(c,
 		    OPCODE_SWZ,
 		    dst_mask(dst, WRITEMASK_XZ),
-		    inst->SaturateMode, 0, 0,
+		    inst->SaturateMode,
 		    src_swizzle(src0, SWIZZLE_ONE, z, z, z),
 		    src_undef(),
 		    src_undef());
@@ -518,7 +531,7 @@ static void precalc_dst( struct brw_wm_compile *c,
       emit_op(c,
 	      OPCODE_MOV,
 	      dst_mask(dst, WRITEMASK_W),
-	      inst->SaturateMode, 0, 0,
+	      inst->SaturateMode,
 	      src1,
 	      src_undef(),
 	      src_undef());
@@ -540,7 +553,7 @@ static void precalc_lit( struct brw_wm_compile *c,
       swz = emit_op(c,
 		    OPCODE_SWZ,
 		    dst_mask(dst, WRITEMASK_XW),
-		    0, 0, 0,
+		    0,
 		    src_swizzle1(src0, SWIZZLE_ONE),
 		    src_undef(),
 		    src_undef());
@@ -548,12 +561,11 @@ static void precalc_lit( struct brw_wm_compile *c,
       swz->SrcReg[0].NegateBase = 0;
    }
 
-
    if (dst.WriteMask & WRITEMASK_YZ) {
       emit_op(c,
 	      OPCODE_LIT,
 	      dst_mask(dst, WRITEMASK_YZ),
-	      inst->SaturateMode, 0, 0,
+	      inst->SaturateMode,
 	      src0,
 	      src_undef(),
 	      src_undef());
@@ -589,7 +601,7 @@ static void precalc_tex( struct brw_wm_compile *c,
        /* tmpcoord = src0 (i.e.: coord = src0) */
        out = emit_op(c, OPCODE_MOV,
                      tmpcoord,
-                     0, 0, 0,
+                     0,
                      src0,
                      src_undef(),
                      src_undef());
@@ -599,7 +611,7 @@ static void precalc_tex( struct brw_wm_compile *c,
        /* tmp0 = MAX(coord.X, coord.Y) */
        emit_op(c, OPCODE_MAX,
                tmp0,
-               0, 0, 0,
+               0,
                src_swizzle1(coord, X),
                src_swizzle1(coord, Y),
                src_undef());
@@ -607,7 +619,7 @@ static void precalc_tex( struct brw_wm_compile *c,
        /* tmp1 = MAX(tmp0, coord.Z) */
        emit_op(c, OPCODE_MAX,
                tmp1,
-               0, 0, 0,
+               0,
                tmp0src,
                src_swizzle1(coord, Z),
                src_undef());
@@ -615,7 +627,7 @@ static void precalc_tex( struct brw_wm_compile *c,
        /* tmp0 = 1 / tmp1 */
        emit_op(c, OPCODE_RCP,
                tmp0,
-               0, 0, 0,
+               0,
                tmp1src,
                src_undef(),
                src_undef());
@@ -623,7 +635,7 @@ static void precalc_tex( struct brw_wm_compile *c,
        /* tmpCoord = src0 * tmp0 */
        emit_op(c, OPCODE_MUL,
                tmpcoord,
-               0, 0, 0,
+               0,
                src0,
                tmp0src,
                src_undef());
@@ -646,7 +658,7 @@ static void precalc_tex( struct brw_wm_compile *c,
       emit_op(c,
 	      OPCODE_MUL,
 	      tmpcoord,
-	      0, 0, 0,
+	      0,
 	      inst->SrcReg[0],
 	      scale,
 	      src_undef());
@@ -686,22 +698,23 @@ static void precalc_tex( struct brw_wm_compile *c,
      
       /* tmp     = TEX ...
        */
-      emit_op(c, 
-	      OPCODE_TEX,
-	      tmp,
-	      inst->SaturateMode,
-	      unit,
-	      inst->TexSrcTarget,
-	      coord,
-	      src_undef(),
-	      src_undef());
+      emit_tex_op(c, 
+                  OPCODE_TEX,
+                  tmp,
+                  inst->SaturateMode,
+                  unit,
+                  inst->TexSrcTarget,
+                  inst->TexShadow,
+                  coord,
+                  src_undef(),
+                  src_undef());
 
       /* tmp.xyz =  ADD TMP, C0
        */
       emit_op(c,
 	      OPCODE_ADD,
 	      dst_mask(tmp, WRITEMASK_XYZ),
-	      0, 0, 0,
+	      0,
 	      tmpsrc,
 	      C0,
 	      src_undef());
@@ -712,7 +725,7 @@ static void precalc_tex( struct brw_wm_compile *c,
       emit_op(c,
 	      OPCODE_MUL,
 	      dst_mask(tmp, WRITEMASK_Y),
-	      0, 0, 0,
+	      0,
 	      tmpsrc,
 	      src_swizzle1(C0, W),
 	      src_undef());
@@ -727,7 +740,7 @@ static void precalc_tex( struct brw_wm_compile *c,
       emit_op(c,
 	      OPCODE_MAD,
 	      dst_mask(dst, WRITEMASK_XYZ),
-	      0, 0, 0,
+	      0,
 	      swap_uv?src_swizzle(tmpsrc, Z,Z,X,X):src_swizzle(tmpsrc, X,X,Z,Z),
 	      C1,
 	      src_swizzle1(tmpsrc, Y));
@@ -737,7 +750,7 @@ static void precalc_tex( struct brw_wm_compile *c,
       emit_op(c,
 	      OPCODE_MAD,
 	      dst_mask(dst, WRITEMASK_Y),
-	      0, 0, 0,
+	      0,
 	      src_swizzle1(tmpsrc, Z),
 	      src_swizzle1(C1, W),
 	      src_swizzle1(src_reg_from_dst(dst), Y));
@@ -746,15 +759,16 @@ static void precalc_tex( struct brw_wm_compile *c,
    }
    else {
       /* ordinary RGBA tex instruction */
-      emit_op(c, 
-	      OPCODE_TEX,
-	      inst->DstReg,
-	      inst->SaturateMode,
-	      unit,
-	      inst->TexSrcTarget,
-	      coord,
-	      src_undef(),
-	      src_undef());
+      emit_tex_op(c, 
+                  OPCODE_TEX,
+                  inst->DstReg,
+                  inst->SaturateMode,
+                  unit,
+                  inst->TexSrcTarget,
+                  inst->TexShadow,
+                  coord,
+                  src_undef(),
+                  src_undef());
    }
 
    /* For GL_EXT_texture_swizzle: */
@@ -764,7 +778,6 @@ static void precalc_tex( struct brw_wm_compile *c,
       emit_op(c, OPCODE_SWZ,
               inst->DstReg,
               SATURATE_OFF, /* saturate already done above */
-              0, 0,   /* tex unit, target N/A */
               src_swizzle4(tmpsrc, c->key.tex_swizzles[unit]),
               src_undef(),
               src_undef());
@@ -813,7 +826,7 @@ static void precalc_txp( struct brw_wm_compile *c,
       emit_op(c,
 	      OPCODE_RCP,
 	      dst_mask(tmp, WRITEMASK_W),
-	      0, 0, 0,
+	      0,
 	      src_swizzle1(src0, GET_SWZ(src0.Swizzle, W)),
 	      src_undef(),
 	      src_undef());
@@ -823,7 +836,7 @@ static void precalc_txp( struct brw_wm_compile *c,
       emit_op(c,
 	      OPCODE_MUL,
 	      dst_mask(tmp, WRITEMASK_XYZ),
-	      0, 0, 0,
+	      0,
 	      src0,
 	      src_swizzle1(src_reg_from_dst(tmp), W),
 	      src_undef());
@@ -849,42 +862,41 @@ static void precalc_txp( struct brw_wm_compile *c,
 static void emit_fb_write( struct brw_wm_compile *c )
 {
    struct prog_src_register payload_r0_depth = src_reg(PROGRAM_PAYLOAD, PAYLOAD_DEPTH);
-   struct prog_src_register outdepth = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DEPR);
+   struct prog_src_register outdepth = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DEPTH);
    struct prog_src_register outcolor;
    GLuint i;
 
    struct prog_instruction *inst, *last_inst;
    struct brw_context *brw = c->func.brw;
 
-   /* inst->Sampler is not used by backend, 
-      use it for fb write target and eot */
-
-   if (brw->state.nr_draw_regions > 1) {
-       for (i = 0 ; i < brw->state.nr_draw_regions; i++) {
-	   outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DATA0 + i);
-	   last_inst = inst = emit_op(c,
-		   WM_FB_WRITE, dst_mask(dst_undef(),0), 0, 0, 0,
-		   outcolor, payload_r0_depth, outdepth);
-	   inst->Sampler = (i<<1);
-	   if (c->fp_fragcolor_emitted) {
-	       outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLR);
-	       last_inst = inst = emit_op(c, WM_FB_WRITE, dst_mask(dst_undef(),0),
-		       0, 0, 0, outcolor, payload_r0_depth, outdepth);
-	       inst->Sampler = (i<<1);
-	   }
-       }
-       last_inst->Sampler |= 1; //eot
+   /* The inst->Aux field is used for FB write target and the EOT marker */
+
+   if (brw->state.nr_color_regions > 1) {
+      for (i = 0 ; i < brw->state.nr_color_regions; i++) {
+         outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DATA0 + i);
+         last_inst = inst = emit_op(c,
+                                    WM_FB_WRITE, dst_mask(dst_undef(),0), 0,
+                                    outcolor, payload_r0_depth, outdepth);
+         inst->Aux = (i<<1);
+         if (c->fp_fragcolor_emitted) {
+            outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLOR);
+            last_inst = inst = emit_op(c, WM_FB_WRITE, dst_mask(dst_undef(),0),
+                                       0, outcolor, payload_r0_depth, outdepth);
+            inst->Aux = (i<<1);
+         }
+      }
+      last_inst->Aux |= 1; //eot
    }
    else {
       /* if gl_FragData[0] is written, use it, else use gl_FragColor */
       if (c->fp->program.Base.OutputsWritten & (1 << FRAG_RESULT_DATA0))
          outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DATA0);
       else 
-         outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLR);
+         outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLOR);
 
-       inst = emit_op(c, WM_FB_WRITE, dst_mask(dst_undef(),0),
-	       0, 0, 0, outcolor, payload_r0_depth, outdepth);
-       inst->Sampler = 1|(0<<1);
+      inst = emit_op(c, WM_FB_WRITE, dst_mask(dst_undef(),0),
+                     0, outcolor, payload_r0_depth, outdepth);
+      inst->Aux = 1|(0<<1);
    }
 }
 
@@ -915,9 +927,9 @@ static void validate_dst_regs( struct brw_wm_compile *c,
 			       const struct prog_instruction *inst )
 {
    if (inst->DstReg.File == PROGRAM_OUTPUT) {
-       GLuint idx = inst->DstReg.Index;
-       if (idx == FRAG_RESULT_COLR)
-	   c->fp_fragcolor_emitted = 1;
+      GLuint idx = inst->DstReg.Index;
+      if (idx == FRAG_RESULT_COLOR)
+         c->fp_fragcolor_emitted = 1;
    }
 }
 
@@ -937,11 +949,15 @@ static void print_insns( const struct prog_instruction *insn,
 				     3);
       }
       else 
-	 _mesa_printf("UNKNOWN\n");
-	   
+	 _mesa_printf("965 Opcode %d\n", insn->Opcode);
    }
 }
 
+
+/**
+ * Initial pass for fragment program code generation.
+ * This function is used by both the GLSL and non-GLSL paths.
+ */
 void brw_wm_pass_fp( struct brw_wm_compile *c )
 {
    struct brw_fragment_program *fp = c->fp;
@@ -958,15 +974,19 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
    c->pixel_w = src_undef();
    c->nr_fp_insns = 0;
 
-   /* Emit preamble instructions:
+   /* Emit preamble instructions.  This is where special instructions such as
+    * WM_CINTERP, WM_LINTERP, WM_PINTERP and WM_WPOSXY are emitted to
+    * compute shader inputs from varying vars.
     */
-
-
    for (insn = 0; insn < fp->program.Base.NumInstructions; insn++) {
       const struct prog_instruction *inst = &fp->program.Base.Instructions[insn];
       validate_src_regs(c, inst);
       validate_dst_regs(c, inst);
    }
+
+   /* Loop over all instructions doing assorted simplifications and
+    * transformations.
+    */
    for (insn = 0; insn < fp->program.Base.NumInstructions; insn++) {
       const struct prog_instruction *inst = &fp->program.Base.Instructions[insn];
       struct prog_instruction *out;
@@ -975,7 +995,6 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
        * necessary:
        */
 
-
       switch (inst->Opcode) {
       case OPCODE_SWZ: 
 	 out = emit_insn(c, inst);
@@ -1055,9 +1074,9 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
    }
 
    if (INTEL_DEBUG & DEBUG_WM) {
-	   _mesa_printf("pass_fp:\n");
-	   print_insns( c->prog_instructions, c->nr_fp_insns );
-	   _mesa_printf("\n");
+      _mesa_printf("pass_fp:\n");
+      print_insns( c->prog_instructions, c->nr_fp_insns );
+      _mesa_printf("\n");
    }
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_glsl.c b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
index 8fd776ac39..4cf092226c 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_glsl.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
@@ -8,12 +8,17 @@ enum _subroutine {
     SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
 };
 
-/* Only guess, need a flag in gl_fragment_program later */
+
+/**
+ * Determine if the given fragment program uses GLSL features such
+ * as flow conditionals, loops, subroutines.
+ * Some GLSL shaders may use these features, others might not.
+ */
 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
 {
     int i;
     for (i = 0; i < fp->Base.NumInstructions; i++) {
-	struct prog_instruction *inst = &fp->Base.Instructions[i];
+	const struct prog_instruction *inst = &fp->Base.Instructions[i];
 	switch (inst->Opcode) {
 	    case OPCODE_IF:
 	    case OPCODE_TRUNC:
@@ -36,6 +41,10 @@ GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
     return GL_FALSE; 
 }
 
+
+/**
+ * Record the mapping of a Mesa register to a hardware register.
+ */
 static void set_reg(struct brw_wm_compile *c, int file, int index, 
 	int component, struct brw_reg reg)
 {
@@ -43,6 +52,10 @@ static void set_reg(struct brw_wm_compile *c, int file, int index,
     c->wm_regs[file][index][component].inited = GL_TRUE;
 }
 
+/**
+ * Examine instruction's write mask to find index of first component
+ * enabled for writing.
+ */
 static int get_scalar_dst_index(struct prog_instruction *inst)
 {
     int i;
@@ -62,6 +75,10 @@ static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
     return reg;
 }
 
+/**
+ * Save current temp register info.
+ * There must be a matching call to release_tmps().
+ */
 static int mark_tmps(struct brw_wm_compile *c)
 {
     return c->tmp_index;
@@ -77,8 +94,22 @@ static void release_tmps(struct brw_wm_compile *c, int mark)
     c->tmp_index = mark;
 }
 
+/**
+ * Convert Mesa src register to brw register.
+ *
+ * Since we're running in SOA mode each Mesa register corresponds to four
+ * hardware registers.  We allocate the hardware registers as needed here.
+ *
+ * \param file  register file, one of PROGRAM_x
+ * \param index  register number
+ * \param component  src component (X=0, Y=1, Z=2, W=3)
+ * \param nr  not used?!?
+ * \param neg  negate value?
+ * \param abs  take absolute value?
+ */
 static struct brw_reg 
-get_reg(struct brw_wm_compile *c, int file, int index, int component, int nr, GLuint neg, GLuint abs)
+get_reg(struct brw_wm_compile *c, int file, int index, int component,
+        int nr, GLuint neg, GLuint abs)
 {
     struct brw_reg reg;
     switch (file) {
@@ -89,21 +120,46 @@ get_reg(struct brw_wm_compile *c, int file, int index, int component, int nr, GL
 	    break;
 	case PROGRAM_UNDEFINED:
 	    return brw_null_reg();	
-	default:
+	case PROGRAM_TEMPORARY:
+	case PROGRAM_INPUT:
+	case PROGRAM_OUTPUT:
+	case PROGRAM_PAYLOAD:
 	    break;
+	default:
+	    _mesa_problem(NULL, "Unexpected file in get_reg()");
+	    return brw_null_reg();
     }
 
-    if(c->wm_regs[file][index][component].inited)
+    /* see if we've already allocated a HW register for this Mesa register */
+    if (c->wm_regs[file][index][component].inited) {
+	/* yes, re-use */
 	reg = c->wm_regs[file][index][component].reg;
-    else 
+    }
+    else {
+	/* no, allocate new register */
 	reg = brw_vec8_grf(c->reg_index, 0);
+    }
 
-    if(!c->wm_regs[file][index][component].inited) {
+    /* if this is a new register allocation, record it in the table */
+    if (!c->wm_regs[file][index][component].inited) {
 	set_reg(c, file, index, component, reg);
 	c->reg_index++;
     }
 
-    if (neg & (1<< component)) {
+    if (c->reg_index >= BRW_WM_MAX_GRF - 12) {
+	/* ran out of temporary registers! */
+#if 1
+        /* This is a big hack for now.
+         * Return bad register index, just don't hang the GPU.
+         */
+        _mesa_fprintf(stderr, "out of regs %d\n", c->reg_index);
+        c->reg_index = BRW_WM_MAX_GRF - 13;
+#else
+	return brw_null_reg();
+#endif
+    }
+ 
+    if (neg & (1 << component)) {
 	reg = negate(reg);
     }
     if (abs)
@@ -111,6 +167,12 @@ get_reg(struct brw_wm_compile *c, int file, int index, int component, int nr, GL
     return reg;
 }
 
+
+/**
+ * Preallocate registers.  This sets up the Mesa to hardware register
+ * mapping for certain registers, such as constants (uniforms/state vars)
+ * and shader inputs.
+ */
 static void prealloc_reg(struct brw_wm_compile *c)
 {
     int i, j;
@@ -119,29 +181,42 @@ static void prealloc_reg(struct brw_wm_compile *c)
     GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
 
     for (i = 0; i < 4; i++) {
-	reg = (i < c->key.nr_depth_regs) 
-	    ? brw_vec8_grf(i*2, 0) : brw_vec8_grf(0, 0);
+        if (i < c->key.nr_depth_regs) 
+            reg = brw_vec8_grf(i * 2, 0);
+        else
+            reg = brw_vec8_grf(0, 0);
 	set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
     }
-    c->reg_index += 2*c->key.nr_depth_regs;
+    c->reg_index += 2 * c->key.nr_depth_regs;
+
+    /* constants */
     {
-	int nr_params = c->fp->program.Base.Parameters->NumParameters;
-	struct gl_program_parameter_list *plist = 
+        const int nr_params = c->fp->program.Base.Parameters->NumParameters;
+        const struct gl_program_parameter_list *plist = 
 	    c->fp->program.Base.Parameters;
 	int index = 0;
-	c->prog_data.nr_params = 4*nr_params;
+
+        /* number of float constants */
+	c->prog_data.nr_params = 4 * nr_params;
+
+        /* loop over program constants (float[4]) */
 	for (i = 0; i < nr_params; i++) {
-	    for (j = 0; j < 4; j++, index++) {
-		reg = brw_vec1_grf(c->reg_index + index/8, 
-			index%8);
-		c->prog_data.param[index] = 
-		    &plist->ParameterValues[i][j];
-		set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
+            /* loop over XYZW channels */
+            for (j = 0; j < 4; j++, index++) {
+                reg = brw_vec1_grf(c->reg_index + index / 8, index % 8);
+                /* Save pointer to parameter/constant value.
+                 * Constants will be copied in prepare_constant_buffer()
+                 */
+                c->prog_data.param[index] = &plist->ParameterValues[i][j];
+                set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
 	    }
 	}
-	c->nr_creg = 2*((4*nr_params+15)/16);
+        /* number of constant regs used (each reg is float[8]) */
+	c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
 	c->reg_index += c->nr_creg;
     }
+
+    /* fragment shader inputs */
     for (i = 0; i < FRAG_ATTRIB_MAX; i++) {
 	if (inputs & (1<<i)) {
 	    nr_interp_regs++;
@@ -149,9 +224,9 @@ static void prealloc_reg(struct brw_wm_compile *c)
 	    for (j = 0; j < 4; j++)
 		set_reg(c, PROGRAM_PAYLOAD, i, j, reg);
 	    c->reg_index += 2;
-
 	}
     }
+
     c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
     c->prog_data.urb_read_length = nr_interp_regs * 2;
     c->prog_data.curb_read_length = c->nr_creg;
@@ -161,6 +236,10 @@ static void prealloc_reg(struct brw_wm_compile *c)
     c->reg_index += 2;
 }
 
+
+/**
+ * Convert Mesa dst register to brw register.
+ */
 static struct brw_reg get_dst_reg(struct brw_wm_compile *c, 
 	struct prog_instruction *inst, int component, int nr)
 {
@@ -168,6 +247,10 @@ static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
 	    0, 0);
 }
 
+
+/**
+ * Convert Mesa src register to brw register.
+ */
 static struct brw_reg get_src_reg(struct brw_wm_compile *c, 
 	struct prog_src_register *src, int index, int nr)
 {
@@ -176,13 +259,15 @@ static struct brw_reg get_src_reg(struct brw_wm_compile *c,
 	    src->NegateBase, src->Abs);
 }
 
-/* Subroutines are minimal support for resusable instruction sequences.
-   They are implemented as simply as possible to minimise overhead: there
-   is no explicit support for communication between the caller and callee
-   other than saving the return address in a temporary register, nor is
-   there any automatic local storage.  This implies that great care is
-   required before attempting reentrancy or any kind of nested
-   subroutine invocations. */
+/**
+ * Subroutines are minimal support for resusable instruction sequences.
+ * They are implemented as simply as possible to minimise overhead: there
+ * is no explicit support for communication between the caller and callee
+ * other than saving the return address in a temporary register, nor is
+ * there any automatic local storage.  This implies that great care is
+ * required before attempting reentrancy or any kind of nested
+ * subroutine invocations.
+ */
 static void invoke_subroutine( struct brw_wm_compile *c,
 			       enum _subroutine subroutine,
 			       void (*emit)( struct brw_wm_compile * ) )
@@ -319,11 +404,10 @@ static void emit_pixel_xy(struct brw_wm_compile *c,
 		stride(suboffset(r1_uw, 5), 2, 4, 0),
 		brw_imm_v(0x11001100));
     }
-
 }
 
 static void emit_delta_xy(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                          struct prog_instruction *inst)
 {
     struct brw_reg r1 = brw_vec1_grf(1, 0);
     struct brw_reg dst0, dst1, src0, src1;
@@ -351,10 +435,8 @@ static void emit_delta_xy(struct brw_wm_compile *c,
 		negate(suboffset(r1,1)));
 
     }
-
 }
 
-
 static void fire_fb_write( struct brw_wm_compile *c,
                            GLuint base_reg,
                            GLuint nr,
@@ -397,33 +479,59 @@ static void emit_fb_write(struct brw_wm_compile *c,
      */
     if (c->key.aa_dest_stencil_reg)
 	nr += 1;
-    {
-	brw_push_insn_state(p);
-	for (channel = 0; channel < 4; channel++) {
-	    src0 = get_src_reg(c,  &inst->SrcReg[0], channel, 1);
-	    /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
-	    /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
-	    brw_MOV(p, brw_message_reg(nr + channel), src0);
-	}
-	/* skip over the regs populated above: */
-	nr += 8;
-	brw_pop_insn_state(p);
+
+    brw_push_insn_state(p);
+    for (channel = 0; channel < 4; channel++) {
+        src0 = get_src_reg(c,  &inst->SrcReg[0], channel, 1);
+        /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
+        /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
+        brw_MOV(p, brw_message_reg(nr + channel), src0);
     }
+    /* skip over the regs populated above: */
+    nr += 8;
+    brw_pop_insn_state(p);
 
-   if (c->key.source_depth_to_render_target)
-   {
-      if (c->key.computes_depth) {
-         src0 = get_src_reg(c, &inst->SrcReg[2], 2, 1);
-         brw_MOV(p, brw_message_reg(nr), src0);
-      } else {
-         src0 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
-         brw_MOV(p, brw_message_reg(nr), src0);
-      }
-
-      nr += 2;
+    if (c->key.source_depth_to_render_target) {
+       if (c->key.computes_depth) {
+          src0 = get_src_reg(c, &inst->SrcReg[2], 2, 1);
+          brw_MOV(p, brw_message_reg(nr), src0);
+       }
+       else {
+          src0 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
+          brw_MOV(p, brw_message_reg(nr), src0);
+       }
+
+       nr += 2;
+    }
+
+    if (c->key.dest_depth_reg) {
+        GLuint comp = c->key.dest_depth_reg / 2;
+        GLuint off = c->key.dest_depth_reg % 2;
+
+        assert(comp == 1);
+        assert(off == 0);
+#if 0
+        /* XXX do we need this code?   comp always 1, off always 0, it seems */
+        if (off != 0) {
+            brw_push_insn_state(p);
+            brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+
+            brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
+            /* 2nd half? */
+            brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
+            brw_pop_insn_state(p);
+        }
+        else
+#endif
+        {
+           struct brw_reg src =  get_src_reg(c, &inst->SrcReg[1], 1, 1);
+           brw_MOV(p, brw_message_reg(nr), src);
+        }
+        nr += 2;
    }
-    target = inst->Sampler >> 1;
-    eot = inst->Sampler & 1;
+
+    target = inst->Aux >> 1;
+    eot = inst->Aux & 1;
     fire_fb_write(c, 0, nr, target, eot);
 }
 
@@ -465,12 +573,12 @@ static void emit_linterp(struct brw_wm_compile *c,
     struct brw_reg interp[4];
     struct brw_reg dst, delta0, delta1;
     struct brw_reg src0;
+    GLuint nr, i;
 
     src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
     delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
     delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
-    GLuint nr = src0.nr;
-    int i;
+    nr = src0.nr;
 
     interp[0] = brw_vec1_grf(nr, 0);
     interp[1] = brw_vec1_grf(nr, 4);
@@ -494,10 +602,10 @@ static void emit_cinterp(struct brw_wm_compile *c,
 
     struct brw_reg interp[4];
     struct brw_reg dst, src0;
+    GLuint nr, i;
 
     src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-    GLuint nr = src0.nr;
-    int i;
+    nr = src0.nr;
 
     interp[0] = brw_vec1_grf(nr, 0);
     interp[1] = brw_vec1_grf(nr, 4);
@@ -521,13 +629,13 @@ static void emit_pinterp(struct brw_wm_compile *c,
     struct brw_reg interp[4];
     struct brw_reg dst, delta0, delta1;
     struct brw_reg src0, w;
+    GLuint nr, i;
 
     src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
     delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
     delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
     w = get_src_reg(c, &inst->SrcReg[2], 3, 1);
-    GLuint nr = src0.nr;
-    int i;
+    nr = src0.nr;
 
     interp[0] = brw_vec1_grf(nr, 0);
     interp[1] = brw_vec1_grf(nr, 4);
@@ -627,23 +735,46 @@ static void emit_dph(struct brw_wm_compile *c,
     brw_set_saturate(p, 0);
 }
 
+/**
+ * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
+ * Note that the result of the function is smeared across the dest
+ * register's X, Y, Z and W channels (subject to writemasking of course).
+ */
 static void emit_math1(struct brw_wm_compile *c,
 		struct prog_instruction *inst, GLuint func)
 {
     struct brw_compile *p = &c->func;
-    struct brw_reg src0, dst;
+    struct brw_reg src0, dst, tmp;
+    const int mark = mark_tmps( c );
+    int i;
+
+    tmp = alloc_tmp(c);
 
+    /* Get first component of source register */
     src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
+
+    /* tmp = func(src0) */
     brw_MOV(p, brw_message_reg(2), src0);
     brw_math(p,
-	    dst,
-	    func,
-	    (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
-	    2,
-	    brw_null_reg(),
-	    BRW_MATH_DATA_VECTOR,
-	    BRW_MATH_PRECISION_FULL);
+             tmp,
+             func,
+             (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
+             2,
+             brw_null_reg(),
+             BRW_MATH_DATA_VECTOR,
+             BRW_MATH_PRECISION_FULL);
+
+    /*tmp.dw1.bits.swizzle = SWIZZLE_XXXX;*/
+
+    /* replicate tmp value across enabled dest channels */
+    for (i = 0; i < 4; i++) {
+       if (inst->DstReg.WriteMask & (1 << i)) {
+          dst = get_dst_reg(c, inst, i, 1);    
+          brw_MOV(p, dst, tmp);
+       }
+    }
+
+    release_tmps(c, mark);
 }
 
 static void emit_rcp(struct brw_wm_compile *c,
@@ -1045,23 +1176,23 @@ static void emit_ddy(struct brw_wm_compile *c,
     brw_set_saturate(p, 0);
 }
 
-static __inline struct brw_reg high_words( struct brw_reg reg )
+static INLINE struct brw_reg high_words( struct brw_reg reg )
 {
     return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
 		   0, 8, 2 );
 }
 
-static __inline struct brw_reg low_words( struct brw_reg reg )
+static INLINE struct brw_reg low_words( struct brw_reg reg )
 {
     return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
 }
 
-static __inline struct brw_reg even_bytes( struct brw_reg reg )
+static INLINE struct brw_reg even_bytes( struct brw_reg reg )
 {
     return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
 }
 
-static __inline struct brw_reg odd_bytes( struct brw_reg reg )
+static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
 {
     return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
 		   0, 16, 2 );
@@ -1366,9 +1497,11 @@ static void emit_noise2( struct brw_wm_compile *c,
     release_tmps( c, mark );
 }
 
-/* The three-dimensional case is much like the one- and two- versions above,
-   but since the number of corners is rapidly growing we now pack 16 16-bit
-   hashes into each register to extract more parallelism from the EUs. */
+/**
+ * The three-dimensional case is much like the one- and two- versions above,
+ * but since the number of corners is rapidly growing we now pack 16 16-bit
+ * hashes into each register to extract more parallelism from the EUs.
+ */
 static void noise3_sub( struct brw_wm_compile *c ) {
 
     struct brw_compile *p = &c->func;
@@ -1670,13 +1803,15 @@ static void emit_noise3( struct brw_wm_compile *c,
     release_tmps( c, mark );
 }
     
-/* For the four-dimensional case, the little micro-optimisation benefits
-   we obtain by unrolling all the loops aren't worth the massive bloat it
-   now causes.  Instead, we loop twice around performing a similar operation
-   to noise3, once for the w=0 cube and once for the w=1, with a bit more
-   code to glue it all together. */
-static void noise4_sub( struct brw_wm_compile *c ) {
-
+/**
+ * For the four-dimensional case, the little micro-optimisation benefits
+ * we obtain by unrolling all the loops aren't worth the massive bloat it
+ * now causes.  Instead, we loop twice around performing a similar operation
+ * to noise3, once for the w=0 cube and once for the w=1, with a bit more
+ * code to glue it all together.
+ */
+static void noise4_sub( struct brw_wm_compile *c )
+{
     struct brw_compile *p = &c->func;
     struct brw_reg param[ 4 ],
 	x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
@@ -2244,28 +2379,12 @@ static void emit_tex(struct brw_wm_compile *c,
 	brw_MOV(p, dst[3], brw_imm_f(1.0));
 }
 
+/**
+ * Resolve subroutine calls after code emit is done.
+ */
 static void post_wm_emit( struct brw_wm_compile *c )
 {
-    GLuint nr_insns = c->fp->program.Base.NumInstructions;
-    GLuint insn, target_insn;
-    struct prog_instruction *inst1, *inst2;
-    struct brw_instruction *brw_inst1, *brw_inst2;
-    int offset;
-    for (insn = 0; insn < nr_insns; insn++) {
-	inst1 = &c->fp->program.Base.Instructions[insn];
-	brw_inst1 = inst1->Data;
-	switch (inst1->Opcode) {
-	    case OPCODE_CAL:
-		target_insn = inst1->BranchTarget;
-		inst2 = &c->fp->program.Base.Instructions[target_insn];
-		brw_inst2 = inst2->Data;
-		offset = brw_inst2 - brw_inst1;
-		brw_set_src1(brw_inst1, brw_imm_d(offset*16));
-		break;
-	    default:
-		break;
-	}
-    }
+    brw_resolve_cals(&c->func);
 }
 
 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
@@ -2285,10 +2404,6 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 
     for (i = 0; i < c->nr_fp_insns; i++) {
 	struct prog_instruction *inst = &c->prog_instructions[i];
-	struct prog_instruction *orig_inst;
-
-	if ((orig_inst = inst->Data) != 0)
-	    orig_inst->Data = current_insn(p);
 
 	if (inst->CondUpdate)
 	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
@@ -2446,7 +2561,10 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 		brw_ENDIF(p, if_inst[--if_insn]);
 		break;
 	    case OPCODE_BGNSUB:
+		brw_save_label(p, inst->Comment, p->nr_insn);
+		break;
 	    case OPCODE_ENDSUB:
+		/* no-op */
 		break;
 	    case OPCODE_CAL: 
 		brw_push_insn_state(p);
@@ -2456,8 +2574,7 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
                 brw_set_access_mode(p, BRW_ALIGN_16);
                 brw_ADD(p, get_addr_reg(stack_index),
                          get_addr_reg(stack_index), brw_imm_d(4));
-                orig_inst = inst->Data;
-                orig_inst->Data = &p->store[p->nr_insn];
+		brw_save_call(&c->func, inst->Comment, p->nr_insn);
                 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
                 brw_pop_insn_state(p);
 		break;
@@ -2510,14 +2627,34 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 	    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
     }
     post_wm_emit(c);
-    for (i = 0; i < c->fp->program.Base.NumInstructions; i++)
-	c->fp->program.Base.Instructions[i].Data = NULL;
+
+    if (c->reg_index >= BRW_WM_MAX_GRF) {
+        _mesa_problem(NULL, "Ran out of registers in brw_wm_emit_glsl()");
+        /* XXX we need to do some proper error recovery here */
+    }
 }
 
+
+/**
+ * Do GPU code generation for shaders that use GLSL features such as
+ * flow control.  Other shaders will be compiled with the 
+ */
 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
 {
+    if (INTEL_DEBUG & DEBUG_WM) {
+        _mesa_printf("brw_wm_glsl_emit:\n");
+    }
+
+    /* initial instruction translation/simplification */
     brw_wm_pass_fp(c);
+
+    /* actual code generation */
     brw_wm_emit_glsl(brw, c);
+
+    if (INTEL_DEBUG & DEBUG_WM) {
+        brw_wm_print_program(c, "brw_wm_glsl_emit done");
+    }
+
     c->prog_data.total_grf = c->reg_index;
     c->prog_data.total_scratch = 0;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass0.c b/src/mesa/drivers/dri/i965/brw_wm_pass0.c
index 590cd946ec..2debd0678a 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass0.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass0.c
@@ -334,8 +334,9 @@ static struct brw_wm_ref *get_new_ref( struct brw_wm_compile *c,
 }
 
 
-static struct brw_wm_instruction *translate_insn( struct brw_wm_compile *c,
-						  const struct prog_instruction *inst )
+static void
+translate_insn(struct brw_wm_compile *c,
+               const struct prog_instruction *inst)
 {
    struct brw_wm_instruction *out = get_instruction(c);
    GLuint writemask = inst->DstReg.WriteMask;
@@ -348,8 +349,9 @@ static struct brw_wm_instruction *translate_insn( struct brw_wm_compile *c,
    out->saturate = (inst->SaturateMode != SATURATE_OFF);
    out->tex_unit = inst->TexSrcUnit;
    out->tex_idx = inst->TexSrcTarget;
-   out->eot = inst->Sampler & 1;
-   out->target = inst->Sampler>>1;
+   out->tex_shadow = inst->TexShadow;
+   out->eot = inst->Aux & 1;
+   out->target = inst->Aux >> 1;
 
    /* Args:
     */
@@ -365,8 +367,6 @@ static struct brw_wm_instruction *translate_insn( struct brw_wm_compile *c,
       pass0_set_dst_scalar(c, out, inst, writemask);
    else 
       pass0_set_dst(c, out, inst, writemask);
-
-   return out;
 }
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass1.c b/src/mesa/drivers/dri/i965/brw_wm_pass1.c
index 6eaed8a665..cf031899dd 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass1.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass1.c
@@ -210,9 +210,10 @@ void brw_wm_pass1( struct brw_wm_compile *c )
 	 break;
 
       case OPCODE_TEX:
+      case OPCODE_TXP:
 	 read0 = get_texcoord_mask(inst->tex_idx);
 
-	 if (c->key.shadowtex_mask & (1<<inst->tex_unit))
+         if (inst->tex_shadow)
 	    read0 |= WRITEMASK_Z;
 	 break;
 
@@ -267,7 +268,6 @@ void brw_wm_pass1( struct brw_wm_compile *c )
 	 break;
 
       case OPCODE_DST:
-      case OPCODE_TXP:
       default:
 	 break;
       }
diff --git a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
index b6dac0d698..68a9296a71 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
@@ -95,6 +95,7 @@ struct wm_sampler_key {
    int sampler_count;
 
    struct wm_sampler_entry {
+      GLenum tex_target;
       GLenum wrap_r, wrap_s, wrap_t;
       float maxlod, minlod;
       float lod_bias;
@@ -168,19 +169,20 @@ static void brw_update_sampler_state(struct wm_sampler_entry *key,
       }  
    }
 
-   sampler->ss1.r_wrap_mode = translate_wrap_mode(key->wrap_r);
-   sampler->ss1.s_wrap_mode = translate_wrap_mode(key->wrap_s);
-   sampler->ss1.t_wrap_mode = translate_wrap_mode(key->wrap_t);
-
-   /* Fulsim complains if I don't do this.  Hardware doesn't mind:
-    */
-#if 0
-   if (texObj->Target == GL_TEXTURE_CUBE_MAP_ARB) {
+   if (key->tex_target == GL_TEXTURE_CUBE_MAP &&
+       (key->minfilter != GL_NEAREST || key->magfilter != GL_NEAREST)) {
+      /* If we're using anything but nearest sampling for a cube map, we
+       * need to set this wrap mode to avoid GPU lock-ups.
+       */
       sampler->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CUBE;
       sampler->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CUBE;
       sampler->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CUBE;
    }
-#endif
+   else {
+      sampler->ss1.r_wrap_mode = translate_wrap_mode(key->wrap_r);
+      sampler->ss1.s_wrap_mode = translate_wrap_mode(key->wrap_s);
+      sampler->ss1.t_wrap_mode = translate_wrap_mode(key->wrap_t);
+   }
 
    /* Set shadow function: 
     */
@@ -234,6 +236,8 @@ brw_wm_sampler_populate_key(struct brw_context *brw,
 	 struct gl_texture_image *firstImage =
 	    texObj->Image[0][intelObj->firstLevel];
 
+         entry->tex_target = texObj->Target;
+
 	 entry->wrap_r = texObj->WrapR;
 	 entry->wrap_s = texObj->WrapS;
 	 entry->wrap_t = texObj->WrapT;
diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c
index 3c3b3473d6..63fc8a004f 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_state.c
@@ -62,6 +62,7 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
 {
    GLcontext *ctx = &brw->intel.ctx;
    const struct gl_fragment_program *fp = brw->fragment_program;
+   const struct brw_fragment_program *bfp = (struct brw_fragment_program *) fp;
    struct intel_context *intel = &brw->intel;
 
    memset(key, 0, sizeof(*key));
@@ -103,11 +104,14 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
 
    /* as far as we can tell */
    key->computes_depth =
-      (fp->Base.OutputsWritten & (1 << FRAG_RESULT_DEPR)) != 0;
+      (fp->Base.OutputsWritten & (1 << FRAG_RESULT_DEPTH)) != 0;
 
    /* _NEW_COLOR */
    key->uses_kill = fp->UsesKill || ctx->Color.AlphaEnabled;
-   key->is_glsl = brw_wm_is_glsl(fp);
+   key->is_glsl = bfp->isGLSL;
+
+   /* temporary sanity check assertion */
+   ASSERT(bfp->isGLSL == brw_wm_is_glsl(fp));
 
    /* XXX: This needs a flag to indicate when it changes. */
    key->stats_wm = intel->stats_wm;
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index d70f9c646c..9b320480b6 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -139,7 +139,18 @@ static GLuint translate_tex_format( GLuint mesa_format, GLenum depth_mode )
       return BRW_SURFACEFORMAT_BC1_UNORM_SRGB;
 
    case MESA_FORMAT_S8_Z24:
-      return BRW_SURFACEFORMAT_I24X8_UNORM;
+      /* XXX: these different surface formats don't seem to
+       * make any difference for shadow sampler/compares.
+       */
+      if (depth_mode == GL_INTENSITY) 
+         return BRW_SURFACEFORMAT_I24X8_UNORM;
+      else if (depth_mode == GL_ALPHA)
+         return BRW_SURFACEFORMAT_A24X8_UNORM;
+      else
+         return BRW_SURFACEFORMAT_L24X8_UNORM;
+
+   case MESA_FORMAT_DUDV8:
+      return BRW_SURFACEFORMAT_R8G8_SNORM;
 
    default:
       assert(0);
@@ -381,8 +392,7 @@ brw_update_region_surface(struct brw_context *brw, struct intel_region *region,
 	  * a more restrictive relocation to emit.
 	  */
 	 dri_bo_emit_reloc(brw->wm.surf_bo[unit],
-			   I915_GEM_DOMAIN_RENDER |
-			   I915_GEM_DOMAIN_SAMPLER,
+			   I915_GEM_DOMAIN_RENDER,
 			   I915_GEM_DOMAIN_RENDER,
 			   0,
 			   offsetof(struct brw_surface_state, ss1),
@@ -447,13 +457,13 @@ static void prepare_wm_surfaces(struct brw_context *brw )
    GLuint i;
    int old_nr_surfaces;
 
-   if (brw->state.nr_draw_regions  > 1) {
-      for (i = 0; i < brw->state.nr_draw_regions; i++) {
-         brw_update_region_surface(brw, brw->state.draw_regions[i], i,
+   if (brw->state.nr_color_regions  > 1) {
+      for (i = 0; i < brw->state.nr_color_regions; i++) {
+         brw_update_region_surface(brw, brw->state.color_regions[i], i,
 				   GL_FALSE);
       }
-   }else {
-      brw_update_region_surface(brw, brw->state.draw_regions[0], 0, GL_TRUE);
+   } else {
+      brw_update_region_surface(brw, brw->state.color_regions[0], 0, GL_TRUE);
    }
 
    old_nr_surfaces = brw->wm.nr_surfaces;