34 files changed, 1092 insertions, 256 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_clip.c b/src/mesa/drivers/dri/i965/brw_clip.c
index 228ee3f3be..a1e9dae915 100644
--- a/src/mesa/drivers/dri/i965/brw_clip.c
+++ b/src/mesa/drivers/dri/i965/brw_clip.c
@@ -55,6 +55,7 @@ static void compile_clip_prog( struct brw_context *brw,
    GLuint program_size;
    GLuint delta;
    GLuint i;
+   GLuint header_regs;
 
    memset(&c, 0, sizeof(c));
    
@@ -72,22 +73,28 @@ static void compile_clip_prog( struct brw_context *brw,
    c.header_position_offset = ATTR_SIZE;
 
    if (intel->gen == 5)
-       delta = 3 * REG_SIZE;
+      header_regs = 3;
    else
-       delta = REG_SIZE;
+      header_regs = 1;
 
-   for (i = 0; i < VERT_RESULT_MAX; i++)
+   delta = header_regs * REG_SIZE;
+
+   for (i = 0; i < VERT_RESULT_MAX; i++) {
       if (c.key.attrs & BITFIELD64_BIT(i)) {
 	 c.offset[i] = delta;
 	 delta += ATTR_SIZE;
+
+	 c.idx_to_attr[c.nr_attrs] = i;
+	 c.nr_attrs++;
       }
+   }
 
-   c.nr_attrs = brw_count_bits(c.key.attrs);
-   
-   if (intel->gen == 5)
-       c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
-   else
-       c.nr_regs = (c.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */
+   /* The vertex attributes start at a URB row-aligned offset after
+    * the 8-20 dword vertex header, and continue for a URB row-aligned
+    * length.  nr_regs determines the urb_read_length from the start
+    * of the header to the end of the vertex data.
+    */
+   c.nr_regs = header_regs + (c.nr_attrs + 1) / 2;
 
    c.nr_bytes = c.nr_regs * REG_SIZE;
 
diff --git a/src/mesa/drivers/dri/i965/brw_clip.h b/src/mesa/drivers/dri/i965/brw_clip.h
index 68222c6c27..3a8cd7bf39 100644
--- a/src/mesa/drivers/dri/i965/brw_clip.h
+++ b/src/mesa/drivers/dri/i965/brw_clip.h
@@ -115,7 +115,10 @@ struct brw_clip_compile {
    GLboolean need_direction;
 
    GLuint header_position_offset;
-   GLuint offset[VERT_ATTRIB_MAX];
+   /** Mapping from VERT_RESULT_* to offset within the VUE. */
+   GLuint offset[VERT_RESULT_MAX];
+   /** Mapping from attribute index to VERT_RESULT_* */
+   GLuint idx_to_attr[VERT_RESULT_MAX];
 };
 
 #define ATTR_SIZE  (4*4)
diff --git a/src/mesa/drivers/dri/i965/brw_clip_line.c b/src/mesa/drivers/dri/i965/brw_clip_line.c
index ceb62a3116..4b9117bb0b 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_line.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_line.c
@@ -32,7 +32,7 @@
 #include "main/glheader.h"
 #include "main/macros.h"
 #include "main/enums.h"
-#include "shader/program.h"
+#include "program/program.h"
 
 #include "intel_batchbuffer.h"
 
diff --git a/src/mesa/drivers/dri/i965/brw_clip_point.c b/src/mesa/drivers/dri/i965/brw_clip_point.c
index 7f47634dca..b994a32bc3 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_point.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_point.c
@@ -32,7 +32,7 @@
 #include "main/glheader.h"
 #include "main/macros.h"
 #include "main/enums.h"
-#include "shader/program.h"
+#include "program/program.h"
 
 #include "intel_batchbuffer.h"
 
diff --git a/src/mesa/drivers/dri/i965/brw_clip_tri.c b/src/mesa/drivers/dri/i965/brw_clip_tri.c
index 916a99ea00..cb58d1da9f 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_tri.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_tri.c
@@ -32,7 +32,7 @@
 #include "main/glheader.h"
 #include "main/macros.h"
 #include "main/enums.h"
-#include "shader/program.h"
+#include "program/program.h"
 
 #include "intel_batchbuffer.h"
 
@@ -76,10 +76,7 @@ void brw_clip_tri_alloc_regs( struct brw_clip_compile *c,
 
    if (c->nr_attrs & 1) {
       for (j = 0; j < 3; j++) {
-	 GLuint delta = c->nr_attrs*16 + 32;
-
-         if (intel->gen == 5)
-             delta = c->nr_attrs * 16 + 32 * 3;
+	 GLuint delta = c->offset[c->idx_to_attr[c->nr_attrs - 1]] + ATTR_SIZE;
 
 	 brw_MOV(&c->func, byte_offset(c->reg.vertex[j], delta), brw_imm_f(0));
       }
diff --git a/src/mesa/drivers/dri/i965/brw_clip_unfilled.c b/src/mesa/drivers/dri/i965/brw_clip_unfilled.c
index f36d22fdbf..afd93f8be0 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_unfilled.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_unfilled.c
@@ -32,7 +32,7 @@
 #include "main/glheader.h"
 #include "main/macros.h"
 #include "main/enums.h"
-#include "shader/program.h"
+#include "program/program.h"
 
 #include "intel_batchbuffer.h"
 
diff --git a/src/mesa/drivers/dri/i965/brw_clip_util.c b/src/mesa/drivers/dri/i965/brw_clip_util.c
index 2148bc8244..d2ac1235e4 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_util.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_util.c
@@ -33,7 +33,7 @@
 #include "main/glheader.h"
 #include "main/macros.h"
 #include "main/enums.h"
-#include "shader/program.h"
+#include "program/program.h"
 
 #include "intel_batchbuffer.h"
 
@@ -134,7 +134,6 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
 			     GLboolean force_edgeflag)
 {
    struct brw_compile *p = &c->func;
-   struct intel_context *intel = &p->brw->intel;
    struct brw_reg tmp = get_tmp(c);
    GLuint i;
 
@@ -149,12 +148,9 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
    /* Iterate over each attribute (could be done in pairs?)
     */
    for (i = 0; i < c->nr_attrs; i++) {
-      GLuint delta = i*16 + 32;
+      GLuint delta = c->offset[c->idx_to_attr[i]];
 
-      if (intel->gen == 5)
-          delta = i * 16 + 32 * 3;
-
-      if (delta == c->offset[VERT_RESULT_EDGE]) {
+      if (c->idx_to_attr[i] == VERT_RESULT_EDGE) {
 	 if (force_edgeflag) 
 	    brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(1));
 	 else
@@ -183,10 +179,7 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
    }
 
    if (i & 1) {
-      GLuint delta = i*16 + 32;
-
-      if (intel->gen == 5)
-          delta = i * 16 + 32 * 3;
+      GLuint delta = c->offset[c->idx_to_attr[c->nr_attrs - 1]] + ATTR_SIZE;
 
       brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(0));
    }
@@ -199,11 +192,6 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
    brw_clip_project_vertex(c, dest_ptr );
 }
 
-
-
-
-#define MAX_MRF 16
-
 void brw_clip_emit_vue(struct brw_clip_compile *c, 
 		       struct brw_indirect vert,
 		       GLboolean allocate,
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index d13b9ae298..6d064b822e 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -34,7 +34,6 @@
 #include "main/api_noop.h"
 #include "main/macros.h"
 #include "main/simple_list.h"
-
 #include "brw_context.h"
 #include "brw_defines.h"
 #include "brw_draw.h"
diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c
index 6c0b79f724..8196d8ca62 100644
--- a/src/mesa/drivers/dri/i965/brw_curbe.c
+++ b/src/mesa/drivers/dri/i965/brw_curbe.c
@@ -35,9 +35,9 @@
 #include "main/context.h"
 #include "main/macros.h"
 #include "main/enums.h"
-#include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
-#include "shader/prog_statevars.h"
+#include "program/prog_parameter.h"
+#include "program/prog_print.h"
+#include "program/prog_statevars.h"
 #include "intel_batchbuffer.h"
 #include "intel_regions.h"
 #include "brw_context.h"
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 39bf5b63fc..f7a68cead7 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -501,6 +501,10 @@
 #define BRW_MASK_ENABLE   0
 #define BRW_MASK_DISABLE  1
 
+/* Sandybridge is WECtrl (Write enable control) */
+#define BRW_WE_NORMAL		0
+#define BRW_WE_KILL_PRED	1
+
 #define BRW_OPCODE_MOV        1
 #define BRW_OPCODE_SEL        2
 #define BRW_OPCODE_NOT        4
@@ -600,6 +604,8 @@
 #define BRW_ARF_NOTIFICATION_COUNT    0x90
 #define BRW_ARF_IP                    0xA0
 
+#define BRW_MRF_COMPR4			(1 << 7)
+
 #define BRW_AMASK   0
 #define BRW_IMASK   1
 #define BRW_LMASK   2
@@ -646,13 +652,14 @@
 #define BRW_POLYGON_FACING_BACK       1
 
 #define BRW_MESSAGE_TARGET_NULL               0
-#define BRW_MESSAGE_TARGET_MATH               1
+#define BRW_MESSAGE_TARGET_MATH               1 /* reserved on GEN6 */
 #define BRW_MESSAGE_TARGET_SAMPLER            2
 #define BRW_MESSAGE_TARGET_GATEWAY            3
-#define BRW_MESSAGE_TARGET_DATAPORT_READ      4
-#define BRW_MESSAGE_TARGET_DATAPORT_WRITE     5
+#define BRW_MESSAGE_TARGET_DATAPORT_READ      4 /* sampler cache on GEN6 */
+#define BRW_MESSAGE_TARGET_DATAPORT_WRITE     5 /* render cache on Gen6 */
 #define BRW_MESSAGE_TARGET_URB                6
 #define BRW_MESSAGE_TARGET_THREAD_SPAWNER     7
+#define BRW_MESSAGE_TARGET_CONST_CACHE	      9 /* GEN6 */
 
 #define BRW_SAMPLER_RETURN_FORMAT_FLOAT32     0
 #define BRW_SAMPLER_RETURN_FORMAT_UINT32      2
@@ -698,10 +705,24 @@
 #define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS   2
 #define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS  3
 
+/* This one stays the same across generations. */
 #define BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ          0
+/* GEN4 */
 #define BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ     1
-#define BRW_DATAPORT_READ_MESSAGE_DWORD_BLOCK_READ          2
+#define BRW_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ          2
 #define BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ      3
+/* G45, GEN5 */
+#define G45_DATAPORT_READ_MESSAGE_RENDER_UNORM_READ	    1
+#define G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ     2
+#define G45_DATAPORT_READ_MESSAGE_AVC_LOOP_FILTER_READ	    3
+#define G45_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ          4
+#define G45_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ      6
+/* GEN6 */
+#define GEN6_DATAPORT_READ_MESSAGE_RENDER_UNORM_READ	    1
+#define GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ     2
+#define GEN6_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ          4
+#define GEN6_DATAPORT_READ_MESSAGE_OWORD_UNALIGN_BLOCK_READ  5
+#define GEN6_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ      6
 
 #define BRW_DATAPORT_READ_TARGET_DATA_CACHE      0
 #define BRW_DATAPORT_READ_TARGET_RENDER_CACHE    1
@@ -721,6 +742,16 @@
 #define BRW_DATAPORT_WRITE_MESSAGE_STREAMED_VERTEX_BUFFER_WRITE     5
 #define BRW_DATAPORT_WRITE_MESSAGE_FLUSH_RENDER_CACHE               7
 
+/* GEN6 */
+#define BRW_DATAPORT_WRITE_MESSAGE_DWORD_ATOMIC_WRITE_GEN6		7
+#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE_GEN6		8
+#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE_GEN6		9
+#define BRW_DATAPORT_WRITE_MESSAGE_MEDIA_BLOCK_WRITE_GEN6		10
+#define BRW_DATAPORT_WRITE_MESSAGE_DWORLD_SCATTERED_WRITE_GEN6		11
+#define BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE_GEN6		12
+#define BRW_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE_GEN6		13
+#define BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_UNORM_WRITE_GEN6	14
+
 #define BRW_MATH_FUNCTION_INV                              1
 #define BRW_MATH_FUNCTION_LOG                              2
 #define BRW_MATH_FUNCTION_EXP                              3
diff --git a/src/mesa/drivers/dri/i965/brw_disasm.c b/src/mesa/drivers/dri/i965/brw_disasm.c
index ff12daf497..d230714536 100644
--- a/src/mesa/drivers/dri/i965/brw_disasm.c
+++ b/src/mesa/drivers/dri/i965/brw_disasm.c
@@ -598,7 +598,7 @@ static int src_da16 (FILE *file,
 	format (file, ".%d", _subreg_nr);
     string (file, "<");
     err |= control (file, "vert stride", vert_stride, _vert_stride, NULL);
-    string (file, ",1,1>");
+    string (file, ",4,1>");
     err |= control (file, "src da16 reg type", reg_encoding, _reg_type, NULL);
     /*
      * Three kinds of swizzle display:
@@ -836,10 +836,12 @@ int brw_disasm (FILE *file, struct brw_instruction *inst, int gen)
     if (inst->header.opcode == BRW_OPCODE_SEND) {
 	int target;
 
-	if (gen >= 5)
-	   target = inst->bits2.send_gen5.sfid;
+	if (gen >= 6)
+	    target = inst->header.destreg__conditionalmod;
+	else if (gen == 5)
+	    target = inst->bits2.send_gen5.sfid;
 	else
-	   target = inst->bits3.generic.msg_target;
+	    target = inst->bits3.generic.msg_target;
 
 	newline (file);
 	pad (file, 16);
@@ -868,13 +870,44 @@ int brw_disasm (FILE *file, struct brw_instruction *inst, int gen)
 			    inst->bits3.sampler.return_format, NULL);
 	    string (file, ")");
 	    break;
+	case BRW_MESSAGE_TARGET_DATAPORT_READ:
+	    if (gen >= 6) {
+		format (file, " (%d, %d, %d, %d, %d, %d)",
+			inst->bits3.dp_render_cache.binding_table_index,
+			inst->bits3.dp_render_cache.msg_control,
+			inst->bits3.dp_render_cache.msg_type,
+			inst->bits3.dp_render_cache.send_commit_msg,
+			inst->bits3.dp_render_cache.msg_length,
+			inst->bits3.dp_render_cache.response_length);
+	    } else if (gen >= 5) {
+		format (file, " (%d, %d, %d)",
+			inst->bits3.dp_read_gen5.binding_table_index,
+			inst->bits3.dp_read_gen5.msg_control,
+			inst->bits3.dp_read_gen5.msg_type);
+	    } else {
+		format (file, " (%d, %d, %d)",
+			inst->bits3.dp_read.binding_table_index,
+			inst->bits3.dp_read.msg_control,
+			inst->bits3.dp_read.msg_type);
+	    }
+	    break;
 	case BRW_MESSAGE_TARGET_DATAPORT_WRITE:
-	    format (file, " (%d, %d, %d, %d)",
-		    inst->bits3.dp_write.binding_table_index,
-		    (inst->bits3.dp_write.pixel_scoreboard_clear << 3) |
-		    inst->bits3.dp_write.msg_control,
-		    inst->bits3.dp_write.msg_type,
-		    inst->bits3.dp_write.send_commit_msg);
+	    if (gen >= 6) {
+		format (file, " (%d, %d, %d, %d, %d, %d)",
+			inst->bits3.dp_render_cache.binding_table_index,
+			inst->bits3.dp_render_cache.msg_control,
+			inst->bits3.dp_render_cache.msg_type,
+			inst->bits3.dp_render_cache.send_commit_msg,
+			inst->bits3.dp_render_cache.msg_length,
+			inst->bits3.dp_render_cache.response_length);
+	    } else {
+		format (file, " (%d, %d, %d, %d)",
+			inst->bits3.dp_write.binding_table_index,
+			(inst->bits3.dp_write.pixel_scoreboard_clear << 3) |
+			inst->bits3.dp_write.msg_control,
+			inst->bits3.dp_write.msg_type,
+			inst->bits3.dp_write.send_commit_msg);
+	    }
 	    break;
 	case BRW_MESSAGE_TARGET_URB:
 	    if (gen >= 5) {
@@ -900,15 +933,22 @@ int brw_disasm (FILE *file, struct brw_instruction *inst, int gen)
 	case BRW_MESSAGE_TARGET_THREAD_SPAWNER:
 	    break;
 	default:
-	    format (file, "unsupported target %d", inst->bits3.generic.msg_target);
+	    format (file, "unsupported target %d", target);
 	    break;
 	}
 	if (space)
 	    string (file, " ");
-	format (file, "mlen %d",
-		inst->bits3.generic.msg_length);
-	format (file, " rlen %d",
-		inst->bits3.generic.response_length);
+	if (gen >= 5) {
+	   format (file, "mlen %d",
+		   inst->bits3.generic_gen5.msg_length);
+	   format (file, " rlen %d",
+		   inst->bits3.generic_gen5.response_length);
+	} else {
+	   format (file, "mlen %d",
+		   inst->bits3.generic.msg_length);
+	   format (file, " rlen %d",
+		   inst->bits3.generic.response_length);
+	}
     }
     pad (file, 64);
     if (inst->header.opcode != BRW_OPCODE_NOP) {
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index 3a32ad26c1..ffdddd0a38 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -35,7 +35,7 @@
 
 #include "brw_structs.h"
 #include "brw_defines.h"
-#include "shader/prog_instruction.h"
+#include "program/prog_instruction.h"
 
 #define BRW_SWIZZLE4(a,b,c,d) (((a)<<0) | ((b)<<2) | ((c)<<4) | ((d)<<6))
 #define BRW_GET_SWZ(swz, idx) (((swz) >> ((idx)*2)) & 0x3)
@@ -520,6 +520,20 @@ static INLINE struct brw_reg brw_acc_reg( void )
 		       0);
 }
 
+static INLINE struct brw_reg brw_notification_1_reg(void)
+{
+
+   return brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+		  BRW_ARF_NOTIFICATION_COUNT,
+		  1,
+		  BRW_REGISTER_TYPE_UD,
+		  BRW_VERTICAL_STRIDE_0,
+		  BRW_WIDTH_1,
+		  BRW_HORIZONTAL_STRIDE_0,
+		  BRW_SWIZZLE_XXXX,
+		  WRITEMASK_X);
+}
+
 
 static INLINE struct brw_reg brw_flag_reg( void )
 {
@@ -877,12 +891,15 @@ void brw_dp_READ_4( struct brw_compile *p,
 
 void brw_dp_READ_4_vs( struct brw_compile *p,
                        struct brw_reg dest,
-                       GLuint oword,
-                       GLboolean relAddr,
-                       struct brw_reg addrReg,
                        GLuint location,
                        GLuint bind_table_index );
 
+void brw_dp_READ_4_vs_relative(struct brw_compile *p,
+			       struct brw_reg dest,
+			       struct brw_reg addrReg,
+			       GLuint offset,
+			       GLuint bind_table_index);
+
 void brw_dp_WRITE_16( struct brw_compile *p,
 		      struct brw_reg src,
 		      GLuint scratch_offset );
@@ -919,6 +936,8 @@ void brw_land_fwd_jump(struct brw_compile *p,
 
 void brw_NOP(struct brw_compile *p);
 
+void brw_WAIT(struct brw_compile *p);
+
 /* Special case: there is never a destination, execution size will be
  * taken from src0:
  */
@@ -965,5 +984,7 @@ void brw_set_src1( struct brw_instruction *insn,
 
 /* brw_optimize.c */
 void brw_optimize(struct brw_compile *p);
+void brw_remove_duplicate_mrf_moves(struct brw_compile *p);
+void brw_remove_grf_to_mrf_moves(struct brw_compile *p);
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 34dfe10cb9..0d5d17f501 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -364,7 +364,8 @@ static void brw_set_dp_write_message( struct brw_context *brw,
 				      GLuint msg_length,
 				      GLuint pixel_scoreboard_clear,
 				      GLuint response_length,
-				      GLuint end_of_thread )
+				      GLuint end_of_thread,
+				      GLuint send_commit_msg)
 {
    struct intel_context *intel = &brw->intel;
    brw_set_src1(insn, brw_imm_d(0));
@@ -374,7 +375,7 @@ static void brw_set_dp_write_message( struct brw_context *brw,
        insn->bits3.dp_write_gen5.msg_control = msg_control;
        insn->bits3.dp_write_gen5.pixel_scoreboard_clear = pixel_scoreboard_clear;
        insn->bits3.dp_write_gen5.msg_type = msg_type;
-       insn->bits3.dp_write_gen5.send_commit_msg = 0;
+       insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
        insn->bits3.dp_write_gen5.header_present = 1;
        insn->bits3.dp_write_gen5.response_length = response_length;
        insn->bits3.dp_write_gen5.msg_length = msg_length;
@@ -386,7 +387,7 @@ static void brw_set_dp_write_message( struct brw_context *brw,
        insn->bits3.dp_write.msg_control = msg_control;
        insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear;
        insn->bits3.dp_write.msg_type = msg_type;
-       insn->bits3.dp_write.send_commit_msg = 0;
+       insn->bits3.dp_write.send_commit_msg = send_commit_msg;
        insn->bits3.dp_write.response_length = response_length;
        insn->bits3.dp_write.msg_length = msg_length;
        insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
@@ -906,6 +907,20 @@ void brw_CMP(struct brw_compile *p,
    }
 }
 
+/* Issue 'wait' instruction for n1, host could program MMIO
+   to wake up thread. */
+void brw_WAIT (struct brw_compile *p)
+{
+   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
+   struct brw_reg src = brw_notification_1_reg();
+
+   brw_set_dest(insn, src);
+   brw_set_src0(insn, src);
+   brw_set_src1(insn, brw_null_reg());
+   insn->header.execution_size = 0; /* must */
+   insn->header.predicate_control = 0;
+   insn->header.compression_control = 0;
+}
 
 
 /***********************************************************************
@@ -1040,6 +1055,7 @@ void brw_dp_WRITE_16( struct brw_compile *p,
 		      struct brw_reg src,
 		      GLuint scratch_offset )
 {
+   struct intel_context *intel = &p->brw->intel;
    GLuint msg_reg_nr = 1;
    {
       brw_push_insn_state(p);
@@ -1056,13 +1072,32 @@ void brw_dp_WRITE_16( struct brw_compile *p,
 
    {
       GLuint msg_length = 3;
-      struct brw_reg dest = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
+      struct brw_reg dest;
       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
-   
+      int send_commit_msg;
+
       insn->header.predicate_control = 0; /* XXX */
       insn->header.compression_control = BRW_COMPRESSION_NONE; 
       insn->header.destreg__conditionalmod = msg_reg_nr;
-  
+
+      /* Until gen6, writes followed by reads from the same location
+       * are not guaranteed to be ordered unless write_commit is set.
+       * If set, then a no-op write is issued to the destination
+       * register to set a dependency, and a read from the destination
+       * can be used to ensure the ordering.
+       *
+       * For gen6, only writes between different threads need ordering
+       * protection.  Our use of DP writes is all about register
+       * spilling within a thread.
+       */
+      if (intel->gen >= 6) {
+	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
+	 send_commit_msg = 0;
+      } else {
+	 dest = brw_uw16_grf(0, 0);
+	 send_commit_msg = 1;
+      }
+
       brw_set_dest(insn, dest);
       brw_set_src0(insn, src);
 
@@ -1073,8 +1108,9 @@ void brw_dp_WRITE_16( struct brw_compile *p,
 			       BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, /* msg_type */
 			       msg_length,
 			       0, /* pixel scoreboard */
-			       0, /* response_length */
-			       0); /* eot */
+			       send_commit_msg, /* response_length */
+			       0, /* eot */
+			       send_commit_msg);
    }
 }
 
@@ -1115,7 +1151,7 @@ void brw_dp_READ_16( struct brw_compile *p,
       brw_set_dp_read_message(p->brw,
 			      insn,
 			      255, /* binding table index (255=stateless) */
-			      3,  /* msg_control (3 means 4 Owords) */
+			      BRW_DATAPORT_OWORD_BLOCK_4_OWORDS,
 			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
 			      1, /* target cache (render/scratch) */
 			      1, /* msg_length */
@@ -1190,68 +1226,107 @@ void brw_dp_READ_4( struct brw_compile *p,
  */
 void brw_dp_READ_4_vs(struct brw_compile *p,
                       struct brw_reg dest,
-                      GLuint oword,
-                      GLboolean relAddr,
-                      struct brw_reg addrReg,
                       GLuint location,
                       GLuint bind_table_index)
 {
+   struct brw_instruction *insn;
    GLuint msg_reg_nr = 1;
+   struct brw_reg b;
 
-   assert(oword < 2);
    /*
    printf("vs const read msg, location %u, msg_reg_nr %d\n",
           location, msg_reg_nr);
    */
 
    /* Setup MRF[1] with location/offset into const buffer */
-   {
-      struct brw_reg b;
+   brw_push_insn_state(p);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 
-      brw_push_insn_state(p);
-      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-      brw_set_mask_control(p, BRW_MASK_DISABLE);
-      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-      /*brw_set_access_mode(p, BRW_ALIGN_16);*/
+   /* XXX I think we're setting all the dwords of MRF[1] to 'location'.
+    * when the docs say only dword[2] should be set.  Hmmm.  But it works.
+    */
+   b = brw_message_reg(msg_reg_nr);
+   b = retype(b, BRW_REGISTER_TYPE_UD);
+   /*b = get_element_ud(b, 2);*/
+   brw_MOV(p, b, brw_imm_ud(location));
 
-      /* XXX I think we're setting all the dwords of MRF[1] to 'location'.
-       * when the docs say only dword[2] should be set.  Hmmm.  But it works.
-       */
-      b = brw_message_reg(msg_reg_nr);
-      b = retype(b, BRW_REGISTER_TYPE_UD);
-      /*b = get_element_ud(b, 2);*/
-      if (relAddr) {
-         brw_ADD(p, b, addrReg, brw_imm_ud(location));
-      }
-      else {
-         brw_MOV(p, b, brw_imm_ud(location));
-      }
+   brw_pop_insn_state(p);
 
-      brw_pop_insn_state(p);
-   }
+   insn = next_insn(p, BRW_OPCODE_SEND);
 
-   {
-      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
-   
-      insn->header.predicate_control = BRW_PREDICATE_NONE;
-      insn->header.compression_control = BRW_COMPRESSION_NONE; 
-      insn->header.destreg__conditionalmod = msg_reg_nr;
-      insn->header.mask_control = BRW_MASK_DISABLE;
-      /*insn->header.access_mode = BRW_ALIGN_16;*/
-  
-      brw_set_dest(insn, dest);
-      brw_set_src0(insn, brw_null_reg());
+   insn->header.predicate_control = BRW_PREDICATE_NONE;
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.destreg__conditionalmod = msg_reg_nr;
+   insn->header.mask_control = BRW_MASK_DISABLE;
 
-      brw_set_dp_read_message(p->brw,
-			      insn,
-			      bind_table_index,
-			      oword,  /* 0 = lower Oword, 1 = upper Oword */
-			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
-			      0, /* source cache = data cache */
-			      1, /* msg_length */
-			      1, /* response_length (1 Oword) */
-			      0); /* eot */
-   }
+   brw_set_dest(insn, dest);
+   brw_set_src0(insn, brw_null_reg());
+
+   brw_set_dp_read_message(p->brw,
+			   insn,
+			   bind_table_index,
+			   0,
+			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
+			   0, /* source cache = data cache */
+			   1, /* msg_length */
+			   1, /* response_length (1 Oword) */
+			   0); /* eot */
+}
+
+/**
+ * Read a float[4] constant per vertex from VS constant buffer, with
+ * relative addressing.
+ */
+void brw_dp_READ_4_vs_relative(struct brw_compile *p,
+			       struct brw_reg dest,
+			       struct brw_reg addr_reg,
+			       GLuint offset,
+			       GLuint bind_table_index)
+{
+   struct intel_context *intel = &p->brw->intel;
+   int msg_type;
+
+   /* Setup MRF[1] with offset into const buffer */
+   brw_push_insn_state(p);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+   /* M1.0 is block offset 0, M1.4 is block offset 1, all other
+    * fields ignored.
+    */
+   brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD),
+	   addr_reg, brw_imm_d(offset));
+   brw_pop_insn_state(p);
+
+   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+
+   insn->header.predicate_control = BRW_PREDICATE_NONE;
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.destreg__conditionalmod = 0;
+   insn->header.mask_control = BRW_MASK_DISABLE;
+
+   brw_set_dest(insn, dest);
+   brw_set_src0(insn, brw_vec8_grf(0, 0));
+
+   if (intel->gen == 6)
+      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
+   else if (intel->gen == 5 || intel->is_g4x)
+      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
+   else
+      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
+
+   brw_set_dp_read_message(p->brw,
+			   insn,
+			   bind_table_index,
+			   BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
+			   msg_type,
+			   0, /* source cache = data cache */
+			   2, /* msg_length */
+			   1, /* response_length */
+			   0); /* eot */
 }
 
 
@@ -1281,7 +1356,8 @@ void brw_fb_WRITE(struct brw_compile *p,
 			    msg_length,
 			    1,	/* pixel scoreboard */
 			    response_length, 
-			    eot);
+			    eot,
+			    0 /* send_commit_msg */);
 }
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_gs_emit.c b/src/mesa/drivers/dri/i965/brw_gs_emit.c
index 99a6f6be11..a01d5576f8 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_emit.c
@@ -34,7 +34,7 @@
 #include "main/macros.h"
 #include "main/enums.h"
 
-#include "shader/program.h"
+#include "program/program.h"
 #include "intel_batchbuffer.h"
 
 #include "brw_defines.h"
diff --git a/src/mesa/drivers/dri/i965/brw_optimize.c b/src/mesa/drivers/dri/i965/brw_optimize.c
index e79b3ddea3..8aa6fb6cc6 100644
--- a/src/mesa/drivers/dri/i965/brw_optimize.c
+++ b/src/mesa/drivers/dri/i965/brw_optimize.c
@@ -26,12 +26,600 @@
  */
 
 #include "main/macros.h"
-#include "shader/program.h"
-#include "shader/prog_print.h"
+#include "program/program.h"
+#include "program/prog_print.h"
 #include "brw_context.h"
 #include "brw_defines.h"
 #include "brw_eu.h"
 
+static const struct {
+    char    *name;
+    int	    nsrc;
+    int	    ndst;
+    GLboolean is_arith;
+} inst_opcode[128] = {
+    [BRW_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_RNDE] = { .name = "rnde", .nsrc = 1, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_RNDZ] = { .name = "rndz", .nsrc = 1, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_NOT] = { .name = "not", .nsrc = 1, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 },
+
+    [BRW_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_MACH] = { .name = "mach", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_PLN] = { .name = "pln", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DPH] = { .name = "dph", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_MATH] = { .name = "math", .nsrc = 2, .ndst = 1 },
+
+    [BRW_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_XOR] = { .name = "xor", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_SHR] = { .name = "shr", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_SHL] = { .name = "shl", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_ASR] = { .name = "asr", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 },
+
+    [BRW_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
+    [BRW_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_IFF] = { .name = "iff", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 },
+    [BRW_OPCODE_ENDIF] = { .name = "endif", .nsrc = 2, .ndst = 0 },
+};
+
+static INLINE
+GLboolean brw_is_arithmetic_inst(const struct brw_instruction *inst)
+{
+   return inst_opcode[inst->header.opcode].is_arith;
+}
+
+static const GLuint inst_stride[7] = {
+    [0] = 0,
+    [1] = 1,
+    [2] = 2,
+    [3] = 4,
+    [4] = 8,
+    [5] = 16,
+    [6] = 32
+};
+
+static const GLuint inst_type_size[8] = {
+    [BRW_REGISTER_TYPE_UD] = 4,
+    [BRW_REGISTER_TYPE_D] = 4,
+    [BRW_REGISTER_TYPE_UW] = 2,
+    [BRW_REGISTER_TYPE_W] = 2,
+    [BRW_REGISTER_TYPE_UB] = 1,
+    [BRW_REGISTER_TYPE_B] = 1,
+    [BRW_REGISTER_TYPE_F] = 4
+};
+
+static INLINE GLboolean
+brw_is_grf_written(const struct brw_instruction *inst,
+                   int reg_index, int size,
+                   int gen)
+{
+   if (inst_opcode[inst->header.opcode].ndst == 0)
+      return GL_FALSE;
+
+   if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT)
+      if (inst->bits1.ia1.dest_reg_file == BRW_GENERAL_REGISTER_FILE)
+         return GL_TRUE;
+
+   if (inst->bits1.da1.dest_reg_file != BRW_GENERAL_REGISTER_FILE)
+      return GL_FALSE;
+
+   const int reg_start = reg_index * REG_SIZE;
+   const int reg_end = reg_start + size;
+
+   const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type];
+   const int write_start = inst->bits1.da1.dest_reg_nr*REG_SIZE
+                         + inst->bits1.da1.dest_subreg_nr;
+   int length, write_end;
+
+   /* SEND is specific */
+   if (inst->header.opcode == BRW_OPCODE_SEND) {
+      if (gen >= 5)
+         length = inst->bits3.generic_gen5.response_length*REG_SIZE;
+      else 
+         length = inst->bits3.generic.response_length*REG_SIZE;
+   }
+   else {
+      length = 1 << inst->header.execution_size;
+      length *= type_size;
+      length *= inst->bits1.da1.dest_horiz_stride;
+   }
+
+   /* If the two intervals intersect, we overwrite the register */
+   write_end = write_start + length;
+   const int left = MAX2(write_start, reg_start);
+   const int right = MIN2(write_end, reg_end);
+
+   return left < right;
+}
+
+/* Specific path for message register since we need to handle the compr4 case */
+static INLINE GLboolean
+brw_is_mrf_written(const struct brw_instruction *inst, int reg_index, int size)
+{
+   if (inst_opcode[inst->header.opcode].ndst == 0)
+      return GL_FALSE;
+
+   if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT)
+      if (inst->bits1.ia1.dest_reg_file == BRW_MESSAGE_REGISTER_FILE)
+         return GL_TRUE;
+
+   if (inst->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE)
+      return GL_FALSE;
+
+   const int reg_start = reg_index * REG_SIZE;
+   const int reg_end = reg_start + size;
+
+   const int mrf_index = inst->bits1.da1.dest_reg_nr & 0x0f;
+   const int is_compr4 = inst->bits1.da1.dest_reg_nr & BRW_MRF_COMPR4;
+   const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type];
+
+   /* We use compr4 with a size != 16 elements. Strange, we conservatively
+    * consider that we are writing the register.
+    */
+   if (is_compr4 && inst->header.execution_size != BRW_EXECUTE_16)
+      return GL_TRUE;
+
+   GLboolean is_written = GL_FALSE;
+
+   /* Here we write mrf_{i} and mrf_{i+4}. So we read two times 8 elements */
+   if (is_compr4) {
+      const int length = 8 * type_size * inst->bits1.da1.dest_horiz_stride;
+
+      /* First 8-way register */
+      const int write_start0 = mrf_index*REG_SIZE
+                             + inst->bits1.da1.dest_subreg_nr;
+      const int write_end0 = write_start0 + length;
+
+      /* Second 8-way register */
+      const int write_start1 = (mrf_index+4)*REG_SIZE
+                             + inst->bits1.da1.dest_subreg_nr;
+      const int write_end1 = write_start1 + length;
+
+      /* If the two intervals intersect, we overwrite the register */
+      const int left0 = MAX2(write_start0, reg_start);
+      const int right0 = MIN2(write_end0, reg_end);
+      const int left1 = MAX2(write_start1, reg_start);
+      const int right1 = MIN2(write_end1, reg_end);
+
+      is_written = left0 < right0 || left1 < right1;
+   }
+   else {
+      int length;
+      length = 1 << inst->header.execution_size;
+      length *= type_size;
+      length *= inst->bits1.da1.dest_horiz_stride;
+
+      /* If the two intervals intersect, we write into the register */
+      const int write_start = inst->bits1.da1.dest_reg_nr*REG_SIZE
+                            + inst->bits1.da1.dest_subreg_nr;
+      const int write_end = write_start + length;
+      const int left = MAX2(write_start, reg_start);
+      const int right = MIN2(write_end, reg_end);;
+
+      is_written = left < right;
+   }
+
+   /* SEND may perform an implicit mov to a mrf register */
+   if (is_written == GL_FALSE &&
+       inst->header.opcode == BRW_OPCODE_SEND &&
+       inst->bits1.da1.src0_reg_file != 0) {
+
+      const int mrf_start = inst->header.destreg__conditionalmod;
+      const int write_start = mrf_start * REG_SIZE;
+      const int write_end = write_start + REG_SIZE;
+      const int left = MAX2(write_start, reg_start);
+      const int right = MIN2(write_end, reg_end);;
+      is_written = left < right;
+   }
+
+   return is_written;
+}
+
+static INLINE GLboolean
+brw_is_mrf_read(const struct brw_instruction *inst,
+                int reg_index, int size, int gen)
+{
+   if (inst->header.opcode != BRW_OPCODE_SEND)
+      return GL_FALSE;
+   if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT)
+      return GL_TRUE;
+
+   const int reg_start = reg_index*REG_SIZE;
+   const int reg_end = reg_start + size;
+
+   int length, read_start, read_end;
+   if (gen >= 5)
+      length = inst->bits3.generic_gen5.msg_length*REG_SIZE;
+   else 
+      length = inst->bits3.generic.msg_length*REG_SIZE;
+
+   /* Look if SEND uses an implicit mov. In that case, we read one less register
+    * (but we write it)
+    */
+   if (inst->bits1.da1.src0_reg_file != 0)
+      read_start = inst->header.destreg__conditionalmod;
+   else {
+      length--;
+      read_start = inst->header.destreg__conditionalmod + 1;
+   }
+   read_start *= REG_SIZE;
+   read_end = read_start + length;
+
+   const int left = MAX2(read_start, reg_start);
+   const int right = MIN2(read_end, reg_end);
+
+   return left < right;
+}
+
+static INLINE GLboolean
+brw_is_grf_read(const struct brw_instruction *inst, int reg_index, int size)
+{
+   int i, j;
+   if (inst_opcode[inst->header.opcode].nsrc == 0)
+      return GL_FALSE;
+
+   /* Look at first source. We must take into account register regions to
+    * monitor carefully the read. Note that we are a bit too conservative here
+    * since we do not take into account the fact that some complete registers
+    * may be skipped
+    */
+   if (inst_opcode[inst->header.opcode].nsrc >= 1) {
+
+      if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT)
+         if (inst->bits1.ia1.src0_reg_file == BRW_GENERAL_REGISTER_FILE)
+            return GL_TRUE;
+      if (inst->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE)
+         return GL_FALSE;
+
+      const int reg_start = reg_index*REG_SIZE;
+      const int reg_end = reg_start + size;
+
+      /* See if at least one of this element intersects the interval */
+      const int type_size = inst_type_size[inst->bits1.da1.src0_reg_type];
+      const int elem_num = 1 << inst->header.execution_size;
+      const int width = 1 << inst->bits2.da1.src0_width;
+      const int row_num = elem_num >> inst->bits2.da1.src0_width;
+      const int hs = type_size*inst_stride[inst->bits2.da1.src0_horiz_stride];
+      const int vs = type_size*inst_stride[inst->bits2.da1.src0_vert_stride];
+      int row_start = inst->bits2.da1.src0_reg_nr*REG_SIZE
+                    + inst->bits2.da1.src0_subreg_nr;
+      for (j = 0; j < row_num; ++j) {
+         int write_start = row_start;
+         for (i = 0; i < width; ++i) {
+            const int write_end = write_start + type_size;
+            const int left = write_start > reg_start ? write_start : reg_start;
+            const int right = write_end < reg_end ? write_end : reg_end;
+            if (left < right)
+               return GL_TRUE;
+            write_start += hs;
+         }
+         row_start += vs;
+      }
+   }
+
+   /* Second src register */
+   if (inst_opcode[inst->header.opcode].nsrc >= 2) {
+
+      if (inst->bits3.da1.src1_address_mode != BRW_ADDRESS_DIRECT)
+         if (inst->bits1.ia1.src1_reg_file == BRW_GENERAL_REGISTER_FILE)
+            return GL_TRUE;
+      if (inst->bits1.da1.src1_reg_file != BRW_GENERAL_REGISTER_FILE)
+         return GL_FALSE;
+
+      const int reg_start = reg_index*REG_SIZE;
+      const int reg_end = reg_start + size;
+
+      /* See if at least one of this element intersects the interval */
+      const int type_size = inst_type_size[inst->bits1.da1.src1_reg_type];
+      const int elem_num = 1 << inst->header.execution_size;
+      const int width = 1 << inst->bits3.da1.src1_width;
+      const int row_num = elem_num >> inst->bits3.da1.src1_width;
+      const int hs = type_size*inst_stride[inst->bits3.da1.src1_horiz_stride];
+      const int vs = type_size*inst_stride[inst->bits3.da1.src1_vert_stride];
+      int row_start = inst->bits3.da1.src1_reg_nr*REG_SIZE
+                    + inst->bits3.da1.src1_subreg_nr;
+      for (j = 0; j < row_num; ++j) {
+         int write_start = row_start;
+         for (i = 0; i < width; ++i) {
+            const int write_end = write_start + type_size;
+            const int left = write_start > reg_start ? write_start : reg_start;
+            const int right = write_end < reg_end ? write_end : reg_end;
+            if (left < right)
+               return GL_TRUE;
+            write_start += hs;
+         }
+         row_start += vs;
+      }
+   }
+
+   return GL_FALSE;
+}
+
+static INLINE GLboolean
+brw_is_control_done(const struct brw_instruction *mov) {
+   return
+       mov->header.dependency_control != 0 ||
+       mov->header.thread_control != 0 ||
+       mov->header.mask_control != 0 ||
+       mov->header.saturate != 0 ||
+       mov->header.debug_control != 0;
+}
+
+static INLINE GLboolean
+brw_is_predicated(const struct brw_instruction *mov) {
+   return mov->header.predicate_control != 0;
+}
+
+static INLINE GLboolean
+brw_is_grf_to_mrf_mov(const struct brw_instruction *mov,
+                      int *mrf_index,
+                      int *grf_index,
+                      GLboolean *is_compr4)
+{
+   if (brw_is_predicated(mov) ||
+       brw_is_control_done(mov) ||
+       mov->header.debug_control != 0)
+      return GL_FALSE;
+
+   if (mov->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT ||
+       mov->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE ||
+       mov->bits1.da1.dest_reg_type != BRW_REGISTER_TYPE_F ||
+       mov->bits1.da1.dest_horiz_stride != BRW_HORIZONTAL_STRIDE_1 ||
+       mov->bits1.da1.dest_subreg_nr != 0)
+      return GL_FALSE;
+
+   if (mov->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT ||
+       mov->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE ||
+       mov->bits1.da1.src0_reg_type != BRW_REGISTER_TYPE_F ||
+       mov->bits2.da1.src0_width != BRW_WIDTH_8 ||
+       mov->bits2.da1.src0_horiz_stride != BRW_HORIZONTAL_STRIDE_1 ||
+       mov->bits2.da1.src0_vert_stride != BRW_VERTICAL_STRIDE_8 ||
+       mov->bits2.da1.src0_subreg_nr != 0 ||
+       mov->bits2.da1.src0_abs != 0 ||
+       mov->bits2.da1.src0_negate != 0)
+      return GL_FALSE;
+
+   *grf_index = mov->bits2.da1.src0_reg_nr;
+   *mrf_index = mov->bits1.da1.dest_reg_nr & 0x0f;
+   *is_compr4 = (mov->bits1.da1.dest_reg_nr & BRW_MRF_COMPR4) != 0;
+   return GL_TRUE;
+}
+
+static INLINE GLboolean
+brw_is_grf_straight_write(const struct brw_instruction *inst, int grf_index)
+{
+   /* remark: no problem to predicate a SEL instruction */
+   if ((!brw_is_predicated(inst) || inst->header.opcode == BRW_OPCODE_SEL) &&
+       brw_is_control_done(inst) == GL_FALSE &&
+       inst->header.execution_size == 4 &&
+       inst->header.access_mode == BRW_ALIGN_1 &&
+       inst->bits1.da1.dest_address_mode == BRW_ADDRESS_DIRECT &&
+       inst->bits1.da1.dest_reg_file == BRW_GENERAL_REGISTER_FILE &&
+       inst->bits1.da1.dest_reg_type == BRW_REGISTER_TYPE_F &&
+       inst->bits1.da1.dest_horiz_stride == BRW_HORIZONTAL_STRIDE_1 &&
+       inst->bits1.da1.dest_reg_nr == grf_index &&
+       inst->bits1.da1.dest_subreg_nr == 0 &&
+       brw_is_arithmetic_inst(inst))
+      return GL_TRUE;
+
+   return GL_FALSE;
+}
+
+static INLINE GLboolean
+brw_inst_are_equal(const struct brw_instruction *src0,
+                   const struct brw_instruction *src1)
+{
+   const GLuint *field0 = (GLuint *) src0;
+   const GLuint *field1 = (GLuint *) src1;
+   return field0[0] == field1[0] &&
+          field0[1] == field1[1] &&
+          field0[2] == field1[2] &&
+          field0[3] == field1[3];
+}
+
+static INLINE void
+brw_inst_copy(struct brw_instruction *dst,
+              const struct brw_instruction *src)
+{
+   GLuint *field_dst = (GLuint *) dst;
+   const GLuint *field_src = (GLuint *) src;
+   field_dst[0] = field_src[0];
+   field_dst[1] = field_src[1];
+   field_dst[2] = field_src[2];
+   field_dst[3] = field_src[3];
+}
+
+static void brw_remove_inst(struct brw_compile *p, const GLboolean *removeInst)
+{
+   int i, nr_insn = 0, to = 0, from = 0;
+
+   for (from = 0; from < p->nr_insn; ++from) {
+      if (removeInst[from])
+         continue;
+      if(to != from)
+         brw_inst_copy(p->store + to, p->store + from);
+      to++;
+   }
+
+   for (i = 0; i < p->nr_insn; ++i)
+      if (removeInst[i] == GL_FALSE)
+         nr_insn++;
+   p->nr_insn = nr_insn;
+}
+
+/* The gen code emitter generates a lot of duplications in the
+ * grf-to-mrf moves, for example when texture sampling with the same
+ * coordinates from multiple textures..  Here, we monitor same mov
+ * grf-to-mrf instrutions and remove repeated ones where the operands
+ * and dst ahven't changed in between.
+ */
+void brw_remove_duplicate_mrf_moves(struct brw_compile *p)
+{
+   const int gen = p->brw->intel.gen;
+   int i, j;
+
+   GLboolean *removeInst = calloc(sizeof(GLboolean), p->nr_insn);
+   for (i = 0; i < p->nr_insn; i++) {
+      if (removeInst[i])
+         continue;
+
+      const struct brw_instruction *mov = p->store + i;
+      int mrf_index, grf_index;
+      GLboolean is_compr4;
+
+      /* Only consider _straight_ grf-to-mrf moves */
+      if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4))
+         continue;
+
+      const int mrf_index0 = mrf_index;
+      const int mrf_index1 = is_compr4 ? mrf_index0+4 : mrf_index0+1;
+      const int simd16_size = 2 * REG_SIZE;
+
+      for (j = i + 1; j < p->nr_insn; j++) {
+         const struct brw_instruction *inst = p->store + j;
+
+         if (brw_inst_are_equal(mov, inst)) {
+            removeInst[j] = GL_TRUE;
+            continue;
+         }
+
+         if (brw_is_grf_written(inst, grf_index, simd16_size, gen) ||
+             brw_is_mrf_written(inst, mrf_index0, REG_SIZE) ||
+             brw_is_mrf_written(inst, mrf_index1, REG_SIZE))
+            break;
+      }
+   }
+
+   brw_remove_inst(p, removeInst);
+   free(removeInst);
+}
+
+/* Replace moves to MRFs where the value moved is the result of a
+ * normal arithmetic operation with computation right into the MRF.
+ */
+void brw_remove_grf_to_mrf_moves(struct brw_compile *p)
+{
+   int i, j, prev;
+   struct brw_context *brw = p->brw;
+   const int gen = brw->intel.gen;
+   const int simd16_size = 2*REG_SIZE;
+
+   GLboolean *removeInst = calloc(sizeof(GLboolean), p->nr_insn);
+   assert(removeInst);
+
+   for (i = 0; i < p->nr_insn; i++) {
+      if (removeInst[i])
+         continue;
+
+      struct brw_instruction *grf_inst = NULL;
+      const struct brw_instruction *mov = p->store + i;
+      int mrf_index, grf_index;
+      GLboolean is_compr4;
+
+      /* Only consider _straight_ grf-to-mrf moves */
+      if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4))
+         continue;
+
+      /* Using comp4 enables a stride of 4 for this instruction */
+      const int mrf_index0 = mrf_index;
+      const int mrf_index1 = is_compr4 ? mrf_index+4 : mrf_index+1;
+
+      /* Look where the register has been set */
+      prev = i;
+      GLboolean potential_remove = GL_FALSE;
+      while (prev--) {
+
+         /* If _one_ instruction writes the grf, we try to remove the mov */
+         struct brw_instruction *inst = p->store + prev;
+         if (brw_is_grf_straight_write(inst, grf_index)) {
+            potential_remove = GL_TRUE;
+            grf_inst = inst;
+            break;
+         }
+
+      }
+
+      if (potential_remove == GL_FALSE)
+         continue;
+      removeInst[i] = GL_TRUE;
+
+      /* Monitor first the section of code between the grf computation and the
+       * mov. Here we cannot read or write both mrf and grf register
+       */
+      for (j = prev + 1; j < i; ++j) {
+         struct brw_instruction *inst = p->store + j;
+         if (removeInst[j])
+            continue;
+         if (brw_is_grf_written(inst, grf_index, simd16_size, gen)   ||
+             brw_is_grf_read(inst, grf_index, simd16_size)           ||
+             brw_is_mrf_written(inst, mrf_index0, REG_SIZE)   ||
+             brw_is_mrf_written(inst, mrf_index1, REG_SIZE)   ||
+             brw_is_mrf_read(inst, mrf_index0, REG_SIZE, gen) ||
+             brw_is_mrf_read(inst, mrf_index1, REG_SIZE, gen)) {
+            removeInst[i] = GL_FALSE;
+            break;
+         }
+      }
+
+      /* After the mov, we can read or write the mrf. If the grf is overwritten,
+       * we are done
+       */
+      for (j = i + 1; j < p->nr_insn; ++j) {
+         struct brw_instruction *inst = p->store + j;
+         if (removeInst[j])
+            continue;
+
+         if (brw_is_grf_read(inst, grf_index, simd16_size)) {
+            removeInst[i] = GL_FALSE;
+            break;
+         }
+
+         if (brw_is_grf_straight_write(inst, grf_index))
+            break;
+      }
+
+      /* Note that with the top down traversal, we can safely pacth the mov
+       * instruction
+       */
+      if (removeInst[i]) {
+         grf_inst->bits1.da1.dest_reg_file = mov->bits1.da1.dest_reg_file;
+         grf_inst->bits1.da1.dest_reg_nr = mov->bits1.da1.dest_reg_nr;
+      }
+   }
+
+   brw_remove_inst(p, removeInst);
+   free(removeInst);
+}
+
 static GLboolean
 is_single_channel_dp4(struct brw_instruction *insn)
 {
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index bd560acdad..4b08d2599b 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -31,10 +31,10 @@
   
 #include "main/imports.h"
 #include "main/enums.h"
-#include "shader/prog_parameter.h"
-#include "shader/program.h"
-#include "shader/programopt.h"
-#include "shader/shader_api.h"
+#include "main/shaderobj.h"
+#include "program/prog_parameter.h"
+#include "program/program.h"
+#include "program/programopt.h"
 #include "tnl/tnl.h"
 
 #include "brw_context.h"
@@ -174,9 +174,36 @@ static GLboolean brwProgramStringNotify( GLcontext *ctx,
 	 shader_error(ctx, prog,
 		      "i965 driver doesn't yet support uninlined function "
 		      "calls.  Move to using a single return statement at "
-		      "the end of the function to work around it.");
+		      "the end of the function to work around it.\n");
 	 return GL_FALSE;
       }
+      if (prog->Instructions[i].DstReg.RelAddr &&
+	  prog->Instructions[i].DstReg.File == PROGRAM_INPUT) {
+	 shader_error(ctx, prog,
+		      "Variable indexing of shader inputs unsupported\n");
+	 return GL_FALSE;
+      }
+      if (prog->Instructions[i].DstReg.RelAddr &&
+	  prog->Instructions[i].DstReg.File == PROGRAM_OUTPUT) {
+	 shader_error(ctx, prog,
+		      "Variable indexing of shader outputs unsupported\n");
+	 return GL_FALSE;
+      }
+      if (target == GL_FRAGMENT_PROGRAM_ARB) {
+	 if ((prog->Instructions[i].DstReg.RelAddr &&
+	      prog->Instructions[i].DstReg.File == PROGRAM_TEMPORARY) ||
+	     (prog->Instructions[i].SrcReg[0].RelAddr &&
+	      prog->Instructions[i].SrcReg[0].File == PROGRAM_TEMPORARY) ||
+	     (prog->Instructions[i].SrcReg[1].RelAddr &&
+	      prog->Instructions[i].SrcReg[1].File == PROGRAM_TEMPORARY) ||
+	     (prog->Instructions[i].SrcReg[2].RelAddr &&
+	      prog->Instructions[i].SrcReg[2].File == PROGRAM_TEMPORARY)) {
+	    shader_error(ctx, prog,
+			 "Variable indexing of variable arrays in the FS "
+			 "unsupported\n");
+	    return GL_FALSE;
+	 }
+      }
    }
 
    return GL_TRUE;
diff --git a/src/mesa/drivers/dri/i965/brw_sf.h b/src/mesa/drivers/dri/i965/brw_sf.h
index a0680a56f2..e525c730d3 100644
--- a/src/mesa/drivers/dri/i965/brw_sf.h
+++ b/src/mesa/drivers/dri/i965/brw_sf.h
@@ -34,7 +34,7 @@
 #define BRW_SF_H
 
 
-#include "shader/program.h"
+#include "program/program.h"
 #include "brw_context.h"
 #include "brw_eu.h"
 
diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c
index e290ca92f6..914f275cc6 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_state.c
@@ -130,7 +130,7 @@ struct brw_sf_unit_key {
    unsigned scissor:1;
    unsigned line_smooth:1;
    unsigned point_sprite:1;
-   unsigned point_attenuated:1;
+   unsigned use_vs_point_size:1;
    unsigned render_to_fbo:1;
    float line_width;
    float point_size;
@@ -164,7 +164,8 @@ sf_unit_populate_key(struct brw_context *brw, struct brw_sf_unit_key *key)
 
    key->point_sprite = ctx->Point.PointSprite;
    key->point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
-   key->point_attenuated = ctx->Point._Attenuated;
+   key->use_vs_point_size = (ctx->VertexProgram.PointSizeEnabled ||
+			     ctx->Point._Attenuated);
 
    /* _NEW_LIGHT */
    key->pv_first = (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION);
@@ -296,7 +297,7 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
    /* _NEW_POINT */
    sf.sf7.sprite_point = key->point_sprite;
    sf.sf7.point_size = CLAMP(rint(key->point_size), 1, 255) * (1<<3);
-   sf.sf7.use_point_size_state = !key->point_attenuated;
+   sf.sf7.use_point_size_state = !key->use_vs_point_size;
    sf.sf7.aa_line_distance_mode = 0;
 
    /* might be BRW_NEW_PRIMITIVE if we have to adjust pv for polygons:
diff --git a/src/mesa/drivers/dri/i965/brw_structs.h b/src/mesa/drivers/dri/i965/brw_structs.h
index 2a7fa5b699..2fde42a706 100644
--- a/src/mesa/drivers/dri/i965/brw_structs.h
+++ b/src/mesa/drivers/dri/i965/brw_structs.h
@@ -1657,8 +1657,36 @@ struct brw_instruction
 	 GLuint end_of_thread:1;
       } dp_write_gen5;
 
+      /* Sandybridge DP for sample cache, constant cache, render cache */
       struct {
-	 GLuint pad:16;
+	 GLuint binding_table_index:8;
+	 GLuint msg_control:5;
+	 GLuint msg_type:3;
+	 GLuint pad0:3;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } dp_sampler_const_cache;
+
+      struct {
+	 GLuint binding_table_index:8;
+	 GLuint msg_control:3;
+	 GLuint slot_group_select:1;
+	 GLuint pixel_scoreboard_clear:1;
+	 GLuint msg_type:4;
+	 GLuint send_commit_msg:1;
+	 GLuint pad0:1;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } dp_render_cache;
+
+      struct {
+	 GLuint function_control:16;
 	 GLuint response_length:4;
 	 GLuint msg_length:4;
 	 GLuint msg_target:4;
@@ -1666,8 +1694,9 @@ struct brw_instruction
 	 GLuint end_of_thread:1;
       } generic;
 
+      /* Of this struct, only end_of_thread is not present for gen6. */
       struct {
-	 GLuint pad:19;
+	 GLuint function_control:19;
 	 GLuint header_present:1;
 	 GLuint response_length:5;
 	 GLuint msg_length:4;
diff --git a/src/mesa/drivers/dri/i965/brw_util.c b/src/mesa/drivers/dri/i965/brw_util.c
index bba9249d1b..1db2a210d4 100644
--- a/src/mesa/drivers/dri/i965/brw_util.c
+++ b/src/mesa/drivers/dri/i965/brw_util.c
@@ -31,7 +31,7 @@
          
 
 #include "main/mtypes.h"
-#include "shader/prog_parameter.h"
+#include "program/prog_parameter.h"
 #include "brw_util.h"
 #include "brw_defines.h"
 
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index 3c12f11ea7..9a832af9a9 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -34,8 +34,8 @@
 #include "brw_vs.h"
 #include "brw_util.h"
 #include "brw_state.h"
-#include "shader/prog_print.h"
-#include "shader/prog_parameter.h"
+#include "program/prog_print.h"
+#include "program/prog_parameter.h"
 
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h
index 6493744f3e..9338a6b7db 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.h
+++ b/src/mesa/drivers/dri/i965/brw_vs.h
@@ -36,7 +36,7 @@
 
 #include "brw_context.h"
 #include "brw_eu.h"
-#include "shader/program.h"
+#include "program/program.h"
 
 
 struct brw_vs_prog_key {
diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c
index 128987d78a..c1d6525e9b 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c
@@ -31,9 +31,9 @@
             
 
 #include "main/macros.h"
-#include "shader/program.h"
-#include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
+#include "program/program.h"
+#include "program/prog_parameter.h"
+#include "program/prog_print.h"
 #include "brw_context.h"
 #include "brw_vs.h"
 
@@ -44,6 +44,7 @@ static GLboolean
 brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg)
 {
    int opcode_array[] = {
+      [OPCODE_MOV] = 1,
       [OPCODE_ADD] = 2,
       [OPCODE_CMP] = 3,
       [OPCODE_DP3] = 2,
@@ -218,7 +219,7 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
    c->first_overflow_output = 0;
 
    if (intel->gen >= 6)
-      mrf = 6;
+      mrf = 4;
    else if (intel->gen == 5)
       mrf = 8;
    else
@@ -238,12 +239,25 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 	    mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
 	 }
 	 else {
-            if (mrf < 16) {
+	    /* Two restrictions on our compute-to-MRF here.  The
+	     * message length for all SEND messages is restricted to
+	     * [1,15], so we can't use mrf 15, as that means a length
+	     * of 16.
+	     *
+	     * Additionally, URB writes are aligned to URB rows, so we
+	     * need to put an even number of registers of URB data in
+	     * each URB write so that the later write is aligned.  A
+	     * message length of 15 means 1 message header reg plus 14
+	     * regs of URB data.
+	     *
+	     * For attributes beyond the compute-to-MRF, we compute to
+	     * GRFs and they will be written in the second URB_WRITE.
+	     */
+            if (mrf < 15) {
                c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
                mrf++;
             }
             else {
-               /* too many vertex results to fit in MRF, use GRF for overflow */
                if (!c->first_overflow_output)
                   c->first_overflow_output = i;
                c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
@@ -318,8 +332,11 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
     */
    attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);
 
+   /* See emit_vertex_write() for where the VUE's overhead on top of the
+    * attributes comes from.
+    */
    if (intel->gen >= 6)
-      c->prog_data.urb_entry_size = (attributes_in_vue + 4 + 7) / 8;
+      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 7) / 8;
    else if (intel->gen == 5)
       c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
    else
@@ -869,8 +886,6 @@ get_constant(struct brw_vs_compile *c,
    assert(argIndex < 3);
 
    if (c->current_const[argIndex].index != src->Index) {
-      struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
-
       /* Keep track of the last constant loaded in this slot, for reuse. */
       c->current_const[argIndex].index = src->Index;
 
@@ -881,9 +896,6 @@ get_constant(struct brw_vs_compile *c,
       /* need to fetch the constant now */
       brw_dp_READ_4_vs(p,
                        const_reg,                     /* writeback dest */
-                       0,                             /* oword */
-                       0,                             /* relative indexing? */
-                       addrReg,                       /* address register */
                        16 * src->Index,               /* byte offset */
                        SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
                        );
@@ -904,8 +916,8 @@ get_reladdr_constant(struct brw_vs_compile *c,
    const struct prog_src_register *src = &inst->SrcReg[argIndex];
    struct brw_compile *p = &c->func;
    struct brw_reg const_reg = c->current_const[argIndex].reg;
-   struct brw_reg const2_reg;
    struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
+   struct brw_reg byte_addr_reg = get_tmp(c);
 
    assert(argIndex < 3);
 
@@ -917,37 +929,15 @@ get_reladdr_constant(struct brw_vs_compile *c,
 	  src->Index, argIndex, c->current_const[argIndex].reg.nr);
 #endif
 
+   brw_MUL(p, byte_addr_reg, addrReg, brw_imm_ud(16));
+
    /* fetch the first vec4 */
-   brw_dp_READ_4_vs(p,
-		    const_reg,                     /* writeback dest */
-		    0,                             /* oword */
-		    1,                             /* relative indexing? */
-		    addrReg,                       /* address register */
-		    16 * src->Index,               /* byte offset */
-		    SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
-		    );
-   /* second vec4 */
-   const2_reg = get_tmp(c);
-
-   /* use upper half of address reg for second read */
-   addrReg = stride(addrReg, 0, 4, 0);
-   addrReg.subnr = 16;
-
-   brw_dp_READ_4_vs(p,
-		    const2_reg,              /* writeback dest */
-		    1,                       /* oword */
-		    1,                       /* relative indexing? */
-		    addrReg,                 /* address register */
-		    16 * src->Index,         /* byte offset */
-		    SURF_INDEX_VERT_CONST_BUFFER
-		    );
-
-   /* merge the two Owords into the constant register */
-   /* const_reg[7..4] = const2_reg[7..4] */
-   brw_MOV(p,
-	   suboffset(stride(const_reg, 0, 4, 1), 4),
-	   suboffset(stride(const2_reg, 0, 4, 1), 4));
-   release_tmp(c, const2_reg);
+   brw_dp_READ_4_vs_relative(p,
+			     const_reg,                     /* writeback dest */
+			     byte_addr_reg,                 /* address register */
+			     16 * src->Index,               /* byte offset */
+			     SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
+			     );
 
    return const_reg;
 }
@@ -993,36 +983,71 @@ static struct brw_reg get_reg( struct brw_vs_compile *c,
  */
 static struct brw_reg deref( struct brw_vs_compile *c,
 			     struct brw_reg arg,
-			     GLint offset)
+			     GLint offset,
+			     GLuint reg_size )
 {
    struct brw_compile *p = &c->func;
-   struct brw_reg tmp = vec4(get_tmp(c));
+   struct brw_reg tmp = get_tmp(c);
    struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
-   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
-   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
+   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
+   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * reg_size;
    struct brw_reg indirect = brw_vec4_indirect(0,0);
+   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);
+
+   /* Set the vertical stride on the register access so that the first
+    * 4 components come from a0.0 and the second 4 from a0.1.
+    */
+   indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL;
 
    {
       brw_push_insn_state(p);
       brw_set_access_mode(p, BRW_ALIGN_1);
 
-      /* This is pretty clunky - load the address register twice and
-       * fetch each 4-dword value in turn.  There must be a way to do
-       * this in a single pass, but I couldn't get it to work.
-       */
-      brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
-      brw_MOV(p, tmp, indirect);
+      brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
+      brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
+
+      brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
+      brw_ADD(p, brw_address_reg(1), acc, brw_imm_uw(byte_offset));
 
-      brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
-      brw_MOV(p, suboffset(tmp, 4), indirect);
+      brw_MOV(p, tmp, indirect);
 
       brw_pop_insn_state(p);
    }
-   
+
    /* NOTE: tmp not released */
-   return vec8(tmp);
+   return tmp;
 }
 
+static void
+move_to_reladdr_dst(struct brw_vs_compile *c,
+		    const struct prog_instruction *inst,
+		    struct brw_reg val)
+{
+   struct brw_compile *p = &c->func;
+   int reg_size = 32;
+   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
+   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
+   struct brw_reg temp_base = c->regs[inst->DstReg.File][0];
+   GLuint byte_offset = temp_base.nr * 32 + temp_base.subnr;
+   struct brw_reg indirect = brw_vec4_indirect(0,0);
+   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);
+
+   byte_offset += inst->DstReg.Index * reg_size;
+
+   brw_push_insn_state(p);
+   brw_set_access_mode(p, BRW_ALIGN_1);
+
+   brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
+   brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
+   brw_MOV(p, indirect, val);
+
+   brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
+   brw_ADD(p, brw_address_reg(0), acc,
+	   brw_imm_uw(byte_offset + reg_size / 2));
+   brw_MOV(p, indirect, suboffset(val, 4));
+
+   brw_pop_insn_state(p);
+}
 
 /**
  * Get brw reg corresponding to the instruction's [argIndex] src reg.
@@ -1091,7 +1116,7 @@ get_src_reg( struct brw_vs_compile *c,
    case PROGRAM_INPUT:
    case PROGRAM_OUTPUT:
       if (relAddr) {
-         return deref(c, c->regs[file][0], index);
+         return deref(c, c->regs[file][0], index, 32);
       }
       else {
          assert(c->regs[file][index].nr != 0);
@@ -1113,7 +1138,7 @@ get_src_reg( struct brw_vs_compile *c,
 	    return get_constant(c, inst, argIndex);
       }
       else if (relAddr) {
-         return deref(c, c->regs[PROGRAM_STATE_VAR][0], index);
+         return deref(c, c->regs[PROGRAM_STATE_VAR][0], index, 16);
       }
       else {
          assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
@@ -1134,26 +1159,6 @@ get_src_reg( struct brw_vs_compile *c,
    }
 }
 
-
-static void emit_arl( struct brw_vs_compile *c,
-		      struct brw_reg dst,
-		      struct brw_reg arg0 )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_reg tmp = dst;
-   GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
-   
-   if (need_tmp) 
-      tmp = get_tmp(c);
-
-   brw_RNDD(p, tmp, arg0);               /* tmp = round(arg0) */
-   brw_MUL(p, dst, tmp, brw_imm_d(16));  /* dst = tmp * 16 */
-
-   if (need_tmp)
-      release_tmp(c, tmp);
-}
-
-
 /**
  * Return the brw reg for the given instruction's src argument.
  * Will return mangled results for SWZ op.  The emit_swz() function
@@ -1198,8 +1203,17 @@ static struct brw_reg get_dst( struct brw_vs_compile *c,
    switch (dst.File) {
    case PROGRAM_TEMPORARY:
    case PROGRAM_OUTPUT:
-      assert(c->regs[dst.File][dst.Index].nr != 0);
-      reg = c->regs[dst.File][dst.Index];
+      /* register-indirect addressing is only 1x1, not VxH, for
+       * destination regs.  So, for RelAddr we'll return a temporary
+       * for the dest and do a move of the result to the RelAddr
+       * register after the instruction emit.
+       */
+      if (dst.RelAddr) {
+	 reg = get_tmp(c);
+      } else {
+	 assert(c->regs[dst.File][dst.Index].nr != 0);
+	 reg = c->regs[dst.File][dst.Index];
+      }
       break;
    case PROGRAM_ADDRESS:
       assert(dst.Index == 0);
@@ -1298,7 +1312,6 @@ static void emit_vertex_write( struct brw_vs_compile *c)
    struct brw_compile *p = &c->func;
    struct brw_context *brw = p->brw;
    struct intel_context *intel = &brw->intel;
-   struct brw_reg m0 = brw_message_reg(0);
    struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
    struct brw_reg ndc;
    int eot;
@@ -1381,16 +1394,19 @@ static void emit_vertex_write( struct brw_vs_compile *c)
     */
    brw_set_access_mode(p, BRW_ALIGN_1);
 
+   /* The VUE layout is documented in Volume 2a. */
    if (intel->gen >= 6) {
-      /* There are 16 DWs (D0-D15) in VUE header on Sandybridge:
+      /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
        * dword 0-3 (m1) of the header is indices, point width, clip flags.
        * dword 4-7 (m2) is the 4D space position
-       * dword 8-15 (m3,m4) of the vertex header is the user clip distance.
-       * m5 is the first vertex data we fill, which is the vertex position.
+       * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
+       * enabled.  We don't use it, so skip it.
+       * m3 is the first vertex element data we fill, which is the vertex
+       * position.
        */
-      brw_MOV(p, offset(m0, 2), pos);
-      brw_MOV(p, offset(m0, 5), pos);
-      len_vertex_header = 4;
+      brw_MOV(p, brw_message_reg(2), pos);
+      brw_MOV(p, brw_message_reg(3), pos);
+      len_vertex_header = 2;
    } else if (intel->gen == 5) {
       /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
        * dword 0-3 (m1) of the header is indices, point width, clip flags.
@@ -1400,9 +1416,9 @@ static void emit_vertex_write( struct brw_vs_compile *c)
        * m6 is a pad so that the vertex element data is aligned
        * m7 is the first vertex data we fill, which is the vertex position.
        */
-      brw_MOV(p, offset(m0, 2), ndc);
-      brw_MOV(p, offset(m0, 3), pos);
-      brw_MOV(p, offset(m0, 7), pos);
+      brw_MOV(p, brw_message_reg(2), ndc);
+      brw_MOV(p, brw_message_reg(3), pos);
+      brw_MOV(p, brw_message_reg(7), pos);
       len_vertex_header = 6;
    } else {
       /* There are 8 dwords in VUE header pre-Ironlake:
@@ -1412,8 +1428,8 @@ static void emit_vertex_write( struct brw_vs_compile *c)
        * dword 8-11 (m3) is the first vertex data, which we always have be the
        * vertex position.
        */
-      brw_MOV(p, offset(m0, 2), ndc);
-      brw_MOV(p, offset(m0, 3), pos);
+      brw_MOV(p, brw_message_reg(2), ndc);
+      brw_MOV(p, brw_message_reg(3), pos);
       len_vertex_header = 2;
    }
 
@@ -1437,29 +1453,26 @@ static void emit_vertex_write( struct brw_vs_compile *c)
        * Move the overflowed attributes from the GRF to the MRF and
        * issue another brw_urb_WRITE().
        */
-      /* XXX I'm not 100% sure about which MRF regs to use here.  Starting
-       * at mrf[4] atm...
-       */
-      GLuint i, mrf = 0;
+      GLuint i, mrf = 1;
       for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
          if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
             /* move from GRF to MRF */
-            brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]);
+            brw_MOV(p, brw_message_reg(mrf), c->regs[PROGRAM_OUTPUT][i]);
             mrf++;
          }
       }
 
       brw_urb_WRITE(p,
                     brw_null_reg(), /* dest */
-                    4,              /* starting mrf reg nr */
+                    0,              /* starting mrf reg nr */
                     c->r0,          /* src */
                     0,              /* allocate */
                     1,              /* used */
-                    mrf+1,          /* msg len */
+                    mrf,            /* msg len */
                     0,              /* response len */
                     1,              /* eot */
                     1,              /* writes complete */
-                    BRW_MAX_MRF-1,  /* urb destination offset */
+                    14 / 2,  /* urb destination offset */
                     BRW_URB_SWIZZLE_INTERLEAVE);
    }
 }
@@ -1665,7 +1678,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
 	 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
 	 break;
       case OPCODE_ARL:
-	 emit_arl(c, dst, args[0]);
+	 brw_RNDD(p, dst, args[0]);
 	 break;
       case OPCODE_FLR:
 	 brw_RNDD(p, dst, args[0]);
@@ -1890,6 +1903,14 @@ void brw_vs_emit(struct brw_vs_compile *c )
          }
       }
 
+      if (inst->DstReg.RelAddr && inst->DstReg.File == PROGRAM_TEMPORARY) {
+	 /* We don't do RelAddr of PROGRAM_OUTPUT yet, because of the
+	  * compute-to-mrf and the fact that we are allocating
+	  * registers for only the used PROGRAM_OUTPUTs.
+	  */
+	 move_to_reladdr_dst(c, inst, dst);
+      }
+
       release_tmps(c);
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
index be9e415cb0..0250a68d29 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
@@ -31,7 +31,7 @@
 
 #include "main/mtypes.h"
 #include "main/texstore.h"
-#include "shader/prog_parameter.h"
+#include "program/prog_parameter.h"
 
 #include "brw_context.h"
 #include "brw_state.h"
diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h
index 197b875434..40f51c21c9 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.h
+++ b/src/mesa/drivers/dri/i965/brw_wm.h
@@ -34,7 +34,7 @@
 #define BRW_WM_H
 
 
-#include "shader/prog_instruction.h"
+#include "program/prog_instruction.h"
 #include "brw_context.h"
 #include "brw_eu.h"
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c
index a90a2d3cf2..0c625a4cd0 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c
@@ -1326,7 +1326,7 @@ void emit_fb_write(struct brw_wm_compile *c,
 	  * + 1 for the second half we get destination + 4.
 	  */
 	 brw_MOV(p,
-		 brw_message_reg(nr + channel + (1 << 7)),
+		 brw_message_reg(nr + channel + BRW_MRF_COMPR4),
 		 arg0[channel]);
       } else {
 	 /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
@@ -1763,12 +1763,20 @@ void brw_wm_emit( struct brw_wm_compile *c )
 		      inst->dst[i]->spill_slot);
    }
 
+   /* Only properly tested on ILK */
+   if (p->brw->intel.gen == 5) {
+     brw_remove_duplicate_mrf_moves(p);
+     if (c->dispatch_width == 16)
+	brw_remove_grf_to_mrf_moves(p);
+   }
+
    if (INTEL_DEBUG & DEBUG_WM) {
       int i;
 
-      printf("wm-native:\n");
-      for (i = 0; i < p->nr_insn; i++)
+     printf("wm-native:\n");
+     for (i = 0; i < p->nr_insn; i++)
 	 brw_disasm(stderr, &p->store[i], p->brw->intel.gen);
       printf("\n");
    }
 }
+
diff --git a/src/mesa/drivers/dri/i965/brw_wm_fp.c b/src/mesa/drivers/dri/i965/brw_wm_fp.c
index d73c391582..0bef874b88 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_fp.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_fp.c
@@ -37,9 +37,9 @@
 #include "brw_wm.h"
 #include "brw_util.h"
 
-#include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
-#include "shader/prog_statevars.h"
+#include "program/prog_parameter.h"
+#include "program/prog_print.h"
+#include "program/prog_statevars.h"
 
 
 /** An invalid texture target */
diff --git a/src/mesa/drivers/dri/i965/brw_wm_glsl.c b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
index 57be08a8d1..2dd346d6dd 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_glsl.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
@@ -1,7 +1,7 @@
 #include "main/macros.h"
-#include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
-#include "shader/prog_optimize.h"
+#include "program/prog_parameter.h"
+#include "program/prog_print.h"
+#include "program/prog_optimize.h"
 #include "brw_context.h"
 #include "brw_eu.h"
 #include "brw_wm.h"
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass0.c b/src/mesa/drivers/dri/i965/brw_wm_pass0.c
index 60bd92ed22..05de85a957 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass0.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass0.c
@@ -32,7 +32,7 @@
 
 #include "brw_context.h"
 #include "brw_wm.h"
-#include "shader/prog_parameter.h"
+#include "program/prog_parameter.h"
 
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c
index 1789b21451..c1cf4db1ca 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_state.c
@@ -222,7 +222,7 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
       drm_intel_bo_emit_reloc(bo, offsetof(struct brw_wm_unit_state, thread2),
 			      brw->wm.scratch_bo,
 			      wm.thread2.per_thread_scratch_space,
-			      0, 0);
+			      I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER);
    }
 
    /* Emit sampler state relocation */
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 77898dbbe7..17b016b569 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -32,7 +32,7 @@
 
 #include "main/mtypes.h"
 #include "main/texstore.h"
-#include "shader/prog_parameter.h"
+#include "program/prog_parameter.h"
 
 #include "intel_mipmap_tree.h"
 #include "intel_batchbuffer.h"
diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c
index 51940efb44..6820ca3abf 100644
--- a/src/mesa/drivers/dri/i965/gen6_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c
@@ -69,7 +69,7 @@ upload_sf_state(struct brw_context *brw)
    dw1 =
       num_outputs << GEN6_SF_NUM_OUTPUTS_SHIFT |
       (num_inputs + 1) / 2 << GEN6_SF_URB_ENTRY_READ_LENGTH_SHIFT |
-      3 << GEN6_SF_URB_ENTRY_READ_OFFSET_SHIFT;
+      1 << GEN6_SF_URB_ENTRY_READ_OFFSET_SHIFT;
    dw2 = GEN6_SF_VIEWPORT_TRANSFORM_ENABLE |
       GEN6_SF_STATISTICS_ENABLE;
    dw3 = 0;
diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c b/src/mesa/drivers/dri/i965/gen6_vs_state.c
index 5916a13994..4080a9dedf 100644
--- a/src/mesa/drivers/dri/i965/gen6_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c
@@ -29,8 +29,8 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 #include "brw_util.h"
-#include "shader/prog_parameter.h"
-#include "shader/prog_statevars.h"
+#include "program/prog_parameter.h"
+#include "program/prog_statevars.h"
 #include "intel_batchbuffer.h"
 
 static void
diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c
index ed1a72f03b..863c85449d 100644
--- a/src/mesa/drivers/dri/i965/gen6_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c
@@ -29,8 +29,8 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 #include "brw_util.h"
-#include "shader/prog_parameter.h"
-#include "shader/prog_statevars.h"
+#include "program/prog_parameter.h"
+#include "program/prog_statevars.h"
 #include "intel_batchbuffer.h"
 
 static void