62 files changed, 2272 insertions, 3876 deletions
diff --git a/progs/vp/vp-tris.c b/progs/vp/vp-tris.c
index e5be65e78c..f9e6cdad74 100644
--- a/progs/vp/vp-tris.c
+++ b/progs/vp/vp-tris.c
@@ -90,7 +90,9 @@ static void Init( void )
    }
 
    fprintf(stderr, "%.*s\n", sz, buf);
-      
+
+   glEnable(GL_VERTEX_PROGRAM_NV);
+
    glGenProgramsARB(1, &prognum);
 
    glBindProgramARB(GL_VERTEX_PROGRAM_ARB, prognum);
@@ -168,8 +170,6 @@ static void Display( void )
    glClearColor(0.3, 0.3, 0.3, 1);
    glClear( GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT );
 
-   glEnable(GL_VERTEX_PROGRAM_NV);
-
    glBegin(GL_TRIANGLES);
 
 
diff --git a/src/mesa/main/texenvprogram.c b/src/mesa/main/texenvprogram.c
index efb3b35f6a..d866d10017 100644
--- a/src/mesa/main/texenvprogram.c
+++ b/src/mesa/main/texenvprogram.c
@@ -936,10 +936,16 @@ static void load_texture( struct texenv_fragment_program *p, GLuint unit )
 			  
       /* TODO: Use D0_MASK_XY where possible.
        */
-      if (p->state->unit[unit].enabled) 
+      if (p->state->unit[unit].enabled) {
 	 p->src_texture[unit] = emit_texld( p, OPCODE_TXP,
 					    tmp, WRITEMASK_XYZW, 
 					    unit, dim, texcoord );
+         p->program->Base.SamplersUsed |= (1 << unit);
+         /* This identity mapping should already be in place
+          * (see _mesa_init_program_struct()) but let's be safe.
+          */
+         p->program->Base.SamplerUnits[unit] = unit;
+      }
       else
 	 p->src_texture[unit] = get_zero(p);
    }
diff --git a/src/mesa/pipe/i915simple/i915_fpc_translate.c b/src/mesa/pipe/i915simple/i915_fpc_translate.c
index 1cd554250c..d517b88acc 100644
--- a/src/mesa/pipe/i915simple/i915_fpc_translate.c
+++ b/src/mesa/pipe/i915simple/i915_fpc_translate.c
@@ -928,8 +928,9 @@ i915_translate_instructions(struct i915_fp_compile *p,
          break;
 
       case TGSI_TOKEN_TYPE_IMMEDIATE:
-         /* XXX no-op? */
-         assert(0);
+         /* This is a no-op.  We'll get immediates from the usual constant/
+          * uniform buffer.
+          */
          break;
 
       case TGSI_TOKEN_TYPE_INSTRUCTION:
diff --git a/src/mesa/pipe/i915simple/i915_texture.c b/src/mesa/pipe/i915simple/i915_texture.c
index fefd105adf..44f72e63cc 100644
--- a/src/mesa/pipe/i915simple/i915_texture.c
+++ b/src/mesa/pipe/i915simple/i915_texture.c
@@ -47,10 +47,6 @@ static unsigned minify( unsigned d )
    return MAX2(1, d>>1);
 }
 
-static int align(int value, int alignment)
-{
-   return (value + alignment - 1) & ~(alignment - 1);
-}
 
 
 static void
diff --git a/src/mesa/pipe/i965simple/Makefile b/src/mesa/pipe/i965simple/Makefile
index eda5afaea5..48c00ab50b 100644
--- a/src/mesa/pipe/i965simple/Makefile
+++ b/src/mesa/pipe/i965simple/Makefile
@@ -31,6 +31,7 @@ DRIVER_SOURCES = \
 	brw_sf.c \
 	brw_sf_emit.c \
 	brw_sf_state.c \
+        brw_shader_info.c \
         brw_state.c \
 	brw_state_batch.c \
 	brw_state_cache.c \
@@ -40,12 +41,11 @@ DRIVER_SOURCES = \
 	brw_urb.c \
 	brw_util.c \
 	brw_vs.c \
-	brw_vs_constval.c \
 	brw_vs_emit.c \
 	brw_vs_state.c \
-	brw_vtbl.c \
 	brw_wm.c \
 	brw_wm_iz.c \
+	brw_wm_decl.c \
 	brw_wm_glsl.c \
 	brw_wm_sampler_state.c \
 	brw_wm_state.c \
diff --git a/src/mesa/pipe/i965simple/brw_batch.h b/src/mesa/pipe/i965simple/brw_batch.h
index 7c778f360b..bef69ac871 100644
--- a/src/mesa/pipe/i965simple/brw_batch.h
+++ b/src/mesa/pipe/i965simple/brw_batch.h
@@ -36,7 +36,7 @@
 #define INTEL_BATCH_CLIPRECTS    0x2
 
 #define BEGIN_BATCH( dwords, relocs ) \
-   (brw->batch_start = brw->winsys->batch_start(brw->winsys, dwords, relocs))
+   brw->winsys->batch_start(brw->winsys, dwords, relocs)
 
 #define OUT_BATCH( dword ) \
    brw->winsys->batch_dword(brw->winsys, dword)
@@ -50,7 +50,6 @@
  */
 #define FLUSH_BATCH(fence) do {				\
    brw->winsys->batch_flush(brw->winsys, fence);	\
-   brw->batch_start = NULL;				\
    brw->hardware_dirty = ~0;				\
 } while (0)
 
diff --git a/src/mesa/pipe/i965simple/brw_cc.c b/src/mesa/pipe/i965simple/brw_cc.c
index fc7fdba53f..6cc1505311 100644
--- a/src/mesa/pipe/i965simple/brw_cc.c
+++ b/src/mesa/pipe/i965simple/brw_cc.c
@@ -142,7 +142,7 @@ static void upload_cc_vp( struct brw_context *brw )
 
 const struct brw_tracked_state brw_cc_vp = {
    .dirty = {
-      .brw = BRW_NEW_CONTEXT,
+      .brw = BRW_NEW_SCENE,
       .cache = 0
    },
    .update = upload_cc_vp
diff --git a/src/mesa/pipe/i965simple/brw_clip_state.c b/src/mesa/pipe/i965simple/brw_clip_state.c
index 51a4666a0b..ea5c05a279 100644
--- a/src/mesa/pipe/i965simple/brw_clip_state.c
+++ b/src/mesa/pipe/i965simple/brw_clip_state.c
@@ -32,7 +32,7 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-
+#include "pipe/p_util.h"
 
 
 static void upload_clip_unit( struct brw_context *brw )
@@ -43,7 +43,7 @@ static void upload_clip_unit( struct brw_context *brw )
 
    /* CACHE_NEW_CLIP_PROG */
    clip.thread0.grf_reg_count =
-      ALIGN(brw->clip.prog_data->total_grf, 16) / 16 - 1;
+      align(brw->clip.prog_data->total_grf, 16) / 16 - 1;
    clip.thread0.kernel_start_pointer = brw->clip.prog_gs_offset >> 6;
    clip.thread3.urb_entry_read_length = brw->clip.prog_data->urb_read_length;
    clip.thread3.const_urb_entry_read_length = brw->clip.prog_data->curb_read_length;
diff --git a/src/mesa/pipe/i965simple/brw_context.c b/src/mesa/pipe/i965simple/brw_context.c
index e69ba6938e..5e58701e91 100644
--- a/src/mesa/pipe/i965simple/brw_context.c
+++ b/src/mesa/pipe/i965simple/brw_context.c
@@ -237,7 +237,6 @@ struct pipe_context *brw_create(struct pipe_winsys *pipe_winsys,
    brw->pci_id = pci_id;
    brw->dirty = ~0;
    brw->hardware_dirty = ~0;
-   brw->batch_start = NULL;
 
    memset(&brw->wm.bind, ~0, sizeof(brw->wm.bind));
 
diff --git a/src/mesa/pipe/i965simple/brw_context.h b/src/mesa/pipe/i965simple/brw_context.h
index 53f66cd6a9..318c6a7049 100644
--- a/src/mesa/pipe/i965simple/brw_context.h
+++ b/src/mesa/pipe/i965simple/brw_context.h
@@ -119,7 +119,6 @@
  * Handles blending and (presumably) depth and stencil testing.
  */
 
-#define BRW_FALLBACK_TEXTURE		 0x1
 #define BRW_MAX_CURBE                    (32*16)
 
 struct brw_context;
@@ -147,16 +146,13 @@ struct brw_winsys;
 /* Raised for other internal events:
  */
 #define BRW_NEW_URB_FENCE               0x10000
-#define BRW_NEW_INPUT_DIMENSIONS        0x20000
+#define BRW_NEW_PSP                     0x20000
 #define BRW_NEW_CURBE_OFFSETS           0x40000
 #define BRW_NEW_REDUCED_PRIMITIVE       0x80000
 #define BRW_NEW_PRIMITIVE               0x100000
-#define BRW_NEW_CONTEXT                 0x200000
-#define BRW_NEW_WM_INPUT_DIMENSIONS     0x400000
-#define BRW_NEW_INPUT_VARYING           0x800000
-#define BRW_NEW_PSP                     0x1000000
+#define BRW_NEW_SCENE                 0x200000
+#define BRW_NEW_SF_LINKAGE              0x400000
 
-#define ALIGN(value, alignment)  ((value + alignment - 1) & ~(alignment - 1))
 extern int BRW_DEBUG;
 
 #define DEBUG_TEXTURE	0x1
@@ -198,23 +194,47 @@ struct brw_state_flags {
    unsigned brw;
 };
 
+/* Register counts for one shader; array presumably indexed by TGSI_FILE_*. */
+struct brw_shader_info {
+   int nr_regs[8];		/* TGSI_FILE_* */
+};
+   
+
+
 struct brw_vertex_program {
    struct pipe_shader_state program;
-   unsigned id;
-   unsigned param_state;		/* flags indicating state tracked by params */
+   struct brw_shader_info info;
+   int id;
 };
 
 
 
 struct brw_fragment_program {
    struct pipe_shader_state program;
-   unsigned id;
-   unsigned param_state;		/* flags indicating state tracked by params */
+   struct brw_shader_info info;
+   
+   boolean UsesDepth;
    boolean UsesKill;
    boolean ComputesDepth;
+   int id;
 };
 
 
+/* Routing of vertex program outputs to fragment program inputs, as
+ * built by update_sf_linkage() in brw_sf.c and stored in brw->sf.linkage. */
+struct pipe_setup_linkage {
+   struct {
+      unsigned vp_output:5;	/* vp output with the same semantic; ~0 until matched */
+      unsigned interp_mode:4;	/* Interpolate value from the fp input declaration */
+      unsigned bf_vp_output:5;	/* BCOLOR vp output for COLOR inputs; ~0 until matched */
+   } fp_input[PIPE_MAX_SHADER_INPUTS];
+
+   unsigned fp_input_count:5;	/* NOTE(review): 5 bits cannot hold 32 - confirm max */
+   unsigned max_vp_output:5;	/* not written by update_sf_linkage() */
+};
+   
+
+
 struct brw_texture {
    struct pipe_texture base;
 
@@ -248,6 +268,12 @@ struct brw_texture {
  * corresponding to a different brw_wm_prog_key struct, with different
  * compiled programs:
  */
+/* Data about a particular attempt to compile a program.  Note that
+ * there can be many of these, each in a different GL state
+ * corresponding to a different brw_wm_prog_key struct, with different
+ * compiled programs:
+ */
+
 struct brw_wm_prog_data {
    unsigned curb_read_length;
    unsigned urb_read_length;
@@ -256,13 +282,14 @@ struct brw_wm_prog_data {
    unsigned total_grf;
    unsigned total_scratch;
 
-   unsigned nr_params;
-   boolean error;
-
-   /* Pointer to tracked values (only valid once
-    * _mesa_load_state_parameters has been called at runtime).
+   /* Internally generated constants for the CURBE.  These are loaded
+    * ahead of the data from the constant buffer.
     */
-   const float *param[BRW_MAX_CURBE];
+   const float internal_const[8];
+   unsigned nr_internal_consts;
+   unsigned max_const;
+
+   boolean error;
 };
 
 struct brw_sf_prog_data {
@@ -298,19 +325,14 @@ struct brw_vs_prog_data {
 
    unsigned inputs_read;
 
+   unsigned max_const;
+
    /* Used for calculating urb partitions:
     */
    unsigned urb_entry_size;
 };
 
 
-/* Size == 0 if output either not written, or always [0,0,0,1]
- */
-struct brw_vs_ouput_sizes {
-   ubyte output_size[PIPE_MAX_SHADER_OUTPUTS];
-};
-
-
 #define BRW_MAX_TEX_UNIT 8
 #define BRW_WM_MAX_SURF BRW_MAX_TEX_UNIT + 1
 
@@ -374,8 +396,6 @@ struct brw_cache {
 struct brw_tracked_state {
    struct brw_state_flags dirty;
    void (*update)( struct brw_context *brw );
-   void (*emit_reloc)( struct brw_context *brw );
-   boolean always_update;
 };
 
 
@@ -455,8 +475,6 @@ struct brw_context
 
    struct {
       struct brw_state_flags dirty;
-      struct brw_tracked_state **atoms;
-      unsigned nr_atoms;
    } state;
 
 
@@ -489,34 +507,23 @@ struct brw_context
       /* Arrays with buffer objects to copy non-bufferobj arrays into
        * for upload:
        */
-      struct pipe_vertex_buffer vbo_array[PIPE_ATTRIB_MAX];
+      struct pipe_vertex_buffer *vbo_array[PIPE_ATTRIB_MAX];
 
       struct brw_vertex_element inputs[PIPE_ATTRIB_MAX];
 
 #define BRW_NR_UPLOAD_BUFS 17
 #define BRW_UPLOAD_INIT_SIZE (128*1024)
 
-      struct {
-	 struct pipe_buffer_handle *vbo[BRW_NR_UPLOAD_BUFS];
-	 unsigned buf;
-	 unsigned offset;
-	 unsigned size;
-	 unsigned wrap;
-      } upload;
-
       /* Summary of size and varying of active arrays, so we can check
        * for changes to this state:
        */
       struct brw_vertex_info info;
-      int last_vb;
    } vb;
 
 
-   unsigned *batch_start;
    unsigned hardware_dirty;
    unsigned dirty;
    unsigned pci_id;
-
    /* BRW_NEW_URB_ALLOCATIONS:
     */
    struct {
@@ -557,11 +564,6 @@ struct brw_context
       unsigned vs_size;
       unsigned total_size;
 
-      /* Dynamic tracker which changes to reflect the state referenced
-       * by active fp and vp program parameters:
-       */
-      struct brw_tracked_state tracked_state;
-
       unsigned gs_offset;
 
       float *last_buf;
@@ -595,6 +597,8 @@ struct brw_context
    struct {
       struct brw_sf_prog_data *prog_data;
 
+      struct pipe_setup_linkage linkage;
+
       unsigned prog_gs_offset;
       unsigned vp_gs_offset;
       unsigned state_gs_offset;
@@ -602,11 +606,8 @@ struct brw_context
 
    struct {
       struct brw_wm_prog_data *prog_data;
-      struct brw_wm_compile *compile_data;
 
-      /* Input sizes, calculated from active vertex program:
-       */
-      unsigned input_size_masks[4];
+//      struct brw_wm_compiler *compile_data;
 
 
       /**
@@ -667,8 +668,6 @@ void brw_destroy_state(struct brw_context *brw);
  * brw_tex.c
  */
 void brwUpdateTextureState( struct brw_context *brw );
-void brw_FrameBufferTexInit( struct brw_context *brw );
-void brw_FrameBufferTexDestroy( struct brw_context *brw );
 
 
 /* brw_urb.c
diff --git a/src/mesa/pipe/i965simple/brw_curbe.c b/src/mesa/pipe/i965simple/brw_curbe.c
index 0894e82d56..b943a7af98 100644
--- a/src/mesa/pipe/i965simple/brw_curbe.c
+++ b/src/mesa/pipe/i965simple/brw_curbe.c
@@ -35,6 +35,9 @@
 #include "brw_defines.h"
 #include "brw_state.h"
 #include "brw_util.h"
+#include "brw_wm.h"
+#include "pipe/p_state.h"
+#include "pipe/p_util.h"
 
 #define FILE_DEBUG_FLAG DEBUG_FALLBACKS
 
@@ -43,11 +46,10 @@
 static void calculate_curbe_offsets( struct brw_context *brw )
 {
    /* CACHE_NEW_WM_PROG */
-   unsigned nr_fp_regs = (brw->wm.prog_data->nr_params + 15) / 16;
+   unsigned nr_fp_regs = align(brw->wm.prog_data->max_const, 16);
 
    /* BRW_NEW_VERTEX_PROGRAM */
-   struct brw_vertex_program *vp = (struct brw_vertex_program *)brw->attribs.VertexProgram;
-   unsigned nr_vp_regs = (vp->program.num_inputs * 4 + 15) / 16;
+   unsigned nr_vp_regs = align(brw->vs.prog_data->max_const, 16);
    unsigned nr_clip_regs = 0;
    unsigned total_regs;
 
@@ -55,7 +57,7 @@ static void calculate_curbe_offsets( struct brw_context *brw )
    /* BRW_NEW_CLIP ? */
    if (brw->attribs.Transform->ClipPlanesEnabled) {
       unsigned nr_planes = 6 + brw_count_bits(brw->attribs.Transform->ClipPlanesEnabled);
-      nr_clip_regs = (nr_planes * 4 + 15) / 16;
+      nr_clip_regs = align(nr_planes * 4, 16);
    }
 #endif
 
@@ -172,28 +174,18 @@ static float fixed_plane[6][4] = {
    { 1,    0,    0, 1 }
 };
 
-#if 0
 /* Upload a new set of constants.  Too much variability to go into the
  * cache mechanism, but maybe would benefit from a comparison against
  * the current uploaded set of constants.
  */
 static void upload_constant_buffer(struct brw_context *brw)
 {
-   GLcontext *ctx = &brw->intel.ctx;
-   struct brw_vertex_program *vp = (struct brw_vertex_program *)brw->vertex_program;
-   struct brw_fragment_program *fp = (struct brw_fragment_program *)brw->fragment_program;
    struct brw_mem_pool *pool = &brw->pool[BRW_GS_POOL];
    unsigned sz = brw->curbe.total_size;
    unsigned bufsz = sz * 16 * sizeof(float);
    float *buf;
    unsigned i;
 
-   /* Update our own dependency flags.  This works because this
-    * function will also be called whenever fp or vp changes.
-    */
-   brw->curbe.tracked_state.dirty.mesa = (_NEW_TRANSFORM|_NEW_PROJECTION);
-   brw->curbe.tracked_state.dirty.mesa |= vp->param_state;
-   brw->curbe.tracked_state.dirty.mesa |= fp->param_state;
 
    if (sz == 0) {
       struct brw_constant_buffer cb;
@@ -220,10 +212,16 @@ static void upload_constant_buffer(struct brw_context *brw)
    if (brw->curbe.wm_size) {
       unsigned offset = brw->curbe.wm_start * 16;
 
-      _mesa_load_state_parameters(ctx, fp->program.Base.Parameters);
+      /* First the constant buffer constants:
+       */
+      
+      /* Then any internally generated constants: 
+       */
+      for (i = 0; i < brw->wm.prog_data->nr_internal_consts; i++)
+	 buf[offset + i] = brw->wm.prog_data->internal_const[i];
 
-      for (i = 0; i < brw->wm.prog_data->nr_params; i++)
-	 buf[offset + i] = brw->wm.prog_data->param[i][0];
+      assert(brw->wm.prog_data->max_const == 
+	     brw->wm.prog_data->nr_internal_consts);
    }
 
 
@@ -243,34 +241,26 @@ static void upload_constant_buffer(struct brw_context *brw)
 	 buf[offset + i * 4 + 3] = fixed_plane[i][3];
       }
 
-      /* Clip planes: _NEW_TRANSFORM plus _NEW_PROJECTION to get to
-       * clip-space:
+      /* Clip planes: BRW_NEW_CLIP:
        */
-      assert(MAX_CLIP_PLANES == 6);
-      for (j = 0; j < MAX_CLIP_PLANES; j++) {
-	 if (brw->attribs.Transform->ClipPlanesEnabled & (1<<j)) {
-	    buf[offset + i * 4 + 0] = brw->attribs.Transform->_ClipUserPlane[j][0];
-	    buf[offset + i * 4 + 1] = brw->attribs.Transform->_ClipUserPlane[j][1];
-	    buf[offset + i * 4 + 2] = brw->attribs.Transform->_ClipUserPlane[j][2];
-	    buf[offset + i * 4 + 3] = brw->attribs.Transform->_ClipUserPlane[j][3];
-	    i++;
-	 }
+      for (j = 0; j < brw->attribs.Clip.nr; j++) {
+	 buf[offset + i * 4 + 0] = brw->attribs.Clip.ucp[j][0];
+	 buf[offset + i * 4 + 1] = brw->attribs.Clip.ucp[j][1];
+	 buf[offset + i * 4 + 2] = brw->attribs.Clip.ucp[j][2];
+	 buf[offset + i * 4 + 3] = brw->attribs.Clip.ucp[j][3];
+	 i++;
       }
    }
 
 
    if (brw->curbe.vs_size) {
-      unsigned offset = brw->curbe.vs_start * 16;
-      unsigned nr = vp->program.Base.Parameters->NumParameters;
+//      unsigned offset = brw->curbe.vs_start * 16;
+//      unsigned nr = vp->max_const;
 
-      _mesa_load_state_parameters(ctx, vp->program.Base.Parameters);
+      /* map the vertex constant buffer and copy to curbe: */
 
-      for (i = 0; i < nr; i++) {
-	 buf[offset + i * 4 + 0] = vp->program.Base.Parameters->ParameterValues[i][0];
-	 buf[offset + i * 4 + 1] = vp->program.Base.Parameters->ParameterValues[i][1];
-	 buf[offset + i * 4 + 2] = vp->program.Base.Parameters->ParameterValues[i][2];
-	 buf[offset + i * 4 + 3] = vp->program.Base.Parameters->ParameterValues[i][3];
-      }
+//      assert(nr == 0);
+      assert(0);
    }
 
    if (0) {
@@ -309,7 +299,12 @@ static void upload_constant_buffer(struct brw_context *brw)
 
       /* Copy data to the buffer:
        */
-      dri_bo_subdata(pool->buffer, brw->curbe.gs_offset, bufsz, buf);
+      brw->winsys->buffer_subdata_typed(brw->winsys,
+					pool->buffer, 
+					brw->curbe.gs_offset, 
+					bufsz, 
+					buf,
+					BRW_CONSTANT_BUFFER );
    }
 
    /* TODO: only emit the constant_buffer packet when necessary, ie:
@@ -341,9 +336,7 @@ static void upload_constant_buffer(struct brw_context *brw)
        * flushes as necessary when doublebuffering of CURBEs isn't
        * possible.
        */
-/*       intel_batchbuffer_align(brw->intel.batch, 64, sizeof(cb)); */
       BRW_BATCH_STRUCT(brw, &cb);
-/*       intel_batchbuffer_align(brw->intel.batch, 64, 0); */
    }
 }
 
@@ -355,9 +348,8 @@ static void upload_constant_buffer(struct brw_context *brw)
  */
 const struct brw_tracked_state brw_constant_buffer = {
    .dirty = {
-      .mesa = (_NEW_TRANSFORM|_NEW_PROJECTION),      /* plus fp and vp flags */
-      .brw  = (BRW_NEW_FRAGMENT_PROGRAM |
-	       BRW_NEW_VERTEX_PROGRAM |
+      .brw  = (BRW_NEW_CLIP |
+	       BRW_NEW_CONSTANTS |
 	       BRW_NEW_URB_FENCE | /* Implicit - hardware requires this, not used above */
 	       BRW_NEW_PSP | /* Implicit - hardware requires this, not used above */
 	       BRW_NEW_CURBE_OFFSETS),
@@ -366,4 +358,3 @@ const struct brw_tracked_state brw_constant_buffer = {
    .update = upload_constant_buffer
 };
 
-#endif
diff --git a/src/mesa/pipe/i965simple/brw_draw.c b/src/mesa/pipe/i965simple/brw_draw.c
index f443f41c6f..01c8ddb227 100644
--- a/src/mesa/pipe/i965simple/brw_draw.c
+++ b/src/mesa/pipe/i965simple/brw_draw.c
@@ -151,7 +151,6 @@ static boolean brw_try_draw_elements( struct pipe_context *pipe,
 				      unsigned count )
 {
    struct brw_context *brw = brw_context(pipe);
-   boolean retval = FALSE;
 
    /* Set the first primitive ahead of validate_state:
     */
diff --git a/src/mesa/pipe/i965simple/brw_draw_upload.c b/src/mesa/pipe/i965simple/brw_draw_upload.c
index 186a6274fa..79144837e8 100644
--- a/src/mesa/pipe/i965simple/brw_draw_upload.c
+++ b/src/mesa/pipe/i965simple/brw_draw_upload.c
@@ -207,25 +207,28 @@ static unsigned get_index_type(int type)
 boolean brw_upload_vertex_buffers( struct brw_context *brw )
 {
    struct brw_array_state vbp;
+   unsigned nr_enabled = 0;
    unsigned i;
-   int nr_enabled = brw->vb.last_vb + 1;
 
    memset(&vbp, 0, sizeof(vbp));
 
    /* This is a hardware limit:
     */
-   if (nr_enabled >= BRW_VEP_MAX)
-	 return FALSE;
 
-   for (i = 0; i < nr_enabled; i++)
+   for (i = 0; i < BRW_VEP_MAX; i++)
    {
-      vbp.vb[i].vb0.bits.pitch = brw->vb.vbo_array[i].pitch;
+      if (brw->vb.vbo_array[i]->buffer == NULL) {
+	 nr_enabled = i;
+	 break;
+      }
+
+      vbp.vb[i].vb0.bits.pitch = brw->vb.vbo_array[i]->pitch;
       vbp.vb[i].vb0.bits.pad = 0;
       vbp.vb[i].vb0.bits.access_type = BRW_VERTEXBUFFER_ACCESS_VERTEXDATA;
       vbp.vb[i].vb0.bits.vb_index = i;
-      vbp.vb[i].offset = brw->vb.vbo_array[i].buffer_offset;
-      vbp.vb[i].buffer = brw->vb.vbo_array[i].buffer;
-      vbp.vb[i].max_index = brw->vb.vbo_array[i].max_index;
+      vbp.vb[i].offset = brw->vb.vbo_array[i]->buffer_offset;
+      vbp.vb[i].buffer = brw->vb.vbo_array[i]->buffer;
+      vbp.vb[i].max_index = brw->vb.vbo_array[i]->max_index;
    }
 
 
@@ -260,7 +263,7 @@ boolean brw_upload_vertex_elements( struct brw_context *brw )
    for (i = 0; i < nr_enabled; i++) {
       struct brw_vertex_element *input = &brw->vb.inputs[i];
 
-      switch (brw->vb.vbo_array[input->vep.ve0.vertex_buffer_index].pitch) {
+      switch (brw->vb.vbo_array[input->vep.ve0.vertex_buffer_index]->pitch) {
       case 0: input->vep.ve1.vfcomponent0 = BRW_VFCOMPONENT_STORE_0;
       case 1: input->vep.ve1.vfcomponent1 = BRW_VFCOMPONENT_STORE_0;
       case 2: input->vep.ve1.vfcomponent2 = BRW_VFCOMPONENT_STORE_0;
diff --git a/src/mesa/pipe/i965simple/brw_eu.h b/src/mesa/pipe/i965simple/brw_eu.h
index 111edb1506..23151ae9ed 100644
--- a/src/mesa/pipe/i965simple/brw_eu.h
+++ b/src/mesa/pipe/i965simple/brw_eu.h
@@ -694,6 +694,17 @@ void brw_init_compile( struct brw_compile *p );
 const unsigned *brw_get_program( struct brw_compile *p, unsigned *sz );
 
 
+struct brw_instruction *brw_alu1( struct brw_compile *p,
+				  unsigned opcode,
+				  struct brw_reg dest,
+				  struct brw_reg src );
+
+struct brw_instruction *brw_alu2(struct brw_compile *p,
+				 unsigned opcode,
+				 struct brw_reg dest,
+				 struct brw_reg src0,
+				 struct brw_reg src1 );
+
 /* Helpers for regular instructions:
  */
 #define ALU1(OP)					\
diff --git a/src/mesa/pipe/i965simple/brw_eu_emit.c b/src/mesa/pipe/i965simple/brw_eu_emit.c
index bda63e8b9a..2423536dd1 100644
--- a/src/mesa/pipe/i965simple/brw_eu_emit.c
+++ b/src/mesa/pipe/i965simple/brw_eu_emit.c
@@ -363,10 +363,10 @@ static struct brw_instruction *next_insn( struct brw_compile *p,
 }
 
 
-static struct brw_instruction *brw_alu1( struct brw_compile *p,
-					 unsigned opcode,
-					 struct brw_reg dest,
-					 struct brw_reg src )
+struct brw_instruction *brw_alu1( struct brw_compile *p,
+				  unsigned opcode,
+				  struct brw_reg dest,
+				  struct brw_reg src )
 {
    struct brw_instruction *insn = next_insn(p, opcode);
    brw_set_dest(insn, dest);
@@ -374,11 +374,11 @@ static struct brw_instruction *brw_alu1( struct brw_compile *p,
    return insn;
 }
 
-static struct brw_instruction *brw_alu2(struct brw_compile *p,
-					unsigned opcode,
-					struct brw_reg dest,
-					struct brw_reg src0,
-					struct brw_reg src1 )
+struct brw_instruction *brw_alu2(struct brw_compile *p,
+				 unsigned opcode,
+				 struct brw_reg dest,
+				 struct brw_reg src0,
+				 struct brw_reg src1 )
 {
    struct brw_instruction *insn = next_insn(p, opcode);
    brw_set_dest(insn, dest);
diff --git a/src/mesa/pipe/i965simple/brw_gs_state.c b/src/mesa/pipe/i965simple/brw_gs_state.c
index 8e62eb4bd7..3932e9e939 100644
--- a/src/mesa/pipe/i965simple/brw_gs_state.c
+++ b/src/mesa/pipe/i965simple/brw_gs_state.c
@@ -34,6 +34,7 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
+#include "pipe/p_util.h"
 
 
 
@@ -46,7 +47,7 @@ static void upload_gs_unit( struct brw_context *brw )
    /* CACHE_NEW_GS_PROG */
    if (brw->gs.prog_active) {
       gs.thread0.grf_reg_count =
-	 ALIGN(brw->gs.prog_data->total_grf, 16) / 16 - 1;
+	 align(brw->gs.prog_data->total_grf, 16) / 16 - 1;
       gs.thread0.kernel_start_pointer = brw->gs.prog_gs_offset >> 6;
       gs.thread3.urb_entry_read_length = brw->gs.prog_data->urb_read_length;
    }
diff --git a/src/mesa/pipe/i965simple/brw_misc_state.c b/src/mesa/pipe/i965simple/brw_misc_state.c
index 0750502334..e600e9d8de 100644
--- a/src/mesa/pipe/i965simple/brw_misc_state.c
+++ b/src/mesa/pipe/i965simple/brw_misc_state.c
@@ -202,9 +202,7 @@ static void upload_depthbuffer(struct brw_context *brw)
       OUT_BATCH(((depth_surface->pitch * depth_surface->cpp) - 1) |
 		(format << 18) |
 		(BRW_TILEWALK_YMAJOR << 26) |
-#if 0
-		(depth_surface->region->tiled << 27) |
-#endif
+//		(depth_surface->region->tiled << 27) |
 		(BRW_SURFACE_2D << 29));
       OUT_RELOC(depth_surface->buffer,
 		PIPE_BUFFER_FLAG_READ | PIPE_BUFFER_FLAG_WRITE, 0);
@@ -317,7 +315,7 @@ static void upload_pipe_control(struct brw_context *brw)
 
 const struct brw_tracked_state brw_pipe_control = {
    .dirty = {
-      .brw = BRW_NEW_CONTEXT,
+      .brw = BRW_NEW_SCENE,
       .cache = 0
    },
    .update = upload_pipe_control
@@ -382,7 +380,7 @@ static void upload_invarient_state( struct brw_context *brw )
 
 const struct brw_tracked_state brw_invarient_state = {
    .dirty = {
-      .brw = BRW_NEW_CONTEXT,
+      .brw = BRW_NEW_SCENE,
       .cache = 0
    },
    .update = upload_invarient_state
@@ -418,7 +416,7 @@ static void upload_state_base_address( struct brw_context *brw )
 
 const struct brw_tracked_state brw_state_base_address = {
    .dirty = {
-      .brw = BRW_NEW_CONTEXT,
+      .brw = BRW_NEW_SCENE,
       .cache = 0
    },
    .update = upload_state_base_address
diff --git a/src/mesa/pipe/i965simple/brw_sf.c b/src/mesa/pipe/i965simple/brw_sf.c
index f009ff37d9..e7c02beda5 100644
--- a/src/mesa/pipe/i965simple/brw_sf.c
+++ b/src/mesa/pipe/i965simple/brw_sf.c
@@ -36,9 +36,8 @@
 #include "brw_util.h"
 #include "brw_sf.h"
 #include "brw_state.h"
+#include "tgsi/util/tgsi_parse.h"
 
-#if 0
-#define DO_SETUP_BITS ((1<<(FRAG_ATTRIB_MAX)) - 1)
 
 static void compile_sf_prog( struct brw_context *brw,
 			     struct brw_sf_prog_key *key )
@@ -46,7 +45,6 @@ static void compile_sf_prog( struct brw_context *brw,
    struct brw_sf_compile c;
    const unsigned *program;
    unsigned program_size;
-   unsigned i, idx;
 
    memset(&c, 0, sizeof(c));
 
@@ -55,27 +53,17 @@ static void compile_sf_prog( struct brw_context *brw,
    brw_init_compile(&c.func);
 
    c.key = *key;
-   c.nr_attrs = brw_count_bits(c.key.attrs);
+
+
+   c.nr_attrs = c.key.vp_output_count;
    c.nr_attr_regs = (c.nr_attrs+1)/2;
-   c.nr_setup_attrs = brw_count_bits(c.key.attrs & DO_SETUP_BITS);
+
+   c.nr_setup_attrs = c.key.fp_input_count;
    c.nr_setup_regs = (c.nr_setup_attrs+1)/2;
 
    c.prog_data.urb_read_length = c.nr_attr_regs;
    c.prog_data.urb_entry_size = c.nr_setup_regs * 2;
 
-   /* Construct map from attribute number to position in the vertex.
-    */
-   for (i = idx = 0; i < VERT_RESULT_MAX; i++)
-      if (c.key.attrs & (1<<i)) {
-	 c.attr_to_idx[i] = idx;
-	 c.idx_to_attr[idx] = i;
-	 if (i >= VERT_RESULT_TEX0 && i <= VERT_RESULT_TEX7) {
-		 c.point_attrs[i].CoordReplace =
-			brw->attribs.Point->CoordReplace[i - VERT_RESULT_TEX0];
-	 } else
-		 c.point_attrs[i].CoordReplace = FALSE;
-	 idx++;
-      }
 
    /* Which primitive?  Or all three?
     */
@@ -90,21 +78,17 @@ static void compile_sf_prog( struct brw_context *brw,
       break;
    case SF_POINTS:
       c.nr_verts = 1;
-      if (key->do_point_sprite)
-	  brw_emit_point_sprite_setup( &c );
-      else
-	  brw_emit_point_setup( &c );
+      brw_emit_point_setup( &c );
       break;
+
    case SF_UNFILLED_TRIS:
-      c.nr_verts = 3;
-      brw_emit_anyprim_setup( &c );
-      break;
    default:
       assert(0);
       return;
    }
 
 
+
    /* get the program
     */
    program = brw_get_program(&c.func, &program_size);
@@ -142,20 +126,15 @@ static void upload_sf_prog( struct brw_context *brw )
    /* Populate the key, noting state dependencies:
     */
    /* CACHE_NEW_VS_PROG */
-   key.attrs = brw->vs.prog_data->outputs_written;
+   key.vp_output_count = brw->vs.prog_data->outputs_written;
 
    /* BRW_NEW_REDUCED_PRIMITIVE */
    switch (brw->reduced_primitive) {
    case PIPE_PRIM_TRIANGLES:
-      /* NOTE: We just use the edgeflag attribute as an indicator that
-       * unfilled triangles are active.  We don't actually do the
-       * edgeflag testing here, it is already done in the clip
-       * program.
-       */
-      if (key.attrs & (1<<VERT_RESULT_EDGE))
-	 key.primitive = SF_UNFILLED_TRIS;
-      else
-	 key.primitive = SF_TRIANGLES;
+//      if (key.attrs & (1<<VERT_RESULT_EDGE))
+//	 key.primitive = SF_UNFILLED_TRIS;
+//      else
+      key.primitive = SF_TRIANGLES;
       break;
    case PIPE_PRIM_LINES:
       key.primitive = SF_LINES;
@@ -165,16 +144,15 @@ static void upload_sf_prog( struct brw_context *brw )
       break;
    }
 
-   /* BRW_NEW_POINT */
-   key.do_point_sprite = brw->attribs.Point->PointSprite;
-   key.SpriteOrigin = brw->attribs.Point->SpriteOrigin;
-   /* BRW_NEW_RASTER */
-   key.do_flat_shading = (brw->attribs.Raster->flatshade);
-   key.do_twoside_color = (brw->attribs.Light->Enabled && brw->attribs.Light->Model.TwoSide);
 
-   /* _NEW_POLYGON */
-   if (key.do_twoside_color)
-      key.frontface_ccw = (brw->attribs.Polygon->FrontFace == GL_CCW);
+//   key.do_point_sprite = brw->attribs.Point->PointSprite;
+//   key.SpriteOrigin = brw->attribs.Point->SpriteOrigin;
+
+//   key.do_flat_shading = (brw->attribs.Raster->flatshade);
+//   key.do_twoside_color = (brw->attribs.Light->Enabled && brw->attribs.Light->Model.TwoSide);
+
+//   if (key.do_twoside_color)
+//      key.frontface_ccw = (brw->attribs.Polygon->FrontFace == GL_CCW);
 
 
    if (!search_cache(brw, &key))
@@ -184,11 +162,150 @@ static void upload_sf_prog( struct brw_context *brw )
 
 const struct brw_tracked_state brw_sf_prog = {
    .dirty = {
-      .brw   = (BRW_NEW_RASTER |
-		BRW_NEW_REDUCED_PRIMITIVE),
-      .cache = CACHE_NEW_VS_PROG
+      .brw   = (BRW_NEW_RASTERIZER |
+		BRW_NEW_REDUCED_PRIMITIVE |
+		BRW_NEW_VS |
+		BRW_NEW_FS),
+      .cache = 0,
    },
    .update = upload_sf_prog
 };
 
-#endif
+
+/* Build a struct like the one we'd like the state tracker to pass to
+ * us: for each fragment program input, the index of the vertex program
+ * output (and BCOLOR output, for colours) carrying the same semantic.
+ * Raises BRW_NEW_SF_LINKAGE when the result differs from brw->sf.linkage.
+ */
+static void update_sf_linkage( struct brw_context *brw )
+{
+   const struct brw_vertex_program *vs = brw->attribs.VertexProgram;
+   const struct brw_fragment_program *fs = brw->attribs.FragmentProgram;
+   struct pipe_setup_linkage state;
+   struct tgsi_parse_context parse;
+
+   int i, j;
+   int nr_vp_outputs = 0;
+   int done = 0;
+
+   struct { 
+      unsigned semantic:8;
+      unsigned semantic_index:16;
+   } fp_semantic[32], vp_semantic[32];
+
+   memset(&state, 0, sizeof(state));
+   /* Slots no declaration covers must not be read as stack garbage: */
+   memset(fp_semantic, 0, sizeof(fp_semantic));
+   memset(vp_semantic, 0, sizeof(vp_semantic));
+
+   /* First scan fp inputs
+    */
+   tgsi_parse_init( &parse, fs->program.tokens );
+   while( !done &&
+	  !tgsi_parse_end_of_tokens( &parse ) ) 
+   {
+      tgsi_parse_token( &parse );
+
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+	 if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_INPUT) 
+	 {
+	    int first = parse.FullToken.FullDeclaration.u.DeclarationRange.First;
+	    int last = parse.FullToken.FullDeclaration.u.DeclarationRange.Last;
+	    /* NOTE(review): Last is used as an exclusive bound - confirm. */
+	    for (i = first; i < last; i++) {
+	       state.fp_input[i].vp_output = ~0;
+	       state.fp_input[i].bf_vp_output = ~0;
+	       state.fp_input[i].interp_mode = 
+		  parse.FullToken.FullDeclaration.Interpolation.Interpolate;
+
+	       fp_semantic[i].semantic = 
+		  parse.FullToken.FullDeclaration.Semantic.SemanticName;
+	       fp_semantic[i].semantic_index = 
+		  parse.FullToken.FullDeclaration.Semantic.SemanticIndex;
+
+	    }
+
+	    assert(last > state.fp_input_count);
+	    state.fp_input_count = last;
+	 }
+	 break;
+      default:
+	 done = 1;
+	 break;
+      }
+   }
+
+
+   assert(state.fp_input_count == fs->program.num_inputs);
+
+      
+   /* Then scan vp outputs: these are the TGSI_FILE_OUTPUT declarations
+    * of the vertex program, not its inputs. */
+   done = 0;
+   tgsi_parse_init( &parse, vs->program.tokens );
+   while( !done &&
+	  !tgsi_parse_end_of_tokens( &parse ) ) 
+   {
+      tgsi_parse_token( &parse );
+
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+	 if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) 
+	 {
+	    int first = parse.FullToken.FullDeclaration.u.DeclarationRange.First;
+	    int last = parse.FullToken.FullDeclaration.u.DeclarationRange.Last;
+
+	    for (i = first; i < last; i++) {
+	       vp_semantic[i].semantic = 
+		  parse.FullToken.FullDeclaration.Semantic.SemanticName;
+	       vp_semantic[i].semantic_index = 
+		  parse.FullToken.FullDeclaration.Semantic.SemanticIndex;
+	    }
+	    
+	    assert(last > nr_vp_outputs);
+	    nr_vp_outputs = last;
+	 }
+	 break;
+      default:
+	 done = 1;
+	 break;
+      }
+   }
+
+   /* Now match based on semantic information.  Inputs with no match
+    * keep the ~0 marker written during the fp scan.
+    */
+   for (i = 0; i< state.fp_input_count; i++) {
+      for (j = 0; j < nr_vp_outputs; j++) {
+	 if (fp_semantic[i].semantic == vp_semantic[j].semantic &&
+	     fp_semantic[i].semantic_index == vp_semantic[j].semantic_index) {
+	    state.fp_input[i].vp_output = j;
+	 }
+      }
+      if (fp_semantic[i].semantic == TGSI_SEMANTIC_COLOR) {
+	 for (j = 0; j < nr_vp_outputs; j++) {
+	    if (TGSI_SEMANTIC_BCOLOR == vp_semantic[j].semantic &&
+		fp_semantic[i].semantic_index == vp_semantic[j].semantic_index) {
+	       state.fp_input[i].bf_vp_output = j;
+	    }
+	 }
+      }
+   }
+
+   if (memcmp(&brw->sf.linkage, &state, sizeof(state)) != 0) {
+      brw->sf.linkage = state;
+      brw->state.dirty.brw |= BRW_NEW_SF_LINKAGE;
+   }
+}
+
+/* Atom for update_sf_linkage; dirty bits are BRW_NEW_VS and BRW_NEW_FS. */
+const struct brw_tracked_state brw_sf_linkage = {
+   .dirty = {
+      .brw   = (BRW_NEW_VS |
+		BRW_NEW_FS),
+      .cache = 0,
+   },
+   .update = update_sf_linkage
+};
+
diff --git a/src/mesa/pipe/i965simple/brw_sf.h b/src/mesa/pipe/i965simple/brw_sf.h
index d04388325d..b7ada47560 100644
--- a/src/mesa/pipe/i965simple/brw_sf.h
+++ b/src/mesa/pipe/i965simple/brw_sf.h
@@ -42,15 +42,26 @@
 #define SF_TRIANGLES 2
 #define SF_UNFILLED_TRIS   3
 
+
+
 struct brw_sf_prog_key {
-   unsigned attrs:32;
+   unsigned vp_output_count:5;
+   unsigned fp_input_count:5;
+   /* NOTE(review): explicit pad bits were dropped; confirm keys are zeroed before any bytewise compare. */
    unsigned primitive:2;
    unsigned do_twoside_color:1;
    unsigned do_flat_shading:1;
    unsigned frontface_ccw:1;
    unsigned do_point_sprite:1;
-   unsigned pad:10;
-   int SpriteOrigin;
+
+   /* Interpolation masks.
+    */
+   unsigned linear_mask;
+   unsigned persp_mask;
+   unsigned const_mask;
+
+   /* SpriteOrigin is disabled; the point-sprite setup that read it is deleted from brw_sf_emit.c by this change. */
+//   int SpriteOrigin;
 };
 
 struct brw_sf_point_tex {
diff --git a/src/mesa/pipe/i965simple/brw_sf_emit.c b/src/mesa/pipe/i965simple/brw_sf_emit.c
index 93f23171f2..834b5efdfe 100644
--- a/src/mesa/pipe/i965simple/brw_sf_emit.c
+++ b/src/mesa/pipe/i965simple/brw_sf_emit.c
@@ -36,171 +36,6 @@
 #include "brw_util.h"
 #include "brw_sf.h"
 
-#if 0
-static struct brw_reg get_vert_attr(struct brw_sf_compile *c,
-				    struct brw_reg vert,
-				    unsigned attr)
-{
-   unsigned off = c->attr_to_idx[attr] / 2;
-   unsigned sub = c->attr_to_idx[attr] % 2;
-
-   return brw_vec4_grf(vert.nr + off, sub * 4);
-}
-
-static boolean have_attr(struct brw_sf_compile *c,
-			   unsigned attr)
-{
-   return (c->key.attrs & (1<<attr)) ? 1 : 0;
-}
-
-
-
-/***********************************************************************
- * Twoside lighting
- */
-static void copy_bfc( struct brw_sf_compile *c,
-		      struct brw_reg vert )
-{
-   struct brw_compile *p = &c->func;
-   unsigned i;
-
-   for (i = 0; i < 2; i++) {
-      if (have_attr(c, VERT_RESULT_COL0+i) &&
-	  have_attr(c, VERT_RESULT_BFC0+i))
-	 brw_MOV(p,
-		 get_vert_attr(c, vert, VERT_RESULT_COL0+i),
-		 get_vert_attr(c, vert, VERT_RESULT_BFC0+i));
-   }
-}
-
-
-static void do_twoside_color( struct brw_sf_compile *c )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_instruction *if_insn;
-   unsigned backface_conditional = c->key.frontface_ccw ? BRW_CONDITIONAL_G : BRW_CONDITIONAL_L;
-
-   /* Already done in clip program:
-    */
-   if (c->key.primitive == SF_UNFILLED_TRIS)
-      return;
-
-   /* XXX: What happens if BFC isn't present?  This could only happen
-    * for user-supplied vertex programs, as t_vp_build.c always does
-    * the right thing.
-    */
-   if (!(have_attr(c, VERT_RESULT_COL0) && have_attr(c, VERT_RESULT_BFC0)) &&
-       !(have_attr(c, VERT_RESULT_COL1) && have_attr(c, VERT_RESULT_BFC1)))
-      return;
-
-   /* Need to use BRW_EXECUTE_4 and also do an 4-wide compare in order
-    * to get all channels active inside the IF.  In the clipping code
-    * we run with NoMask, so it's not an option and we can use
-    * BRW_EXECUTE_1 for all comparisions.
-    */
-   brw_push_insn_state(p);
-   brw_CMP(p, vec4(brw_null_reg()), backface_conditional, c->det, brw_imm_f(0));
-   if_insn = brw_IF(p, BRW_EXECUTE_4);
-   {
-      switch (c->nr_verts) {
-      case 3: copy_bfc(c, c->vert[2]);
-      case 2: copy_bfc(c, c->vert[1]);
-      case 1: copy_bfc(c, c->vert[0]);
-      }
-   }
-   brw_ENDIF(p, if_insn);
-   brw_pop_insn_state(p);
-}
-
-
-
-/***********************************************************************
- * Flat shading
- */
-
-#define VERT_RESULT_COLOR_BITS ((1<<VERT_RESULT_COL0) | \
-                                 (1<<VERT_RESULT_COL1))
-
-static void copy_colors( struct brw_sf_compile *c,
-		     struct brw_reg dst,
-		     struct brw_reg src)
-{
-   struct brw_compile *p = &c->func;
-   unsigned i;
-
-   for (i = VERT_RESULT_COL0; i <= VERT_RESULT_COL1; i++) {
-      if (have_attr(c,i))
-	 brw_MOV(p,
-		 get_vert_attr(c, dst, i),
-		 get_vert_attr(c, src, i));
-   }
-}
-
-
-
-/* Need to use a computed jump to copy flatshaded attributes as the
- * vertices are ordered according to y-coordinate before reaching this
- * point, so the PV could be anywhere.
- */
-static void do_flatshade_triangle( struct brw_sf_compile *c )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_reg ip = brw_ip_reg();
-   unsigned nr = brw_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
-   if (!nr)
-      return;
-
-   /* Already done in clip program:
-    */
-   if (c->key.primitive == SF_UNFILLED_TRIS)
-      return;
-
-   brw_push_insn_state(p);
-
-   brw_MUL(p, c->pv, c->pv, brw_imm_ud(nr*2+1));
-   brw_JMPI(p, ip, ip, c->pv);
-
-   copy_colors(c, c->vert[1], c->vert[0]);
-   copy_colors(c, c->vert[2], c->vert[0]);
-   brw_JMPI(p, ip, ip, brw_imm_ud(nr*4+1));
-
-   copy_colors(c, c->vert[0], c->vert[1]);
-   copy_colors(c, c->vert[2], c->vert[1]);
-   brw_JMPI(p, ip, ip, brw_imm_ud(nr*2));
-
-   copy_colors(c, c->vert[0], c->vert[2]);
-   copy_colors(c, c->vert[1], c->vert[2]);
-
-   brw_pop_insn_state(p);
-}
-
-
-static void do_flatshade_line( struct brw_sf_compile *c )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_reg ip = brw_ip_reg();
-   unsigned nr = brw_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
-
-   if (!nr)
-      return;
-
-   /* Already done in clip program:
-    */
-   if (c->key.primitive == SF_UNFILLED_TRIS)
-      return;
-
-   brw_push_insn_state(p);
-
-   brw_MUL(p, c->pv, c->pv, brw_imm_ud(nr+1));
-   brw_JMPI(p, ip, ip, c->pv);
-   copy_colors(c, c->vert[1], c->vert[0]);
-
-   brw_JMPI(p, ip, ip, brw_imm_ud(nr));
-   copy_colors(c, c->vert[0], c->vert[1]);
-
-   brw_pop_insn_state(p);
-}
-
 
 
 /***********************************************************************
@@ -277,9 +112,6 @@ static void copy_z_inv_w( struct brw_sf_compile *c )
 
 static void invert_det( struct brw_sf_compile *c)
 {
-   /* Looks like we invert all 8 elements just to get 1/det in
-    * position 2 !?!
-    */
    brw_math(&c->func,
 	    c->inv_det,
 	    BRW_MATH_FUNCTION_INV,
@@ -302,22 +134,16 @@ static boolean calculate_masks( struct brw_sf_compile *c,
 				  ushort *pc_linear)
 {
    boolean is_last_attr = (reg == c->nr_setup_regs - 1);
-   unsigned persp_mask = c->key.attrs & ~NON_PERPECTIVE_ATTRS;
-   unsigned linear_mask;
 
-   if (c->key.do_flat_shading)
-      linear_mask = c->key.attrs & ~(FRAG_BIT_COL0|FRAG_BIT_COL1);
-   else
-      linear_mask = c->key.attrs;
 
    *pc_persp = 0;
    *pc_linear = 0;
    *pc = 0xf;
 
-   if (persp_mask & (1 << c->idx_to_attr[reg*2]))
-      *pc_persp = 0xf;
+//   if (persp_mask & (1 << c->idx_to_attr[reg*2]))
+//      *pc_persp = 0xf;
 
-   if (linear_mask & (1 << c->idx_to_attr[reg*2]))
+//   if (linear_mask & (1 << c->idx_to_attr[reg*2]))
       *pc_linear = 0xf;
 
    /* Maybe only processs one attribute on the final round:
@@ -325,10 +151,10 @@ static boolean calculate_masks( struct brw_sf_compile *c,
    if (reg*2+1 < c->nr_setup_attrs) {
       *pc |= 0xf0;
 
-      if (persp_mask & (1 << c->idx_to_attr[reg*2+1]))
-	 *pc_persp |= 0xf0;
+//      if (persp_mask & (1 << c->idx_to_attr[reg*2+1]))
+//	 *pc_persp |= 0xf0;
 
-      if (linear_mask & (1 << c->idx_to_attr[reg*2+1]))
+//      if (linear_mask & (1 << c->idx_to_attr[reg*2+1]))
 	 *pc_linear |= 0xf0;
    }
 
@@ -347,12 +173,6 @@ void brw_emit_tri_setup( struct brw_sf_compile *c )
    invert_det(c);
    copy_z_inv_w(c);
 
-   if (c->key.do_twoside_color)
-      do_twoside_color(c);
-
-   if (c->key.do_flat_shading)
-      do_flatshade_triangle(c);
-
 
    for (i = 0; i < c->nr_setup_regs; i++)
    {
@@ -433,9 +253,6 @@ void brw_emit_line_setup( struct brw_sf_compile *c )
    invert_det(c);
    copy_z_inv_w(c);
 
-   if (c->key.do_flat_shading)
-      do_flatshade_line(c);
-
    for (i = 0; i < c->nr_setup_regs; i++)
    {
       /* Pair of incoming attributes:
@@ -491,86 +308,6 @@ void brw_emit_line_setup( struct brw_sf_compile *c )
    }
 }
 
-void brw_emit_point_sprite_setup( struct brw_sf_compile *c )
-{
-   struct brw_compile *p = &c->func;
-   unsigned i;
-
-   c->nr_verts = 1;
-   alloc_regs(c);
-   copy_z_inv_w(c);
-   for (i = 0; i < c->nr_setup_regs; i++)
-   {
-      struct brw_sf_point_tex *tex = &c->point_attrs[c->idx_to_attr[2*i]];
-      struct brw_reg a0 = offset(c->vert[0], i);
-      ushort pc, pc_persp, pc_linear;
-      boolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
-
-      if (pc_persp)
-      {
-	  if (!tex->CoordReplace) {
-	      brw_set_predicate_control_flag_value(p, pc_persp);
-	      brw_MUL(p, a0, a0, c->inv_w[0]);
-	  }
-      }
-
-      if (tex->CoordReplace) {
-	  /* Caculate 1.0/PointWidth */
-	  brw_math(&c->func,
-		  c->tmp,
-		  BRW_MATH_FUNCTION_INV,
-		  BRW_MATH_SATURATE_NONE,
-		  0,
-		  c->dx0,
-		  BRW_MATH_DATA_SCALAR,
-		  BRW_MATH_PRECISION_FULL);
-
-	  if (c->key.SpriteOrigin == GL_UPPER_LEFT) {
-	   	brw_MUL(p, c->m1Cx, c->tmp, c->inv_w[0]);
-		brw_MOV(p, vec1(suboffset(c->m1Cx, 1)), brw_imm_f(0.0));
-	  	brw_MUL(p, c->m2Cy, c->tmp, negate(c->inv_w[0]));
-		brw_MOV(p, vec1(suboffset(c->m2Cy, 0)), brw_imm_f(0.0));
-	  } else {
-	   	brw_MUL(p, c->m1Cx, c->tmp, c->inv_w[0]);
-		brw_MOV(p, vec1(suboffset(c->m1Cx, 1)), brw_imm_f(0.0));
-	  	brw_MUL(p, c->m2Cy, c->tmp, c->inv_w[0]);
-		brw_MOV(p, vec1(suboffset(c->m2Cy, 0)), brw_imm_f(0.0));
-	  }
-      } else {
-	  brw_MOV(p, c->m1Cx, brw_imm_ud(0));
-	  brw_MOV(p, c->m2Cy, brw_imm_ud(0));
-      }
-
-      {
-	 brw_set_predicate_control_flag_value(p, pc);
-	 if (tex->CoordReplace) {
-	     if (c->key.SpriteOrigin == GL_UPPER_LEFT) {
-		 brw_MUL(p, c->m3C0, c->inv_w[0], brw_imm_f(1.0));
-		 brw_MOV(p, vec1(suboffset(c->m3C0, 0)), brw_imm_f(0.0));
-	     }
-	     else
-		 brw_MOV(p, c->m3C0, brw_imm_f(0.0));
-	 } else {
-	 	brw_MOV(p, c->m3C0, a0); /* constant value */
-	 }
-
-	 /* Copy m0..m3 to URB.
-	  */
-	 brw_urb_WRITE(p,
-		       brw_null_reg(),
-		       0,
-		       brw_vec8_grf(0, 0),
-		       0, 	/* allocate */
-		       1,	/* used */
-		       4, 	/* msg len */
-		       0,	/* response len */
-		       last, 	/* eot */
-		       last, 	/* writes complete */
-		       i*4,	/* urb destination offset */
-		       BRW_URB_SWIZZLE_TRANSPOSE);
-      }
-   }
-}
 
 /* Points setup - several simplifications as all attributes are
  * constant across the face of the point (point sprites excluded!)
@@ -629,68 +366,3 @@ void brw_emit_point_setup( struct brw_sf_compile *c )
       }
    }
 }
-
-void brw_emit_anyprim_setup( struct brw_sf_compile *c )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_reg ip = brw_ip_reg();
-   struct brw_reg payload_prim = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0);
-   struct brw_reg payload_attr = get_element_ud(brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0), 0);
-   struct brw_reg primmask;
-   struct brw_instruction *jmp;
-   struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
-
-   alloc_regs(c);
-
-   primmask = retype(get_element(c->tmp, 0), BRW_REGISTER_TYPE_UD);
-
-   brw_MOV(p, primmask, brw_imm_ud(1));
-   brw_SHL(p, primmask, primmask, payload_prim);
-
-   brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
-   brw_AND(p, v1_null_ud, primmask, brw_imm_ud((1<<_3DPRIM_TRILIST) |
-					       (1<<_3DPRIM_TRISTRIP) |
-					       (1<<_3DPRIM_TRIFAN) |
-					       (1<<_3DPRIM_TRISTRIP_REVERSE) |
-					       (1<<_3DPRIM_POLYGON) |
-					       (1<<_3DPRIM_RECTLIST) |
-					       (1<<_3DPRIM_TRIFAN_NOSTIPPLE)));
-   jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
-   {
-      brw_push_insn_state(p);
-      brw_emit_tri_setup( c );
-      brw_pop_insn_state(p);
-      /* note - thread killed in subroutine */
-   }
-   brw_land_fwd_jump(p, jmp);
-
-   brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
-   brw_AND(p, v1_null_ud, primmask, brw_imm_ud((1<<_3DPRIM_LINELIST) |
-					       (1<<_3DPRIM_LINESTRIP) |
-					       (1<<_3DPRIM_LINELOOP) |
-					       (1<<_3DPRIM_LINESTRIP_CONT) |
-					       (1<<_3DPRIM_LINESTRIP_BF) |
-					       (1<<_3DPRIM_LINESTRIP_CONT_BF)));
-   jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
-   {
-      brw_push_insn_state(p);
-      brw_emit_line_setup( c );
-      brw_pop_insn_state(p);
-      /* note - thread killed in subroutine */
-   }
-   brw_land_fwd_jump(p, jmp);
-
-   brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
-   brw_AND(p, v1_null_ud, payload_attr, brw_imm_ud(1<<BRW_SPRITE_POINT_ENABLE));
-   jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
-   {
-      brw_push_insn_state(p);
-      brw_emit_point_sprite_setup( c );
-      brw_pop_insn_state(p);
-   }
-   brw_land_fwd_jump(p, jmp);
-
-   brw_emit_point_setup( c );
-}
-
-#endif
diff --git a/src/mesa/pipe/i965simple/brw_sf_state.c b/src/mesa/pipe/i965simple/brw_sf_state.c
index 7b6ee215eb..0de6e7240e 100644
--- a/src/mesa/pipe/i965simple/brw_sf_state.c
+++ b/src/mesa/pipe/i965simple/brw_sf_state.c
@@ -34,91 +34,41 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
+#include "pipe/p_util.h"
 
-#if 0
 static void upload_sf_vp(struct brw_context *brw)
 {
-   GLcontext *ctx = &brw->intel.ctx;
    struct brw_sf_viewport sfv;
-   struct intel_renderbuffer *irb =
-      intel_renderbuffer(ctx->DrawBuffer->_ColorDrawBuffers[0][0]);
-   float y_scale, y_bias;
-   int x, y, w, h, x1, x2, y1, y2;
-   int draw_h = ctx->DrawBuffer->Height;
 
    memset(&sfv, 0, sizeof(sfv));
 
-   if (ctx->DrawBuffer->Name) {
-      /* User-created FBO */
-      if (irb && !irb->RenderToTexture) {
-	 y_scale = -1.0;
-	 y_bias = draw_h;
-      } else {
-	 y_scale = 1.0;
-	 y_bias = 0;
-      }
-   } else {
-      if (brw->intel.driDrawable != NULL) {
-	 y_scale = -1.0;
-	 y_bias = draw_h;
-      } else {
-	 y_scale = 1.0;
-	 y_bias = 0;
-      }
-   }
-
-   /* _NEW_VIEWPORT, BRW_NEW_METAOPS */
 
-   if (!brw->metaops.active) {
-      const float *v = brw->intel.ctx.Viewport._WindowMap.m;
+   /* BRW_NEW_VIEWPORT: copy the viewport scale and translate into the SF viewport matrix */
+   {
+      const float *scale = brw->attribs.Viewport.scale;
+      const float *trans = brw->attribs.Viewport.translate;
 
-      sfv.viewport.m00 = v[MAT_SX];
-      sfv.viewport.m11 = v[MAT_SY] * y_scale;
-      sfv.viewport.m22 = v[MAT_SZ] * brw->intel.depth_scale;
-      sfv.viewport.m30 = v[MAT_TX];
-      sfv.viewport.m31 = v[MAT_TY] * y_scale + y_bias;
-      sfv.viewport.m32 = v[MAT_TZ] * brw->intel.depth_scale;
-   } else {
-      sfv.viewport.m00 =   1;
-      sfv.viewport.m11 = - 1;
-      sfv.viewport.m22 =   1;
-      sfv.viewport.m30 =   0;
-      sfv.viewport.m31 =   brw->intel.driDrawable->h;
-      sfv.viewport.m32 =   0;
+      sfv.viewport.m00 = scale[0];
+      sfv.viewport.m11 = scale[1];
+      sfv.viewport.m22 = scale[2]; 
+      sfv.viewport.m30 = trans[0];
+      sfv.viewport.m31 = trans[1];
+      sfv.viewport.m32 = trans[2];
    }
 
    /* _NEW_SCISSOR */
-   x = brw->attribs.Scissor->X;
-   y = brw->attribs.Scissor->Y;
-   w = brw->attribs.Scissor->Width;
-   h = brw->attribs.Scissor->Height;
-
-   if (ctx->DrawBuffer->Name == 0) {
-      x1 = x;
-      y1 = draw_h - (y + h);
-      x2 = x + w - 1;
-      y2 = y1 + h - 1;
-   } else {
-      /* FBO has non-inverted coords. */
-      x1 = x;
-      y1 = y;
-      x2 = x + w - 1;
-      y2 = y + h - 1;
-   }
-
-   sfv.scissor.xmin = CLAMP(x1, 0, ctx->DrawBuffer->Width - 1);
-   sfv.scissor.xmax = CLAMP(y1, 0, ctx->DrawBuffer->Height - 1);
-   sfv.scissor.ymin = CLAMP(x2, 0, ctx->DrawBuffer->Width - 1);
-   sfv.scissor.ymax = CLAMP(y2, 0, ctx->DrawBuffer->Height - 1);
+   sfv.scissor.xmin = brw->attribs.Scissor.minx;
+   sfv.scissor.xmax = brw->attribs.Scissor.maxx;   /* NOTE(review): removed code computed inclusive maxima (x + w - 1); confirm maxx/maxy use the same convention */
+   sfv.scissor.ymin = brw->attribs.Scissor.miny;
+   sfv.scissor.ymax = brw->attribs.Scissor.maxy;
 
    brw->sf.vp_gs_offset = brw_cache_data( &brw->cache[BRW_SF_VP], &sfv );
 }
 
 const struct brw_tracked_state brw_sf_vp = {
    .dirty = {
-      .mesa  = (_NEW_VIEWPORT |
-		_NEW_SCISSOR),
-      .brw   = BRW_NEW_METAOPS,
+      .brw   = (BRW_NEW_SCISSOR |
+		BRW_NEW_VIEWPORT),
       .cache = 0
    },
    .update = upload_sf_vp
@@ -130,7 +80,7 @@ static void upload_sf_unit( struct brw_context *brw )
    memset(&sf, 0, sizeof(sf));
 
    /* CACHE_NEW_SF_PROG */
-   sf.thread0.grf_reg_count = ALIGN(brw->sf.prog_data->total_grf, 16) / 16 - 1;
+   sf.thread0.grf_reg_count = align(brw->sf.prog_data->total_grf, 16) / 16 - 1;
    sf.thread0.kernel_start_pointer = brw->sf.prog_gs_offset >> 6;
    sf.thread3.urb_entry_read_length = brw->sf.prog_data->urb_read_length;
 
@@ -151,19 +101,19 @@ static void upload_sf_unit( struct brw_context *brw )
 
    /* CACHE_NEW_SF_VP */
    sf.sf5.sf_viewport_state_offset = brw->sf.vp_gs_offset >> 5;
-
    sf.sf5.viewport_transform = 1;
 
-   /* _NEW_SCISSOR */
-   if (brw->attribs.Scissor->Enabled)
+   /* BRW_NEW_RASTERIZER */
+   if (brw->attribs.Raster->scissor)
       sf.sf6.scissor = 1;
 
-   /* _NEW_POLYGON */
+#if 0
    if (brw->attribs.Polygon->FrontFace == GL_CCW)
       sf.sf5.front_winding = BRW_FRONTWINDING_CCW;
    else
       sf.sf5.front_winding = BRW_FRONTWINDING_CW;
 
+
    if (brw->attribs.Polygon->CullFlag) {
       switch (brw->attribs.Polygon->CullFaceMode) {
       case GL_FRONT:
@@ -182,25 +132,24 @@ static void upload_sf_unit( struct brw_context *brw )
    }
    else
       sf.sf6.cull_mode = BRW_CULLMODE_NONE;
+#else
+   sf.sf5.front_winding = BRW_FRONTWINDING_CW;
+   sf.sf6.cull_mode = BRW_CULLMODE_NONE;
+#endif
 
-
-   /* _NEW_LINE */
-   /* XXX use ctx->Const.Min/MaxLineWidth here */
-   sf.sf6.line_width = CLAMP(brw->attribs.Line->Width, 1.0, 5.0) * (1<<1);
+   sf.sf6.line_width = CLAMP(brw->attribs.Raster->line_width, 1.0, 5.0) * (1<<1);
 
    sf.sf6.line_endcap_aa_region_width = 1;
-   if (brw->attribs.Line->SmoothFlag)
+   if (brw->attribs.Raster->line_smooth)
       sf.sf6.aa_enable = 1;
    else if (sf.sf6.line_width <= 0x2)
        sf.sf6.line_width = 0;
 
-   /* _NEW_POINT */
    sf.sf6.point_rast_rule = 1;	/* opengl conventions */
-   /* XXX clamp max depends on AA vs. non-AA */
 
-   sf.sf7.sprite_point = brw->attribs.Point->PointSprite;
-   sf.sf7.point_size = CLAMP(brw->attribs.Point->Size, 1.0, 255.0) * (1<<3);
-   sf.sf7.use_point_size_state = !brw->attribs.Point->_Attenuated;
+   sf.sf7.sprite_point = brw->attribs.Raster->point_sprite;
+   sf.sf7.point_size = CLAMP(brw->attribs.Raster->point_size, 1.0, 255.0) * (1<<3);   /* was line_width: copy-paste slip */
+   sf.sf7.use_point_size_state = !brw->attribs.Raster->point_size_per_vertex;   /* negated, as the removed !_Attenuated was: state size applies only when no per-vertex size is supplied */
 
    /* might be BRW_NEW_PRIMITIVE if we have to adjust pv for polygons:
     */
@@ -220,12 +169,8 @@ static void upload_sf_unit( struct brw_context *brw )
 
 const struct brw_tracked_state brw_sf_unit = {
    .dirty = {
-      .mesa  = (_NEW_POLYGON |
-		_NEW_LINE |
-		_NEW_POINT |
-		_NEW_SCISSOR),
-      .brw   = (BRW_NEW_URB_FENCE |
-		BRW_NEW_METAOPS),
+      .brw   = (BRW_NEW_RASTERIZER |
+		BRW_NEW_URB_FENCE),
       .cache = (CACHE_NEW_SF_VP |
 		CACHE_NEW_SF_PROG)
    },
@@ -233,4 +178,3 @@ const struct brw_tracked_state brw_sf_unit = {
 };
 
 
-#endif
diff --git a/src/mesa/pipe/i965simple/brw_shader_info.c b/src/mesa/pipe/i965simple/brw_shader_info.c
new file mode 100644
index 0000000000..431b45466a
--- /dev/null
+++ b/src/mesa/pipe/i965simple/brw_shader_info.c
@@ -0,0 +1,49 @@
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "pipe/tgsi/util/tgsi_parse.h"
+
+
+
+
+/**
+ * Scan the leading declaration tokens of a shader and record, for each
+ * register file, one more than the highest register index declared.
+ * Scanning stops at the first token that is not a declaration.
+ *
+ * \param tokens  TGSI token stream to scan
+ * \param info    nr_regs[] is only ever raised here, never reset
+ */
+void brw_shader_info(const struct tgsi_token *tokens,
+		     struct brw_shader_info *info )
+{
+   struct tgsi_parse_context ctx;
+
+   tgsi_parse_init( &ctx, tokens );
+
+   while( !tgsi_parse_end_of_tokens( &ctx ) )
+   {
+      const struct tgsi_full_declaration *decl;
+      unsigned file, count;
+
+      tgsi_parse_token( &ctx );
+
+      /* Immediates, instructions and anything else end the scan. */
+      if (ctx.FullToken.Token.Type != TGSI_TOKEN_TYPE_DECLARATION)
+         break;
+
+      decl = &ctx.FullToken.FullDeclaration;
+      assert( decl->Declaration.Declare == TGSI_DECLARE_RANGE );
+
+      /* No assert that nr_regs[file] <= Last for each new declaration:
+       * the original note says that check is broken by the wpos init.
+       */
+      file = decl->Declaration.File;
+      count = decl->u.DeclarationRange.Last + 1;
+      info->nr_regs[file] = MAX2(info->nr_regs[file], count);
+   }
+
+   tgsi_parse_free( &ctx );
+}
diff --git a/src/mesa/pipe/i965simple/brw_state.c b/src/mesa/pipe/i965simple/brw_state.c
index ff4ae7999b..26450ae597 100644
--- a/src/mesa/pipe/i965simple/brw_state.c
+++ b/src/mesa/pipe/i965simple/brw_state.c
@@ -198,6 +198,13 @@ static void * brw_create_fs_state(struct pipe_context *pipe,
    /* XXX: Do I have to duplicate the tokens as well??
     */
    brw_fp->program = *shader;
+   brw_fp->id = brw_context(pipe)->program_id++;
+
+   brw_shader_info(shader->tokens,
+		   &brw_fp->info);
+
+   tgsi_dump(shader->tokens, 0);
+
 
    return (void *)brw_fp;
 }
@@ -228,6 +235,9 @@ static void *brw_create_vs_state(struct pipe_context *pipe,
    /* XXX: Do I have to duplicate the tokens as well??
     */
    brw_vp->program = *shader;
+   brw_vp->id = brw_context(pipe)->program_id++;
+   brw_shader_info(shader->tokens,
+		   &brw_vp->info);
 
    tgsi_dump(shader->tokens, 0);
 
@@ -273,14 +283,11 @@ static void brw_set_viewport_state( struct pipe_context *pipe,
 
 
 static void brw_set_vertex_buffer( struct pipe_context *pipe,
-                                    unsigned index,
-                                    const struct pipe_vertex_buffer *buffer )
+				   unsigned index,
+				   const struct pipe_vertex_buffer *buffer )
 {
    struct brw_context *brw = brw_context(pipe);
-   brw->vb.vbo_array[index] = *buffer;
-   if (index > brw->vb.last_vb)
-      brw->vb.last_vb = index;
-   assert(brw->vb.last_vb < BRW_VEP_MAX);
+   brw->vb.vbo_array[index] = buffer;   /* NOTE(review): keeps the caller's pointer (the old code copied *buffer) and drops the bound assert; confirm lifetime and index range */
 }
 
 static void brw_set_vertex_element(struct pipe_context *pipe,
diff --git a/src/mesa/pipe/i965simple/brw_state.h b/src/mesa/pipe/i965simple/brw_state.h
index 4dabfe8082..d09711f6f0 100644
--- a/src/mesa/pipe/i965simple/brw_state.h
+++ b/src/mesa/pipe/i965simple/brw_state.h
@@ -154,4 +154,11 @@ void brw_upload_clip_prog(struct brw_context *brw);
 void brw_upload_blend_constant_color(struct brw_context *brw);
 void brw_upload_wm_samplers(struct brw_context *brw);
 
+/* brw_shader_info.c: scan a token stream's leading declarations and
+ * raise info->nr_regs[] for each register file declared. */
+
+void brw_shader_info(const struct tgsi_token *tokens,
+		     struct brw_shader_info *info );
+
+
 #endif
diff --git a/src/mesa/pipe/i965simple/brw_state_cache.c b/src/mesa/pipe/i965simple/brw_state_cache.c
index 13e262d2e5..c5738733f4 100644
--- a/src/mesa/pipe/i965simple/brw_state_cache.c
+++ b/src/mesa/pipe/i965simple/brw_state_cache.c
@@ -178,8 +178,9 @@ unsigned brw_upload_cache( struct brw_cache *cache,
 
    if (BRW_DEBUG & DEBUG_STATE)
       printf("upload %s: %d bytes to pool buffer %p offset %x\n",
-             cache->name, data_size,
-             cache->pool->buffer,
+             cache->name, 
+	     data_size,
+             (void*)cache->pool->buffer,
              offset);
 
    /* Copy data to the buffer:
diff --git a/src/mesa/pipe/i965simple/brw_state_pool.c b/src/mesa/pipe/i965simple/brw_state_pool.c
index a490049024..78268ed8f2 100644
--- a/src/mesa/pipe/i965simple/brw_state_pool.c
+++ b/src/mesa/pipe/i965simple/brw_state_pool.c
@@ -43,17 +43,18 @@
  */
 
 #include "pipe/p_winsys.h"
+#include "pipe/p_util.h"
 #include "brw_context.h"
 #include "brw_state.h"
 
 boolean brw_pool_alloc( struct brw_mem_pool *pool,
 			  unsigned size,
-			  unsigned align,
+			  unsigned alignment,
 			  unsigned *offset_return)
 {
-   unsigned fixup = ALIGN(pool->offset, align) - pool->offset;
+   unsigned fixup = align(pool->offset, alignment) - pool->offset;
 
-   size = ALIGN(size, 4);
+   size = align(size, 4);
 
    if (pool->offset + fixup + size >= pool->size) {
       printf("%s failed\n", __FUNCTION__);
@@ -114,7 +115,7 @@ void brw_pool_check_wrap( struct brw_context *brw,
 			  struct brw_mem_pool *pool )
 {
    if (pool->offset > (pool->size * 3) / 4) {
-      brw->state.dirty.brw |= BRW_NEW_CONTEXT;
+      brw->state.dirty.brw |= BRW_NEW_SCENE;
    }
 
 }
diff --git a/src/mesa/pipe/i965simple/brw_state_upload.c b/src/mesa/pipe/i965simple/brw_state_upload.c
index 1ca7484958..1fb480172d 100644
--- a/src/mesa/pipe/i965simple/brw_state_upload.c
+++ b/src/mesa/pipe/i965simple/brw_state_upload.c
@@ -97,39 +97,16 @@ const struct brw_tracked_state *atoms[] =
 
 void brw_init_state( struct brw_context *brw )
 {
-   unsigned i;
-
    brw_init_pools(brw);
    brw_init_caches(brw);
 
-   brw->state.atoms = MALLOC(sizeof(atoms));
-   brw->state.nr_atoms = sizeof(atoms)/sizeof(*atoms);
-   memcpy(brw->state.atoms, atoms, sizeof(atoms));
-
-   /* Patch in a pointer to the dynamic state atom:
-    */
-   for (i = 0; i < brw->state.nr_atoms; i++)
-      if (brw->state.atoms[i] == NULL)
-	 brw->state.atoms[i] = &brw->curbe.tracked_state;
-
-   memcpy(&brw->curbe.tracked_state,
-		&brw_constant_buffer,
-		sizeof(brw_constant_buffer));
-
    brw->state.dirty.brw = ~0;
    brw->emit_state_always = 0;
-
-
 }
 
 
 void brw_destroy_state( struct brw_context *brw )
 {
-   if (brw->state.atoms) {
-      FREE(brw->state.atoms);
-      brw->state.atoms = NULL;
-   }
-
    brw_destroy_caches(brw);
    brw_destroy_batch_cache(brw);
    brw_destroy_pools(brw);
@@ -177,7 +154,7 @@ void brw_validate_state( struct brw_context *brw )
        state->brw == 0)
       return;
 
-   if (brw->state.dirty.brw & BRW_NEW_CONTEXT)
+   if (brw->state.dirty.brw & BRW_NEW_SCENE)
       brw_clear_batch_cache_flush(brw);
 
    if (BRW_DEBUG) {
@@ -189,21 +166,17 @@ void brw_validate_state( struct brw_context *brw )
       memset(&examined, 0, sizeof(examined));
       prev = *state;
 
-      for (i = 0; i < brw->state.nr_atoms; i++) {
-	 const struct brw_tracked_state *atom = brw->state.atoms[i];
+      for (i = 0; i < Elements(atoms); i++) {
+	 const struct brw_tracked_state *atom = atoms[i];
 	 struct brw_state_flags generated;
 
 	 assert(atom->dirty.brw ||
 		atom->dirty.cache);
 	 assert(atom->update);
 
-	 if (check_state(state, &atom->dirty) || atom->always_update) {
+	 if (check_state(state, &atom->dirty)) {
 	    atom->update( brw );
-
-/* 	    emit_foo(brw); */
 	 }
-	 if (atom->emit_reloc != NULL)
-	    atom->emit_reloc(brw);
 
 	 accumulate_state(&examined, &atom->dirty);
 
@@ -218,12 +191,10 @@ void brw_validate_state( struct brw_context *brw )
    }
    else {
       for (i = 0; i < Elements(atoms); i++) {
-	 const struct brw_tracked_state *atom = brw->state.atoms[i];
+	 const struct brw_tracked_state *atom = atoms[i];
 
-	 if (check_state(state, &atom->dirty) || atom->always_update)
+	 if (check_state(state, &atom->dirty))
 	    atom->update( brw );
-	 if (atom->emit_reloc != NULL)
-	    atom->emit_reloc(brw);
       }
    }
 
diff --git a/src/mesa/pipe/i965simple/brw_tex_layout.c b/src/mesa/pipe/i965simple/brw_tex_layout.c
index b9514be0c2..7d6e2851b1 100644
--- a/src/mesa/pipe/i965simple/brw_tex_layout.c
+++ b/src/mesa/pipe/i965simple/brw_tex_layout.c
@@ -149,10 +149,10 @@ static void i945_miptree_layout_2d(struct brw_texture *tex)
       unsigned mip1_width;
 
       if (pt->compressed) {
-         mip1_width = ALIGN(minify(pt->width[0]), align_w)
-                      + ALIGN(minify(minify(pt->width[0])), align_w);
+         mip1_width = align(minify(pt->width[0]), align_w)
+                      + align(minify(minify(pt->width[0])), align_w);
       } else {
-         mip1_width = ALIGN(minify(pt->width[0]), align_w)
+         mip1_width = align(minify(pt->width[0]), align_w)
                       + minify(minify(pt->width[0]));
       }
 
@@ -164,7 +164,7 @@ static void i945_miptree_layout_2d(struct brw_texture *tex)
    /* Pitch must be a whole number of dwords, even though we
     * express it in texels.
     */
-   tex->pitch = ALIGN(tex->pitch * pt->cpp, 4) / pt->cpp;
+   tex->pitch = align(tex->pitch * pt->cpp, 4) / pt->cpp;
    tex->total_height = 0;
 
    for ( level = pt->first_level ; level <= pt->last_level ; level++ ) {
@@ -176,7 +176,7 @@ static void i945_miptree_layout_2d(struct brw_texture *tex)
       if (pt->compressed)
 	 img_height = MAX2(1, height/4);
       else
-	 img_height = ALIGN(height, align_h);
+	 img_height = align(height, align_h);
 
 
       /* Because the images are packed better, the final offset
@@ -187,7 +187,7 @@ static void i945_miptree_layout_2d(struct brw_texture *tex)
       /* Layout_below: step right after second mipmap.
        */
       if (level == pt->first_level + 1) {
-	 x += ALIGN(width, align_w);
+	 x += align(width, align_w);
       }
       else {
 	 y += img_height;
@@ -221,13 +221,13 @@ static boolean brw_miptree_layout(struct pipe_context *pipe, struct brw_texture
 #if 0
       if (pt->compressed) {
          align_w = intel_compressed_alignment(pt->internal_format);
-         pt->pitch = ALIGN(width, align_w);
+         pt->pitch = align(width, align_w);
          pack_y_pitch = (height + 3) / 4;
       } else
 #endif
       {
-         tex->pitch = ALIGN(pt->width[0] * pt->cpp, 4) / pt->cpp;
-         pack_y_pitch = ALIGN(pt->height[0], align_h);
+         tex->pitch = align(pt->width[0] * pt->cpp, 4) / pt->cpp;
+         pack_y_pitch = align(pt->height[0], align_h);
       }
 
       pack_x_pitch = tex->pitch;
@@ -262,8 +262,8 @@ static boolean brw_miptree_layout(struct pipe_context *pipe, struct brw_texture
          if (pt->compressed) {
             pack_y_pitch = (height + 3) / 4;
 
-            if (pack_x_pitch > ALIGN(width, align_w)) {
-               pack_x_pitch = ALIGN(width, align_w);
+            if (pack_x_pitch > align(width, align_w)) {
+               pack_x_pitch = align(width, align_w);
                pack_x_nr <<= 1;
             }
          } else {
@@ -275,7 +275,7 @@ static boolean brw_miptree_layout(struct pipe_context *pipe, struct brw_texture
 
             if (pack_y_pitch > 2) {
                pack_y_pitch >>= 1;
-               pack_y_pitch = ALIGN(pack_y_pitch, align_h);
+               pack_y_pitch = align(pack_y_pitch, align_h);
             }
          }
 
@@ -305,8 +305,6 @@ brw_texture_create(struct pipe_context *pipe, struct pipe_texture **pt)
                                      sizeof(struct brw_texture));
 
    if (tex) {
-      struct brw_context *brw = brw_context(pipe);
-
       memset(&tex->base + 1, 0,
 	     sizeof(struct brw_texture) - sizeof(struct pipe_texture));
 
diff --git a/src/mesa/pipe/i965simple/brw_vs.c b/src/mesa/pipe/i965simple/brw_vs.c
index 33c6624214..738c6346d5 100644
--- a/src/mesa/pipe/i965simple/brw_vs.c
+++ b/src/mesa/pipe/i965simple/brw_vs.c
@@ -97,13 +97,6 @@ static void brw_upload_vs_prog( struct brw_context *brw )
    key.copy_edgeflag = (brw->attribs.Raster->fill_cw != PIPE_POLYGON_MODE_FILL ||
 			brw->attribs.Raster->fill_ccw != PIPE_POLYGON_MODE_FILL);
 
-#if 0
-   /* BRW_NEW_METAOPS
-    */
-   if (brw->metaops.active)
-      key.know_w_is_one = 1;
-#endif
-
    /* Make an early check for the key.
     */
    if (brw_search_cache(&brw->cache[BRW_VS_PROG],
@@ -120,9 +113,6 @@ static void brw_upload_vs_prog( struct brw_context *brw )
  */
 const struct brw_tracked_state brw_vs_prog = {
    .dirty = {
-#if 0
-      .brw   = BRW_NEW_VS | BRW_NEW_METAOPS,
-#endif
       .brw   = BRW_NEW_VS,
       .cache = 0
    },
diff --git a/src/mesa/pipe/i965simple/brw_vs_constval.c b/src/mesa/pipe/i965simple/brw_vs_constval.c
deleted file mode 100644
index de43e72c1d..0000000000
--- a/src/mesa/pipe/i965simple/brw_vs_constval.c
+++ /dev/null
@@ -1,223 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-
-#include "brw_context.h"
-#include "brw_vs.h"
-
-#if 0
-/* Component is active if it may diverge from [0,0,0,1].  Undef values
- * are promoted to [0,0,0,1] for the purposes of this analysis.
- */
-struct tracker {
-   boolean twoside;
-   ubyte active[PROGRAM_OUTPUT+1][128];
-   unsigned size_masks[4];
-};
-
-
-static void set_active_component( struct tracker *t,
-				  unsigned file,
-				  unsigned index,
-				  ubyte active )
-{
-   switch (file) {
-   case PROGRAM_TEMPORARY:
-   case PROGRAM_INPUT:
-   case PROGRAM_OUTPUT:
-      t->active[file][index] |= active;
-
-   default:
-      break;
-   }
-}
-
-static void set_active( struct tracker *t,
-			struct prog_dst_register dst,
-			unsigned active )
-{
-   set_active_component( t, dst.File, dst.Index, active & dst.WriteMask );
-}
-
-
-static ubyte get_active_component( struct tracker *t,
-				     unsigned file,
-				     unsigned index,
-				     unsigned component,
-				     ubyte swz )
-{
-   switch (swz) {
-   case SWIZZLE_ZERO:
-      return component < 3 ? 0 : (1<<component);
-   case SWIZZLE_ONE:
-      return component == 3 ? 0 : (1<<component);
-   default:
-      switch (file) {
-      case PROGRAM_TEMPORARY:
-      case PROGRAM_INPUT:
-      case PROGRAM_OUTPUT:
-	 return t->active[file][index] & (1<<component);
-      default:
-	 return 1 << component;
-      }
-   }
-}
-
-
-static ubyte get_active( struct tracker *t,
-			   struct prog_src_register src )
-{
-   unsigned i;
-   ubyte active = src.NegateBase; /* NOTE! */
-
-   if (src.RelAddr)
-      return 0xf;
-
-   for (i = 0; i < 4; i++)
-      active |= get_active_component(t, src.File, src.Index, i,
-				     GET_SWZ(src.Swizzle, i));
-
-   return active;
-}
-
-static ubyte get_output_size( struct tracker *t,
-				unsigned idx )
-{
-   ubyte active = t->active[PROGRAM_OUTPUT][idx];
-   if (active & (1<<3)) return 4;
-   if (active & (1<<2)) return 3;
-   if (active & (1<<1)) return 2;
-   if (active & (1<<0)) return 1;
-   return 0;
-}
-
-/* Note the potential copying that occurs in the setup program:
- */
-static void calc_sizes( struct tracker *t )
-{
-   unsigned i;
-
-   if (t->twoside) {
-      t->active[PROGRAM_OUTPUT][VERT_RESULT_COL0] |=
-	 t->active[PROGRAM_OUTPUT][VERT_RESULT_BFC0];
-
-      t->active[PROGRAM_OUTPUT][VERT_RESULT_COL1] |=
-	 t->active[PROGRAM_OUTPUT][VERT_RESULT_BFC1];
-   }
-
-   for (i = 0; i < FRAG_ATTRIB_MAX; i++) {
-      switch (get_output_size(t, i)) {
-      case 4: t->size_masks[4-1] |= 1<<i;
-      case 3: t->size_masks[3-1] |= 1<<i;
-      case 2: t->size_masks[2-1] |= 1<<i;
-      case 1: t->size_masks[1-1] |= 1<<i;
-	 break;
-      }
-   }
-}
-
-static ubyte szflag[4+1] = {
-   0,
-   0x1,
-   0x3,
-   0x7,
-   0xf
-};
-
-/* Pull a size out of the packed array:
- */
-static unsigned get_input_size(struct brw_context *brw,
-			     unsigned attr)
-{
-   unsigned sizes_dword = brw->vb.info.sizes[attr/16];
-   unsigned sizes_bits = (sizes_dword>>((attr%16)*2)) & 0x3;
-   return sizes_bits + 1;
-/*    return brw->vb.inputs[attr].glarray->Size; */
-}
-
-/* Calculate sizes of vertex program outputs.  Size is the largest
- * component index which might vary from [0,0,0,1]
- */
-static void calc_wm_input_sizes( struct brw_context *brw )
-{
-   /* BRW_NEW_VERTEX_PROGRAM */
-   struct brw_vertex_program *vp =
-      (struct brw_vertex_program *)brw->vertex_program;
-   /* BRW_NEW_INPUT_DIMENSIONS */
-   struct tracker t;
-   unsigned insn;
-   unsigned i;
-
-   memset(&t, 0, sizeof(t));
-
-   /* _NEW_LIGHT */
-   if (brw->attribs.Light->Model.TwoSide)
-      t.twoside = 1;
-
-   for (i = 0; i < PIPE_ATTRIB_MAX; i++)
-      if (vp->program.Base.InputsRead & (1<<i))
-	 set_active_component(&t, PROGRAM_INPUT, i,
-			      szflag[get_input_size(brw, i)]);
-
-   for (insn = 0; insn < vp->program.Base.NumInstructions; insn++) {
-      struct prog_instruction *inst = &vp->program.Base.Instructions[insn];
-
-      switch (inst->Opcode) {
-      case OPCODE_ARL:
-	 break;
-
-      case OPCODE_MOV:
-	 set_active(&t, inst->DstReg, get_active(&t, inst->SrcReg[0]));
-	 break;
-
-      default:
-	 set_active(&t, inst->DstReg, 0xf);
-	 break;
-      }
-   }
-
-   calc_sizes(&t);
-
-   if (memcmp(brw->wm.input_size_masks, t.size_masks, sizeof(t.size_masks)) != 0) {
-      memcpy(brw->wm.input_size_masks, t.size_masks, sizeof(t.size_masks));
-      brw->state.dirty.brw |= BRW_NEW_WM_INPUT_DIMENSIONS;
-   }
-}
-
-const struct brw_tracked_state brw_wm_input_sizes = {
-   .dirty = {
-      .mesa  = _NEW_LIGHT,
-      .brw   = BRW_NEW_VERTEX_PROGRAM | BRW_NEW_INPUT_DIMENSIONS,
-      .cache = 0
-   },
-   .update = calc_wm_input_sizes
-};
-#endif
diff --git a/src/mesa/pipe/i965simple/brw_vs_emit.c b/src/mesa/pipe/i965simple/brw_vs_emit.c
index 59459d4200..530e17a736 100644
--- a/src/mesa/pipe/i965simple/brw_vs_emit.c
+++ b/src/mesa/pipe/i965simple/brw_vs_emit.c
@@ -103,28 +103,26 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c,
    c->first_output = reg;
    mrf = 4;
    for (i = 0; i < c->vp->program.num_outputs; i++) {
-      if (c->prog_data.outputs_written & (1<<i)) {
-	 c->nr_outputs++;
+      c->nr_outputs++;
 #if 0
-	 if (i == VERT_RESULT_HPOS) {
-	    c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
-	    reg++;
-	 }
-	 else if (i == VERT_RESULT_PSIZ) {
-	    c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
-	    reg++;
-	    mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
-	 }
-	 else {
-	    c->regs[TGSI_FILE_OUTPUT][i] = brw_message_reg(mrf);
-	    mrf++;
-	 }
-#else
-         /* for now stuff everything in grf */
+      if (i == VERT_RESULT_HPOS) {
          c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
          reg++;
-#endif
       }
+      else if (i == VERT_RESULT_PSIZ) {
+         c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
+         reg++;
+         mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
+      }
+      else {
+         c->regs[TGSI_FILE_OUTPUT][i] = brw_message_reg(mrf);
+         mrf++;
+      }
+#else
+      /* for now stuff everything in grf */
+      c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
+      reg++;
+#endif
    }
 
    /* Allocate program temporaries:
@@ -627,11 +625,9 @@ static struct brw_reg get_reg( struct brw_vs_compile *c,
    case TGSI_FILE_TEMPORARY:
    case TGSI_FILE_INPUT:
    case TGSI_FILE_OUTPUT:
+   case TGSI_FILE_CONSTANT:
       assert(c->regs[file][index].nr != 0);
       return c->regs[file][index];
-   case TGSI_FILE_CONSTANT:
-      assert(c->regs[TGSI_FILE_CONSTANT][index].nr != 0);
-      return c->regs[TGSI_FILE_CONSTANT][index];
    case TGSI_FILE_ADDRESS:
       assert(index == 0);
       return c->regs[file][index];
diff --git a/src/mesa/pipe/i965simple/brw_vs_state.c b/src/mesa/pipe/i965simple/brw_vs_state.c
index 7d6fb383b9..c73469929c 100644
--- a/src/mesa/pipe/i965simple/brw_vs_state.c
+++ b/src/mesa/pipe/i965simple/brw_vs_state.c
@@ -44,7 +44,7 @@ static void upload_vs_unit( struct brw_context *brw )
 
    /* CACHE_NEW_VS_PROG */
    vs.thread0.kernel_start_pointer = brw->vs.prog_gs_offset >> 6;
-   vs.thread0.grf_reg_count = ALIGN(brw->vs.prog_data->total_grf, 16) / 16 - 1;
+   vs.thread0.grf_reg_count = align(brw->vs.prog_data->total_grf, 16) / 16 - 1;
    vs.thread3.urb_entry_read_length = brw->vs.prog_data->urb_read_length;
    vs.thread3.const_urb_entry_read_length = brw->vs.prog_data->curb_read_length;
    vs.thread3.dispatch_grf_start_reg = 1;
diff --git a/src/mesa/pipe/i965simple/brw_vtbl.c b/src/mesa/pipe/i965simple/brw_vtbl.c
deleted file mode 100644
index 6dc3bd838b..0000000000
--- a/src/mesa/pipe/i965simple/brw_vtbl.c
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
- 
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- 
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
- 
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- 
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-
-
-#include "brw_context.h"
-#include "brw_defines.h"
-#include "brw_state.h"
-
-#include "brw_draw.h"
-#include "brw_state.h"
-#include "brw_vs.h"
-#include <stdarg.h>
-
-#if 0
-/* called from intelDestroyContext()
- */
-static void brw_destroy_context( struct intel_context *intel )
-{
-   GLcontext *ctx = &intel->ctx;
-   struct brw_context *brw = brw_context(&intel->ctx);
-
-   brw_destroy_metaops(brw);
-   brw_destroy_state(brw);
-   brw_draw_destroy( brw );
-
-   brw_ProgramCacheDestroy( ctx );
-   brw_FrameBufferTexDestroy( brw );
-}
-
-/* called from intelDrawBuffer()
- */
-static void brw_set_draw_region( struct intel_context *intel, 
-				  struct intel_region *draw_region,
-				  struct intel_region *depth_region)
-{
-   struct brw_context *brw = brw_context(&intel->ctx);
-
-   intel_region_release(&brw->state.draw_region);
-   intel_region_release(&brw->state.depth_region);
-   intel_region_reference(&brw->state.draw_region, draw_region);
-   intel_region_reference(&brw->state.depth_region, depth_region);
-}
-
-
-/* called from intelFlushBatchLocked
- */
-static void brw_lost_hardware( struct intel_context *intel )
-{
-   struct brw_context *brw = brw_context(&intel->ctx);
-
-   /* Note that we effectively lose the context after this.
-    * 
-    * Setting this flag provokes a state buffer wrap and also flushes
-    * the hardware caches.
-    */
-   brw->state.dirty.brw |= BRW_NEW_CONTEXT;
-
-   /* Which means there shouldn't be any commands already queued:
-    */
-   assert(intel->batch->ptr == intel->batch->map);
-
-   brw->state.dirty.mesa |= ~0;
-   brw->state.dirty.brw |= ~0;
-   brw->state.dirty.cache |= ~0;
-}
-
-static void brw_note_fence( struct intel_context *intel, 
-			    unsigned fence )
-{
-   brw_context(&intel->ctx)->state.dirty.brw |= BRW_NEW_FENCE;
-}
- 
-static void brw_note_unlock( struct intel_context *intel )
-{
-  struct brw_context *brw = brw_context(&intel->ctx);
-
-   brw_pool_check_wrap(brw, &brw->pool[BRW_GS_POOL]);
-   brw_pool_check_wrap(brw, &brw->pool[BRW_SS_POOL]);
-
-   brw_context(&intel->ctx)->state.dirty.brw |= BRW_NEW_LOCK;
-}
-
-
-void brw_do_flush( struct brw_context *brw, 
-		   unsigned flags )
-{
-   struct brw_mi_flush flush;
-   memset(&flush, 0, sizeof(flush));      
-   flush.opcode = CMD_MI_FLUSH;
-   flush.flags = flags;
-   BRW_BATCH_STRUCT(brw, &flush);
-}
-
-
-static void brw_emit_flush( struct intel_context *intel,
-			unsigned unused )
-{
-   brw_do_flush(brw_context(&intel->ctx),
-		BRW_FLUSH_STATE_CACHE|BRW_FLUSH_READ_CACHE);
-}
-
-
-/* called from intelWaitForIdle() and intelFlush()
- *
- * For now, just flush everything.  Could be smarter later.
- */
-static unsigned brw_flush_cmd( void )
-{
-   struct brw_mi_flush flush;
-   flush.opcode = CMD_MI_FLUSH;
-   flush.pad = 0;
-   flush.flags = BRW_FLUSH_READ_CACHE | BRW_FLUSH_STATE_CACHE;
-   return *(unsigned *)&flush;
-}
-
-static void brw_invalidate_state( struct intel_context *intel, unsigned new_state )
-{
-   /* nothing */
-}
-#endif
diff --git a/src/mesa/pipe/i965simple/brw_winsys.h b/src/mesa/pipe/i965simple/brw_winsys.h
index cc0a210a9f..49a12a1c27 100644
--- a/src/mesa/pipe/i965simple/brw_winsys.h
+++ b/src/mesa/pipe/i965simple/brw_winsys.h
@@ -95,6 +95,8 @@ enum brw_cache_id {
    BRW_MAX_CACHE
 };
 
+#define BRW_CONSTANT_BUFFER BRW_MAX_CACHE
+
 /**
  * Additional winsys interface for i965simple.
  *
@@ -164,6 +166,13 @@ struct brw_winsys {
 				unsigned data_type);
    
 
+   /* A cheat so we don't have to think about relocations in a couple
+    * of places yet:
+    */
+   unsigned (*get_buffer_offset)( struct brw_winsys *sws,
+				  struct pipe_buffer_handle *buf,
+				  unsigned flags );
+
 };
 
 #define BRW_BUFFER_ACCESS_WRITE   0x1
diff --git a/src/mesa/pipe/i965simple/brw_wm.c b/src/mesa/pipe/i965simple/brw_wm.c
index 65271f22fd..f0a38d384b 100644
--- a/src/mesa/pipe/i965simple/brw_wm.c
+++ b/src/mesa/pipe/i965simple/brw_wm.c
@@ -33,153 +33,34 @@
 #include "brw_context.h"
 #include "brw_util.h"
 #include "brw_wm.h"
+#include "brw_eu.h"
 #include "brw_state.h"
+#include "pipe/p_util.h"
 
-unsigned brw_wm_nr_args( unsigned opcode )
-{
-   switch (opcode) {
-
-   case WM_PIXELXY:
-   case TGSI_OPCODE_ABS:
-   case TGSI_OPCODE_FLR:
-   case TGSI_OPCODE_FRC:
-   case TGSI_OPCODE_MOV:
-   case TGSI_OPCODE_COS:
-   case TGSI_OPCODE_EX2:
-   case TGSI_OPCODE_LG2:
-   case TGSI_OPCODE_RCP:
-   case TGSI_OPCODE_RSQ:
-   case TGSI_OPCODE_SIN:
-   case TGSI_OPCODE_SCS:
-   case TGSI_OPCODE_TEX:
-   case TGSI_OPCODE_TXB:
-   case TGSI_OPCODE_TXD:
-   case TGSI_OPCODE_KIL:
-   case TGSI_OPCODE_LIT:
-   case WM_CINTERP:
-   case WM_WPOSXY:
-      return 1;
-
-   case TGSI_OPCODE_POW:
-   case TGSI_OPCODE_SUB:
-   case TGSI_OPCODE_SGE:
-   case TGSI_OPCODE_SGT:
-   case TGSI_OPCODE_SLE:
-   case TGSI_OPCODE_SLT:
-   case TGSI_OPCODE_SEQ:
-   case TGSI_OPCODE_SNE:
-   case TGSI_OPCODE_ADD:
-   case TGSI_OPCODE_MAX:
-   case TGSI_OPCODE_MIN:
-   case TGSI_OPCODE_MUL:
-   case TGSI_OPCODE_XPD:
-   case TGSI_OPCODE_DP3:
-   case TGSI_OPCODE_DP4:
-   case TGSI_OPCODE_DPH:
-   case TGSI_OPCODE_DST:
-   case WM_LINTERP:
-   case WM_DELTAXY:
-   case WM_PIXELW:
-      return 2;
-
-   case WM_FB_WRITE:
-   case WM_PINTERP:
-   case TGSI_OPCODE_MAD:
-   case TGSI_OPCODE_CMP:
-   case TGSI_OPCODE_LRP:
-      return 3;
-
-   default:
-      return 0;
-   }
-}
-
-
-unsigned brw_wm_is_scalar_result( unsigned opcode )
-{
-   switch (opcode) {
-   case TGSI_OPCODE_COS:
-   case TGSI_OPCODE_EX2:
-   case TGSI_OPCODE_LG2:
-   case TGSI_OPCODE_POW:
-   case TGSI_OPCODE_RCP:
-   case TGSI_OPCODE_RSQ:
-   case TGSI_OPCODE_SIN:
-   case TGSI_OPCODE_DP3:
-   case TGSI_OPCODE_DP4:
-   case TGSI_OPCODE_DPH:
-   case TGSI_OPCODE_DST:
-      return 1;
-
-   default:
-      return 0;
-   }
-}
 
 
 static void do_wm_prog( struct brw_context *brw,
 			struct brw_fragment_program *fp,
 			struct brw_wm_prog_key *key)
 {
-   struct brw_wm_compile *c;
+   struct brw_wm_compile *c = CALLOC_STRUCT(brw_wm_compile);
    const unsigned *program;
    unsigned program_size;
 
-   c = brw->wm.compile_data;
-   if (c == NULL) {
-     brw->wm.compile_data = calloc(1, sizeof(*brw->wm.compile_data));
-     c = brw->wm.compile_data;
-   } else {
-     memset(c, 0, sizeof(*brw->wm.compile_data));
-   }
-   memcpy(&c->key, key, sizeof(*key));
-
+   c->key = *key;
    c->fp = fp;
-   fprintf(stderr, "XXXXXXXX FP\n");
    
-#if 0
-   c->env_param = brw->intel.ctx.FragmentProgram.Parameters;
-
-   if (brw_wm_is_glsl(&c->fp->program)) {
-       brw_wm_glsl_emit(c);
-   } else
-   {
-       /* Augment fragment program.  Add instructions for pre- and
-	* post-fragment-program tasks such as interpolation and fogging.
-	*/
-       brw_wm_pass_fp(c);
+   c->delta_xy[0] = brw_null_reg();
+   c->delta_xy[1] = brw_null_reg();
+   c->pixel_xy[0] = brw_null_reg();
+   c->pixel_xy[1] = brw_null_reg();
+   c->pixel_w = brw_null_reg();
 
-       /* Translate to intermediate representation.  Build register usage
-	* chains.
-	*/
-       brw_wm_pass0(c);
 
-       /* Dead code removal.
-	*/
-       brw_wm_pass1(c);
-
-       /* Register allocation.
-	*/
-       c->grf_limit = BRW_WM_MAX_GRF/2;
-
-       /* This is where we start emitting gen4 code:
-	*/
-       brw_init_compile(&c->func);
-
-       brw_wm_pass2(c);
-
-       c->prog_data.total_grf = c->max_wm_grf;
-       if (c->last_scratch) {
-	   c->prog_data.total_scratch =
-	       c->last_scratch + 0x40;
-       } else {
-	   c->prog_data.total_scratch = 0;
-       }
+   fprintf(stderr, "XXXXXXXX FP\n");
+   
+   brw_wm_glsl_emit(c);
 
-       /* Emit GEN4 code.
-	*/
-       brw_wm_emit(c);
-   }
    /* get the program
     */
    program = brw_get_program(&c->func, &program_size);
@@ -193,7 +74,8 @@ static void do_wm_prog( struct brw_context *brw,
 					      program_size,
 					      &c->prog_data,
 					      &brw->wm.prog_data );
-#endif
+
+   FREE(c);
 }
 
 
@@ -206,8 +88,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
       (struct brw_fragment_program *)brw->attribs.FragmentProgram;
    unsigned lookup = 0;
    unsigned line_aa;
-   unsigned i;
-
+   
    memset(key, 0, sizeof(*key));
 
    /* Build the index for table lookup
@@ -274,14 +155,10 @@ static void brw_wm_populate_key( struct brw_context *brw,
 
 
 #if 0
-   /* BRW_NEW_WM_INPUT_DIMENSIONS */
-   key->projtex_mask = brw->wm.input_size_masks[4-1] >> (FRAG_ATTRIB_TEX0 - FRAG_ATTRIB_WPOS);
-#endif
-
-   /* _NEW_LIGHT */
-   key->flat_shade = (brw->attribs.Raster->flatshade);
-
-   /* _NEW_TEXTURE */
+   /* BRW_NEW_SAMPLER 
+    *
+    * Not doing any of this at the moment:
+    */
    for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
       const struct pipe_sampler_state *unit = brw->attribs.Samplers[i];
 
@@ -291,12 +168,11 @@ static void brw_wm_populate_key( struct brw_context *brw,
              unit->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
 	    key->shadowtex_mask |= 1<<i;
 	 }
-#if 0
 	 if (t->Image[0][t->BaseLevel]->InternalFormat == GL_YCBCR_MESA)
 	    key->yuvtex_mask |= 1<<i;
-#endif
       }
    }
+#endif
 
 
    /* Extra info:
@@ -329,7 +205,6 @@ static void brw_upload_wm_prog( struct brw_context *brw )
 const struct brw_tracked_state brw_wm_prog = {
    .dirty = {
       .brw   = (BRW_NEW_FS |
-		BRW_NEW_WM_INPUT_DIMENSIONS |
 		BRW_NEW_REDUCED_PRIMITIVE),
       .cache = 0
    },
diff --git a/src/mesa/pipe/i965simple/brw_wm.h b/src/mesa/pipe/i965simple/brw_wm.h
index a394e25da3..a1ac0f504a 100644
--- a/src/mesa/pipe/i965simple/brw_wm.h
+++ b/src/mesa/pipe/i965simple/brw_wm.h
@@ -60,86 +60,19 @@ struct brw_wm_prog_key {
    unsigned aa_dest_stencil_reg:3;
    unsigned dest_depth_reg:3;
    unsigned nr_depth_regs:3;
-   unsigned projtex_mask:8;
    unsigned shadowtex_mask:8;
    unsigned computes_depth:1;	/* could be derived from program string */
    unsigned source_depth_to_render_target:1;
-   unsigned flat_shade:1;
    unsigned runtime_check_aads_emit:1;
-   
-   unsigned yuvtex_mask:8;
-   unsigned pad1:24;
-
-   unsigned program_string_id:32;
-};
-
-
-/* A bit of a glossary:
- *
- * brw_wm_value: A computed value or program input.  Values are
- * constant, they are created once and are never modified.  When a
- * fragment program register is written or overwritten, new values are
- * created fresh, preserving the rule that values are constant.
- *
- * brw_wm_ref: A reference to a value.  Wherever a value used is by an
- * instruction or as a program output, that is tracked with an
- * instance of this struct.  All references to a value occur after it
- * is created.  After the last reference, a value is dead and can be
- * discarded.
- *
- * brw_wm_grf: Represents a physical hardware register.  May be either
- * empty or hold a value.  Register allocation is the process of
- * assigning values to grf registers.  This occurs in pass2 and the
- * brw_wm_grf struct is not used before that.
- *
- * Fragment program registers: These are time-varying constructs that
- * are hard to reason about and which we translate away in pass0.  A
- * single fragment program register element (eg. temp[0].x) will be
- * translated to one or more brw_wm_value structs, one for each time
- * that temp[0].x is written to during the program. 
- */
-
-
 
-/* Used in pass2 to track register allocation.
- */
-struct brw_wm_grf {
-   struct brw_wm_value *value;
-   unsigned nextuse;
-};
+   unsigned yuvtex_mask:8;
 
-struct brw_wm_value {
-   struct brw_reg hw_reg;	/* emitted to this reg, may not always be there */
-   struct brw_wm_ref *lastuse;
-   struct brw_wm_grf *resident; 
-   unsigned contributes_to_output:1;
-   unsigned spill_slot:16;	/* if non-zero, spill immediately after calculation */
+   unsigned program_string_id;
 };
 
-struct brw_wm_ref {
-   struct brw_reg hw_reg;	/* nr filled in in pass2, everything else, pass0 */
-   struct brw_wm_value *value;
-   struct brw_wm_ref *prevuse;
-   unsigned unspill_reg:7;	/* unspill to reg */
-   unsigned emitted:1;
-   unsigned insn:24;
-};
 
-struct brw_wm_constref {
-   const struct brw_wm_ref *ref;
-   float constval;
-};
 
 
-struct brw_wm_instruction {
-   struct brw_wm_value *dst[4];
-   struct brw_wm_ref *src[3][4];
-   unsigned opcode:8;
-   unsigned saturate:1;
-   unsigned writemask:4;
-   unsigned tex_unit:4;   /* texture unit for TEX, TXD, TXP instructions */
-   unsigned tex_idx:3;    /* TEXTURE_1D,2D,3D,CUBE,RECT_INDEX source target */
-};
 
 #define PROGRAM_INTERNAL_PARAM
 #define MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS 1024 /* 72 for GL_ARB_f_p */
@@ -151,124 +84,59 @@ struct brw_wm_instruction {
 #define BRW_WM_MAX_CONST 256
 #define BRW_WM_MAX_KILLS MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS
 
-
-
-/* New opcodes to track internal operations required for WM unit.
- * These are added early so that the registers used can be tracked,
- * freed and reused like those of other instructions.
- */
-#define WM_PIXELXY        (TGSI_OPCODE_LAST)
-#define WM_DELTAXY        (TGSI_OPCODE_LAST + 1)
-#define WM_PIXELW         (TGSI_OPCODE_LAST + 2)
-#define WM_LINTERP        (TGSI_OPCODE_LAST + 3)
-#define WM_PINTERP        (TGSI_OPCODE_LAST + 4)
-#define WM_CINTERP        (TGSI_OPCODE_LAST + 5)
-#define WM_WPOSXY         (TGSI_OPCODE_LAST + 6)
-#define WM_FB_WRITE       (TGSI_OPCODE_LAST + 7)
-#define MAX_WM_OPCODE     (TGSI_OPCODE_LAST + 8)
-
 #define PAYLOAD_DEPTH     (PIPE_ATTRIB_MAX)
 
+#define MAX_IFSN 32
+#define MAX_LOOP_DEPTH 32
+
 struct brw_wm_compile {
    struct brw_compile func;
    struct brw_wm_prog_key key;
-   struct brw_wm_prog_data prog_data;
+   struct brw_wm_prog_data prog_data; /* result */
 
    struct brw_fragment_program *fp;
 
-   float (*env_param)[4];
-
-   enum {
-      START,
-      PASS2_DONE
-   } state;
-
-   /* Initial pass - translate fp instructions to fp instructions,
-    * simplifying and adding instructions for interpolation and
-    * framebuffer writes.
-    */
-   const struct pipe_shader_state *prog_instructions;
-   unsigned nr_fp_insns;
-   unsigned fp_temp;
-   unsigned fp_interp_emitted;
-   unsigned fp_deriv_emitted;
-
-   struct tgsi_src_register pixel_xy;
-   struct tgsi_src_register delta_xy;
-   struct tgsi_src_register pixel_w;
-
-
-   struct brw_wm_value vreg[BRW_WM_MAX_VREG];
-   unsigned nr_vreg;
-
-   struct brw_wm_value creg[BRW_WM_MAX_PARAM];
-   unsigned nr_creg;
+   unsigned grf_limit;
+   unsigned max_wm_grf;
 
-   struct {
-      struct brw_wm_value depth[4]; /* includes r0/r1 */
-      struct brw_wm_value input_interp[PIPE_ATTRIB_MAX];
-   } payload;
 
+   struct brw_reg pixel_xy[2];
+   struct brw_reg delta_xy[2];
+   struct brw_reg pixel_w;
 
-   const struct brw_wm_ref *pass0_fp_reg[16][256][4];
 
-   struct brw_wm_ref undef_ref;
-   struct brw_wm_value undef_value;
+   struct brw_reg wm_regs[8][32][4];
 
-   struct brw_wm_ref refs[BRW_WM_MAX_REF];
-   unsigned nr_refs;
+   struct brw_reg payload_depth[4];
+   struct brw_reg payload_coef[16];
 
-   struct brw_wm_instruction instruction[BRW_WM_MAX_INSN];
-   unsigned nr_insns;
+   struct brw_reg emit_mask_reg;
 
-   struct brw_wm_constref constref[BRW_WM_MAX_CONST];
-   unsigned nr_constrefs;
+   struct brw_instruction *if_inst[MAX_IFSN];
+   int if_insn;
 
-   struct brw_wm_grf pass2_grf[BRW_WM_MAX_GRF/2];
+   struct brw_instruction *loop_inst[MAX_LOOP_DEPTH];
+   int loop_insn;
 
-   unsigned grf_limit;
-   unsigned max_wm_grf;
-   unsigned last_scratch;
+   struct brw_instruction *inst0;
+   struct brw_instruction *inst1;
 
-   struct {
-	boolean inited;
-	struct brw_reg reg;
-   } wm_regs[16][256][4];
    struct brw_reg stack;
-   struct brw_reg emit_mask_reg;
+   struct brw_indirect stack_index;
+
    unsigned reg_index;
+
+   unsigned tmp_start;
    unsigned tmp_index;
 };
 
 
-unsigned brw_wm_nr_args( unsigned opcode );
-unsigned brw_wm_is_scalar_result( unsigned opcode );
-
-void brw_wm_pass_fp( struct brw_wm_compile *c );
-void brw_wm_pass0( struct brw_wm_compile *c );
-void brw_wm_pass1( struct brw_wm_compile *c );
-void brw_wm_pass2( struct brw_wm_compile *c );
-void brw_wm_emit( struct brw_wm_compile *c );
-
-void brw_wm_print_value( struct brw_wm_compile *c,
-			 struct brw_wm_value *value );
-
-void brw_wm_print_ref( struct brw_wm_compile *c,
-		       struct brw_wm_ref *ref );
-
-void brw_wm_print_insn( struct brw_wm_compile *c,
-			struct brw_wm_instruction *inst );
-
-void brw_wm_print_program( struct brw_wm_compile *c,
-			   const char *stage );
 
 void brw_wm_lookup_iz( unsigned line_aa,
 		       unsigned lookup,
 		       struct brw_wm_prog_key *key );
 
-#if 0
-boolean brw_wm_is_glsl(struct gl_fragment_program *fp);
 void brw_wm_glsl_emit(struct brw_wm_compile *c);
-#endif
+void brw_wm_emit_decls(struct brw_wm_compile *c);
 
 #endif
diff --git a/src/mesa/pipe/i965simple/brw_wm_decl.c b/src/mesa/pipe/i965simple/brw_wm_decl.c
new file mode 100644
index 0000000000..392f17fad6
--- /dev/null
+++ b/src/mesa/pipe/i965simple/brw_wm_decl.c
@@ -0,0 +1,377 @@
+
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_wm.h"
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "pipe/tgsi/util/tgsi_parse.h"
+
+static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
+{
+   c->tmp_index++;   /* pre-increment: the first temporary handed out is GRF tmp_start + 1 */
+   c->reg_index = MAX2(c->reg_index, c->tmp_start + c->tmp_index);   /* absolute high-water mark */
+   return brw_vec8_grf(c->tmp_start + c->tmp_index, 0);
+}
+
+static void release_tmps(struct brw_wm_compile *c)
+{
+   c->tmp_index = 0;   /* frees every temporary at once; next alloc_tmp() returns tmp_start + 1 again */
+}
+
+
+/* Non-zero when reg is the null register: ARF file with register number BRW_ARF_NULL. */
+static int is_null( struct brw_reg reg )
+{
+   return (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
+	   reg.nr == BRW_ARF_NULL);
+}
+
+static void emit_pixel_xy( struct brw_wm_compile *c )
+{
+   if (is_null(c->pixel_xy[0])) {
+      /* Lazy: runs only while pixel_xy[0] is still the null register (initialised elsewhere). */
+      struct brw_compile *p = &c->func;
+      struct brw_reg r1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
+
+      c->pixel_xy[0] = vec8(retype(alloc_tmp(c), BRW_REGISTER_TYPE_UW));
+      c->pixel_xy[1] = vec8(retype(alloc_tmp(c), BRW_REGISTER_TYPE_UW));
+
+      /* Calculate pixel centers by adding 1 or 0 to each of the
+       * micro-tile coordinates passed in r1.
+       */
+      brw_ADD(p,
+	      c->pixel_xy[0],
+	      stride(suboffset(r1_uw, 4), 2, 4, 0),
+	      brw_imm_v(0x10101010));
+
+      brw_ADD(p,
+	      c->pixel_xy[1],
+	      stride(suboffset(r1_uw, 5), 2, 4, 0),
+	      brw_imm_v(0x11001100));
+   }
+}
+
+
+
+
+
+/* Lazily fill delta_xy[]: the pixel_xy[] values minus the origin read from r1 (see below). */
+static void emit_delta_xy( struct brw_wm_compile *c )
+{
+   if (is_null(c->delta_xy[0])) {
+      struct brw_compile *p = &c->func;
+      struct brw_reg r1 = brw_vec1_grf(1, 0);
+
+      emit_pixel_xy(c);
+
+      c->delta_xy[0] = alloc_tmp(c);
+      c->delta_xy[1] = alloc_tmp(c);
+
+      /* Calc delta X,Y by subtracting origin in r1 from the pixel
+       * centers.
+       */
+      brw_ADD(p,
+	      c->delta_xy[0],
+	      retype(c->pixel_xy[0], BRW_REGISTER_TYPE_UW),
+	      negate(r1));
+
+      brw_ADD(p,
+	      c->delta_xy[1],
+	      retype(c->pixel_xy[1], BRW_REGISTER_TYPE_UW),
+	      negate(suboffset(r1,1)));
+   }
+}
+
+
+/* Disabled (#if 0): would compute per-pixel w as the reciprocal of the interpolated wpos[3]. */
+#if 0
+static void emit_pixel_w( struct brw_wm_compile *c )
+{
+   if (is_null(c->pixel_w)) {
+      struct brw_compile *p = &c->func;
+
+      struct brw_reg interp_wpos = c->coef_wpos;
+      
+      c->pixel_w = alloc_tmp(c);
+
+      emit_delta_xy(c);
+
+      /* Calc 1/w - just linterp wpos[3] optimized by putting the
+       * result straight into a message reg.
+       */
+      struct brw_reg interp3 = brw_vec1_grf(interp_wpos.nr+1, 4);
+      brw_LINE(p, brw_null_reg(), interp3, c->delta_xy[0]);
+      brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), c->delta_xy[1]);
+
+      /* Calc w */
+      brw_math_16( p, 
+		   c->pixel_w,
+		   BRW_MATH_FUNCTION_INV,
+		   BRW_MATH_SATURATE_NONE,
+		   2, 
+		   brw_null_reg(),
+		   BRW_MATH_PRECISION_FULL);
+   }
+}
+#endif
+
+/* Constant interpolation: for each channel bit set in mask, MOV suboffset 3 of its coefficient into the input reg. */
+static void emit_cinterp(struct brw_wm_compile *c,
+			 int idx,
+			 int mask )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg interp[4];
+   struct brw_reg coef = c->payload_coef[idx];
+   int i;
+
+   interp[0] = brw_vec1_grf(coef.nr, 0);
+   interp[1] = brw_vec1_grf(coef.nr, 4);
+   interp[2] = brw_vec1_grf(coef.nr+1, 0);
+   interp[3] = brw_vec1_grf(coef.nr+1, 4);
+
+   for(i = 0; i < 4; i++ ) {
+      if (mask & (1<<i)) {
+	 struct brw_reg dst = c->wm_regs[TGSI_FILE_INPUT][idx][i];
+	 brw_MOV(p, dst, suboffset(interp[i],3));
+      }
+   }
+}
+
+static void emit_linterp(struct brw_wm_compile *c,
+			 int idx,
+			 int mask )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg interp[4];
+   struct brw_reg coef = c->payload_coef[idx];
+   int i;
+   /* Linear interpolation needs the per-pixel deltas; emit_delta_xy() computes them on first use. */
+   emit_delta_xy(c);
+
+   interp[0] = brw_vec1_grf(coef.nr, 0);
+   interp[1] = brw_vec1_grf(coef.nr, 4);
+   interp[2] = brw_vec1_grf(coef.nr+1, 0);
+   interp[3] = brw_vec1_grf(coef.nr+1, 4);
+
+   for(i = 0; i < 4; i++ ) {
+      if (mask & (1<<i)) {
+	 struct brw_reg dst = c->wm_regs[TGSI_FILE_INPUT][idx][i];
+	 brw_LINE(p, brw_null_reg(), interp[i], c->delta_xy[0]);
+	 brw_MAC(p, dst, suboffset(interp[i],1), c->delta_xy[1]);
+      }
+   }
+}
+
+#if 0
+static void emit_pinterp(struct brw_wm_compile *c,
+			 int idx,
+			 int mask )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg interp[4];
+   struct brw_reg coef = c->payload_coef[idx];
+   int i;
+   /* NOTE(review): get_delta_xy/get_pixel_w/allocate_reg are not defined in this file. */
+   get_delta_xy(c);
+   get_pixel_w(c);
+
+   interp[0] = brw_vec1_grf(coef.nr, 0);
+   interp[1] = brw_vec1_grf(coef.nr, 4);
+   interp[2] = brw_vec1_grf(coef.nr+1, 0);
+   interp[3] = brw_vec1_grf(coef.nr+1, 4);
+
+   for(i = 0; i < 4; i++ ) {
+      if (mask & (1<<i)) {
+	 struct brw_reg dst = allocate_reg(c, TGSI_FILE_INPUT, idx, i);
+	 brw_LINE(p, brw_null_reg(), interp[i], c->delta_xy[0]);
+	 brw_MAC(p, dst, suboffset(interp[i],1), c->delta_xy[1]);
+	 brw_MUL(p, dst, dst, c->pixel_w);
+      }
+   }
+}
+#endif
+
+
+/* Disabled (#if 0): still written against the emit_op/dst_reg/src_reg helpers of the removed brw_wm_fp.c. */
+#if 0
+static void emit_wpos( )
+{ 
+   struct prog_dst_register dst = dst_reg(PROGRAM_INPUT, idx);
+   struct tgsi_full_src_register interp = src_reg(PROGRAM_PAYLOAD, idx);
+   struct tgsi_full_src_register deltas = get_delta_xy(c);
+   struct tgsi_full_src_register arg2;
+   unsigned opcode;
+
+   opcode = WM_LINTERP;
+   arg2 = src_undef();
+
+   /* Have to treat wpos.xy specially:
+    */
+   emit_op(c,
+	   WM_WPOSXY,
+	   dst_mask(dst, WRITEMASK_XY),
+	   0, 0, 0,
+	   get_pixel_xy(c),
+	   src_undef(),
+	   src_undef());
+      
+   dst = dst_mask(dst, WRITEMASK_ZW);
+
+   /* PROGRAM_INPUT.attr.xyzw = INTERP payload.interp[attr].x, deltas.xyw
+    */
+   emit_op(c,
+	   WM_LINTERP,
+	   dst,
+	   0, 0, 0,
+	   interp,
+	   deltas,
+	   arg2);
+}
+#endif
+
+
+
+
+/* Perform register allocation.  GRFs are handed out in this order:
+ *
+ *  -- two registers per depth payload entry (key.nr_depth_regs), from r0
+ *  -- the CURBE copy of the constants, eight components per register
+ *  -- two registers of plane coefficients per input
+ *  -- emit mask (1), stack (2), then four registers per input and output
+ *
+ * Use a totally static register allocation.  This will perform poorly
+ * but is an easy way to get started (again).
+ */
+static void prealloc_reg(struct brw_wm_compile *c)
+{
+   int i, j;
+   int nr_curbe_regs = 0;
+
+   /* R0, then some depth related regs:
+    */
+   for (i = 0; i < c->key.nr_depth_regs; i++) {
+      c->payload_depth[i] =  brw_vec8_grf(i*2, 0);
+      c->reg_index += 2;
+   }
+
+
+   /* Then a copy of our part of the CURBE entry:
+    */
+   {
+      int nr_constants = c->fp->info.nr_regs[TGSI_FILE_CONSTANT];
+      int index = 0;
+
+      c->prog_data.max_const = 4*nr_constants;
+      for (i = 0; i < nr_constants; i++) {
+	 for (j = 0; j < 4; j++, index++) 
+	    c->wm_regs[TGSI_FILE_CONSTANT][i][j] = brw_vec1_grf(c->reg_index + index/8,
+								index%8);
+      }
+
+      nr_curbe_regs = 2*((4*nr_constants+15)/16);
+      c->reg_index += nr_curbe_regs;
+   }
+
+   /* Next we receive the plane coefficients for parameter
+    * interpolation:
+    */
+   for (i = 0; i < c->fp->info.nr_regs[TGSI_FILE_INPUT]; i++) {
+      c->payload_coef[i] = brw_vec8_grf(c->reg_index, 0);
+      c->reg_index += 2;
+   }
+
+   c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
+   c->prog_data.urb_read_length = c->fp->program.num_inputs * 2;   /* NOTE(review): num_inputs here vs info.nr_regs[TGSI_FILE_INPUT] in the loops - confirm they agree */
+   c->prog_data.curb_read_length = nr_curbe_regs;
+
+   /* That's the end of the payload, now we can start allocating registers.
+    */
+   c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
+   c->reg_index++;
+
+   c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
+   c->reg_index += 2;
+
+   /* Now allocate room for the interpolated inputs and staging
+    * registers for the outputs:
+    */
+   for (i = 0; i < c->fp->info.nr_regs[TGSI_FILE_INPUT]; i++) 
+      for (j = 0; j < 4; j++)
+	 c->wm_regs[TGSI_FILE_INPUT][i][j] = brw_vec8_grf( c->reg_index++, 0 );
+
+   for (i = 0; i < c->fp->info.nr_regs[TGSI_FILE_OUTPUT]; i++) 
+      for (j = 0; j < 4; j++)
+	 c->wm_regs[TGSI_FILE_OUTPUT][i][j] = brw_vec8_grf( c->reg_index++, 0 );
+
+   /* Beyond this we should only need registers for internal temporaries:
+    */
+   c->tmp_start = c->reg_index;
+}
+
+
+
+
+/* Need to interpolate fragment program inputs in as a preamble to the
+ * shader.  A more sophisticated compiler would do this on demand, but
+ * we'll do it up front: after prealloc_reg(), walk the leading declaration
+ * tokens and emit interpolation code for each TGSI_FILE_INPUT register.
+ */
+void brw_wm_emit_decls(struct brw_wm_compile *c)
+{
+   struct tgsi_parse_context parse;
+   int done = 0;
+
+   prealloc_reg(c);
+
+   tgsi_parse_init( &parse, c->fp->program.tokens );
+
+   while( !done &&
+	  !tgsi_parse_end_of_tokens( &parse ) ) 
+   {
+      tgsi_parse_token( &parse );
+
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+      {
+	 const struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration;
+	 unsigned first = decl->u.DeclarationRange.First;
+	 unsigned last = decl->u.DeclarationRange.Last;
+	 unsigned mask = decl->Declaration.UsageMask; /* ? */
+	 unsigned i;
+
+	 if (decl->Declaration.File != TGSI_FILE_INPUT)
+	    break;
+
+	 assert(decl->Declaration.Interpolate);
+
+	 for( i = first; i <= last; i++ ) {
+	    switch (decl->Interpolation.Interpolate) {
+	    case TGSI_INTERPOLATE_CONSTANT:
+	       emit_cinterp(c, i, mask);
+	       break;
+
+	    case TGSI_INTERPOLATE_LINEAR:
+	       emit_linterp(c, i, mask);
+	       break;
+
+	    case TGSI_INTERPOLATE_PERSPECTIVE:
+	       /* emit_pinterp() is compiled out (#if 0); fall back to linear for now. */
+	       emit_linterp(c, i, mask);
+	       break;
+	    }
+	 }
+	 break;
+      }
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+      default:
+         done = 1;   /* the first non-declaration token ends the scan */
+	 break;
+      }
+   }
+
+   tgsi_parse_free (&parse);
+   
+   release_tmps(c);
+}
diff --git a/src/mesa/pipe/i965simple/brw_wm_fp.c b/src/mesa/pipe/i965simple/brw_wm_fp.c
deleted file mode 100644
index 20e90bc612..0000000000
--- a/src/mesa/pipe/i965simple/brw_wm_fp.c
+++ /dev/null
@@ -1,1007 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
- 
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- 
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
- 
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- 
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-
-#include "brw_context.h"
-#include "brw_wm.h"
-#include "brw_util.h"
-
-
-#define FIRST_INTERNAL_TEMP MAX_NV_FRAGMENT_PROGRAM_TEMPS
-
-#define X    0
-#define Y    1
-#define Z    2
-#define W    3
-
-
-static const char *wm_opcode_strings[] = {   
-   "PIXELXY",
-   "DELTAXY",
-   "PIXELW",
-   "LINTERP",
-   "PINTERP",
-   "CINTERP",
-   "WPOSXY",
-   "FB_WRITE"
-};
-
-#if 0
-static const char *wm_file_strings[] = {   
-   "PAYLOAD"
-};
-#endif
-
-
-/***********************************************************************
- * Source regs
- */
-#if 0
-static struct prog_src_register src_reg(unsigned file, unsigned idx)
-{
-   struct prog_src_register reg;
-   reg.File = file;
-   reg.Index = idx;
-   reg.Swizzle = SWIZZLE_NOOP;
-   reg.RelAddr = 0;
-   reg.NegateBase = 0;
-   reg.Abs = 0;
-   reg.NegateAbs = 0;
-   return reg;
-}
-
-static struct prog_src_register src_reg_from_dst(struct prog_dst_register dst)
-{
-   return src_reg(dst.File, dst.Index);
-}
-
-static struct prog_src_register src_undef( void )
-{
-   return src_reg(PROGRAM_UNDEFINED, 0);
-}
-
-static boolean src_is_undef(struct prog_src_register src)
-{
-   return src.File == PROGRAM_UNDEFINED;
-}
-
-static struct prog_src_register src_swizzle( struct prog_src_register reg, int x, int y, int z, int w )
-{
-   reg.Swizzle = MAKE_SWIZZLE4(x,y,z,w);
-   return reg;
-}
-
-static struct prog_src_register src_swizzle1( struct prog_src_register reg, int x )
-{
-   return src_swizzle(reg, x, x, x, x);
-}
-
-
-/***********************************************************************
- * Dest regs
- */
-
-static struct prog_dst_register dst_reg(unsigned file, unsigned idx)
-{
-   struct prog_dst_register reg;
-   reg.File = file;
-   reg.Index = idx;
-   reg.WriteMask = WRITEMASK_XYZW;
-   reg.CondMask = 0;
-   reg.CondSwizzle = 0;
-   reg.pad = 0;
-   reg.CondSrc = 0;
-   return reg;
-}
-
-static struct prog_dst_register dst_mask( struct prog_dst_register reg, int mask )
-{
-   reg.WriteMask &= mask;
-   return reg;
-}
-
-static struct prog_dst_register dst_undef( void )
-{
-   return dst_reg(PROGRAM_UNDEFINED, 0);
-}
-
-
-
-static struct prog_dst_register get_temp( struct brw_wm_compile *c )
-{
-   int bit = ffs( ~c->fp_temp );
-
-   if (!bit) {
-      _mesa_printf("%s: out of temporaries\n", __FILE__);
-      exit(1);
-   }
-
-   c->fp_temp |= 1<<(bit-1);
-   return dst_reg(PROGRAM_TEMPORARY, FIRST_INTERNAL_TEMP+(bit-1));
-}
-
-
-static void release_temp( struct brw_wm_compile *c, struct prog_dst_register temp )
-{
-   c->fp_temp &= ~1<<(temp.Index + 1 - FIRST_INTERNAL_TEMP);
-}
-
-
-/***********************************************************************
- * Instructions 
- */
-
-static const struct tgsi_token *get_fp_inst(struct brw_wm_compile *c)
-{
-   return &c->prog_instructions->tokens[c->nr_fp_insns++];
-}
-
-static struct prog_instruction *emit_insn(struct brw_wm_compile *c,
-					const struct prog_instruction *inst0)
-{
-   struct prog_instruction *inst = get_fp_inst(c);
-   *inst = *inst0;
-   inst->Data = (void *)inst0;
-   return inst;
-}
-
-static struct prog_instruction * emit_op(struct brw_wm_compile *c,
-				       unsigned op,
-				       struct prog_dst_register dest,
-				       unsigned saturate,
-				       unsigned tex_src_unit,
-				       unsigned tex_src_target,
-				       struct prog_src_register src0,
-				       struct prog_src_register src1,
-				       struct prog_src_register src2 )
-{
-   struct prog_instruction *inst = get_fp_inst(c);
-      
-   memset(inst, 0, sizeof(*inst));
-
-   inst->Opcode = op;
-   inst->DstReg = dest;
-   inst->SaturateMode = saturate;   
-   inst->TexSrcUnit = tex_src_unit;
-   inst->TexSrcTarget = tex_src_target;
-   inst->SrcReg[0] = src0;
-   inst->SrcReg[1] = src1;
-   inst->SrcReg[2] = src2;
-   return inst;
-}
-   
-
-
-
-/***********************************************************************
- * Special instructions for interpolation and other tasks
- */
-
-static struct prog_src_register get_pixel_xy( struct brw_wm_compile *c )
-{
-   if (src_is_undef(c->pixel_xy)) {
-      struct prog_dst_register pixel_xy = get_temp(c);
-      struct prog_src_register payload_r0_depth = src_reg(PROGRAM_PAYLOAD, PAYLOAD_DEPTH);
-      
-      
-      /* Emit the out calculations, and hold onto the results.  Use
-       * two instructions as a temporary is required.
-       */   
-      /* pixel_xy.xy = PIXELXY payload[0];
-       */
-      emit_op(c,
-	      WM_PIXELXY,
-	      dst_mask(pixel_xy, WRITEMASK_XY),
-	      0, 0, 0,
-	      payload_r0_depth,
-	      src_undef(),
-	      src_undef());
-
-      c->pixel_xy = src_reg_from_dst(pixel_xy);
-   }
-
-   return c->pixel_xy;
-}
-
-static struct prog_src_register get_delta_xy( struct brw_wm_compile *c )
-{
-   if (src_is_undef(c->delta_xy)) {
-      struct prog_dst_register delta_xy = get_temp(c);
-      struct prog_src_register pixel_xy = get_pixel_xy(c);
-      struct prog_src_register payload_r0_depth = src_reg(PROGRAM_PAYLOAD, PAYLOAD_DEPTH);
-      
-      /* deltas.xy = DELTAXY pixel_xy, payload[0]
-       */
-      emit_op(c,
-	      WM_DELTAXY,
-	      dst_mask(delta_xy, WRITEMASK_XY),
-	      0, 0, 0,
-	      pixel_xy, 
-	      payload_r0_depth,
-	      src_undef());
-      
-      c->delta_xy = src_reg_from_dst(delta_xy);
-   }
-
-   return c->delta_xy;
-}
-
-static struct prog_src_register get_pixel_w( struct brw_wm_compile *c )
-{
-   if (src_is_undef(c->pixel_w)) {
-      struct prog_dst_register pixel_w = get_temp(c);
-      struct prog_src_register deltas = get_delta_xy(c);
-      struct prog_src_register interp_wpos = src_reg(PROGRAM_PAYLOAD, FRAG_ATTRIB_WPOS);
-      
-      
-      /* deltas.xyw = DELTAS2 deltas.xy, payload.interp_wpos.x
-       */
-      emit_op(c,
-	      WM_PIXELW,
-	      dst_mask(pixel_w, WRITEMASK_W),
-	      0, 0, 0,
-	      interp_wpos,
-	      deltas, 
-	      src_undef());
-      
-
-      c->pixel_w = src_reg_from_dst(pixel_w);
-   }
-
-   return c->pixel_w;
-}
-
-static void emit_interp( struct brw_wm_compile *c,
-			 unsigned idx )
-{
-   struct prog_dst_register dst = dst_reg(PROGRAM_INPUT, idx);
-   struct prog_src_register interp = src_reg(PROGRAM_PAYLOAD, idx);
-   struct prog_src_register deltas = get_delta_xy(c);
-   struct prog_src_register arg2;
-   unsigned opcode;
-   
-   /* Need to use PINTERP on attributes which have been
-    * multiplied by 1/W in the SF program, and LINTERP on those
-    * which have not:
-    */
-   switch (idx) {
-   case FRAG_ATTRIB_WPOS:
-      opcode = WM_LINTERP;
-      arg2 = src_undef();
-
-      /* Have to treat wpos.xy specially:
-       */
-      emit_op(c,
-	      WM_WPOSXY,
-	      dst_mask(dst, WRITEMASK_XY),
-	      0, 0, 0,
-	      get_pixel_xy(c),
-	      src_undef(),
-	      src_undef());
-      
-      dst = dst_mask(dst, WRITEMASK_ZW);
-
-      /* PROGRAM_INPUT.attr.xyzw = INTERP payload.interp[attr].x, deltas.xyw
-       */
-      emit_op(c,
-	      WM_LINTERP,
-	      dst,
-	      0, 0, 0,
-	      interp,
-	      deltas,
-	      arg2);
-      break;
-   case FRAG_ATTRIB_COL0:
-   case FRAG_ATTRIB_COL1:
-      if (c->key.flat_shade) {
-	 emit_op(c,
-		 WM_CINTERP,
-		 dst,
-		 0, 0, 0,
-		 interp,
-		 src_undef(),
-		 src_undef());
-      }
-      else {
-	 emit_op(c,
-		 WM_LINTERP,
-		 dst,
-		 0, 0, 0,
-		 interp,
-		 deltas,
-		 src_undef());
-      }
-      break;
-   default:
-      emit_op(c,
-	      WM_PINTERP,
-	      dst,
-	      0, 0, 0,
-	      interp,
-	      deltas,
-	      get_pixel_w(c));
-      break;
-   }
-
-   c->fp_interp_emitted |= 1<<idx;
-}
-
-static void emit_ddx( struct brw_wm_compile *c,
-        const struct prog_instruction *inst )
-{
-    unsigned idx = inst->SrcReg[0].Index;
-    struct prog_src_register interp = src_reg(PROGRAM_PAYLOAD, idx);
-
-    c->fp_deriv_emitted |= 1<<idx;
-    emit_op(c,
-            OPCODE_DDX,
-            inst->DstReg,
-            0, 0, 0,
-            interp,
-            get_pixel_w(c),
-            src_undef());
-}
-
-static void emit_ddy( struct brw_wm_compile *c,
-        const struct prog_instruction *inst )
-{
-    unsigned idx = inst->SrcReg[0].Index;
-    struct prog_src_register interp = src_reg(PROGRAM_PAYLOAD, idx);
-
-    c->fp_deriv_emitted |= 1<<idx;
-    emit_op(c,
-            OPCODE_DDY,
-            inst->DstReg,
-            0, 0, 0,
-            interp,
-            get_pixel_w(c),
-            src_undef());
-}
-
-/***********************************************************************
- * Hacks to extend the program parameter and constant lists.
- */
-
-/* Add the fog parameters to the parameter list of the original
- * program, rather than creating a new list.  Doesn't really do any
- * harm and it's not as if the parameter handling isn't a big hack
- * anyway.
- */
-static struct prog_src_register search_or_add_param5(struct brw_wm_compile *c, 
-                                                     int s0,
-                                                     int s1,
-                                                     int s2,
-                                                     int s3,
-                                                     int s4)
-{
-   struct gl_program_parameter_list *paramList = c->fp->program.Base.Parameters;
-   gl_state_index tokens[STATE_LENGTH];
-   unsigned idx;
-   tokens[0] = s0;
-   tokens[1] = s1;
-   tokens[2] = s2;
-   tokens[3] = s3;
-   tokens[4] = s4;
-   
-   for (idx = 0; idx < paramList->NumParameters; idx++) {
-      if (paramList->Parameters[idx].Type == PROGRAM_STATE_VAR &&
-	  memcmp(paramList->Parameters[idx].StateIndexes, tokens, sizeof(tokens)) == 0)
-	 return src_reg(PROGRAM_STATE_VAR, idx);
-   }
-
-   idx = _mesa_add_state_reference( paramList, tokens );
-
-   /* Recalculate state dependency: 
-    */
-   c->fp->param_state = paramList->StateFlags;
-
-   return src_reg(PROGRAM_STATE_VAR, idx);
-}
-
-
-static struct prog_src_register search_or_add_const4f( struct brw_wm_compile *c, 
-						     float s0,
-						     float s1,
-						     float s2,
-						     float s3)
-{
-   struct gl_program_parameter_list *paramList = c->fp->program.Base.Parameters;
-   float values[4];
-   unsigned idx;
-   unsigned swizzle;
-
-   values[0] = s0;
-   values[1] = s1;
-   values[2] = s2;
-   values[3] = s3;
-
-   /* Have to search, otherwise multiple compilations will each grow
-    * the parameter list.
-    */
-   for (idx = 0; idx < paramList->NumParameters; idx++) {
-      if (paramList->Parameters[idx].Type == PROGRAM_CONSTANT &&
-	  memcmp(paramList->ParameterValues[idx], values, sizeof(values)) == 0)
-
-	 /* XXX: this mimics the mesa bug which puts all constants and
-	  * parameters into the "PROGRAM_STATE_VAR" category:
-	  */
-	 return src_reg(PROGRAM_STATE_VAR, idx);
-   }
-   
-   idx = _mesa_add_unnamed_constant( paramList, values, 4, &swizzle );
-   /* XXX what about swizzle? */
-   return src_reg(PROGRAM_STATE_VAR, idx);
-}
-
-
-
-/***********************************************************************
- * Expand various instructions here to simpler forms.  
- */
-static void precalc_dst( struct brw_wm_compile *c,
-			       const struct prog_instruction *inst )
-{
-   struct prog_src_register src0 = inst->SrcReg[0];
-   struct prog_src_register src1 = inst->SrcReg[1];
-   struct prog_dst_register dst = inst->DstReg;
-   
-   if (dst.WriteMask & WRITEMASK_Y) {      
-      /* dst.y = mul src0.y, src1.y
-       */
-      emit_op(c,
-	      OPCODE_MUL,
-	      dst_mask(dst, WRITEMASK_Y),
-	      inst->SaturateMode, 0, 0,
-	      src0,
-	      src1,
-	      src_undef());
-   }
-
-
-   if (dst.WriteMask & WRITEMASK_XZ) {
-      unsigned z = GET_SWZ(src0.Swizzle, Z);
-
-      /* dst.xz = swz src0.1zzz
-       */
-      emit_op(c,
-	      OPCODE_SWZ,
-	      dst_mask(dst, WRITEMASK_XZ),
-	      inst->SaturateMode, 0, 0,
-	      src_swizzle(src0, SWIZZLE_ONE, z, z, z),
-	      src_undef(),
-	      src_undef());
-   }
-   if (dst.WriteMask & WRITEMASK_W) {
-      /* dst.w = mov src1.w
-       */
-      emit_op(c,
-	      OPCODE_MOV,
-	      dst_mask(dst, WRITEMASK_W),
-	      inst->SaturateMode, 0, 0,
-	      src1,
-	      src_undef(),
-	      src_undef());
-   }
-}
-
-
-static void precalc_lit( struct brw_wm_compile *c,
-			 const struct prog_instruction *inst )
-{
-   struct prog_src_register src0 = inst->SrcReg[0];
-   struct prog_dst_register dst = inst->DstReg;
-   
-   if (dst.WriteMask & WRITEMASK_XW) {
-      /* dst.xw = swz src0.1111
-       */
-      emit_op(c,
-	      OPCODE_SWZ,
-	      dst_mask(dst, WRITEMASK_XW),
-	      0, 0, 0,
-	      src_swizzle1(src0, SWIZZLE_ONE),
-	      src_undef(),
-	      src_undef());
-   }
-
-
-   if (dst.WriteMask & WRITEMASK_YZ) {
-      emit_op(c,
-	      OPCODE_LIT,
-	      dst_mask(dst, WRITEMASK_YZ),
-	      inst->SaturateMode, 0, 0,
-	      src0,
-	      src_undef(),
-	      src_undef());
-   }
-}
-
-static void precalc_tex( struct brw_wm_compile *c,
-			 const struct prog_instruction *inst )
-{
-   struct prog_src_register coord;
-   struct prog_dst_register tmpcoord;
-
-   if (inst->TexSrcTarget == TEXTURE_CUBE_INDEX) {
-       struct prog_instruction *out;
-       struct prog_dst_register tmp0 = get_temp(c);
-       struct prog_src_register tmp0src = src_reg_from_dst(tmp0);
-       struct prog_dst_register tmp1 = get_temp(c);
-       struct prog_src_register tmp1src = src_reg_from_dst(tmp1);
-       struct prog_src_register src0 = inst->SrcReg[0];
-
-       tmpcoord = get_temp(c);
-       coord = src_reg_from_dst(tmpcoord);
-
-       out = emit_op(c, OPCODE_MOV,
-                     tmpcoord,
-                     0, 0, 0,
-                     src0,
-                     src_undef(),
-                     src_undef());
-       out->SrcReg[0].NegateBase = 0;
-       out->SrcReg[0].Abs = 1;
-
-       emit_op(c, OPCODE_MAX,
-               tmp0,
-               0, 0, 0,
-               src_swizzle1(coord, X),
-               src_swizzle1(coord, Y),
-               src_undef());
-
-       emit_op(c, OPCODE_MAX,
-               tmp1,
-               0, 0, 0,
-               tmp0src,
-               src_swizzle1(coord, Z),
-               src_undef());
-
-       emit_op(c, OPCODE_RCP,
-               tmp0,
-               0, 0, 0,
-               tmp1src,
-               src_undef(),
-               src_undef());
-
-       emit_op(c, OPCODE_MUL,
-               tmpcoord,
-               0, 0, 0,
-               src0,
-               tmp0src,
-               src_undef());
-
-       release_temp(c, tmp0);
-       release_temp(c, tmp1);
-   } else if (inst->TexSrcTarget == TEXTURE_RECT_INDEX) {
-      struct prog_src_register scale = 
-	 search_or_add_param5( c, 
-			       STATE_INTERNAL, 
-			       STATE_TEXRECT_SCALE,
-			       inst->TexSrcUnit,
-			       0,0 );
-
-      tmpcoord = get_temp(c);
-
-      /* coord.xy   = MUL inst->SrcReg[0], { 1/width, 1/height }
-       */
-      emit_op(c,
-	      OPCODE_MUL,
-	      tmpcoord,
-	      0, 0, 0,
-	      inst->SrcReg[0],
-	      scale,
-	      src_undef());
-
-      coord = src_reg_from_dst(tmpcoord);
-   }
-   else {
-      coord = inst->SrcReg[0];
-   }
-
-   /* Need to emit YUV texture conversions by hand.  Probably need to
-    * do this here - the alternative is in brw_wm_emit.c, but the
-    * conversion requires allocating a temporary variable which we
-    * don't have the facility to do that late in the compilation.
-    */
-   if (!(c->key.yuvtex_mask & (1<<inst->TexSrcUnit))) {
-      emit_op(c, 
-	      OPCODE_TEX,
-	      inst->DstReg,
-	      inst->SaturateMode,
-	      inst->TexSrcUnit,
-	      inst->TexSrcTarget,
-	      coord,
-	      src_undef(),
-	      src_undef());
-   }
-   else {
-      /* 
-	 CONST C0 = { -.5, -.0625,  -.5, 1.164 }
-	 CONST C1 = { 1.596, -0.813, 2.018, -.391 }
-	 UYV     = TEX ...
-	 UYV.xyz = ADD UYV,     C0
-	 UYV.y   = MUL UYV.y,   C0.w
-	 RGB.xyz = MAD UYV.xxz, C1,   UYV.y
-	 RGB.y   = MAD UYV.z,   C1.w, RGB.y
-      */
-      struct prog_dst_register dst = inst->DstReg;
-      struct prog_src_register src0 = inst->SrcReg[0];
-      struct prog_dst_register tmp = get_temp(c);
-      struct prog_src_register tmpsrc = src_reg_from_dst(tmp);
-      struct prog_src_register C0 = search_or_add_const4f( c,  -.5, -.0625, -.5, 1.164 );
-      struct prog_src_register C1 = search_or_add_const4f( c, 1.596, -0.813, 2.018, -.391 );
-     
-      /* tmp     = TEX ...
-       */
-      emit_op(c, 
-	      OPCODE_TEX,
-	      tmp,
-	      inst->SaturateMode,
-	      inst->TexSrcUnit,
-	      inst->TexSrcTarget,
-	      src0,
-	      src_undef(),
-	      src_undef());
-
-      /* tmp.xyz =  ADD TMP, C0
-       */
-      emit_op(c,
-	      OPCODE_ADD,
-	      dst_mask(tmp, WRITEMASK_XYZ),
-	      0, 0, 0,
-	      tmpsrc,
-	      C0,
-	      src_undef());
-
-      /* YUV.y   = MUL YUV.y, C0.w
-       */
-      emit_op(c,
-	      OPCODE_MUL,
-	      dst_mask(tmp, WRITEMASK_Y),
-	      0, 0, 0,
-	      tmpsrc,
-	      src_swizzle1(C0, W),
-	      src_undef());
-
-      /* RGB.xyz = MAD YUV.xxz, C1, YUV.y
-       */
-      emit_op(c,
-	      OPCODE_MAD,
-	      dst_mask(dst, WRITEMASK_XYZ),
-	      0, 0, 0,
-	      src_swizzle(tmpsrc, X,X,Z,Z),
-	      C1,
-	      src_swizzle1(tmpsrc, Y));
-
-      /*  RGB.y   = MAD YUV.z, C1.w, RGB.y
-       */
-      emit_op(c,
-	      OPCODE_MAD,
-	      dst_mask(dst, WRITEMASK_Y),
-	      0, 0, 0,
-	      src_swizzle1(tmpsrc, Z),
-	      src_swizzle1(C1, W),
-	      src_swizzle1(src_reg_from_dst(dst), Y));
-
-      release_temp(c, tmp);
-   }
-
-   if (inst->TexSrcTarget == GL_TEXTURE_RECTANGLE_NV) 
-      release_temp(c, tmpcoord);
-}
-
-
-static boolean projtex( struct brw_wm_compile *c,
-			  const struct prog_instruction *inst )
-{
-   struct prog_src_register src = inst->SrcReg[0];
-
-   /* Only try to detect the simplest cases.  Could detect (later)
-    * cases where we are trying to emit code like RCP {1.0}, MUL x,
-    * {1.0}, and so on.
-    *
-    * More complex cases than this typically only arise from
-    * user-provided fragment programs anyway:
-    */
-   if (inst->TexSrcTarget == TEXTURE_CUBE_INDEX)
-      return 0;  /* ut2004 gun rendering !?! */
-   else if (src.File == PROGRAM_INPUT && 
-	    GET_SWZ(src.Swizzle, W) == W &&
-           (c->key.projtex_mask & (1<<(src.Index + FRAG_ATTRIB_WPOS - FRAG_ATTRIB_TEX0))) == 0)
-      return 0;
-   else
-      return 1;
-}
-
-
-static void precalc_txp( struct brw_wm_compile *c,
-			       const struct prog_instruction *inst )
-{
-   struct prog_src_register src0 = inst->SrcReg[0];
-
-   if (projtex(c, inst)) {
-      struct prog_dst_register tmp = get_temp(c);
-      struct prog_instruction tmp_inst;
-
-      /* tmp0.w = RCP inst.arg[0][3]
-       */
-      emit_op(c,
-	      OPCODE_RCP,
-	      dst_mask(tmp, WRITEMASK_W),
-	      0, 0, 0,
-	      src_swizzle1(src0, GET_SWZ(src0.Swizzle, W)),
-	      src_undef(),
-	      src_undef());
-
-      /* tmp0.xyz =  MUL inst.arg[0], tmp0.wwww
-       */
-      emit_op(c,
-	      OPCODE_MUL,
-	      dst_mask(tmp, WRITEMASK_XYZ),
-	      0, 0, 0,
-	      src0,
-	      src_swizzle1(src_reg_from_dst(tmp), W),
-	      src_undef());
-
-      /* dst = precalc(TEX tmp0)
-       */
-      tmp_inst = *inst;
-      tmp_inst.SrcReg[0] = src_reg_from_dst(tmp);
-      precalc_tex(c, &tmp_inst);
-
-      release_temp(c, tmp);
-   }
-   else
-   {
-      /* dst = precalc(TEX src0)
-       */
-      precalc_tex(c, inst);
-   }
-}
-
-
-
-
-
-/***********************************************************************
- * Add instructions to perform fog blending
- */
-
-static void fog_blend( struct brw_wm_compile *c,
-			     struct prog_src_register fog_factor )
-{
-   struct prog_dst_register outcolor = dst_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLR);
-   struct prog_src_register fogcolor = search_or_add_param5( c, STATE_FOG_COLOR, 0,0,0,0 );
-
-   /* color.xyz = LRP fog_factor.xxxx, output_color, fog_color */
-   
-   emit_op(c, 
-	   OPCODE_LRP,
-	   dst_mask(outcolor, WRITEMASK_XYZ),
-	   0, 0, 0,
-	   fog_factor,
-	   src_reg_from_dst(outcolor),
-	   fogcolor);
-}
-
-
-
-/* This one is simple - just take the interpolated fog coordinate and
- * use it as the fog blend factor.
- */
-static void fog_interpolated( struct brw_wm_compile *c )
-{
-   struct prog_src_register fogc = src_reg(PROGRAM_INPUT, FRAG_ATTRIB_FOGC);
-   
-   if (!(c->fp_interp_emitted & (1<<FRAG_ATTRIB_FOGC))) 
-      emit_interp(c, FRAG_ATTRIB_FOGC);
-
-   fog_blend( c, src_swizzle1(fogc, GET_SWZ(fogc.Swizzle,X)));
-}
-
-static void emit_fog( struct brw_wm_compile *c ) 
-{
-   if (!c->fp->program.FogOption)
-      return;
-
-   if (1) 
-      fog_interpolated( c );
-   else {
-      /* TODO: per-pixel fog */
-      assert(0);
-   }
-}
-
-static void emit_fb_write( struct brw_wm_compile *c )
-{
-   struct prog_src_register outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLR);
-   struct prog_src_register payload_r0_depth = src_reg(PROGRAM_PAYLOAD, PAYLOAD_DEPTH);
-   struct prog_src_register outdepth = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DEPR);
-
-   emit_op(c,
-	   WM_FB_WRITE,
-	   dst_mask(dst_undef(),0),
-	   0, 0, 0,
-	   outcolor,
-	   payload_r0_depth,
-	   outdepth);
-}
-
-
-
-
-/***********************************************************************
- * Emit INTERP instructions ahead of first use of each attrib.
- */
-
-static void validate_src_regs( struct brw_wm_compile *c,
-			       const struct prog_instruction *inst )
-{
-   unsigned nr_args = brw_wm_nr_args( inst->Opcode );
-   unsigned i;
-
-   for (i = 0; i < nr_args; i++) {
-      if (inst->SrcReg[i].File == PROGRAM_INPUT) {
-	 unsigned idx = inst->SrcReg[i].Index;
-	 if (!(c->fp_interp_emitted & (1<<idx))) {
-	    emit_interp(c, idx);
-	 }
-      }
-   }
-}
-	 
-
-
-static void print_insns( const struct prog_instruction *insn,
-			 unsigned nr )
-{
-   unsigned i;
-   for (i = 0; i < nr; i++, insn++) {
-      _mesa_printf("%3d: ", i);
-      if (insn->Opcode < MAX_OPCODE)
-	 _mesa_print_instruction(insn);
-      else if (insn->Opcode < MAX_WM_OPCODE) {
-	 unsigned idx = insn->Opcode - MAX_OPCODE;
-
-	 _mesa_print_alu_instruction(insn,
-				     wm_opcode_strings[idx],
-				     3);
-      }
-      else 
-	 _mesa_printf("UNKNOWN\n");
-	   
-   }
-}
-void brw_wm_pass_fp( struct brw_wm_compile *c )
-{
-   struct brw_fragment_program *fp = c->fp;
-   unsigned insn;
-   if (BRW_DEBUG & DEBUG_WM) {
-      _mesa_printf("\n\n\npre-fp:\n");
-      _mesa_print_program(&fp->program.Base); 
-      _mesa_printf("\n");
-   }
-
-   c->pixel_xy = src_undef();
-   c->delta_xy = src_undef();
-   c->pixel_w = src_undef();
-   c->nr_fp_insns = 0;
-
-   /* Emit preamble instructions:
-    */
-
-
-   for (insn = 0; insn < fp->program.Base.NumInstructions; insn++) {
-      const struct prog_instruction *inst = &fp->program.Base.Instructions[insn];
-      struct prog_instruction *out;
-
-      /* Check for INPUT values, emit INTERP instructions where
-       * necessary:
-       */
-      validate_src_regs(c, inst);
-
-
-      switch (inst->Opcode) {
-      case OPCODE_SWZ: 
-	 out = emit_insn(c, inst);
-	 out->Opcode = OPCODE_MOV;
-	 break;
-	 
-      case OPCODE_ABS:
-	 out = emit_insn(c, inst);
-	 out->Opcode = OPCODE_MOV;
-	 out->SrcReg[0].NegateBase = 0;
-	 out->SrcReg[0].Abs = 1;
-	 break;
-
-      case OPCODE_SUB: 
-	 out = emit_insn(c, inst);
-	 out->Opcode = OPCODE_ADD;
-	 out->SrcReg[1].NegateBase ^= 0xf;
-	 break;
-
-      case OPCODE_SCS: 
-	 out = emit_insn(c, inst);
-	 /* This should probably be done in the parser. 
-	  */
-	 out->DstReg.WriteMask &= WRITEMASK_XY;
-	 break;
-	 
-      case OPCODE_DST:
-	 precalc_dst(c, inst);
-	 break;
-
-      case OPCODE_LIT:
-	 precalc_lit(c, inst);
-	 break;
-     
-      case OPCODE_TXP:
-	 precalc_txp(c, inst);
-	 break;
-
-      case OPCODE_XPD: 
-	 out = emit_insn(c, inst);
-	 /* This should probably be done in the parser. 
-	  */
-	 out->DstReg.WriteMask &= WRITEMASK_XYZ;
-	 break;
-
-      case OPCODE_KIL: 
-	 out = emit_insn(c, inst);
-	 /* This should probably be done in the parser. 
-	  */
-	 out->DstReg.WriteMask = 0;
-	 break;
-      case OPCODE_DDX:
-	 emit_ddx(c, inst);
-	 break;
-      case OPCODE_DDY:
-         emit_ddy(c, inst);
-	break;
-      case OPCODE_END:
-	 emit_fog(c);
-	 emit_fb_write(c);
-	 break;
-      case OPCODE_PRINT:
-	 break;
-	 
-      default:
-	 emit_insn(c, inst);
-	 break;
-      }
-   }
-
-   if (BRW_DEBUG & DEBUG_WM) {
-	   _mesa_printf("\n\n\npass_fp:\n");
-	   print_insns( c->prog_instructions, c->nr_fp_insns );
-	   _mesa_printf("\n");
-   }
-}
-#endif
diff --git a/src/mesa/pipe/i965simple/brw_wm_glsl.c b/src/mesa/pipe/i965simple/brw_wm_glsl.c
index 90e73a605a..d6dfaed826 100644
--- a/src/mesa/pipe/i965simple/brw_wm_glsl.c
+++ b/src/mesa/pipe/i965simple/brw_wm_glsl.c
@@ -2,753 +2,437 @@
 #include "brw_context.h"
 #include "brw_eu.h"
 #include "brw_wm.h"
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "pipe/tgsi/util/tgsi_parse.h"
 
-#if 0
 
-/* Only guess, need a flag in gl_fragment_program later */
-boolean brw_wm_is_glsl(struct gl_fragment_program *fp)
-{
-    int i;
-    for (i = 0; i < fp->Base.NumInstructions; i++) {
-	struct prog_instruction *inst = &fp->Base.Instructions[i];
-	switch (inst->Opcode) {
-	    case OPCODE_IF:
-	    case OPCODE_INT:
-	    case OPCODE_ENDIF:
-	    case OPCODE_CAL:
-	    case OPCODE_BRK:
-	    case OPCODE_RET:
-	    case OPCODE_DDX:
-	    case OPCODE_DDY:
-	    case OPCODE_BGNLOOP:
-		return TRUE;
-	    default:
-		break;
-	}
-    }
-    return FALSE;
-}
-
-static void set_reg(struct brw_wm_compile *c, int file, int index,
-	int component, struct brw_reg reg)
-{
-    c->wm_regs[file][index][component].reg = reg;
-    c->wm_regs[file][index][component].inited = TRUE;
-}
 
-static int get_scalar_dst_index(struct prog_instruction *inst)
+static int get_scalar_dst_index(struct tgsi_full_instruction *inst)
 {
-    int i;
-    for (i = 0; i < 4; i++)
-	if (inst->DstReg.WriteMask & (1<<i))
-	    break;
-    return i;
+   struct tgsi_dst_register dst = inst->FullDstRegisters[0].DstRegister;
+   int i;
+   for (i = 0; i < 4; i++)   /* find the lowest channel enabled in the write mask */
+      if (dst.WriteMask & (1<<i))
+	 break;
+   return i;   /* 0..3 for X..W; 4 if no channel is enabled in the write mask */
 }
 
 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
 {
-    struct brw_reg reg;
-    reg = brw_vec8_grf(c->tmp_index--, 0);
-    return reg;
+   c->tmp_index++;   /* temporaries are numbered upward; release_tmps() resets the counter to 0 */
+   c->reg_index = MAX2(c->reg_index, c->tmp_index);   /* NOTE(review): compares against tmp_index, not tmp_start + tmp_index (the GRF returned below) -- confirm which value reg_index is meant to track */
+   return brw_vec8_grf(c->tmp_start + c->tmp_index, 0);   /* vec8 GRF number tmp_start + tmp_index */
 }
 
 static void release_tmps(struct brw_wm_compile *c)
 {
-    c->tmp_index = 127;
+   c->tmp_index = 0;   /* forget all temporaries: the next alloc_tmp() hands out tmp_start + 1 again */
 }
 
+
 static struct brw_reg
-get_reg(struct brw_wm_compile *c, int file, int index, int component, int nr, unsigned neg, unsigned abs)
+get_reg(struct brw_wm_compile *c, int file, int index, int component )
 {
-    struct brw_reg reg;
-    switch (file) {
-	case PROGRAM_STATE_VAR:
-	case PROGRAM_CONSTANT:
-	case PROGRAM_UNIFORM:
-	    file = PROGRAM_STATE_VAR;
-	    break;
-	case PROGRAM_UNDEFINED:
-	    return brw_null_reg();
-	default:
-	    break;
-    }
+   switch (file) {
+   case TGSI_FILE_NULL:
+      return brw_null_reg();
 
-    if(c->wm_regs[file][index][component].inited)
-	reg = c->wm_regs[file][index][component].reg;
-    else
-	reg = brw_vec8_grf(c->reg_index, 0);
+   case TGSI_FILE_SAMPLER:
+      /* Should never get here:
+       */
+      assert (0);	       
+      return brw_null_reg();
 
-    if(!c->wm_regs[file][index][component].inited) {
-	set_reg(c, file, index, component, reg);
-	c->reg_index++;
-    }
+   case TGSI_FILE_IMMEDIATE:
+      /* These need a different path:
+       */
+      assert(0);
+      return brw_null_reg();
 
-    if (neg & (1<< component)) {
-	reg = negate(reg);
-    }
-    if (abs)
-	reg = brw_abs(reg);
-    return reg;
-}
-
-static void prealloc_reg(struct brw_wm_compile *c)
-{
-    int i, j;
-    struct brw_reg reg;
-    int nr_interp_regs = 0;
-    unsigned inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
+       
+   case TGSI_FILE_CONSTANT:
+   case TGSI_FILE_INPUT:
+   case TGSI_FILE_OUTPUT:
+   case TGSI_FILE_TEMPORARY:
+   case TGSI_FILE_ADDRESS:
+      return c->wm_regs[file][index][component];
 
-    for (i = 0; i < 4; i++) {
-	reg = (i < c->key.nr_depth_regs)
-	    ? brw_vec8_grf(i*2, 0) : brw_vec8_grf(0, 0);
-	set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
-    }
-    c->reg_index += 2*c->key.nr_depth_regs;
-    {
-	int nr_params = c->fp->program.Base.Parameters->NumParameters;
-	struct gl_program_parameter_list *plist =
-	    c->fp->program.Base.Parameters;
-	int index = 0;
-	c->prog_data.nr_params = 4*nr_params;
-	for (i = 0; i < nr_params; i++) {
-	    for (j = 0; j < 4; j++, index++) {
-		reg = brw_vec1_grf(c->reg_index + index/8,
-			index%8);
-		c->prog_data.param[index] =
-		    &plist->ParameterValues[i][j];
-		set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
-	    }
-	}
-	c->nr_creg = 2*((4*nr_params+15)/16);
-	c->reg_index += c->nr_creg;
-    }
-    for (i = 0; i < FRAG_ATTRIB_MAX; i++) {
-	if (inputs & (1<<i)) {
-	    nr_interp_regs++;
-	    reg = brw_vec8_grf(c->reg_index, 0);
-	    for (j = 0; j < 4; j++)
-		set_reg(c, PROGRAM_PAYLOAD, i, j, reg);
-	    c->reg_index += 2;
-
-	}
-    }
-    c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
-    c->prog_data.urb_read_length = nr_interp_regs * 2;
-    c->prog_data.curb_read_length = c->nr_creg;
-    c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
-    c->reg_index++;
-    c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
-    c->reg_index += 2;
+   default:
+      assert(0);
+      return brw_null_reg();
+   }
 }
 
-static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
-	struct prog_instruction *inst, int component, int nr)
-{
-    return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
-	    0, 0);
-}
 
-static struct brw_reg get_src_reg(struct brw_wm_compile *c,
-	struct prog_src_register *src, int index, int nr)
-{
-    int component = GET_SWZ(src->Swizzle, index);
-    return get_reg(c, src->File, src->Index, component, nr,
-	    src->NegateBase, src->Abs);
-}
-
-static void emit_abs( struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
+				  struct tgsi_full_instruction *inst, 
+				  int component)
 {
-    int i;
-    struct brw_compile *p = &c->func;
-    brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
-    for (i = 0; i < 4; i++) {
-	if (inst->DstReg.WriteMask & (1<<i)) {
-	    struct brw_reg src, dst;
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	    brw_MOV(p, dst, brw_abs(src));
-	}
-    }
-    brw_set_saturate(p, 0);
+   return get_reg(c, 
+		  inst->FullDstRegisters[0].DstRegister.File, 
+		  inst->FullDstRegisters[0].DstRegister.Index,
+		  component);
 }
 
-static void emit_int( struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+static int get_swz( struct tgsi_src_register src, int index )
 {
-    int i;
-    struct brw_compile *p = &c->func;
-    unsigned mask = inst->DstReg.WriteMask;
-    brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
-    for (i = 0; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    struct brw_reg src, dst;
-	    dst = get_dst_reg(c, inst, i, 1) ;
-	    src = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	    brw_RNDD(p, dst, src);
-	}
-    }
-    brw_set_saturate(p, 0);
+   switch (index & 3) {
+   case 0: return src.SwizzleX;
+   case 1: return src.SwizzleY;
+   case 2: return src.SwizzleZ;
+   case 3: return src.SwizzleW;
+   default: return 0;
+   }
 }
 
-static void emit_mov( struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+static int get_ext_swz( struct tgsi_src_register_ext_swz src, int index )
 {
-    int i;
-    struct brw_compile *p = &c->func;
-    unsigned mask = inst->DstReg.WriteMask;
-    brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
-    for (i = 0; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    struct brw_reg src, dst;
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	    brw_MOV(p, dst, src);
-	}
-    }
-    brw_set_saturate(p, 0);
+   switch (index & 3) {
+   case 0: return src.ExtSwizzleX;
+   case 1: return src.ExtSwizzleY;
+   case 2: return src.ExtSwizzleZ;
+   case 3: return src.ExtSwizzleW;
+   default: return 0;
+   }
 }
 
-static void emit_pixel_xy(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+static struct brw_reg get_src_reg(struct brw_wm_compile *c,
+				  struct tgsi_full_src_register *src, 
+				  int index)
 {
-    struct brw_reg r1 = brw_vec1_grf(1, 0);
-    struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
+   struct brw_reg reg;
+   int component = index;   /* channel requested by the caller; remapped by the two swizzles below */
+   int neg = 0;
+   int abs = 0;
 
-    struct brw_reg dst0, dst1;
-    struct brw_compile *p = &c->func;
-    unsigned mask = inst->DstReg.WriteMask;
+   if (src->SrcRegister.Negate)
+      neg = 1;
 
-    dst0 = get_dst_reg(c, inst, 0, 1);
-    dst1 = get_dst_reg(c, inst, 1, 1);
-    /* Calculate pixel centers by adding 1 or 0 to each of the
-     * micro-tile coordinates passed in r1.
-     */
-    if (mask & WRITEMASK_X) {
-	brw_ADD(p,
-		vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
-		stride(suboffset(r1_uw, 4), 2, 4, 0),
-		brw_imm_v(0x10101010));
-    }
+   component = get_swz(src->SrcRegister, component);
 
-    if (mask & WRITEMASK_Y) {
-	brw_ADD(p,
-		vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
-		stride(suboffset(r1_uw, 5), 2, 4, 0),
-		brw_imm_v(0x11001100));
-    }
-
-}
-
-static void emit_delta_xy(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
-{
-    struct brw_reg r1 = brw_vec1_grf(1, 0);
-    struct brw_reg dst0, dst1, src0, src1;
-    struct brw_compile *p = &c->func;
-    unsigned mask = inst->DstReg.WriteMask;
+   /* Yes, there are multiple negates:
+    */
+   switch (component & 3) {
+   case 0: neg ^= src->SrcRegisterExtSwz.NegateX; break;
+   case 1: neg ^= src->SrcRegisterExtSwz.NegateY; break;
+   case 2: neg ^= src->SrcRegisterExtSwz.NegateZ; break;
+   case 3: neg ^= src->SrcRegisterExtSwz.NegateW; break;
+   }
 
-    dst0 = get_dst_reg(c, inst, 0, 1);
-    dst1 = get_dst_reg(c, inst, 1, 1);
-    src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-    src1 = get_src_reg(c, &inst->SrcReg[0], 1, 1);
-    /* Calc delta X,Y by subtracting origin in r1 from the pixel
-     * centers.
-     */
-    if (mask & WRITEMASK_X) {
-	brw_ADD(p,
-		dst0,
-		retype(src0, BRW_REGISTER_TYPE_UW),
-		negate(r1));
-    }
+   /* And multiple swizzles, fun isn't it:
+    */
+   component = get_ext_swz(src->SrcRegisterExtSwz, component);
 
-    if (mask & WRITEMASK_Y) {
-	brw_ADD(p,
-		dst1,
-		retype(src1, BRW_REGISTER_TYPE_UW),
-		negate(suboffset(r1,1)));
+   /* Can't handle this, don't know if we need to:
+    */
+   assert(src->SrcRegisterExtSwz.ExtDivide == 0);
+       
+   /* Not handling indirect lookups yet:
+    */
+   assert(src->SrcRegister.Indirect == 0);
 
-    }
+   /* Don't know what dimension means:
+    */
+   assert(src->SrcRegister.Dimension == 0);
 
-}
+   /* Will never handle any of this stuff: 
+    */
+   assert(src->SrcRegisterExtMod.Complement == 0);
+   assert(src->SrcRegisterExtMod.Bias == 0);
+   assert(src->SrcRegisterExtMod.Scale2X == 0);
 
+   if (src->SrcRegisterExtMod.Absolute)
+      abs = 1;
 
-static void fire_fb_write( struct brw_wm_compile *c,
-                           unsigned base_reg,
-                           unsigned nr )
-{
-    struct brw_compile *p = &c->func;
+   /* Another negate!  This is a post-absolute negate, which we
+    * can't do.  Need to clean the crap out of tgsi somehow.
+    */
+   assert(src->SrcRegisterExtMod.Negate == 0);
 
-    /* Pass through control information:
-     */
-    /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
-    {
-	brw_push_insn_state(p);
-	brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
-	brw_MOV(p,
-		brw_message_reg(base_reg + 1),
-		brw_vec8_grf(1, 0));
-	brw_pop_insn_state(p);
-    }
-    /* Send framebuffer write message: */
-    brw_fb_WRITE(p,
-	    retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
-	    base_reg,
-	    retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
-	    0,              /* render surface always 0 */
-	    nr,
-	    0,
-	    1);
-}
+   switch( component ) {
+   case TGSI_EXTSWIZZLE_X:
+   case TGSI_EXTSWIZZLE_Y:
+   case TGSI_EXTSWIZZLE_Z:
+   case TGSI_EXTSWIZZLE_W:
+      reg = get_reg(c, 
+		    src->SrcRegister.File, 
+		    src->SrcRegister.Index, 
+		    component );
 
-static void emit_fb_write(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    int nr = 2;
-    int channel;
-    struct brw_reg src0;//, src1, src2, dst;
+      if (neg) 
+	 reg = negate(reg);
+   
+      if (abs)
+	 reg = brw_abs(reg);
 
-    /* Reserve a space for AA - may not be needed:
-     */
-    if (c->key.aa_dest_stencil_reg)
-	nr += 1;
-    {
-	brw_push_insn_state(p);
-	for (channel = 0; channel < 4; channel++) {
-	    src0 = get_src_reg(c,  &inst->SrcReg[0], channel, 1);
-	    /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
-	    /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
-	    brw_MOV(p, brw_message_reg(nr + channel), src0);
-	}
-	/* skip over the regs populated above: */
-	nr += 8;
-	brw_pop_insn_state(p);
-    }
-    fire_fb_write(c, 0, nr);
-}
+      break;
 
-static void emit_pixel_w( struct brw_wm_compile *c,
-		struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    unsigned mask = inst->DstReg.WriteMask;
-    if (mask & WRITEMASK_W) {
-	struct brw_reg dst, src0, delta0, delta1;
-	struct brw_reg interp3;
+      /* XXX: this won't really work in the general case, but we know
+       * that the extended swizzle is only allowed in the SWZ
+       * instruction (right??), in which case using an immediate
+       * directly will work.
+       */
+   case TGSI_EXTSWIZZLE_ZERO:
+      reg = brw_imm_f(0);
+      break;
 
-	dst = get_dst_reg(c, inst, 3, 1);
-	src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-	delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
-	delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
+   case TGSI_EXTSWIZZLE_ONE:
+      if (neg && !abs)
+	 reg = brw_imm_f(-1.0);
+      else
+	 reg = brw_imm_f(1.0);
+      break;
 
-	interp3 = brw_vec1_grf(src0.nr+1, 4);
-	/* Calc 1/w - just linterp wpos[3] optimized by putting the
-	 * result straight into a message reg.
-	 */
-	brw_LINE(p, brw_null_reg(), interp3, delta0);
-	brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
+   default:
+      assert(0);
+      return brw_null_reg();   /* never return 'reg' uninitialised when asserts are compiled out */
+   }
 
-	/* Calc w */
-	brw_math_16( p, dst,
-		BRW_MATH_FUNCTION_INV,
-		BRW_MATH_SATURATE_NONE,
-		2, brw_null_reg(),
-		BRW_MATH_PRECISION_FULL);
-    }
+    
+   return reg;
 }
 
-static void emit_linterp(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    unsigned mask = inst->DstReg.WriteMask;
-    struct brw_reg interp[4];
-    struct brw_reg dst, delta0, delta1;
-    struct brw_reg src0;
-
-    src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-    delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
-    delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
-    unsigned nr = src0.nr;
-    int i;
-
-    interp[0] = brw_vec1_grf(nr, 0);
-    interp[1] = brw_vec1_grf(nr, 4);
-    interp[2] = brw_vec1_grf(nr+1, 0);
-    interp[3] = brw_vec1_grf(nr+1, 4);
-
-    for(i = 0; i < 4; i++ ) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    brw_LINE(p, brw_null_reg(), interp[i], delta0);
-	    brw_MAC(p, dst, suboffset(interp[i],1), delta1);
-	}
-    }
-}
-
-static void emit_cinterp(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+static void emit_abs( struct brw_wm_compile *c,
+		      struct tgsi_full_instruction *inst)
 {
-    struct brw_compile *p = &c->func;
-    unsigned mask = inst->DstReg.WriteMask;
+   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
 
-    struct brw_reg interp[4];
-    struct brw_reg dst, src0;
-
-    src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-    unsigned nr = src0.nr;
-    int i;
-
-    interp[0] = brw_vec1_grf(nr, 0);
-    interp[1] = brw_vec1_grf(nr, 4);
-    interp[2] = brw_vec1_grf(nr+1, 0);
-    interp[3] = brw_vec1_grf(nr+1, 4);
-
-    for(i = 0; i < 4; i++ ) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    brw_MOV(p, dst, suboffset(interp[i],3));
-	}
-    }
+   int i;
+   struct brw_compile *p = &c->func;
+   brw_set_saturate(p, inst->Instruction.Saturate != TGSI_SAT_NONE);
+   for (i = 0; i < 4; i++) {
+      if (mask & (1<<i)) {
+	 struct brw_reg src, dst;
+	 dst = get_dst_reg(c, inst, i);
+	 src = get_src_reg(c, &inst->FullSrcRegisters[0], i);
+	 brw_MOV(p, dst, brw_abs(src)); /* NOTE */
+      }
+   }
+   brw_set_saturate(p, 0);
 }
 
-static void emit_pinterp(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    unsigned mask = inst->DstReg.WriteMask;
-
-    struct brw_reg interp[4];
-    struct brw_reg dst, delta0, delta1;
-    struct brw_reg src0, w;
-
-    src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-    delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
-    delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
-    w = get_src_reg(c, &inst->SrcReg[2], 3, 1);
-    unsigned nr = src0.nr;
-    int i;
-
-    interp[0] = brw_vec1_grf(nr, 0);
-    interp[1] = brw_vec1_grf(nr, 4);
-    interp[2] = brw_vec1_grf(nr+1, 0);
-    interp[3] = brw_vec1_grf(nr+1, 4);
-
-    for(i = 0; i < 4; i++ ) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    brw_LINE(p, brw_null_reg(), interp[i], delta0);
-	    brw_MAC(p, dst, suboffset(interp[i],1),
-		    delta1);
-	    brw_MUL(p, dst, dst, w);
-	}
-    }
-}
 
 static void emit_xpd(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+		     struct tgsi_full_instruction *inst)
 {
-    int i;
-    struct brw_compile *p = &c->func;
-    unsigned mask = inst->DstReg.WriteMask;
-    for (i = 0; i < 4; i++) {
-	unsigned i2 = (i+2)%3;
-	unsigned i1 = (i+1)%3;
-	if (mask & (1<<i)) {
-	    struct brw_reg src0, src1, dst;
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = negate(get_src_reg(c, &inst->SrcReg[0], i2, 1));
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i1, 1);
-	    brw_MUL(p, brw_null_reg(), src0, src1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i1, 1);
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i2, 1);
-	    brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
-	    brw_MAC(p, dst, src0, src1);
-	    brw_set_saturate(p, 0);
-	}
-    }
-    brw_set_saturate(p, 0);
+   int i;
+   struct brw_compile *p = &c->func;
+   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   for (i = 0; i < 4; i++) {
+      unsigned i2 = (i+2)%3;
+      unsigned i1 = (i+1)%3;
+      if (mask & (1<<i)) {
+	 struct brw_reg src0, src1, dst;
+	 dst = get_dst_reg(c, inst, i);
+	 src0 = negate(get_src_reg(c, &inst->FullSrcRegisters[0], i2));
+	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i1);
+	 brw_MUL(p, brw_null_reg(), src0, src1);
+	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i1);
+	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i2);
+	 brw_set_saturate(p, inst->Instruction.Saturate != TGSI_SAT_NONE);
+	 brw_MAC(p, dst, src0, src1);
+	 brw_set_saturate(p, 0);
+      }
+   }
+   brw_set_saturate(p, 0);
 }
 
 static void emit_dp3(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+		     struct tgsi_full_instruction *inst)
 {
-    struct brw_reg src0[3], src1[3], dst;
-    int i;
-    struct brw_compile *p = &c->func;
-    for (i = 0; i < 3; i++) {
-	src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
-    }
+   struct brw_reg src0[3], src1[3], dst;
+   int i;
+   struct brw_compile *p = &c->func;
+   for (i = 0; i < 3; i++) {
+      src0[i] = get_src_reg(c, &inst->FullSrcRegisters[0], i);
+      src1[i] = get_src_reg(c, &inst->FullSrcRegisters[1], i);
+   }
 
-    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
-    brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
-    brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
-    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    brw_MAC(p, dst, src0[2], src1[2]);
-    brw_set_saturate(p, 0);
+   dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
+   brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
+   brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
+   brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
+   brw_MAC(p, dst, src0[2], src1[2]);
+   brw_set_saturate(p, 0);
 }
 
 static void emit_dp4(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+		     struct tgsi_full_instruction *inst)
 {
-    struct brw_reg src0[4], src1[4], dst;
-    int i;
-    struct brw_compile *p = &c->func;
-    for (i = 0; i < 4; i++) {
-	src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
-    }
-    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
-    brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
-    brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
-    brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
-    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    brw_MAC(p, dst, src0[3], src1[3]);
-    brw_set_saturate(p, 0);
+   struct brw_reg src0[4], src1[4], dst;
+   int i;
+   struct brw_compile *p = &c->func;
+   for (i = 0; i < 4; i++) {
+      src0[i] = get_src_reg(c, &inst->FullSrcRegisters[0], i);
+      src1[i] = get_src_reg(c, &inst->FullSrcRegisters[1], i);
+   }
+   dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
+   brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
+   brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
+   brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
+   brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
+   brw_MAC(p, dst, src0[3], src1[3]);
+   brw_set_saturate(p, 0);
 }
 
 static void emit_dph(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+		     struct tgsi_full_instruction *inst)
 {
-    struct brw_reg src0[4], src1[4], dst;
-    int i;
-    struct brw_compile *p = &c->func;
-    for (i = 0; i < 4; i++) {
-	src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
-    }
-    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
-    brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
-    brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
-    brw_MAC(p, dst, src0[2], src1[2]);
-    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    brw_ADD(p, dst, src0[3], src1[3]);
-    brw_set_saturate(p, 0);
+   struct brw_reg src0[4], src1[4], dst;
+   int i;
+   struct brw_compile *p = &c->func;
+   for (i = 0; i < 4; i++) {
+      src0[i] = get_src_reg(c, &inst->FullSrcRegisters[0], i);
+      src1[i] = get_src_reg(c, &inst->FullSrcRegisters[1], i);
+   }
+   dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
+   brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
+   brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
+   brw_MAC(p, dst, src0[2], src1[2]);   /* dst = three-component dot product of src0 and src1 */
+   brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
+   brw_ADD(p, dst, dst, src1[3]);   /* DPH adds src1.w to the dot product; src0.w takes no part in the result */
+   brw_set_saturate(p, 0);
 }
 
 static void emit_math1(struct brw_wm_compile *c,
-		struct prog_instruction *inst, unsigned func)
+		       struct tgsi_full_instruction *inst, unsigned func)
 {
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, dst;
+   struct brw_compile *p = &c->func;
+   struct brw_reg src0, dst;
 
-    src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
-    brw_MOV(p, brw_message_reg(2), src0);
-    brw_math(p,
+   src0 = get_src_reg(c, &inst->FullSrcRegisters[0], 0);
+   dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
+   brw_MOV(p, brw_message_reg(2), src0);
+   brw_math(p,
 	    dst,
 	    func,
-	    (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
+	    ((inst->Instruction.Saturate != TGSI_SAT_NONE) 
+	     ? BRW_MATH_SATURATE_SATURATE 
+	     : BRW_MATH_SATURATE_NONE),
 	    2,
 	    brw_null_reg(),
 	    BRW_MATH_DATA_VECTOR,
 	    BRW_MATH_PRECISION_FULL);
 }
 
-static void emit_rcp(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
-{
-    emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
-}
-
-static void emit_rsq(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
-{
-    emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
-}
-
-static void emit_sin(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
-{
-    emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
-}
-
-static void emit_cos(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
-{
-    emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
-}
-
-static void emit_ex2(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
-{
-    emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
-}
-
-static void emit_lg2(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
-{
-    emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
-}
 
-static void emit_add(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+static void emit_alu2(struct brw_wm_compile *c,		      
+		      struct tgsi_full_instruction *inst,
+		      unsigned opcode)
 {
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, src1, dst;
-    unsigned mask = inst->DstReg.WriteMask;
-    int i;
-    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
-	    brw_ADD(p, dst, src0, src1);
-	}
-    }
-    brw_set_saturate(p, 0);
+   struct brw_compile *p = &c->func;
+   struct brw_reg src0, src1, dst;
+   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   int i;
+   brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
+   for (i = 0 ; i < 4; i++) {
+      if (mask & (1<<i)) {
+	 dst = get_dst_reg(c, inst, i);
+	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i);
+	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i);
+	 brw_alu2(p, opcode, dst, src0, src1);
+      }
+   }
+   brw_set_saturate(p, 0);
 }
 
-static void emit_sub(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, src1, dst;
-    unsigned mask = inst->DstReg.WriteMask;
-    int i;
-    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
-	    brw_ADD(p, dst, src0, negate(src1));
-	}
-    }
-    brw_set_saturate(p, 0);
-}
-
-static void emit_mul(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, src1, dst;
-    unsigned mask = inst->DstReg.WriteMask;
-    int i;
-    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
-	    brw_MUL(p, dst, src0, src1);
-	}
-    }
-    brw_set_saturate(p, 0);
-}
 
-static void emit_frc(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+static void emit_alu1(struct brw_wm_compile *c,
+		      struct tgsi_full_instruction *inst,
+		      unsigned opcode)
 {
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, dst;
-    unsigned mask = inst->DstReg.WriteMask;
-    int i;
-    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	    brw_FRC(p, dst, src0);
-	}
-    }
-    if (inst->SaturateMode != SATURATE_OFF)
-	brw_set_saturate(p, 0);
+   struct brw_compile *p = &c->func;
+   struct brw_reg src0, dst;
+   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   int i;
+   brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
+   for (i = 0 ; i < 4; i++) {
+      if (mask & (1<<i)) {
+	 dst = get_dst_reg(c, inst, i);
+	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i);
+	 brw_alu1(p, opcode, dst, src0);
+      }
+   }
+   if (inst->Instruction.Saturate != TGSI_SAT_NONE)
+      brw_set_saturate(p, 0);
 }
 
-static void emit_flr(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, dst;
-    unsigned mask = inst->DstReg.WriteMask;
-    int i;
-    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	    brw_RNDD(p, dst, src0);
-	}
-    }
-    brw_set_saturate(p, 0);
-}
 
 static void emit_max(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+		     struct tgsi_full_instruction *inst)
 {
-    struct brw_compile *p = &c->func;
-    unsigned mask = inst->DstReg.WriteMask;
-    struct brw_reg src0, src1, dst;
-    int i;
-    brw_push_insn_state(p);
-    for (i = 0; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
-	    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-	    brw_MOV(p, dst, src0);
-	    brw_set_saturate(p, 0);
+   struct brw_compile *p = &c->func;
+   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   struct brw_reg src0, src1, dst;
+   int i;
+   brw_push_insn_state(p);
+   for (i = 0; i < 4; i++) {
+      if (mask & (1<<i)) {
+	 dst = get_dst_reg(c, inst, i);
+	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i);
+	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i);
+	 brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
+	 brw_MOV(p, dst, src0);
+	 brw_set_saturate(p, 0);
 
-	    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src0, src1);
-	    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-	    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
-	    brw_MOV(p, dst, src1);
-	    brw_set_saturate(p, 0);
-	    brw_set_predicate_control_flag_value(p, 0xff);
-	}
-    }
-    brw_pop_insn_state(p);
+	 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src0, src1);
+	 brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
+	 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+	 brw_MOV(p, dst, src1);
+	 brw_set_saturate(p, 0);
+	 brw_set_predicate_control_flag_value(p, 0xff);
+      }
+   }
+   brw_pop_insn_state(p);
 }
 
 static void emit_min(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+		     struct tgsi_full_instruction *inst)
 {
-    struct brw_compile *p = &c->func;
-    unsigned mask = inst->DstReg.WriteMask;
-    struct brw_reg src0, src1, dst;
-    int i;
-    brw_push_insn_state(p);
-    for (i = 0; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
-	    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-	    brw_MOV(p, dst, src0);
-	    brw_set_saturate(p, 0);
+   struct brw_compile *p = &c->func;
+   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   struct brw_reg src0, src1, dst;
+   int i;
+   brw_push_insn_state(p);
+   for (i = 0; i < 4; i++) {
+      if (mask & (1<<i)) {
+	 dst = get_dst_reg(c, inst, i);
+	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i);
+	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i);
+	 brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
+	 brw_MOV(p, dst, src0);
+	 brw_set_saturate(p, 0);
 
-	    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
-	    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-	    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
-	    brw_MOV(p, dst, src1);
-	    brw_set_saturate(p, 0);
-	    brw_set_predicate_control_flag_value(p, 0xff);
-	}
-    }
-    brw_pop_insn_state(p);
+	 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
+	 brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
+	 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+	 brw_MOV(p, dst, src1);
+	 brw_set_saturate(p, 0);
+	 brw_set_predicate_control_flag_value(p, 0xff);
+      }
+   }
+   brw_pop_insn_state(p);
 }
 
 static void emit_pow(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+		     struct tgsi_full_instruction *inst)
 {
-    struct brw_compile *p = &c->func;
-    struct brw_reg dst, src0, src1;
-    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
-    src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-    src1 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
+   struct brw_compile *p = &c->func;
+   struct brw_reg dst, src0, src1;
+   dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
+   src0 = get_src_reg(c, &inst->FullSrcRegisters[0], 0);
+   src1 = get_src_reg(c, &inst->FullSrcRegisters[1], 0);
 
-    brw_MOV(p, brw_message_reg(2), src0);
-    brw_MOV(p, brw_message_reg(3), src1);
+   brw_MOV(p, brw_message_reg(2), src0);
+   brw_MOV(p, brw_message_reg(3), src1);
 
-    brw_math(p,
+   brw_math(p,
 	    dst,
 	    BRW_MATH_FUNCTION_POW,
-	    (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
+	    (inst->Instruction.Saturate != TGSI_SAT_NONE 
+	     ? BRW_MATH_SATURATE_SATURATE 
+	     : BRW_MATH_SATURATE_NONE),
 	    2,
 	    brw_null_reg(),
 	    BRW_MATH_DATA_VECTOR,
@@ -756,601 +440,636 @@ static void emit_pow(struct brw_wm_compile *c,
 }
 
 static void emit_lrp(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+		     struct tgsi_full_instruction *inst)
 {
-    struct brw_compile *p = &c->func;
-    unsigned mask = inst->DstReg.WriteMask;
-    struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
-    int i;
-    for (i = 0; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
+   struct brw_compile *p = &c->func;
+   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
+   int i;
+   for (i = 0; i < 4; i++) {
+      if (mask & (1<<i)) {
+	 dst = get_dst_reg(c, inst, i);
+	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i);
 
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
+	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i);
 
-	    if (src1.nr == dst.nr) {
-		tmp1 = alloc_tmp(c);
-		brw_MOV(p, tmp1, src1);
-	    } else
-		tmp1 = src1;
+	 if (src1.nr == dst.nr) {
+	    tmp1 = alloc_tmp(c);
+	    brw_MOV(p, tmp1, src1);
+	 } else
+	    tmp1 = src1;
 
-	    src2 = get_src_reg(c, &inst->SrcReg[2], i, 1);
-	    if (src2.nr == dst.nr) {
-		tmp2 = alloc_tmp(c);
-		brw_MOV(p, tmp2, src2);
-	    } else
-		tmp2 = src2;
+	 src2 = get_src_reg(c, &inst->FullSrcRegisters[2], i);
+	 if (src2.nr == dst.nr) {
+	    tmp2 = alloc_tmp(c);
+	    brw_MOV(p, tmp2, src2);
+	 } else
+	    tmp2 = src2;
 
-	    brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
-	    brw_MUL(p, brw_null_reg(), dst, tmp2);
-	    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-	    brw_MAC(p, dst, src0, tmp1);
-	    brw_set_saturate(p, 0);
-	}
-	release_tmps(c);
-    }
+	 brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
+	 brw_MUL(p, brw_null_reg(), dst, tmp2);
+	 brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
+	 brw_MAC(p, dst, src0, tmp1);
+	 brw_set_saturate(p, 0);
+      }
+      release_tmps(c);
+   }
 }
 
 static void emit_kil(struct brw_wm_compile *c)
 {
-	struct brw_compile *p = &c->func;
-	struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
-	brw_push_insn_state(p);
-	brw_set_mask_control(p, BRW_MASK_DISABLE);
-	brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
-	brw_AND(p, depth, c->emit_mask_reg, depth);
-	brw_pop_insn_state(p);
+   struct brw_compile *p = &c->func;
+   struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
+   brw_push_insn_state(p);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
+   brw_AND(p, depth, c->emit_mask_reg, depth);
+   brw_pop_insn_state(p);
 }
 
 static void emit_mad(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+		     struct tgsi_full_instruction *inst)
 {
-    struct brw_compile *p = &c->func;
-    unsigned mask = inst->DstReg.WriteMask;
-    struct brw_reg dst, src0, src1, src2;
-    int i;
+   struct brw_compile *p = &c->func;
+   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   struct brw_reg dst, src0, src1, src2;
+   int i;
 
-    for (i = 0; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
-	    src2 = get_src_reg(c, &inst->SrcReg[2], i, 1);
-	    brw_MUL(p, dst, src0, src1);
+   for (i = 0; i < 4; i++) {
+      if (mask & (1<<i)) {
+	 dst = get_dst_reg(c, inst, i);
+	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i);
+	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i);
+	 src2 = get_src_reg(c, &inst->FullSrcRegisters[2], i);
+	 brw_MUL(p, dst, src0, src1);
 
-	    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-	    brw_ADD(p, dst, dst, src2);
-	    brw_set_saturate(p, 0);
-	}
-    }
+	 brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
+	 brw_ADD(p, dst, dst, src2);
+	 brw_set_saturate(p, 0);
+      }
+   }
 }
 
 static void emit_sop(struct brw_wm_compile *c,
-		struct prog_instruction *inst, unsigned cond)
+		     struct tgsi_full_instruction *inst, unsigned cond)
 {
-    struct brw_compile *p = &c->func;
-    unsigned mask = inst->DstReg.WriteMask;
-    struct brw_reg dst, src0, src1;
-    int i;
+   struct brw_compile *p = &c->func;
+   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   struct brw_reg dst, src0, src1;
+   int i;
 
-    brw_push_insn_state(p);
-    for (i = 0; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
-	    brw_CMP(p, brw_null_reg(), cond, src0, src1);
-	    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-	    brw_MOV(p, dst, brw_imm_f(0.0));
-	    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
-	    brw_MOV(p, dst, brw_imm_f(1.0));
-	}
-    }
-    brw_pop_insn_state(p);
+   brw_push_insn_state(p);
+   for (i = 0; i < 4; i++) {
+      if (mask & (1<<i)) {
+	 dst = get_dst_reg(c, inst, i);
+	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i);
+	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i);
+	 brw_CMP(p, brw_null_reg(), cond, src0, src1);
+	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+	 brw_MOV(p, dst, brw_imm_f(0.0));
+	 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+	 brw_MOV(p, dst, brw_imm_f(1.0));
+      }
+   }
+   brw_pop_insn_state(p);
 }
 
-static void emit_slt(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
-{
-    emit_sop(c, inst, BRW_CONDITIONAL_L);
-}
 
-static void emit_sle(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+static void emit_ddx(struct brw_wm_compile *c,
+		     struct tgsi_full_instruction *inst)
 {
-    emit_sop(c, inst, BRW_CONDITIONAL_LE);
+   struct brw_compile *p = &c->func;
+   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   struct brw_reg interp[4];
+   struct brw_reg dst;
+   struct brw_reg src0, w;
+   unsigned nr, i;
+   src0 = get_src_reg(c, &inst->FullSrcRegisters[0], 0);
+   w = get_src_reg(c, &inst->FullSrcRegisters[1], 3);
+   nr = src0.nr;
+   interp[0] = brw_vec1_grf(nr, 0);
+   interp[1] = brw_vec1_grf(nr, 4);
+   interp[2] = brw_vec1_grf(nr+1, 0);
+   interp[3] = brw_vec1_grf(nr+1, 4);
+   brw_set_saturate(p, inst->Instruction.Saturate != TGSI_SAT_NONE);
+   for(i = 0; i < 4; i++ ) {
+      if (mask & (1<<i)) {
+	 dst = get_dst_reg(c, inst, i);
+	 brw_MOV(p, dst, interp[i]);
+	 brw_MUL(p, dst, dst, w);
+      }
+   }
+   brw_set_saturate(p, 0);
 }
 
-static void emit_sgt(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+static void emit_ddy(struct brw_wm_compile *c,
+		     struct tgsi_full_instruction *inst)
 {
-    emit_sop(c, inst, BRW_CONDITIONAL_G);
-}
+   struct brw_compile *p = &c->func;
+   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   struct brw_reg interp[4];
+   struct brw_reg dst;
+   struct brw_reg src0, w;
+   unsigned nr, i;
 
-static void emit_sge(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
-{
-    emit_sop(c, inst, BRW_CONDITIONAL_GE);
+   src0 = get_src_reg(c, &inst->FullSrcRegisters[0], 0);
+   nr = src0.nr;
+   w = get_src_reg(c, &inst->FullSrcRegisters[1], 3);
+   interp[0] = brw_vec1_grf(nr, 0);
+   interp[1] = brw_vec1_grf(nr, 4);
+   interp[2] = brw_vec1_grf(nr+1, 0);
+   interp[3] = brw_vec1_grf(nr+1, 4);
+   brw_set_saturate(p, inst->Instruction.Saturate != TGSI_SAT_NONE);
+   for(i = 0; i < 4; i++ ) {
+      if (mask & (1<<i)) {
+	 dst = get_dst_reg(c, inst, i);
+	 brw_MOV(p, dst, suboffset(interp[i], 1));
+	 brw_MUL(p, dst, dst, w);
+      }
+   }
+   brw_set_saturate(p, 0);
 }
 
-static void emit_seq(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+/* TODO
+   BIAS on SIMD8 not working yet...
+*/
+static void emit_txb(struct brw_wm_compile *c,
+		     struct tgsi_full_instruction *inst)
 {
-    emit_sop(c, inst, BRW_CONDITIONAL_EQ);
-}
+#if 0
+   struct brw_compile *p = &c->func;
+   struct brw_reg payload_reg = c->payload_depth[0];
+   struct brw_reg dst[4], src[4];
+   unsigned i;
+   for (i = 0; i < 4; i++)
+      dst[i] = get_dst_reg(c, inst, i);
+   for (i = 0; i < 4; i++)
+      src[i] = get_src_reg(c, &inst->FullSrcRegisters[0], i);
 
-static void emit_sne(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
-{
-    emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
-}
+#if 0
+   switch (inst->TexSrcTarget) {
+   case TEXTURE_1D_INDEX:
+      brw_MOV(p, brw_message_reg(2), src[0]);
+      brw_MOV(p, brw_message_reg(3), brw_imm_f(0));
+      brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
+      break;
+   case TEXTURE_2D_INDEX:
+   case TEXTURE_RECT_INDEX:
+      brw_MOV(p, brw_message_reg(2), src[0]);
+      brw_MOV(p, brw_message_reg(3), src[1]);
+      brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
+      break;
+   default:
+      brw_MOV(p, brw_message_reg(2), src[0]);
+      brw_MOV(p, brw_message_reg(3), src[1]);
+      brw_MOV(p, brw_message_reg(4), src[2]);
+      break;
+   }
+#else
+   brw_MOV(p, brw_message_reg(2), src[0]);
+   brw_MOV(p, brw_message_reg(3), src[1]);
+   brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
+#endif
 
-static void emit_ddx(struct brw_wm_compile *c,
-                struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    unsigned mask = inst->DstReg.WriteMask;
-    struct brw_reg interp[4];
-    struct brw_reg dst;
-    struct brw_reg src0, w;
-    unsigned nr, i;
-    src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-    w = get_src_reg(c, &inst->SrcReg[1], 3, 1);
-    nr = src0.nr;
-    interp[0] = brw_vec1_grf(nr, 0);
-    interp[1] = brw_vec1_grf(nr, 4);
-    interp[2] = brw_vec1_grf(nr+1, 0);
-    interp[3] = brw_vec1_grf(nr+1, 4);
-    brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
-    for(i = 0; i < 4; i++ ) {
-        if (mask & (1<<i)) {
-            dst = get_dst_reg(c, inst, i, 1);
-            brw_MOV(p, dst, interp[i]);
-            brw_MUL(p, dst, dst, w);
-        }
-    }
-    brw_set_saturate(p, 0);
+   brw_MOV(p, brw_message_reg(5), src[3]);
+   brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
+   brw_SAMPLE(p,
+	      retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
+	      1,
+	      retype(payload_reg, BRW_REGISTER_TYPE_UW),
+	      inst->TexSrcUnit + 1, /* surface */
+	      inst->TexSrcUnit,     /* sampler */
+	      inst->FullDstRegisters[0].DstRegister.WriteMask,
+	      BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS,
+	      4,
+	      4,
+	      0);
+#endif
 }
 
-static void emit_ddy(struct brw_wm_compile *c,
-                struct prog_instruction *inst)
+static void emit_tex(struct brw_wm_compile *c,
+		     struct tgsi_full_instruction *inst)
 {
-    struct brw_compile *p = &c->func;
-    unsigned mask = inst->DstReg.WriteMask;
-    struct brw_reg interp[4];
-    struct brw_reg dst;
-    struct brw_reg src0, w;
-    unsigned nr, i;
+#if 0
+   struct brw_compile *p = &c->func;
+   struct brw_reg payload_reg = c->payload_depth[0];
+   struct brw_reg dst[4], src[4];
+   unsigned msg_len;
+   unsigned i, nr;
+   unsigned emit;
+   boolean shadow = (c->key.shadowtex_mask & (1<<inst->TexSrcUnit)) ? 1 : 0;
 
-    src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-    nr = src0.nr;
-    w = get_src_reg(c, &inst->SrcReg[1], 3, 1);
-    interp[0] = brw_vec1_grf(nr, 0);
-    interp[1] = brw_vec1_grf(nr, 4);
-    interp[2] = brw_vec1_grf(nr+1, 0);
-    interp[3] = brw_vec1_grf(nr+1, 4);
-    brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
-    for(i = 0; i < 4; i++ ) {
-        if (mask & (1<<i)) {
-            dst = get_dst_reg(c, inst, i, 1);
-            brw_MOV(p, dst, suboffset(interp[i], 1));
-            brw_MUL(p, dst, dst, w);
-        }
-    }
-    brw_set_saturate(p, 0);
-}
+   for (i = 0; i < 4; i++)
+      dst[i] = get_dst_reg(c, inst, i);
+   for (i = 0; i < 4; i++)
+      src[i] = get_src_reg(c, &inst->FullSrcRegisters[0], i);
 
-static void emit_wpos_xy(struct brw_wm_compile *c,
-                struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    unsigned mask = inst->DstReg.WriteMask;
-    struct brw_reg src0[2], dst[2];
+#if 0
+   switch (inst->TexSrcTarget) {
+   case TEXTURE_1D_INDEX:
+      emit = WRITEMASK_X;
+      nr = 1;
+      break;
+   case TEXTURE_2D_INDEX:
+   case TEXTURE_RECT_INDEX:
+      emit = WRITEMASK_XY;
+      nr = 2;
+      break;
+   default:
+      emit = WRITEMASK_XYZ;
+      nr = 3;
+      break;
+   }
+#else
+   emit = WRITEMASK_XY;
+   nr = 2;
+#endif
 
-    dst[0] = get_dst_reg(c, inst, 0, 1);
-    dst[1] = get_dst_reg(c, inst, 1, 1);
+   msg_len = 1;
 
-    src0[0] = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-    src0[1] = get_src_reg(c, &inst->SrcReg[0], 1, 1);
+   for (i = 0; i < nr; i++) {
+      static const unsigned swz[4] = {0,1,2,2};
+      if (emit & (1<<i))
+	 brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
+      else
+	 brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
+      msg_len += 1;
+   }
 
-    /* Calc delta X,Y by subtracting origin in r1 from the pixel
-     * centers.
-     */
-    if (mask & WRITEMASK_X) {
-	brw_MOV(p,
-		dst[0],
-		retype(src0[0], BRW_REGISTER_TYPE_UW));
-    }
+   if (shadow) {
+      brw_MOV(p, brw_message_reg(5), brw_imm_f(0));
+      brw_MOV(p, brw_message_reg(6), src[2]);
+   }
 
-    if (mask & WRITEMASK_Y) {
-	/* TODO -- window_height - Y */
-	brw_MOV(p,
-		dst[1],
-		retype(src0[1], BRW_REGISTER_TYPE_UW));
+   brw_SAMPLE(p,
+	      retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
+	      1,
+	      retype(payload_reg, BRW_REGISTER_TYPE_UW),
+	      inst->TexSrcUnit + 1, /* surface */
+	      inst->TexSrcUnit,     /* sampler */
+	      inst->FullDstRegisters[0].DstRegister.WriteMask,
+	      BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE,
+	      4,
+	      shadow ? 6 : 4,
+	      0);
 
-    }
+   if (shadow)
+      brw_MOV(p, dst[3], brw_imm_f(1.0));
+#endif
 }
 
-/* TODO
-   BIAS on SIMD8 not workind yet...
- */
-static void emit_txb(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg dst[4], src[4], payload_reg;
-    unsigned i;
-    payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
-    for (i = 0; i < 4; i++)
-	dst[i] = get_dst_reg(c, inst, i, 1);
-    for (i = 0; i < 4; i++)
-	src[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
 
-    switch (inst->TexSrcTarget) {
-	case TEXTURE_1D_INDEX:
-	    brw_MOV(p, brw_message_reg(2), src[0]);
-	    brw_MOV(p, brw_message_reg(3), brw_imm_f(0));
-	    brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
-	    break;
-	case TEXTURE_2D_INDEX:
-	case TEXTURE_RECT_INDEX:
-	    brw_MOV(p, brw_message_reg(2), src[0]);
-	    brw_MOV(p, brw_message_reg(3), src[1]);
-	    brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
-	    break;
-	default:
-	    brw_MOV(p, brw_message_reg(2), src[0]);
-	    brw_MOV(p, brw_message_reg(3), src[1]);
-	    brw_MOV(p, brw_message_reg(4), src[2]);
-	    break;
-    }
-    brw_MOV(p, brw_message_reg(5), src[3]);
-    brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
-    brw_SAMPLE(p,
-	    retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
-	    1,
-	    retype(payload_reg, BRW_REGISTER_TYPE_UW),
-	    inst->TexSrcUnit + 1, /* surface */
-	    inst->TexSrcUnit,     /* sampler */
-	    inst->DstReg.WriteMask,
-	    BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS,
-	    4,
-	    4,
-	    0);
-}
 
-static void emit_tex(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+
+
+
+
+
+static void emit_fb_write(struct brw_wm_compile *c,
+			  struct tgsi_full_instruction *inst)
 {
-    struct brw_compile *p = &c->func;
-    struct brw_reg dst[4], src[4], payload_reg;
-    unsigned msg_len;
-    unsigned i, nr;
-    unsigned emit;
-    boolean shadow = (c->key.shadowtex_mask & (1<<inst->TexSrcUnit)) ? 1 : 0;
+   struct brw_compile *p = &c->func;
+   int nr = 2;
+   int channel;
+   int base_reg = 0;
 
-    payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
+   // src0 = output color
+   // src1 = payload_depth[0]
+   // src2 = output depth
+   // dst = ???
 
-    for (i = 0; i < 4; i++)
-	dst[i] = get_dst_reg(c, inst, i, 1);
-    for (i = 0; i < 4; i++)
-	src[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
 
 
-    switch (inst->TexSrcTarget) {
-	case TEXTURE_1D_INDEX:
-	    emit = WRITEMASK_X;
-	    nr = 1;
-	    break;
-	case TEXTURE_2D_INDEX:
-	case TEXTURE_RECT_INDEX:
-	    emit = WRITEMASK_XY;
-	    nr = 2;
-	    break;
-	default:
-	    emit = WRITEMASK_XYZ;
-	    nr = 3;
-	    break;
-    }
-    msg_len = 1;
+   /* Reserve a space for AA - may not be needed:
+    */
+   if (c->key.aa_dest_stencil_reg)
+      nr += 1;
 
-    for (i = 0; i < nr; i++) {
-	static const unsigned swz[4] = {0,1,2,2};
-	if (emit & (1<<i))
-	    brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
-	else
-	    brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
-	msg_len += 1;
-    }
+   {
+      brw_push_insn_state(p);
+      for (channel = 0; channel < 4; channel++) {
+	 struct brw_reg src0 = c->wm_regs[TGSI_FILE_OUTPUT][0][channel];
 
-    if (shadow) {
-	brw_MOV(p, brw_message_reg(5), brw_imm_f(0));
-	brw_MOV(p, brw_message_reg(6), src[2]);
-    }
+	 /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
+	 /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
+	 brw_MOV(p, brw_message_reg(nr + channel), src0);
+      }
+      /* skip over the regs populated above: */
+      nr += 8;
+      brw_pop_insn_state(p);
+   }
+    
 
-    brw_SAMPLE(p,
-	    retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
-	    1,
-	    retype(payload_reg, BRW_REGISTER_TYPE_UW),
-	    inst->TexSrcUnit + 1, /* surface */
-	    inst->TexSrcUnit,     /* sampler */
-	    inst->DstReg.WriteMask,
-	    BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE,
-	    4,
-	    shadow ? 6 : 4,
-	    0);
+   /* Pass through control information:
+    */
+   /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
+   {
+      brw_push_insn_state(p);
+      brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
+      brw_MOV(p,
+	      brw_message_reg(base_reg + 1),
+	      brw_vec8_grf(1, 0));
+      brw_pop_insn_state(p);
+   }
 
-    if (shadow)
-	brw_MOV(p, dst[3], brw_imm_f(1.0));
-}
+   /* Send framebuffer write message: */
+   brw_fb_WRITE(p,
+		retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
+		base_reg,
+		retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
+		0,              /* render surface always 0 */
+		nr,
+		0,
+		1);
 
-static void post_wm_emit( struct brw_wm_compile *c )
-{
-    unsigned nr_insns = c->fp->program.Base.NumInstructions;
-    unsigned insn, target_insn;
-    struct prog_instruction *inst1, *inst2;
-    struct brw_instruction *brw_inst1, *brw_inst2;
-    int offset;
-    for (insn = 0; insn < nr_insns; insn++) {
-	inst1 = &c->fp->program.Base.Instructions[insn];
-	brw_inst1 = inst1->Data;
-	switch (inst1->Opcode) {
-	    case OPCODE_CAL:
-		target_insn = inst1->BranchTarget;
-		inst2 = &c->fp->program.Base.Instructions[target_insn];
-		brw_inst2 = inst2->Data;
-		offset = brw_inst2 - brw_inst1;
-		brw_set_src1(brw_inst1, brw_imm_d(offset*16));
-		break;
-	    default:
-		break;
-	}
-    }
 }
 
-static void brw_wm_emit_glsl(struct brw_wm_compile *c)
 
+static void brw_wm_emit_instruction( struct brw_wm_compile *c,
+				     struct tgsi_full_instruction *inst )
 {
-#define MAX_IFSN 32
-#define MAX_LOOP_DEPTH 32
-    struct brw_instruction *if_inst[MAX_IFSN], *loop_inst[MAX_LOOP_DEPTH];
-    struct brw_instruction *inst0, *inst1;
-    int i, if_insn = 0, loop_insn = 0;
-    struct brw_compile *p = &c->func;
-    struct brw_indirect stack_index = brw_indirect(0, 0);
+   struct brw_compile *p = &c->func;
 
-    brw_init_compile(&c->func);
-    c->reg_index = 0;
-    prealloc_reg(c);
-    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-    brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
+#if 0   
+   if (inst->CondUpdate)
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+   else
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
+#else
+   brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
+#endif
 
-    for (i = 0; i < c->nr_fp_insns; i++) {
-	struct prog_instruction *inst = &c->prog_instructions[i];
-	struct prog_instruction *orig_inst;
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_ABS:
+      emit_abs(c, inst);
+      break;
+   case TGSI_OPCODE_ADD:
+      emit_alu2(c, inst, BRW_OPCODE_ADD);
+      break;
+   case TGSI_OPCODE_SUB:
+      assert(0);
+//      emit_alu2(c, inst, BRW_OPCODE_SUB);
+      break;
+   case TGSI_OPCODE_FRC:
+      emit_alu1(c, inst, BRW_OPCODE_FRC);
+      break;
+   case TGSI_OPCODE_FLR:
+      assert(0);
+//      emit_alu1(c, inst, BRW_OPCODE_FLR);
+      break;
+   case TGSI_OPCODE_LRP:
+      emit_lrp(c, inst);
+      break;
+   case TGSI_OPCODE_INT:
+      emit_alu1(c, inst, BRW_OPCODE_RNDD);
+      break;
+   case TGSI_OPCODE_MOV:
+      emit_alu1(c, inst, BRW_OPCODE_MOV);
+      break;
+   case TGSI_OPCODE_DP3:
+      emit_dp3(c, inst);
+      break;
+   case TGSI_OPCODE_DP4:
+      emit_dp4(c, inst);
+      break;
+   case TGSI_OPCODE_XPD:
+      emit_xpd(c, inst);
+      break;
+   case TGSI_OPCODE_DPH:
+      emit_dph(c, inst);
+      break;
+   case TGSI_OPCODE_RCP:
+      emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
+      break;
+   case TGSI_OPCODE_RSQ:
+      emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
+      break;
+   case TGSI_OPCODE_SIN:
+      emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
+      break;
+   case TGSI_OPCODE_COS:
+      emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
+      break;
+   case TGSI_OPCODE_EX2:
+      emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
+      break;
+   case TGSI_OPCODE_LG2:
+      emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
+      break;
+   case TGSI_OPCODE_MAX:
+      emit_max(c, inst);
+      break;
+   case TGSI_OPCODE_MIN:
+      emit_min(c, inst);
+      break;
+   case TGSI_OPCODE_DDX:
+      emit_ddx(c, inst);
+      break;
+   case TGSI_OPCODE_DDY:
+      emit_ddy(c, inst);
+      break;
+   case TGSI_OPCODE_SLT:
+      emit_sop(c, inst, BRW_CONDITIONAL_L);
+      break;
+   case TGSI_OPCODE_SLE:
+      emit_sop(c, inst, BRW_CONDITIONAL_LE);
+      break;
+   case TGSI_OPCODE_SGT:
+      emit_sop(c, inst, BRW_CONDITIONAL_G);
+      break;
+   case TGSI_OPCODE_SGE:
+      emit_sop(c, inst, BRW_CONDITIONAL_GE);
+      break;
+   case TGSI_OPCODE_SEQ:
+      emit_sop(c, inst, BRW_CONDITIONAL_EQ);
+      break;
+   case TGSI_OPCODE_SNE:
+      emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
+      break;
+   case TGSI_OPCODE_MUL:
+      emit_alu2(c, inst, BRW_OPCODE_MUL);
+      break;
+   case TGSI_OPCODE_POW:
+      emit_pow(c, inst);
+      break;
+   case TGSI_OPCODE_MAD:
+      emit_mad(c, inst);
+      break;
+   case TGSI_OPCODE_TEX:
+      emit_tex(c, inst);
+      break;
+   case TGSI_OPCODE_TXB:
+      emit_txb(c, inst);
+      break;
+   case TGSI_OPCODE_TEXKILL:
+      emit_kil(c);
+      break;
+   case TGSI_OPCODE_IF:
+      assert(c->if_insn < MAX_IFSN);
+      c->if_inst[c->if_insn++] = brw_IF(p, BRW_EXECUTE_8);
+      break;
+   case TGSI_OPCODE_ELSE:
+      c->if_inst[c->if_insn-1]  = brw_ELSE(p, c->if_inst[c->if_insn-1]);
+      break;
+   case TGSI_OPCODE_ENDIF:
+      assert(c->if_insn > 0);
+      brw_ENDIF(p, c->if_inst[--c->if_insn]);
+      break;
+   case TGSI_OPCODE_BGNSUB:
+   case TGSI_OPCODE_ENDSUB:
+      break;
+   case TGSI_OPCODE_CAL:
+      brw_push_insn_state(p);
+      brw_set_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_access_mode(p, BRW_ALIGN_1);
+      brw_ADD(p, deref_1ud(c->stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
+      brw_set_access_mode(p, BRW_ALIGN_16);
+      brw_ADD(p, 
+	      get_addr_reg(c->stack_index),
+	      get_addr_reg(c->stack_index), brw_imm_d(4));
+//      orig_inst = inst->Data;
+//      orig_inst->Data = &p->store[p->nr_insn];
+      assert(0);
+      brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
+      brw_pop_insn_state(p);
+      break;
+
+   case TGSI_OPCODE_RET:
+      brw_push_insn_state(p);
+      brw_set_mask_control(p, BRW_MASK_DISABLE);
+      brw_ADD(p, 
+	      get_addr_reg(c->stack_index),
+	      get_addr_reg(c->stack_index), brw_imm_d(-4));
+      brw_set_access_mode(p, BRW_ALIGN_1);
+      brw_MOV(p, brw_ip_reg(), deref_1ud(c->stack_index, 0));
+      brw_set_access_mode(p, BRW_ALIGN_16);
+      brw_pop_insn_state(p);
+
+      break;
+   case TGSI_OPCODE_LOOP:
+      c->loop_inst[c->loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
+      break;
+   case TGSI_OPCODE_BRK:
+      brw_BREAK(p);
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+      break;
+   case TGSI_OPCODE_CONT:
+      brw_CONT(p);
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+      break;
+   case TGSI_OPCODE_ENDLOOP:
+      c->loop_insn--;
+      c->inst0 = c->inst1 = brw_WHILE(p, c->loop_inst[c->loop_insn]);
+      /* patch all the BREAK and CONTINUE instructions
+	 emitted since the matching LOOP */
+      while (c->inst0 > c->loop_inst[c->loop_insn]) {
+	 c->inst0--;
+	 if (c->inst0->header.opcode == BRW_OPCODE_BREAK) {
+	    c->inst0->bits3.if_else.jump_count = c->inst1 - c->inst0 + 1;
+	    c->inst0->bits3.if_else.pop_count = 0;
+	 } else if (c->inst0->header.opcode == BRW_OPCODE_CONTINUE) {
+	    c->inst0->bits3.if_else.jump_count = c->inst1 - c->inst0;
+	    c->inst0->bits3.if_else.pop_count = 0;
+	 }
+      }
+      break;
+   case TGSI_OPCODE_END:
+      emit_fb_write(c, inst);
+      break;
+
+   default:
+      _mesa_printf("unsupported IR in fragment shader %d\n",
+		   inst->Instruction.Opcode);
+   }
+#if 0
+   if (inst->CondUpdate)
+      brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+   else
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+#endif
+}
 
-	if ((orig_inst = inst->Data) != 0)
-	    orig_inst->Data = current_insn(p);
 
-	if (inst->CondUpdate)
-	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
-	else
-	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
 
-	switch (inst->Opcode) {
-	    case WM_PIXELXY:
-		emit_pixel_xy(c, inst);
-		break;
-	    case WM_DELTAXY:
-		emit_delta_xy(c, inst);
-		break;
-	    case WM_PIXELW:
-		emit_pixel_w(c, inst);
-		break;
-	    case WM_LINTERP:
-		emit_linterp(c, inst);
-		break;
-	    case WM_PINTERP:
-		emit_pinterp(c, inst);
-		break;
-	    case WM_CINTERP:
-		emit_cinterp(c, inst);
-		break;
-	    case WM_WPOSXY:
-		emit_wpos_xy(c, inst);
-		break;
-	    case WM_FB_WRITE:
-		emit_fb_write(c, inst);
-		break;
-	    case OPCODE_ABS:
-		emit_abs(c, inst);
-		break;
-	    case OPCODE_ADD:
-		emit_add(c, inst);
-		break;
-	    case OPCODE_SUB:
-		emit_sub(c, inst);
-		break;
-	    case OPCODE_FRC:
-		emit_frc(c, inst);
-		break;
-	    case OPCODE_FLR:
-		emit_flr(c, inst);
-		break;
-	    case OPCODE_LRP:
-		emit_lrp(c, inst);
-		break;
-	    case OPCODE_INT:
-		emit_int(c, inst);
-		break;
-	    case OPCODE_MOV:
-		emit_mov(c, inst);
-		break;
-	    case OPCODE_DP3:
-		emit_dp3(c, inst);
-		break;
-	    case OPCODE_DP4:
-		emit_dp4(c, inst);
-		break;
-	    case OPCODE_XPD:
-		emit_xpd(c, inst);
-		break;
-	    case OPCODE_DPH:
-		emit_dph(c, inst);
-		break;
-	    case OPCODE_RCP:
-		emit_rcp(c, inst);
-		break;
-	    case OPCODE_RSQ:
-		emit_rsq(c, inst);
-		break;
-	    case OPCODE_SIN:
-		emit_sin(c, inst);
-		break;
-	    case OPCODE_COS:
-		emit_cos(c, inst);
-		break;
-	    case OPCODE_EX2:
-		emit_ex2(c, inst);
-		break;
-	    case OPCODE_LG2:
-		emit_lg2(c, inst);
-		break;
-	    case OPCODE_MAX:
-		emit_max(c, inst);
-		break;
-	    case OPCODE_MIN:
-		emit_min(c, inst);
-		break;
-	    case OPCODE_DDX:
-		emit_ddx(c, inst);
-		break;
-	    case OPCODE_DDY:
-                emit_ddy(c, inst);
-                break;
-	    case OPCODE_SLT:
-		emit_slt(c, inst);
-		break;
-	    case OPCODE_SLE:
-		emit_sle(c, inst);
-		break;
-	    case OPCODE_SGT:
-		emit_sgt(c, inst);
-		break;
-	    case OPCODE_SGE:
-		emit_sge(c, inst);
-		break;
-	    case OPCODE_SEQ:
-		emit_seq(c, inst);
-		break;
-	    case OPCODE_SNE:
-		emit_sne(c, inst);
-		break;
-	    case OPCODE_MUL:
-		emit_mul(c, inst);
-		break;
-	    case OPCODE_POW:
-		emit_pow(c, inst);
-		break;
-	    case OPCODE_MAD:
-		emit_mad(c, inst);
-		break;
-	    case OPCODE_TEX:
-		emit_tex(c, inst);
-		break;
-	    case OPCODE_TXB:
-		emit_txb(c, inst);
-		break;
-	    case OPCODE_KIL_NV:
-		emit_kil(c);
-		break;
-	    case OPCODE_IF:
-		assert(if_insn < MAX_IFSN);
-		if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
-		break;
-	    case OPCODE_ELSE:
-		if_inst[if_insn-1]  = brw_ELSE(p, if_inst[if_insn-1]);
-		break;
-	    case OPCODE_ENDIF:
-		assert(if_insn > 0);
-		brw_ENDIF(p, if_inst[--if_insn]);
-		break;
-	    case OPCODE_BGNSUB:
-	    case OPCODE_ENDSUB:
-		break;
-	    case OPCODE_CAL:
-		brw_push_insn_state(p);
-		brw_set_mask_control(p, BRW_MASK_DISABLE);
-                brw_set_access_mode(p, BRW_ALIGN_1);
-                brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
-                brw_set_access_mode(p, BRW_ALIGN_16);
-                brw_ADD(p, get_addr_reg(stack_index),
-                         get_addr_reg(stack_index), brw_imm_d(4));
-                orig_inst = inst->Data;
-                orig_inst->Data = &p->store[p->nr_insn];
-                brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
-                brw_pop_insn_state(p);
-		break;
 
-	    case OPCODE_RET:
-		brw_push_insn_state(p);
-		brw_set_mask_control(p, BRW_MASK_DISABLE);
-                brw_ADD(p, get_addr_reg(stack_index),
-                        get_addr_reg(stack_index), brw_imm_d(-4));
-                brw_set_access_mode(p, BRW_ALIGN_1);
-                brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
-                brw_set_access_mode(p, BRW_ALIGN_16);
-		brw_pop_insn_state(p);
 
-		break;
-	    case OPCODE_BGNLOOP:
-		loop_inst[loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
-		break;
-	    case OPCODE_BRK:
-		brw_BREAK(p);
-		brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-		break;
-	    case OPCODE_CONT:
-		brw_CONT(p);
-		brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-		break;
-	    case OPCODE_ENDLOOP:
-		loop_insn--;
-		inst0 = inst1 = brw_WHILE(p, loop_inst[loop_insn]);
-		/* patch all the BREAK instructions from
-		   last BEGINLOOP */
-		while (inst0 > loop_inst[loop_insn]) {
-		    inst0--;
-		    if (inst0->header.opcode == BRW_OPCODE_BREAK) {
-			inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
-			inst0->bits3.if_else.pop_count = 0;
-		    } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
-                        inst0->bits3.if_else.jump_count = inst1 - inst0;
-                        inst0->bits3.if_else.pop_count = 0;
-                    }
-		}
-		break;
-	    default:
-		_mesa_printf("unsupported IR in fragment shader %d\n",
-			inst->Opcode);
-	}
-	if (inst->CondUpdate)
-	    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
-	else
-	    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    }
-    post_wm_emit(c);
-    for (i = 0; i < c->fp->program.Base.NumInstructions; i++)
-	c->fp->program.Base.Instructions[i].Data = NULL;
-}
 
 void brw_wm_glsl_emit(struct brw_wm_compile *c)
 {
-    brw_wm_pass_fp(c);
-    c->tmp_index = 127;
-    brw_wm_emit_glsl(c);
-    c->prog_data.total_grf = c->reg_index;
-    c->prog_data.total_scratch = 0;
-}
+   struct tgsi_parse_context parse;
+   struct brw_compile *p = &c->func;
+
+   brw_init_compile(&c->func);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+
+   c->reg_index = 0;
+   c->if_insn = 0;
+   c->loop_insn = 0;
+   c->stack_index = brw_indirect(0,0);
+
+   /* Do static register allocation and parameter interpolation:
+    */
+   brw_wm_emit_decls( c );
+
+   /* Emit the actual program.  All done with very direct translation,
+    * hopefully we can improve on this shortly...
+    */
+   brw_MOV(p, get_addr_reg(c->stack_index), brw_address(c->stack));
+
+   tgsi_parse_init( &parse, c->fp->program.tokens );
+
+   while( !tgsi_parse_end_of_tokens( &parse ) ) 
+   {
+      tgsi_parse_token( &parse );
+
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+	 /* already done */
+	 break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         /* not handled yet */
+	 assert(0);
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         brw_wm_emit_instruction(c, &parse.FullToken.FullInstruction);
+         break;
+
+      default:
+         assert( 0 );
+      }
+   }
+
+   tgsi_parse_free (&parse);
+   
+   /* Fix up call targets (currently compiled out with #if 0):
+    */
+#if 0
+   {
+      unsigned nr_insns = c->fp->program.Base.NumInstructions;
+      unsigned insn, target_insn;
+      struct tgsi_full_instruction *inst1, *inst2;
+      struct brw_instruction *brw_inst1, *brw_inst2;
+      int offset;
+      for (insn = 0; insn < nr_insns; insn++) {
+	 inst1 = &c->fp->program.Base.Instructions[insn];
+	 brw_inst1 = inst1->Data;
+	 switch (inst1->Opcode) {
+	 case TGSI_OPCODE_CAL:
+	    target_insn = inst1->BranchTarget;
+	    inst2 = &c->fp->program.Base.Instructions[target_insn];
+	    brw_inst2 = inst2->Data;
+	    offset = brw_inst2 - brw_inst1;
+	    brw_set_src1(brw_inst1, brw_imm_d(offset*16));
+	    break;
+	 default:
+	    break;
+	 }
+      }
+   }
 #endif
+
+   c->prog_data.total_grf = c->reg_index;
+   c->prog_data.total_scratch = 0;
+}
diff --git a/src/mesa/pipe/i965simple/brw_wm_sampler_state.c b/src/mesa/pipe/i965simple/brw_wm_sampler_state.c
index 4ea0dd7db0..fbeea8c809 100644
--- a/src/mesa/pipe/i965simple/brw_wm_sampler_state.c
+++ b/src/mesa/pipe/i965simple/brw_wm_sampler_state.c
@@ -229,12 +229,12 @@ static void brw_update_sampler_state( const struct pipe_sampler_state *pipe_samp
  * complicates various things.  However, this is still too confusing -
  * FIXME: simplify all the different new texture state flags.
  */
-void brw_upload_wm_samplers(struct brw_context *brw)
+static void upload_wm_samplers(struct brw_context *brw)
 {
    unsigned unit;
    unsigned sampler_count = 0;
 
-   /* _NEW_TEXTURE */
+   /* BRW_NEW_SAMPLER */
    for (unit = 0; unit < BRW_MAX_TEX_UNIT; unit++) {
       if (brw->attribs.Samplers[unit]) { /* FIXME: correctly detect enabled ones */
          const struct pipe_sampler_state *sampler = brw->attribs.Samplers[unit];
@@ -262,14 +262,11 @@ void brw_upload_wm_samplers(struct brw_context *brw)
 			   sizeof(struct brw_sampler_state) * brw->wm.sampler_count);
 }
 
-#if 0
 const struct brw_tracked_state brw_wm_samplers = {
    .dirty = {
-      .mesa = _NEW_TEXTURE,
-      .brw = 0,
+      .brw = BRW_NEW_SAMPLER,
       .cache = 0
    },
    .update = upload_wm_samplers
 };
-#endif
 
diff --git a/src/mesa/pipe/i965simple/brw_wm_state.c b/src/mesa/pipe/i965simple/brw_wm_state.c
index b45fb2f56b..52d2c85423 100644
--- a/src/mesa/pipe/i965simple/brw_wm_state.c
+++ b/src/mesa/pipe/i965simple/brw_wm_state.c
@@ -34,15 +34,13 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 #include "brw_wm.h"
+#include "pipe/p_util.h"
 
 /***********************************************************************
  * WM unit - fragment programs and rasterization
  */
-
-#if 0
 static void upload_wm_unit(struct brw_context *brw )
 {
-   struct intel_context *intel = &brw->intel;
    struct brw_wm_unit_state wm;
    unsigned max_threads;
    unsigned per_thread;
@@ -56,7 +54,7 @@ static void upload_wm_unit(struct brw_context *brw )
    memset(&wm, 0, sizeof(wm));
 
    /* CACHE_NEW_WM_PROG */
-   wm.thread0.grf_reg_count = ALIGN(brw->wm.prog_data->total_grf, 16) / 16 - 1;
+   wm.thread0.grf_reg_count = align(brw->wm.prog_data->total_grf, 16) / 16 - 1;
    wm.thread0.kernel_start_pointer = brw->wm.prog_gs_offset >> 6;
    wm.thread3.dispatch_grf_start_reg = brw->wm.prog_data->first_curbe_grf;
    wm.thread3.urb_entry_read_length = brw->wm.prog_data->urb_read_length;
@@ -64,9 +62,10 @@ static void upload_wm_unit(struct brw_context *brw )
 
    wm.wm5.max_threads = max_threads;
 
-   per_thread = ALIGN(brw->wm.prog_data->total_scratch, 1024);
+   per_thread = align(brw->wm.prog_data->total_scratch, 1024);
    assert(per_thread <= 12 * 1024);
 
+#if 0
    if (brw->wm.prog_data->total_scratch) {
       unsigned total = per_thread * (max_threads + 1);
 
@@ -95,6 +94,7 @@ static void upload_wm_unit(struct brw_context *brw )
     * so just fail for now if we hit that path.
     */
    assert(brw->wm.prog_data->total_scratch == 0);
+#endif
 
    /* CACHE_NEW_SURFACE */
    wm.thread1.binding_table_entry_count = brw->wm.nr_surfaces;
@@ -112,23 +112,20 @@ static void upload_wm_unit(struct brw_context *brw )
 
    /* BRW_NEW_FRAGMENT_PROGRAM */
    {
-      const struct gl_fragment_program *fp = brw->fragment_program;
+      const struct brw_fragment_program *fp = brw->attribs.FragmentProgram;
 
-      if (fp->Base.InputsRead & (1<<FRAG_ATTRIB_WPOS))
+      if (fp->UsesDepth)
 	 wm.wm5.program_uses_depth = 1; /* as far as we can tell */
 
-      if (fp->Base.OutputsWritten & (1<<FRAG_RESULT_DEPR))
+      if (fp->ComputesDepth)
 	 wm.wm5.program_computes_depth = 1;
 
-      /* _NEW_COLOR */
+      /* BRW_NEW_ALPHA_TEST */
       if (fp->UsesKill ||
-	  brw->attribs.Color->AlphaEnabled)
+	  brw->attribs.AlphaTest->enabled)
 	 wm.wm5.program_uses_killpixel = 1;
 
-      if (brw_wm_is_glsl(fp))
-	  wm.wm5.enable_8_pix = 1;
-      else
-	  wm.wm5.enable_16_pix = 1;
+      wm.wm5.enable_8_pix = 1;
    }
 
    wm.wm5.thread_dispatch_enable = 1;	/* AKA: color_write */
@@ -138,11 +135,11 @@ static void upload_wm_unit(struct brw_context *brw )
    wm.wm5.line_aa_region_width = 0;
    wm.wm5.line_endcap_aa_region_width = 1;
 
-   /* _NEW_POLYGONSTIPPLE */
-   if (brw->attribs.Polygon->StippleFlag)
+   /* BRW_NEW_RASTERIZER */
+   if (brw->attribs.Raster->poly_stipple_enable)
       wm.wm5.polygon_stipple = 1;
 
-   /* _NEW_POLYGON */
+#if 0
    if (brw->attribs.Polygon->OffsetFill) {
       wm.wm5.depth_offset = 1;
       /* Something wierd going on with legacy_global_depth_bias,
@@ -156,13 +153,13 @@ static void upload_wm_unit(struct brw_context *brw )
        */
       wm.global_depth_offset_scale = brw->attribs.Polygon->OffsetFactor;
    }
+#endif
 
-   /* _NEW_LINE */
-   if (brw->attribs.Line->StippleFlag) {
+   if (brw->attribs.Raster->line_stipple_enable) {
       wm.wm5.line_stipple = 1;
    }
 
-   if (BRW_DEBUG & DEBUG_STATS || intel->stats_wm)
+   if (BRW_DEBUG & DEBUG_STATS)
       wm.wm4.stats_enable = 1;
 
    brw->wm.state_gs_offset = brw_cache_data( &brw->cache[BRW_WM_UNIT], &wm );
@@ -183,14 +180,10 @@ static void upload_wm_unit(struct brw_context *brw )
 
 const struct brw_tracked_state brw_wm_unit = {
    .dirty = {
-      .mesa = (_NEW_POLYGON |
-	       _NEW_POLYGONSTIPPLE |
-	       _NEW_LINE |
-	       _NEW_COLOR),
-
-      .brw = (BRW_NEW_FRAGMENT_PROGRAM |
-	      BRW_NEW_CURBE_OFFSETS |
-	      BRW_NEW_LOCK),
+      .brw = (BRW_NEW_RASTERIZER |
+	      BRW_NEW_ALPHA_TEST |
+	      BRW_NEW_FS |
+	      BRW_NEW_CURBE_OFFSETS),
 
       .cache = (CACHE_NEW_SURFACE |
 		CACHE_NEW_WM_PROG |
@@ -199,4 +192,3 @@ const struct brw_tracked_state brw_wm_unit = {
    .update = upload_wm_unit
 };
 
-#endif
diff --git a/src/mesa/pipe/i965simple/brw_wm_surface_state.c b/src/mesa/pipe/i965simple/brw_wm_surface_state.c
index 844cfc54a9..6e68c4c660 100644
--- a/src/mesa/pipe/i965simple/brw_wm_surface_state.c
+++ b/src/mesa/pipe/i965simple/brw_wm_surface_state.c
@@ -33,8 +33,6 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 
-
-#if 0
 static unsigned translate_tex_target( int target )
 {
    switch (target) {
@@ -75,13 +73,13 @@ static unsigned translate_tex_format( unsigned mesa_format )
       assert(0);		/* not supported for sampling */
       return BRW_SURFACEFORMAT_R8G8B8_UNORM;
 
-   case PIPE_FORMAT_U_A8_R8_G8_B8:
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
       return BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
 
-   case PIPE_FORMAT_RGBA8888_REV:
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
       return BRW_SURFACEFORMAT_R8G8B8A8_UNORM;
 
-   case PIPE_FORMAT_U_R5_G6_B5:
+   case PIPE_FORMAT_R5G6B5_UNORM:
       return BRW_SURFACEFORMAT_B5G6R5_UNORM;
 
    case PIPE_FORMAT_A1R5G5B5_UNORM:
@@ -95,14 +93,15 @@ static unsigned translate_tex_format( unsigned mesa_format )
 
    case PIPE_FORMAT_YCBCR:
       return BRW_SURFACEFORMAT_YCRCB_SWAPUVY;
-
+#if 0
    case PIPE_FORMAT_RGB_FXT1:
    case PIPE_FORMAT_RGBA_FXT1:
       return BRW_SURFACEFORMAT_FXT1;
+#endif
 
    case PIPE_FORMAT_Z16_UNORM:
       return BRW_SURFACEFORMAT_I16_UNORM;
-
+#if 0
    case PIPE_FORMAT_RGB_DXT1:
        return BRW_SURFACEFORMAT_DXT1_RGB;
 
@@ -119,6 +118,7 @@ static unsigned translate_tex_format( unsigned mesa_format )
       return BRW_SURFACEFORMAT_R8G8B8A8_UNORM_SRGB;
    case PIPE_FORMAT_SRGB_DXT1:
       return BRW_SURFACEFORMAT_BC1_UNORM_SRGB;
+#endif
 
    default:
       assert(0);
@@ -126,41 +126,46 @@ static unsigned translate_tex_format( unsigned mesa_format )
    }
 }
 
+/* Thin wrapper: forwards 'buffer' to the winsys get_buffer_offset() hook
+ * with a trailing argument of 0 and returns whatever the hook reports. */
+static unsigned brw_buffer_offset(struct brw_context *brw,
+                                  struct pipe_buffer_handle *buffer)
+{
+   return brw->winsys->get_buffer_offset(brw->winsys, buffer, 0);
+}
+
 static
-void brw_update_texture_surface( GLcontext *ctx,
+void brw_update_texture_surface( struct brw_context *brw,
 				 unsigned unit )
 {
-   struct brw_context *brw = brw_context(ctx);
-   struct gl_texture_object *tObj = brw->attribs.Texture->Unit[unit]._Current;
-   struct intel_texture_object *intelObj = intel_texture_object(tObj);
-   struct gl_texture_image *firstImage = tObj->Image[0][intelObj->firstLevel];
+   const struct brw_texture *tObj = brw->attribs.Texture[unit];
    struct brw_surface_state surf;
 
    memset(&surf, 0, sizeof(surf));
 
    surf.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW;
-   surf.ss0.surface_type = translate_tex_target(tObj->Target);
-   surf.ss0.surface_format = translate_tex_format(firstImage->TexFormat->MesaFormat);
+   surf.ss0.surface_type = translate_tex_target(tObj->base.target);
+   surf.ss0.surface_format = translate_tex_format(tObj->base.format);
 
    /* This is ok for all textures with channel width 8bit or less:
     */
 /*    surf.ss0.data_return_format = BRW_SURFACERETURNFORMAT_S1; */
 
    /* Updated in emit_reloc */
-   surf.ss1.base_addr = brw_buffer_offset( intelObj->mt->region->buffer );
+   surf.ss1.base_addr = brw_buffer_offset( brw, tObj->buffer );
 
-   surf.ss2.mip_count = intelObj->lastLevel - intelObj->firstLevel;
-   surf.ss2.width = firstImage->Width - 1;
-   surf.ss2.height = firstImage->Height - 1;
+   surf.ss2.mip_count = tObj->base.last_level - tObj->base.first_level;
+   surf.ss2.width = tObj->base.width[0] - 1;   /* size - 1, as before the port */
+   surf.ss2.height = tObj->base.height[0] - 1;
 
    surf.ss3.tile_walk = BRW_TILEWALK_XMAJOR;
-   surf.ss3.tiled_surface = intelObj->mt->region->tiled; /* always zero */
-   surf.ss3.pitch = (intelObj->mt->pitch * intelObj->mt->cpp) - 1;
-   surf.ss3.depth = firstImage->Depth - 1;
+   surf.ss3.tiled_surface = 0; /* always zero */
+   surf.ss3.pitch = tObj->pitch; /* NOTE(review): was bytes - 1; confirm units */
+   surf.ss3.depth = tObj->base.depth[0] - 1;
 
    surf.ss4.min_lod = 0;
 
-   if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
+   if (tObj->base.target == PIPE_TEXTURE_CUBE) {
       surf.ss0.cube_pos_x = 1;
       surf.ss0.cube_pos_y = 1;
       surf.ss0.cube_pos_z = 1;
@@ -180,13 +185,14 @@ void brw_update_texture_surface( GLcontext *ctx,
 
 static void upload_wm_surfaces(struct brw_context *brw )
 {
-   GLcontext *ctx = &brw->intel.ctx;
-   struct intel_context *intel = &brw->intel;
    unsigned i;
 
    {
       struct brw_surface_state surf;
-      struct intel_region *region = brw->state.draw_region;
+
+      /* BRW_NEW_FRAMEBUFFER
+       */
+      struct pipe_surface *region = brw->attribs.FrameBuffer.cbufs[0];/*fixme*/
 
       memset(&surf, 0, sizeof(surf));
 
@@ -198,27 +204,27 @@ static void upload_wm_surfaces(struct brw_context *brw )
 
 	 surf.ss0.surface_type = BRW_SURFACE_2D;
 
-	 surf.ss1.base_addr = brw_buffer_offset( region->buffer );
+	 surf.ss1.base_addr = brw_buffer_offset( brw, region->buffer );
 
-	 surf.ss2.width = region->pitch - 1; /* XXX: not really! */
-	 surf.ss2.height = region->height - 1;
+	 surf.ss2.width = region->width - 1;   /* size - 1, as before the port */
+	 surf.ss2.height = region->height - 1;
 	 surf.ss3.tile_walk = BRW_TILEWALK_XMAJOR;
-	 surf.ss3.tiled_surface = region->tiled;
-	 surf.ss3.pitch = (region->pitch * region->cpp) - 1;
+	 surf.ss3.tiled_surface = 0;
+	 surf.ss3.pitch = (region->pitch * region->cpp) - 1; /* bytes - 1 */
       } else {
 	 surf.ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
 	 surf.ss0.surface_type = BRW_SURFACE_NULL;
       }
 
-      /* _NEW_COLOR */
-      surf.ss0.color_blend = (!brw->attribs.Color->_LogicOpEnabled &&
-			      brw->attribs.Color->BlendEnabled);
+      /* BRW_NEW_BLEND */
+      surf.ss0.color_blend = (!brw->attribs.Blend->logicop_enable &&
+			      brw->attribs.Blend->blend_enable);
 
 
-      surf.ss0.writedisable_red =   !brw->attribs.Color->ColorMask[0];
-      surf.ss0.writedisable_green = !brw->attribs.Color->ColorMask[1];
-      surf.ss0.writedisable_blue =  !brw->attribs.Color->ColorMask[2];
-      surf.ss0.writedisable_alpha = !brw->attribs.Color->ColorMask[3];
+      surf.ss0.writedisable_red =   !(brw->attribs.Blend->colormask & PIPE_MASK_R);
+      surf.ss0.writedisable_green = !(brw->attribs.Blend->colormask & PIPE_MASK_G);
+      surf.ss0.writedisable_blue =  !(brw->attribs.Blend->colormask & PIPE_MASK_B);
+      surf.ss0.writedisable_alpha = !(brw->attribs.Blend->colormask & PIPE_MASK_A);
 
 
 
@@ -230,23 +236,24 @@ static void upload_wm_surfaces(struct brw_context *brw )
 
 
    for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
-      struct gl_texture_unit *texUnit = &brw->attribs.Texture->Unit[i];
+      const struct brw_texture *texUnit = brw->attribs.Texture[i];
 
-      /* _NEW_TEXTURE, BRW_NEW_TEXDATA
+      /* BRW_NEW_TEXTURE
        */
-      if (texUnit->_ReallyEnabled &&
-	  intel_finalize_mipmap_tree(intel, i)) {
+      if (texUnit->base.refcount/*(texUnit->refcount > 0) == really used */) {
 
-	 brw_update_texture_surface(ctx, i);
+	 brw_update_texture_surface(brw, i);
 
 	 brw->wm.nr_surfaces = i+2;
       }
-      else if( texUnit->_ReallyEnabled &&
+#if 0
+      else if( texUnit->refcount &&
 	       texUnit->_Current == intel->frame_buffer_texobj )
       {
 	 brw->wm.bind.surf_ss_offset[i+1] = brw->wm.bind.surf_ss_offset[0];
 	 brw->wm.nr_surfaces = i+2;
       }
+#endif
       else {
 	 brw->wm.bind.surf_ss_offset[i+1] = 0;
       }
@@ -293,14 +300,12 @@ static void emit_reloc_wm_surfaces(struct brw_context *brw)
 }
 #endif
 
-
-
 const struct brw_tracked_state brw_wm_surfaces = {
    .dirty = {
-      .mesa = _NEW_COLOR | _NEW_TEXTURE | _NEW_BUFFERS,
-      .brw = BRW_NEW_CONTEXT,
+      .brw = (BRW_NEW_FRAMEBUFFER |
+	      BRW_NEW_BLEND |
+	      BRW_NEW_TEXTURE),
       .cache = 0
    },
    .update = upload_wm_surfaces,
 };
-#endif
diff --git a/src/mesa/pipe/p_state.h b/src/mesa/pipe/p_state.h
index 4e42838f1d..af65d365bf 100644
--- a/src/mesa/pipe/p_state.h
+++ b/src/mesa/pipe/p_state.h
@@ -94,6 +94,7 @@ struct pipe_rasterizer_state
    unsigned line_stipple_factor:8;  /**< [1..256] actually */
    unsigned line_stipple_pattern:16;
    unsigned bypass_clipping:1;
+   unsigned origin_lower_left:1;  /**< Is (0,0) the lower-left corner? */
 
    float line_width;
    float point_size;           /**< used when no per-vertex size */
@@ -139,6 +140,7 @@ struct pipe_shader_state {
    const struct tgsi_token *tokens;
    ubyte num_inputs;
    ubyte num_outputs;
+   ubyte input_map[PIPE_MAX_SHADER_INPUTS]; /* XXX this may be temporary */
    ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; /**< TGSI_SEMANTIC_x */
    ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS];
    ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */
diff --git a/src/mesa/pipe/p_util.h b/src/mesa/pipe/p_util.h
index e6d284d932..46edcf3075 100644
--- a/src/mesa/pipe/p_util.h
+++ b/src/mesa/pipe/p_util.h
@@ -360,6 +360,11 @@ static INLINE float LOG2(float val)
 #define CEILF(x)   ((float) ceil(x))
 #endif
 
+static INLINE int align(int value, int alignment)
+{
+   return (value + alignment - 1) & ~(alignment - 1); /* alignment must be 2^n */
+}
+
 /* Convenient...
  */
 extern void _mesa_printf(const char *str, ...);
diff --git a/src/mesa/pipe/softpipe/sp_context.c b/src/mesa/pipe/softpipe/sp_context.c
index dcf0444f6e..b62e691e87 100644
--- a/src/mesa/pipe/softpipe/sp_context.c
+++ b/src/mesa/pipe/softpipe/sp_context.c
@@ -55,8 +55,6 @@ static boolean
 softpipe_is_format_supported( struct pipe_context *pipe,
                               enum pipe_format format, uint type )
 {
-   struct softpipe_context *softpipe = softpipe_context( pipe );
-
    switch (type) {
    case PIPE_TEXTURE:
       /* softpipe supports all texture formats */
diff --git a/src/mesa/pipe/softpipe/sp_context.h b/src/mesa/pipe/softpipe/sp_context.h
index 2c038de5f7..8fd44933f2 100644
--- a/src/mesa/pipe/softpipe/sp_context.h
+++ b/src/mesa/pipe/softpipe/sp_context.h
@@ -110,8 +110,6 @@ struct softpipe_context {
    struct vertex_info vertex_info;
    unsigned attr_mask;
    unsigned nr_frag_attrs;  /**< number of active fragment attribs */
-   boolean need_z;  /**< produce quad/fragment Z values? */
-   boolean need_w;  /**< produce quad/fragment W values? */
    int psize_slot;
 
 #if 0
diff --git a/src/mesa/pipe/softpipe/sp_headers.h b/src/mesa/pipe/softpipe/sp_headers.h
index b9f2b2205a..0ae31d8796 100644
--- a/src/mesa/pipe/softpipe/sp_headers.h
+++ b/src/mesa/pipe/softpipe/sp_headers.h
@@ -73,6 +73,7 @@ struct quad_header {
    float coverage[QUAD_SIZE];    /** fragment coverage for antialiasing */
 
    const struct tgsi_interp_coef *coef;
+   const struct tgsi_interp_coef *posCoef;
 
    unsigned nr_attrs;
 };
diff --git a/src/mesa/pipe/softpipe/sp_prim_setup.c b/src/mesa/pipe/softpipe/sp_prim_setup.c
index fc96f92af1..2ccf5e2624 100644
--- a/src/mesa/pipe/softpipe/sp_prim_setup.c
+++ b/src/mesa/pipe/softpipe/sp_prim_setup.c
@@ -36,10 +36,12 @@
 #include "sp_context.h"
 #include "sp_headers.h"
 #include "sp_quad.h"
+#include "sp_state.h"
 #include "sp_prim_setup.h"
 #include "pipe/draw/draw_private.h"
 #include "pipe/draw/draw_vertex.h"
 #include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
 
 #define DEBUG_VERTS 0
 
@@ -80,8 +82,11 @@ struct setup_stage {
    float oneoverarea;
 
    struct tgsi_interp_coef coef[PIPE_MAX_SHADER_INPUTS];
+   struct tgsi_interp_coef posCoef;  /* For Z, W */
    struct quad_header quad; 
 
+   uint firstFpInput;  /** Semantic type of first frag input */
+
    struct {
       int left[2];   /**< [0] = row0, [1] = row1 */
       int right[2];
@@ -365,18 +370,17 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
  * \param i  which component of the slot (0..3)
  */
 static void const_coeff( struct setup_stage *setup,
-			 unsigned slot,
-			 unsigned i )
+                         struct tgsi_interp_coef *coef,
+                         uint vertSlot, uint i)
 {
-   assert(slot < PIPE_MAX_SHADER_INPUTS);
    assert(i <= 3);
 
-   setup->coef[slot].dadx[i] = 0;
-   setup->coef[slot].dady[i] = 0;
+   coef->dadx[i] = 0;
+   coef->dady[i] = 0;
 
    /* need provoking vertex info!
     */
-   setup->coef[slot].a0[i] = setup->vprovoke->data[slot][i];
+   coef->a0[i] = setup->vprovoke->data[vertSlot][i];
 }
 
 
@@ -385,19 +389,20 @@ static void const_coeff( struct setup_stage *setup,
  * for a triangle.
  */
 static void tri_linear_coeff( struct setup_stage *setup,
-                              unsigned slot,
-                              unsigned i)
+                              struct tgsi_interp_coef *coef,
+                              uint vertSlot, uint i)
 {
-   float botda = setup->vmid->data[slot][i] - setup->vmin->data[slot][i];
-   float majda = setup->vmax->data[slot][i] - setup->vmin->data[slot][i];
+   float botda = setup->vmid->data[vertSlot][i] - setup->vmin->data[vertSlot][i];
+   float majda = setup->vmax->data[vertSlot][i] - setup->vmin->data[vertSlot][i];
    float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
    float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
-   
-   assert(slot < PIPE_MAX_SHADER_INPUTS);
+   float dadx = a * setup->oneoverarea;
+   float dady = b * setup->oneoverarea;
+
    assert(i <= 3);
 
-   setup->coef[slot].dadx[i] = a * setup->oneoverarea;
-   setup->coef[slot].dady[i] = b * setup->oneoverarea;
+   coef->dadx[i] = dadx;
+   coef->dady[i] = dady;
 
    /* calculate a0 as the value which would be sampled for the
     * fragment at (0,0), taking into account that we want to sample at
@@ -411,9 +416,9 @@ static void tri_linear_coeff( struct setup_stage *setup,
     * to define a0 as the sample at a pixel center somewhere near vmin
     * instead - i'll switch to this later.
     */
-   setup->coef[slot].a0[i] = (setup->vmin->data[slot][i] - 
-			    (setup->coef[slot].dadx[i] * (setup->vmin->data[0][0] - 0.5f) + 
-			     setup->coef[slot].dady[i] * (setup->vmin->data[0][1] - 0.5f)));
+   coef->a0[i] = (setup->vmin->data[vertSlot][i] - 
+                  (dadx * (setup->vmin->data[0][0] - 0.5f) + 
+                   dady * (setup->vmin->data[0][1] - 0.5f)));
 
    /*
    _mesa_printf("attr[%d].%c: %f dx:%f dy:%f\n",
@@ -434,39 +439,76 @@ static void tri_linear_coeff( struct setup_stage *setup,
  * divide the interpolated value by the interpolated W at that fragment.
  */
 static void tri_persp_coeff( struct setup_stage *setup,
-                             unsigned slot,
-                             unsigned i )
+                             struct tgsi_interp_coef *coef,
+                             uint vertSlot, uint i)
 {
-   /* premultiply by 1/w:
+   /* premultiply by 1/w  (v->data[0][3] is always W):
     */
-   float mina = setup->vmin->data[slot][i] * setup->vmin->data[0][3];
-   float mida = setup->vmid->data[slot][i] * setup->vmid->data[0][3];
-   float maxa = setup->vmax->data[slot][i] * setup->vmax->data[0][3];
-
+   float mina = setup->vmin->data[vertSlot][i] * setup->vmin->data[0][3];
+   float mida = setup->vmid->data[vertSlot][i] * setup->vmid->data[0][3];
+   float maxa = setup->vmax->data[vertSlot][i] * setup->vmax->data[0][3];
    float botda = mida - mina;
    float majda = maxa - mina;
    float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
    float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
+   float dadx = a * setup->oneoverarea;
+   float dady = b * setup->oneoverarea;
       
    /*
-   printf("tri persp %d,%d: %f %f %f\n", slot, i,
-          setup->vmin->data[slot][i],
-          setup->vmid->data[slot][i],
-          setup->vmax->data[slot][i]
+   printf("tri persp %d,%d: %f %f %f\n", vertSlot, i,
+          setup->vmin->data[vertSlot][i],
+          setup->vmid->data[vertSlot][i],
+          setup->vmax->data[vertSlot][i]
           );
    */
-
-   assert(slot < PIPE_MAX_SHADER_INPUTS);
    assert(i <= 3);
 
-   setup->coef[slot].dadx[i] = a * setup->oneoverarea;
-   setup->coef[slot].dady[i] = b * setup->oneoverarea;
-   setup->coef[slot].a0[i] = (mina - 
-			    (setup->coef[slot].dadx[i] * (setup->vmin->data[0][0] - 0.5f) + 
-			     setup->coef[slot].dady[i] * (setup->vmin->data[0][1] - 0.5f)));
+   coef->dadx[i] = dadx;
+   coef->dady[i] = dady;
+   coef->a0[i] = (mina - 
+                  (dadx * (setup->vmin->data[0][0] - 0.5f) + 
+                   dady * (setup->vmin->data[0][1] - 0.5f)));
+}
+
+
+/**
+ * Coefficient setup for gl_FragCoord; results always land in setup->coef[0].
+ * X and Y are trivial; Y is flipped when rasterizer->origin_lower_left is set.
+ * Z and W are copied from posCoef, which must have been computed already.
+ * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask.
+ */
+static void
+setup_fragcoord_coeff(struct setup_stage *setup)
+{
+   /* X: a0 = 0, slope 1 in x, so the value is the window x */
+   setup->coef[0].a0[0] = 0;
+   setup->coef[0].dadx[0] = 1.0;
+   setup->coef[0].dady[0] = 0.0;
+   /* Y: direction depends on which corner is the origin */
+   if (setup->softpipe->rasterizer->origin_lower_left) {
+      /* y=0=bottom: start at the last row and count down */
+      const int winHeight = setup->softpipe->framebuffer.cbufs[0]->height;
+      setup->coef[0].a0[1] = winHeight - 1;
+      setup->coef[0].dady[1] = -1.0;
+   }
+   else {
+      /* y=0=top */
+      setup->coef[0].a0[1] = 0.0;
+      setup->coef[0].dady[1] = 1.0;
+   }
+   setup->coef[0].dadx[1] = 0.0;
+   /* Z: copied from posCoef */
+   setup->coef[0].a0[2] = setup->posCoef.a0[2];
+   setup->coef[0].dadx[2] = setup->posCoef.dadx[2];
+   setup->coef[0].dady[2] = setup->posCoef.dady[2];
+   /* W: copied from posCoef */
+   setup->coef[0].a0[3] = setup->posCoef.a0[3];
+   setup->coef[0].dadx[3] = setup->posCoef.dadx[3];
+   setup->coef[0].dady[3] = setup->posCoef.dady[3];
 }
 
 
+
 /**
  * Compute the setup->coef[] array dadx, dady, a0 values.
  * Must be called after setup->vmin,vmid,vmax,vprovoke are initialized.
@@ -474,36 +516,67 @@ static void tri_persp_coeff( struct setup_stage *setup,
 static void setup_tri_coefficients( struct setup_stage *setup )
 {
    const enum interp_mode *interp = setup->softpipe->vertex_info.interp_mode;
-   unsigned slot, j;
+#define USE_INPUT_MAP 0
+#if USE_INPUT_MAP
+   const struct pipe_shader_state *fs = &setup->softpipe->fs->shader;
+#endif
+   uint fragSlot;
 
    /* z and w are done by linear interpolation:
     */
-   tri_linear_coeff(setup, 0, 2);
-   tri_linear_coeff(setup, 0, 3);
+   tri_linear_coeff(setup, &setup->posCoef, 0, 2);
+   tri_linear_coeff(setup, &setup->posCoef, 0, 3);
 
    /* setup interpolation for all the remaining attributes:
     */
-   for (slot = 1; slot < setup->quad.nr_attrs; slot++) {
-      switch (interp[slot]) {
-      case INTERP_CONSTANT:
-	 for (j = 0; j < NUM_CHANNELS; j++)
-	    const_coeff(setup, slot, j);
-	 break;
-      
-      case INTERP_LINEAR:
-	 for (j = 0; j < NUM_CHANNELS; j++)
-	    tri_linear_coeff(setup, slot, j);
-	 break;
+   for (fragSlot = 0; fragSlot < setup->quad.nr_attrs; fragSlot++) {
+      /* which vertex output maps to this fragment input: */
+#if !USE_INPUT_MAP
+      uint vertSlot;
+      if (setup->firstFpInput == TGSI_SEMANTIC_POSITION) {
+         if (fragSlot == 0) {
+            setup_fragcoord_coeff(setup);
+            continue;
+         }
+         vertSlot = fragSlot;
+      }
+      else {
+         vertSlot = fragSlot + 1;
+      }
 
-      case INTERP_PERSPECTIVE:
-	 for (j = 0; j < NUM_CHANNELS; j++)
-	    tri_persp_coeff(setup, slot, j);
-	 break;
+#else
+      uint vertSlot = fs->input_map[fragSlot];
 
-      default:
-         /* invalid interp mode */
-         assert(0);
+      if (vertSlot == 0) {
+         /* special case: shader is reading gl_FragCoord */
+         /* XXX with a new INTERP_POSITION token, we could just add a
+          * new case to the switch below.
+          */
+         setup_fragcoord_coeff(setup);
+      }
+      else {
+#endif
+         uint j;
+         switch (interp[vertSlot]) {
+         case INTERP_CONSTANT:
+            for (j = 0; j < NUM_CHANNELS; j++)
+               const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+            break;
+         case INTERP_LINEAR:
+            for (j = 0; j < NUM_CHANNELS; j++)
+               tri_linear_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+            break;
+         case INTERP_PERSPECTIVE:
+            for (j = 0; j < NUM_CHANNELS; j++)
+               tri_persp_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+            break;
+         default:
+            /* invalid interp mode */
+            assert(0);
+         }
+#if USE_INPUT_MAP
       }
+#endif
    }
 }
 
@@ -660,17 +733,18 @@ static void setup_tri( struct draw_stage *stage,
  * for a line.
  */
 static void
-line_linear_coeff(struct setup_stage *setup, unsigned slot, unsigned i)
+line_linear_coeff(struct setup_stage *setup,
+                  struct tgsi_interp_coef *coef,
+                  uint vertSlot, uint i)
 {
-   const float da = setup->vmax->data[slot][i] - setup->vmin->data[slot][i];
+   const float da = setup->vmax->data[vertSlot][i] - setup->vmin->data[vertSlot][i];
    const float dadx = da * setup->emaj.dx * setup->oneoverarea;
    const float dady = da * setup->emaj.dy * setup->oneoverarea;
-   setup->coef[slot].dadx[i] = dadx;
-   setup->coef[slot].dady[i] = dady;
-   setup->coef[slot].a0[i]
-      = (setup->vmin->data[slot][i] - 
-         (dadx * (setup->vmin->data[0][0] - 0.5f) + 
-          dady * (setup->vmin->data[0][1] - 0.5f)));
+   coef->dadx[i] = dadx;
+   coef->dady[i] = dady;
+   coef->a0[i] = (setup->vmin->data[vertSlot][i] - 
+                  (dadx * (setup->vmin->data[0][0] - 0.5f) + 
+                   dady * (setup->vmin->data[0][1] - 0.5f)));
 }
 
 
@@ -679,21 +753,21 @@ line_linear_coeff(struct setup_stage *setup, unsigned slot, unsigned i)
  * for a line.
  */
 static void
-line_persp_coeff(struct setup_stage *setup, unsigned slot, unsigned i)
+line_persp_coeff(struct setup_stage *setup,
+                  struct tgsi_interp_coef *coef,
+                  uint vertSlot, uint i)
 {
    /* XXX double-check/verify this arithmetic */
-   const float a0 = setup->vmin->data[slot][i] * setup->vmin->data[0][3];
-   const float a1 = setup->vmax->data[slot][i] * setup->vmin->data[0][3];
+   const float a0 = setup->vmin->data[vertSlot][i] * setup->vmin->data[0][3];
+   const float a1 = setup->vmax->data[vertSlot][i] * setup->vmax->data[0][3]; /* own 1/w */
    const float da = a1 - a0;
    const float dadx = da * setup->emaj.dx * setup->oneoverarea;
    const float dady = da * setup->emaj.dy * setup->oneoverarea;
-   setup->coef[slot].dadx[i] = dadx;
-   setup->coef[slot].dady[i] = dady;
-   setup->coef[slot].a0[i]
-      = (setup->vmin->data[slot][i] - 
-         (dadx * (setup->vmin->data[0][0] - 0.5f) + 
-          dady * (setup->vmin->data[0][1] - 0.5f)));
-
+   coef->dadx[i] = dadx;
+   coef->dady[i] = dady;
+   /* start from a0 (already multiplied by 1/w), as tri_persp_coeff does */
+   coef->a0[i] = a0 - (dadx * (setup->vmin->data[0][0] - 0.5f) +
+                       dady * (setup->vmin->data[0][1] - 0.5f));
 }
 
 
@@ -705,7 +779,8 @@ static INLINE void
 setup_line_coefficients(struct setup_stage *setup, struct prim_header *prim)
 {
    const enum interp_mode *interp = setup->softpipe->vertex_info.interp_mode;
-   unsigned slot, j;
+   const struct pipe_shader_state *fs = &setup->softpipe->fs->shader;
+   unsigned fragSlot;
 
    /* use setup->vmin, vmax to point to vertices */
    setup->vprovoke = prim->v[1];
@@ -720,31 +795,39 @@ setup_line_coefficients(struct setup_stage *setup, struct prim_header *prim)
 
    /* z and w are done by linear interpolation:
     */
-   line_linear_coeff(setup, 0, 2);
-   line_linear_coeff(setup, 0, 3);
+   line_linear_coeff(setup, &setup->posCoef, 0, 2);
+   line_linear_coeff(setup, &setup->posCoef, 0, 3);
 
    /* setup interpolation for all the remaining attributes:
     */
-   for (slot = 1; slot < setup->quad.nr_attrs; slot++) {
-      switch (interp[slot]) {
-      case INTERP_CONSTANT:
-	 for (j = 0; j < NUM_CHANNELS; j++)
-	    const_coeff(setup, slot, j);
-	 break;
-      
-      case INTERP_LINEAR:
-	 for (j = 0; j < NUM_CHANNELS; j++)
-	    line_linear_coeff(setup, slot, j);
-	 break;
-
-      case INTERP_PERSPECTIVE:
-	 for (j = 0; j < NUM_CHANNELS; j++)
-	    line_persp_coeff(setup, slot, j);
-	 break;
+   for (fragSlot = 0; fragSlot < setup->quad.nr_attrs; fragSlot++) {
+      /* which vertex output maps to this fragment input: */
+      uint vertSlot = fs->input_map[fragSlot];
 
-      default:
-         /* invalid interp mode */
-         assert(0);
+      if (vertSlot == 0) {
+         /* special case: shader is reading gl_FragCoord */
+         setup_fragcoord_coeff(setup);
+      }
+      else {
+         uint j;
+         switch (interp[vertSlot]) {
+         case INTERP_CONSTANT:
+            for (j = 0; j < NUM_CHANNELS; j++)
+               const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+            break;
+         case INTERP_LINEAR:
+            for (j = 0; j < NUM_CHANNELS; j++)
+               line_linear_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+            break;
+         case INTERP_PERSPECTIVE:
+            for (j = 0; j < NUM_CHANNELS; j++)
+               line_persp_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+            break;
+            
+         default:
+            /* invalid interp mode */
+            assert(0);
+         }
       }
    }
 }
@@ -910,14 +993,15 @@ setup_line(struct draw_stage *stage, struct prim_header *prim)
 
 
 static void
-point_persp_coeff(struct setup_stage *setup, const struct vertex_header *vert,
-                  uint slot, uint i)
+point_persp_coeff(struct setup_stage *setup,
+                  const struct vertex_header *vert,
+                  struct tgsi_interp_coef *coef,
+                  uint vertSlot, uint i)
 {
-   assert(slot < PIPE_MAX_SHADER_INPUTS);
    assert(i <= 3);
-   setup->coef[slot].dadx[i] = 0.0F;
-   setup->coef[slot].dady[i] = 0.0F;
-   setup->coef[slot].a0[i] = vert->data[slot][i] * vert->data[0][3];
+   coef->dadx[i] = 0.0F;
+   coef->dady[i] = 0.0F;
+   coef->a0[i] = vert->data[vertSlot][i] * vert->data[0][3];
 }
 
 
@@ -930,6 +1014,7 @@ static void
 setup_point(struct draw_stage *stage, struct prim_header *prim)
 {
    struct setup_stage *setup = setup_stage( stage );
+   const struct pipe_shader_state *fs = &setup->softpipe->fs->shader;
    const enum interp_mode *interp = setup->softpipe->vertex_info.interp_mode;
    const struct vertex_header *v0 = prim->v[0];
    const int sizeAttr = setup->softpipe->psize_slot;
@@ -940,7 +1025,7 @@ setup_point(struct draw_stage *stage, struct prim_header *prim)
    const boolean round = (boolean) setup->softpipe->rasterizer->point_smooth;
    const float x = v0->data[0][0];  /* Note: data[0] is always position */
    const float y = v0->data[0][1];
-   unsigned slot, j;
+   uint fragSlot;
 
    /* For points, all interpolants are constant-valued.
     * However, for point sprites, we'll need to setup texcoords appropriately.
@@ -959,22 +1044,36 @@ setup_point(struct draw_stage *stage, struct prim_header *prim)
     * probably should be ruled out on that basis.
     */
    setup->vprovoke = prim->v[0];
-   const_coeff(setup, 0, 2);
-   const_coeff(setup, 0, 3);
-   for (slot = 1; slot < setup->quad.nr_attrs; slot++) {
-      switch (interp[slot]) {
-      case INTERP_CONSTANT:
-         /* fall-through */
-      case INTERP_LINEAR:
-         for (j = 0; j < NUM_CHANNELS; j++)
-            const_coeff(setup, slot, j);
-         break;
-      case INTERP_PERSPECTIVE:
-         for (j = 0; j < NUM_CHANNELS; j++)
-            point_persp_coeff(setup, v0, slot, j);
-         break;
-      default:
-         assert(0);
+
+   /* setup Z, W */
+   const_coeff(setup, &setup->posCoef, 0, 2);
+   const_coeff(setup, &setup->posCoef, 0, 3);
+
+   for (fragSlot = 0; fragSlot < setup->quad.nr_attrs; fragSlot++) {
+      /* which vertex output maps to this fragment input: */
+      uint vertSlot = fs->input_map[fragSlot];
+
+      if (vertSlot == 0) {
+         /* special case: shader is reading gl_FragCoord */
+         setup_fragcoord_coeff(setup);
+      }
+      else {
+         uint j;
+         switch (interp[vertSlot]) {
+         case INTERP_CONSTANT:
+            /* fall-through */
+         case INTERP_LINEAR:
+            for (j = 0; j < NUM_CHANNELS; j++)
+               const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+            break;
+         case INTERP_PERSPECTIVE:
+            for (j = 0; j < NUM_CHANNELS; j++)
+               point_persp_coeff(setup, setup->vprovoke,
+                                 &setup->coef[fragSlot], vertSlot, j);
+            break;
+         default:
+            assert(0);
+         }
       }
    }
 
@@ -1108,9 +1207,12 @@ static void setup_begin( struct draw_stage *stage )
 {
    struct setup_stage *setup = setup_stage(stage);
    struct softpipe_context *sp = setup->softpipe;
+   const struct pipe_shader_state *fs = &setup->softpipe->fs->shader;
 
    setup->quad.nr_attrs = setup->softpipe->nr_frag_attrs;
 
+   setup->firstFpInput = fs->input_semantic_name[0];
+
    sp->quad.first->begin(sp->quad.first);
 }
 
@@ -1151,6 +1253,7 @@ struct draw_stage *sp_draw_render_stage( struct softpipe_context *softpipe )
    setup->stage.destroy = render_destroy;
 
    setup->quad.coef = setup->coef;
+   setup->quad.posCoef = &setup->posCoef;
 
    return &setup->stage;
 }
diff --git a/src/mesa/pipe/softpipe/sp_quad_earlyz.c b/src/mesa/pipe/softpipe/sp_quad_earlyz.c
index 3abd1f1fb9..22ea99049f 100644
--- a/src/mesa/pipe/softpipe/sp_quad_earlyz.c
+++ b/src/mesa/pipe/softpipe/sp_quad_earlyz.c
@@ -47,9 +47,9 @@ earlyz_quad(
 {
    const float fx = (float) quad->x0;
    const float fy = (float) quad->y0;
-   const float dzdx = quad->coef[0].dadx[2];
-   const float dzdy = quad->coef[0].dady[2];
-   const float z0 = quad->coef[0].a0[2] + dzdx * fx + dzdy * fy;
+   const float dzdx = quad->posCoef->dadx[2];
+   const float dzdy = quad->posCoef->dady[2];
+   const float z0 = quad->posCoef->a0[2] + dzdx * fx + dzdy * fy;
 
    quad->outputs.depth[0] = z0;
    quad->outputs.depth[1] = z0 + dzdx;
diff --git a/src/mesa/pipe/softpipe/sp_quad_fs.c b/src/mesa/pipe/softpipe/sp_quad_fs.c
index 251b47341a..6e7e7eb074 100644
--- a/src/mesa/pipe/softpipe/sp_quad_fs.c
+++ b/src/mesa/pipe/softpipe/sp_quad_fs.c
@@ -74,15 +74,49 @@ quad_shade_stage(struct quad_stage *qs)
 }
 
 
+/**
+ * Compute quad X,Y,Z,W for the four fragments in a quad.
+ * Note that we only need to "compute" X and Y for the upper-left fragment.
+ * We could do less work if we're not depth testing, or there are no
+ * perspective-corrected attributes, but that's seldom.
+ */
+static void
+setup_pos_vector(const struct tgsi_interp_coef *coef,
+                 float x, float y,
+                 struct tgsi_exec_vector *quadpos)
+{
+   uint chan;
+   /* do X */
+   quadpos->xyzw[0].f[0] = x;
+   /* do Y */
+   quadpos->xyzw[1].f[0] = y;
+   /* do Z and W for all fragments in the quad */
+   for (chan = 2; chan < 4; chan++) {
+      const float dadx = coef->dadx[chan];
+      const float dady = coef->dady[chan];
+      const float a0 = coef->a0[chan] + dadx * x + dady * y;
+      quadpos->xyzw[chan].f[0] = a0;
+      quadpos->xyzw[chan].f[1] = a0 + dadx;
+      quadpos->xyzw[chan].f[2] = a0 + dady;
+      quadpos->xyzw[chan].f[3] = a0 + dadx + dady;
+   }
+}
+
+
 typedef void (XSTDCALL *codegen_function)(
    const struct tgsi_exec_vector *input,
    struct tgsi_exec_vector *output,
    float (*constant)[4],
    struct tgsi_exec_vector *temporary,
-   const struct tgsi_interp_coef *coef );
+   const struct tgsi_interp_coef *coef
+#if 0
+   ,const struct tgsi_exec_vector *quadPos
+#endif
+ );
+
 
-/* This should be done by the fragment shader execution unit (code
- * generated from the decl instructions).  Do it here for now.
+/**
+ * Execute fragment shader for the four fragments in the quad.
  */
 static void
 shade_quad(
@@ -91,28 +125,15 @@ shade_quad(
 {
    struct quad_shade_stage *qss = quad_shade_stage( qs );
    struct softpipe_context *softpipe = qs->softpipe;
-   const float fx = (float) quad->x0;
-   const float fy = (float) quad->y0;
    struct tgsi_exec_machine *machine = &qss->machine;
 
-   /* Consts does not require 16 byte alignment. */
+   /* Consts do not require 16 byte alignment. */
    machine->Consts = softpipe->mapped_constants[PIPE_SHADER_FRAGMENT];
 
    machine->InterpCoefs = quad->coef;
 
-   machine->Inputs[0].xyzw[0].f[0] = fx;
-   machine->Inputs[0].xyzw[0].f[1] = fx + 1.0f;
-   machine->Inputs[0].xyzw[0].f[2] = fx;
-   machine->Inputs[0].xyzw[0].f[3] = fx + 1.0f;
-
-   /* XXX for OpenGL we need to invert the Y pos here (y=0=top).
-    * but that'll mess up linear/perspective interpolation of other
-    * attributes...
-    */
-   machine->Inputs[0].xyzw[1].f[0] = fy;
-   machine->Inputs[0].xyzw[1].f[1] = fy;
-   machine->Inputs[0].xyzw[1].f[2] = fy + 1.0f;
-   machine->Inputs[0].xyzw[1].f[3] = fy + 1.0f;
+   /* Compute X, Y, Z, W vals for this quad */
+   setup_pos_vector(quad->posCoef, quad->x0, quad->y0, &machine->QuadPos);
 
    /* run shader */
 #if defined(__i386__) || defined(__386__)
@@ -123,7 +144,11 @@ shade_quad(
          machine->Outputs,
          machine->Consts,
          machine->Temps,
-         machine->InterpCoefs );
+         machine->InterpCoefs
+#if 0
+         ,machine->QuadPos
+#endif
+           );
       quad->mask &= ~(machine->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0]);
    }
    else
diff --git a/src/mesa/pipe/softpipe/sp_quad_stipple.c b/src/mesa/pipe/softpipe/sp_quad_stipple.c
index 04d95989c4..0c42963dfe 100644
--- a/src/mesa/pipe/softpipe/sp_quad_stipple.c
+++ b/src/mesa/pipe/softpipe/sp_quad_stipple.c
@@ -22,10 +22,18 @@ stipple_quad(struct quad_stage *qs, struct quad_header *quad)
    if (quad->prim == PRIM_TRI) {
       struct softpipe_context *softpipe = qs->softpipe;
       /* need to invert Y to index into OpenGL's stipple pattern */
-      const int y0 = softpipe->framebuffer.cbufs[0]->height - 1 - quad->y0;
-      const int y1 = y0 - 1;
-      const unsigned stipple0 = softpipe->poly_stipple.stipple[y0 % 32];
-      const unsigned stipple1 = softpipe->poly_stipple.stipple[y1 % 32];
+      int y0, y1;
+      uint stipple0, stipple1;
+      if (softpipe->rasterizer->origin_lower_left) {
+         y0 = softpipe->framebuffer.cbufs[0]->height - 1 - quad->y0;
+         y1 = y0 - 1;
+      }
+      else {
+         y0 = quad->y0;
+         y1 = y0 + 1;
+      }
+      stipple0 = softpipe->poly_stipple.stipple[y0 % 32];
+      stipple1 = softpipe->poly_stipple.stipple[y1 % 32];
 
 #if 1
       const int col0 = quad->x0 % 32;
diff --git a/src/mesa/pipe/softpipe/sp_state_derived.c b/src/mesa/pipe/softpipe/sp_state_derived.c
index c4f1a0a01a..736ac1c33b 100644
--- a/src/mesa/pipe/softpipe/sp_state_derived.c
+++ b/src/mesa/pipe/softpipe/sp_state_derived.c
@@ -51,18 +51,11 @@ static void calculate_vertex_layout( struct softpipe_context *softpipe )
 
    memset(vinfo, 0, sizeof(*vinfo));
 
-   if (softpipe->depth_stencil->depth.enabled)
-      softpipe->need_z = TRUE;
-   else
-      softpipe->need_z = FALSE;
-   softpipe->need_w = FALSE;
 
    if (fs->input_semantic_name[0] == TGSI_SEMANTIC_POSITION) {
       /* Need Z if depth test is enabled or the fragment program uses the
        * fragment position (XYZW).
        */
-      softpipe->need_z = TRUE;
-      softpipe->need_w = TRUE;
    }
 
    softpipe->psize_slot = -1;
@@ -121,7 +114,6 @@ static void calculate_vertex_layout( struct softpipe_context *softpipe )
       case TGSI_SEMANTIC_GENERIC:
          /* this includes texcoords and varying vars */
          draw_emit_vertex_attr(vinfo, FORMAT_4F, INTERP_PERSPECTIVE);
-         softpipe->need_w = TRUE;
          break;
 
       default:
@@ -129,7 +121,11 @@ static void calculate_vertex_layout( struct softpipe_context *softpipe )
       }
    }
 
+#if 00
    softpipe->nr_frag_attrs = vinfo->num_attribs;
+#else
+   softpipe->nr_frag_attrs = fs->num_inputs;
+#endif
 
    /* We want these after all other attribs since they won't get passed
     * to the fragment shader.  All prior vertex output attribs should match
diff --git a/src/mesa/pipe/softpipe/sp_texture.c b/src/mesa/pipe/softpipe/sp_texture.c
index 2dd1add6f7..44512e4281 100644
--- a/src/mesa/pipe/softpipe/sp_texture.c
+++ b/src/mesa/pipe/softpipe/sp_texture.c
@@ -52,10 +52,6 @@ static unsigned minify( unsigned d )
    return MAX2(1, d>>1);
 }
 
-static int align(int value, int alignment)
-{
-   return (value + alignment - 1) & ~(alignment - 1);
-}
 
 
 static void
diff --git a/src/mesa/pipe/tgsi/exec/tgsi_exec.c b/src/mesa/pipe/tgsi/exec/tgsi_exec.c
index 8636271a34..1f43f3643e 100644
--- a/src/mesa/pipe/tgsi/exec/tgsi_exec.c
+++ b/src/mesa/pipe/tgsi/exec/tgsi_exec.c
@@ -1352,8 +1352,8 @@ linear_interpolation(
    unsigned attrib,
    unsigned chan )
 {
-   const float x = mach->Inputs[0].xyzw[0].f[0];
-   const float y = mach->Inputs[0].xyzw[1].f[0];
+   const float x = mach->QuadPos.xyzw[0].f[0];
+   const float y = mach->QuadPos.xyzw[1].f[0];
    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
    const float dady = mach->InterpCoefs[attrib].dady[chan];
    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
@@ -1369,15 +1369,17 @@ perspective_interpolation(
    unsigned attrib,
    unsigned chan )
 {
-   const float x = mach->Inputs[0].xyzw[0].f[0];
-   const float y = mach->Inputs[0].xyzw[1].f[0];
+   const float x = mach->QuadPos.xyzw[0].f[0];
+   const float y = mach->QuadPos.xyzw[1].f[0];
    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
    const float dady = mach->InterpCoefs[attrib].dady[chan];
    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
-   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / mach->Inputs[0].xyzw[3].f[0];
-   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / mach->Inputs[0].xyzw[3].f[1];
-   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / mach->Inputs[0].xyzw[3].f[2];
-   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / mach->Inputs[0].xyzw[3].f[3];
+   const float *w = mach->QuadPos.xyzw[3].f;
+   /* divide by W here */
+   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
+   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
+   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
+   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
 }
 
 
@@ -1402,17 +1404,6 @@ exec_declaration(
          last = decl->u.DeclarationRange.Last;
          mask = decl->Declaration.UsageMask;
 
-         /* Do not touch WPOS.xy */
-         if( first == 0 ) {
-            mask &= ~TGSI_WRITEMASK_XY;
-            if( mask == TGSI_WRITEMASK_NONE ) {
-               first++;
-               if( first > last ) {
-                  return;
-               }
-            }
-         }
-
          switch( decl->Interpolation.Interpolate ) {
          case TGSI_INTERPOLATE_CONSTANT:
             interp = constant_interpolation;
diff --git a/src/mesa/pipe/tgsi/exec/tgsi_exec.h b/src/mesa/pipe/tgsi/exec/tgsi_exec.h
index e7952a08e3..db92e282df 100644
--- a/src/mesa/pipe/tgsi/exec/tgsi_exec.h
+++ b/src/mesa/pipe/tgsi/exec/tgsi_exec.h
@@ -177,6 +177,7 @@ struct tgsi_exec_machine
 
    /* FRAGMENT processor only. */
    const struct tgsi_interp_coef *InterpCoefs;
+   struct tgsi_exec_vector       QuadPos;
 
    /* Conditional execution masks */
    uint CondMask;  /**< For IF/ELSE/ENDIF */
diff --git a/src/mesa/pipe/xlib/xm_winsys_aub.c b/src/mesa/pipe/xlib/xm_winsys_aub.c
index ee3c2d6181..0348c2ad40 100644
--- a/src/mesa/pipe/xlib/xm_winsys_aub.c
+++ b/src/mesa/pipe/xlib/xm_winsys_aub.c
@@ -493,6 +493,13 @@ static void aub_i965_batch_reloc( struct brw_winsys *sws,
    iws->data[iws->nr++] = aub_bo(buf)->offset + delta;
 }
 
+static unsigned aub_i965_get_buffer_offset( struct brw_winsys *sws,
+					    struct pipe_buffer_handle *buf,
+					    unsigned access_flags )
+{
+   return aub_bo(buf)->offset;
+}
+
 
 
 static void aub_i965_batch_flush( struct brw_winsys *sws,
@@ -605,6 +612,7 @@ xmesa_create_i965simple( struct pipe_winsys *winsys )
    iws->winsys.batch_reloc = aub_i965_batch_reloc;
    iws->winsys.batch_flush = aub_i965_batch_flush;
    iws->winsys.buffer_subdata_typed = aub_i965_buffer_subdata_typed;
+   iws->winsys.get_buffer_offset = aub_i965_get_buffer_offset;
 
    iws->pipe_winsys = winsys;
 
diff --git a/src/mesa/state_tracker/st_atom_rasterizer.c b/src/mesa/state_tracker/st_atom_rasterizer.c
index 2a7128dd27..5c6b89d78c 100644
--- a/src/mesa/state_tracker/st_atom_rasterizer.c
+++ b/src/mesa/state_tracker/st_atom_rasterizer.c
@@ -77,6 +77,8 @@ static void update_raster_state( struct st_context *st )
    uint i;
 
    memset(&raster, 0, sizeof(raster));
+
+   raster.origin_lower_left = 1; /* Always true for OpenGL */
    
    /* _NEW_POLYGON, _NEW_BUFFERS
     */
diff --git a/src/mesa/state_tracker/st_atom_sampler.c b/src/mesa/state_tracker/st_atom_sampler.c
index 052b6dd144..6241e70b55 100644
--- a/src/mesa/state_tracker/st_atom_sampler.c
+++ b/src/mesa/state_tracker/st_atom_sampler.c
@@ -35,6 +35,7 @@
 #include "st_context.h"
 #include "st_cache.h"
 #include "st_atom.h"
+#include "st_program.h"
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 
@@ -116,17 +117,23 @@ gl_filter_to_img_filter(GLenum filter)
 static void 
 update_samplers(struct st_context *st)
 {
-   GLuint u;
+   const struct st_fragment_program *fs = st->fp;
+   GLuint su;
 
-   for (u = 0; u < st->ctx->Const.MaxTextureImageUnits; u++) {
-      const struct gl_texture_object *texobj
-         = st->ctx->Texture.Unit[u]._Current;
+   /* loop over sampler units (aka tex image units) */
+   for (su = 0; su < st->ctx->Const.MaxTextureImageUnits; su++) {
       struct pipe_sampler_state sampler;
       const struct cso_sampler *cso;
 
       memset(&sampler, 0, sizeof(sampler));
 
-      if (texobj) {
+      if (fs->Base.Base.SamplersUsed & (1 << su)) {
+         GLuint texUnit = fs->Base.Base.SamplerUnits[su];
+         const struct gl_texture_object *texobj
+            = st->ctx->Texture.Unit[texUnit]._Current;
+
+         assert(texobj);
+
          sampler.wrap_s = gl_wrap_to_sp(texobj->WrapS);
          sampler.wrap_t = gl_wrap_to_sp(texobj->WrapT);
          sampler.wrap_r = gl_wrap_to_sp(texobj->WrapR);
@@ -138,7 +145,7 @@ update_samplers(struct st_context *st)
          if (texobj->Target != GL_TEXTURE_RECTANGLE_ARB)
             sampler.normalized_coords = 1;
 
-         sampler.lod_bias = st->ctx->Texture.Unit[u].LodBias;
+         sampler.lod_bias = st->ctx->Texture.Unit[texUnit].LodBias; /* indexed by tex unit, not sampler */
 #if 1
          sampler.min_lod = texobj->MinLod;
          sampler.max_lod = texobj->MaxLod;
@@ -166,10 +173,10 @@ update_samplers(struct st_context *st)
 
       cso = st_cached_sampler_state(st, &sampler);
 
-      if (cso != st->state.sampler[u]) {
+      if (cso != st->state.sampler[su]) {
          /* state has changed */
-         st->state.sampler[u] = cso;
-         st->pipe->bind_sampler_state(st->pipe, u, cso->data);
+         st->state.sampler[su] = cso;
+         st->pipe->bind_sampler_state(st->pipe, su, cso->data);
       }
    }
 }
diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c
index 4ec10badad..33372b0f39 100644
--- a/src/mesa/state_tracker/st_atom_shader.c
+++ b/src/mesa/state_tracker/st_atom_shader.c
@@ -151,8 +151,7 @@ find_translated_vp(struct st_context *st,
 {
    static const GLuint UNUSED = ~0;
    struct translated_vertex_program *xvp;
-   const GLbitfield fragInputsRead
-      = stfp->Base.Base.InputsRead | FRAG_BIT_WPOS;
+   const GLbitfield fragInputsRead = stfp->Base.Base.InputsRead;
 
    /*
     * Translate fragment program if needed.
@@ -206,6 +205,7 @@ find_translated_vp(struct st_context *st,
    if (xvp->serialNo != stvp->serialNo) {
       GLuint outAttr, dummySlot;
       const GLbitfield outputsWritten = stvp->Base.Base.OutputsWritten;
+      GLuint numVpOuts = 0;
 
       /* Compute mapping of vertex program outputs to slots, which depends
        * on the fragment program's input->slot mapping.
@@ -214,11 +214,24 @@ find_translated_vp(struct st_context *st,
          /* set default: */
          xvp->output_to_slot[outAttr] = UNUSED;
 
-         if (outputsWritten & (1 << outAttr)) {
+         if (outAttr == VERT_RESULT_HPOS) {
+            /* always put xformed position into slot zero */
+            xvp->output_to_slot[VERT_RESULT_HPOS] = 0;
+            numVpOuts++;
+         }
+         else if (outputsWritten & (1 << outAttr)) {
             /* see if the frag prog wants this vert output */
-            GLint fpIn = vp_out_to_fp_in(outAttr);
-            if (fpIn >= 0) {
-               xvp->output_to_slot[outAttr] = stfp->input_to_slot[fpIn];
+            GLint fpInAttrib = vp_out_to_fp_in(outAttr);
+            if (fpInAttrib >= 0) {
+               GLuint fpInSlot = stfp->input_to_slot[fpInAttrib];
+               GLuint vpOutSlot = stfp->fs->state.input_map[fpInSlot];
+               xvp->output_to_slot[outAttr] = vpOutSlot;
+               numVpOuts++;
+            }
+            else if (outAttr == VERT_RESULT_BFC0 ||
+                     outAttr == VERT_RESULT_BFC1) {
+               /* backface colors go into last slots */
+               xvp->output_to_slot[outAttr] = numVpOuts++;
             }
          }
       }
diff --git a/src/mesa/state_tracker/st_cb_fbo.c b/src/mesa/state_tracker/st_cb_fbo.c
index 254740ff20..c40f75417f 100644
--- a/src/mesa/state_tracker/st_cb_fbo.c
+++ b/src/mesa/state_tracker/st_cb_fbo.c
@@ -243,6 +243,10 @@ st_new_renderbuffer_fb(enum pipe_format format)
       strb->Base.InternalFormat = GL_DEPTH24_STENCIL8_EXT;
       strb->Base._BaseFormat = GL_DEPTH_STENCIL_EXT;
       break;
+   case PIPE_FORMAT_S8_UNORM:
+      strb->Base.InternalFormat = GL_STENCIL_INDEX8_EXT;
+      strb->Base._BaseFormat = GL_STENCIL_INDEX;
+      break;
    case PIPE_FORMAT_R16G16B16A16_SNORM:
       strb->Base.InternalFormat = GL_RGBA16;
       strb->Base._BaseFormat = GL_RGBA;
diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.c b/src/mesa/state_tracker/st_mesa_to_tgsi.c
index b392edf16d..bccabd8004 100644
--- a/src/mesa/state_tracker/st_mesa_to_tgsi.c
+++ b/src/mesa/state_tracker/st_mesa_to_tgsi.c
@@ -585,6 +585,20 @@ make_temp_decl(
 }
 
 
+static struct tgsi_full_declaration
+make_sampler_decl(GLuint index)
+{
+   struct tgsi_full_declaration decl;
+   decl = tgsi_default_full_declaration();
+   decl.Declaration.File = TGSI_FILE_SAMPLER;
+   decl.Declaration.Declare = TGSI_DECLARE_RANGE;
+   decl.u.DeclarationRange.First = index;
+   decl.u.DeclarationRange.Last = index;
+   return decl;
+}
+
+
+
 /**
  * Find the temporaries which are used in the given program.
  */
@@ -675,44 +689,22 @@ tgsi_translate_mesa_program(
    if (procType == TGSI_PROCESSOR_FRAGMENT) {
       for (i = 0; i < numInputs; i++) {
          struct tgsi_full_declaration fulldecl;
-         switch (inputSemanticName[i]) {
-         case TGSI_SEMANTIC_POSITION:
-            /* Fragment XY pos */
-            fulldecl = make_input_decl(i,
-                                       GL_TRUE, TGSI_INTERPOLATE_CONSTANT,
-                                       TGSI_WRITEMASK_XY,
-                                       GL_TRUE, TGSI_SEMANTIC_POSITION, 0 );
-            ti += tgsi_build_full_declaration(
-                                              &fulldecl,
-                                              &tokens[ti],
-                                              header,
-                                              maxTokens - ti );
-            /* Fragment ZW pos */
-            fulldecl = make_input_decl(i,
-                                       GL_TRUE, TGSI_INTERPOLATE_LINEAR,
-                                       TGSI_WRITEMASK_ZW,
-                                       GL_TRUE, TGSI_SEMANTIC_POSITION, 0 );
-            ti += tgsi_build_full_declaration(&fulldecl,
-                                              &tokens[ti],
-                                              header,
-                                              maxTokens - ti );
-            break;
-         default:
-            fulldecl = make_input_decl(i,
-                                       GL_TRUE, interpMode[i],
-                                       TGSI_WRITEMASK_XYZW,
-                                       GL_TRUE, inputSemanticName[i],
-                                       inputSemanticIndex[i]);
-            ti += tgsi_build_full_declaration(&fulldecl,
-                                              &tokens[ti],
-                                              header,
-                                              maxTokens - ti );
-            break;
-         }
+         fulldecl = make_input_decl(i,
+                                    GL_TRUE, interpMode[i],
+                                    TGSI_WRITEMASK_XYZW,
+                                    GL_TRUE, inputSemanticName[i],
+                                    inputSemanticIndex[i]);
+         ti += tgsi_build_full_declaration(&fulldecl,
+                                           &tokens[ti],
+                                           header,
+                                           maxTokens - ti );
       }
    }
    else {
       /* vertex prog */
+      /* XXX: this could probably be merged with the clause above.
+       * the only difference is the semantic tags.
+       */
       for (i = 0; i < numInputs; i++) {
          struct tgsi_full_declaration fulldecl;
          fulldecl = make_input_decl(i,
@@ -810,6 +802,19 @@ tgsi_translate_mesa_program(
       }
    }
 
+   /* texture samplers */
+   for (i = 0; i < 8; i++) {
+      if (program->SamplersUsed & (1 << i)) {
+         struct tgsi_full_declaration fulldecl;
+         fulldecl = make_sampler_decl( i );
+         ti += tgsi_build_full_declaration(&fulldecl,
+                                           &tokens[ti],
+                                           header,
+                                           maxTokens - ti );
+      }
+   }
+
+
    for( i = 0; i < program->NumInstructions; i++ ) {
       compile_instruction(
             &program->Instructions[i],
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index e64bf14d56..fe22233c93 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -47,7 +47,7 @@
 #include "st_mesa_to_tgsi.h"
 
 
-#define TGSI_DEBUG 0
+#define TGSI_DEBUG 01
 
 
 /**
@@ -283,16 +283,17 @@ st_translate_fragment_program(struct st_context *st,
    const struct cso_fragment_shader *cso;
    GLuint interpMode[16];  /* XXX size? */
    GLuint attr;
-   GLbitfield inputsRead = stfp->Base.Base.InputsRead;
-
-   /* For software rendering, we always need the fragment input position
-    * in order to calculate interpolated values.
-    * For i915, we always want to emit the semantic info for position.
-    */
-   inputsRead |= FRAG_BIT_WPOS;
+   const GLbitfield inputsRead = stfp->Base.Base.InputsRead;
+   GLuint vslot = 0;
 
    memset(&fs, 0, sizeof(fs));
 
+   /* which vertex output goes to the first fragment input: */
+   if (inputsRead & FRAG_BIT_WPOS)
+      vslot = 0;
+   else
+      vslot = 1;
+
    /*
     * Convert Mesa program inputs to TGSI input register semantics.
     */
@@ -300,15 +301,17 @@ st_translate_fragment_program(struct st_context *st,
       if (inputsRead & (1 << attr)) {
          const GLuint slot = fs.num_inputs;
 
-         fs.num_inputs++;
-
          defaultInputMapping[attr] = slot;
 
+         fs.input_map[slot] = vslot++;
+
+         fs.num_inputs++;
+
          switch (attr) {
          case FRAG_ATTRIB_WPOS:
             fs.input_semantic_name[slot] = TGSI_SEMANTIC_POSITION;
             fs.input_semantic_index[slot] = 0;
-            interpMode[slot] = TGSI_INTERPOLATE_CONSTANT;
+            interpMode[slot] = TGSI_INTERPOLATE_LINEAR;
             break;
          case FRAG_ATTRIB_COL0:
             fs.input_semantic_name[slot] = TGSI_SEMANTIC_COLOR;