From 2f5f7c07732577f60666e3cee69c75c9b035c145 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Fri, 23 Oct 2009 16:55:02 +0100
Subject: i965g: re-starting from the dri driver

---
 src/gallium/drivers/i965/brw_wm.h | 309 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 309 insertions(+)
 create mode 100644 src/gallium/drivers/i965/brw_wm.h

(limited to 'src/gallium/drivers/i965/brw_wm.h')

diff --git a/src/gallium/drivers/i965/brw_wm.h b/src/gallium/drivers/i965/brw_wm.h
new file mode 100644
index 0000000000..872b1f3ecf
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm.h
@@ -0,0 +1,309 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+              
+
+#ifndef BRW_WM_H
+#define BRW_WM_H
+
+
+#include "shader/prog_instruction.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+
+#define SATURATE (1<<5)
+
+/* A big lookup table is used to figure out which and how many
+ * additional regs will be inserted before the main payload in the WM
+ * program execution.  These mainly relate to depth and stencil
+ * processing and the early-depth-test optimization.
+ */
+#define IZ_PS_KILL_ALPHATEST_BIT    0x1
+#define IZ_PS_COMPUTES_DEPTH_BIT    0x2
+#define IZ_DEPTH_WRITE_ENABLE_BIT   0x4
+#define IZ_DEPTH_TEST_ENABLE_BIT    0x8
+#define IZ_STENCIL_WRITE_ENABLE_BIT 0x10
+#define IZ_STENCIL_TEST_ENABLE_BIT  0x20
+#define IZ_BIT_MAX                  0x40
+
+#define AA_NEVER     0
+#define AA_SOMETIMES 1
+#define AA_ALWAYS    2
+
+struct brw_wm_prog_key {
+   GLuint source_depth_reg:3;
+   GLuint aa_dest_stencil_reg:3;
+   GLuint dest_depth_reg:3;
+   GLuint nr_depth_regs:3;
+   GLuint computes_depth:1;	/* could be derived from program string */
+   GLuint source_depth_to_render_target:1;
+   GLuint flat_shade:1;
+   GLuint linear_color:1;  /**< linear interpolation vs perspective interp */
+   GLuint runtime_check_aads_emit:1;
+   
+   GLbitfield proj_attrib_mask; /**< one bit per fragment program attribute */
+   GLuint shadowtex_mask:16;
+   GLuint yuvtex_mask:16;
+   GLuint yuvtex_swap_mask:16;	/* UV swapped */
+
+   GLuint tex_swizzles[BRW_MAX_TEX_UNIT];
+
+   GLuint program_string_id:32;
+   GLuint origin_x, origin_y;
+   GLuint drawable_height;
+   GLuint vp_outputs_written;
+};
+
+
+/* A bit of a glossary:
+ *
+ * brw_wm_value: A computed value or program input.  Values are
+ * constant, they are created once and are never modified.  When a
+ * fragment program register is written or overwritten, new values are
+ * created fresh, preserving the rule that values are constant.
+ *
+ * brw_wm_ref: A reference to a value.  Wherever a value is used by an
+ * instruction or as a program output, that is tracked with an
+ * instance of this struct.  All references to a value occur after it
+ * is created.  After the last reference, a value is dead and can be
+ * discarded.
+ *
+ * brw_wm_grf: Represents a physical hardware register.  May be either
+ * empty or hold a value.  Register allocation is the process of
+ * assigning values to grf registers.  This occurs in pass2 and the
+ * brw_wm_grf struct is not used before that.
+ *
+ * Fragment program registers: These are time-varying constructs that
+ * are hard to reason about and which we translate away in pass0.  A
+ * single fragment program register element (eg. temp[0].x) will be
+ * translated to one or more brw_wm_value structs, one for each time
+ * that temp[0].x is written to during the program. 
+ */
+
+
+
+/* Used in pass2 to track register allocation.
+ */
+struct brw_wm_grf {
+   struct brw_wm_value *value;   /* value held by this register, if any */
+   GLuint nextuse;               /* NOTE(review): presumably insn index of the next use of value -- confirm in pass2 */
+};
+
+struct brw_wm_value {
+   struct brw_reg hw_reg;	/* emitted to this reg, may not always be there */
+   struct brw_wm_ref *lastuse;
+   struct brw_wm_grf *resident; 
+   GLuint contributes_to_output:1;
+   GLuint spill_slot:16;	/* if non-zero, spill immediately after calculation */
+};
+
+struct brw_wm_ref {
+   struct brw_reg hw_reg;	/* nr filled in in pass2, everything else, pass0 */
+   struct brw_wm_value *value;
+   struct brw_wm_ref *prevuse;
+   GLuint unspill_reg:7;	/* unspill to reg */
+   GLuint emitted:1;
+   GLuint insn:24;
+};
+
+struct brw_wm_constref {
+   const struct brw_wm_ref *ref;
+   GLfloat constval;
+};
+
+
+struct brw_wm_instruction {
+   struct brw_wm_value *dst[4];
+   struct brw_wm_ref *src[3][4];
+   GLuint opcode:8;
+   GLuint saturate:1;
+   GLuint writemask:4;
+   GLuint tex_unit:4;   /* texture unit for TEX, TXD, TXP instructions */
+   GLuint tex_idx:3;    /* TEXTURE_1D,2D,3D,CUBE,RECT_INDEX source target */
+   GLuint tex_shadow:1; /* do shadow comparison? */
+   GLuint eot:1;    	/* End of thread indicator for FB_WRITE*/
+   GLuint target:10;    /* target binding table index for FB_WRITE*/
+};
+
+
+#define BRW_WM_MAX_INSN  (MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS*3 + FRAG_ATTRIB_MAX + 3)
+#define BRW_WM_MAX_GRF   128		/* hardware limit */
+#define BRW_WM_MAX_VREG  (BRW_WM_MAX_INSN * 4)
+#define BRW_WM_MAX_REF   (BRW_WM_MAX_INSN * 12)
+#define BRW_WM_MAX_PARAM 256
+#define BRW_WM_MAX_CONST 256
+#define BRW_WM_MAX_KILLS MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS
+#define BRW_WM_MAX_SUBROUTINE 16
+
+
+
+/* New opcodes to track internal operations required for WM unit.
+ * These are added early so that the registers used can be tracked,
+ * freed and reused like those of other instructions.
+ */
+#define WM_PIXELXY        (MAX_OPCODE)
+#define WM_DELTAXY        (MAX_OPCODE + 1)
+#define WM_PIXELW         (MAX_OPCODE + 2)
+#define WM_LINTERP        (MAX_OPCODE + 3)
+#define WM_PINTERP        (MAX_OPCODE + 4)
+#define WM_CINTERP        (MAX_OPCODE + 5)
+#define WM_WPOSXY         (MAX_OPCODE + 6)
+#define WM_FB_WRITE       (MAX_OPCODE + 7)
+#define WM_FRONTFACING    (MAX_OPCODE + 8)
+#define MAX_WM_OPCODE     (MAX_OPCODE + 9)
+
+#define PROGRAM_PAYLOAD   (PROGRAM_FILE_MAX)
+#define PAYLOAD_DEPTH     (FRAG_ATTRIB_MAX)
+
+struct brw_wm_compile {
+   struct brw_compile func;
+   struct brw_wm_prog_key key;
+   struct brw_wm_prog_data prog_data;
+
+   struct brw_fragment_program *fp;
+
+   GLfloat (*env_param)[4];
+
+   enum {
+      START,
+      PASS2_DONE
+   } state;
+
+   /* Initial pass - translate fp instructions to fp instructions,
+    * simplifying and adding instructions for interpolation and
+    * framebuffer writes.
+    */
+   struct prog_instruction prog_instructions[BRW_WM_MAX_INSN];
+   GLuint nr_fp_insns;
+   GLuint fp_temp;
+   GLuint fp_interp_emitted;
+   GLuint fp_fragcolor_emitted;
+
+   struct prog_src_register pixel_xy;
+   struct prog_src_register delta_xy;
+   struct prog_src_register pixel_w;
+
+
+   struct brw_wm_value vreg[BRW_WM_MAX_VREG];
+   GLuint nr_vreg;
+
+   struct brw_wm_value creg[BRW_WM_MAX_PARAM];
+   GLuint nr_creg;
+
+   struct {
+      struct brw_wm_value depth[4]; /* includes r0/r1 */
+      struct brw_wm_value input_interp[FRAG_ATTRIB_MAX];
+   } payload;
+
+
+   const struct brw_wm_ref *pass0_fp_reg[PROGRAM_PAYLOAD+1][256][4];
+
+   struct brw_wm_ref undef_ref;
+   struct brw_wm_value undef_value;
+
+   struct brw_wm_ref refs[BRW_WM_MAX_REF];
+   GLuint nr_refs;
+
+   struct brw_wm_instruction instruction[BRW_WM_MAX_INSN];
+   GLuint nr_insns;
+
+   struct brw_wm_constref constref[BRW_WM_MAX_CONST];
+   GLuint nr_constrefs;
+
+   struct brw_wm_grf pass2_grf[BRW_WM_MAX_GRF/2];
+
+   GLuint grf_limit;
+   GLuint max_wm_grf;
+   GLuint last_scratch;
+
+   GLuint cur_inst;  /**< index of current instruction */
+
+   GLboolean out_of_regs;  /**< ran out of GRF registers? */
+
+   /** Mapping from Mesa registers to hardware registers */
+   struct {
+      GLboolean inited;
+      struct brw_reg reg;
+   } wm_regs[PROGRAM_PAYLOAD+1][256][4];
+
+   GLboolean used_grf[BRW_WM_MAX_GRF];
+   GLuint first_free_grf;
+   struct brw_reg stack;
+   struct brw_reg emit_mask_reg;
+   GLuint tmp_regs[BRW_WM_MAX_GRF];
+   GLuint tmp_index;
+   GLuint tmp_max;
+   GLuint subroutines[BRW_WM_MAX_SUBROUTINE];
+   GLuint dispatch_width;
+
+   /** we may need up to 3 constants per instruction (if use_const_buffer) */
+   struct {
+      GLint index;
+      struct brw_reg reg;
+   } current_const[3];
+};
+
+
+GLuint brw_wm_nr_args( GLuint opcode );
+GLuint brw_wm_is_scalar_result( GLuint opcode );
+
+void brw_wm_pass_fp( struct brw_wm_compile *c );
+void brw_wm_pass0( struct brw_wm_compile *c );
+void brw_wm_pass1( struct brw_wm_compile *c );
+void brw_wm_pass2( struct brw_wm_compile *c );
+void brw_wm_emit( struct brw_wm_compile *c );
+
+void brw_wm_print_value( struct brw_wm_compile *c,
+			 struct brw_wm_value *value );
+
+void brw_wm_print_ref( struct brw_wm_compile *c,
+		       struct brw_wm_ref *ref );
+
+void brw_wm_print_insn( struct brw_wm_compile *c,
+			struct brw_wm_instruction *inst );
+
+void brw_wm_print_program( struct brw_wm_compile *c,
+			   const char *stage );
+
+void brw_wm_lookup_iz( GLuint line_aa,
+		       GLuint lookup,
+		       GLboolean ps_uses_depth,
+		       struct brw_wm_prog_key *key );
+
+GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp);
+void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c);
+
+void emit_ddxy(struct brw_compile *p,
+	       const struct brw_reg *dst,
+	       GLuint mask,
+	       GLboolean is_ddx,
+	       const struct brw_reg *arg0);
+
+#endif
-- 
cgit v1.2.3


From 57a920cb1a0b6051068e730747b3fb475de88aca Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Fri, 23 Oct 2009 17:01:32 +0100
Subject: i965g: wip

---
 src/gallium/drivers/i965/brw_bo.c             |   12 +
 src/gallium/drivers/i965/brw_cc.c             |  180 +----
 src/gallium/drivers/i965/brw_clip.c           |  127 +--
 src/gallium/drivers/i965/brw_clip.h           |    5 +-
 src/gallium/drivers/i965/brw_clip_line.c      |    7 -
 src/gallium/drivers/i965/brw_clip_point.c     |    7 -
 src/gallium/drivers/i965/brw_clip_state.c     |    7 +-
 src/gallium/drivers/i965/brw_clip_tri.c       |    7 -
 src/gallium/drivers/i965/brw_clip_unfilled.c  |    5 -
 src/gallium/drivers/i965/brw_clip_util.c      |    7 -
 src/gallium/drivers/i965/brw_context.c        |  135 ++--
 src/gallium/drivers/i965/brw_context.h        |    7 +-
 src/gallium/drivers/i965/brw_curbe.c          |   89 +--
 src/gallium/drivers/i965/brw_defines.h        |    4 +-
 src/gallium/drivers/i965/brw_disasm.c         |    2 -
 src/gallium/drivers/i965/brw_draw.c           |  244 +-----
 src/gallium/drivers/i965/brw_draw_upload.c    |  566 ++++---------
 src/gallium/drivers/i965/brw_gs.c             |    2 +-
 src/gallium/drivers/i965/brw_pipe_blend.c     |   41 +
 src/gallium/drivers/i965/brw_pipe_debug.c     |    2 +
 src/gallium/drivers/i965/brw_pipe_depth.c     |   52 ++
 src/gallium/drivers/i965/brw_pipe_fb.c        |   25 +
 src/gallium/drivers/i965/brw_pipe_flush.c     |   64 ++
 src/gallium/drivers/i965/brw_screen_surface.c |   27 +
 src/gallium/drivers/i965/brw_sf.c             |    4 +-
 src/gallium/drivers/i965/brw_sf_emit.c        |    4 +-
 src/gallium/drivers/i965/brw_state_upload.c   |   63 +-
 src/gallium/drivers/i965/brw_swtnl.c          |  114 +++
 src/gallium/drivers/i965/brw_types.h          |   11 +
 src/gallium/drivers/i965/brw_util.c           |    8 -
 src/gallium/drivers/i965/brw_vs.c             |   12 +-
 src/gallium/drivers/i965/brw_vs_emit.c        |  250 ++----
 src/gallium/drivers/i965/brw_wm.c             |   59 +-
 src/gallium/drivers/i965/brw_wm.h             |    1 -
 src/gallium/drivers/i965/brw_wm_emit.c        |   17 +-
 src/gallium/drivers/i965/brw_wm_fp.c          |  193 ++---
 src/gallium/drivers/i965/brw_wm_glsl.c        | 1060 +------------------------
 src/gallium/drivers/i965/brw_wm_pass0.c       |    1 -
 src/gallium/drivers/i965/brw_wm_pass1.c       |   81 +-
 src/gallium/drivers/i965/intel_chipset.h      |    4 +-
 40 files changed, 907 insertions(+), 2599 deletions(-)
 create mode 100644 src/gallium/drivers/i965/brw_bo.c
 create mode 100644 src/gallium/drivers/i965/brw_pipe_blend.c
 create mode 100644 src/gallium/drivers/i965/brw_pipe_debug.c
 create mode 100644 src/gallium/drivers/i965/brw_pipe_depth.c
 create mode 100644 src/gallium/drivers/i965/brw_pipe_fb.c
 create mode 100644 src/gallium/drivers/i965/brw_pipe_flush.c
 create mode 100644 src/gallium/drivers/i965/brw_screen_surface.c
 create mode 100644 src/gallium/drivers/i965/brw_swtnl.c
 create mode 100644 src/gallium/drivers/i965/brw_types.h

(limited to 'src/gallium/drivers/i965/brw_wm.h')

diff --git a/src/gallium/drivers/i965/brw_bo.c b/src/gallium/drivers/i965/brw_bo.c
new file mode 100644
index 0000000000..e7a4dac666
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_bo.c
@@ -0,0 +1,12 @@
+/* Copy ib_size bytes from index_buffer->ptr into bo at offset, via a GTT map when
+ * kernel_exec_fencing is set.  WIP: none of these names are declared or passed in. */
+void brw_buffer_subdata()
+{
+      if (intel->intelScreen->kernel_exec_fencing) {
+	 drm_intel_gem_bo_map_gtt(bo);
+	 memcpy((char *)bo->virtual + offset, index_buffer->ptr, ib_size);
+	 drm_intel_gem_bo_unmap_gtt(bo);
+      } else {
+	 dri_bo_subdata(bo, offset, ib_size, index_buffer->ptr);
+      }
+}
diff --git a/src/gallium/drivers/i965/brw_cc.c b/src/gallium/drivers/i965/brw_cc.c
index 1088a7a607..9ab5638137 100644
--- a/src/gallium/drivers/i965/brw_cc.c
+++ b/src/gallium/drivers/i965/brw_cc.c
@@ -62,84 +62,21 @@ const struct brw_tracked_state brw_cc_vp = {
 };
 
 struct brw_cc_unit_key {
-   GLboolean stencil, stencil_two_side, color_blend, alpha_enabled;
-
-   GLenum stencil_func[2], stencil_fail_op[2];
-   GLenum stencil_pass_depth_fail_op[2], stencil_pass_depth_pass_op[2];
-   GLubyte stencil_ref[2], stencil_write_mask[2], stencil_test_mask[2];
-   GLenum logic_op;
-
-   GLenum blend_eq_rgb, blend_eq_a;
-   GLenum blend_src_rgb, blend_src_a;
-   GLenum blend_dst_rgb, blend_dst_a;
-
-   GLenum alpha_func;
-   GLclampf alpha_ref;
-
-   GLboolean dither;
-
-   GLboolean depth_test, depth_write;
-   GLenum depth_func;
+   struct pipe_depth_stencil_alpha_state dsa;
+   struct pipe_blend_state blend; /* no color mask */
 };
 
 static void
 cc_unit_populate_key(struct brw_context *brw, struct brw_cc_unit_key *key)
 {
-   GLcontext *ctx = &brw->intel.ctx;
-   const unsigned back = ctx->Stencil._BackFace;
-
    memset(key, 0, sizeof(*key));
+   
+   key->dsa = brw->curr.dsa.base;
+   key->blend = brw->curr.blend.base;
 
-   key->stencil = ctx->Stencil._Enabled;
-   key->stencil_two_side = ctx->Stencil._TestTwoSide;
-
-   if (key->stencil) {
-      key->stencil_func[0] = ctx->Stencil.Function[0];
-      key->stencil_fail_op[0] = ctx->Stencil.FailFunc[0];
-      key->stencil_pass_depth_fail_op[0] = ctx->Stencil.ZFailFunc[0];
-      key->stencil_pass_depth_pass_op[0] = ctx->Stencil.ZPassFunc[0];
-      key->stencil_ref[0] = ctx->Stencil.Ref[0];
-      key->stencil_write_mask[0] = ctx->Stencil.WriteMask[0];
-      key->stencil_test_mask[0] = ctx->Stencil.ValueMask[0];
-   }
-   if (key->stencil_two_side) {
-      key->stencil_func[1] = ctx->Stencil.Function[back];
-      key->stencil_fail_op[1] = ctx->Stencil.FailFunc[back];
-      key->stencil_pass_depth_fail_op[1] = ctx->Stencil.ZFailFunc[back];
-      key->stencil_pass_depth_pass_op[1] = ctx->Stencil.ZPassFunc[back];
-      key->stencil_ref[1] = ctx->Stencil.Ref[back];
-      key->stencil_write_mask[1] = ctx->Stencil.WriteMask[back];
-      key->stencil_test_mask[1] = ctx->Stencil.ValueMask[back];
-   }
-
-   if (ctx->Color._LogicOpEnabled)
-      key->logic_op = ctx->Color.LogicOp;
-   else
-      key->logic_op = GL_COPY;
-
-   key->color_blend = ctx->Color.BlendEnabled;
-   if (key->color_blend) {
-      key->blend_eq_rgb = ctx->Color.BlendEquationRGB;
-      key->blend_eq_a = ctx->Color.BlendEquationA;
-      key->blend_src_rgb = ctx->Color.BlendSrcRGB;
-      key->blend_dst_rgb = ctx->Color.BlendDstRGB;
-      key->blend_src_a = ctx->Color.BlendSrcA;
-      key->blend_dst_a = ctx->Color.BlendDstA;
-   }
-
-   key->alpha_enabled = ctx->Color.AlphaEnabled;
-   if (key->alpha_enabled) {
-      key->alpha_func = ctx->Color.AlphaFunc;
-      key->alpha_ref = ctx->Color.AlphaRef;
-   }
-
-   key->dither = ctx->Color.DitherFlag;
-
-   key->depth_test = ctx->Depth.Test;
-   if (key->depth_test) {
-      key->depth_func = ctx->Depth.Func;
-      key->depth_write = ctx->Depth.Mask;
-   }
+   /* Clear non-respected values:
+    */
+   key->blend.colormask = 0xf;
 }
 
 /**
@@ -153,103 +90,16 @@ cc_unit_create_from_key(struct brw_context *brw, struct brw_cc_unit_key *key)
 
    memset(&cc, 0, sizeof(cc));
 
-   /* _NEW_STENCIL */
-   if (key->stencil) {
-      cc.cc0.stencil_enable = 1;
-      cc.cc0.stencil_func =
-	 intel_translate_compare_func(key->stencil_func[0]);
-      cc.cc0.stencil_fail_op =
-	 intel_translate_stencil_op(key->stencil_fail_op[0]);
-      cc.cc0.stencil_pass_depth_fail_op =
-	 intel_translate_stencil_op(key->stencil_pass_depth_fail_op[0]);
-      cc.cc0.stencil_pass_depth_pass_op =
-	 intel_translate_stencil_op(key->stencil_pass_depth_pass_op[0]);
-      cc.cc1.stencil_ref = key->stencil_ref[0];
-      cc.cc1.stencil_write_mask = key->stencil_write_mask[0];
-      cc.cc1.stencil_test_mask = key->stencil_test_mask[0];
-
-      if (key->stencil_two_side) {
-	 cc.cc0.bf_stencil_enable = 1;
-	 cc.cc0.bf_stencil_func =
-	    intel_translate_compare_func(key->stencil_func[1]);
-	 cc.cc0.bf_stencil_fail_op =
-	    intel_translate_stencil_op(key->stencil_fail_op[1]);
-	 cc.cc0.bf_stencil_pass_depth_fail_op =
-	    intel_translate_stencil_op(key->stencil_pass_depth_fail_op[1]);
-	 cc.cc0.bf_stencil_pass_depth_pass_op =
-	    intel_translate_stencil_op(key->stencil_pass_depth_pass_op[1]);
-	 cc.cc1.bf_stencil_ref = key->stencil_ref[1];
-	 cc.cc2.bf_stencil_write_mask = key->stencil_write_mask[1];
-	 cc.cc2.bf_stencil_test_mask = key->stencil_test_mask[1];
-      }
-
-      /* Not really sure about this:
-       */
-      if (key->stencil_write_mask[0] ||
-	  (key->stencil_two_side && key->stencil_write_mask[1]))
-	 cc.cc0.stencil_write_enable = 1;
-   }
-
-   /* _NEW_COLOR */
-   if (key->logic_op != GL_COPY) {
-      cc.cc2.logicop_enable = 1;
-      cc.cc5.logicop_func = intel_translate_logic_op(key->logic_op);
-   } else if (key->color_blend) {
-      GLenum eqRGB = key->blend_eq_rgb;
-      GLenum eqA = key->blend_eq_a;
-      GLenum srcRGB = key->blend_src_rgb;
-      GLenum dstRGB = key->blend_dst_rgb;
-      GLenum srcA = key->blend_src_a;
-      GLenum dstA = key->blend_dst_a;
-
-      if (eqRGB == GL_MIN || eqRGB == GL_MAX) {
-	 srcRGB = dstRGB = GL_ONE;
-      }
-
-      if (eqA == GL_MIN || eqA == GL_MAX) {
-	 srcA = dstA = GL_ONE;
-      }
-
-      cc.cc6.dest_blend_factor = brw_translate_blend_factor(dstRGB);
-      cc.cc6.src_blend_factor = brw_translate_blend_factor(srcRGB);
-      cc.cc6.blend_function = brw_translate_blend_equation(eqRGB);
-
-      cc.cc5.ia_dest_blend_factor = brw_translate_blend_factor(dstA);
-      cc.cc5.ia_src_blend_factor = brw_translate_blend_factor(srcA);
-      cc.cc5.ia_blend_function = brw_translate_blend_equation(eqA);
-
-      cc.cc3.blend_enable = 1;
-      cc.cc3.ia_blend_enable = (srcA != srcRGB ||
-				dstA != dstRGB ||
-				eqA != eqRGB);
-   }
-
-   if (key->alpha_enabled) {
-      cc.cc3.alpha_test = 1;
-      cc.cc3.alpha_test_func = intel_translate_compare_func(key->alpha_func);
-      cc.cc3.alpha_test_format = BRW_ALPHATEST_FORMAT_UNORM8;
-
-      UNCLAMPED_FLOAT_TO_UBYTE(cc.cc7.alpha_ref.ub[0], key->alpha_ref);
-   }
-
-   if (key->dither) {
-      cc.cc5.dither_enable = 1;
-      cc.cc6.y_dither_offset = 0;
-      cc.cc6.x_dither_offset = 0;
-   }
-
-   /* _NEW_DEPTH */
-   if (key->depth_test) {
-      cc.cc2.depth_test = 1;
-      cc.cc2.depth_test_function = intel_translate_compare_func(key->depth_func);
-      cc.cc2.depth_write_enable = key->depth_write;
-   }
+   cc.cc0 = brw->dsa.cc0;
+   cc.cc1 = brw->dsa.cc1;
+   cc.cc2 = brw->dsa.cc2;
+   cc.cc3 = brw->dsa.cc3 | brw->blend.cc3;
 
    /* CACHE_NEW_CC_VP */
    cc.cc4.cc_viewport_state_offset = brw->cc.vp_bo->offset >> 5; /* reloc */
 
-   if (INTEL_DEBUG & DEBUG_STATS)
-      cc.cc5.statistics_enable = 1;
+   cc.cc5 = brw->blend.cc5 | brw->debug.cc5;
+
 
    bo = brw_upload_cache(&brw->cache, BRW_CC_UNIT,
 			 key, sizeof(*key),
@@ -286,7 +136,7 @@ static void prepare_cc_unit( struct brw_context *brw )
 
 const struct brw_tracked_state brw_cc_unit = {
    .dirty = {
-      .mesa = _NEW_STENCIL | _NEW_COLOR | _NEW_DEPTH,
+      .mesa = PIPE_NEW_DEPTH_STENCIL_ALPHA | PIPE_NEW_BLEND,
       .brw = 0,
       .cache = CACHE_NEW_CC_VP
    },
diff --git a/src/gallium/drivers/i965/brw_clip.c b/src/gallium/drivers/i965/brw_clip.c
index 20a927cf38..df1b3718d0 100644
--- a/src/gallium/drivers/i965/brw_clip.c
+++ b/src/gallium/drivers/i965/brw_clip.c
@@ -29,9 +29,9 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
+#include "pipe/p_state.h"
+
+#include "util/u_math.h"
 
 #include "intel_batchbuffer.h"
 
@@ -83,7 +83,7 @@ static void compile_clip_prog( struct brw_context *brw,
 	 delta += ATTR_SIZE;
       }
 
-   c.nr_attrs = brw_count_bits(c.key.attrs);
+   c.nr_attrs = util_count_bits(c.key.attrs);
    
    if (BRW_IS_IGDNG(brw))
        c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
@@ -104,16 +104,16 @@ static void compile_clip_prog( struct brw_context *brw,
     * do all three:
     */
    switch (key->primitive) {
-   case GL_TRIANGLES: 
+   case PIPE_PRIM_TRIANGLES: 
       if (key->do_unfilled)
 	 brw_emit_unfilled_clip( &c );
       else
 	 brw_emit_tri_clip( &c );
       break;
-   case GL_LINES:
+   case PIPE_PRIM_LINES:
       brw_emit_line_clip( &c );
       break;
-   case GL_POINTS:
+   case PIPE_PRIM_POINTS:
       brw_emit_point_clip( &c );
       break;
    default:
@@ -143,7 +143,6 @@ static void compile_clip_prog( struct brw_context *brw,
  */
 static void upload_clip_prog(struct brw_context *brw)
 {
-   GLcontext *ctx = &brw->intel.ctx;
    struct brw_clip_prog_key key;
 
    memset(&key, 0, sizeof(key));
@@ -151,101 +150,51 @@ static void upload_clip_prog(struct brw_context *brw)
    /* Populate the key:
     */
    /* BRW_NEW_REDUCED_PRIMITIVE */
-   key.primitive = brw->intel.reduced_primitive;
+   key.primitive = brw->reduced_primitive;
    /* CACHE_NEW_VS_PROG */
    key.attrs = brw->vs.prog_data->outputs_written;
-   /* _NEW_LIGHT */
-   key.do_flat_shading = (ctx->Light.ShadeModel == GL_FLAT);
-   /* _NEW_TRANSFORM */
-   key.nr_userclip = brw_count_bits(ctx->Transform.ClipPlanesEnabled);
+   /* PIPE_NEW_RAST */
+   key.do_flat_shading = brw->rast.base.flatshade;
+   /* PIPE_NEW_UCP */
+   key.nr_userclip = brw->nr_ucp;
 
    if (BRW_IS_IGDNG(brw))
        key.clip_mode = BRW_CLIPMODE_KERNEL_CLIP;
    else
        key.clip_mode = BRW_CLIPMODE_NORMAL;
 
-   /* _NEW_POLYGON */
-   if (key.primitive == GL_TRIANGLES) {
-      if (ctx->Polygon.CullFlag &&
-	  ctx->Polygon.CullFaceMode == GL_FRONT_AND_BACK)
+   /* PIPE_NEW_RAST */
+   if (key.primitive == PIPE_PRIM_TRIANGLES) {
+      if (brw->rast->cull_mode = PIPE_WINDING_BOTH)
 	 key.clip_mode = BRW_CLIPMODE_REJECT_ALL;
       else {
-	 GLuint fill_front = CLIP_CULL;
-	 GLuint fill_back = CLIP_CULL;
-	 GLuint offset_front = 0;
-	 GLuint offset_back = 0;
-
-	 if (!ctx->Polygon.CullFlag ||
-	     ctx->Polygon.CullFaceMode != GL_FRONT) {
-	    switch (ctx->Polygon.FrontMode) {
-	    case GL_FILL: 
-	       fill_front = CLIP_FILL; 
-	       offset_front = 0;
-	       break;
-	    case GL_LINE:
-	       fill_front = CLIP_LINE;
-	       offset_front = ctx->Polygon.OffsetLine;
-	       break;
-	    case GL_POINT:
-	       fill_front = CLIP_POINT;
-	       offset_front = ctx->Polygon.OffsetPoint;
-	       break;
-	    }
+	 key.fill_ccw = CLIP_CULL;
+	 key.fill_cw = CLIP_CULL;
+
+	 if (!(brw->rast->cull_mode & PIPE_WINDING_CCW)) {
+	    key.fill_ccw = translate_fill(brw->rast.fill_ccw);
 	 }
 
-	 if (!ctx->Polygon.CullFlag ||
-	     ctx->Polygon.CullFaceMode != GL_BACK) {
-	    switch (ctx->Polygon.BackMode) {
-	    case GL_FILL: 
-	       fill_back = CLIP_FILL; 
-	       offset_back = 0;
-	       break;
-	    case GL_LINE:
-	       fill_back = CLIP_LINE;
-	       offset_back = ctx->Polygon.OffsetLine;
-	       break;
-	    case GL_POINT:
-	       fill_back = CLIP_POINT;
-	       offset_back = ctx->Polygon.OffsetPoint;
-	       break;
-	    }
+	 if (!(brw->rast->cull_mode & PIPE_WINDING_CW)) {
+	    key.fill_cw = translate_fill(brw->rast.fill_cw);
 	 }
 
-	 if (ctx->Polygon.BackMode != GL_FILL ||
-	     ctx->Polygon.FrontMode != GL_FILL) {
+	 if (key.fill_cw != CLIP_FILL ||
+	     key.fill_ccw != CLIP_FILL) {
 	    key.do_unfilled = 1;
-
-	    /* Most cases the fixed function units will handle.  Cases where
-	     * one or more polygon faces are unfilled will require help:
-	     */
 	    key.clip_mode = BRW_CLIPMODE_CLIP_NON_REJECTED;
+	 }
+
+	 key.offset_ccw = brw->rast.offset_ccw;
+	 key.offset_cw = brw->rast.offset_cw;
+
+	 if (brw->rast.light_twoside &&
+	     key.fill_cw != CLIP_CULL) 
+	    key.copy_bfc_cw = 1;
 
-	    if (offset_back || offset_front) {
-	       /* _NEW_POLYGON, _NEW_BUFFERS */
-	       key.offset_units = ctx->Polygon.OffsetUnits * brw->intel.polygon_offset_scale;
-	       key.offset_factor = ctx->Polygon.OffsetFactor * ctx->DrawBuffer->_MRD;
-	    }
-
-	    switch (ctx->Polygon.FrontFace) {
-	    case GL_CCW:
-	       key.fill_ccw = fill_front;
-	       key.fill_cw = fill_back;
-	       key.offset_ccw = offset_front;
-	       key.offset_cw = offset_back;
-	       if (ctx->Light.Model.TwoSide &&
-		   key.fill_cw != CLIP_CULL) 
-		  key.copy_bfc_cw = 1;
-	       break;
-	    case GL_CW:
-	       key.fill_cw = fill_front;
-	       key.fill_ccw = fill_back;
-	       key.offset_cw = offset_front;
-	       key.offset_ccw = offset_back;
-	       if (ctx->Light.Model.TwoSide &&
-		   key.fill_ccw != CLIP_CULL) 
-		  key.copy_bfc_ccw = 1;
-	       break;
-	    }
+	 if (brw->rast.light_twoside &&
+	     key.fill_ccw != CLIP_CULL) 
+	    key.copy_bfc_ccw = 1;
 	 }
       }
    }
@@ -262,10 +211,8 @@ static void upload_clip_prog(struct brw_context *brw)
 
 const struct brw_tracked_state brw_clip_prog = {
    .dirty = {
-      .mesa  = (_NEW_LIGHT | 
-		_NEW_TRANSFORM |
-		_NEW_POLYGON | 
-		_NEW_BUFFERS),
+      .mesa  = (PIPE_NEW_RAST | 
+		PIPE_NEW_UCP),
       .brw   = (BRW_NEW_REDUCED_PRIMITIVE),
       .cache = CACHE_NEW_VS_PROG
    },
diff --git a/src/gallium/drivers/i965/brw_clip.h b/src/gallium/drivers/i965/brw_clip.h
index 957df441ab..d80ec819b9 100644
--- a/src/gallium/drivers/i965/brw_clip.h
+++ b/src/gallium/drivers/i965/brw_clip.h
@@ -43,6 +43,7 @@
  */
 struct brw_clip_prog_key {
    GLuint attrs:32;		
+
    GLuint primitive:4;
    GLuint nr_userclip:3;
    GLuint do_flat_shading:1;
@@ -51,12 +52,10 @@ struct brw_clip_prog_key {
    GLuint fill_ccw:2;		/* includes cull information */
    GLuint offset_cw:1;
    GLuint offset_ccw:1;
-   GLuint pad0:17;
-
    GLuint copy_bfc_cw:1;
    GLuint copy_bfc_ccw:1;
    GLuint clip_mode:3;
-   GLuint pad1:27;
+   GLuint pad1:12;
    
    GLfloat offset_factor;
    GLfloat offset_units;
diff --git a/src/gallium/drivers/i965/brw_clip_line.c b/src/gallium/drivers/i965/brw_clip_line.c
index 048ca620fa..6b4da25644 100644
--- a/src/gallium/drivers/i965/brw_clip_line.c
+++ b/src/gallium/drivers/i965/brw_clip_line.c
@@ -29,13 +29,6 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "shader/program.h"
-
-#include "intel_batchbuffer.h"
-
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_eu.h"
diff --git a/src/gallium/drivers/i965/brw_clip_point.c b/src/gallium/drivers/i965/brw_clip_point.c
index 8458f61c5a..b2cf7b2011 100644
--- a/src/gallium/drivers/i965/brw_clip_point.c
+++ b/src/gallium/drivers/i965/brw_clip_point.c
@@ -29,13 +29,6 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "shader/program.h"
-
-#include "intel_batchbuffer.h"
-
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_eu.h"
diff --git a/src/gallium/drivers/i965/brw_clip_state.c b/src/gallium/drivers/i965/brw_clip_state.c
index 234b3744bf..72e27205e2 100644
--- a/src/gallium/drivers/i965/brw_clip_state.c
+++ b/src/gallium/drivers/i965/brw_clip_state.c
@@ -32,7 +32,6 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "main/macros.h"
 
 struct brw_clip_unit_key {
    unsigned int total_grf;
@@ -66,8 +65,8 @@ clip_unit_populate_key(struct brw_context *brw, struct brw_clip_unit_key *key)
    key->nr_urb_entries = brw->urb.nr_clip_entries;
    key->urb_size = brw->urb.vsize;
 
-   /* _NEW_TRANSOFORM */
-   key->depth_clamp = ctx->Transform.DepthClamp;
+   /*  */
+   key->depth_clamp = 0; // XXX: add this to gallium: ctx->Transform.DepthClamp;
 }
 
 static dri_bo *
@@ -175,7 +174,7 @@ static void upload_clip_unit( struct brw_context *brw )
 
 const struct brw_tracked_state brw_clip_unit = {
    .dirty = {
-      .mesa  = _NEW_TRANSFORM,
+      .mesa  = 0,
       .brw   = (BRW_NEW_CURBE_OFFSETS |
 		BRW_NEW_URB_FENCE),
       .cache = CACHE_NEW_CLIP_PROG
diff --git a/src/gallium/drivers/i965/brw_clip_tri.c b/src/gallium/drivers/i965/brw_clip_tri.c
index 0efd77225e..d8feca6a87 100644
--- a/src/gallium/drivers/i965/brw_clip_tri.c
+++ b/src/gallium/drivers/i965/brw_clip_tri.c
@@ -29,13 +29,6 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "shader/program.h"
-
-#include "intel_batchbuffer.h"
-
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_eu.h"
diff --git a/src/gallium/drivers/i965/brw_clip_unfilled.c b/src/gallium/drivers/i965/brw_clip_unfilled.c
index ad1bfa435f..4baff55806 100644
--- a/src/gallium/drivers/i965/brw_clip_unfilled.c
+++ b/src/gallium/drivers/i965/brw_clip_unfilled.c
@@ -29,11 +29,6 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "shader/program.h"
-
 #include "intel_batchbuffer.h"
 
 #include "brw_defines.h"
diff --git a/src/gallium/drivers/i965/brw_clip_util.c b/src/gallium/drivers/i965/brw_clip_util.c
index 5a73abdfee..7a6c46ce07 100644
--- a/src/gallium/drivers/i965/brw_clip_util.c
+++ b/src/gallium/drivers/i965/brw_clip_util.c
@@ -30,13 +30,6 @@
   */
 
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "shader/program.h"
-
-#include "intel_batchbuffer.h"
-
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_eu.h"
diff --git a/src/gallium/drivers/i965/brw_context.c b/src/gallium/drivers/i965/brw_context.c
index c300c33adc..bf0ec89e13 100644
--- a/src/gallium/drivers/i965/brw_context.c
+++ b/src/gallium/drivers/i965/brw_context.c
@@ -52,122 +52,77 @@
 #include "utils.h"
 
 
-/***************************************
- * Mesa's Driver Functions
- ***************************************/
-
-static void brwUseProgram(GLcontext *ctx, GLuint program)
-{
-   _mesa_use_program(ctx, program);
-}
-
-static void brwInitProgFuncs( struct dd_function_table *functions )
-{
-   functions->UseProgram = brwUseProgram;
-}
-static void brwInitDriverFunctions( struct dd_function_table *functions )
-{
-   intelInitDriverFunctions( functions );
-
-   brwInitFragProgFuncs( functions );
-   brwInitProgFuncs( functions );
-   brw_init_queryobj_functions(functions);
-
-   functions->Viewport = intel_viewport;
-}
 
 GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
 			    __DRIcontextPrivate *driContextPriv,
 			    void *sharedContextPrivate)
 {
-   struct dd_function_table functions;
    struct brw_context *brw = (struct brw_context *) CALLOC_STRUCT(brw_context);
-   struct intel_context *intel = &brw->intel;
-   GLcontext *ctx = &intel->ctx;
 
    if (!brw) {
-      _mesa_printf("%s: failed to alloc context\n", __FUNCTION__);
-      return GL_FALSE;
-   }
-
-   brwInitVtbl( brw );
-   brwInitDriverFunctions( &functions );
-
-   if (!intelInitContext( intel, mesaVis, driContextPriv,
-			  sharedContextPrivate, &functions )) {
-      _mesa_printf("%s: failed to init intel context\n", __FUNCTION__);
-      FREE(brw);
+      debug_printf("%s: failed to alloc context\n", __FUNCTION__);
       return GL_FALSE;
    }
 
-   /* Initialize swrast, tnl driver tables: */
-   intelInitSpanFuncs(ctx);
-
-   TNL_CONTEXT(ctx)->Driver.RunPipeline = _tnl_run_pipeline;
-
-   ctx->Const.MaxTextureImageUnits = BRW_MAX_TEX_UNIT;
-   ctx->Const.MaxTextureCoordUnits = 8; /* Mesa limit */
-   ctx->Const.MaxTextureUnits = MIN2(ctx->Const.MaxTextureCoordUnits,
-                                     ctx->Const.MaxTextureImageUnits);
-   ctx->Const.MaxVertexTextureImageUnits = 0; /* no vertex shader textures */
-
-   /* Mesa limits textures to 4kx4k; it would be nice to fix that someday
-    */
-   ctx->Const.MaxTextureLevels = 13;
-   ctx->Const.Max3DTextureLevels = 9;
-   ctx->Const.MaxCubeTextureLevels = 12;
-   ctx->Const.MaxTextureRectSize = (1<<12);
-   
-   ctx->Const.MaxTextureMaxAnisotropy = 16.0;
-
-   /* if conformance mode is set, swrast can handle any size AA point */
-   ctx->Const.MaxPointSizeAA = 255.0;
-
    /* We want the GLSL compiler to emit code that uses condition codes */
    ctx->Shader.EmitCondCodes = GL_TRUE;
    ctx->Shader.EmitNVTempInitialization = GL_TRUE;
 
-   ctx->Const.VertexProgram.MaxNativeInstructions = (16 * 1024);
-   ctx->Const.VertexProgram.MaxAluInstructions = 0;
-   ctx->Const.VertexProgram.MaxTexInstructions = 0;
-   ctx->Const.VertexProgram.MaxTexIndirections = 0;
-   ctx->Const.VertexProgram.MaxNativeAluInstructions = 0;
-   ctx->Const.VertexProgram.MaxNativeTexInstructions = 0;
-   ctx->Const.VertexProgram.MaxNativeTexIndirections = 0;
-   ctx->Const.VertexProgram.MaxNativeAttribs = 16;
-   ctx->Const.VertexProgram.MaxNativeTemps = 256;
-   ctx->Const.VertexProgram.MaxNativeAddressRegs = 1;
-   ctx->Const.VertexProgram.MaxNativeParameters = 1024;
-   ctx->Const.VertexProgram.MaxEnvParams =
-      MIN2(ctx->Const.VertexProgram.MaxNativeParameters,
-	   ctx->Const.VertexProgram.MaxEnvParams);
-
-   ctx->Const.FragmentProgram.MaxNativeInstructions = (16 * 1024);
-   ctx->Const.FragmentProgram.MaxNativeAluInstructions = (16 * 1024);
-   ctx->Const.FragmentProgram.MaxNativeTexInstructions = (16 * 1024);
-   ctx->Const.FragmentProgram.MaxNativeTexIndirections = (16 * 1024);
-   ctx->Const.FragmentProgram.MaxNativeAttribs = 12;
-   ctx->Const.FragmentProgram.MaxNativeTemps = 256;
-   ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0;
-   ctx->Const.FragmentProgram.MaxNativeParameters = 1024;
-   ctx->Const.FragmentProgram.MaxEnvParams =
-      MIN2(ctx->Const.FragmentProgram.MaxNativeParameters,
-	   ctx->Const.FragmentProgram.MaxEnvParams);
 
+   brw_init_query( brw );
    brw_init_state( brw );
+   brw_draw_init( brw );
 
    brw->state.dirty.mesa = ~0;
    brw->state.dirty.brw = ~0;
 
    brw->emit_state_always = 0;
 
-   ctx->VertexProgram._MaintainTnlProgram = GL_TRUE;
-   ctx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE;
-
    make_empty_list(&brw->query.active_head);
 
-   brw_draw_init( brw );
 
    return GL_TRUE;
 }
 
+/**
+ * Called from intelDestroyContext(); tears down state/draw data and releases the context's regions and buffer objects.
+ */
+static void brw_destroy_context( struct intel_context *intel )
+{
+   struct brw_context *brw = brw_context(&intel->ctx);
+   int i;
+   /* NOTE(review): brw_context() is changed by this patch to take a struct pipe_context *; confirm &intel->ctx still fits. */
+   brw_destroy_state(brw);
+   brw_draw_destroy( brw );
+
+   _mesa_free(brw->wm.compile_data);
+   /* Release every bound color region, reset the count, then release the depth region. */
+   for (i = 0; i < brw->state.nr_color_regions; i++)
+      intel_region_release(&brw->state.color_regions[i]);
+   brw->state.nr_color_regions = 0;
+   intel_region_release(&brw->state.depth_region);
+
+   dri_bo_unreference(brw->curbe.curbe_bo);
+   dri_bo_unreference(brw->vs.prog_bo);
+   dri_bo_unreference(brw->vs.state_bo);
+   dri_bo_unreference(brw->vs.bind_bo);
+   dri_bo_unreference(brw->gs.prog_bo);
+   dri_bo_unreference(brw->gs.state_bo);
+   dri_bo_unreference(brw->clip.prog_bo);
+   dri_bo_unreference(brw->clip.state_bo);
+   dri_bo_unreference(brw->clip.vp_bo);
+   dri_bo_unreference(brw->sf.prog_bo);
+   dri_bo_unreference(brw->sf.state_bo);
+   dri_bo_unreference(brw->sf.vp_bo);
+   for (i = 0; i < BRW_MAX_TEX_UNIT; i++)
+      dri_bo_unreference(brw->wm.sdc_bo[i]);
+   dri_bo_unreference(brw->wm.bind_bo);
+   for (i = 0; i < BRW_WM_MAX_SURF; i++)
+      dri_bo_unreference(brw->wm.surf_bo[i]);
+   dri_bo_unreference(brw->wm.sampler_bo);
+   dri_bo_unreference(brw->wm.prog_bo);
+   dri_bo_unreference(brw->wm.state_bo);
+   dri_bo_unreference(brw->cc.prog_bo);
+   dri_bo_unreference(brw->cc.state_bo);
+   dri_bo_unreference(brw->cc.vp_bo);
+}
diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h
index fa3e32c7ff..009e28b227 100644
--- a/src/gallium/drivers/i965/brw_context.h
+++ b/src/gallium/drivers/i965/brw_context.h
@@ -115,7 +115,6 @@
  * Handles blending and (presumably) depth and stencil testing.
  */
 
-#define BRW_FALLBACK_TEXTURE		 0x1
 #define BRW_MAX_CURBE                    (32*16)
 
 struct brw_context;
@@ -450,11 +449,9 @@ struct brw_query_object {
  */
 struct brw_context 
 {
-   struct intel_context intel;  /**< base class, must be first field */
    GLuint primitive;
 
    GLboolean emit_state_always;
-   GLboolean tmp_fallback;
    GLboolean no_batch_wrap;
 
    struct {
@@ -692,7 +689,7 @@ GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
 /*======================================================================
  * brw_queryobj.c
  */
-void brw_init_queryobj_functions(struct dd_function_table *functions);
+void brw_init_query(struct brw_context *brw);
 void brw_prepare_query_begin(struct brw_context *brw);
 void brw_emit_query_begin(struct brw_context *brw);
 void brw_emit_query_end(struct brw_context *brw);
@@ -730,7 +727,7 @@ int brw_disasm (FILE *file, struct brw_instruction *inst);
  * macros used previously:
  */
 static INLINE struct brw_context *
-brw_context( GLcontext *ctx )
+brw_context( struct pipe_context *ctx )
 {
    return (struct brw_context *)ctx;
 }
diff --git a/src/gallium/drivers/i965/brw_curbe.c b/src/gallium/drivers/i965/brw_curbe.c
index 4be6c77aa1..3e32c4983d 100644
--- a/src/gallium/drivers/i965/brw_curbe.c
+++ b/src/gallium/drivers/i965/brw_curbe.c
@@ -30,14 +30,6 @@
   */
 
 
-
-#include "main/glheader.h"
-#include "main/context.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
-#include "shader/prog_statevars.h"
 #include "intel_batchbuffer.h"
 #include "intel_regions.h"
 #include "brw_context.h"
@@ -64,31 +56,17 @@ static void calculate_curbe_offsets( struct brw_context *brw )
    GLuint nr_clip_regs = 0;
    GLuint total_regs;
 
-   /* _NEW_TRANSFORM */
-   if (ctx->Transform.ClipPlanesEnabled) {
-      GLuint nr_planes = 6 + brw_count_bits(ctx->Transform.ClipPlanesEnabled);
+   /* PIPE_NEW_UCP */
+   if (brw->nr_ucp) {
+      GLuint nr_planes = 6 + brw->nr_ucp;
       nr_clip_regs = (nr_planes * 4 + 15) / 16;
    }
 
 
    total_regs = nr_fp_regs + nr_vp_regs + nr_clip_regs;
 
-   /* This can happen - what to do?  Probably rather than falling
-    * back, the best thing to do is emit programs which code the
-    * constants as immediate values.  Could do this either as a static
-    * cap on WM and VS, or adaptively.
-    *
-    * Unfortunately, this is currently dependent on the results of the
-    * program generation process (in the case of wm), so this would
-    * introduce the need to re-generate programs in the event of a
-    * curbe allocation failure.
-    */
-   /* Max size is 32 - just large enough to
-    * hold the 128 parameters allowed by
-    * the fragment and vertex program
-    * api's.  It's not clear what happens
-    * when both VP and FP want to use 128
-    * parameters, though. 
+   /* When total_regs exceeds 32 we will want a true constant buffer to
+    * hold the extra constants; until then that case trips the assert.
     */
    assert(total_regs <= 32);
 
@@ -113,8 +91,8 @@ static void calculate_curbe_offsets( struct brw_context *brw )
       brw->curbe.vs_size = nr_vp_regs; reg += nr_vp_regs;
       brw->curbe.total_size = reg;
 
-      if (0)
-	 _mesa_printf("curbe wm %d+%d clip %d+%d vs %d+%d\n",
+      if (BRW_DEBUG & DEBUG_CURBE)
+	 debug_printf("curbe wm %d+%d clip %d+%d vs %d+%d\n",
 		      brw->curbe.wm_start,
 		      brw->curbe.wm_size,
 		      brw->curbe.clip_start,
@@ -129,7 +107,7 @@ static void calculate_curbe_offsets( struct brw_context *brw )
 
 const struct brw_tracked_state brw_curbe_offsets = {
    .dirty = {
-      .mesa = _NEW_TRANSFORM,
+      .mesa = PIPE_NEW_UCP,
       .brw  = BRW_NEW_VERTEX_PROGRAM,
       .cache = CACHE_NEW_WM_PROG
    },
@@ -204,11 +182,13 @@ static void prepare_constant_buffer(struct brw_context *brw)
    if (brw->curbe.wm_size) {
       GLuint offset = brw->curbe.wm_start * 16;
 
-      _mesa_load_state_parameters(ctx, fp->program.Base.Parameters); 
+      /* map fs constant buffer */
 
       /* copy float constants */
       for (i = 0; i < brw->wm.prog_data->nr_params; i++) 
 	 buf[offset + i] = *brw->wm.prog_data->param[i];
+
+      /* unmap fs constant buffer */
    }
 
 
@@ -228,18 +208,15 @@ static void prepare_constant_buffer(struct brw_context *brw)
 	 buf[offset + i * 4 + 3] = fixed_plane[i][3];
       }
 
-      /* Clip planes: _NEW_TRANSFORM plus _NEW_PROJECTION to get to
-       * clip-space:
+      /* Clip planes:
        */
-      assert(MAX_CLIP_PLANES == 6);
-      for (j = 0; j < MAX_CLIP_PLANES; j++) {
-	 if (ctx->Transform.ClipPlanesEnabled & (1<<j)) {
-	    buf[offset + i * 4 + 0] = ctx->Transform._ClipUserPlane[j][0];
-	    buf[offset + i * 4 + 1] = ctx->Transform._ClipUserPlane[j][1];
-	    buf[offset + i * 4 + 2] = ctx->Transform._ClipUserPlane[j][2];
-	    buf[offset + i * 4 + 3] = ctx->Transform._ClipUserPlane[j][3];
-	    i++;
-	 }
+      assert(brw->nr_ucp <= 6);
+      for (j = 0; j < brw->nr_ucp; j++) {
+	 buf[offset + i * 4 + 0] = brw->ucp[j][0];
+	 buf[offset + i * 4 + 1] = brw->ucp[j][1];
+	 buf[offset + i * 4 + 2] = brw->ucp[j][2];
+	 buf[offset + i * 4 + 3] = brw->ucp[j][3];
+	 i++;
       }
    }
 
@@ -248,13 +225,7 @@ static void prepare_constant_buffer(struct brw_context *brw)
       GLuint offset = brw->curbe.vs_start * 16;
       GLuint nr = brw->vs.prog_data->nr_params / 4;
 
-      if (brw->vertex_program->IsNVProgram)
-	 _mesa_load_tracked_matrices(ctx);
-
-      /* Updates the ParamaterValues[i] pointers for all parameters of the
-       * basic type of PROGRAM_STATE_VAR.
-       */
-      _mesa_load_state_parameters(ctx, vp->program.Base.Parameters); 
+      /* map vs constant buffer */
 
       /* XXX just use a memcpy here */
       for (i = 0; i < nr; i++) {
@@ -264,14 +235,16 @@ static void prepare_constant_buffer(struct brw_context *brw)
 	 buf[offset + i * 4 + 2] = value[2];
 	 buf[offset + i * 4 + 3] = value[3];
       }
+
+      /* unmap vs constant buffer */
    }
 
    if (0) {
       for (i = 0; i < sz*16; i+=4) 
-	 _mesa_printf("curbe %d.%d: %f %f %f %f\n", i/8, i&4,
+	 debug_printf("curbe %d.%d: %f %f %f %f\n", i/8, i&4,
 		      buf[i+0], buf[i+1], buf[i+2], buf[i+3]);
 
-      _mesa_printf("last_buf %p buf %p sz %d/%d cmp %d\n",
+      debug_printf("last_buf %p buf %p sz %d/%d cmp %d\n",
 		   brw->curbe.last_buf, buf,
 		   bufsz, brw->curbe.last_bufsz,
 		   brw->curbe.last_buf ? memcmp(buf, brw->curbe.last_buf, bufsz) : -1);
@@ -282,12 +255,12 @@ static void prepare_constant_buffer(struct brw_context *brw)
        bufsz == brw->curbe.last_bufsz &&
        memcmp(buf, brw->curbe.last_buf, bufsz) == 0) {
       /* constants have not changed */
-      _mesa_free(buf);
+      FREE(buf);
    } 
    else {
       /* constants have changed */
       if (brw->curbe.last_buf)
-	 _mesa_free(brw->curbe.last_buf);
+	 FREE(brw->curbe.last_buf);
 
       brw->curbe.last_buf = buf;
       brw->curbe.last_bufsz = bufsz;
@@ -353,15 +326,11 @@ static void emit_constant_buffer(struct brw_context *brw)
    ADVANCE_BATCH();
 }
 
-/* This tracked state is unique in that the state it monitors varies
- * dynamically depending on the parameters tracked by the fragment and
- * vertex programs.  This is the template used as a starting point,
- * each context will maintain a copy of this internally and update as
- * required.
- */
 const struct brw_tracked_state brw_constant_buffer = {
    .dirty = {
-      .mesa = _NEW_PROGRAM_CONSTANTS,
+      .mesa = (PIPE_NEW_FS_CONSTANTS |
+	       PIPE_NEW_VS_CONSTANTS |
+	       PIPE_NEW_UCP),
       .brw  = (BRW_NEW_FRAGMENT_PROGRAM |
 	       BRW_NEW_VERTEX_PROGRAM |
 	       BRW_NEW_URB_FENCE | /* Implicit - hardware requires this, not used above */
diff --git a/src/gallium/drivers/i965/brw_defines.h b/src/gallium/drivers/i965/brw_defines.h
index 78d457ad2b..282c5b18f4 100644
--- a/src/gallium/drivers/i965/brw_defines.h
+++ b/src/gallium/drivers/i965/brw_defines.h
@@ -840,8 +840,8 @@
 
 #include "intel_chipset.h"
 
-#define BRW_IS_G4X(brw)         (IS_G4X((brw)->intel.intelScreen->deviceID))
-#define BRW_IS_IGDNG(brw)         (IS_IGDNG((brw)->intel.intelScreen->deviceID))
+#define BRW_IS_G4X(brw)         (IS_G4X((brw)->brw_screen->deviceID))
+#define BRW_IS_IGDNG(brw)         (IS_IGDNG((brw)->brw_screen->deviceID))
 #define BRW_IS_965(brw)         (!(BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)))
 #define CMD_PIPELINE_SELECT(brw)        ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_PIPELINE_SELECT_GM45 : CMD_PIPELINE_SELECT_965)
 #define CMD_VF_STATISTICS(brw)          ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_VF_STATISTICS_GM45 : CMD_VF_STATISTICS_965)
diff --git a/src/gallium/drivers/i965/brw_disasm.c b/src/gallium/drivers/i965/brw_disasm.c
index 9fef230507..a84c581c03 100644
--- a/src/gallium/drivers/i965/brw_disasm.c
+++ b/src/gallium/drivers/i965/brw_disasm.c
@@ -27,8 +27,6 @@
 #include <unistd.h>
 #include <stdarg.h>
 
-#include "main/mtypes.h"
-
 #include "brw_context.h"
 #include "brw_defines.h"
 
diff --git a/src/gallium/drivers/i965/brw_draw.c b/src/gallium/drivers/i965/brw_draw.c
index 44bb7bd588..8cd117c24f 100644
--- a/src/gallium/drivers/i965/brw_draw.c
+++ b/src/gallium/drivers/i965/brw_draw.c
@@ -39,14 +39,13 @@
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_state.h"
-#include "brw_fallback.h"
 
 #include "intel_batchbuffer.h"
 #include "intel_buffer_objects.h"
 
 #define FILE_DEBUG_FLAG DEBUG_BATCH
 
-static GLuint prim_to_hw_prim[GL_POLYGON+1] = {
+static uint32_t prim_to_hw_prim[PIPE_PRIM_POLYGON+1] = {
    _3DPRIM_POINTLIST,
    _3DPRIM_LINELIST,
    _3DPRIM_LINELOOP,
@@ -60,19 +59,6 @@ static GLuint prim_to_hw_prim[GL_POLYGON+1] = {
 };
 
 
-static const GLenum reduced_prim[GL_POLYGON+1] = {  
-   GL_POINTS,
-   GL_LINES,
-   GL_LINES,
-   GL_LINES,
-   GL_TRIANGLES,
-   GL_TRIANGLES,
-   GL_TRIANGLES,
-   GL_TRIANGLES,
-   GL_TRIANGLES,
-   GL_TRIANGLES
-};
-
 
 /* When the primitive changes, set a state bit and re-validate.  Not
  * the nicest and would rather deal with this by having all the
@@ -196,102 +182,6 @@ static void brw_merge_inputs( struct brw_context *brw,
       brw->state.dirty.brw |= BRW_NEW_INPUT_DIMENSIONS;
 }
 
-/* XXX: could split the primitive list to fallback only on the
- * non-conformant primitives.
- */
-static GLboolean check_fallbacks( struct brw_context *brw,
-				  const struct _mesa_prim *prim,
-				  GLuint nr_prims )
-{
-   GLcontext *ctx = &brw->intel.ctx;
-   GLuint i;
-
-   /* If we don't require strict OpenGL conformance, never 
-    * use fallbacks.  If we're forcing fallbacks, always
-    * use fallfacks.
-    */
-   if (brw->intel.conformance_mode == 0)
-      return GL_FALSE;
-
-   if (brw->intel.conformance_mode == 2)
-      return GL_TRUE;
-
-   if (ctx->Polygon.SmoothFlag) {
-      for (i = 0; i < nr_prims; i++)
-	 if (reduced_prim[prim[i].mode] == GL_TRIANGLES) 
-	    return GL_TRUE;
-   }
-
-   /* BRW hardware will do AA lines, but they are non-conformant it
-    * seems.  TBD whether we keep this fallback:
-    */
-   if (ctx->Line.SmoothFlag) {
-      for (i = 0; i < nr_prims; i++)
-	 if (reduced_prim[prim[i].mode] == GL_LINES) 
-	    return GL_TRUE;
-   }
-
-   /* Stipple -- these fallbacks could be resolved with a little
-    * bit of work?
-    */
-   if (ctx->Line.StippleFlag) {
-      for (i = 0; i < nr_prims; i++) {
-	 /* GS doesn't get enough information to know when to reset
-	  * the stipple counter?!?
-	  */
-	 if (prim[i].mode == GL_LINE_LOOP || prim[i].mode == GL_LINE_STRIP) 
-	    return GL_TRUE;
-	    
-	 if (prim[i].mode == GL_POLYGON &&
-	     (ctx->Polygon.FrontMode == GL_LINE ||
-	      ctx->Polygon.BackMode == GL_LINE))
-	    return GL_TRUE;
-      }
-   }
-
-   if (ctx->Point.SmoothFlag) {
-      for (i = 0; i < nr_prims; i++)
-	 if (prim[i].mode == GL_POINTS) 
-	    return GL_TRUE;
-   }
-
-   /* BRW hardware doesn't handle GL_CLAMP texturing correctly;
-    * brw_wm_sampler_state:translate_wrap_mode() treats GL_CLAMP
-    * as GL_CLAMP_TO_EDGE instead.  If we're using GL_CLAMP, and
-    * we want strict conformance, force the fallback.
-    * Right now, we only do this for 2D textures.
-    */
-   {
-      int u;
-      for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
-         struct gl_texture_unit *texUnit = &ctx->Texture.Unit[u];
-         if (texUnit->Enabled) {
-            if (texUnit->Enabled & TEXTURE_1D_BIT) {
-               if (texUnit->CurrentTex[TEXTURE_1D_INDEX]->WrapS == GL_CLAMP) {
-                   return GL_TRUE;
-               }
-            }
-            if (texUnit->Enabled & TEXTURE_2D_BIT) {
-               if (texUnit->CurrentTex[TEXTURE_2D_INDEX]->WrapS == GL_CLAMP ||
-                   texUnit->CurrentTex[TEXTURE_2D_INDEX]->WrapT == GL_CLAMP) {
-                   return GL_TRUE;
-               }
-            }
-            if (texUnit->Enabled & TEXTURE_3D_BIT) {
-               if (texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapS == GL_CLAMP ||
-                   texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapT == GL_CLAMP ||
-                   texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapR == GL_CLAMP) {
-                   return GL_TRUE;
-               }
-            }
-         }
-      }
-   }
-      
-   /* Nothing stopping us from the fast path now */
-   return GL_FALSE;
-}
-
 /* May fail if out of video memory for texture or vbo upload, or on
  * fallback conditions.
  */
@@ -308,23 +198,12 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
    GLboolean retval = GL_FALSE;
    GLboolean warn = GL_FALSE;
    GLboolean first_time = GL_TRUE;
+   uint32_t hw_prim;
    GLuint i;
 
    if (ctx->NewState)
       _mesa_update_state( ctx );
 
-   /* We have to validate the textures *before* checking for fallbacks;
-    * otherwise, the software fallback won't be able to rely on the
-    * texture state, the firstLevel and lastLevel fields won't be
-    * set in the intel texture object (they'll both be 0), and the 
-    * software fallback will segfault if it attempts to access any
-    * texture level other than level 0.
-    */
-   brw_validate_textures( brw );
-
-   if (check_fallbacks(brw, prim, nr_prims))
-      return GL_FALSE;
-
    /* Bind all inputs, derive varying and size information:
     */
    brw_merge_inputs( brw, arrays );
@@ -336,90 +215,30 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
    brw->vb.max_index = max_index;
    brw->state.dirty.brw |= BRW_NEW_VERTICES;
 
-   /* Have to validate state quite late.  Will rebuild tnl_program,
-    * which depends on varying information.  
-    * 
-    * Note this is where brw->vs->prog_data.inputs_read is calculated,
-    * so can't access it earlier.
-    */
-
-   LOCK_HARDWARE(intel);
-
-   if (!intel->constant_cliprect && intel->driDrawable->numClipRects == 0) {
-      UNLOCK_HARDWARE(intel);
-      return GL_TRUE;
-   }
-
-   for (i = 0; i < nr_prims; i++) {
-      uint32_t hw_prim;
-
-      /* Flush the batch if it's approaching full, so that we don't wrap while
-       * we've got validated state that needs to be in the same batch as the
-       * primitives.  This fraction is just a guess (minimal full state plus
-       * a primitive is around 512 bytes), and would be better if we had
-       * an upper bound of how much we might emit in a single
-       * brw_try_draw_prims().
-       */
-      intel_batchbuffer_require_space(intel->batch, intel->batch->size / 4,
-				      LOOP_CLIPRECTS);
-
-      hw_prim = brw_set_prim(brw, prim[i].mode);
-
-      if (first_time || (brw->state.dirty.brw & BRW_NEW_PRIMITIVE)) {
-	 first_time = GL_FALSE;
-
-	 brw_validate_state(brw);
-
-	 /* Various fallback checks:  */
-	 if (brw->intel.Fallback)
-	    goto out;
-
-	 /* Check that we can fit our state in with our existing batchbuffer, or
-	  * flush otherwise.
-	  */
-	 if (dri_bufmgr_check_aperture_space(brw->state.validated_bos,
-					     brw->state.validated_bo_count)) {
-	    static GLboolean warned;
-	    intel_batchbuffer_flush(intel->batch);
-
-	    /* Validate the state after we flushed the batch (which would have
-	     * changed the set of dirty state).  If we still fail to
-	     * check_aperture, warn of what's happening, but attempt to continue
-	     * on since it may succeed anyway, and the user would probably rather
-	     * see a failure and a warning than a fallback.
-	     */
-	    brw_validate_state(brw);
-	    if (!warned &&
-		dri_bufmgr_check_aperture_space(brw->state.validated_bos,
-						brw->state.validated_bo_count)) {
-	       warn = GL_TRUE;
-	       warned = GL_TRUE;
-	    }
-	 }
-
-	 brw_upload_state(brw);
-      }
+   hw_prim = brw_set_prim(brw, prim[i].mode);
 
-      brw_emit_prim(brw, &prim[i], hw_prim);
+   brw_validate_state(brw);
 
-      retval = GL_TRUE;
-   }
+   /* Check that we can fit our state in with our existing batchbuffer, or
+    * flush otherwise.
+    */
+   ret = dri_bufmgr_check_aperture_space(brw->state.validated_bos,
+					 brw->state.validated_bo_count);
+   if (ret)
+      return ret;
+
+   ret = brw_upload_state(brw);
+   if (ret)
+      return ret;
+   
+   ret = brw_emit_prim(brw, &prim[i], hw_prim);
+   if (ret)
+      return ret;
 
    if (intel->always_flush_batch)
       intel_batchbuffer_flush(intel->batch);
- out:
-   UNLOCK_HARDWARE(intel);
-
-   brw_state_cache_check_size(brw);
-
-   if (warn)
-      fprintf(stderr, "i965: Single primitive emit potentially exceeded "
-	      "available aperture space\n");
 
-   if (!retval)
-      DBG("%s failed\n", __FUNCTION__);
-
-   return retval;
+   return 0;
 }
 
 void brw_draw_prims( GLcontext *ctx,
@@ -431,37 +250,26 @@ void brw_draw_prims( GLcontext *ctx,
 		     GLuint min_index,
 		     GLuint max_index )
 {
-   GLboolean retval;
+   enum pipe_error ret;
 
    if (!vbo_all_varyings_in_vbos(arrays)) {
       if (!index_bounds_valid)
 	 vbo_get_minmax_index(ctx, prim, ib, &min_index, &max_index);
-
-      /* Decide if we want to rebase.  If so we end up recursing once
-       * only into this function.
-       */
-      if (min_index != 0) {
-	 vbo_rebase_prims(ctx, arrays,
-			  prim, nr_prims,
-			  ib, min_index, max_index,
-			  brw_draw_prims );
-	 return;
-      }
    }
 
    /* Make a first attempt at drawing:
     */
-   retval = brw_try_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
+   ret = brw_try_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
 
    /* Otherwise, we really are out of memory.  Pass the drawing
     * command to the software tnl module and which will in turn call
     * swrast to do the drawing.
     */
-   if (!retval) {
-       _swsetup_Wakeup(ctx);
-      _tnl_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
+   if (ret != 0) {
+      intel_batchbuffer_flush(intel->batch);
+      ret = brw_try_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
+      assert(ret == 0);
    }
-
 }
 
 void brw_draw_init( struct brw_context *brw )
diff --git a/src/gallium/drivers/i965/brw_draw_upload.c b/src/gallium/drivers/i965/brw_draw_upload.c
index a3ff6c58d8..ad3ef6b7dd 100644
--- a/src/gallium/drivers/i965/brw_draw_upload.c
+++ b/src/gallium/drivers/i965/brw_draw_upload.c
@@ -25,13 +25,9 @@
  * 
  **************************************************************************/
 
+#include "pipe/p_context.h"
 
-#include "main/glheader.h"
-#include "main/bufferobj.h"
-#include "main/context.h"
-#include "main/state.h"
-#include "main/api_validate.h"
-#include "main/enums.h"
+#include "util/u_upload_mgr.h"
 
 #include "brw_draw.h"
 #include "brw_defines.h"
@@ -43,303 +39,157 @@
 #include "intel_buffer_objects.h"
 #include "intel_tex.h"
 
-static GLuint double_types[5] = {
-   0,
-   BRW_SURFACEFORMAT_R64_FLOAT,
-   BRW_SURFACEFORMAT_R64G64_FLOAT,
-   BRW_SURFACEFORMAT_R64G64B64_FLOAT,
-   BRW_SURFACEFORMAT_R64G64B64A64_FLOAT
-};
-
-static GLuint float_types[5] = {
-   0,
-   BRW_SURFACEFORMAT_R32_FLOAT,
-   BRW_SURFACEFORMAT_R32G32_FLOAT,
-   BRW_SURFACEFORMAT_R32G32B32_FLOAT,
-   BRW_SURFACEFORMAT_R32G32B32A32_FLOAT
-};
-
-static GLuint uint_types_norm[5] = {
-   0,
-   BRW_SURFACEFORMAT_R32_UNORM,
-   BRW_SURFACEFORMAT_R32G32_UNORM,
-   BRW_SURFACEFORMAT_R32G32B32_UNORM,
-   BRW_SURFACEFORMAT_R32G32B32A32_UNORM
-};
-
-static GLuint uint_types_scale[5] = {
-   0,
-   BRW_SURFACEFORMAT_R32_USCALED,
-   BRW_SURFACEFORMAT_R32G32_USCALED,
-   BRW_SURFACEFORMAT_R32G32B32_USCALED,
-   BRW_SURFACEFORMAT_R32G32B32A32_USCALED
-};
-
-static GLuint int_types_norm[5] = {
-   0,
-   BRW_SURFACEFORMAT_R32_SNORM,
-   BRW_SURFACEFORMAT_R32G32_SNORM,
-   BRW_SURFACEFORMAT_R32G32B32_SNORM,
-   BRW_SURFACEFORMAT_R32G32B32A32_SNORM
-};
-
-static GLuint int_types_scale[5] = {
-   0,
-   BRW_SURFACEFORMAT_R32_SSCALED,
-   BRW_SURFACEFORMAT_R32G32_SSCALED,
-   BRW_SURFACEFORMAT_R32G32B32_SSCALED,
-   BRW_SURFACEFORMAT_R32G32B32A32_SSCALED
-};
-
-static GLuint ushort_types_norm[5] = {
-   0,
-   BRW_SURFACEFORMAT_R16_UNORM,
-   BRW_SURFACEFORMAT_R16G16_UNORM,
-   BRW_SURFACEFORMAT_R16G16B16_UNORM,
-   BRW_SURFACEFORMAT_R16G16B16A16_UNORM
-};
-
-static GLuint ushort_types_scale[5] = {
-   0,
-   BRW_SURFACEFORMAT_R16_USCALED,
-   BRW_SURFACEFORMAT_R16G16_USCALED,
-   BRW_SURFACEFORMAT_R16G16B16_USCALED,
-   BRW_SURFACEFORMAT_R16G16B16A16_USCALED
-};
-
-static GLuint short_types_norm[5] = {
-   0,
-   BRW_SURFACEFORMAT_R16_SNORM,
-   BRW_SURFACEFORMAT_R16G16_SNORM,
-   BRW_SURFACEFORMAT_R16G16B16_SNORM,
-   BRW_SURFACEFORMAT_R16G16B16A16_SNORM
-};
-
-static GLuint short_types_scale[5] = {
-   0,
-   BRW_SURFACEFORMAT_R16_SSCALED,
-   BRW_SURFACEFORMAT_R16G16_SSCALED,
-   BRW_SURFACEFORMAT_R16G16B16_SSCALED,
-   BRW_SURFACEFORMAT_R16G16B16A16_SSCALED
-};
 
-static GLuint ubyte_types_norm[5] = {
-   0,
-   BRW_SURFACEFORMAT_R8_UNORM,
-   BRW_SURFACEFORMAT_R8G8_UNORM,
-   BRW_SURFACEFORMAT_R8G8B8_UNORM,
-   BRW_SURFACEFORMAT_R8G8B8A8_UNORM
-};
 
-static GLuint ubyte_types_scale[5] = {
-   0,
-   BRW_SURFACEFORMAT_R8_USCALED,
-   BRW_SURFACEFORMAT_R8G8_USCALED,
-   BRW_SURFACEFORMAT_R8G8B8_USCALED,
-   BRW_SURFACEFORMAT_R8G8B8A8_USCALED
-};
-
-static GLuint byte_types_norm[5] = {
-   0,
-   BRW_SURFACEFORMAT_R8_SNORM,
-   BRW_SURFACEFORMAT_R8G8_SNORM,
-   BRW_SURFACEFORMAT_R8G8B8_SNORM,
-   BRW_SURFACEFORMAT_R8G8B8A8_SNORM
-};
 
-static GLuint byte_types_scale[5] = {
-   0,
-   BRW_SURFACEFORMAT_R8_SSCALED,
-   BRW_SURFACEFORMAT_R8G8_SSCALED,
-   BRW_SURFACEFORMAT_R8G8B8_SSCALED,
-   BRW_SURFACEFORMAT_R8G8B8A8_SSCALED
-};
-
-
-/**
- * Given vertex array type/size/format/normalized info, return
- * the appopriate hardware surface type.
- * Format will be GL_RGBA or possibly GL_BGRA for GLubyte[4] color arrays.
- */
-static GLuint get_surface_type( GLenum type, GLuint size,
-                                GLenum format, GLboolean normalized )
+unsigned brw_translate_surface_format( unsigned id )
 {
-   if (INTEL_DEBUG & DEBUG_VERTS)
-      _mesa_printf("type %s size %d normalized %d\n", 
-		   _mesa_lookup_enum_by_nr(type), size, normalized);
-
-   if (normalized) {
-      switch (type) {
-      case GL_DOUBLE: return double_types[size];
-      case GL_FLOAT: return float_types[size];
-      case GL_INT: return int_types_norm[size];
-      case GL_SHORT: return short_types_norm[size];
-      case GL_BYTE: return byte_types_norm[size];
-      case GL_UNSIGNED_INT: return uint_types_norm[size];
-      case GL_UNSIGNED_SHORT: return ushort_types_norm[size];
-      case GL_UNSIGNED_BYTE:
-         if (format == GL_BGRA) {
-            /* See GL_EXT_vertex_array_bgra */
-            assert(size == 4);
-            return BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
-         }
-         else {
-            return ubyte_types_norm[size];
-         }
-      default: assert(0); return 0;
-      }      
-   }
-   else {
-      assert(format == GL_RGBA); /* sanity check */
-      switch (type) {
-      case GL_DOUBLE: return double_types[size];
-      case GL_FLOAT: return float_types[size];
-      case GL_INT: return int_types_scale[size];
-      case GL_SHORT: return short_types_scale[size];
-      case GL_BYTE: return byte_types_scale[size];
-      case GL_UNSIGNED_INT: return uint_types_scale[size];
-      case GL_UNSIGNED_SHORT: return ushort_types_scale[size];
-      case GL_UNSIGNED_BYTE: return ubyte_types_scale[size];
-      default: assert(0); return 0;
-      }      
+   switch (id) {
+   case PIPE_FORMAT_R64_FLOAT:
+      return BRW_SURFACEFORMAT_R64_FLOAT;
+   case PIPE_FORMAT_R64G64_FLOAT:
+      return BRW_SURFACEFORMAT_R64G64_FLOAT;
+   case PIPE_FORMAT_R64G64B64_FLOAT:
+      return BRW_SURFACEFORMAT_R64G64B64_FLOAT;
+   case PIPE_FORMAT_R64G64B64A64_FLOAT:
+      return BRW_SURFACEFORMAT_R64G64B64A64_FLOAT;
+
+   case PIPE_FORMAT_R32_FLOAT:
+      return BRW_SURFACEFORMAT_R32_FLOAT;
+   case PIPE_FORMAT_R32G32_FLOAT:
+      return BRW_SURFACEFORMAT_R32G32_FLOAT;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      return BRW_SURFACEFORMAT_R32G32B32_FLOAT;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      return BRW_SURFACEFORMAT_R32G32B32A32_FLOAT;
+
+   case PIPE_FORMAT_R32_UNORM:
+      return BRW_SURFACEFORMAT_R32_UNORM;
+   case PIPE_FORMAT_R32G32_UNORM:
+      return BRW_SURFACEFORMAT_R32G32_UNORM;
+   case PIPE_FORMAT_R32G32B32_UNORM:
+      return BRW_SURFACEFORMAT_R32G32B32_UNORM;
+   case PIPE_FORMAT_R32G32B32A32_UNORM:
+      return BRW_SURFACEFORMAT_R32G32B32A32_UNORM;
+
+   case PIPE_FORMAT_R32_USCALED:
+      return BRW_SURFACEFORMAT_R32_USCALED;
+   case PIPE_FORMAT_R32G32_USCALED:
+      return BRW_SURFACEFORMAT_R32G32_USCALED;
+   case PIPE_FORMAT_R32G32B32_USCALED:
+      return BRW_SURFACEFORMAT_R32G32B32_USCALED;
+   case PIPE_FORMAT_R32G32B32A32_USCALED:
+      return BRW_SURFACEFORMAT_R32G32B32A32_USCALED;
+
+   case PIPE_FORMAT_R32_SNORM:
+      return BRW_SURFACEFORMAT_R32_SNORM;
+   case PIPE_FORMAT_R32G32_SNORM:
+      return BRW_SURFACEFORMAT_R32G32_SNORM;
+   case PIPE_FORMAT_R32G32B32_SNORM:
+      return BRW_SURFACEFORMAT_R32G32B32_SNORM;
+   case PIPE_FORMAT_R32G32B32A32_SNORM:
+      return BRW_SURFACEFORMAT_R32G32B32A32_SNORM;
+
+   case PIPE_FORMAT_R32_SSCALED:
+      return BRW_SURFACEFORMAT_R32_SSCALED;
+   case PIPE_FORMAT_R32G32_SSCALED:
+      return BRW_SURFACEFORMAT_R32G32_SSCALED;
+   case PIPE_FORMAT_R32G32B32_SSCALED:
+      return BRW_SURFACEFORMAT_R32G32B32_SSCALED;
+   case PIPE_FORMAT_R32G32B32A32_SSCALED:
+      return BRW_SURFACEFORMAT_R32G32B32A32_SSCALED;
+
+   case PIPE_FORMAT_R16_UNORM:
+      return BRW_SURFACEFORMAT_R16_UNORM;
+   case PIPE_FORMAT_R16G16_UNORM:
+      return BRW_SURFACEFORMAT_R16G16_UNORM;
+   case PIPE_FORMAT_R16G16B16_UNORM:
+      return BRW_SURFACEFORMAT_R16G16B16_UNORM;
+   case PIPE_FORMAT_R16G16B16A16_UNORM:
+      return BRW_SURFACEFORMAT_R16G16B16A16_UNORM;
+
+   case PIPE_FORMAT_R16_USCALED:
+      return BRW_SURFACEFORMAT_R16_USCALED;
+   case PIPE_FORMAT_R16G16_USCALED:
+      return BRW_SURFACEFORMAT_R16G16_USCALED;
+   case PIPE_FORMAT_R16G16B16_USCALED:
+      return BRW_SURFACEFORMAT_R16G16B16_USCALED;
+   case PIPE_FORMAT_R16G16B16A16_USCALED:
+      return BRW_SURFACEFORMAT_R16G16B16A16_USCALED;
+
+   case PIPE_FORMAT_R16_SNORM:
+      return BRW_SURFACEFORMAT_R16_SNORM;
+   case PIPE_FORMAT_R16G16_SNORM:
+      return BRW_SURFACEFORMAT_R16G16_SNORM;
+   case PIPE_FORMAT_R16G16B16_SNORM:
+      return BRW_SURFACEFORMAT_R16G16B16_SNORM;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      return BRW_SURFACEFORMAT_R16G16B16A16_SNORM;
+
+   case PIPE_FORMAT_R16_SSCALED:
+      return BRW_SURFACEFORMAT_R16_SSCALED;
+   case PIPE_FORMAT_R16G16_SSCALED:
+      return BRW_SURFACEFORMAT_R16G16_SSCALED;
+   case PIPE_FORMAT_R16G16B16_SSCALED:
+      return BRW_SURFACEFORMAT_R16G16B16_SSCALED;
+   case PIPE_FORMAT_R16G16B16A16_SSCALED:
+      return BRW_SURFACEFORMAT_R16G16B16A16_SSCALED;
+
+   case PIPE_FORMAT_R8_UNORM:
+      return BRW_SURFACEFORMAT_R8_UNORM;
+   case PIPE_FORMAT_R8G8_UNORM:
+      return BRW_SURFACEFORMAT_R8G8_UNORM;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      return BRW_SURFACEFORMAT_R8G8B8_UNORM;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      return BRW_SURFACEFORMAT_R8G8B8A8_UNORM;
+
+   case PIPE_FORMAT_R8_USCALED:
+      return BRW_SURFACEFORMAT_R8_USCALED;
+   case PIPE_FORMAT_R8G8_USCALED:
+      return BRW_SURFACEFORMAT_R8G8_USCALED;
+   case PIPE_FORMAT_R8G8B8_USCALED:
+      return BRW_SURFACEFORMAT_R8G8B8_USCALED;
+   case PIPE_FORMAT_R8G8B8A8_USCALED:
+      return BRW_SURFACEFORMAT_R8G8B8A8_USCALED;
+
+   case PIPE_FORMAT_R8_SNORM:
+      return BRW_SURFACEFORMAT_R8_SNORM;
+   case PIPE_FORMAT_R8G8_SNORM:
+      return BRW_SURFACEFORMAT_R8G8_SNORM;
+   case PIPE_FORMAT_R8G8B8_SNORM:
+      return BRW_SURFACEFORMAT_R8G8B8_SNORM;
+   case PIPE_FORMAT_R8G8B8A8_SNORM:
+      return BRW_SURFACEFORMAT_R8G8B8A8_SNORM;
+
+   case PIPE_FORMAT_R8_SSCALED:
+      return BRW_SURFACEFORMAT_R8_SSCALED;
+   case PIPE_FORMAT_R8G8_SSCALED:
+      return BRW_SURFACEFORMAT_R8G8_SSCALED;
+   case PIPE_FORMAT_R8G8B8_SSCALED:
+      return BRW_SURFACEFORMAT_R8G8B8_SSCALED;
+   case PIPE_FORMAT_R8G8B8A8_SSCALED:
+      return BRW_SURFACEFORMAT_R8G8B8A8_SSCALED;
+
+   default:
+      assert(0);
+      return 0;
    }
 }
 
-
-static GLuint get_size( GLenum type )
-{
-   switch (type) {
-   case GL_DOUBLE: return sizeof(GLdouble);
-   case GL_FLOAT: return sizeof(GLfloat);
-   case GL_INT: return sizeof(GLint);
-   case GL_SHORT: return sizeof(GLshort);
-   case GL_BYTE: return sizeof(GLbyte);
-   case GL_UNSIGNED_INT: return sizeof(GLuint);
-   case GL_UNSIGNED_SHORT: return sizeof(GLushort);
-   case GL_UNSIGNED_BYTE: return sizeof(GLubyte);
-   default: return 0;
-   }      
-}
-
-static GLuint get_index_type(GLenum type) 
+static unsigned get_index_type(int type)
 {
    switch (type) {
-   case GL_UNSIGNED_BYTE:  return BRW_INDEX_BYTE;
-   case GL_UNSIGNED_SHORT: return BRW_INDEX_WORD;
-   case GL_UNSIGNED_INT:   return BRW_INDEX_DWORD;
+   case 1: return BRW_INDEX_BYTE;
+   case 2: return BRW_INDEX_WORD;
+   case 4: return BRW_INDEX_DWORD;
    default: assert(0); return 0;
    }
 }
 
-static void wrap_buffers( struct brw_context *brw,
-			  GLuint size )
-{
-   if (size < BRW_UPLOAD_INIT_SIZE)
-      size = BRW_UPLOAD_INIT_SIZE;
-
-   brw->vb.upload.offset = 0;
-
-   if (brw->vb.upload.bo != NULL)
-      dri_bo_unreference(brw->vb.upload.bo);
-   brw->vb.upload.bo = dri_bo_alloc(brw->intel.bufmgr, "temporary VBO",
-				    size, 1);
-
-   /* Set the internal VBO\ to no-backing-store.  We only use them as a
-    * temporary within a brw_try_draw_prims while the lock is held.
-    */
-   /* DON'T DO THIS AS IF WE HAVE TO RE-ORG MEMORY WE NEED SOMEWHERE WITH
-      FAKE TO PUSH THIS STUFF */
-//   if (!brw->intel.ttm)
-//      dri_bo_fake_disable_backing_store(brw->vb.upload.bo, NULL, NULL);
-}
-
-static void get_space( struct brw_context *brw,
-		       GLuint size,
-		       dri_bo **bo_return,
-		       GLuint *offset_return )
-{
-   size = ALIGN(size, 64);
-
-   if (brw->vb.upload.bo == NULL ||
-       brw->vb.upload.offset + size > brw->vb.upload.bo->size) {
-      wrap_buffers(brw, size);
-   }
-
-   assert(*bo_return == NULL);
-   dri_bo_reference(brw->vb.upload.bo);
-   *bo_return = brw->vb.upload.bo;
-   *offset_return = brw->vb.upload.offset;
-   brw->vb.upload.offset += size;
-}
-
-static void
-copy_array_to_vbo_array( struct brw_context *brw,
-			 struct brw_vertex_element *element,
-			 GLuint dst_stride)
-{
-   struct intel_context *intel = &brw->intel;
-   GLuint size = element->count * dst_stride;
-
-   get_space(brw, size, &element->bo, &element->offset);
 
-   if (element->glarray->StrideB == 0) {
-      assert(element->count == 1);
-      element->stride = 0;
-   } else {
-      element->stride = dst_stride;
-   }
-
-   if (dst_stride == element->glarray->StrideB) {
-      if (intel->intelScreen->kernel_exec_fencing) {
-	 drm_intel_gem_bo_map_gtt(element->bo);
-	 memcpy((char *)element->bo->virtual + element->offset,
-		element->glarray->Ptr, size);
-	 drm_intel_gem_bo_unmap_gtt(element->bo);
-      } else {
-	 dri_bo_subdata(element->bo,
-			element->offset,
-			size,
-			element->glarray->Ptr);
-      }
-   } else {
-      char *dest;
-      const unsigned char *src = element->glarray->Ptr;
-      int i;
-
-      if (intel->intelScreen->kernel_exec_fencing) {
-	 drm_intel_gem_bo_map_gtt(element->bo);
-	 dest = element->bo->virtual;
-	 dest += element->offset;
-
-	 for (i = 0; i < element->count; i++) {
-	    memcpy(dest, src, dst_stride);
-	    src += element->glarray->StrideB;
-	    dest += dst_stride;
-	 }
-
-	 drm_intel_gem_bo_unmap_gtt(element->bo);
-      } else {
-	 void *data;
-
-	 data = _mesa_malloc(dst_stride * element->count);
-	 dest = data;
-	 for (i = 0; i < element->count; i++) {
-	    memcpy(dest, src, dst_stride);
-	    src += element->glarray->StrideB;
-	    dest += dst_stride;
-	 }
-
-	 dri_bo_subdata(element->bo,
-			element->offset,
-			size,
-			data);
-
-	 _mesa_free(data);
-      }
-   }
-}
 
-static void brw_prepare_vertices(struct brw_context *brw)
+static boolean brw_prepare_vertices(struct brw_context *brw)
 {
    GLcontext *ctx = &brw->intel.ctx;
    struct intel_context *intel = intel_context(ctx);
@@ -358,123 +208,38 @@ static void brw_prepare_vertices(struct brw_context *brw)
    if (0)
       _mesa_printf("%s %d..%d\n", __FUNCTION__, min_index, max_index);
 
-   /* Accumulate the list of enabled arrays. */
-   brw->vb.nr_enabled = 0;
-   while (vs_inputs) {
-      GLuint i = _mesa_ffsll(vs_inputs) - 1;
-      struct brw_vertex_element *input = &brw->vb.inputs[i];
 
-      vs_inputs &= ~(1 << i);
-      brw->vb.enabled[brw->vb.nr_enabled++] = input;
-   }
-
-   /* XXX: In the rare cases where this happens we fallback all
-    * the way to software rasterization, although a tnl fallback
-    * would be sufficient.  I don't know of *any* real world
-    * cases with > 17 vertex attributes enabled, so it probably
-    * isn't an issue at this point.
-    */
-   if (brw->vb.nr_enabled >= BRW_VEP_MAX) {
-      intel->Fallback = 1;
-      return;
-   }
 
    for (i = 0; i < brw->vb.nr_enabled; i++) {
       struct brw_vertex_element *input = brw->vb.enabled[i];
 
       input->element_size = get_size(input->glarray->Type) * input->glarray->Size;
 
-      if (_mesa_is_bufferobj(input->glarray->BufferObj)) {
-	 struct intel_buffer_object *intel_buffer =
-	    intel_buffer_object(input->glarray->BufferObj);
-
-	 /* Named buffer object: Just reference its contents directly. */
-	 dri_bo_unreference(input->bo);
-	 input->bo = intel_bufferobj_buffer(intel, intel_buffer,
-					    INTEL_READ);
-	 dri_bo_reference(input->bo);
-	 input->offset = (unsigned long)input->glarray->Ptr;
-	 input->stride = input->glarray->StrideB;
-	 input->count = input->glarray->_MaxElement;
-
-	 /* This is a common place to reach if the user mistakenly supplies
-	  * a pointer in place of a VBO offset.  If we just let it go through,
-	  * we may end up dereferencing a pointer beyond the bounds of the
-	  * GTT.  We would hope that the VBO's max_index would save us, but
-	  * Mesa appears to hand us min/max values not clipped to the
-	  * array object's _MaxElement, and _MaxElement frequently appears
-	  * to be wrong anyway.
-	  *
-	  * The VBO spec allows application termination in this case, and it's
-	  * probably a service to the poor programmer to do so rather than
-	  * trying to just not render.
-	  */
-	 assert(input->offset < input->bo->size);
-      } else {
-	 input->count = input->glarray->StrideB ? max_index + 1 - min_index : 1;
-	 if (input->bo != NULL) {
-	    /* Already-uploaded vertex data is present from a previous
-	     * prepare_vertices, but we had to re-validate state due to
-	     * check_aperture failing and a new batch being produced.
-	     */
-	    continue;
-	 }
-
-	 /* Queue the buffer object up to be uploaded in the next pass,
-	  * when we've decided if we're doing interleaved or not.
-	  */
-	 if (input->attrib == VERT_ATTRIB_POS) {
-	    /* Position array not properly enabled:
-	     */
-            if (input->glarray->StrideB == 0) {
-               intel->Fallback = 1;
-               return;
-            }
-
-	    interleave = input->glarray->StrideB;
-	    ptr = input->glarray->Ptr;
-	 }
-	 else if (interleave != input->glarray->StrideB ||
-		  (const unsigned char *)input->glarray->Ptr - ptr < 0 ||
-		  (const unsigned char *)input->glarray->Ptr - ptr > interleave)
-	 {
-	    interleave = 0;
-	 }
-
-	 upload[nr_uploads++] = input;
-	 
-	 /* We rebase drawing to start at element zero only when
-	  * varyings are not in vbos, which means we can end up
-	  * uploading non-varying arrays (stride != 0) when min_index
-	  * is zero.  This doesn't matter as the amount to upload is
-	  * the same for these arrays whether the draw call is rebased
-	  * or not - we just have to upload the one element.
-	  */
-	 assert(min_index == 0 || input->glarray->StrideB == 0);
-      }
-   }
-
-   /* Handle any arrays to be uploaded. */
-   if (nr_uploads > 1 && interleave && interleave <= 256) {
-      /* All uploads are interleaved, so upload the arrays together as
-       * interleaved.  First, upload the contents and set up upload[0].
-       */
-      copy_array_to_vbo_array(brw, upload[0], interleave);
-
-      for (i = 1; i < nr_uploads; i++) {
-	 /* Then, just point upload[i] at upload[0]'s buffer. */
-	 upload[i]->stride = interleave;
-	 upload[i]->offset = upload[0]->offset +
-	    ((const unsigned char *)upload[i]->glarray->Ptr - ptr);
-	 upload[i]->bo = upload[0]->bo;
-	 dri_bo_reference(upload[i]->bo);
+      if (brw_is_user_buffer(vb)) {
+	 u_upload_buffer( brw->upload, 
+			  min_index * vb->stride,
+			  (max_index + 1 - min_index) * vb->stride,
+			  &offset,
+			  &buffer );
       }
-   }
-   else {
-      /* Upload non-interleaved arrays */
-      for (i = 0; i < nr_uploads; i++) {
-          copy_array_to_vbo_array(brw, upload[i], upload[i]->element_size);
+      else
+      {
+	 offset = 0;
+	 buffer = vb->buffer;
+	 count = stride == 0 ? 1 : max_index + 1 - min_index;
       }
+
+      /* Named buffer object: Just reference its contents directly. */
+      dri_bo_unreference(input->bo);
+      input->bo = intel_bufferobj_buffer(intel, intel_buffer,
+					 INTEL_READ);
+      dri_bo_reference(input->bo);
+
+      input->offset = (unsigned long)offset;
+      input->stride = vb->stride;
+      input->count = count;
+
+      assert(input->offset < input->bo->size);
    }
 
    brw_prepare_query_begin(brw);
@@ -632,13 +397,8 @@ static void brw_prepare_indices(struct brw_context *brw)
 
       /* Straight upload
        */
-      if (intel->intelScreen->kernel_exec_fencing) {
-	 drm_intel_gem_bo_map_gtt(bo);
-	 memcpy((char *)bo->virtual + offset, index_buffer->ptr, ib_size);
-	 drm_intel_gem_bo_unmap_gtt(bo);
-      } else {
-	 dri_bo_subdata(bo, offset, ib_size, index_buffer->ptr);
-      }
+      brw_bo_subdata(bo, offset, ib_size, index_buffer->ptr);
+
    } else {
       offset = (GLuint) (unsigned long) index_buffer->ptr;
       brw->ib.start_vertex_offset = 0;
diff --git a/src/gallium/drivers/i965/brw_gs.c b/src/gallium/drivers/i965/brw_gs.c
index 48c2b9a41c..5ec0c585fe 100644
--- a/src/gallium/drivers/i965/brw_gs.c
+++ b/src/gallium/drivers/i965/brw_gs.c
@@ -58,7 +58,7 @@ static void compile_gs_prog( struct brw_context *brw,
    /* Need to locate the two positions present in vertex + header.
     * These are currently hardcoded:
     */
-   c.nr_attrs = brw_count_bits(c.key.attrs);
+   c.nr_attrs = util_count_bits(c.key.attrs);
 
    if (BRW_IS_IGDNG(brw))
        c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
diff --git a/src/gallium/drivers/i965/brw_pipe_blend.c b/src/gallium/drivers/i965/brw_pipe_blend.c
new file mode 100644
index 0000000000..b351794dce
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_blend.c
@@ -0,0 +1,41 @@
+
+   /* _NEW_COLOR */
+   if (key->logic_op != GL_COPY) {
+      cc.cc2.logicop_enable = 1;
+      cc.cc5.logicop_func = intel_translate_logic_op(key->logic_op);
+   } else if (key->color_blend) {
+      GLenum eqRGB = key->blend_eq_rgb;
+      GLenum eqA = key->blend_eq_a;
+      GLenum srcRGB = key->blend_src_rgb;
+      GLenum dstRGB = key->blend_dst_rgb;
+      GLenum srcA = key->blend_src_a;
+      GLenum dstA = key->blend_dst_a;
+
+      if (eqRGB == GL_MIN || eqRGB == GL_MAX) {
+	 srcRGB = dstRGB = GL_ONE;
+      }
+
+      if (eqA == GL_MIN || eqA == GL_MAX) {
+	 srcA = dstA = GL_ONE;
+      }
+
+      cc.cc6.dest_blend_factor = brw_translate_blend_factor(dstRGB);
+      cc.cc6.src_blend_factor = brw_translate_blend_factor(srcRGB);
+      cc.cc6.blend_function = brw_translate_blend_equation(eqRGB);
+
+      cc.cc5.ia_dest_blend_factor = brw_translate_blend_factor(dstA);
+      cc.cc5.ia_src_blend_factor = brw_translate_blend_factor(srcA);
+      cc.cc5.ia_blend_function = brw_translate_blend_equation(eqA);
+
+      cc.cc3.blend_enable = 1;
+      cc.cc3.ia_blend_enable = (srcA != srcRGB ||
+				dstA != dstRGB ||
+				eqA != eqRGB);
+   }
+
+   if (key->dither) {
+      cc.cc5.dither_enable = 1;
+      cc.cc6.y_dither_offset = 0;
+      cc.cc6.x_dither_offset = 0;
+   }
+
diff --git a/src/gallium/drivers/i965/brw_pipe_debug.c b/src/gallium/drivers/i965/brw_pipe_debug.c
new file mode 100644
index 0000000000..34d6d4028a
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_debug.c
@@ -0,0 +1,2 @@
+   if (INTEL_DEBUG & DEBUG_STATS)
+      cc.cc5.statistics_enable = 1;
diff --git a/src/gallium/drivers/i965/brw_pipe_depth.c b/src/gallium/drivers/i965/brw_pipe_depth.c
new file mode 100644
index 0000000000..da29bc8bcb
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_depth.c
@@ -0,0 +1,52 @@
+   /* _NEW_STENCIL */
+   if (key->dsa.stencil[0].enable) {
+      cc.cc0.stencil_enable = 1;
+      cc.cc0.stencil_func =
+	 intel_translate_compare_func(key->stencil_func[0]);
+      cc.cc0.stencil_fail_op =
+	 intel_translate_stencil_op(key->stencil_fail_op[0]);
+      cc.cc0.stencil_pass_depth_fail_op =
+	 intel_translate_stencil_op(key->stencil_pass_depth_fail_op[0]);
+      cc.cc0.stencil_pass_depth_pass_op =
+	 intel_translate_stencil_op(key->stencil_pass_depth_pass_op[0]);
+      cc.cc1.stencil_ref = key->stencil_ref[0];
+      cc.cc1.stencil_write_mask = key->stencil_write_mask[0];
+      cc.cc1.stencil_test_mask = key->stencil_test_mask[0];
+
+      if (key->stencil_two_side) {
+	 cc.cc0.bf_stencil_enable = 1;
+	 cc.cc0.bf_stencil_func =
+	    intel_translate_compare_func(key->stencil_func[1]);
+	 cc.cc0.bf_stencil_fail_op =
+	    intel_translate_stencil_op(key->stencil_fail_op[1]);
+	 cc.cc0.bf_stencil_pass_depth_fail_op =
+	    intel_translate_stencil_op(key->stencil_pass_depth_fail_op[1]);
+	 cc.cc0.bf_stencil_pass_depth_pass_op =
+	    intel_translate_stencil_op(key->stencil_pass_depth_pass_op[1]);
+	 cc.cc1.bf_stencil_ref = key->stencil_ref[1];
+	 cc.cc2.bf_stencil_write_mask = key->stencil_write_mask[1];
+	 cc.cc2.bf_stencil_test_mask = key->stencil_test_mask[1];
+      }
+
+      /* Not really sure about this:
+       */
+      if (key->stencil_write_mask[0] ||
+	  (key->stencil_two_side && key->stencil_write_mask[1]))
+	 cc.cc0.stencil_write_enable = 1;
+   }
+
+
+   if (key->alpha_enabled) {
+      cc.cc3.alpha_test = 1;
+      cc.cc3.alpha_test_func = intel_translate_compare_func(key->alpha_func);
+      cc.cc3.alpha_test_format = BRW_ALPHATEST_FORMAT_UNORM8;
+
+      UNCLAMPED_FLOAT_TO_UBYTE(cc.cc7.alpha_ref.ub[0], key->alpha_ref);
+   }
+
+   /* _NEW_DEPTH */
+   if (key->depth_test) {
+      cc.cc2.depth_test = 1;
+      cc.cc2.depth_test_function = intel_translate_compare_func(key->depth_func);
+      cc.cc2.depth_write_enable = key->depth_write;
+   }
diff --git a/src/gallium/drivers/i965/brw_pipe_fb.c b/src/gallium/drivers/i965/brw_pipe_fb.c
new file mode 100644
index 0000000000..d4ae332f46
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_fb.c
@@ -0,0 +1,25 @@
+
+/**
+ * called from intelDrawBuffer()
+ */
+static void brw_set_draw_region( struct intel_context *intel, 
+                                 struct intel_region *color_regions[],
+                                 struct intel_region *depth_region,
+                                 GLuint num_color_regions)
+{
+   struct brw_context *brw = brw_context(&intel->ctx);
+   GLuint i;
+
+   /* release old color/depth regions */
+   if (brw->state.depth_region != depth_region)
+      brw->state.dirty.brw |= BRW_NEW_DEPTH_BUFFER;
+   for (i = 0; i < brw->state.nr_color_regions; i++)
+       intel_region_release(&brw->state.color_regions[i]);
+   intel_region_release(&brw->state.depth_region);
+
+   /* reference new color/depth regions */
+   for (i = 0; i < num_color_regions; i++)
+       intel_region_reference(&brw->state.color_regions[i], color_regions[i]);
+   intel_region_reference(&brw->state.depth_region, depth_region);
+   brw->state.nr_color_regions = num_color_regions;
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_flush.c b/src/gallium/drivers/i965/brw_pipe_flush.c
new file mode 100644
index 0000000000..008f623151
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_flush.c
@@ -0,0 +1,64 @@
+
+/**
+ * called from intel_batchbuffer_flush and children before sending a
+ * batchbuffer off.
+ */
+static void brw_finish_batch(struct intel_context *intel)
+{
+   struct brw_context *brw = brw_context(&intel->ctx);
+   brw_emit_query_end(brw);
+}
+
+
+/**
+ * called from intelFlushBatchLocked
+ */
+static void brw_new_batch( struct intel_context *intel )
+{
+   struct brw_context *brw = brw_context(&intel->ctx);
+
+   /* Check that we didn't just wrap our batchbuffer at a bad time. */
+   assert(!brw->no_batch_wrap);
+
+   brw->curbe.need_new_bo = GL_TRUE;
+
+   /* Mark all context state as needing to be re-emitted.
+    * This is probably not as severe as on 915, since almost all of our state
+    * is just in referenced buffers.
+    */
+   brw->state.dirty.brw |= BRW_NEW_CONTEXT;
+
+   brw->state.dirty.mesa |= ~0;
+   brw->state.dirty.brw |= ~0;
+   brw->state.dirty.cache |= ~0;
+
+   /* Move to the end of the current upload buffer so that we'll force choosing
+    * a new buffer next time.
+    */
+   if (brw->vb.upload.bo != NULL) {
+      dri_bo_unreference(brw->vb.upload.bo);
+      brw->vb.upload.bo = NULL;
+      brw->vb.upload.offset = 0;
+   }
+}
+
+
+static void brw_note_fence( struct intel_context *intel, GLuint fence )
+{
+   brw_context(&intel->ctx)->state.dirty.brw |= BRW_NEW_FENCE;
+}
+
+/* called from intelWaitForIdle() and intelFlush()
+ *
+ * For now, just flush everything.  Could be smarter later.
+ */
+static GLuint brw_flush_cmd( void )
+{
+   struct brw_mi_flush flush;
+   flush.opcode = CMD_MI_FLUSH;
+   flush.pad = 0;
+   flush.flags = BRW_FLUSH_STATE_CACHE;
+   return *(GLuint *)&flush;
+}
+
+
diff --git a/src/gallium/drivers/i965/brw_screen_surface.c b/src/gallium/drivers/i965/brw_screen_surface.c
new file mode 100644
index 0000000000..d199d0b81a
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_screen_surface.c
@@ -0,0 +1,27 @@
+   /* _NEW_BUFFERS */
+   if (IS_965(intel->intelScreen->deviceID) &&
+       !IS_G4X(intel->intelScreen->deviceID)) {
+      for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
+	 struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
+	 struct intel_renderbuffer *irb = intel_renderbuffer(rb);
+
+	 /* The original gen4 hardware couldn't set up WM surfaces pointing
+	  * at an offset within a tile, which can happen when rendering to
+	  * anything but the base level of a texture or the +X face/0 depth.
+	  * This was fixed with the 4 Series hardware.
+	  *
+	  * For these original chips, you would have to make the depth and
+	  * color destination surfaces include information on the texture
+	  * type, LOD, face, and various limits to use them as a destination.
+	  * I would have done this, but there's also a nasty requirement that
+	  * the depth and the color surfaces all be of the same LOD, which
+	  * may be a worse requirement than this alignment.  (Also, we may
+	  * want to just demote the texture to untiled, instead).
+	  */
+	 if (irb->region && 
+	     irb->region->tiling != I915_TILING_NONE &&
+	     (irb->region->draw_offset & 4095)) {
+	    DBG("FALLBACK: non-tile-aligned destination for tiled FBO\n");
+	    return GL_TRUE;
+	 }
+      }
diff --git a/src/gallium/drivers/i965/brw_sf.c b/src/gallium/drivers/i965/brw_sf.c
index e1c2c7777b..90513245ee 100644
--- a/src/gallium/drivers/i965/brw_sf.c
+++ b/src/gallium/drivers/i965/brw_sf.c
@@ -59,9 +59,9 @@ static void compile_sf_prog( struct brw_context *brw,
    brw_init_compile(brw, &c.func);
 
    c.key = *key;
-   c.nr_attrs = brw_count_bits(c.key.attrs);
+   c.nr_attrs = util_count_bits(c.key.attrs);
    c.nr_attr_regs = (c.nr_attrs+1)/2;
-   c.nr_setup_attrs = brw_count_bits(c.key.attrs & DO_SETUP_BITS);
+   c.nr_setup_attrs = util_count_bits(c.key.attrs & DO_SETUP_BITS);
    c.nr_setup_regs = (c.nr_setup_attrs+1)/2;
 
    c.prog_data.urb_read_length = c.nr_attr_regs;
diff --git a/src/gallium/drivers/i965/brw_sf_emit.c b/src/gallium/drivers/i965/brw_sf_emit.c
index ca8f97f9f9..4cc427a935 100644
--- a/src/gallium/drivers/i965/brw_sf_emit.c
+++ b/src/gallium/drivers/i965/brw_sf_emit.c
@@ -150,7 +150,7 @@ static void do_flatshade_triangle( struct brw_sf_compile *c )
 {
    struct brw_compile *p = &c->func;
    struct brw_reg ip = brw_ip_reg();
-   GLuint nr = brw_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
+   GLuint nr = util_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
    GLuint jmpi = 1;
 
    if (!nr)
@@ -188,7 +188,7 @@ static void do_flatshade_line( struct brw_sf_compile *c )
 {
    struct brw_compile *p = &c->func;
    struct brw_reg ip = brw_ip_reg();
-   GLuint nr = brw_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
+   GLuint nr = util_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
    GLuint jmpi = 1;
 
    if (!nr)
diff --git a/src/gallium/drivers/i965/brw_state_upload.c b/src/gallium/drivers/i965/brw_state_upload.c
index b817b741e7..6801084616 100644
--- a/src/gallium/drivers/i965/brw_state_upload.c
+++ b/src/gallium/drivers/i965/brw_state_upload.c
@@ -270,7 +270,7 @@ brw_print_dirty_count(struct dirty_bit_map *bit_map, int32_t bits)
 /***********************************************************************
  * Emit all state:
  */
-void brw_validate_state( struct brw_context *brw )
+enum pipe_error brw_validate_state( struct brw_context *brw )
 {
    GLcontext *ctx = &brw->intel.ctx;
    struct intel_context *intel = &brw->intel;
@@ -278,10 +278,6 @@ void brw_validate_state( struct brw_context *brw )
    GLuint i;
 
    brw_clear_validated_bos(brw);
-
-   state->mesa |= brw->intel.NewGLState;
-   brw->intel.NewGLState = 0;
-
    brw_add_validated_bo(brw, intel->batch->buf);
 
    if (brw->emit_state_always) {
@@ -290,36 +286,23 @@ void brw_validate_state( struct brw_context *brw )
       state->cache |= ~0;
    }
 
-   if (brw->fragment_program != ctx->FragmentProgram._Current) {
-      brw->fragment_program = ctx->FragmentProgram._Current;
-      brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM;
-   }
-
-   if (brw->vertex_program != ctx->VertexProgram._Current) {
-      brw->vertex_program = ctx->VertexProgram._Current;
-      brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM;
-   }
-
    if (state->mesa == 0 &&
        state->cache == 0 &&
        state->brw == 0)
-      return;
+      return 0;
 
    if (brw->state.dirty.brw & BRW_NEW_CONTEXT)
       brw_clear_batch_cache(brw);
 
-   brw->intel.Fallback = 0;
-
    /* do prepare stage for all atoms */
    for (i = 0; i < Elements(atoms); i++) {
       const struct brw_tracked_state *atom = atoms[i];
 
-      if (brw->intel.Fallback)
-         break;
-
       if (check_state(state, &atom->dirty)) {
          if (atom->prepare) {
-            atom->prepare(brw);
+            ret = atom->prepare(brw);
+	    if (ret)
+	       return ret;
         }
       }
    }
@@ -329,17 +312,18 @@ void brw_validate_state( struct brw_context *brw )
     * If this fails, we can experience GPU lock-ups.
     */
    {
-      const struct brw_fragment_program *fp;
-      fp = brw_fragment_program_const(brw->fragment_program);
+      const struct brw_fragment_program *fp = brw->fragment_program;
       if (fp) {
-         assert((fp->tex_units_used & ctx->Texture._EnabledUnits)
-                == fp->tex_units_used);
+         assert(fp->info.max_sampler <= brw->nr_samplers &&
+		fp->info.max_texture <= brw->nr_textures);
       }
    }
+
+   return 0;
 }
 
 
-void brw_upload_state(struct brw_context *brw)
+enum pipe_error brw_upload_state(struct brw_context *brw)
 {
    struct brw_state_flags *state = &brw->state.dirty;
    int i;
@@ -356,7 +340,7 @@ void brw_upload_state(struct brw_context *brw)
       _mesa_memset(&examined, 0, sizeof(examined));
       prev = *state;
 
-      for (i = 0; i < Elements(atoms); i++) {	 
+      for (i = 0; i < Elements(atoms); i++) {
 	 const struct brw_tracked_state *atom = atoms[i];
 	 struct brw_state_flags generated;
 
@@ -364,12 +348,11 @@ void brw_upload_state(struct brw_context *brw)
 		atom->dirty.brw ||
 		atom->dirty.cache);
 
-	 if (brw->intel.Fallback)
-	    break;
-
 	 if (check_state(state, &atom->dirty)) {
 	    if (atom->emit) {
-	       atom->emit( brw );
+	       ret = atom->emit( brw );
+	       if (ret)
+		  return ret;
 	    }
 	 }
 
@@ -388,12 +371,11 @@ void brw_upload_state(struct brw_context *brw)
       for (i = 0; i < Elements(atoms); i++) {	 
 	 const struct brw_tracked_state *atom = atoms[i];
 
-	 if (brw->intel.Fallback)
-	    break;
-
 	 if (check_state(state, &atom->dirty)) {
 	    if (atom->emit) {
-	       atom->emit( brw );
+	       ret = atom->emit( brw );
+	       if (ret)
+		  return ret;
 	    }
 	 }
       }
@@ -407,10 +389,11 @@ void brw_upload_state(struct brw_context *brw)
 	 brw_print_dirty_count(mesa_bits, state->mesa);
 	 brw_print_dirty_count(brw_bits, state->brw);
 	 brw_print_dirty_count(cache_bits, state->cache);
-	 fprintf(stderr, "\n");
+	 debug_printf("\n");
       }
    }
-
-   if (!brw->intel.Fallback)
-      memset(state, 0, sizeof(*state));
+   
+   /* Clear dirty flags:
+    */
+   memset(state, 0, sizeof(*state));
 }
diff --git a/src/gallium/drivers/i965/brw_swtnl.c b/src/gallium/drivers/i965/brw_swtnl.c
new file mode 100644
index 0000000000..6684f442d5
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_swtnl.c
@@ -0,0 +1,114 @@
+
+/* XXX: could split the primitive list to fallback only on the
+ * non-conformant primitives.
+ */
+static GLboolean check_fallbacks( struct brw_context *brw,
+				  const struct _mesa_prim *prim,
+				  GLuint nr_prims )
+{
+   GLcontext *ctx = &brw->intel.ctx;
+   GLuint i;
+
+   /* If we don't require strict OpenGL conformance, never 
+    * use fallbacks.  If we're forcing fallbacks, always
+    * use fallbacks.
+    */
+   if (brw->intel.conformance_mode == 0)
+      return GL_FALSE;
+
+   if (brw->intel.conformance_mode == 2)
+      return GL_TRUE;
+
+   if (ctx->Polygon.SmoothFlag) {
+      for (i = 0; i < nr_prims; i++)
+	 if (reduced_prim[prim[i].mode] == GL_TRIANGLES) 
+	    return GL_TRUE;
+   }
+
+   /* BRW hardware will do AA lines, but they appear to be
+    * non-conformant.  TBD whether we keep this fallback:
+    */
+   if (ctx->Line.SmoothFlag) {
+      for (i = 0; i < nr_prims; i++)
+	 if (reduced_prim[prim[i].mode] == GL_LINES) 
+	    return GL_TRUE;
+   }
+
+   /* Stipple -- these fallbacks could be resolved with a little
+    * bit of work?
+    */
+   if (ctx->Line.StippleFlag) {
+      for (i = 0; i < nr_prims; i++) {
+	 /* GS doesn't get enough information to know when to reset
+	  * the stipple counter?!?
+	  */
+	 if (prim[i].mode == GL_LINE_LOOP || prim[i].mode == GL_LINE_STRIP) 
+	    return GL_TRUE;
+	    
+	 if (prim[i].mode == GL_POLYGON &&
+	     (ctx->Polygon.FrontMode == GL_LINE ||
+	      ctx->Polygon.BackMode == GL_LINE))
+	    return GL_TRUE;
+      }
+   }
+
+   if (ctx->Point.SmoothFlag) {
+      for (i = 0; i < nr_prims; i++)
+	 if (prim[i].mode == GL_POINTS) 
+	    return GL_TRUE;
+   }
+
+   /* BRW hardware doesn't handle GL_CLAMP texturing correctly;
+    * brw_wm_sampler_state:translate_wrap_mode() treats GL_CLAMP
+    * as GL_CLAMP_TO_EDGE instead.  If we're using GL_CLAMP, and
+    * we want strict conformance, force the fallback.
+    * Right now, we only do this for 2D textures.
+    */
+   {
+      int u;
+      for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
+         struct gl_texture_unit *texUnit = &ctx->Texture.Unit[u];
+         if (texUnit->Enabled) {
+            if (texUnit->Enabled & TEXTURE_1D_BIT) {
+               if (texUnit->CurrentTex[TEXTURE_1D_INDEX]->WrapS == GL_CLAMP) {
+                   return GL_TRUE;
+               }
+            }
+            if (texUnit->Enabled & TEXTURE_2D_BIT) {
+               if (texUnit->CurrentTex[TEXTURE_2D_INDEX]->WrapS == GL_CLAMP ||
+                   texUnit->CurrentTex[TEXTURE_2D_INDEX]->WrapT == GL_CLAMP) {
+                   return GL_TRUE;
+               }
+            }
+            if (texUnit->Enabled & TEXTURE_3D_BIT) {
+               if (texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapS == GL_CLAMP ||
+                   texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapT == GL_CLAMP ||
+                   texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapR == GL_CLAMP) {
+                   return GL_TRUE;
+               }
+            }
+         }
+      }
+   }
+
+   /* Exceeding hw limits on number of VS inputs?
+    */
+   if (brw->nr_ve == 0 ||
+       brw->nr_ve >= BRW_VEP_MAX) {
+      return TRUE;
+   }
+
+   /* Position array with zero stride?
+    */
+   if (brw->vs[brw->ve[0]]->stride == 0)
+      return TRUE;
+
+
+      
+   /* Nothing stopping us from the fast path now */
+   return GL_FALSE;
+}
+
+
+
+
diff --git a/src/gallium/drivers/i965/brw_types.h b/src/gallium/drivers/i965/brw_types.h
new file mode 100644
index 0000000000..32b62848da
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_types.h
@@ -0,0 +1,11 @@
+#ifndef BRW_TYPES_H
+#define BRW_TYPES_H
+#include <stdint.h>
+
+/* GL type names mapped onto <stdint.h> types.  No GLenum, translate all away. */
+typedef uint32_t GLuint;
+typedef uint8_t GLubyte;
+typedef uint16_t GLushort;
+typedef uint8_t GLboolean;
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_util.c b/src/gallium/drivers/i965/brw_util.c
index ce21aa4869..17f671a8fa 100644
--- a/src/gallium/drivers/i965/brw_util.c
+++ b/src/gallium/drivers/i965/brw_util.c
@@ -35,14 +35,6 @@
 #include "brw_util.h"
 #include "brw_defines.h"
 
-GLuint brw_count_bits( GLuint val )
-{
-   GLuint i;
-   for (i = 0; val ; val >>= 1)
-      if (val & 1)
-	 i++;
-   return i;
-}
 
 
 GLuint brw_translate_blend_equation( GLenum mode )
diff --git a/src/gallium/drivers/i965/brw_vs.c b/src/gallium/drivers/i965/brw_vs.c
index f0c79efbd9..53a5560105 100644
--- a/src/gallium/drivers/i965/brw_vs.c
+++ b/src/gallium/drivers/i965/brw_vs.c
@@ -61,9 +61,7 @@ static void do_vs_prog( struct brw_context *brw,
    }
 
    if (0)
-      _mesa_print_program(&c.vp->program.Base);
-
-
+      tgsi_dump(&c.vp->tokens, 0);
 
    /* Emit GEN4 code.
     */
@@ -96,9 +94,9 @@ static void brw_upload_vs_prog(struct brw_context *brw)
     * the inputs it asks for, whether they are varying or not.
     */
    key.program_string_id = vp->id;
-   key.nr_userclip = brw_count_bits(ctx->Transform.ClipPlanesEnabled);
-   key.copy_edgeflag = (ctx->Polygon.FrontMode != GL_FILL ||
-			ctx->Polygon.BackMode != GL_FILL);
+   key.nr_userclip = brw->nr_userclip;
+   key.copy_edgeflag = (brw->rast->fill_ccw != PIPE_POLYGON_MODE_FILL ||
+			brw->rast->fill_cw != PIPE_POLYGON_MODE_FILL);
 
    /* Make an early check for the key.
     */
@@ -116,7 +114,7 @@ static void brw_upload_vs_prog(struct brw_context *brw)
  */
 const struct brw_tracked_state brw_vs_prog = {
    .dirty = {
-      .mesa  = _NEW_TRANSFORM | _NEW_POLYGON,
+      .mesa  = PIPE_NEW_UCP | PIPE_NEW_RAST,
       .brw   = BRW_NEW_VERTEX_PROGRAM,
       .cache = 0
    },
diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c
index 1638ef8111..7f20c4baca 100644
--- a/src/gallium/drivers/i965/brw_vs_emit.c
+++ b/src/gallium/drivers/i965/brw_vs_emit.c
@@ -33,7 +33,7 @@
 #include "main/macros.h"
 #include "shader/program.h"
 #include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
+#include "pipe/p_shader_tokens.h"
 #include "brw_context.h"
 #include "brw_vs.h"
 
@@ -129,6 +129,7 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 	 reg++;
       }
    }
+
    /* If there are no inputs, we'll still be reading one attribute's worth
     * because it's required -- see urb_read_length setting.
     */
@@ -226,6 +227,7 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
     * vertex urb, so is half the amount:
     */
    c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
+
    /* Setting this field to 0 leads to undefined behavior according to the
     * the VS_STATE docs.  Our VUEs will always have at least one attribute
     * sitting in them, even if it's padding.
@@ -960,9 +962,6 @@ static void emit_arl( struct brw_vs_compile *c,
 
 /**
  * Return the brw reg for the given instruction's src argument.
- * Will return mangled results for SWZ op.  The emit_swz() function
- * ignores this result and recalculates taking extended swizzles into
- * account.
  */
 static struct brw_reg get_arg( struct brw_vs_compile *c,
                                const struct prog_instruction *inst,
@@ -1024,74 +1023,6 @@ static struct brw_reg get_dst( struct brw_vs_compile *c,
 }
 
 
-static void emit_swz( struct brw_vs_compile *c, 
-		      struct brw_reg dst,
-                      const struct prog_instruction *inst)
-{
-   const GLuint argIndex = 0;
-   const struct prog_src_register src = inst->SrcReg[argIndex];
-   struct brw_compile *p = &c->func;
-   GLuint zeros_mask = 0;
-   GLuint ones_mask = 0;
-   GLuint src_mask = 0;
-   GLubyte src_swz[4];
-   GLboolean need_tmp = (src.Negate &&
-			 dst.file != BRW_GENERAL_REGISTER_FILE);
-   struct brw_reg tmp = dst;
-   GLuint i;
-
-   if (need_tmp)
-      tmp = get_tmp(c);
-
-   for (i = 0; i < 4; i++) {
-      if (dst.dw1.bits.writemask & (1<<i)) {
-	 GLubyte s = GET_SWZ(src.Swizzle, i);
-	 switch (s) {
-	 case SWIZZLE_X:
-	 case SWIZZLE_Y:
-	 case SWIZZLE_Z:
-	 case SWIZZLE_W:
-	    src_mask |= 1<<i;
-	    src_swz[i] = s;
-	    break;
-	 case SWIZZLE_ZERO:
-	    zeros_mask |= 1<<i;
-	    break;
-	 case SWIZZLE_ONE:
-	    ones_mask |= 1<<i;
-	    break;
-	 }
-      }
-   }
-   
-   /* Do src first, in case dst aliases src:
-    */
-   if (src_mask) {
-      struct brw_reg arg0;
-
-      arg0 = get_src_reg(c, inst, argIndex);
-
-      arg0 = brw_swizzle(arg0, 
-			 src_swz[0], src_swz[1], 
-			 src_swz[2], src_swz[3]);
-
-      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
-   } 
-   
-   if (zeros_mask) 
-      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
-
-   if (ones_mask) 
-      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
-
-   if (src.Negate)
-      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));
-   
-   if (need_tmp) {
-      brw_MOV(p, dst, tmp);
-      release_tmp(c, tmp);
-   }
-}
 
 
 /**
@@ -1332,20 +1263,6 @@ void brw_vs_emit(struct brw_vs_compile *c )
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_set_access_mode(p, BRW_ALIGN_16);
    
-   /* Message registers can't be read, so copy the output into GRF register
-      if they are used in source registers */
-   for (insn = 0; insn < nr_insns; insn++) {
-       GLuint i;
-       struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
-       for (i = 0; i < 3; i++) {
-	   struct prog_src_register *src = &inst->SrcReg[i];
-	   GLuint index = src->Index;
-	   GLuint file = src->File;	
-	   if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
-	       c->output_regs[index].used_in_src = GL_TRUE;
-       }
-   }
-
    /* Static register allocation
     */
    brw_vs_alloc_regs(c);
@@ -1362,18 +1279,14 @@ void brw_vs_emit(struct brw_vs_compile *c )
       _mesa_print_instruction(inst);
 #endif
 
-      /* Get argument regs.  SWZ is special and does this itself.
+      /* Get argument regs.
        */
-      if (inst->Opcode != OPCODE_SWZ)
-	  for (i = 0; i < 3; i++) {
-	      const struct prog_src_register *src = &inst->SrcReg[i];
-	      index = src->Index;
-	      file = src->File;	
-	      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
-		  args[i] = c->output_regs[index].reg;
-	      else
-                  args[i] = get_arg(c, inst, i);
-	  }
+      for (i = 0; i < 3; i++) {
+	 const struct prog_src_register *src = &inst->SrcReg[i];
+	 index = src->Index;
+	 file = src->File;	
+	 args[i] = get_arg(c, inst, i);
+      }
 
       /* Get dest regs.  Note that it is possible for a reg to be both
        * dst and arg, given the static allocation of registers.  So
@@ -1381,10 +1294,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
        */ 
       index = inst->DstReg.Index;
       file = inst->DstReg.File;
-      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
-	  dst = c->output_regs[index].reg;
-      else
-	  dst = get_dst(c, inst->DstReg);
+      dst = get_dst(c, inst->DstReg);
 
       if (inst->SaturateMode != SATURATE_OFF) {
 	 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
@@ -1392,151 +1302,144 @@ void brw_vs_emit(struct brw_vs_compile *c )
       }
 
       switch (inst->Opcode) {
-      case OPCODE_ABS:
+      case TGSI_OPCODE_ABS:
 	 brw_MOV(p, dst, brw_abs(args[0]));
 	 break;
-      case OPCODE_ADD:
+      case TGSI_OPCODE_ADD:
 	 brw_ADD(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_COS:
+      case TGSI_OPCODE_COS:
 	 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
 	 break;
-      case OPCODE_DP3:
+      case TGSI_OPCODE_DP3:
 	 brw_DP3(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_DP4:
+      case TGSI_OPCODE_DP4:
 	 brw_DP4(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_DPH:
+      case TGSI_OPCODE_DPH:
 	 brw_DPH(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_NRM3:
+      case TGSI_OPCODE_NRM3:
 	 emit_nrm(c, dst, args[0], 3);
 	 break;
-      case OPCODE_NRM4:
+      case TGSI_OPCODE_NRM4:
 	 emit_nrm(c, dst, args[0], 4);
 	 break;
-      case OPCODE_DST:
+      case TGSI_OPCODE_DST:
 	 unalias2(c, dst, args[0], args[1], emit_dst_noalias); 
 	 break;
-      case OPCODE_EXP:
+      case TGSI_OPCODE_EXP:
 	 unalias1(c, dst, args[0], emit_exp_noalias);
 	 break;
-      case OPCODE_EX2:
+      case TGSI_OPCODE_EX2:
 	 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
 	 break;
-      case OPCODE_ARL:
+      case TGSI_OPCODE_ARL:
 	 emit_arl(c, dst, args[0]);
 	 break;
-      case OPCODE_FLR:
+      case TGSI_OPCODE_FLR:
 	 brw_RNDD(p, dst, args[0]);
 	 break;
-      case OPCODE_FRC:
+      case TGSI_OPCODE_FRC:
 	 brw_FRC(p, dst, args[0]);
 	 break;
-      case OPCODE_LOG:
+      case TGSI_OPCODE_LOG:
 	 unalias1(c, dst, args[0], emit_log_noalias);
 	 break;
-      case OPCODE_LG2:
+      case TGSI_OPCODE_LG2:
 	 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
 	 break;
-      case OPCODE_LIT:
+      case TGSI_OPCODE_LIT:
 	 unalias1(c, dst, args[0], emit_lit_noalias);
 	 break;
-      case OPCODE_LRP:
+      case TGSI_OPCODE_LRP:
 	 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
 	 break;
-      case OPCODE_MAD:
+      case TGSI_OPCODE_MAD:
 	 brw_MOV(p, brw_acc_reg(), args[2]);
 	 brw_MAC(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_MAX:
+      case TGSI_OPCODE_MAX:
 	 emit_max(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_MIN:
+      case TGSI_OPCODE_MIN:
 	 emit_min(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_MOV:
+      case TGSI_OPCODE_MOV:
 	 brw_MOV(p, dst, args[0]);
 	 break;
-      case OPCODE_MUL:
+      case TGSI_OPCODE_MUL:
 	 brw_MUL(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_POW:
+      case TGSI_OPCODE_POW:
 	 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL); 
 	 break;
-      case OPCODE_RCP:
+      case TGSI_OPCODE_RCP:
 	 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
 	 break;
-      case OPCODE_RSQ:
+      case TGSI_OPCODE_RSQ:
 	 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
 	 break;
-
-      case OPCODE_SEQ:
+      case TGSI_OPCODE_SEQ:
          emit_seq(p, dst, args[0], args[1]);
          break;
-      case OPCODE_SIN:
+      case TGSI_OPCODE_SIN:
 	 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
 	 break;
-      case OPCODE_SNE:
+      case TGSI_OPCODE_SNE:
          emit_sne(p, dst, args[0], args[1]);
          break;
-      case OPCODE_SGE:
+      case TGSI_OPCODE_SGE:
 	 emit_sge(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_SGT:
+      case TGSI_OPCODE_SGT:
          emit_sgt(p, dst, args[0], args[1]);
          break;
-      case OPCODE_SLT:
+      case TGSI_OPCODE_SLT:
 	 emit_slt(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_SLE:
+      case TGSI_OPCODE_SLE:
          emit_sle(p, dst, args[0], args[1]);
          break;
-      case OPCODE_SUB:
+      case TGSI_OPCODE_SUB:
 	 brw_ADD(p, dst, args[0], negate(args[1]));
 	 break;
-      case OPCODE_SWZ:
-	 /* The args[0] value can't be used here as it won't have
-	  * correctly encoded the full swizzle:
-	  */
-	 emit_swz(c, dst, inst);
-	 break;
-      case OPCODE_TRUNC:
+      case TGSI_OPCODE_TRUNC:
          /* round toward zero */
 	 brw_RNDZ(p, dst, args[0]);
 	 break;
-      case OPCODE_XPD:
+      case TGSI_OPCODE_XPD:
 	 emit_xpd(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_IF:
+      case TGSI_OPCODE_IF:
 	 assert(if_depth < MAX_IF_DEPTH);
 	 if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
 	 /* Note that brw_IF smashes the predicate_control field. */
 	 if_inst[if_depth]->header.predicate_control = get_predicate(inst);
 	 if_depth++;
 	 break;
-      case OPCODE_ELSE:
+      case TGSI_OPCODE_ELSE:
 	 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
 	 break;
-      case OPCODE_ENDIF:
+      case TGSI_OPCODE_ENDIF:
          assert(if_depth > 0);
 	 brw_ENDIF(p, if_inst[--if_depth]);
 	 break;			
-      case OPCODE_BGNLOOP:
+      case TGSI_OPCODE_BGNLOOP:
          loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
          break;
-      case OPCODE_BRK:
+      case TGSI_OPCODE_BRK:
 	 brw_set_predicate_control(p, get_predicate(inst));
          brw_BREAK(p);
 	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
          break;
-      case OPCODE_CONT:
+      case TGSI_OPCODE_CONT:
 	 brw_set_predicate_control(p, get_predicate(inst));
          brw_CONT(p);
          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
          break;
-      case OPCODE_ENDLOOP: 
+      case TGSI_OPCODE_ENDLOOP: 
          {
             struct brw_instruction *inst0, *inst1;
 	    GLuint br = 1;
@@ -1550,23 +1453,23 @@ void brw_vs_emit(struct brw_vs_compile *c )
             /* patch all the BREAK/CONT instructions from last BEGINLOOP */
             while (inst0 > loop_inst[loop_depth]) {
                inst0--;
-               if (inst0->header.opcode == BRW_OPCODE_BREAK) {
+               if (inst0->header.opcode == BRW_TGSI_OPCODE_BREAK) {
                   inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
                   inst0->bits3.if_else.pop_count = 0;
                }
-               else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
+               else if (inst0->header.opcode == BRW_TGSI_OPCODE_CONTINUE) {
                   inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
                   inst0->bits3.if_else.pop_count = 0;
                }
             }
          }
          break;
-      case OPCODE_BRA:
+      case TGSI_OPCODE_BRA:
 	 brw_set_predicate_control(p, get_predicate(inst));
          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
 	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
          break;
-      case OPCODE_CAL:
+      case TGSI_OPCODE_CAL:
 	 brw_set_access_mode(p, BRW_ALIGN_1);
 	 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
 	 brw_set_access_mode(p, BRW_ALIGN_16);
@@ -1575,27 +1478,27 @@ void brw_vs_emit(struct brw_vs_compile *c )
          brw_save_call(p, inst->Comment, p->nr_insn);
 	 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
          break;
-      case OPCODE_RET:
+      case TGSI_OPCODE_RET:
 	 brw_ADD(p, get_addr_reg(stack_index),
 			 get_addr_reg(stack_index), brw_imm_d(-4));
 	 brw_set_access_mode(p, BRW_ALIGN_1);
          brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
 	 brw_set_access_mode(p, BRW_ALIGN_16);
 	 break;
-      case OPCODE_END:	
+      case TGSI_OPCODE_END:	
          end_offset = p->nr_insn;
          /* this instruction will get patched later to jump past subroutine
           * code, etc.
           */
          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
          break;
-      case OPCODE_PRINT:
+      case TGSI_OPCODE_PRINT:
          /* no-op */
          break;
-      case OPCODE_BGNSUB:
+      case TGSI_OPCODE_BGNSUB:
          brw_save_label(p, inst->Comment, p->nr_insn);
          break;
-      case OPCODE_ENDSUB:
+      case TGSI_OPCODE_ENDSUB:
          /* no-op */
          break;
       default:
@@ -1618,33 +1521,6 @@ void brw_vs_emit(struct brw_vs_compile *c )
 	 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
       }
 
-      if ((inst->DstReg.File == PROGRAM_OUTPUT)
-          && (inst->DstReg.Index != VERT_RESULT_HPOS)
-          && c->output_regs[inst->DstReg.Index].used_in_src) {
-         brw_MOV(p, get_dst(c, inst->DstReg), dst);
-      }
-
-      /* Result color clamping.
-       *
-       * When destination register is an output register and
-       * it's primary/secondary front/back color, we have to clamp
-       * the result to [0,1]. This is done by enabling the
-       * saturation bit for the last instruction.
-       *
-       * We don't use brw_set_saturate() as it modifies
-       * p->current->header.saturate, which affects all the subsequent
-       * instructions. Instead, we directly modify the header
-       * of the last (already stored) instruction.
-       */
-      if (inst->DstReg.File == PROGRAM_OUTPUT) {
-         if ((inst->DstReg.Index == VERT_RESULT_COL0)
-             || (inst->DstReg.Index == VERT_RESULT_COL1)
-             || (inst->DstReg.Index == VERT_RESULT_BFC0)
-             || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
-            p->store[p->nr_insn-1].header.saturate = 1;
-         }
-      }
-
       release_tmps(c);
    }
 
diff --git a/src/gallium/drivers/i965/brw_wm.c b/src/gallium/drivers/i965/brw_wm.c
index 2292de94c4..20d31880b4 100644
--- a/src/gallium/drivers/i965/brw_wm.c
+++ b/src/gallium/drivers/i965/brw_wm.c
@@ -269,61 +269,46 @@ static void brw_wm_populate_key( struct brw_context *brw,
 		    uses_depth,
 		    key);
 
+   /* Revisit this, figure out if it's really useful, and either push
+    * it into the state tracker so that everyone benefits (use to
+    * create fs variants with TEX rather than TXP), or discard.
+    */
+   key->proj_attrib_mask = ~0; /*brw->wm.input_size_masks[4-1];*/
 
-   /* BRW_NEW_WM_INPUT_DIMENSIONS */
-   key->proj_attrib_mask = brw->wm.input_size_masks[4-1];
-
-   /* _NEW_LIGHT */
-   key->flat_shade = (ctx->Light.ShadeModel == GL_FLAT);
+   /* PIPE_NEW_RAST */
+   key->flat_shade = brw->rast.flat_shade;
 
-   /* _NEW_HINT */
-   key->linear_color = (ctx->Hint.PerspectiveCorrection == GL_FASTEST);
+   /* This can be determined by looking at the INTERP mode of each input decl.
+    */
+   key->linear_color = 0;
 
    /* _NEW_TEXTURE */
    for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
-      const struct gl_texture_unit *unit = &ctx->Texture.Unit[i];
-
-      if (unit->_ReallyEnabled) {
-         const struct gl_texture_object *t = unit->_Current;
-         const struct gl_texture_image *img = t->Image[0][t->BaseLevel];
+      if (i < brw->nr_textures) {
+	 const struct gl_texture_unit *unit = &ctx->Texture.Unit[i];
+	 const struct gl_texture_object *t = unit->_Current;
+	 const struct gl_texture_image *img = t->Image[0][t->BaseLevel];
+	 
 	 if (img->InternalFormat == GL_YCBCR_MESA) {
 	    key->yuvtex_mask |= 1 << i;
 	    if (img->TexFormat->MesaFormat == MESA_FORMAT_YCBCR)
-		key->yuvtex_swap_mask |= 1 << i;
+	       key->yuvtex_swap_mask |= 1 << i;
 	 }
 
-         key->tex_swizzles[i] = t->_Swizzle;
+	 key->tex_swizzles[i] = t->_Swizzle;
+	 
+	 if (0)
+	    key->shadowtex_mask |= 1<<i;
       }
       else {
          key->tex_swizzles[i] = SWIZZLE_NOOP;
       }
    }
 
-   /* Shadow */
-   key->shadowtex_mask = fp->program.Base.ShadowSamplers;
 
-   /* _NEW_BUFFERS */
-   /*
-    * Include the draw buffer origin and height so that we can calculate
-    * fragment position values relative to the bottom left of the drawable,
-    * from the incoming screen origin relative position we get as part of our
-    * payload.
-    *
-    * We could avoid recompiling by including this as a constant referenced by
-    * our program, but if we were to do that it would also be nice to handle
-    * getting that constant updated at batchbuffer submit time (when we
-    * hold the lock and know where the buffer really is) rather than at emit
-    * time when we don't hold the lock and are just guessing.  We could also
-    * just avoid using this as key data if the program doesn't use
-    * fragment.position.
-    *
-    * This pretty much becomes moot with DRI2 and redirected buffers anyway,
-    * as our origins will always be zero then.
-    */
+   /* _NEW_FRAMEBUFFER */
    if (brw->intel.driDrawable != NULL) {
-      key->origin_x = brw->intel.driDrawable->x;
-      key->origin_y = brw->intel.driDrawable->y;
-      key->drawable_height = brw->intel.driDrawable->h;
+      key->drawable_height = brw->fb.cbufs[0].height;
    }
 
    /* CACHE_NEW_VS_PROG */
diff --git a/src/gallium/drivers/i965/brw_wm.h b/src/gallium/drivers/i965/brw_wm.h
index 872b1f3ecf..756a680150 100644
--- a/src/gallium/drivers/i965/brw_wm.h
+++ b/src/gallium/drivers/i965/brw_wm.h
@@ -76,7 +76,6 @@ struct brw_wm_prog_key {
    GLuint tex_swizzles[BRW_MAX_TEX_UNIT];
 
    GLuint program_string_id:32;
-   GLuint origin_x, origin_y;
    GLuint drawable_height;
    GLuint vp_outputs_written;
 };
diff --git a/src/gallium/drivers/i965/brw_wm_emit.c b/src/gallium/drivers/i965/brw_wm_emit.c
index bf80a2942a..9c47c46a3d 100644
--- a/src/gallium/drivers/i965/brw_wm_emit.c
+++ b/src/gallium/drivers/i965/brw_wm_emit.c
@@ -125,23 +125,21 @@ static void emit_wpos_xy(struct brw_wm_compile *c,
 {
    struct brw_compile *p = &c->func;
 
-   /* Calculate the pixel offset from window bottom left into destination
-    * X and Y channels.
-    */
    if (mask & WRITEMASK_X) {
-      /* X' = X - origin */
-      brw_ADD(p,
+      /* X' = X */
+      brw_MOV(p,
 	      dst[0],
-	      retype(arg0[0], BRW_REGISTER_TYPE_W),
-	      brw_imm_d(0 - c->key.origin_x));
+	      retype(arg0[0], BRW_REGISTER_TYPE_W));
    }
 
+   /* XXX: is this needed any more, or is this a NOOP?
+    */
    if (mask & WRITEMASK_Y) {
-      /* Y' = height - (Y - origin_y) = height + origin_y - Y */
+      /* Y' = height - 1 - Y */
       brw_ADD(p,
 	      dst[1],
 	      negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
-	      brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
+	      brw_imm_d(c->key.drawable_height - 1));
    }
 }
 
@@ -1376,7 +1374,6 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 break;
 
       case OPCODE_MOV:
-      case OPCODE_SWZ:
 	 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
 	 break;
 
diff --git a/src/gallium/drivers/i965/brw_wm_fp.c b/src/gallium/drivers/i965/brw_wm_fp.c
index 4e3edfbbff..5f47d86f71 100644
--- a/src/gallium/drivers/i965/brw_wm_fp.c
+++ b/src/gallium/drivers/i965/brw_wm_fp.c
@@ -30,25 +30,12 @@
   */
                
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
+#include "pipe/p_shader_constants.h"
+
 #include "brw_context.h"
 #include "brw_wm.h"
 #include "brw_util.h"
 
-#include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
-#include "shader/prog_statevars.h"
-
-
-/** An invalid texture target */
-#define TEX_TARGET_NONE NUM_TEXTURE_TARGETS
-
-/** An invalid texture unit */
-#define TEX_UNIT_NONE BRW_MAX_TEX_UNIT
-
-#define FIRST_INTERNAL_TEMP MAX_NV_FRAGMENT_PROGRAM_TEMPS
 
 #define X    0
 #define Y    1
@@ -68,11 +55,6 @@ static const char *wm_opcode_strings[] = {
    "FRONTFACING",
 };
 
-#if 0
-static const char *wm_file_strings[] = {   
-   "PAYLOAD"
-};
-#endif
 
 
 /***********************************************************************
@@ -165,13 +147,13 @@ static struct prog_dst_register get_temp( struct brw_wm_compile *c )
    }
 
    c->fp_temp |= 1<<(bit-1);
-   return dst_reg(PROGRAM_TEMPORARY, FIRST_INTERNAL_TEMP+(bit-1));
+   return dst_reg(PROGRAM_TEMPORARY, c->first_internal_temp+(bit-1));
 }
 
 
 static void release_temp( struct brw_wm_compile *c, struct prog_dst_register temp )
 {
-   c->fp_temp &= ~(1 << (temp.Index - FIRST_INTERNAL_TEMP));
+   c->fp_temp &= ~(1 << (temp.Index - c->first_internal_temp));
 }
 
 
@@ -192,58 +174,29 @@ static struct prog_instruction *emit_insn(struct brw_wm_compile *c,
    return inst;
 }
 
-static struct prog_instruction * emit_tex_op(struct brw_wm_compile *c,
-				       GLuint op,
-				       struct prog_dst_register dest,
-				       GLuint saturate,
-				       GLuint tex_src_unit,
-				       GLuint tex_src_target,
-				       GLuint tex_shadow,
-				       struct prog_src_register src0,
-				       struct prog_src_register src1,
-				       struct prog_src_register src2 )
+static struct prog_instruction * emit_op(struct brw_wm_compile *c,
+					 GLuint op,
+					 struct prog_dst_register dest,
+					 GLuint saturate,
+					 struct prog_src_register src0,
+					 struct prog_src_register src1,
+					 struct prog_src_register src2 )
 {
    struct prog_instruction *inst = get_fp_inst(c);
       
-   assert(tex_src_unit < BRW_MAX_TEX_UNIT ||
-          tex_src_unit == TEX_UNIT_NONE);
-   assert(tex_src_target < NUM_TEXTURE_TARGETS ||
-          tex_src_target == TEX_TARGET_NONE);
-
-   /* update mask of which texture units are referenced by this program */
-   if (tex_src_unit != TEX_UNIT_NONE)
-      c->fp->tex_units_used |= (1 << tex_src_unit);
-
    memset(inst, 0, sizeof(*inst));
 
    inst->Opcode = op;
    inst->DstReg = dest;
    inst->SaturateMode = saturate;   
-   inst->TexSrcUnit = tex_src_unit;
-   inst->TexSrcTarget = tex_src_target;
-   inst->TexShadow = tex_shadow;
    inst->SrcReg[0] = src0;
    inst->SrcReg[1] = src1;
    inst->SrcReg[2] = src2;
    return inst;
 }
-   
-
-static struct prog_instruction * emit_op(struct brw_wm_compile *c,
-				       GLuint op,
-				       struct prog_dst_register dest,
-				       GLuint saturate,
-				       struct prog_src_register src0,
-				       struct prog_src_register src1,
-				       struct prog_src_register src2 )
-{
-   return emit_tex_op(c, op, dest, saturate,
-                      TEX_UNIT_NONE, TEX_TARGET_NONE, 0,  /* unit, tgt, shadow */
-                      src0, src1, src2);
-}
 
 
-/* Many Mesa opcodes produce the same value across all the result channels.
+/* Many opcodes produce the same value across all the result channels.
  * We'd rather not have to support that splatting in the opcode implementations,
  * and brw_wm_pass*.c wants to optimize them out by shuffling references around
  * anyway.  We can easily get both by emitting the opcode to one channel, and
@@ -267,7 +220,7 @@ static struct prog_instruction *emit_scalar_insn(struct brw_wm_compile *c,
    other_channel_mask = inst0->DstReg.WriteMask & ~(1 << dst_chan);
    if (other_channel_mask != 0) {
       inst = emit_op(c,
-		     OPCODE_MOV,
+		     TGSI_OPCODE_MOV,
 		     dst_mask(inst0->DstReg, other_channel_mask),
 		     0,
 		     src_swizzle1(src_reg_from_dst(inst0->DstReg), dst_chan),
@@ -356,7 +309,9 @@ static struct prog_src_register get_pixel_w( struct brw_wm_compile *c )
 }
 
 static void emit_interp( struct brw_wm_compile *c,
-			 GLuint idx )
+			 GLuint semantic,
+			 GLuint semantic_index,
+			 GLuint interp_mode )
 {
    struct prog_dst_register dst = dst_reg(PROGRAM_INPUT, idx);
    struct prog_src_register interp = src_reg(PROGRAM_PAYLOAD, idx);
@@ -366,7 +321,7 @@ static void emit_interp( struct brw_wm_compile *c,
     * multiplied by 1/W in the SF program, and LINTERP on those
     * which have not:
     */
-   switch (idx) {
+   switch (semantic) {
    case FRAG_ATTRIB_WPOS:
       /* Have to treat wpos.xy specially:
        */
@@ -390,8 +345,8 @@ static void emit_interp( struct brw_wm_compile *c,
 	      deltas,
 	      src_undef());
       break;
-   case FRAG_ATTRIB_COL0:
-   case FRAG_ATTRIB_COL1:
+
+   case TGSI_SEMANTIC_COLOR:
       if (c->key.flat_shade) {
 	 emit_op(c,
 		 WM_CINTERP,
@@ -402,25 +357,13 @@ static void emit_interp( struct brw_wm_compile *c,
 		 src_undef());
       }
       else {
-         if (c->key.linear_color) {
-            emit_op(c,
-                    WM_LINTERP,
-                    dst,
-                    0,
-                    interp,
-                    deltas,
-                    src_undef());
-         }
-         else {
-            /* perspective-corrected color interpolation */
-            emit_op(c,
-                    WM_PINTERP,
-                    dst,
-                    0,
-                    interp,
-                    deltas,
-                    get_pixel_w(c));
-         }
+	 emit_op(c,
+		 translate_interp_mode(interp_mode),
+		 dst,
+		 0,
+		 interp,
+		 deltas,
+		 get_pixel_w(c));
       }
       break;
    case FRAG_ATTRIB_FOGC:
@@ -434,7 +377,7 @@ static void emit_interp( struct brw_wm_compile *c,
 	      get_pixel_w(c));
 
       emit_op(c,
-	      OPCODE_MOV,
+	      TGSI_OPCODE_MOV,
 	      dst_mask(dst, WRITEMASK_YZW),
 	      0,
 	      src_swizzle(interp,
@@ -468,7 +411,7 @@ static void emit_interp( struct brw_wm_compile *c,
 	      get_pixel_w(c));
 
       emit_op(c,
-	      OPCODE_MOV,
+	      TGSI_OPCODE_MOV,
 	      dst_mask(dst, WRITEMASK_ZW),
 	      0,
 	      src_swizzle(interp,
@@ -482,7 +425,7 @@ static void emit_interp( struct brw_wm_compile *c,
 
    default:
       emit_op(c,
-	      WM_PINTERP,
+	      translate_interp_mode(interp_mode),
 	      dst,
 	      0,
 	      interp,
@@ -490,8 +433,6 @@ static void emit_interp( struct brw_wm_compile *c,
 	      get_pixel_w(c));
       break;
    }
-
-   c->fp_interp_emitted |= 1<<idx;
 }
 
 /***********************************************************************
@@ -581,7 +522,7 @@ static void precalc_dst( struct brw_wm_compile *c,
       /* dst.y = mul src0.y, src1.y
        */
       emit_op(c,
-	      OPCODE_MUL,
+	      TGSI_OPCODE_MUL,
 	      dst_mask(dst, WRITEMASK_Y),
 	      inst->SaturateMode,
 	      src0,
@@ -596,7 +537,7 @@ static void precalc_dst( struct brw_wm_compile *c,
       /* dst.xz = swz src0.1zzz
        */
       swz = emit_op(c,
-		    OPCODE_SWZ,
+		    TGSI_OPCODE_MOV,
 		    dst_mask(dst, WRITEMASK_XZ),
 		    inst->SaturateMode,
 		    src_swizzle(src0, SWIZZLE_ONE, z, z, z),
@@ -609,7 +550,7 @@ static void precalc_dst( struct brw_wm_compile *c,
       /* dst.w = mov src1.w
        */
       emit_op(c,
-	      OPCODE_MOV,
+	      TGSI_OPCODE_MOV,
 	      dst_mask(dst, WRITEMASK_W),
 	      inst->SaturateMode,
 	      src1,
@@ -631,7 +572,7 @@ static void precalc_lit( struct brw_wm_compile *c,
       /* dst.xw = swz src0.1111
        */
       swz = emit_op(c,
-		    OPCODE_SWZ,
+		    TGSI_OPCODE_MOV,
 		    dst_mask(dst, WRITEMASK_XW),
 		    0,
 		    src_swizzle1(src0, SWIZZLE_ONE),
@@ -643,7 +584,7 @@ static void precalc_lit( struct brw_wm_compile *c,
 
    if (dst.WriteMask & WRITEMASK_YZ) {
       emit_op(c,
-	      OPCODE_LIT,
+	      TGSI_OPCODE_LIT,
 	      dst_mask(dst, WRITEMASK_YZ),
 	      inst->SaturateMode,
 	      src0,
@@ -681,7 +622,7 @@ static void precalc_tex( struct brw_wm_compile *c,
        coord = src_reg_from_dst(tmpcoord);
 
        /* tmpcoord = src0 (i.e.: coord = src0) */
-       out = emit_op(c, OPCODE_MOV,
+       out = emit_op(c, TGSI_OPCODE_MOV,
                      tmpcoord,
                      0,
                      src0,
@@ -691,7 +632,7 @@ static void precalc_tex( struct brw_wm_compile *c,
        out->SrcReg[0].Abs = 1;
 
        /* tmp0 = MAX(coord.X, coord.Y) */
-       emit_op(c, OPCODE_MAX,
+       emit_op(c, TGSI_OPCODE_MAX,
                tmp0,
                0,
                src_swizzle1(coord, X),
@@ -699,7 +640,7 @@ static void precalc_tex( struct brw_wm_compile *c,
                src_undef());
 
        /* tmp1 = MAX(tmp0, coord.Z) */
-       emit_op(c, OPCODE_MAX,
+       emit_op(c, TGSI_OPCODE_MAX,
                tmp1,
                0,
                tmp0src,
@@ -707,7 +648,7 @@ static void precalc_tex( struct brw_wm_compile *c,
                src_undef());
 
        /* tmp0 = 1 / tmp1 */
-       emit_op(c, OPCODE_RCP,
+       emit_op(c, TGSI_OPCODE_RCP,
                dst_mask(tmp0, WRITEMASK_X),
                0,
                tmp1src,
@@ -715,7 +656,7 @@ static void precalc_tex( struct brw_wm_compile *c,
                src_undef());
 
        /* tmpCoord = src0 * tmp0 */
-       emit_op(c, OPCODE_MUL,
+       emit_op(c, TGSI_OPCODE_MUL,
                tmpcoord,
                0,
                src0,
@@ -738,7 +679,7 @@ static void precalc_tex( struct brw_wm_compile *c,
       /* coord.xy   = MUL inst->SrcReg[0], { 1/width, 1/height }
        */
       emit_op(c,
-	      OPCODE_MUL,
+	      TGSI_OPCODE_MUL,
 	      tmpcoord,
 	      0,
 	      inst->SrcReg[0],
@@ -785,7 +726,7 @@ static void precalc_tex( struct brw_wm_compile *c,
       /* tmp     = TEX ...
        */
       emit_tex_op(c, 
-                  OPCODE_TEX,
+                  TGSI_OPCODE_TEX,
                   tmp,
                   inst->SaturateMode,
                   unit,
@@ -798,7 +739,7 @@ static void precalc_tex( struct brw_wm_compile *c,
       /* tmp.xyz =  ADD TMP, C0
        */
       emit_op(c,
-	      OPCODE_ADD,
+	      TGSI_OPCODE_ADD,
 	      dst_mask(tmp, WRITEMASK_XYZ),
 	      0,
 	      tmpsrc,
@@ -809,7 +750,7 @@ static void precalc_tex( struct brw_wm_compile *c,
        */
 
       emit_op(c,
-	      OPCODE_MUL,
+	      TGSI_OPCODE_MUL,
 	      dst_mask(tmp, WRITEMASK_Y),
 	      0,
 	      tmpsrc,
@@ -824,7 +765,7 @@ static void precalc_tex( struct brw_wm_compile *c,
        */
 
       emit_op(c,
-	      OPCODE_MAD,
+	      TGSI_OPCODE_MAD,
 	      dst_mask(dst, WRITEMASK_XYZ),
 	      0,
 	      swap_uv?src_swizzle(tmpsrc, Z,Z,X,X):src_swizzle(tmpsrc, X,X,Z,Z),
@@ -834,7 +775,7 @@ static void precalc_tex( struct brw_wm_compile *c,
       /*  RGB.y   = MAD YUV.z, C1.w, RGB.y
        */
       emit_op(c,
-	      OPCODE_MAD,
+	      TGSI_OPCODE_MAD,
 	      dst_mask(dst, WRITEMASK_Y),
 	      0,
 	      src_swizzle1(tmpsrc, Z),
@@ -846,7 +787,7 @@ static void precalc_tex( struct brw_wm_compile *c,
    else {
       /* ordinary RGBA tex instruction */
       emit_tex_op(c, 
-                  OPCODE_TEX,
+                  TGSI_OPCODE_TEX,
                   inst->DstReg,
                   inst->SaturateMode,
                   unit,
@@ -861,7 +802,7 @@ static void precalc_tex( struct brw_wm_compile *c,
    if (c->key.tex_swizzles[unit] != SWIZZLE_NOOP) {
       /* swizzle the result of the TEX instruction */
       struct prog_src_register tmpsrc = src_reg_from_dst(inst->DstReg);
-      emit_op(c, OPCODE_SWZ,
+      emit_op(c, TGSI_OPCODE_MOV,
               inst->DstReg,
               SATURATE_OFF, /* saturate already done above */
               src_swizzle4(tmpsrc, c->key.tex_swizzles[unit]),
@@ -884,7 +825,7 @@ static GLboolean projtex( struct brw_wm_compile *c,
    const struct prog_src_register src = inst->SrcReg[0];
    GLboolean retVal;
 
-   assert(inst->Opcode == OPCODE_TXP);
+   assert(inst->Opcode == TGSI_OPCODE_TXP);
 
    /* Only try to detect the simplest cases.  Could detect (later)
     * cases where we are trying to emit code like RCP {1.0}, MUL x,
@@ -921,7 +862,7 @@ static void precalc_txp( struct brw_wm_compile *c,
       /* tmp0.w = RCP inst.arg[0][3]
        */
       emit_op(c,
-	      OPCODE_RCP,
+	      TGSI_OPCODE_RCP,
 	      dst_mask(tmp, WRITEMASK_W),
 	      0,
 	      src_swizzle1(src0, GET_SWZ(src0.Swizzle, W)),
@@ -931,7 +872,7 @@ static void precalc_txp( struct brw_wm_compile *c,
       /* tmp0.xyz =  MUL inst.arg[0], tmp0.wwww
        */
       emit_op(c,
-	      OPCODE_MUL,
+	      TGSI_OPCODE_MUL,
 	      dst_mask(tmp, WRITEMASK_XYZ),
 	      0,
 	      src0,
@@ -1015,6 +956,7 @@ static void validate_src_regs( struct brw_wm_compile *c,
 	 GLuint idx = inst->SrcReg[i].Index;
 	 if (!(c->fp_interp_emitted & (1<<idx))) {
 	    emit_interp(c, idx);
+	    c->fp_interp_emitted |= 1<<idx;
 	 }
       }
    }
@@ -1094,71 +1036,64 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
        */
 
       switch (inst->Opcode) {
-      case OPCODE_SWZ: 
+      case TGSI_OPCODE_ABS:
 	 out = emit_insn(c, inst);
-	 out->Opcode = OPCODE_MOV;
-	 break;
-	 
-      case OPCODE_ABS:
-	 out = emit_insn(c, inst);
-	 out->Opcode = OPCODE_MOV;
+	 out->Opcode = TGSI_OPCODE_MOV;
 	 out->SrcReg[0].Negate = NEGATE_NONE;
 	 out->SrcReg[0].Abs = 1;
 	 break;
 
-      case OPCODE_SUB: 
+      case TGSI_OPCODE_SUB: 
 	 out = emit_insn(c, inst);
-	 out->Opcode = OPCODE_ADD;
+	 out->Opcode = TGSI_OPCODE_ADD;
 	 out->SrcReg[1].Negate ^= NEGATE_XYZW;
 	 break;
 
-      case OPCODE_SCS: 
+      case TGSI_OPCODE_SCS: 
 	 out = emit_insn(c, inst);
 	 /* This should probably be done in the parser. 
 	  */
 	 out->DstReg.WriteMask &= WRITEMASK_XY;
 	 break;
 	 
-      case OPCODE_DST:
+      case TGSI_OPCODE_DST:
 	 precalc_dst(c, inst);
 	 break;
 
-      case OPCODE_LIT:
+      case TGSI_OPCODE_LIT:
 	 precalc_lit(c, inst);
 	 break;
 
-      case OPCODE_TEX:
+      case TGSI_OPCODE_TEX:
 	 precalc_tex(c, inst);
 	 break;
 
-      case OPCODE_TXP:
+      case TGSI_OPCODE_TXP:
 	 precalc_txp(c, inst);
 	 break;
 
-      case OPCODE_TXB:
+      case TGSI_OPCODE_TXB:
 	 out = emit_insn(c, inst);
 	 out->TexSrcUnit = fp->program.Base.SamplerUnits[inst->TexSrcUnit];
          assert(out->TexSrcUnit < BRW_MAX_TEX_UNIT);
 	 break;
 
-      case OPCODE_XPD: 
+      case TGSI_OPCODE_XPD: 
 	 out = emit_insn(c, inst);
 	 /* This should probably be done in the parser. 
 	  */
 	 out->DstReg.WriteMask &= WRITEMASK_XYZ;
 	 break;
 
-      case OPCODE_KIL: 
+      case TGSI_OPCODE_KIL: 
 	 out = emit_insn(c, inst);
 	 /* This should probably be done in the parser. 
 	  */
 	 out->DstReg.WriteMask = 0;
 	 break;
-      case OPCODE_END:
+      case TGSI_OPCODE_END:
 	 emit_fb_write(c);
 	 break;
-      case OPCODE_PRINT:
-	 break;
       default:
 	 if (brw_wm_is_scalar_result(inst->Opcode))
 	    emit_scalar_insn(c, inst);
diff --git a/src/gallium/drivers/i965/brw_wm_glsl.c b/src/gallium/drivers/i965/brw_wm_glsl.c
index c9fe1dd8ad..d836e2fb34 100644
--- a/src/gallium/drivers/i965/brw_wm_glsl.c
+++ b/src/gallium/drivers/i965/brw_wm_glsl.c
@@ -6,9 +6,6 @@
 #include "brw_eu.h"
 #include "brw_wm.h"
 
-enum _subroutine {
-    SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
-};
 
 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
                                   const struct prog_instruction *inst,
@@ -32,10 +29,6 @@ GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
 	    case OPCODE_CAL:
 	    case OPCODE_BRK:
 	    case OPCODE_RET:
-	    case OPCODE_NOISE1:
-	    case OPCODE_NOISE2:
-	    case OPCODE_NOISE3:
-	    case OPCODE_NOISE4:
 	    case OPCODE_BGNLOOP:
 		return GL_TRUE; 
 	    default:
@@ -1495,1036 +1488,7 @@ static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
 		   0, 16, 2 );
 }
 
-/* One-, two- and three-dimensional Perlin noise, similar to the description
-   in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
-static void noise1_sub( struct brw_wm_compile *c ) {
 
-    struct brw_compile *p = &c->func;
-    struct brw_reg param,
-	x0, x1, /* gradients at each end */       
-	t, tmp[ 2 ], /* float temporaries */
-	itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
-    int i;
-    int mark = mark_tmps( c );
-
-    x0 = alloc_tmp( c );
-    x1 = alloc_tmp( c );
-    t = alloc_tmp( c );
-    tmp[ 0 ] = alloc_tmp( c );
-    tmp[ 1 ] = alloc_tmp( c );
-    itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
-    itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
-    itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
-    itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
-    itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
-    
-    param = lookup_tmp( c, mark - 2 );
-
-    brw_set_access_mode( p, BRW_ALIGN_1 );
-
-    brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
-
-    /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
-       be hashed.  Also compute the remainder (offset within the unit
-       length), interleaved to reduce register dependency penalties. */
-    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
-    brw_FRC( p, param, param );
-    brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
-    brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
-    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
-
-    /* We're now ready to perform the hashing.  The two hashes are
-       interleaved for performance.  The hash function used is
-       designed to rapidly achieve avalanche and require only 32x16
-       bit multiplication, and 16-bit swizzles (which we get for
-       free).  We can't use immediate operands in the multiplies,
-       because immediates are permitted only in src1 and the 16-bit
-       factor is permitted only in src0. */
-    for( i = 0; i < 2; i++ )
-	brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
-    for( i = 0; i < 2; i++ )
-       brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
-		high_words( itmp[ i ] ) );
-    for( i = 0; i < 2; i++ )
-	brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
-    for( i = 0; i < 2; i++ )
-       brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
-		high_words( itmp[ i ] ) );
-    for( i = 0; i < 2; i++ )
-	brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
-    for( i = 0; i < 2; i++ )
-       brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
-		high_words( itmp[ i ] ) );
-
-    /* Now we want to initialise the two gradients based on the
-       hashes.  Format conversion from signed integer to float leaves
-       everything scaled too high by a factor of pow( 2, 31 ), but
-       we correct for that right at the end. */
-    brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
-    brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
-    brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
-
-    brw_MUL( p, x0, x0, param );
-    brw_MUL( p, x1, x1, t );
-    
-    /* We interpolate between the gradients using the polynomial
-       6t^5 - 15t^4 + 10t^3 (Perlin). */
-    brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
-    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
-    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
-    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
-    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
-    brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
-					   pipeline */
-    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
-    brw_MUL( p, param, tmp[ 0 ], param );
-    brw_MUL( p, x1, x1, param );
-    brw_ADD( p, x0, x0, x1 );    
-    /* scale by pow( 2, -30 ), to compensate for the format conversion
-       above and an extra factor of 2 so that a single gradient covers
-       the [-1,1] range */
-    brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
-
-    release_tmps( c, mark );
-}
-
-static void emit_noise1( struct brw_wm_compile *c,
-			 const struct prog_instruction *inst )
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src, param, dst;
-    GLuint mask = inst->DstReg.WriteMask;
-    int i;
-    int mark = mark_tmps( c );
-
-    assert( mark == 0 );
-    
-    src = get_src_reg( c, inst, 0, 0 );
-
-    param = alloc_tmp( c );
-
-    brw_MOV( p, param, src );
-
-    invoke_subroutine( c, SUB_NOISE1, noise1_sub );
-    
-    /* Fill in the result: */
-    brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    brw_MOV( p, dst, param );
-	}
-    }
-    if( inst->SaturateMode == SATURATE_ZERO_ONE )
-	brw_set_saturate( p, 0 );
-    
-    release_tmps( c, mark );
-}
-    
-static void noise2_sub( struct brw_wm_compile *c ) {
-
-    struct brw_compile *p = &c->func;
-    struct brw_reg param0, param1,
-	x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */       
-	t, tmp[ 4 ], /* float temporaries */
-	itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
-    int i;
-    int mark = mark_tmps( c );
-
-    x0y0 = alloc_tmp( c );
-    x0y1 = alloc_tmp( c );
-    x1y0 = alloc_tmp( c );
-    x1y1 = alloc_tmp( c );
-    t = alloc_tmp( c );
-    for( i = 0; i < 4; i++ ) {
-	tmp[ i ] = alloc_tmp( c );
-	itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
-    }
-    itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
-    itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
-    itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
-    
-    param0 = lookup_tmp( c, mark - 3 );
-    param1 = lookup_tmp( c, mark - 2 );
-
-    brw_set_access_mode( p, BRW_ALIGN_1 );
-    
-    /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
-       be hashed.  Also compute the remainders (offsets within the unit
-       square), interleaved to reduce register dependency penalties. */
-    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
-    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
-    brw_FRC( p, param0, param0 );
-    brw_FRC( p, param1, param1 );
-    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
-    brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
-	     low_words( itmp[ 1 ] ) );
-    brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
-    brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
-    brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
-    brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
-    brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
-
-    /* We're now ready to perform the hashing.  The four hashes are
-       interleaved for performance.  The hash function used is
-       designed to rapidly achieve avalanche and require only 32x16
-       bit multiplication, and 16-bit swizzles (which we get for
-       free).  We can't use immediate operands in the multiplies,
-       because immediates are permitted only in src1 and the 16-bit
-       factor is permitted only in src0. */
-    for( i = 0; i < 4; i++ )
-	brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
-    for( i = 0; i < 4; i++ )
-	brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
-		 high_words( itmp[ i ] ) );
-    for( i = 0; i < 4; i++ )
-	brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
-    for( i = 0; i < 4; i++ )
-	brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
-		 high_words( itmp[ i ] ) );
-    for( i = 0; i < 4; i++ )
-	brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
-    for( i = 0; i < 4; i++ )
-	brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
-		 high_words( itmp[ i ] ) );
-
-    /* Now we want to initialise the four gradients based on the
-       hashes.  Format conversion from signed integer to float leaves
-       everything scaled too high by a factor of pow( 2, 15 ), but
-       we correct for that right at the end. */
-    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
-    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
-    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
-    brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
-    brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
-    
-    brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
-    brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
-    brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
-    brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
-    
-    brw_MUL( p, x1y0, x1y0, t );
-    brw_MUL( p, x1y1, x1y1, t );
-    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
-    brw_MUL( p, x0y0, x0y0, param0 );
-    brw_MUL( p, x0y1, x0y1, param0 );
-
-    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
-    brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
-    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
-    brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
-
-    brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
-    brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
-    
-    /* We interpolate between the gradients using the polynomial
-       6t^5 - 15t^4 + 10t^3 (Perlin). */
-    brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
-    brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
-    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
-    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
-    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
-    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
-    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
-						 pipeline */
-    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
-    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
-    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
-    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
-    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
-						 pipeline */
-    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
-    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
-    brw_MUL( p, param0, tmp[ 0 ], param0 );
-    brw_MUL( p, param1, tmp[ 1 ], param1 );
-    
-    /* Here we interpolate in the y dimension... */
-    brw_MUL( p, x0y1, x0y1, param1 );
-    brw_MUL( p, x1y1, x1y1, param1 );
-    brw_ADD( p, x0y0, x0y0, x0y1 );
-    brw_ADD( p, x1y0, x1y0, x1y1 );
-
-    /* And now in x.  There are horrible register dependencies here,
-       but we have nothing else to do. */
-    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
-    brw_MUL( p, x1y0, x1y0, param0 );
-    brw_ADD( p, x0y0, x0y0, x1y0 );
-    
-    /* scale by pow( 2, -15 ), as described above */
-    brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
-
-    release_tmps( c, mark );
-}
-
-static void emit_noise2( struct brw_wm_compile *c,
-			 const struct prog_instruction *inst )
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, src1, param0, param1, dst;
-    GLuint mask = inst->DstReg.WriteMask;
-    int i;
-    int mark = mark_tmps( c );
-
-    assert( mark == 0 );
-    
-    src0 = get_src_reg( c, inst, 0, 0 );
-    src1 = get_src_reg( c, inst, 0, 1 );
-
-    param0 = alloc_tmp( c );
-    param1 = alloc_tmp( c );
-
-    brw_MOV( p, param0, src0 );
-    brw_MOV( p, param1, src1 );
-
-    invoke_subroutine( c, SUB_NOISE2, noise2_sub );
-    
-    /* Fill in the result: */
-    brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    brw_MOV( p, dst, param0 );
-	}
-    }
-    if( inst->SaturateMode == SATURATE_ZERO_ONE )
-	brw_set_saturate( p, 0 );
-    
-    release_tmps( c, mark );
-}
-
-/**
- * The three-dimensional case is much like the one- and two- versions above,
- * but since the number of corners is rapidly growing we now pack 16 16-bit
- * hashes into each register to extract more parallelism from the EUs.
- */
-static void noise3_sub( struct brw_wm_compile *c ) {
-
-    struct brw_compile *p = &c->func;
-    struct brw_reg param0, param1, param2,
-	x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
-	xi, yi, zi, /* interpolation coefficients */
-	t, tmp[ 8 ], /* float temporaries */
-	itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
-	wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
-    int i;
-    int mark = mark_tmps( c );
-
-    x0y0 = alloc_tmp( c );
-    x0y1 = alloc_tmp( c );
-    x1y0 = alloc_tmp( c );
-    x1y1 = alloc_tmp( c );
-    xi = alloc_tmp( c );
-    yi = alloc_tmp( c );
-    zi = alloc_tmp( c );
-    t = alloc_tmp( c );
-    for( i = 0; i < 8; i++ ) {
-	tmp[ i ] = alloc_tmp( c );
-	itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
-	wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
-    }
-    
-    param0 = lookup_tmp( c, mark - 4 );
-    param1 = lookup_tmp( c, mark - 3 );
-    param2 = lookup_tmp( c, mark - 2 );
-
-    brw_set_access_mode( p, BRW_ALIGN_1 );
-    
-    /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
-       be hashed.  Also compute the remainders (offsets within the unit
-       cube), interleaved to reduce register dependency penalties. */
-    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
-    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
-    brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
-    brw_FRC( p, param0, param0 );
-    brw_FRC( p, param1, param1 );
-    brw_FRC( p, param2, param2 );
-    /* Since we now have only 16 bits of precision in the hash, we must
-       be more careful about thorough mixing to maintain entropy as we
-       squash the input vector into a small scalar. */
-    brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
-    brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
-    brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
-	     brw_imm_uw( 0x9B93 ) );
-    brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
-	     brw_imm_uw( 0xBC8F ) );
-
-    /* Temporarily disable the execution mask while we work with ExecSize=16
-       channels (the mask is set for ExecSize=8 and is probably incorrect).
-       Although this might cause execution of unwanted channels, the code
-       writes only to temporary registers and has no side effects, so
-       disabling the mask is harmless. */
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
-    brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
-    brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
-
-    /* We're now ready to perform the hashing.  The eight hashes are
-       interleaved for performance.  The hash function used is
-       designed to rapidly achieve avalanche and require only 16x16
-       bit multiplication, and 8-bit swizzles (which we get for
-       free). */
-    for( i = 0; i < 4; i++ )
-	brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
-    for( i = 0; i < 4; i++ )
-	brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
-		 odd_bytes( wtmp[ i ] ) );
-    for( i = 0; i < 4; i++ )
-	brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
-    for( i = 0; i < 4; i++ )
-	brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
-		 odd_bytes( wtmp[ i ] ) );
-    brw_pop_insn_state( p );
-
-    /* Now we want to initialise the four rear gradients based on the
-       hashes.  Format conversion from signed integer to float leaves
-       everything scaled too high by a factor of pow( 2, 15 ), but
-       we correct for that right at the end. */
-    /* x component */
-    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
-    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
-    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
-    brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
-    brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
-
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
-    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
-    brw_pop_insn_state( p );
-    
-    brw_MUL( p, x1y0, x1y0, t );
-    brw_MUL( p, x1y1, x1y1, t );
-    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
-    brw_MUL( p, x0y0, x0y0, param0 );
-    brw_MUL( p, x0y1, x0y1, param0 );
-
-    /* y component */
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
-    
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
-    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
-    brw_pop_insn_state( p );
-
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
-    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
-    
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    
-    /* z component */
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
-
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
-    
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-    
-    /* We interpolate between the gradients using the polynomial
-       6t^5 - 15t^4 + 10t^3 (Perlin). */
-    brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
-    brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
-    brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
-    brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
-    brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
-    brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
-    brw_MUL( p, xi, xi, param0 );
-    brw_MUL( p, yi, yi, param1 );
-    brw_MUL( p, zi, zi, param2 );
-    brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
-    brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
-    brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
-    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
-    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
-    brw_MUL( p, xi, xi, param0 );
-    brw_MUL( p, yi, yi, param1 );
-    brw_MUL( p, zi, zi, param2 );
-    brw_MUL( p, xi, xi, param0 );
-    brw_MUL( p, yi, yi, param1 );
-    brw_MUL( p, zi, zi, param2 );
-    brw_MUL( p, xi, xi, param0 );
-    brw_MUL( p, yi, yi, param1 );
-    brw_MUL( p, zi, zi, param2 );
-    
-    /* Here we interpolate in the y dimension... */
-    brw_MUL( p, x0y1, x0y1, yi );
-    brw_MUL( p, x1y1, x1y1, yi );
-    brw_ADD( p, x0y0, x0y0, x0y1 );
-    brw_ADD( p, x1y0, x1y0, x1y1 );
-
-    /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
-    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
-    brw_MUL( p, x1y0, x1y0, xi );
-    brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
-
-    /* Now do the same thing for the front four gradients... */
-    /* x component */
-    brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
-    brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
-    brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
-    brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
-
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
-    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
-    brw_pop_insn_state( p );
-
-    brw_MUL( p, x1y0, x1y0, t );
-    brw_MUL( p, x1y1, x1y1, t );
-    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
-    brw_MUL( p, x0y0, x0y0, param0 );
-    brw_MUL( p, x0y1, x0y1, param0 );
-
-    /* y component */
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
-    
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
-    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
-    brw_pop_insn_state( p );
-
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
-    brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
-    
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    
-    /* z component */
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
-
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
-    
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-    
-    /* The interpolation coefficients are still around from last time, so
-       again interpolate in the y dimension... */
-    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
-    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
-    brw_MUL( p, x0y1, x0y1, yi );
-    brw_MUL( p, x1y1, x1y1, yi );
-    brw_ADD( p, x0y0, x0y0, x0y1 );
-    brw_ADD( p, x1y0, x1y0, x1y1 );
-
-    /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
-       time put the front face in tmp[ 1 ] and we're nearly there... */
-    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
-    brw_MUL( p, x1y0, x1y0, xi );
-    brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
-
-    /* The final interpolation, in the z dimension: */
-    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );    
-    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
-    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
-    
-    /* scale by pow( 2, -15 ), as described above */
-    brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
-
-    release_tmps( c, mark );
-}
-
-static void emit_noise3( struct brw_wm_compile *c,
-			 const struct prog_instruction *inst )
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, src1, src2, param0, param1, param2, dst;
-    GLuint mask = inst->DstReg.WriteMask;
-    int i;
-    int mark = mark_tmps( c );
-
-    assert( mark == 0 );
-    
-    src0 = get_src_reg( c, inst, 0, 0 );
-    src1 = get_src_reg( c, inst, 0, 1 );
-    src2 = get_src_reg( c, inst, 0, 2 );
-
-    param0 = alloc_tmp( c );
-    param1 = alloc_tmp( c );
-    param2 = alloc_tmp( c );
-
-    brw_MOV( p, param0, src0 );
-    brw_MOV( p, param1, src1 );
-    brw_MOV( p, param2, src2 );
-
-    invoke_subroutine( c, SUB_NOISE3, noise3_sub );
-    
-    /* Fill in the result: */
-    brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    brw_MOV( p, dst, param0 );
-	}
-    }
-    if( inst->SaturateMode == SATURATE_ZERO_ONE )
-	brw_set_saturate( p, 0 );
-    
-    release_tmps( c, mark );
-}
-    
-/**
- * For the four-dimensional case, the little micro-optimisation benefits
- * we obtain by unrolling all the loops aren't worth the massive bloat it
- * now causes.  Instead, we loop twice around performing a similar operation
- * to noise3, once for the w=0 cube and once for the w=1, with a bit more
- * code to glue it all together.
- */
-static void noise4_sub( struct brw_wm_compile *c )
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg param[ 4 ],
-	x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
-	w0, /* noise for the w=0 cube */
-	floors[ 2 ], /* integer coordinates of base corner of hypercube */
-	interp[ 4 ], /* interpolation coefficients */
-	t, tmp[ 8 ], /* float temporaries */
-	itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
-	wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
-    int i, j;
-    int mark = mark_tmps( c );
-    GLuint loop, origin;
-    
-    x0y0 = alloc_tmp( c );
-    x0y1 = alloc_tmp( c );
-    x1y0 = alloc_tmp( c );
-    x1y1 = alloc_tmp( c );
-    t = alloc_tmp( c );
-    w0 = alloc_tmp( c );    
-    floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
-    floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
-
-    for( i = 0; i < 4; i++ ) {
-	param[ i ] = lookup_tmp( c, mark - 5 + i );
-	interp[ i ] = alloc_tmp( c );
-    }
-    
-    for( i = 0; i < 8; i++ ) {
-	tmp[ i ] = alloc_tmp( c );
-	itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
-	wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
-    }
-
-    brw_set_access_mode( p, BRW_ALIGN_1 );
-
-    /* We only want 16 bits of precision from the integral part of each
-       co-ordinate, but unfortunately the RNDD semantics would saturate
-       at 16 bits if we performed the operation directly to a 16-bit
-       destination.  Therefore, we round to 32-bit temporaries where
-       appropriate, and then store only the lower 16 bits. */
-    brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
-    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
-    brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
-    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
-    brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
-    brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
-
-    /* Modify the flag register here, because the side effect is useful
-       later (see below).  We know for certain that all flags will be
-       cleared, since the FRC instruction cannot possibly generate
-       negative results.  Even for exceptional inputs (infinities, denormals,
-       NaNs), the architecture guarantees that the L conditional is false. */
-    brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
-    brw_FRC( p, param[ 0 ], param[ 0 ] );
-    brw_set_predicate_control( p, BRW_PREDICATE_NONE );
-    for( i = 1; i < 4; i++ )	
-	brw_FRC( p, param[ i ], param[ i ] );
-    
-    /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
-       of all. */
-    for( i = 0; i < 4; i++ )
-	brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
-    for( i = 0; i < 4; i++ )
-	brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
-    for( i = 0; i < 4; i++ )
-	brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
-    for( i = 0; i < 4; i++ )
-	brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
-    for( j = 0; j < 3; j++ )
-	for( i = 0; i < 4; i++ )
-	    brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
-
-    /* Mark the current address, as it will be a jump destination.  The
-       following code will be executed twice: first, with the flag
-       register clear indicating the w=0 case, and second with flags
-       set for w=1. */
-    loop = p->nr_insn;
-    
-    /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
-       be hashed.  Since we have only 16 bits of precision in the hash, we
-       must be careful about thorough mixing to maintain entropy as we
-       squash the input vector into a small scalar. */
-    brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
-	     brw_imm_uw( 0xBC8F ) );
-    brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
-	     brw_imm_uw( 0xD0BD ) );
-    brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
-	     brw_imm_uw( 0x9B93 ) );
-    brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
-	     brw_imm_uw( 0xA359 ) );
-    brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
-	     brw_imm_uw( 0xBC8F ) );
-
-    /* Temporarily disable the execution mask while we work with ExecSize=16
-       channels (the mask is set for ExecSize=8 and is probably incorrect).
-       Although this might cause execution of unwanted channels, the code
-       writes only to temporary registers and has no side effects, so
-       disabling the mask is harmless. */
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
-    brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
-    brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
-
-    /* We're now ready to perform the hashing.  The eight hashes are
-       interleaved for performance.  The hash function used is
-       designed to rapidly achieve avalanche and require only 16x16
-       bit multiplication, and 8-bit swizzles (which we get for
-       free). */
-    for( i = 0; i < 4; i++ )
-	brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
-    for( i = 0; i < 4; i++ )
-	brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
-		 odd_bytes( wtmp[ i ] ) );
-    for( i = 0; i < 4; i++ )
-	brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
-    for( i = 0; i < 4; i++ )
-	brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
-		 odd_bytes( wtmp[ i ] ) );
-    brw_pop_insn_state( p );
-
-    /* Now we want to initialise the four rear gradients based on the
-       hashes.  Format conversion from signed integer to float leaves
-       everything scaled too high by a factor of pow( 2, 15 ), but
-       we correct for that right at the end. */
-    /* x component */
-    brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
-    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
-    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
-    brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
-    brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
-
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
-    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
-    brw_pop_insn_state( p );
-    
-    brw_MUL( p, x1y0, x1y0, t );
-    brw_MUL( p, x1y1, x1y1, t );
-    brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
-    brw_MUL( p, x0y0, x0y0, param[ 0 ] );
-    brw_MUL( p, x0y1, x0y1, param[ 0 ] );
-
-    /* y component */
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
-    
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
-    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
-    brw_pop_insn_state( p );
-
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );    
-    /* prepare t for the w component (used below): w the first time through
-       the loop; w - 1 the second time) */
-    brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
-    brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
-    p->current->header.predicate_inverse = 1;
-    brw_MOV( p, t, param[ 3 ] );
-    p->current->header.predicate_inverse = 0;
-    brw_set_predicate_control( p, BRW_PREDICATE_NONE );
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
-    
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    
-    /* z component */
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
-
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
-    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
-    brw_pop_insn_state( p );
-
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
-    
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-
-    /* w component */
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
-
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
-    brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
-    
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-
-    /* Here we interpolate in the y dimension... */
-    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
-    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
-    brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
-    brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
-    brw_ADD( p, x0y0, x0y0, x0y1 );
-    brw_ADD( p, x1y0, x1y0, x1y1 );
-
-    /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
-    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
-    brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
-    brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
-
-    /* Now do the same thing for the front four gradients... */
-    /* x component */
-    brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
-    brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
-    brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
-    brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
-
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
-    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
-    brw_pop_insn_state( p );
-
-    brw_MUL( p, x1y0, x1y0, t );
-    brw_MUL( p, x1y1, x1y1, t );
-    brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
-    brw_MUL( p, x0y0, x0y0, param[ 0 ] );
-    brw_MUL( p, x0y1, x0y1, param[ 0 ] );
-
-    /* y component */
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
-    
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
-    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
-    brw_pop_insn_state( p );
-
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
-    brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
-    
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    
-    /* z component */
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
-
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
-    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
-    brw_pop_insn_state( p );
-
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
-    /* prepare t for the w component (used below): w the first time through
-       the loop; w - 1 the second time) */
-    brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
-    brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
-    p->current->header.predicate_inverse = 1;
-    brw_MOV( p, t, param[ 3 ] );
-    p->current->header.predicate_inverse = 0;
-    brw_set_predicate_control( p, BRW_PREDICATE_NONE );
-    
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-
-    /* w component */
-    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
-    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
-    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
-    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
-
-    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
-    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
-    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
-    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
-    
-    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
-    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
-    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
-    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
-
-    /* Interpolate in the y dimension: */
-    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
-    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
-    brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
-    brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
-    brw_ADD( p, x0y0, x0y0, x0y1 );
-    brw_ADD( p, x1y0, x1y0, x1y1 );
-
-    /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
-       time put the front face in tmp[ 1 ] and we're nearly there... */
-    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
-    brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
-    brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
-
-    /* Another interpolation, in the z dimension: */
-    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );    
-    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
-    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
-
-    /* Exit the loop if we've computed both cubes... */
-    origin = p->nr_insn;
-    brw_push_insn_state( p );
-    brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
-    brw_pop_insn_state( p );
-
-    /* Save the result for the w=0 case, and increment the w coordinate: */
-    brw_MOV( p, w0, tmp[ 0 ] );
-    brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
-	     brw_imm_uw( 1 ) );
-
-    /* Loop around for the other cube.  Explicitly set the flag register
-       (unfortunately we must spend an extra instruction to do this: we
-       can't rely on a side effect of the previous MOV or ADD because
-       conditional modifiers which are normally true might be false in
-       exceptional circumstances, e.g. given a NaN input; the add to
-       brw_ip_reg() is not suitable because the IP is not an 8-vector). */
-    brw_push_insn_state( p );
-    brw_set_mask_control( p, BRW_MASK_DISABLE );
-    brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
-    brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
-	     brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
-    brw_pop_insn_state( p );
-
-    /* Patch the previous conditional branch now that we know the
-       destination address. */
-    brw_set_src1( p->store + origin,
-		  brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
-
-    /* The very last interpolation. */
-    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );    
-    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
-    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
-
-    /* scale by pow( 2, -15 ), as described above */
-    brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
-
-    release_tmps( c, mark );
-}
-
-static void emit_noise4( struct brw_wm_compile *c,
-			 const struct prog_instruction *inst )
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
-    GLuint mask = inst->DstReg.WriteMask;
-    int i;
-    int mark = mark_tmps( c );
-
-    assert( mark == 0 );
-    
-    src0 = get_src_reg( c, inst, 0, 0 );
-    src1 = get_src_reg( c, inst, 0, 1 );
-    src2 = get_src_reg( c, inst, 0, 2 );
-    src3 = get_src_reg( c, inst, 0, 3 );
-
-    param0 = alloc_tmp( c );
-    param1 = alloc_tmp( c );
-    param2 = alloc_tmp( c );
-    param3 = alloc_tmp( c );
-
-    brw_MOV( p, param0, src0 );
-    brw_MOV( p, param1, src1 );
-    brw_MOV( p, param2, src2 );
-    brw_MOV( p, param3, src3 );
-
-    invoke_subroutine( c, SUB_NOISE4, noise4_sub );
-    
-    /* Fill in the result: */
-    brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    brw_MOV( p, dst, param0 );
-	}
-    }
-    if( inst->SaturateMode == SATURATE_ZERO_ONE )
-	brw_set_saturate( p, 0 );
-    
-    release_tmps( c, mark );
-}
     
 static void emit_wpos_xy(struct brw_wm_compile *c,
                          const struct prog_instruction *inst)
@@ -2543,19 +1507,18 @@ static void emit_wpos_xy(struct brw_wm_compile *c,
      * X and Y channels.
      */
     if (mask & WRITEMASK_X) {
-	/* X' = X - origin_x */
-	brw_ADD(p,
+	/* X' = X */
+	brw_MOV(p,
 		dst[0],
-		retype(src0[0], BRW_REGISTER_TYPE_W),
-		brw_imm_d(0 - c->key.origin_x));
+		retype(src0[0], BRW_REGISTER_TYPE_W));
     }
 
     if (mask & WRITEMASK_Y) {
-	/* Y' = height - (Y - origin_y) = height + origin_y - Y */
+	/* Y' = height - 1 - Y */
 	brw_ADD(p,
 		dst[1],
 		negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
-		brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
+		brw_imm_d(c->key.drawable_height - 1));
     }
 }
 
@@ -2827,7 +1790,6 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 		emit_trunc(c, inst);
 		break;
 	    case OPCODE_MOV:
-	    case OPCODE_SWZ:
 		emit_mov(c, inst);
 		break;
 	    case OPCODE_DP3:
@@ -2903,18 +1865,6 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 	    case OPCODE_MAD:
 		emit_mad(c, inst);
 		break;
-	    case OPCODE_NOISE1:
-		emit_noise1(c, inst);
-		break;
-	    case OPCODE_NOISE2:
-		emit_noise2(c, inst);
-		break;
-	    case OPCODE_NOISE3:
-		emit_noise3(c, inst);
-		break;
-	    case OPCODE_NOISE4:
-		emit_noise4(c, inst);
-		break;
 	    case OPCODE_TEX:
 		emit_tex(c, inst);
 		break;
diff --git a/src/gallium/drivers/i965/brw_wm_pass0.c b/src/gallium/drivers/i965/brw_wm_pass0.c
index 6279258339..0c411b57f5 100644
--- a/src/gallium/drivers/i965/brw_wm_pass0.c
+++ b/src/gallium/drivers/i965/brw_wm_pass0.c
@@ -422,7 +422,6 @@ void brw_wm_pass0( struct brw_wm_compile *c )
        */      
       switch (inst->Opcode) {
       case OPCODE_MOV: 
-      case OPCODE_SWZ: 
 	 if (!inst->SaturateMode) {
 	    pass0_precalc_mov(c, inst);
 	 }
diff --git a/src/gallium/drivers/i965/brw_wm_pass1.c b/src/gallium/drivers/i965/brw_wm_pass1.c
index b449394029..d940ec09a9 100644
--- a/src/gallium/drivers/i965/brw_wm_pass1.c
+++ b/src/gallium/drivers/i965/brw_wm_pass1.c
@@ -120,7 +120,7 @@ void brw_wm_pass1( struct brw_wm_compile *c )
       GLuint writemask;
       GLuint read0, read1, read2;
 
-      if (inst->opcode == OPCODE_KIL) {
+      if (inst->opcode == TGSI_OPCODE_KIL) {
 	 track_arg(c, inst, 0, WRITEMASK_XYZW); /* All args contribute to final */
 	 continue;
       }
@@ -154,76 +154,75 @@ void brw_wm_pass1( struct brw_wm_compile *c )
       /* Mark all inputs which contribute to the marked outputs:
        */
       switch (inst->opcode) {
-      case OPCODE_ABS:
-      case OPCODE_FLR:
-      case OPCODE_FRC:
-      case OPCODE_MOV:
-      case OPCODE_SWZ:
-      case OPCODE_TRUNC:
+      case TGSI_OPCODE_ABS:
+      case TGSI_OPCODE_FLR:
+      case TGSI_OPCODE_FRC:
+      case TGSI_OPCODE_MOV:
+      case TGSI_OPCODE_TRUNC:
 	 read0 = writemask;
 	 break;
 
-      case OPCODE_SUB:
-      case OPCODE_SLT:
-      case OPCODE_SLE:
-      case OPCODE_SGE:
-      case OPCODE_SGT:
-      case OPCODE_SEQ:
-      case OPCODE_SNE:
-      case OPCODE_ADD:
-      case OPCODE_MAX:
-      case OPCODE_MIN:
-      case OPCODE_MUL:
+      case TGSI_OPCODE_SUB:
+      case TGSI_OPCODE_SLT:
+      case TGSI_OPCODE_SLE:
+      case TGSI_OPCODE_SGE:
+      case TGSI_OPCODE_SGT:
+      case TGSI_OPCODE_SEQ:
+      case TGSI_OPCODE_SNE:
+      case TGSI_OPCODE_ADD:
+      case TGSI_OPCODE_MAX:
+      case TGSI_OPCODE_MIN:
+      case TGSI_OPCODE_MUL:
 	 read0 = writemask;
 	 read1 = writemask;
 	 break;
 
-      case OPCODE_DDX:
-      case OPCODE_DDY:
+      case TGSI_OPCODE_DDX:
+      case TGSI_OPCODE_DDY:
 	 read0 = writemask;
 	 break;
 
-      case OPCODE_MAD:	
-      case OPCODE_CMP:
-      case OPCODE_LRP:
+      case TGSI_OPCODE_MAD:	
+      case TGSI_OPCODE_CMP:
+      case TGSI_OPCODE_LRP:
 	 read0 = writemask;
 	 read1 = writemask;	
 	 read2 = writemask;	
 	 break;
 
-      case OPCODE_XPD: 
+      case TGSI_OPCODE_XPD: 
 	 if (writemask & WRITEMASK_X) read0 |= WRITEMASK_YZ;	 
 	 if (writemask & WRITEMASK_Y) read0 |= WRITEMASK_XZ;	 
 	 if (writemask & WRITEMASK_Z) read0 |= WRITEMASK_XY;
 	 read1 = read0;
 	 break;
 
-      case OPCODE_COS:
-      case OPCODE_EX2:
-      case OPCODE_LG2:
-      case OPCODE_RCP:
-      case OPCODE_RSQ:
-      case OPCODE_SIN:
-      case OPCODE_SCS:
+      case TGSI_OPCODE_COS:
+      case TGSI_OPCODE_EX2:
+      case TGSI_OPCODE_LG2:
+      case TGSI_OPCODE_RCP:
+      case TGSI_OPCODE_RSQ:
+      case TGSI_OPCODE_SIN:
+      case TGSI_OPCODE_SCS:
       case WM_CINTERP:
       case WM_PIXELXY:
 	 read0 = WRITEMASK_X;
 	 break;
 
-      case OPCODE_POW:
+      case TGSI_OPCODE_POW:
 	 read0 = WRITEMASK_X;
 	 read1 = WRITEMASK_X;
 	 break;
 
-      case OPCODE_TEX:
-      case OPCODE_TXP:
+      case TGSI_OPCODE_TEX:
+      case TGSI_OPCODE_TXP:
 	 read0 = get_texcoord_mask(inst->tex_idx);
 
          if (inst->tex_shadow)
 	    read0 |= WRITEMASK_Z;
 	 break;
 
-      case OPCODE_TXB:
+      case TGSI_OPCODE_TXB:
 	 /* Shadow ignored for txb.
 	  */
 	 read0 = get_texcoord_mask(inst->tex_idx) | WRITEMASK_W;
@@ -254,28 +253,28 @@ void brw_wm_pass1( struct brw_wm_compile *c )
 	 read2 = WRITEMASK_W; /* pixel w */
 	 break;
 
-      case OPCODE_DP3:	
+      case TGSI_OPCODE_DP3:	
 	 read0 = WRITEMASK_XYZ;
 	 read1 = WRITEMASK_XYZ;
 	 break;
 
-      case OPCODE_DPH:
+      case TGSI_OPCODE_DPH:
 	 read0 = WRITEMASK_XYZ;
 	 read1 = WRITEMASK_XYZW;
 	 break;
 
-      case OPCODE_DP4:
+      case TGSI_OPCODE_DP4:
 	 read0 = WRITEMASK_XYZW;
 	 read1 = WRITEMASK_XYZW;
 	 break;
 
-      case OPCODE_LIT: 
+      case TGSI_OPCODE_LIT: 
 	 read0 = WRITEMASK_XYW;
 	 break;
 
-      case OPCODE_DST:
+      case TGSI_OPCODE_DST:
       case WM_FRONTFACING:
-      case OPCODE_KIL_NV:
+      case TGSI_OPCODE_KIL_NV:
       default:
 	 break;
       }
diff --git a/src/gallium/drivers/i965/intel_chipset.h b/src/gallium/drivers/i965/intel_chipset.h
index 3dc8653a73..3c38f1676c 100644
--- a/src/gallium/drivers/i965/intel_chipset.h
+++ b/src/gallium/drivers/i965/intel_chipset.h
@@ -66,7 +66,6 @@
 #define PCI_CHIP_Q45_G                  0x2E12
 #define PCI_CHIP_G45_G                  0x2E22
 #define PCI_CHIP_G41_G                  0x2E32
-#define PCI_CHIP_B43_G                  0x2E42
 
 #define PCI_CHIP_ILD_G                  0x0042
 #define PCI_CHIP_ILM_G                  0x0046
@@ -84,8 +83,7 @@
 #define IS_G45(devid)           (devid == PCI_CHIP_IGD_E_G || \
                                  devid == PCI_CHIP_Q45_G || \
                                  devid == PCI_CHIP_G45_G || \
-                                 devid == PCI_CHIP_G41_G || \
-                                 devid == PCI_CHIP_B43_G)
+                                 devid == PCI_CHIP_G41_G)
 #define IS_GM45(devid)          (devid == PCI_CHIP_GM45_GM)
 #define IS_G4X(devid)		(IS_G45(devid) || IS_GM45(devid))
 
-- 
cgit v1.2.3


From 074606a806df755ecbb84e0a1182c66fd0b2a8dd Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Sat, 24 Oct 2009 13:18:34 +0100
Subject: i965g: more files compiling

---
 src/gallium/drivers/i965/brw_batchbuffer.h      | 124 ++++++++++++
 src/gallium/drivers/i965/brw_cc.c               |  16 +-
 src/gallium/drivers/i965/brw_clip.c             |  80 +++-----
 src/gallium/drivers/i965/brw_clip.h             |   7 +-
 src/gallium/drivers/i965/brw_clip_unfilled.c    |   2 +-
 src/gallium/drivers/i965/brw_clip_util.c        |   2 +-
 src/gallium/drivers/i965/brw_context.c          |   2 +-
 src/gallium/drivers/i965/brw_context.h          |  89 ++++-----
 src/gallium/drivers/i965/brw_curbe.c            |  10 +-
 src/gallium/drivers/i965/brw_defines.h          |   4 +-
 src/gallium/drivers/i965/brw_draw.c             |  12 +-
 src/gallium/drivers/i965/brw_draw_upload.c      |   2 +-
 src/gallium/drivers/i965/brw_eu.h               |  32 +++-
 src/gallium/drivers/i965/brw_eu_emit.c          |   4 +-
 src/gallium/drivers/i965/brw_gs.c               |   2 +-
 src/gallium/drivers/i965/brw_gs_emit.c          |   2 +-
 src/gallium/drivers/i965/brw_misc_state.c       |   2 +-
 src/gallium/drivers/i965/brw_pipe_flush.c       |   2 +-
 src/gallium/drivers/i965/brw_pipe_query.c       |   4 +-
 src/gallium/drivers/i965/brw_pipe_rast.c        |  46 +++++
 src/gallium/drivers/i965/brw_pipe_rast.h        |  14 ++
 src/gallium/drivers/i965/brw_pipe_shader.c      | 159 ++++++++++++++++
 src/gallium/drivers/i965/brw_reg.h              |  79 ++++++++
 src/gallium/drivers/i965/brw_screen.h           |  78 ++++++++
 src/gallium/drivers/i965/brw_screen_surface.c   |   4 +-
 src/gallium/drivers/i965/brw_sf.c               |   2 +-
 src/gallium/drivers/i965/brw_sf.h               |   1 -
 src/gallium/drivers/i965/brw_sf_emit.c          |   2 +-
 src/gallium/drivers/i965/brw_state.h            |   2 +-
 src/gallium/drivers/i965/brw_state_batch.c      |   6 +-
 src/gallium/drivers/i965/brw_state_cache.c      |   2 +-
 src/gallium/drivers/i965/brw_state_upload.c     |   2 +-
 src/gallium/drivers/i965/brw_tex_layout.c       |   2 +-
 src/gallium/drivers/i965/brw_urb.c              |   2 +-
 src/gallium/drivers/i965/brw_util.h             |   5 +-
 src/gallium/drivers/i965/brw_vs.c               |   3 +-
 src/gallium/drivers/i965/brw_vs.h               |   1 -
 src/gallium/drivers/i965/brw_vs_emit.c          |  82 ++++----
 src/gallium/drivers/i965/brw_winsys.h           | 243 ++++++++++++++++++++++++
 src/gallium/drivers/i965/brw_wm.h               |   1 -
 src/gallium/drivers/i965/brw_wm_debug.c         |   2 +-
 src/gallium/drivers/i965/brw_wm_emit.c          |  84 ++++----
 src/gallium/drivers/i965/brw_wm_fp.c            |  60 +++---
 src/gallium/drivers/i965/brw_wm_pass0.c         |   1 -
 src/gallium/drivers/i965/brw_wm_pass1.c         |  68 +++----
 src/gallium/drivers/i965/brw_wm_surface_state.c |   2 +-
 src/gallium/drivers/i965/intel_batchbuffer.h    | 168 ----------------
 47 files changed, 1027 insertions(+), 492 deletions(-)
 create mode 100644 src/gallium/drivers/i965/brw_batchbuffer.h
 create mode 100644 src/gallium/drivers/i965/brw_pipe_rast.c
 create mode 100644 src/gallium/drivers/i965/brw_pipe_rast.h
 create mode 100644 src/gallium/drivers/i965/brw_pipe_shader.c
 create mode 100644 src/gallium/drivers/i965/brw_reg.h
 create mode 100644 src/gallium/drivers/i965/brw_screen.h
 create mode 100644 src/gallium/drivers/i965/brw_winsys.h
 delete mode 100644 src/gallium/drivers/i965/intel_batchbuffer.h

(limited to 'src/gallium/drivers/i965/brw_wm.h')

diff --git a/src/gallium/drivers/i965/brw_batchbuffer.h b/src/gallium/drivers/i965/brw_batchbuffer.h
new file mode 100644
index 0000000000..76b3c1bf69
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_batchbuffer.h
@@ -0,0 +1,124 @@
+#ifndef BRW_BATCHBUFFER_H
+#define BRW_BATCHBUFFER_H
+
+#include "brw_types.h"
+#include "brw_winsys.h"
+#include "brw_reg.h"
+
+#define BATCH_SZ 16384
+#define BATCH_RESERVED 16
+
+/* All ignored:
+ */
+enum cliprect_mode {
+   IGNORE_CLIPRECTS,
+   LOOP_CLIPRECTS,
+   NO_LOOP_CLIPRECTS,
+   REFERENCES_CLIPRECTS
+};
+
+void brw_batchbuffer_free(struct brw_batchbuffer *batch);
+
+void _brw_batchbuffer_flush(struct brw_batchbuffer *batch,
+			      const char *file, int line);
+
+#define brw_batchbuffer_flush(batch) \
+	_brw_batchbuffer_flush(batch, __FILE__, __LINE__)
+
+void brw_batchbuffer_reset(struct brw_batchbuffer *batch);
+
+
+/* Unlike bmBufferData, this currently requires the buffer be mapped.
+ * Consider it a convenience function wrapping multiple
+ * intel_buffer_dword() calls.
+ */
+void brw_batchbuffer_data(struct brw_batchbuffer *batch,
+                            const void *data, GLuint bytes,
+			    enum cliprect_mode cliprect_mode);
+
+void brw_batchbuffer_release_space(struct brw_batchbuffer *batch,
+                                     GLuint bytes);
+
+GLboolean brw_batchbuffer_emit_reloc(struct brw_batchbuffer *batch,
+                                       struct brw_winsys_buffer *buffer,
+				       uint32_t read_domains,
+				       uint32_t write_domain,
+				       uint32_t offset);
+
+/* Inline functions - might actually be better off with these
+ * non-inlined.  Certainly better off switching all command packets to
+ * be passed as structs rather than dwords, but that's a little bit of
+ * work...
+ */
+static INLINE GLint
+brw_batchbuffer_space(struct brw_batchbuffer *batch)
+{
+   return (batch->size - BATCH_RESERVED) - (batch->ptr - batch->map);
+}
+
+
+static INLINE void
+brw_batchbuffer_emit_dword(struct brw_batchbuffer *batch, GLuint dword)
+{
+   assert(batch->map);
+   assert(brw_batchbuffer_space(batch) >= 4);
+   *(GLuint *) (batch->ptr) = dword;
+   batch->ptr += 4;
+}
+
+static INLINE boolean
+brw_batchbuffer_require_space(struct brw_batchbuffer *batch,
+                                GLuint sz,
+				enum cliprect_mode cliprect_mode)
+{
+   /* Kept signed: the remaining space can go negative, test that first. */
+   const GLint space = brw_batchbuffer_space(batch);
+
+   (void)cliprect_mode;  /* ignored: commands execute once regardless */
+   assert(sz < batch->size - 8);
+   if (space < 0 || (GLuint) space < sz) {
+      assert(0);
+      return FALSE;
+   }
+   return TRUE;  /* sz bytes fit in the batch */
+}
+
+/* Here are the crusty old macros, to be removed:
+ */
+#define BATCH_LOCALS
+
+#define BEGIN_BATCH(n, cliprect_mode) do {				\
+   brw_batchbuffer_require_space(intel->batch, (n)*4, cliprect_mode); \
+   assert(intel->batch->emit.start_ptr == NULL);			\
+   intel->batch->emit.total = (n) * 4;					\
+   intel->batch->emit.start_ptr = intel->batch->ptr;			\
+} while (0)
+
+#define OUT_BATCH(d) brw_batchbuffer_emit_dword(intel->batch, d)
+
+#define OUT_RELOC(buf, read_domains, write_domain, delta) do {		\
+   assert((unsigned) (delta) < buf->size);				\
+   brw_batchbuffer_emit_reloc(intel->batch, buf,			\
+				read_domains, write_domain, delta);	\
+} while (0)
+
+#define ADVANCE_BATCH() do {						\
+   unsigned int _n = intel->batch->ptr - intel->batch->emit.start_ptr;	\
+   assert(intel->batch->emit.start_ptr != NULL);			\
+   if (_n != intel->batch->emit.total) {				\
+      fprintf(stderr, "ADVANCE_BATCH: %d of %d dwords emitted\n",	\
+	      _n, intel->batch->emit.total);				\
+      abort();								\
+   }									\
+   intel->batch->emit.start_ptr = NULL;					\
+} while(0)
+
+
+static INLINE void
+brw_batchbuffer_emit_mi_flush(struct brw_batchbuffer *batch)
+{
+   brw_batchbuffer_require_space(batch, 4, IGNORE_CLIPRECTS);
+   brw_batchbuffer_emit_dword(batch, MI_FLUSH);
+}
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_cc.c b/src/gallium/drivers/i965/brw_cc.c
index bf2743ebbe..c8e7851d75 100644
--- a/src/gallium/drivers/i965/brw_cc.c
+++ b/src/gallium/drivers/i965/brw_cc.c
@@ -65,7 +65,7 @@ static void prepare_cc_vp( struct brw_context *brw )
    memset(&ccv, 0, sizeof(ccv));
 
    /* PIPE_NEW_VIEWPORT */
-   calc_sane_viewport( &brw->vp, &svp );
+   calc_sane_viewport( &brw->curr.vp, &svp );
 
    ccv.min_depth = svp.near;
    ccv.max_depth = svp.far;
@@ -109,13 +109,13 @@ static void
 cc_unit_populate_key(const struct brw_context *brw,
 		     struct brw_cc_unit_key *key)
 {
-   key->cc0 = brw->dsa->cc0;
-   key->cc1 = brw->dsa->cc1;
-   key->cc2 = brw->dsa->cc2;
-   key->cc3 = combine_cc3( brw->dsa->cc3, brw->blend->cc3 );
-   key->cc5 = brw->blend->cc5;
-   key->cc6 = brw->blend->cc6;
-   key->cc7 = brw->blend->cc7;
+   key->cc0 = brw->curr.dsa->cc0;
+   key->cc1 = brw->curr.dsa->cc1;
+   key->cc2 = brw->curr.dsa->cc2;
+   key->cc3 = combine_cc3( brw->curr.dsa->cc3, brw->curr.blend->cc3 );
+   key->cc5 = brw->curr.blend->cc5;
+   key->cc6 = brw->curr.blend->cc6;
+   key->cc7 = brw->curr.blend->cc7;
 }
 
 /**
diff --git a/src/gallium/drivers/i965/brw_clip.c b/src/gallium/drivers/i965/brw_clip.c
index d82ebeb9a9..591e904705 100644
--- a/src/gallium/drivers/i965/brw_clip.c
+++ b/src/gallium/drivers/i965/brw_clip.c
@@ -33,13 +33,14 @@
 
 #include "util/u_math.h"
 
-#include "intel_batchbuffer.h"
-
+#include "brw_screen.h"
+#include "brw_batchbuffer.h"
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_eu.h"
 #include "brw_util.h"
 #include "brw_state.h"
+#include "brw_pipe_rast.h"
 #include "brw_clip.h"
 
 
@@ -77,13 +78,16 @@ static void compile_clip_prog( struct brw_context *brw,
    else
        delta = REG_SIZE;
 
-   for (i = 0; i < VERT_RESULT_MAX; i++)
-      if (c.key.attrs & (1<<i)) {
-	 c.offset[i] = delta;
-	 delta += ATTR_SIZE;
-      }
+   /* XXX: c.offset is now pretty redundant:
+    */
+   for (i = 0; i < c.key.nr_attrs; i++) {
+      c.offset[i] = delta;
+      delta += ATTR_SIZE;
+   }
 
-   c.nr_attrs = util_count_bits(c.key.attrs);
+   /* XXX: c.nr_attrs is very redundant:
+    */
+   c.nr_attrs = c.key.nr_attrs;
    
    if (BRW_IS_IGDNG(brw))
        c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
@@ -145,59 +149,21 @@ static void upload_clip_prog(struct brw_context *brw)
 {
    struct brw_clip_prog_key key;
 
-   memset(&key, 0, sizeof(key));
-
-   /* Populate the key:
+   /* Populate the key, starting from the almost-complete version from
+    * the rast state. 
     */
+
+   /* PIPE_NEW_RAST */
+   memcpy(&key, &brw->curr.rast->clip_key, sizeof key);
+
    /* BRW_NEW_REDUCED_PRIMITIVE */
    key.primitive = brw->reduced_primitive;
-   /* CACHE_NEW_VS_PROG */
-   key.attrs = brw->vs.prog_data->outputs_written;
-   /* PIPE_NEW_RAST */
-   key.do_flat_shading = brw->rast.base.flatshade;
-   /* PIPE_NEW_UCP */
-   key.nr_userclip = brw->nr_ucp;
 
-   if (BRW_IS_IGDNG(brw))
-       key.clip_mode = BRW_CLIPMODE_KERNEL_CLIP;
-   else
-       key.clip_mode = BRW_CLIPMODE_NORMAL;
+   /* PIPE_NEW_VS */
+   key.nr_attrs = brw->curr.vs->info.file_max[TGSI_FILE_OUTPUT] + 1;
 
-   /* PIPE_NEW_RAST */
-   if (key.primitive == PIPE_PRIM_TRIANGLES) {
-      if (brw->rast->cull_mode = PIPE_WINDING_BOTH)
-	 key.clip_mode = BRW_CLIPMODE_REJECT_ALL;
-      else {
-	 key.fill_ccw = CLIP_CULL;
-	 key.fill_cw = CLIP_CULL;
-
-	 if (!(brw->rast->cull_mode & PIPE_WINDING_CCW)) {
-	    key.fill_ccw = translate_fill(brw->rast.fill_ccw);
-	 }
-
-	 if (!(brw->rast->cull_mode & PIPE_WINDING_CW)) {
-	    key.fill_cw = translate_fill(brw->rast.fill_cw);
-	 }
-
-	 if (key.fill_cw != CLIP_FILL ||
-	     key.fill_ccw != CLIP_FILL) {
-	    key.do_unfilled = 1;
-	    key.clip_mode = BRW_CLIPMODE_CLIP_NON_REJECTED;
-	 }
-
-	 key.offset_ccw = brw->rast.offset_ccw;
-	 key.offset_cw = brw->rast.offset_cw;
-
-	 if (brw->rast.light_twoside &&
-	     key.fill_cw != CLIP_CULL) 
-	    key.copy_bfc_cw = 1;
-
-	 if (brw->rast.light_twoside &&
-	     key.fill_ccw != CLIP_CULL) 
-	    key.copy_bfc_ccw = 1;
-	 }
-      }
-   }
+   /* PIPE_NEW_CLIP */
+   key.nr_userclip = brw->curr.ucp.nr;
 
    brw->sws->bo_unreference(brw->clip.prog_bo);
    brw->clip.prog_bo = brw_search_cache(&brw->cache, BRW_CLIP_PROG,
@@ -212,7 +178,7 @@ static void upload_clip_prog(struct brw_context *brw)
 const struct brw_tracked_state brw_clip_prog = {
    .dirty = {
       .mesa  = (PIPE_NEW_RAST | 
-		PIPE_NEW_UCP),
+		PIPE_NEW_CLIP),
       .brw   = (BRW_NEW_REDUCED_PRIMITIVE),
       .cache = CACHE_NEW_VS_PROG
    },
diff --git a/src/gallium/drivers/i965/brw_clip.h b/src/gallium/drivers/i965/brw_clip.h
index d80ec819b9..cfe51bf292 100644
--- a/src/gallium/drivers/i965/brw_clip.h
+++ b/src/gallium/drivers/i965/brw_clip.h
@@ -42,8 +42,7 @@
  * up polygon offset and flatshading at this point:
  */
 struct brw_clip_prog_key {
-   GLuint attrs:32;		
-
+   GLuint nr_attrs:5;
    GLuint primitive:4;
    GLuint nr_userclip:3;
    GLuint do_flat_shading:1;
@@ -55,7 +54,7 @@ struct brw_clip_prog_key {
    GLuint copy_bfc_cw:1;
    GLuint copy_bfc_ccw:1;
    GLuint clip_mode:3;
-   GLuint pad1:12;
+   GLuint pad1:7;
    
    GLfloat offset_factor;
    GLfloat offset_units;
@@ -117,7 +116,7 @@ struct brw_clip_compile {
    GLuint last_mrf;
 
    GLuint header_position_offset;
-   GLuint offset[VERT_ATTRIB_MAX];
+   GLuint offset[PIPE_MAX_SHADER_OUTPUTS];
    GLboolean need_ff_sync;
 };
 
diff --git a/src/gallium/drivers/i965/brw_clip_unfilled.c b/src/gallium/drivers/i965/brw_clip_unfilled.c
index 4baff55806..8501599aef 100644
--- a/src/gallium/drivers/i965/brw_clip_unfilled.c
+++ b/src/gallium/drivers/i965/brw_clip_unfilled.c
@@ -29,7 +29,7 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 
 #include "brw_defines.h"
 #include "brw_context.h"
diff --git a/src/gallium/drivers/i965/brw_clip_util.c b/src/gallium/drivers/i965/brw_clip_util.c
index 7a6c46ce07..60bfd3538e 100644
--- a/src/gallium/drivers/i965/brw_clip_util.c
+++ b/src/gallium/drivers/i965/brw_clip_util.c
@@ -93,7 +93,7 @@ void brw_clip_project_position(struct brw_clip_compile *c, struct brw_reg pos )
    /* value.xyz *= value.rhw
     */
    brw_set_access_mode(p, BRW_ALIGN_16);
-   brw_MUL(p, brw_writemask(pos, WRITEMASK_XYZ), pos, brw_swizzle1(pos, W));
+   brw_MUL(p, brw_writemask(pos, BRW_WRITEMASK_XYZ), pos, brw_swizzle1(pos, W));
    brw_set_access_mode(p, BRW_ALIGN_1);
 }
 
diff --git a/src/gallium/drivers/i965/brw_context.c b/src/gallium/drivers/i965/brw_context.c
index 063ada5772..07a5420d6e 100644
--- a/src/gallium/drivers/i965/brw_context.c
+++ b/src/gallium/drivers/i965/brw_context.c
@@ -38,7 +38,7 @@
 #include "brw_state.h"
 #include "brw_vs.h"
 #include "brw_screen_tex.h"
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 
 
diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h
index 6699d3bdb6..3a2fece45c 100644
--- a/src/gallium/drivers/i965/brw_context.h
+++ b/src/gallium/drivers/i965/brw_context.h
@@ -36,6 +36,8 @@
 #include "brw_structs.h"
 #include "brw_winsys.h"
 #include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "tgsi/tgsi_scan.h"
 
 
 /* Glossary:
@@ -143,6 +145,27 @@ struct brw_blend_state {
 };
 
 
+struct brw_rasterizer_state;
+
+
+struct brw_vertex_shader {   /* Vertex shader object; brw_context::curr.vs points at one of these */
+   const struct tgsi_token *tokens;   /* TGSI tokens for this shader */
+   struct tgsi_shader_info info;      /* info.file_max[TGSI_FILE_OUTPUT] + 1 is used as the clip key's nr_attrs */
+
+   struct brw_winsys_buffer *const_buffer;    /** Program constant buffer/surface */
+   GLboolean use_const_buffer;        /* NOTE(review): presumably selects the const_buffer path -- confirm */
+};
+
+
+struct brw_fragment_shader {   /* Fragment shader object; brw_context::curr.fs points at one of these */
+   const struct tgsi_token *tokens;   /* TGSI tokens for this shader */
+   struct tgsi_shader_info info;      /* NOTE(review): presumably scan results for 'tokens' -- confirm */
+
+   GLboolean isGLSL;                  /* in the struct this replaces: "any IF/LOOP/CONT/BREAK instructions" */
+
+   struct brw_winsys_buffer *const_buffer;    /** Program constant buffer/surface */
+   GLboolean use_const_buffer;        /* NOTE(review): presumably selects the const_buffer path -- confirm */
+};
 
 
@@ -157,6 +180,7 @@ struct brw_blend_state {
 #define PIPE_NEW_VERTEX_SHADER          0x2
 #define PIPE_NEW_FRAGMENT_CONSTS        0x2
 #define PIPE_NEW_VERTEX_CONSTS          0x2
+#define PIPE_NEW_CLIP                   0x2
 
 
 #define BRW_NEW_URB_FENCE               0x1
@@ -196,25 +220,6 @@ struct brw_state_flags {
 };
 
 
-struct brw_vertex_program {
-   const struct tgsi_token *tokens;
-   GLuint id;
-   struct brw_winsys_buffer *const_buffer;    /** Program constant buffer/surface */
-   GLboolean use_const_buffer;
-};
-
-
-/** Subclass of Mesa fragment program */
-struct brw_fragment_program {
-   const struct tgsi_token *tokens;
-
-   GLuint id;  /**< serial no. to identify frag progs, never re-used */
-   GLboolean isGLSL;  /**< any IF/LOOP/CONT/BREAK instructions */
-
-   struct brw_winsys_buffer *const_buffer;    /** Program constant buffer/surface */
-   GLboolean use_const_buffer;
-};
-
 
 /* Data about a particular attempt to compile a program.  Note that
  * there can be many of these, each in a different GL state
@@ -452,24 +457,29 @@ struct brw_query_object {
  */
 struct brw_context 
 {
-   struct pipe_context *pipe;
-   struct pipe_screen *screen;
-   
+   struct pipe_context pipe;
+
+   struct brw_screen *brw_screen;   
    struct brw_winsys_screen *sws;
 
    GLuint primitive;
+   GLuint reduced_primitive;
 
    GLboolean emit_state_always;
    GLboolean no_batch_wrap;
 
    /* Active vertex program: 
     */
-   const struct gl_vertex_program *vertex_program;
-   const struct gl_fragment_program *fragment_program;
-   struct pipe_framebuffer_state fb;
-   struct brw_depth_stencil_alpha_state *dsa;
-   struct brw_blend_state *blend;
-   struct pipe_viewport_state vp;
+   struct {
+      const struct brw_vertex_shader *vs;
+      const struct brw_fragment_shader *fs;
+      const struct brw_blend_state *blend;
+      const struct brw_rasterizer_state *rast;
+      const struct brw_depth_stencil_alpha_state *dsa;
+      struct pipe_framebuffer_state fb;
+      struct pipe_viewport_state vp;
+      struct pipe_clip_state ucp;
+   } curr;
 
    struct {
       struct brw_state_flags dirty;
@@ -719,29 +729,6 @@ brw_context( struct pipe_context *ctx )
    return (struct brw_context *)ctx;
 }
 
-static INLINE struct brw_vertex_program *
-brw_vertex_program(struct gl_vertex_program *p)
-{
-   return (struct brw_vertex_program *) p;
-}
-
-static INLINE const struct brw_vertex_program *
-brw_vertex_program_const(const struct gl_vertex_program *p)
-{
-   return (const struct brw_vertex_program *) p;
-}
-
-static INLINE struct brw_fragment_program *
-brw_fragment_program(struct gl_fragment_program *p)
-{
-   return (struct brw_fragment_program *) p;
-}
-
-static INLINE const struct brw_fragment_program *
-brw_fragment_program_const(const struct gl_fragment_program *p)
-{
-   return (const struct brw_fragment_program *) p;
-}
 
 
diff --git a/src/gallium/drivers/i965/brw_curbe.c b/src/gallium/drivers/i965/brw_curbe.c
index 33ea9a00f7..f2524d75e2 100644
--- a/src/gallium/drivers/i965/brw_curbe.c
+++ b/src/gallium/drivers/i965/brw_curbe.c
@@ -30,7 +30,7 @@
   */
 
 
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 #include "intel_regions.h"
 #include "brw_context.h"
 #include "brw_defines.h"
@@ -55,8 +55,8 @@ static void calculate_curbe_offsets( struct brw_context *brw )
    GLuint nr_clip_regs = 0;
    GLuint total_regs;
 
-   /* PIPE_NEW_UCP */
-   if (brw->nr_ucp) {
+   /* PIPE_NEW_CLIP */
+   if (brw->curr.ucp.nr) {
       GLuint nr_planes = 6 + brw->nr_ucp;
       nr_clip_regs = (nr_planes * 4 + 15) / 16;
    }
@@ -106,7 +106,7 @@ static void calculate_curbe_offsets( struct brw_context *brw )
 
 const struct brw_tracked_state brw_curbe_offsets = {
    .dirty = {
-      .mesa = PIPE_NEW_UCP,
+      .mesa = PIPE_NEW_CLIP,
       .brw  = BRW_NEW_VERTEX_PROGRAM,
       .cache = CACHE_NEW_WM_PROG
    },
@@ -327,7 +327,7 @@ const struct brw_tracked_state brw_constant_buffer = {
    .dirty = {
       .mesa = (PIPE_NEW_FS_CONSTANTS |
 	       PIPE_NEW_VS_CONSTANTS |
-	       PIPE_NEW_UCP),
+	       PIPE_NEW_CLIP),
       .brw  = (BRW_NEW_FRAGMENT_PROGRAM |
 	       BRW_NEW_VERTEX_PROGRAM |
 	       BRW_NEW_URB_FENCE | /* Implicit - hardware requires this, not used above */
diff --git a/src/gallium/drivers/i965/brw_defines.h b/src/gallium/drivers/i965/brw_defines.h
index 282c5b18f4..1dc64ddc8f 100644
--- a/src/gallium/drivers/i965/brw_defines.h
+++ b/src/gallium/drivers/i965/brw_defines.h
@@ -840,8 +840,8 @@
 
 #include "intel_chipset.h"
 
-#define BRW_IS_G4X(brw)         (IS_G4X((brw)->brw_screen->deviceID))
-#define BRW_IS_IGDNG(brw)         (IS_IGDNG((brw)->brw_screen->deviceID))
+#define BRW_IS_G4X(brw)         (IS_G4X((brw)->brw_screen->pci_id))
+#define BRW_IS_IGDNG(brw)         (IS_IGDNG((brw)->brw_screen->pci_id))
 #define BRW_IS_965(brw)         (!(BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)))
 #define CMD_PIPELINE_SELECT(brw)        ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_PIPELINE_SELECT_GM45 : CMD_PIPELINE_SELECT_965)
 #define CMD_VF_STATISTICS(brw)          ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_VF_STATISTICS_GM45 : CMD_VF_STATISTICS_965)
diff --git a/src/gallium/drivers/i965/brw_draw.c b/src/gallium/drivers/i965/brw_draw.c
index 856999f3ef..741537309a 100644
--- a/src/gallium/drivers/i965/brw_draw.c
+++ b/src/gallium/drivers/i965/brw_draw.c
@@ -31,7 +31,7 @@
 #include "brw_context.h"
 #include "brw_state.h"
 
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 #include "intel_buffer_objects.h"
 
 #define FILE_DEBUG_FLAG DEBUG_BATCH
@@ -133,7 +133,7 @@ static void brw_emit_prim(struct brw_context *brw,
       ADVANCE_BATCH();
    }
    if (prim_packet.verts_per_instance) {
-      intel_batchbuffer_data( brw->intel.batch, &prim_packet,
+      brw_batchbuffer_data( brw->intel.batch, &prim_packet,
 			      sizeof(prim_packet), LOOP_CLIPRECTS);
    }
    if (intel->always_flush_cache) {
@@ -224,7 +224,7 @@ static GLboolean brw_try_draw_prims( struct brw_context *brw,
       return ret;
 
    if (intel->always_flush_batch)
-      intel_batchbuffer_flush(intel->batch);
+      brw_batchbuffer_flush(intel->batch);
 
    return 0;
 }
@@ -249,12 +249,10 @@ void brw_draw_prims( struct brw_context *brw,
     */
    ret = brw_try_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
 
-   /* Otherwise, we really are out of memory.  Pass the drawing
-    * command to the software tnl module and which will in turn call
-    * swrast to do the drawing.
+   /* Otherwise, flush and retry:
     */
    if (ret != 0) {
-      intel_batchbuffer_flush(intel->batch);
+      brw_batchbuffer_flush(intel->batch);
       ret = brw_try_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
       assert(ret == 0);
    }
diff --git a/src/gallium/drivers/i965/brw_draw_upload.c b/src/gallium/drivers/i965/brw_draw_upload.c
index dce015d79f..1ab65d60c4 100644
--- a/src/gallium/drivers/i965/brw_draw_upload.c
+++ b/src/gallium/drivers/i965/brw_draw_upload.c
@@ -35,7 +35,7 @@
 #include "brw_state.h"
 #include "brw_fallback.h"
 
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 #include "intel_buffer_objects.h"
 #include "intel_tex.h"
 
diff --git a/src/gallium/drivers/i965/brw_eu.h b/src/gallium/drivers/i965/brw_eu.h
index 30603bdd0e..46d52a473b 100644
--- a/src/gallium/drivers/i965/brw_eu.h
+++ b/src/gallium/drivers/i965/brw_eu.h
@@ -35,7 +35,6 @@
 
 #include "brw_structs.h"
 #include "brw_defines.h"
-#include "shader/prog_instruction.h"
 
 #define BRW_SWIZZLE4(a,b,c,d) (((a)<<0) | ((b)<<2) | ((c)<<4) | ((d)<<6))
 #define BRW_GET_SWZ(swz, idx) (((swz) >> ((idx)*2)) & 0x3)
@@ -45,6 +44,23 @@
 #define BRW_SWIZZLE_XXXX      BRW_SWIZZLE4(0,0,0,0)
 #define BRW_SWIZZLE_XYXY      BRW_SWIZZLE4(0,1,0,1)
 
+#define BRW_WRITEMASK_NONE     0x00   /* 4-bit channel write masks; combined names are ORs of X/Y/Z/W */
+#define BRW_WRITEMASK_X        0x01   /* bit 0 */
+#define BRW_WRITEMASK_Y        0x02   /* bit 1 */
+#define BRW_WRITEMASK_XY       0x03
+#define BRW_WRITEMASK_Z        0x04   /* bit 2 */
+#define BRW_WRITEMASK_XZ       0x05
+#define BRW_WRITEMASK_YZ       0x06
+#define BRW_WRITEMASK_XYZ      0x07
+#define BRW_WRITEMASK_W        0x08   /* bit 3 */
+#define BRW_WRITEMASK_XW       0x09
+#define BRW_WRITEMASK_YW       0x0A
+#define BRW_WRITEMASK_XYW      0x0B
+#define BRW_WRITEMASK_ZW       0x0C
+#define BRW_WRITEMASK_XZW      0x0D
+#define BRW_WRITEMASK_YZW      0x0E
+#define BRW_WRITEMASK_XYZW     0x0F   /* all four channels */
+
 
 #define REG_SIZE (8*4)
 
@@ -157,7 +173,7 @@ static INLINE int type_sz( GLuint type )
  * \param width  one of BRW_WIDTH_x
  * \param hstride  one of BRW_HORIZONTAL_STRIDE_x
  * \param swizzle  one of BRW_SWIZZLE_x
- * \param writemask  WRITEMASK_X/Y/Z/W bitfield
+ * \param writemask  BRW_WRITEMASK_X/Y/Z/W bitfield
  */
 static INLINE struct brw_reg brw_reg( GLuint file,
                                       GLuint nr,
@@ -215,7 +231,7 @@ static INLINE struct brw_reg brw_vec16_reg( GLuint file,
 		  BRW_WIDTH_16,
 		  BRW_HORIZONTAL_STRIDE_1,
 		  BRW_SWIZZLE_XYZW,
-		  WRITEMASK_XYZW);
+		  BRW_WRITEMASK_XYZW);
 }
 
 /** Construct float[8] register */
@@ -231,7 +247,7 @@ static INLINE struct brw_reg brw_vec8_reg( GLuint file,
 		  BRW_WIDTH_8,
 		  BRW_HORIZONTAL_STRIDE_1,
 		  BRW_SWIZZLE_XYZW,
-		  WRITEMASK_XYZW);
+		  BRW_WRITEMASK_XYZW);
 }
 
 /** Construct float[4] register */
@@ -247,7 +263,7 @@ static INLINE struct brw_reg brw_vec4_reg( GLuint file,
 		  BRW_WIDTH_4,
 		  BRW_HORIZONTAL_STRIDE_1,
 		  BRW_SWIZZLE_XYZW,
-		  WRITEMASK_XYZW);
+		  BRW_WRITEMASK_XYZW);
 }
 
 /** Construct float[2] register */
@@ -263,7 +279,7 @@ static INLINE struct brw_reg brw_vec2_reg( GLuint file,
 		  BRW_WIDTH_2,
 		  BRW_HORIZONTAL_STRIDE_1,
 		  BRW_SWIZZLE_XYXY,
-		  WRITEMASK_XY);
+		  BRW_WRITEMASK_XY);
 }
 
 /** Construct float[1] register */
@@ -279,7 +295,7 @@ static INLINE struct brw_reg brw_vec1_reg( GLuint file,
 		  BRW_WIDTH_1,
 		  BRW_HORIZONTAL_STRIDE_0,
 		  BRW_SWIZZLE_XXXX,
-		  WRITEMASK_X);
+		  BRW_WRITEMASK_X);
 }
 
 
@@ -510,7 +526,7 @@ static INLINE struct brw_reg brw_ip_reg( void )
 		  BRW_WIDTH_1,
 		  BRW_HORIZONTAL_STRIDE_0,
 		  BRW_SWIZZLE_XYZW, /* NOTE! */
-		  WRITEMASK_XYZW); /* NOTE! */
+		  BRW_WRITEMASK_XYZW); /* NOTE! */
 }
 
 static INLINE struct brw_reg brw_acc_reg( void )
diff --git a/src/gallium/drivers/i965/brw_eu_emit.c b/src/gallium/drivers/i965/brw_eu_emit.c
index 241cdc33f8..f6b8843e01 100644
--- a/src/gallium/drivers/i965/brw_eu_emit.c
+++ b/src/gallium/drivers/i965/brw_eu_emit.c
@@ -1276,7 +1276,7 @@ void brw_SAMPLE(struct brw_compile *p,
     * instruction, so that is a guide for whether a workaround is
     * needed.
     */
-   if (writemask != WRITEMASK_XYZW) {
+   if (writemask != BRW_WRITEMASK_XYZW) {
       GLuint dst_offset = 0;
       GLuint i, newmask = 0, len = 0;
 
@@ -1299,7 +1299,7 @@ void brw_SAMPLE(struct brw_compile *p,
       else {
 	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
 	 
-	 newmask = ~newmask & WRITEMASK_XYZW;
+	 newmask = ~newmask & BRW_WRITEMASK_XYZW;
 
 	 brw_push_insn_state(p);
 
diff --git a/src/gallium/drivers/i965/brw_gs.c b/src/gallium/drivers/i965/brw_gs.c
index 58930e7964..692ce46679 100644
--- a/src/gallium/drivers/i965/brw_gs.c
+++ b/src/gallium/drivers/i965/brw_gs.c
@@ -29,7 +29,7 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
       
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 
 #include "brw_defines.h"
 #include "brw_context.h"
diff --git a/src/gallium/drivers/i965/brw_gs_emit.c b/src/gallium/drivers/i965/brw_gs_emit.c
index 9ec206d7e8..fd8e2acced 100644
--- a/src/gallium/drivers/i965/brw_gs_emit.c
+++ b/src/gallium/drivers/i965/brw_gs_emit.c
@@ -30,7 +30,7 @@
   */
  
 
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 
 #include "brw_defines.h"
 #include "brw_context.h"
diff --git a/src/gallium/drivers/i965/brw_misc_state.c b/src/gallium/drivers/i965/brw_misc_state.c
index d33bf40a01..eb39be8545 100644
--- a/src/gallium/drivers/i965/brw_misc_state.c
+++ b/src/gallium/drivers/i965/brw_misc_state.c
@@ -31,7 +31,7 @@
  
 
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 #include "intel_regions.h"
 
 #include "brw_context.h"
diff --git a/src/gallium/drivers/i965/brw_pipe_flush.c b/src/gallium/drivers/i965/brw_pipe_flush.c
index d5b7bd3b83..e85a1a9c1b 100644
--- a/src/gallium/drivers/i965/brw_pipe_flush.c
+++ b/src/gallium/drivers/i965/brw_pipe_flush.c
@@ -1,6 +1,6 @@
 
 /**
- * called from intel_batchbuffer_flush and children before sending a
+ * called from brw_batchbuffer_flush and children before sending a
  * batchbuffer off.
  */
 static void brw_finish_batch(struct intel_context *intel)
diff --git a/src/gallium/drivers/i965/brw_pipe_query.c b/src/gallium/drivers/i965/brw_pipe_query.c
index 0b9ba0c0ed..55242ac6ad 100644
--- a/src/gallium/drivers/i965/brw_pipe_query.c
+++ b/src/gallium/drivers/i965/brw_pipe_query.c
@@ -42,7 +42,7 @@
 
 #include "brw_context.h"
 #include "brw_state.h"
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 #include "intel_reg.h"
 
 /** Waits on the query object's BO and totals the results for this query */
@@ -122,7 +122,7 @@ brw_end_query(struct pipe_context *pipe, struct pipe_query *q)
     */
    if (query->bo) {
       brw_emit_query_end(brw);
-      intel_batchbuffer_flush(brw->batch);
+      brw_batchbuffer_flush(brw->batch);
 
       brw->sws->bo_unreference(brw->query.bo);
       brw->query.bo = NULL;
diff --git a/src/gallium/drivers/i965/brw_pipe_rast.c b/src/gallium/drivers/i965/brw_pipe_rast.c
new file mode 100644
index 0000000000..ff64dbd48d
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_rast.c
@@ -0,0 +1,46 @@
+
+/* Work out the rasterizer-dependent clip key fields: clip mode, flat shading
+ * and, for triangles, per-winding fill mode, offsets and copy_bfc flags.
+ * NOTE(review): 'brw' and 'key' are not declared here yet -- unfinished port. */
+static void
+calculate_clip_key_rast()
+{
+   key.clip_mode = BRW_IS_IGDNG(brw) ? BRW_CLIPMODE_KERNEL_CLIP
+                                     : BRW_CLIPMODE_NORMAL;
+   key.do_flat_shading = brw->rast->templ.flatshade;
+
+   if (key.primitive == PIPE_PRIM_TRIANGLES) {
+      /* Equality test -- this must not be an assignment. */
+      if (brw->rast->templ.cull_mode == PIPE_WINDING_BOTH)
+         key.clip_mode = BRW_CLIPMODE_REJECT_ALL;
+      else {
+         /* Start with both windings culled, then fill in those not culled. */
+         key.fill_ccw = CLIP_CULL;
+         key.fill_cw = CLIP_CULL;
+
+         if (!(brw->rast->templ.cull_mode & PIPE_WINDING_CCW))
+            key.fill_ccw = translate_fill(brw->rast->templ.fill_ccw);
+         if (!(brw->rast->templ.cull_mode & PIPE_WINDING_CW))
+            key.fill_cw = translate_fill(brw->rast->templ.fill_cw);
+
+         /* Either winding not plain CLIP_FILL: flag unfilled handling. */
+         if (key.fill_cw != CLIP_FILL ||
+             key.fill_ccw != CLIP_FILL) {
+            key.do_unfilled = 1;
+            key.clip_mode = BRW_CLIPMODE_CLIP_NON_REJECTED;
+         }
+
+         key.offset_ccw = brw->rast->templ.offset_ccw;
+         key.offset_cw = brw->rast->templ.offset_cw;
+
+         /* Two-sided lighting: set copy_bfc_* for each winding that is drawn. */
+         if (brw->rast->templ.light_twoside &&
+             key.fill_cw != CLIP_CULL)
+            key.copy_bfc_cw = 1;
+
+         if (brw->rast->templ.light_twoside &&
+             key.fill_ccw != CLIP_CULL)
+            key.copy_bfc_ccw = 1;
+      }
+   }
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_rast.h b/src/gallium/drivers/i965/brw_pipe_rast.h
new file mode 100644
index 0000000000..6ceaa1fb09
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_rast.h
@@ -0,0 +1,14 @@
+#ifndef BRW_PIPE_RAST_H
+#define BRW_PIPE_RAST_H
+
+#include "brw_clip.h"
+
+struct brw_rasterizer_state {   /* Rasterizer state object; brw_context::curr.rast points at one of these */
+   struct pipe_rasterizer_state templ; /* for draw module */
+
+   /* Precalculated hardware state; upload_clip_prog() memcpy's clip_key as
+    * the starting point for the clip program key. */
+   struct brw_clip_prog_key clip_key;
+};
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_pipe_shader.c b/src/gallium/drivers/i965/brw_pipe_shader.c
new file mode 100644
index 0000000000..fbb772d18c
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_shader.c
@@ -0,0 +1,159 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+  
+#include "brw_context.h"
+#include "brw_util.h"
+#include "brw_wm.h"
+
+/* Mark the vertex or fragment program state dirty when a program is bound
+ * to 'target'.  Other targets are ignored; 'prog' itself is not examined. */
+static void brwBindProgram( struct brw_context *brw,
+                            GLenum target,
+                            struct gl_program *prog )
+{
+   switch (target) {
+   case GL_VERTEX_PROGRAM_ARB:
+      brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM;
+      break;
+   case GL_FRAGMENT_PROGRAM_ARB:
+      brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM;
+      break;
+   }
+}
+
+/* Allocate a driver program object for 'target', stamp it with the next
+ * brw->program_id serial and let Mesa initialise it; NULL if allocation
+ * fails.  NOTE(review): 'ctx' is still passed to the _mesa_* calls but is
+ * not declared in this function -- unfinished port. */
+static struct gl_program *brwNewProgram( struct brw_context *brw,
+                                         GLenum target,
+                                         GLuint id )
+{
+   switch (target) {
+   case GL_VERTEX_PROGRAM_ARB: {
+      struct brw_vertex_program *prog = CALLOC_STRUCT(brw_vertex_program);
+      if (prog) {
+         prog->id = brw->program_id++;
+         return _mesa_init_vertex_program( ctx, &prog->program,
+                                           target, id );
+      }
+      else
+         return NULL;
+   }
+
+   case GL_FRAGMENT_PROGRAM_ARB: {
+      struct brw_fragment_program *prog = CALLOC_STRUCT(brw_fragment_program);
+      if (prog) {
+         prog->id = brw->program_id++;
+         return _mesa_init_fragment_program( ctx, &prog->program,
+                                             target, id );
+      }
+      else
+         return NULL;
+   }
+
+   default:
+      return _mesa_new_program(ctx, target, id);
+   }
+}
+
+static void brwDeleteProgram( struct brw_context *brw,
+			      struct gl_program *prog )
+{  /* Drop the fragment program's constant buffer reference, then delete the program. */
+   if (prog->Target == GL_FRAGMENT_PROGRAM_ARB) {
+      struct gl_fragment_program *fprog = (struct gl_fragment_program *) prog;
+      struct brw_fragment_program *brw_fprog = brw_fragment_program(fprog);
+      brw->sws->bo_unreference(brw_fprog->const_buffer);   /* only fragment programs are handled here */
+   }
+
+   _mesa_delete_program( ctx, prog );   /* NOTE(review): 'ctx' is not declared in this function -- confirm */
+}
+
+
+static GLboolean brwIsProgramNative( struct brw_context *brw,
+				     GLenum target, 
+				     struct gl_program *prog )
+{
+   return GL_TRUE;   /* every program is reported as native; none of the arguments is consulted */
+}
+
+/* Called when a program's text changes.  If it is the currently bound
+ * program, flag it dirty; always give it a fresh id from brw->program_id.
+ * NOTE(review): 'ctx' is still used below but is not declared -- unfinished port. */
+static void brwProgramStringNotify( struct brw_context *brw,
+                                    GLenum target,
+                                    struct gl_program *prog )
+{
+   if (target == GL_FRAGMENT_PROGRAM_ARB) {
+      struct gl_fragment_program *fprog = (struct gl_fragment_program *) prog;
+      struct brw_fragment_program *newFP = brw_fragment_program(fprog);
+      const struct brw_fragment_program *curFP =
+         brw_fragment_program_const(brw->fragment_program);
+
+      if (fprog->FogOption) {
+         _mesa_append_fog_code(ctx, fprog);
+         fprog->FogOption = GL_NONE;
+      }
+
+      if (newFP == curFP)
+         brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM;
+      newFP->id = brw->program_id++;
+      newFP->isGLSL = brw_wm_is_glsl(fprog);
+   }
+   else if (target == GL_VERTEX_PROGRAM_ARB) {
+      struct gl_vertex_program *vprog = (struct gl_vertex_program *) prog;
+      struct brw_vertex_program *newVP = brw_vertex_program(vprog);
+      const struct brw_vertex_program *curVP =
+         brw_vertex_program_const(brw->vertex_program);
+
+      if (newVP == curVP)
+         brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM;
+      if (newVP->program.IsPositionInvariant) {
+         _mesa_insert_mvp_code(ctx, &newVP->program);
+      }
+      newVP->id = brw->program_id++;
+
+      /* Also tell tnl about it: */
+      _tnl_program_string(ctx, target, prog);
+   }
+}
+
+/* Plug the brw program hooks into the given driver function table. */
+void brwInitFragProgFuncs( struct dd_function_table *functions )
+{
+   assert(functions->ProgramStringNotify == _tnl_program_string);
+   functions->ProgramStringNotify = brwProgramStringNotify;
+   functions->IsProgramNative = brwIsProgramNative;
+   functions->DeleteProgram = brwDeleteProgram;
+   functions->NewProgram = brwNewProgram;
+   functions->BindProgram = brwBindProgram;
+}
+
diff --git a/src/gallium/drivers/i965/brw_reg.h b/src/gallium/drivers/i965/brw_reg.h
new file mode 100644
index 0000000000..a640104d71
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_reg.h
@@ -0,0 +1,79 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef BRW_REG_H
+#define BRW_REG_H
+
+#define CMD_MI				(0x0 << 29)	/* command group selector (MI / 2D / 3D), shifted to bit 29 */
+#define CMD_2D				(0x2 << 29)
+#define CMD_3D				(0x3 << 29)
+/* MI commands: a value shifted to bit 23, OR'd onto CMD_MI. */
+#define MI_NOOP				(CMD_MI | 0)
+#define MI_BATCH_BUFFER_END		(CMD_MI | (0xA << 23))
+#define MI_FLUSH			(CMD_MI | (4 << 23))
+
+#define _3DSTATE_DRAWRECT_INFO_I965	(CMD_3D | (3 << 27) | (1 << 24) | 0x2)
+
+/** @{
+ *
+ * PIPE_CONTROL operation, a combination of MI_FLUSH and a register write,
+ * with additional flushing control.
+ */
+#define _3DSTATE_PIPE_CONTROL		(CMD_3D | (3 << 27) | (2 << 24) | 2)
+#define PIPE_CONTROL_NO_WRITE		(0 << 14)	/* write selector: values 0-3 at bit 14 */
+#define PIPE_CONTROL_WRITE_IMMEDIATE	(1 << 14)
+#define PIPE_CONTROL_WRITE_DEPTH_COUNT	(2 << 14)
+#define PIPE_CONTROL_WRITE_TIMESTAMP	(3 << 14)
+#define PIPE_CONTROL_DEPTH_STALL	(1 << 13)	/* single-bit flags from here down to bit 8 */
+#define PIPE_CONTROL_WRITE_FLUSH	(1 << 12)
+#define PIPE_CONTROL_INSTRUCTION_FLUSH	(1 << 11)
+#define PIPE_CONTROL_INTERRUPT_ENABLE	(1 << 8)
+#define PIPE_CONTROL_PPGTT_WRITE	(0 << 2)	/* bit 2: 0 = PPGTT, 1 = global GTT (going by the names) */
+#define PIPE_CONTROL_GLOBAL_GTT_WRITE	(1 << 2)
+
+/** @} */
+
+#define XY_SETUP_BLT_CMD		(CMD_2D | (0x01 << 22) | 6)	/* CMD_2D | value at bit 22 | low value (presumably a length field -- TODO confirm) */
+#define XY_COLOR_BLT_CMD		(CMD_2D | (0x50 << 22) | 4)
+#define XY_SRC_COPY_BLT_CMD             (CMD_2D | (0x53 << 22) | 6)
+
+/* BR00: single-bit flags, presumably OR'd into the XY_*_BLT_CMD dword -- TODO confirm */
+#define XY_BLT_WRITE_ALPHA	(1 << 21)
+#define XY_BLT_WRITE_RGB	(1 << 20)
+#define XY_SRC_TILED		(1 << 15)
+#define XY_DST_TILED		(1 << 11)
+
+/* BR13: format selector at bit 24 (565 vs 8888, going by the names) */
+#define BR13_565		(0x1 << 24)
+#define BR13_8888		(0x3 << 24)
+
+#define FENCE_LINEAR 0   /* NOTE(review): presumably tiling/fence modes -- confirm against users */
+#define FENCE_XMAJOR 1
+#define FENCE_YMAJOR 2
+
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_screen.h b/src/gallium/drivers/i965/brw_screen.h
new file mode 100644
index 0000000000..716b55c52b
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_screen.h
@@ -0,0 +1,78 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef BRW_SCREEN_H
+#define BRW_SCREEN_H
+
+#include "pipe/p_state.h"
+#include "pipe/p_screen.h"
+
+
+struct brw_winsys_screen;
+
+
+/**
+ * Subclass of pipe_screen: 'base' is the first member, which is what lets
+ * brw_screen() cast a pipe_screen pointer to this type. */
+struct brw_screen
+{
+   struct pipe_screen base;
+
+   struct brw_winsys_screen *sws;   /* winsys screen this screen was created on (presumably -- confirm) */
+
+   boolean is_i945;   /* NOTE(review): purpose not visible here -- confirm whether still needed */
+   uint pci_id;       /* tested by BRW_IS_G4X() / BRW_IS_IGDNG() via IS_G4X() / IS_IGDNG() */
+};
+
+/**
+ * Subclass of pipe_transfer: 'base' is the first member, which is what lets
+ * brw_transfer() cast a pipe_transfer pointer to this type. */
+struct brw_transfer
+{
+   struct pipe_transfer base;
+
+   unsigned offset;   /* NOTE(review): presumably an offset into the underlying buffer -- confirm units */
+};
+
+
+/*
+ * Cast wrappers: unchecked downcasts from the base type to the subclass.
+ */
+static INLINE struct brw_screen *
+brw_screen(struct pipe_screen *pscreen)
+{
+   return (struct brw_screen *) pscreen;   /* relies on 'base' being brw_screen's first member */
+}
+
+static INLINE struct brw_transfer *
+brw_transfer(struct pipe_transfer *transfer)
+{
+   return (struct brw_transfer *)transfer;   /* unchecked downcast; relies on 'base' being brw_transfer's first member */
+}
+
+
+#endif /* BRW_SCREEN_H */
diff --git a/src/gallium/drivers/i965/brw_screen_surface.c b/src/gallium/drivers/i965/brw_screen_surface.c
index d199d0b81a..544be6a089 100644
--- a/src/gallium/drivers/i965/brw_screen_surface.c
+++ b/src/gallium/drivers/i965/brw_screen_surface.c
@@ -1,6 +1,6 @@
    /* _NEW_BUFFERS */
-   if (IS_965(intel->intelScreen->deviceID) &&
-       !IS_G4X(intel->intelScreen->deviceID)) {
+   if (IS_965(brw->brw_screen->pci_id) &&
+       !IS_G4X(brw->brw_screen->pci_id)) {
       for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
 	 struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
 	 struct intel_renderbuffer *irb = intel_renderbuffer(rb);
diff --git a/src/gallium/drivers/i965/brw_sf.c b/src/gallium/drivers/i965/brw_sf.c
index 0115f77c08..54202cbd12 100644
--- a/src/gallium/drivers/i965/brw_sf.c
+++ b/src/gallium/drivers/i965/brw_sf.c
@@ -30,7 +30,7 @@
   */
   
 
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 
 #include "brw_defines.h"
 #include "brw_context.h"
diff --git a/src/gallium/drivers/i965/brw_sf.h b/src/gallium/drivers/i965/brw_sf.h
index 26c2e8891a..c99116b8b1 100644
--- a/src/gallium/drivers/i965/brw_sf.h
+++ b/src/gallium/drivers/i965/brw_sf.h
@@ -34,7 +34,6 @@
 #define BRW_SF_H
 
 
-#include "shader/program.h"
 #include "brw_context.h"
 #include "brw_eu.h"
 
diff --git a/src/gallium/drivers/i965/brw_sf_emit.c b/src/gallium/drivers/i965/brw_sf_emit.c
index c98d7ec13a..4acb2b7d72 100644
--- a/src/gallium/drivers/i965/brw_sf_emit.c
+++ b/src/gallium/drivers/i965/brw_sf_emit.c
@@ -30,7 +30,7 @@
   */
    
 
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 
 #include "brw_defines.h"
 #include "brw_context.h"
diff --git a/src/gallium/drivers/i965/brw_state.h b/src/gallium/drivers/i965/brw_state.h
index b716097bfc..02657eaba7 100644
--- a/src/gallium/drivers/i965/brw_state.h
+++ b/src/gallium/drivers/i965/brw_state.h
@@ -157,7 +157,7 @@ void brw_state_cache_bo_delete(struct brw_cache *cache, struct brw_winsys_buffer
 /***********************************************************************
  * brw_state_batch.c
  */
-#define BRW_BATCH_STRUCT(brw, s) intel_batchbuffer_data( brw->intel.batch, (s), sizeof(*(s)), IGNORE_CLIPRECTS)
+#define BRW_BATCH_STRUCT(brw, s) brw_batchbuffer_data( brw->intel.batch, (s), sizeof(*(s)), IGNORE_CLIPRECTS)
 #define BRW_CACHED_BATCH_STRUCT(brw, s) brw_cached_batch_struct( brw, (s), sizeof(*(s)) )
 
 GLboolean brw_cached_batch_struct( struct brw_context *brw,
diff --git a/src/gallium/drivers/i965/brw_state_batch.c b/src/gallium/drivers/i965/brw_state_batch.c
index 9568794625..b285837070 100644
--- a/src/gallium/drivers/i965/brw_state_batch.c
+++ b/src/gallium/drivers/i965/brw_state_batch.c
@@ -32,7 +32,7 @@
 
 
 #include "brw_state.h"
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 
 
@@ -47,7 +47,7 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
    struct header *newheader = (struct header *)data;
 
    if (brw->emit_state_always) {
-      intel_batchbuffer_data(brw->intel.batch, data, sz, IGNORE_CLIPRECTS);
+      brw_batchbuffer_data(brw->intel.batch, data, sz, IGNORE_CLIPRECTS);
       return GL_TRUE;
    }
 
@@ -74,7 +74,7 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
 
  emit:
    memcpy(item->header, newheader, sz);
-   intel_batchbuffer_data(brw->intel.batch, data, sz, IGNORE_CLIPRECTS);
+   brw_batchbuffer_data(brw->intel.batch, data, sz, IGNORE_CLIPRECTS);
    return GL_TRUE;
 }
 
diff --git a/src/gallium/drivers/i965/brw_state_cache.c b/src/gallium/drivers/i965/brw_state_cache.c
index 91d0f80297..1b5f27cc16 100644
--- a/src/gallium/drivers/i965/brw_state_cache.c
+++ b/src/gallium/drivers/i965/brw_state_cache.c
@@ -57,7 +57,7 @@
  */
 
 #include "brw_state.h"
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 
 /* XXX: Fixme - have to include these to get the sizes of the prog_key
  * structs:
diff --git a/src/gallium/drivers/i965/brw_state_upload.c b/src/gallium/drivers/i965/brw_state_upload.c
index b68b6cb21a..842380e38f 100644
--- a/src/gallium/drivers/i965/brw_state_upload.c
+++ b/src/gallium/drivers/i965/brw_state_upload.c
@@ -33,7 +33,7 @@
 
 #include "brw_context.h"
 #include "brw_state.h"
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 
 /* This is used to initialize brw->state.atoms[].  We could use this
  * list directly except for a single atom, brw_constant_buffer, which
diff --git a/src/gallium/drivers/i965/brw_tex_layout.c b/src/gallium/drivers/i965/brw_tex_layout.c
index 75cdc18912..813cd31f49 100644
--- a/src/gallium/drivers/i965/brw_tex_layout.c
+++ b/src/gallium/drivers/i965/brw_tex_layout.c
@@ -47,7 +47,7 @@ GLboolean brw_miptree_layout(struct brw_context *brw,
 
    switch (mt->target) {
    case GL_TEXTURE_CUBE_MAP:
-      if (IS_IGDNG(intel->intelScreen->deviceID)) {
+      if (IS_IGDNG(brw->brw_screen->pci_id)) {
           GLuint align_h = 2, align_w = 4;
           GLuint level;
           GLuint x = 0;
diff --git a/src/gallium/drivers/i965/brw_urb.c b/src/gallium/drivers/i965/brw_urb.c
index 8c6f4355a6..18d79c5ebb 100644
--- a/src/gallium/drivers/i965/brw_urb.c
+++ b/src/gallium/drivers/i965/brw_urb.c
@@ -31,7 +31,7 @@
         
 
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
diff --git a/src/gallium/drivers/i965/brw_util.h b/src/gallium/drivers/i965/brw_util.h
index 37c3acbc11..b5f9a36e7b 100644
--- a/src/gallium/drivers/i965/brw_util.h
+++ b/src/gallium/drivers/i965/brw_util.h
@@ -36,9 +36,8 @@
 #include "brw_types.h"
 
 extern GLuint brw_count_bits( GLuint val );
-extern GLuint brw_parameter_list_state_flags(struct gl_program_parameter_list *paramList);
-extern GLuint brw_translate_blend_factor( GLenum factor );
-extern GLuint brw_translate_blend_equation( GLenum mode );
+extern GLuint brw_translate_blend_factor( unsigned factor );
+extern GLuint brw_translate_blend_equation( unsigned mode );
 
 
diff --git a/src/gallium/drivers/i965/brw_vs.c b/src/gallium/drivers/i965/brw_vs.c
index 97e523c3ee..dcd687ac34 100644
--- a/src/gallium/drivers/i965/brw_vs.c
+++ b/src/gallium/drivers/i965/brw_vs.c
@@ -34,7 +34,6 @@
 #include "brw_vs.h"
 #include "brw_util.h"
 #include "brw_state.h"
-#include "shader/prog_print.h"
 
 
@@ -113,7 +112,7 @@ static void brw_upload_vs_prog(struct brw_context *brw)
  */
 const struct brw_tracked_state brw_vs_prog = {
    .dirty = {
-      .mesa  = PIPE_NEW_UCP | PIPE_NEW_RAST,
+      .mesa  = PIPE_NEW_CLIP | PIPE_NEW_RAST,
       .brw   = BRW_NEW_VERTEX_PROGRAM,
       .cache = 0
    },
diff --git a/src/gallium/drivers/i965/brw_vs.h b/src/gallium/drivers/i965/brw_vs.h
index 4a591365c9..54f7d7d7c4 100644
--- a/src/gallium/drivers/i965/brw_vs.h
+++ b/src/gallium/drivers/i965/brw_vs.h
@@ -36,7 +36,6 @@
 
 #include "brw_context.h"
 #include "brw_eu.h"
-#include "shader/program.h"
 
 
 struct brw_vs_prog_key {
diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c
index 6adb743017..e946944295 100644
--- a/src/gallium/drivers/i965/brw_vs_emit.c
+++ b/src/gallium/drivers/i965/brw_vs_emit.c
@@ -192,7 +192,7 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 					     BRW_WIDTH_8,
 					     BRW_HORIZONTAL_STRIDE_1,
 					     BRW_SWIZZLE_XXXX,
-					     WRITEMASK_X);
+					     BRW_WRITEMASK_X);
       reg++;
    }
 
@@ -487,7 +487,7 @@ static void emit_exp_noalias( struct brw_vs_compile *c,
    struct brw_compile *p = &c->func;
    
 
-   if (dst.dw1.bits.writemask & WRITEMASK_X) {
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_X) {
       struct brw_reg tmp = get_tmp(c);
       struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
 
@@ -499,23 +499,23 @@ static void emit_exp_noalias( struct brw_vs_compile *c,
       /* Adjust exponent for floating point: 
        * exp += 127 
        */
-      brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
+      brw_ADD(p, brw_writemask(tmp_d, BRW_WRITEMASK_X), tmp_d, brw_imm_d(127));
 
       /* Install exponent and sign.  
        * Excess drops off the edge: 
        */
-      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X), 
+      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), BRW_WRITEMASK_X), 
 	      tmp_d, brw_imm_d(23));
 
       release_tmp(c, tmp);
    }
 
-   if (dst.dw1.bits.writemask & WRITEMASK_Y) {
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Y) {
       /* result[1] = arg0.x - floor(arg0.x) */
-      brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
+      brw_FRC(p, brw_writemask(dst, BRW_WRITEMASK_Y), brw_swizzle1(arg0, 0));
    }
    
-   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z) {
       /* As with the LOG instruction, we might be better off just
        * doing a taylor expansion here, seeing as we have to do all
        * the prep work.
@@ -525,14 +525,14 @@ static void emit_exp_noalias( struct brw_vs_compile *c,
        */
       emit_math1(c, 
 		 BRW_MATH_FUNCTION_EXP, 
-		 brw_writemask(dst, WRITEMASK_Z),
+		 brw_writemask(dst, BRW_WRITEMASK_Z),
 		 brw_swizzle1(arg0, 0), 
 		 BRW_MATH_PRECISION_FULL);
    }  
 
-   if (dst.dw1.bits.writemask & WRITEMASK_W) {
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_W) {
       /* result[3] = 1.0; */
-      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
+      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_W), brw_imm_f(1));
    }
 }
 
@@ -562,36 +562,36 @@ static void emit_log_noalias( struct brw_vs_compile *c,
     * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
     * result[1].i = (x.i & ((1<<23)-1)        + (127<<23)
     */
-   if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_XZ) {
       brw_AND(p, 
-	      brw_writemask(tmp_ud, WRITEMASK_X),
+	      brw_writemask(tmp_ud, BRW_WRITEMASK_X),
 	      brw_swizzle1(arg0_ud, 0),
 	      brw_imm_ud((1U<<31)-1));
 
       brw_SHR(p, 
-	      brw_writemask(tmp_ud, WRITEMASK_X), 
+	      brw_writemask(tmp_ud, BRW_WRITEMASK_X), 
 	      tmp_ud,
 	      brw_imm_ud(23));
 
       brw_ADD(p, 
-	      brw_writemask(tmp, WRITEMASK_X), 
+	      brw_writemask(tmp, BRW_WRITEMASK_X), 
 	      retype(tmp_ud, BRW_REGISTER_TYPE_D),	/* does it matter? */
 	      brw_imm_d(-127));
    }
 
-   if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_YZ) {
       brw_AND(p, 
-	      brw_writemask(tmp_ud, WRITEMASK_Y),
+	      brw_writemask(tmp_ud, BRW_WRITEMASK_Y),
 	      brw_swizzle1(arg0_ud, 0),
 	      brw_imm_ud((1<<23)-1));
 
       brw_OR(p, 
-	     brw_writemask(tmp_ud, WRITEMASK_Y), 
+	     brw_writemask(tmp_ud, BRW_WRITEMASK_Y), 
 	     tmp_ud,
 	     brw_imm_ud(127<<23));
    }
    
-   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z) {
       /* result[2] = result[0] + LOG2(result[1]); */
 
       /* Why bother?  The above is just a hint how to do this with a
@@ -606,19 +606,19 @@ static void emit_log_noalias( struct brw_vs_compile *c,
        */
       emit_math1(c, 
 		 BRW_MATH_FUNCTION_LOG, 
-		 brw_writemask(tmp, WRITEMASK_Z), 
+		 brw_writemask(tmp, BRW_WRITEMASK_Z), 
 		 brw_swizzle1(tmp, 1), 
 		 BRW_MATH_PRECISION_FULL);
       
       brw_ADD(p, 
-	      brw_writemask(tmp, WRITEMASK_Z), 
+	      brw_writemask(tmp, BRW_WRITEMASK_Z), 
 	      brw_swizzle1(tmp, 2), 
 	      brw_swizzle1(tmp, 0));
    }  
 
-   if (dst.dw1.bits.writemask & WRITEMASK_W) {
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_W) {
       /* result[3] = 1.0; */
-      brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
+      brw_MOV(p, brw_writemask(tmp, BRW_WRITEMASK_W), brw_imm_f(1));
    }
 
    if (need_tmp) {
@@ -639,14 +639,14 @@ static void emit_dst_noalias( struct brw_vs_compile *c,
 
    /* There must be a better way to do this: 
     */
-   if (dst.dw1.bits.writemask & WRITEMASK_X)
-      brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
-   if (dst.dw1.bits.writemask & WRITEMASK_Y)
-      brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
-   if (dst.dw1.bits.writemask & WRITEMASK_Z)
-      brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
-   if (dst.dw1.bits.writemask & WRITEMASK_W)
-      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_X)
+      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_X), brw_imm_f(1.0));
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Y)
+      brw_MUL(p, brw_writemask(dst, BRW_WRITEMASK_Y), arg0, arg1);
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z)
+      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_Z), arg0);
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_W)
+      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_W), arg1);
 }
 
 
@@ -672,8 +672,8 @@ static void emit_lit_noalias( struct brw_vs_compile *c,
    if (need_tmp) 
       tmp = get_tmp(c);
    
-   brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0)); 
-   brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1)); 
+   brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_YZ), brw_imm_f(0)); 
+   brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_XW), brw_imm_f(1)); 
 
    /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
     * to get all channels active inside the IF.  In the clipping code
@@ -683,15 +683,15 @@ static void emit_lit_noalias( struct brw_vs_compile *c,
    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
    if_insn = brw_IF(p, BRW_EXECUTE_8);
    {
-      brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
+      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_Y), brw_swizzle1(arg0,0));
 
       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
-      brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
+      brw_MOV(p, brw_writemask(tmp, BRW_WRITEMASK_Z),  brw_swizzle1(arg0,1));
       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 
       emit_math2(c, 
 		 BRW_MATH_FUNCTION_POW, 
-		 brw_writemask(dst, WRITEMASK_Z),
+		 brw_writemask(dst, BRW_WRITEMASK_Z),
 		 brw_swizzle1(tmp, 2),
 		 brw_swizzle1(arg0, 3),
 		 BRW_MATH_PRECISION_PARTIAL);      
@@ -1045,7 +1045,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
    /* ndc = 1.0 / pos.w */
    emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
    /* ndc.xyz = pos * ndc */
-   brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
+   brw_MUL(p, brw_writemask(ndc, BRW_WRITEMASK_XYZ), pos, ndc);
 
    /* Update the header for point size, user clipping flags, and -ve rhw
     * workaround.
@@ -1062,14 +1062,14 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 
       if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) {
 	 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
-	 brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
-	 brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
+	 brw_MUL(p, brw_writemask(header1, BRW_WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
+	 brw_AND(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
       }
 
       for (i = 0; i < c->key.nr_userclip; i++) {
 	 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
 	 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
-	 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
+	 brw_OR(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(1<<i));
 	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
       }
 
@@ -1089,7 +1089,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 		 brw_swizzle1(ndc, 3),
 		 brw_imm_f(0));
    
-	 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
+	 brw_OR(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(1<<6));
 	 brw_MOV(p, ndc, brw_imm_f(0));
 	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
       }
@@ -1139,7 +1139,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 		 eot, 		/* writes complete */
 		 0, 		/* urb destination offset */
 		 BRW_URB_SWIZZLE_INTERLEAVE);
-
+
    if (c->first_overflow_output > 0) {
       /* Not all of the vertex outputs/results fit into the MRF.
        * Move the overflowed attributes from the GRF to the MRF and
diff --git a/src/gallium/drivers/i965/brw_winsys.h b/src/gallium/drivers/i965/brw_winsys.h
new file mode 100644
index 0000000000..2142db5a4d
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_winsys.h
@@ -0,0 +1,243 @@
+/**************************************************************************
+ *
+ * Copyright © 2009 Jakob Bornecrantz
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef BRW_WINSYS_H
+#define BRW_WINSYS_H
+
+#include "pipe/p_compiler.h"
+
+struct brw_winsys;
+struct pipe_fence_handle;
+
+/* Winsys buffer handle.  This currently just wraps a dri_bo (presumably the
+ * opaque 'bo' pointer below -- confirm). */
+struct brw_winsys_buffer {
+   struct brw_winsys_screen *sws;
+   void *bo;
+   unsigned offset;
+};
+
+enum brw_buffer_usage {
+   I915_GEM_DOMAIN_RENDER,
+   I915_GEM_DOMAIN_SAMPLER,
+   I915_GEM_DOMAIN_VERTEX,
+   I915_GEM_DOMAIN_INSTRUCTION,
+
+
+   /* XXX: migrate from domains to explicit usage cases, e.g. the flags below.
+    * The unnumbered domain values above (0..3) overlap these flag values. */
+
+   /* use on textures */
+   BRW_USAGE_RENDER    = 0x01,
+   BRW_USAGE_SAMPLER   = 0x02,
+   BRW_USAGE_2D_TARGET = 0x04,
+   BRW_USAGE_2D_SOURCE = 0x08,
+   /* use on vertex */
+   BRW_USAGE_VERTEX    = 0x10,
+};
+
+enum brw_buffer_type
+{
+   BRW_BUFFER_TYPE_TEXTURE,
+   BRW_BUFFER_TYPE_SCANOUT, /**< a texture that is scanned out from */
+   BRW_BUFFER_TYPE_VERTEX,
+};
+
+
+/* AKA winsys context:
+ */
+struct brw_batchbuffer {
+
+   struct brw_winsys *iws;
+   struct brw_winsys_buffer *buf;
+
+   /**
+    * Values exported to speed up writing to the batchbuffer,
+    * instead of having to go through an accessor function for
+    * each dword written.
+    */
+   /*@{*/
+   uint8_t *map;
+   uint8_t *ptr;
+   size_t size;
+
+   size_t relocs;
+   size_t max_relocs;
+   /*@}*/
+};
+
+struct brw_winsys_screen {
+
+   /**
+    * Batchbuffer functions.
+    */
+   /*@{*/
+   /**
+    * Create a new batchbuffer.
+    */
+   struct brw_batchbuffer *(*batchbuffer_create)(struct brw_winsys_screen *iws);
+
+   /**
+    * Emit a relocation to a buffer.
+    * Target position in batchbuffer is the same as ptr.
+    */
+   int (*batchbuffer_reloc)(struct brw_batchbuffer *batch,
+			    unsigned offset,
+                            struct brw_winsys_buffer *reloc,
+			    unsigned pre_add,
+                            enum brw_buffer_usage usage);
+
+   /**
+    * Flush a batchbuffer.
+    */
+   void (*batchbuffer_flush)(struct brw_batchbuffer *batch,
+                             struct pipe_fence_handle **fence);
+
+   /**
+    * Destroy a batchbuffer.
+    */
+   void (*batchbuffer_destroy)(struct brw_batchbuffer *batch);
+   /*@}*/
+
+
+   /**
+    * Buffer functions.  NOTE(review): hooks below take struct brw_winsys *, but batchbuffer_create takes struct brw_winsys_screen * -- confirm.
+    */
+   /*@{*/
+   /**
+    * Create a buffer.
+    */
+   struct brw_winsys_buffer *(*buffer_create)(struct brw_winsys *iws,
+					      unsigned size, 
+					      unsigned alignment,
+					      enum brw_buffer_type type);
+
+
+   /* Reference and unreference buffers, and emit buffer relocations:
+    */
+   void (*bo_reference)( struct brw_winsys_buffer *buffer );
+   void (*bo_unreference)( struct brw_winsys_buffer *buffer );
+   void (*bo_emit_reloc)( struct brw_winsys_buffer *buffer,
+			  unsigned domain,
+			  unsigned a,
+			  unsigned b,
+			  unsigned offset,
+			  struct brw_winsys_buffer *b2);
+
+   /**
+    * Map a buffer.
+    */
+   void *(*buffer_map)(struct brw_winsys *iws,
+                       struct brw_winsys_buffer *buffer,
+                       boolean write);
+
+   /**
+    * Unmap a buffer.
+    */
+   void (*buffer_unmap)(struct brw_winsys *iws,
+                        struct brw_winsys_buffer *buffer);
+
+   /**
+    * Write to a buffer.
+    *
+    * Arguments follow pipe_buffer_write.
+    */
+   int (*buffer_write)(struct brw_winsys *iws,
+                       struct brw_winsys_buffer *dst,
+                       size_t offset,
+                       size_t size,
+                       const void *data);
+
+   void (*buffer_destroy)(struct brw_winsys *iws,
+                          struct brw_winsys_buffer *buffer);
+   /*@}*/
+
+
+   /**
+    * Fence functions.
+    */
+   /*@{*/
+   /**
+    * Reference fence and set ptr to fence.
+    */
+   void (*fence_reference)(struct brw_winsys *iws,
+                           struct pipe_fence_handle **ptr,
+                           struct pipe_fence_handle *fence);
+
+   /**
+    * Check if a fence has finished.
+    */
+   int (*fence_signalled)(struct brw_winsys *iws,
+                          struct pipe_fence_handle *fence);
+
+   /**
+    * Wait on a fence to finish.
+    */
+   int (*fence_finish)(struct brw_winsys *iws,
+                       struct pipe_fence_handle *fence);
+   /*@}*/
+
+
+   /**
+    * Destroy the winsys.
+    */
+   void (*destroy)(struct brw_winsys *iws);
+};
+
+
+/**
+ * Create the pipe_screen for a winsys and PCI id.  NOTE(review): i915_ prefix in a brw header -- confirm naming.
+ */
+struct pipe_screen *i915_create_screen(struct brw_winsys *iws, unsigned pci_id);
+
+/**
+ * Create a pipe_context on the given screen.  NOTE(review): i915_ prefix in a brw header -- confirm naming.
+ */
+struct pipe_context *i915_create_context(struct pipe_screen *screen);
+
+/**
+ * Get the brw_winsys buffer backing the texture.
+ * Presumably the buffer and its stride come back through the two pointer arguments -- confirm.
+ * TODO UGLY
+ */
+struct pipe_texture;
+boolean i915_get_texture_buffer_brw(struct pipe_texture *texture,
+				    struct brw_winsys_buffer **buffer,
+				    unsigned *stride);
+
+/**
+ * Wrap a brw_winsys buffer with a texture blanket.
+ * Takes a template texture, a pitch and the buffer to wrap; returns a pipe_texture pointer.
+ * TODO UGLY
+ */
+struct pipe_texture * i915_texture_blanket_brw(struct pipe_screen *screen,
+                                                 struct pipe_texture *tmplt,
+                                                 unsigned pitch,
+                                                 struct brw_winsys_buffer *buffer);
+
+
+
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_wm.h b/src/gallium/drivers/i965/brw_wm.h
index 756a680150..18775830f9 100644
--- a/src/gallium/drivers/i965/brw_wm.h
+++ b/src/gallium/drivers/i965/brw_wm.h
@@ -34,7 +34,6 @@
 #define BRW_WM_H
 
 
-#include "shader/prog_instruction.h"
 #include "brw_context.h"
 #include "brw_eu.h"
 
diff --git a/src/gallium/drivers/i965/brw_wm_debug.c b/src/gallium/drivers/i965/brw_wm_debug.c
index 220821087c..c6659646f2 100644
--- a/src/gallium/drivers/i965/brw_wm_debug.c
+++ b/src/gallium/drivers/i965/brw_wm_debug.c
@@ -98,7 +98,7 @@ void brw_wm_print_insn( struct brw_wm_compile *c,
    }
    _mesa_printf("]");
 
-   if (inst->writemask != WRITEMASK_XYZW)
+   if (inst->writemask != BRW_WRITEMASK_XYZW)
       _mesa_printf(".%s%s%s%s", 
 		   GET_BIT(inst->writemask, 0) ? "x" : "",
 		   GET_BIT(inst->writemask, 1) ? "y" : "",
diff --git a/src/gallium/drivers/i965/brw_wm_emit.c b/src/gallium/drivers/i965/brw_wm_emit.c
index fec33f74eb..7df9b79d7a 100644
--- a/src/gallium/drivers/i965/brw_wm_emit.c
+++ b/src/gallium/drivers/i965/brw_wm_emit.c
@@ -72,14 +72,14 @@ static void emit_pixel_xy(struct brw_compile *p,
    /* Calculate pixel centers by adding 1 or 0 to each of the
     * micro-tile coordinates passed in r1.
     */
-   if (mask & WRITEMASK_X) {
+   if (mask & BRW_WRITEMASK_X) {
       brw_ADD(p,
 	      vec16(retype(dst[0], BRW_REGISTER_TYPE_UW)),
 	      stride(suboffset(r1_uw, 4), 2, 4, 0),
 	      brw_imm_v(0x10101010));
    }
 
-   if (mask & WRITEMASK_Y) {
+   if (mask & BRW_WRITEMASK_Y) {
       brw_ADD(p,
 	      vec16(retype(dst[1], BRW_REGISTER_TYPE_UW)),
 	      stride(suboffset(r1_uw,5), 2, 4, 0),
@@ -101,14 +101,14 @@ static void emit_delta_xy(struct brw_compile *p,
    /* Calc delta X,Y by subtracting origin in r1 from the pixel
     * centers.
     */
-   if (mask & WRITEMASK_X) {
+   if (mask & BRW_WRITEMASK_X) {
       brw_ADD(p,
 	      dst[0],
 	      retype(arg0[0], BRW_REGISTER_TYPE_UW),
 	      negate(r1));
    }
 
-   if (mask & WRITEMASK_Y) {
+   if (mask & BRW_WRITEMASK_Y) {
       brw_ADD(p,
 	      dst[1],
 	      retype(arg0[1], BRW_REGISTER_TYPE_UW),
@@ -124,7 +124,7 @@ static void emit_wpos_xy(struct brw_wm_compile *c,
 {
    struct brw_compile *p = &c->func;
 
-   if (mask & WRITEMASK_X) {
+   if (mask & BRW_WRITEMASK_X) {
       /* X' = X */
       brw_MOV(p,
 	      dst[0],
@@ -133,7 +133,7 @@ static void emit_wpos_xy(struct brw_wm_compile *c,
 
    /* XXX: is this needed any more, or is this a NOOP?
     */
-   if (mask & WRITEMASK_Y) {
+   if (mask & BRW_WRITEMASK_Y) {
       /* Y' = height - 1 - Y */
       brw_ADD(p,
 	      dst[1],
@@ -152,7 +152,7 @@ static void emit_pixel_w( struct brw_compile *p,
    /* Don't need this if all you are doing is interpolating color, for
     * instance.
     */
-   if (mask & WRITEMASK_W) {      
+   if (mask & BRW_WRITEMASK_W) {      
       struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
 
       /* Calc 1/w - just linterp wpos[3] optimized by putting the
@@ -255,7 +255,7 @@ static void emit_frontfacing( struct brw_compile *p,
    struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
    GLuint i;
 
-   if (!(mask & WRITEMASK_XYZW))
+   if (!(mask & BRW_WRITEMASK_XYZW))
       return;
 
    for (i = 0; i < 4; i++) {
@@ -321,26 +321,26 @@ void emit_ddxy(struct brw_compile *p,
 			   BRW_VERTICAL_STRIDE_2,
 			   BRW_WIDTH_2,
 			   BRW_HORIZONTAL_STRIDE_0,
-			   BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+			   BRW_SWIZZLE_XYZW, BRW_WRITEMASK_XYZW);
 	    src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 			   BRW_REGISTER_TYPE_F,
 			   BRW_VERTICAL_STRIDE_2,
 			   BRW_WIDTH_2,
 			   BRW_HORIZONTAL_STRIDE_0,
-			   BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+			   BRW_SWIZZLE_XYZW, BRW_WRITEMASK_XYZW);
 	 } else {
 	    src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 			   BRW_REGISTER_TYPE_F,
 			   BRW_VERTICAL_STRIDE_4,
 			   BRW_WIDTH_4,
 			   BRW_HORIZONTAL_STRIDE_0,
-			   BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+			   BRW_SWIZZLE_XYZW, BRW_WRITEMASK_XYZW);
 	    src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
 			   BRW_REGISTER_TYPE_F,
 			   BRW_VERTICAL_STRIDE_4,
 			   BRW_WIDTH_4,
 			   BRW_HORIZONTAL_STRIDE_0,
-			   BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+			   BRW_SWIZZLE_XYZW, BRW_WRITEMASK_XYZW);
 	 }
 	 brw_ADD(p, dst[i], src0, negate(src1));
       }
@@ -611,12 +611,12 @@ static void emit_dp3( struct brw_compile *p,
 		      const struct brw_reg *arg0,
 		      const struct brw_reg *arg1 )
 {
-   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+   int dst_chan = _mesa_ffs(mask & BRW_WRITEMASK_XYZW) - 1;
 
-   if (!(mask & WRITEMASK_XYZW))
+   if (!(mask & BRW_WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert(is_power_of_two(mask & WRITEMASK_XYZW));
+   assert(is_power_of_two(mask & BRW_WRITEMASK_XYZW));
 
    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
@@ -633,12 +633,12 @@ static void emit_dp4( struct brw_compile *p,
 		      const struct brw_reg *arg0,
 		      const struct brw_reg *arg1 )
 {
-   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+   int dst_chan = _mesa_ffs(mask & BRW_WRITEMASK_XYZW) - 1;
 
-   if (!(mask & WRITEMASK_XYZW))
+   if (!(mask & BRW_WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert(is_power_of_two(mask & WRITEMASK_XYZW));
+   assert(is_power_of_two(mask & BRW_WRITEMASK_XYZW));
 
    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
@@ -656,12 +656,12 @@ static void emit_dph( struct brw_compile *p,
 		      const struct brw_reg *arg0,
 		      const struct brw_reg *arg1 )
 {
-   const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+   const int dst_chan = _mesa_ffs(mask & BRW_WRITEMASK_XYZW) - 1;
 
-   if (!(mask & WRITEMASK_XYZW))
+   if (!(mask & BRW_WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert(is_power_of_two(mask & WRITEMASK_XYZW));
+   assert(is_power_of_two(mask & BRW_WRITEMASK_XYZW));
 
    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
@@ -681,7 +681,7 @@ static void emit_xpd( struct brw_compile *p,
 {
    GLuint i;
 
-   assert(!(mask & WRITEMASK_W) == WRITEMASK_X);
+   assert((mask & BRW_WRITEMASK_W) == 0);   /* W must not be written: only channels 0..2 are looped over below */
    
    for (i = 0 ; i < 3; i++) {
       if (mask & (1<<i)) {
@@ -704,12 +704,12 @@ static void emit_math1( struct brw_compile *p,
 			GLuint mask,
 			const struct brw_reg *arg0 )
 {
-   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+   int dst_chan = _mesa_ffs(mask & BRW_WRITEMASK_XYZW) - 1;
 
-   if (!(mask & WRITEMASK_XYZW))
+   if (!(mask & BRW_WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert(is_power_of_two(mask & WRITEMASK_XYZW));
+   assert(is_power_of_two(mask & BRW_WRITEMASK_XYZW));
 
    brw_MOV(p, brw_message_reg(2), arg0[0]);
 
@@ -732,12 +732,12 @@ static void emit_math2( struct brw_compile *p,
 			const struct brw_reg *arg0,
 			const struct brw_reg *arg1)
 {
-   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+   int dst_chan = _mesa_ffs(mask & BRW_WRITEMASK_XYZW) - 1;
 
-   if (!(mask & WRITEMASK_XYZW))
+   if (!(mask & BRW_WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert(is_power_of_two(mask & WRITEMASK_XYZW));
+   assert(is_power_of_two(mask & BRW_WRITEMASK_XYZW));
 
    brw_push_insn_state(p);
 
@@ -795,17 +795,17 @@ static void emit_tex( struct brw_wm_compile *c,
     */
    switch (inst->tex_idx) {
    case TEXTURE_1D_INDEX:
-      emit = WRITEMASK_X;
+      emit = BRW_WRITEMASK_X;
       nr = 1;
       break;
    case TEXTURE_2D_INDEX:
    case TEXTURE_RECT_INDEX:
-      emit = WRITEMASK_XY;
+      emit = BRW_WRITEMASK_XY;
       nr = 2;
       break;
    case TEXTURE_3D_INDEX:
    case TEXTURE_CUBE_INDEX:
-      emit = WRITEMASK_XYZ;
+      emit = BRW_WRITEMASK_XYZ;
       nr = 3;
       break;
    default:
@@ -815,7 +815,7 @@ static void emit_tex( struct brw_wm_compile *c,
 
    if (inst->tex_shadow) {
       nr = 4;
-      emit |= WRITEMASK_W;
+      emit |= BRW_WRITEMASK_W;
    }
 
    msgLength = 1;
@@ -922,18 +922,18 @@ static void emit_lit( struct brw_compile *p,
 		      GLuint mask,
 		      const struct brw_reg *arg0 )
 {
-   assert((mask & WRITEMASK_XW) == 0);
+   assert((mask & BRW_WRITEMASK_XW) == 0);
 
-   if (mask & WRITEMASK_Y) {
+   if (mask & BRW_WRITEMASK_Y) {
       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
       brw_MOV(p, dst[1], arg0[0]);
       brw_set_saturate(p, 0);
    }
 
-   if (mask & WRITEMASK_Z) {
+   if (mask & BRW_WRITEMASK_Z) {
       emit_math2(p, BRW_MATH_FUNCTION_POW,
 		 &dst[2],
-		 WRITEMASK_X | (mask & SATURATE),
+		 BRW_WRITEMASK_X | (mask & SATURATE),
 		 &arg0[1],
 		 &arg0[3]);
    }
@@ -944,10 +944,10 @@ static void emit_lit( struct brw_compile *p,
     */
    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
    {
-      if (mask & WRITEMASK_Y) 
+      if (mask & BRW_WRITEMASK_Y) 
 	 brw_MOV(p, dst[1], brw_imm_f(0));
 
-      if (mask & WRITEMASK_Z) 
+      if (mask & BRW_WRITEMASK_Z) 
 	 brw_MOV(p, dst[2], brw_imm_f(0)); 
    }
    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
@@ -1414,10 +1414,10 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 /* There is an scs math function, but it would need some
 	  * fixup for 16-element execution.
 	  */
-	 if (dst_flags & WRITEMASK_X)
-	    emit_math1(p, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
-	 if (dst_flags & WRITEMASK_Y)
-	    emit_math1(p, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
+	 if (dst_flags & BRW_WRITEMASK_X)
+	    emit_math1(p, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|BRW_WRITEMASK_X, args[0]);
+	 if (dst_flags & BRW_WRITEMASK_Y)
+	    emit_math1(p, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|BRW_WRITEMASK_X, args[0]);
 	 break;
 
       case OPCODE_POW:
diff --git a/src/gallium/drivers/i965/brw_wm_fp.c b/src/gallium/drivers/i965/brw_wm_fp.c
index 5f47d86f71..be240031c7 100644
--- a/src/gallium/drivers/i965/brw_wm_fp.c
+++ b/src/gallium/drivers/i965/brw_wm_fp.c
@@ -115,7 +115,7 @@ static struct prog_dst_register dst_reg(GLuint file, GLuint idx)
    struct prog_dst_register reg;
    reg.File = file;
    reg.Index = idx;
-   reg.WriteMask = WRITEMASK_XYZW;
+   reg.WriteMask = BRW_WRITEMASK_XYZW;
    reg.RelAddr = 0;
    reg.CondMask = COND_TR;
    reg.CondSwizzle = 0;
@@ -249,7 +249,7 @@ static struct prog_src_register get_pixel_xy( struct brw_wm_compile *c )
        */
       emit_op(c,
 	      WM_PIXELXY,
-	      dst_mask(pixel_xy, WRITEMASK_XY),
+	      dst_mask(pixel_xy, BRW_WRITEMASK_XY),
 	      0,
 	      payload_r0_depth,
 	      src_undef(),
@@ -272,7 +272,7 @@ static struct prog_src_register get_delta_xy( struct brw_wm_compile *c )
        */
       emit_op(c,
 	      WM_DELTAXY,
-	      dst_mask(delta_xy, WRITEMASK_XY),
+	      dst_mask(delta_xy, BRW_WRITEMASK_XY),
 	      0,
 	      pixel_xy, 
 	      payload_r0_depth,
@@ -295,7 +295,7 @@ static struct prog_src_register get_pixel_w( struct brw_wm_compile *c )
        */
       emit_op(c,
 	      WM_PIXELW,
-	      dst_mask(pixel_w, WRITEMASK_W),
+	      dst_mask(pixel_w, BRW_WRITEMASK_W),
 	      0,
 	      interp_wpos,
 	      deltas, 
@@ -327,13 +327,13 @@ static void emit_interp( struct brw_wm_compile *c,
        */
       emit_op(c,
 	      WM_WPOSXY,
-	      dst_mask(dst, WRITEMASK_XY),
+	      dst_mask(dst, BRW_WRITEMASK_XY),
 	      0,
 	      get_pixel_xy(c),
 	      src_undef(),
 	      src_undef());
       
-      dst = dst_mask(dst, WRITEMASK_ZW);
+      dst = dst_mask(dst, BRW_WRITEMASK_ZW);
 
       /* PROGRAM_INPUT.attr.xyzw = INTERP payload.interp[attr].x, deltas.xyw
        */
@@ -370,7 +370,7 @@ static void emit_interp( struct brw_wm_compile *c,
       /* Interpolate the fog coordinate */
       emit_op(c,
 	      WM_PINTERP,
-	      dst_mask(dst, WRITEMASK_X),
+	      dst_mask(dst, BRW_WRITEMASK_X),
 	      0,
 	      interp,
 	      deltas,
@@ -378,7 +378,7 @@ static void emit_interp( struct brw_wm_compile *c,
 
       emit_op(c,
 	      TGSI_OPCODE_MOV,
-	      dst_mask(dst, WRITEMASK_YZW),
+	      dst_mask(dst, BRW_WRITEMASK_YZW),
 	      0,
 	      src_swizzle(interp,
 			  SWIZZLE_ZERO,
@@ -393,7 +393,7 @@ static void emit_interp( struct brw_wm_compile *c,
       /* XXX review/test this case */
       emit_op(c,
               WM_FRONTFACING,
-              dst_mask(dst, WRITEMASK_X),
+              dst_mask(dst, BRW_WRITEMASK_X),
               0,
               src_undef(),
               src_undef(),
@@ -404,7 +404,7 @@ static void emit_interp( struct brw_wm_compile *c,
       /* XXX review/test this case */
       emit_op(c,
 	      WM_PINTERP,
-	      dst_mask(dst, WRITEMASK_XY),
+	      dst_mask(dst, BRW_WRITEMASK_XY),
 	      0,
 	      interp,
 	      deltas,
@@ -412,7 +412,7 @@ static void emit_interp( struct brw_wm_compile *c,
 
       emit_op(c,
 	      TGSI_OPCODE_MOV,
-	      dst_mask(dst, WRITEMASK_ZW),
+	      dst_mask(dst, BRW_WRITEMASK_ZW),
 	      0,
 	      src_swizzle(interp,
 			  SWIZZLE_ZERO,
@@ -518,19 +518,19 @@ static void precalc_dst( struct brw_wm_compile *c,
    struct prog_src_register src1 = inst->SrcReg[1];
    struct prog_dst_register dst = inst->DstReg;
    
-   if (dst.WriteMask & WRITEMASK_Y) {      
+   if (dst.WriteMask & BRW_WRITEMASK_Y) {      
       /* dst.y = mul src0.y, src1.y
        */
       emit_op(c,
 	      TGSI_OPCODE_MUL,
-	      dst_mask(dst, WRITEMASK_Y),
+	      dst_mask(dst, BRW_WRITEMASK_Y),
 	      inst->SaturateMode,
 	      src0,
 	      src1,
 	      src_undef());
    }
 
-   if (dst.WriteMask & WRITEMASK_XZ) {
+   if (dst.WriteMask & BRW_WRITEMASK_XZ) {
       struct prog_instruction *swz;
       GLuint z = GET_SWZ(src0.Swizzle, Z);
 
@@ -538,7 +538,7 @@ static void precalc_dst( struct brw_wm_compile *c,
        */
       swz = emit_op(c,
 		    TGSI_OPCODE_MOV,
-		    dst_mask(dst, WRITEMASK_XZ),
+		    dst_mask(dst, BRW_WRITEMASK_XZ),
 		    inst->SaturateMode,
 		    src_swizzle(src0, SWIZZLE_ONE, z, z, z),
 		    src_undef(),
@@ -546,12 +546,12 @@ static void precalc_dst( struct brw_wm_compile *c,
       /* Avoid letting negation flag of src0 affect our 1 constant. */
       swz->SrcReg[0].Negate &= ~NEGATE_X;
    }
-   if (dst.WriteMask & WRITEMASK_W) {
+   if (dst.WriteMask & BRW_WRITEMASK_W) {
       /* dst.w = mov src1.w
        */
       emit_op(c,
 	      TGSI_OPCODE_MOV,
-	      dst_mask(dst, WRITEMASK_W),
+	      dst_mask(dst, BRW_WRITEMASK_W),
 	      inst->SaturateMode,
 	      src1,
 	      src_undef(),
@@ -566,14 +566,14 @@ static void precalc_lit( struct brw_wm_compile *c,
    struct prog_src_register src0 = inst->SrcReg[0];
    struct prog_dst_register dst = inst->DstReg;
    
-   if (dst.WriteMask & WRITEMASK_XW) {
+   if (dst.WriteMask & BRW_WRITEMASK_XW) {
       struct prog_instruction *swz;
 
       /* dst.xw = swz src0.1111
        */
       swz = emit_op(c,
 		    TGSI_OPCODE_MOV,
-		    dst_mask(dst, WRITEMASK_XW),
+		    dst_mask(dst, BRW_WRITEMASK_XW),
 		    0,
 		    src_swizzle1(src0, SWIZZLE_ONE),
 		    src_undef(),
@@ -582,10 +582,10 @@ static void precalc_lit( struct brw_wm_compile *c,
       swz->SrcReg[0].Negate = NEGATE_NONE;
    }
 
-   if (dst.WriteMask & WRITEMASK_YZ) {
+   if (dst.WriteMask & BRW_WRITEMASK_YZ) {
       emit_op(c,
 	      TGSI_OPCODE_LIT,
-	      dst_mask(dst, WRITEMASK_YZ),
+	      dst_mask(dst, BRW_WRITEMASK_YZ),
 	      inst->SaturateMode,
 	      src0,
 	      src_undef(),
@@ -649,7 +649,7 @@ static void precalc_tex( struct brw_wm_compile *c,
 
        /* tmp0 = 1 / tmp1 */
        emit_op(c, TGSI_OPCODE_RCP,
-               dst_mask(tmp0, WRITEMASK_X),
+               dst_mask(tmp0, BRW_WRITEMASK_X),
                0,
                tmp1src,
                src_undef(),
@@ -740,7 +740,7 @@ static void precalc_tex( struct brw_wm_compile *c,
        */
       emit_op(c,
 	      TGSI_OPCODE_ADD,
-	      dst_mask(tmp, WRITEMASK_XYZ),
+	      dst_mask(tmp, BRW_WRITEMASK_XYZ),
 	      0,
 	      tmpsrc,
 	      C0,
@@ -751,7 +751,7 @@ static void precalc_tex( struct brw_wm_compile *c,
 
       emit_op(c,
 	      TGSI_OPCODE_MUL,
-	      dst_mask(tmp, WRITEMASK_Y),
+	      dst_mask(tmp, BRW_WRITEMASK_Y),
 	      0,
 	      tmpsrc,
 	      src_swizzle1(C0, W),
@@ -766,7 +766,7 @@ static void precalc_tex( struct brw_wm_compile *c,
 
       emit_op(c,
 	      TGSI_OPCODE_MAD,
-	      dst_mask(dst, WRITEMASK_XYZ),
+	      dst_mask(dst, BRW_WRITEMASK_XYZ),
 	      0,
 	      swap_uv?src_swizzle(tmpsrc, Z,Z,X,X):src_swizzle(tmpsrc, X,X,Z,Z),
 	      C1,
@@ -776,7 +776,7 @@ static void precalc_tex( struct brw_wm_compile *c,
        */
       emit_op(c,
 	      TGSI_OPCODE_MAD,
-	      dst_mask(dst, WRITEMASK_Y),
+	      dst_mask(dst, BRW_WRITEMASK_Y),
 	      0,
 	      src_swizzle1(tmpsrc, Z),
 	      src_swizzle1(C1, W),
@@ -863,7 +863,7 @@ static void precalc_txp( struct brw_wm_compile *c,
        */
       emit_op(c,
 	      TGSI_OPCODE_RCP,
-	      dst_mask(tmp, WRITEMASK_W),
+	      dst_mask(tmp, BRW_WRITEMASK_W),
 	      0,
 	      src_swizzle1(src0, GET_SWZ(src0.Swizzle, W)),
 	      src_undef(),
@@ -873,7 +873,7 @@ static void precalc_txp( struct brw_wm_compile *c,
        */
       emit_op(c,
 	      TGSI_OPCODE_MUL,
-	      dst_mask(tmp, WRITEMASK_XYZ),
+	      dst_mask(tmp, BRW_WRITEMASK_XYZ),
 	      0,
 	      src0,
 	      src_swizzle1(src_reg_from_dst(tmp), W),
@@ -1053,7 +1053,7 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
 	 out = emit_insn(c, inst);
 	 /* This should probably be done in the parser. 
 	  */
-	 out->DstReg.WriteMask &= WRITEMASK_XY;
+	 out->DstReg.WriteMask &= BRW_WRITEMASK_XY;
 	 break;
 	 
       case TGSI_OPCODE_DST:
@@ -1082,7 +1082,7 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
 	 out = emit_insn(c, inst);
 	 /* This should probably be done in the parser. 
 	  */
-	 out->DstReg.WriteMask &= WRITEMASK_XYZ;
+	 out->DstReg.WriteMask &= BRW_WRITEMASK_XYZ;
 	 break;
 
       case TGSI_OPCODE_KIL: 
diff --git a/src/gallium/drivers/i965/brw_wm_pass0.c b/src/gallium/drivers/i965/brw_wm_pass0.c
index 0c411b57f5..de5f5fe821 100644
--- a/src/gallium/drivers/i965/brw_wm_pass0.c
+++ b/src/gallium/drivers/i965/brw_wm_pass0.c
@@ -32,7 +32,6 @@
 
 #include "brw_context.h"
 #include "brw_wm.h"
-#include "shader/prog_parameter.h"
 
 
diff --git a/src/gallium/drivers/i965/brw_wm_pass1.c b/src/gallium/drivers/i965/brw_wm_pass1.c
index d940ec09a9..f2ae3a958f 100644
--- a/src/gallium/drivers/i965/brw_wm_pass1.c
+++ b/src/gallium/drivers/i965/brw_wm_pass1.c
@@ -91,15 +91,15 @@ static GLuint get_texcoord_mask( GLuint tex_idx )
 {
    switch (tex_idx) {
    case TEXTURE_1D_INDEX:
-      return WRITEMASK_X;
+      return BRW_WRITEMASK_X;
    case TEXTURE_2D_INDEX:
-      return WRITEMASK_XY;
+      return BRW_WRITEMASK_XY;
    case TEXTURE_3D_INDEX:
-      return WRITEMASK_XYZ;
+      return BRW_WRITEMASK_XYZ;
    case TEXTURE_CUBE_INDEX:
-      return WRITEMASK_XYZ;
+      return BRW_WRITEMASK_XYZ;
    case TEXTURE_RECT_INDEX:
-      return WRITEMASK_XY;
+      return BRW_WRITEMASK_XY;
    default: return 0;
    }
 }
@@ -121,16 +121,16 @@ void brw_wm_pass1( struct brw_wm_compile *c )
       GLuint read0, read1, read2;
 
       if (inst->opcode == TGSI_OPCODE_KIL) {
-	 track_arg(c, inst, 0, WRITEMASK_XYZW); /* All args contribute to final */
+	 track_arg(c, inst, 0, BRW_WRITEMASK_XYZW); /* All args contribute to final */
 	 continue;
       }
 
       if (inst->opcode == WM_FB_WRITE) {
-	 track_arg(c, inst, 0, WRITEMASK_XYZW); 
-	 track_arg(c, inst, 1, WRITEMASK_XYZW); 
+	 track_arg(c, inst, 0, BRW_WRITEMASK_XYZW); 
+	 track_arg(c, inst, 1, BRW_WRITEMASK_XYZW); 
 	 if (c->key.source_depth_to_render_target &&
 	     c->key.computes_depth)
-	    track_arg(c, inst, 2, WRITEMASK_Z); 
+	    track_arg(c, inst, 2, BRW_WRITEMASK_Z); 
 	 else
 	    track_arg(c, inst, 2, 0); 
 	 continue;
@@ -191,9 +191,9 @@ void brw_wm_pass1( struct brw_wm_compile *c )
 	 break;
 
       case TGSI_OPCODE_XPD: 
-	 if (writemask & WRITEMASK_X) read0 |= WRITEMASK_YZ;	 
-	 if (writemask & WRITEMASK_Y) read0 |= WRITEMASK_XZ;	 
-	 if (writemask & WRITEMASK_Z) read0 |= WRITEMASK_XY;
+	 if (writemask & BRW_WRITEMASK_X) read0 |= BRW_WRITEMASK_YZ;	 
+	 if (writemask & BRW_WRITEMASK_Y) read0 |= BRW_WRITEMASK_XZ;	 
+	 if (writemask & BRW_WRITEMASK_Z) read0 |= BRW_WRITEMASK_XY;
 	 read1 = read0;
 	 break;
 
@@ -206,12 +206,12 @@ void brw_wm_pass1( struct brw_wm_compile *c )
       case TGSI_OPCODE_SCS:
       case WM_CINTERP:
       case WM_PIXELXY:
-	 read0 = WRITEMASK_X;
+	 read0 = BRW_WRITEMASK_X;
 	 break;
 
       case TGSI_OPCODE_POW:
-	 read0 = WRITEMASK_X;
-	 read1 = WRITEMASK_X;
+	 read0 = BRW_WRITEMASK_X;
+	 read1 = BRW_WRITEMASK_X;
 	 break;
 
       case TGSI_OPCODE_TEX:
@@ -219,57 +219,57 @@ void brw_wm_pass1( struct brw_wm_compile *c )
 	 read0 = get_texcoord_mask(inst->tex_idx);
 
          if (inst->tex_shadow)
-	    read0 |= WRITEMASK_Z;
+	    read0 |= BRW_WRITEMASK_Z;
 	 break;
 
       case TGSI_OPCODE_TXB:
 	 /* Shadow ignored for txb.
 	  */
-	 read0 = get_texcoord_mask(inst->tex_idx) | WRITEMASK_W;
+	 read0 = get_texcoord_mask(inst->tex_idx) | BRW_WRITEMASK_W;
 	 break;
 
       case WM_WPOSXY:
-	 read0 = writemask & WRITEMASK_XY;
+	 read0 = writemask & BRW_WRITEMASK_XY;
 	 break;
 
       case WM_DELTAXY:
-	 read0 = writemask & WRITEMASK_XY;
-	 read1 = WRITEMASK_X;
+	 read0 = writemask & BRW_WRITEMASK_XY;
+	 read1 = BRW_WRITEMASK_X;
 	 break;
 
       case WM_PIXELW:
-	 read0 = WRITEMASK_X;
-	 read1 = WRITEMASK_XY;
+	 read0 = BRW_WRITEMASK_X;
+	 read1 = BRW_WRITEMASK_XY;
 	 break;
 
       case WM_LINTERP:
-	 read0 = WRITEMASK_X;
-	 read1 = WRITEMASK_XY;
+	 read0 = BRW_WRITEMASK_X;
+	 read1 = BRW_WRITEMASK_XY;
 	 break;
 
       case WM_PINTERP:
-	 read0 = WRITEMASK_X; /* interpolant */
-	 read1 = WRITEMASK_XY; /* deltas */
-	 read2 = WRITEMASK_W; /* pixel w */
+	 read0 = BRW_WRITEMASK_X; /* interpolant */
+	 read1 = BRW_WRITEMASK_XY; /* deltas */
+	 read2 = BRW_WRITEMASK_W; /* pixel w */
 	 break;
 
       case TGSI_OPCODE_DP3:	
-	 read0 = WRITEMASK_XYZ;
-	 read1 = WRITEMASK_XYZ;
+	 read0 = BRW_WRITEMASK_XYZ;
+	 read1 = BRW_WRITEMASK_XYZ;
 	 break;
 
       case TGSI_OPCODE_DPH:
-	 read0 = WRITEMASK_XYZ;
-	 read1 = WRITEMASK_XYZW;
+	 read0 = BRW_WRITEMASK_XYZ;
+	 read1 = BRW_WRITEMASK_XYZW;
 	 break;
 
       case TGSI_OPCODE_DP4:
-	 read0 = WRITEMASK_XYZW;
-	 read1 = WRITEMASK_XYZW;
+	 read0 = BRW_WRITEMASK_XYZW;
+	 read1 = BRW_WRITEMASK_XYZW;
 	 break;
 
       case TGSI_OPCODE_LIT: 
-	 read0 = WRITEMASK_XYW;
+	 read0 = BRW_WRITEMASK_XYW;
 	 break;
 
       case TGSI_OPCODE_DST:
diff --git a/src/gallium/drivers/i965/brw_wm_surface_state.c b/src/gallium/drivers/i965/brw_wm_surface_state.c
index 86dcb74b5b..5045c9b4a6 100644
--- a/src/gallium/drivers/i965/brw_wm_surface_state.c
+++ b/src/gallium/drivers/i965/brw_wm_surface_state.c
@@ -31,7 +31,7 @@
                    
 
 #include "intel_mipmap_tree.h"
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 #include "intel_tex.h"
 #include "intel_fbo.h"
 
diff --git a/src/gallium/drivers/i965/intel_batchbuffer.h b/src/gallium/drivers/i965/intel_batchbuffer.h
deleted file mode 100644
index be04656aec..0000000000
--- a/src/gallium/drivers/i965/intel_batchbuffer.h
+++ /dev/null
@@ -1,168 +0,0 @@
-#ifndef INTEL_BATCHBUFFER_H
-#define INTEL_BATCHBUFFER_H
-
-#include "intel_bufmgr.h"
-#include "intel_reg.h"
-
-#define BATCH_SZ 16384
-#define BATCH_RESERVED 16
-
-enum cliprect_mode {
-   /**
-    * Batchbuffer contents may be looped over per cliprect, but do not
-    * require it.
-    */
-   IGNORE_CLIPRECTS,
-   /**
-    * Batchbuffer contents require looping over per cliprect at batch submit
-    * time.
-    *
-    * This will be upgraded to NO_LOOP_CLIPRECTS when there's a single
-    * constant cliprect, as in DRI2 or FBO rendering.
-    */
-   LOOP_CLIPRECTS,
-   /**
-    * Batchbuffer contents contain drawing that should not be executed multiple
-    * times.
-    */
-   NO_LOOP_CLIPRECTS,
-   /**
-    * Batchbuffer contents contain drawing that already handles cliprects, such
-    * as 2D drawing to front/back/depth that doesn't respect DRAWING_RECTANGLE.
-    *
-    * Equivalent behavior to NO_LOOP_CLIPRECTS, but may not persist in batch
-    * outside of LOCK/UNLOCK.  This is upgraded to just NO_LOOP_CLIPRECTS when
-    * there's a constant cliprect, as in DRI2 or FBO rendering.
-    */
-   REFERENCES_CLIPRECTS
-};
-
-struct intel_batchbuffer
-{
-   struct intel_context *intel;
-
-   struct brw_winsys_buffer *buf;
-
-   GLubyte *buffer;
-
-   GLubyte *map;
-   GLubyte *ptr;
-
-   GLuint size;
-
-   /** Tracking of BEGIN_BATCH()/OUT_BATCH()/ADVANCE_BATCH() debugging */
-   struct {
-      GLuint total;
-      GLubyte *start_ptr;
-   } emit;
-
-   GLuint dirty_state;
-};
-
-struct intel_batchbuffer *intel_batchbuffer_alloc(struct intel_context
-                                                  *intel);
-
-void intel_batchbuffer_free(struct intel_batchbuffer *batch);
-
-
-void _intel_batchbuffer_flush(struct intel_batchbuffer *batch,
-			      const char *file, int line);
-
-#define intel_batchbuffer_flush(batch) \
-	_intel_batchbuffer_flush(batch, __FILE__, __LINE__)
-
-void intel_batchbuffer_reset(struct intel_batchbuffer *batch);
-
-
-/* Unlike bmBufferData, this currently requires the buffer be mapped.
- * Consider it a convenience function wrapping multple
- * intel_buffer_dword() calls.
- */
-void intel_batchbuffer_data(struct intel_batchbuffer *batch,
-                            const void *data, GLuint bytes,
-			    enum cliprect_mode cliprect_mode);
-
-void intel_batchbuffer_release_space(struct intel_batchbuffer *batch,
-                                     GLuint bytes);
-
-GLboolean intel_batchbuffer_emit_reloc(struct intel_batchbuffer *batch,
-                                       struct brw_winsys_buffer *buffer,
-				       uint32_t read_domains,
-				       uint32_t write_domain,
-				       uint32_t offset);
-
-/* Inline functions - might actually be better off with these
- * non-inlined.  Certainly better off switching all command packets to
- * be passed as structs rather than dwords, but that's a little bit of
- * work...
- */
-static INLINE GLint
-intel_batchbuffer_space(struct intel_batchbuffer *batch)
-{
-   return (batch->size - BATCH_RESERVED) - (batch->ptr - batch->map);
-}
-
-
-static INLINE void
-intel_batchbuffer_emit_dword(struct intel_batchbuffer *batch, GLuint dword)
-{
-   assert(batch->map);
-   assert(intel_batchbuffer_space(batch) >= 4);
-   *(GLuint *) (batch->ptr) = dword;
-   batch->ptr += 4;
-}
-
-static INLINE void
-intel_batchbuffer_require_space(struct intel_batchbuffer *batch,
-                                GLuint sz,
-				enum cliprect_mode cliprect_mode)
-{
-   assert(sz < batch->size - 8);
-   if (intel_batchbuffer_space(batch) < sz)
-      intel_batchbuffer_flush(batch);
-
-   /* All commands should be executed once regardless of cliprect
-    * mode.
-    */
-   (void)cliprect_mode;
-}
-
-/* Here are the crusty old macros, to be removed:
- */
-#define BATCH_LOCALS
-
-#define BEGIN_BATCH(n, cliprect_mode) do {				\
-   intel_batchbuffer_require_space(intel->batch, (n)*4, cliprect_mode); \
-   assert(intel->batch->emit.start_ptr == NULL);			\
-   intel->batch->emit.total = (n) * 4;					\
-   intel->batch->emit.start_ptr = intel->batch->ptr;			\
-} while (0)
-
-#define OUT_BATCH(d) intel_batchbuffer_emit_dword(intel->batch, d)
-
-#define OUT_RELOC(buf, read_domains, write_domain, delta) do {		\
-   assert((unsigned) (delta) < buf->size);				\
-   intel_batchbuffer_emit_reloc(intel->batch, buf,			\
-				read_domains, write_domain, delta);	\
-} while (0)
-
-#define ADVANCE_BATCH() do {						\
-   unsigned int _n = intel->batch->ptr - intel->batch->emit.start_ptr;	\
-   assert(intel->batch->emit.start_ptr != NULL);			\
-   if (_n != intel->batch->emit.total) {				\
-      fprintf(stderr, "ADVANCE_BATCH: %d of %d dwords emitted\n",	\
-	      _n, intel->batch->emit.total);				\
-      abort();								\
-   }									\
-   intel->batch->emit.start_ptr = NULL;					\
-} while(0)
-
-
-static INLINE void
-intel_batchbuffer_emit_mi_flush(struct intel_batchbuffer *batch)
-{
-   intel_batchbuffer_require_space(batch, 4, IGNORE_CLIPRECTS);
-   intel_batchbuffer_emit_dword(batch, MI_FLUSH);
-}
-
-#endif
-- 
cgit v1.2.3


From 4dd2f6640b70e2313f8771f7588aa49a861153aa Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Sun, 25 Oct 2009 00:02:16 +0100
Subject: i965g: more work on compiling, particularly the brw_draw files

---
 src/gallium/auxiliary/util/u_debug.c       |  27 +++
 src/gallium/auxiliary/util/u_prim.h        |   2 +
 src/gallium/auxiliary/util/u_upload_mgr.h  |   2 +
 src/gallium/drivers/i965/Makefile          |   2 +-
 src/gallium/drivers/i965/brw_batchbuffer.c | 198 +++++++++++++++
 src/gallium/drivers/i965/brw_batchbuffer.h |  14 +-
 src/gallium/drivers/i965/brw_cc.c          |   8 +-
 src/gallium/drivers/i965/brw_clip.c        |   4 +-
 src/gallium/drivers/i965/brw_clip_state.c  |   4 +-
 src/gallium/drivers/i965/brw_context.c     |   2 +-
 src/gallium/drivers/i965/brw_context.h     |  68 ++++--
 src/gallium/drivers/i965/brw_curbe.c       |  13 +-
 src/gallium/drivers/i965/brw_draw.c        | 165 +++++++------
 src/gallium/drivers/i965/brw_draw.h        |   3 +-
 src/gallium/drivers/i965/brw_draw_upload.c | 372 +++++++++++++++++------------
 src/gallium/drivers/i965/brw_eu.c          |   5 +-
 src/gallium/drivers/i965/brw_eu_debug.c    |  13 +-
 src/gallium/drivers/i965/brw_misc_state.c  |  18 +-
 src/gallium/drivers/i965/brw_pipe_flush.c  |   3 +
 src/gallium/drivers/i965/brw_pipe_shader.c |  19 ++
 src/gallium/drivers/i965/brw_pipe_vertex.c |  25 +-
 src/gallium/drivers/i965/brw_screen.h      |  22 ++
 src/gallium/drivers/i965/brw_sf.c          |   2 +-
 src/gallium/drivers/i965/brw_sf_state.c    |  39 +--
 src/gallium/drivers/i965/brw_state.h       |   6 +-
 src/gallium/drivers/i965/brw_state_batch.c |   4 +-
 src/gallium/drivers/i965/brw_swtnl.c       |   6 +-
 src/gallium/drivers/i965/brw_winsys.h      |   7 +
 src/gallium/drivers/i965/brw_wm.c          |   2 +-
 src/gallium/drivers/i965/brw_wm.h          |   8 +-
 src/gallium/drivers/i965/brw_wm_glsl.c     |  28 ---
 src/gallium/drivers/i965/brw_wm_pass0.c    |  32 +--
 src/mesa/state_tracker/st_draw.c           |   3 +-
 33 files changed, 722 insertions(+), 404 deletions(-)
 create mode 100644 src/gallium/drivers/i965/brw_batchbuffer.c

(limited to 'src/gallium/drivers/i965/brw_wm.h')

diff --git a/src/gallium/auxiliary/util/u_debug.c b/src/gallium/auxiliary/util/u_debug.c
index 96d400c839..321ac59a7d 100644
--- a/src/gallium/auxiliary/util/u_debug.c
+++ b/src/gallium/auxiliary/util/u_debug.c
@@ -69,6 +69,7 @@
 #include "util/u_stream.h" 
 #include "util/u_math.h" 
 #include "util/u_tile.h" 
+#include "util/u_prim.h" 
 
 
 #ifdef PIPE_SUBSYSTEM_WINDOWS_DISPLAY
@@ -600,6 +601,32 @@ const char *pf_name( enum pipe_format format )
 }
 
 
+/* Names for the PIPE_PRIM_* values, looked up by u_prim_name(); populated only when DEBUG is defined. */
+static const struct debug_named_value pipe_prim_names[] = {
+#ifdef DEBUG
+   DEBUG_NAMED_VALUE(PIPE_PRIM_POINTS),
+   DEBUG_NAMED_VALUE(PIPE_PRIM_LINES),
+   DEBUG_NAMED_VALUE(PIPE_PRIM_LINE_LOOP),
+   DEBUG_NAMED_VALUE(PIPE_PRIM_LINE_STRIP),
+   DEBUG_NAMED_VALUE(PIPE_PRIM_TRIANGLES),
+   DEBUG_NAMED_VALUE(PIPE_PRIM_TRIANGLE_STRIP),
+   DEBUG_NAMED_VALUE(PIPE_PRIM_TRIANGLE_FAN),
+   DEBUG_NAMED_VALUE(PIPE_PRIM_QUADS),
+   DEBUG_NAMED_VALUE(PIPE_PRIM_QUAD_STRIP),
+   DEBUG_NAMED_VALUE(PIPE_PRIM_POLYGON),
+#endif
+   DEBUG_NAMED_VALUE_END
+};
+
+/* Return the name of a PIPE_PRIM_* value, as found in pipe_prim_names by debug_dump_enum(). */
+const char *u_prim_name( unsigned prim )
+{
+   return debug_dump_enum(pipe_prim_names, prim);
+}
+
+
+
+
 #ifdef DEBUG
 void debug_dump_image(const char *prefix,
                       unsigned format, unsigned cpp,
diff --git a/src/gallium/auxiliary/util/u_prim.h b/src/gallium/auxiliary/util/u_prim.h
index a9b533eea7..7434329962 100644
--- a/src/gallium/auxiliary/util/u_prim.h
+++ b/src/gallium/auxiliary/util/u_prim.h
@@ -135,4 +135,6 @@ static INLINE unsigned u_reduced_prim( unsigned pipe_prim )
    }
 }
 
+const char *u_prim_name( unsigned pipe_prim );
+
 #endif
diff --git a/src/gallium/auxiliary/util/u_upload_mgr.h b/src/gallium/auxiliary/util/u_upload_mgr.h
index 745b5834af..d414a1f2f6 100644
--- a/src/gallium/auxiliary/util/u_upload_mgr.h
+++ b/src/gallium/auxiliary/util/u_upload_mgr.h
@@ -32,6 +32,8 @@
 #ifndef U_UPLOAD_MGR_H
 #define U_UPLOAD_MGR_H
 
+#include "pipe/p_error.h"
+
 struct pipe_screen;
 struct pipe_buffer;
 struct u_upload_mgr;
diff --git a/src/gallium/drivers/i965/Makefile b/src/gallium/drivers/i965/Makefile
index 40c8364824..40e8aa8786 100644
--- a/src/gallium/drivers/i965/Makefile
+++ b/src/gallium/drivers/i965/Makefile
@@ -61,7 +61,7 @@ C_SOURCES = \
 	brw_wm_state.c \
 	brw_wm_surface_state.c \
 	brw_bo.c \
-	intel_batchbuffer.c \
+	brw_batchbuffer.c \
 	intel_tex_layout.c 
 
 include ../../Makefile.template
diff --git a/src/gallium/drivers/i965/brw_batchbuffer.c b/src/gallium/drivers/i965/brw_batchbuffer.c
new file mode 100644
index 0000000000..8bcac76ede
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_batchbuffer.c
@@ -0,0 +1,198 @@
+/**************************************************************************
+ * 
+ * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "brw_batchbuffer.h"
+#include "brw_decode.h"
+#include "brw_reg.h"
+#include "brw_winsys.h"
+
+/* Drop the current buffer object (if any), allocate a new one and rewind the write pointer. */
+void
+brw_batchbuffer_reset(struct brw_batchbuffer *batch)
+{
+   struct intel_context *intel = batch->intel;
+
+   if (batch->buf != NULL) {
+      batch->sws->bo_unreference(batch->buf);
+      batch->buf = NULL;
+   }
+   /* NOTE(review): 'intel' is batch->intel, which brw_batchbuffer_alloc() never sets - confirm. */
+   if (!batch->buffer && intel->ttm == GL_TRUE)
+      batch->buffer = malloc (intel->maxBatchSize);
+   /* map (and so ptr) points at the malloc'd buffer when there is one, otherwise at the mapped bo. */
+   batch->buf = batch->sws->bo_alloc(batch->sws,
+                                     BRW_BUFFER_TYPE_BATCH,
+                                     intel->maxBatchSize, 4096);
+   if (batch->buffer)
+      batch->map = batch->buffer;
+   else {
+      batch->sws->bo_map(batch->buf, GL_TRUE);
+      batch->map = batch->buf->virtual;
+   }
+   batch->size = intel->maxBatchSize;
+   batch->ptr = batch->map;
+   batch->dirty_state = ~0;
+   batch->cliprect_mode = IGNORE_CLIPRECTS;
+}
+/* Create a batchbuffer bound to 'sws' and prime it via brw_batchbuffer_reset(); NULL if allocation fails. */
+struct brw_batchbuffer *
+brw_batchbuffer_alloc(struct brw_winsys_screen *sws)
+{
+   struct brw_batchbuffer *batch = CALLOC_STRUCT(brw_batchbuffer);
+   if (batch == NULL)
+      return NULL;
+   batch->sws = sws;
+   brw_batchbuffer_reset(batch);
+   return batch;
+}
+/* Unmap the buffer object if the batch is mapped, drop its reference, then free the batch. */
+void
+brw_batchbuffer_free(struct brw_batchbuffer *batch)
+{
+   if (batch->map) {
+      dri_bo_unmap(batch->buf);
+      batch->map = NULL;
+   }
+   free(batch->buffer);   /* malloc'd in brw_batchbuffer_reset(); may be NULL */
+   batch->sws->bo_unreference(batch->buf);
+   batch->buf = NULL;
+   FREE(batch);
+}
+
+/* Finish the current batch, hand its buffer object to the winsys, then reset.
+ *
+ * Returns at once if nothing was emitted.  Otherwise an optional flush command,
+ * a padding noop (when needed) and MI_BATCH_BUFFER_END are appended, bo_unmap()
+ * and bo_exec() are called on the buffer object, and the batch is reset.
+ * 'file' and 'line' identify the caller and only appear in debug output.
+ */
+void
+_brw_batchbuffer_flush(struct brw_batchbuffer *batch, const char *file,
+                       int line)
+{
+   struct intel_context *intel = batch->intel;
+   GLuint used = batch->ptr - batch->map;
+
+   if (used == 0)
+      return;
+
+   if (intel->first_post_swapbuffers_batch == NULL) {
+      intel->first_post_swapbuffers_batch = intel->batch->buf;
+      batch->sws->bo_reference(intel->first_post_swapbuffers_batch);
+   }
+
+   if (INTEL_DEBUG & DEBUG_BATCH)
+      fprintf(stderr, "%s:%d: Batchbuffer flush with %db used\n", file, line,
+              used);
+
+   /* Emit a flush if the bufmgr doesn't do it for us. */
+   if (intel->always_flush_cache || !intel->ttm) {
+      *(GLuint *) (batch->ptr) = intel->vtbl.flush_cmd();
+      batch->ptr += 4;
+      used = batch->ptr - batch->map;
+   }
+
+   /* Round batchbuffer usage to 2 DWORDs: with dword-sized emission, padding here
+    * makes the total, including the END command below, a multiple of 8 bytes. */
+   if ((used & 4) == 0) {
+      *(GLuint *) (batch->ptr) = 0; /* noop */
+      batch->ptr += 4;
+      used = batch->ptr - batch->map;
+   }
+
+   /* Mark the end of the buffer. */
+   *(GLuint *) (batch->ptr) = MI_BATCH_BUFFER_END;
+   batch->ptr += 4;
+   used = batch->ptr - batch->map;
+
+   batch->sws->bo_unmap(batch->buf);
+
+   batch->map = NULL;
+   batch->ptr = NULL;
+
+   batch->sws->bo_exec(batch->buf, used, NULL, 0, 0 );
+
+   if (INTEL_DEBUG & DEBUG_BATCH) {
+      dri_bo_map(batch->buf, GL_FALSE);
+      intel_decode(batch->buf->virtual, used / 4, batch->buf->offset,
+                   brw->brw_screen->pci_id);
+      dri_bo_unmap(batch->buf);
+   }
+
+   if (INTEL_DEBUG & DEBUG_SYNC) {
+      fprintf(stderr, "waiting for idle\n");
+      dri_bo_map(batch->buf, GL_TRUE);
+      dri_bo_unmap(batch->buf);
+   }
+
+   /* Reset the buffer:
+    */
+   brw_batchbuffer_reset(batch);
+}
+
+
+/* Record a relocation to 'buffer' at the current batch offset, then emit
+ * buffer->offset + delta as a dword.  This is the only way buffers get added to the validate list. */
+GLboolean
+brw_batchbuffer_emit_reloc(struct brw_batchbuffer *batch,
+                             struct brw_winsys_buffer *buffer,
+                             uint32_t read_domains, uint32_t write_domain,
+                             uint32_t delta)
+{
+   int ret;
+
+   if (batch->ptr - batch->map > batch->buf->size)
+      _mesa_printf ("bad relocation ptr %p map %p offset %d size %d\n",
+                    (void *) batch->ptr, (void *) batch->map,
+                    (int) (batch->ptr - batch->map), (int) batch->buf->size);
+
+   ret = batch->sws->bo_emit_reloc(batch->buf,
+                                   read_domains,
+                                   write_domain,
+                                   delta,
+                                   batch->ptr - batch->map,
+                                   buffer);
+
+   /* Using the old buffer offset, write in what the right data would be, in
+    * case the buffer doesn't move and the kernel can skip relocation processing.
+    * NOTE(review): 'ret' is ignored and GL_TRUE is always returned, although
+    * brw_batchbuffer.h declares this function as returning int. */
+   brw_batchbuffer_emit_dword (batch, buffer->offset + delta);
+
+   return GL_TRUE;
+}
+/* Append 'bytes' bytes (a multiple of 4) from 'data' to the batch; returns 0, matching the int prototype. */
+int
+brw_batchbuffer_data(struct brw_batchbuffer *batch, const void *data,
+                     GLuint bytes, enum cliprect_mode cliprect_mode)
+{
+   assert((bytes & 3) == 0);
+   brw_batchbuffer_require_space(batch, bytes);
+   __memcpy(batch->ptr, data, bytes);
+   batch->ptr += bytes;
+   return 0;
+}
diff --git a/src/gallium/drivers/i965/brw_batchbuffer.h b/src/gallium/drivers/i965/brw_batchbuffer.h
index b8492882e1..25bb9cefca 100644
--- a/src/gallium/drivers/i965/brw_batchbuffer.h
+++ b/src/gallium/drivers/i965/brw_batchbuffer.h
@@ -33,18 +33,16 @@ void brw_batchbuffer_reset(struct brw_batchbuffer *batch);
  * Consider it a convenience function wrapping multple
  * intel_buffer_dword() calls.
  */
-void brw_batchbuffer_data(struct brw_batchbuffer *batch,
+int brw_batchbuffer_data(struct brw_batchbuffer *batch,
                             const void *data, GLuint bytes,
 			    enum cliprect_mode cliprect_mode);
 
-void brw_batchbuffer_release_space(struct brw_batchbuffer *batch,
-                                     GLuint bytes);
 
-GLboolean brw_batchbuffer_emit_reloc(struct brw_batchbuffer *batch,
-                                       struct brw_winsys_buffer *buffer,
-				       uint32_t read_domains,
-				       uint32_t write_domain,
-				       uint32_t offset);
+int brw_batchbuffer_emit_reloc(struct brw_batchbuffer *batch,
+			       struct brw_winsys_buffer *buffer,
+			       uint32_t read_domains,
+			       uint32_t write_domain,
+			       uint32_t offset);
 
 /* Inline functions - might actually be better off with these
  * non-inlined.  Certainly better off switching all command packets to
diff --git a/src/gallium/drivers/i965/brw_cc.c b/src/gallium/drivers/i965/brw_cc.c
index 76759304eb..ca10bc73f6 100644
--- a/src/gallium/drivers/i965/brw_cc.c
+++ b/src/gallium/drivers/i965/brw_cc.c
@@ -57,7 +57,7 @@ static void calc_sane_viewport( const struct pipe_viewport_state *vp,
    svp->far = 1;
 }
 
-static void prepare_cc_vp( struct brw_context *brw )
+static int prepare_cc_vp( struct brw_context *brw )
 {
    struct brw_cc_viewport ccv;
    struct sane_viewport svp;
@@ -72,6 +72,8 @@ static void prepare_cc_vp( struct brw_context *brw )
 
    brw->sws->bo_unreference(brw->cc.vp_bo);
    brw->cc.vp_bo = brw_cache_data( &brw->cache, BRW_CC_VP, &ccv, NULL, 0 );
+
+   return 0;
 }
 
 const struct brw_tracked_state brw_cc_vp = {
@@ -158,7 +160,7 @@ cc_unit_create_from_key(struct brw_context *brw, struct brw_cc_unit_key *key)
    return bo;
 }
 
-static void prepare_cc_unit( struct brw_context *brw )
+static int prepare_cc_unit( struct brw_context *brw )
 {
    struct brw_cc_unit_key key;
 
@@ -172,6 +174,8 @@ static void prepare_cc_unit( struct brw_context *brw )
 
    if (brw->cc.state_bo == NULL)
       brw->cc.state_bo = cc_unit_create_from_key(brw, &key);
+   
+   return 0;
 }
 
 const struct brw_tracked_state brw_cc_unit = {
diff --git a/src/gallium/drivers/i965/brw_clip.c b/src/gallium/drivers/i965/brw_clip.c
index 622d9dba96..1a52fa771b 100644
--- a/src/gallium/drivers/i965/brw_clip.c
+++ b/src/gallium/drivers/i965/brw_clip.c
@@ -146,7 +146,7 @@ static void compile_clip_prog( struct brw_context *brw,
 
 /* Calculate interpolants for triangle and line rasterization.
  */
-static void upload_clip_prog(struct brw_context *brw)
+static int upload_clip_prog(struct brw_context *brw)
 {
    struct brw_clip_prog_key key;
 
@@ -173,6 +173,8 @@ static void upload_clip_prog(struct brw_context *brw)
 					&brw->clip.prog_data);
    if (brw->clip.prog_bo == NULL)
       compile_clip_prog( brw, &key );
+
+   return 0;
 }
 
 
diff --git a/src/gallium/drivers/i965/brw_clip_state.c b/src/gallium/drivers/i965/brw_clip_state.c
index 25b8c6372f..bf4e6f5103 100644
--- a/src/gallium/drivers/i965/brw_clip_state.c
+++ b/src/gallium/drivers/i965/brw_clip_state.c
@@ -159,7 +159,7 @@ clip_unit_create_from_key(struct brw_context *brw,
    return bo;
 }
 
-static void upload_clip_unit( struct brw_context *brw )
+static int upload_clip_unit( struct brw_context *brw )
 {
    struct brw_clip_unit_key key;
 
@@ -173,6 +173,8 @@ static void upload_clip_unit( struct brw_context *brw )
    if (brw->clip.state_bo == NULL) {
       brw->clip.state_bo = clip_unit_create_from_key(brw, &key);
    }
+   
+   return 0;
 }
 
 const struct brw_tracked_state brw_clip_unit = {
diff --git a/src/gallium/drivers/i965/brw_context.c b/src/gallium/drivers/i965/brw_context.c
index e9605bafe6..e10b7d8bf5 100644
--- a/src/gallium/drivers/i965/brw_context.c
+++ b/src/gallium/drivers/i965/brw_context.c
@@ -105,7 +105,7 @@ struct pipe_context *brw_create_context(struct pipe_screen *screen)
    brw->state.dirty.mesa = ~0;
    brw->state.dirty.brw = ~0;
 
-   brw->emit_state_always = 0;
+   brw->flags.always_emit_state = 0;
 
    make_empty_list(&brw->query.active_head);
 
diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h
index dd782fdba9..7ead641811 100644
--- a/src/gallium/drivers/i965/brw_context.h
+++ b/src/gallium/drivers/i965/brw_context.h
@@ -182,6 +182,8 @@ struct brw_fragment_shader {
 #define PIPE_NEW_FRAGMENT_CONSTANTS     0x2
 #define PIPE_NEW_VERTEX_CONSTANTS       0x2
 #define PIPE_NEW_CLIP                   0x2
+#define PIPE_NEW_INDEX_BUFFER           0x2
+#define PIPE_NEW_INDEX_RANGE            0x2
 
 
 #define BRW_NEW_URB_FENCE               0x1
@@ -387,8 +389,8 @@ struct brw_cache {
  */
 struct brw_tracked_state {
    struct brw_state_flags dirty;
-   void (*prepare)( struct brw_context *brw );
-   void (*emit)( struct brw_context *brw );
+   int (*prepare)( struct brw_context *brw );
+   int (*emit)( struct brw_context *brw );
 };
 
 /* Flags for brw->state.cache.
@@ -465,9 +467,7 @@ struct brw_context
    GLuint primitive;
    GLuint reduced_primitive;
 
-   GLboolean emit_state_always;
-
-   /* Active vertex program: 
+   /* Active state from the state tracker: 
     */
    struct {
       const struct brw_vertex_shader *vertex_shader;
@@ -475,11 +475,31 @@ struct brw_context
       const struct brw_blend_state *blend;
       const struct brw_rasterizer_state *rast;
       const struct brw_depth_stencil_alpha_state *zstencil;
+
+      struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
+      struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
+      unsigned num_vertex_elements;
+      unsigned num_vertex_buffers;
+
       struct pipe_framebuffer_state fb;
       struct pipe_viewport_state vp;
       struct pipe_clip_state ucp;
       struct pipe_buffer *vertex_constants;
       struct pipe_buffer *fragment_constants;
+
+      /**
+       * Index buffer for this draw_prims call.
+       *
+       * Updates are signaled by PIPE_NEW_INDEX_BUFFER.
+       */
+      struct pipe_buffer *index_buffer;
+      unsigned index_size;
+
+      /* Updates are signalled by PIPE_NEW_INDEX_RANGE:
+       */
+      unsigned min_index;
+      unsigned max_index;
+
    } curr;
 
    struct {
@@ -504,30 +524,26 @@ struct brw_context
    struct brw_cached_batch_item *cached_batch_items;
 
    struct {
-      struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
-      struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
-      unsigned num_vertex_element;
-      unsigned num_vertex_buffer;
-
       struct u_upload_mgr *upload_vertex;
       struct u_upload_mgr *upload_index;
       
-
-      /* Summary of size and varying of active arrays, so we can check
-       * for changes to this state:
+      /* Information on uploaded vertex buffers:
        */
-      struct brw_vertex_info info;
-      unsigned int min_index, max_index;
+      struct {
+	 unsigned stride;	/* in bytes between successive vertices */
+	 unsigned offset;	/* in bytes, of first vertex in bo */
+	 unsigned vertex_count;	/* count of valid vertices which may be accessed */
+	 struct brw_winsys_buffer *bo;
+      } vb[PIPE_MAX_ATTRIBS];
+
+      struct {
+      } ve[PIPE_MAX_ATTRIBS];
+
+      unsigned nr_vb;		/* currently the same as curr.num_vertex_buffers */
+      unsigned nr_ve;		/* currently the same as curr.num_vertex_elements */
    } vb;
 
    struct {
-      /**
-       * Index buffer for this draw_prims call.
-       *
-       * Updates are signaled by BRW_NEW_INDICES.
-       */
-      const struct _mesa_index_buffer *ib;
-
       /* Updates to these fields are signaled by BRW_NEW_INDEX_BUFFER. */
       struct brw_winsys_buffer *bo;
       unsigned int offset;
@@ -668,6 +684,14 @@ struct brw_context
       int index;
       GLboolean active;
    } query;
+
+   struct {
+      unsigned always_emit_state:1;
+      unsigned always_flush_batch:1;
+      unsigned force_swtnl:1;
+      unsigned no_swtnl:1;
+   } flags;
+
    /* Used to give every program string a unique id
     */
    GLuint program_id;
diff --git a/src/gallium/drivers/i965/brw_curbe.c b/src/gallium/drivers/i965/brw_curbe.c
index edc39ff223..278ffa4ca2 100644
--- a/src/gallium/drivers/i965/brw_curbe.c
+++ b/src/gallium/drivers/i965/brw_curbe.c
@@ -48,7 +48,7 @@
  * constants.  That greatly reduces the demand for space in the CURBE.
  * Some of the comments within are dated...
  */
-static void calculate_curbe_offsets( struct brw_context *brw )
+static int calculate_curbe_offsets( struct brw_context *brw )
 {
    /* CACHE_NEW_WM_PROG */
    const GLuint nr_fp_regs = (brw->wm.prog_data->nr_params + 15) / 16;
@@ -104,6 +104,8 @@ static void calculate_curbe_offsets( struct brw_context *brw )
 
       brw->state.dirty.brw |= BRW_NEW_CURBE_OFFSETS;
    }
+
+   return 0;
 }
 
 
@@ -157,7 +159,7 @@ static GLfloat fixed_plane[6][4] = {
  * cache mechanism, but maybe would benefit from a comparison against
  * the current uploaded set of constants.
  */
-static void prepare_constant_buffer(struct brw_context *brw)
+static int prepare_constant_buffer(struct brw_context *brw)
 {
    const GLuint sz = brw->curbe.total_size;
    const GLuint bufsz = sz * 16 * sizeof(GLfloat);
@@ -170,7 +172,7 @@ static void prepare_constant_buffer(struct brw_context *brw)
 	 brw->curbe.last_buf = NULL;
 	 brw->curbe.last_bufsz  = 0;
       }
-      return;
+      return 0;
    }
 
    buf = (GLfloat *) CALLOC(bufsz, 1);
@@ -305,9 +307,11 @@ static void prepare_constant_buffer(struct brw_context *brw)
     * flushes as necessary when doublebuffering of CURBEs isn't
     * possible.
     */
+
+   return 0;
 }
 
-static void emit_constant_buffer(struct brw_context *brw)
+static int emit_constant_buffer(struct brw_context *brw)
 {
    GLuint sz = brw->curbe.total_size;
 
@@ -322,6 +326,7 @@ static void emit_constant_buffer(struct brw_context *brw)
 		(sz - 1) + brw->curbe.curbe_offset);
    }
    ADVANCE_BATCH();
+   return 0;
 }
 
 const struct brw_tracked_state brw_constant_buffer = {
diff --git a/src/gallium/drivers/i965/brw_draw.c b/src/gallium/drivers/i965/brw_draw.c
index 7af490bc5a..b5fe7c9601 100644
--- a/src/gallium/drivers/i965/brw_draw.c
+++ b/src/gallium/drivers/i965/brw_draw.c
@@ -26,15 +26,18 @@
  **************************************************************************/
 
 
+#include "util/u_prim.h"
+#include "util/u_upload_mgr.h"
+
 #include "brw_draw.h"
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_debug.h"
+#include "brw_screen.h"
 
 #include "brw_batchbuffer.h"
 
-#define FILE_DEBUG_FLAG DEBUG_BATCH
 
 static uint32_t prim_to_hw_prim[PIPE_PRIM_POLYGON+1] = {
    _3DPRIM_POINTLIST,
@@ -56,18 +59,21 @@ static uint32_t prim_to_hw_prim[PIPE_PRIM_POLYGON+1] = {
  * programs be immune to the active primitive (ie. cope with all
  * possibilities).  That may not be realistic however.
  */
-static GLuint brw_set_prim(struct brw_context *brw, unsigned prim)
+static int brw_set_prim(struct brw_context *brw, unsigned prim )
 {
 
    if (BRW_DEBUG & DEBUG_PRIMS)
       debug_printf("PRIM: %s\n", u_prim_name(prim));
    
    if (prim != brw->primitive) {
+      unsigned reduced_prim;
+
       brw->primitive = prim;
       brw->state.dirty.brw |= BRW_NEW_PRIMITIVE;
 
-      if (reduced_prim[prim] != brw->reduced_primitive) {
-	 brw->reduced_primitive = reduced_prim[prim];
+      reduced_prim = u_reduced_prim(prim);
+      if (reduced_prim != brw->reduced_primitive) {
+	 brw->reduced_primitive = reduced_prim;
 	 brw->state.dirty.brw |= BRW_NEW_REDUCED_PRIMITIVE;
       }
    }
@@ -77,17 +83,14 @@ static GLuint brw_set_prim(struct brw_context *brw, unsigned prim)
 
 
-static enum pipe_error brw_emit_prim(struct brw_context *brw,
-				     unsigned prim,
-				     unsigned start,
-				     unsigned count,
-				     boolean indexed,
-				     uint32_t hw_prim)
+static int brw_emit_prim(struct brw_context *brw,
+			 unsigned start,
+			 unsigned count,
+			 boolean indexed,
+			 uint32_t hw_prim)
 {
    struct brw_3d_primitive prim_packet;
-
-   if (INTEL_DEBUG & DEBUG_PRIMS)
-      debug_printf("PRIM: %s %d %d\n", u_prim_name(prim), start, count);
+   int ret;
 
    prim_packet.header.opcode = CMD_3D_PRIM;
    prim_packet.header.length = sizeof(prim_packet)/4 - 2;
@@ -101,7 +104,7 @@ static enum pipe_error brw_emit_prim(struct brw_context *brw,
       prim_packet.start_vert_location += brw->ib.start_vertex_offset;
    prim_packet.instance_count = 1;
    prim_packet.start_instance_location = 0;
-   prim_packet.base_vert_location = prim->basevertex;
+   prim_packet.base_vert_location = 0; // prim->basevertex; XXX: add this to gallium
 
 
    /* If we're set to always flush, do it before and after the primitive emit.
@@ -109,20 +112,20 @@ static enum pipe_error brw_emit_prim(struct brw_context *brw,
     * and missed flushes of the render cache as it heads to other parts of
     * the besides the draw code.
     */
-   if (intel->always_flush_cache) {
-      BEGIN_BATCH(1, IGNORE_CLIPRECTS)
-      OUT_BATCH(intel->vtbl.flush_cmd());
+   if (0) {
+      BEGIN_BATCH(1, IGNORE_CLIPRECTS);
+      OUT_BATCH((CMD_MI_FLUSH << 16) | BRW_FLUSH_STATE_CACHE);
       ADVANCE_BATCH();
    }
    if (prim_packet.verts_per_instance) {
-      ret = brw_batchbuffer_data( brw->intel.batch, &prim_packet,
+      ret = brw_batchbuffer_data( brw->batch, &prim_packet,
 				  sizeof(prim_packet), LOOP_CLIPRECTS);
       if (ret)
 	 return ret;
    }
-   if (intel->always_flush_cache) {
+   if (0) {
       BEGIN_BATCH(1, IGNORE_CLIPRECTS);
-      OUT_BATCH(intel->vtbl.flush_cmd());
+      OUT_BATCH((CMD_MI_FLUSH << 16) | BRW_FLUSH_STATE_CACHE);
       ADVANCE_BATCH();
    }
 
@@ -133,44 +136,24 @@ static enum pipe_error brw_emit_prim(struct brw_context *brw,
 /* May fail if out of video memory for texture or vbo upload, or on
  * fallback conditions.
  */
-static GLboolean brw_try_draw_prims( struct brw_context *brw,
-				     const struct gl_client_array *arrays[],
-				     const struct _mesa_prim *prim,
-				     GLuint nr_prims,
-				     const struct _mesa_index_buffer *ib,
-				     GLuint min_index,
-				     GLuint max_index )
+static int
+try_draw_range_elements(struct brw_context *brw,
+			struct pipe_buffer *index_buffer,
+			unsigned hw_prim, 
+			unsigned start, unsigned count)
 {
-   struct brw_context *brw = brw_context(ctx);
-   GLboolean retval = GL_FALSE;
-   GLboolean warn = GL_FALSE;
-   GLboolean first_time = GL_TRUE;
-   uint32_t hw_prim;
-   GLuint i;
-
-   if (ctx->NewState)
-      _mesa_update_state( ctx );
-
-   /* Bind all inputs, derive varying and size information:
-    */
-   brw_merge_inputs( brw, arrays );
-
-   brw->ib.ib = ib;
-   brw->state.dirty.brw |= BRW_NEW_INDICES;
-
-   brw->vb.min_index = min_index;
-   brw->vb.max_index = max_index;
-   brw->state.dirty.brw |= BRW_NEW_VERTICES;
-
-   hw_prim = brw_set_prim(brw, prim[i].mode);
+   int ret;
 
-   brw_validate_state(brw);
+   ret = brw_validate_state(brw);
+   if (ret)
+      return ret;
 
    /* Check that we can fit our state in with our existing batchbuffer, or
     * flush otherwise.
     */
-   ret = dri_bufmgr_check_aperture_space(brw->state.validated_bos,
-					 brw->state.validated_bo_count);
+   ret = brw->sws->check_aperture_space(brw->sws,
+					brw->state.validated_bos,
+					brw->state.validated_bo_count);
    if (ret)
       return ret;
 
@@ -178,12 +161,12 @@ static GLboolean brw_try_draw_prims( struct brw_context *brw,
    if (ret)
       return ret;
    
-   ret = brw_emit_prim(brw, &prim[i], hw_prim);
+   ret = brw_emit_prim(brw, start, count, index_buffer != NULL, hw_prim);
    if (ret)
       return ret;
 
-   if (intel->always_flush_batch)
-      brw_batchbuffer_flush(intel->batch);
+   if (brw->flags.always_flush_batch)
+      brw_batchbuffer_flush(brw->batch);
 
    return 0;
 }
@@ -197,22 +180,45 @@ brw_draw_range_elements(struct pipe_context *pipe,
 			unsigned max_index,
 			unsigned mode, unsigned start, unsigned count)
 {
-   enum pipe_error ret;
+   struct brw_context *brw = brw_context(pipe);
+   int ret;
+   uint32_t hw_prim;
+
+   hw_prim = brw_set_prim(brw, mode);
 
-   if (!vbo_all_varyings_in_vbos(arrays)) {
-      if (!index_bounds_valid)
-	 vbo_get_minmax_index(ctx, prim, ib, &min_index, &max_index);
+   if (BRW_DEBUG & DEBUG_PRIMS)
+      debug_printf("PRIM: %s %d %d\n", u_prim_name(mode), start, count);
+
+   /* Potentially trigger upload of new index buffer.
+    *
+    * XXX: do we need to go through state validation to achieve this?
+    * Could just call upload code directly.
+    */
+   if (brw->curr.index_buffer != index_buffer) {
+      pipe_buffer_reference( &brw->curr.index_buffer, index_buffer );
+      brw->state.dirty.mesa |= PIPE_NEW_INDEX_BUFFER;
+   }
+
+   /* XXX: do we really care?
+    */
+   if (brw->curr.min_index != min_index ||
+       brw->curr.max_index != max_index) 
+   { 
+      brw->curr.min_index = min_index;
+      brw->curr.max_index = max_index;
+      brw->state.dirty.mesa |= PIPE_NEW_INDEX_RANGE;
    }
 
+
    /* Make a first attempt at drawing:
     */
-   ret = brw_try_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
+   ret = try_draw_range_elements(brw, index_buffer, hw_prim, start, count );
 
    /* Otherwise, flush and retry:
     */
    if (ret != 0) {
-      brw_batchbuffer_flush(intel->batch);
-      ret = brw_try_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
+      brw_batchbuffer_flush(brw->batch);
+      ret = try_draw_range_elements(brw, index_buffer, hw_prim, start, count );
       assert(ret == 0);
    }
 
@@ -242,28 +248,37 @@ brw_draw_arrays(struct pipe_context *pipe, unsigned mode,
 
 
-void brw_draw_init( struct brw_context *brw )
+boolean brw_draw_init( struct brw_context *brw )
 {
    /* Register our drawing function: 
     */
    brw->base.draw_arrays = brw_draw_arrays;
    brw->base.draw_elements = brw_draw_elements;
    brw->base.draw_range_elements = brw_draw_range_elements;
-}
 
-void brw_draw_destroy( struct brw_context *brw )
-{
-   int i;
+   /* Create helpers for uploading data in user buffers:
+    */
+   brw->vb.upload_vertex = u_upload_create( &brw->brw_screen->base,
+					    128 * 1024,
+					    64,
+					    PIPE_BUFFER_USAGE_VERTEX );
+   if (brw->vb.upload_vertex == NULL)
+      return FALSE;
+
+   brw->vb.upload_index = u_upload_create( &brw->brw_screen->base,
+					   128 * 1024,
+					   64,
+					   PIPE_BUFFER_USAGE_INDEX );
+   if (brw->vb.upload_index == NULL)
+      return FALSE;
 
-   if (brw->vb.upload.bo != NULL) {
-      brw->sws->bo_unreference(brw->vb.upload.bo);
-      brw->vb.upload.bo = NULL;
-   }
+   return TRUE;
+}
 
-   for (i = 0; i < VERT_ATTRIB_MAX; i++) {
-      brw->sws->bo_unreference(brw->vb.inputs[i].bo);
-      brw->vb.inputs[i].bo = NULL;
-   }
+void brw_draw_cleanup( struct brw_context *brw )
+{
+   u_upload_destroy( brw->vb.upload_vertex );
+   u_upload_destroy( brw->vb.upload_index );
 
    brw->sws->bo_unreference(brw->ib.bo);
    brw->ib.bo = NULL;
diff --git a/src/gallium/drivers/i965/brw_draw.h b/src/gallium/drivers/i965/brw_draw.h
index 13f0443a81..8dc5dbce62 100644
--- a/src/gallium/drivers/i965/brw_draw.h
+++ b/src/gallium/drivers/i965/brw_draw.h
@@ -32,8 +32,7 @@
 
 struct brw_context;
 
-
-void brw_draw_init( struct brw_context *brw );
+boolean brw_draw_init( struct brw_context *brw );
 void brw_draw_cleanup( struct brw_context *brw );
 
 
diff --git a/src/gallium/drivers/i965/brw_draw_upload.c b/src/gallium/drivers/i965/brw_draw_upload.c
index 7b0860d04c..040d8ca93a 100644
--- a/src/gallium/drivers/i965/brw_draw_upload.c
+++ b/src/gallium/drivers/i965/brw_draw_upload.c
@@ -26,21 +26,23 @@
  **************************************************************************/
 
 #include "pipe/p_context.h"
+#include "pipe/p_error.h"
 
 #include "util/u_upload_mgr.h"
+#include "util/u_math.h"
 
 #include "brw_draw.h"
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_state.h"
-#include "brw_fallback.h"
-
+#include "brw_screen.h"
 #include "brw_batchbuffer.h"
+#include "brw_debug.h"
 
 
-unsigned brw_translate_surface_format( unsigned id )
+static unsigned brw_translate_surface_format( unsigned id )
 {
    switch (id) {
    case PIPE_FORMAT_R64_FLOAT:
@@ -186,70 +188,136 @@ static unsigned get_index_type(int type)
 }
 
 
-
-static boolean brw_prepare_vertices(struct brw_context *brw)
+static int brw_prepare_vertices(struct brw_context *brw)
 {
-   GLbitfield vs_inputs = brw->vs.prog_data->inputs_read; 
+   unsigned int min_index = brw->curr.min_index;
+   unsigned int max_index = brw->curr.max_index;
    GLuint i;
-   const unsigned char *ptr = NULL;
-   GLuint interleave = 0;
-   unsigned int min_index = brw->vb.min_index;
-   unsigned int max_index = brw->vb.max_index;
-
-   struct brw_vertex_element *upload[VERT_ATTRIB_MAX];
-   GLuint nr_uploads = 0;
-
-   /* First build an array of pointers to ve's in vb.inputs_read
-    */
-   if (0)
-      _mesa_printf("%s %d..%d\n", __FUNCTION__, min_index, max_index);
-
+   int ret;
 
+   if (BRW_DEBUG & DEBUG_VERTS)
+      debug_printf("%s %d..%d\n", __FUNCTION__, min_index, max_index);
 
-   for (i = 0; i < brw->vb.num_vertex_buffer; i++) {
-      struct brw_vertex_buffer *vb = brw->vb.vertex_buffer[i];
-      unsigned size = (vb->stride == 0 ? 
-		       vb->size :
-		       vb->stride * (max_index + 1 - min_index));
 
-
-      if (brw_is_user_buffer(vb)) {
-	 u_upload_buffer( brw->upload_vertex, 
-			  min_index * vb->stride,
-			  size,
-			  &offset,
-			  &buffer );
+   for (i = 0; i < brw->curr.num_vertex_buffers; i++) {
+      struct pipe_vertex_buffer *vb = &brw->curr.vertex_buffer[i];
+      struct brw_winsys_buffer *bo;
+      struct pipe_buffer *upload_buf = NULL;	/* stays NULL unless a user buffer is uploaded; unref'd unconditionally below */
+      unsigned offset;
+      
+      if (BRW_DEBUG & DEBUG_VERTS)
+	 debug_printf("%s vb[%d] user:%d offset:0x%x sz:0x%x stride:0x%x\n",
+		      __FUNCTION__, i,
+		      brw_buffer_is_user_buffer(vb->buffer),
+		      vb->buffer_offset,
+		      vb->buffer->size,
+		      vb->stride);
+
+      if (brw_buffer_is_user_buffer(vb->buffer)) {
+
+	 /* XXX: simplify this.  Stop the state trackers from generating
+	  * zero-stride buffers & have them use additional constants (or
+	  * add support for >1 constant buffer) instead.
+	  */
+	 unsigned size = (vb->stride == 0 ? 
+			  vb->buffer->size - vb->buffer_offset :
+			  MAX2(vb->buffer->size - vb->buffer_offset,
+			       vb->stride * (max_index + 1 - min_index)));
+
+	 ret = u_upload_buffer( brw->vb.upload_vertex, 
+				vb->buffer_offset + min_index * vb->stride,
+				size,
+				vb->buffer,
+				&offset,
+				&upload_buf );
+	 if (ret)
+	    return ret;
+
+	 bo = brw_buffer(upload_buf)->bo;
+	 
+	 assert(offset + size <= bo->size);
       }
       else
       {
-	 offset = 0;
-	 buffer = vb->buffer;
+	 offset = vb->buffer_offset;
+	 bo = brw_buffer(vb->buffer)->bo;
       }
+
+      assert(offset < bo->size);
       
       /* Set up post-upload info about this vertex buffer:
        */
-      input->offset = (unsigned long)offset;
-      input->stride = vb->stride;
-      input->count = count;
-      brw->sws->bo_unreference(input->bo);
-      input->bo = intel_bufferobj_buffer(intel, intel_buffer,
-					 INTEL_READ);
-      brw->sws->bo_reference(input->bo);
-
-      assert(input->offset < input->bo->size);
-      assert(input->offset + size <= input->bo->size);
+      brw->vb.vb[i].offset = offset;
+      brw->vb.vb[i].stride = vb->stride;
+      brw->vb.vb[i].vertex_count = (vb->stride == 0 ?
+				    1 :
+				    (bo->size - offset) / vb->stride);
+      brw->sws->bo_unreference(brw->vb.vb[i].bo);
+      brw->vb.vb[i].bo = bo;
+      brw->sws->bo_reference(brw->vb.vb[i].bo);
+
+      /* Don't need to retain this reference.  We have a reference on
+       * the underlying winsys buffer:
+       */
+      pipe_buffer_reference( &upload_buf, NULL );
    }
 
+   brw->vb.nr_vb = i;
    brw_prepare_query_begin(brw);
 
-   for (i = 0; i < brw->vb.nr_enabled; i++) {
-      struct brw_vertex_element *input = brw->vb.enabled[i];
+   for (i = 0; i < brw->vb.nr_vb; i++) {
+      brw_add_validated_bo(brw, brw->vb.vb[i].bo);
+   }
+
+   return 0;
+}
+
+static int brw_emit_vertex_buffers( struct brw_context *brw )
+{
+   int i;
+
+   /* If the VS doesn't read any inputs (calculating vertex position from
+    * a state variable for some reason, for example), just bail.
+    *
+    * The stale VB state stays in place, but they don't do anything unless
+    * a VE loads from them.
+    */
+   if (brw->vb.nr_vb == 0) {
+      if (BRW_DEBUG & DEBUG_VERTS)
+	 debug_printf("%s: no active vertex buffers\n", __FUNCTION__);
 
-      brw_add_validated_bo(brw, input->bo);
+      return 0;
+   }
+
+   /* Emit VB state packets.
+    */
+   BEGIN_BATCH(1 + brw->vb.nr_vb * 4, IGNORE_CLIPRECTS);
+   OUT_BATCH((CMD_VERTEX_BUFFER << 16) |
+	     ((1 + brw->vb.nr_vb * 4) - 2));
+
+   for (i = 0; i < brw->vb.nr_vb; i++) {
+      OUT_BATCH((i << BRW_VB0_INDEX_SHIFT) |
+		BRW_VB0_ACCESS_VERTEXDATA |
+		(brw->vb.vb[i].stride << BRW_VB0_PITCH_SHIFT));
+      OUT_RELOC(brw->vb.vb[i].bo,
+		I915_GEM_DOMAIN_VERTEX, 0,
+		brw->vb.vb[i].offset);
+      if (BRW_IS_IGDNG(brw)) {
+	 OUT_RELOC(brw->vb.vb[i].bo,
+		   I915_GEM_DOMAIN_VERTEX, 0,
+		   brw->vb.vb[i].bo->size - 1);
+      } else
+	 OUT_BATCH(brw->vb.vb[i].stride ? brw->vb.vb[i].vertex_count : 0);
+      OUT_BATCH(0); /* Instance data step rate */
    }
+   ADVANCE_BATCH();
+   return 0;
 }
 
-static void brw_emit_vertices(struct brw_context *brw)
+
+
+
+static int brw_emit_vertex_elements(struct brw_context *brw)
 {
    GLuint i;
 
@@ -262,7 +330,7 @@ static void brw_emit_vertices(struct brw_context *brw)
     * The stale VB state stays in place, but they don't do anything unless
     * a VE loads from them.
     */
-   if (brw->vb.nr_enabled == 0) {
+   if (brw->vb.nr_ve == 0) {
       BEGIN_BATCH(3, IGNORE_CLIPRECTS);
       OUT_BATCH((CMD_VERTEX_ELEMENT << 16) | 1);
       OUT_BATCH((0 << BRW_VE0_INDEX_SHIFT) |
@@ -274,59 +342,23 @@ static void brw_emit_vertices(struct brw_context *brw)
 		(BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
 		(BRW_VE1_COMPONENT_STORE_1_FLT << BRW_VE1_COMPONENT_3_SHIFT));
       ADVANCE_BATCH();
-      return;
+      return 0;
    }
 
-   /* Now emit VB and VEP state packets.
+   /* Now emit vertex element (VEP) state packets.
     *
-    * This still defines a hardware VB for each input, even if they
-    * are interleaved or from the same VBO.  TBD if this makes a
-    * performance difference.
     */
-   BEGIN_BATCH(1 + brw->vb.nr_enabled * 4, IGNORE_CLIPRECTS);
-   OUT_BATCH((CMD_VERTEX_BUFFER << 16) |
-	     ((1 + brw->vb.nr_enabled * 4) - 2));
-
-   for (i = 0; i < brw->vb.nr_enabled; i++) {
-      struct brw_vertex_element *input = brw->vb.enabled[i];
-
-      OUT_BATCH((i << BRW_VB0_INDEX_SHIFT) |
-		BRW_VB0_ACCESS_VERTEXDATA |
-		(input->stride << BRW_VB0_PITCH_SHIFT));
-      OUT_RELOC(input->bo,
-		I915_GEM_DOMAIN_VERTEX, 0,
-		input->offset);
-      if (BRW_IS_IGDNG(brw)) {
-          if (input->stride) {
-              OUT_RELOC(input->bo,
-                        I915_GEM_DOMAIN_VERTEX, 0,
-                        input->offset + input->stride * input->count - 1);
-          } else {
-              assert(input->count == 1);
-              OUT_RELOC(input->bo,
-                        I915_GEM_DOMAIN_VERTEX, 0,
-                        input->offset + input->element_size - 1);
-          }
-      } else
-          OUT_BATCH(input->stride ? input->count : 0);
-      OUT_BATCH(0); /* Instance data step rate */
-   }
-   ADVANCE_BATCH();
-
-   BEGIN_BATCH(1 + brw->vb.nr_enabled * 2, IGNORE_CLIPRECTS);
-   OUT_BATCH((CMD_VERTEX_ELEMENT << 16) | ((1 + brw->vb.nr_enabled * 2) - 2));
-   for (i = 0; i < brw->vb.nr_enabled; i++) {
-      struct brw_vertex_element *input = brw->vb.enabled[i];
-      uint32_t format = get_surface_type(input->glarray->Type,
-					 input->glarray->Size,
-					 input->glarray->Format,
-					 input->glarray->Normalized);
+   BEGIN_BATCH(1 + brw->vb.nr_ve * 2, IGNORE_CLIPRECTS);
+   OUT_BATCH((CMD_VERTEX_ELEMENT << 16) | ((1 + brw->vb.nr_ve * 2) - 2));
+   for (i = 0; i < brw->vb.nr_ve; i++) {
+      const struct pipe_vertex_element *input = &brw->curr.vertex_element[i];
+      uint32_t format = brw_translate_surface_format( input->src_format );
       uint32_t comp0 = BRW_VE1_COMPONENT_STORE_SRC;
       uint32_t comp1 = BRW_VE1_COMPONENT_STORE_SRC;
       uint32_t comp2 = BRW_VE1_COMPONENT_STORE_SRC;
       uint32_t comp3 = BRW_VE1_COMPONENT_STORE_SRC;
 
-      switch (input->glarray->Size) {
+      switch (input->nr_components) {
       case 0: comp0 = BRW_VE1_COMPONENT_STORE_0;
       case 1: comp1 = BRW_VE1_COMPONENT_STORE_0;
       case 2: comp2 = BRW_VE1_COMPONENT_STORE_0;
@@ -352,11 +384,29 @@ static void brw_emit_vertices(struct brw_context *brw)
                     ((i * 4) << BRW_VE1_DST_OFFSET_SHIFT));
    }
    ADVANCE_BATCH();
+   return 0;
+}
+
+
+static int brw_emit_vertices( struct brw_context *brw )
+{
+   int ret;
+
+   ret = brw_emit_vertex_buffers( brw );
+   if (ret)
+      return ret;
+
+   ret = brw_emit_vertex_elements( brw );
+   if (ret)
+      return ret;
+   
+   return 0;
 }
 
+
 const struct brw_tracked_state brw_vertices = {
    .dirty = {
-      .mesa = 0,
+      .mesa = PIPE_NEW_INDEX_RANGE,
       .brw = BRW_NEW_BATCH | BRW_NEW_VERTICES,
       .cache = 0,
    },
@@ -364,104 +414,106 @@ const struct brw_tracked_state brw_vertices = {
    .emit = brw_emit_vertices,
 };
 
-static void brw_prepare_indices(struct brw_context *brw)
+
+static int brw_prepare_indices(struct brw_context *brw)
 {
-   const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
-   GLuint ib_size;
+   struct pipe_buffer *index_buffer = brw->curr.index_buffer;
    struct brw_winsys_buffer *bo = NULL;
-   struct gl_buffer_object *bufferobj;
    GLuint offset;
-   GLuint ib_type_size;
+   GLuint index_size;
+   GLuint ib_size;
+   int ret;
 
    if (index_buffer == NULL)
-      return;
+      return 0;
 
-   ib_type_size = get_size(index_buffer->type);
-   ib_size = ib_type_size * index_buffer->count;
-   bufferobj = index_buffer->obj;;
+   if (BRW_DEBUG & DEBUG_VERTS)
+      debug_printf("%s: index_size:%d index_buffer->size:%d\n",
+		   __FUNCTION__,
+		   brw->curr.index_size,
+		   brw->curr.index_buffer->size);
 
-   /* Turn into a proper VBO:
-    */
-   if (!_mesa_is_bufferobj(bufferobj)) {
-      brw->ib.start_vertex_offset = 0;
+   ib_size = index_buffer->size;
+   index_size = brw->curr.index_size;
 
-      /* Get new bufferobj, offset:
-       */
-      get_space(brw, ib_size, &bo, &offset);
-
-      /* Straight upload
+   /* Turn userbuffer into a proper hardware buffer?
+    */
+   if (brw_buffer_is_user_buffer(index_buffer)) {
+      struct pipe_buffer *upload_buf;
+
+      ret = u_upload_buffer( brw->vb.upload_index,
+			     0,
+			     ib_size,
+			     index_buffer,
+			     &offset,
+			     &upload_buf );
+      if (ret)
+	 return ret;
+
+      bo = brw_buffer(upload_buf)->bo;
+      brw->sws->bo_reference(bo);
+      pipe_buffer_reference( &upload_buf, NULL );
+
+      /* XXX: annotate the userbuffer with the upload information so
+       * that successive calls don't get re-uploaded.
        */
-      brw_bo_subdata(bo, offset, ib_size, index_buffer->ptr);
-
-   } else {
-      offset = (GLuint) (unsigned long) index_buffer->ptr;
-      brw->ib.start_vertex_offset = 0;
+   }
+   else {
+      bo = brw_buffer(index_buffer)->bo;
+      brw->sws->bo_reference(bo);
+      
+      ib_size = bo->size;
+      offset = 0;
+   }
 
-      /* If the index buffer isn't aligned to its element size, we have to
-       * rebase it into a temporary.
-       */
-       if ((get_size(index_buffer->type) - 1) & offset) {
-           GLubyte *map = ctx->Driver.MapBuffer(ctx,
-                                                GL_ELEMENT_ARRAY_BUFFER_ARB,
-                                                GL_DYNAMIC_DRAW_ARB,
-                                                bufferobj);
-           map += offset;
-
-	   get_space(brw, ib_size, &bo, &offset);
-
-	   dri_bo_subdata(bo, offset, ib_size, map);
-
-           ctx->Driver.UnmapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER_ARB, bufferobj);
-       } else {
-	  bo = intel_bufferobj_buffer(intel, intel_buffer_object(bufferobj),
-				      INTEL_READ);
-	  brw->sws->bo_reference(bo);
-
-	  /* Use CMD_3D_PRIM's start_vertex_offset to avoid re-uploading
-	   * the index buffer state when we're just moving the start index
-	   * of our drawing.
-	   */
-	  brw->ib.start_vertex_offset = offset / ib_type_size;
-	  offset = 0;
-	  ib_size = bo->size;
-       }
+   /* Use CMD_3D_PRIM's start_vertex_offset to avoid re-uploading the
+    * index buffer state when we're just moving the start index of our
+    * drawing.
+    *
+    * In gallium this will happen in the case where successive draw
+    * calls are made with (distinct?) userbuffers, but the upload_mgr
+    * places the data into a single winsys buffer.
+    * 
+    * This statechange doesn't raise any state flags and is always
+    * just merged into the final draw packet:
+    */
+   if (1) {
+      assert((offset & (index_size - 1)) == 0);
+      brw->ib.start_vertex_offset = offset / index_size;
    }
 
+   /* These statechanges trigger a new CMD_INDEX_BUFFER packet:
+    */
    if (brw->ib.bo != bo ||
-       brw->ib.offset != offset ||
        brw->ib.size != ib_size)
    {
-      drm_intel_bo_unreference(brw->ib.bo);
+      brw->sws->bo_unreference(brw->ib.bo);
       brw->ib.bo = bo;
-      brw->ib.offset = offset;
       brw->ib.size = ib_size;
-
       brw->state.dirty.brw |= BRW_NEW_INDEX_BUFFER;
-   } else {
-      drm_intel_bo_unreference(bo);
+   }
+   else {
+      brw->sws->bo_unreference(bo);
    }
 
    brw_add_validated_bo(brw, brw->ib.bo);
+   return 0;
 }
 
 const struct brw_tracked_state brw_indices = {
    .dirty = {
-      .mesa = 0,
-      .brw = BRW_NEW_INDICES,
+      .mesa = PIPE_NEW_INDEX_BUFFER,
+      .brw = 0,
       .cache = 0,
    },
    .prepare = brw_prepare_indices,
 };
 
-static void brw_emit_index_buffer(struct brw_context *brw)
+static int brw_emit_index_buffer(struct brw_context *brw)
 {
-   const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
-
-   if (index_buffer == NULL)
-      return;
-
    /* Emit the indexbuffer packet:
     */
+   if (brw->ib.bo)
    {
       struct brw_indexbuffer ib;
 
@@ -469,7 +521,7 @@ static void brw_emit_index_buffer(struct brw_context *brw)
 
       ib.header.bits.opcode = CMD_INDEX_BUFFER;
       ib.header.bits.length = sizeof(ib)/4 - 2;
-      ib.header.bits.index_format = get_index_type(index_buffer->type);
+      ib.header.bits.index_format = get_index_type(brw->ib.size);
       ib.header.bits.cut_index_enable = 0;
 
       BEGIN_BATCH(4, IGNORE_CLIPRECTS);
@@ -483,6 +535,8 @@ static void brw_emit_index_buffer(struct brw_context *brw)
       OUT_BATCH( 0 );
       ADVANCE_BATCH();
    }
+
+   return 0;
 }
 
 const struct brw_tracked_state brw_index_buffer = {
diff --git a/src/gallium/drivers/i965/brw_eu.c b/src/gallium/drivers/i965/brw_eu.c
index 1df561386e..df49d4b72f 100644
--- a/src/gallium/drivers/i965/brw_eu.c
+++ b/src/gallium/drivers/i965/brw_eu.c
@@ -29,6 +29,7 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
   
+#include "util/u_memory.h"
 
 #include "brw_context.h"
 #include "brw_defines.h"
@@ -237,7 +238,7 @@ brw_resolve_cals(struct brw_compile *c)
         struct brw_glsl_call *call, *next;
         for (call = c->first_call; call; call = next) {
 	    next = call->next;
-	    _mesa_free(call);
+	    FREE(call);
 	}
 	c->first_call = NULL;
     }
@@ -247,7 +248,7 @@ brw_resolve_cals(struct brw_compile *c)
         struct brw_glsl_label *label, *next;
 	for (label = c->first_label; label; label = next) {
 	    next = label->next;
-	    _mesa_free(label);
+	    FREE(label);
 	}
 	c->first_label = NULL;
     }
diff --git a/src/gallium/drivers/i965/brw_eu_debug.c b/src/gallium/drivers/i965/brw_eu_debug.c
index ad7ec36e86..5989f5a04e 100644
--- a/src/gallium/drivers/i965/brw_eu_debug.c
+++ b/src/gallium/drivers/i965/brw_eu_debug.c
@@ -28,7 +28,8 @@
   * Authors:
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
-    
+ 
+#include "util/u_debug.h"
 
 #include "brw_eu.h"
 
@@ -52,7 +53,7 @@ void brw_print_reg( struct brw_reg hwreg )
       "f"
    };
 
-   _mesa_printf("%s%s", 
+   debug_printf("%s%s", 
 		hwreg.abs ? "abs/" : "",
 		hwreg.negate ? "-" : "");
      
@@ -64,7 +65,7 @@ void brw_print_reg( struct brw_reg hwreg )
        hwreg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
        hwreg.type == BRW_REGISTER_TYPE_F) {
       /* vector register */
-      _mesa_printf("vec%d", hwreg.nr);
+      debug_printf("vec%d", hwreg.nr);
    }
    else if (hwreg.file == BRW_GENERAL_REGISTER_FILE &&
 	    hwreg.vstride == BRW_VERTICAL_STRIDE_0 &&
@@ -72,13 +73,13 @@ void brw_print_reg( struct brw_reg hwreg )
 	    hwreg.hstride == BRW_HORIZONTAL_STRIDE_0 &&
 	    hwreg.type == BRW_REGISTER_TYPE_F) {      
       /* "scalar" register */
-      _mesa_printf("scl%d.%d", hwreg.nr, hwreg.subnr / 4);
+      debug_printf("scl%d.%d", hwreg.nr, hwreg.subnr / 4);
    }
    else if (hwreg.file == BRW_IMMEDIATE_VALUE) {
-      _mesa_printf("imm %f", hwreg.dw1.f);
+      debug_printf("imm %f", hwreg.dw1.f);
    }
    else {
-      _mesa_printf("%s%d.%d<%d;%d,%d>:%s", 
+      debug_printf("%s%d.%d<%d;%d,%d>:%s", 
 		   file[hwreg.file],
 		   hwreg.nr,
 		   hwreg.subnr / type_sz(hwreg.type),
diff --git a/src/gallium/drivers/i965/brw_misc_state.c b/src/gallium/drivers/i965/brw_misc_state.c
index 0f2612c181..98fec85c1d 100644
--- a/src/gallium/drivers/i965/brw_misc_state.c
+++ b/src/gallium/drivers/i965/brw_misc_state.c
@@ -315,24 +315,20 @@ const struct brw_tracked_state brw_polygon_stipple = {
 
 static void upload_polygon_stipple_offset(struct brw_context *brw)
 {
-   __DRIdrawablePrivate *dPriv = brw->intel.driDrawable;
    struct brw_polygon_stipple_offset bpso;
 
    memset(&bpso, 0, sizeof(bpso));
    bpso.header.opcode = CMD_POLY_STIPPLE_OFFSET;
    bpso.header.length = sizeof(bpso)/4-2;
 
-   /* If we're drawing to a system window (ctx->DrawBuffer->Name == 0),
-    * we have to invert the Y axis in order to match the OpenGL
-    * pixel coordinate system, and our offset must be matched
-    * to the window position.  If we're drawing to a FBO
-    * (ctx->DrawBuffer->Name != 0), then our native pixel coordinate
-    * system works just fine, and there's no window system to
-    * worry about.
+   /* Never need to offset stipple coordinates.
+    *
+    * XXX: is it ever necessary to invert Y values?
     */
-   if (brw->intel.ctx.DrawBuffer->Name == 0) {
-      bpso.bits0.x_offset = (32 - (dPriv->x & 31)) & 31;
-      bpso.bits0.y_offset = (32 - ((dPriv->y + dPriv->h) & 31)) & 31;
+   if (0) {
+      int x = 0, y = 0, h = 0;
+      bpso.bits0.x_offset = (32 - (x & 31)) & 31;
+      bpso.bits0.y_offset = (32 - ((y + h) & 31)) & 31;
    }
    else {
       bpso.bits0.y_offset = 0;
diff --git a/src/gallium/drivers/i965/brw_pipe_flush.c b/src/gallium/drivers/i965/brw_pipe_flush.c
index e85a1a9c1b..65e7151517 100644
--- a/src/gallium/drivers/i965/brw_pipe_flush.c
+++ b/src/gallium/drivers/i965/brw_pipe_flush.c
@@ -53,6 +53,9 @@ static void brw_note_fence( struct brw_context *brw, GLuint fence )
 static GLuint brw_flush_cmd( void )
 {
    struct brw_mi_flush flush;
+
+   return ;
+
    flush.opcode = CMD_MI_FLUSH;
    flush.pad = 0;
    flush.flags = BRW_FLUSH_STATE_CACHE;
diff --git a/src/gallium/drivers/i965/brw_pipe_shader.c b/src/gallium/drivers/i965/brw_pipe_shader.c
index fbb772d18c..8b61da763c 100644
--- a/src/gallium/drivers/i965/brw_pipe_shader.c
+++ b/src/gallium/drivers/i965/brw_pipe_shader.c
@@ -33,6 +33,25 @@
 #include "brw_util.h"
 #include "brw_wm.h"
 
+
+/**
+ * Determine if the given fragment shader uses GLSL-style features:
+ * returns true when any of ARL, IF/ENDIF, CAL/RET or BGNLOOP/BRK has
+ * a non-zero count in fp->info.insn_count[].
+ */
+GLboolean brw_wm_is_glsl(const struct brw_fragment_shader *fp)
+{
+    return (fp->info.insn_count[TGSI_OPCODE_ARL] > 0 ||
+	    fp->info.insn_count[TGSI_OPCODE_IF] > 0 ||
+	    fp->info.insn_count[TGSI_OPCODE_ENDIF] > 0 || /* redundant - IF */
+	    fp->info.insn_count[TGSI_OPCODE_CAL] > 0 ||
+	    fp->info.insn_count[TGSI_OPCODE_BRK] > 0 ||   /* redundant - BGNLOOP */
+	    fp->info.insn_count[TGSI_OPCODE_RET] > 0 ||	  /* redundant - CAL */
+	    fp->info.insn_count[TGSI_OPCODE_BGNLOOP] > 0);
+}
+
+
+
 static void brwBindProgram( struct brw_context *brw,
 			    GLenum target, 
 			    struct gl_program *prog )
diff --git a/src/gallium/drivers/i965/brw_pipe_vertex.c b/src/gallium/drivers/i965/brw_pipe_vertex.c
index b0928adbe4..d1d0d7cd43 100644
--- a/src/gallium/drivers/i965/brw_pipe_vertex.c
+++ b/src/gallium/drivers/i965/brw_pipe_vertex.c
@@ -1,26 +1,11 @@
 
-static void brw_merge_inputs( struct brw_context *brw,
-		       const struct gl_client_array *arrays[])
-{
-   struct brw_vertex_info old = brw->vb.info;
-   GLuint i;
-
-   for (i = 0; i < VERT_ATTRIB_MAX; i++)
-      brw->sws->bo_unreference(brw->vb.inputs[i].bo);
 
-   memset(&brw->vb.inputs, 0, sizeof(brw->vb.inputs));
-   memset(&brw->vb.info, 0, sizeof(brw->vb.info));
 
+void brw_pipe_vertex_cleanup( struct brw_context *brw )
+{
+   GLuint i;   /* index over the vertex inputs whose bo references are dropped below */
    for (i = 0; i < VERT_ATTRIB_MAX; i++) {
-      brw->vb.inputs[i].glarray = arrays[i];
-      brw->vb.inputs[i].attrib = (gl_vert_attrib) i;
-
-      if (arrays[i]->StrideB != 0)
-	 brw->vb.info.sizes[i/16] |= (brw->vb.inputs[i].glarray->Size - 1) <<
-	    ((i%16) * 2);
+      brw->sws->bo_unreference(brw->vb.inputs[i].bo);
+      brw->vb.inputs[i].bo = NULL;
    }
-
-   /* Raise statechanges if input sizes have changed. */
-   if (memcmp(brw->vb.info.sizes, old.sizes, sizeof(old.sizes)) != 0)
-      brw->state.dirty.brw |= BRW_NEW_INPUT_DIMENSIONS;
 }
diff --git a/src/gallium/drivers/i965/brw_screen.h b/src/gallium/drivers/i965/brw_screen.h
index 79d595d0ad..b0be0e1f8a 100644
--- a/src/gallium/drivers/i965/brw_screen.h
+++ b/src/gallium/drivers/i965/brw_screen.h
@@ -56,6 +56,14 @@ struct brw_transfer
    unsigned offset;
 };
 
+struct brw_buffer
+{
+   struct pipe_buffer base;        /* first member, so brw_buffer() can cast a pipe_buffer pointer */
+   struct brw_winsys_buffer *bo;   /* winsys buffer object; presumably the backing storage -- TODO confirm */
+   void *ptr;                      /* NOTE(review): looks like a CPU pointer to the data -- confirm */
+   boolean is_user_buffer;         /* read back by brw_buffer_is_user_buffer() */
+};
+
 
 /*
  * Cast wrappers
@@ -72,5 +80,19 @@ brw_transfer(struct pipe_transfer *transfer)
    return (struct brw_transfer *)transfer;
 }
 
+static INLINE struct brw_buffer *
+brw_buffer(struct pipe_buffer *buffer)
+{
+   return (struct brw_buffer *)buffer;   /* downcast: pipe_buffer is the first member (base) of brw_buffer */
+}
+
+
+/* Pipe buffer helpers: return the is_user_buffer flag of buf, which is
+ * treated as the base of a struct brw_buffer. */
+static INLINE boolean
+brw_buffer_is_user_buffer( const struct pipe_buffer *buf )
+{
+   return ((const struct brw_buffer *)buf)->is_user_buffer;
+}
 
 #endif /* BRW_SCREEN_H */
diff --git a/src/gallium/drivers/i965/brw_sf.c b/src/gallium/drivers/i965/brw_sf.c
index 53e8f09e37..e2db2e76e6 100644
--- a/src/gallium/drivers/i965/brw_sf.c
+++ b/src/gallium/drivers/i965/brw_sf.c
@@ -134,7 +134,7 @@ static void upload_sf_prog(struct brw_context *brw)
    key.attrs = brw->vs.prog_data->outputs_written; 
 
    /* BRW_NEW_REDUCED_PRIMITIVE */
-   switch (brw->intel.reduced_primitive) {
+   switch (brw->reduced_primitive) {
    case GL_TRIANGLES: 
       /* NOTE: We just use the edgeflag attribute as an indicator that
        * unfilled triangles are active.  We don't actually do the
diff --git a/src/gallium/drivers/i965/brw_sf_state.c b/src/gallium/drivers/i965/brw_sf_state.c
index 0e406f12e1..648a16a038 100644
--- a/src/gallium/drivers/i965/brw_sf_state.c
+++ b/src/gallium/drivers/i965/brw_sf_state.c
@@ -40,19 +40,12 @@ static void upload_sf_vp(struct brw_context *brw)
    const GLfloat depth_scale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
    struct brw_sf_viewport sfv;
    GLfloat y_scale, y_bias;
-   const GLboolean render_to_fbo = (ctx->DrawBuffer->Name != 0);
    const GLfloat *v = ctx->Viewport._WindowMap.m;
 
    memset(&sfv, 0, sizeof(sfv));
 
-   if (render_to_fbo) {
-      y_scale = 1.0;
-      y_bias = 0;
-   }
-   else {
-      y_scale = -1.0;
-      y_bias = ctx->DrawBuffer->Height;
-   }
+   y_scale = 1.0;
+   y_bias = 0;
 
    /* _NEW_VIEWPORT */
 
@@ -73,20 +66,11 @@ static void upload_sf_vp(struct brw_context *brw)
     * Note that the hardware's coordinates are inclusive, while Mesa's min is
     * inclusive but max is exclusive.
     */
-   if (render_to_fbo) {
-      /* texmemory: Y=0=bottom */
-      sfv.scissor.xmin = ctx->DrawBuffer->_Xmin;
-      sfv.scissor.xmax = ctx->DrawBuffer->_Xmax - 1;
-      sfv.scissor.ymin = ctx->DrawBuffer->_Ymin;
-      sfv.scissor.ymax = ctx->DrawBuffer->_Ymax - 1;
-   }
-   else {
-      /* memory: Y=0=top */
-      sfv.scissor.xmin = ctx->DrawBuffer->_Xmin;
-      sfv.scissor.xmax = ctx->DrawBuffer->_Xmax - 1;
-      sfv.scissor.ymin = ctx->DrawBuffer->Height - ctx->DrawBuffer->_Ymax;
-      sfv.scissor.ymax = ctx->DrawBuffer->Height - ctx->DrawBuffer->_Ymin - 1;
-   }
+   /* Y=0=bottom */
+   sfv.scissor.xmin = ctx->DrawBuffer->_Xmin;
+   sfv.scissor.xmax = ctx->DrawBuffer->_Xmax - 1;
+   sfv.scissor.ymin = ctx->DrawBuffer->_Ymin;
+   sfv.scissor.ymax = ctx->DrawBuffer->_Ymax - 1;
 
    brw->sws->bo_unreference(brw->sf.vp_bo);
    brw->sf.vp_bo = brw_cache_data( &brw->cache, BRW_SF_VP, &sfv, NULL, 0 );
@@ -151,7 +135,7 @@ sf_unit_populate_key(struct brw_context *brw, struct brw_sf_unit_key *key)
    /* _NEW_LIGHT */
    key->provoking_vertex = ctx->Light.ProvokingVertex;
 
-   key->render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0;
+   key->render_to_fbo = 1;
 }
 
 static struct brw_winsys_buffer *
@@ -211,11 +195,6 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
    else
       sf.sf5.front_winding = BRW_FRONTWINDING_CW;
 
-   /* The viewport is inverted for rendering to a FBO, and that inverts
-    * polygon front/back orientation.
-    */
-   sf.sf5.front_winding ^= key->render_to_fbo;
-
    switch (key->cull_face) {
    case GL_FRONT:
       sf.sf6.cull_mode = BRW_CULLMODE_FRONT;
@@ -245,7 +224,7 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
        sf.sf6.line_width = 0;
 
    /* _NEW_BUFFERS */
-   key->render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0;
+   key->render_to_fbo = 1;
    if (!key->render_to_fbo) {
       /* Rendering to an OpenGL window */
       sf.sf6.point_rast_rule = BRW_RASTRULE_UPPER_RIGHT;
diff --git a/src/gallium/drivers/i965/brw_state.h b/src/gallium/drivers/i965/brw_state.h
index 02657eaba7..9bf34c3fe4 100644
--- a/src/gallium/drivers/i965/brw_state.h
+++ b/src/gallium/drivers/i965/brw_state.h
@@ -109,8 +109,8 @@ struct brw_surface_key {
 /***********************************************************************
  * brw_state.c
  */
-void brw_validate_state(struct brw_context *brw);
-void brw_upload_state(struct brw_context *brw);
+int brw_validate_state(struct brw_context *brw);
+int brw_upload_state(struct brw_context *brw);
 void brw_init_state(struct brw_context *brw);
 void brw_destroy_state(struct brw_context *brw);
 
@@ -157,7 +157,7 @@ void brw_state_cache_bo_delete(struct brw_cache *cache, struct brw_winsys_buffer
 /***********************************************************************
  * brw_state_batch.c
  */
-#define BRW_BATCH_STRUCT(brw, s) brw_batchbuffer_data( brw->intel.batch, (s), sizeof(*(s)), IGNORE_CLIPRECTS)
+#define BRW_BATCH_STRUCT(brw, s) brw_batchbuffer_data( brw->batch, (s), sizeof(*(s)), IGNORE_CLIPRECTS)
 #define BRW_CACHED_BATCH_STRUCT(brw, s) brw_cached_batch_struct( brw, (s), sizeof(*(s)) )
 
 GLboolean brw_cached_batch_struct( struct brw_context *brw,
diff --git a/src/gallium/drivers/i965/brw_state_batch.c b/src/gallium/drivers/i965/brw_state_batch.c
index b285837070..324fce5163 100644
--- a/src/gallium/drivers/i965/brw_state_batch.c
+++ b/src/gallium/drivers/i965/brw_state_batch.c
@@ -47,7 +47,7 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
    struct header *newheader = (struct header *)data;
 
    if (brw->emit_state_always) {
-      brw_batchbuffer_data(brw->intel.batch, data, sz, IGNORE_CLIPRECTS);
+      brw_batchbuffer_data(brw->batch, data, sz, IGNORE_CLIPRECTS);
       return GL_TRUE;
    }
 
@@ -74,7 +74,7 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
 
  emit:
    memcpy(item->header, newheader, sz);
-   brw_batchbuffer_data(brw->intel.batch, data, sz, IGNORE_CLIPRECTS);
+   brw_batchbuffer_data(brw->batch, data, sz, IGNORE_CLIPRECTS);
    return GL_TRUE;
 }
 
diff --git a/src/gallium/drivers/i965/brw_swtnl.c b/src/gallium/drivers/i965/brw_swtnl.c
index 83f138f67a..d2df8af9f4 100644
--- a/src/gallium/drivers/i965/brw_swtnl.c
+++ b/src/gallium/drivers/i965/brw_swtnl.c
@@ -12,13 +12,13 @@ static GLboolean check_fallbacks( struct brw_context *brw,
     * use fallbacks.  If we're forcing fallbacks, always
     * use fallfacks.
     */
-   if (brw->intel.conformance_mode == 0)
+   if (brw->flags.no_swtnl)
       return GL_FALSE;
 
-   if (brw->intel.conformance_mode == 2)
+   if (brw->flags.force_swtnl)
       return GL_TRUE;
 
-   if (ctx->Polygon.SmoothFlag) {
+   if (brw->curr.rast->tmpl.smooth_polys) {
       for (i = 0; i < nr_prims; i++)
 	 if (reduced_prim[prim[i].mode] == GL_TRIANGLES) 
 	    return GL_TRUE;
diff --git a/src/gallium/drivers/i965/brw_winsys.h b/src/gallium/drivers/i965/brw_winsys.h
index 82cd8007ac..51e23b9640 100644
--- a/src/gallium/drivers/i965/brw_winsys.h
+++ b/src/gallium/drivers/i965/brw_winsys.h
@@ -161,6 +161,13 @@ struct brw_winsys_screen {
 		      size_t size,
 		      const void *data);
 
+   /* XXX: couldn't this be handled by returning true/false on
+    * bo_emit_reloc?
+    */
+   boolean (*check_aperture_space)( struct brw_winsys_screen *iws,
+				    struct brw_winsys_buffer **buffers,
+				    unsigned count );
+
    /**
     * Map a buffer.
     */
diff --git a/src/gallium/drivers/i965/brw_wm.c b/src/gallium/drivers/i965/brw_wm.c
index 284cf42f8b..4948ea0dff 100644
--- a/src/gallium/drivers/i965/brw_wm.c
+++ b/src/gallium/drivers/i965/brw_wm.c
@@ -158,7 +158,7 @@ static void do_wm_prog( struct brw_context *brw,
    memcpy(&c->key, key, sizeof(*key));
 
    c->fp = fp;
-   c->env_param = brw->intel.ctx.FragmentProgram.Parameters;
+   c->env_param = NULL; /*brw->intel.ctx.FragmentProgram.Parameters;*/
 
    brw_init_compile(brw, &c->func);
 
diff --git a/src/gallium/drivers/i965/brw_wm.h b/src/gallium/drivers/i965/brw_wm.h
index 18775830f9..e06de95a8a 100644
--- a/src/gallium/drivers/i965/brw_wm.h
+++ b/src/gallium/drivers/i965/brw_wm.h
@@ -131,9 +131,9 @@ struct brw_wm_ref {
    GLuint insn:24;
 };
 
-struct brw_wm_constref {
+struct brw_wm_imm_ref {
    const struct brw_wm_ref *ref;
-   GLfloat constval;
+   GLfloat imm1f;
 };
 
 
@@ -232,8 +232,8 @@ struct brw_wm_compile {
    struct brw_wm_instruction instruction[BRW_WM_MAX_INSN];
    GLuint nr_insns;
 
-   struct brw_wm_constref constref[BRW_WM_MAX_CONST];
-   GLuint nr_constrefs;
+   struct brw_wm_imm_ref imm_ref[BRW_WM_MAX_CONST];
+   GLuint nr_imm_refs;
 
    struct brw_wm_grf pass2_grf[BRW_WM_MAX_GRF/2];
 
diff --git a/src/gallium/drivers/i965/brw_wm_glsl.c b/src/gallium/drivers/i965/brw_wm_glsl.c
index c4f0711793..a8de5fdd0b 100644
--- a/src/gallium/drivers/i965/brw_wm_glsl.c
+++ b/src/gallium/drivers/i965/brw_wm_glsl.c
@@ -7,34 +7,6 @@ static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
                                   const struct prog_instruction *inst,
                                   GLuint component);
 
-/**
- * Determine if the given fragment program uses GLSL features such
- * as flow conditionals, loops, subroutines.
- * Some GLSL shaders may use these features, others might not.
- */
-GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
-{
-    int i;
-
-    for (i = 0; i < fp->Base.NumInstructions; i++) {
-	const struct prog_instruction *inst = &fp->Base.Instructions[i];
-	switch (inst->Opcode) {
-	    case OPCODE_ARL:
-	    case OPCODE_IF:
-	    case OPCODE_ENDIF:
-	    case OPCODE_CAL:
-	    case OPCODE_BRK:
-	    case OPCODE_RET:
-	    case OPCODE_BGNLOOP:
-		return GL_TRUE; 
-	    default:
-		break;
-	}
-    }
-    return GL_FALSE; 
-}
-
-
 
 static void
 reclaim_temps(struct brw_wm_compile *c);
diff --git a/src/gallium/drivers/i965/brw_wm_pass0.c b/src/gallium/drivers/i965/brw_wm_pass0.c
index de5f5fe821..31b0270e84 100644
--- a/src/gallium/drivers/i965/brw_wm_pass0.c
+++ b/src/gallium/drivers/i965/brw_wm_pass0.c
@@ -124,33 +124,33 @@ static const struct brw_wm_ref *get_param_ref( struct brw_wm_compile *c,
 }
 
 
-/** Return a ref to a constant/literal value */
-static const struct brw_wm_ref *get_const_ref( struct brw_wm_compile *c,
-					       const GLfloat *constval )
+/** Return a ref to an immediate value, reusing a matching imm_ref entry; NULL (prog_data.error set) when full */
+static const struct brw_wm_ref *get_imm_ref( struct brw_wm_compile *c,
+					     const GLfloat *imm1f )
 {
    GLuint i;
 
    /* Search for an existing const value matching the request:
     */
-   for (i = 0; i < c->nr_constrefs; i++) {
-      if (c->constref[i].constval == *constval) 
-	 return c->constref[i].ref;
+   for (i = 0; i < c->nr_imm_refs; i++) {
+      if (c->imm_ref[i].imm1f == *imm1f) 
+	 return c->imm_ref[i].ref;
    }
 
    /* Else try to add a new one:
     */
-   if (c->nr_constrefs < BRW_WM_MAX_CONST) {
-      GLuint i = c->nr_constrefs++;
+   if (c->nr_imm_refs < BRW_WM_MAX_CONST) {
+      GLuint i = c->nr_imm_refs++;
 
-      /* A constant is a special type of parameter:
+      /* An immediate is a special type of parameter (imm_ref[] holds BRW_WM_MAX_CONST entries):
        */
-      c->constref[i].constval = *constval;
-      c->constref[i].ref = get_param_ref(c, constval);
+      c->imm_ref[i].imm1f = *imm1f;
+      c->imm_ref[i].ref = get_param_ref(c, imm1f);
 
-      return c->constref[i].ref;
+      return c->imm_ref[i].ref;
    }
    else {
-      _mesa_printf("%s: out of constrefs\n", __FUNCTION__);
+      _mesa_printf("%s: out of imm_refs\n", __FUNCTION__);
       c->prog_data.error = 1;
       return NULL;
    }
@@ -200,7 +200,7 @@ static const struct brw_wm_ref *pass0_get_reg( struct brw_wm_compile *c,
 	 case PROGRAM_CONSTANT:
 	    /* These are invarient:
 	     */
-	    ref = get_const_ref(c, &plist->ParameterValues[idx][component]);
+	    ref = get_imm_ref(c, &plist->ParameterValues[idx][component]);
 	    break;
 
 	 case PROGRAM_STATE_VAR:
@@ -266,9 +266,9 @@ static const struct brw_wm_ref *get_fp_src_reg_ref( struct brw_wm_compile *c,
    static const GLfloat const_one = 1.0;
 
    if (component == SWIZZLE_ZERO) 
-      src_ref = get_const_ref(c, &const_zero);
+      src_ref = get_imm_ref(c, &const_zero);
    else if (component == SWIZZLE_ONE) 
-      src_ref = get_const_ref(c, &const_one);
+      src_ref = get_imm_ref(c, &const_one);
    else 
       src_ref = pass0_get_reg(c, src.File, src.Index, component);
 
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index c76bff9181..ec9c859fcb 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -554,7 +554,8 @@ st_draw_vbo(GLcontext *ctx,
 
    /* Gallium probably doesn't want this in some cases. */
    if (!index_bounds_valid)
-      vbo_get_minmax_index(ctx, prims, ib, &min_index, &max_index);
+      if (!vbo_all_varyings_in_vbos(arrays))
+	 vbo_get_minmax_index(ctx, prims, ib, &min_index, &max_index);
 
    /* sanity check for pointer arithmetic below */
    assert(sizeof(arrays[0]->Ptr[0]) == 1);
-- 
cgit v1.2.3


From 562ca4eae257dd3b268e7f13487c8cd91f618eae Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Sun, 25 Oct 2009 01:20:56 +0100
Subject: i965g: more compiling wip

---
 src/gallium/drivers/i965/brw_context.h    |  15 +-
 src/gallium/drivers/i965/brw_curbe.c      |   3 +-
 src/gallium/drivers/i965/brw_gs.c         |  48 +++----
 src/gallium/drivers/i965/brw_gs.h         |   4 +-
 src/gallium/drivers/i965/brw_gs_state.c   |  21 +--
 src/gallium/drivers/i965/brw_misc_state.c | 222 ++++++++++++++----------------
 src/gallium/drivers/i965/brw_pipe_blend.c |  19 +++
 src/gallium/drivers/i965/brw_pipe_rast.c  |  20 +++
 src/gallium/drivers/i965/brw_screen.h     |   7 +
 src/gallium/drivers/i965/brw_sf.c         |   2 +-
 src/gallium/drivers/i965/brw_state.h      |   4 +-
 src/gallium/drivers/i965/brw_urb.c        |   3 +-
 src/gallium/drivers/i965/brw_vs.c         |   4 +-
 src/gallium/drivers/i965/brw_vs_emit.c    |  67 +++++----
 src/gallium/drivers/i965/brw_wm.c         |   2 +-
 src/gallium/drivers/i965/brw_wm.h         |   2 +-
 16 files changed, 243 insertions(+), 200 deletions(-)

(limited to 'src/gallium/drivers/i965/brw_wm.h')

diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h
index 7ead641811..2e17e150bb 100644
--- a/src/gallium/drivers/i965/brw_context.h
+++ b/src/gallium/drivers/i965/brw_context.h
@@ -184,6 +184,8 @@ struct brw_fragment_shader {
 #define PIPE_NEW_CLIP                   0x2
 #define PIPE_NEW_INDEX_BUFFER           0x2
 #define PIPE_NEW_INDEX_RANGE            0x2
+#define PIPE_NEW_BLEND_COLOR            0x2
+#define PIPE_NEW_POLYGON_STIPPLE        0x2
 
 
 #define BRW_NEW_URB_FENCE               0x1
@@ -202,7 +204,9 @@ struct brw_fragment_shader {
 #define BRW_NEW_VERTICES		0x8000
 /**
  * Used for any batch entry with a relocated pointer that will be used
- * by any 3D rendering.
+ * by any 3D rendering.  Need to re-emit these fresh in each
+ * batchbuffer as the referenced buffers may be relocated in the
+ * meantime.
  */
 #define BRW_NEW_BATCH			0x10000
 /** brw->depth_region updated */
@@ -271,7 +275,7 @@ struct brw_vs_prog_data {
    GLuint curb_read_length;
    GLuint urb_read_length;
    GLuint total_grf;
-   GLuint outputs_written;
+   GLuint nr_outputs_written;
    GLuint nr_params;       /**< number of float params/constants */
 
    GLuint inputs_read;
@@ -487,6 +491,9 @@ struct brw_context
       struct pipe_buffer *vertex_constants;
       struct pipe_buffer *fragment_constants;
 
+      struct brw_blend_constant_color bcc;
+      struct brw_polygon_stipple bps;
+
       /**
        * Index buffer for this draw_prims call.
        *
@@ -726,11 +733,11 @@ void brw_init_shader_funcs( struct brw_context *brw );
 
 /* brw_urb.c
  */
-void brw_upload_urb_fence(struct brw_context *brw);
+int brw_upload_urb_fence(struct brw_context *brw);
 
 /* brw_curbe.c
  */
-void brw_upload_cs_urb_state(struct brw_context *brw);
+int brw_upload_cs_urb_state(struct brw_context *brw);
 
 /* brw_disasm.c */
 int brw_disasm (FILE *file, struct brw_instruction *inst);
diff --git a/src/gallium/drivers/i965/brw_curbe.c b/src/gallium/drivers/i965/brw_curbe.c
index 278ffa4ca2..3dd08f6eeb 100644
--- a/src/gallium/drivers/i965/brw_curbe.c
+++ b/src/gallium/drivers/i965/brw_curbe.c
@@ -126,7 +126,7 @@ const struct brw_tracked_state brw_curbe_offsets = {
  * fixed-function hardware in a double-buffering scheme to avoid a
  * pipeline stall each time the contents of the curbe is changed.
  */
-void brw_upload_cs_urb_state(struct brw_context *brw)
+int brw_upload_cs_urb_state(struct brw_context *brw)
 {
    struct brw_cs_urb_state cs_urb;
    memset(&cs_urb, 0, sizeof(cs_urb));
@@ -144,6 +144,7 @@ void brw_upload_cs_urb_state(struct brw_context *brw)
 
    assert(brw->urb.nr_cs_entries);
    BRW_CACHED_BATCH_STRUCT(brw, &cs_urb);
+   return 0;
 }
 
 static GLfloat fixed_plane[6][4] = {
diff --git a/src/gallium/drivers/i965/brw_gs.c b/src/gallium/drivers/i965/brw_gs.c
index 692ce46679..3ecaa74e4f 100644
--- a/src/gallium/drivers/i965/brw_gs.c
+++ b/src/gallium/drivers/i965/brw_gs.c
@@ -54,7 +54,7 @@ static void compile_gs_prog( struct brw_context *brw,
    /* Need to locate the two positions present in vertex + header.
     * These are currently hardcoded:
     */
-   c.nr_attrs = util_count_bits(c.key.attrs);
+   c.nr_attrs = c.key.nr_attrs;
 
    if (BRW_IS_IGDNG(brw))
        c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
@@ -80,30 +80,30 @@ static void compile_gs_prog( struct brw_context *brw,
     * already been weeded out by this stage:
     */
    switch (key->primitive) {
-   case GL_QUADS:
+   case PIPE_PRIM_QUADS:
       brw_gs_quads( &c ); 
       break;
-   case GL_QUAD_STRIP:
+   case PIPE_PRIM_QUAD_STRIP:
       brw_gs_quad_strip( &c );
       break;
-   case GL_LINE_LOOP:
+   case PIPE_PRIM_LINE_LOOP:
       brw_gs_lines( &c );
       break;
-   case GL_LINES:
+   case PIPE_PRIM_LINES:
       if (key->hint_gs_always)
 	 brw_gs_lines( &c );
       else {
 	 return;
       }
       break;
-   case GL_TRIANGLES:
+   case PIPE_PRIM_TRIANGLES:
       if (key->hint_gs_always)
 	 brw_gs_tris( &c );
       else {
 	 return;
       }
       break;
-   case GL_POINTS:
+   case PIPE_PRIM_POINTS:
       if (key->hint_gs_always)
 	 brw_gs_points( &c );
       else {
@@ -129,17 +129,17 @@ static void compile_gs_prog( struct brw_context *brw,
 				       &brw->gs.prog_data );
 }
 
-static const GLenum gs_prim[GL_POLYGON+1] = {  
-   GL_POINTS,
-   GL_LINES,
-   GL_LINE_LOOP,
-   GL_LINES,
-   GL_TRIANGLES,
-   GL_TRIANGLES,
-   GL_TRIANGLES,
-   GL_QUADS,
-   GL_QUAD_STRIP,
-   GL_TRIANGLES
+static const unsigned gs_prim[PIPE_PRIM_MAX] = {  
+   PIPE_PRIM_POINTS,
+   PIPE_PRIM_LINES,
+   PIPE_PRIM_LINE_LOOP,
+   PIPE_PRIM_LINES,
+   PIPE_PRIM_TRIANGLES,
+   PIPE_PRIM_TRIANGLES,
+   PIPE_PRIM_TRIANGLES,
+   PIPE_PRIM_QUADS,
+   PIPE_PRIM_QUAD_STRIP,
+   PIPE_PRIM_TRIANGLES
 };
 
 static void populate_key( struct brw_context *brw,
@@ -148,7 +148,7 @@ static void populate_key( struct brw_context *brw,
    memset(key, 0, sizeof(*key));
 
    /* CACHE_NEW_VS_PROG */
-   key->attrs = brw->vs.prog_data->outputs_written;
+   key->nr_attrs = brw->vs.prog_data->nr_outputs_written;
 
    /* BRW_NEW_PRIMITIVE */
    key->primitive = gs_prim[brw->primitive];
@@ -156,14 +156,14 @@ static void populate_key( struct brw_context *brw,
    key->hint_gs_always = 0;	/* debug code? */
 
    key->need_gs_prog = (key->hint_gs_always ||
-			brw->primitive == GL_QUADS ||
-			brw->primitive == GL_QUAD_STRIP ||
-			brw->primitive == GL_LINE_LOOP);
+			brw->primitive == PIPE_PRIM_QUADS ||
+			brw->primitive == PIPE_PRIM_QUAD_STRIP ||
+			brw->primitive == PIPE_PRIM_LINE_LOOP);
 }
 
 /* Calculate interpolants for triangle and line rasterization.
  */
-static void prepare_gs_prog(struct brw_context *brw)
+static int prepare_gs_prog(struct brw_context *brw)
 {
    struct brw_gs_prog_key key;
    /* Populate the key:
@@ -184,6 +184,8 @@ static void prepare_gs_prog(struct brw_context *brw)
       if (brw->gs.prog_bo == NULL)
 	 compile_gs_prog( brw, &key );
    }
+
+   return 0;
 }
 
 
diff --git a/src/gallium/drivers/i965/brw_gs.h b/src/gallium/drivers/i965/brw_gs.h
index bbb991ea2e..6e616dcb87 100644
--- a/src/gallium/drivers/i965/brw_gs.h
+++ b/src/gallium/drivers/i965/brw_gs.h
@@ -40,11 +40,11 @@
 #define MAX_GS_VERTS (4)	     
 
 struct brw_gs_prog_key {
-   GLuint attrs:32;
+   GLuint nr_attrs:8;
    GLuint primitive:4;
    GLuint hint_gs_always:1;
    GLuint need_gs_prog:1;
-   GLuint pad:26;
+   GLuint pad:18;
 };
 
 struct brw_gs_compile {
diff --git a/src/gallium/drivers/i965/brw_gs_state.c b/src/gallium/drivers/i965/brw_gs_state.c
index 6d03d72d96..15a66c9741 100644
--- a/src/gallium/drivers/i965/brw_gs_state.c
+++ b/src/gallium/drivers/i965/brw_gs_state.c
@@ -29,11 +29,12 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
  
-
+#include "util/u_math.h"
 
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
+#include "brw_debug.h"
 
 struct brw_gs_unit_key {
    unsigned int total_grf;
@@ -76,7 +77,7 @@ gs_unit_create_from_key(struct brw_context *brw, struct brw_gs_unit_key *key)
 
    memset(&gs, 0, sizeof(gs));
 
-   gs.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1;
+   gs.thread0.grf_reg_count = align(key->total_grf, 16) / 16 - 1;
    if (key->prog_active) /* reloc */
       gs.thread0.kernel_start_pointer = brw->gs.prog_bo->offset >> 6;
 
@@ -100,7 +101,7 @@ gs_unit_create_from_key(struct brw_context *brw, struct brw_gs_unit_key *key)
    if (BRW_IS_IGDNG(brw))
       gs.thread4.rendering_enable = 1;
 
-   if (INTEL_DEBUG & DEBUG_STATS)
+   if (BRW_DEBUG & DEBUG_STATS)
       gs.thread4.stats_enable = 1;
 
    bo = brw_upload_cache(&brw->cache, BRW_GS_UNIT,
@@ -111,17 +112,17 @@ gs_unit_create_from_key(struct brw_context *brw, struct brw_gs_unit_key *key)
 
    if (key->prog_active) {
       /* Emit GS program relocation */
-      dri_bo_emit_reloc(bo,
-			I915_GEM_DOMAIN_INSTRUCTION, 0,
-			gs.thread0.grf_reg_count << 1,
-			offsetof(struct brw_gs_unit_state, thread0),
-			brw->gs.prog_bo);
+      brw->sws->bo_emit_reloc(bo,
+			      I915_GEM_DOMAIN_INSTRUCTION, 0,
+			      gs.thread0.grf_reg_count << 1,
+			      offsetof(struct brw_gs_unit_state, thread0),
+			      brw->gs.prog_bo);
    }
 
    return bo;
 }
 
-static void prepare_gs_unit(struct brw_context *brw)
+static int prepare_gs_unit(struct brw_context *brw)
 {
    struct brw_gs_unit_key key;
 
@@ -135,6 +136,8 @@ static void prepare_gs_unit(struct brw_context *brw)
    if (brw->gs.state_bo == NULL) {
       brw->gs.state_bo = gs_unit_create_from_key(brw, &key);
    }
+
+   return 0;
 }
 
 const struct brw_tracked_state brw_gs_unit = {
diff --git a/src/gallium/drivers/i965/brw_misc_state.c b/src/gallium/drivers/i965/brw_misc_state.c
index 98fec85c1d..ccebe08b4f 100644
--- a/src/gallium/drivers/i965/brw_misc_state.c
+++ b/src/gallium/drivers/i965/brw_misc_state.c
@@ -31,10 +31,12 @@
  
 
+#include "brw_debug.h"
 #include "brw_batchbuffer.h"
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
+#include "brw_screen.h"
 
 
@@ -44,25 +46,16 @@
  * Blend color
  */
 
-static void upload_blend_constant_color(struct brw_context *brw)
+static int upload_blend_constant_color(struct brw_context *brw)
 {
-   struct brw_blend_constant_color bcc;
-
-   memset(&bcc, 0, sizeof(bcc));      
-   bcc.header.opcode = CMD_BLEND_CONSTANT_COLOR;
-   bcc.header.length = sizeof(bcc)/4-2;
-   bcc.blend_constant_color[0] = ctx->Color.BlendColor[0];
-   bcc.blend_constant_color[1] = ctx->Color.BlendColor[1];
-   bcc.blend_constant_color[2] = ctx->Color.BlendColor[2];
-   bcc.blend_constant_color[3] = ctx->Color.BlendColor[3];
-
-   BRW_CACHED_BATCH_STRUCT(brw, &bcc);
+   BRW_CACHED_BATCH_STRUCT(brw, &brw->curr.bcc);
+   return 0;
 }
 
 
 const struct brw_tracked_state brw_blend_constant_color = {
    .dirty = {
-      .mesa = _NEW_COLOR,
+      .mesa = PIPE_NEW_BLEND_COLOR,
       .brw = 0,
       .cache = 0
    },
@@ -70,30 +63,32 @@ const struct brw_tracked_state brw_blend_constant_color = {
 };
 
 /* Constant single cliprect for framebuffer object or DRI2 drawing */
-static void upload_drawing_rect(struct brw_context *brw)
+static int upload_drawing_rect(struct brw_context *brw)
 {
    BEGIN_BATCH(4, NO_LOOP_CLIPRECTS);
    OUT_BATCH(_3DSTATE_DRAWRECT_INFO_I965);
    OUT_BATCH(0);
-   OUT_BATCH(((brw->fb.width - 1) & 0xffff) |
-	    ((brw->fb.height - 1) << 16));
+   OUT_BATCH(((brw->curr.fb.width - 1) & 0xffff) |
+	    ((brw->curr.fb.height - 1) << 16));
    OUT_BATCH(0);
    ADVANCE_BATCH();
+   return 0;
 }
 
 const struct brw_tracked_state brw_drawing_rect = {
    .dirty = {
-      .mesa = _NEW_BUFFERS,
+      .mesa = PIPE_NEW_FRAMEBUFFER,
       .brw = 0,
       .cache = 0
    },
    .emit = upload_drawing_rect
 };
 
-static void prepare_binding_table_pointers(struct brw_context *brw)
+static int prepare_binding_table_pointers(struct brw_context *brw)
 {
    brw_add_validated_bo(brw, brw->vs.bind_bo);
    brw_add_validated_bo(brw, brw->wm.bind_bo);
+   return 0;
 }
 
 /**
@@ -103,7 +98,7 @@ static void prepare_binding_table_pointers(struct brw_context *brw)
  * The binding table pointers are relative to the surface state base address,
  * which is 0.
  */
-static void upload_binding_table_pointers(struct brw_context *brw)
+static int upload_binding_table_pointers(struct brw_context *brw)
 {
    BEGIN_BATCH(6, IGNORE_CLIPRECTS);
    OUT_BATCH(CMD_BINDING_TABLE_PTRS << 16 | (6 - 2));
@@ -116,6 +111,7 @@ static void upload_binding_table_pointers(struct brw_context *brw)
    OUT_BATCH(0); /* sf */
    OUT_RELOC(brw->wm.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* wm/ps */
    ADVANCE_BATCH();
+   return 0;
 }
 
 const struct brw_tracked_state brw_binding_table_pointers = {
@@ -135,7 +131,7 @@ const struct brw_tracked_state brw_binding_table_pointers = {
  * The state pointers in this packet are all relative to the general state
  * base address set by CMD_STATE_BASE_ADDRESS, which is 0.
  */
-static void upload_pipelined_state_pointers(struct brw_context *brw )
+static int upload_pipelined_state_pointers(struct brw_context *brw )
 {
    BEGIN_BATCH(7, IGNORE_CLIPRECTS);
    OUT_BATCH(CMD_PIPELINED_STATE_POINTERS << 16 | (7 - 2));
@@ -151,10 +147,11 @@ static void upload_pipelined_state_pointers(struct brw_context *brw )
    ADVANCE_BATCH();
 
    brw->state.dirty.brw |= BRW_NEW_PSP;
+   return 0;
 }
 
 
-static void prepare_psp_urb_cbs(struct brw_context *brw)
+static int prepare_psp_urb_cbs(struct brw_context *brw)
 {
    brw_add_validated_bo(brw, brw->vs.state_bo);
    brw_add_validated_bo(brw, brw->gs.state_bo);
@@ -162,13 +159,26 @@ static void prepare_psp_urb_cbs(struct brw_context *brw)
    brw_add_validated_bo(brw, brw->sf.state_bo);
    brw_add_validated_bo(brw, brw->wm.state_bo);
    brw_add_validated_bo(brw, brw->cc.state_bo);
+   return 0;
 }
 
-static void upload_psp_urb_cbs(struct brw_context *brw )
+static int upload_psp_urb_cbs(struct brw_context *brw )
 {
-   upload_pipelined_state_pointers(brw);
-   brw_upload_urb_fence(brw);
-   brw_upload_cs_urb_state(brw);
+   int ret;
+   
+   ret = upload_pipelined_state_pointers(brw);
+   if (ret)
+      return ret;
+
+   ret = brw_upload_urb_fence(brw);
+   if (ret)
+      return ret;
+
+   ret = brw_upload_cs_urb_state(brw);
+   if (ret)
+      return ret;
+
+   return 0;
 }
 
 const struct brw_tracked_state brw_psp_urb_cbs = {
@@ -187,20 +197,22 @@ const struct brw_tracked_state brw_psp_urb_cbs = {
    .emit = upload_psp_urb_cbs,
 };
 
-static void prepare_depthbuffer(struct brw_context *brw)
+static int prepare_depthbuffer(struct brw_context *brw)
 {
-   struct intel_region *region = brw->state.depth_region;
+   struct pipe_surface *zsbuf = brw->curr.fb.zsbuf;
 
-   if (region != NULL)
-      brw_add_validated_bo(brw, region->buffer);
+   if (zsbuf)
+      brw_add_validated_bo(brw, brw_surface_bo(zsbuf));
+
+   return 0;
 }
 
-static void emit_depthbuffer(struct brw_context *brw)
+static int emit_depthbuffer(struct brw_context *brw)
 {
-   struct intel_region *region = brw->state.depth_region;
+   struct pipe_surface *surface = brw->curr.fb.zsbuf;
    unsigned int len = (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? 6 : 5;
 
-   if (region == NULL) {
+   if (surface == NULL) {
       BEGIN_BATCH(len, IGNORE_CLIPRECTS);
       OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2));
       OUT_BATCH((BRW_DEPTHFORMAT_D32_FLOAT << 18) |
@@ -214,38 +226,45 @@ static void emit_depthbuffer(struct brw_context *brw)
 
       ADVANCE_BATCH();
    } else {
+      struct brw_winsys_buffer *bo;
       unsigned int format;
+      unsigned int pitch;
+      unsigned int cpp;
 
-      switch (region->cpp) {
-      case 2:
+      switch (surface->format) {
+      case PIPE_FORMAT_Z16_UNORM:
 	 format = BRW_DEPTHFORMAT_D16_UNORM;
+	 cpp = 2;
+	 break;
+      case PIPE_FORMAT_Z24S8_UNORM:
+	 format = BRW_DEPTHFORMAT_D24_UNORM_S8_UINT;
+	 cpp = 4;
 	 break;
-      case 4:
-	 if (intel->depth_buffer_is_float)
-	    format = BRW_DEPTHFORMAT_D32_FLOAT;
-	 else
-	    format = BRW_DEPTHFORMAT_D24_UNORM_S8_UINT;
+      case PIPE_FORMAT_Z32_FLOAT:
+	 format = BRW_DEPTHFORMAT_D32_FLOAT;
+	 cpp = 4;
 	 break;
       default:
 	 assert(0);
-	 return;
+	 return PIPE_ERROR_BAD_INPUT;
       }
 
-      assert(region->tiling != I915_TILING_X);
+      bo = brw_surface_bo(surface);
+      pitch = brw_surface_pitch(surface);
 
       BEGIN_BATCH(len, IGNORE_CLIPRECTS);
       OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2));
-      OUT_BATCH(((region->pitch * region->cpp) - 1) |
+      OUT_BATCH(((pitch * cpp) - 1) |
 		(format << 18) |
 		(BRW_TILEWALK_YMAJOR << 26) |
-		((region->tiling != I915_TILING_NONE) << 27) |
+		((surface->layout != PIPE_SURFACE_LAYOUT_LINEAR) << 27) |
 		(BRW_SURFACE_2D << 29));
-      OUT_RELOC(region->buffer,
+      OUT_RELOC(bo,
 		I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-		0);
+		surface->offset);
       OUT_BATCH((BRW_SURFACE_MIPMAPLAYOUT_BELOW << 1) |
-		((region->pitch - 1) << 6) |
-		((region->height - 1) << 19));
+		((pitch - 1) << 6) |
+		((surface->height - 1) << 19));
       OUT_BATCH(0);
 
       if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))
@@ -253,6 +272,8 @@ static void emit_depthbuffer(struct brw_context *brw)
 
       ADVANCE_BATCH();
    }
+
+   return 0;
 }
 
 const struct brw_tracked_state brw_depthbuffer = {
@@ -271,37 +292,15 @@ const struct brw_tracked_state brw_depthbuffer = {
  * Polygon stipple packet
  */
 
-static void upload_polygon_stipple(struct brw_context *brw)
+static int upload_polygon_stipple(struct brw_context *brw)
 {
-   struct brw_polygon_stipple bps;
-   GLuint i;
-
-   memset(&bps, 0, sizeof(bps));
-   bps.header.opcode = CMD_POLY_STIPPLE_PATTERN;
-   bps.header.length = sizeof(bps)/4-2;
-
-   /* Polygon stipple is provided in OpenGL order, i.e. bottom
-    * row first.  If we're rendering to a window (i.e. the
-    * default frame buffer object, 0), then we need to invert
-    * it to match our pixel layout.  But if we're rendering
-    * to a FBO (i.e. any named frame buffer object), we *don't*
-    * need to invert - we already match the layout.
-    */
-   if (ctx->DrawBuffer->Name == 0) {
-      for (i = 0; i < 32; i++)
-         bps.stipple[i] = ctx->PolygonStipple[31 - i]; /* invert */
-   }
-   else {
-      for (i = 0; i < 32; i++)
-         bps.stipple[i] = ctx->PolygonStipple[i]; /* don't invert */
-   }
-
-   BRW_CACHED_BATCH_STRUCT(brw, &bps);
+   BRW_CACHED_BATCH_STRUCT(brw, &brw->curr.bps);
+   return 0;
 }
 
 const struct brw_tracked_state brw_polygon_stipple = {
    .dirty = {
-      .mesa = _NEW_POLYGONSTIPPLE,
+      .mesa = PIPE_NEW_POLYGON_STIPPLE,
       .brw = 0,
       .cache = 0
    },
@@ -313,37 +312,26 @@ const struct brw_tracked_state brw_polygon_stipple = {
  * Polygon stipple offset packet
  */
 
-static void upload_polygon_stipple_offset(struct brw_context *brw)
+static int upload_polygon_stipple_offset(struct brw_context *brw)
 {
    struct brw_polygon_stipple_offset bpso;
 
+   /* This is invariant state in gallium:
+    */
    memset(&bpso, 0, sizeof(bpso));
    bpso.header.opcode = CMD_POLY_STIPPLE_OFFSET;
    bpso.header.length = sizeof(bpso)/4-2;
-
-   /* Never need to offset stipple coordinates.
-    *
-    * XXX: is it ever necessary to invert Y values?
-    */
-   if (0) {
-      int x = 0, y = 0, h = 0;
-      bpso.bits0.x_offset = (32 - (x & 31)) & 31;
-      bpso.bits0.y_offset = (32 - ((y + h) & 31)) & 31;
-   }
-   else {
-      bpso.bits0.y_offset = 0;
-      bpso.bits0.x_offset = 0;
-   }
+   bpso.bits0.y_offset = 0;
+   bpso.bits0.x_offset = 0;
 
    BRW_CACHED_BATCH_STRUCT(brw, &bpso);
+   return 0;
 }
 
-#define _NEW_WINDOW_POS 0x40000000
-
 const struct brw_tracked_state brw_polygon_stipple_offset = {
    .dirty = {
-      .mesa = _NEW_WINDOW_POS,
-      .brw = 0,
+      .mesa = 0,
+      .brw = BRW_NEW_CONTEXT,
       .cache = 0
    },
    .emit = upload_polygon_stipple_offset
@@ -352,12 +340,12 @@ const struct brw_tracked_state brw_polygon_stipple_offset = {
 /**********************************************************************
  * AA Line parameters
  */
-static void upload_aa_line_parameters(struct brw_context *brw)
+static int upload_aa_line_parameters(struct brw_context *brw)
 {
    struct brw_aa_line_parameters balp;
    
    if (BRW_IS_965(brw))
-      return;
+      return 0;
 
    /* use legacy aa line coverage computation */
    memset(&balp, 0, sizeof(balp));
@@ -365,6 +353,7 @@ static void upload_aa_line_parameters(struct brw_context *brw)
    balp.header.length = sizeof(balp) / 4 - 2;
    
    BRW_CACHED_BATCH_STRUCT(brw, &balp);
+   return 0;
 }
 
 const struct brw_tracked_state brw_aa_line_parameters = {
@@ -380,31 +369,16 @@ const struct brw_tracked_state brw_aa_line_parameters = {
  * Line stipple packet
  */
 
-static void upload_line_stipple(struct brw_context *brw)
+static int upload_line_stipple(struct brw_context *brw)
 {
-   struct brw_line_stipple bls;
-   GLfloat tmp;
-   GLint tmpi;
-
-   memset(&bls, 0, sizeof(bls));
-   bls.header.opcode = CMD_LINE_STIPPLE_PATTERN;
-   bls.header.length = sizeof(bls)/4 - 2;
-
-   bls.bits0.pattern = ctx->Line.StipplePattern;
-   bls.bits1.repeat_count = ctx->Line.StippleFactor;
-
-   tmp = 1.0 / (GLfloat) ctx->Line.StippleFactor;
-   tmpi = tmp * (1<<13);
-
-
-   bls.bits1.inverse_repeat_count = tmpi;
-
-   BRW_CACHED_BATCH_STRUCT(brw, &bls);
+   struct brw_line_stipple *bls = NULL; //brw->curr.rast->bls;
+   BRW_CACHED_BATCH_STRUCT(brw, bls);
+   return 0;
 }
 
 const struct brw_tracked_state brw_line_stipple = {
    .dirty = {
-      .mesa = _NEW_LINE,
+      .mesa = PIPE_NEW_RAST,
       .brw = 0,
       .cache = 0
    },
@@ -416,7 +390,7 @@ const struct brw_tracked_state brw_line_stipple = {
  * Misc invarient state packets
  */
 
-static void upload_invarient_state( struct brw_context *brw )
+static int upload_invarient_state( struct brw_context *brw )
 {
    {
       /* 0x61040000  Pipeline Select */
@@ -424,7 +398,10 @@ static void upload_invarient_state( struct brw_context *brw )
       struct brw_pipeline_select ps;
 
       memset(&ps, 0, sizeof(ps));
-      ps.header.opcode = CMD_PIPELINE_SELECT(brw);
+      if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))
+	 ps.header.opcode = CMD_PIPELINE_SELECT_GM45;
+      else
+	 ps.header.opcode = CMD_PIPELINE_SELECT_965;
       ps.header.pipeline_select = 0;
       BRW_BATCH_STRUCT(brw, &ps);
    }
@@ -460,12 +437,18 @@ static void upload_invarient_state( struct brw_context *brw )
       struct brw_vf_statistics vfs;
       memset(&vfs, 0, sizeof(vfs));
 
-      vfs.opcode = CMD_VF_STATISTICS(brw);
-      if (INTEL_DEBUG & DEBUG_STATS)
+      if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) 
+	 vfs.opcode = CMD_VF_STATISTICS_GM45;
+      else 
+	 vfs.opcode = CMD_VF_STATISTICS_965;
+
+      if (BRW_DEBUG & DEBUG_STATS)
 	 vfs.statistics_enable = 1; 
 
       BRW_BATCH_STRUCT(brw, &vfs);
    }
+   
+   return 0;
 }
 
 const struct brw_tracked_state brw_invarient_state = {
@@ -485,7 +468,7 @@ const struct brw_tracked_state brw_invarient_state = {
  * state pools.  This comes at the expense of memory, and more expensive cache
  * misses.
  */
-static void upload_state_base_address( struct brw_context *brw )
+static int upload_state_base_address( struct brw_context *brw )
 {
    /* Output the structure (brw_state_base_address) directly to the
     * batchbuffer, so we can emit relocations inline.
@@ -511,6 +494,7 @@ static void upload_state_base_address( struct brw_context *brw )
        OUT_BATCH(1); /* Indirect object upper bound */
        ADVANCE_BATCH();
    }
+   return 0;
 }
 
 const struct brw_tracked_state brw_state_base_address = {
diff --git a/src/gallium/drivers/i965/brw_pipe_blend.c b/src/gallium/drivers/i965/brw_pipe_blend.c
index 17895d2782..54d09d9e45 100644
--- a/src/gallium/drivers/i965/brw_pipe_blend.c
+++ b/src/gallium/drivers/i965/brw_pipe_blend.c
@@ -43,3 +43,22 @@
    if (INTEL_DEBUG & DEBUG_STATS)
       cc.cc5.statistics_enable = 1;
 }
+
+
+
+static void brw_set_blend_color(struct pipe_context *pipe,
+				const float *blend_color)
+{
+   struct brw_context *brw = brw_context(pipe);
+   struct brw_blend_constant_color *bcc = &brw->curr.blend_color.bcc;
+
+   memset(bcc, 0, sizeof(*bcc));      
+   bcc->header.opcode = CMD_BLEND_CONSTANT_COLOR;
+   bcc->header.length = sizeof(*bcc)/4-2;
+   bcc->blend_constant_color[0] = blend_color[0];
+   bcc->blend_constant_color[1] = blend_color[1];
+   bcc->blend_constant_color[2] = blend_color[2];
+   bcc->blend_constant_color[3] = blend_color[3];
+
+   brw->state.dirty.pipe |= PIPE_NEW_BLEND_COLOR;
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_rast.c b/src/gallium/drivers/i965/brw_pipe_rast.c
index ff64dbd48d..86822d478a 100644
--- a/src/gallium/drivers/i965/brw_pipe_rast.c
+++ b/src/gallium/drivers/i965/brw_pipe_rast.c
@@ -44,3 +44,23 @@ calculate_clip_key_rast()
       }
    }
 }
+
+
+static void
+calculate_line_stipple_rast()
+{
+   GLfloat tmp;
+   GLint tmpi;
+
+   memset(&bls, 0, sizeof(bls));
+   bls.header.opcode = CMD_LINE_STIPPLE_PATTERN;
+   bls.header.length = sizeof(bls)/4 - 2;
+   bls.bits0.pattern = brw->curr.rast.line_stipple_pattern;
+   bls.bits1.repeat_count = brw->curr.rast.line_stipple_factor + 1;
+
+   tmp = 1.0 / (GLfloat) bls.bits1.repeat_count;
+   tmpi = tmp * (1<<13);
+
+   bls.bits1.inverse_repeat_count = tmpi;
+
+}
diff --git a/src/gallium/drivers/i965/brw_screen.h b/src/gallium/drivers/i965/brw_screen.h
index b0be0e1f8a..eafd8ddf77 100644
--- a/src/gallium/drivers/i965/brw_screen.h
+++ b/src/gallium/drivers/i965/brw_screen.h
@@ -95,4 +95,11 @@ brw_buffer_is_user_buffer( const struct pipe_buffer *buf )
    return ((const struct brw_buffer *)buf)->is_user_buffer;
 }
 
+struct brw_winsys_buffer *
+brw_surface_bo( struct pipe_surface *surface );
+
+unsigned
+brw_surface_pitch( const struct pipe_surface *surface );
+
+
 #endif /* BRW_SCREEN_H */
diff --git a/src/gallium/drivers/i965/brw_sf.c b/src/gallium/drivers/i965/brw_sf.c
index e2db2e76e6..1b73b3fd51 100644
--- a/src/gallium/drivers/i965/brw_sf.c
+++ b/src/gallium/drivers/i965/brw_sf.c
@@ -131,7 +131,7 @@ static void upload_sf_prog(struct brw_context *brw)
    /* Populate the key, noting state dependencies:
     */
    /* CACHE_NEW_VS_PROG */
-   key.attrs = brw->vs.prog_data->outputs_written; 
+   key.attrs = brw->vs.prog_data->nr_outputs_written; 
 
    /* BRW_NEW_REDUCED_PRIMITIVE */
    switch (brw->reduced_primitive) {
diff --git a/src/gallium/drivers/i965/brw_state.h b/src/gallium/drivers/i965/brw_state.h
index 9bf34c3fe4..663fc839df 100644
--- a/src/gallium/drivers/i965/brw_state.h
+++ b/src/gallium/drivers/i965/brw_state.h
@@ -33,9 +33,11 @@
 #ifndef BRW_STATE_H
 #define BRW_STATE_H
 
-#include "brw_context.h"
+#include "pipe/p_error.h"
 #include "util/u_memory.h"
 
+#include "brw_context.h"
+
 static inline void
 brw_add_validated_bo(struct brw_context *brw, struct brw_winsys_buffer *bo)
 {
diff --git a/src/gallium/drivers/i965/brw_urb.c b/src/gallium/drivers/i965/brw_urb.c
index 18d79c5ebb..a2277519ad 100644
--- a/src/gallium/drivers/i965/brw_urb.c
+++ b/src/gallium/drivers/i965/brw_urb.c
@@ -221,7 +221,7 @@ const struct brw_tracked_state brw_recalculate_urb_fence = {
 
 
-void brw_upload_urb_fence(struct brw_context *brw)
+int brw_upload_urb_fence(struct brw_context *brw)
 {
    struct brw_urb_fence uf;
    memset(&uf, 0, sizeof(uf));
@@ -247,4 +247,5 @@ void brw_upload_urb_fence(struct brw_context *brw)
    uf.bits1.cs_fence  = URB_SIZES(brw);
 
    BRW_BATCH_STRUCT(brw, &uf);
+   return 0;
 }
diff --git a/src/gallium/drivers/i965/brw_vs.c b/src/gallium/drivers/i965/brw_vs.c
index dcd687ac34..010ac115d3 100644
--- a/src/gallium/drivers/i965/brw_vs.c
+++ b/src/gallium/drivers/i965/brw_vs.c
@@ -51,11 +51,11 @@ static void do_vs_prog( struct brw_context *brw,
    brw_init_compile(brw, &c.func);
    c.vp = vp;
 
-   c.prog_data.outputs_written = vp->program.Base.OutputsWritten;
+   c.prog_data.nr_outputs_written = vp->program.Base.OutputsWritten;
    c.prog_data.inputs_read = vp->program.Base.InputsRead;
 
    if (c.key.copy_edgeflag) {
-      c.prog_data.outputs_written |= 1<<VERT_RESULT_EDGE;
+      c.prog_data.nr_outputs_written |= 1<<VERT_RESULT_EDGE;
       c.prog_data.inputs_read |= 1<<VERT_ATTRIB_EDGEFLAG;
    }
 
diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c
index e946944295..086f54799e 100644
--- a/src/gallium/drivers/i965/brw_vs_emit.c
+++ b/src/gallium/drivers/i965/brw_vs_emit.c
@@ -140,35 +140,33 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
    c->first_overflow_output = 0;
 
    if (BRW_IS_IGDNG(c->func.brw))
-       mrf = 8;
+      mrf = 8;
    else
-       mrf = 4;
+      mrf = 4;
 
-   for (i = 0; i < VERT_RESULT_MAX; i++) {
-      if (c->prog_data.outputs_written & (1 << i)) {
-	 c->nr_outputs++;
-         assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
-	 if (i == VERT_RESULT_HPOS) {
-	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
-	    reg++;
+   for (i = 0; i < c->prog_data.nr_outputs_written; i++) {
+      c->nr_outputs++;
+      assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
+      if (i == VERT_RESULT_HPOS) {
+	 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
+	 reg++;
+      }
+      else if (i == VERT_RESULT_PSIZ) {
+	 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
+	 reg++;
+	 mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
+      }
+      else {
+	 if (mrf < 16) {
+	    c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
+	    mrf++;
 	 }
-	 else if (i == VERT_RESULT_PSIZ) {
+	 else {
+	    /* too many vertex results to fit in MRF, use GRF for overflow */
+	    if (!c->first_overflow_output)
+	       c->first_overflow_output = i;
 	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 	    reg++;
-	    mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
-	 }
-	 else {
-            if (mrf < 16) {
-               c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
-               mrf++;
-            }
-            else {
-               /* too many vertex results to fit in MRF, use GRF for overflow */
-               if (!c->first_overflow_output)
-                  c->first_overflow_output = i;
-               c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
-               reg++;
-            }
 	 }
       }
    }     
@@ -238,9 +236,9 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
    attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);
 
    if (BRW_IS_IGDNG(c->func.brw))
-       c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
+      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
    else
-       c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;
+      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;
 
    c->prog_data.total_grf = reg;
 
@@ -1050,8 +1048,9 @@ static void emit_vertex_write( struct brw_vs_compile *c)
    /* Update the header for point size, user clipping flags, and -ve rhw
     * workaround.
     */
-   if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
-       c->key.nr_userclip || BRW_IS_965(p->brw))
+   if (c->prog_data.writes_psiz ||
+       c->key.nr_userclip || 
+       BRW_IS_965(p->brw))
    {
       struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
       GLuint i;
@@ -1060,7 +1059,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 
       brw_set_access_mode(p, BRW_ALIGN_16);	
 
-      if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) {
+      if (c->prog_data.writes_psiz) {
 	 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
 	 brw_MUL(p, brw_writemask(header1, BRW_WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
 	 brw_AND(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
@@ -1149,12 +1148,10 @@ static void emit_vertex_write( struct brw_vs_compile *c)
        * at mrf[4] atm...
        */
       GLuint i, mrf = 0;
-      for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
-         if (c->prog_data.outputs_written & (1 << i)) {
-            /* move from GRF to MRF */
-            brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]);
-            mrf++;
-         }
+      for (i = c->first_overflow_output; i < c->prog_data.nr_outputs_written; i++) {
+	 /* move from GRF to MRF */
+	 brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]);
+	 mrf++;
       }
 
       brw_urb_WRITE(p,
diff --git a/src/gallium/drivers/i965/brw_wm.c b/src/gallium/drivers/i965/brw_wm.c
index 4948ea0dff..764708f7df 100644
--- a/src/gallium/drivers/i965/brw_wm.c
+++ b/src/gallium/drivers/i965/brw_wm.c
@@ -310,7 +310,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
    }
 
    /* CACHE_NEW_VS_PROG */
-   key->vp_outputs_written = brw->vs.prog_data->outputs_written; /* bitmask */
+   key->vp_nr_outputs_written = brw->vs.prog_data->nr_outputs_written;
 
    /* The unique fragment program ID */
    key->program_string_id = fp->id;
diff --git a/src/gallium/drivers/i965/brw_wm.h b/src/gallium/drivers/i965/brw_wm.h
index e06de95a8a..bf241f5fa4 100644
--- a/src/gallium/drivers/i965/brw_wm.h
+++ b/src/gallium/drivers/i965/brw_wm.h
@@ -76,7 +76,7 @@ struct brw_wm_prog_key {
 
    GLuint program_string_id:32;
    GLuint drawable_height;
-   GLuint vp_outputs_written;
+   GLuint vp_nr_outputs_written;
 };
 
 
-- 
cgit v1.2.3


From 09c231f84a20a306a173b60c82484ce1f9331edf Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Mon, 26 Oct 2009 00:20:33 +0000
Subject: i965g: still working on compilation

---
 src/gallium/auxiliary/tgsi/tgsi_scan.h          |   3 +
 src/gallium/drivers/i965/Makefile               |   9 +-
 src/gallium/drivers/i965/brw_batchbuffer.c      |  14 +-
 src/gallium/drivers/i965/brw_context.h          |  18 +-
 src/gallium/drivers/i965/brw_eu_emit.c          |   4 +-
 src/gallium/drivers/i965/brw_pipe_fb.c          |   2 +-
 src/gallium/drivers/i965/brw_pipe_flush.c       |   9 +-
 src/gallium/drivers/i965/brw_pipe_query.c       | 110 +++++++-----
 src/gallium/drivers/i965/brw_pipe_sampler.c     |  81 +++++++++
 src/gallium/drivers/i965/brw_screen_surface.c   | 156 ++++++++++++++---
 src/gallium/drivers/i965/brw_screen_texture.c   | 218 ++++++++++++++++++++++++
 src/gallium/drivers/i965/brw_sf.c               |  80 ++++-----
 src/gallium/drivers/i965/brw_sf.h               |  13 +-
 src/gallium/drivers/i965/brw_sf_emit.c          | 145 +++++++++-------
 src/gallium/drivers/i965/brw_sf_state.c         | 178 +++++++++----------
 src/gallium/drivers/i965/brw_state.h            |  13 +-
 src/gallium/drivers/i965/brw_state_batch.c      |   8 +-
 src/gallium/drivers/i965/brw_state_cache.c      |  64 ++++---
 src/gallium/drivers/i965/brw_state_debug.c      |  19 ++-
 src/gallium/drivers/i965/brw_state_dump.c       |  64 +++----
 src/gallium/drivers/i965/brw_state_upload.c     |  37 ++--
 src/gallium/drivers/i965/brw_tex.c              |  50 ------
 src/gallium/drivers/i965/brw_tex_layout.c       | 218 ------------------------
 src/gallium/drivers/i965/brw_urb.c              |  10 +-
 src/gallium/drivers/i965/brw_vs.h               |   2 +-
 src/gallium/drivers/i965/brw_vs_emit.c          |  20 +--
 src/gallium/drivers/i965/brw_vs_state.c         |   4 +-
 src/gallium/drivers/i965/brw_winsys.h           |  18 +-
 src/gallium/drivers/i965/brw_wm.c               |   4 +-
 src/gallium/drivers/i965/brw_wm.h               |  36 ++--
 src/gallium/drivers/i965/brw_wm_debug.c         |  68 ++++----
 src/gallium/drivers/i965/brw_wm_emit.c          |   8 +-
 src/gallium/drivers/i965/brw_wm_fp.c            |  18 +-
 src/gallium/drivers/i965/brw_wm_glsl.c          |  16 +-
 src/gallium/drivers/i965/brw_wm_pass0.c         |   6 +-
 src/gallium/drivers/i965/brw_wm_pass1.c         |   2 +-
 src/gallium/drivers/i965/brw_wm_pass2.c         |   4 +-
 src/gallium/drivers/i965/brw_wm_sampler_state.c | 170 ++++--------------
 src/gallium/drivers/i965/brw_wm_state.c         |   6 +-
 39 files changed, 1007 insertions(+), 898 deletions(-)
 create mode 100644 src/gallium/drivers/i965/brw_screen_texture.c
 delete mode 100644 src/gallium/drivers/i965/brw_tex.c
 delete mode 100644 src/gallium/drivers/i965/brw_tex_layout.c

(limited to 'src/gallium/drivers/i965/brw_wm.h')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index 8a7ee0c7e4..6754001e88 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -61,6 +61,9 @@ struct tgsi_shader_info
    boolean uses_kill;  /**< KIL or KILP instruction used? */
    boolean uses_fogcoord; /**< fragment shader uses fog coord? */
    boolean uses_frontfacing; /**< fragment shader uses front/back-face flag? */
+
+   uint texture_max;
+   uint texture_mask;
 };
 
 
diff --git a/src/gallium/drivers/i965/Makefile b/src/gallium/drivers/i965/Makefile
index 40e8aa8786..c3dbad72ae 100644
--- a/src/gallium/drivers/i965/Makefile
+++ b/src/gallium/drivers/i965/Makefile
@@ -28,10 +28,7 @@ C_SOURCES = \
 	brw_pipe_blend.c \
 	brw_pipe_depth.c \
 	brw_pipe_fb.c \
-	brw_pipe_flush.c \
 	brw_pipe_query.c \
-	brw_pipe_shader.c \
-	brw_screen_surface.c \
 	brw_sf.c \
 	brw_sf_emit.c \
 	brw_sf_state.c \
@@ -40,8 +37,6 @@ C_SOURCES = \
 	brw_state_dump.c \
 	brw_state_upload.c \
 	brw_swtnl.c \
-	brw_tex.c \
-	brw_tex_layout.c \
 	brw_urb.c \
 	brw_util.c \
 	brw_vs.c \
@@ -60,8 +55,12 @@ C_SOURCES = \
 	brw_wm_sampler_state.c \
 	brw_wm_state.c \
 	brw_wm_surface_state.c \
+	brw_screen_surface.c \
+	brw_screen_texture.c \
 	brw_bo.c \
 	brw_batchbuffer.c \
+	brw_pipe_shader.c \
+	brw_pipe_flush.c \
 	intel_tex_layout.c 
 
 include ../../Makefile.template
diff --git a/src/gallium/drivers/i965/brw_batchbuffer.c b/src/gallium/drivers/i965/brw_batchbuffer.c
index 8bcac76ede..45fbd59273 100644
--- a/src/gallium/drivers/i965/brw_batchbuffer.c
+++ b/src/gallium/drivers/i965/brw_batchbuffer.c
@@ -105,13 +105,13 @@ _brw_batchbuffer_flush(struct brw_batchbuffer *batch, const char *file,
    }
 
 
-   if (INTEL_DEBUG & DEBUG_BATCH)
-      fprintf(stderr, "%s:%d: Batchbuffer flush with %db used\n", file, line,
+   if (BRW_DEBUG & DEBUG_BATCH)
+      debug_printf("%s:%d: Batchbuffer flush with %db used\n", file, line,
 	      used);
 
    /* Emit a flush if the bufmgr doesn't do it for us. */
    if (intel->always_flush_cache || !intel->ttm) {
-      *(GLuint *) (batch->ptr) = intel->vtbl.flush_cmd();
+      *(GLuint *) (batch->ptr) = ((CMD_MI_FLUSH << 16) | BRW_FLUSH_STATE_CACHE);
       batch->ptr += 4;
       used = batch->ptr - batch->map;
    }
@@ -136,15 +136,15 @@ _brw_batchbuffer_flush(struct brw_batchbuffer *batch, const char *file,
       
    batch->sws->bo_exec(batch->buf, used, NULL, 0, 0 );
       
-   if (INTEL_DEBUG & DEBUG_BATCH) {
+   if (BRW_DEBUG & DEBUG_BATCH) {
       dri_bo_map(batch->buf, GL_FALSE);
       intel_decode(batch->buf->virtual, used / 4, batch->buf->offset,
 		   brw->brw_screen->pci_id);
       dri_bo_unmap(batch->buf);
    }
 
-   if (INTEL_DEBUG & DEBUG_SYNC) {
-      fprintf(stderr, "waiting for idle\n");
+   if (BRW_DEBUG & DEBUG_SYNC) {
+      debug_printf("waiting for idle\n");
       dri_bo_map(batch->buf, GL_TRUE);
       dri_bo_unmap(batch->buf);
    }
@@ -166,7 +166,7 @@ brw_batchbuffer_emit_reloc(struct brw_batchbuffer *batch,
    int ret;
 
    if (batch->ptr - batch->map > batch->buf->size)
-      _mesa_printf ("bad relocation ptr %p map %p offset %d size %d\n",
+      debug_printf ("bad relocation ptr %p map %p offset %d size %d\n",
 		    batch->ptr, batch->map, batch->ptr - batch->map, batch->buf->size);
 
    ret = batch->sws->bo_emit_reloc(batch->buf,
diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h
index df43d8ba4d..10c1cf6f33 100644
--- a/src/gallium/drivers/i965/brw_context.h
+++ b/src/gallium/drivers/i965/brw_context.h
@@ -190,6 +190,8 @@ struct brw_fragment_shader {
 #define PIPE_NEW_FRAMEBUFFER_DIMENSIONS 0x10000
 #define PIPE_NEW_DEPTH_BUFFER           0x20000
 #define PIPE_NEW_COLOR_BUFFERS          0x40000
+#define PIPE_NEW_QUERY                  0x80000
+#define PIPE_NEW_SCISSOR                0x100000
 
 
@@ -204,7 +206,7 @@ struct brw_fragment_shader {
 #define BRW_NEW_WM_INPUT_DIMENSIONS     0x100
 #define BRW_NEW_PSP                     0x800
 #define BRW_NEW_WM_SURFACES		0x1000
-#define BRW_NEW_FENCE                   0x2000
+#define BRW_NEW_xxx                     0x2000 /* was FENCE */
 #define BRW_NEW_INDICES			0x4000
 #define BRW_NEW_VERTICES		0x8000
 /**
@@ -373,6 +375,7 @@ struct brw_cache_item {
 
 struct brw_cache {
    struct brw_context *brw;
+   struct brw_winsys_screen *sws;
 
    struct brw_cache_item **items;
    GLuint size, n_items;
@@ -380,6 +383,7 @@ struct brw_cache {
    GLuint key_size[BRW_MAX_CACHE];		/* for fixed-size keys */
    GLuint aux_size[BRW_MAX_CACHE];
    char *name[BRW_MAX_CACHE];
+   
 
    /* Record of the last BOs chosen for each cache_id.  Used to set
     * brw->state.dirty.cache when a new cache item is chosen.
@@ -448,7 +452,7 @@ struct brw_query_object {
    int last_index;
 
    /* Total count of pixels from previous BOs */
-   unsigned int count;
+   uint64_t result;
 };
 
 
@@ -477,11 +481,18 @@ struct brw_context
       const struct brw_rasterizer_state *rast;
       const struct brw_depth_stencil_state *zstencil;
 
+      const struct pipe_texture *texture[PIPE_MAX_SAMPLERS];
+      const struct pipe_sampler *sampler[PIPE_MAX_SAMPLERS];
+      unsigned num_textures;
+      unsigned num_samplers;
+      
+
       struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
       struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
       unsigned num_vertex_elements;
       unsigned num_vertex_buffers;
 
+      struct pipe_scissor_state scissor;
       struct pipe_framebuffer_state fb;
       struct pipe_viewport_state vp;
       struct pipe_clip_state ucp;
@@ -492,6 +503,8 @@ struct brw_context
       struct brw_blend_constant_color bcc;
       struct brw_polygon_stipple bps;
 
+      
+
       /**
        * Index buffer for this draw_prims call.
        *
@@ -688,6 +701,7 @@ struct brw_context
       struct brw_winsys_buffer *bo;
       int index;
       GLboolean active;
+      int stats_wm;
    } query;
 
    struct {
diff --git a/src/gallium/drivers/i965/brw_eu_emit.c b/src/gallium/drivers/i965/brw_eu_emit.c
index f6b8843e01..f7fa520348 100644
--- a/src/gallium/drivers/i965/brw_eu_emit.c
+++ b/src/gallium/drivers/i965/brw_eu_emit.c
@@ -1262,7 +1262,7 @@ void brw_SAMPLE(struct brw_compile *p,
    GLboolean need_stall = 0;
    
    if (writemask == 0) {
-      /*_mesa_printf("%s: zero writemask??\n", __FUNCTION__); */
+      /*debug_printf("%s: zero writemask??\n", __FUNCTION__); */
       return;
    }
    
@@ -1294,7 +1294,7 @@ void brw_SAMPLE(struct brw_compile *p,
 
       if (newmask != writemask) {
 	 need_stall = 1;
-         /* _mesa_printf("need stall %x %x\n", newmask , writemask); */
+         /* debug_printf("need stall %x %x\n", newmask , writemask); */
       }
       else {
 	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
diff --git a/src/gallium/drivers/i965/brw_pipe_fb.c b/src/gallium/drivers/i965/brw_pipe_fb.c
index 6391717227..c65f9bc374 100644
--- a/src/gallium/drivers/i965/brw_pipe_fb.c
+++ b/src/gallium/drivers/i965/brw_pipe_fb.c
@@ -53,7 +53,7 @@ static void brw_set_viewport_state( struct pipe_context *pipe,
 void brw_pipe_framebuffer_init( struct brw_context *brw )
 {
    brw->base.set_framebuffer_state = brw_set_framebuffer_state;
-   brw->base.set_framebuffer_state = brw_set_framebuffer_state;
+   brw->base.set_viewport_state = brw_set_viewport_state;
 }
 
 void brw_pipe_framebuffer_cleanup( struct brw_context *brw )
diff --git a/src/gallium/drivers/i965/brw_pipe_flush.c b/src/gallium/drivers/i965/brw_pipe_flush.c
index 65e7151517..fb4a784de9 100644
--- a/src/gallium/drivers/i965/brw_pipe_flush.c
+++ b/src/gallium/drivers/i965/brw_pipe_flush.c
@@ -52,14 +52,7 @@ static void brw_note_fence( struct brw_context *brw, GLuint fence )
  */
 static GLuint brw_flush_cmd( void )
 {
-   struct brw_mi_flush flush;
-
-   return ;
-
-   flush.opcode = CMD_MI_FLUSH;
-   flush.pad = 0;
-   flush.flags = BRW_FLUSH_STATE_CACHE;
-   return *(GLuint *)&flush;
+   return ((CMD_MI_FLUSH << 16) | BRW_FLUSH_STATE_CACHE);
 }
 
 
diff --git a/src/gallium/drivers/i965/brw_pipe_query.c b/src/gallium/drivers/i965/brw_pipe_query.c
index a2da1373bf..18a9b71af0 100644
--- a/src/gallium/drivers/i965/brw_pipe_query.c
+++ b/src/gallium/drivers/i965/brw_pipe_query.c
@@ -46,25 +46,38 @@
 #include "brw_reg.h"
 
 /** Waits on the query object's BO and totals the results for this query */
-static void
-brw_queryobj_get_results(struct brw_query_object *query)
+static boolean
+brw_query_get_result(struct pipe_context *pipe,
+		     struct pipe_query *q,
+		     boolean wait,
+		     uint64_t *result)
 {
-   int i;
-   uint64_t *results;
-
-   if (query->bo == NULL)
-      return;
+   struct brw_context *brw = brw_context(pipe);
+   struct brw_query_object *query = (struct brw_query_object *)q;
 
    /* Map and count the pixels from the current query BO */
-   dri_bo_map(query->bo, GL_FALSE);
-   results = query->bo->virtual;
-   for (i = query->first_index; i <= query->last_index; i++) {
-      query->Base.Result += results[i * 2 + 1] - results[i * 2];
+   if (query->bo) {
+      int i;
+      uint64_t *map;
+      
+      if (brw->sws->bo_is_busy(query->bo) && !wait)
+	 return FALSE;
+      
+      map = brw->sws->bo_map(query->bo, GL_FALSE);
+      if (map == NULL)
+	 return FALSE;
+      
+      for (i = query->first_index; i <= query->last_index; i++) {
+	 query->result += map[i * 2 + 1] - map[i * 2];
+      }
+
+      brw->sws->bo_unmap(query->bo);
+      brw->sws->bo_unreference(query->bo);
+      query->bo = NULL;
    }
-   dri_bo_unmap(query->bo);
 
-   brw->sws->bo_unreference(query->bo);
-   query->bo = NULL;
+   *result = query->result;
+   return TRUE;
 }
 
 static struct pipe_query *
@@ -72,12 +85,12 @@ brw_query_create(struct pipe_context *pipe, unsigned type )
 {
    struct brw_query_object *query;
 
-   switch (query->type) {
+   switch (type) {
    case PIPE_QUERY_OCCLUSION_COUNTER:
       query = CALLOC_STRUCT( brw_query_object );
       if (query == NULL)
 	 return NULL;
-      return &query->Base;
+      return (struct pipe_query *)query;
       
    default:
       return NULL;
@@ -87,6 +100,7 @@ brw_query_create(struct pipe_context *pipe, unsigned type )
 static void
 brw_query_destroy(struct pipe_context *pipe, struct pipe_query *q)
 {
+   struct brw_context *brw = brw_context(pipe);
    struct brw_query_object *query = (struct brw_query_object *)q;
 
    brw->sws->bo_unreference(query->bo);
@@ -94,24 +108,25 @@ brw_query_destroy(struct pipe_context *pipe, struct pipe_query *q)
 }
 
 static void
-brw_begin_query(struct pipe_context *pipe, struct pipe_query *q)
+brw_query_begin(struct pipe_context *pipe, struct pipe_query *q)
 {
    struct brw_context *brw = brw_context(pipe);
    struct brw_query_object *query = (struct brw_query_object *)q;
 
    /* Reset our driver's tracking of query state. */
    brw->sws->bo_unreference(query->bo);
+   query->result = 0;
    query->bo = NULL;
    query->first_index = -1;
    query->last_index = -1;
 
    insert_at_head(&brw->query.active_head, query);
-   brw->stats_wm++;
-   brw->dirty.mesa |= PIPE_NEW_QUERY;
+   brw->query.stats_wm++;
+   brw->state.dirty.mesa |= PIPE_NEW_QUERY;
 }
 
 static void
-brw_end_query(struct pipe_context *pipe, struct pipe_query *q)
+brw_query_end(struct pipe_context *pipe, struct pipe_query *q)
 {
    struct brw_context *brw = brw_context(pipe);
    struct brw_query_object *query = (struct brw_query_object *)q;
@@ -129,27 +144,13 @@ brw_end_query(struct pipe_context *pipe, struct pipe_query *q)
    }
 
    remove_from_list(query);
-   brw->stats_wm--;
-   brw->dirty.mesa |= PIPE_NEW_QUERY;
+   brw->query.stats_wm--;
+   brw->state.dirty.mesa |= PIPE_NEW_QUERY;
 }
 
-static void brw_wait_query(struct pipe_context *pipe, struct pipe_query *q)
-{
-   struct brw_query_object *query = (struct brw_query_object *)q;
-
-   brw_queryobj_get_results(query);
-   query->Base.Ready = GL_TRUE;
-}
-
-static void brw_check_query(struct pipe_context *pipe, struct pipe_query *q)
-{
-   struct brw_query_object *query = (struct brw_query_object *)q;
-
-   if (query->bo == NULL || !drm_intel_bo_busy(query->bo)) {
-      brw_queryobj_get_results(query);
-      query->Base.Ready = GL_TRUE;
-   }
-}
+/***********************************************************************
+ * Internal functions and callbacks to implement queries 
+ */
 
 /** Called to set up the query BO and account for its aperture space */
 void
@@ -201,8 +202,17 @@ brw_emit_query_begin(struct brw_context *brw)
 
    foreach(query, &brw->query.active_head) {
       if (query->bo != brw->query.bo) {
+	 uint64_t tmp;
+	 
+	 /* Propagate the results from this buffer to all of the
+	  * active queries, as the bo is going away.
+	  */
 	 if (query->bo != NULL)
-	    brw_queryobj_get_results(query);
+	    brw_query_get_result( &brw->base, 
+				  (struct pipe_query *)query,
+				  FALSE,
+				  &tmp );
+
 	 brw->sws->bo_reference(brw->query.bo);
 	 query->bo = brw->query.bo;
 	 query->first_index = brw->query.index;
@@ -235,12 +245,18 @@ brw_emit_query_end(struct brw_context *brw)
    brw->query.index++;
 }
 
-void brw_init_queryobj_functions(struct dd_function_table *functions)
+void brw_pipe_query_init( struct brw_context *brw )
 {
-   functions->NewQueryObject = brw_new_query_object;
-   functions->DeleteQuery = brw_delete_query;
-   functions->BeginQuery = brw_begin_query;
-   functions->EndQuery = brw_end_query;
-   functions->CheckQuery = brw_check_query;
-   functions->WaitQuery = brw_wait_query;
+   brw->base.create_query = brw_query_create;
+   brw->base.destroy_query = brw_query_destroy;
+   brw->base.begin_query = brw_query_begin;
+   brw->base.end_query = brw_query_end;
+   brw->base.get_query_result = brw_query_get_result;
+}
+
+
+void brw_pipe_query_cleanup( struct brw_context *brw )
+{
+   /* Unreference brw->query.bo ??
+    */
 }
diff --git a/src/gallium/drivers/i965/brw_pipe_sampler.c b/src/gallium/drivers/i965/brw_pipe_sampler.c
index b3069f08c0..bc20eef6fb 100644
--- a/src/gallium/drivers/i965/brw_pipe_sampler.c
+++ b/src/gallium/drivers/i965/brw_pipe_sampler.c
@@ -14,6 +14,87 @@ static void *brw_create_sampler_state( struct pipe_context *pipe,
 {
    struct brw_sampler_state *sampler = CALLOC_STRUCT(brw_sampler_state);
 
+   switch (key->minfilter) {
+   case GL_NEAREST:
+      sampler->ss0.min_filter = BRW_MAPFILTER_NEAREST;
+      sampler->ss0.mip_filter = BRW_MIPFILTER_NONE;
+      break;
+   case GL_LINEAR:
+      sampler->ss0.min_filter = BRW_MAPFILTER_LINEAR;
+      sampler->ss0.mip_filter = BRW_MIPFILTER_NONE;
+      break;
+   case GL_NEAREST_MIPMAP_NEAREST:
+      sampler->ss0.min_filter = BRW_MAPFILTER_NEAREST;
+      sampler->ss0.mip_filter = BRW_MIPFILTER_NEAREST;
+      break;
+   case GL_LINEAR_MIPMAP_NEAREST:
+      sampler->ss0.min_filter = BRW_MAPFILTER_LINEAR;
+      sampler->ss0.mip_filter = BRW_MIPFILTER_NEAREST;
+      break;
+   case GL_NEAREST_MIPMAP_LINEAR:
+      sampler->ss0.min_filter = BRW_MAPFILTER_NEAREST;
+      sampler->ss0.mip_filter = BRW_MIPFILTER_LINEAR;
+      break;
+   case GL_LINEAR_MIPMAP_LINEAR:
+      sampler->ss0.min_filter = BRW_MAPFILTER_LINEAR;
+      sampler->ss0.mip_filter = BRW_MIPFILTER_LINEAR;
+      break;
+   default:
+      break;
+   }
+
+   /* Set Anisotropy: 
+    */
+   if (key->max_aniso > 1.0) {
+      sampler->ss0.min_filter = BRW_MAPFILTER_ANISOTROPIC; 
+      sampler->ss0.mag_filter = BRW_MAPFILTER_ANISOTROPIC;
+
+      if (key->max_aniso > 2.0) {
+	 sampler->ss3.max_aniso = MIN2((key->max_aniso - 2) / 2,
+				       BRW_ANISORATIO_16);
+      }
+   }
+   else {
+      switch (key->magfilter) {
+      case GL_NEAREST:
+	 sampler->ss0.mag_filter = BRW_MAPFILTER_NEAREST;
+	 break;
+      case GL_LINEAR:
+	 sampler->ss0.mag_filter = BRW_MAPFILTER_LINEAR;
+	 break;
+      default:
+	 break;
+      }
+   }
+
+   sampler->ss1.r_wrap_mode = translate_wrap_mode(key->wrap_r);
+   sampler->ss1.s_wrap_mode = translate_wrap_mode(key->wrap_s);
+   sampler->ss1.t_wrap_mode = translate_wrap_mode(key->wrap_t);
+
+   /* Set LOD bias: 
+    */
+   sampler->ss0.lod_bias = S_FIXED(CLAMP(key->lod_bias, -16, 15), 6);
+
+   sampler->ss0.lod_preclamp = 1; /* OpenGL mode */
+   sampler->ss0.default_color_mode = 0; /* OpenGL/DX10 mode */
+
+   /* Set shadow function: 
+    */
+   if (key->comparemode == GL_COMPARE_R_TO_TEXTURE_ARB) {
+      /* Shadowing is "enabled" by emitting a particular sampler
+       * message (sample_c).  So need to recompile WM program when
+       * shadow comparison is enabled on each/any texture unit.
+       */
+      sampler->ss0.shadow_function =
+	 intel_translate_shadow_compare_func(key->comparefunc);
+   }
+
+   /* Set BaseMipLevel, MaxLOD, MinLOD: 
+    */
+   sampler->ss0.base_level = U_FIXED(0, 1);
+
+   sampler->ss1.max_lod = U_FIXED(MIN2(MAX2(key->maxlod, 0), 13), 6);
+   sampler->ss1.min_lod = U_FIXED(MIN2(MAX2(key->minlod, 0), 13), 6);
 
    return (void *)sampler;
 }
diff --git a/src/gallium/drivers/i965/brw_screen_surface.c b/src/gallium/drivers/i965/brw_screen_surface.c
index 544be6a089..e0df6cc629 100644
--- a/src/gallium/drivers/i965/brw_screen_surface.c
+++ b/src/gallium/drivers/i965/brw_screen_surface.c
@@ -1,27 +1,131 @@
-   /* _NEW_BUFFERS */
-   if (IS_965(brw->brw_screen->pci_id) &&
-       !IS_G4X(brw->brw_screen->pci_id)) {
-      for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
-	 struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
-	 struct intel_renderbuffer *irb = intel_renderbuffer(rb);
-
-	 /* The original gen4 hardware couldn't set up WM surfaces pointing
-	  * at an offset within a tile, which can happen when rendering to
-	  * anything but the base level of a texture or the +X face/0 depth.
-	  * This was fixed with the 4 Series hardware.
-	  *
-	  * For these original chips, you would have to make the depth and
-	  * color destination surfaces include information on the texture
-	  * type, LOD, face, and various limits to use them as a destination.
-	  * I would have done this, but there's also a nasty requirement that
-	  * the depth and the color surfaces all be of the same LOD, which
-	  * may be a worse requirement than this alignment.  (Also, we may
-	  * want to just demote the texture to untiled, instead).
-	  */
-	 if (irb->region && 
-	     irb->region->tiling != I915_TILING_NONE &&
-	     (irb->region->draw_offset & 4095)) {
-	    DBG("FALLBACK: non-tile-aligned destination for tiled FBO\n");
-	    return GL_TRUE;
-	 }
+
+#include "pipe/p_screen.h"
+#include "brw_screen.h"
+
+struct brw_surface_id {
+   unsigned face:3;
+   unsigned zslice:13;
+   unsigned level:16;
+};
+
+static boolean need_linear_view( struct brw_screen *brw_screen,
+				 struct brw_texture *brw_texture,
+				 unsigned face,
+				 unsigned level,
+				 unsigned zslice )
+{
+#if 0
+   /* XXX: what about IGDNG?
+    */
+   if (!BRW_IS_G4X(brw->brw_screen->pci_id))
+   {
+      struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
+      struct intel_renderbuffer *irb = intel_renderbuffer(rb);
+
+      /* The original gen4 hardware couldn't set up WM surfaces pointing
+       * at an offset within a tile, which can happen when rendering to
+       * anything but the base level of a texture or the +X face/0 depth.
+       * This was fixed with the 4 Series hardware.
+       *
+       * For these original chips, you would have to make the depth and
+       * color destination surfaces include information on the texture
+       * type, LOD, face, and various limits to use them as a destination.
+       *
+       * This is easy in Gallium as surfaces are all backed by
+       * textures, but there's also a nasty requirement that the depth
+       * and the color surfaces all be of the same LOD, which is
+       * harder to get around as we can't look at a surface in
+       * isolation and decide if it's legal.
+       *
+       * Instead, end up being pessimistic and say that for i965,
+       * ... ??
+       */
+      if (brw_tex->tiling != I915_TILING_NONE &&
+	  (brw_tex_image_offset(brw_tex, face, level, zslize) & 4095)) {
+	 if (BRW_DEBUG & DEBUG_VIEW)
+	    debug_printf("%s: need surface view for non-aligned tex image\n",
+			 __FUNCTION__);
+	 return GL_TRUE;
       }
+   }
+#endif
+
+   /* Tiled 3d textures don't have subsets that look like 2d surfaces:
+    */
+   
+   /* Everything else should be fine to render to in-place:
+    */
+   return GL_FALSE;
+}
+
+/* Look at all texture views and figure out if any of them need to be
+ * back-copied into the texture for sampling
+ */
+void brw_update_texture( struct pipe_screen *screen,
+			 struct pipe_texture *texture )
+{
+   /* currently nothing to do */
+}
+
+
+static struct pipe_surface *create_linear_view( struct brw_screen *brw_screen,
+						struct brw_texture *brw_tex,
+						struct brw_surface_id id )
+{
+   
+}
+
+static struct pipe_surface *create_in_place_view( struct brw_screen *brw_screen,
+						  struct brw_texture *brw_tex,
+						  struct brw_surface_id id )
+{
+   struct brw_surface *surface = CALLOC_STRUCT(brw_surface);
+   surface->id = id;
+   
+}
+
+/* Get a surface which is a view into a texture 
+ */
+struct pipe_surface *brw_get_tex_surface(struct pipe_screen *screen,
+					 struct pipe_texture *texture,
+					 unsigned face, unsigned level,
+					 unsigned zslice,
+					 unsigned usage )
+{
+   struct brw_screen *bscreen = brw_screen(screen);
+   struct brw_surface_id id;
+
+   id.face = face;
+   id.level = level;
+   id.zslice = zslice;
+
+   if (need_linear_view(brw_screen, brw_tex, id)) 
+      type = BRW_VIEW_LINEAR;
+   else
+      type = BRW_VIEW_IN_PLACE;
+
+   
+   foreach (surface, texture->views[type]) {
+      if (id.value == surface->id.value)
+	 return surface;
+   }
+
+   switch (type) {
+   case BRW_VIEW_LINEAR:
+      surface = create_linear_view( texture, id, type );
+      break;
+   case BRW_VIEW_IN_PLACE:
+      surface = create_in_place_view( texture, id, type );
+      break;
+   default:
+      return NULL;
+   }
+
+   insert_at_head( texture->views[type], surface );
+   return surface;
+}
+
+
+void brw_tex_surface_destroy( struct pipe_surface *surface )
+{
+}
diff --git a/src/gallium/drivers/i965/brw_screen_texture.c b/src/gallium/drivers/i965/brw_screen_texture.c
new file mode 100644
index 0000000000..50c30878c6
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_screen_texture.c
@@ -0,0 +1,218 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+/* Code to layout images in a mipmap tree for i965.
+ */
+
+#include "brw_tex_layout.h"
+
+#define FILE_DEBUG_FLAG DEBUG_MIPTREE
+
+GLboolean brw_miptree_layout(struct brw_context *brw,
+			     struct intel_mipmap_tree *mt,
+			     uint32_t tiling)
+{
+   /* XXX: these vary depending on image format: */
+   /* GLint align_w = 4; */
+
+   switch (mt->target) {
+   case GL_TEXTURE_CUBE_MAP:
+      if (IS_IGDNG(brw->brw_screen->pci_id)) {
+          GLuint align_h = 2, align_w = 4;
+          GLuint level;
+          GLuint x = 0;
+          GLuint y = 0;
+          GLuint width = mt->width0;
+          GLuint height = mt->height0;
+          GLuint qpitch = 0;
+          GLuint y_pitch = 0;
+
+          mt->pitch = mt->width0;
+          intel_get_texture_alignment_unit(mt->internal_format, &align_w, &align_h);
+          y_pitch = ALIGN(height, align_h);
+
+          if (mt->compressed) {
+              mt->pitch = ALIGN(mt->width0, align_w);
+          }
+
+          if (mt->last_level != 0) {
+              GLuint mip1_width;
+
+              if (mt->compressed) {
+                  mip1_width = ALIGN(minify(mt->width0), align_w)
+                      + ALIGN(minify(minify(mt->width0)), align_w);
+              } else {
+                  mip1_width = ALIGN(minify(mt->width0), align_w)
+                      + minify(minify(mt->width0));
+              }
+
+              if (mip1_width > mt->pitch) {
+                  mt->pitch = mip1_width;
+              }
+          }
+
+          mt->pitch = intel_miptree_pitch_align(intel, mt, tiling, mt->pitch);
+
+          if (mt->compressed) {
+              qpitch = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) / 4 * mt->pitch * mt->cpp;
+              mt->total_height = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) / 4 * 6;
+          } else {
+              qpitch = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) * mt->pitch * mt->cpp;
+              mt->total_height = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) * 6;
+          }
+
+          for (level = 0; level <= mt->last_level; level++) {
+              GLuint img_height;
+              GLuint nr_images = 6;
+              GLuint q = 0;
+
+              intel_miptree_set_level_info(mt, level, nr_images, x, y, width, 
+                                           height, 1);
+
+              for (q = 0; q < nr_images; q++)
+                  intel_miptree_set_image_offset_ex(mt, level, q, x, y, q * qpitch);
+
+              if (mt->compressed)
+                  img_height = MAX2(1, height/4);
+              else
+                  img_height = ALIGN(height, align_h);
+
+              if (level == 1) {
+                  x += ALIGN(width, align_w);
+              }
+              else {
+                  y += img_height;
+              }
+
+              width  = minify(width);
+              height = minify(height);
+          }
+
+          break;
+      }
+
+   case GL_TEXTURE_3D: {
+      GLuint width  = mt->width0;
+      GLuint height = mt->height0;
+      GLuint depth = mt->depth0;
+      GLuint pack_x_pitch, pack_x_nr;
+      GLuint pack_y_pitch;
+      GLuint level;
+      GLuint align_h = 2;
+      GLuint align_w = 4;
+
+      mt->total_height = 0;
+      intel_get_texture_alignment_unit(mt->internal_format, &align_w, &align_h);
+
+      if (mt->compressed) {
+          mt->pitch = ALIGN(width, align_w);
+          pack_y_pitch = (height + 3) / 4;
+      } else {
+	 mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->width0);
+	 pack_y_pitch = ALIGN(mt->height0, align_h);
+      }
+
+      pack_x_pitch = width;
+      pack_x_nr = 1;
+
+      for (level = 0 ; level <= mt->last_level ; level++) {
+	 GLuint nr_images = mt->target == GL_TEXTURE_3D ? depth : 6;
+	 GLint x = 0;
+	 GLint y = 0;
+	 GLint q, j;
+
+	 intel_miptree_set_level_info(mt, level, nr_images,
+				      0, mt->total_height,
+				      width, height, depth);
+
+	 for (q = 0; q < nr_images;) {
+	    for (j = 0; j < pack_x_nr && q < nr_images; j++, q++) {
+	       intel_miptree_set_image_offset(mt, level, q, x, y);
+	       x += pack_x_pitch;
+	    }
+
+	    x = 0;
+	    y += pack_y_pitch;
+	 }
+
+
+	 mt->total_height += y;
+	 width  = minify(width);
+	 height = minify(height);
+	 depth  = minify(depth);
+
+	 if (mt->compressed) {
+	    pack_y_pitch = (height + 3) / 4;
+
+	    if (pack_x_pitch > ALIGN(width, align_w)) {
+	       pack_x_pitch = ALIGN(width, align_w);
+	       pack_x_nr <<= 1;
+	    }
+	 } else {
+	    if (pack_x_pitch > 4) {
+	       pack_x_pitch >>= 1;
+	       pack_x_nr <<= 1;
+	       assert(pack_x_pitch * pack_x_nr <= mt->pitch);
+	    }
+
+	    if (pack_y_pitch > 2) {
+	       pack_y_pitch >>= 1;
+	       pack_y_pitch = ALIGN(pack_y_pitch, align_h);
+	    }
+	 }
+
+      }
+      /* The 965's sampler lays cachelines out according to how accesses
+       * in the texture surfaces run, so they may be "vertical" through
+       * memory.  As a result, the docs say in Surface Padding Requirements:
+       * Sampling Engine Surfaces that two extra rows of padding are required.
+       * We don't know of similar requirements for pre-965, but given that
+       * those docs are silent on padding requirements in general, let's play
+       * it safe.
+       */
+      if (mt->target == GL_TEXTURE_CUBE_MAP)
+	 mt->total_height += 2;
+      break;
+   }
+
+   default:
+      i945_miptree_layout_2d(intel, mt, tiling);
+      break;
+   }
+   DBG("%s: %dx%dx%d - sz 0x%x\n", __FUNCTION__,
+		mt->pitch,
+		mt->total_height,
+		mt->cpp,
+		mt->pitch * mt->total_height * mt->cpp );
+
+   return GL_TRUE;
+}
+
diff --git a/src/gallium/drivers/i965/brw_sf.c b/src/gallium/drivers/i965/brw_sf.c
index 1b73b3fd51..013d839e37 100644
--- a/src/gallium/drivers/i965/brw_sf.c
+++ b/src/gallium/drivers/i965/brw_sf.c
@@ -29,11 +29,12 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
   
+#include "pipe/p_state.h"
 
 #include "brw_batchbuffer.h"
-
 #include "brw_defines.h"
 #include "brw_context.h"
+#include "brw_pipe_rast.h"
 #include "brw_eu.h"
 #include "brw_util.h"
 #include "brw_sf.h"
@@ -45,7 +46,6 @@ static void compile_sf_prog( struct brw_context *brw,
    struct brw_sf_compile c;
    const GLuint *program;
    GLuint program_size;
-   GLuint i, idx;
 
    memset(&c, 0, sizeof(c));
 
@@ -54,7 +54,7 @@ static void compile_sf_prog( struct brw_context *brw,
    brw_init_compile(brw, &c.func);
 
    c.key = *key;
-   c.nr_attrs = util_count_bits(c.key.attrs);
+   c.nr_attrs = c.key.nr_attrs;
    c.nr_attr_regs = (c.nr_attrs+1)/2;
    c.nr_setup_attrs = c.key.nr_attrs;
    c.nr_setup_regs = (c.nr_setup_attrs+1)/2;
@@ -62,21 +62,6 @@ static void compile_sf_prog( struct brw_context *brw,
    c.prog_data.urb_read_length = c.nr_attr_regs;
    c.prog_data.urb_entry_size = c.nr_setup_regs * 2;
 
-   /* Construct map from attribute number to position in the vertex.
-    */
-   for (i = idx = 0; i < VERT_RESULT_MAX; i++) 
-      if (c.key.attrs & (1<<i)) {
-	 c.attr_to_idx[i] = idx;
-	 c.idx_to_attr[idx] = i;
-	 if (i >= VERT_RESULT_TEX0 && i <= VERT_RESULT_TEX7) {
-            c.point_attrs[i].CoordReplace = 
-               ctx->Point.CoordReplace[i - VERT_RESULT_TEX0];
-	 }
-         else {
-            c.point_attrs[i].CoordReplace = GL_FALSE;
-         }
-	 idx++;
-      }
    
    /* Which primitive?  Or all three? 
     */
@@ -122,7 +107,7 @@ static void compile_sf_prog( struct brw_context *brw,
 
 /* Calculate interpolants for triangle and line rasterization.
  */
-static void upload_sf_prog(struct brw_context *brw)
+static int upload_sf_prog(struct brw_context *brw)
 {
    struct brw_sf_prog_key key;
 
@@ -131,46 +116,49 @@ static void upload_sf_prog(struct brw_context *brw)
    /* Populate the key, noting state dependencies:
     */
    /* CACHE_NEW_VS_PROG */
-   key.attrs = brw->vs.prog_data->nr_outputs_written; 
+   key.nr_attrs = brw->curr.vertex_shader->info.file_max[TGSI_FILE_OUTPUT] + 1;
+
+
+   /* XXX: this is probably where the mapping between vertex shader
+    * outputs and fragment shader inputs should be handled.  Assume
+    * for now 1:1 correspondence.
+    *
+    * XXX: scan frag shader inputs to work out linear vs. perspective
+    * interpolation below.
+    *
+    * XXX: as long as we're hard-wiring, is eg. position required to
+    * be linear?
+    */
+   key.linear_attrs = 0;
+   key.persp_attrs = (1 << key.nr_attrs) - 1;
 
    /* BRW_NEW_REDUCED_PRIMITIVE */
    switch (brw->reduced_primitive) {
-   case GL_TRIANGLES: 
-      /* NOTE: We just use the edgeflag attribute as an indicator that
-       * unfilled triangles are active.  We don't actually do the
-       * edgeflag testing here, it is already done in the clip
-       * program.
+   case PIPE_PRIM_TRIANGLES: 
+      /* PIPE_NEW_RAST
        */
-      if (key.attrs & (1<<VERT_RESULT_EDGE))
+      if (brw->curr.rast->templ.fill_cw != PIPE_POLYGON_MODE_FILL ||
+	  brw->curr.rast->templ.fill_ccw != PIPE_POLYGON_MODE_FILL)
 	 key.primitive = SF_UNFILLED_TRIS;
       else
 	 key.primitive = SF_TRIANGLES;
       break;
-   case GL_LINES: 
+   case PIPE_PRIM_LINES: 
       key.primitive = SF_LINES; 
       break;
-   case GL_POINTS: 
+   case PIPE_PRIM_POINTS: 
       key.primitive = SF_POINTS; 
       break;
    }
 
-   key.do_point_sprite = ctx->Point.PointSprite;
-   key.SpriteOrigin = ctx->Point.SpriteOrigin;
-   /* _NEW_LIGHT */
-   key.do_flat_shading = (ctx->Light.ShadeModel == GL_FLAT);
-   key.do_twoside_color = (ctx->Light.Enabled && ctx->Light.Model.TwoSide);
+   key.do_point_sprite = brw->curr.rast->templ.point_sprite;
+   key.sprite_origin_lower_left = 0; /* XXX: ctx->Point.SpriteOrigin - fix rast state */
+   key.do_flat_shading = brw->curr.rast->templ.flatshade;
+   key.do_twoside_color = brw->curr.rast->templ.light_twoside;
 
-   /* _NEW_HINT */
-   key.linear_color = 0;
-
-   /* _NEW_POLYGON */
    if (key.do_twoside_color) {
-      /* If we're rendering to a FBO, we have to invert the polygon
-       * face orientation, just as we invert the viewport in
-       * sf_unit_create_from_key().  ctx->DrawBuffer->Name will be
-       * nonzero if we're rendering to such an FBO.
-       */
-      key.frontface_ccw = (ctx->Polygon.FrontFace == GL_CCW) ^ (ctx->DrawBuffer->Name != 0);
+      key.frontface_ccw = (brw->curr.rast->templ.front_winding == 
+			   PIPE_WINDING_CCW);
    }
 
    brw->sws->bo_unreference(brw->sf.prog_bo);
@@ -180,14 +168,16 @@ static void upload_sf_prog(struct brw_context *brw)
 				      &brw->sf.prog_data);
    if (brw->sf.prog_bo == NULL)
       compile_sf_prog( brw, &key );
+
+   return 0;
 }
 
 
 const struct brw_tracked_state brw_sf_prog = {
    .dirty = {
-      .mesa  = (_NEW_HINT | _NEW_LIGHT | _NEW_POLYGON | _NEW_POINT),
+      .mesa  = (PIPE_NEW_RAST | PIPE_NEW_VERTEX_SHADER),
       .brw   = (BRW_NEW_REDUCED_PRIMITIVE),
-      .cache = CACHE_NEW_VS_PROG
+      .cache = 0
    },
    .prepare = upload_sf_prog
 };
diff --git a/src/gallium/drivers/i965/brw_sf.h b/src/gallium/drivers/i965/brw_sf.h
index c99116b8b1..0b7003dc5e 100644
--- a/src/gallium/drivers/i965/brw_sf.h
+++ b/src/gallium/drivers/i965/brw_sf.h
@@ -49,14 +49,21 @@ struct brw_sf_prog_key {
     */
    GLuint persp_attrs:32;
    GLuint linear_attrs:32;
+   GLuint point_coord_replace_attrs:32;
 
+   GLuint nr_attrs:8;
    GLuint primitive:2;
    GLuint do_twoside_color:1;
    GLuint do_flat_shading:1;
    GLuint frontface_ccw:1;
    GLuint do_point_sprite:1;
    GLuint sprite_origin_lower_left:1;
-   GLuint pad:25;
+   GLuint pad:17;
+
+   GLuint attr_col0:8;
+   GLuint attr_col1:8;
+   GLuint attr_bfc0:8;
+   GLuint attr_bfc1:8;
 };
 
 struct brw_sf_point_tex {
@@ -101,9 +108,7 @@ struct brw_sf_compile {
    GLuint nr_setup_attrs;
    GLuint nr_setup_regs;
 
-   GLubyte attr_to_idx[VERT_RESULT_MAX];   
-   GLubyte idx_to_attr[VERT_RESULT_MAX];   
-   struct brw_sf_point_tex point_attrs[VERT_RESULT_MAX];
+   GLuint point_coord_replace_mask;
 };
 
  
diff --git a/src/gallium/drivers/i965/brw_sf_emit.c b/src/gallium/drivers/i965/brw_sf_emit.c
index 4acb2b7d72..db52c9553e 100644
--- a/src/gallium/drivers/i965/brw_sf_emit.c
+++ b/src/gallium/drivers/i965/brw_sf_emit.c
@@ -43,17 +43,12 @@ static struct brw_reg get_vert_attr(struct brw_sf_compile *c,
 				    struct brw_reg vert,
 				    GLuint attr)
 {
-   GLuint off = c->attr_to_idx[attr] / 2;
-   GLuint sub = c->attr_to_idx[attr] % 2;
+   GLuint off = attr / 2;
+   GLuint sub = attr % 2;
 
    return brw_vec4_grf(vert.nr + off, sub * 4);
 }
 
-static GLboolean have_attr(struct brw_sf_compile *c,
-			   GLuint attr)
-{
-   return (c->key.attrs & (1<<attr)) ? 1 : 0;
-}
 
 /*********************************************************************** 
  * Twoside lighting
@@ -62,15 +57,16 @@ static void copy_bfc( struct brw_sf_compile *c,
 		      struct brw_reg vert )
 {
    struct brw_compile *p = &c->func;
-   GLuint i;
 
-   for (i = 0; i < 2; i++) {
-      if (have_attr(c, VERT_RESULT_COL0+i) &&
-	  have_attr(c, VERT_RESULT_BFC0+i))
-	 brw_MOV(p, 
-		 get_vert_attr(c, vert, VERT_RESULT_COL0+i), 
-		 get_vert_attr(c, vert, VERT_RESULT_BFC0+i));
-   }
+   if (c->key.attr_col0 && c->key.attr_bfc0)
+      brw_MOV(p, 
+	      get_vert_attr(c, vert, c->key.attr_col0), 
+	      get_vert_attr(c, vert, c->key.attr_bfc0));
+
+   if (c->key.attr_col1 && c->key.attr_bfc1)
+      brw_MOV(p, 
+	      get_vert_attr(c, vert, c->key.attr_col1), 
+	      get_vert_attr(c, vert, c->key.attr_bfc1));
 }
 
 
@@ -89,8 +85,8 @@ static void do_twoside_color( struct brw_sf_compile *c )
     * for user-supplied vertex programs, as t_vp_build.c always does
     * the right thing.
     */
-   if (!(have_attr(c, VERT_RESULT_COL0) && have_attr(c, VERT_RESULT_BFC0)) &&
-       !(have_attr(c, VERT_RESULT_COL1) && have_attr(c, VERT_RESULT_BFC1)))
+   if (!(c->key.attr_col0 && c->key.attr_bfc0) &&
+       !(c->key.attr_col1 && c->key.attr_bfc1))
       return;
    
    /* Need to use BRW_EXECUTE_4 and also do an 4-wide compare in order
@@ -126,14 +122,17 @@ static void copy_colors( struct brw_sf_compile *c,
 		     struct brw_reg src)
 {
    struct brw_compile *p = &c->func;
-   GLuint i;
 
-   for (i = VERT_RESULT_COL0; i <= VERT_RESULT_COL1; i++) {
-      if (have_attr(c,i))
-	 brw_MOV(p, 
-		 get_vert_attr(c, dst, i), 
-		 get_vert_attr(c, src, i));
-   }
+   if (c->key.attr_col0)
+      brw_MOV(p, 
+	      get_vert_attr(c, dst, c->key.attr_col0), 
+	      get_vert_attr(c, src, c->key.attr_col0));
+
+   if (c->key.attr_col1)
+      brw_MOV(p, 
+	      get_vert_attr(c, dst, c->key.attr_col1), 
+	      get_vert_attr(c, src, c->key.attr_col1));
+
 }
 
 
@@ -146,10 +145,16 @@ static void do_flatshade_triangle( struct brw_sf_compile *c )
 {
    struct brw_compile *p = &c->func;
    struct brw_reg ip = brw_ip_reg();
-   GLuint nr = util_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
    GLuint jmpi = 1;
+   GLuint nr = 0;
 
-   if (!nr)
+   if (c->key.attr_col0)
+      nr++;
+
+   if (c->key.attr_col1)
+      nr++;
+
+   if (nr == 0)
       return;
 
    /* Already done in clip program:
@@ -184,10 +189,16 @@ static void do_flatshade_line( struct brw_sf_compile *c )
 {
    struct brw_compile *p = &c->func;
    struct brw_reg ip = brw_ip_reg();
-   GLuint nr = util_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
    GLuint jmpi = 1;
+   GLuint nr = 0;
+
+   if (c->key.attr_col0)
+      nr++;
+
+   if (c->key.attr_col1)
+      nr++;
 
-   if (!nr)
+   if (nr == 0)
       return;
 
    /* Already done in clip program: 
@@ -319,10 +330,10 @@ static GLboolean calculate_masks( struct brw_sf_compile *c,
    *pc_linear = 0;
    *pc = 0xf;
       
-   if (persp_mask & (1 << c->idx_to_attr[reg*2])) 
+   if (persp_mask & (1 << (reg*2))) 
       *pc_persp = 0xf;
 
-   if (linear_mask & (1 << c->idx_to_attr[reg*2])) 
+   if (linear_mask & (1 << (reg*2))) 
       *pc_linear = 0xf;
 
    /* Maybe only processs one attribute on the final round:
@@ -330,10 +341,10 @@ static GLboolean calculate_masks( struct brw_sf_compile *c,
    if (reg*2+1 < c->nr_setup_attrs) {
       *pc |= 0xf0;
 
-      if (persp_mask & (1 << c->idx_to_attr[reg*2+1])) 
+      if (persp_mask & (1 << (reg*2+1))) 
 	 *pc_persp |= 0xf0;
 
-      if (linear_mask & (1 << c->idx_to_attr[reg*2+1])) 
+      if (linear_mask & (1 << (reg*2+1))) 
 	 *pc_linear |= 0xf0;
    }
 
@@ -513,24 +524,28 @@ void brw_emit_point_sprite_setup( struct brw_sf_compile *c, GLboolean allocate)
       alloc_regs(c);
 
    copy_z_inv_w(c);
+
    for (i = 0; i < c->nr_setup_regs; i++)
    {
-      struct brw_sf_point_tex *tex = &c->point_attrs[c->idx_to_attr[2*i]];
+      /* XXX: only seems to check point_coord_replace_attrs for every
+       * second attribute?!?
+       */
+      boolean coord_replace = !!(c->key.point_coord_replace_attrs & (1<<(2*i)));
       struct brw_reg a0 = offset(c->vert[0], i);
       GLushort pc, pc_persp, pc_linear;
       GLboolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
             
       if (pc_persp)
       {				
-	  if (!tex->CoordReplace) {
-	      brw_set_predicate_control_flag_value(p, pc_persp);
-	      brw_MUL(p, a0, a0, c->inv_w[0]);
-	  }
+	 if (!coord_replace) { /* only attrs that are not replaced get the 1/w multiply */
+	    brw_set_predicate_control_flag_value(p, pc_persp);
+	    brw_MUL(p, a0, a0, c->inv_w[0]);
+	 }
       }
 
-      if (tex->CoordReplace) {
-	  /* Caculate 1.0/PointWidth */
-	  brw_math(&c->func,
+      if (coord_replace) {
+	 /* Calculate 1.0/PointWidth */
+	 brw_math(&c->func,
 		  c->tmp,
 		  BRW_MATH_FUNCTION_INV,
 		  BRW_MATH_SATURATE_NONE,
@@ -539,33 +554,37 @@ void brw_emit_point_sprite_setup( struct brw_sf_compile *c, GLboolean allocate)
 		  BRW_MATH_DATA_SCALAR,
 		  BRW_MATH_PRECISION_FULL);
 
-	  if (c->key.SpriteOrigin == GL_LOWER_LEFT) {
-	   	brw_MUL(p, c->m1Cx, c->tmp, c->inv_w[0]);
-		brw_MOV(p, vec1(suboffset(c->m1Cx, 1)), brw_imm_f(0.0));
-	  	brw_MUL(p, c->m2Cy, c->tmp, negate(c->inv_w[0]));
-		brw_MOV(p, vec1(suboffset(c->m2Cy, 0)), brw_imm_f(0.0));
-	  } else {
-	   	brw_MUL(p, c->m1Cx, c->tmp, c->inv_w[0]);
-		brw_MOV(p, vec1(suboffset(c->m1Cx, 1)), brw_imm_f(0.0));
-	  	brw_MUL(p, c->m2Cy, c->tmp, c->inv_w[0]);
-		brw_MOV(p, vec1(suboffset(c->m2Cy, 0)), brw_imm_f(0.0));
-	  }
-      } else {
-	  brw_MOV(p, c->m1Cx, brw_imm_ud(0));
-	  brw_MOV(p, c->m2Cy, brw_imm_ud(0));
+	 if (c->key.sprite_origin_lower_left) {
+	    brw_MUL(p, c->m1Cx, c->tmp, c->inv_w[0]);
+	    brw_MOV(p, vec1(suboffset(c->m1Cx, 1)), brw_imm_f(0.0));
+	    brw_MUL(p, c->m2Cy, c->tmp, negate(c->inv_w[0]));
+	    brw_MOV(p, vec1(suboffset(c->m2Cy, 0)), brw_imm_f(0.0));
+	 } 
+	 else {
+	    brw_MUL(p, c->m1Cx, c->tmp, c->inv_w[0]);
+	    brw_MOV(p, vec1(suboffset(c->m1Cx, 1)), brw_imm_f(0.0));
+	    brw_MUL(p, c->m2Cy, c->tmp, c->inv_w[0]);
+	    brw_MOV(p, vec1(suboffset(c->m2Cy, 0)), brw_imm_f(0.0));
+	 }
+      } 
+      else {
+	 brw_MOV(p, c->m1Cx, brw_imm_ud(0));
+	 brw_MOV(p, c->m2Cy, brw_imm_ud(0));
       }
 
       {
 	 brw_set_predicate_control_flag_value(p, pc); 
-	 if (tex->CoordReplace) {
-	     if (c->key.sprite_origin_lower_left) {
-		 brw_MUL(p, c->m3C0, c->inv_w[0], brw_imm_f(1.0));
-		 brw_MOV(p, vec1(suboffset(c->m3C0, 0)), brw_imm_f(0.0));
-	     }
-	     else
-		 brw_MOV(p, c->m3C0, brw_imm_f(0.0));
-	 } else {
-	 	brw_MOV(p, c->m3C0, a0); /* constant value */
+	 if (coord_replace) {
+	    if (c->key.sprite_origin_lower_left) {
+	       brw_MUL(p, c->m3C0, c->inv_w[0], brw_imm_f(1.0));
+	       brw_MOV(p, vec1(suboffset(c->m3C0, 0)), brw_imm_f(0.0));
+	    }
+	    else {
+	       brw_MOV(p, c->m3C0, brw_imm_f(0.0));
+	    }
+	 } 
+	 else {
+	    brw_MOV(p, c->m3C0, a0); /* constant value */
 	 }
 
 	 /* Copy m0..m3 to URB. 
diff --git a/src/gallium/drivers/i965/brw_sf_state.c b/src/gallium/drivers/i965/brw_sf_state.c
index 648a16a038..fbc9f15eb4 100644
--- a/src/gallium/drivers/i965/brw_sf_state.c
+++ b/src/gallium/drivers/i965/brw_sf_state.c
@@ -29,58 +29,48 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
    
+#include "util/u_math.h"
 
+#include "pipe/p_state.h"
 
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
+#include "brw_debug.h"
+#include "brw_pipe_rast.h"
 
-static void upload_sf_vp(struct brw_context *brw)
+static int upload_sf_vp(struct brw_context *brw)
 {
-   const GLfloat depth_scale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
+   const struct pipe_viewport_state *vp = &brw->curr.vp;
+   const struct pipe_scissor_state *scissor = &brw->curr.scissor;
    struct brw_sf_viewport sfv;
-   GLfloat y_scale, y_bias;
-   const GLfloat *v = ctx->Viewport._WindowMap.m;
 
    memset(&sfv, 0, sizeof(sfv));
 
-   y_scale = 1.0;
-   y_bias = 0;
+   /* PIPE_NEW_VIEWPORT, PIPE_NEW_SCISSOR */
 
-   /* _NEW_VIEWPORT */
+   sfv.viewport.m00 = vp->scale[0];
+   sfv.viewport.m11 = vp->scale[1];
+   sfv.viewport.m22 = vp->scale[2];
+   sfv.viewport.m30 = vp->translate[0];
+   sfv.viewport.m31 = vp->translate[1];
+   sfv.viewport.m32 = vp->translate[2];
 
-   sfv.viewport.m00 = v[MAT_SX];
-   sfv.viewport.m11 = v[MAT_SY] * y_scale;
-   sfv.viewport.m22 = v[MAT_SZ] * depth_scale;
-   sfv.viewport.m30 = v[MAT_TX];
-   sfv.viewport.m31 = v[MAT_TY] * y_scale + y_bias;
-   sfv.viewport.m32 = v[MAT_TZ] * depth_scale;
-
-   /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT
-    * for DrawBuffer->_[XY]{min,max}
-    */
-
-   /* The scissor only needs to handle the intersection of drawable and
-    * scissor rect.
-    *
-    * Note that the hardware's coordinates are inclusive, while Mesa's min is
-    * inclusive but max is exclusive.
-    */
-   /* Y=0=bottom */
-   sfv.scissor.xmin = ctx->DrawBuffer->_Xmin;
-   sfv.scissor.xmax = ctx->DrawBuffer->_Xmax - 1;
-   sfv.scissor.ymin = ctx->DrawBuffer->_Ymin;
-   sfv.scissor.ymax = ctx->DrawBuffer->_Ymax - 1;
+   sfv.scissor.xmin = scissor->minx;
+   sfv.scissor.xmax = scissor->maxx; /* NOTE(review): needs -1 if maxx is exclusive and hw bound inclusive -- confirm */
+   sfv.scissor.ymin = scissor->miny;
+   sfv.scissor.ymax = scissor->maxy; /* NOTE(review): needs -1 if maxy is exclusive and hw bound inclusive -- confirm */
 
    brw->sws->bo_unreference(brw->sf.vp_bo);
    brw->sf.vp_bo = brw_cache_data( &brw->cache, BRW_SF_VP, &sfv, NULL, 0 );
+
+   return 0;
 }
 
 const struct brw_tracked_state brw_sf_vp = {
    .dirty = {
-      .mesa  = (_NEW_VIEWPORT | 
-		_NEW_SCISSOR |
-		_NEW_BUFFERS),
+      .mesa  = (PIPE_NEW_VIEWPORT | 
+		PIPE_NEW_SCISSOR),
       .brw   = 0,
       .cache = 0
    },
@@ -90,15 +80,17 @@ const struct brw_tracked_state brw_sf_vp = {
 struct brw_sf_unit_key {
    unsigned int total_grf;
    unsigned int urb_entry_read_length;
-
    unsigned int nr_urb_entries, urb_size, sfsize;
-
-   GLenum front_face, cull_face, provoking_vertex;
+   
    unsigned scissor:1;
    unsigned line_smooth:1;
    unsigned point_sprite:1;
    unsigned point_attenuated:1;
-   unsigned render_to_fbo:1;
+   unsigned front_face:2;             /* PIPE_WINDING_x, copied from rast->front_winding */
+   unsigned cull_mode:2;              /* PIPE_WINDING_x, copied from rast->cull_mode */
+   unsigned flatshade_first:1;        /* picks the sf7 *_pv provoking-vertex values */
+   unsigned gl_rasterization_rules:1; /* when set, sf6 dest_org h/v bias is 0x8, else 0 */
+   unsigned line_last_pixel_enable:1; /* copied to sf7.line_last_pixel_enable */
    float line_width;
    float point_size;
 };
@@ -106,6 +98,7 @@ struct brw_sf_unit_key {
 static void
 sf_unit_populate_key(struct brw_context *brw, struct brw_sf_unit_key *key)
 {
+   const struct pipe_rasterizer_state *rast = &brw->curr.rast->templ;
    memset(key, 0, sizeof(*key));
 
    /* CACHE_NEW_SF_PROG */
@@ -117,25 +110,22 @@ sf_unit_populate_key(struct brw_context *brw, struct brw_sf_unit_key *key)
    key->urb_size = brw->urb.vsize;
    key->sfsize = brw->urb.sfsize;
 
-   key->scissor = ctx->Scissor.Enabled;
-   key->front_face = ctx->Polygon.FrontFace;
-
-   if (ctx->Polygon.CullFlag)
-      key->cull_face = ctx->Polygon.CullFaceMode;
-   else
-      key->cull_face = GL_NONE;
-
-   key->line_width = ctx->Line.Width;
-   key->line_smooth = ctx->Line.SmoothFlag;
-
-   key->point_sprite = ctx->Point.PointSprite;
-   key->point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
-   key->point_attenuated = ctx->Point._Attenuated;
-
-   /* _NEW_LIGHT */
-   key->provoking_vertex = ctx->Light.ProvokingVertex;
-
-   key->render_to_fbo = 1;
+   /* PIPE_NEW_RAST */
+   key->scissor = rast->scissor;
+   key->front_face = rast->front_winding;
+   key->cull_mode = rast->cull_mode;
+   key->line_smooth = rast->line_smooth;
+   key->line_width = rast->line_width;
+   key->flatshade_first = rast->flatshade_first;
+   key->line_last_pixel_enable = rast->line_last_pixel;
+   key->gl_rasterization_rules = rast->gl_rasterization_rules;
+
+   key->point_sprite = rast->point_sprite;
+   key->point_attenuated = rast->point_size_per_vertex;
+
+   key->point_size = CLAMP(rast->point_size, 
+			   rast->point_size_min, 
+			   rast->point_size_max);
 }
 
 static struct brw_winsys_buffer *
@@ -147,7 +137,7 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
    int chipset_max_threads;
    memset(&sf, 0, sizeof(sf));
 
-   sf.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1;
+   sf.thread0.grf_reg_count = align(key->total_grf, 16) / 16 - 1;
    sf.thread0.kernel_start_pointer = brw->sf.prog_bo->offset >> 6; /* reloc */
 
    sf.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
@@ -174,10 +164,10 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
 
    sf.thread4.max_threads = MIN2(chipset_max_threads, key->nr_urb_entries) - 1;
 
-   if (INTEL_DEBUG & DEBUG_SINGLE_THREAD)
+   if (BRW_DEBUG & DEBUG_SINGLE_THREAD)
       sf.thread4.max_threads = 0;
 
-   if (INTEL_DEBUG & DEBUG_STATS)
+   if (BRW_DEBUG & DEBUG_STATS)
       sf.thread4.stats_enable = 1;
 
    /* CACHE_NEW_SF_VP */
@@ -185,31 +175,30 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
 
    sf.sf5.viewport_transform = 1;
 
-   /* _NEW_SCISSOR */
    if (key->scissor)
       sf.sf6.scissor = 1;
 
-   /* _NEW_POLYGON */
-   if (key->front_face == GL_CCW)
+   if (key->front_face == PIPE_WINDING_CCW)
       sf.sf5.front_winding = BRW_FRONTWINDING_CCW;
    else
       sf.sf5.front_winding = BRW_FRONTWINDING_CW;
 
-   switch (key->cull_face) {
-   case GL_FRONT:
-      sf.sf6.cull_mode = BRW_CULLMODE_FRONT;
+   switch (key->cull_mode) {
+   case PIPE_WINDING_CCW:
+   case PIPE_WINDING_CW:
+      sf.sf6.cull_mode = (key->front_face == key->cull_mode ?
+			  BRW_CULLMODE_FRONT :
+			  BRW_CULLMODE_BACK);
       break;
-   case GL_BACK:
-      sf.sf6.cull_mode = BRW_CULLMODE_BACK;
-      break;
-   case GL_FRONT_AND_BACK:
+   case PIPE_WINDING_BOTH:
       sf.sf6.cull_mode = BRW_CULLMODE_BOTH;
       break;
-   case GL_NONE:
+   case PIPE_WINDING_NONE:
       sf.sf6.cull_mode = BRW_CULLMODE_NONE;
       break;
    default:
       assert(0);
+      sf.sf6.cull_mode = BRW_CULLMODE_NONE;
       break;
    }
 
@@ -223,9 +212,9 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
    else if (sf.sf6.line_width <= 0x2)
        sf.sf6.line_width = 0;
 
-   /* _NEW_BUFFERS */
-   key->render_to_fbo = 1;
-   if (!key->render_to_fbo) {
+   /* XXX: gl_rasterization_rules?  something else?
+    */
+   if (0) {
       /* Rendering to an OpenGL window */
       sf.sf6.point_rast_rule = BRW_RASTRULE_UPPER_RIGHT;
    }
@@ -261,7 +250,7 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
 
    /* might be BRW_NEW_PRIMITIVE if we have to adjust pv for polygons:
     */
-   if (key->provoking_vertex == GL_LAST_VERTEX_CONVENTION) {
+   if (!key->flatshade_first) {
       sf.sf7.trifan_pv = 2;
       sf.sf7.linestrip_pv = 1;
       sf.sf7.tristrip_pv = 2;
@@ -270,12 +259,19 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
       sf.sf7.linestrip_pv = 0;
       sf.sf7.tristrip_pv = 0;
    }
-   sf.sf7.line_last_pixel_enable = 0;
+
+   sf.sf7.line_last_pixel_enable = key->line_last_pixel_enable;
 
    /* Set bias for OpenGL rasterization rules:
     */
-   sf.sf6.dest_org_vbias = 0x8;
-   sf.sf6.dest_org_hbias = 0x8;
+   if (key->gl_rasterization_rules) {
+      sf.sf6.dest_org_vbias = 0x8;
+      sf.sf6.dest_org_hbias = 0x8;
+   }
+   else {
+      sf.sf6.dest_org_vbias = 0x0;
+      sf.sf6.dest_org_hbias = 0x0;
+   }
 
    bo = brw_upload_cache(&brw->cache, BRW_SF_UNIT,
 			 key, sizeof(*key),
@@ -287,23 +283,23 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
     * something loaded through the GPE (L2 ISC), so it's INSTRUCTION domain.
     */
    /* Emit SF program relocation */
-   dri_bo_emit_reloc(bo,
-		     I915_GEM_DOMAIN_INSTRUCTION, 0,
-		     sf.thread0.grf_reg_count << 1,
-		     offsetof(struct brw_sf_unit_state, thread0),
-		     brw->sf.prog_bo);
+   brw->sws->bo_emit_reloc(bo,
+			   I915_GEM_DOMAIN_INSTRUCTION, 0,
+			   sf.thread0.grf_reg_count << 1,
+			   offsetof(struct brw_sf_unit_state, thread0),
+			   brw->sf.prog_bo);
 
    /* Emit SF viewport relocation */
-   dri_bo_emit_reloc(bo,
-		     I915_GEM_DOMAIN_INSTRUCTION, 0,
-		     sf.sf5.front_winding | (sf.sf5.viewport_transform << 1),
-		     offsetof(struct brw_sf_unit_state, sf5),
-		     brw->sf.vp_bo);
+   brw->sws->bo_emit_reloc(bo,
+			   I915_GEM_DOMAIN_INSTRUCTION, 0,
+			   sf.sf5.front_winding | (sf.sf5.viewport_transform << 1),
+			   offsetof(struct brw_sf_unit_state, sf5),
+			   brw->sf.vp_bo);
 
    return bo;
 }
 
-static void upload_sf_unit( struct brw_context *brw )
+static int upload_sf_unit( struct brw_context *brw )
 {
    struct brw_sf_unit_key key;
    struct brw_winsys_buffer *reloc_bufs[2];
@@ -321,16 +317,12 @@ static void upload_sf_unit( struct brw_context *brw )
    if (brw->sf.state_bo == NULL) {
       brw->sf.state_bo = sf_unit_create_from_key(brw, &key, reloc_bufs);
    }
+   return 0;
 }
 
 const struct brw_tracked_state brw_sf_unit = {
    .dirty = {
-      .mesa  = (_NEW_POLYGON | 
-		_NEW_LIGHT |
-		_NEW_LINE | 
-		_NEW_POINT | 
-		_NEW_SCISSOR |
-		_NEW_BUFFERS),
+      .mesa  = (PIPE_NEW_RAST),
       .brw   = BRW_NEW_URB_FENCE,
       .cache = (CACHE_NEW_SF_VP |
 		CACHE_NEW_SF_PROG)
diff --git a/src/gallium/drivers/i965/brw_state.h b/src/gallium/drivers/i965/brw_state.h
index 663fc839df..2275e9ad69 100644
--- a/src/gallium/drivers/i965/brw_state.h
+++ b/src/gallium/drivers/i965/brw_state.h
@@ -168,9 +168,20 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
 void brw_destroy_batch_cache( struct brw_context *brw );
 void brw_clear_batch_cache( struct brw_context *brw );
 
-/* brw_wm_surface_state.c */
+/***********************************************************************
+ * brw_wm_surface_state.c 
+ */
 struct brw_winsys_buffer *
 brw_create_constant_surface( struct brw_context *brw,
                              struct brw_surface_key *key );
 
+/***********************************************************************
+ * brw_state_debug.c
+ */
+void brw_update_dirty_counts( unsigned mesa,
+			      unsigned brw,
+			      unsigned cache );
+
+
+
 #endif
diff --git a/src/gallium/drivers/i965/brw_state_batch.c b/src/gallium/drivers/i965/brw_state_batch.c
index 324fce5163..7d212e5c24 100644
--- a/src/gallium/drivers/i965/brw_state_batch.c
+++ b/src/gallium/drivers/i965/brw_state_batch.c
@@ -46,7 +46,7 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
    struct brw_cached_batch_item *item = brw->cached_batch_items;
    struct header *newheader = (struct header *)data;
 
-   if (brw->emit_state_always) {
+   if (brw->flags.always_emit_state) {
       brw_batchbuffer_data(brw->batch, data, sz, IGNORE_CLIPRECTS);
       return GL_TRUE;
    }
@@ -56,8 +56,8 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
 	 if (item->sz == sz && memcmp(item->header, newheader, sz) == 0)
 	    return GL_FALSE;
 	 if (item->sz != sz) {
-	    _mesa_free(item->header);
-	    item->header = _mesa_malloc(sz);
+	    FREE(item->header);
+	    item->header = MALLOC(sz);
 	    item->sz = sz;
 	 }
 	 goto emit;
@@ -67,7 +67,7 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
 
    assert(!item);
    item = CALLOC_STRUCT(brw_cached_batch_item);
-   item->header = _mesa_malloc(sz);
+   item->header = MALLOC(sz);
    item->sz = sz;
    item->next = brw->cached_batch_items;
    brw->cached_batch_items = item;
diff --git a/src/gallium/drivers/i965/brw_state_cache.c b/src/gallium/drivers/i965/brw_state_cache.c
index 97f88b3ab3..4310d01ba2 100644
--- a/src/gallium/drivers/i965/brw_state_cache.c
+++ b/src/gallium/drivers/i965/brw_state_cache.c
@@ -55,7 +55,9 @@
  * only one of the two buffers referenced gets put into the offset, and the
  * incorrect program is run for the other instance.
  */
+#include "util/u_memory.h"
 
+#include "brw_debug.h"
 #include "brw_state.h"
 #include "brw_batchbuffer.h"
 
@@ -107,9 +109,9 @@ update_cache_last(struct brw_cache *cache, enum brw_cache_id cache_id,
    if (bo == cache->last_bo[cache_id])
       return; /* no change */
 
-   brw->sws->bo_unreference(cache->last_bo[cache_id]);
+   cache->sws->bo_unreference(cache->last_bo[cache_id]);
    cache->last_bo[cache_id] = bo;
-   brw->sws->bo_reference(cache->last_bo[cache_id]);
+   cache->sws->bo_reference(cache->last_bo[cache_id]);
    cache->brw->state.dirty.cache |= 1 << cache_id;
 }
 
@@ -127,7 +129,7 @@ search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
    for (c = cache->items[hash % cache->size]; c; c = c->next)
       bucketcount++;
 
-   fprintf(stderr, "bucket %d/%d = %d/%d items\n", hash % cache->size,
+   debug_printf("bucket %d/%d = %d/%d items\n", hash % cache->size,
 	   cache->size, bucketcount, cache->n_items);
 #endif
 
@@ -154,7 +156,7 @@ rehash(struct brw_cache *cache)
    GLuint size, i;
 
    size = cache->size * 3;
-   items = (struct brw_cache_item**) _mesa_calloc(size * sizeof(*items));
+   items = (struct brw_cache_item**) CALLOC(size, sizeof(*items));
 
    for (i = 0; i < cache->size; i++)
       for (c = cache->items[i]; c; c = next) {
@@ -194,7 +196,7 @@ brw_search_cache(struct brw_cache *cache,
 
    update_cache_last(cache, cache_id, item->bo);
 
-   brw->sws->bo_reference(item->bo);
+   cache->sws->bo_reference(item->bo);
    return item->bo;
 }
 
@@ -219,20 +221,25 @@ brw_upload_cache( struct brw_cache *cache,
    struct brw_winsys_buffer *bo;
    int i;
 
-   /* Create the buffer object to contain the data */
-   bo = brw->sws->bo_alloc(cache->sws,
-			   cache->buffer_type[cache_id], data_size, 1 << 6);
+   /* Create the buffer object to contain the data.  For now, use a
+    * single buffer type to describe all cached state atoms.  Later,
+    * may want to take advantage of hardware distinctions between
+    * these various entities.
+    */
+   bo = cache->sws->bo_alloc(cache->sws,
+			     BRW_BUFFER_TYPE_STATE_CACHE, 
+			     data_size, 1 << 6);
 
 
    /* Set up the memory containing the key, aux_data, and reloc_bufs */
-   tmp = _mesa_malloc(key_size + aux_size + relocs_size);
+   tmp = MALLOC(key_size + aux_size + relocs_size);
 
    memcpy(tmp, key, key_size);
    memcpy(tmp + key_size, aux, cache->aux_size[cache_id]);
    memcpy(tmp + key_size + aux_size, reloc_bufs, relocs_size);
    for (i = 0; i < nr_reloc_bufs; i++) {
       if (reloc_bufs[i] != NULL)
-	 brw->sws->bo_reference(reloc_bufs[i]);
+	 cache->sws->bo_reference(reloc_bufs[i]);
    }
 
    item->cache_id = cache_id;
@@ -243,7 +250,7 @@ brw_upload_cache( struct brw_cache *cache,
    item->nr_reloc_bufs = nr_reloc_bufs;
 
    item->bo = bo;
-   brw->sws->bo_reference(bo);
+   cache->sws->bo_reference(bo);
    item->data_size = data_size;
 
    if (cache->n_items > cache->size * 1.5)
@@ -259,13 +266,13 @@ brw_upload_cache( struct brw_cache *cache,
       *(void **)aux_return = (void *)((char *)item->key + item->key_size);
    }
 
-   if (INTEL_DEBUG & DEBUG_STATE)
-      _mesa_printf("upload %s: %d bytes to cache id %d\n",
+   if (BRW_DEBUG & DEBUG_STATE)
+      debug_printf("upload %s: %d bytes to cache id %d\n",
 		   cache->name[cache_id],
 		   data_size, cache_id);
 
    /* Copy data to the buffer */
-   dri_bo_subdata(bo, 0, data_size, data);
+   cache->sws->bo_subdata(bo, 0, data_size, data);
 
    update_cache_last(cache, cache_id, bo);
 
@@ -292,7 +299,7 @@ brw_cache_data_sz(struct brw_cache *cache,
 		       reloc_bufs, nr_reloc_bufs);
    if (item) {
       update_cache_last(cache, cache_id, item->bo);
-      brw->sws->bo_reference(item->bo);
+      cache->sws->bo_reference(item->bo);
       return item->bo;
    }
 
@@ -349,11 +356,12 @@ brw_init_non_surface_cache(struct brw_context *brw)
    struct brw_cache *cache = &brw->cache;
 
    cache->brw = brw;
+   cache->sws = brw->sws;
 
    cache->size = 7;
    cache->n_items = 0;
    cache->items = (struct brw_cache_item **)
-      _mesa_calloc(cache->size * sizeof(struct brw_cache_item));
+      CALLOC(cache->size, sizeof(*cache->items)); /* one pointer per bucket */
 
    brw_init_cache_id(cache,
 		     "CC_VP",
@@ -457,7 +465,7 @@ brw_init_surface_cache(struct brw_context *brw)
    cache->size = 7;
    cache->n_items = 0;
    cache->items = (struct brw_cache_item **)
-      _mesa_calloc(cache->size * sizeof(struct brw_cache_item));
+      CALLOC(cache->size, sizeof(*cache->items)); /* one pointer per bucket */
 
    brw_init_cache_id(cache,
 		     "SS_SURFACE",
@@ -487,8 +495,8 @@ brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
    struct brw_cache_item *c, *next;
    GLuint i;
 
-   if (INTEL_DEBUG & DEBUG_STATE)
-      _mesa_printf("%s\n", __FUNCTION__);
+   if (BRW_DEBUG & DEBUG_STATE)
+      debug_printf("%s\n", __FUNCTION__);
 
    for (i = 0; i < cache->size; i++) {
       for (c = cache->items[i]; c; c = next) {
@@ -507,7 +515,7 @@ brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
    cache->n_items = 0;
 
    if (brw->curbe.last_buf) {
-      _mesa_free(brw->curbe.last_buf);
+      FREE(brw->curbe.last_buf);
       brw->curbe.last_buf = NULL;
    }
 
@@ -527,8 +535,8 @@ brw_state_cache_bo_delete(struct brw_cache *cache, struct brw_winsys_buffer *bo)
    struct brw_cache_item **prev;
    GLuint i;
 
-   if (INTEL_DEBUG & DEBUG_STATE)
-      _mesa_printf("%s\n", __FUNCTION__);
+   if (BRW_DEBUG & DEBUG_STATE)
+      debug_printf("%s\n", __FUNCTION__);
 
    for (i = 0; i < cache->size; i++) {
       for (prev = &cache->items[i]; *prev;) {
@@ -540,8 +548,8 @@ brw_state_cache_bo_delete(struct brw_cache *cache, struct brw_winsys_buffer *bo)
 	    *prev = c->next;
 
 	    for (j = 0; j < c->nr_reloc_bufs; j++)
-	       brw->sws->bo_unreference(c->reloc_bufs[j]);
-	    brw->sws->bo_unreference(c->bo);
+	       cache->sws->bo_unreference(c->reloc_bufs[j]);
+	    cache->sws->bo_unreference(c->bo);
 	    free((void *)c->key);
 	    free(c);
 	    cache->n_items--;
@@ -555,8 +563,8 @@ brw_state_cache_bo_delete(struct brw_cache *cache, struct brw_winsys_buffer *bo)
 void
 brw_state_cache_check_size(struct brw_context *brw)
 {
-   if (INTEL_DEBUG & DEBUG_STATE)
-      _mesa_printf("%s (n_items=%d)\n", __FUNCTION__, brw->cache.n_items);
+   if (BRW_DEBUG & DEBUG_STATE)
+      debug_printf("%s (n_items=%d)\n", __FUNCTION__, brw->cache.n_items);
 
    /* un-tuned guess.  We've got around 20 state objects for a total of around
     * 32k, so 1000 of them is around 1.5MB.
@@ -574,8 +582,8 @@ brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
 {
    GLuint i;
 
-   if (INTEL_DEBUG & DEBUG_STATE)
-      _mesa_printf("%s\n", __FUNCTION__);
+   if (BRW_DEBUG & DEBUG_STATE)
+      debug_printf("%s\n", __FUNCTION__);
 
    brw_clear_cache(brw, cache);
    for (i = 0; i < BRW_MAX_CACHE; i++) {
diff --git a/src/gallium/drivers/i965/brw_state_debug.c b/src/gallium/drivers/i965/brw_state_debug.c
index 22cea4b7d8..cc4744dc16 100644
--- a/src/gallium/drivers/i965/brw_state_debug.c
+++ b/src/gallium/drivers/i965/brw_state_debug.c
@@ -109,8 +109,25 @@ brw_print_dirty_count(struct dirty_bit_map *bit_map, int32_t bits)
       if (bit_map[i].bit == 0)
 	 return;
 
-      fprintf(stderr, "0x%08x: %12d (%s)\n",
+      debug_printf("0x%08x: %12d (%s)\n",
 	      bit_map[i].bit, bit_map[i].count, bit_map[i].name);
    }
 }
 
+/* Update all three dirty-bit tables; print them on call 0, 1000, 2000, ... */
+void
+brw_update_dirty_counts( unsigned mesa,
+			 unsigned brw,
+			 unsigned cache )
+{
+   static int calls = 0;
+   brw_update_dirty_count(mesa_bits, mesa);
+   brw_update_dirty_count(brw_bits, brw);
+   brw_update_dirty_count(cache_bits, cache);
+   if (calls++ % 1000 != 0)
+      return;
+   brw_print_dirty_count(mesa_bits, mesa);
+   brw_print_dirty_count(brw_bits, brw);
+   brw_print_dirty_count(cache_bits, cache);
+   debug_printf("\n");
+}
diff --git a/src/gallium/drivers/i965/brw_state_dump.c b/src/gallium/drivers/i965/brw_state_dump.c
index 1bc83fb9c1..72604304d4 100644
--- a/src/gallium/drivers/i965/brw_state_dump.c
+++ b/src/gallium/drivers/i965/brw_state_dump.c
@@ -28,6 +28,7 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
+#include "brw_winsys.h"
 
 /**
  * Prints out a header, the contents, and the message associated with
@@ -44,28 +45,32 @@ state_out(const char *name, void *data, uint32_t hw_offset, int index,
 {
     va_list va;
 
-    fprintf(stderr, "%8s: 0x%08x: 0x%08x: ",
-	    name, hw_offset + index * 4, ((uint32_t *)data)[index]);
+    debug_printf("%8s: 0x%08x: 0x%08x: ",
+		 name, hw_offset + index * 4, ((uint32_t *)data)[index]);
     va_start(va, fmt);
-    vfprintf(stderr, fmt, va);
+    debug_vprintf(fmt, va);
     va_end(va);
 }
 
 /** Generic, undecoded state buffer debug printout */
 static void
-state_struct_out(const char *name, struct brw_winsys_buffer *buffer, unsigned int state_size)
+state_struct_out(struct brw_winsys_screen *sws,
+		 const char *name,
+		 struct brw_winsys_buffer *buffer,
+		 unsigned int state_size)
 {
    int i;
+   void *data;
 
    if (buffer == NULL)
       return;
 
-   dri_bo_map(buffer, GL_FALSE);
+   data = sws->bo_map(buffer, GL_FALSE);
    for (i = 0; i < state_size / 4; i++) {
-      state_out(name, buffer->virtual, buffer->offset, i,
+      state_out(name, data, buffer->offset, i,
 		"dword %d\n", i);
    }
-   dri_bo_unmap(buffer);
+   sws->bo_unmap(buffer);
 }
 
 static const char *
@@ -106,12 +111,11 @@ static void dump_wm_surface_state(struct brw_context *brw)
       char name[20];
 
       if (surf_bo == NULL) {
-	 fprintf(stderr, "  WM SS%d: NULL\n", i);
+	 debug_printf("  WM SS%d: NULL\n", i);
 	 continue;
       }
-      dri_bo_map(surf_bo, GL_FALSE);
+      surf = (struct brw_surface_state *)brw->sws->bo_map(surf_bo, GL_FALSE);
       surfoff = surf_bo->offset;
-      surf = (struct brw_surface_state *)(surf_bo->virtual);
 
       sprintf(name, "WM SS%d", i);
       state_out(name, surf, surfoff, 0, "%s %s\n",
@@ -127,7 +131,7 @@ static void dump_wm_surface_state(struct brw_context *brw)
       state_out(name, surf, surfoff, 5, "x,y offset: %d,%d\n",
 		surf->ss5.x_offset, surf->ss5.y_offset);
 
-      dri_bo_unmap(surf_bo);
+      brw->sws->bo_unmap(surf_bo);
    }
 }
 
@@ -140,9 +144,7 @@ static void dump_sf_viewport_state(struct brw_context *brw)
    if (brw->sf.vp_bo == NULL)
       return;
 
-   dri_bo_map(brw->sf.vp_bo, GL_FALSE);
-
-   vp = brw->sf.vp_bo->virtual;
+   vp = (struct brw_sf_viewport *)brw->sws->bo_map(brw->sf.vp_bo, GL_FALSE);
    vp_off = brw->sf.vp_bo->offset;
 
    state_out(name, vp, vp_off, 0, "m00 = %f\n", vp->viewport.m00);
@@ -157,10 +159,12 @@ static void dump_sf_viewport_state(struct brw_context *brw)
    state_out(name, vp, vp_off, 7, "bottom right = %d,%d\n",
 	     vp->scissor.xmax, vp->scissor.ymax);
 
-   dri_bo_unmap(brw->sf.vp_bo);
+   brw->sws->bo_unmap(brw->sf.vp_bo);
 }
 
-static void brw_debug_prog(const char *name, struct brw_winsys_buffer *prog)
+static void brw_debug_prog(struct brw_winsys_screen *sws,
+			   const char *name,
+			   struct brw_winsys_buffer *prog)
 {
    unsigned int i;
    uint32_t *data;
@@ -168,12 +172,10 @@ static void brw_debug_prog(const char *name, struct brw_winsys_buffer *prog)
    if (prog == NULL)
       return;
 
-   dri_bo_map(prog, GL_FALSE);
-
-   data = prog->virtual;
+   data = (uint32_t *)sws->bo_map(prog, GL_FALSE);
 
    for (i = 0; i < prog->size / 4 / 4; i++) {
-      fprintf(stderr, "%8s: 0x%08x: 0x%08x 0x%08x 0x%08x 0x%08x\n",
+      debug_printf("%8s: 0x%08x: 0x%08x 0x%08x 0x%08x 0x%08x\n",
 	      name, (unsigned int)prog->offset + i * 4 * 4,
 	      data[i * 4], data[i * 4 + 1], data[i * 4 + 2], data[i * 4 + 3]);
       /* Stop at the end of the program.  It'd be nice to keep track of the actual
@@ -186,7 +188,7 @@ static void brw_debug_prog(const char *name, struct brw_winsys_buffer *prog)
 	 break;
    }
 
-   dri_bo_unmap(prog);
+   sws->bo_unmap(prog);
 }
 
 
@@ -202,19 +204,21 @@ static void brw_debug_prog(const char *name, struct brw_winsys_buffer *prog)
  */
 void brw_debug_batch(struct brw_context *brw)
 {
-   state_struct_out("WM bind", brw->wm.bind_bo, 4 * brw->wm.nr_surfaces);
+   struct brw_winsys_screen *sws = brw->sws;
+
+   state_struct_out(sws, "WM bind", brw->wm.bind_bo, 4 * brw->wm.nr_surfaces);
    dump_wm_surface_state(brw);
 
-   state_struct_out("VS", brw->vs.state_bo, sizeof(struct brw_vs_unit_state));
-   brw_debug_prog("VS prog", brw->vs.prog_bo);
+   state_struct_out(sws, "VS", brw->vs.state_bo, sizeof(struct brw_vs_unit_state));
+   brw_debug_prog(sws, "VS prog", brw->vs.prog_bo);
 
-   state_struct_out("GS", brw->gs.state_bo, sizeof(struct brw_gs_unit_state));
-   brw_debug_prog("GS prog", brw->gs.prog_bo);
+   state_struct_out(sws, "GS", brw->gs.state_bo, sizeof(struct brw_gs_unit_state));
+   brw_debug_prog(sws, "GS prog", brw->gs.prog_bo);
 
-   state_struct_out("SF", brw->sf.state_bo, sizeof(struct brw_sf_unit_state));
+   state_struct_out(sws, "SF", brw->sf.state_bo, sizeof(struct brw_sf_unit_state));
    dump_sf_viewport_state(brw);
-   brw_debug_prog("SF prog", brw->sf.prog_bo);
+   brw_debug_prog(sws, "SF prog", brw->sf.prog_bo);
 
-   state_struct_out("WM", brw->wm.state_bo, sizeof(struct brw_wm_unit_state));
-   brw_debug_prog("WM prog", brw->wm.prog_bo);
+   state_struct_out(sws, "WM", brw->wm.state_bo, sizeof(struct brw_wm_unit_state));
+   brw_debug_prog(sws, "WM prog", brw->wm.prog_bo);
 }
diff --git a/src/gallium/drivers/i965/brw_state_upload.c b/src/gallium/drivers/i965/brw_state_upload.c
index 8659e35289..eff3a40a46 100644
--- a/src/gallium/drivers/i965/brw_state_upload.c
+++ b/src/gallium/drivers/i965/brw_state_upload.c
@@ -34,6 +34,7 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_batchbuffer.h"
+#include "brw_debug.h"
 
 /* This is used to initialize brw->state.atoms[].  We could use this
  * list directly except for a single atom, brw_constant_buffer, which
@@ -83,12 +84,8 @@ const struct brw_tracked_state *atoms[] =
    &brw_blend_constant_color,
 
    &brw_depthbuffer,
-
    &brw_polygon_stipple,
-   &brw_polygon_stipple_offset,
-
    &brw_line_stipple,
-   &brw_aa_line_parameters,
 
    &brw_psp_urb_cbs,
 
@@ -163,11 +160,12 @@ enum pipe_error brw_validate_state( struct brw_context *brw )
 {
    struct brw_state_flags *state = &brw->state.dirty;
    GLuint i;
+   int ret;
 
    brw_clear_validated_bos(brw);
-   brw_add_validated_bo(brw, intel->batch->buf);
+   brw_add_validated_bo(brw, brw->batch->buf);
 
-   if (brw->emit_state_always) {
+   if (brw->flags.always_emit_state) {
       state->mesa |= ~0;
       state->brw |= ~0;
       state->cache |= ~0;
@@ -199,10 +197,10 @@ enum pipe_error brw_validate_state( struct brw_context *brw )
     * If this fails, we can experience GPU lock-ups.
     */
    {
-      const struct brw_fragment_program *fp = brw->fragment_program;
+      const struct brw_fragment_shader *fp = brw->curr.fragment_shader;
       if (fp) {
-         assert(fp->info.max_sampler <= brw->nr_samplers &&
-		fp->info.max_texture <= brw->nr_textures);
+         assert(fp->info.file_max[TGSI_FILE_SAMPLER] < brw->curr.num_samplers &&
+		fp->info.texture_max < brw->curr.num_textures);
       }
    }
 
@@ -213,18 +211,18 @@ enum pipe_error brw_validate_state( struct brw_context *brw )
 enum pipe_error brw_upload_state(struct brw_context *brw)
 {
    struct brw_state_flags *state = &brw->state.dirty;
+   int ret;
    int i;
-   static int dirty_count = 0;
 
    brw_clear_validated_bos(brw);
 
-   if (INTEL_DEBUG) {
+   if (BRW_DEBUG) {
       /* Debug version which enforces various sanity checks on the
        * state flags which are generated and checked to help ensure
        * state atoms are ordered correctly in the list.
        */
       struct brw_state_flags examined, prev;      
-      _mesa_memset(&examined, 0, sizeof(examined));
+      memset(&examined, 0, sizeof(examined));
       prev = *state;
 
       for (i = 0; i < Elements(atoms); i++) {
@@ -268,19 +266,14 @@ enum pipe_error brw_upload_state(struct brw_context *brw)
       }
    }
 
-   if (INTEL_DEBUG & DEBUG_STATE) {
-      brw_update_dirty_count(mesa_bits, state->mesa);
-      brw_update_dirty_count(brw_bits, state->brw);
-      brw_update_dirty_count(cache_bits, state->cache);
-      if (dirty_count++ % 1000 == 0) {
-	 brw_print_dirty_count(mesa_bits, state->mesa);
-	 brw_print_dirty_count(brw_bits, state->brw);
-	 brw_print_dirty_count(cache_bits, state->cache);
-	 debug_printf("\n");
-      }
+   if (BRW_DEBUG & DEBUG_STATE) {
+      brw_update_dirty_counts( state->mesa, 
+			       state->brw,
+			       state->cache );
    }
    
    /* Clear dirty flags:
     */
    memset(state, 0, sizeof(*state));
+   return 0;
 }
diff --git a/src/gallium/drivers/i965/brw_tex.c b/src/gallium/drivers/i965/brw_tex.c
deleted file mode 100644
index 6f7adb6393..0000000000
--- a/src/gallium/drivers/i965/brw_tex.c
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
- 
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- 
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
- 
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- 
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-        
-
-#include "brw_context.h"
-
-/**
- * Finalizes all textures, completing any rendering that needs to be done
- * to prepare them.
- */
-void brw_validate_textures( struct brw_context *brw )
-{
-   int i;
-
-   for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
-      struct gl_texture_unit *texUnit = &ctx->Texture.Unit[i];
-
-      if (texUnit->_ReallyEnabled) {
-	 intel_finalize_mipmap_tree(intel, i);
-      }
-   }
-}
diff --git a/src/gallium/drivers/i965/brw_tex_layout.c b/src/gallium/drivers/i965/brw_tex_layout.c
deleted file mode 100644
index 50c30878c6..0000000000
--- a/src/gallium/drivers/i965/brw_tex_layout.c
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
- 
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- 
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
- 
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- 
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-/* Code to layout images in a mipmap tree for i965.
- */
-
-#include "brw_tex_layout.h"
-
-#define FILE_DEBUG_FLAG DEBUG_MIPTREE
-
-GLboolean brw_miptree_layout(struct brw_context *brw,
-			     struct intel_mipmap_tree *mt,
-			     uint32_t tiling)
-{
-   /* XXX: these vary depending on image format: */
-   /* GLint align_w = 4; */
-
-   switch (mt->target) {
-   case GL_TEXTURE_CUBE_MAP:
-      if (IS_IGDNG(brw->brw_screen->pci_id)) {
-          GLuint align_h = 2, align_w = 4;
-          GLuint level;
-          GLuint x = 0;
-          GLuint y = 0;
-          GLuint width = mt->width0;
-          GLuint height = mt->height0;
-          GLuint qpitch = 0;
-          GLuint y_pitch = 0;
-
-          mt->pitch = mt->width0;
-          intel_get_texture_alignment_unit(mt->internal_format, &align_w, &align_h);
-          y_pitch = ALIGN(height, align_h);
-
-          if (mt->compressed) {
-              mt->pitch = ALIGN(mt->width0, align_w);
-          }
-
-          if (mt->last_level != 0) {
-              GLuint mip1_width;
-
-              if (mt->compressed) {
-                  mip1_width = ALIGN(minify(mt->width0), align_w)
-                      + ALIGN(minify(minify(mt->width0)), align_w);
-              } else {
-                  mip1_width = ALIGN(minify(mt->width0), align_w)
-                      + minify(minify(mt->width0));
-              }
-
-              if (mip1_width > mt->pitch) {
-                  mt->pitch = mip1_width;
-              }
-          }
-
-          mt->pitch = intel_miptree_pitch_align(intel, mt, tiling, mt->pitch);
-
-          if (mt->compressed) {
-              qpitch = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) / 4 * mt->pitch * mt->cpp;
-              mt->total_height = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) / 4 * 6;
-          } else {
-              qpitch = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) * mt->pitch * mt->cpp;
-              mt->total_height = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) * 6;
-          }
-
-          for (level = 0; level <= mt->last_level; level++) {
-              GLuint img_height;
-              GLuint nr_images = 6;
-              GLuint q = 0;
-
-              intel_miptree_set_level_info(mt, level, nr_images, x, y, width, 
-                                           height, 1);
-
-              for (q = 0; q < nr_images; q++)
-                  intel_miptree_set_image_offset_ex(mt, level, q, x, y, q * qpitch);
-
-              if (mt->compressed)
-                  img_height = MAX2(1, height/4);
-              else
-                  img_height = ALIGN(height, align_h);
-
-              if (level == 1) {
-                  x += ALIGN(width, align_w);
-              }
-              else {
-                  y += img_height;
-              }
-
-              width  = minify(width);
-              height = minify(height);
-          }
-
-          break;
-      }
-
-   case GL_TEXTURE_3D: {
-      GLuint width  = mt->width0;
-      GLuint height = mt->height0;
-      GLuint depth = mt->depth0;
-      GLuint pack_x_pitch, pack_x_nr;
-      GLuint pack_y_pitch;
-      GLuint level;
-      GLuint align_h = 2;
-      GLuint align_w = 4;
-
-      mt->total_height = 0;
-      intel_get_texture_alignment_unit(mt->internal_format, &align_w, &align_h);
-
-      if (mt->compressed) {
-          mt->pitch = ALIGN(width, align_w);
-          pack_y_pitch = (height + 3) / 4;
-      } else {
-	 mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->width0);
-	 pack_y_pitch = ALIGN(mt->height0, align_h);
-      }
-
-      pack_x_pitch = width;
-      pack_x_nr = 1;
-
-      for (level = 0 ; level <= mt->last_level ; level++) {
-	 GLuint nr_images = mt->target == GL_TEXTURE_3D ? depth : 6;
-	 GLint x = 0;
-	 GLint y = 0;
-	 GLint q, j;
-
-	 intel_miptree_set_level_info(mt, level, nr_images,
-				      0, mt->total_height,
-				      width, height, depth);
-
-	 for (q = 0; q < nr_images;) {
-	    for (j = 0; j < pack_x_nr && q < nr_images; j++, q++) {
-	       intel_miptree_set_image_offset(mt, level, q, x, y);
-	       x += pack_x_pitch;
-	    }
-
-	    x = 0;
-	    y += pack_y_pitch;
-	 }
-
-
-	 mt->total_height += y;
-	 width  = minify(width);
-	 height = minify(height);
-	 depth  = minify(depth);
-
-	 if (mt->compressed) {
-	    pack_y_pitch = (height + 3) / 4;
-
-	    if (pack_x_pitch > ALIGN(width, align_w)) {
-	       pack_x_pitch = ALIGN(width, align_w);
-	       pack_x_nr <<= 1;
-	    }
-	 } else {
-	    if (pack_x_pitch > 4) {
-	       pack_x_pitch >>= 1;
-	       pack_x_nr <<= 1;
-	       assert(pack_x_pitch * pack_x_nr <= mt->pitch);
-	    }
-
-	    if (pack_y_pitch > 2) {
-	       pack_y_pitch >>= 1;
-	       pack_y_pitch = ALIGN(pack_y_pitch, align_h);
-	    }
-	 }
-
-      }
-      /* The 965's sampler lays cachelines out according to how accesses
-       * in the texture surfaces run, so they may be "vertical" through
-       * memory.  As a result, the docs say in Surface Padding Requirements:
-       * Sampling Engine Surfaces that two extra rows of padding are required.
-       * We don't know of similar requirements for pre-965, but given that
-       * those docs are silent on padding requirements in general, let's play
-       * it safe.
-       */
-      if (mt->target == GL_TEXTURE_CUBE_MAP)
-	 mt->total_height += 2;
-      break;
-   }
-
-   default:
-      i945_miptree_layout_2d(intel, mt, tiling);
-      break;
-   }
-   DBG("%s: %dx%dx%d - sz 0x%x\n", __FUNCTION__,
-		mt->pitch,
-		mt->total_height,
-		mt->cpp,
-		mt->pitch * mt->total_height * mt->cpp );
-
-   return GL_TRUE;
-}
-
diff --git a/src/gallium/drivers/i965/brw_urb.c b/src/gallium/drivers/i965/brw_urb.c
index a2277519ad..ff2466528d 100644
--- a/src/gallium/drivers/i965/brw_urb.c
+++ b/src/gallium/drivers/i965/brw_urb.c
@@ -184,17 +184,17 @@ static void recalculate_urb_fence( struct brw_context *brw )
 	     * entries and the values for minimum nr of entries
 	     * provided above.
 	     */
-	    _mesa_printf("couldn't calculate URB layout!\n");
+	    debug_printf("couldn't calculate URB layout!\n");
 	    exit(1);
 	 }
 	 
-	 if (INTEL_DEBUG & (DEBUG_URB|DEBUG_FALLBACKS))
-	    _mesa_printf("URB CONSTRAINED\n");
+	 if (BRW_DEBUG & (DEBUG_URB|DEBUG_FALLBACKS))
+	    debug_printf("URB CONSTRAINED\n");
       }
 
 done:
-      if (INTEL_DEBUG & DEBUG_URB)
-	 _mesa_printf("URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
+      if (BRW_DEBUG & DEBUG_URB)
+	 debug_printf("URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
 		      brw->urb.vs_start,
 		      brw->urb.gs_start,
 		      brw->urb.clip_start,
diff --git a/src/gallium/drivers/i965/brw_vs.h b/src/gallium/drivers/i965/brw_vs.h
index 54f7d7d7c4..e33fa2f0aa 100644
--- a/src/gallium/drivers/i965/brw_vs.h
+++ b/src/gallium/drivers/i965/brw_vs.h
@@ -64,7 +64,7 @@ struct brw_vs_compile {
 
    struct brw_reg r0;
    struct brw_reg r1;
-   struct brw_reg regs[PROGRAM_ADDRESS+1][128];
+   struct brw_reg regs[TGSI_FILE_COUNT][128];
    struct brw_reg tmp;
    struct brw_reg stack;
 
diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c
index 086f54799e..04132a167b 100644
--- a/src/gallium/drivers/i965/brw_vs_emit.c
+++ b/src/gallium/drivers/i965/brw_vs_emit.c
@@ -242,10 +242,10 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 
    c->prog_data.total_grf = reg;
 
-   if (INTEL_DEBUG & DEBUG_VS) {
-      _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
-      _mesa_printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
-      _mesa_printf("%s reg = %d\n", __FUNCTION__, reg);
+   if (BRW_DEBUG & DEBUG_VS) {
+      debug_printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
+      debug_printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
+      debug_printf("%s reg = %d\n", __FUNCTION__, reg);
    }
 }
 
@@ -1248,10 +1248,10 @@ void brw_vs_emit(struct brw_vs_compile *c )
    GLuint index;
    GLuint file;
 
-   if (INTEL_DEBUG & DEBUG_VS) {
-      _mesa_printf("vs-mesa:\n");
+   if (BRW_DEBUG & DEBUG_VS) {
+      debug_printf("vs-mesa:\n");
       _mesa_print_program(&c->vp->program.Base); 
-      _mesa_printf("\n");
+      debug_printf("\n");
    }
 
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
@@ -1526,12 +1526,12 @@ void brw_vs_emit(struct brw_vs_compile *c )
 
    post_vs_emit(c, end_inst, last_inst);
 
-   if (INTEL_DEBUG & DEBUG_VS) {
+   if (BRW_DEBUG & DEBUG_VS) {
       int i;
 
-      _mesa_printf("vs-native:\n");
+      debug_printf("vs-native:\n");
       for (i = 0; i < p->nr_insn; i++)
 	 brw_disasm(stderr, &p->store[i]);
-      _mesa_printf("\n");
+      debug_printf("\n");
    }
 }
diff --git a/src/gallium/drivers/i965/brw_vs_state.c b/src/gallium/drivers/i965/brw_vs_state.c
index 1717223e49..05a91f2de4 100644
--- a/src/gallium/drivers/i965/brw_vs_state.c
+++ b/src/gallium/drivers/i965/brw_vs_state.c
@@ -122,7 +122,7 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
    vs.thread4.max_threads = CLAMP(key->nr_urb_entries / 2,
 				  1, chipset_max_threads) - 1;
 
-   if (INTEL_DEBUG & DEBUG_SINGLE_THREAD)
+   if (BRW_DEBUG & DEBUG_SINGLE_THREAD)
       vs.thread4.max_threads = 0;
 
    /* No samplers for ARB_vp programs:
@@ -131,7 +131,7 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
     */
    vs.vs5.sampler_count = 0;
 
-   if (INTEL_DEBUG & DEBUG_STATS)
+   if (BRW_DEBUG & DEBUG_STATS)
       vs.thread4.stats_enable = 1;
 
    /* Vertex program always enabled:
diff --git a/src/gallium/drivers/i965/brw_winsys.h b/src/gallium/drivers/i965/brw_winsys.h
index 51e23b9640..33032276bc 100644
--- a/src/gallium/drivers/i965/brw_winsys.h
+++ b/src/gallium/drivers/i965/brw_winsys.h
@@ -69,6 +69,7 @@ enum brw_buffer_type
    BRW_BUFFER_TYPE_SHADER_CONSTANTS,
    BRW_BUFFER_TYPE_WM_SCRATCH,
    BRW_BUFFER_TYPE_BATCH,
+   BRW_BUFFER_TYPE_STATE_CACHE,
 };
 
 
@@ -156,11 +157,15 @@ struct brw_winsys_screen {
 			  unsigned offset,
 			  struct brw_winsys_buffer *b2);
 
-   void (*bo_subdata)(struct brw_winsys_buffer *dst,
+   void (*bo_subdata)(struct brw_winsys_buffer *buffer,
 		      size_t offset,
 		      size_t size,
 		      const void *data);
 
+   boolean (*bo_is_busy)(struct brw_winsys_buffer *buffer);
+   boolean (*bo_references)(struct brw_winsys_buffer *a,
+			    struct brw_winsys_buffer *b);
+
    /* XXX: couldn't this be handled by returning true/false on
     * bo_emit_reloc?
     */
@@ -171,18 +176,13 @@ struct brw_winsys_screen {
    /**
     * Map a buffer.
     */
-   void *(*buffer_map)(struct brw_winsys *iws,
-                       struct brw_winsys_buffer *buffer,
-                       boolean write);
+   void *(*bo_map)(struct brw_winsys_buffer *buffer,
+		   boolean write);
 
    /**
     * Unmap a buffer.
     */
-   void (*buffer_unmap)(struct brw_winsys *iws,
-                        struct brw_winsys_buffer *buffer);
-
-   void (*buffer_destroy)(struct brw_winsys *iws,
-                          struct brw_winsys_buffer *buffer);
+   void (*bo_unmap)(struct brw_winsys_buffer *buffer);
    /*@}*/
 
 
diff --git a/src/gallium/drivers/i965/brw_wm.c b/src/gallium/drivers/i965/brw_wm.c
index 764708f7df..3d889699f8 100644
--- a/src/gallium/drivers/i965/brw_wm.c
+++ b/src/gallium/drivers/i965/brw_wm.c
@@ -178,8 +178,8 @@ static void do_wm_prog( struct brw_context *brw,
       brw_wm_non_glsl_emit(brw, c);
    }
 
-   if (INTEL_DEBUG & DEBUG_WM)
-      fprintf(stderr, "\n");
+   if (BRW_DEBUG & DEBUG_WM)
+      debug_printf("\n");
 
    /* get the program
     */
diff --git a/src/gallium/drivers/i965/brw_wm.h b/src/gallium/drivers/i965/brw_wm.h
index bf241f5fa4..5bc2a49c1f 100644
--- a/src/gallium/drivers/i965/brw_wm.h
+++ b/src/gallium/drivers/i965/brw_wm.h
@@ -33,6 +33,7 @@
 #ifndef BRW_WM_H
 #define BRW_WM_H
 
+#include "tgsi/tgsi_ureg.h"
 
 #include "brw_context.h"
 #include "brw_eu.h"
@@ -57,17 +58,18 @@
 #define AA_ALWAYS    2
 
 struct brw_wm_prog_key {
+   unsigned proj_attrib_mask; /**< one bit per fragment program attribute */
+   unsigned linear_attrib_mask:1;  /**< linear interpolation vs perspective interp */
+
    GLuint source_depth_reg:3;
    GLuint aa_dest_stencil_reg:3;
    GLuint dest_depth_reg:3;
    GLuint nr_depth_regs:3;
-   GLuint computes_depth:1;	/* could be derived from program string */
+   GLuint computes_depth:1;
    GLuint source_depth_to_render_target:1;
    GLuint flat_shade:1;
-   GLuint linear_color:1;  /**< linear interpolation vs perspective interp */
    GLuint runtime_check_aads_emit:1;
-   
-   GLbitfield proj_attrib_mask; /**< one bit per fragment program attribute */
+
    GLuint shadowtex_mask:16;
    GLuint yuvtex_mask:16;
    GLuint yuvtex_swap_mask:16;	/* UV swaped */
@@ -75,7 +77,7 @@ struct brw_wm_prog_key {
    GLuint tex_swizzles[BRW_MAX_TEX_UNIT];
 
    GLuint program_string_id:32;
-   GLuint drawable_height;
+
    GLuint vp_nr_outputs_written;
 };
 
@@ -151,7 +153,7 @@ struct brw_wm_instruction {
 };
 
 
-#define BRW_WM_MAX_INSN  (MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS*3 + FRAG_ATTRIB_MAX + 3)
+#define BRW_WM_MAX_INSN  2048		/* sizes brw_wm_compile::prog_instructions[]; MAX_VREG/MAX_REF scale from it */
 #define BRW_WM_MAX_GRF   128		/* hardware limit */
 #define BRW_WM_MAX_VREG  (BRW_WM_MAX_INSN * 4)
 #define BRW_WM_MAX_REF   (BRW_WM_MAX_INSN * 12)
@@ -161,11 +163,19 @@ struct brw_wm_instruction {
 #define BRW_WM_MAX_SUBROUTINE 16
 
 
+struct ureg_instruction {	/* element type of brw_wm_compile::prog_instructions[] */
+   unsigned opcode:8;		/* NOTE(review): presumably a TGSI_OPCODE_* or WM_* value -- confirm it fits in 8 bits */
+   unsigned tex_target:3;	/* NOTE(review): presumably the texture target for texture ops -- confirm */
+   struct ureg_dst dst;		/* destination operand */
+   struct ureg_src src[3];	/* source operands, at most three per instruction */
+};
+
 
 /* New opcodes to track internal operations required for WM unit.
  * These are added early so that the registers used can be tracked,
  * freed and reused like those of other instructions.
  */
+#define MAX_OPCODE        TGSI_OPCODE_LAST	/* base value for the WM_* opcodes below */
 #define WM_PIXELXY        (MAX_OPCODE)
 #define WM_DELTAXY        (MAX_OPCODE + 1)
 #define WM_PIXELW         (MAX_OPCODE + 2)
@@ -177,7 +187,7 @@ struct brw_wm_instruction {
 #define WM_FRONTFACING    (MAX_OPCODE + 8)
 #define MAX_WM_OPCODE     (MAX_OPCODE + 9)
 
-#define PROGRAM_PAYLOAD   (PROGRAM_FILE_MAX)
+#define PROGRAM_PAYLOAD   (TGSI_FILE_COUNT)
 #define PAYLOAD_DEPTH     (FRAG_ATTRIB_MAX)
 
 struct brw_wm_compile {
@@ -198,15 +208,15 @@ struct brw_wm_compile {
     * simplifying and adding instructions for interpolation and
     * framebuffer writes.
     */
-   struct prog_instruction prog_instructions[BRW_WM_MAX_INSN];
+   struct ureg_instruction prog_instructions[BRW_WM_MAX_INSN];
    GLuint nr_fp_insns;
    GLuint fp_temp;
    GLuint fp_interp_emitted;
    GLuint fp_fragcolor_emitted;
 
-   struct prog_src_register pixel_xy;
-   struct prog_src_register delta_xy;
-   struct prog_src_register pixel_w;
+   struct ureg_src pixel_xy;
+   struct ureg_src delta_xy;
+   struct ureg_src pixel_w;
 
 
    struct brw_wm_value vreg[BRW_WM_MAX_VREG];
@@ -217,7 +227,7 @@ struct brw_wm_compile {
 
    struct {
       struct brw_wm_value depth[4]; /* includes r0/r1 */
-      struct brw_wm_value input_interp[FRAG_ATTRIB_MAX];
+      struct brw_wm_value input_interp[PIPE_MAX_SHADER_INPUTS];
    } payload;
 
 
@@ -295,7 +305,7 @@ void brw_wm_lookup_iz( GLuint line_aa,
 		       GLboolean ps_uses_depth,
 		       struct brw_wm_prog_key *key );
 
-GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp);
+//GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp);
 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c);
 
 void emit_ddxy(struct brw_compile *p,
diff --git a/src/gallium/drivers/i965/brw_wm_debug.c b/src/gallium/drivers/i965/brw_wm_debug.c
index c6659646f2..04dec5ba39 100644
--- a/src/gallium/drivers/i965/brw_wm_debug.c
+++ b/src/gallium/drivers/i965/brw_wm_debug.c
@@ -41,21 +41,21 @@ void brw_wm_print_value( struct brw_wm_compile *c,
    if (c->state >= PASS2_DONE) 
       brw_print_reg(value->hw_reg);
    else if( value == &c->undef_value )
-      _mesa_printf("undef");
+      debug_printf("undef");
    else if( value - c->vreg >= 0 &&
 	    value - c->vreg < BRW_WM_MAX_VREG)
-      _mesa_printf("r%d", value - c->vreg);
+      debug_printf("r%d", value - c->vreg);
    else if (value - c->creg >= 0 &&
 	    value - c->creg < BRW_WM_MAX_PARAM)
-      _mesa_printf("c%d", value - c->creg);
+      debug_printf("c%d", value - c->creg);
    else if (value - c->payload.input_interp >= 0 &&
 	    value - c->payload.input_interp < FRAG_ATTRIB_MAX)
-      _mesa_printf("i%d", value - c->payload.input_interp);
+      debug_printf("i%d", value - c->payload.input_interp);
    else if (value - c->payload.depth >= 0 &&
 	    value - c->payload.depth < FRAG_ATTRIB_MAX)
-      _mesa_printf("d%d", value - c->payload.depth);
+      debug_printf("d%d", value - c->payload.depth);
    else 
-      _mesa_printf("?");
+      debug_printf("?");
 }
 
 void brw_wm_print_ref( struct brw_wm_compile *c,
@@ -64,16 +64,16 @@ void brw_wm_print_ref( struct brw_wm_compile *c,
    struct brw_reg hw_reg = ref->hw_reg;
 
    if (ref->unspill_reg)
-      _mesa_printf("UNSPILL(%x)/", ref->value->spill_slot);
+      debug_printf("UNSPILL(%x)/", ref->value->spill_slot);
 
    if (c->state >= PASS2_DONE)
       brw_print_reg(ref->hw_reg);
    else {
-      _mesa_printf("%s", hw_reg.negate ? "-" : "");
-      _mesa_printf("%s", hw_reg.abs ? "abs/" : "");
+      debug_printf("%s", hw_reg.negate ? "-" : "");
+      debug_printf("%s", hw_reg.abs ? "abs/" : "");
       brw_wm_print_value(c, ref->value);
       if ((hw_reg.nr&1) || hw_reg.subnr) {
-	 _mesa_printf("->%d.%d", (hw_reg.nr&1), hw_reg.subnr);
+	 debug_printf("->%d.%d", (hw_reg.nr&1), hw_reg.subnr);
       }
    }
 }
@@ -84,22 +84,22 @@ void brw_wm_print_insn( struct brw_wm_compile *c,
    GLuint i, arg;
    GLuint nr_args = brw_wm_nr_args(inst->opcode);
 
-   _mesa_printf("[");
+   debug_printf("[");
    for (i = 0; i < 4; i++) {
       if (inst->dst[i]) {
 	 brw_wm_print_value(c, inst->dst[i]);
 	 if (inst->dst[i]->spill_slot)
-	    _mesa_printf("/SPILL(%x)",inst->dst[i]->spill_slot);
+	    debug_printf("/SPILL(%x)",inst->dst[i]->spill_slot);
       }
       else
-	 _mesa_printf("#");
+	 debug_printf("#");
       if (i < 3)      
-	 _mesa_printf(",");
+	 debug_printf(",");
    }
-   _mesa_printf("]");
+   debug_printf("]");
 
    if (inst->writemask != BRW_WRITEMASK_XYZW)
-      _mesa_printf(".%s%s%s%s", 
+      debug_printf(".%s%s%s%s", 
 		   GET_BIT(inst->writemask, 0) ? "x" : "",
 		   GET_BIT(inst->writemask, 1) ? "y" : "",
 		   GET_BIT(inst->writemask, 2) ? "z" : "",
@@ -107,58 +107,58 @@ void brw_wm_print_insn( struct brw_wm_compile *c,
 
    switch (inst->opcode) {
    case WM_PIXELXY:
-      _mesa_printf(" = PIXELXY");
+      debug_printf(" = PIXELXY");
       break;
    case WM_DELTAXY:
-      _mesa_printf(" = DELTAXY");
+      debug_printf(" = DELTAXY");
       break;
    case WM_PIXELW:
-      _mesa_printf(" = PIXELW");
+      debug_printf(" = PIXELW");
       break;
    case WM_WPOSXY:
-      _mesa_printf(" = WPOSXY");
+      debug_printf(" = WPOSXY");
       break;
    case WM_PINTERP:
-      _mesa_printf(" = PINTERP");
+      debug_printf(" = PINTERP");
       break;
    case WM_LINTERP:
-      _mesa_printf(" = LINTERP");
+      debug_printf(" = LINTERP");
       break;
    case WM_CINTERP:
-      _mesa_printf(" = CINTERP");
+      debug_printf(" = CINTERP");
       break;
    case WM_FB_WRITE:
-      _mesa_printf(" = FB_WRITE");
+      debug_printf(" = FB_WRITE");
       break;
    case WM_FRONTFACING:
-      _mesa_printf(" = FRONTFACING");
+      debug_printf(" = FRONTFACING");
       break;
    default:
-      _mesa_printf(" = %s", _mesa_opcode_string(inst->opcode));
+      debug_printf(" = %s", _mesa_opcode_string(inst->opcode));
       break;
    }
 
    if (inst->saturate)
-      _mesa_printf("_SAT");
+      debug_printf("_SAT");
 
    for (arg = 0; arg < nr_args; arg++) {
 
-      _mesa_printf(" [");
+      debug_printf(" [");
 
       for (i = 0; i < 4; i++) {
 	 if (inst->src[arg][i]) {
 	    brw_wm_print_ref(c, inst->src[arg][i]);
 	 }
 	 else
-	    _mesa_printf("%%");
+	    debug_printf("%%");
 
 	 if (i < 3) 
-	    _mesa_printf(",");
+	    debug_printf(",");
 	 else
-	    _mesa_printf("]");
+	    debug_printf("]");
       }
    }
-   _mesa_printf("\n");
+   debug_printf("\n");
 }
 
 void brw_wm_print_program( struct brw_wm_compile *c,
@@ -166,9 +166,9 @@ void brw_wm_print_program( struct brw_wm_compile *c,
 {
    GLuint insn;
 
-   _mesa_printf("%s:\n", stage);
+   debug_printf("%s:\n", stage);
    for (insn = 0; insn < c->nr_insns; insn++)
       brw_wm_print_insn(c, &c->instruction[insn]);
-   _mesa_printf("\n");
+   debug_printf("\n");
 }
 
diff --git a/src/gallium/drivers/i965/brw_wm_emit.c b/src/gallium/drivers/i965/brw_wm_emit.c
index 7df9b79d7a..5f7ae6592c 100644
--- a/src/gallium/drivers/i965/brw_wm_emit.c
+++ b/src/gallium/drivers/i965/brw_wm_emit.c
@@ -1481,7 +1481,7 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 break;
 
       default:
-	 _mesa_printf("Unsupported opcode %i (%s) in fragment shader\n",
+	 debug_printf("Unsupported opcode %i (%s) in fragment shader\n",
 		      inst->opcode, inst->opcode < MAX_OPCODE ?
 				    _mesa_opcode_string(inst->opcode) :
 				    "unknown");
@@ -1494,12 +1494,12 @@ void brw_wm_emit( struct brw_wm_compile *c )
 		      inst->dst[i]->spill_slot);
    }
 
-   if (INTEL_DEBUG & DEBUG_WM) {
+   if (BRW_DEBUG & DEBUG_WM) {
       int i;
 
-      _mesa_printf("wm-native:\n");
+      debug_printf("wm-native:\n");
       for (i = 0; i < p->nr_insn; i++)
 	 brw_disasm(stderr, &p->store[i]);
-      _mesa_printf("\n");
+      debug_printf("\n");
    }
 }
diff --git a/src/gallium/drivers/i965/brw_wm_fp.c b/src/gallium/drivers/i965/brw_wm_fp.c
index be240031c7..d594730730 100644
--- a/src/gallium/drivers/i965/brw_wm_fp.c
+++ b/src/gallium/drivers/i965/brw_wm_fp.c
@@ -142,7 +142,7 @@ static struct prog_dst_register get_temp( struct brw_wm_compile *c )
    int bit = _mesa_ffs( ~c->fp_temp );
 
    if (!bit) {
-      _mesa_printf("%s: out of temporaries\n", __FILE__);
+      debug_printf("%s: out of temporaries\n", __FILE__);
       exit(1);
    }
 
@@ -977,7 +977,7 @@ static void print_insns( const struct prog_instruction *insn,
 {
    GLuint i;
    for (i = 0; i < nr; i++, insn++) {
-      _mesa_printf("%3d: ", i);
+      debug_printf("%3d: ", i);
       if (insn->Opcode < MAX_OPCODE)
 	 _mesa_print_instruction(insn);
       else if (insn->Opcode < MAX_WM_OPCODE) {
@@ -988,7 +988,7 @@ static void print_insns( const struct prog_instruction *insn,
 				     3);
       }
       else 
-	 _mesa_printf("965 Opcode %d\n", insn->Opcode);
+	 debug_printf("965 Opcode %d\n", insn->Opcode);
    }
 }
 
@@ -1002,10 +1002,10 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
    struct brw_fragment_program *fp = c->fp;
    GLuint insn;
 
-   if (INTEL_DEBUG & DEBUG_WM) {
-      _mesa_printf("pre-fp:\n");
+   if (BRW_DEBUG & DEBUG_WM) {
+      debug_printf("pre-fp:\n");
       _mesa_print_program(&fp->program.Base); 
-      _mesa_printf("\n");
+      debug_printf("\n");
    }
 
    c->pixel_xy = src_undef();
@@ -1103,10 +1103,10 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
       }
    }
 
-   if (INTEL_DEBUG & DEBUG_WM) {
-      _mesa_printf("pass_fp:\n");
+   if (BRW_DEBUG & DEBUG_WM) {
+      debug_printf("pass_fp:\n");
       print_insns( c->prog_instructions, c->nr_fp_insns );
-      _mesa_printf("\n");
+      debug_printf("\n");
    }
 }
 
diff --git a/src/gallium/drivers/i965/brw_wm_glsl.c b/src/gallium/drivers/i965/brw_wm_glsl.c
index a8de5fdd0b..3118e615f9 100644
--- a/src/gallium/drivers/i965/brw_wm_glsl.c
+++ b/src/gallium/drivers/i965/brw_wm_glsl.c
@@ -1694,7 +1694,7 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
         c->cur_inst = i;
 
 #if 0
-        _mesa_printf("Inst %d: ", i);
+        debug_printf("Inst %d: ", i);
         _mesa_print_instruction(inst);
 #endif
 
@@ -1920,7 +1920,7 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
                }
                break;
 	    default:
-		_mesa_printf("unsupported IR in fragment shader %d\n",
+		debug_printf("unsupported IR in fragment shader %d\n",
 			inst->Opcode);
 	}
 
@@ -1931,11 +1931,11 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
     }
     post_wm_emit(c);
 
-    if (INTEL_DEBUG & DEBUG_WM) {
-      _mesa_printf("wm-native:\n");
+    if (BRW_DEBUG & DEBUG_WM) {
+      debug_printf("wm-native:\n");
       for (i = 0; i < p->nr_insn; i++)
 	 brw_disasm(stderr, &p->store[i]);
-      _mesa_printf("\n");
+      debug_printf("\n");
     }
 }
 
@@ -1945,8 +1945,8 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
  */
 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
 {
-    if (INTEL_DEBUG & DEBUG_WM) {
-        _mesa_printf("brw_wm_glsl_emit:\n");
+    if (BRW_DEBUG & DEBUG_WM) {
+        debug_printf("brw_wm_glsl_emit:\n");
     }
 
     /* initial instruction translation/simplification */
@@ -1955,7 +1955,7 @@ void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
     /* actual code generation */
     brw_wm_emit_glsl(brw, c);
 
-    if (INTEL_DEBUG & DEBUG_WM) {
+    if (BRW_DEBUG & DEBUG_WM) {
         brw_wm_print_program(c, "brw_wm_glsl_emit done");
     }
 
diff --git a/src/gallium/drivers/i965/brw_wm_pass0.c b/src/gallium/drivers/i965/brw_wm_pass0.c
index 31b0270e84..71e4c56835 100644
--- a/src/gallium/drivers/i965/brw_wm_pass0.c
+++ b/src/gallium/drivers/i965/brw_wm_pass0.c
@@ -101,7 +101,7 @@ static const struct brw_wm_ref *get_param_ref( struct brw_wm_compile *c,
    GLuint i = c->prog_data.nr_params++;
    
    if (i >= BRW_WM_MAX_PARAM) {
-      _mesa_printf("%s: out of params\n", __FUNCTION__);
+      debug_printf("%s: out of params\n", __FUNCTION__);
       c->prog_data.error = 1;
       return NULL;
    }
@@ -150,7 +150,7 @@ static const struct brw_wm_ref *get_imm_ref( struct brw_wm_compile *c,
       return c->imm_ref[i].ref;
    }
    else {
-      _mesa_printf("%s: out of imm_refs\n", __FUNCTION__);
+      debug_printf("%s: out of imm_refs\n", __FUNCTION__);
       c->prog_data.error = 1;
       return NULL;
    }
@@ -434,7 +434,7 @@ void brw_wm_pass0( struct brw_wm_compile *c )
       }
    }
  
-   if (INTEL_DEBUG & DEBUG_WM) {
+   if (BRW_DEBUG & DEBUG_WM) {
       brw_wm_print_program(c, "pass0");
    }
 }
diff --git a/src/gallium/drivers/i965/brw_wm_pass1.c b/src/gallium/drivers/i965/brw_wm_pass1.c
index f2ae3a958f..85a3a55ca4 100644
--- a/src/gallium/drivers/i965/brw_wm_pass1.c
+++ b/src/gallium/drivers/i965/brw_wm_pass1.c
@@ -284,7 +284,7 @@ void brw_wm_pass1( struct brw_wm_compile *c )
       track_arg(c, inst, 2, read2);
    }
 
-   if (INTEL_DEBUG & DEBUG_WM) {
+   if (BRW_DEBUG & DEBUG_WM) {
       brw_wm_print_program(c, "pass1");
    }
 }
diff --git a/src/gallium/drivers/i965/brw_wm_pass2.c b/src/gallium/drivers/i965/brw_wm_pass2.c
index 6faea018fb..a19ca62328 100644
--- a/src/gallium/drivers/i965/brw_wm_pass2.c
+++ b/src/gallium/drivers/i965/brw_wm_pass2.c
@@ -331,13 +331,13 @@ void brw_wm_pass2( struct brw_wm_compile *c )
       }
    }
 
-   if (INTEL_DEBUG & DEBUG_WM) {
+   if (BRW_DEBUG & DEBUG_WM) {
       brw_wm_print_program(c, "pass2");
    }
 
    c->state = PASS2_DONE;
 
-   if (INTEL_DEBUG & DEBUG_WM) {
+   if (BRW_DEBUG & DEBUG_WM) {
        brw_wm_print_program(c, "pass2/done");
    }
 }
diff --git a/src/gallium/drivers/i965/brw_wm_sampler_state.c b/src/gallium/drivers/i965/brw_wm_sampler_state.c
index a8993f9312..32692d533c 100644
--- a/src/gallium/drivers/i965/brw_wm_sampler_state.c
+++ b/src/gallium/drivers/i965/brw_wm_sampler_state.c
@@ -76,8 +76,9 @@ static GLint S_FIXED(GLfloat value, GLuint frac_bits)
 }
 
 
-static struct brw_winsys_buffer *upload_default_color( struct brw_context *brw,
-				     const GLfloat *color )
+static struct brw_winsys_buffer *
+upload_default_color( struct brw_context *brw,
+		      const GLfloat *color )
 {
    struct brw_sampler_default_color sdc;
 
@@ -117,63 +118,6 @@ static void brw_update_sampler_state(struct wm_sampler_entry *key,
 {
    _mesa_memset(sampler, 0, sizeof(*sampler));
 
-   switch (key->minfilter) {
-   case GL_NEAREST:
-      sampler->ss0.min_filter = BRW_MAPFILTER_NEAREST;
-      sampler->ss0.mip_filter = BRW_MIPFILTER_NONE;
-      break;
-   case GL_LINEAR:
-      sampler->ss0.min_filter = BRW_MAPFILTER_LINEAR;
-      sampler->ss0.mip_filter = BRW_MIPFILTER_NONE;
-      break;
-   case GL_NEAREST_MIPMAP_NEAREST:
-      sampler->ss0.min_filter = BRW_MAPFILTER_NEAREST;
-      sampler->ss0.mip_filter = BRW_MIPFILTER_NEAREST;
-      break;
-   case GL_LINEAR_MIPMAP_NEAREST:
-      sampler->ss0.min_filter = BRW_MAPFILTER_LINEAR;
-      sampler->ss0.mip_filter = BRW_MIPFILTER_NEAREST;
-      break;
-   case GL_NEAREST_MIPMAP_LINEAR:
-      sampler->ss0.min_filter = BRW_MAPFILTER_NEAREST;
-      sampler->ss0.mip_filter = BRW_MIPFILTER_LINEAR;
-      break;
-   case GL_LINEAR_MIPMAP_LINEAR:
-      sampler->ss0.min_filter = BRW_MAPFILTER_LINEAR;
-      sampler->ss0.mip_filter = BRW_MIPFILTER_LINEAR;
-      break;
-   default:
-      break;
-   }
-
-   /* Set Anisotropy: 
-    */
-   if (key->max_aniso > 1.0) {
-      sampler->ss0.min_filter = BRW_MAPFILTER_ANISOTROPIC; 
-      sampler->ss0.mag_filter = BRW_MAPFILTER_ANISOTROPIC;
-
-      if (key->max_aniso > 2.0) {
-	 sampler->ss3.max_aniso = MIN2((key->max_aniso - 2) / 2,
-				       BRW_ANISORATIO_16);
-      }
-   }
-   else {
-      switch (key->magfilter) {
-      case GL_NEAREST:
-	 sampler->ss0.mag_filter = BRW_MAPFILTER_NEAREST;
-	 break;
-      case GL_LINEAR:
-	 sampler->ss0.mag_filter = BRW_MAPFILTER_LINEAR;
-	 break;
-      default:
-	 break;
-      }  
-   }
-
-   sampler->ss1.r_wrap_mode = translate_wrap_mode(key->wrap_r);
-   sampler->ss1.s_wrap_mode = translate_wrap_mode(key->wrap_s);
-   sampler->ss1.t_wrap_mode = translate_wrap_mode(key->wrap_t);
-
    /* Cube-maps on 965 and later must use the same wrap mode for all 3
     * coordinate dimensions.  Futher, only CUBE and CLAMP are valid.
     */
@@ -198,36 +142,7 @@ static void brw_update_sampler_state(struct wm_sampler_entry *key,
    }
 
 
-   /* Set shadow function: 
-    */
-   if (key->comparemode == GL_COMPARE_R_TO_TEXTURE_ARB) {
-      /* Shadowing is "enabled" by emitting a particular sampler
-       * message (sample_c).  So need to recompile WM program when
-       * shadow comparison is enabled on each/any texture unit.
-       */
-      sampler->ss0.shadow_function =
-	 intel_translate_shadow_compare_func(key->comparefunc);
-   }
-
-   /* Set LOD bias: 
-    */
-   sampler->ss0.lod_bias = S_FIXED(CLAMP(key->lod_bias, -16, 15), 6);
-
-   sampler->ss0.lod_preclamp = 1; /* OpenGL mode */
-   sampler->ss0.default_color_mode = 0; /* OpenGL/DX10 mode */
-
-   /* Set BaseMipLevel, MaxLOD, MinLOD: 
-    *
-    * XXX: I don't think that using firstLevel, lastLevel works,
-    * because we always setup the surface state as if firstLevel ==
-    * level zero.  Probably have to subtract firstLevel from each of
-    * these:
-    */
-   sampler->ss0.base_level = U_FIXED(0, 1);
 
-   sampler->ss1.max_lod = U_FIXED(MIN2(MAX2(key->maxlod, 0), 13), 6);
-   sampler->ss1.min_lod = U_FIXED(MIN2(MAX2(key->minlod, 0), 13), 6);
-   
    sampler->ss2.default_color_pointer = sdc_bo->offset >> 5; /* reloc */
 }
 
@@ -237,57 +152,42 @@ static void
 brw_wm_sampler_populate_key(struct brw_context *brw,
 			    struct wm_sampler_key *key)
 {
-   int unit;
+   int nr = MIN2(brw->curr.number_textures,
+		 brw->curr.number_samplers);
+   int i;
 
    memset(key, 0, sizeof(*key));
 
-   for (unit = 0; unit < BRW_MAX_TEX_UNIT; unit++) {
-      if (ctx->Texture.Unit[unit]._ReallyEnabled) {
-	 struct wm_sampler_entry *entry = &key->sampler[unit];
-	 struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-	 struct gl_texture_object *texObj = texUnit->_Current;
-	 struct intel_texture_object *intelObj = intel_texture_object(texObj);
-	 struct gl_texture_image *firstImage =
-	    texObj->Image[0][intelObj->firstLevel];
-
-         entry->tex_target = texObj->Target;
-
-	 entry->seamless_cube_map = (texObj->Target == GL_TEXTURE_CUBE_MAP)
-	    ? ctx->Texture.CubeMapSeamless : GL_FALSE;
-
-	 entry->wrap_r = texObj->WrapR;
-	 entry->wrap_s = texObj->WrapS;
-	 entry->wrap_t = texObj->WrapT;
-
-	 entry->maxlod = texObj->MaxLod;
-	 entry->minlod = texObj->MinLod;
-	 entry->lod_bias = texUnit->LodBias + texObj->LodBias;
-	 entry->max_aniso = texObj->MaxAnisotropy;
-	 entry->minfilter = texObj->MinFilter;
-	 entry->magfilter = texObj->MagFilter;
-	 entry->comparemode = texObj->CompareMode;
-         entry->comparefunc = texObj->CompareFunc;
-
-	 brw->sws->bo_unreference(brw->wm.sdc_bo[unit]);
-	 if (firstImage->_BaseFormat == GL_DEPTH_COMPONENT) {
-	    float bordercolor[4] = {
-	       texObj->BorderColor[0],
-	       texObj->BorderColor[0],
-	       texObj->BorderColor[0],
-	       texObj->BorderColor[0]
-	    };
-	    /* GL specs that border color for depth textures is taken from the
-	     * R channel, while the hardware uses A.  Spam R into all the
-	     * channels for safety.
-	     */
-	    brw->wm.sdc_bo[unit] = upload_default_color(brw, bordercolor);
-	 } else {
-	    brw->wm.sdc_bo[unit] = upload_default_color(brw,
-							texObj->BorderColor);
-	 }
-	 key->sampler_count = unit + 1;
+   for (i = 0; i < nr; i++) {
+      const struct brw_texture *tex = brw->curr.texture[i];
+      const struct brw_sampler *sampler = brw->curr.sampler[i];
+      struct wm_sampler_entry *entry = &key->sampler[i];
+
+      entry->tex_target = texObj->Target;
+      entry->seamless_cube_map = FALSE; /* XXX: add this to gallium */
+      entry->ss0 = sampler->ss0;
+      entry->ss1 = sampler->ss1;
+      entry->ss3 = sampler->ss3;
+
+      brw->sws->bo_unreference(brw->wm.sdc_bo[i]);
+      if (firstImage->_BaseFormat == GL_DEPTH_COMPONENT) {
+	 float bordercolor[4] = {
+	    texObj->BorderColor[0],
+	    texObj->BorderColor[0],
+	    texObj->BorderColor[0],
+	    texObj->BorderColor[0]
+	 };
+	 /* GL specs that border color for depth textures is taken from the
+	  * R channel, while the hardware uses A.  Spam R into all the
+	  * channels for safety.
+	  */
+	 brw->wm.sdc_bo[i] = upload_default_color(brw, bordercolor);
+      } else {
+	 brw->wm.sdc_bo[i] = upload_default_color(brw, texObj->BorderColor);
       }
    }
+
+   key->sampler_count = nr;
 }
 
 /* All samplers must be uploaded in a single contiguous array, which
@@ -354,7 +254,7 @@ static void upload_wm_samplers( struct brw_context *brw )
 
 const struct brw_tracked_state brw_wm_samplers = {
    .dirty = {
-      .mesa = _NEW_TEXTURE,
+      .mesa = PIPE_NEW_BOUND_TEXTURES | PIPE_NEW_SAMPLER,
       .brw = 0,
       .cache = 0
    },
diff --git a/src/gallium/drivers/i965/brw_wm_state.c b/src/gallium/drivers/i965/brw_wm_state.c
index 4989aae830..edabf6ceb6 100644
--- a/src/gallium/drivers/i965/brw_wm_state.c
+++ b/src/gallium/drivers/i965/brw_wm_state.c
@@ -65,7 +65,7 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
 
    memset(key, 0, sizeof(*key));
 
-   if (INTEL_DEBUG & DEBUG_SINGLE_THREAD)
+   if (BRW_DEBUG & DEBUG_SINGLE_THREAD)
       key->max_threads = 1;
    else {
       /* WM maximum threads is number of EUs times number of threads per EU. */
@@ -120,7 +120,7 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
    ASSERT(bfp->isGLSL == brw_wm_is_glsl(fp));
 
    /* _NEW_QUERY */
-   key->stats_wm = intel->stats_wm;
+   key->stats_wm = (brw->query.stats_wm != 0);
 
    /* _NEW_LINE */
    key->line_stipple = ctx->Line.StippleFlag;
@@ -215,7 +215,7 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
 
    wm.wm5.line_stipple = key->line_stipple;
 
-   if (INTEL_DEBUG & DEBUG_STATS || key->stats_wm)
+   if (BRW_DEBUG & DEBUG_STATS || key->stats_wm)
       wm.wm4.stats_enable = 1;
 
    bo = brw_upload_cache(&brw->cache, BRW_WM_UNIT,
-- 
cgit v1.2.3


From 7ba2fe40fa092551f1c493d754c80ca93564d32b Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Tue, 27 Oct 2009 00:29:21 +0000
Subject: i965g: still working on compilation

---
 src/gallium/drivers/i965/brw_context.h |   1 +
 src/gallium/drivers/i965/brw_eu.c      |  18 ++---
 src/gallium/drivers/i965/brw_eu.h      |   4 +-
 src/gallium/drivers/i965/brw_vs.h      |   6 ++
 src/gallium/drivers/i965/brw_vs_emit.c | 131 ++++++++++++++++-----------------
 src/gallium/drivers/i965/brw_wm.h      |   9 +--
 src/gallium/drivers/i965/brw_wm_glsl.c |   2 +-
 7 files changed, 83 insertions(+), 88 deletions(-)

(limited to 'src/gallium/drivers/i965/brw_wm.h')

diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h
index 8aaf895d20..7b85363e9f 100644
--- a/src/gallium/drivers/i965/brw_context.h
+++ b/src/gallium/drivers/i965/brw_context.h
@@ -289,6 +289,7 @@ struct brw_vs_prog_data {
    GLuint nr_params;       /**< number of TGSI_FILE_CONSTANT's */
 
    GLboolean copy_edgeflag;
+   GLboolean writes_psiz;
 
    /* Used for calculating urb partitions:
     */
diff --git a/src/gallium/drivers/i965/brw_eu.c b/src/gallium/drivers/i965/brw_eu.c
index df49d4b72f..1189a35b6f 100644
--- a/src/gallium/drivers/i965/brw_eu.c
+++ b/src/gallium/drivers/i965/brw_eu.c
@@ -152,7 +152,7 @@ const GLuint *brw_get_program( struct brw_compile *p,
  */
 struct brw_glsl_label
 {
-   const char *name; /**< the label string */
+   GLuint label;     /**< the label number */
    GLuint position;  /**< the position of the brw instruction for this label */
    struct brw_glsl_label *next;  /**< next in linked list */
 };
@@ -164,7 +164,7 @@ struct brw_glsl_label
 struct brw_glsl_call
 {
    GLuint call_inst_pos;  /**< location of the CAL instruction */
-   const char *sub_name;  /**< name of subroutine to call */
+   GLuint label;
    struct brw_glsl_call *next;  /**< next in linked list */
 };
 
@@ -173,10 +173,10 @@ struct brw_glsl_call
  * Called for each OPCODE_BGNSUB.
  */
 void
-brw_save_label(struct brw_compile *c, const char *name, GLuint position)
+brw_save_label(struct brw_compile *c, unsigned l, GLuint position)
 {
    struct brw_glsl_label *label = CALLOC_STRUCT(brw_glsl_label);
-   label->name = name;
+   label->label = l;
    label->position = position;
    label->next = c->first_label;
    c->first_label = label;
@@ -187,11 +187,11 @@ brw_save_label(struct brw_compile *c, const char *name, GLuint position)
  * Called for each OPCODE_CAL.
  */
 void
-brw_save_call(struct brw_compile *c, const char *name, GLuint call_pos)
+brw_save_call(struct brw_compile *c, GLuint label, GLuint call_pos)
 {
    struct brw_glsl_call *call = CALLOC_STRUCT(brw_glsl_call);
    call->call_inst_pos = call_pos;
-   call->sub_name = name;
+   call->label = label;
    call->next = c->first_call;
    c->first_call = call;
 }
@@ -201,11 +201,11 @@ brw_save_call(struct brw_compile *c, const char *name, GLuint call_pos)
  * Lookup a label, return label's position/offset.
  */
 static GLuint
-brw_lookup_label(struct brw_compile *c, const char *name)
+brw_lookup_label(struct brw_compile *c, unsigned l)
 {
    const struct brw_glsl_label *label;
    for (label = c->first_label; label; label = label->next) {
-      if (strcmp(name, label->name) == 0) {
+      if (l == label->label) {
          return label->position;
       }
    }
@@ -224,7 +224,7 @@ brw_resolve_cals(struct brw_compile *c)
     const struct brw_glsl_call *call;
 
     for (call = c->first_call; call; call = call->next) {
-        const GLuint sub_loc = brw_lookup_label(c, call->sub_name);
+        const GLuint sub_loc = brw_lookup_label(c, call->label);
 	struct brw_instruction *brw_call_inst = &c->store[call->call_inst_pos];
 	struct brw_instruction *brw_sub_inst = &c->store[sub_loc];
 	GLint offset = brw_sub_inst - brw_call_inst;
diff --git a/src/gallium/drivers/i965/brw_eu.h b/src/gallium/drivers/i965/brw_eu.h
index ac5a623cac..3379522104 100644
--- a/src/gallium/drivers/i965/brw_eu.h
+++ b/src/gallium/drivers/i965/brw_eu.h
@@ -136,10 +136,10 @@ struct brw_compile {
 
 
 void
-brw_save_label(struct brw_compile *c, const char *name, GLuint position);
+brw_save_label(struct brw_compile *c, unsigned label, GLuint position);
 
 void
-brw_save_call(struct brw_compile *c, const char *name, GLuint call_pos);
+brw_save_call(struct brw_compile *c, unsigned label, GLuint call_pos);
 
 void
 brw_resolve_cals(struct brw_compile *c);
diff --git a/src/gallium/drivers/i965/brw_vs.h b/src/gallium/drivers/i965/brw_vs.h
index 58119567dc..2a2dbb3457 100644
--- a/src/gallium/drivers/i965/brw_vs.h
+++ b/src/gallium/drivers/i965/brw_vs.h
@@ -54,6 +54,7 @@ struct brw_vs_compile {
    struct brw_compile func;
    struct brw_vs_prog_key key;
    struct brw_vs_prog_data prog_data;
+   struct brw_chipset chipset;
 
    struct brw_vertex_shader *vp;
 
@@ -88,7 +89,12 @@ struct brw_vs_compile {
 
    struct brw_instruction *if_inst[MAX_IF_DEPTH];
    struct brw_instruction *loop_inst[MAX_LOOP_DEPTH];
+   GLuint insn;
+   GLuint if_depth;
+   GLuint loop_depth;
+   GLuint end_offset;
 
+   struct brw_indirect stack_index;
 };
 
 
diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c
index 4daa98b29e..5366ab8514 100644
--- a/src/gallium/drivers/i965/brw_vs_emit.c
+++ b/src/gallium/drivers/i965/brw_vs_emit.c
@@ -35,19 +35,15 @@
 #include "util/u_math.h"
 
 #include "tgsi/tgsi_ureg.h"
+#include "tgsi/tgsi_ureg_parse.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_info.h"
 
 #include "brw_context.h"
 #include "brw_vs.h"
 #include "brw_debug.h"
 
 
-struct ureg_instruction {
-   unsigned opcode:8;
-   unsigned tex_target:3;
-   struct ureg_dst dst;
-   struct ureg_src src[3];
-};
-
 
 static struct brw_reg get_tmp( struct brw_vs_compile *c )
 {
@@ -149,7 +145,7 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
    c->first_output = reg;
    c->first_overflow_output = 0;
 
-   if (BRW_IS_IGDNG(c->func.brw))
+   if (c->chipset.is_igdng)
       mrf = 8;
    else
       mrf = 4;
@@ -251,7 +247,7 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
     */
    attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);
 
-   if (BRW_IS_IGDNG(c->func.brw))
+   if (c->chipset.is_igdng)
       c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
    else
       c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;
@@ -1058,7 +1054,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
     */
    if (c->prog_data.writes_psiz ||
        c->key.nr_userclip || 
-       BRW_IS_965(p->brw))
+       c->chipset.is_965)
    {
       struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
       GLuint i;
@@ -1089,7 +1085,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
        * Later, clipping will detect ucp[6] and ensure the primitive is
        * clipped against all fixed planes.
        */
-      if (BRW_IS_965(p->brw)) {
+      if (c->chipset.is_965) {
 	 brw_CMP(p,
 		 vec8(brw_null_reg()),
 		 BRW_CONDITIONAL_L,
@@ -1117,7 +1113,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
    brw_set_access_mode(p, BRW_ALIGN_1);
    brw_MOV(p, offset(m0, 2), ndc);
 
-   if (BRW_IS_IGDNG(p->brw)) {
+   if (c->chipset.is_igdng) {
        /* There are 20 DWs (D0-D19) in VUE vertex header on IGDNG */
        brw_MOV(p, offset(m0, 3), pos); /* a portion of vertex header */
        /* m4, m5 contain the distances from vertex to the user clip planeXXX. 
@@ -1205,6 +1201,9 @@ post_vs_emit( struct brw_vs_compile *c,
 static uint32_t
 get_predicate(const struct ureg_instruction *inst)
 {
+   /* XXX: disabling for now
+    */
+#if 0
    if (inst->dst.CondMask == COND_TR)
       return BRW_PREDICATE_NONE;
 
@@ -1237,11 +1236,15 @@ get_predicate(const struct ureg_instruction *inst)
 		    inst->dst.CondMask);
       return BRW_PREDICATE_NORMAL;
    }
+#else
+   return BRW_PREDICATE_NORMAL;
+#endif
 }
 
 static void emit_insn(struct brw_vs_compile *c,
-		      const struct tgsi_full_instruction *insn)
+		      const struct ureg_instruction *inst)
 {
+   struct brw_compile *p = &c->func;
    struct brw_reg args[3], dst;
    GLuint i;
 
@@ -1253,9 +1256,6 @@ static void emit_insn(struct brw_vs_compile *c,
    /* Get argument regs.
     */
    for (i = 0; i < 3; i++) {
-      const struct ureg_src src = inst->src[i];
-      index = src.Index;
-      file = src.File;	
       args[i] = get_arg(c, inst, i);
    }
 
@@ -1263,16 +1263,13 @@ static void emit_insn(struct brw_vs_compile *c,
     * dst and arg, given the static allocation of registers.  So
     * care needs to be taken emitting multi-operation instructions.
     */ 
-   index = inst->dst.Index;
-   file = inst->dst.File;
    dst = get_dst(c, inst->dst);
 
-   if (inst->SaturateMode != SATURATE_OFF) {
-      debug_printf("Unsupported saturate %d in vertex shader",
-		   inst->SaturateMode);
+   if (inst->dst.Saturate) {
+      debug_printf("Unsupported saturate in vertex shader");
    }
 
-   switch (inst->Opcode) {
+   switch (inst->opcode) {
    case TGSI_OPCODE_ABS:
       brw_MOV(p, dst, brw_abs(args[0]));
       break;
@@ -1291,7 +1288,7 @@ static void emit_insn(struct brw_vs_compile *c,
    case TGSI_OPCODE_DPH:
       brw_DPH(p, dst, args[0], args[1]);
       break;
-   case TGSI_OPCODE_NRM3:
+   case TGSI_OPCODE_NRM:
       emit_nrm(c, dst, args[0], 3);
       break;
    case TGSI_OPCODE_NRM4:
@@ -1384,21 +1381,21 @@ static void emit_insn(struct brw_vs_compile *c,
       emit_xpd(p, dst, args[0], args[1]);
       break;
    case TGSI_OPCODE_IF:
-      assert(if_depth < MAX_IF_DEPTH);
-      if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
+      assert(c->if_depth < MAX_IF_DEPTH);
+      c->if_inst[c->if_depth] = brw_IF(p, BRW_EXECUTE_8);
       /* Note that brw_IF smashes the predicate_control field. */
-      if_inst[if_depth]->header.predicate_control = get_predicate(inst);
-      if_depth++;
+      c->if_inst[c->if_depth]->header.predicate_control = get_predicate(inst);
+      c->if_depth++;
       break;
    case TGSI_OPCODE_ELSE:
-      if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
+      c->if_inst[c->if_depth-1] = brw_ELSE(p, c->if_inst[c->if_depth-1]);
       break;
    case TGSI_OPCODE_ENDIF:
-      assert(if_depth > 0);
-      brw_ENDIF(p, if_inst[--if_depth]);
+      assert(c->if_depth > 0);
+      brw_ENDIF(p, c->if_inst[--c->if_depth]);
       break;			
    case TGSI_OPCODE_BGNLOOP:
-      loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
+      c->loop_inst[c->loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
       break;
    case TGSI_OPCODE_BRK:
       brw_set_predicate_control(p, get_predicate(inst));
@@ -1415,14 +1412,14 @@ static void emit_insn(struct brw_vs_compile *c,
       struct brw_instruction *inst0, *inst1;
       GLuint br = 1;
 
-      loop_depth--;
+      c->loop_depth--;
 
-      if (BRW_IS_IGDNG(brw))
+      if (c->chipset.is_igdng)
 	 br = 2;
 
-      inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
+      inst0 = inst1 = brw_WHILE(p, c->loop_inst[c->loop_depth]);
       /* patch all the BREAK/CONT instructions from last BEGINLOOP */
-      while (inst0 > loop_inst[loop_depth]) {
+      while (inst0 > c->loop_inst[c->loop_depth]) {
 	 inst0--;
 	 if (inst0->header.opcode == TGSI_OPCODE_BRK) {
 	    inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
@@ -1442,41 +1439,37 @@ static void emit_insn(struct brw_vs_compile *c,
       break;
    case TGSI_OPCODE_CAL:
       brw_set_access_mode(p, BRW_ALIGN_1);
-      brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
+      brw_ADD(p, deref_1d(c->stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
       brw_set_access_mode(p, BRW_ALIGN_16);
-      brw_ADD(p, get_addr_reg(stack_index),
-	      get_addr_reg(stack_index), brw_imm_d(4));
-      brw_save_call(p, inst->Comment, p->nr_insn);
+      brw_ADD(p, get_addr_reg(c->stack_index),
+	      get_addr_reg(c->stack_index), brw_imm_d(4));
+      brw_save_call(p, inst->label, p->nr_insn);
       brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
       break;
    case TGSI_OPCODE_RET:
-      brw_ADD(p, get_addr_reg(stack_index),
-	      get_addr_reg(stack_index), brw_imm_d(-4));
+      brw_ADD(p, get_addr_reg(c->stack_index),
+	      get_addr_reg(c->stack_index), brw_imm_d(-4));
       brw_set_access_mode(p, BRW_ALIGN_1);
-      brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
+      brw_MOV(p, brw_ip_reg(), deref_1d(c->stack_index, 0));
       brw_set_access_mode(p, BRW_ALIGN_16);
       break;
    case TGSI_OPCODE_END:	
-      end_offset = p->nr_insn;
+      c->end_offset = p->nr_insn;
       /* this instruction will get patched later to jump past subroutine
        * code, etc.
        */
       brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
       break;
-   case TGSI_OPCODE_PRINT:
-      /* no-op */
-      break;
    case TGSI_OPCODE_BGNSUB:
-      brw_save_label(p, inst->Comment, p->nr_insn);
+      brw_save_label(p, p->nr_insn, p->nr_insn);
       break;
    case TGSI_OPCODE_ENDSUB:
       /* no-op */
       break;
    default:
       debug_printf("Unsupported opcode %i (%s) in vertex shader",
-		   inst->Opcode, inst->Opcode < MAX_OPCODE ?
-		   _mesa_opcode_string(inst->Opcode) :
-		   "unknown");
+		   inst->opcode, 
+		   tgsi_get_opcode_name(inst->opcode));
    }
 
    /* Set the predication update on the last instruction of the native
@@ -1485,12 +1478,16 @@ static void emit_insn(struct brw_vs_compile *c,
     * This would be problematic if it was set on a math instruction,
     * but that shouldn't be the case with the current GLSL compiler.
     */
+#if 0
+   /* XXX: disabled
+    */
    if (inst->CondUpdate) {
       struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
 
       assert(hw_insn->header.destreg__conditionalmod == 0);
       hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
    }
+#endif
 
    release_tmps(c);
 }
@@ -1498,24 +1495,19 @@ static void emit_insn(struct brw_vs_compile *c,
 
 /* Emit the vertex program instructions here.
  */
-void brw_vs_emit(struct brw_vs_compile *c )
+void brw_vs_emit(struct brw_vs_compile *c)
 {
    struct brw_compile *p = &c->func;
-   struct brw_context *brw = p->brw;
-   GLuint insn, if_depth = 0, loop_depth = 0;
-   GLuint end_offset = 0;
    struct brw_instruction *end_inst, *last_inst;
-   const struct brw_indirect stack_index = brw_indirect(0, 0);   
-   struct tgsi_parse_context parse;
-   struct tgsi_full_declaration *decl;
-   GLuint index;
-   GLuint file;
+   struct ureg_parse_context parse;
+   struct ureg_declaration *decl;
+   struct ureg_declaration *imm;
+   struct ureg_declaration *insn;
 
-   if (BRW_DEBUG & DEBUG_VS) {
-      debug_printf("vs-mesa:\n");
-      _mesa_print_program(&c->vp->program.Base); 
-      debug_printf("\n");
-   }
+   if (BRW_DEBUG & DEBUG_VS)
+      tgsi_dump(c->vp->tokens, 0); 
+
+   c->stack_index = brw_indirect(0, 0);
 
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_set_access_mode(p, BRW_ALIGN_16);
@@ -1523,12 +1515,15 @@ void brw_vs_emit(struct brw_vs_compile *c )
    /* Static register allocation
     */
    brw_vs_alloc_regs(c);
-   brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
+   brw_MOV(p, get_addr_reg(c->stack_index), brw_address(c->stack));
 
-   for (insn = 0; insn < nr_insns; insn++) {
+   while (ureg_next_decl(&parse, &decl)) {
+   }
 
-      const struct ureg_instruction *inst = &c->vp->program.Base.Instructions[insn];
-      
+   while (ureg_next_immediate(&parse, &imm)) {
+   }
+
+   while (ureg_next_instruction(&parse, &insn)) {
    }
 
    end_inst = &p->store[end_offset];
diff --git a/src/gallium/drivers/i965/brw_wm.h b/src/gallium/drivers/i965/brw_wm.h
index 5bc2a49c1f..084430cf28 100644
--- a/src/gallium/drivers/i965/brw_wm.h
+++ b/src/gallium/drivers/i965/brw_wm.h
@@ -34,6 +34,7 @@
 #define BRW_WM_H
 
 #include "tgsi/tgsi_ureg.h"
+#include "tgsi/tgsi_ureg_parse.h"
 
 #include "brw_context.h"
 #include "brw_eu.h"
@@ -163,14 +164,6 @@ struct brw_wm_instruction {
 #define BRW_WM_MAX_SUBROUTINE 16
 
 
-struct ureg_instruction {
-   unsigned opcode:8;
-   unsigned tex_target:3;
-   struct ureg_dst dst;
-   struct ureg_src src[3];
-};
-
-
 /* New opcodes to track internal operations required for WM unit.
  * These are added early so that the registers used can be tracked,
  * freed and reused like those of other instructions.
diff --git a/src/gallium/drivers/i965/brw_wm_glsl.c b/src/gallium/drivers/i965/brw_wm_glsl.c
index 23f7ba16fd..59bc4ef701 100644
--- a/src/gallium/drivers/i965/brw_wm_glsl.c
+++ b/src/gallium/drivers/i965/brw_wm_glsl.c
@@ -1867,7 +1867,7 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
                 brw_set_access_mode(p, BRW_ALIGN_16);
                 brw_ADD(p, get_addr_reg(stack_index),
                          get_addr_reg(stack_index), brw_imm_d(4));
-		brw_save_call(&c->func, inst->Comment, p->nr_insn);
+		brw_save_call(&c->func, inst->label, p->nr_insn);
                 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
                 brw_pop_insn_state(p);
 		break;
-- 
cgit v1.2.3


From 99cc0fd67597cbcd6106afcf437a0d5e2431c9df Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Thu, 29 Oct 2009 20:18:01 +0000
Subject: i965g: work in progress on fragment shaders

---
 src/gallium/drivers/i965/brw_context.h     |   10 +-
 src/gallium/drivers/i965/brw_eu.c          |   20 +-
 src/gallium/drivers/i965/brw_eu.h          |    8 +-
 src/gallium/drivers/i965/brw_pipe_depth.c  |   42 +-
 src/gallium/drivers/i965/brw_pipe_rast.c   |   18 +
 src/gallium/drivers/i965/brw_pipe_rast.h   |    1 +
 src/gallium/drivers/i965/brw_pipe_shader.c |    4 +-
 src/gallium/drivers/i965/brw_screen.h      |    7 +
 src/gallium/drivers/i965/brw_vs_emit.c     |    2 -
 src/gallium/drivers/i965/brw_wm.c          |  167 ++---
 src/gallium/drivers/i965/brw_wm.h          |   41 +-
 src/gallium/drivers/i965/brw_wm_debug.c    |   17 +-
 src/gallium/drivers/i965/brw_wm_emit.c     |  195 +++---
 src/gallium/drivers/i965/brw_wm_fp.c       | 1031 ++++++++++------------------
 src/gallium/drivers/i965/brw_wm_glsl.c     |   12 +-
 src/gallium/drivers/i965/brw_wm_pass0.c    |   73 +-
 src/gallium/drivers/i965/brw_wm_pass1.c    |   26 +-
 src/gallium/drivers/i965/brw_wm_state.c    |    8 +-
 18 files changed, 682 insertions(+), 1000 deletions(-)

(limited to 'src/gallium/drivers/i965/brw_wm.h')

diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h
index 7b85363e9f..e6c3161066 100644
--- a/src/gallium/drivers/i965/brw_context.h
+++ b/src/gallium/drivers/i965/brw_context.h
@@ -132,6 +132,8 @@ struct brw_depth_stencil_state {
    struct brw_cc2 cc2;
    struct brw_cc3 cc3;
    struct brw_cc7 cc7;
+
+   unsigned iz_lookup;
 };
 
 
@@ -164,7 +166,10 @@ struct brw_fragment_shader {
    const struct tgsi_token *tokens;
    struct tgsi_shader_info info;
 
-   GLboolean isGLSL;
+   unsigned iz_lookup;
+   
+   boolean  uses_depth:1;
+   boolean  has_flow_control:1;
 
    unsigned id;
    struct brw_winsys_buffer *const_buffer;    /** Program constant buffer/surface */
@@ -194,6 +199,7 @@ struct brw_fragment_shader {
 #define PIPE_NEW_COLOR_BUFFERS          0x40000
 #define PIPE_NEW_QUERY                  0x80000
 #define PIPE_NEW_SCISSOR                0x100000
+#define PIPE_NEW_BOUND_TEXTURES         0x200000
 
 
@@ -487,7 +493,7 @@ struct brw_context
       const struct brw_rasterizer_state *rast;
       const struct brw_depth_stencil_state *zstencil;
 
-      const struct pipe_texture *texture[PIPE_MAX_SAMPLERS];
+      const struct brw_texture *texture[PIPE_MAX_SAMPLERS];
       const struct pipe_sampler *sampler[PIPE_MAX_SAMPLERS];
       unsigned num_textures;
       unsigned num_samplers;
diff --git a/src/gallium/drivers/i965/brw_eu.c b/src/gallium/drivers/i965/brw_eu.c
index 1189a35b6f..de43b14512 100644
--- a/src/gallium/drivers/i965/brw_eu.c
+++ b/src/gallium/drivers/i965/brw_eu.c
@@ -150,22 +150,22 @@ const GLuint *brw_get_program( struct brw_compile *p,
 /**
  * For each OPCODE_BGNSUB we create one of these.
  */
-struct brw_glsl_label
+struct brw_eu_label
 {
    GLuint label;     /**< the label number */
    GLuint position;  /**< the position of the brw instruction for this label */
-   struct brw_glsl_label *next;  /**< next in linked list */
+   struct brw_eu_label *next;  /**< next in linked list */
 };
 
 
 /**
  * For each OPCODE_CAL we create one of these.
  */
-struct brw_glsl_call
+struct brw_eu_call
 {
    GLuint call_inst_pos;  /**< location of the CAL instruction */
    GLuint label;
-   struct brw_glsl_call *next;  /**< next in linked list */
+   struct brw_eu_call *next;  /**< next in linked list */
 };
 
 
@@ -175,7 +175,7 @@ struct brw_glsl_call
 void
 brw_save_label(struct brw_compile *c, unsigned l, GLuint position)
 {
-   struct brw_glsl_label *label = CALLOC_STRUCT(brw_glsl_label);
+   struct brw_eu_label *label = CALLOC_STRUCT(brw_eu_label);
    label->label = l;
    label->position = position;
    label->next = c->first_label;
@@ -189,7 +189,7 @@ brw_save_label(struct brw_compile *c, unsigned l, GLuint position)
 void
 brw_save_call(struct brw_compile *c, GLuint label, GLuint call_pos)
 {
-   struct brw_glsl_call *call = CALLOC_STRUCT(brw_glsl_call);
+   struct brw_eu_call *call = CALLOC_STRUCT(brw_eu_call);
    call->call_inst_pos = call_pos;
    call->label = label;
    call->next = c->first_call;
@@ -203,7 +203,7 @@ brw_save_call(struct brw_compile *c, GLuint label, GLuint call_pos)
 static GLuint
 brw_lookup_label(struct brw_compile *c, unsigned l)
 {
-   const struct brw_glsl_label *label;
+   const struct brw_eu_label *label;
    for (label = c->first_label; label; label = label->next) {
       if (l == label->label) {
          return label->position;
@@ -221,7 +221,7 @@ brw_lookup_label(struct brw_compile *c, unsigned l)
 void
 brw_resolve_cals(struct brw_compile *c)
 {
-    const struct brw_glsl_call *call;
+    const struct brw_eu_call *call;
 
     for (call = c->first_call; call; call = call->next) {
         const GLuint sub_loc = brw_lookup_label(c, call->label);
@@ -235,7 +235,7 @@ brw_resolve_cals(struct brw_compile *c)
 
     /* free linked list of calls */
     {
-        struct brw_glsl_call *call, *next;
+        struct brw_eu_call *call, *next;
         for (call = c->first_call; call; call = next) {
 	    next = call->next;
 	    FREE(call);
@@ -245,7 +245,7 @@ brw_resolve_cals(struct brw_compile *c)
 
     /* free linked list of labels */
     {
-        struct brw_glsl_label *label, *next;
+        struct brw_eu_label *label, *next;
 	for (label = c->first_label; label; label = next) {
 	    next = label->next;
 	    FREE(label);
diff --git a/src/gallium/drivers/i965/brw_eu.h b/src/gallium/drivers/i965/brw_eu.h
index 3379522104..7bddc3859c 100644
--- a/src/gallium/drivers/i965/brw_eu.h
+++ b/src/gallium/drivers/i965/brw_eu.h
@@ -109,8 +109,8 @@ struct brw_indirect {
 };
 
 
-struct brw_glsl_label;
-struct brw_glsl_call;
+struct brw_eu_label;
+struct brw_eu_call;
 
 
@@ -130,8 +130,8 @@ struct brw_compile {
    GLboolean single_program_flow;
    struct brw_context *brw;
 
-   struct brw_glsl_label *first_label;  /**< linked list of labels */
-   struct brw_glsl_call *first_call;    /**< linked list of CALs */
+   struct brw_eu_label *first_label;  /**< linked list of labels */
+   struct brw_eu_call *first_call;    /**< linked list of CALs */
 };
 
 
diff --git a/src/gallium/drivers/i965/brw_pipe_depth.c b/src/gallium/drivers/i965/brw_pipe_depth.c
index 33fe517e0b..e010d76e0d 100644
--- a/src/gallium/drivers/i965/brw_pipe_depth.c
+++ b/src/gallium/drivers/i965/brw_pipe_depth.c
@@ -5,6 +5,10 @@
 #include "brw_context.h"
 #include "brw_defines.h"
 
+/* XXX: FIXME - brw_wm.h is included here only to get the IZ_ defines.
+ */
+#include "brw_wm.h"
+
 static unsigned brw_translate_compare_func(unsigned func)
 {
    switch (func) {
@@ -55,13 +59,9 @@ static unsigned translate_stencil_op(unsigned op)
    }
 }
 
-
-static void *
-brw_create_depth_stencil_state( struct pipe_context *pipe,
-				const struct pipe_depth_stencil_alpha_state *templ )
+static void create_bcc_state( struct brw_depth_stencil_state *zstencil,
+			      const struct pipe_depth_stencil_alpha_state *templ )
 {
-   struct brw_depth_stencil_state *zstencil = CALLOC_STRUCT(brw_depth_stencil_state);
-
    if (templ->stencil[0].enabled) {
       zstencil->cc0.stencil_enable = 1;
       zstencil->cc0.stencil_func =
@@ -108,6 +108,52 @@ brw_create_depth_stencil_state( struct pipe_context *pipe,
       zstencil->cc2.depth_test_function = brw_translate_compare_func(templ->depth.func);
       zstencil->cc2.depth_write_enable = templ->depth.writemask;
    }
+}
+
+/**
+ * Build the IZ lookup bits for the WM program key from the depth,
+ * stencil and alpha-test fields already stored in zstencil.
+ *
+ * Only ORs bits into zstencil->iz_lookup, so the caller must have
+ * filled in cc0/cc2/cc3 first and iz_lookup must start out zero.
+ */
+static void create_wm_iz_state( struct brw_depth_stencil_state *zstencil )
+{
+   if (zstencil->cc3.alpha_test)
+      zstencil->iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
+
+   if (zstencil->cc2.depth_test)
+      zstencil->iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
+
+   if (zstencil->cc2.depth_write_enable)
+      zstencil->iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
+
+   if (zstencil->cc0.stencil_enable)
+      zstencil->iz_lookup |= IZ_STENCIL_TEST_ENABLE_BIT;
+
+   if (zstencil->cc0.stencil_write_enable)
+      zstencil->iz_lookup |= IZ_STENCIL_WRITE_ENABLE_BIT;
+}
+
+
+/**
+ * Create the driver's depth/stencil/alpha state object from a gallium
+ * template.
+ *
+ * Returns the new brw_depth_stencil_state, or NULL if allocation fails.
+ */
+static void *
+brw_create_depth_stencil_state( struct pipe_context *pipe,
+				const struct pipe_depth_stencil_alpha_state *templ )
+{
+   struct brw_depth_stencil_state *zstencil = CALLOC_STRUCT(brw_depth_stencil_state);
+
+   /* Allocation can fail; both helpers below dereference zstencil. */
+   if (zstencil == NULL)
+      return NULL;
+
+   create_bcc_state( zstencil, templ );
+   create_wm_iz_state( zstencil );
 
    return (void *)zstencil;
 }
diff --git a/src/gallium/drivers/i965/brw_pipe_rast.c b/src/gallium/drivers/i965/brw_pipe_rast.c
index 86822d478a..51159bf147 100644
--- a/src/gallium/drivers/i965/brw_pipe_rast.c
+++ b/src/gallium/drivers/i965/brw_pipe_rast.c
@@ -64,3 +64,21 @@ calculate_line_stipple_rast()
    bls.bits1.inverse_repeat_count = tmpi;
 
 }
+
+/* Derive the AA_* value for line_aa from the fill_cw / fill_ccw polygon modes.
+ * NOTE(review): 'rast' and 'line_aa' are not declared in this function - presumably the result belongs in rast->unfilled_aa_line; confirm. */
+static void
+calculate_wm_lookup()
+{
+   if (rast->fill_cw == PIPE_POLYGON_MODE_LINE &&
+       rast->fill_ccw == PIPE_POLYGON_MODE_LINE) {
+      line_aa = AA_ALWAYS;      /* both windings are line-filled */
+   }
+   else if (rast->fill_cw == PIPE_POLYGON_MODE_LINE ||
+	    rast->fill_ccw == PIPE_POLYGON_MODE_LINE) {
+      line_aa = AA_SOMETIMES;   /* only one winding is line-filled */
+   }
+   else {
+      line_aa = AA_NEVER;       /* neither winding is line-filled */
+   }
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_rast.h b/src/gallium/drivers/i965/brw_pipe_rast.h
index 800a9208a7..9354f01e18 100644
--- a/src/gallium/drivers/i965/brw_pipe_rast.h
+++ b/src/gallium/drivers/i965/brw_pipe_rast.h
@@ -10,6 +10,7 @@ struct brw_rasterizer_state {
     */
    struct brw_clip_prog_key clip_key;
    struct brw_line_stipple bls;
+   unsigned unfilled_aa_line;
 };
 
 #endif
diff --git a/src/gallium/drivers/i965/brw_pipe_shader.c b/src/gallium/drivers/i965/brw_pipe_shader.c
index 8b61da763c..6e37eac634 100644
--- a/src/gallium/drivers/i965/brw_pipe_shader.c
+++ b/src/gallium/drivers/i965/brw_pipe_shader.c
@@ -39,7 +39,7 @@
  * as flow conditionals, loops, subroutines.
  * Some GLSL shaders may use these features, others might not.
  */
-GLboolean brw_wm_is_glsl(const struct brw_fragment_shader *fp)
+GLboolean brw_wm_has_flow_control(const struct brw_fragment_shader *fp)
 {
     return (fp->info.insn_count[TGSI_OPCODE_ARL] > 0 ||
 	    fp->info.insn_count[TGSI_OPCODE_IF] > 0 ||
@@ -144,7 +144,7 @@ static void brwProgramStringNotify( struct brw_context *brw,
       if (newFP == curFP)
 	 brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM;
       newFP->id = brw->program_id++;      
-      newFP->isGLSL = brw_wm_is_glsl(fprog);
+      newFP->has_flow_control = brw_wm_has_flow_control(fprog);
    }
    else if (target == GL_VERTEX_PROGRAM_ARB) {
       struct gl_vertex_program *vprog = (struct gl_vertex_program *) prog;
diff --git a/src/gallium/drivers/i965/brw_screen.h b/src/gallium/drivers/i965/brw_screen.h
index eafd8ddf77..efa27db1e0 100644
--- a/src/gallium/drivers/i965/brw_screen.h
+++ b/src/gallium/drivers/i965/brw_screen.h
@@ -64,6 +64,13 @@ struct brw_buffer
    boolean is_user_buffer;
 };
 
+struct brw_texture   /* driver texture object: wraps pipe_texture and adds driver-private fields */
+{
+   struct pipe_texture base;   /* generic gallium texture; the WM key code reads tex->base.format */
+
+   ubyte shader_swizzle;   /* NOTE(review): meaning and encoding are not visible here - document once defined */
+};
+
 
 /*
  * Cast wrappers
diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c
index 6809bccdec..bcc5c5f713 100644
--- a/src/gallium/drivers/i965/brw_vs_emit.c
+++ b/src/gallium/drivers/i965/brw_vs_emit.c
@@ -1013,8 +1013,6 @@ static struct brw_reg get_arg( struct brw_vs_compile *c,
 				       src->SrcRegister.SwizzleZ,
 				       src->SrcRegister.SwizzleW);
 
-   /* Note this is ok for non-swizzle instructions: 
-    */
    reg.negate = src->SrcRegister.Negate ? 1 : 0;   
 
    /* XXX: abs, absneg
diff --git a/src/gallium/drivers/i965/brw_wm.c b/src/gallium/drivers/i965/brw_wm.c
index f0dabfcfd0..33602b59c1 100644
--- a/src/gallium/drivers/i965/brw_wm.c
+++ b/src/gallium/drivers/i965/brw_wm.c
@@ -28,14 +28,17 @@
   * Authors:
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
+#include "pipe/p_error.h"
 
 #include "tgsi/tgsi_info.h"
 
 #include "brw_context.h"
+#include "brw_screen.h"
 #include "brw_util.h"
 #include "brw_wm.h"
 #include "brw_state.h"
 #include "brw_debug.h"
+#include "brw_pipe_rast.h"
 
 
 /** Return number of src args for given instruction */
@@ -85,12 +88,12 @@ GLuint brw_wm_is_scalar_result( GLuint opcode )
 
 
 /**
- * Do GPU code generation for non-GLSL shader.  non-GLSL shaders have
- * no flow control instructions so we can more readily do SSA-style
- * optimizations.
+ * Do GPU code generation for shaders without flow control.  Shaders
+ * without flow control instructions can more readily be analysed for
+ * SSA-style optimizations.
  */
 static void
-brw_wm_non_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
+brw_wm_linear_shader_emit(struct brw_context *brw, struct brw_wm_compile *c)
 {
    /* Augment fragment program.  Add instructions for pre- and
     * post-fragment-program tasks such as interpolation and fogging.
@@ -136,7 +139,7 @@ brw_wm_non_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
  * Depending on the instructions used (i.e. flow control instructions)
  * we'll use one of two code generators.
  */
-static void do_wm_prog( struct brw_context *brw,
+static int do_wm_prog( struct brw_context *brw,
 			struct brw_fragment_shader *fp, 
 			struct brw_wm_prog_key *key)
 {
@@ -153,7 +156,7 @@ static void do_wm_prog( struct brw_context *brw,
           * without triggering a segfault, no way to signal,
           * so just return.
           */
-         return;
+         return PIPE_ERROR_OUT_OF_MEMORY;
       }
    } else {
       memset(c, 0, sizeof(*brw->wm.compile_data));
@@ -166,19 +169,19 @@ static void do_wm_prog( struct brw_context *brw,
    brw_init_compile(brw, &c->func);
 
    /* temporary sanity check assertion */
-   assert(fp->isGLSL == brw_wm_is_glsl(&c->fp->program));
+   assert(fp->has_flow_control == brw_wm_has_flow_control(c->fp));
 
    /*
     * Shader which use GLSL features such as flow control are handled
     * differently from "simple" shaders.
     */
-   if (fp->isGLSL) {
+   if (fp->has_flow_control) {
       c->dispatch_width = 8;
-      brw_wm_glsl_emit(brw, c);
+      brw_wm_branching_shader_emit(brw, c);
    }
    else {
       c->dispatch_width = 16;
-      brw_wm_non_glsl_emit(brw, c);
+      brw_wm_linear_shader_emit(brw, c);
    }
 
    if (BRW_DEBUG & DEBUG_WM)
@@ -195,6 +198,8 @@ static void do_wm_prog( struct brw_context *brw,
 				       program, program_size,
 				       &c->prog_data,
 				       &brw->wm.prog_data );
+
+   return 0;
 }
 
 
@@ -202,71 +207,36 @@ static void do_wm_prog( struct brw_context *brw,
 static void brw_wm_populate_key( struct brw_context *brw,
 				 struct brw_wm_prog_key *key )
 {
-   /* BRW_NEW_FRAGMENT_PROGRAM */
-   const struct brw_fragment_program *fp = brw->curr.fragment_shader;
-   GLboolean uses_depth = (fp->program.Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
-   GLuint lookup = 0;
-   GLuint line_aa;
-   GLuint i;
+   unsigned lookup, line_aa;
+   unsigned i;
 
    memset(key, 0, sizeof(*key));
 
-   /* Build the index for table lookup
+   /* PIPE_NEW_FRAGMENT_SHADER
+    * PIPE_NEW_DEPTH_STENCIL_ALPHA
     */
-   /* _NEW_COLOR */
-   if (fp->program.UsesKill ||
-       ctx->Color.AlphaEnabled)
-      lookup |= IZ_PS_KILL_ALPHATEST_BIT;
-
-   if (fp->program.Base.OutputsWritten & (1<<FRAG_RESULT_DEPTH))
-      lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
-
-   /* _NEW_DEPTH */
-   if (ctx->Depth.Test)
-      lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
-
-   if (ctx->Depth.Test &&  
-       ctx->Depth.Mask) /* ?? */
-      lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
+   lookup = (brw->curr.zstencil->iz_lookup |
+	     brw->curr.fragment_shader->iz_lookup);
 
-   /* _NEW_STENCIL */
-   if (ctx->Stencil._Enabled) {
-      lookup |= IZ_STENCIL_TEST_ENABLE_BIT;
 
-      if (ctx->Stencil.WriteMask[0] ||
-	  ctx->Stencil.WriteMask[ctx->Stencil._BackFace])
-	 lookup |= IZ_STENCIL_WRITE_ENABLE_BIT;
-   }
-
-   line_aa = AA_NEVER;
-
-   /* _NEW_LINE, _NEW_POLYGON, BRW_NEW_REDUCED_PRIMITIVE */
-   if (ctx->Line.SmoothFlag) {
-      if (brw->intel.reduced_primitive == GL_LINES) {
-	 line_aa = AA_ALWAYS;
-      }
-      else if (brw->intel.reduced_primitive == GL_TRIANGLES) {
-	 if (ctx->Polygon.FrontMode == GL_LINE) {
-	    line_aa = AA_SOMETIMES;
-
-	    if (ctx->Polygon.BackMode == GL_LINE ||
-		(ctx->Polygon.CullFlag &&
-		 ctx->Polygon.CullFaceMode == GL_BACK))
-	       line_aa = AA_ALWAYS;
-	 }
-	 else if (ctx->Polygon.BackMode == GL_LINE) {
-	    line_aa = AA_SOMETIMES;
-
-	    if ((ctx->Polygon.CullFlag &&
-		 ctx->Polygon.CullFaceMode == GL_FRONT))
-	       line_aa = AA_ALWAYS;
-	 }
-      }
+   /* PIPE_NEW_RAST
+    * BRW_NEW_REDUCED_PRIMITIVE 
+    */
+   switch (brw->reduced_primitive) {
+   case PIPE_PRIM_POINTS:
+      line_aa = AA_NEVER;
+      break;
+   case PIPE_PRIM_LINES:
+      line_aa = AA_ALWAYS;
+      break;
+   default:
+      line_aa = brw->curr.rast->unfilled_aa_line;
+      break;
    }
 	 
    brw_wm_lookup_iz(line_aa,
 		    lookup,
-		    uses_depth,
+		    brw->curr.fragment_shader->uses_depth,
 		    key);
 
    /* Revisit this, figure out if it's really useful, and either push
@@ -276,54 +246,39 @@ static void brw_wm_populate_key( struct brw_context *brw,
    key->proj_attrib_mask = ~0; /*brw->wm.input_size_masks[4-1];*/
 
    /* PIPE_NEW_RAST */
-   key->flat_shade = brw->rast.flat_shade;
+   key->flat_shade = brw->curr.rast->templ.flatshade;
 
    /* This can be determined by looking at the INTERP mode each input decl.
     */
-   key->linear_color = 0;
-
-   /* _NEW_TEXTURE */
-   for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
-      if (i < brw->nr_textures) {
-	 const struct gl_texture_unit *unit = &ctx->Texture.Unit[i];
-	 const struct gl_texture_object *t = unit->_Current;
-	 const struct gl_texture_image *img = t->Image[0][t->BaseLevel];
-	 
-	 if (img->InternalFormat == GL_YCBCR_MESA) {
-	    key->yuvtex_mask |= 1 << i;
-	    if (img->TexFormat->MesaFormat == MESA_FORMAT_YCBCR)
-	       key->yuvtex_swap_mask |= 1 << i;
-	 }
+   key->linear_attrib_mask = 0;
 
-	 key->tex_swizzles[i] = t->_Swizzle;
+   /* PIPE_NEW_BOUND_TEXTURES */
+   for (i = 0; i < brw->curr.num_textures; i++) {
+      const struct brw_texture *tex = brw->curr.texture[i];
 	 
-	 if (0)
-	    key->shadowtex_mask |= 1<<i;
-      }
-      else {
-         key->tex_swizzles[i] = SWIZZLE_NOOP;
-      }
-   }
+      if (tex->base.format == PIPE_FORMAT_YCBCR)
+	 key->yuvtex_mask |= 1 << i;
 
+      if (tex->base.format == PIPE_FORMAT_YCBCR_REV)
+	 key->yuvtex_swap_mask |= 1 << i;
 
-   /* _NEW_FRAMEBUFFER */
-   if (brw->intel.driDrawable != NULL) {
-      key->drawable_height = brw->fb.cbufs[0].height;
+      /* XXX: shadow texture
+       */
+      /* key->shadowtex_mask |= 1<<i; */
    }
 
    /* CACHE_NEW_VS_PROG */
-   key->vp_nr_outputs_written = brw->vs.prog_data->nr_outputs_written;
+   key->vp_nr_outputs = brw->vs.prog_data->nr_outputs;
 
    /* The unique fragment program ID */
-   key->program_string_id = fp->id;
+   key->program_string_id = brw->curr.fragment_shader->id;
 }
 
 
-static void brw_prepare_wm_prog(struct brw_context *brw)
+static int brw_prepare_wm_prog(struct brw_context *brw)
 {
    struct brw_wm_prog_key key;
-   struct brw_fragment_program *fp = (struct brw_fragment_program *)
-      brw->fragment_program;
+   struct brw_fragment_shader *fs = brw->curr.fragment_shader;
      
    brw_wm_populate_key(brw, &key);
 
@@ -335,23 +290,19 @@ static void brw_prepare_wm_prog(struct brw_context *brw)
 				      NULL, 0,
 				      &brw->wm.prog_data);
    if (brw->wm.prog_bo == NULL)
-      do_wm_prog(brw, fp, &key);
+      return do_wm_prog(brw, fs, &key);
+
+   return 0;
 }
 
 
 const struct brw_tracked_state brw_wm_prog = {
    .dirty = {
-      .mesa  = (_NEW_COLOR |
-		_NEW_DEPTH |
-                _NEW_HINT |
-		_NEW_STENCIL |
-		_NEW_POLYGON |
-		_NEW_LINE |
-		_NEW_LIGHT |
-		_NEW_BUFFERS |
-		_NEW_TEXTURE),
-      .brw   = (BRW_NEW_FRAGMENT_PROGRAM |
-		BRW_NEW_WM_INPUT_DIMENSIONS |
+      .mesa  = (PIPE_NEW_FRAGMENT_SHADER |
+		PIPE_NEW_DEPTH_STENCIL_ALPHA |
+		PIPE_NEW_RAST |
+		PIPE_NEW_BOUND_TEXTURES),
+      .brw   = (BRW_NEW_WM_INPUT_DIMENSIONS |
 		BRW_NEW_REDUCED_PRIMITIVE),
       .cache = CACHE_NEW_VS_PROG,
    },
diff --git a/src/gallium/drivers/i965/brw_wm.h b/src/gallium/drivers/i965/brw_wm.h
index 084430cf28..2cd5bb7081 100644
--- a/src/gallium/drivers/i965/brw_wm.h
+++ b/src/gallium/drivers/i965/brw_wm.h
@@ -33,9 +33,6 @@
 #ifndef BRW_WM_H
 #define BRW_WM_H
 
-#include "tgsi/tgsi_ureg.h"
-#include "tgsi/tgsi_ureg_parse.h"
-
 #include "brw_context.h"
 #include "brw_eu.h"
 
@@ -59,8 +56,8 @@
 #define AA_ALWAYS    2
 
 struct brw_wm_prog_key {
-   unsigned proj_attrib_mask; /**< one bit per fragment program attribute */
-   unsigned linear_attrib_mask:1;  /**< linear interpolation vs perspective interp */
+   unsigned proj_attrib_mask;    /**< one bit per fragment program attribute */
+   unsigned linear_attrib_mask;  /**< linear interpolation vs perspective interp */
 
    GLuint source_depth_reg:3;
    GLuint aa_dest_stencil_reg:3;
@@ -75,11 +72,10 @@ struct brw_wm_prog_key {
    GLuint yuvtex_mask:16;
    GLuint yuvtex_swap_mask:16;	/* UV swaped */
 
-   GLuint tex_swizzles[BRW_MAX_TEX_UNIT];
-
-   GLuint program_string_id:32;
+   GLuint vp_nr_outputs:6;
+   GLuint nr_cbufs:3;
 
-   GLuint vp_nr_outputs_written;
+   GLuint program_string_id;
 };
 
 
@@ -146,9 +142,8 @@ struct brw_wm_instruction {
    GLuint opcode:8;
    GLuint saturate:1;
    GLuint writemask:4;
-   GLuint tex_unit:4;   /* texture unit for TEX, TXD, TXP instructions */
-   GLuint tex_idx:3;    /* TEXTURE_1D,2D,3D,CUBE,RECT_INDEX source target */
-   GLuint tex_shadow:1; /* do shadow comparison? */
+   GLuint tex_unit:4;   /* texture/sampler unit for texture instructions */
+   GLuint tex_target:4; /* TGSI_TEXTURE_x for texture instructions*/
    GLuint eot:1;    	/* End of thread indicator for FB_WRITE*/
    GLuint target:10;    /* target binding table index for FB_WRITE*/
 };
@@ -180,15 +175,17 @@ struct brw_wm_instruction {
 #define WM_FRONTFACING    (MAX_OPCODE + 8)
 #define MAX_WM_OPCODE     (MAX_OPCODE + 9)
 
-#define PROGRAM_PAYLOAD   (TGSI_FILE_COUNT)
-#define PAYLOAD_DEPTH     (FRAG_ATTRIB_MAX)
+#define BRW_FILE_PAYLOAD   (TGSI_FILE_COUNT)
+#define PAYLOAD_DEPTH      (FRAG_ATTRIB_MAX) /* ?? */
+
+struct brw_passfp_program;
 
 struct brw_wm_compile {
    struct brw_compile func;
    struct brw_wm_prog_key key;
    struct brw_wm_prog_data prog_data;
 
-   struct brw_fragment_program *fp;
+   struct brw_fragment_shader *fp;
 
    GLfloat (*env_param)[4];
 
@@ -201,15 +198,7 @@ struct brw_wm_compile {
     * simplifying and adding instructions for interpolation and
     * framebuffer writes.
     */
-   struct ureg_instruction prog_instructions[BRW_WM_MAX_INSN];
-   GLuint nr_fp_insns;
-   GLuint fp_temp;
-   GLuint fp_interp_emitted;
-   GLuint fp_fragcolor_emitted;
-
-   struct ureg_src pixel_xy;
-   struct ureg_src delta_xy;
-   struct ureg_src pixel_w;
+   struct brw_passfp_program *pass_fp;
 
 
    struct brw_wm_value vreg[BRW_WM_MAX_VREG];
@@ -298,8 +287,8 @@ void brw_wm_lookup_iz( GLuint line_aa,
 		       GLboolean ps_uses_depth,
 		       struct brw_wm_prog_key *key );
 
-//GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp);
-void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c);
+GLboolean brw_wm_has_flow_control(const struct brw_fragment_shader *fp);
+void brw_wm_branching_shader_emit(struct brw_context *brw, struct brw_wm_compile *c);
 
 void emit_ddxy(struct brw_compile *p,
 	       const struct brw_reg *dst,
diff --git a/src/gallium/drivers/i965/brw_wm_debug.c b/src/gallium/drivers/i965/brw_wm_debug.c
index 04dec5ba39..65d7626eea 100644
--- a/src/gallium/drivers/i965/brw_wm_debug.c
+++ b/src/gallium/drivers/i965/brw_wm_debug.c
@@ -28,7 +28,8 @@
   * Authors:
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
-               
+
+#include "tgsi/tgsi_info.h"
 
 #include "brw_context.h"
 #include "brw_wm.h"
@@ -49,10 +50,10 @@ void brw_wm_print_value( struct brw_wm_compile *c,
 	    value - c->creg < BRW_WM_MAX_PARAM)
       debug_printf("c%d", value - c->creg);
    else if (value - c->payload.input_interp >= 0 &&
-	    value - c->payload.input_interp < FRAG_ATTRIB_MAX)
+	    value - c->payload.input_interp < PIPE_MAX_SHADER_INPUTS)
       debug_printf("i%d", value - c->payload.input_interp);
    else if (value - c->payload.depth >= 0 &&
-	    value - c->payload.depth < FRAG_ATTRIB_MAX)
+	    value - c->payload.depth < PIPE_MAX_SHADER_INPUTS)
       debug_printf("d%d", value - c->payload.depth);
    else 
       debug_printf("?");
@@ -100,10 +101,10 @@ void brw_wm_print_insn( struct brw_wm_compile *c,
 
    if (inst->writemask != BRW_WRITEMASK_XYZW)
       debug_printf(".%s%s%s%s", 
-		   GET_BIT(inst->writemask, 0) ? "x" : "",
-		   GET_BIT(inst->writemask, 1) ? "y" : "",
-		   GET_BIT(inst->writemask, 2) ? "z" : "",
-		   GET_BIT(inst->writemask, 3) ? "w" : "");
+		   (inst->writemask & BRW_WRITEMASK_X) ? "x" : "",
+		   (inst->writemask & BRW_WRITEMASK_Y) ? "y" : "",
+		   (inst->writemask & BRW_WRITEMASK_Z) ? "z" : "",
+		   (inst->writemask & BRW_WRITEMASK_W) ? "w" : "");
 
    switch (inst->opcode) {
    case WM_PIXELXY:
@@ -134,7 +135,7 @@ void brw_wm_print_insn( struct brw_wm_compile *c,
       debug_printf(" = FRONTFACING");
       break;
    default:
-      debug_printf(" = %s", _mesa_opcode_string(inst->opcode));
+      debug_printf(" = %s", tgsi_get_opcode_info(inst->opcode)->mnemonic);
       break;
    }
 
diff --git a/src/gallium/drivers/i965/brw_wm_emit.c b/src/gallium/drivers/i965/brw_wm_emit.c
index 5f7ae6592c..a705d8b344 100644
--- a/src/gallium/drivers/i965/brw_wm_emit.c
+++ b/src/gallium/drivers/i965/brw_wm_emit.c
@@ -28,10 +28,13 @@
   * Authors:
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
-               
+
+#include "util/u_math.h"
+#include "tgsi/tgsi_info.h"
 
 #include "brw_context.h"
 #include "brw_wm.h"
+#include "brw_debug.h"
 
 /* Not quite sure how correct this is - need to understand horiz
  * vs. vertical strides a little better.
@@ -45,15 +48,15 @@ static INLINE struct brw_reg sechalf( struct brw_reg reg )
 
 /* Payload R0:
  *
- * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
+ * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 quads,
  *         corresponding to each of the 16 execution channels.
  * R0.1..8 -- ?
  * R1.0 -- triangle vertex 0.X
  * R1.1 -- triangle vertex 0.Y
- * R1.2 -- tile 0 x,y coords (2 packed uwords)
- * R1.3 -- tile 1 x,y coords (2 packed uwords)
- * R1.4 -- tile 2 x,y coords (2 packed uwords)
- * R1.5 -- tile 3 x,y coords (2 packed uwords)
+ * R1.2 -- quad 0 x,y coords (2 packed uwords)
+ * R1.3 -- quad 1 x,y coords (2 packed uwords)
+ * R1.4 -- quad 2 x,y coords (2 packed uwords)
+ * R1.5 -- quad 3 x,y coords (2 packed uwords)
  * R1.6 -- ?
  * R1.7 -- ?
  * R1.8 -- ?
@@ -134,11 +137,17 @@ static void emit_wpos_xy(struct brw_wm_compile *c,
    /* XXX: is this needed any more, or is this a NOOP?
     */
    if (mask & BRW_WRITEMASK_Y) {
+#if 0
       /* Y' = height - 1 - Y */
       brw_ADD(p,
 	      dst[1],
 	      negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
 	      brw_imm_d(c->key.drawable_height - 1));
+#else /* Y flip disabled: pass the Y channel (index 1) through unchanged */
+      brw_MOV(p,
+	      dst[1],
+	      retype(arg0[1], BRW_REGISTER_TYPE_W));
+#endif
    }
 }
 
@@ -279,28 +288,28 @@ static void emit_frontfacing( struct brw_compile *p,
 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
  * looking like:
  *
- * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
+ * arg0: q0.tl q0.tr q0.bl q0.br q1.tl q1.tr q1.bl q1.br
  *
  * and we're trying to produce:
  *
  *           DDX                     DDY
- * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
- *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
- *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
- *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
- *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
- *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
- *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
- *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
+ * dst: (q0.tr - q0.tl)     (q0.tl - q0.bl)
+ *      (q0.tr - q0.tl)     (q0.tr - q0.br)
+ *      (q0.br - q0.bl)     (q0.tl - q0.bl)
+ *      (q0.br - q0.bl)     (q0.tr - q0.br)
+ *      (q1.tr - q1.tl)     (q1.tl - q1.bl)
+ *      (q1.tr - q1.tl)     (q1.tr - q1.br)
+ *      (q1.br - q1.bl)     (q1.tl - q1.bl)
+ *      (q1.br - q1.bl)     (q1.tr - q1.br)
  *
- * and add another set of two more subspans if in 16-pixel dispatch mode.
+ * and add two more quads if in 16-pixel dispatch mode.
  *
  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
  * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
  * between each other.  We could probably do it like ddx and swizzle the right
  * order later, but bail for now and just produce
- * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
+ * ((q0.tl - q0.bl)x4 (q1.tl - q1.bl)x4)
  */
 void emit_ddxy(struct brw_compile *p,
 	       const struct brw_reg *dst,
@@ -611,12 +620,12 @@ static void emit_dp3( struct brw_compile *p,
 		      const struct brw_reg *arg0,
 		      const struct brw_reg *arg1 )
 {
-   int dst_chan = _mesa_ffs(mask & BRW_WRITEMASK_XYZW) - 1;
+   int dst_chan = ffs(mask & BRW_WRITEMASK_XYZW) - 1;
 
    if (!(mask & BRW_WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert(is_power_of_two(mask & BRW_WRITEMASK_XYZW));
+   assert(util_is_power_of_two(mask & BRW_WRITEMASK_XYZW));
 
    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
@@ -633,12 +642,12 @@ static void emit_dp4( struct brw_compile *p,
 		      const struct brw_reg *arg0,
 		      const struct brw_reg *arg1 )
 {
-   int dst_chan = _mesa_ffs(mask & BRW_WRITEMASK_XYZW) - 1;
+   int dst_chan = ffs(mask & BRW_WRITEMASK_XYZW) - 1;
 
    if (!(mask & BRW_WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert(is_power_of_two(mask & BRW_WRITEMASK_XYZW));
+   assert(util_is_power_of_two(mask & BRW_WRITEMASK_XYZW));
 
    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
@@ -656,12 +665,12 @@ static void emit_dph( struct brw_compile *p,
 		      const struct brw_reg *arg0,
 		      const struct brw_reg *arg1 )
 {
-   const int dst_chan = _mesa_ffs(mask & BRW_WRITEMASK_XYZW) - 1;
+   const int dst_chan = ffs(mask & BRW_WRITEMASK_XYZW) - 1;
 
    if (!(mask & BRW_WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert(is_power_of_two(mask & BRW_WRITEMASK_XYZW));
+   assert(util_is_power_of_two(mask & BRW_WRITEMASK_XYZW));
 
    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
@@ -704,12 +713,12 @@ static void emit_math1( struct brw_compile *p,
 			GLuint mask,
 			const struct brw_reg *arg0 )
 {
-   int dst_chan = _mesa_ffs(mask & BRW_WRITEMASK_XYZW) - 1;
+   int dst_chan = ffs(mask & BRW_WRITEMASK_XYZW) - 1;
 
    if (!(mask & BRW_WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert(is_power_of_two(mask & BRW_WRITEMASK_XYZW));
+   assert(util_is_power_of_two(mask & BRW_WRITEMASK_XYZW));
 
    brw_MOV(p, brw_message_reg(2), arg0[0]);
 
@@ -732,12 +741,12 @@ static void emit_math2( struct brw_compile *p,
 			const struct brw_reg *arg0,
 			const struct brw_reg *arg1)
 {
-   int dst_chan = _mesa_ffs(mask & BRW_WRITEMASK_XYZW) - 1;
+   int dst_chan = ffs(mask & BRW_WRITEMASK_XYZW) - 1;
 
    if (!(mask & BRW_WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert(is_power_of_two(mask & BRW_WRITEMASK_XYZW));
+   assert(util_is_power_of_two(mask & BRW_WRITEMASK_XYZW));
 
    brw_push_insn_state(p);
 
@@ -790,21 +799,32 @@ static void emit_tex( struct brw_wm_compile *c,
    GLuint i, nr;
    GLuint emit;
    GLuint msg_type;
+   GLboolean shadow = FALSE;
 
    /* How many input regs are there?
     */
-   switch (inst->tex_idx) {
-   case TEXTURE_1D_INDEX:
+   switch (inst->tex_target) {
+   case TGSI_TEXTURE_1D:
       emit = BRW_WRITEMASK_X;
       nr = 1;
       break;
-   case TEXTURE_2D_INDEX:
-   case TEXTURE_RECT_INDEX:
+   case TGSI_TEXTURE_SHADOW1D:
+      emit = BRW_WRITEMASK_XW;
+      nr = 4;
+      shadow = TRUE;
+      break;
+   case TGSI_TEXTURE_2D:
       emit = BRW_WRITEMASK_XY;
       nr = 2;
       break;
-   case TEXTURE_3D_INDEX:
-   case TEXTURE_CUBE_INDEX:
+   case TGSI_TEXTURE_SHADOW2D:
+   case TGSI_TEXTURE_SHADOWRECT:
+      emit = BRW_WRITEMASK_XYW;
+      nr = 4;
+      shadow = TRUE;
+      break;
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
       emit = BRW_WRITEMASK_XYZ;
       nr = 3;
       break;
@@ -813,11 +833,6 @@ static void emit_tex( struct brw_wm_compile *c,
       abort();
    }
 
-   if (inst->tex_shadow) {
-      nr = 4;
-      emit |= BRW_WRITEMASK_W;
-   }
-
    msgLength = 1;
 
    for (i = 0; i < nr; i++) {
@@ -832,12 +847,12 @@ static void emit_tex( struct brw_wm_compile *c,
    responseLength = 8;		/* always */
 
    if (BRW_IS_IGDNG(p->brw)) {
-       if (inst->tex_shadow)
+       if (shadow)
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE_IGDNG;
        else
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_IGDNG;
    } else {
-       if (inst->tex_shadow)
+       if (shadow)
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
        else
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
@@ -870,20 +885,23 @@ static void emit_txb( struct brw_wm_compile *c,
    GLuint msg_type;
    /* Shadow ignored for txb.
     */
-   switch (inst->tex_idx) {
-   case TEXTURE_1D_INDEX:
+   switch (inst->tex_target) {
+   case TGSI_TEXTURE_1D:
+   case TGSI_TEXTURE_SHADOW1D:
       brw_MOV(p, brw_message_reg(2), arg[0]);
       brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
       brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
       break;
-   case TEXTURE_2D_INDEX:
-   case TEXTURE_RECT_INDEX:
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+   case TGSI_TEXTURE_SHADOW2D:
+   case TGSI_TEXTURE_SHADOWRECT:
       brw_MOV(p, brw_message_reg(2), arg[0]);
       brw_MOV(p, brw_message_reg(4), arg[1]);
       brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
       break;
-   case TEXTURE_3D_INDEX:
-   case TEXTURE_CUBE_INDEX:
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
       brw_MOV(p, brw_message_reg(2), arg[0]);
       brw_MOV(p, brw_message_reg(4), arg[1]);
       brw_MOV(p, brw_message_reg(6), arg[2]);
@@ -976,10 +994,10 @@ static void emit_kil( struct brw_wm_compile *c,
    }
 }
 
-/* KIL_NV kills the pixels that are currently executing, not based on a test
+/* KILLP kills the pixels that are currently executing, not based on a test
  * of the arguments.
  */
-static void emit_kil_nv( struct brw_wm_compile *c )
+static void emit_killp( struct brw_wm_compile *c )
 {
    struct brw_compile *p = &c->func;
    struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
@@ -1259,7 +1277,7 @@ void brw_wm_emit( struct brw_wm_compile *c )
     */
    spill_values(c, c->payload.depth, 4);
    spill_values(c, c->creg, c->nr_creg);
-   spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
+   spill_values(c, c->payload.input_interp, PIPE_MAX_SHADER_INPUTS);
    
 
    for (insn = 0; insn < c->nr_insns; insn++) {
@@ -1328,89 +1346,89 @@ void brw_wm_emit( struct brw_wm_compile *c )
 
 	 /* Straightforward arithmetic:
 	  */
-      case OPCODE_ADD:
+      case TGSI_OPCODE_ADD:
 	 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
 	 break;
 
-      case OPCODE_FRC:
+      case TGSI_OPCODE_FRC:
 	 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
 	 break;
 
-      case OPCODE_FLR:
+      case TGSI_OPCODE_FLR:
 	 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
 	 break;
 
-      case OPCODE_DDX:
+      case TGSI_OPCODE_DDX:
 	 emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
 	 break;
 
-      case OPCODE_DDY:
+      case TGSI_OPCODE_DDY:
 	 emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
 	 break;
 
-      case OPCODE_DP3:
+      case TGSI_OPCODE_DP3:
 	 emit_dp3(p, dst, dst_flags, args[0], args[1]);
 	 break;
 
-      case OPCODE_DP4:
+      case TGSI_OPCODE_DP4:
 	 emit_dp4(p, dst, dst_flags, args[0], args[1]);
 	 break;
 
-      case OPCODE_DPH:
+      case TGSI_OPCODE_DPH:
 	 emit_dph(p, dst, dst_flags, args[0], args[1]);
 	 break;
 
-      case OPCODE_TRUNC:
+      case TGSI_OPCODE_TRUNC:
 	 emit_trunc(p, dst, dst_flags, args[0]);
 	 break;
 
-      case OPCODE_LRP:
+      case TGSI_OPCODE_LRP:
 	 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
 	 break;
 
-      case OPCODE_MAD:	
+      case TGSI_OPCODE_MAD:	
 	 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
 	 break;
 
-      case OPCODE_MOV:
+      case TGSI_OPCODE_MOV:
 	 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
 	 break;
 
-      case OPCODE_MUL:
+      case TGSI_OPCODE_MUL:
 	 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
 	 break;
 
-      case OPCODE_XPD:
+      case TGSI_OPCODE_XPD:
 	 emit_xpd(p, dst, dst_flags, args[0], args[1]);
 	 break;
 
 	 /* Higher math functions:
 	  */
-      case OPCODE_RCP:
+      case TGSI_OPCODE_RCP:
 	 emit_math1(p, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
 	 break;
 
-      case OPCODE_RSQ:
+      case TGSI_OPCODE_RSQ:
 	 emit_math1(p, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
 	 break;
 
-      case OPCODE_SIN:
+      case TGSI_OPCODE_SIN:
 	 emit_math1(p, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
 	 break;
 
-      case OPCODE_COS:
+      case TGSI_OPCODE_COS:
 	 emit_math1(p, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
 	 break;
 
-      case OPCODE_EX2:
+      case TGSI_OPCODE_EX2:
 	 emit_math1(p, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
 	 break;
 
-      case OPCODE_LG2:
+      case TGSI_OPCODE_LG2:
 	 emit_math1(p, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
 	 break;
 
-      case OPCODE_SCS:
+      case TGSI_OPCODE_SCS:
 	 /* There is an scs math function, but it would need some
 	  * fixup for 16-element execution.
 	  */
@@ -1420,71 +1438,70 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	    emit_math1(p, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|BRW_WRITEMASK_X, args[0]);
 	 break;
 
-      case OPCODE_POW:
+      case TGSI_OPCODE_POW:
 	 emit_math2(p, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
 	 break;
 
 	 /* Comparisons:
 	  */
-      case OPCODE_CMP:
+      case TGSI_OPCODE_CMP:
 	 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
 	 break;
 
-      case OPCODE_MAX:
+      case TGSI_OPCODE_MAX:
 	 emit_max(p, dst, dst_flags, args[0], args[1]);
 	 break;
 
-      case OPCODE_MIN:
+      case TGSI_OPCODE_MIN:
 	 emit_min(p, dst, dst_flags, args[0], args[1]);
 	 break;
 
-      case OPCODE_SLT:
+      case TGSI_OPCODE_SLT:
 	 emit_slt(p, dst, dst_flags, args[0], args[1]);
 	 break;
 
-      case OPCODE_SLE:
+      case TGSI_OPCODE_SLE:
 	 emit_sle(p, dst, dst_flags, args[0], args[1]);
 	break;
-      case OPCODE_SGT:
+      case TGSI_OPCODE_SGT:
 	 emit_sgt(p, dst, dst_flags, args[0], args[1]);
 	break;
-      case OPCODE_SGE:
+      case TGSI_OPCODE_SGE:
 	 emit_sge(p, dst, dst_flags, args[0], args[1]);
 	 break;
-      case OPCODE_SEQ:
+      case TGSI_OPCODE_SEQ:
 	 emit_seq(p, dst, dst_flags, args[0], args[1]);
 	break;
-      case OPCODE_SNE:
+      case TGSI_OPCODE_SNE:
 	 emit_sne(p, dst, dst_flags, args[0], args[1]);
 	break;
 
-      case OPCODE_LIT:
+      case TGSI_OPCODE_LIT:
 	 emit_lit(p, dst, dst_flags, args[0]);
 	 break;
 
 	 /* Texturing operations:
 	  */
-      case OPCODE_TEX:
+      case TGSI_OPCODE_TEX:
 	 emit_tex(c, inst, dst, dst_flags, args[0]);
 	 break;
 
-      case OPCODE_TXB:
+      case TGSI_OPCODE_TXB:
 	 emit_txb(c, inst, dst, dst_flags, args[0]);
 	 break;
 
-      case OPCODE_KIL:
+      case TGSI_OPCODE_KIL:
 	 emit_kil(c, args[0]);
 	 break;
 
-      case OPCODE_KIL_NV:
-	 emit_kil_nv(c);
+      case TGSI_OPCODE_KILP:
+	 emit_killp(c);
 	 break;
 
       default:
 	 debug_printf("Unsupported opcode %i (%s) in fragment shader\n",
-		      inst->opcode, inst->opcode < MAX_OPCODE ?
-				    _mesa_opcode_string(inst->opcode) :
-				    "unknown");
+		      inst->opcode, 
+		      tgsi_get_opcode_info(inst->opcode)->mnemonic);
       }
       
       for (i = 0; i < 4; i++)
diff --git a/src/gallium/drivers/i965/brw_wm_fp.c b/src/gallium/drivers/i965/brw_wm_fp.c
index d594730730..8ba037cdae 100644
--- a/src/gallium/drivers/i965/brw_wm_fp.c
+++ b/src/gallium/drivers/i965/brw_wm_fp.c
@@ -30,9 +30,8 @@
   */
                
 
-#include "pipe/p_shader_constants.h"
+#include "pipe/p_shader_tokens.h"
 
-#include "brw_context.h"
 #include "brw_wm.h"
 #include "brw_util.h"
 
@@ -43,7 +42,7 @@
 #define W    3
 
 
-static const char *wm_opcode_strings[] = {   
+static const char *wm_opcode_strings[] = {
    "PIXELXY",
    "DELTAXY",
    "PIXELW",
@@ -57,143 +56,6 @@ static const char *wm_opcode_strings[] = {
 
 
-/***********************************************************************
- * Source regs
- */
-
-static struct prog_src_register src_reg(GLuint file, GLuint idx)
-{
-   struct prog_src_register reg;
-   reg.File = file;
-   reg.Index = idx;
-   reg.Swizzle = SWIZZLE_NOOP;
-   reg.RelAddr = 0;
-   reg.Negate = NEGATE_NONE;
-   reg.Abs = 0;
-   return reg;
-}
-
-static struct prog_src_register src_reg_from_dst(struct prog_dst_register dst)
-{
-   return src_reg(dst.File, dst.Index);
-}
-
-static struct prog_src_register src_undef( void )
-{
-   return src_reg(PROGRAM_UNDEFINED, 0);
-}
-
-static GLboolean src_is_undef(struct prog_src_register src)
-{
-   return src.File == PROGRAM_UNDEFINED;
-}
-
-static struct prog_src_register src_swizzle( struct prog_src_register reg, int x, int y, int z, int w )
-{
-   reg.Swizzle = MAKE_SWIZZLE4(x,y,z,w);
-   return reg;
-}
-
-static struct prog_src_register src_swizzle1( struct prog_src_register reg, int x )
-{
-   return src_swizzle(reg, x, x, x, x);
-}
-
-static struct prog_src_register src_swizzle4( struct prog_src_register reg, uint swizzle )
-{
-   reg.Swizzle = swizzle;
-   return reg;
-}
-
-
-/***********************************************************************
- * Dest regs
- */
-
-static struct prog_dst_register dst_reg(GLuint file, GLuint idx)
-{
-   struct prog_dst_register reg;
-   reg.File = file;
-   reg.Index = idx;
-   reg.WriteMask = BRW_WRITEMASK_XYZW;
-   reg.RelAddr = 0;
-   reg.CondMask = COND_TR;
-   reg.CondSwizzle = 0;
-   reg.CondSrc = 0;
-   reg.pad = 0;
-   return reg;
-}
-
-static struct prog_dst_register dst_mask( struct prog_dst_register reg, int mask )
-{
-   reg.WriteMask &= mask;
-   return reg;
-}
-
-static struct prog_dst_register dst_undef( void )
-{
-   return dst_reg(PROGRAM_UNDEFINED, 0);
-}
-
-
-
-static struct prog_dst_register get_temp( struct brw_wm_compile *c )
-{
-   int bit = _mesa_ffs( ~c->fp_temp );
-
-   if (!bit) {
-      debug_printf("%s: out of temporaries\n", __FILE__);
-      exit(1);
-   }
-
-   c->fp_temp |= 1<<(bit-1);
-   return dst_reg(PROGRAM_TEMPORARY, c->first_internal_temp+(bit-1));
-}
-
-
-static void release_temp( struct brw_wm_compile *c, struct prog_dst_register temp )
-{
-   c->fp_temp &= ~(1 << (temp.Index - c->first_internal_temp));
-}
-
-
-/***********************************************************************
- * Instructions 
- */
-
-static struct prog_instruction *get_fp_inst(struct brw_wm_compile *c)
-{
-   return &c->prog_instructions[c->nr_fp_insns++];
-}
-
-static struct prog_instruction *emit_insn(struct brw_wm_compile *c,
-					const struct prog_instruction *inst0)
-{
-   struct prog_instruction *inst = get_fp_inst(c);
-   *inst = *inst0;
-   return inst;
-}
-
-static struct prog_instruction * emit_op(struct brw_wm_compile *c,
-					 GLuint op,
-					 struct prog_dst_register dest,
-					 GLuint saturate,
-					 struct prog_src_register src0,
-					 struct prog_src_register src1,
-					 struct prog_src_register src2 )
-{
-   struct prog_instruction *inst = get_fp_inst(c);
-      
-   memset(inst, 0, sizeof(*inst));
-
-   inst->Opcode = op;
-   inst->DstReg = dest;
-   inst->SaturateMode = saturate;   
-   inst->SrcReg[0] = src0;
-   inst->SrcReg[1] = src1;
-   inst->SrcReg[2] = src2;
-   return inst;
-}
 
 
 /* Many opcodes produce the same value across all the result channels.
@@ -202,32 +64,28 @@ static struct prog_instruction * emit_op(struct brw_wm_compile *c,
  * anyway.  We can easily get both by emitting the opcode to one channel, and
  * then MOVing it to the others, which brw_wm_pass*.c already understands.
  */
-static struct prog_instruction *emit_scalar_insn(struct brw_wm_compile *c,
-						 const struct prog_instruction *inst0)
-{
-   struct prog_instruction *inst;
-   unsigned int dst_chan;
-   unsigned int other_channel_mask;
-
-   if (inst0->DstReg.WriteMask == 0)
-      return NULL;
-
-   dst_chan = _mesa_ffs(inst0->DstReg.WriteMask) - 1;
-   inst = get_fp_inst(c);
-   *inst = *inst0;
-   inst->DstReg.WriteMask = 1 << dst_chan;
-
-   other_channel_mask = inst0->DstReg.WriteMask & ~(1 << dst_chan);
-   if (other_channel_mask != 0) {
-      inst = emit_op(c,
-		     TGSI_OPCODE_MOV,
-		     dst_mask(inst0->DstReg, other_channel_mask),
-		     0,
-		     src_swizzle1(src_reg_from_dst(inst0->DstReg), dst_chan),
-		     src_undef(),
-		     src_undef());
+static void emit_scalar_insn(struct brw_wm_compile *c,
+			     unsigned opcode,
+			     struct brw_dst dst,
+			     struct brw_src src0,
+			     struct brw_src src1,
+			     struct brw_src src2 )
+{
+   unsigned first_chan, first_mask;
+
+   /* Bail before ffs(): ffs(0) - 1 would make the shift below undefined. */
+   if (dst.writemask == 0)
+      return;
+   first_chan = ffs(dst.writemask) - 1;
+   first_mask = 1u << first_chan;
+   emit_op( c, opcode,
+	    brw_writemask(dst, first_mask),
+	    src0, src1, src2 );
+
+   if (dst.writemask != first_mask) {
+      emit_op1(c, TGSI_OPCODE_MOV,
+	       brw_writemask(dst, ~first_mask),
+	       src_swizzle1(brw_src(dst), first_chan));
    }
-   return inst;
 }
 
 
@@ -235,11 +93,11 @@ static struct prog_instruction *emit_scalar_insn(struct brw_wm_compile *c,
  * Special instructions for interpolation and other tasks
  */
 
-static struct prog_src_register get_pixel_xy( struct brw_wm_compile *c )
+static struct ureg_src get_pixel_xy( struct brw_wm_compile *c )
 {
    if (src_is_undef(c->pixel_xy)) {
-      struct prog_dst_register pixel_xy = get_temp(c);
-      struct prog_src_register payload_r0_depth = src_reg(PROGRAM_PAYLOAD, PAYLOAD_DEPTH);
+      struct ureg_dst pixel_xy = get_temp(c);
+      struct ureg_src payload_r0_depth = src_reg(TGSI_FILE_PAYLOAD, PAYLOAD_DEPTH);
       
       
       /* Emit the out calculations, and hold onto the results.  Use
@@ -250,7 +108,6 @@ static struct prog_src_register get_pixel_xy( struct brw_wm_compile *c )
       emit_op(c,
 	      WM_PIXELXY,
 	      dst_mask(pixel_xy, BRW_WRITEMASK_XY),
-	      0,
 	      payload_r0_depth,
 	      src_undef(),
 	      src_undef());
@@ -261,19 +118,18 @@ static struct prog_src_register get_pixel_xy( struct brw_wm_compile *c )
    return c->pixel_xy;
 }
 
-static struct prog_src_register get_delta_xy( struct brw_wm_compile *c )
+static struct ureg_src get_delta_xy( struct brw_wm_compile *c )
 {
    if (src_is_undef(c->delta_xy)) {
-      struct prog_dst_register delta_xy = get_temp(c);
-      struct prog_src_register pixel_xy = get_pixel_xy(c);
-      struct prog_src_register payload_r0_depth = src_reg(PROGRAM_PAYLOAD, PAYLOAD_DEPTH);
+      struct ureg_dst delta_xy = get_temp(c);
+      struct ureg_src pixel_xy = get_pixel_xy(c);
+      struct ureg_src payload_r0_depth = src_reg(TGSI_FILE_PAYLOAD, PAYLOAD_DEPTH);
       
       /* deltas.xy = DELTAXY pixel_xy, payload[0]
        */
       emit_op(c,
 	      WM_DELTAXY,
 	      dst_mask(delta_xy, BRW_WRITEMASK_XY),
-	      0,
 	      pixel_xy, 
 	      payload_r0_depth,
 	      src_undef());
@@ -284,19 +140,18 @@ static struct prog_src_register get_delta_xy( struct brw_wm_compile *c )
    return c->delta_xy;
 }
 
-static struct prog_src_register get_pixel_w( struct brw_wm_compile *c )
+static struct ureg_src get_pixel_w( struct brw_wm_compile *c )
 {
    if (src_is_undef(c->pixel_w)) {
-      struct prog_dst_register pixel_w = get_temp(c);
-      struct prog_src_register deltas = get_delta_xy(c);
-      struct prog_src_register interp_wpos = src_reg(PROGRAM_PAYLOAD, FRAG_ATTRIB_WPOS);
+      struct ureg_dst pixel_w = get_temp(c);
+      struct ureg_src deltas = get_delta_xy(c);
+      struct ureg_src interp_wpos = src_reg(TGSI_FILE_PAYLOAD, FRAG_ATTRIB_WPOS);
 
       /* deltas.xyw = DELTAS2 deltas.xy, payload.interp_wpos.x
        */
       emit_op(c,
 	      WM_PIXELW,
 	      dst_mask(pixel_w, BRW_WRITEMASK_W),
-	      0,
 	      interp_wpos,
 	      deltas, 
 	      src_undef());
@@ -313,9 +168,9 @@ static void emit_interp( struct brw_wm_compile *c,
 			 GLuint semantic_index,
 			 GLuint interp_mode )
 {
-   struct prog_dst_register dst = dst_reg(PROGRAM_INPUT, idx);
-   struct prog_src_register interp = src_reg(PROGRAM_PAYLOAD, idx);
-   struct prog_src_register deltas = get_delta_xy(c);
+   struct ureg_dst dst = dst_reg(TGSI_FILE_INPUT, idx);
+   struct ureg_src interp = src_reg(TGSI_FILE_PAYLOAD, idx);
+   struct ureg_src deltas = get_delta_xy(c);
 
    /* Need to use PINTERP on attributes which have been
     * multiplied by 1/W in the SF program, and LINTERP on those
@@ -325,271 +180,197 @@ static void emit_interp( struct brw_wm_compile *c,
    case FRAG_ATTRIB_WPOS:
       /* Have to treat wpos.xy specially:
        */
-      emit_op(c,
+      emit_op1(c,
 	      WM_WPOSXY,
 	      dst_mask(dst, BRW_WRITEMASK_XY),
-	      0,
-	      get_pixel_xy(c),
-	      src_undef(),
-	      src_undef());
+	      get_pixel_xy(c));
       
-      dst = dst_mask(dst, BRW_WRITEMASK_ZW);
-
-      /* PROGRAM_INPUT.attr.xyzw = INTERP payload.interp[attr].x, deltas.xyw
+      /* TGSI_FILE_INPUT.attr.xyzw = INTERP payload.interp[attr].x, deltas.xyw
        */
-      emit_op(c,
-	      WM_LINTERP,
-	      dst,
-	      0,
-	      interp,
-	      deltas,
-	      src_undef());
+      emit_op2(c,
+	       WM_LINTERP,
+	       dst_mask(dst, BRW_WRITEMASK_ZW),
+	       interp,
+	       deltas);
       break;
 
    case TGSI_SEMANTIC_COLOR:
       if (c->key.flat_shade) {
-	 emit_op(c,
+	 emit_op1(c,
 		 WM_CINTERP,
 		 dst,
-		 0,
-		 interp,
-		 src_undef(),
-		 src_undef());
+		 interp);
+      }
+      else if (interp_mode == TGSI_INTERPOLATE_LINEAR) {
+	 emit_op2(c,
+		  WM_LINTERP,
+		  dst,
+		  interp,
+		  deltas);
       }
       else {
-	 emit_op(c,
-		 translate_interp_mode(interp_mode),
-		 dst,
-		 0,
-		 interp,
-		 deltas,
-		 src_undef());
+	 emit_op3(c,
+		  WM_PINTERP,
+		  dst,
+		  interp,
+		  deltas,
+		  get_pixel_w(c));
       }
+
       break;
    case FRAG_ATTRIB_FOGC:
       /* Interpolate the fog coordinate */
-      emit_op(c,
+      emit_op3(c,
 	      WM_PINTERP,
 	      dst_mask(dst, BRW_WRITEMASK_X),
-	      0,
 	      interp,
 	      deltas,
 	      get_pixel_w(c));
 
-      emit_op(c,
+      emit_op1(c,
 	      TGSI_OPCODE_MOV,
-	      dst_mask(dst, BRW_WRITEMASK_YZW),
-	      0,
-	      src_swizzle(interp,
-			  SWIZZLE_ZERO,
-			  SWIZZLE_ZERO,
-			  SWIZZLE_ZERO,
-			  SWIZZLE_ONE),
-	      src_undef(),
-	      src_undef());
+	      dst_mask(dst, BRW_WRITEMASK_YZ),
+	      brw_imm1f(0.0));
+
+      emit_op1(c,
+	      TGSI_OPCODE_MOV,
+	      dst_mask(dst, BRW_WRITEMASK_W),
+	      brw_imm1f(1.0));
       break;
 
    case FRAG_ATTRIB_FACE:
       /* XXX review/test this case */
-      emit_op(c,
-              WM_FRONTFACING,
-              dst_mask(dst, BRW_WRITEMASK_X),
-              0,
-              src_undef(),
-              src_undef(),
-              src_undef());
+      emit_op0(c,
+	       WM_FRONTFACING,
+	       dst_mask(dst, BRW_WRITEMASK_X));
+      
+      emit_op1(c,
+	      TGSI_OPCODE_MOV,
+	      dst_mask(dst, BRW_WRITEMASK_YZ),
+	      brw_imm1f(0.0));
+
+      emit_op1(c,
+	      TGSI_OPCODE_MOV,
+	      dst_mask(dst, BRW_WRITEMASK_W),
+	      brw_imm1f(1.0));
       break;
 
    case FRAG_ATTRIB_PNTC:
       /* XXX review/test this case */
-      emit_op(c,
-	      WM_PINTERP,
-	      dst_mask(dst, BRW_WRITEMASK_XY),
-	      0,
-	      interp,
-	      deltas,
-	      get_pixel_w(c));
-
-      emit_op(c,
+      emit_op3(c,
+	       WM_PINTERP,
+	       dst_mask(dst, BRW_WRITEMASK_XY),
+	       interp,
+	       deltas,
+	       get_pixel_w(c));
+
+      emit_op1(c,
 	      TGSI_OPCODE_MOV,
-	      dst_mask(dst, BRW_WRITEMASK_ZW),
-	      0,
-	      src_swizzle(interp,
-			  SWIZZLE_ZERO,
-			  SWIZZLE_ZERO,
-			  SWIZZLE_ZERO,
-			  SWIZZLE_ONE),
-	      src_undef(),
-	      src_undef());
-      break;
+	      dst_mask(dst, BRW_WRITEMASK_Z),
+	      brw_imm1f(c->pass_fp, 0.0f));
 
-   default:
-      emit_op(c,
-	      translate_interp_mode(interp_mode),
-	      dst,
-	      0,
-	      interp,
-	      deltas,
-	      get_pixel_w(c));
+      emit_op1(c,
+	      TGSI_OPCODE_MOV,
+	      dst_mask(dst, BRW_WRITEMASK_W),
+	      brw_imm1f(c->pass_fp, 1.0f));
       break;
-   }
-}
-
-/***********************************************************************
- * Hacks to extend the program parameter and constant lists.
- */
-
-/* Add the fog parameters to the parameter list of the original
- * program, rather than creating a new list.  Doesn't really do any
- * harm and it's not as if the parameter handling isn't a big hack
- * anyway.
- */
-static struct prog_src_register search_or_add_param5(struct brw_wm_compile *c, 
-                                                     GLint s0,
-                                                     GLint s1,
-                                                     GLint s2,
-                                                     GLint s3,
-                                                     GLint s4)
-{
-   struct gl_program_parameter_list *paramList = c->fp->program.Base.Parameters;
-   gl_state_index tokens[STATE_LENGTH];
-   GLuint idx;
-   tokens[0] = s0;
-   tokens[1] = s1;
-   tokens[2] = s2;
-   tokens[3] = s3;
-   tokens[4] = s4;
-   
-   for (idx = 0; idx < paramList->NumParameters; idx++) {
-      if (paramList->Parameters[idx].Type == PROGRAM_STATE_VAR &&
-	  memcmp(paramList->Parameters[idx].StateIndexes, tokens, sizeof(tokens)) == 0)
-	 return src_reg(PROGRAM_STATE_VAR, idx);
-   }
-
-   idx = _mesa_add_state_reference( paramList, tokens );
-
-   return src_reg(PROGRAM_STATE_VAR, idx);
-}
 
+   default: 
+      switch (interp_mode) {
+      case TGSI_INTERPOLATE_CONSTANT:
+	 emit_op1(c,
+		  WM_CINTERP,
+		  dst,
+		  interp);
+	 break;
 
-static struct prog_src_register search_or_add_const4f( struct brw_wm_compile *c, 
-						     GLfloat s0,
-						     GLfloat s1,
-						     GLfloat s2,
-						     GLfloat s3)
-{
-   struct gl_program_parameter_list *paramList = c->fp->program.Base.Parameters;
-   GLfloat values[4];
-   GLuint idx;
-   GLuint swizzle;
-
-   values[0] = s0;
-   values[1] = s1;
-   values[2] = s2;
-   values[3] = s3;
-
-   /* Have to search, otherwise multiple compilations will each grow
-    * the parameter list.
-    */
-   for (idx = 0; idx < paramList->NumParameters; idx++) {
-      if (paramList->Parameters[idx].Type == PROGRAM_CONSTANT &&
-	  memcmp(paramList->ParameterValues[idx], values, sizeof(values)) == 0)
+      case TGSI_INTERPOLATE_LINEAR:
+	 emit_op2(c,
+		  WM_LINTERP,
+		  dst,
+		  interp,
+		  deltas);
+	 break;
 
-	 /* XXX: this mimics the mesa bug which puts all constants and
-	  * parameters into the "PROGRAM_STATE_VAR" category:
-	  */
-	 return src_reg(PROGRAM_STATE_VAR, idx);
+      case TGSI_INTERPOLATE_PERSPECTIVE:
+	 emit_op3(c,
+		  WM_PINTERP,
+		  dst,
+		  interp,
+		  deltas,
+		  get_pixel_w(c));
+	 break;
+      }
+      break;
    }
-   
-   idx = _mesa_add_unnamed_constant( paramList, values, 4, &swizzle );
-   assert(swizzle == SWIZZLE_NOOP); /* Need to handle swizzle in reg setup */
-   return src_reg(PROGRAM_STATE_VAR, idx);
 }
 
 
-
 /***********************************************************************
  * Expand various instructions here to simpler forms.  
  */
 static void precalc_dst( struct brw_wm_compile *c,
-			       const struct prog_instruction *inst )
+			 struct brw_dst dst,
+			 struct brw_src src0,
+			 struct brw_src src1 )
 {
-   struct prog_src_register src0 = inst->SrcReg[0];
-   struct prog_src_register src1 = inst->SrcReg[1];
-   struct prog_dst_register dst = inst->DstReg;
-   
    if (dst.WriteMask & BRW_WRITEMASK_Y) {      
       /* dst.y = mul src0.y, src1.y
        */
-      emit_op(c,
-	      TGSI_OPCODE_MUL,
-	      dst_mask(dst, BRW_WRITEMASK_Y),
-	      inst->SaturateMode,
-	      src0,
-	      src1,
-	      src_undef());
+      emit_op2(c,
+	       TGSI_OPCODE_MUL,
+	       dst_mask(dst, BRW_WRITEMASK_Y),
+	       src0,
+	       src1);
    }
 
    if (dst.WriteMask & BRW_WRITEMASK_XZ) {
       struct prog_instruction *swz;
       GLuint z = GET_SWZ(src0.Swizzle, Z);
 
-      /* dst.xz = swz src0.1zzz
+      /* dst.z = mov src0.zzzz
+       */
+      emit_op1(c,
+	      TGSI_OPCODE_MOV,
+	      dst_mask(dst, BRW_WRITEMASK_Z),
+	      src_swizzle1(src0, Z));
+
+      /* dst.x = immf(1.0)
        */
-      swz = emit_op(c,
-		    TGSI_OPCODE_MOV,
-		    dst_mask(dst, BRW_WRITEMASK_XZ),
-		    inst->SaturateMode,
-		    src_swizzle(src0, SWIZZLE_ONE, z, z, z),
-		    src_undef(),
-		    src_undef());
-      /* Avoid letting negation flag of src0 affect our 1 constant. */
-      swz->SrcReg[0].Negate &= ~NEGATE_X;
+      emit_op1(c,
+	      TGSI_OPCODE_MOV,
+	      brw_saturate(dst_mask(dst, BRW_WRITEMASK_X), 0),
+	      src_immf(c, 1.0));
    }
    if (dst.WriteMask & BRW_WRITEMASK_W) {
       /* dst.w = mov src1.w
        */
-      emit_op(c,
-	      TGSI_OPCODE_MOV,
-	      dst_mask(dst, BRW_WRITEMASK_W),
-	      inst->SaturateMode,
-	      src1,
-	      src_undef(),
-	      src_undef());
+      emit_op1(c,
+	       TGSI_OPCODE_MOV,
+	       dst_mask(dst, BRW_WRITEMASK_W),
+	       src1);
    }
 }
 
 
 static void precalc_lit( struct brw_wm_compile *c,
-			 const struct prog_instruction *inst )
+			 struct ureg_dst dst,
+			 struct ureg_src src0 )
 {
-   struct prog_src_register src0 = inst->SrcReg[0];
-   struct prog_dst_register dst = inst->DstReg;
-   
    if (dst.WriteMask & BRW_WRITEMASK_XW) {
-      struct prog_instruction *swz;
-
-      /* dst.xw = swz src0.1111
+      /* dst.xw = imm(1.0f)
        */
-      swz = emit_op(c,
-		    TGSI_OPCODE_MOV,
-		    dst_mask(dst, BRW_WRITEMASK_XW),
-		    0,
-		    src_swizzle1(src0, SWIZZLE_ONE),
-		    src_undef(),
-		    src_undef());
-      /* Avoid letting the negation flag of src0 affect our 1 constant. */
-      swz->SrcReg[0].Negate = NEGATE_NONE;
+      emit_op1(c,
+	       TGSI_OPCODE_MOV,
+	       brw_saturate(brw_writemask(dst, BRW_WRITEMASK_XW), 0),
+	       brw_imm1f(1.0f));
    }
 
    if (dst.WriteMask & BRW_WRITEMASK_YZ) {
-      emit_op(c,
-	      TGSI_OPCODE_LIT,
-	      dst_mask(dst, BRW_WRITEMASK_YZ),
-	      inst->SaturateMode,
-	      src0,
-	      src_undef(),
-	      src_undef());
+      emit_op1(c,
+	       TGSI_OPCODE_LIT,
+	       brw_writemask(dst, BRW_WRITEMASK_YZ),
+	       src0);
    }
 }
 
@@ -601,99 +382,62 @@ static void precalc_lit( struct brw_wm_compile *c,
  * instruction itself.
  */
 static void precalc_tex( struct brw_wm_compile *c,
-			 const struct prog_instruction *inst )
+			 struct brw_dst dst,
+			 unsigned unit,
+			 struct brw_src src0 )
 {
-   struct prog_src_register coord;
-   struct prog_dst_register tmpcoord;
-   const GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
+   struct ureg_src coord = src_undef();
+   struct ureg_dst tmp = dst_undef();
 
    assert(unit < BRW_MAX_TEX_UNIT);
 
+   /* Cubemap: find longest component of coord vector and normalize
+    * it.
+    */
    if (inst->TexSrcTarget == TEXTURE_CUBE_INDEX) {
-       struct prog_instruction *out;
-       struct prog_dst_register tmp0 = get_temp(c);
-       struct prog_src_register tmp0src = src_reg_from_dst(tmp0);
-       struct prog_dst_register tmp1 = get_temp(c);
-       struct prog_src_register tmp1src = src_reg_from_dst(tmp1);
-       struct prog_src_register src0 = inst->SrcReg[0];
-
-       /* find longest component of coord vector and normalize it */
-       tmpcoord = get_temp(c);
-       coord = src_reg_from_dst(tmpcoord);
-
-       /* tmpcoord = src0 (i.e.: coord = src0) */
-       out = emit_op(c, TGSI_OPCODE_MOV,
-                     tmpcoord,
-                     0,
-                     src0,
-                     src_undef(),
-                     src_undef());
-       out->SrcReg[0].Negate = NEGATE_NONE;
-       out->SrcReg[0].Abs = 1;
-
-       /* tmp0 = MAX(coord.X, coord.Y) */
-       emit_op(c, TGSI_OPCODE_MAX,
-               tmp0,
-               0,
-               src_swizzle1(coord, X),
-               src_swizzle1(coord, Y),
-               src_undef());
-
-       /* tmp1 = MAX(tmp0, coord.Z) */
-       emit_op(c, TGSI_OPCODE_MAX,
-               tmp1,
-               0,
-               tmp0src,
-               src_swizzle1(coord, Z),
-               src_undef());
-
-       /* tmp0 = 1 / tmp1 */
-       emit_op(c, TGSI_OPCODE_RCP,
-               dst_mask(tmp0, BRW_WRITEMASK_X),
-               0,
-               tmp1src,
-               src_undef(),
-               src_undef());
-
-       /* tmpCoord = src0 * tmp0 */
-       emit_op(c, TGSI_OPCODE_MUL,
-               tmpcoord,
-               0,
-               src0,
-               src_swizzle1(tmp0src, SWIZZLE_X),
-               src_undef());
-
-       release_temp(c, tmp0);
-       release_temp(c, tmp1);
+      struct ureg_src tmpsrc;
+
+      tmp = get_temp(c);
+      tmpsrc = brw_src(tmpcoord)
+
+      /* tmp = abs(src0) */
+      emit_op1(c, 
+	       TGSI_OPCODE_MOV,
+	       tmp,
+	       brw_abs(src0));
+
+      /* tmp.X = MAX(tmp.X, tmp.Y) */
+      emit_op2(c, TGSI_OPCODE_MAX,
+	       brw_writemask(tmp, BRW_WRITEMASK_X),
+	       src_swizzle1(tmpsrc, X),
+	       src_swizzle1(tmpsrc, Y));
+
+      /* tmp.X = MAX(tmp.X, tmp.Z) */
+      emit_op2(c, TGSI_OPCODE_MAX,
+	       brw_writemask(tmp, BRW_WRITEMASK_X),
+	       tmpsrc,
+	       src_swizzle1(tmpsrc, Z));
+
+      /* tmp.X = 1 / tmp.X */
+      emit_op1(c, TGSI_OPCODE_RCP,
+	      dst_mask(tmp, BRW_WRITEMASK_X),
+	      tmpsrc);
+
+      /* tmp = src0 * tmp.xxxx */
+      emit_op2(c, TGSI_OPCODE_MUL,
+	       tmp,
+	       src0,
+	       src_swizzle1(tmpsrc, SWIZZLE_X));
+
+      coord = tmpsrc;
    }
    else if (inst->TexSrcTarget == TEXTURE_RECT_INDEX) {
-      struct prog_src_register scale = 
-	 search_or_add_param5( c, 
-			       STATE_INTERNAL, 
-			       STATE_TEXRECT_SCALE,
-			       unit,
-			       0,0 );
-
-      tmpcoord = get_temp(c);
-
-      /* coord.xy   = MUL inst->SrcReg[0], { 1/width, 1/height }
+      /* XXX: need a mechanism for internally generated constants.
        */
-      emit_op(c,
-	      TGSI_OPCODE_MUL,
-	      tmpcoord,
-	      0,
-	      inst->SrcReg[0],
-	      src_swizzle(scale,
-			  SWIZZLE_X,
-			  SWIZZLE_Y,
-			  SWIZZLE_ONE,
-			  SWIZZLE_ONE),
-	      src_undef());
-
-      coord = src_reg_from_dst(tmpcoord);
+      coord = src0;
    }
    else {
-      coord = inst->SrcReg[0];
+      coord = src0;
    }
 
    /* Need to emit YUV texture conversions by hand.  Probably need to
@@ -704,58 +448,36 @@ static void precalc_tex( struct brw_wm_compile *c,
    if (c->key.yuvtex_mask & (1 << unit)) {
       /* convert ycbcr to RGBA */
       GLboolean  swap_uv = c->key.yuvtex_swap_mask & (1<<unit);
-
-      /* 
-	 CONST C0 = { -.5, -.0625,  -.5, 1.164 }
-	 CONST C1 = { 1.596, -0.813, 2.018, -.391 }
-	 UYV     = TEX ...
-	 UYV.xyz = ADD UYV,     C0
-	 UYV.y   = MUL UYV.y,   C0.w
- 	 if (UV swaped)
-	    RGB.xyz = MAD UYV.zzx, C1,   UYV.y
-	 else
-	    RGB.xyz = MAD UYV.xxz, C1,   UYV.y 
-	 RGB.y   = MAD UYV.z,   C1.w, RGB.y
-      */
-      struct prog_dst_register dst = inst->DstReg;
-      struct prog_dst_register tmp = get_temp(c);
-      struct prog_src_register tmpsrc = src_reg_from_dst(tmp);
-      struct prog_src_register C0 = search_or_add_const4f( c,  -.5, -.0625, -.5, 1.164 );
-      struct prog_src_register C1 = search_or_add_const4f( c, 1.596, -0.813, 2.018, -.391 );
+      struct ureg_dst dst = inst->DstReg;
+      struct ureg_dst tmp = get_temp(c);
+      struct ureg_src tmpsrc = src_reg_from_dst(tmp);
+      struct ureg_src C0 = ureg_imm4f( c->ureg,  -.5, -.0625, -.5, 1.164 );
+      struct ureg_src C1 = ureg_imm4f( c->ureg, 1.596, -0.813, 2.018, -.391 );
      
       /* tmp     = TEX ...
        */
       emit_tex_op(c, 
                   TGSI_OPCODE_TEX,
-                  tmp,
-                  inst->SaturateMode,
+                  brw_saturate(tmp, dst.Saturate),
                   unit,
                   inst->TexSrcTarget,
-                  inst->TexShadow,
                   coord,
                   src_undef(),
                   src_undef());
 
       /* tmp.xyz =  ADD TMP, C0
        */
-      emit_op(c,
-	      TGSI_OPCODE_ADD,
-	      dst_mask(tmp, BRW_WRITEMASK_XYZ),
-	      0,
-	      tmpsrc,
-	      C0,
-	      src_undef());
+      emit_op2(c, TGSI_OPCODE_ADD,
+	       dst_mask(tmp, BRW_WRITEMASK_XYZ),
+	       tmpsrc,
+	       C0);
 
       /* YUV.y   = MUL YUV.y, C0.w
        */
-
-      emit_op(c,
-	      TGSI_OPCODE_MUL,
-	      dst_mask(tmp, BRW_WRITEMASK_Y),
-	      0,
-	      tmpsrc,
-	      src_swizzle1(C0, W),
-	      src_undef());
+      emit_op2(c, TGSI_OPCODE_MUL,
+	       dst_mask(tmp, BRW_WRITEMASK_Y),
+	       tmpsrc,
+	       src_swizzle1(C0, W));
 
       /* 
        * if (UV swaped)
@@ -764,23 +486,22 @@ static void precalc_tex( struct brw_wm_compile *c,
        *     RGB.xyz = MAD YUV.xxz, C1, YUV.y
        */
 
-      emit_op(c,
-	      TGSI_OPCODE_MAD,
-	      dst_mask(dst, BRW_WRITEMASK_XYZ),
-	      0,
-	      swap_uv?src_swizzle(tmpsrc, Z,Z,X,X):src_swizzle(tmpsrc, X,X,Z,Z),
-	      C1,
-	      src_swizzle1(tmpsrc, Y));
+      emit_op3(c, TGSI_OPCODE_MAD,
+	       dst_mask(dst, BRW_WRITEMASK_XYZ),
+	       ( swap_uv ? 
+		 src_swizzle(tmpsrc, Z,Z,X,X) : 
+		 src_swizzle(tmpsrc, X,X,Z,Z)),
+	       C1,
+	       src_swizzle1(tmpsrc, Y));
 
       /*  RGB.y   = MAD YUV.z, C1.w, RGB.y
        */
-      emit_op(c,
-	      TGSI_OPCODE_MAD,
-	      dst_mask(dst, BRW_WRITEMASK_Y),
-	      0,
-	      src_swizzle1(tmpsrc, Z),
-	      src_swizzle1(C1, W),
-	      src_swizzle1(src_reg_from_dst(dst), Y));
+      emit_op3(c,
+	       TGSI_OPCODE_MAD,
+	       dst_mask(dst, BRW_WRITEMASK_Y),
+	       src_swizzle1(tmpsrc, Z),
+	       src_swizzle1(C1, W),
+	       src_swizzle1(src_reg_from_dst(dst), Y));
 
       release_temp(c, tmp);
    }
@@ -789,29 +510,20 @@ static void precalc_tex( struct brw_wm_compile *c,
       emit_tex_op(c, 
                   TGSI_OPCODE_TEX,
                   inst->DstReg,
-                  inst->SaturateMode,
                   unit,
                   inst->TexSrcTarget,
-                  inst->TexShadow,
                   coord,
                   src_undef(),
                   src_undef());
    }
 
-   /* For GL_EXT_texture_swizzle: */
-   if (c->key.tex_swizzles[unit] != SWIZZLE_NOOP) {
-      /* swizzle the result of the TEX instruction */
-      struct prog_src_register tmpsrc = src_reg_from_dst(inst->DstReg);
-      emit_op(c, TGSI_OPCODE_MOV,
-              inst->DstReg,
-              SATURATE_OFF, /* saturate already done above */
-              src_swizzle4(tmpsrc, c->key.tex_swizzles[unit]),
-              src_undef(),
-              src_undef());
-   }
+   /* XXX: add GL_EXT_texture_swizzle support to gallium -- by
+    * generating shader variants in mesa state tracker.
+    */
 
-   if ((inst->TexSrcTarget == TEXTURE_RECT_INDEX) ||
-       (inst->TexSrcTarget == TEXTURE_CUBE_INDEX))
+   /* Release this temp if we ended up allocating it:
+    */
+   if (!brw_dst_is_undef(tmpcoord))
       release_temp(c, tmpcoord);
 }
 
@@ -822,7 +534,7 @@ static void precalc_tex( struct brw_wm_compile *c,
 static GLboolean projtex( struct brw_wm_compile *c,
 			  const struct prog_instruction *inst )
 {
-   const struct prog_src_register src = inst->SrcReg[0];
+   const struct ureg_src src = inst->SrcReg[0];
    GLboolean retVal;
 
    assert(inst->Opcode == TGSI_OPCODE_TXP);
@@ -836,7 +548,7 @@ static GLboolean projtex( struct brw_wm_compile *c,
     */
    if (inst->TexSrcTarget == TEXTURE_CUBE_INDEX)
       retVal = GL_FALSE;  /* ut2004 gun rendering !?! */
-   else if (src.File == PROGRAM_INPUT && 
+   else if (src.File == TGSI_FILE_INPUT && 
 	    GET_SWZ(src.Swizzle, W) == W &&
             (c->key.proj_attrib_mask & (1 << src.Index)) == 0)
       retVal = GL_FALSE;
@@ -853,10 +565,10 @@ static GLboolean projtex( struct brw_wm_compile *c,
 static void precalc_txp( struct brw_wm_compile *c,
 			       const struct prog_instruction *inst )
 {
-   struct prog_src_register src0 = inst->SrcReg[0];
+   struct ureg_src src0 = inst->SrcReg[0];
 
    if (projtex(c, inst)) {
-      struct prog_dst_register tmp = get_temp(c);
+      struct ureg_dst tmp = get_temp(c);
       struct prog_instruction tmp_inst;
 
       /* tmp0.w = RCP inst.arg[0][3]
@@ -864,7 +576,6 @@ static void precalc_txp( struct brw_wm_compile *c,
       emit_op(c,
 	      TGSI_OPCODE_RCP,
 	      dst_mask(tmp, BRW_WRITEMASK_W),
-	      0,
 	      src_swizzle1(src0, GET_SWZ(src0.Swizzle, W)),
 	      src_undef(),
 	      src_undef());
@@ -874,7 +585,6 @@ static void precalc_txp( struct brw_wm_compile *c,
       emit_op(c,
 	      TGSI_OPCODE_MUL,
 	      dst_mask(tmp, BRW_WRITEMASK_XYZ),
-	      0,
 	      src0,
 	      src_swizzle1(src_reg_from_dst(tmp), W),
 	      src_undef());
@@ -899,43 +609,30 @@ static void precalc_txp( struct brw_wm_compile *c,
 
 static void emit_fb_write( struct brw_wm_compile *c )
 {
-   struct prog_src_register payload_r0_depth = src_reg(PROGRAM_PAYLOAD, PAYLOAD_DEPTH);
-   struct prog_src_register outdepth = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DEPTH);
-   struct prog_src_register outcolor;
+   struct ureg_src payload_r0_depth = src_reg(TGSI_FILE_PAYLOAD, PAYLOAD_DEPTH);
+   struct ureg_src outdepth = src_reg(TGSI_FILE_OUTPUT, FRAG_RESULT_DEPTH);
+   struct ureg_src outcolor;
+   struct prog_instruction *inst;
    GLuint i;
 
-   struct prog_instruction *inst, *last_inst;
-   struct brw_context *brw = c->func.brw;
 
    /* The inst->Aux field is used for FB write target and the EOT marker */
 
-   if (brw->state.nr_color_regions > 1) {
-      for (i = 0 ; i < brw->state.nr_color_regions; i++) {
-         outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DATA0 + i);
-         last_inst = inst = emit_op(c,
-                                    WM_FB_WRITE, dst_mask(dst_undef(),0), 0,
-                                    outcolor, payload_r0_depth, outdepth);
-         inst->Aux = (i<<1);
-         if (c->fp_fragcolor_emitted) {
-            outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLOR);
-            last_inst = inst = emit_op(c, WM_FB_WRITE, dst_mask(dst_undef(),0),
-                                       0, outcolor, payload_r0_depth, outdepth);
-            inst->Aux = (i<<1);
-         }
-      }
-      last_inst->Aux |= 1; //eot
-   }
-   else {
-      /* if gl_FragData[0] is written, use it, else use gl_FragColor */
-      if (c->fp->program.Base.OutputsWritten & (1 << FRAG_RESULT_DATA0))
-         outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DATA0);
-      else 
-         outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLOR);
-
-      inst = emit_op(c, WM_FB_WRITE, dst_mask(dst_undef(),0),
-                     0, outcolor, payload_r0_depth, outdepth);
-      inst->Aux = 1|(0<<1);
+   for (i = 0 ; i < c->key.nr_cbufs; i++) {
+      outcolor = find_output_by_semantic(c, TGSI_SEMANTIC_COLOR, i);
+
+      inst = emit_op(c, WM_FB_WRITE,
+		     dst_mask(dst_undef(), 0),
+		     outcolor,
+		     payload_r0_depth,
+		     outdepth);
+
+      inst->Aux = (i<<1);
    }
+ 
+   /* Set EOT flag on last inst:
+    */
+   inst->Aux |= 1; //eot
 }
 
 
@@ -952,7 +649,7 @@ static void validate_src_regs( struct brw_wm_compile *c,
    GLuint i;
 
    for (i = 0; i < nr_args; i++) {
-      if (inst->SrcReg[i].File == PROGRAM_INPUT) {
+      if (inst->SrcReg[i].File == TGSI_FILE_INPUT) {
 	 GLuint idx = inst->SrcReg[i].Index;
 	 if (!(c->fp_interp_emitted & (1<<idx))) {
 	    emit_interp(c, idx);
@@ -965,34 +662,86 @@ static void validate_src_regs( struct brw_wm_compile *c,
 static void validate_dst_regs( struct brw_wm_compile *c,
 			       const struct prog_instruction *inst )
 {
-   if (inst->DstReg.File == PROGRAM_OUTPUT) {
+   if (inst->DstReg.File == TGSI_FILE_OUTPUT) {
       GLuint idx = inst->DstReg.Index;
       if (idx == FRAG_RESULT_COLOR)
-         c->fp_fragcolor_emitted = 1;
+         c->fp_fragcolor_emitted |= inst->DstReg.WriteMask;
    }
 }
 
-static void print_insns( const struct prog_instruction *insn,
-			 GLuint nr )
+
+
+static void emit_insn( struct brw_wm_compile *c,
+		       const struct tgsi_full_instruction *inst )
 {
-   GLuint i;
-   for (i = 0; i < nr; i++, insn++) {
-      debug_printf("%3d: ", i);
-      if (insn->Opcode < MAX_OPCODE)
-	 _mesa_print_instruction(insn);
-      else if (insn->Opcode < MAX_WM_OPCODE) {
-	 GLuint idx = insn->Opcode - MAX_OPCODE;
-
-	 _mesa_print_alu_instruction(insn,
-				     wm_opcode_strings[idx],
-				     3);
-      }
-      else 
-	 debug_printf("965 Opcode %d\n", insn->Opcode);
+
+   switch (inst->Opcode) {
+   case TGSI_OPCODE_ABS:
+      emit_op1(c, TGSI_OPCODE_MOV,
+	       dst, 
+	       brw_abs(src[0]));
+      break;
+
+   case TGSI_OPCODE_SUB: 
+      emit_op2(c, TGSI_OPCODE_ADD,
+	       dst,
+	       src[0],
+	       brw_negate(src[1]));
+      break;
+
+   case TGSI_OPCODE_SCS: 
+      emit_op1(c, TGSI_OPCODE_SCS,
+	       brw_writemask(dst, BRW_WRITEMASK_XY),
+	       src[0]);
+      break;
+	 
+   case TGSI_OPCODE_DST:
+      precalc_dst(c, inst);
+      break;
+
+   case TGSI_OPCODE_LIT:
+      precalc_lit(c, inst);
+      break;
+
+   case TGSI_OPCODE_TEX:
+      precalc_tex(c, inst);
+      break;
+
+   case TGSI_OPCODE_TXP:
+      precalc_txp(c, inst);
+      break;
+
+   case TGSI_OPCODE_TXB:
+      out = emit_insn(c, inst);
+      out->TexSrcUnit = fp->program.Base.SamplerUnits[inst->TexSrcUnit];
+      assert(out->TexSrcUnit < BRW_MAX_TEX_UNIT);
+      break;
+
+   case TGSI_OPCODE_XPD: 
+      emit_op2(c, TGSI_OPCODE_XPD,
+	       brw_writemask(dst, BRW_WRITEMASK_XYZ),
+	       src[0], 
+	       src[1]);
+      break;
+
+   case TGSI_OPCODE_KIL: 
+      emit_op1(c, TGSI_OPCODE_KIL,
+	       brw_writemask(dst_undef(), 0),
+	       src[0]);
+      break;
+
+   case TGSI_OPCODE_END:
+      emit_fb_write(c);
+      break;
+   default:
+      if (brw_wm_is_scalar_result(inst->Opcode))
+	 emit_scalar_insn(c, opcode, dst, src[0], src[1], src[2]);
+      else
+	 emit_op(c, opcode, dst, src[0], src[1], src[2]);
+      break;
    }
 }
 
-
 /**
  * Initial pass for fragment program code generation.
  * This function is used by both the GLSL and non-GLSL paths.
@@ -1004,108 +753,62 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
 
    if (BRW_DEBUG & DEBUG_WM) {
       debug_printf("pre-fp:\n");
-      _mesa_print_program(&fp->program.Base); 
-      debug_printf("\n");
+      tgsi_dump(fp->tokens, 0); 
    }
 
-   c->pixel_xy = src_undef();
-   c->delta_xy = src_undef();
-   c->pixel_w = src_undef();
+   c->pixel_xy = brw_src_undef();
+   c->delta_xy = brw_src_undef();
+   c->pixel_w = brw_src_undef();
    c->nr_fp_insns = 0;
    c->fp->tex_units_used = 0x0;
 
-   /* Emit preamble instructions.  This is where special instructions such as
-    * WM_CINTERP, WM_LINTERP, WM_PINTERP and WM_WPOSXY are emitted to
-    * compute shader inputs from varying vars.
-    */
-   for (insn = 0; insn < fp->program.Base.NumInstructions; insn++) {
-      const struct prog_instruction *inst = &fp->program.Base.Instructions[insn];
-      validate_src_regs(c, inst);
-      validate_dst_regs(c, inst);
-   }
 
    /* Loop over all instructions doing assorted simplifications and
     * transformations.
     */
-   for (insn = 0; insn < fp->program.Base.NumInstructions; insn++) {
-      const struct prog_instruction *inst = &fp->program.Base.Instructions[insn];
-      struct prog_instruction *out;
-
-      /* Check for INPUT values, emit INTERP instructions where
-       * necessary:
-       */
-
-      switch (inst->Opcode) {
-      case TGSI_OPCODE_ABS:
-	 out = emit_insn(c, inst);
-	 out->Opcode = TGSI_OPCODE_MOV;
-	 out->SrcReg[0].Negate = NEGATE_NONE;
-	 out->SrcReg[0].Abs = 1;
-	 break;
-
-      case TGSI_OPCODE_SUB: 
-	 out = emit_insn(c, inst);
-	 out->Opcode = TGSI_OPCODE_ADD;
-	 out->SrcReg[1].Negate ^= NEGATE_XYZW;
-	 break;
-
-      case TGSI_OPCODE_SCS: 
-	 out = emit_insn(c, inst);
-	 /* This should probably be done in the parser. 
+   tgsi_parse_init( &parse, tokens );
+   while( !tgsi_parse_end_of_tokens( &parse ) ) {
+      tgsi_parse_token( &parse );
+
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+	 /* If branching shader, emit preamble instructions at decl time, as
+	  * instruction order in the shader does not correspond to the order
+	  * instructions are executed in the wild.
+	  *
+	  * This is where special instructions such as WM_CINTERP,
+	  * WM_LINTERP, WM_PINTERP and WM_WPOSXY are emitted to compute
+	  * shader inputs from varying vars.
+	  *
+	  * XXX: For non-branching shaders, consider deferring variable
+	  * initialization as late as possible to minimize register
+	  * usage.  This is how the original BRW driver worked.
 	  */
-	 out->DstReg.WriteMask &= BRW_WRITEMASK_XY;
-	 break;
-	 
-      case TGSI_OPCODE_DST:
-	 precalc_dst(c, inst);
-	 break;
-
-      case TGSI_OPCODE_LIT:
-	 precalc_lit(c, inst);
-	 break;
-
-      case TGSI_OPCODE_TEX:
-	 precalc_tex(c, inst);
-	 break;
-
-      case TGSI_OPCODE_TXP:
-	 precalc_txp(c, inst);
-	 break;
-
-      case TGSI_OPCODE_TXB:
-	 out = emit_insn(c, inst);
-	 out->TexSrcUnit = fp->program.Base.SamplerUnits[inst->TexSrcUnit];
-         assert(out->TexSrcUnit < BRW_MAX_TEX_UNIT);
-	 break;
-
-      case TGSI_OPCODE_XPD: 
-	 out = emit_insn(c, inst);
-	 /* This should probably be done in the parser. 
+	 validate_src_regs(c, inst);
+	 validate_dst_regs(c, inst);
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+	 /* Unlike VS programs we can probably manage fine encoding
+	  * immediate values directly into the emitted EU
+	  * instructions, as we probably only need to reference one
+	  * float value per instruction.  Just save the data for now
+	  * and use directly later.
 	  */
-	 out->DstReg.WriteMask &= BRW_WRITEMASK_XYZ;
 	 break;
 
-      case TGSI_OPCODE_KIL: 
-	 out = emit_insn(c, inst);
-	 /* This should probably be done in the parser. 
-	  */
-	 out->DstReg.WriteMask = 0;
-	 break;
-      case TGSI_OPCODE_END:
-	 emit_fb_write(c);
-	 break;
-      default:
-	 if (brw_wm_is_scalar_result(inst->Opcode))
-	    emit_scalar_insn(c, inst);
-	 else
-	    emit_insn(c, inst);
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         inst = &parse.FullToken.FullInstruction;
+	 emit_insn( c, inst );
 	 break;
       }
    }
 
+   c->brw_program = brw_finalize( c->builder );
+
    if (BRW_DEBUG & DEBUG_WM) {
       debug_printf("pass_fp:\n");
-      print_insns( c->prog_instructions, c->nr_fp_insns );
+      brw_print_program( c->brw_program );
       debug_printf("\n");
    }
 }
diff --git a/src/gallium/drivers/i965/brw_wm_glsl.c b/src/gallium/drivers/i965/brw_wm_glsl.c
index 59bc4ef701..cdc10484a6 100644
--- a/src/gallium/drivers/i965/brw_wm_glsl.c
+++ b/src/gallium/drivers/i965/brw_wm_glsl.c
@@ -332,7 +332,7 @@ static void prealloc_reg(struct brw_wm_compile *c)
 	  for (j = 0; j < 4; j++)
 	     set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
        }
-       if (c->key.vp_outputs_written & (1 << i)) {
+       if (c->key.nr_vp_outputs > i) {
 	  reg_index += 2;
        }
     }
@@ -1670,7 +1670,7 @@ get_argument_regs(struct brw_wm_compile *c,
     }
 }
 
-static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
+static void brw_wm_emit_branching_shader(struct brw_context *brw, struct brw_wm_compile *c)
 {
 #define MAX_IF_DEPTH 32
 #define MAX_LOOP_DEPTH 32
@@ -1943,20 +1943,20 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
  * Do GPU code generation for shaders that use GLSL features such as
  * flow control.  Other shaders will be compiled with the 
  */
-void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
+void brw_wm_branching_shader_emit(struct brw_context *brw, struct brw_wm_compile *c)
 {
     if (BRW_DEBUG & DEBUG_WM) {
-        debug_printf("brw_wm_glsl_emit:\n");
+       debug_printf("%s:\n", __FUNCTION__);
     }
 
     /* initial instruction translation/simplification */
     brw_wm_pass_fp(c);
 
     /* actual code generation */
-    brw_wm_emit_glsl(brw, c);
+    brw_wm_emit_branching_shader(brw, c);
 
     if (BRW_DEBUG & DEBUG_WM) {
-        brw_wm_print_program(c, "brw_wm_glsl_emit done");
+        brw_wm_print_program(c, "brw_wm_branching_shader_emit done");
     }
 
     c->prog_data.total_grf = num_grf_used(c);
diff --git a/src/gallium/drivers/i965/brw_wm_pass0.c b/src/gallium/drivers/i965/brw_wm_pass0.c
index 71e4c56835..d8b9028927 100644
--- a/src/gallium/drivers/i965/brw_wm_pass0.c
+++ b/src/gallium/drivers/i965/brw_wm_pass0.c
@@ -168,54 +168,20 @@ static const struct brw_wm_ref *pass0_get_reg( struct brw_wm_compile *c,
 
    if (!ref) {
       switch (file) {
-      case PROGRAM_INPUT:
-      case PROGRAM_PAYLOAD:
-      case PROGRAM_TEMPORARY:
-      case PROGRAM_OUTPUT:
-      case PROGRAM_VARYING:
+      case TGSI_FILE_INPUT:
+      case TGSI_FILE_TEMPORARY:
+      case TGSI_FILE_OUTPUT:
+      case BRW_FILE_PAYLOAD:
+	 /* should already be done?? */
 	 break;
 
-      case PROGRAM_LOCAL_PARAM:
-	 ref = get_param_ref(c, &c->fp->program.Base.LocalParams[idx][component]);
-	 break;
-
-      case PROGRAM_ENV_PARAM:
+      case TGSI_FILE_CONSTANT:
 	 ref = get_param_ref(c, &c->env_param[idx][component]);
 	 break;
 
-      case PROGRAM_STATE_VAR:
-      case PROGRAM_UNIFORM:
-      case PROGRAM_CONSTANT:
-      case PROGRAM_NAMED_PARAM: {
-	 struct gl_program_parameter_list *plist = c->fp->program.Base.Parameters;
-	 
-	 /* There's something really hokey about parameters parsed in
-	  * arb programs - they all end up in here, whether they be
-	  * state values, parameters or constants.  This duplicates the
-	  * structure above & also seems to subvert the limits set for
-	  * each type of constant/param.
-	  */ 
-	 switch (plist->Parameters[idx].Type) {
-	 case PROGRAM_NAMED_PARAM:
-	 case PROGRAM_CONSTANT:
-	    /* These are invarient:
-	     */
-	    ref = get_imm_ref(c, &plist->ParameterValues[idx][component]);
-	    break;
-
-	 case PROGRAM_STATE_VAR:
-	 case PROGRAM_UNIFORM:
-	    /* These may change from run to run:
-	     */
-	    ref = get_param_ref(c, &plist->ParameterValues[idx][component] );
-	    break;
-
-	 default:
-	    assert(0);
-	    break;
-	 }
+      case TGSI_FILE_IMMEDIATE:
+	 ref = get_imm_ref(c, &plist->ParameterValues[idx][component]);
 	 break;
-      }
 
       default:
 	 assert(0);
@@ -310,17 +276,16 @@ translate_insn(struct brw_wm_compile *c,
                const struct prog_instruction *inst)
 {
    struct brw_wm_instruction *out = get_instruction(c);
-   GLuint writemask = inst->DstReg.WriteMask;
+   GLuint writemask = inst->dst.WriteMask;
    GLuint nr_args = brw_wm_nr_args(inst->Opcode);
    GLuint i, j;
 
    /* Copy some data out of the instruction
     */
    out->opcode = inst->Opcode;
-   out->saturate = (inst->SaturateMode != SATURATE_OFF);
+   out->saturate = inst->dst.Saturate;
    out->tex_unit = inst->TexSrcUnit;
-   out->tex_idx = inst->TexSrcTarget;
-   out->tex_shadow = inst->TexShadow;
+   out->tex_target = inst->TexSrcTarget;
    out->eot = inst->Aux & 1;
    out->target = inst->Aux >> 1;
 
@@ -328,7 +293,7 @@ translate_insn(struct brw_wm_compile *c,
     */
    for (i = 0; i < nr_args; i++) {
       for (j = 0; j < 4; j++) {
-	 out->src[i][j] = get_new_ref(c, inst->SrcReg[i], j, out);
+	 out->src[i][j] = get_new_ref(c, inst->src[i], j, out);
       }
    }
 
@@ -380,15 +345,6 @@ static void pass0_init_payload( struct brw_wm_compile *c )
 			     &c->payload.depth[j] );
    }
 
-#if 0
-   /* This seems to be an alternative to the INTERP_WPOS stuff I do
-    * elsewhere:
-    */
-   if (c->key.source_depth_reg)
-      pass0_set_fpreg_value(c, PROGRAM_INPUT, FRAG_ATTRIB_WPOS, 2,
-			    &c->payload.depth[c->key.source_depth_reg/2]);
-#endif
-   
    for (i = 0; i < FRAG_ATTRIB_MAX; i++)
       pass0_set_fpreg_value( c, PROGRAM_PAYLOAD, i, 0, 
 			     &c->payload.input_interp[i] );      
@@ -403,6 +359,9 @@ static void pass0_init_payload( struct brw_wm_compile *c )
  * the same number.
  *
  * Translate away swizzling and eliminate non-saturating moves.
+ *
+ * Translate instructions from Mesa's prog_instruction structs to our
+ * internal brw_wm_instruction representation.
  */
 void brw_wm_pass0( struct brw_wm_compile *c )
 {
@@ -421,7 +380,7 @@ void brw_wm_pass0( struct brw_wm_compile *c )
        */      
       switch (inst->Opcode) {
       case OPCODE_MOV: 
-	 if (!inst->SaturateMode) {
+	 if (!inst->dst.Saturate) {
 	    pass0_precalc_mov(c, inst);
 	 }
 	 else {
diff --git a/src/gallium/drivers/i965/brw_wm_pass1.c b/src/gallium/drivers/i965/brw_wm_pass1.c
index 85a3a55ca4..b0356b1bd5 100644
--- a/src/gallium/drivers/i965/brw_wm_pass1.c
+++ b/src/gallium/drivers/i965/brw_wm_pass1.c
@@ -90,17 +90,24 @@ static void track_arg(struct brw_wm_compile *c,
 static GLuint get_texcoord_mask( GLuint tex_idx )
 {
    switch (tex_idx) {
-   case TEXTURE_1D_INDEX:
+   case TGSI_TEXTURE_1D:
       return BRW_WRITEMASK_X;
-   case TEXTURE_2D_INDEX:
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
       return BRW_WRITEMASK_XY;
-   case TEXTURE_3D_INDEX:
+   case TGSI_TEXTURE_3D:
       return BRW_WRITEMASK_XYZ;
-   case TEXTURE_CUBE_INDEX:
+   case TGSI_TEXTURE_CUBE:
       return BRW_WRITEMASK_XYZ;
-   case TEXTURE_RECT_INDEX:
-      return BRW_WRITEMASK_XY;
-   default: return 0;
+
+   case TGSI_TEXTURE_SHADOW1D:
+      return BRW_WRITEMASK_XZ;
+   case TGSI_TEXTURE_SHADOW2D:
+   case TGSI_TEXTURE_SHADOWRECT:
+      return BRW_WRITEMASK_XYZ;
+   default: 
+      assert(0);
+      return 0;
    }
 }
 
@@ -217,14 +224,9 @@ void brw_wm_pass1( struct brw_wm_compile *c )
       case TGSI_OPCODE_TEX:
       case TGSI_OPCODE_TXP:
 	 read0 = get_texcoord_mask(inst->tex_idx);
-
-         if (inst->tex_shadow)
-	    read0 |= BRW_WRITEMASK_Z;
 	 break;
 
       case TGSI_OPCODE_TXB:
-	 /* Shadow ignored for txb.
-	  */
 	 read0 = get_texcoord_mask(inst->tex_idx) | BRW_WRITEMASK_W;
 	 break;
 
diff --git a/src/gallium/drivers/i965/brw_wm_state.c b/src/gallium/drivers/i965/brw_wm_state.c
index edabf6ceb6..1898f38cef 100644
--- a/src/gallium/drivers/i965/brw_wm_state.c
+++ b/src/gallium/drivers/i965/brw_wm_state.c
@@ -52,7 +52,7 @@ struct brw_wm_unit_key {
    unsigned int max_threads;
 
    unsigned int nr_surfaces, sampler_count;
-   GLboolean uses_depth, computes_depth, uses_kill, is_glsl;
+   GLboolean uses_depth, computes_depth, uses_kill, has_flow_control;
    GLboolean polygon_stipple, stats_wm, line_stipple, offset_enable;
    GLfloat offset_units, offset_factor;
 };
@@ -114,10 +114,10 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
 
    /* _NEW_COLOR */
    key->uses_kill = fp->UsesKill || ctx->Color.AlphaEnabled;
-   key->is_glsl = bfp->isGLSL;
+   key->has_flow_control = bfp->has_flow_control;
 
    /* temporary sanity check assertion */
-   ASSERT(bfp->isGLSL == brw_wm_is_glsl(fp));
+   ASSERT(bfp->has_flow_control == brw_wm_has_flow_control(fp));
 
    /* _NEW_QUERY */
    key->stats_wm = (brw->query.stats_wm != 0);
@@ -184,7 +184,7 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
    wm.wm5.program_computes_depth = key->computes_depth;
    wm.wm5.program_uses_killpixel = key->uses_kill;
 
-   if (key->is_glsl)
+   if (key->has_flow_control)
       wm.wm5.enable_8_pix = 1;
    else
       wm.wm5.enable_16_pix = 1;
-- 
cgit v1.2.3


From 5d61b6f1f64ca26dd038af0679873ef0353660dd Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Sat, 31 Oct 2009 15:05:01 +0000
Subject: i965g: wip on fragment shaders

---
 src/gallium/drivers/i965/brw_wm.h    |  63 ++-
 src/gallium/drivers/i965/brw_wm_fp.c | 871 ++++++++++++++++++++++++++---------
 2 files changed, 698 insertions(+), 236 deletions(-)

(limited to 'src/gallium/drivers/i965/brw_wm.h')

diff --git a/src/gallium/drivers/i965/brw_wm.h b/src/gallium/drivers/i965/brw_wm.h
index 2cd5bb7081..8ee99420aa 100644
--- a/src/gallium/drivers/i965/brw_wm.h
+++ b/src/gallium/drivers/i965/brw_wm.h
@@ -74,6 +74,7 @@ struct brw_wm_prog_key {
 
    GLuint vp_nr_outputs:6;
    GLuint nr_cbufs:3;
+   GLuint has_flow_control:1;
 
    GLuint program_string_id;
 };
@@ -176,9 +177,36 @@ struct brw_wm_instruction {
 #define MAX_WM_OPCODE     (MAX_OPCODE + 9)
 
 #define BRW_FILE_PAYLOAD   (TGSI_FILE_COUNT)
-#define PAYLOAD_DEPTH      (FRAG_ATTRIB_MAX) /* ?? */
+#define PAYLOAD_DEPTH      (PIPE_MAX_SHADER_INPUTS) /* ?? */
+
+
+struct brw_fp_src {		/* source operand of a brw_fp_instruction */
+   unsigned file:4;		/* register file, e.g. TGSI_FILE_* or BRW_FILE_PAYLOAD */
+   unsigned index:16;		/* register number within that file */
+   unsigned swizzle:8;		/* 2 bits per channel, X in the low bits */
+   unsigned indirect:1;
+   unsigned negate:1;		/* negate flag */
+   unsigned abs:1;		/* absolute-value flag */
+};
+
+struct brw_fp_dst {		/* destination operand of a brw_fp_instruction */
+   unsigned file:4;		/* register file; TGSI_FILE_NULL marks "undef" */
+   unsigned index:16;		/* register number within that file */
+   unsigned writemask:4;	/* BRW_WRITEMASK_* channel-enable bits */
+   unsigned indirect:1;
+   unsigned saturate:1;		/* saturate flag */
+};
+
+struct brw_fp_instruction {	/* one entry of brw_wm_compile's fp_instructions[] */
+   struct brw_fp_dst dst;
+   struct brw_fp_src src[3];	/* operands an opcode does not use are undef */
+   unsigned opcode:8;
+   unsigned tex_unit:4;		/* emit_op0..3 store 0 here */
+   unsigned tex_target:4;	/* emit_op0..3 store 0 here */
+   unsigned target:10;		/* destination surface for FB_WRITE */
+   unsigned eot:1;		/* mark last instruction (usually FB_WRITE) */
+};
 
-struct brw_passfp_program;
 
 struct brw_wm_compile {
    struct brw_compile func;
@@ -198,9 +226,26 @@ struct brw_wm_compile {
     * simplifying and adding instructions for interpolation and
     * framebuffer writes.
     */
-   struct brw_passfp_program *pass_fp;
-
-
+   struct {
+      GLfloat v[4];
+      unsigned nr;
+   } immediate[BRW_WM_MAX_CONST+3];
+   GLuint nr_immediates;
+   
+   struct brw_fp_instruction fp_instructions[BRW_WM_MAX_INSN];
+   GLuint nr_fp_insns;
+   GLuint fp_temp;
+   GLuint fp_interp_emitted;
+   GLuint fp_fragcolor_emitted;
+   GLuint fp_first_internal_temp;
+
+   struct brw_fp_src fp_pixel_xy;
+   struct brw_fp_src fp_delta_xy;
+   struct brw_fp_src fp_pixel_w;
+
+
+   /* Subsequent passes using SSA representation:
+    */
    struct brw_wm_value vreg[BRW_WM_MAX_VREG];
    GLuint nr_vreg;
 
@@ -213,7 +258,7 @@ struct brw_wm_compile {
    } payload;
 
 
-   const struct brw_wm_ref *pass0_fp_reg[PROGRAM_PAYLOAD+1][256][4];
+   const struct brw_wm_ref *pass0_fp_reg[BRW_FILE_PAYLOAD+1][256][4];
 
    struct brw_wm_ref undef_ref;
    struct brw_wm_value undef_value;
@@ -241,7 +286,7 @@ struct brw_wm_compile {
    struct {
       GLboolean inited;
       struct brw_reg reg;
-   } wm_regs[PROGRAM_PAYLOAD+1][256][4];
+   } wm_regs[BRW_FILE_PAYLOAD+1][256][4];
 
    GLboolean used_grf[BRW_WM_MAX_GRF];
    GLuint first_free_grf;
@@ -258,13 +303,15 @@ struct brw_wm_compile {
       GLint index;
       struct brw_reg reg;
    } current_const[3];
+
+   GLuint error;
 };
 
 
 GLuint brw_wm_nr_args( GLuint opcode );
 GLuint brw_wm_is_scalar_result( GLuint opcode );
 
-void brw_wm_pass_fp( struct brw_wm_compile *c );
+int brw_wm_pass_fp( struct brw_wm_compile *c );
 void brw_wm_pass0( struct brw_wm_compile *c );
 void brw_wm_pass1( struct brw_wm_compile *c );
 void brw_wm_pass2( struct brw_wm_compile *c );
diff --git a/src/gallium/drivers/i965/brw_wm_fp.c b/src/gallium/drivers/i965/brw_wm_fp.c
index 8ba037cdae..57933afbbe 100644
--- a/src/gallium/drivers/i965/brw_wm_fp.c
+++ b/src/gallium/drivers/i965/brw_wm_fp.c
@@ -31,15 +31,26 @@
                
 
 #include "pipe/p_shader_tokens.h"
+#include "pipe/p_error.h"
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_info.h"
+#include "tgsi/tgsi_util.h"
 
 #include "brw_wm.h"
 #include "brw_util.h"
+#include "brw_debug.h"
 
 
 #define X    0
 #define Y    1
 #define Z    2
 #define W    3
+#define GET_SWZ(swz, comp) (((swz) >> ((comp)*2)) & 0x3)
 
 
 static const char *wm_opcode_strings[] = {
@@ -54,7 +65,294 @@ static const char *wm_opcode_strings[] = {
    "FRONTFACING",
 };
 
+/***********************************************************************
+ * Source regs
+ */
+
+static struct brw_fp_src src_reg(GLuint file, GLuint idx)
+{
+   struct brw_fp_src reg;
+   reg.file = file;
+   reg.index = idx;
+   reg.swizzle = BRW_SWIZZLE_XYZW;
+   reg.indirect = 0;
+   reg.negate = 0;
+   reg.abs = 0;
+   return reg;
+}
+
+static struct brw_fp_src src_reg_from_dst(struct brw_fp_dst dst)
+{
+   return src_reg(dst.file, dst.index);
+}
+
+static struct brw_fp_src src_undef( void )
+{
+   return src_reg(TGSI_FILE_NULL, 0);
+}
+
+static GLboolean src_is_undef(struct brw_fp_src src)
+{
+   return src.file == TGSI_FILE_NULL;
+}
+
+static struct brw_fp_src src_swizzle( struct brw_fp_src reg, int x, int y, int z, int w )
+{
+   unsigned swz = reg.swizzle;
+   /* Compose swizzles: new X/Y/Z/W read what reg's channels x/y/z/w read. */
+   reg.swizzle = ( GET_SWZ(swz, x) << 0 |
+		   GET_SWZ(swz, y) << 2 |
+		   GET_SWZ(swz, z) << 4 |
+		   GET_SWZ(swz, w) << 6 );
+
+   return reg;
+}
+
+static struct brw_fp_src src_scalar( struct brw_fp_src reg, int x )
+{
+   return src_swizzle(reg, x, x, x, x);
+}
+
+static struct brw_fp_src src_abs( struct brw_fp_src src )
+{
+   src.negate = 0;
+   src.abs = 1;
+   return src;
+}
+
+/* Return -src: toggle negate so -(-x) gives x, and leave abs untouched. */
+static struct brw_fp_src src_negate( struct brw_fp_src src )
+{
+   src.negate ^= 1;
+   return src;
+}
+
+/* Express v[0..nr) via v2[0..*nr2), appending unmatched values (max 4). */
+static int match_or_expand_immediate( const float *v,
+                                      unsigned nr,
+                                      float *v2,
+                                      unsigned *nr2,
+                                      unsigned *swizzle )
+{
+   unsigned i, j;
+   
+   *swizzle = 0;
+
+   for (i = 0; i < nr; i++) {
+      boolean found = FALSE;
+      /* Reuse an existing v2 slot that already holds this value, if any. */
+      for (j = 0; j < *nr2 && !found; j++) {
+         if (v[i] == v2[j]) {
+            *swizzle |= j << (i * 2);
+            found = TRUE;
+         }
+      }
+      /* Otherwise append it; v2 has room for four values only. */
+      if (!found) {
+         if (*nr2 >= 4) 
+            return FALSE;	/* values appended so far are not undone */
+
+         v2[*nr2] = v[i];
+         *swizzle |= *nr2 << (i * 2);
+         (*nr2)++;
+      }
+   }
+   /* *swizzle now holds a 2-bit v2 slot index for each of v[0..nr). */
+   return TRUE;
+}
+
+
+
+/* Internally generated immediates: overkill...  Returns a swizzled
+ * IMMEDIATE src for v[0..nr), or sets c->error and returns undef. */
+static struct brw_fp_src src_imm( struct brw_wm_compile *c, 
+				  const GLfloat *v, 
+				  unsigned nr)
+{
+   unsigned i, j;
+   unsigned swizzle;
+
+   /* Could do a first pass where we examine all existing immediates
+    * without expanding.
+    */
+
+   for (i = 0; i < c->nr_immediates; i++) {
+      if (match_or_expand_immediate( v, 
+                                     nr,
+                                     c->immediate[i].v,
+                                     &c->immediate[i].nr, 
+                                     &swizzle ))
+         goto out;
+   }
+   /* No existing immediate fits: start a new one if the table has room. */
+   if (c->nr_immediates < Elements(c->immediate)) {
+      i = c->nr_immediates++;
+      if (match_or_expand_immediate( v,
+                                     nr,
+                                     c->immediate[i].v,
+                                     &c->immediate[i].nr, 
+                                     &swizzle ))
+         goto out;
+   }
+   /* Could not place the values in any immediate. */
+   c->error = 1;
+   return src_undef();
+
+out:
+   /* Make sure that all referenced elements are from this immediate.
+    * Has the effect of making size-one immediates into scalars.
+    */
+   for (j = nr; j < 4; j++)
+      swizzle |= (swizzle & 0x3) << (j * 2);
+
+   return src_swizzle( src_reg( TGSI_FILE_IMMEDIATE, i ),
+		       GET_SWZ(swizzle, X),
+		       GET_SWZ(swizzle, Y),
+		       GET_SWZ(swizzle, Z),
+		       GET_SWZ(swizzle, W) );
+}
+
+
+
+static struct brw_fp_src src_imm1f( struct brw_wm_compile *c,
+				    GLfloat f )
+{
+   return src_imm(c, &f, 1);
+}
+
+static struct brw_fp_src src_imm4f( struct brw_wm_compile *c,
+				    GLfloat x,
+				    GLfloat y,
+				    GLfloat z,
+				    GLfloat w)
+{
+   GLfloat f[4] = {x,y,z,w};
+   return src_imm(c, f, 4);
+}
+
+
+
+/***********************************************************************
+ * Dest regs
+ */
+
+/* Dest reg for file/idx: all four channels written, every other flag zero. */
+static struct brw_fp_dst dst_reg(GLuint file, GLuint idx)
+{
+   struct brw_fp_dst reg = { 0 };	/* clears indirect and saturate too */
+   reg.file = file;
+   reg.index = idx;
+   reg.writemask = BRW_WRITEMASK_XYZW;
+   return reg;
+}
+
+static struct brw_fp_dst dst_mask( struct brw_fp_dst reg, int mask )
+{
+   reg.writemask &= mask;
+   return reg;
+}
+
+static struct brw_fp_dst dst_undef( void )
+{
+   return dst_reg(TGSI_FILE_NULL, 0);
+}
+
+static boolean dst_is_undef( struct brw_fp_dst dst )
+{
+   return dst.file == TGSI_FILE_NULL;
+}
+
+static struct brw_fp_dst dst_saturate( struct brw_fp_dst reg, boolean flag )
+{
+   reg.saturate = flag;
+   return reg;
+}
+
+/* Allocate a free internal temporary, or flag c->error and return undef. */
+static struct brw_fp_dst get_temp( struct brw_wm_compile *c )
+{
+   int bit = ffs( ~c->fp_temp );
+   if (!bit) {
+      debug_printf("%s: out of temporaries\n", __FILE__);
+      c->error = 1;		/* bit-1 would be a negative shift count */
+      return dst_undef();
+   }
+   c->fp_temp |= 1u<<(bit-1);	/* unsigned: bit 31 must not overflow int */
+   return dst_reg(TGSI_FILE_TEMPORARY, c->fp_first_internal_temp+(bit-1));
+}
+
+static void release_temp( struct brw_wm_compile *c, struct brw_fp_dst temp )
+{
+   c->fp_temp &= ~(1 << (temp.index - c->fp_first_internal_temp));
+}
+
+
+/***********************************************************************
+ * Instructions 
+ */
+
+static struct brw_fp_instruction *get_fp_inst(struct brw_wm_compile *c)
+{
+   return &c->fp_instructions[c->nr_fp_insns++];
+}
+
+static struct brw_fp_instruction * emit_tex_op(struct brw_wm_compile *c,
+					     GLuint op,
+					     struct brw_fp_dst dest,
+					     GLuint tex_src_unit,
+					     GLuint tex_src_target,
+					     struct brw_fp_src src0,
+					     struct brw_fp_src src1,
+					     struct brw_fp_src src2 )
+{
+   struct brw_fp_instruction *inst = get_fp_inst(c);
+
+   inst->opcode = op;
+   inst->dst = dest;
+   inst->tex_unit = tex_src_unit;
+   inst->tex_target = tex_src_target;
+   inst->src[0] = src0;
+   inst->src[1] = src1;
+   inst->src[2] = src2;
 
+   return inst;
+}
+   
+
+static INLINE void emit_op3(struct brw_wm_compile *c,
+			    GLuint op,
+			    struct brw_fp_dst dest,
+			    struct brw_fp_src src0,
+			    struct brw_fp_src src1,
+			    struct brw_fp_src src2 )
+{
+   emit_tex_op(c, op, dest, 0, 0, src0, src1, src2);
+}
+
+
+static INLINE void emit_op2(struct brw_wm_compile *c,
+			    GLuint op,
+			    struct brw_fp_dst dest,
+			    struct brw_fp_src src0,
+			    struct brw_fp_src src1)
+{
+   emit_tex_op(c, op, dest, 0, 0, src0, src1, src_undef());
+}
+
+static INLINE void emit_op1(struct brw_wm_compile *c,
+			    GLuint op,
+			    struct brw_fp_dst dest,
+			    struct brw_fp_src src0)
+{
+   emit_tex_op(c, op, dest, 0, 0, src0, src_undef(), src_undef());
+}
+
+static INLINE void emit_op0(struct brw_wm_compile *c,
+			   GLuint op,
+			   struct brw_fp_dst dest)
+{
+   emit_tex_op(c, op, dest, 0, 0, src_undef(), src_undef(), src_undef());
+}
 
 
@@ -66,10 +364,10 @@ static const char *wm_opcode_strings[] = {
  */
 static void emit_scalar_insn(struct brw_wm_compile *c,
 			     unsigned opcode,
-			     struct brw_dst dst,
-			     struct brw_src src0,
-			     struct brw_src src1,
-			     struct brw_src src2 )
+			     struct brw_fp_dst dst,
+			     struct brw_fp_src src0,
+			     struct brw_fp_src src1,
+			     struct brw_fp_src src2 )
 {
    unsigned first_chan = ffs(dst.writemask) - 1;
    unsigned first_mask = 1 << first_chan;
@@ -77,14 +375,14 @@ static void emit_scalar_insn(struct brw_wm_compile *c,
    if (dst.writemask == 0)
       return;
 
-   emit_op( c, opcode,
-	    brw_writemask(dst, first_mask),
-	    src0, src1, src2 );
+   emit_op3( c, opcode,
+	     dst_mask(dst, first_mask),
+	     src0, src1, src2 );
 
    if (dst.writemask != first_mask) {
       emit_op1(c, TGSI_OPCODE_MOV,
-	       brw_writemask(dst, ~first_mask),
-	       src_swizzle1(brw_src(dst), first_chan));
+	       dst_mask(dst, ~first_mask),
+	       src_scalar(src_reg_from_dst(dst), first_chan));
    }
 }
 
@@ -93,11 +391,11 @@ static void emit_scalar_insn(struct brw_wm_compile *c,
  * Special instructions for interpolation and other tasks
  */
 
-static struct ureg_src get_pixel_xy( struct brw_wm_compile *c )
+static struct brw_fp_src get_pixel_xy( struct brw_wm_compile *c )
 {
-   if (src_is_undef(c->pixel_xy)) {
-      struct ureg_dst pixel_xy = get_temp(c);
-      struct ureg_src payload_r0_depth = src_reg(TGSI_FILE_PAYLOAD, PAYLOAD_DEPTH);
+   if (src_is_undef(c->fp_pixel_xy)) {
+      struct brw_fp_dst pixel_xy = get_temp(c);
+      struct brw_fp_src payload_r0_depth = src_reg(BRW_FILE_PAYLOAD, PAYLOAD_DEPTH);
       
       
       /* Emit the out calculations, and hold onto the results.  Use
@@ -105,79 +403,85 @@ static struct ureg_src get_pixel_xy( struct brw_wm_compile *c )
        */   
       /* pixel_xy.xy = PIXELXY payload[0];
        */
-      emit_op(c,
-	      WM_PIXELXY,
-	      dst_mask(pixel_xy, BRW_WRITEMASK_XY),
-	      payload_r0_depth,
-	      src_undef(),
-	      src_undef());
+      emit_op1(c,
+	       WM_PIXELXY,
+	       dst_mask(pixel_xy, BRW_WRITEMASK_XY),
+	       payload_r0_depth);
 
-      c->pixel_xy = src_reg_from_dst(pixel_xy);
+      c->fp_pixel_xy = src_reg_from_dst(pixel_xy);
    }
 
-   return c->pixel_xy;
+   return c->fp_pixel_xy;
 }
 
-static struct ureg_src get_delta_xy( struct brw_wm_compile *c )
+static struct brw_fp_src get_delta_xy( struct brw_wm_compile *c )
 {
-   if (src_is_undef(c->delta_xy)) {
-      struct ureg_dst delta_xy = get_temp(c);
-      struct ureg_src pixel_xy = get_pixel_xy(c);
-      struct ureg_src payload_r0_depth = src_reg(TGSI_FILE_PAYLOAD, PAYLOAD_DEPTH);
+   if (src_is_undef(c->fp_delta_xy)) {
+      struct brw_fp_dst delta_xy = get_temp(c);
+      struct brw_fp_src pixel_xy = get_pixel_xy(c);
+      struct brw_fp_src payload_r0_depth = src_reg(BRW_FILE_PAYLOAD, PAYLOAD_DEPTH);
       
       /* deltas.xy = DELTAXY pixel_xy, payload[0]
        */
-      emit_op(c,
+      emit_op3(c,
 	      WM_DELTAXY,
 	      dst_mask(delta_xy, BRW_WRITEMASK_XY),
 	      pixel_xy, 
 	      payload_r0_depth,
 	      src_undef());
       
-      c->delta_xy = src_reg_from_dst(delta_xy);
+      c->fp_delta_xy = src_reg_from_dst(delta_xy);
    }
 
-   return c->delta_xy;
+   return c->fp_delta_xy;
 }
 
-static struct ureg_src get_pixel_w( struct brw_wm_compile *c )
+static struct brw_fp_src get_pixel_w( struct brw_wm_compile *c )
 {
-   if (src_is_undef(c->pixel_w)) {
-      struct ureg_dst pixel_w = get_temp(c);
-      struct ureg_src deltas = get_delta_xy(c);
-      struct ureg_src interp_wpos = src_reg(TGSI_FILE_PAYLOAD, FRAG_ATTRIB_WPOS);
+   if (src_is_undef(c->fp_pixel_w)) {
+      struct brw_fp_dst pixel_w = get_temp(c);
+      struct brw_fp_src deltas = get_delta_xy(c);
+
+      /* XXX: assuming position is always first -- valid? 
+       */
+      struct brw_fp_src interp_wpos = src_reg(BRW_FILE_PAYLOAD, 0);
 
       /* deltas.xyw = DELTAS2 deltas.xy, payload.interp_wpos.x
        */
-      emit_op(c,
-	      WM_PIXELW,
-	      dst_mask(pixel_w, BRW_WRITEMASK_W),
-	      interp_wpos,
-	      deltas, 
-	      src_undef());
+      emit_op3(c,
+	       WM_PIXELW,
+	       dst_mask(pixel_w, BRW_WRITEMASK_W),
+	       interp_wpos,
+	       deltas, 
+	       src_undef());
       
 
-      c->pixel_w = src_reg_from_dst(pixel_w);
+      c->fp_pixel_w = src_reg_from_dst(pixel_w);
    }
 
-   return c->pixel_w;
+   return c->fp_pixel_w;
 }
 
+
+/***********************************************************************
+ * Emit INTERP instructions ahead of first use of each attrib.
+ */
+
 static void emit_interp( struct brw_wm_compile *c,
+			 GLuint idx,
 			 GLuint semantic,
-			 GLuint semantic_index,
 			 GLuint interp_mode )
 {
-   struct ureg_dst dst = dst_reg(TGSI_FILE_INPUT, idx);
-   struct ureg_src interp = src_reg(TGSI_FILE_PAYLOAD, idx);
-   struct ureg_src deltas = get_delta_xy(c);
+   struct brw_fp_dst dst = dst_reg(TGSI_FILE_INPUT, idx);
+   struct brw_fp_src interp = src_reg(BRW_FILE_PAYLOAD, idx);
+   struct brw_fp_src deltas = get_delta_xy(c);
 
    /* Need to use PINTERP on attributes which have been
     * multiplied by 1/W in the SF program, and LINTERP on those
     * which have not:
     */
    switch (semantic) {
-   case FRAG_ATTRIB_WPOS:
+   case TGSI_SEMANTIC_POSITION:
       /* Have to treat wpos.xy specially:
        */
       emit_op1(c,
@@ -218,7 +522,8 @@ static void emit_interp( struct brw_wm_compile *c,
       }
 
       break;
-   case FRAG_ATTRIB_FOGC:
+
+   case TGSI_SEMANTIC_FOG:
       /* Interpolate the fog coordinate */
       emit_op3(c,
 	      WM_PINTERP,
@@ -228,17 +533,17 @@ static void emit_interp( struct brw_wm_compile *c,
 	      get_pixel_w(c));
 
       emit_op1(c,
-	      TGSI_OPCODE_MOV,
-	      dst_mask(dst, BRW_WRITEMASK_YZ),
-	      brw_imm1f(0.0));
+	       TGSI_OPCODE_MOV,
+	       dst_mask(dst, BRW_WRITEMASK_YZ),
+	       src_imm1f(c, 0.0));
 
       emit_op1(c,
-	      TGSI_OPCODE_MOV,
-	      dst_mask(dst, BRW_WRITEMASK_W),
-	      brw_imm1f(1.0));
+	       TGSI_OPCODE_MOV,
+	       dst_mask(dst, BRW_WRITEMASK_W),
+	       src_imm1f(c, 1.0));
       break;
 
-   case FRAG_ATTRIB_FACE:
+   case TGSI_SEMANTIC_FACE:
       /* XXX review/test this case */
       emit_op0(c,
 	       WM_FRONTFACING,
@@ -247,15 +552,15 @@ static void emit_interp( struct brw_wm_compile *c,
       emit_op1(c,
 	      TGSI_OPCODE_MOV,
 	      dst_mask(dst, BRW_WRITEMASK_YZ),
-	      brw_imm1f(0.0));
+	       src_imm1f(c, 0.0));
 
       emit_op1(c,
 	      TGSI_OPCODE_MOV,
 	      dst_mask(dst, BRW_WRITEMASK_W),
-	      brw_imm1f(1.0));
+	       src_imm1f(c, 1.0));
       break;
 
-   case FRAG_ATTRIB_PNTC:
+   case TGSI_SEMANTIC_PSIZE:
       /* XXX review/test this case */
       emit_op3(c,
 	       WM_PINTERP,
@@ -267,12 +572,12 @@ static void emit_interp( struct brw_wm_compile *c,
       emit_op1(c,
 	      TGSI_OPCODE_MOV,
 	      dst_mask(dst, BRW_WRITEMASK_Z),
-	      brw_imm1f(c->pass_fp, 0.0f));
+	      src_imm1f(c, 0.0f));
 
       emit_op1(c,
 	      TGSI_OPCODE_MOV,
 	      dst_mask(dst, BRW_WRITEMASK_W),
-	      brw_imm1f(c->pass_fp, 1.0f));
+	      src_imm1f(c, 1.0f));
       break;
 
    default: 
@@ -310,11 +615,11 @@ static void emit_interp( struct brw_wm_compile *c,
  * Expand various instructions here to simpler forms.  
  */
 static void precalc_dst( struct brw_wm_compile *c,
-			 struct brw_dst dst,
-			 struct brw_src src0,
-			 struct brw_src src1 )
+			 struct brw_fp_dst dst,
+			 struct brw_fp_src src0,
+			 struct brw_fp_src src1 )
 {
-   if (dst.WriteMask & BRW_WRITEMASK_Y) {      
+   if (dst.writemask & BRW_WRITEMASK_Y) {      
       /* dst.y = mul src0.y, src1.y
        */
       emit_op2(c,
@@ -324,25 +629,22 @@ static void precalc_dst( struct brw_wm_compile *c,
 	       src1);
    }
 
-   if (dst.WriteMask & BRW_WRITEMASK_XZ) {
-      struct prog_instruction *swz;
-      GLuint z = GET_SWZ(src0.Swizzle, Z);
-
+   if (dst.writemask & BRW_WRITEMASK_XZ) {
       /* dst.z = mov src0.zzzz
        */
       emit_op1(c,
 	      TGSI_OPCODE_MOV,
 	      dst_mask(dst, BRW_WRITEMASK_Z),
-	      src_swizzle1(src0, Z));
+	      src_scalar(src0, Z));
 
-      /* dst.x = immf(1.0)
+      /* dst.x = imm1f(1.0)
        */
       emit_op1(c,
 	      TGSI_OPCODE_MOV,
-	      brw_saturate(dst_mask(dst, BRW_WRITEMASK_X), 0),
-	      src_immf(c, 1.0));
+	      dst_saturate(dst_mask(dst, BRW_WRITEMASK_X), 0),
+	      src_imm1f(c, 1.0));
    }
-   if (dst.WriteMask & BRW_WRITEMASK_W) {
+   if (dst.writemask & BRW_WRITEMASK_W) {
       /* dst.w = mov src1.w
        */
       emit_op1(c,
@@ -354,22 +656,22 @@ static void precalc_dst( struct brw_wm_compile *c,
 
 
 static void precalc_lit( struct brw_wm_compile *c,
-			 struct ureg_dst dst,
-			 struct ureg_src src0 )
+			 struct brw_fp_dst dst,
+			 struct brw_fp_src src0 )
 {
-   if (dst.WriteMask & BRW_WRITEMASK_XW) {
+   if (dst.writemask & BRW_WRITEMASK_XW) {
       /* dst.xw = imm(1.0f)
        */
       emit_op1(c,
 	       TGSI_OPCODE_MOV,
-	       brw_saturate(brw_writemask(dst, BRW_WRITEMASK_XW), 0),
-	       brw_imm1f(1.0f));
+	       dst_saturate(dst_mask(dst, BRW_WRITEMASK_XW), 0),
+	       src_imm1f(c, 1.0f));
    }
 
-   if (dst.WriteMask & BRW_WRITEMASK_YZ) {
+   if (dst.writemask & BRW_WRITEMASK_YZ) {
       emit_op1(c,
 	       TGSI_OPCODE_LIT,
-	       brw_writemask(dst, BRW_WRITEMASK_YZ),
+	       dst_mask(dst, BRW_WRITEMASK_YZ),
 	       src0);
    }
 }
@@ -382,41 +684,42 @@ static void precalc_lit( struct brw_wm_compile *c,
  * instruction itself.
  */
 static void precalc_tex( struct brw_wm_compile *c,
-			 struct brw_dst dst,
+			 struct brw_fp_dst dst,
+			 unsigned target,
 			 unsigned unit,
-			 struct brw_src src0 )
+			 struct brw_fp_src src0 )
 {
-   struct ureg_src coord = src_undef();
-   struct ureg_dst tmp = dst_undef();
+   struct brw_fp_src coord = src_undef();
+   struct brw_fp_dst tmp = dst_undef();
 
    assert(unit < BRW_MAX_TEX_UNIT);
 
    /* Cubemap: find longest component of coord vector and normalize
     * it.
     */
-   if (inst->TexSrcTarget == TEXTURE_CUBE_INDEX) {
-      struct ureg_src tmpsrc;
+   if (target == TGSI_TEXTURE_CUBE) {
+      struct brw_fp_src tmpsrc;
 
       tmp = get_temp(c);
-      tmpsrc = brw_src(tmpcoord)
+      tmpsrc = src_reg_from_dst(tmp);
 
       /* tmp = abs(src0) */
       emit_op1(c, 
 	       TGSI_OPCODE_MOV,
 	       tmp,
-	       brw_abs(src0));
+	       src_abs(src0));
 
       /* tmp.X = MAX(tmp.X, tmp.Y) */
       emit_op2(c, TGSI_OPCODE_MAX,
-	       brw_writemask(tmp, BRW_WRITEMASK_X),
-	       src_swizzle1(tmpsrc, X),
-	       src_swizzle1(tmpsrc, Y));
+	       dst_mask(tmp, BRW_WRITEMASK_X),
+	       src_scalar(tmpsrc, X),
+	       src_scalar(tmpsrc, Y));
 
       /* tmp.X = MAX(tmp.X, tmp.Z) */
       emit_op2(c, TGSI_OPCODE_MAX,
-	       brw_writemask(tmp, BRW_WRITEMASK_X),
+	       dst_mask(tmp, BRW_WRITEMASK_X),
 	       tmpsrc,
-	       src_swizzle1(tmpsrc, Z));
+	       src_scalar(tmpsrc, Z));
 
       /* tmp.X = 1 / tmp.X */
       emit_op1(c, TGSI_OPCODE_RCP,
@@ -427,11 +730,12 @@ static void precalc_tex( struct brw_wm_compile *c,
       emit_op2(c, TGSI_OPCODE_MUL,
 	       tmp,
 	       src0,
-	       src_swizzle1(tmpsrc, SWIZZLE_X));
+	       src_scalar(tmpsrc, X));
 
       coord = tmpsrc;
    }
-   else if (inst->TexSrcTarget == TEXTURE_RECT_INDEX) {
+   else if (target == TGSI_TEXTURE_RECT ||
+	    target == TGSI_TEXTURE_SHADOWRECT) {
       /* XXX: need a mechanism for internally generated constants.
        */
       coord = src0;
@@ -448,19 +752,18 @@ static void precalc_tex( struct brw_wm_compile *c,
    if (c->key.yuvtex_mask & (1 << unit)) {
       /* convert ycbcr to RGBA */
       GLboolean  swap_uv = c->key.yuvtex_swap_mask & (1<<unit);
-      struct ureg_dst dst = inst->DstReg;
-      struct ureg_dst tmp = get_temp(c);
-      struct ureg_src tmpsrc = src_reg_from_dst(tmp);
-      struct ureg_src C0 = ureg_imm4f( c->ureg,  -.5, -.0625, -.5, 1.164 );
-      struct ureg_src C1 = ureg_imm4f( c->ureg, 1.596, -0.813, 2.018, -.391 );
+      struct brw_fp_dst tmp = get_temp(c);
+      struct brw_fp_src tmpsrc = src_reg_from_dst(tmp);
+      struct brw_fp_src C0 = src_imm4f( c,  -.5, -.0625, -.5, 1.164 );
+      struct brw_fp_src C1 = src_imm4f( c, 1.596, -0.813, 2.018, -.391 );
      
       /* tmp     = TEX ...
        */
       emit_tex_op(c, 
                   TGSI_OPCODE_TEX,
-                  brw_saturate(tmp, dst.Saturate),
+                  dst_saturate(tmp, dst.saturate),
                   unit,
-                  inst->TexSrcTarget,
+                  target,
                   coord,
                   src_undef(),
                   src_undef());
@@ -477,7 +780,7 @@ static void precalc_tex( struct brw_wm_compile *c,
       emit_op2(c, TGSI_OPCODE_MUL,
 	       dst_mask(tmp, BRW_WRITEMASK_Y),
 	       tmpsrc,
-	       src_swizzle1(C0, W));
+	       src_scalar(C0, W));
 
       /* 
        * if (UV swaped)
@@ -492,16 +795,16 @@ static void precalc_tex( struct brw_wm_compile *c,
 		 src_swizzle(tmpsrc, Z,Z,X,X) : 
 		 src_swizzle(tmpsrc, X,X,Z,Z)),
 	       C1,
-	       src_swizzle1(tmpsrc, Y));
+	       src_scalar(tmpsrc, Y));
 
       /*  RGB.y   = MAD YUV.z, C1.w, RGB.y
        */
       emit_op3(c,
 	       TGSI_OPCODE_MAD,
 	       dst_mask(dst, BRW_WRITEMASK_Y),
-	       src_swizzle1(tmpsrc, Z),
-	       src_swizzle1(C1, W),
-	       src_swizzle1(src_reg_from_dst(dst), Y));
+	       src_scalar(tmpsrc, Z),
+	       src_scalar(C1, W),
+	       src_scalar(src_reg_from_dst(dst), Y));
 
       release_temp(c, tmp);
    }
@@ -509,9 +812,9 @@ static void precalc_tex( struct brw_wm_compile *c,
       /* ordinary RGBA tex instruction */
       emit_tex_op(c, 
                   TGSI_OPCODE_TEX,
-                  inst->DstReg,
+                  dst,
                   unit,
-                  inst->TexSrcTarget,
+                  target,
                   coord,
                   src_undef(),
                   src_undef());
@@ -523,8 +826,8 @@ static void precalc_tex( struct brw_wm_compile *c,
 
    /* Release this temp if we ended up allocating it:
     */
-   if (!brw_dst_is_undef(tmpcoord))
-      release_temp(c, tmpcoord);
+   if (!dst_is_undef(tmp))
+      release_temp(c, tmp);
 }
 
 
@@ -532,13 +835,9 @@ static void precalc_tex( struct brw_wm_compile *c,
  * Check if the given TXP instruction really needs the divide-by-W step.
  */
 static GLboolean projtex( struct brw_wm_compile *c,
-			  const struct prog_instruction *inst )
+			  unsigned target, 
+			  struct brw_fp_src src )
 {
-   const struct ureg_src src = inst->SrcReg[0];
-   GLboolean retVal;
-
-   assert(inst->Opcode == TGSI_OPCODE_TXP);
-
    /* Only try to detect the simplest cases.  Could detect (later)
     * cases where we are trying to emit code like RCP {1.0}, MUL x,
     * {1.0}, and so on.
@@ -546,16 +845,15 @@ static GLboolean projtex( struct brw_wm_compile *c,
     * More complex cases than this typically only arise from
     * user-provided fragment programs anyway:
     */
-   if (inst->TexSrcTarget == TEXTURE_CUBE_INDEX)
-      retVal = GL_FALSE;  /* ut2004 gun rendering !?! */
-   else if (src.File == TGSI_FILE_INPUT && 
-	    GET_SWZ(src.Swizzle, W) == W &&
-            (c->key.proj_attrib_mask & (1 << src.Index)) == 0)
-      retVal = GL_FALSE;
-   else
-      retVal = GL_TRUE;
-
-   return retVal;
+   if (target == TGSI_TEXTURE_CUBE)
+      return GL_FALSE;  /* ut2004 gun rendering !?! */
+   
+   if (src.file == TGSI_FILE_INPUT && 
+       GET_SWZ(src.swizzle, W) == W &&
+       (c->key.proj_attrib_mask & (1 << src.index)) == 0)
+      return GL_FALSE;
+
+   return GL_TRUE;
 }
 
 
@@ -563,110 +861,168 @@ static GLboolean projtex( struct brw_wm_compile *c,
  * Emit code for TXP.
  */
 static void precalc_txp( struct brw_wm_compile *c,
-			       const struct prog_instruction *inst )
+			 struct brw_fp_dst dst,
+			 unsigned target,
+			 unsigned unit,
+			 struct brw_fp_src src0 )
 {
-   struct ureg_src src0 = inst->SrcReg[0];
-
-   if (projtex(c, inst)) {
-      struct ureg_dst tmp = get_temp(c);
-      struct prog_instruction tmp_inst;
+   if (projtex(c, target, src0)) {
+      struct brw_fp_dst tmp = get_temp(c);
 
       /* tmp0.w = RCP inst.arg[0][3]
        */
-      emit_op(c,
+      emit_op1(c,
 	      TGSI_OPCODE_RCP,
 	      dst_mask(tmp, BRW_WRITEMASK_W),
-	      src_swizzle1(src0, GET_SWZ(src0.Swizzle, W)),
-	      src_undef(),
-	      src_undef());
+	      src_scalar(src0, W));
 
       /* tmp0.xyz =  MUL inst.arg[0], tmp0.wwww
        */
-      emit_op(c,
-	      TGSI_OPCODE_MUL,
-	      dst_mask(tmp, BRW_WRITEMASK_XYZ),
-	      src0,
-	      src_swizzle1(src_reg_from_dst(tmp), W),
-	      src_undef());
+      emit_op2(c,
+	       TGSI_OPCODE_MUL,
+	       dst_mask(tmp, BRW_WRITEMASK_XYZ),
+	       src0,
+	       src_scalar(src_reg_from_dst(tmp), W));
 
-      /* dst = precalc(TEX tmp0)
+      /* dst = TEX tmp0
        */
-      tmp_inst = *inst;
-      tmp_inst.SrcReg[0] = src_reg_from_dst(tmp);
-      precalc_tex(c, &tmp_inst);
+      precalc_tex(c, 
+		  dst,
+		  target,
+		  unit,
+		  src_reg_from_dst(tmp));
 
       release_temp(c, tmp);
    }
    else
    {
-      /* dst = precalc(TEX src0)
+      /* dst = TEX src0
        */
-      precalc_tex(c, inst);
+      precalc_tex(c, dst, target, unit, src0);
    }
 }
 
 
+/* Look up the shader output declared with (semantic, index) and return it
+ * as a *source* register (XXX: a src_reg, not a dst), for reading it back. */
+static struct brw_fp_src
+find_output_by_semantic( struct brw_wm_compile *c,
+			 unsigned semantic,
+			 unsigned index )
+{
+   const struct tgsi_shader_info *info = &c->fp->info;
+   unsigned i;
+
+   for (i = 0; i < info->num_outputs; i++)
+      if (info->output_semantic_name[i] == semantic &&
+	  info->output_semantic_index[i] == index)
+	 return src_reg( TGSI_FILE_OUTPUT, i );
+
+   /* If not found, return some arbitrary immediate value:
+    */
+   return src_imm1f(c, 1.0);
+}
+
 
 static void emit_fb_write( struct brw_wm_compile *c )
 {
-   struct ureg_src payload_r0_depth = src_reg(TGSI_FILE_PAYLOAD, PAYLOAD_DEPTH);
-   struct ureg_src outdepth = src_reg(TGSI_FILE_OUTPUT, FRAG_RESULT_DEPTH);
-   struct ureg_src outcolor;
-   struct prog_instruction *inst;
+   struct brw_fp_src payload_r0_depth = src_reg(BRW_FILE_PAYLOAD, PAYLOAD_DEPTH);
+   struct brw_fp_src outdepth = find_output_by_semantic(c, TGSI_SEMANTIC_POSITION, 0);
    GLuint i;
 
 
-   /* The inst->Aux field is used for FB write target and the EOT marker */
+   outdepth = src_scalar(outdepth, Z);
 
    for (i = 0 ; i < c->key.nr_cbufs; i++) {
-      outcolor = find_output_by_semantic(c, TGSI_SEMANTIC_COLOR, i);
+      struct brw_fp_src outcolor;
+      unsigned target = i<<1;	/* bits 1..n: render target index, bit 0: EOT */
 
-      inst = emit_op(c, WM_FB_WRITE,
-		     dst_mask(dst_undef(), 0),
-		     outcolor,
-		     payload_r0_depth,
-		     outdepth);
+      /* Set EOT flag (bit 0) on the write to the last colour buffer:
+       */
+      if (i == c->key.nr_cbufs - 1)
+	 target |= 1;
+      
+      outcolor = find_output_by_semantic(c, TGSI_SEMANTIC_COLOR, i);
 
-      inst->Aux = (i<<1);
+      /* Use emit_tex_op so that we can pass the packed target/EOT word.
+       * NOTE(review): the intent is to carry it in inst->tex_target, but
+       * the 4th argument of emit_tex_op() lands in inst->tex_unit - confirm
+       * which field the WM_FB_WRITE consumer reads. */
+      emit_tex_op(c, WM_FB_WRITE,
+		  dst_undef(),
+		  target,
+		  0,
+		  outcolor,
+		  payload_r0_depth,
+		  outdepth);
    }
- 
-   /* Set EOT flag on last inst:
-    */
-   inst->Aux |= 1; //eot
 }
 
 
+static struct brw_fp_dst translate_dst( struct brw_wm_compile *c,
+					const struct tgsi_full_dst_register *dst,
+					unsigned saturate )
+{
+   struct brw_fp_dst out;
+
+   out.file = dst->DstRegister.File;
+   out.index = dst->DstRegister.Index;
+   out.writemask = dst->DstRegister.WriteMask;
+   out.indirect = dst->DstRegister.Indirect;
+   out.saturate = (saturate == TGSI_SAT_ZERO_ONE);
+   
+   if (out.indirect) {
+      assert(dst->DstRegisterInd.File == TGSI_FILE_ADDRESS);
+      assert(dst->DstRegisterInd.Index == 0);
+   }
+   
+   return out;
+}
 
 
-/***********************************************************************
- * Emit INTERP instructions ahead of first use of each attrib.
- */
-
-static void validate_src_regs( struct brw_wm_compile *c,
-			       const struct prog_instruction *inst )
+static struct brw_fp_src translate_src( struct brw_wm_compile *c,
+					const struct tgsi_full_src_register *src )
 {
-   GLuint nr_args = brw_wm_nr_args( inst->Opcode );
-   GLuint i;
+   struct brw_fp_src out;
+
+   out.file = src->SrcRegister.File;
+   out.index = src->SrcRegister.Index;
+   out.indirect = src->SrcRegister.Indirect;
+
+   out.swizzle = ((src->SrcRegister.SwizzleX << 0) |
+		  (src->SrcRegister.SwizzleY << 2) |
+		  (src->SrcRegister.SwizzleZ << 4) |
+		  (src->SrcRegister.SwizzleW << 6));
+   
+   switch (tgsi_util_get_full_src_register_sign_mode( src, 0 )) {
+   case TGSI_UTIL_SIGN_CLEAR:
+      out.abs = 1;
+      out.negate = 0;
+      break;
 
-   for (i = 0; i < nr_args; i++) {
-      if (inst->SrcReg[i].File == TGSI_FILE_INPUT) {
-	 GLuint idx = inst->SrcReg[i].Index;
-	 if (!(c->fp_interp_emitted & (1<<idx))) {
-	    emit_interp(c, idx);
-	    c->fp_interp_emitted |= 1<<idx;
-	 }
-      }
+   case TGSI_UTIL_SIGN_SET:
+      out.abs = 1;
+      out.negate = 1;
+      break;
+
+   case TGSI_UTIL_SIGN_TOGGLE:
+      out.abs = 0;
+      out.negate = 1;
+      break;
+
+   case TGSI_UTIL_SIGN_KEEP:
+   default:
+      out.abs = 0;
+      out.negate = 0;
+      break;
    }
-}
-	 
-static void validate_dst_regs( struct brw_wm_compile *c,
-			       const struct prog_instruction *inst )
-{
-   if (inst->DstReg.File == TGSI_FILE_OUTPUT) {
-      GLuint idx = inst->DstReg.Index;
-      if (idx == FRAG_RESULT_COLOR)
-         c->fp_fragcolor_emitted |= inst->DstReg.WriteMask;
+
+   if (out.indirect) {
+      assert(src->SrcRegisterInd.File == TGSI_FILE_ADDRESS);
+      assert(src->SrcRegisterInd.Index == 0);
    }
+   
+   return out;
 }
 
 
@@ -674,59 +1030,78 @@ static void validate_dst_regs( struct brw_wm_compile *c,
 static void emit_insn( struct brw_wm_compile *c,
 		       const struct tgsi_full_instruction *inst )
 {
-
-   switch (inst->Opcode) {
+   unsigned opcode = inst->Instruction.Opcode;
+   struct brw_fp_dst dst;
+   struct brw_fp_src src[3];
+   int i;
+
+   dst = translate_dst( c, &inst->FullDstRegisters[0],
+			inst->Instruction.Saturate );
+
+   for (i = 0; i < inst->Instruction.NumSrcRegs; i++)
+      src[i] = translate_src( c, &inst->FullSrcRegisters[i] );
+   
+   switch (opcode) {
    case TGSI_OPCODE_ABS:
       emit_op1(c, TGSI_OPCODE_MOV,
 	       dst, 
-	       brw_abs(src[0]));
+	       src_abs(src[0]));
       break;
 
    case TGSI_OPCODE_SUB: 
       emit_op2(c, TGSI_OPCODE_ADD,
 	       dst,
 	       src[0],
-	       brw_negate(src[1]));
+	       src_negate(src[1]));
       break;
 
    case TGSI_OPCODE_SCS: 
       emit_op1(c, TGSI_OPCODE_SCS,
-	       brw_writemask(dst, BRW_WRITEMASK_XY),
+	       dst_mask(dst, BRW_WRITEMASK_XY),
 	       src[0]);
       break;
 	 
    case TGSI_OPCODE_DST:
-      precalc_dst(c, inst);
+      precalc_dst(c, dst, src[0], src[1]);
       break;
 
    case TGSI_OPCODE_LIT:
-      precalc_lit(c, inst);
+      precalc_lit(c, dst, src[0]);
       break;
 
    case TGSI_OPCODE_TEX:
-      precalc_tex(c, inst);
+      precalc_tex(c, dst,
+		  inst->InstructionExtTexture.Texture,
+		  src[1].index,	/* sampler unit: TGSI passes the sampler as src1 */
+		  src[0] );	/* coordinate */
       break;
 
    case TGSI_OPCODE_TXP:
-      precalc_txp(c, inst);
+      precalc_txp(c, dst,
+		  inst->InstructionExtTexture.Texture,
+		  src[1].index,	/* sampler unit: TGSI passes the sampler as src1 */
+		  src[0] );	/* coordinate */
       break;
 
    case TGSI_OPCODE_TXB:
-      out = emit_insn(c, inst);
-      out->TexSrcUnit = fp->program.Base.SamplerUnits[inst->TexSrcUnit];
-      assert(out->TexSrcUnit < BRW_MAX_TEX_UNIT);
+      /* XXX: TXB not done
+       */
+      precalc_tex(c, dst,
+		  inst->InstructionExtTexture.Texture,
+		  src[1].index,	/* sampler unit: TGSI passes the sampler as src1 */
+		  src[0] );	/* coordinate */
       break;
 
    case TGSI_OPCODE_XPD: 
       emit_op2(c, TGSI_OPCODE_XPD,
-	       brw_writemask(dst, BRW_WRITEMASK_XYZ),
+	       dst_mask(dst, BRW_WRITEMASK_XYZ),
 	       src[0], 
 	       src[1]);
       break;
 
    case TGSI_OPCODE_KIL: 
       emit_op1(c, TGSI_OPCODE_KIL,
-	       brw_writemask(dst_undef(), 0),
+	       dst_mask(dst_undef(), 0),
 	       src[0]);
       break;
 
@@ -734,10 +1109,11 @@ static void emit_insn( struct brw_wm_compile *c,
       emit_fb_write(c);
       break;
    default:
-      if (brw_wm_is_scalar_result(inst->Opcode))
+      if (!c->key.has_flow_control &&
+	  brw_wm_is_scalar_result(opcode))
 	 emit_scalar_insn(c, opcode, dst, src[0], src[1], src[2]);
       else
-	 emit_op(c, opcode, dst, src[0], src[1], src[2]);
+	 emit_op3(c, opcode, dst, src[0], src[1], src[2]);
       break;
    }
 }
@@ -746,46 +1122,70 @@ static void emit_insn( struct brw_wm_compile *c,
  * Initial pass for fragment program code generation.
  * This function is used by both the GLSL and non-GLSL paths.
  */
-void brw_wm_pass_fp( struct brw_wm_compile *c )
+int brw_wm_pass_fp( struct brw_wm_compile *c )
 {
-   struct brw_fragment_program *fp = c->fp;
-   GLuint insn;
+   struct brw_fragment_shader *fs = c->fp;
+   struct tgsi_parse_context parse;
+   struct tgsi_full_instruction *inst;
+   struct tgsi_full_declaration *decl;
+   const float *imm;
+   GLuint size;
+   GLuint i;
 
    if (BRW_DEBUG & DEBUG_WM) {
       debug_printf("pre-fp:\n");
-      tgsi_dump(fp->tokens, 0); 
+      tgsi_dump(fs->tokens, 0); 
    }
 
-   c->pixel_xy = brw_src_undef();
-   c->delta_xy = brw_src_undef();
-   c->pixel_w = brw_src_undef();
+   c->fp_pixel_xy = src_undef();
+   c->fp_delta_xy = src_undef();
+   c->fp_pixel_w = src_undef();
    c->nr_fp_insns = 0;
-   c->fp->tex_units_used = 0x0;
+   c->nr_immediates = 0;
 
 
    /* Loop over all instructions doing assorted simplifications and
     * transformations.
     */
-   tgsi_parse_init( &parse, tokens );
+   tgsi_parse_init( &parse, fs->tokens );
    while( !tgsi_parse_end_of_tokens( &parse ) ) {
       tgsi_parse_token( &parse );
 
       switch( parse.FullToken.Token.Type ) {
       case TGSI_TOKEN_TYPE_DECLARATION:
-	 /* If branching shader, emit preamble instructions at decl time, as
-	  * instruction order in the shader does not correspond to the order
-	  * instructions are executed in the wild.
-	  *
-	  * This is where special instructions such as WM_CINTERP,
-	  * WM_LINTERP, WM_PINTERP and WM_WPOSXY are emitted to compute
-	  * shader inputs from varying vars.
+	 /* Turn input declarations into special WM_* instructions.
 	  *
 	  * XXX: For non-branching shaders, consider deferring variable
 	  * initialization as late as possible to minimize register
 	  * usage.  This is how the original BRW driver worked.
+	  *
+	  * In a branching shader, must emit preamble instructions at decl
+	  * time, as instruction order in the shader does not
+	  * correspond to the order instructions are executed in the
+	  * wild.
+	  *
+	  * This is where special instructions such as WM_CINTERP,
+	  * WM_LINTERP, WM_PINTERP and WM_WPOSXY are emitted to
+	  * compute shader inputs from the payload registers and pixel
+	  * position.
 	  */
-	 validate_src_regs(c, inst);
-	 validate_dst_regs(c, inst);
+         decl = &parse.FullToken.FullDeclaration;
+         if( decl->Declaration.File == TGSI_FILE_INPUT ) {
+            unsigned first, last, mask;
+            unsigned attrib;
+
+            first = decl->DeclarationRange.First;
+            last = decl->DeclarationRange.Last;
+            mask = decl->Declaration.UsageMask;
+
+            for (attrib = first; attrib <= last; attrib++) {
+	       emit_interp(c, 
+			   attrib, 
+			   decl->Semantic.SemanticName,
+			   decl->Declaration.Interpolate );
+            }
+         }
+	 
          break;
 
       case TGSI_TOKEN_TYPE_IMMEDIATE:
@@ -795,21 +1195,36 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
 	  * float value per instruction.  Just save the data for now
 	  * and use directly later.
 	  */
+	 /* Fill slot c->nr_immediates; the count is bumped exactly once, below. */
+	 imm = &parse.FullToken.FullImmediate.u[0].Float;
+	 size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
+
+	 if (c->nr_immediates >= BRW_WM_MAX_CONST)
+	    return PIPE_ERROR_OUT_OF_MEMORY;
+
+	 for (i = 0; i < size; i++)
+	    c->immediate[c->nr_immediates].v[i] = imm[i];
+
+	 for (; i < 4; i++)
+	    c->immediate[c->nr_immediates].v[i] = 0.0;
+
+	 c->immediate[c->nr_immediates].nr = size;
+	 c->nr_immediates++;
 	 break;
 
       case TGSI_TOKEN_TYPE_INSTRUCTION:
          inst = &parse.FullToken.FullInstruction;
-	 emit_insn( c, inst );
+	 emit_insn(c, inst);
 	 break;
       }
    }
 
-   c->brw_program = brw_finalize( c->builder );
-
    if (BRW_DEBUG & DEBUG_WM) {
       debug_printf("pass_fp:\n");
-      brw_print_program( c->brw_program );
+      //brw_print_program( c->fp_brw_program );
       debug_printf("\n");
    }
+
+   return c->error;
 }
 
-- 
cgit v1.2.3


From f202a34cb1eca41cf5d12bd72016f284bc81ccf8 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Sat, 31 Oct 2009 18:23:14 +0000
Subject: i965g: non-glsl fragment shader path is compiling

Disabled glsl code for now, probably want to clean this up somehow.
---
 src/gallium/drivers/i965/Makefile       |   1 -
 src/gallium/drivers/i965/brw_wm.c       |  14 +-
 src/gallium/drivers/i965/brw_wm.h       |  10 +-
 src/gallium/drivers/i965/brw_wm_fp.c    |   7 +-
 src/gallium/drivers/i965/brw_wm_glsl.c  | 268 ++++++++++++++++++++------------
 src/gallium/drivers/i965/brw_wm_pass0.c |  87 +++++------
 src/gallium/drivers/i965/brw_wm_pass1.c |   8 +-
 src/gallium/drivers/i965/brw_wm_pass2.c |  27 +---
 8 files changed, 230 insertions(+), 192 deletions(-)

(limited to 'src/gallium/drivers/i965/brw_wm.h')

diff --git a/src/gallium/drivers/i965/Makefile b/src/gallium/drivers/i965/Makefile
index c3dbad72ae..896cb234a6 100644
--- a/src/gallium/drivers/i965/Makefile
+++ b/src/gallium/drivers/i965/Makefile
@@ -47,7 +47,6 @@ C_SOURCES = \
 	brw_wm_debug.c \
 	brw_wm_emit.c \
 	brw_wm_fp.c \
-	brw_wm_glsl.c \
 	brw_wm_iz.c \
 	brw_wm_pass0.c \
 	brw_wm_pass1.c \
diff --git a/src/gallium/drivers/i965/brw_wm.c b/src/gallium/drivers/i965/brw_wm.c
index 33602b59c1..4fbf9de9bb 100644
--- a/src/gallium/drivers/i965/brw_wm.c
+++ b/src/gallium/drivers/i965/brw_wm.c
@@ -177,7 +177,10 @@ static int do_wm_prog( struct brw_context *brw,
     */
    if (fp->has_flow_control) {
       c->dispatch_width = 8;
-      brw_wm_branching_shader_emit(brw, c);
+      /* XXX: GLSL support
+       */
+      exit(1);
+      //brw_wm_branching_shader_emit(brw, c);
    }
    else {
       c->dispatch_width = 16;
@@ -239,18 +242,9 @@ static void brw_wm_populate_key( struct brw_context *brw,
 		    brw->curr.fragment_shader->uses_depth,
 		    key);
 
-   /* Revisit this, figure out if it's really useful, and either push
-    * it into the state tracker so that everyone benefits (use to
-    * create fs varients with TEX rather than TXP), or discard.
-    */
-   key->proj_attrib_mask = ~0; /*brw->wm.input_size_masks[4-1];*/
-
    /* PIPE_NEW_RAST */
    key->flat_shade = brw->curr.rast->templ.flatshade;
 
-   /* This can be determined by looking at the INTERP mode each input decl.
-    */
-   key->linear_attrib_mask = 0;
 
    /* PIPE_NEW_BOUND_TEXTURES */
    for (i = 0; i < brw->curr.num_textures; i++) {
diff --git a/src/gallium/drivers/i965/brw_wm.h b/src/gallium/drivers/i965/brw_wm.h
index 8ee99420aa..48dac39756 100644
--- a/src/gallium/drivers/i965/brw_wm.h
+++ b/src/gallium/drivers/i965/brw_wm.h
@@ -56,9 +56,6 @@
 #define AA_ALWAYS    2
 
 struct brw_wm_prog_key {
-   unsigned proj_attrib_mask;    /**< one bit per fragment program attribute */
-   unsigned linear_attrib_mask;  /**< linear interpolation vs perspective interp */
-
    GLuint source_depth_reg:3;
    GLuint aa_dest_stencil_reg:3;
    GLuint dest_depth_reg:3;
@@ -73,6 +70,7 @@ struct brw_wm_prog_key {
    GLuint yuvtex_swap_mask:16;	/* UV swaped */
 
    GLuint vp_nr_outputs:6;
+   GLuint nr_inputs:6;
    GLuint nr_cbufs:3;
    GLuint has_flow_control:1;
 
@@ -179,6 +177,12 @@ struct brw_wm_instruction {
 #define BRW_FILE_PAYLOAD   (TGSI_FILE_COUNT)
 #define PAYLOAD_DEPTH      (PIPE_MAX_SHADER_INPUTS) /* ?? */
 
+#define X    0
+#define Y    1
+#define Z    2
+#define W    3
+#define GET_SWZ(swz, comp) (((swz) >> ((comp)*2)) & 0x3)
+
 
 struct brw_fp_src {
    unsigned file:4;
diff --git a/src/gallium/drivers/i965/brw_wm_fp.c b/src/gallium/drivers/i965/brw_wm_fp.c
index 57933afbbe..58f1d35b7d 100644
--- a/src/gallium/drivers/i965/brw_wm_fp.c
+++ b/src/gallium/drivers/i965/brw_wm_fp.c
@@ -46,11 +46,6 @@
 #include "brw_debug.h"
 
 
-#define X    0
-#define Y    1
-#define Z    2
-#define W    3
-#define GET_SWZ(swz, comp) (((swz) >> ((comp)*2)) & 0x3)
 
 
 static const char *wm_opcode_strings[] = {
@@ -850,7 +845,7 @@ static GLboolean projtex( struct brw_wm_compile *c,
    
    if (src.file == TGSI_FILE_INPUT && 
        GET_SWZ(src.swizzle, W) == W &&
-       (c->key.proj_attrib_mask & (1 << src.index)) == 0)
+       c->fp->info.input_interpolate[src.index] != TGSI_INTERPOLATE_PERSPECTIVE)
       return GL_FALSE;
 
    return GL_TRUE;
diff --git a/src/gallium/drivers/i965/brw_wm_glsl.c b/src/gallium/drivers/i965/brw_wm_glsl.c
index cdc10484a6..a06b0a446e 100644
--- a/src/gallium/drivers/i965/brw_wm_glsl.c
+++ b/src/gallium/drivers/i965/brw_wm_glsl.c
@@ -1,10 +1,13 @@
+#include "util/u_math.h"
+
+
 #include "brw_context.h"
 #include "brw_eu.h"
 #include "brw_wm.h"
 
 
 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
-                                  const struct prog_instruction *inst,
+                                  const struct brw_fp_instruction *inst,
                                   GLuint component);
 
 
@@ -63,7 +66,7 @@ alloc_grf(struct brw_wm_compile *c)
    /* really, no free GRF regs found */
    if (!c->out_of_regs) {
       /* print warning once per compilation */
-      _mesa_warning(NULL, "i965: ran out of registers for fragment program");
+      debug_printf("%s: ran out of registers for fragment program", __FUNCTION__);
       c->out_of_regs = GL_TRUE;
    }
 
@@ -154,20 +157,18 @@ get_reg(struct brw_wm_compile *c, int file, int index, int component,
 {
     struct brw_reg reg;
     switch (file) {
-	case PROGRAM_STATE_VAR:
-	case PROGRAM_CONSTANT:
-	case PROGRAM_UNIFORM:
-	    file = PROGRAM_STATE_VAR;
-	    break;
-	case PROGRAM_UNDEFINED:
+	case TGSI_FILE_NULL:
 	    return brw_null_reg();	
-	case PROGRAM_TEMPORARY:
-	case PROGRAM_INPUT:
-	case PROGRAM_OUTPUT:
-	case PROGRAM_PAYLOAD:
+
+	case TGSI_FILE_CONSTANT:
+	case TGSI_FILE_TEMPORARY:
+	case TGSI_FILE_INPUT:
+	case TGSI_FILE_OUTPUT:
+	case BRW_FILE_PAYLOAD:
 	    break;
+
 	default:
-	   debug_printf("Unexpected file in get_reg()");
+	   debug_printf("%s: Unexpected file type\n", __FUNCTION__);
 	   return brw_null_reg();
     }
 
@@ -204,6 +205,76 @@ get_reg(struct brw_wm_compile *c, int file, int index, int component,
 
 
+
+/** Find first/last instruction that references each temporary register.
+ * Entries start at -1 and are updated via update_interval() (not in this hunk).
+ * Returns GL_FALSE on CAL or a relatively-addressed temp, else GL_TRUE. */
+GLboolean
+_mesa_find_temp_intervals(const struct prog_instruction *instructions,
+                          GLuint numInstructions,
+                          GLint intBegin[MAX_PROGRAM_TEMPS],
+                          GLint intEnd[MAX_PROGRAM_TEMPS])
+{
+   struct loop_info
+   {
+      GLuint Start, End;  /**< Index of BGNLOOP and its BranchTarget; Start is never read here */
+   };
+   struct loop_info loopStack[MAX_LOOP_NESTING]; /* NOTE(review): pushes below are not checked against MAX_LOOP_NESTING */
+   GLuint loopStackDepth = 0;
+   GLuint i;
+
+   for (i = 0; i < MAX_PROGRAM_TEMPS; i++){
+      intBegin[i] = intEnd[i] = -1; /* -1: nothing recorded for this temp yet */
+   }
+
+   /* Walk the instructions in order, tracking loop nesting and temp register uses */
+   for (i = 0; i < numInstructions; i++) {
+      const struct prog_instruction *inst = instructions + i;
+      if (inst->Opcode == OPCODE_BGNLOOP) {
+         loopStack[loopStackDepth].Start = i;
+         loopStack[loopStackDepth].End = inst->BranchTarget; /* branch target is taken as the loop's end */
+         loopStackDepth++;
+      }
+      else if (inst->Opcode == OPCODE_ENDLOOP) {
+         loopStackDepth--; /* NOTE(review): assumes a matching BGNLOOP was pushed; no underflow guard */
+      }
+      else if (inst->Opcode == OPCODE_CAL) {
+         return GL_FALSE; /* give up on programs containing CAL */
+      }
+      else {
+         const GLuint numSrc = 3; /* all three SrcReg slots are inspected, whatever the opcode */
+         GLuint j;
+         for (j = 0; j < numSrc; j++) {
+            if (inst->SrcReg[j].File == PROGRAM_TEMPORARY) {
+               const GLuint index = inst->SrcReg[j].Index;
+               if (inst->SrcReg[j].RelAddr)
+                  return GL_FALSE; /* give up on relatively-addressed temp sources */
+               update_interval(intBegin, intEnd, index, i);
+               if (loopStackDepth > 0) {
+                  /* inside a loop: also record a use at the innermost loop's end */
+                  GLuint loopEnd = loopStack[loopStackDepth - 1].End;
+                  update_interval(intBegin, intEnd, index, loopEnd);
+               }
+            }
+         }
+         if (inst->DstReg.File == PROGRAM_TEMPORARY) {
+            const GLuint index = inst->DstReg.Index;
+            if (inst->DstReg.RelAddr)
+               return GL_FALSE; /* give up on relatively-addressed temp destinations */
+            update_interval(intBegin, intEnd, index, i);
+            if (loopStackDepth > 0) {
+               /* inside a loop: also record a use at the innermost loop's end */
+               GLuint loopEnd = loopStack[loopStackDepth - 1].End;
+               update_interval(intBegin, intEnd, index, loopEnd);
+            }
+         }
+      }
+   }
+
+   return GL_TRUE;
+}
+
+
 /**
  * This is called if we run out of GRF registers.  Examine the live intervals
  * of temp regs in the program and free those which won't be used again.
@@ -211,29 +282,29 @@ get_reg(struct brw_wm_compile *c, int file, int index, int component,
 static void
 reclaim_temps(struct brw_wm_compile *c)
 {
-   GLint intBegin[MAX_PROGRAM_TEMPS];
-   GLint intEnd[MAX_PROGRAM_TEMPS];
+   GLint intBegin[BRW_WM_MAX_TEMPS];
+   GLint intEnd[BRW_WM_MAX_TEMPS];
    int index;
 
    /*printf("Reclaim temps:\n");*/
 
-   _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
+   _mesa_find_temp_intervals(c->fp_instructions, c->nr_fp_insns,
                              intBegin, intEnd);
 
-   for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
+   for (index = 0; index < BRW_WM_MAX_TEMPS; index++) {
       if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
          /* program temp[i] can be freed */
          int component;
          /*printf("  temp[%d] is dead\n", index);*/
          for (component = 0; component < 4; component++) {
-            if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
-               int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
+            if (c->wm_regs[TGSI_FILE_TEMPORARY][index][component].inited) {
+               int r = c->wm_regs[TGSI_FILE_TEMPORARY][index][component].reg.nr;
                release_grf(c, r);
                /*
                printf("  Reclaim temp %d, reg %d at inst %d\n",
                       index, r, c->cur_inst);
                */
-               c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
+               c->wm_regs[TGSI_FILE_TEMPORARY][index][component].inited = GL_FALSE;
             }
          }
       }
@@ -264,7 +335,7 @@ static void prealloc_reg(struct brw_wm_compile *c)
             reg = brw_vec8_grf(i * 2, 0);
         else
             reg = brw_vec8_grf(0, 0);
-	set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
+	set_reg(c, TGSI_FILE_PAYLOAD, PAYLOAD_DEPTH, i, reg);
     }
     reg_index += 2 * c->key.nr_depth_regs;
 
@@ -306,7 +377,7 @@ static void prealloc_reg(struct brw_wm_compile *c)
                   * Constants will be copied in prepare_constant_buffer()
                   */
                  c->prog_data.param[index] = &plist->ParameterValues[i][j];
-                 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
+                 set_reg(c, TGSI_FILE_STATE_VAR, i, j, reg);
               }
            }
            /* number of constant regs used (each reg is float[8]) */
@@ -330,7 +401,7 @@ static void prealloc_reg(struct brw_wm_compile *c)
 	  urb_read_length = reg_index;
 	  reg = brw_vec8_grf(reg_index, 0);
 	  for (j = 0; j < 4; j++)
-	     set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
+	     set_reg(c, TGSI_FILE_PAYLOAD, fp_input, j, reg);
        }
        if (c->key.nr_vp_outputs > i) {
 	  reg_index += 2;
@@ -354,7 +425,7 @@ static void prealloc_reg(struct brw_wm_compile *c)
     prealloc_grf(c, 127);
 
     for (i = 0; i < c->nr_fp_insns; i++) {
-	const struct prog_instruction *inst = &c->prog_instructions[i];
+	const struct brw_fp_instruction *inst = &c->fp_instructions[i];
 	struct brw_reg dst[4];
 
 	switch (inst->Opcode) {
@@ -397,7 +468,7 @@ static void prealloc_reg(struct brw_wm_compile *c)
  * the three GRF slots.
  */
 static void fetch_constants(struct brw_wm_compile *c,
-                            const struct prog_instruction *inst)
+                            const struct brw_fp_instruction *inst)
 {
    struct brw_compile *p = &c->func;
    GLuint i;
@@ -405,9 +476,8 @@ static void fetch_constants(struct brw_wm_compile *c,
    /* loop over instruction src regs */
    for (i = 0; i < 3; i++) {
       const struct prog_src_register *src = &inst->SrcReg[i];
-      if (src->File == PROGRAM_STATE_VAR ||
-          src->File == PROGRAM_CONSTANT ||
-          src->File == PROGRAM_UNIFORM) {
+      if (src->File == TGSI_FILE_IMMEDIATE ||
+          src->File == TGSI_FILE_CONSTANT) {
 	 c->current_const[i].index = src->Index;
 
 #if 0
@@ -431,7 +501,7 @@ static void fetch_constants(struct brw_wm_compile *c,
  * Convert Mesa dst register to brw register.
  */
 static struct brw_reg get_dst_reg(struct brw_wm_compile *c, 
-                                  const struct prog_instruction *inst,
+                                  const struct brw_fp_instruction *inst,
                                   GLuint component)
 {
     const int nr = 1;
@@ -442,7 +512,7 @@ static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
 
 static struct brw_reg
 get_src_reg_const(struct brw_wm_compile *c,
-                  const struct prog_instruction *inst,
+                  const struct brw_fp_instruction *inst,
                   GLuint srcRegIndex, GLuint component)
 {
    /* We should have already fetched the constant from the constant
@@ -462,7 +532,7 @@ get_src_reg_const(struct brw_wm_compile *c,
    const_reg = stride(const_reg, 0, 1, 0);
    const_reg.subnr = component * 4;
 
-   if (src->Negate & (1 << component))
+   if (src->Negate)
       const_reg = negate(const_reg);
    if (src->Abs)
       const_reg = brw_abs(const_reg);
@@ -483,7 +553,7 @@ get_src_reg_const(struct brw_wm_compile *c,
  * Convert Mesa src register to brw register.
  */
 static struct brw_reg get_src_reg(struct brw_wm_compile *c, 
-                                  const struct prog_instruction *inst,
+                                  const struct brw_fp_instruction *inst,
                                   GLuint srcRegIndex, GLuint channel)
 {
     const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
@@ -499,9 +569,9 @@ static struct brw_reg get_src_reg(struct brw_wm_compile *c,
     }
 
     if (c->fp->use_const_buffer &&
-        (src->File == PROGRAM_STATE_VAR ||
-         src->File == PROGRAM_CONSTANT ||
-         src->File == PROGRAM_UNIFORM)) {
+        (src->File == TGSI_FILE_STATE_VAR ||
+         src->File == TGSI_FILE_CONSTANT ||
+         src->File == TGSI_FILE_UNIFORM)) {
        return get_src_reg_const(c, inst, srcRegIndex, component);
     }
     else {
@@ -513,26 +583,26 @@ static struct brw_reg get_src_reg(struct brw_wm_compile *c,
 
 
 /**
- * Same as \sa get_src_reg() but if the register is a literal, emit
- * a brw_reg encoding the literal.
- * Note that a brw instruction only allows one src operand to be a literal.
+ * Same as \sa get_src_reg() but if the register is an immediate, emit
+ * a brw_reg encoding the immediate.
+ * Note that a brw instruction only allows one src operand to be an immediate.
  * For instructions with more than one operand, only the second can be a
- * literal.  This means that we treat some literals as constants/uniforms
- * (which why PROGRAM_CONSTANT is checked in fetch_constants()).
+ * immediate.  This means that we treat some immediates as constants
+ * (which is why TGSI_FILE_IMMEDIATE is checked in fetch_constants()).
  * 
  */
 static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c, 
-                                      const struct prog_instruction *inst,
+                                      const struct brw_fp_instruction *inst,
                                       GLuint srcRegIndex, GLuint channel)
 {
     const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
-    if (src->File == PROGRAM_CONSTANT) {
-       /* a literal */
+    if (src->File == TGSI_FILE_IMMEDIATE) {
+       /* an immediate */
        const int component = GET_SWZ(src->Swizzle, channel);
        const GLfloat *param =
           c->fp->program.Base.Parameters->ParameterValues[src->Index];
        GLfloat value = param[component];
-       if (src->Negate & (1 << channel))
+       if (src->Negate)
           value = -value;
        if (src->Abs)
           value = FABSF(value);
@@ -612,7 +682,7 @@ static void invoke_subroutine( struct brw_wm_compile *c,
 }
 
 static void emit_trunc( struct brw_wm_compile *c,
-                        const struct prog_instruction *inst)
+                        const struct brw_fp_instruction *inst)
 {
     int i;
     struct brw_compile *p = &c->func;
@@ -630,7 +700,7 @@ static void emit_trunc( struct brw_wm_compile *c,
 }
 
 static void emit_mov( struct brw_wm_compile *c,
-                      const struct prog_instruction *inst)
+                      const struct brw_fp_instruction *inst)
 {
     int i;
     struct brw_compile *p = &c->func;
@@ -650,7 +720,7 @@ static void emit_mov( struct brw_wm_compile *c,
 }
 
 static void emit_pixel_xy(struct brw_wm_compile *c,
-                          const struct prog_instruction *inst)
+                          const struct brw_fp_instruction *inst)
 {
     struct brw_reg r1 = brw_vec1_grf(1, 0);
     struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
@@ -680,7 +750,7 @@ static void emit_pixel_xy(struct brw_wm_compile *c,
 }
 
 static void emit_delta_xy(struct brw_wm_compile *c,
-                          const struct prog_instruction *inst)
+                          const struct brw_fp_instruction *inst)
 {
     struct brw_reg r1 = brw_vec1_grf(1, 0);
     struct brw_reg dst0, dst1, src0, src1;
@@ -740,7 +810,7 @@ static void fire_fb_write( struct brw_wm_compile *c,
 }
 
 static void emit_fb_write(struct brw_wm_compile *c,
-                          const struct prog_instruction *inst)
+                          const struct brw_fp_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     int nr = 2;
@@ -808,7 +878,7 @@ static void emit_fb_write(struct brw_wm_compile *c,
 }
 
 static void emit_pixel_w( struct brw_wm_compile *c,
-                          const struct prog_instruction *inst)
+                          const struct brw_fp_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -838,7 +908,7 @@ static void emit_pixel_w( struct brw_wm_compile *c,
 }
 
 static void emit_linterp(struct brw_wm_compile *c,
-                         const struct prog_instruction *inst)
+                         const struct brw_fp_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -867,7 +937,7 @@ static void emit_linterp(struct brw_wm_compile *c,
 }
 
 static void emit_cinterp(struct brw_wm_compile *c,
-                         const struct prog_instruction *inst)
+                         const struct brw_fp_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -893,7 +963,7 @@ static void emit_cinterp(struct brw_wm_compile *c,
 }
 
 static void emit_pinterp(struct brw_wm_compile *c,
-                         const struct prog_instruction *inst)
+                         const struct brw_fp_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -927,7 +997,7 @@ static void emit_pinterp(struct brw_wm_compile *c,
 
 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
 static void emit_frontfacing(struct brw_wm_compile *c,
-			     const struct prog_instruction *inst)
+			     const struct brw_fp_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
@@ -956,7 +1026,7 @@ static void emit_frontfacing(struct brw_wm_compile *c,
 }
 
 static void emit_xpd(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
+                     const struct brw_fp_instruction *inst)
 {
     int i;
     struct brw_compile *p = &c->func;
@@ -981,13 +1051,13 @@ static void emit_xpd(struct brw_wm_compile *c,
 }
 
 static void emit_dp3(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
+                     const struct brw_fp_instruction *inst)
 {
     struct brw_reg src0[3], src1[3], dst;
     int i;
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
-    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+    int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
 
     if (!(mask & WRITEMASK_XYZW))
 	return;
@@ -1008,13 +1078,13 @@ static void emit_dp3(struct brw_wm_compile *c,
 }
 
 static void emit_dp4(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
+                     const struct brw_fp_instruction *inst)
 {
     struct brw_reg src0[4], src1[4], dst;
     int i;
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
-    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+    int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
 
     if (!(mask & WRITEMASK_XYZW))
 	return;
@@ -1035,13 +1105,13 @@ static void emit_dp4(struct brw_wm_compile *c,
 }
 
 static void emit_dph(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
+                     const struct brw_fp_instruction *inst)
 {
     struct brw_reg src0[4], src1[4], dst;
     int i;
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
-    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+    int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
 
     if (!(mask & WRITEMASK_XYZW))
 	return;
@@ -1067,12 +1137,12 @@ static void emit_dph(struct brw_wm_compile *c,
  * register's X, Y, Z and W channels (subject to writemasking of course).
  */
 static void emit_math1(struct brw_wm_compile *c,
-                       const struct prog_instruction *inst, GLuint func)
+                       const struct brw_fp_instruction *inst, GLuint func)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, dst;
     GLuint mask = inst->DstReg.WriteMask;
-    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+    int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
 
     if (!(mask & WRITEMASK_XYZW))
 	return;
@@ -1095,43 +1165,43 @@ static void emit_math1(struct brw_wm_compile *c,
 }
 
 static void emit_rcp(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
+                     const struct brw_fp_instruction *inst)
 {
     emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
 }
 
 static void emit_rsq(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
+                     const struct brw_fp_instruction *inst)
 {
     emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
 }
 
 static void emit_sin(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
+                     const struct brw_fp_instruction *inst)
 {
     emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
 }
 
 static void emit_cos(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
+                     const struct brw_fp_instruction *inst)
 {
     emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
 }
 
 static void emit_ex2(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
+                     const struct brw_fp_instruction *inst)
 {
     emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
 }
 
 static void emit_lg2(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
+                     const struct brw_fp_instruction *inst)
 {
     emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
 }
 
 static void emit_add(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
+                     const struct brw_fp_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, src1, dst;
@@ -1150,7 +1220,7 @@ static void emit_add(struct brw_wm_compile *c,
 }
 
 static void emit_arl(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
+                     const struct brw_fp_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, addr_reg;
@@ -1164,7 +1234,7 @@ static void emit_arl(struct brw_wm_compile *c,
 
 
 static void emit_mul(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
+                     const struct brw_fp_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, src1, dst;
@@ -1183,7 +1253,7 @@ static void emit_mul(struct brw_wm_compile *c,
 }
 
 static void emit_frc(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
+                     const struct brw_fp_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, dst;
@@ -1202,7 +1272,7 @@ static void emit_frc(struct brw_wm_compile *c,
 }
 
 static void emit_flr(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
+                     const struct brw_fp_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, dst;
@@ -1221,7 +1291,7 @@ static void emit_flr(struct brw_wm_compile *c,
 
 
 static void emit_min_max(struct brw_wm_compile *c,
-                         const struct prog_instruction *inst)
+                         const struct brw_fp_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     const GLuint mask = inst->DstReg.WriteMask;
@@ -1269,12 +1339,12 @@ static void emit_min_max(struct brw_wm_compile *c,
 }
 
 static void emit_pow(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
+                     const struct brw_fp_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg dst, src0, src1;
     GLuint mask = inst->DstReg.WriteMask;
-    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+    int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
 
     if (!(mask & WRITEMASK_XYZW))
 	return;
@@ -1299,7 +1369,7 @@ static void emit_pow(struct brw_wm_compile *c,
 }
 
 static void emit_lrp(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
+                     const struct brw_fp_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -1352,7 +1422,7 @@ static void emit_kil(struct brw_wm_compile *c)
 }
 
 static void emit_mad(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
+                     const struct brw_fp_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -1375,7 +1445,7 @@ static void emit_mad(struct brw_wm_compile *c,
 }
 
 static void emit_sop(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst, GLuint cond)
+                     const struct brw_fp_instruction *inst, GLuint cond)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -1399,37 +1469,37 @@ static void emit_sop(struct brw_wm_compile *c,
 }
 
 static void emit_slt(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
+                     const struct brw_fp_instruction *inst)
 {
     emit_sop(c, inst, BRW_CONDITIONAL_L);
 }
 
 static void emit_sle(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
+                     const struct brw_fp_instruction *inst)
 {
     emit_sop(c, inst, BRW_CONDITIONAL_LE);
 }
 
 static void emit_sgt(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
+                     const struct brw_fp_instruction *inst)
 {
     emit_sop(c, inst, BRW_CONDITIONAL_G);
 }
 
 static void emit_sge(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
+                     const struct brw_fp_instruction *inst)
 {
     emit_sop(c, inst, BRW_CONDITIONAL_GE);
 }
 
 static void emit_seq(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
+                     const struct brw_fp_instruction *inst)
 {
     emit_sop(c, inst, BRW_CONDITIONAL_EQ);
 }
 
 static void emit_sne(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
+                     const struct brw_fp_instruction *inst)
 {
     emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
 }
@@ -1459,7 +1529,7 @@ static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
 
     
 static void emit_wpos_xy(struct brw_wm_compile *c,
-                         const struct prog_instruction *inst)
+                         const struct brw_fp_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -1494,25 +1564,25 @@ static void emit_wpos_xy(struct brw_wm_compile *c,
    BIAS on SIMD8 not working yet...
  */	
 static void emit_txb(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
+                     const struct brw_fp_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg dst[4], src[4], payload_reg;
-    /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
-    const GLuint unit = inst->TexSrcUnit;
+    /* Note: tex_unit was already looked up through SamplerTextures[] */
+    const GLuint unit = inst->tex_unit;
     GLuint i;
     GLuint msg_type;
 
     assert(unit < BRW_MAX_TEX_UNIT);
 
-    payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
+    payload_reg = get_reg(c, TGSI_FILE_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
 
     for (i = 0; i < 4; i++) 
 	dst[i] = get_dst_reg(c, inst, i);
     for (i = 0; i < 4; i++)
 	src[i] = get_src_reg(c, inst, 0, i);
 
-    switch (inst->TexSrcTarget) {
+    switch (inst->tex_target) {
 	case TEXTURE_1D_INDEX:
 	    brw_MOV(p, brw_message_reg(2), src[0]);         /* s coord */
 	    brw_MOV(p, brw_message_reg(3), brw_imm_f(0));   /* t coord */
@@ -1561,12 +1631,12 @@ static void emit_txb(struct brw_wm_compile *c,
 
 
 static void emit_tex(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
+                     const struct brw_fp_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg dst[4], src[4], payload_reg;
-    /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
-    const GLuint unit = inst->TexSrcUnit;
+    /* Note: tex_unit was already looked up through SamplerTextures[] */
+    const GLuint unit = inst->tex_unit;
     GLuint msg_len;
     GLuint i, nr;
     GLuint emit;
@@ -1575,14 +1645,14 @@ static void emit_tex(struct brw_wm_compile *c,
 
     assert(unit < BRW_MAX_TEX_UNIT);
 
-    payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
+    payload_reg = get_reg(c, TGSI_FILE_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
 
     for (i = 0; i < 4; i++) 
 	dst[i] = get_dst_reg(c, inst, i);
     for (i = 0; i < 4; i++)
 	src[i] = get_src_reg(c, inst, 0, i);
 
-    switch (inst->TexSrcTarget) {
+    switch (inst->tex_target) {
 	case TEXTURE_1D_INDEX:
 	    emit = WRITEMASK_X;
 	    nr = 1;
@@ -1657,7 +1727,7 @@ static void post_wm_emit( struct brw_wm_compile *c )
 
 static void
 get_argument_regs(struct brw_wm_compile *c,
-		  const struct prog_instruction *inst,
+		  const struct brw_fp_instruction *inst,
 		  int index,
 		  struct brw_reg *regs,
 		  int mask)
@@ -1686,7 +1756,7 @@ static void brw_wm_emit_branching_shader(struct brw_context *brw, struct brw_wm_
     brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
 
     for (i = 0; i < c->nr_fp_insns; i++) {
-        const struct prog_instruction *inst = &c->prog_instructions[i];
+        const struct brw_fp_instruction *inst = &c->fp_instructions[i];
 	int dst_flags;
 	struct brw_reg args[3][4], dst[4];
 	int j;
diff --git a/src/gallium/drivers/i965/brw_wm_pass0.c b/src/gallium/drivers/i965/brw_wm_pass0.c
index d8b9028927..7b18335dec 100644
--- a/src/gallium/drivers/i965/brw_wm_pass0.c
+++ b/src/gallium/drivers/i965/brw_wm_pass0.c
@@ -28,9 +28,10 @@
   * Authors:
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
-                 
 
-#include "brw_context.h"
+#include "util/u_memory.h"
+
+#include "brw_debug.h"
 #include "brw_wm.h"
 
 
@@ -133,19 +134,19 @@ static const struct brw_wm_ref *get_imm_ref( struct brw_wm_compile *c,
    /* Search for an existing const value matching the request:
     */
    for (i = 0; i < c->nr_imm_refs; i++) {
-      if (c->imm_ref[i].imm_val == *imm1f) 
+      if (c->imm_ref[i].imm1f == *imm1f) 
 	 return c->imm_ref[i].ref;
    }
 
    /* Else try to add a new one:
     */
-   if (c->nr_imm_refs < BRW_WM_MAX_IMM) {
+   if (c->nr_imm_refs < Elements(c->imm_ref)) {
       GLuint i = c->nr_imm_refs++;
 
       /* An immediate is a special type of parameter:
        */
-      c->imm_ref[i].imm_val = *imm_val;
-      c->imm_ref[i].ref = get_param_ref(c, imm_val);
+      c->imm_ref[i].imm1f = *imm1f;
+      c->imm_ref[i].ref = get_param_ref(c, imm1f);
 
       return c->imm_ref[i].ref;
    }
@@ -180,7 +181,7 @@ static const struct brw_wm_ref *pass0_get_reg( struct brw_wm_compile *c,
 	 break;
 
       case TGSI_FILE_IMMEDIATE:
-	 ref = get_imm_ref(c, &plist->ParameterValues[idx][component]);
+	 ref = get_imm_ref(c, &c->immediate[idx].v[component]);
 	 break;
 
       default:
@@ -205,16 +206,16 @@ static const struct brw_wm_ref *pass0_get_reg( struct brw_wm_compile *c,
 
 static void pass0_set_dst( struct brw_wm_compile *c,
 			   struct brw_wm_instruction *out,
-			   const struct prog_instruction *inst,
+			   const struct brw_fp_instruction *inst,
 			   GLuint writemask )
 {
-   const struct prog_dst_register *dst = &inst->DstReg;
+   const struct brw_fp_dst dst = inst->dst;
    GLuint i;
 
    for (i = 0; i < 4; i++) {
       if (writemask & (1<<i)) {
 	 out->dst[i] = get_value(c);
-	 pass0_set_fpreg_value(c, dst->File, dst->Index, i, out->dst[i]);
+	 pass0_set_fpreg_value(c, dst.file, dst.index, i, out->dst[i]);
       }
    }
 
@@ -223,27 +224,15 @@ static void pass0_set_dst( struct brw_wm_compile *c,
 
 
 static const struct brw_wm_ref *get_fp_src_reg_ref( struct brw_wm_compile *c,
-						    struct prog_src_register src,
+						    struct brw_fp_src src,
 						    GLuint i )
 {
-   GLuint component = GET_SWZ(src.Swizzle,i);
-   const struct brw_wm_ref *src_ref;
-   static const GLfloat const_zero = 0.0;
-   static const GLfloat const_one = 1.0;
-
-   if (component == SWIZZLE_ZERO) 
-      src_ref = get_imm_ref(c, &const_zero);
-   else if (component == SWIZZLE_ONE) 
-      src_ref = get_imm_ref(c, &const_one);
-   else 
-      src_ref = pass0_get_reg(c, src.File, src.Index, component);
-
-   return src_ref;
+   return pass0_get_reg(c, src.file, src.index, GET_SWZ(src.swizzle,i));
 }
 
 
 static struct brw_wm_ref *get_new_ref( struct brw_wm_compile *c,
-				       struct prog_src_register src,
+				       struct brw_fp_src src,
 				       GLuint i,
 				       struct brw_wm_instruction *insn)
 {
@@ -259,10 +248,10 @@ static struct brw_wm_ref *get_new_ref( struct brw_wm_compile *c,
       newref->value->lastuse = newref;
    }
 
-   if (src.Negate & (1 << i))
+   if (src.negate)
       newref->hw_reg.negate ^= 1;
 
-   if (src.Abs) {
+   if (src.abs) {
       newref->hw_reg.negate = 0;
       newref->hw_reg.abs = 1;
    }
@@ -273,21 +262,21 @@ static struct brw_wm_ref *get_new_ref( struct brw_wm_compile *c,
 
 static void
 translate_insn(struct brw_wm_compile *c,
-               const struct prog_instruction *inst)
+               const struct brw_fp_instruction *inst)
 {
    struct brw_wm_instruction *out = get_instruction(c);
-   GLuint writemask = inst->dst.WriteMask;
-   GLuint nr_args = brw_wm_nr_args(inst->Opcode);
+   GLuint writemask = inst->dst.writemask;
+   GLuint nr_args = brw_wm_nr_args(inst->opcode);
    GLuint i, j;
 
    /* Copy some data out of the instruction
     */
-   out->opcode = inst->Opcode;
-   out->saturate = inst->dst.Saturate;
-   out->tex_unit = inst->TexSrcUnit;
-   out->tex_target = inst->TexSrcTarget;
-   out->eot = inst->Aux & 1;
-   out->target = inst->Aux >> 1;
+   out->opcode = inst->opcode;
+   out->saturate = inst->dst.saturate;
+   out->tex_unit = inst->tex_unit;
+   out->tex_target = inst->tex_target;
+   out->eot = inst->eot; //inst->Aux & 1;
+   out->target = inst->target; //inst->Aux >> 1;
 
    /* Args:
     */
@@ -308,10 +297,10 @@ translate_insn(struct brw_wm_compile *c,
  * Optimize moves and swizzles away:
  */ 
 static void pass0_precalc_mov( struct brw_wm_compile *c,
-			       const struct prog_instruction *inst )
+			       const struct brw_fp_instruction *inst )
 {
-   const struct prog_dst_register *dst = &inst->DstReg;
-   GLuint writemask = inst->DstReg.WriteMask;
+   const struct brw_fp_dst dst = inst->dst;
+   GLuint writemask = dst.writemask;
    struct brw_wm_ref *refs[4];
    GLuint i;
 
@@ -323,11 +312,11 @@ static void pass0_precalc_mov( struct brw_wm_compile *c,
     * one loop and the above case was incorrectly handled.
     */
    for (i = 0; i < 4; i++) {
-      refs[i] = get_new_ref(c, inst->SrcReg[0], i, NULL);
+      refs[i] = get_new_ref(c, inst->src[0], i, NULL);
    }
    for (i = 0; i < 4; i++) {
       if (writemask & (1 << i)) {	    
-         pass0_set_fpreg_ref( c, dst->File, dst->Index, i, refs[i]);
+         pass0_set_fpreg_ref( c, dst.file, dst.index, i, refs[i]);
       }
    }
 }
@@ -341,12 +330,12 @@ static void pass0_init_payload( struct brw_wm_compile *c )
 
    for (i = 0; i < 4; i++) {
       GLuint j = i >= c->key.nr_depth_regs ? 0 : i;
-      pass0_set_fpreg_value( c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, 
+      pass0_set_fpreg_value( c, BRW_FILE_PAYLOAD, PAYLOAD_DEPTH, i, 
 			     &c->payload.depth[j] );
    }
 
-   for (i = 0; i < FRAG_ATTRIB_MAX; i++)
-      pass0_set_fpreg_value( c, PROGRAM_PAYLOAD, i, 0, 
+   for (i = 0; i < c->key.nr_inputs; i++)
+      pass0_set_fpreg_value( c, BRW_FILE_PAYLOAD, i, 0, 
 			     &c->payload.input_interp[i] );      
 }
 
@@ -360,7 +349,7 @@ static void pass0_init_payload( struct brw_wm_compile *c )
  *
  * Translate away swizzling and eliminate non-saturating moves.
  *
- * Translate instructions from Mesa's prog_instruction structs to our
+ * Translate instructions from our fp_instruction structs to our
  * internal brw_wm_instruction representation.
  */
 void brw_wm_pass0( struct brw_wm_compile *c )
@@ -374,13 +363,13 @@ void brw_wm_pass0( struct brw_wm_compile *c )
    pass0_init_payload(c);
 
    for (insn = 0; insn < c->nr_fp_insns; insn++) {
-      const struct prog_instruction *inst = &c->prog_instructions[insn];
+      const struct brw_fp_instruction *inst = &c->fp_instructions[insn];
 
       /* Optimize away moves, otherwise emit translated instruction:
        */      
-      switch (inst->Opcode) {
-      case OPCODE_MOV: 
-	 if (!inst->dst.Saturate) {
+      switch (inst->opcode) {
+      case TGSI_OPCODE_MOV: 
+	 if (!inst->dst.saturate) {
 	    pass0_precalc_mov(c, inst);
 	 }
 	 else {
diff --git a/src/gallium/drivers/i965/brw_wm_pass1.c b/src/gallium/drivers/i965/brw_wm_pass1.c
index b0356b1bd5..09ad2b8f5b 100644
--- a/src/gallium/drivers/i965/brw_wm_pass1.c
+++ b/src/gallium/drivers/i965/brw_wm_pass1.c
@@ -30,8 +30,8 @@
   */
                   
 
-#include "brw_context.h"
 #include "brw_wm.h"
+#include "brw_debug.h"
 
 
 static GLuint get_tracked_mask(struct brw_wm_compile *c,
@@ -223,11 +223,11 @@ void brw_wm_pass1( struct brw_wm_compile *c )
 
       case TGSI_OPCODE_TEX:
       case TGSI_OPCODE_TXP:
-	 read0 = get_texcoord_mask(inst->tex_idx);
+	 read0 = get_texcoord_mask(inst->tex_target);
 	 break;
 
       case TGSI_OPCODE_TXB:
-	 read0 = get_texcoord_mask(inst->tex_idx) | BRW_WRITEMASK_W;
+	 read0 = get_texcoord_mask(inst->tex_target) | BRW_WRITEMASK_W;
 	 break;
 
       case WM_WPOSXY:
@@ -276,7 +276,7 @@ void brw_wm_pass1( struct brw_wm_compile *c )
 
       case TGSI_OPCODE_DST:
       case WM_FRONTFACING:
-      case TGSI_OPCODE_KIL_NV:
+      case TGSI_OPCODE_KILP:
       default:
 	 break;
       }
diff --git a/src/gallium/drivers/i965/brw_wm_pass2.c b/src/gallium/drivers/i965/brw_wm_pass2.c
index a19ca62328..d3d678a5e6 100644
--- a/src/gallium/drivers/i965/brw_wm_pass2.c
+++ b/src/gallium/drivers/i965/brw_wm_pass2.c
@@ -30,7 +30,7 @@
   */
                    
 
-#include "brw_context.h"
+#include "brw_debug.h"
 #include "brw_wm.h"
 
 
@@ -82,27 +82,14 @@ static void init_registers( struct brw_wm_compile *c )
    for (j = 0; j < c->nr_creg; j++) 
       prealloc_reg(c, &c->creg[j], i++);
 
-   for (j = 0; j < FRAG_ATTRIB_MAX; j++) {
-      if (c->key.vp_outputs_written & (1<<j)) {
-	 int fp_index;
-
-	 if (j >= VERT_RESULT_VAR0)
-	    fp_index = j - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
-	 else if (j <= VERT_RESULT_TEX7)
-	    fp_index = j;
-	 else
-	    fp_index = -1;
-
-	 nr_interp_regs++;
-	 if (fp_index >= 0)
-	    prealloc_reg(c, &c->payload.input_interp[fp_index], i++);
-      }
+   for (j = 0; j < c->key.vp_nr_outputs; j++) {
+      prealloc_reg(c, &c->payload.input_interp[j], i++);
    }
 
    assert(nr_interp_regs >= 1);
 
    c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
-   c->prog_data.urb_read_length = nr_interp_regs * 2;
+   c->prog_data.urb_read_length = c->key.vp_nr_outputs * 2;
    c->prog_data.curb_read_length = c->nr_creg * 2;
 
    c->max_wm_grf = i * 2;
@@ -308,9 +295,9 @@ void brw_wm_pass2( struct brw_wm_compile *c )
       /* Allocate registers to hold results:
        */
       switch (inst->opcode) {
-      case OPCODE_TEX:
-      case OPCODE_TXB:
-      case OPCODE_TXP:
+      case TGSI_OPCODE_TEX:
+      case TGSI_OPCODE_TXB:
+      case TGSI_OPCODE_TXP:
 	 alloc_contiguous_dest(c, inst->dst, 4, insn);
 	 break;
 
-- 
cgit v1.2.3


From 212fb8adbd0e5e28a5d20b0cc03cde46df2831f4 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Fri, 6 Nov 2009 10:24:19 +0000
Subject: i965g: don't set up vs stack register for non-branching shaders

---
 src/gallium/drivers/i965/brw_context.h     |  2 ++
 src/gallium/drivers/i965/brw_pipe_shader.c | 20 ++++++++++----------
 src/gallium/drivers/i965/brw_vs_emit.c     | 11 ++++++++---
 src/gallium/drivers/i965/brw_wm.c          |  3 ---
 src/gallium/drivers/i965/brw_wm.h          |  1 -
 5 files changed, 20 insertions(+), 17 deletions(-)

(limited to 'src/gallium/drivers/i965/brw_wm.h')

diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h
index 34799d5211..b81dff0aa0 100644
--- a/src/gallium/drivers/i965/brw_context.h
+++ b/src/gallium/drivers/i965/brw_context.h
@@ -154,6 +154,8 @@ struct brw_vertex_shader {
    const struct tgsi_token *tokens;
    struct tgsi_shader_info info;
 
+   unsigned  has_flow_control:1;
+
    unsigned id;
    struct brw_winsys_buffer *const_buffer;    /** Program constant buffer/surface */
    GLboolean use_const_buffer;
diff --git a/src/gallium/drivers/i965/brw_pipe_shader.c b/src/gallium/drivers/i965/brw_pipe_shader.c
index 662c43c3e5..44f9ad6f9c 100644
--- a/src/gallium/drivers/i965/brw_pipe_shader.c
+++ b/src/gallium/drivers/i965/brw_pipe_shader.c
@@ -43,15 +43,15 @@
  * Determine if the given shader uses complex features such as flow
  * conditionals, loops, subroutines.
  */
-GLboolean brw_wm_has_flow_control(const struct brw_fragment_shader *fp)
+static GLboolean has_flow_control(const struct tgsi_shader_info *info)
 {
-    return (fp->info.opcode_count[TGSI_OPCODE_ARL] > 0 ||
-	    fp->info.opcode_count[TGSI_OPCODE_IF] > 0 ||
-	    fp->info.opcode_count[TGSI_OPCODE_ENDIF] > 0 || /* redundant - IF */
-	    fp->info.opcode_count[TGSI_OPCODE_CAL] > 0 ||
-	    fp->info.opcode_count[TGSI_OPCODE_BRK] > 0 ||   /* redundant - BGNLOOP */
-	    fp->info.opcode_count[TGSI_OPCODE_RET] > 0 ||   /* redundant - CAL */
-	    fp->info.opcode_count[TGSI_OPCODE_BGNLOOP] > 0);
+    return (info->opcode_count[TGSI_OPCODE_ARL] > 0 ||
+	    info->opcode_count[TGSI_OPCODE_IF] > 0 ||
+	    info->opcode_count[TGSI_OPCODE_ENDIF] > 0 || /* redundant - IF */
+	    info->opcode_count[TGSI_OPCODE_CAL] > 0 ||
+	    info->opcode_count[TGSI_OPCODE_BRK] > 0 ||   /* redundant - BGNLOOP */
+	    info->opcode_count[TGSI_OPCODE_RET] > 0 ||   /* redundant - CAL */
+	    info->opcode_count[TGSI_OPCODE_BGNLOOP] > 0);
 }
 
 
@@ -88,7 +88,7 @@ static void *brw_create_fs_state( struct pipe_context *pipe,
    /* Duplicate tokens, scan shader
     */
    fs->id = brw->program_id++;
-   fs->has_flow_control = brw_wm_has_flow_control(fs);
+   fs->has_flow_control = has_flow_control(&fs->info);
 
    fs->tokens = tgsi_dup_tokens(shader->tokens);
    if (fs->tokens == NULL)
@@ -126,7 +126,7 @@ static void *brw_create_vs_state( struct pipe_context *pipe,
    /* Duplicate tokens, scan shader
     */
    vs->id = brw->program_id++;
-   //vs->has_flow_control = brw_wm_has_flow_control(vs);
+   vs->has_flow_control = has_flow_control(&vs->info);
 
    vs->tokens = tgsi_dup_tokens(shader->tokens);
    if (vs->tokens == NULL)
diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c
index 25aea87b8f..e0fadc8dce 100644
--- a/src/gallium/drivers/i965/brw_vs_emit.c
+++ b/src/gallium/drivers/i965/brw_vs_emit.c
@@ -252,8 +252,10 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
    }
 #endif
 
-   c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
-   reg += 2;
+   if (c->vp->has_flow_control) {
+      c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
+      reg += 2;
+   }
 
    /* Some opcodes need an internal temporary:
     */
@@ -1592,7 +1594,10 @@ void brw_vs_emit(struct brw_vs_compile *c)
    /* Static register allocation
     */
    brw_vs_alloc_regs(c);
-   brw_MOV(p, get_addr_reg(c->stack_index), brw_address(c->stack));
+
+   if (c->vp->has_flow_control) {
+      brw_MOV(p, get_addr_reg(c->stack_index), brw_address(c->stack));
+   }
 
    /* Instructions
     */
diff --git a/src/gallium/drivers/i965/brw_wm.c b/src/gallium/drivers/i965/brw_wm.c
index 93f90bf329..7f2cb15256 100644
--- a/src/gallium/drivers/i965/brw_wm.c
+++ b/src/gallium/drivers/i965/brw_wm.c
@@ -162,9 +162,6 @@ static enum pipe_error do_wm_prog( struct brw_context *brw,
 
    brw_init_compile(brw, &c->func);
 
-   /* temporary sanity check assertion */
-   assert(fp->has_flow_control == brw_wm_has_flow_control(c->fp));
-
    /*
     * Shader which use GLSL features such as flow control are handled
     * differently from "simple" shaders.
diff --git a/src/gallium/drivers/i965/brw_wm.h b/src/gallium/drivers/i965/brw_wm.h
index 48dac39756..28d216260e 100644
--- a/src/gallium/drivers/i965/brw_wm.h
+++ b/src/gallium/drivers/i965/brw_wm.h
@@ -338,7 +338,6 @@ void brw_wm_lookup_iz( GLuint line_aa,
 		       GLboolean ps_uses_depth,
 		       struct brw_wm_prog_key *key );
 
-GLboolean brw_wm_has_flow_control(const struct brw_fragment_shader *fp);
 void brw_wm_branching_shader_emit(struct brw_context *brw, struct brw_wm_compile *c);
 
 void emit_ddxy(struct brw_compile *p,
-- 
cgit v1.2.3


From a485341455bb270001aad8b39c7b9fa36ac74478 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Fri, 6 Nov 2009 11:56:52 +0000
Subject: i965g: add dumping for our new pass_fp output

---
 src/gallium/drivers/i965/brw_screen.c   |   2 +-
 src/gallium/drivers/i965/brw_wm.h       |   4 +-
 src/gallium/drivers/i965/brw_wm_debug.c | 163 ++++++++++++++++++++++++--------
 src/gallium/drivers/i965/brw_wm_fp.c    |  35 ++-----
 src/gallium/drivers/i965/brw_wm_glsl.c  |   4 +-
 src/gallium/drivers/i965/brw_wm_pass0.c |   2 +-
 6 files changed, 139 insertions(+), 71 deletions(-)

(limited to 'src/gallium/drivers/i965/brw_wm.h')

diff --git a/src/gallium/drivers/i965/brw_screen.c b/src/gallium/drivers/i965/brw_screen.c
index 9d8066442b..575a418b7d 100644
--- a/src/gallium/drivers/i965/brw_screen.c
+++ b/src/gallium/drivers/i965/brw_screen.c
@@ -293,7 +293,7 @@ brw_create_screen(struct brw_winsys_screen *sws, uint pci_id)
 #ifdef DEBUG
    BRW_DEBUG = debug_get_flags_option("BRW_DEBUG", debug_names, 0);
    BRW_DEBUG |= debug_get_flags_option("INTEL_DEBUG", debug_names, 0);
-   BRW_DEBUG |= DEBUG_STATS | DEBUG_MIN_URB;
+   BRW_DEBUG |= DEBUG_STATS | DEBUG_MIN_URB | DEBUG_WM;
 #endif
 
    memset(&chipset, 0, sizeof chipset);
diff --git a/src/gallium/drivers/i965/brw_wm.h b/src/gallium/drivers/i965/brw_wm.h
index 28d216260e..7d044ff6ec 100644
--- a/src/gallium/drivers/i965/brw_wm.h
+++ b/src/gallium/drivers/i965/brw_wm.h
@@ -181,7 +181,6 @@ struct brw_wm_instruction {
 #define Y    1
 #define Z    2
 #define W    3
-#define GET_SWZ(swz, comp) (((swz) >> ((comp)*2)) & 0x3)
 
 
 struct brw_fp_src {
@@ -333,6 +332,9 @@ void brw_wm_print_insn( struct brw_wm_compile *c,
 void brw_wm_print_program( struct brw_wm_compile *c,
 			   const char *stage );
 
+void brw_wm_print_fp_program( struct brw_wm_compile *c,
+                              const char *stage );
+
 void brw_wm_lookup_iz( GLuint line_aa,
 		       GLuint lookup,
 		       GLboolean ps_uses_depth,
diff --git a/src/gallium/drivers/i965/brw_wm_debug.c b/src/gallium/drivers/i965/brw_wm_debug.c
index 65d7626eea..3d11fa074c 100644
--- a/src/gallium/drivers/i965/brw_wm_debug.c
+++ b/src/gallium/drivers/i965/brw_wm_debug.c
@@ -34,6 +34,62 @@
 #include "brw_context.h"
 #include "brw_wm.h"
 
+static void print_writemask( unsigned writemask )
+{  /* Print a ".xyzw"-style suffix naming the channels set in writemask. */
+   if (writemask != BRW_WRITEMASK_XYZW)   /* a full XYZW mask prints nothing at all */
+      debug_printf(".%s%s%s%s", 
+		   (writemask & BRW_WRITEMASK_X) ? "x" : "",
+		   (writemask & BRW_WRITEMASK_Y) ? "y" : "",
+		   (writemask & BRW_WRITEMASK_Z) ? "z" : "",
+		   (writemask & BRW_WRITEMASK_W) ? "w" : "");   /* an empty mask prints a bare "." */
+}
+
+static void print_swizzle( unsigned swizzle )
+{  /* Print a ".xyzw"-style suffix giving the source component selected for each of X, Y, Z, W. */
+   const char *swz = "xyzw";   /* read-only lookup table; string literals must not be written through */
+   if (swizzle != BRW_SWIZZLE_XYZW)   /* the identity swizzle prints nothing */
+      debug_printf(".%c%c%c%c", 
+		   swz[BRW_GET_SWZ(swizzle, X)],   /* presumably BRW_GET_SWZ yields 0..3 -- TODO confirm */
+		   swz[BRW_GET_SWZ(swizzle, Y)],
+		   swz[BRW_GET_SWZ(swizzle, Z)],
+		   swz[BRW_GET_SWZ(swizzle, W)]);
+}
+
+static void print_opcode( unsigned opcode )
+{  /* Print the mnemonic for opcode: WM_* opcodes are named here, anything else is looked up via TGSI. */
+   switch (opcode) {
+   case WM_PIXELXY:
+      debug_printf("PIXELXY");
+      break;
+   case WM_DELTAXY:
+      debug_printf("DELTAXY");
+      break;
+   case WM_PIXELW:
+      debug_printf("PIXELW");
+      break;
+   case WM_WPOSXY:
+      debug_printf("WPOSXY");
+      break;
+   case WM_PINTERP:
+      debug_printf("PINTERP");
+      break;
+   case WM_LINTERP:
+      debug_printf("LINTERP");
+      break;
+   case WM_CINTERP:
+      debug_printf("CINTERP");
+      break;
+   case WM_FB_WRITE:
+      debug_printf("FB_WRITE");
+      break;
+   case WM_FRONTFACING:
+      debug_printf("FRONTFACING");
+      break;
+   default:   /* not one of the WM_* opcodes listed above: use the TGSI opcode table */
+      debug_printf("%s", tgsi_get_opcode_info(opcode)->mnemonic);   /* NOTE(review): lookup result is dereferenced unchecked -- confirm it cannot be NULL here */
+      break;
+   }
+}
 
 void brw_wm_print_value( struct brw_wm_compile *c,
 		       struct brw_wm_value *value )
@@ -98,47 +154,11 @@ void brw_wm_print_insn( struct brw_wm_compile *c,
 	 debug_printf(",");
    }
    debug_printf("]");
-
-   if (inst->writemask != BRW_WRITEMASK_XYZW)
-      debug_printf(".%s%s%s%s", 
-		   (inst->writemask & BRW_WRITEMASK_X) ? "x" : "",
-		   (inst->writemask & BRW_WRITEMASK_Y) ? "y" : "",
-		   (inst->writemask & BRW_WRITEMASK_Z) ? "z" : "",
-		   (inst->writemask & BRW_WRITEMASK_W) ? "w" : "");
-
-   switch (inst->opcode) {
-   case WM_PIXELXY:
-      debug_printf(" = PIXELXY");
-      break;
-   case WM_DELTAXY:
-      debug_printf(" = DELTAXY");
-      break;
-   case WM_PIXELW:
-      debug_printf(" = PIXELW");
-      break;
-   case WM_WPOSXY:
-      debug_printf(" = WPOSXY");
-      break;
-   case WM_PINTERP:
-      debug_printf(" = PINTERP");
-      break;
-   case WM_LINTERP:
-      debug_printf(" = LINTERP");
-      break;
-   case WM_CINTERP:
-      debug_printf(" = CINTERP");
-      break;
-   case WM_FB_WRITE:
-      debug_printf(" = FB_WRITE");
-      break;
-   case WM_FRONTFACING:
-      debug_printf(" = FRONTFACING");
-      break;
-   default:
-      debug_printf(" = %s", tgsi_get_opcode_info(inst->opcode)->mnemonic);
-      break;
-   }
-
+   print_writemask(inst->writemask);
+   
+   debug_printf(" = ");
+   print_opcode(inst->opcode);
+  
    if (inst->saturate)
       debug_printf("_SAT");
 
@@ -173,3 +193,64 @@ void brw_wm_print_program( struct brw_wm_compile *c,
    debug_printf("\n");
 }
 
+static const char *file_strings[TGSI_FILE_COUNT+1] = {   /* printable names, indexed by the .file field of fp dst/src registers */
+   "NULL",
+   "CONST",
+   "IN",
+   "OUT",
+   "TEMP",
+   "SAMPLER",
+   "ADDR",
+   "IMM",
+   "LOOP",
+   "PAYLOAD"   /* presumably the driver-private payload file that follows the TGSI files -- TODO confirm */
+};   /* NOTE(review): 10 initializers; verify this matches TGSI_FILE_COUNT+1, else trailing slots are NULL */
+
+static void brw_wm_print_fp_insn( struct brw_wm_compile *c,
+                                  struct brw_fp_instruction *inst )
+{  /* Print one fp instruction as "OPCODE[_SAT] dst[, src...]" followed by a newline; c is not used. */
+   GLuint i;
+   GLuint nr_args = brw_wm_nr_args(inst->opcode);
+   /* Opcode, saturate suffix, then a separating space. */
+   print_opcode(inst->opcode);
+   if (inst->dst.saturate)
+      debug_printf("_SAT");
+   debug_printf(" ");
+   /* Destination: FILE[index] plus writemask, wrapped in an extra [ ] pair when indirect. */
+   if (inst->dst.indirect)
+      debug_printf("[");
+
+   debug_printf("%s[%d]",
+                file_strings[inst->dst.file],
+                inst->dst.index );
+   print_writemask(inst->dst.writemask);
+
+   if (inst->dst.indirect)
+      debug_printf("]");
+   /* With no source operands the line ends right after the destination. */
+   debug_printf(nr_args ? ", " : "\n");
+   
+   for (i = 0; i < nr_args; i++) {   /* sources: optional "-", optional ABS( ), FILE[index], then swizzle */
+      debug_printf("%s%s%s[%d]%s",
+                   inst->src[i].negate ? "-" : "",
+                   inst->src[i].abs ? "ABS(" : "",
+                   file_strings[inst->src[i].file],
+                   inst->src[i].index,
+                   inst->src[i].abs ? ")" : "");
+      print_swizzle(inst->src[i].swizzle);   /* note: the swizzle lands after the closing ")" of ABS( ) */
+      debug_printf("%s", i == nr_args - 1 ? "\n" : ", ");   /* newline after the last source, comma otherwise */
+   }
+}
+
+
+void brw_wm_print_fp_program( struct brw_wm_compile *c,
+                              const char *stage )
+{  /* Dump c->fp_instructions[0 .. c->nr_fp_insns) under a "<stage>:" heading, then a blank line. */
+   GLuint insn;
+   /* stage is only used as the label printed before the listing. */
+   debug_printf("%s:\n", stage);
+   for (insn = 0; insn < c->nr_fp_insns; insn++)
+      brw_wm_print_fp_insn(c, &c->fp_instructions[insn]);
+   debug_printf("\n");
+}
+
diff --git a/src/gallium/drivers/i965/brw_wm_fp.c b/src/gallium/drivers/i965/brw_wm_fp.c
index bba448815b..74aa02f198 100644
--- a/src/gallium/drivers/i965/brw_wm_fp.c
+++ b/src/gallium/drivers/i965/brw_wm_fp.c
@@ -45,20 +45,6 @@
 #include "brw_debug.h"
 
 
-
-
-static const char *wm_opcode_strings[] = {
-   "PIXELXY",
-   "DELTAXY",
-   "PIXELW",
-   "LINTERP",
-   "PINTERP",
-   "CINTERP",
-   "WPOSXY",
-   "FB_WRITE",
-   "FRONTFACING",
-};
-
 /***********************************************************************
  * Source regs
  */
@@ -94,10 +80,10 @@ static struct brw_fp_src src_swizzle( struct brw_fp_src reg, int x, int y, int z
 {
    unsigned swz = reg.swizzle;
 
-   reg.swizzle = ( GET_SWZ(swz, x) << 0 |
-		   GET_SWZ(swz, y) << 2 |
-		   GET_SWZ(swz, z) << 4 |
-		   GET_SWZ(swz, w) << 6 );
+   reg.swizzle = ( BRW_GET_SWZ(swz, x) << 0 |
+		   BRW_GET_SWZ(swz, y) << 2 |
+		   BRW_GET_SWZ(swz, z) << 4 |
+		   BRW_GET_SWZ(swz, w) << 6 );
 
    return reg;
 }
@@ -200,10 +186,10 @@ out:
       swizzle |= (swizzle & 0x3) << (j * 2);
 
    return src_swizzle( src_reg( TGSI_FILE_IMMEDIATE, i ),
-		       GET_SWZ(swizzle, X),
-		       GET_SWZ(swizzle, Y),
-		       GET_SWZ(swizzle, Z),
-		       GET_SWZ(swizzle, W) );
+		       BRW_GET_SWZ(swizzle, X),
+		       BRW_GET_SWZ(swizzle, Y),
+		       BRW_GET_SWZ(swizzle, Z),
+		       BRW_GET_SWZ(swizzle, W) );
 }
 
 
@@ -843,7 +829,7 @@ static GLboolean projtex( struct brw_wm_compile *c,
       return GL_FALSE;  /* ut2004 gun rendering !?! */
    
    if (src.file == TGSI_FILE_INPUT && 
-       GET_SWZ(src.swizzle, W) == W &&
+       BRW_GET_SWZ(src.swizzle, W) == W &&
        c->fp->info.input_interpolate[src.index] != TGSI_INTERPOLATE_PERSPECTIVE)
       return GL_FALSE;
 
@@ -1214,8 +1200,7 @@ int brw_wm_pass_fp( struct brw_wm_compile *c )
    }
 
    if (BRW_DEBUG & DEBUG_WM) {
-      debug_printf("pass_fp:\n");
-      //brw_print_program( c->fp_brw_program );
+      brw_wm_print_fp_program( c, "pass_fp" );
       debug_printf("\n");
    }
 
diff --git a/src/gallium/drivers/i965/brw_wm_glsl.c b/src/gallium/drivers/i965/brw_wm_glsl.c
index 284f819bf8..3b3afc39d3 100644
--- a/src/gallium/drivers/i965/brw_wm_glsl.c
+++ b/src/gallium/drivers/i965/brw_wm_glsl.c
@@ -558,7 +558,7 @@ static struct brw_reg get_src_reg(struct brw_wm_compile *c,
 {
     const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
     const GLuint nr = 1;
-    const GLuint component = GET_SWZ(src->Swizzle, channel);
+    const GLuint component = BRW_GET_SWZ(src->Swizzle, channel);
 
     /* Extended swizzle terms */
     if (component == SWIZZLE_ZERO) {
@@ -598,7 +598,7 @@ static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c,
     const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
     if (src->File == TGSI_FILE_IMMEDIATE) {
        /* an immediate */
-       const int component = GET_SWZ(src->Swizzle, channel);
+       const int component = BRW_GET_SWZ(src->Swizzle, channel);
        const GLfloat *param =
           c->fp->program.Base.Parameters->ParameterValues[src->Index];
        GLfloat value = param[component];
diff --git a/src/gallium/drivers/i965/brw_wm_pass0.c b/src/gallium/drivers/i965/brw_wm_pass0.c
index 7b18335dec..53232325d2 100644
--- a/src/gallium/drivers/i965/brw_wm_pass0.c
+++ b/src/gallium/drivers/i965/brw_wm_pass0.c
@@ -227,7 +227,7 @@ static const struct brw_wm_ref *get_fp_src_reg_ref( struct brw_wm_compile *c,
 						    struct brw_fp_src src,
 						    GLuint i )
 {
-   return pass0_get_reg(c, src.file, src.index, GET_SWZ(src.swizzle,i));
+   return pass0_get_reg(c, src.file, src.index, BRW_GET_SWZ(src.swizzle,i));
 }
 
 
-- 
cgit v1.2.3


From eacd13bcc809e1e877a48c2942eb6285aa21f6be Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Fri, 6 Nov 2009 13:09:12 +0000
Subject: i965g: plumb through fb_write target and eot data

---
 src/gallium/drivers/i965/brw_wm.h       | 10 +++++-----
 src/gallium/drivers/i965/brw_wm_emit.c  |  4 ++--
 src/gallium/drivers/i965/brw_wm_fp.c    | 26 +++++++++++++-------------
 src/gallium/drivers/i965/brw_wm_pass0.c | 10 +++++++---
 src/gallium/drivers/i965/brw_wm_pass1.c |  4 ++--
 5 files changed, 29 insertions(+), 25 deletions(-)

(limited to 'src/gallium/drivers/i965/brw_wm.h')

diff --git a/src/gallium/drivers/i965/brw_wm.h b/src/gallium/drivers/i965/brw_wm.h
index 7d044ff6ec..f85a8af878 100644
--- a/src/gallium/drivers/i965/brw_wm.h
+++ b/src/gallium/drivers/i965/brw_wm.h
@@ -142,9 +142,10 @@ struct brw_wm_instruction {
    GLuint saturate:1;
    GLuint writemask:4;
    GLuint tex_unit:4;   /* texture/sampler unit for texture instructions */
-   GLuint tex_target:4; /* TGSI_TEXTURE_x for texture instructions*/
+   GLuint target:4;     /* TGSI_TEXTURE_x for texture instructions,
+                         * target binding table index for FB_WRITE
+                         */
    GLuint eot:1;    	/* End of thread indicator for FB_WRITE*/
-   GLuint target:10;    /* target binding table index for FB_WRITE*/
 };
 
 
@@ -204,10 +205,9 @@ struct brw_fp_instruction {
    struct brw_fp_dst dst;
    struct brw_fp_src src[3];
    unsigned opcode:8;
+   unsigned target:8; /* XXX: special usage for FB_WRITE */
    unsigned tex_unit:4;
-   unsigned tex_target:4;
-   unsigned target:10;		/* destination surface for FB_WRITE */
-   unsigned eot:1;		/* mark last instruction (usually FB_WRITE) */
+   unsigned pad:12;
 };
 
 
diff --git a/src/gallium/drivers/i965/brw_wm_emit.c b/src/gallium/drivers/i965/brw_wm_emit.c
index 1c38f80cda..a14e12f35b 100644
--- a/src/gallium/drivers/i965/brw_wm_emit.c
+++ b/src/gallium/drivers/i965/brw_wm_emit.c
@@ -803,7 +803,7 @@ static void emit_tex( struct brw_wm_compile *c,
 
    /* How many input regs are there?
     */
-   switch (inst->tex_target) {
+   switch (inst->target) {
    case TGSI_TEXTURE_1D:
       emit = BRW_WRITEMASK_X;
       nr = 1;
@@ -885,7 +885,7 @@ static void emit_txb( struct brw_wm_compile *c,
    GLuint msg_type;
    /* Shadow ignored for txb.
     */
-   switch (inst->tex_target) {
+   switch (inst->target) {
    case TGSI_TEXTURE_1D:
    case TGSI_TEXTURE_SHADOW1D:
       brw_MOV(p, brw_message_reg(2), arg[0]);
diff --git a/src/gallium/drivers/i965/brw_wm_fp.c b/src/gallium/drivers/i965/brw_wm_fp.c
index d27a768a0c..2a207958eb 100644
--- a/src/gallium/drivers/i965/brw_wm_fp.c
+++ b/src/gallium/drivers/i965/brw_wm_fp.c
@@ -280,18 +280,24 @@ static struct brw_fp_instruction *get_fp_inst(struct brw_wm_compile *c)
 static struct brw_fp_instruction * emit_tex_op(struct brw_wm_compile *c,
 					     GLuint op,
 					     struct brw_fp_dst dest,
-					     GLuint tex_src_unit,
-					     GLuint tex_src_target,
+					     GLuint tex_unit,
+					     GLuint target,
 					     struct brw_fp_src src0,
 					     struct brw_fp_src src1,
 					     struct brw_fp_src src2 )
 {
    struct brw_fp_instruction *inst = get_fp_inst(c);
 
+   if (tex_unit || target)
+      assert(op == TGSI_OPCODE_TXP ||
+             op == TGSI_OPCODE_TXB ||
+             op == TGSI_OPCODE_TEX ||
+             op == WM_FB_WRITE);
+
    inst->opcode = op;
    inst->dst = dest;
-   inst->tex_unit = tex_src_unit;
-   inst->tex_target = tex_src_target;
+   inst->tex_unit = tex_unit;
+   inst->target = target;
    inst->src[0] = src0;
    inst->src[1] = src1;
    inst->src[2] = src2;
@@ -916,23 +922,17 @@ static void emit_fb_write( struct brw_wm_compile *c )
 
    for (i = 0 ; i < c->key.nr_cbufs; i++) {
       struct brw_fp_src outcolor;
-      unsigned target = 1<<i;
-
-      /* Set EOT flag on last inst:
-       */
-      if (i == c->key.nr_cbufs - 1)
-	 target |= 1;
       
       outcolor = find_output_by_semantic(c, TGSI_SEMANTIC_COLOR, i);
 
-      /* Use emit_tex_op so that we can specify the inst->tex_target
+      /* Use emit_tex_op so that we can specify the inst->target
        * field, which is abused to contain the FB write target and the
        * EOT marker
        */
       emit_tex_op(c, WM_FB_WRITE,
 		  dst_undef(),
-		  target,
-		  0,
+		  (i == c->key.nr_cbufs - 1), /* EOT */
+		  i,
 		  outcolor,
 		  payload_r0_depth,
 		  outdepth);
diff --git a/src/gallium/drivers/i965/brw_wm_pass0.c b/src/gallium/drivers/i965/brw_wm_pass0.c
index 53232325d2..7bb341e2c2 100644
--- a/src/gallium/drivers/i965/brw_wm_pass0.c
+++ b/src/gallium/drivers/i965/brw_wm_pass0.c
@@ -274,9 +274,13 @@ translate_insn(struct brw_wm_compile *c,
    out->opcode = inst->opcode;
    out->saturate = inst->dst.saturate;
    out->tex_unit = inst->tex_unit;
-   out->tex_target = inst->tex_target;
-   out->eot = inst->eot; //inst->Aux & 1;
-   out->target = inst->target; //inst->Aux >> 1;
+   out->target = inst->target;
+
+   /* Nasty hack: emit_fb_write() passes the EOT flag in the tex_unit slot.
+    */
+   out->eot = (inst->opcode == WM_FB_WRITE &&
+               inst->tex_unit != 0);
+
 
    /* Args:
     */
diff --git a/src/gallium/drivers/i965/brw_wm_pass1.c b/src/gallium/drivers/i965/brw_wm_pass1.c
index 09ad2b8f5b..005747f00b 100644
--- a/src/gallium/drivers/i965/brw_wm_pass1.c
+++ b/src/gallium/drivers/i965/brw_wm_pass1.c
@@ -223,11 +223,11 @@ void brw_wm_pass1( struct brw_wm_compile *c )
 
       case TGSI_OPCODE_TEX:
       case TGSI_OPCODE_TXP:
-	 read0 = get_texcoord_mask(inst->tex_target);
+	 read0 = get_texcoord_mask(inst->target);
 	 break;
 
       case TGSI_OPCODE_TXB:
-	 read0 = get_texcoord_mask(inst->tex_target) | BRW_WRITEMASK_W;
+	 read0 = get_texcoord_mask(inst->target) | BRW_WRITEMASK_W;
 	 break;
 
       case WM_WPOSXY:
-- 
cgit v1.2.3


From 9507a6c206627b3ae76e2ae8398fff518e39941a Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Thu, 19 Nov 2009 20:02:42 -0800
Subject: i965g: fragment shader immediates working

---
 src/gallium/drivers/i965/brw_curbe.c    | 30 ++++++++++++++++-----
 src/gallium/drivers/i965/brw_wm.h       |  9 -------
 src/gallium/drivers/i965/brw_wm_pass0.c | 48 ++++++++-------------------------
 3 files changed, 34 insertions(+), 53 deletions(-)

(limited to 'src/gallium/drivers/i965/brw_wm.h')

diff --git a/src/gallium/drivers/i965/brw_curbe.c b/src/gallium/drivers/i965/brw_curbe.c
index 3e821d5afe..3f031577d5 100644
--- a/src/gallium/drivers/i965/brw_curbe.c
+++ b/src/gallium/drivers/i965/brw_curbe.c
@@ -182,16 +182,32 @@ static enum pipe_error prepare_curbe_buffer(struct brw_context *brw)
 
    /* fragment shader constants */
    if (brw->curbe.wm_size) {
+      const struct brw_fragment_shader *fs = brw->curr.fragment_shader;
       GLuint offset = brw->curbe.wm_start * 16;
-      unsigned nr = brw->wm.prog_data->nr_params;
+      GLuint nr_immediate, nr_const;
+
+      nr_immediate = fs->immediates.nr;
+      if (nr_immediate) {
+         memcpy(&buf[offset], 
+                fs->immediates.data,
+                nr_immediate * 4 * sizeof(float));
 
-      const GLfloat *value = screen->buffer_map( screen,
-						 brw->curr.fragment_constants,
-						 PIPE_BUFFER_USAGE_CPU_READ);
+         offset += nr_immediate * 4;
+      }
 
-      memcpy(&buf[offset], value, nr * 4 * sizeof(float));
+      nr_const = fs->info.file_max[TGSI_FILE_CONSTANT] + 1;
+/*      nr_const = brw->wm.prog_data->nr_params; */
+      if (nr_const) {
+         const GLfloat *value = screen->buffer_map( screen,
+                                                    brw->curr.fragment_constants,
+                                                    PIPE_BUFFER_USAGE_CPU_READ);
 
-      screen->buffer_unmap( screen, brw->curr.fragment_constants );
+         memcpy(&buf[offset], value,
+                nr_const * 4 * sizeof(float));
+         
+         screen->buffer_unmap( screen, 
+                               brw->curr.fragment_constants );
+      }
    }
 
 
@@ -226,7 +242,7 @@ static enum pipe_error prepare_curbe_buffer(struct brw_context *brw)
    /* vertex shader constants */
    if (brw->curbe.vs_size) {
       GLuint offset = brw->curbe.vs_start * 16;
-      struct brw_vertex_shader *vs = brw->curr.vertex_shader;
+      const struct brw_vertex_shader *vs = brw->curr.vertex_shader;
       GLuint nr_immediate, nr_const;
 
       nr_immediate = vs->immediates.nr;
diff --git a/src/gallium/drivers/i965/brw_wm.h b/src/gallium/drivers/i965/brw_wm.h
index f85a8af878..b7d807dcb3 100644
--- a/src/gallium/drivers/i965/brw_wm.h
+++ b/src/gallium/drivers/i965/brw_wm.h
@@ -129,12 +129,6 @@ struct brw_wm_ref {
    GLuint insn:24;
 };
 
-struct brw_wm_imm_ref {
-   const struct brw_wm_ref *ref;
-   GLfloat imm1f;
-};
-
-
 struct brw_wm_instruction {
    struct brw_wm_value *dst[4];
    struct brw_wm_ref *src[3][4];
@@ -272,9 +266,6 @@ struct brw_wm_compile {
    struct brw_wm_instruction instruction[BRW_WM_MAX_INSN];
    GLuint nr_insns;
 
-   struct brw_wm_imm_ref imm_ref[BRW_WM_MAX_CONST];
-   GLuint nr_imm_refs;
-
    struct brw_wm_grf pass2_grf[BRW_WM_MAX_GRF/2];
 
    GLuint grf_limit;
diff --git a/src/gallium/drivers/i965/brw_wm_pass0.c b/src/gallium/drivers/i965/brw_wm_pass0.c
index 7bb341e2c2..0bacad2b0f 100644
--- a/src/gallium/drivers/i965/brw_wm_pass0.c
+++ b/src/gallium/drivers/i965/brw_wm_pass0.c
@@ -30,6 +30,7 @@
   */
 
 #include "util/u_memory.h"
+#include "util/u_math.h"
 
 #include "brw_debug.h"
 #include "brw_wm.h"
@@ -97,9 +98,10 @@ static void pass0_set_fpreg_ref( struct brw_wm_compile *c,
 }
 
 static const struct brw_wm_ref *get_param_ref( struct brw_wm_compile *c, 
-					       const GLfloat *param_ptr )
+					       unsigned idx,
+                                               unsigned component)
 {
-   GLuint i = c->prog_data.nr_params++;
+   GLuint i = idx * 4 + component;
    
    if (i >= BRW_WM_MAX_PARAM) {
       debug_printf("%s: out of params\n", __FUNCTION__);
@@ -109,8 +111,7 @@ static const struct brw_wm_ref *get_param_ref( struct brw_wm_compile *c,
    else {
       struct brw_wm_ref *ref = get_ref(c);
 
-      c->prog_data.param[i] = param_ptr;
-      c->nr_creg = (i+16)/16;
+      c->nr_creg = MAX2(c->nr_creg, (i+16)/16);
 
       /* Push the offsets into hw_reg.  These will be added to the
        * real register numbers once one is allocated in pass2.
@@ -125,37 +126,6 @@ static const struct brw_wm_ref *get_param_ref( struct brw_wm_compile *c,
 }
 
 
-/** Return a ref to an immediate value */
-static const struct brw_wm_ref *get_imm_ref( struct brw_wm_compile *c,
-					     const GLfloat *imm1f )
-{
-   GLuint i;
-
-   /* Search for an existing const value matching the request:
-    */
-   for (i = 0; i < c->nr_imm_refs; i++) {
-      if (c->imm_ref[i].imm1f == *imm1f) 
-	 return c->imm_ref[i].ref;
-   }
-
-   /* Else try to add a new one:
-    */
-   if (c->nr_imm_refs < Elements(c->imm_ref)) {
-      GLuint i = c->nr_imm_refs++;
-
-      /* An immediate is a special type of parameter:
-       */
-      c->imm_ref[i].imm1f = *imm1f;
-      c->imm_ref[i].ref = get_param_ref(c, imm1f);
-
-      return c->imm_ref[i].ref;
-   }
-   else {
-      debug_printf("%s: out of imm_refs\n", __FUNCTION__);
-      c->prog_data.error = 1;
-      return NULL;
-   }
-}
 
 
 /* Lookup our internal registers
@@ -177,11 +147,15 @@ static const struct brw_wm_ref *pass0_get_reg( struct brw_wm_compile *c,
 	 break;
 
       case TGSI_FILE_CONSTANT:
-	 ref = get_param_ref(c, &c->env_param[idx][component]);
+	 ref = get_param_ref(c, 
+                             c->fp->info.immediate_count + idx,
+                             component);
 	 break;
 
       case TGSI_FILE_IMMEDIATE:
-	 ref = get_imm_ref(c, &c->immediate[idx].v[component]);
+	 ref = get_param_ref(c, 
+                             idx,
+                             component);
 	 break;
 
       default:
-- 
cgit v1.2.3


From 8bf75f28de161173d1cdaad8c74bcac074e1211e Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Sat, 21 Nov 2009 01:52:22 +0000
Subject: i965g: get basic texturing working again

Revert to fixed-layout surface binding table -- it's probably the best
way to do this.  Pass sampler and texture numbers separately even
though we're always keeping them the same at present.
---
 src/gallium/drivers/i965/brw_context.h          | 13 +++--
 src/gallium/drivers/i965/brw_pipe_fb.c          |  4 +-
 src/gallium/drivers/i965/brw_pipe_sampler.c     |  3 +-
 src/gallium/drivers/i965/brw_sf.c               |  3 +-
 src/gallium/drivers/i965/brw_wm.c               |  9 +++
 src/gallium/drivers/i965/brw_wm.h               |  4 +-
 src/gallium/drivers/i965/brw_wm_emit.c          | 34 ++++++------
 src/gallium/drivers/i965/brw_wm_fp.c            | 39 ++++++++-----
 src/gallium/drivers/i965/brw_wm_surface_state.c | 74 ++++++++++++++++---------
 9 files changed, 117 insertions(+), 66 deletions(-)

(limited to 'src/gallium/drivers/i965/brw_wm.h')

diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h
index 096c8cf12b..598e747fe0 100644
--- a/src/gallium/drivers/i965/brw_context.h
+++ b/src/gallium/drivers/i965/brw_context.h
@@ -209,9 +209,9 @@ struct brw_fragment_shader {
 
 
 struct brw_sampler {
-   float border_color[4];
    struct brw_ss0 ss0;
    struct brw_ss1 ss1;
+   float border_color[4];
    struct brw_ss3 ss3;
 };
 
@@ -355,20 +355,23 @@ struct brw_vs_ouput_sizes {
 /** Number of texture sampler units */
 #define BRW_MAX_TEX_UNIT 16
 
+/** Max number of render targets in a shader */
+#define BRW_MAX_DRAW_BUFFERS 4
+
 /**
  * Size of our surface binding table for the WM.
  * This contains pointers to the drawing surfaces and current texture
  * objects and shader constant buffers (+2).
  */
-#define BRW_WM_MAX_SURF (PIPE_MAX_COLOR_BUFS + BRW_MAX_TEX_UNIT + 1)
+#define BRW_WM_MAX_SURF (BRW_MAX_DRAW_BUFFERS + BRW_MAX_TEX_UNIT + 1)
 
 /**
  * Helpers to convert drawing buffers, textures and constant buffers
  * to surface binding table indexes, for WM.
  */
-#define SURF_INDEX_DRAW(d)           (d)
-#define SURF_INDEX_FRAG_CONST_BUFFER (PIPE_MAX_COLOR_BUFS) 
-#define SURF_INDEX_TEXTURE(t)        (PIPE_MAX_COLOR_BUFS + 1 + (t))
+#define BTI_COLOR_BUF(d)          (d)
+#define BTI_FRAGMENT_CONSTANTS    (BRW_MAX_DRAW_BUFFERS) 
+#define BTI_TEXTURE(t)            (BRW_MAX_DRAW_BUFFERS + 1 + (t))
 
 /**
  * Size of surface binding table for the VS.
diff --git a/src/gallium/drivers/i965/brw_pipe_fb.c b/src/gallium/drivers/i965/brw_pipe_fb.c
index 1511220447..6b03094f50 100644
--- a/src/gallium/drivers/i965/brw_pipe_fb.c
+++ b/src/gallium/drivers/i965/brw_pipe_fb.c
@@ -31,7 +31,7 @@ static void brw_set_framebuffer_state( struct pipe_context *pipe,
 
    /* Color buffers:
     */
-   for (i = 0; i < MAX2(fb->nr_cbufs, brw->curr.fb.nr_cbufs); i++) {
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
       if (brw->curr.fb.cbufs[i] != fb->cbufs[i]) {
 	 brw->state.dirty.mesa |= PIPE_NEW_COLOR_BUFFERS;
 	 pipe_surface_reference(&brw->curr.fb.cbufs[i], fb->cbufs[i]);
@@ -39,7 +39,7 @@ static void brw_set_framebuffer_state( struct pipe_context *pipe,
    }
    
    if (brw->curr.fb.nr_cbufs != fb->nr_cbufs) {
-      brw->curr.fb.nr_cbufs = fb->nr_cbufs;
+      brw->curr.fb.nr_cbufs = MIN2(BRW_MAX_DRAW_BUFFERS, fb->nr_cbufs);
       brw->state.dirty.mesa |= PIPE_NEW_NR_CBUFS;
    }
 }
diff --git a/src/gallium/drivers/i965/brw_pipe_sampler.c b/src/gallium/drivers/i965/brw_pipe_sampler.c
index f0a765ecf5..5cd38a43a6 100644
--- a/src/gallium/drivers/i965/brw_pipe_sampler.c
+++ b/src/gallium/drivers/i965/brw_pipe_sampler.c
@@ -107,7 +107,7 @@ static void *
 brw_create_sampler_state( struct pipe_context *pipe,
                           const struct pipe_sampler_state *template )
 {
-   struct brw_sampler_state *sampler = CALLOC_STRUCT(brw_sampler_state);
+   struct brw_sampler *sampler = CALLOC_STRUCT(brw_sampler);
 
    sampler->ss0.min_filter = translate_img_filter( template->min_img_filter );
    sampler->ss0.mag_filter = translate_img_filter( template->mag_img_filter );
@@ -214,7 +214,6 @@ void brw_pipe_sampler_init( struct brw_context *brw )
 
    brw->base.set_sampler_textures = brw_set_sampler_textures;
 }
-
 void brw_pipe_sampler_cleanup( struct brw_context *brw )
 {
 }
diff --git a/src/gallium/drivers/i965/brw_sf.c b/src/gallium/drivers/i965/brw_sf.c
index e1986a9dbb..a28fb71589 100644
--- a/src/gallium/drivers/i965/brw_sf.c
+++ b/src/gallium/drivers/i965/brw_sf.c
@@ -153,9 +153,10 @@ static enum pipe_error upload_sf_prog(struct brw_context *brw)
       case TGSI_INTERPOLATE_CONSTANT:
          break;
       case TGSI_INTERPOLATE_LINEAR:
+      case TGSI_INTERPOLATE_PERSPECTIVE:
          key.linear_attrs |= 1 << (i+1);
          break;
-      case TGSI_INTERPOLATE_PERSPECTIVE:
+//      case TGSI_INTERPOLATE_PERSPECTIVE:
          key.persp_attrs |= 1 << (i+1);
          break;
       }
diff --git a/src/gallium/drivers/i965/brw_wm.c b/src/gallium/drivers/i965/brw_wm.c
index 3c5a2dab7a..2c9d3e5e87 100644
--- a/src/gallium/drivers/i965/brw_wm.c
+++ b/src/gallium/drivers/i965/brw_wm.c
@@ -56,6 +56,15 @@ GLuint brw_wm_nr_args( GLuint opcode )
    case WM_FB_WRITE:
    case WM_PINTERP:
       return 3;
+   case TGSI_OPCODE_TEX:
+   case TGSI_OPCODE_TXP:
+   case TGSI_OPCODE_TXB:
+   case TGSI_OPCODE_TXD:
+      /* sampler arg is held as a field in the instruction, not in an
+       * actual register:
+       */
+      return tgsi_get_opcode_info(opcode)->num_src - 1;
+
    default:
       assert(opcode < MAX_OPCODE);
       return tgsi_get_opcode_info(opcode)->num_src;
diff --git a/src/gallium/drivers/i965/brw_wm.h b/src/gallium/drivers/i965/brw_wm.h
index b7d807dcb3..f1ca9f6369 100644
--- a/src/gallium/drivers/i965/brw_wm.h
+++ b/src/gallium/drivers/i965/brw_wm.h
@@ -135,6 +135,7 @@ struct brw_wm_instruction {
    GLuint opcode:8;
    GLuint saturate:1;
    GLuint writemask:4;
+   GLuint sampler:4;
    GLuint tex_unit:4;   /* texture/sampler unit for texture instructions */
    GLuint target:4;     /* TGSI_TEXTURE_x for texture instructions,
                          * target binding table index for FB_WRITE
@@ -201,7 +202,8 @@ struct brw_fp_instruction {
    unsigned opcode:8;
    unsigned target:8; /* XXX: special usage for FB_WRITE */
    unsigned tex_unit:4;
-   unsigned pad:12;
+   unsigned sampler:4;
+   unsigned pad:8;
 };
 
 
diff --git a/src/gallium/drivers/i965/brw_wm_emit.c b/src/gallium/drivers/i965/brw_wm_emit.c
index a14e12f35b..3250db1848 100644
--- a/src/gallium/drivers/i965/brw_wm_emit.c
+++ b/src/gallium/drivers/i965/brw_wm_emit.c
@@ -792,7 +792,8 @@ static void emit_tex( struct brw_wm_compile *c,
 		      const struct brw_wm_instruction *inst,
 		      struct brw_reg *dst,
 		      GLuint dst_flags,
-		      struct brw_reg *arg )
+		      struct brw_reg *coord,
+		      GLuint sampler)
 {
    struct brw_compile *p = &c->func;
    GLuint msgLength, responseLength;
@@ -838,7 +839,7 @@ static void emit_tex( struct brw_wm_compile *c,
    for (i = 0; i < nr; i++) {
       static const GLuint swz[4] = {0,1,2,2};
       if (emit & (1<<i)) 
-	 brw_MOV(p, brw_message_reg(msgLength+1), arg[swz[i]]);
+	 brw_MOV(p, brw_message_reg(msgLength+1), coord[swz[i]]);
       else
 	 brw_MOV(p, brw_message_reg(msgLength+1), brw_imm_f(0));
       msgLength += 2;
@@ -862,8 +863,8 @@ static void emit_tex( struct brw_wm_compile *c,
 	      retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW),
 	      1,
 	      retype(c->payload.depth[0].hw_reg, BRW_REGISTER_TYPE_UW),
-              SURF_INDEX_TEXTURE(inst->tex_unit),
-	      inst->tex_unit,	  /* sampler */
+              BTI_TEXTURE(inst->tex_unit),
+	      sampler,          /* sampler index */
 	      inst->writemask,
 	      msg_type, 
 	      responseLength,
@@ -878,7 +879,8 @@ static void emit_txb( struct brw_wm_compile *c,
 		      const struct brw_wm_instruction *inst,
 		      struct brw_reg *dst,
 		      GLuint dst_flags,
-		      struct brw_reg *arg )
+		      struct brw_reg *coord,
+		      GLuint sampler )
 {
    struct brw_compile *p = &c->func;
    GLuint msgLength;
@@ -888,7 +890,7 @@ static void emit_txb( struct brw_wm_compile *c,
    switch (inst->target) {
    case TGSI_TEXTURE_1D:
    case TGSI_TEXTURE_SHADOW1D:
-      brw_MOV(p, brw_message_reg(2), arg[0]);
+      brw_MOV(p, brw_message_reg(2), coord[0]);
       brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
       brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
       break;
@@ -896,22 +898,22 @@ static void emit_txb( struct brw_wm_compile *c,
    case TGSI_TEXTURE_RECT:
    case TGSI_TEXTURE_SHADOW2D:
    case TGSI_TEXTURE_SHADOWRECT:
-      brw_MOV(p, brw_message_reg(2), arg[0]);
-      brw_MOV(p, brw_message_reg(4), arg[1]);
+      brw_MOV(p, brw_message_reg(2), coord[0]);
+      brw_MOV(p, brw_message_reg(4), coord[1]);
       brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
       break;
    case TGSI_TEXTURE_3D:
    case TGSI_TEXTURE_CUBE:
-      brw_MOV(p, brw_message_reg(2), arg[0]);
-      brw_MOV(p, brw_message_reg(4), arg[1]);
-      brw_MOV(p, brw_message_reg(6), arg[2]);
+      brw_MOV(p, brw_message_reg(2), coord[0]);
+      brw_MOV(p, brw_message_reg(4), coord[1]);
+      brw_MOV(p, brw_message_reg(6), coord[2]);
       break;
    default:
       /* unexpected target */
       abort();
    }
 
-   brw_MOV(p, brw_message_reg(8), arg[3]);
+   brw_MOV(p, brw_message_reg(8), coord[3]);
    msgLength = 9;
 
    if (BRW_IS_IGDNG(p->brw))
@@ -923,8 +925,8 @@ static void emit_txb( struct brw_wm_compile *c,
 	      retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW),
 	      1,
 	      retype(c->payload.depth[0].hw_reg, BRW_REGISTER_TYPE_UW),
-              SURF_INDEX_TEXTURE(inst->tex_unit),
-	      inst->tex_unit,	  /* sampler */
+              BTI_TEXTURE(inst->tex_unit),
+	      sampler,          /* sampler index */
 	      inst->writemask,
 	      msg_type,
 	      8,		/* responseLength */
@@ -1483,11 +1485,11 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 /* Texturing operations:
 	  */
       case TGSI_OPCODE_TEX:
-	 emit_tex(c, inst, dst, dst_flags, args[0]);
+	 emit_tex(c, inst, dst, dst_flags, args[0], inst->sampler);
 	 break;
 
       case TGSI_OPCODE_TXB:
-	 emit_txb(c, inst, dst, dst_flags, args[0]);
+	 emit_txb(c, inst, dst, dst_flags, args[0], inst->sampler);
 	 break;
 
       case TGSI_OPCODE_KIL:
diff --git a/src/gallium/drivers/i965/brw_wm_fp.c b/src/gallium/drivers/i965/brw_wm_fp.c
index 174486a101..a8b5e15f36 100644
--- a/src/gallium/drivers/i965/brw_wm_fp.c
+++ b/src/gallium/drivers/i965/brw_wm_fp.c
@@ -282,6 +282,7 @@ static struct brw_fp_instruction * emit_tex_op(struct brw_wm_compile *c,
 					     struct brw_fp_dst dest,
 					     GLuint tex_unit,
 					     GLuint target,
+					     GLuint sampler,
 					     struct brw_fp_src src0,
 					     struct brw_fp_src src1,
 					     struct brw_fp_src src2 )
@@ -298,6 +299,7 @@ static struct brw_fp_instruction * emit_tex_op(struct brw_wm_compile *c,
    inst->dst = dest;
    inst->tex_unit = tex_unit;
    inst->target = target;
+   inst->sampler = sampler;
    inst->src[0] = src0;
    inst->src[1] = src1;
    inst->src[2] = src2;
@@ -313,7 +315,7 @@ static INLINE void emit_op3(struct brw_wm_compile *c,
 			    struct brw_fp_src src1,
 			    struct brw_fp_src src2 )
 {
-   emit_tex_op(c, op, dest, 0, 0, src0, src1, src2);
+   emit_tex_op(c, op, dest, 0, 0, 0, src0, src1, src2);
 }
 
 
@@ -323,7 +325,7 @@ static INLINE void emit_op2(struct brw_wm_compile *c,
 			    struct brw_fp_src src0,
 			    struct brw_fp_src src1)
 {
-   emit_tex_op(c, op, dest, 0, 0, src0, src1, src_undef());
+   emit_tex_op(c, op, dest, 0, 0, 0, src0, src1, src_undef());
 }
 
 static INLINE void emit_op1(struct brw_wm_compile *c,
@@ -331,14 +333,14 @@ static INLINE void emit_op1(struct brw_wm_compile *c,
 			    struct brw_fp_dst dest,
 			    struct brw_fp_src src0)
 {
-   emit_tex_op(c, op, dest, 0, 0, src0, src_undef(), src_undef());
+   emit_tex_op(c, op, dest, 0, 0, 0, src0, src_undef(), src_undef());
 }
 
 static INLINE void emit_op0(struct brw_wm_compile *c,
 			   GLuint op,
 			   struct brw_fp_dst dest)
 {
-   emit_tex_op(c, op, dest, 0, 0, src_undef(), src_undef(), src_undef());
+   emit_tex_op(c, op, dest, 0, 0, 0, src_undef(), src_undef(), src_undef());
 }
 
 
@@ -674,7 +676,8 @@ static void precalc_tex( struct brw_wm_compile *c,
 			 struct brw_fp_dst dst,
 			 unsigned target,
 			 unsigned unit,
-			 struct brw_fp_src src0 )
+			 struct brw_fp_src src0,
+			 struct brw_fp_src sampler )
 {
    struct brw_fp_src coord = src_undef();
    struct brw_fp_dst tmp = dst_undef();
@@ -751,6 +754,7 @@ static void precalc_tex( struct brw_wm_compile *c,
                   dst_saturate(tmp, dst.saturate),
                   unit,
                   target,
+                  sampler.index,
                   coord,
                   src_undef(),
                   src_undef());
@@ -802,6 +806,7 @@ static void precalc_tex( struct brw_wm_compile *c,
                   dst,
                   unit,
                   target,
+                  sampler.index,
                   coord,
                   src_undef(),
                   src_undef());
@@ -851,7 +856,8 @@ static void precalc_txp( struct brw_wm_compile *c,
 			 struct brw_fp_dst dst,
 			 unsigned target,
 			 unsigned unit,
-			 struct brw_fp_src src0 )
+			 struct brw_fp_src src0,
+                         struct brw_fp_src sampler )
 {
    if (projtex(c, target, src0)) {
       struct brw_fp_dst tmp = get_temp(c);
@@ -877,7 +883,8 @@ static void precalc_txp( struct brw_wm_compile *c,
 		  dst,
 		  target,
 		  unit,
-		  src_reg_from_dst(tmp));
+		  src_reg_from_dst(tmp),
+                  sampler );
 
       release_temp(c, tmp);
    }
@@ -885,7 +892,7 @@ static void precalc_txp( struct brw_wm_compile *c,
    {
       /* dst = TEX src0
        */
-      precalc_tex(c, dst, target, unit, src0);
+      precalc_tex(c, dst, target, unit, src0, sampler);
    }
 }
 
@@ -936,6 +943,7 @@ static void emit_fb_write( struct brw_wm_compile *c )
 		  dst_undef(),
 		  (i == c->key.nr_cbufs - 1), /* EOT */
 		  i,
+                  0,            /* no sampler */
 		  outcolor,
 		  payload_r0_depth,
 		  outdepth);
@@ -1056,15 +1064,17 @@ static void emit_insn( struct brw_wm_compile *c,
    case TGSI_OPCODE_TEX:
       precalc_tex(c, dst,
 		  inst->InstructionExtTexture.Texture,
-		  src[0].file,	/* sampler unit */
-		  src[1] );
+		  src[1].index,	/* use sampler unit for tex idx */
+		  src[0],       /* coord */
+                  src[1]);      /* sampler */
       break;
 
    case TGSI_OPCODE_TXP:
       precalc_txp(c, dst,
 		  inst->InstructionExtTexture.Texture,
-		  src[0].file,	/* sampler unit */
-		  src[1] );
+		  src[1].index,	/* use sampler unit for tex idx */
+		  src[0],       /* coord */
+                  src[1]);      /* sampler */
       break;
 
    case TGSI_OPCODE_TXB:
@@ -1072,8 +1082,9 @@ static void emit_insn( struct brw_wm_compile *c,
        */
       precalc_tex(c, dst,
 		  inst->InstructionExtTexture.Texture,
-		  src[0].file,	/* sampler unit */
-		  src[1] );
+		  src[1].index,	/* use sampler unit for tex idx*/
+		  src[0],
+                  src[1]);
       break;
 
    case TGSI_OPCODE_XPD: 
diff --git a/src/gallium/drivers/i965/brw_wm_surface_state.c b/src/gallium/drivers/i965/brw_wm_surface_state.c
index f882331433..f92b8198ed 100644
--- a/src/gallium/drivers/i965/brw_wm_surface_state.c
+++ b/src/gallium/drivers/i965/brw_wm_surface_state.c
@@ -149,19 +149,23 @@ brw_wm_get_binding_table(struct brw_context *brw,
    enum pipe_error ret;
    struct brw_winsys_reloc reloc[BRW_WM_MAX_SURF];
    uint32_t data[BRW_WM_MAX_SURF];
+   GLuint nr_relocs = 0;
    GLuint data_size = brw->wm.nr_surfaces * sizeof data[0];
    int i;
 
    assert(brw->wm.nr_surfaces <= BRW_WM_MAX_SURF);
    assert(brw->wm.nr_surfaces > 0);
 
-   /* Emit binding table relocations to surface state */
+   /* Emit binding table relocations to surface state 
+    */
    for (i = 0; i < brw->wm.nr_surfaces; i++) {
-      make_reloc(&reloc[i],
-                 BRW_USAGE_STATE,
-                 0,
-                 i * sizeof(GLuint),
-                 brw->wm.surf_bo[i]);
+      if (brw->wm.surf_bo[i]) {
+         make_reloc(&reloc[nr_relocs++],
+                    BRW_USAGE_STATE,
+                    0,
+                    i * sizeof(GLuint),
+                    brw->wm.surf_bo[i]);
+      }
    }
 
    /* Note there is no key for this search beyond the values in the
@@ -169,7 +173,7 @@ brw_wm_get_binding_table(struct brw_context *brw,
     */
    if (brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
                         NULL, 0,
-                        reloc, brw->wm.nr_surfaces,
+                        reloc, nr_relocs,
                         NULL,
                         bo_out))
       return PIPE_OK;
@@ -182,7 +186,7 @@ brw_wm_get_binding_table(struct brw_context *brw,
 
    ret = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND,
                            NULL, 0,
-                           reloc, brw->wm.nr_surfaces,
+                           reloc, nr_relocs,
                            data, data_size,
                            NULL, NULL,
                            bo_out);
@@ -208,40 +212,60 @@ static enum pipe_error prepare_wm_surfaces(struct brw_context *brw )
    for (i = 0; i < brw->curr.fb.nr_cbufs; i++) {
       ret = brw_update_render_surface(brw, 
                                       brw_surface(brw->curr.fb.cbufs[i]), 
-                                      &brw->wm.surf_bo[nr_surfaces++]);
+                                      &brw->wm.surf_bo[BTI_COLOR_BUF(i)]);
       if (ret)
          return ret;
+      
+      nr_surfaces = BTI_COLOR_BUF(i) + 1;
+   }
+
+
+
+   /* PIPE_NEW_FRAGMENT_CONSTANTS
+    */
+#if 0
+   if (brw->curr.fragment_constants) {
+      ret = brw_update_fragment_constant_surface(
+         brw, 
+         brw->curr.fragment_constants, 
+         &brw->wm.surf_bo[BTI_FRAGMENT_CONSTANTS]);
+
+      if (ret)
+         return ret;
+
+      nr_surfaces = BTI_FRAGMENT_CONSTANTS + 1;
    }
+   else {
+      bo_reference(&brw->wm.surf_bo[SURF_FRAG_CONSTANTS], NULL);      
+   }
+#endif
+
 
    /* PIPE_NEW_TEXTURE 
     */
    for (i = 0; i < brw->curr.num_textures; i++) {
       ret = brw_update_texture_surface(brw, 
                                        brw_texture(brw->curr.texture[i]),
-                                       &brw->wm.surf_bo[nr_surfaces++]);
+                                       &brw->wm.surf_bo[BTI_TEXTURE(i)]);
       if (ret)
          return ret;
+
+      nr_surfaces = BTI_TEXTURE(i) + 1;
    }
 
-   /* PIPE_NEW_FRAGMENT_CONSTANTS
+   /* Clear any inactive entries:
     */
-#if 0
-   if (brw->curr.fragment_constants) {
-      ret = brw_update_fragment_constant_surface(brw, 
-                                                 brw->curr.fragment_constants, 
-                                                 &brw->wm.surf_bo[nr_surfaces++]);
-      if (ret)
-         return ret;
-   }
-#endif
+   for (i = brw->curr.fb.nr_cbufs; i < BRW_MAX_DRAW_BUFFERS; i++) 
+      bo_reference(&brw->wm.surf_bo[BTI_COLOR_BUF(i)], NULL);
 
-   if (brw->wm.nr_surfaces != nr_surfaces) {
+   if (!brw->curr.fragment_constants)
+      bo_reference(&brw->wm.surf_bo[BTI_FRAGMENT_CONSTANTS], NULL);      
 
-      /* Unreference any left-over old buffers
-       */
-      for (i = nr_surfaces; i < brw->wm.nr_surfaces; i++)
-         bo_reference(&brw->wm.surf_bo[i], NULL);
+   /* XXX: no pipe_max_textures define?? */
+   for (i = brw->curr.num_textures; i < PIPE_MAX_SAMPLERS; i++)
+      bo_reference(&brw->wm.surf_bo[BTI_TEXTURE(i)], NULL);
 
+   if (brw->wm.nr_surfaces != nr_surfaces) {
       brw->wm.nr_surfaces = nr_surfaces;
       brw->state.dirty.brw |= BRW_NEW_NR_WM_SURFACES;
    }
-- 
cgit v1.2.3