From 92fcbf6e7bc622dcace226bb70ff6d5cdbdbaecb Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Fri, 15 Feb 2008 20:07:18 +0900
Subject: Code reorganization: s/aux/auxiliary/.

"aux" is a reserved name on Windows (X_X)
---
 src/gallium/auxiliary/draw/Makefile             |   2 +
 src/gallium/auxiliary/draw/draw_clip.c          | 488 +++++++++++++++++++
 src/gallium/auxiliary/draw/draw_context.c       | 293 +++++++++++
 src/gallium/auxiliary/draw/draw_context.h       | 142 ++++++
 src/gallium/auxiliary/draw/draw_cull.c          | 150 ++++++
 src/gallium/auxiliary/draw/draw_debug.c         | 113 +++++
 src/gallium/auxiliary/draw/draw_flatshade.c     | 205 ++++++++
 src/gallium/auxiliary/draw/draw_offset.c        | 186 +++++++
 src/gallium/auxiliary/draw/draw_prim.c          | 482 +++++++++++++++++++
 src/gallium/auxiliary/draw/draw_private.h       | 346 +++++++++++++
 src/gallium/auxiliary/draw/draw_stipple.c       | 239 +++++++++
 src/gallium/auxiliary/draw/draw_twoside.c       | 203 ++++++++
 src/gallium/auxiliary/draw/draw_unfilled.c      | 206 ++++++++
 src/gallium/auxiliary/draw/draw_validate.c      | 185 +++++++
 src/gallium/auxiliary/draw/draw_vbuf.c          | 570 ++++++++++++++++++++++
 src/gallium/auxiliary/draw/draw_vbuf.h          | 106 ++++
 src/gallium/auxiliary/draw/draw_vertex.c        |  79 +++
 src/gallium/auxiliary/draw/draw_vertex.h        | 111 +++++
 src/gallium/auxiliary/draw/draw_vertex_cache.c  | 196 ++++++++
 src/gallium/auxiliary/draw/draw_vertex_fetch.c  | 510 ++++++++++++++++++++
 src/gallium/auxiliary/draw/draw_vertex_shader.c | 325 +++++++++++++
 src/gallium/auxiliary/draw/draw_vf.c            | 428 +++++++++++++++++
 src/gallium/auxiliary/draw/draw_vf.h            | 236 +++++++++
 src/gallium/auxiliary/draw/draw_vf_generic.c    | 585 ++++++++++++++++++++++
 src/gallium/auxiliary/draw/draw_vf_sse.c        | 614 ++++++++++++++++++++++++
 src/gallium/auxiliary/draw/draw_wide_prims.c    | 432 +++++++++++++++++
 26 files changed, 7432 insertions(+)
 create mode 100644 src/gallium/auxiliary/draw/Makefile
 create mode 100644 src/gallium/auxiliary/draw/draw_clip.c
 create mode 100644 src/gallium/auxiliary/draw/draw_context.c
 create mode 100644 src/gallium/auxiliary/draw/draw_context.h
 create mode 100644 src/gallium/auxiliary/draw/draw_cull.c
 create mode 100644 src/gallium/auxiliary/draw/draw_debug.c
 create mode 100644 src/gallium/auxiliary/draw/draw_flatshade.c
 create mode 100644 src/gallium/auxiliary/draw/draw_offset.c
 create mode 100644 src/gallium/auxiliary/draw/draw_prim.c
 create mode 100644 src/gallium/auxiliary/draw/draw_private.h
 create mode 100644 src/gallium/auxiliary/draw/draw_stipple.c
 create mode 100644 src/gallium/auxiliary/draw/draw_twoside.c
 create mode 100644 src/gallium/auxiliary/draw/draw_unfilled.c
 create mode 100644 src/gallium/auxiliary/draw/draw_validate.c
 create mode 100644 src/gallium/auxiliary/draw/draw_vbuf.c
 create mode 100644 src/gallium/auxiliary/draw/draw_vbuf.h
 create mode 100644 src/gallium/auxiliary/draw/draw_vertex.c
 create mode 100644 src/gallium/auxiliary/draw/draw_vertex.h
 create mode 100644 src/gallium/auxiliary/draw/draw_vertex_cache.c
 create mode 100644 src/gallium/auxiliary/draw/draw_vertex_fetch.c
 create mode 100644 src/gallium/auxiliary/draw/draw_vertex_shader.c
 create mode 100644 src/gallium/auxiliary/draw/draw_vf.c
 create mode 100644 src/gallium/auxiliary/draw/draw_vf.h
 create mode 100644 src/gallium/auxiliary/draw/draw_vf_generic.c
 create mode 100644 src/gallium/auxiliary/draw/draw_vf_sse.c
 create mode 100644 src/gallium/auxiliary/draw/draw_wide_prims.c

(limited to 'src/gallium/auxiliary/draw')

diff --git a/src/gallium/auxiliary/draw/Makefile b/src/gallium/auxiliary/draw/Makefile
new file mode 100644
index 0000000000..451911a354
--- /dev/null
+++ b/src/gallium/auxiliary/draw/Makefile
@@ -0,0 +1,2 @@
+default:  # delegate to the parent directory's Makefile
+	cd .. && $(MAKE)
diff --git a/src/gallium/auxiliary/draw/draw_clip.c b/src/gallium/auxiliary/draw/draw_clip.c
new file mode 100644
index 0000000000..e3051507ea
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_clip.c
@@ -0,0 +1,488 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * \brief  Clipping stage
+ *
+ * \author  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+
+#include "draw_context.h"
+#include "draw_private.h"
+
+
+#ifndef IS_NEGATIVE   /* strictly below zero: 0.0 itself is not negative */
+#define IS_NEGATIVE(X) ((X) < 0.0)
+#endif
+/* True when x*y <= 0 and x != y: opposite signs, or exactly one value is zero. */
+#ifndef DIFFERENT_SIGNS
+#define DIFFERENT_SIGNS(x, y) ((x) * (y) <= 0.0F && (x) - (y) != 0.0F)
+#endif
+/* Sized as 2 * (6 fixed planes + PIPE_MAX_CLIP_PLANES user planes) + 1. */
+#ifndef MAX_CLIPPED_VERTICES
+#define MAX_CLIPPED_VERTICES ((2 * (6 + PIPE_MAX_CLIP_PLANES))+1)
+#endif
+
+
+
+struct clipper {
+   struct draw_stage stage;      /**< base class; must stay first, see clipper_stage() */
+
+   /* Basically duplicate some of the flatshading logic here
+    * (filled in by clip_init_state()): */
+   boolean flat;
+   uint num_color_attribs;
+   uint color_attribs[4];  /* front/back primary/secondary colors */
+   /* clip plane equations; set to draw->plane in draw_clip_stage() */
+   float (*plane)[4];
+};
+
+
+/* Downcast a generic stage pointer to its clipper.  This works because
+ * 'stage' is the first member of struct clipper. */
+static INLINE struct clipper *clipper_stage( struct draw_stage *stage )
+{
+   return (struct clipper *)stage;
+}
+
+
+#define LINTERP(T, OUT, IN) ((OUT) + (T) * ((IN) - (OUT)))
+
+
+/* Blend two float[4] attributes component by component:
+ * fdst = fout + t * (fin - fout), so t == 0 gives fout and t == 1 gives fin. */
+static void interp_attr( float *fdst,
+                         float t,
+                         const float *fin,
+                         const float *fout )
+{
+   unsigned c;
+   for (c = 0; c < 4; c++)
+      fdst[c] = LINTERP( t, fout[c], fin[c] );
+   /* all attributes are float[4], hence the fixed count */
+}
+
+/* Copy the color attributes recorded by clip_init_state() from src to dst. */
+static void copy_colors( struct draw_stage *stage, struct vertex_header *dst,
+                         const struct vertex_header *src )
+{
+   const struct clipper *clip = clipper_stage(stage);
+   const uint *slot = clip->color_attribs;
+   const uint *const end = slot + clip->num_color_attribs;
+   for (; slot != end; slot++) {
+      COPY_4FV(dst->data[*slot], src->data[*slot]);
+   }
+}
+
+
+
+/* Interpolate between two vertices to produce a third:
+ * dst = out + t * (in - out), so t == 0 gives 'out' and t == 1 gives 'in'. */
+static void interp( const struct clipper *clip,
+		    struct vertex_header *dst,
+		    float t,
+		    const struct vertex_header *out, 
+		    const struct vertex_header *in )
+{
+   const unsigned nr_attrs = clip->stage.draw->num_vs_outputs;
+   unsigned j;
+
+   /* Vertex header: the new vertex starts with no clip bits, no edge flag
+    * and no vertex id. */
+   {
+      dst->clipmask = 0;
+      dst->edgeflag = 0;
+      dst->pad = 0;
+      dst->vertex_id = UNDEFINED_VERTEX_ID;
+   }
+
+   /* Clip coordinates:  interpolate normally
+    */
+   {
+      interp_attr(dst->clip, t, in->clip, out->clip);
+   }
+
+   /* Do the projective divide and insert window coordinates:
+    * NOTE(review): nothing here guards against pos[3] == 0. */
+   {
+      const float *pos = dst->clip;
+      const float *scale = clip->stage.draw->viewport.scale;
+      const float *trans = clip->stage.draw->viewport.translate;
+      const float oow = 1.0f / pos[3];
+
+      dst->data[0][0] = pos[0] * oow * scale[0] + trans[0];
+      dst->data[0][1] = pos[1] * oow * scale[1] + trans[1];
+      dst->data[0][2] = pos[2] * oow * scale[2] + trans[2];
+      dst->data[0][3] = oow;
+   }
+
+   /* Other attributes
+    * Note: start at 1 to skip winpos (data[0]) since we just computed
+    * it above.
+    */
+   for (j = 1; j < nr_attrs; j++) {
+      interp_attr(dst->data[j], t, in->data[j], out->data[j]);
+   }
+}
+
+/* Emit the n-vertex polygon in inlist[] as a fan of triangles around inlist[0]. */
+static void emit_poly( struct draw_stage *stage,
+		       struct vertex_header **inlist,
+		       unsigned n,
+		       const struct prim_header *origPrim)
+{
+   struct prim_header header;
+   unsigned i;
+
+   /* later stages may need the determinant, but only the sign matters */
+   header.det = origPrim->det;
+
+   for (i = 2; i < n; i++) {
+      header.v[0] = inlist[i-1];
+      header.v[1] = inlist[i];
+      header.v[2] = inlist[0];	/* keep in v[2] for flatshading */
+      /* v[1]'s edgeflag is kept only on the last triangle, v[2]'s only on the first: */
+      {
+	 unsigned tmp1 = header.v[1]->edgeflag;
+	 unsigned tmp2 = header.v[2]->edgeflag;
+
+	 if (i != n-1) header.v[1]->edgeflag = 0;
+	 if (i != 2)   header.v[2]->edgeflag = 0;
+
+         header.edgeflags = ((header.v[0]->edgeflag << 0) | 
+                             (header.v[1]->edgeflag << 1) | 
+                             (header.v[2]->edgeflag << 2));
+
+	 stage->next->tri( stage->next, &header );
+	 /* put back the vertex flags that were cleared for this triangle */
+	 header.v[1]->edgeflag = tmp1;
+	 header.v[2]->edgeflag = tmp2;
+      }
+   }
+}
+
+
+
+
+/* Clip a triangle against every plane whose bit is set in clipmask, then
+ * emit what is left (if anything) to the next stage as a fan of triangles. */
+static void
+do_clip_tri( struct draw_stage *stage,
+             struct prim_header *header,
+             unsigned clipmask )
+{
+   struct clipper *clipper = clipper_stage( stage );
+   struct vertex_header *a[MAX_CLIPPED_VERTICES];
+   struct vertex_header *b[MAX_CLIPPED_VERTICES];
+   struct vertex_header **inlist = a;
+   struct vertex_header **outlist = b;
+   unsigned tmpnr = 0;   /* next unused entry of stage.tmp[] */
+   unsigned n = 3;       /* vertex count of the polygon in inlist[] */
+   unsigned i;
+
+   inlist[0] = header->v[0];
+   inlist[1] = header->v[1];
+   inlist[2] = header->v[2];
+
+   while (clipmask && n >= 3) {
+      const unsigned plane_idx = ffs(clipmask)-1;
+      const float *plane = clipper->plane[plane_idx];
+      struct vertex_header *vert_prev = inlist[0];
+      float dp_prev = dot4( vert_prev->clip, plane );
+      unsigned outcount = 0;
+
+      clipmask &= ~(1<<plane_idx);
+
+      inlist[n] = inlist[0]; /* prevent rotation of vertices */
+
+      for (i = 1; i <= n; i++) {
+         struct vertex_header *vert = inlist[i];
+
+         float dp = dot4( vert->clip, plane );
+
+         if (!IS_NEGATIVE(dp_prev)) {
+            outlist[outcount++] = vert_prev;
+         }
+
+         if (DIFFERENT_SIGNS(dp, dp_prev)) {
+            struct vertex_header *new_vert = clipper->stage.tmp[tmpnr++];
+            outlist[outcount++] = new_vert;
+
+            if (IS_NEGATIVE(dp)) {
+               /* Going out of bounds.  Avoid division by zero as we
+                * know dp != dp_prev from DIFFERENT_SIGNS, above.
+                */
+               float t = dp / (dp - dp_prev);
+               interp( clipper, new_vert, t, vert, vert_prev );
+
+               /* Force edgeflag true in this case:
+                */
+               new_vert->edgeflag = 1;
+            } else {
+               /* Coming back in.
+                */
+               float t = dp_prev / (dp_prev - dp);
+               interp( clipper, new_vert, t, vert_prev, vert );
+
+               /* Copy starting vert's edgeflag:
+                */
+               new_vert->edgeflag = vert_prev->edgeflag;
+            }
+         }
+
+         vert_prev = vert;
+         dp_prev = dp;
+      }
+
+      {
+         struct vertex_header **tmp = inlist;
+         inlist = outlist;
+         outlist = tmp;
+         n = outcount;
+      }
+   }
+
+   /* Fewer than three vertices left: nothing to draw.  Return before the
+    * flat-shading fixup, which would otherwise duplicate a vertex (and may
+    * read a stale inlist[0]) for a polygon that is never emitted.
+    */
+   if (n < 3)
+      return;
+
+   /* If flat-shading, copy color to new provoking vertex.
+    */
+   if (clipper->flat && inlist[0] != header->v[2]) {
+      inlist[0] = dup_vert(stage, inlist[0], tmpnr++);
+      copy_colors(stage, inlist[0], header->v[2]);
+   }
+
+   /* Emit the polygon as triangles to the setup stage: */
+   emit_poly( stage, inlist, n, header );
+}
+
+
+/* Clip a line against the planes in clipmask; t0/t1 are the fractions to
+ * trim from the v0 and v1 ends.  The line is dropped if nothing remains. */
+static void
+do_clip_line( struct draw_stage *stage,
+	      struct prim_header *header,
+	      unsigned clipmask )
+{
+   const struct clipper *clipper = clipper_stage( stage );
+   struct vertex_header *v0 = header->v[0];
+   struct vertex_header *v1 = header->v[1];
+   const float *pos0 = v0->clip;
+   const float *pos1 = v1->clip;
+   float t0 = 0.0F;
+   float t1 = 0.0F;
+   struct prim_header newprim;   /* NOTE(review): only v[0] and v[1] are filled in below */
+
+   while (clipmask) {
+      const unsigned plane_idx = ffs(clipmask)-1;
+      const float *plane = clipper->plane[plane_idx];
+      const float dp0 = dot4( pos0, plane );
+      const float dp1 = dot4( pos1, plane );
+
+      if (dp1 < 0.0F) {
+	 float t = dp1 / (dp1 - dp0);
+         t1 = MAX2(t1, t);
+      } 
+
+      if (dp0 < 0.0F) {
+	 float t = dp0 / (dp0 - dp1);
+         t0 = MAX2(t0, t);
+      }
+
+      if (t0 + t1 >= 1.0F)
+	 return; /* discard */
+
+      clipmask &= ~(1 << plane_idx);  /* turn off this plane's bit */
+   }
+   /* Replace each clipped endpoint with an interpolated temporary vertex. */
+   if (v0->clipmask) {
+      interp( clipper, stage->tmp[0], t0, v0, v1 );
+
+      if (clipper->flat)
+	 copy_colors(stage, stage->tmp[0], v0);
+
+      newprim.v[0] = stage->tmp[0];
+   }
+   else {
+      newprim.v[0] = v0;
+   }
+
+   if (v1->clipmask) {
+      interp( clipper, stage->tmp[1], t1, v1, v0 );
+      newprim.v[1] = stage->tmp[1];
+   }
+   else {
+      newprim.v[1] = v1;
+   }
+
+   stage->next->line( stage->next, &newprim );
+}
+
+/* Points are never split: forward the point only if its clipmask is zero. */
+static void
+clip_point( struct draw_stage *stage, 
+	    struct prim_header *header )
+{
+   if (header->v[0]->clipmask == 0) 
+      stage->next->point( stage->next, header );
+}
+
+
+/* Line entry point: pass through, clip, or trivially reject a line. */
+static void
+clip_line( struct draw_stage *stage, struct prim_header *header )
+{
+   const unsigned mask0 = header->v[0]->clipmask;
+   const unsigned mask1 = header->v[1]->clipmask;
+
+   if ((mask0 | mask1) == 0) {
+      /* neither endpoint has a clip bit set: no clipping needed */
+      stage->next->line( stage->next, header );
+      return;
+   }
+
+   /* a plane bit set in both endpoints means the line is totally clipped */
+   if ((mask0 & mask1) == 0)
+      do_clip_line(stage, header, mask0 | mask1);
+}
+
+
+/* Triangle entry point: pass through, clip, or trivially reject. */
+static void
+clip_tri( struct draw_stage *stage, struct prim_header *header )
+{
+   const unsigned mask0 = header->v[0]->clipmask;
+   const unsigned mask1 = header->v[1]->clipmask;
+   const unsigned mask2 = header->v[2]->clipmask;
+   const unsigned any_out = mask0 | mask1 | mask2;
+
+   if (any_out == 0) {
+      /* no vertex has a clip bit set: no clipping needed */
+      stage->next->tri( stage->next, header );
+      return;
+   }
+   /* a plane bit set in all three vertices means nothing is visible */
+   if ((mask0 & mask1 & mask2) == 0)
+      do_clip_tri(stage, header, any_out);
+}
+
+/* Refresh the clipper's cached state and install the real tri/line handlers.
+ * Called lazily from clip_first_tri()/clip_first_line().  When flat shading
+ * is on, record which vertex shader outputs are colors for copy_colors(). */
+static void
+clip_init_state( struct draw_stage *stage )
+{
+   struct clipper *clipper = clipper_stage( stage );
+   const uint max_colors = sizeof(clipper->color_attribs) / sizeof(clipper->color_attribs[0]);
+
+   clipper->flat = stage->draw->rasterizer->flatshade ? TRUE : FALSE;
+
+   if (clipper->flat) {
+      const struct pipe_shader_state *vs = stage->draw->vertex_shader->state;
+      uint i;
+
+      clipper->num_color_attribs = 0;
+      /* bounded so extra color outputs cannot overflow color_attribs[] */
+      for (i = 0; i < vs->num_outputs && clipper->num_color_attribs < max_colors; i++) {
+         if (vs->output_semantic_name[i] == TGSI_SEMANTIC_COLOR ||
+             vs->output_semantic_name[i] == TGSI_SEMANTIC_BCOLOR)
+            clipper->color_attribs[clipper->num_color_attribs++] = i;
+      }
+   }
+   stage->tri = clip_tri;
+   stage->line = clip_line;
+}
+
+
+/* Handler for the first triangle after a flush: refresh state, then re-dispatch. */
+static void clip_first_tri( struct draw_stage *stage,
+			    struct prim_header *header )
+{
+   clip_init_state( stage );   /* also points stage->tri at clip_tri */
+   stage->tri( stage, header );
+}
+
+static void clip_first_line( struct draw_stage *stage,
+			     struct prim_header *header )
+{
+   clip_init_state( stage );   /* refresh state; points stage->line at clip_line */
+   stage->line( stage, header );   /* re-dispatch through the new handler */
+}
+
+/* Re-arm the lazy state update for the next primitive, then flush the next stage. */
+static void clip_flush( struct draw_stage *stage, 
+			     unsigned flags )
+{
+   stage->tri = clip_first_tri;
+   stage->line = clip_first_line;
+   stage->next->flush( stage->next, flags );
+}
+
+/* struct clipper holds no stipple state; pass the request to the next stage. */
+static void clip_reset_stipple_counter( struct draw_stage *stage )
+{
+   stage->next->reset_stipple_counter( stage->next );
+}
+
+/* Free the stage's temporary vertices, then the clipper itself. */
+static void clip_destroy( struct draw_stage *stage )
+{
+   draw_free_temp_verts( stage );
+   FREE( stage );
+}
+
+
+/**
+ * Allocate a new clipper stage.
+ * \return pointer to new stage object, or NULL if out of memory
+ */
+struct draw_stage *draw_clip_stage( struct draw_context *draw )
+{
+   struct clipper *clipper = CALLOC_STRUCT(clipper);
+   if (!clipper)
+      return NULL;
+
+   draw_alloc_temp_verts( &clipper->stage, MAX_CLIPPED_VERTICES+1 );
+   clipper->stage.draw = draw;
+   clipper->stage.point = clip_point;
+   clipper->stage.line = clip_first_line;
+   clipper->stage.tri = clip_first_tri;
+   clipper->stage.flush = clip_flush;
+   clipper->stage.reset_stipple_counter = clip_reset_stipple_counter;
+   clipper->stage.destroy = clip_destroy;
+
+   clipper->plane = draw->plane;   /* shares the context's plane table */
+   return &clipper->stage;
+}
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
new file mode 100644
index 0000000000..4be3830316
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -0,0 +1,293 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#include "pipe/p_util.h"
+#include "draw_context.h"
+#include "draw_private.h"
+
+
+
+/* Create a draw context: vertex cache storage, pipeline stages and the six
+ * fixed clip planes.  Returns NULL if the context or the vertex storage
+ * cannot be allocated (nothing is leaked in that case). */
+struct draw_context *draw_create( void )
+{
+   struct draw_context *draw = CALLOC_STRUCT( draw_context );
+   if (!draw)
+      return NULL;
+
+   /* Statically allocate maximum sized vertices for the cache - could be
+    * cleverer...  Done before the stages exist so failure is easy to undo. */
+   {
+      uint i;
+      const unsigned size = (MAX_VERTEX_SIZE + 0x0f) & ~0x0f;
+      char *tmp = align_malloc(Elements(draw->vcache.vertex) * size, 16);
+      if (!tmp) {
+         FREE( draw );
+         return NULL;
+      }
+      for (i = 0; i < Elements(draw->vcache.vertex); i++)
+         draw->vcache.vertex[i] = (struct vertex_header *)(tmp + i * size);
+   }
+#if defined(__i386__) || defined(__386__)
+   draw->use_sse = GETENV( "GALLIUM_NOSSE" ) == NULL;
+#else
+   draw->use_sse = FALSE;
+#endif
+   /* create pipeline stages */
+   draw->pipeline.wide      = draw_wide_stage( draw );
+   draw->pipeline.stipple   = draw_stipple_stage( draw );
+   draw->pipeline.unfilled  = draw_unfilled_stage( draw );
+   draw->pipeline.twoside   = draw_twoside_stage( draw );
+   draw->pipeline.offset    = draw_offset_stage( draw );
+   draw->pipeline.clip      = draw_clip_stage( draw );
+   draw->pipeline.flatshade = draw_flatshade_stage( draw );
+   draw->pipeline.cull      = draw_cull_stage( draw );
+   draw->pipeline.validate  = draw_validate_stage( draw );
+   draw->pipeline.first     = draw->pipeline.validate;
+   ASSIGN_4V( draw->plane[0], -1,  0,  0, 1 );
+   ASSIGN_4V( draw->plane[1],  1,  0,  0, 1 );
+   ASSIGN_4V( draw->plane[2],  0, -1,  0, 1 );
+   ASSIGN_4V( draw->plane[3],  0,  1,  0, 1 );
+   ASSIGN_4V( draw->plane[4],  0,  0,  1, 1 ); /* yes these are correct */
+   ASSIGN_4V( draw->plane[5],  0,  0, -1, 1 ); /* mesa's a bit wonky */
+   draw->nr_planes = 6;
+   draw->shader_queue_flush = draw_vertex_shader_queue_flush;
+   draw->convert_wide_points = TRUE;
+   draw->convert_wide_lines = TRUE;
+   draw->reduced_prim = ~0; /* != any of PIPE_PRIM_x */
+   draw_vertex_cache_invalidate( draw );
+   draw_set_mapped_element_buffer( draw, 0, NULL );
+   return draw;
+}
+
+/* Destroy every pipeline stage, then free the remaining context storage. */
+void draw_destroy( struct draw_context *draw )
+{
+   draw->pipeline.wide->destroy( draw->pipeline.wide );
+   draw->pipeline.stipple->destroy( draw->pipeline.stipple );
+   draw->pipeline.unfilled->destroy( draw->pipeline.unfilled );
+   draw->pipeline.twoside->destroy( draw->pipeline.twoside );
+   draw->pipeline.offset->destroy( draw->pipeline.offset );
+   draw->pipeline.clip->destroy( draw->pipeline.clip );
+   draw->pipeline.flatshade->destroy( draw->pipeline.flatshade );
+   draw->pipeline.cull->destroy( draw->pipeline.cull );
+   draw->pipeline.validate->destroy( draw->pipeline.validate );
+   if (draw->pipeline.rasterize)   /* only set via draw_set_rasterize_stage() */
+      draw->pipeline.rasterize->destroy( draw->pipeline.rasterize );
+   tgsi_exec_machine_free_data(&draw->machine);
+   align_free( draw->vcache.vertex[0] ); /* Frees all the vertices. */
+   FREE( draw );
+}
+
+
+/* Flush the draw module, passing the DRAW_FLUSH_BACKEND flag. */
+void draw_flush( struct draw_context *draw )
+{
+   draw_do_flush( draw, DRAW_FLUSH_BACKEND );
+}
+
+
+
+/**
+ * Register new primitive rasterization/rendering state.
+ * This causes the drawing pipeline to be rebuilt.
+ */
+void draw_set_rasterizer_state( struct draw_context *draw,
+                                const struct pipe_rasterizer_state *raster )
+{
+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+   /* only the pointer is stored; the state itself is not copied */
+   draw->rasterizer = raster;
+}
+
+
+/** 
+ * Plug in the primitive rendering/rasterization stage (which is the last
+ * stage in the drawing pipeline).
+ * This is provided by the device driver.
+ */
+void draw_set_rasterize_stage( struct draw_context *draw,
+                               struct draw_stage *stage )
+{
+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+   /* stored as-is; draw_destroy() later calls this stage's destroy() hook */
+   draw->pipeline.rasterize = stage;
+}
+
+
+/**
+ * Set the draw module's clipping state.  User planes are stored after the six
+ * fixed planes; the count is clamped even when asserts are compiled out. */
+void draw_set_clip_state( struct draw_context *draw,
+                          const struct pipe_clip_state *clip )
+{
+   const unsigned nr = clip->nr <= PIPE_MAX_CLIP_PLANES ? clip->nr : PIPE_MAX_CLIP_PLANES;
+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+   assert(clip->nr <= PIPE_MAX_CLIP_PLANES);
+   memcpy(&draw->plane[6], clip->ucp, nr * sizeof(clip->ucp[0]));
+   draw->nr_planes = 6 + nr;
+}
+
+
+/**
+ * Set the draw module's viewport state (flushes, then copies *viewport).
+ */
+void draw_set_viewport_state( struct draw_context *draw,
+                              const struct pipe_viewport_state *viewport )
+{
+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+   draw->viewport = *viewport; /* struct copy */
+}
+
+
+/* Flush (DRAW_FLUSH_VERTEX_CACHE), then store a copy of *buffer in slot 'attr'. */
+void
+draw_set_vertex_buffer(struct draw_context *draw,
+                       unsigned attr,
+                       const struct pipe_vertex_buffer *buffer)
+{
+   draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE/*STATE_CHANGE*/ );
+   assert(attr < PIPE_ATTRIB_MAX);
+   draw->vertex_buffer[attr] = *buffer;
+}
+
+/* Flush (DRAW_FLUSH_VERTEX_CACHE), then store a copy of *element in slot 'attr'. */
+void
+draw_set_vertex_element(struct draw_context *draw,
+                        unsigned attr,
+                        const struct pipe_vertex_element *element)
+{
+   draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE/*STATE_CHANGE*/ );
+   assert(attr < PIPE_ATTRIB_MAX);
+   draw->vertex_element[attr] = *element;
+}
+
+
+/** Tell drawing context where to find the mapped vertex buffer for slot
+ * 'attr'.  Only the pointer is stored; the data is not copied. */
+void
+draw_set_mapped_vertex_buffer(struct draw_context *draw,
+                              unsigned attr, const void *buffer)
+{
+   draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE/*STATE_CHANGE*/ );
+   assert(attr < PIPE_ATTRIB_MAX);   /* same bound as draw_set_vertex_buffer() */
+   draw->user.vbuffer[attr] = buffer;
+}
+
+/* Flush, then remember where the mapped constants live (pointer only, no copy). */
+void
+draw_set_mapped_constant_buffer(struct draw_context *draw,
+                                const void *buffer)
+{
+   draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE/*STATE_CHANGE*/ );
+   draw->user.constants = buffer;
+}
+
+
+/**
+ * Tells the draw module whether to convert wide points (size != 1)
+ * into triangles.  draw_create() turns this on by default.
+ */
+void
+draw_convert_wide_points(struct draw_context *draw, boolean enable)
+{
+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+   draw->convert_wide_points = enable;
+}
+
+
+/**
+ * Tells the draw module whether to convert wide lines (width != 1)
+ * into triangles.  draw_create() turns this on by default.
+ */
+void
+draw_convert_wide_lines(struct draw_context *draw, boolean enable)
+{
+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+   draw->convert_wide_lines = enable;
+}
+
+
+/** Allocate 'nr' temporary post-transform vertices (e.g. for clipping).
+ * On allocation failure stage->tmp stays NULL and stage->nr_tmps is 0. */
+void draw_alloc_temp_verts( struct draw_stage *stage, unsigned nr )
+{
+   ubyte *store = nr ? (ubyte *) MALLOC( MAX_VERTEX_SIZE * nr ) : NULL;
+   unsigned i;
+   assert(!stage->tmp);
+   stage->nr_tmps = 0;
+   if (!store)
+      return;   /* nothing requested, or out of memory */
+   stage->tmp = (struct vertex_header **) MALLOC( sizeof(struct vertex_header *) * nr );
+   if (!stage->tmp) {
+      FREE( store );
+      return;
+   }
+   for (i = 0; i < nr; i++)
+      stage->tmp[i] = (struct vertex_header *)(store + i * MAX_VERTEX_SIZE);
+   stage->nr_tmps = nr;
+}
+
+/* Free the temporaries allocated by draw_alloc_temp_verts(). */
+void draw_free_temp_verts( struct draw_stage *stage )
+{
+   if (stage->tmp) {
+      FREE( stage->tmp[0] );   /* tmp[0] is the start of the single vertex block */
+      FREE( stage->tmp );
+      stage->tmp = NULL;   /* NOTE(review): stage->nr_tmps is left unchanged */
+   }
+}
+
+/* Report the use_sse flag chosen in draw_create(). */
+boolean draw_use_sse(struct draw_context *draw)
+{
+   return (boolean) draw->use_sse;
+}
+
+
+/* Mark every pipeline stage's temporary vertices, and then the vertex
+ * cache, as having no assigned vertex id. */
+void draw_reset_vertex_ids(struct draw_context *draw)
+{
+   struct draw_stage *s;
+
+   /* walk the stage chain starting at pipeline.first */
+   for (s = draw->pipeline.first; s; s = s->next) {
+      unsigned k;
+      for (k = 0; k < s->nr_tmps; k++)
+         s->tmp[k]->vertex_id = UNDEFINED_VERTEX_ID;
+   }
+
+   draw_vertex_cache_reset_vertex_ids(draw);
+}
diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h
new file mode 100644
index 0000000000..ddeb184497
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -0,0 +1,142 @@
+
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * \brief  Public interface into the drawing module.
+ */
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#ifndef DRAW_CONTEXT_H
+#define DRAW_CONTEXT_H
+
+
+#include "pipe/p_state.h"
+
+
+struct vertex_buffer;
+struct vertex_info;
+struct draw_context;
+struct draw_stage;
+struct draw_vertex_shader;
+
+
+/**
+ * Clipmask flags (the clipper maps bit N to clip plane N)
+ */
+/*@{*/
+#define CLIP_RIGHT_BIT   0x01
+#define CLIP_LEFT_BIT    0x02
+#define CLIP_TOP_BIT     0x04
+#define CLIP_BOTTOM_BIT  0x08
+#define CLIP_NEAR_BIT    0x10
+#define CLIP_FAR_BIT     0x20
+/*@}*/
+
+/**
+ * Bitshift for each clip flag (CLIP_x_BIT == 1 << CLIP_x_SHIFT)
+ */
+/*@{*/
+#define CLIP_RIGHT_SHIFT 	0
+#define CLIP_LEFT_SHIFT 	1
+#define CLIP_TOP_SHIFT  	2
+#define CLIP_BOTTOM_SHIFT       3
+#define CLIP_NEAR_SHIFT  	4
+#define CLIP_FAR_SHIFT  	5
+/*@}*/
+
+
+struct draw_context *draw_create( void );
+/* Destroys the context together with all of its pipeline stages. */
+void draw_destroy( struct draw_context *draw );
+/* State setters (defined in draw_context.c). */
+void draw_set_viewport_state( struct draw_context *draw,
+                              const struct pipe_viewport_state *viewport );
+
+void draw_set_clip_state( struct draw_context *draw,
+                          const struct pipe_clip_state *clip );
+
+void draw_set_rasterizer_state( struct draw_context *draw,
+                                const struct pipe_rasterizer_state *raster );
+/* The stage passed here is destroyed by draw_destroy(). */
+void draw_set_rasterize_stage( struct draw_context *draw,
+                               struct draw_stage *stage );
+/* Enable/disable conversion of wide points and wide lines into triangles. */
+void draw_convert_wide_points(struct draw_context *draw, boolean enable);
+
+void draw_convert_wide_lines(struct draw_context *draw, boolean enable);
+
+
+struct draw_vertex_shader *
+draw_create_vertex_shader(struct draw_context *draw,
+                          const struct pipe_shader_state *shader);
+void draw_bind_vertex_shader(struct draw_context *draw,
+                             struct draw_vertex_shader *dvs);
+void draw_delete_vertex_shader(struct draw_context *draw,
+                               struct draw_vertex_shader *dvs);
+/* Returns the use_sse flag chosen in draw_create(). */
+boolean draw_use_sse(struct draw_context *draw);
+/* Vertex buffer / element state is copied by value; attr must be < PIPE_ATTRIB_MAX. */
+void draw_set_vertex_buffer(struct draw_context *draw,
+			    unsigned attr,
+			    const struct pipe_vertex_buffer *buffer);
+
+void draw_set_vertex_element(struct draw_context *draw,
+			     unsigned attr,
+			     const struct pipe_vertex_element *element);
+
+void draw_set_mapped_element_buffer( struct draw_context *draw,
+                                     unsigned eltSize, void *elements );
+/* For the two setters below only the pointer is stored; nothing is copied. */
+void draw_set_mapped_vertex_buffer(struct draw_context *draw,
+                                   unsigned attr, const void *buffer);
+
+void draw_set_mapped_constant_buffer(struct draw_context *draw,
+                                     const void *buffer);
+
+
+/***********************************************************************
+ * draw_prim.c 
+ */
+
+void draw_arrays(struct draw_context *draw, unsigned prim,
+		 unsigned start, unsigned count);
+/* NOTE: draw_flush() is defined in draw_context.c, not draw_prim.c. */
+void draw_flush(struct draw_context *draw);
+
+/***********************************************************************
+ * draw_debug.c 
+ */
+boolean draw_validate_prim( unsigned prim, unsigned length );
+unsigned draw_trim_prim( unsigned mode, unsigned count );
+
+
+
+#endif /* DRAW_CONTEXT_H */
diff --git a/src/gallium/auxiliary/draw/draw_cull.c b/src/gallium/auxiliary/draw/draw_cull.c
new file mode 100644
index 0000000000..8177b0ac86
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_cull.c
@@ -0,0 +1,150 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * \brief  Drawing stage for polygon culling
+ */
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "pipe/p_util.h"
+#include "pipe/p_defines.h"
+#include "draw_private.h"
+
+
+struct cull_stage {
+   struct draw_stage stage;  /**< base stage; cull_stage() casts a pointer to it */
+   unsigned winding;  /**< which winding(s) to cull (one of PIPE_WINDING_x) */
+};
+
+
+static INLINE struct cull_stage *cull_stage( struct draw_stage *stage )
+{
+   return (struct cull_stage *)stage;   /* downcast from the embedded base stage */
+}
+
+
+
+
+static void cull_tri( struct draw_stage *stage,
+                      struct prim_header *header )
+{
+   /* Window-space positions of the three corners (attribute slot 0). */
+   const float *a = header->v[0]->data[0];
+   const float *b = header->v[1]->data[0];
+   const float *c = header->v[2]->data[0];
+   unsigned facing;
+
+   /* Edge vectors relative to the third vertex: e = a - c, f = b - c. */
+   const float ex = a[0] - c[0];
+   const float ey = a[1] - c[1];
+   const float fx = b[0] - c[0];
+   const float fy = b[1] - c[1];
+
+   /* det = cross(e,f).z, stored in the header for the following stages. */
+   header->det = ex * fy - ey * fx;
+
+   /* Zero-area triangles are dropped whatever the cull mode is. */
+   if (header->det == 0)
+      return;
+
+   /* A negative determinant is classified as counter-clockwise winding. */
+   facing = (header->det < 0) ? PIPE_WINDING_CCW : PIPE_WINDING_CW;
+
+   /* Forward only when this winding is not selected for culling. */
+   if ((facing & cull_stage(stage)->winding) == 0)
+      stage->next->tri( stage->next, header );
+}
+
+static void cull_first_tri( struct draw_stage *stage,
+                            struct prim_header *header )
+{
+   /* Installed at creation and after every flush: latch the current
+    * cull mode, then switch to cull_tri for this and later triangles.
+    */
+   cull_stage(stage)->winding = stage->draw->rasterizer->cull_mode;
+   stage->tri = cull_tri;
+   cull_tri( stage, header );
+}
+
+
+
+static void cull_line( struct draw_stage *stage, struct prim_header *header )
+{
+   struct draw_stage *next = stage->next;   /* lines are passed on unculled */
+   next->line( next, header );
+}
+
+
+static void cull_point( struct draw_stage *stage, struct prim_header *header )
+{
+   struct draw_stage *next = stage->next;   /* points are passed on unculled */
+   next->point( next, header );
+}
+
+
+static void cull_flush( struct draw_stage *stage, unsigned flags )
+{
+   stage->tri = cull_first_tri;   /* re-latch cull_mode on the next triangle */
+   stage->next->flush( stage->next, flags );   /* pass the flush downstream */
+}
+
+static void cull_reset_stipple_counter( struct draw_stage *stage )
+{
+   stage->next->reset_stipple_counter( stage->next );   /* no state here; forward */
+}
+
+
+static void cull_destroy( struct draw_stage *stage )
+{
+   draw_free_temp_verts( stage );   /* counterpart of draw_alloc_temp_verts() */
+   FREE( stage );   /* the cull_stage allocated in draw_cull_stage() */
+}
+
+
+/**
+ * Create a new polygon culling stage.  Returns NULL if out of memory.
+ */
+struct draw_stage *draw_cull_stage( struct draw_context *draw )
+{
+   struct cull_stage *cull = CALLOC_STRUCT(cull_stage);
+   if (!cull)
+      return NULL;   /* allocation failed; nothing to clean up */
+
+   draw_alloc_temp_verts( &cull->stage, 0 );   /* culling needs no temp verts */
+   cull->stage.draw = draw;
+   cull->stage.next = NULL;
+   cull->stage.point = cull_point;
+   cull->stage.line = cull_line;
+   cull->stage.tri = cull_first_tri;   /* latches cull_mode on first use */
+   cull->stage.flush = cull_flush;
+   cull->stage.reset_stipple_counter = cull_reset_stipple_counter;
+   cull->stage.destroy = cull_destroy;
+   return &cull->stage;
+}
diff --git a/src/gallium/auxiliary/draw/draw_debug.c b/src/gallium/auxiliary/draw/draw_debug.c
new file mode 100644
index 0000000000..d6220b5f62
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_debug.c
@@ -0,0 +1,113 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "draw_private.h"
+#include "draw_context.h"
+
+
+
+static void
+draw_prim_info(unsigned prim, unsigned *first, unsigned *incr)
+{
+   /* Vertices needed for the first primitive / for each further one. */
+   unsigned min_verts, step;
+
+   assert(prim >= PIPE_PRIM_POINTS);
+   assert(prim <= PIPE_PRIM_POLYGON);
+
+   switch (prim) {
+   case PIPE_PRIM_POINTS:
+      min_verts = 1;
+      step = 1;
+      break;
+   case PIPE_PRIM_LINES:
+      min_verts = 2;
+      step = 2;
+      break;
+   case PIPE_PRIM_LINE_STRIP:
+   case PIPE_PRIM_LINE_LOOP:
+      min_verts = 2;
+      step = 1;
+      break;
+   case PIPE_PRIM_TRIANGLES:
+      min_verts = 3;
+      step = 3;
+      break;
+   case PIPE_PRIM_TRIANGLE_STRIP:
+   case PIPE_PRIM_TRIANGLE_FAN:
+   case PIPE_PRIM_POLYGON:
+      min_verts = 3;
+      step = 1;
+      break;
+   case PIPE_PRIM_QUADS:
+      min_verts = 4;
+      step = 4;
+      break;
+   case PIPE_PRIM_QUAD_STRIP:
+      min_verts = 4;
+      step = 2;
+      break;
+   default:
+      assert(0);
+      min_verts = 1;
+      step = 1;
+      break;
+   }
+
+   *first = min_verts;
+   *incr = step;
+}
+
+
+unsigned
+draw_trim_prim( unsigned mode, unsigned count )
+{
+   unsigned first, incr;
+
+   draw_prim_info( mode, &first, &incr );
+
+   /* Too few vertices for even one primitive: nothing is drawable. */
+   if (count < first)
+      return 0;
+
+   /* Drop the trailing vertices that do not complete a primitive. */
+   return count - (count - first) % incr;
+}
+
+
+boolean
+draw_validate_prim( unsigned mode, unsigned count )
+{
+   /* Valid means non-empty and already a whole number of primitives. */
+   return count != 0 && draw_trim_prim( mode, count ) == count;
+}
+
diff --git a/src/gallium/auxiliary/draw/draw_flatshade.c b/src/gallium/auxiliary/draw/draw_flatshade.c
new file mode 100644
index 0000000000..4398abbc60
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_flatshade.c
@@ -0,0 +1,205 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "draw_private.h"
+
+
+/** subclass of draw_stage */
+struct flat_stage
+{
+   struct draw_stage stage;  /* base stage; flat_stage() casts a pointer to it */
+
+   uint num_color_attribs;  /* number of entries filled in color_attribs[] */
+   uint color_attribs[4];  /* front/back primary/secondary colors */
+};
+
+
+static INLINE struct flat_stage *
+flat_stage(struct draw_stage *stage)
+{
+   return (struct flat_stage *) stage;   /* downcast from the embedded base stage */
+}
+
+
+/** Propagate every listed color attribute of 'src' into 'dst'. */
+static INLINE void copy_colors( struct draw_stage *stage,
+                                struct vertex_header *dst,
+                                const struct vertex_header *src )
+{
+   const struct flat_stage *flat = flat_stage(stage);
+   const uint *slot = flat->color_attribs;
+   const uint *end = slot + flat->num_color_attribs;
+   for (; slot != end; slot++) {
+      COPY_4FV(dst->data[*slot], src->data[*slot]);
+   }
+}
+
+
+/** Propagate every listed color attribute of 'src' into 'dst0' and 'dst1'. */
+static INLINE void copy_colors2( struct draw_stage *stage,
+                                 struct vertex_header *dst0,
+                                 struct vertex_header *dst1,
+                                 const struct vertex_header *src )
+{
+   const struct flat_stage *flat = flat_stage(stage);
+   const uint *slot = flat->color_attribs;
+   const uint *end = slot + flat->num_color_attribs;
+   for (; slot != end; slot++) {
+      COPY_4FV(dst0->data[*slot], src->data[*slot]);
+      COPY_4FV(dst1->data[*slot], src->data[*slot]);
+   }
+}
+
+
+/**
+ * Flatshade tri: the dup_vert() results for v[0] and v[1] receive the
+ * colors of v[2].  Required for clipping and unfilled tris.
+ */
+static void flatshade_tri( struct draw_stage *stage,
+                           struct prim_header *header )
+{
+   struct vertex_header *last = header->v[2];
+   struct prim_header out;
+
+   out.det = header->det;
+   out.edgeflags = header->edgeflags;
+   out.v[0] = dup_vert(stage, header->v[0], 0);
+   out.v[1] = dup_vert(stage, header->v[1], 1);
+   out.v[2] = last;
+
+   copy_colors2(stage, out.v[0], out.v[1], last);
+   stage->next->tri( stage->next, &out );
+}
+
+
+/**
+ * Flatshade line (required for clipping): v[0]'s copy gets v[1]'s colors.
+ */
+static void flatshade_line( struct draw_stage *stage,
+                            struct prim_header *header )
+{
+   struct vertex_header *last = header->v[1];
+   struct prim_header out;
+
+   /* Only the two vertex pointers of 'out' are filled in here. */
+   out.v[0] = dup_vert(stage, header->v[0], 0);
+   out.v[1] = last;
+   copy_colors(stage, out.v[0], last);
+   stage->next->line( stage->next, &out );
+}
+
+
+static void flatshade_point( struct draw_stage *stage, struct prim_header *header )
+{
+   struct draw_stage *next = stage->next;   /* points are forwarded unchanged */
+   next->point( next, header );
+}
+
+
+static void flatshade_init_state( struct draw_stage *stage )
+{
+   struct flat_stage *flat = flat_stage(stage);
+   const struct pipe_shader_state *vs = stage->draw->vertex_shader->state;
+   const uint limit = sizeof(flat->color_attribs) / sizeof(flat->color_attribs[0]);
+   uint i;
+
+   /* List the vertex shader's color outputs; stop once the list is full */
+   flat->num_color_attribs = 0;
+   for (i = 0; i < vs->num_outputs && flat->num_color_attribs < limit; i++) {
+      if (vs->output_semantic_name[i] == TGSI_SEMANTIC_COLOR ||
+          vs->output_semantic_name[i] == TGSI_SEMANTIC_BCOLOR) {
+         flat->color_attribs[flat->num_color_attribs++] = i;
+      }
+   }
+   stage->line = flatshade_line;   /* list is built: switch to the real handlers */
+   stage->tri = flatshade_tri;
+}
+
+static void flatshade_first_tri( struct draw_stage *stage,
+                                 struct prim_header *header )
+{
+   flatshade_init_state( stage );   /* also installs flatshade_tri as stage->tri */
+   flatshade_tri( stage, header );
+}
+
+static void flatshade_first_line( struct draw_stage *stage,
+                                  struct prim_header *header )
+{
+   flatshade_init_state( stage );   /* also installs flatshade_line as stage->line */
+   flatshade_line( stage, header );
+}
+
+
+static void flatshade_flush( struct draw_stage *stage, unsigned flags )
+{
+   struct draw_stage *next = stage->next;
+   stage->line = flatshade_first_line;   /* rebuild the color list on the */
+   stage->tri = flatshade_first_tri;     /* next line or triangle */
+   next->flush( next, flags );
+}
+
+
+static void flatshade_reset_stipple_counter( struct draw_stage *stage )
+{
+   stage->next->reset_stipple_counter( stage->next );   /* no state here; forward */
+}
+
+
+static void flatshade_destroy( struct draw_stage *stage )
+{
+   draw_free_temp_verts( stage );   /* counterpart of draw_alloc_temp_verts() */
+   FREE( stage );   /* the flat_stage allocated in draw_flatshade_stage() */
+}
+
+
+/**
+ * Create flatshading drawing stage.  Returns NULL if out of memory.
+ */
+struct draw_stage *draw_flatshade_stage( struct draw_context *draw )
+{
+   struct flat_stage *flatshade = CALLOC_STRUCT(flat_stage);
+   if (!flatshade)
+      return NULL;   /* allocation failed; nothing to clean up */
+
+   draw_alloc_temp_verts( &flatshade->stage, 2 );   /* dup_vert() slots 0 and 1 */
+   flatshade->stage.draw = draw;
+   flatshade->stage.next = NULL;
+   flatshade->stage.point = flatshade_point;
+   flatshade->stage.line = flatshade_first_line;
+   flatshade->stage.tri = flatshade_first_tri;
+   flatshade->stage.flush = flatshade_flush;
+   flatshade->stage.reset_stipple_counter = flatshade_reset_stipple_counter;
+   flatshade->stage.destroy = flatshade_destroy;
+   return &flatshade->stage;
+}
+
+
diff --git a/src/gallium/auxiliary/draw/draw_offset.c b/src/gallium/auxiliary/draw/draw_offset.c
new file mode 100644
index 0000000000..dbc676deae
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_offset.c
@@ -0,0 +1,186 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * \brief  polygon offset state
+ *
+ * \author  Keith Whitwell <keith@tungstengraphics.com>
+ * \author  Brian Paul
+ */
+
+#include "pipe/p_util.h"
+#include "draw_private.h"
+
+
+
+struct offset_stage {
+   struct draw_stage stage;   /* base stage; offset_stage() casts a pointer to it */
+
+   float scale;   /* rasterizer offset_scale, latched in offset_first_tri() */
+   float units;   /* rasterizer offset_units * (1/65535), latched likewise */
+};
+
+
+
+static INLINE struct offset_stage *offset_stage( struct draw_stage *stage )
+{
+   return (struct offset_stage *) stage;   /* downcast from the embedded base stage */
+}
+
+
+
+
+
+/**
+ * Offset tri Z.  Some hardware can handle this, but not usually when
+ * doing unfilled rendering.
+ */
+static void do_offset_tri( struct draw_stage *stage,
+                           struct prim_header *header )
+{
+   struct offset_stage *offset = offset_stage(stage);
+   float inv_det = 1.0f / header->det;
+   unsigned i;
+
+   /* Window-space positions (attribute slot 0) of the three vertices. */
+   const float *p0 = header->v[0]->data[0];
+   const float *p1 = header->v[1]->data[0];
+   const float *p2 = header->v[2]->data[0];
+
+   /* Edge vectors measured from the third vertex: e = p0 - p2, f = p1 - p2 */
+   float ex = p0[0] - p2[0];
+   float ey = p0[1] - p2[1];
+   float ez = p0[2] - p2[2];
+   float fx = p1[0] - p2[0];
+   float fy = p1[1] - p2[1];
+   float fz = p1[2] - p2[2];
+
+   /* (a,b) = cross(e,f).xy */
+   float a = ey*fz - ez*fy;
+   float b = ez*fx - ex*fz;
+
+   /* Magnitude of the depth slope along x and along y. */
+   float dzdx = FABSF(a * inv_det);
+   float dzdy = FABSF(b * inv_det);
+
+   float zoffset = offset->units + MAX2(dzdx, dzdy) * offset->scale;
+
+   /* Note: the offset and clamp are applied per-vertex, whereas ideally
+    * the offset is applied per-fragment prior to fragment shading.
+    */
+   for (i = 0; i < 3; i++) {
+      float *pos = header->v[i]->data[0];
+      pos[2] = CLAMP(pos[2] + zoffset, 0.0f, 1.0f);
+   }
+   stage->next->tri( stage->next, header );
+}
+
+
+static void offset_tri( struct draw_stage *stage,
+                        struct prim_header *header )
+{
+   struct prim_header shifted;
+   unsigned i;
+
+   /* Offset the dup_vert() results rather than the incoming vertices. */
+   shifted.det = header->det;
+   shifted.edgeflags = header->edgeflags;
+   for (i = 0; i < 3; i++)
+      shifted.v[i] = dup_vert(stage, header->v[i], i);
+   do_offset_tri( stage, &shifted );
+}
+
+
+static void offset_first_tri( struct draw_stage *stage,
+                              struct prim_header *header )
+{
+   /* XXX this depends on depthbuffer bits! */
+   const float mrd = 1.0f / 65535.0f;
+   struct offset_stage *offset = offset_stage(stage);
+   /* Latch the rasterizer's offset state, then switch to offset_tri. */
+   offset->units = stage->draw->rasterizer->offset_units * mrd;
+   offset->scale = stage->draw->rasterizer->offset_scale;
+   stage->tri = offset_tri;
+   offset_tri( stage, header );
+}
+
+
+static void offset_line( struct draw_stage *stage, struct prim_header *header )
+{
+   struct draw_stage *next = stage->next;   /* lines are forwarded unchanged */
+   next->line( next, header );
+}
+
+
+static void offset_point( struct draw_stage *stage, struct prim_header *header )
+{
+   struct draw_stage *next = stage->next;   /* points are forwarded unchanged */
+   next->point( next, header );
+}
+
+
+static void offset_flush( struct draw_stage *stage, unsigned flags )
+{
+   struct draw_stage *next = stage->next;
+   stage->tri = offset_first_tri;   /* re-latch offset state on the next tri */
+   next->flush( next, flags );
+}
+
+
+static void offset_reset_stipple_counter( struct draw_stage *stage )
+{
+   stage->next->reset_stipple_counter( stage->next );   /* no state here; forward */
+}
+
+
+static void offset_destroy( struct draw_stage *stage )
+{
+   draw_free_temp_verts( stage );   /* counterpart of draw_alloc_temp_verts() */
+   FREE( stage );   /* the offset_stage allocated in draw_offset_stage() */
+}
+
+
+/**
+ * Create polygon offset drawing stage.  Returns NULL if out of memory.
+ */
+struct draw_stage *draw_offset_stage( struct draw_context *draw )
+{
+   struct offset_stage *offset = CALLOC_STRUCT(offset_stage);
+   if (!offset)
+      return NULL;   /* allocation failed; nothing to clean up */
+
+   draw_alloc_temp_verts( &offset->stage, 3 );   /* dup_vert() slots 0..2 */
+   offset->stage.draw = draw;
+   offset->stage.next = NULL;
+   offset->stage.point = offset_point;
+   offset->stage.line = offset_line;
+   offset->stage.tri = offset_first_tri;   /* latches offset state on first use */
+   offset->stage.flush = offset_flush;
+   offset->stage.reset_stipple_counter = offset_reset_stipple_counter;
+   offset->stage.destroy = offset_destroy;
+   return &offset->stage;
+}
diff --git a/src/gallium/auxiliary/draw/draw_prim.c b/src/gallium/auxiliary/draw/draw_prim.c
new file mode 100644
index 0000000000..51e2242719
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_prim.c
@@ -0,0 +1,482 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "pipe/p_debug.h"
+
+#include "draw_private.h"
+#include "draw_context.h"
+
+
+
+#define RP_NONE  0
+#define RP_POINT 1
+#define RP_LINE  2
+#define RP_TRI   3
+
+
+static unsigned reduced_prim[PIPE_PRIM_POLYGON + 1] = {   /* indexed by PIPE_PRIM_x */
+   RP_POINT,
+   RP_LINE,
+   RP_LINE,
+   RP_LINE,
+   RP_TRI,
+   RP_TRI,
+   RP_TRI,
+   RP_TRI,
+   RP_TRI,
+   RP_TRI
+};
+
+
+static void draw_prim_queue_flush( struct draw_context *draw )
+{
+   unsigned i;
+   /* Feed every queued primitive to the pipeline, then empty the queue. */
+   if (0)
+      debug_printf("Flushing with %d prims, %d verts\n",
+                   draw->pq.queue_nr, draw->vs.queue_nr);
+
+   assert (draw->pq.queue_nr != 0);
+
+   /* NOTE: we cannot save draw->pipeline.first in a local var because
+    * draw->pipeline.first is often changed by the first call to tri(),
+    * line(), etc.
+    */
+   if (draw->rasterizer->line_stipple_enable) {
+      switch (draw->reduced_prim) {
+      case RP_TRI:
+	 for (i = 0; i < draw->pq.queue_nr; i++) {
+	    if (draw->pq.queue[i].reset_line_stipple)
+	       draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
+	    
+	    draw->pipeline.first->tri( draw->pipeline.first, &draw->pq.queue[i] );
+	 }
+	 break;
+      case RP_LINE:
+	 for (i = 0; i < draw->pq.queue_nr; i++) {
+	    if (draw->pq.queue[i].reset_line_stipple)
+	       draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
+	    
+	    draw->pipeline.first->line( draw->pipeline.first, &draw->pq.queue[i] );
+	 }
+	 break;
+      case RP_POINT:   /* counter is reset once for the whole batch */
+	 draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
+	 for (i = 0; i < draw->pq.queue_nr; i++)
+	    draw->pipeline.first->point( draw->pipeline.first, &draw->pq.queue[i] );
+	 break;
+      }
+   }
+   else {   /* no stippling: same dispatch without any counter resets */
+      switch (draw->reduced_prim) {
+      case RP_TRI:
+	 for (i = 0; i < draw->pq.queue_nr; i++) 
+	    draw->pipeline.first->tri( draw->pipeline.first, &draw->pq.queue[i] );
+	 break;
+      case RP_LINE:
+	 for (i = 0; i < draw->pq.queue_nr; i++) 
+	    draw->pipeline.first->line( draw->pipeline.first, &draw->pq.queue[i] );
+	 break;
+      case RP_POINT:
+	 for (i = 0; i < draw->pq.queue_nr; i++)
+	    draw->pipeline.first->point( draw->pipeline.first, &draw->pq.queue[i] );
+	 break;
+      }
+   }
+
+   draw->pq.queue_nr = 0;   /* the queue is empty again */
+   draw_vertex_cache_unreference( draw );
+}
+
+
+
+void draw_do_flush( struct draw_context *draw, unsigned flags )
+{
+   if (0)
+      debug_printf("Flushing with %d verts, %d prims\n",
+                   draw->vs.queue_nr,
+                   draw->pq.queue_nr );
+
+   /* Flush levels are cumulative: each step runs only if all earlier ones did. */
+   if (flags < DRAW_FLUSH_SHADER_QUEUE)
+      return;
+   if (draw->vs.queue_nr)
+      (*draw->shader_queue_flush)(draw);
+
+   if (flags < DRAW_FLUSH_PRIM_QUEUE)
+      return;
+   if (draw->pq.queue_nr)
+      draw_prim_queue_flush(draw);
+
+   if (flags < DRAW_FLUSH_VERTEX_CACHE)
+      return;
+   draw_vertex_cache_invalidate(draw);
+
+   if (flags < DRAW_FLUSH_STATE_CHANGE)
+      return;
+   draw->pipeline.first->flush( draw->pipeline.first, flags );
+   draw->pipeline.first = draw->pipeline.validate;
+   draw->reduced_prim = ~0;
+}
+
+
+
+/* Reserve the next slot of the primitive queue and return it.  The
+ * vertex cache is flushed first if it cannot take "nr_verts" more
+ * vertices; otherwise the primitive queue is flushed if it is full.
+ * The caller fills in the returned header.
+ */
+static struct prim_header *get_queued_prim( struct draw_context *draw,
+                                            unsigned nr_verts )
+{
+   struct prim_header *slot;
+
+   if (!draw_vertex_cache_check_space( draw, nr_verts ))
+      draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE );
+   else if (draw->pq.queue_nr == PRIM_QUEUE_LENGTH)
+      draw_do_flush( draw, DRAW_FLUSH_PRIM_QUEUE );
+
+   assert(draw->pq.queue_nr < PRIM_QUEUE_LENGTH);
+
+   slot = &draw->pq.queue[draw->pq.queue_nr];
+   draw->pq.queue_nr++;
+   return slot;
+}
+
+
+
+/**
+ * Queue a single point.
+ * \param i0  index into user's vertex arrays
+ */
+static void do_point( struct draw_context *draw, unsigned i0 )
+{
+   /* Reserve the queue slot before fetching: this may flush to make room. */
+   struct prim_header *pt = get_queued_prim( draw, 1 );
+
+   pt->pad = 0;
+   pt->edgeflags = 1;
+   pt->reset_line_stipple = 0;
+   pt->v[0] = draw->vcache.get_vertex( draw, i0 );
+}
+
+
+/**
+ * Queue one line segment.
+ * \param reset_stipple  stored in the header's reset_line_stipple flag
+ * \param i0  index into user's vertex arrays
+ * \param i1  index into user's vertex arrays
+ */
+static void do_line( struct draw_context *draw, boolean reset_stipple,
+                     unsigned i0, unsigned i1 )
+{
+   /* Reserve the queue slot before fetching: this may flush to make room. */
+   struct prim_header *seg = get_queued_prim( draw, 2 );
+
+   seg->pad = 0;
+   seg->edgeflags = 1;
+   seg->reset_line_stipple = reset_stipple;
+   seg->v[0] = draw->vcache.get_vertex( draw, i0 );
+   seg->v[1] = draw->vcache.get_vertex( draw, i1 );
+}
+
+/**
+ * Queue a triangle with all edge flag bits set and the stipple-reset flag on.
+ */
+static void do_triangle( struct draw_context *draw,
+                         unsigned i0, unsigned i1, unsigned i2 )
+{
+   /* Reserve the queue slot before fetching: this may flush to make room. */
+   struct prim_header *tri = get_queued_prim( draw, 3 );
+
+   tri->pad = 0;
+   tri->edgeflags = ~0;
+   tri->reset_line_stipple = 1;
+
+   tri->v[0] = draw->vcache.get_vertex( draw, i0 );
+   tri->v[1] = draw->vcache.get_vertex( draw, i1 );
+   tri->v[2] = draw->vcache.get_vertex( draw, i2 );
+}
+			  
+/* Queue a triangle whose edge flags are taken from its vertices' edgeflag
+ * bits (vertex n -> bit n) and then masked with 'ef_mask'. */
+static void do_ef_triangle( struct draw_context *draw,
+                            boolean reset_stipple,
+                            unsigned ef_mask,
+                            unsigned i0,
+                            unsigned i1,
+                            unsigned i2 )
+{
+   struct prim_header *tri = get_queued_prim( draw, 3 );
+   struct vertex_header *a = draw->vcache.get_vertex( draw, i0 );
+   struct vertex_header *b = draw->vcache.get_vertex( draw, i1 );
+   struct vertex_header *c = draw->vcache.get_vertex( draw, i2 );
+   const unsigned vert_flags = a->edgeflag | (b->edgeflag << 1) | (c->edgeflag << 2);
+
+   tri->reset_line_stipple = reset_stipple;
+   tri->edgeflags = ef_mask & vert_flags;
+   tri->pad = 0;
+   tri->v[0] = a;
+   tri->v[1] = b;
+   tri->v[2] = c;
+}
+
+
+static void do_ef_quad( struct draw_context *draw,
+                        unsigned v0,
+                        unsigned v1,
+                        unsigned v2,
+                        unsigned v3 )
+{
+   /* Two triangles sharing v1-v3; each masks out one edge flag bit
+    * (presumably the shared diagonal).  Only the first resets stipple. */
+   do_ef_triangle( draw, 1, ~(1 << 1), v0, v1, v3 );
+   do_ef_triangle( draw, 0, ~(1 << 2), v1, v2, v3 );
+}
+
+/** Queue a quad as the two triangles (v0,v1,v3) and (v1,v2,v3). */
+static void do_quad( struct draw_context *draw,
+                     unsigned v0, unsigned v1,
+                     unsigned v2, unsigned v3 )
+{
+   /* Both halves share the v1-v3 diagonal. */
+   do_triangle( draw, v0, v1, v3 );
+   do_triangle( draw, v1, v2, v3 );
+}
+
+
+/**
+ * Decompose 'count' vertices starting at 'start' into queued points/lines/tris
+ */
+static void
+draw_prim( struct draw_context *draw, 
+	   unsigned prim, unsigned start, unsigned count )
+{
+   unsigned i;
+   boolean unfilled = (draw->rasterizer->fill_cw != PIPE_POLYGON_MODE_FILL ||
+		       draw->rasterizer->fill_ccw != PIPE_POLYGON_MODE_FILL);
+
+//   debug_printf("%s (%d) %d/%d\n", __FUNCTION__, draw->prim, start, count );
+
+   switch (prim) {
+   case PIPE_PRIM_POINTS:   /* one point per vertex */
+      for (i = 0; i < count; i ++) {
+	 do_point( draw,
+		   start + i );
+      }
+      break;
+
+   case PIPE_PRIM_LINES:   /* independent pairs; an odd trailing vertex is ignored */
+      for (i = 0; i+1 < count; i += 2) {
+	 do_line( draw, 
+		  TRUE,
+		  start + i + 0,
+		  start + i + 1);
+      }
+      break;
+
+   case PIPE_PRIM_LINE_LOOP:  /* a strip plus a closing segment back to 'start' */
+      if (count >= 2) {
+	 for (i = 1; i < count; i++) {
+	    do_line( draw, 
+		     i == 1, 	/* XXX: only if vb not split */
+		     start + i - 1,
+		     start + i );
+	 }
+
+	 do_line( draw, 
+		  0,
+		  start + count - 1,
+		  start + 0 );
+      }
+      break;
+
+   case PIPE_PRIM_LINE_STRIP:   /* stipple reset is requested on the first segment only */
+      for (i = 1; i < count; i++) {
+	 do_line( draw,
+		  i == 1,
+		  start + i - 1,
+		  start + i );
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLES:   /* per-vertex edge flags are used only when unfilled */
+      if (unfilled) {
+	 for (i = 0; i+2 < count; i += 3) {
+	    do_ef_triangle( draw,
+			    1, 
+			    ~0,
+			    start + i + 0,
+			    start + i + 1,
+			    start + i + 2 );
+	 }
+      } 
+      else {
+	 for (i = 0; i+2 < count; i += 3) {
+	    do_triangle( draw,
+			 start + i + 0,
+			 start + i + 1,
+			 start + i + 2 );
+	 }
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLE_STRIP:
+      for (i = 0; i+2 < count; i++) {
+	 if (i & 1) {   /* odd triangles swap their first two vertices */
+	    do_triangle( draw,
+			 start + i + 1,
+			 start + i + 0,
+			 start + i + 2 );
+	 }
+	 else {
+	    do_triangle( draw,
+			 start + i + 0,
+			 start + i + 1,
+			 start + i + 2 );
+	 }
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLE_FAN:   /* every triangle starts at vertex 'start' */
+      if (count >= 3) {
+	 for (i = 0; i+2 < count; i++) {
+	    do_triangle( draw,
+			 start + 0,
+			 start + i + 1,
+			 start + i + 2 );
+	 }
+      }
+      break;
+
+
+   case PIPE_PRIM_QUADS:   /* each complete group of four vertices is one quad */
+      if (unfilled) {
+	 for (i = 0; i+3 < count; i += 4) {
+	    do_ef_quad( draw,
+			start + i + 0,
+			start + i + 1,
+			start + i + 2,
+			start + i + 3);
+	 }
+      }
+      else {
+	 for (i = 0; i+3 < count; i += 4) {
+	    do_quad( draw,
+		     start + i + 0,
+		     start + i + 1,
+		     start + i + 2,
+		     start + i + 3);
+	 }
+      }
+      break;
+
+   case PIPE_PRIM_QUAD_STRIP:   /* advances two vertices per quad */
+      if (unfilled) {
+	 for (i = 0; i+3 < count; i += 2) {
+	    do_ef_quad( draw,
+			start + i + 2,
+			start + i + 0,
+			start + i + 1,
+			start + i + 3);
+	 }
+      }
+      else {
+	 for (i = 0; i+3 < count; i += 2) {
+	    do_quad( draw,
+		     start + i + 2,
+		     start + i + 0,
+		     start + i + 1,
+		     start + i + 3);
+	 }
+      }
+      break;
+
+   case PIPE_PRIM_POLYGON:   /* fan-like; vertex 'start' is passed last in each triangle */
+      if (unfilled) {
+	 unsigned ef_mask = (1<<2) | (1<<0);
+
+	 for (i = 0; i+2 < count; i++) {
+
+            if (i + 3 >= count)   /* last triangle: also allow bit 1 */
+	       ef_mask |= (1<<1);
+
+	    do_ef_triangle( draw,
+			    i == 0,
+			    ef_mask,
+			    start + i + 1,
+			    start + i + 2,
+			    start + 0);
+
+	    ef_mask &= ~(1<<2);   /* bit 2 is allowed for the first triangle only */
+	 }
+      }
+      else {
+	 for (i = 0; i+2 < count; i++) {
+	    do_triangle( draw,
+			 start + i + 1,
+			 start + i + 2,
+			 start + 0);
+	 }
+      }
+      break;
+
+   default:   /* unknown primitive type: nothing is queued */
+      assert(0);
+      break;
+   }
+}
+
+
+
+
+/**
+ * Draw vertex arrays.  This is the main entrypoint into the drawing module.
+ * \param prim  one of PIPE_PRIM_x; other values are rejected without drawing
+ * \param start  index of first vertex to draw
+ * \param count  number of vertices to draw
+ */
+void
+draw_arrays(struct draw_context *draw, unsigned prim,
+            unsigned start, unsigned count)
+{
+   assert(prim <= PIPE_PRIM_POLYGON);
+   if (prim > PIPE_PRIM_POLYGON)
+      return;   /* would index past the end of reduced_prim[] */
+   if (reduced_prim[prim] != draw->reduced_prim) {
+      draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );   /* primitive class changed */
+      draw->reduced_prim = reduced_prim[prim];
+   }
+   draw_prim(draw, prim, start, count);
+}
+
+
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
new file mode 100644
index 0000000000..3d09aef87c
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -0,0 +1,346 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Private data structures, etc for the draw module.
+ */
+
+
+/**
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ * Brian Paul
+ */
+
+
+#ifndef DRAW_PRIVATE_H
+#define DRAW_PRIVATE_H
+
+
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+
+#include "x86/rtasm/x86sse.h"
+#include "tgsi/exec/tgsi_exec.h"
+
+
+struct gallivm_prog;
+struct gallivm_cpu_engine;
+
+/**
+ * Basic vertex info.
+ * Carry some useful information around with the vertices in the prim pipe.
+ */
+struct vertex_header {
+   unsigned clipmask:12;	/* presumably one bit per clip plane - confirm in draw_clip.c */
+   unsigned edgeflag:1;
+   unsigned pad:3;		/* rounds the bit-fields up to 32 bits */
+   unsigned vertex_id:16;	/* UNDEFINED_VERTEX_ID for copies made by dup_vert() */
+
+   float clip[4];
+
+   float data[][4];		/* Note variable size: one float[4] per attribute */
+};
+
+/* NOTE: It should match vertex_id size above */
+#define UNDEFINED_VERTEX_ID 0xffff
+
+/* XXX This is too large */
+#define MAX_VERTEX_SIZE ((2 + PIPE_MAX_SHADER_OUTPUTS) * 4 * sizeof(float))
+
+
+
+/**
+ * Basic info for a point/line/triangle primitive.
+ */
+struct prim_header {
+   float det;                 /**< front/back face determinant */
+   unsigned reset_line_stipple:1;
+   unsigned edgeflags:3;      /**< bits 0..2, tested per vertex/edge by the unfilled stage */
+   unsigned pad:28;           /**< rounds the bit-fields up to 32 bits */
+   struct vertex_header *v[3];  /**< 1 to 3 vertex pointers */
+};
+
+
+
+struct draw_context;
+
+/**
+ * Base class for all primitive drawing stages.
+ */
+struct draw_stage
+{
+   struct draw_context *draw;   /**< parent context */
+
+   struct draw_stage *next;     /**< next stage in pipeline */
+
+   struct vertex_header **tmp;  /**< temp vert storage, such as for clipping */
+   unsigned nr_tmps;            /**< presumably the number of tmp[] entries - confirm */
+
+   void (*point)( struct draw_stage *,
+		  struct prim_header * );   /**< handle a point (uses v[0]) */
+
+   void (*line)( struct draw_stage *,
+		 struct prim_header * );    /**< handle a line (uses v[0], v[1]) */
+
+   void (*tri)( struct draw_stage *,
+		struct prim_header * );     /**< handle a triangle (uses v[0..2]) */
+
+   void (*flush)( struct draw_stage *,
+		  unsigned flags );         /**< flags: presumably DRAW_FLUSH_x bits */
+
+   void (*reset_stipple_counter)( struct draw_stage * );
+
+   void (*destroy)( struct draw_stage * );   /**< free the stage */
+};
+
+
+#define PRIM_QUEUE_LENGTH      16
+#define VCACHE_SIZE            32
+#define VCACHE_OVERFLOW        4
+#define VS_QUEUE_LENGTH        (VCACHE_SIZE + VCACHE_OVERFLOW + 1)	/* can never fill up */
+
+/**
+ * Private version of the compiled vertex_shader
+ */
+struct draw_vertex_shader {
+   const struct pipe_shader_state   *state;   /* the pipe-level shader state */
+#if defined(__i386__) || defined(__386__)
+   struct x86_function              sse2_program;   /* only present on x86 builds */
+#endif
+#ifdef MESA_LLVM
+   struct gallivm_prog *llvm_prog;   /* only present in MESA_LLVM builds */
+#endif
+};
+
+
+/* Internal function for vertex fetch.
+ */
+typedef void (*fetch_func)(const void *ptr, float *attrib);
+typedef void (*full_fetch_func)( struct draw_context *draw,
+				 struct tgsi_exec_machine *machine,
+				 const unsigned *elts,
+				 unsigned count );
+
+
+
+/**
+ * Private context for the drawing module.
+ */
+struct draw_context
+{
+   /** Drawing/primitive pipeline stages */
+   struct {
+      struct draw_stage *first;  /**< one of the following */
+
+      struct draw_stage *validate; 
+
+      /* stages (in logical order) */
+      struct draw_stage *flatshade;
+      struct draw_stage *clip;
+      struct draw_stage *cull;
+      struct draw_stage *twoside;
+      struct draw_stage *offset;
+      struct draw_stage *unfilled;
+      struct draw_stage *stipple;
+      struct draw_stage *wide;
+      struct draw_stage *rasterize;
+   } pipeline;
+
+   /* pipe state that we need: */
+   const struct pipe_rasterizer_state *rasterizer;
+   struct pipe_viewport_state viewport;
+   struct pipe_vertex_buffer vertex_buffer[PIPE_ATTRIB_MAX];
+   struct pipe_vertex_element vertex_element[PIPE_ATTRIB_MAX];
+   const struct draw_vertex_shader *vertex_shader;
+
+   uint num_vs_outputs;  /**< convenience, from vertex_shader */
+
+   /* user-space vertex data, buffers */
+   struct {
+      /** vertex element/index buffer (ex: glDrawElements) */
+      const void *elts;
+      /** bytes per index (0, 1, 2 or 4) */
+      unsigned eltSize;
+
+      /** vertex arrays */
+      const void *vbuffer[PIPE_ATTRIB_MAX];
+
+      /** constant buffer (for vertex shader) */
+      const void *constants;
+   } user;
+
+   /* Clip derived state:
+    */
+   float plane[12][4];
+   unsigned nr_planes;
+
+   boolean convert_wide_points; /**< convert wide points to tris? */
+   boolean convert_wide_lines;  /**< convert wide lines to tris? */
+
+   unsigned reduced_prim;       /**< reduced prim type of the last draw_arrays() */
+
+   /** TGSI program interpreter runtime state */
+   struct tgsi_exec_machine machine;
+
+   /* Vertex fetch internal state
+    */
+   struct {
+      const ubyte *src_ptr[PIPE_ATTRIB_MAX];
+      unsigned pitch[PIPE_ATTRIB_MAX];
+      fetch_func fetch[PIPE_ATTRIB_MAX];
+      unsigned nr_attrs;
+      full_fetch_func fetch_func;
+   } vertex_fetch;
+
+   /* Post-tnl vertex cache:
+    */
+   struct {
+      unsigned referenced;  /**< bitfield */
+      unsigned idx[VCACHE_SIZE + VCACHE_OVERFLOW];
+      struct vertex_header *vertex[VCACHE_SIZE + VCACHE_OVERFLOW];
+      unsigned overflow;
+
+      /** To find space in the vertex cache: */
+      struct vertex_header *(*get_vertex)( struct draw_context *draw,
+                                           unsigned i );
+   } vcache;
+
+   /* Vertex shader queue:
+    */
+   struct {
+      struct {
+	 unsigned elt;   /**< index into the user's vertex arrays */
+	 struct vertex_header *dest; /**< points into vcache.vertex[] array */
+      } queue[VS_QUEUE_LENGTH];
+      unsigned queue_nr;
+   } vs;
+
+   /**
+    * Run the vertex shader on all vertices in the vertex queue.
+    */
+   void (*shader_queue_flush)(struct draw_context *draw);
+
+   /* Prim pipeline queue:
+    */
+   struct {
+      /* Need to queue up primitives until their vertices have been
+       * transformed by a vs queue flush.
+       */
+      struct prim_header queue[PRIM_QUEUE_LENGTH];
+      unsigned queue_nr;
+   } pq;
+
+   unsigned use_sse : 1;   /* unsigned: a signed 1-bit field cannot hold 1 */
+#ifdef MESA_LLVM
+   struct gallivm_cpu_engine *engine;
+#endif
+   
+   void *driver_private;
+};
+
+
+
+extern struct draw_stage *draw_unfilled_stage( struct draw_context *context );
+extern struct draw_stage *draw_twoside_stage( struct draw_context *context );
+extern struct draw_stage *draw_offset_stage( struct draw_context *context );
+extern struct draw_stage *draw_clip_stage( struct draw_context *context );
+extern struct draw_stage *draw_flatshade_stage( struct draw_context *context );
+extern struct draw_stage *draw_cull_stage( struct draw_context *context );
+extern struct draw_stage *draw_stipple_stage( struct draw_context *context );
+extern struct draw_stage *draw_wide_stage( struct draw_context *context );
+extern struct draw_stage *draw_validate_stage( struct draw_context *context );
+
+
+extern void draw_free_temp_verts( struct draw_stage *stage );
+
+extern void draw_alloc_temp_verts( struct draw_stage *stage, unsigned nr );
+
+extern void draw_reset_vertex_ids( struct draw_context *draw );
+
+
+extern int draw_vertex_cache_check_space( struct draw_context *draw, 
+					  unsigned nr_verts );
+
+extern void draw_vertex_cache_invalidate( struct draw_context *draw );
+extern void draw_vertex_cache_unreference( struct draw_context *draw );
+extern void draw_vertex_cache_reset_vertex_ids( struct draw_context *draw );
+
+
+extern void draw_vertex_shader_queue_flush( struct draw_context *draw );
+#ifdef MESA_LLVM
+extern void draw_vertex_shader_queue_flush_llvm( struct draw_context *draw );
+#endif
+
+struct tgsi_exec_machine;
+
+extern void draw_update_vertex_fetch( struct draw_context *draw );
+
+
+#define DRAW_FLUSH_SHADER_QUEUE              0x1 /* sized not to overflow, never raised */
+#define DRAW_FLUSH_PRIM_QUEUE                0x2
+#define DRAW_FLUSH_VERTEX_CACHE              0x4
+#define DRAW_FLUSH_STATE_CHANGE              0x8
+#define DRAW_FLUSH_BACKEND                   0x10
+
+
+void draw_do_flush( struct draw_context *draw, unsigned flags );
+
+
+
+/**
+ * Copy a vertex into one of the stage's tmp[] slots so it can be modified.
+ * \param stage  drawing stage owning the tmp[] slots
+ * \param vert  the vertex to copy (source)
+ * \param idx  index of the tmp[] slot that receives the copy (dest)
+ * \return  the copy, with vertex_id reset to UNDEFINED_VERTEX_ID
+ */
+static INLINE struct vertex_header *
+dup_vert( struct draw_stage *stage,
+	  const struct vertex_header *vert,
+	  unsigned idx )
+{
+   const uint nr_floats = stage->draw->num_vs_outputs * 4;
+   const uint vsize = sizeof(struct vertex_header) + nr_floats * sizeof(float);
+   struct vertex_header *copy = stage->tmp[idx];
+   memcpy(copy, vert, vsize);
+   copy->vertex_id = UNDEFINED_VERTEX_ID;
+   return copy;
+}
+
+/** Return the 4-component dot product of 'a' and 'b' (4 floats each). */
+static INLINE float
+dot4(const float *a, const float *b)
+{
+   const float dp = a[0] * b[0]
+                  + a[1] * b[1]
+                  + a[2] * b[2]
+                  + a[3] * b[3];
+   return dp;
+}
+
+#endif /* DRAW_PRIVATE_H */
diff --git a/src/gallium/auxiliary/draw/draw_stipple.c b/src/gallium/auxiliary/draw/draw_stipple.c
new file mode 100644
index 0000000000..506f33512c
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_stipple.c
@@ -0,0 +1,239 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+/* Implement line stipple by cutting lines up into smaller lines.
+ * There are hundreds of ways to implement line stipple, this is one
+ * choice that should work in all situations, requires no state
+ * manipulations, but with a penalty in terms of large amounts of
+ * generated geometry.
+ */
+
+
+#include "pipe/p_util.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_shader_tokens.h"
+#include "draw_private.h"
+
+
+/** Subclass of draw_stage */
+struct stipple_stage {
+   struct draw_stage stage;   /**< base class; first member so pointers can be cast */
+   float counter;   /**< sum of line lengths (max(dx,dy)) since the last reset */
+   uint pattern;    /**< copy of rasterizer line_stipple_pattern */
+   uint factor;     /**< rasterizer line_stipple_factor + 1 */
+};
+
+/** Downcast a base draw_stage pointer to the stipple_stage embedding it. */
+static INLINE struct stipple_stage *
+stipple_stage(struct draw_stage *stage)
+{
+   return (struct stipple_stage *) stage;
+}
+
+
+/**
+ * Linearly interpolate every vertex-shader output between 'v0' (t = 0)
+ * and 'v1' (t = 1) at parameter 't', writing the result to 'dst'.
+ * XXX using linear interpolation for all attribs at this time.
+ */
+static void
+screen_interp( struct draw_context *draw,
+               struct vertex_header *dst,
+               float t,
+               const struct vertex_header *v0, 
+               const struct vertex_header *v1 )
+{
+   const uint nr_attrs = draw->num_vs_outputs;
+   uint a, c;
+
+   for (a = 0; a < nr_attrs; a++) {
+      const float *from = v0->data[a];
+      const float *to = v1->data[a];
+      float *out = dst->data[a];
+      for (c = 0; c < 4; c++)
+         out[c] = from[c] + t * (to[c] - from[c]);
+   }
+}
+
+
+/* Emit the part of line 'header' between parameters t0 and t1 (0..1). */
+static void
+emit_segment(struct draw_stage *stage, struct prim_header *header,
+             float t0, float t1)
+{
+   struct prim_header seg = *header;
+   struct vertex_header *tmp0 = dup_vert(stage, header->v[0], 0);
+   struct vertex_header *tmp1 = dup_vert(stage, header->v[1], 1);
+
+   /* an endpoint is swapped for an interpolated temp only if it moved */
+   if (t0 > 0.0) {
+      screen_interp( stage->draw, tmp0, t0, header->v[0], header->v[1] );
+      seg.v[0] = tmp0;
+   }
+   if (t1 < 1.0) {
+      screen_interp( stage->draw, tmp1, t1, header->v[0], header->v[1] );
+      seg.v[1] = tmp1;
+   }
+   stage->next->line( stage->next, &seg );
+}
+
+
+/* Nonzero if pattern bit ((counter / factor) mod 16) is set. */
+static INLINE unsigned
+stipple_test(int counter, ushort pattern, int factor)
+{
+   return (1 << ((counter / factor) & 0xf)) & pattern;
+}
+
+/* Cut one line into its "on" stipple runs and emit each run as a line. */
+static void
+stipple_line(struct draw_stage *stage, struct prim_header *header)
+{
+   struct stipple_stage *stipple = stipple_stage(stage);
+   struct vertex_header *v0 = header->v[0];
+   struct vertex_header *v1 = header->v[1];
+   const float *pos0 = v0->data[0];
+   const float *pos1 = v1->data[0];
+   float start = 0;   /* step index at which the current "on" run began */
+   int state = 0;     /* nonzero while inside an "on" run */
+
+   float x0 = pos0[0];
+   float x1 = pos1[0];
+   float y0 = pos0[1];
+   float y1 = pos1[1];
+
+   float dx = x0 > x1 ? x0 - x1 : x1 - x0;
+   float dy = y0 > y1 ? y0 - y1 : y1 - y0;
+   /* the line is stepped along its longer axis */
+   float length = MAX2(dx, dy);
+   int i;
+
+   /* XXX ToDo: instead of iterating pixel-by-pixel, use a look-up table.
+    */
+   for (i = 0; i < length; i++) {
+      int result = stipple_test( (int) stipple->counter+i,
+                                 (ushort) stipple->pattern, stipple->factor );
+      if (result != state) {
+         /* changing from "off" to "on" or vice versa */
+	 if (state) {
+	    if (start != i) {
+               /* finishing an "on" segment */
+	       emit_segment( stage, header, start / length, i / length );
+            }
+	 }
+	 else {
+            /* starting an "on" segment */
+	    start = (float) i;
+	 }
+	 state = result;	   
+      }
+   }
+   /* a run still "on" at the end of the line extends to t = 1 */
+   if (state && start < length)
+      emit_segment( stage, header, start / length, 1.0 );
+   /* the next line continues the pattern where this one stopped */
+   stipple->counter += length;
+}
+
+/* Restart the stipple pattern and forward the reset down the pipeline. */
+static void
+reset_stipple_counter(struct draw_stage *stage)
+{
+   struct draw_stage *next = stage->next;
+   stipple_stage(stage)->counter = 0;
+   next->reset_stipple_counter( next );
+}
+
+
+/* First line after a flush: latch the stipple state, then stipple it. */
+static void
+stipple_first_line(struct draw_stage *stage, struct prim_header *header)
+{
+   const struct pipe_rasterizer_state *rast = stage->draw->rasterizer;
+   struct stipple_stage *self = stipple_stage(stage);
+
+   self->factor = rast->line_stipple_factor + 1;
+   self->pattern = rast->line_stipple_pattern;
+   /* every later line goes straight to stipple_line */
+   stage->line = stipple_line;
+   stipple_line( stage, header );
+}
+
+/* Re-arm stipple_first_line, then pass the flush to the next stage. */
+static void stipple_flush(struct draw_stage *stage, unsigned flags)
+{
+   struct draw_stage *next = stage->next;
+   stage->line = stipple_first_line;
+   next->flush( next, flags );
+}
+
+/* This stage leaves points untouched: hand them to the next stage. */
+static void
+passthrough_point(struct draw_stage *stage, struct prim_header *header)
+{
+   stage->next->point( stage->next, header );
+}
+
+/* This stage leaves triangles untouched: hand them to the next stage. */
+static void
+passthrough_tri(struct draw_stage *stage, struct prim_header *header)
+{
+   stage->next->tri(stage->next, header);
+}
+
+/* Free the stage's temp vertices, then the stage itself. */
+static void 
+stipple_destroy( struct draw_stage *stage )
+{
+   draw_free_temp_verts( stage );
+   FREE( stage );
+}
+
+
+/**
+ * Create line stippler stage.  Returns NULL if the allocation fails.
+ */
+struct draw_stage *draw_stipple_stage( struct draw_context *draw )
+{
+   struct stipple_stage *stipple = CALLOC_STRUCT(stipple_stage);
+   if (!stipple)
+      return NULL;
+
+   draw_alloc_temp_verts( &stipple->stage, 2 );   /* both ends, see emit_segment */
+   stipple->stage.draw = draw;
+   stipple->stage.next = NULL;
+   stipple->stage.point = passthrough_point;
+   stipple->stage.line = stipple_first_line;
+   stipple->stage.tri = passthrough_tri;
+   stipple->stage.reset_stipple_counter = reset_stipple_counter;
+   stipple->stage.flush = stipple_flush;
+   stipple->stage.destroy = stipple_destroy;
+   return &stipple->stage;
+}
diff --git a/src/gallium/auxiliary/draw/draw_twoside.c b/src/gallium/auxiliary/draw/draw_twoside.c
new file mode 100644
index 0000000000..1c38957987
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_twoside.c
@@ -0,0 +1,203 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "pipe/p_util.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_shader_tokens.h"
+#include "draw_private.h"
+
+
+struct twoside_stage {
+   struct draw_stage stage;   /**< base class; first member so pointers can be cast */
+   float sign;         /**< +1 or -1 */
+   uint attrib_front0, attrib_back0;   /**< COLOR/BCOLOR output slots, semantic index 0; 0 = none */
+   uint attrib_front1, attrib_back1;   /**< same, for any other semantic index */
+};
+
+/** Downcast a base draw_stage pointer to the twoside_stage embedding it. */
+static INLINE struct twoside_stage *twoside_stage( struct draw_stage *stage )
+{
+   return (struct twoside_stage *)stage;
+}
+
+
+
+
+/**
+ * Duplicate 'v' into temp slot 'idx' and overwrite each front color of
+ * the copy with the matching back color, where a back color exists.
+ */
+static INLINE struct vertex_header *
+copy_bfc( struct twoside_stage *twoside, 
+          const struct vertex_header *v,
+          unsigned idx )
+{
+   const uint back0 = twoside->attrib_back0;
+   const uint back1 = twoside->attrib_back1;
+   struct vertex_header *copy = dup_vert( &twoside->stage, v, idx );
+
+   if (back0) {
+      COPY_4FV(copy->data[twoside->attrib_front0], copy->data[back0]);
+   }
+   if (back1) {
+      COPY_4FV(copy->data[twoside->attrib_front1], copy->data[back1]);
+   }
+   return copy;
+}
+
+
+/* Twoside tri: on back-facing triangles, emit a copy whose front color
+ * slots hold the back colors; other triangles pass through unchanged.
+ */
+static void twoside_tri( struct draw_stage *stage,
+			 struct prim_header *header )
+{
+   struct twoside_stage *twoside = twoside_stage(stage);
+
+   if (header->det * twoside->sign < 0.0) {
+      /* Back-facing.  Start from a copy of the whole header so that every
+       * field (incl. reset_line_stipple) is initialized for later stages.
+       */
+      struct prim_header tmp = *header;
+
+      /* copy back attribs to front attribs */
+      tmp.v[0] = copy_bfc(twoside, header->v[0], 0);
+      tmp.v[1] = copy_bfc(twoside, header->v[1], 1);
+      tmp.v[2] = copy_bfc(twoside, header->v[2], 2);
+      stage->next->tri( stage->next, &tmp );
+   }
+   else {
+      stage->next->tri( stage->next, header );
+   }
+}
+
+/* This stage leaves lines untouched: pass-through. */
+static void twoside_line( struct draw_stage *stage,
+                          struct prim_header *header )
+{
+   struct draw_stage *next = stage->next;
+   next->line( next, header );
+}
+
+/* This stage leaves points untouched: pass-through. */
+static void twoside_point( struct draw_stage *stage,
+                           struct prim_header *header )
+{
+   struct draw_stage *next = stage->next;
+   next->point( next, header );
+}
+
+
+/* First triangle after a flush: locate the shader's front/back color
+ * outputs, derive the facing sign, then hand over to twoside_tri.
+ */
+static void twoside_first_tri( struct draw_stage *stage, 
+			       struct prim_header *header )
+{
+   struct twoside_stage *twoside = twoside_stage(stage);
+   const struct pipe_shader_state *vs = stage->draw->vertex_shader->state;
+   uint front[2] = { 0, 0 };
+   uint back[2] = { 0, 0 };
+   uint slot;
+
+   /* Find which vertex shader outputs are front/back colors
+    * (semantic index 0 goes to entry 0, any other index to entry 1). */
+   for (slot = 0; slot < vs->num_outputs; slot++) {
+      const uint which = (vs->output_semantic_index[slot] == 0) ? 0 : 1;
+
+      if (vs->output_semantic_name[slot] == TGSI_SEMANTIC_COLOR)
+         front[which] = slot;
+      if (vs->output_semantic_name[slot] == TGSI_SEMANTIC_BCOLOR)
+         back[which] = slot;
+   }
+
+   /* Slot 0 doubles as "not present": without a back color there is
+    * nothing to copy, so the matching front slot is cleared as well.
+    */
+   twoside->attrib_back0 = back[0];
+   twoside->attrib_back1 = back[1];
+   twoside->attrib_front0 = back[0] ? front[0] : 0;
+   twoside->attrib_front1 = back[1] ? front[1] : 0;
+
+   /*
+    * We'll multiply the primitive's determinant by this sign to determine
+    * if the triangle is back-facing (negative).
+    * sign = -1 for CCW, +1 for CW
+    */
+   if (stage->draw->rasterizer->front_winding == PIPE_WINDING_CCW)
+      twoside->sign = -1.0f;
+   else
+      twoside->sign = 1.0f;
+
+   stage->tri = twoside_tri;
+   twoside_tri( stage, header );
+}
+
+/* Re-arm twoside_first_tri, then pass the flush to the next stage. */
+static void twoside_flush( struct draw_stage *stage, unsigned flags )
+{
+   stage->tri = twoside_first_tri;
+   stage->next->flush( stage->next, flags );
+}
+
+/* No stipple state is kept here: forward the reset to the next stage. */
+static void twoside_reset_stipple_counter( struct draw_stage *stage )
+{
+   stage->next->reset_stipple_counter( stage->next );
+}
+
+/* Free the stage's temp vertices, then the stage itself. */
+static void twoside_destroy( struct draw_stage *stage )
+{
+   draw_free_temp_verts( stage );
+   FREE( stage );
+}
+
+
+/**
+ * Create twoside pipeline stage.  Returns NULL if the allocation fails.
+ */
+struct draw_stage *draw_twoside_stage( struct draw_context *draw )
+{
+   struct twoside_stage *twoside = CALLOC_STRUCT(twoside_stage);
+   if (!twoside)
+      return NULL;
+
+   draw_alloc_temp_verts( &twoside->stage, 3 );   /* one per triangle vertex */
+   twoside->stage.draw = draw;
+   twoside->stage.next = NULL;
+   twoside->stage.point = twoside_point;
+   twoside->stage.line = twoside_line;
+   twoside->stage.tri = twoside_first_tri;
+   twoside->stage.flush = twoside_flush;
+   twoside->stage.reset_stipple_counter = twoside_reset_stipple_counter;
+   twoside->stage.destroy = twoside_destroy;
+   return &twoside->stage;
+}
diff --git a/src/gallium/auxiliary/draw/draw_unfilled.c b/src/gallium/auxiliary/draw/draw_unfilled.c
new file mode 100644
index 0000000000..8777cfdfc8
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_unfilled.c
@@ -0,0 +1,206 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * \brief  Drawing stage for handling glPolygonMode(line/point).
+ * Convert triangles to points or lines as needed.
+ */
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "pipe/p_util.h"
+#include "pipe/p_defines.h"
+#include "draw_private.h"
+
+
+struct unfilled_stage {
+   struct draw_stage stage;
+
+   /** Fill mode per facing, indexed by (det >= 0): [0] = rasterizer
+    * fill_ccw ("front"), [1] = fill_cw ("back").  Legal values:
+    * PIPE_POLYGON_MODE_FILL, PIPE_POLYGON_MODE_LINE, PIPE_POLYGON_MODE_POINT.
+    */
+   unsigned mode[2];
+};
+
+/** Downcast a base draw_stage pointer to the unfilled_stage embedding it. */
+static INLINE struct unfilled_stage *unfilled_stage( struct draw_stage *stage )
+{
+   return (struct unfilled_stage *)stage;
+}
+
+
+
+/* Emit vertex v0 as a point; all other prim_header fields are zero. */
+static void point( struct draw_stage *stage, struct vertex_header *v0 )
+{
+   struct prim_header tmp = { 0 };   /* no uninitialized fields downstream */
+   tmp.v[0] = v0;
+   stage->next->point( stage->next, &tmp );
+}
+
+/* Emit the line v0 -> v1; all other prim_header fields are zero. */
+static void line( struct draw_stage *stage,
+		  struct vertex_header *v0, struct vertex_header *v1 )
+{
+   struct prim_header tmp = { 0 };   /* incl. det and reset_line_stipple */
+   tmp.v[0] = v0;
+   tmp.v[1] = v1;
+   stage->next->line( stage->next, &tmp );
+}
+
+
+/* Emit a point for each triangle vertex whose edge flag bit is set. */
+static void points( struct draw_stage *stage,
+		    struct prim_header *header )
+{
+   unsigned i;
+
+   for (i = 0; i < 3; i++) {
+      if (header->edgeflags & (1u << i))
+         point( stage, header->v[i] );
+   }
+}
+
+
+/* Emit a line for each triangle edge whose edge flag bit is set; bit i
+ * controls the edge from vertex i to vertex (i + 1) % 3.
+ */
+static void lines( struct draw_stage *stage,
+		   struct prim_header *header )
+{
+   unsigned i;
+#if 0
+   assert(((header->edgeflags & 0x1) >> 0) == header->v[0]->edgeflag);
+   assert(((header->edgeflags & 0x2) >> 1) == header->v[1]->edgeflag);
+   assert(((header->edgeflags & 0x4) >> 2) == header->v[2]->edgeflag);
+#endif
+   for (i = 0; i < 3; i++) {
+      if (header->edgeflags & (1u << i))
+         line( stage, header->v[i], header->v[(i + 1) % 3] );
+   }
+}
+
+
+/* Unfilled tri: draw the triangle filled, as edges or as vertices,
+ * depending on the fill mode selected by the sign of its determinant.
+ *
+ * Note edgeflags in the vertex struct is not sufficient as we will
+ * need to manipulate them when decomposing primitives???
+ */
+static void unfilled_tri( struct draw_stage *stage,
+			  struct prim_header *header )
+{
+   const unsigned facing = (header->det >= 0.0) ? 1 : 0;
+   const unsigned mode = unfilled_stage(stage)->mode[facing];
+
+   if (mode == PIPE_POLYGON_MODE_FILL) {
+      stage->next->tri( stage->next, header );
+   }
+   else if (mode == PIPE_POLYGON_MODE_LINE) {
+      lines( stage, header );
+   }
+   else if (mode == PIPE_POLYGON_MODE_POINT) {
+      points( stage, header );
+   }
+   else {
+      abort();   /* mode[] holds an unknown polygon mode */
+   }
+}
+
+
+/* First triangle after a flush: latch the fill modes, then draw it. */
+static void unfilled_first_tri( struct draw_stage *stage, 
+				struct prim_header *header )
+{
+   const struct pipe_rasterizer_state *rast = stage->draw->rasterizer;
+   unsigned *mode = unfilled_stage(stage)->mode;
+   mode[1] = rast->fill_cw;    /* back */
+   mode[0] = rast->fill_ccw;   /* front */
+   stage->tri = unfilled_tri;
+   unfilled_tri( stage, header );
+}
+
+/* This stage leaves incoming lines untouched: pass them on. */
+static void unfilled_line( struct draw_stage *stage,
+                           struct prim_header *header )
+{
+   stage->next->line( stage->next, header );
+}
+
+/* This stage leaves incoming points untouched: pass them on. */
+static void unfilled_point( struct draw_stage *stage,
+                            struct prim_header *header )
+{
+   stage->next->point( stage->next, header );
+}
+
+
+/* Flush the downstream stages, then re-arm unfilled_first_tri. */
+static void unfilled_flush( struct draw_stage *stage, unsigned flags )
+{
+   struct draw_stage *next = stage->next;
+   next->flush( next, flags );
+   stage->tri = unfilled_first_tri;
+}
+
+/* No stipple state is kept here: forward the reset to the next stage. */
+static void unfilled_reset_stipple_counter( struct draw_stage *stage )
+{
+   stage->next->reset_stipple_counter( stage->next );
+}
+
+/* Free the stage's temp vertices (none are requested), then the stage. */
+static void unfilled_destroy( struct draw_stage *stage )
+{
+   draw_free_temp_verts( stage );
+   FREE( stage );
+}
+
+
+/**
+ * Create unfilled triangle stage.  Returns NULL if the allocation fails.
+ */
+struct draw_stage *draw_unfilled_stage( struct draw_context *draw )
+{
+   struct unfilled_stage *unfilled = CALLOC_STRUCT(unfilled_stage);
+   if (!unfilled)
+      return NULL;
+
+   draw_alloc_temp_verts( &unfilled->stage, 0 );   /* no temp vertices needed */
+   unfilled->stage.draw = draw;
+   unfilled->stage.next = NULL;
+   unfilled->stage.tmp = NULL;
+   unfilled->stage.point = unfilled_point;
+   unfilled->stage.line = unfilled_line;
+   unfilled->stage.tri = unfilled_first_tri;
+   unfilled->stage.flush = unfilled_flush;
+   unfilled->stage.reset_stipple_counter = unfilled_reset_stipple_counter;
+   unfilled->stage.destroy = unfilled_destroy;
+   return &unfilled->stage;
+}
diff --git a/src/gallium/auxiliary/draw/draw_validate.c b/src/gallium/auxiliary/draw/draw_validate.c
new file mode 100644
index 0000000000..4375ebabbc
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_validate.c
@@ -0,0 +1,185 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "pipe/p_util.h"
+#include "pipe/p_defines.h"
+#include "draw_private.h"
+
+
+
+
+/**
+ * Rebuild the rendering pipeline from the current rasterizer state and
+ * return its first stage (also recorded in draw->pipeline.first).
+ */
+static struct draw_stage *validate_pipeline( struct draw_stage *stage )
+{
+   struct draw_context *draw = stage->draw;
+   struct draw_stage *next = draw->pipeline.rasterize;
+   int need_det = 0;       /* set by stages that force the cull stage in */
+   int precalc_flat = 0;   /* set when the flatshade stage may be inserted */
+
+   /* Set the validate's next stage to the rasterize stage, so that it
+    * can be found later if needed for flushing.
+    */
+   stage->next = next;
+
+   /*
+    * NOTE: we build up the pipeline in end-to-start order.
+    *
+    * TODO: make the current primitive part of the state and build
+    * shorter pipelines for lines & points.
+    */
+
+   if ((draw->rasterizer->line_width != 1.0 && draw->convert_wide_lines) ||
+       (draw->rasterizer->point_size != 1.0 && draw->convert_wide_points) ||
+       draw->rasterizer->point_sprite) {
+      draw->pipeline.wide->next = next;
+      next = draw->pipeline.wide;
+   }
+
+   if (draw->rasterizer->line_stipple_enable) {
+      draw->pipeline.stipple->next = next;
+      next = draw->pipeline.stipple;
+      precalc_flat = 1;		/* only needed for lines really */
+   }
+
+   if (draw->rasterizer->fill_cw != PIPE_POLYGON_MODE_FILL ||
+       draw->rasterizer->fill_ccw != PIPE_POLYGON_MODE_FILL) {
+      draw->pipeline.unfilled->next = next;
+      next = draw->pipeline.unfilled;
+      precalc_flat = 1;		/* only needed for triangles really */
+      need_det = 1;
+   }
+
+   if (draw->rasterizer->flatshade && precalc_flat) {
+      draw->pipeline.flatshade->next = next;
+      next = draw->pipeline.flatshade;
+   }
+	 
+   if (draw->rasterizer->offset_cw ||
+       draw->rasterizer->offset_ccw) {
+      draw->pipeline.offset->next = next;
+      next = draw->pipeline.offset;
+      need_det = 1;
+   }
+
+   if (draw->rasterizer->light_twoside) {
+      draw->pipeline.twoside->next = next;
+      next = draw->pipeline.twoside;
+      need_det = 1;
+   }
+
+   /* Run the cull stage when culling is enabled, and also whenever a
+    * stage above set need_det, as we calculate determinant there.
+    *
+    * This can actually be a win as culling out the triangles can lead
+    * to less work emitting vertices, smaller vertex buffers, etc.
+    * It's difficult to say whether this will be true in general.
+    */
+   if (need_det || draw->rasterizer->cull_mode) {
+      draw->pipeline.cull->next = next;
+      next = draw->pipeline.cull;
+   }
+
+   /* Clip stage
+    */
+   if (!draw->rasterizer->bypass_clipping)
+   {
+      draw->pipeline.clip->next = next;
+      next = draw->pipeline.clip;
+   }
+
+   
+   draw->pipeline.first = next;
+   return next;
+}
+
+/* Rebuild the pipeline, then replay this triangle into its first stage. */
+static void validate_tri( struct draw_stage *stage, struct prim_header *header )
+{
+   struct draw_stage *first = validate_pipeline( stage );
+   first->tri( first, header );
+}
+
+/* Rebuild the pipeline, then replay this line into its first stage. */
+static void validate_line( struct draw_stage *stage, struct prim_header *header )
+{
+   struct draw_stage *first = validate_pipeline( stage );
+   first->line( first, header );
+}
+
+/* Rebuild the pipeline, then replay this point into its first stage. */
+static void validate_point( struct draw_stage *stage, struct prim_header *header )
+{
+   struct draw_stage *first = validate_pipeline( stage );
+   first->point( first, header );
+}
+
+static void validate_reset_stipple_counter( struct draw_stage *stage )
+{
+   struct draw_stage *first = validate_pipeline( stage );   /* rebuild first */
+   first->reset_stipple_counter( first );
+}
+
+static void validate_flush( struct draw_stage *stage, unsigned flags )
+{
+   struct draw_stage *rasterize = stage->next;
+   /* next stays NULL until validate_pipeline() points it at the rasterize
+    * stage; before that there is no backend flush to pass on. */
+   if (rasterize != NULL)
+      rasterize->flush( rasterize, flags );
+}
+
+/* Destroy hook: only the stage struct itself is freed here. */
+static void validate_destroy( struct draw_stage *stage )
+{
+   FREE( stage );
+}
+
+
+/**
+ * Create validate pipeline stage.  Returns NULL if the allocation fails.
+ */
+struct draw_stage *draw_validate_stage( struct draw_context *draw )
+{
+   struct draw_stage *stage = CALLOC_STRUCT(draw_stage);
+   if (!stage)
+      return NULL;   /* do not dereference a failed allocation below */
+   stage->draw = draw;
+   stage->next = NULL;
+   stage->point = validate_point;
+   stage->line = validate_line;
+   stage->tri = validate_tri;
+   stage->flush = validate_flush;
+   stage->reset_stipple_counter = validate_reset_stipple_counter;
+   stage->destroy = validate_destroy;
+   return stage;
+}
diff --git a/src/gallium/auxiliary/draw/draw_vbuf.c b/src/gallium/auxiliary/draw/draw_vbuf.c
new file mode 100644
index 0000000000..71ac73912b
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vbuf.c
@@ -0,0 +1,570 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * \file
+ * Vertex buffer drawing stage.
+ * 
+ * \author José Fonseca <jrfonsec@tungstengraphics.com>
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "pipe/p_debug.h"
+#include "pipe/p_util.h"
+
+#include "draw_vbuf.h"
+#include "draw_private.h"
+#include "draw_vertex.h"
+#include "draw_vf.h"
+
+
+/**
+ * Vertex buffer emit stage.
+ */
+struct vbuf_stage {
+   struct draw_stage stage; /**< This must be first (base class) */
+
+   struct vbuf_render *render;  /**< Backend; destroyed by vbuf_destroy() */
+   
+   const struct vertex_info *vinfo;  /**< From render->get_vertex_info() */
+   
+   /** Vertex size in bytes */
+   unsigned vertex_size;
+
+   struct draw_vertex_fetch *vf;  /**< If NULL, emit_vertex() uses its own loop */
+   
+   /* FIXME: we have no guarantee that 'unsigned' is 32bit */
+
+   /** Vertices in hardware format */
+   unsigned *vertices;       /**< Start of the current buffer, or NULL */
+   unsigned *vertex_ptr;     /**< Write cursor inside the current buffer */
+   unsigned max_vertices;    /**< Capacity of the current buffer, in vertices */
+   unsigned nr_vertices;     /**< Vertices emitted into the current buffer */
+   
+   /** Indices */
+   ushort *indices;          /**< Index list passed to render->draw() */
+   unsigned max_indices;     /**< Capacity of indices[], from render->max_indices */
+   unsigned nr_indices;      /**< Indices queued and not yet drawn */
+
+   /** Pipe primitive */
+   unsigned prim;
+};
+
+
+/**
+ * Downcast a draw_stage to the vbuf_stage that embeds it as first member.
+ */
+static INLINE struct vbuf_stage *vbuf_stage( struct draw_stage *stage )
+{
+   struct vbuf_stage *self = (struct vbuf_stage *)stage;
+   assert(stage);
+   return self;
+}
+
+/* Forward declarations: check_space() below uses these before they are defined. */
+static void vbuf_flush_indices( struct vbuf_stage *vbuf );
+static void vbuf_flush_vertices( struct vbuf_stage *vbuf );
+static void vbuf_alloc_vertices( struct vbuf_stage *vbuf );
+
+/* True when 'bytes' more bytes at ptr would run past a bufsz-byte buffer at map. */
+static INLINE boolean 
+overflow( void *map, void *ptr, unsigned bytes, unsigned bufsz )
+{
+   const char *base = (char *)map, *cursor = (char *)ptr;
+   return ((unsigned long) (cursor - base) + bytes) > bufsz;
+}
+
+static INLINE void 
+check_space( struct vbuf_stage *vbuf, unsigned nr )
+{
+   /* A full vertex buffer is released and replaced (this also drains indices). */
+   if (vbuf->max_vertices < vbuf->nr_vertices + nr) {
+      vbuf_flush_vertices(vbuf);
+      vbuf_alloc_vertices(vbuf);
+   }
+   /* Tested afterwards: the flush above may already have emptied the list. */
+   if (vbuf->max_indices < vbuf->nr_indices + nr)
+      vbuf_flush_indices(vbuf);
+}
+
+
+#if 0 /* Debug dump of one emitted vertex; compiled out. */
+static INLINE void
+dump_emitted_vertex(const struct vertex_info *vinfo, const uint8_t *data)
+{
+   assert(vinfo == vbuf->render->get_vertex_info(vbuf->render)); /* NOTE(review): no 'vbuf' in scope here */
+   unsigned i, j, k;
+
+   for (i = 0; i < vinfo->num_attribs; i++) {
+      j = vinfo->src_index[i];
+      switch (vinfo->emit[i]) {
+      case EMIT_OMIT:
+         debug_printf("EMIT_OMIT:");
+         break;
+      case EMIT_ALL:
+         assert(i == 0);
+         assert(j == 0);
+         debug_printf("EMIT_ALL:\t");
+         for(k = 0; k < vinfo->size*4; ++k)
+            debug_printf("%02x ", *data++);
+         break;
+      case EMIT_1F:
+         debug_printf("EMIT_1F:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         break;
+      case EMIT_1F_PSIZE:
+         debug_printf("EMIT_1F_PSIZE:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         break;
+      case EMIT_2F:
+         debug_printf("EMIT_2F:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         break;
+      case EMIT_3F:
+         debug_printf("EMIT_3F:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         data += sizeof(float); /* NOTE(review): 4th advance; emit_vertex() writes 3 dwords for EMIT_3F */
+         break;
+      case EMIT_4F:
+         debug_printf("EMIT_4F:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         break;
+      case EMIT_4UB:
+         debug_printf("EMIT_4UB:\t");
+         debug_printf("%u ", *data++);
+         debug_printf("%u ", *data++);
+         debug_printf("%u ", *data++);
+         debug_printf("%u ", *data++);
+         break;
+      default:
+         assert(0);
+      }
+      debug_printf("\n");
+   }
+   debug_printf("\n");
+}
+#endif
+
+
+/**
+ * Extract the needed fields from post-transformed vertex and emit
+ * a hardware(driver) vertex, unless the vertex already carries an id.
+ * Recall that the vertices are constructed by the 'draw' module and
+ * have a couple of slots at the beginning (1-dword header, 4-dword
+ * clip pos) that we ignore here (EMIT_ALL aside); we use vertex->data[].
+ */
+static INLINE void 
+emit_vertex( struct vbuf_stage *vbuf,
+             struct vertex_header *vertex )
+{
+#if 0
+   debug_printf("emit vertex %d to %p\n", 
+           vbuf->nr_vertices, vbuf->vertex_ptr);
+#endif
+
+   if(vertex->vertex_id != UNDEFINED_VERTEX_ID) {
+      if(vertex->vertex_id < vbuf->nr_vertices)
+	 return; /* already emitted into the current buffer */
+      else
+	 debug_printf("Bad vertex id 0x%04x (>= 0x%04x)\n", 
+	         vertex->vertex_id, vbuf->nr_vertices);
+      return; /* bad id: reported above, the vertex is not re-emitted */
+   }
+      /* First use: the id is the vertex's position in the current buffer. */
+   vertex->vertex_id = vbuf->nr_vertices++;
+
+   if(!vbuf->vf) {
+      const struct vertex_info *vinfo = vbuf->vinfo;
+      uint i;
+      uint count = 0;  /* for debug/sanity */
+      
+      assert(vinfo == vbuf->render->get_vertex_info(vbuf->render));
+
+      for (i = 0; i < vinfo->num_attribs; i++) {
+         uint j = vinfo->src_index[i];
+         switch (vinfo->emit[i]) {
+         case EMIT_OMIT:
+            /* no-op */
+            break;
+         case EMIT_ALL:
+            /* just copy the whole vertex as-is to the vbuf */
+            assert(i == 0);
+            assert(j == 0);
+            memcpy(vbuf->vertex_ptr, vertex, vinfo->size * 4);
+            vbuf->vertex_ptr += vinfo->size;
+            count += vinfo->size;
+            break;
+         case EMIT_1F:
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+            count++;
+            break;
+         case EMIT_1F_PSIZE:
+            *vbuf->vertex_ptr++ = fui(vbuf->stage.draw->rasterizer->point_size);
+            count++;
+            break;
+         case EMIT_2F:
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
+            count += 2;
+            break;
+         case EMIT_3F:
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][2]);
+            count += 3;
+            break;
+         case EMIT_4F:
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][2]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][3]);
+            count += 4;
+            break;
+         case EMIT_4UB: /* components are passed to pack_ub4 in 2,1,0,3 order */
+            *vbuf->vertex_ptr++ = pack_ub4(float_to_ubyte( vertex->data[j][2] ),
+                                           float_to_ubyte( vertex->data[j][1] ),
+                                           float_to_ubyte( vertex->data[j][0] ),
+                                           float_to_ubyte( vertex->data[j][3] ));
+            count += 1;
+            break;
+         default:
+            assert(0);
+         }
+      }
+      assert(count == vinfo->size);
+#if 0
+      {
+	 static float data[256]; 
+	 draw_vf_emit_vertex(vbuf->vf, vertex, data);
+	 if(memcmp((uint8_t *)vbuf->vertex_ptr - vbuf->vertex_size, data, vbuf->vertex_size)) {
+            debug_printf("With VF:\n");
+            dump_emitted_vertex(vbuf->vinfo, (uint8_t *)data);
+	    debug_printf("Without VF:\n");
+	    dump_emitted_vertex(vbuf->vinfo, (uint8_t *)vbuf->vertex_ptr - vbuf->vertex_size);
+	    assert(0);
+	 }
+      }
+#endif
+   }
+   else {
+      draw_vf_emit_vertex(vbuf->vf, vertex, vbuf->vertex_ptr);
+      /* vertex_size is in bytes; vertex_ptr advances in 'unsigned' units. */
+      vbuf->vertex_ptr += vbuf->vertex_size/4;
+   }
+}
+
+/* Emit the triangle's three vertices and append their ids to the index list. */
+static void 
+vbuf_tri( struct draw_stage *stage,
+          struct prim_header *prim )
+{
+   struct vbuf_stage *vbuf = vbuf_stage( stage );
+   unsigned corner;
+
+   check_space( vbuf, 3 );
+
+   for (corner = 0; corner < 3; corner++) {
+      struct vertex_header *v = prim->v[corner];
+      emit_vertex( vbuf, v );
+      vbuf->indices[vbuf->nr_indices++] = (ushort) v->vertex_id;
+   }
+}
+
+/* Emit the line's two vertices and append their ids to the index list. */
+static void 
+vbuf_line( struct draw_stage *stage, 
+           struct prim_header *prim )
+{
+   struct vbuf_stage *vbuf = vbuf_stage( stage );
+   unsigned end;
+
+   check_space( vbuf, 2 );
+
+   for (end = 0; end < 2; end++) {
+      struct vertex_header *v = prim->v[end];
+      emit_vertex( vbuf, v );
+      vbuf->indices[vbuf->nr_indices++] = (ushort) v->vertex_id;
+   }
+}
+
+/* Emit the point's single vertex and append its id to the index list. */
+static void 
+vbuf_point( struct draw_stage *stage, 
+            struct prim_header *prim )
+{
+   struct vbuf_stage *vbuf = vbuf_stage( stage );
+   struct vertex_header *v = prim->v[0];
+
+   check_space( vbuf, 1 );
+
+   emit_vertex( vbuf, v );
+   vbuf->indices[vbuf->nr_indices++] = (ushort) v->vertex_id;
+}
+
+
+/**
+ * Set the prim type (points, lines or triangles) for subsequent vertices.
+ * This may result in a new vertex size.  The existing vbuffer (if any)
+ * will be flushed if needed and a new one allocated.
+ */
+static void
+vbuf_set_prim( struct vbuf_stage *vbuf, uint newprim )
+{
+   const struct vertex_info *vinfo;
+   unsigned vertex_size;
+
+   assert(newprim == PIPE_PRIM_POINTS ||
+          newprim == PIPE_PRIM_LINES ||
+          newprim == PIPE_PRIM_TRIANGLES);
+
+   vbuf->prim = newprim;
+   vbuf->render->set_primitive(vbuf->render, newprim);
+   /* The vertex layout is queried only after the backend knows the prim. */
+   vinfo = vbuf->render->get_vertex_info(vbuf->render);
+   vertex_size = vinfo->size * sizeof(float); /* dwords -> bytes */
+   /* A buffer filled with the old vertex size is released, not reused. */
+   if (vertex_size != vbuf->vertex_size)
+      vbuf_flush_vertices(vbuf);
+
+   vbuf->vinfo = vinfo;
+   vbuf->vertex_size = vertex_size;
+   if(vbuf->vf)
+      draw_vf_set_vertex_info(vbuf->vf, 
+                              vbuf->vinfo,
+                              vbuf->stage.draw->rasterizer->point_size);
+   
+   if (!vbuf->vertices)
+      vbuf_alloc_vertices(vbuf);
+}
+
+/* First triangle since creation or vbuf_flush(): switch to triangles. */
+static void 
+vbuf_first_tri( struct draw_stage *stage,
+                struct prim_header *prim )
+{
+   struct vbuf_stage *self = vbuf_stage( stage );
+
+   vbuf_flush_indices( self );       /* draw indices queued before the switch */
+   stage->tri = vbuf_tri;            /* later triangles skip this setup */
+   vbuf_set_prim( self, PIPE_PRIM_TRIANGLES );
+   vbuf_tri( stage, prim );
+}
+
+/* First line since creation or vbuf_flush(): switch to lines. */
+static void 
+vbuf_first_line( struct draw_stage *stage,
+                 struct prim_header *prim )
+{
+   struct vbuf_stage *self = vbuf_stage( stage );
+
+   vbuf_flush_indices( self );       /* draw indices queued before the switch */
+   stage->line = vbuf_line;          /* later lines skip this setup */
+   vbuf_set_prim( self, PIPE_PRIM_LINES );
+   vbuf_line( stage, prim );
+}
+
+/* First point since creation or vbuf_flush(): switch to points. */
+static void 
+vbuf_first_point( struct draw_stage *stage,
+                  struct prim_header *prim )
+{
+   struct vbuf_stage *self = vbuf_stage( stage );
+
+   vbuf_flush_indices( self );       /* draw indices queued before the switch */
+   stage->point = vbuf_point;        /* later points skip this setup */
+   vbuf_set_prim( self, PIPE_PRIM_POINTS );
+   vbuf_point( stage, prim );
+}
+
+/* Draw the queued indices with the current primitive and empty the list. */
+static void 
+vbuf_flush_indices( struct vbuf_stage *vbuf ) 
+{
+   if(!vbuf->nr_indices)
+      return;
+   /* Sanity check: the write cursor must agree with the vertex count. */
+   assert((uint) (vbuf->vertex_ptr - vbuf->vertices) == 
+          vbuf->nr_vertices * vbuf->vertex_size / sizeof(unsigned));
+   /* The index count must be a whole number of primitives. */
+   switch(vbuf->prim) {
+   case PIPE_PRIM_POINTS:
+      break;
+   case PIPE_PRIM_LINES:
+      assert(vbuf->nr_indices % 2 == 0);
+      break;
+   case PIPE_PRIM_TRIANGLES:
+      assert(vbuf->nr_indices % 3 == 0);
+      break;
+   default:
+      assert(0);
+   }
+   
+   vbuf->render->draw(vbuf->render, vbuf->indices, vbuf->nr_indices);
+   
+   vbuf->nr_indices = 0;
+
+   /* don't need to reset point/line/tri functions */
+#if 0
+   stage->point = vbuf_first_point;
+   stage->line = vbuf_first_line;
+   stage->tri = vbuf_first_tri;
+#endif
+}
+
+
+/**
+ * Flush pending indices and release the vertex buffer (no reallocation here).
+ * 
+ * XXX: We separate flush-on-index-full and flush-on-vb-full, but may 
+ * raise issues uploading vertices if the hardware wants to flush when
+ * we flush.
+ */
+static void 
+vbuf_flush_vertices( struct vbuf_stage *vbuf )
+{
+   if(!vbuf->vertices)
+      return;
+
+   /* Queued indices refer to this buffer, so they are drawn first. */
+   vbuf_flush_indices(vbuf);
+   /* Reset temporary vertices ids */
+   if(vbuf->nr_vertices)
+      draw_reset_vertex_ids( vbuf->stage.draw );
+
+   /* Hand the buffer back to the backend and forget about it. */
+   vbuf->render->release_vertices(vbuf->render,
+                                  vbuf->vertices,
+                                  vbuf->vertex_size,
+                                  vbuf->nr_vertices);
+   vbuf->vertices = vbuf->vertex_ptr = NULL;
+   vbuf->nr_vertices = vbuf->max_vertices = 0;
+}
+   
+/** Allocate a new vertex buffer from the backend for the current vertex_size. */
+static void 
+vbuf_alloc_vertices( struct vbuf_stage *vbuf )
+{
+   assert(!vbuf->nr_indices);
+   assert(!vbuf->vertices);
+   /* Clamp: ids run up to max_vertices-1 and are stored as ushort, so they must stay below the sentinel. */
+   vbuf->max_vertices = vbuf->render->max_vertex_buffer_bytes / vbuf->vertex_size;
+   if(vbuf->max_vertices >= UNDEFINED_VERTEX_ID)
+      vbuf->max_vertices = UNDEFINED_VERTEX_ID - 1;
+   vbuf->vertices = (uint *) vbuf->render->allocate_vertices(vbuf->render,
+                                                    (ushort) vbuf->vertex_size,
+                                                    (ushort) vbuf->max_vertices);
+   vbuf->vertex_ptr = vbuf->vertices;
+}
+
+/* Draw queued indices and re-arm the first_* hooks; DRAW_FLUSH_BACKEND also releases vertices. */
+static void 
+vbuf_flush( struct draw_stage *stage, unsigned flags )
+{
+   struct vbuf_stage *self = vbuf_stage( stage );
+
+   vbuf_flush_indices( self );
+
+   stage->tri = vbuf_first_tri;
+   stage->line = vbuf_first_line;
+   stage->point = vbuf_first_point;
+
+   if ((flags & DRAW_FLUSH_BACKEND) != 0)
+      vbuf_flush_vertices( self );
+}
+
+
+static void 
+vbuf_reset_stipple_counter( struct draw_stage *stage )
+{
+   /* Currently a no-op.  XXX: Need to do something here for hardware
+    * with linestipple. */
+   (void) stage;
+}
+
+/* Free the index list, the vertex fetch helper, the render backend and the stage. */
+static void vbuf_destroy( struct draw_stage *stage )
+{
+   struct vbuf_stage *self = vbuf_stage( stage );
+
+   if (self->indices != NULL)
+      align_free( self->indices );
+
+   if (self->vf != NULL)
+      draw_vf_destroy( self->vf );
+
+   if (self->render != NULL)
+      self->render->destroy( self->render );
+
+   FREE( stage );
+}
+
+
+/**
+ * Create a new primitive vbuf/render stage.  Returns NULL on failure.
+ */
+struct draw_stage *draw_vbuf_stage( struct draw_context *draw,
+                                    struct vbuf_render *render )
+{
+   struct vbuf_stage *vbuf = CALLOC_STRUCT(vbuf_stage);
+
+   if(!vbuf)
+      return NULL;
+   
+   vbuf->stage.draw = draw;
+   vbuf->stage.point = vbuf_first_point;
+   vbuf->stage.line = vbuf_first_line;
+   vbuf->stage.tri = vbuf_first_tri;
+   vbuf->stage.flush = vbuf_flush;
+   vbuf->stage.reset_stipple_counter = vbuf_reset_stipple_counter;
+   vbuf->stage.destroy = vbuf_destroy;
+   
+   vbuf->render = render;
+
+   assert(render->max_indices < UNDEFINED_VERTEX_ID);
+   vbuf->max_indices = render->max_indices;
+   vbuf->indices = (ushort *)
+      align_malloc( vbuf->max_indices * sizeof(vbuf->indices[0]), 16 );
+   if(!vbuf->indices) {
+      /* vbuf_destroy() frees vbuf and destroys 'render': do not touch vbuf after it. */
+      vbuf_destroy(&vbuf->stage);
+      return NULL;
+   }
+
+   vbuf->vertex_ptr = vbuf->vertices = NULL;
+   vbuf->prim = ~0;
+   
+   if(!GETENV("GALLIUM_NOVF"))
+      vbuf->vf = draw_vf_create();
+   return &vbuf->stage;
+}
diff --git a/src/gallium/auxiliary/draw/draw_vbuf.h b/src/gallium/auxiliary/draw/draw_vbuf.h
new file mode 100644
index 0000000000..cfd2b9820c
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vbuf.h
@@ -0,0 +1,106 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * \file
+ * Vertex buffer drawing stage.
+ * 
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ * \author José Fonseca <jrfonsec@tungstengraphics.com>
+ */
+
+#ifndef DRAW_VBUF_H_
+#define DRAW_VBUF_H_
+
+
+#include "pipe/p_util.h"
+
+
+struct draw_context;
+struct vertex_info;
+
+
+/**
+ * Interface for hardware vertex buffer rendering.
+ */
+struct vbuf_render {
+
+   /**
+    * Driver limits.  May be tuned lower to improve cache hits on
+    * index list.
+    */
+   unsigned max_indices;
+   unsigned max_vertex_buffer_bytes;
+
+   /**
+    * Get the hardware vertex format.
+    * 
+    * XXX: have this in draw_context instead?
+    */ 
+   const struct vertex_info *(*get_vertex_info)( struct vbuf_render * );
+	 
+   /**
+    * Request a destination for vertices.  Both arguments are ushort, so
+    * callers must keep sizes and counts within that range.  Hardware
+    * renderers will use ttm memory, others will just malloc something.
+    */
+   void *(*allocate_vertices)( struct vbuf_render *,
+			       ushort vertex_size,
+			       ushort nr_vertices );
+
+   /**
+    * Notify the renderer of the current primitive when it changes.
+    * Prim is restricted to TRIANGLES, LINES and POINTS.
+    */
+   void (*set_primitive)( struct vbuf_render *, unsigned prim );
+
+   /**
+    * DrawElements, note indices are ushort:
+    */
+   void (*draw)( struct vbuf_render *,
+		 const ushort *indices,
+		 uint nr_indices );
+
+   /**
+    * Called when vbuf is done with this set of vertices:
+    */
+   void (*release_vertices)( struct vbuf_render *,
+			     void *vertices, 
+			     unsigned vertex_size,
+			     unsigned vertices_used );
+
+   void (*destroy)( struct vbuf_render * );  /**< Destroy the renderer itself */
+};
+
+
+/** Create a draw stage that emits vertices and indices through 'render'. */
+struct draw_stage *
+draw_vbuf_stage( struct draw_context *draw,
+                 struct vbuf_render *render );
+
+
+#endif /*DRAW_VBUF_H_*/
diff --git a/src/gallium/auxiliary/draw/draw_vertex.c b/src/gallium/auxiliary/draw/draw_vertex.c
new file mode 100644
index 0000000000..daf1ef4b80
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vertex.c
@@ -0,0 +1,79 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/*
+ * Functions for specifying the post-transformation vertex layout.
+ *
+ * Author:
+ *    Brian Paul
+ *    Keith Whitwell
+ */
+
+
+#include "draw/draw_private.h"
+#include "draw/draw_vertex.h"
+
+
+/**
+ * Sum the dword footprint of every attribute in vinfo->emit[] and store
+ * the total in vinfo->size.  EMIT_ALL is not accepted here (asserts).
+ */
+void
+draw_compute_vertex_size(struct vertex_info *vinfo)
+{
+   uint dwords = 0;
+   uint attr;
+
+   for (attr = 0; attr < vinfo->num_attribs; attr++) {
+      switch (vinfo->emit[attr]) {
+      case EMIT_OMIT:
+         break;
+      case EMIT_1F:
+      case EMIT_1F_PSIZE:
+      case EMIT_4UB:
+         dwords += 1;
+         break;
+      case EMIT_2F:
+         dwords += 2;
+         break;
+      case EMIT_3F:
+         dwords += 3;
+         break;
+      case EMIT_4F:
+         dwords += 4;
+         break;
+      case EMIT_ALL:
+         /* fall-through */
+      default:
+         assert(0);
+      }
+   }
+
+   /* The limit is checked against size * 4 (dwords scaled to bytes). */
+   vinfo->size = dwords;
+   assert(vinfo->size * 4 <= MAX_VERTEX_SIZE);
+}
diff --git a/src/gallium/auxiliary/draw/draw_vertex.h b/src/gallium/auxiliary/draw/draw_vertex.h
new file mode 100644
index 0000000000..267c74203b
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vertex.h
@@ -0,0 +1,111 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Post-transform vertex format info.  The vertex_info struct is used by
+ * the draw_vbuf code to emit hardware-specific vertex layouts into hw
+ * vertex buffers.
+ *
+ * Author:
+ *    Brian Paul
+ */
+
+
+#ifndef DRAW_VERTEX_H
+#define DRAW_VERTEX_H
+
+
+#include "pipe/p_state.h"
+
+
+/**
+ * Vertex attribute emit modes
+ */
+enum attrib_emit {
+   EMIT_OMIT,      /**< don't emit the attribute */
+   EMIT_ALL,       /**< emit whole post-xform vertex, w/ header */
+   EMIT_1F,        /**< emit 1 float (1 dword) */
+   EMIT_1F_PSIZE,  /**< insert constant point size */
+   EMIT_2F,        /**< emit 2 floats (2 dwords) */
+   EMIT_3F,        /**< emit 3 floats (3 dwords) */
+   EMIT_4F,        /**< emit 4 floats (4 dwords) */
+   EMIT_4UB  /**< XXX may need variations for RGBA vs BGRA, etc */
+};
+
+
+/**
+ * Attribute interpolation mode (one per attribute in vertex_info::interp_mode)
+ */
+enum interp_mode {
+   INTERP_NONE,      /**< never interpolate vertex header info */
+   INTERP_POS,       /**< special case for frag position */
+   INTERP_CONSTANT,
+   INTERP_LINEAR,
+   INTERP_PERSPECTIVE
+};
+
+
+/**
+ * Information about hardware/rasterization vertex layout.
+ */
+struct vertex_info
+{
+   uint num_attribs;   /**< number of entries used in the arrays below */
+   uint hwfmt[4];      /**< hardware format info for this format */
+   enum interp_mode interp_mode[PIPE_MAX_SHADER_INPUTS];
+   enum attrib_emit emit[PIPE_MAX_SHADER_INPUTS];   /**< EMIT_x */
+   uint src_index[PIPE_MAX_SHADER_INPUTS]; /**< map to post-xform attribs */
+   uint size;          /**< total vertex size in dwords */
+};
+
+
+
+/**
+ * Append one attribute (emit mode, interpolation mode, source slot) to vinfo.
+ * \param src_index  indicates which post-transformed vertex attrib slot
+ *                   corresponds to this attribute.
+ * \return index of the new attribute, i.e. the previous num_attribs
+ */
+static INLINE uint
+draw_emit_vertex_attr(struct vertex_info *vinfo,
+                      enum attrib_emit emit, enum interp_mode interp,
+                      uint src_index)
+{
+   const uint slot = vinfo->num_attribs;
+   assert(slot < PIPE_MAX_SHADER_INPUTS);
+   vinfo->src_index[slot] = src_index;
+   vinfo->interp_mode[slot] = interp;
+   vinfo->emit[slot] = emit;
+   vinfo->num_attribs = slot + 1;
+   return slot;
+}
+
+/** Recompute vinfo->size (in dwords) from the emit[] modes. */
+extern void draw_compute_vertex_size(struct vertex_info *vinfo);
+
+
+#endif /* DRAW_VERTEX_H */
diff --git a/src/gallium/auxiliary/draw/draw_vertex_cache.c b/src/gallium/auxiliary/draw/draw_vertex_cache.c
new file mode 100644
index 0000000000..44427999cc
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vertex_cache.c
@@ -0,0 +1,196 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "pipe/p_util.h"
+#include "draw_private.h"
+#include "draw_context.h"
+
+/* Invalidate the cache: set all bytes of vcache.idx[] to 0xff so each entry reads as ~0. */
+void draw_vertex_cache_invalidate( struct draw_context *draw )
+{
+   assert(draw->pq.queue_nr == 0);        /* both queues must be empty ... */
+   assert(draw->vs.queue_nr == 0);
+   assert(draw->vcache.referenced == 0);  /* ... and no slot may be referenced */
+
+   memset(draw->vcache.idx, ~0, sizeof(draw->vcache.idx));
+}
+
+
+/**
+ * Check if vertex is in cache, otherwise add it.  It won't go through
+ * VS yet, not until there is a flush operation or the VS queue fills up.
+ *
+ * Note that cache entries are basically just two pointers: the first
+ * an index into the user's vertex arrays, the second a location in
+ * the vertex shader cache for the post-transformed vertex.
+ * \param i  vertex index to look up; on a miss it is queued as the VS 'elt'
+ * \return pointer to location of (post-transformed) vertex header in the cache
+ */
+static struct vertex_header *get_vertex( struct draw_context *draw,
+					 unsigned i )
+{
+   unsigned slot = (i + (i>>5)) % VCACHE_SIZE;
+
+   assert(slot < 32); /* so we don't exceed the bitfield size below */
+
+   /* Cache miss?
+    */
+   if (draw->vcache.idx[slot] != i) {
+
+      /* If slot is in use, use the overflow area:
+       */
+      if (draw->vcache.referenced & (1u << slot)) {
+	 slot = VCACHE_SIZE + draw->vcache.overflow++;
+      }
+
+      assert(slot < Elements(draw->vcache.idx));
+
+      draw->vcache.idx[slot] = i;
+
+      /* Add to vertex shader queue:
+       */
+      assert(draw->vs.queue_nr < VS_QUEUE_LENGTH);
+      draw->vs.queue[draw->vs.queue_nr].dest = draw->vcache.vertex[slot];
+      draw->vs.queue[draw->vs.queue_nr].elt = i;
+      draw->vs.queue_nr++;
+
+      /* Need to set the vertex's edge flag here.  If we're being called
+       * by do_ef_triangle(), that function needs edge flag info!
+       */
+      draw->vcache.vertex[slot]->clipmask = 0;
+      draw->vcache.vertex[slot]->edgeflag = 1; /*XXX use user's edge flag! */
+      draw->vcache.vertex[slot]->pad = 0;
+      draw->vcache.vertex[slot]->vertex_id = UNDEFINED_VERTEX_ID;
+   }
+
+   /* Flushing may clear the bitfield but not idx[], so re-mark regular
+    * slots even on a hit (fixes long triangle fans).  Overflow slots have
+    * no bit: a shift by >= 32 would be undefined behaviour.
+    */
+   if (slot < VCACHE_SIZE)
+      draw->vcache.referenced |= (1u << slot);
+   return draw->vcache.vertex[slot];
+}
+
+/* Indexed lookup: element i of the 'unsigned' index buffer selects the vertex. */
+static struct vertex_header *get_uint_elt_vertex( struct draw_context *draw,
+                                                  unsigned i )
+{
+   const unsigned *elts = (const unsigned *) draw->user.elts;
+   return get_vertex( draw, elts[i] );
+}
+
+/* Indexed lookup: element i of the 'ushort' index buffer selects the vertex. */
+static struct vertex_header *get_ushort_elt_vertex( struct draw_context *draw,
+						    unsigned i )
+{
+   const ushort *elts = (const ushort *) draw->user.elts;
+   return get_vertex( draw, elts[i] );
+}
+
+/* Indexed lookup: element i of the 'ubyte' index buffer selects the vertex. */
+static struct vertex_header *get_ubyte_elt_vertex( struct draw_context *draw,
+                                                   unsigned i )
+{
+   const ubyte *elts = (const ubyte *) draw->user.elts;
+   return get_vertex( draw, elts[i] );
+}
+
+/* Set vertex_id to UNDEFINED_VERTEX_ID in every entry of vcache.vertex[]. */
+void draw_vertex_cache_reset_vertex_ids( struct draw_context *draw )
+{
+   unsigned i;
+
+   for (i = 0; i < Elements(draw->vcache.vertex); i++)
+      draw->vcache.vertex[i]->vertex_id = UNDEFINED_VERTEX_ID;
+}
+
+/* Clear the 'referenced' bitfield and the overflow counter; idx[] is left untouched. */
+void draw_vertex_cache_unreference( struct draw_context *draw )
+{
+   draw->vcache.referenced = 0;
+   draw->vcache.overflow = 0;
+}
+
+/* Return TRUE if overflow + nr_verts stays strictly below VCACHE_OVERFLOW, else FALSE. */
+int draw_vertex_cache_check_space( struct draw_context *draw,
+				   unsigned nr_verts )
+{
+   if (draw->vcache.overflow + nr_verts < VCACHE_OVERFLOW) {
+      /* The vs queue is sized so that this assertion can never fail:
+       */
+      assert(draw->vs.queue_nr + nr_verts < VS_QUEUE_LENGTH);
+      return TRUE;
+   }
+   else
+      return FALSE;
+}
+
+
+
+/**
+ * Tell the drawing context about the index/element buffer to use
+ * (ala glDrawElements)
+ * If no element buffer is to be used (i.e. glDrawArrays) then this
+ * should be called with eltSize=0 and elements=NULL.
+ *
+ * \param draw  the drawing context
+ * \param eltSize  size of each element (0, 1, 2 or 4 bytes; 0 = no elements)
+ * \param elements  the element buffer ptr (stored, not copied)
+ */
+void
+draw_set_mapped_element_buffer( struct draw_context *draw,
+                                unsigned eltSize, void *elements )
+{
+//   draw_statechange( draw );
+
+   /* choose the get_vertex() function to use */
+   switch (eltSize) {
+   case 0:   /* non-indexed: i is used directly as the vertex index */
+      draw->vcache.get_vertex = get_vertex;
+      break;
+   case 1:   /* ubyte indices */
+      draw->vcache.get_vertex = get_ubyte_elt_vertex;
+      break;
+   case 2:   /* ushort indices */
+      draw->vcache.get_vertex = get_ushort_elt_vertex;
+      break;
+   case 4:   /* unsigned indices */
+      draw->vcache.get_vertex = get_uint_elt_vertex;
+      break;
+   default:  /* unsupported size: the get_vertex pointer is left unchanged */
+      assert(0);
+   }
+   draw->user.elts = elements;
+   draw->user.eltSize = eltSize;
+}
+
diff --git a/src/gallium/auxiliary/draw/draw_vertex_fetch.c b/src/gallium/auxiliary/draw/draw_vertex_fetch.c
new file mode 100644
index 0000000000..e13df04605
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vertex_fetch.c
@@ -0,0 +1,510 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "draw_private.h"
+#include "draw_context.h"
+
+
+#define DRAW_DBG 0
+
+/**
+ * Generate fetch_NAME(): read SZ components from 'ptr', converting each
+ * to float with the CVT expression (which uses 'ptr' and 'i'), and fill
+ * the remaining components of attrib[0..3] from the defaults (0,0,0,1).
+ *
+ * This is probably needed/duplicated elsewhere, eg format
+ * conversion, texture sampling etc.
+ */
+#define FETCH_ATTRIB( NAME, SZ, CVT )			\
+static void						\
+fetch_##NAME(const void *ptr, float *attrib)		\
+{							\
+   static const float defaults[4] = { 0,0,0,1 };	\
+   int i;						\
+							\
+   for (i = 0; i < SZ; i++) {				\
+      attrib[i] = CVT;					\
+   }							\
+							\
+   for (; i < 4; i++) {					\
+      attrib[i] = defaults[i];				\
+   }							\
+}
+/* CVT expressions for FETCH_ATTRIB; each reads component 'i' through 'ptr'.  Floats: */
+#define CVT_64_FLOAT   ((float) ((const double *) ptr)[i])
+#define CVT_32_FLOAT   (((const float *) ptr)[i])
+/* Unsigned integers converted to float without scaling: */
+#define CVT_8_USCALED  ((float) ((const unsigned char *) ptr)[i])
+#define CVT_16_USCALED ((float) ((const unsigned short *) ptr)[i])
+#define CVT_32_USCALED ((float) ((const unsigned int *) ptr)[i])
+/* Signed integers, unscaled ('signed char': plain 'char' may be unsigned): */
+#define CVT_8_SSCALED  ((float) ((const signed char *) ptr)[i])
+#define CVT_16_SSCALED ((float) ((const short *) ptr)[i])
+#define CVT_32_SSCALED ((float) ((const int *) ptr)[i])
+/* Unsigned integers divided by the type's maximum value: */
+#define CVT_8_UNORM    ((float) ((const unsigned char *) ptr)[i] / 255.0f)
+#define CVT_16_UNORM   ((float) ((const unsigned short *) ptr)[i] / 65535.0f)
+#define CVT_32_UNORM   ((float) ((const unsigned int *) ptr)[i] / 4294967295.0f)
+/* Signed integers divided by the type's maximum positive value (not clamped): */
+#define CVT_8_SNORM    ((float) ((const signed char *) ptr)[i] / 127.0f)
+#define CVT_16_SNORM   ((float) ((const short *) ptr)[i] / 32767.0f)
+#define CVT_32_SNORM   ((float) ((const int *) ptr)[i] / 2147483647.0f)
+/* Instantiate one fetch_<FORMAT>() converter per supported vertex format. */
+FETCH_ATTRIB( R64G64B64A64_FLOAT,   4, CVT_64_FLOAT )
+FETCH_ATTRIB( R64G64B64_FLOAT,      3, CVT_64_FLOAT )
+FETCH_ATTRIB( R64G64_FLOAT,         2, CVT_64_FLOAT )
+FETCH_ATTRIB( R64_FLOAT,            1, CVT_64_FLOAT )
+
+FETCH_ATTRIB( R32G32B32A32_FLOAT,   4, CVT_32_FLOAT )
+FETCH_ATTRIB( R32G32B32_FLOAT,      3, CVT_32_FLOAT )
+FETCH_ATTRIB( R32G32_FLOAT,         2, CVT_32_FLOAT )
+FETCH_ATTRIB( R32_FLOAT,            1, CVT_32_FLOAT )
+
+FETCH_ATTRIB( R32G32B32A32_USCALED, 4, CVT_32_USCALED )
+FETCH_ATTRIB( R32G32B32_USCALED,    3, CVT_32_USCALED )
+FETCH_ATTRIB( R32G32_USCALED,       2, CVT_32_USCALED )
+FETCH_ATTRIB( R32_USCALED,          1, CVT_32_USCALED )
+
+FETCH_ATTRIB( R32G32B32A32_SSCALED, 4, CVT_32_SSCALED )
+FETCH_ATTRIB( R32G32B32_SSCALED,    3, CVT_32_SSCALED )
+FETCH_ATTRIB( R32G32_SSCALED,       2, CVT_32_SSCALED )
+FETCH_ATTRIB( R32_SSCALED,          1, CVT_32_SSCALED )
+
+FETCH_ATTRIB( R32G32B32A32_UNORM, 4, CVT_32_UNORM )
+FETCH_ATTRIB( R32G32B32_UNORM,    3, CVT_32_UNORM )
+FETCH_ATTRIB( R32G32_UNORM,       2, CVT_32_UNORM )
+FETCH_ATTRIB( R32_UNORM,          1, CVT_32_UNORM )
+
+FETCH_ATTRIB( R32G32B32A32_SNORM, 4, CVT_32_SNORM )
+FETCH_ATTRIB( R32G32B32_SNORM,    3, CVT_32_SNORM )
+FETCH_ATTRIB( R32G32_SNORM,       2, CVT_32_SNORM )
+FETCH_ATTRIB( R32_SNORM,          1, CVT_32_SNORM )
+
+FETCH_ATTRIB( R16G16B16A16_USCALED, 4, CVT_16_USCALED )
+FETCH_ATTRIB( R16G16B16_USCALED,    3, CVT_16_USCALED )
+FETCH_ATTRIB( R16G16_USCALED,       2, CVT_16_USCALED )
+FETCH_ATTRIB( R16_USCALED,          1, CVT_16_USCALED )
+
+FETCH_ATTRIB( R16G16B16A16_SSCALED, 4, CVT_16_SSCALED )
+FETCH_ATTRIB( R16G16B16_SSCALED,    3, CVT_16_SSCALED )
+FETCH_ATTRIB( R16G16_SSCALED,       2, CVT_16_SSCALED )
+FETCH_ATTRIB( R16_SSCALED,          1, CVT_16_SSCALED )
+
+FETCH_ATTRIB( R16G16B16A16_UNORM, 4, CVT_16_UNORM )
+FETCH_ATTRIB( R16G16B16_UNORM,    3, CVT_16_UNORM )
+FETCH_ATTRIB( R16G16_UNORM,       2, CVT_16_UNORM )
+FETCH_ATTRIB( R16_UNORM,          1, CVT_16_UNORM )
+
+FETCH_ATTRIB( R16G16B16A16_SNORM, 4, CVT_16_SNORM )
+FETCH_ATTRIB( R16G16B16_SNORM,    3, CVT_16_SNORM )
+FETCH_ATTRIB( R16G16_SNORM,       2, CVT_16_SNORM )
+FETCH_ATTRIB( R16_SNORM,          1, CVT_16_SNORM )
+
+FETCH_ATTRIB( R8G8B8A8_USCALED,   4, CVT_8_USCALED )
+FETCH_ATTRIB( R8G8B8_USCALED,     3, CVT_8_USCALED )
+FETCH_ATTRIB( R8G8_USCALED,       2, CVT_8_USCALED )
+FETCH_ATTRIB( R8_USCALED,         1, CVT_8_USCALED )
+
+FETCH_ATTRIB( R8G8B8A8_SSCALED,  4, CVT_8_SSCALED )
+FETCH_ATTRIB( R8G8B8_SSCALED,    3, CVT_8_SSCALED )
+FETCH_ATTRIB( R8G8_SSCALED,      2, CVT_8_SSCALED )
+FETCH_ATTRIB( R8_SSCALED,        1, CVT_8_SSCALED )
+
+FETCH_ATTRIB( R8G8B8A8_UNORM,  4, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8B8_UNORM,    3, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8_UNORM,      2, CVT_8_UNORM )
+FETCH_ATTRIB( R8_UNORM,        1, CVT_8_UNORM )
+
+FETCH_ATTRIB( R8G8B8A8_SNORM,  4, CVT_8_SNORM )
+FETCH_ATTRIB( R8G8B8_SNORM,    3, CVT_8_SNORM )
+FETCH_ATTRIB( R8G8_SNORM,      2, CVT_8_SNORM )
+FETCH_ATTRIB( R8_SNORM,        1, CVT_8_SNORM )
+
+FETCH_ATTRIB( A8R8G8B8_UNORM,       4, CVT_8_UNORM ) /* NOTE(review): same expansion as R8G8B8A8_UNORM, no channel swizzle -- confirm intended */
+/* (R8G8B8A8_UNORM is already instantiated above, so it is not repeated here) */
+
+
+/* Return the fetch_<FORMAT>() converter for 'format', or NULL if there is none. */
+static fetch_func get_fetch_func( enum pipe_format format )
+{
+#if 0
+   {
+      char tmp[80];
+      pf_sprint_name(tmp, format);
+      debug_printf("%s: %s\n", __FUNCTION__, tmp);
+   }
+#endif
+
+   switch (format) {
+   case PIPE_FORMAT_R64_FLOAT:
+      return fetch_R64_FLOAT;
+   case PIPE_FORMAT_R64G64_FLOAT:
+      return fetch_R64G64_FLOAT;
+   case PIPE_FORMAT_R64G64B64_FLOAT:
+      return fetch_R64G64B64_FLOAT;
+   case PIPE_FORMAT_R64G64B64A64_FLOAT:
+      return fetch_R64G64B64A64_FLOAT;
+
+   case PIPE_FORMAT_R32_FLOAT:
+      return fetch_R32_FLOAT;
+   case PIPE_FORMAT_R32G32_FLOAT:
+      return fetch_R32G32_FLOAT;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      return fetch_R32G32B32_FLOAT;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      return fetch_R32G32B32A32_FLOAT;
+
+   case PIPE_FORMAT_R32_UNORM:
+      return fetch_R32_UNORM;
+   case PIPE_FORMAT_R32G32_UNORM:
+      return fetch_R32G32_UNORM;
+   case PIPE_FORMAT_R32G32B32_UNORM:
+      return fetch_R32G32B32_UNORM;
+   case PIPE_FORMAT_R32G32B32A32_UNORM:
+      return fetch_R32G32B32A32_UNORM;
+
+   case PIPE_FORMAT_R32_USCALED:
+      return fetch_R32_USCALED;
+   case PIPE_FORMAT_R32G32_USCALED:
+      return fetch_R32G32_USCALED;
+   case PIPE_FORMAT_R32G32B32_USCALED:
+      return fetch_R32G32B32_USCALED;
+   case PIPE_FORMAT_R32G32B32A32_USCALED:
+      return fetch_R32G32B32A32_USCALED;
+
+   case PIPE_FORMAT_R32_SNORM:
+      return fetch_R32_SNORM;
+   case PIPE_FORMAT_R32G32_SNORM:
+      return fetch_R32G32_SNORM;
+   case PIPE_FORMAT_R32G32B32_SNORM:
+      return fetch_R32G32B32_SNORM;
+   case PIPE_FORMAT_R32G32B32A32_SNORM:
+      return fetch_R32G32B32A32_SNORM;
+
+   case PIPE_FORMAT_R32_SSCALED:
+      return fetch_R32_SSCALED;
+   case PIPE_FORMAT_R32G32_SSCALED:
+      return fetch_R32G32_SSCALED;
+   case PIPE_FORMAT_R32G32B32_SSCALED:
+      return fetch_R32G32B32_SSCALED;
+   case PIPE_FORMAT_R32G32B32A32_SSCALED:
+      return fetch_R32G32B32A32_SSCALED;
+
+   case PIPE_FORMAT_R16_UNORM:
+      return fetch_R16_UNORM;
+   case PIPE_FORMAT_R16G16_UNORM:
+      return fetch_R16G16_UNORM;
+   case PIPE_FORMAT_R16G16B16_UNORM:
+      return fetch_R16G16B16_UNORM;
+   case PIPE_FORMAT_R16G16B16A16_UNORM:
+      return fetch_R16G16B16A16_UNORM;
+
+   case PIPE_FORMAT_R16_USCALED:
+      return fetch_R16_USCALED;
+   case PIPE_FORMAT_R16G16_USCALED:
+      return fetch_R16G16_USCALED;
+   case PIPE_FORMAT_R16G16B16_USCALED:
+      return fetch_R16G16B16_USCALED;
+   case PIPE_FORMAT_R16G16B16A16_USCALED:
+      return fetch_R16G16B16A16_USCALED;
+
+   case PIPE_FORMAT_R16_SNORM:
+      return fetch_R16_SNORM;
+   case PIPE_FORMAT_R16G16_SNORM:
+      return fetch_R16G16_SNORM;
+   case PIPE_FORMAT_R16G16B16_SNORM:
+      return fetch_R16G16B16_SNORM;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      return fetch_R16G16B16A16_SNORM;
+
+   case PIPE_FORMAT_R16_SSCALED:
+      return fetch_R16_SSCALED;
+   case PIPE_FORMAT_R16G16_SSCALED:
+      return fetch_R16G16_SSCALED;
+   case PIPE_FORMAT_R16G16B16_SSCALED:
+      return fetch_R16G16B16_SSCALED;
+   case PIPE_FORMAT_R16G16B16A16_SSCALED:
+      return fetch_R16G16B16A16_SSCALED;
+
+   case PIPE_FORMAT_R8_UNORM:
+      return fetch_R8_UNORM;
+   case PIPE_FORMAT_R8G8_UNORM:
+      return fetch_R8G8_UNORM;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      return fetch_R8G8B8_UNORM;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      return fetch_R8G8B8A8_UNORM;
+
+   case PIPE_FORMAT_R8_USCALED:
+      return fetch_R8_USCALED;
+   case PIPE_FORMAT_R8G8_USCALED:
+      return fetch_R8G8_USCALED;
+   case PIPE_FORMAT_R8G8B8_USCALED:
+      return fetch_R8G8B8_USCALED;
+   case PIPE_FORMAT_R8G8B8A8_USCALED:
+      return fetch_R8G8B8A8_USCALED;
+
+   case PIPE_FORMAT_R8_SNORM:
+      return fetch_R8_SNORM;
+   case PIPE_FORMAT_R8G8_SNORM:
+      return fetch_R8G8_SNORM;
+   case PIPE_FORMAT_R8G8B8_SNORM:
+      return fetch_R8G8B8_SNORM;
+   case PIPE_FORMAT_R8G8B8A8_SNORM:
+      return fetch_R8G8B8A8_SNORM;
+
+   case PIPE_FORMAT_R8_SSCALED:
+      return fetch_R8_SSCALED;
+   case PIPE_FORMAT_R8G8_SSCALED:
+      return fetch_R8G8_SSCALED;
+   case PIPE_FORMAT_R8G8B8_SSCALED:
+      return fetch_R8G8B8_SSCALED;
+   case PIPE_FORMAT_R8G8B8A8_SSCALED:
+      return fetch_R8G8B8A8_SSCALED;
+
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      return fetch_A8R8G8B8_UNORM;
+
+   case 0:
+      return NULL;		/* format value 0 is accepted silently; original author: "not sure why this is needed" */
+
+   default:
+      assert(0);   /* no converter exists for this format */
+      return NULL;
+   }
+}
+
+/* Transpose a 4x4 float matrix: out[r*4+c] = in[c*4+r].  'out' and 'in' must not overlap. */
+static void 
+transpose_4x4( float *out, const float *in )
+{
+   /* This can be achieved in 12 sse instructions, plus the final
+    * stores I guess.  This is probably a bit more than that - maybe
+    * 32 or so?
+    */
+   out[0] = in[0];  out[1] = in[4];  out[2] = in[8];   out[3] = in[12];
+   out[4] = in[1];  out[5] = in[5];  out[6] = in[9];   out[7] = in[13];
+   out[8] = in[2];  out[9] = in[6];  out[10] = in[10]; out[11] = in[14];
+   out[12] = in[3]; out[13] = in[7]; out[14] = in[11]; out[15] = in[15];
+}
+
+
+/* Fast path for two 3-float attributes: copies x,y,z and sets w = 1 in Inputs[0] and Inputs[1]. */
+static void fetch_xyz_rgb( struct draw_context *draw,
+			   struct tgsi_exec_machine *machine,
+			   const unsigned *elts,
+			   unsigned count )
+{
+   const unsigned *pitch   = draw->vertex_fetch.pitch;
+   const ubyte **src       = draw->vertex_fetch.src_ptr;
+   int i;
+
+   assert(count <= 4);
+
+//   debug_printf("%s\n", __FUNCTION__);
+
+   /* loop over the four vertex lanes; all four are always fetched, so
+    * elts[] must hold four valid indices whatever 'count' is */
+
+   for (i = 0; i < 4; i++) {
+      {
+	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
+	 float *out = &machine->Inputs[0].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];   /* out[4*n] is component n; assumes xyzw[n].f[] are 4 contiguous floats */
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+
+      {
+	 const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
+	 float *out = &machine->Inputs[1].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+   }
+}
+
+
+
+/* Fast path: two 3-float attributes (w = 1) plus one 2-float attribute (z = 0, w = 1). */
+static void fetch_xyz_rgb_st( struct draw_context *draw,
+			      struct tgsi_exec_machine *machine,
+			      const unsigned *elts,
+			      unsigned count )
+{
+   const unsigned *pitch   = draw->vertex_fetch.pitch;
+   const ubyte **src       = draw->vertex_fetch.src_ptr;
+   int i;
+
+   assert(count <= 4);
+
+   /* loop over the four vertex lanes; all four are always fetched, so
+    * elts[] must hold four valid indices whatever 'count' is */
+
+   for (i = 0; i < 4; i++) {
+      {
+	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
+	 float *out = &machine->Inputs[0].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];   /* out[4*n] is component n; assumes xyzw[n].f[] are 4 contiguous floats */
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+
+      {
+	 const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
+	 float *out = &machine->Inputs[1].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+
+      {
+	 const float *in = (const float *)(src[2] + elts[i] * pitch[2]);
+	 float *out = &machine->Inputs[2].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = 0.0f;    /* only two components are read from the source */
+ 	 out[12] = 1.0f;
+      }
+   }
+}
+
+
+/**
+ * Fetch vertex attributes for 'count' vertices.
+ * Each attribute is converted by its own fetch function, then written
+ * transposed into machine->Inputs[attr]; unused vertex lanes are zeroed.
+ */
+static void generic_vertex_fetch( struct draw_context *draw,
+				  struct tgsi_exec_machine *machine,
+				  const unsigned *elts,
+				  unsigned count )
+{
+   unsigned nr_attrs = draw->vertex_fetch.nr_attrs;
+   unsigned attr;
+
+   assert(count <= 4);
+
+//   debug_printf("%s %d\n", __FUNCTION__, count);
+
+   /* loop over vertex attributes (vertex shader inputs)
+    */
+   for (attr = 0; attr < nr_attrs; attr++) {
+
+      const unsigned pitch   = draw->vertex_fetch.pitch[attr];
+      const ubyte *src = draw->vertex_fetch.src_ptr[attr];
+      const fetch_func fetch = draw->vertex_fetch.fetch[attr];
+      unsigned i;
+      float p[4][4];   /* p[vertex][component] */
+
+
+      /* Fetch this attribute (four floats) for up to four vertices.
+       *
+       * Could fetch directly into AOS format, but this is meant to be
+       * a prototype for an sse implementation, which would have
+       * difficulties doing that.
+       */
+      for (i = 0; i < count; i++) 
+	 fetch( src + elts[i] * pitch, p[i] );
+
+      /* Be nice and zero out any missing vertices: 
+       */
+      for ( ; i < 4; i++) 
+	 p[i][0] = p[i][1] = p[i][2] = p[i][3] = 0;
+      
+      /* Transpose/swizzle into sse-friendly format.  Currently
+       * assuming that all vertex shader inputs are float[4], but this
+       * isn't true -- if the vertex shader only wants tex0.xy, we
+       * could optimize for that.
+       *
+       * To do so fully without codegen would probably require an
+       * excessive number of fetch functions, but we could at least
+       * minimize the transpose step:
+       */
+      transpose_4x4( (float *)&machine->Inputs[attr].xyzw[0].f[0], (float *)p );
+   }
+}
+
+
+/* Recompute each attribute's source pointer, pitch and fetch function, then pick the fetch routine. */
+void draw_update_vertex_fetch( struct draw_context *draw )
+{
+   unsigned nr_attrs, i;
+
+//   debug_printf("%s\n", __FUNCTION__);
+   
+   /* this may happen during context init */
+   if (!draw->vertex_shader)
+      return;
+
+   nr_attrs = draw->vertex_shader->state->num_inputs;
+
+   for (i = 0; i < nr_attrs; i++) {
+      unsigned buf = draw->vertex_element[i].vertex_buffer_index;
+      enum pipe_format format  = draw->vertex_element[i].src_format;
+
+      draw->vertex_fetch.src_ptr[i] = (const ubyte *) draw->user.vbuffer[buf] + 
+						       draw->vertex_buffer[buf].buffer_offset + 
+						       draw->vertex_element[i].src_offset;
+
+      draw->vertex_fetch.pitch[i] = draw->vertex_buffer[buf].pitch;
+      draw->vertex_fetch.fetch[i] = get_fetch_func( format );
+   }
+
+   draw->vertex_fetch.nr_attrs = nr_attrs;
+
+   draw->vertex_fetch.fetch_func = generic_vertex_fetch;   /* default; may be replaced below */
+
+   switch (nr_attrs) {   /* fast paths are chosen from the attribute count and source formats only */
+   case 2:
+      if (draw->vertex_element[0].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
+	  draw->vertex_element[1].src_format == PIPE_FORMAT_R32G32B32_FLOAT)
+	 draw->vertex_fetch.fetch_func = fetch_xyz_rgb;
+      break;
+   case 3:
+      if (draw->vertex_element[0].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
+	  draw->vertex_element[1].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
+	  draw->vertex_element[2].src_format == PIPE_FORMAT_R32G32_FLOAT)
+	 draw->vertex_fetch.fetch_func = fetch_xyz_rgb_st;
+      break;
+   default:
+      break;
+   }
+
+}
diff --git a/src/gallium/auxiliary/draw/draw_vertex_shader.c b/src/gallium/auxiliary/draw/draw_vertex_shader.c
new file mode 100644
index 0000000000..377ecbb931
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vertex_shader.c
@@ -0,0 +1,325 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  *   Brian Paul
+  */
+
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#if defined(__i386__) || defined(__386__)
+#include "tgsi/exec/tgsi_sse2.h"
+#endif
+#include "draw_private.h"
+#include "draw_context.h"
+
+#include "x86/rtasm/x86sse.h"
+#include "llvm/gallivm.h"
+
+
+#define DBG_VS 0
+
+/* Return a bitmask of failed clip tests for 'clip': six hardwired planes, then plane[6..nr-1]. */
+static INLINE unsigned
+compute_clipmask(const float *clip, /*const*/ float plane[][4], unsigned nr)
+{
+   unsigned mask = 0;
+   unsigned i;
+
+   /* Do the hardwired planes first:
+    */
+   if (-clip[0] + clip[3] < 0) mask |= CLIP_RIGHT_BIT;
+   if ( clip[0] + clip[3] < 0) mask |= CLIP_LEFT_BIT;
+   if (-clip[1] + clip[3] < 0) mask |= CLIP_TOP_BIT;
+   if ( clip[1] + clip[3] < 0) mask |= CLIP_BOTTOM_BIT;
+   if (-clip[2] + clip[3] < 0) mask |= CLIP_FAR_BIT;
+   if ( clip[2] + clip[3] < 0) mask |= CLIP_NEAR_BIT;
+
+   /* Followed by any remaining ones:
+    */
+   for (i = 6; i < nr; i++) {
+      if (dot4(clip, plane[i]) < 0) 
+         mask |= (1<<i);   /* bit i corresponds to plane[i] */
+   }
+
+   return mask;
+}
+
+/* Signature of a generated shader function: (inputs, outputs, constants, temporaries). */
+typedef void (XSTDCALL *codegen_function) (
+   const struct tgsi_exec_vector *input,
+   struct tgsi_exec_vector *output,
+   float (*constant)[4],
+   struct tgsi_exec_vector *temporary );
+
+
+/**
+ * Transform vertices with the current vertex program/shader
+ * Up to four vertices can be shaded at a time.
+ * \param draw  the drawing context (shader, constants, clip planes, viewport)
+ * \param elts  indexes of four input vertices
+ * \param count  number of vertices to shade [1..4]
+ * \param vOut  array of pointers to four output vertices
+ */
+static void
+run_vertex_program(struct draw_context *draw,
+                   unsigned elts[4], unsigned count,
+                   struct vertex_header *vOut[])
+{
+   struct tgsi_exec_machine *machine = &draw->machine;
+   unsigned int j;
+
+   ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_ATTRIB_MAX);
+   ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_ATTRIB_MAX);
+   const float *scale = draw->viewport.scale;
+   const float *trans = draw->viewport.translate;
+
+   assert(count <= 4);
+   assert(draw->vertex_shader->state->output_semantic_name[0]
+          == TGSI_SEMANTIC_POSITION);   /* output 0 is treated as the position below */
+
+   /* Consts does not require 16 byte alignment. */
+   machine->Consts = (float (*)[4]) draw->user.constants;
+
+   machine->Inputs = ALIGN16_ASSIGN(inputs);
+   machine->Outputs = ALIGN16_ASSIGN(outputs);
+
+   draw->vertex_fetch.fetch_func( draw, machine, elts, count );
+
+   /* run shader: LLVM build, else SSE2 codegen on x86, else the TGSI interpreter */
+#ifdef MESA_LLVM
+   if (1) {
+   struct gallivm_prog  *prog  = draw->vertex_shader->llvm_prog;
+   gallivm_cpu_vs_exec(prog,
+                       machine->Inputs,
+                       machine->Outputs,
+                       machine->Consts,
+                       machine->Temps);
+   } else
+#elif defined(__i386__) || defined(__386__)
+   if (draw->use_sse) {
+      /* SSE */
+      /* cast away const */
+      struct draw_vertex_shader *shader
+         = (struct draw_vertex_shader *)draw->vertex_shader;
+      codegen_function func
+         = (codegen_function) x86_get_func( &shader->sse2_program );
+
+      if (func)
+         func(
+            machine->Inputs,
+            machine->Outputs,
+            machine->Consts,
+            machine->Temps );
+      else
+         /* interpreter */
+         tgsi_exec_machine_run( machine );
+   }
+   else
+#endif
+   {
+      /* interpreter */
+      tgsi_exec_machine_run( machine );
+   }
+
+   /* store machine results */
+   for (j = 0; j < count; j++) {
+      unsigned slot;
+      float x, y, z, w;
+
+      /* Handle attr[0] (position) specially:
+       *
+       * XXX: Computing the clipmask should be done in the vertex
+       * program as a set of DP4 instructions appended to the
+       * user-provided code.
+       */
+      x = vOut[j]->clip[0] = machine->Outputs[0].xyzw[0].f[j];
+      y = vOut[j]->clip[1] = machine->Outputs[0].xyzw[1].f[j];
+      z = vOut[j]->clip[2] = machine->Outputs[0].xyzw[2].f[j];
+      w = vOut[j]->clip[3] = machine->Outputs[0].xyzw[3].f[j];
+
+      vOut[j]->clipmask = compute_clipmask(vOut[j]->clip, draw->plane, draw->nr_planes);
+      vOut[j]->edgeflag = 1;
+
+      /* divide by w */
+      w = 1.0f / w;   /* from here on 'w' holds 1/w; it is stored in data[0][3] */
+      x *= w;
+      y *= w;
+      z *= w;
+
+      /* Viewport mapping */
+      vOut[j]->data[0][0] = x * scale[0] + trans[0];
+      vOut[j]->data[0][1] = y * scale[1] + trans[1];
+      vOut[j]->data[0][2] = z * scale[2] + trans[2];
+      vOut[j]->data[0][3] = w;
+
+#if DBG_VS
+      debug_printf("output[%d]win: %f %f %f %f\n", j,
+             vOut[j]->data[0][0],
+             vOut[j]->data[0][1],
+             vOut[j]->data[0][2],
+             vOut[j]->data[0][3]);
+#endif
+      /* Remaining attributes are packed into sequential post-transform
+       * vertex attrib slots.
+       */
+      for (slot = 1; slot < draw->num_vs_outputs; slot++) {
+         vOut[j]->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
+         vOut[j]->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
+         vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
+         vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
+#if DBG_VS
+         debug_printf("output[%d][%d]: %f %f %f %f\n", j, slot,
+                vOut[j]->data[slot][0],
+                vOut[j]->data[slot][1],
+                vOut[j]->data[slot][2],
+                vOut[j]->data[slot][3]);
+#endif
+      }
+   } /* loop over vertices */
+}
+
+
+/**
+ * Run the vertex shader on all vertices in the vertex queue.
+ * Called by the draw module when the vertex cache needs to be flushed.
+ */
+void
+draw_vertex_shader_queue_flush(struct draw_context *draw)
+{
+   unsigned i;
+
+   assert(draw->vs.queue_nr != 0);
+
+   /* XXX: do this on statechange: 
+    */
+   draw_update_vertex_fetch( draw );
+
+//   fprintf(stderr, " q(%d) ", draw->vs.queue_nr );
+
+   /* run vertex shader on vertex cache entries, four per invocation */
+   for (i = 0; i < draw->vs.queue_nr; i += 4) {
+      struct vertex_header *dests[4];
+      unsigned elts[4];
+      int j, n = MIN2(4, draw->vs.queue_nr - i);   /* entries in this batch */
+
+      for (j = 0; j < n; j++) {
+         elts[j] = draw->vs.queue[i + j].elt;
+         dests[j] = draw->vs.queue[i + j].dest;
+      }
+
+      for ( ; j < 4; j++) {   /* pad unused lanes with entry 0 so all four are valid */
+	 elts[j] = elts[0];
+	 dests[j] = dests[0];
+      }
+
+      assert(n > 0);
+      assert(n <= 4);
+
+      run_vertex_program(draw, elts, n, dests);
+   }
+
+   draw->vs.queue_nr = 0;
+}
+
+/* Allocate a draw_vertex_shader for 'shader' and compile it (LLVM or SSE2) where enabled; NULL on allocation failure. */
+struct draw_vertex_shader *
+draw_create_vertex_shader(struct draw_context *draw,
+                          const struct pipe_shader_state *shader)
+{
+   struct draw_vertex_shader *vs;
+
+   vs = CALLOC_STRUCT( draw_vertex_shader );
+   if (vs == NULL) {
+      return NULL;
+   }
+
+   vs->state = shader;   /* the pointer is stored, not a copy of the state */
+
+#ifdef MESA_LLVM
+   struct gallivm_ir *ir = gallivm_ir_new(GALLIVM_VS);
+   gallivm_ir_set_layout(ir, GALLIVM_SOA);
+   gallivm_ir_set_components(ir, 4);
+   gallivm_ir_fill_from_tgsi(ir, shader->tokens);
+   vs->llvm_prog = gallivm_ir_compile(ir);
+   gallivm_ir_delete(ir);
+
+   draw->engine = gallivm_global_cpu_engine();
+   if (!draw->engine) {
+      draw->engine = gallivm_cpu_engine_create(vs->llvm_prog);
+   }
+   else {
+      gallivm_cpu_jit_compile(draw->engine, vs->llvm_prog);
+   }
+#elif defined(__i386__) || defined(__386__)
+   if (draw->use_sse) {
+      /* cast-away const */
+      struct pipe_shader_state *sh = (struct pipe_shader_state *) shader;
+
+      x86_init_func( &vs->sse2_program );
+      if (!tgsi_emit_sse2( (struct tgsi_token *) sh->tokens,
+                           &vs->sse2_program )) {
+         x86_release_func( (struct x86_function *) &vs->sse2_program ); /* NOTE(review): draw_delete_vertex_shader() releases this again -- confirm that is safe */
+	 fprintf(stdout /*err*/,
+		 "tgsi_emit_sse2() failed, falling back to interpreter\n");
+      }
+   }
+#endif
+
+   return vs;
+}
+
+/* Make 'dvs' the current vertex shader (flushing first); 'dvs' is dereferenced, so it must be non-NULL. */
+void
+draw_bind_vertex_shader(struct draw_context *draw,
+                        struct draw_vertex_shader *dvs)
+{
+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+
+   draw->vertex_shader = dvs;
+   draw->num_vs_outputs = dvs->state->num_outputs;
+
+   /* specify the vertex program to interpret/execute */
+   tgsi_exec_machine_init(&draw->machine,
+                          draw->vertex_shader->state->tokens,
+                          PIPE_MAX_SAMPLERS,
+                          NULL /*samplers*/ );
+}
+
+/* Release the SSE2 program (x86 builds only) and free 'dvs'; dvs->state is not freed here. */
+void
+draw_delete_vertex_shader(struct draw_context *draw,
+                          struct draw_vertex_shader *dvs)
+{
+#if defined(__i386__) || defined(__386__)
+   x86_release_func( (struct x86_function *) &dvs->sse2_program );
+#endif
+
+   FREE( dvs );
+}
diff --git a/src/gallium/auxiliary/draw/draw_vf.c b/src/gallium/auxiliary/draw/draw_vf.c
new file mode 100644
index 0000000000..dc3a5ecd21
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vf.c
@@ -0,0 +1,428 @@
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+
+#include <stddef.h>
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_util.h"
+
+#include "draw_vf.h"
+
+
+#define DRAW_VF_DBG 0
+
+
+/* TODO: remove this */
+extern void 
+_mesa_exec_free( void *addr );
+
+
+/* Test whether the current attribute layout of 'vf' is the one that
+ * fastpath 'fp' was registered for. */
+static boolean match_fastpath( struct draw_vertex_fetch *vf,
+                               const struct draw_vf_fastpath *fp)
+{
+   const unsigned nr = vf->attr_count;
+   unsigned i;
+
+   if (nr != fp->attr_count)
+      return FALSE;
+
+   /* Strides are only compared for fastpaths that asked for it. */
+   if (fp->match_strides && vf->vertex_stride != fp->vertex_stride)
+      return FALSE;
+
+   for (i = 0; i < nr; i++) {
+      if (vf->attr[i].format != fp->attr[i].format ||
+          vf->attr[i].inputsize != fp->attr[i].size ||
+          vf->attr[i].vertoffset != fp->attr[i].offset ||
+          (fp->match_strides && vf->attr[i].inputstride != fp->attr[i].stride))
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
+/* Walk vf's fastpath list; on the first match install its emit function. */
+static boolean search_fastpath_emit( struct draw_vertex_fetch *vf )
+{
+   const struct draw_vf_fastpath *entry;
+
+   for (entry = vf->fastpath; entry != NULL; entry = entry->next) {
+      if (!match_fastpath(vf, entry))
+         continue;
+      vf->emit = entry->func;   /* may be NULL for a known-bad layout */
+      return TRUE;
+   }
+   return FALSE;
+}
+
+/* Cache vf->emit as the fastpath for vf's current layout (no-op on OOM). */
+void draw_vf_register_fastpath( struct draw_vertex_fetch *vf, boolean match_strides )
+{
+   struct draw_vf_fastpath *fastpath = CALLOC_STRUCT(draw_vf_fastpath);
+   unsigned i;
+   if (!fastpath)
+      return;
+   fastpath->attr = (struct draw_vf_attr_type *)
+      MALLOC(vf->attr_count * sizeof(fastpath->attr[0]));
+   if (!fastpath->attr && vf->attr_count) {
+      FREE(fastpath);   /* a zero-sized table may legitimately be NULL */
+      return;
+   }
+   fastpath->vertex_stride = vf->vertex_stride;
+   fastpath->attr_count = vf->attr_count;
+   fastpath->match_strides = match_strides;
+   fastpath->func = vf->emit;
+   for (i = 0; i < vf->attr_count; i++) {
+      fastpath->attr[i].format = vf->attr[i].format;
+      fastpath->attr[i].stride = vf->attr[i].inputstride;
+      fastpath->attr[i].size = vf->attr[i].inputsize;
+      fastpath->attr[i].offset = vf->attr[i].vertoffset;
+   }
+   fastpath->next = vf->fastpath;   /* push onto the head of the list */
+   vf->fastpath = fastpath;
+}
+
+/***********************************************************************
+ * Build codegen functions or return generic ones:
+ */
+static void choose_emit_func( struct draw_vertex_fetch *vf,
+                              unsigned count,
+                              uint8_t *dest)
+{
+   /* This function is what vf->emit points at until a real emit
+    * function has been selected, so clear it before choosing.
+    */
+   vf->emit = NULL;
+
+   /* A fastpath hit (hardwired, codegen or known-bad) is final, even
+    * when the cached function is NULL: that marks a state for which
+    * codegen is already known to fail, so it is not tried again.
+    * Only on a miss does the code generator, if any, get a chance.
+    */
+   if (!search_fastpath_emit(vf) && vf->codegen_emit)
+      vf->codegen_emit( vf );
+
+   if (!vf->emit) {
+      /* Next try the hardwired emit functions... */
+      draw_vf_generate_hardwired_emit(vf);
+
+      /* ...and otherwise use the generic version. */
+      if (!vf->emit)
+         vf->emit = draw_vf_generic_emit;
+   }
+
+   /* Emit the vertices of the call that triggered the selection; later
+    * calls through vf->emit reach the selected function directly. */
+   vf->emit( vf, count, dest );
+}
+
+
+
+
+
+/***********************************************************************
+ * Public entrypoints, mostly dispatch to the above:
+ */
+
+
+
+/* Build vf->attr[] and the output vertex layout from 'map'.  Returns the
+ * vertex stride in bytes: 'vertex_stride' if non-zero, else the packed size.
+ */
+static unsigned
+draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
+                               const struct draw_vf_attr_map *map,
+                               unsigned nr,
+                               unsigned vertex_stride )
+{
+   unsigned used = 0;   /* bytes of the output vertex laid out so far */
+   unsigned out = 0;    /* next free slot in vf->attr[] */
+   unsigned i;
+
+   assert(nr < PIPE_ATTRIB_MAX);
+
+   for (i = 0; i < nr; i++) {
+      const unsigned format = map[i].format;
+      const struct draw_vf_format_info *info;
+      struct draw_vf_attr *a;
+
+      if (format == DRAW_EMIT_PAD) {
+#if (DRAW_VF_DBG)
+         debug_printf("%d: pad %d, offset %d\n", i, map[i].offset, used);
+#endif
+         /* Padding only leaves a hole; it takes no vf->attr[] slot. */
+         used += map[i].offset;
+         continue;
+      }
+      info = &draw_vf_format_info[format];
+      a = &vf->attr[out++];
+      a->attrib = map[i].attrib;
+      a->format = format;
+      a->insert = info->insert;
+      a->vertattrsize = info->attrsize;
+      a->vertoffset = used;
+      a->isconst = info->isconst;
+      if (a->isconst)
+         memcpy(a->data, &map[i].data, a->vertattrsize);
+#if (DRAW_VF_DBG)
+      debug_printf("%d: %s, offset %d\n", i, info->name, a->vertoffset);
+#endif
+      used += info->attrsize;
+   }
+
+   vf->attr_count = out;
+   vf->vertex_stride = vertex_stride ? vertex_stride : used;
+   vf->emit = choose_emit_func;   /* real emit function is chosen lazily */
+   assert(vf->vertex_stride >= used);
+   return vf->vertex_stride;
+}
+
+
+/**
+ * Program 'vf' to emit vertices in the layout described by 'vinfo';
+ * EMIT_1F_PSIZE attributes get the constant 'point_size'.
+ */
+void draw_vf_set_vertex_info( struct draw_vertex_fetch *vf,
+                              const struct vertex_info *vinfo,
+                              float point_size )
+{
+   unsigned i, j, k;
+   struct draw_vf_attr *a = vf->attr;
+   struct draw_vf_attr_map attrs[PIPE_MAX_SHADER_INPUTS];
+   const unsigned max_attrs = sizeof(attrs) / sizeof(attrs[0]);
+   unsigned count = 0;  /* for debug/sanity */
+   unsigned nr_attrs = 0;
+
+   for (i = 0; i < vinfo->num_attribs; i++) {
+      j = vinfo->src_index[i];
+      /* every emit mode except EMIT_OMIT appends to attrs[] */
+      assert(nr_attrs < max_attrs || vinfo->emit[i] == EMIT_OMIT);
+      switch (vinfo->emit[i]) {
+      case EMIT_OMIT:   /* no-op */
+         break;
+      case EMIT_ALL: {
+         /* just copy the whole vertex as-is to the vbuf */
+         unsigned s = vinfo->size;
+         assert(i == 0);
+         assert(j == 0);
+         /* XXX: we actually don't copy the vertex header, just pad it */
+         attrs[nr_attrs].attrib = 0;
+         attrs[nr_attrs].format = DRAW_EMIT_PAD;
+         attrs[nr_attrs].offset = offsetof(struct vertex_header, data);
+         s -= offsetof(struct vertex_header, data)/4;
+         count += offsetof(struct vertex_header, data)/4;
+         nr_attrs++;
+         /* copy the vertex data: one entry per vec4, plus one for a tail */
+         assert(nr_attrs + (s + 3)/4 <= max_attrs);
+         for(k = 0; k < (s & ~0x3); k += 4) {
+            attrs[nr_attrs].attrib = k/4;
+            attrs[nr_attrs].format = DRAW_EMIT_4F;
+            attrs[nr_attrs].offset = 0;
+            nr_attrs++;
+            count += 4;
+         }
+         /* tail: attrs[nr_attrs] is only written when 1-3 floats remain;
+          * without a tail that slot may lie past the end of attrs[]. */
+         /* XXX: actually, this shouldn't be needed */
+         if (s & 0x3) {
+            attrs[nr_attrs].attrib = k/4;
+            attrs[nr_attrs].offset = 0;
+            switch(s & 0x3) {
+            case 1:
+               attrs[nr_attrs].format = DRAW_EMIT_1F;
+               break;
+            case 2:
+               attrs[nr_attrs].format = DRAW_EMIT_2F;
+               break;
+            default:   /* 3 */
+               attrs[nr_attrs].format = DRAW_EMIT_3F;
+               break;
+            }
+            nr_attrs++;
+            count += s & 0x3;
+         }
+         break;
+      }
+      case EMIT_1F:
+         attrs[nr_attrs].attrib = j;
+         attrs[nr_attrs].format = DRAW_EMIT_1F;
+         attrs[nr_attrs].offset = 0;
+         nr_attrs++;
+         count++;
+         break;
+      case EMIT_1F_PSIZE:
+         attrs[nr_attrs].attrib = j;
+         attrs[nr_attrs].format = DRAW_EMIT_1F_CONST;
+         attrs[nr_attrs].offset = 0;
+         attrs[nr_attrs].data.f[0] = point_size;
+         nr_attrs++;
+         count++;
+         break;
+      case EMIT_2F:
+         attrs[nr_attrs].attrib = j;
+         attrs[nr_attrs].format = DRAW_EMIT_2F;
+         attrs[nr_attrs].offset = 0;
+         nr_attrs++;
+         count += 2;
+         break;
+      case EMIT_3F:
+         attrs[nr_attrs].attrib = j;
+         attrs[nr_attrs].format = DRAW_EMIT_3F;
+         attrs[nr_attrs].offset = 0;
+         nr_attrs++;
+         count += 3;
+         break;
+      case EMIT_4F:
+         attrs[nr_attrs].attrib = j;
+         attrs[nr_attrs].format = DRAW_EMIT_4F;
+         attrs[nr_attrs].offset = 0;
+         nr_attrs++;
+         count += 4;
+         break;
+      case EMIT_4UB:
+         attrs[nr_attrs].attrib = j;
+         attrs[nr_attrs].format = DRAW_EMIT_4UB_4F_BGRA;
+         attrs[nr_attrs].offset = 0;
+         nr_attrs++;
+         count += 1;
+         break;
+      default:
+         assert(0);
+      }
+   }
+
+   assert(count == vinfo->size);
+   draw_vf_set_vertex_attributes(vf, attrs, nr_attrs,
+                                 vinfo->size * sizeof(float) );
+
+   for (j = 0; j < vf->attr_count; j++) {
+      a[j].inputsize = 4;   /* always use the 4-component insert function */
+      a[j].do_insert = a[j].insert[4 - 1];
+      if(a[j].isconst) {
+         a[j].inputptr = a[j].data;
+         a[j].inputstride = 0;
+      }
+   }
+}
+
+
+#if 0
+/* Set attribute pointers, adjusted for start position:
+ */
+void draw_vf_set_sources( struct draw_vertex_fetch *vf,
+		     GLvector4f * const sources[],
+		     unsigned start )
+{
+   struct draw_vf_attr *a = vf->attr;
+   unsigned j;
+   
+   for (j = 0; j < vf->attr_count; j++) {
+      const GLvector4f *vptr = sources[a[j].attrib];
+      
+      if ((a[j].inputstride != vptr->stride) ||
+	  (a[j].inputsize != vptr->size))
+	 vf->emit = choose_emit_func;
+      
+      a[j].inputstride = vptr->stride;
+      a[j].inputsize = vptr->size;
+      a[j].do_insert = a[j].insert[vptr->size - 1]; 
+      a[j].inputptr = ((uint8_t *)vptr->data) + start * vptr->stride;
+   }
+}
+#endif
+
+
+/**
+ * Emit a single vertex to dest with the current emit function.
+ */
+void draw_vf_emit_vertex( struct draw_vertex_fetch *vf,
+                          struct vertex_header *vertex,
+                          void *dest )
+{
+   struct draw_vf_attr *attr = vf->attr;
+   struct draw_vf_attr *const end = attr + vf->attr_count;
+
+   /* Constant attributes read from their own data[]; leave them alone. */
+   for ( ; attr != end; attr++) {
+      if (attr->isconst)
+         continue;
+      attr->inputptr = (uint8_t *)&vertex->data[attr->attrib][0];
+      attr->inputstride = 0; /* XXX: one-vertex-max ATM */
+   }
+   vf->emit( vf, 1, (uint8_t*) dest );
+}
+
+
+
+/** Create a vertex fetch object.  Returns NULL if allocation fails. */
+struct draw_vertex_fetch *draw_vf_create( void )
+{
+   struct draw_vertex_fetch *vf = CALLOC_STRUCT(draw_vertex_fetch);
+   unsigned i;
+
+   if (!vf)
+      return NULL;
+   for (i = 0; i < PIPE_ATTRIB_MAX; i++)
+      vf->attr[i].vf = vf;   /* back pointer from each attribute */
+
+   vf->identity[0] = 0.0;
+   vf->identity[1] = 0.0;
+   vf->identity[2] = 0.0;
+   vf->identity[3] = 1.0;
+   vf->codegen_emit = NULL;
+#ifdef USE_SSE_ASM
+   if (!GETENV("GALLIUM_NO_CODEGEN"))
+      vf->codegen_emit = draw_vf_generate_sse_emit;
+#endif
+   return vf;
+}
+
+
+/** Free 'vf' and its registered fastpaths; a NULL 'vf' is ignored. */
+void draw_vf_destroy( struct draw_vertex_fetch *vf )
+{
+   struct draw_vf_fastpath *fp, *tmp;
+
+   if (!vf)
+      return;
+   for (fp = vf->fastpath ; fp ; fp = tmp) {
+      tmp = fp->next;
+      FREE(fp->attr);
+      /* KW: At the moment, fp->func is constrained to be allocated by
+       * _mesa_exec_alloc(), as the hardwired fastpaths in
+       * draw_vf_generic.c are handled specially.  It would be nice to
+       * unify them.  XXX: fp->func is currently leaked: the matching
+       * _mesa_exec_free((void *) fp->func) call is disabled.
+       */
+      FREE(fp);
+   }
+   vf->fastpath = NULL;
+   FREE(vf);
+}
diff --git a/src/gallium/auxiliary/draw/draw_vf.h b/src/gallium/auxiliary/draw/draw_vf.h
new file mode 100644
index 0000000000..011c8f0ff1
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vf.h
@@ -0,0 +1,236 @@
+/*
+ * Copyright 2008 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+/**
+ * Vertex fetch/store/convert code.  This functionality is used in two places:
+ * 1. Vertex fetch/convert - to grab vertex data from incoming vertex
+ *    arrays and convert to format needed by vertex shaders.
+ * 2. Vertex store/emit - to convert simple float[][4] vertex attributes
+ *    (which is the organization used throughout the draw/prim pipeline) to
+ *    hardware-specific formats and emit into hardware vertex buffers.
+ *
+ *
+ * Authors:
+ *    Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+#ifndef DRAW_VF_H
+#define DRAW_VF_H
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+
+#include "draw_vertex.h"
+#include "draw_private.h" /* for vertex_header */
+
+
+enum draw_vf_attr_format {
+   DRAW_EMIT_1F,			/**< 1 float */
+   DRAW_EMIT_2F,			/**< 2 floats */
+   DRAW_EMIT_3F,			/**< 3 floats */
+   DRAW_EMIT_4F,			/**< 4 floats */
+   DRAW_EMIT_3F_XYW,			/**< for projective texture */
+   DRAW_EMIT_1UB_1F,			/**< for fog coordinate */
+   DRAW_EMIT_3UB_3F_RGB,		/**< for specular color */
+   DRAW_EMIT_3UB_3F_BGR,		/**< for specular color */
+   DRAW_EMIT_4UB_4F_RGBA,		/**< for color */
+   DRAW_EMIT_4UB_4F_BGRA,		/**< for color */
+   DRAW_EMIT_4UB_4F_ARGB,		/**< for color */
+   DRAW_EMIT_4UB_4F_ABGR,		/**< for color */
+   DRAW_EMIT_1F_CONST,			/**< 1 float of constant data */
+   DRAW_EMIT_2F_CONST,			/**< 2 floats of constant data */
+   DRAW_EMIT_3F_CONST,			/**< 3 floats of constant data */
+   DRAW_EMIT_4F_CONST,			/**< 4 floats of constant data */
+   DRAW_EMIT_PAD,			/**< leave a hole of 'offset' bytes */
+   DRAW_EMIT_MAX			/**< number of formats above */
+};
+
+struct draw_vf_attr_map 
+{
+   /** Input attribute number */
+   unsigned attrib;
+   /** Output format, or DRAW_EMIT_PAD for a hole in the vertex */
+   enum draw_vf_attr_format format;
+   /** Size of the hole in bytes; only read for DRAW_EMIT_PAD */
+   unsigned offset;
+   
+   /** 
+    * Constant data for DRAW_EMIT_*_CONST 
+    */
+   union {
+      uint8_t ub[4];
+      float f[4];
+   } data;
+};
+
+struct draw_vertex_fetch;
+
+
+
+#if 0
+unsigned 
+draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
+                               const struct draw_vf_attr_map *map,
+                               unsigned nr, 
+                               unsigned vertex_stride );
+#endif
+
+void draw_vf_set_vertex_info( struct draw_vertex_fetch *vf, 
+                              const struct vertex_info *vinfo,
+                              float point_size );
+
+#if 0
+void 
+draw_vf_set_sources( struct draw_vertex_fetch *vf,
+		     GLvector4f * const attrib[],
+		     unsigned start );
+#endif
+
+void 
+draw_vf_emit_vertex( struct draw_vertex_fetch *vf,
+                     struct vertex_header *vertex,
+                     void *dest );
+
+struct draw_vertex_fetch *
+draw_vf_create( void );
+
+void 
+draw_vf_destroy( struct draw_vertex_fetch *vf );
+
+
+
+/***********************************************************************
+ * Internal functions and structs:
+ */
+
+struct draw_vf_attr;
+
+typedef void (*draw_vf_extract_func)( const struct draw_vf_attr *a, 
+				      float *out, 
+				      const uint8_t *v );
+
+typedef void (*draw_vf_insert_func)( const struct draw_vf_attr *a, 
+				     uint8_t *v, 
+				     const float *in );
+
+typedef void (*draw_vf_emit_func)( struct draw_vertex_fetch *vf,
+      				   unsigned count, 
+      				   uint8_t *dest );
+
+
+
+/**
+ * Describes how to convert/move a vertex attribute from a vertex
+ * array to a vertex structure.
+ */
+struct draw_vf_attr
+{
+   struct draw_vertex_fetch *vf;  /**< owning vertex fetch object */
+
+   unsigned format;          /**< an enum draw_vf_attr_format value */
+   unsigned inputsize;       /**< input components; selects insert[inputsize - 1] */
+   unsigned inputstride;     /**< bytes added to inputptr after each vertex */
+   unsigned vertoffset;      /**< position of the attrib in the vertex struct */
+   
+   boolean isconst;              /**< read from const data below */
+   uint8_t data[16];
+
+   unsigned attrib;          /**< which vertex attrib (0=position, etc) */
+   unsigned vertattrsize;    /**< size of the attribute in bytes */
+
+   uint8_t *inputptr;        /**< where the next vertex's input is read from */
+   const draw_vf_insert_func *insert;  /**< insert functions of 'format' */
+   draw_vf_insert_func do_insert;      /**< the insert function in use */
+   draw_vf_extract_func extract;
+};
+
+struct draw_vertex_fetch
+{
+   struct draw_vf_attr attr[PIPE_ATTRIB_MAX];
+   unsigned attr_count;      /**< entries of attr[] in use */
+   unsigned vertex_stride;   /**< bytes between emitted vertices */
+
+   draw_vf_emit_func emit;   /**< current emit function */
+
+   /* Parameters and constants for codegen:
+    */
+   float identity[4];        /**< set to { 0, 0, 0, 1 } by draw_vf_create() */
+
+   struct draw_vf_fastpath *fastpath;   /**< list of registered fastpaths */
+   
+   void (*codegen_emit)( struct draw_vertex_fetch *vf );  /**< may be NULL */
+};
+
+
+struct draw_vf_attr_type {
+   unsigned format;   /* copy of draw_vf_attr::format */
+   unsigned size;     /* copy of draw_vf_attr::inputsize */
+   unsigned stride;   /* copy of draw_vf_attr::inputstride */
+   unsigned offset;   /* copy of draw_vf_attr::vertoffset */
+};
+
+/** XXX this could be moved into draw_vf.c */
+struct draw_vf_fastpath {
+   unsigned vertex_stride;
+   unsigned attr_count;
+   boolean match_strides;            /**< also compare strides when matching */
+
+   struct draw_vf_attr_type *attr;   /**< attr_count entries */
+
+   draw_vf_emit_func func;           /**< emit function to use on a match */
+   struct draw_vf_fastpath *next;
+};
+
+
+void 
+draw_vf_register_fastpath( struct draw_vertex_fetch *vtx,
+                           boolean match_strides );
+
+void 
+draw_vf_generic_emit( struct draw_vertex_fetch *vf,
+                      unsigned count,
+                      uint8_t *v );
+
+void 
+draw_vf_generate_hardwired_emit( struct draw_vertex_fetch *vf );
+
+void 
+draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf );
+
+
+/** XXX this type and function could probably be moved into draw_vf.c */
+struct draw_vf_format_info {
+   const char *name;                /**< used in debug output */
+   draw_vf_insert_func insert[4];   /**< indexed by input size - 1 */
+   const unsigned attrsize;         /**< output size in bytes */
+   const boolean isconst;           /**< TRUE for the *_CONST formats */
+};
+
+extern const struct draw_vf_format_info 
+draw_vf_format_info[DRAW_EMIT_MAX];
+
+
+#endif
diff --git a/src/gallium/auxiliary/draw/draw_vf_generic.c b/src/gallium/auxiliary/draw/draw_vf_generic.c
new file mode 100644
index 0000000000..7a60a9db9c
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vf_generic.c
@@ -0,0 +1,585 @@
+
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_debug.h"
+#include "pipe/p_util.h"
+
+#include "draw_vf.h"
+
+
+
+/* 4F output, 4-component input: copy x, y, z, w. */
+static INLINE void insert_4f_4( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;
+   dst[0] = in[0];
+   dst[1] = in[1];
+   dst[2] = in[2];
+   dst[3] = in[3];
+}
+
+/* 4F output, 3-component input: w defaults to 1. */
+static INLINE void insert_4f_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;
+   dst[0] = in[0];
+   dst[1] = in[1];
+   dst[2] = in[2];
+   dst[3] = 1;
+}
+
+/* 4F output, 2-component input: z defaults to 0, w to 1. */
+static INLINE void insert_4f_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;
+   dst[0] = in[0];
+   dst[1] = in[1];
+   dst[2] = 0;
+   dst[3] = 1;
+}
+
+/* 4F output, 1-component input: y and z default to 0, w to 1. */
+static INLINE void insert_4f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;
+   dst[0] = in[0];
+   dst[1] = 0;
+   dst[2] = 0;
+   dst[3] = 1;
+}
+
+/* 3F_XYW output, 4-component input: copy x, y and w, dropping z. */
+static INLINE void insert_3f_xyw_4( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;
+   dst[0] = in[0];
+   dst[1] = in[1];
+   dst[2] = in[3];
+}
+
+static INLINE void insert_3f_xyw_err( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) in; (void) v; (void) a;
+   assert(0);   /* 3F_XYW is only provided for 4-component input */
+}
+
+/* 3F output, 3- or 4-component input: copy x, y, z. */
+static INLINE void insert_3f_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;
+   dst[0] = in[0];
+   dst[1] = in[1];
+   dst[2] = in[2];
+}
+
+/* 3F output, 2-component input: z defaults to 0. */
+static INLINE void insert_3f_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;
+   dst[0] = in[0];
+   dst[1] = in[1];
+   dst[2] = 0;
+}
+
+/* 3F output, 1-component input: y and z default to 0. */
+static INLINE void insert_3f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;
+   dst[0] = in[0];
+   dst[1] = 0;
+   dst[2] = 0;
+}
+
+
+/* 2F output, input of 2 or more components: copy x, y. */
+static INLINE void insert_2f_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;
+   dst[0] = in[0];
+   dst[1] = in[1];
+}
+
+/* 2F output, 1-component input: y defaults to 0. */
+static INLINE void insert_2f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;
+   dst[0] = in[0];
+   dst[1] = 0;
+}
+
+/* 1F output: copy x, whatever the input size. */
+static INLINE void insert_1f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;
+   dst[0] = in[0];
+}
+
+static INLINE void insert_null( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) in; (void) v; (void) a;   /* deliberately writes nothing */
+}
+
+/* 4 ubytes in R,G,B,A byte order from a 4-component input. */
+static INLINE void insert_4ub_4f_rgba_4( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[3]);
+}
+
+/* R,G,B,A from a 3-component input: the alpha byte defaults to 0xff. */
+static INLINE void insert_4ub_4f_rgba_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[2]);
+   v[3] = 0xff;
+}
+
+/* R,G,B,A from a 2-component input: blue defaults to 0, alpha to 0xff. */
+static INLINE void insert_4ub_4f_rgba_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[2] = 0;
+   v[3] = 0xff;
+}
+
+/* R,G,B,A from a 1-component input: green, blue default to 0, alpha to 0xff. */
+static INLINE void insert_4ub_4f_rgba_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   v[1] = 0;
+   v[2] = 0;
+   v[3] = 0xff;
+}
+
+/* 4 ubytes in B,G,R,A byte order from a 4-component (r,g,b,a) input. */
+static INLINE void insert_4ub_4f_bgra_4( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[3]);
+}
+
+/* B,G,R,A from a 3-component input: the alpha byte defaults to 0xff. */
+static INLINE void insert_4ub_4f_bgra_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);
+   v[3] = 0xff;
+}
+
+/* B,G,R,A from a 2-component input: blue defaults to 0, alpha to 0xff. */
+static INLINE void insert_4ub_4f_bgra_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[0] = 0;
+   v[3] = 0xff;
+}
+
+/* B,G,R,A from a 1-component input: green, blue default to 0, alpha to 0xff. */
+static INLINE void insert_4ub_4f_bgra_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   v[1] = 0;
+   v[0] = 0;
+   v[3] = 0xff;
+}
+
+/* 4 ubytes in A,R,G,B byte order from a 4-component (r,g,b,a) input. */
+static INLINE void insert_4ub_4f_argb_4( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[3]);
+}
+
+/* A,R,G,B from a 3-component input: the alpha byte defaults to 0xff. */
+static INLINE void insert_4ub_4f_argb_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[2]);
+   v[0] = 0xff;
+}
+
+/* A,R,G,B from a 2-component input: blue defaults to 0, alpha to 0xff. */
+static INLINE void insert_4ub_4f_argb_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   v[3] = 0x00;
+   v[0] = 0xff;
+}
+
+/* A,R,G,B from a 1-component input: green, blue default to 0, alpha to 0xff. */
+static INLINE void insert_4ub_4f_argb_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   v[2] = 0x00;
+   v[3] = 0x00;
+   v[0] = 0xff;
+}
+
+/* 4 ubytes in A,B,G,R byte order from a 4-component (r,g,b,a) input. */
+static INLINE void insert_4ub_4f_abgr_4( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[3]);
+}
+
+/* A,B,G,R from a 3-component input: the alpha byte defaults to 0xff. */
+static INLINE void insert_4ub_4f_abgr_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[2]);
+   v[0] = 0xff;
+}
+
+/* A,B,G,R from a 2-component input: blue defaults to 0, alpha to 0xff. */
+static INLINE void insert_4ub_4f_abgr_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   v[1] = 0x00;
+   v[0] = 0xff;
+}
+
+/* A,B,G,R from a 1-component input: green, blue default to 0, alpha to 0xff. */
+static INLINE void insert_4ub_4f_abgr_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   v[2] = 0x00;
+   v[1] = 0x00;
+   v[0] = 0xff;
+}
+
+/* 3 ubytes in R,G,B byte order from an input of 3 or more components. */
+static INLINE void insert_3ub_3f_rgb_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[2]);
+}
+
+/* R,G,B from a 2-component input: the blue byte defaults to 0. */
+static INLINE void insert_3ub_3f_rgb_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[2] = 0;
+}
+
+/* R,G,B from a 1-component input: green and blue default to 0. */
+static INLINE void insert_3ub_3f_rgb_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   v[1] = 0;
+   v[2] = 0;
+}
+
+/* 3 ubytes in B,G,R byte order from an (r,g,b) input of 3 or more components. */
+static INLINE void insert_3ub_3f_bgr_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);
+}
+
+/* B,G,R from a 2-component input: the blue byte defaults to 0. */
+static INLINE void insert_3ub_3f_bgr_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[0] = 0;
+}
+
+/* B,G,R from a 1-component input: green and blue default to 0. */
+static INLINE void insert_3ub_3f_bgr_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   v[1] = 0;
+   v[0] = 0;
+}
+
+
+/* 1 ubyte from the first input component, whatever the input size. */
+static INLINE void insert_1ub_1f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+}
+
+
+const struct draw_vf_format_info draw_vf_format_info[DRAW_EMIT_MAX] = 
+{  /* one entry per enum draw_vf_attr_format value, in enum order */
+   { "1f",
+     { insert_1f_1, insert_1f_1, insert_1f_1, insert_1f_1 },
+     sizeof(float), FALSE },
+
+   { "2f",
+     { insert_2f_1, insert_2f_2, insert_2f_2, insert_2f_2 },
+     2 * sizeof(float), FALSE },
+
+   { "3f",
+     { insert_3f_1, insert_3f_2, insert_3f_3, insert_3f_3 },
+     3 * sizeof(float), FALSE },
+
+   { "4f",
+     { insert_4f_1, insert_4f_2, insert_4f_3, insert_4f_4 },
+     4 * sizeof(float), FALSE },
+
+   { "3f_xyw",   /* only a 4-component input is supported */
+     { insert_3f_xyw_err, insert_3f_xyw_err, insert_3f_xyw_err, 
+       insert_3f_xyw_4 },
+     3 * sizeof(float), FALSE },
+
+   { "1ub_1f",
+     { insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1 },
+     sizeof(uint8_t), FALSE },
+
+   { "3ub_3f_rgb",
+     { insert_3ub_3f_rgb_1, insert_3ub_3f_rgb_2, insert_3ub_3f_rgb_3,
+       insert_3ub_3f_rgb_3 },
+     3 * sizeof(uint8_t), FALSE },
+
+   { "3ub_3f_bgr",
+     { insert_3ub_3f_bgr_1, insert_3ub_3f_bgr_2, insert_3ub_3f_bgr_3,
+       insert_3ub_3f_bgr_3 },
+     3 * sizeof(uint8_t), FALSE },
+
+   { "4ub_4f_rgba",
+     { insert_4ub_4f_rgba_1, insert_4ub_4f_rgba_2, insert_4ub_4f_rgba_3, 
+       insert_4ub_4f_rgba_4 },
+     4 * sizeof(uint8_t), FALSE },
+
+   { "4ub_4f_bgra",
+     { insert_4ub_4f_bgra_1, insert_4ub_4f_bgra_2, insert_4ub_4f_bgra_3,
+       insert_4ub_4f_bgra_4 },
+     4 * sizeof(uint8_t), FALSE },
+
+   { "4ub_4f_argb",
+     { insert_4ub_4f_argb_1, insert_4ub_4f_argb_2, insert_4ub_4f_argb_3,
+       insert_4ub_4f_argb_4 },
+     4 * sizeof(uint8_t), FALSE },
+
+   { "4ub_4f_abgr",
+     { insert_4ub_4f_abgr_1, insert_4ub_4f_abgr_2, insert_4ub_4f_abgr_3,
+       insert_4ub_4f_abgr_4 },
+     4 * sizeof(uint8_t), FALSE },
+
+   { "1f_const",   /* the *_const entries reuse the float insert functions */
+     { insert_1f_1, insert_1f_1, insert_1f_1, insert_1f_1 },
+     sizeof(float), TRUE },
+   
+   { "2f_const",
+     { insert_2f_1, insert_2f_2, insert_2f_2, insert_2f_2 },
+     2 * sizeof(float), TRUE },
+   
+   { "3f_const",
+     { insert_3f_1, insert_3f_2, insert_3f_3, insert_3f_3 },
+     3 * sizeof(float), TRUE },
+   
+   { "4f_const",
+     { insert_4f_1, insert_4f_2, insert_4f_3, insert_4f_4 },
+     4 * sizeof(float), TRUE },
+
+   { "pad",   /* no insert functions and no size of its own */
+     { NULL, NULL, NULL, NULL },
+     0, FALSE },
+
+};
+
+
+
+    
+/***********************************************************************
+ * Hardwired fastpaths for emitting whole vertices or groups of vertices.
+ * EMIT5 defines one that applies F0..F(NR-1) to the first NR attributes.
+ */
+#define EMIT5(NR, F0, F1, F2, F3, F4, NAME)				\
+static void NAME( struct draw_vertex_fetch *vf,				\
+		  unsigned count,						\
+		  uint8_t *v )						\
+{									\
+   struct draw_vf_attr *a = vf->attr;				\
+   unsigned i;								\
+									\
+   for (i = 0 ; i < count ; i++, v += vf->vertex_stride) {		\
+      if (NR > 0) {							\
+	 F0( &a[0], v + a[0].vertoffset, (float *)a[0].inputptr );	\
+	 a[0].inputptr += a[0].inputstride;				\
+      }									\
+      									\
+      if (NR > 1) {							\
+	 F1( &a[1], v + a[1].vertoffset, (float *)a[1].inputptr );	\
+	 a[1].inputptr += a[1].inputstride;				\
+      }									\
+      									\
+      if (NR > 2) {							\
+	 F2( &a[2], v + a[2].vertoffset, (float *)a[2].inputptr );	\
+	 a[2].inputptr += a[2].inputstride;				\
+      }									\
+      									\
+      if (NR > 3) {							\
+	 F3( &a[3], v + a[3].vertoffset, (float *)a[3].inputptr );	\
+	 a[3].inputptr += a[3].inputstride;				\
+      }									\
+									\
+      if (NR > 4) {							\
+	 F4( &a[4], v + a[4].vertoffset, (float *)a[4].inputptr );	\
+	 a[4].inputptr += a[4].inputstride;				\
+      }									\
+   }									\
+}
+
+   
+#define EMIT2(F0, F1, NAME) EMIT5(2, F0, F1, insert_null, \
+				  insert_null, insert_null, NAME)
+
+#define EMIT3(F0, F1, F2, NAME) EMIT5(3, F0, F1, F2, insert_null, \
+				      insert_null, NAME)
+   
+#define EMIT4(F0, F1, F2, F3, NAME) EMIT5(4, F0, F1, F2, F3, \
+				          insert_null, NAME)
+   
+
+EMIT2(insert_3f_3, insert_4ub_4f_rgba_4, emit_xyz3_rgba4)
+
+EMIT3(insert_4f_4, insert_4ub_4f_rgba_4, insert_2f_2, emit_xyzw4_rgba4_st2)
+
+EMIT4(insert_4f_4, insert_4ub_4f_rgba_4, insert_2f_2, insert_2f_2, emit_xyzw4_rgba4_st2_st2)
+
+
+/* Select one of the hardwired fastpaths for the current layout.
+ * vf->emit is set to the matching function, or to NULL if none fits. */
+void draw_vf_generate_hardwired_emit( struct draw_vertex_fetch *vf )
+{
+   const struct draw_vf_attr *a = vf->attr;
+   draw_vf_emit_func func = NULL;
+
+   /* Does it fit a hardwired fastpath?  Help! this is growing out of
+    * control!
+    */
+   switch (vf->attr_count) {
+   case 2:
+      /* xyz, rgba */
+      if (a[0].do_insert == insert_3f_3 &&
+          a[1].do_insert == insert_4ub_4f_rgba_4)
+         func = emit_xyz3_rgba4;
+      break;
+   case 3:
+      /* xyzw, rgba, st */
+      if (a[0].do_insert == insert_4f_4 &&
+          a[1].do_insert == insert_4ub_4f_rgba_4 &&
+          a[2].do_insert == insert_2f_2)
+         func = emit_xyzw4_rgba4_st2;
+      break;
+   case 4:
+      /* xyzw, rgba, st, st */
+      if (a[0].do_insert == insert_4f_4 &&
+          a[1].do_insert == insert_4ub_4f_rgba_4 &&
+          a[2].do_insert == insert_2f_2 &&
+          a[3].do_insert == insert_2f_2)
+         func = emit_xyzw4_rgba4_st2_st2;
+      break;
+   default:
+      break;
+   }
+
+   vf->emit = func;
+}
+
+/***********************************************************************
+ * Generic (non-codegen) functions for whole vertices or groups of
+ * vertices
+ */
+
+void draw_vf_generic_emit( struct draw_vertex_fetch *vf,
+                           unsigned count,
+                           uint8_t *v )
+{
+   struct draw_vf_attr *const first = vf->attr;
+   struct draw_vf_attr *const end = first + vf->attr_count;
+   const unsigned stride = vf->vertex_stride;
+   /* For each of 'count' vertices run every attribute's insert function. */
+   for ( ; count; count--, v += stride) {
+      struct draw_vf_attr *a;
+      for (a = first; a != end; a++) {
+         const float *in = (const float *) a->inputptr;
+         a->inputptr += a->inputstride;   /* step to the next input vertex */
+         a->do_insert( a, v + a->vertoffset, in );
+      }
+   }
+}
+
+
diff --git a/src/gallium/auxiliary/draw/draw_vf_sse.c b/src/gallium/auxiliary/draw/draw_vf_sse.c
new file mode 100644
index 0000000000..1ad2ae756d
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vf_sse.c
@@ -0,0 +1,614 @@
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+
+#include "simple_list.h"
+
+#include "pipe/p_compiler.h"
+
+#include "draw_vf.h"
+
+
+#if defined(USE_SSE_ASM)
+
+#include "x86/rtasm/x86sse.h"
+#include "x86/common_x86_asm.h"
+
+
+#define X    0
+#define Y    1
+#define Z    2
+#define W    3
+
+
+/* State carried around while generating one emit function. */
+struct x86_program {
+   struct x86_function func;	/* code buffer the instructions are emitted into */
+
+   struct draw_vertex_fetch *vf;	/* vertex layout the code is generated for */
+   boolean inputs_safe;		/* TRUE: loads may over-read the source by one dword */
+   boolean outputs_safe;	/* TRUE: stores may over-write the destination by one dword */
+   boolean have_sse2;		/* selects the SSE2 path in emit_pack_store_4ub() */
+   
+   struct x86_reg identity;	/* XMM register loaded from vf->identity */
+   struct x86_reg chan0;	/* XMM register multiplied in before float->ubyte packing */
+};
+
+
+/* Return the register assigned to the identity vector (p->identity). */
+static struct x86_reg get_identity( struct x86_program *p )
+{
+   return p->identity;
+}
+
+/* 4-component destination, 4-component source: emit a single unaligned
+ * movups from arg0 into dest.
+ */
+static void emit_load4f_4( struct x86_program *p, 			   
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   sse_movups(&p->func, dest, arg0);
+}
+
+/* 4-component destination, 3-component source: emit code that reads
+ * three floats from arg0 and takes the fourth component from the
+ * identity register, without reading past the third float.
+ */
+static void emit_load4f_3( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   /* Have to jump through some hoops.  Contents of dest after each of
+    * the four instructions below (a, b, c are the source floats; the
+    * trailing 1 assumes the identity vector's W is 1 - TODO confirm):
+    *
+    * c 0 0 0
+    * c 0 0 1
+    * 0 0 c 1
+    * a b c 1
+    */
+   sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
+   sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
+   sse_shufps(&p->func, dest, dest, SHUF(Y,Z,X,W) );
+   sse_movlps(&p->func, dest, arg0);
+}
+
+/* 4-component destination, 2-component source: the upper two
+ * components come from the identity register.
+ */
+static void emit_load4f_2( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   /* Initialize from identity, then pull in low two words:
+    */
+   sse_movups(&p->func, dest, get_identity(p));
+   sse_movlps(&p->func, dest, arg0);
+}
+
+/* 4-component destination, 1-component source: the low float comes
+ * from arg0, the upper two components from the identity register.
+ */
+static void emit_load4f_1( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   /* Pull in low word, then swizzle in identity */
+   sse_movss(&p->func, dest, arg0);
+   sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
+}
+
+
+
+/* 3-component destination, 3-component source.  The fourth component
+ * of dest is a don't-care, so two strategies are available depending
+ * on p->inputs_safe.
+ */
+static void emit_load3f_3( struct x86_program *p, 			   
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   /* The single-movups path over-reads by 1 dword - potential SEGV if
+    * input is a vertex array - so it is only used when the caller has
+    * declared the inputs safe.
+    */
+   if (p->inputs_safe) {
+      sse_movups(&p->func, dest, arg0);
+   } 
+   else {
+      /* Exact-size read; contents of dest after each instruction:
+       *
+       * c 0 0 0
+       * c c c c
+       * a b c c 
+       */
+      sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
+      sse_shufps(&p->func, dest, dest, SHUF(X,X,X,X));
+      sse_movlps(&p->func, dest, arg0);
+   }
+}
+
+/* 3-component destination, 2-component source: same code as the
+ * 4-from-2 load (missing components come from the identity register).
+ */
+static void emit_load3f_2( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   emit_load4f_2(p, dest, arg0);
+}
+
+/* 3-component destination, 1-component source: same code as the
+ * 4-from-1 load.
+ */
+static void emit_load3f_1( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   emit_load4f_1(p, dest, arg0);
+}
+
+/* 2-component destination, 2-component source: a single movlps into
+ * the low half of dest.
+ */
+static void emit_load2f_2( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   sse_movlps(&p->func, dest, arg0);
+}
+
+/* 2-component destination, 1-component source: same code as the
+ * 4-from-1 load.
+ */
+static void emit_load2f_1( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   emit_load4f_1(p, dest, arg0);
+}
+
+/* 1-component destination: a single movss from arg0, whatever the
+ * source size (see the first row of the load[][] table).
+ */
+static void emit_load1f_1( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   sse_movss(&p->func, dest, arg0);
+}
+
+/* Load emitters, indexed [wanted size - 1][source size - 1] by
+ * emit_load().  Where the source has more components than wanted, the
+ * entry for the wanted size is reused.
+ */
+static void (*load[4][4])( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 ) = {
+   /* want 1 component */
+   { emit_load1f_1, 
+     emit_load1f_1, 
+     emit_load1f_1, 
+     emit_load1f_1 },
+
+   /* want 2 components */
+   { emit_load2f_1, 
+     emit_load2f_2, 
+     emit_load2f_2, 
+     emit_load2f_2 },
+
+   /* want 3 components */
+   { emit_load3f_1, 
+     emit_load3f_2, 
+     emit_load3f_3, 
+     emit_load3f_3 },
+
+   /* want 4 components */
+   { emit_load4f_1, 
+     emit_load4f_2, 
+     emit_load4f_3, 
+     emit_load4f_4 } 
+};
+
+/* Emit code loading \p sz components into \p dest from a source that
+ * provides \p src_sz components.  Both sizes index the load[][] table
+ * after subtracting one, so callers must pass values in 1..4.
+ */
+static void emit_load( struct x86_program *p,
+                       struct x86_reg dest,
+                       unsigned sz,
+                       struct x86_reg src,
+                       unsigned src_sz)
+{
+   const unsigned want = sz - 1;
+   const unsigned have = src_sz - 1;
+
+   (*load[want][have])(p, dest, src);
+}
+
+/* Store all four floats of arg0 to dest with one unaligned movups. */
+static void emit_store4f( struct x86_program *p, 			   
+			  struct x86_reg dest,
+			  struct x86_reg arg0 )
+{
+   sse_movups(&p->func, dest, arg0);
+}
+
+/* Store three floats of arg0 to dest.  With p->outputs_safe set a
+ * full four-float store is emitted; otherwise exactly three floats
+ * are written and arg0 is clobbered in the process.
+ */
+static void emit_store3f( struct x86_program *p, 
+			  struct x86_reg dest,
+			  struct x86_reg arg0 )
+{
+   if (p->outputs_safe) {
+      /* Emit the extra dword anyway.  This may hurt writecombining,
+       * may cause other problems.
+       */
+      sse_movups(&p->func, dest, arg0);
+   }
+   else {
+      /* Alternate strategy - emit two, shuffle, emit one.
+       */
+      sse_movlps(&p->func, dest, arg0);
+      sse_shufps(&p->func, arg0, arg0, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
+      sse_movss(&p->func, x86_make_disp(dest,8), arg0);
+   }
+}
+
+/* Store the low two floats of arg0 to dest (movlps). */
+static void emit_store2f( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   sse_movlps(&p->func, dest, arg0);
+}
+
+/* Store the low float of arg0 to dest (movss). */
+static void emit_store1f( struct x86_program *p, 
+			  struct x86_reg dest,
+			  struct x86_reg arg0 )
+{
+   sse_movss(&p->func, dest, arg0);
+}
+
+
+/* Store emitters, indexed [component count - 1] by emit_store(). */
+static void (*store[4])( struct x86_program *p, 
+			 struct x86_reg dest,
+			 struct x86_reg arg0 ) = 
+{
+   emit_store1f, 
+   emit_store2f, 
+   emit_store3f, 
+   emit_store4f 
+};
+
+/* Emit code storing the first \p sz components of \p temp to \p dest.
+ * \p sz indexes the store[] table after subtracting one, so callers
+ * must pass a value in 1..4.  The 3-component emitter may clobber
+ * \p temp.
+ */
+static void emit_store( struct x86_program *p,
+                        struct x86_reg dest,
+                        unsigned sz,
+                        struct x86_reg temp )
+{
+   const unsigned slot = sz - 1;
+
+   (*store[slot])(p, dest, temp);
+}
+
+/* Emit code that converts the four floats in temp to four bytes and
+ * writes them as one dword at dest.  temp is clobbered.
+ *
+ * With SSE2 the conversion stays in the XMM register; otherwise it
+ * goes through MMX registers 0 and 1.
+ *
+ * NOTE(review): the scale factor is whatever the p->chan0 register
+ * holds at run time; no code in this file is seen loading that
+ * register - confirm where the 255.0 constant is meant to come from.
+ */
+static void emit_pack_store_4ub( struct x86_program *p,
+				 struct x86_reg dest,
+				 struct x86_reg temp )
+{
+   /* Scale by 255.0
+    */
+   sse_mulps(&p->func, temp, p->chan0);
+
+   if (p->have_sse2) {
+      /* float -> int32 -> int16 (signed saturate) -> uint8 (unsigned
+       * saturate), then write the low dword.
+       */
+      sse2_cvtps2dq(&p->func, temp, temp);
+      sse2_packssdw(&p->func, temp, temp);
+      sse2_packuswb(&p->func, temp, temp);
+      sse_movss(&p->func, dest, temp);
+   }
+   else {
+      /* Convert the low and high pairs separately into two MMX
+       * registers, then pack them down the same way.
+       */
+      struct x86_reg mmx0 = x86_make_reg(file_MMX, 0);
+      struct x86_reg mmx1 = x86_make_reg(file_MMX, 1);
+      sse_cvtps2pi(&p->func, mmx0, temp);
+      sse_movhlps(&p->func, temp, temp);
+      sse_cvtps2pi(&p->func, mmx1, temp);
+      mmx_packssdw(&p->func, mmx0, mmx1);
+      mmx_packuswb(&p->func, mmx0, mmx0);
+      mmx_movd(&p->func, dest, mmx0);
+   }
+}
+
+/* Byte distance from \p a to \p b (positive when b lies after a),
+ * narrowed to int.
+ */
+static int get_offset( const void *a, const void *b )
+{
+   const char *from = (const char *) a;
+   const char *to = (const char *) b;
+
+   return to - from;
+}
+
+/* Emit a load of the attribute's current input pointer into srcREG.
+ *
+ * The pointer is read from a->inputptr inside the vertex-fetch struct
+ * addressed by vfREG, so \p a must be one of p->vf's attributes.
+ *
+ * Not much happens here.  Eventually use this function to try and
+ * avoid saving/reloading the source pointers each vertex (if some of
+ * them can fit in registers).
+ */
+static void get_src_ptr( struct x86_program *p,
+                         struct x86_reg srcREG,
+                         struct x86_reg vfREG,
+                         struct draw_vf_attr *a )
+{
+   /* displacement of a->inputptr from the start of the vf struct */
+   const int inputptr_offset = get_offset(p->vf, &a->inputptr);
+
+   x86_mov(&p->func, srcREG, x86_make_disp(vfREG, inputptr_offset));
+}
+
+/* Emit code that advances the attribute's input pointer by one stride
+ * and writes it back to a->inputptr in the vertex-fetch struct.
+ *
+ * srcREG is expected to still hold the pointer loaded by
+ * get_src_ptr().  Nothing is emitted for attributes with a zero
+ * stride.
+ */
+static void update_src_ptr( struct x86_program *p,
+                            struct x86_reg srcREG,
+                            struct x86_reg vfREG,
+                            struct draw_vf_attr *a )
+{
+   struct x86_reg saved_ptr;
+
+   if (!a->inputstride)
+      return;
+
+   saved_ptr = x86_make_disp(vfREG, get_offset(p->vf, &a->inputptr));
+
+   /* The stride is baked into the generated code as an immediate
+    * displacement (it could just as easily be pulled from memory
+    * each time).
+    */
+   x86_lea(&p->func, srcREG, x86_make_disp(srcREG, a->inputstride));
+
+   /* write the advanced pointer back to a[j].inputptr */
+   x86_mov(&p->func, saved_ptr, srcREG);
+}
+
+
+/* Generate the x86/SSE emit loop for p->vf.
+ *
+ * Lots of hardcoding.  Register usage in the generated code:
+ *
+ * EAX  -- pointer to current output vertex
+ * ECX  -- source pointer of the attribute currently being copied
+ * EBP  -- number of vertices left to emit
+ * ESI  -- pointer to the draw_vertex_fetch struct
+ * XMM0 -- scratch for the value being copied
+ *
+ * The generated function takes (vf, count, output pointer) as its
+ * arguments 1..3, pushes/pops EBP and ESI around the loop, and does
+ * nothing when count is zero.
+ *
+ * Returns TRUE after installing the generated code in vf->emit, or
+ * FALSE when an attribute format cannot be handled; in that case
+ * vf->emit is left untouched and the caller owns the partially filled
+ * p->func.
+ *
+ * NOTE(review): p->identity is loaded from vf->identity below, but
+ * nothing emitted here loads p->chan0, which emit_pack_store_4ub()
+ * multiplies by.  Confirm where that constant is meant to come from
+ * before relying on the *UB* formats.
+ */
+static boolean build_vertex_emit( struct x86_program *p )
+{
+   struct draw_vertex_fetch *vf = p->vf;
+   unsigned j = 0;
+
+   struct x86_reg vertexEAX = x86_make_reg(file_REG32, reg_AX);
+   struct x86_reg srcECX = x86_make_reg(file_REG32, reg_CX);
+   struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP);
+   struct x86_reg vfESI = x86_make_reg(file_REG32, reg_SI);
+   struct x86_reg temp = x86_make_reg(file_XMM, 0);
+   uint8_t *fixup, *label;
+
+   /* Push a few regs?
+    */
+   x86_push(&p->func, countEBP);
+   x86_push(&p->func, vfESI);
+
+
+   /* Get vertex count, compare to zero
+    */
+   x86_xor(&p->func, srcECX, srcECX);
+   x86_mov(&p->func, countEBP, x86_fn_arg(&p->func, 2));
+   x86_cmp(&p->func, countEBP, srcECX);
+   fixup = x86_jcc_forward(&p->func, cc_E);
+
+   /* Initialize destination register.
+    */
+   x86_mov(&p->func, vertexEAX, x86_fn_arg(&p->func, 3));
+
+   /* Move argument 1 (vf) into a reg:
+    */
+   x86_mov(&p->func, vfESI, x86_fn_arg(&p->func, 1));
+
+
+   /* always load, needed or not:
+    */
+   sse_movups(&p->func, p->identity, x86_make_disp(vfESI, get_offset(vf, &vf->identity[0])));
+
+   /* Note address for loop jump */
+   label = x86_get_label(&p->func);
+
+   /* Emit code for each of the attributes.  Currently routes
+    * everything through SSE registers, even when it might be more
+    * efficient to stick with regular old x86.  No optimization or
+    * other tricks - enough new ground to cover here just getting
+    * things working.
+    */
+   while (j < vf->attr_count) {
+      struct draw_vf_attr *a = &vf->attr[j];
+      struct x86_reg dest = x86_make_disp(vertexEAX, a->vertoffset);
+
+      /* Now, load an XMM reg from src, perhaps transform, then save.
+       * Could be shortcircuited in specific cases:
+       */
+      switch (a->format) {
+      case DRAW_EMIT_1F:
+      case DRAW_EMIT_1F_CONST:
+         get_src_ptr(p, srcECX, vfESI, a);
+         emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
+         emit_store(p, dest, 1, temp);
+         update_src_ptr(p, srcECX, vfESI, a);
+         break;
+      case DRAW_EMIT_2F:
+      case DRAW_EMIT_2F_CONST:
+         get_src_ptr(p, srcECX, vfESI, a);
+         emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
+         emit_store(p, dest, 2, temp);
+         update_src_ptr(p, srcECX, vfESI, a);
+         break;
+      case DRAW_EMIT_3F:
+      case DRAW_EMIT_3F_CONST:
+         /* Potentially the worst case - hardcode 2+1 copying:
+          */
+         if (0) {
+            get_src_ptr(p, srcECX, vfESI, a);
+            emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
+            emit_store(p, dest, 3, temp);
+            update_src_ptr(p, srcECX, vfESI, a);
+         }
+         else {
+            get_src_ptr(p, srcECX, vfESI, a);
+            emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
+            emit_store(p, dest, 2, temp);
+            if (a->inputsize > 2) {
+               /* third component comes from the source */
+               emit_load(p, temp, 1, x86_make_disp(srcECX, 8), 1);
+               emit_store(p, x86_make_disp(dest,8), 1, temp);
+            }
+            else {
+               /* third component comes from the identity vector's
+                * low float
+                */
+               sse_movss(&p->func, x86_make_disp(dest,8), get_identity(p));
+            }
+            update_src_ptr(p, srcECX, vfESI, a);
+         }
+         break;
+      case DRAW_EMIT_4F:
+      case DRAW_EMIT_4F_CONST:
+         get_src_ptr(p, srcECX, vfESI, a);
+         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+         emit_store(p, dest, 4, temp);
+         update_src_ptr(p, srcECX, vfESI, a);
+         break;
+      case DRAW_EMIT_3F_XYW:
+         get_src_ptr(p, srcECX, vfESI, a);
+         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+         sse_shufps(&p->func, temp, temp, SHUF(X,Y,W,Z));
+         emit_store(p, dest, 3, temp);
+         update_src_ptr(p, srcECX, vfESI, a);
+         break;
+
+      case DRAW_EMIT_1UB_1F:
+         /* Test for PAD3 + 1UB: the packed store below writes a whole
+          * dword starting three bytes before this attribute, so those
+          * three bytes must not belong to the previous attribute.
+          * The test is written as an addition so that it cannot wrap
+          * when vertoffset is smaller than 3.
+          */
+         if (j > 0 &&
+             a[-1].vertoffset + a[-1].vertattrsize + 3 <= a->vertoffset)
+         {
+            get_src_ptr(p, srcECX, vfESI, a);
+            emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
+            sse_shufps(&p->func, temp, temp, SHUF(X,X,X,X));
+            emit_pack_store_4ub(p, x86_make_disp(dest, -3), temp); /* overkill! */
+            update_src_ptr(p, srcECX, vfESI, a);
+         }
+         else {
+            /* There is no previous attribute to report when j == 0;
+             * print zeros rather than reading a[-1].
+             */
+            debug_printf("Can't emit 1ub %x %x %d\n",
+                         a->vertoffset,
+                         j > 0 ? a[-1].vertoffset : 0,
+                         j > 0 ? a[-1].vertattrsize : 0 );
+            return FALSE;
+         }
+         break;
+      case DRAW_EMIT_3UB_3F_RGB:
+      case DRAW_EMIT_3UB_3F_BGR:
+         /* Test for 3UB + PAD1:
+          */
+         if (j == vf->attr_count - 1 ||
+             a[1].vertoffset >= a->vertoffset + 4) {
+            get_src_ptr(p, srcECX, vfESI, a);
+            emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
+            if (a->format == DRAW_EMIT_3UB_3F_BGR)
+               sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
+            emit_pack_store_4ub(p, dest, temp);
+            update_src_ptr(p, srcECX, vfESI, a);
+         }
+         /* Test for 3UB + 1UB:
+          */
+         else if (j < vf->attr_count - 1 &&
+                  a[1].format == DRAW_EMIT_1UB_1F &&
+                  a[1].vertoffset == a->vertoffset + 3) {
+            get_src_ptr(p, srcECX, vfESI, a);
+            emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
+            update_src_ptr(p, srcECX, vfESI, a);
+
+            /* Make room for incoming value:
+             */
+            sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
+
+            get_src_ptr(p, srcECX, vfESI, &a[1]);
+            emit_load(p, temp, 1, x86_deref(srcECX), a[1].inputsize);
+            update_src_ptr(p, srcECX, vfESI, &a[1]);
+
+            /* Rearrange and possibly do BGR conversion:
+             */
+            if (a->format == DRAW_EMIT_3UB_3F_BGR)
+               sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
+            else
+               sse_shufps(&p->func, temp, temp, SHUF(Y,Z,W,X));
+
+            emit_pack_store_4ub(p, dest, temp);
+            j++;		/* NOTE: two attrs consumed */
+         }
+         else {
+            debug_printf("Can't emit 3ub\n");
+         }
+         /* The 3UB formats are deliberately disabled for now: whatever
+          * was emitted above, codegen is reported as failed.
+          */
+         return FALSE;	/* add this later */
+         break;
+
+      case DRAW_EMIT_4UB_4F_RGBA:
+         get_src_ptr(p, srcECX, vfESI, a);
+         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+         emit_pack_store_4ub(p, dest, temp);
+         update_src_ptr(p, srcECX, vfESI, a);
+         break;
+      case DRAW_EMIT_4UB_4F_BGRA:
+         get_src_ptr(p, srcECX, vfESI, a);
+         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+         sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
+         emit_pack_store_4ub(p, dest, temp);
+         update_src_ptr(p, srcECX, vfESI, a);
+         break;
+      case DRAW_EMIT_4UB_4F_ARGB:
+         get_src_ptr(p, srcECX, vfESI, a);
+         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+         sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
+         emit_pack_store_4ub(p, dest, temp);
+         update_src_ptr(p, srcECX, vfESI, a);
+         break;
+      case DRAW_EMIT_4UB_4F_ABGR:
+         get_src_ptr(p, srcECX, vfESI, a);
+         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+         sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
+         emit_pack_store_4ub(p, dest, temp);
+         update_src_ptr(p, srcECX, vfESI, a);
+         break;
+      default:
+         debug_printf("unknown a[%d].format %d\n", j, a->format);
+         return FALSE;	/* catch any new opcodes */
+      }
+
+      /* Increment j by at least 1 - may have been incremented above also:
+       */
+      j++;
+   }
+
+   /* Next vertex:
+    */
+   x86_lea(&p->func, vertexEAX, x86_make_disp(vertexEAX, vf->vertex_stride));
+
+   /* decr count, loop if not zero
+    */
+   x86_dec(&p->func, countEBP);
+   x86_test(&p->func, countEBP, countEBP);
+   x86_jcc(&p->func, cc_NZ, label);
+
+   /* Exit mmx state?
+    */
+   if (p->func.need_emms)
+      mmx_emms(&p->func);
+
+   /* Land forward jump here:
+    */
+   x86_fixup_fwd_jump(&p->func, fixup);
+
+   /* Pop regs and return
+    */
+   x86_pop(&p->func, x86_get_base_reg(vfESI));
+   x86_pop(&p->func, countEBP);
+   x86_ret(&p->func);
+
+   vf->emit = (draw_vf_emit_func)x86_get_func(&p->func);
+   return TRUE;
+}
+
+
+
+/**
+ * Try to generate an SSE emit function for \p vf.
+ *
+ * Without SSE support vf->codegen_emit is cleared and nothing else
+ * happens.  Otherwise the outcome of code generation - success or
+ * failure - is recorded through draw_vf_register_fastpath(), so that
+ * an impossible state is not attempted again, and the code buffer is
+ * released on failure.
+ */
+void draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf )
+{
+   struct x86_program prog;
+   boolean built;
+
+   if (!cpu_has_xmm) {
+      vf->codegen_emit = NULL;
+      return;
+   }
+
+   memset(&prog, 0, sizeof(prog));
+
+   prog.vf = vf;
+   prog.inputs_safe = 0;		/* for now */
+   prog.outputs_safe = 1;		/* for now */
+   prog.have_sse2 = cpu_has_xmm2;
+   prog.identity = x86_make_reg(file_XMM, 6);
+   prog.chan0 = x86_make_reg(file_XMM, 7);
+
+   x86_init_func(&prog.func);
+
+   built = build_vertex_emit(&prog);
+
+   /* Register the result either way; a failure is noted so that we
+    * don't keep trying to codegen an impossible state.
+    */
+   draw_vf_register_fastpath( vf, built );
+
+   if (!built)
+      x86_release_func(&prog.func);
+}
+
+#else
+
+/* Stub used when USE_SSE_ASM is not defined: leaves \p vf untouched. */
+void draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf )
+{
+   /* Dummy version for when USE_SSE_ASM not defined */
+}
+
+#endif
diff --git a/src/gallium/auxiliary/draw/draw_wide_prims.c b/src/gallium/auxiliary/draw/draw_wide_prims.c
new file mode 100644
index 0000000000..655774b155
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_wide_prims.c
@@ -0,0 +1,432 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "pipe/p_util.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_shader_tokens.h"
+#include "draw_private.h"
+
+
+/* Pipeline stage that turns wide lines and wide points into triangles. */
+struct wide_stage {
+   struct draw_stage stage;	/* base class; must be the first member (see wide_stage()) */
+
+   float half_line_width;	/* 0.5 * rasterizer line_width, set by wide_first_line() */
+   float half_point_size;	/* 0.5 * rasterizer point_size, set by wide_first_point() */
+
+   /* Sprite texcoord state, filled in by wide_first_point() when
+    * point sprites are enabled:
+    */
+   uint texcoord_slot[PIPE_MAX_SHADER_OUTPUTS];	/* vertex output index of each GENERIC output */
+   uint texcoord_mode[PIPE_MAX_SHADER_OUTPUTS];	/* rasterizer sprite_coord_mode for that entry */
+   uint num_texcoords;		/* number of valid entries in the two arrays */
+
+   int psize_slot;		/* vertex output index of PSIZE, or -1 for fixed-size points */
+};
+
+
+
+/* Downcast a generic stage pointer to the wide stage that embeds it. */
+static INLINE struct wide_stage *wide_stage( struct draw_stage *stage )
+{
+   return (struct wide_stage *)stage;
+}
+
+
+/* Hand the point to the next stage unchanged. */
+static void passthrough_point( struct draw_stage *stage,
+                               struct prim_header *header )
+{
+   stage->next->point( stage->next, header );
+}
+
+/* Hand the line to the next stage unchanged. */
+static void passthrough_line( struct draw_stage *stage,
+                              struct prim_header *header )
+{
+   stage->next->line(stage->next, header);
+}
+
+/* Hand the triangle to the next stage unchanged. */
+static void passthrough_tri( struct draw_stage *stage,
+                             struct prim_header *header )
+{
+   stage->next->tri(stage->next, header);
+}
+
+
+/**
+ * Draw a wide (non-antialiased) line as a quad made of two triangles.
+ *
+ * Each endpoint is duplicated twice and the copies are pushed apart
+ * by half the line width along the minor axis, i.e. the line is
+ * "stretched" along X or Y.  Quarter- and half-pixel tweaks are then
+ * applied to be conformant.
+ *
+ * XXX need to disable polygon stipple.
+ */
+static void wide_line( struct draw_stage *stage,
+                       struct prim_header *header )
+{
+   const struct wide_stage *wide = wide_stage(stage);
+   const float half_width = wide->half_line_width;
+   struct vertex_header *quad[4];
+   struct prim_header tri;
+   boolean x_major, ascending;
+   float dx, dy;
+   uint major, i;
+
+   /* quad[0], quad[1] copy the first endpoint; quad[2], quad[3] the second */
+   for (i = 0; i < 4; i++)
+      quad[i] = dup_vert(stage, header->v[i / 2], i);
+
+   dx = FABSF(quad[0]->data[0][0] - quad[2]->data[0][0]);
+   dy = FABSF(quad[0]->data[0][1] - quad[2]->data[0][1]);
+
+   x_major = (dx > dy);
+   major = x_major ? 0 : 1;
+
+   /* Spread the copies along the minor axis: even copies move towards
+    * the negative side, odd copies towards the positive side.
+    */
+   for (i = 0; i < 4; i++) {
+      float *pos = quad[i]->data[0];
+
+      if (x_major) {
+         pos[1] = (i & 1) ? pos[1] + half_width - 0.25f
+                          : pos[1] - half_width - 0.25f;
+      }
+      else {
+         pos[0] = (i & 1) ? pos[0] + half_width + 0.25f
+                          : pos[0] - half_width + 0.25f;
+      }
+   }
+
+   /* Shift the whole quad half a pixel along the major axis, against
+    * the direction the line runs in (left-to-right / top-to-bottom
+    * lines move towards the negative side).
+    */
+   ascending = (quad[0]->data[0][major] < quad[2]->data[0][major]);
+
+   for (i = 0; i < 4; i++) {
+      float *pos = quad[i]->data[0];
+
+      if (ascending)
+         pos[major] -= 0.5f;
+      else
+         pos[major] += 0.5f;
+   }
+
+   tri.det = header->det;  /* only the sign matters */
+
+   tri.v[0] = quad[0];
+   tri.v[1] = quad[2];
+   tri.v[2] = quad[3];
+   stage->next->tri( stage->next, &tri );
+
+   tri.v[0] = quad[0];
+   tri.v[1] = quad[3];
+   tri.v[2] = quad[1];
+   stage->next->tri( stage->next, &tri );
+}
+
+
+/**
+ * Draw a wide line by drawing a quad, using geometry which will
+ * fulfill GL's antialiased line requirements.
+ *
+ * The quad's corners are the two endpoints displaced by half the line
+ * width perpendicular to the line direction.
+ *
+ * NOTE(review): the direction is normalized by dividing by the line
+ * length, which is not checked against zero here.
+ */
+static void wide_line_aa(struct draw_stage *stage,
+                         struct prim_header *header)
+{
+   const struct wide_stage *wide = wide_stage(stage);
+   const float half_width = wide->half_line_width;
+   struct prim_header tri;
+   struct vertex_header *v[4];
+   float dx = header->v[1]->data[0][0] - header->v[0]->data[0][0];
+   float dy = header->v[1]->data[0][1] - header->v[0]->data[0][1];
+   const float len = (float) sqrt(dx * dx + dy * dy);
+   uint i;
+
+   /* scale the direction vector to half the line width */
+   dx = dx * half_width / len;
+   dy = dy * half_width / len;
+
+   /* allocate/dup new verts: two copies of each endpoint */
+   for (i = 0; i < 4; i++) {
+      v[i] = dup_vert(stage, header->v[i/2], i);
+   }
+
+   /*
+    * Quad for line from v0 to v1:
+    *
+    *  1                         3
+    *  +-------------------------+
+    *  |                         |
+    *  *v0                     v1*
+    *  |                         |
+    *  +-------------------------+
+    *  0                         2
+    *
+    * Even corners are displaced by (+dy, -dx), odd corners by
+    * (-dy, +dx).
+    */
+   for (i = 0; i < 4; i++) {
+      float *pos = v[i]->data[0];
+
+      if (i & 1) {
+         pos[0] -= dy;
+         pos[1] += dx;
+      }
+      else {
+         pos[0] += dy;
+         pos[1] -= dx;
+      }
+   }
+
+   tri.det = header->det;  /* only the sign matters */
+
+   tri.v[0] = v[2];
+   tri.v[1] = v[1];
+   tri.v[2] = v[0];
+   stage->next->tri( stage->next, &tri );
+
+   tri.v[0] = v[3];
+   tri.v[1] = v[1];
+   tri.v[2] = v[2];
+   stage->next->tri( stage->next, &tri );
+}
+
+
+/**
+ * Set the vertex texcoords for sprite mode.
+ *
+ * For every recorded texcoord whose mode is not PIPE_SPRITE_COORD_NONE
+ * the matching vertex output is overwritten with \p tc; in
+ * PIPE_SPRITE_COORD_LOWER_LEFT mode the second component is flipped
+ * (1 - t).  Outputs in NONE mode are left untouched.
+ */
+static void set_texcoords(const struct wide_stage *wide,
+                          struct vertex_header *v, const float tc[4])
+{
+   uint n;
+
+   for (n = 0; n < wide->num_texcoords; n++) {
+      const uint mode = wide->texcoord_mode[n];
+      float *out;
+
+      if (mode == PIPE_SPRITE_COORD_NONE)
+         continue;
+
+      out = v->data[wide->texcoord_slot[n]];
+
+      out[0] = tc[0];
+      out[1] = (mode == PIPE_SPRITE_COORD_LOWER_LEFT) ? 1.0f - tc[1] : tc[1];
+      out[2] = tc[2];
+      out[3] = tc[3];
+   }
+}
+
+
+/* Draw a wide point as a square made of two triangles.
+ *
+ * The point's vertex is duplicated four times and the copies are moved
+ * to the corners of the square; the size is either read per-vertex
+ * from the PSIZE output or taken from the fixed rasterizer state.
+ * With point sprites enabled each corner also gets its texcoords.
+ *
+ * If there are lots of sprite points (and why wouldn't there be?) it
+ * would probably be more sensible to change hardware setup to
+ * optimize this rather than doing the whole thing in software like
+ * this.
+ */
+static void wide_point( struct draw_stage *stage,
+                        struct prim_header *header )
+{
+   /* sprite texcoords, in the same order as the corners below */
+   static const float corner_tc[4][4] = {
+      { 0, 0, 0, 1 },
+      { 0, 1, 0, 1 },
+      { 1, 0, 0, 1 },
+      { 1, 1, 0, 1 }
+   };
+   const struct wide_stage *wide = wide_stage(stage);
+   const boolean sprite = (boolean) stage->draw->rasterizer->point_sprite;
+   struct vertex_header *corner[4];
+   struct prim_header tri;
+   float half_size;
+   float left_adj, right_adj;
+   uint i;
+
+   /* four dups of original vertex */
+   for (i = 0; i < 4; i++)
+      corner[i] = dup_vert(stage, header->v[0], i);
+
+   /* point size is either per-vertex or fixed size */
+   half_size = (wide->psize_slot >= 0)
+      ? 0.5f * header->v[0]->data[wide->psize_slot][0]
+      : wide->half_point_size;
+
+   left_adj = -half_size + 0.25f;
+   right_adj = half_size + 0.25f;
+
+   /* corners 0,1 sit on the left edge, 2,3 on the right edge; even
+    * corners move towards -Y, odd ones towards +Y
+    */
+   for (i = 0; i < 4; i++) {
+      float *pos = corner[i]->data[0];
+
+      if (i < 2)
+         pos[0] += left_adj;
+      else
+         pos[0] += right_adj;
+
+      if (i & 1)
+         pos[1] += half_size;
+      else
+         pos[1] -= half_size;
+   }
+
+   if (sprite) {
+      for (i = 0; i < 4; i++)
+         set_texcoords( wide, corner[i], corner_tc[i] );
+   }
+
+   tri.det = header->det;  /* only the sign matters */
+
+   tri.v[0] = corner[0];
+   tri.v[1] = corner[2];
+   tri.v[2] = corner[3];
+   stage->next->tri( stage->next, &tri );
+
+   tri.v[0] = corner[0];
+   tri.v[1] = corner[3];
+   tri.v[2] = corner[1];
+   stage->next->tri( stage->next, &tri );
+}
+
+
+/* Point entry used for the first point after creation or a flush.
+ *
+ * Captures the point state from the rasterizer (size, sprite texcoord
+ * slots and modes, PSIZE output slot), installs either wide_point or
+ * passthrough_point as the stage's point function, and then draws the
+ * point through the function just installed.
+ */
+static void wide_first_point( struct draw_stage *stage,
+                              struct prim_header *header )
+{
+   struct wide_stage *wide = wide_stage(stage);
+   struct draw_context *draw = stage->draw;
+
+   wide->half_point_size = 0.5f * draw->rasterizer->point_size;
+
+   /* only points whose fixed size differs from 1 are widened */
+   stage->point = (draw->rasterizer->point_size != 1.0)
+      ? wide_point
+      : passthrough_point;
+
+   if (draw->rasterizer->point_sprite) {
+      /* find vertex shader texcoord outputs */
+      const struct draw_vertex_shader *shader = draw->vertex_shader;
+      uint found = 0;
+      uint out;
+
+      for (out = 0; out < shader->state->num_outputs; out++) {
+         if (shader->state->output_semantic_name[out] != TGSI_SEMANTIC_GENERIC)
+            continue;
+
+         wide->texcoord_slot[found] = out;
+         wide->texcoord_mode[found] = draw->rasterizer->sprite_coord_mode[found];
+         found++;
+      }
+      wide->num_texcoords = found;
+   }
+
+   /* -1 means "use the fixed point size" */
+   wide->psize_slot = -1;
+
+   if (draw->rasterizer->point_size_per_vertex) {
+      /* find PSIZ vertex output; the first match wins */
+      const struct draw_vertex_shader *shader = draw->vertex_shader;
+      uint out = 0;
+
+      while (out < shader->state->num_outputs) {
+         if (shader->state->output_semantic_name[out] == TGSI_SEMANTIC_PSIZE) {
+            wide->psize_slot = out;
+            break;
+         }
+         out++;
+      }
+   }
+
+   stage->point( stage, header );
+}
+
+
+
+/* Line entry used for the first line after creation or a flush.
+ *
+ * Captures the line width from the rasterizer, installs the matching
+ * line function (passthrough for width 1, otherwise the antialiased
+ * or plain wide-line path depending on line_smooth), and then draws
+ * the line through the function just installed.
+ */
+static void wide_first_line( struct draw_stage *stage,
+                             struct prim_header *header )
+{
+   struct wide_stage *wide = wide_stage(stage);
+   struct draw_context *draw = stage->draw;
+
+   wide->half_line_width = 0.5f * draw->rasterizer->line_width;
+
+   if (draw->rasterizer->line_width == 1.0)
+      wide->stage.line = passthrough_line;
+   else if (draw->rasterizer->line_smooth)
+      wide->stage.line = wide_line_aa;
+   else
+      wide->stage.line = wide_line;
+
+   stage->line( stage, header );
+}
+
+
+/* Re-arm the "first" entry points, so that the next line/point
+ * re-reads the rasterizer state, then pass the flush on to the next
+ * stage.
+ */
+static void wide_flush( struct draw_stage *stage, unsigned flags )
+{
+   stage->line = wide_first_line;
+   stage->point = wide_first_point;
+   stage->next->flush( stage->next, flags );
+}
+
+
+/* This stage keeps no stipple state: just forward to the next stage. */
+static void wide_reset_stipple_counter( struct draw_stage *stage )
+{
+   stage->next->reset_stipple_counter( stage->next );
+}
+
+
+/* Release the stage's temporary vertices and then the stage itself. */
+static void wide_destroy( struct draw_stage *stage )
+{
+   draw_free_temp_verts( stage );
+   FREE( stage );
+}
+
+
+/**
+ * Create the wide line/point stage for \p draw.
+ *
+ * The stage starts with the "first" line/point entry points installed,
+ * passes triangles straight through, and owns four temporary vertices
+ * (one quad's worth).  The caller is expected to fill in the stage's
+ * next pointer, which is left NULL here.
+ *
+ * \return the new stage, or NULL if the stage struct could not be
+ *         allocated.
+ */
+struct draw_stage *draw_wide_stage( struct draw_context *draw )
+{
+   struct wide_stage *wide = CALLOC_STRUCT(wide_stage);
+
+   /* don't touch the stage if the allocation failed */
+   if (wide == NULL)
+      return NULL;
+
+   draw_alloc_temp_verts( &wide->stage, 4 );
+
+   wide->stage.draw = draw;
+   wide->stage.next = NULL;
+   wide->stage.point = wide_first_point;
+   wide->stage.line = wide_first_line;
+   wide->stage.tri = passthrough_tri;
+   wide->stage.flush = wide_flush;
+   wide->stage.reset_stipple_counter = wide_reset_stipple_counter;
+   wide->stage.destroy = wide_destroy;
+
+   return &wide->stage;
+}
-- 
cgit v1.2.3