14 files changed, 2639 insertions, 228 deletions
diff --git a/src/mesa/pipe/draw/Makefile b/src/mesa/pipe/draw/Makefile
new file mode 100644
index 0000000000..451911a354
--- /dev/null
+++ b/src/mesa/pipe/draw/Makefile
@@ -0,0 +1,2 @@
+default:
+	cd .. ; make
diff --git a/src/mesa/pipe/draw/draw_clip.c b/src/mesa/pipe/draw/draw_clip.c
index 2d410e3244..61130c5600 100644
--- a/src/mesa/pipe/draw/draw_clip.c
+++ b/src/mesa/pipe/draw/draw_clip.c
@@ -33,6 +33,8 @@
 
 
 #include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+
 #include "draw_context.h"
 #include "draw_private.h"
 
@@ -54,6 +56,12 @@
 struct clipper {
    struct draw_stage stage;      /**< base class */
 
+   /* Basically duplicate some of the flatshading logic here:
+    */
+   boolean flat;
+   uint num_color_attribs;
+   uint color_attribs[4];  /* front/back primary/secondary colors */
+
    float (*plane)[4];
 };
 
@@ -82,6 +90,17 @@ static void interp_attr( float *fdst,
    fdst[3] = LINTERP( t, fout[3], fin[3] );
 }
 
+static void copy_colors( struct draw_stage *stage,
+			 struct vertex_header *dst,
+			 const struct vertex_header *src )
+{
+   const struct clipper *clipper = clipper_stage(stage);
+   uint i;
+   for (i = 0; i < clipper->num_color_attribs; i++) {
+      const uint attr = clipper->color_attribs[i];
+      COPY_4FV(dst->data[attr], src->data[attr]);
+   }
+}
 
 
 
@@ -134,27 +153,11 @@ static void interp( const struct clipper *clip,
    }
 }
 
-#if 0   
-static INLINE void do_tri( struct draw_stage *next,
-			   struct prim_header *header )
-{
-   unsigned i;
-   for (i = 0; i < 3; i++) {
-      float *ndc = header->v[i]->data[0];
-      _mesa_printf("ndc %f %f %f\n", ndc[0], ndc[1], ndc[2]);
-      assert(ndc[0] >= -1 && ndc[0] <= 641);
-      assert(ndc[1] >= 30 && ndc[1] <= 481);
-   }
-   _mesa_printf("\n");
-   next->tri(next, header);
-}
-#endif
-
 
 static void emit_poly( struct draw_stage *stage,
 		       struct vertex_header **inlist,
 		       unsigned n,
-                       const struct prim_header *origPrim)
+		       const struct prim_header *origPrim)
 {
    struct prim_header header;
    unsigned i;
@@ -163,16 +166,16 @@ static void emit_poly( struct draw_stage *stage,
    header.det = origPrim->det;
 
    for (i = 2; i < n; i++) {
-      header.v[0] = inlist[0];
-      header.v[1] = inlist[i-1];
-      header.v[2] = inlist[i];
+      header.v[0] = inlist[i-1];
+      header.v[1] = inlist[i];
+      header.v[2] = inlist[0];	/* keep in v[2] for flatshading */
 	
       {
-	 unsigned tmp0 = header.v[0]->edgeflag;
+	 unsigned tmp1 = header.v[1]->edgeflag;
 	 unsigned tmp2 = header.v[2]->edgeflag;
 
-	 if (i != 2)   header.v[0]->edgeflag = 0;
-	 if (i != n-1) header.v[2]->edgeflag = 0;
+	 if (i != n-1) header.v[1]->edgeflag = 0;
+	 if (i != 2)   header.v[2]->edgeflag = 0;
 
          header.edgeflags = ((header.v[0]->edgeflag << 0) | 
                              (header.v[1]->edgeflag << 1) | 
@@ -180,27 +183,13 @@ static void emit_poly( struct draw_stage *stage,
 
 	 stage->next->tri( stage->next, &header );
 
-	 header.v[0]->edgeflag = tmp0;
+	 header.v[1]->edgeflag = tmp1;
 	 header.v[2]->edgeflag = tmp2;
       }
    }
 }
 
 
-#if 0
-static void emit_poly( struct draw_stage *stage )
-{
-   unsigned i;
-
-   for (i = 2; i < n; i++) {
-      header->v[0] = inlist[0];
-      header->v[1] = inlist[i-1];
-      header->v[2] = inlist[i];
-	 
-      stage->next->tri( stage->next, header );
-   }
-}
-#endif
 
 
 /* Clip a triangle against the viewport and user clip planes.
@@ -281,6 +270,18 @@ do_clip_tri( struct draw_stage *stage,
       }
    }
 
+   /* If flat-shading, copy color to new provoking vertex.
+    */
+   if (clipper->flat && inlist[0] != header->v[2]) {
+      if (1) {
+	 inlist[0] = dup_vert(stage, inlist[0], tmpnr++);
+      }
+
+      copy_colors(stage, inlist[0], header->v[2]);
+   }
+
+
+
    /* Emit the polygon as triangles to the setup stage:
     */
    if (n >= 3)
@@ -328,6 +329,10 @@ do_clip_line( struct draw_stage *stage,
 
    if (v0->clipmask) {
       interp( clipper, stage->tmp[0], t0, v0, v1 );
+
+      if (clipper->flat)
+	 copy_colors(stage, stage->tmp[0], v0);
+
       newprim.v[0] = stage->tmp[0];
    }
    else {
@@ -393,8 +398,55 @@ clip_tri( struct draw_stage *stage,
    }
 }
 
-static void clip_flush( struct draw_stage *stage, unsigned flags )
+/* Update state.  Could further delay this until we hit the first
+ * primitive that really requires clipping.
+ */
+static void 
+clip_init_state( struct draw_stage *stage )
+{
+   struct clipper *clipper = clipper_stage( stage );
+
+   clipper->flat = stage->draw->rasterizer->flatshade ? TRUE : FALSE;
+
+   if (clipper->flat) {
+      const struct pipe_shader_state *vs = stage->draw->vertex_shader->state;
+      uint i;
+
+      clipper->num_color_attribs = 0;
+      for (i = 0; i < vs->num_outputs; i++) {
+	 if (vs->output_semantic_name[i] == TGSI_SEMANTIC_COLOR ||
+	     vs->output_semantic_name[i] == TGSI_SEMANTIC_BCOLOR) {
+	    clipper->color_attribs[clipper->num_color_attribs++] = i;
+	 }
+      }
+   }
+   
+   stage->tri = clip_tri;
+   stage->line = clip_line;
+}
+
+
+
+static void clip_first_tri( struct draw_stage *stage,
+			    struct prim_header *header )
+{
+   clip_init_state( stage );
+   stage->tri( stage, header );
+}
+
+static void clip_first_line( struct draw_stage *stage,
+			     struct prim_header *header )
+{
+   clip_init_state( stage );
+   stage->line( stage, header );
+}
+
+
+static void clip_flush( struct draw_stage *stage, 
+			     unsigned flags )
 {
+   stage->tri = clip_first_tri;
+   stage->line = clip_first_line;
    stage->next->flush( stage->next, flags );
 }
 
@@ -420,12 +472,12 @@ struct draw_stage *draw_clip_stage( struct draw_context *draw )
 {
    struct clipper *clipper = CALLOC_STRUCT(clipper);
 
-   draw_alloc_tmps( &clipper->stage, MAX_CLIPPED_VERTICES );
+   draw_alloc_tmps( &clipper->stage, MAX_CLIPPED_VERTICES+1 );
 
    clipper->stage.draw = draw;
    clipper->stage.point = clip_point;
-   clipper->stage.line = clip_line;
-   clipper->stage.tri = clip_tri;
+   clipper->stage.line = clip_first_line;
+   clipper->stage.tri = clip_first_tri;
    clipper->stage.flush = clip_flush;
    clipper->stage.reset_stipple_counter = clip_reset_stipple_counter;
    clipper->stage.destroy = clip_destroy;
diff --git a/src/mesa/pipe/draw/draw_context.c b/src/mesa/pipe/draw/draw_context.c
index e8ca1f035b..b15f57c824 100644
--- a/src/mesa/pipe/draw/draw_context.c
+++ b/src/mesa/pipe/draw/draw_context.c
@@ -71,12 +71,15 @@ struct draw_context *draw_create( void )
     */
    {
       uint i;
-      char *tmp = (char*) MALLOC( Elements(draw->vcache.vertex) * MAX_VERTEX_SIZE );
+      const unsigned size = (MAX_VERTEX_SIZE + 0x0f) & ~0x0f;
+      char *tmp = align_malloc(Elements(draw->vcache.vertex) * size, 16);
 
       for (i = 0; i < Elements(draw->vcache.vertex); i++)
-	 draw->vcache.vertex[i] = (struct vertex_header *)(tmp + i * MAX_VERTEX_SIZE);
+	 draw->vcache.vertex[i] = (struct vertex_header *)(tmp + i * size);
    }
 
+   draw->shader_queue_flush = draw_vertex_shader_queue_flush;
+
    draw->convert_wide_points = TRUE;
    draw->convert_wide_lines = TRUE;
 
@@ -103,7 +106,7 @@ void draw_destroy( struct draw_context *draw )
    if (draw->pipeline.rasterize)
       draw->pipeline.rasterize->destroy( draw->pipeline.rasterize );
    tgsi_exec_machine_free_data(&draw->machine);
-   FREE( draw->vcache.vertex[0] ); /* Frees all the vertices. */
+   align_free( draw->vcache.vertex[0] ); /* Frees all the vertices. */
    FREE( draw );
 }
 
diff --git a/src/mesa/pipe/draw/draw_prim.c b/src/mesa/pipe/draw/draw_prim.c
index 243381aec0..51e2242719 100644
--- a/src/mesa/pipe/draw/draw_prim.c
+++ b/src/mesa/pipe/draw/draw_prim.c
@@ -30,6 +30,8 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
+#include "pipe/p_debug.h"
+
 #include "draw_private.h"
 #include "draw_context.h"
 
@@ -60,38 +62,55 @@ static void draw_prim_queue_flush( struct draw_context *draw )
    unsigned i;
 
    if (0)
-      fprintf(stdout,"Flushing with %d prims, %d verts\n",
-             draw->pq.queue_nr, draw->vs.queue_nr);
+      debug_printf("Flushing with %d prims, %d verts\n",
+                   draw->pq.queue_nr, draw->vs.queue_nr);
 
-   if (draw->pq.queue_nr == 0)
-      return;
+   assert (draw->pq.queue_nr != 0);
 
    /* NOTE: we cannot save draw->pipeline->first in a local var because
     * draw->pipeline->first is often changed by the first call to tri(),
     * line(), etc.
     */
-   switch (draw->reduced_prim) {
-   case RP_TRI:
-      for (i = 0; i < draw->pq.queue_nr; i++) {
-	 if (draw->pq.queue[i].reset_line_stipple)
-	    draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
-
-	 draw->pipeline.first->tri( draw->pipeline.first, &draw->pq.queue[i] );
+   if (draw->rasterizer->line_stipple_enable) {
+      switch (draw->reduced_prim) {
+      case RP_TRI:
+	 for (i = 0; i < draw->pq.queue_nr; i++) {
+	    if (draw->pq.queue[i].reset_line_stipple)
+	       draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
+	    
+	    draw->pipeline.first->tri( draw->pipeline.first, &draw->pq.queue[i] );
+	 }
+	 break;
+      case RP_LINE:
+	 for (i = 0; i < draw->pq.queue_nr; i++) {
+	    if (draw->pq.queue[i].reset_line_stipple)
+	       draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
+	    
+	    draw->pipeline.first->line( draw->pipeline.first, &draw->pq.queue[i] );
+	 }
+	 break;
+      case RP_POINT:
+	 draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
+	 for (i = 0; i < draw->pq.queue_nr; i++)
+	    draw->pipeline.first->point( draw->pipeline.first, &draw->pq.queue[i] );
+	 break;
       }
-      break;
-   case RP_LINE:
-      for (i = 0; i < draw->pq.queue_nr; i++) {
-	 if (draw->pq.queue[i].reset_line_stipple)
-	    draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
-
-	 draw->pipeline.first->line( draw->pipeline.first, &draw->pq.queue[i] );
+   }
+   else {
+      switch (draw->reduced_prim) {
+      case RP_TRI:
+	 for (i = 0; i < draw->pq.queue_nr; i++) 
+	    draw->pipeline.first->tri( draw->pipeline.first, &draw->pq.queue[i] );
+	 break;
+      case RP_LINE:
+	 for (i = 0; i < draw->pq.queue_nr; i++) 
+	    draw->pipeline.first->line( draw->pipeline.first, &draw->pq.queue[i] );
+	 break;
+      case RP_POINT:
+	 for (i = 0; i < draw->pq.queue_nr; i++)
+	    draw->pipeline.first->point( draw->pipeline.first, &draw->pq.queue[i] );
+	 break;
       }
-      break;
-   case RP_POINT:
-      draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
-      for (i = 0; i < draw->pq.queue_nr; i++)
-	 draw->pipeline.first->point( draw->pipeline.first, &draw->pq.queue[i] );
-      break;
    }
 
    draw->pq.queue_nr = 0;   
@@ -103,16 +122,18 @@ static void draw_prim_queue_flush( struct draw_context *draw )
 void draw_do_flush( struct draw_context *draw, unsigned flags )
 {
    if (0)
-      fprintf(stdout,"Flushing with %d verts, %d prims\n",
-	      draw->vs.queue_nr,
-	      draw->pq.queue_nr );
+      debug_printf("Flushing with %d verts, %d prims\n",
+                   draw->vs.queue_nr,
+                   draw->pq.queue_nr );
 
 
    if (flags >= DRAW_FLUSH_SHADER_QUEUE) {
-      draw_vertex_shader_queue_flush(draw);
+      if (draw->vs.queue_nr)
+         (*draw->shader_queue_flush)(draw);
 
       if (flags >= DRAW_FLUSH_PRIM_QUEUE) {
-         draw_prim_queue_flush(draw);
+	 if (draw->pq.queue_nr)
+	    draw_prim_queue_flush(draw);
 
 	 if (flags >= DRAW_FLUSH_VERTEX_CACHE) {
             draw_vertex_cache_invalidate(draw);
@@ -138,11 +159,11 @@ static struct prim_header *get_queued_prim( struct draw_context *draw,
 					    unsigned nr_verts )
 {
    if (!draw_vertex_cache_check_space( draw, nr_verts )) {
-//      fprintf(stderr, "v");
+//      debug_printf("v");
       draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE );
    }
    else if (draw->pq.queue_nr == PRIM_QUEUE_LENGTH) {
-//      fprintf(stderr, "p");
+//      debug_printf("p");
       draw_do_flush( draw, DRAW_FLUSH_PRIM_QUEUE );
    }
 
@@ -230,7 +251,7 @@ static void do_ef_triangle( struct draw_context *draw,
 }
 
 
-static void do_quad( struct draw_context *draw,
+static void do_ef_quad( struct draw_context *draw,
 		     unsigned v0,
 		     unsigned v1,
 		     unsigned v2,
@@ -242,6 +263,16 @@ static void do_quad( struct draw_context *draw,
    do_ef_triangle( draw, 0, omitEdge3, v1, v2, v3 );
 }
 
+static void do_quad( struct draw_context *draw,
+		     unsigned v0,
+		     unsigned v1,
+		     unsigned v2,
+		     unsigned v3 )
+{
+   do_triangle( draw, v0, v1, v3 );
+   do_triangle( draw, v1, v2, v3 );
+}
+
 
 /**
  * Main entrypoint to draw some number of points/lines/triangles
@@ -251,8 +282,10 @@ draw_prim( struct draw_context *draw,
 	   unsigned prim, unsigned start, unsigned count )
 {
    unsigned i;
+   boolean unfilled = (draw->rasterizer->fill_cw != PIPE_POLYGON_MODE_FILL ||
+		       draw->rasterizer->fill_ccw != PIPE_POLYGON_MODE_FILL);
 
-//   _mesa_printf("%s (%d) %d/%d\n", __FUNCTION__, draw->prim, start, count );
+//   debug_printf("%s (%d) %d/%d\n", __FUNCTION__, draw->prim, start, count );
 
    switch (prim) {
    case PIPE_PRIM_POINTS:
@@ -288,24 +321,32 @@ draw_prim( struct draw_context *draw,
       break;
 
    case PIPE_PRIM_LINE_STRIP:
-      if (count >= 2) {
-	 for (i = 1; i < count; i++) {
-	    do_line( draw,
-		     i == 1,
-		     start + i - 1,
-		     start + i );
-	 }
+      for (i = 1; i < count; i++) {
+	 do_line( draw,
+		  i == 1,
+		  start + i - 1,
+		  start + i );
       }
       break;
 
    case PIPE_PRIM_TRIANGLES:
-      for (i = 0; i+2 < count; i += 3) {
-	 do_ef_triangle( draw,
-			 1, 
-			 ~0,
+      if (unfilled) {
+	 for (i = 0; i+2 < count; i += 3) {
+	    do_ef_triangle( draw,
+			    1, 
+			    ~0,
+			    start + i + 0,
+			    start + i + 1,
+			    start + i + 2 );
+	 }
+      } 
+      else {
+	 for (i = 0; i+2 < count; i += 3) {
+	    do_triangle( draw,
 			 start + i + 0,
 			 start + i + 1,
 			 start + i + 2 );
+	 }
       }
       break;
 
@@ -339,27 +380,49 @@ draw_prim( struct draw_context *draw,
 
 
    case PIPE_PRIM_QUADS:
-      for (i = 0; i+3 < count; i += 4) {
-	 do_quad( draw,
-		  start + i + 0,
-		  start + i + 1,
-		  start + i + 2,
-		  start + i + 3);
+      if (unfilled) {
+	 for (i = 0; i+3 < count; i += 4) {
+	    do_ef_quad( draw,
+			start + i + 0,
+			start + i + 1,
+			start + i + 2,
+			start + i + 3);
+	 }
+      }
+      else {
+	 for (i = 0; i+3 < count; i += 4) {
+	    do_quad( draw,
+		     start + i + 0,
+		     start + i + 1,
+		     start + i + 2,
+		     start + i + 3);
+	 }
       }
       break;
 
    case PIPE_PRIM_QUAD_STRIP:
-      for (i = 0; i+3 < count; i += 2) {
-	 do_quad( draw,
-		  start + i + 2,
-		  start + i + 0,
-		  start + i + 1,
-		  start + i + 3);
+      if (unfilled) {
+	 for (i = 0; i+3 < count; i += 2) {
+	    do_ef_quad( draw,
+			start + i + 2,
+			start + i + 0,
+			start + i + 1,
+			start + i + 3);
+	 }
+      }
+      else {
+	 for (i = 0; i+3 < count; i += 2) {
+	    do_quad( draw,
+		     start + i + 2,
+		     start + i + 0,
+		     start + i + 1,
+		     start + i + 3);
+	 }
       }
       break;
 
    case PIPE_PRIM_POLYGON:
-      if (count >= 3) {
+      if (unfilled) {
 	 unsigned ef_mask = (1<<2) | (1<<0);
 
 	 for (i = 0; i+2 < count; i++) {
@@ -377,6 +440,14 @@ draw_prim( struct draw_context *draw,
 	    ef_mask &= ~(1<<2);
 	 }
       }
+      else {
+	 for (i = 0; i+2 < count; i++) {
+	    do_triangle( draw,
+			 start + i + 1,
+			 start + i + 2,
+			 start + 0);
+	 }
+      }
       break;
 
    default:
diff --git a/src/mesa/pipe/draw/draw_private.h b/src/mesa/pipe/draw/draw_private.h
index 1e59f5bd8d..7782db0477 100644
--- a/src/mesa/pipe/draw/draw_private.h
+++ b/src/mesa/pipe/draw/draw_private.h
@@ -141,6 +141,10 @@ struct draw_vertex_shader {
 /* Internal function for vertex fetch.
  */
 typedef void (*fetch_func)(const void *ptr, float *attrib);
+typedef void (*full_fetch_func)( struct draw_context *draw,
+				 struct tgsi_exec_machine *machine,
+				 const unsigned *elts,
+				 unsigned count );
 
 
 
@@ -210,6 +214,7 @@ struct draw_context
       unsigned pitch[PIPE_ATTRIB_MAX];
       fetch_func fetch[PIPE_ATTRIB_MAX];
       unsigned nr_attrs;
+      full_fetch_func fetch_func;
    } vertex_fetch;
 
    /* Post-tnl vertex cache:
@@ -235,6 +240,11 @@ struct draw_context
       unsigned queue_nr;
    } vs;
 
+   /**
+    * Run the vertex shader on all vertices in the vertex queue.
+    */
+   void (*shader_queue_flush)(struct draw_context *draw);
+
    /* Prim pipeline queue:
     */
    struct {
@@ -249,6 +259,8 @@ struct draw_context
 #ifdef MESA_LLVM
    struct gallivm_cpu_engine *engine;
 #endif
+   
+   void *driver_private;
 };
 
 
@@ -287,10 +299,6 @@ extern void draw_vertex_shader_queue_flush_llvm( struct draw_context *draw );
 struct tgsi_exec_machine;
 
 extern void draw_update_vertex_fetch( struct draw_context *draw );
-extern void draw_vertex_fetch( struct draw_context *draw,
-			       struct tgsi_exec_machine *machine,
-			       const unsigned *elts,
-			       unsigned count );
 
 
 #define DRAW_FLUSH_SHADER_QUEUE              0x1 /* sized not to overflow, never raised */
diff --git a/src/mesa/pipe/draw/draw_validate.c b/src/mesa/pipe/draw/draw_validate.c
index 86d5a5f814..4375ebabbc 100644
--- a/src/mesa/pipe/draw/draw_validate.c
+++ b/src/mesa/pipe/draw/draw_validate.c
@@ -78,6 +78,11 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
       precalc_flat = 1;		/* only needed for triangles really */
       need_det = 1;
    }
+
+   if (draw->rasterizer->flatshade && precalc_flat) {
+      draw->pipeline.flatshade->next = next;
+      next = draw->pipeline.flatshade;
+   }
 	 
    if (draw->rasterizer->offset_cw ||
        draw->rasterizer->offset_ccw) {
@@ -110,13 +115,8 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
    {
       draw->pipeline.clip->next = next;
       next = draw->pipeline.clip;
-      precalc_flat = 1;		/* XXX: FIX ME! Only needed for clipped prims */
    }
 
-   if (draw->rasterizer->flatshade && precalc_flat) {
-      draw->pipeline.flatshade->next = next;
-      next = draw->pipeline.flatshade;
-   }
    
    draw->pipeline.first = next;
    return next;
diff --git a/src/mesa/pipe/draw/draw_vbuf.c b/src/mesa/pipe/draw/draw_vbuf.c
index 1e260c6156..be96c8fdeb 100644
--- a/src/mesa/pipe/draw/draw_vbuf.c
+++ b/src/mesa/pipe/draw/draw_vbuf.c
@@ -34,13 +34,14 @@
  */
 
 
-#include <assert.h>
-
-#include "pipe/draw/draw_vbuf.h"
-#include "pipe/draw/draw_private.h"
-#include "pipe/draw/draw_vertex.h"
+#include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 
+#include "draw_vbuf.h"
+#include "draw_private.h"
+#include "draw_vertex.h"
+#include "draw_vf.h"
+
 
 /**
  * Vertex buffer emit stage.
@@ -55,6 +56,8 @@ struct vbuf_stage {
    /** Vertex size in bytes */
    unsigned vertex_size;
 
+   struct draw_vertex_fetch *vf;
+   
    /* FIXME: we have no guarantee that 'unsigned' is 32bit */
 
    /** Vertices in hardware format */
@@ -110,88 +113,175 @@ check_space( struct vbuf_stage *vbuf, unsigned nr )
 }
 
 
-/**
- * Extract the needed fields from post-transformed vertex and emit
- * a hardware(driver) vertex.
- * Recall that the vertices are constructed by the 'draw' module and
- * have a couple of slots at the beginning (1-dword header, 4-dword
- * clip pos) that we ignore here.  We only use the vertex->data[] fields.
- */
-static INLINE void 
-emit_vertex( struct vbuf_stage *vbuf,
-             struct vertex_header *vertex )
+#if 0
+static INLINE void
+dump_emitted_vertex(const struct vertex_info *vinfo, const uint8_t *data)
 {
-   const struct vertex_info *vinfo = vbuf->vinfo;
-
-   uint i;
-   uint count = 0;  /* for debug/sanity */
-   
    assert(vinfo == vbuf->render->get_vertex_info(vbuf->render));
-
-//   fprintf(stderr, "emit vertex %d to %p\n", 
-//           vbuf->nr_vertices, vbuf->vertex_ptr);
-
-   if(vertex->vertex_id != UNDEFINED_VERTEX_ID) {
-      if(vertex->vertex_id < vbuf->nr_vertices)
-	 return;
-      else
-	 fprintf(stderr, "Bad vertex id 0x%04x (>= 0x%04x)\n", 
-	         vertex->vertex_id, vbuf->nr_vertices);
-      return;
-   }
-      
-   vertex->vertex_id = vbuf->nr_vertices++;
+   unsigned i, j, k;
 
    for (i = 0; i < vinfo->num_attribs; i++) {
-      uint j = vinfo->src_index[i];
+      j = vinfo->src_index[i];
       switch (vinfo->emit[i]) {
       case EMIT_OMIT:
-         /* no-op */
+         debug_printf("EMIT_OMIT:");
          break;
       case EMIT_ALL:
-         /* just copy the whole vertex as-is to the vbuf */
          assert(i == 0);
-         memcpy(vbuf->vertex_ptr, vertex, vinfo->size * 4);
-         vbuf->vertex_ptr += vinfo->size;
-         return;
+         assert(j == 0);
+         debug_printf("EMIT_ALL:\t");
+         for(k = 0; k < vinfo->size*4; ++k)
+            debug_printf("%02x ", *data++);
+         break;
       case EMIT_1F:
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
-         count++;
+         debug_printf("EMIT_1F:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
          break;
       case EMIT_1F_PSIZE:
-         *vbuf->vertex_ptr++ = fui(vbuf->stage.draw->rasterizer->point_size);
-         count++;
+         debug_printf("EMIT_1F_PSIZE:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
          break;
       case EMIT_2F:
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
-         count += 2;
+         debug_printf("EMIT_2F:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
          break;
       case EMIT_3F:
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][2]);
-         count += 3;
+         debug_printf("EMIT_3F:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         data += sizeof(float);
          break;
       case EMIT_4F:
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][2]);
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][3]);
-         count += 4;
+         debug_printf("EMIT_4F:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
          break;
       case EMIT_4UB:
-	 *vbuf->vertex_ptr++ = pack_ub4(float_to_ubyte( vertex->data[j][2] ),
-                                        float_to_ubyte( vertex->data[j][1] ),
-                                        float_to_ubyte( vertex->data[j][0] ),
-                                        float_to_ubyte( vertex->data[j][3] ));
-         count += 1;
+         debug_printf("EMIT_4UB:\t");
+         debug_printf("%u ", *data++);
+         debug_printf("%u ", *data++);
+         debug_printf("%u ", *data++);
+         debug_printf("%u ", *data++);
          break;
       default:
          assert(0);
       }
+      debug_printf("\n");
+   }
+   debug_printf("\n");
+}
+#endif
+
+
+/**
+ * Extract the needed fields from post-transformed vertex and emit
+ * a hardware(driver) vertex.
+ * Recall that the vertices are constructed by the 'draw' module and
+ * have a couple of slots at the beginning (1-dword header, 4-dword
+ * clip pos) that we ignore here.  We only use the vertex->data[] fields.
+ */
+static INLINE void 
+emit_vertex( struct vbuf_stage *vbuf,
+             struct vertex_header *vertex )
+{
+#if 0
+   debug_printf("emit vertex %d to %p\n", 
+           vbuf->nr_vertices, vbuf->vertex_ptr);
+#endif
+
+   if(vertex->vertex_id != UNDEFINED_VERTEX_ID) {
+      if(vertex->vertex_id < vbuf->nr_vertices)
+	 return;
+      else
+	 debug_printf("Bad vertex id 0x%04x (>= 0x%04x)\n", 
+	         vertex->vertex_id, vbuf->nr_vertices);
+      return;
+   }
+      
+   vertex->vertex_id = vbuf->nr_vertices++;
+
+   if(!vbuf->vf) {
+      const struct vertex_info *vinfo = vbuf->vinfo;
+      uint i;
+      uint count = 0;  /* for debug/sanity */
+      
+      assert(vinfo == vbuf->render->get_vertex_info(vbuf->render));
+
+      for (i = 0; i < vinfo->num_attribs; i++) {
+         uint j = vinfo->src_index[i];
+         switch (vinfo->emit[i]) {
+         case EMIT_OMIT:
+            /* no-op */
+            break;
+         case EMIT_ALL:
+            /* just copy the whole vertex as-is to the vbuf */
+            assert(i == 0);
+            assert(j == 0);
+            memcpy(vbuf->vertex_ptr, vertex, vinfo->size * 4);
+            vbuf->vertex_ptr += vinfo->size;
+            count += vinfo->size;
+            break;
+         case EMIT_1F:
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+            count++;
+            break;
+         case EMIT_1F_PSIZE:
+            *vbuf->vertex_ptr++ = fui(vbuf->stage.draw->rasterizer->point_size);
+            count++;
+            break;
+         case EMIT_2F:
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
+            count += 2;
+            break;
+         case EMIT_3F:
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][2]);
+            count += 3;
+            break;
+         case EMIT_4F:
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][2]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][3]);
+            count += 4;
+            break;
+         case EMIT_4UB:
+            *vbuf->vertex_ptr++ = pack_ub4(float_to_ubyte( vertex->data[j][2] ),
+                                           float_to_ubyte( vertex->data[j][1] ),
+                                           float_to_ubyte( vertex->data[j][0] ),
+                                           float_to_ubyte( vertex->data[j][3] ));
+            count += 1;
+            break;
+         default:
+            assert(0);
+         }
+      }
+      assert(count == vinfo->size);
+#if 0
+      {
+	 static float data[256]; 
+	 draw_vf_emit_vertex(vbuf->vf, vertex, data);
+	 if(memcmp((uint8_t *)vbuf->vertex_ptr - vbuf->vertex_size, data, vbuf->vertex_size)) {
+            debug_printf("With VF:\n");
+            dump_emitted_vertex(vbuf->vinfo, (uint8_t *)data);
+	    debug_printf("Without VF:\n");
+	    dump_emitted_vertex(vbuf->vinfo, (uint8_t *)vbuf->vertex_ptr - vbuf->vertex_size);
+	    assert(0);
+	 }
+      }
+#endif
+   }
+   else {
+      draw_vf_emit_vertex(vbuf->vf, vertex, vbuf->vertex_ptr);
+   
+      vbuf->vertex_ptr += vbuf->vertex_size/4;
    }
-   assert(count == vinfo->size);
 }
 
 
@@ -269,6 +359,10 @@ vbuf_set_prim( struct vbuf_stage *vbuf, uint newprim )
 
    vbuf->vinfo = vinfo;
    vbuf->vertex_size = vertex_size;
+   if(vbuf->vf)
+      draw_vf_set_vertex_info(vbuf->vf, 
+                              vbuf->vinfo,
+                              vbuf->stage.draw->rasterizer->point_size);
    
    if (!vbuf->vertices)
       vbuf_alloc_vertices(vbuf);
@@ -423,7 +517,12 @@ static void vbuf_destroy( struct draw_stage *stage )
 {
    struct vbuf_stage *vbuf = vbuf_stage( stage );
 
-   align_free( vbuf->indices );
+   if(vbuf->indices)
+      align_free( vbuf->indices );
+   
+   if(vbuf->vf)
+      draw_vf_destroy( vbuf->vf );
+
    FREE( stage );
 }
 
@@ -436,6 +535,9 @@ struct draw_stage *draw_vbuf_stage( struct draw_context *draw,
 {
    struct vbuf_stage *vbuf = CALLOC_STRUCT(vbuf_stage);
 
+   if(!vbuf)
+      return NULL;
+   
    vbuf->stage.draw = draw;
    vbuf->stage.point = vbuf_first_point;
    vbuf->stage.line = vbuf_first_line;
@@ -450,11 +552,16 @@ struct draw_stage *draw_vbuf_stage( struct draw_context *draw,
    vbuf->max_indices = render->max_indices;
    vbuf->indices = (ushort *)
       align_malloc( vbuf->max_indices * sizeof(vbuf->indices[0]), 16 );
+   if(!vbuf->indices)
+      vbuf_destroy(&vbuf->stage);
    
    vbuf->vertices = NULL;
    vbuf->vertex_ptr = vbuf->vertices;
 
    vbuf->prim = ~0;
    
+   if(!GETENV("GALLIUM_NOVF"))
+      vbuf->vf = draw_vf_create();
+   
    return &vbuf->stage;
 }
diff --git a/src/mesa/pipe/draw/draw_vertex_fetch.c b/src/mesa/pipe/draw/draw_vertex_fetch.c
index fb64723a19..e13df04605 100644
--- a/src/mesa/pipe/draw/draw_vertex_fetch.c
+++ b/src/mesa/pipe/draw/draw_vertex_fetch.c
@@ -62,50 +62,244 @@ fetch_##NAME(const void *ptr, float *attrib)		\
    }							\
 }
 
+#define CVT_64_FLOAT   (float) ((double *) ptr)[i]
 #define CVT_32_FLOAT   ((float *) ptr)[i]
+
+#define CVT_8_USCALED  (float) ((unsigned char *) ptr)[i]
+#define CVT_16_USCALED (float) ((unsigned short *) ptr)[i]
+#define CVT_32_USCALED (float) ((unsigned int *) ptr)[i]
+
+#define CVT_8_SSCALED  (float) ((char *) ptr)[i]
+#define CVT_16_SSCALED (float) ((short *) ptr)[i]
 #define CVT_32_SSCALED (float) ((int *) ptr)[i]
+
 #define CVT_8_UNORM    (float) ((unsigned char *) ptr)[i] / 255.0f
+#define CVT_16_UNORM   (float) ((unsigned short *) ptr)[i] / 65535.0f
+#define CVT_32_UNORM   (float) ((unsigned int *) ptr)[i] / 4294967295.0f
+
+#define CVT_8_SNORM    (float) ((char *) ptr)[i] / 127.0f
+#define CVT_16_SNORM   (float) ((short *) ptr)[i] / 32767.0f
+#define CVT_32_SNORM   (float) ((int *) ptr)[i] / 2147483647.0f
+
+FETCH_ATTRIB( R64G64B64A64_FLOAT,   4, CVT_64_FLOAT )
+FETCH_ATTRIB( R64G64B64_FLOAT,      3, CVT_64_FLOAT )
+FETCH_ATTRIB( R64G64_FLOAT,         2, CVT_64_FLOAT )
+FETCH_ATTRIB( R64_FLOAT,            1, CVT_64_FLOAT )
 
 FETCH_ATTRIB( R32G32B32A32_FLOAT,   4, CVT_32_FLOAT )
 FETCH_ATTRIB( R32G32B32_FLOAT,      3, CVT_32_FLOAT )
 FETCH_ATTRIB( R32G32_FLOAT,         2, CVT_32_FLOAT )
 FETCH_ATTRIB( R32_FLOAT,            1, CVT_32_FLOAT )
+
+FETCH_ATTRIB( R32G32B32A32_USCALED, 4, CVT_32_USCALED )
+FETCH_ATTRIB( R32G32B32_USCALED,    3, CVT_32_USCALED )
+FETCH_ATTRIB( R32G32_USCALED,       2, CVT_32_USCALED )
+FETCH_ATTRIB( R32_USCALED,          1, CVT_32_USCALED )
+
 FETCH_ATTRIB( R32G32B32A32_SSCALED, 4, CVT_32_SSCALED )
 FETCH_ATTRIB( R32G32B32_SSCALED,    3, CVT_32_SSCALED )
 FETCH_ATTRIB( R32G32_SSCALED,       2, CVT_32_SSCALED )
 FETCH_ATTRIB( R32_SSCALED,          1, CVT_32_SSCALED )
+
+FETCH_ATTRIB( R32G32B32A32_UNORM, 4, CVT_32_UNORM )
+FETCH_ATTRIB( R32G32B32_UNORM,    3, CVT_32_UNORM )
+FETCH_ATTRIB( R32G32_UNORM,       2, CVT_32_UNORM )
+FETCH_ATTRIB( R32_UNORM,          1, CVT_32_UNORM )
+
+FETCH_ATTRIB( R32G32B32A32_SNORM, 4, CVT_32_SNORM )
+FETCH_ATTRIB( R32G32B32_SNORM,    3, CVT_32_SNORM )
+FETCH_ATTRIB( R32G32_SNORM,       2, CVT_32_SNORM )
+FETCH_ATTRIB( R32_SNORM,          1, CVT_32_SNORM )
+
+FETCH_ATTRIB( R16G16B16A16_USCALED, 4, CVT_16_USCALED )
+FETCH_ATTRIB( R16G16B16_USCALED,    3, CVT_16_USCALED )
+FETCH_ATTRIB( R16G16_USCALED,       2, CVT_16_USCALED )
+FETCH_ATTRIB( R16_USCALED,          1, CVT_16_USCALED )
+
+FETCH_ATTRIB( R16G16B16A16_SSCALED, 4, CVT_16_SSCALED )
+FETCH_ATTRIB( R16G16B16_SSCALED,    3, CVT_16_SSCALED )
+FETCH_ATTRIB( R16G16_SSCALED,       2, CVT_16_SSCALED )
+FETCH_ATTRIB( R16_SSCALED,          1, CVT_16_SSCALED )
+
+FETCH_ATTRIB( R16G16B16A16_UNORM, 4, CVT_16_UNORM )
+FETCH_ATTRIB( R16G16B16_UNORM,    3, CVT_16_UNORM )
+FETCH_ATTRIB( R16G16_UNORM,       2, CVT_16_UNORM )
+FETCH_ATTRIB( R16_UNORM,          1, CVT_16_UNORM )
+
+FETCH_ATTRIB( R16G16B16A16_SNORM, 4, CVT_16_SNORM )
+FETCH_ATTRIB( R16G16B16_SNORM,    3, CVT_16_SNORM )
+FETCH_ATTRIB( R16G16_SNORM,       2, CVT_16_SNORM )
+FETCH_ATTRIB( R16_SNORM,          1, CVT_16_SNORM )
+
+FETCH_ATTRIB( R8G8B8A8_USCALED,   4, CVT_8_USCALED )
+FETCH_ATTRIB( R8G8B8_USCALED,     3, CVT_8_USCALED )
+FETCH_ATTRIB( R8G8_USCALED,       2, CVT_8_USCALED )
+FETCH_ATTRIB( R8_USCALED,         1, CVT_8_USCALED )
+
+FETCH_ATTRIB( R8G8B8A8_SSCALED,  4, CVT_8_SSCALED )
+FETCH_ATTRIB( R8G8B8_SSCALED,    3, CVT_8_SSCALED )
+FETCH_ATTRIB( R8G8_SSCALED,      2, CVT_8_SSCALED )
+FETCH_ATTRIB( R8_SSCALED,        1, CVT_8_SSCALED )
+
+FETCH_ATTRIB( R8G8B8A8_UNORM,  4, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8B8_UNORM,    3, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8_UNORM,      2, CVT_8_UNORM )
+FETCH_ATTRIB( R8_UNORM,        1, CVT_8_UNORM )
+
+FETCH_ATTRIB( R8G8B8A8_SNORM,  4, CVT_8_SNORM )
+FETCH_ATTRIB( R8G8B8_SNORM,    3, CVT_8_SNORM )
+FETCH_ATTRIB( R8G8_SNORM,      2, CVT_8_SNORM )
+FETCH_ATTRIB( R8_SNORM,        1, CVT_8_SNORM )
+
 FETCH_ATTRIB( A8R8G8B8_UNORM,       4, CVT_8_UNORM )
-FETCH_ATTRIB( R8G8B8A8_UNORM,       4, CVT_8_UNORM )
+//FETCH_ATTRIB( R8G8B8A8_UNORM,       4, CVT_8_UNORM )
 
 
 
 static fetch_func get_fetch_func( enum pipe_format format )
 {
+#if 0
+   {
+      char tmp[80];
+      pf_sprint_name(tmp, format);
+      debug_printf("%s: %s\n", __FUNCTION__, tmp);
+   }
+#endif
+
    switch (format) {
-   case PIPE_FORMAT_R32G32B32A32_FLOAT:
-      return fetch_R32G32B32A32_FLOAT;
-   case PIPE_FORMAT_R32G32B32_FLOAT:
-      return fetch_R32G32B32_FLOAT;
-   case PIPE_FORMAT_R32G32_FLOAT:
-      return fetch_R32G32_FLOAT;
+   case PIPE_FORMAT_R64_FLOAT:
+      return fetch_R64_FLOAT;
+   case PIPE_FORMAT_R64G64_FLOAT:
+      return fetch_R64G64_FLOAT;
+   case PIPE_FORMAT_R64G64B64_FLOAT:
+      return fetch_R64G64B64_FLOAT;
+   case PIPE_FORMAT_R64G64B64A64_FLOAT:
+      return fetch_R64G64B64A64_FLOAT;
+
    case PIPE_FORMAT_R32_FLOAT:
       return fetch_R32_FLOAT;
-   case PIPE_FORMAT_R32G32B32A32_SSCALED:
-      return fetch_R32G32B32A32_SSCALED;
-   case PIPE_FORMAT_R32G32B32_SSCALED:
-      return fetch_R32G32B32_SSCALED;
-   case PIPE_FORMAT_R32G32_SSCALED:
-      return fetch_R32G32_SSCALED;
+   case PIPE_FORMAT_R32G32_FLOAT:
+      return fetch_R32G32_FLOAT;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      return fetch_R32G32B32_FLOAT;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      return fetch_R32G32B32A32_FLOAT;
+
+   case PIPE_FORMAT_R32_UNORM:
+      return fetch_R32_UNORM;
+   case PIPE_FORMAT_R32G32_UNORM:
+      return fetch_R32G32_UNORM;
+   case PIPE_FORMAT_R32G32B32_UNORM:
+      return fetch_R32G32B32_UNORM;
+   case PIPE_FORMAT_R32G32B32A32_UNORM:
+      return fetch_R32G32B32A32_UNORM;
+
+   case PIPE_FORMAT_R32_USCALED:
+      return fetch_R32_USCALED;
+   case PIPE_FORMAT_R32G32_USCALED:
+      return fetch_R32G32_USCALED;
+   case PIPE_FORMAT_R32G32B32_USCALED:
+      return fetch_R32G32B32_USCALED;
+   case PIPE_FORMAT_R32G32B32A32_USCALED:
+      return fetch_R32G32B32A32_USCALED;
+
+   case PIPE_FORMAT_R32_SNORM:
+      return fetch_R32_SNORM;
+   case PIPE_FORMAT_R32G32_SNORM:
+      return fetch_R32G32_SNORM;
+   case PIPE_FORMAT_R32G32B32_SNORM:
+      return fetch_R32G32B32_SNORM;
+   case PIPE_FORMAT_R32G32B32A32_SNORM:
+      return fetch_R32G32B32A32_SNORM;
+
    case PIPE_FORMAT_R32_SSCALED:
       return fetch_R32_SSCALED;
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
-      return fetch_A8R8G8B8_UNORM;
+   case PIPE_FORMAT_R32G32_SSCALED:
+      return fetch_R32G32_SSCALED;
+   case PIPE_FORMAT_R32G32B32_SSCALED:
+      return fetch_R32G32B32_SSCALED;
+   case PIPE_FORMAT_R32G32B32A32_SSCALED:
+      return fetch_R32G32B32A32_SSCALED;
+
+   case PIPE_FORMAT_R16_UNORM:
+      return fetch_R16_UNORM;
+   case PIPE_FORMAT_R16G16_UNORM:
+      return fetch_R16G16_UNORM;
+   case PIPE_FORMAT_R16G16B16_UNORM:
+      return fetch_R16G16B16_UNORM;
+   case PIPE_FORMAT_R16G16B16A16_UNORM:
+      return fetch_R16G16B16A16_UNORM;
+
+   case PIPE_FORMAT_R16_USCALED:
+      return fetch_R16_USCALED;
+   case PIPE_FORMAT_R16G16_USCALED:
+      return fetch_R16G16_USCALED;
+   case PIPE_FORMAT_R16G16B16_USCALED:
+      return fetch_R16G16B16_USCALED;
+   case PIPE_FORMAT_R16G16B16A16_USCALED:
+      return fetch_R16G16B16A16_USCALED;
+
+   case PIPE_FORMAT_R16_SNORM:
+      return fetch_R16_SNORM;
+   case PIPE_FORMAT_R16G16_SNORM:
+      return fetch_R16G16_SNORM;
+   case PIPE_FORMAT_R16G16B16_SNORM:
+      return fetch_R16G16B16_SNORM;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      return fetch_R16G16B16A16_SNORM;
+
+   case PIPE_FORMAT_R16_SSCALED:
+      return fetch_R16_SSCALED;
+   case PIPE_FORMAT_R16G16_SSCALED:
+      return fetch_R16G16_SSCALED;
+   case PIPE_FORMAT_R16G16B16_SSCALED:
+      return fetch_R16G16B16_SSCALED;
+   case PIPE_FORMAT_R16G16B16A16_SSCALED:
+      return fetch_R16G16B16A16_SSCALED;
+
+   case PIPE_FORMAT_R8_UNORM:
+      return fetch_R8_UNORM;
+   case PIPE_FORMAT_R8G8_UNORM:
+      return fetch_R8G8_UNORM;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      return fetch_R8G8B8_UNORM;
    case PIPE_FORMAT_R8G8B8A8_UNORM:
       return fetch_R8G8B8A8_UNORM;
+
+   case PIPE_FORMAT_R8_USCALED:
+      return fetch_R8_USCALED;
+   case PIPE_FORMAT_R8G8_USCALED:
+      return fetch_R8G8_USCALED;
+   case PIPE_FORMAT_R8G8B8_USCALED:
+      return fetch_R8G8B8_USCALED;
+   case PIPE_FORMAT_R8G8B8A8_USCALED:
+      return fetch_R8G8B8A8_USCALED;
+
+   case PIPE_FORMAT_R8_SNORM:
+      return fetch_R8_SNORM;
+   case PIPE_FORMAT_R8G8_SNORM:
+      return fetch_R8G8_SNORM;
+   case PIPE_FORMAT_R8G8B8_SNORM:
+      return fetch_R8G8B8_SNORM;
+   case PIPE_FORMAT_R8G8B8A8_SNORM:
+      return fetch_R8G8B8A8_SNORM;
+
+   case PIPE_FORMAT_R8_SSCALED:
+      return fetch_R8_SSCALED;
+   case PIPE_FORMAT_R8G8_SSCALED:
+      return fetch_R8G8_SSCALED;
+   case PIPE_FORMAT_R8G8B8_SSCALED:
+      return fetch_R8G8B8_SSCALED;
+   case PIPE_FORMAT_R8G8B8A8_SSCALED:
+      return fetch_R8G8B8A8_SSCALED;
+
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      return fetch_A8R8G8B8_UNORM;
+
    case 0:
-      return NULL;
+      return NULL;		/* not sure why this is needed */
+
    default:
-      /* Lots of missing cases! */
       assert(0);
       return NULL;
    }
@@ -126,47 +320,108 @@ transpose_4x4( float *out, const float *in )
 }
 
 
-			       
-void draw_update_vertex_fetch( struct draw_context *draw )
+
+static void fetch_xyz_rgb( struct draw_context *draw,
+			   struct tgsi_exec_machine *machine,
+			   const unsigned *elts,
+			   unsigned count )
 {
-   unsigned nr_attrs, i;
+   const unsigned *pitch   = draw->vertex_fetch.pitch;
+   const ubyte **src       = draw->vertex_fetch.src_ptr;
+   int i;
 
-   /* this may happend during context init */
-   if (!draw->vertex_shader)
-      return;
+   assert(count <= 4);
 
-   nr_attrs = draw->vertex_shader->state->num_inputs;
+//   debug_printf("%s\n", __FUNCTION__);
 
-   for (i = 0; i < nr_attrs; i++) {
-      unsigned buf = draw->vertex_element[i].vertex_buffer_index;
-      enum pipe_format format  = draw->vertex_element[i].src_format;
+   /* loop over vertex attributes (vertex shader inputs)
+    */
 
-      draw->vertex_fetch.src_ptr[i] = (const ubyte *) draw->user.vbuffer[buf] + 
-						       draw->vertex_buffer[buf].buffer_offset + 
-						       draw->vertex_element[i].src_offset;
+   for (i = 0; i < 4; i++) {
+      {
+	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
+	 float *out = &machine->Inputs[0].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
 
-      draw->vertex_fetch.pitch[i] = draw->vertex_buffer[buf].pitch;
-      draw->vertex_fetch.fetch[i] = get_fetch_func( format );
+      {
+	 const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
+	 float *out = &machine->Inputs[1].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
    }
+}
 
-   draw->vertex_fetch.nr_attrs = nr_attrs;
+
+
+
+static void fetch_xyz_rgb_st( struct draw_context *draw,
+			      struct tgsi_exec_machine *machine,
+			      const unsigned *elts,
+			      unsigned count )
+{
+   const unsigned *pitch   = draw->vertex_fetch.pitch;
+   const ubyte **src       = draw->vertex_fetch.src_ptr;
+   int i;
+
+   assert(count <= 4);
+
+   /* loop over vertex attributes (vertex shader inputs)
+    */
+
+   for (i = 0; i < 4; i++) {
+      {
+	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
+	 float *out = &machine->Inputs[0].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+
+      {
+	 const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
+	 float *out = &machine->Inputs[1].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+
+      {
+	 const float *in = (const float *)(src[2] + elts[i] * pitch[2]);
+	 float *out = &machine->Inputs[2].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = 0.0f;
+ 	 out[12] = 1.0f;
+      }
+   }
 }
 
 
+
+
 /**
  * Fetch vertex attributes for 'count' vertices.
  */
-void draw_vertex_fetch( struct draw_context *draw,
-			struct tgsi_exec_machine *machine,
-			const unsigned *elts,
-			unsigned count )
+static void generic_vertex_fetch( struct draw_context *draw,
+				  struct tgsi_exec_machine *machine,
+				  const unsigned *elts,
+				  unsigned count )
 {
    unsigned nr_attrs = draw->vertex_fetch.nr_attrs;
    unsigned attr;
 
    assert(count <= 4);
 
-//   _mesa_printf("%s %d\n", __FUNCTION__, count);
+//   debug_printf("%s %d\n", __FUNCTION__, count);
 
    /* loop over vertex attributes (vertex shader inputs)
     */
@@ -206,3 +461,50 @@ void draw_vertex_fetch( struct draw_context *draw,
    }
 }
 
+
+			       
+void draw_update_vertex_fetch( struct draw_context *draw )
+{
+   unsigned nr_attrs, i;
+
+//   debug_printf("%s\n", __FUNCTION__);
+   
+   /* this may happend during context init */
+   if (!draw->vertex_shader)
+      return;
+
+   nr_attrs = draw->vertex_shader->state->num_inputs;
+
+   for (i = 0; i < nr_attrs; i++) {
+      unsigned buf = draw->vertex_element[i].vertex_buffer_index;
+      enum pipe_format format  = draw->vertex_element[i].src_format;
+
+      draw->vertex_fetch.src_ptr[i] = (const ubyte *) draw->user.vbuffer[buf] + 
+						       draw->vertex_buffer[buf].buffer_offset + 
+						       draw->vertex_element[i].src_offset;
+
+      draw->vertex_fetch.pitch[i] = draw->vertex_buffer[buf].pitch;
+      draw->vertex_fetch.fetch[i] = get_fetch_func( format );
+   }
+
+   draw->vertex_fetch.nr_attrs = nr_attrs;
+
+   draw->vertex_fetch.fetch_func = generic_vertex_fetch;
+
+   switch (nr_attrs) {
+   case 2:
+      if (draw->vertex_element[0].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
+	  draw->vertex_element[1].src_format == PIPE_FORMAT_R32G32B32_FLOAT)
+	 draw->vertex_fetch.fetch_func = fetch_xyz_rgb;
+      break;
+   case 3:
+      if (draw->vertex_element[0].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
+	  draw->vertex_element[1].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
+	  draw->vertex_element[2].src_format == PIPE_FORMAT_R32G32_FLOAT)
+	 draw->vertex_fetch.fetch_func = fetch_xyz_rgb_st;
+      break;
+   default:
+      break;
+   }
+
+}
diff --git a/src/mesa/pipe/draw/draw_vertex_shader.c b/src/mesa/pipe/draw/draw_vertex_shader.c
index 3041974b9a..5ca93aa615 100644
--- a/src/mesa/pipe/draw/draw_vertex_shader.c
+++ b/src/mesa/pipe/draw/draw_vertex_shader.c
@@ -110,7 +110,7 @@ run_vertex_program(struct draw_context *draw,
    machine->Inputs = ALIGN16_ASSIGN(inputs);
    machine->Outputs = ALIGN16_ASSIGN(outputs);
 
-   draw_vertex_fetch( draw, machine, elts, count );
+   draw->vertex_fetch.fetch_func( draw, machine, elts, count );
 
    /* run shader */
 #if defined(__i386__) || defined(__386__)
@@ -121,11 +121,16 @@ run_vertex_program(struct draw_context *draw,
          = (struct draw_vertex_shader *)draw->vertex_shader;
       codegen_function func
          = (codegen_function) x86_get_func( &shader->sse2_program );
-      func(
-         machine->Inputs,
-         machine->Outputs,
-         machine->Consts,
-         machine->Temps );
+
+      if (func)
+         func(
+            machine->Inputs,
+            machine->Outputs,
+            machine->Consts,
+            machine->Temps );
+      else
+         /* interpreter */
+         tgsi_exec_machine_run( machine );
    }
    else
 #endif
@@ -166,7 +171,7 @@ run_vertex_program(struct draw_context *draw,
       vOut[j]->data[0][3] = w;
 
 #if DBG_VS
-      printf("output[%d]win: %f %f %f %f\n", j,
+      debug_printf("output[%d]win: %f %f %f %f\n", j,
              vOut[j]->data[0][0],
              vOut[j]->data[0][1],
              vOut[j]->data[0][2],
@@ -181,7 +186,7 @@ run_vertex_program(struct draw_context *draw,
          vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
          vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
 #if DBG_VS
-         printf("output[%d][%d]: %f %f %f %f\n", j, slot,
+         debug_printf("output[%d][%d]: %f %f %f %f\n", j, slot,
                 vOut[j]->data[slot][0],
                 vOut[j]->data[slot][1],
                 vOut[j]->data[slot][2],
@@ -199,13 +204,15 @@ run_vertex_program(struct draw_context *draw,
 void
 draw_vertex_shader_queue_flush(struct draw_context *draw)
 {
-   unsigned i, j;
+   unsigned i;
+
+   assert(draw->vs.queue_nr != 0);
 
    /* XXX: do this on statechange: 
     */
    draw_update_vertex_fetch( draw );
 
-//   fprintf(stderr, " q(%d) ", draw->vs.queue_nr );
+//   debug_printf( " q(%d) ", draw->vs.queue_nr );
 #ifdef MESA_LLVM
    if (draw->vertex_shader->llvm_prog) {
       draw_vertex_shader_queue_flush_llvm(draw);
@@ -217,14 +224,18 @@ draw_vertex_shader_queue_flush(struct draw_context *draw)
    for (i = 0; i < draw->vs.queue_nr; i += 4) {
       struct vertex_header *dests[4];
       unsigned elts[4];
-      int n;
+      int j, n = MIN2(4, draw->vs.queue_nr - i);
 
-      for (j = 0; j < 4; j++) {
+      for (j = 0; j < n; j++) {
          elts[j] = draw->vs.queue[i + j].elt;
          dests[j] = draw->vs.queue[i + j].dest;
       }
 
-      n = MIN2(4, draw->vs.queue_nr - i);
+      for ( ; j < 4; j++) {
+	 elts[j] = elts[0];
+	 dests[j] = dests[0];
+      }
+
       assert(n > 0);
       assert(n <= 4);
 
@@ -263,7 +274,12 @@ draw_create_vertex_shader(struct draw_context *draw,
       struct pipe_shader_state *sh = (struct pipe_shader_state *) shader;
 
       x86_init_func( &vs->sse2_program );
-      tgsi_emit_sse2( (struct tgsi_token *) sh->tokens, &vs->sse2_program );
+      if (!tgsi_emit_sse2( (struct tgsi_token *) sh->tokens,
+                           &vs->sse2_program )) {
+         x86_release_func( (struct x86_function *) &vs->sse2_program );
+	 fprintf(stdout /*err*/,
+		 "tgsi_emit_sse2() failed, falling back to interpreter\n");
+      }
    }
 #endif
 
diff --git a/src/mesa/pipe/draw/draw_vertex_shader_llvm.c b/src/mesa/pipe/draw/draw_vertex_shader_llvm.c
index 4228c4f388..63551c993e 100644
--- a/src/mesa/pipe/draw/draw_vertex_shader_llvm.c
+++ b/src/mesa/pipe/draw/draw_vertex_shader_llvm.c
@@ -152,7 +152,7 @@ void draw_vertex_shader_queue_flush_llvm(struct draw_context *draw)
       z = vOut->clip[2] = dests[0][2];
       w = vOut->clip[3] = dests[0][3];
 #if DBG
-      printf("output %d: %f %f %f %f\n", 0, x, y, z, w);
+      debug_printf("output %d: %f %f %f %f\n", 0, x, y, z, w);
 #endif
 
       vOut->clipmask = compute_clipmask(vOut->clip, draw->plane, draw->nr_planes);
@@ -179,7 +179,7 @@ void draw_vertex_shader_queue_flush_llvm(struct draw_context *draw)
          vOut->data[slot][3] = dests[slot][3];
 
 #if DBG
-         printf("output %d: %f %f %f %f\n", slot,
+         debug_printf("output %d: %f %f %f %f\n", slot,
                 vOut->data[slot][0],
                 vOut->data[slot][1],
                 vOut->data[slot][2],
diff --git a/src/mesa/pipe/draw/draw_vf.c b/src/mesa/pipe/draw/draw_vf.c
new file mode 100644
index 0000000000..f23d7fcec5
--- /dev/null
+++ b/src/mesa/pipe/draw/draw_vf.c
@@ -0,0 +1,428 @@
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+
+#include <stddef.h>
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_util.h"
+
+#include "draw_vf.h"
+
+
+#define DRAW_VF_DBG 0
+
+
+/* TODO: remove this */
+extern void 
+_mesa_exec_free( void *addr );
+
+
+static boolean match_fastpath( struct draw_vertex_fetch *vf,
+				 const struct draw_vf_fastpath *fp)
+{
+   unsigned j;
+
+   if (vf->attr_count != fp->attr_count) 
+      return FALSE;
+
+   for (j = 0; j < vf->attr_count; j++) 
+      if (vf->attr[j].format != fp->attr[j].format ||
+	  vf->attr[j].inputsize != fp->attr[j].size ||
+	  vf->attr[j].vertoffset != fp->attr[j].offset) 
+	 return FALSE;
+      
+   if (fp->match_strides) {
+      if (vf->vertex_stride != fp->vertex_stride)
+	 return FALSE;
+
+      for (j = 0; j < vf->attr_count; j++) 
+	 if (vf->attr[j].inputstride != fp->attr[j].stride) 
+	    return FALSE;
+   }
+   
+   return TRUE;
+}
+
+static boolean search_fastpath_emit( struct draw_vertex_fetch *vf )
+{
+   struct draw_vf_fastpath *fp = vf->fastpath;
+
+   for ( ; fp ; fp = fp->next) {
+      if (match_fastpath(vf, fp)) {
+         vf->emit = fp->func;
+	 return TRUE;
+      }
+   }
+
+   return FALSE;
+}
+
+void draw_vf_register_fastpath( struct draw_vertex_fetch *vf,
+			     boolean match_strides )
+{
+   struct draw_vf_fastpath *fastpath = CALLOC_STRUCT(draw_vf_fastpath);
+   unsigned i;
+
+   fastpath->vertex_stride = vf->vertex_stride;
+   fastpath->attr_count = vf->attr_count;
+   fastpath->match_strides = match_strides;
+   fastpath->func = vf->emit;
+   fastpath->attr = (struct draw_vf_attr_type *)
+      MALLOC(vf->attr_count * sizeof(fastpath->attr[0]));
+
+   for (i = 0; i < vf->attr_count; i++) {
+      fastpath->attr[i].format = vf->attr[i].format;
+      fastpath->attr[i].stride = vf->attr[i].inputstride;
+      fastpath->attr[i].size = vf->attr[i].inputsize;
+      fastpath->attr[i].offset = vf->attr[i].vertoffset;
+   }
+
+   fastpath->next = vf->fastpath;
+   vf->fastpath = fastpath;
+}
+
+
+
+
+/***********************************************************************
+ * Build codegen functions or return generic ones:
+ */
+static void choose_emit_func( struct draw_vertex_fetch *vf, 
+			      unsigned count, 
+			      uint8_t *dest)
+{
+   vf->emit = NULL;
+   
+   /* Does this match an existing (hardwired, codegen or known-bad)
+    * fastpath?
+    */
+   if (search_fastpath_emit(vf)) {
+      /* Use this result.  If it is null, then it is already known
+       * that the current state will fail for codegen and there is no
+       * point trying again.
+       */
+   }
+   else if (vf->codegen_emit) {
+      vf->codegen_emit( vf );
+   }
+
+   if (!vf->emit) {
+      draw_vf_generate_hardwired_emit(vf);
+   }
+
+   /* Otherwise use the generic version:
+    */
+   if (!vf->emit)
+      vf->emit = draw_vf_generic_emit;
+
+   vf->emit( vf, count, dest );
+}
+
+
+
+
+
+/***********************************************************************
+ * Public entrypoints, mostly dispatch to the above:
+ */
+
+
+
+static unsigned 
+draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf, 
+                               const struct draw_vf_attr_map *map,
+                               unsigned nr, 
+                               unsigned vertex_stride )
+{
+   unsigned offset = 0;
+   unsigned i, j;
+
+   assert(nr < PIPE_ATTRIB_MAX);
+
+   for (j = 0, i = 0; i < nr; i++) {
+      const unsigned format = map[i].format;
+      if (format == DRAW_EMIT_PAD) {
+#if (DRAW_VF_DBG)
+	    debug_printf("%d: pad %d, offset %d\n", i,  
+			 map[i].offset, offset);  
+#endif
+
+	 offset += map[i].offset;
+
+      }
+      else {
+	 vf->attr[j].attrib = map[i].attrib;
+	 vf->attr[j].format = format;
+	 vf->attr[j].insert = draw_vf_format_info[format].insert;
+	 vf->attr[j].vertattrsize = draw_vf_format_info[format].attrsize;
+	 vf->attr[j].vertoffset = offset;
+	 vf->attr[j].isconst = draw_vf_format_info[format].isconst;
+	 if(vf->attr[j].isconst)
+	    memcpy(vf->attr[j].data, &map[i].data, vf->attr[j].vertattrsize);
+	 
+#if (DRAW_VF_DBG)
+	    debug_printf("%d: %s, offset %d\n", i,  
+			 draw_vf_format_info[format].name,
+			 vf->attr[j].vertoffset);   
+#endif
+
+	 offset += draw_vf_format_info[format].attrsize;
+	 j++;
+      }
+   }
+
+   vf->attr_count = j;
+   vf->vertex_stride = vertex_stride ? vertex_stride : offset;
+   vf->emit = choose_emit_func;
+
+   assert(vf->vertex_stride >= offset);
+   return vf->vertex_stride;
+}
+
+
+void draw_vf_set_vertex_info( struct draw_vertex_fetch *vf, 
+                              const struct vertex_info *vinfo,
+                              float point_size )
+{
+   unsigned i, j, k;
+   struct draw_vf_attr *a = vf->attr;
+   struct draw_vf_attr_map attrs[PIPE_MAX_SHADER_INPUTS];
+   unsigned count = 0;  /* for debug/sanity */
+   unsigned nr_attrs = 0;
+   
+   for (i = 0; i < vinfo->num_attribs; i++) {
+      j = vinfo->src_index[i];
+      switch (vinfo->emit[i]) {
+      case EMIT_OMIT:
+         /* no-op */
+         break;
+      case EMIT_ALL: {
+         /* just copy the whole vertex as-is to the vbuf */
+	 unsigned s = vinfo->size;
+         assert(i == 0);
+         assert(j == 0);
+         /* copy the vertex header */
+         /* XXX: we actually don't copy the header, just pad it */
+	 attrs[nr_attrs].attrib = 0;
+	 attrs[nr_attrs].format = DRAW_EMIT_PAD;
+	 attrs[nr_attrs].offset = offsetof(struct vertex_header, data);
+	 s -= offsetof(struct vertex_header, data)/4;
+         count += offsetof(struct vertex_header, data)/4;
+	 nr_attrs++;
+	 /* copy the vertex data */
+         for(k = 0; k < (s & ~0x3); k += 4) {
+      	    attrs[nr_attrs].attrib = k/4;
+      	    attrs[nr_attrs].format = DRAW_EMIT_4F;
+      	    attrs[nr_attrs].offset = 0;
+      	    nr_attrs++;
+            count += 4;
+         }
+         /* tail */
+         /* XXX: actually, this shouldn't be needed */
+ 	 attrs[nr_attrs].attrib = k/4;
+  	 attrs[nr_attrs].offset = 0;
+         switch(s & 0x3) {
+         case 0:
+            break;
+         case 1:
+      	    attrs[nr_attrs].format = DRAW_EMIT_1F;
+      	    nr_attrs++;
+            count += 1;
+            break;
+         case 2:
+      	    attrs[nr_attrs].format = DRAW_EMIT_2F;
+      	    nr_attrs++;
+            count += 2;
+            break;
+         case 3:
+      	    attrs[nr_attrs].format = DRAW_EMIT_3F;
+      	    nr_attrs++;
+            count += 3;
+            break;
+         }
+         break;
+      }
+      case EMIT_1F:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_1F;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count++;
+         break;
+      case EMIT_1F_PSIZE:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_1F_CONST;
+	 attrs[nr_attrs].offset = 0;
+	 attrs[nr_attrs].data.f[0] = point_size;
+	 nr_attrs++;
+         count++;
+         break;
+      case EMIT_2F:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_2F;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count += 2;
+         break;
+      case EMIT_3F:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_3F;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count += 3;
+         break;
+      case EMIT_4F:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_4F;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count += 4;
+         break;
+      case EMIT_4UB:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_4UB_4F_BGRA;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count += 1;
+         break;
+      default:
+         assert(0);
+      }
+   }
+   
+   assert(count == vinfo->size);  
+   
+   draw_vf_set_vertex_attributes(vf, 
+                                 attrs, 
+                                 nr_attrs, 
+                                 vinfo->size * sizeof(float) );
+
+   for (j = 0; j < vf->attr_count; j++) {
+      a[j].inputsize = 4;
+      a[j].do_insert = a[j].insert[4 - 1];
+      if(a[j].isconst) {
+	 a[j].inputptr = a[j].data;
+	 a[j].inputstride = 0;
+      }
+   }
+}
+
+
+#if 0
+/* Set attribute pointers, adjusted for start position:
+ */
+void draw_vf_set_sources( struct draw_vertex_fetch *vf,
+		     GLvector4f * const sources[],
+		     unsigned start )
+{
+   struct draw_vf_attr *a = vf->attr;
+   unsigned j;
+   
+   for (j = 0; j < vf->attr_count; j++) {
+      const GLvector4f *vptr = sources[a[j].attrib];
+      
+      if ((a[j].inputstride != vptr->stride) ||
+	  (a[j].inputsize != vptr->size))
+	 vf->emit = choose_emit_func;
+      
+      a[j].inputstride = vptr->stride;
+      a[j].inputsize = vptr->size;
+      a[j].do_insert = a[j].insert[vptr->size - 1]; 
+      a[j].inputptr = ((uint8_t *)vptr->data) + start * vptr->stride;
+   }
+}
+#endif
+
+
+/**
+ * Emit a vertex to dest.  
+ */
+void draw_vf_emit_vertex( struct draw_vertex_fetch *vf,
+                          struct vertex_header *vertex,
+                          void *dest )
+{
+   struct draw_vf_attr *a = vf->attr;
+   unsigned j;
+   
+   for (j = 0; j < vf->attr_count; j++) {
+      if(!a[j].isconst) {
+	 a[j].inputptr = (uint8_t *)&vertex->data[a[j].attrib][0];
+	 a[j].inputstride = 0; /* XXX: one-vertex-max ATM */
+      }
+   }
+   
+   vf->emit( vf, 1, (uint8_t*) dest );
+}
+
+
+
+struct draw_vertex_fetch *draw_vf_create( void )
+{
+   struct draw_vertex_fetch *vf = CALLOC_STRUCT(draw_vertex_fetch);
+   unsigned i;
+
+   for (i = 0; i < PIPE_ATTRIB_MAX; i++)
+      vf->attr[i].vf = vf;
+
+   vf->identity[0] = 0.0;
+   vf->identity[1] = 0.0;
+   vf->identity[2] = 0.0;
+   vf->identity[3] = 1.0;
+
+   vf->codegen_emit = NULL;
+
+#ifdef USE_SSE_ASM
+   if (!GETENV("GALLIUM_NO_CODEGEN"))
+      vf->codegen_emit = draw_vf_generate_sse_emit;
+#endif
+
+   return vf;
+}
+
+
+void draw_vf_destroy( struct draw_vertex_fetch *vf )
+{
+   struct draw_vf_fastpath *fp, *tmp;
+
+   for (fp = vf->fastpath ; fp ; fp = tmp) {
+      tmp = fp->next;
+      FREE(fp->attr);
+
+      /* KW: At the moment, fp->func is constrained to be allocated by
+       * _mesa_exec_alloc(), as the hardwired fastpaths in
+       * t_vertex_generic.c are handled specially.  It would be nice
+       * to unify them, but this probably won't change until this
+       * module gets another overhaul.
+       */
+      //_mesa_exec_free((void *) fp->func);
+      FREE(fp);
+   }
+   
+   vf->fastpath = NULL;
+   FREE(vf);
+}
diff --git a/src/mesa/pipe/draw/draw_vf.h b/src/mesa/pipe/draw/draw_vf.h
new file mode 100644
index 0000000000..e694b98675
--- /dev/null
+++ b/src/mesa/pipe/draw/draw_vf.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+#ifndef DRAW_VF_H
+#define DRAW_VF_H
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+
+#include "draw_vertex.h"
+#include "draw_private.h" // for vertex_header
+
+
+enum draw_vf_attr_format {
+   DRAW_EMIT_1F,
+   DRAW_EMIT_2F,
+   DRAW_EMIT_3F,
+   DRAW_EMIT_4F,
+   DRAW_EMIT_3F_XYW,			/**< for projective texture */
+   DRAW_EMIT_1UB_1F,			/**< for fog coordinate */
+   DRAW_EMIT_3UB_3F_RGB,		/**< for specular color */
+   DRAW_EMIT_3UB_3F_BGR,		/**< for specular color */
+   DRAW_EMIT_4UB_4F_RGBA,		/**< for color */
+   DRAW_EMIT_4UB_4F_BGRA,		/**< for color */
+   DRAW_EMIT_4UB_4F_ARGB,		/**< for color */
+   DRAW_EMIT_4UB_4F_ABGR,		/**< for color */
+   DRAW_EMIT_1F_CONST,
+   DRAW_EMIT_2F_CONST,
+   DRAW_EMIT_3F_CONST,
+   DRAW_EMIT_4F_CONST,
+   DRAW_EMIT_PAD,			/**< leave a hole of 'offset' bytes */
+   DRAW_EMIT_MAX
+};
+
+struct draw_vf_attr_map 
+{
+   /** Input attribute number */
+   unsigned attrib;
+   
+   enum draw_vf_attr_format format;
+   
+   unsigned offset;
+   
+   /** 
+    * Constant data for DRAW_EMIT_*_CONST 
+    */
+   union {
+      uint8_t ub[4];
+      float f[4];
+   } data;
+};
+
+struct draw_vertex_fetch;
+
+
+
+#if 0
+unsigned 
+draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
+                               const struct draw_vf_attr_map *map,
+                               unsigned nr, 
+                               unsigned vertex_stride );
+#endif
+
+void draw_vf_set_vertex_info( struct draw_vertex_fetch *vf, 
+                              const struct vertex_info *vinfo,
+                              float point_size );
+
+#if 0
+void 
+draw_vf_set_sources( struct draw_vertex_fetch *vf,
+		     GLvector4f * const attrib[],
+		     unsigned start );
+#endif
+
+void 
+draw_vf_emit_vertex( struct draw_vertex_fetch *vf,
+                     struct vertex_header *vertex,
+                     void *dest );
+
+struct draw_vertex_fetch *
+draw_vf_create( void );
+
+void 
+draw_vf_destroy( struct draw_vertex_fetch *vf );
+
+
+
+/***********************************************************************
+ * Internal functions and structs:
+ */
+
+struct draw_vf_attr;
+
+typedef void (*draw_vf_extract_func)( const struct draw_vf_attr *a, 
+				      float *out, 
+				      const uint8_t *v );
+
+typedef void (*draw_vf_insert_func)( const struct draw_vf_attr *a, 
+				     uint8_t *v, 
+				     const float *in );
+
+typedef void (*draw_vf_emit_func)( struct draw_vertex_fetch *vf,
+      				   unsigned count, 
+      				   uint8_t *dest );
+
+
+
+/**
+ * Describes how to convert/move a vertex attribute from a vertex
+ * array to a vertex structure.
+ */
+struct draw_vf_attr
+{
+   struct draw_vertex_fetch *vf;
+
+   unsigned format;
+   unsigned inputsize;
+   unsigned inputstride;
+   unsigned vertoffset;      /**< position of the attrib in the vertex struct */
+   
+   boolean isconst;              /**< read from const data below */
+   uint8_t data[16];
+
+   unsigned attrib;          /**< which vertex attrib (0=position, etc) */
+   unsigned vertattrsize;    /**< size of the attribute in bytes */
+
+   uint8_t *inputptr;
+   const draw_vf_insert_func *insert;
+   draw_vf_insert_func do_insert;
+   draw_vf_extract_func extract;
+};
+
+struct draw_vertex_fetch
+{
+   struct draw_vf_attr attr[PIPE_ATTRIB_MAX];
+   unsigned attr_count;
+   unsigned vertex_stride;
+
+   draw_vf_emit_func emit;
+
+   /* Parameters and constants for codegen:
+    */
+   float identity[4];
+
+   struct draw_vf_fastpath *fastpath;
+   
+   void (*codegen_emit)( struct draw_vertex_fetch *vf );
+};
+
+
+struct draw_vf_attr_type {
+   unsigned format;
+   unsigned size;
+   unsigned stride;
+   unsigned offset;
+};
+
+struct draw_vf_fastpath {
+   unsigned vertex_stride;
+   unsigned attr_count;
+   boolean match_strides;
+
+   struct draw_vf_attr_type *attr;
+
+   draw_vf_emit_func func;
+   struct draw_vf_fastpath *next;
+};
+
+
+void 
+draw_vf_register_fastpath( struct draw_vertex_fetch *vtx,
+                           boolean match_strides );
+
+void 
+draw_vf_generic_emit( struct draw_vertex_fetch *vf,
+                      unsigned count,
+                      uint8_t *v );
+
+void 
+draw_vf_generate_hardwired_emit( struct draw_vertex_fetch *vf );
+
+void 
+draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf );
+
+
+struct draw_vf_format_info {
+   const char *name;
+   draw_vf_insert_func insert[4];
+   const unsigned attrsize;
+   const boolean isconst;
+};
+
+extern const struct draw_vf_format_info 
+draw_vf_format_info[DRAW_EMIT_MAX];
+
+
+#endif
diff --git a/src/mesa/pipe/draw/draw_vf_generic.c b/src/mesa/pipe/draw/draw_vf_generic.c
new file mode 100644
index 0000000000..7a60a9db9c
--- /dev/null
+++ b/src/mesa/pipe/draw/draw_vf_generic.c
@@ -0,0 +1,585 @@
+
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_debug.h"
+#include "pipe/p_util.h"
+
+#include "draw_vf.h"
+
+
+
+static INLINE void insert_4f_4( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = in[2];
+   out[3] = in[3];
+}
+
+static INLINE void insert_4f_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = in[2];
+   out[3] = 1;
+}
+
+static INLINE void insert_4f_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = 0;
+   out[3] = 1;
+}
+
+static INLINE void insert_4f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = 0;
+   out[2] = 0;
+   out[3] = 1;
+}
+
+static INLINE void insert_3f_xyw_4( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = in[3];
+}
+
+static INLINE void insert_3f_xyw_err( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a; (void) v; (void) in;
+   assert(0);
+}
+
+static INLINE void insert_3f_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = in[2];
+}
+
+static INLINE void insert_3f_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = 0;
+}
+
+static INLINE void insert_3f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = 0;
+   out[2] = 0;
+}
+
+
+static INLINE void insert_2f_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+}
+
+static INLINE void insert_2f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = 0;
+}
+
+static INLINE void insert_1f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+
+   out[0] = in[0];
+}
+
+static INLINE void insert_null( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a; (void) v; (void) in;
+}
+
+static INLINE void insert_4ub_4f_rgba_4( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[3]);
+}
+
+static INLINE void insert_4ub_4f_rgba_3( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[2]);
+   v[3] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_rgba_2( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[2] = 0;
+   v[3] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_rgba_1( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   v[1] = 0;
+   v[2] = 0;
+   v[3] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_bgra_4( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[3]);
+}
+
+static INLINE void insert_4ub_4f_bgra_3( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);
+   v[3] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_bgra_2( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[0] = 0;
+   v[3] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_bgra_1( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   v[1] = 0;
+   v[0] = 0;
+   v[3] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_argb_4( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[3]);
+}
+
+static INLINE void insert_4ub_4f_argb_3( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[2]);
+   v[0] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_argb_2( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   v[3] = 0x00;
+   v[0] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_argb_1( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   v[2] = 0x00;
+   v[3] = 0x00;
+   v[0] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_abgr_4( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[3]);
+}
+
+static INLINE void insert_4ub_4f_abgr_3( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[2]);
+   v[0] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_abgr_2( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   v[1] = 0x00;
+   v[0] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_abgr_1( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   v[2] = 0x00;
+   v[1] = 0x00;
+   v[0] = 0xff;
+}
+
+static INLINE void insert_3ub_3f_rgb_3( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[2]);
+}
+
+static INLINE void insert_3ub_3f_rgb_2( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[2] = 0;
+}
+
+static INLINE void insert_3ub_3f_rgb_1( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   v[1] = 0;
+   v[2] = 0;
+}
+
+static INLINE void insert_3ub_3f_bgr_3( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);
+}
+
+static INLINE void insert_3ub_3f_bgr_2( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[0] = 0;
+}
+
+static INLINE void insert_3ub_3f_bgr_1( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   v[1] = 0;
+   v[0] = 0;
+}
+
+
+static INLINE void insert_1ub_1f_1( const struct draw_vf_attr *a, uint8_t *v, 
+				    const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+}
+
+
+const struct draw_vf_format_info draw_vf_format_info[DRAW_EMIT_MAX] = 
+{
+   { "1f",
+     { insert_1f_1, insert_1f_1, insert_1f_1, insert_1f_1 },
+     sizeof(float), FALSE },
+
+   { "2f",
+     { insert_2f_1, insert_2f_2, insert_2f_2, insert_2f_2 },
+     2 * sizeof(float), FALSE },
+
+   { "3f",
+     { insert_3f_1, insert_3f_2, insert_3f_3, insert_3f_3 },
+     3 * sizeof(float), FALSE },
+
+   { "4f",
+     { insert_4f_1, insert_4f_2, insert_4f_3, insert_4f_4 },
+     4 * sizeof(float), FALSE },
+
+   { "3f_xyw",
+     { insert_3f_xyw_err, insert_3f_xyw_err, insert_3f_xyw_err, 
+       insert_3f_xyw_4 },
+     3 * sizeof(float), FALSE },
+
+   { "1ub_1f",
+     { insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1 },
+     sizeof(uint8_t), FALSE },
+
+   { "3ub_3f_rgb",
+     { insert_3ub_3f_rgb_1, insert_3ub_3f_rgb_2, insert_3ub_3f_rgb_3,
+       insert_3ub_3f_rgb_3 },
+     3 * sizeof(uint8_t), FALSE },
+
+   { "3ub_3f_bgr",
+     { insert_3ub_3f_bgr_1, insert_3ub_3f_bgr_2, insert_3ub_3f_bgr_3,
+       insert_3ub_3f_bgr_3 },
+     3 * sizeof(uint8_t), FALSE },
+
+   { "4ub_4f_rgba",
+     { insert_4ub_4f_rgba_1, insert_4ub_4f_rgba_2, insert_4ub_4f_rgba_3, 
+       insert_4ub_4f_rgba_4 },
+     4 * sizeof(uint8_t), FALSE },
+
+   { "4ub_4f_bgra",
+     { insert_4ub_4f_bgra_1, insert_4ub_4f_bgra_2, insert_4ub_4f_bgra_3,
+       insert_4ub_4f_bgra_4 },
+     4 * sizeof(uint8_t), FALSE },
+
+   { "4ub_4f_argb",
+     { insert_4ub_4f_argb_1, insert_4ub_4f_argb_2, insert_4ub_4f_argb_3,
+       insert_4ub_4f_argb_4 },
+     4 * sizeof(uint8_t), FALSE },
+
+   { "4ub_4f_abgr",
+     { insert_4ub_4f_abgr_1, insert_4ub_4f_abgr_2, insert_4ub_4f_abgr_3,
+       insert_4ub_4f_abgr_4 },
+     4 * sizeof(uint8_t), FALSE },
+
+   { "1f_const",
+     { insert_1f_1, insert_1f_1, insert_1f_1, insert_1f_1 },
+     sizeof(float), TRUE },
+   
+   { "2f_const",
+     { insert_2f_1, insert_2f_2, insert_2f_2, insert_2f_2 },
+     2 * sizeof(float), TRUE },
+   
+   { "3f_const",
+     { insert_3f_1, insert_3f_2, insert_3f_3, insert_3f_3 },
+     3 * sizeof(float), TRUE },
+   
+   { "4f_const",
+     { insert_4f_1, insert_4f_2, insert_4f_3, insert_4f_4 },
+     4 * sizeof(float), TRUE },
+
+   { "pad",
+     { NULL, NULL, NULL, NULL },
+     0, FALSE },
+
+};
+
+
+
+    
+/***********************************************************************
+ * Hardwired fastpaths for emitting whole vertices or groups of
+ * vertices
+ */
+#define EMIT5(NR, F0, F1, F2, F3, F4, NAME)				\
+static void NAME( struct draw_vertex_fetch *vf,				\
+		  unsigned count,						\
+		  uint8_t *v )						\
+{									\
+   struct draw_vf_attr *a = vf->attr;				\
+   unsigned i;								\
+									\
+   for (i = 0 ; i < count ; i++, v += vf->vertex_stride) {		\
+      if (NR > 0) {							\
+	 F0( &a[0], v + a[0].vertoffset, (float *)a[0].inputptr );	\
+	 a[0].inputptr += a[0].inputstride;				\
+      }									\
+      									\
+      if (NR > 1) {							\
+	 F1( &a[1], v + a[1].vertoffset, (float *)a[1].inputptr );	\
+	 a[1].inputptr += a[1].inputstride;				\
+      }									\
+      									\
+      if (NR > 2) {							\
+	 F2( &a[2], v + a[2].vertoffset, (float *)a[2].inputptr );	\
+	 a[2].inputptr += a[2].inputstride;				\
+      }									\
+      									\
+      if (NR > 3) {							\
+	 F3( &a[3], v + a[3].vertoffset, (float *)a[3].inputptr );	\
+	 a[3].inputptr += a[3].inputstride;				\
+      }									\
+									\
+      if (NR > 4) {							\
+	 F4( &a[4], v + a[4].vertoffset, (float *)a[4].inputptr );	\
+	 a[4].inputptr += a[4].inputstride;				\
+      }									\
+   }									\
+}
+
+   
+#define EMIT2(F0, F1, NAME) EMIT5(2, F0, F1, insert_null, \
+				  insert_null, insert_null, NAME)
+
+#define EMIT3(F0, F1, F2, NAME) EMIT5(3, F0, F1, F2, insert_null, \
+				      insert_null, NAME)
+   
+#define EMIT4(F0, F1, F2, F3, NAME) EMIT5(4, F0, F1, F2, F3, \
+				          insert_null, NAME)
+   
+
+EMIT2(insert_3f_3, insert_4ub_4f_rgba_4, emit_xyz3_rgba4)
+
+EMIT3(insert_4f_4, insert_4ub_4f_rgba_4, insert_2f_2, emit_xyzw4_rgba4_st2)
+
+EMIT4(insert_4f_4, insert_4ub_4f_rgba_4, insert_2f_2, insert_2f_2, emit_xyzw4_rgba4_st2_st2)
+
+
+/* Use the codegen paths to select one of a number of hardwired
+ * fastpaths.
+ */
+void draw_vf_generate_hardwired_emit( struct draw_vertex_fetch *vf )
+{
+   draw_vf_emit_func func = NULL;
+
+   /* Does it fit a hardwired fastpath?  Help! this is growing out of
+    * control!
+    */
+   switch (vf->attr_count) {
+   case 2:
+      if (vf->attr[0].do_insert == insert_3f_3 &&
+	  vf->attr[1].do_insert == insert_4ub_4f_rgba_4) {
+ 	 func = emit_xyz3_rgba4; 
+      }
+      break;
+   case 3:
+      if (vf->attr[2].do_insert == insert_2f_2) {
+	 if (vf->attr[1].do_insert == insert_4ub_4f_rgba_4) {
+	    if (vf->attr[0].do_insert == insert_4f_4) 
+	       func = emit_xyzw4_rgba4_st2;
+	 }
+      }
+      break;
+   case 4:
+      if (vf->attr[2].do_insert == insert_2f_2 &&
+	  vf->attr[3].do_insert == insert_2f_2) {
+	 if (vf->attr[1].do_insert == insert_4ub_4f_rgba_4) {
+	    if (vf->attr[0].do_insert == insert_4f_4) 
+	       func = emit_xyzw4_rgba4_st2_st2;
+	 }
+      }
+      break;
+   }
+
+   vf->emit = func;
+}
+
+/***********************************************************************
+ * Generic (non-codegen) functions for whole vertices or groups of
+ * vertices
+ */
+
+void draw_vf_generic_emit( struct draw_vertex_fetch *vf,
+		      unsigned count,
+		      uint8_t *v )
+{
+   struct draw_vf_attr *a = vf->attr;
+   const unsigned attr_count = vf->attr_count;
+   const unsigned stride = vf->vertex_stride;
+   unsigned i, j;
+
+   for (i = 0 ; i < count ; i++, v += stride) {
+      for (j = 0; j < attr_count; j++) {
+	 float *in = (float *)a[j].inputptr;
+	 a[j].inputptr += a[j].inputstride;
+	 a[j].do_insert( &a[j], v + a[j].vertoffset, in );
+      }
+   }
+}
+
+
diff --git a/src/mesa/pipe/draw/draw_vf_sse.c b/src/mesa/pipe/draw/draw_vf_sse.c
new file mode 100644
index 0000000000..1ad2ae756d
--- /dev/null
+++ b/src/mesa/pipe/draw/draw_vf_sse.c
@@ -0,0 +1,614 @@
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+
+#include "simple_list.h"
+
+#include "pipe/p_compiler.h"
+
+#include "draw_vf.h"
+
+
+#if defined(USE_SSE_ASM)
+
+#include "x86/rtasm/x86sse.h"
+#include "x86/common_x86_asm.h"
+
+
+#define X    0
+#define Y    1
+#define Z    2
+#define W    3
+
+
+struct x86_program {
+   struct x86_function func;
+
+   struct draw_vertex_fetch *vf;
+   boolean inputs_safe;
+   boolean outputs_safe;
+   boolean have_sse2;
+   
+   struct x86_reg identity;
+   struct x86_reg chan0;
+};
+
+
+static struct x86_reg get_identity( struct x86_program *p )
+{
+   return p->identity;
+}
+
+static void emit_load4f_4( struct x86_program *p, 			   
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   sse_movups(&p->func, dest, arg0);
+}
+
+static void emit_load4f_3( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   /* Have to jump through some hoops:
+    *
+    * c 0 0 0
+    * c 0 0 1
+    * 0 0 c 1
+    * a b c 1
+    */
+   sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
+   sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
+   sse_shufps(&p->func, dest, dest, SHUF(Y,Z,X,W) );
+   sse_movlps(&p->func, dest, arg0);
+}
+
+static void emit_load4f_2( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   /* Initialize from identity, then pull in low two words:
+    */
+   sse_movups(&p->func, dest, get_identity(p));
+   sse_movlps(&p->func, dest, arg0);
+}
+
+static void emit_load4f_1( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   /* Pull in low word, then swizzle in identity */
+   sse_movss(&p->func, dest, arg0);
+   sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
+}
+
+
+
+static void emit_load3f_3( struct x86_program *p, 			   
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   /* Over-reads by 1 dword - potential SEGV if input is a vertex
+    * array.
+    */
+   if (p->inputs_safe) {
+      sse_movups(&p->func, dest, arg0);
+   } 
+   else {
+      /* c 0 0 0
+       * c c c c
+       * a b c c 
+       */
+      sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
+      sse_shufps(&p->func, dest, dest, SHUF(X,X,X,X));
+      sse_movlps(&p->func, dest, arg0);
+   }
+}
+
+static void emit_load3f_2( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   emit_load4f_2(p, dest, arg0);
+}
+
+static void emit_load3f_1( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   emit_load4f_1(p, dest, arg0);
+}
+
+static void emit_load2f_2( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   sse_movlps(&p->func, dest, arg0);
+}
+
+static void emit_load2f_1( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   emit_load4f_1(p, dest, arg0);
+}
+
+static void emit_load1f_1( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   sse_movss(&p->func, dest, arg0);
+}
+
+static void (*load[4][4])( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 ) = {
+   { emit_load1f_1, 
+     emit_load1f_1, 
+     emit_load1f_1, 
+     emit_load1f_1 },
+
+   { emit_load2f_1, 
+     emit_load2f_2, 
+     emit_load2f_2, 
+     emit_load2f_2 },
+
+   { emit_load3f_1, 
+     emit_load3f_2, 
+     emit_load3f_3, 
+     emit_load3f_3 },
+
+   { emit_load4f_1, 
+     emit_load4f_2, 
+     emit_load4f_3, 
+     emit_load4f_4 } 
+};
+
+static void emit_load( struct x86_program *p,
+		       struct x86_reg dest,
+		       unsigned sz,
+		       struct x86_reg src,
+		       unsigned src_sz)
+{
+   load[sz-1][src_sz-1](p, dest, src);
+}
+
+static void emit_store4f( struct x86_program *p, 			   
+			  struct x86_reg dest,
+			  struct x86_reg arg0 )
+{
+   sse_movups(&p->func, dest, arg0);
+}
+
+static void emit_store3f( struct x86_program *p, 
+			  struct x86_reg dest,
+			  struct x86_reg arg0 )
+{
+   if (p->outputs_safe) {
+      /* Emit the extra dword anyway.  This may hurt writecombining,
+       * may cause other problems.
+       */
+      sse_movups(&p->func, dest, arg0);
+   }
+   else {
+      /* Alternate strategy - emit two, shuffle, emit one.
+       */
+      sse_movlps(&p->func, dest, arg0);
+      sse_shufps(&p->func, arg0, arg0, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
+      sse_movss(&p->func, x86_make_disp(dest,8), arg0);
+   }
+}
+
+static void emit_store2f( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   sse_movlps(&p->func, dest, arg0);
+}
+
+static void emit_store1f( struct x86_program *p, 
+			  struct x86_reg dest,
+			  struct x86_reg arg0 )
+{
+   sse_movss(&p->func, dest, arg0);
+}
+
+
+static void (*store[4])( struct x86_program *p, 
+			 struct x86_reg dest,
+			 struct x86_reg arg0 ) = 
+{
+   emit_store1f, 
+   emit_store2f, 
+   emit_store3f, 
+   emit_store4f 
+};
+
+static void emit_store( struct x86_program *p,
+			struct x86_reg dest,
+			unsigned sz,
+			struct x86_reg temp )
+
+{
+   store[sz-1](p, dest, temp);
+}
+
+static void emit_pack_store_4ub( struct x86_program *p,
+				 struct x86_reg dest,
+				 struct x86_reg temp )
+{
+   /* Scale by 255.0
+    */
+   sse_mulps(&p->func, temp, p->chan0);
+
+   if (p->have_sse2) {
+      sse2_cvtps2dq(&p->func, temp, temp);
+      sse2_packssdw(&p->func, temp, temp);
+      sse2_packuswb(&p->func, temp, temp);
+      sse_movss(&p->func, dest, temp);
+   }
+   else {
+      struct x86_reg mmx0 = x86_make_reg(file_MMX, 0);
+      struct x86_reg mmx1 = x86_make_reg(file_MMX, 1);
+      sse_cvtps2pi(&p->func, mmx0, temp);
+      sse_movhlps(&p->func, temp, temp);
+      sse_cvtps2pi(&p->func, mmx1, temp);
+      mmx_packssdw(&p->func, mmx0, mmx1);
+      mmx_packuswb(&p->func, mmx0, mmx0);
+      mmx_movd(&p->func, dest, mmx0);
+   }
+}
+
+static int get_offset( const void *a, const void *b )
+{
+   return (const char *)b - (const char *)a;
+}
+
+/* Not much happens here.  Eventually use this function to try and
+ * avoid saving/reloading the source pointers each vertex (if some of
+ * them can fit in registers).
+ */
+static void get_src_ptr( struct x86_program *p,
+			 struct x86_reg srcREG,
+			 struct x86_reg vfREG,
+			 struct draw_vf_attr *a )
+{
+   struct draw_vertex_fetch *vf = p->vf;
+   struct x86_reg ptr_to_src = x86_make_disp(vfREG, get_offset(vf, &a->inputptr));
+
+   /* Load current a[j].inputptr
+    */
+   x86_mov(&p->func, srcREG, ptr_to_src);
+}
+
+static void update_src_ptr( struct x86_program *p,
+			 struct x86_reg srcREG,
+			 struct x86_reg vfREG,
+			 struct draw_vf_attr *a )
+{
+   if (a->inputstride) {
+      struct draw_vertex_fetch *vf = p->vf;
+      struct x86_reg ptr_to_src = x86_make_disp(vfREG, get_offset(vf, &a->inputptr));
+
+      /* add a[j].inputstride (hardcoded value - could just as easily
+       * pull the stride value from memory each time).
+       */
+      x86_lea(&p->func, srcREG, x86_make_disp(srcREG, a->inputstride));
+      
+      /* save new value of a[j].inputptr 
+       */
+      x86_mov(&p->func, ptr_to_src, srcREG);
+   }
+}
+
+
+/* Lots of hardcoding
+ *
+ * EAX -- pointer to current output vertex
+ * ECX -- pointer to current attribute 
+ * 
+ */
+static boolean build_vertex_emit( struct x86_program *p )
+{
+   struct draw_vertex_fetch *vf = p->vf;
+   unsigned j = 0;
+
+   struct x86_reg vertexEAX = x86_make_reg(file_REG32, reg_AX);
+   struct x86_reg srcECX = x86_make_reg(file_REG32, reg_CX);
+   struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP);
+   struct x86_reg vfESI = x86_make_reg(file_REG32, reg_SI);
+   struct x86_reg temp = x86_make_reg(file_XMM, 0);
+   uint8_t *fixup, *label;
+
+   /* Push a few regs?
+    */
+   x86_push(&p->func, countEBP);
+   x86_push(&p->func, vfESI);
+
+
+   /* Get vertex count, compare to zero
+    */
+   x86_xor(&p->func, srcECX, srcECX);
+   x86_mov(&p->func, countEBP, x86_fn_arg(&p->func, 2));
+   x86_cmp(&p->func, countEBP, srcECX);
+   fixup = x86_jcc_forward(&p->func, cc_E);
+
+   /* Initialize destination register. 
+    */
+   x86_mov(&p->func, vertexEAX, x86_fn_arg(&p->func, 3));
+
+   /* Move argument 1 (vf) into a reg:
+    */
+   x86_mov(&p->func, vfESI, x86_fn_arg(&p->func, 1));
+
+   
+   /* always load, needed or not:
+    */
+   sse_movups(&p->func, p->identity, x86_make_disp(vfESI, get_offset(vf, &vf->identity[0])));
+
+   /* Note address for loop jump */
+   label = x86_get_label(&p->func);
+
+   /* Emit code for each of the attributes.  Currently routes
+    * everything through SSE registers, even when it might be more
+    * efficient to stick with regular old x86.  No optimization or
+    * other tricks - enough new ground to cover here just getting
+    * things working.
+    */
+   while (j < vf->attr_count) {
+      struct draw_vf_attr *a = &vf->attr[j];
+      struct x86_reg dest = x86_make_disp(vertexEAX, a->vertoffset);
+
+      /* Now, load an XMM reg from src, perhaps transform, then save.
+       * Could be shortcircuited in specific cases:
+       */
+      switch (a->format) {
+      case DRAW_EMIT_1F:
+      case DRAW_EMIT_1F_CONST:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
+	 emit_store(p, dest, 1, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case DRAW_EMIT_2F:
+      case DRAW_EMIT_2F_CONST:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
+	 emit_store(p, dest, 2, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case DRAW_EMIT_3F:
+      case DRAW_EMIT_3F_CONST:
+	 /* Potentially the worst case - hardcode 2+1 copying:
+	  */
+	 if (0) {
+	    get_src_ptr(p, srcECX, vfESI, a);
+	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
+	    emit_store(p, dest, 3, temp);
+	    update_src_ptr(p, srcECX, vfESI, a);
+	 }
+	 else {
+	    get_src_ptr(p, srcECX, vfESI, a);
+	    emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
+	    emit_store(p, dest, 2, temp);
+	    if (a->inputsize > 2) {
+	       emit_load(p, temp, 1, x86_make_disp(srcECX, 8), 1);
+	       emit_store(p, x86_make_disp(dest,8), 1, temp);
+	    }
+	    else {
+	       sse_movss(&p->func, x86_make_disp(dest,8), get_identity(p));
+	    }
+	    update_src_ptr(p, srcECX, vfESI, a);
+	 }
+	 break;
+      case DRAW_EMIT_4F:
+      case DRAW_EMIT_4F_CONST:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 emit_store(p, dest, 4, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case DRAW_EMIT_3F_XYW:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 sse_shufps(&p->func, temp, temp, SHUF(X,Y,W,Z));
+	 emit_store(p, dest, 3, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+
+      case DRAW_EMIT_1UB_1F:	 
+	 /* Test for PAD3 + 1UB:
+	  */
+	 if (j > 0 &&
+	     a[-1].vertoffset + a[-1].vertattrsize <= a->vertoffset - 3)
+	 {
+	    get_src_ptr(p, srcECX, vfESI, a);
+	    emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
+	    sse_shufps(&p->func, temp, temp, SHUF(X,X,X,X));
+	    emit_pack_store_4ub(p, x86_make_disp(dest, -3), temp); /* overkill! */
+	    update_src_ptr(p, srcECX, vfESI, a);
+	 }
+	 else {
+	    debug_printf("Can't emit 1ub %x %x %d\n", 
+	            a->vertoffset, a[-1].vertoffset, a[-1].vertattrsize );
+	    return FALSE;
+	 }
+	 break;
+      case DRAW_EMIT_3UB_3F_RGB:
+      case DRAW_EMIT_3UB_3F_BGR:
+	 /* Test for 3UB + PAD1:
+	  */
+	 if (j == vf->attr_count - 1 ||
+	     a[1].vertoffset >= a->vertoffset + 4) {
+	    get_src_ptr(p, srcECX, vfESI, a);
+	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
+	    if (a->format == DRAW_EMIT_3UB_3F_BGR)
+	       sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
+	    emit_pack_store_4ub(p, dest, temp);
+	    update_src_ptr(p, srcECX, vfESI, a);
+	 }
+	 /* Test for 3UB + 1UB:
+	  */
+	 else if (j < vf->attr_count - 1 &&
+		  a[1].format == DRAW_EMIT_1UB_1F &&
+		  a[1].vertoffset == a->vertoffset + 3) {
+	    get_src_ptr(p, srcECX, vfESI, a);
+	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
+	    update_src_ptr(p, srcECX, vfESI, a);
+
+	    /* Make room for incoming value:
+	     */
+	    sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
+
+	    get_src_ptr(p, srcECX, vfESI, &a[1]);
+	    emit_load(p, temp, 1, x86_deref(srcECX), a[1].inputsize);
+	    update_src_ptr(p, srcECX, vfESI, &a[1]);
+
+	    /* Rearrange and possibly do BGR conversion:
+	     */
+	    if (a->format == DRAW_EMIT_3UB_3F_BGR)
+	       sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
+	    else
+	       sse_shufps(&p->func, temp, temp, SHUF(Y,Z,W,X));
+
+	    emit_pack_store_4ub(p, dest, temp);
+	    j++;		/* NOTE: two attrs consumed */
+	 }
+	 else {
+	    debug_printf("Can't emit 3ub\n");
+	 }
+	 return FALSE;	/* add this later */
+	 break;
+
+      case DRAW_EMIT_4UB_4F_RGBA:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 emit_pack_store_4ub(p, dest, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case DRAW_EMIT_4UB_4F_BGRA:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
+	 emit_pack_store_4ub(p, dest, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case DRAW_EMIT_4UB_4F_ARGB:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
+	 emit_pack_store_4ub(p, dest, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case DRAW_EMIT_4UB_4F_ABGR:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
+	 emit_pack_store_4ub(p, dest, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      default:
+	 debug_printf("unknown a[%d].format %d\n", j, a->format);
+	 return FALSE;	/* catch any new opcodes */
+      }
+      
+      /* Increment j by at least 1 - may have been incremented above also:
+       */
+      j++;
+   }
+
+   /* Next vertex:
+    */
+   x86_lea(&p->func, vertexEAX, x86_make_disp(vertexEAX, vf->vertex_stride));
+
+   /* decr count, loop if not zero
+    */
+   x86_dec(&p->func, countEBP);
+   x86_test(&p->func, countEBP, countEBP); 
+   x86_jcc(&p->func, cc_NZ, label);
+
+   /* Exit mmx state?
+    */
+   if (p->func.need_emms)
+      mmx_emms(&p->func);
+
+   /* Land forward jump here:
+    */
+   x86_fixup_fwd_jump(&p->func, fixup);
+
+   /* Pop regs and return
+    */
+   x86_pop(&p->func, x86_get_base_reg(vfESI));
+   x86_pop(&p->func, countEBP);
+   x86_ret(&p->func);
+
+   vf->emit = (draw_vf_emit_func)x86_get_func(&p->func);
+   return TRUE;
+}
+
+
+
+void draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf )
+{
+   struct x86_program p;   
+
+   if (!cpu_has_xmm) {
+      vf->codegen_emit = NULL;
+      return;
+   }
+
+   memset(&p, 0, sizeof(p));
+
+   p.vf = vf;
+   p.inputs_safe = 0;		/* for now */
+   p.outputs_safe = 1;		/* for now */
+   p.have_sse2 = cpu_has_xmm2;
+   p.identity = x86_make_reg(file_XMM, 6);
+   p.chan0 = x86_make_reg(file_XMM, 7);
+
+   x86_init_func(&p.func);
+
+   if (build_vertex_emit(&p)) {
+      draw_vf_register_fastpath( vf, TRUE );
+   }
+   else {
+      /* Note the failure so that we don't keep trying to codegen an
+       * impossible state:
+       */
+      draw_vf_register_fastpath( vf, FALSE );
+      x86_release_func(&p.func);
+   }
+}
+
+#else
+
+void draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf )
+{
+   /* Dummy version for when USE_SSE_ASM not defined */
+}
+
+#endif