From cd3643698eafa0869a8317b002e5b066de0172e7 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 23 Jan 2008 12:48:41 -0700
Subject: gallium: overhaul usage of vertex_info in draw module.

Remove all dependencies on vertex_info, except for draw_vbuf.
Drawing stages now strictly operate on post-transformed vertices and don't
know anything about hw vertices.
Use vertex program output info for two-side/flat/etc stages.
Temporarily disable vbuf module in softpipe driver.
---
 src/mesa/pipe/draw/draw_vertex_shader.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'src/mesa/pipe/draw/draw_vertex_shader.c')

diff --git a/src/mesa/pipe/draw/draw_vertex_shader.c b/src/mesa/pipe/draw/draw_vertex_shader.c
index c2e038453e..5ca659dbf5 100644
--- a/src/mesa/pipe/draw/draw_vertex_shader.c
+++ b/src/mesa/pipe/draw/draw_vertex_shader.c
@@ -38,7 +38,6 @@
 #endif
 #include "draw_private.h"
 #include "draw_context.h"
-#include "draw_vertex.h"
 
 #include "x86/rtasm/x86sse.h"
 #include "pipe/llvm/gallivm.h"
@@ -176,7 +175,7 @@ run_vertex_program(struct draw_context *draw,
       /* Remaining attributes are packed into sequential post-transform
        * vertex attrib slots.
        */
-      for (slot = 1; slot < draw->vertex_info.num_attribs; slot++) {
+      for (slot = 1; slot < draw->num_vs_outputs; slot++) {
          vOut[j]->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
          vOut[j]->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
          vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
@@ -275,6 +274,8 @@ draw_bind_vertex_shader(struct draw_context *draw,
    draw_flush(draw);
    draw->vertex_shader = dvs;
 
+   draw->num_vs_outputs = dvs->state->num_outputs;
+
    /* specify the fragment program to interpret/execute */
    tgsi_exec_machine_init(&draw->machine,
                           draw->vertex_shader->state->tokens,
-- 
cgit v1.2.3


From 1603a33fb276d7e78a2e872dfa05aa0093d1329a Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 25 Jan 2008 17:21:05 -0700
Subject: gallium: better flush logic in draw module

This is the other half of Keith's draw/flush patch.

There are now 5 flush flags to control what's flushed (post-xform vertex
cache, prim cache, vbuf, etc).

The gears slow-down in this part of the patch was due to the cull stage not
getting invoked.  It was unconditional before, but is now gated by 'need_det'.
But it also needs to be gated by draw->rasterizer->cull_mode.  Gears uses
back-face culling.
---
 src/mesa/pipe/draw/draw_context.c       | 29 ++++++-----
 src/mesa/pipe/draw/draw_prim.c          | 85 +++++++++++++--------------------
 src/mesa/pipe/draw/draw_private.h       | 16 +++----
 src/mesa/pipe/draw/draw_validate.c      | 32 ++++++++-----
 src/mesa/pipe/draw/draw_vbuf.c          | 17 +++----
 src/mesa/pipe/draw/draw_vertex_cache.c  |  9 ++--
 src/mesa/pipe/draw/draw_vertex_shader.c |  4 +-
 7 files changed, 89 insertions(+), 103 deletions(-)

(limited to 'src/mesa/pipe/draw/draw_vertex_shader.c')

diff --git a/src/mesa/pipe/draw/draw_context.c b/src/mesa/pipe/draw/draw_context.c
index ff23288fa8..e8ca1f035b 100644
--- a/src/mesa/pipe/draw/draw_context.c
+++ b/src/mesa/pipe/draw/draw_context.c
@@ -80,7 +80,7 @@ struct draw_context *draw_create( void )
    draw->convert_wide_points = TRUE;
    draw->convert_wide_lines = TRUE;
 
-   draw->prim = ~0; /* != any of PIPE_PRIM_x */
+   draw->reduced_prim = ~0; /* != any of PIPE_PRIM_x */
 
    draw_vertex_cache_invalidate( draw );
    draw_set_mapped_element_buffer( draw, 0, NULL );
@@ -111,8 +111,7 @@ void draw_destroy( struct draw_context *draw )
 
 void draw_flush( struct draw_context *draw )
 {
-   if (draw->drawing)
-      draw_do_flush( draw, DRAW_FLUSH_DRAW );
+   draw_do_flush( draw, DRAW_FLUSH_BACKEND );
 }
 
 
@@ -124,7 +123,8 @@ void draw_flush( struct draw_context *draw )
 void draw_set_rasterizer_state( struct draw_context *draw,
                                 const struct pipe_rasterizer_state *raster )
 {
-   draw_flush( draw );
+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+
    draw->rasterizer = raster;
 }
 
@@ -137,7 +137,8 @@ void draw_set_rasterizer_state( struct draw_context *draw,
 void draw_set_rasterize_stage( struct draw_context *draw,
                                struct draw_stage *stage )
 {
-   draw_flush( draw );
+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+
    draw->pipeline.rasterize = stage;
 }
 
@@ -148,7 +149,7 @@ void draw_set_rasterize_stage( struct draw_context *draw,
 void draw_set_clip_state( struct draw_context *draw,
                           const struct pipe_clip_state *clip )
 {
-   draw_flush( draw );
+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
 
    assert(clip->nr <= PIPE_MAX_CLIP_PLANES);
    memcpy(&draw->plane[6], clip->ucp, clip->nr * sizeof(clip->ucp[0]));
@@ -162,7 +163,7 @@ void draw_set_clip_state( struct draw_context *draw,
 void draw_set_viewport_state( struct draw_context *draw,
                               const struct pipe_viewport_state *viewport )
 {
-   draw_flush( draw );
+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
    draw->viewport = *viewport; /* struct copy */
 }
 
@@ -173,8 +174,7 @@ draw_set_vertex_buffer(struct draw_context *draw,
                        unsigned attr,
                        const struct pipe_vertex_buffer *buffer)
 {
-   draw_flush( draw );
-
+   draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE/*STATE_CHANGE*/ );
    assert(attr < PIPE_ATTRIB_MAX);
    draw->vertex_buffer[attr] = *buffer;
 }
@@ -185,8 +185,7 @@ draw_set_vertex_element(struct draw_context *draw,
                         unsigned attr,
                         const struct pipe_vertex_element *element)
 {
-   draw_flush( draw );
-
+   draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE/*STATE_CHANGE*/ );
    assert(attr < PIPE_ATTRIB_MAX);
    draw->vertex_element[attr] = *element;
 }
@@ -199,8 +198,7 @@ void
 draw_set_mapped_vertex_buffer(struct draw_context *draw,
                               unsigned attr, const void *buffer)
 {
-   draw_flush( draw );
-
+   draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE/*STATE_CHANGE*/ );
    draw->user.vbuffer[attr] = buffer;
 }
 
@@ -209,8 +207,7 @@ void
 draw_set_mapped_constant_buffer(struct draw_context *draw,
                                 const void *buffer)
 {
-   draw_flush( draw );
-
+   draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE/*STATE_CHANGE*/ );
    draw->user.constants = buffer;
 }
 
@@ -222,6 +219,7 @@ draw_set_mapped_constant_buffer(struct draw_context *draw,
 void
 draw_convert_wide_points(struct draw_context *draw, boolean enable)
 {
+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
    draw->convert_wide_points = enable;
 }
 
@@ -233,6 +231,7 @@ draw_convert_wide_points(struct draw_context *draw, boolean enable)
 void
 draw_convert_wide_lines(struct draw_context *draw, boolean enable)
 {
+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
    draw->convert_wide_lines = enable;
 }
 
diff --git a/src/mesa/pipe/draw/draw_prim.c b/src/mesa/pipe/draw/draw_prim.c
index 5703f5f0b0..243381aec0 100644
--- a/src/mesa/pipe/draw/draw_prim.c
+++ b/src/mesa/pipe/draw/draw_prim.c
@@ -57,17 +57,14 @@ static unsigned reduced_prim[PIPE_PRIM_POLYGON + 1] = {
 
 static void draw_prim_queue_flush( struct draw_context *draw )
 {
-   //   struct draw_stage *first = draw->pipeline.first;
    unsigned i;
 
    if (0)
       fprintf(stdout,"Flushing with %d prims, %d verts\n",
              draw->pq.queue_nr, draw->vs.queue_nr);
 
-   /* Make sure all vertices are available/shaded:
-    */
-   if (draw->vs.queue_nr)
-      draw_vertex_shader_queue_flush(draw);
+   if (draw->pq.queue_nr == 0)
+      return;
 
    /* NOTE: we cannot save draw->pipeline->first in a local var because
     * draw->pipeline->first is often changed by the first call to tri(),
@@ -102,33 +99,32 @@ static void draw_prim_queue_flush( struct draw_context *draw )
 }
 
 
-void draw_do_flush( struct draw_context *draw, 
-                    unsigned flush )
+
+void draw_do_flush( struct draw_context *draw, unsigned flags )
 {
-   if ((flush & (DRAW_FLUSH_PRIM_QUEUE |
-                 DRAW_FLUSH_VERTEX_CACHE_INVALIDATE |
-                 DRAW_FLUSH_DRAW)) && 
-        draw->pq.queue_nr)
-   {
-      draw_prim_queue_flush(draw);
-   }
+   if (0)
+      fprintf(stdout,"Flushing with %d verts, %d prims\n",
+	      draw->vs.queue_nr,
+	      draw->pq.queue_nr );
 
-   if ((flush & (DRAW_FLUSH_VERTEX_CACHE_INVALIDATE |
-                 DRAW_FLUSH_DRAW)) && 
-       draw->drawing)
-   {
-      draw_vertex_cache_invalidate(draw);
-   }
 
-   if ((flush & DRAW_FLUSH_DRAW) && 
-       draw->drawing)
-   {
-      draw->pipeline.first->flush( draw->pipeline.first, ~0 );
-      draw->drawing = FALSE;
-      draw->prim = ~0;
-      draw->pipeline.first = draw->pipeline.validate;
-   }
+   if (flags >= DRAW_FLUSH_SHADER_QUEUE) {
+      draw_vertex_shader_queue_flush(draw);
+
+      if (flags >= DRAW_FLUSH_PRIM_QUEUE) {
+         draw_prim_queue_flush(draw);
 
+	 if (flags >= DRAW_FLUSH_VERTEX_CACHE) {
+            draw_vertex_cache_invalidate(draw);
+
+	    if (flags >= DRAW_FLUSH_STATE_CHANGE) {
+               draw->pipeline.first->flush( draw->pipeline.first, flags );
+               draw->pipeline.first = draw->pipeline.validate;
+               draw->reduced_prim = ~0;
+	    }
+	 }
+      }    
+   }
 }
 
 
@@ -143,7 +139,7 @@ static struct prim_header *get_queued_prim( struct draw_context *draw,
 {
    if (!draw_vertex_cache_check_space( draw, nr_verts )) {
 //      fprintf(stderr, "v");
-      draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE_INVALIDATE );
+      draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE );
    }
    else if (draw->pq.queue_nr == PRIM_QUEUE_LENGTH) {
 //      fprintf(stderr, "p");
@@ -251,13 +247,14 @@ static void do_quad( struct draw_context *draw,
  * Main entrypoint to draw some number of points/lines/triangles
  */
 static void
-draw_prim( struct draw_context *draw, unsigned start, unsigned count )
+draw_prim( struct draw_context *draw, 
+	   unsigned prim, unsigned start, unsigned count )
 {
    unsigned i;
 
 //   _mesa_printf("%s (%d) %d/%d\n", __FUNCTION__, draw->prim, start, count );
 
-   switch (draw->prim) {
+   switch (prim) {
    case PIPE_PRIM_POINTS:
       for (i = 0; i < count; i ++) {
 	 do_point( draw,
@@ -389,21 +386,6 @@ draw_prim( struct draw_context *draw, unsigned start, unsigned count )
 }
 
 
-static void
-draw_set_prim( struct draw_context *draw, unsigned prim )
-{
-   assert(prim >= PIPE_PRIM_POINTS);
-   assert(prim <= PIPE_PRIM_POLYGON);
-
-   if (reduced_prim[prim] != draw->reduced_prim) {
-      draw_do_flush( draw, DRAW_FLUSH_PRIM_QUEUE );
-      draw->reduced_prim = reduced_prim[prim];
-   }
-
-   draw->prim = prim;
-}
-
-
 
 
 /**
@@ -417,16 +399,13 @@ void
 draw_arrays(struct draw_context *draw, unsigned prim,
             unsigned start, unsigned count)
 {
-   if (!draw->drawing) {
-      draw->drawing = TRUE;
-   }
-
-   if (draw->prim != prim) {
-      draw_set_prim( draw, prim );
+   if (reduced_prim[prim] != draw->reduced_prim) {
+      draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+      draw->reduced_prim = reduced_prim[prim];
    }
 
    /* drawing done here: */
-   draw_prim(draw, start, count);
+   draw_prim(draw, prim, start, count);
 }
 
 
diff --git a/src/mesa/pipe/draw/draw_private.h b/src/mesa/pipe/draw/draw_private.h
index e393fa5fe2..1c2e88264f 100644
--- a/src/mesa/pipe/draw/draw_private.h
+++ b/src/mesa/pipe/draw/draw_private.h
@@ -111,7 +111,7 @@ struct draw_stage
 		struct prim_header * );
 
    void (*flush)( struct draw_stage *,
-                  unsigned flags );
+		  unsigned flags );
 
    void (*reset_stipple_counter)( struct draw_stage * );
 
@@ -191,8 +191,6 @@ struct draw_context
    boolean convert_wide_points; /**< convert wide points to tris? */
    boolean convert_wide_lines;  /**< convert side lines to tris? */
 
-   boolean drawing; /**< do we presently have something queued for drawing? */
-   unsigned prim;   /**< current prim type: PIPE_PRIM_x */
    unsigned reduced_prim;
 
    /** TGSI program interpreter runtime state */
@@ -278,14 +276,14 @@ extern void draw_vertex_fetch( struct draw_context *draw,
 			       unsigned count );
 
 
-#define DRAW_FLUSH_PRIM_QUEUE                0x1
-#define DRAW_FLUSH_VERTEX_CACHE_INVALIDATE   0x2
-#define DRAW_FLUSH_DRAW                      0x4
+#define DRAW_FLUSH_SHADER_QUEUE              0x1 /* vs queue is sized not to overflow, so this is never raised */
+#define DRAW_FLUSH_PRIM_QUEUE                0x2
+#define DRAW_FLUSH_VERTEX_CACHE              0x4
+#define DRAW_FLUSH_STATE_CHANGE              0x8
+#define DRAW_FLUSH_BACKEND                   0x10
 
 
-void draw_do_flush( struct draw_context *draw,
-                    unsigned flags );
-
+void draw_do_flush( struct draw_context *draw, unsigned flags );
 
 
diff --git a/src/mesa/pipe/draw/draw_validate.c b/src/mesa/pipe/draw/draw_validate.c
index a626fb1fba..86d5a5f814 100644
--- a/src/mesa/pipe/draw/draw_validate.c
+++ b/src/mesa/pipe/draw/draw_validate.c
@@ -43,6 +43,13 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
 {
    struct draw_context *draw = stage->draw;
    struct draw_stage *next = draw->pipeline.rasterize;
+   int need_det = 0;
+   int precalc_flat = 0;
+
+   /* Set the validate's next stage to the rasterize stage, so that it
+    * can be found later if needed for flushing.
+    */
+   stage->next = next;
 
    /*
     * NOTE: we build up the pipeline in end-to-start order.
@@ -61,29 +68,38 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
    if (draw->rasterizer->line_stipple_enable) {
       draw->pipeline.stipple->next = next;
       next = draw->pipeline.stipple;
+      precalc_flat = 1;		/* only needed for lines really */
    }
 
    if (draw->rasterizer->fill_cw != PIPE_POLYGON_MODE_FILL ||
        draw->rasterizer->fill_ccw != PIPE_POLYGON_MODE_FILL) {
       draw->pipeline.unfilled->next = next;
       next = draw->pipeline.unfilled;
+      precalc_flat = 1;		/* only needed for triangles really */
+      need_det = 1;
    }
 	 
    if (draw->rasterizer->offset_cw ||
        draw->rasterizer->offset_ccw) {
       draw->pipeline.offset->next = next;
       next = draw->pipeline.offset;
+      need_det = 1;
    }
 
    if (draw->rasterizer->light_twoside) {
       draw->pipeline.twoside->next = next;
       next = draw->pipeline.twoside;
+      need_det = 1;
    }
 
    /* Always run the cull stage as we calculate determinant there
-    * also.  Fix this..
+    * also.  
+    *
+    * This can actually be a win as culling out the triangles can lead
+    * to less work emitting vertices, smaller vertex buffers, etc.
+    * It's difficult to say whether this will be true in general.
     */
-   {
+   if (need_det || draw->rasterizer->cull_mode) {
       draw->pipeline.cull->next = next;
       next = draw->pipeline.cull;
    }
@@ -94,23 +110,18 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
    {
       draw->pipeline.clip->next = next;
       next = draw->pipeline.clip;
+      precalc_flat = 1;		/* XXX: FIX ME! Only needed for clipped prims */
    }
 
-   /* Do software flatshading prior to clipping.  XXX: should only do
-    * this for clipped primitives, ie it is a part of the clip
-    * routine.
-    */
-   if (draw->rasterizer->flatshade) {
+   if (draw->rasterizer->flatshade && precalc_flat) {
       draw->pipeline.flatshade->next = next;
       next = draw->pipeline.flatshade;
    }
-
+   
    draw->pipeline.first = next;
-   //BP draw->pipeline.first->begin( draw->pipeline.first );
    return next;
 }
 
-
 static void validate_tri( struct draw_stage *stage, 
 			  struct prim_header *header )
 {
@@ -162,7 +173,6 @@ struct draw_stage *draw_validate_stage( struct draw_context *draw )
    struct draw_stage *stage = CALLOC_STRUCT(draw_stage);
 
    stage->draw = draw;
-
    stage->next = NULL;
    stage->point = validate_point;
    stage->line = validate_line;
diff --git a/src/mesa/pipe/draw/draw_vbuf.c b/src/mesa/pipe/draw/draw_vbuf.c
index d827f51d56..cd0b4fbbb9 100644
--- a/src/mesa/pipe/draw/draw_vbuf.c
+++ b/src/mesa/pipe/draw/draw_vbuf.c
@@ -387,29 +387,26 @@ vbuf_alloc_vertices( struct draw_stage *stage,
 }
 
 
-static void 
-vbuf_begin( struct draw_stage *stage )
-{
-   /* no-op, vbuffer allocated by first point/line/tri */
-}
-
 
 static void 
 vbuf_flush( struct draw_stage *stage, unsigned flags )
 {
-//   vbuf_flush_indices( stage );
-   /* XXX: Overkill */
-   vbuf_flush_vertices( stage );
-   
+   vbuf_flush_indices( stage );
+
    stage->point = vbuf_first_point;
    stage->line = vbuf_first_line;
    stage->tri = vbuf_first_tri;
+
+   if (flags & DRAW_FLUSH_BACKEND)
+      vbuf_flush_vertices( stage );
 }
 
 
 static void 
 vbuf_reset_stipple_counter( struct draw_stage *stage )
 {
+   /* XXX: Need to do something here for hardware with linestipple.
+    */
    (void) stage;
 }
 
diff --git a/src/mesa/pipe/draw/draw_vertex_cache.c b/src/mesa/pipe/draw/draw_vertex_cache.c
index 97a40b876e..b4b4906d70 100644
--- a/src/mesa/pipe/draw/draw_vertex_cache.c
+++ b/src/mesa/pipe/draw/draw_vertex_cache.c
@@ -42,10 +42,13 @@ void draw_vertex_cache_invalidate( struct draw_context *draw )
    assert(draw->pq.queue_nr == 0);
    assert(draw->vs.queue_nr == 0);
    assert(draw->vcache.referenced == 0);
-   
+   /* memset() replaces the per-element loop kept under #if 0 */
+#if 0
    for (i = 0; i < Elements( draw->vcache.idx ); i++)
       draw->vcache.idx[i] = ~0;
-
+#else
+   memset(draw->vcache.idx, ~0, sizeof(draw->vcache.idx));
+#endif
 //   fprintf(stderr, "x\n");
 }
 
@@ -148,7 +151,7 @@ void draw_vertex_cache_unreference( struct draw_context *draw )
 
 
 int draw_vertex_cache_check_space( struct draw_context *draw,
-				    unsigned nr_verts )
+				   unsigned nr_verts )
 {
    if (draw->vcache.overflow + nr_verts < VCACHE_OVERFLOW) {
       /* The vs queue is sized so that this can never happen:
diff --git a/src/mesa/pipe/draw/draw_vertex_shader.c b/src/mesa/pipe/draw/draw_vertex_shader.c
index 5ca659dbf5..d19b60198d 100644
--- a/src/mesa/pipe/draw/draw_vertex_shader.c
+++ b/src/mesa/pipe/draw/draw_vertex_shader.c
@@ -271,9 +271,9 @@ void
 draw_bind_vertex_shader(struct draw_context *draw,
                         struct draw_vertex_shader *dvs)
 {
-   draw_flush(draw);
-   draw->vertex_shader = dvs;
+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
 
+   draw->vertex_shader = dvs;
    draw->num_vs_outputs = dvs->state->num_outputs;
 
    /* specify the fragment program to interpret/execute */
-- 
cgit v1.2.3


From 027983f5850afea753381be454122166c6d56777 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Thu, 24 Jan 2008 11:19:06 +0000
Subject: gallium: restructure vertex fetch code slightly

---
 src/mesa/pipe/draw/draw_private.h       |  17 +++
 src/mesa/pipe/draw/draw_vertex_fetch.c  | 193 +++++++++++++++++++++-----------
 src/mesa/pipe/draw/draw_vertex_shader.c |   4 +
 3 files changed, 149 insertions(+), 65 deletions(-)

(limited to 'src/mesa/pipe/draw/draw_vertex_shader.c')

diff --git a/src/mesa/pipe/draw/draw_private.h b/src/mesa/pipe/draw/draw_private.h
index 1c2e88264f..1e59f5bd8d 100644
--- a/src/mesa/pipe/draw/draw_private.h
+++ b/src/mesa/pipe/draw/draw_private.h
@@ -137,6 +137,13 @@ struct draw_vertex_shader {
 #endif
 };
 
+
+/* Internal function for vertex fetch.
+ */
+typedef void (*fetch_func)(const void *ptr, float *attrib);
+
+
+
 /**
  * Private context for the drawing module.
  */
@@ -196,6 +203,15 @@ struct draw_context
    /** TGSI program interpreter runtime state */
    struct tgsi_exec_machine machine;
 
+   /* Vertex fetch internal state
+    */
+   struct {
+      const ubyte *src_ptr[PIPE_ATTRIB_MAX];
+      unsigned pitch[PIPE_ATTRIB_MAX];
+      fetch_func fetch[PIPE_ATTRIB_MAX];
+      unsigned nr_attrs;
+   } vertex_fetch;
+
    /* Post-tnl vertex cache:
     */
    struct {
@@ -270,6 +286,7 @@ extern void draw_vertex_shader_queue_flush_llvm( struct draw_context *draw );
 
 struct tgsi_exec_machine;
 
+extern void draw_update_vertex_fetch( struct draw_context *draw );
 extern void draw_vertex_fetch( struct draw_context *draw,
 			       struct tgsi_exec_machine *machine,
 			       const unsigned *elts,
diff --git a/src/mesa/pipe/draw/draw_vertex_fetch.c b/src/mesa/pipe/draw/draw_vertex_fetch.c
index 4d64d3d4f2..23e8187899 100644
--- a/src/mesa/pipe/draw/draw_vertex_fetch.c
+++ b/src/mesa/pipe/draw/draw_vertex_fetch.c
@@ -42,53 +42,108 @@
 /**
  * Fetch a float[4] vertex attribute from memory, doing format/type
  * conversion as needed.
- * XXX this might be a temporary thing.
+ *
+ * This is probably needed/duplicated elsewhere, eg format
+ * conversion, texture sampling etc.
  */
-static void
-fetch_attrib4(const void *ptr, enum pipe_format format, float attrib[4])
+#define FETCH_ATTRIB( NAME, SZ, CVT )			\
+static void						\
+fetch_##NAME(const void *ptr, float *attrib)		\
+{							\
+   static const float defaults[4] = { 0,0,0,1 };	\
+   int i;						\
+							\
+   for (i = 0; i < SZ; i++) {				\
+      attrib[i] = CVT;					\
+   }							\
+							\
+   for (; i < 4; i++) {					\
+      attrib[i] = defaults[i];				\
+   }							\
+}
+
+#define CVT_32_FLOAT   ((float *) ptr)[i]
+#define CVT_32_SSCALED (float) ((int *) ptr)[i]
+#define CVT_8_UNORM    (float) ((unsigned char *) ptr)[i] / 255.0f
+
+FETCH_ATTRIB( R32G32B32A32_FLOAT,   4, CVT_32_FLOAT )
+FETCH_ATTRIB( R32G32B32_FLOAT,      3, CVT_32_FLOAT )
+FETCH_ATTRIB( R32G32_FLOAT,         2, CVT_32_FLOAT )
+FETCH_ATTRIB( R32_FLOAT,            1, CVT_32_FLOAT )
+FETCH_ATTRIB( R32G32B32A32_SSCALED, 4, CVT_32_SSCALED )
+FETCH_ATTRIB( R32G32B32_SSCALED,    3, CVT_32_SSCALED )
+FETCH_ATTRIB( R32G32_SSCALED,       2, CVT_32_SSCALED )
+FETCH_ATTRIB( R32_SSCALED,          1, CVT_32_SSCALED )
+FETCH_ATTRIB( A8R8G8B8_UNORM,       4, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8B8A8_UNORM,       4, CVT_8_UNORM )
+
+
+
+static fetch_func get_fetch_func( unsigned format )
 {
-   /* defaults */
-   attrib[1] = 0.0;
-   attrib[2] = 0.0;
-   attrib[3] = 1.0;
    switch (format) {
    case PIPE_FORMAT_R32G32B32A32_FLOAT:
-      attrib[3] = ((float *) ptr)[3];
-      /* fall-through */
+      return fetch_R32G32B32A32_FLOAT;
    case PIPE_FORMAT_R32G32B32_FLOAT:
-      attrib[2] = ((float *) ptr)[2];
-      /* fall-through */
+      return fetch_R32G32B32_FLOAT;
    case PIPE_FORMAT_R32G32_FLOAT:
-      attrib[1] = ((float *) ptr)[1];
-      /* fall-through */
+      return fetch_R32G32_FLOAT;
    case PIPE_FORMAT_R32_FLOAT:
-      attrib[0] = ((float *) ptr)[0];
-      break;
-
+      return fetch_R32_FLOAT;
    case PIPE_FORMAT_R32G32B32A32_SSCALED:
-      attrib[3] = (float) ((int *) ptr)[3];
-      /* fall-through */
+      return fetch_R32G32B32A32_SSCALED;
    case PIPE_FORMAT_R32G32B32_SSCALED:
-      attrib[2] = (float) ((int *) ptr)[2];
-      /* fall-through */
+      return fetch_R32G32B32_SSCALED;
    case PIPE_FORMAT_R32G32_SSCALED:
-      attrib[1] = (float) ((int *) ptr)[1];
-      /* fall-through */
+      return fetch_R32G32_SSCALED;
    case PIPE_FORMAT_R32_SSCALED:
-      attrib[0] = (float) ((int *) ptr)[0];
-      break;
-
+      return fetch_R32_SSCALED;
    case PIPE_FORMAT_A8R8G8B8_UNORM:
+      return fetch_A8R8G8B8_UNORM;
    case PIPE_FORMAT_R8G8B8A8_UNORM:
-      attrib[0] = (float) ((unsigned char *) ptr)[2] / 255.0f;
-      attrib[1] = (float) ((unsigned char *) ptr)[1] / 255.0f;
-      attrib[2] = (float) ((unsigned char *) ptr)[0] / 255.0f;
-      attrib[3] = (float) ((unsigned char *) ptr)[3] / 255.0f;
-      break;
-
+      return fetch_R8G8B8A8_UNORM;
    default:
+      /* Lots of missing cases! */
       assert(0);
+      return NULL;
+   }
+}
+
+
+static void 
+transpose_4x4( float *out, const float *in )
+{
+   /* This can be achieved in 12 sse instructions, plus the final
+    * stores I guess.  This is probably a bit more than that - maybe
+    * 32 or so?
+    */
+   out[0] = in[0];  out[1] = in[4];  out[2] = in[8];   out[3] = in[12];
+   out[4] = in[1];  out[5] = in[5];  out[6] = in[9];   out[7] = in[13];
+   out[8] = in[2];  out[9] = in[6];  out[10] = in[10]; out[11] = in[14];
+   out[12] = in[3]; out[13] = in[7]; out[14] = in[11]; out[15] = in[15];
+}
+
+
+			       
+void draw_update_vertex_fetch( struct draw_context *draw )
+{
+   //unsigned nr_attrs = draw->vertex_element_count;
+   unsigned nr_attrs = draw->vertex_shader->state->num_inputs;
+   unsigned i;
+
+   for (i = 0; i < nr_attrs; i++) {
+      unsigned buf = draw->vertex_element[i].vertex_buffer_index;
+      unsigned format  = draw->vertex_element[i].src_format;
+
+      draw->vertex_fetch.src_ptr[i] = (const ubyte *) (draw->user.vbuffer[buf] + 
+						       draw->vertex_buffer[buf].buffer_offset + 
+						       draw->vertex_element[i].src_offset );
+
+      draw->vertex_fetch.pitch[i] = draw->vertex_buffer[buf].pitch;
+      draw->vertex_fetch.fetch[i] = get_fetch_func( format );
    }
+
+   draw->vertex_fetch.nr_attrs = nr_attrs;
 }
 
 
@@ -100,40 +155,48 @@ void draw_vertex_fetch( struct draw_context *draw,
 			const unsigned *elts,
 			unsigned count )
 {
-   unsigned j;
-
-   /* loop over vertices */
-   for (j = 0; j < count; j++) {
-      uint attr;
-
-#if DRAW_DBG
-      printf("fetch vertex %u: \n", j);
-#endif
-
-      /* loop over vertex attributes (vertex shader inputs) */
-      for (attr = 0; attr < draw->vertex_shader->state->num_inputs; attr++) {
-
-         unsigned buf = draw->vertex_element[attr].vertex_buffer_index;
-         const void *src
-            = (const void *) ((const ubyte *) draw->user.vbuffer[buf]
-                              + draw->vertex_buffer[buf].buffer_offset
-                              + draw->vertex_element[attr].src_offset
-                              + elts[j] * draw->vertex_buffer[buf].pitch);
-         float p[4];
-
-         fetch_attrib4(src, draw->vertex_element[attr].src_format, p);
-
-#if DRAW_DBG
-         printf("  %u: %f %f %f %f\n", attr, p[0], p[1], p[2], p[3]);
-#endif
-
-         /* Transform to AoS xxxx/yyyy/zzzz/wwww representation:
-          */
-         machine->Inputs[attr].xyzw[0].f[j] = p[0]; /*X*/
-         machine->Inputs[attr].xyzw[1].f[j] = p[1]; /*Y*/
-         machine->Inputs[attr].xyzw[2].f[j] = p[2]; /*Z*/
-         machine->Inputs[attr].xyzw[3].f[j] = p[3]; /*W*/
-      }
+   unsigned nr_attrs = draw->vertex_fetch.nr_attrs;
+   unsigned attr;
+
+   assert(count <= 4);
+
+//   _mesa_printf("%s %d\n", __FUNCTION__, count);
+
+   /* loop over vertex attributes (vertex shader inputs)
+    */
+   for (attr = 0; attr < nr_attrs; attr++) {
+
+      const unsigned pitch   = draw->vertex_fetch.pitch[attr];
+      const ubyte *src = draw->vertex_fetch.src_ptr[attr];
+      const fetch_func fetch = draw->vertex_fetch.fetch[attr];
+      unsigned i;
+      float p[4][4];
+
+
+      /* Fetch four attributes for four vertices.  
+       * 
+       * Could fetch directly into SOA format, but this is meant to be
+       * a prototype for an sse implementation, which would have
+       * difficulties doing that.
+       */
+      for (i = 0; i < count; i++) 
+	 fetch( src + elts[i] * pitch, p[i] );
+
+      /* Be nice and zero out any missing vertices: 
+       */
+      for ( ; i < 4; i++) 
+	 p[i][0] = p[i][1] = p[i][2] = p[i][3] = 0;
+      
+      /* Transpose/swizzle into sse-friendly format.  Currently
+       * assuming that all vertex shader inputs are float[4], but this
+       * isn't true -- if the vertex shader only wants tex0.xy, we
+       * could optimize for that.
+       *
+       * To do so fully without codegen would probably require an
+       * excessive number of fetch functions, but we could at least
+       * minimize the transpose step:
+       */
+      transpose_4x4( (float *)&machine->Inputs[attr].xyzw[0].f[0], (float *)p );
    }
 }
 
diff --git a/src/mesa/pipe/draw/draw_vertex_shader.c b/src/mesa/pipe/draw/draw_vertex_shader.c
index d19b60198d..3041974b9a 100644
--- a/src/mesa/pipe/draw/draw_vertex_shader.c
+++ b/src/mesa/pipe/draw/draw_vertex_shader.c
@@ -201,6 +201,10 @@ draw_vertex_shader_queue_flush(struct draw_context *draw)
 {
    unsigned i, j;
 
+   /* XXX: do this on statechange: 
+    */
+   draw_update_vertex_fetch( draw );
+
 //   fprintf(stderr, " q(%d) ", draw->vs.queue_nr );
 #ifdef MESA_LLVM
    if (draw->vertex_shader->llvm_prog) {
-- 
cgit v1.2.3